PyPI - parsefood - Versions diffs - 0.1.0__tar.gz - Mend

parsefood 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

parsefood-0.1.0/.env.example +24 -0
parsefood-0.1.0/.github/dependabot.yml +8 -0
parsefood-0.1.0/.github/workflows/release.yml +49 -0
parsefood-0.1.0/.gitignore +60 -0
parsefood-0.1.0/.pre-commit-config.yaml +5 -0
parsefood-0.1.0/AGENTS.md +345 -0
parsefood-0.1.0/CLAUDE.md +1 -0
parsefood-0.1.0/LICENSE +21 -0
parsefood-0.1.0/Makefile +5 -0
parsefood-0.1.0/PKG-INFO +253 -0
parsefood-0.1.0/README.md +213 -0
parsefood-0.1.0/docs/pipeline.md +179 -0
parsefood-0.1.0/food_database.json +1484 -0
parsefood-0.1.0/food_log/__init__.py +89 -0
parsefood-0.1.0/food_log/config.py +113 -0
parsefood-0.1.0/food_log/database.py +267 -0
parsefood-0.1.0/food_log/llm/__init__.py +22 -0
parsefood-0.1.0/food_log/llm/client.py +36 -0
parsefood-0.1.0/food_log/llm/consistency.py +514 -0
parsefood-0.1.0/food_log/llm/extractor.py +141 -0
parsefood-0.1.0/food_log/llm/prompts.py +243 -0
parsefood-0.1.0/food_log/models.py +200 -0
parsefood-0.1.0/food_log/processing/__init__.py +28 -0
parsefood-0.1.0/food_log/processing/messages.py +339 -0
parsefood-0.1.0/food_log/processing/nutrition.py +201 -0
parsefood-0.1.0/food_log/processing/validation.py +166 -0
parsefood-0.1.0/food_log/profile.py +147 -0
parsefood-0.1.0/food_log/utils.py +179 -0
parsefood-0.1.0/food_log/visualization/__init__.py +5 -0
parsefood-0.1.0/food_log/visualization/plots.py +57 -0
parsefood-0.1.0/logo.png +0 -0
parsefood-0.1.0/main.py +558 -0
parsefood-0.1.0/process_labels.py +366 -0
parsefood-0.1.0/profiles/template.yaml.example +16 -0
parsefood-0.1.0/prompts/_food_list.txt +149 -0
parsefood-0.1.0/pyproject.toml +66 -0
parsefood-0.1.0/review_database.py +815 -0
parsefood-0.1.0/s3_storage.py +133 -0
parsefood-0.1.0/scrape_food.py +198 -0
parsefood-0.1.0/scrapers/__init__.py +39 -0
parsefood-0.1.0/scrapers/base.py +45 -0
parsefood-0.1.0/scrapers/celeiro.py +236 -0
parsefood-0.1.0/scrapers/continente.py +303 -0
parsefood-0.1.0/scrapers/llm_extraction.py +80 -0
parsefood-0.1.0/scrapers/models.py +33 -0
parsefood-0.1.0/scrapers/pingo_doce.py +311 -0
parsefood-0.1.0/scrapers/registry.py +50 -0
parsefood-0.1.0/scripts/compare_migration.py +628 -0
parsefood-0.1.0/scripts/download_telegram_messages.py +230 -0
parsefood-0.1.0/scripts/telegram_sync.py +222 -0
parsefood-0.1.0/scripts/test_new_system.py +153 -0
parsefood-0.1.0/telegram_bot.py +215 -0
parsefood-0.1.0/uv.lock +1885 -0

parsefood-0.1.0/.env.example ADDED Viewed

@@ -0,0 +1,24 @@
+MODEL_ID=google/gemini-2.5-flash
+OPENROUTER_API_KEY=KEY
+DATA_PATH=<data_path>
+TARGET_CALORIES=<target_calories>
+# Telegram API Configuration (for automated message download)
+# Get your API credentials from: https://my.telegram.org/apps
+TELEGRAM_API_ID=<telegram_api_id>
+TELEGRAM_API_HASH=<telegram_api_hash>
+TELEGRAM_PHONE=<telegram_phone>
+TELEGRAM_CHAT=<telegram_chat>
+TELEGRAM_OUTPUT_FILE=data/result.json
+# Telegram Bot Configuration (for sending summaries)
+# Create a bot via @BotFather to get the token
+# Get your chat ID by messaging @userinfobot
+TELEGRAM_BOT_TOKEN=<bot_token>
+TELEGRAM_CHAT_ID=<chat_id>
+# AWS S3 Configuration (for syncing data files)
+# Set S3_ENABLED=true to enable S3 sync, false to use local files only
+S3_ENABLED=false
+S3_BUCKET=<s3-bucket-name>
+S3_REGION=<s3-region-name>

parsefood-0.1.0/.github/dependabot.yml ADDED Viewed

@@ -0,0 +1,8 @@
+# Dependabot configuration for automated dependency updates
+# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "weekly"

parsefood-0.1.0/.github/workflows/release.yml ADDED Viewed

@@ -0,0 +1,49 @@
+name: Release
+on:
+  push:
+    branches: [main]
+    paths:
+      - "food_log/__init__.py"
+jobs:
+  test:
+    uses: tsilva/.github/.github/workflows/test.yml@main
+  pii-scan:
+    uses: tsilva/.github/.github/workflows/pii-scan.yml@main
+  publish:
+    needs: [test, pii-scan]
+    runs-on: ubuntu-latest
+    environment: pypi
+    permissions:
+      contents: write
+      id-token: write
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install build tools
+        run: pip install hatch
+      - name: Build package
+        run: hatch build
+      - name: Get version
+        id: version
+        run: echo "version=$(hatch version)" >> "$GITHUB_OUTPUT"
+      - name: Create GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: v${{ steps.version.outputs.version }}
+          name: v${{ steps.version.outputs.version }}
+          generate_release_notes: true
+          files: dist/*
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1

parsefood-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,60 @@
+# >>> MANAGED BY GITGUARD - DO NOT EDIT THIS SECTION <<<
+.claude/*.local.json
+.claude/*.local.json.bak
+.claude-sandbox.json
+.mcp.json
+.env
+.env.*
+!.env.example
+!.env.*.example
+.env.*.local
+.env.local
+*.pem
+*.key
+*.p12
+*.pfx
+*.gpg
+*.secret
+*-credentials.json
+service-account*.json
+credentials.json
+secrets.json
+.secrets/
+.aws/
+.ssh/
+config.local.*
+.DS_Store
+Thumbs.db
+.idea/
+.vscode/
+*.swp
+*.swo
+*.code-workspace
+__pycache__/
+*.py[cod]
+.venv/
+venv/
+env/
+*.egg-info/
+node_modules/
+.npm/
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+dist/
+build/
+logs/
+*.log
+# >>> END MANAGED <<<
+# Project-specific rules
+!profiles/_*.yaml
+*.egg-info
+__pycache__
+data
+food_database.backup_*.json
+labels
+logs
+profiles/*.yaml
+skills/
+telegram_session.session

parsefood-0.1.0/.pre-commit-config.yaml ADDED Viewed

@@ -0,0 +1,5 @@
+repos:
+  - repo: https://github.com/tsilva/.github
+    rev: main
+    hooks:
+      - id: gitleaks

parsefood-0.1.0/AGENTS.md ADDED Viewed

@@ -0,0 +1,345 @@
+# AGENTS.md
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+## Project Overview
+Food Log Parser is a Python tool that extracts and analyzes food logs from Telegram exports. It uses LLMs (via OpenRouter API) to parse Portuguese food entries, calculate calories, and generate visualizations of calorie intake over time.
+## Package Structure
+```
+parsefood/
+├── food_log/                    # Main package
+│   ├── __init__.py              # Package exports
+│   ├── config.py                # Centralized configuration (paths, units, dates)
+│   ├── profile.py               # Profile configuration for multi-user support
+│   ├── models.py                # Pydantic models (FoodEntry, FoodLogEntry, etc.)
+│   ├── database.py              # Database operations (load/save/backup/merge)
+│   ├── utils.py                 # Shared utilities (normalize_name, normalize_unit, etc.)
+│   ├── llm/
+│   │   ├── __init__.py
+│   │   ├── client.py            # OpenRouter client factory
+│   │   ├── extractor.py         # FoodDataExtractor class
+│   │   └── prompts.py           # Prompt templates
+│   ├── processing/
+│   │   ├── __init__.py
+│   │   ├── nutrition.py         # NutritionCalculator
+│   │   ├── messages.py          # Message loading and grouping
+│   │   └── validation.py        # Data validation
+│   └── visualization/
+│       ├── __init__.py
+│       └── plots.py             # Calorie plotting
+├── profiles/                    # User profile configurations
+│   ├── _template.yaml           # Template for new profiles
+│   └── cristina.yaml            # Cristina's profile
+├── scrapers/                    # Web scrapers for food data
+├── scripts/                     # Utility scripts
+├── main.py                      # Entry point
+├── process_labels.py            # Vision AI label processor
+└── scrape_food.py               # Web scraper CLI
+```
+### Import Patterns
+```python
+# Import from food_log package
+from food_log import UNIT_MAP, FOOD_DATABASE_PATH, normalize_unit
+from food_log.models import FoodEntry, FoodLogEntry
+from food_log.database import load_database, save_database, create_backup
+from food_log.llm import FoodDataExtractor, create_openrouter_client
+from food_log.processing import NutritionCalculator
+from food_log.visualization import plot_calories
+```
+## Core Architecture
+### Data Processing Pipeline
+The application follows a multi-stage pipeline:
+1. **Message Extraction**: Reads Telegram JSON export (`data/result.json`) and filters messages from a specific sender
+2. **Date Grouping**: Groups messages by date, applying blacklists and date rewrites for specific message IDs
+3. **LLM Processing**: Uses OpenRouter API with configurable model (default: Gemini 2.5 Flash) to extract structured food data from Portuguese text
+4. **Validation**: Uses Pydantic models (`FoodEntry`, `FoodLogEntry`) for data validation and structure
+5. **Storage**: Saves processed data to `data/date_messages_map.json` with both legacy format (semicolon-separated) and original messages
+6. **Visualization**: Generates polynomial regression plot of calorie trends (`data/plot.png`)
+### Key Data Structures
+**FoodEntry**: Single food item with fields:
+- `ingredient`: Food name (lowercase, stripped)
+- `quantity`: Positive float
+- `unit`: Portuguese unit (e.g., "colher de sopa", "unidade", "copo")
+- `calories`: Non-negative integer
+**FoodLogEntry**: Daily log containing:
+- `date`: YYYY-MM-DD format
+- `raw_text`: Original message
+- `foods`: List of FoodEntry objects
+- `total_calories`: Auto-calculated sum
+**date_messages_map.json structure**:
+```json
+{
+  "YYYY-MM-DD": {
+    "original": ["raw message 1", "raw message 2"],
+    "processed": ["ingredient;quantity;unit;calories", ...],
+    "total": total_calories_int
+  }
+}
+```
+### LLM Integration
+The `FoodDataExtractor` class handles LLM interaction:
+- Connects via OpenAI-compatible client to OpenRouter
+- Uses `food_database.json` as the structured nutrition database
+- Extracts JSON array from LLM responses with regex fallback
+- Validates and coerces data types to match Pydantic models
+- Falls back to a retry with data-type fixes when validation fails
+- Works in conjunction with `NutritionCalculator` for deterministic calorie calculations
+### Configuration
+Environment variables (`.env` file):
+- `MODEL_ID`: OpenRouter model identifier (e.g., "google/gemini-2.5-flash")
+- `OPENROUTER_API_KEY`: API key for OpenRouter
+- Optional: `TARGET_CALORIES`, `DATA_PATH` (used as defaults when no profile is specified)
+Centralized configuration in `food_log/config.py`:
+- `START_DATE`: Messages before this date are ignored (currently 2023-08-30)
+- `BLACKLISTED_IDS`: Message IDs to skip during processing
+- `REWRITE_ID_DATES`: Message IDs with corrected timestamps
+- `UNIT_MAP`: Portuguese to standard unit mapping (e.g., "colher de sopa" → "tbsp")
+- `DATA_PATH`, `FOOD_DATABASE_PATH`: File paths
+### Profiles
+Multi-user support via YAML profile files in `profiles/` directory:
+```yaml
+# profiles/cristina.yaml
+name: "Cristina"
+sender_name: "Cristina"  # Filter Telegram messages by sender
+data_path: "/path/to/data"
+# Optional settings (defaults shown)
+start_date: "2023-08-30"
+target_calories: 2000
+blacklisted_ids: [2842, 2849]
+rewrite_id_dates:
+  7071: "2024-06-23T07:22:47"
+```
+**Key points:**
+- `food_database.json` remains **global/shared** (not per-profile)
+- Running without `--profile` uses environment defaults (backward compatible)
+- Profile files starting with `_` are excluded (templates)
+Configuration priority:
+1. Profile YAML settings (if `--profile` specified)
+2. Environment variables
+3. Code defaults
+Profile-related functions:
+```python
+from food_log import list_profiles, load_profile, get_runtime_config
+# List available profiles
+profiles = list_profiles()  # ['cristina']
+# Load and use a profile
+profile = load_profile('cristina')
+config = get_runtime_config(profile)
+# Use default config (no profile)
+config = get_runtime_config()
+```
+### Utility Scripts
+**scripts/test_new_system.py**: Integration test script for the food database system. Tests both LLM extraction and nutrition calculation.
+**scripts/compare_migration.py**: Validation and comparison tool with two modes:
+- `--quick`: Fast validation by recalculating from existing processed entries (no LLM)
+- Default: Full regeneration using LLM, generates markdown reports for discrepancies
+**process_labels.py**: Utility for adding new foods to the database by processing product nutrition label images using vision AI. Supports multiple image formats and creates timestamped backups.
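+**scrape_food.py**: Web scraper for adding foods from product URLs. Uses Playwright for JavaScript-rendered pages. Continente.pt is the documented target; the package also ships `scrapers/celeiro.py` and `scrapers/pingo_doce.py` — run `scrape_food.py --list-scrapers` to see which sites are currently registered.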
+## Web Scraping
+**IMPORTANT**: When asked to scrape a URL or add a food from a product page, ALWAYS use the `scrape_food.py` CLI tool instead of ad-hoc scraping. The scraper handles JavaScript rendering, nutrition extraction, and database integration.
+### Usage
+```bash
+# Scrape and display (dry-run)
+uv run python scrape_food.py URL
+# Scrape and add to database
+uv run python scrape_food.py --add URL
+# Scrape with custom name
+uv run python scrape_food.py --add --name "custom name" URL
+# Validate against existing entry
+uv run python scrape_food.py --validate --name "existing name" URL
+# Overwrite existing entry
+uv run python scrape_food.py --add --overwrite --name "name" URL
+# List supported sites
+uv run python scrape_food.py --list-scrapers
+```
+### Supported Sites
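+- **Continente.pt**: Extracts calories, proteins, carbs, fats, fiber, sodium, and unit weight from product pages
+- **Celeiro**, **Pingo Doce**: scraper modules are included (`scrapers/celeiro.py`, `scrapers/pingo_doce.py`); run `--list-scrapers` to confirm which sites are registered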
+### Adding New Scrapers
+To add support for a new store:
+1. Create `scrapers/newstore.py` implementing the `FoodScraper` protocol
+2. Register in `scrape_food.py`: `registry.register(NewStoreScraper())`
+## Development Commands
+### Running the Main Application
+```bash
+# List available profiles
+uv run python main.py --list-profiles
+# Process with a specific profile
+uv run python main.py --profile cristina
+# Process without profile (uses DATA_PATH env var)
+uv run python main.py
+```
+This will:
+1. Back up `result.json` to `backups/result_TIMESTAMP.json`
+2. Sync latest messages from Telegram to `result.json`
+3. Back up `date_messages_map.json` to `backups/date_messages_map_TIMESTAMP.json`
+4. Load and process any new or modified dates using the LLM
+5. Save results to `date_messages_map.json`
+6. Generate calorie trend plot at `plot.png`
+### Quick Daily Status
+```bash
+# Show today's food log with progress (read-only, fast)
+uv run python main.py --profile cristina --today
+# Without profile (uses defaults)
+uv run python main.py --today
+```
+### Validation and Inspection
+```bash
+# Validate all data in the processed file
+uv run python main.py validate data/date_messages_map.json
+# Inspect a specific date's data
+uv run python main.py validate data/date_messages_map.json 2024-03-27
+```
+### Testing and Validation
+```bash
+# Test the food database system
+uv run python scripts/test_new_system.py
+# Quick validation: recalculate and compare calorie values (no LLM)
+uv run python scripts/compare_migration.py --quick -n 50
+```
+### Setup
+```bash
+# Install uv (if not already installed)
+curl -LsSf https://astral.sh/uv/install.sh | sh
+# Install dependencies
+uv sync
+# Configure environment
+cp .env.example .env
+# Edit .env with your OPENROUTER_API_KEY and MODEL_ID
+```
+Note: This project uses `uv` for dependency management. Dependencies are defined in `pyproject.toml`. The `uv sync` command will create a virtual environment and install all dependencies automatically.
+## Important Implementation Details
+### Message Processing Logic
+- Messages are filtered by `sender_name` from profile (or "Cristina" by default)
+- Existing processed dates are NOT reprocessed unless the original message text changes
+- Processing uses 2 workers by default to avoid rate limits
+- Both `result.json` and `date_messages_map.json` are backed up to `backups/` directory with timestamps before processing
+### Legacy Format Support
+The system maintains backward compatibility with semicolon-separated format:
+```
+ingredient;quantity;unit;calories
+```
+The `parse_legacy_format()` function can read this format and convert to `FoodEntry` objects.
+### Validation and Migration
+`validate_and_migrate_existing_data()` validates existing processed data without modifying the stored format. It only logs successful validations and does not store a `structured_data` field.
+### Portuguese Unit Handling
+Common Portuguese units in the food list:
+- "colher de sopa" (tbsp) = tablespoon
+- "colher de chá" (tsp) = teaspoon
+- "copo" or "tigela" = cup
+- "unidade" = unit (individual item)
+- "fatia" = slice
+- "lata" = can
+### Food Database
+The application uses a structured JSON database (`food_database.json`) containing:
+- Portuguese food names as keys
+- Nutrition per 100g (calories, proteins, carbs, fats)
+- Unit conversions (grams_per_unit for tbsp, tsp, cup, unit, g)
+New foods can be added using `process_labels.py` which extracts nutrition info from product label images using vision AI.
+**Legacy**: `prompts/_food_list.txt` contains an old flat-file format for reference only. The active system uses `food_database.json`.
+## Testing Strategy
+When adding new functionality:
+1. Test with real Telegram export data in `data/result.json`
+2. Use validation mode to check data integrity
+3. Inspect specific dates that might have issues
+4. Verify plot generation produces reasonable visualizations
+5. Check that backups are created properly before processing
+## Data Files
+- `data/result.json`: Telegram export (synced from Telegram API or user-provided)
+- `data/date_messages_map.json`: Main processed data store
+- `data/backups/`: Timestamped backups of result.json and date_messages_map.json
+- `data/plot.png`: Generated calorie trend visualization
+- `food_database.json`: Structured nutrition database (active)
+- `prompts/_food_list.txt`: Legacy calorie reference (kept for reference only)
+- `docs/pipeline.md`: Detailed pipeline documentation (keep updated)
+## Documentation Maintenance
+**Important**: When modifying the data processing pipeline in `main.py`, update `docs/pipeline.md` to reflect the changes. This includes:
+- Adding/removing/reordering pipeline steps
+- Changing data flow between components
+- Modifying file formats or storage locations
+- Adding new integrations (APIs, storage backends, etc.)

parsefood-0.1.0/CLAUDE.md ADDED Viewed

@@ -0,0 +1 @@
+AGENTS.md

parsefood-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 tsilva
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

parsefood-0.1.0/Makefile ADDED Viewed

@@ -0,0 +1,5 @@
+release-%:
+	hatch version $*
+	git add food_log/__init__.py
+	git commit -m "chore: release $$(hatch version)"
+	git push