anysite-cli 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/CLAUDE.md +9 -4
  2. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/PKG-INFO +14 -3
  3. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/README.md +13 -2
  4. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/pyproject.toml +1 -1
  5. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/skills/anysite-cli/SKILL.md +22 -1
  6. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/skills/anysite-cli/references/dataset-guide.md +34 -3
  7. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/cli.py +10 -1
  8. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/collector.py +4 -3
  9. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/db_loader.py +166 -23
  10. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/differ.py +203 -48
  11. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/models.py +5 -0
  12. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/storage.py +21 -1
  13. anysite_cli-0.1.5/tests/test_dataset/test_db_loader.py +738 -0
  14. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_differ.py +236 -0
  15. anysite_cli-0.1.3/tests/test_dataset/test_db_loader.py +0 -346
  16. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/.claude/settings.local.json +0 -0
  17. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/.gitignore +0 -0
  18. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/LICENSE +0 -0
  19. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/skills/anysite-cli/references/api-reference.md +0 -0
  20. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/__init__.py +0 -0
  21. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/__main__.py +0 -0
  22. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/api/__init__.py +0 -0
  23. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/api/client.py +0 -0
  24. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/api/errors.py +0 -0
  25. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/api/schemas.py +0 -0
  26. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/batch/__init__.py +0 -0
  27. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/batch/executor.py +0 -0
  28. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/batch/input.py +0 -0
  29. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/batch/rate_limiter.py +0 -0
  30. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/cli/__init__.py +0 -0
  31. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/cli/config.py +0 -0
  32. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/cli/executor.py +0 -0
  33. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/cli/options.py +0 -0
  34. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/config/__init__.py +0 -0
  35. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/config/paths.py +0 -0
  36. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/config/settings.py +0 -0
  37. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/__init__.py +0 -0
  38. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/analyzer.py +0 -0
  39. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/errors.py +0 -0
  40. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/exporters.py +0 -0
  41. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/history.py +0 -0
  42. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/notifications.py +0 -0
  43. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/scheduler.py +0 -0
  44. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/transformer.py +0 -0
  45. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/__init__.py +0 -0
  46. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/adapters/__init__.py +0 -0
  47. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/adapters/base.py +0 -0
  48. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/adapters/postgres.py +0 -0
  49. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/adapters/sqlite.py +0 -0
  50. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/cli.py +0 -0
  51. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/config.py +0 -0
  52. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/manager.py +0 -0
  53. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/operations/__init__.py +0 -0
  54. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/operations/insert.py +0 -0
  55. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/operations/query.py +0 -0
  56. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/schema/__init__.py +0 -0
  57. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/schema/inference.py +0 -0
  58. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/schema/types.py +0 -0
  59. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/utils/__init__.py +0 -0
  60. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/utils/sanitize.py +0 -0
  61. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/main.py +0 -0
  62. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/models/__init__.py +0 -0
  63. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/output/__init__.py +0 -0
  64. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/output/console.py +0 -0
  65. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/output/formatters.py +0 -0
  66. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/output/templates.py +0 -0
  67. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/py.typed +0 -0
  68. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/streaming/__init__.py +0 -0
  69. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/streaming/progress.py +0 -0
  70. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/streaming/writer.py +0 -0
  71. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/utils/__init__.py +0 -0
  72. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/utils/fields.py +0 -0
  73. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/utils/retry.py +0 -0
  74. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/enriched_partners_sample_10.csv +0 -0
  75. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/linkedin-partners/company_aliases.txt +0 -0
  76. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/linkedin-partners/dataset.yaml +0 -0
  77. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/partners-deep/dataset.yaml +0 -0
  78. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/partners-intel/dataset.yaml +0 -0
  79. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/partners-linkedin/company_aliases.txt +0 -0
  80. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/partners-linkedin/dataset.yaml +0 -0
  81. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/partners-pipeline/dataset.yaml +0 -0
  82. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/__init__.py +0 -0
  83. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/conftest.py +0 -0
  84. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_api/__init__.py +0 -0
  85. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_batch/__init__.py +0 -0
  86. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_batch/test_executor.py +0 -0
  87. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_batch/test_input.py +0 -0
  88. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_batch/test_rate_limiter.py +0 -0
  89. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_cli/__init__.py +0 -0
  90. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_cli/test_main.py +0 -0
  91. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/__init__.py +0 -0
  92. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_analyzer.py +0 -0
  93. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_collector.py +0 -0
  94. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_exporters.py +0 -0
  95. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_history.py +0 -0
  96. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_integration_csv.py +0 -0
  97. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_models.py +0 -0
  98. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_notifications.py +0 -0
  99. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_scheduler.py +0 -0
  100. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_storage.py +0 -0
  101. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_transformer.py +0 -0
  102. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/__init__.py +0 -0
  103. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_cli.py +0 -0
  104. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_config.py +0 -0
  105. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_inference.py +0 -0
  106. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_insert.py +0 -0
  107. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_manager.py +0 -0
  108. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_postgres_adapter.py +0 -0
  109. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_sanitize.py +0 -0
  110. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_sqlite_adapter.py +0 -0
  111. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_output/__init__.py +0 -0
  112. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_output/test_formatters.py +0 -0
  113. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_output/test_templates.py +0 -0
  114. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_streaming/__init__.py +0 -0
  115. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_streaming/test_progress.py +0 -0
  116. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_streaming/test_writer.py +0 -0
  117. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_utils/__init__.py +0 -0
  118. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_utils/test_fields.py +0 -0
  119. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_utils/test_retry.py +0 -0

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/CLAUDE.md

@@ -49,12 +49,14 @@ anysite dataset query dataset.yaml --interactive
  anysite dataset stats dataset.yaml --source profiles
  anysite dataset profile dataset.yaml
  anysite dataset load-db dataset.yaml -c pg --drop-existing
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
  anysite dataset history my-dataset
  anysite dataset logs my-dataset --run 42
  anysite dataset schedule dataset.yaml --incremental --load-db pg
  anysite dataset schedule dataset.yaml --systemd --load-db pg
  anysite dataset diff dataset.yaml --source profiles --key _input_value
- anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --from 2026-01-30 --to 2026-02-01
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline,follower_count"
  anysite dataset reset-cursor dataset.yaml
  anysite dataset reset-cursor dataset.yaml --source profiles

@@ -104,9 +106,9 @@ anysite db upsert mydb --table users --conflict-columns id --stdin
  - `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
  - `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
  - `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
- - `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters
+ - `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). Supports dot-notation keys via `json_extract_string()`. `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters with output field filtering
  - `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `diff`, `history`, `logs`, `schedule`, `reset-cursor`
- - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
+ - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference, and diff-based incremental sync (`db_load.key` + `db_load.sync: full|append`); `--snapshot` loads a specific date
  - `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
  - `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
  - `db/config.py` - `ConnectionConfig`, `DatabaseType`, `OnConflict` enums and models
@@ -164,8 +166,11 @@ Sources are topologically sorted by dependencies. `input_template` allows transf
  - Schema inference from Parquet records via `infer_table_schema()`
  - Auto-increment `id` primary key per table
  - FK linking via provenance: parent `_input_value` → child `{parent}_id` column
- - Optional `db_load` config per source: field selection, dot-notation extraction, custom table names, field exclusion
+ - Optional `db_load` config per source: field selection, dot-notation extraction, custom table names, field exclusion, `key` for diff-based incremental sync
  - Topological loading order (parents before children)
+ - Diff-based incremental sync: when `db_load.key` is set and table exists with >=2 snapshots, diffs the two most recent and applies INSERT/DELETE/UPDATE delta
+ - `--snapshot YYYY-MM-DD` flag to load a specific snapshot date
+ - `--drop-existing` forces full INSERT of latest snapshot

  **Dataset Storage Layout**:
  ```

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: anysite-cli
- Version: 0.1.3
+ Version: 0.1.5
  Summary: CLI for Anysite API - web data extraction for humans and AI agents
  Project-URL: Homepage, https://anysite.io
  Project-URL: Documentation, https://docs.anysite.io/cli
@@ -259,6 +259,8 @@ sources:
  path: ./output/companies-{{date}}.csv
  format: csv
  db_load:
+ key: _input_value # Unique key for incremental sync
+ sync: full # full (default) or append (no DELETE)
  fields: [name, url, employee_count]

  - id: employees
@@ -274,6 +276,8 @@ sources:
  count: 5
  refresh: always # Re-collect every run with --incremental
  db_load:
+ key: urn.value # Unique key for incremental sync
+ sync: append # Keep old records (no DELETE on diff)
  fields: [name, url, headline]

  storage:
@@ -318,9 +322,15 @@ anysite dataset query dataset.yaml --interactive
  anysite dataset stats dataset.yaml --source companies
  anysite dataset profile dataset.yaml

- # Load into PostgreSQL with automatic FK linking
+ # Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+ anysite dataset load-db dataset.yaml -c pg
+
+ # Drop and reload from latest snapshot
  anysite dataset load-db dataset.yaml -c pg --drop-existing

+ # Load a specific snapshot date
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
  # Run history and logs
  anysite dataset history my-dataset
  anysite dataset logs my-dataset --run 42
@@ -328,8 +338,9 @@ anysite dataset logs my-dataset --run 42
  # Generate cron/systemd schedule
  anysite dataset schedule dataset.yaml --incremental --load-db pg

- # Compare snapshots (diff two collection dates)
+ # Compare snapshots (diff two collection dates, supports dot-notation keys)
  anysite dataset diff dataset.yaml --source employees --key _input_value
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"

  # Reset incremental state
  anysite dataset reset-cursor dataset.yaml

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/README.md

@@ -196,6 +196,8 @@ sources:
  path: ./output/companies-{{date}}.csv
  format: csv
  db_load:
+ key: _input_value # Unique key for incremental sync
+ sync: full # full (default) or append (no DELETE)
  fields: [name, url, employee_count]

  - id: employees
@@ -211,6 +213,8 @@ sources:
  count: 5
  refresh: always # Re-collect every run with --incremental
  db_load:
+ key: urn.value # Unique key for incremental sync
+ sync: append # Keep old records (no DELETE on diff)
  fields: [name, url, headline]

  storage:
@@ -255,9 +259,15 @@ anysite dataset query dataset.yaml --interactive
  anysite dataset stats dataset.yaml --source companies
  anysite dataset profile dataset.yaml

- # Load into PostgreSQL with automatic FK linking
+ # Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+ anysite dataset load-db dataset.yaml -c pg
+
+ # Drop and reload from latest snapshot
  anysite dataset load-db dataset.yaml -c pg --drop-existing

+ # Load a specific snapshot date
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
  # Run history and logs
  anysite dataset history my-dataset
  anysite dataset logs my-dataset --run 42
@@ -265,8 +275,9 @@ anysite dataset logs my-dataset --run 42
  # Generate cron/systemd schedule
  anysite dataset schedule dataset.yaml --incremental --load-db pg

- # Compare snapshots (diff two collection dates)
+ # Compare snapshots (diff two collection dates, supports dot-notation keys)
  anysite dataset diff dataset.yaml --source employees --key _input_value
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"

  # Reset incremental state
  anysite dataset reset-cursor dataset.yaml

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

  [project]
  name = "anysite-cli"
- version = "0.1.3"
+ version = "0.1.5"
  description = "CLI for Anysite API - web data extraction for humans and AI agents"
  readme = "README.md"
  license = "MIT"

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/skills/anysite-cli/SKILL.md

@@ -117,6 +117,8 @@ sources:
  path: ./output/companies-{{date}}.csv
  format: csv
  db_load:
+ key: _input_value # Unique key for diff-based incremental sync
+ sync: full # full (INSERT/DELETE/UPDATE) or append (no DELETE)
  fields: [name, url, employee_count]

  - id: employees
@@ -189,18 +191,32 @@ anysite dataset profile dataset.yaml
  # Load all sources with FK linking
  anysite dataset load-db dataset.yaml -c pg --drop-existing

+ # Incremental sync (uses diff when db_load.key is set)
+ anysite dataset load-db dataset.yaml -c pg
+
+ # Load a specific snapshot date
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
  # Dry run
  anysite dataset load-db dataset.yaml -c pg --dry-run
  ```

  `load-db` auto-creates tables with inferred schema, adds `id` primary key, and links child tables to parents via `{parent}_id` FK columns using provenance data.

+ **Incremental sync**: When `db_load.key` is set and the table already exists with >=2 snapshots, `load-db` diffs the two most recent snapshots and applies only the delta (INSERT added, DELETE removed, UPDATE changed). Without `db_load.key`, it does a full INSERT of the latest snapshot.
+
+ **Sync modes** (`db_load.sync`):
+ - `full` (default) — applies INSERT, DELETE, and UPDATE from diff
+ - `append` — applies INSERT and UPDATE only, skips DELETE (keeps records that disappeared from the API). Use for sources where the API returns only the latest N items (e.g., posts, activity feeds).
+
  Optional `db_load` config per source controls which fields go to DB:
  ```yaml
  - id: profiles
  endpoint: /api/linkedin/user
  db_load:
  table: people # Custom table name
+ key: urn.value # Unique key for diff-based incremental sync
+ sync: append # Keep old records (no DELETE on diff)
  fields: # Select specific fields
  - name
  - urn.value AS urn_id # Dot-notation extraction
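
The `full`/`append` distinction documented above comes down to which parts of a snapshot diff get written back to the table. A condensed, self-contained sketch of that rule, with illustrative names only (the package's real logic is `DatasetDbLoader._diff_sync`, visible in the `db_loader.py` hunks later in this diff):

```python
from dataclasses import dataclass, field
from typing import Any


@dataclass
class SnapshotDiff:
    """Minimal stand-in for the differ's result: records added, removed, or changed between snapshots."""
    added: list[dict[str, Any]] = field(default_factory=list)
    removed: list[dict[str, Any]] = field(default_factory=list)
    changed: list[dict[str, Any]] = field(default_factory=list)


def plan_delta(diff: SnapshotDiff, sync_mode: str = "full") -> dict[str, int]:
    """Count the row operations a load would apply for a given sync mode."""
    ops = {"insert": len(diff.added), "update": len(diff.changed), "delete": 0}
    if sync_mode == "full":
        # append mode skips deletes, keeping rows that dropped out of the API response
        ops["delete"] = len(diff.removed)
    return ops


diff = SnapshotDiff(added=[{"name": "a"}], removed=[{"name": "b"}])
print(plan_delta(diff, "full"))    # {'insert': 1, 'update': 0, 'delete': 1}
print(plan_delta(diff, "append"))  # {'insert': 1, 'update': 0, 'delete': 0}
```

In `append` mode nothing is ever deleted, which is why the docs above recommend it for feed-style endpoints that only return the latest N items.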
@@ -241,11 +257,16 @@ anysite api /api/linkedin/user user=satyanadella -q --format jsonl \
  # Diff two most recent snapshots
  anysite dataset diff dataset.yaml --source employees --key _input_value

- # Diff specific dates, compare only certain fields
+ # Diff with dot-notation key (for JSON fields like urn)
+ anysite dataset diff dataset.yaml --source profiles --key urn.value
+
+ # Diff specific dates, compare and output only certain fields
  anysite dataset diff dataset.yaml --source employees --key _input_value \
  --from 2026-01-30 --to 2026-02-01 --fields "name,headline"
  ```

+ `--key` supports dot-notation for JSON fields (e.g., `urn.value`). `--fields` restricts both comparison and output columns.
+
  ### Step 7: History, Scheduling, and Notifications
  ```bash
  # View run history

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/skills/anysite-cli/references/dataset-guide.md

@@ -39,6 +39,8 @@ sources:
  headers: {X-Token: abc}
  db_load: # Database loading config
  table: custom_name # Override table name
+ key: urn.value # Unique key for diff-based incremental sync
+ sync: full # full (INSERT/DELETE/UPDATE) or append (no DELETE)
  fields: [name, url] # Fields to include
  exclude: [_input_value] # Fields to exclude

@@ -244,6 +246,7 @@ anysite dataset load-db dataset.yaml -c <connection_name> [OPTIONS]
  --connection, -c TEXT Database connection name (required)
  --source, -s TEXT Load specific source + dependencies
  --drop-existing Drop tables before creating
+ --snapshot TEXT Load a specific snapshot date (YYYY-MM-DD)
  --dry-run Show plan without executing
  --quiet, -q Suppress output
  ```
@@ -256,6 +259,29 @@ anysite dataset load-db dataset.yaml -c <connection_name> [OPTIONS]
  4. Inserts rows, tracking which `_input_value` maps to which `id`
  5. For child sources: adds `{parent_source}_id` FK column using provenance

+ ### Incremental Sync with `db_load.key`
+
+ When `db_load.key` is set and the table already exists with >=2 snapshots, `load-db` uses diff-based incremental sync instead of full re-insertion:
+
+ 1. Compares the two most recent Parquet snapshots using `DatasetDiffer`
+ 2. **Added** records → INSERT into DB
+ 3. **Removed** records → DELETE from DB (by key) — only in `sync: full` mode
+ 4. **Changed** records → UPDATE modified fields (by key)
+
+ This keeps the database in sync without duplicates.
+
+ **Sync modes** (`db_load.sync`):
+ - `full` (default) — applies INSERT, DELETE, and UPDATE from diff
+ - `append` — applies INSERT and UPDATE only, skips DELETE. Use for sources where the API returns only the latest N items (e.g., posts, comments) and you want to accumulate records over time.
+
+ | Scenario | Behavior |
+ |----------|----------|
+ | First load (table doesn't exist) | Full INSERT of latest snapshot |
+ | Table exists + `db_load.key` + >=2 snapshots | Diff-based sync (INSERT/DELETE/UPDATE delta) |
+ | `--drop-existing` | Drop table, full INSERT of latest snapshot |
+ | `--snapshot 2026-01-15` | Full INSERT of that specific snapshot |
+ | No `db_load.key` set | Full INSERT of latest snapshot (no diff) |
+
  ### db_load Config

  Control which fields go to the database per source:
@@ -263,6 +289,8 @@ Control which fields go to the database per source:
  ```yaml
  db_load:
  table: people # Custom table name (default: source ID)
+ key: urn.value # Unique key for diff-based incremental sync
+ sync: append # full (default) or append (no DELETE on diff)
  fields: # Explicit field list
  - name
  - url
@@ -456,8 +484,11 @@ Compare two collection snapshots to find added, removed, and changed records.
  # Compare two most recent snapshots (auto-detect dates)
  anysite dataset diff dataset.yaml --source profiles --key _input_value

+ # Compare with dot-notation key (JSON fields)
+ anysite dataset diff dataset.yaml --source profiles --key urn.value
+
  # Compare specific dates
- anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --from 2026-01-30 --to 2026-02-01

  # Only compare specific fields
  anysite dataset diff dataset.yaml --source profiles --key urn --fields "name,headline,follower_count"
@@ -468,9 +499,9 @@ anysite dataset diff dataset.yaml --source profiles --key urn --format json --ou

  **Options:**
  - `--source, -s` (required) — source to compare
- - `--key, -k` (required) — field to match records by (e.g., `_input_value`, `urn`)
+ - `--key, -k` (required) — field to match records by. Supports dot-notation for JSON fields (e.g., `urn.value`)
  - `--from` / `--to` — snapshot dates (default: two most recent)
- - `--fields, -f` — only compare these fields
+ - `--fields, -f` — restrict both comparison and output to these fields
  - `--format` — output format (table, json, jsonl, csv)
  - `--output, -o` — write to file

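The `DatasetDiffer` behind this command can also be driven from Python. The constructor, `available_dates()`, `diff()`, and the `added`/`removed`/`changed` result attributes all appear in the `db_loader.py` changes further down; the snippet below is only a usage sketch against collected snapshots (the dataset path and source name are illustrative, not documented API):

```python
from pathlib import Path

from anysite.dataset.differ import DatasetDiffer

base_path = Path("./my-dataset")            # dataset storage directory (illustrative)

differ = DatasetDiffer(base_path)
dates = differ.available_dates("profiles")  # snapshot dates found for the source
if len(dates) >= 2:
    # Same semantics as the CLI: dot-notation keys reach into JSON fields
    result = differ.diff("profiles", "urn.value")
    print(f"added={len(result.added)} removed={len(result.removed)} changed={len(result.changed)}")
```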

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/cli.py

@@ -357,6 +357,10 @@ def load_db(
  bool,
  typer.Option("--quiet", "-q", help="Suppress progress output"),
  ] = False,
+ snapshot: Annotated[
+ str | None,
+ typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
+ ] = None,
  ) -> None:
  """Load collected Parquet data into a relational database with FK linking."""
  config = _load_config(config_path)
@@ -379,6 +383,7 @@ def load_db(
  source_filter=source,
  drop_existing=drop_existing,
  dry_run=dry_run,
+ snapshot=snapshot,
  )
  except Exception as e:
  typer.echo(f"Load error: {e}", err=True)
@@ -519,7 +524,11 @@ def diff_cmd(
  return

  # Format and output
- rows = format_diff_table(result) if format == "table" else format_diff_records(result)
+ rows = (
+ format_diff_table(result, output_fields=field_list)
+ if format == "table"
+ else format_diff_records(result, output_fields=field_list)
+ )

  _output_results(rows, format, output)

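The `differ.py` side of this change (+203 −48) is not shown in this section, so how `output_fields` is applied inside the formatters is not visible here. The sketch below only illustrates the general idea of restricting output columns to the requested fields; every name in it is an assumption, not the package's formatter:

```python
from typing import Any


def restrict_fields(
    records: list[dict[str, Any]],
    output_fields: list[str] | None,
    always_keep: tuple[str, ...] = ("_changed_fields",),  # bookkeeping column name assumed
) -> list[dict[str, Any]]:
    """Drop columns that were not requested, keeping diff bookkeeping columns."""
    if not output_fields:
        return records
    keep = set(output_fields) | set(always_keep)
    return [{k: v for k, v in rec.items() if k in keep} for rec in records]


rows = [{"name": "Ada", "headline": "Engineer", "follower_count": 10}]
print(restrict_fields(rows, ["name", "headline"]))
# [{'name': 'Ada', 'headline': 'Engineer'}]
```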

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/collector.py

@@ -19,6 +19,7 @@ from anysite.dataset.models import DatasetConfig, DatasetSource
  from anysite.dataset.storage import (
  MetadataStore,
  get_parquet_path,
+ read_latest_parquet,
  read_parquet,
  write_parquet,
  )
@@ -412,9 +413,9 @@ async def _collect_dependent(
  if dep is None:
  raise DatasetError(f"Source {source.id} has no dependency defined")

- # Read parent data
+ # Read parent data (latest snapshot only to avoid schema mismatch)
  parent_dir = base_path / "raw" / dep.from_source
- parent_records = read_parquet(parent_dir)
+ parent_records = read_latest_parquet(parent_dir)

  if not parent_records:
  if not quiet:
@@ -627,7 +628,7 @@ def _count_dependent_inputs(
  if dep is None:
  return None
  parent_dir = base_path / "raw" / dep.from_source
- parent_records = read_parquet(parent_dir)
+ parent_records = read_latest_parquet(parent_dir)
  if not parent_records:
  info = metadata.get_source_info(dep.from_source)
  return info.get("record_count") if info else None
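
`read_latest_parquet` is imported and called above but lives in `storage.py` (changed by +21 −1, not shown in this section). A rough sketch of the behavior the collector now relies on — read only the newest date-named snapshot — assuming one `YYYY-MM-DD.parquet` file per collection date and pyarrow for reading; both are assumptions, the real helper may differ:

```python
from pathlib import Path
from typing import Any

import pyarrow.parquet as pq


def read_latest_parquet(source_dir: Path) -> list[dict[str, Any]]:
    """Read records from the most recent snapshot only (assumed layout: one YYYY-MM-DD.parquet per day)."""
    if not source_dir.exists():
        return []
    files = sorted(source_dir.glob("*.parquet"))
    if not files:
        return []
    # ISO-dated filenames sort lexicographically, so the last file is the newest snapshot.
    return pq.read_table(files[-1]).to_pylist()
```

Reading a single snapshot sidesteps the schema-mismatch problem the new comment mentions: older snapshots of the same source may carry different columns, which breaks a combined read of the whole directory.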

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/db_loader.py

@@ -3,12 +3,18 @@
  from __future__ import annotations

  import json
+ import logging
+ from datetime import date
+ from pathlib import Path
  from typing import Any

  from anysite.dataset.models import DatasetConfig, DatasetSource
  from anysite.dataset.storage import get_source_dir, read_parquet
  from anysite.db.adapters.base import DatabaseAdapter
  from anysite.db.schema.inference import infer_table_schema
+ from anysite.db.utils.sanitize import sanitize_identifier
+
+ logger = logging.getLogger(__name__)


  def _get_dialect(adapter: DatabaseAdapter) -> str:
@@ -86,15 +92,31 @@ def _filter_record(
  return {k: v for k, v in record.items() if k not in exclude}


+ def _get_latest_parquet(base_path: Path, source_id: str) -> Path | None:
+ """Return the path to the most recent snapshot for a source."""
+ source_dir = get_source_dir(base_path, source_id)
+ if not source_dir.exists():
+ return None
+ files = sorted(source_dir.glob("*.parquet"))
+ return files[-1] if files else None
+
+
+ def _get_snapshot_for_date(base_path: Path, source_id: str, d: date) -> Path | None:
+ """Return the parquet path for a specific snapshot date."""
+ source_dir = get_source_dir(base_path, source_id)
+ path = source_dir / f"{d.isoformat()}.parquet"
+ return path if path.exists() else None
+
+
  class DatasetDbLoader:
  """Load dataset Parquet data into a relational database.

- Handles:
- - Schema inference from Parquet records
- - Auto-increment primary keys (``id`` column)
- - Foreign key linking via provenance ``_input_value`` column
- - Dot-notation field extraction for JSON columns
- - Topological loading order (parents before children)
+ Supports diff-based incremental sync when ``db_load.key`` is configured:
+ compares the two most recent snapshots and applies INSERT/DELETE/UPDATE
+ to keep the database in sync.
+
+ Falls back to full INSERT of the latest snapshot when no key is set
+ or when the table doesn't exist yet.
  """

  def __init__(
@@ -115,16 +137,18 @@ class DatasetDbLoader:
  source_filter: str | None = None,
  drop_existing: bool = False,
  dry_run: bool = False,
+ snapshot: str | None = None,
  ) -> dict[str, int]:
  """Load all sources into the database in dependency order.

  Args:
  source_filter: Only load this source (and dependencies).
- drop_existing: Drop tables before creating.
+ drop_existing: Drop tables before creating, then full INSERT latest.
  dry_run: Show plan without executing.
+ snapshot: Load a specific snapshot date (YYYY-MM-DD).

  Returns:
- Mapping of source_id to number of rows loaded.
+ Mapping of source_id to number of rows loaded/affected.
  """
  sources = self.config.topological_sort()

@@ -139,6 +163,7 @@ class DatasetDbLoader:
  source,
  drop_existing=drop_existing,
  dry_run=dry_run,
+ snapshot=snapshot,
  )
  results[source.id] = count

@@ -150,18 +175,64 @@ class DatasetDbLoader:
  *,
  drop_existing: bool = False,
  dry_run: bool = False,
+ snapshot: str | None = None,
  ) -> int:
- """Load a single source into the database."""
- source_dir = get_source_dir(self.base_path, source.id)
- if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
+ """Load a single source into the database.
+
+ Strategy:
+ 1. ``drop_existing``: drop table → full INSERT of latest snapshot
+ 2. ``snapshot``: full INSERT of that specific snapshot
+ 3. Table doesn't exist: full INSERT of latest snapshot
+ 4. Table exists + ``db_load.key`` set + ≥2 snapshots: diff-based sync
+ 5. Fallback: full INSERT of latest snapshot
+ """
+ table_name = _table_name_for(source)
+
+ # Handle drop_existing
+ if drop_existing and self.adapter.table_exists(table_name):
+ self.adapter.execute(f"DROP TABLE {table_name}")
+
+ # Determine which parquet to load
+ if snapshot:
+ snapshot_date = date.fromisoformat(snapshot)
+ parquet_path = _get_snapshot_for_date(self.base_path, source.id, snapshot_date)
+ if parquet_path is None:
+ return 0
+ return self._full_insert(source, table_name, parquet_path, dry_run=dry_run)
+
+ # Check if we can do diff-based sync
+ diff_key = source.db_load.key if source.db_load else None
+ table_exists = self.adapter.table_exists(table_name)
+
+ if diff_key and table_exists and not drop_existing:
+ from anysite.dataset.differ import DatasetDiffer
+ differ = DatasetDiffer(self.base_path)
+ dates = differ.available_dates(source.id)
+
+ if len(dates) >= 2:
+ return self._diff_sync(
+ source, table_name, diff_key, differ, dates, dry_run=dry_run
+ )
+
+ # Fallback: full INSERT of latest snapshot
+ latest = _get_latest_parquet(self.base_path, source.id)
+ if latest is None:
  return 0
+ return self._full_insert(source, table_name, latest, dry_run=dry_run)

- raw_records = read_parquet(source_dir)
+ def _full_insert(
+ self,
+ source: DatasetSource,
+ table_name: str,
+ parquet_path: Path,
+ *,
+ dry_run: bool = False,
+ ) -> int:
+ """Full INSERT: read parquet, transform, create table if needed, insert all rows."""
+ raw_records = read_parquet(parquet_path)
  if not raw_records:
  return 0

- table_name = _table_name_for(source)
-
  # Determine parent info for FK linking
  parent_source_id = None
  parent_fk_col = None
@@ -174,7 +245,6 @@ class DatasetDbLoader:
  for record in raw_records:
  row = _filter_record(record, source)

- # Add FK column if this is a dependent source
  if parent_source_id and parent_fk_col:
  input_val = record.get("_input_value")
  parent_map = self._value_to_id.get(parent_source_id, {})
@@ -189,17 +259,12 @@ class DatasetDbLoader:
  return len(rows)

  # Determine the lookup field for children to reference this source
- # This is the field that child dependencies extract from this source
  lookup_field = self._get_child_lookup_field(source)

- # Create table
- if drop_existing and self.adapter.table_exists(table_name):
- self.adapter.execute(f"DROP TABLE {table_name}")
-
+ # Create table if needed
  if not self.adapter.table_exists(table_name):
  schema = infer_table_schema(table_name, rows)
  sql_types = schema.to_sql_types(self._dialect)
- # Add auto-increment id column
  col_defs = {"id": self._auto_id_type()}
  col_defs.update(sql_types)
  self.adapter.create_table(table_name, col_defs, primary_key="id")
@@ -208,10 +273,8 @@ class DatasetDbLoader:
  value_map: dict[str, int] = {}
  for i, row in enumerate(rows):
  self.adapter.insert_batch(table_name, [row])
- # Get the last inserted id
  last_id = self._get_last_id(table_name)

- # Build value→id map for child sources
  if lookup_field and last_id is not None:
  raw_record = raw_records[i]
  lookup_val = _extract_dot_value(raw_record, lookup_field)
@@ -225,6 +288,86 @@ class DatasetDbLoader:

  return len(rows)

+ def _diff_sync(
+ self,
+ source: DatasetSource,
+ table_name: str,
+ diff_key: str,
+ differ: Any,
+ dates: list[date],
+ *,
+ dry_run: bool = False,
+ ) -> int:
+ """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
+ result = differ.diff(source.id, diff_key)
+ total = 0
+ sync_mode = source.db_load.sync if source.db_load else "full"
+
+ if dry_run:
+ count = len(result.added) + len(result.changed)
+ if sync_mode == "full":
+ count += len(result.removed)
+ return count
+
+ # Extract key value from a record (handles dot-notation)
+ def _get_key_val(record: dict[str, Any]) -> Any:
+ if "." in diff_key:
+ return _extract_dot_value(record, diff_key)
+ return record.get(diff_key)
+
+ # Determine the DB column name for the key
+ db_key_col = diff_key.replace(".", "_")
+
+ # INSERT added records
+ if result.added:
+ for record in result.added:
+ row = _filter_record(record, source)
+ self.adapter.insert_batch(table_name, [row])
+ total += 1
+
+ # DELETE removed records (skipped in append mode)
+ if result.removed and sync_mode == "full":
+ safe_col = sanitize_identifier(db_key_col)
+ for record in result.removed:
+ key_val = _get_key_val(record)
+ if key_val is not None:
+ self.adapter.execute(
+ f"DELETE FROM {table_name} WHERE {safe_col} = ?",
+ (str(key_val),),
+ )
+ total += 1
+
+ # UPDATE changed records
+ if result.changed:
+ safe_col = sanitize_identifier(db_key_col)
+ for record in result.changed:
+ key_val = _get_key_val(record)
+ if key_val is None:
+ continue
+ changed_fields = record.get("_changed_fields", [])
+ if not changed_fields:
+ continue
+
+ # Build SET clause from changed fields
+ set_parts = []
+ params: list[Any] = []
+ for field_name in changed_fields:
+ new_val = record.get(field_name)
+ safe_field = sanitize_identifier(field_name)
+ set_parts.append(f"{safe_field} = ?")
+ params.append(new_val)
+
+ params.append(str(key_val))
+ sql = (
+ f"UPDATE {table_name} "
+ f"SET {', '.join(set_parts)} "
+ f"WHERE {safe_col} = ?"
+ )
+ self.adapter.execute(sql, tuple(params))
+ total += 1
+
+ return total
+
  def _get_child_lookup_field(self, source: DatasetSource) -> str | None:
  """Find which field children use to reference this source."""
  for other in self.config.sources:
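
`_extract_dot_value`, used by both `_full_insert` and `_diff_sync` above, is pre-existing code that this diff does not touch or show. A minimal sketch of what such a dot-notation helper typically does — walk nested dicts, decoding JSON strings along the way — offered purely as an assumption about its behavior, not the package's implementation:

```python
import json
from typing import Any


def extract_dot_value(record: dict[str, Any], path: str) -> Any:
    """Resolve a dot-notation path such as 'urn.value' against a possibly JSON-encoded nested record."""
    current: Any = record
    for part in path.split("."):
        if isinstance(current, str):
            # Parquet round-trips often leave nested objects as JSON strings.
            try:
                current = json.loads(current)
            except (ValueError, TypeError):
                return None
        if not isinstance(current, dict):
            return None
        current = current.get(part)
    return current


print(extract_dot_value({"urn": {"value": "abc123"}}, "urn.value"))     # abc123
print(extract_dot_value({"urn": '{"value": "abc123"}'}, "urn.value"))   # abc123
```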