anysite-cli 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anysite/dataset/cli.py CHANGED
@@ -357,6 +357,10 @@ def load_db(
         bool,
         typer.Option("--quiet", "-q", help="Suppress progress output"),
     ] = False,
+    snapshot: Annotated[
+        str | None,
+        typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
+    ] = None,
 ) -> None:
     """Load collected Parquet data into a relational database with FK linking."""
     config = _load_config(config_path)
@@ -379,6 +383,7 @@ def load_db(
             source_filter=source,
             drop_existing=drop_existing,
             dry_run=dry_run,
+            snapshot=snapshot,
         )
     except Exception as e:
         typer.echo(f"Load error: {e}", err=True)
@@ -519,7 +524,11 @@ def diff_cmd(
         return

     # Format and output
-    rows = format_diff_table(result) if format == "table" else format_diff_records(result)
+    rows = (
+        format_diff_table(result, output_fields=field_list)
+        if format == "table"
+        else format_diff_records(result, output_fields=field_list)
+    )

     _output_results(rows, format, output)

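A minimal sketch of where field_list could come from: the diff command's --fields option (shown in the README excerpt near the end of this diff) takes a comma-separated value. The parsing below is an assumption for illustration; only format_diff_table, format_diff_records, and field_list appear in this diff.

    # Hedged sketch: turning a --fields value into the field_list passed above.
    fields_opt = "name,headline"  # e.g. --fields "name,headline"; option handling is assumed
    field_list = [f.strip() for f in fields_opt.split(",")] if fields_opt else None
    print(field_list)  # ['name', 'headline']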
anysite/dataset/collector.py CHANGED
@@ -19,6 +19,7 @@ from anysite.dataset.models import DatasetConfig, DatasetSource
 from anysite.dataset.storage import (
     MetadataStore,
     get_parquet_path,
+    read_latest_parquet,
     read_parquet,
     write_parquet,
 )
@@ -412,9 +413,9 @@ async def _collect_dependent(
     if dep is None:
         raise DatasetError(f"Source {source.id} has no dependency defined")

-    # Read parent data
+    # Read parent data (latest snapshot only to avoid schema mismatch)
     parent_dir = base_path / "raw" / dep.from_source
-    parent_records = read_parquet(parent_dir)
+    parent_records = read_latest_parquet(parent_dir)

     if not parent_records:
         if not quiet:
@@ -627,7 +628,7 @@ def _count_dependent_inputs(
     if dep is None:
         return None
     parent_dir = base_path / "raw" / dep.from_source
-    parent_records = read_parquet(parent_dir)
+    parent_records = read_latest_parquet(parent_dir)
     if not parent_records:
         info = metadata.get_source_info(dep.from_source)
         return info.get("record_count") if info else None
anysite/dataset/db_loader.py CHANGED
@@ -3,12 +3,18 @@
 from __future__ import annotations

 import json
+import logging
+from datetime import date
+from pathlib import Path
 from typing import Any

 from anysite.dataset.models import DatasetConfig, DatasetSource
 from anysite.dataset.storage import get_source_dir, read_parquet
 from anysite.db.adapters.base import DatabaseAdapter
 from anysite.db.schema.inference import infer_table_schema
+from anysite.db.utils.sanitize import sanitize_identifier
+
+logger = logging.getLogger(__name__)


 def _get_dialect(adapter: DatabaseAdapter) -> str:
@@ -86,15 +92,31 @@ def _filter_record(
     return {k: v for k, v in record.items() if k not in exclude}


+def _get_latest_parquet(base_path: Path, source_id: str) -> Path | None:
+    """Return the path to the most recent snapshot for a source."""
+    source_dir = get_source_dir(base_path, source_id)
+    if not source_dir.exists():
+        return None
+    files = sorted(source_dir.glob("*.parquet"))
+    return files[-1] if files else None
+
+
+def _get_snapshot_for_date(base_path: Path, source_id: str, d: date) -> Path | None:
+    """Return the parquet path for a specific snapshot date."""
+    source_dir = get_source_dir(base_path, source_id)
+    path = source_dir / f"{d.isoformat()}.parquet"
+    return path if path.exists() else None
+
+
 class DatasetDbLoader:
     """Load dataset Parquet data into a relational database.

-    Handles:
-    - Schema inference from Parquet records
-    - Auto-increment primary keys (``id`` column)
-    - Foreign key linking via provenance ``_input_value`` column
-    - Dot-notation field extraction for JSON columns
-    - Topological loading order (parents before children)
+    Supports diff-based incremental sync when ``db_load.key`` is configured:
+    compares the two most recent snapshots and applies INSERT/DELETE/UPDATE
+    to keep the database in sync.
+
+    Falls back to full INSERT of the latest snapshot when no key is set
+    or when the table doesn't exist yet.
     """

     def __init__(
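Both helpers assume one dated Parquet file per snapshot under the source's raw directory. A minimal sketch of that layout and what each helper resolves to (the concrete path is illustrative):

    from datetime import date
    from pathlib import Path

    source_dir = Path("data/raw/companies")        # illustrative; layout is <base_path>/raw/<source_id>
    files = sorted(source_dir.glob("*.parquet"))    # e.g. 2026-01-14.parquet, 2026-01-15.parquet
    latest = files[-1] if files else None           # what _get_latest_parquet returns
    wanted = source_dir / f"{date(2026, 1, 15).isoformat()}.parquet"  # what _get_snapshot_for_date checks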
@@ -115,16 +137,18 @@ class DatasetDbLoader:
         source_filter: str | None = None,
         drop_existing: bool = False,
         dry_run: bool = False,
+        snapshot: str | None = None,
     ) -> dict[str, int]:
         """Load all sources into the database in dependency order.

         Args:
             source_filter: Only load this source (and dependencies).
-            drop_existing: Drop tables before creating.
+            drop_existing: Drop tables before creating, then full INSERT latest.
             dry_run: Show plan without executing.
+            snapshot: Load a specific snapshot date (YYYY-MM-DD).

         Returns:
-            Mapping of source_id to number of rows loaded.
+            Mapping of source_id to number of rows loaded/affected.
         """
         sources = self.config.topological_sort()

@@ -139,6 +163,7 @@ class DatasetDbLoader:
                 source,
                 drop_existing=drop_existing,
                 dry_run=dry_run,
+                snapshot=snapshot,
             )
             results[source.id] = count

@@ -150,18 +175,64 @@ class DatasetDbLoader:
         *,
         drop_existing: bool = False,
         dry_run: bool = False,
+        snapshot: str | None = None,
     ) -> int:
-        """Load a single source into the database."""
-        source_dir = get_source_dir(self.base_path, source.id)
-        if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
+        """Load a single source into the database.
+
+        Strategy:
+        1. ``drop_existing``: drop table → full INSERT of latest snapshot
+        2. ``snapshot``: full INSERT of that specific snapshot
+        3. Table doesn't exist: full INSERT of latest snapshot
+        4. Table exists + ``db_load.key`` set + ≥2 snapshots: diff-based sync
+        5. Fallback: full INSERT of latest snapshot
+        """
+        table_name = _table_name_for(source)
+
+        # Handle drop_existing
+        if drop_existing and self.adapter.table_exists(table_name):
+            self.adapter.execute(f"DROP TABLE {table_name}")
+
+        # Determine which parquet to load
+        if snapshot:
+            snapshot_date = date.fromisoformat(snapshot)
+            parquet_path = _get_snapshot_for_date(self.base_path, source.id, snapshot_date)
+            if parquet_path is None:
+                return 0
+            return self._full_insert(source, table_name, parquet_path, dry_run=dry_run)
+
+        # Check if we can do diff-based sync
+        diff_key = source.db_load.key if source.db_load else None
+        table_exists = self.adapter.table_exists(table_name)
+
+        if diff_key and table_exists and not drop_existing:
+            from anysite.dataset.differ import DatasetDiffer
+            differ = DatasetDiffer(self.base_path)
+            dates = differ.available_dates(source.id)
+
+            if len(dates) >= 2:
+                return self._diff_sync(
+                    source, table_name, diff_key, differ, dates, dry_run=dry_run
+                )
+
+        # Fallback: full INSERT of latest snapshot
+        latest = _get_latest_parquet(self.base_path, source.id)
+        if latest is None:
             return 0
+        return self._full_insert(source, table_name, latest, dry_run=dry_run)

-        raw_records = read_parquet(source_dir)
+    def _full_insert(
+        self,
+        source: DatasetSource,
+        table_name: str,
+        parquet_path: Path,
+        *,
+        dry_run: bool = False,
+    ) -> int:
+        """Full INSERT: read parquet, transform, create table if needed, insert all rows."""
+        raw_records = read_parquet(parquet_path)
         if not raw_records:
             return 0

-        table_name = _table_name_for(source)
-
         # Determine parent info for FK linking
         parent_source_id = None
         parent_fk_col = None
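The strategy order in the docstring can be summarised as a small standalone function. This is an illustrative sketch, not the package's code, and the names here are made up for the example:

    def pick_strategy(*, snapshot: str | None, drop_existing: bool,
                      table_exists: bool, diff_key: str | None, n_snapshots: int) -> str:
        # Mirrors the 1-5 order documented in the method above.
        if snapshot:
            return "full INSERT of the requested snapshot"
        if diff_key and table_exists and not drop_existing and n_snapshots >= 2:
            return "diff-based sync"
        return "full INSERT of the latest snapshot"

    print(pick_strategy(snapshot=None, drop_existing=False,
                        table_exists=True, diff_key="urn.value", n_snapshots=3))
    # -> diff-based sync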
@@ -174,7 +245,6 @@ class DatasetDbLoader:
         for record in raw_records:
             row = _filter_record(record, source)

-            # Add FK column if this is a dependent source
             if parent_source_id and parent_fk_col:
                 input_val = record.get("_input_value")
                 parent_map = self._value_to_id.get(parent_source_id, {})
@@ -189,17 +259,12 @@ class DatasetDbLoader:
             return len(rows)

         # Determine the lookup field for children to reference this source
-        # This is the field that child dependencies extract from this source
         lookup_field = self._get_child_lookup_field(source)

-        # Create table
-        if drop_existing and self.adapter.table_exists(table_name):
-            self.adapter.execute(f"DROP TABLE {table_name}")
-
+        # Create table if needed
         if not self.adapter.table_exists(table_name):
             schema = infer_table_schema(table_name, rows)
             sql_types = schema.to_sql_types(self._dialect)
-            # Add auto-increment id column
             col_defs = {"id": self._auto_id_type()}
             col_defs.update(sql_types)
             self.adapter.create_table(table_name, col_defs, primary_key="id")
@@ -208,10 +273,8 @@ class DatasetDbLoader:
         value_map: dict[str, int] = {}
         for i, row in enumerate(rows):
             self.adapter.insert_batch(table_name, [row])
-            # Get the last inserted id
             last_id = self._get_last_id(table_name)

-            # Build value→id map for child sources
             if lookup_field and last_id is not None:
                 raw_record = raw_records[i]
                 lookup_val = _extract_dot_value(raw_record, lookup_field)
@@ -225,6 +288,86 @@ class DatasetDbLoader:

         return len(rows)

+    def _diff_sync(
+        self,
+        source: DatasetSource,
+        table_name: str,
+        diff_key: str,
+        differ: Any,
+        dates: list[date],
+        *,
+        dry_run: bool = False,
+    ) -> int:
+        """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
+        result = differ.diff(source.id, diff_key)
+        total = 0
+        sync_mode = source.db_load.sync if source.db_load else "full"
+
+        if dry_run:
+            count = len(result.added) + len(result.changed)
+            if sync_mode == "full":
+                count += len(result.removed)
+            return count
+
+        # Extract key value from a record (handles dot-notation)
+        def _get_key_val(record: dict[str, Any]) -> Any:
+            if "." in diff_key:
+                return _extract_dot_value(record, diff_key)
+            return record.get(diff_key)
+
+        # Determine the DB column name for the key
+        db_key_col = diff_key.replace(".", "_")
+
+        # INSERT added records
+        if result.added:
+            for record in result.added:
+                row = _filter_record(record, source)
+                self.adapter.insert_batch(table_name, [row])
+                total += 1
+
+        # DELETE removed records (skipped in append mode)
+        if result.removed and sync_mode == "full":
+            safe_col = sanitize_identifier(db_key_col)
+            for record in result.removed:
+                key_val = _get_key_val(record)
+                if key_val is not None:
+                    self.adapter.execute(
+                        f"DELETE FROM {table_name} WHERE {safe_col} = ?",
+                        (str(key_val),),
+                    )
+                    total += 1
+
+        # UPDATE changed records
+        if result.changed:
+            safe_col = sanitize_identifier(db_key_col)
+            for record in result.changed:
+                key_val = _get_key_val(record)
+                if key_val is None:
+                    continue
+                changed_fields = record.get("_changed_fields", [])
+                if not changed_fields:
+                    continue
+
+                # Build SET clause from changed fields
+                set_parts = []
+                params: list[Any] = []
+                for field_name in changed_fields:
+                    new_val = record.get(field_name)
+                    safe_field = sanitize_identifier(field_name)
+                    set_parts.append(f"{safe_field} = ?")
+                    params.append(new_val)
+
+                params.append(str(key_val))
+                sql = (
+                    f"UPDATE {table_name} "
+                    f"SET {', '.join(set_parts)} "
+                    f"WHERE {safe_col} = ?"
+                )
+                self.adapter.execute(sql, tuple(params))
+                total += 1
+
+        return total
+
     def _get_child_lookup_field(self, source: DatasetSource) -> str | None:
         """Find which field children use to reference this source."""
         for other in self.config.sources:
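A worked trace of the UPDATE this method would assemble for a single changed record, assuming db_load.key is "urn.value", the target table is named "employees" (illustrative), and sanitize_identifier passes these simple names through unchanged (also an assumption):

    record = {"name": "Acme", "headline": "New tagline", "_changed_fields": ["headline"]}
    key_val = "abc123"                              # extracted from the record's urn.value
    db_key_col = "urn.value".replace(".", "_")      # -> "urn_value"
    set_parts = [f"{f} = ?" for f in record["_changed_fields"]]
    sql = f"UPDATE employees SET {', '.join(set_parts)} WHERE {db_key_col} = ?"
    params = [record[f] for f in record["_changed_fields"]] + [key_val]
    print(sql)     # UPDATE employees SET headline = ? WHERE urn_value = ?
    print(params)  # ['New tagline', 'abc123']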
anysite/dataset/differ.py CHANGED
@@ -12,6 +12,31 @@ from anysite.dataset.errors import DatasetError
 from anysite.dataset.storage import get_source_dir


+def _build_key_expr(key: str, all_columns: list[str]) -> tuple[str, str]:
+    """Build a DuckDB key expression, supporting dot-notation for JSON fields.
+
+    Returns:
+        (key_expr, key_alias) — the SQL expression and a display alias.
+        For simple keys: ('"field"', 'field')
+        For dot-notation: ("json_extract_string(\"urn\", '$.value')", 'urn.value')
+    """
+    if "." not in key:
+        if key not in all_columns:
+            raise DatasetError(
+                f"Key field '{key}' not found. "
+                f"Available: {', '.join(all_columns)}"
+            )
+        return f'"{key}"', key
+
+    root, rest = key.split(".", 1)
+    if root not in all_columns:
+        raise DatasetError(
+            f"Root field '{root}' (from key '{key}') not found. "
+            f"Available: {', '.join(all_columns)}"
+        )
+    return f"json_extract_string(\"{root}\", '$.{rest}')", key
+
+
 @dataclass
 class DiffResult:
     """Result of comparing two dataset snapshots."""
@@ -24,6 +49,7 @@ class DiffResult:
     removed: list[dict[str, Any]] = field(default_factory=list)
     changed: list[dict[str, Any]] = field(default_factory=list)
     unchanged_count: int = 0
+    fields: list[str] | None = field(default=None)

     @property
     def has_changes(self) -> bool:
@@ -63,10 +89,11 @@ class DatasetDiffer:

         Args:
             source_id: Source to compare.
-            key: Field to match records by (e.g., ``_input_value``, ``urn``).
+            key: Field to match records by. Supports dot-notation for
+                JSON fields (e.g., ``urn.value``).
             from_date: Older snapshot date (default: second-to-last).
             to_date: Newer snapshot date (default: latest).
-            fields: Only compare these fields (default: all).
+            fields: Only compare (and output) these fields (default: all).

         Returns:
             DiffResult with added, removed, changed lists.
@@ -153,50 +180,43 @@ class DatasetDiffer:
             info = conn.execute("DESCRIBE _new").fetchall()
             all_columns = [col[0] for col in info]

-            if key not in all_columns:
-                raise DatasetError(
-                    f"Key field '{key}' not found in {source_id}. "
-                    f"Available: {', '.join(all_columns)}"
-                )
+            # Build key expression (supports dot-notation)
+            key_expr, key_alias = _build_key_expr(key, all_columns)

             # Determine which fields to compare
             compare_fields = fields if fields else [
-                c for c in all_columns if c != key
+                c for c in all_columns if c != key and c != key.split(".")[0]
             ]
             # Filter to fields that actually exist
             compare_fields = [c for c in compare_fields if c in all_columns]

-            quoted_key = f'"{key}"'
+            # Determine output columns: if fields specified, restrict to key + fields
+            if fields:
+                output_columns = [key_alias] + [
+                    f for f in fields if f in all_columns
+                ]
+            else:
+                output_columns = None  # all columns

             # Added: in new but not in old
-            added = conn.execute(
-                f"SELECT * FROM _new "
-                f"WHERE {quoted_key} NOT IN (SELECT {quoted_key} FROM _old)"
-            ).fetchall()
-            added_cols = [d[0] for d in conn.execute(
-                "DESCRIBE _new"
-            ).fetchall()]
-            added_records = [dict(zip(added_cols, row, strict=False)) for row in added]
+            added_records = self._query_added_removed(
+                conn, "_new", "_old", key_expr, key_alias, all_columns, output_columns
+            )

             # Removed: in old but not in new
-            removed = conn.execute(
-                f"SELECT * FROM _old "
-                f"WHERE {quoted_key} NOT IN (SELECT {quoted_key} FROM _new)"
-            ).fetchall()
-            removed_cols = [d[0] for d in conn.execute(
-                "DESCRIBE _old"
-            ).fetchall()]
-            removed_records = [dict(zip(removed_cols, row, strict=False)) for row in removed]
+            removed_records = self._query_added_removed(
+                conn, "_old", "_new", key_expr, key_alias, all_columns, output_columns
+            )

             # Changed: matching key, different values
             changed_records = self._find_changed(
-                conn, key, compare_fields, all_columns
+                conn, key_expr, key_alias, compare_fields, all_columns, output_columns
             )

             # Count unchanged
             total_matched = conn.execute(
                 f"SELECT COUNT(*) FROM _new n "
-                f"JOIN _old o ON n.{quoted_key} = o.{quoted_key}"
+                f"JOIN _old o ON ({_requalify(key_expr, 'n')}) = ({_requalify(key_expr, 'o')})"
             ).fetchone()
             matched_count = total_matched[0] if total_matched else 0
             unchanged_count = matched_count - len(changed_records)
@@ -210,23 +230,59 @@ class DatasetDiffer:
                 removed=removed_records,
                 changed=changed_records,
                 unchanged_count=unchanged_count,
+                fields=fields,
             )
         finally:
             conn.close()

+    @staticmethod
+    def _query_added_removed(
+        conn: Any,
+        present_view: str,
+        absent_view: str,
+        key_expr: str,
+        key_alias: str,
+        all_columns: list[str],
+        output_columns: list[str] | None,
+    ) -> list[dict[str, Any]]:
+        """Query records present in one view but not the other."""
+        # Build SELECT list
+        if output_columns:
+            select_parts = []
+            for col in output_columns:
+                if col == key_alias and "." in col:
+                    select_parts.append(f"{key_expr} AS \"{key_alias}\"")
+                else:
+                    select_parts.append(f'"{col}"')
+            select_clause = ", ".join(select_parts)
+        else:
+            if "." in key_alias:
+                select_clause = f"*, {key_expr} AS \"{key_alias}\""
+            else:
+                select_clause = "*"
+
+        sql = (
+            f"SELECT {select_clause} FROM {present_view} "
+            f"WHERE ({key_expr}) NOT IN (SELECT ({key_expr}) FROM {absent_view})"
+        )
+        result = conn.execute(sql)
+        columns = [desc[0] for desc in result.description]
+        rows = result.fetchall()
+        return [dict(zip(columns, row, strict=False)) for row in rows]
+
+    @staticmethod
     def _find_changed(
-        self,
         conn: Any,
-        key: str,
+        key_expr: str,
+        key_alias: str,
         compare_fields: list[str],
         all_columns: list[str],
+        output_columns: list[str] | None,
    ) -> list[dict[str, Any]]:
         """Find records that exist in both snapshots but have different values."""
         if not compare_fields:
             return []

-        quoted_key = f'"{key}"'
-
         # Build WHERE clause: any compared field differs
         where_parts = []
         for col in compare_fields:
@@ -234,21 +290,43 @@
             where_parts.append(f"n.{qc} IS DISTINCT FROM o.{qc}")
         where_clause = " OR ".join(where_parts)

-        # Select new values + old values for compared fields
-        select_parts = [f"n.{quoted_key}"]
-        for col in all_columns:
-            if col != key:
-                qc = f'"{col}"'
-                select_parts.append(f"n.{qc}")
-        for col in compare_fields:
-            qc = f'"{col}"'
-            select_parts.append(f"o.{qc} AS \"{col}__old\"")
+        # Build JOIN condition
+        join_key_n = _requalify(key_expr, "n")
+        join_key_o = _requalify(key_expr, "o")
+        join_cond = f"({join_key_n}) = ({join_key_o})"
+
+        # Build SELECT: key + output fields + __old for compare fields
+        if output_columns:
+            # Restricted output
+            select_parts = []
+            for col in output_columns:
+                if col == key_alias and "." in col:
+                    select_parts.append(f"{_requalify(key_expr, 'n')} AS \"{key_alias}\"")
+                else:
+                    select_parts.append(f"n.\"{col}\"")
+            for col in compare_fields:
+                # Include __old for compare fields that are in output
+                if col in [c for c in output_columns if c != key_alias]:
+                    select_parts.append(f"o.\"{col}\" AS \"{col}__old\"")
+        else:
+            # Full output
+            select_parts = []
+            if "." in key_alias:
+                select_parts.append(f"{_requalify(key_expr, 'n')} AS \"{key_alias}\"")
+            else:
+                select_parts.append(f"n.\"{key_alias}\"")
+            for col in all_columns:
+                if col == key_alias:
+                    continue
+                select_parts.append(f"n.\"{col}\"")
+            for col in compare_fields:
+                select_parts.append(f"o.\"{col}\" AS \"{col}__old\"")

         select_clause = ", ".join(select_parts)

         sql = (
             f"SELECT {select_clause} FROM _new n "
-            f"JOIN _old o ON n.{quoted_key} = o.{quoted_key} "
+            f"JOIN _old o ON {join_cond} "
             f"WHERE {where_clause}"
         )

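As a worked trace of the branches above: with key "urn.value" and fields restricted to ["headline"] (so output_columns is ["urn.value", "headline"] and compare_fields is ["headline"]), the assembled query would look roughly like the string below. The _new/_old view names come from the surrounding code; whitespace is compacted for readability.

    sql = (
        'SELECT json_extract_string(n."urn", \'$.value\') AS "urn.value", '
        'n."headline", o."headline" AS "headline__old" '
        'FROM _new n '
        'JOIN _old o ON (json_extract_string(n."urn", \'$.value\'))'
        ' = (json_extract_string(o."urn", \'$.value\')) '
        'WHERE n."headline" IS DISTINCT FROM o."headline"'
    )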
@@ -266,11 +344,32 @@ class DatasetDiffer:
                 old_val = record.get(old_key)
                 if _values_differ(new_val, old_val):
                     changed_fields.append(col)
+            # Fallback: DuckDB detected a change but Python comparison missed it
+            if not changed_fields:
+                changed_fields = list(compare_fields)
             record["_changed_fields"] = changed_fields

         return records


+def _requalify(key_expr: str, prefix: str) -> str:
+    """Requalify a key expression with a table alias prefix.
+
+    For simple keys like '"field"', returns 'prefix."field"'.
+    For json_extract_string("col", '$.path'), returns
+    json_extract_string(prefix."col", '$.path').
+    """
+    if key_expr.startswith("json_extract_string("):
+        # Replace the column reference inside json_extract_string
+        inner = key_expr[len("json_extract_string("):]
+        # inner looks like: "col", '$.path')
+        col_end = inner.index(",")
+        col = inner[:col_end].strip()
+        rest = inner[col_end:]
+        return f"json_extract_string({prefix}.{col}{rest}"
+    return f"{prefix}.{key_expr}"
+
+
 def _values_differ(a: Any, b: Any) -> bool:
     """Compare two values, treating JSON strings as equivalent to their parsed form."""
     if a == b:
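Usage sketch showing both expression shapes _requalify handles, assuming anysite-cli 0.1.5 and its dependencies are installed (the function is module-private):

    from anysite.dataset.differ import _requalify

    print(_requalify('"urn"', "n"))
    # n."urn"
    print(_requalify('json_extract_string("urn", \'$.value\')', "o"))
    # json_extract_string(o."urn", '$.value')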
@@ -281,24 +380,42 @@ def _values_differ(a: Any, b: Any) -> bool:
             return json.loads(a) != json.loads(b)
         except (json.JSONDecodeError, ValueError):
             pass
+    # Handle complex types (dict, list) — compare via JSON serialization
+    # to catch differences DuckDB sees but Python equality misses
+    if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
+        try:
+            return json.dumps(a, sort_keys=True, default=str) != json.dumps(
+                b, sort_keys=True, default=str
+            )
+        except (TypeError, ValueError):
+            pass
     return True


-def format_diff_table(result: DiffResult) -> list[dict[str, Any]]:
+def format_diff_table(
+    result: DiffResult,
+    *,
+    output_fields: list[str] | None = None,
+) -> list[dict[str, Any]]:
     """Format a DiffResult into a flat list of dicts for table/json output.

     Each record gets a ``_diff`` column with value ``added``, ``removed``,
     or ``changed``. For changed records in table mode, modified field
     values are formatted as ``old → new``.
+
+    Args:
+        result: The diff result.
+        output_fields: If set, only include these fields (plus ``_diff`` and key).
     """
+    allowed = _build_allowed_set(result.key, output_fields)
     rows: list[dict[str, Any]] = []

     for record in result.added:
-        row = {"_diff": "added", **record}
+        row = {"_diff": "added", **_filter_row(record, allowed)}
         rows.append(row)

     for record in result.removed:
-        row = {"_diff": "removed", **record}
+        row = {"_diff": "removed", **_filter_row(record, allowed)}
         rows.append(row)

     for record in result.changed:
@@ -309,6 +426,8 @@ def format_diff_table(result: DiffResult) -> list[dict[str, Any]]:
                 continue
             if k.endswith("__old"):
                 continue
+            if allowed and k not in allowed:
+                continue
             # For changed fields, format as "old → new"
             if k in changed_fields:
                 old_val = record.get(f"{k}__old")
@@ -320,31 +439,67 @@ def format_diff_table(result: DiffResult) -> list[dict[str, Any]]:
     return rows


-def format_diff_records(result: DiffResult) -> list[dict[str, Any]]:
+def format_diff_records(
+    result: DiffResult,
+    *,
+    output_fields: list[str] | None = None,
+) -> list[dict[str, Any]]:
     """Format a DiffResult for JSON/CSV output.

     Each record gets ``_diff`` column. Changed records include both
     current values and ``field__old`` columns.
+
+    Args:
+        result: The diff result.
+        output_fields: If set, only include these fields (plus ``_diff``, key, and ``__old``).
     """
+    allowed = _build_allowed_set(result.key, output_fields)
     rows: list[dict[str, Any]] = []

     for record in result.added:
-        rows.append({"_diff": "added", **record})
+        rows.append({"_diff": "added", **_filter_row(record, allowed)})

     for record in result.removed:
-        rows.append({"_diff": "removed", **record})
+        rows.append({"_diff": "removed", **_filter_row(record, allowed)})

     for record in result.changed:
-        row = {"_diff": "changed"}
+        row: dict[str, Any] = {"_diff": "changed"}
+        changed_fields = record.get("_changed_fields", [])
+        row["_changed_fields"] = changed_fields
         for k, v in record.items():
             if k == "_changed_fields":
                 continue
+            if allowed and k not in allowed and not k.endswith("__old"):
+                continue
+            if k.endswith("__old") and allowed:
+                base = k[: -len("__old")]
+                if base not in allowed:
+                    continue
             row[k] = v
         rows.append(row)

     return rows


+def _build_allowed_set(key: str, output_fields: list[str] | None) -> set[str] | None:
+    """Build the set of allowed field names for output filtering."""
+    if not output_fields:
+        return None
+    allowed = set(output_fields)
+    allowed.add(key)
+    # Also add the root column for dot-notation keys
+    if "." in key:
+        allowed.add(key.split(".")[0])
+    return allowed
+
+
+def _filter_row(record: dict[str, Any], allowed: set[str] | None) -> dict[str, Any]:
+    """Filter a record to only allowed fields."""
+    if not allowed:
+        return record
+    return {k: v for k, v in record.items() if k in allowed}
+
+
 def _format_val(v: Any) -> str:
     """Format a value for display, truncating long strings."""
     if v is None:
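A small standalone sketch of what the two helpers above do when output fields "name,headline" are combined with key "urn.value" (the record contents are made up for the example):

    output_fields = ["name", "headline"]
    key = "urn.value"
    allowed = set(output_fields) | {key, key.split(".")[0]}   # {'name', 'headline', 'urn.value', 'urn'}
    record = {"urn.value": "abc123", "name": "Acme", "headline": "New", "location": "Berlin"}
    filtered = {k: v for k, v in record.items() if k in allowed}
    print(filtered)   # 'location' is dropped; the key and requested fields are kept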
anysite/dataset/models.py CHANGED
@@ -81,6 +81,11 @@ class DbLoadConfig(BaseModel):
     """Configuration for loading a source into a relational database."""

     table: str | None = Field(default=None, description="Override table name (default: source id)")
+    key: str | None = Field(default=None, description="Unique key field for diff-based DB sync (e.g., urn.value)")
+    sync: Literal["full", "append"] = Field(
+        default="full",
+        description="Sync mode: 'full' applies INSERT/DELETE/UPDATE, 'append' skips DELETE (keeps old records)",
+    )
     fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
     exclude: list[str] = Field(
         default_factory=lambda: ["_input_value", "_parent_source"],
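Since DbLoadConfig is a pydantic model, the new options can also be constructed programmatically; a minimal sketch with illustrative values (the YAML equivalent appears in the README excerpt further down):

    from anysite.dataset.models import DbLoadConfig

    cfg = DbLoadConfig(key="urn.value", sync="append", fields=["name", "url", "headline"])
    print(cfg.sync)  # append; a value outside "full"/"append" would fail validation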
anysite/dataset/storage.py CHANGED
@@ -75,7 +75,7 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
         tables = [pq.read_table(f) for f in files]
         import pyarrow as pa

-        table = pa.concat_tables(tables)
+        table = pa.concat_tables(tables, promote_options="permissive")
     else:
         if not path.exists():
             return []
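The promote_options="permissive" argument lets snapshots whose schemas drifted (a column added or dropped between collection dates) still be concatenated, whereas the default option raises on mismatched schemas. A small self-contained illustration:

    import pyarrow as pa

    t1 = pa.table({"id": [1, 2], "count": [10, 20]})
    t2 = pa.table({"id": [3]})  # 'count' column missing in the newer snapshot
    merged = pa.concat_tables([t1, t2], promote_options="permissive")
    print(merged.column_names)  # ['id', 'count'], with nulls filling the gap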
@@ -84,6 +84,26 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
     return table.to_pylist()


+def read_latest_parquet(path: Path) -> list[dict[str, Any]]:
+    """Read records from the most recent Parquet snapshot in a directory.
+
+    Unlike ``read_parquet(dir)``, this reads only the latest file, avoiding
+    schema mismatch errors when snapshots have different column types.
+
+    Args:
+        path: Directory containing dated .parquet files.
+
+    Returns:
+        List of dicts from the newest snapshot, or [] if none found.
+    """
+    if not path.is_dir():
+        return read_parquet(path)
+    files = sorted(path.glob("*.parquet"))
+    if not files:
+        return []
+    return read_parquet(files[-1])
+
+
 def get_source_dir(base_path: Path, source_id: str) -> Path:
     """Get the raw data directory for a source."""
     return base_path / "raw" / source_id
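Usage sketch, assuming anysite-cli 0.1.5 is installed and a directory of dated snapshots like the one shown (paths are illustrative):

    from pathlib import Path
    from anysite.dataset.storage import read_latest_parquet, read_parquet

    source_dir = Path("data/raw/employees")
    # data/raw/employees/2026-01-14.parquet
    # data/raw/employees/2026-01-15.parquet   <- only this file is read
    latest_records = read_latest_parquet(source_dir)
    all_records = read_parquet(source_dir)    # by contrast, concatenates every snapshot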
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: anysite-cli
-Version: 0.1.3
+Version: 0.1.5
 Summary: CLI for Anysite API - web data extraction for humans and AI agents
 Project-URL: Homepage, https://anysite.io
 Project-URL: Documentation, https://docs.anysite.io/cli
@@ -259,6 +259,8 @@ sources:
       path: ./output/companies-{{date}}.csv
       format: csv
     db_load:
+      key: _input_value  # Unique key for incremental sync
+      sync: full         # full (default) or append (no DELETE)
       fields: [name, url, employee_count]

   - id: employees
@@ -274,6 +276,8 @@ sources:
       count: 5
     refresh: always  # Re-collect every run with --incremental
     db_load:
+      key: urn.value  # Unique key for incremental sync
+      sync: append    # Keep old records (no DELETE on diff)
      fields: [name, url, headline]

 storage:
@@ -318,9 +322,15 @@ anysite dataset query dataset.yaml --interactive
 anysite dataset stats dataset.yaml --source companies
 anysite dataset profile dataset.yaml

-# Load into PostgreSQL with automatic FK linking
+# Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+anysite dataset load-db dataset.yaml -c pg
+
+# Drop and reload from latest snapshot
 anysite dataset load-db dataset.yaml -c pg --drop-existing

+# Load a specific snapshot date
+anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
 # Run history and logs
 anysite dataset history my-dataset
 anysite dataset logs my-dataset --run 42
@@ -328,8 +338,9 @@ anysite dataset logs my-dataset --run 42
 # Generate cron/systemd schedule
 anysite dataset schedule dataset.yaml --incremental --load-db pg

-# Compare snapshots (diff two collection dates)
+# Compare snapshots (diff two collection dates, supports dot-notation keys)
 anysite dataset diff dataset.yaml --source employees --key _input_value
+anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"

 # Reset incremental state
 anysite dataset reset-cursor dataset.yaml
@@ -19,17 +19,17 @@ anysite/config/paths.py,sha256=EmHJD8wlf4Q9IUn8Gp1JQ8Z3ffrIYAt5iHRyImQOf5I,1087
 anysite/config/settings.py,sha256=Hc0j_aCCtkJeL4nHw-EFyfJ8WEDk57G08iNUFquUhpM,5235
 anysite/dataset/__init__.py,sha256=J0sKQkGwVOPtvp6pka7LcdeUEADvjWRs71yRuROzJxI,847
 anysite/dataset/analyzer.py,sha256=8dsPW32SbSaUTy1F0NIed1U45wjiMgQeJ2iWX7hBxRQ,9245
-anysite/dataset/cli.py,sha256=zaCo0kKeA1KNU7EZgW4WwxrP07xuKayPlolfUnCSoYI,22801
-anysite/dataset/collector.py,sha256=6CfJt8fKZZ2xvZWJ7jwnx0V9BnjoJxmBZkm8xWQiU54,23840
-anysite/dataset/db_loader.py,sha256=nlMJrDJiGBX5H1StcjsontSxLXbsFe4rwOEnDehzpk8,8443
-anysite/dataset/differ.py,sha256=hbUwoS73syTkrj0VC0gaJzuB0pVCoQXQMbsNXtpsig8,11634
+anysite/dataset/cli.py,sha256=rEWK1ka-YQ_Vbbj2nMaMYTD9g3wa3ethUWSoaWRSGTY,23066
+anysite/dataset/collector.py,sha256=ZdR3CmQQew_iuJpNtJ4knSrjt0hvkEL4WIaS0IKEkwQ,23927
+anysite/dataset/db_loader.py,sha256=ksvRt-VJISL4Syk2O1-TTkOMj1uGzk7aQARYS2n--U4,13751
+anysite/dataset/differ.py,sha256=jB_VWTb7UuEBWG9nv1ry5xeo9hmWdhA_cTm6Ed43_Uw,17746
 anysite/dataset/errors.py,sha256=r8cZXoIzSeTGCWpeYjntnN0AduCu74YZyWs3sFu17J4,914
 anysite/dataset/exporters.py,sha256=mA2FYbYJbHfrwkXbHDu4g5qPG_JJKnkVciXFKPkF1Vw,3708
 anysite/dataset/history.py,sha256=avFs0ADlM7Hr-ttqC1FfjJiQxvQP20sScM7ZoY4lvU0,5471
-anysite/dataset/models.py,sha256=_f1cg9A4FlQwWGpg-s0b9q5WMlaIRN-ENlpU9CE6mrQ,9695
+anysite/dataset/models.py,sha256=d-bkgu2dUY7_VSgH-oVh84IV3X-KpxRfja0H5WnhauU,9998
 anysite/dataset/notifications.py,sha256=ORzo9XOgOxzLb7rk4pevlKPB_Taf-jejlrtmO4Zgl2c,2367
 anysite/dataset/scheduler.py,sha256=zpbA5tRUQZXr-9lZnG58dvE3E7ZBlAd-U-PTXExe9f0,3339
-anysite/dataset/storage.py,sha256=d03goKLI5NWKJowHwCgGqQkcVTO1NctPxMu-Xu-tru4,5326
+anysite/dataset/storage.py,sha256=ySY822m4lQd6Ip0i3VNPVbHEO6U6zBBwHi-56AXOaXE,5974
 anysite/dataset/transformer.py,sha256=XBI4MiZ_F_IZdootV0GAePaM9-pUadIte7RABbjBipc,6843
 anysite/db/__init__.py,sha256=xGGZHlMt5FUZjI6MAmf2VfyNLypOeXwrRL-gmuTsyl4,1117
 anysite/db/cli.py,sha256=fYuIKWq7eF5mAfZWnXNbtlpITnbYbOFMm2TqU54xIl4,22118
@@ -58,8 +58,8 @@ anysite/streaming/writer.py,sha256=HfMsC4umUdJuNIAPK57YAxEGyTwUmy-zNrqFkwY6aew,4
 anysite/utils/__init__.py,sha256=7SnbxpxKENK-2ecUL5NfnZ9okGI7COKYw4WF46172HM,23
 anysite/utils/fields.py,sha256=bSrHadzNmabL4qubqhXXZoWb_P8KA-3S7_FLVT8nGBc,7410
 anysite/utils/retry.py,sha256=89TbXvavi5t22P2mTYCLAS6SSZoW65gQ0nnYNbYAF0M,2684
-anysite_cli-0.1.3.dist-info/METADATA,sha256=lD_AF5pq5ayHerMVMMWTTkgccwWEsKLBGCwvPfZ5y_Y,11781
-anysite_cli-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-anysite_cli-0.1.3.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
-anysite_cli-0.1.3.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
-anysite_cli-0.1.3.dist-info/RECORD,,
+anysite_cli-0.1.5.dist-info/METADATA,sha256=B4HxyrTZxBbhMb17lb0LoRcne_cRehz8xNUYIvDraMA,12437
+anysite_cli-0.1.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+anysite_cli-0.1.5.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
+anysite_cli-0.1.5.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
+anysite_cli-0.1.5.dist-info/RECORD,,