anysite-cli 0.1.4 → 0.1.6 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anysite/dataset/collector.py +4 -3
- anysite/dataset/db_loader.py +17 -6
- anysite/dataset/differ.py +14 -0
- anysite/dataset/models.py +4 -0
- anysite/dataset/storage.py +21 -1
- {anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/METADATA +14 -3
- {anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/RECORD +10 -10
- {anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/WHEEL +0 -0
- {anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/entry_points.txt +0 -0
- {anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/licenses/LICENSE +0 -0
anysite/dataset/collector.py
CHANGED

@@ -19,6 +19,7 @@ from anysite.dataset.models import DatasetConfig, DatasetSource
 from anysite.dataset.storage import (
     MetadataStore,
     get_parquet_path,
+    read_latest_parquet,
     read_parquet,
     write_parquet,
 )
@@ -412,9 +413,9 @@ async def _collect_dependent(
     if dep is None:
         raise DatasetError(f"Source {source.id} has no dependency defined")
 
-    # Read parent data
+    # Read parent data (latest snapshot only to avoid schema mismatch)
     parent_dir = base_path / "raw" / dep.from_source
-    parent_records = read_parquet(parent_dir)
+    parent_records = read_latest_parquet(parent_dir)
 
     if not parent_records:
         if not quiet:
@@ -627,7 +628,7 @@ def _count_dependent_inputs(
     if dep is None:
         return None
     parent_dir = base_path / "raw" / dep.from_source
-    parent_records = read_parquet(parent_dir)
+    parent_records = read_latest_parquet(parent_dir)
     if not parent_records:
         info = metadata.get_source_info(dep.from_source)
         return info.get("record_count") if info else None
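Why this matters: read_parquet on a directory concatenates every snapshot, so a dependent source could fail once an older snapshot's column types drifted. A minimal sketch of the latest-snapshot behavior, assuming dated file names like 2026-01-15.parquet (the directory layout here is illustrative, not the package's exact paths):

    from pathlib import Path
    import pyarrow.parquet as pq

    def load_latest(parent_dir: Path) -> list[dict]:
        # Dated snapshot names sort lexicographically (YYYY-MM-DD),
        # so the last entry is the newest collection run.
        files = sorted(parent_dir.glob("*.parquet"))
        if not files:
            return []
        return pq.read_table(files[-1]).to_pylist()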
anysite/dataset/db_loader.py
CHANGED

@@ -301,9 +301,13 @@ class DatasetDbLoader:
         """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
         result = differ.diff(source.id, diff_key)
         total = 0
+        sync_mode = source.db_load.sync if source.db_load else "full"
 
         if dry_run:
-            return len(result.added) + len(result.changed) + len(result.removed)
+            count = len(result.added) + len(result.changed)
+            if sync_mode == "full":
+                count += len(result.removed)
+            return count
 
         # Extract key value from a record (handles dot-notation)
         def _get_key_val(record: dict[str, Any]) -> Any:
@@ -321,14 +325,15 @@ class DatasetDbLoader:
                 self.adapter.insert_batch(table_name, [row])
                 total += 1
 
-        # DELETE removed records
-        if result.removed:
+        # DELETE removed records (skipped in append mode)
+        ph = self._placeholder()
+        if result.removed and sync_mode == "full":
             safe_col = sanitize_identifier(db_key_col)
             for record in result.removed:
                 key_val = _get_key_val(record)
                 if key_val is not None:
                     self.adapter.execute(
-                        f"DELETE FROM {table_name} WHERE {safe_col} = ?",
+                        f"DELETE FROM {table_name} WHERE {safe_col} = {ph}",
                         (str(key_val),),
                     )
                     total += 1
@@ -350,14 +355,14 @@ class DatasetDbLoader:
             for field_name in changed_fields:
                 new_val = record.get(field_name)
                 safe_field = sanitize_identifier(field_name)
-                set_parts.append(f"{safe_field} = ?")
+                set_parts.append(f"{safe_field} = {ph}")
                 params.append(new_val)
 
             params.append(str(key_val))
             sql = (
                 f"UPDATE {table_name} "
                 f"SET {', '.join(set_parts)} "
-                f"WHERE {safe_col} = ?"
+                f"WHERE {safe_col} = {ph}"
             )
             self.adapter.execute(sql, tuple(params))
             total += 1
@@ -371,6 +376,12 @@ class DatasetDbLoader:
                 return other.dependency.field
         return None
 
+    def _placeholder(self) -> str:
+        """Get the parameter placeholder for the dialect."""
+        if self._dialect == "postgres":
+            return "%s"
+        return "?"
+
     def _auto_id_type(self) -> str:
         """Get the auto-increment ID column type for the dialect."""
         if self._dialect == "postgres":
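Background on the placeholder change: DB-API drivers disagree on parameter markers. sqlite3 and DuckDB use qmark style (?), while PostgreSQL drivers such as psycopg2 use %s. A small sketch of the pattern, with hypothetical table and column names:

    def build_delete(table: str, key_col: str, dialect: str) -> str:
        # Choose the marker the target driver expects.
        ph = "%s" if dialect == "postgres" else "?"
        return f"DELETE FROM {table} WHERE {key_col} = {ph}"

    assert build_delete("employees", "urn_value", "postgres") == (
        "DELETE FROM employees WHERE urn_value = %s"
    )
    assert build_delete("employees", "urn_value", "sqlite") == (
        "DELETE FROM employees WHERE urn_value = ?"
    )

The key value itself is still passed separately in the parameter tuple; only identifiers already run through sanitize_identifier are interpolated into the SQL string.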
anysite/dataset/differ.py
CHANGED

@@ -344,6 +344,9 @@ class DatasetDiffer:
                 old_val = record.get(old_key)
                 if _values_differ(new_val, old_val):
                     changed_fields.append(col)
+            # Fallback: DuckDB detected a change but Python comparison missed it
+            if not changed_fields:
+                changed_fields = list(compare_fields)
             record["_changed_fields"] = changed_fields
 
         return records
@@ -377,6 +380,15 @@ def _values_differ(a: Any, b: Any) -> bool:
         return json.loads(a) != json.loads(b)
     except (json.JSONDecodeError, ValueError):
         pass
+    # Handle complex types (dict, list) — compare via JSON serialization
+    # to catch differences DuckDB sees but Python equality misses
+    if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
+        try:
+            return json.dumps(a, sort_keys=True, default=str) != json.dumps(
+                b, sort_keys=True, default=str
+            )
+        except (TypeError, ValueError):
+            pass
     return True
 
 
@@ -452,6 +464,8 @@ def format_diff_records(
 
     for record in result.changed:
        row: dict[str, Any] = {"_diff": "changed"}
+       changed_fields = record.get("_changed_fields", [])
+       row["_changed_fields"] = changed_fields
        for k, v in record.items():
            if k == "_changed_fields":
                continue
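The JSON-serialization fallback normalizes dict and list values before comparing, since Python equality on structures round-tripped through Parquet and DuckDB can disagree with DuckDB's own column-level comparison. A self-contained sketch of the comparison rule (example values are invented):

    import json

    def values_differ(a, b) -> bool:
        # Canonical JSON: sorted keys, non-serializable values stringified.
        if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
            try:
                return (json.dumps(a, sort_keys=True, default=str)
                        != json.dumps(b, sort_keys=True, default=str))
            except (TypeError, ValueError):
                pass
        return a != b

    assert not values_differ({"x": 1, "y": 2}, {"y": 2, "x": 1})  # key order ignored
    assert values_differ({"x": [1, 2]}, {"x": [1, 3]})            # nested change caught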
anysite/dataset/models.py
CHANGED

@@ -82,6 +82,10 @@ class DbLoadConfig(BaseModel):
 
     table: str | None = Field(default=None, description="Override table name (default: source id)")
     key: str | None = Field(default=None, description="Unique key field for diff-based DB sync (e.g., urn.value)")
+    sync: Literal["full", "append"] = Field(
+        default="full",
+        description="Sync mode: 'full' applies INSERT/DELETE/UPDATE, 'append' skips DELETE (keeps old records)",
+    )
     fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
     exclude: list[str] = Field(
         default_factory=lambda: ["_input_value", "_parent_source"],
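Because sync is typed as Literal["full", "append"], pydantic rejects any other value when the dataset config is parsed, rather than at load time. A trimmed sketch of the field in isolation (the real model carries more fields than shown here):

    from typing import Literal
    from pydantic import BaseModel, Field, ValidationError

    class DbLoadConfig(BaseModel):
        sync: Literal["full", "append"] = Field(default="full")

    assert DbLoadConfig().sync == "full"
    assert DbLoadConfig(sync="append").sync == "append"
    try:
        DbLoadConfig(sync="upsert")
    except ValidationError:
        print("rejected: sync must be 'full' or 'append'")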
anysite/dataset/storage.py
CHANGED

@@ -75,7 +75,7 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
         tables = [pq.read_table(f) for f in files]
         import pyarrow as pa
 
-        table = pa.concat_tables(tables)
+        table = pa.concat_tables(tables, promote_options="permissive")
     else:
         if not path.exists():
             return []
@@ -84,6 +84,26 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
     return table.to_pylist()
 
 
+def read_latest_parquet(path: Path) -> list[dict[str, Any]]:
+    """Read records from the most recent Parquet snapshot in a directory.
+
+    Unlike ``read_parquet(dir)``, this reads only the latest file, avoiding
+    schema mismatch errors when snapshots have different column types.
+
+    Args:
+        path: Directory containing dated .parquet files.
+
+    Returns:
+        List of dicts from the newest snapshot, or [] if none found.
+    """
+    if not path.is_dir():
+        return read_parquet(path)
+    files = sorted(path.glob("*.parquet"))
+    if not files:
+        return []
+    return read_parquet(files[-1])
+
+
 def get_source_dir(base_path: Path, source_id: str) -> Path:
     """Get the raw data directory for a source."""
     return base_path / "raw" / source_id
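Note on promote_options="permissive": by default, pyarrow.concat_tables requires identical schemas across tables; permissive promotion unifies compatible ones instead, e.g. an all-null column from an old snapshot is promoted to the newer snapshot's type. A minimal sketch, assuming pyarrow ≥ 14 (where promote_options replaced the older promote flag):

    import pyarrow as pa

    old = pa.table({"count": pa.array([None, None])})               # null-typed column
    new = pa.table({"count": pa.array([10, 20], type=pa.int64())})  # int64 column

    # Default options would raise on the schema mismatch; permissive
    # promotion unifies null -> int64 so both snapshots concatenate.
    combined = pa.concat_tables([old, new], promote_options="permissive")
    assert combined.schema.field("count").type == pa.int64()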
{anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: anysite-cli
-Version: 0.1.4
+Version: 0.1.6
 Summary: CLI for Anysite API - web data extraction for humans and AI agents
 Project-URL: Homepage, https://anysite.io
 Project-URL: Documentation, https://docs.anysite.io/cli
@@ -259,6 +259,8 @@ sources:
       path: ./output/companies-{{date}}.csv
       format: csv
     db_load:
+      key: _input_value  # Unique key for incremental sync
+      sync: full         # full (default) or append (no DELETE)
       fields: [name, url, employee_count]
 
   - id: employees
@@ -274,6 +276,8 @@ sources:
       count: 5
       refresh: always  # Re-collect every run with --incremental
     db_load:
+      key: urn.value  # Unique key for incremental sync
+      sync: append    # Keep old records (no DELETE on diff)
       fields: [name, url, headline]
 
 storage:
@@ -318,9 +322,15 @@ anysite dataset query dataset.yaml --interactive
 anysite dataset stats dataset.yaml --source companies
 anysite dataset profile dataset.yaml
 
-# Load into PostgreSQL with automatic FK linking
+# Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+anysite dataset load-db dataset.yaml -c pg
+
+# Drop and reload from latest snapshot
 anysite dataset load-db dataset.yaml -c pg --drop-existing
 
+# Load a specific snapshot date
+anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
 # Run history and logs
 anysite dataset history my-dataset
 anysite dataset logs my-dataset --run 42
@@ -328,8 +338,9 @@ anysite dataset logs my-dataset --run 42
 # Generate cron/systemd schedule
 anysite dataset schedule dataset.yaml --incremental --load-db pg
 
-# Compare snapshots (diff two collection dates)
+# Compare snapshots (diff two collection dates, supports dot-notation keys)
 anysite dataset diff dataset.yaml --source employees --key _input_value
+anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"
 
 # Reset incremental state
 anysite dataset reset-cursor dataset.yaml
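For the dot-notation keys shown above (e.g. urn.value), the key path is resolved by walking nested dicts in each record. A sketch of how such a lookup can work; get_key_val and the record contents are illustrative, not the CLI's exact internals:

    from typing import Any

    def get_key_val(record: dict[str, Any], key: str) -> Any:
        # "urn.value" -> record["urn"]["value"]
        val: Any = record
        for part in key.split("."):
            if not isinstance(val, dict):
                return None
            val = val.get(part)
        return val

    assert get_key_val({"urn": {"value": "abc123"}}, "urn.value") == "abc123"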
{anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/RECORD
CHANGED

@@ -20,16 +20,16 @@ anysite/config/settings.py,sha256=Hc0j_aCCtkJeL4nHw-EFyfJ8WEDk57G08iNUFquUhpM,52
 anysite/dataset/__init__.py,sha256=J0sKQkGwVOPtvp6pka7LcdeUEADvjWRs71yRuROzJxI,847
 anysite/dataset/analyzer.py,sha256=8dsPW32SbSaUTy1F0NIed1U45wjiMgQeJ2iWX7hBxRQ,9245
 anysite/dataset/cli.py,sha256=rEWK1ka-YQ_Vbbj2nMaMYTD9g3wa3ethUWSoaWRSGTY,23066
-anysite/dataset/collector.py,sha256=
-anysite/dataset/db_loader.py,sha256=
-anysite/dataset/differ.py,sha256=
+anysite/dataset/collector.py,sha256=ZdR3CmQQew_iuJpNtJ4knSrjt0hvkEL4WIaS0IKEkwQ,23927
+anysite/dataset/db_loader.py,sha256=ASDO5AD5_wcOxjR4DZknX-zMEaevqXMb3VVa6507qAg,13973
+anysite/dataset/differ.py,sha256=jB_VWTb7UuEBWG9nv1ry5xeo9hmWdhA_cTm6Ed43_Uw,17746
 anysite/dataset/errors.py,sha256=r8cZXoIzSeTGCWpeYjntnN0AduCu74YZyWs3sFu17J4,914
 anysite/dataset/exporters.py,sha256=mA2FYbYJbHfrwkXbHDu4g5qPG_JJKnkVciXFKPkF1Vw,3708
 anysite/dataset/history.py,sha256=avFs0ADlM7Hr-ttqC1FfjJiQxvQP20sScM7ZoY4lvU0,5471
-anysite/dataset/models.py,sha256=
+anysite/dataset/models.py,sha256=d-bkgu2dUY7_VSgH-oVh84IV3X-KpxRfja0H5WnhauU,9998
 anysite/dataset/notifications.py,sha256=ORzo9XOgOxzLb7rk4pevlKPB_Taf-jejlrtmO4Zgl2c,2367
 anysite/dataset/scheduler.py,sha256=zpbA5tRUQZXr-9lZnG58dvE3E7ZBlAd-U-PTXExe9f0,3339
-anysite/dataset/storage.py,sha256=
+anysite/dataset/storage.py,sha256=ySY822m4lQd6Ip0i3VNPVbHEO6U6zBBwHi-56AXOaXE,5974
 anysite/dataset/transformer.py,sha256=XBI4MiZ_F_IZdootV0GAePaM9-pUadIte7RABbjBipc,6843
 anysite/db/__init__.py,sha256=xGGZHlMt5FUZjI6MAmf2VfyNLypOeXwrRL-gmuTsyl4,1117
 anysite/db/cli.py,sha256=fYuIKWq7eF5mAfZWnXNbtlpITnbYbOFMm2TqU54xIl4,22118
@@ -58,8 +58,8 @@ anysite/streaming/writer.py,sha256=HfMsC4umUdJuNIAPK57YAxEGyTwUmy-zNrqFkwY6aew,4
 anysite/utils/__init__.py,sha256=7SnbxpxKENK-2ecUL5NfnZ9okGI7COKYw4WF46172HM,23
 anysite/utils/fields.py,sha256=bSrHadzNmabL4qubqhXXZoWb_P8KA-3S7_FLVT8nGBc,7410
 anysite/utils/retry.py,sha256=89TbXvavi5t22P2mTYCLAS6SSZoW65gQ0nnYNbYAF0M,2684
-anysite_cli-0.1.4.dist-info/METADATA,sha256=
-anysite_cli-0.1.4.dist-info/WHEEL,sha256=
-anysite_cli-0.1.4.dist-info/entry_points.txt,sha256=
-anysite_cli-0.1.4.dist-info/licenses/LICENSE,sha256=
-anysite_cli-0.1.4.dist-info/RECORD,,
+anysite_cli-0.1.6.dist-info/METADATA,sha256=iqEFoJcISFAZoeT96LrCHiCVPqWk4WX1Xy41siFqUzs,12437
+anysite_cli-0.1.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+anysite_cli-0.1.6.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
+anysite_cli-0.1.6.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
+anysite_cli-0.1.6.dist-info/RECORD,,
{anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/WHEEL
File without changes
{anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/entry_points.txt
File without changes
{anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/licenses/LICENSE
File without changes