anysite-cli 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,7 @@ from anysite.dataset.models import DatasetConfig, DatasetSource
19
19
  from anysite.dataset.storage import (
20
20
  MetadataStore,
21
21
  get_parquet_path,
22
+ read_latest_parquet,
22
23
  read_parquet,
23
24
  write_parquet,
24
25
  )
@@ -412,9 +413,9 @@ async def _collect_dependent(
412
413
  if dep is None:
413
414
  raise DatasetError(f"Source {source.id} has no dependency defined")
414
415
 
415
- # Read parent data
416
+ # Read parent data (latest snapshot only to avoid schema mismatch)
416
417
  parent_dir = base_path / "raw" / dep.from_source
417
- parent_records = read_parquet(parent_dir)
418
+ parent_records = read_latest_parquet(parent_dir)
418
419
 
419
420
  if not parent_records:
420
421
  if not quiet:
@@ -627,7 +628,7 @@ def _count_dependent_inputs(
627
628
  if dep is None:
628
629
  return None
629
630
  parent_dir = base_path / "raw" / dep.from_source
630
- parent_records = read_parquet(parent_dir)
631
+ parent_records = read_latest_parquet(parent_dir)
631
632
  if not parent_records:
632
633
  info = metadata.get_source_info(dep.from_source)
633
634
  return info.get("record_count") if info else None
@@ -301,9 +301,13 @@ class DatasetDbLoader:
301
301
  """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
302
302
  result = differ.diff(source.id, diff_key)
303
303
  total = 0
304
+ sync_mode = source.db_load.sync if source.db_load else "full"
304
305
 
305
306
  if dry_run:
306
- return len(result.added) + len(result.removed) + len(result.changed)
307
+ count = len(result.added) + len(result.changed)
308
+ if sync_mode == "full":
309
+ count += len(result.removed)
310
+ return count
307
311
 
308
312
  # Extract key value from a record (handles dot-notation)
309
313
  def _get_key_val(record: dict[str, Any]) -> Any:
@@ -321,14 +325,15 @@ class DatasetDbLoader:
321
325
  self.adapter.insert_batch(table_name, [row])
322
326
  total += 1
323
327
 
324
- # DELETE removed records
325
- if result.removed:
328
+ # DELETE removed records (skipped in append mode)
329
+ ph = self._placeholder()
330
+ if result.removed and sync_mode == "full":
326
331
  safe_col = sanitize_identifier(db_key_col)
327
332
  for record in result.removed:
328
333
  key_val = _get_key_val(record)
329
334
  if key_val is not None:
330
335
  self.adapter.execute(
331
- f"DELETE FROM {table_name} WHERE {safe_col} = ?",
336
+ f"DELETE FROM {table_name} WHERE {safe_col} = {ph}",
332
337
  (str(key_val),),
333
338
  )
334
339
  total += 1
@@ -350,14 +355,14 @@ class DatasetDbLoader:
350
355
  for field_name in changed_fields:
351
356
  new_val = record.get(field_name)
352
357
  safe_field = sanitize_identifier(field_name)
353
- set_parts.append(f"{safe_field} = ?")
358
+ set_parts.append(f"{safe_field} = {ph}")
354
359
  params.append(new_val)
355
360
 
356
361
  params.append(str(key_val))
357
362
  sql = (
358
363
  f"UPDATE {table_name} "
359
364
  f"SET {', '.join(set_parts)} "
360
- f"WHERE {safe_col} = ?"
365
+ f"WHERE {safe_col} = {ph}"
361
366
  )
362
367
  self.adapter.execute(sql, tuple(params))
363
368
  total += 1
@@ -371,6 +376,12 @@ class DatasetDbLoader:
371
376
  return other.dependency.field
372
377
  return None
373
378
 
379
+ def _placeholder(self) -> str:
380
+ """Get the parameter placeholder for the dialect."""
381
+ if self._dialect == "postgres":
382
+ return "%s"
383
+ return "?"
384
+
374
385
  def _auto_id_type(self) -> str:
375
386
  """Get the auto-increment ID column type for the dialect."""
376
387
  if self._dialect == "postgres":
anysite/dataset/differ.py CHANGED
@@ -344,6 +344,9 @@ class DatasetDiffer:
344
344
  old_val = record.get(old_key)
345
345
  if _values_differ(new_val, old_val):
346
346
  changed_fields.append(col)
347
+ # Fallback: DuckDB detected a change but Python comparison missed it
348
+ if not changed_fields:
349
+ changed_fields = list(compare_fields)
347
350
  record["_changed_fields"] = changed_fields
348
351
 
349
352
  return records
@@ -377,6 +380,15 @@ def _values_differ(a: Any, b: Any) -> bool:
377
380
  return json.loads(a) != json.loads(b)
378
381
  except (json.JSONDecodeError, ValueError):
379
382
  pass
383
+ # Handle complex types (dict, list) — compare via JSON serialization
384
+ # to catch differences DuckDB sees but Python equality misses
385
+ if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
386
+ try:
387
+ return json.dumps(a, sort_keys=True, default=str) != json.dumps(
388
+ b, sort_keys=True, default=str
389
+ )
390
+ except (TypeError, ValueError):
391
+ pass
380
392
  return True
381
393
 
382
394
 
@@ -452,6 +464,8 @@ def format_diff_records(
452
464
 
453
465
  for record in result.changed:
454
466
  row: dict[str, Any] = {"_diff": "changed"}
467
+ changed_fields = record.get("_changed_fields", [])
468
+ row["_changed_fields"] = changed_fields
455
469
  for k, v in record.items():
456
470
  if k == "_changed_fields":
457
471
  continue
anysite/dataset/models.py CHANGED
@@ -82,6 +82,10 @@ class DbLoadConfig(BaseModel):
82
82
 
83
83
  table: str | None = Field(default=None, description="Override table name (default: source id)")
84
84
  key: str | None = Field(default=None, description="Unique key field for diff-based DB sync (e.g., urn.value)")
85
+ sync: Literal["full", "append"] = Field(
86
+ default="full",
87
+ description="Sync mode: 'full' applies INSERT/DELETE/UPDATE, 'append' skips DELETE (keeps old records)",
88
+ )
85
89
  fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
86
90
  exclude: list[str] = Field(
87
91
  default_factory=lambda: ["_input_value", "_parent_source"],
@@ -75,7 +75,7 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
75
75
  tables = [pq.read_table(f) for f in files]
76
76
  import pyarrow as pa
77
77
 
78
- table = pa.concat_tables(tables)
78
+ table = pa.concat_tables(tables, promote_options="permissive")
79
79
  else:
80
80
  if not path.exists():
81
81
  return []
@@ -84,6 +84,26 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
84
84
  return table.to_pylist()
85
85
 
86
86
 
87
+ def read_latest_parquet(path: Path) -> list[dict[str, Any]]:
88
+ """Read records from the most recent Parquet snapshot in a directory.
89
+
90
+ Unlike ``read_parquet(dir)``, this reads only the latest file, avoiding
91
+ schema mismatch errors when snapshots have different column types.
92
+
93
+ Args:
94
+ path: Directory containing dated .parquet files.
95
+
96
+ Returns:
97
+ List of dicts from the newest snapshot, or [] if none found.
98
+ """
99
+ if not path.is_dir():
100
+ return read_parquet(path)
101
+ files = sorted(path.glob("*.parquet"))
102
+ if not files:
103
+ return []
104
+ return read_parquet(files[-1])
105
+
106
+
87
107
  def get_source_dir(base_path: Path, source_id: str) -> Path:
88
108
  """Get the raw data directory for a source."""
89
109
  return base_path / "raw" / source_id
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: anysite-cli
3
- Version: 0.1.4
3
+ Version: 0.1.6
4
4
  Summary: CLI for Anysite API - web data extraction for humans and AI agents
5
5
  Project-URL: Homepage, https://anysite.io
6
6
  Project-URL: Documentation, https://docs.anysite.io/cli
@@ -259,6 +259,8 @@ sources:
259
259
  path: ./output/companies-{{date}}.csv
260
260
  format: csv
261
261
  db_load:
262
+ key: _input_value # Unique key for incremental sync
263
+ sync: full # full (default) or append (no DELETE)
262
264
  fields: [name, url, employee_count]
263
265
 
264
266
  - id: employees
@@ -274,6 +276,8 @@ sources:
274
276
  count: 5
275
277
  refresh: always # Re-collect every run with --incremental
276
278
  db_load:
279
+ key: urn.value # Unique key for incremental sync
280
+ sync: append # Keep old records (no DELETE on diff)
277
281
  fields: [name, url, headline]
278
282
 
279
283
  storage:
@@ -318,9 +322,15 @@ anysite dataset query dataset.yaml --interactive
318
322
  anysite dataset stats dataset.yaml --source companies
319
323
  anysite dataset profile dataset.yaml
320
324
 
321
- # Load into PostgreSQL with automatic FK linking
325
+ # Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
326
+ anysite dataset load-db dataset.yaml -c pg
327
+
328
+ # Drop and reload from latest snapshot
322
329
  anysite dataset load-db dataset.yaml -c pg --drop-existing
323
330
 
331
+ # Load a specific snapshot date
332
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
333
+
324
334
  # Run history and logs
325
335
  anysite dataset history my-dataset
326
336
  anysite dataset logs my-dataset --run 42
@@ -328,8 +338,9 @@ anysite dataset logs my-dataset --run 42
328
338
  # Generate cron/systemd schedule
329
339
  anysite dataset schedule dataset.yaml --incremental --load-db pg
330
340
 
331
- # Compare snapshots (diff two collection dates)
341
+ # Compare snapshots (diff two collection dates, supports dot-notation keys)
332
342
  anysite dataset diff dataset.yaml --source employees --key _input_value
343
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"
333
344
 
334
345
  # Reset incremental state
335
346
  anysite dataset reset-cursor dataset.yaml
@@ -20,16 +20,16 @@ anysite/config/settings.py,sha256=Hc0j_aCCtkJeL4nHw-EFyfJ8WEDk57G08iNUFquUhpM,52
20
20
  anysite/dataset/__init__.py,sha256=J0sKQkGwVOPtvp6pka7LcdeUEADvjWRs71yRuROzJxI,847
21
21
  anysite/dataset/analyzer.py,sha256=8dsPW32SbSaUTy1F0NIed1U45wjiMgQeJ2iWX7hBxRQ,9245
22
22
  anysite/dataset/cli.py,sha256=rEWK1ka-YQ_Vbbj2nMaMYTD9g3wa3ethUWSoaWRSGTY,23066
23
- anysite/dataset/collector.py,sha256=6CfJt8fKZZ2xvZWJ7jwnx0V9BnjoJxmBZkm8xWQiU54,23840
24
- anysite/dataset/db_loader.py,sha256=TMcvI-pX-XctbkTdo5eTyW8Co4_3uK-dEdXn_r9g8Oc,13547
25
- anysite/dataset/differ.py,sha256=b-qU5Laf8RkteZAlblKq4atTvnJ21W4QbxfpHBFYMJ8,17053
23
+ anysite/dataset/collector.py,sha256=ZdR3CmQQew_iuJpNtJ4knSrjt0hvkEL4WIaS0IKEkwQ,23927
24
+ anysite/dataset/db_loader.py,sha256=ASDO5AD5_wcOxjR4DZknX-zMEaevqXMb3VVa6507qAg,13973
25
+ anysite/dataset/differ.py,sha256=jB_VWTb7UuEBWG9nv1ry5xeo9hmWdhA_cTm6Ed43_Uw,17746
26
26
  anysite/dataset/errors.py,sha256=r8cZXoIzSeTGCWpeYjntnN0AduCu74YZyWs3sFu17J4,914
27
27
  anysite/dataset/exporters.py,sha256=mA2FYbYJbHfrwkXbHDu4g5qPG_JJKnkVciXFKPkF1Vw,3708
28
28
  anysite/dataset/history.py,sha256=avFs0ADlM7Hr-ttqC1FfjJiQxvQP20sScM7ZoY4lvU0,5471
29
- anysite/dataset/models.py,sha256=-Qnh6QvbN3nzlfsYqgCiYKBqOeLcJCYK_hYrmxVCRTA,9810
29
+ anysite/dataset/models.py,sha256=d-bkgu2dUY7_VSgH-oVh84IV3X-KpxRfja0H5WnhauU,9998
30
30
  anysite/dataset/notifications.py,sha256=ORzo9XOgOxzLb7rk4pevlKPB_Taf-jejlrtmO4Zgl2c,2367
31
31
  anysite/dataset/scheduler.py,sha256=zpbA5tRUQZXr-9lZnG58dvE3E7ZBlAd-U-PTXExe9f0,3339
32
- anysite/dataset/storage.py,sha256=d03goKLI5NWKJowHwCgGqQkcVTO1NctPxMu-Xu-tru4,5326
32
+ anysite/dataset/storage.py,sha256=ySY822m4lQd6Ip0i3VNPVbHEO6U6zBBwHi-56AXOaXE,5974
33
33
  anysite/dataset/transformer.py,sha256=XBI4MiZ_F_IZdootV0GAePaM9-pUadIte7RABbjBipc,6843
34
34
  anysite/db/__init__.py,sha256=xGGZHlMt5FUZjI6MAmf2VfyNLypOeXwrRL-gmuTsyl4,1117
35
35
  anysite/db/cli.py,sha256=fYuIKWq7eF5mAfZWnXNbtlpITnbYbOFMm2TqU54xIl4,22118
@@ -58,8 +58,8 @@ anysite/streaming/writer.py,sha256=HfMsC4umUdJuNIAPK57YAxEGyTwUmy-zNrqFkwY6aew,4
58
58
  anysite/utils/__init__.py,sha256=7SnbxpxKENK-2ecUL5NfnZ9okGI7COKYw4WF46172HM,23
59
59
  anysite/utils/fields.py,sha256=bSrHadzNmabL4qubqhXXZoWb_P8KA-3S7_FLVT8nGBc,7410
60
60
  anysite/utils/retry.py,sha256=89TbXvavi5t22P2mTYCLAS6SSZoW65gQ0nnYNbYAF0M,2684
61
- anysite_cli-0.1.4.dist-info/METADATA,sha256=w5DUgDWzJgXynKRogJVm9baLqTJVSrg0ciHuWfWa9l0,11781
62
- anysite_cli-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
63
- anysite_cli-0.1.4.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
64
- anysite_cli-0.1.4.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
65
- anysite_cli-0.1.4.dist-info/RECORD,,
61
+ anysite_cli-0.1.6.dist-info/METADATA,sha256=iqEFoJcISFAZoeT96LrCHiCVPqWk4WX1Xy41siFqUzs,12437
62
+ anysite_cli-0.1.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
63
+ anysite_cli-0.1.6.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
64
+ anysite_cli-0.1.6.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
65
+ anysite_cli-0.1.6.dist-info/RECORD,,