anysite-cli 0.1.4 → 0.1.6 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anysite/dataset/collector.py +4 -3
- anysite/dataset/db_loader.py +17 -6
- anysite/dataset/differ.py +14 -0
- anysite/dataset/models.py +4 -0
- anysite/dataset/storage.py +21 -1
- {anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/METADATA +14 -3
- {anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/RECORD +10 -10
- {anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/WHEEL +0 -0
- {anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/entry_points.txt +0 -0
- {anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/licenses/LICENSE +0 -0
anysite/dataset/collector.py
CHANGED

@@ -19,6 +19,7 @@ from anysite.dataset.models import DatasetConfig, DatasetSource
 from anysite.dataset.storage import (
     MetadataStore,
     get_parquet_path,
+    read_latest_parquet,
     read_parquet,
     write_parquet,
 )
@@ -412,9 +413,9 @@ async def _collect_dependent(
     if dep is None:
         raise DatasetError(f"Source {source.id} has no dependency defined")
 
-    # Read parent data
+    # Read parent data (latest snapshot only to avoid schema mismatch)
     parent_dir = base_path / "raw" / dep.from_source
-    parent_records = read_parquet(parent_dir)
+    parent_records = read_latest_parquet(parent_dir)
 
     if not parent_records:
         if not quiet:
@@ -627,7 +628,7 @@ def _count_dependent_inputs(
     if dep is None:
         return None
     parent_dir = base_path / "raw" / dep.from_source
-    parent_records = read_parquet(parent_dir)
+    parent_records = read_latest_parquet(parent_dir)
     if not parent_records:
         info = metadata.get_source_info(dep.from_source)
         return info.get("record_count") if info else None
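Why this matters: read_parquet on a directory concatenates every snapshot, so a dependent source could fail once an older snapshot's column types drifted. A minimal sketch of the latest-snapshot behavior, assuming dated file names like 2026-01-15.parquet (the directory layout here is illustrative, not the package's exact paths):

    from pathlib import Path
    import pyarrow.parquet as pq

    def load_latest(parent_dir: Path) -> list[dict]:
        # Dated snapshot names sort lexicographically (YYYY-MM-DD),
        # so the last entry is the newest collection run.
        files = sorted(parent_dir.glob("*.parquet"))
        if not files:
            return []
        return pq.read_table(files[-1]).to_pylist()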
anysite/dataset/db_loader.py
CHANGED

@@ -301,9 +301,13 @@ class DatasetDbLoader:
         """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
         result = differ.diff(source.id, diff_key)
         total = 0
+        sync_mode = source.db_load.sync if source.db_load else "full"
 
         if dry_run:
-            return len(result.added) + len(result.changed) + len(result.removed)
+            count = len(result.added) + len(result.changed)
+            if sync_mode == "full":
+                count += len(result.removed)
+            return count
 
         # Extract key value from a record (handles dot-notation)
         def _get_key_val(record: dict[str, Any]) -> Any:
@@ -321,14 +325,15 @@ class DatasetDbLoader:
                 self.adapter.insert_batch(table_name, [row])
                 total += 1
 
-        # DELETE removed records
-        if result.removed:
+        # DELETE removed records (skipped in append mode)
+        ph = self._placeholder()
+        if result.removed and sync_mode == "full":
             safe_col = sanitize_identifier(db_key_col)
             for record in result.removed:
                 key_val = _get_key_val(record)
                 if key_val is not None:
                     self.adapter.execute(
-                        f"DELETE FROM {table_name} WHERE {safe_col} = ?",
+                        f"DELETE FROM {table_name} WHERE {safe_col} = {ph}",
                         (str(key_val),),
                     )
                     total += 1
@@ -350,14 +355,14 @@ class DatasetDbLoader:
             for field_name in changed_fields:
                 new_val = record.get(field_name)
                 safe_field = sanitize_identifier(field_name)
-                set_parts.append(f"{safe_field} = ?")
+                set_parts.append(f"{safe_field} = {ph}")
                 params.append(new_val)
 
             params.append(str(key_val))
             sql = (
                 f"UPDATE {table_name} "
                 f"SET {', '.join(set_parts)} "
-                f"WHERE {safe_col} = ?"
+                f"WHERE {safe_col} = {ph}"
             )
             self.adapter.execute(sql, tuple(params))
             total += 1
@@ -371,6 +376,12 @@ class DatasetDbLoader:
                 return other.dependency.field
         return None
 
+    def _placeholder(self) -> str:
+        """Get the parameter placeholder for the dialect."""
+        if self._dialect == "postgres":
+            return "%s"
+        return "?"
+
     def _auto_id_type(self) -> str:
         """Get the auto-increment ID column type for the dialect."""
         if self._dialect == "postgres":
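Background on the placeholder change: DB-API drivers disagree on parameter markers. sqlite3 and DuckDB use qmark style (?), while PostgreSQL drivers such as psycopg2 use %s. A small sketch of the pattern, with hypothetical table and column names:

    def build_delete(table: str, key_col: str, dialect: str) -> str:
        # Choose the marker the target driver expects.
        ph = "%s" if dialect == "postgres" else "?"
        return f"DELETE FROM {table} WHERE {key_col} = {ph}"

    assert build_delete("employees", "urn_value", "postgres") == (
        "DELETE FROM employees WHERE urn_value = %s"
    )
    assert build_delete("employees", "urn_value", "sqlite") == (
        "DELETE FROM employees WHERE urn_value = ?"
    )

The key value itself is still passed separately in the parameter tuple; only identifiers already run through sanitize_identifier are interpolated into the SQL string.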
anysite/dataset/differ.py
CHANGED

@@ -344,6 +344,9 @@ class DatasetDiffer:
                 old_val = record.get(old_key)
                 if _values_differ(new_val, old_val):
                     changed_fields.append(col)
+            # Fallback: DuckDB detected a change but Python comparison missed it
+            if not changed_fields:
+                changed_fields = list(compare_fields)
             record["_changed_fields"] = changed_fields
 
         return records
@@ -377,6 +380,15 @@ def _values_differ(a: Any, b: Any) -> bool:
         return json.loads(a) != json.loads(b)
     except (json.JSONDecodeError, ValueError):
         pass
+    # Handle complex types (dict, list) — compare via JSON serialization
+    # to catch differences DuckDB sees but Python equality misses
+    if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
+        try:
+            return json.dumps(a, sort_keys=True, default=str) != json.dumps(
+                b, sort_keys=True, default=str
+            )
+        except (TypeError, ValueError):
+            pass
     return True
 
 
@@ -452,6 +464,8 @@ def format_diff_records(
 
     for record in result.changed:
        row: dict[str, Any] = {"_diff": "changed"}
+       changed_fields = record.get("_changed_fields", [])
+       row["_changed_fields"] = changed_fields
        for k, v in record.items():
            if k == "_changed_fields":
                continue
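The JSON-serialization fallback normalizes dict and list values before comparing, since Python equality on structures round-tripped through Parquet and DuckDB can disagree with DuckDB's own column-level comparison. A self-contained sketch of the comparison rule (example values are invented):

    import json

    def values_differ(a, b) -> bool:
        # Canonical JSON: sorted keys, non-serializable values stringified.
        if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
            try:
                return (json.dumps(a, sort_keys=True, default=str)
                        != json.dumps(b, sort_keys=True, default=str))
            except (TypeError, ValueError):
                pass
        return a != b

    assert not values_differ({"x": 1, "y": 2}, {"y": 2, "x": 1})  # key order ignored
    assert values_differ({"x": [1, 2]}, {"x": [1, 3]})            # nested change caught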
anysite/dataset/models.py
CHANGED

@@ -82,6 +82,10 @@ class DbLoadConfig(BaseModel):
 
     table: str | None = Field(default=None, description="Override table name (default: source id)")
     key: str | None = Field(default=None, description="Unique key field for diff-based DB sync (e.g., urn.value)")
+    sync: Literal["full", "append"] = Field(
+        default="full",
+        description="Sync mode: 'full' applies INSERT/DELETE/UPDATE, 'append' skips DELETE (keeps old records)",
+    )
     fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
     exclude: list[str] = Field(
         default_factory=lambda: ["_input_value", "_parent_source"],
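Because sync is typed as Literal["full", "append"], pydantic rejects any other value when the dataset config is parsed, rather than at load time. A trimmed sketch of the field in isolation (the real model carries more fields than shown here):

    from typing import Literal
    from pydantic import BaseModel, Field, ValidationError

    class DbLoadConfig(BaseModel):
        sync: Literal["full", "append"] = Field(default="full")

    assert DbLoadConfig().sync == "full"
    assert DbLoadConfig(sync="append").sync == "append"
    try:
        DbLoadConfig(sync="upsert")
    except ValidationError:
        print("rejected: sync must be 'full' or 'append'")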
anysite/dataset/storage.py
CHANGED

@@ -75,7 +75,7 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
         tables = [pq.read_table(f) for f in files]
         import pyarrow as pa
 
-        table = pa.concat_tables(tables)
+        table = pa.concat_tables(tables, promote_options="permissive")
     else:
         if not path.exists():
             return []
@@ -84,6 +84,26 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
     return table.to_pylist()
 
 
+def read_latest_parquet(path: Path) -> list[dict[str, Any]]:
+    """Read records from the most recent Parquet snapshot in a directory.
+
+    Unlike ``read_parquet(dir)``, this reads only the latest file, avoiding
+    schema mismatch errors when snapshots have different column types.
+
+    Args:
+        path: Directory containing dated .parquet files.
+
+    Returns:
+        List of dicts from the newest snapshot, or [] if none found.
+    """
+    if not path.is_dir():
+        return read_parquet(path)
+    files = sorted(path.glob("*.parquet"))
+    if not files:
+        return []
+    return read_parquet(files[-1])
+
+
 def get_source_dir(base_path: Path, source_id: str) -> Path:
     """Get the raw data directory for a source."""
     return base_path / "raw" / source_id
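Note on promote_options="permissive": by default, pyarrow.concat_tables requires identical schemas across tables; permissive promotion unifies compatible ones instead, e.g. an all-null column from an old snapshot is promoted to the newer snapshot's type. A minimal sketch, assuming pyarrow ≥ 14 (where promote_options replaced the older promote flag):

    import pyarrow as pa

    old = pa.table({"count": pa.array([None, None])})               # null-typed column
    new = pa.table({"count": pa.array([10, 20], type=pa.int64())})  # int64 column

    # Default options would raise on the schema mismatch; permissive
    # promotion unifies null -> int64 so both snapshots concatenate.
    combined = pa.concat_tables([old, new], promote_options="permissive")
    assert combined.schema.field("count").type == pa.int64()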
{anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: anysite-cli
-Version: 0.1.4
+Version: 0.1.6
 Summary: CLI for Anysite API - web data extraction for humans and AI agents
 Project-URL: Homepage, https://anysite.io
 Project-URL: Documentation, https://docs.anysite.io/cli
@@ -259,6 +259,8 @@ sources:
       path: ./output/companies-{{date}}.csv
       format: csv
     db_load:
+      key: _input_value  # Unique key for incremental sync
+      sync: full         # full (default) or append (no DELETE)
       fields: [name, url, employee_count]
 
   - id: employees
@@ -274,6 +276,8 @@ sources:
       count: 5
       refresh: always  # Re-collect every run with --incremental
     db_load:
+      key: urn.value  # Unique key for incremental sync
+      sync: append    # Keep old records (no DELETE on diff)
       fields: [name, url, headline]
 
 storage:
@@ -318,9 +322,15 @@ anysite dataset query dataset.yaml --interactive
 anysite dataset stats dataset.yaml --source companies
 anysite dataset profile dataset.yaml
 
-# Load into PostgreSQL with automatic FK linking
+# Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+anysite dataset load-db dataset.yaml -c pg
+
+# Drop and reload from latest snapshot
 anysite dataset load-db dataset.yaml -c pg --drop-existing
 
+# Load a specific snapshot date
+anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
 # Run history and logs
 anysite dataset history my-dataset
 anysite dataset logs my-dataset --run 42
@@ -328,8 +338,9 @@ anysite dataset logs my-dataset --run 42
 # Generate cron/systemd schedule
 anysite dataset schedule dataset.yaml --incremental --load-db pg
 
-# Compare snapshots (diff two collection dates)
+# Compare snapshots (diff two collection dates, supports dot-notation keys)
 anysite dataset diff dataset.yaml --source employees --key _input_value
+anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"
 
 # Reset incremental state
 anysite dataset reset-cursor dataset.yaml
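For the dot-notation keys shown above (e.g. urn.value), the key path is resolved by walking nested dicts in each record. A sketch of how such a lookup can work; get_key_val and the record contents are illustrative, not the CLI's exact internals:

    from typing import Any

    def get_key_val(record: dict[str, Any], key: str) -> Any:
        # "urn.value" -> record["urn"]["value"]
        val: Any = record
        for part in key.split("."):
            if not isinstance(val, dict):
                return None
            val = val.get(part)
        return val

    assert get_key_val({"urn": {"value": "abc123"}}, "urn.value") == "abc123"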
{anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/RECORD
CHANGED

@@ -20,16 +20,16 @@ anysite/config/settings.py,sha256=Hc0j_aCCtkJeL4nHw-EFyfJ8WEDk57G08iNUFquUhpM,52
 anysite/dataset/__init__.py,sha256=J0sKQkGwVOPtvp6pka7LcdeUEADvjWRs71yRuROzJxI,847
 anysite/dataset/analyzer.py,sha256=8dsPW32SbSaUTy1F0NIed1U45wjiMgQeJ2iWX7hBxRQ,9245
 anysite/dataset/cli.py,sha256=rEWK1ka-YQ_Vbbj2nMaMYTD9g3wa3ethUWSoaWRSGTY,23066
-anysite/dataset/collector.py,sha256=
-anysite/dataset/db_loader.py,sha256=
-anysite/dataset/differ.py,sha256=
+anysite/dataset/collector.py,sha256=ZdR3CmQQew_iuJpNtJ4knSrjt0hvkEL4WIaS0IKEkwQ,23927
+anysite/dataset/db_loader.py,sha256=ASDO5AD5_wcOxjR4DZknX-zMEaevqXMb3VVa6507qAg,13973
+anysite/dataset/differ.py,sha256=jB_VWTb7UuEBWG9nv1ry5xeo9hmWdhA_cTm6Ed43_Uw,17746
 anysite/dataset/errors.py,sha256=r8cZXoIzSeTGCWpeYjntnN0AduCu74YZyWs3sFu17J4,914
 anysite/dataset/exporters.py,sha256=mA2FYbYJbHfrwkXbHDu4g5qPG_JJKnkVciXFKPkF1Vw,3708
 anysite/dataset/history.py,sha256=avFs0ADlM7Hr-ttqC1FfjJiQxvQP20sScM7ZoY4lvU0,5471
-anysite/dataset/models.py,sha256=
+anysite/dataset/models.py,sha256=d-bkgu2dUY7_VSgH-oVh84IV3X-KpxRfja0H5WnhauU,9998
 anysite/dataset/notifications.py,sha256=ORzo9XOgOxzLb7rk4pevlKPB_Taf-jejlrtmO4Zgl2c,2367
 anysite/dataset/scheduler.py,sha256=zpbA5tRUQZXr-9lZnG58dvE3E7ZBlAd-U-PTXExe9f0,3339
-anysite/dataset/storage.py,sha256=
+anysite/dataset/storage.py,sha256=ySY822m4lQd6Ip0i3VNPVbHEO6U6zBBwHi-56AXOaXE,5974
 anysite/dataset/transformer.py,sha256=XBI4MiZ_F_IZdootV0GAePaM9-pUadIte7RABbjBipc,6843
 anysite/db/__init__.py,sha256=xGGZHlMt5FUZjI6MAmf2VfyNLypOeXwrRL-gmuTsyl4,1117
 anysite/db/cli.py,sha256=fYuIKWq7eF5mAfZWnXNbtlpITnbYbOFMm2TqU54xIl4,22118
@@ -58,8 +58,8 @@ anysite/streaming/writer.py,sha256=HfMsC4umUdJuNIAPK57YAxEGyTwUmy-zNrqFkwY6aew,4
 anysite/utils/__init__.py,sha256=7SnbxpxKENK-2ecUL5NfnZ9okGI7COKYw4WF46172HM,23
 anysite/utils/fields.py,sha256=bSrHadzNmabL4qubqhXXZoWb_P8KA-3S7_FLVT8nGBc,7410
 anysite/utils/retry.py,sha256=89TbXvavi5t22P2mTYCLAS6SSZoW65gQ0nnYNbYAF0M,2684
-anysite_cli-0.1.4.dist-info/METADATA,sha256=
-anysite_cli-0.1.4.dist-info/WHEEL,sha256=
-anysite_cli-0.1.4.dist-info/entry_points.txt,sha256=
-anysite_cli-0.1.4.dist-info/licenses/LICENSE,sha256=
-anysite_cli-0.1.4.dist-info/RECORD,,
+anysite_cli-0.1.6.dist-info/METADATA,sha256=iqEFoJcISFAZoeT96LrCHiCVPqWk4WX1Xy41siFqUzs,12437
+anysite_cli-0.1.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+anysite_cli-0.1.6.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
+anysite_cli-0.1.6.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
+anysite_cli-0.1.6.dist-info/RECORD,,
{anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/WHEEL
File without changes
{anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/entry_points.txt
File without changes
{anysite_cli-0.1.4.dist-info → anysite_cli-0.1.6.dist-info}/licenses/LICENSE
File without changes