anysite-cli 0.1.2-py3-none-any.whl → 0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anysite/dataset/cli.py CHANGED
@@ -357,6 +357,10 @@ def load_db(
  bool,
  typer.Option("--quiet", "-q", help="Suppress progress output"),
  ] = False,
+ snapshot: Annotated[
+ str | None,
+ typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
+ ] = None,
  ) -> None:
  """Load collected Parquet data into a relational database with FK linking."""
  config = _load_config(config_path)
@@ -379,6 +383,7 @@ def load_db(
  source_filter=source,
  drop_existing=drop_existing,
  dry_run=dry_run,
+ snapshot=snapshot,
  )
  except Exception as e:
  typer.echo(f"Load error: {e}", err=True)
@@ -413,6 +418,121 @@ def load_db(
  )


+ @app.command("diff")
+ def diff_cmd(
+ config_path: Annotated[
+ Path,
+ typer.Argument(help="Path to dataset.yaml"),
+ ],
+ source: Annotated[
+ str,
+ typer.Option("--source", "-s", help="Source to compare"),
+ ],
+ key: Annotated[
+ str,
+ typer.Option("--key", "-k", help="Field to match records by (e.g., _input_value, urn)"),
+ ],
+ from_date: Annotated[
+ str | None,
+ typer.Option("--from", help="Older snapshot date (YYYY-MM-DD)"),
+ ] = None,
+ to_date: Annotated[
+ str | None,
+ typer.Option("--to", help="Newer snapshot date (YYYY-MM-DD)"),
+ ] = None,
+ fields: Annotated[
+ str | None,
+ typer.Option("--fields", "-f", help="Only compare these fields (comma-separated)"),
+ ] = None,
+ format: Annotated[
+ str,
+ typer.Option("--format", help="Output format: table, json, jsonl, csv"),
+ ] = "table",
+ output: Annotated[
+ Path | None,
+ typer.Option("--output", "-o", help="Write output to file"),
+ ] = None,
+ quiet: Annotated[
+ bool,
+ typer.Option("--quiet", "-q", help="Suppress summary, only output data"),
+ ] = False,
+ ) -> None:
+ """Compare two snapshots of a source to show added, removed, and changed records."""
+ from datetime import date as date_type
+
+ from anysite.dataset.differ import (
+ DatasetDiffer,
+ format_diff_records,
+ format_diff_table,
+ )
+
+ config = _load_config(config_path)
+
+ # Validate source exists
+ src = config.get_source(source)
+ if src is None:
+ typer.echo(f"Error: source '{source}' not found in dataset", err=True)
+ raise typer.Exit(1)
+
+ differ = DatasetDiffer(config.storage_path())
+
+ # Parse dates
+ parsed_from = None
+ parsed_to = None
+ try:
+ if from_date:
+ parsed_from = date_type.fromisoformat(from_date)
+ if to_date:
+ parsed_to = date_type.fromisoformat(to_date)
+ except ValueError as e:
+ typer.echo(f"Error: invalid date format: {e}", err=True)
+ raise typer.Exit(1) from None
+
+ # Parse fields
+ field_list = None
+ if fields:
+ field_list = [f.strip() for f in fields.split(",") if f.strip()]
+
+ try:
+ result = differ.diff(
+ source,
+ key,
+ from_date=parsed_from,
+ to_date=parsed_to,
+ fields=field_list,
+ )
+ except DatasetError as e:
+ typer.echo(f"Error: {e}", err=True)
+ raise typer.Exit(1) from None
+
+ # Print summary unless quiet
+ if not quiet:
+ console = Console()
+ console.print(
+ f"\n[bold]Diff: {source}[/bold] "
+ f"({result.from_date.isoformat()} → {result.to_date.isoformat()})\n"
+ )
+ console.print(f" [green]Added:[/green] {len(result.added)}")
+ console.print(f" [red]Removed:[/red] {len(result.removed)}")
+ console.print(f" [yellow]Changed:[/yellow] {len(result.changed)}")
+ console.print(f" Unchanged: {result.unchanged_count}")
+ console.print()
+
+ if not result.has_changes:
+ if not quiet:
+ Console().print("[dim]No changes detected.[/dim]")
+ return
+
+ # Format and output
+ rows = (
+ format_diff_table(result, output_fields=field_list)
+ if format == "table"
+ else format_diff_records(result, output_fields=field_list)
+ )
+
+ _output_results(rows, format, output)
+
+
  @app.command("history")
  def history(
  name: Annotated[
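
The new `diff` subcommand registered above can be exercised directly against the Typer app, for example with Typer's test runner. A minimal sketch, assuming the `app` object shown above is importable from `anysite.dataset.cli` and using placeholder dataset and source names:

```python
# Hypothetical invocation of the new "diff" subcommand via Typer's CliRunner.
# The import path of `app` and the dataset/source names are assumptions.
from typer.testing import CliRunner

from anysite.dataset.cli import app  # assumed location of the @app used above

runner = CliRunner()
result = runner.invoke(
    app,
    [
        "diff", "dataset.yaml",
        "--source", "employees",
        "--key", "_input_value",
        "--format", "json",
        "--quiet",
    ],
)
print(result.exit_code)   # 0 on success, 1 on DatasetError or bad dates
print(result.output)      # JSON rows produced by _output_results
```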
anysite/dataset/collector.py CHANGED
@@ -43,6 +43,7 @@ class CollectionPlan:
  params: dict[str, Any] | None = None,
  dependency: str | None = None,
  estimated_requests: int | None = None,
+ refresh: str = "auto",
  ) -> None:
  self.steps.append({
  "source": source_id,
@@ -51,6 +52,7 @@ class CollectionPlan:
  "params": params or {},
  "dependency": dependency,
  "estimated_requests": estimated_requests,
+ "refresh": refresh,
  })


@@ -116,8 +118,8 @@ async def collect_dataset(

  try:
  for source in ordered:
- # Check incremental skip
- if incremental:
+ # Check incremental skip (refresh: always bypasses this)
+ if incremental and source.refresh != "always":
  parquet_path = get_parquet_path(base_path, source.id, today)
  if parquet_path.exists():
  if not quiet:
@@ -276,8 +278,8 @@ async def _collect_from_file(
  print_warning(f"No values extracted from {file_path}")
  return []

- # Filter already-collected inputs in incremental mode
- if incremental and metadata:
+ # Filter already-collected inputs in incremental mode (refresh: always bypasses)
+ if incremental and source.refresh != "always" and metadata:
  already = metadata.get_collected_inputs(source.id)
  if already:
  original = len(values)
@@ -432,8 +434,8 @@ async def _collect_dependent(
  f"Source {source.id} has a dependency but no input_key defined"
  )

- # Filter already-collected inputs in incremental mode
- if incremental and metadata:
+ # Filter already-collected inputs in incremental mode (refresh: always bypasses)
+ if incremental and source.refresh != "always" and metadata:
  already = metadata.get_collected_inputs(source.id)
  if already:
  original = len(values)
@@ -579,7 +581,7 @@ def _build_plan(
  plan = CollectionPlan()

  for source in ordered:
- if incremental:
+ if incremental and source.refresh != "always":
  parquet_path = get_parquet_path(base_path, source.id, today)
  if parquet_path.exists():
  continue
@@ -592,6 +594,7 @@ def _build_plan(
  kind="from_file",
  params={"file": source.from_file, "field": source.file_field},
  estimated_requests=est,
+ refresh=source.refresh,
  )
  elif source.dependency is None:
  plan.add_step(
@@ -600,6 +603,7 @@ def _build_plan(
  kind="independent",
  params=source.params,
  estimated_requests=1,
+ refresh=source.refresh,
  )
  else:
  est = _count_dependent_inputs(source, base_path, metadata)
@@ -609,6 +613,7 @@ def _build_plan(
  kind="dependent",
  dependency=source.dependency.from_source,
  estimated_requests=est,
+ refresh=source.refresh,
  )

  return plan
@@ -665,11 +670,14 @@ def _print_plan(plan: CollectionPlan) -> dict[str, int]:
  table.add_column("Est. Requests")

  for i, step in enumerate(plan.steps, 1):
+ kind = step["kind"]
+ if step.get("refresh") == "always":
+ kind += " (refresh)"
  table.add_row(
  str(i),
  step["source"],
  step["endpoint"],
- step["kind"],
+ kind,
  step.get("dependency") or "-",
  str(step.get("estimated_requests") or "?"),
  )
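
Every change in this file repeats the same guard: in incremental mode, a source whose `refresh` is `"always"` is no longer skipped just because today's Parquet snapshot already exists. A minimal sketch of that guard in isolation; names mirror the diff, and this is not the real collector code:

```python
# Minimal sketch of the incremental-skip guard added above: refresh="always"
# bypasses the snapshot cache, everything else behaves as before.
from pathlib import Path


def should_skip_source(incremental: bool, refresh: str, parquet_path: Path) -> bool:
    """Return True if an incremental run can reuse today's existing snapshot."""
    if not incremental:
        return False
    if refresh == "always":  # refresh: always re-collects every run
        return False
    return parquet_path.exists()
```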
anysite/dataset/db_loader.py CHANGED
@@ -3,12 +3,18 @@
  from __future__ import annotations

  import json
+ import logging
+ from datetime import date
+ from pathlib import Path
  from typing import Any

  from anysite.dataset.models import DatasetConfig, DatasetSource
  from anysite.dataset.storage import get_source_dir, read_parquet
  from anysite.db.adapters.base import DatabaseAdapter
  from anysite.db.schema.inference import infer_table_schema
+ from anysite.db.utils.sanitize import sanitize_identifier
+
+ logger = logging.getLogger(__name__)


  def _get_dialect(adapter: DatabaseAdapter) -> str:
@@ -86,15 +92,31 @@ def _filter_record(
  return {k: v for k, v in record.items() if k not in exclude}


+ def _get_latest_parquet(base_path: Path, source_id: str) -> Path | None:
+ """Return the path to the most recent snapshot for a source."""
+ source_dir = get_source_dir(base_path, source_id)
+ if not source_dir.exists():
+ return None
+ files = sorted(source_dir.glob("*.parquet"))
+ return files[-1] if files else None
+
+
+ def _get_snapshot_for_date(base_path: Path, source_id: str, d: date) -> Path | None:
+ """Return the parquet path for a specific snapshot date."""
+ source_dir = get_source_dir(base_path, source_id)
+ path = source_dir / f"{d.isoformat()}.parquet"
+ return path if path.exists() else None
+
+
  class DatasetDbLoader:
  """Load dataset Parquet data into a relational database.

- Handles:
- - Schema inference from Parquet records
- - Auto-increment primary keys (``id`` column)
- - Foreign key linking via provenance ``_input_value`` column
- - Dot-notation field extraction for JSON columns
- - Topological loading order (parents before children)
+ Supports diff-based incremental sync when ``db_load.key`` is configured:
+ compares the two most recent snapshots and applies INSERT/DELETE/UPDATE
+ to keep the database in sync.
+
+ Falls back to full INSERT of the latest snapshot when no key is set
+ or when the table doesn't exist yet.
  """

  def __init__(
@@ -115,16 +137,18 @@ class DatasetDbLoader:
  source_filter: str | None = None,
  drop_existing: bool = False,
  dry_run: bool = False,
+ snapshot: str | None = None,
  ) -> dict[str, int]:
  """Load all sources into the database in dependency order.

  Args:
  source_filter: Only load this source (and dependencies).
- drop_existing: Drop tables before creating.
+ drop_existing: Drop tables before creating, then full INSERT latest.
  dry_run: Show plan without executing.
+ snapshot: Load a specific snapshot date (YYYY-MM-DD).

  Returns:
- Mapping of source_id to number of rows loaded.
+ Mapping of source_id to number of rows loaded/affected.
  """
  sources = self.config.topological_sort()

@@ -139,6 +163,7 @@
  source,
  drop_existing=drop_existing,
  dry_run=dry_run,
+ snapshot=snapshot,
  )
  results[source.id] = count

@@ -150,18 +175,64 @@
  *,
  drop_existing: bool = False,
  dry_run: bool = False,
+ snapshot: str | None = None,
  ) -> int:
- """Load a single source into the database."""
- source_dir = get_source_dir(self.base_path, source.id)
- if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
+ """Load a single source into the database.
+
+ Strategy:
+ 1. ``drop_existing``: drop table → full INSERT of latest snapshot
+ 2. ``snapshot``: full INSERT of that specific snapshot
+ 3. Table doesn't exist: full INSERT of latest snapshot
+ 4. Table exists + ``db_load.key`` set + ≥2 snapshots: diff-based sync
+ 5. Fallback: full INSERT of latest snapshot
+ """
+ table_name = _table_name_for(source)
+
+ # Handle drop_existing
+ if drop_existing and self.adapter.table_exists(table_name):
+ self.adapter.execute(f"DROP TABLE {table_name}")
+
+ # Determine which parquet to load
+ if snapshot:
+ snapshot_date = date.fromisoformat(snapshot)
+ parquet_path = _get_snapshot_for_date(self.base_path, source.id, snapshot_date)
+ if parquet_path is None:
+ return 0
+ return self._full_insert(source, table_name, parquet_path, dry_run=dry_run)
+
+ # Check if we can do diff-based sync
+ diff_key = source.db_load.key if source.db_load else None
+ table_exists = self.adapter.table_exists(table_name)
+
+ if diff_key and table_exists and not drop_existing:
+ from anysite.dataset.differ import DatasetDiffer
+ differ = DatasetDiffer(self.base_path)
+ dates = differ.available_dates(source.id)
+
+ if len(dates) >= 2:
+ return self._diff_sync(
+ source, table_name, diff_key, differ, dates, dry_run=dry_run
+ )
+
+ # Fallback: full INSERT of latest snapshot
+ latest = _get_latest_parquet(self.base_path, source.id)
+ if latest is None:
  return 0
+ return self._full_insert(source, table_name, latest, dry_run=dry_run)

- raw_records = read_parquet(source_dir)
+ def _full_insert(
+ self,
+ source: DatasetSource,
+ table_name: str,
+ parquet_path: Path,
+ *,
+ dry_run: bool = False,
+ ) -> int:
+ """Full INSERT: read parquet, transform, create table if needed, insert all rows."""
+ raw_records = read_parquet(parquet_path)
  if not raw_records:
  return 0

- table_name = _table_name_for(source)
-
  # Determine parent info for FK linking
  parent_source_id = None
  parent_fk_col = None
@@ -174,7 +245,6 @@
  for record in raw_records:
  row = _filter_record(record, source)

- # Add FK column if this is a dependent source
  if parent_source_id and parent_fk_col:
  input_val = record.get("_input_value")
  parent_map = self._value_to_id.get(parent_source_id, {})
@@ -189,17 +259,12 @@
  return len(rows)

  # Determine the lookup field for children to reference this source
- # This is the field that child dependencies extract from this source
  lookup_field = self._get_child_lookup_field(source)

- # Create table
- if drop_existing and self.adapter.table_exists(table_name):
- self.adapter.execute(f"DROP TABLE {table_name}")
-
+ # Create table if needed
  if not self.adapter.table_exists(table_name):
  schema = infer_table_schema(table_name, rows)
  sql_types = schema.to_sql_types(self._dialect)
- # Add auto-increment id column
  col_defs = {"id": self._auto_id_type()}
  col_defs.update(sql_types)
  self.adapter.create_table(table_name, col_defs, primary_key="id")
@@ -208,10 +273,8 @@
  value_map: dict[str, int] = {}
  for i, row in enumerate(rows):
  self.adapter.insert_batch(table_name, [row])
- # Get the last inserted id
  last_id = self._get_last_id(table_name)

- # Build value→id map for child sources
  if lookup_field and last_id is not None:
  raw_record = raw_records[i]
  lookup_val = _extract_dot_value(raw_record, lookup_field)
@@ -225,6 +288,82 @@

  return len(rows)

+ def _diff_sync(
+ self,
+ source: DatasetSource,
+ table_name: str,
+ diff_key: str,
+ differ: Any,
+ dates: list[date],
+ *,
+ dry_run: bool = False,
+ ) -> int:
+ """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
+ result = differ.diff(source.id, diff_key)
+ total = 0
+
+ if dry_run:
+ return len(result.added) + len(result.removed) + len(result.changed)
+
+ # Extract key value from a record (handles dot-notation)
+ def _get_key_val(record: dict[str, Any]) -> Any:
+ if "." in diff_key:
+ return _extract_dot_value(record, diff_key)
+ return record.get(diff_key)
+
+ # Determine the DB column name for the key
+ db_key_col = diff_key.replace(".", "_")
+
+ # INSERT added records
+ if result.added:
+ for record in result.added:
+ row = _filter_record(record, source)
+ self.adapter.insert_batch(table_name, [row])
+ total += 1
+
+ # DELETE removed records
+ if result.removed:
+ safe_col = sanitize_identifier(db_key_col)
+ for record in result.removed:
+ key_val = _get_key_val(record)
+ if key_val is not None:
+ self.adapter.execute(
+ f"DELETE FROM {table_name} WHERE {safe_col} = ?",
+ (str(key_val),),
+ )
+ total += 1
+
+ # UPDATE changed records
+ if result.changed:
+ safe_col = sanitize_identifier(db_key_col)
+ for record in result.changed:
+ key_val = _get_key_val(record)
+ if key_val is None:
+ continue
+ changed_fields = record.get("_changed_fields", [])
+ if not changed_fields:
+ continue
+
+ # Build SET clause from changed fields
+ set_parts = []
+ params: list[Any] = []
+ for field_name in changed_fields:
+ new_val = record.get(field_name)
+ safe_field = sanitize_identifier(field_name)
+ set_parts.append(f"{safe_field} = ?")
+ params.append(new_val)
+
+ params.append(str(key_val))
+ sql = (
+ f"UPDATE {table_name} "
+ f"SET {', '.join(set_parts)} "
+ f"WHERE {safe_col} = ?"
+ )
+ self.adapter.execute(sql, tuple(params))
+ total += 1
+
+ return total
+
  def _get_child_lookup_field(self, source: DatasetSource) -> str | None:
  """Find which field children use to reference this source."""
  for other in self.config.sources:
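
The `_load_source` docstring above enumerates the load strategy in priority order. A small illustrative restatement of that decision order, using placeholder flags in place of the real adapter and differ lookups:

```python
# Illustrative-only restatement of the _load_source strategy table above;
# the boolean/count arguments stand in for the real adapter and differ calls.
def choose_load_strategy(
    *,
    drop_existing: bool,
    snapshot: str | None,
    table_exists: bool,
    diff_key: str | None,
    snapshot_count: int,
) -> str:
    if drop_existing:
        return "drop table, then full INSERT of the latest snapshot"
    if snapshot:
        return "full INSERT of the requested snapshot date"
    if not table_exists:
        return "full INSERT of the latest snapshot"
    if diff_key and snapshot_count >= 2:
        return "diff-based sync (INSERT added, DELETE removed, UPDATE changed)"
    return "full INSERT of the latest snapshot"
```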
anysite/dataset/differ.py ADDED
@@ -0,0 +1,496 @@
+ """Compare two dataset snapshots to find added, removed, and changed records."""
+
+ from __future__ import annotations
+
+ import json
+ from dataclasses import dataclass, field
+ from datetime import date
+ from pathlib import Path
+ from typing import Any
+
+ from anysite.dataset.errors import DatasetError
+ from anysite.dataset.storage import get_source_dir
+
+
+ def _build_key_expr(key: str, all_columns: list[str]) -> tuple[str, str]:
+ """Build a DuckDB key expression, supporting dot-notation for JSON fields.
+
+ Returns:
+ (key_expr, key_alias) — the SQL expression and a display alias.
+ For simple keys: ('"field"', 'field')
+ For dot-notation: ("json_extract_string(\"urn\", '$.value')", 'urn.value')
+ """
+ if "." not in key:
+ if key not in all_columns:
+ raise DatasetError(
+ f"Key field '{key}' not found. "
+ f"Available: {', '.join(all_columns)}"
+ )
+ return f'"{key}"', key
+
+ root, rest = key.split(".", 1)
+ if root not in all_columns:
+ raise DatasetError(
+ f"Root field '{root}' (from key '{key}') not found. "
+ f"Available: {', '.join(all_columns)}"
+ )
+ return f"json_extract_string(\"{root}\", '$.{rest}')", key
+
+
+ @dataclass
+ class DiffResult:
+ """Result of comparing two dataset snapshots."""
+
+ source_id: str
+ from_date: date
+ to_date: date
+ key: str
+ added: list[dict[str, Any]] = field(default_factory=list)
+ removed: list[dict[str, Any]] = field(default_factory=list)
+ changed: list[dict[str, Any]] = field(default_factory=list)
+ unchanged_count: int = 0
+ fields: list[str] | None = field(default=None)
+
+ @property
+ def has_changes(self) -> bool:
+ return bool(self.added or self.removed or self.changed)
+
+
+ class DatasetDiffer:
+ """Compare two Parquet snapshots for a dataset source."""
+
+ def __init__(self, base_path: Path) -> None:
+ self.base_path = base_path
+
+ def available_dates(self, source_id: str) -> list[date]:
+ """List available snapshot dates for a source, sorted ascending."""
+ source_dir = get_source_dir(self.base_path, source_id)
+ if not source_dir.exists():
+ return []
+
+ dates: list[date] = []
+ for f in sorted(source_dir.glob("*.parquet")):
+ try:
+ dates.append(date.fromisoformat(f.stem))
+ except ValueError:
+ continue
+ return dates
+
+ def diff(
+ self,
+ source_id: str,
+ key: str,
+ *,
+ from_date: date | None = None,
+ to_date: date | None = None,
+ fields: list[str] | None = None,
+ ) -> DiffResult:
+ """Compare two snapshots using DuckDB.
+
+ Args:
+ source_id: Source to compare.
+ key: Field to match records by. Supports dot-notation for
+ JSON fields (e.g., ``urn.value``).
+ from_date: Older snapshot date (default: second-to-last).
+ to_date: Newer snapshot date (default: latest).
+ fields: Only compare (and output) these fields (default: all).
+
+ Returns:
+ DiffResult with added, removed, changed lists.
+ """
+ from_date, to_date = self._resolve_dates(source_id, from_date, to_date)
+ source_dir = get_source_dir(self.base_path, source_id)
+ old_path = source_dir / f"{from_date.isoformat()}.parquet"
+ new_path = source_dir / f"{to_date.isoformat()}.parquet"
+
+ if not old_path.exists():
+ raise DatasetError(
+ f"No snapshot for {source_id} on {from_date.isoformat()}"
+ )
+ if not new_path.exists():
+ raise DatasetError(
+ f"No snapshot for {source_id} on {to_date.isoformat()}"
+ )
+
+ return self._diff_with_duckdb(
+ source_id, key, old_path, new_path, from_date, to_date, fields
+ )
+
+ def _resolve_dates(
+ self,
+ source_id: str,
+ from_date: date | None,
+ to_date: date | None,
+ ) -> tuple[date, date]:
+ """Resolve from/to dates, defaulting to two most recent snapshots."""
+ if from_date and to_date:
+ return from_date, to_date
+
+ dates = self.available_dates(source_id)
+ if len(dates) < 2:
+ raise DatasetError(
+ f"Need at least 2 snapshots to diff, "
+ f"found {len(dates)} for {source_id}"
+ )
+
+ if to_date and not from_date:
+ # Find the date just before to_date
+ earlier = [d for d in dates if d < to_date]
+ if not earlier:
+ raise DatasetError(
+ f"No snapshot before {to_date.isoformat()} for {source_id}"
+ )
+ return earlier[-1], to_date
+
+ if from_date and not to_date:
+ # Find the date just after from_date
+ later = [d for d in dates if d > from_date]
+ if not later:
+ raise DatasetError(
+ f"No snapshot after {from_date.isoformat()} for {source_id}"
+ )
+ return from_date, later[0]
+
+ # Both None — use two most recent
+ return dates[-2], dates[-1]
+
+ def _diff_with_duckdb(
+ self,
+ source_id: str,
+ key: str,
+ old_path: Path,
+ new_path: Path,
+ from_date: date,
+ to_date: date,
+ fields: list[str] | None,
+ ) -> DiffResult:
+ """Run diff queries using DuckDB."""
+ import duckdb
+
+ conn = duckdb.connect(":memory:")
+ try:
+ conn.execute(
+ f"CREATE VIEW _old AS SELECT * FROM read_parquet('{old_path}')"
+ )
+ conn.execute(
+ f"CREATE VIEW _new AS SELECT * FROM read_parquet('{new_path}')"
+ )
+
+ # Get all column names from the new snapshot
+ info = conn.execute("DESCRIBE _new").fetchall()
+ all_columns = [col[0] for col in info]
+
+ # Build key expression (supports dot-notation)
+ key_expr, key_alias = _build_key_expr(key, all_columns)
+
+ # Determine which fields to compare
+ compare_fields = fields if fields else [
+ c for c in all_columns if c != key and c != key.split(".")[0]
+ ]
+ # Filter to fields that actually exist
+ compare_fields = [c for c in compare_fields if c in all_columns]
+
+ # Determine output columns: if fields specified, restrict to key + fields
+ if fields:
+ output_columns = [key_alias] + [
+ f for f in fields if f in all_columns
+ ]
+ else:
+ output_columns = None # all columns
+
+ # Added: in new but not in old
+ added_records = self._query_added_removed(
+ conn, "_new", "_old", key_expr, key_alias, all_columns, output_columns
+ )
+
+ # Removed: in old but not in new
+ removed_records = self._query_added_removed(
+ conn, "_old", "_new", key_expr, key_alias, all_columns, output_columns
+ )
+
+ # Changed: matching key, different values
+ changed_records = self._find_changed(
+ conn, key_expr, key_alias, compare_fields, all_columns, output_columns
+ )
+
+ # Count unchanged
+ total_matched = conn.execute(
+ f"SELECT COUNT(*) FROM _new n "
+ f"JOIN _old o ON ({_requalify(key_expr, 'n')}) = ({_requalify(key_expr, 'o')})"
+ ).fetchone()
+ matched_count = total_matched[0] if total_matched else 0
+ unchanged_count = matched_count - len(changed_records)
+
+ return DiffResult(
+ source_id=source_id,
+ from_date=from_date,
+ to_date=to_date,
+ key=key,
+ added=added_records,
+ removed=removed_records,
+ changed=changed_records,
+ unchanged_count=unchanged_count,
+ fields=fields,
+ )
+ finally:
+ conn.close()
+
+ @staticmethod
+ def _query_added_removed(
+ conn: Any,
+ present_view: str,
+ absent_view: str,
+ key_expr: str,
+ key_alias: str,
+ all_columns: list[str],
+ output_columns: list[str] | None,
+ ) -> list[dict[str, Any]]:
+ """Query records present in one view but not the other."""
+ # Build SELECT list
+ if output_columns:
+ select_parts = []
+ for col in output_columns:
+ if col == key_alias and "." in col:
+ select_parts.append(f"{key_expr} AS \"{key_alias}\"")
+ else:
+ select_parts.append(f'"{col}"')
+ select_clause = ", ".join(select_parts)
+ else:
+ if "." in key_alias:
+ select_clause = f"*, {key_expr} AS \"{key_alias}\""
+ else:
+ select_clause = "*"
+
+ sql = (
+ f"SELECT {select_clause} FROM {present_view} "
+ f"WHERE ({key_expr}) NOT IN (SELECT ({key_expr}) FROM {absent_view})"
+ )
+ result = conn.execute(sql)
+ columns = [desc[0] for desc in result.description]
+ rows = result.fetchall()
+ return [dict(zip(columns, row, strict=False)) for row in rows]
+
+ @staticmethod
+ def _find_changed(
+ conn: Any,
+ key_expr: str,
+ key_alias: str,
+ compare_fields: list[str],
+ all_columns: list[str],
+ output_columns: list[str] | None,
+ ) -> list[dict[str, Any]]:
+ """Find records that exist in both snapshots but have different values."""
+ if not compare_fields:
+ return []
+
+ # Build WHERE clause: any compared field differs
+ where_parts = []
+ for col in compare_fields:
+ qc = f'"{col}"'
+ where_parts.append(f"n.{qc} IS DISTINCT FROM o.{qc}")
+ where_clause = " OR ".join(where_parts)
+
+ # Build JOIN condition
+ join_key_n = _requalify(key_expr, "n")
+ join_key_o = _requalify(key_expr, "o")
+ join_cond = f"({join_key_n}) = ({join_key_o})"
+
+ # Build SELECT: key + output fields + __old for compare fields
+ if output_columns:
+ # Restricted output
+ select_parts = []
+ for col in output_columns:
+ if col == key_alias and "." in col:
+ select_parts.append(f"{_requalify(key_expr, 'n')} AS \"{key_alias}\"")
+ else:
+ select_parts.append(f"n.\"{col}\"")
+ for col in compare_fields:
+ # Include __old for compare fields that are in output
+ if col in [c for c in output_columns if c != key_alias]:
+ select_parts.append(f"o.\"{col}\" AS \"{col}__old\"")
+ else:
+ # Full output
+ select_parts = []
+ if "." in key_alias:
+ select_parts.append(f"{_requalify(key_expr, 'n')} AS \"{key_alias}\"")
+ else:
+ select_parts.append(f"n.\"{key_alias}\"")
+ for col in all_columns:
+ if col == key_alias:
+ continue
+ select_parts.append(f"n.\"{col}\"")
+ for col in compare_fields:
+ select_parts.append(f"o.\"{col}\" AS \"{col}__old\"")
+
+ select_clause = ", ".join(select_parts)
+
+ sql = (
+ f"SELECT {select_clause} FROM _new n "
+ f"JOIN _old o ON {join_cond} "
+ f"WHERE {where_clause}"
+ )
+
+ result = conn.execute(sql)
+ columns = [desc[0] for desc in result.description]
+ rows = result.fetchall()
+ records = [dict(zip(columns, row, strict=False)) for row in rows]
+
+ # Add _changed_fields metadata to each record
+ for record in records:
+ changed_fields = []
+ for col in compare_fields:
+ old_key = f"{col}__old"
+ new_val = record.get(col)
+ old_val = record.get(old_key)
+ if _values_differ(new_val, old_val):
+ changed_fields.append(col)
+ record["_changed_fields"] = changed_fields
+
+ return records
+
+
+ def _requalify(key_expr: str, prefix: str) -> str:
+ """Requalify a key expression with a table alias prefix.
+
+ For simple keys like '"field"', returns 'prefix."field"'.
+ For json_extract_string("col", '$.path'), returns
+ json_extract_string(prefix."col", '$.path').
+ """
+ if key_expr.startswith("json_extract_string("):
+ # Replace the column reference inside json_extract_string
+ inner = key_expr[len("json_extract_string("):]
+ # inner looks like: "col", '$.path')
+ col_end = inner.index(",")
+ col = inner[:col_end].strip()
+ rest = inner[col_end:]
+ return f"json_extract_string({prefix}.{col}{rest}"
+ return f"{prefix}.{key_expr}"
+
+
+ def _values_differ(a: Any, b: Any) -> bool:
+ """Compare two values, treating JSON strings as equivalent to their parsed form."""
+ if a == b:
+ return False
+ # Handle JSON string comparison
+ if isinstance(a, str) and isinstance(b, str):
+ try:
+ return json.loads(a) != json.loads(b)
+ except (json.JSONDecodeError, ValueError):
+ pass
+ return True
+
+
+ def format_diff_table(
+ result: DiffResult,
+ *,
+ output_fields: list[str] | None = None,
+ ) -> list[dict[str, Any]]:
+ """Format a DiffResult into a flat list of dicts for table/json output.
+
+ Each record gets a ``_diff`` column with value ``added``, ``removed``,
+ or ``changed``. For changed records in table mode, modified field
+ values are formatted as ``old → new``.
+
+ Args:
+ result: The diff result.
+ output_fields: If set, only include these fields (plus ``_diff`` and key).
+ """
+ allowed = _build_allowed_set(result.key, output_fields)
+ rows: list[dict[str, Any]] = []
+
+ for record in result.added:
+ row = {"_diff": "added", **_filter_row(record, allowed)}
+ rows.append(row)
+
+ for record in result.removed:
+ row = {"_diff": "removed", **_filter_row(record, allowed)}
+ rows.append(row)
+
+ for record in result.changed:
+ row: dict[str, Any] = {"_diff": "changed"}
+ changed_fields = record.get("_changed_fields", [])
+ for k, v in record.items():
+ if k == "_changed_fields":
+ continue
+ if k.endswith("__old"):
+ continue
+ if allowed and k not in allowed:
+ continue
+ # For changed fields, format as "old → new"
+ if k in changed_fields:
+ old_val = record.get(f"{k}__old")
+ row[k] = f"{_format_val(old_val)} → {_format_val(v)}"
+ else:
+ row[k] = v
+ rows.append(row)
+
+ return rows
+
+
+ def format_diff_records(
+ result: DiffResult,
+ *,
+ output_fields: list[str] | None = None,
+ ) -> list[dict[str, Any]]:
+ """Format a DiffResult for JSON/CSV output.
+
+ Each record gets ``_diff`` column. Changed records include both
+ current values and ``field__old`` columns.
+
+ Args:
+ result: The diff result.
+ output_fields: If set, only include these fields (plus ``_diff``, key, and ``__old``).
+ """
+ allowed = _build_allowed_set(result.key, output_fields)
+ rows: list[dict[str, Any]] = []
+
+ for record in result.added:
+ rows.append({"_diff": "added", **_filter_row(record, allowed)})
+
+ for record in result.removed:
+ rows.append({"_diff": "removed", **_filter_row(record, allowed)})
+
+ for record in result.changed:
+ row: dict[str, Any] = {"_diff": "changed"}
+ for k, v in record.items():
+ if k == "_changed_fields":
+ continue
+ if allowed and k not in allowed and not k.endswith("__old"):
+ continue
+ if k.endswith("__old") and allowed:
+ base = k[: -len("__old")]
+ if base not in allowed:
+ continue
+ row[k] = v
+ rows.append(row)
+
+ return rows
+
+
+ def _build_allowed_set(key: str, output_fields: list[str] | None) -> set[str] | None:
+ """Build the set of allowed field names for output filtering."""
+ if not output_fields:
+ return None
+ allowed = set(output_fields)
+ allowed.add(key)
+ # Also add the root column for dot-notation keys
+ if "." in key:
+ allowed.add(key.split(".")[0])
+ return allowed
+
+
+ def _filter_row(record: dict[str, Any], allowed: set[str] | None) -> dict[str, Any]:
+ """Filter a record to only allowed fields."""
+ if not allowed:
+ return record
+ return {k: v for k, v in record.items() if k in allowed}
+
+
+ def _format_val(v: Any) -> str:
+ """Format a value for display, truncating long strings."""
+ if v is None:
+ return "null"
+ s = str(v)
+ if len(s) > 40:
+ return s[:37] + "..."
+ return s
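
For orientation, here is a hedged usage sketch of the new `DatasetDiffer` API based only on the signatures above; the storage path, source id, and key are placeholders:

```python
# Hypothetical usage of the new differ module; paths and names are examples only.
from pathlib import Path

from anysite.dataset.differ import DatasetDiffer, format_diff_table

differ = DatasetDiffer(Path("./data/my-dataset"))   # placeholder storage path
dates = differ.available_dates("employees")         # snapshot dates, ascending
result = differ.diff("employees", "_input_value")   # defaults to the two most recent snapshots

print(result.from_date, "→", result.to_date)
print(len(result.added), "added,", len(result.removed), "removed,",
      len(result.changed), "changed,", result.unchanged_count, "unchanged")

if result.has_changes:
    rows = format_diff_table(result)                 # flat rows with a _diff column
```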
anysite/dataset/models.py CHANGED
@@ -81,6 +81,7 @@ class DbLoadConfig(BaseModel):
  """Configuration for loading a source into a relational database."""

  table: str | None = Field(default=None, description="Override table name (default: source id)")
+ key: str | None = Field(default=None, description="Unique key field for diff-based DB sync (e.g., urn.value)")
  fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
  exclude: list[str] = Field(
  default_factory=lambda: ["_input_value", "_parent_source"],
@@ -129,6 +130,10 @@ class DatasetSource(BaseModel):
  default_factory=list,
  description="Export destinations (file/webhook) applied after Parquet write",
  )
+ refresh: Literal["auto", "always"] = Field(
+ default="auto",
+ description="Refresh mode: 'auto' uses incremental caching, 'always' re-collects every run",
+ )

  @field_validator("endpoint")
  @classmethod
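
For illustration, the new `db_load.key` field is set like any other `DbLoadConfig` option; the values below are placeholders, not taken from the diff, and the `refresh` literal lives on `DatasetSource` rather than on this model:

```python
# Hedged sketch of the new db_load.key field; field values are illustrative.
from anysite.dataset.models import DbLoadConfig

cfg = DbLoadConfig(key="urn.value", fields=["name", "url", "headline"])
print(cfg.table)    # None -> table name defaults to the source id
print(cfg.exclude)  # ['_input_value', '_parent_source'] by default
```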
anysite_cli-0.1.2.dist-info/METADATA → anysite_cli-0.1.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: anysite-cli
- Version: 0.1.2
+ Version: 0.1.4
  Summary: CLI for Anysite API - web data extraction for humans and AI agents
  Project-URL: Homepage, https://anysite.io
  Project-URL: Documentation, https://docs.anysite.io/cli
@@ -272,6 +272,7 @@ sources:
  - type: company
  value: "{value}"
  count: 5
+ refresh: always # Re-collect every run with --incremental
  db_load:
  fields: [name, url, headline]

@@ -327,6 +328,9 @@ anysite dataset logs my-dataset --run 42
  # Generate cron/systemd schedule
  anysite dataset schedule dataset.yaml --incremental --load-db pg

+ # Compare snapshots (diff two collection dates)
+ anysite dataset diff dataset.yaml --source employees --key _input_value
+
  # Reset incremental state
  anysite dataset reset-cursor dataset.yaml
  ```
anysite_cli-0.1.2.dist-info/RECORD → anysite_cli-0.1.4.dist-info/RECORD CHANGED
@@ -19,13 +19,14 @@ anysite/config/paths.py,sha256=EmHJD8wlf4Q9IUn8Gp1JQ8Z3ffrIYAt5iHRyImQOf5I,1087
  anysite/config/settings.py,sha256=Hc0j_aCCtkJeL4nHw-EFyfJ8WEDk57G08iNUFquUhpM,5235
  anysite/dataset/__init__.py,sha256=J0sKQkGwVOPtvp6pka7LcdeUEADvjWRs71yRuROzJxI,847
  anysite/dataset/analyzer.py,sha256=8dsPW32SbSaUTy1F0NIed1U45wjiMgQeJ2iWX7hBxRQ,9245
- anysite/dataset/cli.py,sha256=elBpp7XEmMcWBdyjkeDOi1730bCxVM77D0aVcmM9A0s,19423
- anysite/dataset/collector.py,sha256=GGzkkvNETcdgeyIV_6YNthDY2SHtDhl-U8YsXeeOuFo,23350
- anysite/dataset/db_loader.py,sha256=nlMJrDJiGBX5H1StcjsontSxLXbsFe4rwOEnDehzpk8,8443
+ anysite/dataset/cli.py,sha256=rEWK1ka-YQ_Vbbj2nMaMYTD9g3wa3ethUWSoaWRSGTY,23066
+ anysite/dataset/collector.py,sha256=6CfJt8fKZZ2xvZWJ7jwnx0V9BnjoJxmBZkm8xWQiU54,23840
+ anysite/dataset/db_loader.py,sha256=TMcvI-pX-XctbkTdo5eTyW8Co4_3uK-dEdXn_r9g8Oc,13547
+ anysite/dataset/differ.py,sha256=b-qU5Laf8RkteZAlblKq4atTvnJ21W4QbxfpHBFYMJ8,17053
  anysite/dataset/errors.py,sha256=r8cZXoIzSeTGCWpeYjntnN0AduCu74YZyWs3sFu17J4,914
  anysite/dataset/exporters.py,sha256=mA2FYbYJbHfrwkXbHDu4g5qPG_JJKnkVciXFKPkF1Vw,3708
  anysite/dataset/history.py,sha256=avFs0ADlM7Hr-ttqC1FfjJiQxvQP20sScM7ZoY4lvU0,5471
- anysite/dataset/models.py,sha256=YzpXmkmcyy_-BUxnYWrZMidpii75DbbZHBMTkw8Y1qo,9516
+ anysite/dataset/models.py,sha256=-Qnh6QvbN3nzlfsYqgCiYKBqOeLcJCYK_hYrmxVCRTA,9810
  anysite/dataset/notifications.py,sha256=ORzo9XOgOxzLb7rk4pevlKPB_Taf-jejlrtmO4Zgl2c,2367
  anysite/dataset/scheduler.py,sha256=zpbA5tRUQZXr-9lZnG58dvE3E7ZBlAd-U-PTXExe9f0,3339
  anysite/dataset/storage.py,sha256=d03goKLI5NWKJowHwCgGqQkcVTO1NctPxMu-Xu-tru4,5326
@@ -57,8 +58,8 @@ anysite/streaming/writer.py,sha256=HfMsC4umUdJuNIAPK57YAxEGyTwUmy-zNrqFkwY6aew,4
  anysite/utils/__init__.py,sha256=7SnbxpxKENK-2ecUL5NfnZ9okGI7COKYw4WF46172HM,23
  anysite/utils/fields.py,sha256=bSrHadzNmabL4qubqhXXZoWb_P8KA-3S7_FLVT8nGBc,7410
  anysite/utils/retry.py,sha256=89TbXvavi5t22P2mTYCLAS6SSZoW65gQ0nnYNbYAF0M,2684
- anysite_cli-0.1.2.dist-info/METADATA,sha256=ucMaKonPqQeg6HsoPfVCbmjiVyIZxW5Pb2HElHiAtHo,11576
- anysite_cli-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- anysite_cli-0.1.2.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
- anysite_cli-0.1.2.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
- anysite_cli-0.1.2.dist-info/RECORD,,
+ anysite_cli-0.1.4.dist-info/METADATA,sha256=w5DUgDWzJgXynKRogJVm9baLqTJVSrg0ciHuWfWa9l0,11781
+ anysite_cli-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ anysite_cli-0.1.4.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
+ anysite_cli-0.1.4.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
+ anysite_cli-0.1.4.dist-info/RECORD,,