anysite-cli 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anysite/dataset/cli.py +111 -0
- anysite/dataset/collector.py +16 -8
- anysite/dataset/differ.py +355 -0
- anysite/dataset/models.py +4 -0
- {anysite_cli-0.1.2.dist-info → anysite_cli-0.1.3.dist-info}/METADATA +5 -1
- {anysite_cli-0.1.2.dist-info → anysite_cli-0.1.3.dist-info}/RECORD +9 -8
- {anysite_cli-0.1.2.dist-info → anysite_cli-0.1.3.dist-info}/WHEEL +0 -0
- {anysite_cli-0.1.2.dist-info → anysite_cli-0.1.3.dist-info}/entry_points.txt +0 -0
- {anysite_cli-0.1.2.dist-info → anysite_cli-0.1.3.dist-info}/licenses/LICENSE +0 -0
anysite/dataset/cli.py
CHANGED
|
@@ -413,6 +413,117 @@ def load_db(
|
|
|
413
413
|
)
|
|
414
414
|
|
|
415
415
|
|
|
416
|
+
@app.command("diff")
def diff_cmd(
    config_path: Annotated[
        Path,
        typer.Argument(help="Path to dataset.yaml"),
    ],
    source: Annotated[
        str,
        typer.Option("--source", "-s", help="Source to compare"),
    ],
    key: Annotated[
        str,
        typer.Option("--key", "-k", help="Field to match records by (e.g., _input_value, urn)"),
    ],
    from_date: Annotated[
        str | None,
        typer.Option("--from", help="Older snapshot date (YYYY-MM-DD)"),
    ] = None,
    to_date: Annotated[
        str | None,
        typer.Option("--to", help="Newer snapshot date (YYYY-MM-DD)"),
    ] = None,
    fields: Annotated[
        str | None,
        typer.Option("--fields", "-f", help="Only compare these fields (comma-separated)"),
    ] = None,
    format: Annotated[
        str,
        typer.Option("--format", help="Output format: table, json, jsonl, csv"),
    ] = "table",
    output: Annotated[
        Path | None,
        typer.Option("--output", "-o", help="Write output to file"),
    ] = None,
    quiet: Annotated[
        bool,
        typer.Option("--quiet", "-q", help="Suppress summary, only output data"),
    ] = False,
) -> None:
    """Compare two snapshots of a source to show added, removed, and changed records."""
    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept short; implementation notes live in comments instead.
    from datetime import date as date_type

    from anysite.dataset.differ import (
        DatasetDiffer,
        format_diff_records,
        format_diff_table,
    )

    config = _load_config(config_path)

    # Refuse to continue when the dataset does not declare the requested source.
    if config.get_source(source) is None:
        typer.echo(f"Error: source '{source}' not found in dataset", err=True)
        raise typer.Exit(1)

    differ = DatasetDiffer(config.storage_path())

    # Turn the optional ISO date strings into date objects; a missing option
    # stays None so the differ can pick a default snapshot.
    try:
        older = date_type.fromisoformat(from_date) if from_date else None
        newer = date_type.fromisoformat(to_date) if to_date else None
    except ValueError as exc:
        typer.echo(f"Error: invalid date format: {exc}", err=True)
        raise typer.Exit(1) from None

    # "--fields a, b" -> ["a", "b"]; blank entries are dropped.
    wanted_fields = None
    if fields:
        wanted_fields = [part.strip() for part in fields.split(",") if part.strip()]

    try:
        result = differ.diff(
            source,
            key,
            from_date=older,
            to_date=newer,
            fields=wanted_fields,
        )
    except DatasetError as exc:
        typer.echo(f"Error: {exc}", err=True)
        raise typer.Exit(1) from None

    changes_found = result.has_changes

    # Human-readable summary (skipped with --quiet).
    if not quiet:
        console = Console()
        console.print(
            f"\n[bold]Diff: {source}[/bold] "
            f"({result.from_date.isoformat()} → {result.to_date.isoformat()})\n"
        )
        console.print(f" [green]Added:[/green] {len(result.added)}")
        console.print(f" [red]Removed:[/red] {len(result.removed)}")
        console.print(f" [yellow]Changed:[/yellow] {len(result.changed)}")
        console.print(f" Unchanged: {result.unchanged_count}")
        console.print()
        if not changes_found:
            console.print("[dim]No changes detected.[/dim]")

    # Nothing to emit when the two snapshots match.
    if not changes_found:
        return

    # Table output folds old/new values into one cell; every other format
    # keeps the raw records.
    if format == "table":
        rows = format_diff_table(result)
    else:
        rows = format_diff_records(result)

    _output_results(rows, format, output)
|
|
525
|
+
|
|
526
|
+
|
|
416
527
|
@app.command("history")
|
|
417
528
|
def history(
|
|
418
529
|
name: Annotated[
|
anysite/dataset/collector.py
CHANGED
|
@@ -43,6 +43,7 @@ class CollectionPlan:
|
|
|
43
43
|
params: dict[str, Any] | None = None,
|
|
44
44
|
dependency: str | None = None,
|
|
45
45
|
estimated_requests: int | None = None,
|
|
46
|
+
refresh: str = "auto",
|
|
46
47
|
) -> None:
|
|
47
48
|
self.steps.append({
|
|
48
49
|
"source": source_id,
|
|
@@ -51,6 +52,7 @@ class CollectionPlan:
|
|
|
51
52
|
"params": params or {},
|
|
52
53
|
"dependency": dependency,
|
|
53
54
|
"estimated_requests": estimated_requests,
|
|
55
|
+
"refresh": refresh,
|
|
54
56
|
})
|
|
55
57
|
|
|
56
58
|
|
|
@@ -116,8 +118,8 @@ async def collect_dataset(
|
|
|
116
118
|
|
|
117
119
|
try:
|
|
118
120
|
for source in ordered:
|
|
119
|
-
# Check incremental skip
|
|
120
|
-
if incremental:
|
|
121
|
+
# Check incremental skip (refresh: always bypasses this)
|
|
122
|
+
if incremental and source.refresh != "always":
|
|
121
123
|
parquet_path = get_parquet_path(base_path, source.id, today)
|
|
122
124
|
if parquet_path.exists():
|
|
123
125
|
if not quiet:
|
|
@@ -276,8 +278,8 @@ async def _collect_from_file(
|
|
|
276
278
|
print_warning(f"No values extracted from {file_path}")
|
|
277
279
|
return []
|
|
278
280
|
|
|
279
|
-
# Filter already-collected inputs in incremental mode
|
|
280
|
-
if incremental and metadata:
|
|
281
|
+
# Filter already-collected inputs in incremental mode (refresh: always bypasses)
|
|
282
|
+
if incremental and source.refresh != "always" and metadata:
|
|
281
283
|
already = metadata.get_collected_inputs(source.id)
|
|
282
284
|
if already:
|
|
283
285
|
original = len(values)
|
|
@@ -432,8 +434,8 @@ async def _collect_dependent(
|
|
|
432
434
|
f"Source {source.id} has a dependency but no input_key defined"
|
|
433
435
|
)
|
|
434
436
|
|
|
435
|
-
# Filter already-collected inputs in incremental mode
|
|
436
|
-
if incremental and metadata:
|
|
437
|
+
# Filter already-collected inputs in incremental mode (refresh: always bypasses)
|
|
438
|
+
if incremental and source.refresh != "always" and metadata:
|
|
437
439
|
already = metadata.get_collected_inputs(source.id)
|
|
438
440
|
if already:
|
|
439
441
|
original = len(values)
|
|
@@ -579,7 +581,7 @@ def _build_plan(
|
|
|
579
581
|
plan = CollectionPlan()
|
|
580
582
|
|
|
581
583
|
for source in ordered:
|
|
582
|
-
if incremental:
|
|
584
|
+
if incremental and source.refresh != "always":
|
|
583
585
|
parquet_path = get_parquet_path(base_path, source.id, today)
|
|
584
586
|
if parquet_path.exists():
|
|
585
587
|
continue
|
|
@@ -592,6 +594,7 @@ def _build_plan(
|
|
|
592
594
|
kind="from_file",
|
|
593
595
|
params={"file": source.from_file, "field": source.file_field},
|
|
594
596
|
estimated_requests=est,
|
|
597
|
+
refresh=source.refresh,
|
|
595
598
|
)
|
|
596
599
|
elif source.dependency is None:
|
|
597
600
|
plan.add_step(
|
|
@@ -600,6 +603,7 @@ def _build_plan(
|
|
|
600
603
|
kind="independent",
|
|
601
604
|
params=source.params,
|
|
602
605
|
estimated_requests=1,
|
|
606
|
+
refresh=source.refresh,
|
|
603
607
|
)
|
|
604
608
|
else:
|
|
605
609
|
est = _count_dependent_inputs(source, base_path, metadata)
|
|
@@ -609,6 +613,7 @@ def _build_plan(
|
|
|
609
613
|
kind="dependent",
|
|
610
614
|
dependency=source.dependency.from_source,
|
|
611
615
|
estimated_requests=est,
|
|
616
|
+
refresh=source.refresh,
|
|
612
617
|
)
|
|
613
618
|
|
|
614
619
|
return plan
|
|
@@ -665,11 +670,14 @@ def _print_plan(plan: CollectionPlan) -> dict[str, int]:
|
|
|
665
670
|
table.add_column("Est. Requests")
|
|
666
671
|
|
|
667
672
|
for i, step in enumerate(plan.steps, 1):
|
|
673
|
+
kind = step["kind"]
|
|
674
|
+
if step.get("refresh") == "always":
|
|
675
|
+
kind += " (refresh)"
|
|
668
676
|
table.add_row(
|
|
669
677
|
str(i),
|
|
670
678
|
step["source"],
|
|
671
679
|
step["endpoint"],
|
|
672
|
-
|
|
680
|
+
kind,
|
|
673
681
|
step.get("dependency") or "-",
|
|
674
682
|
str(step.get("estimated_requests") or "?"),
|
|
675
683
|
)
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
"""Compare two dataset snapshots to find added, removed, and changed records."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from datetime import date
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from anysite.dataset.errors import DatasetError
|
|
12
|
+
from anysite.dataset.storage import get_source_dir
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class DiffResult:
    """Outcome of diffing two snapshots of a single dataset source."""

    # Identifier of the source whose snapshots were compared.
    source_id: str
    # Date of the older snapshot.
    from_date: date
    # Date of the newer snapshot.
    to_date: date
    # Name of the field used to match records between snapshots.
    key: str
    # Records found only in the newer snapshot.
    added: list[dict[str, Any]] = field(default_factory=list)
    # Records found only in the older snapshot.
    removed: list[dict[str, Any]] = field(default_factory=list)
    # Records present in both snapshots whose compared fields differ.
    changed: list[dict[str, Any]] = field(default_factory=list)
    # Number of matched records without differences.
    unchanged_count: int = 0

    @property
    def has_changes(self) -> bool:
        """Return True when at least one record was added, removed, or changed."""
        return any((self.added, self.removed, self.changed))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class DatasetDiffer:
    """Compare two Parquet snapshots for a dataset source.

    Snapshots are looked up as ``<ISO date>.parquet`` files inside the
    directory returned by ``get_source_dir(base_path, source_id)``.
    """

    def __init__(self, base_path: Path) -> None:
        self.base_path = base_path

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Return *name* as a double-quoted SQL identifier (embedded quotes doubled)."""
        return '"' + name.replace('"', '""') + '"'

    @staticmethod
    def _sql_literal(value: str) -> str:
        """Return *value* as a single-quoted SQL string literal (embedded quotes doubled)."""
        return "'" + value.replace("'", "''") + "'"

    @staticmethod
    def _fetch_records(conn: Any, sql: str) -> list[dict[str, Any]]:
        """Run *sql* and return every row as a ``{column: value}`` dict."""
        result = conn.execute(sql)
        columns = [desc[0] for desc in result.description]
        return [dict(zip(columns, row, strict=False)) for row in result.fetchall()]

    def available_dates(self, source_id: str) -> list[date]:
        """List available snapshot dates for a source, sorted ascending.

        Files whose stem is not an ISO date are ignored. Returns an empty
        list when the source directory does not exist.
        """
        source_dir = get_source_dir(self.base_path, source_id)
        if not source_dir.exists():
            return []

        dates: list[date] = []
        for f in source_dir.glob("*.parquet"):
            try:
                dates.append(date.fromisoformat(f.stem))
            except ValueError:
                continue
        # Sort the parsed dates rather than the filenames so the order is
        # chronological whatever spelling fromisoformat() accepted.
        dates.sort()
        return dates

    def diff(
        self,
        source_id: str,
        key: str,
        *,
        from_date: date | None = None,
        to_date: date | None = None,
        fields: list[str] | None = None,
    ) -> DiffResult:
        """Compare two snapshots using DuckDB.

        Args:
            source_id: Source to compare.
            key: Field to match records by (e.g., ``_input_value``, ``urn``).
            from_date: Older snapshot date (default: second-to-last).
            to_date: Newer snapshot date (default: latest).
            fields: Only compare these fields (default: all).

        Returns:
            DiffResult with added, removed, changed lists.

        Raises:
            DatasetError: If a snapshot file is missing, fewer than two
                snapshots exist when a date must be defaulted, or the key
                field is absent from either snapshot.
        """
        from_date, to_date = self._resolve_dates(source_id, from_date, to_date)
        source_dir = get_source_dir(self.base_path, source_id)
        old_path = source_dir / f"{from_date.isoformat()}.parquet"
        new_path = source_dir / f"{to_date.isoformat()}.parquet"

        if not old_path.exists():
            raise DatasetError(
                f"No snapshot for {source_id} on {from_date.isoformat()}"
            )
        if not new_path.exists():
            raise DatasetError(
                f"No snapshot for {source_id} on {to_date.isoformat()}"
            )

        return self._diff_with_duckdb(
            source_id, key, old_path, new_path, from_date, to_date, fields
        )

    def _resolve_dates(
        self,
        source_id: str,
        from_date: date | None,
        to_date: date | None,
    ) -> tuple[date, date]:
        """Resolve from/to dates, defaulting to two most recent snapshots.

        When only one side is given, the other defaults to the closest
        available snapshot before ``to_date`` / after ``from_date``.
        """
        if from_date is not None and to_date is not None:
            return from_date, to_date

        dates = self.available_dates(source_id)
        if len(dates) < 2:
            raise DatasetError(
                f"Need at least 2 snapshots to diff, "
                f"found {len(dates)} for {source_id}"
            )

        if to_date is not None:
            # Find the date just before to_date
            earlier = [d for d in dates if d < to_date]
            if not earlier:
                raise DatasetError(
                    f"No snapshot before {to_date.isoformat()} for {source_id}"
                )
            return earlier[-1], to_date

        if from_date is not None:
            # Find the date just after from_date
            later = [d for d in dates if d > from_date]
            if not later:
                raise DatasetError(
                    f"No snapshot after {from_date.isoformat()} for {source_id}"
                )
            return from_date, later[0]

        # Both None — use two most recent
        return dates[-2], dates[-1]

    def _diff_with_duckdb(
        self,
        source_id: str,
        key: str,
        old_path: Path,
        new_path: Path,
        from_date: date,
        to_date: date,
        fields: list[str] | None,
    ) -> DiffResult:
        """Run diff queries using DuckDB.

        Records whose key is NULL cannot be matched to anything and are left
        out of the added/removed lists.

        Raises:
            DatasetError: If ``key`` is not a column of both snapshots.
        """
        import duckdb

        conn = duckdb.connect(":memory:")
        try:
            # Paths are embedded as escaped literals so a quote character in
            # a directory name cannot break (or alter) the statement.
            conn.execute(
                "CREATE VIEW _old AS SELECT * FROM "
                f"read_parquet({self._sql_literal(str(old_path))})"
            )
            conn.execute(
                "CREATE VIEW _new AS SELECT * FROM "
                f"read_parquet({self._sql_literal(str(new_path))})"
            )

            new_columns = [col[0] for col in conn.execute("DESCRIBE _new").fetchall()]
            old_columns = [col[0] for col in conn.execute("DESCRIBE _old").fetchall()]

            if key not in new_columns:
                raise DatasetError(
                    f"Key field '{key}' not found in {source_id}. "
                    f"Available: {', '.join(new_columns)}"
                )
            # The key is also used against the older snapshot; check it here
            # so schema drift is reported as a DatasetError instead of a raw
            # DuckDB error.
            if key not in old_columns:
                raise DatasetError(
                    f"Key field '{key}' not found in {source_id} "
                    f"snapshot {from_date.isoformat()}. "
                    f"Available: {', '.join(old_columns)}"
                )

            # Determine which fields to compare: only columns present in
            # BOTH snapshots can be compared side by side.
            new_set = set(new_columns)
            old_set = set(old_columns)
            candidates = fields if fields else [c for c in new_columns if c != key]
            compare_fields = [
                c for c in candidates if c in new_set and c in old_set
            ]

            quoted_key = self._quote_ident(key)

            # Added: in new but not in old. NOT EXISTS is used instead of
            # NOT IN because a single NULL key on the other side makes
            # NOT IN yield no rows at all.
            added_records = self._fetch_records(
                conn,
                f"SELECT * FROM _new n "
                f"WHERE n.{quoted_key} IS NOT NULL AND NOT EXISTS ("
                f"SELECT 1 FROM _old o WHERE o.{quoted_key} = n.{quoted_key})",
            )

            # Removed: in old but not in new
            removed_records = self._fetch_records(
                conn,
                f"SELECT * FROM _old o "
                f"WHERE o.{quoted_key} IS NOT NULL AND NOT EXISTS ("
                f"SELECT 1 FROM _new n WHERE n.{quoted_key} = o.{quoted_key})",
            )

            # Changed: matching key, different values
            changed_records = self._find_changed(
                conn, key, compare_fields, new_columns
            )

            # Count unchanged: matched pairs minus the changed ones
            total_matched = conn.execute(
                f"SELECT COUNT(*) FROM _new n "
                f"JOIN _old o ON n.{quoted_key} = o.{quoted_key}"
            ).fetchone()
            matched_count = total_matched[0] if total_matched else 0
            unchanged_count = matched_count - len(changed_records)

            return DiffResult(
                source_id=source_id,
                from_date=from_date,
                to_date=to_date,
                key=key,
                added=added_records,
                removed=removed_records,
                changed=changed_records,
                unchanged_count=unchanged_count,
            )
        finally:
            conn.close()

    def _find_changed(
        self,
        conn: Any,
        key: str,
        compare_fields: list[str],
        all_columns: list[str],
    ) -> list[dict[str, Any]]:
        """Find records that exist in both snapshots but have different values.

        Args:
            conn: Open DuckDB connection exposing the ``_old`` and ``_new`` views.
            key: Column used to join the two views.
            compare_fields: Columns whose values are compared; they must
                exist in both views.
            all_columns: Columns of ``_new`` to include in each record.

        Returns:
            One dict per differing pair: the new values, an ``<field>__old``
            entry per compared field, and ``_changed_fields`` listing the
            compared fields that differ according to ``_values_differ``.
        """
        if not compare_fields:
            return []

        quoted_key = self._quote_ident(key)

        # WHERE clause: any compared field differs (NULL-safe comparison)
        where_clause = " OR ".join(
            f"n.{self._quote_ident(col)} IS DISTINCT FROM o.{self._quote_ident(col)}"
            for col in compare_fields
        )

        # Select new values + old values for compared fields
        select_parts = [f"n.{quoted_key}"]
        select_parts.extend(
            f"n.{self._quote_ident(col)}" for col in all_columns if col != key
        )
        select_parts.extend(
            f"o.{self._quote_ident(col)} AS {self._quote_ident(col + '__old')}"
            for col in compare_fields
        )
        select_clause = ", ".join(select_parts)

        records = self._fetch_records(
            conn,
            f"SELECT {select_clause} FROM _new n "
            f"JOIN _old o ON n.{quoted_key} = o.{quoted_key} "
            f"WHERE {where_clause}",
        )

        # Add _changed_fields metadata to each record. The list can be empty
        # when SQL sees a difference that _values_differ normalizes away.
        for record in records:
            record["_changed_fields"] = [
                col
                for col in compare_fields
                if _values_differ(record.get(col), record.get(f"{col}__old"))
            ]

        return records
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _values_differ(a: Any, b: Any) -> bool:
|
|
275
|
+
"""Compare two values, treating JSON strings as equivalent to their parsed form."""
|
|
276
|
+
if a == b:
|
|
277
|
+
return False
|
|
278
|
+
# Handle JSON string comparison
|
|
279
|
+
if isinstance(a, str) and isinstance(b, str):
|
|
280
|
+
try:
|
|
281
|
+
return json.loads(a) != json.loads(b)
|
|
282
|
+
except (json.JSONDecodeError, ValueError):
|
|
283
|
+
pass
|
|
284
|
+
return True
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def format_diff_table(result: DiffResult) -> list[dict[str, Any]]:
    """Flatten a DiffResult into display rows.

    Every row starts with a ``_diff`` column set to ``added``, ``removed``,
    or ``changed``. Added and removed records are copied as-is. For changed
    records the ``_changed_fields`` entry and all ``*__old`` columns are
    dropped, and each field listed in ``_changed_fields`` is rendered as
    ``old → new``.
    """
    table: list[dict[str, Any]] = [
        {"_diff": "added", **rec} for rec in result.added
    ]
    table.extend({"_diff": "removed", **rec} for rec in result.removed)

    for rec in result.changed:
        modified = rec.get("_changed_fields", [])
        entry: dict[str, Any] = {"_diff": "changed"}
        for name, current in rec.items():
            # Bookkeeping columns never become table columns.
            if name == "_changed_fields" or name.endswith("__old"):
                continue
            if name not in modified:
                entry[name] = current
                continue
            # Modified field: show the previous and the current value together.
            previous = rec.get(f"{name}__old")
            entry[name] = f"{_format_val(previous)} → {_format_val(current)}"
        table.append(entry)

    return table
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def format_diff_records(result: DiffResult) -> list[dict[str, Any]]:
    """Flatten a DiffResult into raw records for JSON/CSV output.

    Every record starts with a ``_diff`` column (``added``, ``removed``, or
    ``changed``). Changed records keep their current values and their
    ``field__old`` columns; only the ``_changed_fields`` entry is dropped.
    """
    records: list[dict[str, Any]] = [
        {"_diff": "added", **rec} for rec in result.added
    ]
    records += [{"_diff": "removed", **rec} for rec in result.removed]
    records += [
        {
            "_diff": "changed",
            **{name: val for name, val in rec.items() if name != "_changed_fields"},
        }
        for rec in result.changed
    ]
    return records
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def _format_val(v: Any) -> str:
|
|
349
|
+
"""Format a value for display, truncating long strings."""
|
|
350
|
+
if v is None:
|
|
351
|
+
return "null"
|
|
352
|
+
s = str(v)
|
|
353
|
+
if len(s) > 40:
|
|
354
|
+
return s[:37] + "..."
|
|
355
|
+
return s
|
anysite/dataset/models.py
CHANGED
|
@@ -129,6 +129,10 @@ class DatasetSource(BaseModel):
|
|
|
129
129
|
default_factory=list,
|
|
130
130
|
description="Export destinations (file/webhook) applied after Parquet write",
|
|
131
131
|
)
|
|
132
|
+
refresh: Literal["auto", "always"] = Field(
|
|
133
|
+
default="auto",
|
|
134
|
+
description="Refresh mode: 'auto' uses incremental caching, 'always' re-collects every run",
|
|
135
|
+
)
|
|
132
136
|
|
|
133
137
|
@field_validator("endpoint")
|
|
134
138
|
@classmethod
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: anysite-cli
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: CLI for Anysite API - web data extraction for humans and AI agents
|
|
5
5
|
Project-URL: Homepage, https://anysite.io
|
|
6
6
|
Project-URL: Documentation, https://docs.anysite.io/cli
|
|
@@ -272,6 +272,7 @@ sources:
|
|
|
272
272
|
- type: company
|
|
273
273
|
value: "{value}"
|
|
274
274
|
count: 5
|
|
275
|
+
refresh: always # Re-collect every run with --incremental
|
|
275
276
|
db_load:
|
|
276
277
|
fields: [name, url, headline]
|
|
277
278
|
|
|
@@ -327,6 +328,9 @@ anysite dataset logs my-dataset --run 42
|
|
|
327
328
|
# Generate cron/systemd schedule
|
|
328
329
|
anysite dataset schedule dataset.yaml --incremental --load-db pg
|
|
329
330
|
|
|
331
|
+
# Compare snapshots (diff two collection dates)
|
|
332
|
+
anysite dataset diff dataset.yaml --source employees --key _input_value
|
|
333
|
+
|
|
330
334
|
# Reset incremental state
|
|
331
335
|
anysite dataset reset-cursor dataset.yaml
|
|
332
336
|
```
|
|
@@ -19,13 +19,14 @@ anysite/config/paths.py,sha256=EmHJD8wlf4Q9IUn8Gp1JQ8Z3ffrIYAt5iHRyImQOf5I,1087
|
|
|
19
19
|
anysite/config/settings.py,sha256=Hc0j_aCCtkJeL4nHw-EFyfJ8WEDk57G08iNUFquUhpM,5235
|
|
20
20
|
anysite/dataset/__init__.py,sha256=J0sKQkGwVOPtvp6pka7LcdeUEADvjWRs71yRuROzJxI,847
|
|
21
21
|
anysite/dataset/analyzer.py,sha256=8dsPW32SbSaUTy1F0NIed1U45wjiMgQeJ2iWX7hBxRQ,9245
|
|
22
|
-
anysite/dataset/cli.py,sha256=
|
|
23
|
-
anysite/dataset/collector.py,sha256=
|
|
22
|
+
anysite/dataset/cli.py,sha256=zaCo0kKeA1KNU7EZgW4WwxrP07xuKayPlolfUnCSoYI,22801
|
|
23
|
+
anysite/dataset/collector.py,sha256=6CfJt8fKZZ2xvZWJ7jwnx0V9BnjoJxmBZkm8xWQiU54,23840
|
|
24
24
|
anysite/dataset/db_loader.py,sha256=nlMJrDJiGBX5H1StcjsontSxLXbsFe4rwOEnDehzpk8,8443
|
|
25
|
+
anysite/dataset/differ.py,sha256=hbUwoS73syTkrj0VC0gaJzuB0pVCoQXQMbsNXtpsig8,11634
|
|
25
26
|
anysite/dataset/errors.py,sha256=r8cZXoIzSeTGCWpeYjntnN0AduCu74YZyWs3sFu17J4,914
|
|
26
27
|
anysite/dataset/exporters.py,sha256=mA2FYbYJbHfrwkXbHDu4g5qPG_JJKnkVciXFKPkF1Vw,3708
|
|
27
28
|
anysite/dataset/history.py,sha256=avFs0ADlM7Hr-ttqC1FfjJiQxvQP20sScM7ZoY4lvU0,5471
|
|
28
|
-
anysite/dataset/models.py,sha256=
|
|
29
|
+
anysite/dataset/models.py,sha256=_f1cg9A4FlQwWGpg-s0b9q5WMlaIRN-ENlpU9CE6mrQ,9695
|
|
29
30
|
anysite/dataset/notifications.py,sha256=ORzo9XOgOxzLb7rk4pevlKPB_Taf-jejlrtmO4Zgl2c,2367
|
|
30
31
|
anysite/dataset/scheduler.py,sha256=zpbA5tRUQZXr-9lZnG58dvE3E7ZBlAd-U-PTXExe9f0,3339
|
|
31
32
|
anysite/dataset/storage.py,sha256=d03goKLI5NWKJowHwCgGqQkcVTO1NctPxMu-Xu-tru4,5326
|
|
@@ -57,8 +58,8 @@ anysite/streaming/writer.py,sha256=HfMsC4umUdJuNIAPK57YAxEGyTwUmy-zNrqFkwY6aew,4
|
|
|
57
58
|
anysite/utils/__init__.py,sha256=7SnbxpxKENK-2ecUL5NfnZ9okGI7COKYw4WF46172HM,23
|
|
58
59
|
anysite/utils/fields.py,sha256=bSrHadzNmabL4qubqhXXZoWb_P8KA-3S7_FLVT8nGBc,7410
|
|
59
60
|
anysite/utils/retry.py,sha256=89TbXvavi5t22P2mTYCLAS6SSZoW65gQ0nnYNbYAF0M,2684
|
|
60
|
-
anysite_cli-0.1.
|
|
61
|
-
anysite_cli-0.1.
|
|
62
|
-
anysite_cli-0.1.
|
|
63
|
-
anysite_cli-0.1.
|
|
64
|
-
anysite_cli-0.1.
|
|
61
|
+
anysite_cli-0.1.3.dist-info/METADATA,sha256=lD_AF5pq5ayHerMVMMWTTkgccwWEsKLBGCwvPfZ5y_Y,11781
|
|
62
|
+
anysite_cli-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
63
|
+
anysite_cli-0.1.3.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
|
|
64
|
+
anysite_cli-0.1.3.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
|
|
65
|
+
anysite_cli-0.1.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|