anysite-cli 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/CLAUDE.md +5 -2
  2. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/PKG-INFO +5 -1
  3. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/README.md +4 -0
  4. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/pyproject.toml +1 -1
  5. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/skills/anysite-cli/SKILL.md +12 -1
  6. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/skills/anysite-cli/references/dataset-guide.md +53 -0
  7. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/cli.py +120 -0
  8. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/collector.py +16 -8
  9. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/db_loader.py +162 -23
  10. anysite_cli-0.1.4/src/anysite/dataset/differ.py +496 -0
  11. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/models.py +5 -0
  12. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_collector.py +191 -0
  13. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_db_loader.py +292 -0
  14. anysite_cli-0.1.4/tests/test_dataset/test_differ.py +483 -0
  15. anysite_cli-0.1.4/tests/test_dataset/test_integration_csv.py +291 -0
  16. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/.claude/settings.local.json +0 -0
  17. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/.gitignore +0 -0
  18. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/LICENSE +0 -0
  19. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/skills/anysite-cli/references/api-reference.md +0 -0
  20. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/__init__.py +0 -0
  21. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/__main__.py +0 -0
  22. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/api/__init__.py +0 -0
  23. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/api/client.py +0 -0
  24. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/api/errors.py +0 -0
  25. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/api/schemas.py +0 -0
  26. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/batch/__init__.py +0 -0
  27. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/batch/executor.py +0 -0
  28. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/batch/input.py +0 -0
  29. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/batch/rate_limiter.py +0 -0
  30. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/cli/__init__.py +0 -0
  31. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/cli/config.py +0 -0
  32. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/cli/executor.py +0 -0
  33. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/cli/options.py +0 -0
  34. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/config/__init__.py +0 -0
  35. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/config/paths.py +0 -0
  36. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/config/settings.py +0 -0
  37. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/__init__.py +0 -0
  38. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/analyzer.py +0 -0
  39. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/errors.py +0 -0
  40. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/exporters.py +0 -0
  41. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/history.py +0 -0
  42. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/notifications.py +0 -0
  43. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/scheduler.py +0 -0
  44. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/storage.py +0 -0
  45. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/transformer.py +0 -0
  46. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/__init__.py +0 -0
  47. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/adapters/__init__.py +0 -0
  48. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/adapters/base.py +0 -0
  49. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/adapters/postgres.py +0 -0
  50. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/adapters/sqlite.py +0 -0
  51. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/cli.py +0 -0
  52. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/config.py +0 -0
  53. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/manager.py +0 -0
  54. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/operations/__init__.py +0 -0
  55. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/operations/insert.py +0 -0
  56. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/operations/query.py +0 -0
  57. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/schema/__init__.py +0 -0
  58. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/schema/inference.py +0 -0
  59. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/schema/types.py +0 -0
  60. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/utils/__init__.py +0 -0
  61. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/utils/sanitize.py +0 -0
  62. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/main.py +0 -0
  63. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/models/__init__.py +0 -0
  64. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/output/__init__.py +0 -0
  65. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/output/console.py +0 -0
  66. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/output/formatters.py +0 -0
  67. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/output/templates.py +0 -0
  68. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/py.typed +0 -0
  69. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/streaming/__init__.py +0 -0
  70. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/streaming/progress.py +0 -0
  71. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/streaming/writer.py +0 -0
  72. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/utils/__init__.py +0 -0
  73. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/utils/fields.py +0 -0
  74. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/utils/retry.py +0 -0
  75. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/enriched_partners_sample_10.csv +0 -0
  76. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/linkedin-partners/company_aliases.txt +0 -0
  77. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/linkedin-partners/dataset.yaml +0 -0
  78. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/partners-deep/dataset.yaml +0 -0
  79. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/partners-intel/dataset.yaml +0 -0
  80. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/partners-linkedin/company_aliases.txt +0 -0
  81. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/partners-linkedin/dataset.yaml +0 -0
  82. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/partners-pipeline/dataset.yaml +0 -0
  83. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/__init__.py +0 -0
  84. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/conftest.py +0 -0
  85. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_api/__init__.py +0 -0
  86. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_batch/__init__.py +0 -0
  87. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_batch/test_executor.py +0 -0
  88. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_batch/test_input.py +0 -0
  89. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_batch/test_rate_limiter.py +0 -0
  90. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_cli/__init__.py +0 -0
  91. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_cli/test_main.py +0 -0
  92. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/__init__.py +0 -0
  93. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_analyzer.py +0 -0
  94. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_exporters.py +0 -0
  95. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_history.py +0 -0
  96. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_models.py +0 -0
  97. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_notifications.py +0 -0
  98. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_scheduler.py +0 -0
  99. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_storage.py +0 -0
  100. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_transformer.py +0 -0
  101. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/__init__.py +0 -0
  102. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_cli.py +0 -0
  103. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_config.py +0 -0
  104. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_inference.py +0 -0
  105. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_insert.py +0 -0
  106. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_manager.py +0 -0
  107. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_postgres_adapter.py +0 -0
  108. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_sanitize.py +0 -0
  109. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_sqlite_adapter.py +0 -0
  110. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_output/__init__.py +0 -0
  111. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_output/test_formatters.py +0 -0
  112. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_output/test_templates.py +0 -0
  113. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_streaming/__init__.py +0 -0
  114. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_streaming/test_progress.py +0 -0
  115. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_streaming/test_writer.py +0 -0
  116. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_utils/__init__.py +0 -0
  117. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_utils/test_fields.py +0 -0
  118. {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_utils/test_retry.py +0 -0
@@ -53,6 +53,8 @@ anysite dataset history my-dataset
53
53
  anysite dataset logs my-dataset --run 42
54
54
  anysite dataset schedule dataset.yaml --incremental --load-db pg
55
55
  anysite dataset schedule dataset.yaml --systemd --load-db pg
56
+ anysite dataset diff dataset.yaml --source profiles --key _input_value
57
+ anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
56
58
  anysite dataset reset-cursor dataset.yaml
57
59
  anysite dataset reset-cursor dataset.yaml --source profiles
58
60
 
@@ -102,7 +104,8 @@ anysite db upsert mydb --table users --conflict-columns id --stdin
102
104
  - `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
103
105
  - `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
104
106
  - `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
105
- - `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `history`, `logs`, `schedule`, `reset-cursor`
107
+ - `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters
108
+ - `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `diff`, `history`, `logs`, `schedule`, `reset-cursor`
106
109
  - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
107
110
  - `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
108
111
  - `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
@@ -202,5 +205,5 @@ Tests are in `tests/` with subdirectories mirroring `src/anysite/`:
202
205
  - `test_streaming/` — Progress and writer
203
206
  - `test_output/` — Formatters and templates
204
207
  - `test_utils/` — Field selection and retry
205
- - `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications
208
+ - `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications, differ
206
209
  - `test_db/` — Database adapters, schema inference, connection manager, operations
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: anysite-cli
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: CLI for Anysite API - web data extraction for humans and AI agents
5
5
  Project-URL: Homepage, https://anysite.io
6
6
  Project-URL: Documentation, https://docs.anysite.io/cli
@@ -272,6 +272,7 @@ sources:
272
272
  - type: company
273
273
  value: "{value}"
274
274
  count: 5
275
+ refresh: always # Re-collect every run even with --incremental
275
276
  db_load:
276
277
  fields: [name, url, headline]
277
278
 
@@ -327,6 +328,9 @@ anysite dataset logs my-dataset --run 42
327
328
  # Generate cron/systemd schedule
328
329
  anysite dataset schedule dataset.yaml --incremental --load-db pg
329
330
 
331
+ # Compare snapshots (diff two collection dates)
332
+ anysite dataset diff dataset.yaml --source employees --key _input_value
333
+
330
334
  # Reset incremental state
331
335
  anysite dataset reset-cursor dataset.yaml
332
336
  ```
@@ -209,6 +209,7 @@ sources:
209
209
  - type: company
210
210
  value: "{value}"
211
211
  count: 5
212
+ refresh: always # Re-collect every run even with --incremental
212
213
  db_load:
213
214
  fields: [name, url, headline]
214
215
 
@@ -264,6 +265,9 @@ anysite dataset logs my-dataset --run 42
264
265
  # Generate cron/systemd schedule
265
266
  anysite dataset schedule dataset.yaml --incremental --load-db pg
266
267
 
268
+ # Compare snapshots (diff two collection dates)
269
+ anysite dataset diff dataset.yaml --source employees --key _input_value
270
+
267
271
  # Reset incremental state
268
272
  anysite dataset reset-cursor dataset.yaml
269
273
  ```
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "anysite-cli"
7
- version = "0.1.2"
7
+ version = "0.1.4"
8
8
  description = "CLI for Anysite API - web data extraction for humans and AI agents"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -133,6 +133,7 @@ sources:
133
133
  count: 5
134
134
  parallel: 3
135
135
  on_error: skip
136
+ refresh: always # Re-collect every run even with --incremental
136
137
 
137
138
  storage:
138
139
  format: parquet
@@ -235,7 +236,17 @@ anysite api /api/linkedin/user user=satyanadella -q --format jsonl \
235
236
  | anysite db insert pg --table profiles --stdin --auto-create
236
237
  ```
237
238
 
238
- ### Step 6: History, Scheduling, and Notifications
239
+ ### Step 6: Compare Snapshots
240
+ ```bash
241
+ # Diff two most recent snapshots
242
+ anysite dataset diff dataset.yaml --source employees --key _input_value
243
+
244
+ # Diff specific dates, compare only certain fields
245
+ anysite dataset diff dataset.yaml --source employees --key _input_value \
246
+ --from 2026-01-30 --to 2026-02-01 --fields "name,headline"
247
+ ```
248
+
249
+ ### Step 7: History, Scheduling, and Notifications
239
250
  ```bash
240
251
  # View run history
241
252
  anysite dataset history my-dataset
@@ -24,6 +24,7 @@ sources:
24
24
  parallel: 3 # Concurrent requests
25
25
  rate_limit: "10/s" # Rate limiting
26
26
  on_error: skip # stop or skip
27
+ refresh: always # auto (default) or always; always re-collects every run, even with --incremental
27
28
  transform: # Post-collection transform (for exports only)
28
29
  filter: '.count > 10' # Safe filter expression
29
30
  fields: [name, url] # Field selection with aliases
@@ -169,6 +170,25 @@ With `--incremental`:
169
170
  2. Dependent/from_file sources: skips individual input values already in `metadata.json`
170
171
  3. New values are still collected and tracked
171
172
 
173
+ ### Refresh Mode
174
+
175
+ Per-source `refresh` field controls behavior with `--incremental`:
176
+
177
+ ```yaml
178
+ - id: posts
179
+ endpoint: /api/linkedin/user/posts
180
+ dependency: { from_source: profiles, field: urn.value }
181
+ input_key: user
182
+ refresh: always # Re-collect every run even with --incremental
183
+ ```
184
+
185
+ | Setting | `--incremental` | No flag |
186
+ |---------|----------------|---------|
187
+ | `refresh: auto` (default) | Skip collected inputs | Collect all |
188
+ | `refresh: always` | Collect all (ignore cache) | Collect all |
189
+
190
+ Use `refresh: always` for sources with frequently changing data (e.g., posts, activity feeds) where you want fresh snapshots each run while still caching stable parent data.
191
+
172
192
  ### Storage Layout
173
193
 
174
194
  ```
@@ -428,6 +448,39 @@ Payload: `{event: "complete"|"failure", dataset, timestamp, record_count, source
428
448
 
429
449
  ---
430
450
 
451
+ ## Comparing Snapshots (Diff)
452
+
453
+ Compare two collection snapshots to find added, removed, and changed records.
454
+
455
+ ```bash
456
+ # Compare two most recent snapshots (auto-detect dates)
457
+ anysite dataset diff dataset.yaml --source profiles --key _input_value
458
+
459
+ # Compare specific dates
460
+ anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
461
+
462
+ # Only compare specific fields
463
+ anysite dataset diff dataset.yaml --source profiles --key urn --fields "name,headline,follower_count"
464
+
465
+ # Output as JSON/CSV
466
+ anysite dataset diff dataset.yaml --source profiles --key urn --format json --output diff.json
467
+ ```
468
+
469
+ **Options:**
470
+ - `--source, -s` (required) — source to compare
471
+ - `--key, -k` (required) — field to match records by (e.g., `_input_value`, `urn`)
472
+ - `--from` / `--to` — snapshot dates (default: two most recent)
473
+ - `--fields, -f` — only compare these fields
474
+ - `--format` — output format (table, json, jsonl, csv)
475
+ - `--output, -o` — write to file
476
+
477
+ **Output** shows summary counts and a table of changes:
478
+ - **added** — records in the new snapshot but not the old
479
+ - **removed** — records in the old snapshot but not the new
480
+ - **changed** — records with the same key but different values (shows `old → new`)
481
+
482
+ ---
483
+
431
484
  ## Reset Incremental State
432
485
 
433
486
  Clear collected input tracking to force re-collection.
@@ -357,6 +357,10 @@ def load_db(
357
357
  bool,
358
358
  typer.Option("--quiet", "-q", help="Suppress progress output"),
359
359
  ] = False,
360
+ snapshot: Annotated[
361
+ str | None,
362
+ typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
363
+ ] = None,
360
364
  ) -> None:
361
365
  """Load collected Parquet data into a relational database with FK linking."""
362
366
  config = _load_config(config_path)
@@ -379,6 +383,7 @@ def load_db(
379
383
  source_filter=source,
380
384
  drop_existing=drop_existing,
381
385
  dry_run=dry_run,
386
+ snapshot=snapshot,
382
387
  )
383
388
  except Exception as e:
384
389
  typer.echo(f"Load error: {e}", err=True)
@@ -413,6 +418,121 @@ def load_db(
413
418
  )
414
419
 
415
420
 
421
+ @app.command("diff")
422
+ def diff_cmd(
423
+ config_path: Annotated[
424
+ Path,
425
+ typer.Argument(help="Path to dataset.yaml"),
426
+ ],
427
+ source: Annotated[
428
+ str,
429
+ typer.Option("--source", "-s", help="Source to compare"),
430
+ ],
431
+ key: Annotated[
432
+ str,
433
+ typer.Option("--key", "-k", help="Field to match records by (e.g., _input_value, urn)"),
434
+ ],
435
+ from_date: Annotated[
436
+ str | None,
437
+ typer.Option("--from", help="Older snapshot date (YYYY-MM-DD)"),
438
+ ] = None,
439
+ to_date: Annotated[
440
+ str | None,
441
+ typer.Option("--to", help="Newer snapshot date (YYYY-MM-DD)"),
442
+ ] = None,
443
+ fields: Annotated[
444
+ str | None,
445
+ typer.Option("--fields", "-f", help="Only compare these fields (comma-separated)"),
446
+ ] = None,
447
+ format: Annotated[
448
+ str,
449
+ typer.Option("--format", help="Output format: table, json, jsonl, csv"),
450
+ ] = "table",
451
+ output: Annotated[
452
+ Path | None,
453
+ typer.Option("--output", "-o", help="Write output to file"),
454
+ ] = None,
455
+ quiet: Annotated[
456
+ bool,
457
+ typer.Option("--quiet", "-q", help="Suppress summary, only output data"),
458
+ ] = False,
459
+ ) -> None:
460
+ """Compare two snapshots of a source to show added, removed, and changed records."""
461
+ from datetime import date as date_type
462
+
463
+ from anysite.dataset.differ import (
464
+ DatasetDiffer,
465
+ format_diff_records,
466
+ format_diff_table,
467
+ )
468
+
469
+ config = _load_config(config_path)
470
+
471
+ # Validate source exists
472
+ src = config.get_source(source)
473
+ if src is None:
474
+ typer.echo(f"Error: source '{source}' not found in dataset", err=True)
475
+ raise typer.Exit(1)
476
+
477
+ differ = DatasetDiffer(config.storage_path())
478
+
479
+ # Parse dates
480
+ parsed_from = None
481
+ parsed_to = None
482
+ try:
483
+ if from_date:
484
+ parsed_from = date_type.fromisoformat(from_date)
485
+ if to_date:
486
+ parsed_to = date_type.fromisoformat(to_date)
487
+ except ValueError as e:
488
+ typer.echo(f"Error: invalid date format: {e}", err=True)
489
+ raise typer.Exit(1) from None
490
+
491
+ # Parse fields
492
+ field_list = None
493
+ if fields:
494
+ field_list = [f.strip() for f in fields.split(",") if f.strip()]
495
+
496
+ try:
497
+ result = differ.diff(
498
+ source,
499
+ key,
500
+ from_date=parsed_from,
501
+ to_date=parsed_to,
502
+ fields=field_list,
503
+ )
504
+ except DatasetError as e:
505
+ typer.echo(f"Error: {e}", err=True)
506
+ raise typer.Exit(1) from None
507
+
508
+ # Print summary unless quiet
509
+ if not quiet:
510
+ console = Console()
511
+ console.print(
512
+ f"\n[bold]Diff: {source}[/bold] "
513
+ f"({result.from_date.isoformat()} → {result.to_date.isoformat()})\n"
514
+ )
515
+ console.print(f" [green]Added:[/green] {len(result.added)}")
516
+ console.print(f" [red]Removed:[/red] {len(result.removed)}")
517
+ console.print(f" [yellow]Changed:[/yellow] {len(result.changed)}")
518
+ console.print(f" Unchanged: {result.unchanged_count}")
519
+ console.print()
520
+
521
+ if not result.has_changes:
522
+ if not quiet:
523
+ Console().print("[dim]No changes detected.[/dim]")
524
+ return
525
+
526
+ # Format and output
527
+ rows = (
528
+ format_diff_table(result, output_fields=field_list)
529
+ if format == "table"
530
+ else format_diff_records(result, output_fields=field_list)
531
+ )
532
+
533
+ _output_results(rows, format, output)
534
+
535
+
416
536
  @app.command("history")
417
537
  def history(
418
538
  name: Annotated[
@@ -43,6 +43,7 @@ class CollectionPlan:
43
43
  params: dict[str, Any] | None = None,
44
44
  dependency: str | None = None,
45
45
  estimated_requests: int | None = None,
46
+ refresh: str = "auto",
46
47
  ) -> None:
47
48
  self.steps.append({
48
49
  "source": source_id,
@@ -51,6 +52,7 @@ class CollectionPlan:
51
52
  "params": params or {},
52
53
  "dependency": dependency,
53
54
  "estimated_requests": estimated_requests,
55
+ "refresh": refresh,
54
56
  })
55
57
 
56
58
 
@@ -116,8 +118,8 @@ async def collect_dataset(
116
118
 
117
119
  try:
118
120
  for source in ordered:
119
- # Check incremental skip
120
- if incremental:
121
+ # Check incremental skip (refresh: always bypasses this)
122
+ if incremental and source.refresh != "always":
121
123
  parquet_path = get_parquet_path(base_path, source.id, today)
122
124
  if parquet_path.exists():
123
125
  if not quiet:
@@ -276,8 +278,8 @@ async def _collect_from_file(
276
278
  print_warning(f"No values extracted from {file_path}")
277
279
  return []
278
280
 
279
- # Filter already-collected inputs in incremental mode
280
- if incremental and metadata:
281
+ # Filter already-collected inputs in incremental mode (refresh: always bypasses)
282
+ if incremental and source.refresh != "always" and metadata:
281
283
  already = metadata.get_collected_inputs(source.id)
282
284
  if already:
283
285
  original = len(values)
@@ -432,8 +434,8 @@ async def _collect_dependent(
432
434
  f"Source {source.id} has a dependency but no input_key defined"
433
435
  )
434
436
 
435
- # Filter already-collected inputs in incremental mode
436
- if incremental and metadata:
437
+ # Filter already-collected inputs in incremental mode (refresh: always bypasses)
438
+ if incremental and source.refresh != "always" and metadata:
437
439
  already = metadata.get_collected_inputs(source.id)
438
440
  if already:
439
441
  original = len(values)
@@ -579,7 +581,7 @@ def _build_plan(
579
581
  plan = CollectionPlan()
580
582
 
581
583
  for source in ordered:
582
- if incremental:
584
+ if incremental and source.refresh != "always":
583
585
  parquet_path = get_parquet_path(base_path, source.id, today)
584
586
  if parquet_path.exists():
585
587
  continue
@@ -592,6 +594,7 @@ def _build_plan(
592
594
  kind="from_file",
593
595
  params={"file": source.from_file, "field": source.file_field},
594
596
  estimated_requests=est,
597
+ refresh=source.refresh,
595
598
  )
596
599
  elif source.dependency is None:
597
600
  plan.add_step(
@@ -600,6 +603,7 @@ def _build_plan(
600
603
  kind="independent",
601
604
  params=source.params,
602
605
  estimated_requests=1,
606
+ refresh=source.refresh,
603
607
  )
604
608
  else:
605
609
  est = _count_dependent_inputs(source, base_path, metadata)
@@ -609,6 +613,7 @@ def _build_plan(
609
613
  kind="dependent",
610
614
  dependency=source.dependency.from_source,
611
615
  estimated_requests=est,
616
+ refresh=source.refresh,
612
617
  )
613
618
 
614
619
  return plan
@@ -665,11 +670,14 @@ def _print_plan(plan: CollectionPlan) -> dict[str, int]:
665
670
  table.add_column("Est. Requests")
666
671
 
667
672
  for i, step in enumerate(plan.steps, 1):
673
+ kind = step["kind"]
674
+ if step.get("refresh") == "always":
675
+ kind += " (refresh)"
668
676
  table.add_row(
669
677
  str(i),
670
678
  step["source"],
671
679
  step["endpoint"],
672
- step["kind"],
680
+ kind,
673
681
  step.get("dependency") or "-",
674
682
  str(step.get("estimated_requests") or "?"),
675
683
  )