anysite-cli 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/CLAUDE.md +5 -2
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/PKG-INFO +5 -1
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/README.md +4 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/pyproject.toml +1 -1
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/skills/anysite-cli/SKILL.md +12 -1
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/skills/anysite-cli/references/dataset-guide.md +53 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/cli.py +120 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/collector.py +16 -8
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/db_loader.py +162 -23
- anysite_cli-0.1.4/src/anysite/dataset/differ.py +496 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/models.py +5 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_collector.py +191 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_db_loader.py +292 -0
- anysite_cli-0.1.4/tests/test_dataset/test_differ.py +483 -0
- anysite_cli-0.1.4/tests/test_dataset/test_integration_csv.py +291 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/.claude/settings.local.json +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/.gitignore +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/LICENSE +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/skills/anysite-cli/references/api-reference.md +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/__main__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/api/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/api/client.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/api/errors.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/api/schemas.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/batch/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/batch/executor.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/batch/input.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/batch/rate_limiter.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/cli/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/cli/config.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/cli/executor.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/cli/options.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/config/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/config/paths.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/config/settings.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/analyzer.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/errors.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/exporters.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/history.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/notifications.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/scheduler.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/storage.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/transformer.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/adapters/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/adapters/base.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/adapters/postgres.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/adapters/sqlite.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/cli.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/config.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/manager.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/operations/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/operations/insert.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/operations/query.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/schema/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/schema/inference.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/schema/types.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/utils/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/db/utils/sanitize.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/main.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/models/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/output/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/output/console.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/output/formatters.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/output/templates.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/py.typed +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/streaming/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/streaming/progress.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/streaming/writer.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/utils/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/utils/fields.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/utils/retry.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/enriched_partners_sample_10.csv +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/linkedin-partners/company_aliases.txt +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/linkedin-partners/dataset.yaml +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/partners-deep/dataset.yaml +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/partners-intel/dataset.yaml +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/partners-linkedin/company_aliases.txt +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/partners-linkedin/dataset.yaml +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/test_data/partners-pipeline/dataset.yaml +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/conftest.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_api/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_batch/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_batch/test_executor.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_batch/test_input.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_batch/test_rate_limiter.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_cli/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_cli/test_main.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_analyzer.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_exporters.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_history.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_models.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_notifications.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_scheduler.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_storage.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_dataset/test_transformer.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_cli.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_config.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_inference.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_insert.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_manager.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_postgres_adapter.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_sanitize.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_db/test_sqlite_adapter.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_output/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_output/test_formatters.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_output/test_templates.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_streaming/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_streaming/test_progress.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_streaming/test_writer.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_utils/__init__.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_utils/test_fields.py +0 -0
- {anysite_cli-0.1.2 → anysite_cli-0.1.4}/tests/test_utils/test_retry.py +0 -0
|
@@ -53,6 +53,8 @@ anysite dataset history my-dataset
|
|
|
53
53
|
anysite dataset logs my-dataset --run 42
|
|
54
54
|
anysite dataset schedule dataset.yaml --incremental --load-db pg
|
|
55
55
|
anysite dataset schedule dataset.yaml --systemd --load-db pg
|
|
56
|
+
anysite dataset diff dataset.yaml --source profiles --key _input_value
|
|
57
|
+
anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
|
|
56
58
|
anysite dataset reset-cursor dataset.yaml
|
|
57
59
|
anysite dataset reset-cursor dataset.yaml --source profiles
|
|
58
60
|
|
|
@@ -102,7 +104,8 @@ anysite db upsert mydb --table users --conflict-columns id --stdin
|
|
|
102
104
|
- `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
|
|
103
105
|
- `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
|
|
104
106
|
- `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
|
|
105
|
-
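- `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `history`, `logs`, `schedule`, `reset-cursor`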
|
|
107
|
+
- `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters
|
|
108
|
+
- `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `diff`, `history`, `logs`, `schedule`, `reset-cursor`
|
|
106
109
|
- `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
|
|
107
110
|
- `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
|
|
108
111
|
- `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
|
|
@@ -202,5 +205,5 @@ Tests are in `tests/` with subdirectories mirroring `src/anysite/`:
|
|
|
202
205
|
- `test_streaming/` — Progress and writer
|
|
203
206
|
- `test_output/` — Formatters and templates
|
|
204
207
|
- `test_utils/` — Field selection and retry
|
|
205
|
-
- `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications
|
|
208
|
+
- `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications, differ
|
|
206
209
|
- `test_db/` — Database adapters, schema inference, connection manager, operations
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: anysite-cli
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: CLI for Anysite API - web data extraction for humans and AI agents
|
|
5
5
|
Project-URL: Homepage, https://anysite.io
|
|
6
6
|
Project-URL: Documentation, https://docs.anysite.io/cli
|
|
@@ -272,6 +272,7 @@ sources:
|
|
|
272
272
|
- type: company
|
|
273
273
|
value: "{value}"
|
|
274
274
|
count: 5
|
|
275
|
+
refresh: always # Re-collect every run with --incremental
|
|
275
276
|
db_load:
|
|
276
277
|
fields: [name, url, headline]
|
|
277
278
|
|
|
@@ -327,6 +328,9 @@ anysite dataset logs my-dataset --run 42
|
|
|
327
328
|
# Generate cron/systemd schedule
|
|
328
329
|
anysite dataset schedule dataset.yaml --incremental --load-db pg
|
|
329
330
|
|
|
331
|
+
# Compare snapshots (diff two collection dates)
|
|
332
|
+
anysite dataset diff dataset.yaml --source employees --key _input_value
|
|
333
|
+
|
|
330
334
|
# Reset incremental state
|
|
331
335
|
anysite dataset reset-cursor dataset.yaml
|
|
332
336
|
```
|
|
@@ -209,6 +209,7 @@ sources:
|
|
|
209
209
|
- type: company
|
|
210
210
|
value: "{value}"
|
|
211
211
|
count: 5
|
|
212
|
+
refresh: always # Re-collect every run with --incremental
|
|
212
213
|
db_load:
|
|
213
214
|
fields: [name, url, headline]
|
|
214
215
|
|
|
@@ -264,6 +265,9 @@ anysite dataset logs my-dataset --run 42
|
|
|
264
265
|
# Generate cron/systemd schedule
|
|
265
266
|
anysite dataset schedule dataset.yaml --incremental --load-db pg
|
|
266
267
|
|
|
268
|
+
# Compare snapshots (diff two collection dates)
|
|
269
|
+
anysite dataset diff dataset.yaml --source employees --key _input_value
|
|
270
|
+
|
|
267
271
|
# Reset incremental state
|
|
268
272
|
anysite dataset reset-cursor dataset.yaml
|
|
269
273
|
```
|
|
@@ -133,6 +133,7 @@ sources:
|
|
|
133
133
|
count: 5
|
|
134
134
|
parallel: 3
|
|
135
135
|
on_error: skip
|
|
136
|
+
refresh: always # Re-collect every run even with --incremental
|
|
136
137
|
|
|
137
138
|
storage:
|
|
138
139
|
format: parquet
|
|
@@ -235,7 +236,17 @@ anysite api /api/linkedin/user user=satyanadella -q --format jsonl \
|
|
|
235
236
|
| anysite db insert pg --table profiles --stdin --auto-create
|
|
236
237
|
```
|
|
237
238
|
|
|
238
|
-
### Step 6: History, Scheduling, and Notifications
|
|
239
|
+
### Step 6: Compare Snapshots
|
|
240
|
+
```bash
|
|
241
|
+
# Diff two most recent snapshots
|
|
242
|
+
anysite dataset diff dataset.yaml --source employees --key _input_value
|
|
243
|
+
|
|
244
|
+
# Diff specific dates, compare only certain fields
|
|
245
|
+
anysite dataset diff dataset.yaml --source employees --key _input_value \
|
|
246
|
+
--from 2026-01-30 --to 2026-02-01 --fields "name,headline"
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
### Step 7: History, Scheduling, and Notifications
|
|
239
250
|
```bash
|
|
240
251
|
# View run history
|
|
241
252
|
anysite dataset history my-dataset
|
|
@@ -24,6 +24,7 @@ sources:
|
|
|
24
24
|
parallel: 3 # Concurrent requests
|
|
25
25
|
rate_limit: "10/s" # Rate limiting
|
|
26
26
|
on_error: skip # stop or skip
|
|
27
|
+
refresh: always # auto (default) or always — re-collect every run
|
|
27
28
|
transform: # Post-collection transform (for exports only)
|
|
28
29
|
filter: '.count > 10' # Safe filter expression
|
|
29
30
|
fields: [name, url] # Field selection with aliases
|
|
@@ -169,6 +170,25 @@ With `--incremental`:
|
|
|
169
170
|
2. Dependent/from_file sources: skips individual input values already in `metadata.json`
|
|
170
171
|
3. New values are still collected and tracked
|
|
171
172
|
|
|
173
|
+
### Refresh Mode
|
|
174
|
+
|
|
175
|
+
Per-source `refresh` field controls behavior with `--incremental`:
|
|
176
|
+
|
|
177
|
+
```yaml
|
|
178
|
+
- id: posts
|
|
179
|
+
endpoint: /api/linkedin/user/posts
|
|
180
|
+
dependency: { from_source: profiles, field: urn.value }
|
|
181
|
+
input_key: user
|
|
182
|
+
refresh: always # Re-collect every run even with --incremental
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
| Setting | `--incremental` | No flag |
|
|
186
|
+
|---------|----------------|---------|
|
|
187
|
+
| `refresh: auto` (default) | Skip collected inputs | Collect all |
|
|
188
|
+
| `refresh: always` | Collect all (ignore cache) | Collect all |
|
|
189
|
+
|
|
190
|
+
Use `refresh: always` for sources with frequently changing data (e.g., posts, activity feeds) where you want fresh snapshots each run while still caching stable parent data.
|
|
191
|
+
|
|
172
192
|
### Storage Layout
|
|
173
193
|
|
|
174
194
|
```
|
|
@@ -428,6 +448,39 @@ Payload: `{event: "complete"|"failure", dataset, timestamp, record_count, source
|
|
|
428
448
|
|
|
429
449
|
---
|
|
430
450
|
|
|
451
|
+
## Comparing Snapshots (Diff)
|
|
452
|
+
|
|
453
|
+
Compare two collection snapshots to find added, removed, and changed records.
|
|
454
|
+
|
|
455
|
+
```bash
|
|
456
|
+
# Compare two most recent snapshots (auto-detect dates)
|
|
457
|
+
anysite dataset diff dataset.yaml --source profiles --key _input_value
|
|
458
|
+
|
|
459
|
+
# Compare specific dates
|
|
460
|
+
anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
|
|
461
|
+
|
|
462
|
+
# Only compare specific fields
|
|
463
|
+
anysite dataset diff dataset.yaml --source profiles --key urn --fields "name,headline,follower_count"
|
|
464
|
+
|
|
465
|
+
# Output as JSON/CSV
|
|
466
|
+
anysite dataset diff dataset.yaml --source profiles --key urn --format json --output diff.json
|
|
467
|
+
```
|
|
468
|
+
|
|
469
|
+
**Options:**
|
|
470
|
+
- `--source, -s` (required) — source to compare
|
|
471
|
+
- `--key, -k` (required) — field to match records by (e.g., `_input_value`, `urn`)
|
|
472
|
+
- `--from` / `--to` — snapshot dates (default: two most recent)
|
|
473
|
+
- `--fields, -f` — only compare these fields
|
|
474
|
+
- `--format` — output format (table, json, jsonl, csv)
|
|
475
|
+
- `--output, -o` — write to file
|
|
476
|
+
|
|
477
|
+
**Output** shows summary counts and a table of changes:
|
|
478
|
+
- **added** — records in the new snapshot but not the old
|
|
479
|
+
- **removed** — records in the old snapshot but not the new
|
|
480
|
+
- **changed** — records with the same key but different values (shows `old → new`)
|
|
481
|
+
|
|
482
|
+
---
|
|
483
|
+
|
|
431
484
|
## Reset Incremental State
|
|
432
485
|
|
|
433
486
|
Clear collected input tracking to force re-collection.
|
|
@@ -357,6 +357,10 @@ def load_db(
|
|
|
357
357
|
bool,
|
|
358
358
|
typer.Option("--quiet", "-q", help="Suppress progress output"),
|
|
359
359
|
] = False,
|
|
360
|
+
snapshot: Annotated[
|
|
361
|
+
str | None,
|
|
362
|
+
typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
|
|
363
|
+
] = None,
|
|
360
364
|
) -> None:
|
|
361
365
|
"""Load collected Parquet data into a relational database with FK linking."""
|
|
362
366
|
config = _load_config(config_path)
|
|
@@ -379,6 +383,7 @@ def load_db(
|
|
|
379
383
|
source_filter=source,
|
|
380
384
|
drop_existing=drop_existing,
|
|
381
385
|
dry_run=dry_run,
|
|
386
|
+
snapshot=snapshot,
|
|
382
387
|
)
|
|
383
388
|
except Exception as e:
|
|
384
389
|
typer.echo(f"Load error: {e}", err=True)
|
|
@@ -413,6 +418,121 @@ def load_db(
|
|
|
413
418
|
)
|
|
414
419
|
|
|
415
420
|
|
|
421
|
+
@app.command("diff")
|
|
422
|
+
def diff_cmd(
|
|
423
|
+
config_path: Annotated[
|
|
424
|
+
Path,
|
|
425
|
+
typer.Argument(help="Path to dataset.yaml"),
|
|
426
|
+
],
|
|
427
|
+
source: Annotated[
|
|
428
|
+
str,
|
|
429
|
+
typer.Option("--source", "-s", help="Source to compare"),
|
|
430
|
+
],
|
|
431
|
+
key: Annotated[
|
|
432
|
+
str,
|
|
433
|
+
typer.Option("--key", "-k", help="Field to match records by (e.g., _input_value, urn)"),
|
|
434
|
+
],
|
|
435
|
+
from_date: Annotated[
|
|
436
|
+
str | None,
|
|
437
|
+
typer.Option("--from", help="Older snapshot date (YYYY-MM-DD)"),
|
|
438
|
+
] = None,
|
|
439
|
+
to_date: Annotated[
|
|
440
|
+
str | None,
|
|
441
|
+
typer.Option("--to", help="Newer snapshot date (YYYY-MM-DD)"),
|
|
442
|
+
] = None,
|
|
443
|
+
fields: Annotated[
|
|
444
|
+
str | None,
|
|
445
|
+
typer.Option("--fields", "-f", help="Only compare these fields (comma-separated)"),
|
|
446
|
+
] = None,
|
|
447
|
+
format: Annotated[
|
|
448
|
+
str,
|
|
449
|
+
typer.Option("--format", help="Output format: table, json, jsonl, csv"),
|
|
450
|
+
] = "table",
|
|
451
|
+
output: Annotated[
|
|
452
|
+
Path | None,
|
|
453
|
+
typer.Option("--output", "-o", help="Write output to file"),
|
|
454
|
+
] = None,
|
|
455
|
+
quiet: Annotated[
|
|
456
|
+
bool,
|
|
457
|
+
typer.Option("--quiet", "-q", help="Suppress summary, only output data"),
|
|
458
|
+
] = False,
|
|
459
|
+
) -> None:
|
|
460
|
+
"""Compare two snapshots of a source to show added, removed, and changed records."""
|
|
461
|
+
from datetime import date as date_type
|
|
462
|
+
|
|
463
|
+
from anysite.dataset.differ import (
|
|
464
|
+
DatasetDiffer,
|
|
465
|
+
format_diff_records,
|
|
466
|
+
format_diff_table,
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
config = _load_config(config_path)
|
|
470
|
+
|
|
471
|
+
# Validate source exists
|
|
472
|
+
src = config.get_source(source)
|
|
473
|
+
if src is None:
|
|
474
|
+
typer.echo(f"Error: source '{source}' not found in dataset", err=True)
|
|
475
|
+
raise typer.Exit(1)
|
|
476
|
+
|
|
477
|
+
differ = DatasetDiffer(config.storage_path())
|
|
478
|
+
|
|
479
|
+
# Parse dates
|
|
480
|
+
parsed_from = None
|
|
481
|
+
parsed_to = None
|
|
482
|
+
try:
|
|
483
|
+
if from_date:
|
|
484
|
+
parsed_from = date_type.fromisoformat(from_date)
|
|
485
|
+
if to_date:
|
|
486
|
+
parsed_to = date_type.fromisoformat(to_date)
|
|
487
|
+
except ValueError as e:
|
|
488
|
+
typer.echo(f"Error: invalid date format: {e}", err=True)
|
|
489
|
+
raise typer.Exit(1) from None
|
|
490
|
+
|
|
491
|
+
# Parse fields
|
|
492
|
+
field_list = None
|
|
493
|
+
if fields:
|
|
494
|
+
field_list = [f.strip() for f in fields.split(",") if f.strip()]
|
|
495
|
+
|
|
496
|
+
try:
|
|
497
|
+
result = differ.diff(
|
|
498
|
+
source,
|
|
499
|
+
key,
|
|
500
|
+
from_date=parsed_from,
|
|
501
|
+
to_date=parsed_to,
|
|
502
|
+
fields=field_list,
|
|
503
|
+
)
|
|
504
|
+
except DatasetError as e:
|
|
505
|
+
typer.echo(f"Error: {e}", err=True)
|
|
506
|
+
raise typer.Exit(1) from None
|
|
507
|
+
|
|
508
|
+
# Print summary unless quiet
|
|
509
|
+
if not quiet:
|
|
510
|
+
console = Console()
|
|
511
|
+
console.print(
|
|
512
|
+
f"\n[bold]Diff: {source}[/bold] "
|
|
513
|
+
f"({result.from_date.isoformat()} → {result.to_date.isoformat()})\n"
|
|
514
|
+
)
|
|
515
|
+
console.print(f" [green]Added:[/green] {len(result.added)}")
|
|
516
|
+
console.print(f" [red]Removed:[/red] {len(result.removed)}")
|
|
517
|
+
console.print(f" [yellow]Changed:[/yellow] {len(result.changed)}")
|
|
518
|
+
console.print(f" Unchanged: {result.unchanged_count}")
|
|
519
|
+
console.print()
|
|
520
|
+
|
|
521
|
+
if not result.has_changes:
|
|
522
|
+
if not quiet:
|
|
523
|
+
Console().print("[dim]No changes detected.[/dim]")
|
|
524
|
+
return
|
|
525
|
+
|
|
526
|
+
# Format and output
|
|
527
|
+
rows = (
|
|
528
|
+
format_diff_table(result, output_fields=field_list)
|
|
529
|
+
if format == "table"
|
|
530
|
+
else format_diff_records(result, output_fields=field_list)
|
|
531
|
+
)
|
|
532
|
+
|
|
533
|
+
_output_results(rows, format, output)
|
|
534
|
+
|
|
535
|
+
|
|
416
536
|
@app.command("history")
|
|
417
537
|
def history(
|
|
418
538
|
name: Annotated[
|
|
@@ -43,6 +43,7 @@ class CollectionPlan:
|
|
|
43
43
|
params: dict[str, Any] | None = None,
|
|
44
44
|
dependency: str | None = None,
|
|
45
45
|
estimated_requests: int | None = None,
|
|
46
|
+
refresh: str = "auto",
|
|
46
47
|
) -> None:
|
|
47
48
|
self.steps.append({
|
|
48
49
|
"source": source_id,
|
|
@@ -51,6 +52,7 @@ class CollectionPlan:
|
|
|
51
52
|
"params": params or {},
|
|
52
53
|
"dependency": dependency,
|
|
53
54
|
"estimated_requests": estimated_requests,
|
|
55
|
+
"refresh": refresh,
|
|
54
56
|
})
|
|
55
57
|
|
|
56
58
|
|
|
@@ -116,8 +118,8 @@ async def collect_dataset(
|
|
|
116
118
|
|
|
117
119
|
try:
|
|
118
120
|
for source in ordered:
|
|
119
|
-
# Check incremental skip
|
|
120
|
-
if incremental:
|
|
121
|
+
# Check incremental skip (refresh: always bypasses this)
|
|
122
|
+
if incremental and source.refresh != "always":
|
|
121
123
|
parquet_path = get_parquet_path(base_path, source.id, today)
|
|
122
124
|
if parquet_path.exists():
|
|
123
125
|
if not quiet:
|
|
@@ -276,8 +278,8 @@ async def _collect_from_file(
|
|
|
276
278
|
print_warning(f"No values extracted from {file_path}")
|
|
277
279
|
return []
|
|
278
280
|
|
|
279
|
-
# Filter already-collected inputs in incremental mode
|
|
280
|
-
if incremental and metadata:
|
|
281
|
+
# Filter already-collected inputs in incremental mode (refresh: always bypasses)
|
|
282
|
+
if incremental and source.refresh != "always" and metadata:
|
|
281
283
|
already = metadata.get_collected_inputs(source.id)
|
|
282
284
|
if already:
|
|
283
285
|
original = len(values)
|
|
@@ -432,8 +434,8 @@ async def _collect_dependent(
|
|
|
432
434
|
f"Source {source.id} has a dependency but no input_key defined"
|
|
433
435
|
)
|
|
434
436
|
|
|
435
|
-
# Filter already-collected inputs in incremental mode
|
|
436
|
-
if incremental and metadata:
|
|
437
|
+
# Filter already-collected inputs in incremental mode (refresh: always bypasses)
|
|
438
|
+
if incremental and source.refresh != "always" and metadata:
|
|
437
439
|
already = metadata.get_collected_inputs(source.id)
|
|
438
440
|
if already:
|
|
439
441
|
original = len(values)
|
|
@@ -579,7 +581,7 @@ def _build_plan(
|
|
|
579
581
|
plan = CollectionPlan()
|
|
580
582
|
|
|
581
583
|
for source in ordered:
|
|
582
|
-
if incremental:
|
|
584
|
+
if incremental and source.refresh != "always":
|
|
583
585
|
parquet_path = get_parquet_path(base_path, source.id, today)
|
|
584
586
|
if parquet_path.exists():
|
|
585
587
|
continue
|
|
@@ -592,6 +594,7 @@ def _build_plan(
|
|
|
592
594
|
kind="from_file",
|
|
593
595
|
params={"file": source.from_file, "field": source.file_field},
|
|
594
596
|
estimated_requests=est,
|
|
597
|
+
refresh=source.refresh,
|
|
595
598
|
)
|
|
596
599
|
elif source.dependency is None:
|
|
597
600
|
plan.add_step(
|
|
@@ -600,6 +603,7 @@ def _build_plan(
|
|
|
600
603
|
kind="independent",
|
|
601
604
|
params=source.params,
|
|
602
605
|
estimated_requests=1,
|
|
606
|
+
refresh=source.refresh,
|
|
603
607
|
)
|
|
604
608
|
else:
|
|
605
609
|
est = _count_dependent_inputs(source, base_path, metadata)
|
|
@@ -609,6 +613,7 @@ def _build_plan(
|
|
|
609
613
|
kind="dependent",
|
|
610
614
|
dependency=source.dependency.from_source,
|
|
611
615
|
estimated_requests=est,
|
|
616
|
+
refresh=source.refresh,
|
|
612
617
|
)
|
|
613
618
|
|
|
614
619
|
return plan
|
|
@@ -665,11 +670,14 @@ def _print_plan(plan: CollectionPlan) -> dict[str, int]:
|
|
|
665
670
|
table.add_column("Est. Requests")
|
|
666
671
|
|
|
667
672
|
for i, step in enumerate(plan.steps, 1):
|
|
673
|
+
kind = step["kind"]
|
|
674
|
+
if step.get("refresh") == "always":
|
|
675
|
+
kind += " (refresh)"
|
|
668
676
|
table.add_row(
|
|
669
677
|
str(i),
|
|
670
678
|
step["source"],
|
|
671
679
|
step["endpoint"],
|
|
672
|
-
step["kind"],
|
680
|
+
kind,
|
|
673
681
|
step.get("dependency") or "-",
|
|
674
682
|
str(step.get("estimated_requests") or "?"),
|
|
675
683
|
)
|