anysite-cli 0.1.4__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of anysite-cli might be problematic.
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/CLAUDE.md +9 -4
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/PKG-INFO +14 -3
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/README.md +13 -2
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/pyproject.toml +1 -1
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/skills/anysite-cli/SKILL.md +22 -1
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/skills/anysite-cli/references/dataset-guide.md +34 -3
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/collector.py +4 -3
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/db_loader.py +17 -6
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/differ.py +14 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/models.py +4 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/storage.py +21 -1
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_db_loader.py +198 -1
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_differ.py +92 -1
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/.claude/settings.local.json +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/.gitignore +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/LICENSE +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/skills/anysite-cli/references/api-reference.md +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/__main__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/api/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/api/client.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/api/errors.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/api/schemas.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/batch/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/batch/executor.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/batch/input.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/batch/rate_limiter.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/cli/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/cli/config.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/cli/executor.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/cli/options.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/config/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/config/paths.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/config/settings.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/analyzer.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/cli.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/errors.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/exporters.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/history.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/notifications.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/scheduler.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/transformer.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/adapters/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/adapters/base.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/adapters/postgres.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/adapters/sqlite.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/cli.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/config.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/manager.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/operations/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/operations/insert.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/operations/query.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/schema/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/schema/inference.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/schema/types.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/utils/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/utils/sanitize.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/main.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/models/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/output/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/output/console.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/output/formatters.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/output/templates.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/py.typed +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/streaming/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/streaming/progress.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/streaming/writer.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/utils/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/utils/fields.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/utils/retry.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/enriched_partners_sample_10.csv +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/linkedin-partners/company_aliases.txt +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/linkedin-partners/dataset.yaml +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/partners-deep/dataset.yaml +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/partners-intel/dataset.yaml +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/partners-linkedin/company_aliases.txt +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/partners-linkedin/dataset.yaml +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/partners-pipeline/dataset.yaml +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/conftest.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_api/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_batch/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_batch/test_executor.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_batch/test_input.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_batch/test_rate_limiter.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_cli/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_cli/test_main.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_analyzer.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_collector.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_exporters.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_history.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_integration_csv.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_models.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_notifications.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_scheduler.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_storage.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_transformer.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_cli.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_config.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_inference.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_insert.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_manager.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_postgres_adapter.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_sanitize.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_sqlite_adapter.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_output/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_output/test_formatters.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_output/test_templates.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_streaming/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_streaming/test_progress.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_streaming/test_writer.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_utils/__init__.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_utils/test_fields.py +0 -0
- {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_utils/test_retry.py +0 -0
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/CLAUDE.md

@@ -49,12 +49,14 @@ anysite dataset query dataset.yaml --interactive
 anysite dataset stats dataset.yaml --source profiles
 anysite dataset profile dataset.yaml
 anysite dataset load-db dataset.yaml -c pg --drop-existing
+anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
 anysite dataset history my-dataset
 anysite dataset logs my-dataset --run 42
 anysite dataset schedule dataset.yaml --incremental --load-db pg
 anysite dataset schedule dataset.yaml --systemd --load-db pg
 anysite dataset diff dataset.yaml --source profiles --key _input_value
-anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+anysite dataset diff dataset.yaml --source profiles --key urn.value --from 2026-01-30 --to 2026-02-01
+anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline,follower_count"
 anysite dataset reset-cursor dataset.yaml
 anysite dataset reset-cursor dataset.yaml --source profiles

@@ -104,9 +106,9 @@ anysite db upsert mydb --table users --conflict-columns id --stdin
 - `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
 - `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
 - `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
-- `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters
+- `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). Supports dot-notation keys via `json_extract_string()`. `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters with output field filtering
 - `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `diff`, `history`, `logs`, `schedule`, `reset-cursor`
-- `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
+- `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference, diff-based incremental sync (`db_load.key` + `db_load.sync: full|append`). Supports diff-based incremental sync via `db_load.key` and `--snapshot` for loading specific dates
 - `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
 - `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
 - `db/config.py` - `ConnectionConfig`, `DatabaseType`, `OnConflict` enums and models

@@ -164,8 +166,11 @@ Sources are topologically sorted by dependencies. `input_template` allows transf
 - Schema inference from Parquet records via `infer_table_schema()`
 - Auto-increment `id` primary key per table
 - FK linking via provenance: parent `_input_value` → child `{parent}_id` column
-- Optional `db_load` config per source: field selection, dot-notation extraction, custom table names, field exclusion
+- Optional `db_load` config per source: field selection, dot-notation extraction, custom table names, field exclusion, `key` for diff-based incremental sync
 - Topological loading order (parents before children)
+- Diff-based incremental sync: when `db_load.key` is set and table exists with >=2 snapshots, diffs the two most recent and applies INSERT/DELETE/UPDATE delta
+- `--snapshot YYYY-MM-DD` flag to load a specific snapshot date
+- `--drop-existing` forces full INSERT of latest snapshot

 **Dataset Storage Layout**:
 ```
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: anysite-cli
-Version: 0.1.4
+Version: 0.1.6
 Summary: CLI for Anysite API - web data extraction for humans and AI agents
 Project-URL: Homepage, https://anysite.io
 Project-URL: Documentation, https://docs.anysite.io/cli

@@ -259,6 +259,8 @@ sources:
     path: ./output/companies-{{date}}.csv
     format: csv
     db_load:
+      key: _input_value  # Unique key for incremental sync
+      sync: full         # full (default) or append (no DELETE)
       fields: [name, url, employee_count]

   - id: employees

@@ -274,6 +276,8 @@ sources:
       count: 5
     refresh: always  # Re-collect every run with --incremental
     db_load:
+      key: urn.value  # Unique key for incremental sync
+      sync: append    # Keep old records (no DELETE on diff)
       fields: [name, url, headline]

 storage:

@@ -318,9 +322,15 @@ anysite dataset query dataset.yaml --interactive
 anysite dataset stats dataset.yaml --source companies
 anysite dataset profile dataset.yaml

-# Load into PostgreSQL with automatic FK linking
+# Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+anysite dataset load-db dataset.yaml -c pg
+
+# Drop and reload from latest snapshot
 anysite dataset load-db dataset.yaml -c pg --drop-existing

+# Load a specific snapshot date
+anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
 # Run history and logs
 anysite dataset history my-dataset
 anysite dataset logs my-dataset --run 42

@@ -328,8 +338,9 @@ anysite dataset logs my-dataset --run 42
 # Generate cron/systemd schedule
 anysite dataset schedule dataset.yaml --incremental --load-db pg

-# Compare snapshots (diff two collection dates)
+# Compare snapshots (diff two collection dates, supports dot-notation keys)
 anysite dataset diff dataset.yaml --source employees --key _input_value
+anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"

 # Reset incremental state
 anysite dataset reset-cursor dataset.yaml
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/README.md

@@ -196,6 +196,8 @@ sources:
     path: ./output/companies-{{date}}.csv
     format: csv
     db_load:
+      key: _input_value  # Unique key for incremental sync
+      sync: full         # full (default) or append (no DELETE)
       fields: [name, url, employee_count]

   - id: employees

@@ -211,6 +213,8 @@ sources:
       count: 5
     refresh: always  # Re-collect every run with --incremental
     db_load:
+      key: urn.value  # Unique key for incremental sync
+      sync: append    # Keep old records (no DELETE on diff)
       fields: [name, url, headline]

 storage:

@@ -255,9 +259,15 @@ anysite dataset query dataset.yaml --interactive
 anysite dataset stats dataset.yaml --source companies
 anysite dataset profile dataset.yaml

-# Load into PostgreSQL with automatic FK linking
+# Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+anysite dataset load-db dataset.yaml -c pg
+
+# Drop and reload from latest snapshot
 anysite dataset load-db dataset.yaml -c pg --drop-existing

+# Load a specific snapshot date
+anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
 # Run history and logs
 anysite dataset history my-dataset
 anysite dataset logs my-dataset --run 42

@@ -265,8 +275,9 @@ anysite dataset logs my-dataset --run 42
 # Generate cron/systemd schedule
 anysite dataset schedule dataset.yaml --incremental --load-db pg

-# Compare snapshots (diff two collection dates)
+# Compare snapshots (diff two collection dates, supports dot-notation keys)
 anysite dataset diff dataset.yaml --source employees --key _input_value
+anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"

 # Reset incremental state
 anysite dataset reset-cursor dataset.yaml
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/skills/anysite-cli/SKILL.md

@@ -117,6 +117,8 @@ sources:
     path: ./output/companies-{{date}}.csv
     format: csv
     db_load:
+      key: _input_value  # Unique key for diff-based incremental sync
+      sync: full         # full (INSERT/DELETE/UPDATE) or append (no DELETE)
       fields: [name, url, employee_count]

   - id: employees

@@ -189,18 +191,32 @@ anysite dataset profile dataset.yaml
 # Load all sources with FK linking
 anysite dataset load-db dataset.yaml -c pg --drop-existing

+# Incremental sync (uses diff when db_load.key is set)
+anysite dataset load-db dataset.yaml -c pg
+
+# Load a specific snapshot date
+anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
 # Dry run
 anysite dataset load-db dataset.yaml -c pg --dry-run
 ```

 `load-db` auto-creates tables with inferred schema, adds `id` primary key, and links child tables to parents via `{parent}_id` FK columns using provenance data.

+**Incremental sync**: When `db_load.key` is set and the table already exists with >=2 snapshots, `load-db` diffs the two most recent snapshots and applies only the delta (INSERT added, DELETE removed, UPDATE changed). Without `db_load.key`, it does a full INSERT of the latest snapshot.
+
+**Sync modes** (`db_load.sync`):
+- `full` (default) — applies INSERT, DELETE, and UPDATE from diff
+- `append` — applies INSERT and UPDATE only, skips DELETE (keeps records that disappeared from the API). Use for sources where the API returns only the latest N items (e.g., posts, activity feeds).
+
 Optional `db_load` config per source controls which fields go to DB:
 ```yaml
 - id: profiles
   endpoint: /api/linkedin/user
   db_load:
     table: people  # Custom table name
+    key: urn.value  # Unique key for diff-based incremental sync
+    sync: append    # Keep old records (no DELETE on diff)
     fields:  # Select specific fields
       - name
       - urn.value AS urn_id  # Dot-notation extraction

@@ -241,11 +257,16 @@ anysite api /api/linkedin/user user=satyanadella -q --format jsonl \
 # Diff two most recent snapshots
 anysite dataset diff dataset.yaml --source employees --key _input_value

-# Diff
+# Diff with dot-notation key (for JSON fields like urn)
+anysite dataset diff dataset.yaml --source profiles --key urn.value
+
+# Diff specific dates, compare and output only certain fields
 anysite dataset diff dataset.yaml --source employees --key _input_value \
   --from 2026-01-30 --to 2026-02-01 --fields "name,headline"
 ```

+`--key` supports dot-notation for JSON fields (e.g., `urn.value`). `--fields` restricts both comparison and output columns.
+
 ### Step 7: History, Scheduling, and Notifications
 ```bash
 # View run history
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/skills/anysite-cli/references/dataset-guide.md

@@ -39,6 +39,8 @@ sources:
     headers: {X-Token: abc}
     db_load:  # Database loading config
       table: custom_name  # Override table name
+      key: urn.value  # Unique key for diff-based incremental sync
+      sync: full      # full (INSERT/DELETE/UPDATE) or append (no DELETE)
       fields: [name, url]  # Fields to include
       exclude: [_input_value]  # Fields to exclude

@@ -244,6 +246,7 @@ anysite dataset load-db dataset.yaml -c <connection_name> [OPTIONS]
 --connection, -c TEXT   Database connection name (required)
 --source, -s TEXT       Load specific source + dependencies
 --drop-existing         Drop tables before creating
+--snapshot TEXT         Load a specific snapshot date (YYYY-MM-DD)
 --dry-run               Show plan without executing
 --quiet, -q             Suppress output
 ```

@@ -256,6 +259,29 @@ anysite dataset load-db dataset.yaml -c <connection_name> [OPTIONS]
 4. Inserts rows, tracking which `_input_value` maps to which `id`
 5. For child sources: adds `{parent_source}_id` FK column using provenance

+### Incremental Sync with `db_load.key`
+
+When `db_load.key` is set and the table already exists with >=2 snapshots, `load-db` uses diff-based incremental sync instead of full re-insertion:
+
+1. Compares the two most recent Parquet snapshots using `DatasetDiffer`
+2. **Added** records → INSERT into DB
+3. **Removed** records → DELETE from DB (by key) — only in `sync: full` mode
+4. **Changed** records → UPDATE modified fields (by key)
+
+This keeps the database in sync without duplicates.
+
+**Sync modes** (`db_load.sync`):
+- `full` (default) — applies INSERT, DELETE, and UPDATE from diff
+- `append` — applies INSERT and UPDATE only, skips DELETE. Use for sources where the API returns only the latest N items (e.g., posts, comments) and you want to accumulate records over time.
+
+| Scenario | Behavior |
+|----------|----------|
+| First load (table doesn't exist) | Full INSERT of latest snapshot |
+| Table exists + `db_load.key` + >=2 snapshots | Diff-based sync (INSERT/DELETE/UPDATE delta) |
+| `--drop-existing` | Drop table, full INSERT of latest snapshot |
+| `--snapshot 2026-01-15` | Full INSERT of that specific snapshot |
+| No `db_load.key` set | Full INSERT of latest snapshot (no diff) |
+
 ### db_load Config

 Control which fields go to the database per source:

@@ -263,6 +289,8 @@ Control which fields go to the database per source:
 ```yaml
 db_load:
   table: people   # Custom table name (default: source ID)
+  key: urn.value  # Unique key for diff-based incremental sync
+  sync: append    # full (default) or append (no DELETE on diff)
   fields:         # Explicit field list
     - name
     - url

@@ -456,8 +484,11 @@ Compare two collection snapshots to find added, removed, and changed records.
 # Compare two most recent snapshots (auto-detect dates)
 anysite dataset diff dataset.yaml --source profiles --key _input_value

+# Compare with dot-notation key (JSON fields)
+anysite dataset diff dataset.yaml --source profiles --key urn.value
+
 # Compare specific dates
-anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+anysite dataset diff dataset.yaml --source profiles --key urn.value --from 2026-01-30 --to 2026-02-01

 # Only compare specific fields
 anysite dataset diff dataset.yaml --source profiles --key urn --fields "name,headline,follower_count"

@@ -468,9 +499,9 @@ anysite dataset diff dataset.yaml --source profiles --key urn --format json --ou

 **Options:**
 - `--source, -s` (required) — source to compare
-- `--key, -k` (required) — field to match records by (e.g., `
+- `--key, -k` (required) — field to match records by. Supports dot-notation for JSON fields (e.g., `urn.value`)
 - `--from` / `--to` — snapshot dates (default: two most recent)
-- `--fields, -f` —
+- `--fields, -f` — restrict both comparison and output to these fields
 - `--format` — output format (table, json, jsonl, csv)
 - `--output, -o` — write to file

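The incremental sync described in the dataset-guide hunks above reduces to applying an added/removed/changed delta keyed on `db_load.key`. The sketch below illustrates that idea with plain sqlite3 and made-up data; it is not the package's `DatasetDbLoader`, and the table, column, and function names are hypothetical.

```python
import sqlite3

def apply_delta(conn, table, key_col, added, removed, changed, sync="full"):
    """Apply a snapshot diff: INSERT added, DELETE removed (unless sync='append'),
    UPDATE changed. Illustrative sketch only."""
    cur = conn.cursor()
    for rec in added:
        cols = ", ".join(rec)
        marks = ", ".join("?" for _ in rec)
        cur.execute(f"INSERT INTO {table} ({cols}) VALUES ({marks})", tuple(rec.values()))
    if sync == "full":  # append mode keeps rows that vanished from the API
        for rec in removed:
            cur.execute(f"DELETE FROM {table} WHERE {key_col} = ?", (rec[key_col],))
    for rec in changed:
        sets = ", ".join(f"{c} = ?" for c in rec if c != key_col)
        params = [v for c, v in rec.items() if c != key_col] + [rec[key_col]]
        cur.execute(f"UPDATE {table} SET {sets} WHERE {key_col} = ?", params)
    conn.commit()

# Tiny demo with an in-memory database and fabricated records
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE posts (uid TEXT, likes INTEGER)")
conn.execute("INSERT INTO posts VALUES ('a', 10), ('b', 5)")
apply_delta(
    conn, "posts", "uid",
    added=[{"uid": "d", "likes": 0}],
    removed=[{"uid": "b", "likes": 5}],
    changed=[{"uid": "a", "likes": 15}],
    sync="full",
)
print(conn.execute("SELECT uid, likes FROM posts ORDER BY uid").fetchall())
# [('a', 15), ('d', 0)] — 'b' was deleted; with sync="append" it would remain
```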
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/collector.py

@@ -19,6 +19,7 @@ from anysite.dataset.models import DatasetConfig, DatasetSource
 from anysite.dataset.storage import (
     MetadataStore,
     get_parquet_path,
+    read_latest_parquet,
     read_parquet,
     write_parquet,
 )

@@ -412,9 +413,9 @@ async def _collect_dependent(
     if dep is None:
         raise DatasetError(f"Source {source.id} has no dependency defined")

-    # Read parent data
+    # Read parent data (latest snapshot only to avoid schema mismatch)
     parent_dir = base_path / "raw" / dep.from_source
-    parent_records =
+    parent_records = read_latest_parquet(parent_dir)

     if not parent_records:
         if not quiet:

@@ -627,7 +628,7 @@ def _count_dependent_inputs(
     if dep is None:
         return None
     parent_dir = base_path / "raw" / dep.from_source
-    parent_records =
+    parent_records = read_latest_parquet(parent_dir)
     if not parent_records:
         info = metadata.get_source_info(dep.from_source)
         return info.get("record_count") if info else None
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/db_loader.py

@@ -301,9 +301,13 @@ class DatasetDbLoader:
         """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
         result = differ.diff(source.id, diff_key)
         total = 0
+        sync_mode = source.db_load.sync if source.db_load else "full"

         if dry_run:
-
+            count = len(result.added) + len(result.changed)
+            if sync_mode == "full":
+                count += len(result.removed)
+            return count

         # Extract key value from a record (handles dot-notation)
         def _get_key_val(record: dict[str, Any]) -> Any:

@@ -321,14 +325,15 @@ class DatasetDbLoader:
                 self.adapter.insert_batch(table_name, [row])
                 total += 1

-        # DELETE removed records
-
+        # DELETE removed records (skipped in append mode)
+        ph = self._placeholder()
+        if result.removed and sync_mode == "full":
             safe_col = sanitize_identifier(db_key_col)
             for record in result.removed:
                 key_val = _get_key_val(record)
                 if key_val is not None:
                     self.adapter.execute(
-                        f"DELETE FROM {table_name} WHERE {safe_col} =
+                        f"DELETE FROM {table_name} WHERE {safe_col} = {ph}",
                         (str(key_val),),
                     )
                     total += 1

@@ -350,14 +355,14 @@ class DatasetDbLoader:
             for field_name in changed_fields:
                 new_val = record.get(field_name)
                 safe_field = sanitize_identifier(field_name)
-                set_parts.append(f"{safe_field} =
+                set_parts.append(f"{safe_field} = {ph}")
                 params.append(new_val)

             params.append(str(key_val))
             sql = (
                 f"UPDATE {table_name} "
                 f"SET {', '.join(set_parts)} "
-                f"WHERE {safe_col} =
+                f"WHERE {safe_col} = {ph}"
             )
             self.adapter.execute(sql, tuple(params))
             total += 1

@@ -371,6 +376,12 @@
                 return other.dependency.field
         return None

+    def _placeholder(self) -> str:
+        """Get the parameter placeholder for the dialect."""
+        if self._dialect == "postgres":
+            return "%s"
+        return "?"
+
     def _auto_id_type(self) -> str:
         """Get the auto-increment ID column type for the dialect."""
         if self._dialect == "postgres":
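The `_placeholder()` change above picks the parameter marker from the dialect so the same DELETE/UPDATE strings work against SQLite (`?`) and Postgres via psycopg (`%s`). A rough standalone illustration of that pattern, demonstrated against SQLite only; the helper names are hypothetical, not the package's API:

```python
import sqlite3

def placeholder(dialect: str) -> str:
    # SQLite uses qmark-style parameters, psycopg (Postgres) uses format-style
    return "%s" if dialect == "postgres" else "?"

def delete_by_key(conn, dialect: str, table: str, key_col: str, key_val) -> str:
    ph = placeholder(dialect)
    sql = f"DELETE FROM {table} WHERE {key_col} = {ph}"
    conn.execute(sql, (str(key_val),))
    return sql

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE items (name TEXT, score INTEGER)")
conn.execute("INSERT INTO items VALUES ('Alice', 90), ('Bob', 80)")

print(delete_by_key(conn, "sqlite", "items", "name", "Bob"))
# DELETE FROM items WHERE name = ?
print(placeholder("postgres"))  # %s — what the Postgres branch would emit
print(conn.execute("SELECT name FROM items").fetchall())  # [('Alice',)]
```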
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/differ.py

@@ -344,6 +344,9 @@ class DatasetDiffer:
                 old_val = record.get(old_key)
                 if _values_differ(new_val, old_val):
                     changed_fields.append(col)
+            # Fallback: DuckDB detected a change but Python comparison missed it
+            if not changed_fields:
+                changed_fields = list(compare_fields)
             record["_changed_fields"] = changed_fields

         return records

@@ -377,6 +380,15 @@ def _values_differ(a: Any, b: Any) -> bool:
             return json.loads(a) != json.loads(b)
         except (json.JSONDecodeError, ValueError):
             pass
+    # Handle complex types (dict, list) — compare via JSON serialization
+    # to catch differences DuckDB sees but Python equality misses
+    if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
+        try:
+            return json.dumps(a, sort_keys=True, default=str) != json.dumps(
+                b, sort_keys=True, default=str
+            )
+        except (TypeError, ValueError):
+            pass
     return True


@@ -452,6 +464,8 @@ def format_diff_records(

     for record in result.changed:
         row: dict[str, Any] = {"_diff": "changed"}
+        changed_fields = record.get("_changed_fields", [])
+        row["_changed_fields"] = changed_fields
         for k, v in record.items():
             if k == "_changed_fields":
                 continue
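The `_values_differ` addition above normalizes dicts and lists through JSON serialization so key order and nested values compare consistently. A small self-contained sketch of the same idea (not the package's function):

```python
import json
from typing import Any

def values_differ(a: Any, b: Any) -> bool:
    """Return True when two values should count as changed.
    Dicts and lists are compared via sorted-key JSON dumps."""
    if a == b:
        return False
    if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
        try:
            return json.dumps(a, sort_keys=True, default=str) != json.dumps(
                b, sort_keys=True, default=str
            )
        except (TypeError, ValueError):
            pass
    return True

print(values_differ({"b": 2, "a": 1}, {"a": 1, "b": 2}))  # False: key order is ignored
print(values_differ({"a": (1, 2)}, {"a": [1, 2]}))        # False: same JSON form despite Python inequality
print(values_differ({"a": 1}, {"a": 2}))                  # True
print(values_differ([1, 2], [1, 2]))                      # False
```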
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/models.py

@@ -82,6 +82,10 @@ class DbLoadConfig(BaseModel):

     table: str | None = Field(default=None, description="Override table name (default: source id)")
     key: str | None = Field(default=None, description="Unique key field for diff-based DB sync (e.g., urn.value)")
+    sync: Literal["full", "append"] = Field(
+        default="full",
+        description="Sync mode: 'full' applies INSERT/DELETE/UPDATE, 'append' skips DELETE (keeps old records)",
+    )
     fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
     exclude: list[str] = Field(
         default_factory=lambda: ["_input_value", "_parent_source"],
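The new `sync` field on `DbLoadConfig` is declared as `Literal["full", "append"]` with a `full` default, so an invalid mode is rejected when the dataset config is parsed. A self-contained pydantic sketch mirroring that shape; `DbLoadConfigSketch` is a stand-in model written here, not an import from the package, and the snippet assumes pydantic v2 error types:

```python
from typing import Literal

from pydantic import BaseModel, Field, ValidationError

class DbLoadConfigSketch(BaseModel):
    table: str | None = None
    key: str | None = None  # e.g. "urn.value" enables diff-based sync
    sync: Literal["full", "append"] = Field(default="full")
    fields: list[str] = Field(default_factory=list)

print(DbLoadConfigSketch(key="urn.value", sync="append").sync)  # append
print(DbLoadConfigSketch().sync)                                # full (default)

try:
    DbLoadConfigSketch(sync="upsert")  # not an allowed literal
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["type"])  # literal_error in pydantic v2
```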
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/storage.py

@@ -75,7 +75,7 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
         tables = [pq.read_table(f) for f in files]
         import pyarrow as pa

-        table = pa.concat_tables(tables)
+        table = pa.concat_tables(tables, promote_options="permissive")
     else:
         if not path.exists():
             return []

@@ -84,6 +84,26 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
     return table.to_pylist()


+def read_latest_parquet(path: Path) -> list[dict[str, Any]]:
+    """Read records from the most recent Parquet snapshot in a directory.
+
+    Unlike ``read_parquet(dir)``, this reads only the latest file, avoiding
+    schema mismatch errors when snapshots have different column types.
+
+    Args:
+        path: Directory containing dated .parquet files.
+
+    Returns:
+        List of dicts from the newest snapshot, or [] if none found.
+    """
+    if not path.is_dir():
+        return read_parquet(path)
+    files = sorted(path.glob("*.parquet"))
+    if not files:
+        return []
+    return read_parquet(files[-1])
+
+
 def get_source_dir(base_path: Path, source_id: str) -> Path:
     """Get the raw data directory for a source."""
     return base_path / "raw" / source_id
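`read_latest_parquet` above sidesteps schema drift by reading only the newest dated file instead of concatenating every snapshot. The following is a minimal pyarrow sketch of the same behavior with fabricated snapshot data; it is written here for illustration and is not the package's storage module:

```python
import tempfile
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq

def read_latest(snapshot_dir: Path) -> list[dict]:
    # Dated filenames (YYYY-MM-DD.parquet) sort lexicographically by date,
    # so the last entry is the newest snapshot.
    files = sorted(snapshot_dir.glob("*.parquet"))
    if not files:
        return []
    return pq.read_table(files[-1]).to_pylist()

with tempfile.TemporaryDirectory() as tmp:
    d = Path(tmp)
    # The older snapshot stores follower_count as a string, the newer one as an
    # int; concatenating both would require permissive type promotion.
    pq.write_table(pa.table({"urn": ["u1"], "follower_count": ["100"]}), d / "2026-01-01.parquet")
    pq.write_table(pa.table({"urn": ["u1"], "follower_count": [150]}), d / "2026-01-02.parquet")
    print(read_latest(d))  # [{'urn': 'u1', 'follower_count': 150}]
```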
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_db_loader.py

@@ -1,7 +1,6 @@
 """Tests for dataset DB loader with SQLite in-memory adapter."""

 import json
-
 import pytest

 from anysite.dataset.db_loader import DatasetDbLoader, _extract_dot_value, _filter_record

@@ -636,3 +635,201 @@ class TestDropExistingWithDiffKey:
         assert len(rows) == 2
         assert rows[0]["name"] == "Alice"
         assert rows[0]["score"] == 95
+
+
+class TestAppendSyncMode:
+    """Test sync: append skips DELETE but still INSERTs and UPDATEs."""
+
+    def _setup_two_snapshots(self, tmp_path, source_id, old_records, new_records):
+        source_dir = get_source_dir(tmp_path / "data", source_id)
+        write_parquet(old_records, source_dir / "2026-01-01.parquet")
+        write_parquet(new_records, source_dir / "2026-01-02.parquet")
+
+    def test_append_keeps_removed_records(self, tmp_path):
+        """With sync: append, records missing from new snapshot are NOT deleted."""
+        sources = [
+            DatasetSource(
+                id="posts", endpoint="/api/posts",
+                db_load=DbLoadConfig(key="uid", sync="append"),
+            ),
+        ]
+        config = _make_config(tmp_path, sources)
+
+        self._setup_two_snapshots(
+            tmp_path, "posts",
+            old_records=[
+                {"uid": "a", "text": "Hello", "likes": 10},
+                {"uid": "b", "text": "World", "likes": 5},
+                {"uid": "c", "text": "Bye", "likes": 3},
+            ],
+            new_records=[
+                {"uid": "a", "text": "Hello", "likes": 15},  # changed
+                {"uid": "d", "text": "New post", "likes": 0},  # added
+                # b and c removed from snapshot
+            ],
+        )
+
+        adapter = _sqlite_adapter()
+        with adapter:
+            # Set up DB with old data
+            source_dir = get_source_dir(tmp_path / "data", "posts")
+            loader = DatasetDbLoader(config, adapter)
+            loader._full_insert(
+                sources[0], "posts", source_dir / "2026-01-01.parquet"
+            )
+            assert len(adapter.fetch_all("SELECT * FROM posts")) == 3
+
+            # Diff sync with append mode
+            loader2 = DatasetDbLoader(config, adapter)
+            results = loader2.load_all()
+            # 1 added + 1 changed = 2 (no deletes)
+            assert results["posts"] == 2
+
+            rows = adapter.fetch_all("SELECT * FROM posts ORDER BY uid")
+            assert len(rows) == 4  # 3 original + 1 added, none deleted
+            uids = [r["uid"] for r in rows]
+            assert "a" in uids  # updated
+            assert "b" in uids  # kept (not deleted)
+            assert "c" in uids  # kept (not deleted)
+            assert "d" in uids  # added
+
+            # Verify update applied
+            a_row = [r for r in rows if r["uid"] == "a"][0]
+            assert a_row["likes"] == 15
+
+    def test_full_sync_deletes_removed_records(self, tmp_path):
+        """With sync: full (default), removed records ARE deleted."""
+        sources = [
+            DatasetSource(
+                id="posts", endpoint="/api/posts",
+                db_load=DbLoadConfig(key="uid", sync="full"),
+            ),
+        ]
+        config = _make_config(tmp_path, sources)
+
+        self._setup_two_snapshots(
+            tmp_path, "posts",
+            old_records=[
+                {"uid": "a", "text": "Hello", "likes": 10},
+                {"uid": "b", "text": "World", "likes": 5},
+            ],
+            new_records=[
+                {"uid": "a", "text": "Hello", "likes": 15},
+                # b removed
+            ],
+        )
+
+        adapter = _sqlite_adapter()
+        with adapter:
+            source_dir = get_source_dir(tmp_path / "data", "posts")
+            loader = DatasetDbLoader(config, adapter)
+            loader._full_insert(
+                sources[0], "posts", source_dir / "2026-01-01.parquet"
+            )
+            assert len(adapter.fetch_all("SELECT * FROM posts")) == 2
+
+            loader2 = DatasetDbLoader(config, adapter)
+            results = loader2.load_all()
+            assert results["posts"] == 2  # 1 changed + 1 removed
+
+            rows = adapter.fetch_all("SELECT * FROM posts")
+            assert len(rows) == 1
+            assert rows[0]["uid"] == "a"
+
+
+class TestPostgresPlaceholders:
+    """Test that diff-based sync uses %s placeholders for postgres dialect."""
+
+    def _setup_two_snapshots(self, tmp_path, source_id, old_records, new_records):
+        source_dir = get_source_dir(tmp_path / "data", source_id)
+        write_parquet(old_records, source_dir / "2026-01-01.parquet")
+        write_parquet(new_records, source_dir / "2026-01-02.parquet")
+
+    def test_delete_uses_percent_s(self, tmp_path):
+        """DELETE query uses %s placeholder for postgres."""
+        sources = [
+            DatasetSource(
+                id="items", endpoint="/api/items",
+                db_load=DbLoadConfig(key="name", sync="full"),
+            ),
+        ]
+        config = _make_config(tmp_path, sources)
+
+        self._setup_two_snapshots(
+            tmp_path, "items",
+            old_records=[
+                {"name": "Alice", "score": 90},
+                {"name": "Bob", "score": 80},
+            ],
+            new_records=[{"name": "Alice", "score": 90}],
+        )
+
+        # Use real SQLite adapter for initial load, then mock for diff sync
+        adapter = _sqlite_adapter()
+        with adapter:
+            source_dir = get_source_dir(tmp_path / "data", "items")
+            loader = DatasetDbLoader(config, adapter)
+            loader._full_insert(
+                sources[0], "items", source_dir / "2026-01-01.parquet"
+            )
+
+            # Patch dialect to postgres and spy on execute
+            loader2 = DatasetDbLoader(config, adapter)
+            loader2._dialect = "postgres"
+            original_execute = adapter.execute
+            calls = []
+
+            def spy_execute(sql, params=None):
+                calls.append((sql, params))
+                # Replace %s with ? for SQLite execution
+                original_execute(sql.replace("%s", "?"), params)
+
+            adapter.execute = spy_execute
+            loader2.load_all()
+
+            # Verify DELETE used %s
+            delete_calls = [c for c in calls if "DELETE" in c[0]]
+            assert len(delete_calls) == 1
+            assert "%s" in delete_calls[0][0]
+            assert "?" not in delete_calls[0][0]
+
+    def test_update_uses_percent_s(self, tmp_path):
+        """UPDATE query uses %s placeholders for postgres."""
+        sources = [
+            DatasetSource(
+                id="items", endpoint="/api/items",
+                db_load=DbLoadConfig(key="name"),
+            ),
+        ]
+        config = _make_config(tmp_path, sources)
+
+        self._setup_two_snapshots(
+            tmp_path, "items",
+            old_records=[{"name": "Alice", "score": 90}],
+            new_records=[{"name": "Alice", "score": 95}],
+        )
+
+        adapter = _sqlite_adapter()
+        with adapter:
+            source_dir = get_source_dir(tmp_path / "data", "items")
+            loader = DatasetDbLoader(config, adapter)
+            loader._full_insert(
+                sources[0], "items", source_dir / "2026-01-01.parquet"
+            )
+
+            loader2 = DatasetDbLoader(config, adapter)
+            loader2._dialect = "postgres"
+            original_execute = adapter.execute
+            calls = []
+
+            def spy_execute(sql, params=None):
+                calls.append((sql, params))
+                original_execute(sql.replace("%s", "?"), params)
+
+            adapter.execute = spy_execute
+            loader2.load_all()
+
+            update_calls = [c for c in calls if "UPDATE" in c[0]]
+            assert len(update_calls) == 1
+            assert "%s" in update_calls[0][0]
+            assert "?" not in update_calls[0][0]
{anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_differ.py

@@ -8,6 +8,7 @@ import pytest
 from anysite.dataset.differ import (
     DatasetDiffer,
     DiffResult,
+    _values_differ,
     format_diff_records,
     format_diff_table,
 )

@@ -336,7 +337,8 @@ class TestFormatDiffRecords:
         assert len(rows) == 1
         assert rows[0]["name"] == "Alice Updated"
         assert rows[0]["name__old"] == "Alice"
-        assert "_changed_fields"
+        assert "_changed_fields" in rows[0]
+        assert rows[0]["_changed_fields"] == ["name"]


 class TestDotNotationKey:

@@ -481,3 +483,92 @@ class TestOutputFieldFiltering:
         assert "name" not in rows[0]
         assert "name__old" not in rows[0]
         assert "urn" in rows[0]  # key always included
+
+
+class TestChangedFieldsDetail:
+    """Verify _changed_fields and __old columns appear in output."""
+
+    def test_changed_fields_in_json_output(self):
+        """format_diff_records includes _changed_fields for changed records."""
+        result = DiffResult(
+            source_id="items",
+            from_date=date(2026, 1, 1),
+            to_date=date(2026, 1, 2),
+            key="id",
+            changed=[{
+                "id": "1",
+                "name": "Alice Updated",
+                "name__old": "Alice",
+                "score": 90,
+                "score__old": 90,
+                "_changed_fields": ["name"],
+            }],
+        )
+
+        rows = format_diff_records(result)
+        assert len(rows) == 1
+        assert "_changed_fields" in rows[0]
+        assert rows[0]["_changed_fields"] == ["name"]
+        assert "name__old" in rows[0]
+        assert rows[0]["name__old"] == "Alice"
+
+    def test_changed_fields_not_in_table_output(self):
+        """format_diff_table uses old→new arrows instead of _changed_fields."""
+        result = DiffResult(
+            source_id="items",
+            from_date=date(2026, 1, 1),
+            to_date=date(2026, 1, 2),
+            key="id",
+            changed=[{
+                "id": "1",
+                "name": "Bob",
+                "name__old": "Alice",
+                "score": 90,
+                "score__old": 90,
+                "_changed_fields": ["name"],
+            }],
+        )
+
+        rows = format_diff_table(result)
+        assert len(rows) == 1
+        assert "_changed_fields" not in rows[0]
+        assert "→" in rows[0]["name"]
+        assert rows[0]["score"] == 90  # unchanged, no arrow
+
+    def test_values_differ_with_dicts(self):
+        """_values_differ handles dict comparison correctly."""
+        assert not _values_differ({"a": 1}, {"a": 1})
+        assert _values_differ({"a": 1}, {"a": 2})
+        assert not _values_differ({"b": 2, "a": 1}, {"a": 1, "b": 2})  # key order
+
+    def test_values_differ_with_lists(self):
+        """_values_differ handles list comparison."""
+        assert not _values_differ([1, 2], [1, 2])
+        assert _values_differ([1, 2], [1, 3])
+
+    def test_values_differ_dict_vs_string(self):
+        """_values_differ handles dict vs JSON string comparison."""
+        assert _values_differ({"a": 1}, '{"a": 1}')
+
+    def test_fallback_all_compare_fields(self, tmp_path):
+        """When Python can't identify changed fields, all compare fields are marked."""
+        source_dir = tmp_path / "raw" / "items"
+        source_dir.mkdir(parents=True)
+
+        # Write old and new snapshots with a value DuckDB sees as different
+        write_parquet(
+            [{"id": "1", "data": json.dumps({"x": 1})}],
+            source_dir / "2026-01-01.parquet",
+        )
+        write_parquet(
+            [{"id": "1", "data": json.dumps({"x": 1, "y": 2})}],
+            source_dir / "2026-01-02.parquet",
+        )
+
+        differ = DatasetDiffer(tmp_path)
+        result = differ.diff("items", "id")
+
+        assert len(result.changed) == 1
+        record = result.changed[0]
+        assert "_changed_fields" in record
+        assert len(record["_changed_fields"]) > 0