anysite-cli 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/CLAUDE.md +9 -4
  2. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/PKG-INFO +14 -3
  3. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/README.md +13 -2
  4. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/pyproject.toml +1 -1
  5. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/skills/anysite-cli/SKILL.md +22 -1
  6. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/skills/anysite-cli/references/dataset-guide.md +34 -3
  7. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/cli.py +10 -1
  8. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/collector.py +4 -3
  9. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/db_loader.py +166 -23
  10. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/differ.py +203 -48
  11. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/models.py +5 -0
  12. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/storage.py +21 -1
  13. anysite_cli-0.1.5/tests/test_dataset/test_db_loader.py +738 -0
  14. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_differ.py +236 -0
  15. anysite_cli-0.1.3/tests/test_dataset/test_db_loader.py +0 -346
  16. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/.claude/settings.local.json +0 -0
  17. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/.gitignore +0 -0
  18. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/LICENSE +0 -0
  19. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/skills/anysite-cli/references/api-reference.md +0 -0
  20. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/__init__.py +0 -0
  21. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/__main__.py +0 -0
  22. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/api/__init__.py +0 -0
  23. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/api/client.py +0 -0
  24. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/api/errors.py +0 -0
  25. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/api/schemas.py +0 -0
  26. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/batch/__init__.py +0 -0
  27. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/batch/executor.py +0 -0
  28. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/batch/input.py +0 -0
  29. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/batch/rate_limiter.py +0 -0
  30. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/cli/__init__.py +0 -0
  31. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/cli/config.py +0 -0
  32. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/cli/executor.py +0 -0
  33. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/cli/options.py +0 -0
  34. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/config/__init__.py +0 -0
  35. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/config/paths.py +0 -0
  36. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/config/settings.py +0 -0
  37. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/__init__.py +0 -0
  38. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/analyzer.py +0 -0
  39. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/errors.py +0 -0
  40. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/exporters.py +0 -0
  41. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/history.py +0 -0
  42. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/notifications.py +0 -0
  43. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/scheduler.py +0 -0
  44. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/transformer.py +0 -0
  45. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/__init__.py +0 -0
  46. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/adapters/__init__.py +0 -0
  47. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/adapters/base.py +0 -0
  48. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/adapters/postgres.py +0 -0
  49. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/adapters/sqlite.py +0 -0
  50. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/cli.py +0 -0
  51. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/config.py +0 -0
  52. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/manager.py +0 -0
  53. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/operations/__init__.py +0 -0
  54. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/operations/insert.py +0 -0
  55. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/operations/query.py +0 -0
  56. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/schema/__init__.py +0 -0
  57. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/schema/inference.py +0 -0
  58. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/schema/types.py +0 -0
  59. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/utils/__init__.py +0 -0
  60. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/db/utils/sanitize.py +0 -0
  61. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/main.py +0 -0
  62. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/models/__init__.py +0 -0
  63. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/output/__init__.py +0 -0
  64. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/output/console.py +0 -0
  65. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/output/formatters.py +0 -0
  66. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/output/templates.py +0 -0
  67. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/py.typed +0 -0
  68. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/streaming/__init__.py +0 -0
  69. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/streaming/progress.py +0 -0
  70. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/streaming/writer.py +0 -0
  71. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/utils/__init__.py +0 -0
  72. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/utils/fields.py +0 -0
  73. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/utils/retry.py +0 -0
  74. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/enriched_partners_sample_10.csv +0 -0
  75. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/linkedin-partners/company_aliases.txt +0 -0
  76. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/linkedin-partners/dataset.yaml +0 -0
  77. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/partners-deep/dataset.yaml +0 -0
  78. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/partners-intel/dataset.yaml +0 -0
  79. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/partners-linkedin/company_aliases.txt +0 -0
  80. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/partners-linkedin/dataset.yaml +0 -0
  81. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/test_data/partners-pipeline/dataset.yaml +0 -0
  82. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/__init__.py +0 -0
  83. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/conftest.py +0 -0
  84. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_api/__init__.py +0 -0
  85. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_batch/__init__.py +0 -0
  86. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_batch/test_executor.py +0 -0
  87. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_batch/test_input.py +0 -0
  88. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_batch/test_rate_limiter.py +0 -0
  89. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_cli/__init__.py +0 -0
  90. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_cli/test_main.py +0 -0
  91. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/__init__.py +0 -0
  92. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_analyzer.py +0 -0
  93. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_collector.py +0 -0
  94. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_exporters.py +0 -0
  95. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_history.py +0 -0
  96. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_integration_csv.py +0 -0
  97. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_models.py +0 -0
  98. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_notifications.py +0 -0
  99. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_scheduler.py +0 -0
  100. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_storage.py +0 -0
  101. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_dataset/test_transformer.py +0 -0
  102. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/__init__.py +0 -0
  103. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_cli.py +0 -0
  104. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_config.py +0 -0
  105. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_inference.py +0 -0
  106. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_insert.py +0 -0
  107. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_manager.py +0 -0
  108. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_postgres_adapter.py +0 -0
  109. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_sanitize.py +0 -0
  110. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_db/test_sqlite_adapter.py +0 -0
  111. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_output/__init__.py +0 -0
  112. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_output/test_formatters.py +0 -0
  113. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_output/test_templates.py +0 -0
  114. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_streaming/__init__.py +0 -0
  115. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_streaming/test_progress.py +0 -0
  116. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_streaming/test_writer.py +0 -0
  117. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_utils/__init__.py +0 -0
  118. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_utils/test_fields.py +0 -0
  119. {anysite_cli-0.1.3 → anysite_cli-0.1.5}/tests/test_utils/test_retry.py +0 -0

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/CLAUDE.md

@@ -49,12 +49,14 @@ anysite dataset query dataset.yaml --interactive
  anysite dataset stats dataset.yaml --source profiles
  anysite dataset profile dataset.yaml
  anysite dataset load-db dataset.yaml -c pg --drop-existing
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
  anysite dataset history my-dataset
  anysite dataset logs my-dataset --run 42
  anysite dataset schedule dataset.yaml --incremental --load-db pg
  anysite dataset schedule dataset.yaml --systemd --load-db pg
  anysite dataset diff dataset.yaml --source profiles --key _input_value
- anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --from 2026-01-30 --to 2026-02-01
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline,follower_count"
  anysite dataset reset-cursor dataset.yaml
  anysite dataset reset-cursor dataset.yaml --source profiles

@@ -104,9 +106,9 @@ anysite db upsert mydb --table users --conflict-columns id --stdin
  - `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
  - `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
  - `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
- - `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters
+ - `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). Supports dot-notation keys via `json_extract_string()`. `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters with output field filtering
  - `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `diff`, `history`, `logs`, `schedule`, `reset-cursor`
- - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
+ - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference, and diff-based incremental sync (`db_load.key` + `db_load.sync: full|append`); `--snapshot` loads a specific date
  - `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
  - `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
  - `db/config.py` - `ConnectionConfig`, `DatabaseType`, `OnConflict` enums and models
@@ -164,8 +166,11 @@ Sources are topologically sorted by dependencies. `input_template` allows transf
  - Schema inference from Parquet records via `infer_table_schema()`
  - Auto-increment `id` primary key per table
  - FK linking via provenance: parent `_input_value` → child `{parent}_id` column
- - Optional `db_load` config per source: field selection, dot-notation extraction, custom table names, field exclusion
+ - Optional `db_load` config per source: field selection, dot-notation extraction, custom table names, field exclusion, `key` for diff-based incremental sync
  - Topological loading order (parents before children)
+ - Diff-based incremental sync: when `db_load.key` is set and table exists with >=2 snapshots, diffs the two most recent and applies INSERT/DELETE/UPDATE delta
+ - `--snapshot YYYY-MM-DD` flag to load a specific snapshot date
+ - `--drop-existing` forces full INSERT of latest snapshot

  **Dataset Storage Layout**:
  ```

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: anysite-cli
- Version: 0.1.3
+ Version: 0.1.5
  Summary: CLI for Anysite API - web data extraction for humans and AI agents
  Project-URL: Homepage, https://anysite.io
  Project-URL: Documentation, https://docs.anysite.io/cli
@@ -259,6 +259,8 @@ sources:
  path: ./output/companies-{{date}}.csv
  format: csv
  db_load:
+ key: _input_value # Unique key for incremental sync
+ sync: full # full (default) or append (no DELETE)
  fields: [name, url, employee_count]

  - id: employees
@@ -274,6 +276,8 @@ sources:
  count: 5
  refresh: always # Re-collect every run with --incremental
  db_load:
+ key: urn.value # Unique key for incremental sync
+ sync: append # Keep old records (no DELETE on diff)
  fields: [name, url, headline]

  storage:
@@ -318,9 +322,15 @@ anysite dataset query dataset.yaml --interactive
  anysite dataset stats dataset.yaml --source companies
  anysite dataset profile dataset.yaml

- # Load into PostgreSQL with automatic FK linking
+ # Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+ anysite dataset load-db dataset.yaml -c pg
+
+ # Drop and reload from latest snapshot
  anysite dataset load-db dataset.yaml -c pg --drop-existing

+ # Load a specific snapshot date
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
  # Run history and logs
  anysite dataset history my-dataset
  anysite dataset logs my-dataset --run 42
@@ -328,8 +338,9 @@ anysite dataset logs my-dataset --run 42
  # Generate cron/systemd schedule
  anysite dataset schedule dataset.yaml --incremental --load-db pg

- # Compare snapshots (diff two collection dates)
+ # Compare snapshots (diff two collection dates, supports dot-notation keys)
  anysite dataset diff dataset.yaml --source employees --key _input_value
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"

  # Reset incremental state
  anysite dataset reset-cursor dataset.yaml

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/README.md

@@ -196,6 +196,8 @@ sources:
  path: ./output/companies-{{date}}.csv
  format: csv
  db_load:
+ key: _input_value # Unique key for incremental sync
+ sync: full # full (default) or append (no DELETE)
  fields: [name, url, employee_count]

  - id: employees
@@ -211,6 +213,8 @@ sources:
  count: 5
  refresh: always # Re-collect every run with --incremental
  db_load:
+ key: urn.value # Unique key for incremental sync
+ sync: append # Keep old records (no DELETE on diff)
  fields: [name, url, headline]

  storage:
@@ -255,9 +259,15 @@ anysite dataset query dataset.yaml --interactive
  anysite dataset stats dataset.yaml --source companies
  anysite dataset profile dataset.yaml

- # Load into PostgreSQL with automatic FK linking
+ # Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+ anysite dataset load-db dataset.yaml -c pg
+
+ # Drop and reload from latest snapshot
  anysite dataset load-db dataset.yaml -c pg --drop-existing

+ # Load a specific snapshot date
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
  # Run history and logs
  anysite dataset history my-dataset
  anysite dataset logs my-dataset --run 42
@@ -265,8 +275,9 @@ anysite dataset logs my-dataset --run 42
  # Generate cron/systemd schedule
  anysite dataset schedule dataset.yaml --incremental --load-db pg

- # Compare snapshots (diff two collection dates)
+ # Compare snapshots (diff two collection dates, supports dot-notation keys)
  anysite dataset diff dataset.yaml --source employees --key _input_value
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"

  # Reset incremental state
  anysite dataset reset-cursor dataset.yaml

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

  [project]
  name = "anysite-cli"
- version = "0.1.3"
+ version = "0.1.5"
  description = "CLI for Anysite API - web data extraction for humans and AI agents"
  readme = "README.md"
  license = "MIT"

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/skills/anysite-cli/SKILL.md

@@ -117,6 +117,8 @@ sources:
  path: ./output/companies-{{date}}.csv
  format: csv
  db_load:
+ key: _input_value # Unique key for diff-based incremental sync
+ sync: full # full (INSERT/DELETE/UPDATE) or append (no DELETE)
  fields: [name, url, employee_count]

  - id: employees
@@ -189,18 +191,32 @@ anysite dataset profile dataset.yaml
  # Load all sources with FK linking
  anysite dataset load-db dataset.yaml -c pg --drop-existing

+ # Incremental sync (uses diff when db_load.key is set)
+ anysite dataset load-db dataset.yaml -c pg
+
+ # Load a specific snapshot date
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
  # Dry run
  anysite dataset load-db dataset.yaml -c pg --dry-run
  ```

  `load-db` auto-creates tables with inferred schema, adds `id` primary key, and links child tables to parents via `{parent}_id` FK columns using provenance data.

+ **Incremental sync**: When `db_load.key` is set and the table already exists with >=2 snapshots, `load-db` diffs the two most recent snapshots and applies only the delta (INSERT added, DELETE removed, UPDATE changed). Without `db_load.key`, it does a full INSERT of the latest snapshot.
+
+ **Sync modes** (`db_load.sync`):
+ - `full` (default) — applies INSERT, DELETE, and UPDATE from diff
+ - `append` — applies INSERT and UPDATE only, skips DELETE (keeps records that disappeared from the API). Use for sources where the API returns only the latest N items (e.g., posts, activity feeds).
+
  Optional `db_load` config per source controls which fields go to DB:
  ```yaml
  - id: profiles
  endpoint: /api/linkedin/user
  db_load:
  table: people # Custom table name
+ key: urn.value # Unique key for diff-based incremental sync
+ sync: append # Keep old records (no DELETE on diff)
  fields: # Select specific fields
  - name
  - urn.value AS urn_id # Dot-notation extraction
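
The `full`/`append` distinction documented above comes down to which parts of a snapshot diff get written back to the table. A condensed, self-contained sketch of that rule, with illustrative names only (the package's real logic is `DatasetDbLoader._diff_sync`, visible in the `db_loader.py` hunks later in this diff):

```python
from dataclasses import dataclass, field
from typing import Any


@dataclass
class SnapshotDiff:
    """Minimal stand-in for the differ's result: records added, removed, or changed between snapshots."""
    added: list[dict[str, Any]] = field(default_factory=list)
    removed: list[dict[str, Any]] = field(default_factory=list)
    changed: list[dict[str, Any]] = field(default_factory=list)


def plan_delta(diff: SnapshotDiff, sync_mode: str = "full") -> dict[str, int]:
    """Count the row operations a load would apply for a given sync mode."""
    ops = {"insert": len(diff.added), "update": len(diff.changed), "delete": 0}
    if sync_mode == "full":
        # append mode skips deletes, keeping rows that dropped out of the API response
        ops["delete"] = len(diff.removed)
    return ops


diff = SnapshotDiff(added=[{"name": "a"}], removed=[{"name": "b"}])
print(plan_delta(diff, "full"))    # {'insert': 1, 'update': 0, 'delete': 1}
print(plan_delta(diff, "append"))  # {'insert': 1, 'update': 0, 'delete': 0}
```

In `append` mode nothing is ever deleted, which is why the docs above recommend it for feed-style endpoints that only return the latest N items.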
@@ -241,11 +257,16 @@ anysite api /api/linkedin/user user=satyanadella -q --format jsonl \
  # Diff two most recent snapshots
  anysite dataset diff dataset.yaml --source employees --key _input_value

- # Diff specific dates, compare only certain fields
+ # Diff with dot-notation key (for JSON fields like urn)
+ anysite dataset diff dataset.yaml --source profiles --key urn.value
+
+ # Diff specific dates, compare and output only certain fields
  anysite dataset diff dataset.yaml --source employees --key _input_value \
  --from 2026-01-30 --to 2026-02-01 --fields "name,headline"
  ```

+ `--key` supports dot-notation for JSON fields (e.g., `urn.value`). `--fields` restricts both comparison and output columns.
+
  ### Step 7: History, Scheduling, and Notifications
  ```bash
  # View run history

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/skills/anysite-cli/references/dataset-guide.md

@@ -39,6 +39,8 @@ sources:
  headers: {X-Token: abc}
  db_load: # Database loading config
  table: custom_name # Override table name
+ key: urn.value # Unique key for diff-based incremental sync
+ sync: full # full (INSERT/DELETE/UPDATE) or append (no DELETE)
  fields: [name, url] # Fields to include
  exclude: [_input_value] # Fields to exclude

@@ -244,6 +246,7 @@ anysite dataset load-db dataset.yaml -c <connection_name> [OPTIONS]
  --connection, -c TEXT Database connection name (required)
  --source, -s TEXT Load specific source + dependencies
  --drop-existing Drop tables before creating
+ --snapshot TEXT Load a specific snapshot date (YYYY-MM-DD)
  --dry-run Show plan without executing
  --quiet, -q Suppress output
  ```
@@ -256,6 +259,29 @@ anysite dataset load-db dataset.yaml -c <connection_name> [OPTIONS]
  4. Inserts rows, tracking which `_input_value` maps to which `id`
  5. For child sources: adds `{parent_source}_id` FK column using provenance

+ ### Incremental Sync with `db_load.key`
+
+ When `db_load.key` is set and the table already exists with >=2 snapshots, `load-db` uses diff-based incremental sync instead of full re-insertion:
+
+ 1. Compares the two most recent Parquet snapshots using `DatasetDiffer`
+ 2. **Added** records → INSERT into DB
+ 3. **Removed** records → DELETE from DB (by key) — only in `sync: full` mode
+ 4. **Changed** records → UPDATE modified fields (by key)
+
+ This keeps the database in sync without duplicates.
+
+ **Sync modes** (`db_load.sync`):
+ - `full` (default) — applies INSERT, DELETE, and UPDATE from diff
+ - `append` — applies INSERT and UPDATE only, skips DELETE. Use for sources where the API returns only the latest N items (e.g., posts, comments) and you want to accumulate records over time.
+
+ | Scenario | Behavior |
+ |----------|----------|
+ | First load (table doesn't exist) | Full INSERT of latest snapshot |
+ | Table exists + `db_load.key` + >=2 snapshots | Diff-based sync (INSERT/DELETE/UPDATE delta) |
+ | `--drop-existing` | Drop table, full INSERT of latest snapshot |
+ | `--snapshot 2026-01-15` | Full INSERT of that specific snapshot |
+ | No `db_load.key` set | Full INSERT of latest snapshot (no diff) |
+
  ### db_load Config

  Control which fields go to the database per source:
@@ -263,6 +289,8 @@ Control which fields go to the database per source:
  ```yaml
  db_load:
  table: people # Custom table name (default: source ID)
+ key: urn.value # Unique key for diff-based incremental sync
+ sync: append # full (default) or append (no DELETE on diff)
  fields: # Explicit field list
  - name
  - url
@@ -456,8 +484,11 @@ Compare two collection snapshots to find added, removed, and changed records.
  # Compare two most recent snapshots (auto-detect dates)
  anysite dataset diff dataset.yaml --source profiles --key _input_value

+ # Compare with dot-notation key (JSON fields)
+ anysite dataset diff dataset.yaml --source profiles --key urn.value
+
  # Compare specific dates
- anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --from 2026-01-30 --to 2026-02-01

  # Only compare specific fields
  anysite dataset diff dataset.yaml --source profiles --key urn --fields "name,headline,follower_count"
@@ -468,9 +499,9 @@ anysite dataset diff dataset.yaml --source profiles --key urn --format json --ou

  **Options:**
  - `--source, -s` (required) — source to compare
- - `--key, -k` (required) — field to match records by (e.g., `_input_value`, `urn`)
+ - `--key, -k` (required) — field to match records by. Supports dot-notation for JSON fields (e.g., `urn.value`)
  - `--from` / `--to` — snapshot dates (default: two most recent)
- - `--fields, -f` — only compare these fields
+ - `--fields, -f` — restrict both comparison and output to these fields
  - `--format` — output format (table, json, jsonl, csv)
  - `--output, -o` — write to file

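The `DatasetDiffer` behind this command can also be driven from Python. The constructor, `available_dates()`, `diff()`, and the `added`/`removed`/`changed` result attributes all appear in the `db_loader.py` changes further down; the snippet below is only a usage sketch against collected snapshots (the dataset path and source name are illustrative, not documented API):

```python
from pathlib import Path

from anysite.dataset.differ import DatasetDiffer

base_path = Path("./my-dataset")            # dataset storage directory (illustrative)

differ = DatasetDiffer(base_path)
dates = differ.available_dates("profiles")  # snapshot dates found for the source
if len(dates) >= 2:
    # Same semantics as the CLI: dot-notation keys reach into JSON fields
    result = differ.diff("profiles", "urn.value")
    print(f"added={len(result.added)} removed={len(result.removed)} changed={len(result.changed)}")
```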

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/cli.py

@@ -357,6 +357,10 @@ def load_db(
  bool,
  typer.Option("--quiet", "-q", help="Suppress progress output"),
  ] = False,
+ snapshot: Annotated[
+ str | None,
+ typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
+ ] = None,
  ) -> None:
  """Load collected Parquet data into a relational database with FK linking."""
  config = _load_config(config_path)
@@ -379,6 +383,7 @@ def load_db(
  source_filter=source,
  drop_existing=drop_existing,
  dry_run=dry_run,
+ snapshot=snapshot,
  )
  except Exception as e:
  typer.echo(f"Load error: {e}", err=True)
@@ -519,7 +524,11 @@ def diff_cmd(
  return

  # Format and output
- rows = format_diff_table(result) if format == "table" else format_diff_records(result)
+ rows = (
+ format_diff_table(result, output_fields=field_list)
+ if format == "table"
+ else format_diff_records(result, output_fields=field_list)
+ )

  _output_results(rows, format, output)

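The `differ.py` side of this change (+203 −48) is not shown in this section, so how `output_fields` is applied inside the formatters is not visible here. The sketch below only illustrates the general idea of restricting output columns to the requested fields; every name in it is an assumption, not the package's formatter:

```python
from typing import Any


def restrict_fields(
    records: list[dict[str, Any]],
    output_fields: list[str] | None,
    always_keep: tuple[str, ...] = ("_changed_fields",),  # bookkeeping column name assumed
) -> list[dict[str, Any]]:
    """Drop columns that were not requested, keeping diff bookkeeping columns."""
    if not output_fields:
        return records
    keep = set(output_fields) | set(always_keep)
    return [{k: v for k, v in rec.items() if k in keep} for rec in records]


rows = [{"name": "Ada", "headline": "Engineer", "follower_count": 10}]
print(restrict_fields(rows, ["name", "headline"]))
# [{'name': 'Ada', 'headline': 'Engineer'}]
```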

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/collector.py

@@ -19,6 +19,7 @@ from anysite.dataset.models import DatasetConfig, DatasetSource
  from anysite.dataset.storage import (
  MetadataStore,
  get_parquet_path,
+ read_latest_parquet,
  read_parquet,
  write_parquet,
  )
@@ -412,9 +413,9 @@ async def _collect_dependent(
  if dep is None:
  raise DatasetError(f"Source {source.id} has no dependency defined")

- # Read parent data
+ # Read parent data (latest snapshot only to avoid schema mismatch)
  parent_dir = base_path / "raw" / dep.from_source
- parent_records = read_parquet(parent_dir)
+ parent_records = read_latest_parquet(parent_dir)

  if not parent_records:
  if not quiet:
@@ -627,7 +628,7 @@ def _count_dependent_inputs(
  if dep is None:
  return None
  parent_dir = base_path / "raw" / dep.from_source
- parent_records = read_parquet(parent_dir)
+ parent_records = read_latest_parquet(parent_dir)
  if not parent_records:
  info = metadata.get_source_info(dep.from_source)
  return info.get("record_count") if info else None
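
`read_latest_parquet` is imported and called above but lives in `storage.py` (changed by +21 −1, not shown in this section). A rough sketch of the behavior the collector now relies on — read only the newest date-named snapshot — assuming one `YYYY-MM-DD.parquet` file per collection date and pyarrow for reading; both are assumptions, the real helper may differ:

```python
from pathlib import Path
from typing import Any

import pyarrow.parquet as pq


def read_latest_parquet(source_dir: Path) -> list[dict[str, Any]]:
    """Read records from the most recent snapshot only (assumed layout: one YYYY-MM-DD.parquet per day)."""
    if not source_dir.exists():
        return []
    files = sorted(source_dir.glob("*.parquet"))
    if not files:
        return []
    # ISO-dated filenames sort lexicographically, so the last file is the newest snapshot.
    return pq.read_table(files[-1]).to_pylist()
```

Reading a single snapshot sidesteps the schema-mismatch problem the new comment mentions: older snapshots of the same source may carry different columns, which breaks a combined read of the whole directory.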

{anysite_cli-0.1.3 → anysite_cli-0.1.5}/src/anysite/dataset/db_loader.py

@@ -3,12 +3,18 @@
  from __future__ import annotations

  import json
+ import logging
+ from datetime import date
+ from pathlib import Path
  from typing import Any

  from anysite.dataset.models import DatasetConfig, DatasetSource
  from anysite.dataset.storage import get_source_dir, read_parquet
  from anysite.db.adapters.base import DatabaseAdapter
  from anysite.db.schema.inference import infer_table_schema
+ from anysite.db.utils.sanitize import sanitize_identifier
+
+ logger = logging.getLogger(__name__)


  def _get_dialect(adapter: DatabaseAdapter) -> str:
@@ -86,15 +92,31 @@ def _filter_record(
  return {k: v for k, v in record.items() if k not in exclude}


+ def _get_latest_parquet(base_path: Path, source_id: str) -> Path | None:
+ """Return the path to the most recent snapshot for a source."""
+ source_dir = get_source_dir(base_path, source_id)
+ if not source_dir.exists():
+ return None
+ files = sorted(source_dir.glob("*.parquet"))
+ return files[-1] if files else None
+
+
+ def _get_snapshot_for_date(base_path: Path, source_id: str, d: date) -> Path | None:
+ """Return the parquet path for a specific snapshot date."""
+ source_dir = get_source_dir(base_path, source_id)
+ path = source_dir / f"{d.isoformat()}.parquet"
+ return path if path.exists() else None
+
+
  class DatasetDbLoader:
  """Load dataset Parquet data into a relational database.

- Handles:
- - Schema inference from Parquet records
- - Auto-increment primary keys (``id`` column)
- - Foreign key linking via provenance ``_input_value`` column
- - Dot-notation field extraction for JSON columns
- - Topological loading order (parents before children)
+ Supports diff-based incremental sync when ``db_load.key`` is configured:
+ compares the two most recent snapshots and applies INSERT/DELETE/UPDATE
+ to keep the database in sync.
+
+ Falls back to full INSERT of the latest snapshot when no key is set
+ or when the table doesn't exist yet.
  """

  def __init__(
@@ -115,16 +137,18 @@ class DatasetDbLoader:
  source_filter: str | None = None,
  drop_existing: bool = False,
  dry_run: bool = False,
+ snapshot: str | None = None,
  ) -> dict[str, int]:
  """Load all sources into the database in dependency order.

  Args:
  source_filter: Only load this source (and dependencies).
- drop_existing: Drop tables before creating.
+ drop_existing: Drop tables before creating, then full INSERT latest.
  dry_run: Show plan without executing.
+ snapshot: Load a specific snapshot date (YYYY-MM-DD).

  Returns:
- Mapping of source_id to number of rows loaded.
+ Mapping of source_id to number of rows loaded/affected.
  """
  sources = self.config.topological_sort()

@@ -139,6 +163,7 @@ class DatasetDbLoader:
  source,
  drop_existing=drop_existing,
  dry_run=dry_run,
+ snapshot=snapshot,
  )
  results[source.id] = count

@@ -150,18 +175,64 @@ class DatasetDbLoader:
  *,
  drop_existing: bool = False,
  dry_run: bool = False,
+ snapshot: str | None = None,
  ) -> int:
- """Load a single source into the database."""
- source_dir = get_source_dir(self.base_path, source.id)
- if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
+ """Load a single source into the database.
+
+ Strategy:
+ 1. ``drop_existing``: drop table → full INSERT of latest snapshot
+ 2. ``snapshot``: full INSERT of that specific snapshot
+ 3. Table doesn't exist: full INSERT of latest snapshot
+ 4. Table exists + ``db_load.key`` set + ≥2 snapshots: diff-based sync
+ 5. Fallback: full INSERT of latest snapshot
+ """
+ table_name = _table_name_for(source)
+
+ # Handle drop_existing
+ if drop_existing and self.adapter.table_exists(table_name):
+ self.adapter.execute(f"DROP TABLE {table_name}")
+
+ # Determine which parquet to load
+ if snapshot:
+ snapshot_date = date.fromisoformat(snapshot)
+ parquet_path = _get_snapshot_for_date(self.base_path, source.id, snapshot_date)
+ if parquet_path is None:
+ return 0
+ return self._full_insert(source, table_name, parquet_path, dry_run=dry_run)
+
+ # Check if we can do diff-based sync
+ diff_key = source.db_load.key if source.db_load else None
+ table_exists = self.adapter.table_exists(table_name)
+
+ if diff_key and table_exists and not drop_existing:
+ from anysite.dataset.differ import DatasetDiffer
+ differ = DatasetDiffer(self.base_path)
+ dates = differ.available_dates(source.id)
+
+ if len(dates) >= 2:
+ return self._diff_sync(
+ source, table_name, diff_key, differ, dates, dry_run=dry_run
+ )
+
+ # Fallback: full INSERT of latest snapshot
+ latest = _get_latest_parquet(self.base_path, source.id)
+ if latest is None:
  return 0
+ return self._full_insert(source, table_name, latest, dry_run=dry_run)

- raw_records = read_parquet(source_dir)
+ def _full_insert(
+ self,
+ source: DatasetSource,
+ table_name: str,
+ parquet_path: Path,
+ *,
+ dry_run: bool = False,
+ ) -> int:
+ """Full INSERT: read parquet, transform, create table if needed, insert all rows."""
+ raw_records = read_parquet(parquet_path)
  if not raw_records:
  return 0

- table_name = _table_name_for(source)
-
  # Determine parent info for FK linking
  parent_source_id = None
  parent_fk_col = None
@@ -174,7 +245,6 @@ class DatasetDbLoader:
  for record in raw_records:
  row = _filter_record(record, source)

- # Add FK column if this is a dependent source
  if parent_source_id and parent_fk_col:
  input_val = record.get("_input_value")
  parent_map = self._value_to_id.get(parent_source_id, {})
@@ -189,17 +259,12 @@ class DatasetDbLoader:
  return len(rows)

  # Determine the lookup field for children to reference this source
- # This is the field that child dependencies extract from this source
  lookup_field = self._get_child_lookup_field(source)

- # Create table
- if drop_existing and self.adapter.table_exists(table_name):
- self.adapter.execute(f"DROP TABLE {table_name}")
-
+ # Create table if needed
  if not self.adapter.table_exists(table_name):
  schema = infer_table_schema(table_name, rows)
  sql_types = schema.to_sql_types(self._dialect)
- # Add auto-increment id column
  col_defs = {"id": self._auto_id_type()}
  col_defs.update(sql_types)
  self.adapter.create_table(table_name, col_defs, primary_key="id")
@@ -208,10 +273,8 @@ class DatasetDbLoader:
  value_map: dict[str, int] = {}
  for i, row in enumerate(rows):
  self.adapter.insert_batch(table_name, [row])
- # Get the last inserted id
  last_id = self._get_last_id(table_name)

- # Build value→id map for child sources
  if lookup_field and last_id is not None:
  raw_record = raw_records[i]
  lookup_val = _extract_dot_value(raw_record, lookup_field)
@@ -225,6 +288,86 @@ class DatasetDbLoader:

  return len(rows)

+ def _diff_sync(
+ self,
+ source: DatasetSource,
+ table_name: str,
+ diff_key: str,
+ differ: Any,
+ dates: list[date],
+ *,
+ dry_run: bool = False,
+ ) -> int:
+ """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
+ result = differ.diff(source.id, diff_key)
+ total = 0
+ sync_mode = source.db_load.sync if source.db_load else "full"
+
+ if dry_run:
+ count = len(result.added) + len(result.changed)
+ if sync_mode == "full":
+ count += len(result.removed)
+ return count
+
+ # Extract key value from a record (handles dot-notation)
+ def _get_key_val(record: dict[str, Any]) -> Any:
+ if "." in diff_key:
+ return _extract_dot_value(record, diff_key)
+ return record.get(diff_key)
+
+ # Determine the DB column name for the key
+ db_key_col = diff_key.replace(".", "_")
+
+ # INSERT added records
+ if result.added:
+ for record in result.added:
+ row = _filter_record(record, source)
+ self.adapter.insert_batch(table_name, [row])
+ total += 1
+
+ # DELETE removed records (skipped in append mode)
+ if result.removed and sync_mode == "full":
+ safe_col = sanitize_identifier(db_key_col)
+ for record in result.removed:
+ key_val = _get_key_val(record)
+ if key_val is not None:
+ self.adapter.execute(
+ f"DELETE FROM {table_name} WHERE {safe_col} = ?",
+ (str(key_val),),
+ )
+ total += 1
+
+ # UPDATE changed records
+ if result.changed:
+ safe_col = sanitize_identifier(db_key_col)
+ for record in result.changed:
+ key_val = _get_key_val(record)
+ if key_val is None:
+ continue
+ changed_fields = record.get("_changed_fields", [])
+ if not changed_fields:
+ continue
+
+ # Build SET clause from changed fields
+ set_parts = []
+ params: list[Any] = []
+ for field_name in changed_fields:
+ new_val = record.get(field_name)
+ safe_field = sanitize_identifier(field_name)
+ set_parts.append(f"{safe_field} = ?")
+ params.append(new_val)
+
+ params.append(str(key_val))
+ sql = (
+ f"UPDATE {table_name} "
+ f"SET {', '.join(set_parts)} "
+ f"WHERE {safe_col} = ?"
+ )
+ self.adapter.execute(sql, tuple(params))
+ total += 1
+
+ return total
+
  def _get_child_lookup_field(self, source: DatasetSource) -> str | None:
  """Find which field children use to reference this source."""
  for other in self.config.sources:
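
`_extract_dot_value`, used by both `_full_insert` and `_diff_sync` above, is pre-existing code that this diff does not touch or show. A minimal sketch of what such a dot-notation helper typically does — walk nested dicts, decoding JSON strings along the way — offered purely as an assumption about its behavior, not the package's implementation:

```python
import json
from typing import Any


def extract_dot_value(record: dict[str, Any], path: str) -> Any:
    """Resolve a dot-notation path such as 'urn.value' against a possibly JSON-encoded nested record."""
    current: Any = record
    for part in path.split("."):
        if isinstance(current, str):
            # Parquet round-trips often leave nested objects as JSON strings.
            try:
                current = json.loads(current)
            except (ValueError, TypeError):
                return None
        if not isinstance(current, dict):
            return None
        current = current.get(part)
    return current


print(extract_dot_value({"urn": {"value": "abc123"}}, "urn.value"))     # abc123
print(extract_dot_value({"urn": '{"value": "abc123"}'}, "urn.value"))   # abc123
```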