anysite-cli 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of anysite-cli might be problematic.

Files changed (118)
  1. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/CLAUDE.md +9 -4
  2. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/PKG-INFO +14 -3
  3. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/README.md +13 -2
  4. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/pyproject.toml +1 -1
  5. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/skills/anysite-cli/SKILL.md +22 -1
  6. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/skills/anysite-cli/references/dataset-guide.md +34 -3
  7. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/collector.py +4 -3
  8. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/db_loader.py +17 -6
  9. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/differ.py +14 -0
  10. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/models.py +4 -0
  11. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/storage.py +21 -1
  12. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_db_loader.py +198 -1
  13. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_differ.py +92 -1
  14. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/.claude/settings.local.json +0 -0
  15. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/.gitignore +0 -0
  16. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/LICENSE +0 -0
  17. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/skills/anysite-cli/references/api-reference.md +0 -0
  18. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/__init__.py +0 -0
  19. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/__main__.py +0 -0
  20. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/api/__init__.py +0 -0
  21. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/api/client.py +0 -0
  22. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/api/errors.py +0 -0
  23. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/api/schemas.py +0 -0
  24. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/batch/__init__.py +0 -0
  25. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/batch/executor.py +0 -0
  26. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/batch/input.py +0 -0
  27. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/batch/rate_limiter.py +0 -0
  28. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/cli/__init__.py +0 -0
  29. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/cli/config.py +0 -0
  30. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/cli/executor.py +0 -0
  31. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/cli/options.py +0 -0
  32. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/config/__init__.py +0 -0
  33. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/config/paths.py +0 -0
  34. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/config/settings.py +0 -0
  35. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/__init__.py +0 -0
  36. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/analyzer.py +0 -0
  37. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/cli.py +0 -0
  38. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/errors.py +0 -0
  39. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/exporters.py +0 -0
  40. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/history.py +0 -0
  41. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/notifications.py +0 -0
  42. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/scheduler.py +0 -0
  43. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/dataset/transformer.py +0 -0
  44. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/__init__.py +0 -0
  45. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/adapters/__init__.py +0 -0
  46. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/adapters/base.py +0 -0
  47. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/adapters/postgres.py +0 -0
  48. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/adapters/sqlite.py +0 -0
  49. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/cli.py +0 -0
  50. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/config.py +0 -0
  51. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/manager.py +0 -0
  52. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/operations/__init__.py +0 -0
  53. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/operations/insert.py +0 -0
  54. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/operations/query.py +0 -0
  55. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/schema/__init__.py +0 -0
  56. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/schema/inference.py +0 -0
  57. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/schema/types.py +0 -0
  58. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/utils/__init__.py +0 -0
  59. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/db/utils/sanitize.py +0 -0
  60. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/main.py +0 -0
  61. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/models/__init__.py +0 -0
  62. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/output/__init__.py +0 -0
  63. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/output/console.py +0 -0
  64. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/output/formatters.py +0 -0
  65. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/output/templates.py +0 -0
  66. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/py.typed +0 -0
  67. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/streaming/__init__.py +0 -0
  68. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/streaming/progress.py +0 -0
  69. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/streaming/writer.py +0 -0
  70. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/utils/__init__.py +0 -0
  71. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/utils/fields.py +0 -0
  72. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/src/anysite/utils/retry.py +0 -0
  73. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/enriched_partners_sample_10.csv +0 -0
  74. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/linkedin-partners/company_aliases.txt +0 -0
  75. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/linkedin-partners/dataset.yaml +0 -0
  76. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/partners-deep/dataset.yaml +0 -0
  77. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/partners-intel/dataset.yaml +0 -0
  78. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/partners-linkedin/company_aliases.txt +0 -0
  79. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/partners-linkedin/dataset.yaml +0 -0
  80. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/test_data/partners-pipeline/dataset.yaml +0 -0
  81. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/__init__.py +0 -0
  82. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/conftest.py +0 -0
  83. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_api/__init__.py +0 -0
  84. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_batch/__init__.py +0 -0
  85. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_batch/test_executor.py +0 -0
  86. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_batch/test_input.py +0 -0
  87. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_batch/test_rate_limiter.py +0 -0
  88. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_cli/__init__.py +0 -0
  89. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_cli/test_main.py +0 -0
  90. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/__init__.py +0 -0
  91. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_analyzer.py +0 -0
  92. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_collector.py +0 -0
  93. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_exporters.py +0 -0
  94. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_history.py +0 -0
  95. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_integration_csv.py +0 -0
  96. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_models.py +0 -0
  97. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_notifications.py +0 -0
  98. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_scheduler.py +0 -0
  99. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_storage.py +0 -0
  100. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_dataset/test_transformer.py +0 -0
  101. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/__init__.py +0 -0
  102. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_cli.py +0 -0
  103. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_config.py +0 -0
  104. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_inference.py +0 -0
  105. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_insert.py +0 -0
  106. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_manager.py +0 -0
  107. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_postgres_adapter.py +0 -0
  108. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_sanitize.py +0 -0
  109. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_db/test_sqlite_adapter.py +0 -0
  110. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_output/__init__.py +0 -0
  111. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_output/test_formatters.py +0 -0
  112. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_output/test_templates.py +0 -0
  113. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_streaming/__init__.py +0 -0
  114. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_streaming/test_progress.py +0 -0
  115. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_streaming/test_writer.py +0 -0
  116. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_utils/__init__.py +0 -0
  117. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_utils/test_fields.py +0 -0
  118. {anysite_cli-0.1.4 → anysite_cli-0.1.6}/tests/test_utils/test_retry.py +0 -0

CLAUDE.md
@@ -49,12 +49,14 @@ anysite dataset query dataset.yaml --interactive
  anysite dataset stats dataset.yaml --source profiles
  anysite dataset profile dataset.yaml
  anysite dataset load-db dataset.yaml -c pg --drop-existing
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
  anysite dataset history my-dataset
  anysite dataset logs my-dataset --run 42
  anysite dataset schedule dataset.yaml --incremental --load-db pg
  anysite dataset schedule dataset.yaml --systemd --load-db pg
  anysite dataset diff dataset.yaml --source profiles --key _input_value
- anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --from 2026-01-30 --to 2026-02-01
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline,follower_count"
  anysite dataset reset-cursor dataset.yaml
  anysite dataset reset-cursor dataset.yaml --source profiles

@@ -104,9 +106,9 @@ anysite db upsert mydb --table users --conflict-columns id --stdin
  - `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
  - `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
  - `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
- - `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters
+ - `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). Supports dot-notation keys via `json_extract_string()`. `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters with output field filtering
  - `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `diff`, `history`, `logs`, `schedule`, `reset-cursor`
- - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
+ - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference, diff-based incremental sync (`db_load.key` + `db_load.sync: full|append`). Supports diff-based incremental sync via `db_load.key` and `--snapshot` for loading specific dates
  - `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
  - `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
  - `db/config.py` - `ConnectionConfig`, `DatabaseType`, `OnConflict` enums and models
@@ -164,8 +166,11 @@ Sources are topologically sorted by dependencies. `input_template` allows transf
  - Schema inference from Parquet records via `infer_table_schema()`
  - Auto-increment `id` primary key per table
  - FK linking via provenance: parent `_input_value` → child `{parent}_id` column
- - Optional `db_load` config per source: field selection, dot-notation extraction, custom table names, field exclusion
+ - Optional `db_load` config per source: field selection, dot-notation extraction, custom table names, field exclusion, `key` for diff-based incremental sync
  - Topological loading order (parents before children)
+ - Diff-based incremental sync: when `db_load.key` is set and table exists with >=2 snapshots, diffs the two most recent and applies INSERT/DELETE/UPDATE delta
+ - `--snapshot YYYY-MM-DD` flag to load a specific snapshot date
+ - `--drop-existing` forces full INSERT of latest snapshot

  **Dataset Storage Layout**:
  ```
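
The differ entry above notes that dot-notation keys are resolved via `json_extract_string()`. A minimal DuckDB sketch of that idea, illustrative only: the `key_expr` helper and snapshot paths are not from the package, and it assumes the `urn` column holds JSON.

```python
import duckdb

def key_expr(alias: str, key: str) -> str:
    """Build a DuckDB expression for a plain or dot-notation key."""
    col, _, path = key.partition(".")
    if path:
        # e.g. "urn.value" -> json_extract_string(cur.urn, '$.value')
        return f"json_extract_string({alias}.{col}, '$.{path}')"
    return f"{alias}.{col}"

con = duckdb.connect()
# Hypothetical snapshot files; real snapshots live under the dataset's raw/<source>/ directory.
sql = f"""
SELECT cur.*
FROM read_parquet('raw/profiles/2026-02-01.parquet') AS cur
LEFT JOIN read_parquet('raw/profiles/2026-01-30.parquet') AS prev
  ON {key_expr('cur', 'urn.value')} = {key_expr('prev', 'urn.value')}
WHERE {key_expr('prev', 'urn.value')} IS NULL
"""
added = con.execute(sql).fetchall()  # records whose key appears only in the newer snapshot
```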

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: anysite-cli
- Version: 0.1.4
+ Version: 0.1.6
  Summary: CLI for Anysite API - web data extraction for humans and AI agents
  Project-URL: Homepage, https://anysite.io
  Project-URL: Documentation, https://docs.anysite.io/cli
@@ -259,6 +259,8 @@ sources:
  path: ./output/companies-{{date}}.csv
  format: csv
  db_load:
+ key: _input_value # Unique key for incremental sync
+ sync: full # full (default) or append (no DELETE)
  fields: [name, url, employee_count]

  - id: employees
@@ -274,6 +276,8 @@ sources:
  count: 5
  refresh: always # Re-collect every run with --incremental
  db_load:
+ key: urn.value # Unique key for incremental sync
+ sync: append # Keep old records (no DELETE on diff)
  fields: [name, url, headline]

  storage:
@@ -318,9 +322,15 @@ anysite dataset query dataset.yaml --interactive
  anysite dataset stats dataset.yaml --source companies
  anysite dataset profile dataset.yaml

- # Load into PostgreSQL with automatic FK linking
+ # Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+ anysite dataset load-db dataset.yaml -c pg
+
+ # Drop and reload from latest snapshot
  anysite dataset load-db dataset.yaml -c pg --drop-existing

+ # Load a specific snapshot date
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
  # Run history and logs
  anysite dataset history my-dataset
  anysite dataset logs my-dataset --run 42
@@ -328,8 +338,9 @@ anysite dataset logs my-dataset --run 42
  # Generate cron/systemd schedule
  anysite dataset schedule dataset.yaml --incremental --load-db pg

- # Compare snapshots (diff two collection dates)
+ # Compare snapshots (diff two collection dates, supports dot-notation keys)
  anysite dataset diff dataset.yaml --source employees --key _input_value
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"

  # Reset incremental state
  anysite dataset reset-cursor dataset.yaml

README.md
@@ -196,6 +196,8 @@ sources:
  path: ./output/companies-{{date}}.csv
  format: csv
  db_load:
+ key: _input_value # Unique key for incremental sync
+ sync: full # full (default) or append (no DELETE)
  fields: [name, url, employee_count]

  - id: employees
@@ -211,6 +213,8 @@ sources:
  count: 5
  refresh: always # Re-collect every run with --incremental
  db_load:
+ key: urn.value # Unique key for incremental sync
+ sync: append # Keep old records (no DELETE on diff)
  fields: [name, url, headline]

  storage:
@@ -255,9 +259,15 @@ anysite dataset query dataset.yaml --interactive
  anysite dataset stats dataset.yaml --source companies
  anysite dataset profile dataset.yaml

- # Load into PostgreSQL with automatic FK linking
+ # Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+ anysite dataset load-db dataset.yaml -c pg
+
+ # Drop and reload from latest snapshot
  anysite dataset load-db dataset.yaml -c pg --drop-existing

+ # Load a specific snapshot date
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
  # Run history and logs
  anysite dataset history my-dataset
  anysite dataset logs my-dataset --run 42
@@ -265,8 +275,9 @@ anysite dataset logs my-dataset --run 42
  # Generate cron/systemd schedule
  anysite dataset schedule dataset.yaml --incremental --load-db pg

- # Compare snapshots (diff two collection dates)
+ # Compare snapshots (diff two collection dates, supports dot-notation keys)
  anysite dataset diff dataset.yaml --source employees --key _input_value
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"

  # Reset incremental state
  anysite dataset reset-cursor dataset.yaml

pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

  [project]
  name = "anysite-cli"
- version = "0.1.4"
+ version = "0.1.6"
  description = "CLI for Anysite API - web data extraction for humans and AI agents"
  readme = "README.md"
  license = "MIT"

skills/anysite-cli/SKILL.md
@@ -117,6 +117,8 @@ sources:
  path: ./output/companies-{{date}}.csv
  format: csv
  db_load:
+ key: _input_value # Unique key for diff-based incremental sync
+ sync: full # full (INSERT/DELETE/UPDATE) or append (no DELETE)
  fields: [name, url, employee_count]

  - id: employees
@@ -189,18 +191,32 @@ anysite dataset profile dataset.yaml
  # Load all sources with FK linking
  anysite dataset load-db dataset.yaml -c pg --drop-existing

+ # Incremental sync (uses diff when db_load.key is set)
+ anysite dataset load-db dataset.yaml -c pg
+
+ # Load a specific snapshot date
+ anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
  # Dry run
  anysite dataset load-db dataset.yaml -c pg --dry-run
  ```

  `load-db` auto-creates tables with inferred schema, adds `id` primary key, and links child tables to parents via `{parent}_id` FK columns using provenance data.

+ **Incremental sync**: When `db_load.key` is set and the table already exists with >=2 snapshots, `load-db` diffs the two most recent snapshots and applies only the delta (INSERT added, DELETE removed, UPDATE changed). Without `db_load.key`, it does a full INSERT of the latest snapshot.
+
+ **Sync modes** (`db_load.sync`):
+ - `full` (default) — applies INSERT, DELETE, and UPDATE from diff
+ - `append` — applies INSERT and UPDATE only, skips DELETE (keeps records that disappeared from the API). Use for sources where the API returns only the latest N items (e.g., posts, activity feeds).
+
  Optional `db_load` config per source controls which fields go to DB:
  ```yaml
  - id: profiles
  endpoint: /api/linkedin/user
  db_load:
  table: people # Custom table name
+ key: urn.value # Unique key for diff-based incremental sync
+ sync: append # Keep old records (no DELETE on diff)
  fields: # Select specific fields
  - name
  - urn.value AS urn_id # Dot-notation extraction
@@ -241,11 +257,16 @@ anysite api /api/linkedin/user user=satyanadella -q --format jsonl \
  # Diff two most recent snapshots
  anysite dataset diff dataset.yaml --source employees --key _input_value

- # Diff specific dates, compare only certain fields
+ # Diff with dot-notation key (for JSON fields like urn)
+ anysite dataset diff dataset.yaml --source profiles --key urn.value
+
+ # Diff specific dates, compare and output only certain fields
  anysite dataset diff dataset.yaml --source employees --key _input_value \
  --from 2026-01-30 --to 2026-02-01 --fields "name,headline"
  ```

+ `--key` supports dot-notation for JSON fields (e.g., `urn.value`). `--fields` restricts both comparison and output columns.
+
  ### Step 7: History, Scheduling, and Notifications
  ```bash
  # View run history
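
The `db_load` snippet above extracts nested fields with dot-notation (`urn.value AS urn_id`). A small stand-alone sketch of what such an extraction amounts to; the helper name here is illustrative, not the package's own `_extract_dot_value`.

```python
from typing import Any

def extract_dot_value(record: dict[str, Any], path: str) -> Any:
    """Walk nested dicts following a dot-notation path like 'urn.value'."""
    value: Any = record
    for part in path.split("."):
        if not isinstance(value, dict):
            return None
        value = value.get(part)
    return value

record = {"name": "Ada", "urn": {"value": "urn:li:person:123"}}
row = {
    "name": extract_dot_value(record, "name"),
    "urn_id": extract_dot_value(record, "urn.value"),  # i.e. "urn.value AS urn_id"
}
```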

skills/anysite-cli/references/dataset-guide.md
@@ -39,6 +39,8 @@ sources:
  headers: {X-Token: abc}
  db_load: # Database loading config
  table: custom_name # Override table name
+ key: urn.value # Unique key for diff-based incremental sync
+ sync: full # full (INSERT/DELETE/UPDATE) or append (no DELETE)
  fields: [name, url] # Fields to include
  exclude: [_input_value] # Fields to exclude

@@ -244,6 +246,7 @@ anysite dataset load-db dataset.yaml -c <connection_name> [OPTIONS]
  --connection, -c TEXT Database connection name (required)
  --source, -s TEXT Load specific source + dependencies
  --drop-existing Drop tables before creating
+ --snapshot TEXT Load a specific snapshot date (YYYY-MM-DD)
  --dry-run Show plan without executing
  --quiet, -q Suppress output
  ```
@@ -256,6 +259,29 @@ anysite dataset load-db dataset.yaml -c <connection_name> [OPTIONS]
  4. Inserts rows, tracking which `_input_value` maps to which `id`
  5. For child sources: adds `{parent_source}_id` FK column using provenance

+ ### Incremental Sync with `db_load.key`
+
+ When `db_load.key` is set and the table already exists with >=2 snapshots, `load-db` uses diff-based incremental sync instead of full re-insertion:
+
+ 1. Compares the two most recent Parquet snapshots using `DatasetDiffer`
+ 2. **Added** records → INSERT into DB
+ 3. **Removed** records → DELETE from DB (by key) — only in `sync: full` mode
+ 4. **Changed** records → UPDATE modified fields (by key)
+
+ This keeps the database in sync without duplicates.
+
+ **Sync modes** (`db_load.sync`):
+ - `full` (default) — applies INSERT, DELETE, and UPDATE from diff
+ - `append` — applies INSERT and UPDATE only, skips DELETE. Use for sources where the API returns only the latest N items (e.g., posts, comments) and you want to accumulate records over time.
+
+ | Scenario | Behavior |
+ |----------|----------|
+ | First load (table doesn't exist) | Full INSERT of latest snapshot |
+ | Table exists + `db_load.key` + >=2 snapshots | Diff-based sync (INSERT/DELETE/UPDATE delta) |
+ | `--drop-existing` | Drop table, full INSERT of latest snapshot |
+ | `--snapshot 2026-01-15` | Full INSERT of that specific snapshot |
+ | No `db_load.key` set | Full INSERT of latest snapshot (no diff) |
+
  ### db_load Config

  Control which fields go to the database per source:
@@ -263,6 +289,8 @@ Control which fields go to the database per source:
  ```yaml
  db_load:
  table: people # Custom table name (default: source ID)
+ key: urn.value # Unique key for diff-based incremental sync
+ sync: append # full (default) or append (no DELETE on diff)
  fields: # Explicit field list
  - name
  - url
@@ -456,8 +484,11 @@ Compare two collection snapshots to find added, removed, and changed records.
  # Compare two most recent snapshots (auto-detect dates)
  anysite dataset diff dataset.yaml --source profiles --key _input_value

+ # Compare with dot-notation key (JSON fields)
+ anysite dataset diff dataset.yaml --source profiles --key urn.value
+
  # Compare specific dates
- anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+ anysite dataset diff dataset.yaml --source profiles --key urn.value --from 2026-01-30 --to 2026-02-01

  # Only compare specific fields
  anysite dataset diff dataset.yaml --source profiles --key urn --fields "name,headline,follower_count"
@@ -468,9 +499,9 @@ anysite dataset diff dataset.yaml --source profiles --key urn --format json --ou

  **Options:**
  - `--source, -s` (required) — source to compare
- - `--key, -k` (required) — field to match records by (e.g., `_input_value`, `urn`)
+ - `--key, -k` (required) — field to match records by. Supports dot-notation for JSON fields (e.g., `urn.value`)
  - `--from` / `--to` — snapshot dates (default: two most recent)
- - `--fields, -f` — only compare these fields
+ - `--fields, -f` — restrict both comparison and output to these fields
  - `--format` — output format (table, json, jsonl, csv)
  - `--output, -o` — write to file

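The scenario table added above boils down to a small decision rule. This is only a sketch of the documented behavior; `plan_load` and its arguments are illustrative names, not part of the CLI.

```python
def plan_load(
    table_exists: bool,
    snapshot_count: int,
    key: str | None,
    drop_existing: bool,
    snapshot: str | None,
) -> str:
    """Pick a load strategy following the documented scenario table (sketch)."""
    if drop_existing:
        return "drop table, full INSERT of latest snapshot"
    if snapshot is not None:
        return f"full INSERT of snapshot {snapshot}"
    if key and table_exists and snapshot_count >= 2:
        return "diff-based sync (INSERT/DELETE/UPDATE delta)"
    return "full INSERT of latest snapshot"

assert plan_load(True, 3, "urn.value", False, None).startswith("diff-based")
assert plan_load(False, 1, None, False, None) == "full INSERT of latest snapshot"
```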

src/anysite/dataset/collector.py
@@ -19,6 +19,7 @@ from anysite.dataset.models import DatasetConfig, DatasetSource
  from anysite.dataset.storage import (
  MetadataStore,
  get_parquet_path,
+ read_latest_parquet,
  read_parquet,
  write_parquet,
  )
@@ -412,9 +413,9 @@ async def _collect_dependent(
  if dep is None:
  raise DatasetError(f"Source {source.id} has no dependency defined")

- # Read parent data
+ # Read parent data (latest snapshot only to avoid schema mismatch)
  parent_dir = base_path / "raw" / dep.from_source
- parent_records = read_parquet(parent_dir)
+ parent_records = read_latest_parquet(parent_dir)

  if not parent_records:
  if not quiet:
@@ -627,7 +628,7 @@ def _count_dependent_inputs(
  if dep is None:
  return None
  parent_dir = base_path / "raw" / dep.from_source
- parent_records = read_parquet(parent_dir)
+ parent_records = read_latest_parquet(parent_dir)
  if not parent_records:
  info = metadata.get_source_info(dep.from_source)
  return info.get("record_count") if info else None

src/anysite/dataset/db_loader.py
@@ -301,9 +301,13 @@ class DatasetDbLoader:
  """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
  result = differ.diff(source.id, diff_key)
  total = 0
+ sync_mode = source.db_load.sync if source.db_load else "full"

  if dry_run:
- return len(result.added) + len(result.removed) + len(result.changed)
+ count = len(result.added) + len(result.changed)
+ if sync_mode == "full":
+ count += len(result.removed)
+ return count

  # Extract key value from a record (handles dot-notation)
  def _get_key_val(record: dict[str, Any]) -> Any:
@@ -321,14 +325,15 @@
  self.adapter.insert_batch(table_name, [row])
  total += 1

- # DELETE removed records
- if result.removed:
+ # DELETE removed records (skipped in append mode)
+ ph = self._placeholder()
+ if result.removed and sync_mode == "full":
  safe_col = sanitize_identifier(db_key_col)
  for record in result.removed:
  key_val = _get_key_val(record)
  if key_val is not None:
  self.adapter.execute(
- f"DELETE FROM {table_name} WHERE {safe_col} = ?",
+ f"DELETE FROM {table_name} WHERE {safe_col} = {ph}",
  (str(key_val),),
  )
  total += 1
@@ -350,14 +355,14 @@
  for field_name in changed_fields:
  new_val = record.get(field_name)
  safe_field = sanitize_identifier(field_name)
- set_parts.append(f"{safe_field} = ?")
+ set_parts.append(f"{safe_field} = {ph}")
  params.append(new_val)

  params.append(str(key_val))
  sql = (
  f"UPDATE {table_name} "
  f"SET {', '.join(set_parts)} "
- f"WHERE {safe_col} = ?"
+ f"WHERE {safe_col} = {ph}"
  )
  self.adapter.execute(sql, tuple(params))
  total += 1
@@ -371,6 +376,12 @@
  return other.dependency.field
  return None

+ def _placeholder(self) -> str:
+ """Get the parameter placeholder for the dialect."""
+ if self._dialect == "postgres":
+ return "%s"
+ return "?"
+
  def _auto_id_type(self) -> str:
  """Get the auto-increment ID column type for the dialect."""
  if self._dialect == "postgres":
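
The `_placeholder()` change above selects the parameter style per dialect (`?` for SQLite drivers, `%s` for psycopg/Postgres). A self-contained sketch of how the DELETE/UPDATE parts of the delta can be built around such a placeholder; the `FakeAdapter` and function names are illustrative, not the package's API.

```python
class FakeAdapter:
    """Stand-in for a DB adapter; just records the SQL it is given."""
    def __init__(self) -> None:
        self.calls: list[tuple[str, tuple]] = []
    def execute(self, sql: str, params: tuple = ()) -> None:
        self.calls.append((sql, params))

def placeholder(dialect: str) -> str:
    # SQLite-style drivers use "?", psycopg uses "%s"
    return "%s" if dialect == "postgres" else "?"

def delete_by_key(adapter, dialect: str, table: str, key_col: str, key_val: str) -> None:
    ph = placeholder(dialect)
    # Identifier sanitization is assumed to have happened already.
    adapter.execute(f"DELETE FROM {table} WHERE {key_col} = {ph}", (key_val,))

def update_by_key(adapter, dialect: str, table: str, key_col: str, key_val: str,
                  changes: dict) -> None:
    ph = placeholder(dialect)
    set_parts = ", ".join(f"{col} = {ph}" for col in changes)
    adapter.execute(
        f"UPDATE {table} SET {set_parts} WHERE {key_col} = {ph}",
        (*changes.values(), key_val),
    )

pg = FakeAdapter()
update_by_key(pg, "postgres", "profiles", "urn_value", "urn:li:123", {"headline": "CTO"})
assert pg.calls[0][0] == "UPDATE profiles SET headline = %s WHERE urn_value = %s"
```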

src/anysite/dataset/differ.py
@@ -344,6 +344,9 @@ class DatasetDiffer:
  old_val = record.get(old_key)
  if _values_differ(new_val, old_val):
  changed_fields.append(col)
+ # Fallback: DuckDB detected a change but Python comparison missed it
+ if not changed_fields:
+ changed_fields = list(compare_fields)
  record["_changed_fields"] = changed_fields

  return records
@@ -377,6 +380,15 @@ def _values_differ(a: Any, b: Any) -> bool:
  return json.loads(a) != json.loads(b)
  except (json.JSONDecodeError, ValueError):
  pass
+ # Handle complex types (dict, list) — compare via JSON serialization
+ # to catch differences DuckDB sees but Python equality misses
+ if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
+ try:
+ return json.dumps(a, sort_keys=True, default=str) != json.dumps(
+ b, sort_keys=True, default=str
+ )
+ except (TypeError, ValueError):
+ pass
  return True


@@ -452,6 +464,8 @@ def format_diff_records(

  for record in result.changed:
  row: dict[str, Any] = {"_diff": "changed"}
+ changed_fields = record.get("_changed_fields", [])
+ row["_changed_fields"] = changed_fields
  for k, v in record.items():
  if k == "_changed_fields":
  continue
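
The `_values_differ` hunk above falls back to JSON serialization for dict and list values. A stand-alone sketch of that comparison idea; it is not the package's exact function, which also handles JSON-string inputs earlier in its body.

```python
import json
from typing import Any

def values_differ(a: Any, b: Any) -> bool:
    """Return True when two values meaningfully differ, tolerating dict key order."""
    if a == b:
        return False
    if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
        try:
            # Canonical JSON form makes the comparison order-insensitive for dicts.
            return json.dumps(a, sort_keys=True, default=str) != json.dumps(
                b, sort_keys=True, default=str
            )
        except (TypeError, ValueError):
            pass
    return True

assert not values_differ({"b": 2, "a": 1}, {"a": 1, "b": 2})  # key order ignored
assert values_differ({"a": 1}, '{"a": 1}')                     # dict vs JSON string
```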

src/anysite/dataset/models.py
@@ -82,6 +82,10 @@ class DbLoadConfig(BaseModel):

  table: str | None = Field(default=None, description="Override table name (default: source id)")
  key: str | None = Field(default=None, description="Unique key field for diff-based DB sync (e.g., urn.value)")
+ sync: Literal["full", "append"] = Field(
+ default="full",
+ description="Sync mode: 'full' applies INSERT/DELETE/UPDATE, 'append' skips DELETE (keeps old records)",
+ )
  fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
  exclude: list[str] = Field(
  default_factory=lambda: ["_input_value", "_parent_source"],
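
The model change above adds a `Literal["full", "append"]` sync mode defaulting to `full`. A minimal Pydantic v2 sketch of such a field; the class here is a stand-in for illustration, not the package's full `DbLoadConfig`.

```python
from typing import Literal

from pydantic import BaseModel, Field

class DbLoadConfigSketch(BaseModel):
    # Uses the same "str | None" union syntax as the package (Python 3.10+).
    table: str | None = Field(default=None)
    key: str | None = Field(default=None)
    sync: Literal["full", "append"] = Field(default="full")

cfg = DbLoadConfigSketch.model_validate({"key": "urn.value", "sync": "append"})
assert cfg.sync == "append"
# An unknown mode such as "upsert" would raise a ValidationError at parse time.
```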

src/anysite/dataset/storage.py
@@ -75,7 +75,7 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
  tables = [pq.read_table(f) for f in files]
  import pyarrow as pa

- table = pa.concat_tables(tables)
+ table = pa.concat_tables(tables, promote_options="permissive")
  else:
  if not path.exists():
  return []
@@ -84,6 +84,26 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
  return table.to_pylist()


+ def read_latest_parquet(path: Path) -> list[dict[str, Any]]:
+ """Read records from the most recent Parquet snapshot in a directory.
+
+ Unlike ``read_parquet(dir)``, this reads only the latest file, avoiding
+ schema mismatch errors when snapshots have different column types.
+
+ Args:
+ path: Directory containing dated .parquet files.
+
+ Returns:
+ List of dicts from the newest snapshot, or [] if none found.
+ """
+ if not path.is_dir():
+ return read_parquet(path)
+ files = sorted(path.glob("*.parquet"))
+ if not files:
+ return []
+ return read_parquet(files[-1])
+
+
  def get_source_dir(base_path: Path, source_id: str) -> Path:
  """Get the raw data directory for a source."""
  return base_path / "raw" / source_id
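
The `concat_tables` change above switches to permissive schema promotion, and the new `read_latest_parquet()` sidesteps mixed-schema directories entirely. A small pyarrow sketch of why permissive promotion matters, assuming a recent pyarrow that supports the `promote_options` argument; the tables and field names are made up for illustration.

```python
import pyarrow as pa

# Two snapshots written on different days, with a column added in the newer one.
day1 = pa.table({"id": ["a", "b"], "likes": [1, 2]})
day2 = pa.table({"id": ["c"], "likes": [3], "headline": ["new field"]})

# A plain pa.concat_tables([day1, day2]) would fail on the schema mismatch;
# permissive promotion unifies the schemas and fills missing columns with nulls.
merged = pa.concat_tables([day1, day2], promote_options="permissive")
assert merged.num_rows == 3
assert "headline" in merged.column_names
```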

tests/test_dataset/test_db_loader.py
@@ -1,7 +1,6 @@
  """Tests for dataset DB loader with SQLite in-memory adapter."""

  import json
-
  import pytest

  from anysite.dataset.db_loader import DatasetDbLoader, _extract_dot_value, _filter_record
@@ -636,3 +635,201 @@ class TestDropExistingWithDiffKey:
  assert len(rows) == 2
  assert rows[0]["name"] == "Alice"
  assert rows[0]["score"] == 95
+
+
+ class TestAppendSyncMode:
+ """Test sync: append skips DELETE but still INSERTs and UPDATEs."""
+
+ def _setup_two_snapshots(self, tmp_path, source_id, old_records, new_records):
+ source_dir = get_source_dir(tmp_path / "data", source_id)
+ write_parquet(old_records, source_dir / "2026-01-01.parquet")
+ write_parquet(new_records, source_dir / "2026-01-02.parquet")
+
+ def test_append_keeps_removed_records(self, tmp_path):
+ """With sync: append, records missing from new snapshot are NOT deleted."""
+ sources = [
+ DatasetSource(
+ id="posts", endpoint="/api/posts",
+ db_load=DbLoadConfig(key="uid", sync="append"),
+ ),
+ ]
+ config = _make_config(tmp_path, sources)
+
+ self._setup_two_snapshots(
+ tmp_path, "posts",
+ old_records=[
+ {"uid": "a", "text": "Hello", "likes": 10},
+ {"uid": "b", "text": "World", "likes": 5},
+ {"uid": "c", "text": "Bye", "likes": 3},
+ ],
+ new_records=[
+ {"uid": "a", "text": "Hello", "likes": 15}, # changed
+ {"uid": "d", "text": "New post", "likes": 0}, # added
+ # b and c removed from snapshot
+ ],
+ )
+
+ adapter = _sqlite_adapter()
+ with adapter:
+ # Set up DB with old data
+ source_dir = get_source_dir(tmp_path / "data", "posts")
+ loader = DatasetDbLoader(config, adapter)
+ loader._full_insert(
+ sources[0], "posts", source_dir / "2026-01-01.parquet"
+ )
+ assert len(adapter.fetch_all("SELECT * FROM posts")) == 3
+
+ # Diff sync with append mode
+ loader2 = DatasetDbLoader(config, adapter)
+ results = loader2.load_all()
+ # 1 added + 1 changed = 2 (no deletes)
+ assert results["posts"] == 2
+
+ rows = adapter.fetch_all("SELECT * FROM posts ORDER BY uid")
+ assert len(rows) == 4 # 3 original + 1 added, none deleted
+ uids = [r["uid"] for r in rows]
+ assert "a" in uids # updated
+ assert "b" in uids # kept (not deleted)
+ assert "c" in uids # kept (not deleted)
+ assert "d" in uids # added
+
+ # Verify update applied
+ a_row = [r for r in rows if r["uid"] == "a"][0]
+ assert a_row["likes"] == 15
+
+ def test_full_sync_deletes_removed_records(self, tmp_path):
+ """With sync: full (default), removed records ARE deleted."""
+ sources = [
+ DatasetSource(
+ id="posts", endpoint="/api/posts",
+ db_load=DbLoadConfig(key="uid", sync="full"),
+ ),
+ ]
+ config = _make_config(tmp_path, sources)
+
+ self._setup_two_snapshots(
+ tmp_path, "posts",
+ old_records=[
+ {"uid": "a", "text": "Hello", "likes": 10},
+ {"uid": "b", "text": "World", "likes": 5},
+ ],
+ new_records=[
+ {"uid": "a", "text": "Hello", "likes": 15},
+ # b removed
+ ],
+ )
+
+ adapter = _sqlite_adapter()
+ with adapter:
+ source_dir = get_source_dir(tmp_path / "data", "posts")
+ loader = DatasetDbLoader(config, adapter)
+ loader._full_insert(
+ sources[0], "posts", source_dir / "2026-01-01.parquet"
+ )
+ assert len(adapter.fetch_all("SELECT * FROM posts")) == 2
+
+ loader2 = DatasetDbLoader(config, adapter)
+ results = loader2.load_all()
+ assert results["posts"] == 2 # 1 changed + 1 removed
+
+ rows = adapter.fetch_all("SELECT * FROM posts")
+ assert len(rows) == 1
+ assert rows[0]["uid"] == "a"
+
+
+ class TestPostgresPlaceholders:
+ """Test that diff-based sync uses %s placeholders for postgres dialect."""
+
+ def _setup_two_snapshots(self, tmp_path, source_id, old_records, new_records):
+ source_dir = get_source_dir(tmp_path / "data", source_id)
+ write_parquet(old_records, source_dir / "2026-01-01.parquet")
+ write_parquet(new_records, source_dir / "2026-01-02.parquet")
+
+ def test_delete_uses_percent_s(self, tmp_path):
+ """DELETE query uses %s placeholder for postgres."""
+ sources = [
+ DatasetSource(
+ id="items", endpoint="/api/items",
+ db_load=DbLoadConfig(key="name", sync="full"),
+ ),
+ ]
+ config = _make_config(tmp_path, sources)
+
+ self._setup_two_snapshots(
+ tmp_path, "items",
+ old_records=[
+ {"name": "Alice", "score": 90},
+ {"name": "Bob", "score": 80},
+ ],
+ new_records=[{"name": "Alice", "score": 90}],
+ )
+
+ # Use real SQLite adapter for initial load, then mock for diff sync
+ adapter = _sqlite_adapter()
+ with adapter:
+ source_dir = get_source_dir(tmp_path / "data", "items")
+ loader = DatasetDbLoader(config, adapter)
+ loader._full_insert(
+ sources[0], "items", source_dir / "2026-01-01.parquet"
+ )
+
+ # Patch dialect to postgres and spy on execute
+ loader2 = DatasetDbLoader(config, adapter)
+ loader2._dialect = "postgres"
+ original_execute = adapter.execute
+ calls = []
+
+ def spy_execute(sql, params=None):
+ calls.append((sql, params))
+ # Replace %s with ? for SQLite execution
+ original_execute(sql.replace("%s", "?"), params)
+
+ adapter.execute = spy_execute
+ loader2.load_all()
+
+ # Verify DELETE used %s
+ delete_calls = [c for c in calls if "DELETE" in c[0]]
+ assert len(delete_calls) == 1
+ assert "%s" in delete_calls[0][0]
+ assert "?" not in delete_calls[0][0]
+
+ def test_update_uses_percent_s(self, tmp_path):
+ """UPDATE query uses %s placeholders for postgres."""
+ sources = [
+ DatasetSource(
+ id="items", endpoint="/api/items",
+ db_load=DbLoadConfig(key="name"),
+ ),
+ ]
+ config = _make_config(tmp_path, sources)
+
+ self._setup_two_snapshots(
+ tmp_path, "items",
+ old_records=[{"name": "Alice", "score": 90}],
+ new_records=[{"name": "Alice", "score": 95}],
+ )
+
+ adapter = _sqlite_adapter()
+ with adapter:
+ source_dir = get_source_dir(tmp_path / "data", "items")
+ loader = DatasetDbLoader(config, adapter)
+ loader._full_insert(
+ sources[0], "items", source_dir / "2026-01-01.parquet"
+ )
+
+ loader2 = DatasetDbLoader(config, adapter)
+ loader2._dialect = "postgres"
+ original_execute = adapter.execute
+ calls = []
+
+ def spy_execute(sql, params=None):
+ calls.append((sql, params))
+ original_execute(sql.replace("%s", "?"), params)
+
+ adapter.execute = spy_execute
+ loader2.load_all()
+
+ update_calls = [c for c in calls if "UPDATE" in c[0]]
+ assert len(update_calls) == 1
+ assert "%s" in update_calls[0][0]
+ assert "?" not in update_calls[0][0]

tests/test_dataset/test_differ.py
@@ -8,6 +8,7 @@ import pytest
  from anysite.dataset.differ import (
  DatasetDiffer,
  DiffResult,
+ _values_differ,
  format_diff_records,
  format_diff_table,
  )
@@ -336,7 +337,8 @@ class TestFormatDiffRecords:
  assert len(rows) == 1
  assert rows[0]["name"] == "Alice Updated"
  assert rows[0]["name__old"] == "Alice"
- assert "_changed_fields" not in rows[0]
+ assert "_changed_fields" in rows[0]
+ assert rows[0]["_changed_fields"] == ["name"]


  class TestDotNotationKey:
@@ -481,3 +483,92 @@ class TestOutputFieldFiltering:
  assert "name" not in rows[0]
  assert "name__old" not in rows[0]
  assert "urn" in rows[0] # key always included
+
+
+ class TestChangedFieldsDetail:
+ """Verify _changed_fields and __old columns appear in output."""
+
+ def test_changed_fields_in_json_output(self):
+ """format_diff_records includes _changed_fields for changed records."""
+ result = DiffResult(
+ source_id="items",
+ from_date=date(2026, 1, 1),
+ to_date=date(2026, 1, 2),
+ key="id",
+ changed=[{
+ "id": "1",
+ "name": "Alice Updated",
+ "name__old": "Alice",
+ "score": 90,
+ "score__old": 90,
+ "_changed_fields": ["name"],
+ }],
+ )
+
+ rows = format_diff_records(result)
+ assert len(rows) == 1
+ assert "_changed_fields" in rows[0]
+ assert rows[0]["_changed_fields"] == ["name"]
+ assert "name__old" in rows[0]
+ assert rows[0]["name__old"] == "Alice"
+
+ def test_changed_fields_not_in_table_output(self):
+ """format_diff_table uses old→new arrows instead of _changed_fields."""
+ result = DiffResult(
+ source_id="items",
+ from_date=date(2026, 1, 1),
+ to_date=date(2026, 1, 2),
+ key="id",
+ changed=[{
+ "id": "1",
+ "name": "Bob",
+ "name__old": "Alice",
+ "score": 90,
+ "score__old": 90,
+ "_changed_fields": ["name"],
+ }],
+ )
+
+ rows = format_diff_table(result)
+ assert len(rows) == 1
+ assert "_changed_fields" not in rows[0]
+ assert "→" in rows[0]["name"]
+ assert rows[0]["score"] == 90 # unchanged, no arrow
+
+ def test_values_differ_with_dicts(self):
+ """_values_differ handles dict comparison correctly."""
+ assert not _values_differ({"a": 1}, {"a": 1})
+ assert _values_differ({"a": 1}, {"a": 2})
+ assert not _values_differ({"b": 2, "a": 1}, {"a": 1, "b": 2}) # key order
+
+ def test_values_differ_with_lists(self):
+ """_values_differ handles list comparison."""
+ assert not _values_differ([1, 2], [1, 2])
+ assert _values_differ([1, 2], [1, 3])
+
+ def test_values_differ_dict_vs_string(self):
+ """_values_differ handles dict vs JSON string comparison."""
+ assert _values_differ({"a": 1}, '{"a": 1}')
+
+ def test_fallback_all_compare_fields(self, tmp_path):
+ """When Python can't identify changed fields, all compare fields are marked."""
+ source_dir = tmp_path / "raw" / "items"
+ source_dir.mkdir(parents=True)
+
+ # Write old and new snapshots with a value DuckDB sees as different
+ write_parquet(
+ [{"id": "1", "data": json.dumps({"x": 1})}],
+ source_dir / "2026-01-01.parquet",
+ )
+ write_parquet(
+ [{"id": "1", "data": json.dumps({"x": 1, "y": 2})}],
+ source_dir / "2026-01-02.parquet",
+ )
+
+ differ = DatasetDiffer(tmp_path)
+ result = differ.diff("items", "id")
+
+ assert len(result.changed) == 1
+ record = result.changed[0]
+ assert "_changed_fields" in record
+ assert len(record["_changed_fields"]) > 0