anysite-cli 0.1.3.tar.gz → 0.1.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of anysite-cli might be problematic (more details are available on the original registry page).

Files changed (118)
  1. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/PKG-INFO +1 -1
  2. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/pyproject.toml +1 -1
  3. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/cli.py +10 -1
  4. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/db_loader.py +162 -23
  5. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/differ.py +189 -48
  6. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/models.py +1 -0
  7. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_db_loader.py +292 -0
  8. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_differ.py +145 -0
  9. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/.claude/settings.local.json +0 -0
  10. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/.gitignore +0 -0
  11. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/CLAUDE.md +0 -0
  12. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/LICENSE +0 -0
  13. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/README.md +0 -0
  14. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/skills/anysite-cli/SKILL.md +0 -0
  15. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/skills/anysite-cli/references/api-reference.md +0 -0
  16. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/skills/anysite-cli/references/dataset-guide.md +0 -0
  17. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/__init__.py +0 -0
  18. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/__main__.py +0 -0
  19. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/api/__init__.py +0 -0
  20. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/api/client.py +0 -0
  21. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/api/errors.py +0 -0
  22. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/api/schemas.py +0 -0
  23. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/batch/__init__.py +0 -0
  24. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/batch/executor.py +0 -0
  25. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/batch/input.py +0 -0
  26. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/batch/rate_limiter.py +0 -0
  27. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/cli/__init__.py +0 -0
  28. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/cli/config.py +0 -0
  29. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/cli/executor.py +0 -0
  30. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/cli/options.py +0 -0
  31. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/config/__init__.py +0 -0
  32. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/config/paths.py +0 -0
  33. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/config/settings.py +0 -0
  34. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/__init__.py +0 -0
  35. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/analyzer.py +0 -0
  36. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/collector.py +0 -0
  37. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/errors.py +0 -0
  38. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/exporters.py +0 -0
  39. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/history.py +0 -0
  40. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/notifications.py +0 -0
  41. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/scheduler.py +0 -0
  42. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/storage.py +0 -0
  43. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/transformer.py +0 -0
  44. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/__init__.py +0 -0
  45. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/adapters/__init__.py +0 -0
  46. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/adapters/base.py +0 -0
  47. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/adapters/postgres.py +0 -0
  48. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/adapters/sqlite.py +0 -0
  49. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/cli.py +0 -0
  50. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/config.py +0 -0
  51. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/manager.py +0 -0
  52. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/operations/__init__.py +0 -0
  53. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/operations/insert.py +0 -0
  54. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/operations/query.py +0 -0
  55. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/schema/__init__.py +0 -0
  56. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/schema/inference.py +0 -0
  57. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/schema/types.py +0 -0
  58. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/utils/__init__.py +0 -0
  59. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/utils/sanitize.py +0 -0
  60. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/main.py +0 -0
  61. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/models/__init__.py +0 -0
  62. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/output/__init__.py +0 -0
  63. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/output/console.py +0 -0
  64. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/output/formatters.py +0 -0
  65. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/output/templates.py +0 -0
  66. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/py.typed +0 -0
  67. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/streaming/__init__.py +0 -0
  68. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/streaming/progress.py +0 -0
  69. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/streaming/writer.py +0 -0
  70. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/utils/__init__.py +0 -0
  71. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/utils/fields.py +0 -0
  72. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/utils/retry.py +0 -0
  73. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/enriched_partners_sample_10.csv +0 -0
  74. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/linkedin-partners/company_aliases.txt +0 -0
  75. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/linkedin-partners/dataset.yaml +0 -0
  76. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/partners-deep/dataset.yaml +0 -0
  77. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/partners-intel/dataset.yaml +0 -0
  78. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/partners-linkedin/company_aliases.txt +0 -0
  79. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/partners-linkedin/dataset.yaml +0 -0
  80. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/partners-pipeline/dataset.yaml +0 -0
  81. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/__init__.py +0 -0
  82. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/conftest.py +0 -0
  83. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_api/__init__.py +0 -0
  84. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_batch/__init__.py +0 -0
  85. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_batch/test_executor.py +0 -0
  86. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_batch/test_input.py +0 -0
  87. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_batch/test_rate_limiter.py +0 -0
  88. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_cli/__init__.py +0 -0
  89. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_cli/test_main.py +0 -0
  90. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/__init__.py +0 -0
  91. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_analyzer.py +0 -0
  92. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_collector.py +0 -0
  93. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_exporters.py +0 -0
  94. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_history.py +0 -0
  95. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_integration_csv.py +0 -0
  96. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_models.py +0 -0
  97. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_notifications.py +0 -0
  98. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_scheduler.py +0 -0
  99. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_storage.py +0 -0
  100. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_transformer.py +0 -0
  101. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/__init__.py +0 -0
  102. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_cli.py +0 -0
  103. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_config.py +0 -0
  104. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_inference.py +0 -0
  105. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_insert.py +0 -0
  106. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_manager.py +0 -0
  107. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_postgres_adapter.py +0 -0
  108. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_sanitize.py +0 -0
  109. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_sqlite_adapter.py +0 -0
  110. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_output/__init__.py +0 -0
  111. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_output/test_formatters.py +0 -0
  112. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_output/test_templates.py +0 -0
  113. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_streaming/__init__.py +0 -0
  114. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_streaming/test_progress.py +0 -0
  115. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_streaming/test_writer.py +0 -0
  116. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_utils/__init__.py +0 -0
  117. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_utils/test_fields.py +0 -0
  118. {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_utils/test_retry.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: anysite-cli
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: CLI for Anysite API - web data extraction for humans and AI agents
5
5
  Project-URL: Homepage, https://anysite.io
6
6
  Project-URL: Documentation, https://docs.anysite.io/cli
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "anysite-cli"
7
- version = "0.1.3"
7
+ version = "0.1.4"
8
8
  description = "CLI for Anysite API - web data extraction for humans and AI agents"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -357,6 +357,10 @@ def load_db(
357
357
  bool,
358
358
  typer.Option("--quiet", "-q", help="Suppress progress output"),
359
359
  ] = False,
360
+ snapshot: Annotated[
361
+ str | None,
362
+ typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
363
+ ] = None,
360
364
  ) -> None:
361
365
  """Load collected Parquet data into a relational database with FK linking."""
362
366
  config = _load_config(config_path)
@@ -379,6 +383,7 @@ def load_db(
379
383
  source_filter=source,
380
384
  drop_existing=drop_existing,
381
385
  dry_run=dry_run,
386
+ snapshot=snapshot,
382
387
  )
383
388
  except Exception as e:
384
389
  typer.echo(f"Load error: {e}", err=True)
@@ -519,7 +524,11 @@ def diff_cmd(
519
524
  return
520
525
 
521
526
  # Format and output
522
- rows = format_diff_table(result) if format == "table" else format_diff_records(result)
527
+ rows = (
528
+ format_diff_table(result, output_fields=field_list)
529
+ if format == "table"
530
+ else format_diff_records(result, output_fields=field_list)
531
+ )
523
532
 
524
533
  _output_results(rows, format, output)
525
534
 
@@ -3,12 +3,18 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import json
6
+ import logging
7
+ from datetime import date
8
+ from pathlib import Path
6
9
  from typing import Any
7
10
 
8
11
  from anysite.dataset.models import DatasetConfig, DatasetSource
9
12
  from anysite.dataset.storage import get_source_dir, read_parquet
10
13
  from anysite.db.adapters.base import DatabaseAdapter
11
14
  from anysite.db.schema.inference import infer_table_schema
15
+ from anysite.db.utils.sanitize import sanitize_identifier
16
+
17
+ logger = logging.getLogger(__name__)
12
18
 
13
19
 
14
20
  def _get_dialect(adapter: DatabaseAdapter) -> str:
@@ -86,15 +92,31 @@ def _filter_record(
86
92
  return {k: v for k, v in record.items() if k not in exclude}
87
93
 
88
94
 
95
+ def _get_latest_parquet(base_path: Path, source_id: str) -> Path | None:
96
+ """Return the path to the most recent snapshot for a source."""
97
+ source_dir = get_source_dir(base_path, source_id)
98
+ if not source_dir.exists():
99
+ return None
100
+ files = sorted(source_dir.glob("*.parquet"))
101
+ return files[-1] if files else None
102
+
103
+
104
+ def _get_snapshot_for_date(base_path: Path, source_id: str, d: date) -> Path | None:
105
+ """Return the parquet path for a specific snapshot date."""
106
+ source_dir = get_source_dir(base_path, source_id)
107
+ path = source_dir / f"{d.isoformat()}.parquet"
108
+ return path if path.exists() else None
109
+
110
+
89
111
  class DatasetDbLoader:
90
112
  """Load dataset Parquet data into a relational database.
91
113
 
92
- Handles:
93
- - Schema inference from Parquet records
94
- - Auto-increment primary keys (``id`` column)
95
- - Foreign key linking via provenance ``_input_value`` column
96
- - Dot-notation field extraction for JSON columns
97
- - Topological loading order (parents before children)
114
+ Supports diff-based incremental sync when ``db_load.key`` is configured:
115
+ compares the two most recent snapshots and applies INSERT/DELETE/UPDATE
116
+ to keep the database in sync.
117
+
118
+ Falls back to full INSERT of the latest snapshot when no key is set
119
+ or when the table doesn't exist yet.
98
120
  """
99
121
 
100
122
  def __init__(
@@ -115,16 +137,18 @@ class DatasetDbLoader:
115
137
  source_filter: str | None = None,
116
138
  drop_existing: bool = False,
117
139
  dry_run: bool = False,
140
+ snapshot: str | None = None,
118
141
  ) -> dict[str, int]:
119
142
  """Load all sources into the database in dependency order.
120
143
 
121
144
  Args:
122
145
  source_filter: Only load this source (and dependencies).
123
- drop_existing: Drop tables before creating.
146
+ drop_existing: Drop tables before creating, then full INSERT latest.
124
147
  dry_run: Show plan without executing.
148
+ snapshot: Load a specific snapshot date (YYYY-MM-DD).
125
149
 
126
150
  Returns:
127
- Mapping of source_id to number of rows loaded.
151
+ Mapping of source_id to number of rows loaded/affected.
128
152
  """
129
153
  sources = self.config.topological_sort()
130
154
 
@@ -139,6 +163,7 @@ class DatasetDbLoader:
139
163
  source,
140
164
  drop_existing=drop_existing,
141
165
  dry_run=dry_run,
166
+ snapshot=snapshot,
142
167
  )
143
168
  results[source.id] = count
144
169
 
@@ -150,18 +175,64 @@ class DatasetDbLoader:
150
175
  *,
151
176
  drop_existing: bool = False,
152
177
  dry_run: bool = False,
178
+ snapshot: str | None = None,
153
179
  ) -> int:
154
- """Load a single source into the database."""
155
- source_dir = get_source_dir(self.base_path, source.id)
156
- if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
180
+ """Load a single source into the database.
181
+
182
+ Strategy:
183
+ 1. ``drop_existing``: drop table → full INSERT of latest snapshot
184
+ 2. ``snapshot``: full INSERT of that specific snapshot
185
+ 3. Table doesn't exist: full INSERT of latest snapshot
186
+ 4. Table exists + ``db_load.key`` set + ≥2 snapshots: diff-based sync
187
+ 5. Fallback: full INSERT of latest snapshot
188
+ """
189
+ table_name = _table_name_for(source)
190
+
191
+ # Handle drop_existing
192
+ if drop_existing and self.adapter.table_exists(table_name):
193
+ self.adapter.execute(f"DROP TABLE {table_name}")
194
+
195
+ # Determine which parquet to load
196
+ if snapshot:
197
+ snapshot_date = date.fromisoformat(snapshot)
198
+ parquet_path = _get_snapshot_for_date(self.base_path, source.id, snapshot_date)
199
+ if parquet_path is None:
200
+ return 0
201
+ return self._full_insert(source, table_name, parquet_path, dry_run=dry_run)
202
+
203
+ # Check if we can do diff-based sync
204
+ diff_key = source.db_load.key if source.db_load else None
205
+ table_exists = self.adapter.table_exists(table_name)
206
+
207
+ if diff_key and table_exists and not drop_existing:
208
+ from anysite.dataset.differ import DatasetDiffer
209
+ differ = DatasetDiffer(self.base_path)
210
+ dates = differ.available_dates(source.id)
211
+
212
+ if len(dates) >= 2:
213
+ return self._diff_sync(
214
+ source, table_name, diff_key, differ, dates, dry_run=dry_run
215
+ )
216
+
217
+ # Fallback: full INSERT of latest snapshot
218
+ latest = _get_latest_parquet(self.base_path, source.id)
219
+ if latest is None:
157
220
  return 0
221
+ return self._full_insert(source, table_name, latest, dry_run=dry_run)
158
222
 
159
- raw_records = read_parquet(source_dir)
223
+ def _full_insert(
224
+ self,
225
+ source: DatasetSource,
226
+ table_name: str,
227
+ parquet_path: Path,
228
+ *,
229
+ dry_run: bool = False,
230
+ ) -> int:
231
+ """Full INSERT: read parquet, transform, create table if needed, insert all rows."""
232
+ raw_records = read_parquet(parquet_path)
160
233
  if not raw_records:
161
234
  return 0
162
235
 
163
- table_name = _table_name_for(source)
164
-
165
236
  # Determine parent info for FK linking
166
237
  parent_source_id = None
167
238
  parent_fk_col = None
@@ -174,7 +245,6 @@ class DatasetDbLoader:
174
245
  for record in raw_records:
175
246
  row = _filter_record(record, source)
176
247
 
177
- # Add FK column if this is a dependent source
178
248
  if parent_source_id and parent_fk_col:
179
249
  input_val = record.get("_input_value")
180
250
  parent_map = self._value_to_id.get(parent_source_id, {})
@@ -189,17 +259,12 @@ class DatasetDbLoader:
189
259
  return len(rows)
190
260
 
191
261
  # Determine the lookup field for children to reference this source
192
- # This is the field that child dependencies extract from this source
193
262
  lookup_field = self._get_child_lookup_field(source)
194
263
 
195
- # Create table
196
- if drop_existing and self.adapter.table_exists(table_name):
197
- self.adapter.execute(f"DROP TABLE {table_name}")
198
-
264
+ # Create table if needed
199
265
  if not self.adapter.table_exists(table_name):
200
266
  schema = infer_table_schema(table_name, rows)
201
267
  sql_types = schema.to_sql_types(self._dialect)
202
- # Add auto-increment id column
203
268
  col_defs = {"id": self._auto_id_type()}
204
269
  col_defs.update(sql_types)
205
270
  self.adapter.create_table(table_name, col_defs, primary_key="id")
@@ -208,10 +273,8 @@ class DatasetDbLoader:
208
273
  value_map: dict[str, int] = {}
209
274
  for i, row in enumerate(rows):
210
275
  self.adapter.insert_batch(table_name, [row])
211
- # Get the last inserted id
212
276
  last_id = self._get_last_id(table_name)
213
277
 
214
- # Build value→id map for child sources
215
278
  if lookup_field and last_id is not None:
216
279
  raw_record = raw_records[i]
217
280
  lookup_val = _extract_dot_value(raw_record, lookup_field)
@@ -225,6 +288,82 @@ class DatasetDbLoader:
225
288
 
226
289
  return len(rows)
227
290
 
291
+ def _diff_sync(
292
+ self,
293
+ source: DatasetSource,
294
+ table_name: str,
295
+ diff_key: str,
296
+ differ: Any,
297
+ dates: list[date],
298
+ *,
299
+ dry_run: bool = False,
300
+ ) -> int:
301
+ """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
302
+ result = differ.diff(source.id, diff_key)
303
+ total = 0
304
+
305
+ if dry_run:
306
+ return len(result.added) + len(result.removed) + len(result.changed)
307
+
308
+ # Extract key value from a record (handles dot-notation)
309
+ def _get_key_val(record: dict[str, Any]) -> Any:
310
+ if "." in diff_key:
311
+ return _extract_dot_value(record, diff_key)
312
+ return record.get(diff_key)
313
+
314
+ # Determine the DB column name for the key
315
+ db_key_col = diff_key.replace(".", "_")
316
+
317
+ # INSERT added records
318
+ if result.added:
319
+ for record in result.added:
320
+ row = _filter_record(record, source)
321
+ self.adapter.insert_batch(table_name, [row])
322
+ total += 1
323
+
324
+ # DELETE removed records
325
+ if result.removed:
326
+ safe_col = sanitize_identifier(db_key_col)
327
+ for record in result.removed:
328
+ key_val = _get_key_val(record)
329
+ if key_val is not None:
330
+ self.adapter.execute(
331
+ f"DELETE FROM {table_name} WHERE {safe_col} = ?",
332
+ (str(key_val),),
333
+ )
334
+ total += 1
335
+
336
+ # UPDATE changed records
337
+ if result.changed:
338
+ safe_col = sanitize_identifier(db_key_col)
339
+ for record in result.changed:
340
+ key_val = _get_key_val(record)
341
+ if key_val is None:
342
+ continue
343
+ changed_fields = record.get("_changed_fields", [])
344
+ if not changed_fields:
345
+ continue
346
+
347
+ # Build SET clause from changed fields
348
+ set_parts = []
349
+ params: list[Any] = []
350
+ for field_name in changed_fields:
351
+ new_val = record.get(field_name)
352
+ safe_field = sanitize_identifier(field_name)
353
+ set_parts.append(f"{safe_field} = ?")
354
+ params.append(new_val)
355
+
356
+ params.append(str(key_val))
357
+ sql = (
358
+ f"UPDATE {table_name} "
359
+ f"SET {', '.join(set_parts)} "
360
+ f"WHERE {safe_col} = ?"
361
+ )
362
+ self.adapter.execute(sql, tuple(params))
363
+ total += 1
364
+
365
+ return total
366
+
228
367
  def _get_child_lookup_field(self, source: DatasetSource) -> str | None:
229
368
  """Find which field children use to reference this source."""
230
369
  for other in self.config.sources: