anysite-cli 0.1.3__tar.gz → 0.1.4__tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
This version of anysite-cli might be problematic.
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/PKG-INFO +1 -1
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/pyproject.toml +1 -1
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/cli.py +10 -1
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/db_loader.py +162 -23
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/differ.py +189 -48
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/models.py +1 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_db_loader.py +292 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_differ.py +145 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/.claude/settings.local.json +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/.gitignore +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/CLAUDE.md +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/LICENSE +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/README.md +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/skills/anysite-cli/SKILL.md +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/skills/anysite-cli/references/api-reference.md +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/skills/anysite-cli/references/dataset-guide.md +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/__main__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/api/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/api/client.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/api/errors.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/api/schemas.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/batch/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/batch/executor.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/batch/input.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/batch/rate_limiter.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/cli/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/cli/config.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/cli/executor.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/cli/options.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/config/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/config/paths.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/config/settings.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/analyzer.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/collector.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/errors.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/exporters.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/history.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/notifications.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/scheduler.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/storage.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/dataset/transformer.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/adapters/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/adapters/base.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/adapters/postgres.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/adapters/sqlite.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/cli.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/config.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/manager.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/operations/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/operations/insert.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/operations/query.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/schema/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/schema/inference.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/schema/types.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/utils/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/db/utils/sanitize.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/main.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/models/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/output/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/output/console.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/output/formatters.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/output/templates.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/py.typed +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/streaming/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/streaming/progress.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/streaming/writer.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/utils/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/utils/fields.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/src/anysite/utils/retry.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/enriched_partners_sample_10.csv +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/linkedin-partners/company_aliases.txt +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/linkedin-partners/dataset.yaml +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/partners-deep/dataset.yaml +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/partners-intel/dataset.yaml +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/partners-linkedin/company_aliases.txt +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/partners-linkedin/dataset.yaml +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/test_data/partners-pipeline/dataset.yaml +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/conftest.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_api/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_batch/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_batch/test_executor.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_batch/test_input.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_batch/test_rate_limiter.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_cli/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_cli/test_main.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_analyzer.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_collector.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_exporters.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_history.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_integration_csv.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_models.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_notifications.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_scheduler.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_storage.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_dataset/test_transformer.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_cli.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_config.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_inference.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_insert.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_manager.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_postgres_adapter.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_sanitize.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_db/test_sqlite_adapter.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_output/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_output/test_formatters.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_output/test_templates.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_streaming/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_streaming/test_progress.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_streaming/test_writer.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_utils/__init__.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_utils/test_fields.py +0 -0
- {anysite_cli-0.1.3 → anysite_cli-0.1.4}/tests/test_utils/test_retry.py +0 -0
src/anysite/dataset/cli.py

@@ -357,6 +357,10 @@ def load_db(
         bool,
         typer.Option("--quiet", "-q", help="Suppress progress output"),
     ] = False,
+    snapshot: Annotated[
+        str | None,
+        typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
+    ] = None,
 ) -> None:
     """Load collected Parquet data into a relational database with FK linking."""
     config = _load_config(config_path)

@@ -379,6 +383,7 @@ def load_db(
             source_filter=source,
             drop_existing=drop_existing,
             dry_run=dry_run,
+            snapshot=snapshot,
         )
     except Exception as e:
         typer.echo(f"Load error: {e}", err=True)

@@ -519,7 +524,11 @@ def diff_cmd(
         return

     # Format and output
-    rows =
+    rows = (
+        format_diff_table(result, output_fields=field_list)
+        if format == "table"
+        else format_diff_records(result, output_fields=field_list)
+    )

     _output_results(rows, format, output)

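The new --snapshot option uses the Annotated pattern that Typer supports for declaring optional flags. A minimal standalone sketch of the same pattern (the toy app below is hypothetical, not the package's real entry point):

from typing import Annotated

import typer

app = typer.Typer()


@app.command()
def load_db(
    snapshot: Annotated[
        str | None,
        typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
    ] = None,
) -> None:
    # Echo the parsed flag; when omitted on the command line it stays None.
    typer.echo(f"snapshot={snapshot!r}")


if __name__ == "__main__":
    app()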
src/anysite/dataset/db_loader.py

@@ -3,12 +3,18 @@
 from __future__ import annotations

 import json
+import logging
+from datetime import date
+from pathlib import Path
 from typing import Any

 from anysite.dataset.models import DatasetConfig, DatasetSource
 from anysite.dataset.storage import get_source_dir, read_parquet
 from anysite.db.adapters.base import DatabaseAdapter
 from anysite.db.schema.inference import infer_table_schema
+from anysite.db.utils.sanitize import sanitize_identifier
+
+logger = logging.getLogger(__name__)


 def _get_dialect(adapter: DatabaseAdapter) -> str:

@@ -86,15 +92,31 @@ def _filter_record(
     return {k: v for k, v in record.items() if k not in exclude}


+def _get_latest_parquet(base_path: Path, source_id: str) -> Path | None:
+    """Return the path to the most recent snapshot for a source."""
+    source_dir = get_source_dir(base_path, source_id)
+    if not source_dir.exists():
+        return None
+    files = sorted(source_dir.glob("*.parquet"))
+    return files[-1] if files else None
+
+
+def _get_snapshot_for_date(base_path: Path, source_id: str, d: date) -> Path | None:
+    """Return the parquet path for a specific snapshot date."""
+    source_dir = get_source_dir(base_path, source_id)
+    path = source_dir / f"{d.isoformat()}.parquet"
+    return path if path.exists() else None
+
+
 class DatasetDbLoader:
     """Load dataset Parquet data into a relational database.

-
-
-
-
-
-
+    Supports diff-based incremental sync when ``db_load.key`` is configured:
+    compares the two most recent snapshots and applies INSERT/DELETE/UPDATE
+    to keep the database in sync.
+
+    Falls back to full INSERT of the latest snapshot when no key is set
+    or when the table doesn't exist yet.
     """

     def __init__(

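Both helpers depend on snapshot files being named by ISO date (YYYY-MM-DD.parquet), so lexicographic order is chronological order and sorted() alone finds the newest file. A quick illustration with hypothetical filenames:

# ISO-8601 date strings sort lexicographically in chronological order.
files = ["2024-01-09.parquet", "2024-01-10.parquet", "2023-12-31.parquet"]
assert sorted(files)[-1] == "2024-01-10.parquet"  # newest snapshot sorts last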
@@ -115,16 +137,18 @@ class DatasetDbLoader:
         source_filter: str | None = None,
         drop_existing: bool = False,
         dry_run: bool = False,
+        snapshot: str | None = None,
     ) -> dict[str, int]:
         """Load all sources into the database in dependency order.

         Args:
             source_filter: Only load this source (and dependencies).
-            drop_existing: Drop tables before creating.
+            drop_existing: Drop tables before creating, then full INSERT latest.
             dry_run: Show plan without executing.
+            snapshot: Load a specific snapshot date (YYYY-MM-DD).

         Returns:
-            Mapping of source_id to number of rows loaded.
+            Mapping of source_id to number of rows loaded/affected.
         """
         sources = self.config.topological_sort()

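A hedged usage sketch of the widened signature; neither the method's name nor the constructor's keyword arguments appear in this hunk, so load_all and the constructor kwargs below are assumptions inferred from the attributes the methods reference (self.config, self.adapter, self.base_path):

# Assumed names, not confirmed by this diff: load_all() and the
# DatasetDbLoader constructor kwargs. config/adapter come from elsewhere.
loader = DatasetDbLoader(config=config, adapter=adapter, base_path=Path("data"))
counts = loader.load_all(snapshot="2024-01-10")  # full INSERT of one snapshot
counts = loader.load_all()                       # diff sync where possible
counts = loader.load_all(drop_existing=True)     # rebuild from latest snapshot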
@@ -139,6 +163,7 @@ class DatasetDbLoader:
                 source,
                 drop_existing=drop_existing,
                 dry_run=dry_run,
+                snapshot=snapshot,
             )
             results[source.id] = count

@@ -150,18 +175,64 @@ class DatasetDbLoader:
         *,
         drop_existing: bool = False,
         dry_run: bool = False,
+        snapshot: str | None = None,
     ) -> int:
-        """Load a single source into the database.
-
-
+        """Load a single source into the database.
+
+        Strategy:
+        1. ``drop_existing``: drop table → full INSERT of latest snapshot
+        2. ``snapshot``: full INSERT of that specific snapshot
+        3. Table doesn't exist: full INSERT of latest snapshot
+        4. Table exists + ``db_load.key`` set + ≥2 snapshots: diff-based sync
+        5. Fallback: full INSERT of latest snapshot
+        """
+        table_name = _table_name_for(source)
+
+        # Handle drop_existing
+        if drop_existing and self.adapter.table_exists(table_name):
+            self.adapter.execute(f"DROP TABLE {table_name}")
+
+        # Determine which parquet to load
+        if snapshot:
+            snapshot_date = date.fromisoformat(snapshot)
+            parquet_path = _get_snapshot_for_date(self.base_path, source.id, snapshot_date)
+            if parquet_path is None:
+                return 0
+            return self._full_insert(source, table_name, parquet_path, dry_run=dry_run)
+
+        # Check if we can do diff-based sync
+        diff_key = source.db_load.key if source.db_load else None
+        table_exists = self.adapter.table_exists(table_name)
+
+        if diff_key and table_exists and not drop_existing:
+            from anysite.dataset.differ import DatasetDiffer
+            differ = DatasetDiffer(self.base_path)
+            dates = differ.available_dates(source.id)
+
+            if len(dates) >= 2:
+                return self._diff_sync(
+                    source, table_name, diff_key, differ, dates, dry_run=dry_run
+                )
+
+        # Fallback: full INSERT of latest snapshot
+        latest = _get_latest_parquet(self.base_path, source.id)
+        if latest is None:
             return 0
+        return self._full_insert(source, table_name, latest, dry_run=dry_run)

-
+    def _full_insert(
+        self,
+        source: DatasetSource,
+        table_name: str,
+        parquet_path: Path,
+        *,
+        dry_run: bool = False,
+    ) -> int:
+        """Full INSERT: read parquet, transform, create table if needed, insert all rows."""
+        raw_records = read_parquet(parquet_path)
         if not raw_records:
             return 0

-        table_name = _table_name_for(source)
-
         # Determine parent info for FK linking
         parent_source_id = None
         parent_fk_col = None

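Step 2 of the strategy relies on date.fromisoformat for validation, so a malformed --snapshot value raises before any database work happens:

from datetime import date

print(date.fromisoformat("2024-01-10"))  # 2024-01-10
try:
    date.fromisoformat("2024-13-01")
except ValueError as e:
    print(e)  # e.g. "month must be in 1..12"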
@@ -174,7 +245,6 @@ class DatasetDbLoader:
         for record in raw_records:
             row = _filter_record(record, source)

-            # Add FK column if this is a dependent source
             if parent_source_id and parent_fk_col:
                 input_val = record.get("_input_value")
                 parent_map = self._value_to_id.get(parent_source_id, {})

@@ -189,17 +259,12 @@ class DatasetDbLoader:
             return len(rows)

         # Determine the lookup field for children to reference this source
-        # This is the field that child dependencies extract from this source
         lookup_field = self._get_child_lookup_field(source)

-        # Create table
-        if drop_existing and self.adapter.table_exists(table_name):
-            self.adapter.execute(f"DROP TABLE {table_name}")
-
+        # Create table if needed
         if not self.adapter.table_exists(table_name):
             schema = infer_table_schema(table_name, rows)
             sql_types = schema.to_sql_types(self._dialect)
-            # Add auto-increment id column
             col_defs = {"id": self._auto_id_type()}
             col_defs.update(sql_types)
             self.adapter.create_table(table_name, col_defs, primary_key="id")

@@ -208,10 +273,8 @@ class DatasetDbLoader:
         value_map: dict[str, int] = {}
         for i, row in enumerate(rows):
             self.adapter.insert_batch(table_name, [row])
-            # Get the last inserted id
             last_id = self._get_last_id(table_name)

-            # Build value→id map for child sources
             if lookup_field and last_id is not None:
                 raw_record = raw_records[i]
                 lookup_val = _extract_dot_value(raw_record, lookup_field)

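For orientation, the value→id map built in this loop is what the FK linking earlier in _full_insert consumes: each parent row's lookup value maps to its auto-increment id, and child records later resolve their _input_value through it. A toy illustration with hypothetical data:

# Hypothetical values; the real map is keyed by whatever lookup_field
# the child source extracts from this parent source.
value_to_id = {"https://example.com/acme": 1, "https://example.com/globex": 2}
child_record = {"_input_value": "https://example.com/acme", "title": "CEO"}
print(value_to_id.get(child_record["_input_value"]))  # 1 -> stored in the FK column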
@@ -225,6 +288,82 @@ class DatasetDbLoader:

         return len(rows)

+    def _diff_sync(
+        self,
+        source: DatasetSource,
+        table_name: str,
+        diff_key: str,
+        differ: Any,
+        dates: list[date],
+        *,
+        dry_run: bool = False,
+    ) -> int:
+        """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
+        result = differ.diff(source.id, diff_key)
+        total = 0
+
+        if dry_run:
+            return len(result.added) + len(result.removed) + len(result.changed)
+
+        # Extract key value from a record (handles dot-notation)
+        def _get_key_val(record: dict[str, Any]) -> Any:
+            if "." in diff_key:
+                return _extract_dot_value(record, diff_key)
+            return record.get(diff_key)
+
+        # Determine the DB column name for the key
+        db_key_col = diff_key.replace(".", "_")
+
+        # INSERT added records
+        if result.added:
+            for record in result.added:
+                row = _filter_record(record, source)
+                self.adapter.insert_batch(table_name, [row])
+                total += 1
+
+        # DELETE removed records
+        if result.removed:
+            safe_col = sanitize_identifier(db_key_col)
+            for record in result.removed:
+                key_val = _get_key_val(record)
+                if key_val is not None:
+                    self.adapter.execute(
+                        f"DELETE FROM {table_name} WHERE {safe_col} = ?",
+                        (str(key_val),),
+                    )
+                    total += 1
+
+        # UPDATE changed records
+        if result.changed:
+            safe_col = sanitize_identifier(db_key_col)
+            for record in result.changed:
+                key_val = _get_key_val(record)
+                if key_val is None:
+                    continue
+                changed_fields = record.get("_changed_fields", [])
+                if not changed_fields:
+                    continue
+
+                # Build SET clause from changed fields
+                set_parts = []
+                params: list[Any] = []
+                for field_name in changed_fields:
+                    new_val = record.get(field_name)
+                    safe_field = sanitize_identifier(field_name)
+                    set_parts.append(f"{safe_field} = ?")
+                    params.append(new_val)
+
+                params.append(str(key_val))
+                sql = (
+                    f"UPDATE {table_name} "
+                    f"SET {', '.join(set_parts)} "
+                    f"WHERE {safe_col} = ?"
+                )
+                self.adapter.execute(sql, tuple(params))
+                total += 1
+
+        return total
+
     def _get_child_lookup_field(self, source: DatasetSource) -> str | None:
         """Find which field children use to reference this source."""
|