anysite-cli 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anysite/dataset/cli.py CHANGED
@@ -357,6 +357,10 @@ def load_db(
         bool,
         typer.Option("--quiet", "-q", help="Suppress progress output"),
     ] = False,
+    snapshot: Annotated[
+        str | None,
+        typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
+    ] = None,
 ) -> None:
     """Load collected Parquet data into a relational database with FK linking."""
     config = _load_config(config_path)
@@ -379,6 +383,7 @@ def load_db(
             source_filter=source,
             drop_existing=drop_existing,
             dry_run=dry_run,
+            snapshot=snapshot,
         )
     except Exception as e:
         typer.echo(f"Load error: {e}", err=True)
@@ -519,7 +524,11 @@ def diff_cmd(
         return

     # Format and output
-    rows = format_diff_table(result) if format == "table" else format_diff_records(result)
+    rows = (
+        format_diff_table(result, output_fields=field_list)
+        if format == "table"
+        else format_diff_records(result, output_fields=field_list)
+    )

     _output_results(rows, format, output)

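A minimal sketch of where field_list could come from: the diff command's --fields option (shown in the README excerpt near the end of this diff) takes a comma-separated value. The parsing below is an assumption for illustration; only format_diff_table, format_diff_records, and field_list appear in this diff.

    # Hedged sketch: turning a --fields value into the field_list passed above.
    fields_opt = "name,headline"  # e.g. --fields "name,headline"; option handling is assumed
    field_list = [f.strip() for f in fields_opt.split(",")] if fields_opt else None
    print(field_list)  # ['name', 'headline']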
anysite/dataset/collector.py CHANGED
@@ -19,6 +19,7 @@ from anysite.dataset.models import DatasetConfig, DatasetSource
 from anysite.dataset.storage import (
     MetadataStore,
     get_parquet_path,
+    read_latest_parquet,
     read_parquet,
     write_parquet,
 )
@@ -412,9 +413,9 @@ async def _collect_dependent(
     if dep is None:
         raise DatasetError(f"Source {source.id} has no dependency defined")

-    # Read parent data
+    # Read parent data (latest snapshot only to avoid schema mismatch)
     parent_dir = base_path / "raw" / dep.from_source
-    parent_records = read_parquet(parent_dir)
+    parent_records = read_latest_parquet(parent_dir)

     if not parent_records:
         if not quiet:
@@ -627,7 +628,7 @@ def _count_dependent_inputs(
     if dep is None:
         return None
     parent_dir = base_path / "raw" / dep.from_source
-    parent_records = read_parquet(parent_dir)
+    parent_records = read_latest_parquet(parent_dir)
     if not parent_records:
         info = metadata.get_source_info(dep.from_source)
         return info.get("record_count") if info else None
anysite/dataset/db_loader.py CHANGED
@@ -3,12 +3,18 @@
 from __future__ import annotations

 import json
+import logging
+from datetime import date
+from pathlib import Path
 from typing import Any

 from anysite.dataset.models import DatasetConfig, DatasetSource
 from anysite.dataset.storage import get_source_dir, read_parquet
 from anysite.db.adapters.base import DatabaseAdapter
 from anysite.db.schema.inference import infer_table_schema
+from anysite.db.utils.sanitize import sanitize_identifier
+
+logger = logging.getLogger(__name__)


 def _get_dialect(adapter: DatabaseAdapter) -> str:
@@ -86,15 +92,31 @@ def _filter_record(
     return {k: v for k, v in record.items() if k not in exclude}


+def _get_latest_parquet(base_path: Path, source_id: str) -> Path | None:
+    """Return the path to the most recent snapshot for a source."""
+    source_dir = get_source_dir(base_path, source_id)
+    if not source_dir.exists():
+        return None
+    files = sorted(source_dir.glob("*.parquet"))
+    return files[-1] if files else None
+
+
+def _get_snapshot_for_date(base_path: Path, source_id: str, d: date) -> Path | None:
+    """Return the parquet path for a specific snapshot date."""
+    source_dir = get_source_dir(base_path, source_id)
+    path = source_dir / f"{d.isoformat()}.parquet"
+    return path if path.exists() else None
+
+
 class DatasetDbLoader:
     """Load dataset Parquet data into a relational database.

-    Handles:
-    - Schema inference from Parquet records
-    - Auto-increment primary keys (``id`` column)
-    - Foreign key linking via provenance ``_input_value`` column
-    - Dot-notation field extraction for JSON columns
-    - Topological loading order (parents before children)
+    Supports diff-based incremental sync when ``db_load.key`` is configured:
+    compares the two most recent snapshots and applies INSERT/DELETE/UPDATE
+    to keep the database in sync.
+
+    Falls back to full INSERT of the latest snapshot when no key is set
+    or when the table doesn't exist yet.
     """

     def __init__(
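Both helpers assume one dated Parquet file per snapshot under the source's raw directory. A minimal sketch of that layout and what each helper resolves to (the concrete path is illustrative):

    from datetime import date
    from pathlib import Path

    source_dir = Path("data/raw/companies")        # illustrative; layout is <base_path>/raw/<source_id>
    files = sorted(source_dir.glob("*.parquet"))    # e.g. 2026-01-14.parquet, 2026-01-15.parquet
    latest = files[-1] if files else None           # what _get_latest_parquet returns
    wanted = source_dir / f"{date(2026, 1, 15).isoformat()}.parquet"  # what _get_snapshot_for_date checks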
@@ -115,16 +137,18 @@ class DatasetDbLoader:
         source_filter: str | None = None,
         drop_existing: bool = False,
         dry_run: bool = False,
+        snapshot: str | None = None,
     ) -> dict[str, int]:
         """Load all sources into the database in dependency order.

         Args:
             source_filter: Only load this source (and dependencies).
-            drop_existing: Drop tables before creating.
+            drop_existing: Drop tables before creating, then full INSERT latest.
             dry_run: Show plan without executing.
+            snapshot: Load a specific snapshot date (YYYY-MM-DD).

         Returns:
-            Mapping of source_id to number of rows loaded.
+            Mapping of source_id to number of rows loaded/affected.
         """
         sources = self.config.topological_sort()

@@ -139,6 +163,7 @@ class DatasetDbLoader:
                 source,
                 drop_existing=drop_existing,
                 dry_run=dry_run,
+                snapshot=snapshot,
             )
             results[source.id] = count

@@ -150,18 +175,64 @@ class DatasetDbLoader:
         *,
         drop_existing: bool = False,
         dry_run: bool = False,
+        snapshot: str | None = None,
     ) -> int:
-        """Load a single source into the database."""
-        source_dir = get_source_dir(self.base_path, source.id)
-        if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
+        """Load a single source into the database.
+
+        Strategy:
+        1. ``drop_existing``: drop table → full INSERT of latest snapshot
+        2. ``snapshot``: full INSERT of that specific snapshot
+        3. Table doesn't exist: full INSERT of latest snapshot
+        4. Table exists + ``db_load.key`` set + ≥2 snapshots: diff-based sync
+        5. Fallback: full INSERT of latest snapshot
+        """
+        table_name = _table_name_for(source)
+
+        # Handle drop_existing
+        if drop_existing and self.adapter.table_exists(table_name):
+            self.adapter.execute(f"DROP TABLE {table_name}")
+
+        # Determine which parquet to load
+        if snapshot:
+            snapshot_date = date.fromisoformat(snapshot)
+            parquet_path = _get_snapshot_for_date(self.base_path, source.id, snapshot_date)
+            if parquet_path is None:
+                return 0
+            return self._full_insert(source, table_name, parquet_path, dry_run=dry_run)
+
+        # Check if we can do diff-based sync
+        diff_key = source.db_load.key if source.db_load else None
+        table_exists = self.adapter.table_exists(table_name)
+
+        if diff_key and table_exists and not drop_existing:
+            from anysite.dataset.differ import DatasetDiffer
+            differ = DatasetDiffer(self.base_path)
+            dates = differ.available_dates(source.id)
+
+            if len(dates) >= 2:
+                return self._diff_sync(
+                    source, table_name, diff_key, differ, dates, dry_run=dry_run
+                )
+
+        # Fallback: full INSERT of latest snapshot
+        latest = _get_latest_parquet(self.base_path, source.id)
+        if latest is None:
             return 0
+        return self._full_insert(source, table_name, latest, dry_run=dry_run)

-        raw_records = read_parquet(source_dir)
+    def _full_insert(
+        self,
+        source: DatasetSource,
+        table_name: str,
+        parquet_path: Path,
+        *,
+        dry_run: bool = False,
+    ) -> int:
+        """Full INSERT: read parquet, transform, create table if needed, insert all rows."""
+        raw_records = read_parquet(parquet_path)
         if not raw_records:
             return 0

-        table_name = _table_name_for(source)
-
         # Determine parent info for FK linking
         parent_source_id = None
         parent_fk_col = None
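The strategy order in the docstring can be summarised as a small standalone function. This is an illustrative sketch, not the package's code, and the names here are made up for the example:

    def pick_strategy(*, snapshot: str | None, drop_existing: bool,
                      table_exists: bool, diff_key: str | None, n_snapshots: int) -> str:
        # Mirrors the 1-5 order documented in the method above.
        if snapshot:
            return "full INSERT of the requested snapshot"
        if diff_key and table_exists and not drop_existing and n_snapshots >= 2:
            return "diff-based sync"
        return "full INSERT of the latest snapshot"

    print(pick_strategy(snapshot=None, drop_existing=False,
                        table_exists=True, diff_key="urn.value", n_snapshots=3))
    # -> diff-based sync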
@@ -174,7 +245,6 @@ class DatasetDbLoader:
         for record in raw_records:
             row = _filter_record(record, source)

-            # Add FK column if this is a dependent source
             if parent_source_id and parent_fk_col:
                 input_val = record.get("_input_value")
                 parent_map = self._value_to_id.get(parent_source_id, {})
@@ -189,17 +259,12 @@ class DatasetDbLoader:
             return len(rows)

         # Determine the lookup field for children to reference this source
-        # This is the field that child dependencies extract from this source
         lookup_field = self._get_child_lookup_field(source)

-        # Create table
-        if drop_existing and self.adapter.table_exists(table_name):
-            self.adapter.execute(f"DROP TABLE {table_name}")
-
+        # Create table if needed
         if not self.adapter.table_exists(table_name):
             schema = infer_table_schema(table_name, rows)
             sql_types = schema.to_sql_types(self._dialect)
-            # Add auto-increment id column
             col_defs = {"id": self._auto_id_type()}
             col_defs.update(sql_types)
             self.adapter.create_table(table_name, col_defs, primary_key="id")
@@ -208,10 +273,8 @@ class DatasetDbLoader:
         value_map: dict[str, int] = {}
         for i, row in enumerate(rows):
             self.adapter.insert_batch(table_name, [row])
-            # Get the last inserted id
             last_id = self._get_last_id(table_name)

-            # Build value→id map for child sources
             if lookup_field and last_id is not None:
                 raw_record = raw_records[i]
                 lookup_val = _extract_dot_value(raw_record, lookup_field)
@@ -225,6 +288,86 @@ class DatasetDbLoader:

         return len(rows)

+    def _diff_sync(
+        self,
+        source: DatasetSource,
+        table_name: str,
+        diff_key: str,
+        differ: Any,
+        dates: list[date],
+        *,
+        dry_run: bool = False,
+    ) -> int:
+        """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
+        result = differ.diff(source.id, diff_key)
+        total = 0
+        sync_mode = source.db_load.sync if source.db_load else "full"
+
+        if dry_run:
+            count = len(result.added) + len(result.changed)
+            if sync_mode == "full":
+                count += len(result.removed)
+            return count
+
+        # Extract key value from a record (handles dot-notation)
+        def _get_key_val(record: dict[str, Any]) -> Any:
+            if "." in diff_key:
+                return _extract_dot_value(record, diff_key)
+            return record.get(diff_key)
+
+        # Determine the DB column name for the key
+        db_key_col = diff_key.replace(".", "_")
+
+        # INSERT added records
+        if result.added:
+            for record in result.added:
+                row = _filter_record(record, source)
+                self.adapter.insert_batch(table_name, [row])
+                total += 1
+
+        # DELETE removed records (skipped in append mode)
+        if result.removed and sync_mode == "full":
+            safe_col = sanitize_identifier(db_key_col)
+            for record in result.removed:
+                key_val = _get_key_val(record)
+                if key_val is not None:
+                    self.adapter.execute(
+                        f"DELETE FROM {table_name} WHERE {safe_col} = ?",
+                        (str(key_val),),
+                    )
+                    total += 1
+
+        # UPDATE changed records
+        if result.changed:
+            safe_col = sanitize_identifier(db_key_col)
+            for record in result.changed:
+                key_val = _get_key_val(record)
+                if key_val is None:
+                    continue
+                changed_fields = record.get("_changed_fields", [])
+                if not changed_fields:
+                    continue
+
+                # Build SET clause from changed fields
+                set_parts = []
+                params: list[Any] = []
+                for field_name in changed_fields:
+                    new_val = record.get(field_name)
+                    safe_field = sanitize_identifier(field_name)
+                    set_parts.append(f"{safe_field} = ?")
+                    params.append(new_val)
+
+                params.append(str(key_val))
+                sql = (
+                    f"UPDATE {table_name} "
+                    f"SET {', '.join(set_parts)} "
+                    f"WHERE {safe_col} = ?"
+                )
+                self.adapter.execute(sql, tuple(params))
+                total += 1
+
+        return total
+
     def _get_child_lookup_field(self, source: DatasetSource) -> str | None:
         """Find which field children use to reference this source."""
         for other in self.config.sources:
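A worked trace of the UPDATE this method would assemble for a single changed record, assuming db_load.key is "urn.value", the target table is named "employees" (illustrative), and sanitize_identifier passes these simple names through unchanged (also an assumption):

    record = {"name": "Acme", "headline": "New tagline", "_changed_fields": ["headline"]}
    key_val = "abc123"                              # extracted from the record's urn.value
    db_key_col = "urn.value".replace(".", "_")      # -> "urn_value"
    set_parts = [f"{f} = ?" for f in record["_changed_fields"]]
    sql = f"UPDATE employees SET {', '.join(set_parts)} WHERE {db_key_col} = ?"
    params = [record[f] for f in record["_changed_fields"]] + [key_val]
    print(sql)     # UPDATE employees SET headline = ? WHERE urn_value = ?
    print(params)  # ['New tagline', 'abc123']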
anysite/dataset/differ.py CHANGED
@@ -12,6 +12,31 @@ from anysite.dataset.errors import DatasetError
 from anysite.dataset.storage import get_source_dir


+def _build_key_expr(key: str, all_columns: list[str]) -> tuple[str, str]:
+    """Build a DuckDB key expression, supporting dot-notation for JSON fields.
+
+    Returns:
+        (key_expr, key_alias) — the SQL expression and a display alias.
+        For simple keys: ('"field"', 'field')
+        For dot-notation: ("json_extract_string(\"urn\", '$.value')", 'urn.value')
+    """
+    if "." not in key:
+        if key not in all_columns:
+            raise DatasetError(
+                f"Key field '{key}' not found. "
+                f"Available: {', '.join(all_columns)}"
+            )
+        return f'"{key}"', key
+
+    root, rest = key.split(".", 1)
+    if root not in all_columns:
+        raise DatasetError(
+            f"Root field '{root}' (from key '{key}') not found. "
+            f"Available: {', '.join(all_columns)}"
+        )
+    return f"json_extract_string(\"{root}\", '$.{rest}')", key
+
+
 @dataclass
 class DiffResult:
     """Result of comparing two dataset snapshots."""
@@ -24,6 +49,7 @@ class DiffResult:
     removed: list[dict[str, Any]] = field(default_factory=list)
     changed: list[dict[str, Any]] = field(default_factory=list)
     unchanged_count: int = 0
+    fields: list[str] | None = field(default=None)

     @property
     def has_changes(self) -> bool:
@@ -63,10 +89,11 @@ class DatasetDiffer:

         Args:
             source_id: Source to compare.
-            key: Field to match records by (e.g., ``_input_value``, ``urn``).
+            key: Field to match records by. Supports dot-notation for
+                JSON fields (e.g., ``urn.value``).
             from_date: Older snapshot date (default: second-to-last).
             to_date: Newer snapshot date (default: latest).
-            fields: Only compare these fields (default: all).
+            fields: Only compare (and output) these fields (default: all).

         Returns:
             DiffResult with added, removed, changed lists.
@@ -153,50 +180,43 @@ class DatasetDiffer:
             info = conn.execute("DESCRIBE _new").fetchall()
             all_columns = [col[0] for col in info]

-            if key not in all_columns:
-                raise DatasetError(
-                    f"Key field '{key}' not found in {source_id}. "
-                    f"Available: {', '.join(all_columns)}"
-                )
+            # Build key expression (supports dot-notation)
+            key_expr, key_alias = _build_key_expr(key, all_columns)

             # Determine which fields to compare
             compare_fields = fields if fields else [
-                c for c in all_columns if c != key
+                c for c in all_columns if c != key and c != key.split(".")[0]
             ]
             # Filter to fields that actually exist
             compare_fields = [c for c in compare_fields if c in all_columns]

-            quoted_key = f'"{key}"'
+            # Determine output columns: if fields specified, restrict to key + fields
+            if fields:
+                output_columns = [key_alias] + [
+                    f for f in fields if f in all_columns
+                ]
+            else:
+                output_columns = None  # all columns

             # Added: in new but not in old
-            added = conn.execute(
-                f"SELECT * FROM _new "
-                f"WHERE {quoted_key} NOT IN (SELECT {quoted_key} FROM _old)"
-            ).fetchall()
-            added_cols = [d[0] for d in conn.execute(
-                "DESCRIBE _new"
-            ).fetchall()]
-            added_records = [dict(zip(added_cols, row, strict=False)) for row in added]
+            added_records = self._query_added_removed(
+                conn, "_new", "_old", key_expr, key_alias, all_columns, output_columns
+            )

             # Removed: in old but not in new
-            removed = conn.execute(
-                f"SELECT * FROM _old "
-                f"WHERE {quoted_key} NOT IN (SELECT {quoted_key} FROM _new)"
-            ).fetchall()
-            removed_cols = [d[0] for d in conn.execute(
-                "DESCRIBE _old"
-            ).fetchall()]
-            removed_records = [dict(zip(removed_cols, row, strict=False)) for row in removed]
+            removed_records = self._query_added_removed(
+                conn, "_old", "_new", key_expr, key_alias, all_columns, output_columns
+            )

             # Changed: matching key, different values
             changed_records = self._find_changed(
-                conn, key, compare_fields, all_columns
+                conn, key_expr, key_alias, compare_fields, all_columns, output_columns
             )

             # Count unchanged
             total_matched = conn.execute(
                 f"SELECT COUNT(*) FROM _new n "
-                f"JOIN _old o ON n.{quoted_key} = o.{quoted_key}"
+                f"JOIN _old o ON ({_requalify(key_expr, 'n')}) = ({_requalify(key_expr, 'o')})"
             ).fetchone()
             matched_count = total_matched[0] if total_matched else 0
             unchanged_count = matched_count - len(changed_records)
@@ -210,23 +230,59 @@ class DatasetDiffer:
                 removed=removed_records,
                 changed=changed_records,
                 unchanged_count=unchanged_count,
+                fields=fields,
             )
         finally:
             conn.close()

+    @staticmethod
+    def _query_added_removed(
+        conn: Any,
+        present_view: str,
+        absent_view: str,
+        key_expr: str,
+        key_alias: str,
+        all_columns: list[str],
+        output_columns: list[str] | None,
+    ) -> list[dict[str, Any]]:
+        """Query records present in one view but not the other."""
+        # Build SELECT list
+        if output_columns:
+            select_parts = []
+            for col in output_columns:
+                if col == key_alias and "." in col:
+                    select_parts.append(f"{key_expr} AS \"{key_alias}\"")
+                else:
+                    select_parts.append(f'"{col}"')
+            select_clause = ", ".join(select_parts)
+        else:
+            if "." in key_alias:
+                select_clause = f"*, {key_expr} AS \"{key_alias}\""
+            else:
+                select_clause = "*"
+
+        sql = (
+            f"SELECT {select_clause} FROM {present_view} "
+            f"WHERE ({key_expr}) NOT IN (SELECT ({key_expr}) FROM {absent_view})"
+        )
+        result = conn.execute(sql)
+        columns = [desc[0] for desc in result.description]
+        rows = result.fetchall()
+        return [dict(zip(columns, row, strict=False)) for row in rows]
+
+    @staticmethod
     def _find_changed(
-        self,
         conn: Any,
-        key: str,
+        key_expr: str,
+        key_alias: str,
         compare_fields: list[str],
         all_columns: list[str],
+        output_columns: list[str] | None,
    ) -> list[dict[str, Any]]:
         """Find records that exist in both snapshots but have different values."""
         if not compare_fields:
             return []

-        quoted_key = f'"{key}"'
-
         # Build WHERE clause: any compared field differs
         where_parts = []
         for col in compare_fields:
@@ -234,21 +290,43 @@
             where_parts.append(f"n.{qc} IS DISTINCT FROM o.{qc}")
         where_clause = " OR ".join(where_parts)

-        # Select new values + old values for compared fields
-        select_parts = [f"n.{quoted_key}"]
-        for col in all_columns:
-            if col != key:
-                qc = f'"{col}"'
-                select_parts.append(f"n.{qc}")
-        for col in compare_fields:
-            qc = f'"{col}"'
-            select_parts.append(f"o.{qc} AS \"{col}__old\"")
+        # Build JOIN condition
+        join_key_n = _requalify(key_expr, "n")
+        join_key_o = _requalify(key_expr, "o")
+        join_cond = f"({join_key_n}) = ({join_key_o})"
+
+        # Build SELECT: key + output fields + __old for compare fields
+        if output_columns:
+            # Restricted output
+            select_parts = []
+            for col in output_columns:
+                if col == key_alias and "." in col:
+                    select_parts.append(f"{_requalify(key_expr, 'n')} AS \"{key_alias}\"")
+                else:
+                    select_parts.append(f"n.\"{col}\"")
+            for col in compare_fields:
+                # Include __old for compare fields that are in output
+                if col in [c for c in output_columns if c != key_alias]:
+                    select_parts.append(f"o.\"{col}\" AS \"{col}__old\"")
+        else:
+            # Full output
+            select_parts = []
+            if "." in key_alias:
+                select_parts.append(f"{_requalify(key_expr, 'n')} AS \"{key_alias}\"")
+            else:
+                select_parts.append(f"n.\"{key_alias}\"")
+            for col in all_columns:
+                if col == key_alias:
+                    continue
+                select_parts.append(f"n.\"{col}\"")
+            for col in compare_fields:
+                select_parts.append(f"o.\"{col}\" AS \"{col}__old\"")

         select_clause = ", ".join(select_parts)

         sql = (
             f"SELECT {select_clause} FROM _new n "
-            f"JOIN _old o ON n.{quoted_key} = o.{quoted_key} "
+            f"JOIN _old o ON {join_cond} "
             f"WHERE {where_clause}"
         )

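As a worked trace of the branches above: with key "urn.value" and fields restricted to ["headline"] (so output_columns is ["urn.value", "headline"] and compare_fields is ["headline"]), the assembled query would look roughly like the string below. The _new/_old view names come from the surrounding code; whitespace is compacted for readability.

    sql = (
        'SELECT json_extract_string(n."urn", \'$.value\') AS "urn.value", '
        'n."headline", o."headline" AS "headline__old" '
        'FROM _new n '
        'JOIN _old o ON (json_extract_string(n."urn", \'$.value\'))'
        ' = (json_extract_string(o."urn", \'$.value\')) '
        'WHERE n."headline" IS DISTINCT FROM o."headline"'
    )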
@@ -266,11 +344,32 @@ class DatasetDiffer:
                 old_val = record.get(old_key)
                 if _values_differ(new_val, old_val):
                     changed_fields.append(col)
+            # Fallback: DuckDB detected a change but Python comparison missed it
+            if not changed_fields:
+                changed_fields = list(compare_fields)
             record["_changed_fields"] = changed_fields

         return records


+def _requalify(key_expr: str, prefix: str) -> str:
+    """Requalify a key expression with a table alias prefix.
+
+    For simple keys like '"field"', returns 'prefix."field"'.
+    For json_extract_string("col", '$.path'), returns
+    json_extract_string(prefix."col", '$.path').
+    """
+    if key_expr.startswith("json_extract_string("):
+        # Replace the column reference inside json_extract_string
+        inner = key_expr[len("json_extract_string("):]
+        # inner looks like: "col", '$.path')
+        col_end = inner.index(",")
+        col = inner[:col_end].strip()
+        rest = inner[col_end:]
+        return f"json_extract_string({prefix}.{col}{rest}"
+    return f"{prefix}.{key_expr}"
+
+
 def _values_differ(a: Any, b: Any) -> bool:
     """Compare two values, treating JSON strings as equivalent to their parsed form."""
     if a == b:
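Usage sketch showing both expression shapes _requalify handles, assuming anysite-cli 0.1.5 and its dependencies are installed (the function is module-private):

    from anysite.dataset.differ import _requalify

    print(_requalify('"urn"', "n"))
    # n."urn"
    print(_requalify('json_extract_string("urn", \'$.value\')', "o"))
    # json_extract_string(o."urn", '$.value')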
@@ -281,24 +380,42 @@ def _values_differ(a: Any, b: Any) -> bool:
             return json.loads(a) != json.loads(b)
         except (json.JSONDecodeError, ValueError):
             pass
+    # Handle complex types (dict, list) — compare via JSON serialization
+    # to catch differences DuckDB sees but Python equality misses
+    if isinstance(a, (dict, list)) or isinstance(b, (dict, list)):
+        try:
+            return json.dumps(a, sort_keys=True, default=str) != json.dumps(
+                b, sort_keys=True, default=str
+            )
+        except (TypeError, ValueError):
+            pass
     return True


-def format_diff_table(result: DiffResult) -> list[dict[str, Any]]:
+def format_diff_table(
+    result: DiffResult,
+    *,
+    output_fields: list[str] | None = None,
+) -> list[dict[str, Any]]:
     """Format a DiffResult into a flat list of dicts for table/json output.

     Each record gets a ``_diff`` column with value ``added``, ``removed``,
     or ``changed``. For changed records in table mode, modified field
     values are formatted as ``old → new``.
+
+    Args:
+        result: The diff result.
+        output_fields: If set, only include these fields (plus ``_diff`` and key).
     """
+    allowed = _build_allowed_set(result.key, output_fields)
     rows: list[dict[str, Any]] = []

     for record in result.added:
-        row = {"_diff": "added", **record}
+        row = {"_diff": "added", **_filter_row(record, allowed)}
         rows.append(row)

     for record in result.removed:
-        row = {"_diff": "removed", **record}
+        row = {"_diff": "removed", **_filter_row(record, allowed)}
         rows.append(row)

     for record in result.changed:
@@ -309,6 +426,8 @@ def format_diff_table(result: DiffResult) -> list[dict[str, Any]]:
                 continue
             if k.endswith("__old"):
                 continue
+            if allowed and k not in allowed:
+                continue
             # For changed fields, format as "old → new"
             if k in changed_fields:
                 old_val = record.get(f"{k}__old")
@@ -320,31 +439,67 @@ def format_diff_table(result: DiffResult) -> list[dict[str, Any]]:
     return rows


-def format_diff_records(result: DiffResult) -> list[dict[str, Any]]:
+def format_diff_records(
+    result: DiffResult,
+    *,
+    output_fields: list[str] | None = None,
+) -> list[dict[str, Any]]:
     """Format a DiffResult for JSON/CSV output.

     Each record gets ``_diff`` column. Changed records include both
     current values and ``field__old`` columns.
+
+    Args:
+        result: The diff result.
+        output_fields: If set, only include these fields (plus ``_diff``, key, and ``__old``).
     """
+    allowed = _build_allowed_set(result.key, output_fields)
     rows: list[dict[str, Any]] = []

     for record in result.added:
-        rows.append({"_diff": "added", **record})
+        rows.append({"_diff": "added", **_filter_row(record, allowed)})

     for record in result.removed:
-        rows.append({"_diff": "removed", **record})
+        rows.append({"_diff": "removed", **_filter_row(record, allowed)})

     for record in result.changed:
-        row = {"_diff": "changed"}
+        row: dict[str, Any] = {"_diff": "changed"}
+        changed_fields = record.get("_changed_fields", [])
+        row["_changed_fields"] = changed_fields
         for k, v in record.items():
             if k == "_changed_fields":
                 continue
+            if allowed and k not in allowed and not k.endswith("__old"):
+                continue
+            if k.endswith("__old") and allowed:
+                base = k[: -len("__old")]
+                if base not in allowed:
+                    continue
             row[k] = v
         rows.append(row)

     return rows


+def _build_allowed_set(key: str, output_fields: list[str] | None) -> set[str] | None:
+    """Build the set of allowed field names for output filtering."""
+    if not output_fields:
+        return None
+    allowed = set(output_fields)
+    allowed.add(key)
+    # Also add the root column for dot-notation keys
+    if "." in key:
+        allowed.add(key.split(".")[0])
+    return allowed
+
+
+def _filter_row(record: dict[str, Any], allowed: set[str] | None) -> dict[str, Any]:
+    """Filter a record to only allowed fields."""
+    if not allowed:
+        return record
+    return {k: v for k, v in record.items() if k in allowed}
+
+
 def _format_val(v: Any) -> str:
     """Format a value for display, truncating long strings."""
     if v is None:
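A small standalone sketch of what the two helpers above do when output fields "name,headline" are combined with key "urn.value" (the record contents are made up for the example):

    output_fields = ["name", "headline"]
    key = "urn.value"
    allowed = set(output_fields) | {key, key.split(".")[0]}   # {'name', 'headline', 'urn.value', 'urn'}
    record = {"urn.value": "abc123", "name": "Acme", "headline": "New", "location": "Berlin"}
    filtered = {k: v for k, v in record.items() if k in allowed}
    print(filtered)   # 'location' is dropped; the key and requested fields are kept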
anysite/dataset/models.py CHANGED
@@ -81,6 +81,11 @@ class DbLoadConfig(BaseModel):
     """Configuration for loading a source into a relational database."""

     table: str | None = Field(default=None, description="Override table name (default: source id)")
+    key: str | None = Field(default=None, description="Unique key field for diff-based DB sync (e.g., urn.value)")
+    sync: Literal["full", "append"] = Field(
+        default="full",
+        description="Sync mode: 'full' applies INSERT/DELETE/UPDATE, 'append' skips DELETE (keeps old records)",
+    )
     fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
     exclude: list[str] = Field(
         default_factory=lambda: ["_input_value", "_parent_source"],
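Since DbLoadConfig is a pydantic model, the new options can also be constructed programmatically; a minimal sketch with illustrative values (the YAML equivalent appears in the README excerpt further down):

    from anysite.dataset.models import DbLoadConfig

    cfg = DbLoadConfig(key="urn.value", sync="append", fields=["name", "url", "headline"])
    print(cfg.sync)  # append; a value outside "full"/"append" would fail validation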
anysite/dataset/storage.py CHANGED
@@ -75,7 +75,7 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
         tables = [pq.read_table(f) for f in files]
         import pyarrow as pa

-        table = pa.concat_tables(tables)
+        table = pa.concat_tables(tables, promote_options="permissive")
     else:
         if not path.exists():
             return []
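The promote_options="permissive" argument lets snapshots whose schemas drifted (a column added or dropped between collection dates) still be concatenated, whereas the default option raises on mismatched schemas. A small self-contained illustration:

    import pyarrow as pa

    t1 = pa.table({"id": [1, 2], "count": [10, 20]})
    t2 = pa.table({"id": [3]})  # 'count' column missing in the newer snapshot
    merged = pa.concat_tables([t1, t2], promote_options="permissive")
    print(merged.column_names)  # ['id', 'count'], with nulls filling the gap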
@@ -84,6 +84,26 @@ def read_parquet(path: Path) -> list[dict[str, Any]]:
     return table.to_pylist()


+def read_latest_parquet(path: Path) -> list[dict[str, Any]]:
+    """Read records from the most recent Parquet snapshot in a directory.
+
+    Unlike ``read_parquet(dir)``, this reads only the latest file, avoiding
+    schema mismatch errors when snapshots have different column types.
+
+    Args:
+        path: Directory containing dated .parquet files.
+
+    Returns:
+        List of dicts from the newest snapshot, or [] if none found.
+    """
+    if not path.is_dir():
+        return read_parquet(path)
+    files = sorted(path.glob("*.parquet"))
+    if not files:
+        return []
+    return read_parquet(files[-1])
+
+
 def get_source_dir(base_path: Path, source_id: str) -> Path:
     """Get the raw data directory for a source."""
     return base_path / "raw" / source_id
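Usage sketch, assuming anysite-cli 0.1.5 is installed and a directory of dated snapshots like the one shown (paths are illustrative):

    from pathlib import Path
    from anysite.dataset.storage import read_latest_parquet, read_parquet

    source_dir = Path("data/raw/employees")
    # data/raw/employees/2026-01-14.parquet
    # data/raw/employees/2026-01-15.parquet   <- only this file is read
    latest_records = read_latest_parquet(source_dir)
    all_records = read_parquet(source_dir)    # by contrast, concatenates every snapshot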
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: anysite-cli
-Version: 0.1.3
+Version: 0.1.5
 Summary: CLI for Anysite API - web data extraction for humans and AI agents
 Project-URL: Homepage, https://anysite.io
 Project-URL: Documentation, https://docs.anysite.io/cli
@@ -259,6 +259,8 @@ sources:
       path: ./output/companies-{{date}}.csv
       format: csv
     db_load:
+      key: _input_value  # Unique key for incremental sync
+      sync: full         # full (default) or append (no DELETE)
       fields: [name, url, employee_count]

   - id: employees
@@ -274,6 +276,8 @@ sources:
       count: 5
     refresh: always  # Re-collect every run with --incremental
     db_load:
+      key: urn.value  # Unique key for incremental sync
+      sync: append    # Keep old records (no DELETE on diff)
      fields: [name, url, headline]

 storage:
@@ -318,9 +322,15 @@ anysite dataset query dataset.yaml --interactive
 anysite dataset stats dataset.yaml --source companies
 anysite dataset profile dataset.yaml

-# Load into PostgreSQL with automatic FK linking
+# Load into PostgreSQL with automatic FK linking (incremental sync with db_load.key)
+anysite dataset load-db dataset.yaml -c pg
+
+# Drop and reload from latest snapshot
 anysite dataset load-db dataset.yaml -c pg --drop-existing

+# Load a specific snapshot date
+anysite dataset load-db dataset.yaml -c pg --snapshot 2026-01-15
+
 # Run history and logs
 anysite dataset history my-dataset
 anysite dataset logs my-dataset --run 42
@@ -328,8 +338,9 @@ anysite dataset logs my-dataset --run 42
 # Generate cron/systemd schedule
 anysite dataset schedule dataset.yaml --incremental --load-db pg

-# Compare snapshots (diff two collection dates)
+# Compare snapshots (diff two collection dates, supports dot-notation keys)
 anysite dataset diff dataset.yaml --source employees --key _input_value
+anysite dataset diff dataset.yaml --source profiles --key urn.value --fields "name,headline"

 # Reset incremental state
 anysite dataset reset-cursor dataset.yaml
@@ -19,17 +19,17 @@ anysite/config/paths.py,sha256=EmHJD8wlf4Q9IUn8Gp1JQ8Z3ffrIYAt5iHRyImQOf5I,1087
 anysite/config/settings.py,sha256=Hc0j_aCCtkJeL4nHw-EFyfJ8WEDk57G08iNUFquUhpM,5235
 anysite/dataset/__init__.py,sha256=J0sKQkGwVOPtvp6pka7LcdeUEADvjWRs71yRuROzJxI,847
 anysite/dataset/analyzer.py,sha256=8dsPW32SbSaUTy1F0NIed1U45wjiMgQeJ2iWX7hBxRQ,9245
-anysite/dataset/cli.py,sha256=zaCo0kKeA1KNU7EZgW4WwxrP07xuKayPlolfUnCSoYI,22801
-anysite/dataset/collector.py,sha256=6CfJt8fKZZ2xvZWJ7jwnx0V9BnjoJxmBZkm8xWQiU54,23840
-anysite/dataset/db_loader.py,sha256=nlMJrDJiGBX5H1StcjsontSxLXbsFe4rwOEnDehzpk8,8443
-anysite/dataset/differ.py,sha256=hbUwoS73syTkrj0VC0gaJzuB0pVCoQXQMbsNXtpsig8,11634
+anysite/dataset/cli.py,sha256=rEWK1ka-YQ_Vbbj2nMaMYTD9g3wa3ethUWSoaWRSGTY,23066
+anysite/dataset/collector.py,sha256=ZdR3CmQQew_iuJpNtJ4knSrjt0hvkEL4WIaS0IKEkwQ,23927
+anysite/dataset/db_loader.py,sha256=ksvRt-VJISL4Syk2O1-TTkOMj1uGzk7aQARYS2n--U4,13751
+anysite/dataset/differ.py,sha256=jB_VWTb7UuEBWG9nv1ry5xeo9hmWdhA_cTm6Ed43_Uw,17746
 anysite/dataset/errors.py,sha256=r8cZXoIzSeTGCWpeYjntnN0AduCu74YZyWs3sFu17J4,914
 anysite/dataset/exporters.py,sha256=mA2FYbYJbHfrwkXbHDu4g5qPG_JJKnkVciXFKPkF1Vw,3708
 anysite/dataset/history.py,sha256=avFs0ADlM7Hr-ttqC1FfjJiQxvQP20sScM7ZoY4lvU0,5471
-anysite/dataset/models.py,sha256=_f1cg9A4FlQwWGpg-s0b9q5WMlaIRN-ENlpU9CE6mrQ,9695
+anysite/dataset/models.py,sha256=d-bkgu2dUY7_VSgH-oVh84IV3X-KpxRfja0H5WnhauU,9998
 anysite/dataset/notifications.py,sha256=ORzo9XOgOxzLb7rk4pevlKPB_Taf-jejlrtmO4Zgl2c,2367
 anysite/dataset/scheduler.py,sha256=zpbA5tRUQZXr-9lZnG58dvE3E7ZBlAd-U-PTXExe9f0,3339
-anysite/dataset/storage.py,sha256=d03goKLI5NWKJowHwCgGqQkcVTO1NctPxMu-Xu-tru4,5326
+anysite/dataset/storage.py,sha256=ySY822m4lQd6Ip0i3VNPVbHEO6U6zBBwHi-56AXOaXE,5974
 anysite/dataset/transformer.py,sha256=XBI4MiZ_F_IZdootV0GAePaM9-pUadIte7RABbjBipc,6843
 anysite/db/__init__.py,sha256=xGGZHlMt5FUZjI6MAmf2VfyNLypOeXwrRL-gmuTsyl4,1117
 anysite/db/cli.py,sha256=fYuIKWq7eF5mAfZWnXNbtlpITnbYbOFMm2TqU54xIl4,22118
@@ -58,8 +58,8 @@ anysite/streaming/writer.py,sha256=HfMsC4umUdJuNIAPK57YAxEGyTwUmy-zNrqFkwY6aew,4
 anysite/utils/__init__.py,sha256=7SnbxpxKENK-2ecUL5NfnZ9okGI7COKYw4WF46172HM,23
 anysite/utils/fields.py,sha256=bSrHadzNmabL4qubqhXXZoWb_P8KA-3S7_FLVT8nGBc,7410
 anysite/utils/retry.py,sha256=89TbXvavi5t22P2mTYCLAS6SSZoW65gQ0nnYNbYAF0M,2684
-anysite_cli-0.1.3.dist-info/METADATA,sha256=lD_AF5pq5ayHerMVMMWTTkgccwWEsKLBGCwvPfZ5y_Y,11781
-anysite_cli-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-anysite_cli-0.1.3.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
-anysite_cli-0.1.3.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
-anysite_cli-0.1.3.dist-info/RECORD,,
+anysite_cli-0.1.5.dist-info/METADATA,sha256=B4HxyrTZxBbhMb17lb0LoRcne_cRehz8xNUYIvDraMA,12437
+anysite_cli-0.1.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+anysite_cli-0.1.5.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
+anysite_cli-0.1.5.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
+anysite_cli-0.1.5.dist-info/RECORD,,