anysite-cli 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anysite/dataset/cli.py CHANGED
@@ -357,6 +357,10 @@ def load_db(
357
357
  bool,
358
358
  typer.Option("--quiet", "-q", help="Suppress progress output"),
359
359
  ] = False,
360
+ snapshot: Annotated[
361
+ str | None,
362
+ typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
363
+ ] = None,
360
364
  ) -> None:
361
365
  """Load collected Parquet data into a relational database with FK linking."""
362
366
  config = _load_config(config_path)
@@ -379,6 +383,7 @@ def load_db(
379
383
  source_filter=source,
380
384
  drop_existing=drop_existing,
381
385
  dry_run=dry_run,
386
+ snapshot=snapshot,
382
387
  )
383
388
  except Exception as e:
384
389
  typer.echo(f"Load error: {e}", err=True)
@@ -519,7 +524,11 @@ def diff_cmd(
519
524
  return
520
525
 
521
526
  # Format and output
522
- rows = format_diff_table(result) if format == "table" else format_diff_records(result)
527
+ rows = (
528
+ format_diff_table(result, output_fields=field_list)
529
+ if format == "table"
530
+ else format_diff_records(result, output_fields=field_list)
531
+ )
523
532
 
524
533
  _output_results(rows, format, output)
525
534
 
@@ -3,12 +3,18 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import json
6
+ import logging
7
+ from datetime import date
8
+ from pathlib import Path
6
9
  from typing import Any
7
10
 
8
11
  from anysite.dataset.models import DatasetConfig, DatasetSource
9
12
  from anysite.dataset.storage import get_source_dir, read_parquet
10
13
  from anysite.db.adapters.base import DatabaseAdapter
11
14
  from anysite.db.schema.inference import infer_table_schema
15
+ from anysite.db.utils.sanitize import sanitize_identifier
16
+
17
+ logger = logging.getLogger(__name__)
12
18
 
13
19
 
14
20
  def _get_dialect(adapter: DatabaseAdapter) -> str:
@@ -86,15 +92,31 @@ def _filter_record(
86
92
  return {k: v for k, v in record.items() if k not in exclude}
87
93
 
88
94
 
95
+ def _get_latest_parquet(base_path: Path, source_id: str) -> Path | None:
96
+ """Return the path to the most recent snapshot for a source."""
97
+ source_dir = get_source_dir(base_path, source_id)
98
+ if not source_dir.exists():
99
+ return None
100
+ files = sorted(source_dir.glob("*.parquet"))
101
+ return files[-1] if files else None
102
+
103
+
104
+ def _get_snapshot_for_date(base_path: Path, source_id: str, d: date) -> Path | None:
105
+ """Return the parquet path for a specific snapshot date."""
106
+ source_dir = get_source_dir(base_path, source_id)
107
+ path = source_dir / f"{d.isoformat()}.parquet"
108
+ return path if path.exists() else None
109
+
110
+
89
111
  class DatasetDbLoader:
90
112
  """Load dataset Parquet data into a relational database.
91
113
 
92
- Handles:
93
- - Schema inference from Parquet records
94
- - Auto-increment primary keys (``id`` column)
95
- - Foreign key linking via provenance ``_input_value`` column
96
- - Dot-notation field extraction for JSON columns
97
- - Topological loading order (parents before children)
114
+ Supports diff-based incremental sync when ``db_load.key`` is configured:
115
+ compares the two most recent snapshots and applies INSERT/DELETE/UPDATE
116
+ to keep the database in sync.
117
+
118
+ Falls back to full INSERT of the latest snapshot when no key is set
119
+ or when the table doesn't exist yet.
98
120
  """
99
121
 
100
122
  def __init__(
@@ -115,16 +137,18 @@ class DatasetDbLoader:
115
137
  source_filter: str | None = None,
116
138
  drop_existing: bool = False,
117
139
  dry_run: bool = False,
140
+ snapshot: str | None = None,
118
141
  ) -> dict[str, int]:
119
142
  """Load all sources into the database in dependency order.
120
143
 
121
144
  Args:
122
145
  source_filter: Only load this source (and dependencies).
123
- drop_existing: Drop tables before creating.
146
+ drop_existing: Drop tables before creating, then full INSERT latest.
124
147
  dry_run: Show plan without executing.
148
+ snapshot: Load a specific snapshot date (YYYY-MM-DD).
125
149
 
126
150
  Returns:
127
- Mapping of source_id to number of rows loaded.
151
+ Mapping of source_id to number of rows loaded/affected.
128
152
  """
129
153
  sources = self.config.topological_sort()
130
154
 
@@ -139,6 +163,7 @@ class DatasetDbLoader:
139
163
  source,
140
164
  drop_existing=drop_existing,
141
165
  dry_run=dry_run,
166
+ snapshot=snapshot,
142
167
  )
143
168
  results[source.id] = count
144
169
 
@@ -150,18 +175,64 @@ class DatasetDbLoader:
150
175
  *,
151
176
  drop_existing: bool = False,
152
177
  dry_run: bool = False,
178
+ snapshot: str | None = None,
153
179
  ) -> int:
154
- """Load a single source into the database."""
155
- source_dir = get_source_dir(self.base_path, source.id)
156
- if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
180
+ """Load a single source into the database.
181
+
182
+ Strategy:
183
+ 1. ``drop_existing``: drop table → full INSERT of latest snapshot
184
+ 2. ``snapshot``: full INSERT of that specific snapshot
185
+ 3. Table doesn't exist: full INSERT of latest snapshot
186
+ 4. Table exists + ``db_load.key`` set + ≥2 snapshots: diff-based sync
187
+ 5. Fallback: full INSERT of latest snapshot
188
+ """
189
+ table_name = _table_name_for(source)
190
+
191
+ # Handle drop_existing
192
+ if drop_existing and self.adapter.table_exists(table_name):
193
+ self.adapter.execute(f"DROP TABLE {table_name}")
194
+
195
+ # Determine which parquet to load
196
+ if snapshot:
197
+ snapshot_date = date.fromisoformat(snapshot)
198
+ parquet_path = _get_snapshot_for_date(self.base_path, source.id, snapshot_date)
199
+ if parquet_path is None:
200
+ return 0
201
+ return self._full_insert(source, table_name, parquet_path, dry_run=dry_run)
202
+
203
+ # Check if we can do diff-based sync
204
+ diff_key = source.db_load.key if source.db_load else None
205
+ table_exists = self.adapter.table_exists(table_name)
206
+
207
+ if diff_key and table_exists and not drop_existing:
208
+ from anysite.dataset.differ import DatasetDiffer
209
+ differ = DatasetDiffer(self.base_path)
210
+ dates = differ.available_dates(source.id)
211
+
212
+ if len(dates) >= 2:
213
+ return self._diff_sync(
214
+ source, table_name, diff_key, differ, dates, dry_run=dry_run
215
+ )
216
+
217
+ # Fallback: full INSERT of latest snapshot
218
+ latest = _get_latest_parquet(self.base_path, source.id)
219
+ if latest is None:
157
220
  return 0
221
+ return self._full_insert(source, table_name, latest, dry_run=dry_run)
158
222
 
159
- raw_records = read_parquet(source_dir)
223
+ def _full_insert(
224
+ self,
225
+ source: DatasetSource,
226
+ table_name: str,
227
+ parquet_path: Path,
228
+ *,
229
+ dry_run: bool = False,
230
+ ) -> int:
231
+ """Full INSERT: read parquet, transform, create table if needed, insert all rows."""
232
+ raw_records = read_parquet(parquet_path)
160
233
  if not raw_records:
161
234
  return 0
162
235
 
163
- table_name = _table_name_for(source)
164
-
165
236
  # Determine parent info for FK linking
166
237
  parent_source_id = None
167
238
  parent_fk_col = None
@@ -174,7 +245,6 @@ class DatasetDbLoader:
174
245
  for record in raw_records:
175
246
  row = _filter_record(record, source)
176
247
 
177
- # Add FK column if this is a dependent source
178
248
  if parent_source_id and parent_fk_col:
179
249
  input_val = record.get("_input_value")
180
250
  parent_map = self._value_to_id.get(parent_source_id, {})
@@ -189,17 +259,12 @@ class DatasetDbLoader:
189
259
  return len(rows)
190
260
 
191
261
  # Determine the lookup field for children to reference this source
192
- # This is the field that child dependencies extract from this source
193
262
  lookup_field = self._get_child_lookup_field(source)
194
263
 
195
- # Create table
196
- if drop_existing and self.adapter.table_exists(table_name):
197
- self.adapter.execute(f"DROP TABLE {table_name}")
198
-
264
+ # Create table if needed
199
265
  if not self.adapter.table_exists(table_name):
200
266
  schema = infer_table_schema(table_name, rows)
201
267
  sql_types = schema.to_sql_types(self._dialect)
202
- # Add auto-increment id column
203
268
  col_defs = {"id": self._auto_id_type()}
204
269
  col_defs.update(sql_types)
205
270
  self.adapter.create_table(table_name, col_defs, primary_key="id")
@@ -208,10 +273,8 @@ class DatasetDbLoader:
208
273
  value_map: dict[str, int] = {}
209
274
  for i, row in enumerate(rows):
210
275
  self.adapter.insert_batch(table_name, [row])
211
- # Get the last inserted id
212
276
  last_id = self._get_last_id(table_name)
213
277
 
214
- # Build value→id map for child sources
215
278
  if lookup_field and last_id is not None:
216
279
  raw_record = raw_records[i]
217
280
  lookup_val = _extract_dot_value(raw_record, lookup_field)
@@ -225,6 +288,82 @@ class DatasetDbLoader:
225
288
 
226
289
  return len(rows)
227
290
 
291
+ def _diff_sync(
292
+ self,
293
+ source: DatasetSource,
294
+ table_name: str,
295
+ diff_key: str,
296
+ differ: Any,
297
+ dates: list[date],
298
+ *,
299
+ dry_run: bool = False,
300
+ ) -> int:
301
+ """Diff-based incremental sync: compare two most recent snapshots, apply delta."""
302
+ result = differ.diff(source.id, diff_key)
303
+ total = 0
304
+
305
+ if dry_run:
306
+ return len(result.added) + len(result.removed) + len(result.changed)
307
+
308
+ # Extract key value from a record (handles dot-notation)
309
+ def _get_key_val(record: dict[str, Any]) -> Any:
310
+ if "." in diff_key:
311
+ return _extract_dot_value(record, diff_key)
312
+ return record.get(diff_key)
313
+
314
+ # Determine the DB column name for the key
315
+ db_key_col = diff_key.replace(".", "_")
316
+
317
+ # INSERT added records
318
+ if result.added:
319
+ for record in result.added:
320
+ row = _filter_record(record, source)
321
+ self.adapter.insert_batch(table_name, [row])
322
+ total += 1
323
+
324
+ # DELETE removed records
325
+ if result.removed:
326
+ safe_col = sanitize_identifier(db_key_col)
327
+ for record in result.removed:
328
+ key_val = _get_key_val(record)
329
+ if key_val is not None:
330
+ self.adapter.execute(
331
+ f"DELETE FROM {table_name} WHERE {safe_col} = ?",
332
+ (str(key_val),),
333
+ )
334
+ total += 1
335
+
336
+ # UPDATE changed records
337
+ if result.changed:
338
+ safe_col = sanitize_identifier(db_key_col)
339
+ for record in result.changed:
340
+ key_val = _get_key_val(record)
341
+ if key_val is None:
342
+ continue
343
+ changed_fields = record.get("_changed_fields", [])
344
+ if not changed_fields:
345
+ continue
346
+
347
+ # Build SET clause from changed fields
348
+ set_parts = []
349
+ params: list[Any] = []
350
+ for field_name in changed_fields:
351
+ new_val = record.get(field_name)
352
+ safe_field = sanitize_identifier(field_name)
353
+ set_parts.append(f"{safe_field} = ?")
354
+ params.append(new_val)
355
+
356
+ params.append(str(key_val))
357
+ sql = (
358
+ f"UPDATE {table_name} "
359
+ f"SET {', '.join(set_parts)} "
360
+ f"WHERE {safe_col} = ?"
361
+ )
362
+ self.adapter.execute(sql, tuple(params))
363
+ total += 1
364
+
365
+ return total
366
+
228
367
  def _get_child_lookup_field(self, source: DatasetSource) -> str | None:
229
368
  """Find which field children use to reference this source."""
230
369
  for other in self.config.sources:
anysite/dataset/differ.py CHANGED
@@ -12,6 +12,31 @@ from anysite.dataset.errors import DatasetError
12
12
  from anysite.dataset.storage import get_source_dir
13
13
 
14
14
 
15
+ def _build_key_expr(key: str, all_columns: list[str]) -> tuple[str, str]:
16
+ """Build a DuckDB key expression, supporting dot-notation for JSON fields.
17
+
18
+ Returns:
19
+ (key_expr, key_alias) — the SQL expression and a display alias.
20
+ For simple keys: ('"field"', 'field')
21
+ For dot-notation: ("json_extract_string(\"urn\", '$.value')", 'urn.value')
22
+ """
23
+ if "." not in key:
24
+ if key not in all_columns:
25
+ raise DatasetError(
26
+ f"Key field '{key}' not found. "
27
+ f"Available: {', '.join(all_columns)}"
28
+ )
29
+ return f'"{key}"', key
30
+
31
+ root, rest = key.split(".", 1)
32
+ if root not in all_columns:
33
+ raise DatasetError(
34
+ f"Root field '{root}' (from key '{key}') not found. "
35
+ f"Available: {', '.join(all_columns)}"
36
+ )
37
+ return f"json_extract_string(\"{root}\", '$.{rest}')", key
38
+
39
+
15
40
  @dataclass
16
41
  class DiffResult:
17
42
  """Result of comparing two dataset snapshots."""
@@ -24,6 +49,7 @@ class DiffResult:
24
49
  removed: list[dict[str, Any]] = field(default_factory=list)
25
50
  changed: list[dict[str, Any]] = field(default_factory=list)
26
51
  unchanged_count: int = 0
52
+ fields: list[str] | None = field(default=None)
27
53
 
28
54
  @property
29
55
  def has_changes(self) -> bool:
@@ -63,10 +89,11 @@ class DatasetDiffer:
63
89
 
64
90
  Args:
65
91
  source_id: Source to compare.
66
- key: Field to match records by (e.g., ``_input_value``, ``urn``).
92
+ key: Field to match records by. Supports dot-notation for
93
+ JSON fields (e.g., ``urn.value``).
67
94
  from_date: Older snapshot date (default: second-to-last).
68
95
  to_date: Newer snapshot date (default: latest).
69
- fields: Only compare these fields (default: all).
96
+ fields: Only compare (and output) these fields (default: all).
70
97
 
71
98
  Returns:
72
99
  DiffResult with added, removed, changed lists.
@@ -153,50 +180,43 @@ class DatasetDiffer:
153
180
  info = conn.execute("DESCRIBE _new").fetchall()
154
181
  all_columns = [col[0] for col in info]
155
182
 
156
- if key not in all_columns:
157
- raise DatasetError(
158
- f"Key field '{key}' not found in {source_id}. "
159
- f"Available: {', '.join(all_columns)}"
160
- )
183
+ # Build key expression (supports dot-notation)
184
+ key_expr, key_alias = _build_key_expr(key, all_columns)
161
185
 
162
186
  # Determine which fields to compare
163
187
  compare_fields = fields if fields else [
164
- c for c in all_columns if c != key
188
+ c for c in all_columns if c != key and c != key.split(".")[0]
165
189
  ]
166
190
  # Filter to fields that actually exist
167
191
  compare_fields = [c for c in compare_fields if c in all_columns]
168
192
 
169
- quoted_key = f'"{key}"'
193
+ # Determine output columns: if fields specified, restrict to key + fields
194
+ if fields:
195
+ output_columns = [key_alias] + [
196
+ f for f in fields if f in all_columns
197
+ ]
198
+ else:
199
+ output_columns = None # all columns
170
200
 
171
201
  # Added: in new but not in old
172
- added = conn.execute(
173
- f"SELECT * FROM _new "
174
- f"WHERE {quoted_key} NOT IN (SELECT {quoted_key} FROM _old)"
175
- ).fetchall()
176
- added_cols = [d[0] for d in conn.execute(
177
- "DESCRIBE _new"
178
- ).fetchall()]
179
- added_records = [dict(zip(added_cols, row, strict=False)) for row in added]
202
+ added_records = self._query_added_removed(
203
+ conn, "_new", "_old", key_expr, key_alias, all_columns, output_columns
204
+ )
180
205
 
181
206
  # Removed: in old but not in new
182
- removed = conn.execute(
183
- f"SELECT * FROM _old "
184
- f"WHERE {quoted_key} NOT IN (SELECT {quoted_key} FROM _new)"
185
- ).fetchall()
186
- removed_cols = [d[0] for d in conn.execute(
187
- "DESCRIBE _old"
188
- ).fetchall()]
189
- removed_records = [dict(zip(removed_cols, row, strict=False)) for row in removed]
207
+ removed_records = self._query_added_removed(
208
+ conn, "_old", "_new", key_expr, key_alias, all_columns, output_columns
209
+ )
190
210
 
191
211
  # Changed: matching key, different values
192
212
  changed_records = self._find_changed(
193
- conn, key, compare_fields, all_columns
213
+ conn, key_expr, key_alias, compare_fields, all_columns, output_columns
194
214
  )
195
215
 
196
216
  # Count unchanged
197
217
  total_matched = conn.execute(
198
218
  f"SELECT COUNT(*) FROM _new n "
199
- f"JOIN _old o ON n.{quoted_key} = o.{quoted_key}"
219
+ f"JOIN _old o ON ({_requalify(key_expr, 'n')}) = ({_requalify(key_expr, 'o')})"
200
220
  ).fetchone()
201
221
  matched_count = total_matched[0] if total_matched else 0
202
222
  unchanged_count = matched_count - len(changed_records)
@@ -210,23 +230,59 @@ class DatasetDiffer:
210
230
  removed=removed_records,
211
231
  changed=changed_records,
212
232
  unchanged_count=unchanged_count,
233
+ fields=fields,
213
234
  )
214
235
  finally:
215
236
  conn.close()
216
237
 
238
+ @staticmethod
239
+ def _query_added_removed(
240
+ conn: Any,
241
+ present_view: str,
242
+ absent_view: str,
243
+ key_expr: str,
244
+ key_alias: str,
245
+ all_columns: list[str],
246
+ output_columns: list[str] | None,
247
+ ) -> list[dict[str, Any]]:
248
+ """Query records present in one view but not the other."""
249
+ # Build SELECT list
250
+ if output_columns:
251
+ select_parts = []
252
+ for col in output_columns:
253
+ if col == key_alias and "." in col:
254
+ select_parts.append(f"{key_expr} AS \"{key_alias}\"")
255
+ else:
256
+ select_parts.append(f'"{col}"')
257
+ select_clause = ", ".join(select_parts)
258
+ else:
259
+ if "." in key_alias:
260
+ select_clause = f"*, {key_expr} AS \"{key_alias}\""
261
+ else:
262
+ select_clause = "*"
263
+
264
+ sql = (
265
+ f"SELECT {select_clause} FROM {present_view} "
266
+ f"WHERE ({key_expr}) NOT IN (SELECT ({key_expr}) FROM {absent_view})"
267
+ )
268
+ result = conn.execute(sql)
269
+ columns = [desc[0] for desc in result.description]
270
+ rows = result.fetchall()
271
+ return [dict(zip(columns, row, strict=False)) for row in rows]
272
+
273
+ @staticmethod
217
274
  def _find_changed(
218
- self,
219
275
  conn: Any,
220
- key: str,
276
+ key_expr: str,
277
+ key_alias: str,
221
278
  compare_fields: list[str],
222
279
  all_columns: list[str],
280
+ output_columns: list[str] | None,
223
281
  ) -> list[dict[str, Any]]:
224
282
  """Find records that exist in both snapshots but have different values."""
225
283
  if not compare_fields:
226
284
  return []
227
285
 
228
- quoted_key = f'"{key}"'
229
-
230
286
  # Build WHERE clause: any compared field differs
231
287
  where_parts = []
232
288
  for col in compare_fields:
@@ -234,21 +290,43 @@ class DatasetDiffer:
234
290
  where_parts.append(f"n.{qc} IS DISTINCT FROM o.{qc}")
235
291
  where_clause = " OR ".join(where_parts)
236
292
 
237
- # Select new values + old values for compared fields
238
- select_parts = [f"n.{quoted_key}"]
239
- for col in all_columns:
240
- if col != key:
241
- qc = f'"{col}"'
242
- select_parts.append(f"n.{qc}")
243
- for col in compare_fields:
244
- qc = f'"{col}"'
245
- select_parts.append(f"o.{qc} AS \"{col}__old\"")
293
+ # Build JOIN condition
294
+ join_key_n = _requalify(key_expr, "n")
295
+ join_key_o = _requalify(key_expr, "o")
296
+ join_cond = f"({join_key_n}) = ({join_key_o})"
297
+
298
+ # Build SELECT: key + output fields + __old for compare fields
299
+ if output_columns:
300
+ # Restricted output
301
+ select_parts = []
302
+ for col in output_columns:
303
+ if col == key_alias and "." in col:
304
+ select_parts.append(f"{_requalify(key_expr, 'n')} AS \"{key_alias}\"")
305
+ else:
306
+ select_parts.append(f"n.\"{col}\"")
307
+ for col in compare_fields:
308
+ # Include __old for compare fields that are in output
309
+ if col in [c for c in output_columns if c != key_alias]:
310
+ select_parts.append(f"o.\"{col}\" AS \"{col}__old\"")
311
+ else:
312
+ # Full output
313
+ select_parts = []
314
+ if "." in key_alias:
315
+ select_parts.append(f"{_requalify(key_expr, 'n')} AS \"{key_alias}\"")
316
+ else:
317
+ select_parts.append(f"n.\"{key_alias}\"")
318
+ for col in all_columns:
319
+ if col == key_alias:
320
+ continue
321
+ select_parts.append(f"n.\"{col}\"")
322
+ for col in compare_fields:
323
+ select_parts.append(f"o.\"{col}\" AS \"{col}__old\"")
246
324
 
247
325
  select_clause = ", ".join(select_parts)
248
326
 
249
327
  sql = (
250
328
  f"SELECT {select_clause} FROM _new n "
251
- f"JOIN _old o ON n.{quoted_key} = o.{quoted_key} "
329
+ f"JOIN _old o ON {join_cond} "
252
330
  f"WHERE {where_clause}"
253
331
  )
254
332
 
@@ -271,6 +349,24 @@ class DatasetDiffer:
271
349
  return records
272
350
 
273
351
 
352
+ def _requalify(key_expr: str, prefix: str) -> str:
353
+ """Requalify a key expression with a table alias prefix.
354
+
355
+ For simple keys like '"field"', returns 'prefix."field"'.
356
+ For json_extract_string("col", '$.path'), returns
357
+ json_extract_string(prefix."col", '$.path').
358
+ """
359
+ if key_expr.startswith("json_extract_string("):
360
+ # Replace the column reference inside json_extract_string
361
+ inner = key_expr[len("json_extract_string("):]
362
+ # inner looks like: "col", '$.path')
363
+ col_end = inner.index(",")
364
+ col = inner[:col_end].strip()
365
+ rest = inner[col_end:]
366
+ return f"json_extract_string({prefix}.{col}{rest}"
367
+ return f"{prefix}.{key_expr}"
368
+
369
+
274
370
  def _values_differ(a: Any, b: Any) -> bool:
275
371
  """Compare two values, treating JSON strings as equivalent to their parsed form."""
276
372
  if a == b:
@@ -284,21 +380,30 @@ def _values_differ(a: Any, b: Any) -> bool:
284
380
  return True
285
381
 
286
382
 
287
- def format_diff_table(result: DiffResult) -> list[dict[str, Any]]:
383
+ def format_diff_table(
384
+ result: DiffResult,
385
+ *,
386
+ output_fields: list[str] | None = None,
387
+ ) -> list[dict[str, Any]]:
288
388
  """Format a DiffResult into a flat list of dicts for table/json output.
289
389
 
290
390
  Each record gets a ``_diff`` column with value ``added``, ``removed``,
291
391
  or ``changed``. For changed records in table mode, modified field
292
392
  values are formatted as ``old → new``.
393
+
394
+ Args:
395
+ result: The diff result.
396
+ output_fields: If set, only include these fields (plus ``_diff`` and key).
293
397
  """
398
+ allowed = _build_allowed_set(result.key, output_fields)
294
399
  rows: list[dict[str, Any]] = []
295
400
 
296
401
  for record in result.added:
297
- row = {"_diff": "added", **record}
402
+ row = {"_diff": "added", **_filter_row(record, allowed)}
298
403
  rows.append(row)
299
404
 
300
405
  for record in result.removed:
301
- row = {"_diff": "removed", **record}
406
+ row = {"_diff": "removed", **_filter_row(record, allowed)}
302
407
  rows.append(row)
303
408
 
304
409
  for record in result.changed:
@@ -309,6 +414,8 @@ def format_diff_table(result: DiffResult) -> list[dict[str, Any]]:
309
414
  continue
310
415
  if k.endswith("__old"):
311
416
  continue
417
+ if allowed and k not in allowed:
418
+ continue
312
419
  # For changed fields, format as "old → new"
313
420
  if k in changed_fields:
314
421
  old_val = record.get(f"{k}__old")
@@ -320,31 +427,65 @@ def format_diff_table(result: DiffResult) -> list[dict[str, Any]]:
320
427
  return rows
321
428
 
322
429
 
323
- def format_diff_records(result: DiffResult) -> list[dict[str, Any]]:
430
+ def format_diff_records(
431
+ result: DiffResult,
432
+ *,
433
+ output_fields: list[str] | None = None,
434
+ ) -> list[dict[str, Any]]:
324
435
  """Format a DiffResult for JSON/CSV output.
325
436
 
326
437
  Each record gets ``_diff`` column. Changed records include both
327
438
  current values and ``field__old`` columns.
439
+
440
+ Args:
441
+ result: The diff result.
442
+ output_fields: If set, only include these fields (plus ``_diff``, key, and ``__old``).
328
443
  """
444
+ allowed = _build_allowed_set(result.key, output_fields)
329
445
  rows: list[dict[str, Any]] = []
330
446
 
331
447
  for record in result.added:
332
- rows.append({"_diff": "added", **record})
448
+ rows.append({"_diff": "added", **_filter_row(record, allowed)})
333
449
 
334
450
  for record in result.removed:
335
- rows.append({"_diff": "removed", **record})
451
+ rows.append({"_diff": "removed", **_filter_row(record, allowed)})
336
452
 
337
453
  for record in result.changed:
338
- row = {"_diff": "changed"}
454
+ row: dict[str, Any] = {"_diff": "changed"}
339
455
  for k, v in record.items():
340
456
  if k == "_changed_fields":
341
457
  continue
458
+ if allowed and k not in allowed and not k.endswith("__old"):
459
+ continue
460
+ if k.endswith("__old") and allowed:
461
+ base = k[: -len("__old")]
462
+ if base not in allowed:
463
+ continue
342
464
  row[k] = v
343
465
  rows.append(row)
344
466
 
345
467
  return rows
346
468
 
347
469
 
470
+ def _build_allowed_set(key: str, output_fields: list[str] | None) -> set[str] | None:
471
+ """Build the set of allowed field names for output filtering."""
472
+ if not output_fields:
473
+ return None
474
+ allowed = set(output_fields)
475
+ allowed.add(key)
476
+ # Also add the root column for dot-notation keys
477
+ if "." in key:
478
+ allowed.add(key.split(".")[0])
479
+ return allowed
480
+
481
+
482
+ def _filter_row(record: dict[str, Any], allowed: set[str] | None) -> dict[str, Any]:
483
+ """Filter a record to only allowed fields."""
484
+ if not allowed:
485
+ return record
486
+ return {k: v for k, v in record.items() if k in allowed}
487
+
488
+
348
489
  def _format_val(v: Any) -> str:
349
490
  """Format a value for display, truncating long strings."""
350
491
  if v is None:
anysite/dataset/models.py CHANGED
@@ -81,6 +81,7 @@ class DbLoadConfig(BaseModel):
81
81
  """Configuration for loading a source into a relational database."""
82
82
 
83
83
  table: str | None = Field(default=None, description="Override table name (default: source id)")
84
+ key: str | None = Field(default=None, description="Unique key field for diff-based DB sync (e.g., urn.value)")
84
85
  fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
85
86
  exclude: list[str] = Field(
86
87
  default_factory=lambda: ["_input_value", "_parent_source"],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: anysite-cli
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: CLI for Anysite API - web data extraction for humans and AI agents
5
5
  Project-URL: Homepage, https://anysite.io
6
6
  Project-URL: Documentation, https://docs.anysite.io/cli
@@ -19,14 +19,14 @@ anysite/config/paths.py,sha256=EmHJD8wlf4Q9IUn8Gp1JQ8Z3ffrIYAt5iHRyImQOf5I,1087
19
19
  anysite/config/settings.py,sha256=Hc0j_aCCtkJeL4nHw-EFyfJ8WEDk57G08iNUFquUhpM,5235
20
20
  anysite/dataset/__init__.py,sha256=J0sKQkGwVOPtvp6pka7LcdeUEADvjWRs71yRuROzJxI,847
21
21
  anysite/dataset/analyzer.py,sha256=8dsPW32SbSaUTy1F0NIed1U45wjiMgQeJ2iWX7hBxRQ,9245
22
- anysite/dataset/cli.py,sha256=zaCo0kKeA1KNU7EZgW4WwxrP07xuKayPlolfUnCSoYI,22801
22
+ anysite/dataset/cli.py,sha256=rEWK1ka-YQ_Vbbj2nMaMYTD9g3wa3ethUWSoaWRSGTY,23066
23
23
  anysite/dataset/collector.py,sha256=6CfJt8fKZZ2xvZWJ7jwnx0V9BnjoJxmBZkm8xWQiU54,23840
24
- anysite/dataset/db_loader.py,sha256=nlMJrDJiGBX5H1StcjsontSxLXbsFe4rwOEnDehzpk8,8443
25
- anysite/dataset/differ.py,sha256=hbUwoS73syTkrj0VC0gaJzuB0pVCoQXQMbsNXtpsig8,11634
24
+ anysite/dataset/db_loader.py,sha256=TMcvI-pX-XctbkTdo5eTyW8Co4_3uK-dEdXn_r9g8Oc,13547
25
+ anysite/dataset/differ.py,sha256=b-qU5Laf8RkteZAlblKq4atTvnJ21W4QbxfpHBFYMJ8,17053
26
26
  anysite/dataset/errors.py,sha256=r8cZXoIzSeTGCWpeYjntnN0AduCu74YZyWs3sFu17J4,914
27
27
  anysite/dataset/exporters.py,sha256=mA2FYbYJbHfrwkXbHDu4g5qPG_JJKnkVciXFKPkF1Vw,3708
28
28
  anysite/dataset/history.py,sha256=avFs0ADlM7Hr-ttqC1FfjJiQxvQP20sScM7ZoY4lvU0,5471
29
- anysite/dataset/models.py,sha256=_f1cg9A4FlQwWGpg-s0b9q5WMlaIRN-ENlpU9CE6mrQ,9695
29
+ anysite/dataset/models.py,sha256=-Qnh6QvbN3nzlfsYqgCiYKBqOeLcJCYK_hYrmxVCRTA,9810
30
30
  anysite/dataset/notifications.py,sha256=ORzo9XOgOxzLb7rk4pevlKPB_Taf-jejlrtmO4Zgl2c,2367
31
31
  anysite/dataset/scheduler.py,sha256=zpbA5tRUQZXr-9lZnG58dvE3E7ZBlAd-U-PTXExe9f0,3339
32
32
  anysite/dataset/storage.py,sha256=d03goKLI5NWKJowHwCgGqQkcVTO1NctPxMu-Xu-tru4,5326
@@ -58,8 +58,8 @@ anysite/streaming/writer.py,sha256=HfMsC4umUdJuNIAPK57YAxEGyTwUmy-zNrqFkwY6aew,4
58
58
  anysite/utils/__init__.py,sha256=7SnbxpxKENK-2ecUL5NfnZ9okGI7COKYw4WF46172HM,23
59
59
  anysite/utils/fields.py,sha256=bSrHadzNmabL4qubqhXXZoWb_P8KA-3S7_FLVT8nGBc,7410
60
60
  anysite/utils/retry.py,sha256=89TbXvavi5t22P2mTYCLAS6SSZoW65gQ0nnYNbYAF0M,2684
61
- anysite_cli-0.1.3.dist-info/METADATA,sha256=lD_AF5pq5ayHerMVMMWTTkgccwWEsKLBGCwvPfZ5y_Y,11781
62
- anysite_cli-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
63
- anysite_cli-0.1.3.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
64
- anysite_cli-0.1.3.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
65
- anysite_cli-0.1.3.dist-info/RECORD,,
61
+ anysite_cli-0.1.4.dist-info/METADATA,sha256=w5DUgDWzJgXynKRogJVm9baLqTJVSrg0ciHuWfWa9l0,11781
62
+ anysite_cli-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
63
+ anysite_cli-0.1.4.dist-info/entry_points.txt,sha256=FDPxNasy0fRRcOgJdZRVP7Qw01C3TwRa1OwPJiskNyg,45
64
+ anysite_cli-0.1.4.dist-info/licenses/LICENSE,sha256=gVAxkI23CFm4x4HV_fkQYw_bGq93mQmVZEwxNs-YTa4,1069
65
+ anysite_cli-0.1.4.dist-info/RECORD,,