anysite_cli-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. anysite/__init__.py +4 -0
  2. anysite/__main__.py +6 -0
  3. anysite/api/__init__.py +21 -0
  4. anysite/api/client.py +271 -0
  5. anysite/api/errors.py +137 -0
  6. anysite/api/schemas.py +333 -0
  7. anysite/batch/__init__.py +1 -0
  8. anysite/batch/executor.py +176 -0
  9. anysite/batch/input.py +160 -0
  10. anysite/batch/rate_limiter.py +98 -0
  11. anysite/cli/__init__.py +1 -0
  12. anysite/cli/config.py +176 -0
  13. anysite/cli/executor.py +388 -0
  14. anysite/cli/options.py +249 -0
  15. anysite/config/__init__.py +11 -0
  16. anysite/config/paths.py +46 -0
  17. anysite/config/settings.py +187 -0
  18. anysite/dataset/__init__.py +37 -0
  19. anysite/dataset/analyzer.py +268 -0
  20. anysite/dataset/cli.py +644 -0
  21. anysite/dataset/collector.py +686 -0
  22. anysite/dataset/db_loader.py +248 -0
  23. anysite/dataset/errors.py +30 -0
  24. anysite/dataset/exporters.py +121 -0
  25. anysite/dataset/history.py +153 -0
  26. anysite/dataset/models.py +245 -0
  27. anysite/dataset/notifications.py +87 -0
  28. anysite/dataset/scheduler.py +107 -0
  29. anysite/dataset/storage.py +171 -0
  30. anysite/dataset/transformer.py +213 -0
  31. anysite/db/__init__.py +38 -0
  32. anysite/db/adapters/__init__.py +1 -0
  33. anysite/db/adapters/base.py +158 -0
  34. anysite/db/adapters/postgres.py +201 -0
  35. anysite/db/adapters/sqlite.py +183 -0
  36. anysite/db/cli.py +709 -0
  37. anysite/db/config.py +92 -0
  38. anysite/db/manager.py +166 -0
  39. anysite/db/operations/__init__.py +1 -0
  40. anysite/db/operations/insert.py +199 -0
  41. anysite/db/operations/query.py +43 -0
  42. anysite/db/schema/__init__.py +1 -0
  43. anysite/db/schema/inference.py +213 -0
  44. anysite/db/schema/types.py +71 -0
  45. anysite/db/utils/__init__.py +1 -0
  46. anysite/db/utils/sanitize.py +99 -0
  47. anysite/main.py +498 -0
  48. anysite/models/__init__.py +1 -0
  49. anysite/output/__init__.py +11 -0
  50. anysite/output/console.py +45 -0
  51. anysite/output/formatters.py +301 -0
  52. anysite/output/templates.py +76 -0
  53. anysite/py.typed +0 -0
  54. anysite/streaming/__init__.py +1 -0
  55. anysite/streaming/progress.py +121 -0
  56. anysite/streaming/writer.py +130 -0
  57. anysite/utils/__init__.py +1 -0
  58. anysite/utils/fields.py +242 -0
  59. anysite/utils/retry.py +109 -0
  60. anysite_cli-0.1.2.dist-info/METADATA +455 -0
  61. anysite_cli-0.1.2.dist-info/RECORD +64 -0
  62. anysite_cli-0.1.2.dist-info/WHEEL +4 -0
  63. anysite_cli-0.1.2.dist-info/entry_points.txt +2 -0
  64. anysite_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
anysite/dataset/collector.py
@@ -0,0 +1,686 @@
+ """Dataset collection orchestrator."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import contextlib
+ import logging
+ import time
+ from datetime import date
+ from pathlib import Path
+ from typing import Any
+
+ from anysite.api.client import create_client
+ from anysite.batch.executor import BatchExecutor
+ from anysite.batch.rate_limiter import RateLimiter
+ from anysite.cli.options import ErrorHandling
+ from anysite.dataset.errors import DatasetError
+ from anysite.dataset.models import DatasetConfig, DatasetSource
+ from anysite.dataset.storage import (
+     MetadataStore,
+     get_parquet_path,
+     read_parquet,
+     write_parquet,
+ )
+ from anysite.output.console import print_info, print_success, print_warning
+ from anysite.streaming.progress import ProgressTracker
+ from anysite.utils.fields import extract_field, parse_field_path
+
+ logger = logging.getLogger(__name__)
+
+
+ class CollectionPlan:
+     """Describes what will be collected without executing."""
+
+     def __init__(self) -> None:
+         self.steps: list[dict[str, Any]] = []
+
+     def add_step(
+         self,
+         source_id: str,
+         endpoint: str,
+         kind: str,
+         params: dict[str, Any] | None = None,
+         dependency: str | None = None,
+         estimated_requests: int | None = None,
+     ) -> None:
+         self.steps.append({
+             "source": source_id,
+             "endpoint": endpoint,
+             "kind": kind,
+             "params": params or {},
+             "dependency": dependency,
+             "estimated_requests": estimated_requests,
+         })
+
+
+ async def collect_dataset(
+     config: DatasetConfig,
+     *,
+     config_dir: Path | None = None,
+     source_filter: str | None = None,
+     incremental: bool = False,
+     dry_run: bool = False,
+     quiet: bool = False,
+ ) -> dict[str, int]:
+     """Collect all sources in a dataset.
+
+     Args:
+         config: Dataset configuration.
+         config_dir: Base directory for resolving relative input-file paths
+             (defaults to the current working directory).
+         source_filter: If set, only collect this source (and its dependencies).
+         incremental: Skip sources that already have data for today.
+         dry_run: Show plan without executing.
+         quiet: Suppress progress output.
+
+     Returns:
+         Dict mapping source_id to record count collected.
+     """
+     if config_dir is None:
+         config_dir = Path.cwd()
+     base_path = config.storage_path()
+     metadata = MetadataStore(base_path)
+     today = date.today()
+     start_time = time.monotonic()
+
+     # Get execution order
+     ordered = config.topological_sort()
+
+     # Filter to requested source (and its dependencies)
+     if source_filter:
+         ordered = _filter_sources(ordered, source_filter, config)
+
+     if dry_run:
+         plan = _build_plan(
+             ordered, config, base_path, metadata, incremental, today,
+             config_dir=config_dir,
+         )
+         return _print_plan(plan)
+
+     # Record run start in history
+     run_id: int | None = None
+     log_handler: logging.Handler | None = None
+     try:
+         from anysite.dataset.history import HistoryStore, LogManager
+
+         history = HistoryStore()
+         run_id = history.record_start(config.name)
+         log_mgr = LogManager()
+         log_handler = log_mgr.create_handler(config.name, run_id)
+         logging.getLogger("anysite").addHandler(log_handler)
+     except Exception:
+         pass
+
+     results: dict[str, int] = {}
+     total_records = 0
+     error_msg: str | None = None
+
+     try:
+         for source in ordered:
+             # Check incremental skip
+             if incremental:
+                 parquet_path = get_parquet_path(base_path, source.id, today)
+                 if parquet_path.exists():
+                     if not quiet:
+                         print_info(f"Skipping {source.id} (already collected today)")
+                     info = metadata.get_source_info(source.id)
+                     results[source.id] = info.get("record_count", 0) if info else 0
+                     continue
+
+             if not quiet:
+                 print_info(f"Collecting {source.id} from {source.endpoint}...")
+
+             if source.from_file is not None:
+                 file_base = config_dir if config_dir else Path.cwd()
+                 records = await _collect_from_file(
+                     source, config_dir=file_base,
+                     metadata=metadata, incremental=incremental, quiet=quiet,
+                 )
+             elif source.dependency is None:
+                 records = await _collect_independent(source)
+             else:
+                 records = await _collect_dependent(
+                     source, base_path,
+                     metadata=metadata, incremental=incremental, quiet=quiet,
+                 )
+
+             # Write FULL records to Parquet (preserves all fields for dependency resolution)
+             parquet_path = get_parquet_path(base_path, source.id, today)
+             count = write_parquet(records, parquet_path)
+             metadata.update_source(source.id, count, today)
+
+             # Apply per-source transform for exports only (does NOT affect Parquet)
+             export_records = records
+             if source.transform and records:
+                 from anysite.dataset.transformer import RecordTransformer
+
+                 transformer = RecordTransformer(source.transform)
+                 before = len(records)
+                 export_records = transformer.apply([dict(r) for r in records])
+                 if not quiet and len(export_records) != before:
+                     print_info(f" Transform: {before} -> {len(export_records)} records")
+
+             # Run per-source exports with transformed records
+             if source.export and export_records:
+                 from anysite.dataset.exporters import run_exports
+
+                 await run_exports(export_records, source.export, source.id, config.name)
+
+             # Track collected inputs for incremental dedup
+             if records:
+                 input_values = [
+                     r["_input_value"] for r in records if "_input_value" in r
+                 ]
+                 if input_values:
+                     metadata.update_collected_inputs(source.id, input_values)
+
+             results[source.id] = count
+             total_records += count
+
+             if not quiet:
+                 print_success(f"Collected {count} records for {source.id}")
+
+     except Exception as e:
+         error_msg = str(e)
+         raise
+     finally:
+         duration = time.monotonic() - start_time
+
+         # Record finish in history
+         if run_id is not None:
+             with contextlib.suppress(Exception):
+                 history.record_finish(  # type: ignore[possibly-undefined]
+                     run_id,
+                     status="failed" if error_msg else "success",
+                     record_count=total_records,
+                     source_count=len(results),
+                     error=error_msg,
+                     duration=duration,
+                 )
+
+         # Send notifications
+         if config.notifications:
+             try:
+                 from anysite.dataset.notifications import WebhookNotifier
+
+                 notifier = WebhookNotifier(config.notifications)
+                 if error_msg:
+                     await notifier.notify_failure(config.name, error_msg, duration)
+                 else:
+                     await notifier.notify_complete(
+                         config.name, total_records, len(results), duration,
+                     )
+             except Exception as ne:
+                 logger.error("Notification error: %s", ne)
+
+         # Remove log handler
+         if log_handler:
+             logging.getLogger("anysite").removeHandler(log_handler)
+             log_handler.close()
+
+     return results
+
+
+ async def _collect_independent(source: DatasetSource) -> list[dict[str, Any]]:
+     """Collect an independent source (single API call)."""
+     async with create_client() as client:
+         data = await client.post(source.endpoint, data=source.params)
+     # API returns list[dict] or dict
+     if isinstance(data, list):
+         return data
+     return [data] if isinstance(data, dict) else [{"data": data}]
+
+
+ async def _collect_from_file(
+     source: DatasetSource,
+     *,
+     config_dir: Path,
+     metadata: MetadataStore | None = None,
+     incremental: bool = False,
+     quiet: bool = False,
+ ) -> list[dict[str, Any]]:
+     """Collect a source by iterating over values from an input file."""
+     from anysite.batch.input import InputParser
+
+     if not source.from_file:
+         raise DatasetError(f"Source {source.id} has no from_file defined")
+     if not source.input_key:
+         raise DatasetError(f"Source {source.id} has from_file but no input_key defined")
+
+     file_path = Path(source.from_file)
+     if not file_path.is_absolute():
+         file_path = config_dir / file_path
+     if not file_path.exists():
+         raise DatasetError(f"Input file not found: {file_path}")
+
+     # Parse inputs from file
+     raw_inputs = InputParser.from_file(file_path)
+     if not raw_inputs:
+         if not quiet:
+             print_warning(f"No inputs found in {file_path}")
+         return []
+
+     # Extract specific field from CSV/JSONL dicts if file_field is set
+     values: list[str] = []
+     for inp in raw_inputs:
+         if isinstance(inp, dict) and source.file_field:
+             val = inp.get(source.file_field)
+             if val is not None:
+                 values.append(str(val))
+         elif isinstance(inp, str):
+             values.append(inp)
+         else:
+             values.append(str(inp))
+
+     if not values:
+         if not quiet:
+             print_warning(f"No values extracted from {file_path}")
+         return []
+
+     # Filter already-collected inputs in incremental mode
+     if incremental and metadata:
+         already = metadata.get_collected_inputs(source.id)
+         if already:
+             original = len(values)
+             values = [v for v in values if str(v) not in already]
+             if not quiet and original != len(values):
+                 print_info(f" Skipping {original - len(values)} already-collected inputs")
+
+     if not values:
+         if not quiet:
+             print_info(f" All inputs already collected for {source.id}")
+         return []
+
+     if not quiet:
+         print_info(f" Found {len(values)} inputs from {file_path.name}")
+
+     return await _collect_batch(source, values, quiet=quiet)
+
+
+ async def _collect_batch(
+     source: DatasetSource,
+     values: list[Any],
+     *,
+     parent_source: str | None = None,
+     quiet: bool = False,
+ ) -> list[dict[str, Any]]:
+     """Run batch API calls for a list of input values.
+
+     Each resulting record is annotated with provenance metadata:
+     - ``_input_value``: the raw value used to make the API call
+     - ``_parent_source``: the source ID that produced the input (if dependent)
+     """
+     limiter = RateLimiter(source.rate_limit) if source.rate_limit else None
+     on_error = ErrorHandling(source.on_error) if source.on_error else ErrorHandling.SKIP
+
+     tracker = ProgressTracker(
+         total=len(values),
+         description=f"Collecting {source.id}...",
+         quiet=quiet,
+     )
+
+     async def _fetch_one(val: str | dict[str, Any]) -> Any:
+         # Apply input_template if defined
+         if source.input_template:
+             input_val = _apply_template(source.input_template, val)
+             # If template returns a dict, use it as the full payload
+             # (merged with static params), not nested under input_key
+             if isinstance(input_val, dict):
+                 payload = {**source.params, **input_val}
+             else:
+                 payload = {source.input_key: input_val, **source.params}  # type: ignore[dict-item]
+         else:
+             payload = {source.input_key: val, **source.params}  # type: ignore[dict-item]
+         async with create_client() as client:
+             result = await client.post(source.endpoint, data=payload)
+
+         # Annotate each record with provenance metadata so that
+         # child→parent relationships can be reconstructed later.
+         records = _flatten_results([result])
+         for record in records:
+             record["_input_value"] = str(val)
+             if parent_source:
+                 record["_parent_source"] = parent_source
+         return records
+
+     executor = BatchExecutor(
+         func=_fetch_one,
+         parallel=source.parallel,
+         on_error=on_error,
+         rate_limiter=limiter,
+         progress_callback=tracker.update,
+     )
+
+     with tracker:
+         batch_result = await executor.execute(values)
+
+     # _fetch_one returns lists of annotated dicts, but BatchExecutor
+     # wraps non-dict returns as {"data": result}. Handle both forms.
+     all_records: list[dict[str, Any]] = []
+     for item in batch_result.results:
+         if isinstance(item, list):
+             all_records.extend(r for r in item if isinstance(r, dict))
+         elif isinstance(item, dict):
+             data = item.get("data")
+             if isinstance(data, list):
+                 all_records.extend(r for r in data if isinstance(r, dict))
+             elif isinstance(data, str):
+                 # JSON-serialized list from Parquet roundtrip
+                 import json
+
+                 try:
+                     parsed = json.loads(data)
+                     if isinstance(parsed, list):
+                         all_records.extend(r for r in parsed if isinstance(r, dict))
+                     else:
+                         all_records.append(item)
+                 except (json.JSONDecodeError, ValueError):
+                     all_records.append(item)
+             else:
+                 all_records.append(item)
+     return all_records
+
+
+ def _flatten_results(results: list[Any]) -> list[dict[str, Any]]:
+     """Flatten batch results into a flat list of dicts."""
+     all_records: list[dict[str, Any]] = []
+     for result in results:
+         if isinstance(result, list):
+             all_records.extend(r for r in result if isinstance(r, dict))
+         elif isinstance(result, dict):
+             data = result.get("data", result)
+             if isinstance(data, list):
+                 all_records.extend(r for r in data if isinstance(r, dict))
+             elif isinstance(data, dict):
+                 all_records.append(data)
+             else:
+                 all_records.append(result)
+     return all_records
+
+
+ async def _collect_dependent(
+     source: DatasetSource,
+     base_path: Path,
+     *,
+     metadata: MetadataStore | None = None,
+     incremental: bool = False,
+     quiet: bool = False,
+ ) -> list[dict[str, Any]]:
+     """Collect a dependent source by reading parent data and making per-value requests."""
+     dep = source.dependency
+     if dep is None:
+         raise DatasetError(f"Source {source.id} has no dependency defined")
+
+     # Read parent data
+     parent_dir = base_path / "raw" / dep.from_source
+     parent_records = read_parquet(parent_dir)
+
+     if not parent_records:
+         if not quiet:
+             print_warning(f"No parent data for {dep.from_source}, skipping {source.id}")
+         return []
+
+     # Extract values from parent
+     values = _extract_values(parent_records, dep.field, dep.match_by, dep.dedupe)
+
+     if not values:
+         if not quiet:
+             print_warning(f"No values extracted from {dep.from_source} for {source.id}")
+         return []
+
+     if not source.input_key:
+         raise DatasetError(
+             f"Source {source.id} has a dependency but no input_key defined"
+         )
+
+     # Filter already-collected inputs in incremental mode
+     if incremental and metadata:
+         already = metadata.get_collected_inputs(source.id)
+         if already:
+             original = len(values)
+             values = [v for v in values if str(v) not in already]
+             if not quiet and original != len(values):
+                 print_info(f" Skipping {original - len(values)} already-collected inputs")
+
+     if not values:
+         if not quiet:
+             print_info(f" All inputs already collected for {source.id}")
+         return []
+
+     return await _collect_batch(
+         source, values, parent_source=dep.from_source, quiet=quiet
+     )
+
+
+ def _apply_template(template: dict[str, Any], value: Any) -> dict[str, Any]:
+     """Apply a template dict, replacing '{value}' placeholders with the actual value."""
+     result: dict[str, Any] = {}
+     for k, v in template.items():
+         if isinstance(v, str) and v == "{value}":
+             result[k] = str(value) if not isinstance(value, (dict, list)) else value
+         elif isinstance(v, str) and "{value}" in v:
+             result[k] = v.replace("{value}", str(value))
+         elif isinstance(v, dict):
+             result[k] = _apply_template(v, value)
+         elif isinstance(v, list):
+             result[k] = _apply_template_list(v, value)
+         else:
+             result[k] = v
+     return result
+
+
+ def _apply_template_list(template_list: list[Any], value: Any) -> list[Any]:
+     """Apply {value} replacement within list elements."""
+     result: list[Any] = []
+     for item in template_list:
+         if isinstance(item, str) and item == "{value}":
+             result.append(str(value) if not isinstance(value, (dict, list)) else value)
+         elif isinstance(item, str) and "{value}" in item:
+             result.append(item.replace("{value}", str(value)))
+         elif isinstance(item, dict):
+             result.append(_apply_template(item, value))
+         elif isinstance(item, list):
+             result.append(_apply_template_list(item, value))
+         else:
+             result.append(item)
+     return result
+
+
+ def _try_parse_json(value: Any) -> Any:
+     """Try to parse a JSON string back into a dict/list."""
+     import json
+
+     if isinstance(value, str):
+         try:
+             parsed = json.loads(value)
+             if isinstance(parsed, (dict, list)):
+                 return parsed
+         except (json.JSONDecodeError, ValueError):
+             pass
+     return value
+
+
+ def _extract_values(
+     records: list[dict[str, Any]],
+     field: str | None,
+     match_by: str | None,
+     dedupe: bool,
+ ) -> list[Any]:
+     """Extract values from parent records."""
+     if match_by:
+         field = match_by
+
+     if not field:
+         raise DatasetError("Dependency must specify either 'field' or 'match_by'")
+
+     segments = parse_field_path(field)
+     values: list[Any] = []
+
+     for record in records:
+         # Parquet stores nested objects as JSON strings — parse them back
+         # so that dot-notation paths like "urn.value" work correctly
+         parsed_record = {k: _try_parse_json(v) for k, v in record.items()}
+         value = extract_field(parsed_record, segments)
+         if value is not None:
+             value = _try_parse_json(value)
+             if isinstance(value, list):
+                 values.extend(value)
+             else:
+                 values.append(value)
+
+     if dedupe:
+         seen: set[str] = set()
+         unique: list[Any] = []
+         for v in values:
+             key = str(v)
+             if key not in seen:
+                 seen.add(key)
+                 unique.append(v)
+         values = unique
+
+     return values
+
+
+ def _filter_sources(
+     ordered: list[DatasetSource],
+     source_filter: str,
+     config: DatasetConfig,
+ ) -> list[DatasetSource]:
+     """Filter sources to only include the target and its transitive dependencies."""
+     target = config.get_source(source_filter)
+     if target is None:
+         raise DatasetError(f"Source '{source_filter}' not found in dataset")
+
+     # Collect all required source IDs (target + transitive deps)
+     required: set[str] = set()
+     stack = [source_filter]
+     while stack:
+         sid = stack.pop()
+         if sid in required:
+             continue
+         required.add(sid)
+         src = config.get_source(sid)
+         if src and src.dependency:
+             stack.append(src.dependency.from_source)
+
+     return [s for s in ordered if s.id in required]
+
+
+ def _build_plan(
+     ordered: list[DatasetSource],
+     config: DatasetConfig,
+     base_path: Path,
+     metadata: MetadataStore,
+     incremental: bool,
+     today: date,
+     *,
+     config_dir: Path | None = None,
+ ) -> CollectionPlan:
+     """Build a dry-run plan with estimated input counts."""
+     plan = CollectionPlan()
+
+     for source in ordered:
+         if incremental:
+             parquet_path = get_parquet_path(base_path, source.id, today)
+             if parquet_path.exists():
+                 continue
+
+         if source.from_file is not None:
+             est = _count_file_inputs(source, config_dir)
+             plan.add_step(
+                 source_id=source.id,
+                 endpoint=source.endpoint,
+                 kind="from_file",
+                 params={"file": source.from_file, "field": source.file_field},
+                 estimated_requests=est,
+             )
+         elif source.dependency is None:
+             plan.add_step(
+                 source_id=source.id,
+                 endpoint=source.endpoint,
+                 kind="independent",
+                 params=source.params,
+                 estimated_requests=1,
+             )
+         else:
+             est = _count_dependent_inputs(source, base_path, metadata)
+             plan.add_step(
+                 source_id=source.id,
+                 endpoint=source.endpoint,
+                 kind="dependent",
+                 dependency=source.dependency.from_source,
+                 estimated_requests=est,
+             )
+
+     return plan
+
+
+ def _count_dependent_inputs(
+     source: DatasetSource, base_path: Path, metadata: MetadataStore
+ ) -> int | None:
+     """Count extractable input values from parent Parquet data."""
+     dep = source.dependency
+     if dep is None:
+         return None
+     parent_dir = base_path / "raw" / dep.from_source
+     parent_records = read_parquet(parent_dir)
+     if not parent_records:
+         info = metadata.get_source_info(dep.from_source)
+         return info.get("record_count") if info else None
+     values = _extract_values(parent_records, dep.field, dep.match_by, dep.dedupe)
+     return len(values)
+
+
+ def _count_file_inputs(
+     source: DatasetSource, config_dir: Path | None
+ ) -> int | None:
+     """Count input values in a from_file source."""
+     if not source.from_file:
+         return None
+     file_path = Path(source.from_file)
+     if not file_path.is_absolute() and config_dir:
+         file_path = config_dir / file_path
+     if not file_path.exists():
+         return None
+     try:
+         from anysite.batch.input import InputParser
+
+         raw_inputs = InputParser.from_file(file_path)
+         return len(raw_inputs)
+     except Exception:
+         return None
+
+
+ def _print_plan(plan: CollectionPlan) -> dict[str, int]:
+     """Print the collection plan and return empty results."""
+     from rich.console import Console
+     from rich.table import Table
+
+     console = Console()
+     table = Table(title="Collection Plan")
+     table.add_column("Step", style="bold")
+     table.add_column("Source")
+     table.add_column("Endpoint")
+     table.add_column("Type")
+     table.add_column("Depends On")
+     table.add_column("Est. Requests")
+
+     for i, step in enumerate(plan.steps, 1):
+         table.add_row(
+             str(i),
+             step["source"],
+             step["endpoint"],
+             step["kind"],
+             step.get("dependency") or "-",
+             str(step.get("estimated_requests") or "?"),
+         )
+
+     console.print(table)
+     return {}
+
+
+ def run_collect(
+     config: DatasetConfig,
+     **kwargs: Any,
+ ) -> dict[str, int]:
+     """Sync wrapper for collect_dataset."""
+     return asyncio.run(collect_dataset(config, **kwargs))
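For orientation, a minimal sketch of driving this orchestrator directly. It relies only on what the diff above shows: run_collect(config, **kwargs) wraps collect_dataset with asyncio.run, accepts the keyword arguments config_dir, source_filter, incremental, dry_run, and quiet, and returns a mapping of source id to record count. How a DatasetConfig is built or loaded (for example by anysite/dataset/cli.py) is not part of this file, so the config passed in below is assumed to come from elsewhere.

from pathlib import Path

from anysite.dataset.collector import run_collect
from anysite.dataset.models import DatasetConfig


def collect_daily(config: DatasetConfig, project_dir: Path) -> None:
    # Dry run first: prints the "Collection Plan" table and returns an empty dict.
    run_collect(config, config_dir=project_dir, dry_run=True)

    # Real run: skips sources that already have a Parquet file for today and
    # inputs recorded in the metadata store; returns {source_id: record_count}.
    results = run_collect(config, config_dir=project_dir, incremental=True)
    for source_id, count in results.items():
        print(f"{source_id}: {count} records")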