anysite_cli-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anysite/__init__.py +4 -0
- anysite/__main__.py +6 -0
- anysite/api/__init__.py +21 -0
- anysite/api/client.py +271 -0
- anysite/api/errors.py +137 -0
- anysite/api/schemas.py +333 -0
- anysite/batch/__init__.py +1 -0
- anysite/batch/executor.py +176 -0
- anysite/batch/input.py +160 -0
- anysite/batch/rate_limiter.py +98 -0
- anysite/cli/__init__.py +1 -0
- anysite/cli/config.py +176 -0
- anysite/cli/executor.py +388 -0
- anysite/cli/options.py +249 -0
- anysite/config/__init__.py +11 -0
- anysite/config/paths.py +46 -0
- anysite/config/settings.py +187 -0
- anysite/dataset/__init__.py +37 -0
- anysite/dataset/analyzer.py +268 -0
- anysite/dataset/cli.py +644 -0
- anysite/dataset/collector.py +686 -0
- anysite/dataset/db_loader.py +248 -0
- anysite/dataset/errors.py +30 -0
- anysite/dataset/exporters.py +121 -0
- anysite/dataset/history.py +153 -0
- anysite/dataset/models.py +245 -0
- anysite/dataset/notifications.py +87 -0
- anysite/dataset/scheduler.py +107 -0
- anysite/dataset/storage.py +171 -0
- anysite/dataset/transformer.py +213 -0
- anysite/db/__init__.py +38 -0
- anysite/db/adapters/__init__.py +1 -0
- anysite/db/adapters/base.py +158 -0
- anysite/db/adapters/postgres.py +201 -0
- anysite/db/adapters/sqlite.py +183 -0
- anysite/db/cli.py +709 -0
- anysite/db/config.py +92 -0
- anysite/db/manager.py +166 -0
- anysite/db/operations/__init__.py +1 -0
- anysite/db/operations/insert.py +199 -0
- anysite/db/operations/query.py +43 -0
- anysite/db/schema/__init__.py +1 -0
- anysite/db/schema/inference.py +213 -0
- anysite/db/schema/types.py +71 -0
- anysite/db/utils/__init__.py +1 -0
- anysite/db/utils/sanitize.py +99 -0
- anysite/main.py +498 -0
- anysite/models/__init__.py +1 -0
- anysite/output/__init__.py +11 -0
- anysite/output/console.py +45 -0
- anysite/output/formatters.py +301 -0
- anysite/output/templates.py +76 -0
- anysite/py.typed +0 -0
- anysite/streaming/__init__.py +1 -0
- anysite/streaming/progress.py +121 -0
- anysite/streaming/writer.py +130 -0
- anysite/utils/__init__.py +1 -0
- anysite/utils/fields.py +242 -0
- anysite/utils/retry.py +109 -0
- anysite_cli-0.1.2.dist-info/METADATA +455 -0
- anysite_cli-0.1.2.dist-info/RECORD +64 -0
- anysite_cli-0.1.2.dist-info/WHEEL +4 -0
- anysite_cli-0.1.2.dist-info/entry_points.txt +2 -0
- anysite_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,686 @@
+"""Dataset collection orchestrator."""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import logging
+import time
+from datetime import date
+from pathlib import Path
+from typing import Any
+
+from anysite.api.client import create_client
+from anysite.batch.executor import BatchExecutor
+from anysite.batch.rate_limiter import RateLimiter
+from anysite.cli.options import ErrorHandling
+from anysite.dataset.errors import DatasetError
+from anysite.dataset.models import DatasetConfig, DatasetSource
+from anysite.dataset.storage import (
+    MetadataStore,
+    get_parquet_path,
+    read_parquet,
+    write_parquet,
+)
+from anysite.output.console import print_info, print_success, print_warning
+from anysite.streaming.progress import ProgressTracker
+from anysite.utils.fields import extract_field, parse_field_path
+
+logger = logging.getLogger(__name__)
+
+
+class CollectionPlan:
+    """Describes what will be collected without executing."""
+
+    def __init__(self) -> None:
+        self.steps: list[dict[str, Any]] = []
+
+    def add_step(
+        self,
+        source_id: str,
+        endpoint: str,
+        kind: str,
+        params: dict[str, Any] | None = None,
+        dependency: str | None = None,
+        estimated_requests: int | None = None,
+    ) -> None:
+        self.steps.append({
+            "source": source_id,
+            "endpoint": endpoint,
+            "kind": kind,
+            "params": params or {},
+            "dependency": dependency,
+            "estimated_requests": estimated_requests,
+        })
+
+
+async def collect_dataset(
+    config: DatasetConfig,
+    *,
+    config_dir: Path | None = None,
+    source_filter: str | None = None,
+    incremental: bool = False,
+    dry_run: bool = False,
+    quiet: bool = False,
+) -> dict[str, int]:
+    """Collect all sources in a dataset.
+
+    Args:
+        config: Dataset configuration.
+        source_filter: If set, only collect this source (and its dependencies).
+        incremental: Skip sources that already have data for today.
+        dry_run: Show plan without executing.
+        quiet: Suppress progress output.
+
+    Returns:
+        Dict mapping source_id to record count collected.
+    """
+    if config_dir is None:
+        config_dir = Path.cwd()
+    base_path = config.storage_path()
+    metadata = MetadataStore(base_path)
+    today = date.today()
+    start_time = time.monotonic()
+
+    # Get execution order
+    ordered = config.topological_sort()
+
+    # Filter to requested source (and its dependencies)
+    if source_filter:
+        ordered = _filter_sources(ordered, source_filter, config)
+
+    if dry_run:
+        plan = _build_plan(
+            ordered, config, base_path, metadata, incremental, today,
+            config_dir=config_dir,
+        )
+        return _print_plan(plan)
+
+    # Record run start in history
+    run_id: int | None = None
+    log_handler: logging.Handler | None = None
+    try:
+        from anysite.dataset.history import HistoryStore, LogManager
+
+        history = HistoryStore()
+        run_id = history.record_start(config.name)
+        log_mgr = LogManager()
+        log_handler = log_mgr.create_handler(config.name, run_id)
+        logging.getLogger("anysite").addHandler(log_handler)
+    except Exception:
+        pass
+
+    results: dict[str, int] = {}
+    total_records = 0
+    error_msg: str | None = None
+
+    try:
+        for source in ordered:
+            # Check incremental skip
+            if incremental:
+                parquet_path = get_parquet_path(base_path, source.id, today)
+                if parquet_path.exists():
+                    if not quiet:
+                        print_info(f"Skipping {source.id} (already collected today)")
+                    info = metadata.get_source_info(source.id)
+                    results[source.id] = info.get("record_count", 0) if info else 0
+                    continue
+
+            if not quiet:
+                print_info(f"Collecting {source.id} from {source.endpoint}...")
+
+            if source.from_file is not None:
+                file_base = config_dir if config_dir else Path.cwd()
+                records = await _collect_from_file(
+                    source, config_dir=file_base,
+                    metadata=metadata, incremental=incremental, quiet=quiet,
+                )
+            elif source.dependency is None:
+                records = await _collect_independent(source)
+            else:
+                records = await _collect_dependent(
+                    source, base_path,
+                    metadata=metadata, incremental=incremental, quiet=quiet,
+                )
+
+            # Write FULL records to Parquet (preserves all fields for dependency resolution)
+            parquet_path = get_parquet_path(base_path, source.id, today)
+            count = write_parquet(records, parquet_path)
+            metadata.update_source(source.id, count, today)
+
+            # Apply per-source transform for exports only (does NOT affect Parquet)
+            export_records = records
+            if source.transform and records:
+                from anysite.dataset.transformer import RecordTransformer
+
+                transformer = RecordTransformer(source.transform)
+                before = len(records)
+                export_records = transformer.apply([dict(r) for r in records])
+                if not quiet and len(export_records) != before:
+                    print_info(f" Transform: {before} -> {len(export_records)} records")
+
+            # Run per-source exports with transformed records
+            if source.export and export_records:
+                from anysite.dataset.exporters import run_exports
+
+                await run_exports(export_records, source.export, source.id, config.name)
+
+            # Track collected inputs for incremental dedup
+            if records:
+                input_values = [
+                    r["_input_value"] for r in records if "_input_value" in r
+                ]
+                if input_values:
+                    metadata.update_collected_inputs(source.id, input_values)
+
+            results[source.id] = count
+            total_records += count
+
+            if not quiet:
+                print_success(f"Collected {count} records for {source.id}")
+
+    except Exception as e:
+        error_msg = str(e)
+        raise
+    finally:
+        duration = time.monotonic() - start_time
+
+        # Record finish in history
+        if run_id is not None:
+            with contextlib.suppress(Exception):
+                history.record_finish(  # type: ignore[possibly-undefined]
+                    run_id,
+                    status="failed" if error_msg else "success",
+                    record_count=total_records,
+                    source_count=len(results),
+                    error=error_msg,
+                    duration=duration,
+                )
+
+        # Send notifications
+        if config.notifications:
+            try:
+                from anysite.dataset.notifications import WebhookNotifier
+
+                notifier = WebhookNotifier(config.notifications)
+                if error_msg:
+                    await notifier.notify_failure(config.name, error_msg, duration)
+                else:
+                    await notifier.notify_complete(
+                        config.name, total_records, len(results), duration,
+                    )
+            except Exception as ne:
+                logger.error("Notification error: %s", ne)
+
+        # Remove log handler
+        if log_handler:
+            logging.getLogger("anysite").removeHandler(log_handler)
+            log_handler.close()
+
+    return results
+
+
+async def _collect_independent(source: DatasetSource) -> list[dict[str, Any]]:
+    """Collect an independent source (single API call)."""
+    async with create_client() as client:
+        data = await client.post(source.endpoint, data=source.params)
+    # API returns list[dict] or dict
+    if isinstance(data, list):
+        return data
+    return [data] if isinstance(data, dict) else [{"data": data}]
+
+
+async def _collect_from_file(
+    source: DatasetSource,
+    *,
+    config_dir: Path,
+    metadata: MetadataStore | None = None,
+    incremental: bool = False,
+    quiet: bool = False,
+) -> list[dict[str, Any]]:
+    """Collect a source by iterating over values from an input file."""
+    from anysite.batch.input import InputParser
+
+    if not source.from_file:
+        raise DatasetError(f"Source {source.id} has no from_file defined")
+    if not source.input_key:
+        raise DatasetError(f"Source {source.id} has from_file but no input_key defined")
+
+    file_path = Path(source.from_file)
+    if not file_path.is_absolute():
+        file_path = config_dir / file_path
+    if not file_path.exists():
+        raise DatasetError(f"Input file not found: {file_path}")
+
+    # Parse inputs from file
+    raw_inputs = InputParser.from_file(file_path)
+    if not raw_inputs:
+        if not quiet:
+            print_warning(f"No inputs found in {file_path}")
+        return []
+
+    # Extract specific field from CSV/JSONL dicts if file_field is set
+    values: list[str] = []
+    for inp in raw_inputs:
+        if isinstance(inp, dict) and source.file_field:
+            val = inp.get(source.file_field)
+            if val is not None:
+                values.append(str(val))
+        elif isinstance(inp, str):
+            values.append(inp)
+        else:
+            values.append(str(inp))
+
+    if not values:
+        if not quiet:
+            print_warning(f"No values extracted from {file_path}")
+        return []
+
+    # Filter already-collected inputs in incremental mode
+    if incremental and metadata:
+        already = metadata.get_collected_inputs(source.id)
+        if already:
+            original = len(values)
+            values = [v for v in values if str(v) not in already]
+            if not quiet and original != len(values):
+                print_info(f" Skipping {original - len(values)} already-collected inputs")
+
+    if not values:
+        if not quiet:
+            print_info(f" All inputs already collected for {source.id}")
+        return []
+
+    if not quiet:
+        print_info(f" Found {len(values)} inputs from {file_path.name}")
+
+    return await _collect_batch(source, values, quiet=quiet)
+
+
+async def _collect_batch(
+    source: DatasetSource,
+    values: list[Any],
+    *,
+    parent_source: str | None = None,
+    quiet: bool = False,
+) -> list[dict[str, Any]]:
+    """Run batch API calls for a list of input values.
+
+    Each resulting record is annotated with provenance metadata:
+    - ``_input_value``: the raw value used to make the API call
+    - ``_parent_source``: the source ID that produced the input (if dependent)
+    """
+    limiter = RateLimiter(source.rate_limit) if source.rate_limit else None
+    on_error = ErrorHandling(source.on_error) if source.on_error else ErrorHandling.SKIP
+
+    tracker = ProgressTracker(
+        total=len(values),
+        description=f"Collecting {source.id}...",
+        quiet=quiet,
+    )
+
+    async def _fetch_one(val: str | dict[str, Any]) -> Any:
+        # Apply input_template if defined
+        if source.input_template:
+            input_val = _apply_template(source.input_template, val)
+            # If template returns a dict, use it as the full payload
+            # (merged with static params), not nested under input_key
+            if isinstance(input_val, dict):
+                payload = {**source.params, **input_val}
+            else:
+                payload = {source.input_key: input_val, **source.params}  # type: ignore[dict-item]
+        else:
+            payload = {source.input_key: val, **source.params}  # type: ignore[dict-item]
+        async with create_client() as client:
+            result = await client.post(source.endpoint, data=payload)
+
+        # Annotate each record with provenance metadata so that
+        # child→parent relationships can be reconstructed later.
+        records = _flatten_results([result])
+        for record in records:
+            record["_input_value"] = str(val)
+            if parent_source:
+                record["_parent_source"] = parent_source
+        return records
+
+    executor = BatchExecutor(
+        func=_fetch_one,
+        parallel=source.parallel,
+        on_error=on_error,
+        rate_limiter=limiter,
+        progress_callback=tracker.update,
+    )
+
+    with tracker:
+        batch_result = await executor.execute(values)
+
+    # _fetch_one returns lists of annotated dicts, but BatchExecutor
+    # wraps non-dict returns as {"data": result}. Handle both forms.
+    all_records: list[dict[str, Any]] = []
+    for item in batch_result.results:
+        if isinstance(item, list):
+            all_records.extend(r for r in item if isinstance(r, dict))
+        elif isinstance(item, dict):
+            data = item.get("data")
+            if isinstance(data, list):
+                all_records.extend(r for r in data if isinstance(r, dict))
+            elif isinstance(data, str):
+                # JSON-serialized list from Parquet roundtrip
+                import json
+
+                try:
+                    parsed = json.loads(data)
+                    if isinstance(parsed, list):
+                        all_records.extend(r for r in parsed if isinstance(r, dict))
+                    else:
+                        all_records.append(item)
+                except (json.JSONDecodeError, ValueError):
+                    all_records.append(item)
+            else:
+                all_records.append(item)
+    return all_records
+
+
+def _flatten_results(results: list[Any]) -> list[dict[str, Any]]:
+    """Flatten batch results into a flat list of dicts."""
+    all_records: list[dict[str, Any]] = []
+    for result in results:
+        if isinstance(result, list):
+            all_records.extend(r for r in result if isinstance(r, dict))
+        elif isinstance(result, dict):
+            data = result.get("data", result)
+            if isinstance(data, list):
+                all_records.extend(r for r in data if isinstance(r, dict))
+            elif isinstance(data, dict):
+                all_records.append(data)
+        else:
+            all_records.append(result)
+    return all_records
+
+
+async def _collect_dependent(
+    source: DatasetSource,
+    base_path: Path,
+    *,
+    metadata: MetadataStore | None = None,
+    incremental: bool = False,
+    quiet: bool = False,
+) -> list[dict[str, Any]]:
+    """Collect a dependent source by reading parent data and making per-value requests."""
+    dep = source.dependency
+    if dep is None:
+        raise DatasetError(f"Source {source.id} has no dependency defined")
+
+    # Read parent data
+    parent_dir = base_path / "raw" / dep.from_source
+    parent_records = read_parquet(parent_dir)
+
+    if not parent_records:
+        if not quiet:
+            print_warning(f"No parent data for {dep.from_source}, skipping {source.id}")
+        return []
+
+    # Extract values from parent
+    values = _extract_values(parent_records, dep.field, dep.match_by, dep.dedupe)
+
+    if not values:
+        if not quiet:
+            print_warning(f"No values extracted from {dep.from_source} for {source.id}")
+        return []
+
+    if not source.input_key:
+        raise DatasetError(
+            f"Source {source.id} has a dependency but no input_key defined"
+        )
+
+    # Filter already-collected inputs in incremental mode
+    if incremental and metadata:
+        already = metadata.get_collected_inputs(source.id)
+        if already:
+            original = len(values)
+            values = [v for v in values if str(v) not in already]
+            if not quiet and original != len(values):
+                print_info(f" Skipping {original - len(values)} already-collected inputs")
+
+    if not values:
+        if not quiet:
+            print_info(f" All inputs already collected for {source.id}")
+        return []
+
+    return await _collect_batch(
+        source, values, parent_source=dep.from_source, quiet=quiet
+    )
+
+
+def _apply_template(template: dict[str, Any], value: Any) -> dict[str, Any]:
+    """Apply a template dict, replacing '{value}' placeholders with the actual value."""
+    result: dict[str, Any] = {}
+    for k, v in template.items():
+        if isinstance(v, str) and v == "{value}":
+            result[k] = str(value) if not isinstance(value, (dict, list)) else value
+        elif isinstance(v, str) and "{value}" in v:
+            result[k] = v.replace("{value}", str(value))
+        elif isinstance(v, dict):
+            result[k] = _apply_template(v, value)
+        elif isinstance(v, list):
+            result[k] = _apply_template_list(v, value)
+        else:
+            result[k] = v
+    return result
+
+
+def _apply_template_list(template_list: list[Any], value: Any) -> list[Any]:
+    """Apply {value} replacement within list elements."""
+    result: list[Any] = []
+    for item in template_list:
+        if isinstance(item, str) and item == "{value}":
+            result.append(str(value) if not isinstance(value, (dict, list)) else value)
+        elif isinstance(item, str) and "{value}" in item:
+            result.append(item.replace("{value}", str(value)))
+        elif isinstance(item, dict):
+            result.append(_apply_template(item, value))
+        elif isinstance(item, list):
+            result.append(_apply_template_list(item, value))
+        else:
+            result.append(item)
+    return result
+
+
+def _try_parse_json(value: Any) -> Any:
+    """Try to parse a JSON string back into a dict/list."""
+    import json
+
+    if isinstance(value, str):
+        try:
+            parsed = json.loads(value)
+            if isinstance(parsed, (dict, list)):
+                return parsed
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return value
+
+
+def _extract_values(
+    records: list[dict[str, Any]],
+    field: str | None,
+    match_by: str | None,
+    dedupe: bool,
+) -> list[Any]:
+    """Extract values from parent records."""
+    if match_by:
+        field = match_by
+
+    if not field:
+        raise DatasetError("Dependency must specify either 'field' or 'match_by'")
+
+    segments = parse_field_path(field)
+    values: list[Any] = []
+
+    for record in records:
+        # Parquet stores nested objects as JSON strings — parse them back
+        # so that dot-notation paths like "urn.value" work correctly
+        parsed_record = {k: _try_parse_json(v) for k, v in record.items()}
+        value = extract_field(parsed_record, segments)
+        if value is not None:
+            value = _try_parse_json(value)
+            if isinstance(value, list):
+                values.extend(value)
+            else:
+                values.append(value)
+
+    if dedupe:
+        seen: set[str] = set()
+        unique: list[Any] = []
+        for v in values:
+            key = str(v)
+            if key not in seen:
+                seen.add(key)
+                unique.append(v)
+        values = unique
+
+    return values
+
+
+def _filter_sources(
+    ordered: list[DatasetSource],
+    source_filter: str,
+    config: DatasetConfig,
+) -> list[DatasetSource]:
+    """Filter sources to only include the target and its transitive dependencies."""
+    target = config.get_source(source_filter)
+    if target is None:
+        raise DatasetError(f"Source '{source_filter}' not found in dataset")
+
+    # Collect all required source IDs (target + transitive deps)
+    required: set[str] = set()
+    stack = [source_filter]
+    while stack:
+        sid = stack.pop()
+        if sid in required:
+            continue
+        required.add(sid)
+        src = config.get_source(sid)
+        if src and src.dependency:
+            stack.append(src.dependency.from_source)
+
+    return [s for s in ordered if s.id in required]
+
+
+def _build_plan(
+    ordered: list[DatasetSource],
+    config: DatasetConfig,
+    base_path: Path,
+    metadata: MetadataStore,
+    incremental: bool,
+    today: date,
+    *,
+    config_dir: Path | None = None,
+) -> CollectionPlan:
+    """Build a dry-run plan with estimated input counts."""
+    plan = CollectionPlan()
+
+    for source in ordered:
+        if incremental:
+            parquet_path = get_parquet_path(base_path, source.id, today)
+            if parquet_path.exists():
+                continue
+
+        if source.from_file is not None:
+            est = _count_file_inputs(source, config_dir)
+            plan.add_step(
+                source_id=source.id,
+                endpoint=source.endpoint,
+                kind="from_file",
+                params={"file": source.from_file, "field": source.file_field},
+                estimated_requests=est,
+            )
+        elif source.dependency is None:
+            plan.add_step(
+                source_id=source.id,
+                endpoint=source.endpoint,
+                kind="independent",
+                params=source.params,
+                estimated_requests=1,
+            )
+        else:
+            est = _count_dependent_inputs(source, base_path, metadata)
+            plan.add_step(
+                source_id=source.id,
+                endpoint=source.endpoint,
+                kind="dependent",
+                dependency=source.dependency.from_source,
+                estimated_requests=est,
+            )
+
+    return plan
+
+
+def _count_dependent_inputs(
+    source: DatasetSource, base_path: Path, metadata: MetadataStore
+) -> int | None:
+    """Count extractable input values from parent Parquet data."""
+    dep = source.dependency
+    if dep is None:
+        return None
+    parent_dir = base_path / "raw" / dep.from_source
+    parent_records = read_parquet(parent_dir)
+    if not parent_records:
+        info = metadata.get_source_info(dep.from_source)
+        return info.get("record_count") if info else None
+    values = _extract_values(parent_records, dep.field, dep.match_by, dep.dedupe)
+    return len(values)
+
+
+def _count_file_inputs(
+    source: DatasetSource, config_dir: Path | None
+) -> int | None:
+    """Count input values in a from_file source."""
+    if not source.from_file:
+        return None
+    file_path = Path(source.from_file)
+    if not file_path.is_absolute() and config_dir:
+        file_path = config_dir / file_path
+    if not file_path.exists():
+        return None
+    try:
+        from anysite.batch.input import InputParser
+
+        raw_inputs = InputParser.from_file(file_path)
+        return len(raw_inputs)
+    except Exception:
+        return None
+
+
+def _print_plan(plan: CollectionPlan) -> dict[str, int]:
+    """Print the collection plan and return empty results."""
+    from rich.console import Console
+    from rich.table import Table
+
+    console = Console()
+    table = Table(title="Collection Plan")
+    table.add_column("Step", style="bold")
+    table.add_column("Source")
+    table.add_column("Endpoint")
+    table.add_column("Type")
+    table.add_column("Depends On")
+    table.add_column("Est. Requests")
+
+    for i, step in enumerate(plan.steps, 1):
+        table.add_row(
+            str(i),
+            step["source"],
+            step["endpoint"],
+            step["kind"],
+            step.get("dependency") or "-",
+            str(step.get("estimated_requests") or "?"),
+        )
+
+    console.print(table)
+    return {}
+
+
+def run_collect(
+    config: DatasetConfig,
+    **kwargs: Any,
+) -> dict[str, int]:
+    """Sync wrapper for collect_dataset."""
+    return asyncio.run(collect_dataset(config, **kwargs))
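
For reference, a minimal usage sketch of the collector shown in this diff, not part of the package itself. Only the collector's own signatures (run_collect, collect_dataset and its keyword arguments) are confirmed by this file; how a DatasetConfig is defined or loaded lives elsewhere in the package (anysite/dataset/models.py and the dataset CLI) and is left as a labeled placeholder.

# Hypothetical usage sketch of anysite/dataset/collector.py (assumptions noted inline).
from pathlib import Path

from anysite.dataset.collector import run_collect
from anysite.dataset.models import DatasetConfig

# Assumption: a DatasetConfig is built or loaded via the package's model/CLI layer;
# the placeholder below stands in for that step.
config: DatasetConfig = ...  # hypothetical: your dataset definition

counts = run_collect(
    config,
    config_dir=Path("."),  # base directory for resolving relative from_file paths
    source_filter=None,    # or a source id: collect it plus its transitive dependencies
    incremental=True,      # skip sources already collected today and already-seen inputs
    dry_run=False,         # True prints the CollectionPlan table and returns {}
    quiet=False,
)
print(counts)              # {source_id: record_count}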