anysite_cli-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of anysite-cli might be problematic.
Files changed (64)
  1. anysite/__init__.py +4 -0
  2. anysite/__main__.py +6 -0
  3. anysite/api/__init__.py +21 -0
  4. anysite/api/client.py +271 -0
  5. anysite/api/errors.py +137 -0
  6. anysite/api/schemas.py +333 -0
  7. anysite/batch/__init__.py +1 -0
  8. anysite/batch/executor.py +176 -0
  9. anysite/batch/input.py +160 -0
  10. anysite/batch/rate_limiter.py +98 -0
  11. anysite/cli/__init__.py +1 -0
  12. anysite/cli/config.py +176 -0
  13. anysite/cli/executor.py +388 -0
  14. anysite/cli/options.py +249 -0
  15. anysite/config/__init__.py +11 -0
  16. anysite/config/paths.py +46 -0
  17. anysite/config/settings.py +187 -0
  18. anysite/dataset/__init__.py +37 -0
  19. anysite/dataset/analyzer.py +268 -0
  20. anysite/dataset/cli.py +644 -0
  21. anysite/dataset/collector.py +686 -0
  22. anysite/dataset/db_loader.py +248 -0
  23. anysite/dataset/errors.py +30 -0
  24. anysite/dataset/exporters.py +121 -0
  25. anysite/dataset/history.py +153 -0
  26. anysite/dataset/models.py +245 -0
  27. anysite/dataset/notifications.py +87 -0
  28. anysite/dataset/scheduler.py +107 -0
  29. anysite/dataset/storage.py +171 -0
  30. anysite/dataset/transformer.py +213 -0
  31. anysite/db/__init__.py +38 -0
  32. anysite/db/adapters/__init__.py +1 -0
  33. anysite/db/adapters/base.py +158 -0
  34. anysite/db/adapters/postgres.py +201 -0
  35. anysite/db/adapters/sqlite.py +183 -0
  36. anysite/db/cli.py +687 -0
  37. anysite/db/config.py +92 -0
  38. anysite/db/manager.py +166 -0
  39. anysite/db/operations/__init__.py +1 -0
  40. anysite/db/operations/insert.py +199 -0
  41. anysite/db/operations/query.py +43 -0
  42. anysite/db/schema/__init__.py +1 -0
  43. anysite/db/schema/inference.py +213 -0
  44. anysite/db/schema/types.py +71 -0
  45. anysite/db/utils/__init__.py +1 -0
  46. anysite/db/utils/sanitize.py +99 -0
  47. anysite/main.py +498 -0
  48. anysite/models/__init__.py +1 -0
  49. anysite/output/__init__.py +11 -0
  50. anysite/output/console.py +45 -0
  51. anysite/output/formatters.py +301 -0
  52. anysite/output/templates.py +76 -0
  53. anysite/py.typed +0 -0
  54. anysite/streaming/__init__.py +1 -0
  55. anysite/streaming/progress.py +121 -0
  56. anysite/streaming/writer.py +130 -0
  57. anysite/utils/__init__.py +1 -0
  58. anysite/utils/fields.py +242 -0
  59. anysite/utils/retry.py +109 -0
  60. anysite_cli-0.1.0.dist-info/METADATA +437 -0
  61. anysite_cli-0.1.0.dist-info/RECORD +64 -0
  62. anysite_cli-0.1.0.dist-info/WHEEL +4 -0
  63. anysite_cli-0.1.0.dist-info/entry_points.txt +2 -0
  64. anysite_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
anysite/dataset/models.py
@@ -0,0 +1,245 @@
+"""Pydantic models for dataset YAML configuration."""
+
+from __future__ import annotations
+
+from collections import deque
+from pathlib import Path
+from typing import Any, Literal
+
+import yaml
+from pydantic import BaseModel, Field, PrivateAttr, field_validator, model_validator
+
+from anysite.dataset.errors import CircularDependencyError, SourceNotFoundError
+
+# ---------------------------------------------------------------------------
+# New models for transform / export / schedule / notifications
+# ---------------------------------------------------------------------------
+
+
+class TransformConfig(BaseModel):
+    """Per-source transform: filter → select fields → add columns."""
+
+    filter: str | None = Field(default=None, description="Filter expression (e.g., '.employee_count > 10')")
+    fields: list[str] = Field(default_factory=list, description="Fields to keep (empty = all)")
+    add_columns: dict[str, Any] = Field(default_factory=dict, description="Static columns to add")
+
+
+class ExportDestination(BaseModel):
+    """Per-source export destination (file or webhook)."""
+
+    type: Literal["file", "webhook"] = Field(description="Export type")
+    path: str | None = Field(default=None, description="Output file path (file type)")
+    format: str = Field(default="jsonl", description="File format: json, jsonl, csv")
+    url: str | None = Field(default=None, description="Webhook URL (webhook type)")
+    headers: dict[str, str] = Field(default_factory=dict, description="HTTP headers for webhook")
+
+    @model_validator(mode="after")
+    def validate_type_fields(self) -> ExportDestination:
+        if self.type == "file" and not self.path:
+            raise ValueError("File export requires 'path'")
+        if self.type == "webhook" and not self.url:
+            raise ValueError("Webhook export requires 'url'")
+        return self
+
+
+class ScheduleConfig(BaseModel):
+    """Cron-based schedule for dataset collection."""
+
+    cron: str = Field(description="Cron expression (e.g., '0 9 * * MON')")
+
+
+class WebhookNotification(BaseModel):
+    """A single webhook notification endpoint."""
+
+    url: str = Field(description="Webhook URL")
+    headers: dict[str, str] = Field(default_factory=dict, description="HTTP headers")
+
+
+class NotificationsConfig(BaseModel):
+    """Notification webhooks for collection events."""
+
+    on_complete: list[WebhookNotification] = Field(default_factory=list)
+    on_failure: list[WebhookNotification] = Field(default_factory=list)
+
+
+class SourceDependency(BaseModel):
+    """Dependency on another source's output."""
+
+    from_source: str = Field(description="Source ID to depend on")
+    field: str | None = Field(
+        default=None,
+        description="Field to extract from parent records (dot notation)",
+    )
+    match_by: str | None = Field(
+        default=None,
+        description="Field for fuzzy matching by name",
+    )
+    dedupe: bool = Field(default=False, description="Deduplicate extracted values")
+
+
+class DbLoadConfig(BaseModel):
+    """Configuration for loading a source into a relational database."""
+
+    table: str | None = Field(default=None, description="Override table name (default: source id)")
+    fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
+    exclude: list[str] = Field(
+        default_factory=lambda: ["_input_value", "_parent_source"],
+        description="Fields to exclude (default: provenance metadata)",
+    )
+
+
+class DatasetSource(BaseModel):
+    """A single data source within a dataset."""
+
+    id: str = Field(description="Unique source identifier")
+    endpoint: str = Field(description="API endpoint path (e.g., /api/linkedin/search/users)")
+    params: dict[str, Any] = Field(default_factory=dict, description="Static API parameters")
+    dependency: SourceDependency | None = Field(
+        default=None,
+        description="Dependency on another source",
+    )
+    input_key: str | None = Field(
+        default=None,
+        description="Parameter name for dependent input values",
+    )
+    input_template: dict[str, Any] | None = Field(
+        default=None,
+        description="Template for input value — use {value} placeholder (e.g., {type: company, value: '{value}'})",
+    )
+    from_file: str | None = Field(
+        default=None,
+        description="Path to input file (CSV/JSONL/text) with values to iterate over",
+    )
+    file_field: str | None = Field(
+        default=None,
+        description="Column name to extract from CSV input file",
+    )
+    parallel: int = Field(default=1, ge=1, description="Parallel requests for dependent collection")
+    rate_limit: str | None = Field(default=None, description="Rate limit (e.g., '10/s')")
+    on_error: str = Field(default="skip", description="Error handling: stop or skip")
+    db_load: DbLoadConfig | None = Field(
+        default=None,
+        description="Database loading configuration (optional)",
+    )
+    transform: TransformConfig | None = Field(
+        default=None,
+        description="Post-collection transform (filter/fields/add_columns)",
+    )
+    export: list[ExportDestination] = Field(
+        default_factory=list,
+        description="Export destinations (file/webhook) applied after Parquet write",
+    )
+
+    @field_validator("endpoint")
+    @classmethod
+    def validate_endpoint(cls, v: str) -> str:
+        if not v.startswith("/"):
+            raise ValueError(f"Endpoint must start with '/', got: {v}")
+        return v
+
+
+class StorageConfig(BaseModel):
+    """Storage configuration for dataset output."""
+
+    format: str = Field(default="parquet", description="Storage format")
+    path: str = Field(default="./data/", description="Base directory for data files")
+    partition_by: list[str] = Field(
+        default_factory=lambda: ["source_id", "collected_date"],
+        description="Partition dimensions",
+    )
+
+
+class DatasetConfig(BaseModel):
+    """Top-level dataset configuration parsed from YAML."""
+
+    name: str = Field(description="Dataset name")
+    description: str = Field(default="", description="Dataset description")
+    sources: list[DatasetSource] = Field(description="Data sources to collect")
+    storage: StorageConfig = Field(default_factory=StorageConfig)
+    schedule: ScheduleConfig | None = Field(default=None, description="Collection schedule")
+    notifications: NotificationsConfig | None = Field(default=None, description="Webhook notifications")
+
+    _config_dir: Path | None = PrivateAttr(default=None)
+
+    @field_validator("sources")
+    @classmethod
+    def validate_unique_ids(cls, v: list[DatasetSource]) -> list[DatasetSource]:
+        ids = [s.id for s in v]
+        dupes = [sid for sid in ids if ids.count(sid) > 1]
+        if dupes:
+            raise ValueError(f"Duplicate source IDs: {set(dupes)}")
+        return v
+
+    @classmethod
+    def from_yaml(cls, path: Path) -> DatasetConfig:
+        """Load dataset configuration from a YAML file."""
+        with open(path) as f:
+            data = yaml.safe_load(f)
+        config = cls.model_validate(data)
+        config._config_dir = path.resolve().parent
+        return config
+
+    def get_source(self, source_id: str) -> DatasetSource | None:
+        """Get a source by ID."""
+        for s in self.sources:
+            if s.id == source_id:
+                return s
+        return None
+
+    def topological_sort(self) -> list[DatasetSource]:
+        """Sort sources by dependency order using Kahn's algorithm.
+
+        Returns:
+            List of sources in execution order (independent first).
+
+        Raises:
+            CircularDependencyError: If dependencies form a cycle.
+            SourceNotFoundError: If a dependency references a non-existent source.
+        """
+        source_map = {s.id: s for s in self.sources}
+
+        # Build adjacency: in_degree counts and adjacency list
+        in_degree: dict[str, int] = {s.id: 0 for s in self.sources}
+        dependents: dict[str, list[str]] = {s.id: [] for s in self.sources}
+
+        for source in self.sources:
+            if source.dependency:
+                parent_id = source.dependency.from_source
+                if parent_id not in source_map:
+                    raise SourceNotFoundError(parent_id, source.id)
+                in_degree[source.id] += 1
+                dependents[parent_id].append(source.id)
+
+        # Kahn's algorithm
+        queue: deque[str] = deque()
+        for sid, degree in in_degree.items():
+            if degree == 0:
+                queue.append(sid)
+
+        result: list[DatasetSource] = []
+        while queue:
+            sid = queue.popleft()
+            result.append(source_map[sid])
+            for dep_id in dependents[sid]:
+                in_degree[dep_id] -= 1
+                if in_degree[dep_id] == 0:
+                    queue.append(dep_id)
+
+        if len(result) != len(self.sources):
+            # Find the cycle
+            remaining = [s.id for s in self.sources if s.id not in {r.id for r in result}]
+            raise CircularDependencyError(remaining)
+
+        return result
+
+    def storage_path(self) -> Path:
+        """Resolve the storage base path.
+
+        Relative paths are resolved against the directory containing the
+        YAML config file (set by ``from_yaml``). Absolute paths and
+        programmatic configs (no config dir) use the path as-is.
+        """
+        p = Path(self.storage.path)
+        if not p.is_absolute() and self._config_dir is not None:
+            return self._config_dir / p
+        return p
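
For orientation, here is a minimal sketch (not taken from the package's documentation) of a dataset config these models would accept, loaded from an inline YAML string; the endpoint paths, field names, and source IDs are illustrative assumptions. `topological_sort()` returns parent sources before their dependents.

# Illustrative only: endpoints, fields, and IDs below are assumptions, not package docs.
import yaml

from anysite.dataset.models import DatasetConfig

SAMPLE = """
name: example-dataset
sources:
  - id: companies
    endpoint: /api/linkedin/search/companies
    params: {query: "fintech"}
  - id: employees
    endpoint: /api/linkedin/search/users
    dependency:
      from_source: companies
      field: company_id
      dedupe: true
    input_key: company_id
storage:
  path: ./data/
"""

config = DatasetConfig.model_validate(yaml.safe_load(SAMPLE))
ordered = config.topological_sort()
print([s.id for s in ordered])  # ['companies', 'employees']: parents before dependents

Loading from a file via `DatasetConfig.from_yaml(path)` additionally records the config directory, so `storage_path()` can resolve a relative `storage.path` against the YAML file's location.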
anysite/dataset/notifications.py
@@ -0,0 +1,87 @@
+"""Webhook notifications for dataset collection events."""
+
+from __future__ import annotations
+
+import logging
+from datetime import UTC, datetime
+from typing import Any
+
+from anysite.dataset.models import NotificationsConfig
+
+logger = logging.getLogger(__name__)
+
+
+class WebhookNotifier:
+    """Send webhook notifications on collection complete/failure."""
+
+    def __init__(self, config: NotificationsConfig) -> None:
+        self.config = config
+
+    async def notify_complete(
+        self,
+        dataset_name: str,
+        record_count: int,
+        source_count: int,
+        duration: float,
+    ) -> None:
+        """Send on_complete webhooks."""
+        if not self.config.on_complete:
+            return
+
+        payload = _build_payload(
+            event="complete",
+            dataset_name=dataset_name,
+            record_count=record_count,
+            source_count=source_count,
+            duration=duration,
+        )
+
+        for hook in self.config.on_complete:
+            await self._send(hook.url, hook.headers, payload)
+
+    async def notify_failure(
+        self,
+        dataset_name: str,
+        error: str,
+        duration: float,
+    ) -> None:
+        """Send on_failure webhooks."""
+        if not self.config.on_failure:
+            return
+
+        payload = _build_payload(
+            event="failure",
+            dataset_name=dataset_name,
+            error=error,
+            duration=duration,
+        )
+
+        for hook in self.config.on_failure:
+            await self._send(hook.url, hook.headers, payload)
+
+    async def _send(self, url: str, headers: dict[str, str], payload: dict[str, Any]) -> None:
+        """Send a single webhook POST."""
+        try:
+            import httpx
+
+            async with httpx.AsyncClient(timeout=15.0) as client:
+                resp = await client.post(url, json=payload, headers=headers)
+                resp.raise_for_status()
+            logger.info("Notification sent to %s", url)
+        except Exception as e:
+            logger.error("Notification to %s failed: %s", url, e)


+def _build_payload(
+    *,
+    event: str,
+    dataset_name: str,
+    **kwargs: Any,
+) -> dict[str, Any]:
+    """Build a webhook notification payload."""
+    return {
+        "event": event,
+        "dataset": dataset_name,
+        "timestamp": datetime.now(UTC).isoformat(),
+        **kwargs,
+    }
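
A hypothetical driver for the notifier (not part of the package) might look like the sketch below; the webhook URLs and counts are made up. Note that delivery failures are logged rather than raised, per `_send`.

# Hypothetical usage sketch: shows how WebhookNotifier could be invoked after a run.
import asyncio

from anysite.dataset.models import NotificationsConfig, WebhookNotification
from anysite.dataset.notifications import WebhookNotifier

config = NotificationsConfig(
    on_complete=[WebhookNotification(url="https://example.com/hooks/done")],
    on_failure=[WebhookNotification(url="https://example.com/hooks/failed")],
)
notifier = WebhookNotifier(config)

# POSTs {"event": "complete", "dataset": ..., "timestamp": ..., "record_count": ...,
# "source_count": ..., "duration": ...} to every on_complete URL.
asyncio.run(
    notifier.notify_complete(
        dataset_name="example-dataset",
        record_count=1200,
        source_count=2,
        duration=34.5,
    )
)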
anysite/dataset/scheduler.py
@@ -0,0 +1,107 @@
+"""Schedule generation for dataset collection — crontab and systemd."""
+
+from __future__ import annotations
+
+import shutil
+
+
+class ScheduleGenerator:
+    """Generate crontab or systemd timer entries for dataset collection."""
+
+    def __init__(self, dataset_name: str, cron_expr: str, yaml_path: str) -> None:
+        self.dataset_name = dataset_name
+        self.cron = cron_expr
+        self.yaml_path = yaml_path
+
+    def generate_crontab(self, *, incremental: bool = False, load_db: str | None = None) -> str:
+        """Generate a crontab entry."""
+        anysite = shutil.which("anysite") or "anysite"
+        cmd = f"{anysite} dataset collect {self.yaml_path}"
+        if incremental:
+            cmd += " --incremental"
+        if load_db:
+            cmd += f" --load-db {load_db}"
+        return f"{self.cron} {cmd} >> ~/.anysite/logs/{self.dataset_name}_cron.log 2>&1"
+
+    def generate_systemd(self, *, incremental: bool = False, load_db: str | None = None) -> dict[str, str]:
+        """Generate systemd service and timer unit files."""
+        anysite = shutil.which("anysite") or "anysite"
+        cmd = f"{anysite} dataset collect {self.yaml_path}"
+        if incremental:
+            cmd += " --incremental"
+        if load_db:
+            cmd += f" --load-db {load_db}"
+
+        service_name = f"anysite-dataset-{self.dataset_name}"
+        on_calendar = _cron_to_oncalendar(self.cron)
+
+        service = f"""[Unit]
+Description=Anysite dataset collection: {self.dataset_name}
+
+[Service]
+Type=oneshot
+ExecStart={cmd}
+StandardOutput=journal
+StandardError=journal
+"""
+
+        timer = f"""[Unit]
+Description=Timer for anysite dataset: {self.dataset_name}
+
+[Timer]
+OnCalendar={on_calendar}
+Persistent=true
+
+[Install]
+WantedBy=timers.target
+"""
+
+        return {
+            f"{service_name}.service": service,
+            f"{service_name}.timer": timer,
+        }
+
+
+def _cron_to_oncalendar(cron: str) -> str:
+    """Convert a cron expression to systemd OnCalendar format (best effort).
+
+    Handles common patterns:
+        0 9 * * MON -> Mon *-*-* 09:00:00
+        */5 * * * * -> *-*-* *:0/5:00
+        0 0 1 * * -> *-*-01 00:00:00
+    """
+    parts = cron.split()
+    if len(parts) != 5:
+        return cron  # pass through as-is
+
+    minute, hour, day, month, dow = parts
+
+    # Day-of-week mapping
+    dow_map = {
+        "0": "Sun", "1": "Mon", "2": "Tue", "3": "Wed",
+        "4": "Thu", "5": "Fri", "6": "Sat", "7": "Sun",
+        "MON": "Mon", "TUE": "Tue", "WED": "Wed",
+        "THU": "Thu", "FRI": "Fri", "SAT": "Sat", "SUN": "Sun",
+    }
+
+    # Format components
+    dow_str = ""
+    if dow != "*":
+        dow_parts = dow.replace(",", " ").split()
+        dow_str = ",".join(dow_map.get(d.upper(), d) for d in dow_parts) + " "
+
+    month_str = month if month != "*" else "*"
+    day_str = day.zfill(2) if day != "*" and "/" not in day else day
+
+    hour_str = hour.zfill(2) if hour != "*" and "/" not in hour else hour
+    min_str = minute.zfill(2) if minute != "*" and "/" not in minute else minute
+
+    # Handle step values
+    if "/" in minute:
+        base, step = minute.split("/")
+        min_str = f"{base or '0'}/{step}"
+    if "/" in hour:
+        base, step = hour.split("/")
+        hour_str = f"{base or '0'}/{step}"
+
+    return f"{dow_str}*-{month_str}-{day_str} {hour_str}:{min_str}:00"
anysite/dataset/storage.py
@@ -0,0 +1,171 @@
+"""Parquet storage layer for dataset records."""
+
+from __future__ import annotations
+
+import json
+from datetime import date
+from pathlib import Path
+from typing import Any
+
+
+def write_parquet(
+    records: list[dict[str, Any]],
+    path: Path,
+) -> int:
+    """Write records to a Parquet file.
+
+    Args:
+        records: List of dicts to write.
+        path: Output file path.
+
+    Returns:
+        Number of records written.
+    """
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    if not records:
+        return 0
+
+    # Normalize: flatten nested structures to JSON strings for non-scalar types
+    normalized = _normalize_records(records)
+
+    table = pa.Table.from_pylist(normalized)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    pq.write_table(table, path)
+    return len(records)
+
+
+def _normalize_records(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Normalize records for Parquet compatibility.
+
+    Converts nested dicts/lists to JSON strings so that pyarrow
+    can infer a consistent schema across heterogeneous records.
+    """
+    if not records:
+        return records
+
+    result = []
+    for record in records:
+        normalized: dict[str, Any] = {}
+        for key, value in record.items():
+            if isinstance(value, (dict, list)):
+                normalized[key] = json.dumps(value, default=str)
+            else:
+                normalized[key] = value
+        result.append(normalized)
+    return result
+
+
+def read_parquet(path: Path) -> list[dict[str, Any]]:
+    """Read records from a Parquet file or directory of Parquet files.
+
+    Args:
+        path: Parquet file or directory containing .parquet files.
+
+    Returns:
+        List of dicts.
+    """
+    import pyarrow.parquet as pq
+
+    if path.is_dir():
+        files = sorted(path.glob("*.parquet"))
+        if not files:
+            return []
+        tables = [pq.read_table(f) for f in files]
+        import pyarrow as pa
+
+        table = pa.concat_tables(tables)
+    else:
+        if not path.exists():
+            return []
+        table = pq.read_table(path)
+
+    return table.to_pylist()
+
+
+def get_source_dir(base_path: Path, source_id: str) -> Path:
+    """Get the raw data directory for a source."""
+    return base_path / "raw" / source_id
+
+
+def get_parquet_path(base_path: Path, source_id: str, collected_date: date | None = None) -> Path:
+    """Get the Parquet file path for a source on a given date."""
+    if collected_date is None:
+        collected_date = date.today()
+    source_dir = get_source_dir(base_path, source_id)
+    return source_dir / f"{collected_date.isoformat()}.parquet"
+
+
+class MetadataStore:
+    """Read/write metadata.json for dataset state tracking."""
+
+    def __init__(self, base_path: Path) -> None:
+        self.path = base_path / "metadata.json"
+
+    def load(self) -> dict[str, Any]:
+        if self.path.exists():
+            with open(self.path) as f:
+                return json.load(f)  # type: ignore[no-any-return]
+        return {"sources": {}}
+
+    def save(self, data: dict[str, Any]) -> None:
+        self.path.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.path, "w") as f:
+            json.dump(data, f, indent=2, default=str)
+
+    def update_source(
+        self,
+        source_id: str,
+        record_count: int,
+        collected_date: date | None = None,
+    ) -> None:
+        """Update metadata for a collected source."""
+        if collected_date is None:
+            collected_date = date.today()
+
+        data = self.load()
+        sources = data.setdefault("sources", {})
+        sources[source_id] = {
+            "last_collected": collected_date.isoformat(),
+            "record_count": record_count,
+        }
+        data["last_run"] = collected_date.isoformat()
+        self.save(data)
+
+    def get_source_info(self, source_id: str) -> dict[str, Any] | None:
+        """Get metadata for a specific source."""
+        data = self.load()
+        return data.get("sources", {}).get(source_id)
+
+    def get_all_sources(self) -> dict[str, Any]:
+        """Get metadata for all sources."""
+        data = self.load()
+        return data.get("sources", {})
+
+    def update_collected_inputs(
+        self, source_id: str, inputs: list[str]
+    ) -> None:
+        """Append collected input values to metadata for dedup tracking."""
+        data = self.load()
+        sources = data.setdefault("sources", {})
+        source_info = sources.setdefault(source_id, {})
+        existing = set(source_info.get("collected_inputs", []))
+        existing.update(str(v) for v in inputs)
+        source_info["collected_inputs"] = sorted(existing)
+        self.save(data)
+
+    def get_collected_inputs(self, source_id: str) -> set[str]:
+        """Get the set of already-collected input values for a source."""
+        info = self.get_source_info(source_id)
+        if info and "collected_inputs" in info:
+            return set(info["collected_inputs"])
+        return set()
+
+    def reset_collected_inputs(self, source_id: str) -> None:
+        """Clear collected input tracking for a source (forces re-collection)."""
+        data = self.load()
+        source_info = data.get("sources", {}).get(source_id, {})
+        if "collected_inputs" in source_info:
+            del source_info["collected_inputs"]
+        self.save(data)
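
A round-trip sketch of the storage layer, assuming pyarrow is installed; the record fields and source ID are invented for illustration. Nested values come back as JSON strings because `_normalize_records` flattens them before writing.

# Round-trip sketch (requires pyarrow). Record fields are made up.
from pathlib import Path

from anysite.dataset.storage import (
    MetadataStore,
    get_parquet_path,
    read_parquet,
    write_parquet,
)

base = Path("./data")
records = [
    {"id": 1, "name": "Acme", "tags": ["fintech", "b2b"]},  # list -> JSON string on disk
    {"id": 2, "name": "Globex", "tags": []},
]

path = get_parquet_path(base, "companies")   # ./data/raw/companies/<today>.parquet
written = write_parquet(records, path)

rows = read_parquet(path)
print(rows[0]["tags"])                       # '["fintech", "b2b"]' (JSON-encoded)

meta = MetadataStore(base)
meta.update_source("companies", record_count=written)
meta.update_collected_inputs("companies", ["acme", "globex"])
print(meta.get_collected_inputs("companies"))  # {'acme', 'globex'}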