anysite_cli-0.1.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
Potentially problematic release.
This version of anysite-cli might be problematic.
- anysite/__init__.py +4 -0
- anysite/__main__.py +6 -0
- anysite/api/__init__.py +21 -0
- anysite/api/client.py +271 -0
- anysite/api/errors.py +137 -0
- anysite/api/schemas.py +333 -0
- anysite/batch/__init__.py +1 -0
- anysite/batch/executor.py +176 -0
- anysite/batch/input.py +160 -0
- anysite/batch/rate_limiter.py +98 -0
- anysite/cli/__init__.py +1 -0
- anysite/cli/config.py +176 -0
- anysite/cli/executor.py +388 -0
- anysite/cli/options.py +249 -0
- anysite/config/__init__.py +11 -0
- anysite/config/paths.py +46 -0
- anysite/config/settings.py +187 -0
- anysite/dataset/__init__.py +37 -0
- anysite/dataset/analyzer.py +268 -0
- anysite/dataset/cli.py +644 -0
- anysite/dataset/collector.py +686 -0
- anysite/dataset/db_loader.py +248 -0
- anysite/dataset/errors.py +30 -0
- anysite/dataset/exporters.py +121 -0
- anysite/dataset/history.py +153 -0
- anysite/dataset/models.py +245 -0
- anysite/dataset/notifications.py +87 -0
- anysite/dataset/scheduler.py +107 -0
- anysite/dataset/storage.py +171 -0
- anysite/dataset/transformer.py +213 -0
- anysite/db/__init__.py +38 -0
- anysite/db/adapters/__init__.py +1 -0
- anysite/db/adapters/base.py +158 -0
- anysite/db/adapters/postgres.py +201 -0
- anysite/db/adapters/sqlite.py +183 -0
- anysite/db/cli.py +687 -0
- anysite/db/config.py +92 -0
- anysite/db/manager.py +166 -0
- anysite/db/operations/__init__.py +1 -0
- anysite/db/operations/insert.py +199 -0
- anysite/db/operations/query.py +43 -0
- anysite/db/schema/__init__.py +1 -0
- anysite/db/schema/inference.py +213 -0
- anysite/db/schema/types.py +71 -0
- anysite/db/utils/__init__.py +1 -0
- anysite/db/utils/sanitize.py +99 -0
- anysite/main.py +498 -0
- anysite/models/__init__.py +1 -0
- anysite/output/__init__.py +11 -0
- anysite/output/console.py +45 -0
- anysite/output/formatters.py +301 -0
- anysite/output/templates.py +76 -0
- anysite/py.typed +0 -0
- anysite/streaming/__init__.py +1 -0
- anysite/streaming/progress.py +121 -0
- anysite/streaming/writer.py +130 -0
- anysite/utils/__init__.py +1 -0
- anysite/utils/fields.py +242 -0
- anysite/utils/retry.py +109 -0
- anysite_cli-0.1.0.dist-info/METADATA +437 -0
- anysite_cli-0.1.0.dist-info/RECORD +64 -0
- anysite_cli-0.1.0.dist-info/WHEEL +4 -0
- anysite_cli-0.1.0.dist-info/entry_points.txt +2 -0
- anysite_cli-0.1.0.dist-info/licenses/LICENSE +21 -0

anysite/dataset/models.py
@@ -0,0 +1,245 @@
+"""Pydantic models for dataset YAML configuration."""
+
+from __future__ import annotations
+
+from collections import deque
+from pathlib import Path
+from typing import Any, Literal
+
+import yaml
+from pydantic import BaseModel, Field, PrivateAttr, field_validator, model_validator
+
+from anysite.dataset.errors import CircularDependencyError, SourceNotFoundError
+
+# ---------------------------------------------------------------------------
+# New models for transform / export / schedule / notifications
+# ---------------------------------------------------------------------------
+
+
+class TransformConfig(BaseModel):
+    """Per-source transform: filter → select fields → add columns."""
+
+    filter: str | None = Field(default=None, description="Filter expression (e.g., '.employee_count > 10')")
+    fields: list[str] = Field(default_factory=list, description="Fields to keep (empty = all)")
+    add_columns: dict[str, Any] = Field(default_factory=dict, description="Static columns to add")
+
+
+class ExportDestination(BaseModel):
+    """Per-source export destination (file or webhook)."""
+
+    type: Literal["file", "webhook"] = Field(description="Export type")
+    path: str | None = Field(default=None, description="Output file path (file type)")
+    format: str = Field(default="jsonl", description="File format: json, jsonl, csv")
+    url: str | None = Field(default=None, description="Webhook URL (webhook type)")
+    headers: dict[str, str] = Field(default_factory=dict, description="HTTP headers for webhook")
+
+    @model_validator(mode="after")
+    def validate_type_fields(self) -> ExportDestination:
+        if self.type == "file" and not self.path:
+            raise ValueError("File export requires 'path'")
+        if self.type == "webhook" and not self.url:
+            raise ValueError("Webhook export requires 'url'")
+        return self
+
+
+class ScheduleConfig(BaseModel):
+    """Cron-based schedule for dataset collection."""
+
+    cron: str = Field(description="Cron expression (e.g., '0 9 * * MON')")
+
+
+class WebhookNotification(BaseModel):
+    """A single webhook notification endpoint."""
+
+    url: str = Field(description="Webhook URL")
+    headers: dict[str, str] = Field(default_factory=dict, description="HTTP headers")
+
+
+class NotificationsConfig(BaseModel):
+    """Notification webhooks for collection events."""
+
+    on_complete: list[WebhookNotification] = Field(default_factory=list)
+    on_failure: list[WebhookNotification] = Field(default_factory=list)
+
+
+class SourceDependency(BaseModel):
+    """Dependency on another source's output."""
+
+    from_source: str = Field(description="Source ID to depend on")
+    field: str | None = Field(
+        default=None,
+        description="Field to extract from parent records (dot notation)",
+    )
+    match_by: str | None = Field(
+        default=None,
+        description="Field for fuzzy matching by name",
+    )
+    dedupe: bool = Field(default=False, description="Deduplicate extracted values")
+
+
+class DbLoadConfig(BaseModel):
+    """Configuration for loading a source into a relational database."""
+
+    table: str | None = Field(default=None, description="Override table name (default: source id)")
+    fields: list[str] = Field(default_factory=list, description="Fields to include (empty = all)")
+    exclude: list[str] = Field(
+        default_factory=lambda: ["_input_value", "_parent_source"],
+        description="Fields to exclude (default: provenance metadata)",
+    )
+
+
+class DatasetSource(BaseModel):
+    """A single data source within a dataset."""
+
+    id: str = Field(description="Unique source identifier")
+    endpoint: str = Field(description="API endpoint path (e.g., /api/linkedin/search/users)")
+    params: dict[str, Any] = Field(default_factory=dict, description="Static API parameters")
+    dependency: SourceDependency | None = Field(
+        default=None,
+        description="Dependency on another source",
+    )
+    input_key: str | None = Field(
+        default=None,
+        description="Parameter name for dependent input values",
+    )
+    input_template: dict[str, Any] | None = Field(
+        default=None,
+        description="Template for input value — use {value} placeholder (e.g., {type: company, value: '{value}'})",
+    )
+    from_file: str | None = Field(
+        default=None,
+        description="Path to input file (CSV/JSONL/text) with values to iterate over",
+    )
+    file_field: str | None = Field(
+        default=None,
+        description="Column name to extract from CSV input file",
+    )
+    parallel: int = Field(default=1, ge=1, description="Parallel requests for dependent collection")
+    rate_limit: str | None = Field(default=None, description="Rate limit (e.g., '10/s')")
+    on_error: str = Field(default="skip", description="Error handling: stop or skip")
+    db_load: DbLoadConfig | None = Field(
+        default=None,
+        description="Database loading configuration (optional)",
+    )
+    transform: TransformConfig | None = Field(
+        default=None,
+        description="Post-collection transform (filter/fields/add_columns)",
+    )
+    export: list[ExportDestination] = Field(
+        default_factory=list,
+        description="Export destinations (file/webhook) applied after Parquet write",
+    )
+
+    @field_validator("endpoint")
+    @classmethod
+    def validate_endpoint(cls, v: str) -> str:
+        if not v.startswith("/"):
+            raise ValueError(f"Endpoint must start with '/', got: {v}")
+        return v
+
+
+class StorageConfig(BaseModel):
+    """Storage configuration for dataset output."""
+
+    format: str = Field(default="parquet", description="Storage format")
+    path: str = Field(default="./data/", description="Base directory for data files")
+    partition_by: list[str] = Field(
+        default_factory=lambda: ["source_id", "collected_date"],
+        description="Partition dimensions",
+    )
+
+
+class DatasetConfig(BaseModel):
+    """Top-level dataset configuration parsed from YAML."""
+
+    name: str = Field(description="Dataset name")
+    description: str = Field(default="", description="Dataset description")
+    sources: list[DatasetSource] = Field(description="Data sources to collect")
+    storage: StorageConfig = Field(default_factory=StorageConfig)
+    schedule: ScheduleConfig | None = Field(default=None, description="Collection schedule")
+    notifications: NotificationsConfig | None = Field(default=None, description="Webhook notifications")
+
+    _config_dir: Path | None = PrivateAttr(default=None)
+
+    @field_validator("sources")
+    @classmethod
+    def validate_unique_ids(cls, v: list[DatasetSource]) -> list[DatasetSource]:
+        ids = [s.id for s in v]
+        dupes = [sid for sid in ids if ids.count(sid) > 1]
+        if dupes:
+            raise ValueError(f"Duplicate source IDs: {set(dupes)}")
+        return v
+
+    @classmethod
+    def from_yaml(cls, path: Path) -> DatasetConfig:
+        """Load dataset configuration from a YAML file."""
+        with open(path) as f:
+            data = yaml.safe_load(f)
+        config = cls.model_validate(data)
+        config._config_dir = path.resolve().parent
+        return config
+
+    def get_source(self, source_id: str) -> DatasetSource | None:
+        """Get a source by ID."""
+        for s in self.sources:
+            if s.id == source_id:
+                return s
+        return None
+
+    def topological_sort(self) -> list[DatasetSource]:
+        """Sort sources by dependency order using Kahn's algorithm.
+
+        Returns:
+            List of sources in execution order (independent first).
+
+        Raises:
+            CircularDependencyError: If dependencies form a cycle.
+            SourceNotFoundError: If a dependency references a non-existent source.
+        """
+        source_map = {s.id: s for s in self.sources}
+
+        # Build adjacency: in_degree counts and adjacency list
+        in_degree: dict[str, int] = {s.id: 0 for s in self.sources}
+        dependents: dict[str, list[str]] = {s.id: [] for s in self.sources}
+
+        for source in self.sources:
+            if source.dependency:
+                parent_id = source.dependency.from_source
+                if parent_id not in source_map:
+                    raise SourceNotFoundError(parent_id, source.id)
+                in_degree[source.id] += 1
+                dependents[parent_id].append(source.id)
+
+        # Kahn's algorithm
+        queue: deque[str] = deque()
+        for sid, degree in in_degree.items():
+            if degree == 0:
+                queue.append(sid)
+
+        result: list[DatasetSource] = []
+        while queue:
+            sid = queue.popleft()
+            result.append(source_map[sid])
+            for dep_id in dependents[sid]:
+                in_degree[dep_id] -= 1
+                if in_degree[dep_id] == 0:
+                    queue.append(dep_id)
+
+        if len(result) != len(self.sources):
+            # Find the cycle
+            remaining = [s.id for s in self.sources if s.id not in {r.id for r in result}]
+            raise CircularDependencyError(remaining)
+
+        return result
+
+    def storage_path(self) -> Path:
+        """Resolve the storage base path.
+
+        Relative paths are resolved against the directory containing the
+        YAML config file (set by ``from_yaml``). Absolute paths and
+        programmatic configs (no config dir) use the path as-is.
+        """
+        p = Path(self.storage.path)
+        if not p.is_absolute() and self._config_dir is not None:
+            return self._config_dir / p
+        return p
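
For orientation, a minimal usage sketch of these models (not part of the package; the company-search endpoint, the company_url parameter name, and the url record field are illustrative assumptions, while /api/linkedin/search/users comes from the field documentation above):

from anysite.dataset.models import DatasetConfig

config = DatasetConfig.model_validate({
    "name": "demo",
    "sources": [
        {
            "id": "companies",
            "endpoint": "/api/linkedin/search/companies",  # hypothetical endpoint
            "params": {"keywords": "fintech", "count": 50},
        },
        {
            "id": "employees",
            "endpoint": "/api/linkedin/search/users",  # example path from the field docs above
            "dependency": {"from_source": "companies", "field": "url", "dedupe": True},
            "input_key": "company_url",  # hypothetical parameter name
            "parallel": 4,
            "rate_limit": "10/s",
        },
    ],
})

# Kahn's sort puts the independent source first: ['companies', 'employees']
print([s.id for s in config.topological_sort()])

# Programmatic configs have no YAML directory, so the default './data/' is used as-is.
print(config.storage_path())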
anysite/dataset/notifications.py
@@ -0,0 +1,87 @@
+"""Webhook notifications for dataset collection events."""
+
+from __future__ import annotations
+
+import logging
+from datetime import UTC, datetime
+from typing import Any
+
+from anysite.dataset.models import NotificationsConfig
+
+logger = logging.getLogger(__name__)
+
+
+class WebhookNotifier:
+    """Send webhook notifications on collection complete/failure."""
+
+    def __init__(self, config: NotificationsConfig) -> None:
+        self.config = config
+
+    async def notify_complete(
+        self,
+        dataset_name: str,
+        record_count: int,
+        source_count: int,
+        duration: float,
+    ) -> None:
+        """Send on_complete webhooks."""
+        if not self.config.on_complete:
+            return
+
+        payload = _build_payload(
+            event="complete",
+            dataset_name=dataset_name,
+            record_count=record_count,
+            source_count=source_count,
+            duration=duration,
+        )
+
+        for hook in self.config.on_complete:
+            await self._send(hook.url, hook.headers, payload)
+
+    async def notify_failure(
+        self,
+        dataset_name: str,
+        error: str,
+        duration: float,
+    ) -> None:
+        """Send on_failure webhooks."""
+        if not self.config.on_failure:
+            return
+
+        payload = _build_payload(
+            event="failure",
+            dataset_name=dataset_name,
+            error=error,
+            duration=duration,
+        )
+
+        for hook in self.config.on_failure:
+            await self._send(hook.url, hook.headers, payload)
+
+    async def _send(self, url: str, headers: dict[str, str], payload: dict[str, Any]) -> None:
+        """Send a single webhook POST."""
+        try:
+            import httpx
+
+            async with httpx.AsyncClient(timeout=15.0) as client:
+                resp = await client.post(url, json=payload, headers=headers)
+                resp.raise_for_status()
+            logger.info("Notification sent to %s", url)
+        except Exception as e:
+            logger.error("Notification to %s failed: %s", url, e)
+
+
+def _build_payload(
+    *,
+    event: str,
+    dataset_name: str,
+    **kwargs: Any,
+) -> dict[str, Any]:
+    """Build a webhook notification payload."""
+    return {
+        "event": event,
+        "dataset": dataset_name,
+        "timestamp": datetime.now(UTC).isoformat(),
+        **kwargs,
+    }
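
A short usage sketch of the notifier (the URL and header below are placeholders, not values shipped with the package):

import asyncio

from anysite.dataset.models import NotificationsConfig, WebhookNotification
from anysite.dataset.notifications import WebhookNotifier

config = NotificationsConfig(
    on_complete=[
        WebhookNotification(url="https://example.com/hooks/datasets", headers={"X-Token": "placeholder"}),
    ],
)
notifier = WebhookNotifier(config)

# POSTs {"event": "complete", "dataset": "demo", "timestamp": ..., "record_count": 120, ...}
# to each on_complete URL; delivery errors are logged, never raised.
asyncio.run(notifier.notify_complete("demo", record_count=120, source_count=2, duration=4.2))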
anysite/dataset/scheduler.py
@@ -0,0 +1,107 @@
+"""Schedule generation for dataset collection — crontab and systemd."""
+
+from __future__ import annotations
+
+import shutil
+
+
+class ScheduleGenerator:
+    """Generate crontab or systemd timer entries for dataset collection."""
+
+    def __init__(self, dataset_name: str, cron_expr: str, yaml_path: str) -> None:
+        self.dataset_name = dataset_name
+        self.cron = cron_expr
+        self.yaml_path = yaml_path
+
+    def generate_crontab(self, *, incremental: bool = False, load_db: str | None = None) -> str:
+        """Generate a crontab entry."""
+        anysite = shutil.which("anysite") or "anysite"
+        cmd = f"{anysite} dataset collect {self.yaml_path}"
+        if incremental:
+            cmd += " --incremental"
+        if load_db:
+            cmd += f" --load-db {load_db}"
+        return f"{self.cron} {cmd} >> ~/.anysite/logs/{self.dataset_name}_cron.log 2>&1"
+
+    def generate_systemd(self, *, incremental: bool = False, load_db: str | None = None) -> dict[str, str]:
+        """Generate systemd service and timer unit files."""
+        anysite = shutil.which("anysite") or "anysite"
+        cmd = f"{anysite} dataset collect {self.yaml_path}"
+        if incremental:
+            cmd += " --incremental"
+        if load_db:
+            cmd += f" --load-db {load_db}"
+
+        service_name = f"anysite-dataset-{self.dataset_name}"
+        on_calendar = _cron_to_oncalendar(self.cron)
+
+        service = f"""[Unit]
+Description=Anysite dataset collection: {self.dataset_name}
+
+[Service]
+Type=oneshot
+ExecStart={cmd}
+StandardOutput=journal
+StandardError=journal
+"""
+
+        timer = f"""[Unit]
+Description=Timer for anysite dataset: {self.dataset_name}
+
+[Timer]
+OnCalendar={on_calendar}
+Persistent=true
+
+[Install]
+WantedBy=timers.target
+"""
+
+        return {
+            f"{service_name}.service": service,
+            f"{service_name}.timer": timer,
+        }
+
+
+def _cron_to_oncalendar(cron: str) -> str:
+    """Convert a cron expression to systemd OnCalendar format (best effort).
+
+    Handles common patterns:
+        0 9 * * MON -> Mon *-*-* 09:00:00
+        */5 * * * * -> *-*-* *:0/5:00
+        0 0 1 * * -> *-*-01 00:00:00
+    """
+    parts = cron.split()
+    if len(parts) != 5:
+        return cron  # pass through as-is
+
+    minute, hour, day, month, dow = parts
+
+    # Day-of-week mapping
+    dow_map = {
+        "0": "Sun", "1": "Mon", "2": "Tue", "3": "Wed",
+        "4": "Thu", "5": "Fri", "6": "Sat", "7": "Sun",
+        "MON": "Mon", "TUE": "Tue", "WED": "Wed",
+        "THU": "Thu", "FRI": "Fri", "SAT": "Sat", "SUN": "Sun",
+    }
+
+    # Format components
+    dow_str = ""
+    if dow != "*":
+        dow_parts = dow.replace(",", " ").split()
+        dow_str = ",".join(dow_map.get(d.upper(), d) for d in dow_parts) + " "
+
+    month_str = month if month != "*" else "*"
+    day_str = day.zfill(2) if day != "*" and "/" not in day else day
+
+    hour_str = hour.zfill(2) if hour != "*" and "/" not in hour else hour
+    min_str = minute.zfill(2) if minute != "*" and "/" not in minute else minute
+
+    # Handle step values
+    if "/" in minute:
+        base, step = minute.split("/")
+        min_str = f"{base or '0'}/{step}"
+    if "/" in hour:
+        base, step = hour.split("/")
+        hour_str = f"{base or '0'}/{step}"
+
+    return f"{dow_str}*-{month_str}-{day_str} {hour_str}:{min_str}:00"
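
A usage sketch under assumed inputs (the dataset name and YAML path are invented; the crontab command resolves to wherever the anysite binary is installed):

from anysite.dataset.scheduler import ScheduleGenerator

gen = ScheduleGenerator("weekly-companies", "0 9 * * MON", "datasets/companies.yaml")

print(gen.generate_crontab(incremental=True))
# -> 0 9 * * MON <anysite> dataset collect datasets/companies.yaml --incremental
#    >> ~/.anysite/logs/weekly-companies_cron.log 2>&1

units = gen.generate_systemd()
print(units["anysite-dataset-weekly-companies.timer"])
# The timer unit contains OnCalendar=Mon *-*-* 09:00:00, converted from "0 9 * * MON".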
anysite/dataset/storage.py
@@ -0,0 +1,171 @@
+"""Parquet storage layer for dataset records."""
+
+from __future__ import annotations
+
+import json
+from datetime import date
+from pathlib import Path
+from typing import Any
+
+
+def write_parquet(
+    records: list[dict[str, Any]],
+    path: Path,
+) -> int:
+    """Write records to a Parquet file.
+
+    Args:
+        records: List of dicts to write.
+        path: Output file path.
+
+    Returns:
+        Number of records written.
+    """
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    if not records:
+        return 0
+
+    # Normalize: flatten nested structures to JSON strings for non-scalar types
+    normalized = _normalize_records(records)
+
+    table = pa.Table.from_pylist(normalized)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    pq.write_table(table, path)
+    return len(records)
+
+
+def _normalize_records(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Normalize records for Parquet compatibility.
+
+    Converts nested dicts/lists to JSON strings so that pyarrow
+    can infer a consistent schema across heterogeneous records.
+    """
+    if not records:
+        return records
+
+    result = []
+    for record in records:
+        normalized: dict[str, Any] = {}
+        for key, value in record.items():
+            if isinstance(value, (dict, list)):
+                normalized[key] = json.dumps(value, default=str)
+            else:
+                normalized[key] = value
+        result.append(normalized)
+    return result
+
+
+def read_parquet(path: Path) -> list[dict[str, Any]]:
+    """Read records from a Parquet file or directory of Parquet files.
+
+    Args:
+        path: Parquet file or directory containing .parquet files.
+
+    Returns:
+        List of dicts.
+    """
+    import pyarrow.parquet as pq
+
+    if path.is_dir():
+        files = sorted(path.glob("*.parquet"))
+        if not files:
+            return []
+        tables = [pq.read_table(f) for f in files]
+        import pyarrow as pa
+
+        table = pa.concat_tables(tables)
+    else:
+        if not path.exists():
+            return []
+        table = pq.read_table(path)
+
+    return table.to_pylist()
+
+
+def get_source_dir(base_path: Path, source_id: str) -> Path:
+    """Get the raw data directory for a source."""
+    return base_path / "raw" / source_id
+
+
+def get_parquet_path(base_path: Path, source_id: str, collected_date: date | None = None) -> Path:
+    """Get the Parquet file path for a source on a given date."""
+    if collected_date is None:
+        collected_date = date.today()
+    source_dir = get_source_dir(base_path, source_id)
+    return source_dir / f"{collected_date.isoformat()}.parquet"
+
+
+class MetadataStore:
+    """Read/write metadata.json for dataset state tracking."""
+
+    def __init__(self, base_path: Path) -> None:
+        self.path = base_path / "metadata.json"
+
+    def load(self) -> dict[str, Any]:
+        if self.path.exists():
+            with open(self.path) as f:
+                return json.load(f)  # type: ignore[no-any-return]
+        return {"sources": {}}
+
+    def save(self, data: dict[str, Any]) -> None:
+        self.path.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.path, "w") as f:
+            json.dump(data, f, indent=2, default=str)
+
+    def update_source(
+        self,
+        source_id: str,
+        record_count: int,
+        collected_date: date | None = None,
+    ) -> None:
+        """Update metadata for a collected source."""
+        if collected_date is None:
+            collected_date = date.today()
+
+        data = self.load()
+        sources = data.setdefault("sources", {})
+        sources[source_id] = {
+            "last_collected": collected_date.isoformat(),
+            "record_count": record_count,
+        }
+        data["last_run"] = collected_date.isoformat()
+        self.save(data)
+
+    def get_source_info(self, source_id: str) -> dict[str, Any] | None:
+        """Get metadata for a specific source."""
+        data = self.load()
+        return data.get("sources", {}).get(source_id)
+
+    def get_all_sources(self) -> dict[str, Any]:
+        """Get metadata for all sources."""
+        data = self.load()
+        return data.get("sources", {})
+
+    def update_collected_inputs(
+        self, source_id: str, inputs: list[str]
+    ) -> None:
+        """Append collected input values to metadata for dedup tracking."""
+        data = self.load()
+        sources = data.setdefault("sources", {})
+        source_info = sources.setdefault(source_id, {})
+        existing = set(source_info.get("collected_inputs", []))
+        existing.update(str(v) for v in inputs)
+        source_info["collected_inputs"] = sorted(existing)
+        self.save(data)
+
+    def get_collected_inputs(self, source_id: str) -> set[str]:
+        """Get the set of already-collected input values for a source."""
+        info = self.get_source_info(source_id)
+        if info and "collected_inputs" in info:
+            return set(info["collected_inputs"])
+        return set()
+
+    def reset_collected_inputs(self, source_id: str) -> None:
+        """Clear collected input tracking for a source (forces re-collection)."""
+        data = self.load()
+        source_info = data.get("sources", {}).get(source_id, {})
+        if "collected_inputs" in source_info:
+            del source_info["collected_inputs"]
+        self.save(data)
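
A sketch of the storage round trip (paths and records are invented for illustration; pyarrow must be installed):

import json
from pathlib import Path

from anysite.dataset.storage import MetadataStore, get_parquet_path, read_parquet, write_parquet

base = Path("./data")
records = [
    {"name": "Acme", "employee_count": 42, "locations": ["Berlin", "Oslo"]},
    {"name": "Globex", "employee_count": 7, "locations": ["Austin"]},
]

path = get_parquet_path(base, "companies")  # ./data/raw/companies/<today>.parquet
written = write_parquet(records, path)      # nested lists/dicts are stored as JSON strings

rows = read_parquet(path)
print(json.loads(rows[0]["locations"]))     # ['Berlin', 'Oslo']

store = MetadataStore(base)
store.update_source("companies", record_count=written)
print(store.get_source_info("companies"))   # {'last_collected': '...', 'record_count': 2}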