data-collection-framework 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_collection_framework-0.1.0.dist-info/METADATA +19 -0
- data_collection_framework-0.1.0.dist-info/RECORD +44 -0
- data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
- data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
- data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
- dcf/__init__.py +4 -0
- dcf/cli.py +841 -0
- dcf/config/__init__.py +4 -0
- dcf/config/loader.py +77 -0
- dcf/config/models.py +240 -0
- dcf/engine/__init__.py +6 -0
- dcf/engine/fetcher.py +118 -0
- dcf/engine/iterator.py +96 -0
- dcf/engine/projector.py +56 -0
- dcf/engine/runner.py +90 -0
- dcf/engine/transforms.py +41 -0
- dcf/gcp/__init__.py +0 -0
- dcf/gcp/_collector_utils.py +87 -0
- dcf/gcp/auth.py +1 -0
- dcf/gcp/batch_deploy.py +548 -0
- dcf/gcp/bootstrap.py +131 -0
- dcf/gcp/gcloud.py +42 -0
- dcf/gcp/terraform.py +151 -0
- dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
- dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
- dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
- dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
- dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
- dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
- dcf/infra/modules/batch_collector/local/main.tf +32 -0
- dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/local/variables.tf +25 -0
- dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
- dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
- dcf/infra/templates/docker-compose.yml.tftpl +76 -0
- dcf/local_deploy.py +756 -0
- dcf/project.py +23 -0
- dcf/spark_session.py +66 -0
- dcf/warehouse_reader.py +323 -0
- dcf/writer/__init__.py +3 -0
- dcf/writer/iceberg.py +315 -0
dcf/config/__init__.py
ADDED
dcf/config/loader.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
from .models import Collector
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _project_config() -> dict:
    """Return the parsed project.yml, or an empty dict when it is unavailable.

    An empty dict is returned when ``find_project_root()`` raises
    ``RuntimeError``, when the root contains no ``project.yml``, or when the
    file parses to a falsy value (for example an empty document).
    """
    from ..project import find_project_root

    try:
        config_file = find_project_root() / "project.yml"
    except RuntimeError:
        return {}

    if not config_file.exists():
        return {}

    parsed = yaml.safe_load(config_file.read_text())
    return parsed or {}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _resolve_env(value: str, project_cfg: dict) -> str:
|
|
24
|
+
"""Replace {{ env.VAR }} placeholders.
|
|
25
|
+
|
|
26
|
+
Resolution order:
|
|
27
|
+
1. OS environment variable
|
|
28
|
+
2. project.yml key (VAR lowercased, e.g. PORTLANDMAPS_API_KEY → portlandmaps_api_key)
|
|
29
|
+
"""
|
|
30
|
+
import re
|
|
31
|
+
def replacer(match):
|
|
32
|
+
var = match.group(1).strip()
|
|
33
|
+
resolved = os.environ.get(var)
|
|
34
|
+
if resolved is None:
|
|
35
|
+
resolved = project_cfg.get(var.lower())
|
|
36
|
+
if not resolved:
|
|
37
|
+
raise EnvironmentError(
|
|
38
|
+
f"'{var}' is not set — add it as an environment variable "
|
|
39
|
+
f"or set '{var.lower()}' in project.yml"
|
|
40
|
+
)
|
|
41
|
+
return resolved
|
|
42
|
+
return re.sub(r"\{\{\s*env\.(\w+)\s*\}\}", replacer, value)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _resolve_env_in(obj, project_cfg: dict):
|
|
46
|
+
if isinstance(obj, dict):
|
|
47
|
+
return {k: _resolve_env_in(v, project_cfg) for k, v in obj.items()}
|
|
48
|
+
if isinstance(obj, list):
|
|
49
|
+
return [_resolve_env_in(v, project_cfg) for v in obj]
|
|
50
|
+
if isinstance(obj, str):
|
|
51
|
+
return _resolve_env(obj, project_cfg)
|
|
52
|
+
return obj
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def load_collector(path: Path, resolve_env: bool = True) -> Collector:
    """Parse the YAML file at *path* into a ``Collector``.

    With ``resolve_env=True`` every ``{{ env.VAR }}`` placeholder is resolved
    against the environment and project.yml; otherwise placeholders are
    swapped for a dummy token so the document can be validated structurally.
    """
    document = yaml.safe_load(path.read_text())
    if not resolve_env:
        return Collector.from_dict(_strip_env_placeholders(document))
    return Collector.from_dict(_resolve_env_in(document, _project_config()))
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _strip_env_placeholders(obj):
|
|
65
|
+
"""Replace {{ env.VAR }} with a placeholder string for structural validation."""
|
|
66
|
+
import re
|
|
67
|
+
if isinstance(obj, dict):
|
|
68
|
+
return {k: _strip_env_placeholders(v) for k, v in obj.items()}
|
|
69
|
+
if isinstance(obj, list):
|
|
70
|
+
return [_strip_env_placeholders(v) for v in obj]
|
|
71
|
+
if isinstance(obj, str):
|
|
72
|
+
return re.sub(r"\{\{\s*env\.\w+\s*\}\}", "<env>", obj)
|
|
73
|
+
return obj
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def load_all_collectors(collectors_dir: Path, resolve_env: bool = True) -> list[Collector]:
    """Load every ``*.yml`` file directly inside *collectors_dir*, in sorted path order."""
    collectors = []
    for yml_path in sorted(collectors_dir.glob("*.yml")):
        collectors.append(load_collector(yml_path, resolve_env=resolve_env))
    return collectors
|
dcf/config/models.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re as _re
|
|
4
|
+
from typing import Any, Literal, Annotated, Union
|
|
5
|
+
from pydantic import BaseModel, Field, model_validator
|
|
6
|
+
|
|
7
|
+
# Matches a single cron field: either a lone "*" or any non-empty mix of
# digits, commas, hyphens, "*" and "/". It checks characters only, not ranges.
_CRON_FIELD_RE = _re.compile(r'^(\*|[0-9,\-\*/]+)$')
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# ------------------------------------------------------------------ #
|
|
11
|
+
# Source — params #
|
|
12
|
+
# ------------------------------------------------------------------ #
|
|
13
|
+
|
|
14
|
+
class Param(BaseModel):
    """A named request parameter declared on a source."""
    name: str
    type: Literal["string", "integer", "float", "date", "boolean"]
    format: str | None = None  # e.g. "%m/%d/%Y" for date URL serialization
    value: Any | None = None  # present → static; absent → must be covered by iterate
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Auth(BaseModel):
    """Credential attached to HTTP requests as a query param, header, or bearer token."""
    type: Literal["query_param", "header", "bearer"]
    key: str | None = None  # param name or header name; unused (and optional) for bearer
    value: str  # supports "{{ env.VAR }}"

    @model_validator(mode="after")
    def key_required_for_non_bearer(self) -> "Auth":
        """Reject ``query_param``/``header`` auth that does not name a key."""
        if self.key:
            return self
        if self.type in ("query_param", "header"):
            raise ValueError(f"auth.key is required when type is '{self.type}'")
        return self
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class RateLimit(BaseModel):
    """Request budget: ``requests`` calls allowed per ``per_minutes`` minutes."""
    requests: int  # number of requests allowed in the window
    per_minutes: float  # window length, in minutes
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Response(BaseModel):
    """Describes how an HTTP response body is decoded into records."""
    format: Literal["json", "csv"]
    records_path: str | None = None  # key (or dot-path) in JSON holding the records array
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ------------------------------------------------------------------ #
|
|
44
|
+
# Source — iteration #
|
|
45
|
+
# ------------------------------------------------------------------ #
|
|
46
|
+
|
|
47
|
+
class DateRangeIterate(BaseModel):
    """Iteration axis that walks a date range in fixed steps."""
    type: Literal["date_range"]
    params: list[str]  # one or two param names that receive the window start/end
    start: str  # ISO date or "today"
    end: str  # ISO date or "today"
    step: str  # e.g. "1 day", "7 days"
    window: str | None = None  # defaults to step when absent
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class CategoricalIterate(BaseModel):
    """Iteration axis that assigns each entry of ``values`` to ``param`` in turn."""
    type: Literal["categorical"]
    param: str  # name of the param that receives each value
    values: list[Any]  # one request step is produced per entry
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Any supported iteration axis; this is the element type of Cadence.iterate.
IterateSpec = DateRangeIterate | CategoricalIterate
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _validate_dynamic_params(params: list[Param], iterate: list[IterateSpec]) -> None:
|
|
66
|
+
dynamic = {p.name for p in params if p.value is None}
|
|
67
|
+
covered: set[str] = set()
|
|
68
|
+
for it in iterate:
|
|
69
|
+
if isinstance(it, DateRangeIterate):
|
|
70
|
+
covered.update(it.params)
|
|
71
|
+
elif isinstance(it, CategoricalIterate):
|
|
72
|
+
covered.add(it.param)
|
|
73
|
+
missing = dynamic - covered
|
|
74
|
+
if missing:
|
|
75
|
+
raise ValueError(f"Params declared without a value or iterator: {missing}")
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ------------------------------------------------------------------ #
|
|
79
|
+
# Schema #
|
|
80
|
+
# ------------------------------------------------------------------ #
|
|
81
|
+
|
|
82
|
+
class CrsReprojectTransform(BaseModel):
    """Column transform selected by ``type: crs_reproject``."""
    type: Literal["crs_reproject"]
    from_columns: list[str]  # presumably the raw-record fields holding the source coordinates — confirm in transforms.py
    from_crs: str  # NOTE(review): accepted CRS identifier format is not visible here
    to_crs: str
    component: Literal["x", "y"]  # presumably which reprojected coordinate this column keeps — confirm
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class ArrayJoinTransform(BaseModel):
    """Column transform selected by ``type: array_join``."""
    type: Literal["array_join"]
    path: str  # dot-notation path to the array field in the raw record
    separator: str = ","  # delimiter used to join elements
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Tagged union of the supported column transforms; the "type" field is the
# discriminator that selects which model validates the data.
Transform = Annotated[
    Union[CrsReprojectTransform, ArrayJoinTransform],
    Field(discriminator="type"),
]
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class Column(BaseModel):
    """One output column, sourced from a record path or from a transform."""
    name: str
    path: str | None = None  # key in the raw record (dot-notation for nested)
    type: Literal["string", "integer", "float", "date", "timestamp", "boolean"] | None = None
    transform: Transform | None = None

    @model_validator(mode="after")
    def has_source(self) -> Column:
        """Require at least one of ``path`` / ``transform`` to be set."""
        has_any_source = self.path is not None or self.transform is not None
        if not has_any_source:
            raise ValueError(f"Column '{self.name}' must have either 'path' or 'transform'")
        return self
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class Schema(BaseModel):
    """Declares the columns kept when raw records are projected."""
    columns: list[Column]
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ------------------------------------------------------------------ #
|
|
120
|
+
# Source types #
|
|
121
|
+
# ------------------------------------------------------------------ #
|
|
122
|
+
|
|
123
|
+
class HttpSource(BaseModel):
    """Source that fetches records from an HTTP endpoint."""
    # Allow population via the attribute name (schema_) as well as its alias.
    model_config = {"populate_by_name": True}
    type: Literal["http"]
    url: str
    method: Literal["GET", "POST"] = "GET"
    auth: Auth | None = None
    params: list[Param] = []
    response: Response = Response(format="json")  # JSON with no records_path by default
    rate_limit: RateLimit | None = None
    # Trailing underscore on the attribute; the external key is the alias "schema".
    schema_: Schema | None = Field(default=None, alias="schema")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class PythonSource(BaseModel):
    """Calls a Python function that returns list[dict]; handles its own pagination."""
    # Allow population via the attribute name (schema_) as well as its alias.
    model_config = {"populate_by_name": True}
    type: Literal["python"]
    module: str  # importable module path, e.g. "connectors.craigslist_apts"
    function: str  # function name; called as fn(dynamic_params) -> list[dict]
    params: list[Param] = []
    # Trailing underscore on the attribute; the external key is the alias "schema".
    schema_: Schema | None = Field(default=None, alias="schema")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class PubSubSource(BaseModel):
    """Continuously reads JSON messages from a GCP Pub/Sub subscription."""
    # Allow population via the attribute name (schema_) as well as its alias.
    model_config = {"populate_by_name": True}
    type: Literal["pubsub"]
    subscription: str  # full resource path: projects/<project>/subscriptions/<name>
    # Trailing underscore on the attribute; the external key is the alias "schema".
    schema_: Schema | None = Field(default=None, alias="schema")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# Tagged union of all source kinds; the "type" field is the discriminator
# that selects which model validates the data.
Source = Annotated[Union[HttpSource, PythonSource, PubSubSource], Field(discriminator="type")]
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# ------------------------------------------------------------------ #
|
|
157
|
+
# Cadence #
|
|
158
|
+
# ------------------------------------------------------------------ #
|
|
159
|
+
|
|
160
|
+
class StagingConfig(BaseModel):
    """Staging-table settings for a collector's cadence."""
    partition_param: str  # which iterate param splits into separate staging tables
    table_pattern: str  # e.g. "permits_{date_type}_loader_staging"
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class MergeDedup(BaseModel):
    """De-duplication rule applied during a merge; only "latest_non_null" is accepted."""
    type: Literal["latest_non_null"]
    columns: list[str]  # NOTE(review): presumably the columns the rule applies to — confirm in the writer
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class MergeConfig(BaseModel):
    """Merge settings for a collector's cadence."""
    table: str  # NOTE(review): presumably the merge target table — confirm in the writer
    key: str  # NOTE(review): presumably the column rows are matched on — confirm in the writer
    dedup: MergeDedup | None = None  # optional de-duplication rule
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class Cadence(BaseModel):
    """Iteration axes plus load-strategy settings for a collector."""
    iterate: list[IterateSpec] = []  # axes whose cartesian product forms the request sequence
    strategy: Literal["incremental", "append", "full_refresh"]
    primary_key: str | None = None
    staging: StagingConfig | None = None
    merge: MergeConfig | None = None
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# ------------------------------------------------------------------ #
|
|
185
|
+
# Deployment #
|
|
186
|
+
# ------------------------------------------------------------------ #
|
|
187
|
+
|
|
188
|
+
class Deployment(BaseModel):
    """Deployment settings; batch deployments must carry a 5-field cron schedule."""
    type: Literal["batch", "streaming"] = "batch"
    # batch fields
    schedule: str | None = None
    paused: bool = False
    # streaming fields
    window_seconds: int = 60

    @model_validator(mode="after")
    def validate_deployment(self) -> "Deployment":
        """Check that a batch deployment has a well-formed cron schedule.

        Non-batch deployments are accepted without any schedule checks.
        """
        if self.type != "batch":
            return self

        if not self.schedule:
            raise ValueError(
                "deployment.schedule is required for batch deployments "
                '(e.g. schedule: "0 8 * * *")'
            )

        cron_fields = self.schedule.strip().split()
        # The length test short-circuits before any field is pattern-matched.
        well_formed = len(cron_fields) == 5 and all(
            _CRON_FIELD_RE.match(field) for field in cron_fields
        )
        if not well_formed:
            raise ValueError(
                f"deployment.schedule '{self.schedule}' is not a valid cron expression. "
                "Expected 5 space-separated fields: minute hour day-of-month month day-of-week "
                "(e.g. '0 8 * * *' for daily at 8 AM UTC)"
            )
        return self
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
# ------------------------------------------------------------------ #
|
|
215
|
+
# Collector (top-level) #
|
|
216
|
+
# ------------------------------------------------------------------ #
|
|
217
|
+
|
|
218
|
+
class Collector(BaseModel):
    """Top-level collector definition: a source, its cadence, and optional deployment."""
    name: str
    namespace: str | None = None  # warehouse namespace; defaults to collector name when absent
    description: str | None = None
    source: Source
    cadence: Cadence
    deployment: Deployment | None = None

    model_config = {"populate_by_name": True}

    # NOTE: the previous ``model_fields_set`` classmethod was removed. It
    # shadowed pydantic's ``BaseModel.model_fields_set`` property (so instances
    # no longer exposed the set of explicitly provided fields), and calling it
    # always failed because ``super().model_fields_set`` evaluates to the
    # property object itself, which is not callable.

    @model_validator(mode="after")
    def all_dynamic_params_have_iterators(self) -> "Collector":
        """Ensure each value-less source param is supplied by ``cadence.iterate``.

        Only HTTP and Python sources are checked.
        """
        if isinstance(self.source, (HttpSource, PythonSource)):
            _validate_dynamic_params(self.source.params, self.cadence.iterate)
        return self

    @classmethod
    def from_dict(cls, data: dict) -> Collector:
        """Validate *data* (a parsed YAML mapping) and build a ``Collector``."""
        return cls.model_validate(data)
|
dcf/engine/__init__.py
ADDED
dcf/engine/fetcher.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import sys
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import requests
|
|
11
|
+
|
|
12
|
+
from ..config.models import HttpSource, PythonSource, Param
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _resolve_static_params(params: list[Param]) -> dict[str, Any]:
|
|
16
|
+
return {p.name: p.value for p in params if p.value is not None}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _rate_limit_sleep(rate_limit) -> None:
|
|
20
|
+
if rate_limit is None:
|
|
21
|
+
return
|
|
22
|
+
sleep_secs = (rate_limit.per_minutes * 60) / rate_limit.requests
|
|
23
|
+
time.sleep(sleep_secs)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _get_nested(record: dict, path: str) -> Any:
|
|
27
|
+
"""Resolve a dot-notation path into a nested dict."""
|
|
28
|
+
parts = path.split(".")
|
|
29
|
+
val = record
|
|
30
|
+
for part in parts:
|
|
31
|
+
if not isinstance(val, dict):
|
|
32
|
+
return None
|
|
33
|
+
val = val.get(part)
|
|
34
|
+
return val
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _parse_response(response: requests.Response, source: HttpSource) -> list[dict]:
|
|
38
|
+
fmt = source.response.format
|
|
39
|
+
|
|
40
|
+
if fmt == "csv":
|
|
41
|
+
try:
|
|
42
|
+
df = pd.read_csv(io.StringIO(response.text))
|
|
43
|
+
except pd.errors.EmptyDataError:
|
|
44
|
+
return []
|
|
45
|
+
return df.to_dict(orient="records")
|
|
46
|
+
|
|
47
|
+
if fmt == "json":
|
|
48
|
+
data = response.json()
|
|
49
|
+
if source.response.records_path:
|
|
50
|
+
for key in source.response.records_path.split("."):
|
|
51
|
+
if not isinstance(data, dict):
|
|
52
|
+
raise ValueError(
|
|
53
|
+
f"records_path '{source.response.records_path}' could not be followed: "
|
|
54
|
+
f"expected a JSON object at key '{key}' but found {type(data).__name__}. "
|
|
55
|
+
f"If the response is a top-level array, omit records_path entirely."
|
|
56
|
+
)
|
|
57
|
+
data = data.get(key, [])
|
|
58
|
+
if isinstance(data, list):
|
|
59
|
+
return data
|
|
60
|
+
return [data]
|
|
61
|
+
|
|
62
|
+
raise ValueError(f"Unsupported response format: '{fmt}'")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _fetch_http(source: HttpSource, dynamic_params: dict[str, Any]) -> list[dict]:
    """Issue one HTTP request for *source* and return the parsed records.

    Query parameters are the source's static params overlaid with
    *dynamic_params*; credentials are attached according to ``source.auth``.
    The rate-limit pause happens before the request is sent.

    Raises:
        requests.HTTPError: On a non-success status, with a message naming
            the status, the URL, and a hint for 401/403/404/429.
    """
    query = _resolve_static_params(source.params)
    query.update(dynamic_params)

    headers = {}
    auth = source.auth
    if auth:
        if auth.type == "query_param":
            query[auth.key] = auth.value
        elif auth.type == "header":
            headers[auth.key] = auth.value
        elif auth.type == "bearer":
            headers["Authorization"] = f"Bearer {auth.value}"

    _rate_limit_sleep(source.rate_limit)

    response = requests.request(
        method=source.method,
        url=source.url,
        params=query,
        headers=headers,
        timeout=60,
    )

    try:
        response.raise_for_status()
    except requests.HTTPError:
        status = response.status_code
        hints = {
            401: "Check that your API token or key is correct and has not expired.",
            403: "Your credentials may lack the required permissions for this endpoint.",
            404: "The URL may be wrong, or this resource does not exist.",
            429: "Rate limit exceeded. Add a rate_limit block to your collector YAML to slow down requests.",
        }
        message = f"HTTP {status} from {source.url}"
        hint = hints.get(status, "")
        if hint:
            message += f" — {hint}"
        raise requests.HTTPError(message, response=response)

    return _parse_response(response, source)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _fetch_python(source: PythonSource, dynamic_params: dict[str, Any]) -> list[dict]:
    """Import the connector named by *source* and call it with *dynamic_params*.

    The project root is put at the front of ``sys.path`` (if not already
    present) so that ``source.module`` can be imported from the project.
    """
    import importlib

    from ..project import find_project_root

    root = str(find_project_root())
    if root not in sys.path:
        sys.path.insert(0, root)

    connector = getattr(importlib.import_module(source.module), source.function)
    return connector(dynamic_params)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def fetch_records(source, dynamic_params: dict[str, Any]) -> list[dict]:
    """Fetch records for one request step, dispatching on the source kind.

    ``PythonSource`` instances go to the Python connector path; every other
    source is handled by the HTTP fetcher.
    """
    fetch = _fetch_python if isinstance(source, PythonSource) else _fetch_http
    return fetch(source, dynamic_params)
|
dcf/engine/iterator.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import itertools
|
|
4
|
+
from datetime import date, timedelta
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from ..config.models import DateRangeIterate, CategoricalIterate, IterateSpec, Param
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _parse_duration(s: str) -> timedelta:
|
|
11
|
+
"""Parse simple duration strings like '1 day', '7 days', '2 weeks'."""
|
|
12
|
+
parts = s.strip().split()
|
|
13
|
+
if len(parts) != 2:
|
|
14
|
+
raise ValueError(f"Cannot parse duration: '{s}'. Expected '<n> <unit>'")
|
|
15
|
+
n = int(parts[0])
|
|
16
|
+
unit = parts[1].rstrip("s") # normalize "days" → "day"
|
|
17
|
+
if unit == "day":
|
|
18
|
+
return timedelta(days=n)
|
|
19
|
+
if unit == "week":
|
|
20
|
+
return timedelta(weeks=n)
|
|
21
|
+
if unit == "month":
|
|
22
|
+
return timedelta(days=30 * n)
|
|
23
|
+
raise ValueError(f"Unknown duration unit: '{unit}'")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _resolve_date(value: str) -> date:
|
|
27
|
+
if value == "today":
|
|
28
|
+
return date.today()
|
|
29
|
+
return date.fromisoformat(value)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _format_date(d: date, fmt: str | None, param_defs: dict[str, Param]) -> str:
|
|
33
|
+
"""Apply the param's declared format, falling back to ISO."""
|
|
34
|
+
if fmt:
|
|
35
|
+
return d.strftime(fmt)
|
|
36
|
+
return d.isoformat()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _date_range_steps(spec: DateRangeIterate, param_defs: dict[str, Param]) -> list[dict[str, Any]]:
    """
    Build one dict per step. Each dict maps param names to their formatted values.
    When spec.params has one entry it receives the window start; otherwise the
    first entry receives the window start and the second receives the window end.

    Windows start at ``spec.start`` and advance by ``spec.step`` while the
    window start is on or before ``spec.end``. Each window spans
    ``spec.window`` (defaulting to the step) inclusive of its first day, and
    the window end is clamped to ``spec.end``.

    Raises:
        ValueError: If the parsed step is not positive, which would otherwise
            make the loop below run forever.
    """
    step = _parse_duration(spec.step)
    if step <= timedelta(0):
        raise ValueError(f"date_range step must be positive, got '{spec.step}'")
    window = _parse_duration(spec.window) if spec.window else step

    start = _resolve_date(spec.start)
    end = _resolve_date(spec.end)

    # Defined once rather than on every loop iteration.
    def fmt(d: date, param_name: str) -> str:
        p = param_defs.get(param_name)
        return _format_date(d, p.format if p else None, param_defs)

    steps = []
    window_start = start
    while window_start <= end:
        # The window is inclusive of its first day, hence the "- 1 day".
        window_end = min(window_start + window - timedelta(days=1), end)

        if len(spec.params) == 1:
            steps.append({spec.params[0]: fmt(window_start, spec.params[0])})
        else:
            steps.append({
                spec.params[0]: fmt(window_start, spec.params[0]),
                spec.params[1]: fmt(window_end, spec.params[1]),
            })
        window_start += step
    return steps
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _categorical_steps(spec: CategoricalIterate) -> list[dict[str, Any]]:
|
|
72
|
+
return [{spec.param: v} for v in spec.values]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def build_request_sequence(
    iterate: list[IterateSpec],
    param_defs: dict[str, Param],
) -> list[dict[str, Any]]:
    """
    Expand the iteration axes into the full list of per-request param dicts.

    The result is the cartesian product of all axes; each element merges one
    step from every axis into a single {param_name: value} dict. When axes
    share a key, the value from the later axis wins. With no axes, a single
    empty dict is returned so exactly one request is made.
    """
    if not iterate:
        return [{}]

    axes: list[list[dict[str, Any]]] = []
    for spec in iterate:
        if isinstance(spec, DateRangeIterate):
            axes.append(_date_range_steps(spec, param_defs))
        elif isinstance(spec, CategoricalIterate):
            axes.append(_categorical_steps(spec))

    sequence: list[dict[str, Any]] = []
    for combo in itertools.product(*axes):
        merged: dict[str, Any] = {}
        for part in combo:
            merged.update(part)
        sequence.append(merged)
    return sequence
|
dcf/engine/projector.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from ..config.models import Schema, Column
|
|
8
|
+
from .transforms import apply_transform
|
|
9
|
+
from ..engine.fetcher import _get_nested
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
_CAST = {
|
|
13
|
+
"string": str,
|
|
14
|
+
"integer": lambda v: int(float(v)) if v is not None else None,
|
|
15
|
+
"float": lambda v: float(v) if v is not None else None,
|
|
16
|
+
"boolean": lambda v: bool(v) if v is not None else None,
|
|
17
|
+
"date": lambda v: pd.to_datetime(v, errors="coerce"),
|
|
18
|
+
"timestamp": lambda v: pd.to_datetime(v, errors="coerce"),
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _cast(value: Any, col_type: str | None) -> Any:
|
|
23
|
+
if col_type is None or value is None:
|
|
24
|
+
return value
|
|
25
|
+
try:
|
|
26
|
+
return _CAST[col_type](value)
|
|
27
|
+
except (ValueError, TypeError):
|
|
28
|
+
return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _extract(record: dict, col: Column) -> Any:
    """Get the raw value for *col* from *record*.

    A column with a transform is computed by ``apply_transform``; otherwise
    the value is looked up at the column's dot-notation path.
    """
    transform = col.transform
    if transform is None:
        return _get_nested(record, col.path)
    return apply_transform(transform, record)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def project(records: list[dict], schema: Schema | None) -> pd.DataFrame:
    """
    Build a DataFrame holding only the schema's declared columns.

    Each declared column is extracted (via its transform or path) and cast to
    its declared type; fields not listed in the schema are dropped.
    With no schema, the records are loaded into a DataFrame as-is.
    With a schema but no records, an empty frame with the declared column
    names is returned.
    """
    if schema is None:
        return pd.DataFrame(records)

    columns = schema.columns
    if not records:
        return pd.DataFrame(columns=[col.name for col in columns])

    return pd.DataFrame([
        {col.name: _cast(_extract(record, col), col.type) for col in columns}
        for record in records
    ])
|
dcf/engine/runner.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import textwrap
|
|
4
|
+
import traceback
|
|
5
|
+
|
|
6
|
+
from ..config.models import Collector, PythonSource
|
|
7
|
+
from .iterator import build_request_sequence
|
|
8
|
+
from .fetcher import fetch_records
|
|
9
|
+
from .projector import project
|
|
10
|
+
from .. import writer as iceberg_writer
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def run_collector(
    collector: Collector,
    catalog: str = "local",
    limit: int | None = None,
    param_overrides: dict | None = None,
) -> None:
    """Run every request step of *collector* and write the results.

    For each step in the request sequence, records are fetched, projected
    through the source schema, and handed to the writer. Fetch errors are
    printed and counted, and the remaining steps still run. Errors raised by
    projection or writing propagate to the caller.

    Args:
        collector: The collector definition to execute.
        catalog: ``"gcp"`` skips Spark start-up; any other value obtains a
            Spark session.
        limit: When given, only the first *limit* request steps are run.
        param_overrides: Extra params merged last into the dict passed to
            Python sources. HTTP sources receive only the iterate values.
    """
    # GCS write path bypasses Spark entirely — skip JVM startup
    if catalog == "gcp":
        spark = None
    else:
        from dcf.spark_session import get_spark
        spark = get_spark("dcf")

    # try/finally: the Spark session must be stopped even when projection or
    # writing raises part-way through the run.
    try:
        param_defs = {p.name: p for p in collector.source.params}
        request_sequence = build_request_sequence(collector.cadence.iterate, param_defs)

        if limit is not None:
            request_sequence = request_sequence[:limit]

        # Static params declared in the YAML (value is set) flow through to Python sources
        static_params = {p.name: p.value for p in collector.source.params if p.value is not None}

        total = len(request_sequence)
        print(f"\n[dcf] Running '{collector.name}' — {total} requests\n")

        failed = 0

        for i, dynamic_params in enumerate(request_sequence, 1):
            label = " ".join(f"{k}={v}" for k, v in dynamic_params.items())
            print(f" [{i}/{total}] {label}")

            # Build full params: static defaults → iterate values → CLI overrides
            full_params = {**static_params, **dynamic_params, **(param_overrides or {})}

            # For http sources, iterate-driven params are already handled in the fetcher;
            # pass full_params only to python sources which need everything in one dict
            source_params = full_params if isinstance(collector.source, PythonSource) else dynamic_params

            try:
                records = fetch_records(collector.source, source_params)
            except Exception as e:
                # Best effort: log the failure and continue with the next step.
                failed += 1
                print(f" fetch error ({type(e).__name__}): {e}")
                print(textwrap.indent(traceback.format_exc(), " "))
                continue

            if not records:
                print(" 0 records — skipping")
                continue

            df = project(records, collector.source.schema_)
            print(f" {len(df)} rows → writing")

            iceberg_writer.write(spark, collector, df, catalog=catalog, dynamic_params=dynamic_params)

        # The destination path includes the namespace segment only when one is set.
        path_parts = [collector.namespace, collector.name] if collector.namespace else [collector.name]
        if catalog == "gcp":
            from .. import writer as _w
            bucket = _w.iceberg._gcs_warehouse_bucket()
            dest = f"gs://{bucket}/" + "/".join(path_parts) + "/data"
        else:
            from ..project import find_project_root
            dest = str(find_project_root().joinpath("warehouse", *path_parts, "data"))

        if failed == total:
            # Note: a run with zero request steps also lands here (0 == 0).
            print(f"\n[dcf] '{collector.name}' FAILED — all {total} iteration(s) errored → {dest}\n")
        elif failed:
            print(f"\n[dcf] '{collector.name}' complete with errors — {failed}/{total} iteration(s) failed → {dest}\n")
        else:
            print(f"\n[dcf] '{collector.name}' complete → {dest}\n")
    finally:
        if spark is not None:
            spark.stop()
|