data-collection-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data_collection_framework-0.1.0.dist-info/METADATA +19 -0
  2. data_collection_framework-0.1.0.dist-info/RECORD +44 -0
  3. data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
  4. data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
  6. dcf/__init__.py +4 -0
  7. dcf/cli.py +841 -0
  8. dcf/config/__init__.py +4 -0
  9. dcf/config/loader.py +77 -0
  10. dcf/config/models.py +240 -0
  11. dcf/engine/__init__.py +6 -0
  12. dcf/engine/fetcher.py +118 -0
  13. dcf/engine/iterator.py +96 -0
  14. dcf/engine/projector.py +56 -0
  15. dcf/engine/runner.py +90 -0
  16. dcf/engine/transforms.py +41 -0
  17. dcf/gcp/__init__.py +0 -0
  18. dcf/gcp/_collector_utils.py +87 -0
  19. dcf/gcp/auth.py +1 -0
  20. dcf/gcp/batch_deploy.py +548 -0
  21. dcf/gcp/bootstrap.py +131 -0
  22. dcf/gcp/gcloud.py +42 -0
  23. dcf/gcp/terraform.py +151 -0
  24. dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
  25. dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
  26. dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
  27. dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
  28. dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
  29. dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
  30. dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
  31. dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
  32. dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
  33. dcf/infra/modules/batch_collector/local/main.tf +32 -0
  34. dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
  35. dcf/infra/modules/batch_collector/local/variables.tf +25 -0
  36. dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
  37. dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
  38. dcf/infra/templates/docker-compose.yml.tftpl +76 -0
  39. dcf/local_deploy.py +756 -0
  40. dcf/project.py +23 -0
  41. dcf/spark_session.py +66 -0
  42. dcf/warehouse_reader.py +323 -0
  43. dcf/writer/__init__.py +3 -0
  44. dcf/writer/iceberg.py +315 -0
dcf/config/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .loader import load_collector, load_all_collectors
2
+ from .models import Collector
3
+
4
+ __all__ = ["load_collector", "load_all_collectors", "Collector"]
dcf/config/loader.py ADDED
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+ import yaml
7
+
8
+ from .models import Collector
9
+
10
+
11
def _project_config() -> dict:
    """Return the parsed ``project.yml`` from the project root, or ``{}``.

    An empty dict is returned when ``find_project_root`` raises
    ``RuntimeError``, when the file does not exist, or when the file parses
    to a falsy value.
    """
    from ..project import find_project_root

    try:
        config_file = find_project_root() / "project.yml"
    except RuntimeError:
        return {}

    if not config_file.exists():
        return {}
    parsed = yaml.safe_load(config_file.read_text())
    return parsed or {}
21
+
22
+
23
+ def _resolve_env(value: str, project_cfg: dict) -> str:
24
+ """Replace {{ env.VAR }} placeholders.
25
+
26
+ Resolution order:
27
+ 1. OS environment variable
28
+ 2. project.yml key (VAR lowercased, e.g. PORTLANDMAPS_API_KEY → portlandmaps_api_key)
29
+ """
30
+ import re
31
+ def replacer(match):
32
+ var = match.group(1).strip()
33
+ resolved = os.environ.get(var)
34
+ if resolved is None:
35
+ resolved = project_cfg.get(var.lower())
36
+ if not resolved:
37
+ raise EnvironmentError(
38
+ f"'{var}' is not set — add it as an environment variable "
39
+ f"or set '{var.lower()}' in project.yml"
40
+ )
41
+ return resolved
42
+ return re.sub(r"\{\{\s*env\.(\w+)\s*\}\}", replacer, value)
43
+
44
+
45
+ def _resolve_env_in(obj, project_cfg: dict):
46
+ if isinstance(obj, dict):
47
+ return {k: _resolve_env_in(v, project_cfg) for k, v in obj.items()}
48
+ if isinstance(obj, list):
49
+ return [_resolve_env_in(v, project_cfg) for v in obj]
50
+ if isinstance(obj, str):
51
+ return _resolve_env(obj, project_cfg)
52
+ return obj
53
+
54
+
55
def load_collector(path: Path, resolve_env: bool = True) -> Collector:
    """Parse one collector YAML file into a ``Collector``.

    When *resolve_env* is true, ``{{ env.VAR }}`` placeholders are substituted
    from the environment / project.yml; otherwise they are swapped for a
    placeholder string so the file can still be validated structurally.
    """
    document = yaml.safe_load(path.read_text())
    prepared = (
        _resolve_env_in(document, _project_config())
        if resolve_env
        else _strip_env_placeholders(document)
    )
    return Collector.from_dict(prepared)
62
+
63
+
64
+ def _strip_env_placeholders(obj):
65
+ """Replace {{ env.VAR }} with a placeholder string for structural validation."""
66
+ import re
67
+ if isinstance(obj, dict):
68
+ return {k: _strip_env_placeholders(v) for k, v in obj.items()}
69
+ if isinstance(obj, list):
70
+ return [_strip_env_placeholders(v) for v in obj]
71
+ if isinstance(obj, str):
72
+ return re.sub(r"\{\{\s*env\.\w+\s*\}\}", "<env>", obj)
73
+ return obj
74
+
75
+
76
def load_all_collectors(collectors_dir: Path, resolve_env: bool = True) -> list[Collector]:
    """Load every ``*.yml`` file directly inside *collectors_dir*, in sorted path order."""
    collectors = []
    for yml_path in sorted(collectors_dir.glob("*.yml")):
        collectors.append(load_collector(yml_path, resolve_env=resolve_env))
    return collectors
dcf/config/models.py ADDED
@@ -0,0 +1,240 @@
1
+ from __future__ import annotations
2
+
3
+ import re as _re
4
+ from typing import Any, Literal, Annotated, Union
5
+ from pydantic import BaseModel, Field, model_validator
6
+
7
+ _CRON_FIELD_RE = _re.compile(r'^(\*|[0-9,\-\*/]+)$')
8
+
9
+
10
+ # ------------------------------------------------------------------ #
11
+ # Source — params #
12
+ # ------------------------------------------------------------------ #
13
+
14
class Param(BaseModel):
    """A request parameter declared on a source."""

    name: str
    type: Literal["string", "integer", "float", "date", "boolean"]
    # Pattern used when serializing a date into the URL, e.g. "%m/%d/%Y".
    format: str | None = None
    # Present → static param; absent → the param must be covered by `iterate`.
    value: Any | None = None
19
+
20
+
21
class Auth(BaseModel):
    """Credential settings for a source.

    ``key`` is the query-parameter or header name and is mandatory for the
    ``query_param`` and ``header`` types; ``bearer`` does not use it.
    ``value`` supports ``{{ env.VAR }}`` placeholders.
    """

    type: Literal["query_param", "header", "bearer"]
    key: str | None = None
    value: str

    @model_validator(mode="after")
    def key_required_for_non_bearer(self) -> "Auth":
        """Reject keyed auth types that were declared without a key."""
        needs_key = self.type in ("query_param", "header")
        if needs_key and not self.key:
            raise ValueError(f"auth.key is required when type is '{self.type}'")
        return self
31
+
32
+
33
class RateLimit(BaseModel):
    """Request budget: ``requests`` calls per window of ``per_minutes`` minutes."""

    requests: int
    per_minutes: float
36
+
37
+
38
class Response(BaseModel):
    """How a response body is decoded."""

    format: Literal["json", "csv"]
    # Key (or dot-path) inside the JSON document that holds the records array.
    records_path: str | None = None
41
+
42
+
43
+ # ------------------------------------------------------------------ #
44
+ # Source — iteration #
45
+ # ------------------------------------------------------------------ #
46
+
47
class DateRangeIterate(BaseModel):
    """Iteration axis that walks a date range in fixed steps."""

    type: Literal["date_range"]
    # One or two param names that receive the window start / end.
    params: list[str]
    # ISO date or the literal "today".
    start: str
    # ISO date or the literal "today".
    end: str
    # Distance between consecutive windows, e.g. "1 day", "7 days".
    step: str
    # Window length; defaults to `step` when absent.
    window: str | None = None
54
+
55
+
56
class CategoricalIterate(BaseModel):
    """Iteration axis that assigns each listed value to a single param in turn."""

    type: Literal["categorical"]
    param: str
    values: list[Any]
60
+
61
+
62
+ IterateSpec = DateRangeIterate | CategoricalIterate
63
+
64
+
65
+ def _validate_dynamic_params(params: list[Param], iterate: list[IterateSpec]) -> None:
66
+ dynamic = {p.name for p in params if p.value is None}
67
+ covered: set[str] = set()
68
+ for it in iterate:
69
+ if isinstance(it, DateRangeIterate):
70
+ covered.update(it.params)
71
+ elif isinstance(it, CategoricalIterate):
72
+ covered.add(it.param)
73
+ missing = dynamic - covered
74
+ if missing:
75
+ raise ValueError(f"Params declared without a value or iterator: {missing}")
76
+
77
+
78
+ # ------------------------------------------------------------------ #
79
+ # Schema #
80
+ # ------------------------------------------------------------------ #
81
+
82
class CrsReprojectTransform(BaseModel):
    """Transform spec selected by ``type: crs_reproject``.

    Names the source columns, the source and target CRS, and which
    component (``x`` or ``y``) the column receives.
    """

    type: Literal["crs_reproject"]
    from_columns: list[str]
    from_crs: str
    to_crs: str
    component: Literal["x", "y"]
88
+
89
+
90
class ArrayJoinTransform(BaseModel):
    """Transform spec selected by ``type: array_join``."""

    type: Literal["array_join"]
    # Dot-notation path to the array field in the raw record.
    path: str
    # Delimiter used to join the elements.
    separator: str = ","
94
+
95
+
96
+ Transform = Annotated[
97
+ Union[CrsReprojectTransform, ArrayJoinTransform],
98
+ Field(discriminator="type"),
99
+ ]
100
+
101
+
102
class Column(BaseModel):
    """One output column; it must name a source ``path`` or a ``transform``."""

    name: str
    # Key in the raw record (dot-notation for nested values).
    path: str | None = None
    type: Literal["string", "integer", "float", "date", "timestamp", "boolean"] | None = None
    transform: Transform | None = None

    @model_validator(mode="after")
    def has_source(self) -> Column:
        """Reject columns that declare neither ``path`` nor ``transform``."""
        if self.path is not None or self.transform is not None:
            return self
        raise ValueError(f"Column '{self.name}' must have either 'path' or 'transform'")
113
+
114
+
115
class Schema(BaseModel):
    """Ordered list of the columns a source declares."""

    columns: list[Column]
117
+
118
+
119
+ # ------------------------------------------------------------------ #
120
+ # Source types #
121
+ # ------------------------------------------------------------------ #
122
+
123
class HttpSource(BaseModel):
    """Source selected by ``type: http``.

    The schema is stored on ``schema_`` and populated from the ``schema`` key
    (either name is accepted because ``populate_by_name`` is enabled).
    """

    model_config = {"populate_by_name": True}

    type: Literal["http"]
    url: str
    method: Literal["GET", "POST"] = "GET"
    auth: Auth | None = None
    params: list[Param] = []
    response: Response = Response(format="json")
    rate_limit: RateLimit | None = None
    schema_: Schema | None = Field(default=None, alias="schema")
133
+
134
+
135
class PythonSource(BaseModel):
    """Source backed by a Python function returning ``list[dict]``.

    The function is responsible for its own pagination.
    """

    model_config = {"populate_by_name": True}

    type: Literal["python"]
    # Importable module path, e.g. "connectors.craigslist_apts".
    module: str
    # Function name; called as fn(dynamic_params) -> list[dict].
    function: str
    params: list[Param] = []
    schema_: Schema | None = Field(default=None, alias="schema")
143
+
144
+
145
class PubSubSource(BaseModel):
    """Source that continuously reads JSON messages from a GCP Pub/Sub subscription."""

    model_config = {"populate_by_name": True}

    type: Literal["pubsub"]
    # Full resource path: projects/<project>/subscriptions/<name>.
    subscription: str
    schema_: Schema | None = Field(default=None, alias="schema")
151
+
152
+
153
+ Source = Annotated[Union[HttpSource, PythonSource, PubSubSource], Field(discriminator="type")]
154
+
155
+
156
+ # ------------------------------------------------------------------ #
157
+ # Cadence #
158
+ # ------------------------------------------------------------------ #
159
+
160
class StagingConfig(BaseModel):
    """Staging-table layout for a cadence."""

    # Iterate param whose values split the data into separate staging tables.
    partition_param: str
    # Table name template, e.g. "permits_{date_type}_loader_staging".
    table_pattern: str
163
+
164
+
165
class MergeDedup(BaseModel):
    """Dedup rule for a merge: a strategy ``type`` and the columns it applies to."""

    type: Literal["latest_non_null"]
    columns: list[str]
168
+
169
+
170
class MergeConfig(BaseModel):
    """Merge settings: a table, a key, and an optional dedup rule."""

    table: str
    key: str
    dedup: MergeDedup | None = None
173
+ dedup: MergeDedup | None = None
174
+
175
+
176
class Cadence(BaseModel):
    """Iteration axes and load strategy for a collector.

    ``strategy`` is the only required field; ``iterate`` defaults to an
    empty list, and the remaining settings are optional.
    """

    iterate: list[IterateSpec] = []
    strategy: Literal["incremental", "append", "full_refresh"]
    primary_key: str | None = None
    staging: StagingConfig | None = None
    merge: MergeConfig | None = None
182
+
183
+
184
+ # ------------------------------------------------------------------ #
185
+ # Deployment #
186
+ # ------------------------------------------------------------------ #
187
+
188
class Deployment(BaseModel):
    """Deployment settings; batch deployments must carry a 5-field cron schedule."""

    type: Literal["batch", "streaming"] = "batch"
    # batch fields
    schedule: str | None = None
    paused: bool = False
    # streaming fields
    window_seconds: int = 60

    @model_validator(mode="after")
    def validate_deployment(self) -> "Deployment":
        """Require and sanity-check ``schedule`` when ``type`` is ``batch``."""
        if self.type != "batch":
            return self

        if not self.schedule:
            raise ValueError(
                "deployment.schedule is required for batch deployments "
                "(e.g. schedule: \"0 8 * * *\")"
            )

        cron_fields = self.schedule.strip().split()
        well_formed = len(cron_fields) == 5 and all(
            _CRON_FIELD_RE.match(field) for field in cron_fields
        )
        if not well_formed:
            raise ValueError(
                f"deployment.schedule '{self.schedule}' is not a valid cron expression. "
                "Expected 5 space-separated fields: minute hour day-of-month month day-of-week "
                "(e.g. '0 8 * * *' for daily at 8 AM UTC)"
            )
        return self
212
+
213
+
214
+ # ------------------------------------------------------------------ #
215
+ # Collector (top-level) #
216
+ # ------------------------------------------------------------------ #
217
+
218
class Collector(BaseModel):
    """Top-level collector definition: a source, a cadence, and an optional deployment.

    Validation additionally requires that, for HTTP and Python sources, every
    param declared without a value is covered by one of ``cadence.iterate``.
    """

    name: str
    namespace: str | None = None  # warehouse namespace; defaults to collector name when absent
    description: str | None = None
    source: Source
    cadence: Cadence
    deployment: Deployment | None = None

    model_config = {"populate_by_name": True}

    # NOTE: no `model_fields_set` override here. On BaseModel it is an
    # instance property; redefining it as a classmethod shadowed that property
    # (instances got a bound method instead of a set) and the override itself
    # could only raise, because it tried to call the property object.

    @model_validator(mode="after")
    def all_dynamic_params_have_iterators(self) -> "Collector":
        """Fail validation when a value-less param has no iterator driving it."""
        if isinstance(self.source, (HttpSource, PythonSource)):
            _validate_dynamic_params(self.source.params, self.cadence.iterate)
        return self

    @classmethod
    def from_dict(cls, data: dict) -> Collector:
        """Validate *data* and build a ``Collector`` from it."""
        return cls.model_validate(data)
dcf/engine/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .runner import run_collector
2
+ from .iterator import build_request_sequence
3
+ from .fetcher import fetch_records
4
+ from .projector import project
5
+
6
+ __all__ = ["run_collector", "build_request_sequence", "fetch_records", "project"]
dcf/engine/fetcher.py ADDED
@@ -0,0 +1,118 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import sys
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import pandas as pd
10
+ import requests
11
+
12
+ from ..config.models import HttpSource, PythonSource, Param
13
+
14
+
15
+ def _resolve_static_params(params: list[Param]) -> dict[str, Any]:
16
+ return {p.name: p.value for p in params if p.value is not None}
17
+
18
+
19
+ def _rate_limit_sleep(rate_limit) -> None:
20
+ if rate_limit is None:
21
+ return
22
+ sleep_secs = (rate_limit.per_minutes * 60) / rate_limit.requests
23
+ time.sleep(sleep_secs)
24
+
25
+
26
+ def _get_nested(record: dict, path: str) -> Any:
27
+ """Resolve a dot-notation path into a nested dict."""
28
+ parts = path.split(".")
29
+ val = record
30
+ for part in parts:
31
+ if not isinstance(val, dict):
32
+ return None
33
+ val = val.get(part)
34
+ return val
35
+
36
+
37
+ def _parse_response(response: requests.Response, source: HttpSource) -> list[dict]:
38
+ fmt = source.response.format
39
+
40
+ if fmt == "csv":
41
+ try:
42
+ df = pd.read_csv(io.StringIO(response.text))
43
+ except pd.errors.EmptyDataError:
44
+ return []
45
+ return df.to_dict(orient="records")
46
+
47
+ if fmt == "json":
48
+ data = response.json()
49
+ if source.response.records_path:
50
+ for key in source.response.records_path.split("."):
51
+ if not isinstance(data, dict):
52
+ raise ValueError(
53
+ f"records_path '{source.response.records_path}' could not be followed: "
54
+ f"expected a JSON object at key '{key}' but found {type(data).__name__}. "
55
+ f"If the response is a top-level array, omit records_path entirely."
56
+ )
57
+ data = data.get(key, [])
58
+ if isinstance(data, list):
59
+ return data
60
+ return [data]
61
+
62
+ raise ValueError(f"Unsupported response format: '{fmt}'")
63
+
64
+
65
def _fetch_http(source: HttpSource, dynamic_params: dict[str, Any]) -> list[dict]:
    """Issue one HTTP request for *source* and return its parsed records.

    Static params are overlaid with *dynamic_params*, auth is attached as a
    query param or header depending on its type, and the rate-limit sleep
    happens before the request is sent.

    Raises:
        requests.HTTPError: On a failing status, with the URL and (for
            401/403/404/429) a troubleshooting hint in the message.
    """
    query = _resolve_static_params(source.params)
    query.update(dynamic_params)

    headers = {}
    auth = source.auth
    if auth:
        if auth.type == "query_param":
            query[auth.key] = auth.value
        elif auth.type == "header":
            headers[auth.key] = auth.value
        elif auth.type == "bearer":
            headers["Authorization"] = f"Bearer {auth.value}"

    _rate_limit_sleep(source.rate_limit)

    response = requests.request(
        method=source.method,
        url=source.url,
        params=query,
        headers=headers,
        timeout=60,
    )
    try:
        response.raise_for_status()
    except requests.HTTPError:
        status = response.status_code
        hints = {
            401: "Check that your API token or key is correct and has not expired.",
            403: "Your credentials may lack the required permissions for this endpoint.",
            404: "The URL may be wrong, or this resource does not exist.",
            429: "Rate limit exceeded. Add a rate_limit block to your collector YAML to slow down requests.",
        }
        msg = f"HTTP {status} from {source.url}"
        if status in hints:
            msg += f" — {hints[status]}"
        raise requests.HTTPError(msg, response=response)
    return _parse_response(response, source)
102
+
103
+
104
def _fetch_python(source: PythonSource, dynamic_params: dict[str, Any]) -> list[dict]:
    """Import the source's module and call its function with *dynamic_params*.

    The project root is pushed to the front of ``sys.path`` (if not already
    present) so that project-local modules are importable.
    """
    import importlib

    from ..project import find_project_root

    root_dir = str(find_project_root())
    if root_dir not in sys.path:
        sys.path.insert(0, root_dir)

    connector = getattr(importlib.import_module(source.module), source.function)
    return connector(dynamic_params)
113
+
114
+
115
def fetch_records(source, dynamic_params: dict[str, Any]) -> list[dict]:
    """Dispatch to the Python fetcher for ``PythonSource``, else to the HTTP fetcher."""
    handler = _fetch_python if isinstance(source, PythonSource) else _fetch_http
    return handler(source, dynamic_params)
dcf/engine/iterator.py ADDED
@@ -0,0 +1,96 @@
1
+ from __future__ import annotations
2
+
3
+ import itertools
4
+ from datetime import date, timedelta
5
+ from typing import Any
6
+
7
+ from ..config.models import DateRangeIterate, CategoricalIterate, IterateSpec, Param
8
+
9
+
10
+ def _parse_duration(s: str) -> timedelta:
11
+ """Parse simple duration strings like '1 day', '7 days', '2 weeks'."""
12
+ parts = s.strip().split()
13
+ if len(parts) != 2:
14
+ raise ValueError(f"Cannot parse duration: '{s}'. Expected '<n> <unit>'")
15
+ n = int(parts[0])
16
+ unit = parts[1].rstrip("s") # normalize "days" → "day"
17
+ if unit == "day":
18
+ return timedelta(days=n)
19
+ if unit == "week":
20
+ return timedelta(weeks=n)
21
+ if unit == "month":
22
+ return timedelta(days=30 * n)
23
+ raise ValueError(f"Unknown duration unit: '{unit}'")
24
+
25
+
26
+ def _resolve_date(value: str) -> date:
27
+ if value == "today":
28
+ return date.today()
29
+ return date.fromisoformat(value)
30
+
31
+
32
+ def _format_date(d: date, fmt: str | None, param_defs: dict[str, Param]) -> str:
33
+ """Apply the param's declared format, falling back to ISO."""
34
+ if fmt:
35
+ return d.strftime(fmt)
36
+ return d.isoformat()
37
+
38
+
39
def _date_range_steps(spec: DateRangeIterate, param_defs: dict[str, Param]) -> list[dict[str, Any]]:
    """Build one dict per step of a date-range iteration.

    Each dict maps param names to their formatted values. With a single
    entry in ``spec.params`` it receives the window start; otherwise the
    first entry receives the window start and the second the window end
    (clamped to the overall end date).

    Args:
        spec: The date-range axis (start, end, step, optional window).
        param_defs: Param definitions by name; a param's ``format`` is used
            to render its dates, falling back to ISO.

    Raises:
        ValueError: If ``spec.step`` is not a positive duration (or cannot be
            parsed).
    """
    step = _parse_duration(spec.step)
    # A zero or negative step would never move window_start past `end`,
    # so the loop below would run (and grow `steps`) forever.
    if step <= timedelta(0):
        raise ValueError(f"date_range step must be a positive duration, got '{spec.step}'")
    window = _parse_duration(spec.window) if spec.window else step

    start = _resolve_date(spec.start)
    end = _resolve_date(spec.end)

    def fmt(d: date, param_name: str) -> str:
        p = param_defs.get(param_name)
        return _format_date(d, p.format if p else None, param_defs)

    steps = []
    window_start = start
    while window_start <= end:
        # Windows are inclusive on both ends, hence the one-day subtraction.
        window_end = min(window_start + window - timedelta(days=1), end)

        if len(spec.params) == 1:
            steps.append({spec.params[0]: fmt(window_start, spec.params[0])})
        else:
            steps.append({
                spec.params[0]: fmt(window_start, spec.params[0]),
                spec.params[1]: fmt(window_end, spec.params[1]),
            })
        window_start += step
    return steps
69
+
70
+
71
+ def _categorical_steps(spec: CategoricalIterate) -> list[dict[str, Any]]:
72
+ return [{spec.param: v} for v in spec.values]
73
+
74
+
75
def build_request_sequence(
    iterate: list[IterateSpec],
    param_defs: dict[str, Param],
) -> list[dict[str, Any]]:
    """Expand the iteration axes into the full list of per-request param dicts.

    The result is the cartesian product of all axes; each element maps
    param name → value for one request. With no axes, a single empty dict
    is returned.
    """
    if not iterate:
        return [{}]

    axes: list[list[dict[str, Any]]] = []
    for spec in iterate:
        if isinstance(spec, DateRangeIterate):
            axes.append(_date_range_steps(spec, param_defs))
        elif isinstance(spec, CategoricalIterate):
            axes.append(_categorical_steps(spec))

    requests_out: list[dict[str, Any]] = []
    for combo in itertools.product(*axes):
        merged: dict[str, Any] = {}
        for part in combo:
            merged.update(part)
        requests_out.append(merged)
    return requests_out
dcf/engine/projector.py ADDED
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ import pandas as pd
6
+
7
+ from ..config.models import Schema, Column
8
+ from .transforms import apply_transform
9
+ from ..engine.fetcher import _get_nested
10
+
11
+
12
+ _CAST = {
13
+ "string": str,
14
+ "integer": lambda v: int(float(v)) if v is not None else None,
15
+ "float": lambda v: float(v) if v is not None else None,
16
+ "boolean": lambda v: bool(v) if v is not None else None,
17
+ "date": lambda v: pd.to_datetime(v, errors="coerce"),
18
+ "timestamp": lambda v: pd.to_datetime(v, errors="coerce"),
19
+ }
20
+
21
+
22
def _cast(value: Any, col_type: str | None) -> Any:
    """Convert *value* to *col_type*, yielding ``None`` when it cannot be cast.

    Values are returned untouched when no type is declared or the value is
    already ``None``.
    """
    if col_type is None or value is None:
        return value
    try:
        return _CAST[col_type](value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers e.g. an integer cast of float infinity, which
        # would otherwise abort the run over one bad cell.
        return None
29
+
30
+
31
def _extract(record: dict, col: Column) -> Any:
    """Get a column's raw value: via its transform when set, else via its path."""
    if col.transform is None:
        return _get_nested(record, col.path)
    return apply_transform(col.transform, record)
35
+
36
+
37
def project(records: list[dict], schema: Schema | None) -> pd.DataFrame:
    """Shape raw records into a DataFrame according to *schema*.

    Without a schema every field is kept as-is. With a schema only the
    declared columns are produced (extracted or transformed, then cast);
    an empty record list yields an empty frame with the declared columns.
    """
    if schema is None:
        return pd.DataFrame(records)

    columns = schema.columns
    if not records:
        return pd.DataFrame(columns=[col.name for col in columns])

    return pd.DataFrame([
        {col.name: _cast(_extract(record, col), col.type) for col in columns}
        for record in records
    ])
dcf/engine/runner.py ADDED
@@ -0,0 +1,90 @@
1
+ from __future__ import annotations
2
+
3
+ import textwrap
4
+ import traceback
5
+
6
+ from ..config.models import Collector, PythonSource
7
+ from .iterator import build_request_sequence
8
+ from .fetcher import fetch_records
9
+ from .projector import project
10
+ from .. import writer as iceberg_writer
11
+
12
+
13
def run_collector(
    collector: Collector,
    catalog: str = "local",
    limit: int | None = None,
    param_overrides: dict | None = None,
) -> None:
    """Fetch, project, and write every request in a collector's iteration plan.

    Args:
        collector: Parsed collector definition.
        catalog: ``"gcp"`` skips Spark entirely; any other value starts a
            Spark session for the write path.
        limit: When given, only the first ``limit`` requests are run.
        param_overrides: Extra params layered over static and iterate values.
            They are only forwarded to Python sources.

    Fetch failures are printed and counted without aborting the run. Errors
    raised while projecting or writing propagate to the caller; the Spark
    session is stopped either way.
    """
    # GCS write path bypasses Spark entirely — skip JVM startup
    if catalog == "gcp":
        spark = None
    else:
        from dcf.spark_session import get_spark
        spark = get_spark("dcf")

    try:
        param_defs = {p.name: p for p in collector.source.params}
        request_sequence = build_request_sequence(collector.cadence.iterate, param_defs)

        if limit is not None:
            request_sequence = request_sequence[:limit]

        # Static params declared in the YAML (value is set) flow through to Python sources
        static_params = {p.name: p.value for p in collector.source.params if p.value is not None}

        print(f"\n[dcf] Running '{collector.name}' — {len(request_sequence)} requests\n")

        failed = 0

        for i, dynamic_params in enumerate(request_sequence, 1):
            label = " ".join(f"{k}={v}" for k, v in dynamic_params.items())
            print(f" [{i}/{len(request_sequence)}] {label}")

            # Build full params: static defaults → iterate values → CLI overrides
            full_params = {**static_params, **dynamic_params, **(param_overrides or {})}

            # For http sources, iterate-driven params are already handled in the fetcher;
            # pass full_params only to python sources which need everything in one dict
            source_params = full_params if isinstance(collector.source, PythonSource) else dynamic_params

            try:
                records = fetch_records(collector.source, source_params)
            except Exception as e:
                # Best-effort: one failing request must not stop the remaining ones.
                failed += 1
                print(f" fetch error ({type(e).__name__}): {e}")
                print(textwrap.indent(traceback.format_exc(), " "))
                continue

            if not records:
                print(" 0 records — skipping")
                continue

            df = project(records, collector.source.schema_)
            print(f" {len(df)} rows → writing")

            iceberg_writer.write(spark, collector, df, catalog=catalog, dynamic_params=dynamic_params)

        if catalog == "gcp":
            from .. import writer as _w
            bucket = _w.iceberg._gcs_warehouse_bucket()
            if collector.namespace:
                dest = f"gs://{bucket}/{collector.namespace}/{collector.name}/data"
            else:
                dest = f"gs://{bucket}/{collector.name}/data"
        else:
            from ..project import find_project_root
            if collector.namespace:
                dest = str(find_project_root() / "warehouse" / collector.namespace / collector.name / "data")
            else:
                dest = str(find_project_root() / "warehouse" / collector.name / "data")

        total = len(request_sequence)
        # `total and ...` keeps an empty plan (0 requests, 0 failures) from
        # being reported as a total failure.
        if total and failed == total:
            print(f"\n[dcf] '{collector.name}' FAILED — all {total} iteration(s) errored → {dest}\n")
        elif failed:
            print(f"\n[dcf] '{collector.name}' complete with errors — {failed}/{total} iteration(s) failed → {dest}\n")
        else:
            print(f"\n[dcf] '{collector.name}' complete → {dest}\n")
    finally:
        # Release the Spark session even when projecting or writing raises.
        if spark is not None:
            spark.stop()