jerry-thomas 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/cli/app.py +5 -9
- datapipeline/cli/commands/contract.py +8 -2
- datapipeline/cli/commands/source.py +5 -0
- datapipeline/cli/visuals/common.py +57 -5
- datapipeline/cli/visuals/labels.py +8 -41
- datapipeline/cli/visuals/sources_rich.py +8 -3
- datapipeline/cli/workspace_utils.py +25 -0
- datapipeline/config/dataset/dataset.py +1 -1
- datapipeline/config/dataset/normalize.py +9 -4
- datapipeline/config/workspace.py +15 -0
- datapipeline/services/scaffold/source.py +2 -1
- datapipeline/sources/foreach.py +151 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +1 -1
- {jerry_thomas-1.0.2.dist-info → jerry_thomas-1.0.3.dist-info}/METADATA +290 -288
- {jerry_thomas-1.0.2.dist-info → jerry_thomas-1.0.3.dist-info}/RECORD +19 -20
- {jerry_thomas-1.0.2.dist-info → jerry_thomas-1.0.3.dist-info}/entry_points.txt +1 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +0 -31
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +0 -30
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +0 -12
- {jerry_thomas-1.0.2.dist-info → jerry_thomas-1.0.3.dist-info}/WHEEL +0 -0
- {jerry_thomas-1.0.2.dist-info → jerry_thomas-1.0.3.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-1.0.2.dist-info → jerry_thomas-1.0.3.dist-info}/top_level.txt +0 -0
datapipeline/cli/app.py
CHANGED
@@ -31,15 +31,9 @@ def _dataset_to_project_path(
     """Resolve a dataset selector (alias, folder, or file) into a project.yaml path."""
     # 1) Alias via jerry.yaml datasets (wins over local folders with same name)
     if workspace is not None:
-        base = workspace.root
-        candidate = Path(raw)
-        candidate = candidate if candidate.is_absolute() else (base / candidate)
-        if candidate.is_dir():
-            candidate = candidate / "project.yaml"
-        return str(candidate.resolve())
+        resolved = workspace.resolve_dataset_alias(dataset)
+        if resolved is not None:
+            return str(resolved)
 
     # 2) Direct file path
     path = Path(dataset)
@@ -640,6 +634,7 @@ def main() -> None:
         alias=getattr(args, "alias", None),
         identity=getattr(args, "identity", False),
         plugin_root=plugin_root,
+        workspace=workspace_context,
     )
     return
@@ -658,6 +653,7 @@ def main() -> None:
     handle_contract(
         plugin_root=plugin_root,
         use_identity=args.identity,
+        workspace=workspace_context,
     )
     return
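As a quick orientation, here is a standalone sketch (not the package's code) of the resolution order the hunk above implies: a jerry.yaml dataset alias wins, and anything else falls back to being treated as a path. The WorkspaceStub class and the "demo" alias are illustrative assumptions.

    from pathlib import Path

    class WorkspaceStub:
        """Stand-in for WorkspaceContext: aliases map straight to project.yaml paths."""
        def __init__(self, datasets: dict[str, Path]):
            self.datasets = datasets

        def resolve_dataset_alias(self, alias: str) -> Path | None:
            return self.datasets.get(alias)

    def dataset_to_project_path(dataset: str, workspace: WorkspaceStub | None) -> str:
        # 1) An alias defined under datasets: in jerry.yaml wins.
        if workspace is not None:
            resolved = workspace.resolve_dataset_alias(dataset)
            if resolved is not None:
                return str(resolved)
        # 2) Otherwise treat the selector as a file or folder path.
        path = Path(dataset)
        if path.is_dir():
            path = path / "project.yaml"
        return str(path.resolve())

    ws = WorkspaceStub({"demo": Path("datasets/demo/project.yaml").resolve()})
    print(dataset_to_project_path("demo", ws))   # alias hit: the mapped project.yaml
    print(dataset_to_project_path("other", ws))  # no alias: treated as a path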
datapipeline/cli/commands/contract.py
CHANGED
@@ -1,6 +1,8 @@
 import sys
 from pathlib import Path
 
+from datapipeline.config.workspace import WorkspaceContext
+from datapipeline.cli.workspace_utils import resolve_default_project_yaml
 from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
 from datapipeline.services.entrypoints import read_group_entries, inject_ep
 from datapipeline.services.constants import FILTERS_GROUP, MAPPERS_GROUP
@@ -31,8 +33,10 @@ def handle(
     *,
     plugin_root: Path | None = None,
     use_identity: bool = False,
+    workspace: WorkspaceContext | None = None,
 ) -> None:
     root_dir, name, pyproject = pkg_root(plugin_root)
+    default_project = resolve_default_project_yaml(workspace)
     # Select contract type: Ingest (source->stream) or Composed (streams->stream)
     print("Select contract type:", file=sys.stderr)
     print("  [1] Ingest (source → stream)", file=sys.stderr)
@@ -49,12 +53,13 @@ def handle(
             mapper_path=None,
             with_mapper_stub=True,
             plugin_root=plugin_root,
+            project_yaml=default_project,
         )
         return
 
     # Discover sources by scanning sources_dir YAMLs
     # Default to dataset-scoped project config
-    proj_path = resolve_project_yaml_path(root_dir)
+    proj_path = default_project or resolve_project_yaml_path(root_dir)
     # Ensure a minimal project scaffold so we can resolve dirs interactively
     ensure_project_scaffold(proj_path)
     sources_dir = resolve_sources_dir(proj_path)
@@ -187,6 +192,7 @@ def scaffold_conflux(
     mapper_path: str | None,
     with_mapper_stub: bool,
     plugin_root: Path | None,
+    project_yaml: Path | None,
 ) -> None:
     """Scaffold a composed (multi-input) contract and optional mapper stub.
 
@@ -195,7 +201,7 @@ def scaffold_conflux(
     """
     root_dir, name, _ = pkg_root(plugin_root)
     # Resolve default project path early for interactive selections
-    proj_path = resolve_project_yaml_path(root_dir)
+    proj_path = project_yaml or resolve_project_yaml_path(root_dir)
     ensure_project_scaffold(proj_path)
     # Defer target domain selection until after choosing inputs
datapipeline/cli/commands/source.py
CHANGED
@@ -1,5 +1,7 @@
 from pathlib import Path
 
+from datapipeline.config.workspace import WorkspaceContext
+from datapipeline.cli.workspace_utils import resolve_default_project_yaml
 from datapipeline.services.scaffold.source import create_source
 
 
@@ -13,6 +15,7 @@ def handle(
     identity: bool = False,
     alias: str | None = None,
     plugin_root: Path | None = None,
+    workspace: WorkspaceContext | None = None,
 ) -> None:
     if subcmd in {"create", "add"}:
         # Allow: positional provider dataset, --provider/--dataset, --alias, or provider as 'prov.ds'
@@ -43,6 +46,7 @@ def handle(
         if transport in {"fs", "http"} and not format:
             print("[error] --format is required for fs/http transports (fs: csv|json|json-lines|pickle, http: csv|json|json-lines)")
             raise SystemExit(2)
+        project_yaml = resolve_default_project_yaml(workspace)
         create_source(
             provider=provider,
             dataset=dataset,
@@ -50,4 +54,5 @@ def handle(
             format=format,
             root=plugin_root,
             identity=identity,
+            **({"project_yaml": project_yaml} if project_yaml is not None else {}),
         )
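The **({...} if project_yaml is not None else {}) spread above is just a way to pass the keyword argument only when a value exists, so the callee's own default applies otherwise. A minimal illustration with a made-up function:

    def create(name: str, project_yaml: str | None = "project.yaml") -> str:
        # Hypothetical callee: its default should win when the caller has nothing to pass.
        return f"{name} -> {project_yaml}"

    resolved = None
    print(create("demo", **({"project_yaml": resolved} if resolved is not None else {})))
    # demo -> project.yaml   (callee default used)

    resolved = "datasets/demo/project.yaml"
    print(create("demo", **({"project_yaml": resolved} if resolved is not None else {})))
    # demo -> datasets/demo/project.yaml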
datapipeline/cli/visuals/common.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Optional, Sequence
 
 from urllib.parse import urlparse
 from datapipeline.sources.transports import FsGlobTransport, FsFileTransport, HttpTransport
+from datapipeline.sources.foreach import ForeachLoader
 
 logger = logging.getLogger(__name__)
 
@@ -217,23 +218,74 @@ def current_transport_label(transport, *, glob_root: Optional[Path] = None) -> Optional[str]:
         current = getattr(transport, "current_path", None)
         if not current:
             return None
-        return relative_label(current, glob_root)
+        return f"\"{relative_label(current, glob_root)}\""
     if isinstance(transport, FsFileTransport):
         path = getattr(transport, "path", None)
         if not path:
             return None
         try:
+            name = Path(path).name or str(path)
+            return f"\"{name}\""
         except Exception:
+            return f"\"{path}\""
     if isinstance(transport, HttpTransport):
         url = getattr(transport, "url", None)
         if not url:
             return None
         try:
             parts = urlparse(url)
+            host = parts.netloc or "http"
+            return f"@{host}"
         except Exception:
             return None
     return None
+
+
+def current_loader_label(loader, transport, *, glob_root: Optional[Path] = None) -> Optional[str]:
+    """Return a human-friendly label for the loader's current unit of work."""
+    if isinstance(loader, ForeachLoader):
+        value = getattr(loader, "_current_value", None)
+        if value is None:
+            return None
+        idx = getattr(loader, "_current_index", None)
+        values = getattr(loader, "_values", None)
+        total = len(values) if isinstance(values, list) else None
+
+        item_label = f"\"{value}\""
+        status = None
+        if isinstance(idx, int) and isinstance(total, int) and total > 0:
+            status = f"({idx}/{total})"
+
+        def _with_item(action: str | None) -> str:
+            parts = []
+            if action:
+                parts.append(action)
+            parts.append(item_label)
+            if status:
+                parts.append(status)
+            return " ".join(parts)
+
+        spec = getattr(loader, "_loader_spec", None) or {}
+        entrypoint = spec.get("entrypoint", "") if isinstance(spec, dict) else ""
+        args = getattr(loader, "_current_args", None)
+        inner_transport = getattr(loader, "_current_transport", None)
+
+        if entrypoint == "core.io" and isinstance(args, dict):
+            t = args.get("transport")
+            if t == "http":
+                parts = urlparse(str(args.get("url", "")))
+                host = parts.netloc or "http"
+                return _with_item(f"Downloading @{host}")
+            if t == "fs":
+                inner_root = None
+                if isinstance(inner_transport, FsGlobTransport):
+                    inner_root = compute_glob_root(getattr(inner_transport, "files", []))
+                label = current_transport_label(inner_transport, glob_root=inner_root)
+                action = f"Loading {label}" if label else "Loading fs"
+                return _with_item(action)
+
+        if entrypoint:
+            return _with_item(f"via {entrypoint}")
+        return _with_item(None)
+
+    return current_transport_label(transport, glob_root=glob_root)
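Roughly, the foreach branch above composes an action, the quoted current item, and an (index/total) counter. The snippet below only mimics that shape with made-up values; it is not the library function:

    def compose_label(action: str | None, item: str, idx: int | None, total: int | None) -> str:
        # Mirrors the _with_item helper above: [action] "item" (idx/total)
        parts: list[str] = []
        if action:
            parts.append(action)
        parts.append(f'"{item}"')
        if isinstance(idx, int) and isinstance(total, int) and total > 0:
            parts.append(f"({idx}/{total})")
        return " ".join(parts)

    print(compose_label("Downloading @api.example.com", "AAPL", 2, 3))
    # Downloading @api.example.com "AAPL" (2/3)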
datapipeline/cli/visuals/labels.py
CHANGED
@@ -1,41 +1,9 @@
-from pathlib import Path
-from urllib.parse import urlparse
-
 from datapipeline.sources.models.loader import SyntheticLoader, BaseDataLoader
 from datapipeline.sources.data_loader import DataLoader
+from datapipeline.sources.foreach import ForeachLoader
 from datapipeline.sources.transports import FsFileTransport, FsGlobTransport, HttpTransport
 from datapipeline.sources.decoders import CsvDecoder, JsonDecoder, JsonLinesDecoder, PickleDecoder
 
-MAX_LABEL_LEN = 48
-GLOB_SEGMENTS = 3
-
-
-def _truncate_middle(text: str, max_len: int) -> str:
-    if len(text) <= max_len:
-        return text
-    if max_len <= 3:
-        return text[:max_len]
-    keep = max_len - 3
-    head = (keep + 1) // 2
-    tail = keep - head
-    suffix = text[-tail:] if tail > 0 else ""
-    return f"{text[:head]}...{suffix}"
-
-
-def _compact_path_label(name: str) -> str:
-    if not name:
-        return "fs"
-    normalized = name.replace("\\", "/").strip()
-    if not normalized:
-        return "fs"
-    parts = [part for part in normalized.split("/") if part]
-    if not parts:
-        return normalized
-    if len(parts) > GLOB_SEGMENTS:
-        parts = parts[-GLOB_SEGMENTS:]
-    label = "/".join(parts)
-    return _truncate_middle(label, MAX_LABEL_LEN)
-
 
 def unit_for_loader(loader) -> str:
     if isinstance(loader, SyntheticLoader):
@@ -56,18 +24,17 @@ def build_source_label(loader: BaseDataLoader) -> str:
         except Exception:
             gen_name = loader.__class__.__name__
         return "Generating data with " + gen_name
+    if isinstance(loader, ForeachLoader):
+        key = str(getattr(loader, "_key", "item"))
+        values = getattr(loader, "_values", None)
+        n = len(values) if isinstance(values, list) else "?"
+        return f"Fan-out {key}×{n}:"
     if isinstance(loader, DataLoader):
         transport = getattr(loader, "transport", None)
         if isinstance(transport, (FsFileTransport, FsGlobTransport)):
-            if isinstance(transport, FsFileTransport) and name and "*" not in name:
-                label = Path(name).name or "fs"
-            else:
-                label = _compact_path_label(name)
-            return f"Loading data from: {label}"
+            return "Loading"
         if isinstance(transport, HttpTransport):
-            return f"Downloading data from: @{host}"
+            return "Downloading"
     return loader.__class__.__name__
datapipeline/cli/visuals/sources_rich.py
CHANGED
@@ -24,13 +24,14 @@ from rich.text import Text
 from .labels import progress_meta_for_loader
 from .common import (
     compute_glob_root,
+    current_loader_label,
     log_combined_stream,
     transport_debug_lines,
     transport_info_lines,
 )
 from datapipeline.runtime import Runtime
 from datapipeline.sources.models.source import Source
+from datapipeline.sources.foreach import ForeachLoader
 from datapipeline.sources.transports import FsGlobTransport, FsFileTransport, HttpTransport
 logger = logging.getLogger(__name__)
 
@@ -144,8 +145,12 @@ class _RichSourceProxy(Source):
             glob_root = compute_glob_root(
                 getattr(transport, "files", []))
 
+        is_foreach_loader = isinstance(loader, ForeachLoader)
+
         def compose_text(name: Optional[str]) -> str:
             if name:
+                if is_foreach_loader:
+                    return str(name)
                 base = header if sep else desc
                 return f"{base} {name}".rstrip()
             if tail:
@@ -173,8 +178,8 @@ class _RichSourceProxy(Source):
 
         try:
             for item in self._inner.stream():
+                current_label = current_loader_label(
+                    loader, transport, glob_root=glob_root
                 )
                 # On first item: emit Start + transport details
                 if not started_logged:
datapipeline/cli/workspace_utils.py
ADDED
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from datapipeline.config.workspace import WorkspaceContext
+
+
+def resolve_default_project_yaml(workspace: WorkspaceContext | None) -> Path | None:
+    """Resolve default_dataset from jerry.yaml into a project.yaml path.
+
+    Returns None when no workspace context or no default_dataset is configured.
+    Raises SystemExit when default_dataset is set but missing from datasets:.
+    """
+    if workspace is None:
+        return None
+    alias = workspace.config.default_dataset
+    if not alias:
+        return None
+    resolved = workspace.resolve_dataset_alias(alias)
+    if resolved is None:
+        raise SystemExit(
+            f"Unknown default_dataset '{alias}'. Define it under datasets: in jerry.yaml."
+        )
+    return resolved
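A small sketch of the three outcomes of resolve_default_project_yaml, using stub objects instead of the real WorkspaceContext; the attribute names mirror the function above, and the jerry.yaml values are invented:

    from pathlib import Path
    from types import SimpleNamespace

    class WorkspaceStub:
        def __init__(self, default_dataset: str | None, datasets: dict[str, str]):
            self.config = SimpleNamespace(default_dataset=default_dataset, datasets=datasets)
            self.root = Path.cwd()

        def resolve_dataset_alias(self, alias: str) -> Path | None:
            raw = self.config.datasets.get(alias)
            return (self.root / raw).resolve() if raw else None

    def resolve_default_project_yaml(workspace) -> Path | None:
        # Local re-statement of the function above, so the sketch runs standalone.
        if workspace is None:
            return None
        alias = workspace.config.default_dataset
        if not alias:
            return None
        resolved = workspace.resolve_dataset_alias(alias)
        if resolved is None:
            raise SystemExit(f"Unknown default_dataset '{alias}'. Define it under datasets: in jerry.yaml.")
        return resolved

    print(resolve_default_project_yaml(None))                                             # None
    print(resolve_default_project_yaml(WorkspaceStub("demo", {"demo": "datasets/demo"})))  # resolved path
    # resolve_default_project_yaml(WorkspaceStub("missing", {}))  # would raise SystemExit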
datapipeline/config/dataset/dataset.py
CHANGED
@@ -10,6 +10,6 @@ class RecordDatasetConfig(BaseModel):
 
 
 class FeatureDatasetConfig(BaseModel):
-    group_by: str = Field(..., pattern=r"^\d+(m|min|h)$")
+    group_by: str = Field(..., pattern=r"^\d+(m|min|h|d)$")
     features: List[FeatureRecordConfig] = Field(default_factory=list)
     targets: List[FeatureRecordConfig] = Field(default_factory=list)
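The widened pattern now accepts day cadences as well. A quick check of which group_by strings pass, using the same regex directly (no Pydantic needed):

    import re

    GROUP_BY_RE = re.compile(r"^\d+(m|min|h|d)$")

    for value in ["10m", "90min", "1h", "1d", "7d", "2w", "d", "1 d"]:
        print(value, bool(GROUP_BY_RE.fullmatch(value)))
    # 10m, 90min, 1h, 1d and 7d match; 2w, d and "1 d" do not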
datapipeline/config/dataset/normalize.py
CHANGED
@@ -1,14 +1,14 @@
-from datetime import datetime
+from datetime import datetime, timedelta
 import re
 
 
 def floor_time_to_bucket(ts: datetime, bucket: str) -> datetime:
     """Floor a timestamp to the nearest bucket boundary.
 
-    Supports patterns like '10m', '10min', '1h', '2h'.
+    Supports patterns like '10m', '10min', '1h', '2h', '1d'.
     Minutes may be specified as 'm' or 'min'.
     """
-    m = re.fullmatch(r"^(\d+)(m|min|h)$", bucket)
+    m = re.fullmatch(r"^(\d+)(m|min|h|d)$", bucket)
     if not m:
         raise ValueError(f"Unsupported cadence: {bucket}")
     n = int(m.group(1))
@@ -19,6 +19,11 @@ def floor_time_to_bucket(ts: datetime, bucket: str) -> datetime:
     if unit in ("m", "min"):
         floored_minute = (ts.minute // n) * n
         return ts.replace(minute=floored_minute, second=0, microsecond=0)
+    if unit == "h":
         floored_hour = (ts.hour // n) * n
         return ts.replace(hour=floored_hour, minute=0, second=0, microsecond=0)
+    base = ts.replace(hour=0, minute=0, second=0, microsecond=0)
+    if n == 1:
+        return base
+    remainder = (base.toordinal() - 1) % n
+    return base - timedelta(days=remainder)
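A worked example of the new day branch: the bucket start is found by flooring to midnight and then stepping back (toordinal() - 1) % n days, so multi-day buckets are anchored to day 1 of the proleptic Gregorian calendar. The same arithmetic with plain datetime:

    from datetime import datetime, timedelta

    def floor_days(ts: datetime, n: int) -> datetime:
        base = ts.replace(hour=0, minute=0, second=0, microsecond=0)
        if n == 1:
            return base
        remainder = (base.toordinal() - 1) % n
        return base - timedelta(days=remainder)

    ts = datetime(2024, 5, 17, 13, 45)
    print(floor_days(ts, 1))  # 2024-05-17 00:00:00
    print(floor_days(ts, 7))  # 2024-05-13 00:00:00 (7-day bucket anchored to ordinal day 1)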
datapipeline/config/workspace.py
CHANGED
@@ -125,6 +125,21 @@ class WorkspaceContext:
     def root(self) -> Path:
         return self.file_path.parent
 
+    def resolve_dataset_alias(self, alias: str) -> Optional[Path]:
+        """Resolve a dataset alias from jerry.yaml into an absolute project.yaml path."""
+        raw = (self.config.datasets or {}).get(alias)
+        if not raw:
+            return None
+        candidate = Path(raw)
+        candidate = (
+            candidate.resolve()
+            if candidate.is_absolute()
+            else (self.root / candidate).resolve()
+        )
+        if candidate.is_dir():
+            candidate = candidate / "project.yaml"
+        return candidate.resolve()
+
     def resolve_plugin_root(self) -> Optional[Path]:
         raw = self.config.plugin_root
         if not raw:
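The method accepts an alias that points either at a dataset folder or directly at a project.yaml file. A self-contained sketch of that path handling, with temporary directories standing in for a real workspace:

    import tempfile
    from pathlib import Path

    def alias_to_project_yaml(root: Path, raw: str) -> Path:
        # Same shape as resolve_dataset_alias above: relative aliases join the
        # workspace root, and folder aliases get project.yaml appended.
        candidate = Path(raw)
        candidate = candidate.resolve() if candidate.is_absolute() else (root / candidate).resolve()
        if candidate.is_dir():
            candidate = candidate / "project.yaml"
        return candidate.resolve()

    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        (root / "datasets" / "demo").mkdir(parents=True)
        print(alias_to_project_yaml(root, "datasets/demo"))               # .../datasets/demo/project.yaml
        print(alias_to_project_yaml(root, "datasets/demo/project.yaml"))  # same file, given explicitly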
datapipeline/services/scaffold/source.py
CHANGED
@@ -98,6 +98,7 @@ def create_source(
     format: Optional[str],
     root: Optional[Path],
     identity: bool = False,
+    project_yaml: Optional[Path] = None,
 ) -> None:
     root_dir, name, _ = pkg_root(root)
     base = resolve_base_pkg_dir(root_dir, name)
@@ -169,7 +170,7 @@ def create_source(
     # Resolve sources directory from a single dataset-scoped project config.
     # If not present or invalid, let the exception bubble up to prompt the user
     # to provide a valid project path.
-    proj_yaml = resolve_project_yaml_path(root_dir)
+    proj_yaml = project_yaml.resolve() if project_yaml is not None else resolve_project_yaml_path(root_dir)
     # Best-effort: create a minimal project scaffold if missing
     ensure_project_scaffold(proj_yaml)
     sources_dir = resolve_sources_dir(proj_yaml).resolve()
datapipeline/sources/foreach.py
ADDED
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+import re
+from typing import Any, Iterator, Mapping
+
+from datapipeline.plugins import LOADERS_EP
+from datapipeline.sources.models.loader import BaseDataLoader
+from datapipeline.utils.load import load_ep
+from datapipeline.utils.placeholders import normalize_args, MissingInterpolation, is_missing
+
+
+_VAR_RE = re.compile(r"\$\{([^}]+)\}")
+
+
+def _interpolate(obj: Any, vars_: Mapping[str, Any]) -> Any:
+    if isinstance(obj, dict):
+        return {k: _interpolate(v, vars_) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_interpolate(v, vars_) for v in obj]
+    if isinstance(obj, str):
+        match = _VAR_RE.fullmatch(obj)
+        if match:
+            key = match.group(1)
+            if key in vars_:
+                value = vars_[key]
+                if value is None or is_missing(value):
+                    return MissingInterpolation(key)
+                return value
+            return obj
+
+        def repl(m):
+            key = m.group(1)
+            value = vars_.get(key, m.group(0))
+            if value is None or is_missing(value):
+                return m.group(0)
+            return str(value)
+
+        return _VAR_RE.sub(repl, obj)
+    return obj
+
+
+class ForeachLoader(BaseDataLoader):
+    """Expand a loader spec across a foreach map and concatenate results."""
+
+    def __init__(
+        self,
+        *,
+        foreach: Mapping[str, list[Any]],
+        loader: Mapping[str, Any],
+        inject_field: str | None = None,
+        inject: Mapping[str, Any] | None = None,
+    ):
+        self._key, self._values = self._normalize_foreach(foreach)
+        self._loader_spec = self._normalize_loader_spec(loader)
+        self._inject_field = inject_field
+        self._inject = inject
+        self._current_index: int | None = None
+        self._current_value: Any | None = None
+        self._current_args: dict[str, Any] | None = None
+        self._current_transport: Any | None = None
+
+        if inject_field and inject:
+            raise ValueError("core.foreach supports only one of inject_field or inject")
+        if inject_field and self._key is None:
+            raise ValueError("inject_field requires a non-empty foreach map")
+        if inject is not None and not isinstance(inject, Mapping):
+            raise TypeError("inject must be a mapping when provided")
+
+    def load(self) -> Iterator[Any]:
+        for i, value in enumerate(self._values, 1):
+            vars_ = {self._key: value}
+            loader_args = self._make_loader_args(vars_)
+            loader = self._build_loader(loader_args)
+            self._current_index = i
+            self._current_value = value
+            self._current_args = loader_args
+            self._current_transport = getattr(loader, "transport", None)
+            inject_map = self._build_inject(vars_)
+            for row in loader.load():
+                if inject_map:
+                    yield self._apply_inject(row, inject_map)
+                else:
+                    yield row
+
+    def count(self):
+        total = 0
+        for value in self._values:
+            vars_ = {self._key: value}
+            loader_args = self._make_loader_args(vars_)
+            loader = self._build_loader(loader_args)
+            c = loader.count()
+            if c is None:
+                return None
+            total += int(c)
+        return total
+
+    @staticmethod
+    def _normalize_foreach(foreach: Mapping[str, list[Any]]):
+        if not isinstance(foreach, Mapping) or not foreach:
+            raise ValueError("core.foreach requires a non-empty foreach mapping")
+        keys = list(foreach.keys())
+        if len(keys) != 1:
+            raise ValueError("core.foreach currently supports exactly one foreach key")
+        key = keys[0]
+        values = foreach[key]
+        if not isinstance(values, list):
+            raise TypeError("core.foreach foreach values must be a list")
+        return str(key), list(values)
+
+    @staticmethod
+    def _normalize_loader_spec(loader: Mapping[str, Any]) -> Mapping[str, Any]:
+        if not isinstance(loader, Mapping):
+            raise TypeError("core.foreach loader must be a mapping with entrypoint/args")
+        entrypoint = loader.get("entrypoint")
+        if not entrypoint or not isinstance(entrypoint, str):
+            raise ValueError("core.foreach loader.entrypoint must be a non-empty string")
+        args = loader.get("args")
+        if args is not None and not isinstance(args, Mapping):
+            raise TypeError("core.foreach loader.args must be a mapping when provided")
+        return dict(loader)
+
+    def _make_loader_args(self, vars_: Mapping[str, Any]) -> dict[str, Any]:
+        args = self._loader_spec.get("args") or {}
+        interpolated = _interpolate(args, vars_)
+        return normalize_args(interpolated)
+
+    def _build_loader(self, loader_args: dict[str, Any]) -> BaseDataLoader:
+        entrypoint = self._loader_spec["entrypoint"]
+        L = load_ep(LOADERS_EP, entrypoint)
+        return L(**loader_args)
+
+    def _build_inject(self, vars_: Mapping[str, Any]) -> Mapping[str, Any] | None:
+        if self._inject_field:
+            return {self._inject_field: vars_.get(self._key)}
+        if self._inject is None:
+            return None
+        interpolated = _interpolate(self._inject, vars_)
+        if not isinstance(interpolated, Mapping):
+            raise TypeError("core.foreach inject must resolve to a mapping")
+        return normalize_args(interpolated)
+
+    @staticmethod
+    def _apply_inject(row: Any, inject_map: Mapping[str, Any]) -> Any:
+        if isinstance(row, dict):
+            row.update(inject_map)
+            return row
+        if isinstance(row, Mapping):
+            out = dict(row)
+            out.update(inject_map)
+            return out
+        raise TypeError("core.foreach inject requires mapping rows")
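A hedged end-to-end sketch of the fan-out idea: the inner loader and the wiring below are stand-ins (an in-memory toy loader, no entry-point lookup), not the package's real core.io loader, but the foreach, ${...} interpolation, and inject_field flow matches the class above.

    import re
    from typing import Any, Iterator, Mapping

    VAR_RE = re.compile(r"\$\{([^}]+)\}")

    def interpolate(obj: Any, vars_: Mapping[str, Any]) -> Any:
        # String-only version of the _interpolate helper above.
        if isinstance(obj, dict):
            return {k: interpolate(v, vars_) for k, v in obj.items()}
        if isinstance(obj, str):
            return VAR_RE.sub(lambda m: str(vars_.get(m.group(1), m.group(0))), obj)
        return obj

    class ToyCsvLoader:
        """Stand-in for the inner loader that gets built once per foreach value."""
        def __init__(self, path: str):
            self.path = path
        def load(self) -> Iterator[dict]:
            yield {"path": self.path, "close": 1.0}

    def foreach_load(foreach: dict, loader_args: dict, inject_field: str) -> Iterator[dict]:
        (key, values), = foreach.items()
        for value in values:
            args = interpolate(loader_args, {key: value})
            inner = ToyCsvLoader(**args)
            for row in inner.load():
                row[inject_field] = value  # same idea as inject_field above
                yield row

    rows = foreach_load(
        foreach={"symbol": ["AAPL", "MSFT"]},
        loader_args={"path": "data/${symbol}.csv"},
        inject_field="symbol",
    )
    for row in rows:
        print(row)
    # {'path': 'data/AAPL.csv', 'close': 1.0, 'symbol': 'AAPL'}
    # {'path': 'data/MSFT.csv', 'close': 1.0, 'symbol': 'MSFT'}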
datapipeline/templates/plugin_skeleton/your-dataset/project.yaml
CHANGED
@@ -7,7 +7,7 @@ paths:
   postprocess: postprocess.yaml
   artifacts: ../artifacts/${project_name}/v${version}
   tasks: ./tasks
-globals:
+globals: # Globals to use in your .yaml files via ${var_name}.
   # Primary dataset cadence; referenced from dataset.yaml (group_by)
   # and contracts via ${group_by}.
   group_by: <your-bucket-cadence>