jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +5 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +54 -10
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +76 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.1.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.1.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/runtime.py CHANGED
@@ -1,8 +1,9 @@
  from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any, List, Mapping, Optional, Sequence, Union
+ from datetime import datetime

- from datapipeline.config.run import RunConfig
+ from datapipeline.config.tasks import ServeTask
  from datapipeline.config.split import SplitConfig

  from datapipeline.registries.registry import Registry
@@ -66,7 +67,9 @@ class Runtime:
  registries: Registries = field(default_factory=Registries)
  split: Optional[SplitConfig] = None
  split_keep: Optional[str] = None
- run: Optional[RunConfig] = None
+ run: Optional[ServeTask] = None
+ schema_required: bool = True
+ window_bounds: tuple[datetime | None, datetime | None] | None = None
  artifacts: ArtifactManager = field(init=False)

  def __post_init__(self) -> None:
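
For orientation: window_bounds is an optional (start, end) pair in which either side may be None for an open-ended bound. A minimal sketch of that semantics, assuming inclusive bounds; the helper below is illustrative and not part of the package:

from datetime import datetime, timezone

def in_window(ts: datetime, bounds: tuple[datetime | None, datetime | None] | None) -> bool:
    # No bounds configured -> accept everything; otherwise clamp on each side.
    if bounds is None:
        return True
    start, end = bounds
    return (start is None or ts >= start) and (end is None or ts <= end)

bounds = (datetime(2021, 1, 1, tzinfo=timezone.utc), None)  # open-ended upper bound
assert in_window(datetime(2021, 6, 1, tzinfo=timezone.utc), bounds)
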
datapipeline/services/artifacts.py CHANGED
@@ -1,10 +1,11 @@
  from __future__ import annotations

  from dataclasses import dataclass
+ import json
  from pathlib import Path
  from typing import Any, Callable, Dict, Generic, Mapping, Optional, TypeVar

- from datapipeline.services.constants import PARTIONED_IDS
+ from datapipeline.services.constants import VECTOR_SCHEMA, VECTOR_SCHEMA_METADATA

  ArtifactValue = TypeVar("ArtifactValue")

@@ -85,12 +86,17 @@ class ArtifactManager:
  raise RuntimeError(message) from exc


- def _read_expected_ids(path: Path) -> list[str]:
+ def _read_schema(path: Path) -> dict:
  with path.open("r", encoding="utf-8") as fh:
- return [line.strip() for line in fh if line.strip()]
+ return json.load(fh)


- PARTITIONED_IDS_SPEC = ArtifactSpec[list[str]](
- key=PARTIONED_IDS,
- loader=_read_expected_ids,
+ VECTOR_SCHEMA_SPEC = ArtifactSpec[dict](
+ key=VECTOR_SCHEMA,
+ loader=_read_schema,
+ )
+
+ VECTOR_METADATA_SPEC = ArtifactSpec[dict](
+ key=VECTOR_SCHEMA_METADATA,
+ loader=_read_schema,
  )
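
The spec pattern above pairs an artifact key with a loader callable that turns a file path into the loaded value. A sketch of declaring another dict-backed spec in the same shape; the key name and loader below are hypothetical, only the ArtifactSpec[dict](key=..., loader=...) form is taken from the hunk:

import json
from pathlib import Path

def _read_json(path: Path) -> dict:
    # Same loader shape as _read_schema above.
    with path.open("r", encoding="utf-8") as fh:
        return json.load(fh)

# MY_JSON_SPEC = ArtifactSpec[dict](key="my_artifact", loader=_read_json)
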
datapipeline/services/bootstrap/config.py CHANGED
@@ -41,6 +41,12 @@ def _project_vars(data: dict) -> dict[str, Any]:
  if name:
  vars_["project"] = str(name)
  vars_["project_name"] = str(name)
+
+ version = data.get("version")
+ if version is not None:
+ vars_["version"] = str(version)
+ vars_["project_version"] = str(version)
+
  globals_ = data.get("globals") or {}
  for k, v in globals_.items():
  vars_[str(k)] = _serialize_global_value(v)
@@ -64,6 +70,24 @@ def artifacts_root(project_yaml: Path) -> Path:
  return (pj.parent / ap).resolve() if not ap.is_absolute() else ap


+ def run_root(project_yaml: Path, run_id: str | None = None) -> Path:
+ """Return a per-run artifacts directory under the project artifacts root.
+
+ Example:
+ artifacts_root: /.../artifacts/my_dataset/v3
+ run_root: /.../artifacts/my_dataset/v3/runs/2025-11-29T14-15-23Z
+ """
+ base = artifacts_root(project_yaml)
+
+ if run_id is None:
+ ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")
+ run_id = ts
+
+ root = (base / "runs" / run_id).resolve()
+ root.mkdir(parents=True, exist_ok=True)
+ return root
+
+
  def _load_by_key(
  project_yaml: Path,
  key: str,
@@ -131,6 +155,7 @@ def _interpolate(obj, vars_: dict[str, Any]):

  __all__ = [
  "artifacts_root",
+ "run_root",
  "_globals",
  "_interpolate",
  "_load_by_key",
datapipeline/services/bootstrap/core.py CHANGED
@@ -1,15 +1,14 @@
  from pathlib import Path
- from typing import Any, Mapping
+ from typing import Any

  from datapipeline.utils.load import load_yaml
  from datapipeline.config.catalog import StreamsConfig
- from datapipeline.config.run import load_run_config
+ from datapipeline.config.tasks import default_serve_task
  from datapipeline.services.project_paths import streams_dir, sources_dir
  from datapipeline.build.state import load_build_state
  from datapipeline.services.constants import (
  PARSER_KEY,
  LOADER_KEY,
- SOURCE_KEY,
  SOURCE_ID_KEY,
  MAPPER_KEY,
  ENTRYPOINT_KEY,
@@ -19,6 +18,7 @@ from datapipeline.services.constants import (
  from datapipeline.services.factories import (
  build_source_from_spec,
  build_mapper_from_spec,
+ build_composed_source,
  )

  from datapipeline.runtime import Runtime
@@ -28,9 +28,7 @@ from .config import (
  _globals,
  _interpolate,
  _load_by_key,
- _paths,
  _project,
- _project_vars,
  )


@@ -41,26 +39,28 @@ SRC_LOADER_KEY = LOADER_KEY
  def _load_sources_from_dir(project_yaml: Path, vars_: dict[str, Any]) -> dict:
  """Aggregate per-source YAML files into a raw-sources mapping.

- Expects each file to define a single source with top-level 'parser' and
- 'loader' keys. The source alias is inferred from the filename (without
- extension).
+ Scans for YAML files under the sources directory (recursing through
+ subfolders). Expects each file to define a single source with top-level
+ 'parser' and 'loader' keys. The top-level 'id' inside the file becomes the
+ runtime alias.
  """
- import os
  src_dir = sources_dir(project_yaml)
  if not src_dir.exists() or not src_dir.is_dir():
  return {}
  out: dict[str, dict] = {}
- for fname in sorted(os.listdir(src_dir)):
- if not (fname.endswith(".yaml") or fname.endswith(".yml")):
- continue
- data = load_yaml(src_dir / fname)
+ candidates = sorted(
+ (p for p in src_dir.rglob("*.y*ml") if p.is_file()),
+ key=lambda p: p.relative_to(src_dir).as_posix(),
+ )
+ for path in candidates:
+ data = load_yaml(path)
  if not isinstance(data, dict):
  continue
  if isinstance(data.get(SRC_PARSER_KEY), dict) and isinstance(data.get(SRC_LOADER_KEY), dict):
  alias = data.get(SOURCE_ID_KEY)
  if not alias:
  raise ValueError(
- f"Missing 'source_id' in source file: {fname}")
+ f"Missing 'id' in source file: {path.relative_to(src_dir)}")
  out[alias] = _interpolate(data, vars_)
  continue
  return out
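
For reference, the document shapes accepted by this source loader and by the contract loader in the next hunk, written as the dicts load_yaml would return; the ids and entrypoints are placeholders, not values shipped with the package:

# Per-source file under the sources directory: top-level 'parser' and 'loader'
# dicts are required, and 'id' becomes the runtime alias.
source_doc = {
    "id": "synthetic.ticks",
    "parser": {"entrypoint": "..."},
    "loader": {"entrypoint": "...", "args": {}},
}

# Contracts must declare kind: 'ingest' (requires 'source') or 'composed' (requires 'inputs').
ingest_contract = {
    "kind": "ingest",
    "id": "time.ticks.linear",
    "source": "synthetic.ticks",
    "mapper": {"entrypoint": "..."},
    "cadence": "${group_by}",  # optional; re-exposed to the same contract as ${cadence}
}
composed_contract = {
    "kind": "composed",
    "id": "time.ticks.hour_sin",
    "inputs": ["ticks=time.ticks.linear"],  # "[alias=]stream_id" grammar
    "mapper": {"entrypoint": "..."},        # composer entrypoint
}
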
@@ -81,13 +81,32 @@ def _load_canonical_streams(project_yaml: Path, vars_: dict[str, Any]) -> dict:
  if not p.is_file():
  continue
  data = load_yaml(p)
- # Require explicit ids: stream_id and source_id
- if isinstance(data, dict) and (SOURCE_ID_KEY in data) and (STREAM_ID_KEY in data):
- m = data.get(MAPPER_KEY)
- if (not isinstance(m, dict)) or (ENTRYPOINT_KEY not in (m or {})):
- data[MAPPER_KEY] = None
- alias = data.get(STREAM_ID_KEY)
- out[alias] = _interpolate(data, vars_)
+ # Contracts must declare kind: 'ingest' | 'composed'
+ if not isinstance(data, dict):
+ continue
+ kind = data.get("kind")
+ if kind not in {"ingest", "composed"}:
+ continue
+ if (STREAM_ID_KEY not in data):
+ continue
+ if kind == "ingest" and ("source" not in data):
+ continue
+ if kind == "composed" and ("inputs" not in data):
+ continue
+ m = data.get(MAPPER_KEY)
+ if (not isinstance(m, dict)) or (ENTRYPOINT_KEY not in (m or {})):
+ data[MAPPER_KEY] = None
+ # Support simple per-contract variables like 'cadence' while keeping
+ # project-level globals as the single source of truth for shared values.
+ local_vars = dict(vars_)
+ cadence_expr = data.get("cadence")
+ if cadence_expr is not None:
+ # Allow cadence to reference globals (e.g. ${group_by}) while also
+ # making ${cadence} usable elsewhere in the same contract.
+ resolved_cadence = _interpolate(cadence_expr, vars_)
+ local_vars["cadence"] = resolved_cadence
+ alias = data.get(STREAM_ID_KEY)
+ out[alias] = _interpolate(data, local_vars)
  return out


@@ -101,16 +120,7 @@ def load_streams(project_yaml: Path) -> StreamsConfig:

  def init_streams(cfg: StreamsConfig, runtime: Runtime) -> None:
  """Compile typed streams config into runtime registries."""
  regs = runtime.registries
- regs.stream_operations.clear()
- regs.debug_operations.clear()
- regs.partition_by.clear()
- regs.sort_batch_size.clear()
- regs.record_operations.clear()
- regs.feature_transforms.clear()
- regs.postprocesses.clear()
- regs.sources.clear()
- regs.mappers.clear()
- regs.stream_sources.clear()
+ regs.clear_all()

  # Register per-stream policies and record transforms for runtime lookups
  for alias, spec in (cfg.contracts or {}).items():
@@ -124,9 +134,16 @@ def init_streams(cfg: StreamsConfig, runtime: Runtime) -> None:
  for alias, spec in (cfg.raw or {}).items():
  regs.sources.register(alias, build_source_from_spec(spec))
  for alias, spec in (cfg.contracts or {}).items():
- mapper = build_mapper_from_spec(spec.mapper)
- regs.mappers.register(alias, mapper)
- regs.stream_sources.register(alias, regs.sources.get(spec.source_id))
+ if getattr(spec, "kind", None) == "composed":
+ # Composed stream: register virtual source and identity mapper
+ regs.stream_sources.register(
+ alias, build_composed_source(alias, spec, runtime)
+ )
+ regs.mappers.register(alias, build_mapper_from_spec(None))
+ else:
+ mapper = build_mapper_from_spec(spec.mapper)
+ regs.mappers.register(alias, mapper)
+ regs.stream_sources.register(alias, regs.sources.get(spec.source))


  def bootstrap(project_yaml: Path) -> Runtime:
@@ -146,9 +163,7 @@ def bootstrap(project_yaml: Path) -> Runtime:
  runtime.split = None

  try:
- runtime.run = load_run_config(project_yaml)
- except FileNotFoundError:
- runtime.run = None
+ runtime.run = default_serve_task(project_yaml)
  except Exception:
  runtime.run = None

datapipeline/services/constants.py CHANGED
@@ -1,19 +1,20 @@
  PARSER_KEY = "parser"
  LOADER_KEY = "loader"
  SOURCE_KEY = "source"
- SOURCE_ID_KEY = "source_id"
+ SOURCE_ID_KEY = "id"
  MAPPER_KEY = "mapper"
  ENTRYPOINT_KEY = "entrypoint"
  ARGS_KEY = "args"
- STREAM_ID_KEY = "stream_id"
+ STREAM_ID_KEY = "id"

  PARSERS_GROUP = "parsers"
  LOADERS_GROUP = "loaders"
  MAPPERS_GROUP = "mappers"
  FILTERS_GROUP = "filters"
- COMPOSED_LOADER_EP = "composed.loader"
+ DEFAULT_IO_LOADER_EP = "core.io"

- #POSTPROCESS_GLOBAL_KEY = "__global__"
+ # POSTPROCESS_GLOBAL_KEY = "__global__"
  POSTPROCESS_TRANSFORMS = "transforms"
- PARTIONED_IDS = "partitioned_ids"
  SCALER_STATISTICS = "scaler_statistics"
+ VECTOR_SCHEMA = "vector_schema"
+ VECTOR_SCHEMA_METADATA = "vector_schema_metadata"
datapipeline/services/factories.py CHANGED
@@ -1,9 +1,16 @@
  from datapipeline.utils.load import load_ep
  from datapipeline.plugins import PARSERS_EP, LOADERS_EP, MAPPERS_EP
  from datapipeline.sources.models.source import Source
- from datapipeline.config.catalog import SourceConfig, EPArgs
+ from datapipeline.config.catalog import SourceConfig, EPArgs, ContractConfig
  from datapipeline.mappers.noop import identity
  from datapipeline.utils.placeholders import normalize_args
+ from datapipeline.sources.models.base import SourceInterface
+ from datapipeline.pipeline.context import PipelineContext
+ from datapipeline.config.dataset.feature import FeatureRecordConfig
+ from datapipeline.pipeline.pipelines import build_feature_pipeline
+ from datapipeline.pipeline.utils.transform_utils import _supports_parameter
+ from inspect import isclass
+ from typing import Iterator, Any, Optional


  def build_source_from_spec(spec: SourceConfig) -> Source:
@@ -23,3 +30,118 @@ def build_mapper_from_spec(spec: EPArgs | None):
  if args:
  return lambda raw: fn(raw, **args)
  return fn
+
+
+ class _ComposedSource(SourceInterface):
+ def __init__(self, *, runtime, stream_id: str, spec: ContractConfig):
+ self._runtime = runtime
+ self._stream_id = stream_id
+ self._spec = spec
+
+ def stream(self):
+ context = PipelineContext(self._runtime)
+ raw_inputs = self._spec.inputs
+ input_specs = list(raw_inputs or [])
+ if not input_specs:
+ return iter(())
+
+ # Resolve inputs: "[alias=]stream_id" (streams only)
+ resolved = self._resolve_inputs(context, input_specs)
+ aligned = {k: v for k, v in resolved.items() if v["aligned"]}
+ aux = {k: v for k, v in resolved.items() if not v["aligned"]}
+
+ # Build aligned/aux iterators (unwrap FeatureRecord -> record for aligned)
+ aligned_iters: dict[str, Iterator[Any]] = {
+ k: (fr.record for fr in v["iter"])  # stage>=3 yields FeatureRecord
+ for k, v in aligned.items()
+ }
+ aux_iters: dict[str, Iterator[Any]] = {
+ k: v["iter"] for k, v in aux.items()}
+
+ # Load mapper (composer) from contract
+ mapper = self._spec.mapper
+ if not mapper or not mapper.entrypoint:
+ raise ValueError(
+ f"Composed stream '{self._stream_id}' requires mapper.entrypoint composer"
+ )
+ ep = load_ep(MAPPERS_EP, mapper.entrypoint)
+ kwargs = normalize_args(mapper.args)
+
+ # Choose driver among aligned inputs
+ aligned_keys = list(aligned_iters.keys())
+ if not aligned_keys:
+ driver_key = None
+ else:
+ driver_key = kwargs.pop("driver", None) or aligned_keys[0]
+
+ # Mapper adapters: Simple vs Advanced
+ if not isclass(ep) and not _supports_parameter(ep, "inputs"):
+ # Simple: expect a single iterator when exactly one aligned input and no aux
+ if len(aligned_iters) == 1 and not aux_iters:
+ single_iter = next(iter(aligned_iters.values()))
+ for rec in ep(single_iter):
+ yield getattr(rec, "record", rec)
+ return
+ raise TypeError(
+ "Mapper must accept inputs=... for multi-input or aux-enabled contracts"
+ )
+
+ # Advanced: pass inputs / aux / driver / context when supported
+ call_kwargs = dict(kwargs)
+ if _supports_parameter(ep, "context") and "context" not in call_kwargs:
+ call_kwargs["context"] = context
+ if _supports_parameter(ep, "aux"):
+ call_kwargs["aux"] = aux_iters
+ if driver_key and _supports_parameter(ep, "driver"):
+ call_kwargs["driver"] = driver_key
+
+ if isclass(ep):
+ inst = ep(**call_kwargs) if call_kwargs else ep()
+ binder = getattr(inst, "bind_context", None)
+ if callable(binder):
+ binder(context)
+ for rec in inst(inputs=aligned_iters):
+ yield getattr(rec, "record", rec)
+ return
+
+ for rec in ep(inputs=aligned_iters, **call_kwargs):
+ yield getattr(rec, "record", rec)
+
+ def _resolve_inputs(self, context: PipelineContext, specs: list[str]):
+ """Parse and resolve composed inputs into iterators.
+
+ Grammar: "[alias=]stream_id" only. All inputs are built to stage 4
+ and are alignable (FeatureRecord -> domain record unwrapped).
+ """
+ runtime = context.runtime
+ known_streams = set(runtime.registries.stream_sources.keys())
+
+ out: dict[str, dict] = {}
+ for spec in specs:
+ alias, ref = self._parse_input(spec)
+ if ref not in known_streams:
+ raise ValueError(
+ f"Unknown input stream '{ref}'. Known streams: {sorted(known_streams)}"
+ )
+ cfg = FeatureRecordConfig(record_stream=ref, id=alias)
+ it = build_feature_pipeline(context, cfg, stage=4)
+ out[alias] = {"iter": it, "aligned": True}
+
+ return out
+
+ @staticmethod
+ def _parse_input(text: str) -> tuple[str, str]:
+ # alias=stream_id
+ if "@" in text:
+ raise ValueError(
+ "composed inputs may not include '@stage'; streams align by default")
+ alias: Optional[str] = None
+ if "=" in text:
+ alias, text = text.split("=", 1)
+ ref = text
+ alias = alias or ref
+ return alias, ref
+
+
+ def build_composed_source(stream_id: str, spec: ContractConfig, runtime) -> SourceInterface:
+ return _ComposedSource(runtime=runtime, stream_id=stream_id, spec=spec)
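
A composer entrypoint compatible with the adapter logic above takes inputs= and may also accept aux, driver, and context, which _ComposedSource passes only when the callable supports them. The function below is a hypothetical sketch, not a mapper shipped with the package:

from typing import Any, Dict, Iterator, Optional

def my_composer(
    *,
    inputs: Dict[str, Iterator[Any]],
    aux: Optional[Dict[str, Iterator[Any]]] = None,
    driver: Optional[str] = None,
    context: Any = None,
) -> Iterator[Any]:
    # Iterate the driver stream (the first aligned input by default) and emit records.
    key = driver or next(iter(inputs))
    for record in inputs[key]:
        yield record  # a real composer would derive new fields here
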
datapipeline/services/project_paths.py CHANGED
@@ -1,6 +1,7 @@
  from __future__ import annotations

  from pathlib import Path
+ from typing import Optional

  from datapipeline.utils.load import load_yaml
  from datapipeline.config.project import ProjectConfig
@@ -35,21 +36,19 @@ def sources_dir(project_yaml: Path) -> Path:
  return p


- def build_config_path(project_yaml: Path) -> Path:
- """Return the resolved path to build.yaml declared in project.paths.build."""
+ def tasks_dir(project_yaml: Path) -> Path:
+ """Return the resolved path to the tasks directory (project.paths.tasks)."""

  cfg = read_project(project_yaml)
- build_path = getattr(cfg.paths, "build", None)
- if not build_path:
- raise FileNotFoundError(
- "project.paths.build must point to a build.yaml configuration file."
- )
- p = Path(build_path)
+ tasks_path = getattr(cfg.paths, "tasks", None)
+ if not tasks_path:
+ raise FileNotFoundError("project.paths.tasks must point to a tasks directory.")
+ p = Path(tasks_path)
  if not p.is_absolute():
  p = _project_root(project_yaml) / p
- if not p.exists():
- raise FileNotFoundError(f"build config not found: {p}")
- return p
+ if not p.exists() or not p.is_dir():
+ raise FileNotFoundError(f"tasks directory not found: {p}")
+ return p.resolve()


  def ensure_project_scaffold(project_yaml: Path) -> None:
@@ -64,14 +63,14 @@ def ensure_project_scaffold(project_yaml: Path) -> None:
  project_yaml.parent.mkdir(parents=True, exist_ok=True)
  default = (
  "version: 1\n"
+ "name: default\n"
  "paths:\n"
- " streams: ../../contracts\n"
- " sources: ../../sources\n"
+ " streams: ./contracts\n"
+ " sources: ./sources\n"
  " dataset: dataset.yaml\n"
  " postprocess: postprocess.yaml\n"
- " artifacts: ../../build/datasets/default\n"
- " build: build.yaml\n"
- " run: run.yaml\n"
+ " artifacts: ../artifacts/default\n"
+ " tasks: ./tasks\n"
  "globals:\n"
  " start_time: 2021-01-01T00:00:00Z\n"
  " end_time: 2021-12-31T23:00:00Z\n"
@@ -90,7 +89,35 @@ def ensure_project_scaffold(project_yaml: Path) -> None:
  if not sources.is_absolute():
  sources = _project_root(project_yaml) / sources
  sources.mkdir(parents=True, exist_ok=True)
+
+ tasks = getattr(cfg.paths, "tasks", None)
+ if tasks:
+ tasks_path = Path(tasks)
+ if not tasks_path.is_absolute():
+ tasks_path = _project_root(project_yaml) / tasks_path
+ tasks_path.mkdir(parents=True, exist_ok=True)
  except Exception:
  # If the file is malformed, leave it to callers to report; this helper
  # is best-effort to create a sensible starting point.
  pass
+
+
+ def resolve_project_yaml_path(plugin_root: Path) -> Path:
+ """Return a best-effort project.yaml path for scaffolding.
+
+ Resolution order:
+ 1) <plugin_root>/example/project.yaml
+ 2) <plugin_root>/config/project.yaml
+ 3) <plugin_root>/config/datasets/default/project.yaml
+ 4) Fallback: <plugin_root>/example/project.yaml
+ """
+ candidates = [
+ plugin_root / "example" / "project.yaml",
+ plugin_root / "config" / "project.yaml",
+ plugin_root / "config" / "datasets" / "default" / "project.yaml",
+ ]
+ for candidate in candidates:
+ if candidate.exists():
+ return candidate
+ # Default to the first candidate; callers may scaffold a new project there.
+ return candidates[0]
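
The net effect on project.yaml is that paths.build and paths.run give way to a paths.tasks directory. The scaffold default above corresponds to this mapping (values copied from the string literal, rendered here for readability):

default_paths = {
    "streams": "./contracts",
    "sources": "./sources",
    "dataset": "dataset.yaml",
    "postprocess": "postprocess.yaml",
    "artifacts": "../artifacts/default",
    "tasks": "./tasks",
}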