jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +1 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +3 -3
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +74 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.0.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.0.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,24 @@
 from __future__ import annotations
 
+import logging
 from contextlib import contextmanager
 from contextvars import ContextVar
 from dataclasses import dataclass, field
-from typing import Iterator, Mapping, Any
+from typing import Iterator, Mapping, Any, Callable, Optional
+from datetime import datetime
 
 from datapipeline.runtime import Runtime
+from datapipeline.pipeline.observability import ObserverRegistry
 from datapipeline.services.artifacts import (
+    ArtifactNotRegisteredError,
     ArtifactManager,
     ArtifactSpec,
     ArtifactValue,
-    PARTITIONED_IDS_SPEC,
+    VECTOR_SCHEMA_SPEC,
 )
+from datapipeline.utils.window import resolve_window_bounds
 
+logger = logging.getLogger(__name__)
 
 _current_context: ContextVar[PipelineContext | None] = ContextVar(
     "datapipeline_pipeline_context", default=None
@@ -24,6 +30,8 @@ class PipelineContext:
     """Lightweight runtime context shared across pipeline stages."""
 
     runtime: Runtime
+    transform_observer: Callable[..., None] | None = None
+    observer_registry: Optional[ObserverRegistry] = None
     _cache: dict[str, Any] = field(default_factory=dict)
 
     @property
@@ -42,13 +50,64 @@ class PipelineContext:
     def require_artifact(self, spec: ArtifactSpec[ArtifactValue]) -> ArtifactValue:
         return self.artifacts.load(spec)
 
-    def load_expected_ids(self) -> list[str]:
-        ids = self._cache.get("expected_ids")
-        if ids is None:
-            ids = list(self.artifacts.load(PARTITIONED_IDS_SPEC))
-            self._cache["expected_ids"] = ids
+    def load_expected_ids(self, *, payload: str = "features") -> list[str]:
+        key = f"expected_ids:{payload}"
+        cached = self._cache.get(key)
+        if cached is not None:
+            return list(cached)
+        entries = self.load_schema(payload=payload)
+        if not entries:
+            if payload == "targets":
+                logger.debug("Target schema entries missing; proceeding without target baseline.")
+                self._cache[key] = []
+                return []
+            raise RuntimeError("Vector schema artifact missing; run `jerry build` to materialize schema.json.")
+        ids = [entry["id"] for entry in entries if isinstance(entry.get("id"), str)]
+        self._cache[key] = ids
         return list(ids)
 
+    def load_schema(self, *, payload: str = "features") -> list[dict[str, Any]]:
+        key = f"schema:{payload}"
+        cached = self._cache.get(key)
+        if cached is None:
+            try:
+                doc = self.artifacts.load(VECTOR_SCHEMA_SPEC)
+            except ArtifactNotRegisteredError:
+                cached = []
+            else:
+                section = doc.get("targets" if payload == "targets" else "features")
+                if isinstance(section, list):
+                    cached = [entry for entry in section if isinstance(entry, dict)]
+                else:
+                    cached = []
+            self._cache[key] = cached
+        return [dict(entry) for entry in cached] if cached else []
+
+    @property
+    def schema_required(self) -> bool:
+        return bool(getattr(self.runtime, "schema_required", True))
+
+    def window_bounds(self, *, rectangular_required: bool = False) -> tuple[datetime | None, datetime | None]:
+        key = "window_bounds:required" if rectangular_required else "window_bounds:optional"
+        cached = self._cache.get(key)
+        if cached is not None:
+            return cached
+        bounds = resolve_window_bounds(self.runtime, rectangular_required)
+        if rectangular_required:
+            self.runtime.window_bounds = bounds
+        self._cache[key] = bounds
+        return bounds
+
+    @property
+    def start_time(self) -> datetime | None:
+        start, _ = self.window_bounds()
+        return start
+
+    @property
+    def end_time(self) -> datetime | None:
+        _, end = self.window_bounds()
+        return end
+
     @contextmanager
     def activate(self) -> Iterator[PipelineContext]:
         token = _current_context.set(self)
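Illustrative only (not part of the diff): a minimal sketch of how the new PipelineContext helpers above could be consumed from a stage, assuming an already-bootstrapped context `ctx`; the schema entry shape (dicts carrying an "id" key) follows load_expected_ids above.

    from datapipeline.pipeline.context import PipelineContext

    def describe_context(ctx: PipelineContext) -> dict:
        # Sketch: read the schema-derived baseline and the cached window bounds.
        with ctx.activate():
            feature_ids = ctx.load_expected_ids(payload="features")  # raises if schema.json is missing
            target_entries = ctx.load_schema(payload="targets")      # [] when no target schema exists
            start, end = ctx.window_bounds()                         # (datetime | None, datetime | None)
        return {
            "n_features": len(feature_ids),
            "has_targets": bool(target_entries),
            "window": (start, end),
        }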
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Callable, Mapping, Optional, Protocol, runtime_checkable
+
+
+@dataclass(frozen=True)
+class TransformEvent:
+    type: str
+    payload: Mapping[str, object]
+
+
+# Observer receives a structured event.
+Observer = Callable[[TransformEvent], None]
+# Factory builds an observer for a given logger (may return None if not active at current level).
+ObserverFactory = Callable[[logging.Logger], Optional[Observer]]
+
+
+@runtime_checkable
+class SupportsObserver(Protocol):
+    def set_observer(self, observer: Optional[Observer]) -> None:
+        ...
+
+
+class ObserverRegistry:
+    def __init__(self, factories: Optional[Mapping[str, ObserverFactory]] = None) -> None:
+        self._factories: dict[str, ObserverFactory] = dict(factories or {})
+
+    def register(self, name: str, factory: ObserverFactory) -> None:
+        self._factories[name] = factory
+
+    def get(self, name: str, logger: logging.Logger) -> Optional[Observer]:
+        factory = self._factories.get(name)
+        if not factory:
+            return None
+        return factory(logger)
+
+
+def _scaler_observer_factory(logger: logging.Logger) -> Optional[Observer]:
+    if not logger.isEnabledFor(logging.DEBUG):
+        return None
+
+    warned: set[str] = set()
+
+    def _observer(event: TransformEvent) -> None:
+        if event.type != "scaler_none":
+            return
+        fid = event.payload.get("feature_id")
+        if logger.isEnabledFor(logging.DEBUG):
+            if isinstance(fid, str) and fid not in warned:
+                warned.add(fid)
+                logger.warning(
+                    "Scaler encountered None value during scaling for feature=%s "
+                    "(further occurrences suppressed; consider fill/lint upstream).",
+                    fid,
+                )
+
+    return _observer
+
+
+def default_observer_registry() -> ObserverRegistry:
+    registry = ObserverRegistry()
+    registry.register("scale", _scaler_observer_factory)
+    return registry
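For orientation (not part of the diff): a small sketch of how the new observability module is meant to be exercised — the registry hands back an observer for the "scale" transform only when its logger is enabled for DEBUG, and that observer reacts to "scaler_none" events. The logger name below is arbitrary.

    import logging

    from datapipeline.pipeline.observability import TransformEvent, default_observer_registry

    logging.basicConfig(level=logging.DEBUG)
    registry = default_observer_registry()

    # The factory returns None unless the logger is enabled for DEBUG.
    observer = registry.get("scale", logging.getLogger("vector.transforms.scale"))
    if observer is not None:
        # The first event per feature id logs a warning; repeats are suppressed.
        observer(TransformEvent(type="scaler_none", payload={"feature_id": "temp@station:ST1"}))
        observer(TransformEvent(type="scaler_none", payload={"feature_id": "temp@station:ST1"}))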
@@ -1,7 +1,10 @@
 import heapq
 from collections.abc import Iterator, Sequence
 from typing import Any
+from itertools import tee
 
+from datapipeline.domain.sample import Sample
+from datapipeline.domain.vector import Vector
 from datapipeline.pipeline.utils.keygen import group_key_for
 from datapipeline.pipeline.utils.memory_sort import batch_sort
 from datapipeline.config.dataset.feature import FeatureRecordConfig
@@ -13,6 +16,9 @@ from datapipeline.pipeline.stages import (
     regularize_feature_stream,
     apply_feature_transforms,
     vector_assemble_stage,
+    sample_assemble_stage,
+    align_stream,
+    window_keys,
 )
 from datapipeline.pipeline.context import PipelineContext
 
@@ -72,20 +78,66 @@ def build_vector_pipeline(
     context: PipelineContext,
     configs: Sequence[FeatureRecordConfig],
     group_by_cadence: str,
-    stage: int | None = None,
+    target_configs: Sequence[FeatureRecordConfig] | None = None,
+    *,
+    rectangular: bool = True,
 ) -> Iterator[Any]:
-    """Build the vector assembly pipeline.
-    Stages:
-    - 0..5: delegates to feature pipeline for the first configured feature
-    - 6: assembled vectors
-    """
-    if stage is not None and stage <= 5:
-        first = next(iter(configs))
-        return build_feature_pipeline(context, first, stage=stage)
-
-    streams = [build_feature_pipeline(context, cfg, stage=None) for cfg in configs]
+    """Build the vector assembly pipeline for features and optionally attach targets."""
+    feature_cfgs = list(configs)
+    target_cfgs = list(target_configs or [])
+    if not feature_cfgs and not target_cfgs:
+        return iter(())
+
+    if rectangular:
+        start, end = context.window_bounds(rectangular_required=True)
+        keys = window_keys(start, end, group_by_cadence)
+    else:
+        keys = None
+
+    feature_vectors = _assemble_vectors(
+        context,
+        feature_cfgs,
+        group_by_cadence,
+    )
+    if keys is not None:
+        # share keys across feature/target alignment
+        if target_cfgs:
+            keys_feature, keys_target = tee(keys, 2)
+        else:
+            keys_feature = keys
+            keys_target = None
+        feature_vectors = align_stream(feature_vectors, keys=keys_feature)
+    else:
+        keys_target = None
+
+    if not target_cfgs:
+        return sample_assemble_stage(feature_vectors)
+
+    target_vectors = _assemble_vectors(
+        context,
+        target_cfgs,
+        group_by_cadence,
+    )
+    if keys is not None:
+        target_vectors = align_stream(target_vectors, keys=keys_target)
+    return sample_assemble_stage(feature_vectors, target_vectors)
+
+
+def _assemble_vectors(
+    context: PipelineContext,
+    configs: Sequence[FeatureRecordConfig],
+    group_by_cadence: str,
+) -> Iterator[tuple[tuple, Vector]]:
+    if not configs:
+        return iter(())
+    streams = [
+        build_feature_pipeline(
+            context,
+            cfg,
+        )
+        for cfg in configs
+    ]
     merged = heapq.merge(
         *streams, key=lambda fr: group_key_for(fr, group_by_cadence)
     )
-    vectors = vector_assemble_stage(merged, group_by_cadence)
-    return vectors
+    return vector_assemble_stage(merged, group_by_cadence)
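A hypothetical call site, shown only to illustrate the new keyword surface (target_configs, rectangular) that replaces the old stage argument; the context and config objects are assumed to come from the loaded dataset configuration.

    from collections.abc import Iterator, Sequence

    from datapipeline.config.dataset.feature import FeatureRecordConfig
    from datapipeline.domain.sample import Sample
    from datapipeline.pipeline.context import PipelineContext
    from datapipeline.pipeline.pipelines import build_vector_pipeline

    def hourly_samples(
        ctx: PipelineContext,
        feature_cfgs: Sequence[FeatureRecordConfig],
        target_cfgs: Sequence[FeatureRecordConfig],
    ) -> Iterator[Sample]:
        # Samples now come back as Sample objects instead of (key, Vector) tuples.
        return build_vector_pipeline(
            ctx,
            feature_cfgs,
            group_by_cadence="1h",
            target_configs=target_cfgs,
            rectangular=True,  # align both streams onto the resolved window keys
        )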
@@ -1,12 +1,12 @@
 import hashlib
-from collections.abc import Iterator, Mapping, MutableMapping, Sequence
+from collections.abc import Iterator, Mapping, Sequence
 from datetime import datetime
-from typing import Any, Literal, Tuple
+from typing import Any, Literal
 
+from datapipeline.domain.sample import Sample
 from datapipeline.domain.vector import Vector
 from datapipeline.config.split import (
     SplitConfig,
-    HashSplitConfig,
     TimeSplitConfig,
 )
 
@@ -121,24 +121,25 @@ class VectorSplitApplicator:
             self._keep is None or self._keep_placeholder)
         )
 
-    def __call__(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
+    def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
         return self.apply(stream)
 
-    def apply(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
-        for group_key, vector in stream:
+    def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+        for sample in stream:
+            group_key, vector = sample.key, sample.features
             label = self._labeler.label(group_key, vector)
             if self._output == "filter":
                 if not self._filter_enabled:
-                    yield group_key, vector
+                    yield sample
                     continue
                 if label == self._keep:
-                    yield group_key, vector
+                    yield sample
                 else:
                     continue
             else:
                 data = clone(vector.values)
                 data[self._field] = label
-                yield group_key, Vector(values=data)
+                yield sample.with_features(Vector(values=data))
 
 
 def build_labeler(cfg: SplitConfig) -> BaseLabeler:
@@ -153,7 +154,7 @@ def build_applicator(cfg: SplitConfig, keep: str | None = None) -> VectorSplitAp
     return VectorSplitApplicator(labeler=labeler, output="filter", keep=selected)
 
 
-def apply_split_stage(runtime, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
+def apply_split_stage(runtime, stream: Iterator[Sample]) -> Iterator[Sample]:
     """Apply project-configured split at the end of the vector pipeline.
 
     Reads `runtime.split` (set during bootstrap from project.globals.split) and,
@@ -1,20 +1,25 @@
 from collections import defaultdict
-from itertools import groupby
-from typing import Any, Iterable, Iterator, Tuple, Mapping
+from itertools import chain, groupby
+from typing import Any, Iterable, Iterator, Mapping
+from datetime import datetime
+
 from datapipeline.pipeline.context import PipelineContext
-from datapipeline.services.artifacts import PARTITIONED_IDS_SPEC
 from datapipeline.services.constants import POSTPROCESS_TRANSFORMS, SCALER_STATISTICS
 
 from datapipeline.domain.feature import FeatureRecord, FeatureRecordSequence
 from datapipeline.domain.vector import Vector, vectorize_record_group
+from datapipeline.domain.sample import Sample
 from datapipeline.pipeline.utils.memory_sort import batch_sort
+
 from datapipeline.pipeline.utils.transform_utils import apply_transforms
 from datapipeline.plugins import FEATURE_TRANSFORMS_EP, VECTOR_TRANSFORMS_EP, RECORD_TRANSFORMS_EP, STREAM_TRANFORMS_EP, DEBUG_TRANSFORMS_EP
 
 from datapipeline.domain.record import TemporalRecord
 from datapipeline.pipeline.utils.keygen import FeatureIdGenerator, group_key_for
 from datapipeline.sources.models.source import Source
-from datapipeline.pipeline.split import apply_split_stage as split_stage
+from datapipeline.transforms.vector import VectorEnsureSchemaTransform
+from datapipeline.config.dataset.normalize import floor_time_to_bucket
+from datapipeline.utils.time import parse_timecode
 
 
 def open_source_stream(context: PipelineContext, stream_alias: str) -> Source:
@@ -39,7 +44,8 @@ def apply_record_operations(
 ) -> Iterator[TemporalRecord]:
     """Apply record transforms defined in contract policies in order."""
     steps = context.runtime.registries.record_operations.get(stream_id)
-    records = apply_transforms(record_stream, RECORD_TRANSFORMS_EP, steps, context)
+    records = apply_transforms(
+        record_stream, RECORD_TRANSFORMS_EP, steps, context)
     return records
 
 
@@ -65,7 +71,7 @@ def regularize_feature_stream(
     batch_size: int,
 ) -> Iterator[FeatureRecord]:
     """Apply feature transforms defined in contract policies in order."""
-    # Sort by (id, time) to satisfy stream transforms (ensure_ticks/fill)
+    # Sort by (id, time) to satisfy stream transforms (ensure_cadence/fill)
     sorted = batch_sort(
         feature_stream,
         batch_size=batch_size,
@@ -121,11 +127,8 @@ def apply_feature_transforms(
 def vector_assemble_stage(
     merged: Iterator[FeatureRecord | FeatureRecordSequence],
     group_by_cadence: str,
-) -> Iterator[Tuple[Any, Vector]]:
-    """Group the merged feature stream by group_key.
-    Coalesce each partitioned feature_id into record buckets.
-    Yield (group_key, Vector) pairs ready for downstream consumption."""
-
+) -> Iterator[tuple[tuple, Vector]]:
+    """Group merged feature stream by key and emit raw vectors."""
     for group_key, group in groupby(
         merged, key=lambda fr: group_key_for(fr, group_by_cadence)
     ):
@@ -136,23 +139,131 @@ def vector_assemble_stage(
         else:
             records = [fr.record]
         feature_map[fr.id].extend(records)
-        yield group_key, vectorize_record_group(feature_map)
+        vector = vectorize_record_group(feature_map)
+        yield group_key, vector
+
+
+def window_keys(start: datetime | None, end: datetime | None, cadence: str | None) -> Iterator[tuple] | None:
+    if start is None or end is None or cadence is None:
+        return None
+    try:
+        current = floor_time_to_bucket(start, cadence)
+        stop = floor_time_to_bucket(end, cadence)
+        step = parse_timecode(cadence)
+    except Exception:
+        return None
+    if stop < current:
+        return None
+
+    def _iter():
+        t = current
+        while t <= stop:
+            yield (t,)
+            t = t + step
+
+    return _iter()
+
+
+def align_stream(
+    stream: Iterator[tuple[tuple, Vector]] | None,
+    keys: Iterator[tuple] | None,
+) -> Iterator[tuple[tuple, Vector]]:
+    if keys is None:
+        return iter(stream or ())
+    it = iter(stream or ())
+    current = next(it, None)
+    for key in keys:
+        while current and current[0] < key:
+            current = next(it, None)
+        if current and current[0] == key:
+            yield current
+            current = next(it, None)
+        else:
+            yield (key, Vector(values={}))
+
+
+def sample_assemble_stage(
+    feature_vectors: Iterator[tuple[tuple, Vector]],
+    target_vectors: Iterator[tuple[tuple, Vector]] | None = None,
+) -> Iterator[Sample]:
+    """Combine feature/target vectors into Sample objects."""
+    feature_iter = iter(feature_vectors)
+    target_iter = iter(target_vectors or ())
+
+    def _advance(it):
+        try:
+            return next(it)
+        except StopIteration:
+            return None
+
+    current_feature = _advance(feature_iter)
+    current_target = _advance(target_iter)
+
+    while current_feature:
+        feature_key, feature_vector = current_feature
+        targets = None
+
+        while current_target and current_target[0] < feature_key:
+            current_target = _advance(target_iter)
+
+        if current_target and current_target[0] == feature_key:
+            targets = current_target[1]
+            current_target = _advance(target_iter)
+
+        yield Sample(key=feature_key, features=feature_vector, targets=targets)
+        current_feature = _advance(feature_iter)
 
 
 def post_process(
     context: PipelineContext,
-    stream: Iterator[Tuple[Any, Vector]],
-) -> Iterator[Tuple[Any, Vector]]:
+    stream: Iterator[Sample],
+) -> Iterator[Sample]:
     """Apply project-scoped postprocess transforms (from registry).
 
     Explicit prereq artifact flow:
     - Read a precomputed expected feature-id list (full ids) from the build
       folder. If missing, instruct the user to generate it via CLI.
     """
+    stream = _apply_vector_schema(context, stream)
     runtime = context.runtime
     transforms = runtime.registries.postprocesses.get(POSTPROCESS_TRANSFORMS)
-
     if not transforms:
         return stream
-
     return apply_transforms(stream, VECTOR_TRANSFORMS_EP, transforms, context)
+
+
+def _apply_vector_schema(
+    context: PipelineContext,
+    stream: Iterator[Sample],
+) -> Iterator[Sample]:
+    with context.activate():
+        feature_entries = context.load_schema(payload="features")
+        target_entries = context.load_schema(payload="targets")
+
+    if not feature_entries:
+        if context.schema_required:
+            raise RuntimeError("Schema missing for payload 'features'. Run `jerry build` to materialize schema.json.")
+        feature_stream = stream
+    else:
+        feature_schema = VectorEnsureSchemaTransform(on_missing="fill", on_extra="drop")
+        feature_schema.bind_context(context)
+        feature_stream = feature_schema(stream)
+
+    def _apply_targets(upstream: Iterator[Sample]) -> Iterator[Sample]:
+        if target_entries:
+            target_schema = VectorEnsureSchemaTransform(payload="targets", on_missing="fill", on_extra="drop")
+            target_schema.bind_context(context)
+            return target_schema(upstream)
+        if not context.schema_required:
+            return upstream
+        # schema required but missing: only raise if targets are present in stream
+        iterator = iter(upstream)
+        try:
+            first = next(iterator)
+        except StopIteration:
+            return iter(())
+        if first.targets is None:
+            return chain([first], iterator)
+        raise RuntimeError("Schema missing for payload 'targets'. Run `jerry build` to materialize schema.json.")
+
+    return _apply_targets(feature_stream)
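A toy, self-contained illustration (not from the package) of the alignment helpers added above: keys missing from the feature stream are padded with empty Vectors, and targets are joined to features by matching key in sample_assemble_stage.

    from datetime import datetime, timedelta

    from datapipeline.domain.vector import Vector
    from datapipeline.pipeline.stages import align_stream, sample_assemble_stage

    t0 = datetime(2024, 1, 1)
    keys = [(t0 + timedelta(hours=h),) for h in range(3)]

    features = iter([(keys[0], Vector(values={"x": 1.0})), (keys[2], Vector(values={"x": 3.0}))])
    targets = iter([(keys[2], Vector(values={"y": 30.0}))])

    aligned = align_stream(features, keys=iter(keys))        # the hour-1 key becomes an empty Vector
    for sample in sample_assemble_stage(aligned, targets):   # targets matched by key, else None
        print(sample.key, sample.features.values, sample.targets)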
@@ -1,7 +1,8 @@
 from typing import Union, List, Any
 from datetime import datetime
 
-from datapipeline.config.dataset.normalize import floor_time_to_resolution
+from datapipeline.config.dataset.normalize import floor_time_to_bucket
+from datapipeline.transforms.vector_utils import PARTITION_SEP
 
 
 class FeatureIdGenerator:
@@ -9,18 +10,30 @@ class FeatureIdGenerator:
     Generates unique feature keys by appending suffixes from expand_by fields.
     """
 
+    COMPONENT_PREFIX = "@"
+    COMPONENT_JOINER = "_"
+    VALUE_DELIMITER = ":"
+
     def __init__(self, partition_by: Union[str, List[str], None]):
         self.partition_by = partition_by
 
+    def _format_component(self, field: str, value: Any) -> str:
+        value_str = "" if value is None else str(value)
+        return f"{self.COMPONENT_PREFIX}{field}{self.VALUE_DELIMITER}{value_str}"
+
     def generate(self, base_id: str, record: Any) -> str:
         if not self.partition_by:
             return base_id
         if isinstance(self.partition_by, str):
-            suffix = getattr(record, self.partition_by)
+            value = getattr(record, self.partition_by)
+            suffix = self._format_component(self.partition_by, value)
         else:
-            suffix = "__".join(str(getattr(record, f))
-                               for f in self.partition_by)
-        return f"{base_id}__{suffix}"
+            parts = [
+                self._format_component(field, getattr(record, field))
+                for field in self.partition_by
+            ]
+            suffix = self.COMPONENT_JOINER.join(parts)
+        return f"{base_id}{PARTITION_SEP}{suffix}"
 
 
 def _anchor_time(item: Any) -> datetime | None:
@@ -36,7 +49,7 @@ def _anchor_time(item: Any) -> datetime | None:
     return getattr(recs[0], "time", None) if recs else None
 
 
-def group_key_for(item: Any, resolution: str) -> tuple:
+def group_key_for(item: Any, cadence: str) -> tuple:
     """Compute 1-tuple bucket key from a FeatureRecord or FeatureRecordSequence."""
     t = _anchor_time(item)
-    return (floor_time_to_resolution(t, resolution),)
+    return (floor_time_to_bucket(t, cadence),)
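A worked example of the new partition-id format (illustrative; the value of PARTITION_SEP comes from datapipeline.transforms.vector_utils and is not shown in this diff, so it is left symbolic here).

    from dataclasses import dataclass

    from datapipeline.pipeline.utils.keygen import FeatureIdGenerator
    from datapipeline.transforms.vector_utils import PARTITION_SEP

    @dataclass
    class Reading:
        station: str
        sensor: str

    gen = FeatureIdGenerator(partition_by=["station", "sensor"])
    feature_id = gen.generate("temp", Reading(station="ST1", sensor="A"))
    # Each partition field becomes "@<field>:<value>", joined with "_",
    # and appended to the base id via PARTITION_SEP.
    assert feature_id == f"temp{PARTITION_SEP}@station:ST1_@sensor:A"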
@@ -1,17 +1,12 @@
 from typing import Iterable, Iterator, Callable, TypeVar
 import heapq
-
-
-def apply_pipeline(stream, stages):
-    for stage in stages:
-        stream = stage(stream)
-    return stream
+from itertools import count
 
 
 T = TypeVar("T")
 
 
-def read_batches(iterable: Iterable[T], batch_size: int, key: Callable[[T], any]) -> Iterator[list[T]]:
+def read_batches(iterable: Iterable[T], batch_size: int, key: Callable[[T], object]) -> Iterator[list[T]]:
     batch = []
     for item in iterable:
         batch.append(item)
@@ -22,6 +17,23 @@ def read_batches(iterable: Iterable[T], batch_size: int, key: Callable[[T], any]
         yield sorted(batch, key=key)
 
 
-def batch_sort(iterable: Iterable[T], batch_size: int, key: Callable[[T], any]) -> Iterator[T]:
-    sorted_batches = read_batches(iterable, batch_size, key)
-    return heapq.merge(*sorted_batches, key=key)
+def batch_sort(iterable: Iterable[T], batch_size: int, key: Callable[[T], object]) -> Iterator[T]:
+    """Sort an iterable by chunking then merging to reduce peak memory usage."""
+    batches = read_batches(iterable, batch_size, key)
+
+    heap: list[tuple[object, int, T, Iterator[T]]] = []
+    seq = count()
+
+    for batch in batches:
+        it = iter(batch)
+        first = next(it, None)
+        if first is None:
+            continue
+        heapq.heappush(heap, (key(first), next(seq), first, it))
+
+    while heap:
+        _, _, item, it = heapq.heappop(heap)
+        yield item
+        nxt = next(it, None)
+        if nxt is not None:
+            heapq.heappush(heap, (key(nxt), next(seq), nxt, it))
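A small usage sketch (not part of the package) of the reworked batch_sort above: each chunk of batch_size items is sorted eagerly, then the sorted chunks are merged lazily via the heap.

    from datapipeline.pipeline.utils.memory_sort import batch_sort

    events = [5, 1, 4, 2, 8, 3, 7, 6]
    # With batch_size=3 the chunks [5, 1, 4], [2, 8, 3], [7, 6] are sorted
    # individually and then merged into a single ordered stream.
    assert list(batch_sort(events, batch_size=3, key=lambda x: x)) == sorted(events)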
@@ -1,9 +1,11 @@
+import logging
 from collections.abc import Callable, Iterator, Mapping, Sequence
 from typing import Any, Optional, Tuple
 from inspect import isclass, signature, Parameter
 from contextlib import nullcontext
 
 from datapipeline.pipeline.context import PipelineContext
+from datapipeline.pipeline.observability import ObserverRegistry, SupportsObserver, TransformEvent
 
 from datapipeline.utils.load import load_ep
 
@@ -79,9 +81,16 @@ def apply_transforms(
     group: str,
     transforms: Optional[Sequence[Mapping[str, Any]]],
     context: Optional[PipelineContext] = None,
+    observer: Callable[[TransformEvent], None] | None = None,
+    observer_registry: ObserverRegistry | None = None,
 ) -> Iterator[Any]:
     """Instantiate and apply configured transforms in order."""
 
+    observer = observer or (getattr(context, "transform_observer", None)
+                            if context is not None else None)
+    registry = observer_registry or (getattr(context, "observer_registry", None)
+                                     if context is not None else None)
+
     context_cm = context.activate() if context else nullcontext()
     with context_cm:
         for transform in transforms or ():
@@ -90,7 +99,20 @@ def apply_transforms(
             if isclass(ep):
                 inst = _instantiate_entry_point(ep, params, context)
                 _bind_context(inst, context)
+                eff_observer = observer
+                if eff_observer is None and registry:
+                    eff_observer = registry.get(
+                        name, logging.getLogger(f"{group}.{name}")
+                    )
+                _attach_observer(inst, eff_observer)
                 stream = inst(stream)
             else:
                 stream = _call_with_params(ep, stream, params, context)
     return stream
+
+
+def _attach_observer(transform: Any, observer: Callable[..., None] | None) -> None:
+    if observer is None:
+        return
+    if isinstance(transform, SupportsObserver):
+        transform.set_observer(observer)
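A sketch (hypothetical, not from the diff) of what a transform needs in order to receive an observer through _attach_observer above: implementing the SupportsObserver protocol and emitting TransformEvents is enough; the observer itself is supplied by the context or the registry.

    from collections.abc import Iterator
    from typing import Optional

    from datapipeline.pipeline.observability import Observer, SupportsObserver, TransformEvent

    class HypotheticalScaleTransform:
        """Illustrative only; the real scaler lives in datapipeline/transforms/feature/scaler.py."""

        def __init__(self) -> None:
            self._observer: Optional[Observer] = None

        # Satisfies the SupportsObserver protocol, so apply_transforms can attach an observer.
        def set_observer(self, observer: Optional[Observer]) -> None:
            self._observer = observer

        def __call__(self, stream: Iterator[dict]) -> Iterator[dict]:
            for row in stream:
                if row.get("value") is None and self._observer is not None:
                    self._observer(TransformEvent(type="scaler_none", payload={"feature_id": row.get("id")}))
                yield row

    assert isinstance(HypotheticalScaleTransform(), SupportsObserver)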