jerry-thomas 1.0.3__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. datapipeline/analysis/vector/collector.py +0 -1
  2. datapipeline/build/tasks/config.py +0 -2
  3. datapipeline/build/tasks/metadata.py +0 -2
  4. datapipeline/build/tasks/scaler.py +0 -2
  5. datapipeline/build/tasks/schema.py +0 -2
  6. datapipeline/build/tasks/utils.py +0 -2
  7. datapipeline/cli/app.py +201 -81
  8. datapipeline/cli/commands/contract.py +145 -283
  9. datapipeline/cli/commands/demo.py +13 -0
  10. datapipeline/cli/commands/domain.py +4 -4
  11. datapipeline/cli/commands/dto.py +11 -0
  12. datapipeline/cli/commands/filter.py +2 -2
  13. datapipeline/cli/commands/inspect.py +0 -68
  14. datapipeline/cli/commands/list_.py +30 -13
  15. datapipeline/cli/commands/loader.py +11 -0
  16. datapipeline/cli/commands/mapper.py +82 -0
  17. datapipeline/cli/commands/parser.py +45 -0
  18. datapipeline/cli/commands/run_config.py +1 -3
  19. datapipeline/cli/commands/serve_pipeline.py +5 -7
  20. datapipeline/cli/commands/source.py +106 -18
  21. datapipeline/cli/commands/stream.py +292 -0
  22. datapipeline/cli/visuals/common.py +0 -2
  23. datapipeline/cli/visuals/sections.py +0 -2
  24. datapipeline/cli/workspace_utils.py +0 -3
  25. datapipeline/config/context.py +0 -2
  26. datapipeline/config/dataset/feature.py +1 -0
  27. datapipeline/config/metadata.py +0 -2
  28. datapipeline/config/project.py +0 -2
  29. datapipeline/config/resolution.py +10 -2
  30. datapipeline/config/tasks.py +9 -9
  31. datapipeline/domain/feature.py +3 -0
  32. datapipeline/domain/record.py +7 -7
  33. datapipeline/domain/sample.py +0 -2
  34. datapipeline/domain/vector.py +6 -8
  35. datapipeline/integrations/ml/adapter.py +0 -2
  36. datapipeline/integrations/ml/pandas_support.py +0 -2
  37. datapipeline/integrations/ml/rows.py +0 -2
  38. datapipeline/integrations/ml/torch_support.py +0 -2
  39. datapipeline/io/output.py +0 -2
  40. datapipeline/io/serializers.py +26 -16
  41. datapipeline/mappers/synthetic/time.py +9 -2
  42. datapipeline/pipeline/artifacts.py +3 -5
  43. datapipeline/pipeline/observability.py +0 -2
  44. datapipeline/pipeline/pipelines.py +118 -34
  45. datapipeline/pipeline/stages.py +54 -18
  46. datapipeline/pipeline/utils/spool_cache.py +142 -0
  47. datapipeline/pipeline/utils/transform_utils.py +27 -2
  48. datapipeline/services/artifacts.py +1 -4
  49. datapipeline/services/constants.py +1 -0
  50. datapipeline/services/factories.py +4 -6
  51. datapipeline/services/paths.py +10 -1
  52. datapipeline/services/project_paths.py +0 -2
  53. datapipeline/services/runs.py +0 -2
  54. datapipeline/services/scaffold/contract_yaml.py +76 -0
  55. datapipeline/services/scaffold/demo.py +141 -0
  56. datapipeline/services/scaffold/discovery.py +115 -0
  57. datapipeline/services/scaffold/domain.py +21 -13
  58. datapipeline/services/scaffold/dto.py +31 -0
  59. datapipeline/services/scaffold/filter.py +2 -1
  60. datapipeline/services/scaffold/layout.py +96 -0
  61. datapipeline/services/scaffold/loader.py +61 -0
  62. datapipeline/services/scaffold/mapper.py +116 -0
  63. datapipeline/services/scaffold/parser.py +56 -0
  64. datapipeline/services/scaffold/plugin.py +14 -2
  65. datapipeline/services/scaffold/source_yaml.py +91 -0
  66. datapipeline/services/scaffold/stream_plan.py +129 -0
  67. datapipeline/services/scaffold/utils.py +187 -0
  68. datapipeline/sources/data_loader.py +0 -2
  69. datapipeline/sources/decoders.py +49 -8
  70. datapipeline/sources/factory.py +9 -6
  71. datapipeline/sources/foreach.py +18 -3
  72. datapipeline/sources/synthetic/time/parser.py +1 -1
  73. datapipeline/sources/transports.py +10 -4
  74. datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
  75. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
  76. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
  77. datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
  78. datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
  79. datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
  80. datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
  81. datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
  82. datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
  83. datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
  84. datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
  85. datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
  86. datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
  87. datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
  88. datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
  89. datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
  90. datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
  91. datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
  92. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  93. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
  94. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
  95. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  96. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
  97. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
  98. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
  99. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  100. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
  101. datapipeline/templates/plugin_skeleton/README.md +57 -136
  102. datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
  103. datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
  104. datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
  105. datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
  106. datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
  108. datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
  109. datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
  110. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
  111. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
  112. datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
  113. datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
  114. datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
  115. datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
  116. datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
  117. datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
  118. datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
  119. datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
  120. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
  121. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  122. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
  123. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
  124. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  125. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
  126. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
  127. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +9 -11
  128. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
  129. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
  130. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
  131. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
  132. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
  133. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
  134. datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
  135. datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
  136. datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +15 -0
  137. datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
  138. datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
  139. datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
  140. datapipeline/templates/stubs/dto.py.j2 +2 -2
  141. datapipeline/templates/stubs/filter.py.j2 +1 -1
  142. datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
  143. datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
  144. datapipeline/templates/stubs/mappers/ingest.py.j2 +20 -0
  145. datapipeline/templates/stubs/parser.py.j2 +5 -1
  146. datapipeline/templates/stubs/record.py.j2 +1 -1
  147. datapipeline/templates/stubs/source.yaml.j2 +1 -1
  148. datapipeline/transforms/debug/identity.py +34 -16
  149. datapipeline/transforms/debug/lint.py +14 -11
  150. datapipeline/transforms/feature/scaler.py +5 -12
  151. datapipeline/transforms/filter.py +73 -17
  152. datapipeline/transforms/interfaces.py +58 -0
  153. datapipeline/transforms/record/floor_time.py +10 -7
  154. datapipeline/transforms/record/lag.py +8 -10
  155. datapipeline/transforms/sequence.py +2 -3
  156. datapipeline/transforms/stream/dedupe.py +5 -7
  157. datapipeline/transforms/stream/ensure_ticks.py +39 -24
  158. datapipeline/transforms/stream/fill.py +34 -25
  159. datapipeline/transforms/stream/filter.py +25 -0
  160. datapipeline/transforms/stream/floor_time.py +16 -0
  161. datapipeline/transforms/stream/granularity.py +52 -30
  162. datapipeline/transforms/stream/lag.py +17 -0
  163. datapipeline/transforms/stream/rolling.py +72 -0
  164. datapipeline/transforms/utils.py +42 -10
  165. datapipeline/transforms/vector/drop/horizontal.py +0 -3
  166. datapipeline/transforms/vector/drop/orchestrator.py +0 -3
  167. datapipeline/transforms/vector/drop/vertical.py +0 -2
  168. datapipeline/transforms/vector/ensure_schema.py +0 -2
  169. datapipeline/utils/paths.py +0 -2
  170. datapipeline/utils/placeholders.py +0 -2
  171. datapipeline/utils/rich_compat.py +0 -3
  172. datapipeline/utils/window.py +0 -2
  173. jerry_thomas-2.0.1.dist-info/METADATA +269 -0
  174. jerry_thomas-2.0.1.dist-info/RECORD +264 -0
  175. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/WHEEL +1 -1
  176. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/entry_points.txt +7 -3
  177. datapipeline/services/scaffold/mappers.py +0 -55
  178. datapipeline/services/scaffold/source.py +0 -191
  179. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
  180. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
  181. datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
  182. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
  183. datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
  184. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
  185. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
  186. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
  187. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
  188. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
  189. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
  190. datapipeline/templates/stubs/mapper.py.j2 +0 -22
  191. jerry_thomas-1.0.3.dist-info/METADATA +0 -827
  192. jerry_thomas-1.0.3.dist-info/RECORD +0 -198
  193. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/licenses/LICENSE +0 -0
  194. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/top_level.txt +0 -0

datapipeline/pipeline/pipelines.py
@@ -1,9 +1,9 @@
 import heapq
+from collections import defaultdict
 from collections.abc import Iterator, Sequence
 from typing import Any
 from itertools import tee
 
-from datapipeline.domain.sample import Sample
 from datapipeline.domain.vector import Vector
 from datapipeline.pipeline.utils.keygen import group_key_for
 from datapipeline.pipeline.utils.memory_sort import batch_sort
@@ -12,8 +12,9 @@ from datapipeline.pipeline.stages import (
     open_source_stream,
     build_record_stream,
     apply_record_operations,
+    order_record_stream,
     build_feature_stream,
-    regularize_feature_stream,
+    apply_stream_operations,
     apply_feature_transforms,
     vector_assemble_stage,
     sample_assemble_stage,
@@ -21,15 +22,61 @@
     window_keys,
 )
 from datapipeline.pipeline.context import PipelineContext
+from datapipeline.pipeline.utils.spool_cache import SpoolCache
 
 
-def build_feature_pipeline(
+def _time_then_id(item: Any):
+    rec = getattr(item, "record", None)
+    if rec is not None:
+        t = getattr(rec, "time", None)
+    else:
+        recs = getattr(item, "records", None)
+        t = getattr(recs[0], "time", None) if recs else None
+    return (t, getattr(item, "id", None))
+
+
+def _build_feature_from_records(
     context: PipelineContext,
+    records: Iterator[Any],
     cfg: FeatureRecordConfig,
     stage: int | None = None,
+    batch_size: int | None = None,
+    partition_by: str | None = None,
 ) -> Iterator[Any]:
     runtime = context.runtime
-    record_stream_id = cfg.record_stream
+
+    if partition_by is None:
+        partition_by = runtime.registries.partition_by.get(cfg.record_stream)
+
+    features = build_feature_stream(
+        records,
+        base_feature_id=cfg.id,
+        field=cfg.field,
+        partition_by=partition_by,
+    )
+    if stage == 5:
+        return features
+
+    transformed = apply_feature_transforms(
+        context, features, cfg.scale, cfg.sequence)
+    if stage == 6:
+        return transformed
+
+    if batch_size is None:
+        batch_size = runtime.registries.sort_batch_size.get(cfg.record_stream)
+    sorted_for_grouping = batch_sort(
+        transformed, batch_size=batch_size, key=_time_then_id
+    )
+    return sorted_for_grouping
+
+
+def build_record_pipeline(
+    context: PipelineContext,
+    record_stream_id: str,
+    stage: int | None = None,
+) -> Iterator[Any]:
+    """Build a canonical record stream through stream transforms."""
+    runtime = context.runtime
 
     dtos = open_source_stream(context, record_stream_id)
     if stage == 0:
@@ -43,35 +90,41 @@ def build_feature_pipeline(
     if stage == 2:
         return records
 
-    partition_by = runtime.registries.partition_by.get(record_stream_id)
-    features = build_feature_stream(records, cfg.id, partition_by)
+    batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
+    records = order_record_stream(
+        context, records, record_stream_id, batch_size)
     if stage == 3:
-        return features
+        return records
 
-    batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
-    regularized = regularize_feature_stream(
-        context, features, record_stream_id, batch_size)
+    records = apply_stream_operations(context, records, record_stream_id)
     if stage == 4:
-        return regularized
+        return records
 
-    transformed = apply_feature_transforms(
-        context, regularized, cfg.scale, cfg.sequence)
-    if stage == 5:
-        return transformed
+    return records
 
-    def _time_then_id(item: Any):
-        rec = getattr(item, "record", None)
-        if rec is not None:
-            t = getattr(rec, "time", None)
-        else:
-            recs = getattr(item, "records", None)
-            t = getattr(recs[0], "time", None) if recs else None
-        return (t, getattr(item, "id", None))
 
-    sorted_for_grouping = batch_sort(
-        transformed, batch_size=batch_size, key=_time_then_id
+def build_feature_pipeline(
+    context: PipelineContext,
+    cfg: FeatureRecordConfig,
+    stage: int | None = None,
+) -> Iterator[Any]:
+    runtime = context.runtime
+    record_stream_id = cfg.record_stream
+
+    records = build_record_pipeline(context, record_stream_id, stage=stage)
+    if stage is not None and stage <= 4:
+        return records
+
+    batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
+    partition_by = runtime.registries.partition_by.get(record_stream_id)
+    return _build_feature_from_records(
+        context,
+        records,
+        cfg,
+        stage=stage,
+        batch_size=batch_size,
+        partition_by=partition_by,
     )
-    return sorted_for_grouping
 
 
 def build_vector_pipeline(
@@ -130,14 +183,45 @@ def _assemble_vectors(
 ) -> Iterator[tuple[tuple, Vector]]:
     if not configs:
         return iter(())
-    streams = [
-        build_feature_pipeline(
-            context,
-            cfg,
-        )
-        for cfg in configs
-    ]
+
+    runtime = context.runtime
+    grouped: dict[str, list[FeatureRecordConfig]] = defaultdict(list)
+    for cfg in configs:
+        grouped[cfg.record_stream].append(cfg)
+
+    streams: list[Iterator[Any]] = []
+    caches: list[SpoolCache] = []
+    for record_stream_id, cfgs in grouped.items():
+        records = build_record_pipeline(context, record_stream_id, stage=4)
+        if len(cfgs) == 1:
+            record_iters = (records,)
+        else:
+            cache = SpoolCache(records, name=record_stream_id)
+            caches.append(cache)
+            record_iters = tuple(cache.reader() for _ in cfgs)
+        batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
+        partition_by = runtime.registries.partition_by.get(record_stream_id)
+
+        for cfg, rec_iter in zip(cfgs, record_iters):
+            streams.append(
+                _build_feature_from_records(
+                    context,
+                    rec_iter,
+                    cfg,
+                    batch_size=batch_size,
+                    partition_by=partition_by,
+                )
+            )
+
     merged = heapq.merge(
         *streams, key=lambda fr: group_key_for(fr, group_by_cadence)
    )
-    return vector_assemble_stage(merged, group_by_cadence)
+
+    def _with_cleanup() -> Iterator[tuple[tuple, Vector]]:
+        try:
+            yield from vector_assemble_stage(merged, group_by_cadence)
+        finally:
+            for cache in caches:
+                cache.close()
+
+    return _with_cleanup()
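
For orientation: the rewritten `_assemble_vectors` above groups feature configs by record stream, builds each record pipeline once (spooling it through `SpoolCache` when several features share it), and then merges the per-feature streams with `heapq.merge` before grouping them into vectors. The stand-alone sketch below illustrates only that merge-then-group idea with plain tuples; the sample data and the `group_key` helper are hypothetical stand-ins, not package API.

import heapq
from itertools import groupby

# Two per-feature streams, already sorted by time (as batch_sort guarantees upstream).
# Tuples of (time, feature_id, value) stand in for FeatureRecord objects.
feature_a = [(1, "a", 10.0), (2, "a", 11.0), (3, "a", 12.0)]
feature_b = [(1, "b", 0.5), (3, "b", 0.7)]

def group_key(item):
    # Group by time only, mirroring group_key_for(fr, group_by_cadence).
    return item[0]

merged = heapq.merge(feature_a, feature_b, key=group_key)
for key, group in groupby(merged, key=group_key):
    vector = {feature_id: value for _, feature_id, value in group}
    print(key, vector)
# 1 {'a': 10.0, 'b': 0.5}
# 2 {'a': 11.0}
# 3 {'a': 12.0, 'b': 0.7}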

datapipeline/pipeline/stages.py
@@ -20,11 +20,23 @@ from datapipeline.sources.models.source import Source
 from datapipeline.transforms.vector import VectorEnsureSchemaTransform
 from datapipeline.config.dataset.normalize import floor_time_to_bucket
 from datapipeline.utils.time import parse_timecode
+from datapipeline.transforms.utils import get_field, partition_key
 
 
 def open_source_stream(context: PipelineContext, stream_alias: str) -> Source:
     runtime = context.runtime
-    return runtime.registries.stream_sources.get(stream_alias).stream()
+    registry = runtime.registries.stream_sources
+    try:
+        source = registry.get(stream_alias)
+    except KeyError as exc:
+        available = sorted(registry.keys())
+        available_text = ", ".join(available) if available else "(none)"
+        raise KeyError(
+            "Unknown record_stream "
+            f"'{stream_alias}'. Check dataset.yaml and contracts/ ids. "
+            f"Available streams: {available_text}"
+        ) from exc
+    return source.stream()
 
 
 def build_record_stream(
@@ -49,45 +61,66 @@ def apply_record_operations(
     return records
 
 
+def _record_has_field(record: Any, field: str) -> bool:
+    if isinstance(record, dict):
+        return field in record
+    return hasattr(record, field)
+
+
 def build_feature_stream(
     record_stream: Iterable[TemporalRecord],
     base_feature_id: str,
+    field: str,
     partition_by: Any | None = None,
 ) -> Iterator[FeatureRecord]:
-
     keygen = FeatureIdGenerator(partition_by)
 
     for rec in record_stream:
+        if not _record_has_field(rec, field):
+            raise KeyError(
+                f"Record field '{field}' not found on {type(rec).__name__}")
         yield FeatureRecord(
             record=rec,
             id=keygen.generate(base_feature_id, rec),
+            value=get_field(rec, field),
        )
 
 
-def regularize_feature_stream(
+def order_record_stream(
     context: PipelineContext,
-    feature_stream: Iterable[FeatureRecord],
+    record_stream: Iterable[TemporalRecord],
     stream_id: str,
     batch_size: int,
-) -> Iterator[FeatureRecord]:
-    """Apply feature transforms defined in contract policies in order."""
-    # Sort by (id, time) to satisfy stream transforms (ensure_cadence/fill)
-    sorted = batch_sort(
-        feature_stream,
+) -> Iterator[TemporalRecord]:
+    """Return records sorted by (partition_key, time)."""
+    partition_by = context.runtime.registries.partition_by.get(stream_id)
+    return batch_sort(
+        record_stream,
         batch_size=batch_size,
-        key=lambda fr: (fr.id, fr.record.time),
+        key=lambda rec: (partition_key(rec, partition_by), rec.time),
     )
+
+
+def apply_stream_operations(
+    context: PipelineContext,
+    record_stream: Iterable[TemporalRecord],
+    stream_id: str,
+) -> Iterator[TemporalRecord]:
+    """Apply stream/debug transforms (expects input sorted by partition_key + time)."""
+    partition_by = context.runtime.registries.partition_by.get(stream_id)
     transformed = apply_transforms(
-        sorted,
+        record_stream,
         STREAM_TRANFORMS_EP,
         context.runtime.registries.stream_operations.get(stream_id),
         context,
+        extra_kwargs={"partition_by": partition_by},
     )
     transformed = apply_transforms(
         transformed,
         DEBUG_TRANSFORMS_EP,
         context.runtime.registries.debug_operations.get(stream_id),
         context,
+        extra_kwargs={"partition_by": partition_by},
     )
     return transformed
 
@@ -135,10 +168,9 @@ def vector_assemble_stage(
         feature_map = defaultdict(list)
         for fr in group:
             if isinstance(fr, FeatureRecordSequence):
-                records = fr.records
+                feature_map[fr.id].extend(fr.values)
             else:
-                records = [fr.record]
-            feature_map[fr.id].extend(records)
+                feature_map[fr.id].append(fr.value)
         vector = vectorize_record_group(feature_map)
         yield group_key, vector
 
@@ -242,16 +274,19 @@ def _apply_vector_schema(
 
     if not feature_entries:
         if context.schema_required:
-            raise RuntimeError("Schema missing for payload 'features'. Run `jerry build` to materialize schema.json.")
+            raise RuntimeError(
+                "Schema missing for payload 'features'. Run `jerry build` to materialize schema.json.")
         feature_stream = stream
     else:
-        feature_schema = VectorEnsureSchemaTransform(on_missing="fill", on_extra="drop")
+        feature_schema = VectorEnsureSchemaTransform(
+            on_missing="fill", on_extra="drop")
         feature_schema.bind_context(context)
         feature_stream = feature_schema(stream)
 
     def _apply_targets(upstream: Iterator[Sample]) -> Iterator[Sample]:
         if target_entries:
-            target_schema = VectorEnsureSchemaTransform(payload="targets", on_missing="fill", on_extra="drop")
+            target_schema = VectorEnsureSchemaTransform(
+                payload="targets", on_missing="fill", on_extra="drop")
             target_schema.bind_context(context)
             return target_schema(upstream)
         if not context.schema_required:
@@ -264,6 +299,7 @@ def _apply_vector_schema(
            return iter(())
        if first.targets is None:
            return chain([first], iterator)
-        raise RuntimeError("Schema missing for payload 'targets'. Run `jerry build` to materialize schema.json.")
+        raise RuntimeError(
+            "Schema missing for payload 'targets'. Run `jerry build` to materialize schema.json.")
 
     return _apply_targets(feature_stream)
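
The new `order_record_stream` sorts records by `(partition_key, time)` before `apply_stream_operations` runs, so per-partition stream transforms (dedupe, fill, ensure_ticks) see each partition contiguously and in time order. A minimal illustration of that sort key, with plain dicts standing in for `TemporalRecord` (the field names are hypothetical):

records = [
    {"symbol": "MSFT", "time": 2, "close": 310.0},
    {"symbol": "AAPL", "time": 1, "close": 190.0},
    {"symbol": "MSFT", "time": 1, "close": 305.0},
    {"symbol": "AAPL", "time": 2, "close": 191.0},
]

# Mirrors key=lambda rec: (partition_key(rec, partition_by), rec.time) above.
ordered = sorted(records, key=lambda rec: (rec["symbol"], rec["time"]))
for rec in ordered:
    print(rec["symbol"], rec["time"], rec["close"])
# AAPL 1 190.0
# AAPL 2 191.0
# MSFT 1 305.0
# MSFT 2 310.0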

datapipeline/pipeline/utils/spool_cache.py (new file)
@@ -0,0 +1,142 @@
+import pickle
+import tempfile
+import weakref
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterator, Any
+
+
+_LEN_BYTES = 8
+
+
+def _encode_len(size: int) -> bytes:
+    return int(size).to_bytes(_LEN_BYTES, "little", signed=False)
+
+
+def _decode_len(raw: bytes) -> int:
+    return int.from_bytes(raw, "little", signed=False)
+
+
+@dataclass
+class _SpoolState:
+    writer: Any
+    path: Path
+    offsets: list[int]
+    source: Iterator[Any]
+    done: bool = False
+
+    def close(self) -> None:
+        try:
+            self.writer.close()
+        except Exception:
+            pass
+
+
+class SpoolCache:
+    """Disk-backed cache for iterators with multiple sequential readers."""
+
+    def __init__(self, source: Iterator[Any], *, name: str | None = None) -> None:
+        tmp = tempfile.NamedTemporaryFile(
+            prefix=f"dp-spool-{name or 'stream'}-",
+            suffix=".pkl",
+            delete=False,
+        )
+        path = Path(tmp.name)
+        self._state = _SpoolState(
+            writer=tmp,
+            path=path,
+            offsets=[],
+            source=iter(source),
+        )
+        self._finalizer = weakref.finalize(self, _cleanup, path, tmp)
+
+    @property
+    def path(self) -> Path:
+        return self._state.path
+
+    def close(self) -> None:
+        """Close writer and remove the spool file."""
+        if self._finalizer.alive:
+            self._finalizer()
+
+    def __enter__(self) -> "SpoolCache":
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        self.close()
+
+    def reader(self) -> Iterator[Any]:
+        return _SpoolReader(self)
+
+    def _append_next(self) -> bool:
+        if self._state.done:
+            return False
+        try:
+            item = next(self._state.source)
+        except StopIteration:
+            self._state.done = True
+            self._state.writer.flush()
+            return False
+        try:
+            data = pickle.dumps(item, protocol=pickle.HIGHEST_PROTOCOL)
+        except Exception as exc:  # pragma: no cover - defensive
+            raise TypeError(
+                "SpoolCache requires picklable records for multi-feature fanout."
+            ) from exc
+        offset = self._state.writer.tell()
+        self._state.writer.write(_encode_len(len(data)))
+        self._state.writer.write(data)
+        self._state.writer.flush()
+        self._state.offsets.append(offset)
+        return True
+
+    def _ensure_index(self, index: int) -> None:
+        while len(self._state.offsets) <= index:
+            if not self._append_next():
+                break
+
+
+class _SpoolReader:
+    def __init__(self, cache: SpoolCache) -> None:
+        self._cache = cache
+        self._index = 0
+        self._fh = open(cache.path, "rb")
+
+    def __iter__(self) -> "_SpoolReader":
+        return self
+
+    def __next__(self) -> Any:
+        self._cache._ensure_index(self._index)
+        if self._index >= len(self._cache._state.offsets):
+            self._close()
+            raise StopIteration
+        offset = self._cache._state.offsets[self._index]
+        self._index += 1
+        self._fh.seek(offset)
+        raw = self._fh.read(_LEN_BYTES)
+        if not raw:
+            self._close()
+            raise StopIteration
+        size = _decode_len(raw)
+        payload = self._fh.read(size)
+        return pickle.loads(payload)
+
+    def _close(self) -> None:
+        try:
+            self._fh.close()
+        except Exception:
+            pass
+
+    def __del__(self) -> None:
+        self._close()
+
+
+def _cleanup(path: Path, writer: Any) -> None:
+    try:
+        writer.close()
+    except Exception:
+        pass
+    try:
+        path.unlink(missing_ok=True)
+    except Exception:
+        pass
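
A usage sketch for the new `SpoolCache`, assuming jerry-thomas 2.0.1 is installed and the module path shown above: a single upstream iterator is spooled to a temporary file so several readers can replay it sequentially.

from datapipeline.pipeline.utils.spool_cache import SpoolCache

upstream = iter(range(5))      # any iterator of picklable items
with SpoolCache(upstream, name="demo") as cache:
    first = cache.reader()
    second = cache.reader()
    print(list(first))         # [0, 1, 2, 3, 4] -- pulled from upstream and spooled to disk
    print(list(second))        # [0, 1, 2, 3, 4] -- replayed from the spool file
# Leaving the `with` block closes the writer and removes the temporary file.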

datapipeline/pipeline/utils/transform_utils.py
@@ -41,17 +41,35 @@ def _split_params(params: Any) -> Tuple[Tuple[Any, ...], dict[str, Any]]:
     return (params,), {}
 
 
+def _merge_extra_kwargs(
+    fn: Callable[..., Any],
+    kwargs: dict[str, Any],
+    extra_kwargs: Mapping[str, Any] | None,
+) -> dict[str, Any]:
+    if not extra_kwargs:
+        return kwargs
+    merged = dict(kwargs)
+    for key, value in extra_kwargs.items():
+        if key in merged:
+            continue
+        if _supports_parameter(fn, key):
+            merged[key] = value
+    return merged
+
+
 def _call_with_params(
     fn: Callable,
     stream: Iterator[Any],
     params: Any,
     context: Optional[PipelineContext],
+    extra_kwargs: Mapping[str, Any] | None = None,
 ) -> Iterator[Any]:
     """Invoke an entry-point callable with optional params semantics."""
 
     args, kwargs = _split_params(params)
     if context and _supports_parameter(fn, "context") and "context" not in kwargs:
         kwargs["context"] = context
+    kwargs = _merge_extra_kwargs(fn, kwargs, extra_kwargs)
     return fn(stream, *args, **kwargs)
 
 
@@ -59,12 +77,14 @@ def _instantiate_entry_point(
     cls: Callable[..., Any],
     params: Any,
     context: Optional[PipelineContext],
+    extra_kwargs: Mapping[str, Any] | None = None,
 ) -> Any:
     """Instantiate a transform class with parameters from the config."""
 
     args, kwargs = _split_params(params)
     if context and _supports_parameter(cls.__init__, "context") and "context" not in kwargs:
         kwargs["context"] = context
+    kwargs = _merge_extra_kwargs(cls.__init__, kwargs, extra_kwargs)
     return cls(*args, **kwargs)
 
 
@@ -83,6 +103,7 @@ def apply_transforms(
     context: Optional[PipelineContext] = None,
     observer: Callable[[TransformEvent], None] | None = None,
     observer_registry: ObserverRegistry | None = None,
+    extra_kwargs: Mapping[str, Any] | None = None,
 ) -> Iterator[Any]:
     """Instantiate and apply configured transforms in order."""
 
@@ -97,7 +118,9 @@
         name, params = _extract_single_pair(transform, "Transform")
         ep = load_ep(group=group, name=name)
         if isclass(ep):
-            inst = _instantiate_entry_point(ep, params, context)
+            inst = _instantiate_entry_point(
+                ep, params, context, extra_kwargs=extra_kwargs
+            )
             _bind_context(inst, context)
             eff_observer = observer
             if eff_observer is None and registry:
@@ -107,7 +130,9 @@
             _attach_observer(inst, eff_observer)
             stream = inst(stream)
         else:
-            stream = _call_with_params(ep, stream, params, context)
+            stream = _call_with_params(
+                ep, stream, params, context, extra_kwargs=extra_kwargs
+            )
     return stream
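
The `extra_kwargs` plumbing above injects values such as `partition_by` only when the transform actually accepts that parameter and the configuration has not already set it. A stand-alone sketch of that rule, checking the signature with `inspect` rather than the project's internal `_supports_parameter` helper; the `fill` and `dedupe` functions are hypothetical examples.

import inspect
from collections.abc import Mapping
from typing import Any, Callable

def merge_extra_kwargs(fn: Callable[..., Any],
                       kwargs: dict[str, Any],
                       extra_kwargs: Mapping[str, Any] | None) -> dict[str, Any]:
    if not extra_kwargs:
        return kwargs
    accepted = inspect.signature(fn).parameters
    merged = dict(kwargs)
    for key, value in extra_kwargs.items():
        # Never override an explicitly configured value; only add what fn accepts.
        if key not in merged and key in accepted:
            merged[key] = value
    return merged

def fill(stream, value=0.0, partition_by=None): ...
def dedupe(stream): ...

print(merge_extra_kwargs(fill, {"value": 1.0}, {"partition_by": "symbol"}))
# {'value': 1.0, 'partition_by': 'symbol'}
print(merge_extra_kwargs(dedupe, {}, {"partition_by": "symbol"}))
# {} -- dedupe does not accept partition_by, so nothing is injected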
@@ -1,5 +1,3 @@
1
- from __future__ import annotations
2
-
3
1
  from dataclasses import dataclass
4
2
  import json
5
3
  from pathlib import Path
@@ -80,8 +78,7 @@ class ArtifactManager:
80
78
  except FileNotFoundError as exc:
81
79
  message = (
82
80
  f"Artifact file not found: {path}. "
83
- "Run `jerry build --project <project.yaml>` (preferred) or "
84
- "`jerry inspect expected --project <project.yaml>` to regenerate it."
81
+ "Run `jerry build --project <project.yaml>` to regenerate it."
85
82
  )
86
83
  raise RuntimeError(message) from exc
87
84
 

datapipeline/services/constants.py
@@ -12,6 +12,7 @@ LOADERS_GROUP = "loaders"
 MAPPERS_GROUP = "mappers"
 FILTERS_GROUP = "filters"
 DEFAULT_IO_LOADER_EP = "core.io"
+DEFAULT_SYNTHETIC_LOADER_EP = "core.synthetic.ticks"
 
 # POSTPROCESS_GLOBAL_KEY = "__global__"
 POSTPROCESS_TRANSFORMS = "transforms"

datapipeline/services/factories.py
@@ -6,8 +6,7 @@ from datapipeline.mappers.noop import identity
 from datapipeline.utils.placeholders import normalize_args
 from datapipeline.sources.models.base import SourceInterface
 from datapipeline.pipeline.context import PipelineContext
-from datapipeline.config.dataset.feature import FeatureRecordConfig
-from datapipeline.pipeline.pipelines import build_feature_pipeline
+from datapipeline.pipeline.pipelines import build_record_pipeline
 from datapipeline.pipeline.utils.transform_utils import _supports_parameter
 from inspect import isclass
 from typing import Iterator, Any, Optional
@@ -52,7 +51,7 @@ class _ComposedSource(SourceInterface):
 
         # Build aligned/aux iterators (unwrap FeatureRecord -> record for aligned)
         aligned_iters: dict[str, Iterator[Any]] = {
-            k: (fr.record for fr in v["iter"])  # stage>=3 yields FeatureRecord
+            k: (getattr(item, "record", item) for item in v["iter"])
            for k, v in aligned.items()
         }
         aux_iters: dict[str, Iterator[Any]] = {
@@ -111,7 +110,7 @@ class _ComposedSource(SourceInterface):
         """Parse and resolve composed inputs into iterators.
 
         Grammar: "[alias=]stream_id" only. All inputs are built to stage 4
-        and are alignable (FeatureRecord -> domain record unwrapped).
+        and are alignable (domain records with stream transforms applied).
         """
         runtime = context.runtime
         known_streams = set(runtime.registries.stream_sources.keys())
@@ -123,8 +122,7 @@ class _ComposedSource(SourceInterface):
                 raise ValueError(
                     f"Unknown input stream '{ref}'. Known streams: {sorted(known_streams)}"
                 )
-            cfg = FeatureRecordConfig(record_stream=ref, id=alias)
-            it = build_feature_pipeline(context, cfg, stage=4)
+            it = build_record_pipeline(context, ref, stage=4)
             out[alias] = {"iter": it, "aligned": True}
 
         return out

datapipeline/services/paths.py
@@ -8,7 +8,16 @@ def pkg_root(start: Optional[Path] = None) -> tuple[Path, str, Path]:
     for d in [here, *here.parents]:
         pyproject = d / "pyproject.toml"
         if pyproject.exists():
-            return d, d.name, pyproject
+            pkg_name = d.name
+            src_dir = d / "src"
+            if src_dir.exists():
+                candidates = [
+                    p for p in src_dir.iterdir()
+                    if p.is_dir() and (p / "__init__.py").exists()
+                ]
+                if len(candidates) == 1:
+                    pkg_name = candidates[0].name
+            return d, pkg_name, pyproject
     print("[error] pyproject.toml not found (searched current and parent dirs)", file=sys.stderr)
     raise SystemExit(1)
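
`pkg_root` now prefers the single importable package under `src/` over the project directory name (e.g. when the directory name and the package name differ). A stand-alone illustration of that detection on a throwaway src-layout project; the directory names are made up.

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp) / "my-plugin"               # project dir name with a dash
    (root / "src" / "my_plugin").mkdir(parents=True)
    (root / "pyproject.toml").touch()
    (root / "src" / "my_plugin" / "__init__.py").touch()

    pkg_name = root.name
    src_dir = root / "src"
    if src_dir.exists():
        candidates = [
            p for p in src_dir.iterdir()
            if p.is_dir() and (p / "__init__.py").exists()
        ]
        if len(candidates) == 1:
            pkg_name = candidates[0].name

    print(pkg_name)                              # my_plugin, not my-plugin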
@@ -1,5 +1,3 @@
1
- from __future__ import annotations
2
-
3
1
  from pathlib import Path
4
2
  from typing import Optional
5
3
 
@@ -1,5 +1,3 @@
1
- from __future__ import annotations
2
-
3
1
  from dataclasses import dataclass, asdict
4
2
  from datetime import datetime, timezone
5
3
  from pathlib import Path