jerry-thomas 1.0.3__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +0 -1
- datapipeline/build/tasks/config.py +0 -2
- datapipeline/build/tasks/metadata.py +0 -2
- datapipeline/build/tasks/scaler.py +0 -2
- datapipeline/build/tasks/schema.py +0 -2
- datapipeline/build/tasks/utils.py +0 -2
- datapipeline/cli/app.py +201 -81
- datapipeline/cli/commands/contract.py +145 -283
- datapipeline/cli/commands/demo.py +13 -0
- datapipeline/cli/commands/domain.py +4 -4
- datapipeline/cli/commands/dto.py +11 -0
- datapipeline/cli/commands/filter.py +2 -2
- datapipeline/cli/commands/inspect.py +0 -68
- datapipeline/cli/commands/list_.py +30 -13
- datapipeline/cli/commands/loader.py +11 -0
- datapipeline/cli/commands/mapper.py +82 -0
- datapipeline/cli/commands/parser.py +45 -0
- datapipeline/cli/commands/run_config.py +1 -3
- datapipeline/cli/commands/serve_pipeline.py +5 -7
- datapipeline/cli/commands/source.py +106 -18
- datapipeline/cli/commands/stream.py +286 -0
- datapipeline/cli/visuals/common.py +0 -2
- datapipeline/cli/visuals/sections.py +0 -2
- datapipeline/cli/workspace_utils.py +0 -3
- datapipeline/config/context.py +0 -2
- datapipeline/config/dataset/feature.py +1 -0
- datapipeline/config/metadata.py +0 -2
- datapipeline/config/project.py +0 -2
- datapipeline/config/resolution.py +10 -2
- datapipeline/config/tasks.py +9 -9
- datapipeline/domain/feature.py +3 -0
- datapipeline/domain/record.py +7 -7
- datapipeline/domain/sample.py +0 -2
- datapipeline/domain/vector.py +6 -8
- datapipeline/integrations/ml/adapter.py +0 -2
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +0 -2
- datapipeline/integrations/ml/torch_support.py +0 -2
- datapipeline/io/output.py +0 -2
- datapipeline/io/serializers.py +26 -16
- datapipeline/mappers/synthetic/time.py +9 -2
- datapipeline/pipeline/artifacts.py +3 -5
- datapipeline/pipeline/observability.py +0 -2
- datapipeline/pipeline/pipelines.py +118 -34
- datapipeline/pipeline/stages.py +42 -17
- datapipeline/pipeline/utils/spool_cache.py +142 -0
- datapipeline/pipeline/utils/transform_utils.py +27 -2
- datapipeline/services/artifacts.py +1 -4
- datapipeline/services/constants.py +1 -0
- datapipeline/services/factories.py +4 -6
- datapipeline/services/project_paths.py +0 -2
- datapipeline/services/runs.py +0 -2
- datapipeline/services/scaffold/contract_yaml.py +76 -0
- datapipeline/services/scaffold/demo.py +141 -0
- datapipeline/services/scaffold/discovery.py +115 -0
- datapipeline/services/scaffold/domain.py +21 -13
- datapipeline/services/scaffold/dto.py +31 -0
- datapipeline/services/scaffold/filter.py +2 -1
- datapipeline/services/scaffold/layout.py +96 -0
- datapipeline/services/scaffold/loader.py +61 -0
- datapipeline/services/scaffold/mapper.py +116 -0
- datapipeline/services/scaffold/parser.py +56 -0
- datapipeline/services/scaffold/plugin.py +14 -2
- datapipeline/services/scaffold/source_yaml.py +91 -0
- datapipeline/services/scaffold/stream_plan.py +110 -0
- datapipeline/services/scaffold/utils.py +187 -0
- datapipeline/sources/data_loader.py +0 -2
- datapipeline/sources/decoders.py +49 -8
- datapipeline/sources/factory.py +9 -6
- datapipeline/sources/foreach.py +18 -3
- datapipeline/sources/synthetic/time/parser.py +1 -1
- datapipeline/sources/transports.py +10 -4
- datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
- datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
- datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
- datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
- datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
- datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
- datapipeline/templates/plugin_skeleton/README.md +57 -136
- datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
- datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
- datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
- datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +7 -10
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +14 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
- datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
- datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
- datapipeline/templates/stubs/dto.py.j2 +1 -1
- datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
- datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
- datapipeline/templates/stubs/mappers/ingest.py.j2 +17 -0
- datapipeline/templates/stubs/parser.py.j2 +4 -0
- datapipeline/templates/stubs/record.py.j2 +0 -1
- datapipeline/templates/stubs/source.yaml.j2 +1 -1
- datapipeline/transforms/debug/identity.py +34 -16
- datapipeline/transforms/debug/lint.py +14 -11
- datapipeline/transforms/feature/scaler.py +5 -12
- datapipeline/transforms/filter.py +73 -17
- datapipeline/transforms/interfaces.py +58 -0
- datapipeline/transforms/record/floor_time.py +10 -7
- datapipeline/transforms/record/lag.py +8 -10
- datapipeline/transforms/sequence.py +2 -3
- datapipeline/transforms/stream/dedupe.py +5 -7
- datapipeline/transforms/stream/ensure_ticks.py +39 -24
- datapipeline/transforms/stream/fill.py +34 -25
- datapipeline/transforms/stream/filter.py +25 -0
- datapipeline/transforms/stream/floor_time.py +16 -0
- datapipeline/transforms/stream/granularity.py +52 -30
- datapipeline/transforms/stream/lag.py +17 -0
- datapipeline/transforms/stream/rolling.py +72 -0
- datapipeline/transforms/utils.py +42 -10
- datapipeline/transforms/vector/drop/horizontal.py +0 -3
- datapipeline/transforms/vector/drop/orchestrator.py +0 -3
- datapipeline/transforms/vector/drop/vertical.py +0 -2
- datapipeline/transforms/vector/ensure_schema.py +0 -2
- datapipeline/utils/paths.py +0 -2
- datapipeline/utils/placeholders.py +0 -2
- datapipeline/utils/rich_compat.py +0 -3
- datapipeline/utils/window.py +0 -2
- jerry_thomas-2.0.0.dist-info/METADATA +282 -0
- jerry_thomas-2.0.0.dist-info/RECORD +264 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/WHEEL +1 -1
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/entry_points.txt +7 -3
- datapipeline/services/scaffold/mappers.py +0 -55
- datapipeline/services/scaffold/source.py +0 -191
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
- datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
- datapipeline/templates/stubs/mapper.py.j2 +0 -22
- jerry_thomas-1.0.3.dist-info/METADATA +0 -827
- jerry_thomas-1.0.3.dist-info/RECORD +0 -198
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/top_level.txt +0 -0
datapipeline/pipeline/pipelines.py
CHANGED

@@ -1,9 +1,9 @@
 import heapq
+from collections import defaultdict
 from collections.abc import Iterator, Sequence
 from typing import Any
 from itertools import tee
 
-from datapipeline.domain.sample import Sample
 from datapipeline.domain.vector import Vector
 from datapipeline.pipeline.utils.keygen import group_key_for
 from datapipeline.pipeline.utils.memory_sort import batch_sort
@@ -12,8 +12,9 @@ from datapipeline.pipeline.stages import (
     open_source_stream,
     build_record_stream,
     apply_record_operations,
+    order_record_stream,
     build_feature_stream,
-
+    apply_stream_operations,
     apply_feature_transforms,
     vector_assemble_stage,
     sample_assemble_stage,
@@ -21,15 +22,61 @@ from datapipeline.pipeline.stages import (
     window_keys,
 )
 from datapipeline.pipeline.context import PipelineContext
+from datapipeline.pipeline.utils.spool_cache import SpoolCache
 
 
-def
+def _time_then_id(item: Any):
+    rec = getattr(item, "record", None)
+    if rec is not None:
+        t = getattr(rec, "time", None)
+    else:
+        recs = getattr(item, "records", None)
+        t = getattr(recs[0], "time", None) if recs else None
+    return (t, getattr(item, "id", None))
+
+
+def _build_feature_from_records(
     context: PipelineContext,
+    records: Iterator[Any],
     cfg: FeatureRecordConfig,
     stage: int | None = None,
+    batch_size: int | None = None,
+    partition_by: str | None = None,
 ) -> Iterator[Any]:
     runtime = context.runtime
-
+
+    if partition_by is None:
+        partition_by = runtime.registries.partition_by.get(cfg.record_stream)
+
+    features = build_feature_stream(
+        records,
+        base_feature_id=cfg.id,
+        field=cfg.field,
+        partition_by=partition_by,
+    )
+    if stage == 5:
+        return features
+
+    transformed = apply_feature_transforms(
+        context, features, cfg.scale, cfg.sequence)
+    if stage == 6:
+        return transformed
+
+    if batch_size is None:
+        batch_size = runtime.registries.sort_batch_size.get(cfg.record_stream)
+    sorted_for_grouping = batch_sort(
+        transformed, batch_size=batch_size, key=_time_then_id
+    )
+    return sorted_for_grouping
+
+
+def build_record_pipeline(
+    context: PipelineContext,
+    record_stream_id: str,
+    stage: int | None = None,
+) -> Iterator[Any]:
+    """Build a canonical record stream through stream transforms."""
+    runtime = context.runtime
 
     dtos = open_source_stream(context, record_stream_id)
     if stage == 0:
@@ -43,35 +90,41 @@ def build_feature_pipeline(
     if stage == 2:
         return records
 
-
-
+    batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
+    records = order_record_stream(
+        context, records, record_stream_id, batch_size)
     if stage == 3:
-        return
+        return records
 
-
-    regularized = regularize_feature_stream(
-        context, features, record_stream_id, batch_size)
+    records = apply_stream_operations(context, records, record_stream_id)
     if stage == 4:
-        return
+        return records
 
-
-        context, regularized, cfg.scale, cfg.sequence)
-    if stage == 5:
-        return transformed
+    return records
 
-def _time_then_id(item: Any):
-    rec = getattr(item, "record", None)
-    if rec is not None:
-        t = getattr(rec, "time", None)
-    else:
-        recs = getattr(item, "records", None)
-        t = getattr(recs[0], "time", None) if recs else None
-    return (t, getattr(item, "id", None))
 
-
-
+def build_feature_pipeline(
+    context: PipelineContext,
+    cfg: FeatureRecordConfig,
+    stage: int | None = None,
+) -> Iterator[Any]:
+    runtime = context.runtime
+    record_stream_id = cfg.record_stream
+
+    records = build_record_pipeline(context, record_stream_id, stage=stage)
+    if stage is not None and stage <= 4:
+        return records
+
+    batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
+    partition_by = runtime.registries.partition_by.get(record_stream_id)
+    return _build_feature_from_records(
+        context,
+        records,
+        cfg,
+        stage=stage,
+        batch_size=batch_size,
+        partition_by=partition_by,
     )
-    return sorted_for_grouping
 
 
 def build_vector_pipeline(
@@ -130,14 +183,45 @@ def _assemble_vectors(
 ) -> Iterator[tuple[tuple, Vector]]:
     if not configs:
         return iter(())
-
-
-
-
-    )
-
-    ]
+
+    runtime = context.runtime
+    grouped: dict[str, list[FeatureRecordConfig]] = defaultdict(list)
+    for cfg in configs:
+        grouped[cfg.record_stream].append(cfg)
+
+    streams: list[Iterator[Any]] = []
+    caches: list[SpoolCache] = []
+    for record_stream_id, cfgs in grouped.items():
+        records = build_record_pipeline(context, record_stream_id, stage=4)
+        if len(cfgs) == 1:
+            record_iters = (records,)
+        else:
+            cache = SpoolCache(records, name=record_stream_id)
+            caches.append(cache)
+            record_iters = tuple(cache.reader() for _ in cfgs)
+        batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
+        partition_by = runtime.registries.partition_by.get(record_stream_id)
+
+        for cfg, rec_iter in zip(cfgs, record_iters):
+            streams.append(
+                _build_feature_from_records(
+                    context,
+                    rec_iter,
+                    cfg,
+                    batch_size=batch_size,
+                    partition_by=partition_by,
+                )
+            )
+
     merged = heapq.merge(
         *streams, key=lambda fr: group_key_for(fr, group_by_cadence)
     )
-
+
+    def _with_cleanup() -> Iterator[tuple[tuple, Vector]]:
+        try:
+            yield from vector_assemble_stage(merged, group_by_cadence)
+        finally:
+            for cache in caches:
+                cache.close()
+
+    return _with_cleanup()
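Note: the reworked `_assemble_vectors` above merges one already-sorted stream per feature config with `heapq.merge`, keyed by the group key, so vectors can be assembled lazily without materializing any stream. A minimal stand-alone sketch of that merge pattern; the tuples and key function here are illustrative stand-ins, not the package's actual `FeatureRecord` or `group_key_for`:

import heapq
from datetime import datetime, timedelta

# Two per-feature streams, each already sorted by time.
base = datetime(2024, 1, 1)
price = ((base + timedelta(hours=h), "price", 100.0 + h) for h in range(3))
volume = ((base + timedelta(hours=h), "volume", 1000 + h) for h in range(3))

# heapq.merge keeps the combined stream sorted by the group key (time here)
# without pulling either input fully into memory.
for time, feature_id, value in heapq.merge(price, volume, key=lambda fr: fr[0]):
    print(time.isoformat(), feature_id, value)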
datapipeline/pipeline/stages.py
CHANGED

@@ -20,6 +20,7 @@ from datapipeline.sources.models.source import Source
 from datapipeline.transforms.vector import VectorEnsureSchemaTransform
 from datapipeline.config.dataset.normalize import floor_time_to_bucket
 from datapipeline.utils.time import parse_timecode
+from datapipeline.transforms.utils import get_field, partition_key
 
 
 def open_source_stream(context: PipelineContext, stream_alias: str) -> Source:
@@ -49,45 +50,66 @@ def apply_record_operations(
     return records
 
 
+def _record_has_field(record: Any, field: str) -> bool:
+    if isinstance(record, dict):
+        return field in record
+    return hasattr(record, field)
+
+
 def build_feature_stream(
     record_stream: Iterable[TemporalRecord],
     base_feature_id: str,
+    field: str,
     partition_by: Any | None = None,
 ) -> Iterator[FeatureRecord]:
-
     keygen = FeatureIdGenerator(partition_by)
 
     for rec in record_stream:
+        if not _record_has_field(rec, field):
+            raise KeyError(
+                f"Record field '{field}' not found on {type(rec).__name__}")
         yield FeatureRecord(
             record=rec,
             id=keygen.generate(base_feature_id, rec),
+            value=get_field(rec, field),
         )
 
 
-def
+def order_record_stream(
     context: PipelineContext,
-
+    record_stream: Iterable[TemporalRecord],
     stream_id: str,
     batch_size: int,
-) -> Iterator[
-    """
-
-
-
+) -> Iterator[TemporalRecord]:
+    """Return records sorted by (partition_key, time)."""
+    partition_by = context.runtime.registries.partition_by.get(stream_id)
+    return batch_sort(
+        record_stream,
         batch_size=batch_size,
-        key=lambda
+        key=lambda rec: (partition_key(rec, partition_by), rec.time),
     )
+
+
+def apply_stream_operations(
+    context: PipelineContext,
+    record_stream: Iterable[TemporalRecord],
+    stream_id: str,
+) -> Iterator[TemporalRecord]:
+    """Apply stream/debug transforms (expects input sorted by partition_key + time)."""
+    partition_by = context.runtime.registries.partition_by.get(stream_id)
     transformed = apply_transforms(
-
+        record_stream,
         STREAM_TRANFORMS_EP,
         context.runtime.registries.stream_operations.get(stream_id),
         context,
+        extra_kwargs={"partition_by": partition_by},
     )
     transformed = apply_transforms(
         transformed,
         DEBUG_TRANSFORMS_EP,
         context.runtime.registries.debug_operations.get(stream_id),
         context,
+        extra_kwargs={"partition_by": partition_by},
     )
     return transformed
 
@@ -135,10 +157,9 @@ def vector_assemble_stage(
     feature_map = defaultdict(list)
     for fr in group:
         if isinstance(fr, FeatureRecordSequence):
-
+            feature_map[fr.id].extend(fr.values)
         else:
-
-            feature_map[fr.id].extend(records)
+            feature_map[fr.id].append(fr.value)
     vector = vectorize_record_group(feature_map)
     yield group_key, vector
 
@@ -242,16 +263,19 @@ def _apply_vector_schema(
 
     if not feature_entries:
         if context.schema_required:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Schema missing for payload 'features'. Run `jerry build` to materialize schema.json.")
         feature_stream = stream
     else:
-        feature_schema = VectorEnsureSchemaTransform(
+        feature_schema = VectorEnsureSchemaTransform(
+            on_missing="fill", on_extra="drop")
         feature_schema.bind_context(context)
         feature_stream = feature_schema(stream)
 
     def _apply_targets(upstream: Iterator[Sample]) -> Iterator[Sample]:
         if target_entries:
-            target_schema = VectorEnsureSchemaTransform(
+            target_schema = VectorEnsureSchemaTransform(
+                payload="targets", on_missing="fill", on_extra="drop")
             target_schema.bind_context(context)
             return target_schema(upstream)
         if not context.schema_required:
@@ -264,6 +288,7 @@ def _apply_vector_schema(
            return iter(())
         if first.targets is None:
            return chain([first], iterator)
-        raise RuntimeError(
+        raise RuntimeError(
+            "Schema missing for payload 'targets'. Run `jerry build` to materialize schema.json.")
 
     return _apply_targets(feature_stream)
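Note: `apply_stream_operations` above documents that its input must already be ordered by partition key and then time, which is exactly what `order_record_stream` produces. A small self-contained illustration of that ordering contract; `Tick` and this `partition_key` are simplified stand-ins for the package's record types and the helper imported from `datapipeline.transforms.utils`:

from dataclasses import dataclass
from datetime import datetime

@dataclass
class Tick:
    symbol: str
    time: datetime
    close: float

def partition_key(rec: Tick, partition_by: str | None) -> tuple:
    # Simplified stand-in: key records by one attribute, or by nothing at all.
    return (getattr(rec, partition_by),) if partition_by else ()

ticks = [
    Tick("MSFT", datetime(2024, 1, 1, 10), 400.0),
    Tick("AAPL", datetime(2024, 1, 1, 9), 190.0),
    Tick("AAPL", datetime(2024, 1, 1, 10), 191.0),
    Tick("MSFT", datetime(2024, 1, 1, 9), 399.0),
]

# Same contract as order_record_stream: partition first, then time, so the
# downstream stream transforms see each partition as one contiguous sorted run.
ordered = sorted(ticks, key=lambda rec: (partition_key(rec, "symbol"), rec.time))
for rec in ordered:
    print(rec.symbol, rec.time.isoformat(), rec.close)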
datapipeline/pipeline/utils/spool_cache.py
ADDED

@@ -0,0 +1,142 @@
+import pickle
+import tempfile
+import weakref
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterator, Any
+
+
+_LEN_BYTES = 8
+
+
+def _encode_len(size: int) -> bytes:
+    return int(size).to_bytes(_LEN_BYTES, "little", signed=False)
+
+
+def _decode_len(raw: bytes) -> int:
+    return int.from_bytes(raw, "little", signed=False)
+
+
+@dataclass
+class _SpoolState:
+    writer: Any
+    path: Path
+    offsets: list[int]
+    source: Iterator[Any]
+    done: bool = False
+
+    def close(self) -> None:
+        try:
+            self.writer.close()
+        except Exception:
+            pass
+
+
+class SpoolCache:
+    """Disk-backed cache for iterators with multiple sequential readers."""
+
+    def __init__(self, source: Iterator[Any], *, name: str | None = None) -> None:
+        tmp = tempfile.NamedTemporaryFile(
+            prefix=f"dp-spool-{name or 'stream'}-",
+            suffix=".pkl",
+            delete=False,
+        )
+        path = Path(tmp.name)
+        self._state = _SpoolState(
+            writer=tmp,
+            path=path,
+            offsets=[],
+            source=iter(source),
+        )
+        self._finalizer = weakref.finalize(self, _cleanup, path, tmp)
+
+    @property
+    def path(self) -> Path:
+        return self._state.path
+
+    def close(self) -> None:
+        """Close writer and remove the spool file."""
+        if self._finalizer.alive:
+            self._finalizer()
+
+    def __enter__(self) -> "SpoolCache":
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        self.close()
+
+    def reader(self) -> Iterator[Any]:
+        return _SpoolReader(self)
+
+    def _append_next(self) -> bool:
+        if self._state.done:
+            return False
+        try:
+            item = next(self._state.source)
+        except StopIteration:
+            self._state.done = True
+            self._state.writer.flush()
+            return False
+        try:
+            data = pickle.dumps(item, protocol=pickle.HIGHEST_PROTOCOL)
+        except Exception as exc:  # pragma: no cover - defensive
+            raise TypeError(
+                "SpoolCache requires picklable records for multi-feature fanout."
+            ) from exc
+        offset = self._state.writer.tell()
+        self._state.writer.write(_encode_len(len(data)))
+        self._state.writer.write(data)
+        self._state.writer.flush()
+        self._state.offsets.append(offset)
+        return True
+
+    def _ensure_index(self, index: int) -> None:
+        while len(self._state.offsets) <= index:
+            if not self._append_next():
+                break
+
+
+class _SpoolReader:
+    def __init__(self, cache: SpoolCache) -> None:
+        self._cache = cache
+        self._index = 0
+        self._fh = open(cache.path, "rb")
+
+    def __iter__(self) -> "_SpoolReader":
+        return self
+
+    def __next__(self) -> Any:
+        self._cache._ensure_index(self._index)
+        if self._index >= len(self._cache._state.offsets):
+            self._close()
+            raise StopIteration
+        offset = self._cache._state.offsets[self._index]
+        self._index += 1
+        self._fh.seek(offset)
+        raw = self._fh.read(_LEN_BYTES)
+        if not raw:
+            self._close()
+            raise StopIteration
+        size = _decode_len(raw)
+        payload = self._fh.read(size)
+        return pickle.loads(payload)
+
+    def _close(self) -> None:
+        try:
+            self._fh.close()
+        except Exception:
+            pass
+
+    def __del__(self) -> None:
+        self._close()
+
+
+def _cleanup(path: Path, writer: Any) -> None:
+    try:
+        writer.close()
+    except Exception:
+        pass
+    try:
+        path.unlink(missing_ok=True)
+    except Exception:
+        pass
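Note: going by the API above, a `SpoolCache` wraps a one-shot iterator, spools each item to a length-prefixed pickle file the first time any reader demands it, and hands out independent sequential readers over the same data. A usage sketch, assuming the module is importable at the path shown in this diff:

from datapipeline.pipeline.utils.spool_cache import SpoolCache

records = iter(range(5))  # any one-shot iterator of picklable items

with SpoolCache(records, name="demo") as cache:
    first = cache.reader()
    second = cache.reader()
    # Both readers replay the same underlying stream; items are pulled from
    # the source lazily and written to the spool file exactly once.
    assert list(first) == [0, 1, 2, 3, 4]
    assert list(second) == [0, 1, 2, 3, 4]
# Leaving the context closes the writer and deletes the temporary spool file.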
datapipeline/pipeline/utils/transform_utils.py
CHANGED

@@ -41,17 +41,35 @@ def _split_params(params: Any) -> Tuple[Tuple[Any, ...], dict[str, Any]]:
     return (params,), {}
 
 
+def _merge_extra_kwargs(
+    fn: Callable[..., Any],
+    kwargs: dict[str, Any],
+    extra_kwargs: Mapping[str, Any] | None,
+) -> dict[str, Any]:
+    if not extra_kwargs:
+        return kwargs
+    merged = dict(kwargs)
+    for key, value in extra_kwargs.items():
+        if key in merged:
+            continue
+        if _supports_parameter(fn, key):
+            merged[key] = value
+    return merged
+
+
 def _call_with_params(
     fn: Callable,
     stream: Iterator[Any],
     params: Any,
     context: Optional[PipelineContext],
+    extra_kwargs: Mapping[str, Any] | None = None,
 ) -> Iterator[Any]:
     """Invoke an entry-point callable with optional params semantics."""
 
     args, kwargs = _split_params(params)
     if context and _supports_parameter(fn, "context") and "context" not in kwargs:
         kwargs["context"] = context
+    kwargs = _merge_extra_kwargs(fn, kwargs, extra_kwargs)
     return fn(stream, *args, **kwargs)
 
 
@@ -59,12 +77,14 @@ def _instantiate_entry_point(
     cls: Callable[..., Any],
     params: Any,
     context: Optional[PipelineContext],
+    extra_kwargs: Mapping[str, Any] | None = None,
 ) -> Any:
     """Instantiate a transform class with parameters from the config."""
 
     args, kwargs = _split_params(params)
     if context and _supports_parameter(cls.__init__, "context") and "context" not in kwargs:
         kwargs["context"] = context
+    kwargs = _merge_extra_kwargs(cls.__init__, kwargs, extra_kwargs)
     return cls(*args, **kwargs)
 
 
@@ -83,6 +103,7 @@ def apply_transforms(
     context: Optional[PipelineContext] = None,
     observer: Callable[[TransformEvent], None] | None = None,
     observer_registry: ObserverRegistry | None = None,
+    extra_kwargs: Mapping[str, Any] | None = None,
 ) -> Iterator[Any]:
     """Instantiate and apply configured transforms in order."""
 
@@ -97,7 +118,9 @@ def apply_transforms(
         name, params = _extract_single_pair(transform, "Transform")
         ep = load_ep(group=group, name=name)
         if isclass(ep):
-            inst = _instantiate_entry_point(
+            inst = _instantiate_entry_point(
+                ep, params, context, extra_kwargs=extra_kwargs
+            )
             _bind_context(inst, context)
             eff_observer = observer
             if eff_observer is None and registry:
@@ -107,7 +130,9 @@ def apply_transforms(
             _attach_observer(inst, eff_observer)
             stream = inst(stream)
         else:
-            stream = _call_with_params(
+            stream = _call_with_params(
+                ep, stream, params, context, extra_kwargs=extra_kwargs
+            )
     return stream
 
 
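Note: the new `extra_kwargs` plumbing forwards a keyword such as `partition_by` only when the transform's signature actually accepts it and the config has not already set it. A simplified, self-contained version of that behaviour; `_supports_parameter` is not shown in this diff, so the signature check below is an assumption about how it works:

import inspect
from typing import Any, Callable, Mapping

def supports_parameter(fn: Callable[..., Any], name: str) -> bool:
    # True if the callable takes `name` as a parameter, or accepts **kwargs.
    try:
        params = inspect.signature(fn).parameters
    except (TypeError, ValueError):
        return False
    if name in params:
        return True
    return any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values())

def merge_extra_kwargs(
    fn: Callable[..., Any],
    kwargs: dict[str, Any],
    extra_kwargs: Mapping[str, Any] | None,
) -> dict[str, Any]:
    # Mirrors the merge rule above: config-supplied kwargs win, injected
    # kwargs are added only when the callable can accept them.
    if not extra_kwargs:
        return kwargs
    merged = dict(kwargs)
    for key, value in extra_kwargs.items():
        if key not in merged and supports_parameter(fn, key):
            merged[key] = value
    return merged

def dedupe(stream, *, partition_by: str | None = None):
    ...  # hypothetical transform that opts in to partition_by

def floor_time(stream, bucket: str = "1h"):
    ...  # hypothetical transform that does not take partition_by

print(merge_extra_kwargs(dedupe, {}, {"partition_by": "symbol"}))
# -> {'partition_by': 'symbol'}
print(merge_extra_kwargs(floor_time, {"bucket": "1d"}, {"partition_by": "symbol"}))
# -> {'bucket': '1d'}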
datapipeline/services/artifacts.py
CHANGED

@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from dataclasses import dataclass
 import json
 from pathlib import Path
@@ -80,8 +78,7 @@ class ArtifactManager:
         except FileNotFoundError as exc:
             message = (
                 f"Artifact file not found: {path}. "
-                "Run `jerry build --project <project.yaml>`
-                "`jerry inspect expected --project <project.yaml>` to regenerate it."
+                "Run `jerry build --project <project.yaml>` to regenerate it."
             )
             raise RuntimeError(message) from exc
 
datapipeline/services/constants.py
CHANGED

@@ -12,6 +12,7 @@ LOADERS_GROUP = "loaders"
 MAPPERS_GROUP = "mappers"
 FILTERS_GROUP = "filters"
 DEFAULT_IO_LOADER_EP = "core.io"
+DEFAULT_SYNTHETIC_LOADER_EP = "core.synthetic.ticks"
 
 # POSTPROCESS_GLOBAL_KEY = "__global__"
 POSTPROCESS_TRANSFORMS = "transforms"
datapipeline/services/factories.py
CHANGED

@@ -6,8 +6,7 @@ from datapipeline.mappers.noop import identity
 from datapipeline.utils.placeholders import normalize_args
 from datapipeline.sources.models.base import SourceInterface
 from datapipeline.pipeline.context import PipelineContext
-from datapipeline.
-from datapipeline.pipeline.pipelines import build_feature_pipeline
+from datapipeline.pipeline.pipelines import build_record_pipeline
 from datapipeline.pipeline.utils.transform_utils import _supports_parameter
 from inspect import isclass
 from typing import Iterator, Any, Optional
@@ -52,7 +51,7 @@ class _ComposedSource(SourceInterface):
 
         # Build aligned/aux iterators (unwrap FeatureRecord -> record for aligned)
         aligned_iters: dict[str, Iterator[Any]] = {
-            k: (
+            k: (getattr(item, "record", item) for item in v["iter"])
             for k, v in aligned.items()
         }
         aux_iters: dict[str, Iterator[Any]] = {
@@ -111,7 +110,7 @@ class _ComposedSource(SourceInterface):
         """Parse and resolve composed inputs into iterators.
 
         Grammar: "[alias=]stream_id" only. All inputs are built to stage 4
-        and are alignable (
+        and are alignable (domain records with stream transforms applied).
         """
         runtime = context.runtime
         known_streams = set(runtime.registries.stream_sources.keys())
@@ -123,8 +122,7 @@ class _ComposedSource(SourceInterface):
                 raise ValueError(
                     f"Unknown input stream '{ref}'. Known streams: {sorted(known_streams)}"
                 )
-
-            it = build_feature_pipeline(context, cfg, stage=4)
+            it = build_record_pipeline(context, ref, stage=4)
             out[alias] = {"iter": it, "aligned": True}
 
         return out
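Note: the composed-source docstring above fixes the input grammar to "[alias=]stream_id", with every input built to stage 4. The parsing helper itself is not part of this diff; a hypothetical sketch of how that grammar could be split:

def parse_input_ref(spec: str) -> tuple[str, str]:
    """Split "[alias=]stream_id" into (alias, stream_id); alias defaults to the stream id."""
    alias, sep, stream_id = spec.partition("=")
    return (alias, stream_id) if sep else (spec, spec)

assert parse_input_ref("equity.ohlcv") == ("equity.ohlcv", "equity.ohlcv")
assert parse_input_ref("ohlcv=equity.ohlcv") == ("ohlcv", "equity.ohlcv")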