jerry-thomas 1.0.3__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +0 -1
- datapipeline/build/tasks/config.py +0 -2
- datapipeline/build/tasks/metadata.py +0 -2
- datapipeline/build/tasks/scaler.py +0 -2
- datapipeline/build/tasks/schema.py +0 -2
- datapipeline/build/tasks/utils.py +0 -2
- datapipeline/cli/app.py +201 -81
- datapipeline/cli/commands/contract.py +145 -283
- datapipeline/cli/commands/demo.py +13 -0
- datapipeline/cli/commands/domain.py +4 -4
- datapipeline/cli/commands/dto.py +11 -0
- datapipeline/cli/commands/filter.py +2 -2
- datapipeline/cli/commands/inspect.py +0 -68
- datapipeline/cli/commands/list_.py +30 -13
- datapipeline/cli/commands/loader.py +11 -0
- datapipeline/cli/commands/mapper.py +82 -0
- datapipeline/cli/commands/parser.py +45 -0
- datapipeline/cli/commands/run_config.py +1 -3
- datapipeline/cli/commands/serve_pipeline.py +5 -7
- datapipeline/cli/commands/source.py +106 -18
- datapipeline/cli/commands/stream.py +286 -0
- datapipeline/cli/visuals/common.py +0 -2
- datapipeline/cli/visuals/sections.py +0 -2
- datapipeline/cli/workspace_utils.py +0 -3
- datapipeline/config/context.py +0 -2
- datapipeline/config/dataset/feature.py +1 -0
- datapipeline/config/metadata.py +0 -2
- datapipeline/config/project.py +0 -2
- datapipeline/config/resolution.py +10 -2
- datapipeline/config/tasks.py +9 -9
- datapipeline/domain/feature.py +3 -0
- datapipeline/domain/record.py +7 -7
- datapipeline/domain/sample.py +0 -2
- datapipeline/domain/vector.py +6 -8
- datapipeline/integrations/ml/adapter.py +0 -2
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +0 -2
- datapipeline/integrations/ml/torch_support.py +0 -2
- datapipeline/io/output.py +0 -2
- datapipeline/io/serializers.py +26 -16
- datapipeline/mappers/synthetic/time.py +9 -2
- datapipeline/pipeline/artifacts.py +3 -5
- datapipeline/pipeline/observability.py +0 -2
- datapipeline/pipeline/pipelines.py +118 -34
- datapipeline/pipeline/stages.py +42 -17
- datapipeline/pipeline/utils/spool_cache.py +142 -0
- datapipeline/pipeline/utils/transform_utils.py +27 -2
- datapipeline/services/artifacts.py +1 -4
- datapipeline/services/constants.py +1 -0
- datapipeline/services/factories.py +4 -6
- datapipeline/services/project_paths.py +0 -2
- datapipeline/services/runs.py +0 -2
- datapipeline/services/scaffold/contract_yaml.py +76 -0
- datapipeline/services/scaffold/demo.py +141 -0
- datapipeline/services/scaffold/discovery.py +115 -0
- datapipeline/services/scaffold/domain.py +21 -13
- datapipeline/services/scaffold/dto.py +31 -0
- datapipeline/services/scaffold/filter.py +2 -1
- datapipeline/services/scaffold/layout.py +96 -0
- datapipeline/services/scaffold/loader.py +61 -0
- datapipeline/services/scaffold/mapper.py +116 -0
- datapipeline/services/scaffold/parser.py +56 -0
- datapipeline/services/scaffold/plugin.py +14 -2
- datapipeline/services/scaffold/source_yaml.py +91 -0
- datapipeline/services/scaffold/stream_plan.py +110 -0
- datapipeline/services/scaffold/utils.py +187 -0
- datapipeline/sources/data_loader.py +0 -2
- datapipeline/sources/decoders.py +49 -8
- datapipeline/sources/factory.py +9 -6
- datapipeline/sources/foreach.py +18 -3
- datapipeline/sources/synthetic/time/parser.py +1 -1
- datapipeline/sources/transports.py +10 -4
- datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
- datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
- datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
- datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
- datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
- datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
- datapipeline/templates/plugin_skeleton/README.md +57 -136
- datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
- datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
- datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
- datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +7 -10
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +14 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
- datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
- datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
- datapipeline/templates/stubs/dto.py.j2 +1 -1
- datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
- datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
- datapipeline/templates/stubs/mappers/ingest.py.j2 +17 -0
- datapipeline/templates/stubs/parser.py.j2 +4 -0
- datapipeline/templates/stubs/record.py.j2 +0 -1
- datapipeline/templates/stubs/source.yaml.j2 +1 -1
- datapipeline/transforms/debug/identity.py +34 -16
- datapipeline/transforms/debug/lint.py +14 -11
- datapipeline/transforms/feature/scaler.py +5 -12
- datapipeline/transforms/filter.py +73 -17
- datapipeline/transforms/interfaces.py +58 -0
- datapipeline/transforms/record/floor_time.py +10 -7
- datapipeline/transforms/record/lag.py +8 -10
- datapipeline/transforms/sequence.py +2 -3
- datapipeline/transforms/stream/dedupe.py +5 -7
- datapipeline/transforms/stream/ensure_ticks.py +39 -24
- datapipeline/transforms/stream/fill.py +34 -25
- datapipeline/transforms/stream/filter.py +25 -0
- datapipeline/transforms/stream/floor_time.py +16 -0
- datapipeline/transforms/stream/granularity.py +52 -30
- datapipeline/transforms/stream/lag.py +17 -0
- datapipeline/transforms/stream/rolling.py +72 -0
- datapipeline/transforms/utils.py +42 -10
- datapipeline/transforms/vector/drop/horizontal.py +0 -3
- datapipeline/transforms/vector/drop/orchestrator.py +0 -3
- datapipeline/transforms/vector/drop/vertical.py +0 -2
- datapipeline/transforms/vector/ensure_schema.py +0 -2
- datapipeline/utils/paths.py +0 -2
- datapipeline/utils/placeholders.py +0 -2
- datapipeline/utils/rich_compat.py +0 -3
- datapipeline/utils/window.py +0 -2
- jerry_thomas-2.0.0.dist-info/METADATA +282 -0
- jerry_thomas-2.0.0.dist-info/RECORD +264 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/WHEEL +1 -1
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/entry_points.txt +7 -3
- datapipeline/services/scaffold/mappers.py +0 -55
- datapipeline/services/scaffold/source.py +0 -191
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
- datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
- datapipeline/templates/stubs/mapper.py.j2 +0 -22
- jerry_thomas-1.0.3.dist-info/METADATA +0 -827
- jerry_thomas-1.0.3.dist-info/RECORD +0 -198
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/top_level.txt +0 -0
datapipeline/cli/commands/stream.py ADDED
@@ -0,0 +1,286 @@
+from pathlib import Path
+
+from datapipeline.config.workspace import WorkspaceContext
+from datapipeline.cli.workspace_utils import resolve_default_project_yaml
+from datapipeline.services.paths import pkg_root
+from datapipeline.services.project_paths import resolve_project_yaml_path
+from datapipeline.services.scaffold.discovery import (
+    list_domains,
+    list_mappers,
+    list_parsers,
+    list_sources,
+    list_dtos,
+)
+from datapipeline.services.scaffold.source_yaml import default_loader_config
+from datapipeline.services.scaffold.layout import (
+    default_stream_id,
+    dto_class_name,
+    default_parser_name,
+    default_mapper_name,
+    dto_module_path,
+    LABEL_DTO_FOR_PARSER,
+    LABEL_DTO_FOR_MAPPER,
+    LABEL_DOMAIN_TO_MAP,
+    LABEL_MAPPER_INPUT,
+    default_mapper_name_for_identity,
+)
+from datapipeline.services.scaffold.stream_plan import StreamPlan, ParserPlan, MapperPlan, execute_stream_plan
+from datapipeline.services.scaffold.utils import (
+    choose_existing_or_create,
+    choose_name,
+    choose_existing_or_create_name,
+    error_exit,
+    info,
+    pick_from_list,
+    pick_from_menu,
+    prompt_required,
+)
+
+
+def handle(*, plugin_root: Path | None = None, workspace: WorkspaceContext | None = None) -> None:
+    root_dir, pkg_name, _ = pkg_root(plugin_root)
+    project_yaml = resolve_default_project_yaml(workspace) or resolve_project_yaml_path(root_dir)
+
+    # Shared context
+    provider = prompt_required("Provider")
+    dataset = prompt_required("Dataset")
+    source_id = f"{provider}.{dataset}"
+
+    # Collected actions (execute at end)
+    create_source = False
+    create_domain_flag = False
+    create_parser_flag = False
+    create_mapper_flag = False
+    parser_create_dto = False
+    mapper_create_dto = False
+
+    dto_class = None
+    dto_module = None
+    mapper_input_class = None
+    mapper_input_module = None
+    loader_ep = None
+    loader_args = None
+    parser_ep = None
+    mapper_ep = None
+    parser_name = None
+    mapper_name = None
+    pchoice = "identity"
+
+    # Source selection (may override shared context if existing is chosen)
+    source_choice = pick_from_menu(
+        "Source:",
+        [
+            ("create", "Create new source (default)"),
+            ("existing", "Select existing source"),
+        ],
+    )
+
+    if source_choice == "existing":
+        sources = list_sources(project_yaml)
+        if not sources:
+            error_exit("No sources found. Create one first.")
+        source_id = pick_from_list("Select source:", sources)
+        parts = source_id.split(".", 1)
+        provider = parts[0] if len(parts) == 2 else provider
+        dataset = parts[1] if len(parts) == 2 else dataset
+    else:
+        source_id_default = f"{provider}.{dataset}"
+        source_id = choose_name("Source id", default=source_id_default)
+        create_source = True
+
+    # Loader selection
+    loader_ep = None
+    loader_args = {}
+    choice = pick_from_menu(
+        "Loader:",
+        [
+            ("fs", "Built-in fs"),
+            ("http", "Built-in http"),
+            ("synthetic", "Built-in synthetic"),
+            ("custom", "Custom loader"),
+        ],
+        allow_default=False,
+    )
+    if choice in {"fs", "http", "synthetic"}:
+        if choice in {"fs", "http"}:
+            fmt_options = [
+                ("csv", "csv"),
+                ("json", "json"),
+                ("json-lines", "json-lines"),
+            ]
+            if choice == "fs":
+                fmt_options.append(("pickle", "pickle"))
+            fmt = pick_from_menu("Format:", fmt_options, allow_default=False)
+        else:
+            fmt = None
+        loader_ep, loader_args = default_loader_config(choice, fmt)
+    else:
+        loader_ep = prompt_required("Loader entrypoint")
+
+    # Parser selection
+    parsers = list_parsers(root=plugin_root)
+    if parsers:
+        pchoice = pick_from_menu(
+            "Parser:",
+            [
+                ("create", "Create new parser (default)"),
+                ("existing", "Select existing parser"),
+                ("identity", "Identity parser"),
+            ],
+        )
+    else:
+        pchoice = pick_from_menu(
+            "Parser:",
+            [
+                ("create", "Create new parser (default)"),
+                ("identity", "Identity parser"),
+            ],
+        )
+    if pchoice == "existing":
+        parser_ep = pick_from_menu(
+            "Select parser entrypoint:",
+            [(k, k) for k in sorted(parsers.keys())],
+        )
+    elif pchoice == "create":
+        dto_default = dto_class_name(f"{provider}_{dataset}") if provider and dataset else None
+        dto_class, parser_create_dto = choose_existing_or_create_name(
+            label=LABEL_DTO_FOR_PARSER,
+            existing=sorted(list_dtos(root=plugin_root).keys()),
+            create_label="Create new DTO",
+            prompt_new="DTO class name",
+            default_new=dto_default,
+        )
+        parser_name = choose_name(
+            "Parser class name",
+            default=default_parser_name(dto_class),
+        )
+        dto_module = dto_module_path(pkg_name, dto_class)
+        create_parser_flag = True
+    elif pchoice == "identity":
+        parser_ep = "identity"
+    else:
+        parser_ep = "identity"
+
+    # Domain selection
+    domain, create_domain_flag = choose_existing_or_create_name(
+        label=LABEL_DOMAIN_TO_MAP,
+        existing=list_domains(root=plugin_root),
+        create_label="Create new domain",
+        prompt_new="Domain name",
+        default_new=dataset,
+    )
+
+    # Mapper selection
+    mappers = list_mappers(root=plugin_root)
+    if mappers:
+        mchoice = pick_from_menu(
+            "Mapper:",
+            [
+                ("create", "Create new mapper (default)"),
+                ("existing", "Select existing mapper"),
+                ("identity", "Identity mapper"),
+            ],
+        )
+    else:
+        mchoice = pick_from_menu(
+            "Mapper:",
+            [
+                ("create", "Create new mapper (default)"),
+                ("identity", "Identity mapper"),
+            ],
+        )
+    if mchoice == "existing":
+        mapper_ep = pick_from_menu(
+            "Select mapper entrypoint:",
+            [(k, k) for k in sorted(mappers.keys())],
+        )
+    elif mchoice == "create":
+        create_mapper_flag = True
+        input_choice = pick_from_menu(
+            f"{LABEL_MAPPER_INPUT}:",
+            [
+                ("dto", "DTO (default)"),
+                ("identity", "Any"),
+            ],
+        )
+        info("Domain output: Domain record")
+        if input_choice == "dto":
+            if not dto_class:
+                dto_class, mapper_create_dto = choose_existing_or_create_name(
+                    label=LABEL_DTO_FOR_MAPPER,
+                    existing=sorted(list_dtos(root=plugin_root).keys()),
+                    create_label="Create new DTO",
+                    prompt_new="DTO class name",
+                    default_new=dto_class_name(f"{provider}_{dataset}"),
+                )
+            else:
+                mapper_create_dto = False
+            dto_module = dto_module_path(pkg_name, dto_class)
+            mapper_input_class = dto_class
+            mapper_input_module = dto_module
+        else:
+            mapper_input_module = "typing"
+            mapper_input_class = "Any"
+            mapper_create_dto = False
+        if input_choice == "identity":
+            mapper_name = choose_name(
+                "Mapper name",
+                default=default_mapper_name_for_identity(domain),
+            )
+        else:
+            mapper_name = choose_name("Mapper name", default=default_mapper_name(mapper_input_module, domain))
+    elif mchoice == "identity":
+        mapper_ep = "identity"
+    else:
+        mapper_ep = "identity"
+
+    # Stream id and contract
+    default_id = default_stream_id(domain, dataset or "dataset", None)
+    stream_id = choose_name("Stream id", default=default_id)
+
+    # Build plan and execute (no side effects during selection)
+    parser_plan = None
+    if pchoice == "create":
+        parser_plan = ParserPlan(
+            create=True,
+            create_dto=parser_create_dto,
+            dto_class=dto_class,
+            dto_module=dto_module,
+            parser_name=parser_name,
+        )
+    elif pchoice == "existing":
+        parser_plan = ParserPlan(create=False, parser_ep=parser_ep)
+    else:
+        parser_plan = ParserPlan(create=False, parser_ep="identity")
+
+    mapper_plan = None
+    if mchoice == "create":
+        mapper_plan = MapperPlan(
+            create=True,
+            create_dto=mapper_create_dto,
+            input_class=mapper_input_class,
+            input_module=mapper_input_module,
+            mapper_name=mapper_name,
+            domain=domain,
+        )
+    elif mchoice == "existing":
+        mapper_plan = MapperPlan(create=False, mapper_ep=mapper_ep, domain=domain)
+    else:
+        mapper_plan = MapperPlan(create=False, mapper_ep="identity", domain=domain)
+
+    plan = StreamPlan(
+        provider=provider,
+        dataset=dataset,
+        source_id=source_id,
+        project_yaml=project_yaml,
+        stream_id=stream_id,
+        root=plugin_root,
+        create_source=create_source,
+        loader_ep=loader_ep,
+        loader_args=loader_args,
+        parser=parser_plan,
+        mapper=mapper_plan,
+        domain=domain,
+        create_domain=create_domain_flag,
+    )
+    execute_stream_plan(plan)
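
The new stream scaffolding command above gathers every choice (source, loader, parser, domain, mapper) before touching disk and only then executes a single StreamPlan. A minimal sketch of driving it programmatically, assuming the package is installed; the keyword-only signature is taken from the hunk above, and calling it simply starts the same interactive prompts:

from pathlib import Path

from datapipeline.cli.commands import stream

# Launches the interactive stream wizard; plugin_root=None would let
# pkg_root() locate the plugin package on its own.
stream.handle(plugin_root=Path("."), workspace=None)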
datapipeline/cli/workspace_utils.py CHANGED
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from pathlib import Path
 
 from datapipeline.config.workspace import WorkspaceContext
@@ -22,4 +20,3 @@ def resolve_default_project_yaml(workspace: WorkspaceContext | None) -> Path | None:
             f"Unknown default_dataset '{alias}'. Define it under datasets: in jerry.yaml."
         )
     return resolved
-
datapipeline/config/context.py CHANGED
datapipeline/config/metadata.py CHANGED
datapipeline/config/project.py CHANGED
datapipeline/config/resolution.py CHANGED
@@ -1,11 +1,10 @@
-from __future__ import annotations
-
 import logging
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional
 
 from datapipeline.config.tasks import ServeOutputConfig
+from datapipeline.io.output import OutputResolutionError
 from datapipeline.config.workspace import WorkspaceContext
 
 
@@ -113,6 +112,15 @@ def workspace_output_defaults(
     if not serve_defaults or not serve_defaults.output:
         return None
     od = serve_defaults.output
+    transport = str(od.transport).lower() if od.transport is not None else None
+    if transport == "fs" and not od.directory:
+        raise OutputResolutionError(
+            "fs output requires a directory. Example:\n"
+            "  output:\n"
+            "    transport: fs\n"
+            "    format: json-lines\n"
+            "    directory: ./data/processed/jerry"
+        )
     output_dir = None
     if od.directory:
         candidate = Path(od.directory)
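
In practice the new guard means an fs serve output with no directory now fails fast instead of falling back to an implicit location. A standalone sketch of the same rule (a plain ValueError is used here so the snippet runs without the package; the package itself raises OutputResolutionError from datapipeline.io.output):

def check_fs_output(transport: str | None, directory: str | None) -> None:
    # Mirrors the validation added to workspace_output_defaults above.
    normalized = str(transport).lower() if transport is not None else None
    if normalized == "fs" and not directory:
        raise ValueError("fs output requires a directory")

check_fs_output("fs", "./data/processed/jerry")  # passes
# check_fs_output("fs", None)                    # would raise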
datapipeline/config/tasks.py CHANGED
@@ -1,7 +1,5 @@
-from __future__ import annotations
-
 from pathlib import Path
-from typing import Annotated,
+from typing import Annotated, Literal, Sequence
 
 from pydantic import BaseModel, Field, field_validator, model_validator
 from pydantic.type_adapter import TypeAdapter
@@ -21,9 +19,10 @@ PayloadMode = Literal["sample", "vector"]
 class TaskBase(BaseModel):
     version: int = Field(default=1)
     kind: str
-    name: str | None = Field(
-
-
+    name: str | None = Field(
+        default=None, description="Optional task identifier.")
+    enabled: bool = Field(
+        default=True, description="Disable to skip execution.")
     source_path: Path | None = Field(default=None, exclude=True)
 
     def effective_name(self) -> str:
@@ -78,7 +77,8 @@ class RuntimeTask(TaskBase):
 
 class ServeOutputConfig(BaseModel):
     transport: Transport = Field(..., description="fs | stdout")
-    format: Format = Field(...,
+    format: Format = Field(...,
+        description="csv | json | json-lines | print | pickle")
     payload: PayloadMode = Field(
         default="sample",
         description="sample (key + metadata) or vector payload (features [+targets]).",
@@ -151,9 +151,9 @@ class ServeTask(RuntimeTask):
     )
     stage: int | None = Field(
         default=None,
-        description="Default pipeline stage preview (0-
+        description="Default pipeline stage preview (0-8).",
         ge=0,
-        le=
+        le=8,
     )
     throttle_ms: float | None = Field(
         default=None,
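
Two behavioural notes fall out of the tasks.py hunks: every task config now carries an enabled switch (default True), and a serve task's stage preview is bounded to 0-8. A minimal sketch using only the fields visible above (whether TaskBase is meant to be constructed directly is an assumption for illustration):

from datapipeline.config.tasks import TaskBase

task = TaskBase(kind="serve")        # kind is the only required field shown
assert task.enabled is True          # new in 2.0.0
assert task.version == 1 and task.name is None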
datapipeline/domain/feature.py CHANGED
@@ -1,5 +1,6 @@
 from datapipeline.domain.record import TemporalRecord
 from dataclasses import dataclass
+from typing import Any
 
 
 @dataclass
@@ -10,8 +11,10 @@ class BaseFeature:
 @dataclass
 class FeatureRecord(BaseFeature):
     record: TemporalRecord
+    value: Any
 
 
 @dataclass
 class FeatureRecordSequence(BaseFeature):
     records: list[TemporalRecord]
+    values: list[Any]
datapipeline/domain/record.py CHANGED
@@ -1,6 +1,5 @@
-from dataclasses import dataclass
+from dataclasses import dataclass
 from datetime import datetime, timezone
-from typing import Any
 
 
 @dataclass
@@ -13,7 +12,6 @@ class TemporalRecord(Record):
     """Canonical time-series payload used throughout the pipeline."""
 
     time: datetime
-    value: Any
 
     def __post_init__(self) -> None:
         if self.time.tzinfo is None:
@@ -21,10 +19,13 @@ class TemporalRecord(Record):
             self.time = self.time.astimezone(timezone.utc)
 
     def _identity_fields(self) -> dict:
-        """Return a mapping of domain fields excluding 'time'
-        data =
+        """Return a mapping of domain fields excluding 'time'."""
+        data = {
+            key: value
+            for key, value in self.__dict__.items()
+            if not key.startswith("_")
+        }
         data.pop("time", None)
-        data.pop("value", None)
         return data
 
     def __eq__(self, other: object) -> bool:
@@ -34,6 +35,5 @@ class TemporalRecord(Record):
         return NotImplemented
         return (
             self.time == other.time
-            and self.value == other.value
             and self._identity_fields() == other._identity_fields()
         )
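
Taken together with the feature.py hunk above, the per-record value has moved off TemporalRecord and onto FeatureRecord, and record identity and equality are now derived from every public field. A sketch with a hypothetical subclass (EquityBar is invented here purely for illustration):

from dataclasses import dataclass
from datetime import datetime, timezone

from datapipeline.domain.record import TemporalRecord

@dataclass
class EquityBar(TemporalRecord):
    symbol: str
    close: float

a = EquityBar(time=datetime(2024, 1, 1, tzinfo=timezone.utc), symbol="MSFT", close=10.0)
b = EquityBar(time=datetime(2024, 1, 1, tzinfo=timezone.utc), symbol="MSFT", close=10.0)
assert a == b  # compares time plus the public payload fields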
datapipeline/domain/sample.py CHANGED
datapipeline/domain/vector.py CHANGED
@@ -1,6 +1,4 @@
-from
-from typing import Dict
-from typing import Union
+from typing import Dict, Union, Any
 
 from dataclasses import dataclass
 
@@ -25,13 +23,13 @@ class Vector:
         return self.values[key]
 
 
-def vectorize_record_group(values: Dict[str, list[
+def vectorize_record_group(values: Dict[str, list[Any]]) -> Vector:
     structured: Dict[str, Union[float, list[float]]] = {}
 
-    for key,
-        if len(
-            structured[key] =
+    for key, items in values.items():
+        if len(items) == 1:
+            structured[key] = items[0]
         else:
-            structured[key] =
+            structured[key] = list(items)
 
     return Vector(values=structured)
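
The rewritten vectorize_record_group collapses a single-item group to a scalar and keeps multi-item groups as lists. A small usage sketch based directly on the code above:

from datapipeline.domain.vector import vectorize_record_group

vec = vectorize_record_group({"close": [10.0], "close_lag": [9.5, 9.0]})
assert vec.values == {"close": 10.0, "close_lag": [9.5, 9.0]}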
datapipeline/io/output.py CHANGED
datapipeline/io/serializers.py CHANGED
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import json
 from dataclasses import asdict, is_dataclass
 from typing import Any, Dict, Type
@@ -107,20 +105,32 @@ class VectorPickleSerializer(BasePickleSerializer):
 
 
 def _record_payload(value: Any) -> Any:
[old lines 110-123 removed; their content is not shown in this extract]
+    def _convert(obj: Any) -> Any:
+        if obj is None:
+            return None
+        if is_dataclass(obj):
+            attrs = getattr(obj, "__dict__", None)
+            if attrs is not None:
+                return {
+                    k: _convert(v)
+                    for k, v in attrs.items()
+                    if not k.startswith("_")
+                }
+            return asdict(obj)
+        if isinstance(obj, dict):
+            return {k: _convert(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return [_convert(v) for v in obj]
+        attrs = getattr(obj, "__dict__", None)
+        if attrs:
+            return {
+                k: _convert(v)
+                for k, v in attrs.items()
+                if not k.startswith("_")
+            }
+        return obj
+
+    return _convert(value)
 
 
 def _record_key(value: Any) -> Any:
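
The effect of the new _record_payload is that dataclasses, plain objects, dicts, lists, and tuples are walked recursively and underscore-prefixed attributes are dropped from the payload. A sketch (the module-private helper is imported only to illustrate; the Bar dataclass is invented):

from dataclasses import dataclass, field

from datapipeline.io.serializers import _record_payload

@dataclass
class Bar:
    time: str
    close: float
    _cached: dict = field(default_factory=dict)  # private attrs are dropped

assert _record_payload(Bar("2024-01-01", 101.5)) == {"time": "2024-01-01", "close": 101.5}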
datapipeline/mappers/synthetic/time.py CHANGED
@@ -1,9 +1,16 @@
-from
+from dataclasses import dataclass
 from datetime import datetime
 from math import sin, pi
+from typing import Iterator
+
 from datapipeline.domain.record import TemporalRecord
 
 
+@dataclass
+class TimeEncodedRecord(TemporalRecord):
+    value: float
+
+
 def encode(stream: Iterator[TemporalRecord], mode: str) -> Iterator[TemporalRecord]:
     for rec in stream:
         t: datetime = rec.time
@@ -15,4 +22,4 @@ def encode(stream: Iterator[TemporalRecord], mode: str) -> Iterator[TemporalRecord]:
             val = t.timestamp()
         else:
             raise ValueError(f"Unsupported encode_time mode: {mode}")
-        yield
+        yield TimeEncodedRecord(time=rec.time, value=val)
datapipeline/pipeline/artifacts.py CHANGED
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from dataclasses import dataclass
 from typing import Iterable
 
@@ -43,12 +41,12 @@ def required_artifacts_for(
     needs_metadata = False
     for demand in demands:
         stage = demand.stage
-        effective_stage =
+        effective_stage = 8 if stage is None else stage
 
-        if effective_stage >=
+        if effective_stage >= 6 and _requires_scaler(dataset):
             required.add(SCALER_STATISTICS)
 
-        if effective_stage >=
+        if effective_stage >= 7:
             required.add(VECTOR_SCHEMA)
             needs_metadata = True
 
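
The artifact requirements now key off the same 0-8 stage scale as ServeTask.stage: a demand with no explicit stage is treated as the full pipeline (8), scaler statistics are required from stage 6 when the dataset uses a scaler, and the vector schema plus metadata from stage 7. A compact, standalone restatement of that gating (the string values stand in for the module's SCALER_STATISTICS and VECTOR_SCHEMA constants):

def stage_requirements(stage: int | None, uses_scaler: bool) -> set[str]:
    required: set[str] = set()
    effective_stage = 8 if stage is None else stage
    if effective_stage >= 6 and uses_scaler:
        required.add("scaler_statistics")
    if effective_stage >= 7:
        required.add("vector_schema")
    return required

assert stage_requirements(None, uses_scaler=True) == {"scaler_statistics", "vector_schema"}
assert stage_requirements(5, uses_scaler=True) == set()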