jerry-thomas 1.0.3__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. datapipeline/analysis/vector/collector.py +0 -1
  2. datapipeline/build/tasks/config.py +0 -2
  3. datapipeline/build/tasks/metadata.py +0 -2
  4. datapipeline/build/tasks/scaler.py +0 -2
  5. datapipeline/build/tasks/schema.py +0 -2
  6. datapipeline/build/tasks/utils.py +0 -2
  7. datapipeline/cli/app.py +201 -81
  8. datapipeline/cli/commands/contract.py +145 -283
  9. datapipeline/cli/commands/demo.py +13 -0
  10. datapipeline/cli/commands/domain.py +4 -4
  11. datapipeline/cli/commands/dto.py +11 -0
  12. datapipeline/cli/commands/filter.py +2 -2
  13. datapipeline/cli/commands/inspect.py +0 -68
  14. datapipeline/cli/commands/list_.py +30 -13
  15. datapipeline/cli/commands/loader.py +11 -0
  16. datapipeline/cli/commands/mapper.py +82 -0
  17. datapipeline/cli/commands/parser.py +45 -0
  18. datapipeline/cli/commands/run_config.py +1 -3
  19. datapipeline/cli/commands/serve_pipeline.py +5 -7
  20. datapipeline/cli/commands/source.py +106 -18
  21. datapipeline/cli/commands/stream.py +292 -0
  22. datapipeline/cli/visuals/common.py +0 -2
  23. datapipeline/cli/visuals/sections.py +0 -2
  24. datapipeline/cli/workspace_utils.py +0 -3
  25. datapipeline/config/context.py +0 -2
  26. datapipeline/config/dataset/feature.py +1 -0
  27. datapipeline/config/metadata.py +0 -2
  28. datapipeline/config/project.py +0 -2
  29. datapipeline/config/resolution.py +10 -2
  30. datapipeline/config/tasks.py +9 -9
  31. datapipeline/domain/feature.py +3 -0
  32. datapipeline/domain/record.py +7 -7
  33. datapipeline/domain/sample.py +0 -2
  34. datapipeline/domain/vector.py +6 -8
  35. datapipeline/integrations/ml/adapter.py +0 -2
  36. datapipeline/integrations/ml/pandas_support.py +0 -2
  37. datapipeline/integrations/ml/rows.py +0 -2
  38. datapipeline/integrations/ml/torch_support.py +0 -2
  39. datapipeline/io/output.py +0 -2
  40. datapipeline/io/serializers.py +26 -16
  41. datapipeline/mappers/synthetic/time.py +9 -2
  42. datapipeline/pipeline/artifacts.py +3 -5
  43. datapipeline/pipeline/observability.py +0 -2
  44. datapipeline/pipeline/pipelines.py +118 -34
  45. datapipeline/pipeline/stages.py +54 -18
  46. datapipeline/pipeline/utils/spool_cache.py +142 -0
  47. datapipeline/pipeline/utils/transform_utils.py +27 -2
  48. datapipeline/services/artifacts.py +1 -4
  49. datapipeline/services/constants.py +1 -0
  50. datapipeline/services/factories.py +4 -6
  51. datapipeline/services/paths.py +10 -1
  52. datapipeline/services/project_paths.py +0 -2
  53. datapipeline/services/runs.py +0 -2
  54. datapipeline/services/scaffold/contract_yaml.py +76 -0
  55. datapipeline/services/scaffold/demo.py +141 -0
  56. datapipeline/services/scaffold/discovery.py +115 -0
  57. datapipeline/services/scaffold/domain.py +21 -13
  58. datapipeline/services/scaffold/dto.py +31 -0
  59. datapipeline/services/scaffold/filter.py +2 -1
  60. datapipeline/services/scaffold/layout.py +96 -0
  61. datapipeline/services/scaffold/loader.py +61 -0
  62. datapipeline/services/scaffold/mapper.py +116 -0
  63. datapipeline/services/scaffold/parser.py +56 -0
  64. datapipeline/services/scaffold/plugin.py +14 -2
  65. datapipeline/services/scaffold/source_yaml.py +91 -0
  66. datapipeline/services/scaffold/stream_plan.py +129 -0
  67. datapipeline/services/scaffold/utils.py +187 -0
  68. datapipeline/sources/data_loader.py +0 -2
  69. datapipeline/sources/decoders.py +49 -8
  70. datapipeline/sources/factory.py +9 -6
  71. datapipeline/sources/foreach.py +18 -3
  72. datapipeline/sources/synthetic/time/parser.py +1 -1
  73. datapipeline/sources/transports.py +10 -4
  74. datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
  75. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
  76. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
  77. datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
  78. datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
  79. datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
  80. datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
  81. datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
  82. datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
  83. datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
  84. datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
  85. datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
  86. datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
  87. datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
  88. datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
  89. datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
  90. datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
  91. datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
  92. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  93. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
  94. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
  95. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  96. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
  97. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
  98. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
  99. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  100. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
  101. datapipeline/templates/plugin_skeleton/README.md +57 -136
  102. datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
  103. datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
  104. datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
  105. datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
  106. datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
  108. datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
  109. datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
  110. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
  111. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
  112. datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
  113. datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
  114. datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
  115. datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
  116. datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
  117. datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
  118. datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
  119. datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
  120. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
  121. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  122. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
  123. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
  124. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  125. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
  126. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
  127. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +9 -11
  128. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
  129. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
  130. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
  131. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
  132. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
  133. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
  134. datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
  135. datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
  136. datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +15 -0
  137. datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
  138. datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
  139. datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
  140. datapipeline/templates/stubs/dto.py.j2 +2 -2
  141. datapipeline/templates/stubs/filter.py.j2 +1 -1
  142. datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
  143. datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
  144. datapipeline/templates/stubs/mappers/ingest.py.j2 +20 -0
  145. datapipeline/templates/stubs/parser.py.j2 +5 -1
  146. datapipeline/templates/stubs/record.py.j2 +1 -1
  147. datapipeline/templates/stubs/source.yaml.j2 +1 -1
  148. datapipeline/transforms/debug/identity.py +34 -16
  149. datapipeline/transforms/debug/lint.py +14 -11
  150. datapipeline/transforms/feature/scaler.py +5 -12
  151. datapipeline/transforms/filter.py +73 -17
  152. datapipeline/transforms/interfaces.py +58 -0
  153. datapipeline/transforms/record/floor_time.py +10 -7
  154. datapipeline/transforms/record/lag.py +8 -10
  155. datapipeline/transforms/sequence.py +2 -3
  156. datapipeline/transforms/stream/dedupe.py +5 -7
  157. datapipeline/transforms/stream/ensure_ticks.py +39 -24
  158. datapipeline/transforms/stream/fill.py +34 -25
  159. datapipeline/transforms/stream/filter.py +25 -0
  160. datapipeline/transforms/stream/floor_time.py +16 -0
  161. datapipeline/transforms/stream/granularity.py +52 -30
  162. datapipeline/transforms/stream/lag.py +17 -0
  163. datapipeline/transforms/stream/rolling.py +72 -0
  164. datapipeline/transforms/utils.py +42 -10
  165. datapipeline/transforms/vector/drop/horizontal.py +0 -3
  166. datapipeline/transforms/vector/drop/orchestrator.py +0 -3
  167. datapipeline/transforms/vector/drop/vertical.py +0 -2
  168. datapipeline/transforms/vector/ensure_schema.py +0 -2
  169. datapipeline/utils/paths.py +0 -2
  170. datapipeline/utils/placeholders.py +0 -2
  171. datapipeline/utils/rich_compat.py +0 -3
  172. datapipeline/utils/window.py +0 -2
  173. jerry_thomas-2.0.1.dist-info/METADATA +269 -0
  174. jerry_thomas-2.0.1.dist-info/RECORD +264 -0
  175. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/WHEEL +1 -1
  176. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/entry_points.txt +7 -3
  177. datapipeline/services/scaffold/mappers.py +0 -55
  178. datapipeline/services/scaffold/source.py +0 -191
  179. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
  180. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
  181. datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
  182. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
  183. datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
  184. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
  185. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
  186. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
  187. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
  188. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
  189. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
  190. datapipeline/templates/stubs/mapper.py.j2 +0 -22
  191. jerry_thomas-1.0.3.dist-info/METADATA +0 -827
  192. jerry_thomas-1.0.3.dist-info/RECORD +0 -198
  193. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/licenses/LICENSE +0 -0
  194. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,292 @@
+ from pathlib import Path
+
+ from datapipeline.config.workspace import WorkspaceContext
+ from datapipeline.cli.workspace_utils import resolve_default_project_yaml
+ from datapipeline.services.paths import pkg_root
+ from datapipeline.services.project_paths import resolve_project_yaml_path
+ from datapipeline.services.scaffold.discovery import (
+     list_domains,
+     list_mappers,
+     list_parsers,
+     list_sources,
+     list_dtos,
+ )
+ from datapipeline.services.scaffold.source_yaml import default_loader_config
+ from datapipeline.services.scaffold.layout import (
+     default_stream_id,
+     dto_class_name,
+     default_parser_name,
+     default_mapper_name,
+     dto_module_path,
+     LABEL_DTO_FOR_PARSER,
+     LABEL_DTO_FOR_MAPPER,
+     LABEL_DOMAIN_TO_MAP,
+     LABEL_MAPPER_INPUT,
+     default_mapper_name_for_identity,
+ )
+ from datapipeline.services.scaffold.stream_plan import StreamPlan, ParserPlan, MapperPlan, execute_stream_plan
+ from datapipeline.services.scaffold.utils import (
+     choose_existing_or_create,
+     choose_name,
+     choose_existing_or_create_name,
+     error_exit,
+     info,
+     pick_from_list,
+     pick_from_menu,
+     prompt_required,
+ )
+
+
+ def handle(*, plugin_root: Path | None = None, workspace: WorkspaceContext | None = None) -> None:
+     root_dir, pkg_name, _ = pkg_root(plugin_root)
+     project_yaml = resolve_default_project_yaml(
+         workspace) or resolve_project_yaml_path(root_dir)
+
+     # Shared context
+     provider = prompt_required("Provider name (e.g. nasa)")
+     dataset = prompt_required("Dataset name (e.g. weather)")
+     source_id = f"{provider}.{dataset}"
+
+     # Collected actions (execute at end)
+     create_source = False
+     create_domain_flag = False
+     create_parser_flag = False
+     create_mapper_flag = False
+     parser_create_dto = False
+     mapper_create_dto = False
+
+     dto_class = None
+     dto_module = None
+     mapper_input_class = None
+     mapper_input_module = None
+     loader_ep = None
+     loader_args = None
+     parser_ep = None
+     mapper_ep = None
+     parser_name = None
+     mapper_name = None
+     pchoice = "identity"
+
+     # Source selection (may override shared context if existing is chosen)
+     source_choice = pick_from_menu(
+         "Source:",
+         [
+             ("create", "Create new source (default)"),
+             ("existing", "Select existing source"),
+         ],
+     )
+
+     if source_choice == "existing":
+         sources = list_sources(project_yaml)
+         if not sources:
+             error_exit("No sources found. Create one first.")
+         source_id = pick_from_list("Select source:", sources)
+         parts = source_id.split(".", 1)
+         provider = parts[0] if len(parts) == 2 else provider
+         dataset = parts[1] if len(parts) == 2 else dataset
+     else:
+         source_id_default = f"{provider}.{dataset}"
+         source_id = choose_name("Source id", default=source_id_default)
+         create_source = True
+
+     # Loader selection
+     loader_ep = None
+     loader_args = {}
+     choice = pick_from_menu(
+         "Loader:",
+         [
+             ("fs", "Built-in fs"),
+             ("http", "Built-in http"),
+             ("synthetic", "Built-in synthetic"),
+             ("custom", "Custom loader"),
+         ],
+         allow_default=False,
+     )
+     if choice in {"fs", "http", "synthetic"}:
+         if choice in {"fs", "http"}:
+             fmt_options = [
+                 ("csv", "csv"),
+                 ("json", "json"),
+                 ("json-lines", "json-lines"),
+             ]
+             if choice == "fs":
+                 fmt_options.append(("pickle", "pickle"))
+             fmt = pick_from_menu(
+                 "Format:", fmt_options, allow_default=False)
+         else:
+             fmt = None
+         loader_ep, loader_args = default_loader_config(choice, fmt)
+     else:
+         loader_ep = prompt_required("Loader entrypoint")
+
+     # Parser selection
+     parsers = list_parsers(root=plugin_root)
+     if parsers:
+         pchoice = pick_from_menu(
+             "Parser:",
+             [
+                 ("create", "Create new parser (default)"),
+                 ("existing", "Select existing parser"),
+                 ("identity", "Identity parser"),
+             ],
+         )
+     else:
+         pchoice = pick_from_menu(
+             "Parser:",
+             [
+                 ("create", "Create new parser (default)"),
+                 ("identity", "Identity parser"),
+             ],
+         )
+     if pchoice == "existing":
+         parser_ep = pick_from_menu(
+             "Select parser entrypoint:",
+             [(k, k) for k in sorted(parsers.keys())],
+         )
+     elif pchoice == "create":
+         dto_default = dto_class_name(
+             f"{provider}_{dataset}") if provider and dataset else None
+         dto_class, parser_create_dto = choose_existing_or_create_name(
+             label=LABEL_DTO_FOR_PARSER,
+             existing=sorted(list_dtos(root=plugin_root).keys()),
+             create_label="Create new DTO",
+             prompt_new="DTO class name",
+             default_new=dto_default,
+         )
+         parser_name = choose_name(
+             "Parser class name",
+             default=default_parser_name(dto_class),
+         )
+         dto_module = dto_module_path(pkg_name, dto_class)
+         create_parser_flag = True
+     elif pchoice == "identity":
+         parser_ep = "identity"
+     else:
+         parser_ep = "identity"
+
+     # Domain selection
+     domain, create_domain_flag = choose_existing_or_create_name(
+         label=LABEL_DOMAIN_TO_MAP,
+         existing=list_domains(root=plugin_root),
+         create_label="Create new domain",
+         prompt_new="Domain name",
+         default_new=dataset,
+     )
+
+     # Mapper selection
+     mappers = list_mappers(root=plugin_root)
+     if mappers:
+         mchoice = pick_from_menu(
+             "Mapper:",
+             [
+                 ("create", "Create new mapper (default)"),
+                 ("existing", "Select existing mapper"),
+                 ("identity", "Identity mapper"),
+             ],
+         )
+     else:
+         mchoice = pick_from_menu(
+             "Mapper:",
+             [
+                 ("create", "Create new mapper (default)"),
+                 ("identity", "Identity mapper"),
+             ],
+         )
+     if mchoice == "existing":
+         mapper_ep = pick_from_menu(
+             "Select mapper entrypoint:",
+             [(k, k) for k in sorted(mappers.keys())],
+         )
+     elif mchoice == "create":
+         create_mapper_flag = True
+         input_choice = pick_from_menu(
+             f"{LABEL_MAPPER_INPUT}:",
+             [
+                 ("dto", "DTO (default)"),
+                 ("identity", "Any"),
+             ],
+         )
+         info("Domain output: Domain record")
+         if input_choice == "dto":
+             if not dto_class:
+                 dto_class, mapper_create_dto = choose_existing_or_create_name(
+                     label=LABEL_DTO_FOR_MAPPER,
+                     existing=sorted(list_dtos(root=plugin_root).keys()),
+                     create_label="Create new DTO",
+                     prompt_new="DTO class name",
+                     default_new=dto_class_name(f"{provider}_{dataset}"),
+                 )
+             else:
+                 mapper_create_dto = False
+             dto_module = dto_module_path(pkg_name, dto_class)
+             mapper_input_class = dto_class
+             mapper_input_module = dto_module
+         else:
+             mapper_input_module = "typing"
+             mapper_input_class = "Any"
+             mapper_create_dto = False
+         if input_choice == "identity":
+             mapper_name = choose_name(
+                 "Mapper name",
+                 default=default_mapper_name_for_identity(domain),
+             )
+         else:
+             mapper_name = choose_name(
+                 "Mapper name", default=default_mapper_name(mapper_input_module, domain))
+     elif mchoice == "identity":
+         mapper_ep = "identity"
+     else:
+         mapper_ep = "identity"
+
+     # Stream id and contract
+     default_id = default_stream_id(domain, dataset or "dataset", None)
+     stream_id = choose_name("Stream id", default=default_id)
+
+     # Build plan and execute (no side effects during selection)
+     parser_plan = None
+     if pchoice == "create":
+         parser_plan = ParserPlan(
+             create=True,
+             create_dto=parser_create_dto,
+             dto_class=dto_class,
+             dto_module=dto_module,
+             parser_name=parser_name,
+         )
+     elif pchoice == "existing":
+         parser_plan = ParserPlan(create=False, parser_ep=parser_ep)
+     else:
+         parser_plan = ParserPlan(create=False, parser_ep="identity")
+
+     mapper_plan = None
+     if mchoice == "create":
+         mapper_plan = MapperPlan(
+             create=True,
+             create_dto=mapper_create_dto,
+             input_class=mapper_input_class,
+             input_module=mapper_input_module,
+             mapper_name=mapper_name,
+             domain=domain,
+         )
+     elif mchoice == "existing":
+         mapper_plan = MapperPlan(
+             create=False, mapper_ep=mapper_ep, domain=domain)
+     else:
+         mapper_plan = MapperPlan(
+             create=False, mapper_ep="identity", domain=domain)
+
+     plan = StreamPlan(
+         provider=provider,
+         dataset=dataset,
+         source_id=source_id,
+         project_yaml=project_yaml,
+         stream_id=stream_id,
+         root=plugin_root,
+         create_source=create_source,
+         loader_ep=loader_ep,
+         loader_args=loader_args,
+         parser=parser_plan,
+         mapper=mapper_plan,
+         domain=domain,
+         create_domain=create_domain_flag,
+     )
+     execute_stream_plan(plan)
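The 292 added lines above (presumably the new cli/commands/stream.py listed in the file table) implement an interactive wizard that only collects answers and then materialises everything through a single plan object. A minimal sketch of the final plan-and-execute step with illustrative answers; only StreamPlan, ParserPlan, MapperPlan and execute_stream_plan come from the hunk, and every literal value below is an assumption:

from pathlib import Path
from datapipeline.services.scaffold.stream_plan import (
    StreamPlan, ParserPlan, MapperPlan, execute_stream_plan,
)

# Hypothetical answers a user might give the wizard: reuse the identity
# parser/mapper, create the source and domain, point at an existing project.
plan = StreamPlan(
    provider="nasa",                # illustrative
    dataset="weather",
    source_id="nasa.weather",
    project_yaml=Path("your-dataset/project.yaml"),
    stream_id="weather.nasa",       # whatever default_stream_id() would suggest
    root=None,
    create_source=True,
    loader_ep="fs",                 # assumed id; the wizard derives it via default_loader_config()
    loader_args={"format": "csv"},  # assumed shape
    parser=ParserPlan(create=False, parser_ep="identity"),
    mapper=MapperPlan(create=False, mapper_ep="identity", domain="weather"),
    domain="weather",
    create_domain=True,
)
execute_stream_plan(plan)           # all file writes happen here, not during the prompts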
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from pathlib import Path
  import logging
  import os

@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from pathlib import Path
  from typing import Optional, Tuple
 

@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from pathlib import Path
 
  from datapipeline.config.workspace import WorkspaceContext

@@ -22,4 +20,3 @@ def resolve_default_project_yaml(workspace: WorkspaceContext | None) -> Path | None:
              f"Unknown default_dataset '{alias}'. Define it under datasets: in jerry.yaml."
          )
      return resolved
-

@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from dataclasses import dataclass
  from pathlib import Path
  from typing import Optional, Sequence

@@ -9,5 +9,6 @@ class BaseRecordConfig(BaseModel):
 
  class FeatureRecordConfig(BaseRecordConfig):
      id: str
+     field: str
      scale: Optional[Union[bool, Mapping[str, Any]]] = Field(default=False)
      sequence: Optional[Mapping[str, Any]] = Field(default=None)

@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from datetime import datetime
  from typing import Any, Dict, List, Optional
 

@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from datetime import datetime
  from typing import Optional
  from pydantic import BaseModel, Field, ConfigDict

@@ -1,11 +1,10 @@
- from __future__ import annotations
-
  import logging
  from dataclasses import dataclass
  from pathlib import Path
  from typing import Any, Optional
 
  from datapipeline.config.tasks import ServeOutputConfig
+ from datapipeline.io.output import OutputResolutionError
  from datapipeline.config.workspace import WorkspaceContext
 
 

@@ -113,6 +112,15 @@ def workspace_output_defaults(
      if not serve_defaults or not serve_defaults.output:
          return None
      od = serve_defaults.output
+     transport = str(od.transport).lower() if od.transport is not None else None
+     if transport == "fs" and not od.directory:
+         raise OutputResolutionError(
+             "fs output requires a directory. Example:\n"
+             "  output:\n"
+             "    transport: fs\n"
+             "    format: json-lines\n"
+             "    directory: ./data/processed/jerry"
+         )
      output_dir = None
      if od.directory:
          candidate = Path(od.directory)

@@ -1,7 +1,5 @@
- from __future__ import annotations
-
  from pathlib import Path
- from typing import Annotated, Iterable, List, Literal, Sequence
+ from typing import Annotated, Literal, Sequence
 
  from pydantic import BaseModel, Field, field_validator, model_validator
  from pydantic.type_adapter import TypeAdapter

@@ -21,9 +19,10 @@ PayloadMode = Literal["sample", "vector"]
  class TaskBase(BaseModel):
      version: int = Field(default=1)
      kind: str
-     name: str | None = Field(default=None, description="Optional task identifier.")
-     enabled: bool = Field(default=True, description="Disable to skip execution.")
-     depends_on: list[str] = Field(default_factory=list)
+     name: str | None = Field(
+         default=None, description="Optional task identifier.")
+     enabled: bool = Field(
+         default=True, description="Disable to skip execution.")
      source_path: Path | None = Field(default=None, exclude=True)
 
      def effective_name(self) -> str:

@@ -78,7 +77,8 @@ class RuntimeTask(TaskBase):
 
  class ServeOutputConfig(BaseModel):
      transport: Transport = Field(..., description="fs | stdout")
-     format: Format = Field(..., description="csv | json | json-lines | print | pickle")
+     format: Format = Field(...,
+                            description="csv | json | json-lines | print | pickle")
      payload: PayloadMode = Field(
          default="sample",
          description="sample (key + metadata) or vector payload (features [+targets]).",

@@ -151,9 +151,9 @@ class ServeTask(RuntimeTask):
      )
      stage: int | None = Field(
          default=None,
-         description="Default pipeline stage preview (0-7).",
+         description="Default pipeline stage preview (0-8).",
          ge=0,
-         le=7,
+         le=8,
      )
      throttle_ms: float | None = Field(
          default=None,

@@ -1,5 +1,6 @@
  from datapipeline.domain.record import TemporalRecord
  from dataclasses import dataclass
+ from typing import Any
 
 
  @dataclass

@@ -10,8 +11,10 @@ class BaseFeature:
  @dataclass
  class FeatureRecord(BaseFeature):
      record: TemporalRecord
+     value: Any
 
 
  @dataclass
  class FeatureRecordSequence(BaseFeature):
      records: list[TemporalRecord]
+     values: list[Any]

@@ -1,6 +1,5 @@
- from dataclasses import dataclass, asdict
+ from dataclasses import dataclass
  from datetime import datetime, timezone
- from typing import Any
 
 
  @dataclass

@@ -13,7 +12,6 @@ class TemporalRecord(Record):
      """Canonical time-series payload used throughout the pipeline."""
 
      time: datetime
-     value: Any
 
      def __post_init__(self) -> None:
          if self.time.tzinfo is None:

@@ -21,10 +19,13 @@
          self.time = self.time.astimezone(timezone.utc)
 
      def _identity_fields(self) -> dict:
-         """Return a mapping of domain fields excluding 'time' and 'value'."""
-         data = asdict(self)
+         """Return a mapping of domain fields excluding 'time'."""
+         data = {
+             key: value
+             for key, value in self.__dict__.items()
+             if not key.startswith("_")
+         }
          data.pop("time", None)
-         data.pop("value", None)
          return data
 
      def __eq__(self, other: object) -> bool:

@@ -34,6 +35,5 @@ class TemporalRecord(Record):
              return NotImplemented
          return (
              self.time == other.time
-             and self.value == other.value
              and self._identity_fields() == other._identity_fields()
          )
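Taken together, the feature.py and record.py hunks move the per-feature value off the domain record: TemporalRecord keeps only time plus whatever fields a concrete record declares, while FeatureRecord/FeatureRecordSequence now carry the extracted value(s), and equality compares time plus every remaining public field. A small sketch under those rules (EquityRecord and its fields are hypothetical):

from dataclasses import dataclass
from datetime import datetime, timezone

from datapipeline.domain.record import TemporalRecord

@dataclass
class EquityRecord(TemporalRecord):      # hypothetical domain record
    symbol: str
    close: float

a = EquityRecord(time=datetime(2024, 1, 2, tzinfo=timezone.utc), symbol="MSFT", close=376.04)
b = EquityRecord(time=datetime(2024, 1, 2, tzinfo=timezone.utc), symbol="MSFT", close=376.04)

# __eq__ now compares time plus all public fields (symbol, close); there is
# no generic .value attribute on the record any more - feature values travel
# on FeatureRecord(..., value=...) / FeatureRecordSequence(..., values=[...]).
assert a == b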
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from dataclasses import dataclass, asdict
  from typing import Any, Iterator, Optional, Literal
 

@@ -1,6 +1,4 @@
- from datapipeline.domain.record import TemporalRecord
- from typing import Dict
- from typing import Union
+ from typing import Dict, Union, Any
 
  from dataclasses import dataclass
 

@@ -25,13 +23,13 @@ class Vector:
          return self.values[key]
 
 
- def vectorize_record_group(values: Dict[str, list[TemporalRecord]]) -> Vector:
+ def vectorize_record_group(values: Dict[str, list[Any]]) -> Vector:
      structured: Dict[str, Union[float, list[float]]] = {}
 
-     for key, records in values.items():
-         if len(records) == 1:
-             structured[key] = records[0].value
+     for key, items in values.items():
+         if len(items) == 1:
+             structured[key] = items[0]
          else:
-             structured[key] = [r.value for r in records]
+             structured[key] = list(items)
 
      return Vector(values=structured)
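With values no longer stored on records, vectorize_record_group now takes plain values grouped by feature id instead of TemporalRecord lists. A quick sketch of the new contract (the feature ids are illustrative; the import path follows the file shown in the diff):

from datapipeline.domain.vector import vectorize_record_group

vec = vectorize_record_group({
    "equity.close": [376.04],          # a single value is stored as a scalar
    "time.hour_sin": [0.0, 0.5, 1.0],  # multiple values stay a list
})
assert vec["equity.close"] == 376.04
assert vec["time.hour_sin"] == [0.0, 0.5, 1.0]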
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from collections.abc import Iterator, Sequence
  from dataclasses import dataclass
  from itertools import islice

@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from collections.abc import Callable, Iterable
  from typing import Any
 

@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from collections.abc import Iterator, Sequence
  from pathlib import Path
  from typing import Any

@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from collections.abc import Sequence
  from pathlib import Path
  from typing import Any, Mapping

datapipeline/io/output.py CHANGED

@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from dataclasses import dataclass
  from pathlib import Path
  from typing import Optional

@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  import json
  from dataclasses import asdict, is_dataclass
  from typing import Any, Dict, Type

@@ -107,20 +105,32 @@ class VectorPickleSerializer(BasePickleSerializer):
 
 
  def _record_payload(value: Any) -> Any:
-     if value is None:
-         return None
-     if is_dataclass(value):
-         return asdict(value)
-     if isinstance(value, dict):
-         return value
-     attrs = getattr(value, "__dict__", None)
-     if attrs:
-         return {
-             k: v
-             for k, v in attrs.items()
-             if not k.startswith("_")
-         }
-     return value
+     def _convert(obj: Any) -> Any:
+         if obj is None:
+             return None
+         if is_dataclass(obj):
+             attrs = getattr(obj, "__dict__", None)
+             if attrs is not None:
+                 return {
+                     k: _convert(v)
+                     for k, v in attrs.items()
+                     if not k.startswith("_")
+                 }
+             return asdict(obj)
+         if isinstance(obj, dict):
+             return {k: _convert(v) for k, v in obj.items()}
+         if isinstance(obj, (list, tuple)):
+             return [_convert(v) for v in obj]
+         attrs = getattr(obj, "__dict__", None)
+         if attrs:
+             return {
+                 k: _convert(v)
+                 for k, v in attrs.items()
+                 if not k.startswith("_")
+             }
+         return obj
+
+     return _convert(value)
 
 
  def _record_key(value: Any) -> Any:
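The serializer change swaps the single-level unpacking for a recursive _convert, so nested dataclasses, mappings, and sequences are normalised uniformly and underscore-prefixed attributes are dropped at every level. A rough illustration of the intended behaviour (Quote and Bar are made-up types; _record_payload is the module-private helper from the hunk above):

from dataclasses import dataclass, field

from datapipeline.io.serializers import _record_payload   # private helper shown above

@dataclass
class Quote:          # hypothetical nested payload
    bid: float
    ask: float

@dataclass
class Bar:
    symbol: str
    last: Quote
    history: list[Quote] = field(default_factory=list)

payload = _record_payload(Bar("MSFT", Quote(376.0, 376.1), [Quote(375.9, 376.0)]))
# Expected shape: nested dataclasses become plain dicts, lists are converted element-wise:
# {"symbol": "MSFT", "last": {"bid": 376.0, "ask": 376.1},
#  "history": [{"bid": 375.9, "ask": 376.0}]}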
@@ -1,9 +1,16 @@
- from typing import Iterator
+ from dataclasses import dataclass
  from datetime import datetime
  from math import sin, pi
+ from typing import Iterator
+
  from datapipeline.domain.record import TemporalRecord
 
 
+ @dataclass
+ class TimeEncodedRecord(TemporalRecord):
+     value: float
+
+
  def encode(stream: Iterator[TemporalRecord], mode: str) -> Iterator[TemporalRecord]:
      for rec in stream:
          t: datetime = rec.time

@@ -15,4 +22,4 @@ def encode(stream: Iterator[TemporalRecord], mode: str) -> Iterator[TemporalRecord]:
              val = t.timestamp()
          else:
              raise ValueError(f"Unsupported encode_time mode: {mode}")
-         yield TemporalRecord(time=rec.time, value=val)
+         yield TimeEncodedRecord(time=rec.time, value=val)
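Because TemporalRecord itself no longer carries a value, the synthetic time mapper now yields its own TimeEncodedRecord subclass holding the encoded float. A short usage sketch (the "hour_sin" mode string is inferred from the template contract names and may differ; only encode and TimeEncodedRecord come from the hunk):

from datetime import datetime, timezone

from datapipeline.domain.record import TemporalRecord
from datapipeline.mappers.synthetic.time import TimeEncodedRecord, encode

ticks = (TemporalRecord(time=datetime(2024, 1, 1, h, tzinfo=timezone.utc)) for h in (0, 6, 12))
for rec in encode(ticks, mode="hour_sin"):   # assumed mode name
    assert isinstance(rec, TimeEncodedRecord)
    print(rec.time, rec.value)               # the sine-encoded hour rides on the subclass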
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from dataclasses import dataclass
  from typing import Iterable
 

@@ -43,12 +41,12 @@ def required_artifacts_for(
      needs_metadata = False
      for demand in demands:
          stage = demand.stage
-         effective_stage = 7 if stage is None else stage
+         effective_stage = 8 if stage is None else stage
 
-         if effective_stage >= 5 and _requires_scaler(dataset):
+         if effective_stage >= 6 and _requires_scaler(dataset):
              required.add(SCALER_STATISTICS)
 
-         if effective_stage >= 6:
+         if effective_stage >= 7:
              required.add(VECTOR_SCHEMA)
              needs_metadata = True
 
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  import logging
  from dataclasses import dataclass
  from typing import Callable, Mapping, Optional, Protocol, runtime_checkable