jerry-thomas 1.0.3__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. datapipeline/analysis/vector/collector.py +0 -1
  2. datapipeline/build/tasks/config.py +0 -2
  3. datapipeline/build/tasks/metadata.py +0 -2
  4. datapipeline/build/tasks/scaler.py +0 -2
  5. datapipeline/build/tasks/schema.py +0 -2
  6. datapipeline/build/tasks/utils.py +0 -2
  7. datapipeline/cli/app.py +201 -81
  8. datapipeline/cli/commands/contract.py +145 -283
  9. datapipeline/cli/commands/demo.py +13 -0
  10. datapipeline/cli/commands/domain.py +4 -4
  11. datapipeline/cli/commands/dto.py +11 -0
  12. datapipeline/cli/commands/filter.py +2 -2
  13. datapipeline/cli/commands/inspect.py +0 -68
  14. datapipeline/cli/commands/list_.py +30 -13
  15. datapipeline/cli/commands/loader.py +11 -0
  16. datapipeline/cli/commands/mapper.py +82 -0
  17. datapipeline/cli/commands/parser.py +45 -0
  18. datapipeline/cli/commands/run_config.py +1 -3
  19. datapipeline/cli/commands/serve_pipeline.py +5 -7
  20. datapipeline/cli/commands/source.py +106 -18
  21. datapipeline/cli/commands/stream.py +292 -0
  22. datapipeline/cli/visuals/common.py +0 -2
  23. datapipeline/cli/visuals/sections.py +0 -2
  24. datapipeline/cli/workspace_utils.py +0 -3
  25. datapipeline/config/context.py +0 -2
  26. datapipeline/config/dataset/feature.py +1 -0
  27. datapipeline/config/metadata.py +0 -2
  28. datapipeline/config/project.py +0 -2
  29. datapipeline/config/resolution.py +10 -2
  30. datapipeline/config/tasks.py +9 -9
  31. datapipeline/domain/feature.py +3 -0
  32. datapipeline/domain/record.py +7 -7
  33. datapipeline/domain/sample.py +0 -2
  34. datapipeline/domain/vector.py +6 -8
  35. datapipeline/integrations/ml/adapter.py +0 -2
  36. datapipeline/integrations/ml/pandas_support.py +0 -2
  37. datapipeline/integrations/ml/rows.py +0 -2
  38. datapipeline/integrations/ml/torch_support.py +0 -2
  39. datapipeline/io/output.py +0 -2
  40. datapipeline/io/serializers.py +26 -16
  41. datapipeline/mappers/synthetic/time.py +9 -2
  42. datapipeline/pipeline/artifacts.py +3 -5
  43. datapipeline/pipeline/observability.py +0 -2
  44. datapipeline/pipeline/pipelines.py +118 -34
  45. datapipeline/pipeline/stages.py +54 -18
  46. datapipeline/pipeline/utils/spool_cache.py +142 -0
  47. datapipeline/pipeline/utils/transform_utils.py +27 -2
  48. datapipeline/services/artifacts.py +1 -4
  49. datapipeline/services/constants.py +1 -0
  50. datapipeline/services/factories.py +4 -6
  51. datapipeline/services/paths.py +10 -1
  52. datapipeline/services/project_paths.py +0 -2
  53. datapipeline/services/runs.py +0 -2
  54. datapipeline/services/scaffold/contract_yaml.py +76 -0
  55. datapipeline/services/scaffold/demo.py +141 -0
  56. datapipeline/services/scaffold/discovery.py +115 -0
  57. datapipeline/services/scaffold/domain.py +21 -13
  58. datapipeline/services/scaffold/dto.py +31 -0
  59. datapipeline/services/scaffold/filter.py +2 -1
  60. datapipeline/services/scaffold/layout.py +96 -0
  61. datapipeline/services/scaffold/loader.py +61 -0
  62. datapipeline/services/scaffold/mapper.py +116 -0
  63. datapipeline/services/scaffold/parser.py +56 -0
  64. datapipeline/services/scaffold/plugin.py +14 -2
  65. datapipeline/services/scaffold/source_yaml.py +91 -0
  66. datapipeline/services/scaffold/stream_plan.py +129 -0
  67. datapipeline/services/scaffold/utils.py +187 -0
  68. datapipeline/sources/data_loader.py +0 -2
  69. datapipeline/sources/decoders.py +49 -8
  70. datapipeline/sources/factory.py +9 -6
  71. datapipeline/sources/foreach.py +18 -3
  72. datapipeline/sources/synthetic/time/parser.py +1 -1
  73. datapipeline/sources/transports.py +10 -4
  74. datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
  75. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
  76. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
  77. datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
  78. datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
  79. datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
  80. datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
  81. datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
  82. datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
  83. datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
  84. datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
  85. datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
  86. datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
  87. datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
  88. datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
  89. datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
  90. datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
  91. datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
  92. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  93. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
  94. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
  95. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  96. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
  97. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
  98. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
  99. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  100. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
  101. datapipeline/templates/plugin_skeleton/README.md +57 -136
  102. datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
  103. datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
  104. datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
  105. datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
  106. datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
  108. datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
  109. datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
  110. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
  111. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
  112. datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
  113. datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
  114. datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
  115. datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
  116. datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
  117. datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
  118. datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
  119. datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
  120. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
  121. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  122. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
  123. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
  124. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  125. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
  126. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
  127. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +9 -11
  128. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
  129. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
  130. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
  131. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
  132. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
  133. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
  134. datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
  135. datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
  136. datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +15 -0
  137. datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
  138. datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
  139. datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
  140. datapipeline/templates/stubs/dto.py.j2 +2 -2
  141. datapipeline/templates/stubs/filter.py.j2 +1 -1
  142. datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
  143. datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
  144. datapipeline/templates/stubs/mappers/ingest.py.j2 +20 -0
  145. datapipeline/templates/stubs/parser.py.j2 +5 -1
  146. datapipeline/templates/stubs/record.py.j2 +1 -1
  147. datapipeline/templates/stubs/source.yaml.j2 +1 -1
  148. datapipeline/transforms/debug/identity.py +34 -16
  149. datapipeline/transforms/debug/lint.py +14 -11
  150. datapipeline/transforms/feature/scaler.py +5 -12
  151. datapipeline/transforms/filter.py +73 -17
  152. datapipeline/transforms/interfaces.py +58 -0
  153. datapipeline/transforms/record/floor_time.py +10 -7
  154. datapipeline/transforms/record/lag.py +8 -10
  155. datapipeline/transforms/sequence.py +2 -3
  156. datapipeline/transforms/stream/dedupe.py +5 -7
  157. datapipeline/transforms/stream/ensure_ticks.py +39 -24
  158. datapipeline/transforms/stream/fill.py +34 -25
  159. datapipeline/transforms/stream/filter.py +25 -0
  160. datapipeline/transforms/stream/floor_time.py +16 -0
  161. datapipeline/transforms/stream/granularity.py +52 -30
  162. datapipeline/transforms/stream/lag.py +17 -0
  163. datapipeline/transforms/stream/rolling.py +72 -0
  164. datapipeline/transforms/utils.py +42 -10
  165. datapipeline/transforms/vector/drop/horizontal.py +0 -3
  166. datapipeline/transforms/vector/drop/orchestrator.py +0 -3
  167. datapipeline/transforms/vector/drop/vertical.py +0 -2
  168. datapipeline/transforms/vector/ensure_schema.py +0 -2
  169. datapipeline/utils/paths.py +0 -2
  170. datapipeline/utils/placeholders.py +0 -2
  171. datapipeline/utils/rich_compat.py +0 -3
  172. datapipeline/utils/window.py +0 -2
  173. jerry_thomas-2.0.1.dist-info/METADATA +269 -0
  174. jerry_thomas-2.0.1.dist-info/RECORD +264 -0
  175. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/WHEEL +1 -1
  176. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/entry_points.txt +7 -3
  177. datapipeline/services/scaffold/mappers.py +0 -55
  178. datapipeline/services/scaffold/source.py +0 -191
  179. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
  180. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
  181. datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
  182. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
  183. datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
  184. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
  185. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
  186. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
  187. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
  188. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
  189. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
  190. datapipeline/templates/stubs/mapper.py.j2 +0 -22
  191. jerry_thomas-1.0.3.dist-info/METADATA +0 -827
  192. jerry_thomas-1.0.3.dist-info/RECORD +0 -198
  193. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/licenses/LICENSE +0 -0
  194. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,10 @@
+ # Schema task reference (all options).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # version: 1 # optional
+ # kind: schema
+ # name: schema # optional (defaults to filename stem)
+ # enabled: true # optional
+ #
+ # output: schema.json # optional; relative to project.paths.artifacts
+ # cadence_strategy: max # optional; currently only "max"
@@ -0,0 +1,28 @@
+ # Serve task reference (all options).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # version: 1 # optional
+ # kind: serve
+ # name: train # optional (defaults to filename stem)
+ # enabled: true # optional
+ #
+ # keep: train # optional; split label from globals.split (null disables filtering)
+ #
+ # output: # optional; omit to use CLI defaults
+ #   transport: stdout # stdout | fs
+ #   format: json-lines # stdout: print | json-lines | json
+ #   payload: sample # sample | vector
+ #   # fs transport only:
+ #   # transport: fs
+ #   # format: csv # csv | json | json-lines | pickle
+ #   # payload: vector
+ #   # directory: artifacts/serve
+ #   # filename: vectors.train # no extension, no path separators
+ #
+ # limit: 100 # optional; null = unlimited
+ # stage: 8 # optional; 0-8; null lets CLI decide
+ # throttle_ms: 0 # optional; milliseconds; null disables
+ #
+ # log_level: INFO # optional; CRITICAL | ERROR | WARNING | INFO | DEBUG
+ # visuals: AUTO # optional; AUTO | TQDM | RICH | OFF (false -> OFF)
+ # progress: AUTO # optional; AUTO | SPINNER | BARS | OFF
@@ -0,0 +1,2 @@
+ """Domain record models live under this package."""
+
@@ -0,0 +1 @@
+ """Stream mappers (DTO -> domain records)."""
@@ -1,18 +1,19 @@
+ # See ../reference/reference/dataset.yaml for full options.
+
  group_by: ${group_by}

  features:
-   - id: time_linear
-     record_stream: time.ticks.linear
-     scale: true # optionally add with_mean/with_std overrides
-     # Sliding window over the regularized stream; cadence is enforced in the contract.
+   - id: first_feature
+     record_stream: your.stream.one
+     field: some_field
+     scale: true
      sequence: { size: 6, stride: 1 }

-   - id: time_hour_sin
-     record_stream: time.ticks.hour_sin
+   - id: second_feature
+     record_stream: your.stream.two
+     field: some_field

- # - id: third_feature
- # record_stream: anotherstream
  # targets:
- # - id: some_target
- # record_stream: time.ticks.linear
-
+ # - id: target_feature
+ # record_stream: your.target.stream
+ # field: some_field
@@ -1,16 +1,15 @@
- #### example combination of postprocessing steps ######
- #### making sure data is complete after these combinations ######
- - drop: # example of dropping sparse partitions/vertical-axis for targets
+ # See ../reference/reference/postprocess.yaml for full options.
+ - drop:
      axis: vertical
      payload: targets
      threshold: 0.9

- - drop: # example of dropping sparse partitions for features
+ - drop:
      axis: vertical
      payload: features
      threshold: 0.9

- - drop: # dropping vectors/horizontal-axis that has features which none
+ - drop:
      axis: horizontal
      payload: features
      threshold: 1
@@ -19,11 +18,3 @@
      axis: horizontal
      payload: targets
      threshold: 1
- ######
- # - fill:
- # statistic: median
- # window: 48
- # min_samples: 6
- # - replace:
- # payload: targets
- # value: 0.0
@@ -1,3 +1,4 @@
+ # See ../reference/reference/project.yaml for full options.
  version: 1
  name: <your-dataset>
  paths:
@@ -7,16 +8,13 @@ paths:
    postprocess: postprocess.yaml
    artifacts: ../artifacts/${project_name}/v${version}
    tasks: ./tasks
- globals: # Globals to use in your .yaml files via ${var_name}.
-   # Primary dataset cadence; referenced from dataset.yaml (group_by)
-   # and contracts via ${group_by}.
-   group_by: <your-bucket-cadence>
-   start_time: null #2021-01-01T00:00:00Z
-   end_time: null #2021-01-02T00:00:00Z
-   # Configure deterministic dataset split here (applied at serve time, after postprocess).
-   # Adjust `ratios` as needed; the active split is selected via serve tasks or CLI.
+ globals:
+   # TODO: Set your grouping cadence (must match ^\d+(m|min|h|d)$).
+   group_by: 1h
+   start_time: null
+   end_time: null
    split:
-     mode: hash # hash | time (time uses boundaries/labels)
-     key: group # group | feature:<id> (entity-stable split)
-     seed: 42 # deterministic hash seed
+     mode: hash
+     key: group
+     seed: 42
      ratios: { train: 0.8, val: 0.1, test: 0.1 }
@@ -1,3 +1,2 @@
+ # See ../../reference/reference/tasks/metadata.reference.yaml for full options.
  kind: metadata
- # window_mode: intersection # union|intersection|strict|relaxed (default: intersection)
-
@@ -1,9 +1,3 @@
+ # See ../../reference/reference/tasks/scaler.reference.yaml for full options.
  kind: scaler
-
- # Output path is relative to project.paths.artifacts; defaults to "scaler.pkl".
- # output: scaler.pkl
-
- # Split label to use when fitting scaler statistics.
- # Must match a label from globals.split.ratios.
  split_label: train
-
@@ -1,2 +1,2 @@
+ # See ../../reference/reference/tasks/schema.reference.yaml for full options.
  kind: schema
-
@@ -1,4 +1,4 @@
+ # See ../../reference/reference/tasks/serve.reference.yaml for full options.
  kind: serve
  name: test
  keep: test
-
@@ -1,28 +1,4 @@
+ # See ../../reference/reference/tasks/serve.reference.yaml for full options.
  kind: serve
-
- # Optional identifier for this serve task; defaults to filename stem.
  name: train
-
- # Active split label to serve; must match a label from globals.split.ratios.
- # Set to null to disable split filtering.
  keep: train
- #output:
- # transport: stdout | fs
- # format: print | json-lines | json | csv | pickle
- # When using fs transport, set a directory (and optionally filename) for outputs:
- # directory: artifacts/serve
- # filename: vectors.train
-
- # Default max number of vectors to emit (null = unlimited).
- # limit: 5
- # Optional pipeline stage preview (0-7); null lets the CLI decide.
- # stage: 7
-
- # Optional pacing between emitted vectors (milliseconds).
- # throttle_ms: null
-
- # Visuals/logging knobs (inherit CLI or jerry.yaml defaults when omitted):
- # visuals: AUTO # AUTO | TQDM | RICH | OFF
- # progress: AUTO # AUTO | SPINNER | BARS | OFF
- # log_level: INFO # CRITICAL | ERROR | WARNING | INFO | DEBUG
-
@@ -1,4 +1,4 @@
+ # See ../../reference/reference/tasks/serve.reference.yaml for full options.
  kind: serve
  name: val
  keep: val
-
@@ -0,0 +1,9 @@
+ # Early-stage dataset definition (record/feature prep only).
+ # Populate record_stream entries with your canonical stream ids.
+
+ group_by: ${group_by}
+
+ features:
+   - id: first_feature
+     record_stream: your.stream.one
+     field: some_field
@@ -0,0 +1,15 @@
+ # See ../reference/reference/project.yaml for full options.
+ version: 1
+ name: <your-interim-data-builder>
+ paths:
+   streams: ./contracts
+   sources: ./sources
+   dataset: dataset.yaml
+   postprocess: postprocess.yaml
+   artifacts: ../artifacts/${project_name}/v${version}
+   tasks: ./tasks
+ globals:
+   # TODO: Set your grouping cadence (must match ^\d+(m|min|h|d)$).
+   group_by: 1h
+   start_time: null
+   end_time: null
@@ -0,0 +1,8 @@
+ # See ../../reference/reference/tasks/serve.reference.yaml for full options.
+ kind: serve
+ name: all
+ keep: null
+ output:
+   transport: fs
+   format: json-lines
+   directory: data/interim/jerry
@@ -0,0 +1,10 @@
+ kind: composed
+ id: {{ stream_id }} # format: domain.dataset.(variant)
+ # cadence: ${group_by} # optional per-contract cadence
+ # partition_by: <field or [fields]>
+ inputs:
+   - {{ inputs_list }}
+
+ mapper:
+   entrypoint: {{ mapper_entrypoint }}
+   args: { driver: {{ driver_key }} }
@@ -0,0 +1,25 @@
+ kind: ingest
+ source: {{ source }}
+ id: {{ stream_id }} # format: domain.dataset.(variant)
+
+ mapper:
+   entrypoint: {{ mapper_entrypoint }}
+   args: {}
+
+ cadence: ${group_by} # optional per-contract cadence
+ # partition_by: <field or [fields]>
+ # sort_batch_size: 100000 # in-memory sort chunk size
+
+ record: # record-level transforms
+   - filter: { field: time, operator: ge, comparand: "${start_time}" }
+   - filter: { field: time, operator: le, comparand: "${end_time}" }
+   - floor_time: { cadence: "${cadence}" }
+   # - lag: { lag: 10m }
+
+ stream: # per-stream transforms (input sorted by partition,time)
+   - ensure_cadence: { field: some_field, to: some_field, cadence: "${cadence}" }
+   - granularity: { field: some_field, to: some_field, mode: first }
+   # - fill: { field: some_field, to: some_field, statistic: median, window: 6, min_samples: 1 }
+
+ debug: # optional validation-only checks
+   - lint: { mode: warn, tick: "${cadence}" }
@@ -5,7 +5,7 @@ from datetime import datetime
  @dataclass
  class {{CLASS_NAME}}:
      """
-     Data Transfer Object (DTO) for the '{{DOMAIN}}' source.
+     Data Transfer Object (DTO) for the '{{DOMAIN}}' records.

      Purpose
      - Represents the raw, source-shaped data emitted by the loader + parser.
@@ -24,4 +24,4 @@ class {{CLASS_NAME}}:
      # currency: str
      """
      # TODO: define fields matching the '{{DOMAIN}}' source schema
-     raise NotImplementedError(f"Define fields for the DTO '{{CLASS_NAME}}'")
+     pass
@@ -11,6 +11,6 @@ def {{ FUNCTION_NAME }}(stream: Iterator[Any], field: str, target: Any) -> Itera
      Replace the condition below with your own logic.
      """
      for record in stream:
+         # TODO: implement filter logic
          # Example: pass through everything
          yield record
-
@@ -0,0 +1,11 @@
+ from typing import Iterator, Any
+
+ from datapipeline.sources.models.loader import DataLoader
+
+
+ class {{CLASS_NAME}}(DataLoader):
+     """Custom loader stub. Yield raw items to be parsed by a parser."""
+
+     def __iter__(self) -> Iterator[Any]:
+         # TODO: implement data loading
+         yield from ()
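
For orientation, a minimal sketch of a filled-in loader, assuming only the DataLoader interface the stub above shows (an __iter__ that yields raw items) and that the base class needs no other hooks. The constructor argument and the JSON Lines format are illustrative choices, not part of the package:

    import json
    from pathlib import Path
    from typing import Any, Iterator

    from datapipeline.sources.models.loader import DataLoader


    class JsonlFileLoader(DataLoader):
        """Hypothetical loader: yield one dict per line of a JSON Lines file."""

        def __init__(self, path: str) -> None:
            self.path = Path(path)

        def __iter__(self) -> Iterator[Any]:
            # Each yielded item is a raw record for the configured parser to type.
            with self.path.open("r", encoding="utf-8") as handle:
                for line in handle:
                    if line.strip():
                        yield json.loads(line)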
@@ -0,0 +1,13 @@
+ from typing import Iterator, Mapping
+
+ from datapipeline.domain.record import TemporalRecord
+
+
+ def mapper(
+     inputs: Mapping[str, Iterator[TemporalRecord]],
+     *, driver: str | None = None, aux: Mapping[str, Iterator[TemporalRecord]] | None = None, context=None, **params
+ ) -> Iterator[TemporalRecord]:
+     # TODO: implement domain math; inputs are ordered/regularized; aux is raw
+     key = driver or next(iter(inputs.keys()))
+     for rec in inputs[key]:
+         yield rec # replace with your dataclass and computation
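
As a rough illustration of how this stub might be completed for a two-input composition: the EquitySpread dataclass and the close attribute below are hypothetical, and a real mapper would yield the project's own domain record type rather than an ad-hoc class:

    from dataclasses import dataclass
    from datetime import datetime
    from typing import Iterator, Mapping

    from datapipeline.domain.record import TemporalRecord


    @dataclass
    class EquitySpread:  # hypothetical output record
        time: datetime
        spread: float


    def spread_mapper(
        inputs: Mapping[str, Iterator[TemporalRecord]],
        *, driver: str | None = None, aux=None, context=None, **params,
    ) -> Iterator[EquitySpread]:
        # Assumes exactly two inputs; the driver stream leads the iteration.
        key = driver or next(iter(inputs.keys()))
        other_key = next(k for k in inputs if k != key)
        # Buffer the secondary stream by timestamp (inputs arrive ordered/regularized).
        other_by_time = {rec.time: rec for rec in inputs[other_key]}
        for rec in inputs[key]:
            match = other_by_time.get(rec.time)
            if match is None:
                continue  # no aligned record at this tick
            # `close` is a hypothetical field on both input records.
            yield EquitySpread(time=rec.time, spread=rec.close - match.close)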
@@ -0,0 +1,20 @@
+ from typing import Any, Iterator
+
+ from {{DOMAIN_MODULE}} import {{DOMAIN_RECORD}}
+ {% if INPUT_IMPORT != "typing" %}from {{INPUT_IMPORT}} import {{INPUT_CLASS}}
+ {% endif %}
+
+
+ def {{FUNCTION_NAME}}(
+     stream: Iterator[{{INPUT_CLASS}}],
+     **params: Any,
+ ) -> Iterator[{{DOMAIN_RECORD}}]:
+     """Map {{INPUT_CLASS}} records to domain-level {{DOMAIN_RECORD}} records."""
+     raise NotImplementedError(
+         "Implement mapper logic for {{INPUT_CLASS}} -> {{DOMAIN_RECORD}}"
+     )
+     for record in stream:
+         # TODO: construct {{DOMAIN_RECORD}} from record fields
+         yield {{DOMAIN_RECORD}}(
+             time=record.time, # required
+         )
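
For comparison, a sketch of what a rendered and completed ingest mapper might look like; the DTO and domain record classes are hypothetical stand-ins for the scaffold-generated ones:

    from dataclasses import dataclass
    from datetime import datetime
    from typing import Any, Iterator


    @dataclass
    class OhlcvDTO:  # hypothetical source-shaped record
        time: datetime
        close: float
        symbol: str


    @dataclass
    class EquityRecord:  # hypothetical domain record; `time` is the required field
        time: datetime
        close: float
        symbol: str


    def map_ohlcv_dto_to_equity(
        stream: Iterator[OhlcvDTO],
        **params: Any,
    ) -> Iterator[EquityRecord]:
        """Map source-shaped DTOs to domain-level records, field by field."""
        for record in stream:
            yield EquityRecord(
                time=record.time,  # required by the domain base record
                close=record.close,
                symbol=record.symbol,
            )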
@@ -2,7 +2,11 @@ from typing import Any

  from datapipeline.sources.models.parser import DataParser

+ {% if DTO_IMPORT -%}
+ from {{DTO_IMPORT}} import {{DTO_CLASS}}
+ {% else -%}
  from .dto import {{DTO_CLASS}}
+ {% endif %}


  class {{CLASS_NAME}}(DataParser[{{DTO_CLASS}}]):
@@ -18,4 +22,4 @@ class {{CLASS_NAME}}(DataParser[{{DTO_CLASS}}]):
      # return {{DTO_CLASS}}(
      #     ... map fields from `raw` ...
      # )
-     raise NotImplementedError
+     raise NotImplementedError("Implement parser logic for {{DTO_CLASS}}")
@@ -9,11 +9,11 @@ class {{CLASS_NAME}}({{PARENT_CLASS}}):
      Domain record for '{{DOMAIN}}'.

      Required fields inherited from the base:
-     - value: main payload used to model records value by pipelines (numeric or categorical)
      - time: tz-aware datetime (always required)

      Add any additional fields you need for filtering/partitioning/grouping.
      """
+     # TODO: Add domain fields for filtering/partitioning/grouping.
      # Example extra fields (uncomment and adapt):
      # region: str # e.g. 'us-west', 'eu-central', etc.
      # exchange: str # e.g. 'NASDAQ', 'NYSE', etc.
@@ -1,5 +1,5 @@
  # Required identifier for this raw source. Contracts reference it under `source:`.
- id: "{{ id }}" # format: provider.dataset
+ id: "{{ id }}" # suggested format: provider.dataset

  # parser.entrypoint: registered parser name (not a file path)
  parser:
@@ -1,10 +1,9 @@
- from __future__ import annotations
-
  import logging
  from dataclasses import asdict, is_dataclass
  from typing import Iterator, Any

- from datapipeline.domain.feature import FeatureRecord
+ from datapipeline.domain.record import TemporalRecord
+ from datapipeline.transforms.utils import partition_key

  logger = logging.getLogger(__name__)

@@ -16,14 +15,21 @@ class IdentityGuardTransform:
      - mode: 'warn' (default) logs warnings; 'error' raises on first violation
      - fields: optional explicit list of attribute names to compare. When omitted,
        the transform attempts to derive identity from dataclass fields on the
-       underlying record, excluding 'time' and 'value'.
+       underlying record, excluding 'time'.
      """

-     def __init__(self, *, mode: str = "warn", fields: list[str] | None = None) -> None:
+     def __init__(
+         self,
+         *,
+         mode: str = "warn",
+         fields: list[str] | None = None,
+         partition_by: str | list[str] | None = None,
+     ) -> None:
          self.mode = mode
          self.fields = fields
+         self.partition_by = partition_by

-     def __call__(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+     def __call__(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
          return self.apply(stream)

      def _violation(self, msg: str) -> None:
@@ -41,26 +47,38 @@ class IdentityGuardTransform:
                  except Exception:
                      out[f] = None
              return out
+         # Fall back to partition_by when available
+         if self.partition_by:
+             fields = (
+                 [self.partition_by]
+                 if isinstance(self.partition_by, str)
+                 else list(self.partition_by)
+             )
+             out = {}
+             for f in fields:
+                 try:
+                     out[f] = getattr(rec, f)
+                 except Exception:
+                     out[f] = None
+             return out
          # Try domain-provided hook first
          if hasattr(rec, "identity_fields") and callable(getattr(rec, "identity_fields")):
              try:
                  return rec.identity_fields() # type: ignore[attr-defined]
              except Exception:
                  pass
-         # Fallback: dataclass fields minus time/value
+         # Fallback: dataclass fields minus time
          if is_dataclass(rec):
              data = asdict(rec)
              data.pop("time", None)
-             data.pop("value", None)
              return data
          return {}

-     def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
-         current_key = None
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         current_key: tuple | None = None
          baseline: dict | None = None
-         for fr in stream:
-             key = fr.id
-             rec = fr.record
+         for rec in stream:
+             key = partition_key(rec, self.partition_by)
              ident = self._identity_map(rec)
              if key != current_key:
                  current_key = key
@@ -68,7 +86,7 @@ class IdentityGuardTransform:
              else:
                  if ident != baseline:
                      self._violation(
-                         "identity drift in feature stream id=%s: expected=%s observed=%s"
-                         % (fr.id, baseline, ident)
+                         "identity drift in record stream key=%s: expected=%s observed=%s"
+                         % (key, baseline, ident)
                      )
-             yield fr
+             yield rec
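
A usage sketch for the reworked guard, based only on the constructor and call protocol shown above; the Tick dataclass is hypothetical, and partition_key is assumed to group by the attribute named in partition_by:

    from dataclasses import dataclass
    from datetime import datetime, timezone

    from datapipeline.transforms.debug.identity import IdentityGuardTransform


    @dataclass
    class Tick:  # hypothetical record
        time: datetime
        symbol: str
        exchange: str


    records = [
        Tick(datetime(2021, 1, 1, tzinfo=timezone.utc), "MSFT", "NASDAQ"),
        Tick(datetime(2021, 1, 1, 1, tzinfo=timezone.utc), "MSFT", "NYSE"),  # drift
    ]

    # Within each `symbol` partition the `exchange` field should stay constant;
    # mode="warn" logs the drift on the second record instead of raising.
    guard = IdentityGuardTransform(mode="warn", fields=["exchange"], partition_by="symbol")
    checked = list(guard(iter(records)))  # both records pass through; a warning is logged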
@@ -3,7 +3,8 @@ from datetime import timedelta
  from itertools import groupby
  from typing import Iterator

- from datapipeline.domain.feature import FeatureRecord
+ from datapipeline.domain.record import TemporalRecord
+ from datapipeline.transforms.utils import partition_key
  from datapipeline.utils.time import parse_timecode


@@ -23,9 +24,11 @@ class StreamLint:
          *,
          mode: str = "warn",
          tick: str | None = None,
+         partition_by: str | list[str] | None = None,
      ) -> None:
          self.mode = mode
          self.tick = tick
+         self.partition_by = partition_by

          # Pre-compute tick step in seconds when provided to avoid repeated parsing.
          self._tick_seconds: int | None = None
@@ -38,7 +41,7 @@ class StreamLint:
              )
              self._tick_seconds = None

-     def __call__(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+     def __call__(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
          return self.apply(stream)

      def _violation(self, msg: str) -> None:
@@ -46,25 +49,25 @@ class StreamLint:
              raise ValueError(msg)
          logger.warning(msg)

-     def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
-         # Group by base feature id to keep state local
-         for fid, records in groupby(stream, key=lambda fr: fr.id):
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         # Group by partition key to keep state local
+         for key, records in groupby(stream, key=lambda rec: partition_key(rec, self.partition_by)):
              last_time = None
              seen_times: set = set()
-             for fr in records:
-                 t = getattr(fr.record, "time", None)
+             for record in records:
+                 t = getattr(record, "time", None)

                  # Check ordering
                  if last_time is not None and t is not None and t < last_time:
                      self._violation(
-                         f"out-of-order timestamp for feature '{fid}': {t} < {last_time}. "
+                         f"out-of-order timestamp for partition '{key}': {t} < {last_time}. "
                          f"Consider sorting upstream or fixing loader."
                      )

                  # Check duplicates
                  if t in seen_times:
                      self._violation(
-                         f"duplicate timestamp for feature '{fid}' at {t}. "
+                         f"duplicate timestamp for partition '{key}' at {t}. "
                          f"Consider a granularity transform (first/last/mean/median)."
                      )
                  seen_times.add(t)
@@ -78,9 +81,9 @@ class StreamLint:
                      expect = last_time + timedelta(seconds=self._tick_seconds)
                      if t != expect and t > expect:
                          self._violation(
-                             f"skipped tick(s) for feature '{fid}': expected {expect}, got {t}. "
+                             f"skipped tick(s) for partition '{key}': expected {expect}, got {t}. "
                              f"Consider using ensure_cadence."
                          )

                  last_time = t
-                 yield fr
+                 yield record
@@ -3,12 +3,11 @@ from collections import defaultdict
  from itertools import groupby
  from numbers import Real
  from pathlib import Path
- from typing import Any, Callable, Iterator, Literal, Mapping
+ from typing import Any, Callable, Iterator, Literal

  from datapipeline.domain.feature import FeatureRecord
  from datapipeline.domain.sample import Sample
  from datapipeline.transforms.feature.model import FeatureTransform
- from datapipeline.transforms.utils import clone_record_with_value
  from datapipeline.utils.pickle_model import PicklePersistanceMixin
  from datapipeline.pipeline.observability import TransformEvent

@@ -86,7 +85,7 @@ class StandardScaler(PicklePersistanceMixin):
          mean = float(stats.get("mean", 0.0))
          std = float(stats.get("std", 1.0))
          for fr in records:
-             value = fr.record.value
+             value = fr.value
              if not isinstance(value, Real):
                  if value is None and on_none == "skip":
                      self.missing_counts[feature_id] = (
@@ -114,10 +113,7 @@ class StandardScaler(PicklePersistanceMixin):
                  normalized -= mean
              if self.with_std:
                  normalized /= std
-             yield FeatureRecord(
-                 record=clone_record_with_value(fr.record, normalized),
-                 id=fr.id,
-             )
+             yield FeatureRecord(record=fr.record, id=fr.id, value=normalized)

      def inverse_transform(
          self,
@@ -136,7 +132,7 @@ class StandardScaler(PicklePersistanceMixin):
          mean = float(stats.get("mean", 0.0))
          std = float(stats.get("std", 1.0))
          for fr in records:
-             value = fr.record.value
+             value = fr.value
              if not isinstance(value, Real):
                  raise TypeError(
                      f"Record value must be numeric, got {value!r}")
@@ -145,10 +141,7 @@ class StandardScaler(PicklePersistanceMixin):
                  restored *= std
              if self.with_mean:
                  restored += mean
-             yield FeatureRecord(
-                 record=clone_record_with_value(fr.record, restored),
-                 id=fr.id,
-             )
+             yield FeatureRecord(record=fr.record, id=fr.id, value=restored)

  class _RunningStats:
      __slots__ = ("count", "mean", "m2")