jerry-thomas 1.0.3__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. datapipeline/analysis/vector/collector.py +0 -1
  2. datapipeline/build/tasks/config.py +0 -2
  3. datapipeline/build/tasks/metadata.py +0 -2
  4. datapipeline/build/tasks/scaler.py +0 -2
  5. datapipeline/build/tasks/schema.py +0 -2
  6. datapipeline/build/tasks/utils.py +0 -2
  7. datapipeline/cli/app.py +201 -81
  8. datapipeline/cli/commands/contract.py +145 -283
  9. datapipeline/cli/commands/demo.py +13 -0
  10. datapipeline/cli/commands/domain.py +4 -4
  11. datapipeline/cli/commands/dto.py +11 -0
  12. datapipeline/cli/commands/filter.py +2 -2
  13. datapipeline/cli/commands/inspect.py +0 -68
  14. datapipeline/cli/commands/list_.py +30 -13
  15. datapipeline/cli/commands/loader.py +11 -0
  16. datapipeline/cli/commands/mapper.py +82 -0
  17. datapipeline/cli/commands/parser.py +45 -0
  18. datapipeline/cli/commands/run_config.py +1 -3
  19. datapipeline/cli/commands/serve_pipeline.py +5 -7
  20. datapipeline/cli/commands/source.py +106 -18
  21. datapipeline/cli/commands/stream.py +286 -0
  22. datapipeline/cli/visuals/common.py +0 -2
  23. datapipeline/cli/visuals/sections.py +0 -2
  24. datapipeline/cli/workspace_utils.py +0 -3
  25. datapipeline/config/context.py +0 -2
  26. datapipeline/config/dataset/feature.py +1 -0
  27. datapipeline/config/metadata.py +0 -2
  28. datapipeline/config/project.py +0 -2
  29. datapipeline/config/resolution.py +10 -2
  30. datapipeline/config/tasks.py +9 -9
  31. datapipeline/domain/feature.py +3 -0
  32. datapipeline/domain/record.py +7 -7
  33. datapipeline/domain/sample.py +0 -2
  34. datapipeline/domain/vector.py +6 -8
  35. datapipeline/integrations/ml/adapter.py +0 -2
  36. datapipeline/integrations/ml/pandas_support.py +0 -2
  37. datapipeline/integrations/ml/rows.py +0 -2
  38. datapipeline/integrations/ml/torch_support.py +0 -2
  39. datapipeline/io/output.py +0 -2
  40. datapipeline/io/serializers.py +26 -16
  41. datapipeline/mappers/synthetic/time.py +9 -2
  42. datapipeline/pipeline/artifacts.py +3 -5
  43. datapipeline/pipeline/observability.py +0 -2
  44. datapipeline/pipeline/pipelines.py +118 -34
  45. datapipeline/pipeline/stages.py +42 -17
  46. datapipeline/pipeline/utils/spool_cache.py +142 -0
  47. datapipeline/pipeline/utils/transform_utils.py +27 -2
  48. datapipeline/services/artifacts.py +1 -4
  49. datapipeline/services/constants.py +1 -0
  50. datapipeline/services/factories.py +4 -6
  51. datapipeline/services/project_paths.py +0 -2
  52. datapipeline/services/runs.py +0 -2
  53. datapipeline/services/scaffold/contract_yaml.py +76 -0
  54. datapipeline/services/scaffold/demo.py +141 -0
  55. datapipeline/services/scaffold/discovery.py +115 -0
  56. datapipeline/services/scaffold/domain.py +21 -13
  57. datapipeline/services/scaffold/dto.py +31 -0
  58. datapipeline/services/scaffold/filter.py +2 -1
  59. datapipeline/services/scaffold/layout.py +96 -0
  60. datapipeline/services/scaffold/loader.py +61 -0
  61. datapipeline/services/scaffold/mapper.py +116 -0
  62. datapipeline/services/scaffold/parser.py +56 -0
  63. datapipeline/services/scaffold/plugin.py +14 -2
  64. datapipeline/services/scaffold/source_yaml.py +91 -0
  65. datapipeline/services/scaffold/stream_plan.py +110 -0
  66. datapipeline/services/scaffold/utils.py +187 -0
  67. datapipeline/sources/data_loader.py +0 -2
  68. datapipeline/sources/decoders.py +49 -8
  69. datapipeline/sources/factory.py +9 -6
  70. datapipeline/sources/foreach.py +18 -3
  71. datapipeline/sources/synthetic/time/parser.py +1 -1
  72. datapipeline/sources/transports.py +10 -4
  73. datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
  74. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
  75. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
  76. datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
  77. datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
  78. datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
  79. datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
  80. datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
  81. datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
  82. datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
  83. datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
  84. datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
  85. datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
  86. datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
  87. datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
  88. datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
  89. datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
  90. datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
  91. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  92. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
  93. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
  94. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  95. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
  96. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
  97. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
  98. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  99. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
  100. datapipeline/templates/plugin_skeleton/README.md +57 -136
  101. datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
  102. datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
  103. datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
  104. datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
  105. datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
  106. datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
  107. datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
  108. datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
  109. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
  110. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
  111. datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
  112. datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
  113. datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
  114. datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
  115. datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
  116. datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
  117. datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
  118. datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
  119. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
  120. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  121. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
  122. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
  123. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  124. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
  125. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
  126. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +7 -10
  127. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
  128. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
  129. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
  130. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
  131. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
  132. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
  133. datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
  134. datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
  135. datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +14 -0
  136. datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
  137. datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
  138. datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
  139. datapipeline/templates/stubs/dto.py.j2 +1 -1
  140. datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
  141. datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
  142. datapipeline/templates/stubs/mappers/ingest.py.j2 +17 -0
  143. datapipeline/templates/stubs/parser.py.j2 +4 -0
  144. datapipeline/templates/stubs/record.py.j2 +0 -1
  145. datapipeline/templates/stubs/source.yaml.j2 +1 -1
  146. datapipeline/transforms/debug/identity.py +34 -16
  147. datapipeline/transforms/debug/lint.py +14 -11
  148. datapipeline/transforms/feature/scaler.py +5 -12
  149. datapipeline/transforms/filter.py +73 -17
  150. datapipeline/transforms/interfaces.py +58 -0
  151. datapipeline/transforms/record/floor_time.py +10 -7
  152. datapipeline/transforms/record/lag.py +8 -10
  153. datapipeline/transforms/sequence.py +2 -3
  154. datapipeline/transforms/stream/dedupe.py +5 -7
  155. datapipeline/transforms/stream/ensure_ticks.py +39 -24
  156. datapipeline/transforms/stream/fill.py +34 -25
  157. datapipeline/transforms/stream/filter.py +25 -0
  158. datapipeline/transforms/stream/floor_time.py +16 -0
  159. datapipeline/transforms/stream/granularity.py +52 -30
  160. datapipeline/transforms/stream/lag.py +17 -0
  161. datapipeline/transforms/stream/rolling.py +72 -0
  162. datapipeline/transforms/utils.py +42 -10
  163. datapipeline/transforms/vector/drop/horizontal.py +0 -3
  164. datapipeline/transforms/vector/drop/orchestrator.py +0 -3
  165. datapipeline/transforms/vector/drop/vertical.py +0 -2
  166. datapipeline/transforms/vector/ensure_schema.py +0 -2
  167. datapipeline/utils/paths.py +0 -2
  168. datapipeline/utils/placeholders.py +0 -2
  169. datapipeline/utils/rich_compat.py +0 -3
  170. datapipeline/utils/window.py +0 -2
  171. jerry_thomas-2.0.0.dist-info/METADATA +282 -0
  172. jerry_thomas-2.0.0.dist-info/RECORD +264 -0
  173. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/WHEEL +1 -1
  174. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/entry_points.txt +7 -3
  175. datapipeline/services/scaffold/mappers.py +0 -55
  176. datapipeline/services/scaffold/source.py +0 -191
  177. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
  178. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
  179. datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
  180. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
  181. datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
  182. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
  183. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
  184. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
  185. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
  186. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
  187. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
  188. datapipeline/templates/stubs/mapper.py.j2 +0 -22
  189. jerry_thomas-1.0.3.dist-info/METADATA +0 -827
  190. jerry_thomas-1.0.3.dist-info/RECORD +0 -198
  191. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/licenses/LICENSE +0 -0
  192. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,827 +0,0 @@
- Metadata-Version: 2.4
- Name: jerry-thomas
- Version: 1.0.3
- Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
- Author: Anders Skott Lind
- License: MIT
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: numpy<3.0,>=1.24
- Requires-Dist: pydantic>=2.0
- Requires-Dist: PyYAML>=5.4
- Requires-Dist: tqdm>=4.0
- Requires-Dist: jinja2>=3.0
- Requires-Dist: rich>=13
- Provides-Extra: ml
- Requires-Dist: pandas>=2.0; extra == "ml"
- Requires-Dist: torch>=2.0; extra == "ml"
- Dynamic: license-file
-
- # Datapipeline Runtime
-
- Jerry Thomas is a time-series-first data pipeline runtime. It turns declarative
- YAML projects into iterators that stream records, engineered features, and
- model-ready vectors. The CLI lets you preview every stage, build deterministic
- artifacts, inspect quality, and scaffold plugins for custom loaders, parsers,
- transforms, and filters.
-
- > **Core assumptions**
- >
- > - Every record carries a timezone-aware `time` attribute and a numeric
- >   `value`.
- > - Grouping is purely temporal. Dimensional splits belong in `partition_by`.
-
- ---
-
- ## Why You Might Use It
-
- - Materialize canonical time-series datasets from disparate sources.
- - Preview and debug each stage of the pipeline without writing ad-hoc scripts.
- - Enforce coverage/quality gates and publish artifacts (expected IDs, scaler
-   stats) for downstream ML teams.
- - Extend the runtime with entry-point driven plugins for domain-specific I/O or
-   feature engineering.
- - Consume vectors directly from Python via iterators, Pandas DataFrames, or
-   `torch.utils.data.Dataset`.
-
- ---
-
- ## Quick Start
-
- ### Serve The Example
-
- ```bash
- pip install jerry-thomas
- jerry plugin init my-datapipeline --out lib/
- jerry serve --limit 3
- ```
-
- ### Create Your Own Stream
-
- Assumes you already ran `jerry plugin init ...` in this workspace (it writes `jerry.yaml`, which the CLI uses for defaults and scaffolding paths).
- These scaffolding commands write YAML into the dataset selected by `default_dataset` in `jerry.yaml` (`example` by default).
-
- ```bash
- jerry source add demo weather -t fs -f csv
- jerry domain add weather
- jerry contract
- pip install -e lib/my-datapipeline
- ```
-
- ---
-
- ## CLI Cheat Sheet
-
- - `jerry plugin init <name> --out lib/`: scaffolds `lib/<name>/` and writes workspace `jerry.yaml`.
- - `jerry.yaml` (created by `plugin init`): sets `plugin_root` for scaffolding commands and `datasets`/`default_dataset` so you can omit `--project`/`--dataset`.
- - `jerry serve [--dataset <alias>|--project <path>] [--limit N] [--stage 0-7] [--skip-build]`: streams output; builds required artifacts unless `--skip-build`.
- - `jerry build [--dataset <alias>|--project <path>] [--force]`: materializes artifacts (schema, scaler, expected IDs, etc.).
- - `jerry inspect report|matrix|partitions|expected [--dataset <alias>|--project <path>]`: quality and metadata helpers.
- - `jerry source add <provider> <dataset> -t fs|http|synthetic -f csv|json|json-lines|pickle [--identity]`: scaffolds a source YAML and (unless `--identity`) a parser + entry point.
- - `jerry domain add <domain>`: scaffolds domain models under `src/<package>/domains/<domain>/`.
- - `jerry contract [--identity]`: interactive contract scaffolder; most users pick `[1] Ingest (source → stream)` (use `[2] Composed` for derived streams, e.g. air_density from temp + pressure).
- - `pip install -e lib/<name>`: rerun after commands that update `lib/<name>/pyproject.toml` (entry points), or after manual edits to it.
-
- ---
-
- ## Concepts
-
- ### Workspace (`jerry.yaml`)
-
- - `datasets`: dataset aliases → `project.yaml` paths (relative to `jerry.yaml`).
- - `default_dataset`: which dataset `jerry serve/build/inspect` use when you omit `--dataset/--project`.
- - `plugin_root`: where scaffolding commands write Python code (`src/<package>/...`) and where they look for `pyproject.toml`.
-
- ### Plugin Package (Python Code)
-
- These live under `lib/<plugin>/src/<package>/`:
-
- - `sources/<provider>/<dataset>/dto.py` + `parser.py`: source DTO + parser (created by `jerry source add` unless `--identity`).
- - `domains/<domain>/model.py`: domain records (created by `jerry domain add`).
- - `mappers/<provider>/<dataset>/to_<domain>.py`: DTO → domain record mapping (usually created by `jerry contract`).
- - `pyproject.toml`: entry points for loaders/parsers/mappers/transforms (rerun `pip install -e lib/<plugin>` after changes).
-
- ### Loaders & Parsers
-
- - A **loader** yields raw rows (bytes/dicts) from some transport (FS/HTTP/synthetic/etc.).
- - A **parser** turns each raw row into a typed DTO (or returns `None` to drop a row); see the sketch below.
- - In most projects, your source YAML uses the built-in loader `core.io` and you only customize its `args` (`transport`, `format`, and a `path`/`url`).
- - You typically only implement a custom loader when you need specialized behavior (auth/pagination/rate limits, proprietary formats, or non-standard protocols).
- - `parser.args` are optional and only used when your parser supports configuration; many parsers need no args, since filtering and similar concerns are handled natively downstream.
-
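- A minimal sketch of that parser contract, assuming a hypothetical weather CSV source (the stubs scaffolded by `jerry source add` are the authoritative shape):
-
- ```python
- # Hypothetical parser sketch: one raw row dict in, one typed DTO (or None) out.
- from dataclasses import dataclass
-
-
- @dataclass
- class WeatherDto:                      # mirrors the source's raw columns
-     station: str
-     temp_c: float
-
-
- class WeatherParser:
-     def __call__(self, row: dict) -> WeatherDto | None:
-         try:
-             return WeatherDto(station=row["station"], temp_c=float(row["temp"]))
-         except (KeyError, ValueError):
-             return None                # malformed rows are simply dropped
- ```
-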
- ### DTOs & Domains
-
- - A **DTO** (Data Transfer Object) mirrors a single source’s schema (columns/fields) and stays “raw-shaped”; it’s what parsers emit.
- - A **domain record** is the canonical shape used across the pipeline. Mappers convert DTOs into domain records so multiple sources can land in the same domain model.
- - The base time-series type is `TemporalRecord` (`time` + `value`). Domains typically add identity fields (e.g. `symbol`, `station_id`) that make filtering/partitioning meaningful; see the sketch below.
- - `time` must be timezone-aware (normalized to UTC); `value` is the measurement you engineer features from; all other fields act as the record’s “identity” (used by equality/deduping and commonly by `partition_by`).
-
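- In dataclass terms, a domain record has roughly this shape (illustrative only; the runtime’s `TemporalRecord` base lives under `datapipeline/domain/`):
-
- ```python
- # Shape sketch of a domain record: time + value plus identity fields.
- from dataclasses import dataclass
- from datetime import datetime, timezone
-
-
- @dataclass
- class EquityRecord:
-     time: datetime                     # timezone-aware, normalized to UTC
-     value: float                       # the measurement features are built from
-     symbol: str                        # identity field; a natural partition_by key
-
-
- rec = EquityRecord(time=datetime(2022, 1, 3, tzinfo=timezone.utc),
-                    value=182.01, symbol="AAPL")
- ```
-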
- ### Glossary
-
- - **Source alias**: `sources/*.yaml:id` (referenced by contracts under `source:`).
- - **Stream id**: `contracts/*.yaml:id` (referenced by `dataset.yaml` under `record_stream:`).
- - **Partition**: dimension keys appended to feature IDs, driven by `contract.partition_by`.
- - **Group**: vector “bucket” cadence set by `dataset.group_by` (controls how records become samples).
- - **Stage**: debug/preview level for `jerry serve --stage 0-7` (DTOs → domain records → features → vectors).
-
- ### Dataset Project (YAML Config)
-
- These live under the dataset “project root” directory (the folder containing `project.yaml`):
-
- - `project.yaml`: paths + globals (single source of truth).
- - `sources/*.yaml`: raw sources (loader + parser wiring).
- - `contracts/*.yaml`: canonical streams (ingest or composed).
- - `dataset.yaml`: feature/target declarations.
- - `postprocess.yaml`: vector-level transforms.
- - `tasks/*.yaml`: serve presets and artifact task configs.
-
- ### Configuration & Resolution Order
-
- Defaults are layered so you can set global preferences once, keep dataset/run
- files focused on per-project behavior, and still override anything from the CLI.
- For both `jerry serve` and `jerry build`, options are merged in the following
- order (highest precedence first; a merge sketch follows the list):
-
- 1. **CLI flags** – anything you pass on the command line always wins.
- 2. **Project task files** – `kind: serve` specs (under `project.paths.tasks`)
-    supply serve defaults; artifact tasks in the same directory drive `jerry build`.
- 3. **`jerry.yaml` command blocks** – settings under `jerry.serve` and `jerry.build`.
- 4. **`jerry.yaml.shared`** – shared fallbacks for visuals/progress/log-level style settings.
- 5. **Built-in defaults** – runtime hard-coded defaults.
-
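- Conceptually the merge looks like this (illustrative only, not the runtime’s actual resolver):
-
- ```python
- # Illustrative layered merge: lower-precedence layers only fill remaining gaps.
- def resolve(*layers: dict) -> dict:
-     merged: dict = {}
-     for layer in reversed(layers):             # apply lowest precedence first
-         merged.update({k: v for k, v in layer.items() if v is not None})
-     return merged
-
-
- options = resolve(
-     {"limit": 3},                              # 1. CLI flags
-     {"limit": 100, "keep": "train"},           # 2. project task file
-     {"visuals": "RICH"},                       # 3. jerry.yaml command block
-     {"visuals": "AUTO", "log_level": "INFO"},  # 4. jerry.yaml shared block
-     {"log_level": "WARNING"},                  # 5. built-in defaults
- )
- # -> {'log_level': 'INFO', 'visuals': 'RICH', 'keep': 'train', 'limit': 3}
- ```
-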
- ---
-
- ## YAML Config Reference
-
- All dataset configuration is rooted at a single `project.yaml` file. Other YAML files are discovered via `project.paths.*` (relative to `project.yaml` unless absolute).
-
- ### `project.yaml`
-
- ```yaml
- version: 1
- name: default
- paths:
-   streams: ./contracts
-   sources: ./sources
-   dataset: dataset.yaml
-   postprocess: postprocess.yaml
-   artifacts: ../artifacts/${project_name}/v${version}
-   tasks: ./tasks
- globals:
-   start_time: 2021-01-01T00:00:00Z
-   end_time: 2023-01-03T23:00:00Z
-   split:
-     mode: hash # hash | time
-     key: group # group | feature:<id>
-     seed: 42
-     ratios: { train: 0.8, val: 0.1, test: 0.1 }
- ```
-
- - `name` provides a stable identifier you can reuse inside config files via `${project_name}`.
- - `paths.*` are resolved relative to the project file unless absolute; they also support `${var}` interpolation.
- - `globals` provide values for `${var}` interpolation across YAML files. Datetime
-   values are normalized to strict UTC `YYYY-MM-DDTHH:MM:SSZ`.
- - `split` config defines how labels are assigned; serve tasks or CLI flags pick the active label via `keep`.
- - `paths.tasks` points to a directory of task specs. Each `*.yaml` file declares `kind: ...`
-   (`scaler`, `schema`, `metadata`, `serve`, …). Artifact tasks drive `jerry build`; command
-   tasks (currently `kind: serve`) provide presets for `jerry serve`. When multiple serve tasks
-   exist, `jerry serve --run <name>` selects by `name`/filename stem.
- - Label names are free-form: match whatever keys you declare in `split.ratios` (hash) or `split.labels` (time).
-
- ### Serve Tasks (`tasks/serve.<name>.yaml`)
-
- ```yaml
- kind: serve
- name: train # defaults to filename stem when omitted
- keep: train # select active split label (null disables filtering)
- output:
-   transport: stdout # stdout | fs
-   format: print # print | json-lines | json | csv | pickle
- limit: 100 # cap vectors per serve run (null = unlimited)
- throttle_ms: null # milliseconds to sleep between emitted vectors
- # Optional overrides:
- # log_level: INFO # DEBUG=progress bars, INFO=spinner, WARNING=quiet
- # visuals: AUTO # AUTO | TQDM | RICH | OFF
- # progress: AUTO # AUTO | SPINNER | BARS | OFF
- ```
-
- - Each serve task lives alongside artifact tasks under `paths.tasks`. Files are independent—no special directory structure required.
- - `output`, `limit`, `throttle_ms`, and `log_level` provide defaults for `jerry serve`; CLI flags still win per invocation (see _Configuration & Resolution Order_). For filesystem outputs, set `transport: fs`, `directory: /path/to/root`, and omit file names—each run automatically writes to `<directory>/<run_name>/<run_name>.<ext>` unless you override the entire `output` block with a custom `filename`.
- - Override `keep` (and other fields) per invocation via `jerry serve ... --keep val` etc.
- - Visuals backend: set `visuals: AUTO|TQDM|RICH|OFF` in the task or use `--visuals`. Pair with `progress: AUTO|SPINNER|BARS|OFF` or `--progress` to control progress layouts.
- - Add additional `kind: serve` files to the tasks directory for other splits (val/test/etc.); `jerry serve` runs each enabled file unless you pass `--run <name>`.
- - Use `jerry.yaml` next to the project or workspace root to define shared defaults (visuals/progress/log level/output); CLI flags still take precedence.
-
- ### Workspace Defaults (`jerry.yaml`)
-
- Create an optional `jerry.yaml` in the directory where you run the CLI to share settings across commands. The CLI walks up from the current working directory to find the first `jerry.yaml`.
-
- ```yaml
- plugin_root: lib/my-datapipeline # plugin workspace (relative to this file)
-
- # Dataset aliases for --dataset; values may be dirs (auto-append project.yaml).
- datasets:
-   example: lib/my-datapipeline/example/project.yaml
- default_dataset: example
-
- shared:
-   visuals: AUTO # AUTO | TQDM | RICH | OFF
-   progress: BARS # AUTO | SPINNER | BARS | OFF
-   log_level: INFO
-
- serve:
-   limit: null
-   stage: null
-   output:
-     transport: stdout
-     format: print # print | json-lines | json | csv | pickle
-     # directory: artifacts/serve # Required when transport=fs
-
- build:
-   mode: AUTO # AUTO | FORCE | OFF
- ```
-
- `jerry.yaml` sits near the root of your workspace, while dataset-specific overrides still live in individual `tasks/serve.*.yaml` files as needed.
-
- ### `<project_root>/sources/<alias>.yaml`
-
- Each file defines a loader/parser pair exposed under `<alias>`. Files may live in nested
- subdirectories under `<project_root>/sources/`; discovery is recursive.
-
- ```yaml
- # Source identifier (commonly `provider.dataset`). Contracts reference this under `source:`.
- id: stooq.ohlcv
- parser:
-   # Parser entry point name (registered in your plugin’s pyproject.toml).
-   entrypoint: stooq.ohlcv
- loader:
-   # Most common loader: core.io (supports fs/http via args.transport + args.format).
-   entrypoint: core.io
-   args:
-     transport: http
-     format: csv
-     url: "https://stooq.com/q/d/l/?s=aapl.us&i=d"
- ```
-
- - `id`: the source alias; referenced by contracts under `source:`.
- - `parser.entrypoint`: which parser to use; `parser.args` are optional.
- - `loader.entrypoint`: which loader to use; `core.io` is the default for fs/http and is configured via `loader.args`.
-
- #### Fan-out Sources (`core.foreach`)
-
- Use `core.foreach` to expand any inner loader spec across a list without duplicating YAML. It interpolates string args and optionally injects the foreach value into each row.
-
- ```yaml
- loader:
-   entrypoint: core.foreach
-   args:
-     foreach:
-       symbol: [AAPL, MSFT]
-     inject_field: symbol
-     loader:
-       entrypoint: core.io
-       args:
-         transport: http
-         format: csv
-         url: "https://stooq.com/q/d/l/?s=${symbol}&i=d"
- ```
-
- ### `<project_root>/contracts/<stream_id>.yaml`
-
- Canonical stream contracts describe how the runtime should map and prepare a raw
- source. Use folders to organize by domain if you like.
-
- ```yaml
- kind: ingest
- id: equity.ohlcv # stream identifier (domain.dataset[.variant])
- source: stooq.ohlcv # references sources/<alias>.yaml:id
-
- mapper:
-   entrypoint: equity.ohlcv
-   args: {}
-
- partition_by: symbol
- sort_batch_size: 50000
-
- record:
-   - filter: { operator: ge, field: time, comparand: "${start_time}" }
-   - filter: { operator: lt, field: time, comparand: "${end_time}" }
-   - floor_time: { cadence: 10m }
-
- stream:
-   - ensure_cadence: { cadence: 10m }
-   - granularity: { mode: mean }
-   - fill: { statistic: median, window: 6, min_samples: 2 }
-
- debug:
-   - lint: { mode: warn, tick: 10m }
- ```
-
- - `record`: ordered record-level transforms (filters, floor/lag, custom
-   transforms registered under the `record` entry-point group).
- - `stream`: transforms applied after feature wrapping, still per base feature.
- - `debug`: instrumentation-only transforms (linters, assertions).
- - `partition_by`: optional keys used to suffix feature IDs (e.g., `temp__@station_id:XYZ`).
- - `sort_batch_size`: chunk size used by the in-memory sorter when normalizing
-   order before stream transforms.
-
- ### Composed Streams (Engineered Domains)
-
- Define engineered streams that depend on other canonical streams directly in contracts. The runtime builds each input to stage 4 (ordered + regularized), stream‑aligns by partition + timestamp, runs your composer, and emits fresh records for the derived stream.
-
- ```yaml
- # <project_root>/contracts/air_density.processed.yaml
- kind: composed
- id: air_density.processed
- inputs:
-   - pressure.processed
-   - t=temp_dry.processed
- partition_by: station_id
- sort_batch_size: 20000
-
- mapper:
-   # Function or class via dotted path; entry points optional
-   entrypoint: mypkg.domains.air_density:compose_to_record
-   args:
-     driver: pressure.processed # optional; defaults to first input
-
- # Optional post‑compose policies (run after composition like any stream)
- # record: [...]
- # stream: [...]
- # debug: [...]
- ```
-
- Dataset stays minimal — features only reference the composed stream:
-
- ```yaml
- # dataset.yaml
- group_by: 1h
- features:
-   - id: air_density
-     record_stream: air_density.processed
- ```
-
- Notes:
-
- - Inputs always reference canonical stream_ids (not raw sources).
- - The composed source outputs records; its own `record`/`stream`/`debug` rules still apply afterward.
- - Partitioning for the engineered domain is explicit via `partition_by` on the composed contract.
-
- ### `dataset.yaml`
-
- Defines which canonical streams become features/targets and the vector bucketing.
-
- ```yaml
- group_by: 1h
-
- features:
-   - id: close
-     record_stream: equity.ohlcv
-     scale: true
-     sequence: { size: 6, stride: 1 }
-
- targets:
-   - id: returns_1d
-     record_stream: equity.ohlcv
- ```
-
- - `group_by` controls the cadence for vector partitioning (accepts minute or hour
-   cadences such as `10m` or `1h`).
- - `scale: true` inserts the standard scaler feature transform (requires the scaler
-   stats artifact or inline statistics).
- - Downstream consumers can load the `scaler.pkl` artifact and call
-   `StandardScaler.inverse_transform` (or `StandardScalerTransform.inverse`)
-   to undo scaling; see the sketch below.
- - `sequence` emits `FeatureRecordSequence` windows (size, stride, optional
-   cadence enforcement via `tick`).
-
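- What scaling does to each value is plain standard scaling; a generic sketch (not the runtime’s `StandardScalerTransform` itself):
-
- ```python
- # Generic standard-scaling sketch: conceptually what scale: true applies per feature.
- mean, std = 10.3, 2.1              # per-feature statistics, as in the scaler artifact
-
-
- def transform(x: float) -> float:
-     return (x - mean) / std
-
-
- def inverse(z: float) -> float:    # what inverse_transform conceptually undoes
-     return z * std + mean
-
-
- assert abs(inverse(transform(12.4)) - 12.4) < 1e-9
- ```
-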
- ### `postprocess.yaml`
-
- Project-scoped vector transforms that run after assembly and before serving.
-
- ```yaml
- - drop:
-     axis: horizontal
-     payload: features
-     threshold: 0.95
- - fill:
-     statistic: median
-     window: 48
-     min_samples: 6
- - replace:
-     payload: targets
-     value: 0.0
- ```
-
- - Each transform receives a `Sample`; set `payload: targets` when you want to
-   mutate label vectors, otherwise the feature vector is used.
- - Vector transforms rely on the schema artifact (for expected IDs/cadence)
-   and scaler stats when scaling is enabled. When no transforms are configured
-   the stream passes through unchanged.
-
- ### Task Specs (`tasks/*.yaml`)
-
- Declare artifact and command tasks under `project.paths.tasks` (default `tasks/`).
- Artifact specs are optional; if you omit them, Jerry falls back to built-in defaults.
- Add a YAML file only when you need to override paths or other parameters.
-
- `tasks/scaler.yaml`
-
- ```yaml
- kind: scaler
- output: scaler.pkl
- split_label: train
- enabled: true
- ```
-
- - `scaler.pkl` is a pickled standard scaler fitted on the requested split.
- - `schema.json` (from the `schema` task) enumerates the discovered feature/target identifiers (including partitions), their kinds (scalar/list), and cadence hints used to enforce ordering downstream.
- - Configure the `schema` task to choose a cadence strategy (currently `max`). Per-feature overrides will be added later; for now every list-valued feature records the max observed length as its enforcement target.
- - `metadata.json` (from the `metadata` task) captures heavier statistics—present/null counts, inferred value types, list-length histograms, per-partition timestamps, and the dataset window. Configure `metadata.window_mode` with `union|intersection|strict|relaxed` (default `intersection`) to control how start/end bounds are derived. `union` considers base features, `intersection` uses their overlap, `strict` intersects every partition, and `relaxed` unions partitions independently.
- - Command tasks (`kind: serve`) live alongside artifact tasks; `jerry serve` reads them directly.
- - Shared run/build defaults (visuals/progress/log level/build mode) live in `jerry.yaml`.
-
- ---
-
- ## CLI Reference
-
- All commands live under the `jerry` entry point (`src/datapipeline/cli/app.py`).
- Pass `--help` on any command for flags.
- All commands that take a project accept either `--project <path/to/project.yaml>` or `--dataset <alias>` (from `jerry.yaml datasets:`).
-
- ### Preview Stages
-
- - `jerry serve --project <project.yaml> --stage <0-7> --limit N [--log-level LEVEL] [--visuals auto|tqdm|rich|off] [--progress auto|spinner|bars|off]`
-   - Stage 0: raw DTOs
-   - Stage 1: domain `TemporalRecord`s
-   - Stage 2: record transforms applied
-   - Stage 3: feature records (before sort/regularization)
-   - Stage 4: feature regularization (post stream transforms)
-   - Stage 5: feature transforms/sequence outputs
-   - Stage 6: vectors assembled (no postprocess)
-   - Stage 7: vectors + postprocess transforms
-   - Use `--log-level DEBUG` for progress bars, `--log-level INFO` for spinner + prints, or the default (`WARNING`) for minimal output.
-   - Ensures build artifacts are current before streaming. The build step runs only when the configuration hash has changed; stages 0-5 skip it automatically, and `--skip-build` opts out entirely.
- - `jerry serve --project <project.yaml> --out-transport stdout --out-format json-lines --limit N [--include-targets] [--log-level LEVEL] [--visuals ...] [--progress ...] [--run name]`
-   - Applies postprocess transforms and optional dataset split before emitting.
-   - Use `--out-transport fs --out-format json-lines --out-path build/serve` (or `csv`, `pickle`, etc.) to write artifacts to disk instead of stdout; files land under `<out-path>/<run_name>/`.
-   - `--out-payload vector` emits only the vector payload with features/targets
-     flattened into schema-ordered lists (no identifier keys) when you don't need
-     the group key or metadata. Default is `sample`.
-   - Set `--log-level DEBUG` (or set your serve task `log_level: DEBUG`) to reuse the tqdm progress bars when previewing stages.
-   - When multiple serve tasks exist, add `--run val` (task name or filename stem) to target a single config; otherwise every enabled task is executed sequentially.
-   - Argument precedence follows the order described under _Configuration & Resolution Order_.
-   - Combine with `--skip-build` when you already have fresh artifacts and want to jump straight into streaming.
-
- ### Build & Quality
-
- - `jerry inspect report --project <project.yaml> [--threshold 0.95] [--include-targets]`
-   - Prints coverage summary (keep/below lists) and writes `coverage.json` under
-     the artifacts directory.
-   - Add `--matrix csv|html` to persist an availability matrix.
- - `jerry inspect partitions --project <project.yaml> [--include-targets]`
-   - Writes discovered partition suffixes to `partitions.json`.
- - `jerry inspect expected --project <project.yaml> [--include-targets]`
-   - Writes the full set of observed feature IDs to `expected.txt` (for external tooling; runtime uses `schema.json`).
- - `jerry build --project <project.yaml> [--force] [--visuals ...] [--progress ...]`
-   - Regenerates artifact tasks declared under `project.paths.tasks` when the configuration hash changes.
-
- ### Scaffolding & Reference
-
- - `jerry plugin init <package> --out <dir>` (also supports `-n/--name`)
-   - Generates a plugin project (pyproject, package skeleton, config templates).
- - `jerry source add <provider> <dataset> --transport fs|http|synthetic --format csv|json|json-lines|pickle`
-   - Also supports `<provider>.<dataset>` via `--alias` or as the first positional.
-   - Flag form remains available: `--provider/--dataset`.
-   - Creates loader/parser stubs, updates entry points, and drops a matching
-     source YAML.
- - `jerry domain add <name>` (also supports `-n/--name`)
-   - Adds a `domains/<name>/` package with a `model.py` stub.
- - `jerry filter create --name <identifier>`
-   - Scaffolds an entry-point-ready filter (helpful for custom record predicates).
- - `jerry list sources|domains`
-   - Introspect configured source aliases or domain packages.
-
- ---
-
- ## Transform & Filter Library
-
- ### Record Filters (`<project_root>/contracts/*.yaml:record`)
-
- - Binary comparisons: `eq`, `ne`, `lt`, `le`, `gt`, `ge` (timezone-aware for ISO
-   or datetime literals).
- - Membership: `in`, `nin`.
-   ```yaml
-   - filter: { operator: ge, field: time, comparand: "${start_time}" }
-   - filter: { operator: in, field: station, comparand: [a, b, c] }
-   ```
-
- ### Record Transforms
-
- - `floor_time`: snap timestamps down to the nearest cadence (`10m`, `1h`, …); see the sketch below.
- - `lag`: add lagged copies of records (see `src/datapipeline/transforms/record/lag.py` for options).
-
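- A quick illustration of the `floor_time` idea (illustrative only; the runtime’s transform lives in `datapipeline/transforms/record/floor_time.py`):
-
- ```python
- # Illustration of floor_time: snap a timezone-aware timestamp down to a cadence.
- from datetime import datetime, timezone, timedelta
-
-
- def floor_time(ts: datetime, cadence: timedelta) -> datetime:
-     epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
-     return epoch + (ts - epoch) // cadence * cadence
-
-
- t = datetime(2022, 1, 3, 14, 37, 12, tzinfo=timezone.utc)
- print(floor_time(t, timedelta(minutes=10)))  # 2022-01-03 14:30:00+00:00
- ```
-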
- ### Stream (Feature) Transforms
-
- - `ensure_cadence`: backfill missing ticks with `value=None` records to enforce a
-   strict cadence (sketched below).
- - `granularity`: merge duplicate timestamps using `first|last|mean|median`.
- - `dedupe`: drop exact duplicate records (same id, timestamp, and payload) from
-   an already sorted feature stream.
- - `fill`: rolling statistic-based imputation within each feature stream.
- - Custom transforms can be registered under the `stream` entry-point group.
-
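- A simplified sketch of what `ensure_cadence` does conceptually (the real transform operates on feature records, not bare tuples):
-
- ```python
- # Simplified ensure_cadence sketch: emit value=None placeholders for missing ticks.
- from datetime import datetime, timezone, timedelta
-
-
- def ensure_cadence(records, cadence: timedelta):
-     expected = None
-     for time, value in records:                # input must already be sorted
-         while expected is not None and expected < time:
-             yield expected, None               # backfill the gap with a null tick
-             expected += cadence
-         yield time, value
-         expected = time + cadence
-
-
- ticks = [(datetime(2022, 1, 3, h, tzinfo=timezone.utc), 1.0) for h in (0, 3)]
- print([t.hour for t, _ in ensure_cadence(ticks, timedelta(hours=1))])  # [0, 1, 2, 3]
- ```
-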
- ### Feature Transforms
-
- - `scale`: wraps `StandardScalerTransform`. Reads statistics from the build
-   artifact or accepts inline `statistics`.
-   ```yaml
-   scale:
-     with_mean: true
-     with_std: true
-     statistics:
-       temp_c__station=001: { mean: 10.3, std: 2.1 }
-   ```
-
- ### Sequence Transforms
-
- - `sequence`: sliding window generator (`size`, `stride`, optional `cadence` to
-   enforce contiguous windows). Emits `FeatureRecordSequence` payloads with `.records`; see the sketch below.
-
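- What `size` and `stride` produce, as a generic sliding-window sketch:
-
- ```python
- # Generic sliding-window sketch: how size/stride carve windows out of a stream.
- def windows(records: list, size: int, stride: int):
-     for start in range(0, len(records) - size + 1, stride):
-         yield records[start:start + size]    # one window per step
-
-
- print(list(windows([1, 2, 3, 4, 5, 6, 7], size=6, stride=1)))
- # [[1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7]]
- ```
-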
- ### Vector (Postprocess) Transforms
-
- - `drop`: apply coverage thresholds along the horizontal axis (vectors) or
-   vertical axis (features/partitions) using `axis: horizontal|vertical` and
-   `threshold`. Vertical mode requires the optional `metadata.json`
-   artifact and internally prunes weak partitions.
- - `fill`: impute using rolling statistics from prior vectors (history-based).
- - `replace`: seed missing IDs with a constant or literal value.
-   (Jerry automatically enforces the `schema.json` vector schema—ordering +
-   cadence—before any configured vector transforms run.)
-
- All transforms share a consistent entry-point signature and accept their config
- dict as keyword arguments. Register new ones in `pyproject.toml` under the
- appropriate group (`record`, `stream`, `feature`, `sequence`, `vector`,
- `filters`, `debug`).
-
- ---
-
- ## Artifacts & Postprocess
-
- - `expected.txt`: newline-delimited full feature IDs, generated on demand via
-   `jerry inspect expected`. Not required at runtime; transforms derive the
-   expected universe from `schema.json`.
- - `schema.json`: output of the `schema` task. Jerry automatically
-   enforces this schema during postprocess to impose deterministic ordering and
-   list cadence metadata (targets appear whenever the dataset defines them). Window metadata now lives in `metadata.json`.
- - `scaler.pkl`: pickled standard scaler fitted on the configured split. Loaded
-   lazily by feature transforms at runtime.
- - Build state is tracked in `artifacts/build/state.json`; config hashes avoid
-   redundant runs.
-
- If a postprocess transform needs an artifact and it is missing, the runtime
- raises a descriptive error suggesting `jerry build`.
-
- ---
-
- ## Splitting & Serving
-
- If `project.globals.split` is present, `jerry serve` filters vectors at the
- end of the pipeline:
-
- - `mode: hash` – deterministic entity hash using either the group key or a
-   specified feature ID (illustrated below).
- - `mode: time` – boundary-based slicing using timestamp labels.
- - `run.keep` (or CLI `--keep`) selects the active slice; use any label name defined in your split config.
-
- The split configuration never mutates stored artifacts; it is only applied when
- serving vectors (either via CLI or the Python integrations).
-
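- A deterministic hash split can be pictured like this (illustrative only; the runtime’s exact hashing scheme is internal):
-
- ```python
- # Illustrative deterministic hash split: same key + seed -> same label, every run.
- import hashlib
-
-
- def split_label(key: str, seed: int = 42) -> str:
-     ratios = {"train": 0.8, "val": 0.1, "test": 0.1}
-     digest = hashlib.sha256(f"{seed}:{key}".encode()).hexdigest()
-     point = int(digest[:8], 16) / 0xFFFFFFFF    # stable position in [0, 1]
-     cumulative = 0.0
-     for label, ratio in ratios.items():
-         cumulative += ratio
-         if point <= cumulative:
-             return label
-     return label                                # guard against float rounding
-
-
- print(split_label("2022-01-03T14:00:00Z"))      # e.g. 'train'
- ```
-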
- ---
-
- ## Python Integrations
-
- `datapipeline.integrations.ml` demonstrates how to reuse the runtime from
- application code (a short usage sketch follows the list):
-
- - `VectorAdapter.from_project(project_yaml)` – bootstrap once, then stream
-   vectors or row dicts.
- - `stream_vectors(project_yaml, limit=...)` – iterator matching `jerry serve`.
- - `iter_vector_rows` / `collect_vector_rows` – handy for Pandas or custom sinks.
- - `dataframe_from_vectors` – eager helper that returns a Pandas DataFrame
-   (requires `pandas`).
- - `torch_dataset` – builds a `torch.utils.data.Dataset` that yields tensors. See
-   `examples/minimal_project/run_torch.py` for usage.
-
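- A minimal consumption sketch using the helpers above (argument names beyond `project_yaml`/`limit` are assumptions; check the module before relying on them):
-
- ```python
- # Minimal sketch: consume vectors from application code instead of the CLI.
- from datapipeline.integrations.ml import dataframe_from_vectors, stream_vectors
-
- for vector in stream_vectors("example/project.yaml", limit=3):
-     print(vector)                    # the same stream jerry serve emits
-
- df = dataframe_from_vectors("example/project.yaml")  # eager; requires pandas
- print(df.head())
- ```
-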
- ---
-
- ## Extending the Runtime
-
- ### Entry Points
-
- Register custom components in your plugin’s `pyproject.toml`:
-
- ```toml
- [project.entry-points."datapipeline.loaders"]
- demo.csv_loader = "my_datapipeline.loaders.csv:CsvLoader"
-
- [project.entry-points."datapipeline.parsers"]
- demo.weather_parser = "my_datapipeline.parsers.weather:WeatherParser"
-
- [project.entry-points."datapipeline.mappers"]
- time.ticks = "my_datapipeline.mappers.synthetic.ticks:map"
-
- [project.entry-points."datapipeline.stream"]
- weather.fill = "my_datapipeline.transforms.weather:CustomFill"
- ```
-
- Loader, parser, mapper, and transform classes should provide a callable
- interface (usually `__call__`) matching the runtime expectations. Refer to the
- built-in implementations in `src/datapipeline/sources/`, `src/datapipeline/transforms/`,
- and `src/datapipeline/filters/`.
-
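- A skeletal custom stream transform in the spirit of the `weather.fill` entry point above (the shape is an assumption; mirror a built-in from `src/datapipeline/transforms/` for the exact signature the runtime expects):
-
- ```python
- # Skeletal custom stream transform: YAML config arrives as keyword arguments.
- class CustomFill:
-     def __init__(self, statistic: str = "median", window: int = 6):
-         self.statistic = statistic   # values come from the transform's config dict
-         self.window = window
-
-     def __call__(self, records):
-         for record in records:       # lazily pass the feature stream through
-             yield record             # ...imputation logic would go here
- ```
-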
- ### Scaffolding Helpers
-
- - `datapipeline.services.scaffold.plugin.scaffold_plugin` – invoked by
-   `jerry plugin init`.
- - `datapipeline.services.scaffold.source.create_source` – writes loader/parser
-   stubs and updates entry points.
- - `datapipeline.services.scaffold.domain.create_domain` – domain record skeleton.
- - `datapipeline.services.scaffold.filter.create_filter` – custom filter stub.
- - `datapipeline.services.scaffold.mappers.attach_source_to_domain` – helper for
-   programmatically wiring sources to domain mappers and emitting stream
-   contracts (useful in custom automation or tests).
-
- ---
-
- ## Development Workflow
-
- - Install dependencies: `pip install -e .[dev]`.
- - Run tests: `pytest`.
- - When iterating on configs, use `jerry serve --stage <n>` to peek into problematic
-   stages.
- - After tuning transforms, refresh artifacts: `jerry build`.
- - Use `jerry inspect report --include-targets` to ensure targets meet coverage
-   gates before handing vectors to downstream consumers.
-
- ---
-
- ## Additional Resources
-
- - `src/datapipeline/analysis/vector_analyzer.py` – quality metrics collected by
-   the inspect commands.
- - `src/datapipeline/pipeline/` – pure functions that wire each stage.
- - `src/datapipeline/services/bootstrap/` – runtime initialization and
-   registry population (see `core.py`).
- - `examples/minimal_project/` – runnable demo showing config layout and Torch
-   integration.
-
- ---
-
- ## Pipeline Architecture (WIP)
-
- ```text
- raw source ──▶ loader/parser DTOs ──▶ canonical stream ──▶ record policies
-   └──▶ feature wrapping ──▶ stream regularization ──▶ feature transforms/sequence
-         └──▶ vector assembly ──▶ postprocess transforms
- ```
-
- 1. **Loader/parser (Stage 0)** – raw bytes become typed DTOs. Loaders fetch from
-    FS/HTTP/synthetic sources; parsers map bytes to DTOs. Register them via entry
-    points (`loaders`, `parsers`) and wire them in `<project_root>/sources/*.yaml`.
- 2. **Canonical stream mapping (Stage 1)** – mappers attach domain semantics and
-    partition keys, producing domain `TemporalRecord`s.
- 3. **Record policies (Stage 2)** – contract `record` rules (filters, floor, lag)
-    prune and normalize DTO-derived records.
- 4. **Feature wrapping (Stage 3)** – records become `FeatureRecord`s before
-    sort/regularization.
- 5. **Stream regularization (Stage 4)** – contract `stream` rules ensure cadence,
-    deduplicate timestamps, and impute where needed.
- 6. **Feature transforms/sequence (Stage 5)** – dataset transforms (scale,
-    sequence windows) produce per-feature tensors or windows.
- 7. **Vector assembly (Stage 6)** – features merge by `group_by` cadence into
-    `(group_key, Vector)` pairs, prior to postprocess tweaks.
- 8. **Postprocess (Stage 7)** – optional vector transforms (fill/drop/etc.) run
-    before results are emitted to the configured output.
-
- #### Visual Flowchart
-
- ```mermaid
- flowchart TB
-   subgraph CLI & Project config
-     cliSource[jerry source add]
-     cliDomain[jerry domain add]
-     cliContract[jerry contract]
-     cliServe[jerry serve]
-     project[[project.yaml]]
-     sourcesCfg[sources/*.yaml]
-     contractsCfg[contracts/*.yaml]
-     datasetCfg[dataset.yaml]
-     postprocessCfg[postprocess.yaml]
-   end
-
-   cliSource --> sourcesCfg
-   cliDomain --> domainPkg
-   cliContract --> contractsCfg
-   cliServe --> vectorSamples
-   project -.->|paths.sources| sourcesCfg
-   project -.->|paths.streams| contractsCfg
-   project -.->|paths.dataset| datasetCfg
-   project -.->|paths.postprocess| postprocessCfg
-
-   subgraph Plugin code
-     domainPkg[domains/*]
-     mappersPkg[mappers/*]
-   end
-
-   cliContract --> mappersPkg
-   domainPkg -. domain models .-> mappersPkg
-
-   subgraph Registries
-     registrySources[sources]
-     registryStreamSources[stream_sources]
-     registryMappers[mappers]
-     registryRecordOps[record_ops]
-     registryStreamOps[stream_ops]
-     registryDebugOps[debug_ops]
-   end
-
-   subgraph Source wiring
-     rawData[(external data)]
-     transportSpec[transport + format]
-     loaderEP[loader ep]
-     parserEP[parser ep]
-     sourceArgs[loader args]
-     sourceNode[Source]
-     dtoStream[(DTOs)]
-   end
-
-   sourcesCfg --> transportSpec
-   sourcesCfg --> loaderEP
-   sourcesCfg --> parserEP
-   sourcesCfg --> sourceArgs
-   transportSpec -. select fs/http/synth .-> loaderEP
-   loaderEP -. build loader .-> sourceNode
-   parserEP -. build parser .-> sourceNode
-   sourceArgs -. paths/creds .-> sourceNode
-   rawData --> sourceNode --> dtoStream
-   sourcesCfg -. build_source_from_spec .-> registrySources
-   contractsCfg -. stream_id + source .-> registryStreamSources
-   registrySources -. alias -> Source .-> registryStreamSources
-
-   subgraph Canonical stream
-     mapperEP[mapper ep]
-     recordRules[record rules]
-     streamRules[stream rules]
-     debugRules[debug rules]
-     canonical[DTO -> record]
-     domainRecords((TemporalRecord))
-     recordStage[record xforms]
-     featureWrap[record -> feature]
-     featureRecords((FeatureRecord))
-     regularization[stream xforms]
-   end
-
-   dtoStream --> canonical --> domainRecords --> recordStage --> featureWrap --> featureRecords --> regularization
-   contractsCfg --> mapperEP
-   mappersPkg -. ep target .-> mapperEP
-   mapperEP -. build_mapper_from_spec .-> registryMappers
-   registryMappers --> canonical
-   contractsCfg --> recordRules
-   contractsCfg --> streamRules
-   contractsCfg --> debugRules
-   registryRecordOps --> recordRules
-   registryStreamOps --> streamRules
-   registryDebugOps --> debugRules
-   recordRules --> recordStage
-   streamRules --> regularization
-   debugRules --> regularization
-
-   subgraph Dataset shaping
-     featureSpec[feature cfg]
-     groupBySpec[group_by]
-     streamRefs[record_stream ids]
-     featureTrans[feature/seq xforms]
-     sequenceStream((seq/features))
-     vectorStage[vector assembly]
-     vectorSamples((samples))
-   end
-
-   datasetCfg --> featureSpec
-   datasetCfg --> groupBySpec
-   datasetCfg --> streamRefs
-   streamRefs -.->|build_feature_pipeline| registryStreamSources
-   registryStreamSources -.->|open_source_stream| sourceNode
-   featureRecords --> regularization --> featureTrans --> sequenceStream --> vectorStage --> vectorSamples
-   featureSpec -. scale/sequence .-> featureTrans
-   groupBySpec -. cadence .-> vectorStage
-
-   subgraph Postprocess
-     vectorTransforms[vector xforms]
-     postprocessNode[postprocess]
-   end
-
-   postprocessCfg --> vectorTransforms -. drop/fill .-> postprocessNode
-   vectorStage --> postprocessNode
- ```