jerry-thomas 0.0.5__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. {jerry_thomas-0.0.5/src/jerry_thomas.egg-info → jerry_thomas-0.2.0}/PKG-INFO +153 -53
  2. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/README.md +147 -49
  3. jerry_thomas-0.2.0/pyproject.toml +83 -0
  4. jerry_thomas-0.2.0/src/datapipeline/analysis/vector_analyzer.py +696 -0
  5. jerry_thomas-0.2.0/src/datapipeline/cli/app.py +425 -0
  6. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/domain.py +2 -2
  7. jerry_thomas-0.2.0/src/datapipeline/cli/commands/inspect.py +169 -0
  8. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/link.py +48 -14
  9. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/plugin.py +2 -2
  10. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/run.py +47 -48
  11. jerry_thomas-0.2.0/src/datapipeline/cli/visual_source.py +32 -0
  12. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/visuals.py +4 -31
  13. jerry_thomas-0.2.0/src/datapipeline/config/catalog.py +30 -0
  14. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/config/dataset/dataset.py +4 -7
  15. jerry_thomas-0.2.0/src/datapipeline/config/dataset/feature.py +13 -0
  16. jerry_thomas-0.2.0/src/datapipeline/config/dataset/loader.py +99 -0
  17. jerry_thomas-0.2.0/src/datapipeline/config/dataset/normalize.py +24 -0
  18. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/config/project.py +0 -2
  19. jerry_thomas-0.2.0/src/datapipeline/domain/feature.py +17 -0
  20. jerry_thomas-0.2.0/src/datapipeline/domain/record.py +28 -0
  21. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/domain/vector.py +4 -2
  22. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/filters/filters.py +0 -1
  23. jerry_thomas-0.2.0/src/datapipeline/integrations/__init__.py +19 -0
  24. jerry_thomas-0.2.0/src/datapipeline/integrations/ml.py +319 -0
  25. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/mappers/synthetic/time.py +3 -3
  26. jerry_thomas-0.2.0/src/datapipeline/pipeline/pipelines.py +93 -0
  27. jerry_thomas-0.2.0/src/datapipeline/pipeline/stages.py +119 -0
  28. jerry_thomas-0.2.0/src/datapipeline/pipeline/utils/keygen.py +42 -0
  29. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/pipeline/utils/memory_sort.py +1 -1
  30. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/pipeline/utils/ordering.py +0 -2
  31. jerry_thomas-0.2.0/src/datapipeline/pipeline/utils/transform_utils.py +55 -0
  32. jerry_thomas-0.2.0/src/datapipeline/plugins.py +21 -0
  33. jerry_thomas-0.2.0/src/datapipeline/registries/registries.py +15 -0
  34. jerry_thomas-0.2.0/src/datapipeline/registries/registry.py +28 -0
  35. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/bootstrap.py +50 -17
  36. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/constants.py +2 -0
  37. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/factories.py +9 -5
  38. jerry_thomas-0.2.0/src/datapipeline/services/project_paths.py +75 -0
  39. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/domain.py +6 -3
  40. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/mappers.py +2 -2
  41. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/plugin.py +5 -5
  42. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/source.py +15 -25
  43. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/templates.py +1 -5
  44. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/__init__.py +1 -3
  45. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/loader.py +1 -12
  46. jerry_thomas-0.2.0/src/datapipeline/sources/synthetic/time/parser.py +9 -0
  47. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/plugin_skeleton/README.md +14 -11
  48. jerry_thomas-0.2.0/src/datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml +24 -0
  49. jerry_thomas-0.2.0/src/datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml +23 -0
  50. jerry_thomas-0.2.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +29 -0
  51. {jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config → jerry_thomas-0.2.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default}/project.yaml +3 -3
  52. {jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config/distilleries → jerry_thomas-0.2.0/src/datapipeline/templates/plugin_skeleton/config/sources}/time_ticks.yaml +2 -0
  53. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/plugin_skeleton/pyproject.toml +2 -2
  54. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/dto.py.j2 +1 -2
  55. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/mapper.py.j2 +3 -4
  56. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/record.py.j2 +1 -1
  57. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/source.yaml.j2 +4 -0
  58. jerry_thomas-0.2.0/src/datapipeline/transforms/debug/identity.py +74 -0
  59. jerry_thomas-0.2.0/src/datapipeline/transforms/debug/lint.py +101 -0
  60. jerry_thomas-0.2.0/src/datapipeline/transforms/feature/model.py +12 -0
  61. jerry_thomas-0.0.5/src/datapipeline/transforms/transforms.py → jerry_thomas-0.2.0/src/datapipeline/transforms/feature/scaler.py +9 -67
  62. jerry_thomas-0.2.0/src/datapipeline/transforms/filter.py +57 -0
  63. jerry_thomas-0.2.0/src/datapipeline/transforms/record/floor_time.py +17 -0
  64. jerry_thomas-0.2.0/src/datapipeline/transforms/record/lag.py +18 -0
  65. jerry_thomas-0.2.0/src/datapipeline/transforms/sequence.py +84 -0
  66. jerry_thomas-0.2.0/src/datapipeline/transforms/stream/ensure_ticks.py +33 -0
  67. jerry_thomas-0.2.0/src/datapipeline/transforms/stream/fill.py +103 -0
  68. jerry_thomas-0.2.0/src/datapipeline/transforms/stream/granularity.py +92 -0
  69. jerry_thomas-0.2.0/src/datapipeline/transforms/utils.py +10 -0
  70. jerry_thomas-0.2.0/src/datapipeline/transforms/vector.py +226 -0
  71. jerry_thomas-0.2.0/src/datapipeline/transforms/vector_utils.py +84 -0
  72. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/utils/load.py +3 -1
  73. jerry_thomas-0.2.0/src/datapipeline/utils/paths.py +26 -0
  74. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/utils/time.py +6 -4
  75. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0/src/jerry_thomas.egg-info}/PKG-INFO +153 -53
  76. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/jerry_thomas.egg-info/SOURCES.txt +27 -11
  77. jerry_thomas-0.2.0/src/jerry_thomas.egg-info/entry_points.txt +39 -0
  78. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/jerry_thomas.egg-info/requires.txt +5 -2
  79. jerry_thomas-0.2.0/tests/test_config_pipeline.py +25 -0
  80. jerry_thomas-0.2.0/tests/test_regression_vectors.py +162 -0
  81. jerry_thomas-0.2.0/tests/test_transforms.py +189 -0
  82. jerry_thomas-0.2.0/tests/test_vector_analyzer.py +19 -0
  83. jerry_thomas-0.0.5/pyproject.toml +0 -92
  84. jerry_thomas-0.0.5/src/datapipeline/analysis/vector_analyzer.py +0 -49
  85. jerry_thomas-0.0.5/src/datapipeline/cli/app.py +0 -208
  86. jerry_thomas-0.0.5/src/datapipeline/cli/commands/analyze.py +0 -32
  87. jerry_thomas-0.0.5/src/datapipeline/cli/openers.py +0 -11
  88. jerry_thomas-0.0.5/src/datapipeline/config/catalog.py +0 -22
  89. jerry_thomas-0.0.5/src/datapipeline/config/dataset/feature.py +0 -24
  90. jerry_thomas-0.0.5/src/datapipeline/config/dataset/group_by.py +0 -31
  91. jerry_thomas-0.0.5/src/datapipeline/config/dataset/loader.py +0 -19
  92. jerry_thomas-0.0.5/src/datapipeline/config/dataset/normalize.py +0 -10
  93. jerry_thomas-0.0.5/src/datapipeline/domain/feature.py +0 -10
  94. jerry_thomas-0.0.5/src/datapipeline/domain/record.py +0 -20
  95. jerry_thomas-0.0.5/src/datapipeline/pipeline/pipelines.py +0 -46
  96. jerry_thomas-0.0.5/src/datapipeline/pipeline/stages.py +0 -64
  97. jerry_thomas-0.0.5/src/datapipeline/pipeline/utils/keygen.py +0 -20
  98. jerry_thomas-0.0.5/src/datapipeline/pipeline/utils/transform_utils.py +0 -120
  99. jerry_thomas-0.0.5/src/datapipeline/plugins.py +0 -7
  100. jerry_thomas-0.0.5/src/datapipeline/services/project_paths.py +0 -35
  101. jerry_thomas-0.0.5/src/datapipeline/sources/synthetic/time/parser.py +0 -9
  102. jerry_thomas-0.0.5/src/datapipeline/streams/canonical.py +0 -28
  103. jerry_thomas-0.0.5/src/datapipeline/streams/raw.py +0 -16
  104. jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml +0 -4
  105. jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml +0 -4
  106. jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config/contracts/time_ticks.yaml +0 -2
  107. jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config/recipe.yaml +0 -17
  108. jerry_thomas-0.0.5/src/datapipeline/transforms/sequence.py +0 -31
  109. jerry_thomas-0.0.5/src/jerry_thomas.egg-info/entry_points.txt +0 -44
  110. jerry_thomas-0.0.5/tests/test_transforms.py +0 -76
  111. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/LICENSE +0 -0
  112. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/setup.cfg +0 -0
  113. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/__init__.py +0 -0
  114. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/analysis/__init__.py +0 -0
  115. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/filter.py +0 -0
  116. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/list_.py +0 -0
  117. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/source.py +0 -0
  118. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/common/__init__.py +0 -0
  119. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/common/geo.py +0 -0
  120. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/config/__init__.py +0 -0
  121. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/domain/__init__.py +0 -0
  122. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/mappers/noop.py +0 -0
  123. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/parsers/identity.py +0 -0
  124. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/pipeline/__init__.py +0 -0
  125. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/entrypoints.py +0 -0
  126. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/paths.py +0 -0
  127. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/__init__.py +0 -0
  128. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/filter.py +0 -0
  129. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/__init__.py +0 -0
  130. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/composed_loader.py +0 -0
  131. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/decoders.py +0 -0
  132. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/factory.py +0 -0
  133. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/base.py +0 -0
  134. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/generator.py +0 -0
  135. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/parser.py +0 -0
  136. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/source.py +0 -0
  137. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/synthetic.py +0 -0
  138. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/synthetic/__init__.py +0 -0
  139. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/synthetic/time/__init__.py +0 -0
  140. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/synthetic/time/loader.py +0 -0
  141. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/transports.py +0 -0
  142. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  143. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/filter.py.j2 +0 -0
  144. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/loader_synthetic.py.j2 +0 -0
  145. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/parser.py.j2 +0 -0
  146. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/parser_custom.py.j2 +0 -0
  147. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/utils/__init__.py +0 -0
  148. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/jerry_thomas.egg-info/dependency_links.txt +0 -0
  149. {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/jerry_thomas.egg-info/top_level.txt +0 -0
--- jerry_thomas-0.0.5/src/jerry_thomas.egg-info/PKG-INFO
+++ jerry_thomas-0.2.0/PKG-INFO
@@ -1,22 +1,31 @@
 Metadata-Version: 2.4
 Name: jerry-thomas
-Version: 0.0.5
+Version: 0.2.0
 Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
 Author: Anders Skott Lind
 License: MIT
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numpy<3.0,>=1.24
-Requires-Dist: pydantic>=1.8
+Requires-Dist: pydantic>=2.0
 Requires-Dist: PyYAML>=5.4
 Requires-Dist: tqdm>=4.0
 Requires-Dist: jinja2>=3.0
-Requires-Dist: setuptools>=70
+Provides-Extra: ml
+Requires-Dist: pandas>=2.0; extra == "ml"
+Requires-Dist: torch>=2.0; extra == "ml"
 Dynamic: license-file
 
 # Jerry Thomas
 
+Time‑Series First
+- This runtime is time‑series‑first. Every domain record must include a timezone‑aware `time` and a `value`.
+- Grouping is defined by time buckets only (`group_by.keys: [ { type: time, ... } ]`).
+- Feature streams are sorted by time; sequence transforms assume ordered series.
+- Categorical dimensions (e.g., station, zone, ticker) belong in `partition_by` so they become partitions of the same time series.
+- Non‑temporal grouping is not supported.
+
 Jerry Thomas turns the datapipeline runtime into a cocktail program. You still install the
 same Python package (`datapipeline`) and tap into the plugin architecture, but every CLI
 dance step nods to a craft bar. Declarative YAML menus describe projects, sources and
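The hunk above pins the new time-series contract down in prose. As a minimal sketch of that invariant only (the real class is `TimeSeriesRecord` in `src/datapipeline/domain/record.py`; this dataclass is a stand-in for illustration), a conforming record looks like:

```python
# Illustrative sketch of the "Time-Series First" contract: every domain record
# carries a timezone-aware `time` plus a `value`. Not the package's own class.
from dataclasses import dataclass
from datetime import datetime, timezone


@dataclass
class SketchRecord:
    time: datetime  # must be tz-aware; the runtime normalizes to UTC
    value: float


rec = SketchRecord(time=datetime(2024, 1, 1, 16, 0, tzinfo=timezone.utc), value=1.0)
assert rec.time.tzinfo is not None  # a naive timestamp would violate the contract
```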
@@ -59,11 +68,29 @@ raw source → canonical stream → record stage → feature stage → vector st
 | `src/datapipeline/services` | Bootstrapping (project loading, YAML interpolation), runtime factories and scaffolding helpers for new bar tools (`services/bootstrap.py`, `services/factories.py`, `services/scaffold/plugin.py`). |
 | `src/datapipeline/pipeline` | Pure functions that build record/feature/vector iterators plus supporting utilities for ordering and transform wiring (`pipeline/pipelines.py`, `pipeline/utils/transform_utils.py`). |
 | `src/datapipeline/domain` | Data structures representing records, feature records and vectors coming off the line (`domain/record.py`, `domain/feature.py`, `domain/vector.py`). |
-| `src/datapipeline/transforms` & `src/datapipeline/filters` | Built-in transforms (lagging timestamps, sliding windows) and filter helpers exposed through entry points (`transforms/transforms.py`, `transforms/sequence.py`, `filters/filters.py`). |
+| `src/datapipeline/transforms` & `src/datapipeline/filters` | Built-in transforms (lagging timestamps, scaling, sliding windows) and filter helpers exposed through entry points (`transforms/record.py`, `transforms/feature.py`, `transforms/sequence.py`, `filters/filters.py`). |
 | `src/datapipeline/sources/synthetic/time` | Example synthetic time-series loader/parser pair plus helper mappers for experimentation while the real spirits arrive (`sources/synthetic/time/loader.py`, `sources/synthetic/time/parser.py`, `mappers/synthetic/time.py`). |
 
 ---
 
+## Built-in DSL identifiers
+
+The YAML DSL resolves filters and transforms by entry-point name. These ship with the
+template out of the box:
+
+| Kind | Identifiers | Notes |
+| ----------------- | ----------------------------------------------------------------------------------------------- | ----- |
+| Filters | `eq`/`equals`, `ne`/`not_equal`, `lt`, `le`, `gt`, `ge`, `in`/`contains`, `nin`/`not_in` | Use as `- gt: { field: value }` or `- in: { field: [values...] }`. Synonyms map to the same implementation. |
+| Record transforms | `time_lag`, `drop_missing` | `time_lag` expects a duration string (e.g. `1h`), `drop_missing` removes `None`/`NaN` records. |
+| Feature transforms| `standard_scale` | Options: `with_mean`, `with_std`, optional `statistics`. |
+| Sequence transforms | `time_window`, `time_fill_mean`, `time_fill_median` | `time_window` builds sliding windows; the fill transforms impute missing values from running mean/median with optional `window`/`min_samples`. |
+| Vector transforms | `fill_history`, `fill_horizontal`, `fill_constant`, `drop_missing` | History fill uses prior buckets, horizontal fill aggregates sibling partitions, constant sets a default, and drop removes vectors below coverage thresholds. |
+
+Extend `pyproject.toml` with additional entry points to register custom logic under your
+own identifiers.
+
+---
+
 ## Opening the bar
 
 ### 1. Install the tools
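To make the filter spelling from the table above concrete, here is a hedged sketch of how such a list parses; the field names (`value`, `mode`) are hypothetical stand-ins, and PyYAML is already a declared dependency of the package:

```python
# Hypothetical filter spec using identifiers from the new DSL table above.
# The field names are illustrative, not taken from the package docs.
import yaml

spec = yaml.safe_load(
    """
    filters:
      - gt: { value: 0.5 }            # keep records whose `value` exceeds 0.5
      - in: { mode: [spritz, neat] }  # keep records whose `mode` is in the list
    """
)
print(spec["filters"])
# [{'gt': {'value': 0.5}}, {'in': {'mode': ['spritz', 'neat']}}]
```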
@@ -86,17 +113,17 @@ python -c "import datapipeline; print('bar ready')"
 
 ### 2. Draft your bar book
 
-Create a `config/project.yaml` so the runtime knows where to find ingredients, infusions
-and the tasting menu. Globals are optional but handy for sharing values—they are
-interpolated into downstream YAML specs during bootstrap
+Create a `config/recipes/<name>/project.yaml` so the runtime knows where to find
+ingredients, infusions and the tasting menu. Globals are optional but handy for sharing
+values—they are interpolated into downstream YAML specs during bootstrap
 (`src/datapipeline/config/project.py`, `src/datapipeline/services/bootstrap.py`).
 
 ```yaml
 version: 1
 paths:
-  sources: config/distilleries
-  streams: config/contracts
-  dataset: config/recipe.yaml
+  sources: ../../sources
+  streams: ../../contracts
+  dataset: dataset.yaml
 globals:
   opening_time: "2024-01-01T16:00:00Z"
   last_call: "2024-01-02T02:00:00Z"
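Globals interpolation happens during bootstrap (`src/datapipeline/services/bootstrap.py`). The placeholder syntax is not shown anywhere in this diff, so purely as an illustration of the mechanism, with `${...}` placeholders assumed:

```python
# Illustration only: how project globals might be interpolated into a
# downstream YAML spec. The real logic (and the actual placeholder syntax)
# lives in src/datapipeline/services/bootstrap.py.
from string import Template

project_globals = {
    "opening_time": "2024-01-01T16:00:00Z",
    "last_call": "2024-01-02T02:00:00Z",
}
raw_spec = "start: ${opening_time}\nend: ${last_call}\n"
print(Template(raw_spec).substitute(project_globals))
```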
@@ -107,13 +134,13 @@ globals:
 
 ### 3. Stock the bottles (raw sources)
 
-Create `config/distilleries/<alias>.yaml` files. Each must expose a `parser` and `loader`
+Create `config/sources/<alias>.yaml` files. Each must expose a `parser` and `loader`
 pointing at entry points plus any constructor arguments
 (`src/datapipeline/services/bootstrap.py`). Here is a synthetic clock source that feels
 like a drip of barrel-aged bitters:
 
 ```yaml
-# config/distilleries/time_ticks.yaml
+# config/sources/time_ticks.yaml
 parser:
   entrypoint: "synthetic.time"
   args: {}
@@ -145,7 +172,7 @@ mapper:
   mode: spritz
 ```
 
-The mapper uses the provided mode to create a new `TimeFeatureRecord` stream ready for the
+The mapper uses the provided mode to create a new `TimeSeriesRecord` stream ready for the
 feature stage (`mappers/synthetic/time.py`).
 
 ### 5. Script the tasting menu (dataset)
@@ -155,28 +182,53 @@ are grouped (`src/datapipeline/config/dataset/dataset.py`). A minimal hourly men
 look like:
 
 ```yaml
-# config/recipe.yaml
+# config/recipes/default/dataset.yaml
 group_by:
   keys:
     - type: time
       field: time
       resolution: 1h
 features:
-  - stream: time.encode
-    feature_id: hour_spritz
-    partition_by: null
-    filters: []
+  - id: hour_spritz
+    stream: time.encode
    transforms:
-      - time_lag: "0h"
+      - record:
+          transform: time_lag
+          args: 0h
+      - feature:
+          transform: standard_scale
+          with_mean: true
+          with_std: true
+      - sequence:
+          transform: time_window
+          size: 4
+          stride: 1
+      - sequence:
+          transform: time_fill_mean
+          window: 24
+          min_samples: 6
 ```
 
 Use the sample `dataset` template as a starting point if you prefer scaffolding before
-pouring concrete values. Group keys support time bucketing (with automatic flooring to the
-requested resolution) and categorical splits
-(`src/datapipeline/config/dataset/group_by.py`,
-`src/datapipeline/config/dataset/normalize.py`). You can also attach feature or sequence
-transforms—such as the sliding `TimeWindowTransformer`—directly in the YAML by referencing
-their entry point names (`src/datapipeline/transforms/sequence.py`).
+pouring concrete values. Group keys now require explicit time bucketing (with automatic
+flooring to the requested resolution) so every pipeline is clock-driven. You can attach
+feature or sequence transforms—such as the sliding `TimeWindowTransformer` or the
+`time_fill_mean`/`time_fill_median` imputers—directly in the YAML by referencing their
+entry point names (`src/datapipeline/transforms/sequence.py`).
+
+When vectors are assembled you can optionally apply `vector_transforms` to enforce schema
+guarantees. The built-ins cover:
+
+- `fill_history` – use running means/medians from prior buckets (per partition) with
+  configurable window/minimum samples.
+- `fill_horizontal` – aggregate sibling partitions at the same timestamp (e.g. other
+  stations) using mean/median.
+- `fill_constant` – provide a constant default for missing features/partitions.
+- `drop_missing` – drop vectors that fall below a coverage threshold or omit required
+  features.
+
+Transforms accept either an explicit `expected` list or a manifest path to discover the
+full partition set (`build/partitions.json` produced by `jerry inspect partitions`).
 
 Once the book is ready, run the bootstrapper (the CLI does this automatically) to
 materialize all registered sources and streams
@@ -189,9 +241,9 @@ materialize all registered sources and streams
 ### Prep any station (with visuals)
 
 ```bash
-jerry prep pour --project config/project.yaml --limit 20
-jerry prep build --project config/project.yaml --limit 20
-jerry prep stir --project config/project.yaml --limit 20
+jerry prep pour --project config/datasets/default/project.yaml --limit 20
+jerry prep build --project config/datasets/default/project.yaml --limit 20
+jerry prep stir --project config/datasets/default/project.yaml --limit 20
 ```
 
 - `prep pour` shows the record-stage ingredients headed for each feature.
@@ -208,34 +260,79 @@ loaders. The CLI wires up `build_record_pipeline`, `build_feature_pipeline` and
 ### Serve the flights (production mode)
 
 ```bash
-jerry serve --project config/project.yaml --output print
-jerry serve --project config/project.yaml --output stream
-jerry serve --project config/project.yaml --output exports/batch.pt
+jerry serve --project config/datasets/default/project.yaml --output print
+jerry serve --project config/datasets/default/project.yaml --output stream
+jerry serve --project config/datasets/default/project.yaml --output exports/batch.pt
 ```
 
 Production mode skips the bar flair and focuses on throughput. `print` writes tasting
 notes to stdout, `stream` emits newline-delimited JSON (with values coerced to strings when
 necessary), and a `.pt` destination stores a pickle-compatible payload for later pours.
 
-### Taste the balance (vector quality)
-
-```bash
-jerry taste --project config/project.yaml
+## Funnel vectors into ML projects
+
+Data scientists rarely want to shell out to the CLI; they need a programmatic
+hand-off that plugs vectors straight into notebooks, feature stores or training
+loops. The `datapipeline.integrations` package wraps the existing iterator
+builders with ML-friendly adapters without pulling pandas or torch into the
+core runtime.
+
+```python
+from datapipeline.integrations import (
+    VectorAdapter,
+    dataframe_from_vectors,
+    iter_vector_rows,
+    torch_dataset,
+)
+
+# Bootstrap once and stream ready-to-use rows.
+adapter = VectorAdapter.from_project("config/project.yaml")
+for row in adapter.iter_rows(limit=32, flatten_sequences=True):
+    send_to_feature_store(row)
+
+# Helper functions cover ad-hoc jobs as well.
+rows = iter_vector_rows(
+    "config/project.yaml",
+    include_group=True,
+    group_format="mapping",
+    flatten_sequences=True,
+)
+
+# Optional extras materialize into common ML containers if installed.
+df = dataframe_from_vectors("config/project.yaml")  # Requires pandas
+dataset = torch_dataset("config/project.yaml", dtype=torch.float32)  # Requires torch
 ```
 
-This command reuses the vector pipeline, collects presence counts for every configured
-feature and flags empty or incomplete flights so you can diagnose upstream issues quickly
-(`src/datapipeline/cli/commands/analyze.py`, `src/datapipeline/analysis/vector_analyzer.py`).
-Use `--limit` to spot-check during service.
+Everything still flows through `build_vector_pipeline`; the integration layer
+normalizes group keys, optionally flattens sequence features and demonstrates
+how to turn the iterator into DataFrames or `torch.utils.data.Dataset`
+instances. ML teams can fork the same pattern for their own stacks—Spark, NumPy
+or feature store SDKs—without adding opinionated glue to the runtime itself.
+
+### Inspect the balance (vector quality)
+
+Use the inspect helpers for different outputs:
+
+- `jerry inspect report --project config/datasets/default/project.yaml` — print a
+  human-readable quality report (totals, keep/below lists, optional partition detail).
+- `jerry inspect coverage --project config/datasets/default/project.yaml` — persist the
+  coverage summary to `build/coverage.json` (keep/below feature and partition lists plus
+  coverage percentages).
+- `jerry inspect matrix --project config/datasets/default/project.yaml --format html` —
+  export availability matrices (CSV or HTML) for deeper analysis.
+- `jerry inspect partitions --project config/datasets/default/project.yaml` — write the
+  observed partition manifest to `build/partitions.json` for use in configs.
+
+Note: `jerry prep taste` has been removed; use `jerry inspect report` and friends.
 
 ---
 
-## Extending the bar program
+## Extending the CLI
 
 ### Scaffold a plugin package
 
 ```bash
-jerry station init --name my_datapipeline --out .
+jerry plugin init --name my_datapipeline --out .
 ```
 
 The generator copies a ready-made skeleton (pyproject, README, package directory) and
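Following the integrations hunk above, one plausible next step is batching the documented `torch_dataset(...)` output with a standard `DataLoader`. Everything here besides that documented call is ordinary torch; the batch structure the adapter produces is not specified in this diff, so the sketch only inspects it:

```python
# Hedged sketch: feed the adapter's torch dataset into a plain DataLoader.
# Assumes, as the README states, that torch_dataset(...) returns a
# torch.utils.data.Dataset; batch structure depends on the adapter.
import torch
from torch.utils.data import DataLoader

from datapipeline.integrations import torch_dataset

dataset = torch_dataset("config/project.yaml", dtype=torch.float32)
loader = DataLoader(dataset, batch_size=16)

first_batch = next(iter(loader))
print(type(first_batch))  # inspect what the adapter actually yields before training
```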
@@ -249,25 +346,29 @@ transforms.
 Use the CLI helpers to scaffold boilerplate code in your plugin workspace:
 
 ```bash
-jerry distillery add --provider dmi --dataset metobs --transport fs --format csv
-jerry spirit add --domain metobs --time-aware
-jerry contract --time-aware
+jerry source add --provider dmi --dataset metobs --transport fs --format csv
+jerry domain add --domain metobs
+jerry contract
 ```
 
-The distillery command writes DTO/parser stubs, updates entry points and drops a matching
-YAML file in `config/distilleries/` pre-filled with composed-loader defaults for the chosen
+The source command writes DTO/parser stubs, updates entry points and drops a matching
+YAML file in `config/sources/` pre-filled with composed-loader defaults for the chosen
 transport (`src/datapipeline/cli/app.py`, `src/datapipeline/services/scaffold/source.py`).
+`jerry domain add` now always scaffolds `TimeSeriesRecord` domains so every mapper carries
+an explicit timestamp alongside its value, and `jerry contract` wires that source/domain
+pair up for canonical stream generation.
 
 ### Add custom filters or transforms
 
 Register new functions/classes under the appropriate entry point group in your plugin’s
-`pyproject.toml`. The runtime resolves them through `load_ep`, applies record-level
-filters first, then record/feature/sequence transforms in the order declared in the
-dataset config (`pyproject.toml`, `src/datapipeline/utils/load.py`,
+`pyproject.toml`. The runtime resolves them through `load_ep`, applies record filters first,
+then record/feature/sequence transforms in the order declared in the dataset config
+(`pyproject.toml`, `src/datapipeline/utils/load.py`,
 `src/datapipeline/pipeline/utils/transform_utils.py`). Built-in helpers cover common
 comparisons (including timezone-aware checks) and time-based transforms (lags, sliding
 windows) if you need quick wins (`src/datapipeline/filters/filters.py`,
-`src/datapipeline/transforms/transforms.py`, `src/datapipeline/transforms/sequence.py`).
+`src/datapipeline/transforms/record.py`, `src/datapipeline/transforms/feature.py`,
+`src/datapipeline/transforms/sequence.py`).
 
 ### Prototype with synthetic time-series data
 
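The hunk above says identifiers resolve through `load_ep`. The actual entry-point group names are not shown in this diff (they live in the package's `entry_points.txt`), but the standard mechanism such a loader typically builds on is `importlib.metadata`; the group string below is a hypothetical placeholder:

```python
# Sketch of entry-point resolution with the standard library only; the group
# name "datapipeline.transforms" is a guess — check entry_points.txt in the
# built package for the real groups.
from importlib.metadata import entry_points

eps = entry_points(group="datapipeline.transforms")  # Python 3.10+ keyword form
by_name = {ep.name: ep for ep in eps}
if "time_window" in by_name:
    transform = by_name["time_window"].load()  # resolves to the registered object
```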
@@ -285,8 +386,7 @@ transform to build sliding-window feature flights without external datasets
 
 | Type | Description |
 | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Record` | Canonical payload containing a `value`; extended by other record types (`src/datapipeline/domain/record.py`). |
-| `TimeFeatureRecord` | A record with a timezone-aware `time` attribute, normalized to UTC to avoid boundary issues (`src/datapipeline/domain/record.py`). |
+| `TimeSeriesRecord` | Canonical record with `time` (tz-aware, normalized to UTC) and `value`; the pipeline treats streams as ordered series (`src/datapipeline/domain/record.py`).|
 | `FeatureRecord` | Links a record (or list of records from sequence transforms) to a `feature_id` and `group_key` (`src/datapipeline/domain/feature.py`). |
 | `Vector` | Final grouped payload: a mapping of feature IDs to scalars or ordered lists plus helper methods for shape/key access (`src/datapipeline/domain/vector.py`). |
 
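Per the `Vector` row in the table above, a served vector maps feature IDs to scalars or ordered lists. A purely illustrative, hand-written example of that shape (the scalar feature name is hypothetical; `hour_spritz` is borrowed from the dataset hunk earlier):

```python
# Not constructed through the library — just the shape the Vector row describes.
vector_like = {
    "hour_spritz": [0.12, 0.98, 0.55, -0.41],  # a time_window sequence of size 4
    "hour_mean": 0.31,                          # hypothetical scalar feature
}
```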
--- jerry_thomas-0.0.5/README.md
+++ jerry_thomas-0.2.0/README.md
@@ -1,5 +1,12 @@
 # Jerry Thomas
 
+Time‑Series First
+- This runtime is time‑series‑first. Every domain record must include a timezone‑aware `time` and a `value`.
+- Grouping is defined by time buckets only (`group_by.keys: [ { type: time, ... } ]`).
+- Feature streams are sorted by time; sequence transforms assume ordered series.
+- Categorical dimensions (e.g., station, zone, ticker) belong in `partition_by` so they become partitions of the same time series.
+- Non‑temporal grouping is not supported.
+
 Jerry Thomas turns the datapipeline runtime into a cocktail program. You still install the
 same Python package (`datapipeline`) and tap into the plugin architecture, but every CLI
 dance step nods to a craft bar. Declarative YAML menus describe projects, sources and
@@ -42,11 +49,29 @@ raw source → canonical stream → record stage → feature stage → vector st
 | `src/datapipeline/services` | Bootstrapping (project loading, YAML interpolation), runtime factories and scaffolding helpers for new bar tools (`services/bootstrap.py`, `services/factories.py`, `services/scaffold/plugin.py`). |
 | `src/datapipeline/pipeline` | Pure functions that build record/feature/vector iterators plus supporting utilities for ordering and transform wiring (`pipeline/pipelines.py`, `pipeline/utils/transform_utils.py`). |
 | `src/datapipeline/domain` | Data structures representing records, feature records and vectors coming off the line (`domain/record.py`, `domain/feature.py`, `domain/vector.py`). |
-| `src/datapipeline/transforms` & `src/datapipeline/filters` | Built-in transforms (lagging timestamps, sliding windows) and filter helpers exposed through entry points (`transforms/transforms.py`, `transforms/sequence.py`, `filters/filters.py`). |
+| `src/datapipeline/transforms` & `src/datapipeline/filters` | Built-in transforms (lagging timestamps, scaling, sliding windows) and filter helpers exposed through entry points (`transforms/record.py`, `transforms/feature.py`, `transforms/sequence.py`, `filters/filters.py`). |
 | `src/datapipeline/sources/synthetic/time` | Example synthetic time-series loader/parser pair plus helper mappers for experimentation while the real spirits arrive (`sources/synthetic/time/loader.py`, `sources/synthetic/time/parser.py`, `mappers/synthetic/time.py`). |
 
 ---
 
+## Built-in DSL identifiers
+
+The YAML DSL resolves filters and transforms by entry-point name. These ship with the
+template out of the box:
+
+| Kind | Identifiers | Notes |
+| ----------------- | ----------------------------------------------------------------------------------------------- | ----- |
+| Filters | `eq`/`equals`, `ne`/`not_equal`, `lt`, `le`, `gt`, `ge`, `in`/`contains`, `nin`/`not_in` | Use as `- gt: { field: value }` or `- in: { field: [values...] }`. Synonyms map to the same implementation. |
+| Record transforms | `time_lag`, `drop_missing` | `time_lag` expects a duration string (e.g. `1h`), `drop_missing` removes `None`/`NaN` records. |
+| Feature transforms| `standard_scale` | Options: `with_mean`, `with_std`, optional `statistics`. |
+| Sequence transforms | `time_window`, `time_fill_mean`, `time_fill_median` | `time_window` builds sliding windows; the fill transforms impute missing values from running mean/median with optional `window`/`min_samples`. |
+| Vector transforms | `fill_history`, `fill_horizontal`, `fill_constant`, `drop_missing` | History fill uses prior buckets, horizontal fill aggregates sibling partitions, constant sets a default, and drop removes vectors below coverage thresholds. |
+
+Extend `pyproject.toml` with additional entry points to register custom logic under your
+own identifiers.
+
+---
+
 ## Opening the bar
 
 ### 1. Install the tools
@@ -69,17 +94,17 @@ python -c "import datapipeline; print('bar ready')"
 
 ### 2. Draft your bar book
 
-Create a `config/project.yaml` so the runtime knows where to find ingredients, infusions
-and the tasting menu. Globals are optional but handy for sharing values—they are
-interpolated into downstream YAML specs during bootstrap
+Create a `config/recipes/<name>/project.yaml` so the runtime knows where to find
+ingredients, infusions and the tasting menu. Globals are optional but handy for sharing
+values—they are interpolated into downstream YAML specs during bootstrap
 (`src/datapipeline/config/project.py`, `src/datapipeline/services/bootstrap.py`).
 
 ```yaml
 version: 1
 paths:
-  sources: config/distilleries
-  streams: config/contracts
-  dataset: config/recipe.yaml
+  sources: ../../sources
+  streams: ../../contracts
+  dataset: dataset.yaml
 globals:
   opening_time: "2024-01-01T16:00:00Z"
   last_call: "2024-01-02T02:00:00Z"
@@ -90,13 +115,13 @@ globals:
 
 ### 3. Stock the bottles (raw sources)
 
-Create `config/distilleries/<alias>.yaml` files. Each must expose a `parser` and `loader`
+Create `config/sources/<alias>.yaml` files. Each must expose a `parser` and `loader`
 pointing at entry points plus any constructor arguments
 (`src/datapipeline/services/bootstrap.py`). Here is a synthetic clock source that feels
 like a drip of barrel-aged bitters:
 
 ```yaml
-# config/distilleries/time_ticks.yaml
+# config/sources/time_ticks.yaml
 parser:
   entrypoint: "synthetic.time"
   args: {}
@@ -128,7 +153,7 @@ mapper:
   mode: spritz
 ```
 
-The mapper uses the provided mode to create a new `TimeFeatureRecord` stream ready for the
+The mapper uses the provided mode to create a new `TimeSeriesRecord` stream ready for the
 feature stage (`mappers/synthetic/time.py`).
 
 ### 5. Script the tasting menu (dataset)
@@ -138,28 +163,53 @@ are grouped (`src/datapipeline/config/dataset/dataset.py`). A minimal hourly men
 look like:
 
 ```yaml
-# config/recipe.yaml
+# config/recipes/default/dataset.yaml
 group_by:
   keys:
     - type: time
       field: time
       resolution: 1h
 features:
-  - stream: time.encode
-    feature_id: hour_spritz
-    partition_by: null
-    filters: []
+  - id: hour_spritz
+    stream: time.encode
    transforms:
-      - time_lag: "0h"
+      - record:
+          transform: time_lag
+          args: 0h
+      - feature:
+          transform: standard_scale
+          with_mean: true
+          with_std: true
+      - sequence:
+          transform: time_window
+          size: 4
+          stride: 1
+      - sequence:
+          transform: time_fill_mean
+          window: 24
+          min_samples: 6
 ```
 
 Use the sample `dataset` template as a starting point if you prefer scaffolding before
-pouring concrete values. Group keys support time bucketing (with automatic flooring to the
-requested resolution) and categorical splits
-(`src/datapipeline/config/dataset/group_by.py`,
-`src/datapipeline/config/dataset/normalize.py`). You can also attach feature or sequence
-transforms—such as the sliding `TimeWindowTransformer`—directly in the YAML by referencing
-their entry point names (`src/datapipeline/transforms/sequence.py`).
+pouring concrete values. Group keys now require explicit time bucketing (with automatic
+flooring to the requested resolution) so every pipeline is clock-driven. You can attach
+feature or sequence transforms—such as the sliding `TimeWindowTransformer` or the
+`time_fill_mean`/`time_fill_median` imputers—directly in the YAML by referencing their
+entry point names (`src/datapipeline/transforms/sequence.py`).
+
+When vectors are assembled you can optionally apply `vector_transforms` to enforce schema
+guarantees. The built-ins cover:
+
+- `fill_history` – use running means/medians from prior buckets (per partition) with
+  configurable window/minimum samples.
+- `fill_horizontal` – aggregate sibling partitions at the same timestamp (e.g. other
+  stations) using mean/median.
+- `fill_constant` – provide a constant default for missing features/partitions.
+- `drop_missing` – drop vectors that fall below a coverage threshold or omit required
+  features.
+
+Transforms accept either an explicit `expected` list or a manifest path to discover the
+full partition set (`build/partitions.json` produced by `jerry inspect partitions`).
 
 Once the book is ready, run the bootstrapper (the CLI does this automatically) to
 materialize all registered sources and streams
@@ -172,9 +222,9 @@ materialize all registered sources and streams
 ### Prep any station (with visuals)
 
 ```bash
-jerry prep pour --project config/project.yaml --limit 20
-jerry prep build --project config/project.yaml --limit 20
-jerry prep stir --project config/project.yaml --limit 20
+jerry prep pour --project config/datasets/default/project.yaml --limit 20
+jerry prep build --project config/datasets/default/project.yaml --limit 20
+jerry prep stir --project config/datasets/default/project.yaml --limit 20
 ```
 
 - `prep pour` shows the record-stage ingredients headed for each feature.
@@ -191,34 +241,79 @@ loaders. The CLI wires up `build_record_pipeline`, `build_feature_pipeline` and
 ### Serve the flights (production mode)
 
 ```bash
-jerry serve --project config/project.yaml --output print
-jerry serve --project config/project.yaml --output stream
-jerry serve --project config/project.yaml --output exports/batch.pt
+jerry serve --project config/datasets/default/project.yaml --output print
+jerry serve --project config/datasets/default/project.yaml --output stream
+jerry serve --project config/datasets/default/project.yaml --output exports/batch.pt
 ```
 
 Production mode skips the bar flair and focuses on throughput. `print` writes tasting
 notes to stdout, `stream` emits newline-delimited JSON (with values coerced to strings when
 necessary), and a `.pt` destination stores a pickle-compatible payload for later pours.
 
-### Taste the balance (vector quality)
-
-```bash
-jerry taste --project config/project.yaml
+## Funnel vectors into ML projects
+
+Data scientists rarely want to shell out to the CLI; they need a programmatic
+hand-off that plugs vectors straight into notebooks, feature stores or training
+loops. The `datapipeline.integrations` package wraps the existing iterator
+builders with ML-friendly adapters without pulling pandas or torch into the
+core runtime.
+
+```python
+from datapipeline.integrations import (
+    VectorAdapter,
+    dataframe_from_vectors,
+    iter_vector_rows,
+    torch_dataset,
+)
+
+# Bootstrap once and stream ready-to-use rows.
+adapter = VectorAdapter.from_project("config/project.yaml")
+for row in adapter.iter_rows(limit=32, flatten_sequences=True):
+    send_to_feature_store(row)
+
+# Helper functions cover ad-hoc jobs as well.
+rows = iter_vector_rows(
+    "config/project.yaml",
+    include_group=True,
+    group_format="mapping",
+    flatten_sequences=True,
+)
+
+# Optional extras materialize into common ML containers if installed.
+df = dataframe_from_vectors("config/project.yaml")  # Requires pandas
+dataset = torch_dataset("config/project.yaml", dtype=torch.float32)  # Requires torch
 ```
 
-This command reuses the vector pipeline, collects presence counts for every configured
-feature and flags empty or incomplete flights so you can diagnose upstream issues quickly
-(`src/datapipeline/cli/commands/analyze.py`, `src/datapipeline/analysis/vector_analyzer.py`).
-Use `--limit` to spot-check during service.
+Everything still flows through `build_vector_pipeline`; the integration layer
+normalizes group keys, optionally flattens sequence features and demonstrates
+how to turn the iterator into DataFrames or `torch.utils.data.Dataset`
+instances. ML teams can fork the same pattern for their own stacks—Spark, NumPy
+or feature store SDKs—without adding opinionated glue to the runtime itself.
+
+### Inspect the balance (vector quality)
+
+Use the inspect helpers for different outputs:
+
+- `jerry inspect report --project config/datasets/default/project.yaml` — print a
+  human-readable quality report (totals, keep/below lists, optional partition detail).
+- `jerry inspect coverage --project config/datasets/default/project.yaml` — persist the
+  coverage summary to `build/coverage.json` (keep/below feature and partition lists plus
+  coverage percentages).
+- `jerry inspect matrix --project config/datasets/default/project.yaml --format html` —
+  export availability matrices (CSV or HTML) for deeper analysis.
+- `jerry inspect partitions --project config/datasets/default/project.yaml` — write the
+  observed partition manifest to `build/partitions.json` for use in configs.
+
+Note: `jerry prep taste` has been removed; use `jerry inspect report` and friends.
 
 ---
 
-## Extending the bar program
+## Extending the CLI
 
 ### Scaffold a plugin package
 
 ```bash
-jerry station init --name my_datapipeline --out .
+jerry plugin init --name my_datapipeline --out .
 ```
 
 The generator copies a ready-made skeleton (pyproject, README, package directory) and
@@ -232,25 +327,29 @@ transforms.
 Use the CLI helpers to scaffold boilerplate code in your plugin workspace:
 
 ```bash
-jerry distillery add --provider dmi --dataset metobs --transport fs --format csv
-jerry spirit add --domain metobs --time-aware
-jerry contract --time-aware
+jerry source add --provider dmi --dataset metobs --transport fs --format csv
+jerry domain add --domain metobs
+jerry contract
 ```
 
-The distillery command writes DTO/parser stubs, updates entry points and drops a matching
-YAML file in `config/distilleries/` pre-filled with composed-loader defaults for the chosen
+The source command writes DTO/parser stubs, updates entry points and drops a matching
+YAML file in `config/sources/` pre-filled with composed-loader defaults for the chosen
 transport (`src/datapipeline/cli/app.py`, `src/datapipeline/services/scaffold/source.py`).
+`jerry domain add` now always scaffolds `TimeSeriesRecord` domains so every mapper carries
+an explicit timestamp alongside its value, and `jerry contract` wires that source/domain
+pair up for canonical stream generation.
 
 ### Add custom filters or transforms
 
 Register new functions/classes under the appropriate entry point group in your plugin’s
-`pyproject.toml`. The runtime resolves them through `load_ep`, applies record-level
-filters first, then record/feature/sequence transforms in the order declared in the
-dataset config (`pyproject.toml`, `src/datapipeline/utils/load.py`,
+`pyproject.toml`. The runtime resolves them through `load_ep`, applies record filters first,
+then record/feature/sequence transforms in the order declared in the dataset config
+(`pyproject.toml`, `src/datapipeline/utils/load.py`,
 `src/datapipeline/pipeline/utils/transform_utils.py`). Built-in helpers cover common
 comparisons (including timezone-aware checks) and time-based transforms (lags, sliding
 windows) if you need quick wins (`src/datapipeline/filters/filters.py`,
-`src/datapipeline/transforms/transforms.py`, `src/datapipeline/transforms/sequence.py`).
+`src/datapipeline/transforms/record.py`, `src/datapipeline/transforms/feature.py`,
+`src/datapipeline/transforms/sequence.py`).
 
 ### Prototype with synthetic time-series data
 
@@ -268,8 +367,7 @@ transform to build sliding-window feature flights without external datasets
 
 | Type | Description |
 | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Record` | Canonical payload containing a `value`; extended by other record types (`src/datapipeline/domain/record.py`). |
-| `TimeFeatureRecord` | A record with a timezone-aware `time` attribute, normalized to UTC to avoid boundary issues (`src/datapipeline/domain/record.py`). |
+| `TimeSeriesRecord` | Canonical record with `time` (tz-aware, normalized to UTC) and `value`; the pipeline treats streams as ordered series (`src/datapipeline/domain/record.py`).|
 | `FeatureRecord` | Links a record (or list of records from sequence transforms) to a `feature_id` and `group_key` (`src/datapipeline/domain/feature.py`). |
 | `Vector` | Final grouped payload: a mapping of feature IDs to scalars or ordered lists plus helper methods for shape/key access (`src/datapipeline/domain/vector.py`). |
 