jerry_thomas-0.0.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. jerry_thomas-0.0.2/LICENSE +21 -0
  2. jerry_thomas-0.0.2/PKG-INFO +301 -0
  3. jerry_thomas-0.0.2/README.md +285 -0
  4. jerry_thomas-0.0.2/pyproject.toml +84 -0
  5. jerry_thomas-0.0.2/setup.cfg +4 -0
  6. jerry_thomas-0.0.2/src/datapipeline/__init__.py +0 -0
  7. jerry_thomas-0.0.2/src/datapipeline/analysis/__init__.py +0 -0
  8. jerry_thomas-0.0.2/src/datapipeline/analysis/vector_analyzer.py +49 -0
  9. jerry_thomas-0.0.2/src/datapipeline/cli/app.py +208 -0
  10. jerry_thomas-0.0.2/src/datapipeline/cli/commands/analyze.py +32 -0
  11. jerry_thomas-0.0.2/src/datapipeline/cli/commands/domain.py +9 -0
  12. jerry_thomas-0.0.2/src/datapipeline/cli/commands/filter.py +10 -0
  13. jerry_thomas-0.0.2/src/datapipeline/cli/commands/link.py +95 -0
  14. jerry_thomas-0.0.2/src/datapipeline/cli/commands/list_.py +22 -0
  15. jerry_thomas-0.0.2/src/datapipeline/cli/commands/plugin.py +10 -0
  16. jerry_thomas-0.0.2/src/datapipeline/cli/commands/run.py +151 -0
  17. jerry_thomas-0.0.2/src/datapipeline/cli/commands/source.py +17 -0
  18. jerry_thomas-0.0.2/src/datapipeline/cli/openers.py +11 -0
  19. jerry_thomas-0.0.2/src/datapipeline/cli/visuals.py +91 -0
  20. jerry_thomas-0.0.2/src/datapipeline/common/__init__.py +0 -0
  21. jerry_thomas-0.0.2/src/datapipeline/common/geo.py +13 -0
  22. jerry_thomas-0.0.2/src/datapipeline/config/__init__.py +0 -0
  23. jerry_thomas-0.0.2/src/datapipeline/config/catalog.py +22 -0
  24. jerry_thomas-0.0.2/src/datapipeline/config/dataset/dataset.py +19 -0
  25. jerry_thomas-0.0.2/src/datapipeline/config/dataset/feature.py +24 -0
  26. jerry_thomas-0.0.2/src/datapipeline/config/dataset/group_by.py +31 -0
  27. jerry_thomas-0.0.2/src/datapipeline/config/dataset/loader.py +19 -0
  28. jerry_thomas-0.0.2/src/datapipeline/config/dataset/normalize.py +10 -0
  29. jerry_thomas-0.0.2/src/datapipeline/config/project.py +24 -0
  30. jerry_thomas-0.0.2/src/datapipeline/domain/__init__.py +0 -0
  31. jerry_thomas-0.0.2/src/datapipeline/domain/feature.py +10 -0
  32. jerry_thomas-0.0.2/src/datapipeline/domain/record.py +20 -0
  33. jerry_thomas-0.0.2/src/datapipeline/domain/vector.py +44 -0
  34. jerry_thomas-0.0.2/src/datapipeline/filters/filters.py +88 -0
  35. jerry_thomas-0.0.2/src/datapipeline/mappers/noop.py +5 -0
  36. jerry_thomas-0.0.2/src/datapipeline/mappers/synthetic/time.py +19 -0
  37. jerry_thomas-0.0.2/src/datapipeline/parsers/identity.py +14 -0
  38. jerry_thomas-0.0.2/src/datapipeline/pipeline/__init__.py +0 -0
  39. jerry_thomas-0.0.2/src/datapipeline/pipeline/pipelines.py +46 -0
  40. jerry_thomas-0.0.2/src/datapipeline/pipeline/stages.py +64 -0
  41. jerry_thomas-0.0.2/src/datapipeline/pipeline/utils/keygen.py +20 -0
  42. jerry_thomas-0.0.2/src/datapipeline/pipeline/utils/memory_sort.py +27 -0
  43. jerry_thomas-0.0.2/src/datapipeline/pipeline/utils/ordering.py +52 -0
  44. jerry_thomas-0.0.2/src/datapipeline/pipeline/utils/transform_utils.py +120 -0
  45. jerry_thomas-0.0.2/src/datapipeline/plugins.py +7 -0
  46. jerry_thomas-0.0.2/src/datapipeline/services/bootstrap.py +158 -0
  47. jerry_thomas-0.0.2/src/datapipeline/services/constants.py +12 -0
  48. jerry_thomas-0.0.2/src/datapipeline/services/entrypoints.py +69 -0
  49. jerry_thomas-0.0.2/src/datapipeline/services/factories.py +18 -0
  50. jerry_thomas-0.0.2/src/datapipeline/services/paths.py +28 -0
  51. jerry_thomas-0.0.2/src/datapipeline/services/project_paths.py +35 -0
  52. jerry_thomas-0.0.2/src/datapipeline/services/scaffold/__init__.py +2 -0
  53. jerry_thomas-0.0.2/src/datapipeline/services/scaffold/domain.py +23 -0
  54. jerry_thomas-0.0.2/src/datapipeline/services/scaffold/filter.py +32 -0
  55. jerry_thomas-0.0.2/src/datapipeline/services/scaffold/mappers.py +52 -0
  56. jerry_thomas-0.0.2/src/datapipeline/services/scaffold/plugin.py +23 -0
  57. jerry_thomas-0.0.2/src/datapipeline/services/scaffold/source.py +165 -0
  58. jerry_thomas-0.0.2/src/datapipeline/services/scaffold/templates.py +32 -0
  59. jerry_thomas-0.0.2/src/datapipeline/sources/__init__.py +0 -0
  60. jerry_thomas-0.0.2/src/datapipeline/sources/composed_loader.py +38 -0
  61. jerry_thomas-0.0.2/src/datapipeline/sources/decoders.py +64 -0
  62. jerry_thomas-0.0.2/src/datapipeline/sources/factory.py +53 -0
  63. jerry_thomas-0.0.2/src/datapipeline/sources/models/__init__.py +18 -0
  64. jerry_thomas-0.0.2/src/datapipeline/sources/models/base.py +12 -0
  65. jerry_thomas-0.0.2/src/datapipeline/sources/models/generator.py +23 -0
  66. jerry_thomas-0.0.2/src/datapipeline/sources/models/loader.py +52 -0
  67. jerry_thomas-0.0.2/src/datapipeline/sources/models/parser.py +11 -0
  68. jerry_thomas-0.0.2/src/datapipeline/sources/models/source.py +28 -0
  69. jerry_thomas-0.0.2/src/datapipeline/sources/models/synthetic.py +11 -0
  70. jerry_thomas-0.0.2/src/datapipeline/sources/synthetic/__init__.py +0 -0
  71. jerry_thomas-0.0.2/src/datapipeline/sources/synthetic/time/__init__.py +0 -0
  72. jerry_thomas-0.0.2/src/datapipeline/sources/synthetic/time/loader.py +30 -0
  73. jerry_thomas-0.0.2/src/datapipeline/sources/synthetic/time/parser.py +9 -0
  74. jerry_thomas-0.0.2/src/datapipeline/sources/transports.py +66 -0
  75. jerry_thomas-0.0.2/src/datapipeline/streams/canonical.py +28 -0
  76. jerry_thomas-0.0.2/src/datapipeline/streams/raw.py +16 -0
  77. jerry_thomas-0.0.2/src/datapipeline/templates/plugin_skeleton/README.md +48 -0
  78. jerry_thomas-0.0.2/src/datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml +4 -0
  79. jerry_thomas-0.0.2/src/datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml +4 -0
  80. jerry_thomas-0.0.2/src/datapipeline/templates/plugin_skeleton/config/contracts/time_ticks.yaml +2 -0
  81. jerry_thomas-0.0.2/src/datapipeline/templates/plugin_skeleton/config/distilleries/time_ticks.yaml +9 -0
  82. jerry_thomas-0.0.2/src/datapipeline/templates/plugin_skeleton/config/project.yaml +8 -0
  83. jerry_thomas-0.0.2/src/datapipeline/templates/plugin_skeleton/config/recipe.yaml +17 -0
  84. jerry_thomas-0.0.2/src/datapipeline/templates/plugin_skeleton/pyproject.toml +11 -0
  85. jerry_thomas-0.0.2/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  86. jerry_thomas-0.0.2/src/datapipeline/templates/stubs/dto.py.j2 +24 -0
  87. jerry_thomas-0.0.2/src/datapipeline/templates/stubs/filter.py.j2 +16 -0
  88. jerry_thomas-0.0.2/src/datapipeline/templates/stubs/loader_synthetic.py.j2 +38 -0
  89. jerry_thomas-0.0.2/src/datapipeline/templates/stubs/mapper.py.j2 +20 -0
  90. jerry_thomas-0.0.2/src/datapipeline/templates/stubs/parser.py.j2 +18 -0
  91. jerry_thomas-0.0.2/src/datapipeline/templates/stubs/parser_custom.py.j2 +14 -0
  92. jerry_thomas-0.0.2/src/datapipeline/templates/stubs/record.py.j2 +18 -0
  93. jerry_thomas-0.0.2/src/datapipeline/templates/stubs/source.yaml.j2 +11 -0
  94. jerry_thomas-0.0.2/src/datapipeline/transforms/sequence.py +31 -0
  95. jerry_thomas-0.0.2/src/datapipeline/transforms/transforms.py +15 -0
  96. jerry_thomas-0.0.2/src/datapipeline/utils/__init__.py +0 -0
  97. jerry_thomas-0.0.2/src/datapipeline/utils/load.py +36 -0
  98. jerry_thomas-0.0.2/src/datapipeline/utils/time.py +32 -0
  99. jerry_thomas-0.0.2/src/jerry_thomas.egg-info/PKG-INFO +301 -0
  100. jerry_thomas-0.0.2/src/jerry_thomas.egg-info/SOURCES.txt +102 -0
  101. jerry_thomas-0.0.2/src/jerry_thomas.egg-info/dependency_links.txt +1 -0
  102. jerry_thomas-0.0.2/src/jerry_thomas.egg-info/entry_points.txt +40 -0
  103. jerry_thomas-0.0.2/src/jerry_thomas.egg-info/requires.txt +5 -0
  104. jerry_thomas-0.0.2/src/jerry_thomas.egg-info/top_level.txt +1 -0
jerry_thomas-0.0.2/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Anders Skottlind

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
jerry_thomas-0.0.2/PKG-INFO
@@ -0,0 +1,301 @@
Metadata-Version: 2.4
Name: jerry-thomas
Version: 0.0.2
Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
Author: Your Name
License: MIT
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy<3.0,>=1.24
Requires-Dist: pydantic>=1.8
Requires-Dist: PyYAML>=5.4
Requires-Dist: tqdm>=4.0
Requires-Dist: jinja2>=3.0
Dynamic: license-file

# Jerry Thomas

Jerry Thomas turns the datapipeline runtime into a cocktail program. You still install the
same Python package (`datapipeline`) and tap into the plugin architecture, but every CLI
dance step nods to a craft bar. Declarative YAML menus describe projects, sources and
datasets, pipelines move payloads through record/feature/vector stations, and setuptools
entry points keep the back bar stocked with new ingredients.

---

## How the bar is set up

```text
raw source → canonical stream → record stage → feature stage → vector stage
```

1. **Raw sources (bottles on the shelf)** bundle a loader + parser recipe. Loaders handle
   the I/O (files, URLs or synthetic runs) and parsers map rows into typed records while
   skimming the dregs (`src/datapipeline/sources/models/loader.py`,
   `src/datapipeline/sources/models/source.py`). The bootstrapper registers each source under
   an alias so you can order it later in the service flow (`src/datapipeline/streams/raw.py`,
   `src/datapipeline/services/bootstrap.py`).
2. **Canonical streams (house infusions)** optionally apply a mapper on top of a raw
   source to normalize payloads before the dataset drinks them
   (`src/datapipeline/streams/canonical.py`, `src/datapipeline/services/factories.py`).
3. **Dataset stages (prep stations)** read the configured canonical streams. Record stages
   are your strainers and shakers, feature stages bottle the clarified spirits into keyed
   features (with optional sequence transforms), and vector stages line up the flights ready
   for service (`src/datapipeline/pipeline/pipelines.py`, `src/datapipeline/pipeline/stages.py`,
   `src/datapipeline/config/dataset/feature.py`).
4. **Vectors (tasting flights)** carry grouped feature values; downstream tasters can
   inspect them for balance and completeness
   (`src/datapipeline/domain/vector.py`, `src/datapipeline/analysis/vector_analyzer.py`).
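
To make the flow concrete, here is a self-contained toy version of that hand-off written as
plain Python generators. It is illustrative only: the stage functions, field names and the
hourly group key below are invented for this sketch, not taken from
`src/datapipeline/pipeline/pipelines.py`.

```python
# Toy raw -> record -> feature -> vector hand-off; names and fields are invented.
from collections import defaultdict
from typing import Iterable, Iterator

def records(raw_rows: Iterable[dict]) -> Iterator[dict]:
    """Record stage: keep only rows that parse cleanly (skim the dregs)."""
    for row in raw_rows:
        if "time" in row and "value" in row:
            yield row

def features(recs: Iterable[dict], feature_id: str) -> Iterator[tuple]:
    """Feature stage: key each record by feature id and an hourly group key."""
    for rec in recs:
        yield feature_id, rec["time"][:13], rec["value"]   # "2024-01-01T16" bucket

def vectors(feats: Iterable[tuple]) -> Iterator[dict]:
    """Vector stage: collect every feature that shares a group key."""
    grouped = defaultdict(dict)
    for fid, key, value in feats:
        grouped[key][fid] = value
    for key, payload in grouped.items():
        yield {"group_key": key, "features": payload}

raw = [{"time": "2024-01-01T16:00:00Z", "value": 1.0},
       {"time": "2024-01-01T17:30:00Z", "value": 2.0}]
print(list(vectors(features(records(raw), "hour_spritz"))))
```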

---

## Bar back cheat sheet

| Path | What lives here |
| ---- | --------------- |
| `src/datapipeline/cli` | Argparse-powered bar program with commands for running pipelines, inspecting pours, scaffolding plugins and projecting service flow (`cli/app.py`, `cli/openers.py`, `cli/visuals.py`). |
| `src/datapipeline/services` | Bootstrapping (project loading, YAML interpolation), runtime factories and scaffolding helpers for new bar tools (`services/bootstrap.py`, `services/factories.py`, `services/scaffold/plugin.py`). |
| `src/datapipeline/pipeline` | Pure functions that build record/feature/vector iterators plus supporting utilities for ordering and transform wiring (`pipeline/pipelines.py`, `pipeline/utils/transform_utils.py`). |
| `src/datapipeline/domain` | Data structures representing records, feature records and vectors coming off the line (`domain/record.py`, `domain/feature.py`, `domain/vector.py`). |
| `src/datapipeline/transforms` & `src/datapipeline/filters` | Built-in transforms (lagging timestamps, sliding windows) and filter helpers exposed through entry points (`transforms/transforms.py`, `transforms/sequence.py`, `filters/filters.py`). |
| `src/datapipeline/sources/synthetic/time` | Example synthetic time-series loader/parser pair plus helper mappers for experimentation while the real spirits arrive (`sources/synthetic/time/loader.py`, `sources/synthetic/time/parser.py`, `mappers/synthetic/time.py`). |

---

## Opening the bar

### 1. Install the tools

```bash
python -m venv .venv
source .venv/bin/activate # Windows: .venv\Scripts\activate
python -m pip install --upgrade pip
pip install jerry-thomas
```

The published wheel exposes the `jerry` CLI (backed by the `datapipeline` package) and
pulls in core dependencies like Pydantic, PyYAML, tqdm and Jinja2 (see
`pyproject.toml`). Prefer `pip install -e .` only when you are actively developing this
repository. Double-check the back bar is reachable:

```bash
python -c "import datapipeline; print('bar ready')"
```

### 2. Draft your bar book

Create a `config/project.yaml` so the runtime knows where to find ingredients, infusions
and the tasting menu. Globals are optional but handy for sharing values—they are
interpolated into downstream YAML specs during bootstrap
(`src/datapipeline/config/project.py`, `src/datapipeline/services/bootstrap.py`).

```yaml
version: 1
paths:
  sources: config/distilleries
  streams: config/contracts
  dataset: config/recipe.yaml
globals:
  opening_time: "2024-01-01T16:00:00Z"
  last_call: "2024-01-02T02:00:00Z"
```

> Helper functions in `src/datapipeline/services/project_paths.py` resolve relative paths
> against the project root and ensure the mise en place folders exist.
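
The helpers themselves are not shown in this README, but the behaviour described in the note
can be sketched as follows. The function name and the root-resolution rule here are
assumptions, not the actual API of `project_paths.py`.

```python
# Hypothetical sketch: resolve a project-relative path and create the folder if missing.
from pathlib import Path

def resolve_project_path(project_file: str, relative: str) -> Path:
    root = Path(project_file).resolve().parent.parent   # config/project.yaml -> project root
    target = root / relative
    target.mkdir(parents=True, exist_ok=True)           # ensure the mise en place folder exists
    return target

print(resolve_project_path("config/project.yaml", "config/distilleries"))
```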

### 3. Stock the bottles (raw sources)

Create `config/distilleries/<alias>.yaml` files. Each must expose a `parser` and `loader`
pointing at entry points plus any constructor arguments
(`src/datapipeline/services/bootstrap.py`). Here is a synthetic clock source that feels
like a drip of barrel-aged bitters:

```yaml
# config/distilleries/time_ticks.yaml
parser:
  entrypoint: "synthetic.time"
  args: {}
loader:
  entrypoint: "synthetic.time"
  args:
    start: "${opening_time}"
    end: "${last_call}"
    frequency: "1h"
```

That file wires up the built-in `TimeTicksGenerator` + parser pair that yields
timezone-aware timestamps (`sources/synthetic/time/loader.py`,
`sources/synthetic/time/parser.py`).
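
For intuition, a stripped-down stand-in for such a tick source might look like the sketch
below. This is not the packaged `TimeTicksGenerator`; the signature and the ISO-string
handling are assumptions.

```python
# Minimal stand-in for a synthetic tick source: yields timezone-aware datetimes
# from start to end at a fixed step.
from datetime import datetime, timedelta, timezone
from typing import Iterator

def time_ticks(start: str, end: str, step: timedelta = timedelta(hours=1)) -> Iterator[datetime]:
    cursor = datetime.fromisoformat(start.replace("Z", "+00:00")).astimezone(timezone.utc)
    stop = datetime.fromisoformat(end.replace("Z", "+00:00")).astimezone(timezone.utc)
    while cursor <= stop:
        yield cursor
        cursor += step

for tick in time_ticks("2024-01-01T16:00:00Z", "2024-01-01T19:00:00Z"):
    print(tick.isoformat())
```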

### 4. Mix house infusions (canonical streams)

Canonical specs live under `config/contracts/` and reference a raw source alias plus an
optional mapper entry point (`src/datapipeline/services/bootstrap.py`,
`src/datapipeline/streams/canonical.py`). This example turns each timestamp into a citrus
spritz feature:

```yaml
# config/contracts/time/encode.yaml
source: time_ticks
mapper:
  entrypoint: "synthetic.time.encode"
  args:
    mode: spritz
```

The mapper uses the provided mode to create a new `TimeFeatureRecord` stream ready for the
feature stage (`mappers/synthetic/time.py`).
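
The bundled mapper is not reproduced here, but conceptually it does something along these
lines. The `spritz` encoding, the function signature and the dict-shaped records are all
invented for illustration; the real mapper emits `TimeFeatureRecord` objects.

```python
# Hypothetical sketch of a time-encoding mapper: turns each tick into a record-like
# payload carrying an engineered value derived from the hour of day.
import math
from datetime import datetime, timezone
from typing import Iterable, Iterator

def encode_time(ticks: Iterable[datetime], mode: str = "spritz") -> Iterator[dict]:
    for tick in ticks:
        hour = tick.astimezone(timezone.utc).hour
        value = math.sin(2 * math.pi * hour / 24) if mode == "spritz" else float(hour)
        yield {"time": tick, "value": value}

ticks = [datetime(2024, 1, 1, h, tzinfo=timezone.utc) for h in (16, 17, 18)]
for rec in encode_time(ticks):
    print(rec)
```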

### 5. Script the tasting menu (dataset)

Datasets describe which canonical streams should be read at each station and how flights
are grouped (`src/datapipeline/config/dataset/dataset.py`). A minimal hourly menu might
look like:

```yaml
# config/recipe.yaml
group_by:
  keys:
    - type: time
      field: time
      resolution: 1h
features:
  - stream: time.encode
    feature_id: hour_spritz
    partition_by: null
    filters: []
    transforms:
      - time_lag: "0h"
```

Use the sample `dataset` template as a starting point if you prefer scaffolding before
pouring concrete values. Group keys support time bucketing (with automatic flooring to the
requested resolution) and categorical splits
(`src/datapipeline/config/dataset/group_by.py`,
`src/datapipeline/config/dataset/normalize.py`). You can also attach feature or sequence
transforms—such as the sliding `TimeWindowTransformer`—directly in the YAML by referencing
their entry point names (`src/datapipeline/transforms/sequence.py`).
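
The flooring behaviour can be pictured with a small helper. This is only a sketch of the
idea; the resolution parsing and the real grouping logic live in the files referenced above.

```python
# Sketch of flooring a timestamp to a grouping resolution such as one hour.
from datetime import datetime, timedelta, timezone

def floor_to_resolution(ts: datetime, resolution: timedelta) -> datetime:
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    buckets = (ts - epoch) // resolution        # whole buckets since the epoch
    return epoch + buckets * resolution

ts = datetime(2024, 1, 1, 16, 37, 12, tzinfo=timezone.utc)
print(floor_to_resolution(ts, timedelta(hours=1)))   # 2024-01-01 16:00:00+00:00
```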

Once the book is ready, run the bootstrapper (the CLI does this automatically) to
materialize all registered sources and streams
(`src/datapipeline/services/bootstrap.py`).

---

## Running service

### Prep any station (with visuals)

```bash
jerry prep pour --project config/project.yaml --limit 20
jerry prep build --project config/project.yaml --limit 20
jerry prep stir --project config/project.yaml --limit 20
```

- `prep pour` shows the record-stage ingredients headed for each feature.
- `prep build` highlights `FeatureRecord` entries after the shake/strain sequence.
- `prep stir` emits grouped vectors—the tasting flight before it leaves the pass.

All variants respect `--limit` and display tqdm-powered progress bars for the underlying
loaders. The CLI wires up `build_record_pipeline`, `build_feature_pipeline` and
`build_vector_pipeline`, so what you see mirrors the service line
(`src/datapipeline/cli/app.py`, `src/datapipeline/cli/commands/run.py`,
`src/datapipeline/cli/openers.py`, `src/datapipeline/cli/visuals.py`,
`src/datapipeline/pipeline/pipelines.py`).
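
If you drive a stage iterator from your own script, the same limit-and-progress behaviour is
easy to emulate with `itertools.islice` and `tqdm`; the stand-in iterator below is just a
placeholder for whatever the pipeline builders return.

```python
# Generic preview pattern: cap an iterator at `limit` items and show a progress bar.
from itertools import islice
from tqdm import tqdm

def preview(stage_iter, limit=20):
    for item in tqdm(islice(stage_iter, limit), total=limit, desc="prep"):
        print(item)

preview(iter(range(100)), limit=5)   # replace with a record/feature/vector iterator
```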

### Serve the flights (production mode)

```bash
jerry serve --project config/project.yaml --output print
jerry serve --project config/project.yaml --output stream
jerry serve --project config/project.yaml --output exports/batch.pt
```

Production mode skips the bar flair and focuses on throughput. `print` writes tasting
notes to stdout, `stream` emits newline-delimited JSON (with values coerced to strings when
necessary), and a `.pt` destination stores a pickle-compatible payload for later pours.
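
The `stream` mode amounts to newline-delimited JSON. A rough equivalent in plain Python
(the vector layout is invented for the example):

```python
# One JSON object per line; non-serializable values (datetimes, ...) fall back to str().
import json
import sys
from datetime import datetime, timezone

vectors = [
    {"group_key": datetime(2024, 1, 1, 16, tzinfo=timezone.utc), "hour_spritz": -0.866},
    {"group_key": datetime(2024, 1, 1, 17, tzinfo=timezone.utc), "hour_spritz": -0.966},
]
for vec in vectors:
    sys.stdout.write(json.dumps(vec, default=str) + "\n")
```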

### Taste the balance (vector quality)

```bash
jerry taste --project config/project.yaml
```

This command reuses the vector pipeline, collects presence counts for every configured
feature and flags empty or incomplete flights so you can diagnose upstream issues quickly
(`src/datapipeline/cli/commands/analyze.py`, `src/datapipeline/analysis/vector_analyzer.py`).
Use `--limit` to spot-check during service.
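
In spirit, the check resembles the presence count below; the feature ids and vector shapes
are made up, and the real analyzer in `analysis/vector_analyzer.py` will differ in detail.

```python
# Count how often each configured feature shows up across vectors and flag empty ones.
from collections import Counter

configured = ["hour_spritz", "temperature"]                    # assumed feature ids
vectors = [{"hour_spritz": 0.5}, {"hour_spritz": 0.7, "temperature": 12.3}, {}]

presence = Counter(fid for vec in vectors for fid in vec if fid in configured)
empty = sum(1 for vec in vectors if not vec)

for fid in configured:
    print(f"{fid}: present in {presence[fid]}/{len(vectors)} vectors")
print(f"empty flights: {empty}")
```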

---

## Extending the bar program

### Scaffold a plugin package

```bash
jerry station init --name my_datapipeline --out .
```

The generator copies a ready-made skeleton (pyproject, README, package directory) and
swaps placeholders for your package name so you can start adding new spirits immediately
(`src/datapipeline/cli/app.py`, `src/datapipeline/services/scaffold/plugin.py`). Install the
resulting project in editable mode to expose your loaders, parsers, mappers and
transforms.
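
Conceptually that step is a copy-and-rename pass over the bundled
`templates/plugin_skeleton` tree, which uses a `{{PACKAGE_NAME}}` placeholder in its paths.
The sketch below captures the idea only; the real scaffolder handles more cases.

```python
# Simplified sketch: copy a template tree and substitute the {{PACKAGE_NAME}} placeholder
# in both file contents and path names (deepest paths first so renames stay valid).
import shutil
from pathlib import Path

def scaffold(template_dir: str, out_dir: str, package_name: str) -> None:
    dest = Path(out_dir) / package_name
    shutil.copytree(template_dir, dest)
    for path in sorted(dest.rglob("*"), reverse=True):
        if path.is_file():
            path.write_text(path.read_text().replace("{{PACKAGE_NAME}}", package_name))
        if "{{PACKAGE_NAME}}" in path.name:
            path.rename(path.with_name(path.name.replace("{{PACKAGE_NAME}}", package_name)))
```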

### Create new sources, domains and contracts

Use the CLI helpers to scaffold boilerplate code in your plugin workspace:

```bash
jerry distillery add --provider dmi --dataset metobs --transport fs --format csv
jerry spirit add --domain metobs --time-aware
jerry contract --time-aware
```

The distillery command writes DTO/parser stubs, updates entry points and drops a matching
YAML file in `config/distilleries/` pre-filled with composed-loader defaults for the chosen
transport (`src/datapipeline/cli/app.py`, `src/datapipeline/services/scaffold/source.py`).

### Add custom filters or transforms

Register new functions/classes under the appropriate entry point group in your plugin’s
`pyproject.toml`. The runtime resolves them through `load_ep`, applies record-level
filters first, then record/feature/sequence transforms in the order declared in the
dataset config (`pyproject.toml`, `src/datapipeline/utils/load.py`,
`src/datapipeline/pipeline/utils/transform_utils.py`). Built-in helpers cover common
comparisons (including timezone-aware checks) and time-based transforms (lags, sliding
windows) if you need quick wins (`src/datapipeline/filters/filters.py`,
`src/datapipeline/transforms/transforms.py`, `src/datapipeline/transforms/sequence.py`).
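
Such a registration might look like the snippet below in the plugin’s `pyproject.toml`. The
group names and module paths are assumptions for illustration only; check the groups this
package actually declares (for example in `src/jerry_thomas.egg-info/entry_points.txt`)
before wiring up your own.

```toml
# Hypothetical plugin registration; group names and module paths are assumptions.
[project.entry-points."datapipeline.filters"]
after_last_call = "my_datapipeline.filters:after_last_call"

[project.entry-points."datapipeline.transforms"]
garnish = "my_datapipeline.transforms:GarnishTransformer"
```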

### Prototype with synthetic time-series data

Need sample pours while wiring up transforms? Reuse the bundled synthetic time loader +
parser and season it with the `encode_time` mapper for engineered temporal features
(`src/datapipeline/sources/synthetic/time/loader.py`,
`src/datapipeline/sources/synthetic/time/parser.py`,
`src/datapipeline/mappers/synthetic/time.py`). Pair it with the `time_window` sequence
transform to build sliding-window feature flights without external datasets
(`src/datapipeline/transforms/sequence.py`).

---

## Data model tasting notes

| Type | Description |
| ---- | ----------- |
| `Record` | Canonical payload containing a `value`; extended by other record types (`src/datapipeline/domain/record.py`). |
| `TimeFeatureRecord` | A record with a timezone-aware `time` attribute, normalized to UTC to avoid boundary issues (`src/datapipeline/domain/record.py`). |
| `FeatureRecord` | Links a record (or list of records from sequence transforms) to a `feature_id` and `group_key` (`src/datapipeline/domain/feature.py`). |
| `Vector` | Final grouped payload: a mapping of feature IDs to scalars or ordered lists plus helper methods for shape/key access (`src/datapipeline/domain/vector.py`). |
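
Read as code, the shapes in the table are roughly the following. The field names are
paraphrased from the descriptions above, not copied from the actual modules, and the real
classes carry more behaviour than these bare dataclasses.

```python
# Rough, illustrative shapes of the data model; not the package's definitions.
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Union

@dataclass
class Record:
    value: float                               # canonical payload

@dataclass
class TimeFeatureRecord(Record):
    time: datetime                             # timezone-aware, normalized to UTC

@dataclass
class FeatureRecord:
    feature_id: str
    group_key: str
    record: Union[Record, List[Record]]        # one record, or a window from a sequence transform

@dataclass
class Vector:
    features: Dict[str, Union[float, List[float]]] = field(default_factory=dict)
```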

---

## Developer shift checklist

These commands mirror the tooling used in CI and are useful while iterating locally:

```bash
pip install -e .[dev]
pytest
```
jerry_thomas-0.0.2/README.md
@@ -0,0 +1,285 @@
(The content of README.md is identical to the README text embedded in PKG-INFO above.)