jerry-thomas 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. datapipeline/analysis/vector/collector.py +275 -0
  2. datapipeline/analysis/vector/matrix.py +527 -0
  3. datapipeline/analysis/vector/report.py +317 -0
  4. datapipeline/analysis/vector_analyzer.py +3 -694
  5. datapipeline/build/__init__.py +6 -0
  6. datapipeline/build/state.py +52 -0
  7. datapipeline/build/tasks.py +186 -0
  8. datapipeline/cli/app.py +125 -56
  9. datapipeline/cli/commands/build.py +39 -0
  10. datapipeline/cli/commands/domain.py +1 -1
  11. datapipeline/cli/commands/filter.py +1 -2
  12. datapipeline/cli/commands/inspect.py +77 -26
  13. datapipeline/cli/commands/link.py +11 -12
  14. datapipeline/cli/commands/plugin.py +1 -1
  15. datapipeline/cli/commands/run.py +234 -110
  16. datapipeline/cli/commands/source.py +3 -3
  17. datapipeline/cli/commands/writers.py +138 -0
  18. datapipeline/cli/visuals/__init__.py +14 -0
  19. datapipeline/cli/{visuals.py → visuals/labels.py} +35 -24
  20. datapipeline/cli/visuals/sources.py +138 -0
  21. datapipeline/config/build.py +64 -0
  22. datapipeline/config/dataset/dataset.py +1 -2
  23. datapipeline/config/dataset/loader.py +1 -81
  24. datapipeline/config/postprocess.py +14 -0
  25. datapipeline/config/project.py +13 -1
  26. datapipeline/config/run.py +116 -0
  27. datapipeline/config/split.py +35 -0
  28. datapipeline/domain/vector.py +0 -9
  29. datapipeline/filters/filters.py +1 -1
  30. datapipeline/integrations/ml/__init__.py +16 -0
  31. datapipeline/integrations/ml/adapter.py +120 -0
  32. datapipeline/integrations/ml/pandas_support.py +46 -0
  33. datapipeline/integrations/ml/rows.py +82 -0
  34. datapipeline/integrations/ml/torch_support.py +94 -0
  35. datapipeline/pipeline/context.py +69 -0
  36. datapipeline/pipeline/pipelines.py +21 -23
  37. datapipeline/pipeline/split.py +171 -0
  38. datapipeline/pipeline/stages.py +54 -15
  39. datapipeline/pipeline/utils/keygen.py +2 -2
  40. datapipeline/pipeline/utils/transform_utils.py +64 -23
  41. datapipeline/plugins.py +1 -1
  42. datapipeline/runtime.py +73 -0
  43. datapipeline/services/artifacts.py +96 -0
  44. datapipeline/services/bootstrap/__init__.py +12 -0
  45. datapipeline/services/bootstrap/config.py +141 -0
  46. datapipeline/services/bootstrap/core.py +186 -0
  47. datapipeline/services/constants.py +5 -0
  48. datapipeline/services/entrypoints.py +1 -1
  49. datapipeline/services/factories.py +5 -2
  50. datapipeline/services/paths.py +1 -1
  51. datapipeline/services/project_paths.py +21 -0
  52. datapipeline/services/scaffold/domain.py +1 -2
  53. datapipeline/services/scaffold/filter.py +1 -2
  54. datapipeline/services/scaffold/mappers.py +1 -1
  55. datapipeline/services/scaffold/plugin.py +31 -5
  56. datapipeline/services/scaffold/source.py +2 -4
  57. datapipeline/sources/models/generator.py +6 -2
  58. datapipeline/sources/models/loader.py +0 -3
  59. datapipeline/sources/models/synthetic.py +1 -1
  60. datapipeline/sources/synthetic/time/loader.py +10 -2
  61. datapipeline/templates/plugin_skeleton/README.md +52 -7
  62. datapipeline/templates/plugin_skeleton/config/contracts/{time_hour_sin.yaml → time_hour_sin.synthetic.yaml} +3 -3
  63. datapipeline/templates/plugin_skeleton/config/contracts/{time_linear.yaml → time_linear.synthetic.yaml} +3 -3
  64. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +9 -0
  65. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +3 -18
  66. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +13 -0
  67. datapipeline/templates/plugin_skeleton/config/datasets/default/project.yaml +12 -0
  68. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +10 -0
  69. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +10 -0
  70. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +10 -0
  71. datapipeline/templates/plugin_skeleton/pyproject.toml +2 -2
  72. datapipeline/templates/stubs/dto.py.j2 +2 -0
  73. datapipeline/templates/stubs/mapper.py.j2 +5 -3
  74. datapipeline/templates/stubs/parser.py.j2 +1 -0
  75. datapipeline/transforms/feature/scaler.py +127 -62
  76. datapipeline/transforms/filter.py +5 -2
  77. datapipeline/transforms/stream/fill.py +3 -25
  78. datapipeline/transforms/utils.py +16 -0
  79. datapipeline/transforms/vector.py +62 -78
  80. datapipeline/transforms/vector_utils.py +19 -67
  81. datapipeline/utils/load.py +2 -2
  82. datapipeline/utils/pickle_model.py +30 -0
  83. datapipeline/utils/placeholders.py +35 -0
  84. jerry_thomas-0.3.0.dist-info/METADATA +502 -0
  85. jerry_thomas-0.3.0.dist-info/RECORD +139 -0
  86. datapipeline/cli/visual_source.py +0 -32
  87. datapipeline/common/__init__.py +0 -0
  88. datapipeline/common/geo.py +0 -13
  89. datapipeline/integrations/ml.py +0 -319
  90. datapipeline/registries/registries.py +0 -15
  91. datapipeline/services/bootstrap.py +0 -191
  92. jerry_thomas-0.2.0.dist-info/METADATA +0 -402
  93. jerry_thomas-0.2.0.dist-info/RECORD +0 -112
  94. {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/WHEEL +0 -0
  95. {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/entry_points.txt +0 -0
  96. {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/licenses/LICENSE +0 -0
  97. {jerry_thomas-0.2.0.dist-info → jerry_thomas-0.3.0.dist-info}/top_level.txt +0 -0
@@ -1,402 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: jerry-thomas
3
- Version: 0.2.0
4
- Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
5
- Author: Anders Skott Lind
6
- License: MIT
7
- Requires-Python: >=3.10
8
- Description-Content-Type: text/markdown
9
- License-File: LICENSE
10
- Requires-Dist: numpy<3.0,>=1.24
11
- Requires-Dist: pydantic>=2.0
12
- Requires-Dist: PyYAML>=5.4
13
- Requires-Dist: tqdm>=4.0
14
- Requires-Dist: jinja2>=3.0
15
- Provides-Extra: ml
16
- Requires-Dist: pandas>=2.0; extra == "ml"
17
- Requires-Dist: torch>=2.0; extra == "ml"
18
- Dynamic: license-file
19
-
20
- # Jerry Thomas
21
-
22
- Time‑Series First
23
- - This runtime is time‑series‑first. Every domain record must include a timezone‑aware `time` and a `value`.
24
- - Grouping is defined by time buckets only (`group_by.keys: [ { type: time, ... } ]`).
25
- - Feature streams are sorted by time; sequence transforms assume ordered series.
26
- - Categorical dimensions (e.g., station, zone, ticker) belong in `partition_by` so they become partitions of the same time series.
27
- - Non‑temporal grouping is not supported.
28
-
29
- Jerry Thomas turns the datapipeline runtime into a cocktail program. You still install the
30
- same Python package (`datapipeline`) and tap into the plugin architecture, but every CLI
31
- dance step nods to a craft bar. Declarative YAML menus describe projects, sources and
32
- datasets, pipelines move payloads through record/feature/vector stations, and setuptools
33
- entry points keep the back bar stocked with new ingredients.
34
-
35
- ---
36
-
37
- ## How the bar is set up
38
-
39
- ```text
40
- raw source → canonical stream → record stage → feature stage → vector stage
41
- ```
42
-
43
- 1. **Raw sources (bottles on the shelf)** bundle a loader + parser recipe. Loaders handle
44
- the I/O (files, URLs or synthetic runs) and parsers map rows into typed records while
45
- skimming the dregs (`src/datapipeline/sources/models/loader.py`,
46
- `src/datapipeline/sources/models/source.py`). The bootstrapper registers each source under
47
- an alias so you can order it later in the service flow (`src/datapipeline/streams/raw.py`,
48
- `src/datapipeline/services/bootstrap.py`).
49
- 2. **Canonical streams (house infusions)** optionally apply a mapper on top of a raw
50
- source to normalize payloads before the dataset drinks them
51
- (`src/datapipeline/streams/canonical.py`, `src/datapipeline/services/factories.py`).
52
- 3. **Dataset stages (prep stations)** read the configured canonical streams. Record stages
53
- are your strainers and shakers, feature stages bottle the clarified spirits into keyed
54
- features (with optional sequence transforms), and vector stages line up the flights ready
55
- for service (`src/datapipeline/pipeline/pipelines.py`, `src/datapipeline/pipeline/stages.py`,
56
- `src/datapipeline/config/dataset/feature.py`).
57
- 4. **Vectors (tasting flights)** carry grouped feature values; downstream tasters can
58
- inspect them for balance and completeness
59
- (`src/datapipeline/domain/vector.py`, `src/datapipeline/analysis/vector_analyzer.py`).
60
-
61
- ---
62
-
63
- ## Bar back cheat sheet
64
-
65
- | Path | What lives here |
66
- | ---------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
67
- | `src/datapipeline/cli` | Argparse-powered bar program with commands for running pipelines, inspecting pours, scaffolding plugins and projecting service flow (`cli/app.py`, `cli/openers.py`, `cli/visuals.py`). |
68
- | `src/datapipeline/services` | Bootstrapping (project loading, YAML interpolation), runtime factories and scaffolding helpers for new bar tools (`services/bootstrap.py`, `services/factories.py`, `services/scaffold/plugin.py`). |
69
- | `src/datapipeline/pipeline` | Pure functions that build record/feature/vector iterators plus supporting utilities for ordering and transform wiring (`pipeline/pipelines.py`, `pipeline/utils/transform_utils.py`). |
70
- | `src/datapipeline/domain` | Data structures representing records, feature records and vectors coming off the line (`domain/record.py`, `domain/feature.py`, `domain/vector.py`). |
71
- | `src/datapipeline/transforms` & `src/datapipeline/filters` | Built-in transforms (lagging timestamps, scaling, sliding windows) and filter helpers exposed through entry points (`transforms/record.py`, `transforms/feature.py`, `transforms/sequence.py`, `filters/filters.py`). |
72
- | `src/datapipeline/sources/synthetic/time` | Example synthetic time-series loader/parser pair plus helper mappers for experimentation while the real spirits arrive (`sources/synthetic/time/loader.py`, `sources/synthetic/time/parser.py`, `mappers/synthetic/time.py`). |
73
-
74
- ---
75
-
76
- ## Built-in DSL identifiers
77
-
78
- The YAML DSL resolves filters and transforms by entry-point name. These ship with the
79
- template out of the box:
80
-
81
- | Kind | Identifiers | Notes |
82
- | ----------------- | ----------------------------------------------------------------------------------------------- | ----- |
83
- | Filters | `eq`/`equals`, `ne`/`not_equal`, `lt`, `le`, `gt`, `ge`, `in`/`contains`, `nin`/`not_in` | Use as `- gt: { field: value }` or `- in: { field: [values...] }`. Synonyms map to the same implementation. |
84
- | Record transforms | `time_lag`, `drop_missing` | `time_lag` expects a duration string (e.g. `1h`), `drop_missing` removes `None`/`NaN` records. |
85
- | Feature transforms| `standard_scale` | Options: `with_mean`, `with_std`, optional `statistics`. |
86
- | Sequence transforms | `time_window`, `time_fill_mean`, `time_fill_median` | `time_window` builds sliding windows; the fill transforms impute missing values from running mean/median with optional `window`/`min_samples`. |
87
- | Vector transforms | `fill_history`, `fill_horizontal`, `fill_constant`, `drop_missing` | History fill uses prior buckets, horizontal fill aggregates sibling partitions, constant sets a default, and drop removes vectors below coverage thresholds. |
88
-
89
- Extend `pyproject.toml` with additional entry points to register custom logic under your
90
- own identifiers.
91
-
92
- ---
93
-
94
- ## Opening the bar
95
-
96
- ### 1. Install the tools
97
-
98
- ```bash
99
- python -m venv .venv
100
- source .venv/bin/activate # Windows: .venv\Scripts\activate
101
- python -m pip install --upgrade pip
102
- pip install jerry-thomas
103
- ```
104
-
105
- The published wheel exposes the `jerry` CLI (backed by the `datapipeline` package) and
106
- pulls in core dependencies like Pydantic, PyYAML, tqdm and Jinja2 (see
107
- `pyproject.toml`). Prefer `pip install -e .` only when you are actively developing this
108
- repository. Double-check the back bar is reachable:
109
-
110
- ```bash
111
- python -c "import datapipeline; print('bar ready')"
112
- ```
113
-
114
- ### 2. Draft your bar book
115
-
116
- Create a `config/recipes/<name>/project.yaml` so the runtime knows where to find
117
- ingredients, infusions and the tasting menu. Globals are optional but handy for sharing
118
- values—they are interpolated into downstream YAML specs during bootstrap
119
- (`src/datapipeline/config/project.py`, `src/datapipeline/services/bootstrap.py`).
120
-
121
- ```yaml
122
- version: 1
123
- paths:
124
- sources: ../../sources
125
- streams: ../../contracts
126
- dataset: dataset.yaml
127
- globals:
128
- opening_time: "2024-01-01T16:00:00Z"
129
- last_call: "2024-01-02T02:00:00Z"
130
- ```
131
-
132
- > Helper functions in `src/datapipeline/services/project_paths.py` resolve relative paths
133
- > against the project root and ensure the mise en place folders exist.
134
-
135
- ### 3. Stock the bottles (raw sources)
136
-
137
- Create `config/sources/<alias>.yaml` files. Each must expose a `parser` and `loader`
138
- pointing at entry points plus any constructor arguments
139
- (`src/datapipeline/services/bootstrap.py`). Here is a synthetic clock source that feels
140
- like a drip of barrel-aged bitters:
141
-
142
- ```yaml
143
- # config/sources/time_ticks.yaml
144
- parser:
145
- entrypoint: "synthetic.time"
146
- args: {}
147
- loader:
148
- entrypoint: "synthetic.time"
149
- args:
150
- start: "${opening_time}"
151
- end: "${last_call}"
152
- frequency: "1h"
153
- ```
154
-
155
- That file wires up the built-in `TimeTicksGenerator` + parser pair that yields
156
- timezone-aware timestamps (`sources/synthetic/time/loader.py`,
157
- `sources/synthetic/time/parser.py`).
158
-
159
- ### 4. Mix house infusions (canonical streams)
160
-
161
- Canonical specs live under `config/contracts/` and reference a raw source alias plus an
162
- optional mapper entry point (`src/datapipeline/services/bootstrap.py`,
163
- `src/datapipeline/streams/canonical.py`). This example turns each timestamp into a citrus
164
- spritz feature:
165
-
166
- ```yaml
167
- # config/contracts/time/encode.yaml
168
- source: time_ticks
169
- mapper:
170
- entrypoint: "synthetic.time.encode"
171
- args:
172
- mode: spritz
173
- ```
174
-
175
- The mapper uses the provided mode to create a new `TimeSeriesRecord` stream ready for the
176
- feature stage (`mappers/synthetic/time.py`).
177
-
178
- ### 5. Script the tasting menu (dataset)
179
-
180
- Datasets describe which canonical streams should be read at each station and how flights
181
- are grouped (`src/datapipeline/config/dataset/dataset.py`). A minimal hourly menu might
182
- look like:
183
-
184
- ```yaml
185
- # config/recipes/default/dataset.yaml
186
- group_by:
187
- keys:
188
- - type: time
189
- field: time
190
- resolution: 1h
191
- features:
192
- - id: hour_spritz
193
- stream: time.encode
194
- transforms:
195
- - record:
196
- transform: time_lag
197
- args: 0h
198
- - feature:
199
- transform: standard_scale
200
- with_mean: true
201
- with_std: true
202
- - sequence:
203
- transform: time_window
204
- size: 4
205
- stride: 1
206
- - sequence:
207
- transform: time_fill_mean
208
- window: 24
209
- min_samples: 6
210
- ```
211
-
212
- Use the sample `dataset` template as a starting point if you prefer scaffolding before
213
- pouring concrete values. Group keys now require explicit time bucketing (with automatic
214
- flooring to the requested resolution) so every pipeline is clock-driven. You can attach
215
- feature or sequence transforms—such as the sliding `TimeWindowTransformer` or the
216
- `time_fill_mean`/`time_fill_median` imputers—directly in the YAML by referencing their
217
- entry point names (`src/datapipeline/transforms/sequence.py`).
218
-
219
- When vectors are assembled you can optionally apply `vector_transforms` to enforce schema
220
- guarantees. The built-ins cover:
221
-
222
- - `fill_history` – use running means/medians from prior buckets (per partition) with
223
- configurable window/minimum samples.
224
- - `fill_horizontal` – aggregate sibling partitions at the same timestamp (e.g. other
225
- stations) using mean/median.
226
- - `fill_constant` – provide a constant default for missing features/partitions.
227
- - `drop_missing` – drop vectors that fall below a coverage threshold or omit required
228
- features.
229
-
230
- Transforms accept either an explicit `expected` list or a manifest path to discover the
231
- full partition set (`build/partitions.json` produced by `jerry inspect partitions`).
232
-
233
- Once the book is ready, run the bootstrapper (the CLI does this automatically) to
234
- materialize all registered sources and streams
235
- (`src/datapipeline/services/bootstrap.py`).
236
-
237
- ---
238
-
239
- ## Running service
240
-
241
- ### Prep any station (with visuals)
242
-
243
- ```bash
244
- jerry prep pour --project config/datasets/default/project.yaml --limit 20
245
- jerry prep build --project config/datasets/default/project.yaml --limit 20
246
- jerry prep stir --project config/datasets/default/project.yaml --limit 20
247
- ```
248
-
249
- - `prep pour` shows the record-stage ingredients headed for each feature.
250
- - `prep build` highlights `FeatureRecord` entries after the shake/strain sequence.
251
- - `prep stir` emits grouped vectors—the tasting flight before it leaves the pass.
252
-
253
- All variants respect `--limit` and display tqdm-powered progress bars for the underlying
254
- loaders. The CLI wires up `build_record_pipeline`, `build_feature_pipeline` and
255
- `build_vector_pipeline`, so what you see mirrors the service line
256
- (`src/datapipeline/cli/app.py`, `src/datapipeline/cli/commands/run.py`,
257
- `src/datapipeline/cli/openers.py`, `src/datapipeline/cli/visuals.py`,
258
- `src/datapipeline/pipeline/pipelines.py`).
259
-
260
- ### Serve the flights (production mode)
261
-
262
- ```bash
263
- jerry serve --project config/datasets/default/project.yaml --output print
264
- jerry serve --project config/datasets/default/project.yaml --output stream
265
- jerry serve --project config/datasets/default/project.yaml --output exports/batch.pt
266
- ```
267
-
268
- Production mode skips the bar flair and focuses on throughput. `print` writes tasting
269
- notes to stdout, `stream` emits newline-delimited JSON (with values coerced to strings when
270
- necessary), and a `.pt` destination stores a pickle-compatible payload for later pours.
271
-
272
- ## Funnel vectors into ML projects
273
-
274
- Data scientists rarely want to shell out to the CLI; they need a programmatic
275
- hand-off that plugs vectors straight into notebooks, feature stores or training
276
- loops. The `datapipeline.integrations` package wraps the existing iterator
277
- builders with ML-friendly adapters without pulling pandas or torch into the
278
- core runtime.
279
-
280
- ```python
281
- from datapipeline.integrations import (
282
- VectorAdapter,
283
- dataframe_from_vectors,
284
- iter_vector_rows,
285
- torch_dataset,
286
- )
287
-
288
- # Bootstrap once and stream ready-to-use rows.
289
- adapter = VectorAdapter.from_project("config/project.yaml")
290
- for row in adapter.iter_rows(limit=32, flatten_sequences=True):
291
- send_to_feature_store(row)
292
-
293
- # Helper functions cover ad-hoc jobs as well.
294
- rows = iter_vector_rows(
295
- "config/project.yaml",
296
- include_group=True,
297
- group_format="mapping",
298
- flatten_sequences=True,
299
- )
300
-
301
- # Optional extras materialize into common ML containers if installed.
302
- df = dataframe_from_vectors("config/project.yaml") # Requires pandas
303
- dataset = torch_dataset("config/project.yaml", dtype=torch.float32) # Requires torch
304
- ```
305
-
306
- Everything still flows through `build_vector_pipeline`; the integration layer
307
- normalizes group keys, optionally flattens sequence features and demonstrates
308
- how to turn the iterator into DataFrames or `torch.utils.data.Dataset`
309
- instances. ML teams can fork the same pattern for their own stacks—Spark, NumPy
310
- or feature store SDKs—without adding opinionated glue to the runtime itself.
311
-
312
- ### Inspect the balance (vector quality)
313
-
314
- Use the inspect helpers for different outputs:
315
-
316
- - `jerry inspect report --project config/datasets/default/project.yaml` — print a
317
- human-readable quality report (totals, keep/below lists, optional partition detail).
318
- - `jerry inspect coverage --project config/datasets/default/project.yaml` — persist the
319
- coverage summary to `build/coverage.json` (keep/below feature and partition lists plus
320
- coverage percentages).
321
- - `jerry inspect matrix --project config/datasets/default/project.yaml --format html` —
322
- export availability matrices (CSV or HTML) for deeper analysis.
323
- - `jerry inspect partitions --project config/datasets/default/project.yaml` — write the
324
- observed partition manifest to `build/partitions.json` for use in configs.
325
-
326
- Note: `jerry prep taste` has been removed; use `jerry inspect report` and friends.
327
-
328
- ---
329
-
330
- ## Extending the CLI
331
-
332
- ### Scaffold a plugin package
333
-
334
- ```bash
335
- jerry plugin init --name my_datapipeline --out .
336
- ```
337
-
338
- The generator copies a ready-made skeleton (pyproject, README, package directory) and
339
- swaps placeholders for your package name so you can start adding new spirits immediately
340
- (`src/datapipeline/cli/app.py`, `src/datapipeline/services/scaffold/plugin.py`). Install the
341
- resulting project in editable mode to expose your loaders, parsers, mappers and
342
- transforms.
343
-
344
- ### Create new sources, domains and contracts
345
-
346
- Use the CLI helpers to scaffold boilerplate code in your plugin workspace:
347
-
348
- ```bash
349
- jerry source add --provider dmi --dataset metobs --transport fs --format csv
350
- jerry domain add --domain metobs
351
- jerry contract
352
- ```
353
-
354
- The source command writes DTO/parser stubs, updates entry points and drops a matching
355
- YAML file in `config/sources/` pre-filled with composed-loader defaults for the chosen
356
- transport (`src/datapipeline/cli/app.py`, `src/datapipeline/services/scaffold/source.py`).
357
- `jerry domain add` now always scaffolds `TimeSeriesRecord` domains so every mapper carries
358
- an explicit timestamp alongside its value, and `jerry contract` wires that source/domain
359
- pair up for canonical stream generation.
360
-
361
- ### Add custom filters or transforms
362
-
363
- Register new functions/classes under the appropriate entry point group in your plugin’s
364
- `pyproject.toml`. The runtime resolves them through `load_ep`, applies record filters first,
365
- then record/feature/sequence transforms in the order declared in the dataset config
366
- (`pyproject.toml`, `src/datapipeline/utils/load.py`,
367
- `src/datapipeline/pipeline/utils/transform_utils.py`). Built-in helpers cover common
368
- comparisons (including timezone-aware checks) and time-based transforms (lags, sliding
369
- windows) if you need quick wins (`src/datapipeline/filters/filters.py`,
370
- `src/datapipeline/transforms/record.py`, `src/datapipeline/transforms/feature.py`,
371
- `src/datapipeline/transforms/sequence.py`).
372
-
373
- ### Prototype with synthetic time-series data
374
-
375
- Need sample pours while wiring up transforms? Reuse the bundled synthetic time loader +
376
- parser and season it with the `encode_time` mapper for engineered temporal features
377
- (`src/datapipeline/sources/synthetic/time/loader.py`,
378
- `src/datapipeline/sources/synthetic/time/parser.py`,
379
- `src/datapipeline/mappers/synthetic/time.py`). Pair it with the `time_window` sequence
380
- transform to build sliding-window feature flights without external datasets
381
- (`src/datapipeline/transforms/sequence.py`).
382
-
383
- ---
384
-
385
- ## Data model tasting notes
386
-
387
- | Type | Description |
388
- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
389
- | `TimeSeriesRecord` | Canonical record with `time` (tz-aware, normalized to UTC) and `value`; the pipeline treats streams as ordered series (`src/datapipeline/domain/record.py`).|
390
- | `FeatureRecord` | Links a record (or list of records from sequence transforms) to a `feature_id` and `group_key` (`src/datapipeline/domain/feature.py`). |
391
- | `Vector` | Final grouped payload: a mapping of feature IDs to scalars or ordered lists plus helper methods for shape/key access (`src/datapipeline/domain/vector.py`). |
392
-
393
- ---
394
-
395
- ## Developer shift checklist
396
-
397
- These commands mirror the tooling used in CI and are useful while iterating locally:
398
-
399
- ```bash
400
- pip install -e .[dev]
401
- pytest
402
- ```
@@ -1,112 +0,0 @@
1
- datapipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- datapipeline/plugins.py,sha256=RPdzS1TUDLVsuLal-EMiADHwkn1TVJhiG1ukEcCH5LE,837
3
- datapipeline/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- datapipeline/analysis/vector_analyzer.py,sha256=rZEAZC3xqsI83-VikKbCf3tHbhpCtEVYvmD9c6ui39U,27158
5
- datapipeline/cli/app.py,sha256=iSHcwKvBFPwjSxsAoG-dzpiaC118WJ_8CQ3VNJ44DIc,13921
6
- datapipeline/cli/visual_source.py,sha256=8Nl8KKwQh1apNkj-OzsteaA54K6AvX5daItHBAvqBsU,1174
7
- datapipeline/cli/visuals.py,sha256=9OXMJh8B1a5_6_9sS7mgY4UeSu6fUhSuxsauVm8HTkU,2462
8
- datapipeline/cli/commands/domain.py,sha256=w1xd19wtVWslqg_AFTh0m9uIR3Zr341Rt_8YZpRosf8,304
9
- datapipeline/cli/commands/filter.py,sha256=IeAsp9KHm98y65oGGcQuoZxzVDz243If6l0dkN1F39s,304
10
- datapipeline/cli/commands/inspect.py,sha256=kmXTqPOMg2mYic6lmZuoc9lxhs0-XCVExco7aVU0wnQ,5885
11
- datapipeline/cli/commands/link.py,sha256=KsL1-V0o15DrFSYLNttAqQPJqKkhtrzsif9EWZdva_0,5031
12
- datapipeline/cli/commands/list_.py,sha256=cCrx35CPNMyAaOVvVxkZje4DAx7K7HdGpngzZwhNn40,943
13
- datapipeline/cli/commands/plugin.py,sha256=Ab24t0DwTIqGGjPcVfI0a0yue37cLb4ff-ZcI0ZLj9g,327
14
- datapipeline/cli/commands/run.py,sha256=8CQR-DsNqzEm0oWr8efKh7_i8Bj2Rh0BVwGFW_lfd3Y,5211
15
- datapipeline/cli/commands/source.py,sha256=bCIY15rYGx7aYensnQ4WbInRopLee458KF0pHCk-sqg,810
16
- datapipeline/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- datapipeline/common/geo.py,sha256=oFWPhU9V3jJUloPVktLTxHJttdZxJh97RFUOm0B0Kfw,292
18
- datapipeline/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- datapipeline/config/catalog.py,sha256=ezGSy2yFybnYZ2EHU9IjDdeFVHgHhQbePAHGXFPY9b8,1012
20
- datapipeline/config/project.py,sha256=23t5UVNQXzJa5hy6ungCi3ipWllKyFQXId55uLhqRVA,502
21
- datapipeline/config/dataset/dataset.py,sha256=an7S1CAEZ5bYNkgO5uHTnKJfSrfWSE4CjCMM4FN-L-s,629
22
- datapipeline/config/dataset/feature.py,sha256=2Hxz0FXZskLI4ICXhmlG6b1Vvxzh0Ql9e6BwjMRtzSs,346
23
- datapipeline/config/dataset/loader.py,sha256=mCOXorU6g2UbPWQjkln7N24b8NPJju4Fg6C8u1pDri4,4187
24
- datapipeline/config/dataset/normalize.py,sha256=66yvPvbQyef6qQtyJOGTkAo5Q5ldDqpk-XIHx9hYH4c,825
25
- datapipeline/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
- datapipeline/domain/feature.py,sha256=7BOI4H458BKU8B9vqdfez7WOO1YKiF6lt0oy7PMbqrQ,295
27
- datapipeline/domain/record.py,sha256=Q-QjuR3FbQ01_vJ5LT9k8E40C-oYmOvWdbqtwtfIHPM,709
28
- datapipeline/domain/vector.py,sha256=1DKa1eqigz966itN-6noc8E2d67D-73u55YCE8WhYsU,1278
29
- datapipeline/filters/filters.py,sha256=oU4iu8JeJsbsKtLWJNeBBXOqwBk3uKaPzvVqkh8yw-Y,2650
30
- datapipeline/integrations/__init__.py,sha256=tjTLsIa6NRWKI05wjwPAUuXozDA-gP98SccFJ9lYHs8,410
31
- datapipeline/integrations/ml.py,sha256=fqzdF3JcLV_tazST4sicPsXjG6ZyDcKiNMKyRVgllpw,10483
32
- datapipeline/mappers/noop.py,sha256=L8bH1QVbLH-ogIam0ppYdx7KuWQ7Dj44lvD8tvNlY0Q,111
33
- datapipeline/mappers/synthetic/time.py,sha256=lt1pC0May6Y4E8bZO4sERm3D04_r-qv63Y5fwrtCaBQ,639
34
- datapipeline/parsers/identity.py,sha256=pdGuz0SSQGfySPpvZSnLgfTXTkC36x-7dQMMei3XhsU,321
35
- datapipeline/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
- datapipeline/pipeline/pipelines.py,sha256=j1TioHie927r4q8dILCjnUJsanpnfrbw2AlP-lnIIYc,2865
37
- datapipeline/pipeline/stages.py,sha256=ESgBNck0POKgNtFWfwvuJtQ01qm6pzyd6QP0lZcNGvg,4402
38
- datapipeline/pipeline/utils/keygen.py,sha256=0gqgfxBAWNwfBHN37G5dIfurdfS25jLsPWu5SKjh2gg,1402
39
- datapipeline/pipeline/utils/memory_sort.py,sha256=iPWcD81xtZZ8SXEX_Ph-hUCnpKlJMFybbxPZVOQdSs8,735
40
- datapipeline/pipeline/utils/ordering.py,sha256=ZX18I7GFtvyMFJB96vWQpTOGwljjeJ6ppCg5a3Av3es,1450
41
- datapipeline/pipeline/utils/transform_utils.py,sha256=05Udkl8K4Mot_nZ_Ih3q0oNmpnTthjgXyXE3QC0Xi4s,1826
42
- datapipeline/registries/registries.py,sha256=2d6sut7AgmmxZIWr_3FbzDs5UxPdrOQHKuZNNwJVpak,774
43
- datapipeline/registries/registry.py,sha256=MWWOHz2wT1oHQmovodtEreEuQhvH-i11Y2yXUUgZJhQ,641
44
- datapipeline/services/bootstrap.py,sha256=UNYoMJOJ2wCMlo0ZWVeIdpqdY6Uz7T8TUzpIIoIyHI8,6426
45
- datapipeline/services/constants.py,sha256=ZeTjk1mmxKVsKmRm4BJvnCnQ3Rwqh8ICUP_-VHFzNWE,336
46
- datapipeline/services/entrypoints.py,sha256=ZmIh2Oq0M2Jy32Iqyfif69MjcYm8SatGts-zh4n33YE,2505
47
- datapipeline/services/factories.py,sha256=Nmy5gXJOVquhkdXgJlQauUc4bvqCsFWh4T-Tj0qQznU,854
48
- datapipeline/services/paths.py,sha256=6rjGaqHa37H8ylN9lD4nvPYKk8lxkWak1JcQ1qwhFxk,962
49
- datapipeline/services/project_paths.py,sha256=IHcGr8RqRAEWmQnR6IHuMCRqokHu6XLL69lwYafFLEE,2507
50
- datapipeline/services/scaffold/__init__.py,sha256=PaQNtYki9Kc7mQPnUtKDPU-rKohLHoXLvFVwdHdbXNM,68
51
- datapipeline/services/scaffold/domain.py,sha256=2xmBv1dpEB-ZnOguQB0EptIN4gNDFs-3QttKn5wtLX4,959
52
- datapipeline/services/scaffold/filter.py,sha256=FmlnmZKLu_BbjBxqSekI5OWD2iTkLhGx0XWPpjj2F1o,1059
53
- datapipeline/services/scaffold/mappers.py,sha256=559IBU_-pIoqwzwVy8dFpsTVd3uBXPEJtrvC6ETYfqk,1991
54
- datapipeline/services/scaffold/plugin.py,sha256=1hTM8hBHm05_7uYABeAvyVAD_wlf8DpgK7QqUKfm7MI,874
55
- datapipeline/services/scaffold/source.py,sha256=HWYBcIpKeau6Slhu1cJWfcg7DPWEEABtVXwvKG7vv00,5610
56
- datapipeline/services/scaffold/templates.py,sha256=B3YnZpFUZLynijJosTNxZQLXnPP_Y_t1RHqfI1lGOxU,634
57
- datapipeline/sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
- datapipeline/sources/composed_loader.py,sha256=dUmdAYncL6IJqwLhKpQ3AVyFNyut-_K55isSTr-iDXo,1350
59
- datapipeline/sources/decoders.py,sha256=OEozYeV9bAA0yiWebPRvq_qKt0KXArk5jBJk_2sr7iI,1970
60
- datapipeline/sources/factory.py,sha256=J177Y1TJsAJZimd54bKkJl1c7-rq8e2cs04ie3tHY8E,2097
61
- datapipeline/sources/transports.py,sha256=yBfRLlZxxP4INQ5uFSR8zJecjUx_XlwU6KMQqJUYx44,2080
62
- datapipeline/sources/models/__init__.py,sha256=rS3Rc8L2eXSd2CW9ejRConk9pmV8Zv7QCz3ekSIdcLQ,397
63
- datapipeline/sources/models/base.py,sha256=MAUawd11fII-mxxuSPM4f6H1t1tbyZX_QWhoAgeYUcU,238
64
- datapipeline/sources/models/generator.py,sha256=JK5o2k3aoNR8hVq2RP7WOyAmoBz6leV95cMgrxuvtzw,545
65
- datapipeline/sources/models/loader.py,sha256=NbmRSNM1eU-6A30qsoNllFXA9HCDF4Shg14y3b_Fc0I,1092
66
- datapipeline/sources/models/parser.py,sha256=Ts31aksHLDCw5ovF2D99w9g_j-NnEiZ8x0JHtUxmmXs,226
67
- datapipeline/sources/models/source.py,sha256=PBtbJVdyuRABPGFSwkyDaSmT1BuHk2faL-lUvRFpOAo,796
68
- datapipeline/sources/models/synthetic.py,sha256=uGi46h8b-bV0S0bArcs5RhiTvqqguWZjkq6X1Hir7QQ,290
69
- datapipeline/sources/synthetic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
- datapipeline/sources/synthetic/time/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
- datapipeline/sources/synthetic/time/loader.py,sha256=3bIlRSCmEJlES-MUX3ICyU1kLJoY6BCcyDEyVH0xSWw,1229
72
- datapipeline/sources/synthetic/time/parser.py,sha256=d3GZMQ7L1Qi4LeEm7U3y0_pk0RdhskioQukYyqyoqic,343
73
- datapipeline/templates/plugin_skeleton/README.md,sha256=5OtDUSM2pexqrwVAlvl54kE8ARv-V-9UJy97UPdnpVI,2805
74
- datapipeline/templates/plugin_skeleton/pyproject.toml,sha256=knrTg5zYPiTzj7jpCaO4LMWzdHLgxtZaDHF3czckwOc,265
75
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml,sha256=PLbmLw2fKbpMsSRgiM2BKEzD5WYcD-60BpCVnLU_j70,581
76
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml,sha256=OgYm92pe3DY_QrcoM2XTqbiu8jvBJI1BbmHu1brVHYk,1304
77
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml,sha256=3uuRS72Wy9iRUEtNkm6frq-AS-YA6oB6BprJc6CNzvs,764
78
- datapipeline/templates/plugin_skeleton/config/datasets/default/project.yaml,sha256=03XiBqNqVHg2aysjYt-C1adeI_aWz1C4xeU--HkE4Ic,171
79
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml,sha256=7tie6CqEmOOK8M629f2WZDWKaUPr9bePMC6Oj8RsqB8,190
80
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
81
- datapipeline/templates/stubs/dto.py.j2,sha256=nenydq0t1PmXi4ChD7o8yVp6fRbpKu-NNUgddM6WiLQ,734
82
- datapipeline/templates/stubs/filter.py.j2,sha256=3LgRgAL_HRaENOOqQx8NdeM1AUy-T0rtHVTA7N2oWOs,466
83
- datapipeline/templates/stubs/loader_synthetic.py.j2,sha256=9SQBeTBGlZmKs6nSYBKd8nbOPcFHgDx17Mh8xOEQnvs,1285
84
- datapipeline/templates/stubs/mapper.py.j2,sha256=sF3ME7IVREKR6jDsRfBNbDt5ppoIsfHTvCYGQauTKpU,713
85
- datapipeline/templates/stubs/parser.py.j2,sha256=CWWsPKWH56kOQ5X-N4NsSKrN__HYVs1pHHRy2FbVYuI,632
86
- datapipeline/templates/stubs/parser_custom.py.j2,sha256=0Nytq43JdTZoyRj-4Mz6HWdMTmOP3VlFuYOB_A_13Vg,580
87
- datapipeline/templates/stubs/record.py.j2,sha256=bzI8Jt0hcInLpn-IlyL2a8-q4VQY4gyZ2Z5BAMB70k4,660
88
- datapipeline/templates/stubs/source.yaml.j2,sha256=mPOfYD3Hyvaw-lSgJslfsP2VqOp7qsg_ePOLvyTeSRw,416
89
- datapipeline/transforms/filter.py,sha256=Jb4SIqic5xrCoyY9zQoABwhtckp55q4xkFN-g2On-qA,1294
90
- datapipeline/transforms/sequence.py,sha256=5i-0w1jQcSHy12rhztBhzyhJ2FdnVbD35NWcPXPi_kQ,3059
91
- datapipeline/transforms/utils.py,sha256=4ad3v0fhyl6sgHg9EHyf8C8U-46c8CLOZwBGK9vi-aY,194
92
- datapipeline/transforms/vector.py,sha256=sOksLOa60oGOX75RTwsnhdqJZXmaA1tjx9d8Pm296us,8043
93
- datapipeline/transforms/vector_utils.py,sha256=4rKzcAADE9OKXSrqr4X_eEya30cULymUPoSB_JVGbBk,2420
94
- datapipeline/transforms/debug/identity.py,sha256=6bwnEYhMBYw0YPrMccrZPXDOQM4r_-odsKo8Hhpbz10,2515
95
- datapipeline/transforms/debug/lint.py,sha256=6EBzGOfYjJbHzcZIIzVixlvW5RVr7liw6DieuWwxNUM,4057
96
- datapipeline/transforms/feature/model.py,sha256=gB-GP80_P7bzEKJFSM4leRke75yiD4-S5eJ1p8g3JU8,382
97
- datapipeline/transforms/feature/scaler.py,sha256=tExlpsVK8TNMC_qpPx5QdyX6AAMbUPmdvBNZAMZMS8E,3315
98
- datapipeline/transforms/record/floor_time.py,sha256=dKxLjnmBNJmDClPQfuKBEM_lrW-356v8XfQtLog5K2k,627
99
- datapipeline/transforms/record/lag.py,sha256=5wrPyVNFvidvdQddnK6ZeUOI5I8rfXEbzIg6tzKiJu4,536
100
- datapipeline/transforms/stream/ensure_ticks.py,sha256=Q0AwKuRY2nRIOUKoaAINeAWUEuoOzSh30Ug5k8296Kw,1170
101
- datapipeline/transforms/stream/fill.py,sha256=-NJhPD3LP_G7E2oLMBvNOFtdhGhjgCSMR9hM3_QPMAo,4230
102
- datapipeline/transforms/stream/granularity.py,sha256=PzHDGDwyn8P07BCbcFZaorS_7lbAbEdMLqD9Wy61y0M,3376
103
- datapipeline/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
104
- datapipeline/utils/load.py,sha256=uQUqwyyAXXVVcBKjzX0RjoP_Nuc3zgBN1BplLZpJIYw,1282
105
- datapipeline/utils/paths.py,sha256=5Y5rhNbjTiybUHfq9VfRMJ4gUfN9UltonM-4MABEG8w,798
106
- datapipeline/utils/time.py,sha256=vOqa2arqwEqbDo-JWEhOFPMnI1E4Ib3i1L-Rt-cGH8c,1072
107
- jerry_thomas-0.2.0.dist-info/licenses/LICENSE,sha256=pkBMylAJF5yChHAkdxwFhEptLGx13i-XFEKh-Sh6DkM,1073
108
- jerry_thomas-0.2.0.dist-info/METADATA,sha256=LUh6IrWqfk1fjG-MMSO4d-GViqZ8qzc4sPhKkjZo3zw,19202
109
- jerry_thomas-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
110
- jerry_thomas-0.2.0.dist-info/entry_points.txt,sha256=7GKSNCgwbzIqI3_LdbOro2eiAbBvbpoOxAuh2XcqBN0,1669
111
- jerry_thomas-0.2.0.dist-info/top_level.txt,sha256=N8aoNPdPyHefODO4YAm7tqTaUcw0e8LDcqycFTf8TbM,13
112
- jerry_thomas-0.2.0.dist-info/RECORD,,