jerry-thomas 0.0.5__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jerry_thomas-0.0.5/src/jerry_thomas.egg-info → jerry_thomas-0.2.0}/PKG-INFO +153 -53
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/README.md +147 -49
- jerry_thomas-0.2.0/pyproject.toml +83 -0
- jerry_thomas-0.2.0/src/datapipeline/analysis/vector_analyzer.py +696 -0
- jerry_thomas-0.2.0/src/datapipeline/cli/app.py +425 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/domain.py +2 -2
- jerry_thomas-0.2.0/src/datapipeline/cli/commands/inspect.py +169 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/link.py +48 -14
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/plugin.py +2 -2
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/run.py +47 -48
- jerry_thomas-0.2.0/src/datapipeline/cli/visual_source.py +32 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/visuals.py +4 -31
- jerry_thomas-0.2.0/src/datapipeline/config/catalog.py +30 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/config/dataset/dataset.py +4 -7
- jerry_thomas-0.2.0/src/datapipeline/config/dataset/feature.py +13 -0
- jerry_thomas-0.2.0/src/datapipeline/config/dataset/loader.py +99 -0
- jerry_thomas-0.2.0/src/datapipeline/config/dataset/normalize.py +24 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/config/project.py +0 -2
- jerry_thomas-0.2.0/src/datapipeline/domain/feature.py +17 -0
- jerry_thomas-0.2.0/src/datapipeline/domain/record.py +28 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/domain/vector.py +4 -2
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/filters/filters.py +0 -1
- jerry_thomas-0.2.0/src/datapipeline/integrations/__init__.py +19 -0
- jerry_thomas-0.2.0/src/datapipeline/integrations/ml.py +319 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/mappers/synthetic/time.py +3 -3
- jerry_thomas-0.2.0/src/datapipeline/pipeline/pipelines.py +93 -0
- jerry_thomas-0.2.0/src/datapipeline/pipeline/stages.py +119 -0
- jerry_thomas-0.2.0/src/datapipeline/pipeline/utils/keygen.py +42 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/pipeline/utils/memory_sort.py +1 -1
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/pipeline/utils/ordering.py +0 -2
- jerry_thomas-0.2.0/src/datapipeline/pipeline/utils/transform_utils.py +55 -0
- jerry_thomas-0.2.0/src/datapipeline/plugins.py +21 -0
- jerry_thomas-0.2.0/src/datapipeline/registries/registries.py +15 -0
- jerry_thomas-0.2.0/src/datapipeline/registries/registry.py +28 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/bootstrap.py +50 -17
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/constants.py +2 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/factories.py +9 -5
- jerry_thomas-0.2.0/src/datapipeline/services/project_paths.py +75 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/domain.py +6 -3
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/mappers.py +2 -2
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/plugin.py +5 -5
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/source.py +15 -25
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/templates.py +1 -5
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/__init__.py +1 -3
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/loader.py +1 -12
- jerry_thomas-0.2.0/src/datapipeline/sources/synthetic/time/parser.py +9 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/plugin_skeleton/README.md +14 -11
- jerry_thomas-0.2.0/src/datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml +24 -0
- jerry_thomas-0.2.0/src/datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml +23 -0
- jerry_thomas-0.2.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +29 -0
- {jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config → jerry_thomas-0.2.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default}/project.yaml +3 -3
- {jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config/distilleries → jerry_thomas-0.2.0/src/datapipeline/templates/plugin_skeleton/config/sources}/time_ticks.yaml +2 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/plugin_skeleton/pyproject.toml +2 -2
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/dto.py.j2 +1 -2
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/mapper.py.j2 +3 -4
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/record.py.j2 +1 -1
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/source.yaml.j2 +4 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/debug/identity.py +74 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/debug/lint.py +101 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/feature/model.py +12 -0
- jerry_thomas-0.0.5/src/datapipeline/transforms/transforms.py → jerry_thomas-0.2.0/src/datapipeline/transforms/feature/scaler.py +9 -67
- jerry_thomas-0.2.0/src/datapipeline/transforms/filter.py +57 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/record/floor_time.py +17 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/record/lag.py +18 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/sequence.py +84 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/stream/ensure_ticks.py +33 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/stream/fill.py +103 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/stream/granularity.py +92 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/utils.py +10 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/vector.py +226 -0
- jerry_thomas-0.2.0/src/datapipeline/transforms/vector_utils.py +84 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/utils/load.py +3 -1
- jerry_thomas-0.2.0/src/datapipeline/utils/paths.py +26 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/utils/time.py +6 -4
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0/src/jerry_thomas.egg-info}/PKG-INFO +153 -53
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/jerry_thomas.egg-info/SOURCES.txt +27 -11
- jerry_thomas-0.2.0/src/jerry_thomas.egg-info/entry_points.txt +39 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/jerry_thomas.egg-info/requires.txt +5 -2
- jerry_thomas-0.2.0/tests/test_config_pipeline.py +25 -0
- jerry_thomas-0.2.0/tests/test_regression_vectors.py +162 -0
- jerry_thomas-0.2.0/tests/test_transforms.py +189 -0
- jerry_thomas-0.2.0/tests/test_vector_analyzer.py +19 -0
- jerry_thomas-0.0.5/pyproject.toml +0 -92
- jerry_thomas-0.0.5/src/datapipeline/analysis/vector_analyzer.py +0 -49
- jerry_thomas-0.0.5/src/datapipeline/cli/app.py +0 -208
- jerry_thomas-0.0.5/src/datapipeline/cli/commands/analyze.py +0 -32
- jerry_thomas-0.0.5/src/datapipeline/cli/openers.py +0 -11
- jerry_thomas-0.0.5/src/datapipeline/config/catalog.py +0 -22
- jerry_thomas-0.0.5/src/datapipeline/config/dataset/feature.py +0 -24
- jerry_thomas-0.0.5/src/datapipeline/config/dataset/group_by.py +0 -31
- jerry_thomas-0.0.5/src/datapipeline/config/dataset/loader.py +0 -19
- jerry_thomas-0.0.5/src/datapipeline/config/dataset/normalize.py +0 -10
- jerry_thomas-0.0.5/src/datapipeline/domain/feature.py +0 -10
- jerry_thomas-0.0.5/src/datapipeline/domain/record.py +0 -20
- jerry_thomas-0.0.5/src/datapipeline/pipeline/pipelines.py +0 -46
- jerry_thomas-0.0.5/src/datapipeline/pipeline/stages.py +0 -64
- jerry_thomas-0.0.5/src/datapipeline/pipeline/utils/keygen.py +0 -20
- jerry_thomas-0.0.5/src/datapipeline/pipeline/utils/transform_utils.py +0 -120
- jerry_thomas-0.0.5/src/datapipeline/plugins.py +0 -7
- jerry_thomas-0.0.5/src/datapipeline/services/project_paths.py +0 -35
- jerry_thomas-0.0.5/src/datapipeline/sources/synthetic/time/parser.py +0 -9
- jerry_thomas-0.0.5/src/datapipeline/streams/canonical.py +0 -28
- jerry_thomas-0.0.5/src/datapipeline/streams/raw.py +0 -16
- jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml +0 -4
- jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml +0 -4
- jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config/contracts/time_ticks.yaml +0 -2
- jerry_thomas-0.0.5/src/datapipeline/templates/plugin_skeleton/config/recipe.yaml +0 -17
- jerry_thomas-0.0.5/src/datapipeline/transforms/sequence.py +0 -31
- jerry_thomas-0.0.5/src/jerry_thomas.egg-info/entry_points.txt +0 -44
- jerry_thomas-0.0.5/tests/test_transforms.py +0 -76
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/LICENSE +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/setup.cfg +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/analysis/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/filter.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/list_.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/cli/commands/source.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/common/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/common/geo.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/config/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/domain/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/mappers/noop.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/parsers/identity.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/pipeline/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/entrypoints.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/paths.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/services/scaffold/filter.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/composed_loader.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/decoders.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/factory.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/base.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/generator.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/parser.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/source.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/models/synthetic.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/synthetic/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/synthetic/time/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/synthetic/time/loader.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/sources/transports.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/filter.py.j2 +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/loader_synthetic.py.j2 +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/parser.py.j2 +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/templates/stubs/parser_custom.py.j2 +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/datapipeline/utils/__init__.py +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/jerry_thomas.egg-info/dependency_links.txt +0 -0
- {jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/src/jerry_thomas.egg-info/top_level.txt +0 -0
{jerry_thomas-0.0.5/src/jerry_thomas.egg-info → jerry_thomas-0.2.0}/PKG-INFO

@@ -1,22 +1,31 @@
 Metadata-Version: 2.4
 Name: jerry-thomas
-Version: 0.0.5
+Version: 0.2.0
 Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
 Author: Anders Skott Lind
 License: MIT
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numpy<3.0,>=1.24
-Requires-Dist: pydantic>=
+Requires-Dist: pydantic>=2.0
 Requires-Dist: PyYAML>=5.4
 Requires-Dist: tqdm>=4.0
 Requires-Dist: jinja2>=3.0
-
+Provides-Extra: ml
+Requires-Dist: pandas>=2.0; extra == "ml"
+Requires-Dist: torch>=2.0; extra == "ml"
 Dynamic: license-file
 
 # Jerry Thomas
 
+Time‑Series First
+- This runtime is time‑series‑first. Every domain record must include a timezone‑aware `time` and a `value`.
+- Grouping is defined by time buckets only (`group_by.keys: [ { type: time, ... } ]`).
+- Feature streams are sorted by time; sequence transforms assume ordered series.
+- Categorical dimensions (e.g., station, zone, ticker) belong in `partition_by` so they become partitions of the same time series.
+- Non‑temporal grouping is not supported.
+
 Jerry Thomas turns the datapipeline runtime into a cocktail program. You still install the
 same Python package (`datapipeline`) and tap into the plugin architecture, but every CLI
 dance step nods to a craft bar. Declarative YAML menus describe projects, sources and
@@ -59,11 +68,29 @@ raw source → canonical stream → record stage → feature stage → vector st
 | `src/datapipeline/services` | Bootstrapping (project loading, YAML interpolation), runtime factories and scaffolding helpers for new bar tools (`services/bootstrap.py`, `services/factories.py`, `services/scaffold/plugin.py`). |
 | `src/datapipeline/pipeline` | Pure functions that build record/feature/vector iterators plus supporting utilities for ordering and transform wiring (`pipeline/pipelines.py`, `pipeline/utils/transform_utils.py`). |
 | `src/datapipeline/domain` | Data structures representing records, feature records and vectors coming off the line (`domain/record.py`, `domain/feature.py`, `domain/vector.py`). |
-| `src/datapipeline/transforms` & `src/datapipeline/filters` | Built-in transforms (lagging timestamps, sliding windows) and filter helpers exposed through entry points (`transforms/transforms.py`, `transforms/sequence.py`, `filters/filters.py`).
+| `src/datapipeline/transforms` & `src/datapipeline/filters` | Built-in transforms (lagging timestamps, scaling, sliding windows) and filter helpers exposed through entry points (`transforms/record.py`, `transforms/feature.py`, `transforms/sequence.py`, `filters/filters.py`). |
 | `src/datapipeline/sources/synthetic/time` | Example synthetic time-series loader/parser pair plus helper mappers for experimentation while the real spirits arrive (`sources/synthetic/time/loader.py`, `sources/synthetic/time/parser.py`, `mappers/synthetic/time.py`). |
 
 ---
 
+## Built-in DSL identifiers
+
+The YAML DSL resolves filters and transforms by entry-point name. These ship with the
+template out of the box:
+
+| Kind | Identifiers | Notes |
+| ----------------- | ----------------------------------------------------------------------------------------------- | ----- |
+| Filters | `eq`/`equals`, `ne`/`not_equal`, `lt`, `le`, `gt`, `ge`, `in`/`contains`, `nin`/`not_in` | Use as `- gt: { field: value }` or `- in: { field: [values...] }`. Synonyms map to the same implementation. |
+| Record transforms | `time_lag`, `drop_missing` | `time_lag` expects a duration string (e.g. `1h`), `drop_missing` removes `None`/`NaN` records. |
+| Feature transforms| `standard_scale` | Options: `with_mean`, `with_std`, optional `statistics`. |
+| Sequence transforms | `time_window`, `time_fill_mean`, `time_fill_median` | `time_window` builds sliding windows; the fill transforms impute missing values from running mean/median with optional `window`/`min_samples`. |
+| Vector transforms | `fill_history`, `fill_horizontal`, `fill_constant`, `drop_missing` | History fill uses prior buckets, horizontal fill aggregates sibling partitions, constant sets a default, and drop removes vectors below coverage thresholds. |
+
+Extend `pyproject.toml` with additional entry points to register custom logic under your
+own identifiers.
+
+---
+
 ## Opening the bar
 
 ### 1. Install the tools
@@ -86,17 +113,17 @@ python -c "import datapipeline; print('bar ready')"
 
 ### 2. Draft your bar book
 
-Create a `config/project.yaml` so the runtime knows where to find
-and the tasting menu. Globals are optional but handy for sharing
-interpolated into downstream YAML specs during bootstrap
+Create a `config/recipes/<name>/project.yaml` so the runtime knows where to find
+ingredients, infusions and the tasting menu. Globals are optional but handy for sharing
+values—they are interpolated into downstream YAML specs during bootstrap
 (`src/datapipeline/config/project.py`, `src/datapipeline/services/bootstrap.py`).
 
 ```yaml
 version: 1
 paths:
-  sources:
-  streams:
-  dataset:
+  sources: ../../sources
+  streams: ../../contracts
+  dataset: dataset.yaml
 globals:
   opening_time: "2024-01-01T16:00:00Z"
   last_call: "2024-01-02T02:00:00Z"
@@ -107,13 +134,13 @@ globals:
 
 ### 3. Stock the bottles (raw sources)
 
-Create `config/
+Create `config/sources/<alias>.yaml` files. Each must expose a `parser` and `loader`
 pointing at entry points plus any constructor arguments
 (`src/datapipeline/services/bootstrap.py`). Here is a synthetic clock source that feels
 like a drip of barrel-aged bitters:
 
 ```yaml
-# config/
+# config/sources/time_ticks.yaml
 parser:
   entrypoint: "synthetic.time"
   args: {}
@@ -145,7 +172,7 @@ mapper:
   mode: spritz
 ```
 
-The mapper uses the provided mode to create a new `
+The mapper uses the provided mode to create a new `TimeSeriesRecord` stream ready for the
 feature stage (`mappers/synthetic/time.py`).
 
 ### 5. Script the tasting menu (dataset)
@@ -155,28 +182,53 @@ are grouped (`src/datapipeline/config/dataset/dataset.py`). A minimal hourly men
 look like:
 
 ```yaml
-# config/
+# config/recipes/default/dataset.yaml
 group_by:
   keys:
     - type: time
       field: time
       resolution: 1h
 features:
-  -
-
-    partition_by: null
-    filters: []
+  - id: hour_spritz
+    stream: time.encode
     transforms:
-      -
+      - record:
+          transform: time_lag
+          args: 0h
+      - feature:
+          transform: standard_scale
+          with_mean: true
+          with_std: true
+      - sequence:
+          transform: time_window
+          size: 4
+          stride: 1
+      - sequence:
+          transform: time_fill_mean
+          window: 24
+          min_samples: 6
 ```
 
 Use the sample `dataset` template as a starting point if you prefer scaffolding before
-pouring concrete values. Group keys
-requested resolution)
-
-`
-
-
+pouring concrete values. Group keys now require explicit time bucketing (with automatic
+flooring to the requested resolution) so every pipeline is clock-driven. You can attach
+feature or sequence transforms—such as the sliding `TimeWindowTransformer` or the
+`time_fill_mean`/`time_fill_median` imputers—directly in the YAML by referencing their
+entry point names (`src/datapipeline/transforms/sequence.py`).
+
+When vectors are assembled you can optionally apply `vector_transforms` to enforce schema
+guarantees. The built-ins cover:
+
+- `fill_history` – use running means/medians from prior buckets (per partition) with
+  configurable window/minimum samples.
+- `fill_horizontal` – aggregate sibling partitions at the same timestamp (e.g. other
+  stations) using mean/median.
+- `fill_constant` – provide a constant default for missing features/partitions.
+- `drop_missing` – drop vectors that fall below a coverage threshold or omit required
+  features.
+
+Transforms accept either an explicit `expected` list or a manifest path to discover the
+full partition set (`build/partitions.json` produced by `jerry inspect partitions`).
 
 Once the book is ready, run the bootstrapper (the CLI does this automatically) to
 materialize all registered sources and streams
@@ -189,9 +241,9 @@ materialize all registered sources and streams
 ### Prep any station (with visuals)
 
 ```bash
-jerry prep pour --project config/project.yaml --limit 20
-jerry prep build --project config/project.yaml --limit 20
-jerry prep stir --project config/project.yaml --limit 20
+jerry prep pour --project config/datasets/default/project.yaml --limit 20
+jerry prep build --project config/datasets/default/project.yaml --limit 20
+jerry prep stir --project config/datasets/default/project.yaml --limit 20
 ```
 
 - `prep pour` shows the record-stage ingredients headed for each feature.
@@ -208,34 +260,79 @@ loaders. The CLI wires up `build_record_pipeline`, `build_feature_pipeline` and
 ### Serve the flights (production mode)
 
 ```bash
-jerry serve --project config/project.yaml --output print
-jerry serve --project config/project.yaml --output stream
-jerry serve --project config/project.yaml --output exports/batch.pt
+jerry serve --project config/datasets/default/project.yaml --output print
+jerry serve --project config/datasets/default/project.yaml --output stream
+jerry serve --project config/datasets/default/project.yaml --output exports/batch.pt
 ```
 
 Production mode skips the bar flair and focuses on throughput. `print` writes tasting
 notes to stdout, `stream` emits newline-delimited JSON (with values coerced to strings when
 necessary), and a `.pt` destination stores a pickle-compatible payload for later pours.
 
-
-
-
-
+## Funnel vectors into ML projects
+
+Data scientists rarely want to shell out to the CLI; they need a programmatic
+hand-off that plugs vectors straight into notebooks, feature stores or training
+loops. The `datapipeline.integrations` package wraps the existing iterator
+builders with ML-friendly adapters without pulling pandas or torch into the
+core runtime.
+
+```python
+from datapipeline.integrations import (
+    VectorAdapter,
+    dataframe_from_vectors,
+    iter_vector_rows,
+    torch_dataset,
+)
+
+# Bootstrap once and stream ready-to-use rows.
+adapter = VectorAdapter.from_project("config/project.yaml")
+for row in adapter.iter_rows(limit=32, flatten_sequences=True):
+    send_to_feature_store(row)
+
+# Helper functions cover ad-hoc jobs as well.
+rows = iter_vector_rows(
+    "config/project.yaml",
+    include_group=True,
+    group_format="mapping",
+    flatten_sequences=True,
+)
+
+# Optional extras materialize into common ML containers if installed.
+df = dataframe_from_vectors("config/project.yaml")  # Requires pandas
+dataset = torch_dataset("config/project.yaml", dtype=torch.float32)  # Requires torch
 ```
 
-
-
-
-
+Everything still flows through `build_vector_pipeline`; the integration layer
+normalizes group keys, optionally flattens sequence features and demonstrates
+how to turn the iterator into DataFrames or `torch.utils.data.Dataset`
+instances. ML teams can fork the same pattern for their own stacks—Spark, NumPy
+or feature store SDKs—without adding opinionated glue to the runtime itself.
+
+### Inspect the balance (vector quality)
+
+Use the inspect helpers for different outputs:
+
+- `jerry inspect report --project config/datasets/default/project.yaml` — print a
+  human-readable quality report (totals, keep/below lists, optional partition detail).
+- `jerry inspect coverage --project config/datasets/default/project.yaml` — persist the
+  coverage summary to `build/coverage.json` (keep/below feature and partition lists plus
  coverage percentages).
+- `jerry inspect matrix --project config/datasets/default/project.yaml --format html` —
+  export availability matrices (CSV or HTML) for deeper analysis.
+- `jerry inspect partitions --project config/datasets/default/project.yaml` — write the
+  observed partition manifest to `build/partitions.json` for use in configs.
+
+Note: `jerry prep taste` has been removed; use `jerry inspect report` and friends.
 
 ---
 
-## Extending the
+## Extending the CLI
 
 ### Scaffold a plugin package
 
 ```bash
-jerry
+jerry plugin init --name my_datapipeline --out .
 ```
 
 The generator copies a ready-made skeleton (pyproject, README, package directory) and
@@ -249,25 +346,29 @@ transforms.
 Use the CLI helpers to scaffold boilerplate code in your plugin workspace:
 
 ```bash
-jerry
-jerry
-jerry contract
+jerry source add --provider dmi --dataset metobs --transport fs --format csv
+jerry domain add --domain metobs
+jerry contract
 ```
 
-The
-YAML file in `config/
+The source command writes DTO/parser stubs, updates entry points and drops a matching
+YAML file in `config/sources/` pre-filled with composed-loader defaults for the chosen
 transport (`src/datapipeline/cli/app.py`, `src/datapipeline/services/scaffold/source.py`).
+`jerry domain add` now always scaffolds `TimeSeriesRecord` domains so every mapper carries
+an explicit timestamp alongside its value, and `jerry contract` wires that source/domain
+pair up for canonical stream generation.
 
 ### Add custom filters or transforms
 
 Register new functions/classes under the appropriate entry point group in your plugin’s
-`pyproject.toml`. The runtime resolves them through `load_ep`, applies record
-
-
+`pyproject.toml`. The runtime resolves them through `load_ep`, applies record filters first,
+then record/feature/sequence transforms in the order declared in the dataset config
+(`pyproject.toml`, `src/datapipeline/utils/load.py`,
 `src/datapipeline/pipeline/utils/transform_utils.py`). Built-in helpers cover common
 comparisons (including timezone-aware checks) and time-based transforms (lags, sliding
 windows) if you need quick wins (`src/datapipeline/filters/filters.py`,
-`src/datapipeline/transforms/
+`src/datapipeline/transforms/record.py`, `src/datapipeline/transforms/feature.py`,
+`src/datapipeline/transforms/sequence.py`).
 
 ### Prototype with synthetic time-series data
 
@@ -285,8 +386,7 @@ transform to build sliding-window feature flights without external datasets
 
 | Type | Description |
 | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `
-| `TimeFeatureRecord` | A record with a timezone-aware `time` attribute, normalized to UTC to avoid boundary issues (`src/datapipeline/domain/record.py`). |
+| `TimeSeriesRecord` | Canonical record with `time` (tz-aware, normalized to UTC) and `value`; the pipeline treats streams as ordered series (`src/datapipeline/domain/record.py`).|
 | `FeatureRecord` | Links a record (or list of records from sequence transforms) to a `feature_id` and `group_key` (`src/datapipeline/domain/feature.py`). |
 | `Vector` | Final grouped payload: a mapping of feature IDs to scalars or ordered lists plus helper methods for shape/key access (`src/datapipeline/domain/vector.py`). |
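Two details are easy to miss in the ML hand-off snippet inside the diff above: the last line uses `torch.float32` without importing `torch`, and `send_to_feature_store` is a stand-in for user code. A self-contained sketch of the same flow, assuming the `datapipeline.integrations` helpers behave as the README describes and using the project path from the CLI examples; the pandas/torch lines need the new `ml` extra (e.g. `pip install "jerry-thomas[ml]"`):

```python
# Sketch only: helper names come from the README diff above; the project path and
# the print() stand-in for a feature-store write are assumptions, not package code.
import torch

from datapipeline.integrations import (
    dataframe_from_vectors,
    iter_vector_rows,
    torch_dataset,
)

PROJECT = "config/datasets/default/project.yaml"

# Stream flattened rows with the time-bucket group key attached as a mapping.
for row in iter_vector_rows(
    PROJECT,
    include_group=True,
    group_format="mapping",
    flatten_sequences=True,
):
    print(row)  # stand-in for send_to_feature_store(row)

df = dataframe_from_vectors(PROJECT)                   # needs pandas (ml extra)
dataset = torch_dataset(PROJECT, dtype=torch.float32)  # needs torch (ml extra)
```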
{jerry_thomas-0.0.5 → jerry_thomas-0.2.0}/README.md

@@ -1,5 +1,12 @@
 # Jerry Thomas
 
+Time‑Series First
+- This runtime is time‑series‑first. Every domain record must include a timezone‑aware `time` and a `value`.
+- Grouping is defined by time buckets only (`group_by.keys: [ { type: time, ... } ]`).
+- Feature streams are sorted by time; sequence transforms assume ordered series.
+- Categorical dimensions (e.g., station, zone, ticker) belong in `partition_by` so they become partitions of the same time series.
+- Non‑temporal grouping is not supported.
+
 Jerry Thomas turns the datapipeline runtime into a cocktail program. You still install the
 same Python package (`datapipeline`) and tap into the plugin architecture, but every CLI
 dance step nods to a craft bar. Declarative YAML menus describe projects, sources and
@@ -42,11 +49,29 @@ raw source → canonical stream → record stage → feature stage → vector st
 | `src/datapipeline/services` | Bootstrapping (project loading, YAML interpolation), runtime factories and scaffolding helpers for new bar tools (`services/bootstrap.py`, `services/factories.py`, `services/scaffold/plugin.py`). |
 | `src/datapipeline/pipeline` | Pure functions that build record/feature/vector iterators plus supporting utilities for ordering and transform wiring (`pipeline/pipelines.py`, `pipeline/utils/transform_utils.py`). |
 | `src/datapipeline/domain` | Data structures representing records, feature records and vectors coming off the line (`domain/record.py`, `domain/feature.py`, `domain/vector.py`). |
-| `src/datapipeline/transforms` & `src/datapipeline/filters` | Built-in transforms (lagging timestamps, sliding windows) and filter helpers exposed through entry points (`transforms/transforms.py`, `transforms/sequence.py`, `filters/filters.py`).
+| `src/datapipeline/transforms` & `src/datapipeline/filters` | Built-in transforms (lagging timestamps, scaling, sliding windows) and filter helpers exposed through entry points (`transforms/record.py`, `transforms/feature.py`, `transforms/sequence.py`, `filters/filters.py`). |
 | `src/datapipeline/sources/synthetic/time` | Example synthetic time-series loader/parser pair plus helper mappers for experimentation while the real spirits arrive (`sources/synthetic/time/loader.py`, `sources/synthetic/time/parser.py`, `mappers/synthetic/time.py`). |
 
 ---
 
+## Built-in DSL identifiers
+
+The YAML DSL resolves filters and transforms by entry-point name. These ship with the
+template out of the box:
+
+| Kind | Identifiers | Notes |
+| ----------------- | ----------------------------------------------------------------------------------------------- | ----- |
+| Filters | `eq`/`equals`, `ne`/`not_equal`, `lt`, `le`, `gt`, `ge`, `in`/`contains`, `nin`/`not_in` | Use as `- gt: { field: value }` or `- in: { field: [values...] }`. Synonyms map to the same implementation. |
+| Record transforms | `time_lag`, `drop_missing` | `time_lag` expects a duration string (e.g. `1h`), `drop_missing` removes `None`/`NaN` records. |
+| Feature transforms| `standard_scale` | Options: `with_mean`, `with_std`, optional `statistics`. |
+| Sequence transforms | `time_window`, `time_fill_mean`, `time_fill_median` | `time_window` builds sliding windows; the fill transforms impute missing values from running mean/median with optional `window`/`min_samples`. |
+| Vector transforms | `fill_history`, `fill_horizontal`, `fill_constant`, `drop_missing` | History fill uses prior buckets, horizontal fill aggregates sibling partitions, constant sets a default, and drop removes vectors below coverage thresholds. |
+
+Extend `pyproject.toml` with additional entry points to register custom logic under your
+own identifiers.
+
+---
+
 ## Opening the bar
 
 ### 1. Install the tools
@@ -69,17 +94,17 @@ python -c "import datapipeline; print('bar ready')"
 
 ### 2. Draft your bar book
 
-Create a `config/project.yaml` so the runtime knows where to find
-and the tasting menu. Globals are optional but handy for sharing
-interpolated into downstream YAML specs during bootstrap
+Create a `config/recipes/<name>/project.yaml` so the runtime knows where to find
+ingredients, infusions and the tasting menu. Globals are optional but handy for sharing
+values—they are interpolated into downstream YAML specs during bootstrap
 (`src/datapipeline/config/project.py`, `src/datapipeline/services/bootstrap.py`).
 
 ```yaml
 version: 1
 paths:
-  sources:
-  streams:
-  dataset:
+  sources: ../../sources
+  streams: ../../contracts
+  dataset: dataset.yaml
 globals:
   opening_time: "2024-01-01T16:00:00Z"
   last_call: "2024-01-02T02:00:00Z"
@@ -90,13 +115,13 @@ globals:
 
 ### 3. Stock the bottles (raw sources)
 
-Create `config/
+Create `config/sources/<alias>.yaml` files. Each must expose a `parser` and `loader`
 pointing at entry points plus any constructor arguments
 (`src/datapipeline/services/bootstrap.py`). Here is a synthetic clock source that feels
 like a drip of barrel-aged bitters:
 
 ```yaml
-# config/
+# config/sources/time_ticks.yaml
 parser:
   entrypoint: "synthetic.time"
   args: {}
@@ -128,7 +153,7 @@ mapper:
   mode: spritz
 ```
 
-The mapper uses the provided mode to create a new `
+The mapper uses the provided mode to create a new `TimeSeriesRecord` stream ready for the
 feature stage (`mappers/synthetic/time.py`).
 
 ### 5. Script the tasting menu (dataset)
@@ -138,28 +163,53 @@ are grouped (`src/datapipeline/config/dataset/dataset.py`). A minimal hourly men
 look like:
 
 ```yaml
-# config/
+# config/recipes/default/dataset.yaml
 group_by:
   keys:
     - type: time
       field: time
      resolution: 1h
 features:
-  -
-
-    partition_by: null
-    filters: []
+  - id: hour_spritz
+    stream: time.encode
     transforms:
-      -
+      - record:
+          transform: time_lag
+          args: 0h
+      - feature:
+          transform: standard_scale
+          with_mean: true
+          with_std: true
+      - sequence:
+          transform: time_window
+          size: 4
+          stride: 1
+      - sequence:
+          transform: time_fill_mean
+          window: 24
+          min_samples: 6
 ```
 
 Use the sample `dataset` template as a starting point if you prefer scaffolding before
-pouring concrete values. Group keys
-requested resolution)
-
-`
-
-
+pouring concrete values. Group keys now require explicit time bucketing (with automatic
+flooring to the requested resolution) so every pipeline is clock-driven. You can attach
+feature or sequence transforms—such as the sliding `TimeWindowTransformer` or the
+`time_fill_mean`/`time_fill_median` imputers—directly in the YAML by referencing their
+entry point names (`src/datapipeline/transforms/sequence.py`).
+
+When vectors are assembled you can optionally apply `vector_transforms` to enforce schema
+guarantees. The built-ins cover:
+
+- `fill_history` – use running means/medians from prior buckets (per partition) with
+  configurable window/minimum samples.
+- `fill_horizontal` – aggregate sibling partitions at the same timestamp (e.g. other
+  stations) using mean/median.
+- `fill_constant` – provide a constant default for missing features/partitions.
+- `drop_missing` – drop vectors that fall below a coverage threshold or omit required
+  features.
+
+Transforms accept either an explicit `expected` list or a manifest path to discover the
+full partition set (`build/partitions.json` produced by `jerry inspect partitions`).
 
 Once the book is ready, run the bootstrapper (the CLI does this automatically) to
 materialize all registered sources and streams
@@ -172,9 +222,9 @@ materialize all registered sources and streams
 ### Prep any station (with visuals)
 
 ```bash
-jerry prep pour --project config/project.yaml --limit 20
-jerry prep build --project config/project.yaml --limit 20
-jerry prep stir --project config/project.yaml --limit 20
+jerry prep pour --project config/datasets/default/project.yaml --limit 20
+jerry prep build --project config/datasets/default/project.yaml --limit 20
+jerry prep stir --project config/datasets/default/project.yaml --limit 20
 ```
 
 - `prep pour` shows the record-stage ingredients headed for each feature.
@@ -191,34 +241,79 @@ loaders. The CLI wires up `build_record_pipeline`, `build_feature_pipeline` and
 ### Serve the flights (production mode)
 
 ```bash
-jerry serve --project config/project.yaml --output print
-jerry serve --project config/project.yaml --output stream
-jerry serve --project config/project.yaml --output exports/batch.pt
+jerry serve --project config/datasets/default/project.yaml --output print
+jerry serve --project config/datasets/default/project.yaml --output stream
+jerry serve --project config/datasets/default/project.yaml --output exports/batch.pt
 ```
 
 Production mode skips the bar flair and focuses on throughput. `print` writes tasting
 notes to stdout, `stream` emits newline-delimited JSON (with values coerced to strings when
 necessary), and a `.pt` destination stores a pickle-compatible payload for later pours.
 
-
-
-
-
+## Funnel vectors into ML projects
+
+Data scientists rarely want to shell out to the CLI; they need a programmatic
+hand-off that plugs vectors straight into notebooks, feature stores or training
+loops. The `datapipeline.integrations` package wraps the existing iterator
+builders with ML-friendly adapters without pulling pandas or torch into the
+core runtime.
+
+```python
+from datapipeline.integrations import (
+    VectorAdapter,
+    dataframe_from_vectors,
+    iter_vector_rows,
+    torch_dataset,
+)
+
+# Bootstrap once and stream ready-to-use rows.
+adapter = VectorAdapter.from_project("config/project.yaml")
+for row in adapter.iter_rows(limit=32, flatten_sequences=True):
+    send_to_feature_store(row)
+
+# Helper functions cover ad-hoc jobs as well.
+rows = iter_vector_rows(
+    "config/project.yaml",
+    include_group=True,
+    group_format="mapping",
+    flatten_sequences=True,
+)
+
+# Optional extras materialize into common ML containers if installed.
+df = dataframe_from_vectors("config/project.yaml")  # Requires pandas
+dataset = torch_dataset("config/project.yaml", dtype=torch.float32)  # Requires torch
 ```
 
-
-
-
-
+Everything still flows through `build_vector_pipeline`; the integration layer
+normalizes group keys, optionally flattens sequence features and demonstrates
+how to turn the iterator into DataFrames or `torch.utils.data.Dataset`
+instances. ML teams can fork the same pattern for their own stacks—Spark, NumPy
+or feature store SDKs—without adding opinionated glue to the runtime itself.
+
+### Inspect the balance (vector quality)
+
+Use the inspect helpers for different outputs:
+
+- `jerry inspect report --project config/datasets/default/project.yaml` — print a
+  human-readable quality report (totals, keep/below lists, optional partition detail).
+- `jerry inspect coverage --project config/datasets/default/project.yaml` — persist the
+  coverage summary to `build/coverage.json` (keep/below feature and partition lists plus
+  coverage percentages).
+- `jerry inspect matrix --project config/datasets/default/project.yaml --format html` —
+  export availability matrices (CSV or HTML) for deeper analysis.
+- `jerry inspect partitions --project config/datasets/default/project.yaml` — write the
+  observed partition manifest to `build/partitions.json` for use in configs.
+
+Note: `jerry prep taste` has been removed; use `jerry inspect report` and friends.
 
 ---
 
-## Extending the
+## Extending the CLI
 
 ### Scaffold a plugin package
 
 ```bash
-jerry
+jerry plugin init --name my_datapipeline --out .
 ```
 
 The generator copies a ready-made skeleton (pyproject, README, package directory) and
@@ -232,25 +327,29 @@ transforms.
 Use the CLI helpers to scaffold boilerplate code in your plugin workspace:
 
 ```bash
-jerry
-jerry
-jerry contract
+jerry source add --provider dmi --dataset metobs --transport fs --format csv
+jerry domain add --domain metobs
+jerry contract
 ```
 
-The
-YAML file in `config/
+The source command writes DTO/parser stubs, updates entry points and drops a matching
+YAML file in `config/sources/` pre-filled with composed-loader defaults for the chosen
 transport (`src/datapipeline/cli/app.py`, `src/datapipeline/services/scaffold/source.py`).
+`jerry domain add` now always scaffolds `TimeSeriesRecord` domains so every mapper carries
+an explicit timestamp alongside its value, and `jerry contract` wires that source/domain
+pair up for canonical stream generation.
 
 ### Add custom filters or transforms
 
 Register new functions/classes under the appropriate entry point group in your plugin’s
-`pyproject.toml`. The runtime resolves them through `load_ep`, applies record
-
-
+`pyproject.toml`. The runtime resolves them through `load_ep`, applies record filters first,
+then record/feature/sequence transforms in the order declared in the dataset config
+(`pyproject.toml`, `src/datapipeline/utils/load.py`,
 `src/datapipeline/pipeline/utils/transform_utils.py`). Built-in helpers cover common
 comparisons (including timezone-aware checks) and time-based transforms (lags, sliding
 windows) if you need quick wins (`src/datapipeline/filters/filters.py`,
-`src/datapipeline/transforms/
+`src/datapipeline/transforms/record.py`, `src/datapipeline/transforms/feature.py`,
+`src/datapipeline/transforms/sequence.py`).
 
 ### Prototype with synthetic time-series data
 
@@ -268,8 +367,7 @@ transform to build sliding-window feature flights without external datasets
 
 | Type | Description |
 | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `
-| `TimeFeatureRecord` | A record with a timezone-aware `time` attribute, normalized to UTC to avoid boundary issues (`src/datapipeline/domain/record.py`). |
+| `TimeSeriesRecord` | Canonical record with `time` (tz-aware, normalized to UTC) and `value`; the pipeline treats streams as ordered series (`src/datapipeline/domain/record.py`).|
 | `FeatureRecord` | Links a record (or list of records from sequence transforms) to a `feature_id` and `group_key` (`src/datapipeline/domain/feature.py`). |
 | `Vector` | Final grouped payload: a mapping of feature IDs to scalars or ordered lists plus helper methods for shape/key access (`src/datapipeline/domain/vector.py`). |
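The domain-type table that closes both diffs describes three shapes. A minimal illustrative sketch of those shapes follows; the real definitions live under `src/datapipeline/domain/`, and anything here beyond the fields the table names (`time`, `value`, `feature_id`, `group_key`) is an assumption:

```python
# Illustrative only: shapes inferred from the domain-type table, not the
# package's actual classes (see src/datapipeline/domain/ for those).
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List, Union

@dataclass
class TimeSeriesRecord:
    time: datetime  # timezone-aware; the table says it is normalized to UTC
    value: float

    def __post_init__(self) -> None:
        # Normalizing to UTC keeps hourly bucket keys stable across offsets.
        self.time = self.time.astimezone(timezone.utc)

@dataclass
class FeatureRecord:
    feature_id: str
    group_key: datetime  # the floored time bucket from group_by
    # A single record, or an ordered list produced by a sequence transform.
    records: Union[TimeSeriesRecord, List[TimeSeriesRecord]]

# A Vector maps feature IDs to scalars or ordered lists for one group key, e.g.:
vector_payload = {"hour_spritz": [0.1, 0.4, 0.9, 0.7]}
```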