jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +1 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +3 -3
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +74 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.0.dist-info/METADATA +825 -0
- jerry_thomas-1.0.0.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/templates/plugin_skeleton/README.md

@@ -4,10 +4,10 @@ Minimal plugin skeleton for the Jerry Thomas (datapipeline) framework.
 
 Quick start
 - Initialize a plugin (already done if you’re reading this here):
-  - `jerry plugin init
+  - `jerry plugin init {{DIST_NAME}}`
 - Add a source via CLI (transport-specific placeholders are scaffolded):
-  - File data: `jerry source add
-  -
+  - File data: `jerry source add <provider> <dataset> -t fs -f <csv|json|json-lines|pickle>`
+  - HTTP data: `jerry source add <provider>.<dataset> -t http -f <json|json-lines|csv>`
   - Synthetic: `jerry source add -p <provider> -d <dataset> -t synthetic`
 - Edit the generated `config/sources/*.yaml` to fill in the `path`, delimiter, etc.
 - Reinstall after EP changes (pyproject.toml) and restart Python processes:
@@ -15,12 +15,15 @@ Quick start
   - This plugin: `python -m pip install -e .`
 
 Folder layout
-- `
-- `
+- `example/`
+  - `project.yaml` — project root (paths, globals, cadence/split)
+  - `dataset.yaml` — feature/target declarations (uses `${group_by}` from globals)
+  - `postprocess.yaml` — postprocess transforms
   - `contracts/*.yaml` — canonical stream definitions
-  - `
+  - `sources/*.yaml` — raw source definitions (one file per source)
+  - `tasks/*.yaml` — task specs (schema/scaler/metadata/serve)
 - Every dataset `project.yaml` declares a `name`; reference it via `${project_name}`
-  inside other config files (e.g., `paths.artifacts:
+  inside other config files (e.g., `paths.artifacts: ../artifacts/${project_name}`) to
   avoid hard-coding per-dataset directories.
 - `src/{{PACKAGE_NAME}}/`
   - `sources/<provider>/<dataset>/dto.py` — DTO model for the source
@@ -30,52 +33,59 @@ Folder layout
   - `mappers/*.py` — map DTOs → domain records
 
 How loaders work
-- For fs/
-  - `loader.entrypoint: "{{
+- For fs/http, sources use the generic loader entry point:
+  - `loader.entrypoint: "{{DEFAULT_IO_LOADER_EP}}"`
 - `loader.args` include `transport`, `format`, and source-specific args (placeholders are provided):
   - fs: `path`, `glob`, `encoding`, plus `delimiter` for csv
-  -
+  - http: `url`, `headers`, `params`, `encoding`, optional `count_by_fetch`
 - Synthetic sources generate data in-process and keep a small loader stub.
 
 Run data flows
-- Build artifacts once: `jerry build --project
-- Preview records (stage 1): `jerry serve --project
-- Preview features (stage 3): `jerry serve --project
-- Preview vectors (stage 7): `jerry serve --project
+- Build artifacts once: `jerry build --project example/project.yaml`
+- Preview records (stage 1): `jerry serve --project example/project.yaml --stage 1 --limit 100`
+- Preview features (stage 3): `jerry serve --project example/project.yaml --stage 3 --limit 100`
+- Preview vectors (stage 7): `jerry serve --project example/project.yaml --stage 7 --limit 100`
 
 Analyze vectors
-- `jerry inspect report --project
-- `jerry inspect
-- `jerry inspect matrix --project
-- `jerry inspect
+- `jerry inspect report --project example/project.yaml` (console only)
+- `jerry inspect partitions --project example/project.yaml` (writes build/partitions.json)
+- `jerry inspect matrix --project example/project.yaml --format html` (writes build/matrix.html)
+- `jerry inspect expected --project example/project.yaml` (writes build/expected.txt)
 - Use post-processing transforms in `postprocess.yaml` to keep coverage high
   (history/horizontal fills, constants, or drop rules) before serving vectors.
+  Add `payload: targets` inside a transform when you need to mutate label vectors.
 
 Train/Val/Test splits (deterministic)
 - Configure split mechanics once in your project file:
-  - Edit `
+  - Edit `example/project.yaml` and set:
     ```yaml
     globals:
+      group_by: 10m             # dataset cadence; reused as contract cadence
      split:
        mode: hash               # hash|time
        key: group               # group or feature:<id> (entity-stable)
        seed: 42                 # deterministic hash seed
        ratios: {train: 0.8, val: 0.1, test: 0.1}
     ```
-  - Select the active slice via `
+  - Select the active slice via `example/tasks/serve.<name>.yaml` (or `--keep`):
     ```yaml
-
+    kind: serve
+    name: train               # defaults to filename stem when omitted
     keep: train               # any label defined in globals.split; null disables filtering
-    output:
+    output:
+      transport: stdout       # stdout | fs
+      format: print           # print | json-lines | json | csv | pickle
     limit: 100                # cap vectors per serve run (null = unlimited)
-    include_targets: false    # include dataset.targets when serving
     throttle_ms: null         # sleep between vectors (milliseconds)
+    # visuals: AUTO           # AUTO | TQDM | RICH | OFF
+    # progress: AUTO          # AUTO | SPINNER | BARS | OFF
     ```
-  -
-
-  -
-  - `jerry serve -p
-  - `
+- Add additional `kind: serve` files (e.g., `serve.val.yaml`, `serve.test.yaml`) and the CLI will run each enabled file in order unless you pass `--run <name>`.
+- Serve examples (change the serve task or pass `--keep val|test`):
+  - `jerry serve -p example/project.yaml --out-transport stdout --out-format json-lines > train.jsonl`
+  - `jerry serve -p example/project.yaml --keep val --out-transport stdout --out-format json-lines > val.jsonl`
+- Add `--visuals rich --progress bars` for a richer interactive UI; defaults to `AUTO`.
+- For shared workspace defaults (visual renderer, progress display, build mode), drop a `jerry.yaml` next to your workspace root and set `shared.visuals`, `shared.progress`, etc. CLI commands walk up from the current directory to find it.
 - The split is applied at the end (after postprocess transforms), and assignment
   is deterministic (hash-based) with a fixed seed; no overlap across runs.
 
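For context, a minimal sketch of the deterministic hash-based split the README hunk above configures: a seeded hash of the group key mapped onto the declared ratios. This is illustrative only; `assign_split` is a hypothetical helper, not part of the jerry-thomas API.

```python
import hashlib

def assign_split(key: str, seed: int, ratios: dict[str, float]) -> str:
    # Hash the entity key together with the seed so assignment is stable across
    # runs and machines (unlike Python's randomized built-in hash()).
    digest = hashlib.sha256(f"{seed}:{key}".encode()).hexdigest()
    bucket = int(digest, 16) / 16 ** len(digest)   # uniform value in [0, 1)
    cumulative = 0.0
    for label, ratio in ratios.items():
        cumulative += ratio
        if bucket < cumulative:
            return label
    return label   # guard against floating-point rounding on the last label

ratios = {"train": 0.8, "val": 0.1, "test": 0.1}
# The same key always lands in the same slice, run after run, so entity-stable
# keys (e.g. a station id) keep all of that entity's vectors together.
print(assign_split("station_7", seed=42, ratios=ratios))
```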
@@ -84,13 +94,47 @@ Key selection guidance
 - `key: feature:<id>` hashes a specific feature value, e.g., `feature:entity_id` or `feature:station_id`, ensuring all vectors for that entity land in the same split (recommended to avoid leakage).
 
 Postprocess expected IDs
-- Build once with `jerry build --project config/
+- Build once with `jerry build --project config/project.yaml` (or run `jerry inspect expected …`) to materialize `<paths.artifacts>/expected.txt`.
 - Bootstrap registers the artifact; postprocess transforms read it automatically. Per-transform `expected:` overrides are no longer required or supported — the build output is the single source of truth.
 
 Scaler statistics
-- 
+- Jerry computes scaler stats automatically. If you need custom paths or settings, add `tasks/scaler.yaml` and override the defaults.
 - The build writes `<paths.artifacts>/scaler.pkl`; runtime scaling requires this artifact. If it is missing, scaling transforms raise a runtime error.
 
 Tips
 - Keep parsers thin — mirror source schema and return DTOs; use the identity parser only if your loader already emits domain records.
 - Prefer small, composable configs over monolithic ones: one YAML per source is easier to review and reuse.
+
+Composed streams (engineered domains)
+- Declare engineered streams that depend on other canonical streams directly in contracts. The runtime builds each input to stage 4, stream‑aligns by partition+timestamp, runs your composer, and emits fresh records for the derived stream.
+
+```yaml
+# example/contracts/air_density.processed.yaml
+kind: composed
+id: air_density.processed
+inputs:
+  - p=pressure.processed
+  - t=temp_dry.processed
+partition_by: station_id
+sort_batch_size: 20000
+
+mapper:
+  entrypoint: {{PACKAGE_NAME}}.mappers.air_density:mapper
+  args:
+    driver: p   # optional; defaults to first input alias
+
+# Optional post‑compose policies (same as any stream):
+# record: [...]
+# stream: [...]
+# debug: [...]
+```
+
+Then reference the composed stream in your dataset:
+
+```yaml
+# example/dataset.yaml
+group_by: ${group_by}
+features:
+  - id: air_density
+    record_stream: air_density.processed
+```
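A rough sketch of what a composer mapper for the `air_density.processed` contract above might look like. The exact composer signature is not shown in this diff, so assume for illustration that the runtime hands the mapper records already aligned by partition and timestamp, keyed by the input aliases (`p`, `t`); the record type and field names here are stand-ins.

```python
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Iterator

@dataclass
class TemporalRecord:        # stand-in for the framework's domain record type
    time: datetime
    value: float

R_SPECIFIC_AIR = 287.05      # J/(kg*K), dry air

def mapper(stream: Iterator[dict], **params: Any) -> Iterator[TemporalRecord]:
    """Derive air density (kg/m^3) from aligned pressure (Pa) and temperature (degC)."""
    for aligned in stream:   # e.g. {"time": ..., "p": 101325.0, "t": 12.3}
        pressure, temp_c = aligned["p"], aligned["t"]
        if pressure is None or temp_c is None:
            continue         # skip ticks where either aligned input is missing
        density = pressure / (R_SPECIFIC_AIR * (temp_c + 273.15))
        yield TemporalRecord(time=aligned["time"], value=density)
```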
datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml

@@ -0,0 +1,31 @@
+kind: ingest
+source: synthetic.ticks
+id: time.ticks.hour_sin   # format: domain.dataset.(variant)
+
+# Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+cadence: ${group_by}
+
+mapper:
+  entrypoint: encode_time
+  args: { mode: hour_sin }
+
+# partition_by: field you want to partition
+
+record:
+  - filter: { operator: ge, field: time, comparand: "${start_time}" }
+  - filter: { operator: le, field: time, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" }
+  # - lag: { lag: "${cadence}" }
+
+stream:
+  - dedupe: {}
+  - granularity: { mode: first }
+  - ensure_cadence: { cadence: "${cadence}" }
+  # Optional: fill gaps before downstream transforms:
+  # - fill: { statistic: median, window: 24, min_samples: 4 }
+
+debug:
+  - lint: { mode: error, tick: "${cadence}" }
+
+# sort_batch_size: 100000
+
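The `mode: hour_sin` mapper above presumably encodes hour-of-day cyclically; a minimal sketch of that encoding (illustrative assumption, not the package's `encode_time` implementation):

```python
import math
from datetime import datetime

def hour_sin(ts: datetime) -> float:
    # Cyclical encoding: 23:00 and 00:00 map to nearby values instead of far apart.
    return math.sin(2 * math.pi * ts.hour / 24.0)

print(hour_sin(datetime(2021, 1, 1, 6)))   # 1.0 at 06:00, -1.0 at 18:00, 0.0 at midnight
```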
datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml

@@ -0,0 +1,30 @@
+kind: ingest
+source: synthetic.ticks   # raw source alias (see example/sources)
+id: time.ticks.linear     # canonical stream id (format: domain.dataset.(variant))
+
+# Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+cadence: ${group_by}
+
+mapper:   # normalize/reshape DTO -> TemporalRecord
+  entrypoint: encode_time
+  args: { mode: linear }
+# partition_by: station_id   # optional: add partition suffixes to feature ids
+
+record:   # record-level transforms
+  - filter: { operator: ge, field: time, comparand: "${start_time}" }
+  - filter: { operator: le, field: time, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" }   # snap timestamps to cadence boundaries
+  # - lag: { lag: "${cadence}" }            # optional: shift timestamps backwards
+
+stream:   # per-feature stream transforms (input sorted by id,time)
+  - dedupe: {}                     # drop exact-duplicate records per tick
+  - granularity: { mode: first }   # aggregate duplicates within a tick
+  - ensure_cadence: { cadence: "${cadence}" }   # insert missing ticks (value=None)
+  # Consider adding a fill transform to impute None values before sequence/windowing:
+  # - fill: { statistic: median, window: 6, min_samples: 1 }
+
+debug:   # optional validation-only transforms
+  - lint: { mode: error, tick: "${cadence}" }   # strict cadence/order; value issues handled by downstream transforms
+
+# sort_batch_size: 100000   # in-memory chunk size used by internal sorting
+
datapipeline/templates/plugin_skeleton/example/dataset.yaml

@@ -0,0 +1,18 @@
+group_by: ${group_by}
+
+features:
+  - id: time_linear
+    record_stream: time.ticks.linear
+    scale: true   # optionally add with_mean/with_std overrides
+    # Sliding window over the regularized stream; cadence is enforced in the contract.
+    sequence: { size: 6, stride: 1 }
+
+  - id: time_hour_sin
+    record_stream: time.ticks.hour_sin
+
+  # - id: third_feature
+  #   record_stream: anotherstream
+# targets:
+#   - id: some_target
+#     record_stream: time.ticks.linear
+
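A rough illustration of what `sequence: { size: 6, stride: 1 }` implies for a regularized feature stream: overlapping windows of six consecutive ticks, advancing one tick at a time (sketch only; the framework's own windowing lives in the sequence transform).

```python
from typing import Iterator, Sequence, TypeVar

T = TypeVar("T")

def sliding_windows(values: Sequence[T], size: int, stride: int) -> Iterator[Sequence[T]]:
    # Emit every window of `size` consecutive items, stepping by `stride`.
    for start in range(0, len(values) - size + 1, stride):
        yield values[start:start + size]

ticks = list(range(10))
print([list(w) for w in sliding_windows(ticks, size=6, stride=1)])
# [[0..5], [1..6], [2..7], [3..8], [4..9]]
```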
datapipeline/templates/plugin_skeleton/example/postprocess.yaml

@@ -0,0 +1,29 @@
+#### example combination of postprocessing steps ######
+#### making sure data is complete after these combinations ######
+- drop:   # example of dropping sparse partitions/vertical-axis for targets
+    axis: vertical
+    payload: targets
+    threshold: 0.9
+
+- drop:   # example of dropping sparse partitions for features
+    axis: vertical
+    payload: features
+    threshold: 0.9
+
+- drop:   # dropping vectors/horizontal-axis that has features which none
+    axis: horizontal
+    payload: features
+    threshold: 1
+
+- drop:
+    axis: horizontal
+    payload: targets
+    threshold: 1
+######
+# - fill:
+#     statistic: median
+#     window: 48
+#     min_samples: 6
+# - replace:
+#     payload: targets
+#     value: 0.0
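A sketch of the drop semantics these steps appear to configure, assuming `axis: vertical` drops whole feature/target columns whose coverage across all vectors falls below `threshold`, and `axis: horizontal` drops individual vectors whose own coverage is below it (`threshold: 1` keeping only fully populated rows). The helpers below are illustrative, not the package's transforms.

```python
def column_coverage(vectors: list[dict], feature: str) -> float:
    present = sum(1 for v in vectors if v.get(feature) is not None)
    return present / len(vectors) if vectors else 0.0

def drop_sparse_columns(vectors: list[dict], threshold: float) -> list[dict]:
    # Vertical drop: remove features that are missing too often across the dataset.
    features = {k for v in vectors for k in v}
    keep = {f for f in features if column_coverage(vectors, f) >= threshold}
    return [{k: val for k, val in v.items() if k in keep} for v in vectors]

def drop_sparse_rows(vectors: list[dict], threshold: float) -> list[dict]:
    # Horizontal drop: remove vectors that are themselves too incomplete.
    features = sorted({k for v in vectors for k in v})
    def row_coverage(v: dict) -> float:
        return sum(1 for f in features if v.get(f) is not None) / len(features)
    return [v for v in vectors if row_coverage(v) >= threshold]

vectors = [{"a": 1.0, "b": None}, {"a": 2.0, "b": 3.0}]
print(drop_sparse_rows(drop_sparse_columns(vectors, threshold=0.9), threshold=1))
```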
datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml

@@ -1,18 +1,21 @@
 version: 1
-name:
+name: example
 paths:
-  streams:
-  sources:
+  streams: ./contracts
+  sources: ./sources
   dataset: dataset.yaml
   postprocess: postprocess.yaml
-  artifacts:
-
-  run: runs
+  artifacts: ../artifacts/${project_name}/v${version}
+  tasks: ./tasks
 globals:
+  # Globals to use in your .yaml files via ${var_name}.
+  # Primary dataset cadence; referenced from dataset.yaml (group_by)
+  # and contracts via ${group_by}.
+  group_by: 1h
   start_time: 2021-01-01T00:00:00Z
-  end_time:
+  end_time: 2021-01-02T00:00:00Z
   # Configure deterministic dataset split here (applied at serve time, after postprocess).
-  # Adjust `ratios` as needed; the active split is selected via
+  # Adjust `ratios` as needed; the active split is selected via serve tasks or CLI.
   split:
     mode: hash   # hash | time (time uses boundaries/labels)
     key: group   # group | feature:<id> (entity-stable split)
datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml

@@ -0,0 +1,28 @@
+kind: serve
+
+# Optional identifier for this serve task; defaults to filename stem.
+name: train
+
+# Active split label to serve; must match a label from globals.split.ratios.
+# Set to null to disable split filtering.
+keep: train
+#output:
+#  transport: stdout | fs
+#  format: print | json-lines | json | csv | pickle
+#  # When using fs transport, set a directory (and optionally filename) for outputs:
+#  directory: artifacts/serve
+#  filename: vectors.train
+
+# Default max number of vectors to emit (null = unlimited).
+# limit: 5
+# Optional pipeline stage preview (0-7); null lets the CLI decide.
+# stage: 7
+
+# Optional pacing between emitted vectors (milliseconds).
+# throttle_ms: null
+
+# Visuals/logging knobs (inherit CLI or jerry.yaml defaults when omitted):
+# visuals: AUTO    # AUTO | TQDM | RICH | OFF
+# progress: AUTO   # AUTO | SPINNER | BARS | OFF
+# log_level: INFO  # CRITICAL | ERROR | WARNING | INFO | DEBUG
+
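When a serve task (or `--out-transport stdout --out-format json-lines`) is redirected to a file as in the README's serve examples, each line is one JSON document per served vector. A minimal reader sketch; the exact vector field layout is not shown in this diff, so rows are treated as opaque dicts.

```python
import json

with open("train.jsonl", encoding="utf-8") as fh:
    rows = [json.loads(line) for line in fh if line.strip()]

print(f"loaded {len(rows)} vectors")
```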
datapipeline/templates/plugin_skeleton/jerry.yaml

@@ -0,0 +1,28 @@
+# Workspace defaults. Move this file to the root of your workspace if you serve
+# configs outside the plugin repo; the CLI searches upward from cwd for jerry.yaml.
+
+plugin_root: .   # relative path to the plugin repo root
+
+datasets:
+  example: example/project.yaml
+  your-second-example-dataset: your-dataset/project.yaml
+
+default_dataset: example
+
+shared:
+  visuals: AUTO    # AUTO | TQDM | RICH | OFF
+  progress: BARS   # AUTO | SPINNER | BARS | OFF
+  log_level: INFO
+
+serve:
+  # log_level: INFO
+  limit: null
+  stage: null
+  output:
+    transport: stdout
+    format: print   # set to fs + directory for file outputs
+    # directory: artifacts/serve
+
+build:
+  # log_level: INFO
+  mode: AUTO   # AUTO | FORCE | OFF
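The comment in jerry.yaml says the CLI searches upward from the current directory for this file; a sketch of that lookup behavior (illustration only, not the package's own implementation):

```python
from pathlib import Path

def find_workspace_config(start: Path | None = None) -> Path | None:
    # Walk from the starting directory up through its parents until jerry.yaml is found.
    current = (start or Path.cwd()).resolve()
    for directory in [current, *current.parents]:
        candidate = directory / "jerry.yaml"
        if candidate.is_file():
            return candidate
    return None

print(find_workspace_config())
```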
datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml

@@ -0,0 +1,31 @@
+kind: ingest
+source: synthetic.ticks
+id: time.ticks.hour_sin   # format: domain.dataset.(variant)
+
+# Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+cadence: ${group_by}
+
+mapper:
+  entrypoint: encode_time
+  args: { mode: hour_sin }
+
+# partition_by: field you want to partition
+
+record:
+  - filter: { operator: ge, field: time, comparand: "${start_time}" }
+  - filter: { operator: le, field: time, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" }
+  # - lag: { lag: "${cadence}" }
+
+stream:
+  - dedupe: {}
+  - granularity: { mode: first }
+  - ensure_cadence: { cadence: "${cadence}" }
+  # Optional: fill gaps before downstream transforms:
+  # - fill: { statistic: median, window: 24, min_samples: 4 }
+
+debug:
+  - lint: { mode: error, tick: "${cadence}" }
+
+# sort_batch_size: 100000
+
datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml

@@ -0,0 +1,30 @@
+kind: ingest
+source: synthetic.ticks   # raw source alias (see example/sources)
+id: time.ticks.linear     # canonical stream id (format: domain.dataset.(variant))
+
+# Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+cadence: ${group_by}
+
+mapper:   # normalize/reshape DTO -> TemporalRecord
+  entrypoint: encode_time
+  args: { mode: linear }
+# partition_by: station_id   # optional: add partition suffixes to feature ids
+
+record:   # record-level transforms
+  - filter: { operator: ge, field: time, comparand: "${start_time}" }
+  - filter: { operator: le, field: time, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" }   # snap timestamps to cadence boundaries
+  # - lag: { lag: "${cadence}" }            # optional: shift timestamps backwards
+
+stream:   # per-feature stream transforms (input sorted by id,time)
+  - dedupe: {}                     # drop exact-duplicate records per tick
+  - granularity: { mode: first }   # aggregate duplicates within a tick
+  - ensure_cadence: { cadence: "${cadence}" }   # insert missing ticks (value=None)
+  # Consider adding a fill transform to impute None values before sequence/windowing:
+  # - fill: { statistic: median, window: 6, min_samples: 1 }
+
+debug:   # optional validation-only transforms
+  - lint: { mode: error, tick: "${cadence}" }   # strict cadence/order; value issues handled by downstream transforms
+
+# sort_batch_size: 100000   # in-memory chunk size used by internal sorting
+
datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml

@@ -0,0 +1,18 @@
+group_by: ${group_by}
+
+features:
+  - id: time_linear
+    record_stream: time.ticks.linear
+    scale: true   # optionally add with_mean/with_std overrides
+    # Sliding window over the regularized stream; cadence is enforced in the contract.
+    sequence: { size: 6, stride: 1 }
+
+  - id: time_hour_sin
+    record_stream: time.ticks.hour_sin
+
+  # - id: third_feature
+  #   record_stream: anotherstream
+# targets:
+#   - id: some_target
+#     record_stream: time.ticks.linear
+
datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml

@@ -0,0 +1,29 @@
+#### example combination of postprocessing steps ######
+#### making sure data is complete after these combinations ######
+- drop:   # example of dropping sparse partitions/vertical-axis for targets
+    axis: vertical
+    payload: targets
+    threshold: 0.9
+
+- drop:   # example of dropping sparse partitions for features
+    axis: vertical
+    payload: features
+    threshold: 0.9
+
+- drop:   # dropping vectors/horizontal-axis that has features which none
+    axis: horizontal
+    payload: features
+    threshold: 1
+
+- drop:
+    axis: horizontal
+    payload: targets
+    threshold: 1
+######
+# - fill:
+#     statistic: median
+#     window: 48
+#     min_samples: 6
+# - replace:
+#     payload: targets
+#     value: 0.0
datapipeline/templates/plugin_skeleton/your-dataset/project.yaml

@@ -0,0 +1,22 @@
+version: 1
+name: <your-dataset>
+paths:
+  streams: ./contracts
+  sources: ./sources
+  dataset: dataset.yaml
+  postprocess: postprocess.yaml
+  artifacts: ../artifacts/${project_name}/v${version}
+  tasks: ./tasks
+globals:
+  # Primary dataset cadence; referenced from dataset.yaml (group_by)
+  # and contracts via ${group_by}.
+  group_by: <your-bucket-cadence>
+  start_time: null #2021-01-01T00:00:00Z
+  end_time: null #2021-01-02T00:00:00Z
+  # Configure deterministic dataset split here (applied at serve time, after postprocess).
+  # Adjust `ratios` as needed; the active split is selected via serve tasks or CLI.
+  split:
+    mode: hash   # hash | time (time uses boundaries/labels)
+    key: group   # group | feature:<id> (entity-stable split)
+    seed: 42     # deterministic hash seed
+    ratios: { train: 0.8, val: 0.1, test: 0.1 }
datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml

@@ -0,0 +1,28 @@
+kind: serve
+
+# Optional identifier for this serve task; defaults to filename stem.
+name: train
+
+# Active split label to serve; must match a label from globals.split.ratios.
+# Set to null to disable split filtering.
+keep: train
+#output:
+#  transport: stdout | fs
+#  format: print | json-lines | json | csv | pickle
+#  # When using fs transport, set a directory (and optionally filename) for outputs:
+#  directory: artifacts/serve
+#  filename: vectors.train
+
+# Default max number of vectors to emit (null = unlimited).
+# limit: 5
+# Optional pipeline stage preview (0-7); null lets the CLI decide.
+# stage: 7
+
+# Optional pacing between emitted vectors (milliseconds).
+# throttle_ms: null
+
+# Visuals/logging knobs (inherit CLI or jerry.yaml defaults when omitted):
+# visuals: AUTO    # AUTO | TQDM | RICH | OFF
+# progress: AUTO   # AUTO | SPINNER | BARS | OFF
+# log_level: INFO  # CRITICAL | ERROR | WARNING | INFO | DEBUG
+
datapipeline/templates/stubs/mapper.py.j2

@@ -1,16 +1,17 @@
-from typing import
-
+from typing import Any, Iterator
+
 from {{PACKAGE_NAME}}.domains.{{TARGET_DOMAIN}}.model import {{DomainRecord}}
+from {{PACKAGE_NAME}}.sources.{{ORIGIN}}.{{DATASET}}.dto import {{OriginDTO}}
 
 
 def {{FUNCTION_NAME}}(
     stream: Iterator[{{OriginDTO}}],
-
+    **params: Any,
 ) -> Iterator[{{DomainRecord}}]:
     """Map raw {{ORIGIN}} DTOs to domain-level {{TARGET_DOMAIN}} records.
 
     - Required on domain record: time and value.
-    - Additional options may be passed via kwargs (e.g., mode="...").
+    - Additional options may be passed via kwargs (e.g., variant="..." or mode="...").
     """
     for dto in stream:
         # TODO: construct {{DomainRecord}} from dto fields