jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +1 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +3 -3
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +74 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.0.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.0.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
@@ -4,10 +4,10 @@ Minimal plugin skeleton for the Jerry Thomas (datapipeline) framework.
 
 Quick start
 - Initialize a plugin (already done if you’re reading this here):
-- `jerry plugin init --name {{DIST_NAME}}`
+- `jerry plugin init {{DIST_NAME}}`
 - Add a source via CLI (transport-specific placeholders are scaffolded):
-- File data: `jerry source add -p <provider> -d <dataset> -t fs -f <csv|json|json-lines>`
-- URL data: `jerry source add -p <provider> -d <dataset> -t url -f <json|json-lines|csv>`
+- File data: `jerry source add <provider> <dataset> -t fs -f <csv|json|json-lines|pickle>`
+- HTTP data: `jerry source add <provider>.<dataset> -t http -f <json|json-lines|csv>`
 - Synthetic: `jerry source add -p <provider> -d <dataset> -t synthetic`
 - Edit the generated `config/sources/*.yaml` to fill in the `path`, delimiter, etc.
 - Reinstall after EP changes (pyproject.toml) and restart Python processes:
@@ -15,12 +15,15 @@ Quick start
 - This plugin: `python -m pip install -e .`
 
 Folder layout
-- `config/`
-- `sources/*.yaml` — raw source definitions (one file per source)
+- `example/`
+- `project.yaml` — project root (paths, globals, cadence/split)
+- `dataset.yaml` — feature/target declarations (uses `${group_by}` from globals)
+- `postprocess.yaml` — postprocess transforms
 - `contracts/*.yaml` — canonical stream definitions
-- `datasets/<name>/build.yaml` — build configuration (partitioned ids today, more artifacts later)
+- `sources/*.yaml` — raw source definitions (one file per source)
+- `tasks/*.yaml` — task specs (schema/scaler/metadata/serve)
 - Every dataset `project.yaml` declares a `name`; reference it via `${project_name}`
-inside other config files (e.g., `paths.artifacts: ../../build/datasets/${project_name}`) to
+inside other config files (e.g., `paths.artifacts: ../artifacts/${project_name}`) to
 avoid hard-coding per-dataset directories.
 - `src/{{PACKAGE_NAME}}/`
 - `sources/<provider>/<dataset>/dto.py` — DTO model for the source
@@ -30,52 +33,59 @@ Folder layout
 - `mappers/*.py` — map DTOs → domain records
 
 How loaders work
-- For fs/url, sources use the generic loader entry point:
-- `loader.entrypoint: "{{COMPOSED_LOADER_EP}}"`
+- For fs/http, sources use the generic loader entry point:
+- `loader.entrypoint: "{{DEFAULT_IO_LOADER_EP}}"`
 - `loader.args` include `transport`, `format`, and source-specific args (placeholders are provided):
 - fs: `path`, `glob`, `encoding`, plus `delimiter` for csv
-- url: `url`, `headers`, `encoding`, optional `count_by_fetch`
+- http: `url`, `headers`, `params`, `encoding`, optional `count_by_fetch`
 - Synthetic sources generate data in-process and keep a small loader stub.
 
 Run data flows
-- Build artifacts once: `jerry build --project config/datasets/default/project.yaml`
-- Preview records (stage 1): `jerry serve --project config/datasets/default/project.yaml --stage 1 --limit 100`
-- Preview features (stage 3): `jerry serve --project config/datasets/default/project.yaml --stage 3 --limit 100`
-- Preview vectors (stage 7): `jerry serve --project config/datasets/default/project.yaml --stage 7 --limit 100`
+- Build artifacts once: `jerry build --project example/project.yaml`
+- Preview records (stage 1): `jerry serve --project example/project.yaml --stage 1 --limit 100`
+- Preview features (stage 3): `jerry serve --project example/project.yaml --stage 3 --limit 100`
+- Preview vectors (stage 7): `jerry serve --project example/project.yaml --stage 7 --limit 100`
 
 Analyze vectors
-- `jerry inspect report --project config/datasets/default/project.yaml` (console only)
-- `jerry inspect coverage --project config/datasets/default/project.yaml` (writes build/coverage.json)
-- `jerry inspect matrix --project config/datasets/default/project.yaml --format html` (writes build/matrix.html)
-- `jerry inspect partitions --project config/datasets/default/project.yaml` (writes build/partitions.json)
+- `jerry inspect report --project example/project.yaml` (console only)
+- `jerry inspect partitions --project example/project.yaml` (writes build/partitions.json)
+- `jerry inspect matrix --project example/project.yaml --format html` (writes build/matrix.html)
+- `jerry inspect expected --project example/project.yaml` (writes build/expected.txt)
 - Use post-processing transforms in `postprocess.yaml` to keep coverage high
 (history/horizontal fills, constants, or drop rules) before serving vectors.
+Add `payload: targets` inside a transform when you need to mutate label vectors.
 
 Train/Val/Test splits (deterministic)
 - Configure split mechanics once in your project file:
-- Edit `config/datasets/default/project.yaml` and set:
 ```yaml
 globals:
+  group_by: 10m # dataset cadence; reused as contract cadence
   split:
     mode: hash # hash|time
     key: group # group or feature:<id> (entity-stable)
     seed: 42 # deterministic hash seed
     ratios: {train: 0.8, val: 0.1, test: 0.1}
 ```
-- Select the active slice via `config/datasets/default/run.yaml` (or `--keep`):
+- Edit `example/project.yaml` and set:
+- Select the active slice via `example/tasks/serve.<name>.yaml` (or `--keep`):
 ```yaml
-version: 1
+kind: serve
+name: train # defaults to filename stem when omitted
 keep: train # any label defined in globals.split; null disables filtering
-output: print # serve output default (print|stream|/path)
+output:
+  transport: stdout # stdout | fs
+  format: print # print | json-lines | json | csv | pickle
 limit: 100 # cap vectors per serve run (null = unlimited)
-include_targets: false # include dataset.targets when serving
 throttle_ms: null # sleep between vectors (milliseconds)
+# visuals: AUTO # AUTO | TQDM | RICH | OFF
+# progress: AUTO # AUTO | SPINNER | BARS | OFF
 ```
-- If you prefer separate configs per split, point `project.paths.run` at a folder (e.g., `config/datasets/default/runs/`),
-drop `train.yaml`, `val.yaml`, etc. inside, and the CLI will run each file in order unless you pass `--run <name>`.
-- Serve examples (change run.yaml or pass `--keep val|test`):
-- `jerry serve -p config/datasets/default/project.yaml -o stream > train.jsonl`
-- `jerry serve -p config/datasets/default/project.yaml --keep val -o stream > val.jsonl`
+- Add additional `kind: serve` files (e.g., `serve.val.yaml`, `serve.test.yaml`) and the CLI will run each enabled file in order unless you pass `--run <name>`.
+- Serve examples (change the serve task or pass `--keep val|test`):
+- `jerry serve -p example/project.yaml --out-transport stdout --out-format json-lines > train.jsonl`
+- `jerry serve -p example/project.yaml --keep val --out-transport stdout --out-format json-lines > val.jsonl`
+- Add `--visuals rich --progress bars` for a richer interactive UI; defaults to `AUTO`.
+- For shared workspace defaults (visual renderer, progress display, build mode), drop a `jerry.yaml` next to your workspace root and set `shared.visuals`, `shared.progress`, etc. CLI commands walk up from the current directory to find it.
 - The split is applied at the end (after postprocess transforms), and assignment
 is deterministic (hash-based) with a fixed seed; no overlap across runs.
 
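Aside (not part of the diff): the fs/http loader arguments listed above slot into a source YAML roughly as sketched below. The `{{DEFAULT_IO_LOADER_EP}}` placeholder is the entry point named in the README; the provider/dataset id and the path, glob, and delimiter values are invented for illustration, and the scaffolded file also carries a `parser` section not shown here.

```yaml
# Hypothetical sources/acme.prices.yaml (placeholder values, not shipped in the template)
id: acme.prices                # <provider>.<dataset>

loader:
  entrypoint: "{{DEFAULT_IO_LOADER_EP}}"
  args:
    transport: fs              # fs | http
    format: csv                # csv | json | json-lines | pickle
    path: data/prices          # fs args: path, glob, encoding, plus delimiter for csv
    glob: "*.csv"
    encoding: utf-8
    delimiter: ","
```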
@@ -84,13 +94,47 @@ Key selection guidance
 - `key: feature:<id>` hashes a specific feature value, e.g., `feature:entity_id` or `feature:station_id`, ensuring all vectors for that entity land in the same split (recommended to avoid leakage).
 
 Postprocess expected IDs
-- Build once with `jerry build --project config/datasets/default/project.yaml` (or run `jerry inspect expected …`) to materialize `<paths.artifacts>/expected.txt`.
+- Build once with `jerry build --project config/project.yaml` (or run `jerry inspect expected …`) to materialize `<paths.artifacts>/expected.txt`.
 - Bootstrap registers the artifact; postprocess transforms read it automatically. Per-transform `expected:` overrides are no longer required or supported — the build output is the single source of truth.
 
 Scaler statistics
-- Enable the scaler task in `build.yaml` (default `enabled: true`) to compute mean/std per feature using the configured training split.
+- Jerry computes scaler stats automatically. If you need custom paths or settings, add `tasks/scaler.yaml` and override the defaults.
 - The build writes `<paths.artifacts>/scaler.pkl`; runtime scaling requires this artifact. If it is missing, scaling transforms raise a runtime error.
 
 Tips
 - Keep parsers thin — mirror source schema and return DTOs; use the identity parser only if your loader already emits domain records.
 - Prefer small, composable configs over monolithic ones: one YAML per source is easier to review and reuse.
+
+Composed streams (engineered domains)
+- Declare engineered streams that depend on other canonical streams directly in contracts. The runtime builds each input to stage 4, stream‑aligns by partition+timestamp, runs your composer, and emits fresh records for the derived stream.
+
+```yaml
+# example/contracts/air_density.processed.yaml
+kind: composed
+id: air_density.processed
+inputs:
+  - p=pressure.processed
+  - t=temp_dry.processed
+partition_by: station_id
+sort_batch_size: 20000
+
+mapper:
+  entrypoint: {{PACKAGE_NAME}}.mappers.air_density:mapper
+  args:
+    driver: p # optional; defaults to first input alias
+
+# Optional post‑compose policies (same as any stream):
+# record: [...]
+# stream: [...]
+# debug: [...]
+```
+
+Then reference the composed stream in your dataset:
+
+```yaml
+# example/dataset.yaml
+group_by: ${group_by}
+features:
+  - id: air_density
+    record_stream: air_density.processed
+
@@ -0,0 +1,31 @@
+kind: ingest
+source: synthetic.ticks
+id: time.ticks.hour_sin # format: domain.dataset.(variant)
+
+# Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+cadence: ${group_by}
+
+mapper:
+  entrypoint: encode_time
+  args: { mode: hour_sin }
+
+# partition_by: field you want to partition
+
+record:
+  - filter: { operator: ge, field: time, comparand: "${start_time}" }
+  - filter: { operator: le, field: time, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" }
+  # - lag: { lag: "${cadence}" }
+
+stream:
+  - dedupe: {}
+  - granularity: { mode: first }
+  - ensure_cadence: { cadence: "${cadence}" }
+  # Optional: fill gaps before downstream transforms:
+  # - fill: { statistic: median, window: 24, min_samples: 4 }
+
+debug:
+  - lint: { mode: error, tick: "${cadence}" }
+
+# sort_batch_size: 100000
+
@@ -0,0 +1,30 @@
+kind: ingest
+source: synthetic.ticks # raw source alias (see example/sources)
+id: time.ticks.linear # canonical stream id (format: domain.dataset.(variant))
+
+# Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+cadence: ${group_by}
+
+mapper: # normalize/reshape DTO -> TemporalRecord
+  entrypoint: encode_time
+  args: { mode: linear }
+# partition_by: station_id # optional: add partition suffixes to feature ids
+
+record: # record-level transforms
+  - filter: { operator: ge, field: time, comparand: "${start_time}" }
+  - filter: { operator: le, field: time, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" } # snap timestamps to cadence boundaries
+  # - lag: { lag: "${cadence}" } # optional: shift timestamps backwards
+
+stream: # per-feature stream transforms (input sorted by id,time)
+  - dedupe: {} # drop exact-duplicate records per tick
+  - granularity: { mode: first } # aggregate duplicates within a tick
+  - ensure_cadence: { cadence: "${cadence}" } # insert missing ticks (value=None)
+  # Consider adding a fill transform to impute None values before sequence/windowing:
+  # - fill: { statistic: median, window: 6, min_samples: 1 }
+
+debug: # optional validation-only transforms
+  - lint: { mode: error, tick: "${cadence}" } # strict cadence/order; value issues handled by downstream transforms
+
+# sort_batch_size: 100000 # in-memory chunk size used by internal sorting
+
@@ -0,0 +1,18 @@
+group_by: ${group_by}
+
+features:
+  - id: time_linear
+    record_stream: time.ticks.linear
+    scale: true # optionally add with_mean/with_std overrides
+    # Sliding window over the regularized stream; cadence is enforced in the contract.
+    sequence: { size: 6, stride: 1 }
+
+  - id: time_hour_sin
+    record_stream: time.ticks.hour_sin
+
+  # - id: third_feature
+  # record_stream: anotherstream
+# targets:
+# - id: some_target
+# record_stream: time.ticks.linear
+
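Aside (not part of the diff): the commented block above indicates where targets are declared; enabled, it would look roughly like the sketch below. The target id and its stream reuse the placeholder names from the comments and are illustrative only.

```yaml
# Illustrative only: the commented-out targets block above, enabled.
features:
  - id: time_linear
    record_stream: time.ticks.linear

targets:
  - id: some_target
    record_stream: time.ticks.linear
```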
@@ -0,0 +1,29 @@
+#### example combination of postprocessing steps ######
+#### making sure data is complete after these combinations ######
+- drop: # example of dropping sparse partitions/vertical-axis for targets
+    axis: vertical
+    payload: targets
+    threshold: 0.9
+
+- drop: # example of dropping sparse partitions for features
+    axis: vertical
+    payload: features
+    threshold: 0.9
+
+- drop: # dropping vectors/horizontal-axis that has features which none
+    axis: horizontal
+    payload: features
+    threshold: 1
+
+- drop:
+    axis: horizontal
+    payload: targets
+    threshold: 1
+######
+# - fill:
+# statistic: median
+# window: 48
+# min_samples: 6
+# - replace:
+# payload: targets
+# value: 0.0
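Aside (not part of the diff): the commented `fill`/`replace` entries above can be enabled to impute gaps before the drop rules run; per the README, `payload: targets` scopes a transform to label vectors. The ordering and values below are illustrative, not the shipped defaults.

```yaml
# Illustrative only: impute first, then drop whatever is still incomplete.
- fill:
    statistic: median
    window: 48
    min_samples: 6
- replace:
    payload: targets   # mutate label vectors only
    value: 0.0
- drop:
    axis: horizontal
    payload: features
    threshold: 1
```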
@@ -1,18 +1,21 @@
 version: 1
-name: default
+name: example
 paths:
-  streams: ../../contracts
-  sources: ../../sources
+  streams: ./contracts
+  sources: ./sources
   dataset: dataset.yaml
   postprocess: postprocess.yaml
-  artifacts: ../../build/datasets/${project_name}
-  build: build.yaml
-  run: runs
+  artifacts: ../artifacts/${project_name}/v${version}
+  tasks: ./tasks
 globals:
+  # Globals to use in your .yaml files via ${var_name}.
+  # Primary dataset cadence; referenced from dataset.yaml (group_by)
+  # and contracts via ${group_by}.
+  group_by: 1h
   start_time: 2021-01-01T00:00:00Z
-  end_time: 2023-01-03T23:00:00Z
+  end_time: 2021-01-02T00:00:00Z
   # Configure deterministic dataset split here (applied at serve time, after postprocess).
-  # Adjust `ratios` as needed; the active split is selected via run.yaml or CLI.
+  # Adjust `ratios` as needed; the active split is selected via serve tasks or CLI.
   split:
     mode: hash # hash | time (time uses boundaries/labels)
     key: group # group | feature:<id> (entity-stable split)
@@ -0,0 +1,12 @@
+id: synthetic.ticks
+
+parser:
+  entrypoint: "core.synthetic.ticks"
+  args: {}
+loader:
+  entrypoint: "core.synthetic.ticks"
+  args:
+    start: "${start_time}"
+    end: "${end_time}"
+    frequency: "${group_by}"
+
@@ -0,0 +1,3 @@
+kind: metadata
+# window_mode: intersection # union|intersection|strict|relaxed (default: intersection)
+
@@ -0,0 +1,9 @@
+kind: scaler
+
+# Output path is relative to project.paths.artifacts; defaults to "scaler.pkl".
+# output: scaler.pkl
+
+# Split label to use when fitting scaler statistics.
+# Must match a label from globals.split.ratios.
+split_label: train
+
@@ -0,0 +1,4 @@
+kind: serve
+name: test
+keep: test
+
@@ -0,0 +1,28 @@
+kind: serve
+
+# Optional identifier for this serve task; defaults to filename stem.
+name: train
+
+# Active split label to serve; must match a label from globals.split.ratios.
+# Set to null to disable split filtering.
+keep: train
+#output:
+# transport: stdout | fs
+# format: print | json-lines | json | csv | pickle
+# When using fs transport, set a directory (and optionally filename) for outputs:
+# directory: artifacts/serve
+# filename: vectors.train
+
+# Default max number of vectors to emit (null = unlimited).
+# limit: 5
+# Optional pipeline stage preview (0-7); null lets the CLI decide.
+# stage: 7
+
+# Optional pacing between emitted vectors (milliseconds).
+# throttle_ms: null
+
+# Visuals/logging knobs (inherit CLI or jerry.yaml defaults when omitted):
+# visuals: AUTO # AUTO | TQDM | RICH | OFF
+# progress: AUTO # AUTO | SPINNER | BARS | OFF
+# log_level: INFO # CRITICAL | ERROR | WARNING | INFO | DEBUG
+
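Aside (not part of the diff): uncommenting the `output` block above turns this serve task into a file writer. The sketch below reuses the commented directory and filename placeholders and pairs the fs transport with the json-lines format listed in the comments; it is illustrative, not a shipped task.

```yaml
# Illustrative only: a serve task that writes JSON Lines to disk instead of stdout.
kind: serve
name: train
keep: train
output:
  transport: fs
  format: json-lines
  directory: artifacts/serve
  filename: vectors.train
```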
@@ -0,0 +1,4 @@
+kind: serve
+name: val
+keep: val
+
@@ -0,0 +1,28 @@
+# Workspace defaults. Move this file to the root of your workspace if you serve
+# configs outside the plugin repo; the CLI searches upward from cwd for jerry.yaml.
+
+plugin_root: . # relative path to the plugin repo root
+
+datasets:
+  example: example/project.yaml
+  your-second-example-dataset: your-dataset/project.yaml
+
+default_dataset: example
+
+shared:
+  visuals: AUTO # AUTO | TQDM | RICH | OFF
+  progress: BARS # AUTO | SPINNER | BARS | OFF
+  log_level: INFO
+
+serve:
+  # log_level: INFO
+  limit: null
+  stage: null
+  output:
+    transport: stdout
+    format: print # set to fs + directory for file outputs
+    # directory: artifacts/serve
+
+build:
+  # log_level: INFO
+  mode: AUTO # AUTO | FORCE | OFF
@@ -0,0 +1,31 @@
+kind: ingest
+source: synthetic.ticks
+id: time.ticks.hour_sin # format: domain.dataset.(variant)
+
+# Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+cadence: ${group_by}
+
+mapper:
+  entrypoint: encode_time
+  args: { mode: hour_sin }
+
+# partition_by: field you want to partition
+
+record:
+  - filter: { operator: ge, field: time, comparand: "${start_time}" }
+  - filter: { operator: le, field: time, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" }
+  # - lag: { lag: "${cadence}" }
+
+stream:
+  - dedupe: {}
+  - granularity: { mode: first }
+  - ensure_cadence: { cadence: "${cadence}" }
+  # Optional: fill gaps before downstream transforms:
+  # - fill: { statistic: median, window: 24, min_samples: 4 }
+
+debug:
+  - lint: { mode: error, tick: "${cadence}" }
+
+# sort_batch_size: 100000
+
@@ -0,0 +1,30 @@
+kind: ingest
+source: synthetic.ticks # raw source alias (see example/sources)
+id: time.ticks.linear # canonical stream id (format: domain.dataset.(variant))
+
+# Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+cadence: ${group_by}
+
+mapper: # normalize/reshape DTO -> TemporalRecord
+  entrypoint: encode_time
+  args: { mode: linear }
+# partition_by: station_id # optional: add partition suffixes to feature ids
+
+record: # record-level transforms
+  - filter: { operator: ge, field: time, comparand: "${start_time}" }
+  - filter: { operator: le, field: time, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" } # snap timestamps to cadence boundaries
+  # - lag: { lag: "${cadence}" } # optional: shift timestamps backwards
+
+stream: # per-feature stream transforms (input sorted by id,time)
+  - dedupe: {} # drop exact-duplicate records per tick
+  - granularity: { mode: first } # aggregate duplicates within a tick
+  - ensure_cadence: { cadence: "${cadence}" } # insert missing ticks (value=None)
+  # Consider adding a fill transform to impute None values before sequence/windowing:
+  # - fill: { statistic: median, window: 6, min_samples: 1 }
+
+debug: # optional validation-only transforms
+  - lint: { mode: error, tick: "${cadence}" } # strict cadence/order; value issues handled by downstream transforms
+
+# sort_batch_size: 100000 # in-memory chunk size used by internal sorting
+
@@ -0,0 +1,18 @@
+group_by: ${group_by}
+
+features:
+  - id: time_linear
+    record_stream: time.ticks.linear
+    scale: true # optionally add with_mean/with_std overrides
+    # Sliding window over the regularized stream; cadence is enforced in the contract.
+    sequence: { size: 6, stride: 1 }
+
+  - id: time_hour_sin
+    record_stream: time.ticks.hour_sin
+
+  # - id: third_feature
+  # record_stream: anotherstream
+# targets:
+# - id: some_target
+# record_stream: time.ticks.linear
+
@@ -0,0 +1,29 @@
+#### example combination of postprocessing steps ######
+#### making sure data is complete after these combinations ######
+- drop: # example of dropping sparse partitions/vertical-axis for targets
+    axis: vertical
+    payload: targets
+    threshold: 0.9
+
+- drop: # example of dropping sparse partitions for features
+    axis: vertical
+    payload: features
+    threshold: 0.9
+
+- drop: # dropping vectors/horizontal-axis that has features which none
+    axis: horizontal
+    payload: features
+    threshold: 1
+
+- drop:
+    axis: horizontal
+    payload: targets
+    threshold: 1
+######
+# - fill:
+# statistic: median
+# window: 48
+# min_samples: 6
+# - replace:
+# payload: targets
+# value: 0.0
@@ -0,0 +1,22 @@
+version: 1
+name: <your-dataset>
+paths:
+  streams: ./contracts
+  sources: ./sources
+  dataset: dataset.yaml
+  postprocess: postprocess.yaml
+  artifacts: ../artifacts/${project_name}/v${version}
+  tasks: ./tasks
+globals:
+  # Primary dataset cadence; referenced from dataset.yaml (group_by)
+  # and contracts via ${group_by}.
+  group_by: <your-bucket-cadence>
+  start_time: null #2021-01-01T00:00:00Z
+  end_time: null #2021-01-02T00:00:00Z
+  # Configure deterministic dataset split here (applied at serve time, after postprocess).
+  # Adjust `ratios` as needed; the active split is selected via serve tasks or CLI.
+  split:
+    mode: hash # hash | time (time uses boundaries/labels)
+    key: group # group | feature:<id> (entity-stable split)
+    seed: 42 # deterministic hash seed
+    ratios: { train: 0.8, val: 0.1, test: 0.1 }
@@ -0,0 +1,12 @@
+id: synthetic.ticks
+
+parser:
+  entrypoint: "core.synthetic.ticks"
+  args: {}
+loader:
+  entrypoint: "core.synthetic.ticks"
+  args:
+    start: "${start_time}"
+    end: "${end_time}"
+    frequency: "${group_by}"
+
@@ -0,0 +1,3 @@
+kind: metadata
+# window_mode: intersection # union|intersection|strict|relaxed (default: intersection)
+
@@ -0,0 +1,9 @@
+kind: scaler
+
+# Output path is relative to project.paths.artifacts; defaults to "scaler.pkl".
+# output: scaler.pkl
+
+# Split label to use when fitting scaler statistics.
+# Must match a label from globals.split.ratios.
+split_label: train
+
@@ -0,0 +1,4 @@
+kind: serve
+name: test
+keep: test
+
@@ -0,0 +1,28 @@
+kind: serve
+
+# Optional identifier for this serve task; defaults to filename stem.
+name: train
+
+# Active split label to serve; must match a label from globals.split.ratios.
+# Set to null to disable split filtering.
+keep: train
+#output:
+# transport: stdout | fs
+# format: print | json-lines | json | csv | pickle
+# When using fs transport, set a directory (and optionally filename) for outputs:
+# directory: artifacts/serve
+# filename: vectors.train
+
+# Default max number of vectors to emit (null = unlimited).
+# limit: 5
+# Optional pipeline stage preview (0-7); null lets the CLI decide.
+# stage: 7
+
+# Optional pacing between emitted vectors (milliseconds).
+# throttle_ms: null
+
+# Visuals/logging knobs (inherit CLI or jerry.yaml defaults when omitted):
+# visuals: AUTO # AUTO | TQDM | RICH | OFF
+# progress: AUTO # AUTO | SPINNER | BARS | OFF
+# log_level: INFO # CRITICAL | ERROR | WARNING | INFO | DEBUG
+
@@ -0,0 +1,4 @@
+kind: serve
+name: val
+keep: val
+
@@ -1,4 +1,6 @@
 from dataclasses import dataclass
+from datetime import datetime
+
 
 @dataclass
 class {{CLASS_NAME}}:
@@ -1,16 +1,17 @@
-from typing import Iterator, Any
-from {{PACKAGE_NAME}}.sources.{{ORIGIN}}.{{DATASET}}.dto import {{OriginDTO}}
+from typing import Any, Iterator
+
 from {{PACKAGE_NAME}}.domains.{{TARGET_DOMAIN}}.model import {{DomainRecord}}
+from {{PACKAGE_NAME}}.sources.{{ORIGIN}}.{{DATASET}}.dto import {{OriginDTO}}
 
 
 def {{FUNCTION_NAME}}(
     stream: Iterator[{{OriginDTO}}],
-    *args: Any,
+    **params: Any,
 ) -> Iterator[{{DomainRecord}}]:
     """Map raw {{ORIGIN}} DTOs to domain-level {{TARGET_DOMAIN}} records.
 
     - Required on domain record: time and value.
-    - Additional options may be passed via kwargs (e.g., mode="...").
+    - Additional options may be passed via kwargs (e.g., variant="..." or mode="...").
     """
     for dto in stream:
         # TODO: construct {{DomainRecord}} from dto fields