jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +5 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +54 -10
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +76 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.1.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.1.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
@@ -4,23 +4,28 @@ Minimal plugin skeleton for the Jerry Thomas (datapipeline) framework.
 
 Quick start
 - Initialize a plugin (already done if you’re reading this here):
- - `jerry plugin init --name {{DIST_NAME}}`
+ - `jerry plugin init {{DIST_NAME}}`
 - Add a source via CLI (transport-specific placeholders are scaffolded):
- - File data: `jerry source add -p <provider> -d <dataset> -t fs -f <csv|json|json-lines>`
- - URL data: `jerry source add -p <provider> -d <dataset> -t url -f <json|json-lines|csv>`
+ - File data: `jerry source add <provider> <dataset> -t fs -f <csv|json|json-lines|pickle>`
+ - HTTP data: `jerry source add <provider>.<dataset> -t http -f <json|json-lines|csv>`
 - Synthetic: `jerry source add -p <provider> -d <dataset> -t synthetic`
 - Edit the generated `config/sources/*.yaml` to fill in the `path`, delimiter, etc.
+ - `jerry.yaml` is placed in your workspace root (alongside the plugin folder) so
+   you can run CLI commands from there; `plugin_root` points back to this plugin.
 - Reinstall after EP changes (pyproject.toml) and restart Python processes:
 - Core: `cd lib/datapipeline && python -m pip install -e .`
 - This plugin: `python -m pip install -e .`
 
 Folder layout
- - `config/`
- - `sources/*.yaml` — raw source definitions (one file per source)
+ - `example/`
+ - `project.yaml` — project root (paths, globals, cadence/split)
+ - `dataset.yaml` — feature/target declarations (uses `${group_by}` from globals)
+ - `postprocess.yaml` — postprocess transforms
 - `contracts/*.yaml` — canonical stream definitions
- - `datasets/<name>/build.yaml` — build configuration (partitioned ids today, more artifacts later)
+ - `sources/*.yaml` — raw source definitions (one file per source)
+ - `tasks/*.yaml` — task specs (schema/scaler/metadata/serve)
 - Every dataset `project.yaml` declares a `name`; reference it via `${project_name}`
-   inside other config files (e.g., `paths.artifacts: ../../build/datasets/${project_name}`) to
+   inside other config files (e.g., `paths.artifacts: ../artifacts/${project_name}`) to
   avoid hard-coding per-dataset directories.
 - `src/{{PACKAGE_NAME}}/`
 - `sources/<provider>/<dataset>/dto.py` — DTO model for the source
@@ -30,52 +35,59 @@ Folder layout
 - `mappers/*.py` — map DTOs → domain records
 
 How loaders work
- - For fs/url, sources use the generic loader entry point:
- - `loader.entrypoint: "{{COMPOSED_LOADER_EP}}"`
+ - For fs/http, sources use the generic loader entry point:
+ - `loader.entrypoint: "{{DEFAULT_IO_LOADER_EP}}"`
 - `loader.args` include `transport`, `format`, and source-specific args (placeholders are provided):
 - fs: `path`, `glob`, `encoding`, plus `delimiter` for csv
- - url: `url`, `headers`, `encoding`, optional `count_by_fetch`
+ - http: `url`, `headers`, `params`, `encoding`, optional `count_by_fetch`
 - Synthetic sources generate data in-process and keep a small loader stub.
 
 Run data flows
- - Build artifacts once: `jerry build --project config/datasets/default/project.yaml`
- - Preview records (stage 1): `jerry serve --project config/datasets/default/project.yaml --stage 1 --limit 100`
- - Preview features (stage 3): `jerry serve --project config/datasets/default/project.yaml --stage 3 --limit 100`
- - Preview vectors (stage 7): `jerry serve --project config/datasets/default/project.yaml --stage 7 --limit 100`
+ - Build artifacts once: `jerry build --project example/project.yaml`
+ - Preview records (stage 1): `jerry serve --project example/project.yaml --stage 1 --limit 100`
+ - Preview features (stage 3): `jerry serve --project example/project.yaml --stage 3 --limit 100`
+ - Preview vectors (stage 7): `jerry serve --project example/project.yaml --stage 7 --limit 100`
 
 Analyze vectors
- - `jerry inspect report --project config/datasets/default/project.yaml` (console only)
- - `jerry inspect coverage --project config/datasets/default/project.yaml` (writes build/coverage.json)
- - `jerry inspect matrix --project config/datasets/default/project.yaml --format html` (writes build/matrix.html)
- - `jerry inspect partitions --project config/datasets/default/project.yaml` (writes build/partitions.json)
+ - `jerry inspect report --project example/project.yaml` (console only)
+ - `jerry inspect partitions --project example/project.yaml` (writes build/partitions.json)
+ - `jerry inspect matrix --project example/project.yaml --format html` (writes build/matrix.html)
+ - `jerry inspect expected --project example/project.yaml` (writes build/expected.txt)
 - Use post-processing transforms in `postprocess.yaml` to keep coverage high
   (history/horizontal fills, constants, or drop rules) before serving vectors.
+   Add `payload: targets` inside a transform when you need to mutate label vectors.
 
 Train/Val/Test splits (deterministic)
 - Configure split mechanics once in your project file:
- - Edit `config/datasets/default/project.yaml` and set:
+ - Edit `example/project.yaml` and set:
   ```yaml
   globals:
+    group_by: 10m # dataset cadence; reused as contract cadence
     split:
       mode: hash # hash|time
       key: group # group or feature:<id> (entity-stable)
       seed: 42 # deterministic hash seed
       ratios: {train: 0.8, val: 0.1, test: 0.1}
   ```
- - Select the active slice via `config/datasets/default/run.yaml` (or `--keep`):
+ - Select the active slice via `example/tasks/serve.<name>.yaml` (or `--keep`):
   ```yaml
-  version: 1
+  kind: serve
+  name: train # defaults to filename stem when omitted
   keep: train # any label defined in globals.split; null disables filtering
-  output: print # serve output default (print|stream|/path)
+  output:
+    transport: stdout # stdout | fs
+    format: print # print | json-lines | json | csv | pickle
   limit: 100 # cap vectors per serve run (null = unlimited)
-  include_targets: false # include dataset.targets when serving
   throttle_ms: null # sleep between vectors (milliseconds)
+  # visuals: AUTO # AUTO | TQDM | RICH | OFF
+  # progress: AUTO # AUTO | SPINNER | BARS | OFF
   ```
- - If you prefer separate configs per split, point `project.paths.run` at a folder (e.g., `config/datasets/default/runs/`),
-   drop `train.yaml`, `val.yaml`, etc. inside, and the CLI will run each file in order unless you pass `--run <name>`.
- - Serve examples (change run.yaml or pass `--keep val|test`):
- - `jerry serve -p config/datasets/default/project.yaml -o stream > train.jsonl`
- - `jerry serve -p config/datasets/default/project.yaml --keep val -o stream > val.jsonl`
+ - Add additional `kind: serve` files (e.g., `serve.val.yaml`, `serve.test.yaml`) and the CLI will run each enabled file in order unless you pass `--run <name>`.
+ - Serve examples (change the serve task or pass `--keep val|test`):
+ - `jerry serve -p example/project.yaml --out-transport stdout --out-format json-lines > train.jsonl`
+ - `jerry serve -p example/project.yaml --keep val --out-transport stdout --out-format json-lines > val.jsonl`
+ - Add `--visuals rich --progress bars` for a richer interactive UI; defaults to `AUTO`.
+ - For shared workspace defaults (visual renderer, progress display, build mode), drop a `jerry.yaml` next to your workspace root and set `shared.visuals`, `shared.progress`, etc. CLI commands walk up from the current directory to find it.
 - The split is applied at the end (after postprocess transforms), and assignment
   is deterministic (hash-based) with a fixed seed; no overlap across runs.
 
@@ -84,13 +96,47 @@ Key selection guidance
 - `key: feature:<id>` hashes a specific feature value, e.g., `feature:entity_id` or `feature:station_id`, ensuring all vectors for that entity land in the same split (recommended to avoid leakage).
 
 Postprocess expected IDs
- - Build once with `jerry build --project config/datasets/default/project.yaml` (or run `jerry inspect expected …`) to materialize `<paths.artifacts>/expected.txt`.
+ - Build once with `jerry build --project config/project.yaml` (or run `jerry inspect expected …`) to materialize `<paths.artifacts>/expected.txt`.
 - Bootstrap registers the artifact; postprocess transforms read it automatically. Per-transform `expected:` overrides are no longer required or supported — the build output is the single source of truth.
 
 Scaler statistics
- - Enable the scaler task in `build.yaml` (default `enabled: true`) to compute mean/std per feature using the configured training split.
+ - Jerry computes scaler stats automatically. If you need custom paths or settings, add `tasks/scaler.yaml` and override the defaults.
 - The build writes `<paths.artifacts>/scaler.pkl`; runtime scaling requires this artifact. If it is missing, scaling transforms raise a runtime error.
 
 Tips
 - Keep parsers thin — mirror source schema and return DTOs; use the identity parser only if your loader already emits domain records.
 - Prefer small, composable configs over monolithic ones: one YAML per source is easier to review and reuse.
+
+ Composed streams (engineered domains)
+ - Declare engineered streams that depend on other canonical streams directly in contracts. The runtime builds each input to stage 4, stream‑aligns by partition+timestamp, runs your composer, and emits fresh records for the derived stream.
+
+ ```yaml
+ # example/contracts/air_density.processed.yaml
+ kind: composed
+ id: air_density.processed
+ inputs:
+   - p=pressure.processed
+   - t=temp_dry.processed
+ partition_by: station_id
+ sort_batch_size: 20000
+
+ mapper:
+   entrypoint: {{PACKAGE_NAME}}.mappers.air_density:mapper
+   args:
+     driver: p # optional; defaults to first input alias
+
+ # Optional post‑compose policies (same as any stream):
+ # record: [...]
+ # stream: [...]
+ # debug: [...]
+ ```
+
+ Then reference the composed stream in your dataset:
+
+ ```yaml
+ # example/dataset.yaml
+ group_by: ${group_by}
+ features:
+   - id: air_density
+     record_stream: air_density.processed
+ ```
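To make the "How loaders work" notes in the README diff above concrete, here is a rough sketch of what a file-backed source definition could look like after `jerry source add <provider> <dataset> -t fs -f csv`. The provider/dataset names, parser entry point, and data paths are illustrative assumptions and not part of the released templates; only the `{{DEFAULT_IO_LOADER_EP}}` placeholder and the documented `transport`/`format`/`path`/`glob`/`encoding`/`delimiter` args come from the README itself.

```yaml
# Hypothetical example/sources/myprovider.prices.yaml (names and paths are assumptions)
id: myprovider.prices

parser:
  entrypoint: "myprovider.prices"      # assumed parser entry point registered by the plugin
  args: {}
loader:
  entrypoint: "{{DEFAULT_IO_LOADER_EP}}"
  args:
    transport: fs
    format: csv
    path: data/prices                  # assumed local data directory
    glob: "*.csv"
    encoding: utf-8
    delimiter: ","
```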
@@ -0,0 +1,31 @@
+ kind: ingest
+ source: synthetic.ticks
+ id: time.ticks.hour_sin # format: domain.dataset.(variant)
+
+ # Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+ cadence: ${group_by}
+
+ mapper:
+   entrypoint: encode_time
+   args: { mode: hour_sin }
+
+ # partition_by: field you want to partition
+
+ record:
+   - filter: { operator: ge, field: time, comparand: "${start_time}" }
+   - filter: { operator: le, field: time, comparand: "${end_time}" }
+   - floor_time: { cadence: "${cadence}" }
+   # - lag: { lag: "${cadence}" }
+
+ stream:
+   - dedupe: {}
+   - granularity: { mode: first }
+   - ensure_cadence: { cadence: "${cadence}" }
+   # Optional: fill gaps before downstream transforms:
+   # - fill: { statistic: median, window: 24, min_samples: 4 }
+
+ debug:
+   - lint: { mode: error, tick: "${cadence}" }
+
+ # sort_batch_size: 100000
+
@@ -0,0 +1,30 @@
+ kind: ingest
+ source: synthetic.ticks # raw source alias (see example/sources)
+ id: time.ticks.linear # canonical stream id (format: domain.dataset.(variant))
+
+ # Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+ cadence: ${group_by}
+
+ mapper: # normalize/reshape DTO -> TemporalRecord
+   entrypoint: encode_time
+   args: { mode: linear }
+ # partition_by: station_id # optional: add partition suffixes to feature ids
+
+ record: # record-level transforms
+   - filter: { operator: ge, field: time, comparand: "${start_time}" }
+   - filter: { operator: le, field: time, comparand: "${end_time}" }
+   - floor_time: { cadence: "${cadence}" } # snap timestamps to cadence boundaries
+   # - lag: { lag: "${cadence}" } # optional: shift timestamps backwards
+
+ stream: # per-feature stream transforms (input sorted by id,time)
+   - dedupe: {} # drop exact-duplicate records per tick
+   - granularity: { mode: first } # aggregate duplicates within a tick
+   - ensure_cadence: { cadence: "${cadence}" } # insert missing ticks (value=None)
+   # Consider adding a fill transform to impute None values before sequence/windowing:
+   # - fill: { statistic: median, window: 6, min_samples: 1 }
+
+ debug: # optional validation-only transforms
+   - lint: { mode: error, tick: "${cadence}" } # strict cadence/order; value issues handled by downstream transforms
+
+ # sort_batch_size: 100000 # in-memory chunk size used by internal sorting
+
@@ -0,0 +1,18 @@
+ group_by: ${group_by}
+
+ features:
+   - id: time_linear
+     record_stream: time.ticks.linear
+     scale: true # optionally add with_mean/with_std overrides
+     # Sliding window over the regularized stream; cadence is enforced in the contract.
+     sequence: { size: 6, stride: 1 }
+
+   - id: time_hour_sin
+     record_stream: time.ticks.hour_sin
+
+   # - id: third_feature
+   #   record_stream: anotherstream
+ # targets:
+ #   - id: some_target
+ #     record_stream: time.ticks.linear
+
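For orientation, a dataset that also declares a target could look roughly like the sketch below, obtained by enabling the commented `targets:` block in the template above. The feature and target ids are placeholders; the `with_mean`/`with_std` overrides are only hinted at in the template comment, so treat any use of them as an assumption.

```yaml
# Hypothetical variant of example/dataset.yaml with a target enabled (ids are placeholders)
group_by: ${group_by}

features:
  - id: time_linear
    record_stream: time.ticks.linear
    scale: true
    sequence: { size: 6, stride: 1 }

targets:
  - id: next_tick                    # assumed target id
    record_stream: time.ticks.linear
```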
@@ -0,0 +1,29 @@
+ #### example combination of postprocessing steps ######
+ #### making sure data is complete after these combinations ######
+ - drop: # example of dropping sparse partitions (vertical axis) for targets
+     axis: vertical
+     payload: targets
+     threshold: 0.9
+
+ - drop: # example of dropping sparse partitions for features
+     axis: vertical
+     payload: features
+     threshold: 0.9
+
+ - drop: # drop vectors (horizontal axis) that have features which are None
+     axis: horizontal
+     payload: features
+     threshold: 1
+
+ - drop:
+     axis: horizontal
+     payload: targets
+     threshold: 1
+ ######
+ # - fill:
+ #     statistic: median
+ #     window: 48
+ #     min_samples: 6
+ # - replace:
+ #     payload: targets
+ #     value: 0.0
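A minimal sketch of what the commented fill/replace steps look like when enabled, following the keys shown in the template above and the README note about `payload: targets`; the statistic, window, and value settings are placeholder values, not recommendations.

```yaml
# Hypothetical postprocess.yaml fragment with the commented steps enabled (values are placeholders)
- fill:
    statistic: median
    window: 48
    min_samples: 6

- replace:
    payload: targets
    value: 0.0
```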
@@ -1,18 +1,21 @@
 version: 1
- name: default
+ name: example
 paths:
-   streams: ../../contracts
-   sources: ../../sources
+   streams: ./contracts
+   sources: ./sources
   dataset: dataset.yaml
   postprocess: postprocess.yaml
-   artifacts: ../../build/datasets/${project_name}
-   build: build.yaml
-   run: runs
+   artifacts: ../artifacts/${project_name}/v${version}
+   tasks: ./tasks
 globals:
+   # Globals to use in your .yaml files via ${var_name}.
+   # Primary dataset cadence; referenced from dataset.yaml (group_by)
+   # and contracts via ${group_by}.
+   group_by: 1h
   start_time: 2021-01-01T00:00:00Z
-   end_time: 2023-01-03T23:00:00Z
+   end_time: 2021-01-02T00:00:00Z
   # Configure deterministic dataset split here (applied at serve time, after postprocess).
-   # Adjust `ratios` as needed; the active split is selected via run.yaml or CLI.
+   # Adjust `ratios` as needed; the active split is selected via serve tasks or CLI.
   split:
     mode: hash # hash | time (time uses boundaries/labels)
     key: group # group | feature:<id> (entity-stable split)
@@ -0,0 +1,12 @@
+ id: synthetic.ticks
+
+ parser:
+   entrypoint: "core.synthetic.ticks"
+   args: {}
+ loader:
+   entrypoint: "core.synthetic.ticks"
+   args:
+     start: "${start_time}"
+     end: "${end_time}"
+     frequency: "${group_by}"
+
@@ -0,0 +1,3 @@
+ kind: metadata
+ # window_mode: intersection # union|intersection|strict|relaxed (default: intersection)
+
@@ -0,0 +1,9 @@
+ kind: scaler
+
+ # Output path is relative to project.paths.artifacts; defaults to "scaler.pkl".
+ # output: scaler.pkl
+
+ # Split label to use when fitting scaler statistics.
+ # Must match a label from globals.split.ratios.
+ split_label: train
+
@@ -0,0 +1,4 @@
+ kind: serve
+ name: test
+ keep: test
+
@@ -0,0 +1,28 @@
+ kind: serve
+
+ # Optional identifier for this serve task; defaults to filename stem.
+ name: train
+
+ # Active split label to serve; must match a label from globals.split.ratios.
+ # Set to null to disable split filtering.
+ keep: train
+ # output:
+ #   transport: stdout | fs
+ #   format: print | json-lines | json | csv | pickle
+ # When using fs transport, set a directory (and optionally filename) for outputs:
+ #   directory: artifacts/serve
+ #   filename: vectors.train
+
+ # Default max number of vectors to emit (null = unlimited).
+ # limit: 5
+ # Optional pipeline stage preview (0-7); null lets the CLI decide.
+ # stage: 7
+
+ # Optional pacing between emitted vectors (milliseconds).
+ # throttle_ms: null
+
+ # Visuals/logging knobs (inherit CLI or jerry.yaml defaults when omitted):
+ # visuals: AUTO # AUTO | TQDM | RICH | OFF
+ # progress: AUTO # AUTO | SPINNER | BARS | OFF
+ # log_level: INFO # CRITICAL | ERROR | WARNING | INFO | DEBUG
+
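As a rough sketch of the alternative documented in the comments above, a serve task that writes JSON Lines to disk via the fs transport could look like this. The keys mirror the commented template; the task name, directory, and filename values are assumptions for illustration only.

```yaml
# Hypothetical serve task writing JSON Lines to disk (names and paths are assumptions)
kind: serve
name: train-export
keep: train
output:
  transport: fs
  format: json-lines
  directory: artifacts/serve
  filename: vectors.train
limit: null
```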
@@ -0,0 +1,4 @@
+ kind: serve
+ name: val
+ keep: val
+
@@ -0,0 +1,34 @@
+ # Workspace defaults. The scaffolder copies this to your workspace root (where
+ # you ran `jerry plugin init`). CLI commands walk upward from cwd to find it.
+
+ # Relative path from this workspace file back to the plugin root.
+ plugin_root: . # e.g., "lib/myplugin" if your plugin lives under lib/
+
+ # Dataset aliases for `--dataset`; values may be dirs (auto-append project.yaml).
+ datasets:
+   example: example/project.yaml
+   your-second-example-dataset: your-dataset/project.yaml
+
+ # Default dataset alias when --dataset/--project are omitted.
+ default_dataset: example
+
+ # Shared fallbacks used by all commands (unless overridden).
+ shared:
+   visuals: AUTO # AUTO | TQDM | RICH | OFF
+   progress: BARS # AUTO | SPINNER | BARS | OFF
+   log_level: INFO # Default log level when not set elsewhere
+
+ # Defaults for `jerry serve` (run-time options).
+ serve:
+   # log_level: INFO # Uncomment to force INFO for serve runs
+   limit: null # Cap vectors; null means unlimited
+   stage: null # Preview a specific stage; null runs the full pipeline
+   output:
+     transport: stdout
+     format: print # stdout: print|json-lines|json|csv|pickle
+     # directory: artifacts/serve # Required when transport=fs
+
+ # Defaults for `jerry build` (artifact materialization).
+ build:
+   # log_level: INFO # Uncomment to set build log level
+   mode: AUTO # AUTO | FORCE | OFF
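The `plugin_root` comment above mentions plugins that live under `lib/`. A hedged sketch of that layout follows; whether dataset alias paths resolve relative to the workspace root or the plugin root is not stated in this diff, so the alias path here is an assumption, as are the plugin directory name and the shared overrides.

```yaml
# Hypothetical jerry.yaml for a workspace whose plugin lives under lib/ (paths are assumptions)
plugin_root: lib/myplugin

datasets:
  example: lib/myplugin/example   # value is a dir, so project.yaml is auto-appended

default_dataset: example

shared:
  visuals: RICH
  progress: BARS
```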
@@ -0,0 +1,31 @@
+ kind: ingest
+ source: synthetic.ticks
+ id: time.ticks.hour_sin # format: domain.dataset.(variant)
+
+ # Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+ cadence: ${group_by}
+
+ mapper:
+   entrypoint: encode_time
+   args: { mode: hour_sin }
+
+ # partition_by: field you want to partition
+
+ record:
+   - filter: { operator: ge, field: time, comparand: "${start_time}" }
+   - filter: { operator: le, field: time, comparand: "${end_time}" }
+   - floor_time: { cadence: "${cadence}" }
+   # - lag: { lag: "${cadence}" }
+
+ stream:
+   - dedupe: {}
+   - granularity: { mode: first }
+   - ensure_cadence: { cadence: "${cadence}" }
+   # Optional: fill gaps before downstream transforms:
+   # - fill: { statistic: median, window: 24, min_samples: 4 }
+
+ debug:
+   - lint: { mode: error, tick: "${cadence}" }
+
+ # sort_batch_size: 100000
+
@@ -0,0 +1,30 @@
+ kind: ingest
+ source: synthetic.ticks # raw source alias (see example/sources)
+ id: time.ticks.linear # canonical stream id (format: domain.dataset.(variant))
+
+ # Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+ cadence: ${group_by}
+
+ mapper: # normalize/reshape DTO -> TemporalRecord
+   entrypoint: encode_time
+   args: { mode: linear }
+ # partition_by: station_id # optional: add partition suffixes to feature ids
+
+ record: # record-level transforms
+   - filter: { operator: ge, field: time, comparand: "${start_time}" }
+   - filter: { operator: le, field: time, comparand: "${end_time}" }
+   - floor_time: { cadence: "${cadence}" } # snap timestamps to cadence boundaries
+   # - lag: { lag: "${cadence}" } # optional: shift timestamps backwards
+
+ stream: # per-feature stream transforms (input sorted by id,time)
+   - dedupe: {} # drop exact-duplicate records per tick
+   - granularity: { mode: first } # aggregate duplicates within a tick
+   - ensure_cadence: { cadence: "${cadence}" } # insert missing ticks (value=None)
+   # Consider adding a fill transform to impute None values before sequence/windowing:
+   # - fill: { statistic: median, window: 6, min_samples: 1 }
+
+ debug: # optional validation-only transforms
+   - lint: { mode: error, tick: "${cadence}" } # strict cadence/order; value issues handled by downstream transforms
+
+ # sort_batch_size: 100000 # in-memory chunk size used by internal sorting
+
@@ -0,0 +1,18 @@
+ group_by: ${group_by}
+
+ features:
+   - id: time_linear
+     record_stream: time.ticks.linear
+     scale: true # optionally add with_mean/with_std overrides
+     # Sliding window over the regularized stream; cadence is enforced in the contract.
+     sequence: { size: 6, stride: 1 }
+
+   - id: time_hour_sin
+     record_stream: time.ticks.hour_sin
+
+   # - id: third_feature
+   #   record_stream: anotherstream
+ # targets:
+ #   - id: some_target
+ #     record_stream: time.ticks.linear
+
@@ -0,0 +1,29 @@
+ #### example combination of postprocessing steps ######
+ #### making sure data is complete after these combinations ######
+ - drop: # example of dropping sparse partitions (vertical axis) for targets
+     axis: vertical
+     payload: targets
+     threshold: 0.9
+
+ - drop: # example of dropping sparse partitions for features
+     axis: vertical
+     payload: features
+     threshold: 0.9
+
+ - drop: # drop vectors (horizontal axis) that have features which are None
+     axis: horizontal
+     payload: features
+     threshold: 1
+
+ - drop:
+     axis: horizontal
+     payload: targets
+     threshold: 1
+ ######
+ # - fill:
+ #     statistic: median
+ #     window: 48
+ #     min_samples: 6
+ # - replace:
+ #     payload: targets
+ #     value: 0.0
@@ -0,0 +1,22 @@
+ version: 1
+ name: <your-dataset>
+ paths:
+   streams: ./contracts
+   sources: ./sources
+   dataset: dataset.yaml
+   postprocess: postprocess.yaml
+   artifacts: ../artifacts/${project_name}/v${version}
+   tasks: ./tasks
+ globals:
+   # Primary dataset cadence; referenced from dataset.yaml (group_by)
+   # and contracts via ${group_by}.
+   group_by: <your-bucket-cadence>
+   start_time: null #2021-01-01T00:00:00Z
+   end_time: null #2021-01-02T00:00:00Z
+   # Configure deterministic dataset split here (applied at serve time, after postprocess).
+   # Adjust `ratios` as needed; the active split is selected via serve tasks or CLI.
+   split:
+     mode: hash # hash | time (time uses boundaries/labels)
+     key: group # group | feature:<id> (entity-stable split)
+     seed: 42 # deterministic hash seed
+     ratios: { train: 0.8, val: 0.1, test: 0.1 }
@@ -0,0 +1,12 @@
+ id: synthetic.ticks
+
+ parser:
+   entrypoint: "core.synthetic.ticks"
+   args: {}
+ loader:
+   entrypoint: "core.synthetic.ticks"
+   args:
+     start: "${start_time}"
+     end: "${end_time}"
+     frequency: "${group_by}"
+
@@ -0,0 +1,3 @@
+ kind: metadata
+ # window_mode: intersection # union|intersection|strict|relaxed (default: intersection)
+
@@ -0,0 +1,9 @@
+ kind: scaler
+
+ # Output path is relative to project.paths.artifacts; defaults to "scaler.pkl".
+ # output: scaler.pkl
+
+ # Split label to use when fitting scaler statistics.
+ # Must match a label from globals.split.ratios.
+ split_label: train
+
@@ -0,0 +1,4 @@
+ kind: serve
+ name: test
+ keep: test
+
@@ -0,0 +1,28 @@
+ kind: serve
+
+ # Optional identifier for this serve task; defaults to filename stem.
+ name: train
+
+ # Active split label to serve; must match a label from globals.split.ratios.
+ # Set to null to disable split filtering.
+ keep: train
+ # output:
+ #   transport: stdout | fs
+ #   format: print | json-lines | json | csv | pickle
+ # When using fs transport, set a directory (and optionally filename) for outputs:
+ #   directory: artifacts/serve
+ #   filename: vectors.train
+
+ # Default max number of vectors to emit (null = unlimited).
+ # limit: 5
+ # Optional pipeline stage preview (0-7); null lets the CLI decide.
+ # stage: 7
+
+ # Optional pacing between emitted vectors (milliseconds).
+ # throttle_ms: null
+
+ # Visuals/logging knobs (inherit CLI or jerry.yaml defaults when omitted):
+ # visuals: AUTO # AUTO | TQDM | RICH | OFF
+ # progress: AUTO # AUTO | SPINNER | BARS | OFF
+ # log_level: INFO # CRITICAL | ERROR | WARNING | INFO | DEBUG
+
@@ -0,0 +1,4 @@
+ kind: serve
+ name: val
+ keep: val
+
@@ -1,4 +1,6 @@
 from dataclasses import dataclass
+ from datetime import datetime
+
 
 @dataclass
 class {{CLASS_NAME}}: