jerry-thomas 1.0.3__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. datapipeline/analysis/vector/collector.py +0 -1
  2. datapipeline/build/tasks/config.py +0 -2
  3. datapipeline/build/tasks/metadata.py +0 -2
  4. datapipeline/build/tasks/scaler.py +0 -2
  5. datapipeline/build/tasks/schema.py +0 -2
  6. datapipeline/build/tasks/utils.py +0 -2
  7. datapipeline/cli/app.py +201 -81
  8. datapipeline/cli/commands/contract.py +145 -283
  9. datapipeline/cli/commands/demo.py +13 -0
  10. datapipeline/cli/commands/domain.py +4 -4
  11. datapipeline/cli/commands/dto.py +11 -0
  12. datapipeline/cli/commands/filter.py +2 -2
  13. datapipeline/cli/commands/inspect.py +0 -68
  14. datapipeline/cli/commands/list_.py +30 -13
  15. datapipeline/cli/commands/loader.py +11 -0
  16. datapipeline/cli/commands/mapper.py +82 -0
  17. datapipeline/cli/commands/parser.py +45 -0
  18. datapipeline/cli/commands/run_config.py +1 -3
  19. datapipeline/cli/commands/serve_pipeline.py +5 -7
  20. datapipeline/cli/commands/source.py +106 -18
  21. datapipeline/cli/commands/stream.py +286 -0
  22. datapipeline/cli/visuals/common.py +0 -2
  23. datapipeline/cli/visuals/sections.py +0 -2
  24. datapipeline/cli/workspace_utils.py +0 -3
  25. datapipeline/config/context.py +0 -2
  26. datapipeline/config/dataset/feature.py +1 -0
  27. datapipeline/config/metadata.py +0 -2
  28. datapipeline/config/project.py +0 -2
  29. datapipeline/config/resolution.py +10 -2
  30. datapipeline/config/tasks.py +9 -9
  31. datapipeline/domain/feature.py +3 -0
  32. datapipeline/domain/record.py +7 -7
  33. datapipeline/domain/sample.py +0 -2
  34. datapipeline/domain/vector.py +6 -8
  35. datapipeline/integrations/ml/adapter.py +0 -2
  36. datapipeline/integrations/ml/pandas_support.py +0 -2
  37. datapipeline/integrations/ml/rows.py +0 -2
  38. datapipeline/integrations/ml/torch_support.py +0 -2
  39. datapipeline/io/output.py +0 -2
  40. datapipeline/io/serializers.py +26 -16
  41. datapipeline/mappers/synthetic/time.py +9 -2
  42. datapipeline/pipeline/artifacts.py +3 -5
  43. datapipeline/pipeline/observability.py +0 -2
  44. datapipeline/pipeline/pipelines.py +118 -34
  45. datapipeline/pipeline/stages.py +42 -17
  46. datapipeline/pipeline/utils/spool_cache.py +142 -0
  47. datapipeline/pipeline/utils/transform_utils.py +27 -2
  48. datapipeline/services/artifacts.py +1 -4
  49. datapipeline/services/constants.py +1 -0
  50. datapipeline/services/factories.py +4 -6
  51. datapipeline/services/project_paths.py +0 -2
  52. datapipeline/services/runs.py +0 -2
  53. datapipeline/services/scaffold/contract_yaml.py +76 -0
  54. datapipeline/services/scaffold/demo.py +141 -0
  55. datapipeline/services/scaffold/discovery.py +115 -0
  56. datapipeline/services/scaffold/domain.py +21 -13
  57. datapipeline/services/scaffold/dto.py +31 -0
  58. datapipeline/services/scaffold/filter.py +2 -1
  59. datapipeline/services/scaffold/layout.py +96 -0
  60. datapipeline/services/scaffold/loader.py +61 -0
  61. datapipeline/services/scaffold/mapper.py +116 -0
  62. datapipeline/services/scaffold/parser.py +56 -0
  63. datapipeline/services/scaffold/plugin.py +14 -2
  64. datapipeline/services/scaffold/source_yaml.py +91 -0
  65. datapipeline/services/scaffold/stream_plan.py +110 -0
  66. datapipeline/services/scaffold/utils.py +187 -0
  67. datapipeline/sources/data_loader.py +0 -2
  68. datapipeline/sources/decoders.py +49 -8
  69. datapipeline/sources/factory.py +9 -6
  70. datapipeline/sources/foreach.py +18 -3
  71. datapipeline/sources/synthetic/time/parser.py +1 -1
  72. datapipeline/sources/transports.py +10 -4
  73. datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
  74. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
  75. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
  76. datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
  77. datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
  78. datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
  79. datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
  80. datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
  81. datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
  82. datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
  83. datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
  84. datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
  85. datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
  86. datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
  87. datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
  88. datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
  89. datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
  90. datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
  91. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  92. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
  93. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
  94. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  95. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
  96. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
  97. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
  98. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  99. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
  100. datapipeline/templates/plugin_skeleton/README.md +57 -136
  101. datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
  102. datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
  103. datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
  104. datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
  105. datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
  106. datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
  107. datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
  108. datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
  109. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
  110. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
  111. datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
  112. datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
  113. datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
  114. datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
  115. datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
  116. datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
  117. datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
  118. datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
  119. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
  120. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  121. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
  122. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
  123. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  124. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
  125. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
  126. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +7 -10
  127. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
  128. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
  129. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
  130. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
  131. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
  132. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
  133. datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
  134. datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
  135. datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +14 -0
  136. datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
  137. datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
  138. datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
  139. datapipeline/templates/stubs/dto.py.j2 +1 -1
  140. datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
  141. datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
  142. datapipeline/templates/stubs/mappers/ingest.py.j2 +17 -0
  143. datapipeline/templates/stubs/parser.py.j2 +4 -0
  144. datapipeline/templates/stubs/record.py.j2 +0 -1
  145. datapipeline/templates/stubs/source.yaml.j2 +1 -1
  146. datapipeline/transforms/debug/identity.py +34 -16
  147. datapipeline/transforms/debug/lint.py +14 -11
  148. datapipeline/transforms/feature/scaler.py +5 -12
  149. datapipeline/transforms/filter.py +73 -17
  150. datapipeline/transforms/interfaces.py +58 -0
  151. datapipeline/transforms/record/floor_time.py +10 -7
  152. datapipeline/transforms/record/lag.py +8 -10
  153. datapipeline/transforms/sequence.py +2 -3
  154. datapipeline/transforms/stream/dedupe.py +5 -7
  155. datapipeline/transforms/stream/ensure_ticks.py +39 -24
  156. datapipeline/transforms/stream/fill.py +34 -25
  157. datapipeline/transforms/stream/filter.py +25 -0
  158. datapipeline/transforms/stream/floor_time.py +16 -0
  159. datapipeline/transforms/stream/granularity.py +52 -30
  160. datapipeline/transforms/stream/lag.py +17 -0
  161. datapipeline/transforms/stream/rolling.py +72 -0
  162. datapipeline/transforms/utils.py +42 -10
  163. datapipeline/transforms/vector/drop/horizontal.py +0 -3
  164. datapipeline/transforms/vector/drop/orchestrator.py +0 -3
  165. datapipeline/transforms/vector/drop/vertical.py +0 -2
  166. datapipeline/transforms/vector/ensure_schema.py +0 -2
  167. datapipeline/utils/paths.py +0 -2
  168. datapipeline/utils/placeholders.py +0 -2
  169. datapipeline/utils/rich_compat.py +0 -3
  170. datapipeline/utils/window.py +0 -2
  171. jerry_thomas-2.0.0.dist-info/METADATA +282 -0
  172. jerry_thomas-2.0.0.dist-info/RECORD +264 -0
  173. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/WHEEL +1 -1
  174. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/entry_points.txt +7 -3
  175. datapipeline/services/scaffold/mappers.py +0 -55
  176. datapipeline/services/scaffold/source.py +0 -191
  177. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
  178. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
  179. datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
  180. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
  181. datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
  182. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
  183. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
  184. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
  185. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
  186. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
  187. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
  188. datapipeline/templates/stubs/mapper.py.j2 +0 -22
  189. jerry_thomas-1.0.3.dist-info/METADATA +0 -827
  190. jerry_thomas-1.0.3.dist-info/RECORD +0 -198
  191. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/licenses/LICENSE +0 -0
  192. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,46 @@
+ from datetime import datetime, timezone
+ from typing import Any
+
+ from datapipeline.sources.models.parser import DataParser
+
+ from {{PACKAGE_NAME}}.dtos.sandbox_ohlcv_dto import SandboxOhlcvDTO
+
+
+ def _parse_time(value: Any) -> datetime | None:
+     if isinstance(value, datetime):
+         if value.tzinfo is None:
+             return value.replace(tzinfo=timezone.utc)
+         return value
+     if isinstance(value, str):
+         try:
+             dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
+         except ValueError:
+             return None
+         if dt.tzinfo is None:
+             dt = dt.replace(tzinfo=timezone.utc)
+         return dt
+     return None
+
+
+ class SandboxOhlcvDTOParser(DataParser[SandboxOhlcvDTO]):
+     def parse(self, raw: Any) -> SandboxOhlcvDTO | None:
+         """
+         Convert one raw item (row/dict/tuple/record) into a SandboxOhlcvDTO.
+
+         - Return a DTO instance to keep the item, or None to drop it.
+         - Keep this logic thin and mirror your source data.
+         """
+         if not isinstance(raw, dict):
+             return None
+         parsed_time = _parse_time(raw.get("time"))
+         if parsed_time is None:
+             return None
+         return SandboxOhlcvDTO(
+             time=parsed_time,
+             open=float(raw["open"]),
+             high=float(raw["high"]),
+             low=float(raw["low"]),
+             close=float(raw["close"]),
+             volume=float(raw["volume"]),
+             symbol=str(raw["symbol"]),
+         )
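Note: the scaffolded parser above either returns a DTO or `None` to drop the row. A hedged usage sketch follows; the `myplugin` import path stands in for the generated `{{PACKAGE_NAME}}` package and is an assumption, not part of this diff.

```python
# Hypothetical usage of the generated demo parser; assumes the plugin was
# scaffolded and installed under the name "myplugin".
from myplugin.parsers.sandbox_ohlcv_dto_parser import SandboxOhlcvDTOParser

parser = SandboxOhlcvDTOParser()

kept = parser.parse({
    "time": "2024-01-02T09:30:00Z",  # ISO string; "Z" is normalized to +00:00
    "open": 187.15, "high": 188.44, "low": 186.10,
    "close": 188.02, "volume": 5210000, "symbol": "AAPL",
})
dropped = parser.parse({"open": 1.0})  # no usable "time" -> None, row is dropped

print(kept)     # SandboxOhlcvDTO(time=datetime(...), ..., symbol='AAPL')
print(dropped)  # None
```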
@@ -1,142 +1,63 @@
  # {{DIST_NAME}}
 
- Minimal plugin skeleton for the Jerry Thomas (datapipeline) framework.
-
- Quick start
- - Initialize a plugin (already done if you’re reading this here):
-   - `jerry plugin init {{DIST_NAME}}`
- - Add a source via CLI (transport-specific placeholders are scaffolded):
-   - File data: `jerry source add <provider> <dataset> -t fs -f <csv|json|json-lines|pickle>`
-   - HTTP data: `jerry source add <provider>.<dataset> -t http -f <json|json-lines|csv>`
-   - Synthetic: `jerry source add -p <provider> -d <dataset> -t synthetic`
- - Edit the generated `config/sources/*.yaml` to fill in the `path`, delimiter, etc.
- - `jerry.yaml` is placed in your workspace root (alongside the plugin folder) so
-   you can run CLI commands from there; `plugin_root` points back to this plugin.
- - Reinstall after EP changes (pyproject.toml) and restart Python processes:
-   - Core: `cd lib/datapipeline && python -m pip install -e .`
-   - This plugin: `python -m pip install -e .`
-
- Folder layout
- - `example/`
-   - `project.yaml` project root (paths, globals, cadence/split)
-   - `dataset.yaml` — feature/target declarations (uses `${group_by}` from globals)
-   - `postprocess.yaml` — postprocess transforms
-   - `contracts/*.yaml` — canonical stream definitions
-   - `sources/*.yaml` — raw source definitions (one file per source)
-   - `tasks/*.yaml` — task specs (schema/scaler/metadata/serve)
- - Every dataset `project.yaml` declares a `name`; reference it via `${project_name}`
-   inside other config files (e.g., `paths.artifacts: ../artifacts/${project_name}`) to
-   avoid hard-coding per-dataset directories.
- - `src/{{PACKAGE_NAME}}/`
-   - `sources/<provider>/<dataset>/dto.py` — DTO model for the source
-   - `sources/<provider>/<dataset>/parser.py` — parse raw → DTO
-   - Optional: `sources/<provider>/<dataset>/loader.py` for synthetic sources
-   - `domains/<domain>/model.py` — domain record models
-   - `mappers/*.py` — map DTOs → domain records
-
- How loaders work
- - For fs/http, sources use the generic loader entry point:
-   - `loader.entrypoint: "{{DEFAULT_IO_LOADER_EP}}"`
-   - `loader.args` include `transport`, `format`, and source-specific args (placeholders are provided):
-     - fs: `path`, `glob`, `encoding`, plus `delimiter` for csv
-     - http: `url`, `headers`, `params`, `encoding`, optional `count_by_fetch`
- - Synthetic sources generate data in-process and keep a small loader stub.
-
- Run data flows
- - Build artifacts once: `jerry build --project example/project.yaml`
- - Preview records (stage 1): `jerry serve --project example/project.yaml --stage 1 --limit 100`
- - Preview features (stage 3): `jerry serve --project example/project.yaml --stage 3 --limit 100`
- - Preview vectors (stage 7): `jerry serve --project example/project.yaml --stage 7 --limit 100`
-
- Analyze vectors
- - `jerry inspect report --project example/project.yaml` (console only)
- - `jerry inspect partitions --project example/project.yaml` (writes build/partitions.json)
- - `jerry inspect matrix --project example/project.yaml --format html` (writes build/matrix.html)
- - `jerry inspect expected --project example/project.yaml` (writes build/expected.txt)
- - Use post-processing transforms in `postprocess.yaml` to keep coverage high
-   (history/horizontal fills, constants, or drop rules) before serving vectors.
-   Add `payload: targets` inside a transform when you need to mutate label vectors.
-
- Train/Val/Test splits (deterministic)
- - Configure split mechanics once in your project file:
-   - Edit `example/project.yaml` and set:
-     ```yaml
-     globals:
-       group_by: 10m # dataset cadence; reused as contract cadence
-       split:
-         mode: hash # hash|time
-         key: group # group or feature:<id> (entity-stable)
-         seed: 42 # deterministic hash seed
-         ratios: {train: 0.8, val: 0.1, test: 0.1}
-     ```
- - Select the active slice via `example/tasks/serve.<name>.yaml` (or `--keep`):
-   ```yaml
-   kind: serve
-   name: train # defaults to filename stem when omitted
-   keep: train # any label defined in globals.split; null disables filtering
-   output:
-     transport: stdout # stdout | fs
-     format: print # print | json-lines | json | csv | pickle
-   limit: 100 # cap vectors per serve run (null = unlimited)
-   throttle_ms: null # sleep between vectors (milliseconds)
-   # visuals: AUTO # AUTO | TQDM | RICH | OFF
-   # progress: AUTO # AUTO | SPINNER | BARS | OFF
-   ```
- - Add additional `kind: serve` files (e.g., `serve.val.yaml`, `serve.test.yaml`) and the CLI will run each enabled file in order unless you pass `--run <name>`.
- - Serve examples (change the serve task or pass `--keep val|test`):
-   - `jerry serve -p example/project.yaml --out-transport stdout --out-format json-lines > train.jsonl`
-   - `jerry serve -p example/project.yaml --keep val --out-transport stdout --out-format json-lines > val.jsonl`
-   - Add `--visuals rich --progress bars` for a richer interactive UI; defaults to `AUTO`.
- - For shared workspace defaults (visual renderer, progress display, build mode), drop a `jerry.yaml` next to your workspace root and set `shared.visuals`, `shared.progress`, etc. CLI commands walk up from the current directory to find it.
- - The split is applied at the end (after postprocess transforms), and assignment
-   is deterministic (hash-based) with a fixed seed; no overlap across runs.
-
- Key selection guidance
- - `key: group` hashes the group key (commonly the time bucket). This yields a uniform random split per group but may allow the same entity to appear in multiple splits across time.
- - `key: feature:<id>` hashes a specific feature value, e.g., `feature:entity_id` or `feature:station_id`, ensuring all vectors for that entity land in the same split (recommended to avoid leakage).
-
- Postprocess expected IDs
- - Build once with `jerry build --project config/project.yaml` (or run `jerry inspect expected …`) to materialize `<paths.artifacts>/expected.txt`.
- - Bootstrap registers the artifact; postprocess transforms read it automatically. Per-transform `expected:` overrides are no longer required or supported — the build output is the single source of truth.
-
- Scaler statistics
- - Jerry computes scaler stats automatically. If you need custom paths or settings, add `tasks/scaler.yaml` and override the defaults.
- - The build writes `<paths.artifacts>/scaler.pkl`; runtime scaling requires this artifact. If it is missing, scaling transforms raise a runtime error.
-
- Tips
- - Keep parsers thin — mirror source schema and return DTOs; use the identity parser only if your loader already emits domain records.
- - Prefer small, composable configs over monolithic ones: one YAML per source is easier to review and reuse.
-
- Composed streams (engineered domains)
- - Declare engineered streams that depend on other canonical streams directly in contracts. The runtime builds each input to stage 4, stream-aligns by partition+timestamp, runs your composer, and emits fresh records for the derived stream.
-
- ```yaml
- # example/contracts/air_density.processed.yaml
- kind: composed
- id: air_density.processed
- inputs:
-   - p=pressure.processed
-   - t=temp_dry.processed
- partition_by: station_id
- sort_batch_size: 20000
-
- mapper:
-   entrypoint: {{PACKAGE_NAME}}.mappers.air_density:mapper
-   args:
-     driver: p # optional; defaults to first input alias
-
- # Optional post-compose policies (same as any stream):
- # record: [...]
- # stream: [...]
- # debug: [...]
+ Minimal plugin skeleton for the Jerry Thomas (datapipeline) runtime.
+
+ ## Quick start
+
+ ```bash
+ python -m pip install -U jerry-thomas
+
+ jerry plugin init {{DIST_NAME}} --out .
+ python -m pip install -e {{DIST_NAME}}
+
+ # One-stop wizard: source YAML + DTO/parser + domain + mapper + contract.
+ jerry inflow create
+
+ # If a workspace-level `jerry.yaml` was created (fresh workspace), you can use the dataset alias:
+ jerry serve --dataset your-dataset --limit 3
+ #
+ # If you already had a workspace `jerry.yaml`, `jerry plugin init` will not overwrite it.
+ # In that case, either add a dataset alias to your existing `jerry.yaml` or pass `--project`:
+ # jerry serve --project your-dataset/project.yaml --limit 3
  ```
 
- Then reference the composed stream in your dataset:
+ ## After scaffolding: what you must edit
+
+ - `your-dataset/sources/*.yaml`
+   - Replace placeholders (`path`/`url`, headers/params, delimiter, etc.)
+ - `your-dataset/dataset.yaml`
+   - Ensure `record_stream:` points at the contract id you created.
+   - Select a `field:` for each feature/target (record attribute to use as value).
+   - Ensure `group_by` matches `^\d+(m|min|h|d)$` (e.g. `10m`, `1h`, `1d`).
+
+ If you add/edit entry points in `pyproject.toml`, reinstall the plugin:
 
- ```yaml
- # example/dataset.yaml
- group_by: ${group_by}
- features:
-   - id: air_density
-     record_stream: air_density.processed
+ ```bash
+ python -m pip install -e .
  ```
+
+ ## Folder layout
+
+ YAML config (dataset project root):
+
+ - `your-dataset/`
+   - `project.yaml` (paths, globals, split)
+   - `sources/*.yaml` (raw source definitions)
+   - `contracts/*.yaml` (canonical streams)
+   - `dataset.yaml` (features/targets)
+   - `postprocess.yaml` (vector-level transforms)
+   - `tasks/*.yaml` (serve/build tasks; optional overrides)
+
+ Python plugin code:
+
+ - `src/{{PACKAGE_NAME}}/`
+   - `dtos/` (DTO models)
+   - `parsers/` (raw -> DTO)
+   - `domains/<domain>/model.py` (domain record models)
+   - `mappers/` (DTO -> domain records)
+   - `loaders/` (optional custom loaders)
+
+ ## Learn more
+
+ - Pipeline stages and split/build timing: the Jerry Thomas runtime `README.md` ("Pipeline Stages (serve --stage)").
+ - Deep dives: runtime `docs/config.md`, `docs/transforms.md`, `docs/artifacts.md`, `docs/extending.md`, `docs/architecture.md`.
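Note: the new README pins `group_by` to the pattern `^\d+(m|min|h|d)$`. A standalone sketch of that documented check (illustrative only, not the runtime's validator):

```python
import re

# The cadence pattern documented in the scaffold README: digits followed by
# m, min, h, or d (e.g. 10m, 1h, 1d).
GROUP_BY_RE = re.compile(r"^\d+(m|min|h|d)$")

for candidate in ("10m", "1h", "1d", "90min", "10 m", "h"):
    print(f"{candidate!r}: {bool(GROUP_BY_RE.match(candidate))}")
# '10m', '1h', '1d', '90min' pass; '10 m' and 'h' fail.
```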
@@ -1,34 +1,22 @@
- # Workspace defaults. The scaffolder copies this to your workspace root (where
- # you ran `jerry plugin init`). CLI commands walk upward from cwd to find it.
-
- # Relative path from this workspace file back to the plugin root.
- plugin_root: . # e.g., "lib/myplugin" if your plugin lives under lib/
-
- # Dataset aliases for `--dataset`; values may be dirs (auto-append project.yaml).
+ # See reference/jerry.yaml for full options and explanations.
+ plugin_root: .
  datasets:
-   example: example/project.yaml
-   your-second-example-dataset: your-dataset/project.yaml
+   your-dataset: your-dataset/project.yaml
+   interim-builder: your-interim-data-builder/project.yaml # use this to build interim data used by other datasets
 
- # Default dataset alias when --dataset/--project are omitted.
- default_dataset: example
+ default_dataset: your-dataset
 
- # Shared fallbacks used by all commands (unless overridden).
  shared:
-   visuals: AUTO # AUTO | TQDM | RICH | OFF
-   progress: BARS # AUTO | SPINNER | BARS | OFF
-   log_level: INFO # Default log level when not set elsewhere
+   visuals: AUTO
+   progress: BARS
+   log_level: INFO
 
- # Defaults for `jerry serve` (run-time options).
  serve:
-   # log_level: INFO # Uncomment to force INFO for serve runs
-   limit: null # Cap vectors; null means unlimited
-   stage: null # Preview a specific stage; null runs the full pipeline
+   limit: null
+   stage: null
    output:
      transport: stdout
-     format: print # stdout: print|json-lines|json|csv|pickle
-     # directory: artifacts/serve # Required when transport=fs
+     format: print
 
- # Defaults for `jerry build` (artifact materialization).
  build:
-   # log_level: INFO # Uncomment to set build log level
-   mode: AUTO # AUTO | FORCE | OFF
+   mode: AUTO
@@ -0,0 +1,28 @@
+ # Jerry workspace config reference (all options).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # plugin_root: ./path/to/your/plugin # optional
+ #
+ # datasets: # optional
+ #   default: example/project.yaml # optional (relative to jerry.yaml)
+ # default_dataset: default # optional (must be a key in datasets)
+ #
+ # shared: # optional
+ #   visuals: AUTO # optional; AUTO | TQDM | RICH | OFF
+ #   progress: AUTO # optional; AUTO | SPINNER | BARS | OFF
+ #   log_level: INFO # optional; CRITICAL | ERROR | WARNING | INFO | DEBUG
+ #
+ # serve: # optional
+ #   log_level: INFO # optional
+ #   limit: 100 # optional
+ #   stage: 8 # optional
+ #   throttle_ms: 0 # optional
+ #   output: # optional
+ #     transport: stdout # optional; stdout | fs
+ #     format: json-lines # optional; stdout: print | json-lines | json
+ #     payload: sample # optional; sample | vector
+ #     # directory: artifacts/serve # optional; fs only; relative to jerry.yaml
+ #
+ # build: # optional
+ #   log_level: INFO # optional
+ #   mode: AUTO # optional; AUTO | FORCE | OFF (false -> OFF)
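Note: both the old and new workspace configs say that CLI commands walk upward from the current directory to find `jerry.yaml`. A minimal sketch of that lookup idea (not the runtime's actual implementation):

```python
from pathlib import Path

# Walk from the current directory toward the filesystem root and return the
# first jerry.yaml found, mirroring the documented discovery behaviour.
def find_workspace_config(start: Path | None = None) -> Path | None:
    current = (start or Path.cwd()).resolve()
    for directory in (current, *current.parents):
        candidate = directory / "jerry.yaml"
        if candidate.is_file():
            return candidate
    return None

print(find_workspace_config())  # e.g. /home/me/workspace/jerry.yaml, or None
```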
@@ -0,0 +1,29 @@
+ # Composed contract reference (kind: composed).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # kind: composed
+ # id: domain.dataset.variant
+ # inputs: # required (list of stream ids or alias=stream)
+ #   - alias=upstream.stream.id
+ #   - other.stream.id
+ # mapper: # optional (defaults to identity)
+ #   entrypoint: my_composer
+ #   args: {}
+ # cadence: ${group_by} # optional per-contract variable for interpolation
+ # partition_by: station_id # optional; string or list of strings
+ # sort_batch_size: 100000 # optional; in-memory chunk size for sorting
+ #
+ # record transforms (one-key mappings; optional):
+ # - filter: { field: time, operator: ge, comparand: "${start_time}" }
+ # - floor_time: { cadence: "${cadence}" }
+ # - lag: { lag: "${cadence}" }
+ #
+ # stream transforms (one-key mappings; optional; operate on record fields):
+ # - dedupe: {}
+ # - granularity: { field: close, to: close, mode: first }
+ # - ensure_cadence: { field: close, to: close, cadence: "${cadence}" }
+ # - fill: { field: close, to: close, statistic: median, window: 6, min_samples: 1 }
+ #
+ # debug transforms (one-key mappings; optional):
+ # - lint: { mode: error, tick: "${cadence}" }
+ # - identity: {}
@@ -0,0 +1,31 @@
+ # Ingest contract reference (kind: ingest).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # kind: ingest
+ # id: domain.dataset.variant
+ # source: source.alias # required
+ # mapper: # optional (defaults to identity)
+ #   entrypoint: my_mapper
+ #   args: {}
+ # cadence: ${group_by} # optional per-contract variable for interpolation
+ # partition_by: station_id # optional; string or list of strings
+ # sort_batch_size: 100000 # optional; in-memory chunk size for sorting
+ #
+ # record transforms (one-key mappings; optional):
+ # - filter: { field: time, operator: ge, comparand: "${start_time}" }
+ # - floor_time: { cadence: "${cadence}" }
+ # - lag: { lag: "${cadence}" }
+ #
+ # stream transforms (one-key mappings; optional; operate on record fields):
+ # - floor_time: { cadence: "${cadence}" }
+ # - lag: { lag: "${cadence}" }
+ # - filter: { field: close, operator: ge, comparand: 1000000 }
+ # - dedupe: {}
+ # - granularity: { field: close, to: close, mode: first }
+ # - ensure_cadence: { field: close, to: close, cadence: "${cadence}" }
+ # - rolling: { field: dollar_volume, to: adv60, window: 60, statistic: mean, min_samples: 60 }
+ # - fill: { field: close, to: close, statistic: median, window: 6, min_samples: 1 }
+ #
+ # debug transforms (one-key mappings; optional):
+ # - lint: { mode: error, tick: "${cadence}" }
+ # - identity: {}
@@ -0,0 +1,34 @@
+ # Contract config reference (overview).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # kind: ingest | composed
+ # id: domain.dataset.variant
+ # source: source.alias # required when kind: ingest
+ # inputs: [stream.id] # required when kind: composed
+ # mapper: # optional (defaults to identity)
+ #   entrypoint: my_mapper
+ #   args: {}
+ # cadence: ${group_by} # optional per-contract variable for interpolation
+ # partition_by: station_id # optional; string or list of strings
+ # sort_batch_size: 100000 # optional; in-memory chunk size for sorting
+ #
+ # record transforms (one-key mappings; optional):
+ # - filter: { field: time, operator: ge, comparand: "${start_time}" }
+ #   # operator: eq|ne|lt|le|gt|ge|in|nin (aliases: ==, !=, >=, <=, etc.)
+ # - floor_time: { cadence: "${cadence}" }
+ # - lag: { lag: "${cadence}" }
+ #
+ # stream transforms (one-key mappings; optional):
+ # - dedupe: {}
+ # - granularity: { field: close, to: close, mode: first } # first | last | mean | median
+ # - ensure_cadence: { field: close, to: close, cadence: "${cadence}" }
+ # - fill: { field: close, to: close, statistic: median, window: 6, min_samples: 1 }
+ #   # statistic: mean | median; window must be > 0
+ #
+ # debug transforms (one-key mappings; optional):
+ # - lint: { mode: error, tick: "${cadence}" } # mode: warn | error
+ # - identity: {}
+ #
+ # See also:
+ # - ingest.reference.yaml
+ # - composed.reference.yaml
@@ -0,0 +1,29 @@
+ # Dataset config reference (all options).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # Feature/vector stages require group_by.
+ # group_by must match ^\d+(m|min|h|d)$ (e.g., 10m, 1h, 1d).
+ #
+ # group_by: ${group_by} # required for feature/vector stages
+ #
+ # features: # optional
+ #   - id: time_linear
+ #     record_stream: time.ticks.linear
+ #     field: value
+ #     scale: true # optional; true | false | mapping (see below)
+ #     # scale:
+ #     #   model_path: ../artifacts/example/v1/scaler.pkl
+ #     #   with_mean: true
+ #     #   with_std: true
+ #     #   epsilon: 1.0e-12
+ #     #   on_none: skip # skip | error
+ #     sequence: { size: 6, stride: 1 } # optional
+ #
+ # targets: # optional
+ #   - id: some_target
+ #     record_stream: time.ticks.linear
+ #     field: value
+ #     scale: false # optional
+ #     sequence: null # optional
+ #
+ # Record-only stage uses only record_stream entries; id/field/scale/sequence are ignored.
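Note: the `sequence: { size: 6, stride: 1 }` option above describes windowed features. A rough sketch of the windowing idea under that reading (an assumption about semantics, not the runtime's code):

```python
# Overlapping windows of `size` consecutive grouped values, advancing by
# `stride` each step.
def windows(values, size, stride=1):
    return [values[i:i + size] for i in range(0, len(values) - size + 1, stride)]

print(windows(list(range(8)), size=6, stride=1))
# [[0, 1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7]]
```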
@@ -0,0 +1,25 @@
+ # Postprocess config reference (vector transforms).
+ # This file is documentation only; uncomment the keys you want to use.
+ # Each list item is a one-key mapping: <transform_name>: <params>.
+ #
+ # - drop:
+ #     axis: vertical # optional; horizontal | vertical
+ #     payload: targets # optional; features | targets | both (both only for horizontal)
+ #     threshold: 0.9 # required; 0.0 - 1.0
+ #     only: [feature_id] # optional
+ #     exclude: [feature_id] # optional
+ #
+ # - fill:
+ #     statistic: median # optional; mean | median
+ #     window: 48 # optional; rolling window size
+ #     min_samples: 6 # optional
+ #     payload: features # optional; features | targets | both
+ #     only: [feature_id] # optional
+ #     exclude: [feature_id] # optional
+ #
+ # - replace:
+ #     value: 0.0 # required
+ #     payload: targets # optional; features | targets | both
+ #     target: null # optional; replace only when value equals target; default replaces missing
+ #     only: [feature_id] # optional
+ #     exclude: [feature_id] # optional
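Note: the `replace` transform above is documented as substituting a constant, either for missing entries (the default) or for entries equal to `target`. A tiny sketch of that documented behaviour (illustrative only, not the runtime's transform):

```python
# Replace missing entries (None) with `value`; when `target` is given, replace
# only entries equal to that target instead.
def replace_values(vector, value=0.0, target=None):
    if target is None:
        return [value if item is None else item for item in vector]
    return [value if item == target else item for item in vector]

print(replace_values([1.0, None, 3.0]))              # [1.0, 0.0, 3.0]
print(replace_values([1.0, -1.0, 3.0], 0.0, -1.0))   # [1.0, 0.0, 3.0]
```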
@@ -0,0 +1,32 @@
+ # Project config reference (all options).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # version: 1 # optional
+ # name: example # optional
+ # paths:
+ #   streams: ./contracts # required
+ #   sources: ./sources # required
+ #   dataset: dataset.yaml # required
+ #   postprocess: postprocess.yaml # required
+ #   artifacts: ../artifacts/${project_name}/v${version} # required
+ #   tasks: ./tasks # optional
+ # globals: # optional; available via ${var}
+ #   group_by: 1h # optional; dataset cadence
+ #   start_time: 2021-01-01T00:00:00Z # optional; used in contracts
+ #   end_time: 2021-01-02T00:00:00Z # optional; used in contracts
+ #   split: # optional; applied at serve time after postprocess
+ #     mode: hash # hash | time
+ #     ratios: { train: 0.8, val: 0.1, test: 0.1 } # must sum to 1.0
+ #     seed: 42 # deterministic hash seed
+ #     key: group # group | feature:<id>
+ #
+ # Time-based split (labels length must be len(boundaries) + 1):
+ # globals:
+ #   split:
+ #     mode: time
+ #     boundaries:
+ #       - 2021-01-01T00:00:00Z # first cutover (train -> val)
+ #       - 2021-01-02T00:00:00Z # second cutover (val -> test)
+ #     labels: [train, val, test]
+ #
+ # Any extra keys under globals are allowed and can be referenced via ${var}.
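Note: the hash split above is described as deterministic: a fixed `seed`, a `key` (the group or a feature value), and `ratios` that sum to 1.0. A conceptual sketch of one way such an assignment can be made deterministic (not the runtime's exact algorithm):

```python
import hashlib

# Map a stable hash of (seed, key value) into [0, 1] and bucket it by the
# cumulative ratios; the same inputs always yield the same label.
def assign_split(key_value: str, seed: int, ratios: dict[str, float]) -> str:
    digest = hashlib.sha256(f"{seed}:{key_value}".encode("utf-8")).hexdigest()
    position = int(digest[:8], 16) / 0xFFFFFFFF
    cumulative = 0.0
    for label, ratio in ratios.items():
        cumulative += ratio
        if position <= cumulative:
            return label
    return next(reversed(ratios))  # guard against float rounding

ratios = {"train": 0.8, "val": 0.1, "test": 0.1}
print(assign_split("station_42", seed=42, ratios=ratios))  # stable across runs
```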
@@ -0,0 +1,24 @@
+ # Foreach + HTTP loader reference (core.foreach + core.io).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # id: provider.dataset # required
+ # parser:
+ #   entrypoint: my_pkg.sources.provider.dataset:parse # required
+ #   args: {} # optional
+ # loader:
+ #   entrypoint: core.foreach
+ #   args:
+ #     foreach:
+ #       page: [1, 2, 3] # required; exactly one key; list of values
+ #     loader:
+ #       entrypoint: {{DEFAULT_IO_LOADER_EP}}
+ #       args:
+ #         transport: http
+ #         format: json-lines
+ #         url: "https://example/api?page=${page}" # required
+ #         headers: { Authorization: "Bearer ..." } # optional
+ #         params: {} # optional
+ #         encoding: utf-8 # optional
+ #         count_by_fetch: false # optional
+ #     # inject_field: page # optional
+ #     # throttle_seconds: 0 # optional
@@ -0,0 +1,21 @@
+ # Foreach loader reference (core.foreach).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # id: provider.dataset # required
+ # parser:
+ #   entrypoint: my_pkg.sources.provider.dataset:parse # required
+ #   args: {} # optional
+ # loader:
+ #   entrypoint: core.foreach
+ #   args:
+ #     foreach:
+ #       month: ["2024-01", "2024-02"] # required; exactly one key; list of values
+ #     loader:
+ #       entrypoint: {{DEFAULT_IO_LOADER_EP}}
+ #       args:
+ #         transport: fs
+ #         format: csv # required
+ #         path: ./data/${month}.csv # required
+ #     # inject_field: month # optional; mapping output only
+ #     # inject: { month: "${month}" } # optional; mapping output only
+ #     # throttle_seconds: 0 # optional
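Note: the foreach wrapper above takes exactly one key with a list of values and substitutes each value into the inner loader's args (e.g. `./data/${month}.csv`). A rough sketch of that expansion (illustrative; the real `core.foreach` loader lives in the runtime):

```python
from string import Template

# Expand the single foreach key over its values, substituting ${key} into the
# inner loader's args for each value.
def expand_foreach(foreach: dict[str, list], inner_args: dict[str, str]) -> list[dict]:
    (key, values), = foreach.items()  # exactly one key is allowed
    expanded = []
    for value in values:
        args = {
            name: Template(raw).safe_substitute({key: value})
            for name, raw in inner_args.items()
        }
        expanded.append(args)
    return expanded

print(expand_foreach(
    {"month": ["2024-01", "2024-02"]},
    {"transport": "fs", "format": "csv", "path": "./data/${month}.csv"},
))
# [{'transport': 'fs', 'format': 'csv', 'path': './data/2024-01.csv'}, ...]
```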
@@ -0,0 +1,16 @@
+ # FS loader reference (generic I/O loader).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # id: provider.dataset # required
+ # parser:
+ #   entrypoint: my_pkg.sources.provider.dataset:parse # required
+ #   args: {} # optional
+ # loader:
+ #   entrypoint: {{DEFAULT_IO_LOADER_EP}}
+ #   args:
+ #     transport: fs
+ #     format: csv # required; csv | json | json-lines | pickle
+ #     path: ./data/file.csv # optional (use path or glob)
+ #     glob: ./data/*.csv # optional (use path or glob)
+ #     encoding: utf-8 # optional
+ #     delimiter: "," # optional; csv only
@@ -0,0 +1,17 @@
+ # HTTP loader reference (generic I/O loader).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # id: provider.dataset # required
+ # parser:
+ #   entrypoint: my_pkg.sources.provider.dataset:parse # required
+ #   args: {} # optional
+ # loader:
+ #   entrypoint: {{DEFAULT_IO_LOADER_EP}}
+ #   args:
+ #     transport: http
+ #     format: json-lines # required; json | json-lines | csv
+ #     url: https://example/api # required
+ #     params: { key: value } # optional
+ #     headers: { Authorization: "Bearer ..." } # optional
+ #     encoding: utf-8 # optional
+ #     count_by_fetch: false # optional
@@ -0,0 +1,18 @@
+ # Source config reference (overview).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # Required shape:
+ # id: provider.dataset # required
+ # parser:
+ #   entrypoint: module.path:callable # required
+ #   args: {} # optional
+ # loader:
+ #   entrypoint: module.path:callable # required
+ #   args: {} # optional
+ #
+ # See also:
+ # - fs.reference.yaml
+ # - http.reference.yaml
+ # - synthetic.reference.yaml
+ # - foreach.reference.yaml
+ # - foreach.http.reference.yaml
@@ -0,0 +1,15 @@
+ # Synthetic source reference.
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # id: synthetic.ticks # required
+ #
+ # parser:
+ #   entrypoint: core.synthetic.ticks # required
+ #   args: {} # optional
+ #
+ # loader:
+ #   entrypoint: core.synthetic.ticks # required
+ #   args:
+ #     start: "${start_time}" # optional
+ #     end: "${end_time}" # optional
+ #     frequency: "${group_by}" # optional
@@ -0,0 +1,11 @@
+ # Metadata task reference (all options).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # version: 1 # optional
+ # kind: metadata
+ # name: metadata # optional (defaults to filename stem)
+ # enabled: true # optional
+ #
+ # output: metadata.json # optional; relative to project.paths.artifacts
+ # cadence_strategy: max # optional; currently only "max"
+ # window_mode: intersection # optional; union | intersection | strict | relaxed
@@ -0,0 +1,10 @@
+ # Scaler task reference (all options).
+ # This file is documentation only; uncomment the keys you want to use.
+ #
+ # version: 1 # optional
+ # kind: scaler
+ # name: scaler # optional (defaults to filename stem)
+ # enabled: true # optional
+ #
+ # output: scaler.pkl # optional; relative to project.paths.artifacts
+ # split_label: train # optional; split label from globals.split
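Note: the scaler task above writes `scaler.pkl` into the project's artifacts directory, fitted on the split named by `split_label`. A hedged sketch of what such an artifact could conceptually hold; the field names and layout here are assumptions for illustration, not the runtime's actual schema:

```python
import pickle
from statistics import mean, pstdev

# Per-feature mean/std fitted on the selected split, then pickled. Purely a
# conceptual stand-in for the scaler artifact described above.
def fit_scaler(columns: dict[str, list[float]]) -> dict[str, dict[str, float]]:
    return {
        feature_id: {"mean": mean(values), "std": pstdev(values)}
        for feature_id, values in columns.items()
    }

stats = fit_scaler({"time_linear": [0.0, 1.0, 2.0, 3.0]})
with open("scaler.pkl", "wb") as handle:
    pickle.dump(stats, handle)
print(stats)  # {'time_linear': {'mean': 1.5, 'std': 1.118...}}
```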