jerry-thomas 1.0.3__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +0 -1
- datapipeline/build/tasks/config.py +0 -2
- datapipeline/build/tasks/metadata.py +0 -2
- datapipeline/build/tasks/scaler.py +0 -2
- datapipeline/build/tasks/schema.py +0 -2
- datapipeline/build/tasks/utils.py +0 -2
- datapipeline/cli/app.py +201 -81
- datapipeline/cli/commands/contract.py +145 -283
- datapipeline/cli/commands/demo.py +13 -0
- datapipeline/cli/commands/domain.py +4 -4
- datapipeline/cli/commands/dto.py +11 -0
- datapipeline/cli/commands/filter.py +2 -2
- datapipeline/cli/commands/inspect.py +0 -68
- datapipeline/cli/commands/list_.py +30 -13
- datapipeline/cli/commands/loader.py +11 -0
- datapipeline/cli/commands/mapper.py +82 -0
- datapipeline/cli/commands/parser.py +45 -0
- datapipeline/cli/commands/run_config.py +1 -3
- datapipeline/cli/commands/serve_pipeline.py +5 -7
- datapipeline/cli/commands/source.py +106 -18
- datapipeline/cli/commands/stream.py +286 -0
- datapipeline/cli/visuals/common.py +0 -2
- datapipeline/cli/visuals/sections.py +0 -2
- datapipeline/cli/workspace_utils.py +0 -3
- datapipeline/config/context.py +0 -2
- datapipeline/config/dataset/feature.py +1 -0
- datapipeline/config/metadata.py +0 -2
- datapipeline/config/project.py +0 -2
- datapipeline/config/resolution.py +10 -2
- datapipeline/config/tasks.py +9 -9
- datapipeline/domain/feature.py +3 -0
- datapipeline/domain/record.py +7 -7
- datapipeline/domain/sample.py +0 -2
- datapipeline/domain/vector.py +6 -8
- datapipeline/integrations/ml/adapter.py +0 -2
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +0 -2
- datapipeline/integrations/ml/torch_support.py +0 -2
- datapipeline/io/output.py +0 -2
- datapipeline/io/serializers.py +26 -16
- datapipeline/mappers/synthetic/time.py +9 -2
- datapipeline/pipeline/artifacts.py +3 -5
- datapipeline/pipeline/observability.py +0 -2
- datapipeline/pipeline/pipelines.py +118 -34
- datapipeline/pipeline/stages.py +42 -17
- datapipeline/pipeline/utils/spool_cache.py +142 -0
- datapipeline/pipeline/utils/transform_utils.py +27 -2
- datapipeline/services/artifacts.py +1 -4
- datapipeline/services/constants.py +1 -0
- datapipeline/services/factories.py +4 -6
- datapipeline/services/project_paths.py +0 -2
- datapipeline/services/runs.py +0 -2
- datapipeline/services/scaffold/contract_yaml.py +76 -0
- datapipeline/services/scaffold/demo.py +141 -0
- datapipeline/services/scaffold/discovery.py +115 -0
- datapipeline/services/scaffold/domain.py +21 -13
- datapipeline/services/scaffold/dto.py +31 -0
- datapipeline/services/scaffold/filter.py +2 -1
- datapipeline/services/scaffold/layout.py +96 -0
- datapipeline/services/scaffold/loader.py +61 -0
- datapipeline/services/scaffold/mapper.py +116 -0
- datapipeline/services/scaffold/parser.py +56 -0
- datapipeline/services/scaffold/plugin.py +14 -2
- datapipeline/services/scaffold/source_yaml.py +91 -0
- datapipeline/services/scaffold/stream_plan.py +110 -0
- datapipeline/services/scaffold/utils.py +187 -0
- datapipeline/sources/data_loader.py +0 -2
- datapipeline/sources/decoders.py +49 -8
- datapipeline/sources/factory.py +9 -6
- datapipeline/sources/foreach.py +18 -3
- datapipeline/sources/synthetic/time/parser.py +1 -1
- datapipeline/sources/transports.py +10 -4
- datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
- datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
- datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
- datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
- datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
- datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
- datapipeline/templates/plugin_skeleton/README.md +57 -136
- datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
- datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
- datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
- datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +7 -10
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +14 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
- datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
- datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
- datapipeline/templates/stubs/dto.py.j2 +1 -1
- datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
- datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
- datapipeline/templates/stubs/mappers/ingest.py.j2 +17 -0
- datapipeline/templates/stubs/parser.py.j2 +4 -0
- datapipeline/templates/stubs/record.py.j2 +0 -1
- datapipeline/templates/stubs/source.yaml.j2 +1 -1
- datapipeline/transforms/debug/identity.py +34 -16
- datapipeline/transforms/debug/lint.py +14 -11
- datapipeline/transforms/feature/scaler.py +5 -12
- datapipeline/transforms/filter.py +73 -17
- datapipeline/transforms/interfaces.py +58 -0
- datapipeline/transforms/record/floor_time.py +10 -7
- datapipeline/transforms/record/lag.py +8 -10
- datapipeline/transforms/sequence.py +2 -3
- datapipeline/transforms/stream/dedupe.py +5 -7
- datapipeline/transforms/stream/ensure_ticks.py +39 -24
- datapipeline/transforms/stream/fill.py +34 -25
- datapipeline/transforms/stream/filter.py +25 -0
- datapipeline/transforms/stream/floor_time.py +16 -0
- datapipeline/transforms/stream/granularity.py +52 -30
- datapipeline/transforms/stream/lag.py +17 -0
- datapipeline/transforms/stream/rolling.py +72 -0
- datapipeline/transforms/utils.py +42 -10
- datapipeline/transforms/vector/drop/horizontal.py +0 -3
- datapipeline/transforms/vector/drop/orchestrator.py +0 -3
- datapipeline/transforms/vector/drop/vertical.py +0 -2
- datapipeline/transforms/vector/ensure_schema.py +0 -2
- datapipeline/utils/paths.py +0 -2
- datapipeline/utils/placeholders.py +0 -2
- datapipeline/utils/rich_compat.py +0 -3
- datapipeline/utils/window.py +0 -2
- jerry_thomas-2.0.0.dist-info/METADATA +282 -0
- jerry_thomas-2.0.0.dist-info/RECORD +264 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/WHEEL +1 -1
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/entry_points.txt +7 -3
- datapipeline/services/scaffold/mappers.py +0 -55
- datapipeline/services/scaffold/source.py +0 -191
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
- datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
- datapipeline/templates/stubs/mapper.py.j2 +0 -22
- jerry_thomas-1.0.3.dist-info/METADATA +0 -827
- jerry_thomas-1.0.3.dist-info/RECORD +0 -198
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/top_level.txt +0 -0
datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py
ADDED
@@ -0,0 +1,46 @@
+from datetime import datetime, timezone
+from typing import Any
+
+from datapipeline.sources.models.parser import DataParser
+
+from {{PACKAGE_NAME}}.dtos.sandbox_ohlcv_dto import SandboxOhlcvDTO
+
+
+def _parse_time(value: Any) -> datetime | None:
+    if isinstance(value, datetime):
+        if value.tzinfo is None:
+            return value.replace(tzinfo=timezone.utc)
+        return value
+    if isinstance(value, str):
+        try:
+            dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
+        except ValueError:
+            return None
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt
+    return None
+
+
+class SandboxOhlcvDTOParser(DataParser[SandboxOhlcvDTO]):
+    def parse(self, raw: Any) -> SandboxOhlcvDTO | None:
+        """
+        Convert one raw item (row/dict/tuple/record) into a SandboxOhlcvDTO.
+
+        - Return a DTO instance to keep the item, or None to drop it.
+        - Keep this logic thin and mirror your source data.
+        """
+        if not isinstance(raw, dict):
+            return None
+        parsed_time = _parse_time(raw.get("time"))
+        if parsed_time is None:
+            return None
+        return SandboxOhlcvDTO(
+            time=parsed_time,
+            open=float(raw["open"]),
+            high=float(raw["high"]),
+            low=float(raw["low"]),
+            close=float(raw["close"]),
+            volume=float(raw["volume"]),
+            symbol=str(raw["symbol"]),
+        )
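As a quick sanity check of the keep-or-drop contract this template parser implements, here is a minimal usage sketch. It assumes the skeleton has been rendered and installed as a plugin package named `myplugin` (a hypothetical name standing in for `{{PACKAGE_NAME}}`):

```python
# Hypothetical package name "myplugin" standing in for {{PACKAGE_NAME}}.
from myplugin.parsers.sandbox_ohlcv_dto_parser import SandboxOhlcvDTOParser

parser = SandboxOhlcvDTOParser()

# A well-formed row becomes a DTO; "Z" timestamps are normalized to UTC.
dto = parser.parse({
    "time": "2024-01-02T09:30:00Z",
    "open": 100.0, "high": 101.5, "low": 99.8,
    "close": 101.0, "volume": 12500, "symbol": "APPL",
})
assert dto is not None and dto.time.tzinfo is not None

# Malformed items are dropped (None) rather than raising.
assert parser.parse({"time": "not-a-date"}) is None
assert parser.parse(("not", "a", "dict")) is None
```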
datapipeline/templates/plugin_skeleton/README.md
CHANGED
@@ -1,142 +1,63 @@
 # {{DIST_NAME}}
 
-Minimal plugin skeleton for the Jerry Thomas (datapipeline)
-
-Quick start
-
-
--
-
-
-
-
--
-
-
-
-
-
-
-
-
-- `dataset.yaml` — feature/target declarations (uses `${group_by}` from globals)
-- `postprocess.yaml` — postprocess transforms
-- `contracts/*.yaml` — canonical stream definitions
-- `sources/*.yaml` — raw source definitions (one file per source)
-- `tasks/*.yaml` — task specs (schema/scaler/metadata/serve)
-- Every dataset `project.yaml` declares a `name`; reference it via `${project_name}`
-  inside other config files (e.g., `paths.artifacts: ../artifacts/${project_name}`) to
-  avoid hard-coding per-dataset directories.
-- `src/{{PACKAGE_NAME}}/`
-  - `sources/<provider>/<dataset>/dto.py` — DTO model for the source
-  - `sources/<provider>/<dataset>/parser.py` — parse raw → DTO
-  - Optional: `sources/<provider>/<dataset>/loader.py` for synthetic sources
-  - `domains/<domain>/model.py` — domain record models
-  - `mappers/*.py` — map DTOs → domain records
-
-How loaders work
-- For fs/http, sources use the generic loader entry point:
-  - `loader.entrypoint: "{{DEFAULT_IO_LOADER_EP}}"`
-  - `loader.args` include `transport`, `format`, and source-specific args (placeholders are provided):
-    - fs: `path`, `glob`, `encoding`, plus `delimiter` for csv
-    - http: `url`, `headers`, `params`, `encoding`, optional `count_by_fetch`
-- Synthetic sources generate data in-process and keep a small loader stub.
-
-Run data flows
-- Build artifacts once: `jerry build --project example/project.yaml`
-- Preview records (stage 1): `jerry serve --project example/project.yaml --stage 1 --limit 100`
-- Preview features (stage 3): `jerry serve --project example/project.yaml --stage 3 --limit 100`
-- Preview vectors (stage 7): `jerry serve --project example/project.yaml --stage 7 --limit 100`
-
-Analyze vectors
-- `jerry inspect report --project example/project.yaml` (console only)
-- `jerry inspect partitions --project example/project.yaml` (writes build/partitions.json)
-- `jerry inspect matrix --project example/project.yaml --format html` (writes build/matrix.html)
-- `jerry inspect expected --project example/project.yaml` (writes build/expected.txt)
-- Use post-processing transforms in `postprocess.yaml` to keep coverage high
-  (history/horizontal fills, constants, or drop rules) before serving vectors.
-  Add `payload: targets` inside a transform when you need to mutate label vectors.
-
-Train/Val/Test splits (deterministic)
-- Configure split mechanics once in your project file:
-  - Edit `example/project.yaml` and set:
-    ```yaml
-    globals:
-      group_by: 10m  # dataset cadence; reused as contract cadence
-      split:
-        mode: hash   # hash|time
-        key: group   # group or feature:<id> (entity-stable)
-        seed: 42     # deterministic hash seed
-        ratios: {train: 0.8, val: 0.1, test: 0.1}
-    ```
-- Select the active slice via `example/tasks/serve.<name>.yaml` (or `--keep`):
-  ```yaml
-  kind: serve
-  name: train        # defaults to filename stem when omitted
-  keep: train        # any label defined in globals.split; null disables filtering
-  output:
-    transport: stdout  # stdout | fs
-    format: print      # print | json-lines | json | csv | pickle
-  limit: 100         # cap vectors per serve run (null = unlimited)
-  throttle_ms: null  # sleep between vectors (milliseconds)
-  # visuals: AUTO    # AUTO | TQDM | RICH | OFF
-  # progress: AUTO   # AUTO | SPINNER | BARS | OFF
-  ```
-- Add additional `kind: serve` files (e.g., `serve.val.yaml`, `serve.test.yaml`) and the CLI will run each enabled file in order unless you pass `--run <name>`.
-- Serve examples (change the serve task or pass `--keep val|test`):
-  - `jerry serve -p example/project.yaml --out-transport stdout --out-format json-lines > train.jsonl`
-  - `jerry serve -p example/project.yaml --keep val --out-transport stdout --out-format json-lines > val.jsonl`
-  - Add `--visuals rich --progress bars` for a richer interactive UI; defaults to `AUTO`.
-- For shared workspace defaults (visual renderer, progress display, build mode), drop a `jerry.yaml` next to your workspace root and set `shared.visuals`, `shared.progress`, etc. CLI commands walk up from the current directory to find it.
-- The split is applied at the end (after postprocess transforms), and assignment
-  is deterministic (hash-based) with a fixed seed; no overlap across runs.
-
-Key selection guidance
-- `key: group` hashes the group key (commonly the time bucket). This yields a uniform random split per group but may allow the same entity to appear in multiple splits across time.
-- `key: feature:<id>` hashes a specific feature value, e.g., `feature:entity_id` or `feature:station_id`, ensuring all vectors for that entity land in the same split (recommended to avoid leakage).
-
-Postprocess expected IDs
-- Build once with `jerry build --project config/project.yaml` (or run `jerry inspect expected …`) to materialize `<paths.artifacts>/expected.txt`.
-- Bootstrap registers the artifact; postprocess transforms read it automatically. Per-transform `expected:` overrides are no longer required or supported — the build output is the single source of truth.
-
-Scaler statistics
-- Jerry computes scaler stats automatically. If you need custom paths or settings, add `tasks/scaler.yaml` and override the defaults.
-- The build writes `<paths.artifacts>/scaler.pkl`; runtime scaling requires this artifact. If it is missing, scaling transforms raise a runtime error.
-
-Tips
-- Keep parsers thin — mirror source schema and return DTOs; use the identity parser only if your loader already emits domain records.
-- Prefer small, composable configs over monolithic ones: one YAML per source is easier to review and reuse.
-
-Composed streams (engineered domains)
-- Declare engineered streams that depend on other canonical streams directly in contracts. The runtime builds each input to stage 4, stream-aligns by partition+timestamp, runs your composer, and emits fresh records for the derived stream.
-
-```yaml
-# example/contracts/air_density.processed.yaml
-kind: composed
-id: air_density.processed
-inputs:
-  - p=pressure.processed
-  - t=temp_dry.processed
-partition_by: station_id
-sort_batch_size: 20000
-
-mapper:
-  entrypoint: {{PACKAGE_NAME}}.mappers.air_density:mapper
-  args:
-    driver: p  # optional; defaults to first input alias
-
-# Optional post-compose policies (same as any stream):
-# record: [...]
-# stream: [...]
-# debug: [...]
+Minimal plugin skeleton for the Jerry Thomas (datapipeline) runtime.
+
+## Quick start
+
+```bash
+python -m pip install -U jerry-thomas
+
+jerry plugin init {{DIST_NAME}} --out .
+python -m pip install -e {{DIST_NAME}}
+
+# One-stop wizard: source YAML + DTO/parser + domain + mapper + contract.
+jerry inflow create
+
+# If a workspace-level `jerry.yaml` was created (fresh workspace), you can use the dataset alias:
+jerry serve --dataset your-dataset --limit 3
+#
+# If you already had a workspace `jerry.yaml`, `jerry plugin init` will not overwrite it.
+# In that case, either add a dataset alias to your existing `jerry.yaml` or pass `--project`:
+# jerry serve --project your-dataset/project.yaml --limit 3
 ```
 
-
+## After scaffolding: what you must edit
+
+- `your-dataset/sources/*.yaml`
+  - Replace placeholders (`path`/`url`, headers/params, delimiter, etc.)
+- `your-dataset/dataset.yaml`
+  - Ensure `record_stream:` points at the contract id you created.
+  - Select a `field:` for each feature/target (record attribute to use as value).
+  - Ensure `group_by` matches `^\d+(m|min|h|d)$` (e.g. `10m`, `1h`, `1d`).
+
+If you add/edit entry points in `pyproject.toml`, reinstall the plugin:
 
-```
-
-group_by: ${group_by}
-features:
-  - id: air_density
-    record_stream: air_density.processed
+```bash
+python -m pip install -e .
 ```
+
+## Folder layout
+
+YAML config (dataset project root):
+
+- `your-dataset/`
+  - `project.yaml` (paths, globals, split)
+  - `sources/*.yaml` (raw source definitions)
+  - `contracts/*.yaml` (canonical streams)
+  - `dataset.yaml` (features/targets)
+  - `postprocess.yaml` (vector-level transforms)
+  - `tasks/*.yaml` (serve/build tasks; optional overrides)
+
+Python plugin code:
+
+- `src/{{PACKAGE_NAME}}/`
+  - `dtos/` (DTO models)
+  - `parsers/` (raw -> DTO)
+  - `domains/<domain>/model.py` (domain record models)
+  - `mappers/` (DTO -> domain records)
+  - `loaders/` (optional custom loaders)
+
+## Learn more
+
+- Pipeline stages and split/build timing: the Jerry Thomas runtime `README.md` ("Pipeline Stages (serve --stage)").
+- Deep dives: runtime `docs/config.md`, `docs/transforms.md`, `docs/artifacts.md`, `docs/extending.md`, `docs/architecture.md`.
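The split section removed from this README survives in `reference/project.yaml` further below. Its key property is that slice assignment is a pure function of the hashed key, the seed, and the ratios, so reruns can never shuffle rows across slices. A minimal sketch of that idea (illustrative only, not the package's implementation):

```python
import hashlib

def assign_split(key: str, seed: int, ratios: dict[str, float]) -> str:
    """Map a hash key deterministically onto a labeled slice."""
    digest = hashlib.sha256(f"{seed}:{key}".encode()).digest()
    point = int.from_bytes(digest[:8], "big") / 2**64  # uniform in [0, 1)
    cumulative = 0.0
    for label, ratio in ratios.items():
        cumulative += ratio
        if point < cumulative:
            return label
    return label  # guard against float rounding on the last slice

ratios = {"train": 0.8, "val": 0.1, "test": 0.1}
# Same key and seed always land in the same slice across runs.
assert assign_split("station-42", 42, ratios) == assign_split("station-42", 42, ratios)
```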
datapipeline/templates/plugin_skeleton/jerry.yaml
CHANGED
@@ -1,34 +1,22 @@
-#
-
-
-# Relative path from this workspace file back to the plugin root.
-plugin_root: .  # e.g., "lib/myplugin" if your plugin lives under lib/
-
-# Dataset aliases for `--dataset`; values may be dirs (auto-append project.yaml).
+# See reference/jerry.yaml for full options and explanations.
+plugin_root: .
 datasets:
-
-
+  your-dataset: your-dataset/project.yaml
+  interim-builder: your-interim-data-builder/project.yaml  # use this to build interim data used by other datasets
 
-
-default_dataset: example
+default_dataset: your-dataset
 
-# Shared fallbacks used by all commands (unless overridden).
 shared:
-  visuals: AUTO
-  progress: BARS
-  log_level: INFO
+  visuals: AUTO
+  progress: BARS
+  log_level: INFO
 
-# Defaults for `jerry serve` (run-time options).
 serve:
-
-
-  stage: null  # Preview a specific stage; null runs the full pipeline
+  limit: null
+  stage: null
   output:
     transport: stdout
-    format: print
-    # directory: artifacts/serve  # Required when transport=fs
+    format: print
 
-# Defaults for `jerry build` (artifact materialization).
 build:
-
-  mode: AUTO  # AUTO | FORCE | OFF
+  mode: AUTO
datapipeline/templates/plugin_skeleton/reference/jerry.yaml
ADDED
@@ -0,0 +1,28 @@
+# Jerry workspace config reference (all options).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# plugin_root: ./path/to/your/plugin  # optional
+#
+# datasets:  # optional
+#   default: example/project.yaml  # optional (relative to jerry.yaml)
+# default_dataset: default  # optional (must be a key in datasets)
+#
+# shared:  # optional
+#   visuals: AUTO  # optional; AUTO | TQDM | RICH | OFF
+#   progress: AUTO  # optional; AUTO | SPINNER | BARS | OFF
+#   log_level: INFO  # optional; CRITICAL | ERROR | WARNING | INFO | DEBUG
+#
+# serve:  # optional
+#   log_level: INFO  # optional
+#   limit: 100  # optional
+#   stage: 8  # optional
+#   throttle_ms: 0  # optional
+#   output:  # optional
+#     transport: stdout  # optional; stdout | fs
+#     format: json-lines  # optional; stdout: print | json-lines | json
+#     payload: sample  # optional; sample | vector
+#     # directory: artifacts/serve  # optional; fs only; relative to jerry.yaml
+#
+# build:  # optional
+#   log_level: INFO  # optional
+#   mode: AUTO  # optional; AUTO | FORCE | OFF (false -> OFF)
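The old README notes that CLI commands "walk up from the current directory" to locate this workspace file. A minimal sketch of that discovery rule (illustrative, not the CLI's actual code):

```python
from pathlib import Path

def find_workspace_config(start: Path | None = None, name: str = "jerry.yaml") -> Path | None:
    """Return the nearest jerry.yaml at or above `start`, else None."""
    start = (start or Path.cwd()).resolve()
    for directory in (start, *start.parents):
        candidate = directory / name
        if candidate.is_file():
            return candidate
    return None
```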
datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml
ADDED
@@ -0,0 +1,29 @@
+# Composed contract reference (kind: composed).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# kind: composed
+# id: domain.dataset.variant
+# inputs:  # required (list of stream ids or alias=stream)
+#   - alias=upstream.stream.id
+#   - other.stream.id
+# mapper:  # optional (defaults to identity)
+#   entrypoint: my_composer
+#   args: {}
+# cadence: ${group_by}  # optional per-contract variable for interpolation
+# partition_by: station_id  # optional; string or list of strings
+# sort_batch_size: 100000  # optional; in-memory chunk size for sorting
+#
+# record transforms (one-key mappings; optional):
+# - filter: { field: time, operator: ge, comparand: "${start_time}" }
+# - floor_time: { cadence: "${cadence}" }
+# - lag: { lag: "${cadence}" }
+#
+# stream transforms (one-key mappings; optional; operate on record fields):
+# - dedupe: {}
+# - granularity: { field: close, to: close, mode: first }
+# - ensure_cadence: { field: close, to: close, cadence: "${cadence}" }
+# - fill: { field: close, to: close, statistic: median, window: 6, min_samples: 1 }
+#
+# debug transforms (one-key mappings; optional):
+# - lint: { mode: error, tick: "${cadence}" }
+# - identity: {}
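The removed README showed a composed contract deriving `air_density.processed` from `p=pressure.processed` and `t=temp_dry.processed`, with the runtime stream-aligning inputs by partition and timestamp before calling the composer. The exact composer signature is not visible in this diff; the sketch below only illustrates the kind of per-timestamp computation such a mapper performs:

```python
from dataclasses import dataclass
from datetime import datetime

@dataclass
class Record:        # stand-in for a domain record model
    time: datetime
    value: float

R_DRY_AIR = 287.05   # J/(kg*K), specific gas constant of dry air

def air_density(p: Record, t: Record) -> Record:
    """rho = P / (R_d * T); pressure in Pa, temperature in degrees C."""
    return Record(time=p.time, value=p.value / (R_DRY_AIR * (t.value + 273.15)))
```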
datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml
ADDED
@@ -0,0 +1,31 @@
+# Ingest contract reference (kind: ingest).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# kind: ingest
+# id: domain.dataset.variant
+# source: source.alias  # required
+# mapper:  # optional (defaults to identity)
+#   entrypoint: my_mapper
+#   args: {}
+# cadence: ${group_by}  # optional per-contract variable for interpolation
+# partition_by: station_id  # optional; string or list of strings
+# sort_batch_size: 100000  # optional; in-memory chunk size for sorting
+#
+# record transforms (one-key mappings; optional):
+# - filter: { field: time, operator: ge, comparand: "${start_time}" }
+# - floor_time: { cadence: "${cadence}" }
+# - lag: { lag: "${cadence}" }
+#
+# stream transforms (one-key mappings; optional; operate on record fields):
+# - floor_time: { cadence: "${cadence}" }
+# - lag: { lag: "${cadence}" }
+# - filter: { field: close, operator: ge, comparand: 1000000 }
+# - dedupe: {}
+# - granularity: { field: close, to: close, mode: first }
+# - ensure_cadence: { field: close, to: close, cadence: "${cadence}" }
+# - rolling: { field: dollar_volume, to: adv60, window: 60, statistic: mean, min_samples: 60 }
+# - fill: { field: close, to: close, statistic: median, window: 6, min_samples: 1 }
+#
+# debug transforms (one-key mappings; optional):
+# - lint: { mode: error, tick: "${cadence}" }
+# - identity: {}
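The `rolling` entry above writes a trailing-window statistic (here a 60-sample mean of `dollar_volume`) to a new field, holding back output until `min_samples` values have been seen. A minimal sketch of that semantics (illustrative only, not the package's transform):

```python
from collections import deque
from typing import Iterable, Iterator

def rolling_mean(values: Iterable[float], window: int, min_samples: int) -> Iterator[float | None]:
    """Trailing-window mean; None until min_samples values are buffered."""
    buf: deque[float] = deque(maxlen=window)
    for v in values:
        buf.append(v)
        yield sum(buf) / len(buf) if len(buf) >= min_samples else None

assert list(rolling_mean([1, 2, 3, 4], window=3, min_samples=2)) == [None, 1.5, 2.0, 3.0]
```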
datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml
ADDED
@@ -0,0 +1,34 @@
+# Contract config reference (overview).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# kind: ingest | composed
+# id: domain.dataset.variant
+# source: source.alias  # required when kind: ingest
+# inputs: [stream.id]  # required when kind: composed
+# mapper:  # optional (defaults to identity)
+#   entrypoint: my_mapper
+#   args: {}
+# cadence: ${group_by}  # optional per-contract variable for interpolation
+# partition_by: station_id  # optional; string or list of strings
+# sort_batch_size: 100000  # optional; in-memory chunk size for sorting
+#
+# record transforms (one-key mappings; optional):
+# - filter: { field: time, operator: ge, comparand: "${start_time}" }
+#   # operator: eq|ne|lt|le|gt|ge|in|nin (aliases: ==, !=, >=, <=, etc.)
+# - floor_time: { cadence: "${cadence}" }
+# - lag: { lag: "${cadence}" }
+#
+# stream transforms (one-key mappings; optional):
+# - dedupe: {}
+# - granularity: { field: close, to: close, mode: first }  # first | last | mean | median
+# - ensure_cadence: { field: close, to: close, cadence: "${cadence}" }
+# - fill: { field: close, to: close, statistic: median, window: 6, min_samples: 1 }
+#   # statistic: mean | median; window must be > 0
+#
+# debug transforms (one-key mappings; optional):
+# - lint: { mode: error, tick: "${cadence}" }  # mode: warn | error
+# - identity: {}
+#
+# See also:
+# - ingest.reference.yaml
+# - composed.reference.yaml
datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml
ADDED
@@ -0,0 +1,29 @@
+# Dataset config reference (all options).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# Feature/vector stages require group_by.
+# group_by must match ^\d+(m|min|h|d)$ (e.g., 10m, 1h, 1d).
+#
+# group_by: ${group_by}  # required for feature/vector stages
+#
+# features:  # optional
+# - id: time_linear
+#   record_stream: time.ticks.linear
+#   field: value
+#   scale: true  # optional; true | false | mapping (see below)
+#   # scale:
+#   #   model_path: ../artifacts/example/v1/scaler.pkl
+#   #   with_mean: true
+#   #   with_std: true
+#   #   epsilon: 1.0e-12
+#   #   on_none: skip  # skip | error
+#   sequence: { size: 6, stride: 1 }  # optional
+#
+# targets:  # optional
+# - id: some_target
+#   record_stream: time.ticks.linear
+#   field: value
+#   scale: false  # optional
+#   sequence: null  # optional
+#
+# Record-only stage uses only record_stream entries; id/field/scale/sequence are ignored.
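The `scale:` mapping above exposes `with_mean`, `with_std`, and `epsilon`, which suggests a standard z-score transform backed by statistics fitted at build time (`scaler.pkl`). A hedged sketch of the arithmetic those knobs imply, not the package's actual transform:

```python
def standardize(x: float, mean: float, std: float,
                with_mean: bool = True, with_std: bool = True,
                epsilon: float = 1.0e-12) -> float:
    """z-score with an epsilon guard against zero variance."""
    if with_mean:
        x -= mean
    if with_std:
        x /= std + epsilon
    return x

assert standardize(3.0, mean=1.0, std=2.0) == (3.0 - 1.0) / (2.0 + 1.0e-12)
```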
datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml
ADDED
@@ -0,0 +1,25 @@
+# Postprocess config reference (vector transforms).
+# This file is documentation only; uncomment the keys you want to use.
+# Each list item is a one-key mapping: <transform_name>: <params>.
+#
+# - drop:
+#     axis: vertical  # optional; horizontal | vertical
+#     payload: targets  # optional; features | targets | both (both only for horizontal)
+#     threshold: 0.9  # required; 0.0 - 1.0
+#     only: [feature_id]  # optional
+#     exclude: [feature_id]  # optional
+#
+# - fill:
+#     statistic: median  # optional; mean | median
+#     window: 48  # optional; rolling window size
+#     min_samples: 6  # optional
+#     payload: features  # optional; features | targets | both
+#     only: [feature_id]  # optional
+#     exclude: [feature_id]  # optional
+#
+# - replace:
+#     value: 0.0  # required
+#     payload: targets  # optional; features | targets | both
+#     target: null  # optional; replace only when value equals target; default replaces missing
+#     only: [feature_id]  # optional
+#     exclude: [feature_id]  # optional
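One plausible reading of the vertical `drop` rule above: a vector is kept only while the share of present values meets `threshold`. The actual transform lives in `datapipeline/transforms/vector/drop/vertical.py`; this sketch just illustrates the coverage test:

```python
def keep_vector(values: list[float | None], threshold: float) -> bool:
    """Keep a vector whose fraction of present values is >= threshold."""
    present = sum(v is not None for v in values)
    return present / len(values) >= threshold

assert keep_vector([1.0, None, 3.0, 4.0], threshold=0.7) is True    # 75% coverage
assert keep_vector([1.0, None, None, 4.0], threshold=0.7) is False  # 50% coverage
```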
datapipeline/templates/plugin_skeleton/reference/reference/project.yaml
ADDED
@@ -0,0 +1,32 @@
+# Project config reference (all options).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# version: 1  # optional
+# name: example  # optional
+# paths:
+#   streams: ./contracts  # required
+#   sources: ./sources  # required
+#   dataset: dataset.yaml  # required
+#   postprocess: postprocess.yaml  # required
+#   artifacts: ../artifacts/${project_name}/v${version}  # required
+#   tasks: ./tasks  # optional
+# globals:  # optional; available via ${var}
+#   group_by: 1h  # optional; dataset cadence
+#   start_time: 2021-01-01T00:00:00Z  # optional; used in contracts
+#   end_time: 2021-01-02T00:00:00Z  # optional; used in contracts
+#   split:  # optional; applied at serve time after postprocess
+#     mode: hash  # hash | time
+#     ratios: { train: 0.8, val: 0.1, test: 0.1 }  # must sum to 1.0
+#     seed: 42  # deterministic hash seed
+#     key: group  # group | feature:<id>
+#
+# Time-based split (labels length must be len(boundaries) + 1):
+# globals:
+#   split:
+#     mode: time
+#     boundaries:
+#       - 2021-01-01T00:00:00Z  # first cutover (train -> val)
+#       - 2021-01-02T00:00:00Z  # second cutover (val -> test)
+#     labels: [train, val, test]
+#
+# Any extra keys under globals are allowed and can be referenced via ${var}.
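For `mode: time`, the reference above requires `len(labels) == len(boundaries) + 1`: the boundaries are cutovers, and a timestamp's position among them selects the label. A minimal sketch of that rule (illustrative only):

```python
from bisect import bisect_right
from datetime import datetime, timezone

def time_split(ts: datetime, boundaries: list[datetime], labels: list[str]) -> str:
    """Pick the label for the interval that ts falls into."""
    assert len(labels) == len(boundaries) + 1
    return labels[bisect_right(boundaries, ts)]

cutovers = [datetime(2021, 1, 1, tzinfo=timezone.utc),
            datetime(2021, 1, 2, tzinfo=timezone.utc)]
labels = ["train", "val", "test"]
assert time_split(datetime(2020, 12, 31, tzinfo=timezone.utc), cutovers, labels) == "train"
assert time_split(datetime(2021, 1, 1, 12, tzinfo=timezone.utc), cutovers, labels) == "val"
```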
datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml
ADDED
@@ -0,0 +1,24 @@
+# Foreach + HTTP loader reference (core.foreach + core.io).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# id: provider.dataset  # required
+# parser:
+#   entrypoint: my_pkg.sources.provider.dataset:parse  # required
+#   args: {}  # optional
+# loader:
+#   entrypoint: core.foreach
+#   args:
+#     foreach:
+#       page: [1, 2, 3]  # required; exactly one key; list of values
+#     loader:
+#       entrypoint: {{DEFAULT_IO_LOADER_EP}}
+#       args:
+#         transport: http
+#         format: json-lines
+#         url: "https://example/api?page=${page}"  # required
+#         headers: { Authorization: "Bearer ..." }  # optional
+#         params: {}  # optional
+#         encoding: utf-8  # optional
+#         count_by_fetch: false  # optional
+#     # inject_field: page  # optional
+#     # throttle_seconds: 0  # optional
datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml
ADDED
@@ -0,0 +1,21 @@
+# Foreach loader reference (core.foreach).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# id: provider.dataset  # required
+# parser:
+#   entrypoint: my_pkg.sources.provider.dataset:parse  # required
+#   args: {}  # optional
+# loader:
+#   entrypoint: core.foreach
+#   args:
+#     foreach:
+#       month: ["2024-01", "2024-02"]  # required; exactly one key; list of values
+#     loader:
+#       entrypoint: {{DEFAULT_IO_LOADER_EP}}
+#       args:
+#         transport: fs
+#         format: csv  # required
+#         path: ./data/${month}.csv  # required
+#     # inject_field: month  # optional; mapping output only
+#     # inject: { month: "${month}" }  # optional; mapping output only
+#     # throttle_seconds: 0  # optional
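The foreach configs above fan one nested loader out over a single key's values, substituting `${month}` (or `${page}`) into the nested loader's args per run. A minimal sketch of that expansion (illustrative only, not `core.foreach` itself):

```python
from copy import deepcopy

def expand_foreach(foreach: dict[str, list], loader_args: dict) -> list[dict]:
    """One nested-loader arg set per value of the single foreach key."""
    (key, values), = foreach.items()  # exactly one key is allowed
    runs = []
    for value in values:
        args = deepcopy(loader_args)
        for k, v in args.items():
            if isinstance(v, str):
                args[k] = v.replace(f"${{{key}}}", str(value))
        runs.append(args)
    return runs

runs = expand_foreach(
    {"month": ["2024-01", "2024-02"]},
    {"transport": "fs", "format": "csv", "path": "./data/${month}.csv"},
)
assert runs[0]["path"] == "./data/2024-01.csv"
assert runs[1]["path"] == "./data/2024-02.csv"
```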
datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml
ADDED
@@ -0,0 +1,16 @@
+# FS loader reference (generic I/O loader).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# id: provider.dataset  # required
+# parser:
+#   entrypoint: my_pkg.sources.provider.dataset:parse  # required
+#   args: {}  # optional
+# loader:
+#   entrypoint: {{DEFAULT_IO_LOADER_EP}}
+#   args:
+#     transport: fs
+#     format: csv  # required; csv | json | json-lines | pickle
+#     path: ./data/file.csv  # optional (use path or glob)
+#     glob: ./data/*.csv  # optional (use path or glob)
+#     encoding: utf-8  # optional
+#     delimiter: ","  # optional; csv only
datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml
ADDED
@@ -0,0 +1,17 @@
+# HTTP loader reference (generic I/O loader).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# id: provider.dataset  # required
+# parser:
+#   entrypoint: my_pkg.sources.provider.dataset:parse  # required
+#   args: {}  # optional
+# loader:
+#   entrypoint: {{DEFAULT_IO_LOADER_EP}}
+#   args:
+#     transport: http
+#     format: json-lines  # required; json | json-lines | csv
+#     url: https://example/api  # required
+#     params: { key: value }  # optional
+#     headers: { Authorization: "Bearer ..." }  # optional
+#     encoding: utf-8  # optional
+#     count_by_fetch: false  # optional
datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml
ADDED
@@ -0,0 +1,18 @@
+# Source config reference (overview).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# Required shape:
+# id: provider.dataset  # required
+# parser:
+#   entrypoint: module.path:callable  # required
+#   args: {}  # optional
+# loader:
+#   entrypoint: module.path:callable  # required
+#   args: {}  # optional
+#
+# See also:
+# - fs.reference.yaml
+# - http.reference.yaml
+# - synthetic.reference.yaml
+# - foreach.reference.yaml
+# - foreach.http.reference.yaml
datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml
ADDED
@@ -0,0 +1,15 @@
+# Synthetic source reference.
+# This file is documentation only; uncomment the keys you want to use.
+#
+# id: synthetic.ticks  # required
+#
+# parser:
+#   entrypoint: core.synthetic.ticks  # required
+#   args: {}  # optional
+#
+# loader:
+#   entrypoint: core.synthetic.ticks  # required
+#   args:
+#     start: "${start_time}"  # optional
+#     end: "${end_time}"  # optional
+#     frequency: "${group_by}"  # optional
datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml
ADDED
@@ -0,0 +1,11 @@
+# Metadata task reference (all options).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# version: 1  # optional
+# kind: metadata
+# name: metadata  # optional (defaults to filename stem)
+# enabled: true  # optional
+#
+# output: metadata.json  # optional; relative to project.paths.artifacts
+# cadence_strategy: max  # optional; currently only "max"
+# window_mode: intersection  # optional; union | intersection | strict | relaxed
datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml
ADDED
@@ -0,0 +1,10 @@
+# Scaler task reference (all options).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# version: 1  # optional
+# kind: scaler
+# name: scaler  # optional (defaults to filename stem)
+# enabled: true  # optional
+#
+# output: scaler.pkl  # optional; relative to project.paths.artifacts
+# split_label: train  # optional; split label from globals.split