jerry-thomas 1.0.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +0 -1
- datapipeline/build/tasks/config.py +0 -2
- datapipeline/build/tasks/metadata.py +0 -2
- datapipeline/build/tasks/scaler.py +0 -2
- datapipeline/build/tasks/schema.py +0 -2
- datapipeline/build/tasks/utils.py +0 -2
- datapipeline/cli/app.py +201 -81
- datapipeline/cli/commands/contract.py +145 -283
- datapipeline/cli/commands/demo.py +13 -0
- datapipeline/cli/commands/domain.py +4 -4
- datapipeline/cli/commands/dto.py +11 -0
- datapipeline/cli/commands/filter.py +2 -2
- datapipeline/cli/commands/inspect.py +0 -68
- datapipeline/cli/commands/list_.py +30 -13
- datapipeline/cli/commands/loader.py +11 -0
- datapipeline/cli/commands/mapper.py +82 -0
- datapipeline/cli/commands/parser.py +45 -0
- datapipeline/cli/commands/run_config.py +1 -3
- datapipeline/cli/commands/serve_pipeline.py +5 -7
- datapipeline/cli/commands/source.py +106 -18
- datapipeline/cli/commands/stream.py +292 -0
- datapipeline/cli/visuals/common.py +0 -2
- datapipeline/cli/visuals/sections.py +0 -2
- datapipeline/cli/workspace_utils.py +0 -3
- datapipeline/config/context.py +0 -2
- datapipeline/config/dataset/feature.py +1 -0
- datapipeline/config/metadata.py +0 -2
- datapipeline/config/project.py +0 -2
- datapipeline/config/resolution.py +10 -2
- datapipeline/config/tasks.py +9 -9
- datapipeline/domain/feature.py +3 -0
- datapipeline/domain/record.py +7 -7
- datapipeline/domain/sample.py +0 -2
- datapipeline/domain/vector.py +6 -8
- datapipeline/integrations/ml/adapter.py +0 -2
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +0 -2
- datapipeline/integrations/ml/torch_support.py +0 -2
- datapipeline/io/output.py +0 -2
- datapipeline/io/serializers.py +26 -16
- datapipeline/mappers/synthetic/time.py +9 -2
- datapipeline/pipeline/artifacts.py +3 -5
- datapipeline/pipeline/observability.py +0 -2
- datapipeline/pipeline/pipelines.py +118 -34
- datapipeline/pipeline/stages.py +54 -18
- datapipeline/pipeline/utils/spool_cache.py +142 -0
- datapipeline/pipeline/utils/transform_utils.py +27 -2
- datapipeline/services/artifacts.py +1 -4
- datapipeline/services/constants.py +1 -0
- datapipeline/services/factories.py +4 -6
- datapipeline/services/paths.py +10 -1
- datapipeline/services/project_paths.py +0 -2
- datapipeline/services/runs.py +0 -2
- datapipeline/services/scaffold/contract_yaml.py +76 -0
- datapipeline/services/scaffold/demo.py +141 -0
- datapipeline/services/scaffold/discovery.py +115 -0
- datapipeline/services/scaffold/domain.py +21 -13
- datapipeline/services/scaffold/dto.py +31 -0
- datapipeline/services/scaffold/filter.py +2 -1
- datapipeline/services/scaffold/layout.py +96 -0
- datapipeline/services/scaffold/loader.py +61 -0
- datapipeline/services/scaffold/mapper.py +116 -0
- datapipeline/services/scaffold/parser.py +56 -0
- datapipeline/services/scaffold/plugin.py +14 -2
- datapipeline/services/scaffold/source_yaml.py +91 -0
- datapipeline/services/scaffold/stream_plan.py +129 -0
- datapipeline/services/scaffold/utils.py +187 -0
- datapipeline/sources/data_loader.py +0 -2
- datapipeline/sources/decoders.py +49 -8
- datapipeline/sources/factory.py +9 -6
- datapipeline/sources/foreach.py +18 -3
- datapipeline/sources/synthetic/time/parser.py +1 -1
- datapipeline/sources/transports.py +10 -4
- datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
- datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
- datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
- datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
- datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
- datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
- datapipeline/templates/plugin_skeleton/README.md +57 -136
- datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
- datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
- datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
- datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +9 -11
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +15 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
- datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
- datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -2
- datapipeline/templates/stubs/filter.py.j2 +1 -1
- datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
- datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
- datapipeline/templates/stubs/mappers/ingest.py.j2 +20 -0
- datapipeline/templates/stubs/parser.py.j2 +5 -1
- datapipeline/templates/stubs/record.py.j2 +1 -1
- datapipeline/templates/stubs/source.yaml.j2 +1 -1
- datapipeline/transforms/debug/identity.py +34 -16
- datapipeline/transforms/debug/lint.py +14 -11
- datapipeline/transforms/feature/scaler.py +5 -12
- datapipeline/transforms/filter.py +73 -17
- datapipeline/transforms/interfaces.py +58 -0
- datapipeline/transforms/record/floor_time.py +10 -7
- datapipeline/transforms/record/lag.py +8 -10
- datapipeline/transforms/sequence.py +2 -3
- datapipeline/transforms/stream/dedupe.py +5 -7
- datapipeline/transforms/stream/ensure_ticks.py +39 -24
- datapipeline/transforms/stream/fill.py +34 -25
- datapipeline/transforms/stream/filter.py +25 -0
- datapipeline/transforms/stream/floor_time.py +16 -0
- datapipeline/transforms/stream/granularity.py +52 -30
- datapipeline/transforms/stream/lag.py +17 -0
- datapipeline/transforms/stream/rolling.py +72 -0
- datapipeline/transforms/utils.py +42 -10
- datapipeline/transforms/vector/drop/horizontal.py +0 -3
- datapipeline/transforms/vector/drop/orchestrator.py +0 -3
- datapipeline/transforms/vector/drop/vertical.py +0 -2
- datapipeline/transforms/vector/ensure_schema.py +0 -2
- datapipeline/utils/paths.py +0 -2
- datapipeline/utils/placeholders.py +0 -2
- datapipeline/utils/rich_compat.py +0 -3
- datapipeline/utils/window.py +0 -2
- jerry_thomas-2.0.1.dist-info/METADATA +269 -0
- jerry_thomas-2.0.1.dist-info/RECORD +264 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/WHEEL +1 -1
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/entry_points.txt +7 -3
- datapipeline/services/scaffold/mappers.py +0 -55
- datapipeline/services/scaffold/source.py +0 -191
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
- datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
- datapipeline/templates/stubs/mapper.py.j2 +0 -22
- jerry_thomas-1.0.3.dist-info/METADATA +0 -827
- jerry_thomas-1.0.3.dist-info/RECORD +0 -198
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,10 @@
+# Schema task reference (all options).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# version: 1 # optional
+# kind: schema
+# name: schema # optional (defaults to filename stem)
+# enabled: true # optional
+#
+# output: schema.json # optional; relative to project.paths.artifacts
+# cadence_strategy: max # optional; currently only "max"
@@ -0,0 +1,28 @@
+# Serve task reference (all options).
+# This file is documentation only; uncomment the keys you want to use.
+#
+# version: 1 # optional
+# kind: serve
+# name: train # optional (defaults to filename stem)
+# enabled: true # optional
+#
+# keep: train # optional; split label from globals.split (null disables filtering)
+#
+# output: # optional; omit to use CLI defaults
+#   transport: stdout # stdout | fs
+#   format: json-lines # stdout: print | json-lines | json
+#   payload: sample # sample | vector
+#   # fs transport only:
+#   # transport: fs
+#   # format: csv # csv | json | json-lines | pickle
+#   # payload: vector
+#   # directory: artifacts/serve
+#   # filename: vectors.train # no extension, no path separators
+#
+# limit: 100 # optional; null = unlimited
+# stage: 8 # optional; 0-8; null lets CLI decide
+# throttle_ms: 0 # optional; milliseconds; null disables
+#
+# log_level: INFO # optional; CRITICAL | ERROR | WARNING | INFO | DEBUG
+# visuals: AUTO # optional; AUTO | TQDM | RICH | OFF (false -> OFF)
+# progress: AUTO # optional; AUTO | SPINNER | BARS | OFF
@@ -0,0 +1 @@
+"""Stream mappers (DTO -> domain records)."""
@@ -1,18 +1,19 @@
+# See ../reference/reference/dataset.yaml for full options.
+
 group_by: ${group_by}
 
 features:
-  - id:
-    record_stream:
-
-
+  - id: first_feature
+    record_stream: your.stream.one
+    field: some_field
+    scale: true
     sequence: { size: 6, stride: 1 }
 
-  - id:
-    record_stream:
+  - id: second_feature
+    record_stream: your.stream.two
+    field: some_field
 
-# - id: third_feature
-#   record_stream: anotherstream
 # targets:
-#   - id:
-#     record_stream:
-
+#   - id: target_feature
+#     record_stream: your.target.stream
+#     field: some_field
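Note: the `sequence: { size: 6, stride: 1 }` knob implies sliding-window batching of a feature's records. How jerry-thomas builds the windows is not visible in this diff; below is a minimal sketch of size/stride semantics only, with all names invented.

from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")

def windows(stream: Iterable[T], size: int, stride: int) -> Iterator[list[T]]:
    # Yield lists of `size` items; each window starts `stride` items
    # after the previous one (stride=1 -> maximal overlap).
    buf: list[T] = []
    for item in stream:
        buf.append(item)
        if len(buf) == size:
            yield list(buf)
            del buf[:stride]

# size=6, stride=1 over 0..7 -> [0..5], [1..6], [2..7]
print(list(windows(range(8), size=6, stride=1)))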
@@ -1,16 +1,15 @@
-
-
-- drop: # example of dropping sparse partitions/vertical-axis for targets
+# See ../reference/reference/postprocess.yaml for full options.
+- drop:
     axis: vertical
     payload: targets
     threshold: 0.9
 
-- drop:
+- drop:
     axis: vertical
     payload: features
     threshold: 0.9
 
-- drop:
+- drop:
     axis: horizontal
     payload: features
     threshold: 1
@@ -19,11 +18,3 @@
     axis: horizontal
     payload: targets
     threshold: 1
-######
-# - fill:
-#     statistic: median
-#     window: 48
-#     min_samples: 6
-# - replace:
-#     payload: targets
-#     value: 0.0
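Note: the actual drop semantics live in datapipeline/transforms/vector/drop/ (vertical.py, horizontal.py, orchestrator.py), which this diff only touches trivially. One plausible reading of `threshold`, purely illustrative and not the package's code:

from typing import Optional

def missing_ratio(values: list[Optional[float]]) -> float:
    return sum(v is None for v in values) / len(values)

def drop_sparse_columns(
    columns: dict[str, list[Optional[float]]], threshold: float
) -> dict[str, list[Optional[float]]]:
    # Hypothetical rule: a column (vertical axis) is dropped once its
    # missing-value share reaches `threshold`; the package's rule may differ.
    return {name: vals for name, vals in columns.items()
            if missing_ratio(vals) < threshold}

data = {"a": [1.0, None, 3.0], "b": [None, None, None]}
print(drop_sparse_columns(data, threshold=0.9))  # keeps "a", drops "b"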
@@ -1,3 +1,4 @@
+# See ../reference/reference/project.yaml for full options.
 version: 1
 name: <your-dataset>
 paths:
@@ -7,16 +8,13 @@ paths:
   postprocess: postprocess.yaml
   artifacts: ../artifacts/${project_name}/v${version}
   tasks: ./tasks
-globals:
-  #
-
-
-
-  end_time: null #2021-01-02T00:00:00Z
-  # Configure deterministic dataset split here (applied at serve time, after postprocess).
-  # Adjust `ratios` as needed; the active split is selected via serve tasks or CLI.
+globals:
+  # TODO: Set your grouping cadence (must match ^\d+(m|min|h|d)$).
+  group_by: 1h
+  start_time: null
+  end_time: null
   split:
-  mode: hash
-  key: group
-  seed: 42
+    mode: hash
+    key: group
+    seed: 42
     ratios: { train: 0.8, val: 0.1, test: 0.1 }
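Note: `split: { mode: hash, key: group, seed: 42 }` points at a deterministic, seeded hash of the group key into train/val/test buckets. The package's exact hashing scheme is not shown in this diff; a sketch of the general idea, all details assumed:

import hashlib

def split_label(group_key: str, seed: int, ratios: dict[str, float]) -> str:
    # Hypothetical scheme: hash(seed, key) -> uniform u in [0, 1),
    # then walk the cumulative ratios to pick a bucket.
    digest = hashlib.sha256(f"{seed}:{group_key}".encode()).digest()
    u = int.from_bytes(digest[:8], "big") / 2**64
    cumulative = 0.0
    for label, ratio in ratios.items():
        cumulative += ratio
        if u < cumulative:
            return label
    return label  # guard against float rounding at the top end

print(split_label("2021-01-01T00", seed=42,
                  ratios={"train": 0.8, "val": 0.1, "test": 0.1}))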
@@ -1,9 +1,3 @@
+# See ../../reference/reference/tasks/scaler.reference.yaml for full options.
 kind: scaler
-
-# Output path is relative to project.paths.artifacts; defaults to "scaler.pkl".
-# output: scaler.pkl
-
-# Split label to use when fitting scaler statistics.
-# Must match a label from globals.split.ratios.
 split_label: train
-
@@ -1,28 +1,4 @@
+# See ../../reference/reference/tasks/serve.reference.yaml for full options.
 kind: serve
-
-# Optional identifier for this serve task; defaults to filename stem.
 name: train
-
-# Active split label to serve; must match a label from globals.split.ratios.
-# Set to null to disable split filtering.
 keep: train
-#output:
-#  transport: stdout | fs
-#  format: print | json-lines | json | csv | pickle
-# When using fs transport, set a directory (and optionally filename) for outputs:
-#  directory: artifacts/serve
-#  filename: vectors.train
-
-# Default max number of vectors to emit (null = unlimited).
-# limit: 5
-# Optional pipeline stage preview (0-7); null lets the CLI decide.
-# stage: 7
-
-# Optional pacing between emitted vectors (milliseconds).
-# throttle_ms: null
-
-# Visuals/logging knobs (inherit CLI or jerry.yaml defaults when omitted):
-# visuals: AUTO # AUTO | TQDM | RICH | OFF
-# progress: AUTO # AUTO | SPINNER | BARS | OFF
-# log_level: INFO # CRITICAL | ERROR | WARNING | INFO | DEBUG
-
@@ -0,0 +1 @@
+[]
@@ -0,0 +1,15 @@
+# See ../reference/reference/project.yaml for full options.
+version: 1
+name: <your-interim-data-builder>
+paths:
+  streams: ./contracts
+  sources: ./sources
+  dataset: dataset.yaml
+  postprocess: postprocess.yaml
+  artifacts: ../artifacts/${project_name}/v${version}
+  tasks: ./tasks
+globals:
+  # TODO: Set your grouping cadence (must match ^\d+(m|min|h|d)$).
+  group_by: 1h
+  start_time: null
+  end_time: null
@@ -0,0 +1,10 @@
+kind: composed
+id: {{ stream_id }} # format: domain.dataset.(variant)
+# cadence: ${group_by} # optional per-contract cadence
+# partition_by: <field or [fields]>
+inputs:
+  - {{ inputs_list }}
+
+mapper:
+  entrypoint: {{ mapper_entrypoint }}
+  args: { driver: {{ driver_key }} }
@@ -0,0 +1,25 @@
+kind: ingest
+source: {{ source }}
+id: {{ stream_id }} # format: domain.dataset.(variant)
+
+mapper:
+  entrypoint: {{ mapper_entrypoint }}
+  args: {}
+
+cadence: ${group_by} # optional per-contract cadence
+# partition_by: <field or [fields]>
+# sort_batch_size: 100000 # in-memory sort chunk size
+
+record: # record-level transforms
+  - filter: { field: time, operator: ge, comparand: "${start_time}" }
+  - filter: { field: time, operator: le, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" }
+  # - lag: { lag: 10m }
+
+stream: # per-stream transforms (input sorted by partition,time)
+  - ensure_cadence: { field: some_field, to: some_field, cadence: "${cadence}" }
+  - granularity: { field: some_field, to: some_field, mode: first }
+  # - fill: { field: some_field, to: some_field, statistic: median, window: 6, min_samples: 1 }
+
+debug: # optional validation-only checks
+  - lint: { mode: warn, tick: "${cadence}" }
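Note: the record-level `filter` entries name comparison operators (`ge`, `le`) applied to a record field against a comparand. The package's filter implementation (datapipeline/transforms/filter.py) is not shown here; a minimal sketch of how such an operator table could work:

import operator
from typing import Any, Iterator

# Hypothetical operator table; the real set lives in datapipeline/transforms/filter.py.
OPS = {"eq": operator.eq, "ne": operator.ne,
       "lt": operator.lt, "le": operator.le,
       "gt": operator.gt, "ge": operator.ge}

def filter_records(stream: Iterator[Any], *, field: str, op: str, comparand: Any) -> Iterator[Any]:
    # Keep records whose `field` satisfies `op` against `comparand`.
    compare = OPS[op]
    for rec in stream:
        if compare(getattr(rec, field), comparand):
            yield rec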
@@ -5,7 +5,7 @@ from datetime import datetime
 @dataclass
 class {{CLASS_NAME}}:
     """
-    Data Transfer Object (DTO) for the '{{DOMAIN}}'
+    Data Transfer Object (DTO) for the '{{DOMAIN}}' records.
 
     Purpose
     - Represents the raw, source-shaped data emitted by the loader + parser.
@@ -24,4 +24,4 @@ class {{CLASS_NAME}}:
     # currency: str
     """
     # TODO: define fields matching the '{{DOMAIN}}' source schema
-
+    pass
@@ -0,0 +1,11 @@
+from typing import Iterator, Any
+
+from datapipeline.sources.models.loader import DataLoader
+
+
+class {{CLASS_NAME}}(DataLoader):
+    """Custom loader stub. Yield raw items to be parsed by a parser."""
+
+    def __iter__(self) -> Iterator[Any]:
+        # TODO: implement data loading
+        yield from ()
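Note: a filled-in loader typically just streams raw rows from some source. A sketch assuming the only contract is "iterate and yield raw items" (any further requirements of the DataLoader base are not visible in this diff); the JSONL example is invented:

import json
from pathlib import Path
from typing import Any, Iterator

from datapipeline.sources.models.loader import DataLoader  # import path taken from the stub above

class JsonlLoader(DataLoader):  # hypothetical example, not part of the package
    """Yield one dict per line from a JSONL file."""

    def __init__(self, path: str) -> None:
        self.path = Path(path)

    def __iter__(self) -> Iterator[Any]:
        with self.path.open() as fh:
            for line in fh:
                if line.strip():
                    yield json.loads(line)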
@@ -0,0 +1,13 @@
+from typing import Iterator, Mapping
+
+from datapipeline.domain.record import TemporalRecord
+
+
+def mapper(
+    inputs: Mapping[str, Iterator[TemporalRecord]],
+    *, driver: str | None = None, aux: Mapping[str, Iterator[TemporalRecord]] | None = None, context=None, **params
+) -> Iterator[TemporalRecord]:
+    # TODO: implement domain math; inputs are ordered/regularized; aux is raw
+    key = driver or next(iter(inputs.keys()))
+    for rec in inputs[key]:
+        yield rec  # replace with your dataclass and computation
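Note: a concrete composed mapper walks the driver stream and derives new records from it. A sketch under the same signature as the stub above; the numeric `price` field and the `factor` parameter are invented for illustration:

from dataclasses import replace
from typing import Iterator, Mapping

from datapipeline.domain.record import TemporalRecord  # import taken from the stub above

def mapper(
    inputs: Mapping[str, Iterator[TemporalRecord]],
    *, driver: str | None = None, aux=None, context=None, **params,
) -> Iterator[TemporalRecord]:
    # Scale the driver stream's hypothetical `price` field by a parameter.
    # Assumes records are dataclasses; adapt to your own domain record.
    factor = float(params.get("factor", 1.0))
    key = driver or next(iter(inputs.keys()))
    for rec in inputs[key]:
        yield replace(rec, price=rec.price * factor)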
@@ -0,0 +1,20 @@
+from typing import Any, Iterator
+
+from {{DOMAIN_MODULE}} import {{DOMAIN_RECORD}}
+{% if INPUT_IMPORT != "typing" %}from {{INPUT_IMPORT}} import {{INPUT_CLASS}}
+{% endif %}
+
+
+def {{FUNCTION_NAME}}(
+    stream: Iterator[{{INPUT_CLASS}}],
+    **params: Any,
+) -> Iterator[{{DOMAIN_RECORD}}]:
+    """Map {{INPUT_CLASS}} records to domain-level {{DOMAIN_RECORD}} records."""
+    raise NotImplementedError(
+        "Implement mapper logic for {{INPUT_CLASS}} -> {{DOMAIN_RECORD}}"
+    )
+    for record in stream:
+        # TODO: construct {{DOMAIN_RECORD}} from record fields
+        yield {{DOMAIN_RECORD}}(
+            time=record.time,  # required
+        )
@@ -2,7 +2,11 @@ from typing import Any
 
 from datapipeline.sources.models.parser import DataParser
 
+{% if DTO_IMPORT -%}
+from {{DTO_IMPORT}} import {{DTO_CLASS}}
+{% else -%}
 from .dto import {{DTO_CLASS}}
+{% endif %}
 
 
 class {{CLASS_NAME}}(DataParser[{{DTO_CLASS}}]):
@@ -18,4 +22,4 @@ class {{CLASS_NAME}}(DataParser[{{DTO_CLASS}}]):
     # return {{DTO_CLASS}}(
     #     ... map fields from `raw` ...
     # )
-    raise NotImplementedError
+    raise NotImplementedError("Implement parser logic for {{DTO_CLASS}}")
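Note: a filled-in parser maps raw loader output onto the DTO. The DataParser hook's exact name and signature are not visible in this diff, so everything below is an assumption, shown as a standalone class rather than a subclass:

from dataclasses import dataclass
from datetime import datetime
from typing import Any

@dataclass
class OhlcvDTO:  # hypothetical DTO for illustration
    time: datetime
    close: float

class OhlcvParser:
    # Sketch of parser logic only; the real class would subclass
    # DataParser[OhlcvDTO] and implement whatever hook that base defines.
    def parse(self, raw: dict[str, Any]) -> OhlcvDTO:  # assumed hook name
        return OhlcvDTO(
            time=datetime.fromisoformat(raw["time"]),
            close=float(raw["close"]),
        )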
@@ -9,11 +9,11 @@ class {{CLASS_NAME}}({{PARENT_CLASS}}):
     Domain record for '{{DOMAIN}}'.
 
     Required fields inherited from the base:
-    - value: main payload used to model records value by pipelines (numeric or categorical)
     - time: tz-aware datetime (always required)
 
     Add any additional fields you need for filtering/partitioning/grouping.
     """
+    # TODO: Add domain fields for filtering/partitioning/grouping.
     # Example extra fields (uncomment and adapt):
     # region: str  # e.g. 'us-west', 'eu-central', etc.
     # exchange: str  # e.g. 'NASDAQ', 'NYSE', etc.
@@ -1,10 +1,9 @@
-from __future__ import annotations
-
 import logging
 from dataclasses import asdict, is_dataclass
 from typing import Iterator, Any
 
-from datapipeline.domain.
+from datapipeline.domain.record import TemporalRecord
+from datapipeline.transforms.utils import partition_key
 
 logger = logging.getLogger(__name__)
 
@@ -16,14 +15,21 @@ class IdentityGuardTransform:
     - mode: 'warn' (default) logs warnings; 'error' raises on first violation
     - fields: optional explicit list of attribute names to compare. When omitted,
       the transform attempts to derive identity from dataclass fields on the
-      underlying record, excluding 'time'
+      underlying record, excluding 'time'.
     """
 
-    def __init__(
+    def __init__(
+        self,
+        *,
+        mode: str = "warn",
+        fields: list[str] | None = None,
+        partition_by: str | list[str] | None = None,
+    ) -> None:
         self.mode = mode
         self.fields = fields
+        self.partition_by = partition_by
 
-    def __call__(self, stream: Iterator[
+    def __call__(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
         return self.apply(stream)
@@ -41,26 +47,38 @@ class IdentityGuardTransform:
                 except Exception:
                     out[f] = None
             return out
+        # Fall back to partition_by when available
+        if self.partition_by:
+            fields = (
+                [self.partition_by]
+                if isinstance(self.partition_by, str)
+                else list(self.partition_by)
+            )
+            out = {}
+            for f in fields:
+                try:
+                    out[f] = getattr(rec, f)
+                except Exception:
+                    out[f] = None
+            return out
         # Try domain-provided hook first
         if hasattr(rec, "identity_fields") and callable(getattr(rec, "identity_fields")):
             try:
                 return rec.identity_fields()  # type: ignore[attr-defined]
             except Exception:
                 pass
-        # Fallback: dataclass fields minus time
+        # Fallback: dataclass fields minus time
         if is_dataclass(rec):
            data = asdict(rec)
            data.pop("time", None)
-           data.pop("value", None)
            return data
        return {}
 
-    def apply(self, stream: Iterator[
-        current_key = None
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        current_key: tuple | None = None
         baseline: dict | None = None
-        for
-            key =
-            rec = fr.record
+        for rec in stream:
+            key = partition_key(rec, self.partition_by)
             ident = self._identity_map(rec)
             if key != current_key:
                 current_key = key
@@ -68,7 +86,7 @@ class IdentityGuardTransform:
             else:
                 if ident != baseline:
                     self._violation(
-                        "identity drift in
-                        % (
+                        "identity drift in record stream key=%s: expected=%s observed=%s"
+                        % (key, baseline, ident)
                     )
-            yield
+            yield rec
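Note: both debug transforms now lean on `partition_key` from datapipeline/transforms/utils, whose implementation is not part of this diff. A minimal sketch of the behavior these call sites imply (a hashable key from one or more record attributes, None when no partitioning is configured), offered as a hypothetical reconstruction:

from typing import Any

def partition_key(rec: Any, partition_by: str | list[str] | None) -> tuple | None:
    # Hypothetical reconstruction: build a hashable partition key from one
    # field name or a list of field names; None disables partitioning.
    if partition_by is None:
        return None
    if isinstance(partition_by, str):
        return (getattr(rec, partition_by, None),)
    return tuple(getattr(rec, f, None) for f in partition_by)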
@@ -3,7 +3,8 @@ from datetime import timedelta
 from itertools import groupby
 from typing import Iterator
 
-from datapipeline.domain.
+from datapipeline.domain.record import TemporalRecord
+from datapipeline.transforms.utils import partition_key
 from datapipeline.utils.time import parse_timecode
 
 
@@ -23,9 +24,11 @@ class StreamLint:
         *,
         mode: str = "warn",
         tick: str | None = None,
+        partition_by: str | list[str] | None = None,
     ) -> None:
         self.mode = mode
         self.tick = tick
+        self.partition_by = partition_by
 
         # Pre-compute tick step in seconds when provided to avoid repeated parsing.
         self._tick_seconds: int | None = None
@@ -38,7 +41,7 @@ class StreamLint:
             )
             self._tick_seconds = None
 
-    def __call__(self, stream: Iterator[
+    def __call__(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
         return self.apply(stream)
 
     def _violation(self, msg: str) -> None:
@@ -46,25 +49,25 @@ class StreamLint:
             raise ValueError(msg)
         logger.warning(msg)
 
-    def apply(self, stream: Iterator[
-        # Group by
-        for
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        # Group by partition key to keep state local
+        for key, records in groupby(stream, key=lambda rec: partition_key(rec, self.partition_by)):
             last_time = None
             seen_times: set = set()
-            for
-                t = getattr(
+            for record in records:
+                t = getattr(record, "time", None)
 
                 # Check ordering
                 if last_time is not None and t is not None and t < last_time:
                     self._violation(
-                        f"out-of-order timestamp for
+                        f"out-of-order timestamp for partition '{key}': {t} < {last_time}. "
                         f"Consider sorting upstream or fixing loader."
                     )
 
                 # Check duplicates
                 if t in seen_times:
                     self._violation(
-                        f"duplicate timestamp for
+                        f"duplicate timestamp for partition '{key}' at {t}. "
                         f"Consider a granularity transform (first/last/mean/median)."
                     )
                 seen_times.add(t)
@@ -78,9 +81,9 @@ class StreamLint:
                     expect = last_time + timedelta(seconds=self._tick_seconds)
                     if t != expect and t > expect:
                         self._violation(
-                            f"skipped tick(s) for
+                            f"skipped tick(s) for partition '{key}': expected {expect}, got {t}. "
                             f"Consider using ensure_cadence."
                         )
 
                 last_time = t
-                yield
+                yield record
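Note: since `itertools.groupby` only merges adjacent keys, the lint pass assumes the stream is already sorted by partition, then time (as the ingest contract's `stream:` comment states). A toy run, assuming a dataclass record with a `time` field and a `symbol` partition field; the `Tick` type is invented:

from dataclasses import dataclass
from datetime import datetime, timezone

from datapipeline.transforms.debug.lint import StreamLint  # path taken from this diff's file list

@dataclass
class Tick:  # hypothetical record type
    symbol: str
    time: datetime

def ts(h: int) -> datetime:
    return datetime(2021, 1, 1, h, tzinfo=timezone.utc)

# Duplicate timestamp inside the AAPL partition -> a warning in mode="warn"
# (visible once logging is configured; records still pass through unchanged).
stream = iter([Tick("AAPL", ts(0)), Tick("AAPL", ts(0)), Tick("MSFT", ts(0))])
lint = StreamLint(mode="warn", tick="1h", partition_by="symbol")
checked = list(lint(stream))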
@@ -3,12 +3,11 @@ from collections import defaultdict
 from itertools import groupby
 from numbers import Real
 from pathlib import Path
-from typing import Any, Callable, Iterator, Literal
+from typing import Any, Callable, Iterator, Literal
 
 from datapipeline.domain.feature import FeatureRecord
 from datapipeline.domain.sample import Sample
 from datapipeline.transforms.feature.model import FeatureTransform
-from datapipeline.transforms.utils import clone_record_with_value
 from datapipeline.utils.pickle_model import PicklePersistanceMixin
 from datapipeline.pipeline.observability import TransformEvent
 
@@ -86,7 +85,7 @@ class StandardScaler(PicklePersistanceMixin):
         mean = float(stats.get("mean", 0.0))
         std = float(stats.get("std", 1.0))
         for fr in records:
-            value = fr.
+            value = fr.value
             if not isinstance(value, Real):
                 if value is None and on_none == "skip":
                     self.missing_counts[feature_id] = (
@@ -114,10 +113,7 @@ class StandardScaler(PicklePersistanceMixin):
                 normalized -= mean
                 if self.with_std:
                     normalized /= std
-                yield FeatureRecord(
-                    record=clone_record_with_value(fr.record, normalized),
-                    id=fr.id,
-                )
+                yield FeatureRecord(record=fr.record, id=fr.id, value=normalized)
 
     def inverse_transform(
         self,
@@ -136,7 +132,7 @@ class StandardScaler(PicklePersistanceMixin):
         mean = float(stats.get("mean", 0.0))
         std = float(stats.get("std", 1.0))
         for fr in records:
-            value = fr.
+            value = fr.value
             if not isinstance(value, Real):
                 raise TypeError(
                     f"Record value must be numeric, got {value!r}")
@@ -145,10 +141,7 @@ class StandardScaler(PicklePersistanceMixin):
                 restored *= std
                 if self.with_mean:
                     restored += mean
-                yield FeatureRecord(
-                    record=clone_record_with_value(fr.record, restored),
-                    id=fr.id,
-                )
+                yield FeatureRecord(record=fr.record, id=fr.id, value=restored)
 
 class _RunningStats:
     __slots__ = ("count", "mean", "m2")
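Note: `_RunningStats` with slots `count`, `mean`, `m2` matches the shape of Welford's online algorithm, where `m2` accumulates the sum of squared deviations from the running mean. The class body is not in this diff, so the update step below is the textbook version, not necessarily the package's:

import math

class RunningStats:  # hypothetical reconstruction, textbook Welford
    __slots__ = ("count", "mean", "m2")

    def __init__(self) -> None:
        self.count = 0
        self.mean = 0.0
        self.m2 = 0.0  # sum of squared deviations from the running mean

    def update(self, x: float) -> None:
        self.count += 1
        delta = x - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (x - self.mean)  # uses the updated mean

    @property
    def std(self) -> float:
        return math.sqrt(self.m2 / self.count) if self.count else 1.0

s = RunningStats()
for v in (1.0, 2.0, 3.0):
    s.update(v)
print(s.mean, s.std)  # 2.0, ~0.816 (population std)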