jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +5 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +54 -10
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +76 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.1.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.1.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
@@ -4,23 +4,28 @@ Minimal plugin skeleton for the Jerry Thomas (datapipeline) framework.
 
 Quick start
 - Initialize a plugin (already done if you’re reading this here):
- - `jerry plugin init --name {{DIST_NAME}}`
+ - `jerry plugin init {{DIST_NAME}}`
 - Add a source via CLI (transport-specific placeholders are scaffolded):
- - File data: `jerry source add -p <provider> -d <dataset> -t fs -f <csv|json|json-lines>`
- - URL data: `jerry source add -p <provider> -d <dataset> -t url -f <json|json-lines|csv>`
+ - File data: `jerry source add <provider> <dataset> -t fs -f <csv|json|json-lines|pickle>`
+ - HTTP data: `jerry source add <provider>.<dataset> -t http -f <json|json-lines|csv>`
 - Synthetic: `jerry source add -p <provider> -d <dataset> -t synthetic`
 - Edit the generated `config/sources/*.yaml` to fill in the `path`, delimiter, etc.
+ - `jerry.yaml` is placed in your workspace root (alongside the plugin folder) so
+   you can run CLI commands from there; `plugin_root` points back to this plugin.
 - Reinstall after EP changes (pyproject.toml) and restart Python processes:
 - Core: `cd lib/datapipeline && python -m pip install -e .`
 - This plugin: `python -m pip install -e .`
 
 Folder layout
- - `config/`
- - `sources/*.yaml` — raw source definitions (one file per source)
+ - `example/`
+ - `project.yaml` — project root (paths, globals, cadence/split)
+ - `dataset.yaml` — feature/target declarations (uses `${group_by}` from globals)
+ - `postprocess.yaml` — postprocess transforms
 - `contracts/*.yaml` — canonical stream definitions
- - `datasets/<name>/build.yaml` — build configuration (partitioned ids today, more artifacts later)
+ - `sources/*.yaml` — raw source definitions (one file per source)
+ - `tasks/*.yaml` — task specs (schema/scaler/metadata/serve)
 - Every dataset `project.yaml` declares a `name`; reference it via `${project_name}`
-   inside other config files (e.g., `paths.artifacts: ../../build/datasets/${project_name}`) to
+   inside other config files (e.g., `paths.artifacts: ../artifacts/${project_name}`) to
   avoid hard-coding per-dataset directories.
 - `src/{{PACKAGE_NAME}}/`
 - `sources/<provider>/<dataset>/dto.py` — DTO model for the source
@@ -30,52 +35,59 @@ Folder layout
 - `mappers/*.py` — map DTOs → domain records
 
 How loaders work
- - For fs/url, sources use the generic loader entry point:
- - `loader.entrypoint: "{{COMPOSED_LOADER_EP}}"`
+ - For fs/http, sources use the generic loader entry point:
+ - `loader.entrypoint: "{{DEFAULT_IO_LOADER_EP}}"`
 - `loader.args` include `transport`, `format`, and source-specific args (placeholders are provided):
 - fs: `path`, `glob`, `encoding`, plus `delimiter` for csv
- - url: `url`, `headers`, `encoding`, optional `count_by_fetch`
+ - http: `url`, `headers`, `params`, `encoding`, optional `count_by_fetch`
 - Synthetic sources generate data in-process and keep a small loader stub.
 
 Run data flows
- - Build artifacts once: `jerry build --project config/datasets/default/project.yaml`
- - Preview records (stage 1): `jerry serve --project config/datasets/default/project.yaml --stage 1 --limit 100`
- - Preview features (stage 3): `jerry serve --project config/datasets/default/project.yaml --stage 3 --limit 100`
- - Preview vectors (stage 7): `jerry serve --project config/datasets/default/project.yaml --stage 7 --limit 100`
+ - Build artifacts once: `jerry build --project example/project.yaml`
+ - Preview records (stage 1): `jerry serve --project example/project.yaml --stage 1 --limit 100`
+ - Preview features (stage 3): `jerry serve --project example/project.yaml --stage 3 --limit 100`
+ - Preview vectors (stage 7): `jerry serve --project example/project.yaml --stage 7 --limit 100`
 
 Analyze vectors
- - `jerry inspect report --project config/datasets/default/project.yaml` (console only)
- - `jerry inspect coverage --project config/datasets/default/project.yaml` (writes build/coverage.json)
- - `jerry inspect matrix --project config/datasets/default/project.yaml --format html` (writes build/matrix.html)
- - `jerry inspect partitions --project config/datasets/default/project.yaml` (writes build/partitions.json)
+ - `jerry inspect report --project example/project.yaml` (console only)
+ - `jerry inspect partitions --project example/project.yaml` (writes build/partitions.json)
+ - `jerry inspect matrix --project example/project.yaml --format html` (writes build/matrix.html)
+ - `jerry inspect expected --project example/project.yaml` (writes build/expected.txt)
 - Use post-processing transforms in `postprocess.yaml` to keep coverage high
   (history/horizontal fills, constants, or drop rules) before serving vectors.
+   Add `payload: targets` inside a transform when you need to mutate label vectors.
 
 Train/Val/Test splits (deterministic)
 - Configure split mechanics once in your project file:
- - Edit `config/datasets/default/project.yaml` and set:
+ - Edit `example/project.yaml` and set:
   ```yaml
   globals:
+    group_by: 10m # dataset cadence; reused as contract cadence
     split:
       mode: hash # hash|time
       key: group # group or feature:<id> (entity-stable)
       seed: 42 # deterministic hash seed
       ratios: {train: 0.8, val: 0.1, test: 0.1}
   ```
- - Select the active slice via `config/datasets/default/run.yaml` (or `--keep`):
+ - Select the active slice via `example/tasks/serve.<name>.yaml` (or `--keep`):
   ```yaml
-  version: 1
+  kind: serve
+  name: train # defaults to filename stem when omitted
   keep: train # any label defined in globals.split; null disables filtering
-  output: print # serve output default (print|stream|/path)
+  output:
+    transport: stdout # stdout | fs
+    format: print # print | json-lines | json | csv | pickle
   limit: 100 # cap vectors per serve run (null = unlimited)
-  include_targets: false # include dataset.targets when serving
   throttle_ms: null # sleep between vectors (milliseconds)
+  # visuals: AUTO # AUTO | TQDM | RICH | OFF
+  # progress: AUTO # AUTO | SPINNER | BARS | OFF
   ```
- - If you prefer separate configs per split, point `project.paths.run` at a folder (e.g., `config/datasets/default/runs/`),
-   drop `train.yaml`, `val.yaml`, etc. inside, and the CLI will run each file in order unless you pass `--run <name>`.
- - Serve examples (change run.yaml or pass `--keep val|test`):
- - `jerry serve -p config/datasets/default/project.yaml -o stream > train.jsonl`
- - `jerry serve -p config/datasets/default/project.yaml --keep val -o stream > val.jsonl`
+ - Add additional `kind: serve` files (e.g., `serve.val.yaml`, `serve.test.yaml`) and the CLI will run each enabled file in order unless you pass `--run <name>`.
+ - Serve examples (change the serve task or pass `--keep val|test`):
+ - `jerry serve -p example/project.yaml --out-transport stdout --out-format json-lines > train.jsonl`
+ - `jerry serve -p example/project.yaml --keep val --out-transport stdout --out-format json-lines > val.jsonl`
+ - Add `--visuals rich --progress bars` for a richer interactive UI; defaults to `AUTO`.
+ - For shared workspace defaults (visual renderer, progress display, build mode), drop a `jerry.yaml` next to your workspace root and set `shared.visuals`, `shared.progress`, etc. CLI commands walk up from the current directory to find it.
 - The split is applied at the end (after postprocess transforms), and assignment
   is deterministic (hash-based) with a fixed seed; no overlap across runs.
 
@@ -84,13 +96,47 @@ Key selection guidance
 - `key: feature:<id>` hashes a specific feature value, e.g., `feature:entity_id` or `feature:station_id`, ensuring all vectors for that entity land in the same split (recommended to avoid leakage).
 
 Postprocess expected IDs
- - Build once with `jerry build --project config/datasets/default/project.yaml` (or run `jerry inspect expected …`) to materialize `<paths.artifacts>/expected.txt`.
+ - Build once with `jerry build --project config/project.yaml` (or run `jerry inspect expected …`) to materialize `<paths.artifacts>/expected.txt`.
 - Bootstrap registers the artifact; postprocess transforms read it automatically. Per-transform `expected:` overrides are no longer required or supported — the build output is the single source of truth.
 
 Scaler statistics
- - Enable the scaler task in `build.yaml` (default `enabled: true`) to compute mean/std per feature using the configured training split.
+ - Jerry computes scaler stats automatically. If you need custom paths or settings, add `tasks/scaler.yaml` and override the defaults.
 - The build writes `<paths.artifacts>/scaler.pkl`; runtime scaling requires this artifact. If it is missing, scaling transforms raise a runtime error.
 
 Tips
 - Keep parsers thin — mirror source schema and return DTOs; use the identity parser only if your loader already emits domain records.
 - Prefer small, composable configs over monolithic ones: one YAML per source is easier to review and reuse.
+
+ Composed streams (engineered domains)
+ - Declare engineered streams that depend on other canonical streams directly in contracts. The runtime builds each input to stage 4, stream‑aligns by partition+timestamp, runs your composer, and emits fresh records for the derived stream.
+
+ ```yaml
+ # example/contracts/air_density.processed.yaml
+ kind: composed
+ id: air_density.processed
+ inputs:
+   - p=pressure.processed
+   - t=temp_dry.processed
+ partition_by: station_id
+ sort_batch_size: 20000
+
+ mapper:
+   entrypoint: {{PACKAGE_NAME}}.mappers.air_density:mapper
+   args:
+     driver: p # optional; defaults to first input alias
+
+ # Optional post‑compose policies (same as any stream):
+ # record: [...]
+ # stream: [...]
+ # debug: [...]
+ ```
+
+ Then reference the composed stream in your dataset:
+
+ ```yaml
+ # example/dataset.yaml
+ group_by: ${group_by}
+ features:
+   - id: air_density
+     record_stream: air_density.processed
+ ```
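To make the "How loaders work" notes in the README diff above concrete, here is a rough sketch of what a file-backed source definition could look like after `jerry source add <provider> <dataset> -t fs -f csv`. The provider/dataset names, parser entry point, and data paths are illustrative assumptions and not part of the released templates; only the `{{DEFAULT_IO_LOADER_EP}}` placeholder and the documented `transport`/`format`/`path`/`glob`/`encoding`/`delimiter` args come from the README itself.

```yaml
# Hypothetical example/sources/myprovider.prices.yaml (names and paths are assumptions)
id: myprovider.prices

parser:
  entrypoint: "myprovider.prices"      # assumed parser entry point registered by the plugin
  args: {}
loader:
  entrypoint: "{{DEFAULT_IO_LOADER_EP}}"
  args:
    transport: fs
    format: csv
    path: data/prices                  # assumed local data directory
    glob: "*.csv"
    encoding: utf-8
    delimiter: ","
```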
@@ -0,0 +1,31 @@
+ kind: ingest
+ source: synthetic.ticks
+ id: time.ticks.hour_sin # format: domain.dataset.(variant)
+
+ # Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+ cadence: ${group_by}
+
+ mapper:
+   entrypoint: encode_time
+   args: { mode: hour_sin }
+
+ # partition_by: field you want to partition
+
+ record:
+   - filter: { operator: ge, field: time, comparand: "${start_time}" }
+   - filter: { operator: le, field: time, comparand: "${end_time}" }
+   - floor_time: { cadence: "${cadence}" }
+   # - lag: { lag: "${cadence}" }
+
+ stream:
+   - dedupe: {}
+   - granularity: { mode: first }
+   - ensure_cadence: { cadence: "${cadence}" }
+   # Optional: fill gaps before downstream transforms:
+   # - fill: { statistic: median, window: 24, min_samples: 4 }
+
+ debug:
+   - lint: { mode: error, tick: "${cadence}" }
+
+ # sort_batch_size: 100000
+
@@ -0,0 +1,30 @@
+ kind: ingest
+ source: synthetic.ticks # raw source alias (see example/sources)
+ id: time.ticks.linear # canonical stream id (format: domain.dataset.(variant))
+
+ # Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+ cadence: ${group_by}
+
+ mapper: # normalize/reshape DTO -> TemporalRecord
+   entrypoint: encode_time
+   args: { mode: linear }
+ # partition_by: station_id # optional: add partition suffixes to feature ids
+
+ record: # record-level transforms
+   - filter: { operator: ge, field: time, comparand: "${start_time}" }
+   - filter: { operator: le, field: time, comparand: "${end_time}" }
+   - floor_time: { cadence: "${cadence}" } # snap timestamps to cadence boundaries
+   # - lag: { lag: "${cadence}" } # optional: shift timestamps backwards
+
+ stream: # per-feature stream transforms (input sorted by id,time)
+   - dedupe: {} # drop exact-duplicate records per tick
+   - granularity: { mode: first } # aggregate duplicates within a tick
+   - ensure_cadence: { cadence: "${cadence}" } # insert missing ticks (value=None)
+   # Consider adding a fill transform to impute None values before sequence/windowing:
+   # - fill: { statistic: median, window: 6, min_samples: 1 }
+
+ debug: # optional validation-only transforms
+   - lint: { mode: error, tick: "${cadence}" } # strict cadence/order; value issues handled by downstream transforms
+
+ # sort_batch_size: 100000 # in-memory chunk size used by internal sorting
+
@@ -0,0 +1,18 @@
+ group_by: ${group_by}
+
+ features:
+   - id: time_linear
+     record_stream: time.ticks.linear
+     scale: true # optionally add with_mean/with_std overrides
+     # Sliding window over the regularized stream; cadence is enforced in the contract.
+     sequence: { size: 6, stride: 1 }
+
+   - id: time_hour_sin
+     record_stream: time.ticks.hour_sin
+
+   # - id: third_feature
+   #   record_stream: anotherstream
+ # targets:
+ #   - id: some_target
+ #     record_stream: time.ticks.linear
+
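For orientation, a dataset that also declares a target could look roughly like the sketch below, obtained by enabling the commented `targets:` block in the template above. The feature and target ids are placeholders; the `with_mean`/`with_std` overrides are only hinted at in the template comment, so treat any use of them as an assumption.

```yaml
# Hypothetical variant of example/dataset.yaml with a target enabled (ids are placeholders)
group_by: ${group_by}

features:
  - id: time_linear
    record_stream: time.ticks.linear
    scale: true
    sequence: { size: 6, stride: 1 }

targets:
  - id: next_tick                    # assumed target id
    record_stream: time.ticks.linear
```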
@@ -0,0 +1,29 @@
+ #### example combination of postprocessing steps ######
+ #### making sure data is complete after these combinations ######
+ - drop: # example of dropping sparse partitions (vertical axis) for targets
+     axis: vertical
+     payload: targets
+     threshold: 0.9
+
+ - drop: # example of dropping sparse partitions for features
+     axis: vertical
+     payload: features
+     threshold: 0.9
+
+ - drop: # drop vectors (horizontal axis) that have features which are None
+     axis: horizontal
+     payload: features
+     threshold: 1
+
+ - drop:
+     axis: horizontal
+     payload: targets
+     threshold: 1
+ ######
+ # - fill:
+ #     statistic: median
+ #     window: 48
+ #     min_samples: 6
+ # - replace:
+ #     payload: targets
+ #     value: 0.0
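A minimal sketch of what the commented fill/replace steps look like when enabled, following the keys shown in the template above and the README note about `payload: targets`; the statistic, window, and value settings are placeholder values, not recommendations.

```yaml
# Hypothetical postprocess.yaml fragment with the commented steps enabled (values are placeholders)
- fill:
    statistic: median
    window: 48
    min_samples: 6

- replace:
    payload: targets
    value: 0.0
```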
@@ -1,18 +1,21 @@
 version: 1
- name: default
+ name: example
 paths:
-   streams: ../../contracts
-   sources: ../../sources
+   streams: ./contracts
+   sources: ./sources
   dataset: dataset.yaml
   postprocess: postprocess.yaml
-   artifacts: ../../build/datasets/${project_name}
-   build: build.yaml
-   run: runs
+   artifacts: ../artifacts/${project_name}/v${version}
+   tasks: ./tasks
 globals:
+   # Globals to use in your .yaml files via ${var_name}.
+   # Primary dataset cadence; referenced from dataset.yaml (group_by)
+   # and contracts via ${group_by}.
+   group_by: 1h
   start_time: 2021-01-01T00:00:00Z
-   end_time: 2023-01-03T23:00:00Z
+   end_time: 2021-01-02T00:00:00Z
   # Configure deterministic dataset split here (applied at serve time, after postprocess).
-   # Adjust `ratios` as needed; the active split is selected via run.yaml or CLI.
+   # Adjust `ratios` as needed; the active split is selected via serve tasks or CLI.
   split:
     mode: hash # hash | time (time uses boundaries/labels)
     key: group # group | feature:<id> (entity-stable split)
@@ -0,0 +1,12 @@
+ id: synthetic.ticks
+
+ parser:
+   entrypoint: "core.synthetic.ticks"
+   args: {}
+ loader:
+   entrypoint: "core.synthetic.ticks"
+   args:
+     start: "${start_time}"
+     end: "${end_time}"
+     frequency: "${group_by}"
+
@@ -0,0 +1,3 @@
+ kind: metadata
+ # window_mode: intersection # union|intersection|strict|relaxed (default: intersection)
+
@@ -0,0 +1,9 @@
+ kind: scaler
+
+ # Output path is relative to project.paths.artifacts; defaults to "scaler.pkl".
+ # output: scaler.pkl
+
+ # Split label to use when fitting scaler statistics.
+ # Must match a label from globals.split.ratios.
+ split_label: train
+
@@ -0,0 +1,4 @@
+ kind: serve
+ name: test
+ keep: test
+
@@ -0,0 +1,28 @@
+ kind: serve
+
+ # Optional identifier for this serve task; defaults to filename stem.
+ name: train
+
+ # Active split label to serve; must match a label from globals.split.ratios.
+ # Set to null to disable split filtering.
+ keep: train
+ # output:
+ #   transport: stdout | fs
+ #   format: print | json-lines | json | csv | pickle
+ # When using fs transport, set a directory (and optionally filename) for outputs:
+ #   directory: artifacts/serve
+ #   filename: vectors.train
+
+ # Default max number of vectors to emit (null = unlimited).
+ # limit: 5
+ # Optional pipeline stage preview (0-7); null lets the CLI decide.
+ # stage: 7
+
+ # Optional pacing between emitted vectors (milliseconds).
+ # throttle_ms: null
+
+ # Visuals/logging knobs (inherit CLI or jerry.yaml defaults when omitted):
+ # visuals: AUTO # AUTO | TQDM | RICH | OFF
+ # progress: AUTO # AUTO | SPINNER | BARS | OFF
+ # log_level: INFO # CRITICAL | ERROR | WARNING | INFO | DEBUG
+
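As a rough sketch of the alternative documented in the comments above, a serve task that writes JSON Lines to disk via the fs transport could look like this. The keys mirror the commented template; the task name, directory, and filename values are assumptions for illustration only.

```yaml
# Hypothetical serve task writing JSON Lines to disk (names and paths are assumptions)
kind: serve
name: train-export
keep: train
output:
  transport: fs
  format: json-lines
  directory: artifacts/serve
  filename: vectors.train
limit: null
```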
@@ -0,0 +1,4 @@
+ kind: serve
+ name: val
+ keep: val
+
@@ -0,0 +1,34 @@
+ # Workspace defaults. The scaffolder copies this to your workspace root (where
+ # you ran `jerry plugin init`). CLI commands walk upward from cwd to find it.
+
+ # Relative path from this workspace file back to the plugin root.
+ plugin_root: . # e.g., "lib/myplugin" if your plugin lives under lib/
+
+ # Dataset aliases for `--dataset`; values may be dirs (auto-append project.yaml).
+ datasets:
+   example: example/project.yaml
+   your-second-example-dataset: your-dataset/project.yaml
+
+ # Default dataset alias when --dataset/--project are omitted.
+ default_dataset: example
+
+ # Shared fallbacks used by all commands (unless overridden).
+ shared:
+   visuals: AUTO # AUTO | TQDM | RICH | OFF
+   progress: BARS # AUTO | SPINNER | BARS | OFF
+   log_level: INFO # Default log level when not set elsewhere
+
+ # Defaults for `jerry serve` (run-time options).
+ serve:
+   # log_level: INFO # Uncomment to force INFO for serve runs
+   limit: null # Cap vectors; null means unlimited
+   stage: null # Preview a specific stage; null runs the full pipeline
+   output:
+     transport: stdout
+     format: print # stdout: print|json-lines|json|csv|pickle
+     # directory: artifacts/serve # Required when transport=fs
+
+ # Defaults for `jerry build` (artifact materialization).
+ build:
+   # log_level: INFO # Uncomment to set build log level
+   mode: AUTO # AUTO | FORCE | OFF
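The `plugin_root` comment above mentions plugins that live under `lib/`. A hedged sketch of that layout follows; whether dataset alias paths resolve relative to the workspace root or the plugin root is not stated in this diff, so the alias path here is an assumption, as are the plugin directory name and the shared overrides.

```yaml
# Hypothetical jerry.yaml for a workspace whose plugin lives under lib/ (paths are assumptions)
plugin_root: lib/myplugin

datasets:
  example: lib/myplugin/example   # value is a dir, so project.yaml is auto-appended

default_dataset: example

shared:
  visuals: RICH
  progress: BARS
```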
@@ -0,0 +1,31 @@
+ kind: ingest
+ source: synthetic.ticks
+ id: time.ticks.hour_sin # format: domain.dataset.(variant)
+
+ # Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+ cadence: ${group_by}
+
+ mapper:
+   entrypoint: encode_time
+   args: { mode: hour_sin }
+
+ # partition_by: field you want to partition
+
+ record:
+   - filter: { operator: ge, field: time, comparand: "${start_time}" }
+   - filter: { operator: le, field: time, comparand: "${end_time}" }
+   - floor_time: { cadence: "${cadence}" }
+   # - lag: { lag: "${cadence}" }
+
+ stream:
+   - dedupe: {}
+   - granularity: { mode: first }
+   - ensure_cadence: { cadence: "${cadence}" }
+   # Optional: fill gaps before downstream transforms:
+   # - fill: { statistic: median, window: 24, min_samples: 4 }
+
+ debug:
+   - lint: { mode: error, tick: "${cadence}" }
+
+ # sort_batch_size: 100000
+
@@ -0,0 +1,30 @@
+ kind: ingest
+ source: synthetic.ticks # raw source alias (see example/sources)
+ id: time.ticks.linear # canonical stream id (format: domain.dataset.(variant))
+
+ # Fine-grained cadence for this stream. Defaults to the dataset group_by via project.globals.
+ cadence: ${group_by}
+
+ mapper: # normalize/reshape DTO -> TemporalRecord
+   entrypoint: encode_time
+   args: { mode: linear }
+ # partition_by: station_id # optional: add partition suffixes to feature ids
+
+ record: # record-level transforms
+   - filter: { operator: ge, field: time, comparand: "${start_time}" }
+   - filter: { operator: le, field: time, comparand: "${end_time}" }
+   - floor_time: { cadence: "${cadence}" } # snap timestamps to cadence boundaries
+   # - lag: { lag: "${cadence}" } # optional: shift timestamps backwards
+
+ stream: # per-feature stream transforms (input sorted by id,time)
+   - dedupe: {} # drop exact-duplicate records per tick
+   - granularity: { mode: first } # aggregate duplicates within a tick
+   - ensure_cadence: { cadence: "${cadence}" } # insert missing ticks (value=None)
+   # Consider adding a fill transform to impute None values before sequence/windowing:
+   # - fill: { statistic: median, window: 6, min_samples: 1 }
+
+ debug: # optional validation-only transforms
+   - lint: { mode: error, tick: "${cadence}" } # strict cadence/order; value issues handled by downstream transforms
+
+ # sort_batch_size: 100000 # in-memory chunk size used by internal sorting
+
@@ -0,0 +1,18 @@
+ group_by: ${group_by}
+
+ features:
+   - id: time_linear
+     record_stream: time.ticks.linear
+     scale: true # optionally add with_mean/with_std overrides
+     # Sliding window over the regularized stream; cadence is enforced in the contract.
+     sequence: { size: 6, stride: 1 }
+
+   - id: time_hour_sin
+     record_stream: time.ticks.hour_sin
+
+   # - id: third_feature
+   #   record_stream: anotherstream
+ # targets:
+ #   - id: some_target
+ #     record_stream: time.ticks.linear
+
@@ -0,0 +1,29 @@
+ #### example combination of postprocessing steps ######
+ #### making sure data is complete after these combinations ######
+ - drop: # example of dropping sparse partitions (vertical axis) for targets
+     axis: vertical
+     payload: targets
+     threshold: 0.9
+
+ - drop: # example of dropping sparse partitions for features
+     axis: vertical
+     payload: features
+     threshold: 0.9
+
+ - drop: # drop vectors (horizontal axis) that have features which are None
+     axis: horizontal
+     payload: features
+     threshold: 1
+
+ - drop:
+     axis: horizontal
+     payload: targets
+     threshold: 1
+ ######
+ # - fill:
+ #     statistic: median
+ #     window: 48
+ #     min_samples: 6
+ # - replace:
+ #     payload: targets
+ #     value: 0.0
@@ -0,0 +1,22 @@
+ version: 1
+ name: <your-dataset>
+ paths:
+   streams: ./contracts
+   sources: ./sources
+   dataset: dataset.yaml
+   postprocess: postprocess.yaml
+   artifacts: ../artifacts/${project_name}/v${version}
+   tasks: ./tasks
+ globals:
+   # Primary dataset cadence; referenced from dataset.yaml (group_by)
+   # and contracts via ${group_by}.
+   group_by: <your-bucket-cadence>
+   start_time: null #2021-01-01T00:00:00Z
+   end_time: null #2021-01-02T00:00:00Z
+   # Configure deterministic dataset split here (applied at serve time, after postprocess).
+   # Adjust `ratios` as needed; the active split is selected via serve tasks or CLI.
+   split:
+     mode: hash # hash | time (time uses boundaries/labels)
+     key: group # group | feature:<id> (entity-stable split)
+     seed: 42 # deterministic hash seed
+     ratios: { train: 0.8, val: 0.1, test: 0.1 }
@@ -0,0 +1,12 @@
+ id: synthetic.ticks
+
+ parser:
+   entrypoint: "core.synthetic.ticks"
+   args: {}
+ loader:
+   entrypoint: "core.synthetic.ticks"
+   args:
+     start: "${start_time}"
+     end: "${end_time}"
+     frequency: "${group_by}"
+
@@ -0,0 +1,3 @@
+ kind: metadata
+ # window_mode: intersection # union|intersection|strict|relaxed (default: intersection)
+
@@ -0,0 +1,9 @@
+ kind: scaler
+
+ # Output path is relative to project.paths.artifacts; defaults to "scaler.pkl".
+ # output: scaler.pkl
+
+ # Split label to use when fitting scaler statistics.
+ # Must match a label from globals.split.ratios.
+ split_label: train
+
@@ -0,0 +1,4 @@
+ kind: serve
+ name: test
+ keep: test
+
@@ -0,0 +1,28 @@
+ kind: serve
+
+ # Optional identifier for this serve task; defaults to filename stem.
+ name: train
+
+ # Active split label to serve; must match a label from globals.split.ratios.
+ # Set to null to disable split filtering.
+ keep: train
+ # output:
+ #   transport: stdout | fs
+ #   format: print | json-lines | json | csv | pickle
+ # When using fs transport, set a directory (and optionally filename) for outputs:
+ #   directory: artifacts/serve
+ #   filename: vectors.train
+
+ # Default max number of vectors to emit (null = unlimited).
+ # limit: 5
+ # Optional pipeline stage preview (0-7); null lets the CLI decide.
+ # stage: 7
+
+ # Optional pacing between emitted vectors (milliseconds).
+ # throttle_ms: null
+
+ # Visuals/logging knobs (inherit CLI or jerry.yaml defaults when omitted):
+ # visuals: AUTO # AUTO | TQDM | RICH | OFF
+ # progress: AUTO # AUTO | SPINNER | BARS | OFF
+ # log_level: INFO # CRITICAL | ERROR | WARNING | INFO | DEBUG
+
@@ -0,0 +1,4 @@
+ kind: serve
+ name: val
+ keep: val
+
@@ -1,4 +1,6 @@
 from dataclasses import dataclass
+ from datetime import datetime
+
 
 @dataclass
 class {{CLASS_NAME}}: