jerry-thomas 1.0.1__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/PKG-INFO +290 -288
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/README.md +289 -287
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/pyproject.toml +2 -1
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/app.py +9 -10
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/contract.py +8 -2
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/source.py +5 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/common.py +57 -5
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/labels.py +8 -41
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/sources_rich.py +8 -3
- jerry_thomas-1.0.3/src/datapipeline/cli/workspace_utils.py +25 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/dataset/dataset.py +1 -1
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/dataset/normalize.py +9 -4
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/workspace.py +15 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/source.py +2 -1
- jerry_thomas-1.0.3/src/datapipeline/sources/foreach.py +151 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/pyproject.toml +1 -1
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +1 -1
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/PKG-INFO +290 -288
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/SOURCES.txt +2 -3
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/entry_points.txt +1 -0
- jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +0 -31
- jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +0 -30
- jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +0 -12
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/LICENSE +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/setup.cfg +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/analysis/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/analysis/vector/collector.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/analysis/vector/matrix.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/analysis/vector/report.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/analysis/vector_analyzer.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/state.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/config.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/metadata.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/scaler.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/schema.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/utils.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/build.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/domain.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/filter.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/inspect.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/list_.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/plugin.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/run.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/run_config.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/serve_pipeline.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/runner.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/sections.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/sources.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/sources_basic.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/sources_off.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/catalog.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/context.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/dataset/feature.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/dataset/loader.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/metadata.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/postprocess.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/project.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/resolution.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/split.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/tasks.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/domain/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/domain/feature.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/domain/record.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/domain/sample.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/domain/vector.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/filters/filters.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/ml/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/ml/adapter.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/ml/pandas_support.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/ml/rows.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/ml/torch_support.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/factory.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/output.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/protocols.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/serializers.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/sinks/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/sinks/base.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/sinks/files.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/sinks/rich.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/sinks/stdout.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/writers/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/writers/base.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/writers/csv_writer.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/writers/jsonl.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/writers/pickle_writer.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/mappers/noop.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/mappers/synthetic/time.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/parsers/identity.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/artifacts.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/context.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/observability.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/pipelines.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/split.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/stages.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/utils/keygen.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/utils/memory_sort.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/utils/ordering.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/utils/transform_utils.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/plugins.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/registries/registry.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/runtime.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/artifacts.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/bootstrap/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/bootstrap/config.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/bootstrap/core.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/constants.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/entrypoints.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/factories.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/paths.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/project_paths.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/runs.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/domain.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/filter.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/mappers.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/plugin.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/templates.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/data_loader.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/decoders.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/factory.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/base.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/generator.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/loader.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/parser.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/parsing_error.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/source.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/synthetic.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/synthetic/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/synthetic/time/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/synthetic/time/loader.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/synthetic/time/parser.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/transports.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/README.md +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/project.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/jerry.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/dto.py.j2 +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/filter.py.j2 +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/loader_synthetic.py.j2 +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/mapper.py.j2 +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/parser.py.j2 +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/parser_custom.py.j2 +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/record.py.j2 +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/source.yaml.j2 +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/debug/identity.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/debug/lint.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/feature/model.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/feature/scaler.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/filter.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/record/floor_time.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/record/lag.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/sequence.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/stream/dedupe.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/stream/ensure_ticks.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/stream/fill.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/stream/granularity.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/utils.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/common.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/drop/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/drop/horizontal.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/drop/orchestrator.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/drop/vertical.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/ensure_schema.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/fill.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/replace.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector_utils.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/__init__.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/load.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/paths.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/pickle_model.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/placeholders.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/rich_compat.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/time.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/window.py +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/dependency_links.txt +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/requires.txt +0 -0
- {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: jerry-thomas
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.0.3
|
|
4
4
|
Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
|
|
5
5
|
Author: Anders Skott Lind
|
|
6
6
|
License: MIT
|
|
@@ -49,263 +49,112 @@ transforms, and filters.
|
|
|
49
49
|
|
|
50
50
|
## Quick Start
|
|
51
51
|
|
|
52
|
+
### Serve The Example
|
|
53
|
+
|
|
52
54
|
```bash
|
|
53
|
-
|
|
54
|
-
|
|
55
|
+
pip install jerry-thomas
|
|
56
|
+
jerry plugin init my-datapipeline --out lib/
|
|
57
|
+
jerry serve --limit 3
|
|
58
|
+
```
|
|
55
59
|
|
|
56
|
-
|
|
57
|
-
jerry plugin init my_datapipeline --out .
|
|
60
|
+
### Create Your Own Stream
|
|
58
61
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
jerry source add demo weather --transport fs --format csv
|
|
62
|
-
jerry source add demo.weather --transport http --format json
|
|
62
|
+
This assumes you have already run `jerry plugin init ...` in this workspace (it writes `jerry.yaml`, which the CLI uses for defaults and scaffolding paths).
|
|
63
|
+
These scaffolding commands write YAML into the dataset selected by `default_dataset` in `jerry.yaml` (`example` by default).
|
|
63
64
|
|
|
64
|
-
|
|
65
|
-
jerry source add
|
|
65
|
+
```bash
|
|
66
|
+
jerry source add demo weather -t fs -f csv
|
|
66
67
|
jerry domain add weather
|
|
67
|
-
|
|
68
|
+
jerry contract
|
|
69
|
+
pip install -e lib/my-datapipeline
|
|
70
|
+
```
|
|
68
71
|
|
|
69
|
-
|
|
70
|
-
# Then preview the pipeline and serve a few vectors:
|
|
71
|
-
# Add --skip-build when you only need a quick feature peek.
|
|
72
|
-
jerry serve --project config/project.yaml --stage 2 --limit 5
|
|
73
|
-
jerry serve --project config/project.yaml --limit 3
|
|
72
|
+
---
|
|
74
73
|
|
|
75
|
-
|
|
76
|
-
jerry inspect report --project config/project.yaml
|
|
77
|
-
jerry build --project config/project.yaml
|
|
78
|
-
```
|
|
74
|
+
## CLI Cheat Sheet
|
|
79
75
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
76
|
+
- `jerry plugin init <name> --out lib/`: scaffolds `lib/<name>/` and writes workspace `jerry.yaml`.
|
|
77
|
+
- `jerry.yaml` (created by `plugin init`): sets `plugin_root` for scaffolding commands and `datasets/default_dataset` so you can omit `--project`/`--dataset`.
|
|
78
|
+
- `jerry serve [--dataset <alias>|--project <path>] [--limit N] [--stage 0-7] [--skip-build]`: streams output; builds required artifacts unless `--skip-build`.
|
|
79
|
+
- `jerry build [--dataset <alias>|--project <path>] [--force]`: materializes artifacts (schema, scaler, expected IDs, etc.).
|
|
80
|
+
- `jerry inspect report|matrix|partitions|expected [--dataset <alias>|--project <path>]`: quality and metadata helpers.
|
|
81
|
+
- `jerry source add <provider> <dataset> -t fs|http|synthetic -f csv|json|json-lines|pickle [--identity]`: scaffolds a source YAML and (unless `--identity`) a parser + entry point.
|
|
82
|
+
- `jerry domain add <domain>`: scaffolds domain models under `src/<package>/domains/<domain>/`.
|
|
83
|
+
- `jerry contract [--identity]`: interactive contract scaffolder; most users pick `[1] Ingest (source → stream)` (use `[2] Composed` for derived streams, e.g. air_density from temp + pressure).
|
|
84
|
+
- `pip install -e lib/<name>`: rerun after commands that update `lib/<name>/pyproject.toml` (entry points), or after manual edits to it.
|
|
83
85
|
|
|
84
86
|
---
|
|
85
87
|
|
|
86
|
-
##
|
|
88
|
+
## Concepts
|
|
87
89
|
|
|
88
|
-
|
|
89
|
-
raw source ──▶ loader/parser DTOs ──▶ canonical stream ──▶ record policies
|
|
90
|
-
└──▶ feature wrapping ──▶ stream regularization ──▶ feature transforms/sequence
|
|
91
|
-
└──▶ vector assembly ──▶ postprocess transforms
|
|
92
|
-
```
|
|
90
|
+
### Workspace (`jerry.yaml`)
|
|
93
91
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
2. **Canonical stream mapping (Stage 1)** – mappers attach domain semantics and
|
|
98
|
-
partition keys, producing domain `TemporalRecord`s.
|
|
99
|
-
3. **Record policies (Stage 2)** – contract `record` rules (filters, floor, lag)
|
|
100
|
-
prune and normalize DTO-derived records.
|
|
101
|
-
4. **Feature wrapping (Stage 3)** – records become `FeatureRecord`s before
|
|
102
|
-
sort/regularization.
|
|
103
|
-
5. **Stream regularization (Stage 4)** – contract `stream` rules ensure cadence,
|
|
104
|
-
deduplicate timestamps, and impute where needed.
|
|
105
|
-
6. **Feature transforms/sequence (Stage 5)** – dataset transforms (scale,
|
|
106
|
-
sequence windows) produce per-feature tensors or windows.
|
|
107
|
-
7. **Vector assembly (Stage 6)** – features merge by `group_by` cadence into
|
|
108
|
-
`(group_key, Vector)` pairs, prior to postprocess tweaks.
|
|
109
|
-
8. **Postprocess (Stage 7)** – optional vector transforms (fill/drop/etc.) run
|
|
110
|
-
before results are emitted to the configured output.
|
|
92
|
+
- `datasets`: dataset aliases → `project.yaml` paths (relative to `jerry.yaml`).
|
|
93
|
+
- `default_dataset`: which dataset `jerry serve/build/inspect` use when you omit `--dataset/--project`.
|
|
94
|
+
- `plugin_root`: where scaffolding commands write Python code (`src/<package>/...`) and where they look for `pyproject.toml`.
|
|
111
95
|
|
|
112
|
-
|
|
96
|
+
### Plugin Package (Python Code)
|
|
113
97
|
|
|
114
|
-
|
|
115
|
-
flowchart TB
|
|
116
|
-
subgraph CLI & Project config
|
|
117
|
-
cliSource[jerry source add]
|
|
118
|
-
cliDomain[jerry domain add]
|
|
119
|
-
cliContract[jerry contract]
|
|
120
|
-
cliServe[jerry serve]
|
|
121
|
-
project[[project.yaml]]
|
|
122
|
-
sourcesCfg[config/sources/*.yaml]
|
|
123
|
-
contractsCfg[config/contracts/*.yaml]
|
|
124
|
-
datasetCfg[dataset.yaml]
|
|
125
|
-
postprocessCfg[postprocess.yaml]
|
|
126
|
-
end
|
|
98
|
+
These live under `lib/<plugin>/src/<package>/`:
|
|
127
99
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
project -.->|paths.sources| sourcesCfg
|
|
133
|
-
project -.->|paths.streams| contractsCfg
|
|
134
|
-
project -.->|paths.dataset| datasetCfg
|
|
135
|
-
project -.->|paths.postprocess| postprocessCfg
|
|
100
|
+
- `sources/<provider>/<dataset>/dto.py` + `parser.py`: source DTO + parser (created by `jerry source add` unless `--identity`).
|
|
101
|
+
- `domains/<domain>/model.py`: domain records (created by `jerry domain add`).
|
|
102
|
+
- `mappers/<provider>/<dataset>/to_<domain>.py`: DTO → domain record mapping (usually created by `jerry contract`).
|
|
103
|
+
- `pyproject.toml`: entry points for loaders/parsers/mappers/transforms (rerun `pip install -e lib/<plugin>` after changes).
|
|
136
104
|
|
|
137
|
-
|
|
138
|
-
domainPkg[domains/*]
|
|
139
|
-
mappersPkg[mappers/*]
|
|
140
|
-
end
|
|
105
|
+
### Loaders & Parsers
|
|
141
106
|
|
|
142
|
-
|
|
143
|
-
|
|
107
|
+
- A **loader** yields raw rows (bytes/dicts) from some transport (FS/HTTP/synthetic/etc.).
|
|
108
|
+
- A **parser** turns each raw row into a typed DTO (or returns `None` to drop a row).
|
|
109
|
+
- In most projects, your source YAML uses the built-in loader `core.io` and you only customize its `args` (`transport`, `format`, and a `path`/`url`).
|
|
110
|
+
- You typically only implement a custom loader when you need specialized behavior (auth/pagination/rate limits, proprietary formats, or non-standard protocols).
|
|
111
|
+
- `parser.args` are optional and only used when your parser supports configuration; many parsers don’t need any args, since filtering, etc., is supported natively downstream.
|
|
144
112
|
|
|
145
|
-
|
|
146
|
-
registrySources[sources]
|
|
147
|
-
registryStreamSources[stream_sources]
|
|
148
|
-
registryMappers[mappers]
|
|
149
|
-
registryRecordOps[record_ops]
|
|
150
|
-
registryStreamOps[stream_ops]
|
|
151
|
-
registryDebugOps[debug_ops]
|
|
152
|
-
end
|
|
113
|
+
### DTOs & Domains
|
|
153
114
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
parserEP[parser ep]
|
|
159
|
-
sourceArgs[loader args]
|
|
160
|
-
sourceNode[Source]
|
|
161
|
-
dtoStream[(DTOs)]
|
|
162
|
-
end
|
|
115
|
+
- A **DTO** (Data Transfer Object) mirrors a single source’s schema (columns/fields) and stays “raw-shaped”; it’s what parsers emit.
|
|
116
|
+
- A **domain record** is the canonical shape used across the pipeline. Mappers convert DTOs into domain records so multiple sources can land in the same domain model.
|
|
117
|
+
- The base time-series type is `TemporalRecord` (`time` + `value`). Domains typically add identity fields (e.g. `symbol`, `station_id`) that make filtering/partitioning meaningful.
|
|
118
|
+
- `time` must be timezone-aware (normalized to UTC); `value` is the measurement you engineer features from; all other fields act as the record’s “identity” (used by equality/deduping and commonly by `partition_by`).
|
|
163
119
|
|
|
164
|
-
|
|
165
|
-
sourcesCfg --> loaderEP
|
|
166
|
-
sourcesCfg --> parserEP
|
|
167
|
-
sourcesCfg --> sourceArgs
|
|
168
|
-
transportSpec -. select fs/http/synth .-> loaderEP
|
|
169
|
-
loaderEP -. build loader .-> sourceNode
|
|
170
|
-
parserEP -. build parser .-> sourceNode
|
|
171
|
-
sourceArgs -. paths/creds .-> sourceNode
|
|
172
|
-
rawData --> sourceNode --> dtoStream
|
|
173
|
-
sourcesCfg -. build_source_from_spec .-> registrySources
|
|
174
|
-
contractsCfg -. stream_id + source .-> registryStreamSources
|
|
175
|
-
registrySources -. alias -> Source .-> registryStreamSources
|
|
120
|
+
### Glossary
|
|
176
121
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
canonical[DTO -> record]
|
|
183
|
-
domainRecords((TemporalRecord))
|
|
184
|
-
recordStage[record xforms]
|
|
185
|
-
featureWrap[record -> feature]
|
|
186
|
-
featureRecords((FeatureRecord))
|
|
187
|
-
regularization[stream xforms]
|
|
188
|
-
end
|
|
122
|
+
- **Source alias**: `sources/*.yaml:id` (referenced by contracts under `source:`).
|
|
123
|
+
- **Stream id**: `contracts/*.yaml:id` (referenced by `dataset.yaml` under `record_stream:`).
|
|
124
|
+
- **Partition**: dimension keys appended to feature IDs, driven by `contract.partition_by`.
|
|
125
|
+
- **Group**: vector “bucket” cadence set by `dataset.group_by` (controls how records become samples).
|
|
126
|
+
- **Stage**: debug/preview level for `jerry serve --stage 0-7` (DTOs → domain records → features → vectors).
|
|
189
127
|
|
|
190
|
-
|
|
191
|
-
contractsCfg --> mapperEP
|
|
192
|
-
mappersPkg -. ep target .-> mapperEP
|
|
193
|
-
mapperEP -. build_mapper_from_spec .-> registryMappers
|
|
194
|
-
registryMappers --> canonical
|
|
195
|
-
contractsCfg --> recordRules
|
|
196
|
-
contractsCfg --> streamRules
|
|
197
|
-
contractsCfg --> debugRules
|
|
198
|
-
registryRecordOps --> recordRules
|
|
199
|
-
registryStreamOps --> streamRules
|
|
200
|
-
registryDebugOps --> debugRules
|
|
201
|
-
recordRules --> recordStage
|
|
202
|
-
streamRules --> regularization
|
|
203
|
-
debugRules --> regularization
|
|
128
|
+
### Dataset Project (YAML Config)
|
|
204
129
|
|
|
205
|
-
|
|
206
|
-
featureSpec[feature cfg]
|
|
207
|
-
groupBySpec[group_by]
|
|
208
|
-
streamRefs[record_stream ids]
|
|
209
|
-
featureTrans[feature/seq xforms]
|
|
210
|
-
sequenceStream((seq/features))
|
|
211
|
-
vectorStage[vector assembly]
|
|
212
|
-
vectorSamples((samples))
|
|
213
|
-
end
|
|
130
|
+
These live under the dataset “project root” directory (the folder containing `project.yaml`):
|
|
214
131
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
featureSpec -. scale/sequence .-> featureTrans
|
|
222
|
-
groupBySpec -. cadence .-> vectorStage
|
|
132
|
+
- `project.yaml`: paths + globals (single source of truth).
|
|
133
|
+
- `sources/*.yaml`: raw sources (loader + parser wiring).
|
|
134
|
+
- `contracts/*.yaml`: canonical streams (ingest or composed).
|
|
135
|
+
- `dataset.yaml`: feature/target declarations.
|
|
136
|
+
- `postprocess.yaml`: vector-level transforms.
|
|
137
|
+
- `tasks/*.yaml`: serve presets and artifact task configs.
|
|
223
138
|
|
|
224
|
-
|
|
225
|
-
vectorTransforms[vector xforms]
|
|
226
|
-
postprocessNode[postprocess]
|
|
227
|
-
end
|
|
139
|
+
### Configuration & Resolution Order
|
|
228
140
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
141
|
+
Defaults are layered so you can set global preferences once, keep dataset/run
|
|
142
|
+
files focused on per-project behavior, and still override anything from the CLI.
|
|
143
|
+
For both `jerry serve` and `jerry build`, options are merged in the following
|
|
144
|
+
order (highest precedence first):
|
|
232
145
|
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
style
|
|
238
|
-
|
|
239
|
-
style datasetCfg width:180px
|
|
240
|
-
style postprocessCfg width:200px
|
|
241
|
-
style registrySources width:160px
|
|
242
|
-
style registryStreamSources width:180px
|
|
243
|
-
style registryMappers width:160px
|
|
244
|
-
style registryRecordOps width:180px
|
|
245
|
-
style registryStreamOps width:180px
|
|
246
|
-
style registryDebugOps width:180px
|
|
247
|
-
style transportSpec width:180px
|
|
248
|
-
style loaderEP width:140px
|
|
249
|
-
style parserEP width:140px
|
|
250
|
-
style sourceArgs width:160px
|
|
251
|
-
style canonical width:180px
|
|
252
|
-
style featureTrans width:180px
|
|
253
|
-
style domainRecords width:140px
|
|
254
|
-
style featureRecords width:140px
|
|
255
|
-
style sequenceStream width:180px
|
|
256
|
-
style vectorStage width:160px
|
|
257
|
-
style vectorSamples width:180px
|
|
258
|
-
style recordRules width:160px
|
|
259
|
-
style streamRules width:160px
|
|
260
|
-
style debugRules width:160px
|
|
261
|
-
style domainPkg width:160px
|
|
262
|
-
style mappersPkg width:160px
|
|
263
|
-
|
|
264
|
-
Solid arrows trace runtime data flow; dashed edges highlight how the config files
|
|
265
|
-
inject transports, entry points, or policies into each stage.
|
|
266
|
-
|
|
267
|
-
CLI quick path:
|
|
268
|
-
- `jerry source add <provider> <dataset> --transport fs|http|synthetic --format ...` → scaffolds DTO/parser/loader and writes `config/sources/*.yaml`.
|
|
269
|
-
- `jerry domain add <name>` → creates `src/<pkg>/domains/<name>/model.py`.
|
|
270
|
-
- `jerry contract` → picks a source + domain, scaffolds/links a mapper under `mappers/`, registers its entry point, and writes `config/contracts/<stream>.yaml`.
|
|
271
|
-
- `jerry serve --project <project.yaml>` → builds/streams vectors using dataset `record_stream` IDs, registry wiring, and postprocess rules.
|
|
272
|
-
|
|
273
|
-
`config/sources/*.yaml` determines both the transport and parsing strategy:
|
|
274
|
-
you define transport (`fs`, `http`, `synthetic`, etc.), the payload format
|
|
275
|
-
(`csv`, `json`, ...), and the loader/parser entry points. Loader `args`
|
|
276
|
-
typically include file paths, bucket prefixes, or credential references—the
|
|
277
|
-
runtime feeds those arguments into the instantiated loader so it knows exactly
|
|
278
|
-
which external data store to read. Contracts bind each canonical stream to a
|
|
279
|
-
`source` alias (connecting back to the loader/parser pair) and register a
|
|
280
|
-
stream ID; they also specify mapper entry points, record/stream rules,
|
|
281
|
-
partitioning, and batch sizes. Dataset features reference those canonical
|
|
282
|
-
stream IDs via `record_stream`, so each feature config reuses the registered
|
|
283
|
-
stream (and, by extension, the raw source) when you call
|
|
284
|
-
`build_feature_pipeline()` (`src/datapipeline/pipeline/pipelines.py`). Finally,
|
|
285
|
-
`postprocess.yaml` decorates the vector stream with additional filters/fills so
|
|
286
|
-
serve/build outputs inherit the full set of policies. When you run the CLI,
|
|
287
|
-
`bootstrap()` (`src/datapipeline/services/bootstrap/core.py`) loads each
|
|
288
|
-
directory declared in `project.yaml`, instantiates loaders/parsers via
|
|
289
|
-
`build_source_from_spec()` and `load_ep()`, attaches contract registries, and
|
|
290
|
-
hands a fully wired `Runtime` to the pipeline stages in
|
|
291
|
-
`src/datapipeline/pipeline/stages.py`.
|
|
292
|
-
|
|
293
|
-
Every `record_stream` identifier ultimately resolves to the stream entry revived
|
|
294
|
-
by the contract bootstrap step, so requesting stage outputs for a feature always
|
|
295
|
-
walks the entire chain from dataset config → canonical contract → source
|
|
296
|
-
definition. That is why `build_feature_pipeline()` starts by calling
|
|
297
|
-
`open_source_stream(context, record_stream_id)` before stepping through record
|
|
298
|
-
policies, stream policies, and feature transforms.
|
|
299
|
-
|
|
300
|
-
The runtime (`src/datapipeline/runtime.py`) hosts registries for sources,
|
|
301
|
-
transforms, artifacts, and postprocess rules. The CLI constructs lightweight
|
|
302
|
-
`PipelineContext` objects to build iterators without mutating global state.
|
|
146
|
+
1. **CLI flags** – anything you pass on the command line always wins.
|
|
147
|
+
2. **Project task files** – `kind: serve` specs (under `project.paths.tasks`)
|
|
148
|
+
supply serve defaults; artifact tasks in the same directory drive `jerry build`.
|
|
149
|
+
3. **`jerry.yaml` command blocks** – settings under the top-level `serve:` and `build:` keys of `jerry.yaml`.
|
|
150
|
+
4. **`jerry.yaml` `shared` block** – shared fallbacks for settings such as `visuals`, `progress`, and `log_level`.
|
|
151
|
+
5. **Built-in defaults** – runtime hard-coded defaults.
|
|
303
152
|
|
|
304
153
|
---
|
|
305
154
|
|
|
306
|
-
##
|
|
155
|
+
## YAML Config Reference
|
|
307
156
|
|
|
308
|
-
All
|
|
157
|
+
All dataset configuration is rooted at a single `project.yaml` file. Other YAML files are discovered via `project.paths.*` (relative to `project.yaml` unless absolute).
|
|
309
158
|
|
|
310
159
|
### `project.yaml`
|
|
311
160
|
|
|
@@ -317,7 +166,7 @@ paths:
|
|
|
317
166
|
sources: ./sources
|
|
318
167
|
dataset: dataset.yaml
|
|
319
168
|
postprocess: postprocess.yaml
|
|
320
|
-
artifacts: ../
|
|
169
|
+
artifacts: ../artifacts/${project_name}/v${version}
|
|
321
170
|
tasks: ./tasks
|
|
322
171
|
globals:
|
|
323
172
|
start_time: 2021-01-01T00:00:00Z
|
|
@@ -344,13 +193,13 @@ globals:
|
|
|
344
193
|
|
|
345
194
|
```yaml
|
|
346
195
|
kind: serve
|
|
347
|
-
name: train
|
|
348
|
-
keep: train
|
|
196
|
+
name: train # defaults to filename stem when omitted
|
|
197
|
+
keep: train # select active split label (null disables filtering)
|
|
349
198
|
output:
|
|
350
|
-
transport: stdout
|
|
351
|
-
format: print
|
|
352
|
-
limit: 100
|
|
353
|
-
throttle_ms: null
|
|
199
|
+
transport: stdout # stdout | fs
|
|
200
|
+
format: print # print | json-lines | json | csv | pickle
|
|
201
|
+
limit: 100 # cap vectors per serve run (null = unlimited)
|
|
202
|
+
throttle_ms: null # milliseconds to sleep between emitted vectors
|
|
354
203
|
# Optional overrides:
|
|
355
204
|
# log_level: INFO # DEBUG=progress bars, INFO=spinner, WARNING=quiet
|
|
356
205
|
# visuals: AUTO # AUTO | TQDM | RICH | OFF
|
|
@@ -358,7 +207,7 @@ throttle_ms: null # milliseconds to sleep between emitted vectors
|
|
|
358
207
|
```
|
|
359
208
|
|
|
360
209
|
- Each serve task lives alongside artifact tasks under `paths.tasks`. Files are independent—no special directory structure required.
|
|
361
|
-
- `output`, `limit`, `throttle_ms`, and `log_level` provide defaults for `jerry serve`; CLI flags still win per invocation (see
|
|
210
|
+
- `output`, `limit`, `throttle_ms`, and `log_level` provide defaults for `jerry serve`; CLI flags still win per invocation (see _Configuration & Resolution Order_). For filesystem outputs, set `transport: fs`, `directory: /path/to/root`, and omit file names—each run automatically writes to `<directory>/<run_name>/<run_name>.<ext>` unless you override the entire `output` block with a custom `filename`.
|
|
362
211
|
- Override `keep` (and other fields) per invocation via `jerry serve ... --keep val` etc.
|
|
363
212
|
- Visuals backend: set `visuals: AUTO|TQDM|RICH|OFF` in the task or use `--visuals`. Pair with `progress: AUTO|SPINNER|BARS|OFF` or `--progress` to control progress layouts.
|
|
364
213
|
- Add additional `kind: serve` files to the tasks directory for other splits (val/test/etc.); `jerry serve` runs each enabled file unless you pass `--run <name>`.
|
|
@@ -369,81 +218,87 @@ throttle_ms: null # milliseconds to sleep between emitted vectors
|
|
|
369
218
|
Create an optional `jerry.yaml` in the directory where you run the CLI to share settings across commands. The CLI walks up from the current working directory to find the first `jerry.yaml`.
|
|
370
219
|
|
|
371
220
|
```yaml
|
|
372
|
-
plugin_root: lib/
|
|
373
|
-
|
|
221
|
+
plugin_root: lib/my-datapipeline # plugin workspace (relative to this file)
|
|
222
|
+
|
|
223
|
+
# Dataset aliases for --dataset; a value may be a directory, in which case project.yaml is appended automatically.
|
|
224
|
+
datasets:
|
|
225
|
+
example: lib/my-datapipeline/example/project.yaml
|
|
226
|
+
default_dataset: example
|
|
374
227
|
|
|
375
228
|
shared:
|
|
376
|
-
visuals:
|
|
377
|
-
progress:
|
|
229
|
+
visuals: AUTO # AUTO | TQDM | RICH | OFF
|
|
230
|
+
progress: BARS # AUTO | SPINNER | BARS | OFF
|
|
231
|
+
log_level: INFO
|
|
378
232
|
|
|
379
233
|
serve:
|
|
380
|
-
|
|
234
|
+
limit: null
|
|
235
|
+
stage: null
|
|
381
236
|
output:
|
|
382
237
|
transport: stdout
|
|
383
|
-
format: print
|
|
238
|
+
format: print # print | json-lines | json | csv | pickle
|
|
384
239
|
# directory: artifacts/serve # Required when transport=fs
|
|
385
240
|
|
|
386
241
|
build:
|
|
387
|
-
log_level: INFO
|
|
388
242
|
mode: AUTO # AUTO | FORCE | OFF
|
|
389
243
|
```
|
|
390
244
|
|
|
391
245
|
`jerry.yaml` sits near the root of your workspace, while dataset-specific overrides still live in individual `tasks/serve.*.yaml` files as needed.
|
|
392
246
|
|
|
393
|
-
###
|
|
247
|
+
### `<project_root>/sources/<alias>.yaml`
|
|
394
248
|
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
For both `jerry serve` and `jerry build`, options are merged in the following
|
|
398
|
-
order (highest precedence first):
|
|
249
|
+
Each file defines a loader/parser pair exposed under `<alias>`. Files may live in nested
|
|
250
|
+
subdirectories under `<project_root>/sources/`; discovery is recursive.
|
|
399
251
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
252
|
+
```yaml
|
|
253
|
+
# Source identifier (commonly `provider.dataset`). Contracts reference this under `source:`.
|
|
254
|
+
id: stooq.ohlcv
|
|
255
|
+
parser:
|
|
256
|
+
# Parser entry point name (registered in your plugin’s pyproject.toml).
|
|
257
|
+
entrypoint: stooq.ohlcv
|
|
258
|
+
loader:
|
|
259
|
+
# Most common loader: core.io (supports fs/http via args.transport + args.format).
|
|
260
|
+
entrypoint: core.io
|
|
261
|
+
args:
|
|
262
|
+
transport: http
|
|
263
|
+
format: csv
|
|
264
|
+
url: "https://stooq.com/q/d/l/?s=aapl.us&i=d"
|
|
265
|
+
```
|
|
412
266
|
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
267
|
+
- `id`: the source alias; referenced by contracts under `source:`.
|
|
268
|
+
- `parser.entrypoint`: which parser to use; `parser.args` are optional.
|
|
269
|
+
- `loader.entrypoint`: which loader to use; `core.io` is the default for fs/http and is configured via `loader.args`.
|
|
416
270
|
|
|
417
|
-
|
|
271
|
+
#### Fan-out Sources (`core.foreach`)
|
|
418
272
|
|
|
419
|
-
|
|
420
|
-
`id` the rest of the pipeline references). Files may live in nested
|
|
421
|
-
subdirectories under `config/sources/`; discovery is recursive.
|
|
273
|
+
Use `core.foreach` to expand any inner loader spec across a list of values without duplicating YAML. It interpolates `${...}` placeholders in string args (e.g. `${symbol}` below) and can optionally inject the current foreach value into each row (see `inject_field`).
|
|
422
274
|
|
|
423
275
|
```yaml
|
|
424
|
-
id: demo_weather
|
|
425
|
-
parser:
|
|
426
|
-
entrypoint: demo.weather_parser
|
|
427
|
-
args:
|
|
428
|
-
timezone: UTC
|
|
429
276
|
loader:
|
|
430
|
-
entrypoint:
|
|
277
|
+
entrypoint: core.foreach
|
|
431
278
|
args:
|
|
432
|
-
|
|
279
|
+
foreach:
|
|
280
|
+
symbol: [AAPL, MSFT]
|
|
281
|
+
inject_field: symbol
|
|
282
|
+
loader:
|
|
283
|
+
entrypoint: core.io
|
|
284
|
+
args:
|
|
285
|
+
transport: http
|
|
286
|
+
format: csv
|
|
287
|
+
url: "https://stooq.com/q/d/l/?s=${symbol}&i=d"
|
|
433
288
|
```
|
|
434
289
|
|
|
435
|
-
###
|
|
290
|
+
### `<project_root>/contracts/<stream_id>.yaml`
|
|
436
291
|
|
|
437
|
-
Canonical stream contracts describe how the runtime should map and prepare a
|
|
438
|
-
source. Use folders to organize by domain.
|
|
292
|
+
Canonical stream contracts describe how the runtime should map and prepare a raw
|
|
293
|
+
source. Use folders to organize by domain if you like.
|
|
439
294
|
|
|
440
295
|
```yaml
|
|
441
296
|
kind: ingest
|
|
442
|
-
id:
|
|
443
|
-
source:
|
|
297
|
+
id: equity.ohlcv # stream identifier (domain.dataset[.variant])
|
|
298
|
+
source: stooq.ohlcv # references sources/<alias>.yaml:id
|
|
444
299
|
|
|
445
300
|
mapper:
|
|
446
|
-
entrypoint:
|
|
301
|
+
entrypoint: equity.ohlcv
|
|
447
302
|
args: {}
|
|
448
303
|
|
|
449
304
|
partition_by: station
|
|
@@ -476,7 +331,7 @@ debug:
|
|
|
476
331
|
Define engineered streams that depend on other canonical streams directly in contracts. The runtime builds each input to stage 4 (ordered + regularized), stream‑aligns by partition + timestamp, runs your composer, and emits fresh records for the derived stream.
|
|
477
332
|
|
|
478
333
|
```yaml
|
|
479
|
-
# contracts/air_density.processed.yaml
|
|
334
|
+
# <project_root>/contracts/air_density.processed.yaml
|
|
480
335
|
kind: composed
|
|
481
336
|
id: air_density.processed
|
|
482
337
|
inputs:
|
|
@@ -521,14 +376,14 @@ Defines which canonical streams become features/targets and the vector bucketing
|
|
|
521
376
|
group_by: 1h
|
|
522
377
|
|
|
523
378
|
features:
|
|
524
|
-
- id:
|
|
525
|
-
record_stream:
|
|
379
|
+
- id: close
|
|
380
|
+
record_stream: equity.ohlcv
|
|
526
381
|
scale: true
|
|
527
382
|
sequence: { size: 6, stride: 1 }
|
|
528
383
|
|
|
529
384
|
targets:
|
|
530
|
-
- id:
|
|
531
|
-
record_stream:
|
|
385
|
+
- id: returns_1d
|
|
386
|
+
record_stream: equity.ohlcv
|
|
532
387
|
```
|
|
533
388
|
|
|
534
389
|
- `group_by` controls the cadence for vector partitioning (accepts `Xm|min|Xh`
|
|
@@ -593,6 +448,7 @@ enabled: true
|
|
|
593
448
|
|
|
594
449
|
All commands live under the `jerry` entry point (`src/datapipeline/cli/app.py`).
|
|
595
450
|
Pass `--help` on any command for flags.
|
|
451
|
+
All commands that take a project accept either `--project <path/to/project.yaml>` or `--dataset <alias>`, where `<alias>` is a key defined under `datasets:` in `jerry.yaml`.
|
|
596
452
|
|
|
597
453
|
### Preview Stages
|
|
598
454
|
|
|
@@ -615,7 +471,7 @@ Pass `--help` on any command for flags.
|
|
|
615
471
|
the group key or metadata. Default is `sample`.
|
|
616
472
|
- Set `--log-level DEBUG` (or set your serve task `log_level: DEBUG`) to reuse the tqdm progress bars when previewing stages.
|
|
617
473
|
- When multiple serve tasks exist, add `--run val` (task name or filename stem) to target a single config; otherwise every enabled task is executed sequentially.
|
|
618
|
-
- Argument precedence follows the order described under
|
|
474
|
+
- Argument precedence follows the order described under _Configuration & Resolution Order_.
|
|
619
475
|
- Combine with `--skip-build` when you already have fresh artifacts and want to jump straight into streaming.
|
|
620
476
|
|
|
621
477
|
### Build & Quality
|
|
@@ -651,7 +507,7 @@ Pass `--help` on any command for flags.
|
|
|
651
507
|
|
|
652
508
|
## Transform & Filter Library
|
|
653
509
|
|
|
654
|
-
### Record Filters (
|
|
510
|
+
### Record Filters (`<project_root>/contracts/*.yaml:record`)
|
|
655
511
|
|
|
656
512
|
- Binary comparisons: `eq`, `ne`, `lt`, `le`, `gt`, `ge` (timezone-aware for ISO
|
|
657
513
|
or datetime literals).
|
|
@@ -791,7 +647,7 @@ and `src/datapipeline/filters/`.
|
|
|
791
647
|
`jerry plugin init`.
|
|
792
648
|
- `datapipeline.services.scaffold.source.create_source` – writes loader/parser
|
|
793
649
|
stubs and updates entry points.
|
|
794
|
-
- `datapipeline.services.scaffold.domain.create_domain` – domain
|
|
650
|
+
- `datapipeline.services.scaffold.domain.create_domain` – domain record skeleton.
|
|
795
651
|
- `datapipeline.services.scaffold.filter.create_filter` – custom filter stub.
|
|
796
652
|
- `datapipeline.services.scaffold.mappers.attach_source_to_domain` – helper for
|
|
797
653
|
programmatically wiring sources to domain mappers and emitting stream
|
|
@@ -821,5 +677,151 @@ and `src/datapipeline/filters/`.
|
|
|
821
677
|
- `examples/minimal_project/` – runnable demo showing config layout and Torch
|
|
822
678
|
integration.
|
|
823
679
|
|
|
824
|
-
|
|
825
|
-
|
|
680
|
+
---
|
|
681
|
+
|
|
682
|
+
## Pipeline Architecture (WIP)
|
|
683
|
+
|
|
684
|
+
```text
|
|
685
|
+
raw source ──▶ loader/parser DTOs ──▶ canonical stream ──▶ record policies
|
|
686
|
+
└──▶ feature wrapping ──▶ stream regularization ──▶ feature transforms/sequence
|
|
687
|
+
└──▶ vector assembly ──▶ postprocess transforms
|
|
688
|
+
```
|
|
689
|
+
|
|
690
|
+
1. **Loader/parser (Stage 0)** – raw bytes become typed DTOs. Loaders fetch from
|
|
691
|
+
FS/HTTP/synthetic sources; parsers map bytes to DTOs. Register them via entry
|
|
692
|
+
points (`loaders`, `parsers`) and wire them in `<project_root>/sources/*.yaml`.
|
|
693
|
+
2. **Canonical stream mapping (Stage 1)** – mappers attach domain semantics and
|
|
694
|
+
partition keys, producing domain `TemporalRecord`s.
|
|
695
|
+
3. **Record policies (Stage 2)** – contract `record` rules (filters, floor, lag)
|
|
696
|
+
prune and normalize DTO-derived records.
|
|
697
|
+
4. **Feature wrapping (Stage 3)** – records become `FeatureRecord`s before
|
|
698
|
+
sort/regularization.
|
|
699
|
+
5. **Stream regularization (Stage 4)** – contract `stream` rules ensure cadence,
|
|
700
|
+
deduplicate timestamps, and impute where needed.
|
|
701
|
+
6. **Feature transforms/sequence (Stage 5)** – dataset transforms (scale,
|
|
702
|
+
sequence windows) produce per-feature tensors or windows.
|
|
703
|
+
7. **Vector assembly (Stage 6)** – features merge by `group_by` cadence into
|
|
704
|
+
`(group_key, Vector)` pairs, prior to postprocess tweaks.
|
|
705
|
+
8. **Postprocess (Stage 7)** – optional vector transforms (fill/drop/etc.) run
|
|
706
|
+
before results are emitted to the configured output.
|
|
707
|
+
|
|
708
|
+
### Visual Flowchart
|
|
709
|
+
|
|
710
|
+
```mermaid
|
|
711
|
+
flowchart TB
|
|
712
|
+
subgraph CLI & Project config
|
|
713
|
+
cliSource[jerry source add]
|
|
714
|
+
cliDomain[jerry domain add]
|
|
715
|
+
cliContract[jerry contract]
|
|
716
|
+
cliServe[jerry serve]
|
|
717
|
+
project[[project.yaml]]
|
|
718
|
+
sourcesCfg[sources/*.yaml]
|
|
719
|
+
contractsCfg[contracts/*.yaml]
|
|
720
|
+
datasetCfg[dataset.yaml]
|
|
721
|
+
postprocessCfg[postprocess.yaml]
|
|
722
|
+
end
|
|
723
|
+
|
|
724
|
+
cliSource --> sourcesCfg
|
|
725
|
+
cliDomain --> domainPkg
|
|
726
|
+
cliContract --> contractsCfg
|
|
727
|
+
cliServe --> vectorSamples
|
|
728
|
+
project -.->|paths.sources| sourcesCfg
|
|
729
|
+
project -.->|paths.streams| contractsCfg
|
|
730
|
+
project -.->|paths.dataset| datasetCfg
|
|
731
|
+
project -.->|paths.postprocess| postprocessCfg
|
|
732
|
+
|
|
733
|
+
subgraph Plugin code
|
|
734
|
+
domainPkg[domains/*]
|
|
735
|
+
mappersPkg[mappers/*]
|
|
736
|
+
end
|
|
737
|
+
|
|
738
|
+
cliContract --> mappersPkg
|
|
739
|
+
domainPkg -. domain models .-> mappersPkg
|
|
740
|
+
|
|
741
|
+
subgraph Registries
|
|
742
|
+
registrySources[sources]
|
|
743
|
+
registryStreamSources[stream_sources]
|
|
744
|
+
registryMappers[mappers]
|
|
745
|
+
registryRecordOps[record_ops]
|
|
746
|
+
registryStreamOps[stream_ops]
|
|
747
|
+
registryDebugOps[debug_ops]
|
|
748
|
+
end
|
|
749
|
+
|
|
750
|
+
subgraph Source wiring
|
|
751
|
+
rawData[(external data)]
|
|
752
|
+
transportSpec[transport + format]
|
|
753
|
+
loaderEP[loader ep]
|
|
754
|
+
parserEP[parser ep]
|
|
755
|
+
sourceArgs[loader args]
|
|
756
|
+
sourceNode[Source]
|
|
757
|
+
dtoStream[(DTOs)]
|
|
758
|
+
end
|
|
759
|
+
|
|
760
|
+
sourcesCfg --> transportSpec
|
|
761
|
+
sourcesCfg --> loaderEP
|
|
762
|
+
sourcesCfg --> parserEP
|
|
763
|
+
sourcesCfg --> sourceArgs
|
|
764
|
+
transportSpec -. select fs/http/synth .-> loaderEP
|
|
765
|
+
loaderEP -. build loader .-> sourceNode
|
|
766
|
+
parserEP -. build parser .-> sourceNode
|
|
767
|
+
sourceArgs -. paths/creds .-> sourceNode
|
|
768
|
+
rawData --> sourceNode --> dtoStream
|
|
769
|
+
sourcesCfg -. build_source_from_spec .-> registrySources
|
|
770
|
+
contractsCfg -. stream_id + source .-> registryStreamSources
|
|
771
|
+
registrySources -. alias -> Source .-> registryStreamSources
|
|
772
|
+
|
|
773
|
+
subgraph Canonical stream
|
|
774
|
+
mapperEP[mapper ep]
|
|
775
|
+
recordRules[record rules]
|
|
776
|
+
streamRules[stream rules]
|
|
777
|
+
debugRules[debug rules]
|
|
778
|
+
canonical[DTO -> record]
|
|
779
|
+
domainRecords((TemporalRecord))
|
|
780
|
+
recordStage[record xforms]
|
|
781
|
+
featureWrap[record -> feature]
|
|
782
|
+
featureRecords((FeatureRecord))
|
|
783
|
+
regularization[stream xforms]
|
|
784
|
+
end
|
|
785
|
+
|
|
786
|
+
dtoStream --> canonical --> domainRecords --> recordStage --> featureWrap --> featureRecords --> regularization
|
|
787
|
+
contractsCfg --> mapperEP
|
|
788
|
+
mappersPkg -. ep target .-> mapperEP
|
|
789
|
+
mapperEP -. build_mapper_from_spec .-> registryMappers
|
|
790
|
+
registryMappers --> canonical
|
|
791
|
+
contractsCfg --> recordRules
|
|
792
|
+
contractsCfg --> streamRules
|
|
793
|
+
contractsCfg --> debugRules
|
|
794
|
+
registryRecordOps --> recordRules
|
|
795
|
+
registryStreamOps --> streamRules
|
|
796
|
+
registryDebugOps --> debugRules
|
|
797
|
+
recordRules --> recordStage
|
|
798
|
+
streamRules --> regularization
|
|
799
|
+
debugRules --> regularization
|
|
800
|
+
|
|
801
|
+
subgraph Dataset shaping
|
|
802
|
+
featureSpec[feature cfg]
|
|
803
|
+
groupBySpec[group_by]
|
|
804
|
+
streamRefs[record_stream ids]
|
|
805
|
+
featureTrans[feature/seq xforms]
|
|
806
|
+
sequenceStream((seq/features))
|
|
807
|
+
vectorStage[vector assembly]
|
|
808
|
+
vectorSamples((samples))
|
|
809
|
+
end
|
|
810
|
+
|
|
811
|
+
datasetCfg --> featureSpec
|
|
812
|
+
datasetCfg --> groupBySpec
|
|
813
|
+
datasetCfg --> streamRefs
|
|
814
|
+
streamRefs -.->|build_feature_pipeline| registryStreamSources
|
|
815
|
+
registryStreamSources -.->|open_source_stream| sourceNode
|
|
816
|
+
featureRecords --> regularization --> featureTrans --> sequenceStream --> vectorStage --> vectorSamples
|
|
817
|
+
featureSpec -. scale/sequence .-> featureTrans
|
|
818
|
+
groupBySpec -. cadence .-> vectorStage
|
|
819
|
+
|
|
820
|
+
subgraph Postprocess
|
|
821
|
+
vectorTransforms[vector xforms]
|
|
822
|
+
postprocessNode[postprocess]
|
|
823
|
+
end
|
|
824
|
+
|
|
825
|
+
postprocessCfg --> vectorTransforms -. drop/fill .-> postprocessNode
|
|
826
|
+
vectorStage --> postprocessNode
|
|
827
|
+
```
|