PyPI - jerry-thomas - Versions diffs - 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl - Mend

jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (164) hide show

datapipeline/analysis/vector/collector.py +120 -17
datapipeline/analysis/vector/matrix.py +33 -8
datapipeline/analysis/vector/report.py +162 -32
datapipeline/build/tasks/__init__.py +11 -0
datapipeline/build/tasks/config.py +74 -0
datapipeline/build/tasks/metadata.py +170 -0
datapipeline/build/tasks/scaler.py +73 -0
datapipeline/build/tasks/schema.py +60 -0
datapipeline/build/tasks/utils.py +169 -0
datapipeline/cli/app.py +304 -127
datapipeline/cli/commands/build.py +240 -16
datapipeline/cli/commands/contract.py +367 -0
datapipeline/cli/commands/domain.py +8 -3
datapipeline/cli/commands/inspect.py +401 -149
datapipeline/cli/commands/list_.py +30 -7
datapipeline/cli/commands/plugin.py +5 -1
datapipeline/cli/commands/run.py +227 -241
datapipeline/cli/commands/run_config.py +101 -0
datapipeline/cli/commands/serve_pipeline.py +156 -0
datapipeline/cli/commands/source.py +44 -8
datapipeline/cli/visuals/__init__.py +4 -2
datapipeline/cli/visuals/common.py +239 -0
datapipeline/cli/visuals/labels.py +15 -15
datapipeline/cli/visuals/runner.py +66 -0
datapipeline/cli/visuals/sections.py +20 -0
datapipeline/cli/visuals/sources.py +132 -119
datapipeline/cli/visuals/sources_basic.py +260 -0
datapipeline/cli/visuals/sources_off.py +76 -0
datapipeline/cli/visuals/sources_rich.py +414 -0
datapipeline/config/catalog.py +37 -3
datapipeline/config/context.py +214 -0
datapipeline/config/dataset/loader.py +21 -4
datapipeline/config/dataset/normalize.py +4 -4
datapipeline/config/metadata.py +43 -0
datapipeline/config/postprocess.py +2 -2
datapipeline/config/project.py +3 -2
datapipeline/config/resolution.py +129 -0
datapipeline/config/tasks.py +309 -0
datapipeline/config/workspace.py +155 -0
datapipeline/domain/__init__.py +12 -0
datapipeline/domain/record.py +11 -0
datapipeline/domain/sample.py +54 -0
datapipeline/integrations/ml/adapter.py +34 -20
datapipeline/integrations/ml/pandas_support.py +0 -2
datapipeline/integrations/ml/rows.py +1 -6
datapipeline/integrations/ml/torch_support.py +1 -3
datapipeline/io/factory.py +112 -0
datapipeline/io/output.py +132 -0
datapipeline/io/protocols.py +21 -0
datapipeline/io/serializers.py +219 -0
datapipeline/io/sinks/__init__.py +23 -0
datapipeline/io/sinks/base.py +2 -0
datapipeline/io/sinks/files.py +79 -0
datapipeline/io/sinks/rich.py +57 -0
datapipeline/io/sinks/stdout.py +18 -0
datapipeline/io/writers/__init__.py +14 -0
datapipeline/io/writers/base.py +28 -0
datapipeline/io/writers/csv_writer.py +25 -0
datapipeline/io/writers/jsonl.py +52 -0
datapipeline/io/writers/pickle_writer.py +30 -0
datapipeline/pipeline/artifacts.py +58 -0
datapipeline/pipeline/context.py +66 -7
datapipeline/pipeline/observability.py +65 -0
datapipeline/pipeline/pipelines.py +65 -13
datapipeline/pipeline/split.py +11 -10
datapipeline/pipeline/stages.py +127 -16
datapipeline/pipeline/utils/keygen.py +20 -7
datapipeline/pipeline/utils/memory_sort.py +22 -10
datapipeline/pipeline/utils/transform_utils.py +22 -0
datapipeline/runtime.py +5 -2
datapipeline/services/artifacts.py +12 -6
datapipeline/services/bootstrap/config.py +25 -0
datapipeline/services/bootstrap/core.py +52 -37
datapipeline/services/constants.py +6 -5
datapipeline/services/factories.py +123 -1
datapipeline/services/project_paths.py +43 -16
datapipeline/services/runs.py +208 -0
datapipeline/services/scaffold/domain.py +3 -2
datapipeline/services/scaffold/filter.py +3 -2
datapipeline/services/scaffold/mappers.py +9 -6
datapipeline/services/scaffold/plugin.py +54 -10
datapipeline/services/scaffold/source.py +93 -56
datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
datapipeline/sources/decoders.py +83 -18
datapipeline/sources/factory.py +26 -16
datapipeline/sources/models/__init__.py +2 -2
datapipeline/sources/models/generator.py +0 -7
datapipeline/sources/models/loader.py +3 -3
datapipeline/sources/models/parsing_error.py +24 -0
datapipeline/sources/models/source.py +6 -6
datapipeline/sources/synthetic/time/loader.py +14 -2
datapipeline/sources/transports.py +74 -37
datapipeline/templates/plugin_skeleton/README.md +76 -30
datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
datapipeline/templates/stubs/dto.py.j2 +2 -0
datapipeline/templates/stubs/mapper.py.j2 +5 -4
datapipeline/templates/stubs/parser.py.j2 +2 -0
datapipeline/templates/stubs/record.py.j2 +2 -0
datapipeline/templates/stubs/source.yaml.j2 +2 -3
datapipeline/transforms/debug/lint.py +26 -41
datapipeline/transforms/feature/scaler.py +89 -13
datapipeline/transforms/record/floor_time.py +4 -4
datapipeline/transforms/sequence.py +2 -35
datapipeline/transforms/stream/dedupe.py +24 -0
datapipeline/transforms/stream/ensure_ticks.py +7 -6
datapipeline/transforms/vector/__init__.py +5 -0
datapipeline/transforms/vector/common.py +98 -0
datapipeline/transforms/vector/drop/__init__.py +4 -0
datapipeline/transforms/vector/drop/horizontal.py +79 -0
datapipeline/transforms/vector/drop/orchestrator.py +59 -0
datapipeline/transforms/vector/drop/vertical.py +182 -0
datapipeline/transforms/vector/ensure_schema.py +184 -0
datapipeline/transforms/vector/fill.py +87 -0
datapipeline/transforms/vector/replace.py +62 -0
datapipeline/utils/load.py +24 -3
datapipeline/utils/rich_compat.py +38 -0
datapipeline/utils/window.py +76 -0
jerry_thomas-1.0.1.dist-info/METADATA +825 -0
jerry_thomas-1.0.1.dist-info/RECORD +199 -0
{jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
datapipeline/build/tasks.py +0 -186
datapipeline/cli/commands/link.py +0 -128
datapipeline/cli/commands/writers.py +0 -138
datapipeline/config/build.py +0 -64
datapipeline/config/run.py +0 -116
datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
datapipeline/transforms/vector.py +0 -210
jerry_thomas-0.3.0.dist-info/METADATA +0 -502
jerry_thomas-0.3.0.dist-info/RECORD +0 -139
{jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
{jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
{jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0

datapipeline/cli/commands/inspect.py CHANGED Viewed

@@ -1,16 +1,104 @@
 import io
 import json
+import logging
+import sys
 from contextlib import redirect_stdout
 from pathlib import Path
+from typing import Iterable, Iterator, TypeVar
 from datapipeline.analysis.vector.collector import VectorStatsCollector
+from datapipeline.cli.visuals.runner import run_job
+from datapipeline.config.context import load_dataset_context
 from datapipeline.config.dataset.loader import load_dataset
-from datapipeline.services.bootstrap import bootstrap
 from datapipeline.utils.paths import ensure_parent
 from datapipeline.services.bootstrap import artifacts_root
-from datapipeline.pipeline.context import PipelineContext
 from datapipeline.pipeline.pipelines import build_vector_pipeline
 from datapipeline.pipeline.stages import post_process
+from datapipeline.pipeline.artifacts import StageDemand, required_artifacts_for
+from datapipeline.cli.commands.build import run_build_if_needed
+from tqdm import tqdm
+T = TypeVar("T")
+def _prepare_inspect_build(
+    project: str | Path,
+    *,
+    visuals: str | None,
+    progress: str | None,
+    workspace=None,
+) -> None:
+    project_path = Path(project)
+    dataset = load_dataset(project_path, "vectors")
+    demands = [StageDemand(stage=None)]
+    required = required_artifacts_for(dataset, demands)
+    if not required:
+        return
+    run_build_if_needed(
+        project_path,
+        required_artifacts=required,
+        cli_visuals=visuals,
+        cli_progress=progress,
+        workspace=workspace,
+    )
+def _iter_with_progress(
+    iterable: Iterable[T],
+    *,
+    progress_style: str | None,
+    label: str,
+) -> Iterator[T]:
+    style = (progress_style or "auto").lower()
+    if style == "auto":
+        # Default to a light spinner unless DEBUG logging is active.
+        style = "bars" if logging.getLogger().isEnabledFor(logging.DEBUG) else "spinner"
+    if style == "off":
+        yield from iterable
+        return
+    bar_kwargs = {
+        "desc": label,
+        "unit": "vec",
+        "dynamic_ncols": True,
+        "mininterval": 0.2,
+        "leave": False,
+        # Avoid noisy multi-line progress when stderr is not a TTY (e.g., logs)
+        "disable": not sys.stderr.isatty(),
+    }
+    if style == "spinner":
+        bar_kwargs["bar_format"] = "{desc} {n_fmt}{unit}"
+    bar = tqdm(iterable, **bar_kwargs)
+    try:
+        for item in bar:
+            yield item
+    finally:
+        bar.close()
+def _run_inspect_job(
+    project: str,
+    *,
+    visuals: str | None,
+    progress: str | None,
+    log_level: int | None,
+    label: str,
+    section: str,
+    work,
+) -> None:
+    dataset_ctx = load_dataset_context(project)
+    level_value = log_level if log_level is not None else logging.getLogger().getEffectiveLevel()
+    visuals_provider = visuals or "auto"
+    progress_style = progress or "auto"
+    run_job(
+        sections=("inspect", section),
+        label=label,
+        visuals=visuals_provider,
+        progress_style=progress_style,
+        level=level_value,
+        runtime=dataset_ctx.runtime,
+        work=lambda: work(dataset_ctx, progress_style),
+    )
 def report(
@@ -27,7 +115,11 @@ def report(
     quiet: bool = False,
     write_coverage: bool = True,
     apply_postprocess: bool = True,
-    include_targets: bool = False,
+    visuals: str | None = None,
+    progress: str | None = None,
+    log_level: int | None = None,
+    sort: str = "missing",
+    workspace=None,
 ) -> None:
     """Compute a quality report and optionally export coverage JSON and/or a matrix.
@@ -36,92 +128,189 @@ def report(
     - When matrix != 'none', writes an availability matrix in the requested format.
     """
-    project_path = Path(project)
-    dataset = load_dataset(project_path, "vectors")
-    runtime = bootstrap(project_path)
-    context = PipelineContext(runtime)
-    feature_cfgs = list(dataset.features or [])
-    if include_targets:
-        feature_cfgs += list(dataset.targets or [])
-    expected_feature_ids = [cfg.id for cfg in feature_cfgs]
-    # Resolve matrix format and path
-    matrix_fmt = (fmt or matrix) if matrix in {"csv", "html"} else None
-    if matrix_fmt:
-        filename = "matrix.html" if matrix_fmt == "html" else "matrix.csv"
-    else:
-        filename = None
-    base_artifacts = artifacts_root(project_path)
-    matrix_path = None
-    if matrix_fmt:
-        matrix_path = Path(matrix_output) if matrix_output else (base_artifacts / filename)
-    collector = VectorStatsCollector(
-        expected_feature_ids or None,
-        match_partition=match_partition,
-        threshold=threshold,
-        show_matrix=False,
-        matrix_rows=rows,
-        matrix_cols=cols,
-        matrix_output=(str(matrix_path) if matrix_path else None),
-        matrix_format=(matrix_fmt or "html"),
+    _prepare_inspect_build(
+        project,
+        visuals=visuals,
+        progress=progress,
+        workspace=workspace,
     )
+    coverage_path: Path | None = None
-    # When applying transforms, let the global postprocess registry provide them (pass None).
-    # When raw, pass an empty list to bypass registry/defaults.
-    vectors = build_vector_pipeline(context, feature_cfgs, dataset.group_by, stage=None)
-    if apply_postprocess:
-        vectors = post_process(context, vectors)  # use global postprocess
-    for group_key, vector in vectors:
-        collector.update(group_key, vector.values)
-    buffer = io.StringIO()
-    with redirect_stdout(buffer):
-        summary = collector.print_report()
-    if not quiet:
-        report_text = buffer.getvalue()
-        if report_text.strip():
-            print(report_text, end="")
-    # Optionally write coverage summary JSON to a path
-    if write_coverage:
-        output_path = Path(output) if output else (base_artifacts / "coverage.json")
-        ensure_parent(output_path)
+    def _work(dataset_ctx, progress_style):
+        project_path = dataset_ctx.project
+        context = dataset_ctx.pipeline_context
+        dataset = dataset_ctx.dataset
-        feature_stats = summary.get("feature_stats", [])
-        partition_stats = summary.get("partition_stats", [])
-        trimmed = {
-            "total_vectors": summary.get("total_vectors", collector.total_vectors),
-            "empty_vectors": summary.get("empty_vectors", collector.empty_vectors),
-            "threshold": threshold,
-            "match_partition": match_partition,
-            "features": {
-                "keep": summary.get("keep_features", []),
-                "below": summary.get("below_features", []),
-                "coverage": {stat["id"]: stat["coverage"] for stat in feature_stats},
-            },
-            "partitions": {
-                "keep": summary.get("keep_partitions", []),
-                "below": summary.get("below_partitions", []),
-                "keep_suffixes": summary.get("keep_suffixes", []),
-                "below_suffixes": summary.get("below_suffixes", []),
-                "coverage": {stat["id"]: stat["coverage"] for stat in partition_stats},
-            },
-        }
+        feature_cfgs = dataset_ctx.features
+        target_cfgs = dataset_ctx.targets
+        expected_feature_ids = [cfg.id for cfg in feature_cfgs]
-        with output_path.open("w", encoding="utf-8") as fh:
-            json.dump(trimmed, fh, indent=2)
-        print(f"[write] Saved coverage summary to {output_path}")
+        matrix_fmt = (fmt or matrix) if matrix in {"csv", "html"} else None
+        if matrix_fmt:
+            filename = "matrix.html" if matrix_fmt == "html" else "matrix.csv"
+        else:
+            filename = None
+        base_artifacts = artifacts_root(project_path)
+        matrix_path = None
+        if matrix_fmt:
+            matrix_path = Path(matrix_output) if matrix_output else (base_artifacts / filename)
+        schema_entries = dataset_ctx.pipeline_context.load_schema(payload="features")
+        schema_meta = {entry["id"]: entry for entry in (schema_entries or []) if isinstance(entry.get("id"), str)}
+        collector = VectorStatsCollector(
+            expected_feature_ids or None,
+            match_partition=match_partition,
+            schema_meta=schema_meta,
+            threshold=threshold,
+            show_matrix=False,
+            matrix_rows=rows,
+            matrix_cols=cols,
+            matrix_output=(str(matrix_path) if matrix_path else None),
+            matrix_format=(matrix_fmt or "html"),
+        )
+        context.window_bounds(rectangular_required=True)
+        vectors = build_vector_pipeline(
+            context,
+            feature_cfgs,
+            dataset.group_by,
+            target_configs=target_cfgs,
+            rectangular=True,
+        )
+        if apply_postprocess:
+            vectors = post_process(context, vectors)
+        vector_iter = _iter_with_progress(
+            vectors,
+            progress_style=progress_style,
+            label="Processing vectors",
+        )
+        for sample in vector_iter:
+            merged = dict(sample.features.values)
+            if sample.targets:
+                merged.update(sample.targets.values)
+            collector.update(sample.key, merged)
+        buffer = io.StringIO()
+        with redirect_stdout(buffer):
+            summary = collector.print_report(sort_key=sort)
+        if not quiet:
+            report_text = buffer.getvalue()
+            if report_text.strip():
+                print(report_text, end="")
+        if write_coverage:
+            output_path = Path(output) if output else (base_artifacts / "coverage.json")
+            ensure_parent(output_path)
+            feature_stats = summary.get("feature_stats", [])
+            partition_stats = summary.get("partition_stats", [])
+            trimmed = {
+                "total_vectors": summary.get("total_vectors", collector.total_vectors),
+                "empty_vectors": summary.get("empty_vectors", collector.empty_vectors),
+                "threshold": threshold,
+                "match_partition": match_partition,
+                "features": {
+                    "keep": summary.get("keep_features", []),
+                    "below": summary.get("below_features", []),
+                    "coverage": {stat["id"]: stat["coverage"] for stat in feature_stats},
+                    "availability": {
+                        stat["id"]: (
+                            stat["present"] / stat["opportunities"]
+                            if stat.get("opportunities")
+                            else 0
+                        )
+                        for stat in feature_stats
+                    },
+                    "nulls": {stat["id"]: stat.get("nulls", 0) for stat in feature_stats},
+                    "null_rate": {
+                        stat["id"]: (
+                            stat.get("nulls", 0) / stat["opportunities"]
+                            if stat.get("opportunities")
+                            else 0
+                        )
+                        for stat in feature_stats
+                    },
+                    "cadence_nulls": {
+                        stat["id"]: stat.get("cadence_nulls")
+                        for stat in feature_stats
+                        if stat.get("cadence_opportunities")
+                    },
+                    "cadence_opportunities": {
+                        stat["id"]: stat.get("cadence_opportunities")
+                        for stat in feature_stats
+                        if stat.get("cadence_opportunities")
+                    },
+                },
+                "partitions": {
+                    "keep": summary.get("keep_partitions", []),
+                    "below": summary.get("below_partitions", []),
+                    "keep_suffixes": summary.get("keep_suffixes", []),
+                    "below_suffixes": summary.get("below_suffixes", []),
+                    "keep_values": summary.get("keep_partition_values", []),
+                    "below_values": summary.get("below_partition_values", []),
+                    "coverage": {stat["id"]: stat["coverage"] for stat in partition_stats},
+                    "availability": {
+                        stat["id"]: (
+                            stat["present"] / stat["opportunities"]
+                            if stat.get("opportunities")
+                            else 0
+                        )
+                        for stat in partition_stats
+                    },
+                    "nulls": {
+                        stat["id"]: stat.get("nulls", 0) for stat in partition_stats
+                    },
+                    "null_rate": {
+                        stat["id"]: (
+                            stat.get("nulls", 0) / stat["opportunities"]
+                            if stat.get("opportunities")
+                            else 0
+                        )
+                        for stat in partition_stats
+                    },
+                    "cadence_nulls": {
+                        stat["id"]: stat.get("cadence_nulls")
+                        for stat in partition_stats
+                        if stat.get("cadence_opportunities")
+                    },
+                    "cadence_opportunities": {
+                        stat["id"]: stat.get("cadence_opportunities")
+                        for stat in partition_stats
+                        if stat.get("cadence_opportunities")
+                    },
+                },
+            }
+            with output_path.open("w", encoding="utf-8") as fh:
+                json.dump(trimmed, fh, indent=2)
+            print(f"[write] Saved coverage summary to {output_path}")
+            coverage_path = output_path
+    _run_inspect_job(
+        project,
+        visuals=visuals,
+        progress=progress,
+        log_level=log_level,
+        label="Inspect report",
+        section="report",
+        work=_work,
+    )
+    if write_coverage and coverage_path:
+        print(f"[inspect] Coverage summary available at {coverage_path}")
 def partitions(
     project: str,
     *,
     output: str | None = None,
-    include_targets: bool = False,
+    visuals: str | None = None,
+    progress: str | None = None,
+    log_level: int | None = None,
+    workspace=None,
 ) -> None:
     """Discover observed partitions and write a manifest JSON.
@@ -131,90 +320,153 @@ def partitions(
       - by_feature: mapping base id -> list of suffixes (empty when none)
     """
-    project_path = Path(project)
-    dataset = load_dataset(project_path, "vectors")
-    runtime = bootstrap(project_path)
-    feature_cfgs = list(dataset.features or [])
-    if include_targets:
-        feature_cfgs += list(dataset.targets or [])
-    expected_feature_ids = [cfg.id for cfg in feature_cfgs]
-    collector = VectorStatsCollector(
-        expected_feature_ids or None,
-        match_partition="full",
-        threshold=None,
-        show_matrix=False,
+    _prepare_inspect_build(
+        project,
+        visuals=visuals,
+        progress=progress,
+        workspace=workspace,
     )
-    context = PipelineContext(runtime)
-    vectors = build_vector_pipeline(context, feature_cfgs, dataset.group_by, stage=None)
-    vectors = post_process(context, vectors)  # apply global postprocess
-    for group_key, vector in vectors:
-        collector.update(group_key, vector.values)
-    base_artifacts = artifacts_root(project_path)
-    output_path = Path(output) if output else (base_artifacts / "partitions.json")
-    ensure_parent(output_path)
-    parts = sorted(collector.discovered_partitions)
-    features = sorted({pid.split("__", 1)[0] for pid in parts})
-    by_feature: dict[str, list[str]] = {}
-    for pid in parts:
-        if "__" in pid:
-            base, suffix = pid.split("__", 1)
-        else:
-            base, suffix = pid, ""
-        by_feature.setdefault(base, [])
-        if suffix and suffix not in by_feature[base]:
-            by_feature[base].append(suffix)
-    for k in list(by_feature.keys()):
-        by_feature[k] = sorted(by_feature[k])
-    data = {
-        "features": features,
-        "partitions": parts,
-        "by_feature": by_feature,
-    }
+    def _work(dataset_ctx, progress_style):
+        project_path = dataset_ctx.project
-    with output_path.open("w", encoding="utf-8") as fh:
-        json.dump(data, fh, indent=2)
-    print(f"[write] Saved partitions manifest to {output_path}")
+        dataset = dataset_ctx.dataset
+        feature_cfgs = list(dataset.features or [])
+        target_cfgs = list(dataset.targets or [])
+        expected_feature_ids = [cfg.id for cfg in feature_cfgs]
+        base_artifacts = artifacts_root(project_path)
+        output_path = Path(output) if output else (base_artifacts / "partitions.json")
+        collector = VectorStatsCollector(
+            expected_feature_ids or None,
+            match_partition="full",
+            threshold=None,
+            show_matrix=False,
+        )
+        context = dataset_ctx.pipeline_context
+        context.window_bounds(rectangular_required=True)
+        vectors = build_vector_pipeline(
+            context,
+            feature_cfgs,
+            dataset.group_by,
+            target_configs=target_cfgs,
+            rectangular=True,
+        )
+        vectors = post_process(context, vectors)
+        vector_iter = _iter_with_progress(
+            vectors,
+            progress_style=progress_style,
+            label="Processing vectors",
+        )
+        for sample in vector_iter:
+            merged = dict(sample.features.values)
+            if sample.targets:
+                merged.update(sample.targets.values)
+            collector.update(sample.key, merged)
+        ensure_parent(output_path)
+        parts = sorted(collector.discovered_partitions)
+        features = sorted({pid.split("__", 1)[0] for pid in parts})
+        by_feature: dict[str, list[str]] = {}
+        for pid in parts:
+            if "__" in pid:
+                base, suffix = pid.split("__", 1)
+            else:
+                base, suffix = pid, ""
+            by_feature.setdefault(base, [])
+            if suffix and suffix not in by_feature[base]:
+                by_feature[base].append(suffix)
+        for k in list(by_feature.keys()):
+            by_feature[k] = sorted(by_feature[k])
+        data = {
+            "features": features,
+            "partitions": parts,
+            "by_feature": by_feature,
+        }
+        with output_path.open("w", encoding="utf-8") as fh:
+            json.dump(data, fh, indent=2)
+        print(f"[write] Saved partitions manifest to {output_path}")
+    _run_inspect_job(
+        project,
+        visuals=visuals,
+        progress=progress,
+        log_level=log_level,
+        label="Inspect partitions",
+        section="partitions",
+        work=_work,
+    )
 def expected(
     project: str,
     *,
     output: str | None = None,
-    include_targets: bool = False,
+    visuals: str | None = None,
+    progress: str | None = None,
+    log_level: int | None = None,
+    workspace=None,
 ) -> None:
     """Discover complete set of observed full feature IDs and write a list.
     Writes newline-separated ids to `<paths.artifacts>/expected.txt` by default.
     """
-    project_path = Path(project)
-    dataset = load_dataset(project_path, "vectors")
-    runtime = bootstrap(project_path)
-    feature_cfgs = list(dataset.features or [])
-    if include_targets:
-        feature_cfgs += list(dataset.targets or [])
-    context = PipelineContext(runtime)
-    vectors = build_vector_pipeline(context, feature_cfgs, dataset.group_by, stage=None)
-    ids: set[str] = set()
-    for _, vector in vectors:
-        ids.update(vector.values.keys())
+    _prepare_inspect_build(
+        project,
+        visuals=visuals,
+        progress=progress,
+        workspace=workspace,
+    )
-    try:
-        default_path = artifacts_root(project_path) / "expected.txt"
-    except Exception as e:
-        raise RuntimeError(
-            f"{e}. Set `paths.artifacts` in your project.yaml to a writable directory."
+    def _work(dataset_ctx, progress_style):
+        project_path = dataset_ctx.project
+        dataset = dataset_ctx.dataset
+        feature_cfgs = list(dataset.features or [])
+        target_cfgs = list(dataset.targets or [])
+        context = dataset_ctx.pipeline_context
+        vectors = build_vector_pipeline(
+            context,
+            feature_cfgs,
+            dataset.group_by,
+            target_configs=target_cfgs,
+        )
+        vector_iter = _iter_with_progress(
+            vectors,
+            progress_style=progress_style,
+            label="Processing vectors",
         )
-    output_path = Path(output) if output else default_path
-    ensure_parent(output_path)
-    with output_path.open("w", encoding="utf-8") as fh:
-        for fid in sorted(ids):
-            fh.write(f"{fid}\n")
-    print(f"[write] Saved expected feature list to {output_path} ({len(ids)} ids)")
+        ids: set[str] = set()
+        for sample in vector_iter:
+            ids.update(sample.features.values.keys())
+            if sample.targets:
+                ids.update(sample.targets.values.keys())
+        try:
+            default_path = artifacts_root(project_path) / "expected.txt"
+        except Exception as e:
+            raise RuntimeError(
+                f"{e}. Set `paths.artifacts` in your project.yaml to a writable directory."
+            )
+        output_path = Path(output) if output else default_path
+        ensure_parent(output_path)
+        with output_path.open("w", encoding="utf-8") as fh:
+            for fid in sorted(ids):
+                fh.write(f"{fid}\n")
+        print(f"[write] Saved expected feature list to {output_path} ({len(ids)} ids)")
+    _run_inspect_job(
+        project,
+        visuals=visuals,
+        progress=progress,
+        log_level=log_level,
+        label="Inspect expected ids",
+        section="expected",
+        work=_work,
+    )

datapipeline/cli/commands/list_.py CHANGED Viewed

@@ -1,17 +1,40 @@
+from pathlib import Path
 from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
-from datapipeline.services.project_paths import sources_dir as sources_dir_from_project
+from datapipeline.services.bootstrap.core import load_streams
+def _default_project_path(root_dir: Path) -> Path | None:
+    candidate = root_dir / "config" / "project.yaml"
+    if candidate.exists():
+        return candidate
+    default_proj = root_dir / "config" / "datasets" / "default" / "project.yaml"
+    if default_proj.exists():
+        return default_proj
+    datasets_dir = root_dir / "config" / "datasets"
+    if datasets_dir.exists():
+        for p in sorted(datasets_dir.rglob("project.yaml")):
+            if p.is_file():
+                return p
+    return None
 def handle(subcmd: str) -> None:
     root_dir, name, pyproject = pkg_root(None)
     if subcmd == "sources":
         # Discover sources by scanning sources_dir for YAML files
-        proj_path = root_dir / "config" / "project.yaml"
-        sources_dir = sources_dir_from_project(proj_path)
-        if sources_dir.exists():
-            aliases = sorted(p.stem for p in sources_dir.glob("*.y*ml"))
-            for a in aliases:
-                print(a)
+        proj_path = _default_project_path(root_dir)
+        if proj_path is None:
+            print("[error] No project.yaml found under config/.")
+            return
+        try:
+            streams = load_streams(proj_path)
+        except FileNotFoundError as exc:
+            print(f"[error] {exc}")
+            return
+        aliases = sorted(streams.raw.keys())
+        for alias in aliases:
+            print(alias)
     elif subcmd == "domains":
         base = resolve_base_pkg_dir(root_dir, name)
         dom_dir = base / "domains"

datapipeline/cli/commands/plugin.py CHANGED Viewed

@@ -1,10 +1,14 @@
+import logging
 from pathlib import Path
 from datapipeline.services.scaffold.plugin import scaffold_plugin
+logger = logging.getLogger(__name__)
 def bar(subcmd: str, name: str | None, out: str) -> None:
     if subcmd == "init":
         if not name:
-            print("[error] --name is required for bar init")
+            logger.error("Plugin name is required. Use 'jerry plugin init <name>' or pass -n/--name.")
             raise SystemExit(2)
         scaffold_plugin(name, Path(out))

jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl