jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +5 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +54 -10
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +76 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.1.dist-info/METADATA +825 -0
- jerry_thomas-1.0.1.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
```diff
--- a/datapipeline/cli/visuals/sources.py
+++ b/datapipeline/cli/visuals/sources.py
@@ -1,138 +1,151 @@
-from typing import Iterator, Any, Optional
 from contextlib import contextmanager
-from itertools import cycle
 import logging
-import threading
-import time
+import sys
+from typing import Optional, Tuple
 
-from .labels import progress_meta_for_loader
 from datapipeline.runtime import Runtime
-from datapipeline.sources.models.source import Source
-from tqdm import tqdm
-
-
-class VisualSourceProxy(Source):
-    """Proxy wrapping Source.stream() with CLI feedback scaled by logging level."""
-
-    def __init__(self, inner: Source, alias: str, verbosity: int):
-        self._inner = inner
-        self._alias = alias
-        self._verbosity = max(0, min(verbosity, 2))
-
-    @staticmethod
-    def _start_spinner(label: str):
-        """Start a background spinner tqdm progress bar."""
-        bar = tqdm(
-            total=0,
-            desc="",
-            bar_format="{desc}",
-            dynamic_ncols=True,
-            leave=False,
-        )
-        bar.set_description_str(label)
-        bar.refresh()
-
-        stop_event = threading.Event()
-
-        def _spin():
-            frames = cycle((" |", " /", " -", " \\"))
-            while not stop_event.is_set():
-                bar.set_description_str(f"{label}{next(frames)}")
-                bar.refresh()
-                time.sleep(0.1)
-            bar.set_description_str(label)
-            bar.refresh()
-
-        worker = threading.Thread(target=_spin, daemon=True)
-        worker.start()
-        return stop_event, worker, bar
-
-    @staticmethod
-    def _stop_spinner(stop_event, worker, bar):
-        stop_event.set()
-        worker.join()
-        try:
-            bar.close()
-        finally:
-            fp = getattr(bar, "fp", None)
-            try:
-                if getattr(bar, "disable", False):
-                    return
-                if fp and hasattr(fp, "write"):
-                    fp.write("\n")
-                    fp.flush()
-                else:
-                    print()
-            except Exception:
-                pass
 
-
-        try:
-            stop_event, worker, bar = self._start_spinner(label)
-        except Exception:
-            # If spinner setup fails, silently fall back to raw count
-            return self._safe_count()
+logger = logging.getLogger(__name__)
 
-        try:
-            return self._safe_count()
-        finally:
-            self._stop_spinner(stop_event, worker, bar)
 
-
+def _is_tty() -> bool:
+    try:
+        return hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
+    except Exception:
+        return False
+
+
+class VisualsBackend:
+    """Interface for visuals backends.
+
+    - on_build_start/on_job_start return True if the backend handled the headline, False to let caller log it.
+    - wrap_sources returns a contextmanager that enables streaming visuals.
+    """
+
+    def on_build_start(self, path) -> bool:  # Path-like
+        return False
+
+    def on_job_start(self, sections: Tuple[str, ...], label: str, idx: int, total: int) -> bool:
+        return False
+
+    def on_streams_complete(self) -> bool:
+        """Return True if backend surfaced a final completion line visually."""
+        return False
+
+    def requires_logging_redirect(self) -> bool:
+        """Return True when console logging should be routed via tqdm."""
+        return True
+
+    def wrap_sources(self, runtime: Runtime, log_level: int, progress_style: str):  # contextmanager
+        @contextmanager
+        def _noop():
+            yield
+
+        return _noop()
+
+
+class _BasicBackend(VisualsBackend):
+    def wrap_sources(self, runtime: Runtime, log_level: int, progress_style: str):
+        from .sources_basic import visual_sources as basic
+        return basic(runtime, log_level, progress_style)
+
+
+class _RichBackend(VisualsBackend):
+    def _render_sections(self, console, sections: tuple[str, ...]) -> None:
+        if not sections:
+            return
+        from rich.rule import Rule as _Rule
+        console.print(_Rule(sections[0].title(), style="bold white"))
+        if len(sections) > 1:
+            for level, name in enumerate(sections[1:], start=1):
+                indent = "  " * level
+                console.print(f"{indent}[cyan]{name}[/cyan]")
+        console.print()
+
+    def on_job_start(self, sections: tuple[str, ...], label: str, idx: int, total: int) -> bool:
         try:
-
+            from rich.console import Console as _Console
+            import sys as _sys
+            console = _Console(file=_sys.stderr, markup=True)
+            self._render_sections(console, sections)
+            indent = "  " * max(len(sections), 1)
+            console.print(f"{indent}── {label} ({idx}/{total}) ──")
+            console.print()
+            return True
         except Exception:
-            return
-
-    def stream(self) -> Iterator[Any]:
-        desc, unit = progress_meta_for_loader(self._inner.loader)
-        progress_desc = f"{desc} [{self._alias}]"
-        label = f"Preparing data stream for [{self._alias}]"
-
-        if self._verbosity >= 2:
-            total = self._count_with_indicator(label)
-            yield from tqdm(
-                self._inner.stream(),
-                total=total,
-                desc=progress_desc,
-                unit=unit,
-                dynamic_ncols=True,
-                mininterval=0.0,
-                miniters=1,
-                leave=True,
-            )
-            return
+            return False
 
+    def on_build_start(self, path) -> bool:
         try:
-
+            from rich.console import Console as _Console
+            from rich.rule import Rule as _Rule
+            import sys as _sys
+            from pathlib import Path as _Path
+            import os as _os
+            console = _Console(file=_sys.stderr, markup=True)
+            console.print(_Rule("Info", style="bold white"))
+            # Subheader with compact path to project.yaml
+            p = _Path(path)
+            try:
+                cwd = _Path(_os.getcwd())
+                rel = p.relative_to(cwd)
+                parts = [part for part in rel.as_posix().split("/") if part]
+            except Exception:
+                parts = [part for part in p.as_posix().split("/") if part]
+            if len(parts) > 3:
+                parts = ["..."] + parts[-3:]
+            compact = "/".join(parts) if parts else p.name
+            console.print(f"[cyan]project:[/cyan] {compact}")
+            console.print()  # spacer
+            return True
         except Exception:
-
-            yield from self._inner.stream()
-            return
+            return False
 
-
-
-
-        finally:
-            self._stop_spinner(stop_event, worker, bar)
+    def wrap_sources(self, runtime: Runtime, log_level: int, progress_style: str):
+        from .sources_rich import visual_sources as rich_vs
+        return rich_vs(runtime, log_level, progress_style)
 
+    def on_streams_complete(self) -> bool:
+        # Rich backend manages its own persistent final line; signal handled
+        return True
 
-
-
-
-
-
-
+    def requires_logging_redirect(self) -> bool:
+        return False
+
+
+class _OffBackend(VisualsBackend):
+    def requires_logging_redirect(self) -> bool:
+        return False
 
-
+    def wrap_sources(self, runtime: Runtime, log_level: int, progress_style: str):
+        from .sources_off import visual_sources as off_vs
+        return off_vs(runtime, log_level, progress_style)
 
-
-
+
+def _rich_available() -> bool:
     try:
-
-
+        import rich  # noqa: F401
+        return True
+    except Exception:
+        return False
+
+
+def get_visuals_backend(provider: Optional[str]) -> VisualsBackend:
+    mode = (provider or "auto").lower()
+    if mode == "off":
+        return _OffBackend()
+    if mode == "tqdm":
+        return _BasicBackend()
+    if mode == "rich":
+        return _RichBackend() if _rich_available() else _BasicBackend()
+    # auto
+    if _rich_available() and _is_tty():
+        return _RichBackend()
+    return _BasicBackend()
+
+
+@contextmanager
+def visual_sources(runtime: Runtime, log_level: int, provider: Optional[str] = None, progress_style: str = "auto"):
+    backend = get_visuals_backend(provider)
+    with backend.wrap_sources(runtime, log_level, progress_style):
         yield
-    finally:
-        # Restore original sources
-        for alias, src in originals.items():
-            reg.register(alias, src)
```
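The new `get_visuals_backend` resolves the provider in a fixed order: an explicit `off` or `tqdm` wins outright, `rich` falls back to tqdm when the `rich` package is not importable, and `auto` picks Rich only when `rich` is available and stdout is a TTY. A minimal sketch of how a caller might drive this (the import path is inferred from the hunk above, which appears to be `datapipeline/cli/visuals/sources.py`; the package may re-export these names elsewhere, and the surrounding glue is illustrative, not taken from the package):

```python
import logging

# Assumed import path; only get_visuals_backend/visual_sources appear in the diff.
from datapipeline.cli.visuals.sources import get_visuals_backend, visual_sources

def stream_with_visuals(runtime, provider: str | None = None) -> None:
    backend = get_visuals_backend(provider)  # "off" | "tqdm" | "rich" | None -> "auto"
    if backend.requires_logging_redirect():
        # tqdm-based backends ask the caller to route console logging through
        # tqdm so log lines don't tear active progress bars.
        pass  # e.g. install a tqdm-aware logging handler here (illustrative)
    with visual_sources(runtime, logging.INFO, provider=provider):
        ...  # consume runtime.registries.stream_sources; visual proxies are active here
```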
```diff
--- /dev/null
+++ b/datapipeline/cli/visuals/sources_basic.py
@@ -0,0 +1,260 @@
+from typing import Iterator, Any, Optional
+from contextlib import contextmanager
+from itertools import cycle
+from pathlib import Path
+import logging
+import os
+import threading
+import time
+
+from .labels import progress_meta_for_loader
+from datapipeline.runtime import Runtime
+from datapipeline.sources.models.source import Source
+from datapipeline.sources.transports import FsGlobTransport
+from tqdm import tqdm
+from .common import (
+    compute_glob_root,
+    current_transport_label,
+    log_combined_stream,
+    log_transport_details,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class VisualSourceProxy(Source):
+    """Proxy wrapping Source.stream() with CLI feedback scaled by logging level."""
+
+    def __init__(self, inner: Source, alias: str, verbosity: int):
+        self._inner = inner
+        self._alias = alias
+        self._verbosity = max(0, min(verbosity, 2))
+
+    @staticmethod
+    def _start_spinner(label: str):
+        """Start a background spinner tqdm progress bar."""
+        bar = tqdm(
+            total=0,
+            desc="",
+            bar_format="{desc}",
+            dynamic_ncols=True,
+            leave=False,
+        )
+        state = {"base": label}
+        bar.set_description_str(label)
+        bar.refresh()
+
+        stop_event = threading.Event()
+
+        def _spin():
+            frames = cycle((" |", " /", " -", " \\"))
+            while not stop_event.is_set():
+                bar.set_description_str(f"{state['base']}{next(frames)}")
+                bar.refresh()
+                time.sleep(0.1)
+            bar.set_description_str(state["base"])
+            bar.refresh()
+
+        worker = threading.Thread(target=_spin, daemon=True)
+        worker.start()
+        return state, stop_event, worker, bar
+
+    @staticmethod
+    def _stop_spinner(stop_event, worker, bar):
+        stop_event.set()
+        worker.join()
+        try:
+            bar.close()
+        finally:
+            fp = getattr(bar, "fp", None)
+            try:
+                if getattr(bar, "disable", False):
+                    return
+                if fp and hasattr(fp, "write"):
+                    fp.write("\n")
+                    fp.flush()
+                else:
+                    print()
+            except Exception:
+                pass
+
+    def _count_with_indicator(self, label: str) -> Optional[int]:
+        try:
+            _, stop_event, worker, bar = self._start_spinner(label)
+        except Exception:
+            # If spinner setup fails, silently fall back to raw count
+            return self._safe_count()
+
+        try:
+            return self._safe_count()
+        finally:
+            self._stop_spinner(stop_event, worker, bar)
+
+    def _safe_count(self) -> Optional[int]:
+        try:
+            return self._inner.count()
+        except Exception:
+            return None
+
+    def _log_source_details(self, transport, root: Optional[Path]) -> None:
+        # Use visuals-agnostic helper so behavior matches rich/basic
+        log_transport_details(transport, self._alias)
+
+    def stream(self) -> Iterator[Any]:
+        loader = getattr(self._inner, "loader", None)
+        desc, unit = progress_meta_for_loader(loader)
+        prefix, sep, suffix = desc.partition(": ")
+        header = f"{prefix}:" if sep else desc
+        tail = suffix if sep else None
+        label = f"[{self._alias}] Preparing data stream"
+
+        transport = getattr(loader, "transport", None)
+
+        glob_root: Optional[Path] = None
+        if isinstance(transport, FsGlobTransport):
+            glob_root = compute_glob_root(transport.files)
+
+        last_path_label: Optional[str] = None
+
+        def compose_desc(name: Optional[str]) -> str:
+            if name:
+                base = header if sep else desc
+                return f"[{self._alias}] {base} {name}".rstrip()
+            if tail:
+                return f"[{self._alias}] {header} {tail}".rstrip()
+            return f"[{self._alias}] {desc}"
+
+        def maybe_update_label(apply_label):
+            nonlocal last_path_label
+            current_label = current_transport_label(transport, glob_root=glob_root)
+            if not current_label or current_label == last_path_label:
+                return
+            last_path_label = current_label
+            apply_label(current_label)
+
+        emitted = 0
+        if self._verbosity >= 2:
+            total = self._count_with_indicator(label)
+
+            bar = tqdm(
+                total=total,
+                desc=compose_desc(None),
+                unit=unit,
+                dynamic_ncols=True,
+                mininterval=0.0,
+                miniters=1,
+                leave=True,
+            )
+
+            started = False
+
+            def update_progress(name: str) -> None:
+                bar.set_description_str(compose_desc(name))
+                bar.refresh()
+
+            try:
+                for item in self._inner.stream():
+                    if not started:
+                        # Emit transport details on first item for correct ordering (DEBUG verbosity)
+                        self._log_source_details(transport, glob_root)
+                        started = True
+                    maybe_update_label(update_progress)
+                    bar.update()
+                    emitted += 1
+                    yield item
+            finally:
+                bar.close()
+                if logger.isEnabledFor(logging.INFO):
+                    try:
+                        unit_label = f" {unit}" if unit else ""
+                        logger.info("[%s] Stream complete (%d%s) ✔",
+                                    self._alias, emitted, unit_label)
+                    except Exception:
+                        pass
+            return
+
+        try:
+            state, stop_event, worker, bar = self._start_spinner(
+                compose_desc(None))
+        except Exception:
+            # Spinner isn't critical; fall back to raw stream
+            yield from self._inner.stream()
+            return
+
+        def update_spinner(name: str) -> None:
+            state["base"] = compose_desc(name)
+            bar.set_description_str(state["base"])
+            bar.refresh()
+
+        started = False
+        try:
+            for item in self._inner.stream():
+                if not started:
+                    # Emit transport details at the start for correct grouping
+                    self._log_source_details(transport, glob_root)
+                    started = True
+                maybe_update_label(update_spinner)
+                emitted += 1
+                yield item
+        finally:
+            self._stop_spinner(stop_event, worker, bar)
+            if logger.isEnabledFor(logging.INFO):
+                try:
+                    unit_label = f" {unit}" if unit else ""
+                    logger.info("[%s] Stream complete (%d%s) ✔",
+                                self._alias, emitted, unit_label)
+                except Exception:
+                    pass
+
+
+def _style_mode(progress_style: str, log_level: int | None) -> str:
+    mode = (progress_style or "auto").lower()
+    if mode == "auto":
+        level = log_level if log_level is not None else logging.INFO
+        return "bars" if level <= logging.DEBUG else "spinner"
+    return mode
+
+
+@contextmanager
+def visual_sources(runtime: Runtime, log_level: int | None, progress_style: str = "auto"):
+    """Temporarily wrap stream sources with logging-level-driven feedback."""
+    level = log_level if log_level is not None else logging.INFO
+    style_mode = _style_mode(progress_style, log_level)
+    if style_mode == "off" or level > logging.INFO:
+        yield
+        return
+
+    verbosity = 2 if style_mode == "bars" else 1
+
+    reg = runtime.registries.stream_sources
+    originals = dict(reg.items())
+    try:
+        # Lightweight proxy that only prints a composed header when actually streamed
+        class _ComposedHeaderProxy:
+            def __init__(self, inner, alias: str):
+                self._inner = inner
+                self._alias = alias
+
+            def stream(self):  # Iterator[Any]
+                detail_entries: Optional[list[str]] = None
+                try:
+                    spec = getattr(self._inner, "_spec", None)
+                    inputs = getattr(spec, "inputs", None)
+                    if isinstance(inputs, (list, tuple)) and inputs:
+                        detail_entries = [str(item) for item in inputs]
+                except Exception:
+                    detail_entries = None
+                log_combined_stream(self._alias, detail_entries)
+                yield from self._inner.stream()
+
+        for alias, src in originals.items():
+            # Wrap composed/virtual sources with a header-only proxy; others with visuals
+            if getattr(src, "loader", None) is None:
+                reg.register(alias, _ComposedHeaderProxy(src, alias))
+            else:
+                reg.register(alias, VisualSourceProxy(src, alias, verbosity))
+        yield
+    finally:
+        # Restore original sources
+        for alias, src in originals.items():
+            reg.register(alias, src)
```
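One detail worth noting in `_start_spinner`: the spinner thread now reads its text through a shared `state` dict rather than closing over `label`, which is what lets `update_spinner` retitle a spinner that is already running. A standalone sketch of that pattern (plain `print` instead of tqdm for brevity; not the package's code):

```python
import threading
import time
from itertools import cycle

def start_spinner(label: str):
    # The worker reads the label via a shared dict, so callers can mutate it live.
    state = {"base": label}
    stop = threading.Event()

    def spin():
        frames = cycle((" |", " /", " -", " \\"))
        while not stop.is_set():
            print(f"\r{state['base']}{next(frames)}", end="", flush=True)
            time.sleep(0.1)

    worker = threading.Thread(target=spin, daemon=True)
    worker.start()
    return state, stop, worker

state, stop, worker = start_spinner("loading")
state["base"] = "loading: part-002.csv"  # retarget the live spinner
time.sleep(0.3)
stop.set()
worker.join()
print()
```

Rebinding a closed-over string from outside the thread would have no effect; mutating a shared mapping does, which is why the proxy returns `state` alongside the stop event.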
```diff
--- /dev/null
+++ b/datapipeline/cli/visuals/sources_off.py
@@ -0,0 +1,76 @@
+from contextlib import contextmanager
+from typing import Iterator, Any, Optional
+import logging
+
+from datapipeline.runtime import Runtime
+from datapipeline.sources.models.source import Source
+
+from .labels import progress_meta_for_loader
+from .common import log_transport_details, log_combined_stream
+
+logger = logging.getLogger(__name__)
+
+
+class _OffSourceProxy(Source):
+    def __init__(self, inner: Source, alias: str):
+        self._inner = inner
+        self._alias = alias
+
+    def stream(self) -> Iterator[Any]:
+        loader = getattr(self._inner, "loader", None)
+        transport = getattr(loader, "transport", None)
+        _, unit = progress_meta_for_loader(loader)
+        emitted = 0
+        started = False
+        try:
+            for item in self._inner.stream():
+                if not started:
+                    try:
+                        log_transport_details(transport, self._alias)
+                    except Exception:
+                        pass
+                    started = True
+                emitted += 1
+                yield item
+        finally:
+            if logger.isEnabledFor(logging.INFO):
+                unit_label = f" {unit}" if unit else ""
+                logger.info("[%s] Stream complete (%d%s) ✔", self._alias, emitted, unit_label)
+
+
+@contextmanager
+def visual_sources(runtime: Runtime, log_level: int | None, progress_style: str = "auto"):
+    if log_level is None or log_level > logging.INFO:
+        yield
+        return
+
+    reg = runtime.registries.stream_sources
+    originals = dict(reg.items())
+
+    try:
+        class _ComposedHeaderProxy:
+            def __init__(self, inner, alias: str):
+                self._inner = inner
+                self._alias = alias
+
+            def stream(self):
+                detail_entries: Optional[list[str]] = None
+                try:
+                    spec = getattr(self._inner, "_spec", None)
+                    inputs = getattr(spec, "inputs", None)
+                    if isinstance(inputs, (list, tuple)) and inputs:
+                        detail_entries = [str(item) for item in inputs]
+                except Exception:
+                    detail_entries = None
+                log_combined_stream(self._alias, detail_entries)
+                yield from self._inner.stream()
+
+        for alias, src in originals.items():
+            if getattr(src, "loader", None) is None:
+                reg.register(alias, _ComposedHeaderProxy(src, alias))
+            else:
+                reg.register(alias, _OffSourceProxy(src, alias))
+        yield
+    finally:
+        for alias, src in originals.items():
+            reg.register(alias, src)
```
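All three `visual_sources` variants (rich, basic, off) share the same swap-and-restore discipline: snapshot the registry, register proxies under the same aliases, and unconditionally re-register the originals in `finally`. Distilled to its core (a generic sketch, assuming only the `items()`/`register()` interface the runtime registries expose in the diffs above):

```python
from contextlib import contextmanager

@contextmanager
def swapped(registry, wrap):
    """Temporarily replace every registered source with wrap(alias, source)."""
    originals = dict(registry.items())  # snapshot before mutating
    try:
        for alias, src in originals.items():
            registry.register(alias, wrap(alias, src))
        yield
    finally:
        # Always restore, even if a consumer abandons a stream mid-iteration.
        for alias, src in originals.items():
            registry.register(alias, src)
```

Because the restore runs in `finally`, the registry is left intact whether the wrapped block completes, raises, or exits early, which is what makes it safe for the proxies to be purely cosmetic.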