jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects only the changes between the two released versions.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +1 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +3 -3
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +74 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.0.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.0.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/datapipeline/config/workspace.py
@@ -0,0 +1,155 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field, field_validator
+
+from datapipeline.config.tasks import (
+    VALID_PROGRESS_STYLES,
+    VALID_VISUAL_PROVIDERS,
+)
+from datapipeline.utils.load import load_yaml
+
+
+class SharedDefaults(BaseModel):
+    visuals: Optional[str] = Field(
+        default=None, description="AUTO | TQDM | RICH | OFF"
+    )
+    progress: Optional[str] = Field(
+        default=None, description="AUTO | SPINNER | BARS | OFF"
+    )
+    log_level: Optional[str] = Field(default=None, description="DEFAULT LOG LEVEL")
+
+    @field_validator("visuals", "progress", "log_level", mode="before")
+    @classmethod
+    def _normalize(cls, value: object):
+        if value is None:
+            return None
+        if isinstance(value, str):
+            text = value.strip()
+            return text if text else None
+        return value
+
+    @field_validator("visuals", mode="before")
+    @classmethod
+    def _normalize_visuals(cls, value):
+        if value is None:
+            return None
+        if isinstance(value, bool):
+            return "OFF" if value is False else "AUTO"
+        name = str(value).upper()
+        if name not in VALID_VISUAL_PROVIDERS:
+            raise ValueError(
+                f"visuals must be one of {', '.join(VALID_VISUAL_PROVIDERS)}, got {value!r}"
+            )
+        return name
+
+    @field_validator("progress", mode="before")
+    @classmethod
+    def _normalize_progress(cls, value):
+        if value is None:
+            return None
+        if isinstance(value, bool):
+            return "OFF" if value is False else "AUTO"
+        name = str(value).upper()
+        if name not in VALID_PROGRESS_STYLES:
+            raise ValueError(
+                f"progress must be one of {', '.join(VALID_PROGRESS_STYLES)}, got {value!r}"
+            )
+        return name
+
+
+class ServeDefaults(BaseModel):
+    log_level: Optional[str] = None
+    limit: Optional[int] = None
+    stage: Optional[int] = None
+    throttle_ms: Optional[float] = None
+
+    class OutputDefaults(BaseModel):
+        transport: str
+        format: str
+        payload: str = Field(default="sample")
+        directory: Optional[str] = Field(
+            default=None,
+            description="Base directory for fs outputs (relative paths are resolved from jerry.yaml).",
+        )
+
+    output: Optional[OutputDefaults] = None
+
+
+class BuildDefaults(BaseModel):
+    log_level: Optional[str] = None
+    mode: Optional[str] = None
+
+    @field_validator("mode", mode="before")
+    @classmethod
+    def _normalize_mode(cls, value: object):
+        if value is None:
+            return None
+        if isinstance(value, bool):
+            return "OFF" if value is False else "AUTO"
+        text = str(value).strip()
+        if not text:
+            return None
+        name = text.upper()
+        valid_modes = {"AUTO", "FORCE", "OFF"}
+        if name not in valid_modes:
+            options = ", ".join(sorted(valid_modes))
+            raise ValueError(f"build.mode must be one of {options}, got {value!r}")
+        return name
+
+
+class WorkspaceConfig(BaseModel):
+    plugin_root: Optional[str] = None
+    datasets: dict[str, str] = Field(
+        default_factory=dict,
+        description="Named dataset aliases mapping to project.yaml paths (relative to jerry.yaml).",
+    )
+    default_dataset: Optional[str] = Field(
+        default=None,
+        description="Optional default dataset alias when --dataset/--project are omitted.",
+    )
+    shared: SharedDefaults = Field(default_factory=SharedDefaults)
+    serve: ServeDefaults = Field(default_factory=ServeDefaults)
+    build: BuildDefaults = Field(default_factory=BuildDefaults)
+
+
+@dataclass
+class WorkspaceContext:
+    file_path: Path
+    config: WorkspaceConfig
+
+    @property
+    def root(self) -> Path:
+        return self.file_path.parent
+
+    def resolve_plugin_root(self) -> Optional[Path]:
+        raw = self.config.plugin_root
+        if not raw:
+            return None
+        candidate = Path(raw)
+        return (
+            candidate.resolve()
+            if candidate.is_absolute()
+            else (self.root / candidate).resolve()
+        )
+
+
+def load_workspace_context(start_dir: Optional[Path] = None) -> Optional[WorkspaceContext]:
+    """Search from start_dir upward for jerry.yaml and return parsed config."""
+    directory = (start_dir or Path.cwd()).resolve()
+    for path in [directory, *directory.parents]:
+        candidate = path / "jerry.yaml"
+        if candidate.is_file():
+            data = load_yaml(candidate)
+            if not isinstance(data, dict):
+                raise TypeError("jerry.yaml must define a mapping at the top level")
+            # Allow users to set serve/build/shared to null to fall back to defaults
+            for key in ("shared", "serve", "build"):
+                if key in data and data[key] is None:
+                    data.pop(key)
+            cfg = WorkspaceConfig.model_validate(data)
+            return WorkspaceContext(file_path=candidate, config=cfg)
+    return None
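
For orientation, a minimal usage sketch of the workspace loader added above. It assumes the module path mirrors the file layout (datapipeline.config.workspace); the project directory is a hypothetical placeholder.

from pathlib import Path

from datapipeline.config.workspace import load_workspace_context

# Walks upward from the starting directory until a jerry.yaml is found.
ctx = load_workspace_context(Path("~/projects/my-plugin").expanduser())  # hypothetical path
if ctx is not None:
    print(ctx.root)                    # directory that contains jerry.yaml
    print(ctx.resolve_plugin_root())   # absolute plugin root, or None
    print(ctx.config.default_dataset)  # optional dataset alias
    print(ctx.config.shared.visuals)   # normalized to AUTO | TQDM | RICH | OFF, or None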
--- /dev/null
+++ b/datapipeline/domain/__init__.py
@@ -0,0 +1,12 @@
+from .sample import Sample
+from .vector import Vector
+from .feature import FeatureRecord, FeatureRecordSequence
+from .record import TemporalRecord
+
+__all__ = [
+    "Sample",
+    "Vector",
+    "FeatureRecord",
+    "FeatureRecordSequence",
+    "TemporalRecord",
+]
--- a/datapipeline/domain/record.py
+++ b/datapipeline/domain/record.py
@@ -26,3 +26,14 @@ class TemporalRecord(Record):
         data.pop("time", None)
         data.pop("value", None)
         return data
+
+    def __eq__(self, other: object) -> bool:
+        if self is other:
+            return True
+        if not isinstance(other, TemporalRecord):
+            return NotImplemented
+        return (
+            self.time == other.time
+            and self.value == other.value
+            and self._identity_fields() == other._identity_fields()
+        )
--- /dev/null
+++ b/datapipeline/domain/sample.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, asdict
+from typing import Any, Iterator, Optional, Literal
+
+from .vector import Vector
+
+PayloadMode = Literal["sample", "vector"]
+
+
+@dataclass
+class Sample:
+    """
+    Represents a single grouped vector sample emitted by the pipeline.
+
+    Attributes:
+        key: Group identifier (tuple when group_by cadence > 1).
+        features: Feature vector payload.
+        targets: Optional target vector when requested.
+    """
+
+    key: Any
+    features: Vector
+    targets: Optional[Vector] = None
+
+    def __iter__(self) -> Iterator[Any]:
+        """Retain tuple-like unpacking compatibility."""
+        yield self.key
+        yield self.features
+
+    def __len__(self) -> int:
+        return 2
+
+    def __getitem__(self, idx: int) -> Any:
+        if idx == 0:
+            return self.key
+        if idx == 1:
+            return self.features
+        raise IndexError(idx)
+
+    def with_targets(self, targets: Optional[Vector]) -> "Sample":
+        return Sample(key=self.key, features=self.features, targets=targets)
+
+    def with_features(self, features: Vector) -> "Sample":
+        return Sample(key=self.key, features=features, targets=self.targets)
+
+    def as_full_payload(self) -> dict[str, Any]:
+        return asdict(self)
+
+    def as_vector_payload(self) -> dict[str, Any]:
+        data: dict[str, Any] = {"features": list(self.features.values.values())}
+        if self.targets is not None:
+            data["targets"] = list(self.targets.values.values())
+        return data
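
A short sketch of Sample's tuple-compatibility surface. Dataclass fields are not type-checked at runtime, so plain values stand in for Vector here purely for illustration.

from datapipeline.domain.sample import Sample

sample = Sample(key=("2024-01-01",), features={"f1": 1.0}, targets=None)  # illustrative stand-ins

key, features = sample                  # __iter__ keeps the old (key, features) unpacking
assert sample[0] is key and sample[1] is features
assert len(sample) == 2
full = sample.as_full_payload()         # {"key": ..., "features": ..., "targets": None}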
--- a/datapipeline/integrations/ml/adapter.py
+++ b/datapipeline/integrations/ml/adapter.py
@@ -8,6 +8,7 @@ from typing import Any, Literal
 
 from datapipeline.config.dataset.dataset import FeatureDatasetConfig
 from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.domain.sample import Sample
 from datapipeline.domain.vector import Vector
 from datapipeline.pipeline.context import PipelineContext
 from datapipeline.pipeline.pipelines import build_vector_pipeline
@@ -69,22 +70,21 @@
         self,
         *,
         limit: int | None = None,
-        include_targets: bool = False,
     ) -> Iterator[tuple[Sequence[Any], Vector]]:
         features = list(_ensure_features(self.dataset))
-        if include_targets:
-            try:
-                features += list(getattr(self.dataset, "targets", []) or [])
-            except Exception:
-                pass
+        target_cfgs = list(getattr(self.dataset, "targets", []) or [])
         context = PipelineContext(self.runtime)
         vectors = build_vector_pipeline(
-            context, features, self.dataset.group_by, stage=None
+            context,
+            features,
+            self.dataset.group_by,
+            target_configs=target_cfgs,
         )
-        stream = post_process(context, vectors)
+        base_stream = post_process(context, vectors)
+        sample_iter = base_stream
         if limit is not None:
-            stream = islice(stream, limit)
-        return stream
+            sample_iter = islice(sample_iter, limit)
+        return ((sample.key, sample.features) for sample in sample_iter)
 
     def iter_rows(
         self,
@@ -94,24 +94,38 @@
         group_format: GroupFormat = "mapping",
         group_column: str = "group",
         flatten_sequences: bool = False,
-        include_targets: bool = False,
     ) -> Iterator[dict[str, Any]]:
-        stream = self.stream(limit=limit, include_targets=include_targets)
+        features = list(_ensure_features(self.dataset))
+        target_cfgs = list(getattr(self.dataset, "targets", []) or [])
+        context = PipelineContext(self.runtime)
+        vectors = build_vector_pipeline(
+            context,
+            features,
+            self.dataset.group_by,
+            target_configs=target_cfgs,
+        )
+        base_stream = post_process(context, vectors)
+        if limit is not None:
+            base_stream = islice(base_stream, limit)
         group_by = self.dataset.group_by
 
         def _rows() -> Iterator[dict[str, Any]]:
-            for group_key, vector in stream:
+            for sample in base_stream:
                 row: dict[str, Any] = {}
                 if include_group:
                     row[group_column] = _normalize_group(
-                        group_key, group_by, group_format
+                        sample.key, group_by, group_format
                     )
-                for feature_id, value in vector.values.items():
-                    if flatten_sequences and isinstance(value, list):
-                        for idx, item in enumerate(value):
-                            row[f"{feature_id}[{idx}]"] = item
-                    else:
-                        row[feature_id] = value
+                vectors = [sample.features]
+                if sample.targets:
+                    vectors.append(sample.targets)
+                for vector in vectors:
+                    for feature_id, value in vector.values.items():
+                        if flatten_sequences and isinstance(value, list):
+                            for idx, item in enumerate(value):
+                                row[f"{feature_id}[{idx}]"] = item
+                        else:
+                            row[feature_id] = value
                 yield row
 
         return _rows()
--- a/datapipeline/integrations/ml/pandas_support.py
+++ b/datapipeline/integrations/ml/pandas_support.py
@@ -15,7 +15,6 @@ def dataframe_from_vectors(
     group_format: GroupFormat = "mapping",
     group_column: str = "group",
     flatten_sequences: bool = False,
-    include_targets: bool = False,
     open_stream: Callable[[str], Iterable[Any]] | None = None,
 ):
     """Return a Pandas DataFrame built from project vectors.
@@ -37,7 +36,6 @@
         group_format=group_format,
         group_column=group_column,
         flatten_sequences=flatten_sequences,
-        include_targets=include_targets,
         open_stream=open_stream,
     )
     return pd.DataFrame(rows)
--- a/datapipeline/integrations/ml/rows.py
+++ b/datapipeline/integrations/ml/rows.py
@@ -13,13 +13,12 @@
     project_yaml: str | Path,
     *,
     limit: int | None = None,
-    include_targets: bool = False,
 ) -> Iterator[tuple[Sequence[Any], Vector]]:
     """Yield ``(group_key, Vector)`` pairs for the configured project."""
 
     adapter = VectorAdapter.from_project(project_yaml)
     try:
-        return adapter.stream(limit=limit, include_targets=include_targets)
+        return adapter.stream(limit=limit)
     except ValueError:
         return iter(())
 
@@ -32,7 +31,6 @@
     group_format: GroupFormat = "mapping",
     group_column: str = "group",
     flatten_sequences: bool = False,
-    include_targets: bool = False,
 ) -> Iterator[dict[str, Any]]:
     """Return an iterator of row dictionaries derived from vectors."""
 
@@ -44,7 +42,6 @@
             group_format=group_format,
             group_column=group_column,
             flatten_sequences=flatten_sequences,
-            include_targets=include_targets,
         )
     except ValueError:
         return iter(())
@@ -58,7 +55,6 @@
     group_format: GroupFormat = "mapping",
     group_column: str = "group",
     flatten_sequences: bool = False,
-    include_targets: bool = False,
     open_stream=None,
 ) -> list[dict[str, Any]]:
     """Materialize :func:`iter_vector_rows` into a list for eager workflows."""
@@ -70,7 +66,6 @@
         group_format=group_format,
         group_column=group_column,
         flatten_sequences=flatten_sequences,
-        include_targets=include_targets,
     )
     return list(iterator)
 
--- a/datapipeline/integrations/ml/torch_support.py
+++ b/datapipeline/integrations/ml/torch_support.py
@@ -35,7 +35,6 @@ def torch_dataset(
     dtype: Any | None = None,
     device: Any | None = None,
     flatten_sequences: bool = False,
-    include_targets: bool = False,
 ):
     """Build a torch.utils.data.Dataset that yields tensors from vectors."""
 
@@ -52,10 +51,9 @@
         limit=limit,
         include_group=False,
         flatten_sequences=flatten_sequences,
-        include_targets=include_targets,
     )
 
-    if include_targets and target_columns is None:
+    if target_columns is None:
         try:
             ds = load_dataset(Path(project_yaml), "vectors")
             target_columns = [cfg.id for cfg in getattr(ds, "targets", []) or []]
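
A hedged sketch of the updated integration helper call: targets now always flow through the pipeline, so no include_targets flag is passed. The project.yaml path is a placeholder.

from datapipeline.integrations.ml.rows import stream_vectors

# Yields (group_key, Vector) pairs for the configured project.
for group_key, vector in stream_vectors("project.yaml", limit=5):  # placeholder path
    print(group_key, dict(vector.values))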
--- /dev/null
+++ b/datapipeline/io/factory.py
@@ -0,0 +1,112 @@
+from typing import Optional
+
+from datapipeline.io.writers import (
+    JsonLinesFileWriter,
+    JsonLinesStdoutWriter,
+    GzipJsonLinesWriter,
+    CsvFileWriter,
+    PickleFileWriter,
+    LineWriter,
+)
+from datapipeline.io.protocols import Writer
+from datapipeline.io.serializers import (
+    json_line_serializer,
+    print_serializer,
+    csv_row_serializer,
+    pickle_serializer,
+    record_json_line_serializer,
+    record_print_serializer,
+    record_csv_row_serializer,
+    record_pickle_serializer,
+)
+from datapipeline.io.sinks import StdoutTextSink, RichStdoutSink, ReprRichFormatter, JsonRichFormatter, PlainRichFormatter
+from datapipeline.io.output import OutputTarget
+
+
+def stdout_sink_for(format_: str, visuals: Optional[str]) -> StdoutTextSink:
+    """Select an appropriate stdout sink given format and visuals preference.
+
+    Behavior:
+    - visuals == "rich" or "auto" -> attempt Rich formatting; fallback to plain on error.
+    - anything else -> plain stdout (no Rich formatting).
+    """
+    fmt = (format_ or "print").lower()
+    provider = (visuals or "auto").lower()
+
+    use_rich = provider == "rich" or provider == "auto"
+    if not use_rich:
+        return StdoutTextSink()
+
+    # Prefer Rich when possible; gracefully degrade to plain stdout on any failure.
+    try:
+        if fmt in {"json", "json-lines", "jsonl"}:
+            return RichStdoutSink(JsonRichFormatter())
+        if fmt == "print":
+            return RichStdoutSink(ReprRichFormatter())
+        return RichStdoutSink(PlainRichFormatter())
+    except Exception:
+        return StdoutTextSink()
+
+
+def writer_factory(
+    target: OutputTarget,
+    *,
+    visuals: Optional[str] = None,
+    item_type: str = "sample",
+) -> Writer:
+    transport = target.transport.lower()
+    format_ = target.format.lower()
+    payload = target.payload
+
+    if item_type not in {"sample", "record"}:
+        raise ValueError(f"Unsupported writer item_type '{item_type}'")
+
+    if transport == "stdout":
+        sink = stdout_sink_for(format_, visuals)
+        if format_ in {"json-lines", "json", "jsonl"}:
+            serializer = (
+                record_json_line_serializer()
+                if item_type == "record"
+                else json_line_serializer(payload)
+            )
+            return LineWriter(sink, serializer)
+        if format_ == "print":
+            serializer = (
+                record_print_serializer()
+                if item_type == "record"
+                else print_serializer(payload)
+            )
+            return LineWriter(sink, serializer)
+        raise ValueError(f"Unsupported stdout format '{target.format}'")
+
+    destination = target.destination
+    if destination is None:
+        raise ValueError("fs output requires a destination path")
+    destination.parent.mkdir(parents=True, exist_ok=True)
+
+    suffix = "".join(destination.suffixes).lower()
+    if format_ in {"json-lines", "json", "jsonl"}:
+        serializer = (
+            record_json_line_serializer()
+            if item_type == "record"
+            else json_line_serializer(payload)
+        )
+        if suffix.endswith(".jsonl.gz") or suffix.endswith(".json.gz") or suffix.endswith(".gz"):
+            return GzipJsonLinesWriter(destination, serializer=serializer)
+        return JsonLinesFileWriter(destination, serializer=serializer)
+    if format_ == "csv":
+        serializer = (
+            record_csv_row_serializer()
+            if item_type == "record"
+            else csv_row_serializer(payload)
+        )
+        return CsvFileWriter(destination, serializer=serializer)
+    if format_ == "pickle":
+        serializer = (
+            record_pickle_serializer()
+            if item_type == "record"
+            else pickle_serializer(payload)
+        )
+        return PickleFileWriter(destination, serializer=serializer)
+
+    raise ValueError(f"Unsupported fs format '{target.format}'")
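
A hedged usage sketch of writer_factory with a stdout target. The exact record shape expected by the serializers is not pinned down by this diff, so the dict passed to write() below is only illustrative.

from datapipeline.io.factory import writer_factory
from datapipeline.io.output import OutputTarget

# Stdout targets need no destination path; "off" visuals selects the plain sink.
target = OutputTarget(transport="stdout", format="json-lines", destination=None)
writer = writer_factory(target, visuals="off", item_type="sample")
try:
    writer.write({"features": [1.0, 2.0]})  # illustrative record shape
finally:
    writer.close()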
--- /dev/null
+++ b/datapipeline/io/output.py
@@ -0,0 +1,132 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+from datapipeline.config.tasks import ServeOutputConfig
+from datapipeline.services.runs import RunPaths, start_run_for_directory
+
+
+def _format_suffix(fmt: str) -> str:
+    suffix_map = {
+        "json-lines": ".jsonl",
+        "json": ".json",
+        "csv": ".csv",
+        "pickle": ".pkl",
+    }
+    return suffix_map.get(fmt, ".out")
+
+
+def _default_filename_for_format(fmt: str) -> str:
+    suffix = _format_suffix(fmt)
+    return f"vectors{suffix}"
+
+
+def _sanitize_segment(value: str) -> str:
+    cleaned = "".join(
+        ch if ch.isalnum() or ch in ("_", "-", ".") else "_"
+        for ch in value.strip()
+    )
+    return cleaned or "run"
+
+
+@dataclass(frozen=True)
+class OutputTarget:
+    """Resolved writer target describing how and where to emit records."""
+
+    transport: str  # stdout | fs
+    format: str  # print | json-lines | json | csv | pickle
+    destination: Optional[Path]
+    payload: str = "sample"
+    run: RunPaths | None = None
+
+    def for_feature(self, feature_id: str) -> "OutputTarget":
+        if self.transport != "fs" or self.destination is None:
+            return self
+        safe_feature = "".join(
+            ch if ch.isalnum() or ch in ("_", "-", ".") else "_"
+            for ch in str(feature_id)
+        )
+        dest = self.destination
+        suffix = "".join(dest.suffixes)
+        stem = dest.name[: -len(suffix)] if suffix else dest.name
+        new_name = f"{stem}.{safe_feature}{suffix}"
+        new_path = dest.with_name(new_name)
+        return OutputTarget(
+            transport=self.transport,
+            format=self.format,
+            destination=new_path,
+            payload=self.payload,
+            run=self.run,
+        )
+
+
+class OutputResolutionError(ValueError):
+    """Raised when CLI/config output options cannot be resolved."""
+
+
+def resolve_output_target(
+    *,
+    cli_output: ServeOutputConfig | None,
+    config_output: ServeOutputConfig | None,
+    default: ServeOutputConfig | None = None,
+    base_path: Path | None = None,
+    run_name: str | None = None,
+    payload_override: str | None = None,
+    stage: int | None = None,
+    create_run: bool = False,
+) -> OutputTarget:
+    """
+    Resolve the effective output target using CLI override, run config, or default.
+    """
+
+    base_path = base_path or Path.cwd()
+
+    config = cli_output or config_output or default
+    if config is None:
+        config = ServeOutputConfig(transport="stdout", format="print")
+
+    payload = payload_override or config.payload or "sample"
+
+    if config.transport == "stdout":
+        return OutputTarget(
+            transport="stdout",
+            format=config.format,
+            destination=None,
+            payload=payload,
+            run=None,
+        )
+
+    if config.directory is None:
+        raise OutputResolutionError("fs output requires a directory")
+    directory = (
+        config.directory
+        if config.directory.is_absolute()
+        else (base_path / config.directory).resolve()
+    )
+    if create_run:
+        run_paths, _ = start_run_for_directory(directory, stage=stage)
+        base_dest_dir = run_paths.dataset_dir
+    else:
+        run_paths = None
+        # When not creating a managed run, nest outputs under an optional
+        # run_name subdirectory to keep layouts consistent with tests/CLI.
+        base_dest_dir = directory
+        if run_name:
+            base_dest_dir = base_dest_dir / _sanitize_segment(run_name)
+    suffix = _format_suffix(config.format)
+    filename_stem = config.filename or run_name
+    if filename_stem:
+        filename = f"{filename_stem}{suffix}"
+    else:
+        filename = _default_filename_for_format(config.format)
+    dest_path = (base_dest_dir / filename).resolve()
+
+    return OutputTarget(
+        transport="fs",
+        format=config.format,
+        destination=dest_path,
+        payload=payload,
+        run=run_paths,
+    )
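
A minimal sketch of the resolution order implemented above: with no CLI or config output, the fallback is a stdout/print target.

from datapipeline.io.output import resolve_output_target

target = resolve_output_target(cli_output=None, config_output=None)
assert target.transport == "stdout"
assert target.format == "print"
assert target.destination is None

# fs targets instead resolve to <directory>/<run_name>/<filename or run_name><suffix>,
# and create a managed run directory when create_run=True.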
--- /dev/null
+++ b/datapipeline/io/protocols.py
@@ -0,0 +1,21 @@
+from typing import Protocol, Optional, runtime_checkable
+from pathlib import Path
+
+
+@runtime_checkable
+class Writer(Protocol):
+    def write(self, rec: dict) -> None: ...
+    def close(self) -> None: ...
+
+
+@runtime_checkable
+class HeaderCapable(Protocol):
+    """Writers that can accept an injected logical 'header record' as the first write."""
+
+    def write_header(self, header: dict) -> None: ...
+
+
+@runtime_checkable
+class HasFilePath(Protocol):
+    @property
+    def file_path(self) -> Optional[Path]: ...
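
Because these protocols are runtime_checkable, conformance is structural; a minimal sketch with a hypothetical NullWriter:

from datapipeline.io.protocols import Writer, HasFilePath


class NullWriter:
    """Satisfies Writer structurally; no inheritance required."""

    def write(self, rec: dict) -> None:
        pass

    def close(self) -> None:
        pass


assert isinstance(NullWriter(), Writer)           # has write() and close()
assert not isinstance(NullWriter(), HasFilePath)  # lacks a file_path property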