jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +5 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +54 -10
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +76 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.1.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.1.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/build/tasks/metadata.py
@@ -0,0 +1,170 @@
+from __future__ import annotations
+
+import json
+from collections import defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Dict, Tuple
+
+from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.config.metadata import (
+    VectorMetadata,
+    Window,
+    FEATURE_VECTORS_COUNT_KEY,
+    TARGET_VECTORS_COUNT_KEY,
+)
+from datapipeline.config.tasks import MetadataTask
+from datapipeline.runtime import Runtime
+from datapipeline.utils.paths import ensure_parent
+from datapipeline.config.dataset.normalize import floor_time_to_bucket
+from datapipeline.utils.time import parse_timecode
+
+from .utils import collect_schema_entries, metadata_entries_from_stats
+
+
+def _entry_window(entry: dict) -> tuple[datetime | None, datetime | None]:
+    start = entry.get("first_ts")
+    end = entry.get("last_ts")
+    return (start if isinstance(start, datetime) else None, end if isinstance(end, datetime) else None)
+
+
+def _group_ranges(entries: list[dict], key_name: str) -> list[tuple[datetime, datetime]]:
+    grouped: dict[str, list[tuple[datetime, datetime]]] = defaultdict(list)
+    for entry in entries:
+        start, end = _entry_window(entry)
+        if start is None or end is None:
+            continue
+        group_key = entry.get(key_name) or entry.get("id")
+        if not isinstance(group_key, str):
+            continue
+        grouped[group_key].append((start, end))
+    ranges: list[tuple[datetime, datetime]] = []
+    for values in grouped.values():
+        group_start = min(start for start, _ in values)
+        group_end = max(end for _, end in values)
+        ranges.append((group_start, group_end))
+    return ranges
+
+
+def _range_union(ranges):
+    if not ranges:
+        return None, None
+    start = min(r[0] for r in ranges)
+    end = max(r[1] for r in ranges)
+    if start >= end:
+        return None, None
+    return start, end
+
+
+def _range_intersection(ranges):
+    if not ranges:
+        return None, None
+    start = max(r[0] for r in ranges)
+    end = min(r[1] for r in ranges)
+    if start >= end:
+        return None, None
+    return start, end
+
+
+def _window_bounds_from_stats(
+    feature_stats: list[dict],
+    target_stats: list[dict],
+    *,
+    mode: str,
+) -> tuple[datetime | None, datetime | None]:
+    base_ranges = _group_ranges(
+        feature_stats, "base_id") + _group_ranges(target_stats, "base_id")
+    partition_ranges = _group_ranges(
+        feature_stats, "id") + _group_ranges(target_stats, "id")
+
+    if mode == "intersection":
+        return _range_intersection(base_ranges)
+    if mode == "strict":
+        return _range_intersection(partition_ranges)
+    if mode == "relaxed":
+        return _range_union(partition_ranges)
+    # default to union
+    return _range_union(base_ranges if base_ranges else partition_ranges)
+
+
+def _window_size(start: datetime | None, end: datetime | None, cadence: str | None) -> int | None:
+    if start is None or end is None or cadence is None:
+        return None
+    try:
+        anchored_start = floor_time_to_bucket(start, cadence)
+        anchored_end = floor_time_to_bucket(end, cadence)
+        step = parse_timecode(cadence)
+        if anchored_end < anchored_start:
+            return None
+        return int(((anchored_end - anchored_start) / step)) + 1
+    except Exception:
+        return None
+
+
+def materialize_metadata(runtime: Runtime, task_cfg: MetadataTask) -> Tuple[str, Dict[str, object]] | None:
+    if not task_cfg.enabled:
+        return None
+    dataset = load_dataset(runtime.project_yaml, "vectors")
+    features_cfgs = list(dataset.features or [])
+    feature_stats, feature_vectors, feature_min, feature_max = collect_schema_entries(
+        runtime,
+        features_cfgs,
+        dataset.group_by,
+        cadence_strategy=task_cfg.cadence_strategy,
+        collect_metadata=True,
+    )
+    target_meta: list[dict] = []
+    target_vectors = 0
+    target_cfgs = list(dataset.targets or [])
+    target_stats: list[dict] = []
+    target_min = target_max = None
+    if target_cfgs:
+        target_stats, target_vectors, target_min, target_max = collect_schema_entries(
+            runtime,
+            target_cfgs,
+            dataset.group_by,
+            cadence_strategy=task_cfg.cadence_strategy,
+            collect_metadata=True,
+        )
+        target_meta = metadata_entries_from_stats(
+            target_stats, task_cfg.cadence_strategy)
+    feature_meta = metadata_entries_from_stats(
+        feature_stats, task_cfg.cadence_strategy)
+
+    generated_at = datetime.now(timezone.utc)
+    window_obj: Window | None = None
+    computed_start, computed_end = _window_bounds_from_stats(
+        feature_stats,
+        target_stats if target_cfgs else [],
+        mode=task_cfg.window_mode,
+    )
+    start = computed_start
+    end = computed_end
+    if start is not None and end is not None and start < end:
+        size = _window_size(start, end, dataset.group_by)
+        window_obj = Window(start=start, end=end,
+                            mode=task_cfg.window_mode, size=size)
+
+    doc = VectorMetadata(
+        schema_version=1,
+        generated_at=generated_at,
+        features=feature_meta,
+        targets=target_meta,
+        counts={
+            FEATURE_VECTORS_COUNT_KEY: feature_vectors,
+            TARGET_VECTORS_COUNT_KEY: target_vectors,
+        },
+        window=window_obj,
+    )
+
+    relative_path = Path(task_cfg.output)
+    destination = (runtime.artifacts_root / relative_path).resolve()
+    ensure_parent(destination)
+    with destination.open("w", encoding="utf-8") as fh:
+        json.dump(doc.model_dump(mode="json"), fh, indent=2)
+
+    meta: Dict[str, object] = {
+        "features": len(feature_meta),
+        "targets": len(target_meta),
+    }
+    return str(relative_path), meta
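The window entry in the emitted metadata depends on task_cfg.window_mode: "union" and "intersection" combine per-base_id ranges, "strict" and "relaxed" combine per-partition (id) ranges, and every mode collapses to (None, None) when the combined range is empty. A minimal stdlib-only sketch of that range arithmetic on toy data (an illustration, not package code):

from datetime import datetime

# Toy per-group time ranges, e.g. one (first_ts, last_ts) pair per base_id.
ranges = [
    (datetime(2024, 1, 1), datetime(2024, 1, 10)),
    (datetime(2024, 1, 5), datetime(2024, 1, 8)),
]

# Union-style bounds: earliest start to latest end.
union_start = min(r[0] for r in ranges)
union_end = max(r[1] for r in ranges)

# Intersection-style bounds: latest start to earliest end, empty when start >= end.
inter_start = max(r[0] for r in ranges)
inter_end = min(r[1] for r in ranges)
if inter_start >= inter_end:
    inter_start = inter_end = None

print(union_start, union_end)  # 2024-01-01 00:00:00 2024-01-10 00:00:00
print(inter_start, inter_end)  # 2024-01-05 00:00:00 2024-01-08 00:00:00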
datapipeline/build/tasks/scaler.py
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Dict, Iterator, Tuple
+
+from datapipeline.config.tasks import ScalerTask
+from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.domain.sample import Sample
+from datapipeline.pipeline.context import PipelineContext
+from datapipeline.pipeline.pipelines import build_vector_pipeline
+from datapipeline.pipeline.split import build_labeler
+from datapipeline.runtime import Runtime
+from datapipeline.transforms.feature.scaler import StandardScaler
+from datapipeline.utils.paths import ensure_parent
+
+
+def materialize_scaler_statistics(runtime: Runtime, task_cfg: ScalerTask) -> Tuple[str, Dict[str, object]] | None:
+    if not task_cfg.enabled:
+        return None
+
+    dataset = load_dataset(runtime.project_yaml, "vectors")
+    feature_cfgs = list(dataset.features or [])
+    target_cfgs = list(dataset.targets or [])
+    if not feature_cfgs and not target_cfgs:
+        return None
+
+    sanitized_features = [cfg.model_copy(update={"scale": False}) for cfg in feature_cfgs]
+    sanitized_targets = [cfg.model_copy(update={"scale": False}) for cfg in target_cfgs]
+
+    context = PipelineContext(runtime)
+    vectors = build_vector_pipeline(
+        context,
+        sanitized_features,
+        dataset.group_by,
+        target_configs=sanitized_targets,
+        rectangular=False,
+    )
+
+    cfg = getattr(runtime, "split", None)
+    labeler = build_labeler(cfg) if cfg else None
+    if not labeler and task_cfg.split_label != "all":
+        raise RuntimeError(
+            f"Cannot compute scaler statistics for split '{task_cfg.split_label}' "
+            "when no split configuration is defined in the project."
+        )
+
+    def _train_stream() -> Iterator[Sample]:
+        for sample in vectors:
+            if labeler and labeler.label(sample.key, sample.features) != task_cfg.split_label:
+                continue
+            yield sample
+
+    scaler = StandardScaler()
+    total_observations = scaler.fit(_train_stream())
+
+    if not scaler.statistics:
+        raise RuntimeError(
+            f"No scaler statistics computed for split '{task_cfg.split_label}'."
+        )
+
+    relative_path = Path(task_cfg.output)
+    destination = (runtime.artifacts_root / relative_path).resolve()
+    ensure_parent(destination)
+
+    scaler.save(destination)
+
+    meta: Dict[str, object] = {
+        "features": len(scaler.statistics),
+        "split": task_cfg.split_label,
+        "observations": total_observations,
+    }
+
+    return str(relative_path), meta
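materialize_scaler_statistics rebuilds the vector pipeline with per-feature scaling disabled, keeps only samples whose split label matches task_cfg.split_label, and fits a StandardScaler on that stream. The scaler itself lives in datapipeline/transforms/feature/scaler.py (also changed in this diff); the sketch below only illustrates the kind of single-pass mean/std accumulation such a fit typically performs, and is an assumption rather than the package's actual implementation:

from collections import defaultdict
from math import sqrt

def fit_stream(samples):
    """Single-pass count/sum/sum-of-squares accumulation per feature id (illustrative only)."""
    acc = defaultdict(lambda: [0, 0.0, 0.0])  # fid -> [n, total, total_sq]
    observations = 0
    for features in samples:              # each sample here is just {feature_id: value}
        for fid, value in features.items():
            cell = acc[fid]
            cell[0] += 1
            cell[1] += value
            cell[2] += value * value
            observations += 1
    stats = {}
    for fid, (n, total, total_sq) in acc.items():
        mean = total / n
        variance = max(total_sq / n - mean * mean, 0.0)
        stats[fid] = {"mean": mean, "std": sqrt(variance)}
    return stats, observations

stats, seen = fit_stream([{"price": 1.0}, {"price": 3.0}])
print(stats["price"], seen)  # {'mean': 2.0, 'std': 1.0} 2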
datapipeline/build/tasks/schema.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Dict, Tuple
+
+from datapipeline.config.tasks import SchemaTask
+from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.runtime import Runtime
+from datapipeline.utils.paths import ensure_parent
+from datapipeline.utils.window import resolve_window_bounds
+
+from .utils import collect_schema_entries, schema_entries_from_stats
+
+
+def materialize_vector_schema(runtime: Runtime, task_cfg: SchemaTask) -> Tuple[str, Dict[str, object]] | None:
+    if not task_cfg.enabled:
+        return None
+    dataset = load_dataset(runtime.project_yaml, "vectors")
+    features_cfgs = list(dataset.features or [])
+    feature_stats, feature_vectors, feature_min, feature_max = collect_schema_entries(
+        runtime,
+        features_cfgs,
+        dataset.group_by,
+        cadence_strategy=task_cfg.cadence_strategy,
+        collect_metadata=False,
+    )
+    target_entries: list[dict] = []
+    target_cfgs = list(dataset.targets or [])
+    target_min = target_max = None
+    if target_cfgs:
+        target_stats, _, target_min, target_max = collect_schema_entries(
+            runtime,
+            target_cfgs,
+            dataset.group_by,
+            cadence_strategy=task_cfg.cadence_strategy,
+            collect_metadata=False,
+        )
+        target_entries = schema_entries_from_stats(target_stats, task_cfg.cadence_strategy)
+    feature_entries = schema_entries_from_stats(feature_stats, task_cfg.cadence_strategy)
+
+    doc = {
+        "schema_version": 1,
+        "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+    }
+    doc["features"] = feature_entries
+    doc["targets"] = target_entries
+
+    relative_path = Path(task_cfg.output)
+    destination = (runtime.artifacts_root / relative_path).resolve()
+    ensure_parent(destination)
+    with destination.open("w", encoding="utf-8") as fh:
+        json.dump(doc, fh, indent=2)
+
+    meta: Dict[str, object] = {
+        "features": len(feature_entries),
+        "targets": len(target_entries),
+    }
+    return str(relative_path), meta
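Given the keys assembled above, the schema artifact written by materialize_vector_schema is a JSON document shaped roughly like the one below; the feature ids and the cadence target are invented for illustration, so treat this as a sketch rather than a guaranteed output:

import json

# Hypothetical schema artifact; ids and the cadence target are made up.
example = {
    "schema_version": 1,
    "generated_at": "2024-01-01T00:00:00Z",
    "features": [
        {"id": "time.ticks.hour_sin", "base_id": "time.ticks.hour_sin", "kind": "scalar"},
        {
            "id": "time.ticks.linear",
            "base_id": "time.ticks.linear",
            "kind": "list",
            "cadence": {"strategy": "max", "target": 24},
        },
    ],
    "targets": [],
}
print(json.dumps(example, indent=2))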
datapipeline/build/tasks/utils.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+from collections import Counter, OrderedDict
+from datetime import datetime
+from typing import Any
+
+from datapipeline.pipeline.context import PipelineContext
+from datapipeline.pipeline.pipelines import build_vector_pipeline
+from datapipeline.runtime import Runtime
+from datapipeline.transforms.vector_utils import base_id as _base_feature_id
+from datapipeline.transforms.utils import is_missing
+
+
+def _type_name(value: object) -> str:
+    if value is None:
+        return "null"
+    return type(value).__name__
+
+
+def collect_schema_entries(
+    runtime: Runtime,
+    configs,
+    group_by: str,
+    *,
+    cadence_strategy: str,
+    collect_metadata: bool,
+) -> tuple[list[dict], int, datetime | None, datetime | None]:
+    configs = list(configs or [])
+    if not configs:
+        return [], 0, None, None
+    sanitized = [cfg.model_copy(update={"scale": False}) for cfg in configs]
+    context = PipelineContext(runtime)
+    vectors = build_vector_pipeline(
+        context,
+        sanitized,
+        group_by,
+        rectangular=False,
+    )
+
+    stats: OrderedDict[str, dict] = OrderedDict()
+    vector_count = 0
+    min_time: datetime | None = None
+    max_time: datetime | None = None
+    for sample in vectors:
+        vector_count += 1
+        ts = sample.key[0] if isinstance(sample.key, tuple) and sample.key else None
+        if isinstance(ts, datetime):
+            min_time = ts if min_time is None else min(min_time, ts)
+            max_time = ts if max_time is None else max(max_time, ts)
+        payload = sample.features
+        for fid, value in payload.values.items():
+            entry = stats.get(fid)
+            if not entry:
+                entry = stats[fid] = {
+                    "id": fid,
+                    "base_id": _base_feature_id(fid),
+                    "kind": None,
+                    "max_length": None,
+                    "present_count": 0,
+                    "null_count": 0,
+                    "scalar_types": set(),
+                    "element_types": set(),
+                    "min_length": None,
+                    "lengths": Counter(),
+                    "first_ts": None,
+                    "last_ts": None,
+                }
+            if isinstance(ts, datetime):
+                prev_start = entry.get("first_ts")
+                entry["first_ts"] = ts if prev_start is None else min(prev_start, ts)
+                prev_end = entry.get("last_ts")
+                entry["last_ts"] = ts if prev_end is None else max(prev_end, ts)
+            if collect_metadata:
+                entry["present_count"] += 1
+            if is_missing(value):
+                if collect_metadata:
+                    entry["null_count"] += 1
+                continue
+            if isinstance(value, list):
+                entry["kind"] = "list"
+                length = len(value)
+                entry["min_length"] = length if entry["min_length"] is None else min(
+                    entry["min_length"], length
+                )
+                entry["max_length"] = length if entry["max_length"] is None else max(
+                    entry["max_length"], length
+                )
+                if collect_metadata:
+                    entry["lengths"][length] += 1
+                    entry["observed_elements"] = entry.get("observed_elements", 0) + sum(
+                        1 for v in value if not is_missing(v)
+                    )
+                    if not value:
+                        entry["element_types"].add("empty")
+                    else:
+                        entry["element_types"].update(_type_name(v) for v in value)
+            else:
+                if entry["kind"] != "list":
+                    entry["kind"] = "scalar"
+                if collect_metadata:
+                    entry["scalar_types"].add(_type_name(value))
+
+    return list(stats.values()), vector_count, min_time, max_time
+
+
+def _resolve_cadence_target(stats: dict, strategy: str) -> int | None:
+    if strategy == "max":
+        max_len = stats.get("max_length")
+        if isinstance(max_len, (int, float)) and max_len > 0:
+            return int(max_len)
+    return None
+
+
+def schema_entries_from_stats(entries: list[dict], cadence_strategy: str) -> list[dict]:
+    doc: list[dict] = []
+    for entry in entries:
+        kind = entry.get("kind") or "scalar"
+        item = {
+            "id": entry["id"],
+            "base_id": entry["base_id"],
+            "kind": kind,
+        }
+        if kind == "list":
+            target = _resolve_cadence_target(entry, cadence_strategy)
+            if target is not None:
+                item["cadence"] = {"strategy": cadence_strategy, "target": target}
+        doc.append(item)
+    return doc
+
+
+def _to_iso(ts: datetime | None) -> str | None:
+    if isinstance(ts, datetime):
+        text = ts.isoformat()
+        if text.endswith("+00:00"):
+            return text[:-6] + "Z"
+        return text
+    return None
+
+
+def metadata_entries_from_stats(entries: list[dict], cadence_strategy: str) -> list[dict]:
+    meta_entries: list[dict] = []
+    for entry in entries:
+        kind = entry.get("kind") or "scalar"
+        item: dict[str, Any] = {
+            "id": entry["id"],
+            "base_id": entry["base_id"],
+            "kind": kind,
+            "present_count": entry.get("present_count", 0),
+            "null_count": entry.get("null_count", 0),
+        }
+        first_ts = _to_iso(entry.get("first_ts"))
+        last_ts = _to_iso(entry.get("last_ts"))
+        if first_ts:
+            item["first_observed"] = first_ts
+        if last_ts:
+            item["last_observed"] = last_ts
+        if kind == "list":
+            item["element_types"] = sorted(entry.get("element_types", []))
+            lengths = entry.get("lengths") or {}
+            item["lengths"] = {str(length): count for length, count in sorted(lengths.items())}
+            target = _resolve_cadence_target(entry, cadence_strategy)
+            if target is not None:
+                item["cadence"] = {"strategy": cadence_strategy, "target": target}
+            if "observed_elements" in entry:
+                item["observed_elements"] = int(entry.get("observed_elements", 0))
+        else:
+            item["value_types"] = sorted(entry.get("scalar_types", []))
+        meta_entries.append(item)
+    return meta_entries
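For comparison with the slimmer schema entries, a list-valued feature's metadata entry also carries the observation counters accumulated above. The dict below mirrors the keys emitted by metadata_entries_from_stats with invented values, purely as an illustration of the artifact's shape:

# Hypothetical metadata entry for one list-valued feature; all values invented.
entry = {
    "id": "time.ticks.linear",
    "base_id": "time.ticks.linear",
    "kind": "list",
    "present_count": 240,
    "null_count": 3,
    "first_observed": "2024-01-01T00:00:00Z",
    "last_observed": "2024-01-10T23:00:00Z",
    "element_types": ["float"],
    "lengths": {"24": 237},
    "cadence": {"strategy": "max", "target": 24},
    "observed_elements": 5688,
}
print(entry["cadence"])  # {'strategy': 'max', 'target': 24}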