jerry-thomas 1.0.3__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. datapipeline/analysis/vector/collector.py +0 -1
  2. datapipeline/build/tasks/config.py +0 -2
  3. datapipeline/build/tasks/metadata.py +0 -2
  4. datapipeline/build/tasks/scaler.py +0 -2
  5. datapipeline/build/tasks/schema.py +0 -2
  6. datapipeline/build/tasks/utils.py +0 -2
  7. datapipeline/cli/app.py +201 -81
  8. datapipeline/cli/commands/contract.py +145 -283
  9. datapipeline/cli/commands/demo.py +13 -0
  10. datapipeline/cli/commands/domain.py +4 -4
  11. datapipeline/cli/commands/dto.py +11 -0
  12. datapipeline/cli/commands/filter.py +2 -2
  13. datapipeline/cli/commands/inspect.py +0 -68
  14. datapipeline/cli/commands/list_.py +30 -13
  15. datapipeline/cli/commands/loader.py +11 -0
  16. datapipeline/cli/commands/mapper.py +82 -0
  17. datapipeline/cli/commands/parser.py +45 -0
  18. datapipeline/cli/commands/run_config.py +1 -3
  19. datapipeline/cli/commands/serve_pipeline.py +5 -7
  20. datapipeline/cli/commands/source.py +106 -18
  21. datapipeline/cli/commands/stream.py +286 -0
  22. datapipeline/cli/visuals/common.py +0 -2
  23. datapipeline/cli/visuals/sections.py +0 -2
  24. datapipeline/cli/workspace_utils.py +0 -3
  25. datapipeline/config/context.py +0 -2
  26. datapipeline/config/dataset/feature.py +1 -0
  27. datapipeline/config/metadata.py +0 -2
  28. datapipeline/config/project.py +0 -2
  29. datapipeline/config/resolution.py +10 -2
  30. datapipeline/config/tasks.py +9 -9
  31. datapipeline/domain/feature.py +3 -0
  32. datapipeline/domain/record.py +7 -7
  33. datapipeline/domain/sample.py +0 -2
  34. datapipeline/domain/vector.py +6 -8
  35. datapipeline/integrations/ml/adapter.py +0 -2
  36. datapipeline/integrations/ml/pandas_support.py +0 -2
  37. datapipeline/integrations/ml/rows.py +0 -2
  38. datapipeline/integrations/ml/torch_support.py +0 -2
  39. datapipeline/io/output.py +0 -2
  40. datapipeline/io/serializers.py +26 -16
  41. datapipeline/mappers/synthetic/time.py +9 -2
  42. datapipeline/pipeline/artifacts.py +3 -5
  43. datapipeline/pipeline/observability.py +0 -2
  44. datapipeline/pipeline/pipelines.py +118 -34
  45. datapipeline/pipeline/stages.py +42 -17
  46. datapipeline/pipeline/utils/spool_cache.py +142 -0
  47. datapipeline/pipeline/utils/transform_utils.py +27 -2
  48. datapipeline/services/artifacts.py +1 -4
  49. datapipeline/services/constants.py +1 -0
  50. datapipeline/services/factories.py +4 -6
  51. datapipeline/services/project_paths.py +0 -2
  52. datapipeline/services/runs.py +0 -2
  53. datapipeline/services/scaffold/contract_yaml.py +76 -0
  54. datapipeline/services/scaffold/demo.py +141 -0
  55. datapipeline/services/scaffold/discovery.py +115 -0
  56. datapipeline/services/scaffold/domain.py +21 -13
  57. datapipeline/services/scaffold/dto.py +31 -0
  58. datapipeline/services/scaffold/filter.py +2 -1
  59. datapipeline/services/scaffold/layout.py +96 -0
  60. datapipeline/services/scaffold/loader.py +61 -0
  61. datapipeline/services/scaffold/mapper.py +116 -0
  62. datapipeline/services/scaffold/parser.py +56 -0
  63. datapipeline/services/scaffold/plugin.py +14 -2
  64. datapipeline/services/scaffold/source_yaml.py +91 -0
  65. datapipeline/services/scaffold/stream_plan.py +110 -0
  66. datapipeline/services/scaffold/utils.py +187 -0
  67. datapipeline/sources/data_loader.py +0 -2
  68. datapipeline/sources/decoders.py +49 -8
  69. datapipeline/sources/factory.py +9 -6
  70. datapipeline/sources/foreach.py +18 -3
  71. datapipeline/sources/synthetic/time/parser.py +1 -1
  72. datapipeline/sources/transports.py +10 -4
  73. datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
  74. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
  75. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
  76. datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
  77. datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
  78. datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
  79. datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
  80. datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
  81. datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
  82. datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
  83. datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
  84. datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
  85. datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
  86. datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
  87. datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
  88. datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
  89. datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
  90. datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
  91. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  92. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
  93. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
  94. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  95. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
  96. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
  97. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
  98. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  99. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
  100. datapipeline/templates/plugin_skeleton/README.md +57 -136
  101. datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
  102. datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
  103. datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
  104. datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
  105. datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
  106. datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
  107. datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
  108. datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
  109. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
  110. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
  111. datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
  112. datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
  113. datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
  114. datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
  115. datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
  116. datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
  117. datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
  118. datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
  119. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
  120. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  121. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
  122. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
  123. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  124. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
  125. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
  126. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +7 -10
  127. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
  128. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
  129. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
  130. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
  131. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
  132. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
  133. datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
  134. datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
  135. datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +14 -0
  136. datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
  137. datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
  138. datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
  139. datapipeline/templates/stubs/dto.py.j2 +1 -1
  140. datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
  141. datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
  142. datapipeline/templates/stubs/mappers/ingest.py.j2 +17 -0
  143. datapipeline/templates/stubs/parser.py.j2 +4 -0
  144. datapipeline/templates/stubs/record.py.j2 +0 -1
  145. datapipeline/templates/stubs/source.yaml.j2 +1 -1
  146. datapipeline/transforms/debug/identity.py +34 -16
  147. datapipeline/transforms/debug/lint.py +14 -11
  148. datapipeline/transforms/feature/scaler.py +5 -12
  149. datapipeline/transforms/filter.py +73 -17
  150. datapipeline/transforms/interfaces.py +58 -0
  151. datapipeline/transforms/record/floor_time.py +10 -7
  152. datapipeline/transforms/record/lag.py +8 -10
  153. datapipeline/transforms/sequence.py +2 -3
  154. datapipeline/transforms/stream/dedupe.py +5 -7
  155. datapipeline/transforms/stream/ensure_ticks.py +39 -24
  156. datapipeline/transforms/stream/fill.py +34 -25
  157. datapipeline/transforms/stream/filter.py +25 -0
  158. datapipeline/transforms/stream/floor_time.py +16 -0
  159. datapipeline/transforms/stream/granularity.py +52 -30
  160. datapipeline/transforms/stream/lag.py +17 -0
  161. datapipeline/transforms/stream/rolling.py +72 -0
  162. datapipeline/transforms/utils.py +42 -10
  163. datapipeline/transforms/vector/drop/horizontal.py +0 -3
  164. datapipeline/transforms/vector/drop/orchestrator.py +0 -3
  165. datapipeline/transforms/vector/drop/vertical.py +0 -2
  166. datapipeline/transforms/vector/ensure_schema.py +0 -2
  167. datapipeline/utils/paths.py +0 -2
  168. datapipeline/utils/placeholders.py +0 -2
  169. datapipeline/utils/rich_compat.py +0 -3
  170. datapipeline/utils/window.py +0 -2
  171. jerry_thomas-2.0.0.dist-info/METADATA +282 -0
  172. jerry_thomas-2.0.0.dist-info/RECORD +264 -0
  173. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/WHEEL +1 -1
  174. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/entry_points.txt +7 -3
  175. datapipeline/services/scaffold/mappers.py +0 -55
  176. datapipeline/services/scaffold/source.py +0 -191
  177. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
  178. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
  179. datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
  180. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
  181. datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
  182. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
  183. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
  184. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
  185. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
  186. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
  187. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
  188. datapipeline/templates/stubs/mapper.py.j2 +0 -22
  189. jerry_thomas-1.0.3.dist-info/METADATA +0 -827
  190. jerry_thomas-1.0.3.dist-info/RECORD +0 -198
  191. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/licenses/LICENSE +0 -0
  192. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/top_level.txt +0 -0
--- a/datapipeline/sources/decoders.py
+++ b/datapipeline/sources/decoders.py
@@ -1,12 +1,11 @@
-from __future__ import annotations
-
 from abc import ABC, abstractmethod
-from typing import Iterable, Iterator, Any, Optional
+from typing import Iterable, Iterator, Any, Optional, Sequence
 import codecs
 import csv
 import io
 import json
 import pickle
+import itertools


 class Decoder(ABC):
@@ -32,7 +31,7 @@ def _iter_text_lines(chunks: Iterable[bytes], encoding: str) -> Iterator[str]:
         idx = buffer.find("\n")
         if idx == -1:
             break
-        line, buffer = buffer[:idx], buffer[idx + 1 :]
+        line, buffer = buffer[:idx], buffer[idx + 1:]
         if line.endswith("\r"):
             line = line[:-1]
         yield line
@@ -53,26 +52,58 @@ def _read_all_text(chunks: Iterable[bytes], encoding: str) -> str:


 class CsvDecoder(Decoder):
-    def __init__(self, *, delimiter: str = ";", encoding: str = "utf-8"):
+    def __init__(
+        self,
+        *,
+        delimiter: str = ";",
+        encoding: str = "utf-8",
+        error_prefixes: Optional[Sequence[str]] = None,
+    ):
         self.delimiter = delimiter
         self.encoding = encoding
+        self._error_prefixes = [p.lower() for p in (error_prefixes or [])]
+
+    def _iter_lines(self, chunks: Iterable[bytes]) -> Iterator[str]:
+        lines = _iter_text_lines(chunks, self.encoding)
+        try:
+            first = next(lines)
+        except StopIteration:
+            return iter(())
+        if self._error_prefixes:
+            lowered = first.lstrip().lower()
+            if any(lowered.startswith(p) for p in self._error_prefixes):
+                raise ValueError(
+                    f"csv response looks like error text: {first[:120]}")
+        return itertools.chain([first], lines)

     def decode(self, chunks: Iterable[bytes]) -> Iterator[dict]:
-        reader = csv.DictReader(_iter_text_lines(chunks, self.encoding), delimiter=self.delimiter)
+        reader = csv.DictReader(self._iter_lines(
+            chunks), delimiter=self.delimiter)
         for row in reader:
             yield row

     def count(self, chunks: Iterable[bytes]) -> Optional[int]:
-        return sum(1 for _ in csv.DictReader(_iter_text_lines(chunks, self.encoding), delimiter=self.delimiter))
+        return sum(1 for _ in csv.DictReader(self._iter_lines(chunks), delimiter=self.delimiter))


 class JsonDecoder(Decoder):
-    def __init__(self, *, encoding: str = "utf-8"):
+    def __init__(self, *, encoding: str = "utf-8", array_field: Optional[str] = None):
         self.encoding = encoding
+        self.array_field = array_field

     def decode(self, chunks: Iterable[bytes]) -> Iterator[Any]:
         text = _read_all_text(chunks, self.encoding)
         data = json.loads(text)
+        if self.array_field:
+            if not isinstance(data, dict):
+                raise ValueError(
+                    "json array_field requires a top-level object")
+            if self.array_field not in data:
+                raise ValueError(
+                    f"json array_field missing: {self.array_field}")
+            data = data[self.array_field]
+        if data is None:
+            return  # TODO MAYBE we NEED DO DO SOMETHING ABOUT THIS so we dont silence it
         if isinstance(data, list):
             for item in data:
                 yield item
@@ -83,6 +114,16 @@ class JsonDecoder(Decoder):
     def count(self, chunks: Iterable[bytes]) -> Optional[int]:
         text = _read_all_text(chunks, self.encoding)
         data = json.loads(text)
+        if self.array_field:
+            if not isinstance(data, dict):
+                raise ValueError(
+                    "json array_field requires a top-level object")
+            if self.array_field not in data:
+                raise ValueError(
+                    f"json array_field missing: {self.array_field}")
+            data = data[self.array_field]
+        if data is None:
+            return 0
         return len(data) if isinstance(data, list) else 1
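Taken together, CsvDecoder gains fail-fast detection of error bodies masquerading as CSV, and JsonDecoder can unwrap a record list nested under a top-level key. A minimal sketch of the new options (the byte payloads below are invented; the import path follows the file list above):

    from datapipeline.sources.decoders import CsvDecoder, JsonDecoder

    # CSV: raise instead of parsing an HTML/error body as rows.
    csv_dec = CsvDecoder(delimiter=",", error_prefixes=["error", "<html"])
    rows = list(csv_dec.decode([b"time,close\n2021-01-04,126.15\n"]))
    # -> [{"time": "2021-01-04", "close": "126.15"}]

    # JSON: pull the record list out of {"results": [...]}.
    json_dec = JsonDecoder(array_field="results")
    items = list(json_dec.decode([b'{"results": [{"close": 126.15}]}']))
    # -> [{"close": 126.15}]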
--- a/datapipeline/sources/factory.py
+++ b/datapipeline/sources/factory.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from typing import Any, Dict

 from datapipeline.sources.data_loader import DataLoader
@@ -19,7 +17,9 @@ def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) ->
     transport: "fs" | "http"
     format: "csv" | "json" | "json-lines" | "pickle" (required for fs/http)
     fs: path (str), glob (bool, optional), encoding (str, default utf-8), delimiter (csv only)
-    http: url (str), headers (dict, optional), params (dict, optional), encoding (str, default utf-8)
+    http: url (str), headers (dict, optional), params (dict, optional), encoding (str, default utf-8), timeout_seconds (float, optional)
+    csv: error_prefixes (list[str], optional)
+    json: array_field (str, optional)
     """

     t = (transport or "").lower()
@@ -40,16 +40,19 @@ def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) ->
         headers: Dict[str, str] = dict(kwargs.get("headers") or {})
         params: Dict[str, Any] = dict(kwargs.get("params") or {})
         encoding = kwargs.get("encoding", "utf-8")
-        source = HttpTransport(url, headers=headers, params=params)
+        timeout_seconds = kwargs.get("timeout_seconds")
+        source = HttpTransport(url, headers=headers, params=params, timeout_seconds=timeout_seconds)
     else:
         raise ValueError(f"unsupported transport: {transport}")

     # Build decoder
     if fmt == "csv":
         delimiter = kwargs.get("delimiter", ";")
-        decoder = CsvDecoder(delimiter=delimiter, encoding=encoding)
+        error_prefixes = kwargs.get("error_prefixes")
+        decoder = CsvDecoder(delimiter=delimiter, encoding=encoding, error_prefixes=error_prefixes)
     elif fmt == "json":
-        decoder = JsonDecoder(encoding=encoding)
+        array_field = kwargs.get("array_field")
+        decoder = JsonDecoder(encoding=encoding, array_field=array_field)
     elif fmt == "json-lines":
         decoder = JsonLinesDecoder(encoding=encoding)
     elif fmt == "pickle":
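For illustration, the new kwargs thread through build_loader like this (hypothetical URL and field names; the keyword names mirror the docstring above):

    from datapipeline.sources.factory import build_loader

    loader = build_loader(
        transport="http",
        format="json",
        url="https://api.example.com/ohlcv",  # placeholder endpoint
        params={"symbol": "MSFT"},
        timeout_seconds=10.0,   # forwarded to HttpTransport
        array_field="results",  # forwarded to JsonDecoder
    )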
--- a/datapipeline/sources/foreach.py
+++ b/datapipeline/sources/foreach.py
@@ -1,6 +1,5 @@
-from __future__ import annotations
-
 import re
+import time
 from typing import Any, Iterator, Mapping

 from datapipeline.plugins import LOADERS_EP
@@ -49,11 +48,13 @@ class ForeachLoader(BaseDataLoader):
         loader: Mapping[str, Any],
         inject_field: str | None = None,
         inject: Mapping[str, Any] | None = None,
+        throttle_seconds: float | None = None,
     ):
         self._key, self._values = self._normalize_foreach(foreach)
         self._loader_spec = self._normalize_loader_spec(loader)
         self._inject_field = inject_field
         self._inject = inject
+        self._throttle_seconds = self._normalize_throttle(throttle_seconds)
         self._current_index: int | None = None
         self._current_value: Any | None = None
         self._current_args: dict[str, Any] | None = None
@@ -68,6 +69,8 @@ class ForeachLoader(BaseDataLoader):

     def load(self) -> Iterator[Any]:
         for i, value in enumerate(self._values, 1):
+            if self._throttle_seconds and i > 1:
+                time.sleep(self._throttle_seconds)
             vars_ = {self._key: value}
             loader_args = self._make_loader_args(vars_)
             loader = self._build_loader(loader_args)
@@ -84,7 +87,9 @@

     def count(self):
         total = 0
-        for value in self._values:
+        for i, value in enumerate(self._values, 1):
+            if self._throttle_seconds and i > 1:
+                time.sleep(self._throttle_seconds)
             vars_ = {self._key: value}
             loader_args = self._make_loader_args(vars_)
             loader = self._build_loader(loader_args)
@@ -119,6 +124,16 @@
             raise TypeError("core.foreach loader.args must be a mapping when provided")
         return dict(loader)

+    @staticmethod
+    def _normalize_throttle(throttle_seconds: float | None) -> float:
+        if throttle_seconds is None:
+            return 0.0
+        if not isinstance(throttle_seconds, (int, float)):
+            raise TypeError("core.foreach throttle_seconds must be a number")
+        if throttle_seconds < 0:
+            raise ValueError("core.foreach throttle_seconds must be >= 0")
+        return float(throttle_seconds)
+
     def _make_loader_args(self, vars_: Mapping[str, Any]) -> dict[str, Any]:
         args = self._loader_spec.get("args") or {}
         interpolated = _interpolate(args, vars_)
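The validation rules of _normalize_throttle fall out directly from the hunk above; note that the first iteration is never delayed, only iterations 2..n:

    ForeachLoader._normalize_throttle(None)  # 0.0 -- throttling disabled
    ForeachLoader._normalize_throttle(1.5)   # 1.5 -- sleep between fetches
    ForeachLoader._normalize_throttle(-1)    # ValueError: must be >= 0
    ForeachLoader._normalize_throttle("2s")  # TypeError: must be a number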
--- a/datapipeline/sources/synthetic/time/parser.py
+++ b/datapipeline/sources/synthetic/time/parser.py
@@ -6,4 +6,4 @@ from datapipeline.domain.record import TemporalRecord
 class TimeRowParser(DataParser[TemporalRecord]):
     def parse(self, raw: Dict[str, Any]) -> Optional[TemporalRecord]:
         t = raw["time"]
-        return TemporalRecord(time=t, value=t)
+        return TemporalRecord(time=t)
--- a/datapipeline/sources/transports.py
+++ b/datapipeline/sources/transports.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from abc import ABC, abstractmethod
 from typing import Iterable, Iterator, List, Dict, Optional, Any
 from urllib.request import Request, urlopen
@@ -65,11 +63,19 @@ class FsGlobTransport(Transport):


 class HttpTransport(Transport):
-    def __init__(self, url: str, headers: Optional[Dict[str, str]] = None, params: Optional[Dict[str, Any]] = None, chunk_size: int = 64 * 1024):
+    def __init__(
+        self,
+        url: str,
+        headers: Optional[Dict[str, str]] = None,
+        params: Optional[Dict[str, Any]] = None,
+        chunk_size: int = 64 * 1024,
+        timeout_seconds: Optional[float] = None,
+    ):
         self.url = url
         self.headers = dict(headers or {})
         self.params: Dict[str, Any] = dict(params or {})
         self.chunk_size = chunk_size
+        self.timeout_seconds = timeout_seconds

     def _build_url(self) -> str:
         if not self.params:
@@ -88,7 +94,7 @@ class HttpTransport(Transport):
         req = Request(req_url, headers=self.headers)

         try:
-            resp = urlopen(req)
+            resp = urlopen(req, timeout=self.timeout_seconds)
         except (URLError, HTTPError) as e:
             raise RuntimeError(f"failed to fetch {self.url}: {e}") from e
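One subtlety: urlopen(req) without a timeout argument honors socket.setdefaulttimeout(), whereas the new urlopen(req, timeout=None) call disables the socket timeout outright when timeout_seconds is left unset. A short sketch of the new knob (placeholder URL):

    from datapipeline.sources.transports import HttpTransport

    transport = HttpTransport(
        "https://api.example.com/ohlcv",  # placeholder URL
        params={"symbol": "AAPL"},
        timeout_seconds=10.0,  # socket-level timeout on urlopen
    )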
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml
@@ -0,0 +1,33 @@
+kind: ingest
+source: sandbox.ohlcv
+id: equity.ohlcv # format: domain.dataset.(variant)
+
+mapper:
+  entrypoint: map_sandbox_ohlcv_dto_to_equity
+  args: {}
+
+cadence: ${group_by} # optional per-contract cadence
+partition_by: ticker
+# sort_batch_size: 100000 # in-memory sort chunk size
+
+record: # record-level transforms
+  - filter: { field: time, operator: ge, comparand: "${start_time}" }
+  - filter: { field: time, operator: le, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" }
+  # - lag: { lag: 10m }
+
+stream: # per-stream transforms (input sorted by partition,time)
+  # - ensure_cadence: { field: close, to: close, cadence: "${cadence}" }
+  # - granularity: { field: close, to: close, mode: first }
+  - rolling: {
+      field: dollar_volume,
+      to: adv5,
+      window: 5,
+      statistic: mean,
+      min_samples: 3,
+    } # compute 5-day average dollar volume (ADV5)
+  - filter: { field: adv5, operator: ge, comparand: 1_000_000 } # filter out illiquid stocks
+  # - fill: { statistic: median, window: 6, min_samples: 1 }
+
+debug: # optional validation-only checks
+  #- lint: { mode: warn, tick: "${cadence}" }
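For readers unfamiliar with the rolling transform, the ADV5 step above is a windowed mean with a minimum-sample guard. A rough sketch of that statistic (the real logic lives in datapipeline/transforms/stream/rolling.py; its exact handling of under-filled windows may differ):

    from collections import deque

    def rolling_mean(values, window=5, min_samples=3):
        buf = deque(maxlen=window)
        for v in values:
            buf.append(v)
            # emit the windowed mean once enough samples accumulated
            yield sum(buf) / len(buf) if len(buf) >= min_samples else None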
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml
@@ -0,0 +1,22 @@
+# See ../../reference/reference/contracts/ingest.reference.yaml for full options.
+kind: ingest
+source: synthetic.ticks
+id: time.ticks.hour_sin
+cadence: ${group_by}
+
+mapper:
+  entrypoint: encode_time
+  args: { mode: hour_sin }
+
+record:
+  - filter: { field: time, operator: ge, comparand: "${start_time}" }
+  - filter: { field: time, operator: le, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" }
+
+stream:
+  - dedupe: {}
+  - granularity: { field: value, to: value, mode: first }
+  - ensure_cadence: { field: value, to: value, cadence: "${cadence}" }
+
+debug:
+  - lint: { mode: error, tick: "${cadence}" }
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml
@@ -0,0 +1,22 @@
+# See ../../reference/reference/contracts/ingest.reference.yaml for full options.
+kind: ingest
+source: synthetic.ticks
+id: time.ticks.linear
+cadence: ${group_by}
+
+mapper:
+  entrypoint: encode_time
+  args: { mode: linear }
+
+record:
+  - filter: { field: time, operator: ge, comparand: "${start_time}" }
+  - filter: { field: time, operator: le, comparand: "${end_time}" }
+  - floor_time: { cadence: "${cadence}" }
+
+stream:
+  - dedupe: {}
+  - granularity: { field: value, to: value, mode: first }
+  - ensure_cadence: { field: value, to: value, cadence: "${cadence}" }
+
+debug:
+  - lint: { mode: error, tick: "${cadence}" }
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl
@@ -0,0 +1,19 @@
+{"time": "2021-01-04 05:00:00+00:00", "open": 129.99, "high": 130.06, "low": 123.47, "close": 126.15, "volume": 1549553.0, "symbol": "AAPL"}
+{"time": "2021-01-05 05:00:00+00:00", "open": 125.55, "high": 128.25, "low": 125.06, "close": 127.5, "volume": 804637.0, "symbol": "AAPL"}
+{"time": "2021-01-06 05:00:00+00:00", "open": 124.2, "high": 127.57, "low": 123.07, "close": 123.35, "volume": 2202534.0, "symbol": "AAPL"}
+{"time": "2021-01-07 05:00:00+00:00", "open": 124.98, "high": 128.14, "low": 124.51, "close": 127.42, "volume": 1440239.0, "symbol": "AAPL"}
+{"time": "2021-01-08 05:00:00+00:00", "open": 128.9, "high": 129.11, "low": 126.81, "close": 128.65, "volume": 1340001.0, "symbol": "AAPL"}
+{"time": "2021-01-11 05:00:00+00:00", "open": 125.81, "high": 126.67, "low": 125.1, "close": 125.53, "volume": 1168071.0, "symbol": "AAPL"}
+{"time": "2021-01-12 05:00:00+00:00", "open": 125.03, "high": 126.26, "low": 123.55, "close": 125.47, "volume": 810812.0, "symbol": "AAPL"}
+{"time": "2021-01-13 05:00:00+00:00", "open": 125.29, "high": 127.96, "low": 125.1, "close": 127.56, "volume": 1341043.0, "symbol": "AAPL"}
+{"time": "2021-01-14 05:00:00+00:00", "open": 127.31, "high": 127.53, "low": 125.39, "close": 125.69, "volume": 969718.0, "symbol": "AAPL"}
+{"time": "2021-01-15 05:00:00+00:00", "open": 125.27, "high": 126.76, "low": 123.65, "close": 123.95, "volume": 1159284.0, "symbol": "AAPL"}
+{"time": "2021-01-19 05:00:00+00:00", "open": 124.37, "high": 125.29, "low": 123.59, "close": 124.39, "volume": 1051940.0, "symbol": "AAPL"}
+{"time": "2021-01-20 05:00:00+00:00", "open": 125.16, "high": 128.99, "low": 125.16, "close": 128.52, "volume": 957072.0, "symbol": "AAPL"}
+{"time": "2021-01-21 05:00:00+00:00", "open": 130.25, "high": 133.7, "low": 130.06, "close": 133.19, "volume": 1994077.0, "symbol": "AAPL"}
+{"time": "2021-01-22 05:00:00+00:00", "open": 132.68, "high": 136.15, "low": 131.45, "close": 135.37, "volume": 1820717.0, "symbol": "AAPL"}
+{"time": "2021-01-25 05:00:00+00:00", "open": 139.22, "high": 141.2, "low": 132.94, "close": 139.14, "volume": 1957404.0, "symbol": "AAPL"}
+{"time": "2021-01-26 05:00:00+00:00", "open": 139.74, "high": 140.48, "low": 137.68, "close": 139.51, "volume": 1242288.0, "symbol": "AAPL"}
+{"time": "2021-01-27 05:00:00+00:00", "open": 139.62, "high": 140.44, "low": 136.79, "close": 138.26, "volume": 1965025.0, "symbol": "AAPL"}
+{"time": "2021-01-28 05:00:00+00:00", "open": 135.82, "high": 138.23, "low": 133.1, "close": 133.1, "volume": 2645618.0, "symbol": "AAPL"}
+{"time": "2021-01-29 05:00:00+00:00", "open": 132.29, "high": 133.09, "low": 126.79, "close": 128.3, "volume": 2609717.0, "symbol": "AAPL"}
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl
@@ -0,0 +1,19 @@
+{"time": "2021-01-04 05:00:00+00:00", "open": 213.55, "high": 213.95, "low": 206.18, "close": 209.1, "volume": 678049.0, "symbol": "MSFT"}
+{"time": "2021-01-05 05:00:00+00:00", "open": 208.76, "high": 209.64, "low": 207.24, "close": 209.29, "volume": 483132.0, "symbol": "MSFT"}
+{"time": "2021-01-06 05:00:00+00:00", "open": 203.82, "high": 207.69, "low": 203.41, "close": 203.5, "volume": 881552.0, "symbol": "MSFT"}
+{"time": "2021-01-07 05:00:00+00:00", "open": 205.33, "high": 210.43, "low": 205.06, "close": 209.52, "volume": 621610.0, "symbol": "MSFT"}
+{"time": "2021-01-08 05:00:00+00:00", "open": 209.81, "high": 211.59, "low": 208.24, "close": 210.74, "volume": 656869.0, "symbol": "MSFT"}
+{"time": "2021-01-11 05:00:00+00:00", "open": 209.6, "high": 210.02, "low": 207.96, "close": 208.67, "volume": 519302.0, "symbol": "MSFT"}
+{"time": "2021-01-12 05:00:00+00:00", "open": 207.77, "high": 208.27, "low": 204.68, "close": 206.44, "volume": 705831.0, "symbol": "MSFT"}
+{"time": "2021-01-13 05:00:00+00:00", "open": 205.3, "high": 207.96, "low": 205.3, "close": 207.83, "volume": 635639.0, "symbol": "MSFT"}
+{"time": "2021-01-14 05:00:00+00:00", "open": 207.37, "high": 208.6, "low": 204.12, "close": 204.77, "volume": 573145.0, "symbol": "MSFT"}
+{"time": "2021-01-15 05:00:00+00:00", "open": 204.57, "high": 205.82, "low": 203.48, "close": 204.15, "volume": 669016.0, "symbol": "MSFT"}
+{"time": "2021-01-19 05:00:00+00:00", "open": 205.09, "high": 208.19, "low": 204.04, "close": 207.69, "volume": 688794.0, "symbol": "MSFT"}
+{"time": "2021-01-20 05:00:00+00:00", "open": 209.06, "high": 216.63, "low": 208.55, "close": 215.25, "volume": 1221985.0, "symbol": "MSFT"}
+{"time": "2021-01-21 05:00:00+00:00", "open": 215.56, "high": 217.12, "low": 213.43, "close": 215.79, "volume": 1226767.0, "symbol": "MSFT"}
+{"time": "2021-01-22 05:00:00+00:00", "open": 217.95, "high": 220.71, "low": 216.89, "close": 216.89, "volume": 1457906.0, "symbol": "MSFT"}
+{"time": "2021-01-25 05:00:00+00:00", "open": 219.82, "high": 220.46, "low": 215.26, "close": 220.09, "volume": 976700.0, "symbol": "MSFT"}
+{"time": "2021-01-26 05:00:00+00:00", "open": 222.66, "high": 224.68, "low": 220.77, "close": 223.1, "volume": 1510093.0, "symbol": "MSFT"}
+{"time": "2021-01-27 05:00:00+00:00", "open": 228.35, "high": 230.69, "low": 220.83, "close": 223.45, "volume": 2063324.0, "symbol": "MSFT"}
+{"time": "2021-01-28 05:00:00+00:00", "open": 226.06, "high": 232.79, "low": 225.58, "close": 229.35, "volume": 1701928.0, "symbol": "MSFT"}
+{"time": "2021-01-29 05:00:00+00:00", "open": 225.83, "high": 228.36, "low": 222.07, "close": 222.31, "volume": 1490894.0, "symbol": "MSFT"}
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/dataset.yaml
@@ -0,0 +1,19 @@
+# See ../reference/reference/dataset.yaml for full options.
+group_by: ${group_by}
+
+features:
+  - id: linear_time
+    record_stream: time.ticks.linear
+    field: value
+    scale: true
+    sequence: { size: 2, stride: 1 }
+
+  - id: closing_price
+    record_stream: equity.ohlcv
+    field: close
+    scale: true
+
+  - id: opening_price
+    record_stream: equity.ohlcv
+    field: open
+    scale: true
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/postprocess.yaml
@@ -0,0 +1,19 @@
+# See ../reference/reference/postprocess.yaml for full options.
+# - drop: # no targets so no effect but included here for demonstration
+#     axis: vertical
+#     payload: targets
+#     threshold: 0.9
+
+- drop: # effectively drops features with >50% missing values. 0 drops in the demo, but included here for demonstration
+    axis: vertical
+    payload: features
+    threshold: 0.5
+
+- drop: # this actually drops some vectors
+    axis: horizontal
+    payload: features
+    threshold: 1
+# - drop: # no targets so no effect but included here for demonstration
+#     axis: horizontal
+#     payload: targets
+#     threshold: 1
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/project.yaml
@@ -0,0 +1,19 @@
+# See ../reference/reference/project.yaml for full options.
+version: 1
+name: demo
+paths:
+  streams: ./contracts
+  sources: ./sources
+  dataset: dataset.yaml
+  postprocess: postprocess.yaml
+  artifacts: ../artifacts/${project_name}/v${version}
+  tasks: ./tasks
+globals:
+  group_by: 1d
+  start_time: 2021-01-01T00:00:00Z
+  end_time: 2021-02-01T00:00:00Z
+split:
+  mode: hash
+  key: group
+  seed: 42
+  ratios: { train: 0.8, val: 0.1, test: 0.1 }
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml
@@ -0,0 +1,17 @@
+# Required identifier for this raw source. Contracts reference it under `source:`.
+id: "sandbox.ohlcv" # suggested format: provider.dataset
+
+# parser.entrypoint: registered parser name (not a file path)
+parser:
+  entrypoint: "sandbox_ohlcv_dto_parser"
+  args: {}
+
+# loader.entrypoint: registered loader name (not a file path)
+loader:
+  entrypoint: "core.io"
+  args:
+    transport: fs
+    format: json-lines
+    path: demo/data/*.jsonl
+    glob: true
+    encoding: utf-8
--- a/datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml
+++ b/datapipeline/templates/demo_skeleton/demo/sources/synthetic.ticks.yaml
@@ -1,3 +1,4 @@
+# See ../../reference/reference/sources/overview.reference.yaml for full options.
 id: synthetic.ticks

 parser:
@@ -9,4 +10,3 @@ loader:
     start: "${start_time}"
     end: "${end_time}"
     frequency: "${group_by}"
-
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml
@@ -0,0 +1,2 @@
+# See ../../reference/reference/tasks/metadata.reference.yaml for full options.
+kind: metadata
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml
@@ -0,0 +1,3 @@
+# See ../../reference/reference/tasks/scaler.reference.yaml for full options.
+kind: scaler
+split_label: train
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml
@@ -0,0 +1,2 @@
+# See ../../reference/reference/tasks/schema.reference.yaml for full options.
+kind: schema
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml
@@ -0,0 +1,4 @@
+# See ../../reference/reference/tasks/serve.reference.yaml for full options.
+kind: serve
+name: test
+keep: test
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml
@@ -0,0 +1,4 @@
+# See ../../reference/reference/tasks/serve.reference.yaml for full options.
+kind: serve
+name: train
+keep: train
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml
@@ -0,0 +1,4 @@
+# See ../../reference/reference/tasks/serve.reference.yaml for full options.
+kind: serve
+name: val
+keep: val
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/scripts/run_dataframe.py
@@ -0,0 +1,20 @@
+from pathlib import Path
+
+from datapipeline.integrations import dataframe_from_vectors
+
+
+def main() -> None:
+    project = Path(__file__).resolve().parent / "project.yaml"
+    df = dataframe_from_vectors(
+        project,
+        limit=None,
+        include_group=True,
+        group_format="mapping",
+        flatten_sequences=True,
+    )
+    print("DataFrame shape:", df.shape)
+    print(df.head())
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/scripts/run_torch.py
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+import torch
+from torch.utils.data import DataLoader
+
+from datapipeline.integrations import torch_dataset
+
+
+def main() -> None:
+    project = Path(__file__).resolve().parent / "project.yaml"
+    ds = torch_dataset(
+        project,
+        limit=256,
+        dtype=torch.float32,
+        flatten_sequences=True,
+    )
+    loader = DataLoader(ds, batch_size=32, shuffle=True)
+    batch = next(iter(loader))
+    print("Feature batch shape:", batch.shape)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py
@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+
+from datapipeline.domain.record import TemporalRecord
+
+
+@dataclass
+class EquityRecord(TemporalRecord):
+    """
+    Domain record for 'equity'.
+    """
+    open: float
+    high: float
+    low: float
+    close: float
+    volume: float
+    dollar_volume: float
+    hl_range: float
+    ticker: str  # equity ticker symbol
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py
@@ -0,0 +1,14 @@
+from dataclasses import dataclass
+from datetime import datetime
+
+
+@dataclass
+class SandboxOhlcvDTO:
+    """Data Transfer Object (DTO) for sandbox OHLCV records."""
+    time: datetime
+    open: float
+    high: float
+    low: float
+    close: float
+    volume: float
+    symbol: str
--- /dev/null
+++ b/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py
@@ -0,0 +1,26 @@
+from typing import Any, Iterator
+
+from {{PACKAGE_NAME}}.domains.equity.model import EquityRecord
+from {{PACKAGE_NAME}}.dtos.sandbox_ohlcv_dto import SandboxOhlcvDTO
+
+
+def map_sandbox_ohlcv_dto_to_equity(
+    stream: Iterator[SandboxOhlcvDTO],
+    **params: Any,
+) -> Iterator[EquityRecord]:
+    """Map SandboxOhlcvDTO records to domain-level EquityRecord records."""
+    for record in stream:
+        yield EquityRecord(
+            time=record.time,  # necessary for correct grouping and ordering
+
+            # filterable fields
+            open=record.open,
+            high=record.high,
+            low=record.low,
+            close=record.close,
+            volume=record.volume,
+            dollar_volume=record.close * record.volume,
+            hl_range=record.high - record.low,
+            ticker=record.symbol,
+            # filterable fields
+        )