jerry-thomas 0.3.0__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249)
  1. jerry_thomas-1.0.1/PKG-INFO +825 -0
  2. jerry_thomas-1.0.1/README.md +805 -0
  3. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/pyproject.toml +13 -10
  4. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/analysis/vector/collector.py +120 -17
  5. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/analysis/vector/matrix.py +33 -8
  6. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/analysis/vector/report.py +162 -32
  7. jerry_thomas-1.0.1/src/datapipeline/build/tasks/__init__.py +11 -0
  8. jerry_thomas-1.0.1/src/datapipeline/build/tasks/config.py +74 -0
  9. jerry_thomas-1.0.1/src/datapipeline/build/tasks/metadata.py +170 -0
  10. jerry_thomas-1.0.1/src/datapipeline/build/tasks/scaler.py +73 -0
  11. jerry_thomas-1.0.1/src/datapipeline/build/tasks/schema.py +60 -0
  12. jerry_thomas-1.0.1/src/datapipeline/build/tasks/utils.py +169 -0
  13. jerry_thomas-1.0.1/src/datapipeline/cli/app.py +671 -0
  14. jerry_thomas-1.0.1/src/datapipeline/cli/commands/build.py +263 -0
  15. jerry_thomas-1.0.1/src/datapipeline/cli/commands/contract.py +367 -0
  16. jerry_thomas-1.0.1/src/datapipeline/cli/commands/domain.py +14 -0
  17. jerry_thomas-1.0.1/src/datapipeline/cli/commands/inspect.py +472 -0
  18. jerry_thomas-1.0.1/src/datapipeline/cli/commands/list_.py +45 -0
  19. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/cli/commands/plugin.py +5 -1
  20. jerry_thomas-1.0.1/src/datapipeline/cli/commands/run.py +260 -0
  21. jerry_thomas-1.0.1/src/datapipeline/cli/commands/run_config.py +101 -0
  22. jerry_thomas-1.0.1/src/datapipeline/cli/commands/serve_pipeline.py +156 -0
  23. jerry_thomas-1.0.1/src/datapipeline/cli/commands/source.py +53 -0
  24. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/cli/visuals/__init__.py +4 -2
  25. jerry_thomas-1.0.1/src/datapipeline/cli/visuals/common.py +239 -0
  26. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/cli/visuals/labels.py +15 -15
  27. jerry_thomas-1.0.1/src/datapipeline/cli/visuals/runner.py +66 -0
  28. jerry_thomas-1.0.1/src/datapipeline/cli/visuals/sections.py +20 -0
  29. jerry_thomas-1.0.1/src/datapipeline/cli/visuals/sources.py +151 -0
  30. jerry_thomas-1.0.1/src/datapipeline/cli/visuals/sources_basic.py +260 -0
  31. jerry_thomas-1.0.1/src/datapipeline/cli/visuals/sources_off.py +76 -0
  32. jerry_thomas-1.0.1/src/datapipeline/cli/visuals/sources_rich.py +414 -0
  33. jerry_thomas-1.0.1/src/datapipeline/config/catalog.py +64 -0
  34. jerry_thomas-1.0.1/src/datapipeline/config/context.py +214 -0
  35. jerry_thomas-1.0.1/src/datapipeline/config/dataset/loader.py +36 -0
  36. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/config/dataset/normalize.py +4 -4
  37. jerry_thomas-1.0.1/src/datapipeline/config/metadata.py +43 -0
  38. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/config/postprocess.py +2 -2
  39. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/config/project.py +3 -2
  40. jerry_thomas-1.0.1/src/datapipeline/config/resolution.py +129 -0
  41. jerry_thomas-1.0.1/src/datapipeline/config/tasks.py +309 -0
  42. jerry_thomas-1.0.1/src/datapipeline/config/workspace.py +155 -0
  43. jerry_thomas-1.0.1/src/datapipeline/domain/__init__.py +12 -0
  44. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/domain/record.py +11 -0
  45. jerry_thomas-1.0.1/src/datapipeline/domain/sample.py +54 -0
  46. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/integrations/ml/adapter.py +34 -20
  47. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/integrations/ml/pandas_support.py +0 -2
  48. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/integrations/ml/rows.py +1 -6
  49. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/integrations/ml/torch_support.py +1 -3
  50. jerry_thomas-1.0.1/src/datapipeline/io/factory.py +112 -0
  51. jerry_thomas-1.0.1/src/datapipeline/io/output.py +132 -0
  52. jerry_thomas-1.0.1/src/datapipeline/io/protocols.py +21 -0
  53. jerry_thomas-1.0.1/src/datapipeline/io/serializers.py +219 -0
  54. jerry_thomas-1.0.1/src/datapipeline/io/sinks/__init__.py +23 -0
  55. jerry_thomas-1.0.1/src/datapipeline/io/sinks/base.py +2 -0
  56. jerry_thomas-1.0.1/src/datapipeline/io/sinks/files.py +79 -0
  57. jerry_thomas-1.0.1/src/datapipeline/io/sinks/rich.py +57 -0
  58. jerry_thomas-1.0.1/src/datapipeline/io/sinks/stdout.py +18 -0
  59. jerry_thomas-1.0.1/src/datapipeline/io/writers/__init__.py +14 -0
  60. jerry_thomas-1.0.1/src/datapipeline/io/writers/base.py +28 -0
  61. jerry_thomas-1.0.1/src/datapipeline/io/writers/csv_writer.py +25 -0
  62. jerry_thomas-1.0.1/src/datapipeline/io/writers/jsonl.py +52 -0
  63. jerry_thomas-1.0.1/src/datapipeline/io/writers/pickle_writer.py +30 -0
  64. jerry_thomas-1.0.1/src/datapipeline/pipeline/artifacts.py +58 -0
  65. jerry_thomas-1.0.1/src/datapipeline/pipeline/context.py +128 -0
  66. jerry_thomas-1.0.1/src/datapipeline/pipeline/observability.py +65 -0
  67. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/pipeline/pipelines.py +65 -13
  68. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/pipeline/split.py +11 -10
  69. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/pipeline/stages.py +127 -16
  70. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/pipeline/utils/keygen.py +20 -7
  71. jerry_thomas-1.0.1/src/datapipeline/pipeline/utils/memory_sort.py +39 -0
  72. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/pipeline/utils/transform_utils.py +22 -0
  73. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/runtime.py +5 -2
  74. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/artifacts.py +12 -6
  75. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/bootstrap/config.py +25 -0
  76. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/bootstrap/core.py +52 -37
  77. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/constants.py +6 -5
  78. jerry_thomas-1.0.1/src/datapipeline/services/factories.py +147 -0
  79. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/project_paths.py +43 -16
  80. jerry_thomas-1.0.1/src/datapipeline/services/runs.py +208 -0
  81. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/scaffold/domain.py +3 -2
  82. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/scaffold/filter.py +3 -2
  83. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/scaffold/mappers.py +9 -6
  84. jerry_thomas-1.0.1/src/datapipeline/services/scaffold/plugin.py +93 -0
  85. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/scaffold/source.py +93 -56
  86. jerry_thomas-0.3.0/src/datapipeline/sources/composed_loader.py → jerry_thomas-1.0.1/src/datapipeline/sources/data_loader.py +9 -9
  87. jerry_thomas-1.0.1/src/datapipeline/sources/decoders.py +129 -0
  88. jerry_thomas-1.0.1/src/datapipeline/sources/factory.py +63 -0
  89. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/sources/models/__init__.py +2 -2
  90. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/sources/models/generator.py +0 -7
  91. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/sources/models/loader.py +3 -3
  92. jerry_thomas-1.0.1/src/datapipeline/sources/models/parsing_error.py +24 -0
  93. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/sources/models/source.py +6 -6
  94. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/sources/synthetic/time/loader.py +14 -2
  95. jerry_thomas-1.0.1/src/datapipeline/sources/transports.py +103 -0
  96. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/README.md +142 -0
  97. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  98. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  99. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  100. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  101. {jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default → jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example}/project.yaml +11 -8
  102. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  103. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  104. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  105. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  106. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  107. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  108. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  109. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  110. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  111. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  112. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  113. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  114. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  115. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  116. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  117. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  118. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  119. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  120. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  121. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  122. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/templates/stubs/dto.py.j2 +2 -0
  123. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/templates/stubs/mapper.py.j2 +5 -4
  124. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/templates/stubs/parser.py.j2 +2 -0
  125. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/templates/stubs/record.py.j2 +2 -0
  126. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/templates/stubs/source.yaml.j2 +2 -3
  127. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/debug/lint.py +26 -41
  128. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/feature/scaler.py +89 -13
  129. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/record/floor_time.py +4 -4
  130. jerry_thomas-1.0.1/src/datapipeline/transforms/sequence.py +51 -0
  131. jerry_thomas-1.0.1/src/datapipeline/transforms/stream/dedupe.py +24 -0
  132. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/stream/ensure_ticks.py +7 -6
  133. jerry_thomas-1.0.1/src/datapipeline/transforms/vector/__init__.py +5 -0
  134. jerry_thomas-1.0.1/src/datapipeline/transforms/vector/common.py +98 -0
  135. jerry_thomas-1.0.1/src/datapipeline/transforms/vector/drop/__init__.py +4 -0
  136. jerry_thomas-1.0.1/src/datapipeline/transforms/vector/drop/horizontal.py +79 -0
  137. jerry_thomas-1.0.1/src/datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  138. jerry_thomas-1.0.1/src/datapipeline/transforms/vector/drop/vertical.py +182 -0
  139. jerry_thomas-1.0.1/src/datapipeline/transforms/vector/ensure_schema.py +184 -0
  140. jerry_thomas-1.0.1/src/datapipeline/transforms/vector/fill.py +87 -0
  141. jerry_thomas-1.0.1/src/datapipeline/transforms/vector/replace.py +62 -0
  142. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/utils/load.py +24 -3
  143. jerry_thomas-1.0.1/src/datapipeline/utils/rich_compat.py +38 -0
  144. jerry_thomas-1.0.1/src/datapipeline/utils/window.py +76 -0
  145. jerry_thomas-1.0.1/src/jerry_thomas.egg-info/PKG-INFO +825 -0
  146. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/jerry_thomas.egg-info/SOURCES.txt +78 -25
  147. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/jerry_thomas.egg-info/entry_points.txt +9 -8
  148. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/jerry_thomas.egg-info/requires.txt +1 -0
  149. jerry_thomas-0.3.0/PKG-INFO +0 -502
  150. jerry_thomas-0.3.0/README.md +0 -483
  151. jerry_thomas-0.3.0/src/datapipeline/build/tasks.py +0 -186
  152. jerry_thomas-0.3.0/src/datapipeline/cli/app.py +0 -494
  153. jerry_thomas-0.3.0/src/datapipeline/cli/commands/build.py +0 -39
  154. jerry_thomas-0.3.0/src/datapipeline/cli/commands/domain.py +0 -9
  155. jerry_thomas-0.3.0/src/datapipeline/cli/commands/inspect.py +0 -220
  156. jerry_thomas-0.3.0/src/datapipeline/cli/commands/link.py +0 -128
  157. jerry_thomas-0.3.0/src/datapipeline/cli/commands/list_.py +0 -22
  158. jerry_thomas-0.3.0/src/datapipeline/cli/commands/run.py +0 -274
  159. jerry_thomas-0.3.0/src/datapipeline/cli/commands/source.py +0 -17
  160. jerry_thomas-0.3.0/src/datapipeline/cli/commands/writers.py +0 -138
  161. jerry_thomas-0.3.0/src/datapipeline/cli/visuals/sources.py +0 -138
  162. jerry_thomas-0.3.0/src/datapipeline/config/build.py +0 -64
  163. jerry_thomas-0.3.0/src/datapipeline/config/catalog.py +0 -30
  164. jerry_thomas-0.3.0/src/datapipeline/config/dataset/loader.py +0 -19
  165. jerry_thomas-0.3.0/src/datapipeline/config/run.py +0 -116
  166. jerry_thomas-0.3.0/src/datapipeline/pipeline/context.py +0 -69
  167. jerry_thomas-0.3.0/src/datapipeline/pipeline/utils/memory_sort.py +0 -27
  168. jerry_thomas-0.3.0/src/datapipeline/services/factories.py +0 -25
  169. jerry_thomas-0.3.0/src/datapipeline/services/scaffold/plugin.py +0 -49
  170. jerry_thomas-0.3.0/src/datapipeline/sources/decoders.py +0 -64
  171. jerry_thomas-0.3.0/src/datapipeline/sources/factory.py +0 -53
  172. jerry_thomas-0.3.0/src/datapipeline/sources/transports.py +0 -66
  173. jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/README.md +0 -96
  174. jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  175. jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  176. jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  177. jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  178. jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  179. jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  180. jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  181. jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  182. jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  183. jerry_thomas-0.3.0/src/datapipeline/transforms/sequence.py +0 -84
  184. jerry_thomas-0.3.0/src/datapipeline/transforms/vector.py +0 -210
  185. jerry_thomas-0.3.0/src/datapipeline/utils/__init__.py +0 -0
  186. jerry_thomas-0.3.0/src/jerry_thomas.egg-info/PKG-INFO +0 -502
  187. jerry_thomas-0.3.0/tests/test_config_pipeline.py +0 -19
  188. jerry_thomas-0.3.0/tests/test_regression_vectors.py +0 -249
  189. jerry_thomas-0.3.0/tests/test_run_config.py +0 -99
  190. jerry_thomas-0.3.0/tests/test_scaffold_plugin.py +0 -26
  191. jerry_thomas-0.3.0/tests/test_split_stage.py +0 -46
  192. jerry_thomas-0.3.0/tests/test_transforms.py +0 -225
  193. jerry_thomas-0.3.0/tests/test_vector_analyzer.py +0 -19
  194. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/LICENSE +0 -0
  195. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/setup.cfg +0 -0
  196. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/__init__.py +0 -0
  197. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/analysis/__init__.py +0 -0
  198. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/analysis/vector_analyzer.py +0 -0
  199. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/build/__init__.py +0 -0
  200. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/build/state.py +0 -0
  201. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/cli/commands/filter.py +0 -0
  202. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/config/__init__.py +0 -0
  203. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/config/dataset/dataset.py +0 -0
  204. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/config/dataset/feature.py +0 -0
  205. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/config/split.py +0 -0
  206. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/domain/feature.py +0 -0
  207. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/domain/vector.py +0 -0
  208. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/filters/filters.py +0 -0
  209. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/integrations/__init__.py +0 -0
  210. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/integrations/ml/__init__.py +0 -0
  211. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/mappers/noop.py +0 -0
  212. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/mappers/synthetic/time.py +0 -0
  213. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/parsers/identity.py +0 -0
  214. {jerry_thomas-0.3.0/src/datapipeline/domain → jerry_thomas-1.0.1/src/datapipeline/pipeline}/__init__.py +0 -0
  215. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/pipeline/utils/ordering.py +0 -0
  216. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/plugins.py +0 -0
  217. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/registries/registry.py +0 -0
  218. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/bootstrap/__init__.py +0 -0
  219. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/entrypoints.py +0 -0
  220. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/paths.py +0 -0
  221. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/scaffold/__init__.py +0 -0
  222. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/services/scaffold/templates.py +0 -0
  223. {jerry_thomas-0.3.0/src/datapipeline/pipeline → jerry_thomas-1.0.1/src/datapipeline/sources}/__init__.py +0 -0
  224. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/sources/models/base.py +0 -0
  225. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/sources/models/parser.py +0 -0
  226. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/sources/models/synthetic.py +0 -0
  227. {jerry_thomas-0.3.0/src/datapipeline/sources → jerry_thomas-1.0.1/src/datapipeline/sources/synthetic}/__init__.py +0 -0
  228. {jerry_thomas-0.3.0/src/datapipeline/sources/synthetic → jerry_thomas-1.0.1/src/datapipeline/sources/synthetic/time}/__init__.py +0 -0
  229. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/sources/synthetic/time/parser.py +0 -0
  230. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/templates/plugin_skeleton/pyproject.toml +0 -0
  231. {jerry_thomas-0.3.0/src/datapipeline/sources/synthetic/time → jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}}/__init__.py +0 -0
  232. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/templates/stubs/filter.py.j2 +0 -0
  233. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/templates/stubs/loader_synthetic.py.j2 +0 -0
  234. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/templates/stubs/parser_custom.py.j2 +0 -0
  235. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/debug/identity.py +0 -0
  236. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/feature/model.py +0 -0
  237. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/filter.py +0 -0
  238. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/record/lag.py +0 -0
  239. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/stream/fill.py +0 -0
  240. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/stream/granularity.py +0 -0
  241. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/utils.py +0 -0
  242. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/transforms/vector_utils.py +0 -0
  243. {jerry_thomas-0.3.0/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}} → jerry_thomas-1.0.1/src/datapipeline/utils}/__init__.py +0 -0
  244. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/utils/paths.py +0 -0
  245. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/utils/pickle_model.py +0 -0
  246. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/utils/placeholders.py +0 -0
  247. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/datapipeline/utils/time.py +0 -0
  248. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/jerry_thomas.egg-info/dependency_links.txt +0 -0
  249. {jerry_thomas-0.3.0 → jerry_thomas-1.0.1}/src/jerry_thomas.egg-info/top_level.txt +0 -0
@@ -0,0 +1,825 @@
1
+ Metadata-Version: 2.4
2
+ Name: jerry-thomas
3
+ Version: 1.0.1
4
+ Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
5
+ Author: Anders Skott Lind
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: numpy<3.0,>=1.24
11
+ Requires-Dist: pydantic>=2.0
12
+ Requires-Dist: PyYAML>=5.4
13
+ Requires-Dist: tqdm>=4.0
14
+ Requires-Dist: jinja2>=3.0
15
+ Requires-Dist: rich>=13
16
+ Provides-Extra: ml
17
+ Requires-Dist: pandas>=2.0; extra == "ml"
18
+ Requires-Dist: torch>=2.0; extra == "ml"
19
+ Dynamic: license-file
20
+
21
+ # Jerry Thomas — Datapipeline Runtime
22
+
23
+ Jerry Thomas is a time-series-first data pipeline runtime. It turns declarative
24
+ YAML projects into iterators that stream records, engineered features, and
25
+ model-ready vectors. The CLI lets you preview every stage, build deterministic
26
+ artifacts, inspect quality, and scaffold plugins for custom loaders, parsers,
27
+ transforms, and filters.
28
+
29
+ > **Core assumptions**
30
+ >
31
+ > - Every record carries a timezone-aware `time` attribute and a numeric
32
+ > `value`.
33
+ > - Grouping is purely temporal. Dimensional splits belong in `partition_by`.
34
+
35
+ ---
36
+
37
+ ## Why You Might Use It
38
+
39
+ - Materialize canonical time-series datasets from disparate sources.
40
+ - Preview and debug each stage of the pipeline without writing ad-hoc scripts.
41
+ - Enforce coverage/quality gates and publish artifacts (expected IDs, scaler
42
+ stats) for downstream ML teams.
43
+ - Extend the runtime with entry-point driven plugins for domain-specific I/O or
44
+ feature engineering.
45
+ - Consume vectors directly from Python via iterators, Pandas DataFrames, or
46
+ `torch.utils.data.Dataset`.
47
+
48
+ ---
49
+
50
+ ## Quick Start
51
+
52
+ ```bash
53
+ # 1. Install in editable mode (with optional dev extras for testing).
54
+ pip install -e .[dev]
55
+
56
+ # 2. Bootstrap a project (scaffolds configs, plugin package, and templates).
57
+ jerry plugin init my_datapipeline --out .
58
+
59
+ # 3. Create a source & domain scaffold, then declare a canonical stream.
60
+ # Simple forms
61
+ jerry source add demo weather --transport fs --format csv
62
+ jerry source add demo.weather --transport http --format json
63
+
64
+ # Flag form (explicit)
65
+ jerry source add --provider demo --dataset weather --transport fs --format csv
66
+ jerry domain add weather
67
+ # (edit config/contracts/<alias>.yaml to point at your mapper and policies)
68
+
69
+ # 4. Configure dataset/postprocess/build files under config/.
70
+ # Then preview the pipeline and serve a few vectors:
71
+ # Add --skip-build when you only need a quick feature peek.
72
+ jerry serve --project config/project.yaml --stage 2 --limit 5
73
+ jerry serve --project config/project.yaml --limit 3
74
+
75
+ # 5. Inspect coverage and build artifacts:
76
+ jerry inspect report --project config/project.yaml
77
+ jerry build --project config/project.yaml
78
+ ```
79
+
80
+ The skeleton project in `src/datapipeline/templates/plugin_skeleton/` mirrors the
81
+ paths expected by the CLI. Copy it or run `jerry plugin init` to get a ready-made
82
+ layout with `config/`, `src/<package>/`, and entry-point stubs.
83
+
84
+ ---
85
+
86
+ ## Pipeline Architecture
87
+
88
+ ```text
89
+ raw source ──▶ loader/parser DTOs ──▶ canonical stream ──▶ record policies
90
+ └──▶ feature wrapping ──▶ stream regularization ──▶ feature transforms/sequence
91
+ └──▶ vector assembly ──▶ postprocess transforms
92
+ ```
93
+
94
+ 1. **Loader/parser (Stage 0)** – raw bytes become typed DTOs. Loaders fetch from
95
+ FS/HTTP/synthetic sources; parsers map bytes to DTOs. Register them via entry
96
+ points (`loaders`, `parsers`) and wire them in `config/sources/*.yaml`.
97
+ 2. **Canonical stream mapping (Stage 1)** – mappers attach domain semantics and
98
+ partition keys, producing domain `TemporalRecord`s.
99
+ 3. **Record policies (Stage 2)** – contract `record` rules (filters, floor, lag)
100
+ prune and normalize DTO-derived records.
101
+ 4. **Feature wrapping (Stage 3)** – records become `FeatureRecord`s before
102
+ sort/regularization.
103
+ 5. **Stream regularization (Stage 4)** – contract `stream` rules ensure cadence,
104
+ deduplicate timestamps, and impute where needed.
105
+ 6. **Feature transforms/sequence (Stage 5)** – dataset transforms (scale,
106
+ sequence windows) produce per-feature tensors or windows.
107
+ 7. **Vector assembly (Stage 6)** – features merge by `group_by` cadence into
108
+ `(group_key, Vector)` pairs, prior to postprocess tweaks.
109
+ 8. **Postprocess (Stage 7)** – optional vector transforms (fill/drop/etc.) run
110
+ before results are emitted to the configured output.
111
+
112
+ #### Visual Flowchart
113
+
114
+ ```mermaid
115
+ flowchart TB
116
+ subgraph "CLI &amp; Project config"
117
+ cliSource[jerry source add]
118
+ cliDomain[jerry domain add]
119
+ cliContract[jerry contract]
120
+ cliServe[jerry serve]
121
+ project[[project.yaml]]
122
+ sourcesCfg[config/sources/*.yaml]
123
+ contractsCfg[config/contracts/*.yaml]
124
+ datasetCfg[dataset.yaml]
125
+ postprocessCfg[postprocess.yaml]
126
+ end
127
+
128
+ cliSource --> sourcesCfg
129
+ cliDomain --> domainPkg
130
+ cliContract --> contractsCfg
131
+ cliServe --> vectorSamples
132
+ project -.->|paths.sources| sourcesCfg
133
+ project -.->|paths.streams| contractsCfg
134
+ project -.->|paths.dataset| datasetCfg
135
+ project -.->|paths.postprocess| postprocessCfg
136
+
137
+ subgraph Plugin code
138
+ domainPkg[domains/*]
139
+ mappersPkg[mappers/*]
140
+ end
141
+
142
+ cliContract --> mappersPkg
143
+ domainPkg -. domain models .-> mappersPkg
144
+
145
+ subgraph Registries
146
+ registrySources[sources]
147
+ registryStreamSources[stream_sources]
148
+ registryMappers[mappers]
149
+ registryRecordOps[record_ops]
150
+ registryStreamOps[stream_ops]
151
+ registryDebugOps[debug_ops]
152
+ end
153
+
154
+ subgraph Source wiring
155
+ rawData[(external data)]
156
+ transportSpec[transport + format]
157
+ loaderEP[loader ep]
158
+ parserEP[parser ep]
159
+ sourceArgs[loader args]
160
+ sourceNode[Source]
161
+ dtoStream[(DTOs)]
162
+ end
163
+
164
+ sourcesCfg --> transportSpec
165
+ sourcesCfg --> loaderEP
166
+ sourcesCfg --> parserEP
167
+ sourcesCfg --> sourceArgs
168
+ transportSpec -. select fs/http/synth .-> loaderEP
169
+ loaderEP -. build loader .-> sourceNode
170
+ parserEP -. build parser .-> sourceNode
171
+ sourceArgs -. paths/creds .-> sourceNode
172
+ rawData --> sourceNode --> dtoStream
173
+ sourcesCfg -. build_source_from_spec .-> registrySources
174
+ contractsCfg -. stream_id + source .-> registryStreamSources
175
+ registrySources -. alias to Source .-> registryStreamSources
176
+
177
+ subgraph Canonical stream
178
+ mapperEP[mapper ep]
179
+ recordRules[record rules]
180
+ streamRules[stream rules]
181
+ debugRules[debug rules]
182
+ canonical["DTO -> record"]
183
+ domainRecords((TemporalRecord))
184
+ recordStage[record xforms]
185
+ featureWrap[record -> feature]
186
+ featureRecords((FeatureRecord))
187
+ regularization[stream xforms]
188
+ end
189
+
190
+ dtoStream --> canonical --> domainRecords --> recordStage --> featureWrap --> featureRecords --> regularization
191
+ contractsCfg --> mapperEP
192
+ mappersPkg -. ep target .-> mapperEP
193
+ mapperEP -. build_mapper_from_spec .-> registryMappers
194
+ registryMappers --> canonical
195
+ contractsCfg --> recordRules
196
+ contractsCfg --> streamRules
197
+ contractsCfg --> debugRules
198
+ registryRecordOps --> recordRules
199
+ registryStreamOps --> streamRules
200
+ registryDebugOps --> debugRules
201
+ recordRules --> recordStage
202
+ streamRules --> regularization
203
+ debugRules --> regularization
204
+
205
+ subgraph Dataset shaping
206
+ featureSpec[feature cfg]
207
+ groupBySpec[group_by]
208
+ streamRefs[record_stream ids]
209
+ featureTrans[feature/seq xforms]
210
+ sequenceStream((seq/features))
211
+ vectorStage[vector assembly]
212
+ vectorSamples((samples))
213
+ end
214
+
215
+ datasetCfg --> featureSpec
216
+ datasetCfg --> groupBySpec
217
+ datasetCfg --> streamRefs
218
+ streamRefs -.->|build_feature_pipeline| registryStreamSources
219
+ registryStreamSources -.->|open_source_stream| sourceNode
220
+ featureRecords --> regularization --> featureTrans --> sequenceStream --> vectorStage --> vectorSamples
221
+ featureSpec -. scale/sequence .-> featureTrans
222
+ groupBySpec -. cadence .-> vectorStage
223
+
224
+ subgraph Postprocess
225
+ vectorTransforms[vector xforms]
226
+ postprocessNode[postprocess]
227
+ end
228
+
229
+ postprocessCfg --> vectorTransforms -. drop/fill .-> postprocessNode
230
+ vectorStage --> postprocessNode
231
+ style cliSource width:120px
232
+ style cliDomain width:120px
233
+ style cliContract width:120px
234
+ style cliServe width:120px
235
+ style sourcesCfg width:200px
236
+ style contractsCfg width:200px
237
+ style datasetCfg width:180px
238
+ style postprocessCfg width:200px
239
+ style registrySources width:160px
240
+ style registryStreamSources width:180px
241
+ style registryMappers width:160px
242
+ style registryRecordOps width:180px
243
+ style registryStreamOps width:180px
244
+ style registryDebugOps width:180px
245
+ style transportSpec width:180px
246
+ style loaderEP width:140px
247
+ style parserEP width:140px
248
+ style sourceArgs width:160px
249
+ style canonical width:180px
250
+ style featureTrans width:180px
251
+ style domainRecords width:140px
252
+ style featureRecords width:140px
253
+ style sequenceStream width:180px
254
+ style vectorStage width:160px
255
+ style vectorSamples width:180px
256
+ style recordRules width:160px
257
+ style streamRules width:160px
258
+ style debugRules width:160px
259
+ style domainPkg width:160px
260
+ style mappersPkg width:160px
261
+ ```
262
+
263
+
264
+ Solid arrows trace runtime data flow; dashed edges highlight how the config files
265
+ inject transports, entry points, or policies into each stage.
266
+
267
+ CLI quick path:
268
+ - `jerry source add <provider> <dataset> --transport fs|http|synthetic --format ...` → scaffolds DTO/parser/loader and writes `config/sources/*.yaml`.
269
+ - `jerry domain add <name>` → creates `src/<pkg>/domains/<name>/model.py`.
270
+ - `jerry contract` → picks a source + domain, scaffolds/links a mapper under `mappers/`, registers its entry point, and writes `config/contracts/<stream>.yaml`.
271
+ - `jerry serve --project <project.yaml>` → builds/streams vectors using dataset `record_stream` IDs, registry wiring, and postprocess rules.
272
+
273
+ `config/sources/*.yaml` determines both the transport and parsing strategy:
274
+ you define transport (`fs`, `http`, `synthetic`, etc.), the payload format
275
+ (`csv`, `json`, ...), and the loader/parser entry points. Loader `args`
276
+ typically include file paths, bucket prefixes, or credential references—the
277
+ runtime feeds those arguments into the instantiated loader so it knows exactly
278
+ which external data store to read. Contracts bind each canonical stream to a
279
+ `source` alias (connecting back to the loader/parser pair) and register a
280
+ stream ID; they also specify mapper entry points, record/stream rules,
281
+ partitioning, and batch sizes. Dataset features reference those canonical
282
+ stream IDs via `record_stream`, so each feature config reuses the registered
283
+ stream (and, by extension, the raw source) when you call
284
+ `build_feature_pipeline()` (`src/datapipeline/pipeline/pipelines.py`). Finally,
285
+ `postprocess.yaml` decorates the vector stream with additional filters/fills so
286
+ serve/build outputs inherit the full set of policies. When you run the CLI,
287
+ `bootstrap()` (`src/datapipeline/services/bootstrap/core.py`) loads each
288
+ directory declared in `project.yaml`, instantiates loaders/parsers via
289
+ `build_source_from_spec()` and `load_ep()`, attaches contract registries, and
290
+ hands a fully wired `Runtime` to the pipeline stages in
291
+ `src/datapipeline/pipeline/stages.py`.
292
+
293
+ Every `record_stream` identifier ultimately resolves to the stream entry revived
294
+ by the contract bootstrap step, so requesting stage outputs for a feature always
295
+ walks the entire chain from dataset config → canonical contract → source
296
+ definition. That is why `build_feature_pipeline()` starts by calling
297
+ `open_source_stream(context, record_stream_id)` before stepping through record
298
+ policies, stream policies, and feature transforms.
299
+
300
+ The runtime (`src/datapipeline/runtime.py`) hosts registries for sources,
301
+ transforms, artifacts, and postprocess rules. The CLI constructs lightweight
302
+ `PipelineContext` objects to build iterators without mutating global state.
303
+
304
+ ---
305
+
306
+ ## Configuration Files
307
+
308
+ All project configuration for a dataset lives under a single project root directory (for example `config/`), which contains `project.yaml` and its siblings.
309
+
310
+ ### `project.yaml`
311
+
312
+ ```yaml
313
+ version: 1
314
+ name: default
315
+ paths:
316
+ streams: ./contracts
317
+ sources: ./sources
318
+ dataset: dataset.yaml
319
+ postprocess: postprocess.yaml
320
+ artifacts: ../build/datasets/${project_name}
321
+ tasks: ./tasks
322
+ globals:
323
+ start_time: 2021-01-01T00:00:00Z
324
+ end_time: 2023-01-03T23:00:00Z
325
+ split:
326
+ mode: hash # hash | time
327
+ key: group # group | feature:<id>
328
+ seed: 42
329
+ ratios: { train: 0.8, val: 0.1, test: 0.1 }
330
+ ```
331
+
332
+ - `name` provides a stable identifier you can reuse inside config files via `${project_name}`.
333
+ - `paths.*` are resolved relative to the project file unless absolute; they also support `${var}` interpolation.
334
+ - `globals` provide values for `${var}` interpolation across YAML files. Datetime
335
+ values are normalized to strict UTC `YYYY-MM-DDTHH:MM:SSZ`.
336
+ - `split` config defines how labels are assigned; serve tasks or CLI flags pick the active label via `keep`.
337
+ - `paths.tasks` points to a directory of task specs. Each `*.yaml` file declares `kind: ...`
338
+ (`scaler`, `schema`, `metadata`, `serve`, …). Artifact tasks drive `jerry build`; command
339
+ tasks (currently `kind: serve`) provide presets for `jerry serve`. When multiple serve tasks
340
+ exist, `jerry serve --run <name>` selects by `name`/filename stem.
341
+ - Label names are free-form: match whatever keys you declare in `split.ratios` (hash) or `split.labels` (time).
342
+
343
+ ### Serve Tasks (`tasks/serve.<name>.yaml`)
344
+
345
+ ```yaml
346
+ kind: serve
347
+ name: train # defaults to filename stem when omitted
348
+ keep: train # select active split label (null disables filtering)
349
+ output:
350
+ transport: stdout # stdout | fs
351
+ format: print # print | json-lines | json | csv | pickle
352
+ limit: 100 # cap vectors per serve run (null = unlimited)
353
+ throttle_ms: null # milliseconds to sleep between emitted vectors
354
+ # Optional overrides:
355
+ # log_level: INFO # DEBUG=progress bars, INFO=spinner, WARNING=quiet
356
+ # visuals: AUTO # AUTO | TQDM | RICH | OFF
357
+ # progress: AUTO # AUTO | SPINNER | BARS | OFF
358
+ ```
359
+
360
+ - Each serve task lives alongside artifact tasks under `paths.tasks`. Files are independent—no special directory structure required.
361
+ - `output`, `limit`, `throttle_ms`, and `log_level` provide defaults for `jerry serve`; CLI flags still win per invocation (see *Configuration Resolution Order*). For filesystem outputs, set `transport: fs`, `directory: /path/to/root`, and omit file names—each run automatically writes to `<directory>/<run_name>/<run_name>.<ext>` unless you override the entire `output` block with a custom `filename`.
362
+ - Override `keep` (and other fields) per invocation via `jerry serve ... --keep val` etc.
363
+ - Visuals backend: set `visuals: AUTO|TQDM|RICH|OFF` in the task or use `--visuals`. Pair with `progress: AUTO|SPINNER|BARS|OFF` or `--progress` to control progress layouts.
364
+ - Add additional `kind: serve` files to the tasks directory for other splits (val/test/etc.); `jerry serve` runs each enabled file unless you pass `--run <name>`.
365
+ - Use `jerry.yaml` next to the project or workspace root to define shared defaults (visuals/progress/log level/output); CLI flags still take precedence.
366
+
367
+ ### Workspace Defaults (`jerry.yaml`)
368
+
369
+ Create an optional `jerry.yaml` in the directory where you run the CLI to share settings across commands. The CLI walks up from the current working directory to find the first `jerry.yaml`.
370
+
371
+ ```yaml
372
+ plugin_root: lib/power_plugin # optional repo path for scaffolding (relative to this file)
373
+ config_root: configs/default # directory containing project.yaml (relative paths ok)
374
+
375
+ shared:
376
+ visuals: rich # default visual renderer (auto|tqdm|rich|off)
377
+ progress: bars # spinner|bars|auto|off
378
+
379
+ serve:
380
+ log_level: INFO
381
+ output:
382
+ transport: stdout
383
+ format: print
384
+ # directory: artifacts/serve # Required when transport=fs
385
+
386
+ build:
387
+ log_level: INFO
388
+ mode: AUTO # AUTO | FORCE | OFF
389
+ ```
390
+
391
+ `jerry.yaml` sits near the root of your workspace, while dataset-specific overrides still live in individual `tasks/serve.*.yaml` files as needed.
392
+
393
+ ### Configuration Resolution Order
394
+
395
+ Defaults are layered so you can set global preferences once, keep dataset/run
396
+ files focused on per-project behavior, and still override anything from the CLI.
397
+ For both `jerry serve` and `jerry build`, options are merged in the following
398
+ order (highest precedence first):
399
+
400
+ 1. **CLI flags** – anything you pass on the command line always wins, even if a
401
+ value is already specified elsewhere.
402
+ 2. **Project task files** – `kind: serve` specs (under `project.paths.tasks`)
403
+ supply serve defaults; artifact tasks in the same directory drive `jerry build`.
404
+ These only apply to the dataset that owns the config directory.
405
+ 3. **`jerry.yaml` command blocks** – settings under `jerry.serve` and
406
+ `jerry.build` provide workspace-wide defaults for their respective commands.
407
+ 4. **`jerry.yaml.shared`** – shared fallbacks for visuals/progress/log-level
408
+ style settings that apply to every command when a more specific value is not
409
+ defined.
410
+ 5. **Built-in defaults** – the runtime’s hard-coded values used when nothing else
411
+ sets an option.
412
+
413
+ This hierarchy lets you push opinionated defaults up to the workspace (so every
414
+ project or dataset behaves consistently) while still giving each dataset and
415
+ every CLI invocation the ability to tighten or override behaviors.
416
+
417
+ ### `config/sources/<alias>.yaml`
418
+
419
+ Each file defines a loader/parser pair exposed under `<alias>` (also the
420
+ `id` the rest of the pipeline references). Files may live in nested
421
+ subdirectories under `config/sources/`; discovery is recursive.
422
+
423
+ ```yaml
424
+ id: demo_weather
425
+ parser:
426
+ entrypoint: demo.weather_parser
427
+ args:
428
+ timezone: UTC
429
+ loader:
430
+ entrypoint: demo.csv_loader
431
+ args:
432
+ path: data/weather.csv
433
+ ```
434
+
435
+ ### `config/contracts/<alias>.yaml`
436
+
437
+ Canonical stream contracts describe how the runtime should map and prepare a
438
+ source. Use folders to organize by domain.
439
+
440
+ ```yaml
441
+ kind: ingest
442
+ id: demo_weather
443
+ source: demo_weather
444
+
445
+ mapper:
446
+ entrypoint: weather.domain.mapper
447
+ args: {}
448
+
449
+ partition_by: station
450
+ sort_batch_size: 50000
451
+
452
+ record:
453
+ - filter: { operator: ge, field: time, comparand: "${start_time}" }
454
+ - filter: { operator: lt, field: time, comparand: "${end_time}" }
455
+ - floor_time: { cadence: 10m }
456
+
457
+ stream:
458
+ - ensure_cadence: { cadence: 10m }
459
+ - granularity: { mode: mean }
460
+ - fill: { statistic: median, window: 6, min_samples: 2 }
461
+
462
+ debug:
463
+ - lint: { mode: warn, tick: 10m }
464
+ ```
465
+
466
+ - `record`: ordered record-level transforms (filters, floor/lag, custom
467
+ transforms registered under the `record` entry-point group).
468
+ - `stream`: transforms applied after feature wrapping, still per base feature.
469
+ - `debug`: instrumentation-only transforms (linters, assertions).
470
+ - `partition_by`: optional keys used to suffix feature IDs (e.g., `temp__@station_id:XYZ`).
471
+ - `sort_batch_size`: chunk size used by the in-memory sorter when normalizing
472
+ order before stream transforms.
473
+
474
+ ### Composed Streams (Engineered Domains)
475
+
476
+ Define engineered streams that depend on other canonical streams directly in contracts. The runtime builds each input to stage 4 (ordered + regularized), stream‑aligns by partition + timestamp, runs your composer, and emits fresh records for the derived stream.
477
+
478
+ ```yaml
479
+ # contracts/air_density.processed.yaml
480
+ kind: composed
481
+ id: air_density.processed
482
+ inputs:
483
+ - pressure.processed
484
+ - t=temp_dry.processed
485
+ partition_by: station_id
486
+ sort_batch_size: 20000
487
+
488
+ mapper:
489
+ # Function or class via dotted path; entry points optional
490
+ entrypoint: mypkg.domains.air_density:compose_to_record
491
+ args:
492
+ driver: pressure.processed # optional; defaults to first input
493
+
494
+ # Optional post‑compose policies (run after composition like any stream)
495
+ # record: [...]
496
+ # stream: [...]
497
+ # debug: [...]
498
+ ```
499
+
500
+ Dataset stays minimal — features only reference the composed stream:
501
+
502
+ ```yaml
503
+ # dataset.yaml
504
+ group_by: 1h
505
+ features:
506
+ - id: air_density
507
+ record_stream: air_density.processed
508
+ ```
509
+
510
+ Notes:
511
+
512
+ - Inputs always reference canonical stream_ids (not raw sources).
513
+ - The composed source outputs records; its own `record`/`stream`/`debug` rules still apply afterward.
514
+ - Partitioning for the engineered domain is explicit via `partition_by` on the composed contract.
515
+
516
+ ### `dataset.yaml`
517
+
518
+ Defines which canonical streams become features/targets and the vector bucketing.
519
+
520
+ ```yaml
521
+ group_by: 1h
522
+
523
+ features:
524
+ - id: temp_c
525
+ record_stream: demo_weather
526
+ scale: true
527
+ sequence: { size: 6, stride: 1 }
528
+
529
+ targets:
530
+ - id: precip
531
+ record_stream: demo_weather
532
+ ```
533
+
534
+ - `group_by` controls the cadence for vector partitioning (accepts `Xm|Xmin|Xh`
535
+ — minutes or hours).
536
+ - `scale: true` inserts the standard scaler feature transform (requires scaler
537
+ stats artifact or inline statistics).
538
+ - Downstream consumers can load the `scaler.pkl` artifact and call
539
+ `StandardScaler.inverse_transform` (or `StandardScalerTransform.inverse`)
540
+ to undo scaling.
541
+ - `sequence` emits `FeatureRecordSequence` windows (size, stride, optional
542
+ cadence enforcement via `tick`).
543
+
544
+ ### `postprocess.yaml`
545
+
546
+ Project-scoped vector transforms that run after assembly and before serving.
547
+
548
+ ```yaml
549
+ - drop:
550
+ axis: horizontal
551
+ payload: features
552
+ threshold: 0.95
553
+ - fill:
554
+ statistic: median
555
+ window: 48
556
+ min_samples: 6
557
+ - replace:
558
+ payload: targets
559
+ value: 0.0
560
+ ```
561
+
562
+ - Each transform receives a `Sample`; set `payload: targets` when you want to
563
+ mutate label vectors, otherwise the feature vector is used.
564
+ - Vector transforms rely on the schema artifact (for expected IDs/cadence)
565
+ and scaler stats when scaling is enabled. When no transforms are configured
566
+ the stream passes through unchanged.
567
+
568
+ ### Task Specs (`tasks/*.yaml`)
569
+
570
+ Declare artifact and command tasks under `project.paths.tasks` (default `tasks/`).
571
+ Artifact specs are optional; if you omit them, Jerry falls back to built-in defaults.
572
+ Add a YAML file only when you need to override paths or other parameters.
573
+
574
+ `tasks/scaler.yaml`
575
+
576
+ ```yaml
577
+ kind: scaler
578
+ output: scaler.pkl
579
+ split_label: train
580
+ enabled: true
581
+ ```
582
+
583
+ - `scaler.pkl` is a pickled standard scaler fitted on the requested split.
584
+ - `schema.json` (from the `schema` task) enumerates the discovered feature/target identifiers (including partitions), their kinds (scalar/list), and cadence hints used to enforce ordering downstream.
585
+ - Configure the `schema` task to choose a cadence strategy (currently `max`). Per-feature overrides will be added later; for now every list-valued feature records the max observed length as its enforcement target.
586
+ - `metadata.json` (from the `metadata` task) captures heavier statistics—present/null counts, inferred value types, list-length histograms, per-partition timestamps, and the dataset window. Configure `metadata.window_mode` with `union|intersection|strict|relaxed` (default `intersection`) to control how start/end bounds are derived. `union` considers base features, `intersection` uses their overlap, `strict` intersects every partition, and `relaxed` unions partitions independently.
587
+ - Command tasks (`kind: serve`) live alongside artifact tasks; `jerry serve` reads them directly.
588
+ - Shared run/build defaults (visuals/progress/log level/build mode) live in `jerry.yaml`.
589
+
590
+ ---
591
+
592
+ ## CLI Reference
593
+
594
+ All commands live under the `jerry` entry point (`src/datapipeline/cli/app.py`).
595
+ Pass `--help` on any command for flags.
596
+
597
+ ### Preview Stages
598
+
599
+ - `jerry serve --project <project.yaml> --stage <0-7> --limit N [--log-level LEVEL] [--visuals auto|tqdm|rich|off] [--progress auto|spinner|bars|off]`
600
+ - Stage 0: raw DTOs
601
+ - Stage 1: domain `TemporalRecord`s
602
+ - Stage 2: record transforms applied
603
+ - Stage 3: feature records (before sort/regularization)
604
+ - Stage 4: feature regularization (post stream transforms)
605
+ - Stage 5: feature transforms/sequence outputs
606
+ - Stage 6: vectors assembled (no postprocess)
607
+ - Stage 7: vectors + postprocess transforms
608
+ - Use `--log-level DEBUG` for progress bars, `--log-level INFO` for spinner + prints, or the default (`WARNING`) for minimal output.
609
+   - Ensures build artifacts are current before streaming; the build step runs only when the configuration hash changes. It is skipped automatically for `--stage 0-5` (pre-vector stages), and you can opt out explicitly with `--skip-build`.
610
+ - `jerry serve --project <project.yaml> --out-transport stdout --out-format json-lines --limit N [--include-targets] [--log-level LEVEL] [--visuals ...] [--progress ...] [--run name]`
611
+ - Applies postprocess transforms and optional dataset split before emitting.
612
+ - Use `--out-transport fs --out-format json-lines --out-path build/serve` (or `csv`, `pickle`, etc.) to write artifacts to disk instead of stdout; files land under `<out-path>/<run_name>/`.
613
+ - `--out-payload vector` emits only the vector payload with features/targets
614
+ flattened into schema-ordered lists (no identifier keys) when you don't need
615
+ the group key or metadata. Default is `sample`.
616
+ - Set `--log-level DEBUG` (or set your serve task `log_level: DEBUG`) to reuse the tqdm progress bars when previewing stages.
617
+ - When multiple serve tasks exist, add `--run val` (task name or filename stem) to target a single config; otherwise every enabled task is executed sequentially.
618
+ - Argument precedence follows the order described under *Configuration Resolution Order*.
619
+ - Combine with `--skip-build` when you already have fresh artifacts and want to jump straight into streaming.
620
+
621
+ ### Build & Quality
622
+
623
+ - `jerry inspect report --project <project.yaml> [--threshold 0.95] [--include-targets]`
624
+ - Prints coverage summary (keep/below lists) and writes `coverage.json` under
625
+ the artifacts directory.
626
+ - Add `--matrix csv|html` to persist an availability matrix.
627
+ - `jerry inspect partitions --project <project.yaml> [--include-targets]`
628
+ - Writes discovered partition suffixes to `partitions.json`.
629
+ - `jerry inspect expected --project <project.yaml> [--include-targets]`
630
+ - Writes the full set of observed feature IDs to `expected.txt` (for external tooling; runtime uses `schema.json`).
631
+ - `jerry build --project <project.yaml> [--force] [--visuals ...] [--progress ...]`
632
+ - Regenerates artifact tasks declared under `project.paths.tasks` when the configuration hash changes.
633
+
634
+ ### Scaffolding & Reference
635
+
636
+ - `jerry plugin init <package> --out <dir>` (also supports `-n/--name`)
637
+ - Generates a plugin project (pyproject, package skeleton, config templates).
638
+ - `jerry source add <provider> <dataset> --transport fs|http|synthetic --format csv|json|json-lines|pickle`
639
+   - Also supports `<provider>.<dataset>` via `--alias` or as the first positional argument
640
+ - Flag form remains available: `--provider/--dataset`
641
+ - Creates loader/parser stubs, updates entry points, and drops a matching
642
+ source YAML.
643
+ - `jerry domain add <name>` (also supports `-n/--name`)
644
+ - Adds a `domains/<name>/` package with a `model.py` stub.
645
+ - `jerry filter create --name <identifier>`
646
+ - Scaffolds an entry-point-ready filter (helpful for custom record predicates).
647
+ - `jerry list sources|domains`
648
+ - Introspect configured source aliases or domain packages.
649
+
650
+ ---
651
+
652
+ ## Transform & Filter Library
653
+
654
+ ### Record Filters (`config/contracts[].record`)
655
+
656
+ - Binary comparisons: `eq`, `ne`, `lt`, `le`, `gt`, `ge` (timezone-aware for ISO
657
+ or datetime literals).
658
+ - Membership: `in`, `nin`.
659
+ ```yaml
660
+ - filter: { operator: ge, field: time, comparand: "${start_time}" }
661
+ - filter: { operator: in, field: station, comparand: [a, b, c] }
662
+ ```
663
+
664
+ ### Record Transforms
665
+
666
+ - `floor_time`: snap timestamps down to the nearest cadence (`10m`, `1h`, …).
667
+ - `lag`: add lagged copies of records (see `src/datapipeline/transforms/record/lag.py` for options).
668
+
669
+ ### Stream (Feature) Transforms
670
+
671
+ - `ensure_cadence`: backfill missing ticks with `value=None` records to enforce a
672
+ strict cadence.
673
+ - `granularity`: merge duplicate timestamps using `first|last|mean|median`.
674
+ - `dedupe`: drop exact duplicate records (same id, timestamp, and payload) from
675
+ an already sorted feature stream.
676
+ - `fill`: rolling statistic-based imputation within each feature stream.
677
+ - Custom transforms can be registered under the `stream` entry-point group.
678
+
679
+ ### Feature Transforms
680
+
681
+ - `scale`: wraps `StandardScalerTransform`. Read statistics from the build
682
+ artifact or accept inline `statistics`.
683
+ ```yaml
684
+ scale:
685
+ with_mean: true
686
+ with_std: true
687
+ statistics:
688
+ temp_c__station=001: { mean: 10.3, std: 2.1 }
689
+ ```
690
+
691
+ ### Sequence Transforms
692
+
693
+ - `sequence`: sliding window generator (`size`, `stride`, optional `cadence` to
694
+ enforce contiguous windows). Emits `FeatureRecordSequence` payloads with `.records`.
695
+
696
+ ### Vector (Postprocess) Transforms
697
+
698
+ - `drop`: apply coverage thresholds along the horizontal axis (vectors) or
699
+ vertical axis (features/partitions) using `axis: horizontal|vertical` and
700
+ `threshold`. Vertical mode requires the optional `metadata.json`
701
+ artifact and internally prunes weak partitions.
702
+ - `fill`: impute using rolling statistics from prior vectors (history-based).
703
+ - `replace`: seed missing IDs with a constant or literal value.
704
+ (Jerry automatically enforces the `schema.json` vector schema—ordering +
705
+ cadence—before any configured vector transforms run.)
706
+
707
+ All transforms share a consistent entry-point signature and accept their config
708
+ dict as keyword arguments. Register new ones in `pyproject.toml` under the
709
+ appropriate group (`record`, `stream`, `feature`, `sequence`, `vector`,
710
+ `filters`, `debug`).
711
+
712
+ ---
713
+
714
+ ## Artifacts & Postprocess
715
+
716
+ - `expected.txt`: newline-delimited full feature IDs, generated on demand via
717
+ `jerry inspect expected`. Not required at runtime; transforms derive the
718
+ expected universe from `schema.json`.
719
+ - `schema.json`: output of the `schema` task. Jerry automatically
720
+ enforces this schema during postprocess to impose deterministic ordering and
721
+ list cadence metadata (targets appear whenever the dataset defines them). Window metadata now lives in `metadata.json`.
722
+ - `scaler.pkl`: pickled standard scaler fitted on the configured split. Loaded
723
+ lazily by feature transforms at runtime.
724
+ - Build state is tracked in `artifacts/build/state.json`; config hashes avoid
725
+ redundant runs.
726
+
727
+ If a postprocess transform needs an artifact and it is missing, the runtime will
728
+ raise a descriptive error suggesting `jerry build`.
729
+
730
+ ---
731
+
732
+ ## Splitting & Serving
733
+
734
+ If `project.globals.split` is present, `jerry serve` filters vectors at the
735
+ end of the pipeline:
736
+
737
+ - `mode: hash` – deterministic entity hash using either the group key or a
738
+ specified feature ID.
739
+ - `mode: time` – boundary-based slicing using timestamp labels.
740
+ - `run.keep` (or CLI `--keep`) selects the active slice; use any label name defined in your split config.
741
+
742
+ The split configuration never mutates stored artifacts; it is only applied when
743
+ serving vectors (either via CLI or the Python integrations).
744
+
745
+ ---
746
+
747
+ ## Python Integrations
748
+
749
+ `datapipeline.integrations.ml` demonstrates how to reuse the runtime from
750
+ application code:
751
+
752
+ - `VectorAdapter.from_project(project_yaml)` – bootstrap once, then stream
753
+ vectors or row dicts.
754
+ - `stream_vectors(project_yaml, limit=...)` – iterator matching `jerry serve`.
755
+ - `iter_vector_rows` / `collect_vector_rows` – handy for Pandas or custom sinks.
756
+ - `dataframe_from_vectors` – eager helper that returns a Pandas DataFrame
757
+ (requires `pandas`).
758
+ - `torch_dataset` – builds a `torch.utils.data.Dataset` that yields tensors. See
759
+ `examples/minimal_project/run_torch.py` for usage.
760
+
761
+ ---
762
+
763
+ ## Extending the Runtime
764
+
765
+ ### Entry Points
766
+
767
+ Register custom components in your plugin’s `pyproject.toml`:
768
+
769
+ ```toml
770
+ [project.entry-points."datapipeline.loaders"]
771
+ demo.csv_loader = "my_datapipeline.loaders.csv:CsvLoader"
772
+
773
+ [project.entry-points."datapipeline.parsers"]
774
+ demo.weather_parser = "my_datapipeline.parsers.weather:WeatherParser"
775
+
776
+ [project.entry-points."datapipeline.mappers"]
777
+ time.ticks = "my_datapipeline.mappers.synthetic.ticks:map"
778
+
779
+ [project.entry-points."datapipeline.stream"]
780
+ weather.fill = "my_datapipeline.transforms.weather:CustomFill"
781
+ ```
782
+
783
+ Loader, parser, mapper, and transform classes should provide a callable
784
+ interface (usually `__call__`) matching the runtime expectations. Refer to the
785
+ built-in implementations in `src/datapipeline/sources/`, `src/datapipeline/transforms/`,
786
+ and `src/datapipeline/filters/`.
787
+
788
+ ### Scaffolding Helpers
789
+
790
+ - `datapipeline.services.scaffold.plugin.scaffold_plugin` – invoked by
791
+ `jerry plugin init`.
792
+ - `datapipeline.services.scaffold.source.create_source` – writes loader/parser
793
+ stubs and updates entry points.
794
+ - `datapipeline.services.scaffold.domain.create_domain` – domain DTO skeleton.
795
+ - `datapipeline.services.scaffold.filter.create_filter` – custom filter stub.
796
+ - `datapipeline.services.scaffold.mappers.attach_source_to_domain` – helper for
797
+ programmatically wiring sources to domain mappers and emitting stream
798
+ contracts (useful in custom automation or tests).
799
+
800
+ ---
801
+
802
+ ## Development Workflow
803
+
804
+ - Install dependencies: `pip install -e .[dev]`.
805
+ - Run tests: `pytest`.
806
+ - When iterating on configs, use `jerry serve --stage <n>` to peek into problematic
807
+ stages.
808
+ - After tuning transforms, refresh artifacts: `jerry build`.
809
+ - Use `jerry inspect report --include-targets` to ensure targets meet coverage
810
+ gates before handing vectors to downstream consumers.
811
+
812
+ ---
813
+
814
+ ## Additional Resources
815
+
816
+ - `src/datapipeline/analysis/vector/` – quality metrics collected by
817
+ the inspect commands.
818
+ - `src/datapipeline/pipeline/` – pure functions that wire each stage.
819
+ - `src/datapipeline/services/bootstrap/` – runtime initialization and
820
+ registry population (see `core.py`).
821
+ - `examples/minimal_project/` – runnable demo showing config layout and Torch
822
+ integration.
823
+
824
+ Happy shipping! Build, inspect, and serve consistent time-series features with
825
+ confidence.