jerry-thomas 1.0.3__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (307)
  1. jerry_thomas-2.0.0/PKG-INFO +282 -0
  2. jerry_thomas-2.0.0/README.md +259 -0
  3. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/pyproject.toml +10 -4
  4. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/analysis/vector/collector.py +0 -1
  5. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/build/tasks/config.py +0 -2
  6. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/build/tasks/metadata.py +0 -2
  7. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/build/tasks/scaler.py +0 -2
  8. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/build/tasks/schema.py +0 -2
  9. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/build/tasks/utils.py +0 -2
  10. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/app.py +201 -81
  11. jerry_thomas-2.0.0/src/datapipeline/cli/commands/contract.py +235 -0
  12. jerry_thomas-2.0.0/src/datapipeline/cli/commands/demo.py +13 -0
  13. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/commands/domain.py +4 -4
  14. jerry_thomas-2.0.0/src/datapipeline/cli/commands/dto.py +11 -0
  15. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/commands/filter.py +2 -2
  16. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/commands/inspect.py +0 -68
  17. jerry_thomas-2.0.0/src/datapipeline/cli/commands/list_.py +62 -0
  18. jerry_thomas-2.0.0/src/datapipeline/cli/commands/loader.py +11 -0
  19. jerry_thomas-2.0.0/src/datapipeline/cli/commands/mapper.py +82 -0
  20. jerry_thomas-2.0.0/src/datapipeline/cli/commands/parser.py +45 -0
  21. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/commands/run_config.py +1 -3
  22. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/commands/serve_pipeline.py +5 -7
  23. jerry_thomas-2.0.0/src/datapipeline/cli/commands/source.py +146 -0
  24. jerry_thomas-2.0.0/src/datapipeline/cli/commands/stream.py +286 -0
  25. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/visuals/common.py +0 -2
  26. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/visuals/sections.py +0 -2
  27. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/workspace_utils.py +0 -3
  28. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/context.py +0 -2
  29. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/dataset/feature.py +1 -0
  30. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/metadata.py +0 -2
  31. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/project.py +0 -2
  32. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/resolution.py +10 -2
  33. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/tasks.py +9 -9
  34. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/domain/feature.py +3 -0
  35. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/domain/record.py +7 -7
  36. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/domain/sample.py +0 -2
  37. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/domain/vector.py +6 -8
  38. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/integrations/ml/adapter.py +0 -2
  39. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/integrations/ml/pandas_support.py +0 -2
  40. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/integrations/ml/rows.py +0 -2
  41. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/integrations/ml/torch_support.py +0 -2
  42. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/output.py +0 -2
  43. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/serializers.py +26 -16
  44. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/mappers/synthetic/time.py +9 -2
  45. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/artifacts.py +3 -5
  46. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/observability.py +0 -2
  47. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/pipelines.py +118 -34
  48. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/stages.py +42 -17
  49. jerry_thomas-2.0.0/src/datapipeline/pipeline/utils/spool_cache.py +142 -0
  50. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/utils/transform_utils.py +27 -2
  51. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/artifacts.py +1 -4
  52. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/constants.py +1 -0
  53. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/factories.py +4 -6
  54. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/project_paths.py +0 -2
  55. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/runs.py +0 -2
  56. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/contract_yaml.py +76 -0
  57. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/demo.py +141 -0
  58. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/discovery.py +115 -0
  59. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/domain.py +34 -0
  60. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/dto.py +31 -0
  61. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/scaffold/filter.py +2 -1
  62. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/layout.py +96 -0
  63. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/loader.py +61 -0
  64. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/mapper.py +116 -0
  65. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/parser.py +56 -0
  66. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/scaffold/plugin.py +14 -2
  67. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/source_yaml.py +91 -0
  68. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/stream_plan.py +110 -0
  69. jerry_thomas-2.0.0/src/datapipeline/services/scaffold/utils.py +187 -0
  70. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/data_loader.py +0 -2
  71. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/decoders.py +49 -8
  72. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/factory.py +9 -6
  73. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/foreach.py +18 -3
  74. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/synthetic/time/parser.py +1 -1
  75. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/transports.py +10 -4
  76. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
  77. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
  78. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
  79. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
  80. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
  81. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
  82. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
  83. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
  84. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
  85. {jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example → jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
  86. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
  87. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
  88. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
  89. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
  90. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
  91. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
  92. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
  93. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
  94. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
  95. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  96. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
  97. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
  98. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
  99. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  100. jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
  101. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/README.md +63 -0
  102. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/jerry.yaml +22 -0
  103. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
  104. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
  105. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
  106. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
  107. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
  108. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
  109. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
  110. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
  111. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
  112. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
  113. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
  114. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
  115. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
  116. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
  117. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
  118. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
  119. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
  120. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  121. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
  122. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  123. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
  124. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
  125. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  126. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +19 -0
  127. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +20 -0
  128. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +19 -0
  129. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +2 -0
  130. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +3 -0
  131. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  132. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  133. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +4 -0
  134. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  135. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
  136. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
  137. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +14 -0
  138. jerry_thomas-2.0.0/src/datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
  139. jerry_thomas-2.0.0/src/datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
  140. jerry_thomas-2.0.0/src/datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
  141. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/templates/stubs/dto.py.j2 +1 -1
  142. jerry_thomas-2.0.0/src/datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
  143. jerry_thomas-2.0.0/src/datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
  144. jerry_thomas-2.0.0/src/datapipeline/templates/stubs/mappers/ingest.py.j2 +17 -0
  145. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/templates/stubs/parser.py.j2 +4 -0
  146. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/templates/stubs/record.py.j2 +0 -1
  147. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/templates/stubs/source.yaml.j2 +1 -1
  148. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/debug/identity.py +34 -16
  149. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/debug/lint.py +14 -11
  150. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/feature/scaler.py +5 -12
  151. jerry_thomas-2.0.0/src/datapipeline/transforms/filter.py +116 -0
  152. jerry_thomas-2.0.0/src/datapipeline/transforms/interfaces.py +58 -0
  153. jerry_thomas-2.0.0/src/datapipeline/transforms/record/floor_time.py +20 -0
  154. jerry_thomas-2.0.0/src/datapipeline/transforms/record/lag.py +16 -0
  155. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/sequence.py +2 -3
  156. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/stream/dedupe.py +5 -7
  157. jerry_thomas-2.0.0/src/datapipeline/transforms/stream/ensure_ticks.py +49 -0
  158. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/stream/fill.py +34 -25
  159. jerry_thomas-2.0.0/src/datapipeline/transforms/stream/filter.py +25 -0
  160. jerry_thomas-2.0.0/src/datapipeline/transforms/stream/floor_time.py +16 -0
  161. jerry_thomas-2.0.0/src/datapipeline/transforms/stream/granularity.py +114 -0
  162. jerry_thomas-2.0.0/src/datapipeline/transforms/stream/lag.py +17 -0
  163. jerry_thomas-2.0.0/src/datapipeline/transforms/stream/rolling.py +72 -0
  164. jerry_thomas-2.0.0/src/datapipeline/transforms/utils.py +58 -0
  165. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/vector/drop/horizontal.py +0 -3
  166. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/vector/drop/orchestrator.py +0 -3
  167. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/vector/drop/vertical.py +0 -2
  168. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/vector/ensure_schema.py +0 -2
  169. jerry_thomas-2.0.0/src/datapipeline/utils/__init__.py +0 -0
  170. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/utils/paths.py +0 -2
  171. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/utils/placeholders.py +0 -2
  172. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/utils/rich_compat.py +0 -3
  173. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/utils/window.py +0 -2
  174. jerry_thomas-2.0.0/src/jerry_thomas.egg-info/PKG-INFO +282 -0
  175. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/jerry_thomas.egg-info/SOURCES.txt +81 -15
  176. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/jerry_thomas.egg-info/entry_points.txt +7 -3
  177. jerry_thomas-1.0.3/PKG-INFO +0 -827
  178. jerry_thomas-1.0.3/README.md +0 -807
  179. jerry_thomas-1.0.3/src/datapipeline/cli/commands/contract.py +0 -373
  180. jerry_thomas-1.0.3/src/datapipeline/cli/commands/list_.py +0 -45
  181. jerry_thomas-1.0.3/src/datapipeline/cli/commands/source.py +0 -58
  182. jerry_thomas-1.0.3/src/datapipeline/services/scaffold/domain.py +0 -26
  183. jerry_thomas-1.0.3/src/datapipeline/services/scaffold/mappers.py +0 -55
  184. jerry_thomas-1.0.3/src/datapipeline/services/scaffold/source.py +0 -191
  185. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/README.md +0 -142
  186. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
  187. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
  188. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
  189. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
  190. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
  191. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
  192. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
  193. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
  194. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
  195. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
  196. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
  197. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/jerry.yaml +0 -34
  198. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +0 -18
  199. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +0 -29
  200. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +0 -22
  201. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +0 -3
  202. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +0 -9
  203. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +0 -2
  204. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +0 -4
  205. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +0 -28
  206. jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +0 -4
  207. jerry_thomas-1.0.3/src/datapipeline/templates/stubs/mapper.py.j2 +0 -22
  208. jerry_thomas-1.0.3/src/datapipeline/transforms/filter.py +0 -60
  209. jerry_thomas-1.0.3/src/datapipeline/transforms/record/floor_time.py +0 -17
  210. jerry_thomas-1.0.3/src/datapipeline/transforms/record/lag.py +0 -18
  211. jerry_thomas-1.0.3/src/datapipeline/transforms/stream/ensure_ticks.py +0 -34
  212. jerry_thomas-1.0.3/src/datapipeline/transforms/stream/granularity.py +0 -92
  213. jerry_thomas-1.0.3/src/datapipeline/transforms/utils.py +0 -26
  214. jerry_thomas-1.0.3/src/jerry_thomas.egg-info/PKG-INFO +0 -827
  215. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/LICENSE +0 -0
  216. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/setup.cfg +0 -0
  217. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/__init__.py +0 -0
  218. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/analysis/__init__.py +0 -0
  219. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/analysis/vector/matrix.py +0 -0
  220. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/analysis/vector/report.py +0 -0
  221. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/analysis/vector_analyzer.py +0 -0
  222. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/build/__init__.py +0 -0
  223. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/build/state.py +0 -0
  224. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/build/tasks/__init__.py +0 -0
  225. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/commands/build.py +0 -0
  226. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/commands/plugin.py +0 -0
  227. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/commands/run.py +0 -0
  228. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/visuals/__init__.py +0 -0
  229. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/visuals/labels.py +0 -0
  230. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/visuals/runner.py +0 -0
  231. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/visuals/sources.py +0 -0
  232. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/visuals/sources_basic.py +0 -0
  233. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/visuals/sources_off.py +0 -0
  234. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/cli/visuals/sources_rich.py +0 -0
  235. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/__init__.py +0 -0
  236. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/catalog.py +0 -0
  237. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/dataset/dataset.py +0 -0
  238. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/dataset/loader.py +0 -0
  239. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/dataset/normalize.py +0 -0
  240. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/postprocess.py +0 -0
  241. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/split.py +0 -0
  242. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/config/workspace.py +0 -0
  243. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/domain/__init__.py +0 -0
  244. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/filters/filters.py +0 -0
  245. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/integrations/__init__.py +0 -0
  246. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/integrations/ml/__init__.py +0 -0
  247. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/factory.py +0 -0
  248. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/protocols.py +0 -0
  249. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/sinks/__init__.py +0 -0
  250. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/sinks/base.py +0 -0
  251. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/sinks/files.py +0 -0
  252. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/sinks/rich.py +0 -0
  253. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/sinks/stdout.py +0 -0
  254. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/writers/__init__.py +0 -0
  255. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/writers/base.py +0 -0
  256. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/writers/csv_writer.py +0 -0
  257. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/writers/jsonl.py +0 -0
  258. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/io/writers/pickle_writer.py +0 -0
  259. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/mappers/noop.py +0 -0
  260. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/parsers/identity.py +0 -0
  261. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/__init__.py +0 -0
  262. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/context.py +0 -0
  263. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/split.py +0 -0
  264. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/utils/keygen.py +0 -0
  265. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/utils/memory_sort.py +0 -0
  266. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/pipeline/utils/ordering.py +0 -0
  267. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/plugins.py +0 -0
  268. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/registries/registry.py +0 -0
  269. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/runtime.py +0 -0
  270. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/bootstrap/__init__.py +0 -0
  271. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/bootstrap/config.py +0 -0
  272. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/bootstrap/core.py +0 -0
  273. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/entrypoints.py +0 -0
  274. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/paths.py +0 -0
  275. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/scaffold/__init__.py +0 -0
  276. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/services/scaffold/templates.py +0 -0
  277. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/__init__.py +0 -0
  278. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/models/__init__.py +0 -0
  279. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/models/base.py +0 -0
  280. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/models/generator.py +0 -0
  281. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/models/loader.py +0 -0
  282. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/models/parser.py +0 -0
  283. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/models/parsing_error.py +0 -0
  284. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/models/source.py +0 -0
  285. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/models/synthetic.py +0 -0
  286. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/synthetic/__init__.py +0 -0
  287. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/synthetic/time/__init__.py +0 -0
  288. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/sources/synthetic/time/loader.py +0 -0
  289. {jerry_thomas-1.0.3/src/datapipeline/templates/plugin_skeleton → jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton}/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  290. {jerry_thomas-1.0.3/src/datapipeline/utils → jerry_thomas-2.0.0/src/datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity}/__init__.py +0 -0
  291. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/templates/plugin_skeleton/pyproject.toml +0 -0
  292. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/templates/stubs/filter.py.j2 +0 -0
  293. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/templates/stubs/loader_synthetic.py.j2 +0 -0
  294. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/templates/stubs/parser_custom.py.j2 +0 -0
  295. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/feature/model.py +0 -0
  296. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/vector/__init__.py +0 -0
  297. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/vector/common.py +0 -0
  298. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/vector/drop/__init__.py +0 -0
  299. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/vector/fill.py +0 -0
  300. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/vector/replace.py +0 -0
  301. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/transforms/vector_utils.py +0 -0
  302. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/utils/load.py +0 -0
  303. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/utils/pickle_model.py +0 -0
  304. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/datapipeline/utils/time.py +0 -0
  305. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/jerry_thomas.egg-info/dependency_links.txt +0 -0
  306. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/jerry_thomas.egg-info/requires.txt +0 -0
  307. {jerry_thomas-1.0.3 → jerry_thomas-2.0.0}/src/jerry_thomas.egg-info/top_level.txt +0 -0
@@ -0,0 +1,282 @@
+ Metadata-Version: 2.4
+ Name: jerry-thomas
+ Version: 2.0.0
+ Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
+ Author: Anders Skott Lind
+ License: MIT
+ Project-URL: Homepage, https://github.com/mr-lovalova/datapipeline
+ Project-URL: Repository, https://github.com/mr-lovalova/datapipeline
+ Project-URL: Issues, https://github.com/mr-lovalova/datapipeline/issues
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy<3.0,>=1.24
+ Requires-Dist: pydantic>=2.0
+ Requires-Dist: PyYAML>=5.4
+ Requires-Dist: tqdm>=4.0
+ Requires-Dist: jinja2>=3.0
+ Requires-Dist: rich>=13
+ Provides-Extra: ml
+ Requires-Dist: pandas>=2.0; extra == "ml"
+ Requires-Dist: torch>=2.0; extra == "ml"
+ Dynamic: license-file
+
+ # Datapipeline Runtime
+
+ Named after the famous bartender, Jerry Thomas is a time-series-first data
+ pipeline runtime that mixes disparate data sources into fresh, ready-to-serve
+ vectors using declarative YAML recipes. Everything is on-demand, iterator-first:
+ data streams through the pipeline without pre-batching the whole dataset in
+ memory. Like any good bartender, Jerry obsesses over quality control and
+ service, offering stage-by-stage observability along the way. And no bar is
+ complete without proper tools: deterministic artifacts and plugin scaffolding
+ for custom loaders, parsers, transforms, and filters.
+
+ Contributing: PRs welcome on [GitHub](https://github.com/mr-lovalova/datapipeline).
+
+ > **Core assumptions**
+ >
+ > - Every record carries a timezone-aware `time` attribute and a numeric
+ >   `value`. The time-zone awareness is a quality gate to ensure correct vector assembly.
+ > - Grouping is purely temporal. Dimensional splits belong in `partition_by`.
+
+ ---
+
+ ## Why You Might Use It
+
+ - Materialize canonical time-series datasets from disparate sources.
+ - Preview and debug each stage of the pipeline without writing ad-hoc scripts.
+ - Enforce coverage/quality gates and publish artifacts (schema, scaler stats)
+   for downstream ML teams.
+ - Extend the runtime with entry-point driven plugins for domain-specific I/O or
+   feature engineering.
+ - Consume vectors directly from Python via iterators, Pandas DataFrames, or
+   `torch.utils.data.Dataset`.
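A quick sketch of that last consumption path. The torch APIs below are real; `iter_samples` and the `Sample` attribute shapes (dict-like `features`/`targets`) are assumptions for illustration — the package's actual adapters live under `datapipeline/integrations/ml/` (see `docs/python.md`):

```python
# Sketch: adapt a sample iterator to torch. Only the torch APIs here are
# real; `sample_iter_factory` and the Sample attribute shapes are assumed.
import torch
from torch.utils.data import DataLoader, IterableDataset

class SampleDataset(IterableDataset):
    def __init__(self, sample_iter_factory):
        # A callable returning a fresh iterator, so each epoch can re-stream.
        self.sample_iter_factory = sample_iter_factory

    def __iter__(self):
        for sample in self.sample_iter_factory():
            yield (
                torch.tensor(list(sample.features.values()), dtype=torch.float32),
                torch.tensor(list(sample.targets.values()), dtype=torch.float32),
            )

# loader = DataLoader(SampleDataset(iter_samples), batch_size=32)
```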
+
+ ---
+
+ ## Quick Start
+
+ ### Serve The Demo Plugin (Recommended)
+
+ ```bash
+ python -m pip install -U jerry-thomas
+ jerry demo init
+ python -m pip install -e demo
+ jerry serve --dataset demo --limit 3
+ ```
+
+ Note: `jerry demo init` creates a workspace `jerry.yaml`. If you later run
+ `jerry plugin init`, it won’t overwrite that file. Remove or edit
+ `jerry.yaml` (or pass `--project`) to point at your new plugin.
+ For example: `jerry serve --project lib/my-datapipeline/project.yaml`.
+
+ ### Create Your Own Plugin + First Ingest
+
+ ```bash
+ jerry plugin init my-datapipeline --out lib/
+ python -m pip install -e lib/my-datapipeline
+
+ # One-stop wizard: scaffolds source YAML + DTO/parser + domain + mapper + contract.
+ jerry inflow create
+
+ # Wizard tips (identity vs custom)
+ #
+ # - Parser:
+ #   - Choose "Identity parser" when the loader already yields dicts/objects that match your DTO shape,
+ #     and no type conversion is needed.
+ #   - Choose "Create new parser" when you need to parse timestamps, coerce types, rename fields,
+ #     or drop/validate rows.
+ #
+ # - Mapper:
+ #   - Choose "Identity mapper" only when your DTO is already the final domain record shape
+ #     (for example, you might have used jerry to output interim datasets):
+ #     `time` is timezone-aware and identity fields are present.
+ #   - Otherwise, choose "Create new mapper" to map DTO -> domain record and add light derived fields.
+ #
+ # After scaffolding, you typically still need to:
+ #   - Fill placeholders in `sources/*.yaml` (path/url/headers/etc.).
+ #   - Reference your stream contract id in `dataset.yaml` under `record_stream: <contract_id>` and pick a `field` for each feature.
+ #
+ # Reinstall after commands that update entry points (pyproject.toml).
+ python -m pip install -e lib/my-datapipeline
+
+ jerry serve --dataset your-dataset --limit 3
+ ```
+
+ ---
+
+ ## Pipeline Stages (serve --stage)
+
+ Stages 0-6 operate on a single stream at a time (per feature/target config). Stages 7-8 assemble full vectors across all configured features.
+
+ - Stage 0 (DTO stream)
+   - Input: raw source rows (loader transport + decoder)
+   - Ops: loader -> decoder -> parser (raw -> DTO; return None to drop rows)
+   - Output: DTO objects yielded by the parser
+
+ - Stage 1 (record stream)
+   - Input: DTO stream
+   - Ops: mapper (DTO -> domain TemporalRecord)
+   - Output: TemporalRecord instances (must have timezone-aware `time`)
+
+ - Stage 2 (record transforms)
+   - Input: TemporalRecord stream
+   - Ops: contract `record:` transforms (e.g. filter, floor_time); per-record only (no history)
+   - Output: TemporalRecord stream (possibly filtered/mutated)
+
+ - Stage 3 (ordered record stream)
+   - Input: TemporalRecord stream
+   - Ops: sort by `(partition_key, record.time)` (batch/in-memory sort; typically the expensive step)
+   - Output: TemporalRecord stream (sorted by partition, time)
+
+ - Stage 4 (stream transforms)
+   - Input: ordered TemporalRecord stream
+   - Ops:
+     - apply contract `stream:` transforms (per-partition history; e.g. ensure_cadence, rolling, fill)
+     - apply contract `debug:` transforms (validation only; e.g. lint)
+   - Output: TemporalRecord stream (sorted by partition, time)
+
+ - Stage 5 (feature stream)
+   - Input: TemporalRecord stream
+   - Ops: wrap each record as `FeatureRecord(id, record, value)`
+     - `id` is derived from dataset `id:` (base feature id) plus optional `partition_by:` fields (entity-specific feature ids)
+     - `value` is selected from `dataset.yaml` via `field: <record_attr>`
+   - Output: FeatureRecord stream (sorted by id, time within partitions)
+
+ - Stage 6 (feature transforms)
+   - Input: FeatureRecord stream (sorted by id, time)
+   - Ops: dataset-level feature transforms configured per feature (e.g. `scale`, `sequence`)
+   - Output: FeatureRecord or FeatureRecordSequence
+
+ - Stage 7 (vector assembly)
+   - Input: all features/targets after stage 6
+   - Ops:
+     - merge feature streams by time bucket (`group_by`)
+     - assemble `Vector` objects (feature_id -> value or sequence)
+     - assemble `Sample(key, features, targets)`
+     - if rectangular mode is on, align to the expected time window keys (missing buckets become empty vectors)
+   - Output: Sample stream (no postprocess, no split)
+
+ - Stage 8 (postprocess)
+   - Input: Sample stream
+   - Ops:
+     - ensure vector schema (fill missing configured feature ids, drop extras)
+     - apply project `postprocess.yaml` vector transforms
+   - Output: Sample stream (still not split)
+
+ Full run (no --stage)
+
+ - Runs stages 0-8, then applies the configured train/val/test split and optional throttling, then writes output.
+
+ Split timing (leakage note)
+
+ - Split is applied after stage 8 in `jerry serve` (postprocess runs before split).
+ - Feature engineering runs before split; keep it causal (no look-ahead, no future leakage).
+ - Scaler statistics are fit by the build task `scaler.yaml` and are typically restricted to the `train` split (configurable via `split_label`).
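The staged flow is easiest to picture as plain generators. A toy sketch of stages 0-3 (invented field names; this is not the package's internal code):

```python
# Toy stages 0-3: parse -> map -> record transform -> sort. Each stage is a
# generator over the previous one; only the stage-3 sort materializes records.
from datetime import datetime

raw_rows = [
    {"symbol": "MSFT", "ts": "2024-01-01T01:00:00+00:00", "close": 371.0},
    {"symbol": "MSFT", "ts": "2024-01-01T00:00:00+00:00", "close": 370.0},
]

def parse(rows):                       # stage 0: raw -> DTO (plain dicts here)
    for row in rows:
        yield row                      # a real parser may return None to drop

def to_records(dtos):                  # stage 1: DTO -> domain record
    for dto in dtos:
        yield {"symbol": dto["symbol"],
               "time": datetime.fromisoformat(dto["ts"]),  # timezone-aware
               "close": dto["close"]}

def record_transforms(records):        # stage 2: per-record, no history
    for rec in records:
        if rec["close"] is not None:   # e.g. a simple filter
            yield rec

def ordered(records):                  # stage 3: in-memory sort, the costly step
    yield from sorted(records, key=lambda r: (r["symbol"], r["time"]))

for rec in ordered(record_transforms(to_records(parse(raw_rows)))):
    print(rec["symbol"], rec["time"].isoformat(), rec["close"])
```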
+
+ ---
+
+ ## CLI Cheat Sheet
+
+ - `jerry demo init`: scaffolds a standalone demo plugin at `./demo/` and wires a `demo` dataset.
+ - `jerry plugin init <name> --out lib/`: scaffolds `lib/<name>/` (writes workspace `jerry.yaml` when missing).
+ - `jerry.yaml`: sets `plugin_root` for scaffolding commands and `datasets`/`default_dataset` so you can omit `--project`/`--dataset`.
+ - `jerry serve [--dataset <alias>|--project <path>] [--limit N] [--stage 0-8] [--skip-build]`: streams output; builds required artifacts unless `--skip-build`.
+ - `jerry build [--dataset <alias>|--project <path>] [--force]`: materializes artifacts (schema, scaler, etc.).
+ - `jerry inspect report|matrix|partitions [--dataset <alias>|--project <path>]`: quality and metadata helpers.
+ - `jerry inflow create`: interactive wizard to scaffold an end-to-end ingest stream (source + parser/DTO + mapper + contract).
+ - `jerry source create <provider>.<dataset> ...`: scaffolds a source YAML (no Python code).
+ - `jerry domain create <domain>`: scaffolds a domain record stub.
+ - `jerry dto create`, `jerry parser create`, `jerry mapper create`, `jerry loader create`: scaffold Python code + register entry points (reinstall after).
+ - `jerry contract create [--identity]`: interactive contract scaffolder (YAML); use for canonical streams or composed streams.
+ - `jerry list sources|domains|parsers|mappers|loaders|dtos`: introspection helpers.
+ - `pip install -e lib/<name>`: rerun after commands that update `lib/<name>/pyproject.toml` (entry points), or after manual edits to it.
+
+ ---
+
+ ## MLOps & Reproducibility
+
+ - `jerry build` materializes deterministic artifacts (schema, scaler, metadata).
+   Builds are keyed by config hashes and skip work when nothing changed unless
+   you pass `--force`.
+ - `jerry serve` runs are named (task/run) and can write outputs to
+   `<out-path>/<run_name>/` for auditing, sharing, or downstream training.
+ - Versioning: tag the project config + plugin code in Git and pair with a data
+   versioning tool like DVC for raw sources. With those inputs pinned, interim
+   datasets and artifacts can be regenerated instead of stored.
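The "keyed by config hashes" behavior follows a common caching pattern. A generic sketch (not the package's actual build code; `build_if_needed` and the stamp layout are made up):

```python
# Generic build-skipping sketch: hash the resolved config and reuse the
# artifact when the hash matches. The artifact naming here is invented.
import hashlib
import json
from pathlib import Path

def config_key(config: dict) -> str:
    blob = json.dumps(config, sort_keys=True).encode()
    return hashlib.sha256(blob).hexdigest()[:16]

def build_if_needed(config: dict, out_dir: Path, force: bool = False) -> Path:
    artifact = out_dir / f"artifact.{config_key(config)}.json"
    if artifact.exists() and not force:
        return artifact                      # config unchanged: skip the build
    out_dir.mkdir(parents=True, exist_ok=True)
    artifact.write_text(json.dumps(config))  # stand-in for real build output
    return artifact
```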
+
+ ---
+
+ ## Concepts
+
+ ### Workspace (`jerry.yaml`)
+
+ - `datasets`: dataset aliases → `project.yaml` paths (relative to `jerry.yaml`).
+ - `default_dataset`: which dataset `jerry serve/build/inspect` use when you omit `--dataset/--project`.
+ - `plugin_root`: where scaffolding commands write Python code (`src/<package>/...`) and where they look for `pyproject.toml`.
+
+ ### Plugin Package (Python Code)
+
+ These live under `lib/<plugin>/src/<package>/`:
+
+ - `dtos/*.py`: DTO models (raw source shapes).
+ - `parsers/*.py`: raw -> DTO parsers (referenced by source YAML via entry point).
+ - `domains/<domain>/model.py`: domain record models.
+ - `mappers/*.py`: DTO -> domain record mapping functions (referenced by contracts via entry point).
+ - `loaders/*.py`: optional custom loaders (fs/http usually use the built-in core loader).
+ - `pyproject.toml`: entry points for loaders/parsers/mappers/transforms (rerun `pip install -e lib/<plugin>` after changes).
+
+ ### Loaders & Parsers
+
+ - A **loader** yields raw rows (bytes/dicts) from some transport (FS/HTTP/synthetic/etc.).
+ - A **parser** turns each raw row into a typed DTO (or returns `None` to drop a row).
+ - In most projects, your source YAML uses the built-in loader `core.io` and you only customize its `args` (`transport`, `format`, and a `path`/`url`).
+ - You typically only implement a custom loader when you need specialized behavior (auth/pagination/rate limits, proprietary formats, or non-standard protocols).
+ - `parser.args` are optional and only used when your parser supports configuration; many parsers don’t need any args, since filtering and similar concerns are handled natively downstream.
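A minimal sketch of that parser contract (the DTO name and fields are invented, not from the package; real parsers are registered via entry points):

```python
# Sketch: a parser turns one raw row into a typed DTO, or returns None to
# drop it. TickDTO and its fields are illustrative only.
from dataclasses import dataclass
from datetime import datetime

@dataclass
class TickDTO:
    ts: datetime
    value: float

def parse_tick(row: dict) -> TickDTO | None:
    try:
        return TickDTO(ts=datetime.fromisoformat(row["ts"]),
                       value=float(row["value"]))
    except (KeyError, ValueError):
        return None  # malformed rows are dropped, as the contract allows
```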
+
+ ### DTOs & Domains
+
+ - A **DTO** (Data Transfer Object) mirrors a single source’s schema (columns/fields) and stays “raw-shaped”; it’s what parsers emit.
+ - A **domain record** is the canonical shape used across the pipeline. Mappers convert DTOs into domain records so multiple sources can land in the same domain model.
+ - The base time-series type is `TemporalRecord` (`time` + metadata fields). Domains add identity fields (e.g. `symbol`, `station_id`) that make filtering/partitioning meaningful.
+ - `time` must be timezone-aware (normalized to UTC); feature values are selected from record fields in `dataset.yaml` (see `field:`); remaining fields act as the record’s “identity” (used by equality/deduping and commonly by `partition_by`).
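Continuing the `TickDTO` sketch above, a mapper might land it in a domain record like this (the record shape is illustrative; the package's actual `TemporalRecord` base may differ in detail):

```python
# Sketch: DTO -> domain record. `time` is normalized to UTC; `symbol` is an
# identity field that partition_by and dedupe can use. EquityRecord is invented.
from dataclasses import dataclass
from datetime import datetime, timezone

@dataclass(frozen=True)
class EquityRecord:
    time: datetime   # timezone-aware, normalized to UTC
    symbol: str      # identity field
    close: float     # payload field, selectable via `field: close`

def map_tick_to_equity(dto: TickDTO, symbol: str) -> EquityRecord:
    return EquityRecord(time=dto.ts.astimezone(timezone.utc),
                        symbol=symbol,
                        close=dto.value)
```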
+
+ ### Transforms (Record → Stream → Feature → Vector)
+
+ - **Record transforms** run on raw canonical records before sorting or grouping (filters, time flooring, lagging). Each transform operates on one record at a time because order and partitions are not established yet. Configure in `contracts/*.yaml` under `record:`.
+ - **Stream transforms** run on ordered, per-stream records after record transforms (dedupe, cadence enforcement, rolling fills). These operate across a sequence of records for a partition because they depend on sorted partition/time order and cadence. Configure in `contracts/*.yaml` under `stream:`.
+ - **Feature transforms** run after stream regularization and shape the per-feature payload for vectorization (scalers, sequence/windowing). These occur after feature ids are finalized and payloads are wrapped. Configure in `dataset.yaml` under each feature.
+ - **Vector (postprocess) transforms** operate on assembled vectors (coverage/drop/fill/replace). Configure in `postprocess.yaml`.
+ - **Debug transforms** run after stream transforms for validation only. Configure in `contracts/*.yaml` under `debug:`.
+ - Custom transforms are registered in your plugin `pyproject.toml` under the matching entry-point group:
+   - `datapipeline.transforms.record`
+   - `datapipeline.transforms.stream`
+   - `datapipeline.transforms.feature`
+   - `datapipeline.transforms.vector`
+   - `datapipeline.transforms.debug`
+   Then reference them by name in the YAML.
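As a rough shape for such a plugin transform, a hedged sketch (it assumes dataclass records and an iterator-in/iterator-out callable; check the real protocol in `src/datapipeline/transforms/interfaces.py` before shipping one):

```python
# Sketch of a custom stream transform: ordered records in, records out.
# ClampTransform is invented for illustration; the package's actual
# transform interface may differ.
import dataclasses
from typing import Iterable, Iterator

class ClampTransform:
    """Clamp a numeric record field into [lo, hi]."""

    def __init__(self, field: str, lo: float, hi: float):
        self.field, self.lo, self.hi = field, lo, hi

    def __call__(self, records: Iterable) -> Iterator:
        for rec in records:
            clamped = min(max(getattr(rec, self.field), self.lo), self.hi)
            yield dataclasses.replace(rec, **{self.field: clamped})
```

It would then be registered under the matching group, e.g. `clamp = "my_pkg.transforms.clamp:ClampTransform"` in `[project.entry-points."datapipeline.transforms.stream"]`, and referenced as `clamp` in contract YAML.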
260
+
261
+ ### Glossary
262
+
263
+ - **Source alias**: `sources/*.yaml:id` (referenced by contracts under `source:`).
264
+ - **Stream id**: `contracts/*.yaml:id` (referenced by `dataset.yaml` under `record_stream:`).
265
+ - **Partition**: dimension keys appended to feature IDs, driven by `contract.partition_by`.
266
+ - **Group**: vector “bucket” cadence set by `dataset.group_by` (controls how records become samples).
267
+ - **Stage**: debug/preview level for `jerry serve --stage 0-8` (DTOs → domain records → features → vectors).
268
+ - **Fan-out**: when multiple features reference the same `record_stream`, the pipeline spools records to disk so each feature can read independently (records must be picklable).
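The fan-out spooling can be pictured with a generic pickle spool (an illustration of the idea only, not the package's `spool_cache` implementation):

```python
# Generic spool: write a one-shot iterator to disk once, then hand out
# independent sequential replays. Pickling is why records must be picklable.
import pickle
import tempfile

def spool(records):
    f = tempfile.TemporaryFile()
    for rec in records:
        pickle.dump(rec, f)

    def replay():
        f.seek(0)
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return

    return replay

replay = spool(iter([{"t": 1}, {"t": 2}]))
assert list(replay()) == list(replay())  # two full sequential passes
```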
+
+ ## Documentation
+
+ - `docs/config.md`: config layout, resolution order, and YAML reference.
+ - `docs/cli.md`: CLI reference (beyond the cheat sheet).
+ - `docs/transforms.md`: built-in transforms and filters.
+ - `docs/artifacts.md`: artifacts, postprocess, and split timing.
+ - `docs/python.md`: Python API usage patterns.
+ - `docs/extending.md`: entry points and writing plugins.
+ - `docs/architecture.md`: pipeline diagrams.
+
+ ## Development
+
+ See `CONTRIBUTING.md`.
@@ -0,0 +1,259 @@
(README.md: the added file's body is a verbatim copy of the package description embedded in PKG-INFO above.)
@@ -4,12 +4,13 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "jerry-thomas"
- version = "1.0.3"
+ version = "2.0.0"
  description = "Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)"
  readme = { file = "README.md", content-type = "text/markdown" }
  requires-python = ">=3.10"
  license = { text = "MIT" }
  authors = [{ name = "Anders Skott Lind" }]
+ urls = { "Homepage" = "https://github.com/mr-lovalova/datapipeline", "Repository" = "https://github.com/mr-lovalova/datapipeline", "Issues" = "https://github.com/mr-lovalova/datapipeline/issues" }
  dependencies = [
      "numpy>=1.24,<3.0",
      "pydantic>=2.0",
@@ -39,22 +40,27 @@ where = ["src"]
  datapipeline = [
      "templates/dataset_config.yaml",
      "templates/plugin_skeleton/**",
+     "templates/demo_skeleton/**",
      "templates/stubs/**"
  ]
 
  [project.entry-points."datapipeline.filters"]
 
  [project.entry-points."datapipeline.transforms.record"]
- lag = "datapipeline.transforms.record.lag:apply_lag"
- floor_time = "datapipeline.transforms.record.floor_time:floor_time"
+ lag = "datapipeline.transforms.record.lag:LagRecordTransform"
+ floor_time = "datapipeline.transforms.record.floor_time:FloorTimeRecordTransform"
  filter = "datapipeline.transforms.filter:filter"
 
  [project.entry-points."datapipeline.transforms.stream"]
- ensure_cadence = "datapipeline.transforms.stream.ensure_ticks:ensure_cadence"
+ floor_time = "datapipeline.transforms.stream.floor_time:FloorTimeTransform"
+ lag = "datapipeline.transforms.stream.lag:LagTransform"
+ ensure_cadence = "datapipeline.transforms.stream.ensure_ticks:EnsureCadenceTransform"
  fill = "datapipeline.transforms.stream.fill:FillTransformer"
  granularity = "datapipeline.transforms.stream.granularity:FeatureGranularityTransform"
  lint = "datapipeline.transforms.stream.lint:StreamLint"
  dedupe = "datapipeline.transforms.stream.dedupe:FeatureDeduplicateTransform"
+ rolling = "datapipeline.transforms.stream.rolling:RollingTransformer"
+ filter = "datapipeline.transforms.stream.filter:FilterTransform"
 
  [project.entry-points."datapipeline.transforms.feature"]
  scale = "datapipeline.transforms.feature.scaler:StandardScalerTransform"
@@ -1,4 +1,3 @@
- from __future__ import annotations
  from collections import Counter, defaultdict
  from typing import Any, Hashable, Iterable, Literal
  from datapipeline.transforms.vector_utils import base_id as _base_id
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  import hashlib
  from pathlib import Path
  from typing import Iterable
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  import json
  from collections import defaultdict
  from datetime import datetime, timezone
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from pathlib import Path
  from typing import Dict, Iterator, Tuple
 
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  import json
  from datetime import datetime, timezone
  from pathlib import Path
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from collections import Counter, OrderedDict
  from datetime import datetime
  from typing import Any