jerry-thomas 1.0.1__tar.gz → 1.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206) hide show
  1. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/PKG-INFO +290 -288
  2. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/README.md +289 -287
  3. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/pyproject.toml +2 -1
  4. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/app.py +9 -10
  5. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/contract.py +8 -2
  6. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/source.py +5 -0
  7. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/common.py +57 -5
  8. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/labels.py +8 -41
  9. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/sources_rich.py +8 -3
  10. jerry_thomas-1.0.3/src/datapipeline/cli/workspace_utils.py +25 -0
  11. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/dataset/dataset.py +1 -1
  12. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/dataset/normalize.py +9 -4
  13. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/workspace.py +15 -0
  14. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/source.py +2 -1
  15. jerry_thomas-1.0.3/src/datapipeline/sources/foreach.py +151 -0
  16. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/pyproject.toml +1 -1
  17. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +1 -1
  18. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/PKG-INFO +290 -288
  19. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/SOURCES.txt +2 -3
  20. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/entry_points.txt +1 -0
  21. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +0 -31
  22. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +0 -30
  23. jerry_thomas-1.0.1/src/datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +0 -12
  24. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/LICENSE +0 -0
  25. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/setup.cfg +0 -0
  26. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/__init__.py +0 -0
  27. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/analysis/__init__.py +0 -0
  28. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/analysis/vector/collector.py +0 -0
  29. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/analysis/vector/matrix.py +0 -0
  30. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/analysis/vector/report.py +0 -0
  31. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/analysis/vector_analyzer.py +0 -0
  32. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/__init__.py +0 -0
  33. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/state.py +0 -0
  34. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/__init__.py +0 -0
  35. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/config.py +0 -0
  36. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/metadata.py +0 -0
  37. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/scaler.py +0 -0
  38. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/schema.py +0 -0
  39. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/build/tasks/utils.py +0 -0
  40. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/build.py +0 -0
  41. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/domain.py +0 -0
  42. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/filter.py +0 -0
  43. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/inspect.py +0 -0
  44. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/list_.py +0 -0
  45. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/plugin.py +0 -0
  46. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/run.py +0 -0
  47. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/run_config.py +0 -0
  48. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/commands/serve_pipeline.py +0 -0
  49. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/__init__.py +0 -0
  50. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/runner.py +0 -0
  51. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/sections.py +0 -0
  52. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/sources.py +0 -0
  53. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/sources_basic.py +0 -0
  54. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/cli/visuals/sources_off.py +0 -0
  55. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/__init__.py +0 -0
  56. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/catalog.py +0 -0
  57. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/context.py +0 -0
  58. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/dataset/feature.py +0 -0
  59. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/dataset/loader.py +0 -0
  60. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/metadata.py +0 -0
  61. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/postprocess.py +0 -0
  62. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/project.py +0 -0
  63. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/resolution.py +0 -0
  64. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/split.py +0 -0
  65. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/config/tasks.py +0 -0
  66. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/domain/__init__.py +0 -0
  67. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/domain/feature.py +0 -0
  68. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/domain/record.py +0 -0
  69. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/domain/sample.py +0 -0
  70. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/domain/vector.py +0 -0
  71. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/filters/filters.py +0 -0
  72. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/__init__.py +0 -0
  73. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/ml/__init__.py +0 -0
  74. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/ml/adapter.py +0 -0
  75. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/ml/pandas_support.py +0 -0
  76. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/ml/rows.py +0 -0
  77. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/integrations/ml/torch_support.py +0 -0
  78. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/factory.py +0 -0
  79. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/output.py +0 -0
  80. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/protocols.py +0 -0
  81. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/serializers.py +0 -0
  82. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/sinks/__init__.py +0 -0
  83. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/sinks/base.py +0 -0
  84. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/sinks/files.py +0 -0
  85. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/sinks/rich.py +0 -0
  86. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/sinks/stdout.py +0 -0
  87. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/writers/__init__.py +0 -0
  88. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/writers/base.py +0 -0
  89. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/writers/csv_writer.py +0 -0
  90. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/writers/jsonl.py +0 -0
  91. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/io/writers/pickle_writer.py +0 -0
  92. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/mappers/noop.py +0 -0
  93. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/mappers/synthetic/time.py +0 -0
  94. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/parsers/identity.py +0 -0
  95. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/__init__.py +0 -0
  96. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/artifacts.py +0 -0
  97. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/context.py +0 -0
  98. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/observability.py +0 -0
  99. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/pipelines.py +0 -0
  100. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/split.py +0 -0
  101. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/stages.py +0 -0
  102. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/utils/keygen.py +0 -0
  103. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/utils/memory_sort.py +0 -0
  104. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/utils/ordering.py +0 -0
  105. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/pipeline/utils/transform_utils.py +0 -0
  106. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/plugins.py +0 -0
  107. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/registries/registry.py +0 -0
  108. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/runtime.py +0 -0
  109. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/artifacts.py +0 -0
  110. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/bootstrap/__init__.py +0 -0
  111. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/bootstrap/config.py +0 -0
  112. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/bootstrap/core.py +0 -0
  113. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/constants.py +0 -0
  114. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/entrypoints.py +0 -0
  115. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/factories.py +0 -0
  116. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/paths.py +0 -0
  117. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/project_paths.py +0 -0
  118. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/runs.py +0 -0
  119. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/__init__.py +0 -0
  120. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/domain.py +0 -0
  121. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/filter.py +0 -0
  122. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/mappers.py +0 -0
  123. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/plugin.py +0 -0
  124. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/services/scaffold/templates.py +0 -0
  125. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/__init__.py +0 -0
  126. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/data_loader.py +0 -0
  127. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/decoders.py +0 -0
  128. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/factory.py +0 -0
  129. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/__init__.py +0 -0
  130. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/base.py +0 -0
  131. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/generator.py +0 -0
  132. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/loader.py +0 -0
  133. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/parser.py +0 -0
  134. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/parsing_error.py +0 -0
  135. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/source.py +0 -0
  136. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/models/synthetic.py +0 -0
  137. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/synthetic/__init__.py +0 -0
  138. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/synthetic/time/__init__.py +0 -0
  139. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/synthetic/time/loader.py +0 -0
  140. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/synthetic/time/parser.py +0 -0
  141. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/sources/transports.py +0 -0
  142. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/README.md +0 -0
  143. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -0
  144. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -0
  145. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -0
  146. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -0
  147. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/project.yaml +0 -0
  148. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +0 -0
  149. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -0
  150. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -0
  151. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -0
  152. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -0
  153. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -0
  154. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -0
  155. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/jerry.yaml +0 -0
  156. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  157. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +0 -0
  158. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +0 -0
  159. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +0 -0
  160. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +0 -0
  161. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +0 -0
  162. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +0 -0
  163. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +0 -0
  164. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +0 -0
  165. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/dto.py.j2 +0 -0
  166. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/filter.py.j2 +0 -0
  167. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/loader_synthetic.py.j2 +0 -0
  168. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/mapper.py.j2 +0 -0
  169. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/parser.py.j2 +0 -0
  170. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/parser_custom.py.j2 +0 -0
  171. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/record.py.j2 +0 -0
  172. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/templates/stubs/source.yaml.j2 +0 -0
  173. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/debug/identity.py +0 -0
  174. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/debug/lint.py +0 -0
  175. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/feature/model.py +0 -0
  176. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/feature/scaler.py +0 -0
  177. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/filter.py +0 -0
  178. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/record/floor_time.py +0 -0
  179. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/record/lag.py +0 -0
  180. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/sequence.py +0 -0
  181. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/stream/dedupe.py +0 -0
  182. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/stream/ensure_ticks.py +0 -0
  183. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/stream/fill.py +0 -0
  184. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/stream/granularity.py +0 -0
  185. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/utils.py +0 -0
  186. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/__init__.py +0 -0
  187. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/common.py +0 -0
  188. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/drop/__init__.py +0 -0
  189. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/drop/horizontal.py +0 -0
  190. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/drop/orchestrator.py +0 -0
  191. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/drop/vertical.py +0 -0
  192. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/ensure_schema.py +0 -0
  193. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/fill.py +0 -0
  194. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector/replace.py +0 -0
  195. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/transforms/vector_utils.py +0 -0
  196. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/__init__.py +0 -0
  197. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/load.py +0 -0
  198. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/paths.py +0 -0
  199. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/pickle_model.py +0 -0
  200. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/placeholders.py +0 -0
  201. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/rich_compat.py +0 -0
  202. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/time.py +0 -0
  203. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/datapipeline/utils/window.py +0 -0
  204. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/dependency_links.txt +0 -0
  205. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/requires.txt +0 -0
  206. {jerry_thomas-1.0.1 → jerry_thomas-1.0.3}/src/jerry_thomas.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: jerry-thomas
3
- Version: 1.0.1
3
+ Version: 1.0.3
4
4
  Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
5
5
  Author: Anders Skott Lind
6
6
  License: MIT
@@ -49,263 +49,112 @@ transforms, and filters.
49
49
 
50
50
  ## Quick Start
51
51
 
52
+ ### Serve The Example
53
+
52
54
  ```bash
53
- # 1. Install in editable mode (with optional dev extras for testing).
54
- pip install -e .[dev]
55
+ pip install jerry-thomas
56
+ jerry plugin init my-datapipeline --out lib/
57
+ jerry serve --limit 3
58
+ ```
55
59
 
56
- # 2. Bootstrap a project (scaffolds configs, plugin package, and templates).
57
- jerry plugin init my_datapipeline --out .
60
+ ### Create Your Own Stream
58
61
 
59
- # 3. Create a source & domain scaffold, then declare a canonical stream.
60
- # Simple forms
61
- jerry source add demo weather --transport fs --format csv
62
- jerry source add demo.weather --transport http --format json
62
+ Assumes you already ran `jerry plugin init ...` in this workspace (it writes `jerry.yaml`, which the CLI uses for defaults and scaffolding paths).
63
+ These scaffolding commands write YAML into the dataset selected by `default_dataset` in `jerry.yaml` (`example` by default).
63
64
 
64
- # Flag form (explicit)
65
- jerry source add --provider demo --dataset weather --transport fs --format csv
65
+ ```bash
66
+ jerry source add demo weather -t fs -f csv
66
67
  jerry domain add weather
67
- # (edit config/contracts/<alias>.yaml to point at your mapper and policies)
68
+ jerry contract
69
+ pip install -e lib/my-datapipeline
70
+ ```
68
71
 
69
- # 4. Configure dataset/postprocess/build files under config/.
70
- # Then preview the pipeline and serve a few vectors:
71
- # Add --skip-build when you only need a quick feature peek.
72
- jerry serve --project config/project.yaml --stage 2 --limit 5
73
- jerry serve --project config/project.yaml --limit 3
72
+ ---
74
73
 
75
- # 5. Inspect coverage and build artifacts:
76
- jerry inspect report --project config/project.yaml
77
- jerry build --project config/project.yaml
78
- ```
74
+ ## CLI Cheat Sheet
79
75
 
80
- The skeleton project in `src/datapipeline/templates/plugin_skeleton/` mirrors the
81
- paths expected by the CLI. Copy it or run `jerry plugin init` to get a ready-made
82
- layout with `config/`, `src/<package>/`, and entry-point stubs.
76
+ - `jerry plugin init <name> --out lib/`: scaffolds `lib/<name>/` and writes workspace `jerry.yaml`.
77
+ - `jerry.yaml` (created by `plugin init`): sets `plugin_root` for scaffolding commands and `datasets/default_dataset` so you can omit `--project`/`--dataset`.
78
+ - `jerry serve [--dataset <alias>|--project <path>] [--limit N] [--stage 0-7] [--skip-build]`: streams output; builds required artifacts unless `--skip-build`.
79
+ - `jerry build [--dataset <alias>|--project <path>] [--force]`: materializes artifacts (schema, scaler, expected IDs, etc.).
80
+ - `jerry inspect report|matrix|partitions|expected [--dataset <alias>|--project <path>]`: quality and metadata helpers.
81
+ - `jerry source add <provider> <dataset> -t fs|http|synthetic -f csv|json|json-lines|pickle [--identity]`: scaffolds a source YAML and (unless `--identity`) a parser + entry point.
82
+ - `jerry domain add <domain>`: scaffolds domain models under `src/<package>/domains/<domain>/`.
83
+ - `jerry contract [--identity]`: interactive contract scaffolder; most users pick `[1] Ingest (source → stream)` (use `[2] Composed` for derived streams, e.g. air_density from temp + pressure).
84
+ - `pip install -e lib/<name>`: rerun after commands that update `lib/<name>/pyproject.toml` (entry points), or after manual edits to it.
83
85
 
84
86
  ---
85
87
 
86
- ## Pipeline Architecture
88
+ ## Concepts
87
89
 
88
- ```text
89
- raw source ──▶ loader/parser DTOs ──▶ canonical stream ──▶ record policies
90
- └──▶ feature wrapping ──▶ stream regularization ──▶ feature transforms/sequence
91
- └──▶ vector assembly ──▶ postprocess transforms
92
- ```
90
+ ### Workspace (`jerry.yaml`)
93
91
 
94
- 1. **Loader/parser (Stage 0)** – raw bytes become typed DTOs. Loaders fetch from
95
- FS/HTTP/synthetic sources; parsers map bytes to DTOs. Register them via entry
96
- points (`loaders`, `parsers`) and wire them in `config/sources/*.yaml`.
97
- 2. **Canonical stream mapping (Stage 1)** – mappers attach domain semantics and
98
- partition keys, producing domain `TemporalRecord`s.
99
- 3. **Record policies (Stage 2)** – contract `record` rules (filters, floor, lag)
100
- prune and normalize DTO-derived records.
101
- 4. **Feature wrapping (Stage 3)** – records become `FeatureRecord`s before
102
- sort/regularization.
103
- 5. **Stream regularization (Stage 4)** – contract `stream` rules ensure cadence,
104
- deduplicate timestamps, and impute where needed.
105
- 6. **Feature transforms/sequence (Stage 5)** – dataset transforms (scale,
106
- sequence windows) produce per-feature tensors or windows.
107
- 7. **Vector assembly (Stage 6)** – features merge by `group_by` cadence into
108
- `(group_key, Vector)` pairs, prior to postprocess tweaks.
109
- 8. **Postprocess (Stage 7)** – optional vector transforms (fill/drop/etc.) run
110
- before results are emitted to the configured output.
92
+ - `datasets`: maps dataset aliases to `project.yaml` paths (relative to `jerry.yaml`).
93
+ - `default_dataset`: which dataset `jerry serve/build/inspect` use when you omit `--dataset/--project`.
94
+ - `plugin_root`: where scaffolding commands write Python code (`src/<package>/...`) and where they look for `pyproject.toml`.
111
95
 
112
- #### Visual Flowchart
96
+ ### Plugin Package (Python Code)
113
97
 
114
- ```mermaid
115
- flowchart TB
116
- subgraph CLI & Project config
117
- cliSource[jerry source add]
118
- cliDomain[jerry domain add]
119
- cliContract[jerry contract]
120
- cliServe[jerry serve]
121
- project[[project.yaml]]
122
- sourcesCfg[config/sources/*.yaml]
123
- contractsCfg[config/contracts/*.yaml]
124
- datasetCfg[dataset.yaml]
125
- postprocessCfg[postprocess.yaml]
126
- end
98
+ These live under `lib/<plugin>/src/<package>/`:
127
99
 
128
- cliSource --> sourcesCfg
129
- cliDomain --> domainPkg
130
- cliContract --> contractsCfg
131
- cliServe --> vectorSamples
132
- project -.->|paths.sources| sourcesCfg
133
- project -.->|paths.streams| contractsCfg
134
- project -.->|paths.dataset| datasetCfg
135
- project -.->|paths.postprocess| postprocessCfg
100
+ - `sources/<provider>/<dataset>/dto.py` + `parser.py`: source DTO + parser (created by `jerry source add` unless `--identity`).
101
+ - `domains/<domain>/model.py`: domain records (created by `jerry domain add`).
102
+ - `mappers/<provider>/<dataset>/to_<domain>.py`: DTO → domain record mapping (usually created by `jerry contract`).
103
+ - `pyproject.toml`: entry points for loaders/parsers/mappers/transforms (rerun `pip install -e lib/<plugin>` after changes).
136
104
 
137
- subgraph Plugin code
138
- domainPkg[domains/*]
139
- mappersPkg[mappers/*]
140
- end
105
+ ### Loaders & Parsers
141
106
 
142
- cliContract --> mappersPkg
143
- domainPkg -. domain models .-> mappersPkg
107
+ - A **loader** yields raw rows (bytes/dicts) from some transport (FS/HTTP/synthetic/etc.).
108
+ - A **parser** turns each raw row into a typed DTO (or returns `None` to drop a row).
109
+ - In most projects, your source YAML uses the built-in loader `core.io` and you only customize its `args` (`transport`, `format`, and a `path`/`url`).
110
+ - You typically only implement a custom loader when you need specialized behavior (auth/pagination/rate limits, proprietary formats, or non-standard protocols).
111
+ - `parser.args` are optional and only used when your parser supports configuration; many parsers don’t need any args, since filtering and similar operations are supported natively downstream.
144
112
 
145
- subgraph Registries
146
- registrySources[sources]
147
- registryStreamSources[stream_sources]
148
- registryMappers[mappers]
149
- registryRecordOps[record_ops]
150
- registryStreamOps[stream_ops]
151
- registryDebugOps[debug_ops]
152
- end
113
+ ### DTOs & Domains
153
114
 
154
- subgraph Source wiring
155
- rawData[(external data)]
156
- transportSpec[transport + format]
157
- loaderEP[loader ep]
158
- parserEP[parser ep]
159
- sourceArgs[loader args]
160
- sourceNode[Source]
161
- dtoStream[(DTOs)]
162
- end
115
+ - A **DTO** (Data Transfer Object) mirrors a single source’s schema (columns/fields) and stays “raw-shaped”; it’s what parsers emit.
116
+ - A **domain record** is the canonical shape used across the pipeline. Mappers convert DTOs into domain records so multiple sources can land in the same domain model.
117
+ - The base time-series type is `TemporalRecord` (`time` + `value`). Domains typically add identity fields (e.g. `symbol`, `station_id`) that make filtering/partitioning meaningful.
118
+ - `time` must be timezone-aware (normalized to UTC); `value` is the measurement you engineer features from; all other fields act as the record’s “identity” (used by equality/deduping and commonly by `partition_by`).
163
119
 
164
- sourcesCfg --> transportSpec
165
- sourcesCfg --> loaderEP
166
- sourcesCfg --> parserEP
167
- sourcesCfg --> sourceArgs
168
- transportSpec -. select fs/http/synth .-> loaderEP
169
- loaderEP -. build loader .-> sourceNode
170
- parserEP -. build parser .-> sourceNode
171
- sourceArgs -. paths/creds .-> sourceNode
172
- rawData --> sourceNode --> dtoStream
173
- sourcesCfg -. build_source_from_spec .-> registrySources
174
- contractsCfg -. stream_id + source .-> registryStreamSources
175
- registrySources -. alias -> Source .-> registryStreamSources
120
+ ### Glossary
176
121
 
177
- subgraph Canonical stream
178
- mapperEP[mapper ep]
179
- recordRules[record rules]
180
- streamRules[stream rules]
181
- debugRules[debug rules]
182
- canonical[DTO -> record]
183
- domainRecords((TemporalRecord))
184
- recordStage[record xforms]
185
- featureWrap[record -> feature]
186
- featureRecords((FeatureRecord))
187
- regularization[stream xforms]
188
- end
122
+ - **Source alias**: `sources/*.yaml:id` (referenced by contracts under `source:`).
123
+ - **Stream id**: `contracts/*.yaml:id` (referenced by `dataset.yaml` under `record_stream:`).
124
+ - **Partition**: dimension keys appended to feature IDs, driven by `contract.partition_by`.
125
+ - **Group**: vector “bucket” cadence set by `dataset.group_by` (controls how records become samples).
126
+ - **Stage**: debug/preview level for `jerry serve --stage 0-7` (DTOs → domain records → features → vectors).
189
127
 
190
- dtoStream --> canonical --> domainRecords --> recordStage --> featureWrap --> featureRecords --> regularization
191
- contractsCfg --> mapperEP
192
- mappersPkg -. ep target .-> mapperEP
193
- mapperEP -. build_mapper_from_spec .-> registryMappers
194
- registryMappers --> canonical
195
- contractsCfg --> recordRules
196
- contractsCfg --> streamRules
197
- contractsCfg --> debugRules
198
- registryRecordOps --> recordRules
199
- registryStreamOps --> streamRules
200
- registryDebugOps --> debugRules
201
- recordRules --> recordStage
202
- streamRules --> regularization
203
- debugRules --> regularization
128
+ ### Dataset Project (YAML Config)
204
129
 
205
- subgraph Dataset shaping
206
- featureSpec[feature cfg]
207
- groupBySpec[group_by]
208
- streamRefs[record_stream ids]
209
- featureTrans[feature/seq xforms]
210
- sequenceStream((seq/features))
211
- vectorStage[vector assembly]
212
- vectorSamples((samples))
213
- end
130
+ These live under the dataset “project root” directory (the folder containing `project.yaml`):
214
131
 
215
- datasetCfg --> featureSpec
216
- datasetCfg --> groupBySpec
217
- datasetCfg --> streamRefs
218
- streamRefs -.->|build_feature_pipeline| registryStreamSources
219
- registryStreamSources -.->|open_source_stream| sourceNode
220
- featureRecords --> regularization --> featureTrans --> sequenceStream --> vectorStage --> vectorSamples
221
- featureSpec -. scale/sequence .-> featureTrans
222
- groupBySpec -. cadence .-> vectorStage
132
+ - `project.yaml`: paths + globals (single source of truth).
133
+ - `sources/*.yaml`: raw sources (loader + parser wiring).
134
+ - `contracts/*.yaml`: canonical streams (ingest or composed).
135
+ - `dataset.yaml`: feature/target declarations.
136
+ - `postprocess.yaml`: vector-level transforms.
137
+ - `tasks/*.yaml`: serve presets and artifact task configs.
223
138
 
224
- subgraph Postprocess
225
- vectorTransforms[vector xforms]
226
- postprocessNode[postprocess]
227
- end
139
+ ### Configuration & Resolution Order
228
140
 
229
- postprocessCfg --> vectorTransforms -. drop/fill .-> postprocessNode
230
- vectorStage --> postprocessNode
231
- ```
141
+ Defaults are layered so you can set global preferences once, keep dataset/run
142
+ files focused on per-project behavior, and still override anything from the CLI.
143
+ For both `jerry serve` and `jerry build`, options are merged in the following
144
+ order (highest precedence first):
232
145
 
233
- style cliSource width:120px
234
- style cliDomain width:120px
235
- style cliContract width:120px
236
- style cliServe width:120px
237
- style sourcesCfg width:200px
238
- style contractsCfg width:200px
239
- style datasetCfg width:180px
240
- style postprocessCfg width:200px
241
- style registrySources width:160px
242
- style registryStreamSources width:180px
243
- style registryMappers width:160px
244
- style registryRecordOps width:180px
245
- style registryStreamOps width:180px
246
- style registryDebugOps width:180px
247
- style transportSpec width:180px
248
- style loaderEP width:140px
249
- style parserEP width:140px
250
- style sourceArgs width:160px
251
- style canonical width:180px
252
- style featureTrans width:180px
253
- style domainRecords width:140px
254
- style featureRecords width:140px
255
- style sequenceStream width:180px
256
- style vectorStage width:160px
257
- style vectorSamples width:180px
258
- style recordRules width:160px
259
- style streamRules width:160px
260
- style debugRules width:160px
261
- style domainPkg width:160px
262
- style mappersPkg width:160px
263
-
264
- Solid arrows trace runtime data flow; dashed edges highlight how the config files
265
- inject transports, entry points, or policies into each stage.
266
-
267
- CLI quick path:
268
- - `jerry source add <provider> <dataset> --transport fs|http|synthetic --format ...` → scaffolds DTO/parser/loader and writes `config/sources/*.yaml`.
269
- - `jerry domain add <name>` → creates `src/<pkg>/domains/<name>/model.py`.
270
- - `jerry contract` → picks a source + domain, scaffolds/links a mapper under `mappers/`, registers its entry point, and writes `config/contracts/<stream>.yaml`.
271
- - `jerry serve --project <project.yaml>` → builds/streams vectors using dataset `record_stream` IDs, registry wiring, and postprocess rules.
272
-
273
- `config/sources/*.yaml` determines both the transport and parsing strategy:
274
- you define transport (`fs`, `http`, `synthetic`, etc.), the payload format
275
- (`csv`, `json`, ...), and the loader/parser entry points. Loader `args`
276
- typically include file paths, bucket prefixes, or credential references—the
277
- runtime feeds those arguments into the instantiated loader so it knows exactly
278
- which external data store to read. Contracts bind each canonical stream to a
279
- `source` alias (connecting back to the loader/parser pair) and register a
280
- stream ID; they also specify mapper entry points, record/stream rules,
281
- partitioning, and batch sizes. Dataset features reference those canonical
282
- stream IDs via `record_stream`, so each feature config reuses the registered
283
- stream (and, by extension, the raw source) when you call
284
- `build_feature_pipeline()` (`src/datapipeline/pipeline/pipelines.py`). Finally,
285
- `postprocess.yaml` decorates the vector stream with additional filters/fills so
286
- serve/build outputs inherit the full set of policies. When you run the CLI,
287
- `bootstrap()` (`src/datapipeline/services/bootstrap/core.py`) loads each
288
- directory declared in `project.yaml`, instantiates loaders/parsers via
289
- `build_source_from_spec()` and `load_ep()`, attaches contract registries, and
290
- hands a fully wired `Runtime` to the pipeline stages in
291
- `src/datapipeline/pipeline/stages.py`.
292
-
293
- Every `record_stream` identifier ultimately resolves to the stream entry revived
294
- by the contract bootstrap step, so requesting stage outputs for a feature always
295
- walks the entire chain from dataset config → canonical contract → source
296
- definition. That is why `build_feature_pipeline()` starts by calling
297
- `open_source_stream(context, record_stream_id)` before stepping through record
298
- policies, stream policies, and feature transforms.
299
-
300
- The runtime (`src/datapipeline/runtime.py`) hosts registries for sources,
301
- transforms, artifacts, and postprocess rules. The CLI constructs lightweight
302
- `PipelineContext` objects to build iterators without mutating global state.
146
+ 1. **CLI flags** – anything you pass on the command line always wins.
147
+ 2. **Project task files** – `kind: serve` specs (under `project.paths.tasks`)
148
+ supply serve defaults; artifact tasks in the same directory drive `jerry build`.
149
+ 3. **`jerry.yaml` command blocks** – settings under the top-level `serve:` and `build:` keys of `jerry.yaml`.
150
+ 4. **`jerry.yaml` `shared` block** – shared fallbacks for visuals, progress, and log-level settings.
151
+ 5. **Built-in defaults** – runtime hard-coded defaults.
303
152
 
304
153
  ---
305
154
 
306
- ## Configuration Files
155
+ ## YAML Config Reference
307
156
 
308
- All project configuration for a dataset lives under a single project root directory (for example `config/`), which contains `project.yaml` and its siblings.
157
+ All dataset configuration is rooted at a single `project.yaml` file. Other YAML files are discovered via `project.paths.*` (relative to `project.yaml` unless absolute).
309
158
 
310
159
  ### `project.yaml`
311
160
 
@@ -317,7 +166,7 @@ paths:
317
166
  sources: ./sources
318
167
  dataset: dataset.yaml
319
168
  postprocess: postprocess.yaml
320
- artifacts: ../build/datasets/${project_name}
169
+ artifacts: ../artifacts/${project_name}/v${version}
321
170
  tasks: ./tasks
322
171
  globals:
323
172
  start_time: 2021-01-01T00:00:00Z
@@ -344,13 +193,13 @@ globals:
344
193
 
345
194
  ```yaml
346
195
  kind: serve
347
- name: train # defaults to filename stem when omitted
348
- keep: train # select active split label (null disables filtering)
196
+ name: train # defaults to filename stem when omitted
197
+ keep: train # select active split label (null disables filtering)
349
198
  output:
350
- transport: stdout # stdout | fs
351
- format: print # print | json-lines | json | csv | pickle
352
- limit: 100 # cap vectors per serve run (null = unlimited)
353
- throttle_ms: null # milliseconds to sleep between emitted vectors
199
+ transport: stdout # stdout | fs
200
+ format: print # print | json-lines | json | csv | pickle
201
+ limit: 100 # cap vectors per serve run (null = unlimited)
202
+ throttle_ms: null # milliseconds to sleep between emitted vectors
354
203
  # Optional overrides:
355
204
  # log_level: INFO # DEBUG=progress bars, INFO=spinner, WARNING=quiet
356
205
  # visuals: AUTO # AUTO | TQDM | RICH | OFF
@@ -358,7 +207,7 @@ throttle_ms: null # milliseconds to sleep between emitted vectors
358
207
  ```
359
208
 
360
209
  - Each serve task lives alongside artifact tasks under `paths.tasks`. Files are independent—no special directory structure required.
361
- - `output`, `limit`, `throttle_ms`, and `log_level` provide defaults for `jerry serve`; CLI flags still win per invocation (see *Configuration Resolution Order*). For filesystem outputs, set `transport: fs`, `directory: /path/to/root`, and omit file names—each run automatically writes to `<directory>/<run_name>/<run_name>.<ext>` unless you override the entire `output` block with a custom `filename`.
210
+ - `output`, `limit`, `throttle_ms`, and `log_level` provide defaults for `jerry serve`; CLI flags still win per invocation (see _Configuration & Resolution Order_). For filesystem outputs, set `transport: fs`, `directory: /path/to/root`, and omit file names—each run automatically writes to `<directory>/<run_name>/<run_name>.<ext>` unless you override the entire `output` block with a custom `filename`.
362
211
  - Override `keep` (and other fields) per invocation via `jerry serve ... --keep val` etc.
363
212
  - Visuals backend: set `visuals: AUTO|TQDM|RICH|OFF` in the task or use `--visuals`. Pair with `progress: AUTO|SPINNER|BARS|OFF` or `--progress` to control progress layouts.
364
213
  - Add additional `kind: serve` files to the tasks directory for other splits (val/test/etc.); `jerry serve` runs each enabled file unless you pass `--run <name>`.
@@ -369,81 +218,87 @@ throttle_ms: null # milliseconds to sleep between emitted vectors
369
218
  Create an optional `jerry.yaml` in the directory where you run the CLI to share settings across commands. The CLI walks up from the current working directory to find the first `jerry.yaml`.
370
219
 
371
220
  ```yaml
372
- plugin_root: lib/power_plugin # optional repo path for scaffolding (relative to this file)
373
- config_root: configs/default # directory containing project.yaml (relative paths ok)
221
+ plugin_root: lib/my-datapipeline # plugin workspace (relative to this file)
222
+
223
+ # Dataset aliases for --dataset; a value may be a project.yaml path or a directory (project.yaml is appended automatically).
224
+ datasets:
225
+ example: lib/my-datapipeline/example/project.yaml
226
+ default_dataset: example
374
227
 
375
228
  shared:
376
- visuals: rich # default visual renderer (auto|tqdm|rich|off)
377
- progress: bars # spinner|bars|auto|off
229
+ visuals: AUTO # AUTO | TQDM | RICH | OFF
230
+ progress: BARS # AUTO | SPINNER | BARS | OFF
231
+ log_level: INFO
378
232
 
379
233
  serve:
380
- log_level: INFO
234
+ limit: null
235
+ stage: null
381
236
  output:
382
237
  transport: stdout
383
- format: print
238
+ format: print # print | json-lines | json | csv | pickle
384
239
  # directory: artifacts/serve # Required when transport=fs
385
240
 
386
241
  build:
387
- log_level: INFO
388
242
  mode: AUTO # AUTO | FORCE | OFF
389
243
  ```
390
244
 
391
245
  `jerry.yaml` sits near the root of your workspace, while dataset-specific overrides still live in individual `tasks/serve.*.yaml` files as needed.
392
246
 
393
- ### Configuration Resolution Order
247
+ ### `<project_root>/sources/<alias>.yaml`
394
248
 
395
- Defaults are layered so you can set global preferences once, keep dataset/run
396
- files focused on per-project behavior, and still override anything from the CLI.
397
- For both `jerry serve` and `jerry build`, options are merged in the following
398
- order (highest precedence first):
249
+ Each file defines a loader/parser pair exposed under `<alias>`. Files may live in nested
250
+ subdirectories under `<project_root>/sources/`; discovery is recursive.
399
251
 
400
- 1. **CLI flags** – anything you pass on the command line always wins, even if a
401
- value is already specified elsewhere.
402
- 2. **Project task files** – `kind: serve` specs (under `project.paths.tasks`)
403
- supply serve defaults; artifact tasks in the same directory drive `jerry build`.
404
- These only apply to the dataset that owns the config directory.
405
- 3. **`jerry.yaml` command blocks** – settings under `jerry.serve` and
406
- `jerry.build` provide workspace-wide defaults for their respective commands.
407
- 4. **`jerry.yaml.shared`** shared fallbacks for visuals/progress/log-level
408
- style settings that apply to every command when a more specific value is not
409
- defined.
410
- 5. **Built-in defaults** – the runtime’s hard-coded values used when nothing else
411
- sets an option.
252
+ ```yaml
253
+ # Source identifier (commonly `provider.dataset`). Contracts reference this under `source:`.
254
+ id: stooq.ohlcv
255
+ parser:
256
+ # Parser entry point name (registered in your plugin’s pyproject.toml).
257
+ entrypoint: stooq.ohlcv
258
+ loader:
259
+ # Most common loader: core.io (supports fs/http via args.transport + args.format).
260
+ entrypoint: core.io
261
+ args:
262
+ transport: http
263
+ format: csv
264
+ url: "https://stooq.com/q/d/l/?s=aapl.us&i=d"
265
+ ```
412
266
 
413
- This hierarchy lets you push opinionated defaults up to the workspace (so every
414
- project or dataset behaves consistently) while still giving each dataset and
415
- every CLI invocation the ability to tighten or override behaviors.
267
+ - `id`: the source alias; referenced by contracts under `source:`.
268
+ - `parser.entrypoint`: which parser to use; `parser.args` are optional.
269
+ - `loader.entrypoint`: which loader to use; `core.io` is the default for fs/http and is configured via `loader.args`.
416
270
 
417
- ### `config/sources/<alias>.yaml`
271
+ #### Fan-out Sources (`core.foreach`)
418
272
 
419
- Each file defines a loader/parser pair exposed under `<alias>` (also the
420
- `id` the rest of the pipeline references). Files may live in nested
421
- subdirectories under `config/sources/`; discovery is recursive.
273
+ Use `core.foreach` to expand any inner loader spec across a list of values without duplicating YAML. It interpolates `${key}` placeholders in string args with each foreach value and, when `inject_field` is set, injects that value into each row.
422
274
 
423
275
  ```yaml
424
- id: demo_weather
425
- parser:
426
- entrypoint: demo.weather_parser
427
- args:
428
- timezone: UTC
429
276
  loader:
430
- entrypoint: demo.csv_loader
277
+ entrypoint: core.foreach
431
278
  args:
432
- path: data/weather.csv
279
+ foreach:
280
+ symbol: [AAPL, MSFT]
281
+ inject_field: symbol
282
+ loader:
283
+ entrypoint: core.io
284
+ args:
285
+ transport: http
286
+ format: csv
287
+ url: "https://stooq.com/q/d/l/?s=${symbol}&i=d"
433
288
  ```
434
289
 
435
- ### `config/contracts/<alias>.yaml`
290
+ ### `<project_root>/contracts/<stream_id>.yaml`
436
291
 
437
- Canonical stream contracts describe how the runtime should map and prepare a
438
- source. Use folders to organize by domain.
292
+ Canonical stream contracts describe how the runtime should map and prepare a raw
293
+ source. Use folders to organize by domain if you like.
439
294
 
440
295
  ```yaml
441
296
  kind: ingest
442
- id: demo_weather
443
- source: demo_weather
297
+ id: equity.ohlcv # stream identifier (domain.dataset[.variant])
298
+ source: stooq.ohlcv # references sources/<alias>.yaml:id
444
299
 
445
300
  mapper:
446
- entrypoint: weather.domain.mapper
301
+ entrypoint: equity.ohlcv
447
302
  args: {}
448
303
 
449
304
  partition_by: station
@@ -476,7 +331,7 @@ debug:
476
331
  Define engineered streams that depend on other canonical streams directly in contracts. The runtime builds each input to stage 4 (ordered + regularized), stream‑aligns by partition + timestamp, runs your composer, and emits fresh records for the derived stream.
477
332
 
478
333
  ```yaml
479
- # contracts/air_density.processed.yaml
334
+ # <project_root>/contracts/air_density.processed.yaml
480
335
  kind: composed
481
336
  id: air_density.processed
482
337
  inputs:
@@ -521,14 +376,14 @@ Defines which canonical streams become features/targets and the vector bucketing
521
376
  group_by: 1h
522
377
 
523
378
  features:
524
- - id: temp_c
525
- record_stream: demo_weather
379
+ - id: close
380
+ record_stream: equity.ohlcv
526
381
  scale: true
527
382
  sequence: { size: 6, stride: 1 }
528
383
 
529
384
  targets:
530
- - id: precip
531
- record_stream: demo_weather
385
+ - id: returns_1d
386
+ record_stream: equity.ohlcv
532
387
  ```
533
388
 
534
389
  - `group_by` controls the cadence for vector partitioning (accepts `Xm|min|Xh`
@@ -593,6 +448,7 @@ enabled: true
593
448
 
594
449
  All commands live under the `jerry` entry point (`src/datapipeline/cli/app.py`).
595
450
  Pass `--help` on any command for flags.
451
+ All commands that take a project accept either `--project <path/to/project.yaml>` or `--dataset <alias>` (an alias defined under `datasets:` in `jerry.yaml`).
596
452
 
597
453
  ### Preview Stages
598
454
 
@@ -615,7 +471,7 @@ Pass `--help` on any command for flags.
615
471
  the group key or metadata. Default is `sample`.
616
472
  - Set `--log-level DEBUG` (or set your serve task `log_level: DEBUG`) to reuse the tqdm progress bars when previewing stages.
617
473
  - When multiple serve tasks exist, add `--run val` (task name or filename stem) to target a single config; otherwise every enabled task is executed sequentially.
618
- - Argument precedence follows the order described under *Configuration Resolution Order*.
474
+ - Argument precedence follows the order described under _Configuration & Resolution Order_.
619
475
  - Combine with `--skip-build` when you already have fresh artifacts and want to jump straight into streaming.
620
476
 
621
477
  ### Build & Quality
@@ -651,7 +507,7 @@ Pass `--help` on any command for flags.
651
507
 
652
508
  ## Transform & Filter Library
653
509
 
654
- ### Record Filters (`config/contracts[].record`)
510
+ ### Record Filters (`<project_root>/contracts/*.yaml:record`)
655
511
 
656
512
  - Binary comparisons: `eq`, `ne`, `lt`, `le`, `gt`, `ge` (timezone-aware for ISO
657
513
  or datetime literals).
@@ -791,7 +647,7 @@ and `src/datapipeline/filters/`.
791
647
  `jerry plugin init`.
792
648
  - `datapipeline.services.scaffold.source.create_source` – writes loader/parser
793
649
  stubs and updates entry points.
794
- - `datapipeline.services.scaffold.domain.create_domain` – domain DTO skeleton.
650
+ - `datapipeline.services.scaffold.domain.create_domain` – domain record skeleton.
795
651
  - `datapipeline.services.scaffold.filter.create_filter` – custom filter stub.
796
652
  - `datapipeline.services.scaffold.mappers.attach_source_to_domain` – helper for
797
653
  programmatically wiring sources to domain mappers and emitting stream
@@ -821,5 +677,151 @@ and `src/datapipeline/filters/`.
821
677
  - `examples/minimal_project/` – runnable demo showing config layout and Torch
822
678
  integration.
823
679
 
824
- Happy shipping! Build, inspect, and serve consistent time-series features with
825
- confidence.
680
+ ---
681
+
682
+ ## Pipeline Architecture (WIP)
683
+
684
+ ```text
685
+ raw source ──▶ loader/parser DTOs ──▶ canonical stream ──▶ record policies
686
+ └──▶ feature wrapping ──▶ stream regularization ──▶ feature transforms/sequence
687
+ └──▶ vector assembly ──▶ postprocess transforms
688
+ ```
689
+
690
+ 1. **Loader/parser (Stage 0)** – raw bytes become typed DTOs. Loaders fetch from
691
+ FS/HTTP/synthetic sources; parsers map bytes to DTOs. Register them via entry
692
+ points (`loaders`, `parsers`) and wire them in `<project_root>/sources/*.yaml`.
693
+ 2. **Canonical stream mapping (Stage 1)** – mappers attach domain semantics and
694
+ partition keys, producing domain `TemporalRecord`s.
695
+ 3. **Record policies (Stage 2)** – contract `record` rules (filters, floor, lag)
696
+ prune and normalize DTO-derived records.
697
+ 4. **Feature wrapping (Stage 3)** – records become `FeatureRecord`s before
698
+ sort/regularization.
699
+ 5. **Stream regularization (Stage 4)** – contract `stream` rules ensure cadence,
700
+ deduplicate timestamps, and impute where needed.
701
+ 6. **Feature transforms/sequence (Stage 5)** – dataset transforms (scale,
702
+ sequence windows) produce per-feature tensors or windows.
703
+ 7. **Vector assembly (Stage 6)** – features merge by `group_by` cadence into
704
+ `(group_key, Vector)` pairs, prior to postprocess tweaks.
705
+ 8. **Postprocess (Stage 7)** – optional vector transforms (fill/drop/etc.) run
706
+ before results are emitted to the configured output.
707
+
708
+ ### Visual Flowchart
709
+
710
+ ```mermaid
711
+ flowchart TB
712
+ subgraph CLI & Project config
713
+ cliSource[jerry source add]
714
+ cliDomain[jerry domain add]
715
+ cliContract[jerry contract]
716
+ cliServe[jerry serve]
717
+ project[[project.yaml]]
718
+ sourcesCfg[sources/*.yaml]
719
+ contractsCfg[contracts/*.yaml]
720
+ datasetCfg[dataset.yaml]
721
+ postprocessCfg[postprocess.yaml]
722
+ end
723
+
724
+ cliSource --> sourcesCfg
725
+ cliDomain --> domainPkg
726
+ cliContract --> contractsCfg
727
+ cliServe --> vectorSamples
728
+ project -.->|paths.sources| sourcesCfg
729
+ project -.->|paths.streams| contractsCfg
730
+ project -.->|paths.dataset| datasetCfg
731
+ project -.->|paths.postprocess| postprocessCfg
732
+
733
+ subgraph Plugin code
734
+ domainPkg[domains/*]
735
+ mappersPkg[mappers/*]
736
+ end
737
+
738
+ cliContract --> mappersPkg
739
+ domainPkg -. domain models .-> mappersPkg
740
+
741
+ subgraph Registries
742
+ registrySources[sources]
743
+ registryStreamSources[stream_sources]
744
+ registryMappers[mappers]
745
+ registryRecordOps[record_ops]
746
+ registryStreamOps[stream_ops]
747
+ registryDebugOps[debug_ops]
748
+ end
749
+
750
+ subgraph Source wiring
751
+ rawData[(external data)]
752
+ transportSpec[transport + format]
753
+ loaderEP[loader ep]
754
+ parserEP[parser ep]
755
+ sourceArgs[loader args]
756
+ sourceNode[Source]
757
+ dtoStream[(DTOs)]
758
+ end
759
+
760
+ sourcesCfg --> transportSpec
761
+ sourcesCfg --> loaderEP
762
+ sourcesCfg --> parserEP
763
+ sourcesCfg --> sourceArgs
764
+ transportSpec -. select fs/http/synth .-> loaderEP
765
+ loaderEP -. build loader .-> sourceNode
766
+ parserEP -. build parser .-> sourceNode
767
+ sourceArgs -. paths/creds .-> sourceNode
768
+ rawData --> sourceNode --> dtoStream
769
+ sourcesCfg -. build_source_from_spec .-> registrySources
770
+ contractsCfg -. stream_id + source .-> registryStreamSources
771
+ registrySources -. alias -> Source .-> registryStreamSources
772
+
773
+ subgraph Canonical stream
774
+ mapperEP[mapper ep]
775
+ recordRules[record rules]
776
+ streamRules[stream rules]
777
+ debugRules[debug rules]
778
+ canonical[DTO -> record]
779
+ domainRecords((TemporalRecord))
780
+ recordStage[record xforms]
781
+ featureWrap[record -> feature]
782
+ featureRecords((FeatureRecord))
783
+ regularization[stream xforms]
784
+ end
785
+
786
+ dtoStream --> canonical --> domainRecords --> recordStage --> featureWrap --> featureRecords --> regularization
787
+ contractsCfg --> mapperEP
788
+ mappersPkg -. ep target .-> mapperEP
789
+ mapperEP -. build_mapper_from_spec .-> registryMappers
790
+ registryMappers --> canonical
791
+ contractsCfg --> recordRules
792
+ contractsCfg --> streamRules
793
+ contractsCfg --> debugRules
794
+ registryRecordOps --> recordRules
795
+ registryStreamOps --> streamRules
796
+ registryDebugOps --> debugRules
797
+ recordRules --> recordStage
798
+ streamRules --> regularization
799
+ debugRules --> regularization
800
+
801
+ subgraph Dataset shaping
802
+ featureSpec[feature cfg]
803
+ groupBySpec[group_by]
804
+ streamRefs[record_stream ids]
805
+ featureTrans[feature/seq xforms]
806
+ sequenceStream((seq/features))
807
+ vectorStage[vector assembly]
808
+ vectorSamples((samples))
809
+ end
810
+
811
+ datasetCfg --> featureSpec
812
+ datasetCfg --> groupBySpec
813
+ datasetCfg --> streamRefs
814
+ streamRefs -.->|build_feature_pipeline| registryStreamSources
815
+ registryStreamSources -.->|open_source_stream| sourceNode
816
+ featureRecords --> regularization --> featureTrans --> sequenceStream --> vectorStage --> vectorSamples
817
+ featureSpec -. scale/sequence .-> featureTrans
818
+ groupBySpec -. cadence .-> vectorStage
819
+
820
+ subgraph Postprocess
821
+ vectorTransforms[vector xforms]
822
+ postprocessNode[postprocess]
823
+ end
824
+
825
+ postprocessCfg --> vectorTransforms -. drop/fill .-> postprocessNode
826
+ vectorStage --> postprocessNode
827
+ ```