macrodata-refiner 0.3.1__tar.gz → 0.3.2__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (183)
  1. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/PKG-INFO +8 -2
  2. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/pyproject.toml +10 -2
  3. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/macrodata_refiner.egg-info/PKG-INFO +8 -2
  4. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/macrodata_refiner.egg-info/SOURCES.txt +1 -0
  5. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/macrodata_refiner.egg-info/requires.txt +9 -1
  6. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/io/datafile.py +4 -0
  7. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/io/datafolder.py +4 -0
  8. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/io/fileset.py +20 -1
  9. macrodata_refiner-0.3.2/src/refiner/io/utils.py +29 -0
  10. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/launchers/cloud.py +18 -9
  11. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/pipeline.py +11 -6
  12. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/planning.py +11 -2
  13. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/base.py +22 -1
  14. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/lerobot.py +3 -2
  15. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/reducer/zarr.py +3 -2
  16. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/zarr.py +5 -1
  17. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/base.py +19 -0
  18. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/base.py +3 -0
  19. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/hdf5.py +3 -0
  20. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/hf_dataset.py +46 -20
  21. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/mcap.py +3 -0
  22. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/tfds.py +3 -0
  23. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/tfrecord.py +3 -0
  24. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/zarr.py +9 -2
  25. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/platform/manifest.py +76 -17
  26. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/hand_tracking.py +8 -7
  27. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/text/commoncrawl.py +46 -21
  28. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/tests/test_commoncrawl_text.py +25 -0
  29. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/LICENSE +0 -0
  30. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/README.md +0 -0
  31. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/setup.cfg +0 -0
  32. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/macrodata_refiner.egg-info/dependency_links.txt +0 -0
  33. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/macrodata_refiner.egg-info/entry_points.txt +0 -0
  34. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/macrodata_refiner.egg-info/top_level.txt +0 -0
  35. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/__init__.py +0 -0
  36. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/__init__.py +0 -0
  37. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/auth.py +0 -0
  38. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/commands/__init__.py +0 -0
  39. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/commands/auth.py +0 -0
  40. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/commands/jobs.py +0 -0
  41. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/commands/run.py +0 -0
  42. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/commands/secrets.py +0 -0
  43. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/common.py +0 -0
  44. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/__init__.py +0 -0
  45. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/attach.py +0 -0
  46. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/common.py +0 -0
  47. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/control.py +0 -0
  48. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/follow.py +0 -0
  49. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/get.py +0 -0
  50. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/list.py +0 -0
  51. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/logs.py +0 -0
  52. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/manifest.py +0 -0
  53. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/metrics.py +0 -0
  54. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/jobs/workers.py +0 -0
  55. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/main.py +0 -0
  56. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/run/__init__.py +0 -0
  57. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/run/cloud.py +0 -0
  58. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/run/command.py +0 -0
  59. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/run/local.py +0 -0
  60. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/run/modes.py +0 -0
  61. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/secrets.py +0 -0
  62. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/ui/__init__.py +0 -0
  63. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/ui/console.py +0 -0
  64. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/cli/ui/terminal.py +0 -0
  65. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/__init__.py +0 -0
  66. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/asyncio/__init__.py +0 -0
  67. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/asyncio/runtime.py +0 -0
  68. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/asyncio/window.py +0 -0
  69. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/buffer.py +0 -0
  70. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/engine.py +0 -0
  71. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/operators/__init__.py +0 -0
  72. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/operators/row.py +0 -0
  73. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/operators/vectorized.py +0 -0
  74. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/tracking/__init__.py +0 -0
  75. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/execution/tracking/shards.py +0 -0
  76. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/__init__.py +0 -0
  77. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/capabilities.py +0 -0
  78. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/generate_pooling.py +0 -0
  79. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/generate_text.py +0 -0
  80. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/internal/__init__.py +0 -0
  81. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/internal/media.py +0 -0
  82. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/internal/message_conversion.py +0 -0
  83. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/internal/response.py +0 -0
  84. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/internal/runtime.py +0 -0
  85. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/internal/schema.py +0 -0
  86. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/internal/transport.py +0 -0
  87. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/internal/usage.py +0 -0
  88. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/providers/__init__.py +0 -0
  89. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/providers/anthropic.py +0 -0
  90. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/providers/base.py +0 -0
  91. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/providers/google.py +0 -0
  92. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/providers/openai.py +0 -0
  93. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/providers/warnings.py +0 -0
  94. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/inference/types.py +0 -0
  95. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/io/__init__.py +0 -0
  96. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/job_urls.py +0 -0
  97. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/launchers/__init__.py +0 -0
  98. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/launchers/base.py +0 -0
  99. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/launchers/local.py +0 -0
  100. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/launchers/secrets.py +0 -0
  101. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/__init__.py +0 -0
  102. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/data/block.py +0 -0
  103. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/data/datatype.py +0 -0
  104. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/data/row.py +0 -0
  105. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/data/shard.py +0 -0
  106. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/data/tabular.py +0 -0
  107. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/expressions.py +0 -0
  108. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/resources.py +0 -0
  109. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/__init__.py +0 -0
  110. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/assets.py +0 -0
  111. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/jsonl.py +0 -0
  112. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/parquet.py +0 -0
  113. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/reducer/__init__.py +0 -0
  114. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/reducer/file.py +0 -0
  115. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sinks/reducer/lerobot.py +0 -0
  116. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/__init__.py +0 -0
  117. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/items.py +0 -0
  118. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/__init__.py +0 -0
  119. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/csv.py +0 -0
  120. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/files.py +0 -0
  121. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/json.py +0 -0
  122. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/lerobot.py +0 -0
  123. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/parquet.py +0 -0
  124. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/readers/utils.py +0 -0
  125. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/sources/task.py +0 -0
  126. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/steps.py +0 -0
  127. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/utils/__init__.py +0 -0
  128. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/utils/cache/__init__.py +0 -0
  129. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/utils/cache/decoder_cache.py +0 -0
  130. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/utils/cache/file_cache.py +0 -0
  131. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/pipeline/utils/cache/lease_cache.py +0 -0
  132. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/platform/__init__.py +0 -0
  133. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/platform/auth.py +0 -0
  134. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/platform/client/__init__.py +0 -0
  135. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/platform/client/api.py +0 -0
  136. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/platform/client/models.py +0 -0
  137. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/platform/client/serialize.py +0 -0
  138. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/py.typed +0 -0
  139. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/__init__.py +0 -0
  140. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/lerobot_format/__init__.py +0 -0
  141. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/lerobot_format/metadata/__init__.py +0 -0
  142. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/lerobot_format/metadata/info.py +0 -0
  143. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/lerobot_format/metadata/metadata.py +0 -0
  144. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/lerobot_format/metadata/stats.py +0 -0
  145. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/lerobot_format/metadata/tasks.py +0 -0
  146. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/lerobot_format/row.py +0 -0
  147. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/lerobot_format/tabular.py +0 -0
  148. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/motion.py +0 -0
  149. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/reward.py +0 -0
  150. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/row.py +0 -0
  151. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/subtask_annotation.py +0 -0
  152. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/synchronization.py +0 -0
  153. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/robotics/tabular.py +0 -0
  154. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/services/__init__.py +0 -0
  155. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/services/base.py +0 -0
  156. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/services/discovery.py +0 -0
  157. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/services/manager.py +0 -0
  158. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/services/vllm.py +0 -0
  159. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/text/__init__.py +0 -0
  160. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/utils/__init__.py +0 -0
  161. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/utils/imports.py +0 -0
  162. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/video/__init__.py +0 -0
  163. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/video/decode.py +0 -0
  164. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/video/remux.py +0 -0
  165. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/video/transcode.py +0 -0
  166. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/video/types.py +0 -0
  167. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/video/writer.py +0 -0
  168. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/__init__.py +0 -0
  169. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/context.py +0 -0
  170. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/entrypoint.py +0 -0
  171. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/lifecycle.py +0 -0
  172. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/metrics/__init__.py +0 -0
  173. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/metrics/api.py +0 -0
  174. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/metrics/emitter.py +0 -0
  175. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/resources/__init__.py +0 -0
  176. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/resources/cpu.py +0 -0
  177. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/resources/gpu.py +0 -0
  178. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/runner.py +0 -0
  179. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/src/refiner/worker/workdir.py +0 -0
  180. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/tests/test_cache.py +0 -0
  181. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/tests/test_expressions.py +0 -0
  182. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/tests/test_optional_dependencies.py +0 -0
  183. {macrodata_refiner-0.3.1 → macrodata_refiner-0.3.2}/tests/test_video_decode.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: macrodata-refiner
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
5
5
  Author: Macrodata Labs
6
6
  License-Expression: Apache-2.0
@@ -27,9 +27,11 @@ Provides-Extra: video
27
27
  Requires-Dist: av; extra == "video"
28
28
  Requires-Dist: pillow; extra == "video"
29
29
  Provides-Extra: hf
30
- Requires-Dist: datasets>=3.0.0; extra == "hf"
31
30
  Requires-Dist: huggingface-hub>=1.4.1; extra == "hf"
32
31
  Requires-Dist: hf>=1.7.1; extra == "hf"
32
+ Provides-Extra: datasets
33
+ Requires-Dist: macrodata-refiner[hf]; extra == "datasets"
34
+ Requires-Dist: datasets>=3.0.0; extra == "datasets"
33
35
  Provides-Extra: hand-tracking
34
36
  Requires-Dist: macrodata-refiner[hf]; extra == "hand-tracking"
35
37
  Requires-Dist: macrodata-refiner[video]; extra == "hand-tracking"
@@ -49,6 +51,8 @@ Requires-Dist: mcap-ros2-support; extra == "mcap"
49
51
  Requires-Dist: pillow; extra == "mcap"
50
52
  Provides-Extra: s3
51
53
  Requires-Dist: s3fs; extra == "s3"
54
+ Provides-Extra: gcs
55
+ Requires-Dist: gcsfs; extra == "gcs"
52
56
  Provides-Extra: tensorflow
53
57
  Requires-Dist: tensorflow; extra == "tensorflow"
54
58
  Provides-Extra: tfds
@@ -59,6 +63,7 @@ Requires-Dist: macrodata-refiner[all]; extra == "testing"
59
63
  Requires-Dist: pytest>=8.0.0; extra == "testing"
60
64
  Requires-Dist: pytest-cov>=5.0.0; extra == "testing"
61
65
  Provides-Extra: all
66
+ Requires-Dist: macrodata-refiner[datasets]; extra == "all"
62
67
  Requires-Dist: macrodata-refiner[hdf5]; extra == "all"
63
68
  Requires-Dist: macrodata-refiner[hf]; extra == "all"
64
69
  Requires-Dist: macrodata-refiner[mcap]; extra == "all"
@@ -66,6 +71,7 @@ Requires-Dist: macrodata-refiner[video]; extra == "all"
66
71
  Requires-Dist: macrodata-refiner[zarr]; extra == "all"
67
72
  Requires-Dist: macrodata-refiner[text]; extra == "all"
68
73
  Requires-Dist: macrodata-refiner[s3]; extra == "all"
74
+ Requires-Dist: macrodata-refiner[gcs]; extra == "all"
69
75
  Requires-Dist: macrodata-refiner[tfds]; extra == "all"
70
76
  Dynamic: license-file
71
77
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "macrodata-refiner"
3
- version = "0.3.1"
3
+ version = "0.3.2"
4
4
  description = "Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets"
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -35,10 +35,13 @@ video = [
35
35
  "pillow",
36
36
  ]
37
37
  hf = [
38
- "datasets>=3.0.0",
39
38
  "huggingface-hub>=1.4.1",
40
39
  "hf>=1.7.1",
41
40
  ]
41
+ datasets = [
42
+ "macrodata-refiner[hf]",
43
+ "datasets>=3.0.0",
44
+ ]
42
45
  hand_tracking = [
43
46
  "macrodata-refiner[hf]",
44
47
  "macrodata-refiner[video]",
@@ -64,6 +67,9 @@ mcap = [
64
67
  s3 = [
65
68
  "s3fs",
66
69
  ]
70
+ gcs = [
71
+ "gcsfs",
72
+ ]
67
73
  tensorflow = [
68
74
  "tensorflow",
69
75
  ]
@@ -77,6 +83,7 @@ testing = [
77
83
  "pytest-cov>=5.0.0",
78
84
  ]
79
85
  all = [
86
+ "macrodata-refiner[datasets]",
80
87
  "macrodata-refiner[hdf5]",
81
88
  "macrodata-refiner[hf]",
82
89
  "macrodata-refiner[mcap]",
@@ -84,6 +91,7 @@ all = [
84
91
  "macrodata-refiner[zarr]",
85
92
  "macrodata-refiner[text]",
86
93
  "macrodata-refiner[s3]",
94
+ "macrodata-refiner[gcs]",
87
95
  "macrodata-refiner[tfds]",
88
96
  ]
89
97
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: macrodata-refiner
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
5
5
  Author: Macrodata Labs
6
6
  License-Expression: Apache-2.0
@@ -27,9 +27,11 @@ Provides-Extra: video
27
27
  Requires-Dist: av; extra == "video"
28
28
  Requires-Dist: pillow; extra == "video"
29
29
  Provides-Extra: hf
30
- Requires-Dist: datasets>=3.0.0; extra == "hf"
31
30
  Requires-Dist: huggingface-hub>=1.4.1; extra == "hf"
32
31
  Requires-Dist: hf>=1.7.1; extra == "hf"
32
+ Provides-Extra: datasets
33
+ Requires-Dist: macrodata-refiner[hf]; extra == "datasets"
34
+ Requires-Dist: datasets>=3.0.0; extra == "datasets"
33
35
  Provides-Extra: hand-tracking
34
36
  Requires-Dist: macrodata-refiner[hf]; extra == "hand-tracking"
35
37
  Requires-Dist: macrodata-refiner[video]; extra == "hand-tracking"
@@ -49,6 +51,8 @@ Requires-Dist: mcap-ros2-support; extra == "mcap"
49
51
  Requires-Dist: pillow; extra == "mcap"
50
52
  Provides-Extra: s3
51
53
  Requires-Dist: s3fs; extra == "s3"
54
+ Provides-Extra: gcs
55
+ Requires-Dist: gcsfs; extra == "gcs"
52
56
  Provides-Extra: tensorflow
53
57
  Requires-Dist: tensorflow; extra == "tensorflow"
54
58
  Provides-Extra: tfds
@@ -59,6 +63,7 @@ Requires-Dist: macrodata-refiner[all]; extra == "testing"
59
63
  Requires-Dist: pytest>=8.0.0; extra == "testing"
60
64
  Requires-Dist: pytest-cov>=5.0.0; extra == "testing"
61
65
  Provides-Extra: all
66
+ Requires-Dist: macrodata-refiner[datasets]; extra == "all"
62
67
  Requires-Dist: macrodata-refiner[hdf5]; extra == "all"
63
68
  Requires-Dist: macrodata-refiner[hf]; extra == "all"
64
69
  Requires-Dist: macrodata-refiner[mcap]; extra == "all"
@@ -66,6 +71,7 @@ Requires-Dist: macrodata-refiner[video]; extra == "all"
66
71
  Requires-Dist: macrodata-refiner[zarr]; extra == "all"
67
72
  Requires-Dist: macrodata-refiner[text]; extra == "all"
68
73
  Requires-Dist: macrodata-refiner[s3]; extra == "all"
74
+ Requires-Dist: macrodata-refiner[gcs]; extra == "all"
69
75
  Requires-Dist: macrodata-refiner[tfds]; extra == "all"
70
76
  Dynamic: license-file
71
77
 
@@ -73,6 +73,7 @@ src/refiner/io/__init__.py
73
73
  src/refiner/io/datafile.py
74
74
  src/refiner/io/datafolder.py
75
75
  src/refiner/io/fileset.py
76
+ src/refiner/io/utils.py
76
77
  src/refiner/launchers/__init__.py
77
78
  src/refiner/launchers/base.py
78
79
  src/refiner/launchers/cloud.py
@@ -11,6 +11,7 @@ msgspec>=0.20.0
11
11
  pydantic>=2.0.0
12
12
 
13
13
  [all]
14
+ macrodata-refiner[datasets]
14
15
  macrodata-refiner[hdf5]
15
16
  macrodata-refiner[hf]
16
17
  macrodata-refiner[mcap]
@@ -18,8 +19,16 @@ macrodata-refiner[video]
18
19
  macrodata-refiner[zarr]
19
20
  macrodata-refiner[text]
20
21
  macrodata-refiner[s3]
22
+ macrodata-refiner[gcs]
21
23
  macrodata-refiner[tfds]
22
24
 
25
+ [datasets]
26
+ macrodata-refiner[hf]
27
+ datasets>=3.0.0
28
+
29
+ [gcs]
30
+ gcsfs
31
+
23
32
  [hand_tracking]
24
33
  macrodata-refiner[hf]
25
34
  macrodata-refiner[video]
@@ -29,7 +38,6 @@ ego-vision[models]>=0.1.25
29
38
  h5py
30
39
 
31
40
  [hf]
32
- datasets>=3.0.0
33
41
  huggingface-hub>=1.4.1
34
42
  hf>=1.7.1
35
43
 
@@ -11,6 +11,7 @@ from typing import Any, TypeAlias, Union, cast
11
11
  from fsspec import AbstractFileSystem, url_to_fs
12
12
  from fsspec.implementations.http import HTTPFileSystem
13
13
  from fsspec.implementations.local import LocalFileSystem
14
+ from refiner.io.utils import required_refiner_extras
14
15
 
15
16
  DataFilePath: TypeAlias = str | PathLike[str]
16
17
  DataFileSpec: TypeAlias = tuple[DataFilePath, AbstractFileSystem]
@@ -146,6 +147,9 @@ class DataFile:
146
147
  def abs_path(self) -> str:
147
148
  return self.fs.unstrip_protocol(self.path).removeprefix("file://")
148
149
 
150
+ def required_refiner_extras(self) -> tuple[str, ...]:
151
+ return required_refiner_extras(self.path, self.fs)
152
+
149
153
  @property
150
154
  def is_local(self) -> bool:
151
155
  return isinstance(self.fs, LocalFileSystem)
@@ -7,6 +7,7 @@ from fsspec.implementations.dirfs import DirFileSystem
7
7
  from fsspec.implementations.local import LocalFileSystem
8
8
 
9
9
  from refiner.io.datafile import DataFile, _storage_options_for_path
10
+ from refiner.io.utils import required_refiner_extras
10
11
 
11
12
  DataFolderPath: TypeAlias = str | PathLike[str]
12
13
  DataFolderSpec: TypeAlias = tuple[DataFolderPath, AbstractFileSystem]
@@ -102,6 +103,9 @@ class DataFolder(DirFileSystem):
102
103
  # make sure we strip file:// and similar
103
104
  return self.fs.unstrip_protocol(self._join(path)).removeprefix("file://")
104
105
 
106
+ def required_refiner_extras(self) -> tuple[str, ...]:
107
+ return required_refiner_extras(self.path, self.fs)
108
+
105
109
  def abs_paths(self, paths: str | Iterable[str]) -> str | list[str]:
106
110
  """
107
111
  Transform a list of relative paths into a list of complete paths (including fs protocol and base path)
@@ -8,8 +8,13 @@ from typing import Any, Literal, TypeAlias, Union, cast
8
8
 
9
9
  from fsspec import AbstractFileSystem, url_to_fs
10
10
 
11
- from refiner.io.datafile import DataFile, DataFileSpec, _storage_options_for_path
11
+ from refiner.io.datafile import (
12
+ DataFile,
13
+ DataFileSpec,
14
+ _storage_options_for_path,
15
+ )
12
16
  from refiner.io.datafolder import DataFolder, DataFolderSpec
17
+ from refiner.io.utils import required_refiner_extras
13
18
 
14
19
  DataFileSetInput: TypeAlias = Union[
15
20
  str, PathLike[str], DataFileSpec, DataFolderSpec, DataFile, DataFolder
@@ -22,6 +27,9 @@ class _PathSource:
22
27
  path: str
23
28
  fs: AbstractFileSystem
24
29
 
30
+ def required_refiner_extras(self) -> tuple[str, ...]:
31
+ return required_refiner_extras(self.path, self.fs)
32
+
25
33
 
26
34
  @dataclass(frozen=True, slots=True)
27
35
  class DataFileSet:
@@ -174,6 +182,17 @@ class DataFileSet:
174
182
  raise TypeError("DataFileSet entries are not all folders")
175
183
  return cast(tuple[DataFolder, ...], entries)
176
184
 
185
+ def required_refiner_extras(self) -> tuple[str, ...]:
186
+ return tuple(
187
+ sorted(
188
+ {
189
+ extra
190
+ for entry in self.entries
191
+ for extra in entry.required_refiner_extras()
192
+ }
193
+ )
194
+ )
195
+
177
196
  @property
178
197
  def resolved_entries(self) -> tuple[DataFile | DataFolder, ...]:
179
198
  entries: list[DataFile | DataFolder] = []
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+
3
+ from fsspec import AbstractFileSystem
4
+
5
+
6
+ _PROTOCOL_REFINER_EXTRAS = {
7
+ "s3": "s3",
8
+ "s3a": "s3",
9
+ "hf": "hf",
10
+ "gcs": "gcs",
11
+ "gs": "gcs",
12
+ }
13
+
14
+
15
+ def required_refiner_extras(path: str, fs: AbstractFileSystem) -> tuple[str, ...]:
16
+ protocol = fs.protocol
17
+ protocols = (protocol,) if isinstance(protocol, str) else tuple(protocol)
18
+ path_protocol, sep, _rest = path.partition("://")
19
+ return tuple(
20
+ sorted(
21
+ {
22
+ extra
23
+ for item in (*protocols, path_protocol if sep else None)
24
+ if item is not None
25
+ and (extra := _PROTOCOL_REFINER_EXTRAS.get(str(item).lower()))
26
+ is not None
27
+ }
28
+ )
29
+ )
@@ -103,9 +103,11 @@ class CloudLauncher(BaseLauncher):
103
103
  gpu: Optional GPU runtime request for cloud scheduling.
104
104
  sync_local_dependencies: Whether to include packages detected from the
105
105
  local environment in the cloud runtime.
106
- extra_dependencies: Additional packages to install in the cloud runtime.
107
- Entries are requirement strings. These take precedence over packages
108
- detected from the local environment.
106
+ dependencies: Additional packages to install in the cloud runtime.
107
+ Entries are requirement strings.
108
+ refiner_extras: Additional macrodata-refiner extras to install in the
109
+ cloud runtime. Built-in blocks automatically declare the extras they
110
+ require; pass this for extras used outside those blocks.
109
111
  secrets: Optional secret sources mounted into the cloud runtime.
110
112
  env: Optional plain environment variables mounted into the cloud runtime.
111
113
  """
@@ -119,8 +121,9 @@ class CloudLauncher(BaseLauncher):
119
121
  cpus_per_worker: int | None = None,
120
122
  mem_mb_per_worker: int | None = None,
121
123
  gpu: GPU | None = None,
122
- sync_local_dependencies: bool = True,
123
- extra_dependencies: Sequence[str] | None = None,
124
+ sync_local_dependencies: bool = False,
125
+ dependencies: Sequence[str] | None = None,
126
+ refiner_extras: Sequence[str] | None = None,
124
127
  secrets: SecretInput | None = None,
125
128
  env: dict[str, object | None] | None = None,
126
129
  continue_from_job: str | None = None,
@@ -141,7 +144,8 @@ class CloudLauncher(BaseLauncher):
141
144
  self.cpus_per_worker = cpus_per_worker
142
145
  self.mem_mb_per_worker = mem_mb_per_worker
143
146
  self.sync_local_dependencies = sync_local_dependencies
144
- self.extra_dependencies = extra_dependencies
147
+ self.dependencies = dependencies
148
+ self.refiner_extras = refiner_extras
145
149
  self.secrets = normalize_secret_sources(secrets)
146
150
  self.env = env
147
151
  self.continue_from_job = normalized_continue_from_job
@@ -153,12 +157,14 @@ class CloudLauncher(BaseLauncher):
153
157
  return raw.strip().lower() in {"1", "true", "yes", "on"}
154
158
 
155
159
  def _resolve_cloud_manifest(
156
- self, *, secret_values: tuple[str, ...]
160
+ self, *, secret_values: tuple[str, ...], stages: list[PlannedStage]
157
161
  ) -> dict[str, object]:
158
162
  manifest = build_run_manifest(
159
163
  secret_values=secret_values,
160
164
  capture_dependencies=self.sync_local_dependencies,
161
- extra_dependencies=self.extra_dependencies,
165
+ dependencies=self.dependencies,
166
+ refiner_extras=self.refiner_extras,
167
+ pipeline_stages=stages,
162
168
  )
163
169
  environment = manifest.get("environment")
164
170
  if environment is None:
@@ -286,7 +292,10 @@ class CloudLauncher(BaseLauncher):
286
292
  resolved_secret_sources, secret_values = resolve_secret_sources(self.secrets)
287
293
  resolved_env = resolve_env_mapping(self.env) if self.env else None
288
294
  stages = self._resolved_stages()
289
- manifest = self._resolve_cloud_manifest(secret_values=secret_values)
295
+ manifest = self._resolve_cloud_manifest(
296
+ secret_values=secret_values,
297
+ stages=stages,
298
+ )
290
299
  plan = self._compiled_plan(stages, secret_values=secret_values)
291
300
  try:
292
301
  pipeline_payloads = self._upload_stage_payloads(
@@ -708,8 +708,9 @@ class RefinerPipeline:
708
708
  cpus_per_worker: int | None = None,
709
709
  mem_mb_per_worker: int | None = None,
710
710
  gpu: GPU | None = None,
711
- sync_local_dependencies: bool = True,
712
- extra_dependencies: Sequence[str] | None = None,
711
+ sync_local_dependencies: bool = False,
712
+ dependencies: Sequence[str] | None = None,
713
+ refiner_extras: Sequence[str] | None = None,
713
714
  secrets: SecretInput | None = None,
714
715
  env: Mapping[str, object | None] | None = None,
715
716
  continue_from_job: str | None = None,
@@ -725,10 +726,13 @@ class RefinerPipeline:
725
726
  gpu: Optional structured GPU request.
726
727
  sync_local_dependencies: Include packages detected from the local
727
728
  environment in the cloud runtime.
728
- extra_dependencies: Additional packages to install in the cloud runtime.
729
+ dependencies: Additional packages to install in the cloud runtime.
729
730
  Entries are requirement strings such as `"torch"` or
730
- `"ego-vision[models]==0.1.2"`. These take precedence over packages
731
- detected from the local environment.
731
+ `"ego-vision[models]==0.1.2"`.
732
+ refiner_extras: Additional macrodata-refiner extras to install in
733
+ the cloud runtime. Built-in blocks automatically declare the
734
+ extras they require; pass this for extras used outside those
735
+ blocks.
732
736
  secrets: Secret sources to mount inside the cloud image. A mapping keeps
733
737
  the legacy behavior; `None` values are loaded from the submitting
734
738
  environment. `Secrets.env(...)` references stored workspace secrets.
@@ -750,7 +754,8 @@ class RefinerPipeline:
750
754
  mem_mb_per_worker=mem_mb_per_worker,
751
755
  gpu=gpu,
752
756
  sync_local_dependencies=sync_local_dependencies,
753
- extra_dependencies=extra_dependencies,
757
+ dependencies=dependencies,
758
+ refiner_extras=refiner_extras,
754
759
  secrets=secrets,
755
760
  env=dict(env) if env is not None else None,
756
761
  continue_from_job=continue_from_job,
@@ -329,10 +329,19 @@ def _builtin_description(fn: Any) -> dict[str, Any] | None:
329
329
  return {"name": name, "args": args, "services": tuple(parsed_services)}
330
330
 
331
331
 
332
- def describe_builtin(name: str, **args: Any) -> Any:
332
+ def describe_builtin(
333
+ name: str, *, refiner_extras: tuple[str, ...] = (), **args: Any
334
+ ) -> Any:
333
335
  def _decorate(fn: Any) -> Any:
334
336
  setattr(
335
- fn, _REFINER_BUILTIN_CALL_ATTR, {"name": name, "args": args, "services": ()}
337
+ fn,
338
+ _REFINER_BUILTIN_CALL_ATTR,
339
+ {
340
+ "name": name,
341
+ "args": args,
342
+ "services": (),
343
+ "refiner_extras": refiner_extras,
344
+ },
336
345
  )
337
346
  return fn
338
347
 
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from abc import ABC
4
- from typing import Any
4
+ from typing import Any, cast
5
5
 
6
6
  import pyarrow as pa
7
7
 
@@ -56,6 +56,27 @@ class BaseSink(ABC):
56
56
  """
57
57
  return None
58
58
 
59
+ def required_refiner_extras(self) -> tuple[str, ...]:
60
+ """macrodata-refiner extras required by this sink."""
61
+ return tuple(
62
+ sorted(
63
+ {
64
+ *self._declared_refiner_extras(),
65
+ *self._io_refiner_extras(),
66
+ }
67
+ )
68
+ )
69
+
70
+ def _declared_refiner_extras(self) -> tuple[str, ...]:
71
+ """Feature extras declared by this sink."""
72
+ return ()
73
+
74
+ def _io_refiner_extras(self) -> tuple[str, ...]:
75
+ """Storage extras required by this sink's output, if it has one."""
76
+ if not hasattr(self, "output"):
77
+ return ()
78
+ return cast(Any, self).output.required_refiner_extras()
79
+
59
80
  def build_reducer(self) -> "BaseSink | None":
60
81
  """Return an optional 1-worker reducer sink for launched execution.
61
82
 
@@ -36,7 +36,6 @@ from refiner.robotics.lerobot_format import (
36
36
  infer_feature_info,
37
37
  )
38
38
  from refiner.robotics.row import RoboticsRow
39
- from refiner.utils import check_required_dependencies
40
39
  from refiner.worker.context import get_active_worker_token
41
40
  from refiner.worker.metrics.api import register_gauge
42
41
 
@@ -101,7 +100,6 @@ class LeRobotWriterSink(BaseSink):
101
100
  quantile_bins: int = 5000,
102
101
  force_recompute_video_stats: bool = False,
103
102
  ):
104
- check_required_dependencies("write_lerobot", ["av"], dist="robotics")
105
103
  self.output = DataFolder.resolve(output)
106
104
  self.data_files_size_in_mb = data_files_size_in_mb
107
105
  self.video_files_size_in_mb = video_files_size_in_mb
@@ -123,6 +121,9 @@ class LeRobotWriterSink(BaseSink):
123
121
  )
124
122
  self._episodes_in_flight_registered = False
125
123
 
124
+ def _declared_refiner_extras(self) -> tuple[str, ...]:
125
+ return ("video",)
126
+
126
127
  def write_shard_block(self, shard_id: str, block: Block) -> None:
127
128
  """Submit one async write task per episode row in the shard-local block."""
128
129
  if not self._episodes_in_flight_registered:
@@ -15,7 +15,6 @@ from refiner.pipeline.sinks.zarr import (
15
15
  _render_store_relpath,
16
16
  _zarr_store,
17
17
  )
18
- from refiner.utils import check_required_dependencies
19
18
  from refiner.worker.context import get_active_stage_index, get_finalized_workers
20
19
  from refiner.worker.lifecycle import sort_finalized_workers
21
20
 
@@ -30,7 +29,6 @@ class ZarrReducerSink(FileCleanupReducerSink):
30
29
  array_chunk_bytes: int = _DEFAULT_ARRAY_CHUNK_BYTES,
31
30
  reduce_to_single_store: bool = True,
32
31
  ) -> None:
33
- check_required_dependencies("write_zarr", ["zarr"], dist="zarr")
34
32
  super().__init__(
35
33
  output=output,
36
34
  filename_template=(
@@ -43,6 +41,9 @@ class ZarrReducerSink(FileCleanupReducerSink):
43
41
  self.array_chunk_bytes = array_chunk_bytes
44
42
  self.reduce_to_single_store = reduce_to_single_store
45
43
 
44
+ def _declared_refiner_extras(self) -> tuple[str, ...]:
45
+ return ("zarr",)
46
+
46
47
  def write_shard_block(self, shard_id: str, block: Block) -> None:
47
48
  self._run_cleanup()
48
49
  if self.reduce_to_single_store:
@@ -42,7 +42,6 @@ class ZarrSink(BaseSink):
42
42
  array_chunk_bytes: int = _DEFAULT_ARRAY_CHUNK_BYTES,
43
43
  reduce_to_single_store: bool = True,
44
44
  ):
45
- check_required_dependencies("write_zarr", ["zarr"], dist="zarr")
46
45
  if video_frame_batch_size <= 0:
47
46
  raise ValueError("video_frame_batch_size must be greater than zero")
48
47
  if array_chunk_bytes <= 0:
@@ -70,6 +69,9 @@ class ZarrSink(BaseSink):
70
69
  self._stores: dict[str, _ZarrWriteState] = {}
71
70
  self._default_arrays: dict[str, str] | None = None
72
71
 
72
+ def _declared_refiner_extras(self) -> tuple[str, ...]:
73
+ return ("zarr",)
74
+
73
75
  def write_shard_block(self, shard_id: str, block: Block) -> int:
74
76
  count = 0
75
77
  pending_arrays: dict[str, list[np.ndarray]] = {}
@@ -311,6 +313,7 @@ class ZarrSink(BaseSink):
311
313
  store = self._stores.get(relpath)
312
314
  if store is not None:
313
315
  return store
316
+ check_required_dependencies("write_zarr", ["zarr"], dist="zarr")
314
317
  import zarr
315
318
 
316
319
  store = _ZarrWriteState(
@@ -535,6 +538,7 @@ def _matching_length(lengths: list[int]) -> int | None:
535
538
 
536
539
 
537
540
  def _zarr_store(output: DataFolder, path: str = "", *, mode: str = "r"):
541
+ check_required_dependencies("write_zarr", ["zarr"], dist="zarr")
538
542
  import zarr
539
543
 
540
544
  return zarr.storage.FSStore(
@@ -47,6 +47,25 @@ class BaseSource(ABC):
47
47
  """Optional source metadata for planning/observability."""
48
48
  return {}
49
49
 
50
+ def required_refiner_extras(self) -> tuple[str, ...]:
51
+ """macrodata-refiner extras required by this source."""
52
+ return tuple(
53
+ sorted(
54
+ {
55
+ *self._declared_refiner_extras(),
56
+ *self._io_refiner_extras(),
57
+ }
58
+ )
59
+ )
60
+
61
+ def _declared_refiner_extras(self) -> tuple[str, ...]:
62
+ """Feature extras declared by this source."""
63
+ return ()
64
+
65
+ def _io_refiner_extras(self) -> tuple[str, ...]:
66
+ """Storage extras required by this source's normalized IO handles."""
67
+ return ()
68
+
50
69
 
51
70
  __all__ = ["BaseSource"]
52
71
 
@@ -117,6 +117,9 @@ class BaseReader(BaseSource):
117
117
  "file_path_column": self.file_path_column,
118
118
  }
119
119
 
120
+ def _io_refiner_extras(self) -> tuple[str, ...]:
121
+ return self.fileset.required_refiner_extras()
122
+
120
123
  def _with_file_path(
121
124
  self, row: dict[str, Any], source_file: DataFile
122
125
  ) -> dict[str, Any]:
@@ -119,6 +119,9 @@ class Hdf5Reader(BaseReader):
119
119
  )
120
120
  return description
121
121
 
122
+ def _declared_refiner_extras(self) -> tuple[str, ...]:
123
+ return ("hdf5",)
124
+
122
125
  def _validate_column_names(self) -> None:
123
126
  for name, path in self.datasets.items():
124
127
  if path.startswith("/"):