macrodata-refiner 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. macrodata_refiner-0.2.0/PKG-INFO +151 -0
  2. macrodata_refiner-0.2.0/README.md +121 -0
  3. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/pyproject.toml +5 -1
  4. macrodata_refiner-0.2.0/src/macrodata_refiner.egg-info/PKG-INFO +151 -0
  5. macrodata_refiner-0.2.0/src/macrodata_refiner.egg-info/SOURCES.txt +110 -0
  6. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/macrodata_refiner.egg-info/requires.txt +4 -0
  7. macrodata_refiner-0.2.0/src/refiner/__init__.py +57 -0
  8. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/refiner/cli/auth.py +18 -23
  9. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/refiner/cli/main.py +1 -1
  10. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/refiner/cli/ui.py +6 -4
  11. macrodata_refiner-0.2.0/src/refiner/execution/__init__.py +1 -0
  12. macrodata_refiner-0.2.0/src/refiner/execution/asyncio/runtime.py +112 -0
  13. macrodata_refiner-0.2.0/src/refiner/execution/asyncio/window.py +91 -0
  14. macrodata_refiner-0.1.0/src/refiner/runtime/execution/row_queue.py → macrodata_refiner-0.2.0/src/refiner/execution/buffer.py +3 -3
  15. {macrodata_refiner-0.1.0/src/refiner/runtime → macrodata_refiner-0.2.0/src/refiner}/execution/engine.py +60 -55
  16. macrodata_refiner-0.2.0/src/refiner/execution/operators/__init__.py +1 -0
  17. macrodata_refiner-0.2.0/src/refiner/execution/operators/row.py +186 -0
  18. macrodata_refiner-0.2.0/src/refiner/execution/operators/vectorized.py +148 -0
  19. macrodata_refiner-0.2.0/src/refiner/execution/tracking/__init__.py +1 -0
  20. macrodata_refiner-0.2.0/src/refiner/execution/tracking/shards.py +106 -0
  21. macrodata_refiner-0.2.0/src/refiner/io/__init__.py +9 -0
  22. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/refiner/io/datafile.py +25 -6
  23. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/refiner/io/datafolder.py +25 -9
  24. macrodata_refiner-0.2.0/src/refiner/io/fileset.py +254 -0
  25. macrodata_refiner-0.2.0/src/refiner/launchers/__init__.py +11 -0
  26. macrodata_refiner-0.2.0/src/refiner/launchers/base.py +185 -0
  27. macrodata_refiner-0.2.0/src/refiner/launchers/cloud.py +139 -0
  28. macrodata_refiner-0.2.0/src/refiner/launchers/local.py +321 -0
  29. macrodata_refiner-0.2.0/src/refiner/media/__init__.py +3 -0
  30. macrodata_refiner-0.2.0/src/refiner/media/video/__init__.py +3 -0
  31. macrodata_refiner-0.2.0/src/refiner/media/video/remux.py +239 -0
  32. macrodata_refiner-0.2.0/src/refiner/media/video/transcode.py +243 -0
  33. macrodata_refiner-0.2.0/src/refiner/media/video/types.py +23 -0
  34. macrodata_refiner-0.2.0/src/refiner/media/video/writer.py +250 -0
  35. macrodata_refiner-0.2.0/src/refiner/pipeline/__init__.py +31 -0
  36. macrodata_refiner-0.2.0/src/refiner/pipeline/data/block.py +129 -0
  37. {macrodata_refiner-0.1.0/src/refiner/sources → macrodata_refiner-0.2.0/src/refiner/pipeline/data}/row.py +125 -17
  38. macrodata_refiner-0.2.0/src/refiner/pipeline/data/shard.py +264 -0
  39. macrodata_refiner-0.2.0/src/refiner/pipeline/data/tabular.py +252 -0
  40. {macrodata_refiner-0.1.0/src/refiner → macrodata_refiner-0.2.0/src/refiner/pipeline}/expressions.py +33 -2
  41. macrodata_refiner-0.2.0/src/refiner/pipeline/pipeline.py +587 -0
  42. {macrodata_refiner-0.1.0/src/refiner/runtime → macrodata_refiner-0.2.0/src/refiner/pipeline}/planning.py +219 -17
  43. macrodata_refiner-0.2.0/src/refiner/pipeline/sinks/__init__.py +14 -0
  44. macrodata_refiner-0.2.0/src/refiner/pipeline/sinks/base.py +76 -0
  45. macrodata_refiner-0.2.0/src/refiner/pipeline/sinks/jsonl.py +81 -0
  46. macrodata_refiner-0.2.0/src/refiner/pipeline/sinks/lerobot.py +555 -0
  47. macrodata_refiner-0.2.0/src/refiner/pipeline/sinks/lerobot_reducer.py +276 -0
  48. macrodata_refiner-0.2.0/src/refiner/pipeline/sinks/parquet.py +78 -0
  49. macrodata_refiner-0.2.0/src/refiner/pipeline/sources/__init__.py +17 -0
  50. {macrodata_refiner-0.1.0/src/refiner → macrodata_refiner-0.2.0/src/refiner/pipeline}/sources/base.py +14 -16
  51. macrodata_refiner-0.2.0/src/refiner/pipeline/sources/items.py +70 -0
  52. macrodata_refiner-0.2.0/src/refiner/pipeline/sources/readers/__init__.py +15 -0
  53. macrodata_refiner-0.2.0/src/refiner/pipeline/sources/readers/base.py +264 -0
  54. macrodata_refiner-0.2.0/src/refiner/pipeline/sources/readers/csv.py +204 -0
  55. macrodata_refiner-0.2.0/src/refiner/pipeline/sources/readers/jsonl.py +88 -0
  56. macrodata_refiner-0.2.0/src/refiner/pipeline/sources/readers/lerobot.py +283 -0
  57. macrodata_refiner-0.2.0/src/refiner/pipeline/sources/readers/parquet.py +252 -0
  58. {macrodata_refiner-0.1.0/src/refiner → macrodata_refiner-0.2.0/src/refiner/pipeline}/sources/readers/utils.py +1 -11
  59. {macrodata_refiner-0.1.0/src/refiner → macrodata_refiner-0.2.0/src/refiner/pipeline}/sources/task.py +8 -9
  60. macrodata_refiner-0.1.0/src/refiner/processors/step.py → macrodata_refiner-0.2.0/src/refiner/pipeline/steps.py +67 -41
  61. macrodata_refiner-0.2.0/src/refiner/pipeline/utils/cache/decoder_cache.py +192 -0
  62. macrodata_refiner-0.2.0/src/refiner/pipeline/utils/cache/file_cache.py +233 -0
  63. macrodata_refiner-0.2.0/src/refiner/pipeline/utils/cache/lease_cache.py +276 -0
  64. macrodata_refiner-0.2.0/src/refiner/platform/__init__.py +1 -0
  65. macrodata_refiner-0.2.0/src/refiner/platform/client/__init__.py +56 -0
  66. macrodata_refiner-0.2.0/src/refiner/platform/client/api.py +271 -0
  67. {macrodata_refiner-0.1.0/src/refiner/platform → macrodata_refiner-0.2.0/src/refiner/platform/client}/http.py +9 -12
  68. macrodata_refiner-0.2.0/src/refiner/platform/client/models.py +191 -0
  69. {macrodata_refiner-0.1.0/src/refiner/platform/cloud → macrodata_refiner-0.2.0/src/refiner/platform/client}/serialize.py +1 -1
  70. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/refiner/platform/manifest.py +25 -5
  71. macrodata_refiner-0.2.0/src/refiner/py.typed +0 -0
  72. macrodata_refiner-0.2.0/src/refiner/robotics/__init__.py +25 -0
  73. macrodata_refiner-0.2.0/src/refiner/robotics/lerobot_format/__init__.py +49 -0
  74. macrodata_refiner-0.2.0/src/refiner/robotics/lerobot_format/metadata/__init__.py +41 -0
  75. macrodata_refiner-0.2.0/src/refiner/robotics/lerobot_format/metadata/info.py +250 -0
  76. macrodata_refiner-0.2.0/src/refiner/robotics/lerobot_format/metadata/metadata.py +32 -0
  77. macrodata_refiner-0.2.0/src/refiner/robotics/lerobot_format/metadata/stats.py +686 -0
  78. macrodata_refiner-0.2.0/src/refiner/robotics/lerobot_format/metadata/tasks.py +151 -0
  79. macrodata_refiner-0.2.0/src/refiner/robotics/lerobot_format/row.py +297 -0
  80. macrodata_refiner-0.2.0/src/refiner/robotics/lerobot_format/tabular.py +134 -0
  81. macrodata_refiner-0.2.0/src/refiner/robotics/motion.py +165 -0
  82. macrodata_refiner-0.2.0/src/refiner/worker/__init__.py +1 -0
  83. macrodata_refiner-0.2.0/src/refiner/worker/context.py +121 -0
  84. macrodata_refiner-0.2.0/src/refiner/worker/entrypoint.py +101 -0
  85. macrodata_refiner-0.2.0/src/refiner/worker/lifecycle/__init__.py +5 -0
  86. macrodata_refiner-0.2.0/src/refiner/worker/lifecycle/base.py +25 -0
  87. macrodata_refiner-0.2.0/src/refiner/worker/lifecycle/local/__init__.py +3 -0
  88. macrodata_refiner-0.2.0/src/refiner/worker/lifecycle/local/claim.py +147 -0
  89. macrodata_refiner-0.2.0/src/refiner/worker/lifecycle/local/files.py +41 -0
  90. macrodata_refiner-0.2.0/src/refiner/worker/lifecycle/local/lifecycle.py +308 -0
  91. macrodata_refiner-0.2.0/src/refiner/worker/lifecycle/platform.py +99 -0
  92. macrodata_refiner-0.2.0/src/refiner/worker/metrics/__init__.py +1 -0
  93. macrodata_refiner-0.1.0/src/refiner/metrics.py → macrodata_refiner-0.2.0/src/refiner/worker/metrics/api.py +38 -13
  94. macrodata_refiner-0.1.0/src/refiner/runtime/metrics_context.py → macrodata_refiner-0.2.0/src/refiner/worker/metrics/context.py +23 -19
  95. macrodata_refiner-0.1.0/src/refiner/platform/telemetry/emitter.py → macrodata_refiner-0.2.0/src/refiner/worker/metrics/otel.py +147 -50
  96. macrodata_refiner-0.2.0/src/refiner/worker/resources/__init__.py +1 -0
  97. macrodata_refiner-0.2.0/src/refiner/worker/resources/cpu.py +123 -0
  98. macrodata_refiner-0.2.0/src/refiner/worker/resources/memory.py +63 -0
  99. macrodata_refiner-0.2.0/src/refiner/worker/resources/network.py +27 -0
  100. macrodata_refiner-0.2.0/src/refiner/worker/runner.py +391 -0
  101. macrodata_refiner-0.2.0/src/refiner/worker/workdir.py +22 -0
  102. macrodata_refiner-0.2.0/tests/test_cache.py +175 -0
  103. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/tests/test_expressions.py +6 -0
  104. macrodata_refiner-0.1.0/PKG-INFO +0 -25
  105. macrodata_refiner-0.1.0/src/macrodata_refiner.egg-info/PKG-INFO +0 -25
  106. macrodata_refiner-0.1.0/src/macrodata_refiner.egg-info/SOURCES.txt +0 -76
  107. macrodata_refiner-0.1.0/src/refiner/__init__.py +0 -65
  108. macrodata_refiner-0.1.0/src/refiner/io/__init__.py +0 -9
  109. macrodata_refiner-0.1.0/src/refiner/io/fileset.py +0 -172
  110. macrodata_refiner-0.1.0/src/refiner/ledger/__init__.py +0 -10
  111. macrodata_refiner-0.1.0/src/refiner/ledger/backend/__init__.py +0 -10
  112. macrodata_refiner-0.1.0/src/refiner/ledger/backend/base.py +0 -60
  113. macrodata_refiner-0.1.0/src/refiner/ledger/backend/cloud.py +0 -85
  114. macrodata_refiner-0.1.0/src/refiner/ledger/backend/fs.py +0 -267
  115. macrodata_refiner-0.1.0/src/refiner/ledger/config.py +0 -49
  116. macrodata_refiner-0.1.0/src/refiner/ledger/policy.py +0 -144
  117. macrodata_refiner-0.1.0/src/refiner/ledger/shard.py +0 -126
  118. macrodata_refiner-0.1.0/src/refiner/pipeline.py +0 -381
  119. macrodata_refiner-0.1.0/src/refiner/platform/__init__.py +0 -28
  120. macrodata_refiner-0.1.0/src/refiner/platform/client.py +0 -303
  121. macrodata_refiner-0.1.0/src/refiner/platform/cloud/__init__.py +0 -1
  122. macrodata_refiner-0.1.0/src/refiner/platform/cloud/models.py +0 -73
  123. macrodata_refiner-0.1.0/src/refiner/platform/config.py +0 -14
  124. macrodata_refiner-0.1.0/src/refiner/platform/telemetry/__init__.py +0 -5
  125. macrodata_refiner-0.1.0/src/refiner/platform/telemetry/metric_helpers.py +0 -117
  126. macrodata_refiner-0.1.0/src/refiner/processors/__init__.py +0 -21
  127. macrodata_refiner-0.1.0/src/refiner/runtime/__init__.py +0 -1
  128. macrodata_refiner-0.1.0/src/refiner/runtime/errors.py +0 -8
  129. macrodata_refiner-0.1.0/src/refiner/runtime/execution/__init__.py +0 -19
  130. macrodata_refiner-0.1.0/src/refiner/runtime/execution/row_steps.py +0 -104
  131. macrodata_refiner-0.1.0/src/refiner/runtime/execution/vectorized.py +0 -114
  132. macrodata_refiner-0.1.0/src/refiner/runtime/launchers/__init__.py +0 -11
  133. macrodata_refiner-0.1.0/src/refiner/runtime/launchers/base.py +0 -152
  134. macrodata_refiner-0.1.0/src/refiner/runtime/launchers/cloud.py +0 -92
  135. macrodata_refiner-0.1.0/src/refiner/runtime/launchers/local.py +0 -294
  136. macrodata_refiner-0.1.0/src/refiner/runtime/resources/__init__.py +0 -10
  137. macrodata_refiner-0.1.0/src/refiner/runtime/resources/cpu.py +0 -52
  138. macrodata_refiner-0.1.0/src/refiner/runtime/resources/memory.py +0 -39
  139. macrodata_refiner-0.1.0/src/refiner/runtime/types.py +0 -16
  140. macrodata_refiner-0.1.0/src/refiner/runtime/worker/__init__.py +0 -3
  141. macrodata_refiner-0.1.0/src/refiner/runtime/worker/entrypoint.py +0 -142
  142. macrodata_refiner-0.1.0/src/refiner/runtime/worker/runner.py +0 -242
  143. macrodata_refiner-0.1.0/src/refiner/sources/__init__.py +0 -23
  144. macrodata_refiner-0.1.0/src/refiner/sources/items.py +0 -69
  145. macrodata_refiner-0.1.0/src/refiner/sources/readers/__init__.py +0 -17
  146. macrodata_refiner-0.1.0/src/refiner/sources/readers/base.py +0 -146
  147. macrodata_refiner-0.1.0/src/refiner/sources/readers/csv.py +0 -307
  148. macrodata_refiner-0.1.0/src/refiner/sources/readers/jsonl.py +0 -111
  149. macrodata_refiner-0.1.0/src/refiner/sources/readers/parquet.py +0 -228
  150. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/LICENSE +0 -0
  151. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/setup.cfg +0 -0
  152. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/macrodata_refiner.egg-info/dependency_links.txt +0 -0
  153. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/macrodata_refiner.egg-info/entry_points.txt +0 -0
  154. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/macrodata_refiner.egg-info/top_level.txt +0 -0
  155. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/refiner/cli/__init__.py +0 -0
  156. /macrodata_refiner-0.1.0/README.md → /macrodata_refiner-0.2.0/src/refiner/pipeline/utils/__init__.py +0 -0
  157. /macrodata_refiner-0.1.0/src/refiner/py.typed → /macrodata_refiner-0.2.0/src/refiner/pipeline/utils/cache/__init__.py +0 -0
  158. {macrodata_refiner-0.1.0 → macrodata_refiner-0.2.0}/src/refiner/platform/auth.py +0 -0
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.4
2
+ Name: macrodata-refiner
3
+ Version: 0.2.0
4
+ Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
5
+ Author: Macrodata Labs
6
+ License-Expression: Apache-2.0
7
+ Classifier: Operating System :: OS Independent
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: av
16
+ Requires-Dist: cloudpickle==3.1.2
17
+ Requires-Dist: fsspec
18
+ Requires-Dist: httpx
19
+ Requires-Dist: loguru
20
+ Requires-Dist: huggingface-hub>=1.4.1
21
+ Requires-Dist: opentelemetry-exporter-otlp-proto-http
22
+ Requires-Dist: opentelemetry-sdk
23
+ Requires-Dist: numpy
24
+ Requires-Dist: psutil
25
+ Requires-Dist: orjson
26
+ Requires-Dist: pyarrow
27
+ Requires-Dist: msgspec>=0.20.0
28
+ Requires-Dist: hf>=1.7.1
29
+ Dynamic: license-file
30
+
31
+ <p align="center">
32
+ <img src="https://macrodata.co/logo.svg" alt="Macrodata" width="180">
33
+ </p>
34
+
35
+ <h1 align="center">Macrodata Refiner</h1>
36
+
37
+ Refiner is an open-source engine for turning raw, unstructured, and multimodal data into **high-quality datasets** for large model training.
38
+
39
+ It replaces the brittle scripts and stitched-together data tooling that teams still use for training data work, while offering much better support for multimodal data, robotics workflows, and model-based processing.
40
+
41
+ It also plugs into the Macrodata platform, which gives you visibility into what is happening to your data while pipelines run: job and shard lifecycle, logs, metrics, manifests, and pipeline behavior. The same code can run locally for development and then scale out through Macrodata's elastic serverless cloud.
42
+
43
+ ## Quickstart
44
+
45
+ Install:
46
+
47
+ ```bash
48
+ pip install macrodata-refiner
49
+ ```
50
+
51
+ Create a Macrodata API key:
52
+
53
+ - https://macrodata.co/settings/api-keys
54
+
55
+ Log in:
56
+
57
+ ```bash
58
+ macrodata login
59
+ ```
60
+
61
+ ### Cloud example
62
+
63
+ Launch a robotics pipeline on Macrodata Cloud.
64
+
65
+ This requires a valid API key.
66
+
67
+ ```python
68
+ import refiner as mdr
69
+
70
+ (
71
+ mdr.read_lerobot("hf://datasets/macrodata/aloha_static_battery_ep005_009")
72
+ .map(
73
+ mdr.robotics.motion_trim(
74
+ threshold=0.001,
75
+ pad_frames=5,
76
+ )
77
+ )
78
+ .write_lerobot("hf://buckets/macrodata/test_bucket/aloha_motion")
79
+ .launch_cloud(
80
+ name="motion_trim",
81
+ num_workers=4,
82
+ )
83
+ )
84
+ ```
85
+
86
+ ### Local example
87
+
88
+ Launch a local pipeline:
89
+
90
+ ```python
91
+ import refiner as mdr
92
+
93
+ def add_preview(row):
94
+ return row.update(
95
+ preview=" ".join(row["text"].split()[:20]),
96
+ )
97
+
98
+ (
99
+ mdr.read_jsonl("input/*.jsonl")
100
+ .filter(mdr.col("lang") == "en")
101
+ .with_columns(
102
+ text=mdr.col("text").str.strip(),
103
+ text_len=mdr.col("text").str.len(),
104
+ )
105
+ .map(add_preview)
106
+ .write_parquet("s3://my-bucket/english-cleanup/")
107
+ .launch_local(
108
+ name="english-cleanup",
109
+ num_workers=2,
110
+ )
111
+ )
112
+ ```
113
+
114
+ `pip install` gives you:
115
+
116
+ - the Python package as `refiner`
117
+ - the CLI as `macrodata`
118
+
119
+ ## Batteries included
120
+
121
+ - training-data-first pipeline primitives instead of generic ETL abstractions
122
+ - multimodal processing, with robotics support today
123
+ - a lot of built-in readers, transforms, sinks, and lifecycle/runtime machinery so you do not have to rebuild the same scaffolding in scripts
124
+ - access to any storage backend supported by `fsspec` (S3, GCS, Hugging Face, etc.)
125
+ - local execution for development and elastic cloud execution for large runs
126
+ - built-in observability through the Macrodata platform, so you can inspect how your data is changing instead of debugging blindly after the fact
127
+
128
+ ## Docs
129
+
130
+ Getting started:
131
+
132
+ - [Pipeline basics](docs/pipeline-basics.md)
133
+ - [Launchers](docs/launchers.md)
134
+ - [CLI](docs/cli.md)
135
+
136
+ Core concepts:
137
+
138
+ - [Reading and writing data](docs/reading-and-writing.md)
139
+ - [Transforms](docs/transforms.md)
140
+ - [Expressions](docs/expressions.md)
141
+ - [In-process debugging](docs/in-process-debugging.md)
142
+ - [Task pipelines](docs/task-pipelines.md)
143
+
144
+ Modalities and platform:
145
+
146
+ - [Robotics](docs/robotics.md)
147
+ - [Observability](docs/observability.md)
148
+
149
+ ## Community
150
+
151
+ - join the Macrodata Discord: https://discord.gg/S8kZtmBR2x
@@ -0,0 +1,121 @@
1
+ <p align="center">
2
+ <img src="https://macrodata.co/logo.svg" alt="Macrodata" width="180">
3
+ </p>
4
+
5
+ <h1 align="center">Macrodata Refiner</h1>
6
+
7
+ Refiner is an open-source engine for turning raw, unstructured, and multimodal data into **high-quality datasets** for large model training.
8
+
9
+ It replaces the brittle scripts and stitched-together data tooling that teams still use for training data work, while offering much better support for multimodal data, robotics workflows, and model-based processing.
10
+
11
+ It also plugs into the Macrodata platform, which gives you visibility into what is happening to your data while pipelines run: job and shard lifecycle, logs, metrics, manifests, and pipeline behavior. The same code can run locally for development and then scale out through Macrodata's elastic serverless cloud.
12
+
13
+ ## Quickstart
14
+
15
+ Install:
16
+
17
+ ```bash
18
+ pip install macrodata-refiner
19
+ ```
20
+
21
+ Create a Macrodata API key:
22
+
23
+ - https://macrodata.co/settings/api-keys
24
+
25
+ Log in:
26
+
27
+ ```bash
28
+ macrodata login
29
+ ```
30
+
31
+ ### Cloud example
32
+
33
+ Launch a robotics pipeline on Macrodata Cloud.
34
+
35
+ This requires a valid API key.
36
+
37
+ ```python
38
+ import refiner as mdr
39
+
40
+ (
41
+ mdr.read_lerobot("hf://datasets/macrodata/aloha_static_battery_ep005_009")
42
+ .map(
43
+ mdr.robotics.motion_trim(
44
+ threshold=0.001,
45
+ pad_frames=5,
46
+ )
47
+ )
48
+ .write_lerobot("hf://buckets/macrodata/test_bucket/aloha_motion")
49
+ .launch_cloud(
50
+ name="motion_trim",
51
+ num_workers=4,
52
+ )
53
+ )
54
+ ```
55
+
56
+ ### Local example
57
+
58
+ Launch a local pipeline:
59
+
60
+ ```python
61
+ import refiner as mdr
62
+
63
+ def add_preview(row):
64
+ return row.update(
65
+ preview=" ".join(row["text"].split()[:20]),
66
+ )
67
+
68
+ (
69
+ mdr.read_jsonl("input/*.jsonl")
70
+ .filter(mdr.col("lang") == "en")
71
+ .with_columns(
72
+ text=mdr.col("text").str.strip(),
73
+ text_len=mdr.col("text").str.len(),
74
+ )
75
+ .map(add_preview)
76
+ .write_parquet("s3://my-bucket/english-cleanup/")
77
+ .launch_local(
78
+ name="english-cleanup",
79
+ num_workers=2,
80
+ )
81
+ )
82
+ ```
83
+
84
+ `pip install` gives you:
85
+
86
+ - the Python package as `refiner`
87
+ - the CLI as `macrodata`
88
+
89
+ ## Batteries included
90
+
91
+ - training-data-first pipeline primitives instead of generic ETL abstractions
92
+ - multimodal processing, with robotics support today
93
+ - a lot of built-in readers, transforms, sinks, and lifecycle/runtime machinery so you do not have to rebuild the same scaffolding in scripts
94
+ - access to any storage backend supported by `fsspec` (S3, GCS, Hugging Face, etc.)
95
+ - local execution for development and elastic cloud execution for large runs
96
+ - built-in observability through the Macrodata platform, so you can inspect how your data is changing instead of debugging blindly after the fact
97
+
98
+ ## Docs
99
+
100
+ Getting started:
101
+
102
+ - [Pipeline basics](docs/pipeline-basics.md)
103
+ - [Launchers](docs/launchers.md)
104
+ - [CLI](docs/cli.md)
105
+
106
+ Core concepts:
107
+
108
+ - [Reading and writing data](docs/reading-and-writing.md)
109
+ - [Transforms](docs/transforms.md)
110
+ - [Expressions](docs/expressions.md)
111
+ - [In-process debugging](docs/in-process-debugging.md)
112
+ - [Task pipelines](docs/task-pipelines.md)
113
+
114
+ Modalities and platform:
115
+
116
+ - [Robotics](docs/robotics.md)
117
+ - [Observability](docs/observability.md)
118
+
119
+ ## Community
120
+
121
+ - join the Macrodata Discord: https://discord.gg/S8kZtmBR2x
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "macrodata-refiner"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  description = "Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets"
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -16,6 +16,7 @@ authors = [
16
16
  ]
17
17
  requires-python = ">=3.10"
18
18
  dependencies = [
19
+ "av",
19
20
  "cloudpickle==3.1.2",
20
21
  "fsspec",
21
22
  "httpx",
@@ -23,9 +24,12 @@ dependencies = [
23
24
  "huggingface-hub>=1.4.1",
24
25
  "opentelemetry-exporter-otlp-proto-http",
25
26
  "opentelemetry-sdk",
27
+ "numpy",
26
28
  "psutil",
27
29
  "orjson",
28
30
  "pyarrow",
31
+ "msgspec>=0.20.0",
32
+ "hf>=1.7.1",
29
33
  ]
30
34
 
31
35
  [project.scripts]
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.4
2
+ Name: macrodata-refiner
3
+ Version: 0.2.0
4
+ Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
5
+ Author: Macrodata Labs
6
+ License-Expression: Apache-2.0
7
+ Classifier: Operating System :: OS Independent
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: av
16
+ Requires-Dist: cloudpickle==3.1.2
17
+ Requires-Dist: fsspec
18
+ Requires-Dist: httpx
19
+ Requires-Dist: loguru
20
+ Requires-Dist: huggingface-hub>=1.4.1
21
+ Requires-Dist: opentelemetry-exporter-otlp-proto-http
22
+ Requires-Dist: opentelemetry-sdk
23
+ Requires-Dist: numpy
24
+ Requires-Dist: psutil
25
+ Requires-Dist: orjson
26
+ Requires-Dist: pyarrow
27
+ Requires-Dist: msgspec>=0.20.0
28
+ Requires-Dist: hf>=1.7.1
29
+ Dynamic: license-file
30
+
31
+ <p align="center">
32
+ <img src="https://macrodata.co/logo.svg" alt="Macrodata" width="180">
33
+ </p>
34
+
35
+ <h1 align="center">Macrodata Refiner</h1>
36
+
37
+ Refiner is an open-source engine for turning raw, unstructured, and multimodal data into **high-quality datasets** for large model training.
38
+
39
+ It replaces the brittle scripts and stitched-together data tooling that teams still use for training data work, while offering much better support for multimodal data, robotics workflows, and model-based processing.
40
+
41
+ It also plugs into the Macrodata platform, which gives you visibility into what is happening to your data while pipelines run: job and shard lifecycle, logs, metrics, manifests, and pipeline behavior. The same code can run locally for development and then scale out through Macrodata's elastic serverless cloud.
42
+
43
+ ## Quickstart
44
+
45
+ Install:
46
+
47
+ ```bash
48
+ pip install macrodata-refiner
49
+ ```
50
+
51
+ Create a Macrodata API key:
52
+
53
+ - https://macrodata.co/settings/api-keys
54
+
55
+ Log in:
56
+
57
+ ```bash
58
+ macrodata login
59
+ ```
60
+
61
+ ### Cloud example
62
+
63
+ Launch a robotics pipeline on Macrodata Cloud.
64
+
65
+ This requires a valid API key.
66
+
67
+ ```python
68
+ import refiner as mdr
69
+
70
+ (
71
+ mdr.read_lerobot("hf://datasets/macrodata/aloha_static_battery_ep005_009")
72
+ .map(
73
+ mdr.robotics.motion_trim(
74
+ threshold=0.001,
75
+ pad_frames=5,
76
+ )
77
+ )
78
+ .write_lerobot("hf://buckets/macrodata/test_bucket/aloha_motion")
79
+ .launch_cloud(
80
+ name="motion_trim",
81
+ num_workers=4,
82
+ )
83
+ )
84
+ ```
85
+
86
+ ### Local example
87
+
88
+ Launch a local pipeline:
89
+
90
+ ```python
91
+ import refiner as mdr
92
+
93
+ def add_preview(row):
94
+ return row.update(
95
+ preview=" ".join(row["text"].split()[:20]),
96
+ )
97
+
98
+ (
99
+ mdr.read_jsonl("input/*.jsonl")
100
+ .filter(mdr.col("lang") == "en")
101
+ .with_columns(
102
+ text=mdr.col("text").str.strip(),
103
+ text_len=mdr.col("text").str.len(),
104
+ )
105
+ .map(add_preview)
106
+ .write_parquet("s3://my-bucket/english-cleanup/")
107
+ .launch_local(
108
+ name="english-cleanup",
109
+ num_workers=2,
110
+ )
111
+ )
112
+ ```
113
+
114
+ `pip install` gives you:
115
+
116
+ - the Python package as `refiner`
117
+ - the CLI as `macrodata`
118
+
119
+ ## Batteries included
120
+
121
+ - training-data-first pipeline primitives instead of generic ETL abstractions
122
+ - multimodal processing, with robotics support today
123
+ - a lot of built-in readers, transforms, sinks, and lifecycle/runtime machinery so you do not have to rebuild the same scaffolding in scripts
124
+ - access to any storage backend supported by `fsspec` (S3, GCS, Hugging Face, etc.)
125
+ - local execution for development and elastic cloud execution for large runs
126
+ - built-in observability through the Macrodata platform, so you can inspect how your data is changing instead of debugging blindly after the fact
127
+
128
+ ## Docs
129
+
130
+ Getting started:
131
+
132
+ - [Pipeline basics](docs/pipeline-basics.md)
133
+ - [Launchers](docs/launchers.md)
134
+ - [CLI](docs/cli.md)
135
+
136
+ Core concepts:
137
+
138
+ - [Reading and writing data](docs/reading-and-writing.md)
139
+ - [Transforms](docs/transforms.md)
140
+ - [Expressions](docs/expressions.md)
141
+ - [In-process debugging](docs/in-process-debugging.md)
142
+ - [Task pipelines](docs/task-pipelines.md)
143
+
144
+ Modalities and platform:
145
+
146
+ - [Robotics](docs/robotics.md)
147
+ - [Observability](docs/observability.md)
148
+
149
+ ## Community
150
+
151
+ - join the Macrodata Discord: https://discord.gg/S8kZtmBR2x
@@ -0,0 +1,110 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/macrodata_refiner.egg-info/PKG-INFO
5
+ src/macrodata_refiner.egg-info/SOURCES.txt
6
+ src/macrodata_refiner.egg-info/dependency_links.txt
7
+ src/macrodata_refiner.egg-info/entry_points.txt
8
+ src/macrodata_refiner.egg-info/requires.txt
9
+ src/macrodata_refiner.egg-info/top_level.txt
10
+ src/refiner/__init__.py
11
+ src/refiner/py.typed
12
+ src/refiner/cli/__init__.py
13
+ src/refiner/cli/auth.py
14
+ src/refiner/cli/main.py
15
+ src/refiner/cli/ui.py
16
+ src/refiner/execution/__init__.py
17
+ src/refiner/execution/buffer.py
18
+ src/refiner/execution/engine.py
19
+ src/refiner/execution/asyncio/runtime.py
20
+ src/refiner/execution/asyncio/window.py
21
+ src/refiner/execution/operators/__init__.py
22
+ src/refiner/execution/operators/row.py
23
+ src/refiner/execution/operators/vectorized.py
24
+ src/refiner/execution/tracking/__init__.py
25
+ src/refiner/execution/tracking/shards.py
26
+ src/refiner/io/__init__.py
27
+ src/refiner/io/datafile.py
28
+ src/refiner/io/datafolder.py
29
+ src/refiner/io/fileset.py
30
+ src/refiner/launchers/__init__.py
31
+ src/refiner/launchers/base.py
32
+ src/refiner/launchers/cloud.py
33
+ src/refiner/launchers/local.py
34
+ src/refiner/media/__init__.py
35
+ src/refiner/media/video/__init__.py
36
+ src/refiner/media/video/remux.py
37
+ src/refiner/media/video/transcode.py
38
+ src/refiner/media/video/types.py
39
+ src/refiner/media/video/writer.py
40
+ src/refiner/pipeline/__init__.py
41
+ src/refiner/pipeline/expressions.py
42
+ src/refiner/pipeline/pipeline.py
43
+ src/refiner/pipeline/planning.py
44
+ src/refiner/pipeline/steps.py
45
+ src/refiner/pipeline/data/block.py
46
+ src/refiner/pipeline/data/row.py
47
+ src/refiner/pipeline/data/shard.py
48
+ src/refiner/pipeline/data/tabular.py
49
+ src/refiner/pipeline/sinks/__init__.py
50
+ src/refiner/pipeline/sinks/base.py
51
+ src/refiner/pipeline/sinks/jsonl.py
52
+ src/refiner/pipeline/sinks/lerobot.py
53
+ src/refiner/pipeline/sinks/lerobot_reducer.py
54
+ src/refiner/pipeline/sinks/parquet.py
55
+ src/refiner/pipeline/sources/__init__.py
56
+ src/refiner/pipeline/sources/base.py
57
+ src/refiner/pipeline/sources/items.py
58
+ src/refiner/pipeline/sources/task.py
59
+ src/refiner/pipeline/sources/readers/__init__.py
60
+ src/refiner/pipeline/sources/readers/base.py
61
+ src/refiner/pipeline/sources/readers/csv.py
62
+ src/refiner/pipeline/sources/readers/jsonl.py
63
+ src/refiner/pipeline/sources/readers/lerobot.py
64
+ src/refiner/pipeline/sources/readers/parquet.py
65
+ src/refiner/pipeline/sources/readers/utils.py
66
+ src/refiner/pipeline/utils/__init__.py
67
+ src/refiner/pipeline/utils/cache/__init__.py
68
+ src/refiner/pipeline/utils/cache/decoder_cache.py
69
+ src/refiner/pipeline/utils/cache/file_cache.py
70
+ src/refiner/pipeline/utils/cache/lease_cache.py
71
+ src/refiner/platform/__init__.py
72
+ src/refiner/platform/auth.py
73
+ src/refiner/platform/manifest.py
74
+ src/refiner/platform/client/__init__.py
75
+ src/refiner/platform/client/api.py
76
+ src/refiner/platform/client/http.py
77
+ src/refiner/platform/client/models.py
78
+ src/refiner/platform/client/serialize.py
79
+ src/refiner/robotics/__init__.py
80
+ src/refiner/robotics/motion.py
81
+ src/refiner/robotics/lerobot_format/__init__.py
82
+ src/refiner/robotics/lerobot_format/row.py
83
+ src/refiner/robotics/lerobot_format/tabular.py
84
+ src/refiner/robotics/lerobot_format/metadata/__init__.py
85
+ src/refiner/robotics/lerobot_format/metadata/info.py
86
+ src/refiner/robotics/lerobot_format/metadata/metadata.py
87
+ src/refiner/robotics/lerobot_format/metadata/stats.py
88
+ src/refiner/robotics/lerobot_format/metadata/tasks.py
89
+ src/refiner/worker/__init__.py
90
+ src/refiner/worker/context.py
91
+ src/refiner/worker/entrypoint.py
92
+ src/refiner/worker/runner.py
93
+ src/refiner/worker/workdir.py
94
+ src/refiner/worker/lifecycle/__init__.py
95
+ src/refiner/worker/lifecycle/base.py
96
+ src/refiner/worker/lifecycle/platform.py
97
+ src/refiner/worker/lifecycle/local/__init__.py
98
+ src/refiner/worker/lifecycle/local/claim.py
99
+ src/refiner/worker/lifecycle/local/files.py
100
+ src/refiner/worker/lifecycle/local/lifecycle.py
101
+ src/refiner/worker/metrics/__init__.py
102
+ src/refiner/worker/metrics/api.py
103
+ src/refiner/worker/metrics/context.py
104
+ src/refiner/worker/metrics/otel.py
105
+ src/refiner/worker/resources/__init__.py
106
+ src/refiner/worker/resources/cpu.py
107
+ src/refiner/worker/resources/memory.py
108
+ src/refiner/worker/resources/network.py
109
+ tests/test_cache.py
110
+ tests/test_expressions.py
@@ -1,3 +1,4 @@
1
+ av
1
2
  cloudpickle==3.1.2
2
3
  fsspec
3
4
  httpx
@@ -5,6 +6,9 @@ loguru
5
6
  huggingface-hub>=1.4.1
6
7
  opentelemetry-exporter-otlp-proto-http
7
8
  opentelemetry-sdk
9
+ numpy
8
10
  psutil
9
11
  orjson
10
12
  pyarrow
13
+ msgspec>=0.20.0
14
+ hf>=1.7.1
@@ -0,0 +1,57 @@
1
+ import refiner.robotics as robotics
2
+ from refiner.io import DataFile, DataFileSet, DataFolder
3
+ from refiner.launchers import LaunchStats, LocalLauncher
4
+ from refiner.media import VideoFile
5
+ from refiner.pipeline import (
6
+ RefinerPipeline,
7
+ Row,
8
+ Shard,
9
+ from_items,
10
+ from_source,
11
+ read_csv,
12
+ read_jsonl,
13
+ read_lerobot,
14
+ read_parquet,
15
+ task,
16
+ )
17
+ from refiner.pipeline.expressions import coalesce, col, if_else, lit
18
+ from refiner.worker.metrics.api import (
19
+ log_gauge,
20
+ log_gauges,
21
+ log_histogram,
22
+ log_throughput,
23
+ register_gauge,
24
+ )
25
+ from refiner.worker.runner import Worker, WorkerRunStats
26
+
27
+ __all__ = [
28
+ "RefinerPipeline",
29
+ "LocalLauncher",
30
+ "LaunchStats",
31
+ "DataFile",
32
+ "DataFolder",
33
+ "DataFileSet",
34
+ "Shard",
35
+ "Row",
36
+ "Worker",
37
+ "WorkerRunStats",
38
+ "read_csv",
39
+ "read_jsonl",
40
+ "read_lerobot",
41
+ "read_parquet",
42
+ "from_items",
43
+ "from_source",
44
+ "task",
45
+ "log_throughput",
46
+ "log_gauge",
47
+ "log_gauges",
48
+ "register_gauge",
49
+ "log_histogram",
50
+ "col",
51
+ "lit",
52
+ "coalesce",
53
+ "if_else",
54
+ "VideoFile",
55
+ "Video",
56
+ "robotics",
57
+ ]