macrodata-refiner 0.2.1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- macrodata_refiner-0.3.0/PKG-INFO +196 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/README.md +23 -18
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/pyproject.toml +63 -9
- macrodata_refiner-0.3.0/src/macrodata_refiner.egg-info/PKG-INFO +196 -0
- macrodata_refiner-0.3.0/src/macrodata_refiner.egg-info/SOURCES.txt +180 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/macrodata_refiner.egg-info/entry_points.txt +1 -0
- macrodata_refiner-0.3.0/src/macrodata_refiner.egg-info/requires.txt +66 -0
- macrodata_refiner-0.3.0/src/refiner/__init__.py +89 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/cli/auth.py +13 -13
- macrodata_refiner-0.3.0/src/refiner/cli/commands/__init__.py +1 -0
- macrodata_refiner-0.2.1/src/refiner/cli/main.py → macrodata_refiner-0.3.0/src/refiner/cli/commands/auth.py +3 -21
- macrodata_refiner-0.3.0/src/refiner/cli/commands/jobs.py +194 -0
- macrodata_refiner-0.3.0/src/refiner/cli/commands/run.py +42 -0
- macrodata_refiner-0.3.0/src/refiner/cli/commands/secrets.py +53 -0
- macrodata_refiner-0.3.0/src/refiner/cli/common.py +66 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/__init__.py +1 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/attach.py +49 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/common.py +161 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/control.py +30 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/follow.py +299 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/get.py +237 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/list.py +86 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/logs.py +574 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/manifest.py +158 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/metrics.py +346 -0
- macrodata_refiner-0.3.0/src/refiner/cli/jobs/workers.py +87 -0
- macrodata_refiner-0.3.0/src/refiner/cli/main.py +34 -0
- macrodata_refiner-0.3.0/src/refiner/cli/run/__init__.py +1 -0
- macrodata_refiner-0.3.0/src/refiner/cli/run/cloud.py +575 -0
- macrodata_refiner-0.3.0/src/refiner/cli/run/command.py +92 -0
- macrodata_refiner-0.3.0/src/refiner/cli/run/local.py +343 -0
- macrodata_refiner-0.3.0/src/refiner/cli/run/modes.py +69 -0
- macrodata_refiner-0.3.0/src/refiner/cli/secrets.py +105 -0
- macrodata_refiner-0.3.0/src/refiner/cli/ui/__init__.py +15 -0
- macrodata_refiner-0.3.0/src/refiner/cli/ui/console.py +943 -0
- macrodata_refiner-0.2.1/src/refiner/cli/ui.py → macrodata_refiner-0.3.0/src/refiner/cli/ui/terminal.py +7 -0
- macrodata_refiner-0.3.0/src/refiner/execution/asyncio/__init__.py +1 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/execution/asyncio/runtime.py +1 -3
- macrodata_refiner-0.3.0/src/refiner/execution/asyncio/window.py +119 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/execution/engine.py +158 -14
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/execution/operators/row.py +2 -2
- macrodata_refiner-0.3.0/src/refiner/execution/operators/vectorized.py +234 -0
- macrodata_refiner-0.3.0/src/refiner/inference/__init__.py +82 -0
- macrodata_refiner-0.3.0/src/refiner/inference/capabilities.py +246 -0
- macrodata_refiner-0.3.0/src/refiner/inference/generate_pooling.py +49 -0
- macrodata_refiner-0.3.0/src/refiner/inference/generate_text.py +241 -0
- macrodata_refiner-0.3.0/src/refiner/inference/internal/__init__.py +1 -0
- macrodata_refiner-0.3.0/src/refiner/inference/internal/media.py +133 -0
- macrodata_refiner-0.3.0/src/refiner/inference/internal/message_conversion.py +45 -0
- macrodata_refiner-0.3.0/src/refiner/inference/internal/response.py +70 -0
- macrodata_refiner-0.3.0/src/refiner/inference/internal/runtime.py +177 -0
- macrodata_refiner-0.3.0/src/refiner/inference/internal/schema.py +71 -0
- macrodata_refiner-0.3.0/src/refiner/inference/internal/transport.py +380 -0
- macrodata_refiner-0.3.0/src/refiner/inference/internal/usage.py +31 -0
- macrodata_refiner-0.3.0/src/refiner/inference/providers/__init__.py +15 -0
- macrodata_refiner-0.3.0/src/refiner/inference/providers/anthropic.py +694 -0
- macrodata_refiner-0.3.0/src/refiner/inference/providers/base.py +138 -0
- macrodata_refiner-0.3.0/src/refiner/inference/providers/google.py +787 -0
- macrodata_refiner-0.3.0/src/refiner/inference/providers/openai.py +1242 -0
- macrodata_refiner-0.3.0/src/refiner/inference/providers/warnings.py +55 -0
- macrodata_refiner-0.3.0/src/refiner/inference/types.py +342 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/io/datafile.py +67 -1
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/io/datafolder.py +65 -7
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/io/fileset.py +136 -52
- macrodata_refiner-0.3.0/src/refiner/job_urls.py +16 -0
- macrodata_refiner-0.3.0/src/refiner/launchers/base.py +104 -0
- macrodata_refiner-0.3.0/src/refiner/launchers/cloud.py +372 -0
- macrodata_refiner-0.3.0/src/refiner/launchers/local.py +516 -0
- macrodata_refiner-0.3.0/src/refiner/launchers/secrets.py +153 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/__init__.py +55 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/data/block.py +9 -3
- macrodata_refiner-0.3.0/src/refiner/pipeline/data/datatype.py +409 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/data/shard.py +10 -2
- macrodata_refiner-0.3.0/src/refiner/pipeline/data/tabular.py +428 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/expressions.py +86 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/pipeline.py +1274 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/planning.py +90 -40
- macrodata_refiner-0.3.0/src/refiner/pipeline/resources.py +48 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/sinks/__init__.py +4 -3
- macrodata_refiner-0.3.0/src/refiner/pipeline/sinks/assets.py +430 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/sinks/base.py +37 -7
- macrodata_refiner-0.3.0/src/refiner/pipeline/sinks/jsonl.py +147 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/sinks/lerobot.py +168 -47
- macrodata_refiner-0.3.0/src/refiner/pipeline/sinks/parquet.py +146 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sinks/reducer/__init__.py +9 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sinks/reducer/file.py +180 -0
- macrodata_refiner-0.2.1/src/refiner/pipeline/sinks/lerobot_reducer.py → macrodata_refiner-0.3.0/src/refiner/pipeline/sinks/reducer/lerobot.py +29 -19
- macrodata_refiner-0.3.0/src/refiner/pipeline/sinks/reducer/zarr.py +281 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sinks/zarr.py +602 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/sources/__init__.py +16 -2
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/sources/base.py +4 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/__init__.py +29 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/sources/readers/base.py +57 -3
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/sources/readers/csv.py +32 -5
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/files.py +166 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/hdf5.py +280 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/hf_dataset.py +416 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/json.py +167 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/sources/readers/lerobot.py +1 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/mcap.py +967 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/parquet.py +494 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/tfds.py +392 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/tfrecord.py +205 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/utils.py +237 -0
- macrodata_refiner-0.3.0/src/refiner/pipeline/sources/readers/zarr.py +577 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/steps.py +6 -1
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/utils/cache/decoder_cache.py +19 -13
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/platform/auth.py +14 -4
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/platform/client/__init__.py +20 -13
- macrodata_refiner-0.3.0/src/refiner/platform/client/api.py +577 -0
- macrodata_refiner-0.3.0/src/refiner/platform/client/models.py +319 -0
- macrodata_refiner-0.3.0/src/refiner/platform/client/serialize.py +39 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/platform/manifest.py +59 -2
- macrodata_refiner-0.3.0/src/refiner/robotics/__init__.py +47 -0
- macrodata_refiner-0.3.0/src/refiner/robotics/egocentric.py +99 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/robotics/lerobot_format/__init__.py +0 -4
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/robotics/lerobot_format/row.py +137 -83
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/robotics/lerobot_format/tabular.py +37 -8
- macrodata_refiner-0.3.0/src/refiner/robotics/motion.py +181 -0
- macrodata_refiner-0.3.0/src/refiner/robotics/reward.py +268 -0
- macrodata_refiner-0.3.0/src/refiner/robotics/row.py +867 -0
- macrodata_refiner-0.3.0/src/refiner/robotics/subtask_annotation.py +466 -0
- macrodata_refiner-0.3.0/src/refiner/robotics/synchronization.py +244 -0
- macrodata_refiner-0.3.0/src/refiner/robotics/tabular.py +172 -0
- macrodata_refiner-0.3.0/src/refiner/services/__init__.py +14 -0
- macrodata_refiner-0.3.0/src/refiner/services/base.py +44 -0
- macrodata_refiner-0.3.0/src/refiner/services/discovery.py +102 -0
- macrodata_refiner-0.3.0/src/refiner/services/manager.py +251 -0
- macrodata_refiner-0.3.0/src/refiner/services/vllm.py +78 -0
- macrodata_refiner-0.3.0/src/refiner/text/__init__.py +11 -0
- macrodata_refiner-0.3.0/src/refiner/text/commoncrawl.py +661 -0
- macrodata_refiner-0.3.0/src/refiner/utils/__init__.py +3 -0
- macrodata_refiner-0.3.0/src/refiner/utils/imports.py +75 -0
- macrodata_refiner-0.3.0/src/refiner/video/__init__.py +67 -0
- macrodata_refiner-0.3.0/src/refiner/video/decode.py +279 -0
- {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.3.0/src/refiner}/video/remux.py +71 -16
- {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.3.0/src/refiner}/video/transcode.py +62 -47
- macrodata_refiner-0.3.0/src/refiner/video/types.py +520 -0
- {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.3.0/src/refiner}/video/writer.py +91 -44
- macrodata_refiner-0.3.0/src/refiner/worker/context.py +177 -0
- macrodata_refiner-0.3.0/src/refiner/worker/entrypoint.py +99 -0
- macrodata_refiner-0.3.0/src/refiner/worker/lifecycle.py +142 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/worker/metrics/api.py +4 -2
- macrodata_refiner-0.3.0/src/refiner/worker/metrics/emitter.py +112 -0
- macrodata_refiner-0.3.0/src/refiner/worker/resources/cpu.py +24 -0
- macrodata_refiner-0.3.0/src/refiner/worker/resources/gpu.py +81 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/worker/runner.py +115 -163
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/worker/workdir.py +2 -2
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/tests/test_cache.py +2 -3
- macrodata_refiner-0.3.0/tests/test_commoncrawl_text.py +1245 -0
- macrodata_refiner-0.3.0/tests/test_optional_dependencies.py +19 -0
- macrodata_refiner-0.3.0/tests/test_video_decode.py +255 -0
- macrodata_refiner-0.2.1/PKG-INFO +0 -151
- macrodata_refiner-0.2.1/src/macrodata_refiner.egg-info/PKG-INFO +0 -151
- macrodata_refiner-0.2.1/src/macrodata_refiner.egg-info/SOURCES.txt +0 -110
- macrodata_refiner-0.2.1/src/macrodata_refiner.egg-info/requires.txt +0 -14
- macrodata_refiner-0.2.1/src/refiner/__init__.py +0 -57
- macrodata_refiner-0.2.1/src/refiner/execution/asyncio/window.py +0 -91
- macrodata_refiner-0.2.1/src/refiner/execution/operators/vectorized.py +0 -150
- macrodata_refiner-0.2.1/src/refiner/launchers/base.py +0 -185
- macrodata_refiner-0.2.1/src/refiner/launchers/cloud.py +0 -185
- macrodata_refiner-0.2.1/src/refiner/launchers/local.py +0 -321
- macrodata_refiner-0.2.1/src/refiner/media/__init__.py +0 -3
- macrodata_refiner-0.2.1/src/refiner/media/video/__init__.py +0 -3
- macrodata_refiner-0.2.1/src/refiner/media/video/types.py +0 -23
- macrodata_refiner-0.2.1/src/refiner/pipeline/__init__.py +0 -31
- macrodata_refiner-0.2.1/src/refiner/pipeline/data/tabular.py +0 -252
- macrodata_refiner-0.2.1/src/refiner/pipeline/pipeline.py +0 -587
- macrodata_refiner-0.2.1/src/refiner/pipeline/sinks/jsonl.py +0 -81
- macrodata_refiner-0.2.1/src/refiner/pipeline/sinks/parquet.py +0 -78
- macrodata_refiner-0.2.1/src/refiner/pipeline/sources/readers/__init__.py +0 -15
- macrodata_refiner-0.2.1/src/refiner/pipeline/sources/readers/jsonl.py +0 -88
- macrodata_refiner-0.2.1/src/refiner/pipeline/sources/readers/parquet.py +0 -252
- macrodata_refiner-0.2.1/src/refiner/pipeline/sources/readers/utils.py +0 -104
- macrodata_refiner-0.2.1/src/refiner/platform/client/api.py +0 -271
- macrodata_refiner-0.2.1/src/refiner/platform/client/http.py +0 -94
- macrodata_refiner-0.2.1/src/refiner/platform/client/models.py +0 -191
- macrodata_refiner-0.2.1/src/refiner/platform/client/serialize.py +0 -34
- macrodata_refiner-0.2.1/src/refiner/robotics/__init__.py +0 -25
- macrodata_refiner-0.2.1/src/refiner/robotics/motion.py +0 -165
- macrodata_refiner-0.2.1/src/refiner/worker/context.py +0 -121
- macrodata_refiner-0.2.1/src/refiner/worker/entrypoint.py +0 -101
- macrodata_refiner-0.2.1/src/refiner/worker/lifecycle/__init__.py +0 -5
- macrodata_refiner-0.2.1/src/refiner/worker/lifecycle/base.py +0 -25
- macrodata_refiner-0.2.1/src/refiner/worker/lifecycle/local/__init__.py +0 -3
- macrodata_refiner-0.2.1/src/refiner/worker/lifecycle/local/claim.py +0 -147
- macrodata_refiner-0.2.1/src/refiner/worker/lifecycle/local/files.py +0 -41
- macrodata_refiner-0.2.1/src/refiner/worker/lifecycle/local/lifecycle.py +0 -308
- macrodata_refiner-0.2.1/src/refiner/worker/lifecycle/platform.py +0 -99
- macrodata_refiner-0.2.1/src/refiner/worker/metrics/context.py +0 -147
- macrodata_refiner-0.2.1/src/refiner/worker/metrics/otel.py +0 -364
- macrodata_refiner-0.2.1/src/refiner/worker/resources/cpu.py +0 -123
- macrodata_refiner-0.2.1/src/refiner/worker/resources/memory.py +0 -63
- macrodata_refiner-0.2.1/src/refiner/worker/resources/network.py +0 -27
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/LICENSE +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/setup.cfg +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/macrodata_refiner.egg-info/dependency_links.txt +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/macrodata_refiner.egg-info/top_level.txt +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/cli/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/execution/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/execution/buffer.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/execution/operators/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/execution/tracking/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/execution/tracking/shards.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/io/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/launchers/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/data/row.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/sources/items.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/sources/task.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/utils/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/utils/cache/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/utils/cache/file_cache.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/pipeline/utils/cache/lease_cache.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/platform/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/py.typed +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/robotics/lerobot_format/metadata/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/robotics/lerobot_format/metadata/info.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/robotics/lerobot_format/metadata/metadata.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/robotics/lerobot_format/metadata/stats.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/robotics/lerobot_format/metadata/tasks.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/worker/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/worker/metrics/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/src/refiner/worker/resources/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.3.0}/tests/test_expressions.py +0 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: macrodata-refiner
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
|
|
5
|
+
Author: Macrodata Labs
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: cloudpickle==3.1.2
|
|
16
|
+
Requires-Dist: fsspec[http]
|
|
17
|
+
Requires-Dist: httpx
|
|
18
|
+
Requires-Dist: loguru
|
|
19
|
+
Requires-Dist: numpy
|
|
20
|
+
Requires-Dist: orjson
|
|
21
|
+
Requires-Dist: packaging
|
|
22
|
+
Requires-Dist: pyarrow
|
|
23
|
+
Requires-Dist: msgspec>=0.20.0
|
|
24
|
+
Requires-Dist: pydantic>=2.0.0
|
|
25
|
+
Provides-Extra: video
|
|
26
|
+
Requires-Dist: av; extra == "video"
|
|
27
|
+
Requires-Dist: pillow; extra == "video"
|
|
28
|
+
Provides-Extra: hf
|
|
29
|
+
Requires-Dist: datasets>=3.0.0; extra == "hf"
|
|
30
|
+
Requires-Dist: huggingface-hub>=1.4.1; extra == "hf"
|
|
31
|
+
Requires-Dist: hf>=1.7.1; extra == "hf"
|
|
32
|
+
Provides-Extra: egocentric
|
|
33
|
+
Requires-Dist: macrodata-refiner[hf]; extra == "egocentric"
|
|
34
|
+
Requires-Dist: macrodata-refiner[video]; extra == "egocentric"
|
|
35
|
+
Requires-Dist: ego-vision[models]>=0.1.8; extra == "egocentric"
|
|
36
|
+
Provides-Extra: text
|
|
37
|
+
Requires-Dist: warcio; extra == "text"
|
|
38
|
+
Provides-Extra: hdf5
|
|
39
|
+
Requires-Dist: h5py; extra == "hdf5"
|
|
40
|
+
Provides-Extra: zarr
|
|
41
|
+
Requires-Dist: zarr<3,>=2.18; extra == "zarr"
|
|
42
|
+
Requires-Dist: numcodecs<0.16; extra == "zarr"
|
|
43
|
+
Provides-Extra: mcap
|
|
44
|
+
Requires-Dist: av; extra == "mcap"
|
|
45
|
+
Requires-Dist: mcap; extra == "mcap"
|
|
46
|
+
Requires-Dist: mcap-protobuf-support; extra == "mcap"
|
|
47
|
+
Requires-Dist: mcap-ros2-support; extra == "mcap"
|
|
48
|
+
Requires-Dist: pillow; extra == "mcap"
|
|
49
|
+
Provides-Extra: s3
|
|
50
|
+
Requires-Dist: s3fs; extra == "s3"
|
|
51
|
+
Provides-Extra: tensorflow
|
|
52
|
+
Requires-Dist: tensorflow; extra == "tensorflow"
|
|
53
|
+
Provides-Extra: tfds
|
|
54
|
+
Requires-Dist: macrodata-refiner[tensorflow]; extra == "tfds"
|
|
55
|
+
Requires-Dist: tensorflow-datasets; extra == "tfds"
|
|
56
|
+
Provides-Extra: testing
|
|
57
|
+
Requires-Dist: macrodata-refiner[hdf5]; extra == "testing"
|
|
58
|
+
Requires-Dist: macrodata-refiner[hf]; extra == "testing"
|
|
59
|
+
Requires-Dist: macrodata-refiner[mcap]; extra == "testing"
|
|
60
|
+
Requires-Dist: macrodata-refiner[video]; extra == "testing"
|
|
61
|
+
Requires-Dist: macrodata-refiner[zarr]; extra == "testing"
|
|
62
|
+
Requires-Dist: macrodata-refiner[text]; extra == "testing"
|
|
63
|
+
Requires-Dist: macrodata-refiner[s3]; extra == "testing"
|
|
64
|
+
Requires-Dist: macrodata-refiner[tfds]; extra == "testing"
|
|
65
|
+
Requires-Dist: pytest>=8.0.0; extra == "testing"
|
|
66
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "testing"
|
|
67
|
+
Provides-Extra: all
|
|
68
|
+
Requires-Dist: macrodata-refiner[testing]; extra == "all"
|
|
69
|
+
Dynamic: license-file
|
|
70
|
+
|
|
71
|
+
<p align="center">
|
|
72
|
+
<img src="https://macrodata.co/logo.svg" alt="Macrodata" width="180">
|
|
73
|
+
</p>
|
|
74
|
+
|
|
75
|
+
<h1 align="center">Macrodata Refiner</h1>
|
|
76
|
+
|
|
77
|
+
Refiner is an open-source engine for turning raw robotics and multimodal data into **high-quality datasets** for model training.
|
|
78
|
+
|
|
79
|
+
It gives training-data teams one pipeline model for multimodal data, robotics
|
|
80
|
+
workflows, and model-based processing.
|
|
81
|
+
|
|
82
|
+
It also plugs into the Macrodata platform, which gives you visibility into what is happening to your data while pipelines run: job and shard lifecycle, logs, metrics, manifests, and pipeline behavior. The same code can run locally for development and then scale out through Macrodata's elastic serverless cloud.
|
|
83
|
+
|
|
84
|
+
## Quickstart
|
|
85
|
+
|
|
86
|
+
Install:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
pip install macrodata-refiner
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Create a Macrodata API key:
|
|
93
|
+
|
|
94
|
+
- https://macrodata.co/settings/api-keys
|
|
95
|
+
|
|
96
|
+
Log in:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
macrodata login
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Cloud example
|
|
103
|
+
|
|
104
|
+
Launch a robotics pipeline on Macrodata Cloud.
|
|
105
|
+
|
|
106
|
+
This requires a valid API key.
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
import refiner as mdr
|
|
110
|
+
|
|
111
|
+
(
|
|
112
|
+
mdr.read_lerobot("hf://datasets/macrodata/aloha_static_battery_ep005_009")
|
|
113
|
+
.map(
|
|
114
|
+
mdr.robotics.motion_trim(
|
|
115
|
+
threshold=0.001,
|
|
116
|
+
pad_frames=5,
|
|
117
|
+
)
|
|
118
|
+
)
|
|
119
|
+
.write_lerobot("hf://buckets/acme-robotics/aloha_motion")
|
|
120
|
+
.launch_cloud(
|
|
121
|
+
name="motion_trim",
|
|
122
|
+
num_workers=4,
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Need cloud GPUs? See [Resources, GPUs, and Services](docs/running-pipelines/resources-gpus-and-services.md).
|
|
128
|
+
|
|
129
|
+
### Local example
|
|
130
|
+
|
|
131
|
+
Launch a local pipeline:
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
import refiner as mdr
|
|
135
|
+
|
|
136
|
+
def add_preview(row):
|
|
137
|
+
return row.update(
|
|
138
|
+
preview=" ".join(row["text"].split()[:20]),
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
(
|
|
142
|
+
mdr.read_jsonl("input/*.jsonl")
|
|
143
|
+
.filter(mdr.col("lang") == "en")
|
|
144
|
+
.with_columns(
|
|
145
|
+
text=mdr.col("text").str.strip(),
|
|
146
|
+
text_len=mdr.col("text").str.len(),
|
|
147
|
+
)
|
|
148
|
+
.map(add_preview)
|
|
149
|
+
.write_parquet("s3://my-bucket/english-cleanup/")
|
|
150
|
+
.launch_local(
|
|
151
|
+
name="english-cleanup",
|
|
152
|
+
num_workers=2,
|
|
153
|
+
)
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
`pip install` gives you:
|
|
158
|
+
|
|
159
|
+
- the Python package as `refiner`
|
|
160
|
+
- the CLI as `macrodata`
|
|
161
|
+
|
|
162
|
+
## Batteries included
|
|
163
|
+
|
|
164
|
+
- training-data-first pipeline primitives instead of generic ETL abstractions
|
|
165
|
+
- multimodal processing, with robotics support today
|
|
166
|
+
- built-in readers, transforms, sinks, and runtime machinery for common dataset work
|
|
167
|
+
- access to any storage backend supported by `fsspec` (S3, GCP, Hugging Face, etc.)
|
|
168
|
+
- local execution for development and elastic cloud execution for large runs
|
|
169
|
+
- built-in observability through the Macrodata platform for job state, logs, metrics, and manifests
|
|
170
|
+
|
|
171
|
+
## Docs
|
|
172
|
+
|
|
173
|
+
Start here:
|
|
174
|
+
|
|
175
|
+
- [Docs index](docs/index.md)
|
|
176
|
+
- [Quickstart](docs/quickstart.md)
|
|
177
|
+
- [Running pipelines](docs/running-pipelines/index.md)
|
|
178
|
+
|
|
179
|
+
Build a dataset:
|
|
180
|
+
|
|
181
|
+
- [Reading data](docs/reading-data/index.md)
|
|
182
|
+
- [Episode data](docs/episode-data/index.md)
|
|
183
|
+
- [Transforms](docs/transforms/index.md)
|
|
184
|
+
- [Episode operations](docs/episode-operations/index.md)
|
|
185
|
+
- [Writing data](docs/writing-data/index.md)
|
|
186
|
+
- [Examples](docs/examples/index.md)
|
|
187
|
+
|
|
188
|
+
Operate jobs:
|
|
189
|
+
|
|
190
|
+
- [Platform](docs/platform/index.md)
|
|
191
|
+
- [CLI](docs/cli/index.md)
|
|
192
|
+
- [Reference](docs/reference/index.md)
|
|
193
|
+
|
|
194
|
+
## Community
|
|
195
|
+
|
|
196
|
+
- join the Macrodata Discord: https://discord.gg/S8kZtmBR2x
|
|
@@ -4,9 +4,10 @@
|
|
|
4
4
|
|
|
5
5
|
<h1 align="center">Macrodata Refiner</h1>
|
|
6
6
|
|
|
7
|
-
Refiner is an open-source engine for turning raw
|
|
7
|
+
Refiner is an open-source engine for turning raw robotics and multimodal data into **high-quality datasets** for model training.
|
|
8
8
|
|
|
9
|
-
It
|
|
9
|
+
It gives training-data teams one pipeline model for multimodal data, robotics
|
|
10
|
+
workflows, and model-based processing.
|
|
10
11
|
|
|
11
12
|
It also plugs into the Macrodata platform, which gives you visibility into what is happening to your data while pipelines run: job and shard lifecycle, logs, metrics, manifests, and pipeline behavior. The same code can run locally for development and then scale out through Macrodata's elastic serverless cloud.
|
|
12
13
|
|
|
@@ -45,7 +46,7 @@ import refiner as mdr
|
|
|
45
46
|
pad_frames=5,
|
|
46
47
|
)
|
|
47
48
|
)
|
|
48
|
-
.write_lerobot("hf://buckets/
|
|
49
|
+
.write_lerobot("hf://buckets/acme-robotics/aloha_motion")
|
|
49
50
|
.launch_cloud(
|
|
50
51
|
name="motion_trim",
|
|
51
52
|
num_workers=4,
|
|
@@ -53,6 +54,8 @@ import refiner as mdr
|
|
|
53
54
|
)
|
|
54
55
|
```
|
|
55
56
|
|
|
57
|
+
Need cloud GPUs? See [Resources, GPUs, and Services](docs/running-pipelines/resources-gpus-and-services.md).
|
|
58
|
+
|
|
56
59
|
### Local example
|
|
57
60
|
|
|
58
61
|
Launch a local pipeline:
|
|
@@ -90,31 +93,33 @@ def add_preview(row):
|
|
|
90
93
|
|
|
91
94
|
- training-data-first pipeline primitives instead of generic ETL abstractions
|
|
92
95
|
- multimodal processing, with robotics support today
|
|
93
|
-
-
|
|
96
|
+
- built-in readers, transforms, sinks, and runtime machinery for common dataset work
|
|
94
97
|
- access to any storage backend supported by `fsspec` (S3, GCP, Hugging Face, etc.)
|
|
95
98
|
- local execution for development and elastic cloud execution for large runs
|
|
96
|
-
- built-in observability through the Macrodata platform
|
|
99
|
+
- built-in observability through the Macrodata platform for job state, logs, metrics, and manifests
|
|
97
100
|
|
|
98
101
|
## Docs
|
|
99
102
|
|
|
100
|
-
|
|
103
|
+
Start here:
|
|
101
104
|
|
|
102
|
-
- [
|
|
103
|
-
- [
|
|
104
|
-
- [
|
|
105
|
+
- [Docs index](docs/index.md)
|
|
106
|
+
- [Quickstart](docs/quickstart.md)
|
|
107
|
+
- [Running pipelines](docs/running-pipelines/index.md)
|
|
105
108
|
|
|
106
|
-
|
|
109
|
+
Build a dataset:
|
|
107
110
|
|
|
108
|
-
- [Reading
|
|
109
|
-
- [
|
|
110
|
-
- [
|
|
111
|
-
- [
|
|
112
|
-
- [
|
|
111
|
+
- [Reading data](docs/reading-data/index.md)
|
|
112
|
+
- [Episode data](docs/episode-data/index.md)
|
|
113
|
+
- [Transforms](docs/transforms/index.md)
|
|
114
|
+
- [Episode operations](docs/episode-operations/index.md)
|
|
115
|
+
- [Writing data](docs/writing-data/index.md)
|
|
116
|
+
- [Examples](docs/examples/index.md)
|
|
113
117
|
|
|
114
|
-
|
|
118
|
+
Operate jobs:
|
|
115
119
|
|
|
116
|
-
- [
|
|
117
|
-
- [
|
|
120
|
+
- [Platform](docs/platform/index.md)
|
|
121
|
+
- [CLI](docs/cli/index.md)
|
|
122
|
+
- [Reference](docs/reference/index.md)
|
|
118
123
|
|
|
119
124
|
## Community
|
|
120
125
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "macrodata-refiner"
|
|
3
|
-
version = "0.2.1"
|
|
3
|
+
version = "0.3.0"
|
|
4
4
|
description = "Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "Apache-2.0"
|
|
@@ -16,24 +16,79 @@ authors = [
|
|
|
16
16
|
]
|
|
17
17
|
requires-python = ">=3.10"
|
|
18
18
|
dependencies = [
|
|
19
|
-
"av",
|
|
20
19
|
"cloudpickle==3.1.2",
|
|
21
|
-
"fsspec",
|
|
20
|
+
"fsspec[http]",
|
|
22
21
|
"httpx",
|
|
23
22
|
"loguru",
|
|
24
|
-
"huggingface-hub>=1.4.1",
|
|
25
|
-
"opentelemetry-exporter-otlp-proto-http",
|
|
26
|
-
"opentelemetry-sdk",
|
|
27
23
|
"numpy",
|
|
28
|
-
"psutil",
|
|
29
24
|
"orjson",
|
|
25
|
+
"packaging",
|
|
30
26
|
"pyarrow",
|
|
31
27
|
"msgspec>=0.20.0",
|
|
28
|
+
"pydantic>=2.0.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
video = [
|
|
33
|
+
"av",
|
|
34
|
+
"pillow",
|
|
35
|
+
]
|
|
36
|
+
hf = [
|
|
37
|
+
"datasets>=3.0.0",
|
|
38
|
+
"huggingface-hub>=1.4.1",
|
|
32
39
|
"hf>=1.7.1",
|
|
33
40
|
]
|
|
41
|
+
egocentric = [
|
|
42
|
+
"macrodata-refiner[hf]",
|
|
43
|
+
"macrodata-refiner[video]",
|
|
44
|
+
"ego-vision[models]>=0.1.8",
|
|
45
|
+
]
|
|
46
|
+
text = [
|
|
47
|
+
"warcio",
|
|
48
|
+
]
|
|
49
|
+
hdf5 = [
|
|
50
|
+
"h5py",
|
|
51
|
+
]
|
|
52
|
+
zarr = [
|
|
53
|
+
"zarr>=2.18,<3",
|
|
54
|
+
"numcodecs<0.16",
|
|
55
|
+
]
|
|
56
|
+
mcap = [
|
|
57
|
+
"av",
|
|
58
|
+
"mcap",
|
|
59
|
+
"mcap-protobuf-support",
|
|
60
|
+
"mcap-ros2-support",
|
|
61
|
+
"pillow",
|
|
62
|
+
]
|
|
63
|
+
s3 = [
|
|
64
|
+
"s3fs",
|
|
65
|
+
]
|
|
66
|
+
tensorflow = [
|
|
67
|
+
"tensorflow",
|
|
68
|
+
]
|
|
69
|
+
tfds = [
|
|
70
|
+
"macrodata-refiner[tensorflow]",
|
|
71
|
+
"tensorflow-datasets",
|
|
72
|
+
]
|
|
73
|
+
testing = [
|
|
74
|
+
"macrodata-refiner[hdf5]",
|
|
75
|
+
"macrodata-refiner[hf]",
|
|
76
|
+
"macrodata-refiner[mcap]",
|
|
77
|
+
"macrodata-refiner[video]",
|
|
78
|
+
"macrodata-refiner[zarr]",
|
|
79
|
+
"macrodata-refiner[text]",
|
|
80
|
+
"macrodata-refiner[s3]",
|
|
81
|
+
"macrodata-refiner[tfds]",
|
|
82
|
+
"pytest>=8.0.0",
|
|
83
|
+
"pytest-cov>=5.0.0",
|
|
84
|
+
]
|
|
85
|
+
all = [
|
|
86
|
+
"macrodata-refiner[testing]",
|
|
87
|
+
]
|
|
34
88
|
|
|
35
89
|
[project.scripts]
|
|
36
90
|
macrodata = "refiner.cli.main:main"
|
|
91
|
+
mdr = "refiner.cli.main:main"
|
|
37
92
|
|
|
38
93
|
[build-system]
|
|
39
94
|
requires = ["setuptools>=77", "wheel"]
|
|
@@ -47,9 +102,8 @@ refiner = ["py.typed"]
|
|
|
47
102
|
|
|
48
103
|
[dependency-groups]
|
|
49
104
|
dev = [
|
|
105
|
+
"macrodata-refiner[all]",
|
|
50
106
|
"pre-commit>=4.0.0",
|
|
51
|
-
"pytest>=8.0.0",
|
|
52
|
-
"pytest-cov>=5.0.0",
|
|
53
107
|
"ruff>=0.14.10",
|
|
54
108
|
"ty>=0.0.7",
|
|
55
109
|
]
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: macrodata-refiner
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
|
|
5
|
+
Author: Macrodata Labs
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: cloudpickle==3.1.2
|
|
16
|
+
Requires-Dist: fsspec[http]
|
|
17
|
+
Requires-Dist: httpx
|
|
18
|
+
Requires-Dist: loguru
|
|
19
|
+
Requires-Dist: numpy
|
|
20
|
+
Requires-Dist: orjson
|
|
21
|
+
Requires-Dist: packaging
|
|
22
|
+
Requires-Dist: pyarrow
|
|
23
|
+
Requires-Dist: msgspec>=0.20.0
|
|
24
|
+
Requires-Dist: pydantic>=2.0.0
|
|
25
|
+
Provides-Extra: video
|
|
26
|
+
Requires-Dist: av; extra == "video"
|
|
27
|
+
Requires-Dist: pillow; extra == "video"
|
|
28
|
+
Provides-Extra: hf
|
|
29
|
+
Requires-Dist: datasets>=3.0.0; extra == "hf"
|
|
30
|
+
Requires-Dist: huggingface-hub>=1.4.1; extra == "hf"
|
|
31
|
+
Requires-Dist: hf>=1.7.1; extra == "hf"
|
|
32
|
+
Provides-Extra: egocentric
|
|
33
|
+
Requires-Dist: macrodata-refiner[hf]; extra == "egocentric"
|
|
34
|
+
Requires-Dist: macrodata-refiner[video]; extra == "egocentric"
|
|
35
|
+
Requires-Dist: ego-vision[models]>=0.1.8; extra == "egocentric"
|
|
36
|
+
Provides-Extra: text
|
|
37
|
+
Requires-Dist: warcio; extra == "text"
|
|
38
|
+
Provides-Extra: hdf5
|
|
39
|
+
Requires-Dist: h5py; extra == "hdf5"
|
|
40
|
+
Provides-Extra: zarr
|
|
41
|
+
Requires-Dist: zarr<3,>=2.18; extra == "zarr"
|
|
42
|
+
Requires-Dist: numcodecs<0.16; extra == "zarr"
|
|
43
|
+
Provides-Extra: mcap
|
|
44
|
+
Requires-Dist: av; extra == "mcap"
|
|
45
|
+
Requires-Dist: mcap; extra == "mcap"
|
|
46
|
+
Requires-Dist: mcap-protobuf-support; extra == "mcap"
|
|
47
|
+
Requires-Dist: mcap-ros2-support; extra == "mcap"
|
|
48
|
+
Requires-Dist: pillow; extra == "mcap"
|
|
49
|
+
Provides-Extra: s3
|
|
50
|
+
Requires-Dist: s3fs; extra == "s3"
|
|
51
|
+
Provides-Extra: tensorflow
|
|
52
|
+
Requires-Dist: tensorflow; extra == "tensorflow"
|
|
53
|
+
Provides-Extra: tfds
|
|
54
|
+
Requires-Dist: macrodata-refiner[tensorflow]; extra == "tfds"
|
|
55
|
+
Requires-Dist: tensorflow-datasets; extra == "tfds"
|
|
56
|
+
Provides-Extra: testing
|
|
57
|
+
Requires-Dist: macrodata-refiner[hdf5]; extra == "testing"
|
|
58
|
+
Requires-Dist: macrodata-refiner[hf]; extra == "testing"
|
|
59
|
+
Requires-Dist: macrodata-refiner[mcap]; extra == "testing"
|
|
60
|
+
Requires-Dist: macrodata-refiner[video]; extra == "testing"
|
|
61
|
+
Requires-Dist: macrodata-refiner[zarr]; extra == "testing"
|
|
62
|
+
Requires-Dist: macrodata-refiner[text]; extra == "testing"
|
|
63
|
+
Requires-Dist: macrodata-refiner[s3]; extra == "testing"
|
|
64
|
+
Requires-Dist: macrodata-refiner[tfds]; extra == "testing"
|
|
65
|
+
Requires-Dist: pytest>=8.0.0; extra == "testing"
|
|
66
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "testing"
|
|
67
|
+
Provides-Extra: all
|
|
68
|
+
Requires-Dist: macrodata-refiner[testing]; extra == "all"
|
|
69
|
+
Dynamic: license-file
|
|
70
|
+
|
|
71
|
+
<p align="center">
|
|
72
|
+
<img src="https://macrodata.co/logo.svg" alt="Macrodata" width="180">
|
|
73
|
+
</p>
|
|
74
|
+
|
|
75
|
+
<h1 align="center">Macrodata Refiner</h1>
|
|
76
|
+
|
|
77
|
+
Refiner is an open-source engine for turning raw robotics and multimodal data into **high-quality datasets** for model training.
|
|
78
|
+
|
|
79
|
+
It gives training-data teams one pipeline model for multimodal data, robotics
|
|
80
|
+
workflows, and model-based processing.
|
|
81
|
+
|
|
82
|
+
It also plugs into the Macrodata platform, which gives you visibility into what is happening to your data while pipelines run: job and shard lifecycle, logs, metrics, manifests, and pipeline behavior. The same code can run locally for development and then scale out through Macrodata's elastic serverless cloud.
|
|
83
|
+
|
|
84
|
+
## Quickstart
|
|
85
|
+
|
|
86
|
+
Install:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
pip install macrodata-refiner
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Create a Macrodata API key:
|
|
93
|
+
|
|
94
|
+
- https://macrodata.co/settings/api-keys
|
|
95
|
+
|
|
96
|
+
Log in:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
macrodata login
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Cloud example
|
|
103
|
+
|
|
104
|
+
Launch a robotics pipeline on Macrodata Cloud.
|
|
105
|
+
|
|
106
|
+
This requires a valid API key.
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
import refiner as mdr
|
|
110
|
+
|
|
111
|
+
(
|
|
112
|
+
mdr.read_lerobot("hf://datasets/macrodata/aloha_static_battery_ep005_009")
|
|
113
|
+
.map(
|
|
114
|
+
mdr.robotics.motion_trim(
|
|
115
|
+
threshold=0.001,
|
|
116
|
+
pad_frames=5,
|
|
117
|
+
)
|
|
118
|
+
)
|
|
119
|
+
.write_lerobot("hf://buckets/acme-robotics/aloha_motion")
|
|
120
|
+
.launch_cloud(
|
|
121
|
+
name="motion_trim",
|
|
122
|
+
num_workers=4,
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Need cloud GPUs? See [Resources, GPUs, and Services](docs/running-pipelines/resources-gpus-and-services.md).
|
|
128
|
+
|
|
129
|
+
### Local example
|
|
130
|
+
|
|
131
|
+
Launch a local pipeline:
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
import refiner as mdr
|
|
135
|
+
|
|
136
|
+
def add_preview(row):
|
|
137
|
+
return row.update(
|
|
138
|
+
preview=" ".join(row["text"].split()[:20]),
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
(
|
|
142
|
+
mdr.read_jsonl("input/*.jsonl")
|
|
143
|
+
.filter(mdr.col("lang") == "en")
|
|
144
|
+
.with_columns(
|
|
145
|
+
text=mdr.col("text").str.strip(),
|
|
146
|
+
text_len=mdr.col("text").str.len(),
|
|
147
|
+
)
|
|
148
|
+
.map(add_preview)
|
|
149
|
+
.write_parquet("s3://my-bucket/english-cleanup/")
|
|
150
|
+
.launch_local(
|
|
151
|
+
name="english-cleanup",
|
|
152
|
+
num_workers=2,
|
|
153
|
+
)
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
`pip install` gives you:
|
|
158
|
+
|
|
159
|
+
- the Python package as `refiner`
|
|
160
|
+
- the CLI as `macrodata` (also installed under the shorter alias `mdr`)
|
|
161
|
+
|
|
162
|
+
## Batteries included
|
|
163
|
+
|
|
164
|
+
- training-data-first pipeline primitives instead of generic ETL abstractions
|
|
165
|
+
- multimodal processing, with robotics support today
|
|
166
|
+
- built-in readers, transforms, sinks, and runtime machinery for common dataset work
|
|
167
|
+
- access to any storage backend supported by `fsspec` (S3, GCS, Hugging Face, etc.)
|
|
168
|
+
- local execution for development and elastic cloud execution for large runs
|
|
169
|
+
- built-in observability through the Macrodata platform for job state, logs, metrics, and manifests
|
|
170
|
+
|
|
171
|
+
## Docs
|
|
172
|
+
|
|
173
|
+
Start here:
|
|
174
|
+
|
|
175
|
+
- [Docs index](docs/index.md)
|
|
176
|
+
- [Quickstart](docs/quickstart.md)
|
|
177
|
+
- [Running pipelines](docs/running-pipelines/index.md)
|
|
178
|
+
|
|
179
|
+
Build a dataset:
|
|
180
|
+
|
|
181
|
+
- [Reading data](docs/reading-data/index.md)
|
|
182
|
+
- [Episode data](docs/episode-data/index.md)
|
|
183
|
+
- [Transforms](docs/transforms/index.md)
|
|
184
|
+
- [Episode operations](docs/episode-operations/index.md)
|
|
185
|
+
- [Writing data](docs/writing-data/index.md)
|
|
186
|
+
- [Examples](docs/examples/index.md)
|
|
187
|
+
|
|
188
|
+
Operate jobs:
|
|
189
|
+
|
|
190
|
+
- [Platform](docs/platform/index.md)
|
|
191
|
+
- [CLI](docs/cli/index.md)
|
|
192
|
+
- [Reference](docs/reference/index.md)
|
|
193
|
+
|
|
194
|
+
## Community
|
|
195
|
+
|
|
196
|
+
- join the Macrodata Discord: https://discord.gg/S8kZtmBR2x
|