macrodata-refiner 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. {macrodata_refiner-0.2.1/src/macrodata_refiner.egg-info → macrodata_refiner-0.2.2}/PKG-INFO +21 -4
  2. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/README.md +2 -0
  3. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/pyproject.toml +27 -5
  4. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2/src/macrodata_refiner.egg-info}/PKG-INFO +21 -4
  5. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/SOURCES.txt +14 -7
  6. macrodata_refiner-0.2.2/src/macrodata_refiner.egg-info/requires.txt +35 -0
  7. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/__init__.py +16 -20
  8. macrodata_refiner-0.2.2/src/refiner/execution/asyncio/__init__.py +1 -0
  9. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/operators/vectorized.py +2 -9
  10. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/io/datafolder.py +55 -1
  11. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/io/fileset.py +50 -35
  12. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/launchers/base.py +33 -3
  13. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/launchers/cloud.py +26 -1
  14. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/launchers/local.py +15 -0
  15. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/__init__.py +0 -6
  16. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/tabular.py +10 -0
  17. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/expressions.py +86 -0
  18. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/pipeline.py +21 -5
  19. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/planning.py +20 -2
  20. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/__init__.py +0 -2
  21. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/lerobot.py +5 -4
  22. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/base.py +28 -2
  23. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/csv.py +14 -5
  24. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/jsonl.py +11 -2
  25. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/lerobot.py +1 -0
  26. macrodata_refiner-0.2.2/src/refiner/pipeline/sources/readers/parquet.py +450 -0
  27. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/decoder_cache.py +4 -2
  28. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/client/api.py +5 -13
  29. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/client/http.py +28 -4
  30. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/client/models.py +6 -0
  31. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/__init__.py +0 -2
  32. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/row.py +3 -12
  33. macrodata_refiner-0.2.2/src/refiner/text/__init__.py +11 -0
  34. macrodata_refiner-0.2.2/src/refiner/text/commoncrawl.py +654 -0
  35. macrodata_refiner-0.2.2/src/refiner/utils/__init__.py +3 -0
  36. macrodata_refiner-0.2.2/src/refiner/utils/imports.py +75 -0
  37. macrodata_refiner-0.2.2/src/refiner/video/__init__.py +41 -0
  38. {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/remux.py +6 -4
  39. {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/transcode.py +6 -4
  40. {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/writer.py +12 -14
  41. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/entrypoint.py +12 -0
  42. macrodata_refiner-0.2.2/src/refiner/worker/resources/gpu.py +81 -0
  43. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/tests/test_cache.py +1 -1
  44. macrodata_refiner-0.2.2/tests/test_commoncrawl_text.py +1194 -0
  45. macrodata_refiner-0.2.2/tests/test_optional_dependencies.py +19 -0
  46. macrodata_refiner-0.2.1/src/macrodata_refiner.egg-info/requires.txt +0 -14
  47. macrodata_refiner-0.2.1/src/refiner/media/__init__.py +0 -3
  48. macrodata_refiner-0.2.1/src/refiner/media/video/__init__.py +0 -3
  49. macrodata_refiner-0.2.1/src/refiner/pipeline/sources/readers/parquet.py +0 -252
  50. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/LICENSE +0 -0
  51. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/setup.cfg +0 -0
  52. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/dependency_links.txt +0 -0
  53. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/entry_points.txt +0 -0
  54. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/top_level.txt +0 -0
  55. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/cli/__init__.py +0 -0
  56. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/cli/auth.py +0 -0
  57. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/cli/main.py +0 -0
  58. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/cli/ui.py +0 -0
  59. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/__init__.py +0 -0
  60. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/asyncio/runtime.py +0 -0
  61. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/asyncio/window.py +0 -0
  62. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/buffer.py +0 -0
  63. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/engine.py +0 -0
  64. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/operators/__init__.py +0 -0
  65. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/operators/row.py +0 -0
  66. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/tracking/__init__.py +0 -0
  67. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/tracking/shards.py +0 -0
  68. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/io/__init__.py +0 -0
  69. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/io/datafile.py +0 -0
  70. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/launchers/__init__.py +0 -0
  71. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/block.py +0 -0
  72. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/row.py +0 -0
  73. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/shard.py +0 -0
  74. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/base.py +0 -0
  75. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/jsonl.py +0 -0
  76. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/lerobot_reducer.py +0 -0
  77. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/parquet.py +0 -0
  78. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/__init__.py +0 -0
  79. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/base.py +0 -0
  80. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/items.py +0 -0
  81. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/__init__.py +0 -0
  82. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/utils.py +0 -0
  83. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/task.py +0 -0
  84. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/steps.py +0 -0
  85. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/__init__.py +0 -0
  86. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/__init__.py +0 -0
  87. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/file_cache.py +0 -0
  88. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/lease_cache.py +0 -0
  89. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/__init__.py +0 -0
  90. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/auth.py +0 -0
  91. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/client/__init__.py +0 -0
  92. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/client/serialize.py +0 -0
  93. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/manifest.py +0 -0
  94. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/py.typed +0 -0
  95. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/__init__.py +0 -0
  96. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/__init__.py +0 -0
  97. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/info.py +0 -0
  98. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/metadata.py +0 -0
  99. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/stats.py +0 -0
  100. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/tasks.py +0 -0
  101. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/tabular.py +0 -0
  102. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/motion.py +0 -0
  103. {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/types.py +0 -0
  104. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/__init__.py +0 -0
  105. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/context.py +0 -0
  106. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/__init__.py +0 -0
  107. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/base.py +0 -0
  108. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/__init__.py +0 -0
  109. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/claim.py +0 -0
  110. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/files.py +0 -0
  111. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/lifecycle.py +0 -0
  112. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/platform.py +0 -0
  113. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/__init__.py +0 -0
  114. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/api.py +0 -0
  115. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/context.py +0 -0
  116. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/otel.py +0 -0
  117. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/__init__.py +0 -0
  118. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/cpu.py +0 -0
  119. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/memory.py +0 -0
  120. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/network.py +0 -0
  121. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/runner.py +0 -0
  122. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/workdir.py +0 -0
  123. {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/tests/test_expressions.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: macrodata-refiner
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
5
5
  Author: Macrodata Labs
6
6
  License-Expression: Apache-2.0
@@ -12,12 +12,10 @@ Classifier: Programming Language :: Python :: 3.12
12
12
  Requires-Python: >=3.10
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
- Requires-Dist: av
16
15
  Requires-Dist: cloudpickle==3.1.2
17
16
  Requires-Dist: fsspec
18
17
  Requires-Dist: httpx
19
18
  Requires-Dist: loguru
20
- Requires-Dist: huggingface-hub>=1.4.1
21
19
  Requires-Dist: opentelemetry-exporter-otlp-proto-http
22
20
  Requires-Dist: opentelemetry-sdk
23
21
  Requires-Dist: numpy
@@ -25,7 +23,24 @@ Requires-Dist: psutil
25
23
  Requires-Dist: orjson
26
24
  Requires-Dist: pyarrow
27
25
  Requires-Dist: msgspec>=0.20.0
28
- Requires-Dist: hf>=1.7.1
26
+ Provides-Extra: video
27
+ Requires-Dist: av; extra == "video"
28
+ Provides-Extra: robotics
29
+ Requires-Dist: macrodata-refiner[video]; extra == "robotics"
30
+ Requires-Dist: huggingface-hub>=1.4.1; extra == "robotics"
31
+ Requires-Dist: hf>=1.7.1; extra == "robotics"
32
+ Provides-Extra: text
33
+ Requires-Dist: warcio; extra == "text"
34
+ Provides-Extra: s3
35
+ Requires-Dist: s3fs; extra == "s3"
36
+ Provides-Extra: testing
37
+ Requires-Dist: macrodata-refiner[robotics]; extra == "testing"
38
+ Requires-Dist: macrodata-refiner[text]; extra == "testing"
39
+ Requires-Dist: macrodata-refiner[s3]; extra == "testing"
40
+ Requires-Dist: pytest>=8.0.0; extra == "testing"
41
+ Requires-Dist: pytest-cov>=5.0.0; extra == "testing"
42
+ Provides-Extra: all
43
+ Requires-Dist: macrodata-refiner[testing]; extra == "all"
29
44
  Dynamic: license-file
30
45
 
31
46
  <p align="center">
@@ -83,6 +98,8 @@ import refiner as mdr
83
98
  )
84
99
  ```
85
100
 
101
+ Need cloud GPUs? See [Launchers](docs/launchers.md) for the GPU-specific cloud options.
102
+
86
103
  ### Local example
87
104
 
88
105
  Launch a local pipeline:
@@ -53,6 +53,8 @@ import refiner as mdr
53
53
  )
54
54
  ```
55
55
 
56
+ Need cloud GPUs? See [Launchers](docs/launchers.md) for the GPU-specific cloud options.
57
+
56
58
  ### Local example
57
59
 
58
60
  Launch a local pipeline:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "macrodata-refiner"
3
- version = "0.2.1"
3
+ version = "0.2.2"
4
4
  description = "Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets"
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -16,12 +16,10 @@ authors = [
16
16
  ]
17
17
  requires-python = ">=3.10"
18
18
  dependencies = [
19
- "av",
20
19
  "cloudpickle==3.1.2",
21
20
  "fsspec",
22
21
  "httpx",
23
22
  "loguru",
24
- "huggingface-hub>=1.4.1",
25
23
  "opentelemetry-exporter-otlp-proto-http",
26
24
  "opentelemetry-sdk",
27
25
  "numpy",
@@ -29,8 +27,33 @@ dependencies = [
29
27
  "orjson",
30
28
  "pyarrow",
31
29
  "msgspec>=0.20.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ video = [
34
+ "av",
35
+ ]
36
+ robotics = [
37
+ "macrodata-refiner[video]",
38
+ "huggingface-hub>=1.4.1",
32
39
  "hf>=1.7.1",
33
40
  ]
41
+ text = [
42
+ "warcio",
43
+ ]
44
+ s3 = [
45
+ "s3fs",
46
+ ]
47
+ testing = [
48
+ "macrodata-refiner[robotics]",
49
+ "macrodata-refiner[text]",
50
+ "macrodata-refiner[s3]",
51
+ "pytest>=8.0.0",
52
+ "pytest-cov>=5.0.0",
53
+ ]
54
+ all = [
55
+ "macrodata-refiner[testing]",
56
+ ]
34
57
 
35
58
  [project.scripts]
36
59
  macrodata = "refiner.cli.main:main"
@@ -47,9 +70,8 @@ refiner = ["py.typed"]
47
70
 
48
71
  [dependency-groups]
49
72
  dev = [
73
+ "macrodata-refiner[all]",
50
74
  "pre-commit>=4.0.0",
51
- "pytest>=8.0.0",
52
- "pytest-cov>=5.0.0",
53
75
  "ruff>=0.14.10",
54
76
  "ty>=0.0.7",
55
77
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: macrodata-refiner
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
5
5
  Author: Macrodata Labs
6
6
  License-Expression: Apache-2.0
@@ -12,12 +12,10 @@ Classifier: Programming Language :: Python :: 3.12
12
12
  Requires-Python: >=3.10
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
- Requires-Dist: av
16
15
  Requires-Dist: cloudpickle==3.1.2
17
16
  Requires-Dist: fsspec
18
17
  Requires-Dist: httpx
19
18
  Requires-Dist: loguru
20
- Requires-Dist: huggingface-hub>=1.4.1
21
19
  Requires-Dist: opentelemetry-exporter-otlp-proto-http
22
20
  Requires-Dist: opentelemetry-sdk
23
21
  Requires-Dist: numpy
@@ -25,7 +23,24 @@ Requires-Dist: psutil
25
23
  Requires-Dist: orjson
26
24
  Requires-Dist: pyarrow
27
25
  Requires-Dist: msgspec>=0.20.0
28
- Requires-Dist: hf>=1.7.1
26
+ Provides-Extra: video
27
+ Requires-Dist: av; extra == "video"
28
+ Provides-Extra: robotics
29
+ Requires-Dist: macrodata-refiner[video]; extra == "robotics"
30
+ Requires-Dist: huggingface-hub>=1.4.1; extra == "robotics"
31
+ Requires-Dist: hf>=1.7.1; extra == "robotics"
32
+ Provides-Extra: text
33
+ Requires-Dist: warcio; extra == "text"
34
+ Provides-Extra: s3
35
+ Requires-Dist: s3fs; extra == "s3"
36
+ Provides-Extra: testing
37
+ Requires-Dist: macrodata-refiner[robotics]; extra == "testing"
38
+ Requires-Dist: macrodata-refiner[text]; extra == "testing"
39
+ Requires-Dist: macrodata-refiner[s3]; extra == "testing"
40
+ Requires-Dist: pytest>=8.0.0; extra == "testing"
41
+ Requires-Dist: pytest-cov>=5.0.0; extra == "testing"
42
+ Provides-Extra: all
43
+ Requires-Dist: macrodata-refiner[testing]; extra == "all"
29
44
  Dynamic: license-file
30
45
 
31
46
  <p align="center">
@@ -83,6 +98,8 @@ import refiner as mdr
83
98
  )
84
99
  ```
85
100
 
101
+ Need cloud GPUs? See [Launchers](docs/launchers.md) for the GPU-specific cloud options.
102
+
86
103
  ### Local example
87
104
 
88
105
  Launch a local pipeline:
@@ -16,6 +16,7 @@ src/refiner/cli/ui.py
16
16
  src/refiner/execution/__init__.py
17
17
  src/refiner/execution/buffer.py
18
18
  src/refiner/execution/engine.py
19
+ src/refiner/execution/asyncio/__init__.py
19
20
  src/refiner/execution/asyncio/runtime.py
20
21
  src/refiner/execution/asyncio/window.py
21
22
  src/refiner/execution/operators/__init__.py
@@ -31,12 +32,6 @@ src/refiner/launchers/__init__.py
31
32
  src/refiner/launchers/base.py
32
33
  src/refiner/launchers/cloud.py
33
34
  src/refiner/launchers/local.py
34
- src/refiner/media/__init__.py
35
- src/refiner/media/video/__init__.py
36
- src/refiner/media/video/remux.py
37
- src/refiner/media/video/transcode.py
38
- src/refiner/media/video/types.py
39
- src/refiner/media/video/writer.py
40
35
  src/refiner/pipeline/__init__.py
41
36
  src/refiner/pipeline/expressions.py
42
37
  src/refiner/pipeline/pipeline.py
@@ -86,6 +81,15 @@ src/refiner/robotics/lerobot_format/metadata/info.py
86
81
  src/refiner/robotics/lerobot_format/metadata/metadata.py
87
82
  src/refiner/robotics/lerobot_format/metadata/stats.py
88
83
  src/refiner/robotics/lerobot_format/metadata/tasks.py
84
+ src/refiner/text/__init__.py
85
+ src/refiner/text/commoncrawl.py
86
+ src/refiner/utils/__init__.py
87
+ src/refiner/utils/imports.py
88
+ src/refiner/video/__init__.py
89
+ src/refiner/video/remux.py
90
+ src/refiner/video/transcode.py
91
+ src/refiner/video/types.py
92
+ src/refiner/video/writer.py
89
93
  src/refiner/worker/__init__.py
90
94
  src/refiner/worker/context.py
91
95
  src/refiner/worker/entrypoint.py
@@ -104,7 +108,10 @@ src/refiner/worker/metrics/context.py
104
108
  src/refiner/worker/metrics/otel.py
105
109
  src/refiner/worker/resources/__init__.py
106
110
  src/refiner/worker/resources/cpu.py
111
+ src/refiner/worker/resources/gpu.py
107
112
  src/refiner/worker/resources/memory.py
108
113
  src/refiner/worker/resources/network.py
109
114
  tests/test_cache.py
110
- tests/test_expressions.py
115
+ tests/test_commoncrawl_text.py
116
+ tests/test_expressions.py
117
+ tests/test_optional_dependencies.py
@@ -0,0 +1,35 @@
1
+ cloudpickle==3.1.2
2
+ fsspec
3
+ httpx
4
+ loguru
5
+ opentelemetry-exporter-otlp-proto-http
6
+ opentelemetry-sdk
7
+ numpy
8
+ psutil
9
+ orjson
10
+ pyarrow
11
+ msgspec>=0.20.0
12
+
13
+ [all]
14
+ macrodata-refiner[testing]
15
+
16
+ [robotics]
17
+ macrodata-refiner[video]
18
+ huggingface-hub>=1.4.1
19
+ hf>=1.7.1
20
+
21
+ [s3]
22
+ s3fs
23
+
24
+ [testing]
25
+ macrodata-refiner[robotics]
26
+ macrodata-refiner[text]
27
+ macrodata-refiner[s3]
28
+ pytest>=8.0.0
29
+ pytest-cov>=5.0.0
30
+
31
+ [text]
32
+ warcio
33
+
34
+ [video]
35
+ av
@@ -1,11 +1,9 @@
1
+ import refiner.io as io
2
+ import refiner.pipeline as pipeline
1
3
  import refiner.robotics as robotics
2
- from refiner.io import DataFile, DataFileSet, DataFolder
3
- from refiner.launchers import LaunchStats, LocalLauncher
4
- from refiner.media import VideoFile
4
+ import refiner.text as text
5
+ import refiner.video as video
5
6
  from refiner.pipeline import (
6
- RefinerPipeline,
7
- Row,
8
- Shard,
9
7
  from_items,
10
8
  from_source,
11
9
  read_csv,
@@ -22,19 +20,11 @@ from refiner.worker.metrics.api import (
22
20
  log_throughput,
23
21
  register_gauge,
24
22
  )
25
- from refiner.worker.runner import Worker, WorkerRunStats
23
+
24
+ robot = robotics
26
25
 
27
26
  __all__ = [
28
- "RefinerPipeline",
29
- "LocalLauncher",
30
- "LaunchStats",
31
- "DataFile",
32
- "DataFolder",
33
- "DataFileSet",
34
- "Shard",
35
- "Row",
36
- "Worker",
37
- "WorkerRunStats",
27
+ # sources
38
28
  "read_csv",
39
29
  "read_jsonl",
40
30
  "read_lerobot",
@@ -42,16 +32,22 @@ __all__ = [
42
32
  "from_items",
43
33
  "from_source",
44
34
  "task",
35
+ # metrics
45
36
  "log_throughput",
46
37
  "log_gauge",
47
38
  "log_gauges",
48
- "register_gauge",
49
39
  "log_histogram",
40
+ "register_gauge",
41
+ # expressions
50
42
  "col",
51
43
  "lit",
52
44
  "coalesce",
53
45
  "if_else",
54
- "VideoFile",
55
- "Video",
46
+ # submodules
47
+ "io",
48
+ "pipeline",
49
+ "video",
50
+ "robot",
56
51
  "robotics",
52
+ "text",
57
53
  ]
@@ -10,7 +10,7 @@ from refiner.execution.tracking.shards import (
10
10
  count_table_by_shard,
11
11
  counts_delta,
12
12
  )
13
- from refiner.pipeline.data.tabular import repeat_scalar
13
+ from refiner.pipeline.data.tabular import filter_table, repeat_scalar
14
14
  from refiner.pipeline.expressions import eval_expr_arrow
15
15
  from refiner.pipeline.steps import (
16
16
  CastStep,
@@ -69,14 +69,7 @@ def apply_vectorized_op(
69
69
  return out, None
70
70
 
71
71
  if isinstance(op, FilterExprStep):
72
- mask = eval_expr_arrow(op.predicate, table)
73
- next_table = (
74
- table
75
- if isinstance(mask, pa.Scalar) and bool(mask.as_py())
76
- else (
77
- table.slice(0, 0) if isinstance(mask, pa.Scalar) else table.filter(mask)
78
- )
79
- )
72
+ next_table = filter_table(table, op.predicate)
80
73
  next_shard_counts = count_table_by_shard(next_table)
81
74
  for shard_id in set(shard_counts) | set(next_shard_counts):
82
75
  previous = int(shard_counts.get(shard_id, 0))
@@ -1,4 +1,4 @@
1
- from collections.abc import Iterable, Mapping
1
+ from collections.abc import Iterable, Iterator, Mapping
2
2
  from os import PathLike
3
3
  from typing import IO, Any, TypeAlias, Union, cast
4
4
 
@@ -113,6 +113,34 @@ class DataFolder(DirFileSystem):
113
113
  return self.abs_path(paths)
114
114
  return [self.abs_path(p) for p in paths]
115
115
 
116
+ def find(self, path: str, *args, **kwargs):
117
+ # Avoid DirFileSystem.find(): some backends (notably HF buckets) can leak
118
+ # sibling prefix matches like `root-2/...` or return the bare root entry,
119
+ # and DirFileSystem._relpath() asserts before we can filter them out.
120
+ """List paths under this folder, skipping backend results outside the base path."""
121
+ detail = kwargs.get("detail", False)
122
+ target = self._join(path.rstrip("/"))
123
+ ret = self.fs.find(target, *args, **kwargs)
124
+ target = target.rstrip("/")
125
+ target_prefix = target + self.fs.sep
126
+ alt_target = target[1:] if target.startswith(self.fs.sep) else None
127
+ alt_prefix = alt_target + self.fs.sep if alt_target is not None else None
128
+
129
+ def rel(p: str) -> str | None:
130
+ if p == target or (alt_target is not None and p == alt_target):
131
+ return path.rstrip("/")
132
+ if p.startswith(target_prefix):
133
+ suffix = p[len(target_prefix) :]
134
+ elif alt_prefix is not None and p.startswith(alt_prefix):
135
+ suffix = p[len(alt_prefix) :]
136
+ else:
137
+ return None
138
+ return suffix if path in {"", "/"} else f"{path.rstrip('/')}/{suffix}"
139
+
140
+ if detail:
141
+ return {r: info for p, info in ret.items() if (r := rel(p)) is not None}
142
+ return [r for p in ret if (r := rel(p)) is not None]
143
+
116
144
  def open_files(
117
145
  self, paths: Iterable[str], mode: str = "rb", **kwargs
118
146
  ) -> list[IO[Any]]:
@@ -159,3 +187,29 @@ class DataFolder(DirFileSystem):
159
187
 
160
188
  def files(self, relpaths: Iterable[str]) -> list[DataFile]:
161
189
  return [self.file(p) for p in relpaths]
190
+
191
+ def iter_files_with_sizes(
192
+ self, *, recursive: bool = False, **kwargs: Any
193
+ ) -> Iterator[tuple[DataFile, int | None]]:
194
+ if recursive:
195
+ found = self.find("", detail=True, **kwargs)
196
+ items: Iterable[tuple[str, Mapping[str, Any]]] = found.items()
197
+ else:
198
+ items = (
199
+ (str(info["name"]), info)
200
+ for info in self.ls("", detail=True, **kwargs)
201
+ if isinstance(info, Mapping)
202
+ )
203
+
204
+ for relpath, info in sorted(items, key=lambda item: item[0]):
205
+ info_dict = dict(info)
206
+ if info_dict.get("type") != "file":
207
+ continue
208
+ size = info_dict.get("size")
209
+ yield self.file(relpath), size if isinstance(size, int) else None
210
+
211
+ def iter_files(
212
+ self, *, recursive: bool = False, **kwargs: Any
213
+ ) -> Iterator[DataFile]:
214
+ for file, _ in self.iter_files_with_sizes(recursive=recursive, **kwargs):
215
+ yield file
@@ -170,62 +170,77 @@ class DataFileSet:
170
170
  exts = tuple(e.lower() for e in self.extensions)
171
171
  seen: set[tuple[int, str]] = set()
172
172
  expanded: list[tuple[DataFile, ...]] = []
173
-
174
- def _append_file(out: list[DataFile], file: DataFile) -> None:
175
- if exts and not file.path.lower().endswith(exts):
173
+ sizes = dict(self._sizes)
174
+
175
+ def _append_file(
176
+ out: list[DataFile],
177
+ file: DataFile,
178
+ *,
179
+ size: int | None = None,
180
+ apply_extensions: bool = True,
181
+ ) -> None:
182
+ if apply_extensions and exts and not file.path.lower().endswith(exts):
176
183
  return
177
184
  key = (id(file.fs), file.path)
178
185
  if key in seen:
179
186
  return
180
187
  seen.add(key)
181
188
  out.append(file)
189
+ if size is not None:
190
+ sizes[(len(expanded), file.abs_path())] = int(size)
182
191
 
183
192
  for entry in self.entries:
184
193
  files: list[DataFile] = []
194
+ if isinstance(entry, _PathSource) and not glob.has_magic(entry.path):
195
+ try:
196
+ info = entry.fs.info(entry.path)
197
+ except FileNotFoundError:
198
+ raise FileNotFoundError(
199
+ f"Could not resolve input: {entry.fs.unstrip_protocol(entry.path)!r}"
200
+ )
201
+ item_type = info.get("type")
202
+ if item_type == "directory":
203
+ entry = DataFolder(path=entry.path, fs=entry.fs)
204
+ elif item_type == "file":
205
+ entry = DataFile(fs=entry.fs, path=entry.path)
206
+ else:
207
+ raise TypeError(
208
+ f"Unsupported file type {item_type!r} for input: "
209
+ f"{entry.fs.unstrip_protocol(entry.path)!r}"
210
+ )
211
+
185
212
  if isinstance(entry, DataFile):
186
- _append_file(files, entry)
213
+ _append_file(files, entry, apply_extensions=False)
187
214
  elif isinstance(entry, DataFolder):
188
- paths = (
189
- sorted(entry.find(""))
190
- if self.recursive
191
- else sorted(
192
- e["name"] if isinstance(e, dict) else e
193
- for e in entry.ls("", detail=True)
194
- if not isinstance(e, dict) or e.get("type") == "file"
195
- )
196
- )
197
- for path in paths:
198
- _append_file(files, entry.file(path))
215
+ for file, size in entry.iter_files_with_sizes(recursive=self.recursive):
216
+ _append_file(files, file, size=size)
199
217
  else:
200
218
  next_fs, path = entry.fs, entry.path
201
219
  if glob.has_magic(path):
202
- for expanded_path in sorted(next_fs.glob(path)):
203
- _append_file(files, DataFile(fs=next_fs, path=expanded_path))
204
- elif next_fs.exists(path):
205
- if next_fs.isdir(path):
206
- paths = (
207
- sorted(next_fs.find(path))
208
- if self.recursive
209
- else sorted(
210
- e["name"] if isinstance(e, dict) else e
211
- for e in next_fs.ls(path, detail=True)
212
- if not isinstance(e, dict) or e.get("type") == "file"
213
- )
220
+ matched = next_fs.glob(path, detail=True)
221
+ items = matched.items()
222
+ for expanded_path, info in sorted(items):
223
+ if not isinstance(expanded_path, str) or not isinstance(
224
+ info, Mapping
225
+ ):
226
+ continue
227
+ if info.get("type") != "file":
228
+ continue
229
+ size = info.get("size")
230
+ _append_file(
231
+ files,
232
+ DataFile(fs=next_fs, path=expanded_path),
233
+ size=size if isinstance(size, int) else None,
214
234
  )
215
- for expanded_path in paths:
216
- _append_file(
217
- files, DataFile(fs=next_fs, path=expanded_path)
218
- )
219
- else:
220
- _append_file(files, DataFile(fs=next_fs, path=path))
221
235
  else:
222
- raise FileNotFoundError(
223
- f"Could not resolve input: {next_fs.unstrip_protocol(path)!r}"
236
+ raise AssertionError(
237
+ "non-glob _PathSource should have been resolved"
224
238
  )
225
239
  expanded.append(tuple(files))
226
240
 
227
241
  out = tuple(expanded)
228
242
  object.__setattr__(self, "_expanded_sources", out)
243
+ object.__setattr__(self, "_sizes", sizes)
229
244
  return out
230
245
 
231
246
  def resolve_file(self, source_index: int, path: str) -> DataFile:
@@ -1,20 +1,22 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from abc import ABC, abstractmethod
4
+ from dataclasses import replace
4
5
  from typing import TYPE_CHECKING
5
- from uuid import uuid4
6
6
  import re
7
7
  import time
8
+ from uuid import uuid4
8
9
 
9
10
  from loguru import logger
10
11
 
11
12
  from refiner.platform.auth import CredentialsError
12
13
  from refiner.platform.client.api import MacrodataClient
13
- from refiner.platform.client.http import sanitize_terminal_text
14
+ from refiner.platform.client.http import request_json, sanitize_terminal_text
14
15
  from refiner.platform.manifest import build_run_manifest
15
16
  from refiner.worker.context import RunHandle
16
17
  from refiner.pipeline.planning import (
17
18
  PlannedStage,
19
+ StageComputeRequirements,
18
20
  compile_planned_stages,
19
21
  plan_pipeline_stages,
20
22
  )
@@ -34,6 +36,7 @@ class BaseLauncher(ABC):
34
36
  num_workers: int | None = None,
35
37
  heartbeat_interval_seconds: int | None = None,
36
38
  cpus_per_worker: int | None = None,
39
+ gpus_per_worker: int | None = None,
37
40
  ):
38
41
  if not name.strip():
39
42
  raise ValueError("name must be non-empty")
@@ -41,6 +44,7 @@ class BaseLauncher(ABC):
41
44
  self.name = name
42
45
  self.job_id = job_id or self._build_local_job_id(name)
43
46
  self.cpus_per_worker: int | None = None
47
+ self.gpus_per_worker: int | None = None
44
48
  if num_workers is not None:
45
49
  if num_workers <= 0:
46
50
  raise ValueError("num_workers must be > 0")
@@ -53,6 +57,10 @@ class BaseLauncher(ABC):
53
57
  if cpus_per_worker <= 0:
54
58
  raise ValueError("cpus_per_worker must be > 0")
55
59
  self.cpus_per_worker = cpus_per_worker
60
+ if gpus_per_worker is not None:
61
+ if gpus_per_worker <= 0:
62
+ raise ValueError("gpus_per_worker must be > 0")
63
+ self.gpus_per_worker = gpus_per_worker
56
64
 
57
65
  @staticmethod
58
66
  def _build_local_job_id(name: str) -> str:
@@ -81,6 +89,14 @@ class BaseLauncher(ABC):
81
89
  try:
82
90
  return MacrodataClient()
83
91
  except CredentialsError:
92
+ try:
93
+ request_json(
94
+ method="GET",
95
+ path="/api/me",
96
+ timeout_s=2.0,
97
+ )
98
+ except Exception:
99
+ pass
84
100
  self._warn(
85
101
  "platform integration disabled: no API key found in "
86
102
  "MACRODATA_API_KEY or local credentials. "
@@ -113,8 +129,22 @@ class BaseLauncher(ABC):
113
129
  *,
114
130
  secret_values: tuple[str, ...] = (),
115
131
  ) -> dict[str, object]:
132
+ resolved_stages = [
133
+ replace(stage, compute=self._stage_compute_requirements(stage.compute))
134
+ for stage in (stages or self._planned_stages())
135
+ ]
116
136
  return compile_planned_stages(
117
- stages or self._planned_stages(), secret_values=secret_values
137
+ resolved_stages,
138
+ secret_values=secret_values,
139
+ )
140
+
141
+ def _stage_compute_requirements(
142
+ self, compute: StageComputeRequirements
143
+ ) -> StageComputeRequirements:
144
+ return replace(
145
+ compute,
146
+ cpus_per_worker=self.cpus_per_worker,
147
+ gpus_per_worker=self.gpus_per_worker,
118
148
  )
119
149
 
120
150
  def _run_manifest(