macrodata-refiner 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. {macrodata_refiner-0.2.0/src/macrodata_refiner.egg-info → macrodata_refiner-0.2.2}/PKG-INFO +21 -4
  2. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/README.md +2 -0
  3. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/pyproject.toml +27 -5
  4. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2/src/macrodata_refiner.egg-info}/PKG-INFO +21 -4
  5. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/SOURCES.txt +14 -7
  6. macrodata_refiner-0.2.2/src/macrodata_refiner.egg-info/requires.txt +35 -0
  7. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/__init__.py +16 -20
  8. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/cli/auth.py +2 -2
  9. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/cli/ui.py +9 -0
  10. macrodata_refiner-0.2.2/src/refiner/execution/asyncio/__init__.py +1 -0
  11. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/execution/operators/row.py +10 -9
  12. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/execution/operators/vectorized.py +5 -10
  13. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/io/datafolder.py +55 -1
  14. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/io/fileset.py +50 -35
  15. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/launchers/base.py +33 -3
  16. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/launchers/cloud.py +74 -3
  17. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/launchers/local.py +15 -0
  18. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/__init__.py +0 -6
  19. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/tabular.py +10 -0
  20. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/expressions.py +86 -0
  21. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/pipeline.py +21 -5
  22. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/planning.py +31 -6
  23. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/__init__.py +0 -2
  24. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/lerobot.py +5 -4
  25. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/base.py +28 -2
  26. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/csv.py +14 -5
  27. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/jsonl.py +11 -2
  28. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/lerobot.py +5 -0
  29. macrodata_refiner-0.2.2/src/refiner/pipeline/sources/readers/parquet.py +450 -0
  30. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/decoder_cache.py +4 -2
  31. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/platform/client/api.py +5 -13
  32. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/platform/client/http.py +28 -4
  33. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/platform/client/models.py +6 -0
  34. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/platform/manifest.py +36 -35
  35. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/__init__.py +0 -2
  36. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/row.py +3 -12
  37. macrodata_refiner-0.2.2/src/refiner/text/__init__.py +11 -0
  38. macrodata_refiner-0.2.2/src/refiner/text/commoncrawl.py +654 -0
  39. macrodata_refiner-0.2.2/src/refiner/utils/__init__.py +3 -0
  40. macrodata_refiner-0.2.2/src/refiner/utils/imports.py +75 -0
  41. macrodata_refiner-0.2.2/src/refiner/video/__init__.py +41 -0
  42. {macrodata_refiner-0.2.0/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/remux.py +6 -4
  43. {macrodata_refiner-0.2.0/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/transcode.py +6 -4
  44. {macrodata_refiner-0.2.0/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/writer.py +12 -14
  45. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/entrypoint.py +12 -0
  46. macrodata_refiner-0.2.2/src/refiner/worker/resources/gpu.py +81 -0
  47. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/runner.py +29 -18
  48. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/tests/test_cache.py +1 -1
  49. macrodata_refiner-0.2.2/tests/test_commoncrawl_text.py +1194 -0
  50. macrodata_refiner-0.2.2/tests/test_optional_dependencies.py +19 -0
  51. macrodata_refiner-0.2.0/src/macrodata_refiner.egg-info/requires.txt +0 -14
  52. macrodata_refiner-0.2.0/src/refiner/media/__init__.py +0 -3
  53. macrodata_refiner-0.2.0/src/refiner/media/video/__init__.py +0 -3
  54. macrodata_refiner-0.2.0/src/refiner/pipeline/sources/readers/parquet.py +0 -252
  55. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/LICENSE +0 -0
  56. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/setup.cfg +0 -0
  57. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/dependency_links.txt +0 -0
  58. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/entry_points.txt +0 -0
  59. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/top_level.txt +0 -0
  60. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/cli/__init__.py +0 -0
  61. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/cli/main.py +0 -0
  62. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/execution/__init__.py +0 -0
  63. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/execution/asyncio/runtime.py +0 -0
  64. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/execution/asyncio/window.py +0 -0
  65. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/execution/buffer.py +0 -0
  66. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/execution/engine.py +0 -0
  67. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/execution/operators/__init__.py +0 -0
  68. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/execution/tracking/__init__.py +0 -0
  69. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/execution/tracking/shards.py +0 -0
  70. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/io/__init__.py +0 -0
  71. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/io/datafile.py +0 -0
  72. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/launchers/__init__.py +0 -0
  73. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/block.py +0 -0
  74. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/row.py +0 -0
  75. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/shard.py +0 -0
  76. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/base.py +0 -0
  77. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/jsonl.py +0 -0
  78. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/lerobot_reducer.py +0 -0
  79. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/parquet.py +0 -0
  80. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/__init__.py +0 -0
  81. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/base.py +0 -0
  82. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/items.py +0 -0
  83. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/__init__.py +0 -0
  84. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/utils.py +0 -0
  85. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/task.py +0 -0
  86. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/steps.py +0 -0
  87. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/__init__.py +0 -0
  88. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/__init__.py +0 -0
  89. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/file_cache.py +0 -0
  90. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/lease_cache.py +0 -0
  91. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/platform/__init__.py +0 -0
  92. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/platform/auth.py +0 -0
  93. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/platform/client/__init__.py +0 -0
  94. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/platform/client/serialize.py +0 -0
  95. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/py.typed +0 -0
  96. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/robotics/__init__.py +0 -0
  97. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/__init__.py +0 -0
  98. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/info.py +0 -0
  99. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/metadata.py +0 -0
  100. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/stats.py +0 -0
  101. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/tasks.py +0 -0
  102. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/tabular.py +0 -0
  103. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/robotics/motion.py +0 -0
  104. {macrodata_refiner-0.2.0/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/types.py +0 -0
  105. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/__init__.py +0 -0
  106. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/context.py +0 -0
  107. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/__init__.py +0 -0
  108. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/base.py +0 -0
  109. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/__init__.py +0 -0
  110. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/claim.py +0 -0
  111. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/files.py +0 -0
  112. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/lifecycle.py +0 -0
  113. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/platform.py +0 -0
  114. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/__init__.py +0 -0
  115. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/api.py +0 -0
  116. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/context.py +0 -0
  117. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/otel.py +0 -0
  118. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/__init__.py +0 -0
  119. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/cpu.py +0 -0
  120. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/memory.py +0 -0
  121. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/network.py +0 -0
  122. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/src/refiner/worker/workdir.py +0 -0
  123. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.2}/tests/test_expressions.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: macrodata-refiner
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
5
5
  Author: Macrodata Labs
6
6
  License-Expression: Apache-2.0
@@ -12,12 +12,10 @@ Classifier: Programming Language :: Python :: 3.12
12
12
  Requires-Python: >=3.10
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
- Requires-Dist: av
16
15
  Requires-Dist: cloudpickle==3.1.2
17
16
  Requires-Dist: fsspec
18
17
  Requires-Dist: httpx
19
18
  Requires-Dist: loguru
20
- Requires-Dist: huggingface-hub>=1.4.1
21
19
  Requires-Dist: opentelemetry-exporter-otlp-proto-http
22
20
  Requires-Dist: opentelemetry-sdk
23
21
  Requires-Dist: numpy
@@ -25,7 +23,24 @@ Requires-Dist: psutil
25
23
  Requires-Dist: orjson
26
24
  Requires-Dist: pyarrow
27
25
  Requires-Dist: msgspec>=0.20.0
28
- Requires-Dist: hf>=1.7.1
26
+ Provides-Extra: video
27
+ Requires-Dist: av; extra == "video"
28
+ Provides-Extra: robotics
29
+ Requires-Dist: macrodata-refiner[video]; extra == "robotics"
30
+ Requires-Dist: huggingface-hub>=1.4.1; extra == "robotics"
31
+ Requires-Dist: hf>=1.7.1; extra == "robotics"
32
+ Provides-Extra: text
33
+ Requires-Dist: warcio; extra == "text"
34
+ Provides-Extra: s3
35
+ Requires-Dist: s3fs; extra == "s3"
36
+ Provides-Extra: testing
37
+ Requires-Dist: macrodata-refiner[robotics]; extra == "testing"
38
+ Requires-Dist: macrodata-refiner[text]; extra == "testing"
39
+ Requires-Dist: macrodata-refiner[s3]; extra == "testing"
40
+ Requires-Dist: pytest>=8.0.0; extra == "testing"
41
+ Requires-Dist: pytest-cov>=5.0.0; extra == "testing"
42
+ Provides-Extra: all
43
+ Requires-Dist: macrodata-refiner[testing]; extra == "all"
29
44
  Dynamic: license-file
30
45
 
31
46
  <p align="center">
@@ -83,6 +98,8 @@ import refiner as mdr
83
98
  )
84
99
  ```
85
100
 
101
+ Need cloud GPUs? See [Launchers](docs/launchers.md) for the GPU-specific cloud options.
102
+
86
103
  ### Local example
87
104
 
88
105
  Launch a local pipeline:
@@ -53,6 +53,8 @@ import refiner as mdr
53
53
  )
54
54
  ```
55
55
 
56
+ Need cloud GPUs? See [Launchers](docs/launchers.md) for the GPU-specific cloud options.
57
+
56
58
  ### Local example
57
59
 
58
60
  Launch a local pipeline:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "macrodata-refiner"
3
- version = "0.2.0"
3
+ version = "0.2.2"
4
4
  description = "Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets"
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -16,12 +16,10 @@ authors = [
16
16
  ]
17
17
  requires-python = ">=3.10"
18
18
  dependencies = [
19
- "av",
20
19
  "cloudpickle==3.1.2",
21
20
  "fsspec",
22
21
  "httpx",
23
22
  "loguru",
24
- "huggingface-hub>=1.4.1",
25
23
  "opentelemetry-exporter-otlp-proto-http",
26
24
  "opentelemetry-sdk",
27
25
  "numpy",
@@ -29,8 +27,33 @@ dependencies = [
29
27
  "orjson",
30
28
  "pyarrow",
31
29
  "msgspec>=0.20.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ video = [
34
+ "av",
35
+ ]
36
+ robotics = [
37
+ "macrodata-refiner[video]",
38
+ "huggingface-hub>=1.4.1",
32
39
  "hf>=1.7.1",
33
40
  ]
41
+ text = [
42
+ "warcio",
43
+ ]
44
+ s3 = [
45
+ "s3fs",
46
+ ]
47
+ testing = [
48
+ "macrodata-refiner[robotics]",
49
+ "macrodata-refiner[text]",
50
+ "macrodata-refiner[s3]",
51
+ "pytest>=8.0.0",
52
+ "pytest-cov>=5.0.0",
53
+ ]
54
+ all = [
55
+ "macrodata-refiner[testing]",
56
+ ]
34
57
 
35
58
  [project.scripts]
36
59
  macrodata = "refiner.cli.main:main"
@@ -47,9 +70,8 @@ refiner = ["py.typed"]
47
70
 
48
71
  [dependency-groups]
49
72
  dev = [
73
+ "macrodata-refiner[all]",
50
74
  "pre-commit>=4.0.0",
51
- "pytest>=8.0.0",
52
- "pytest-cov>=5.0.0",
53
75
  "ruff>=0.14.10",
54
76
  "ty>=0.0.7",
55
77
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: macrodata-refiner
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
5
5
  Author: Macrodata Labs
6
6
  License-Expression: Apache-2.0
@@ -12,12 +12,10 @@ Classifier: Programming Language :: Python :: 3.12
12
12
  Requires-Python: >=3.10
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
- Requires-Dist: av
16
15
  Requires-Dist: cloudpickle==3.1.2
17
16
  Requires-Dist: fsspec
18
17
  Requires-Dist: httpx
19
18
  Requires-Dist: loguru
20
- Requires-Dist: huggingface-hub>=1.4.1
21
19
  Requires-Dist: opentelemetry-exporter-otlp-proto-http
22
20
  Requires-Dist: opentelemetry-sdk
23
21
  Requires-Dist: numpy
@@ -25,7 +23,24 @@ Requires-Dist: psutil
25
23
  Requires-Dist: orjson
26
24
  Requires-Dist: pyarrow
27
25
  Requires-Dist: msgspec>=0.20.0
28
- Requires-Dist: hf>=1.7.1
26
+ Provides-Extra: video
27
+ Requires-Dist: av; extra == "video"
28
+ Provides-Extra: robotics
29
+ Requires-Dist: macrodata-refiner[video]; extra == "robotics"
30
+ Requires-Dist: huggingface-hub>=1.4.1; extra == "robotics"
31
+ Requires-Dist: hf>=1.7.1; extra == "robotics"
32
+ Provides-Extra: text
33
+ Requires-Dist: warcio; extra == "text"
34
+ Provides-Extra: s3
35
+ Requires-Dist: s3fs; extra == "s3"
36
+ Provides-Extra: testing
37
+ Requires-Dist: macrodata-refiner[robotics]; extra == "testing"
38
+ Requires-Dist: macrodata-refiner[text]; extra == "testing"
39
+ Requires-Dist: macrodata-refiner[s3]; extra == "testing"
40
+ Requires-Dist: pytest>=8.0.0; extra == "testing"
41
+ Requires-Dist: pytest-cov>=5.0.0; extra == "testing"
42
+ Provides-Extra: all
43
+ Requires-Dist: macrodata-refiner[testing]; extra == "all"
29
44
  Dynamic: license-file
30
45
 
31
46
  <p align="center">
@@ -83,6 +98,8 @@ import refiner as mdr
83
98
  )
84
99
  ```
85
100
 
101
+ Need cloud GPUs? See [Launchers](docs/launchers.md) for the GPU-specific cloud options.
102
+
86
103
  ### Local example
87
104
 
88
105
  Launch a local pipeline:
@@ -16,6 +16,7 @@ src/refiner/cli/ui.py
16
16
  src/refiner/execution/__init__.py
17
17
  src/refiner/execution/buffer.py
18
18
  src/refiner/execution/engine.py
19
+ src/refiner/execution/asyncio/__init__.py
19
20
  src/refiner/execution/asyncio/runtime.py
20
21
  src/refiner/execution/asyncio/window.py
21
22
  src/refiner/execution/operators/__init__.py
@@ -31,12 +32,6 @@ src/refiner/launchers/__init__.py
31
32
  src/refiner/launchers/base.py
32
33
  src/refiner/launchers/cloud.py
33
34
  src/refiner/launchers/local.py
34
- src/refiner/media/__init__.py
35
- src/refiner/media/video/__init__.py
36
- src/refiner/media/video/remux.py
37
- src/refiner/media/video/transcode.py
38
- src/refiner/media/video/types.py
39
- src/refiner/media/video/writer.py
40
35
  src/refiner/pipeline/__init__.py
41
36
  src/refiner/pipeline/expressions.py
42
37
  src/refiner/pipeline/pipeline.py
@@ -86,6 +81,15 @@ src/refiner/robotics/lerobot_format/metadata/info.py
86
81
  src/refiner/robotics/lerobot_format/metadata/metadata.py
87
82
  src/refiner/robotics/lerobot_format/metadata/stats.py
88
83
  src/refiner/robotics/lerobot_format/metadata/tasks.py
84
+ src/refiner/text/__init__.py
85
+ src/refiner/text/commoncrawl.py
86
+ src/refiner/utils/__init__.py
87
+ src/refiner/utils/imports.py
88
+ src/refiner/video/__init__.py
89
+ src/refiner/video/remux.py
90
+ src/refiner/video/transcode.py
91
+ src/refiner/video/types.py
92
+ src/refiner/video/writer.py
89
93
  src/refiner/worker/__init__.py
90
94
  src/refiner/worker/context.py
91
95
  src/refiner/worker/entrypoint.py
@@ -104,7 +108,10 @@ src/refiner/worker/metrics/context.py
104
108
  src/refiner/worker/metrics/otel.py
105
109
  src/refiner/worker/resources/__init__.py
106
110
  src/refiner/worker/resources/cpu.py
111
+ src/refiner/worker/resources/gpu.py
107
112
  src/refiner/worker/resources/memory.py
108
113
  src/refiner/worker/resources/network.py
109
114
  tests/test_cache.py
110
- tests/test_expressions.py
115
+ tests/test_commoncrawl_text.py
116
+ tests/test_expressions.py
117
+ tests/test_optional_dependencies.py
@@ -0,0 +1,35 @@
1
+ cloudpickle==3.1.2
2
+ fsspec
3
+ httpx
4
+ loguru
5
+ opentelemetry-exporter-otlp-proto-http
6
+ opentelemetry-sdk
7
+ numpy
8
+ psutil
9
+ orjson
10
+ pyarrow
11
+ msgspec>=0.20.0
12
+
13
+ [all]
14
+ macrodata-refiner[testing]
15
+
16
+ [robotics]
17
+ macrodata-refiner[video]
18
+ huggingface-hub>=1.4.1
19
+ hf>=1.7.1
20
+
21
+ [s3]
22
+ s3fs
23
+
24
+ [testing]
25
+ macrodata-refiner[robotics]
26
+ macrodata-refiner[text]
27
+ macrodata-refiner[s3]
28
+ pytest>=8.0.0
29
+ pytest-cov>=5.0.0
30
+
31
+ [text]
32
+ warcio
33
+
34
+ [video]
35
+ av
@@ -1,11 +1,9 @@
1
+ import refiner.io as io
2
+ import refiner.pipeline as pipeline
1
3
  import refiner.robotics as robotics
2
- from refiner.io import DataFile, DataFileSet, DataFolder
3
- from refiner.launchers import LaunchStats, LocalLauncher
4
- from refiner.media import VideoFile
4
+ import refiner.text as text
5
+ import refiner.video as video
5
6
  from refiner.pipeline import (
6
- RefinerPipeline,
7
- Row,
8
- Shard,
9
7
  from_items,
10
8
  from_source,
11
9
  read_csv,
@@ -22,19 +20,11 @@ from refiner.worker.metrics.api import (
22
20
  log_throughput,
23
21
  register_gauge,
24
22
  )
25
- from refiner.worker.runner import Worker, WorkerRunStats
23
+
24
+ robot = robotics
26
25
 
27
26
  __all__ = [
28
- "RefinerPipeline",
29
- "LocalLauncher",
30
- "LaunchStats",
31
- "DataFile",
32
- "DataFolder",
33
- "DataFileSet",
34
- "Shard",
35
- "Row",
36
- "Worker",
37
- "WorkerRunStats",
27
+ # sources
38
28
  "read_csv",
39
29
  "read_jsonl",
40
30
  "read_lerobot",
@@ -42,16 +32,22 @@ __all__ = [
42
32
  "from_items",
43
33
  "from_source",
44
34
  "task",
35
+ # metrics
45
36
  "log_throughput",
46
37
  "log_gauge",
47
38
  "log_gauges",
48
- "register_gauge",
49
39
  "log_histogram",
40
+ "register_gauge",
41
+ # expressions
50
42
  "col",
51
43
  "lit",
52
44
  "coalesce",
53
45
  "if_else",
54
- "VideoFile",
55
- "Video",
46
+ # submodules
47
+ "io",
48
+ "pipeline",
49
+ "video",
50
+ "robot",
56
51
  "robotics",
52
+ "text",
57
53
  ]
@@ -18,7 +18,7 @@ from refiner.platform.client import (
18
18
  sanitize_terminal_text,
19
19
  verify_api_key,
20
20
  )
21
- from refiner.cli.ui import display_identity, print_banner
21
+ from refiner.cli.ui import display_identity, print_banner, stdin_is_interactive
22
22
 
23
23
  _TOKEN_SETTINGS_SUFFIX = "/settings/api-keys"
24
24
 
@@ -31,7 +31,7 @@ def _read_token(args: argparse.Namespace) -> str:
31
31
  if args.token and args.token.strip():
32
32
  return args.token.strip()
33
33
 
34
- read_from_stdin = args.token_stdin or not sys.stdin.isatty()
34
+ read_from_stdin = args.token_stdin or not stdin_is_interactive()
35
35
  if read_from_stdin:
36
36
  token = sys.stdin.read().strip()
37
37
  if token:
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import sys
4
+
3
5
  from refiner.platform.client import UserIdentity
4
6
 
5
7
  ASCII_BANNER = r"""
@@ -26,3 +28,10 @@ def display_identity(user: UserIdentity) -> str:
26
28
  if email:
27
29
  return f"{label} ({email})"
28
30
  return label
31
+
32
+
33
+ def stdin_is_interactive() -> bool:
34
+ try:
35
+ return sys.stdin.isatty()
36
+ except Exception: # pragma: no cover
37
+ return False
@@ -58,15 +58,16 @@ def execute_row_steps(
58
58
  )
59
59
 
60
60
  async def _run_async_step(*, step: AsyncRowStep, row: Row) -> Row:
61
- result = step.apply_row_async(row)
62
- if inspect.isawaitable(result):
63
- result = await result
64
- result = cast(MapResult, result)
65
- if isinstance(result, Row):
66
- return result
67
- if isinstance(result, dict):
68
- return row.update(result)
69
- raise TypeError(f"Unsupported map_async() result type: {type(result)!r}")
61
+ with set_active_step_index(step.index):
62
+ result = step.apply_row_async(row)
63
+ if inspect.isawaitable(result):
64
+ result = await result
65
+ result = cast(MapResult, result)
66
+ if isinstance(result, Row):
67
+ return result
68
+ if isinstance(result, dict):
69
+ return row.update(result)
70
+ raise TypeError(f"Unsupported map_async() result type: {type(result)!r}")
70
71
 
71
72
  def _run_step(i: int, *, flush_all: bool) -> None:
72
73
  step = ordered[i]
@@ -10,7 +10,7 @@ from refiner.execution.tracking.shards import (
10
10
  count_table_by_shard,
11
11
  counts_delta,
12
12
  )
13
- from refiner.pipeline.data.tabular import repeat_scalar
13
+ from refiner.pipeline.data.tabular import filter_table, repeat_scalar
14
14
  from refiner.pipeline.expressions import eval_expr_arrow
15
15
  from refiner.pipeline.steps import (
16
16
  CastStep,
@@ -22,6 +22,7 @@ from refiner.pipeline.steps import (
22
22
  VectorizedOp,
23
23
  WithColumnsStep,
24
24
  )
25
+ from refiner.worker.context import set_active_step_index
25
26
  from refiner.worker.metrics.api import log_throughput
26
27
 
27
28
 
@@ -68,14 +69,7 @@ def apply_vectorized_op(
68
69
  return out, None
69
70
 
70
71
  if isinstance(op, FilterExprStep):
71
- mask = eval_expr_arrow(op.predicate, table)
72
- next_table = (
73
- table
74
- if isinstance(mask, pa.Scalar) and bool(mask.as_py())
75
- else (
76
- table.slice(0, 0) if isinstance(mask, pa.Scalar) else table.filter(mask)
77
- )
78
- )
72
+ next_table = filter_table(table, op.predicate)
79
73
  next_shard_counts = count_table_by_shard(next_table)
80
74
  for shard_id in set(shard_counts) | set(next_shard_counts):
81
75
  previous = int(shard_counts.get(shard_id, 0))
@@ -100,7 +94,8 @@ def apply_vectorized_op(
100
94
  return next_table, next_shard_counts
101
95
 
102
96
  if isinstance(op, FnTableStep):
103
- next_table = op.fn(table)
97
+ with set_active_step_index(op.index):
98
+ next_table = op.fn(table)
104
99
  if not isinstance(next_table, pa.Table):
105
100
  raise TypeError(
106
101
  f"map_table() must return pa.Table, got {type(next_table)!r}"
@@ -1,4 +1,4 @@
1
- from collections.abc import Iterable, Mapping
1
+ from collections.abc import Iterable, Iterator, Mapping
2
2
  from os import PathLike
3
3
  from typing import IO, Any, TypeAlias, Union, cast
4
4
 
@@ -113,6 +113,34 @@ class DataFolder(DirFileSystem):
113
113
  return self.abs_path(paths)
114
114
  return [self.abs_path(p) for p in paths]
115
115
 
116
+ def find(self, path: str, *args, **kwargs):
117
+ # Avoid DirFileSystem.find(): some backends (notably HF buckets) can leak
118
+ # sibling prefix matches like `root-2/...` or return the bare root entry,
119
+ # and DirFileSystem._relpath() asserts before we can filter them out.
120
+ """List paths under this folder, skipping backend results outside the base path."""
121
+ detail = kwargs.get("detail", False)
122
+ target = self._join(path.rstrip("/"))
123
+ ret = self.fs.find(target, *args, **kwargs)
124
+ target = target.rstrip("/")
125
+ target_prefix = target + self.fs.sep
126
+ alt_target = target[1:] if target.startswith(self.fs.sep) else None
127
+ alt_prefix = alt_target + self.fs.sep if alt_target is not None else None
128
+
129
+ def rel(p: str) -> str | None:
130
+ if p == target or (alt_target is not None and p == alt_target):
131
+ return path.rstrip("/")
132
+ if p.startswith(target_prefix):
133
+ suffix = p[len(target_prefix) :]
134
+ elif alt_prefix is not None and p.startswith(alt_prefix):
135
+ suffix = p[len(alt_prefix) :]
136
+ else:
137
+ return None
138
+ return suffix if path in {"", "/"} else f"{path.rstrip('/')}/{suffix}"
139
+
140
+ if detail:
141
+ return {r: info for p, info in ret.items() if (r := rel(p)) is not None}
142
+ return [r for p in ret if (r := rel(p)) is not None]
143
+
116
144
  def open_files(
117
145
  self, paths: Iterable[str], mode: str = "rb", **kwargs
118
146
  ) -> list[IO[Any]]:
@@ -159,3 +187,29 @@ class DataFolder(DirFileSystem):
159
187
 
160
188
  def files(self, relpaths: Iterable[str]) -> list[DataFile]:
161
189
  return [self.file(p) for p in relpaths]
190
+
191
+ def iter_files_with_sizes(
192
+ self, *, recursive: bool = False, **kwargs: Any
193
+ ) -> Iterator[tuple[DataFile, int | None]]:
194
+ if recursive:
195
+ found = self.find("", detail=True, **kwargs)
196
+ items: Iterable[tuple[str, Mapping[str, Any]]] = found.items()
197
+ else:
198
+ items = (
199
+ (str(info["name"]), info)
200
+ for info in self.ls("", detail=True, **kwargs)
201
+ if isinstance(info, Mapping)
202
+ )
203
+
204
+ for relpath, info in sorted(items, key=lambda item: item[0]):
205
+ info_dict = dict(info)
206
+ if info_dict.get("type") != "file":
207
+ continue
208
+ size = info_dict.get("size")
209
+ yield self.file(relpath), size if isinstance(size, int) else None
210
+
211
+ def iter_files(
212
+ self, *, recursive: bool = False, **kwargs: Any
213
+ ) -> Iterator[DataFile]:
214
+ for file, _ in self.iter_files_with_sizes(recursive=recursive, **kwargs):
215
+ yield file
@@ -170,62 +170,77 @@ class DataFileSet:
170
170
  exts = tuple(e.lower() for e in self.extensions)
171
171
  seen: set[tuple[int, str]] = set()
172
172
  expanded: list[tuple[DataFile, ...]] = []
173
-
174
- def _append_file(out: list[DataFile], file: DataFile) -> None:
175
- if exts and not file.path.lower().endswith(exts):
173
+ sizes = dict(self._sizes)
174
+
175
+ def _append_file(
176
+ out: list[DataFile],
177
+ file: DataFile,
178
+ *,
179
+ size: int | None = None,
180
+ apply_extensions: bool = True,
181
+ ) -> None:
182
+ if apply_extensions and exts and not file.path.lower().endswith(exts):
176
183
  return
177
184
  key = (id(file.fs), file.path)
178
185
  if key in seen:
179
186
  return
180
187
  seen.add(key)
181
188
  out.append(file)
189
+ if size is not None:
190
+ sizes[(len(expanded), file.abs_path())] = int(size)
182
191
 
183
192
  for entry in self.entries:
184
193
  files: list[DataFile] = []
194
+ if isinstance(entry, _PathSource) and not glob.has_magic(entry.path):
195
+ try:
196
+ info = entry.fs.info(entry.path)
197
+ except FileNotFoundError:
198
+ raise FileNotFoundError(
199
+ f"Could not resolve input: {entry.fs.unstrip_protocol(entry.path)!r}"
200
+ )
201
+ item_type = info.get("type")
202
+ if item_type == "directory":
203
+ entry = DataFolder(path=entry.path, fs=entry.fs)
204
+ elif item_type == "file":
205
+ entry = DataFile(fs=entry.fs, path=entry.path)
206
+ else:
207
+ raise TypeError(
208
+ f"Unsupported file type {item_type!r} for input: "
209
+ f"{entry.fs.unstrip_protocol(entry.path)!r}"
210
+ )
211
+
185
212
  if isinstance(entry, DataFile):
186
- _append_file(files, entry)
213
+ _append_file(files, entry, apply_extensions=False)
187
214
  elif isinstance(entry, DataFolder):
188
- paths = (
189
- sorted(entry.find(""))
190
- if self.recursive
191
- else sorted(
192
- e["name"] if isinstance(e, dict) else e
193
- for e in entry.ls("", detail=True)
194
- if not isinstance(e, dict) or e.get("type") == "file"
195
- )
196
- )
197
- for path in paths:
198
- _append_file(files, entry.file(path))
215
+ for file, size in entry.iter_files_with_sizes(recursive=self.recursive):
216
+ _append_file(files, file, size=size)
199
217
  else:
200
218
  next_fs, path = entry.fs, entry.path
201
219
  if glob.has_magic(path):
202
- for expanded_path in sorted(next_fs.glob(path)):
203
- _append_file(files, DataFile(fs=next_fs, path=expanded_path))
204
- elif next_fs.exists(path):
205
- if next_fs.isdir(path):
206
- paths = (
207
- sorted(next_fs.find(path))
208
- if self.recursive
209
- else sorted(
210
- e["name"] if isinstance(e, dict) else e
211
- for e in next_fs.ls(path, detail=True)
212
- if not isinstance(e, dict) or e.get("type") == "file"
213
- )
220
+ matched = next_fs.glob(path, detail=True)
221
+ items = matched.items()
222
+ for expanded_path, info in sorted(items):
223
+ if not isinstance(expanded_path, str) or not isinstance(
224
+ info, Mapping
225
+ ):
226
+ continue
227
+ if info.get("type") != "file":
228
+ continue
229
+ size = info.get("size")
230
+ _append_file(
231
+ files,
232
+ DataFile(fs=next_fs, path=expanded_path),
233
+ size=size if isinstance(size, int) else None,
214
234
  )
215
- for expanded_path in paths:
216
- _append_file(
217
- files, DataFile(fs=next_fs, path=expanded_path)
218
- )
219
- else:
220
- _append_file(files, DataFile(fs=next_fs, path=path))
221
235
  else:
222
- raise FileNotFoundError(
223
- f"Could not resolve input: {next_fs.unstrip_protocol(path)!r}"
236
+ raise AssertionError(
237
+ "non-glob _PathSource should have been resolved"
224
238
  )
225
239
  expanded.append(tuple(files))
226
240
 
227
241
  out = tuple(expanded)
228
242
  object.__setattr__(self, "_expanded_sources", out)
243
+ object.__setattr__(self, "_sizes", sizes)
229
244
  return out
230
245
 
231
246
  def resolve_file(self, source_index: int, path: str) -> DataFile: