macrodata-refiner 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {macrodata_refiner-0.2.1/src/macrodata_refiner.egg-info → macrodata_refiner-0.2.2}/PKG-INFO +21 -4
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/README.md +2 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/pyproject.toml +27 -5
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2/src/macrodata_refiner.egg-info}/PKG-INFO +21 -4
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/SOURCES.txt +14 -7
- macrodata_refiner-0.2.2/src/macrodata_refiner.egg-info/requires.txt +35 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/__init__.py +16 -20
- macrodata_refiner-0.2.2/src/refiner/execution/asyncio/__init__.py +1 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/operators/vectorized.py +2 -9
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/io/datafolder.py +55 -1
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/io/fileset.py +50 -35
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/launchers/base.py +33 -3
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/launchers/cloud.py +26 -1
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/launchers/local.py +15 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/__init__.py +0 -6
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/tabular.py +10 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/expressions.py +86 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/pipeline.py +21 -5
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/planning.py +20 -2
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/__init__.py +0 -2
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/lerobot.py +5 -4
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/base.py +28 -2
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/csv.py +14 -5
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/jsonl.py +11 -2
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/lerobot.py +1 -0
- macrodata_refiner-0.2.2/src/refiner/pipeline/sources/readers/parquet.py +450 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/decoder_cache.py +4 -2
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/client/api.py +5 -13
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/client/http.py +28 -4
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/client/models.py +6 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/__init__.py +0 -2
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/row.py +3 -12
- macrodata_refiner-0.2.2/src/refiner/text/__init__.py +11 -0
- macrodata_refiner-0.2.2/src/refiner/text/commoncrawl.py +654 -0
- macrodata_refiner-0.2.2/src/refiner/utils/__init__.py +3 -0
- macrodata_refiner-0.2.2/src/refiner/utils/imports.py +75 -0
- macrodata_refiner-0.2.2/src/refiner/video/__init__.py +41 -0
- {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/remux.py +6 -4
- {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/transcode.py +6 -4
- {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/writer.py +12 -14
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/entrypoint.py +12 -0
- macrodata_refiner-0.2.2/src/refiner/worker/resources/gpu.py +81 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/tests/test_cache.py +1 -1
- macrodata_refiner-0.2.2/tests/test_commoncrawl_text.py +1194 -0
- macrodata_refiner-0.2.2/tests/test_optional_dependencies.py +19 -0
- macrodata_refiner-0.2.1/src/macrodata_refiner.egg-info/requires.txt +0 -14
- macrodata_refiner-0.2.1/src/refiner/media/__init__.py +0 -3
- macrodata_refiner-0.2.1/src/refiner/media/video/__init__.py +0 -3
- macrodata_refiner-0.2.1/src/refiner/pipeline/sources/readers/parquet.py +0 -252
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/LICENSE +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/setup.cfg +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/dependency_links.txt +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/entry_points.txt +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/top_level.txt +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/cli/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/cli/auth.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/cli/main.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/cli/ui.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/asyncio/runtime.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/asyncio/window.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/buffer.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/engine.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/operators/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/operators/row.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/tracking/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/tracking/shards.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/io/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/io/datafile.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/launchers/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/block.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/row.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/data/shard.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/base.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/jsonl.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/lerobot_reducer.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sinks/parquet.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/base.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/items.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/readers/utils.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/sources/task.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/steps.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/file_cache.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/pipeline/utils/cache/lease_cache.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/auth.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/client/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/client/serialize.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/platform/manifest.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/py.typed +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/info.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/metadata.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/stats.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/metadata/tasks.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/lerobot_format/tabular.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/robotics/motion.py +0 -0
- {macrodata_refiner-0.2.1/src/refiner/media → macrodata_refiner-0.2.2/src/refiner}/video/types.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/context.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/base.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/claim.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/files.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/local/lifecycle.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/lifecycle/platform.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/api.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/context.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/metrics/otel.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/__init__.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/cpu.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/memory.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/resources/network.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/runner.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/worker/workdir.py +0 -0
- {macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/tests/test_expressions.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: macrodata-refiner
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
|
|
5
5
|
Author: Macrodata Labs
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -12,12 +12,10 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
12
12
|
Requires-Python: >=3.10
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist: av
|
|
16
15
|
Requires-Dist: cloudpickle==3.1.2
|
|
17
16
|
Requires-Dist: fsspec
|
|
18
17
|
Requires-Dist: httpx
|
|
19
18
|
Requires-Dist: loguru
|
|
20
|
-
Requires-Dist: huggingface-hub>=1.4.1
|
|
21
19
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http
|
|
22
20
|
Requires-Dist: opentelemetry-sdk
|
|
23
21
|
Requires-Dist: numpy
|
|
@@ -25,7 +23,24 @@ Requires-Dist: psutil
|
|
|
25
23
|
Requires-Dist: orjson
|
|
26
24
|
Requires-Dist: pyarrow
|
|
27
25
|
Requires-Dist: msgspec>=0.20.0
|
|
28
|
-
|
|
26
|
+
Provides-Extra: video
|
|
27
|
+
Requires-Dist: av; extra == "video"
|
|
28
|
+
Provides-Extra: robotics
|
|
29
|
+
Requires-Dist: macrodata-refiner[video]; extra == "robotics"
|
|
30
|
+
Requires-Dist: huggingface-hub>=1.4.1; extra == "robotics"
|
|
31
|
+
Requires-Dist: hf>=1.7.1; extra == "robotics"
|
|
32
|
+
Provides-Extra: text
|
|
33
|
+
Requires-Dist: warcio; extra == "text"
|
|
34
|
+
Provides-Extra: s3
|
|
35
|
+
Requires-Dist: s3fs; extra == "s3"
|
|
36
|
+
Provides-Extra: testing
|
|
37
|
+
Requires-Dist: macrodata-refiner[robotics]; extra == "testing"
|
|
38
|
+
Requires-Dist: macrodata-refiner[text]; extra == "testing"
|
|
39
|
+
Requires-Dist: macrodata-refiner[s3]; extra == "testing"
|
|
40
|
+
Requires-Dist: pytest>=8.0.0; extra == "testing"
|
|
41
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "testing"
|
|
42
|
+
Provides-Extra: all
|
|
43
|
+
Requires-Dist: macrodata-refiner[testing]; extra == "all"
|
|
29
44
|
Dynamic: license-file
|
|
30
45
|
|
|
31
46
|
<p align="center">
|
|
@@ -83,6 +98,8 @@ import refiner as mdr
|
|
|
83
98
|
)
|
|
84
99
|
```
|
|
85
100
|
|
|
101
|
+
Need cloud GPUs? See [Launchers](docs/launchers.md) for the GPU-specific cloud options.
|
|
102
|
+
|
|
86
103
|
### Local example
|
|
87
104
|
|
|
88
105
|
Launch a local pipeline:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "macrodata-refiner"
|
|
3
|
-
version = "0.2.1"
|
|
3
|
+
version = "0.2.2"
|
|
4
4
|
description = "Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "Apache-2.0"
|
|
@@ -16,12 +16,10 @@ authors = [
|
|
|
16
16
|
]
|
|
17
17
|
requires-python = ">=3.10"
|
|
18
18
|
dependencies = [
|
|
19
|
-
"av",
|
|
20
19
|
"cloudpickle==3.1.2",
|
|
21
20
|
"fsspec",
|
|
22
21
|
"httpx",
|
|
23
22
|
"loguru",
|
|
24
|
-
"huggingface-hub>=1.4.1",
|
|
25
23
|
"opentelemetry-exporter-otlp-proto-http",
|
|
26
24
|
"opentelemetry-sdk",
|
|
27
25
|
"numpy",
|
|
@@ -29,8 +27,33 @@ dependencies = [
|
|
|
29
27
|
"orjson",
|
|
30
28
|
"pyarrow",
|
|
31
29
|
"msgspec>=0.20.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
video = [
|
|
34
|
+
"av",
|
|
35
|
+
]
|
|
36
|
+
robotics = [
|
|
37
|
+
"macrodata-refiner[video]",
|
|
38
|
+
"huggingface-hub>=1.4.1",
|
|
32
39
|
"hf>=1.7.1",
|
|
33
40
|
]
|
|
41
|
+
text = [
|
|
42
|
+
"warcio",
|
|
43
|
+
]
|
|
44
|
+
s3 = [
|
|
45
|
+
"s3fs",
|
|
46
|
+
]
|
|
47
|
+
testing = [
|
|
48
|
+
"macrodata-refiner[robotics]",
|
|
49
|
+
"macrodata-refiner[text]",
|
|
50
|
+
"macrodata-refiner[s3]",
|
|
51
|
+
"pytest>=8.0.0",
|
|
52
|
+
"pytest-cov>=5.0.0",
|
|
53
|
+
]
|
|
54
|
+
all = [
|
|
55
|
+
"macrodata-refiner[testing]",
|
|
56
|
+
]
|
|
34
57
|
|
|
35
58
|
[project.scripts]
|
|
36
59
|
macrodata = "refiner.cli.main:main"
|
|
@@ -47,9 +70,8 @@ refiner = ["py.typed"]
|
|
|
47
70
|
|
|
48
71
|
[dependency-groups]
|
|
49
72
|
dev = [
|
|
73
|
+
"macrodata-refiner[all]",
|
|
50
74
|
"pre-commit>=4.0.0",
|
|
51
|
-
"pytest>=8.0.0",
|
|
52
|
-
"pytest-cov>=5.0.0",
|
|
53
75
|
"ruff>=0.14.10",
|
|
54
76
|
"ty>=0.0.7",
|
|
55
77
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: macrodata-refiner
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
|
|
5
5
|
Author: Macrodata Labs
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -12,12 +12,10 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
12
12
|
Requires-Python: >=3.10
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist: av
|
|
16
15
|
Requires-Dist: cloudpickle==3.1.2
|
|
17
16
|
Requires-Dist: fsspec
|
|
18
17
|
Requires-Dist: httpx
|
|
19
18
|
Requires-Dist: loguru
|
|
20
|
-
Requires-Dist: huggingface-hub>=1.4.1
|
|
21
19
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http
|
|
22
20
|
Requires-Dist: opentelemetry-sdk
|
|
23
21
|
Requires-Dist: numpy
|
|
@@ -25,7 +23,24 @@ Requires-Dist: psutil
|
|
|
25
23
|
Requires-Dist: orjson
|
|
26
24
|
Requires-Dist: pyarrow
|
|
27
25
|
Requires-Dist: msgspec>=0.20.0
|
|
28
|
-
|
|
26
|
+
Provides-Extra: video
|
|
27
|
+
Requires-Dist: av; extra == "video"
|
|
28
|
+
Provides-Extra: robotics
|
|
29
|
+
Requires-Dist: macrodata-refiner[video]; extra == "robotics"
|
|
30
|
+
Requires-Dist: huggingface-hub>=1.4.1; extra == "robotics"
|
|
31
|
+
Requires-Dist: hf>=1.7.1; extra == "robotics"
|
|
32
|
+
Provides-Extra: text
|
|
33
|
+
Requires-Dist: warcio; extra == "text"
|
|
34
|
+
Provides-Extra: s3
|
|
35
|
+
Requires-Dist: s3fs; extra == "s3"
|
|
36
|
+
Provides-Extra: testing
|
|
37
|
+
Requires-Dist: macrodata-refiner[robotics]; extra == "testing"
|
|
38
|
+
Requires-Dist: macrodata-refiner[text]; extra == "testing"
|
|
39
|
+
Requires-Dist: macrodata-refiner[s3]; extra == "testing"
|
|
40
|
+
Requires-Dist: pytest>=8.0.0; extra == "testing"
|
|
41
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "testing"
|
|
42
|
+
Provides-Extra: all
|
|
43
|
+
Requires-Dist: macrodata-refiner[testing]; extra == "all"
|
|
29
44
|
Dynamic: license-file
|
|
30
45
|
|
|
31
46
|
<p align="center">
|
|
@@ -83,6 +98,8 @@ import refiner as mdr
|
|
|
83
98
|
)
|
|
84
99
|
```
|
|
85
100
|
|
|
101
|
+
Need cloud GPUs? See [Launchers](docs/launchers.md) for the GPU-specific cloud options.
|
|
102
|
+
|
|
86
103
|
### Local example
|
|
87
104
|
|
|
88
105
|
Launch a local pipeline:
|
{macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/macrodata_refiner.egg-info/SOURCES.txt
RENAMED
|
@@ -16,6 +16,7 @@ src/refiner/cli/ui.py
|
|
|
16
16
|
src/refiner/execution/__init__.py
|
|
17
17
|
src/refiner/execution/buffer.py
|
|
18
18
|
src/refiner/execution/engine.py
|
|
19
|
+
src/refiner/execution/asyncio/__init__.py
|
|
19
20
|
src/refiner/execution/asyncio/runtime.py
|
|
20
21
|
src/refiner/execution/asyncio/window.py
|
|
21
22
|
src/refiner/execution/operators/__init__.py
|
|
@@ -31,12 +32,6 @@ src/refiner/launchers/__init__.py
|
|
|
31
32
|
src/refiner/launchers/base.py
|
|
32
33
|
src/refiner/launchers/cloud.py
|
|
33
34
|
src/refiner/launchers/local.py
|
|
34
|
-
src/refiner/media/__init__.py
|
|
35
|
-
src/refiner/media/video/__init__.py
|
|
36
|
-
src/refiner/media/video/remux.py
|
|
37
|
-
src/refiner/media/video/transcode.py
|
|
38
|
-
src/refiner/media/video/types.py
|
|
39
|
-
src/refiner/media/video/writer.py
|
|
40
35
|
src/refiner/pipeline/__init__.py
|
|
41
36
|
src/refiner/pipeline/expressions.py
|
|
42
37
|
src/refiner/pipeline/pipeline.py
|
|
@@ -86,6 +81,15 @@ src/refiner/robotics/lerobot_format/metadata/info.py
|
|
|
86
81
|
src/refiner/robotics/lerobot_format/metadata/metadata.py
|
|
87
82
|
src/refiner/robotics/lerobot_format/metadata/stats.py
|
|
88
83
|
src/refiner/robotics/lerobot_format/metadata/tasks.py
|
|
84
|
+
src/refiner/text/__init__.py
|
|
85
|
+
src/refiner/text/commoncrawl.py
|
|
86
|
+
src/refiner/utils/__init__.py
|
|
87
|
+
src/refiner/utils/imports.py
|
|
88
|
+
src/refiner/video/__init__.py
|
|
89
|
+
src/refiner/video/remux.py
|
|
90
|
+
src/refiner/video/transcode.py
|
|
91
|
+
src/refiner/video/types.py
|
|
92
|
+
src/refiner/video/writer.py
|
|
89
93
|
src/refiner/worker/__init__.py
|
|
90
94
|
src/refiner/worker/context.py
|
|
91
95
|
src/refiner/worker/entrypoint.py
|
|
@@ -104,7 +108,10 @@ src/refiner/worker/metrics/context.py
|
|
|
104
108
|
src/refiner/worker/metrics/otel.py
|
|
105
109
|
src/refiner/worker/resources/__init__.py
|
|
106
110
|
src/refiner/worker/resources/cpu.py
|
|
111
|
+
src/refiner/worker/resources/gpu.py
|
|
107
112
|
src/refiner/worker/resources/memory.py
|
|
108
113
|
src/refiner/worker/resources/network.py
|
|
109
114
|
tests/test_cache.py
|
|
110
|
-
tests/test_expressions.py
|
|
115
|
+
tests/test_commoncrawl_text.py
|
|
116
|
+
tests/test_expressions.py
|
|
117
|
+
tests/test_optional_dependencies.py
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
cloudpickle==3.1.2
|
|
2
|
+
fsspec
|
|
3
|
+
httpx
|
|
4
|
+
loguru
|
|
5
|
+
opentelemetry-exporter-otlp-proto-http
|
|
6
|
+
opentelemetry-sdk
|
|
7
|
+
numpy
|
|
8
|
+
psutil
|
|
9
|
+
orjson
|
|
10
|
+
pyarrow
|
|
11
|
+
msgspec>=0.20.0
|
|
12
|
+
|
|
13
|
+
[all]
|
|
14
|
+
macrodata-refiner[testing]
|
|
15
|
+
|
|
16
|
+
[robotics]
|
|
17
|
+
macrodata-refiner[video]
|
|
18
|
+
huggingface-hub>=1.4.1
|
|
19
|
+
hf>=1.7.1
|
|
20
|
+
|
|
21
|
+
[s3]
|
|
22
|
+
s3fs
|
|
23
|
+
|
|
24
|
+
[testing]
|
|
25
|
+
macrodata-refiner[robotics]
|
|
26
|
+
macrodata-refiner[text]
|
|
27
|
+
macrodata-refiner[s3]
|
|
28
|
+
pytest>=8.0.0
|
|
29
|
+
pytest-cov>=5.0.0
|
|
30
|
+
|
|
31
|
+
[text]
|
|
32
|
+
warcio
|
|
33
|
+
|
|
34
|
+
[video]
|
|
35
|
+
av
|
|
@@ -1,11 +1,9 @@
|
|
|
1
|
+
import refiner.io as io
|
|
2
|
+
import refiner.pipeline as pipeline
|
|
1
3
|
import refiner.robotics as robotics
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
from refiner.media import VideoFile
|
|
4
|
+
import refiner.text as text
|
|
5
|
+
import refiner.video as video
|
|
5
6
|
from refiner.pipeline import (
|
|
6
|
-
RefinerPipeline,
|
|
7
|
-
Row,
|
|
8
|
-
Shard,
|
|
9
7
|
from_items,
|
|
10
8
|
from_source,
|
|
11
9
|
read_csv,
|
|
@@ -22,19 +20,11 @@ from refiner.worker.metrics.api import (
|
|
|
22
20
|
log_throughput,
|
|
23
21
|
register_gauge,
|
|
24
22
|
)
|
|
25
|
-
|
|
23
|
+
|
|
24
|
+
robot = robotics
|
|
26
25
|
|
|
27
26
|
__all__ = [
|
|
28
|
-
|
|
29
|
-
"LocalLauncher",
|
|
30
|
-
"LaunchStats",
|
|
31
|
-
"DataFile",
|
|
32
|
-
"DataFolder",
|
|
33
|
-
"DataFileSet",
|
|
34
|
-
"Shard",
|
|
35
|
-
"Row",
|
|
36
|
-
"Worker",
|
|
37
|
-
"WorkerRunStats",
|
|
27
|
+
# sources
|
|
38
28
|
"read_csv",
|
|
39
29
|
"read_jsonl",
|
|
40
30
|
"read_lerobot",
|
|
@@ -42,16 +32,22 @@ __all__ = [
|
|
|
42
32
|
"from_items",
|
|
43
33
|
"from_source",
|
|
44
34
|
"task",
|
|
35
|
+
# metrics
|
|
45
36
|
"log_throughput",
|
|
46
37
|
"log_gauge",
|
|
47
38
|
"log_gauges",
|
|
48
|
-
"register_gauge",
|
|
49
39
|
"log_histogram",
|
|
40
|
+
"register_gauge",
|
|
41
|
+
# expressions
|
|
50
42
|
"col",
|
|
51
43
|
"lit",
|
|
52
44
|
"coalesce",
|
|
53
45
|
"if_else",
|
|
54
|
-
|
|
55
|
-
"
|
|
46
|
+
# submodules
|
|
47
|
+
"io",
|
|
48
|
+
"pipeline",
|
|
49
|
+
"video",
|
|
50
|
+
"robot",
|
|
56
51
|
"robotics",
|
|
52
|
+
"text",
|
|
57
53
|
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
{macrodata_refiner-0.2.1 → macrodata_refiner-0.2.2}/src/refiner/execution/operators/vectorized.py
RENAMED
|
@@ -10,7 +10,7 @@ from refiner.execution.tracking.shards import (
|
|
|
10
10
|
count_table_by_shard,
|
|
11
11
|
counts_delta,
|
|
12
12
|
)
|
|
13
|
-
from refiner.pipeline.data.tabular import repeat_scalar
|
|
13
|
+
from refiner.pipeline.data.tabular import filter_table, repeat_scalar
|
|
14
14
|
from refiner.pipeline.expressions import eval_expr_arrow
|
|
15
15
|
from refiner.pipeline.steps import (
|
|
16
16
|
CastStep,
|
|
@@ -69,14 +69,7 @@ def apply_vectorized_op(
|
|
|
69
69
|
return out, None
|
|
70
70
|
|
|
71
71
|
if isinstance(op, FilterExprStep):
|
|
72
|
-
|
|
73
|
-
next_table = (
|
|
74
|
-
table
|
|
75
|
-
if isinstance(mask, pa.Scalar) and bool(mask.as_py())
|
|
76
|
-
else (
|
|
77
|
-
table.slice(0, 0) if isinstance(mask, pa.Scalar) else table.filter(mask)
|
|
78
|
-
)
|
|
79
|
-
)
|
|
72
|
+
next_table = filter_table(table, op.predicate)
|
|
80
73
|
next_shard_counts = count_table_by_shard(next_table)
|
|
81
74
|
for shard_id in set(shard_counts) | set(next_shard_counts):
|
|
82
75
|
previous = int(shard_counts.get(shard_id, 0))
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from collections.abc import Iterable, Mapping
|
|
1
|
+
from collections.abc import Iterable, Iterator, Mapping
|
|
2
2
|
from os import PathLike
|
|
3
3
|
from typing import IO, Any, TypeAlias, Union, cast
|
|
4
4
|
|
|
@@ -113,6 +113,34 @@ class DataFolder(DirFileSystem):
|
|
|
113
113
|
return self.abs_path(paths)
|
|
114
114
|
return [self.abs_path(p) for p in paths]
|
|
115
115
|
|
|
116
|
+
def find(self, path: str, *args, **kwargs):
|
|
117
|
+
# Avoid DirFileSystem.find(): some backends (notably HF buckets) can leak
|
|
118
|
+
# sibling prefix matches like `root-2/...` or return the bare root entry,
|
|
119
|
+
# and DirFileSystem._relpath() asserts before we can filter them out.
|
|
120
|
+
"""List paths under this folder, skipping backend results outside the base path."""
|
|
121
|
+
detail = kwargs.get("detail", False)
|
|
122
|
+
target = self._join(path.rstrip("/"))
|
|
123
|
+
ret = self.fs.find(target, *args, **kwargs)
|
|
124
|
+
target = target.rstrip("/")
|
|
125
|
+
target_prefix = target + self.fs.sep
|
|
126
|
+
alt_target = target[1:] if target.startswith(self.fs.sep) else None
|
|
127
|
+
alt_prefix = alt_target + self.fs.sep if alt_target is not None else None
|
|
128
|
+
|
|
129
|
+
def rel(p: str) -> str | None:
|
|
130
|
+
if p == target or (alt_target is not None and p == alt_target):
|
|
131
|
+
return path.rstrip("/")
|
|
132
|
+
if p.startswith(target_prefix):
|
|
133
|
+
suffix = p[len(target_prefix) :]
|
|
134
|
+
elif alt_prefix is not None and p.startswith(alt_prefix):
|
|
135
|
+
suffix = p[len(alt_prefix) :]
|
|
136
|
+
else:
|
|
137
|
+
return None
|
|
138
|
+
return suffix if path in {"", "/"} else f"{path.rstrip('/')}/{suffix}"
|
|
139
|
+
|
|
140
|
+
if detail:
|
|
141
|
+
return {r: info for p, info in ret.items() if (r := rel(p)) is not None}
|
|
142
|
+
return [r for p in ret if (r := rel(p)) is not None]
|
|
143
|
+
|
|
116
144
|
def open_files(
|
|
117
145
|
self, paths: Iterable[str], mode: str = "rb", **kwargs
|
|
118
146
|
) -> list[IO[Any]]:
|
|
@@ -159,3 +187,29 @@ class DataFolder(DirFileSystem):
|
|
|
159
187
|
|
|
160
188
|
def files(self, relpaths: Iterable[str]) -> list[DataFile]:
|
|
161
189
|
return [self.file(p) for p in relpaths]
|
|
190
|
+
|
|
191
|
+
def iter_files_with_sizes(
|
|
192
|
+
self, *, recursive: bool = False, **kwargs: Any
|
|
193
|
+
) -> Iterator[tuple[DataFile, int | None]]:
|
|
194
|
+
if recursive:
|
|
195
|
+
found = self.find("", detail=True, **kwargs)
|
|
196
|
+
items: Iterable[tuple[str, Mapping[str, Any]]] = found.items()
|
|
197
|
+
else:
|
|
198
|
+
items = (
|
|
199
|
+
(str(info["name"]), info)
|
|
200
|
+
for info in self.ls("", detail=True, **kwargs)
|
|
201
|
+
if isinstance(info, Mapping)
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
for relpath, info in sorted(items, key=lambda item: item[0]):
|
|
205
|
+
info_dict = dict(info)
|
|
206
|
+
if info_dict.get("type") != "file":
|
|
207
|
+
continue
|
|
208
|
+
size = info_dict.get("size")
|
|
209
|
+
yield self.file(relpath), size if isinstance(size, int) else None
|
|
210
|
+
|
|
211
|
+
def iter_files(
|
|
212
|
+
self, *, recursive: bool = False, **kwargs: Any
|
|
213
|
+
) -> Iterator[DataFile]:
|
|
214
|
+
for file, _ in self.iter_files_with_sizes(recursive=recursive, **kwargs):
|
|
215
|
+
yield file
|
|
@@ -170,62 +170,77 @@ class DataFileSet:
|
|
|
170
170
|
exts = tuple(e.lower() for e in self.extensions)
|
|
171
171
|
seen: set[tuple[int, str]] = set()
|
|
172
172
|
expanded: list[tuple[DataFile, ...]] = []
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
173
|
+
sizes = dict(self._sizes)
|
|
174
|
+
|
|
175
|
+
def _append_file(
|
|
176
|
+
out: list[DataFile],
|
|
177
|
+
file: DataFile,
|
|
178
|
+
*,
|
|
179
|
+
size: int | None = None,
|
|
180
|
+
apply_extensions: bool = True,
|
|
181
|
+
) -> None:
|
|
182
|
+
if apply_extensions and exts and not file.path.lower().endswith(exts):
|
|
176
183
|
return
|
|
177
184
|
key = (id(file.fs), file.path)
|
|
178
185
|
if key in seen:
|
|
179
186
|
return
|
|
180
187
|
seen.add(key)
|
|
181
188
|
out.append(file)
|
|
189
|
+
if size is not None:
|
|
190
|
+
sizes[(len(expanded), file.abs_path())] = int(size)
|
|
182
191
|
|
|
183
192
|
for entry in self.entries:
|
|
184
193
|
files: list[DataFile] = []
|
|
194
|
+
if isinstance(entry, _PathSource) and not glob.has_magic(entry.path):
|
|
195
|
+
try:
|
|
196
|
+
info = entry.fs.info(entry.path)
|
|
197
|
+
except FileNotFoundError:
|
|
198
|
+
raise FileNotFoundError(
|
|
199
|
+
f"Could not resolve input: {entry.fs.unstrip_protocol(entry.path)!r}"
|
|
200
|
+
)
|
|
201
|
+
item_type = info.get("type")
|
|
202
|
+
if item_type == "directory":
|
|
203
|
+
entry = DataFolder(path=entry.path, fs=entry.fs)
|
|
204
|
+
elif item_type == "file":
|
|
205
|
+
entry = DataFile(fs=entry.fs, path=entry.path)
|
|
206
|
+
else:
|
|
207
|
+
raise TypeError(
|
|
208
|
+
f"Unsupported file type {item_type!r} for input: "
|
|
209
|
+
f"{entry.fs.unstrip_protocol(entry.path)!r}"
|
|
210
|
+
)
|
|
211
|
+
|
|
185
212
|
if isinstance(entry, DataFile):
|
|
186
|
-
_append_file(files, entry)
|
|
213
|
+
_append_file(files, entry, apply_extensions=False)
|
|
187
214
|
elif isinstance(entry, DataFolder):
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
if self.recursive
|
|
191
|
-
else sorted(
|
|
192
|
-
e["name"] if isinstance(e, dict) else e
|
|
193
|
-
for e in entry.ls("", detail=True)
|
|
194
|
-
if not isinstance(e, dict) or e.get("type") == "file"
|
|
195
|
-
)
|
|
196
|
-
)
|
|
197
|
-
for path in paths:
|
|
198
|
-
_append_file(files, entry.file(path))
|
|
215
|
+
for file, size in entry.iter_files_with_sizes(recursive=self.recursive):
|
|
216
|
+
_append_file(files, file, size=size)
|
|
199
217
|
else:
|
|
200
218
|
next_fs, path = entry.fs, entry.path
|
|
201
219
|
if glob.has_magic(path):
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
220
|
+
matched = next_fs.glob(path, detail=True)
|
|
221
|
+
items = matched.items()
|
|
222
|
+
for expanded_path, info in sorted(items):
|
|
223
|
+
if not isinstance(expanded_path, str) or not isinstance(
|
|
224
|
+
info, Mapping
|
|
225
|
+
):
|
|
226
|
+
continue
|
|
227
|
+
if info.get("type") != "file":
|
|
228
|
+
continue
|
|
229
|
+
size = info.get("size")
|
|
230
|
+
_append_file(
|
|
231
|
+
files,
|
|
232
|
+
DataFile(fs=next_fs, path=expanded_path),
|
|
233
|
+
size=size if isinstance(size, int) else None,
|
|
214
234
|
)
|
|
215
|
-
for expanded_path in paths:
|
|
216
|
-
_append_file(
|
|
217
|
-
files, DataFile(fs=next_fs, path=expanded_path)
|
|
218
|
-
)
|
|
219
|
-
else:
|
|
220
|
-
_append_file(files, DataFile(fs=next_fs, path=path))
|
|
221
235
|
else:
|
|
222
|
-
raise
|
|
223
|
-
|
|
236
|
+
raise AssertionError(
|
|
237
|
+
"non-glob _PathSource should have been resolved"
|
|
224
238
|
)
|
|
225
239
|
expanded.append(tuple(files))
|
|
226
240
|
|
|
227
241
|
out = tuple(expanded)
|
|
228
242
|
object.__setattr__(self, "_expanded_sources", out)
|
|
243
|
+
object.__setattr__(self, "_sizes", sizes)
|
|
229
244
|
return out
|
|
230
245
|
|
|
231
246
|
def resolve_file(self, source_index: int, path: str) -> DataFile:
|
|
@@ -1,20 +1,22 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import replace
|
|
4
5
|
from typing import TYPE_CHECKING
|
|
5
|
-
from uuid import uuid4
|
|
6
6
|
import re
|
|
7
7
|
import time
|
|
8
|
+
from uuid import uuid4
|
|
8
9
|
|
|
9
10
|
from loguru import logger
|
|
10
11
|
|
|
11
12
|
from refiner.platform.auth import CredentialsError
|
|
12
13
|
from refiner.platform.client.api import MacrodataClient
|
|
13
|
-
from refiner.platform.client.http import sanitize_terminal_text
|
|
14
|
+
from refiner.platform.client.http import request_json, sanitize_terminal_text
|
|
14
15
|
from refiner.platform.manifest import build_run_manifest
|
|
15
16
|
from refiner.worker.context import RunHandle
|
|
16
17
|
from refiner.pipeline.planning import (
|
|
17
18
|
PlannedStage,
|
|
19
|
+
StageComputeRequirements,
|
|
18
20
|
compile_planned_stages,
|
|
19
21
|
plan_pipeline_stages,
|
|
20
22
|
)
|
|
@@ -34,6 +36,7 @@ class BaseLauncher(ABC):
|
|
|
34
36
|
num_workers: int | None = None,
|
|
35
37
|
heartbeat_interval_seconds: int | None = None,
|
|
36
38
|
cpus_per_worker: int | None = None,
|
|
39
|
+
gpus_per_worker: int | None = None,
|
|
37
40
|
):
|
|
38
41
|
if not name.strip():
|
|
39
42
|
raise ValueError("name must be non-empty")
|
|
@@ -41,6 +44,7 @@ class BaseLauncher(ABC):
|
|
|
41
44
|
self.name = name
|
|
42
45
|
self.job_id = job_id or self._build_local_job_id(name)
|
|
43
46
|
self.cpus_per_worker: int | None = None
|
|
47
|
+
self.gpus_per_worker: int | None = None
|
|
44
48
|
if num_workers is not None:
|
|
45
49
|
if num_workers <= 0:
|
|
46
50
|
raise ValueError("num_workers must be > 0")
|
|
@@ -53,6 +57,10 @@ class BaseLauncher(ABC):
|
|
|
53
57
|
if cpus_per_worker <= 0:
|
|
54
58
|
raise ValueError("cpus_per_worker must be > 0")
|
|
55
59
|
self.cpus_per_worker = cpus_per_worker
|
|
60
|
+
if gpus_per_worker is not None:
|
|
61
|
+
if gpus_per_worker <= 0:
|
|
62
|
+
raise ValueError("gpus_per_worker must be > 0")
|
|
63
|
+
self.gpus_per_worker = gpus_per_worker
|
|
56
64
|
|
|
57
65
|
@staticmethod
|
|
58
66
|
def _build_local_job_id(name: str) -> str:
|
|
@@ -81,6 +89,14 @@ class BaseLauncher(ABC):
|
|
|
81
89
|
try:
|
|
82
90
|
return MacrodataClient()
|
|
83
91
|
except CredentialsError:
|
|
92
|
+
try:
|
|
93
|
+
request_json(
|
|
94
|
+
method="GET",
|
|
95
|
+
path="/api/me",
|
|
96
|
+
timeout_s=2.0,
|
|
97
|
+
)
|
|
98
|
+
except Exception:
|
|
99
|
+
pass
|
|
84
100
|
self._warn(
|
|
85
101
|
"platform integration disabled: no API key found in "
|
|
86
102
|
"MACRODATA_API_KEY or local credentials. "
|
|
@@ -113,8 +129,22 @@ class BaseLauncher(ABC):
|
|
|
113
129
|
*,
|
|
114
130
|
secret_values: tuple[str, ...] = (),
|
|
115
131
|
) -> dict[str, object]:
|
|
132
|
+
resolved_stages = [
|
|
133
|
+
replace(stage, compute=self._stage_compute_requirements(stage.compute))
|
|
134
|
+
for stage in (stages or self._planned_stages())
|
|
135
|
+
]
|
|
116
136
|
return compile_planned_stages(
|
|
117
|
-
|
|
137
|
+
resolved_stages,
|
|
138
|
+
secret_values=secret_values,
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
def _stage_compute_requirements(
|
|
142
|
+
self, compute: StageComputeRequirements
|
|
143
|
+
) -> StageComputeRequirements:
|
|
144
|
+
return replace(
|
|
145
|
+
compute,
|
|
146
|
+
cpus_per_worker=self.cpus_per_worker,
|
|
147
|
+
gpus_per_worker=self.gpus_per_worker,
|
|
118
148
|
)
|
|
119
149
|
|
|
120
150
|
def _run_manifest(
|