macrodata-refiner 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {macrodata_refiner-0.2.0/src/macrodata_refiner.egg-info → macrodata_refiner-0.2.1}/PKG-INFO +1 -1
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/pyproject.toml +1 -1
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1/src/macrodata_refiner.egg-info}/PKG-INFO +1 -1
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/cli/auth.py +2 -2
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/cli/ui.py +9 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/operators/row.py +10 -9
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/operators/vectorized.py +3 -1
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/launchers/cloud.py +48 -2
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/planning.py +11 -4
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/lerobot.py +4 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/manifest.py +36 -35
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/runner.py +29 -18
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/LICENSE +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/README.md +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/setup.cfg +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/SOURCES.txt +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/dependency_links.txt +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/entry_points.txt +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/requires.txt +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/top_level.txt +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/cli/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/cli/main.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/asyncio/runtime.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/asyncio/window.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/buffer.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/engine.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/operators/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/tracking/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/tracking/shards.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/io/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/io/datafile.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/io/datafolder.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/io/fileset.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/launchers/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/launchers/base.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/launchers/local.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/video/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/video/remux.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/video/transcode.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/video/types.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/video/writer.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/data/block.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/data/row.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/data/shard.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/data/tabular.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/expressions.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/pipeline.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/base.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/jsonl.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/lerobot.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/lerobot_reducer.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/parquet.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/base.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/items.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/base.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/csv.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/jsonl.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/parquet.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/utils.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/task.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/steps.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/decoder_cache.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/file_cache.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/lease_cache.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/auth.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/api.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/http.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/models.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/serialize.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/py.typed +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/metadata/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/metadata/info.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/metadata/metadata.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/metadata/stats.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/metadata/tasks.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/row.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/tabular.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/motion.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/context.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/entrypoint.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/base.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/claim.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/files.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/lifecycle.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/platform.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/metrics/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/metrics/api.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/metrics/context.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/metrics/otel.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/resources/__init__.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/resources/cpu.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/resources/memory.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/resources/network.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/workdir.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/tests/test_cache.py +0 -0
- {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/tests/test_expressions.py +0 -0
|
@@ -18,7 +18,7 @@ from refiner.platform.client import (
|
|
|
18
18
|
sanitize_terminal_text,
|
|
19
19
|
verify_api_key,
|
|
20
20
|
)
|
|
21
|
-
from refiner.cli.ui import display_identity, print_banner
|
|
21
|
+
from refiner.cli.ui import display_identity, print_banner, stdin_is_interactive
|
|
22
22
|
|
|
23
23
|
_TOKEN_SETTINGS_SUFFIX = "/settings/api-keys"
|
|
24
24
|
|
|
@@ -31,7 +31,7 @@ def _read_token(args: argparse.Namespace) -> str:
|
|
|
31
31
|
if args.token and args.token.strip():
|
|
32
32
|
return args.token.strip()
|
|
33
33
|
|
|
34
|
-
read_from_stdin = args.token_stdin or not
|
|
34
|
+
read_from_stdin = args.token_stdin or not stdin_is_interactive()
|
|
35
35
|
if read_from_stdin:
|
|
36
36
|
token = sys.stdin.read().strip()
|
|
37
37
|
if token:
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import sys
|
|
4
|
+
|
|
3
5
|
from refiner.platform.client import UserIdentity
|
|
4
6
|
|
|
5
7
|
ASCII_BANNER = r"""
|
|
@@ -26,3 +28,10 @@ def display_identity(user: UserIdentity) -> str:
|
|
|
26
28
|
if email:
|
|
27
29
|
return f"{label} ({email})"
|
|
28
30
|
return label
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def stdin_is_interactive() -> bool:
|
|
34
|
+
try:
|
|
35
|
+
return sys.stdin.isatty()
|
|
36
|
+
except Exception: # pragma: no cover
|
|
37
|
+
return False
|
|
@@ -58,15 +58,16 @@ def execute_row_steps(
|
|
|
58
58
|
)
|
|
59
59
|
|
|
60
60
|
async def _run_async_step(*, step: AsyncRowStep, row: Row) -> Row:
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
61
|
+
with set_active_step_index(step.index):
|
|
62
|
+
result = step.apply_row_async(row)
|
|
63
|
+
if inspect.isawaitable(result):
|
|
64
|
+
result = await result
|
|
65
|
+
result = cast(MapResult, result)
|
|
66
|
+
if isinstance(result, Row):
|
|
67
|
+
return result
|
|
68
|
+
if isinstance(result, dict):
|
|
69
|
+
return row.update(result)
|
|
70
|
+
raise TypeError(f"Unsupported map_async() result type: {type(result)!r}")
|
|
70
71
|
|
|
71
72
|
def _run_step(i: int, *, flush_all: bool) -> None:
|
|
72
73
|
step = ordered[i]
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/operators/vectorized.py
RENAMED
|
@@ -22,6 +22,7 @@ from refiner.pipeline.steps import (
|
|
|
22
22
|
VectorizedOp,
|
|
23
23
|
WithColumnsStep,
|
|
24
24
|
)
|
|
25
|
+
from refiner.worker.context import set_active_step_index
|
|
25
26
|
from refiner.worker.metrics.api import log_throughput
|
|
26
27
|
|
|
27
28
|
|
|
@@ -100,7 +101,8 @@ def apply_vectorized_op(
|
|
|
100
101
|
return next_table, next_shard_counts
|
|
101
102
|
|
|
102
103
|
if isinstance(op, FnTableStep):
|
|
103
|
-
|
|
104
|
+
with set_active_step_index(op.index):
|
|
105
|
+
next_table = op.fn(table)
|
|
104
106
|
if not isinstance(next_table, pa.Table):
|
|
105
107
|
raise TypeError(
|
|
106
108
|
f"map_table() must return pa.Table, got {type(next_table)!r}"
|
|
@@ -2,14 +2,16 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
4
|
from dataclasses import dataclass
|
|
5
|
-
from typing import TYPE_CHECKING
|
|
5
|
+
from typing import TYPE_CHECKING, cast
|
|
6
6
|
|
|
7
|
+
from refiner.cli.ui import stdin_is_interactive
|
|
7
8
|
from refiner.platform.client import (
|
|
8
9
|
CloudRunCreateRequest,
|
|
9
10
|
CloudRuntimeConfig,
|
|
10
11
|
StagePayload,
|
|
11
12
|
serialize_pipeline_inline,
|
|
12
13
|
)
|
|
14
|
+
from refiner.platform.manifest import refiner_ref_exists_on_remote
|
|
13
15
|
|
|
14
16
|
from refiner.launchers.base import BaseLauncher
|
|
15
17
|
|
|
@@ -17,6 +19,9 @@ if TYPE_CHECKING:
|
|
|
17
19
|
from refiner.pipeline import RefinerPipeline
|
|
18
20
|
|
|
19
21
|
|
|
22
|
+
_FALLBACK_ENV_VAR = "MACRODATA_FALLBACK_TO_LATEST_PYPI"
|
|
23
|
+
|
|
24
|
+
|
|
20
25
|
@dataclass(frozen=True, slots=True)
|
|
21
26
|
class CloudLaunchResult:
|
|
22
27
|
job_id: str
|
|
@@ -96,6 +101,46 @@ class CloudLauncher(BaseLauncher):
|
|
|
96
101
|
)
|
|
97
102
|
return {**(secrets or {}), **(env or {})} or None
|
|
98
103
|
|
|
104
|
+
@staticmethod
|
|
105
|
+
def _fallback_to_latest_pypi_enabled() -> bool:
|
|
106
|
+
raw = os.environ.get(_FALLBACK_ENV_VAR, "")
|
|
107
|
+
return raw.strip().lower() in {"1", "true", "yes", "on"}
|
|
108
|
+
|
|
109
|
+
def _resolve_cloud_manifest(
|
|
110
|
+
self, *, secret_values: tuple[str, ...]
|
|
111
|
+
) -> dict[str, object]:
|
|
112
|
+
manifest = self._run_manifest(secret_values=secret_values)
|
|
113
|
+
environment = manifest.get("environment")
|
|
114
|
+
if environment is None:
|
|
115
|
+
return manifest
|
|
116
|
+
environment_dict = cast(dict[str, object], environment)
|
|
117
|
+
refiner_ref = environment_dict.get("refiner_ref")
|
|
118
|
+
if not isinstance(refiner_ref, str) or not refiner_ref.strip():
|
|
119
|
+
return manifest
|
|
120
|
+
refiner_ref = refiner_ref.strip()
|
|
121
|
+
if refiner_ref_exists_on_remote(refiner_ref):
|
|
122
|
+
return manifest
|
|
123
|
+
|
|
124
|
+
message = (
|
|
125
|
+
f"Refiner ref {refiner_ref!r} is not available on GitHub. "
|
|
126
|
+
"Launch with the latest PyPI version instead?"
|
|
127
|
+
)
|
|
128
|
+
fallback_allowed = self._fallback_to_latest_pypi_enabled()
|
|
129
|
+
interactive = stdin_is_interactive()
|
|
130
|
+
if not fallback_allowed and interactive:
|
|
131
|
+
answer = input(f"{message} [y/N] ")
|
|
132
|
+
fallback_allowed = answer.strip().lower() in {"y", "yes"}
|
|
133
|
+
if fallback_allowed:
|
|
134
|
+
environment_dict["refiner_ref"] = None
|
|
135
|
+
return manifest
|
|
136
|
+
if interactive:
|
|
137
|
+
raise SystemExit("cloud launch aborted")
|
|
138
|
+
|
|
139
|
+
raise SystemExit(
|
|
140
|
+
f"{message} Launch aborted before submission. "
|
|
141
|
+
f"Set {_FALLBACK_ENV_VAR}=1 to allow fallback to the latest PyPI version."
|
|
142
|
+
)
|
|
143
|
+
|
|
99
144
|
def launch(self) -> CloudLaunchResult:
|
|
100
145
|
try:
|
|
101
146
|
client = self._require_platform_client()
|
|
@@ -105,6 +150,7 @@ class CloudLauncher(BaseLauncher):
|
|
|
105
150
|
resolved_env = self._resolve_env_values(self.env)
|
|
106
151
|
secret_values = tuple(resolved_secrets.values()) if resolved_secrets else ()
|
|
107
152
|
stages = self._planned_stages()
|
|
153
|
+
manifest = self._resolve_cloud_manifest(secret_values=secret_values)
|
|
108
154
|
request = CloudRunCreateRequest(
|
|
109
155
|
name=self.name,
|
|
110
156
|
plan=self._compiled_plan(stages, secret_values=secret_values),
|
|
@@ -121,7 +167,7 @@ class CloudLauncher(BaseLauncher):
|
|
|
121
167
|
)
|
|
122
168
|
for stage in stages
|
|
123
169
|
],
|
|
124
|
-
manifest=
|
|
170
|
+
manifest=manifest,
|
|
125
171
|
sync_local_dependencies=self.sync_local_dependencies,
|
|
126
172
|
secrets=self._merged_env(resolved_secrets, resolved_env),
|
|
127
173
|
)
|
|
@@ -313,6 +313,14 @@ def _step_payload(
|
|
|
313
313
|
return payload
|
|
314
314
|
|
|
315
315
|
|
|
316
|
+
def _sink_name_type(sink: Any) -> tuple[str, str, dict[str, Any] | None]:
|
|
317
|
+
payload = sink.describe()
|
|
318
|
+
if payload is not None:
|
|
319
|
+
return payload
|
|
320
|
+
sink_name = sink.__class__.__name__.replace("Sink", "").lower()
|
|
321
|
+
return sink_name or "sink", "writer", None
|
|
322
|
+
|
|
323
|
+
|
|
316
324
|
def _serialize_args(
|
|
317
325
|
args: dict[str, Any] | None, *, secret_values: tuple[str, ...] = ()
|
|
318
326
|
) -> dict[str, Any] | None:
|
|
@@ -389,16 +397,15 @@ def _compile_stage_steps(
|
|
|
389
397
|
)
|
|
390
398
|
)
|
|
391
399
|
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
base_name, step_type, args = sink_payload
|
|
400
|
+
if pipeline.sink is not None:
|
|
401
|
+
base_name, step_type, args = _sink_name_type(pipeline.sink)
|
|
395
402
|
unique_name = _unique_name(base_name)
|
|
396
403
|
steps.append(
|
|
397
404
|
_step_payload(
|
|
398
405
|
name=unique_name,
|
|
399
406
|
step_type=step_type,
|
|
400
407
|
index=len(steps),
|
|
401
|
-
args=_serialize_args(args),
|
|
408
|
+
args=_serialize_args(args, secret_values=secret_values),
|
|
402
409
|
)
|
|
403
410
|
)
|
|
404
411
|
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/lerobot.py
RENAMED
|
@@ -81,6 +81,10 @@ class LeRobotEpisodeReader(ParquetReader):
|
|
|
81
81
|
split_row_groups=split_row_groups,
|
|
82
82
|
)
|
|
83
83
|
|
|
84
|
+
def describe(self) -> dict[str, Any]:
|
|
85
|
+
inputs = [str(root.abs_paths("")) for root in self.roots]
|
|
86
|
+
return {"path": ", ".join(inputs), "inputs": inputs}
|
|
87
|
+
|
|
84
88
|
def read_shard(self, shard: Shard) -> Iterator[SourceUnit]:
|
|
85
89
|
"""Read one planned episode shard and emit `LeRobotTabular` blocks."""
|
|
86
90
|
descriptor = shard.descriptor
|
|
@@ -6,8 +6,9 @@ import json
|
|
|
6
6
|
import platform
|
|
7
7
|
import subprocess
|
|
8
8
|
import sys
|
|
9
|
+
from urllib import error as urllib_error
|
|
10
|
+
from urllib import request as urllib_request
|
|
9
11
|
from collections.abc import Sequence
|
|
10
|
-
from functools import lru_cache
|
|
11
12
|
from importlib import metadata as importlib_metadata
|
|
12
13
|
from pathlib import Path
|
|
13
14
|
from typing import Any
|
|
@@ -90,36 +91,32 @@ def _collect_dependencies() -> list[dict[str, str]]:
|
|
|
90
91
|
|
|
91
92
|
|
|
92
93
|
def _resolve_installed_version() -> str | None:
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
if version:
|
|
99
|
-
return version
|
|
100
|
-
return None
|
|
94
|
+
try:
|
|
95
|
+
version = importlib_metadata.version("macrodata-refiner").strip()
|
|
96
|
+
except importlib_metadata.PackageNotFoundError:
|
|
97
|
+
return None
|
|
98
|
+
return version or None
|
|
101
99
|
|
|
102
100
|
|
|
103
101
|
def _resolve_direct_url_git_sha() -> str | None:
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
return None
|
|
102
|
+
try:
|
|
103
|
+
dist = importlib_metadata.distribution("macrodata-refiner")
|
|
104
|
+
except importlib_metadata.PackageNotFoundError:
|
|
105
|
+
return None
|
|
106
|
+
raw = dist.read_text("direct_url.json")
|
|
107
|
+
if not raw:
|
|
108
|
+
return None
|
|
109
|
+
try:
|
|
110
|
+
data = json.loads(raw)
|
|
111
|
+
except json.JSONDecodeError:
|
|
112
|
+
return None
|
|
113
|
+
if not isinstance(data, dict):
|
|
114
|
+
return None
|
|
115
|
+
vcs_info = data.get("vcs_info")
|
|
116
|
+
if not isinstance(vcs_info, dict):
|
|
117
|
+
return None
|
|
118
|
+
commit = str(vcs_info.get("commit_id", "")).strip()
|
|
119
|
+
return commit or None
|
|
123
120
|
|
|
124
121
|
|
|
125
122
|
def _resolve_repo_root(start: Path) -> Path | None:
|
|
@@ -148,19 +145,22 @@ def _resolve_local_repo_git_sha() -> str | None:
|
|
|
148
145
|
return sha or None
|
|
149
146
|
|
|
150
147
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
_resolve_direct_url_git_sha()
|
|
155
|
-
or _resolve_local_repo_git_sha()
|
|
156
|
-
or _resolve_installed_version()
|
|
148
|
+
def refiner_ref_exists_on_remote(ref: str) -> bool:
|
|
149
|
+
request = urllib_request.Request(
|
|
150
|
+
f"https://api.github.com/repos/macrodata-labs/refiner/commits/{ref}"
|
|
157
151
|
)
|
|
152
|
+
try:
|
|
153
|
+
with urllib_request.urlopen(request):
|
|
154
|
+
return True
|
|
155
|
+
except (urllib_error.HTTPError, urllib_error.URLError):
|
|
156
|
+
return False
|
|
158
157
|
|
|
159
158
|
|
|
160
159
|
def build_run_manifest(*, secret_values: Sequence[str] = ()) -> dict[str, Any]:
|
|
161
160
|
script_path = _detect_script_path()
|
|
162
161
|
path, text, sha256 = _read_script(script_path)
|
|
163
|
-
|
|
162
|
+
refiner_version = _resolve_installed_version()
|
|
163
|
+
refiner_ref = _resolve_direct_url_git_sha() or _resolve_local_repo_git_sha()
|
|
164
164
|
|
|
165
165
|
manifest: dict[str, Any] = {
|
|
166
166
|
"version": 1,
|
|
@@ -173,6 +173,7 @@ def build_run_manifest(*, secret_values: Sequence[str] = ()) -> dict[str, Any]:
|
|
|
173
173
|
},
|
|
174
174
|
"environment": {
|
|
175
175
|
"python_version": platform.python_version(),
|
|
176
|
+
"refiner_version": refiner_version,
|
|
176
177
|
"refiner_ref": refiner_ref,
|
|
177
178
|
"platform": f"{platform.system().lower()}-{platform.machine().lower()}",
|
|
178
179
|
},
|
|
@@ -135,6 +135,9 @@ class Worker:
|
|
|
135
135
|
runtime_name,
|
|
136
136
|
)
|
|
137
137
|
sink = self.pipeline.sink or NullSink()
|
|
138
|
+
sink_step_index = (
|
|
139
|
+
self.pipeline._next_step_index() if self.pipeline.sink is not None else None
|
|
140
|
+
)
|
|
138
141
|
|
|
139
142
|
def _heartbeat_once() -> None:
|
|
140
143
|
with inflight_lock:
|
|
@@ -166,7 +169,8 @@ class Worker:
|
|
|
166
169
|
if shard is None:
|
|
167
170
|
return
|
|
168
171
|
try:
|
|
169
|
-
|
|
172
|
+
with set_active_step_index(sink_step_index):
|
|
173
|
+
sink.on_shard_complete(shard_id)
|
|
170
174
|
user_metrics_emitter.force_flush_user_metrics()
|
|
171
175
|
runtime_lifecycle.complete(shard)
|
|
172
176
|
except Exception: # noqa: BLE001
|
|
@@ -232,21 +236,26 @@ class Worker:
|
|
|
232
236
|
shard.start_key,
|
|
233
237
|
shard.end_key,
|
|
234
238
|
)
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
239
|
+
obs_logger.info(
|
|
240
|
+
"shard source started shard_id={} global_ordinal={}",
|
|
241
|
+
shard.id,
|
|
242
|
+
shard.global_ordinal,
|
|
243
|
+
)
|
|
244
|
+
source_iter = iter(self.pipeline.source.iter_shard_units(shard))
|
|
245
|
+
while True:
|
|
246
|
+
with set_active_step_index(0):
|
|
247
|
+
try:
|
|
248
|
+
unit = next(source_iter)
|
|
249
|
+
except StopIteration:
|
|
250
|
+
break
|
|
251
|
+
rows = block_num_rows(unit)
|
|
252
|
+
if rows > 0:
|
|
253
|
+
rows_read += rows
|
|
254
|
+
with inflight_lock:
|
|
255
|
+
pending_rows_by_shard[shard.id] = (
|
|
256
|
+
pending_rows_by_shard.get(shard.id, 0) + rows
|
|
257
|
+
)
|
|
258
|
+
yield unit
|
|
250
259
|
obs_logger.info(
|
|
251
260
|
"shard source finished shard_id={} global_ordinal={} rows_read={}",
|
|
252
261
|
shard.id,
|
|
@@ -274,7 +283,8 @@ class Worker:
|
|
|
274
283
|
):
|
|
275
284
|
if heartbeat_error is not None:
|
|
276
285
|
raise RuntimeError(f"heartbeat failed: {heartbeat_error}")
|
|
277
|
-
|
|
286
|
+
with set_active_step_index(sink_step_index):
|
|
287
|
+
written = sink.write_block(block)
|
|
278
288
|
_apply_row_delta(
|
|
279
289
|
{
|
|
280
290
|
shard_id: -count
|
|
@@ -334,7 +344,8 @@ class Worker:
|
|
|
334
344
|
stop_heartbeat.set()
|
|
335
345
|
heartbeat_thread.join(timeout=1.0)
|
|
336
346
|
try:
|
|
337
|
-
|
|
347
|
+
with set_active_step_index(sink_step_index):
|
|
348
|
+
sink.close()
|
|
338
349
|
except Exception as e:
|
|
339
350
|
if execution_error is not None or run_exception is not None:
|
|
340
351
|
obs_logger.warning(
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/requires.txt
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/asyncio/runtime.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/operators/__init__.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/tracking/__init__.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/tracking/shards.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/lerobot_reducer.py
RENAMED
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/__init__.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/base.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/csv.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/jsonl.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/parquet.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/file_cache.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/lease_cache.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/serialize.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/row.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/tabular.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/__init__.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/claim.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/files.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/lifecycle.py
RENAMED
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/platform.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/resources/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|