macrodata-refiner 0.2.0__tar.gz → 0.2.1__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (112)
  1. {macrodata_refiner-0.2.0/src/macrodata_refiner.egg-info → macrodata_refiner-0.2.1}/PKG-INFO +1 -1
  2. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/pyproject.toml +1 -1
  3. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1/src/macrodata_refiner.egg-info}/PKG-INFO +1 -1
  4. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/cli/auth.py +2 -2
  5. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/cli/ui.py +9 -0
  6. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/operators/row.py +10 -9
  7. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/operators/vectorized.py +3 -1
  8. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/launchers/cloud.py +48 -2
  9. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/planning.py +11 -4
  10. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/lerobot.py +4 -0
  11. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/manifest.py +36 -35
  12. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/runner.py +29 -18
  13. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/LICENSE +0 -0
  14. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/README.md +0 -0
  15. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/setup.cfg +0 -0
  16. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/SOURCES.txt +0 -0
  17. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/dependency_links.txt +0 -0
  18. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/entry_points.txt +0 -0
  19. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/requires.txt +0 -0
  20. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/macrodata_refiner.egg-info/top_level.txt +0 -0
  21. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/__init__.py +0 -0
  22. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/cli/__init__.py +0 -0
  23. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/cli/main.py +0 -0
  24. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/__init__.py +0 -0
  25. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/asyncio/runtime.py +0 -0
  26. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/asyncio/window.py +0 -0
  27. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/buffer.py +0 -0
  28. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/engine.py +0 -0
  29. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/operators/__init__.py +0 -0
  30. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/tracking/__init__.py +0 -0
  31. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/execution/tracking/shards.py +0 -0
  32. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/io/__init__.py +0 -0
  33. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/io/datafile.py +0 -0
  34. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/io/datafolder.py +0 -0
  35. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/io/fileset.py +0 -0
  36. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/launchers/__init__.py +0 -0
  37. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/launchers/base.py +0 -0
  38. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/launchers/local.py +0 -0
  39. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/__init__.py +0 -0
  40. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/video/__init__.py +0 -0
  41. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/video/remux.py +0 -0
  42. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/video/transcode.py +0 -0
  43. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/video/types.py +0 -0
  44. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/media/video/writer.py +0 -0
  45. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/__init__.py +0 -0
  46. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/data/block.py +0 -0
  47. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/data/row.py +0 -0
  48. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/data/shard.py +0 -0
  49. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/data/tabular.py +0 -0
  50. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/expressions.py +0 -0
  51. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/pipeline.py +0 -0
  52. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/__init__.py +0 -0
  53. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/base.py +0 -0
  54. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/jsonl.py +0 -0
  55. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/lerobot.py +0 -0
  56. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/lerobot_reducer.py +0 -0
  57. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sinks/parquet.py +0 -0
  58. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/__init__.py +0 -0
  59. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/base.py +0 -0
  60. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/items.py +0 -0
  61. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/__init__.py +0 -0
  62. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/base.py +0 -0
  63. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/csv.py +0 -0
  64. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/jsonl.py +0 -0
  65. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/parquet.py +0 -0
  66. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/readers/utils.py +0 -0
  67. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/sources/task.py +0 -0
  68. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/steps.py +0 -0
  69. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/__init__.py +0 -0
  70. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/__init__.py +0 -0
  71. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/decoder_cache.py +0 -0
  72. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/file_cache.py +0 -0
  73. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/pipeline/utils/cache/lease_cache.py +0 -0
  74. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/__init__.py +0 -0
  75. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/auth.py +0 -0
  76. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/__init__.py +0 -0
  77. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/api.py +0 -0
  78. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/http.py +0 -0
  79. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/models.py +0 -0
  80. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/platform/client/serialize.py +0 -0
  81. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/py.typed +0 -0
  82. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/__init__.py +0 -0
  83. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/__init__.py +0 -0
  84. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/metadata/__init__.py +0 -0
  85. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/metadata/info.py +0 -0
  86. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/metadata/metadata.py +0 -0
  87. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/metadata/stats.py +0 -0
  88. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/metadata/tasks.py +0 -0
  89. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/row.py +0 -0
  90. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/lerobot_format/tabular.py +0 -0
  91. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/robotics/motion.py +0 -0
  92. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/__init__.py +0 -0
  93. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/context.py +0 -0
  94. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/entrypoint.py +0 -0
  95. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/__init__.py +0 -0
  96. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/base.py +0 -0
  97. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/__init__.py +0 -0
  98. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/claim.py +0 -0
  99. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/files.py +0 -0
  100. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/local/lifecycle.py +0 -0
  101. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/lifecycle/platform.py +0 -0
  102. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/metrics/__init__.py +0 -0
  103. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/metrics/api.py +0 -0
  104. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/metrics/context.py +0 -0
  105. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/metrics/otel.py +0 -0
  106. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/resources/__init__.py +0 -0
  107. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/resources/cpu.py +0 -0
  108. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/resources/memory.py +0 -0
  109. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/resources/network.py +0 -0
  110. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/src/refiner/worker/workdir.py +0 -0
  111. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/tests/test_cache.py +0 -0
  112. {macrodata_refiner-0.2.0 → macrodata_refiner-0.2.1}/tests/test_expressions.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: macrodata-refiner
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
5
5
  Author: Macrodata Labs
6
6
  License-Expression: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "macrodata-refiner"
3
- version = "0.2.0"
3
+ version = "0.2.1"
4
4
  description = "Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets"
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: macrodata-refiner
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Refiner by Macrodata Labs, a data processing framework for Machine Learning large scale datasets
5
5
  Author: Macrodata Labs
6
6
  License-Expression: Apache-2.0
@@ -18,7 +18,7 @@ from refiner.platform.client import (
18
18
  sanitize_terminal_text,
19
19
  verify_api_key,
20
20
  )
21
- from refiner.cli.ui import display_identity, print_banner
21
+ from refiner.cli.ui import display_identity, print_banner, stdin_is_interactive
22
22
 
23
23
  _TOKEN_SETTINGS_SUFFIX = "/settings/api-keys"
24
24
 
@@ -31,7 +31,7 @@ def _read_token(args: argparse.Namespace) -> str:
31
31
  if args.token and args.token.strip():
32
32
  return args.token.strip()
33
33
 
34
- read_from_stdin = args.token_stdin or not sys.stdin.isatty()
34
+ read_from_stdin = args.token_stdin or not stdin_is_interactive()
35
35
  if read_from_stdin:
36
36
  token = sys.stdin.read().strip()
37
37
  if token:
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import sys
4
+
3
5
  from refiner.platform.client import UserIdentity
4
6
 
5
7
  ASCII_BANNER = r"""
@@ -26,3 +28,10 @@ def display_identity(user: UserIdentity) -> str:
26
28
  if email:
27
29
  return f"{label} ({email})"
28
30
  return label
31
+
32
+
33
+ def stdin_is_interactive() -> bool:
34
+ try:
35
+ return sys.stdin.isatty()
36
+ except Exception: # pragma: no cover
37
+ return False
@@ -58,15 +58,16 @@ def execute_row_steps(
58
58
  )
59
59
 
60
60
  async def _run_async_step(*, step: AsyncRowStep, row: Row) -> Row:
61
- result = step.apply_row_async(row)
62
- if inspect.isawaitable(result):
63
- result = await result
64
- result = cast(MapResult, result)
65
- if isinstance(result, Row):
66
- return result
67
- if isinstance(result, dict):
68
- return row.update(result)
69
- raise TypeError(f"Unsupported map_async() result type: {type(result)!r}")
61
+ with set_active_step_index(step.index):
62
+ result = step.apply_row_async(row)
63
+ if inspect.isawaitable(result):
64
+ result = await result
65
+ result = cast(MapResult, result)
66
+ if isinstance(result, Row):
67
+ return result
68
+ if isinstance(result, dict):
69
+ return row.update(result)
70
+ raise TypeError(f"Unsupported map_async() result type: {type(result)!r}")
70
71
 
71
72
  def _run_step(i: int, *, flush_all: bool) -> None:
72
73
  step = ordered[i]
@@ -22,6 +22,7 @@ from refiner.pipeline.steps import (
22
22
  VectorizedOp,
23
23
  WithColumnsStep,
24
24
  )
25
+ from refiner.worker.context import set_active_step_index
25
26
  from refiner.worker.metrics.api import log_throughput
26
27
 
27
28
 
@@ -100,7 +101,8 @@ def apply_vectorized_op(
100
101
  return next_table, next_shard_counts
101
102
 
102
103
  if isinstance(op, FnTableStep):
103
- next_table = op.fn(table)
104
+ with set_active_step_index(op.index):
105
+ next_table = op.fn(table)
104
106
  if not isinstance(next_table, pa.Table):
105
107
  raise TypeError(
106
108
  f"map_table() must return pa.Table, got {type(next_table)!r}"
@@ -2,14 +2,16 @@ from __future__ import annotations
2
2
 
3
3
  import os
4
4
  from dataclasses import dataclass
5
- from typing import TYPE_CHECKING
5
+ from typing import TYPE_CHECKING, cast
6
6
 
7
+ from refiner.cli.ui import stdin_is_interactive
7
8
  from refiner.platform.client import (
8
9
  CloudRunCreateRequest,
9
10
  CloudRuntimeConfig,
10
11
  StagePayload,
11
12
  serialize_pipeline_inline,
12
13
  )
14
+ from refiner.platform.manifest import refiner_ref_exists_on_remote
13
15
 
14
16
  from refiner.launchers.base import BaseLauncher
15
17
 
@@ -17,6 +19,9 @@ if TYPE_CHECKING:
17
19
  from refiner.pipeline import RefinerPipeline
18
20
 
19
21
 
22
+ _FALLBACK_ENV_VAR = "MACRODATA_FALLBACK_TO_LATEST_PYPI"
23
+
24
+
20
25
  @dataclass(frozen=True, slots=True)
21
26
  class CloudLaunchResult:
22
27
  job_id: str
@@ -96,6 +101,46 @@ class CloudLauncher(BaseLauncher):
96
101
  )
97
102
  return {**(secrets or {}), **(env or {})} or None
98
103
 
104
+ @staticmethod
105
+ def _fallback_to_latest_pypi_enabled() -> bool:
106
+ raw = os.environ.get(_FALLBACK_ENV_VAR, "")
107
+ return raw.strip().lower() in {"1", "true", "yes", "on"}
108
+
109
+ def _resolve_cloud_manifest(
110
+ self, *, secret_values: tuple[str, ...]
111
+ ) -> dict[str, object]:
112
+ manifest = self._run_manifest(secret_values=secret_values)
113
+ environment = manifest.get("environment")
114
+ if environment is None:
115
+ return manifest
116
+ environment_dict = cast(dict[str, object], environment)
117
+ refiner_ref = environment_dict.get("refiner_ref")
118
+ if not isinstance(refiner_ref, str) or not refiner_ref.strip():
119
+ return manifest
120
+ refiner_ref = refiner_ref.strip()
121
+ if refiner_ref_exists_on_remote(refiner_ref):
122
+ return manifest
123
+
124
+ message = (
125
+ f"Refiner ref {refiner_ref!r} is not available on GitHub. "
126
+ "Launch with the latest PyPI version instead?"
127
+ )
128
+ fallback_allowed = self._fallback_to_latest_pypi_enabled()
129
+ interactive = stdin_is_interactive()
130
+ if not fallback_allowed and interactive:
131
+ answer = input(f"{message} [y/N] ")
132
+ fallback_allowed = answer.strip().lower() in {"y", "yes"}
133
+ if fallback_allowed:
134
+ environment_dict["refiner_ref"] = None
135
+ return manifest
136
+ if interactive:
137
+ raise SystemExit("cloud launch aborted")
138
+
139
+ raise SystemExit(
140
+ f"{message} Launch aborted before submission. "
141
+ f"Set {_FALLBACK_ENV_VAR}=1 to allow fallback to the latest PyPI version."
142
+ )
143
+
99
144
  def launch(self) -> CloudLaunchResult:
100
145
  try:
101
146
  client = self._require_platform_client()
@@ -105,6 +150,7 @@ class CloudLauncher(BaseLauncher):
105
150
  resolved_env = self._resolve_env_values(self.env)
106
151
  secret_values = tuple(resolved_secrets.values()) if resolved_secrets else ()
107
152
  stages = self._planned_stages()
153
+ manifest = self._resolve_cloud_manifest(secret_values=secret_values)
108
154
  request = CloudRunCreateRequest(
109
155
  name=self.name,
110
156
  plan=self._compiled_plan(stages, secret_values=secret_values),
@@ -121,7 +167,7 @@ class CloudLauncher(BaseLauncher):
121
167
  )
122
168
  for stage in stages
123
169
  ],
124
- manifest=self._run_manifest(secret_values=secret_values),
170
+ manifest=manifest,
125
171
  sync_local_dependencies=self.sync_local_dependencies,
126
172
  secrets=self._merged_env(resolved_secrets, resolved_env),
127
173
  )
@@ -313,6 +313,14 @@ def _step_payload(
313
313
  return payload
314
314
 
315
315
 
316
+ def _sink_name_type(sink: Any) -> tuple[str, str, dict[str, Any] | None]:
317
+ payload = sink.describe()
318
+ if payload is not None:
319
+ return payload
320
+ sink_name = sink.__class__.__name__.replace("Sink", "").lower()
321
+ return sink_name or "sink", "writer", None
322
+
323
+
316
324
  def _serialize_args(
317
325
  args: dict[str, Any] | None, *, secret_values: tuple[str, ...] = ()
318
326
  ) -> dict[str, Any] | None:
@@ -389,16 +397,15 @@ def _compile_stage_steps(
389
397
  )
390
398
  )
391
399
 
392
- sink_payload = pipeline.sink.describe() if pipeline.sink is not None else None
393
- if sink_payload is not None:
394
- base_name, step_type, args = sink_payload
400
+ if pipeline.sink is not None:
401
+ base_name, step_type, args = _sink_name_type(pipeline.sink)
395
402
  unique_name = _unique_name(base_name)
396
403
  steps.append(
397
404
  _step_payload(
398
405
  name=unique_name,
399
406
  step_type=step_type,
400
407
  index=len(steps),
401
- args=_serialize_args(args),
408
+ args=_serialize_args(args, secret_values=secret_values),
402
409
  )
403
410
  )
404
411
 
@@ -81,6 +81,10 @@ class LeRobotEpisodeReader(ParquetReader):
81
81
  split_row_groups=split_row_groups,
82
82
  )
83
83
 
84
+ def describe(self) -> dict[str, Any]:
85
+ inputs = [str(root.abs_paths("")) for root in self.roots]
86
+ return {"path": ", ".join(inputs), "inputs": inputs}
87
+
84
88
  def read_shard(self, shard: Shard) -> Iterator[SourceUnit]:
85
89
  """Read one planned episode shard and emit `LeRobotTabular` blocks."""
86
90
  descriptor = shard.descriptor
@@ -6,8 +6,9 @@ import json
6
6
  import platform
7
7
  import subprocess
8
8
  import sys
9
+ from urllib import error as urllib_error
10
+ from urllib import request as urllib_request
9
11
  from collections.abc import Sequence
10
- from functools import lru_cache
11
12
  from importlib import metadata as importlib_metadata
12
13
  from pathlib import Path
13
14
  from typing import Any
@@ -90,36 +91,32 @@ def _collect_dependencies() -> list[dict[str, str]]:
90
91
 
91
92
 
92
93
  def _resolve_installed_version() -> str | None:
93
- for package_name in ("refiner", "macrodata-refiner"):
94
- try:
95
- version = importlib_metadata.version(package_name).strip()
96
- except importlib_metadata.PackageNotFoundError:
97
- continue
98
- if version:
99
- return version
100
- return None
94
+ try:
95
+ version = importlib_metadata.version("macrodata-refiner").strip()
96
+ except importlib_metadata.PackageNotFoundError:
97
+ return None
98
+ return version or None
101
99
 
102
100
 
103
101
  def _resolve_direct_url_git_sha() -> str | None:
104
- for package_name in ("refiner", "macrodata-refiner"):
105
- try:
106
- dist = importlib_metadata.distribution(package_name)
107
- except importlib_metadata.PackageNotFoundError:
108
- continue
109
- raw = dist.read_text("direct_url.json")
110
- if not raw:
111
- continue
112
- try:
113
- data = json.loads(raw)
114
- except json.JSONDecodeError:
115
- continue
116
- vcs_info = data.get("vcs_info")
117
- if not isinstance(vcs_info, dict):
118
- continue
119
- commit = str(vcs_info.get("commit_id", "")).strip()
120
- if commit:
121
- return commit
122
- return None
102
+ try:
103
+ dist = importlib_metadata.distribution("macrodata-refiner")
104
+ except importlib_metadata.PackageNotFoundError:
105
+ return None
106
+ raw = dist.read_text("direct_url.json")
107
+ if not raw:
108
+ return None
109
+ try:
110
+ data = json.loads(raw)
111
+ except json.JSONDecodeError:
112
+ return None
113
+ if not isinstance(data, dict):
114
+ return None
115
+ vcs_info = data.get("vcs_info")
116
+ if not isinstance(vcs_info, dict):
117
+ return None
118
+ commit = str(vcs_info.get("commit_id", "")).strip()
119
+ return commit or None
123
120
 
124
121
 
125
122
  def _resolve_repo_root(start: Path) -> Path | None:
@@ -148,19 +145,22 @@ def _resolve_local_repo_git_sha() -> str | None:
148
145
  return sha or None
149
146
 
150
147
 
151
- @lru_cache(maxsize=1)
152
- def _resolve_refiner_ref() -> str | None:
153
- return (
154
- _resolve_direct_url_git_sha()
155
- or _resolve_local_repo_git_sha()
156
- or _resolve_installed_version()
148
+ def refiner_ref_exists_on_remote(ref: str) -> bool:
149
+ request = urllib_request.Request(
150
+ f"https://api.github.com/repos/macrodata-labs/refiner/commits/{ref}"
157
151
  )
152
+ try:
153
+ with urllib_request.urlopen(request):
154
+ return True
155
+ except (urllib_error.HTTPError, urllib_error.URLError):
156
+ return False
158
157
 
159
158
 
160
159
  def build_run_manifest(*, secret_values: Sequence[str] = ()) -> dict[str, Any]:
161
160
  script_path = _detect_script_path()
162
161
  path, text, sha256 = _read_script(script_path)
163
- refiner_ref = _resolve_refiner_ref()
162
+ refiner_version = _resolve_installed_version()
163
+ refiner_ref = _resolve_direct_url_git_sha() or _resolve_local_repo_git_sha()
164
164
 
165
165
  manifest: dict[str, Any] = {
166
166
  "version": 1,
@@ -173,6 +173,7 @@ def build_run_manifest(*, secret_values: Sequence[str] = ()) -> dict[str, Any]:
173
173
  },
174
174
  "environment": {
175
175
  "python_version": platform.python_version(),
176
+ "refiner_version": refiner_version,
176
177
  "refiner_ref": refiner_ref,
177
178
  "platform": f"{platform.system().lower()}-{platform.machine().lower()}",
178
179
  },
@@ -135,6 +135,9 @@ class Worker:
135
135
  runtime_name,
136
136
  )
137
137
  sink = self.pipeline.sink or NullSink()
138
+ sink_step_index = (
139
+ self.pipeline._next_step_index() if self.pipeline.sink is not None else None
140
+ )
138
141
 
139
142
  def _heartbeat_once() -> None:
140
143
  with inflight_lock:
@@ -166,7 +169,8 @@ class Worker:
166
169
  if shard is None:
167
170
  return
168
171
  try:
169
- sink.on_shard_complete(shard_id)
172
+ with set_active_step_index(sink_step_index):
173
+ sink.on_shard_complete(shard_id)
170
174
  user_metrics_emitter.force_flush_user_metrics()
171
175
  runtime_lifecycle.complete(shard)
172
176
  except Exception: # noqa: BLE001
@@ -232,21 +236,26 @@ class Worker:
232
236
  shard.start_key,
233
237
  shard.end_key,
234
238
  )
235
- with set_active_step_index(0):
236
- obs_logger.info(
237
- "shard source started shard_id={} global_ordinal={}",
238
- shard.id,
239
- shard.global_ordinal,
240
- )
241
- for unit in self.pipeline.source.iter_shard_units(shard):
242
- rows = block_num_rows(unit)
243
- if rows > 0:
244
- rows_read += rows
245
- with inflight_lock:
246
- pending_rows_by_shard[shard.id] = (
247
- pending_rows_by_shard.get(shard.id, 0) + rows
248
- )
249
- yield unit
239
+ obs_logger.info(
240
+ "shard source started shard_id={} global_ordinal={}",
241
+ shard.id,
242
+ shard.global_ordinal,
243
+ )
244
+ source_iter = iter(self.pipeline.source.iter_shard_units(shard))
245
+ while True:
246
+ with set_active_step_index(0):
247
+ try:
248
+ unit = next(source_iter)
249
+ except StopIteration:
250
+ break
251
+ rows = block_num_rows(unit)
252
+ if rows > 0:
253
+ rows_read += rows
254
+ with inflight_lock:
255
+ pending_rows_by_shard[shard.id] = (
256
+ pending_rows_by_shard.get(shard.id, 0) + rows
257
+ )
258
+ yield unit
250
259
  obs_logger.info(
251
260
  "shard source finished shard_id={} global_ordinal={} rows_read={}",
252
261
  shard.id,
@@ -274,7 +283,8 @@ class Worker:
274
283
  ):
275
284
  if heartbeat_error is not None:
276
285
  raise RuntimeError(f"heartbeat failed: {heartbeat_error}")
277
- written = sink.write_block(block)
286
+ with set_active_step_index(sink_step_index):
287
+ written = sink.write_block(block)
278
288
  _apply_row_delta(
279
289
  {
280
290
  shard_id: -count
@@ -334,7 +344,8 @@ class Worker:
334
344
  stop_heartbeat.set()
335
345
  heartbeat_thread.join(timeout=1.0)
336
346
  try:
337
- sink.close()
347
+ with set_active_step_index(sink_step_index):
348
+ sink.close()
338
349
  except Exception as e:
339
350
  if execution_error is not None or run_exception is not None:
340
351
  obs_logger.warning(