traceplane 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. traceplane-0.1.0/.gitignore +29 -0
  2. traceplane-0.1.0/PKG-INFO +129 -0
  3. traceplane-0.1.0/README.md +71 -0
  4. traceplane-0.1.0/pyproject.toml +58 -0
  5. traceplane-0.1.0/scripts/setup_isaac_sim.sh +85 -0
  6. traceplane-0.1.0/src/traceplane/__init__.py +16 -0
  7. traceplane-0.1.0/src/traceplane/dataset.py +93 -0
  8. traceplane-0.1.0/src/traceplane/embeddings.py +207 -0
  9. traceplane-0.1.0/src/traceplane/jax.py +108 -0
  10. traceplane-0.1.0/src/traceplane/lerobot_reader.py +362 -0
  11. traceplane-0.1.0/src/traceplane/query.py +643 -0
  12. traceplane-0.1.0/src/traceplane/sim/__init__.py +31 -0
  13. traceplane-0.1.0/src/traceplane/sim/_compat.py +16 -0
  14. traceplane-0.1.0/src/traceplane/sim/cli.py +115 -0
  15. traceplane-0.1.0/src/traceplane/sim/config.py +58 -0
  16. traceplane-0.1.0/src/traceplane/sim/controller.py +77 -0
  17. traceplane-0.1.0/src/traceplane/sim/evaluator.py +230 -0
  18. traceplane-0.1.0/src/traceplane/sim/metrics.py +110 -0
  19. traceplane-0.1.0/src/traceplane/sim/robot.py +123 -0
  20. traceplane-0.1.0/src/traceplane/sim/scene.py +91 -0
  21. traceplane-0.1.0/src/traceplane/sim/visualizer.py +188 -0
  22. traceplane-0.1.0/src/traceplane/tf.py +86 -0
  23. traceplane-0.1.0/src/traceplane/torch.py +126 -0
  24. traceplane-0.1.0/src/traceplane/training/__init__.py +23 -0
  25. traceplane-0.1.0/src/traceplane/training/cli.py +92 -0
  26. traceplane-0.1.0/src/traceplane/training/config.py +82 -0
  27. traceplane-0.1.0/src/traceplane/training/diffusion_policy.py +314 -0
  28. traceplane-0.1.0/src/traceplane/training/eval.py +85 -0
  29. traceplane-0.1.0/src/traceplane/training/normalization.py +262 -0
  30. traceplane-0.1.0/src/traceplane/training/trainer.py +318 -0
  31. traceplane-0.1.0/src/traceplane/windowing.py +112 -0
  32. traceplane-0.1.0/tests/test_core.py +217 -0
  33. traceplane-0.1.0/tests/test_engine.py +91 -0
  34. traceplane-0.1.0/tests/test_integration.py +426 -0
  35. traceplane-0.1.0/tests/test_torch.py +107 -0
@@ -0,0 +1,29 @@
1
+ # Rust
2
+ backend/target/
3
+
4
+ # Node / Frontend
5
+ frontend/node_modules/
6
+ frontend/dist/
7
+
8
+ # Python
9
+ dataloader/__pycache__/
10
+ dataloader/src/traceplane/__pycache__/
11
+ dataloader/tests/__pycache__/
12
+ dataloader/.pytest_cache/
13
+ dataloader/src/*.egg-info/
14
+ dataloader/src/traceplane.egg-info/
15
+ *.pyc
16
+ *.pyo
17
+ __pycache__/
18
+ *.egg-info/
19
+
20
+ # IDE
21
+ .idea/
22
+ .vscode/
23
+ *.swp
24
+
25
+ # OS
26
+ .DS_Store
27
+
28
+ # Env
29
+ .env
@@ -0,0 +1,129 @@
1
+ Metadata-Version: 2.4
2
+ Name: traceplane
3
+ Version: 0.1.0
4
+ Summary: Streaming dataloader for robotics trajectory datasets
5
+ Project-URL: Homepage, https://traceplane.ai
6
+ Project-URL: Documentation, https://docs.traceplane.ai
7
+ Project-URL: Repository, https://github.com/traceplane/traceplane
8
+ Project-URL: Issues, https://github.com/traceplane/traceplane/issues
9
+ Author-email: Traceplane <hello@traceplane.ai>
10
+ License-Expression: Apache-2.0
11
+ Keywords: datasets,imitation-learning,lerobot,robotics,trajectories
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: fsspec>=2023.1
24
+ Requires-Dist: numpy>=1.24
25
+ Requires-Dist: pyarrow>=14.0
26
+ Requires-Dist: requests>=2.28
27
+ Provides-Extra: all
28
+ Requires-Dist: gcsfs>=2023.1; extra == 'all'
29
+ Requires-Dist: jax>=0.4; extra == 'all'
30
+ Requires-Dist: jaxlib>=0.4; extra == 'all'
31
+ Requires-Dist: numpy>=1.24; extra == 'all'
32
+ Requires-Dist: s3fs>=2023.1; extra == 'all'
33
+ Requires-Dist: sentence-transformers>=2.0; extra == 'all'
34
+ Requires-Dist: tensorflow>=2.14; extra == 'all'
35
+ Requires-Dist: torch>=2.0; extra == 'all'
36
+ Provides-Extra: dev
37
+ Requires-Dist: pytest>=7; extra == 'dev'
38
+ Requires-Dist: torch>=2.0; extra == 'dev'
39
+ Provides-Extra: embeddings
40
+ Requires-Dist: sentence-transformers>=2.0; extra == 'embeddings'
41
+ Provides-Extra: gcs
42
+ Requires-Dist: gcsfs>=2023.1; extra == 'gcs'
43
+ Provides-Extra: jax
44
+ Requires-Dist: jax>=0.4; extra == 'jax'
45
+ Requires-Dist: jaxlib>=0.4; extra == 'jax'
46
+ Provides-Extra: s3
47
+ Requires-Dist: s3fs>=2023.1; extra == 's3'
48
+ Provides-Extra: sim
49
+ Requires-Dist: numpy>=1.24; extra == 'sim'
50
+ Requires-Dist: torch>=2.0; extra == 'sim'
51
+ Provides-Extra: tf
52
+ Requires-Dist: tensorflow>=2.14; extra == 'tf'
53
+ Provides-Extra: torch
54
+ Requires-Dist: torch>=2.0; extra == 'torch'
55
+ Provides-Extra: training
56
+ Requires-Dist: torch>=2.0; extra == 'training'
57
+ Description-Content-Type: text/markdown
58
+
59
+ # Traceplane
60
+
61
+ Python SDK for the Traceplane trajectory data platform.
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install traceplane
67
+ ```
68
+
69
+ With framework extras:
70
+
71
+ ```bash
72
+ pip install traceplane[torch] # PyTorch DataLoader
73
+ pip install traceplane[jax] # JAX support
74
+ pip install traceplane[training] # Diffusion policy training
75
+ pip install traceplane[all] # Everything
76
+ ```
77
+
78
+ ## Quick Start
79
+
80
+ ```python
81
+ from traceplane import TraceplaneClient
82
+
83
+ client = TraceplaneClient("https://api.traceplane.ai", api_key="tp_live_...")
84
+
85
+ # Register a dataset
86
+ client.register("my_data", "/path/to/dataset", include_data=True)
87
+
88
+ # Query with SQL
89
+ rows = client.sql_rows("SELECT * FROM my_data WHERE frame_count > 100")
90
+
91
+ # Upload data
92
+ client.upload_dataset("my_data", "/path/to/parquet/files/")
93
+
94
+ # Vector search
95
+ results = client.search_similar("my_data", episode_index=0, k=5)
96
+ ```
97
+
98
+ ## Features
99
+
100
+ - **SQL query engine** -- register datasets and query with full SQL, including vector UDFs (`vec_mean`, `vec_norm`, `vec_cosine_sim`, etc.)
101
+ - **Streaming dataloaders** -- PyTorch, JAX, and TensorFlow adapters with windowed sampling
102
+ - **LeRobot format** -- native reader for LeRobot v2/v3 datasets (Parquet + MP4)
103
+ - **Similarity search** -- find related episodes via embedding-based vector search
104
+ - **Dataset upload** -- push local Parquet files to the platform
105
+ - **Retargeting** -- XR hand poses to robot action space via calibration bridge
106
+ - **Training** -- built-in diffusion policy training with `traceplane-train` CLI
107
+
108
+ ## Training Integration
109
+
110
+ ```python
111
+ from traceplane import LeRobotReader
112
+ from traceplane.torch import TorchEpisodeLoader
113
+
114
+ reader = LeRobotReader("/path/to/lerobot/dataset")
115
+ loader = TorchEpisodeLoader(reader, batch_size=32, window_size=16)
116
+
117
+ for batch in loader:
118
+ observations = batch["observation"]
119
+ actions = batch["action"]
120
+ # ... your training loop
121
+ ```
122
+
123
+ ## API Reference
124
+
125
+ Full documentation: [docs.traceplane.ai](https://docs.traceplane.ai)
126
+
127
+ ## License
128
+
129
+ Apache-2.0
@@ -0,0 +1,71 @@
1
+ # Traceplane
2
+
3
+ Python SDK for the Traceplane trajectory data platform.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install traceplane
9
+ ```
10
+
11
+ With framework extras:
12
+
13
+ ```bash
14
+ pip install traceplane[torch] # PyTorch DataLoader
15
+ pip install traceplane[jax] # JAX support
16
+ pip install traceplane[training] # Diffusion policy training
17
+ pip install traceplane[all] # Everything
18
+ ```
19
+
20
+ ## Quick Start
21
+
22
+ ```python
23
+ from traceplane import TraceplaneClient
24
+
25
+ client = TraceplaneClient("https://api.traceplane.ai", api_key="tp_live_...")
26
+
27
+ # Register a dataset
28
+ client.register("my_data", "/path/to/dataset", include_data=True)
29
+
30
+ # Query with SQL
31
+ rows = client.sql_rows("SELECT * FROM my_data WHERE frame_count > 100")
32
+
33
+ # Upload data
34
+ client.upload_dataset("my_data", "/path/to/parquet/files/")
35
+
36
+ # Vector search
37
+ results = client.search_similar("my_data", episode_index=0, k=5)
38
+ ```
39
+
40
+ ## Features
41
+
42
+ - **SQL query engine** -- register datasets and query with full SQL, including vector UDFs (`vec_mean`, `vec_norm`, `vec_cosine_sim`, etc.)
43
+ - **Streaming dataloaders** -- PyTorch, JAX, and TensorFlow adapters with windowed sampling
44
+ - **LeRobot format** -- native reader for LeRobot v2/v3 datasets (Parquet + MP4)
45
+ - **Similarity search** -- find related episodes via embedding-based vector search
46
+ - **Dataset upload** -- push local Parquet files to the platform
47
+ - **Retargeting** -- XR hand poses to robot action space via calibration bridge
48
+ - **Training** -- built-in diffusion policy training with `traceplane-train` CLI
49
+
50
+ ## Training Integration
51
+
52
+ ```python
53
+ from traceplane import LeRobotReader
54
+ from traceplane.torch import TorchEpisodeLoader
55
+
56
+ reader = LeRobotReader("/path/to/lerobot/dataset")
57
+ loader = TorchEpisodeLoader(reader, batch_size=32, window_size=16)
58
+
59
+ for batch in loader:
60
+ observations = batch["observation"]
61
+ actions = batch["action"]
62
+ # ... your training loop
63
+ ```
64
+
65
+ ## API Reference
66
+
67
+ Full documentation: [docs.traceplane.ai](https://docs.traceplane.ai)
68
+
69
+ ## License
70
+
71
+ Apache-2.0
@@ -0,0 +1,58 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "traceplane"
7
+ version = "0.1.0"
8
+ description = "Streaming dataloader for robotics trajectory datasets"
9
+ requires-python = ">=3.10"
10
+ license = "Apache-2.0"
11
+ authors = [{name = "Traceplane", email = "hello@traceplane.ai"}]
12
+ readme = "README.md"
13
+ keywords = ["robotics", "trajectories", "datasets", "imitation-learning", "lerobot"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Science/Research",
17
+ "License :: OSI Approved :: Apache Software License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
24
+ "Topic :: Scientific/Engineering",
25
+ ]
26
+ dependencies = [
27
+ "numpy>=1.24",
28
+ "pyarrow>=14.0",
29
+ "fsspec>=2023.1",
30
+ "requests>=2.28",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ torch = ["torch>=2.0"]
35
+ jax = ["jax>=0.4", "jaxlib>=0.4"]
36
+ tf = ["tensorflow>=2.14"]
37
+ s3 = ["s3fs>=2023.1"]
38
+ gcs = ["gcsfs>=2023.1"]
39
+ training = ["torch>=2.0"]
40
+ embeddings = ["sentence-transformers>=2.0"]
41
+ sim = ["torch>=2.0", "numpy>=1.24"]
42
+ all = ["traceplane[torch,jax,tf,s3,gcs,training,embeddings,sim]"]
43
+ dev = ["pytest>=7", "traceplane[torch]"]
44
+
45
+ [project.scripts]
46
+ traceplane-train = "traceplane.training.cli:main"
47
+ traceplane-embed = "traceplane.embeddings:main"
48
+ traceplane-sim-viz = "traceplane.sim.cli:main_viz"
49
+ traceplane-sim-eval = "traceplane.sim.cli:main_eval"
50
+
51
+ [project.urls]
52
+ Homepage = "https://traceplane.ai"
53
+ Documentation = "https://docs.traceplane.ai"
54
+ Repository = "https://github.com/traceplane/traceplane"
55
+ Issues = "https://github.com/traceplane/traceplane/issues"
56
+
57
+ [tool.hatch.build.targets.wheel]
58
+ packages = ["src/traceplane"]
@@ -0,0 +1,85 @@
1
#!/usr/bin/env bash
# Traceplane Isaac Sim Setup — Ubuntu 22.04/24.04 + NVIDIA RTX 5080
#
# Prerequisites:
#   - Ubuntu 22.04 or 24.04
#   - NVIDIA driver >= 565 (required for RTX 5080 / Blackwell)
#   - Python 3.10+
#
# Usage:
#   chmod +x scripts/setup_isaac_sim.sh
#   ./scripts/setup_isaac_sim.sh

# Fail fast: abort on any error, unset variable, or failed pipeline stage.
set -euo pipefail

echo "=== Traceplane Isaac Sim Setup ==="
echo ""

# Check OS — warn (do not abort) on other Ubuntu releases.
# NOTE(review): /etc/lsb-release may be absent on some distros; the 2>/dev/null
# makes that case fall into the warning branch rather than erroring.
if ! grep -qE "22\.04|24\.04" /etc/lsb-release 2>/dev/null; then
    echo "WARNING: This script targets Ubuntu 22.04/24.04."
    echo "Current OS: $(lsb_release -ds 2>/dev/null || echo 'unknown')"
    echo ""
fi

# Check NVIDIA driver — hard requirement; Isaac Sim cannot run without it.
if ! command -v nvidia-smi &>/dev/null; then
    echo "ERROR: nvidia-smi not found. Install NVIDIA driver first:"
    echo "  sudo apt update && sudo apt install nvidia-driver-565"
    exit 1
fi

DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1)
echo "NVIDIA driver: $DRIVER_VERSION"

# Compare only the major version; warn (do not abort) if it looks too old.
DRIVER_MAJOR=$(echo "$DRIVER_VERSION" | cut -d. -f1)
if [ "$DRIVER_MAJOR" -lt 565 ]; then
    echo "WARNING: Driver $DRIVER_VERSION may be too old for RTX 5080."
    echo "Recommended: >= 565. Install with:"
    echo "  sudo apt install nvidia-driver-565"
    echo ""
fi

# Report the detected GPU (informational only — no model check is enforced).
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
echo "GPU: $GPU_NAME"
echo ""

# Check Python; PYTHON may be overridden from the environment (e.g. PYTHON=python3.11).
PYTHON=${PYTHON:-python3}
PY_VERSION=$($PYTHON --version 2>&1)
echo "Python: $PY_VERSION"
echo ""

# Install Isaac Sim from NVIDIA's package index (pinned to the 5.x series).
echo "=== Installing Isaac Sim 5.x ==="
echo "This may take several minutes..."
$PYTHON -m pip install isaacsim==5.* --extra-index-url https://pypi.nvidia.com

# Install the Traceplane sim extra in editable mode from the repo checkout.
echo ""
echo "=== Installing Traceplane sim module ==="
# Resolve the directory containing this script, then its parent (the package root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DATALOADER_DIR="$(dirname "$SCRIPT_DIR")"
$PYTHON -m pip install -e "$DATALOADER_DIR[sim]"

# Smoke test: boot a headless SimulationApp and shut it down cleanly.
echo ""
echo "=== Smoke test ==="
$PYTHON -c "
from isaacsim import SimulationApp
app = SimulationApp({'headless': True})
print('Isaac Sim loaded successfully')
app.close()
print('Smoke test passed!')
"

echo ""
echo "=== Setup complete ==="
echo ""
echo "Quick start:"
echo "  # Replay a dataset episode in sim"
echo "  traceplane-sim-viz --dataset-path /path/to/dataset --episode-index 0"
echo ""
echo "  # Evaluate a policy"
echo "  traceplane-sim-eval /path/to/checkpoint.pt --num-episodes 10 --headless"
@@ -0,0 +1,16 @@
1
+ """Traceplane — streaming dataloader for robotics trajectory datasets."""
2
+
3
+ from traceplane.dataset import Episode, TrajectoryDataset
4
+ from traceplane.lerobot_reader import LeRobotReader
5
+ from traceplane.windowing import WindowedDataset
6
+ from traceplane.query import TraceplaneClient
7
+
8
+ __version__ = "0.1.0"
9
+
10
+ __all__ = [
11
+ "Episode",
12
+ "TrajectoryDataset",
13
+ "LeRobotReader",
14
+ "WindowedDataset",
15
+ "TraceplaneClient",
16
+ ]
@@ -0,0 +1,93 @@
1
+ """Core dataset abstractions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Iterator, Sequence
7
+
8
+ import numpy as np
9
+
10
+
11
@dataclass
class Episode:
    """A single trajectory episode.

    Attributes:
        episode_id: Unique identifier (e.g. "episode_000042").
        observations: Dict of observation arrays keyed by modality.
            Common keys: "state" (proprioception), "action", image camera names.
            Each value is shape (T, D) for vectors or (T, H, W, C) for images.
        actions: Action array, shape (T, action_dim).
        timestamps: Monotonic timestamps in seconds, shape (T,).
        metadata: Arbitrary episode metadata (task label, fps, success, etc.).
    """

    episode_id: str
    observations: dict[str, np.ndarray] = field(default_factory=dict)
    actions: np.ndarray = field(default_factory=lambda: np.empty((0,)))
    timestamps: np.ndarray = field(default_factory=lambda: np.empty((0,)))
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def length(self) -> int:
        """Number of timesteps.

        Inferred from ``actions``, then ``timestamps``, then the first
        observation array with a non-empty leading axis; 0 if none apply.
        """
        if self.actions.ndim >= 1 and self.actions.shape[0] > 0:
            return self.actions.shape[0]
        if self.timestamps.ndim >= 1 and self.timestamps.shape[0] > 0:
            return self.timestamps.shape[0]
        for v in self.observations.values():
            # Require ndim >= 1: a 0-d array (e.g. np.float64) has shape ()
            # and would raise IndexError on shape[0] with only a hasattr check.
            if hasattr(v, "shape") and getattr(v, "ndim", 0) >= 1 and v.shape[0] > 0:
                return v.shape[0]
        return 0

    @property
    def action_dim(self) -> int:
        """Dimensionality of the action space, or 0 if ``actions`` is not 2-D."""
        if self.actions.ndim == 2:
            return self.actions.shape[1]
        return 0
48
+
49
+
50
class TrajectoryDataset:
    """Abstract base class for trajectory datasets.

    Concrete datasets implement ``__len__`` and ``__getitem__``; iteration
    and filtering are provided on top of those two primitives.
    """

    def __len__(self) -> int:
        raise NotImplementedError

    def __getitem__(self, idx: int) -> Episode:
        raise NotImplementedError

    def __iter__(self) -> Iterator[Episode]:
        # Delegate to __getitem__ so subclasses only implement indexing.
        yield from (self[index] for index in range(len(self)))

    def episode_ids(self) -> list[str]:
        """Return all episode IDs in order."""
        raise NotImplementedError

    def filter(self, episode_ids: Sequence[str]) -> "FilteredDataset":
        """Return a view of this dataset restricted to the given episodes."""
        return FilteredDataset(self, list(episode_ids))
73
+
74
+
75
class FilteredDataset(TrajectoryDataset):
    """A filtered view over another dataset.

    Holds parent indices for the requested episode ids; requested ids that do
    not exist in the parent are silently dropped.
    """

    def __init__(self, parent: TrajectoryDataset, episode_ids: list[str]):
        self._parent = parent
        # Build index map once: episode_id -> parent index, for O(1) lookups.
        parent_ids = parent.episode_ids()
        self._id_to_idx = {eid: i for i, eid in enumerate(parent_ids)}
        # Keep only ids present in the parent, so self._ids[k] always names the
        # episode returned by self[k]. (Previously _ids kept ALL requested ids,
        # so episode_ids() could return ids that were dropped — misaligned with
        # indexing whenever a requested id was missing from the parent.)
        self._ids = [eid for eid in episode_ids if eid in self._id_to_idx]
        self._indices = [self._id_to_idx[eid] for eid in self._ids]

    def __len__(self) -> int:
        return len(self._indices)

    def __getitem__(self, idx: int) -> Episode:
        return self._parent[self._indices[idx]]

    def episode_ids(self) -> list[str]:
        """Return the ids of this view's episodes, aligned with ``__getitem__``."""
        return list(self._ids)
@@ -0,0 +1,207 @@
1
+ """Episode embedding computation for similarity search.
2
+
3
+ Computes per-episode feature vectors and writes them to Parquet for
4
+ use with the DataFusion ``vec_cosine_sim`` UDF.
5
+
6
+ Two strategies:
7
+ - **Trajectory features** (always available): statistical aggregates
8
+ of observation.state and action vectors per episode.
9
+ - **Text embeddings** (optional, requires ``sentence-transformers``):
10
+ encodes task_label strings with a pretrained language model.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import os
17
+ import sys
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ import numpy as np
22
+ import pyarrow as pa
23
+ import pyarrow.parquet as pq
24
+
25
+ from traceplane.dataset import Episode
26
+ from traceplane.lerobot_reader import LeRobotReader
27
+
28
+
29
+ def compute_trajectory_embedding(episode: Episode) -> np.ndarray:
30
+ """Compute a feature vector from an episode's state and action statistics.
31
+
32
+ Concatenates [mean, std, min, max] per dimension for both
33
+ observation.state and action, then L2-normalises.
34
+
35
+ Returns:
36
+ 1-D float32 array.
37
+ """
38
+ parts: list[np.ndarray] = []
39
+
40
+ # State features
41
+ state = episode.observations.get("state")
42
+ if state is not None and state.ndim == 2 and state.shape[0] > 0:
43
+ parts.extend(_stat_features(state))
44
+
45
+ # Action features
46
+ if episode.actions.ndim == 2 and episode.actions.shape[0] > 0:
47
+ parts.extend(_stat_features(episode.actions))
48
+
49
+ if not parts:
50
+ # Fallback: zero vector
51
+ return np.zeros(1, dtype=np.float32)
52
+
53
+ vec = np.concatenate(parts).astype(np.float32)
54
+
55
+ # L2 normalise
56
+ norm = np.linalg.norm(vec)
57
+ if norm > 1e-8:
58
+ vec /= norm
59
+
60
+ return vec
61
+
62
+
63
+ def _stat_features(arr: np.ndarray) -> list[np.ndarray]:
64
+ """Compute [mean, std, min, max] per dimension."""
65
+ return [
66
+ arr.mean(axis=0),
67
+ arr.std(axis=0),
68
+ arr.min(axis=0),
69
+ arr.max(axis=0),
70
+ ]
71
+
72
+
73
def compute_text_embeddings(
    labels: list[str],
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
) -> np.ndarray:
    """Encode text labels with a sentence-transformer model.

    Args:
        labels: Strings to encode (e.g. task labels).
        model_name: Hugging Face model id of the encoder.

    Returns:
        (N, D) float32 array of L2-normalised embeddings.

    Raises:
        ImportError: If ``sentence-transformers`` is not installed.
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        # Chain the original error so the underlying import failure
        # (e.g. a broken transitive dependency) stays in the traceback.
        raise ImportError(
            "sentence-transformers is required for text embeddings. "
            "Install with: pip install traceplane[embeddings]"
        ) from exc
    model = SentenceTransformer(model_name)
    embeddings = model.encode(labels, show_progress_bar=False, normalize_embeddings=True)
    return np.asarray(embeddings, dtype=np.float32)
92
+
93
+
94
def compute_embeddings(
    dataset_path: str,
    output_dir: str | None = None,
    include_text: bool = False,
    episode_indices: list[int] | None = None,
    storage_options: dict[str, Any] | None = None,
) -> str:
    """Compute episode embeddings and write to Parquet.

    Produces one row per episode with columns ``episode_index``,
    ``task_label``, ``trajectory_embedding`` (list<float32>), and — when
    ``include_text`` is set — ``text_embedding`` (list<float32>).
    Progress is logged to stderr.

    Args:
        dataset_path: Path to a LeRobot dataset.
        output_dir: Output directory. Defaults to ``{dataset_path}/embeddings``.
        include_text: Also compute text embeddings from task labels.
        episode_indices: Subset of episodes. None = all.
        storage_options: fsspec options for remote datasets.

    Returns:
        Path to the written Parquet file.

    Raises:
        ValueError: If the dataset contains no episodes.
        ImportError: If ``include_text`` is set but sentence-transformers
            is not installed (raised by ``compute_text_embeddings``).
    """
    reader = LeRobotReader(
        dataset_path,
        storage_options=storage_options,
        episode_indices=episode_indices,
    )

    n = len(reader)
    if n == 0:
        raise ValueError("Dataset has no episodes")

    print(f"Computing embeddings for {n} episodes...", file=sys.stderr)

    indices: list[int] = []
    labels: list[str] = []
    traj_embeddings: list[np.ndarray] = []

    for i in range(n):
        ep = reader[i]
        traj_emb = compute_trajectory_embedding(ep)
        traj_embeddings.append(traj_emb)

        # Extract episode index from metadata or ID; fall back to the loop
        # position when metadata is missing or unparsable.
        ep_idx = ep.metadata.get("episode_index", i)
        if isinstance(ep_idx, str):
            # Handle string ids such as "episode_000042" -> 42.
            try:
                ep_idx = int(ep_idx.split("_")[-1])
            except (ValueError, IndexError):
                ep_idx = i
        indices.append(int(ep_idx))
        labels.append(ep.metadata.get("task_label", ""))

        # Progress heartbeat every 50 episodes, plus a final line.
        if (i + 1) % 50 == 0 or i == n - 1:
            print(f" {i + 1}/{n}", file=sys.stderr)

    # Build arrow arrays. Embeddings are stored as variable-length
    # list<float32>; traj_dim is reported for logging only.
    traj_dim = traj_embeddings[0].shape[0]
    arrow_traj = pa.list_(pa.float32())

    columns: dict[str, Any] = {
        "episode_index": pa.array(indices, type=pa.int64()),
        "task_label": pa.array(labels, type=pa.utf8()),
        "trajectory_embedding": pa.array(
            [emb.tolist() for emb in traj_embeddings],
            type=arrow_traj,
        ),
    }

    # Optional text embeddings: encode each unique label once, then fan the
    # vectors back out per episode (avoids re-encoding duplicate labels).
    if include_text:
        unique_labels = list(set(labels))
        print(f"Computing text embeddings for {len(unique_labels)} unique labels...", file=sys.stderr)
        text_embs = compute_text_embeddings(unique_labels)
        label_to_emb = {lbl: text_embs[i] for i, lbl in enumerate(unique_labels)}
        text_vecs = [label_to_emb[lbl].tolist() for lbl in labels]
        columns["text_embedding"] = pa.array(
            text_vecs,
            type=pa.list_(pa.float32()),
        )

    table = pa.table(columns)

    # Write the Parquet file, creating the output directory if needed.
    # NOTE(review): os.path.join assumes a local dataset_path; remote
    # (fsspec) dataset paths would need an explicit local output_dir.
    if output_dir is None:
        output_dir = os.path.join(dataset_path, "embeddings")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "episode_embeddings.parquet")
    pq.write_table(table, output_path)

    print(f"Written {n} embeddings ({traj_dim}-dim) to {output_path}", file=sys.stderr)
    return output_path
183
+
184
+
185
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: parse arguments, compute embeddings, print the path."""
    parser = argparse.ArgumentParser(
        description="Compute episode embeddings for similarity search",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("dataset_path", help="Path to LeRobot dataset")
    parser.add_argument(
        "--output-dir",
        help="Output directory (default: {dataset_path}/embeddings)",
    )
    parser.add_argument(
        "--include-text",
        action="store_true",
        help="Compute text embeddings (requires sentence-transformers)",
    )
    parser.add_argument(
        "--episodes",
        type=int,
        nargs="+",
        help="Specific episode indices",
    )
    opts = parser.parse_args(argv)

    # Run the computation, then emit the output path on stdout so callers
    # can capture it (all progress logging goes to stderr).
    result_path = compute_embeddings(
        opts.dataset_path,
        output_dir=opts.output_dir,
        include_text=opts.include_text,
        episode_indices=opts.episodes,
    )
    print(result_path)
204
+
205
+
206
# Allow direct execution (python -m traceplane.embeddings) in addition to
# the traceplane-embed console script declared in pyproject.toml.
if __name__ == "__main__":
    main()