egogym 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. baselines/pi_policy.py +110 -0
  2. baselines/rum/__init__.py +1 -0
  3. baselines/rum/loss_fns/__init__.py +37 -0
  4. baselines/rum/loss_fns/abstract_loss_fn.py +13 -0
  5. baselines/rum/loss_fns/diffusion_policy_loss_fn.py +114 -0
  6. baselines/rum/loss_fns/rvq_loss_fn.py +104 -0
  7. baselines/rum/loss_fns/vqbet_loss_fn.py +202 -0
  8. baselines/rum/models/__init__.py +1 -0
  9. baselines/rum/models/bet/__init__.py +3 -0
  10. baselines/rum/models/bet/bet.py +347 -0
  11. baselines/rum/models/bet/gpt.py +277 -0
  12. baselines/rum/models/bet/tokenized_bet.py +454 -0
  13. baselines/rum/models/bet/utils.py +124 -0
  14. baselines/rum/models/bet/vqbet.py +410 -0
  15. baselines/rum/models/bet/vqvae/__init__.py +3 -0
  16. baselines/rum/models/bet/vqvae/residual_vq.py +346 -0
  17. baselines/rum/models/bet/vqvae/vector_quantize_pytorch.py +1194 -0
  18. baselines/rum/models/bet/vqvae/vqvae.py +313 -0
  19. baselines/rum/models/bet/vqvae/vqvae_utils.py +30 -0
  20. baselines/rum/models/custom.py +33 -0
  21. baselines/rum/models/encoders/__init__.py +0 -0
  22. baselines/rum/models/encoders/abstract_base_encoder.py +70 -0
  23. baselines/rum/models/encoders/identity.py +45 -0
  24. baselines/rum/models/encoders/timm_encoders.py +82 -0
  25. baselines/rum/models/policies/diffusion_policy.py +881 -0
  26. baselines/rum/models/policies/open_loop.py +122 -0
  27. baselines/rum/models/policies/simple_open_loop.py +108 -0
  28. baselines/rum/molmo/server.py +144 -0
  29. baselines/rum/policy.py +293 -0
  30. baselines/rum/utils/__init__.py +212 -0
  31. baselines/rum/utils/action_transforms.py +22 -0
  32. baselines/rum/utils/decord_transforms.py +135 -0
  33. baselines/rum/utils/rpc.py +249 -0
  34. baselines/rum/utils/schedulers.py +71 -0
  35. baselines/rum/utils/trajectory_vis.py +128 -0
  36. baselines/rum/utils/zmq_utils.py +281 -0
  37. baselines/rum_policy.py +108 -0
  38. egogym/__init__.py +8 -0
  39. egogym/assets/constants.py +1804 -0
  40. egogym/components/__init__.py +1 -0
  41. egogym/components/object.py +94 -0
  42. egogym/egogym.py +106 -0
  43. egogym/embodiments/__init__.py +10 -0
  44. egogym/embodiments/arms/__init__.py +4 -0
  45. egogym/embodiments/arms/arm.py +65 -0
  46. egogym/embodiments/arms/droid.py +49 -0
  47. egogym/embodiments/grippers/__init__.py +4 -0
  48. egogym/embodiments/grippers/floating_gripper.py +58 -0
  49. egogym/embodiments/grippers/rum.py +6 -0
  50. egogym/embodiments/robot.py +95 -0
  51. egogym/evaluate.py +216 -0
  52. egogym/managers/__init__.py +2 -0
  53. egogym/managers/objects_managers.py +30 -0
  54. egogym/managers/textures_manager.py +21 -0
  55. egogym/misc/molmo_client.py +49 -0
  56. egogym/misc/molmo_server.py +197 -0
  57. egogym/policies/__init__.py +1 -0
  58. egogym/policies/base_policy.py +13 -0
  59. egogym/scripts/analayze.py +834 -0
  60. egogym/scripts/plot.py +87 -0
  61. egogym/scripts/plot_correlation.py +392 -0
  62. egogym/scripts/plot_correlation_hardcoded.py +338 -0
  63. egogym/scripts/plot_failure.py +248 -0
  64. egogym/scripts/plot_failure_hardcoded.py +195 -0
  65. egogym/scripts/plot_failure_vlm.py +257 -0
  66. egogym/scripts/plot_failure_vlm_hardcoded.py +177 -0
  67. egogym/scripts/plot_line.py +303 -0
  68. egogym/scripts/plot_line_hardcoded.py +285 -0
  69. egogym/scripts/plot_pi0_bars.py +169 -0
  70. egogym/tasks/close.py +84 -0
  71. egogym/tasks/open.py +85 -0
  72. egogym/tasks/pick.py +121 -0
  73. egogym/utils.py +969 -0
  74. egogym/wrappers/__init__.py +20 -0
  75. egogym/wrappers/episode_monitor.py +282 -0
  76. egogym/wrappers/unprivileged_chatgpt.py +163 -0
  77. egogym/wrappers/unprivileged_gemini.py +157 -0
  78. egogym/wrappers/unprivileged_molmo.py +88 -0
  79. egogym/wrappers/unprivileged_moondream.py +121 -0
  80. egogym-0.1.0.dist-info/METADATA +52 -0
  81. egogym-0.1.0.dist-info/RECORD +83 -0
  82. egogym-0.1.0.dist-info/WHEEL +5 -0
  83. egogym-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,338 @@
import numpy as np
import pandas as pd
import altair as alt
from scipy.stats import beta, pearsonr, gaussian_kde


def plot_bayesian_correlation_hardcoded():
    """Plot sim-to-real correlation of per-checkpoint success rates.

    Success counts are hardcoded. Each rate gets a Beta(1 + successes,
    1 + failures) posterior (uniform prior); the real-world posterior is
    drawn as a violin centered at the simulated posterior mean, and a
    bootstrap over both posteriors gives a 95% CI for Pearson's r.
    Saves the chart as HTML, PNG and PDF.
    """
    # Set random seed for reproducibility
    np.random.seed(42)

    # Register and enable custom font theme
    alt.themes.register('custom_theme', lambda: {
        'config': {
            'title': {'font': 'Produkt'},
            'axis': {'labelFont': 'Produkt', 'titleFont': 'Produkt'},
            'legend': {'labelFont': 'Produkt', 'titleFont': 'Produkt'},
            'mark': {'font': 'Produkt'},
            'text': {'font': 'Produkt'},
        }
    })
    alt.themes.enable('custom_theme')

    # Hardcoded success counts per checkpoint
    checkpoint_data = {
        "checkpoint_10": {"sim_success": 1001, "sim_total": 5000, "real_success": 60, "real_total": 250},
        "checkpoint_50": {"sim_success": 1847, "sim_total": 5000, "real_success": 96, "real_total": 250},
        "checkpoint_64": {"sim_success": 3137, "sim_total": 5000, "real_success": 168, "real_total": 250},
        "checkpoint_80": {"sim_success": 3994, "sim_total": 5000, "real_success": 208, "real_total": 250},
    }

    stats = {}
    checkpoint_order = ["checkpoint_10", "checkpoint_50", "checkpoint_64", "checkpoint_80"]

    for checkpoint in checkpoint_order:
        data = checkpoint_data[checkpoint]

        # Beta posterior with a uniform prior: alpha = 1 + successes,
        # beta = 1 + failures; mean reported in percent.
        a_sim = 1 + data["sim_success"]
        b_sim = 1 + (data["sim_total"] - data["sim_success"])
        sim_mean = 100 * a_sim / (a_sim + b_sim)

        a_real = 1 + data["real_success"]
        b_real = 1 + (data["real_total"] - data["real_success"])
        real_mean = 100 * a_real / (a_real + b_real)

        print(f"{checkpoint}: sim={data['sim_success']}/{data['sim_total']} ({sim_mean:.1f}%), real={real_mean:.1f}% ({data['real_success']}/{data['real_total']})")

        stats[checkpoint] = {
            "sim_mean": sim_mean,
            "sim_alpha": a_sim,
            "sim_beta": b_sim,
            "real_mean": real_mean,
            "real_alpha": a_real,
            "real_beta": b_real,
        }

    if not stats:
        print("No data found!")
        return

    checkpoint_names = list(stats.keys())
    sim_rates = np.array([stats[c]["sim_mean"] for c in checkpoint_names])
    real_rates = np.array([stats[c]["real_mean"] for c in checkpoint_names])

    r, p_value = pearsonr(sim_rates, real_rates)

    # Central points with a 95% credible interval on the simulated rate
    # (used for the horizontal error bars).
    point_data = []
    for i, checkpoint in enumerate(checkpoint_names):
        sim_lo = beta.ppf(0.025, stats[checkpoint]["sim_alpha"], stats[checkpoint]["sim_beta"]) * 100
        sim_hi = beta.ppf(0.975, stats[checkpoint]["sim_alpha"], stats[checkpoint]["sim_beta"]) * 100
        point_data.append({
            'checkpoint': checkpoint,
            'sim_rate': sim_rates[i],
            'real_rate': real_rates[i],
            'sim_lo': sim_lo,
            'sim_hi': sim_hi,
        })

    # Bootstrap a 95% CI for the correlation: resample each checkpoint's
    # sim/real rates from their posteriors and recompute Pearson's r.
    n_samples = 1000
    r_samples = []
    for _ in range(n_samples):
        sim_sample = [beta.rvs(stats[c]["sim_alpha"], stats[c]["sim_beta"]) * 100 for c in checkpoint_names]
        real_sample = [beta.rvs(stats[c]["real_alpha"], stats[c]["real_beta"]) * 100 for c in checkpoint_names]
        r_sample, _ = pearsonr(sim_sample, real_sample)
        r_samples.append(r_sample)
    r_samples = np.array(r_samples)
    r_lo = np.percentile(r_samples, 2.5)
    r_hi = np.percentile(r_samples, 97.5)

    point_df = pd.DataFrame(point_data)

    # Least-squares regression line over the checkpoint means
    z = np.polyfit(sim_rates, real_rates, 1)
    p = np.poly1d(z)
    xs = np.linspace(sim_rates.min() - 5, sim_rates.max() + 5, 200)
    regression_df = pd.DataFrame({'sim_rate': xs, 'real_rate': p(xs)})

    # Manually create violin shapes positioned at sim_rate coordinates
    violin_width_scale = 2.5
    violin_polygon_data = []

    # Color mapping from lowest to highest checkpoint
    checkpoint_colors = {
        "checkpoint_10": "#F8F0FA",  # very light purple
        "checkpoint_50": "#D6BAE2",  # medium purple
        "checkpoint_64": "#9B66BB",  # medium-dark purple
        "checkpoint_80": "#4B136D",  # deep purple
    }

    # One lookup table instead of filtering point_df per checkpoint.
    sim_rate_of = dict(zip(checkpoint_names, sim_rates))
    real_rate_of = dict(zip(checkpoint_names, real_rates))

    for checkpoint in checkpoint_names:
        # KDE of samples from the real-rate posterior gives the violin outline
        y_samples = beta.rvs(stats[checkpoint]["real_alpha"], stats[checkpoint]["real_beta"], size=2000) * 100
        kde = gaussian_kde(y_samples)

        y_points = np.linspace(y_samples.min(), y_samples.max(), 100)
        densities = kde(y_points)
        densities = densities / densities.max() * violin_width_scale

        xc = sim_rate_of[checkpoint]
        color = checkpoint_colors[checkpoint]

        # Create closed polygon: left side up, then right side down
        for i, (y, d) in enumerate(zip(y_points, densities)):
            violin_polygon_data.append({
                'checkpoint': checkpoint,
                'x': xc - d,
                'y': y,
                'order': i,
                'color': color,
            })
        # Right side going back down
        for i, (y, d) in enumerate(zip(y_points[::-1], densities[::-1])):
            violin_polygon_data.append({
                'checkpoint': checkpoint,
                'x': xc + d,
                'y': y,
                'order': len(y_points) + i,
                'color': color,
            })

    violin_polygon_df = pd.DataFrame(violin_polygon_data)

    # Create violin shapes with color encoding
    violins = alt.Chart(violin_polygon_df).mark_line(
        fillOpacity=0.6,
        stroke='#8B4789',
        strokeWidth=1.1,
        interpolate='linear',
        filled=True
    ).encode(
        x=alt.X('x:Q', title='EgoGym Performance (%)').scale(zero=False),
        y=alt.Y('y:Q', title='Real Performance (%)').scale(zero=False),
        order='order:Q',
        detail='checkpoint:N',
        fill=alt.Fill('color:N', scale=None, legend=None)
    )

    # Create regression line
    regression_line = alt.Chart(regression_df).mark_line(
        strokeDash=[5, 5],
        color='black',
        opacity=0.5,
        size=1.5
    ).encode(
        x=alt.X('sim_rate:Q').scale(zero=False),
        y=alt.Y('real_rate:Q').scale(zero=False)
    )

    # Create error bars for simulation
    error_bars = alt.Chart(point_df).mark_errorbar(ticks=True, thickness=1.5).encode(
        x=alt.X('sim_lo:Q', title=''),
        x2=alt.X2('sim_hi:Q'),
        y='real_rate:Q'
    )

    # Create horizontal lines at mean (for each violin)
    mean_line_data = [{
        'checkpoint': checkpoint,
        'x_left': sim_rate_of[checkpoint] - 1.0,
        'x_right': sim_rate_of[checkpoint] + 1.0,
        'y': real_rate_of[checkpoint],
    } for checkpoint in checkpoint_names]
    mean_line_df = pd.DataFrame(mean_line_data)

    mean_lines = alt.Chart(mean_line_df).mark_rule(color='black', size=1.2).encode(
        x=alt.X('x_left:Q'),
        x2=alt.X2('x_right:Q'),
        y='y:Q'
    )

    # Create central points
    points = alt.Chart(point_df).mark_point(
        filled=True,
        size=100,
        color='#A894DB',
        stroke='#8B4789',
        strokeWidth=1.5,
        shape='diamond'
    ).encode(
        x=alt.X('sim_rate:Q').scale(zero=False),
        y=alt.Y('real_rate:Q').scale(zero=False),
        tooltip=['checkpoint', 'sim_rate', 'real_rate']
    )

    # Combine layers (violins, regression line, error bars, mean lines and points on top)
    chart = (violins + regression_line + mean_lines + error_bars + points).properties(
        title={
            'text': ' Blind EgoGym-Pick Sim-to-Real Correlation',
            'fontSize': 24,
            'anchor': 'start',
            'dx': 15,
            'dy': -8
        },
        width=500,
        height=400
    ).configure_axis(
        labelFontSize=16,
        titleFontSize=18,
        titleFontStyle='normal',
        grid=True,
        gridOpacity=0.3,
        tickCount=6
    ).configure_view(
        strokeWidth=0
    )

    # Add background box for correlation text (wider and thinner)
    correlation_box = alt.Chart(pd.DataFrame([{
        'x': point_df['sim_rate'].max() - 18,
        'y': point_df['real_rate'].min() - 10,
        'x2': point_df['sim_rate'].max() + 4,
        'y2': point_df['real_rate'].min() - 4
    }])).mark_rect(
        fill='#F7D45B',
        stroke='gray',
        strokeWidth=1.5,
        opacity=0.9
    ).encode(
        x=alt.X('x:Q'),
        y=alt.Y('y:Q'),
        x2='x2:Q',
        y2='y2:Q'
    )

    # Add correlation text annotation in box
    correlation_text = alt.Chart(pd.DataFrame([{
        'x': point_df['sim_rate'].max() - 7,
        'y': point_df['real_rate'].min() - 7,
        'text': f'95% CI r: {r_lo:.3f}, {r_hi:.3f}'
    }])).mark_text(
        align='center',
        baseline='middle',
        fontSize=16,
        font='Produkt'
    ).encode(
        x=alt.X('x:Q'),
        y=alt.Y('y:Q'),
        text='text:N'
    )

    # Combine with correlation box and text
    chart = chart + correlation_box + correlation_text

    # Save chart
    chart.save("checkpoint_correlation.html")
    chart.save("checkpoint_correlation.png", scale_factor=2.0)
    chart.save("checkpoint_correlation.pdf", scale_factor=2.0)
    print(f"\nCorrelation: r = {r:.3f}, p = {p_value:.4f}")
    print("\nPlot saved to: checkpoint_correlation.html and checkpoint_correlation.png")


if __name__ == "__main__":
    plot_bayesian_correlation_hardcoded()
@@ -0,0 +1,248 @@

import os
import numpy as np
import pandas as pd
import altair as alt

# Register custom font for export. The theme only sets fonts; all other
# styling is configured per-chart in plot_failure_modes().
alt.themes.register('custom_theme', lambda: {
    'config': {
        'title': {'font': 'Produkt'},
        'axis': {'labelFont': 'Produkt', 'titleFont': 'Produkt'},
        'legend': {'labelFont': 'Produkt', 'titleFont': 'Produkt'},
        'mark': {'font': 'Produkt'},
        'text': {'font': 'Produkt'},
    }
})
alt.themes.enable('custom_theme')

# Root directory holding one <checkpoint>/log.csv per evaluated checkpoint.
BASE_DIR = "logs/5_objects"
# Episodes whose max_reward exceeds this count as successes.
REWARD_THRESHOLD = 0.03
# Lifts above this (but below REWARD_THRESHOLD) count as partial lifts.
PARTIAL_LIFT_THRESHOLD = 0.005


def compute_outcomes_from_csv(csv_path):
    """Classify every episode in a tab-separated log into an outcome bucket.

    Parameters
    ----------
    csv_path : str
        Path to a tab-separated log. Requires a ``max_reward`` column;
        ``grasped_bodies``, ``object_name``, ``grasping_object`` and
        ``is_grasping`` are read when present (missing ones default to
        empty / False).

    Returns
    -------
    tuple
        ``(outcomes, total_episodes)`` where ``outcomes`` maps each outcome
        label to its episode count, or ``(None, None)`` when the file does
        not exist.
    """
    if not os.path.exists(csv_path):
        return None, None
    df = pd.read_csv(csv_path, sep="\t")
    total_episodes = len(df)

    def get_failure_mode(row):
        # Success: the object was lifted above the reward threshold.
        if row["max_reward"] > REWARD_THRESHOLD:
            return "Success"

        bodies_contacted = str(row.get("grasped_bodies", ""))
        object_name = str(row.get("object_name", ""))
        grasping_object = row.get("grasping_object", False)
        is_grasping = row.get("is_grasping", False)  # Final gripper state only

        # Substring checks against the stringified grasped_bodies list.
        has_target_contact = object_name in bodies_contacted
        has_gripper_contact = "left" in bodies_contacted or "right" in bodies_contacted
        has_any_object_contact = "object" in bodies_contacted
        has_wrong_object_contact = has_any_object_contact and not has_target_contact

        # Decision tree, most specific first.
        # Note: is_grasping is only the final state; grasping_object tracks
        # whether the target was ever grasped during the episode.

        # 1. Successfully grasped target but didn't lift high enough
        if grasping_object and row["max_reward"] >= PARTIAL_LIFT_THRESHOLD:
            return "Did not lift enough"

        # 2. Grasped the target without a meaningful lift, or merely touched it
        #    (the original branches 2 and 3 returned the same label).
        if grasping_object or has_target_contact:
            return "Object touched but not grasped"

        # 3. Contacted a non-target object (lifted or not — same label either way)
        if has_wrong_object_contact:
            return "Picked wrong object"

        # 4. Gripper closed at end, or gripper self-contact with no object identified
        if is_grasping or has_gripper_contact:
            return "Empty Grasp"

        # 5. Never made meaningful contact with anything
        return "Did not grasp"

    df["outcome"] = df.apply(get_failure_mode, axis=1)

    # Fixed bucket order so downstream plots get a stable key set.
    outcomes = {
        "Success": 0,
        "Did not lift enough": 0,
        "Object touched but not grasped": 0,
        "Picked wrong object": 0,
        "Empty Grasp": 0,
        "Did not grasp": 0,
    }
    for mode, count in df["outcome"].value_counts().items():
        if mode in outcomes:
            outcomes[mode] = int(count)

    return outcomes, total_episodes
95
+
def plot_failure_modes():
    """Build and save a stacked-bar chart of per-checkpoint episode outcomes.

    Reads one ``log.csv`` per checkpoint under ``BASE_DIR``, classifies the
    episodes via ``compute_outcomes_from_csv``, prints a per-checkpoint
    breakdown to the terminal, and writes the chart to HTML/PDF/PNG.
    """
    checkpoint_order = ["checkpoint_10", "checkpoint_50", "checkpoint_64", "checkpoint_80"]
    checkpoint_labels = {
        "checkpoint_10": "Checkpoint 24%",
        "checkpoint_50": "Checkpoint 39%",
        "checkpoint_64": "Checkpoint 68%",
        "checkpoint_80": "Checkpoint 83%",
    }

    # Gather outcome counts per checkpoint; skip any checkpoint whose log is missing.
    outcome_data = {}
    episode_totals = {}
    for ckpt in checkpoint_order:
        counts, n_episodes = compute_outcomes_from_csv(os.path.join(BASE_DIR, ckpt, "log.csv"))
        if counts is None:
            print(f"Warning: Could not find CSV data for {ckpt}")
            continue
        outcome_data[ckpt] = counts
        episode_totals[ckpt] = n_episodes

    if not outcome_data:
        print("No data found!")
        return

    checkpoint_names = list(outcome_data.keys())

    # Terminal report
    print("\n" + "=" * 80)
    print("FAILURE MODE ANALYSIS BY CHECKPOINT")
    print("=" * 80)
    for ckpt in checkpoint_names:
        label = checkpoint_labels.get(ckpt, ckpt)
        n_episodes = episode_totals.get(ckpt, 0)
        print(f"\n{label} (n={n_episodes}):")
        print("-" * 60)
        for outcome, count in outcome_data[ckpt].items():
            share = (count / n_episodes * 100) if n_episodes > 0 else 0
            print(f"  {outcome:40s}: {count:3d} ({share:5.1f}%)")
    print("\n" + "=" * 80 + "\n")

    outcome_order = [
        "Success",
        "Did not lift enough",
        "Object touched but not grasped",
        "Picked wrong object",
        "Empty Grasp",
        "Did not grasp",
    ]

    colors = {
        "Success": "#388038",                        # Green
        "Did not lift enough": "#F7D45B",            # Yellow
        "Object touched but not grasped": "#66ACF7", # Blue
        "Picked wrong object": "#F0529C",            # Pink
        "Empty Grasp": "#9B66BB",                    # Purple
        "Did not grasp": "#870927",                  # Dark red
    }

    # Sum each outcome's percentage across checkpoints to rank stack segments.
    outcome_totals = {outcome: 0 for outcome in outcome_order}
    for ckpt in checkpoint_names:
        denom = episode_totals.get(ckpt, 0) or 1
        for outcome in outcome_order:
            outcome_totals[outcome] += outcome_data[ckpt].get(outcome, 0) / denom * 100

    # Stacking order: Success at the bottom, then failures largest-first going up.
    sorted_failures = sorted(
        (o for o in outcome_order if o != "Success"),
        key=lambda o: outcome_totals.get(o, 0),
        reverse=True,
    )
    stacking_order = ["Success"] + sorted_failures

    # Long-format rows for the chart; zero-share segments are dropped.
    chart_data = []
    for ckpt in checkpoint_names:
        label = checkpoint_labels.get(ckpt, ckpt)
        denom = episode_totals.get(ckpt, 0) or 1
        for outcome in stacking_order:
            share = outcome_data[ckpt].get(outcome, 0) / denom * 100
            if share > 0:
                chart_data.append({
                    'Checkpoint': label,
                    'Outcome': outcome,
                    'Percentage': share,
                    'Color': colors.get(outcome, "#999999"),
                })

    df = pd.DataFrame(chart_data)

    # Restrict ordering/colors to outcomes actually present in the data.
    outcomes_in_data = df['Outcome'].unique().tolist()
    filtered_stacking_order = [o for o in stacking_order if o in outcomes_in_data]

    # Numeric sort index drives the within-bar stacking order.
    df['sort_index'] = df['Outcome'].map(
        {outcome: i for i, outcome in enumerate(filtered_stacking_order)}
    )

    color_scale = alt.Scale(
        domain=filtered_stacking_order,
        range=[colors[o] for o in filtered_stacking_order],
    )

    # Stacked bar chart, one bar per checkpoint.
    chart = alt.Chart(df).mark_bar(
        stroke='white',
        strokeWidth=1
    ).encode(
        x=alt.X('Checkpoint:N', title=None, axis=alt.Axis(labelFontSize=18, labelAngle=0)),
        y=alt.Y('Percentage:Q', title='Share of Episodes (%)', axis=alt.Axis(labelFontSize=18, titleFontSize=20)).scale(domain=[0, 100]),
        color=alt.Color('Outcome:N', scale=color_scale, sort=filtered_stacking_order, legend=alt.Legend(
            title=None,
            labelFontSize=16,
            symbolSize=200,
            orient="bottom",
            direction="horizontal",
            labelLimit=0,
            columns=3
        )),
        order=alt.Order('sort_index:Q'),
        tooltip=['Checkpoint', 'Outcome', alt.Tooltip('Percentage:Q', format='.1f')]
    ).properties(
        width=600,
        height=400,
        title={
            'text': ' EgoGym-Pick Failure Modes by Checkpoint',
            'fontSize': 22,
            'anchor': 'start',
            'dx': 60,
            'dy': -20
        },
        padding={"left": 5, "right": 5, "top": 20, "bottom": 40}
    ).configure_view(
        strokeWidth=0
    )

    # Save chart
    chart.save("failure_modes_by_checkpoint.html")
    chart.save("failure_modes_by_checkpoint.pdf", scale_factor=3)
    chart.save("failure_modes_by_checkpoint.png", scale_factor=3)
    print("\nPlot saved to: failure_modes_by_checkpoint.html and failure_modes_by_checkpoint.png")


if __name__ == "__main__":
    plot_failure_modes()