PyPI - orena-focus - Versions diffs - 0.1.0__tar.gz - Mend

orena-focus 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

orena_focus-0.1.0/.github/workflows/release.yml +49 -0
orena_focus-0.1.0/.github/workflows/tests.yml +32 -0
orena_focus-0.1.0/LICENSE +21 -0
orena_focus-0.1.0/PKG-INFO +172 -0
orena_focus-0.1.0/README.md +129 -0
orena_focus-0.1.0/examples/data_preparation.py +67 -0
orena_focus-0.1.0/examples/evaluation.py +130 -0
orena_focus-0.1.0/examples/inference.py +224 -0
orena_focus-0.1.0/pyproject.toml +90 -0
orena_focus-0.1.0/src/focus/__init__.py +130 -0
orena_focus-0.1.0/src/focus/assets/FO_definitions.txt +68 -0
orena_focus-0.1.0/src/focus/assets/SAVE_FOCUS_Capabilities.png +0 -0
orena_focus-0.1.0/src/focus/config.py +110 -0
orena_focus-0.1.0/src/focus/data/base_dataset.py +230 -0
orena_focus-0.1.0/src/focus/data/data_models.py +232 -0
orena_focus-0.1.0/src/focus/data/download.py +99 -0
orena_focus-0.1.0/src/focus/data/formats.py +298 -0
orena_focus-0.1.0/src/focus/data/frame_dataset.py +179 -0
orena_focus-0.1.0/src/focus/data/video_dataset.py +223 -0
orena_focus-0.1.0/src/focus/enums.py +33 -0
orena_focus-0.1.0/src/focus/evaluation/__init__.py +15 -0
orena_focus-0.1.0/src/focus/evaluation/adversarial.py +72 -0
orena_focus-0.1.0/src/focus/evaluation/evaluator.py +339 -0
orena_focus-0.1.0/src/focus/evaluation/judges.py +277 -0
orena_focus-0.1.0/src/focus/foreign_objects.py +268 -0
orena_focus-0.1.0/src/focus/preprocessing/__init__.py +64 -0
orena_focus-0.1.0/src/focus/preprocessing/frame_extraction.py +252 -0
orena_focus-0.1.0/src/focus/preprocessing/video_overlay.py +258 -0
orena_focus-0.1.0/src/focus/py.typed +0 -0
orena_focus-0.1.0/src/focus/taxonomy.py +237 -0
orena_focus-0.1.0/tests/__init__.py +0 -0
orena_focus-0.1.0/tests/conftest.py +101 -0
orena_focus-0.1.0/tests/test_data_models.py +170 -0
orena_focus-0.1.0/tests/test_dataset.py +130 -0
orena_focus-0.1.0/tests/test_evaluator.py +210 -0
orena_focus-0.1.0/tests/test_formats.py +251 -0

orena_focus-0.1.0/.github/workflows/release.yml ADDED Viewed

@@ -0,0 +1,49 @@
+# Release workflow: build the sdist and wheel for a version tag, then publish them to PyPI.
+name: Release
+on:
+  push:
+    tags:
+      - "v*"  # runs only for pushed tags whose name starts with "v"
+jobs:
+  # Builds the distributions and hands them to the publish job as the "dist" artifact.
+  build:
+    name: Build distribution
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install build backend
+        run: pip install build
+      - name: Build wheel and sdist
+        run: python -m build
+      - name: Upload distribution artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: dist
+          path: dist/
+  # Publishes the artifact produced by `build`; this job neither checks out nor rebuilds the code.
+  publish:
+    name: Publish to PyPI
+    needs: build
+    runs-on: ubuntu-latest
+    environment: pypi
+    permissions:
+      id-token: write  # required for trusted publishing (OIDC)
+    steps:
+      - name: Download distribution artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: dist
+          path: dist/
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1

orena_focus-0.1.0/.github/workflows/tests.yml ADDED Viewed

@@ -0,0 +1,32 @@
+# CI workflow: run the pytest suite on pushes and pull requests that target main.
+name: Tests
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+jobs:
+  test:
+    name: Python ${{ matrix.python-version }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false  # let the remaining Python versions finish when one fails
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip  # enable setup-python's pip cache
+      - name: Install package and test dependencies
+        run: pip install -e ".[test]"  # editable install including the "test" extra
+      - name: Run tests
+        run: pytest --tb=short

orena_focus-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 IMSY
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

orena_focus-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,172 @@
+Metadata-Version: 2.4
+Name: orena-focus
+Version: 0.1.0
+Summary: Utilities for the ORena SAVE FOCUS challenge: Foreign Object Contextual Understanding for Safe Surgical AI
+Project-URL: Homepage, https://or-arena.org/
+Project-URL: Repository, https://github.com/IMSY-DKFZ/orena-focus
+Author-email: Patrick Godau <patrick.godau@dkfz-heidelberg.de>, Lucas Luttner <lucas.luttner@dkfz-heidelberg.de>, Leon Mayer <leon.mayer@dkfz-heidelberg.de>
+License-Expression: MIT
+License-File: LICENSE
+Keywords: foreign-objects,laparoscopy,miccai,surgical-ai,vision-language-model,vqa
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
+Requires-Python: >=3.10
+Requires-Dist: datasets>=2.14.0
+Requires-Dist: decord>=0.6.0
+Requires-Dist: huggingface-hub>=0.17.0
+Requires-Dist: matplotlib>=3.5.0
+Requires-Dist: numpy>=1.23.0
+Requires-Dist: opencv-python>=4.8.0
+Requires-Dist: pandas>=2.0
+Requires-Dist: pillow>=9.0
+Requires-Dist: progiter>=0.12.0
+Requires-Dist: tiktoken>=0.5.0
+Requires-Dist: torch>=2.0.0
+Requires-Dist: torchvision>=0.15.0
+Requires-Dist: transformers>=4.30.0
+Provides-Extra: dev
+Requires-Dist: mypy>=1.8; extra == 'dev'
+Requires-Dist: pytest-cov>=4.0; extra == 'dev'
+Requires-Dist: pytest>=7.0; extra == 'dev'
+Requires-Dist: ruff>=0.4; extra == 'dev'
+Provides-Extra: test
+Requires-Dist: pytest-cov>=4.0; extra == 'test'
+Requires-Dist: pytest>=7.0; extra == 'test'
+Description-Content-Type: text/markdown
+<div align="center">
+# orena-focus
+[![Tests](https://img.shields.io/github/actions/workflow/status/IMSY-DKFZ/orena-focus/tests.yml?branch=main&label=tests)](https://github.com/IMSY-DKFZ/orena-focus/actions/workflows/tests.yml)
+[![PyPI](https://img.shields.io/pypi/v/orena-focus?color=blue)](https://pypi.org/project/orena-focus/)
+[![Python](https://img.shields.io/pypi/pyversions/orena-focus)](https://pypi.org/project/orena-focus/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Data: CC BY-NC-SA 4.0](https://img.shields.io/badge/Data-CC%20BY--NC--SA%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc-sa/4.0/)
+[![MICCAI 2026](https://img.shields.io/badge/Challenge-MICCAI%202026-blue)](https://or-arena.org/)
+[![Dataset](https://img.shields.io/badge/%F0%9F%A4%97%20Dataset-orena--dkfz%2Fheico--focus--vqa-blue)](https://huggingface.co/datasets/orena-dkfz/heico-focus-vqa)
+</div>
+<br>
+Python utilities for the **FOCUS datasets and challenge** — *Foreign Object Contextual Understanding for Safe Surgical AI*.
+The library provides dataset loaders, preprocessing pipelines, answer-format handling, and an evaluation framework for working with the FOCUS surgical VQA datasets. It can be used independently for research on foreign-object understanding in minimally invasive surgery, and also serves as the official toolkit for the [ORena SAVE FOCUS challenge](https://or-arena.org/) at MICCAI 2026.
+> **Challenge soon open for registration.** Submit your results and compete on the leaderboard at [or-arena.org](https://or-arena.org/).
+Retained foreign objects are a life-threatening and preventable surgical complication. FOCUS benchmarks vision-language models on clinically relevant VQA tasks around detecting, counting, and reasoning about foreign objects in endoscopic video.
+## Tracks
+FOCUS offers three participation tracks, each requiring a different type of visual context:
+| Track | `Track` enum | Visual input | Description |
+|-------|-------------|--------------|-------------|
+| **Frame** | `Track.FRAME` | Single frame | Answer questions from one extracted video frame. The simplest entry point — no temporal modelling required. |
+| **Segment** | `Track.SEGMENT` | Short clip | Answer questions from a multi-second video segment surrounding the relevant event. Requires understanding of motion and temporal context. |
+| **Procedure** | `Track.PROCEDURE` | Full video | Answer questions that may require reasoning over an entire surgical procedure, including events that happened well before or after the queried moment. |
+Participants may enter any subset of tracks. Each track is evaluated independently with the same hierarchical capability taxonomy.
+## Installation
+```bash
+pip install orena-focus
+```
+## Quick start
+```python
+from focus import FocusDataset, DatasetSplit, Track
+ds = FocusDataset("heico", DatasetSplit.TEST, Track.SEGMENT)
+request, reference = ds[0]
+print(request.question)        # "How many sponges are visible?"
+print(reference.answer)        # "2"
+print(reference.format.type)   # "number"
+```
+## Data preparation
+Download and preprocess the dataset in one script — see **[`examples/data_preparation.py`](examples/data_preparation.py)** for the full walkthrough.
+```python
+from focus import download
+from focus.preprocessing import VideoTimestampOverlayPreprocessor, FrameExtractorPreprocessor
+download("heico")
+VideoTimestampOverlayPreprocessor().process(dataset="heico")
+FrameExtractorPreprocessor(stride=1).process(dataset="heico")
+```
+QA annotations are fetched automatically from HuggingFace when you construct a `FocusDataset`.
+## Inference & evaluation
+See **[`examples/inference.py`](examples/inference.py)** for an end-to-end example with Qwen3-VL.
+```python
+from focus import Evaluator, Response
+responses = [Response(qID=req.qID, content=my_model(req)) for req, _ in ds]
+results_df, summary_df = Evaluator().run(
+    requests=ds.requests,
+    references=ds.references,
+    responses=responses,
+)
+print(summary_df)
+```
+## Capability taxonomy
+Five capability groups, each composed of leaf capabilities assigned to questions.
+![SAVE FOCUS capability taxonomy with example questions](https://github.com/IMSY-DKFZ/orena-focus/blob/main/src/focus/assets/SAVE_FOCUS_Capabilities.png?raw=true)
+| # | Group | Leaf capabilities |
+|---|-------|-------------------|
+| 1 | Object Recognition | Identification, Instance Matching, Attributes, Spatial (camera), Spatial (situs) |
+| 2 | Temporal Grounding | Temporal Localization, Duration Estimation |
+| 3 | Aggregation | Object Aggregation, Event Aggregation |
+| 4 | Event & Procedural Understanding | FO Interaction Recognition, FO Usage Purpose, Temporal Ordering |
+| 5 | Complex Reasoning | Functional Reasoning, Causal & Consequence Reasoning, Multi-step Reasoning |
+## Answer formats
+| Format | Accepts | Returns |
+|--------|---------|---------|
+| `Binary` | `"yes"` / `"no"` | `bool` |
+| `Number` | Non-negative integer strings | `int` |
+| `Percentage` | Numeric percentage strings | `float` |
+| `FOClass` | Registered FO class names | `str` |
+| `OpenEnded` | Free text (≤ 300 chars) | `str` |
+| `Matching` | Regex-validated text | `str` |
+| `MultipleChoice` | One of predefined options | `str` |
+| `Time` | `hh:mm:ss` timestamps | `timedelta` |
+## Dataset
+The QA annotations are publicly available on HuggingFace: **[orena-dkfz/heico-focus-vqa](https://huggingface.co/datasets/orena-dkfz/heico-focus-vqa)**.
+The FOCUS challenge is built on the **HeiCo** dataset. If you use this data, please cite the original publication:
+> Maier-Hein, L., et al. (2021). *Heidelberg colorectal data set for surgical data science in the sensor operating room*. [https://doi.org/10.1038/s41597-021-00882-2](https://doi.org/10.1038/s41597-021-00882-2)
+The HeiCo data is released under [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) — non-commercial use only, with attribution and share-alike conditions.
+## License
+MIT (library code) — see [Dataset](#dataset) for the data license.

orena_focus-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,129 @@
+<div align="center">
+# orena-focus
+[![Tests](https://img.shields.io/github/actions/workflow/status/IMSY-DKFZ/orena-focus/tests.yml?branch=main&label=tests)](https://github.com/IMSY-DKFZ/orena-focus/actions/workflows/tests.yml)
+[![PyPI](https://img.shields.io/pypi/v/orena-focus?color=blue)](https://pypi.org/project/orena-focus/)
+[![Python](https://img.shields.io/pypi/pyversions/orena-focus)](https://pypi.org/project/orena-focus/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Data: CC BY-NC-SA 4.0](https://img.shields.io/badge/Data-CC%20BY--NC--SA%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc-sa/4.0/)
+[![MICCAI 2026](https://img.shields.io/badge/Challenge-MICCAI%202026-blue)](https://or-arena.org/)
+[![Dataset](https://img.shields.io/badge/%F0%9F%A4%97%20Dataset-orena--dkfz%2Fheico--focus--vqa-blue)](https://huggingface.co/datasets/orena-dkfz/heico-focus-vqa)
+</div>
+<br>
+Python utilities for the **FOCUS datasets and challenge** — *Foreign Object Contextual Understanding for Safe Surgical AI*.
+The library provides dataset loaders, preprocessing pipelines, answer-format handling, and an evaluation framework for working with the FOCUS surgical VQA datasets. It can be used independently for research on foreign-object understanding in minimally invasive surgery, and also serves as the official toolkit for the [ORena SAVE FOCUS challenge](https://or-arena.org/) at MICCAI 2026.
+> **Challenge soon open for registration.** Submit your results and compete on the leaderboard at [or-arena.org](https://or-arena.org/).
+Retained foreign objects are a life-threatening and preventable surgical complication. FOCUS benchmarks vision-language models on clinically relevant VQA tasks around detecting, counting, and reasoning about foreign objects in endoscopic video.
+## Tracks
+FOCUS offers three participation tracks, each requiring a different type of visual context:
+| Track | `Track` enum | Visual input | Description |
+|-------|-------------|--------------|-------------|
+| **Frame** | `Track.FRAME` | Single frame | Answer questions from one extracted video frame. The simplest entry point — no temporal modelling required. |
+| **Segment** | `Track.SEGMENT` | Short clip | Answer questions from a multi-second video segment surrounding the relevant event. Requires understanding of motion and temporal context. |
+| **Procedure** | `Track.PROCEDURE` | Full video | Answer questions that may require reasoning over an entire surgical procedure, including events that happened well before or after the queried moment. |
+Participants may enter any subset of tracks. Each track is evaluated independently with the same hierarchical capability taxonomy.
+## Installation
+```bash
+pip install orena-focus
+```
+## Quick start
+```python
+from focus import FocusDataset, DatasetSplit, Track
+ds = FocusDataset("heico", DatasetSplit.TEST, Track.SEGMENT)
+request, reference = ds[0]
+print(request.question)        # "How many sponges are visible?"
+print(reference.answer)        # "2"
+print(reference.format.type)   # "number"
+```
+## Data preparation
+Download and preprocess the dataset in one script — see **[`examples/data_preparation.py`](examples/data_preparation.py)** for the full walkthrough.
+```python
+from focus import download
+from focus.preprocessing import VideoTimestampOverlayPreprocessor, FrameExtractorPreprocessor
+download("heico")
+VideoTimestampOverlayPreprocessor().process(dataset="heico")
+FrameExtractorPreprocessor(stride=1).process(dataset="heico")
+```
+QA annotations are fetched automatically from HuggingFace when you construct a `FocusDataset`.
+## Inference & evaluation
+See **[`examples/inference.py`](examples/inference.py)** for an end-to-end example with Qwen3-VL.
+```python
+from focus import Evaluator, Response
+responses = [Response(qID=req.qID, content=my_model(req)) for req, _ in ds]
+results_df, summary_df = Evaluator().run(
+    requests=ds.requests,
+    references=ds.references,
+    responses=responses,
+)
+print(summary_df)
+```
+## Capability taxonomy
+Five capability groups, each composed of leaf capabilities assigned to questions.
+![SAVE FOCUS capability taxonomy with example questions](https://github.com/IMSY-DKFZ/orena-focus/blob/main/src/focus/assets/SAVE_FOCUS_Capabilities.png?raw=true)
+| # | Group | Leaf capabilities |
+|---|-------|-------------------|
+| 1 | Object Recognition | Identification, Instance Matching, Attributes, Spatial (camera), Spatial (situs) |
+| 2 | Temporal Grounding | Temporal Localization, Duration Estimation |
+| 3 | Aggregation | Object Aggregation, Event Aggregation |
+| 4 | Event & Procedural Understanding | FO Interaction Recognition, FO Usage Purpose, Temporal Ordering |
+| 5 | Complex Reasoning | Functional Reasoning, Causal & Consequence Reasoning, Multi-step Reasoning |
+## Answer formats
+| Format | Accepts | Returns |
+|--------|---------|---------|
+| `Binary` | `"yes"` / `"no"` | `bool` |
+| `Number` | Non-negative integer strings | `int` |
+| `Percentage` | Numeric percentage strings | `float` |
+| `FOClass` | Registered FO class names | `str` |
+| `OpenEnded` | Free text (≤ 300 chars) | `str` |
+| `Matching` | Regex-validated text | `str` |
+| `MultipleChoice` | One of predefined options | `str` |
+| `Time` | `hh:mm:ss` timestamps | `timedelta` |
+## Dataset
+The QA annotations are publicly available on HuggingFace: **[orena-dkfz/heico-focus-vqa](https://huggingface.co/datasets/orena-dkfz/heico-focus-vqa)**.
+The FOCUS challenge is built on the **HeiCo** dataset. If you use this data, please cite the original publication:
+> Maier-Hein, L., et al. (2021). *Heidelberg colorectal data set for surgical data science in the sensor operating room*. [https://doi.org/10.1038/s41597-021-00882-2](https://doi.org/10.1038/s41597-021-00882-2)
+The HeiCo data is released under [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) — non-commercial use only, with attribution and share-alike conditions.
+## License
+MIT (library code) — see [Dataset](#dataset) for the data license.

orena_focus-0.1.0/examples/data_preparation.py ADDED Viewed

@@ -0,0 +1,67 @@
+"""Data preparation example for the ORena SAVE FOCUS challenge.
+This script walks through the full preparation pipeline:
+  1. Configure the library (root directory)
+  2. Download the dataset videos from HuggingFace Hub
+  3. Burn timestamps into the source videos (optional)
+  4. Extract JPEG frames from the videos
+QA annotations (parquet files) are fetched automatically when you
+construct a :class:`~focus.FocusDataset` — no separate step needed.
+Run each step independently — every step is idempotent and safe to re-run.
+Prerequisites
+-------------
+    pip install orena-focus
+Set ``FOCUS_ROOT_DIR`` to an existing directory with enough disk space, or
+pass it explicitly to ``FocusConfig`` as shown below.
+"""
+# ── 1. Configure ─────────────────────────────────────────────────────
+from focus import FocusConfig, download, set_config
+from focus.preprocessing import FrameExtractorPreprocessor, VideoTimestampOverlayPreprocessor
+# Point the library at a local root directory.  Every dataset is stored as a
+# sub-folder inside this root (e.g. <root>/heico/).
+# Alternatively, just set the FOCUS_ROOT_DIR environment variable.
+set_config(FocusConfig(root_dir="/data/focus"))
+DATASET = "heico"
+# ── 2. Download ───────────────────────────────────────────────────────
+# Downloads videos from HuggingFace Hub into <root>/heico/videos/.
+# QA annotation parquet files are streamed on demand — no manual step needed.
+# Safe to call repeatedly — skips the download if already complete.
+download(DATASET)
+# ── 3. Timestamp overlay (optional) ──────────────────────────────────
+# Burns a visible hh:mm:ss counter into each video and writes the result to
+# <root>/heico/overlayed/.  Skip this step if you do not need overlayed videos.
+VideoTimestampOverlayPreprocessor().process(dataset=DATASET, max_workers=4)
+# ── 4. Frame extraction ───────────────────────────────────────────────
+# Extract every frame (stride=1) from the original videos into
+# <root>/heico/frames/<video_stem>/frame{index:07d}.jpg.
+FrameExtractorPreprocessor(stride=1).process(dataset=DATASET, max_workers=4)
+# To extract from the overlayed videos instead, use a separate frames folder
+# so both variants live side-by-side:
+set_config(FocusConfig(root_dir="/data/focus", frames_folder="frames_overlay"))
+FrameExtractorPreprocessor(stride=1, use_overlay=True).process(dataset=DATASET, max_workers=4)
+set_config(FocusConfig(root_dir="/data/focus"))  # restore default
+# ── Done ──────────────────────────────────────────────────────────────
+# The dataset is now ready.  Load it with FocusDataset:
+#
+#   from focus import FocusDataset, DatasetSplit, Track
+#
+#   ds = FocusDataset("heico", DatasetSplit.TEST, Track.SEGMENT)
+#   request, reference = ds[0]

orena_focus-0.1.0/examples/evaluation.py ADDED Viewed

@@ -0,0 +1,130 @@
+"""Standalone evaluation example for the ORena SAVE FOCUS challenge.
+This script evaluates pre-computed model responses saved to a JSON file,
+without requiring a model or GPU.  Use this when:
+- You ran inference on your own infrastructure and want to score the outputs.
+- You want to compare multiple runs by evaluating different response files.
+- You want to debug the evaluation pipeline on a small sample.
+Pipeline:
+  1. Load the dataset split (requests + references from disk)
+  2. Load model responses from a JSON file produced by ``save_items``
+  3. Run the Evaluator (judges are only invoked for open-ended formats)
+  4. Print the hierarchical accuracy summary and inspect per-question results
+Prerequisites
+-------------
+    pip install orena-focus
+The dataset must have been downloaded and prepared beforehand — see
+``examples/data_preparation.py``.  Response files can be produced by
+``examples/inference.py`` (which calls ``save_items``) or by any other
+pipeline that writes the correct JSON schema::
+    [
+      {"qID": "q001", "content": "2", "latency": 1.23},
+      ...
+    ]
+"""
+import logging
+from pathlib import Path
+from focus import (
+    DatasetSplit,
+    Evaluator,
+    FocusConfig,
+    FocusDataset,
+    Track,
+    load_responses,
+    set_config,
+)
+# Emit INFO-level log records so the messages logged by main() are visible.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# ── Configuration ─────────────────────────────────────────────────────
+# Settings read by main(); edit these values to point at your own data.
+CONFIG = {
+    "root_dir": "/data/focus",  # passed to FocusConfig(root_dir=...)
+    "dataset_name": "heico",
+    "track": Track.SEGMENT,
+    "split": DatasetSplit.TEST,
+    # Path to the JSON file produced by save_items([...responses...], path)
+    "responses_file": "/data/focus/responses/segment_test_responses.json",
+    # Optional: write results.csv and summary.csv here
+    "output_dir": None,
+}
+def main() -> None:
+    # ── 1. Load dataset ───────────────────────────────────────────────
+    set_config(FocusConfig(root_dir=CONFIG["root_dir"]))
+    dataset = FocusDataset(
+        dataset=CONFIG["dataset_name"],
+        split=CONFIG["split"],
+        track=CONFIG["track"],
+    )
+    logger.info(f"Loaded dataset: {dataset}")
+    # ── 2. Load responses ─────────────────────────────────────────────
+    responses_path = Path(CONFIG["responses_file"])
+    if not responses_path.exists():
+        raise FileNotFoundError(
+            f"Responses file not found: {responses_path}\n"
+            "Run examples/inference.py first, or provide a path to an existing file."
+        )
+    responses = load_responses(responses_path)
+    logger.info(f"Loaded {len(responses)} responses from {responses_path}.")
+    n_total = len(dataset)
+    n_answered = len(responses)
+    if n_answered < n_total:
+        logger.warning(
+            f"{n_total - n_answered}/{n_total} questions have no response "
+            "and will be marked incorrect."
+        )
+    # ── 3. Evaluate ───────────────────────────────────────────────────
+    # Open-ended and matching questions are routed to an LLM judge.
+    # Pass judges=[] to skip judging and mark those questions incorrect,
+    # which is useful for a quick sanity check without loading a model.
+    evaluator = Evaluator()
+    results_df, summary_df = evaluator.run(
+        requests=dataset.requests,
+        references=dataset.references,
+        responses=responses,
+        output_dir=CONFIG["output_dir"],
+    )
+    # ── 4. Report ─────────────────────────────────────────────────────
+    overall = summary_df.loc[summary_df["level"] == "overall", "accuracy"].iloc[0]
+    print(f"\nOverall macro-accuracy: {overall:.1%}")
+    print()
+    print(summary_df.to_string(index=False))
+    # Per-capability breakdown with question counts
+    leaf_df = summary_df[summary_df["level"] == "leaf"].copy()
+    leaf_df = leaf_df.sort_values("accuracy", ascending=False)
+    print("\nPer-capability accuracy (best to worst):")
+    for _, row in leaf_df.iterrows():
+        bar = "█" * round(row["accuracy"] * 20)
+        print(f"  {row['name']:<40s} {row['accuracy']:5.1%}  {bar}")
+    # Subgroup analyses
+    if "ood" in results_df.columns:
+        ood_acc = results_df[results_df["ood"]]["correctness"].mean()
+        in_dist_acc = results_df[~results_df["ood"]]["correctness"].mean()
+        print(f"\nIn-distribution accuracy : {in_dist_acc:.1%}")
+        print(f"Out-of-distribution accuracy: {ood_acc:.1%}")
+    if "clinical" in results_df.columns:
+        clin_acc = results_df[results_df["clinical"]]["correctness"].mean()
+        print(f"Clinical question accuracy  : {clin_acc:.1%}")
+if __name__ == "__main__":
+    main()