orena-focus 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. orena_focus-0.1.0/.github/workflows/release.yml +49 -0
  2. orena_focus-0.1.0/.github/workflows/tests.yml +32 -0
  3. orena_focus-0.1.0/LICENSE +21 -0
  4. orena_focus-0.1.0/PKG-INFO +172 -0
  5. orena_focus-0.1.0/README.md +129 -0
  6. orena_focus-0.1.0/examples/data_preparation.py +67 -0
  7. orena_focus-0.1.0/examples/evaluation.py +130 -0
  8. orena_focus-0.1.0/examples/inference.py +224 -0
  9. orena_focus-0.1.0/pyproject.toml +90 -0
  10. orena_focus-0.1.0/src/focus/__init__.py +130 -0
  11. orena_focus-0.1.0/src/focus/assets/FO_definitions.txt +68 -0
  12. orena_focus-0.1.0/src/focus/assets/SAVE_FOCUS_Capabilities.png +0 -0
  13. orena_focus-0.1.0/src/focus/config.py +110 -0
  14. orena_focus-0.1.0/src/focus/data/base_dataset.py +230 -0
  15. orena_focus-0.1.0/src/focus/data/data_models.py +232 -0
  16. orena_focus-0.1.0/src/focus/data/download.py +99 -0
  17. orena_focus-0.1.0/src/focus/data/formats.py +298 -0
  18. orena_focus-0.1.0/src/focus/data/frame_dataset.py +179 -0
  19. orena_focus-0.1.0/src/focus/data/video_dataset.py +223 -0
  20. orena_focus-0.1.0/src/focus/enums.py +33 -0
  21. orena_focus-0.1.0/src/focus/evaluation/__init__.py +15 -0
  22. orena_focus-0.1.0/src/focus/evaluation/adversarial.py +72 -0
  23. orena_focus-0.1.0/src/focus/evaluation/evaluator.py +339 -0
  24. orena_focus-0.1.0/src/focus/evaluation/judges.py +277 -0
  25. orena_focus-0.1.0/src/focus/foreign_objects.py +268 -0
  26. orena_focus-0.1.0/src/focus/preprocessing/__init__.py +64 -0
  27. orena_focus-0.1.0/src/focus/preprocessing/frame_extraction.py +252 -0
  28. orena_focus-0.1.0/src/focus/preprocessing/video_overlay.py +258 -0
  29. orena_focus-0.1.0/src/focus/py.typed +0 -0
  30. orena_focus-0.1.0/src/focus/taxonomy.py +237 -0
  31. orena_focus-0.1.0/tests/__init__.py +0 -0
  32. orena_focus-0.1.0/tests/conftest.py +101 -0
  33. orena_focus-0.1.0/tests/test_data_models.py +170 -0
  34. orena_focus-0.1.0/tests/test_dataset.py +130 -0
  35. orena_focus-0.1.0/tests/test_evaluator.py +210 -0
  36. orena_focus-0.1.0/tests/test_formats.py +251 -0
@@ -0,0 +1,49 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ build:
10
+ name: Build distribution
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.11"
20
+
21
+ - name: Install build backend
22
+ run: pip install build
23
+
24
+ - name: Build wheel and sdist
25
+ run: python -m build
26
+
27
+ - name: Upload distribution artifacts
28
+ uses: actions/upload-artifact@v4
29
+ with:
30
+ name: dist
31
+ path: dist/
32
+
33
+ publish:
34
+ name: Publish to PyPI
35
+ needs: build
36
+ runs-on: ubuntu-latest
37
+ environment: pypi
38
+ permissions:
39
+ id-token: write # required for trusted publishing (OIDC)
40
+
41
+ steps:
42
+ - name: Download distribution artifacts
43
+ uses: actions/download-artifact@v4
44
+ with:
45
+ name: dist
46
+ path: dist/
47
+
48
+ - name: Publish to PyPI
49
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,32 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ name: Python ${{ matrix.python-version }}
12
+ runs-on: ubuntu-latest
13
+
14
+ strategy:
15
+ fail-fast: false
16
+ matrix:
17
+ python-version: ["3.10", "3.11", "3.12"]
18
+
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+
22
+ - name: Set up Python ${{ matrix.python-version }}
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: ${{ matrix.python-version }}
26
+ cache: pip
27
+
28
+ - name: Install package and test dependencies
29
+ run: pip install -e ".[test]"
30
+
31
+ - name: Run tests
32
+ run: pytest --tb=short
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 IMSY
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,172 @@
1
+ Metadata-Version: 2.4
2
+ Name: orena-focus
3
+ Version: 0.1.0
4
+ Summary: Utilities for the ORena SAVE FOCUS challenge: Foreign Object Contextual Understanding for Safe Surgical AI
5
+ Project-URL: Homepage, https://or-arena.org/
6
+ Project-URL: Repository, https://github.com/IMSY-DKFZ/orena-focus
7
+ Author-email: Patrick Godau <patrick.godau@dkfz-heidelberg.de>, Lucas Luttner <lucas.luttner@dkfz-heidelberg.de>, Leon Mayer <leon.mayer@dkfz-heidelberg.de>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: foreign-objects,laparoscopy,miccai,surgical-ai,vision-language-model,vqa
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: datasets>=2.14.0
22
+ Requires-Dist: decord>=0.6.0
23
+ Requires-Dist: huggingface-hub>=0.17.0
24
+ Requires-Dist: matplotlib>=3.5.0
25
+ Requires-Dist: numpy>=1.23.0
26
+ Requires-Dist: opencv-python>=4.8.0
27
+ Requires-Dist: pandas>=2.0
28
+ Requires-Dist: pillow>=9.0
29
+ Requires-Dist: progiter>=0.12.0
30
+ Requires-Dist: tiktoken>=0.5.0
31
+ Requires-Dist: torch>=2.0.0
32
+ Requires-Dist: torchvision>=0.15.0
33
+ Requires-Dist: transformers>=4.30.0
34
+ Provides-Extra: dev
35
+ Requires-Dist: mypy>=1.8; extra == 'dev'
36
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
37
+ Requires-Dist: pytest>=7.0; extra == 'dev'
38
+ Requires-Dist: ruff>=0.4; extra == 'dev'
39
+ Provides-Extra: test
40
+ Requires-Dist: pytest-cov>=4.0; extra == 'test'
41
+ Requires-Dist: pytest>=7.0; extra == 'test'
42
+ Description-Content-Type: text/markdown
43
+
44
+ <div align="center">
45
+
46
+ # orena-focus
47
+
48
+ [![Tests](https://img.shields.io/github/actions/workflow/status/IMSY-DKFZ/orena-focus/tests.yml?branch=main&label=tests)](https://github.com/IMSY-DKFZ/orena-focus/actions/workflows/tests.yml)
49
+ [![PyPI](https://img.shields.io/pypi/v/orena-focus?color=blue)](https://pypi.org/project/orena-focus/)
50
+ [![Python](https://img.shields.io/pypi/pyversions/orena-focus)](https://pypi.org/project/orena-focus/)
51
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
52
+ [![Data: CC BY-NC-SA 4.0](https://img.shields.io/badge/Data-CC%20BY--NC--SA%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc-sa/4.0/)
53
+ [![MICCAI 2026](https://img.shields.io/badge/Challenge-MICCAI%202026-blue)](https://or-arena.org/)
54
+ [![Dataset](https://img.shields.io/badge/%F0%9F%A4%97%20Dataset-orena--dkfz%2Fheico--focus--vqa-blue)](https://huggingface.co/datasets/orena-dkfz/heico-focus-vqa)
55
+
56
+
57
+ </div>
58
+
59
+ <br>
60
+
61
+ Python utilities for the **FOCUS datasets and challenge** — *Foreign Object Contextual Understanding for Safe Surgical AI*.
62
+
63
+ The library provides dataset loaders, preprocessing pipelines, answer-format handling, and an evaluation framework for working with the FOCUS surgical VQA datasets. It can be used independently for research on foreign-object understanding in minimally invasive surgery, and also serves as the official toolkit for the [ORena SAVE FOCUS challenge](https://or-arena.org/) at MICCAI 2026.
64
+
65
+ > **Challenge registration opens soon.** Once it does, submit your results and compete on the leaderboard at [or-arena.org](https://or-arena.org/).
66
+
67
+ Retained foreign objects are a life-threatening and preventable surgical complication. FOCUS benchmarks vision-language models on clinically relevant VQA tasks around detecting, counting, and reasoning about foreign objects in endoscopic video.
68
+
69
+ ## Tracks
70
+
71
+ FOCUS offers three participation tracks, each requiring a different type of visual context:
72
+
73
+ | Track | `Track` enum | Visual input | Description |
74
+ |-------|-------------|--------------|-------------|
75
+ | **Frame** | `Track.FRAME` | Single frame | Answer questions from one extracted video frame. The simplest entry point — no temporal modelling required. |
76
+ | **Segment** | `Track.SEGMENT` | Short clip | Answer questions from a multi-second video segment surrounding the relevant event. Requires understanding of motion and temporal context. |
77
+ | **Procedure** | `Track.PROCEDURE` | Full video | Answer questions that may require reasoning over an entire surgical procedure, including events that happened well before or after the queried moment. |
78
+
79
+ Participants may enter any subset of tracks. Each track is evaluated independently with the same hierarchical capability taxonomy.
80
+
81
+ ## Installation
82
+
83
+ ```bash
84
+ pip install orena-focus
85
+ ```
86
+
87
+ ## Quick start
88
+
89
+ ```python
90
+ from focus import FocusDataset, DatasetSplit, Track
91
+
92
+ ds = FocusDataset("heico", DatasetSplit.TEST, Track.SEGMENT)
93
+
94
+ request, reference = ds[0]
95
+ print(request.question) # "How many sponges are visible?"
96
+ print(reference.answer) # "2"
97
+ print(reference.format.type) # "number"
98
+ ```
99
+
100
+ ## Data preparation
101
+
102
+ Download, preprocess, and split the dataset in one script — see **[`examples/data_preparation.py`](examples/data_preparation.py)** for the full walkthrough.
103
+
104
+ ```python
105
+ from focus import download
106
+ from focus.preprocessing import VideoTimestampOverlayPreprocessor, FrameExtractorPreprocessor
107
+
108
+ download("heico")
109
+
110
+ VideoTimestampOverlayPreprocessor().process(dataset="heico")
111
+ FrameExtractorPreprocessor(stride=1).process(dataset="heico")
112
+ ```
113
+
114
+ QA annotations are fetched automatically from HuggingFace when you construct a `FocusDataset`.
115
+
116
+ ## Inference & evaluation
117
+
118
+ See **[`examples/inference.py`](examples/inference.py)** for an end-to-end example with Qwen3-VL.
119
+
120
+ ```python
121
+ from focus import Evaluator, Response
122
+
123
+ responses = [Response(qID=req.qID, content=my_model(req)) for req, _ in ds]
124
+
125
+ results_df, summary_df = Evaluator().run(
126
+ requests=ds.requests,
127
+ references=ds.references,
128
+ responses=responses,
129
+ )
130
+ print(summary_df)
131
+ ```
132
+
133
+ ## Capability taxonomy
134
+
135
+ Five capability groups, each composed of leaf capabilities assigned to questions.
136
+
137
+ ![SAVE FOCUS capability taxonomy with example questions](https://github.com/IMSY-DKFZ/orena-focus/blob/main/src/focus/assets/SAVE_FOCUS_Capabilities.png?raw=true)
138
+
139
+ | # | Group | Leaf capabilities |
140
+ |---|-------|-------------------|
141
+ | 1 | Object Recognition | Identification, Instance Matching, Attributes, Spatial (camera), Spatial (situs) |
142
+ | 2 | Temporal Grounding | Temporal Localization, Duration Estimation |
143
+ | 3 | Aggregation | Object Aggregation, Event Aggregation |
144
+ | 4 | Event & Procedural Understanding | FO Interaction Recognition, FO Usage Purpose, Temporal Ordering |
145
+ | 5 | Complex Reasoning | Functional Reasoning, Causal & Consequence Reasoning, Multi-step Reasoning |
146
+
147
+ ## Answer formats
148
+
149
+ | Format | Accepts | Returns |
150
+ |--------|---------|---------|
151
+ | `Binary` | `"yes"` / `"no"` | `bool` |
152
+ | `Number` | Non-negative integer strings | `int` |
153
+ | `Percentage` | Numeric percentage strings | `float` |
154
+ | `FOClass` | Registered FO class names | `str` |
155
+ | `OpenEnded` | Free text (≤ 300 chars) | `str` |
156
+ | `Matching` | Regex-validated text | `str` |
157
+ | `MultipleChoice` | One of predefined options | `str` |
158
+ | `Time` | `hh:mm:ss` timestamps | `timedelta` |
159
+
160
+ ## Dataset
161
+
162
+ The QA annotations are publicly available on HuggingFace: **[orena-dkfz/heico-focus-vqa](https://huggingface.co/datasets/orena-dkfz/heico-focus-vqa)**.
163
+
164
+ The FOCUS challenge is built on the **HeiCo** dataset. If you use this data, please cite the original publication:
165
+
166
+ > Maier-Hein, L., et al. (2021). *Heidelberg colorectal data set for surgical data science in the sensor operating room*. [https://doi.org/10.1038/s41597-021-00882-2](https://doi.org/10.1038/s41597-021-00882-2)
167
+
168
+ The HeiCo data is released under [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) — non-commercial use only, with attribution and share-alike conditions.
169
+
170
+ ## License
171
+
172
+ MIT (library code) — see [Dataset](#dataset) for the data license.
@@ -0,0 +1,129 @@
1
+ <div align="center">
2
+
3
+ # orena-focus
4
+
5
+ [![Tests](https://img.shields.io/github/actions/workflow/status/IMSY-DKFZ/orena-focus/tests.yml?branch=main&label=tests)](https://github.com/IMSY-DKFZ/orena-focus/actions/workflows/tests.yml)
6
+ [![PyPI](https://img.shields.io/pypi/v/orena-focus?color=blue)](https://pypi.org/project/orena-focus/)
7
+ [![Python](https://img.shields.io/pypi/pyversions/orena-focus)](https://pypi.org/project/orena-focus/)
8
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
9
+ [![Data: CC BY-NC-SA 4.0](https://img.shields.io/badge/Data-CC%20BY--NC--SA%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc-sa/4.0/)
10
+ [![MICCAI 2026](https://img.shields.io/badge/Challenge-MICCAI%202026-blue)](https://or-arena.org/)
11
+ [![Dataset](https://img.shields.io/badge/%F0%9F%A4%97%20Dataset-orena--dkfz%2Fheico--focus--vqa-blue)](https://huggingface.co/datasets/orena-dkfz/heico-focus-vqa)
12
+
13
+
14
+ </div>
15
+
16
+ <br>
17
+
18
+ Python utilities for the **FOCUS datasets and challenge** — *Foreign Object Contextual Understanding for Safe Surgical AI*.
19
+
20
+ The library provides dataset loaders, preprocessing pipelines, answer-format handling, and an evaluation framework for working with the FOCUS surgical VQA datasets. It can be used independently for research on foreign-object understanding in minimally invasive surgery, and also serves as the official toolkit for the [ORena SAVE FOCUS challenge](https://or-arena.org/) at MICCAI 2026.
21
+
22
+ > **Challenge registration opens soon.** Once it does, submit your results and compete on the leaderboard at [or-arena.org](https://or-arena.org/).
23
+
24
+ Retained foreign objects are a life-threatening and preventable surgical complication. FOCUS benchmarks vision-language models on clinically relevant VQA tasks around detecting, counting, and reasoning about foreign objects in endoscopic video.
25
+
26
+ ## Tracks
27
+
28
+ FOCUS offers three participation tracks, each requiring a different type of visual context:
29
+
30
+ | Track | `Track` enum | Visual input | Description |
31
+ |-------|-------------|--------------|-------------|
32
+ | **Frame** | `Track.FRAME` | Single frame | Answer questions from one extracted video frame. The simplest entry point — no temporal modelling required. |
33
+ | **Segment** | `Track.SEGMENT` | Short clip | Answer questions from a multi-second video segment surrounding the relevant event. Requires understanding of motion and temporal context. |
34
+ | **Procedure** | `Track.PROCEDURE` | Full video | Answer questions that may require reasoning over an entire surgical procedure, including events that happened well before or after the queried moment. |
35
+
36
+ Participants may enter any subset of tracks. Each track is evaluated independently with the same hierarchical capability taxonomy.
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ pip install orena-focus
42
+ ```
43
+
44
+ ## Quick start
45
+
46
+ ```python
47
+ from focus import FocusDataset, DatasetSplit, Track
48
+
49
+ ds = FocusDataset("heico", DatasetSplit.TEST, Track.SEGMENT)
50
+
51
+ request, reference = ds[0]
52
+ print(request.question) # "How many sponges are visible?"
53
+ print(reference.answer) # "2"
54
+ print(reference.format.type) # "number"
55
+ ```
56
+
57
+ ## Data preparation
58
+
59
+ Download, preprocess, and split the dataset in one script — see **[`examples/data_preparation.py`](examples/data_preparation.py)** for the full walkthrough.
60
+
61
+ ```python
62
+ from focus import download
63
+ from focus.preprocessing import VideoTimestampOverlayPreprocessor, FrameExtractorPreprocessor
64
+
65
+ download("heico")
66
+
67
+ VideoTimestampOverlayPreprocessor().process(dataset="heico")
68
+ FrameExtractorPreprocessor(stride=1).process(dataset="heico")
69
+ ```
70
+
71
+ QA annotations are fetched automatically from HuggingFace when you construct a `FocusDataset`.
72
+
73
+ ## Inference & evaluation
74
+
75
+ See **[`examples/inference.py`](examples/inference.py)** for an end-to-end example with Qwen3-VL.
76
+
77
+ ```python
78
+ from focus import Evaluator, Response
79
+
80
+ responses = [Response(qID=req.qID, content=my_model(req)) for req, _ in ds]
81
+
82
+ results_df, summary_df = Evaluator().run(
83
+ requests=ds.requests,
84
+ references=ds.references,
85
+ responses=responses,
86
+ )
87
+ print(summary_df)
88
+ ```
89
+
90
+ ## Capability taxonomy
91
+
92
+ Five capability groups, each composed of leaf capabilities assigned to questions.
93
+
94
+ ![SAVE FOCUS capability taxonomy with example questions](https://github.com/IMSY-DKFZ/orena-focus/blob/main/src/focus/assets/SAVE_FOCUS_Capabilities.png?raw=true)
95
+
96
+ | # | Group | Leaf capabilities |
97
+ |---|-------|-------------------|
98
+ | 1 | Object Recognition | Identification, Instance Matching, Attributes, Spatial (camera), Spatial (situs) |
99
+ | 2 | Temporal Grounding | Temporal Localization, Duration Estimation |
100
+ | 3 | Aggregation | Object Aggregation, Event Aggregation |
101
+ | 4 | Event & Procedural Understanding | FO Interaction Recognition, FO Usage Purpose, Temporal Ordering |
102
+ | 5 | Complex Reasoning | Functional Reasoning, Causal & Consequence Reasoning, Multi-step Reasoning |
103
+
104
+ ## Answer formats
105
+
106
+ | Format | Accepts | Returns |
107
+ |--------|---------|---------|
108
+ | `Binary` | `"yes"` / `"no"` | `bool` |
109
+ | `Number` | Non-negative integer strings | `int` |
110
+ | `Percentage` | Numeric percentage strings | `float` |
111
+ | `FOClass` | Registered FO class names | `str` |
112
+ | `OpenEnded` | Free text (≤ 300 chars) | `str` |
113
+ | `Matching` | Regex-validated text | `str` |
114
+ | `MultipleChoice` | One of predefined options | `str` |
115
+ | `Time` | `hh:mm:ss` timestamps | `timedelta` |
116
+
117
+ ## Dataset
118
+
119
+ The QA annotations are publicly available on HuggingFace: **[orena-dkfz/heico-focus-vqa](https://huggingface.co/datasets/orena-dkfz/heico-focus-vqa)**.
120
+
121
+ The FOCUS challenge is built on the **HeiCo** dataset. If you use this data, please cite the original publication:
122
+
123
+ > Maier-Hein, L., et al. (2021). *Heidelberg colorectal data set for surgical data science in the sensor operating room*. [https://doi.org/10.1038/s41597-021-00882-2](https://doi.org/10.1038/s41597-021-00882-2)
124
+
125
+ The HeiCo data is released under [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) — non-commercial use only, with attribution and share-alike conditions.
126
+
127
+ ## License
128
+
129
+ MIT (library code) — see [Dataset](#dataset) for the data license.
@@ -0,0 +1,67 @@
1
+ """Data preparation example for the ORena SAVE FOCUS challenge.
2
+
3
+ This script walks through the full preparation pipeline:
4
+
5
+ 1. Configure the library (root directory)
6
+ 2. Download the dataset videos from HuggingFace Hub
7
+ 3. Burn timestamps into the source videos (optional)
8
+ 4. Extract JPEG frames from the videos
9
+
10
+ QA annotations (parquet files) are fetched automatically when you
11
+ construct a :class:`~focus.FocusDataset` — no separate step needed.
12
+
13
+ Run each step independently — every step is idempotent and safe to re-run.
14
+
15
+ Prerequisites
16
+ -------------
17
+ pip install orena-focus
18
+
19
+ Set ``FOCUS_ROOT_DIR`` to an existing directory with enough disk space, or
20
+ pass it explicitly to ``FocusConfig`` as shown below.
21
+ """
22
+
23
+ # ── 1. Configure ─────────────────────────────────────────────────────
24
+
25
+ from focus import FocusConfig, download, set_config
26
+ from focus.preprocessing import FrameExtractorPreprocessor, VideoTimestampOverlayPreprocessor
27
+
28
+ # Point the library at a local root directory. Every dataset is stored as a
29
+ # sub-folder inside this root (e.g. <root>/heico/).
30
+ # Alternatively, just set the FOCUS_ROOT_DIR environment variable.
31
+
32
+ set_config(FocusConfig(root_dir="/data/focus"))
33
+
34
+ DATASET = "heico"
35
+
36
+ # ── 2. Download ───────────────────────────────────────────────────────
37
+
38
+ # Downloads videos from HuggingFace Hub into <root>/heico/videos/.
39
+ # QA annotation parquet files are streamed on demand — no manual step needed.
40
+ # Safe to call repeatedly — skips the download if already complete.
41
+ download(DATASET)
42
+
43
+ # ── 3. Timestamp overlay (optional) ──────────────────────────────────
44
+
45
+ # Burns a visible hh:mm:ss counter into each video and writes the result to
46
+ # <root>/heico/overlayed/. Skip this step if you do not need overlayed videos.
47
+ VideoTimestampOverlayPreprocessor().process(dataset=DATASET, max_workers=4)
48
+
49
+ # ── 4. Frame extraction ───────────────────────────────────────────────
50
+
51
+ # Extract every frame (stride=1) from the original videos into
52
+ # <root>/heico/frames/<video_stem>/frame{index:07d}.jpg.
53
+ FrameExtractorPreprocessor(stride=1).process(dataset=DATASET, max_workers=4)
54
+
55
+ # To extract from the overlayed videos instead, use a separate frames folder
56
+ # so both variants live side-by-side:
57
+ set_config(FocusConfig(root_dir="/data/focus", frames_folder="frames_overlay"))
58
+ FrameExtractorPreprocessor(stride=1, use_overlay=True).process(dataset=DATASET, max_workers=4)
59
+ set_config(FocusConfig(root_dir="/data/focus")) # restore default
60
+
61
+ # ── Done ──────────────────────────────────────────────────────────────
62
+ # The dataset is now ready. Load it with FocusDataset:
63
+ #
64
+ # from focus import FocusDataset, DatasetSplit, Track
65
+ #
66
+ # ds = FocusDataset("heico", DatasetSplit.TEST, Track.SEGMENT)
67
+ # request, reference = ds[0]
@@ -0,0 +1,130 @@
1
+ """Standalone evaluation example for the ORena SAVE FOCUS challenge.
2
+
3
+ This script evaluates pre-computed model responses saved to a JSON file,
4
+ without requiring a model or GPU. Use this when:
5
+
6
+ - You ran inference on your own infrastructure and want to score the outputs.
7
+ - You want to compare multiple runs by evaluating different response files.
8
+ - You want to debug the evaluation pipeline on a small sample.
9
+
10
+ Pipeline:
11
+ 1. Load the dataset split (requests + references from disk)
12
+ 2. Load model responses from a JSON file produced by ``save_items``
13
+ 3. Run the Evaluator (judges are only invoked for open-ended formats)
14
+ 4. Print the hierarchical accuracy summary and inspect per-question results
15
+
16
+ Prerequisites
17
+ -------------
18
+ pip install orena-focus
19
+
20
+ The dataset must have been downloaded and split beforehand — see
21
+ ``examples/data_preparation.py``. Response files can be produced by
22
+ ``examples/inference.py`` (which calls ``save_items``) or by any other
23
+ pipeline that writes the correct JSON schema::
24
+
25
+ [
26
+ {"qID": "q001", "content": "2", "latency": 1.23},
27
+ ...
28
+ ]
29
+ """
30
+
31
+ import logging
32
+ from pathlib import Path
33
+
34
+ from focus import (
35
+ DatasetSplit,
36
+ Evaluator,
37
+ FocusConfig,
38
+ FocusDataset,
39
+ Track,
40
+ load_responses,
41
+ set_config,
42
+ )
43
+
44
+ logging.basicConfig(level=logging.INFO)
45
+ logger = logging.getLogger(__name__)
46
+
47
+ # ── Configuration ─────────────────────────────────────────────────────
48
+
49
+ CONFIG = {
50
+ "root_dir": "/data/focus",
51
+ "dataset_name": "heico",
52
+ "track": Track.SEGMENT,
53
+ "split": DatasetSplit.TEST,
54
+ # Path to the JSON file produced by save_items([...responses...], path)
55
+ "responses_file": "/data/focus/responses/segment_test_responses.json",
56
+ # Optional: write results.csv and summary.csv here
57
+ "output_dir": None,
58
+ }
59
+
60
+
61
+ def main() -> None:
62
+ # ── 1. Load dataset ───────────────────────────────────────────────
63
+ set_config(FocusConfig(root_dir=CONFIG["root_dir"]))
64
+
65
+ dataset = FocusDataset(
66
+ dataset=CONFIG["dataset_name"],
67
+ split=CONFIG["split"],
68
+ track=CONFIG["track"],
69
+ )
70
+ logger.info(f"Loaded dataset: {dataset}")
71
+
72
+ # ── 2. Load responses ─────────────────────────────────────────────
73
+ responses_path = Path(CONFIG["responses_file"])
74
+ if not responses_path.exists():
75
+ raise FileNotFoundError(
76
+ f"Responses file not found: {responses_path}\n"
77
+ "Run examples/inference.py first, or provide a path to an existing file."
78
+ )
79
+
80
+ responses = load_responses(responses_path)
81
+ logger.info(f"Loaded {len(responses)} responses from {responses_path}.")
82
+
83
+ n_total = len(dataset)
84
+ n_answered = len(responses)
85
+ if n_answered < n_total:
86
+ logger.warning(
87
+ f"{n_total - n_answered}/{n_total} questions have no response "
88
+ "and will be marked incorrect."
89
+ )
90
+
91
+ # ── 3. Evaluate ───────────────────────────────────────────────────
92
+ # Open-ended and matching questions are routed to an LLM judge.
93
+ # Pass judges=[] to skip judging and mark those questions incorrect,
94
+ # which is useful for a quick sanity check without loading a model.
95
+ evaluator = Evaluator()
96
+ results_df, summary_df = evaluator.run(
97
+ requests=dataset.requests,
98
+ references=dataset.references,
99
+ responses=responses,
100
+ output_dir=CONFIG["output_dir"],
101
+ )
102
+
103
+ # ── 4. Report ─────────────────────────────────────────────────────
104
+ overall = summary_df.loc[summary_df["level"] == "overall", "accuracy"].iloc[0]
105
+ print(f"\nOverall macro-accuracy: {overall:.1%}")
106
+ print()
107
+ print(summary_df.to_string(index=False))
108
+
109
+ # Per-capability breakdown with question counts
110
+ leaf_df = summary_df[summary_df["level"] == "leaf"].copy()
111
+ leaf_df = leaf_df.sort_values("accuracy", ascending=False)
112
+ print("\nPer-capability accuracy (best to worst):")
113
+ for _, row in leaf_df.iterrows():
114
+ bar = "█" * round(row["accuracy"] * 20)
115
+ print(f" {row['name']:<40s} {row['accuracy']:5.1%} {bar}")
116
+
117
+ # Subgroup analyses
118
+ if "ood" in results_df.columns:
119
+ ood_acc = results_df[results_df["ood"]]["correctness"].mean()
120
+ in_dist_acc = results_df[~results_df["ood"]]["correctness"].mean()
121
+ print(f"\nIn-distribution accuracy : {in_dist_acc:.1%}")
122
+ print(f"Out-of-distribution accuracy: {ood_acc:.1%}")
123
+
124
+ if "clinical" in results_df.columns:
125
+ clin_acc = results_df[results_df["clinical"]]["correctness"].mean()
126
+ print(f"Clinical question accuracy : {clin_acc:.1%}")
127
+
128
+
129
+ if __name__ == "__main__":
130
+ main()