openglottal 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. openglottal-0.2.0/LICENSE +21 -0
  2. openglottal-0.2.0/PKG-INFO +518 -0
  3. openglottal-0.2.0/README.md +459 -0
  4. openglottal-0.2.0/openglottal/__init__.py +20 -0
  5. openglottal-0.2.0/openglottal/cli.py +255 -0
  6. openglottal-0.2.0/openglottal/data.py +341 -0
  7. openglottal-0.2.0/openglottal/displacement.py +280 -0
  8. openglottal-0.2.0/openglottal/features.py +247 -0
  9. openglottal-0.2.0/openglottal/geometry.py +278 -0
  10. openglottal-0.2.0/openglottal/kaggle_paths.py +36 -0
  11. openglottal-0.2.0/openglottal/kinematics.py +64 -0
  12. openglottal-0.2.0/openglottal/metadata.py +82 -0
  13. openglottal-0.2.0/openglottal/models/__init__.py +12 -0
  14. openglottal-0.2.0/openglottal/models/detector.py +102 -0
  15. openglottal-0.2.0/openglottal/models/tracker.py +232 -0
  16. openglottal-0.2.0/openglottal/models/unet.py +192 -0
  17. openglottal-0.2.0/openglottal/qt_app/__init__.py +8 -0
  18. openglottal-0.2.0/openglottal/qt_app/__main__.py +6 -0
  19. openglottal-0.2.0/openglottal/qt_app/analyzer.py +5 -0
  20. openglottal-0.2.0/openglottal/qt_app/main.py +1437 -0
  21. openglottal-0.2.0/openglottal/qt_app/utils.py +154 -0
  22. openglottal-0.2.0/openglottal/utils.py +263 -0
  23. openglottal-0.2.0/openglottal.egg-info/PKG-INFO +518 -0
  24. openglottal-0.2.0/openglottal.egg-info/SOURCES.txt +29 -0
  25. openglottal-0.2.0/openglottal.egg-info/dependency_links.txt +1 -0
  26. openglottal-0.2.0/openglottal.egg-info/entry_points.txt +3 -0
  27. openglottal-0.2.0/openglottal.egg-info/requires.txt +24 -0
  28. openglottal-0.2.0/openglottal.egg-info/top_level.txt +1 -0
  29. openglottal-0.2.0/pyproject.toml +49 -0
  30. openglottal-0.2.0/setup.cfg +4 -0
  31. openglottal-0.2.0/tests/test_kaggle_data.py +147 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 OpenGlottal Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,518 @@
1
+ Metadata-Version: 2.4
2
+ Name: openglottal
3
+ Version: 0.2.0
4
+ Summary: Automated glottal area segmentation from high-speed videoendoscopy
5
+ License: MIT License
6
+
7
+ Copyright (c) 2024 OpenGlottal Contributors
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+
27
+ Keywords: glottis,segmentation,vocal-folds,laryngoscopy,medical-imaging
28
+ Classifier: Development Status :: 3 - Alpha
29
+ Classifier: Intended Audience :: Science/Research
30
+ Classifier: License :: OSI Approved :: MIT License
31
+ Classifier: Programming Language :: Python :: 3
32
+ Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
33
+ Classifier: Topic :: Scientific/Engineering :: Image Processing
34
+ Requires-Python: >=3.9
35
+ Description-Content-Type: text/markdown
36
+ License-File: LICENSE
37
+ Requires-Dist: h5py>=3.8
38
+ Requires-Dist: torch>=2.0
39
+ Requires-Dist: torchvision>=0.15
40
+ Requires-Dist: ultralytics>=8.0
41
+ Requires-Dist: opencv-python>=4.8
42
+ Requires-Dist: numpy>=1.24
43
+ Requires-Dist: scipy>=1.10
44
+ Requires-Dist: matplotlib>=3.7
45
+ Requires-Dist: tqdm>=4.65
46
+ Requires-Dist: tensorboard>=2.14
47
+ Provides-Extra: dev
48
+ Requires-Dist: pytest; extra == "dev"
49
+ Requires-Dist: pytest-cov; extra == "dev"
50
+ Requires-Dist: ruff; extra == "dev"
51
+ Requires-Dist: mypy; extra == "dev"
52
+ Requires-Dist: kaggle>=1.6; extra == "dev"
53
+ Provides-Extra: docs
54
+ Requires-Dist: mkdocs; extra == "docs"
55
+ Requires-Dist: mkdocs-material; extra == "docs"
56
+ Provides-Extra: gui
57
+ Requires-Dist: PyQt5>=5.15; extra == "gui"
58
+ Dynamic: license-file
59
+
60
+ # OpenGlottal
61
+
62
+ ![Patient 1 montage](paper/patient1_montage.png)
63
+ *Frame montage: segmentation across a glottal cycle.*
64
+
65
+ ![L/R asymmetry in vocal fold nodules](paper/qt_gui_lr_nodule.png)
66
+ *Quantifying L/R asymmetry in a case of vocal fold nodules using the dynamic medial probe and independent displacement waveforms.*
67
+
68
+ Open-source toolkit for automated glottal area segmentation from high-speed videoendoscopy (HSV). Beyond per-frame masks, OpenGlottal extracts **glottal area waveforms (GAW)** and **left/right (L/R) displacement waveforms** along a configurable medial axis — enabling quantification of L/R asymmetry and kinematic features (open quotient, F0, periodicity) for clinical assessment.
69
+
70
+ Building on previous research into nodule-induced vibratory irregularities (Patel, Unnikrishnan, & Donohue, 2016), OpenGlottal provides the first open-source GUI to quantify these L/R asymmetries in real time.
71
+
72
+ **Author:** Harikrishnan Unnikrishnan (hari@orchard-robotics.com)
73
+
74
+ **Paper:** [**arXiv:2603.02087**](https://arxiv.org/abs/2603.02087) — *A Detection-Gated Pipeline for Robust Glottal Area Waveform Extraction and Clinical Pathology Assessment*
75
+
76
+ OpenGlottal combines a YOLOv8 glottis detector, a U-Net pixel-level segmenter, and a temporal vocal fold tracker into a single, reproducible inference and training pipeline — trained and evaluated on the [GIRAFE dataset](https://zenodo.org/records/13773163) ([dataset paper](https://doi.org/10.1016/j.dib.2025.111376)) and [BAGLS](https://zenodo.org/records/3762320) ([Scientific Data, 2020](https://doi.org/10.1038/s41597-020-0526-3)).
77
+
78
+ ---
79
+
80
+ ## Pipelines
81
+
82
+ Five pipelines are provided (four YOLO-gated and one U-Net-only):
83
+
84
+ | Pipeline | Flag | Description |
85
+ |----------|------|-------------|
86
+ | **1 — VFT** | `vft` | YOLO detects the glottis → crop (size locked to first detection) → motion-based VocalFoldTracker |
87
+ | **2 — Guided VFT** | `guided-vft` | YOLO bbox as ROI mask on the full frame → YOLOGuidedVFT (no cropping) |
88
+ | **3 — YOLO+UNet** | `unet` | YOLO + full-frame U-Net: detection-gated (output only on frames where YOLO fires) |
89
+ | **4 — YOLO-Crop+UNet** | `yolo-crop+unet` | YOLO crop → resize to 256×256 → crop-trained U-Net → project mask back. Weights are provided; use `--crop-weights` in `eval_girafe.py` / `eval_bagls.py`. Once tests validate, these weights will be committed to the repo. |
90
+ | **5 — U-Net only** | `unet-only` | Full-frame U-Net only, no YOLO gate (only `--unet-weights` required) |
91
+
92
+ All pipelines produce a per-frame **glottal area waveform**. The Qt GUI and CLI further support **L/R displacement waveform** extraction: independent left and right vocal-fold excursions relative to a medial axis, with a configurable sampling position (e.g. anterior–middle junction) to isolate asymmetry. Kinematic features (open quotient, fundamental frequency, periodicity, etc.) are computed from area or L/R waveforms for downstream clinical analysis.
93
+
94
+ ---
95
+
96
+ ## Installation
97
+
98
+ Use a virtual environment in the repo so all commands use the same Python and dependencies.
99
+
100
+ ```bash
101
+ git clone https://github.com/hari-krishnan/openglottal.git
102
+ cd openglottal
103
+ python3 -m venv .venv
104
+ source .venv/bin/activate # Linux/macOS; on Windows: .venv\Scripts\activate
105
+ pip install -e ".[dev]"
106
+ ```
107
+
108
+ From a local clone (venv already exists):
109
+
110
+ ```bash
111
+ cd /path/to/openglottal
112
+ source .venv/bin/activate
113
+ pip install -e ".[dev]" # only if you need to reinstall
114
+ ```
115
+
116
+ **Always use this venv** for training, evaluation, and CLI: activate it (`source .venv/bin/activate`) before running any `python`/`openglottal` commands, or use the `./run` script (e.g. `./run scripts/train_unet.py ...`) which uses `.venv/bin/python` automatically.
117
+
118
+ For the **Qt GUI**, install the optional GUI extra: `pip install -e ".[gui]"` (adds PyQt5). See [Qt GUI](#qt-gui) below.
119
+
120
+ *(A `pip install openglottal` option will work once the package is published to PyPI.)*
121
+
122
+ **Requirements:** Python ≥ 3.9, PyTorch ≥ 2.0, Ultralytics ≥ 8.0, OpenCV ≥ 4.8
123
+
124
+ **Weights:** Place pre-trained weights in `weights/` (or pass full paths to the scripts). Provided weights:
125
+
126
+ | Weights file | Description |
127
+ |--------------|-------------|
128
+ | `weights/og_girafe_unet_full.pt` | GIRAFE-trained U-Net (full-frame, no crop) |
129
+ | `weights/og_girafe_unet_crop.pt` | GIRAFE-trained U-Net (crop mode; use with YOLO-Crop+UNet pipeline) |
130
+ | `weights/og_girafe_yolo.pt` | GIRAFE-trained YOLO glottis detector |
131
+ | `weights/og_bagls_unet_full.pt` | BAGLS-trained U-Net (full-frame, no crop) |
132
+ | `weights/og_bagls_unet_crop.pt` | BAGLS-trained U-Net (crop mode; use with YOLO-Crop+UNet on BAGLS in-distribution) |
133
+ | `weights/og_bagls_yolo.pt` | BAGLS-trained YOLO glottis detector |
134
+
135
+ To train your own, see [Training](#training); trained models are saved under `outputs/`.
136
+
137
+ ---
138
+
139
+ ## Quick Start
140
+
141
+ ### Python API
142
+
143
+ ```python
144
+ import torch
145
+ from openglottal import TemporalDetector, UNet, extract_features_unet
146
+
147
+ device = torch.device("mps") # or "cuda" / "cpu"
148
+
149
+ detector = TemporalDetector("weights/og_girafe_yolo.pt")
150
+
151
+ model = UNet(1, 1, (32, 64, 128, 256)).to(device)
152
+ model.load_state_dict(torch.load("weights/og_girafe_unet_full.pt", map_location=device))
153
+ model.eval()
154
+
155
+ features = extract_features_unet("video.avi", detector, model, device)
156
+ print(features)
157
+ # {'area_mean': 312.4, 'area_std': 98.1, 'open_quotient': 0.61, 'f0': 0.017, ...}
158
+ ```
159
+
160
+ ### CLI
161
+
162
+ ```bash
163
+ # U-Net pipeline (recommended)
164
+ openglottal run video.avi \
165
+ --yolo-weights weights/og_girafe_yolo.pt \
166
+ --unet-weights weights/og_girafe_unet_full.pt \
167
+ --pipeline unet \
168
+ --output results/
169
+
170
+ # Motion-based pipeline (no U-Net weights needed)
171
+ openglottal run video.avi \
172
+ --yolo-weights weights/og_girafe_yolo.pt \
173
+ --pipeline guided-vft \
174
+ --output results/
175
+ ```
176
+
177
+ ### Qt GUI
178
+
179
+ Desktop app for viewing HSV videos with segmentation overlay, midline/AC–PC axes, **L/R displacement waveforms**, and kinematic metrics (open quotient, F0, periodicity, etc.). Quantifying L/R asymmetry in cases such as vocal fold nodules is a key use case: the dynamic medial probe and independent left/right waveforms isolate nodule-induced irregularities.
180
+
181
+ **Pathology / L/R asymmetry** — For clinicians, the L/R displacement waveform is the main diagnostic view (see screenshot at top).
182
+
183
+ ```bash
184
+ pip install -e ".[gui]"
185
+ openglottal-gui
186
+ ```
187
+
188
+ Load a video, choose detector and U-Net weights from `weights/`, set frame range, and use Play/Pause or the timeline slider. Optional `metadata.json` next to the video supplies FPS and other keys. See [openglottal/qt_app/README.md](openglottal/qt_app/README.md) for details.
189
+
190
+ ---
191
+
192
+ ## Evaluation
193
+
194
+ ### GIRAFE (4 test patients, 80 frames)
195
+
196
+ Compare against the GIRAFE paper baselines using GIRAFE-trained weights:
197
+
198
+ ```bash
199
+ python scripts/eval_girafe.py \
200
+ --images-dir GIRAFE/Training/imagesTr \
201
+ --labels-dir GIRAFE/Training/labelsTr \
202
+ --training-json GIRAFE/Training/training.json \
203
+ --unet-weights weights/og_girafe_unet_full.pt \
204
+ --yolo-weights weights/og_girafe_yolo.pt \
205
+ --device mps
206
+ ```
207
+
208
+ Results are printed alongside the published GIRAFE baselines for direct comparison. Pass `--output-json results.json` to save raw per-frame scores.
209
+
210
+ ### Results (GIRAFE test split, 4 patients, 80 frames)
211
+
212
+ | Method | Det.Recall | Dice | IoU | Dice≥0.5 |
213
+ |--------|-----------|------|-----|----------|
214
+ | InP (GIRAFE paper) | n/a | 0.71 | n/a | n/a |
215
+ | U-Net (GIRAFE paper) | n/a | 0.64 | n/a | n/a |
216
+ | SwinUNetV2 (paper) | n/a | 0.62 | n/a | n/a |
217
+ | **U-Net only** | n/a | **0.81** | **0.70** | **96.2%** |
218
+ | OTSU (baseline) | 0.95 | 0.22 | 0.13 | 2.5% |
219
+ | YOLO+UNet | 0.95 | 0.75 | 0.64 | 88.8% |
220
+ | YOLO-Crop+UNet† | 0.95 | 0.70 | 0.57 | 77.5% |
221
+ | Motion (baseline) | 0.95 | 0.27 | 0.17 | 9.7% |
222
+
223
+ - **Det.Recall** — fraction of frames where YOLO detected a glottis
224
+ - **Dice** — mean Dice coefficient across all test frames (higher is better)
225
+ - **Dice≥0.5** — fraction of frames meeting the clinical pass threshold
226
+
227
+ † YOLO-Crop+UNet uses the crop-trained U-Net: `weights/og_girafe_unet_crop.pt`. Full-frame and crop weight files are not interchangeable.
228
+
229
+ The YOLO-gated motion pipeline (the *Motion (baseline)* row above) underperforms because GIRAFE test frames are the first 20 frames per patient, providing insufficient temporal context for the motion tracker to converge.
230
+
231
+ ### BAGLS cross-dataset (3 500 test frames, GIRAFE-trained only)
232
+
233
+ No BAGLS data is used in training. Use GIRAFE-trained weights; images are letterboxed to 256×256.
234
+
235
+ ```bash
236
+ python scripts/eval_bagls.py \
237
+ --bagls-dir BAGLS/test \
238
+ --unet-weights weights/og_girafe_unet_full.pt \
239
+ --crop-weights weights/og_girafe_unet_crop.pt \
240
+ --yolo-weights weights/og_girafe_yolo.pt \
241
+ --device mps
242
+ ```
243
+
244
+ | Method | Det.Recall | Dice | IoU | Dice≥0.5 |
245
+ |--------|-----------|------|-----|----------|
246
+ | U-Net only | 1.00 | 0.59 | 0.50 | 67.1% |
247
+ | YOLO+UNet | 0.69 | 0.55 | 0.47 | 61.9% |
248
+ | **YOLO-Crop+UNet** | **0.69** | **0.61** | **0.53** | **70.3%** |
249
+
250
+ *(Table at default confidence $\tau=0.25$. With `--conf 0.02`, YOLO-Crop+UNet reaches Dice 0.64 and Dice≥0.5 76.4%; see paper and `sweep_bagls_conf.py`.)*
251
+
252
+ YOLO-Crop+UNet is the strongest pipeline on the unseen BAGLS data (+2 pp Dice, +3.2 pp Dice≥0.5 over U-Net alone at default $\tau$; at $\tau=0.02$, +5 pp Dice and +9.3 pp Dice≥0.5), even though YOLO detects the glottis on only 68.8% of frames (domain shift from GIRAFE). When YOLO does fire, cropping and re-scaling the region of interest gives U-Net higher effective resolution and cleaner context — benefits that generalise across datasets.
253
+
254
+ ### BAGLS in-distribution (3 500 test frames, BAGLS-trained weights)
255
+
256
+ Use the provided BAGLS-trained U-Net (full-frame and crop) and YOLO weights:
257
+
258
+ ```bash
259
+ python scripts/eval_bagls.py \
260
+ --bagls-dir BAGLS/test \
261
+ --unet-weights weights/og_bagls_unet_full.pt \
262
+ --crop-weights weights/og_bagls_unet_crop.pt \
263
+ --yolo-weights weights/og_bagls_yolo.pt \
264
+ --device mps
265
+ ```
266
+
267
+ On the 3 500-frame BAGLS test set this configuration achieves:
268
+
269
+ | Method | Det.Recall | Dice | IoU | Dice≥0.5 |
270
+ |--------------------|-----------:|------:|------:|---------:|
271
+ | **U-Net only** | 1.00 | 0.85 | 0.77 | 94.0% |
272
+ | **YOLO+UNet** | 0.87 | **0.85** | **0.78** | **94.6%** |
273
+ | YOLO-Crop+UNet | 0.87 | 0.74 | 0.64 | 87.1% |
274
+
275
+ So BAGLS-trained YOLO+UNet sets a strong in-distribution baseline (Dice 0.85), while the GIRAFE-trained YOLO-Crop+UNet remains the best cross-dataset configuration.
276
+
277
+ ---
278
+
279
+ ## Training
280
+
281
+ ### 1. Build the YOLO dataset
282
+
283
+ ```bash
284
+ openglottal build-dataset \
285
+ --images-dir GIRAFE/Training/imagesTr \
286
+ --labels-dir GIRAFE/Training/labelsTr \
287
+ --training-json GIRAFE/Training/training.json \
288
+ --output-dir yolo_data
289
+ ```
290
+
291
+ Or via script:
292
+
293
+ ```bash
294
+ python scripts/train_yolo.py \
295
+ --images-dir GIRAFE/Training/imagesTr \
296
+ --labels-dir GIRAFE/Training/labelsTr \
297
+ --training-json GIRAFE/Training/training.json \
298
+ --epochs 100
299
+ ```
300
+
301
+ YOLO saves best weights to `outputs/yolo/exp/weights/best.pt` (default run name `exp`). Use that path for `train_unet_crop.py` below, or copy to `weights/og_girafe_yolo.pt` for eval/CLI.
302
+
303
+ ### 2. Train the U-Net (full-frame mode)
304
+
305
+ ```bash
306
+ python scripts/train_unet.py \
307
+ --images-dir GIRAFE/Training/imagesTr \
308
+ --labels-dir GIRAFE/Training/labelsTr \
309
+ --training-json GIRAFE/Training/training.json \
310
+ --output outputs/og_girafe_unet_full.pt \
311
+ --epochs 50
312
+ ```
313
+
314
+ ### 3. Train the U-Net (YOLO-crop mode — higher effective resolution)
315
+
316
+ Uses YOLO to crop each training image to the glottis region, resizes to 256×256, and trains U-Net on these patches. At inference time the YOLO-Crop+UNet pipeline crops the input, runs U-Net, and projects the mask back to full-frame coordinates.
317
+
318
+ ```bash
319
+ python scripts/train_unet_crop.py \
320
+ --images-dir GIRAFE/Training/imagesTr \
321
+ --labels-dir GIRAFE/Training/labelsTr \
322
+ --training-json GIRAFE/Training/training.json \
323
+ --yolo-weights outputs/yolo/exp/weights/best.pt \
324
+ --output outputs/og_girafe_unet_crop.pt \
325
+ --crop-size 256 \
326
+ --epochs 50 \
327
+ --device cpu
328
+ ```
329
+
330
+ **Device:** Use `--device cuda` on Kaggle or a GPU machine. On Mac, use `--device cpu` for training (MPS is not used for training); use `--device mps` for evaluation (`eval_bagls.py`, `eval_girafe.py`, `analyze_gaw.py`) for faster inference.
331
+
332
+ Both training modes use a **50/50 BCE + Dice loss** with cosine annealing, saving the best validation checkpoint automatically.
333
+
334
+ ---
335
+
336
+ ## Repository Structure
337
+
338
+ ```
339
+ openglottal/
340
+ ├── weights/ # pre-trained weights (see table above)
341
+ ├── outputs/ # trained models (gitignored) when training your own
342
+ ├── openglottal/
343
+ │ ├── models/
344
+ │ │ ├── detector.py # TemporalDetector — YOLOv8 + temporal box locking
345
+ │ │ ├── tracker.py # VocalFoldTracker, YOLOGuidedVFT
346
+ │ │ └── unet.py # UNet, DoubleConv, GlottisDataset (with augmentation)
347
+ │ ├── qt_app/ # Qt GUI — overlay, midline, waveform, openglottal-gui
348
+ │ │ ├── main.py # Main window, inference worker, playback
349
+ │ │ ├── utils.py # Overlay, axes/AC-PC drawing
350
+ │ │ └── README.md # GUI usage and features
351
+ │ ├── metadata.py # Video I/O, metadata.json, frame loading (used by GUI)
352
+ │ ├── features.py # extract_features_{detector,yolo_guided_vft,unet}
353
+ │ ├── data.py # mask_to_yolo, build_yolo_dataset
354
+ │ ├── utils.py # I/O helpers, dice/IoU metrics, unet_segment_frame
355
+ │ └── cli.py # `openglottal` command-line interface
356
+ ├── scripts/
357
+ │ ├── train_yolo.py # standalone YOLO training script
358
+ │ ├── train_unet.py # standalone U-Net training script
359
+ │ ├── infer.py # batch inference: AVI dir or image sequence → _out.avi
360
+ │ ├── eval_girafe.py # per-patient test evaluation vs GIRAFE baselines
361
+ │ ├── eval_bagls.py # cross-dataset evaluation on BAGLS (3 500 frames)
362
+ │ ├── analyze_gaw.py # GAW feature analysis: Healthy vs Pathological (65 patients)
363
+ │ ├── download_datasets.py # download GIRAFE and BAGLS from Zenodo
364
+ │ ├── prepare_girafe_splits.py # build GIRAFE training.json from images/labels
365
+ │ ├── prepare_bagls_splits.py # build BAGLS train/test splits
366
+ │ ├── make_montage.py # build frame montage PNGs for paper figures
367
+ │ ├── sweep_bagls_conf.py # YOLO confidence threshold sweep on BAGLS
368
+ │ └── train_unet_crop.py # train U-Net on YOLO-cropped patches (higher res)
369
+ └── configs/
370
+ └── default.yaml # all hyperparameters documented in one place
371
+ ```
372
+
373
+ ---
374
+
375
+ ## Dataset
376
+
377
+ OpenGlottal is developed and evaluated on **GIRAFE** and **BAGLS**. Download them (optionally with the helper script shown below), then point the training/eval scripts at the extracted directories.
378
+
379
+ ```bash
380
+ # Download GIRAFE and/or BAGLS to the current directory (or use --output-dir)
381
+ python scripts/download_datasets.py --girafe --bagls
382
+ ```
383
+
384
+ - **GIRAFE** (Zenodo): [zenodo.org/records/13773163](https://zenodo.org/records/13773163) — 760 frames (256×256 px), expert-annotated glottal masks. Dataset paper: [Data in Brief (2025)](https://doi.org/10.1016/j.dib.2025.111376). After unpacking: `GIRAFE/Training/imagesTr/`, `GIRAFE/Training/labelsTr/`, `GIRAFE/Training/training.json`; raw videos: `GIRAFE/Raw_Data/`. If you have a copy in `./sdsc/glottal_area`, copy it over then run the prepare script:
385
+
386
+ ```bash
387
+ mkdir -p GIRAFE/Training && cp -r sdsc/glottal_area/imagesTr sdsc/glottal_area/labelsTr GIRAFE/Training/
388
+ python scripts/prepare_girafe_splits.py --images-dir GIRAFE/Training/imagesTr --labels-dir GIRAFE/Training/labelsTr --output GIRAFE/Training/training.json
389
+ ```
390
+ - **BAGLS** (Zenodo): [zenodo.org/records/3762320](https://zenodo.org/records/3762320) — benchmark for automatic glottis segmentation. Dataset paper: [Gómez et al., Scientific Data (2020)](https://doi.org/10.1038/s41597-020-0526-3). After downloading with `download_datasets.py --bagls`, use `BAGLS/test` as `--bagls-dir` for eval and `BAGLS/training` for training.
391
+
392
+ | Split (GIRAFE) | Frames |
393
+ |----------------|--------|
394
+ | Train | ~608 |
395
+ | Val | ~76 |
396
+ | Test | ~76 |
397
+
398
+ ---
399
+
400
+ ## Kinematic Features
401
+
402
+ The following scalar features are extracted from each glottal area waveform (or from the L/R-derived opening signal when using displacement mode):
403
+
404
+ | Feature | Description |
405
+ |---------|-------------|
406
+ | `area_mean` | Mean glottal area (px²) |
407
+ | `area_std` | Standard deviation of area |
408
+ | `area_range` | Max − min area |
409
+ | `open_quotient` | Fraction of cycle with area above 10 % of mean |
410
+ | `f0` | Dominant frequency from FFT (cycles/frame; multiply by capture fps for Hz) |
411
+ | `periodicity` | Peak autocorrelation at lags 1–50 |
412
+ | `cv` | Coefficient of variation (std / mean) |
413
+
414
+ **Spatial kinematics (L/R asymmetry)**
415
+ - **L/R displacement:** Independent tracking of left and right vocal-fold excursions (px) relative to the medial axis. Exported as separate time series (L, R, L−R, area) via the GUI “Save analysis” or the CLI `openglottal displacement` command.
416
+ - **Dynamic medial probe:** Configurable sampling position (0–1 along the anterior–posterior axis) so analysis can be run at a specific anatomical point (e.g. junction of anterior and middle thirds) to isolate nodule-induced irregularities.
417
+
418
+ ---
419
+
420
+ ## Glottal Area Waveform and L/R Displacement
421
+
422
+ Beyond frame-level segmentation, the pipeline produces a **Glottal Area Waveform (GAW)** — the per-frame glottal area over time — and optionally **L/R displacement waveforms** (left and right excursions relative to the medial axis). Kinematic features can be extracted from either GAW or L/R-derived signals and used for clinical classification and asymmetry assessment.
423
+
424
+ ```bash
425
+ python scripts/analyze_gaw.py \
426
+ --raw-data-dir GIRAFE/Raw_Data \
427
+ --yolo-weights weights/og_girafe_yolo.pt \
428
+ --unet-weights weights/og_girafe_unet_full.pt \
429
+ --device mps \
430
+ --output-dir results/gaw
431
+ ```
432
+
433
+ This script processes all 65 GIRAFE patients, extracts kinematic features from each area waveform, and compares **Healthy** vs **Pathological** groups using Mann-Whitney U tests.
434
+
435
+ ### Key Findings (65 patients: 15 Healthy, 25 Pathological, 25 Unknown)
436
+
437
+ Because the cohort has a significant sex imbalance (see note below), results are reported **stratified by sex** rather than pooled.
438
+
439
+ **Female subgroup (12 H / 11 P):**
440
+
441
+ | Feature | Healthy (mean±std) | Pathological (mean±std) | p-value |
442
+ |---------|-------------------|------------------------|---------|
443
+ | area_mean | 125.2 ± 43.1 | 247.8 ± 204.6 | 0.230 |
444
+ | area_std | 112.9 ± 32.2 | 118.9 ± 96.0 | 0.406 |
445
+ | area_range | 336.7 ± 97.6 | 375.5 ± 272.2 | 0.559 |
446
+ | open_quotient | 0.760 ± 0.207 | 0.874 ± 0.131 | 0.192 |
447
+ | f0 | 241.7 ± 34.8 Hz | 203.5 ± 73.6 Hz | 0.156 |
448
+ | periodicity | 0.955 ± 0.008 | 0.946 ± 0.013 | 0.255 |
449
+ | cv | **0.95 ± 0.20** | **0.57 ± 0.29** | **0.006**\* |
450
+
451
+ **Male subgroup (3 H / 14 P):**
452
+
453
+ | Feature | Healthy (mean±std) | Pathological (mean±std) | p-value |
454
+ |---------|-------------------|------------------------|---------|
455
+ | area_mean | 192.1 ± 18.3 | 172.7 ± 94.0 | 0.768 |
456
+ | area_std | 142.7 ± 35.0 | 92.0 ± 66.9 | 0.197 |
457
+ | area_range | 439.7 ± 86.7 | 343.1 ± 212.3 | 0.488 |
458
+ | open_quotient | 0.860 ± 0.145 | 0.843 ± 0.186 | 1.000 |
459
+ | f0 | 183.3 ± 75.0 Hz | 82.5 ± 79.3 Hz | 0.169 |
460
+ | periodicity | 0.962 ± 0.001 | 0.900 ± 0.116 | 0.068 |
461
+ | cv | 0.75 ± 0.19 | 0.63 ± 0.40 | 0.509 |
462
+
463
+ \* p < 0.05 (Mann-Whitney U, two-sided)
464
+
465
+ > **Sex imbalance note:** The Healthy group is 80% female (12F/3M) while the Pathological group is 56% male (14M/11F; Fisher's exact p=0.025). Because f0 is strongly sex-dependent (males ~100 Hz vs females ~224 Hz), pooling would confound f0. After stratifying, only **cv** (coefficient of variation) reaches significance in the female subgroup (p=0.006). In the male subgroup (n=3 Healthy), cv trends in the same direction (0.75 vs 0.63) but does not reach significance (p=0.509); periodicity approaches significance (p=0.068). The male subgroup is too small for reliable inference.
466
+
467
+ ### Production Robustness
468
+
469
+ YOLO acts as a **detection gate**: when the endoscope moves away from the glottis (scope insertion, patient coughing, instrument in view), YOLO fires no detection and the area is set to zero — preventing spurious waveform spikes that would corrupt downstream feature extraction.
470
+
471
+ ---
472
+
473
+ ## Citation
474
+
475
+ If you use OpenGlottal in your research, please cite:
476
+
477
+ **H. Unnikrishnan.** *A Detection-Gated Pipeline for Robust Glottal Area Waveform Extraction and Clinical Pathology Assessment.* arXiv:2603.02087 [cs.CV], 2 Mar 2026.
478
+ [https://arxiv.org/abs/2603.02087](https://arxiv.org/abs/2603.02087)
479
+
480
+ ```bibtex
481
+ @misc{unnikrishnan2026openglottal,
482
+ title = {A Detection-Gated Pipeline for Robust Glottal Area
483
+ Waveform Extraction and Clinical Pathology Assessment},
484
+ author = {Unnikrishnan, Harikrishnan},
485
+ year = {2026},
486
+ eprint = {2603.02087},
487
+ archivePrefix = {arXiv},
488
+ primaryClass = {cs.CV},
489
+ url = {https://arxiv.org/abs/2603.02087}
490
+ }
491
+ ```
492
+
493
+ Related prior work on glottal kinematics and high-speed imaging:
494
+
495
+ ```bibtex
496
+ @article{patel2013invivo,
497
+ title = {In vivo measurement of pediatric vocal fold motion using
498
+ structured light laser projection},
499
+ author = {Patel, Rita R and Donohue, Kevin D and Lau, Daniel and
500
+ Unnikrishnan, Harikrishnan},
501
+ journal = {The Laryngoscope},
502
+ year = {2013}
503
+ }
504
+
505
+ @article{patel2016effects,
506
+ title = {Effects of vocal fold nodules on glottal cycle measurements
507
+ derived from high-speed digital imaging},
508
+ author = {Patel, Rita R and Unnikrishnan, Harikrishnan and Donohue, Kevin D},
509
+ journal = {Journal of Speech, Language, and Hearing Research},
510
+ year = {2016}
511
+ }
512
+ ```
513
+
514
+ ---
515
+
516
+ ## License
517
+
518
+ MIT License — see [LICENSE](LICENSE).