discophon 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. discophon-0.0.3/.github/workflows/release.yml +109 -0
  2. discophon-0.0.3/.gitignore +7 -0
  3. discophon-0.0.3/.justfile +17 -0
  4. discophon-0.0.3/.python-version +1 -0
  5. discophon-0.0.3/LICENSE +21 -0
  6. discophon-0.0.3/PKG-INFO +83 -0
  7. discophon-0.0.3/README.md +60 -0
  8. discophon-0.0.3/TODO +4 -0
  9. discophon-0.0.3/docs/baselines.md +0 -0
  10. discophon-0.0.3/docs/evaluate.md +77 -0
  11. discophon-0.0.3/docs/prepare.md +87 -0
  12. discophon-0.0.3/docs/submission.md +0 -0
  13. discophon-0.0.3/pyproject.toml +116 -0
  14. discophon-0.0.3/scripts/benchmark.slurm +65 -0
  15. discophon-0.0.3/scripts/finetune_spidr.py +125 -0
  16. discophon-0.0.3/src/discophon/__init__.py +0 -0
  17. discophon-0.0.3/src/discophon/benchmark.py +137 -0
  18. discophon-0.0.3/src/discophon/core/__init__.py +16 -0
  19. discophon-0.0.3/src/discophon/core/assets/phonology.json +452 -0
  20. discophon-0.0.3/src/discophon/core/assets/sonority.json +139 -0
  21. discophon-0.0.3/src/discophon/core/assets/tipa.json +139 -0
  22. discophon-0.0.3/src/discophon/core/data.py +103 -0
  23. discophon-0.0.3/src/discophon/core/languages.py +103 -0
  24. discophon-0.0.3/src/discophon/core/validation.py +91 -0
  25. discophon-0.0.3/src/discophon/evaluate/__init__.py +5 -0
  26. discophon-0.0.3/src/discophon/evaluate/__main__.py +27 -0
  27. discophon-0.0.3/src/discophon/evaluate/abx.py +69 -0
  28. discophon-0.0.3/src/discophon/evaluate/boundaries.py +151 -0
  29. discophon-0.0.3/src/discophon/evaluate/evaluate.py +42 -0
  30. discophon-0.0.3/src/discophon/evaluate/per.py +51 -0
  31. discophon-0.0.3/src/discophon/evaluate/pnmi.py +191 -0
  32. discophon-0.0.3/src/discophon/prepare/__init__.py +3 -0
  33. discophon-0.0.3/src/discophon/prepare/__main__.py +26 -0
  34. discophon-0.0.3/src/discophon/prepare/core.py +94 -0
  35. discophon-0.0.3/src/discophon/py.typed +0 -0
  36. discophon-0.0.3/tests/__init__.py +0 -0
  37. discophon-0.0.3/tests/test_benchmark_integrity.py +0 -0
  38. discophon-0.0.3/tests/test_tipa_mapping.py +39 -0
  39. discophon-0.0.3/uv.lock +2623 -0
@@ -0,0 +1,109 @@
1
+ name: Release Workflow
2
+ permissions: {}
3
+
4
+ on:
5
+ push:
6
+ tags:
7
+ - "*"
8
+
9
+ jobs:
10
+ lint:
11
+ name: Linters
12
+ runs-on: ubuntu-latest
13
+ permissions:
14
+ security-events: write
15
+ contents: read # Fetch repository
16
+ actions: read
17
+ steps:
18
+ - name: Checkout code
19
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
20
+ with:
21
+ persist-credentials: false
22
+
23
+ # - name: zizmor
24
+ # uses: zizmorcore/zizmor-action@e639db99335bc9038abc0e066dfcd72e23d26fb4 # v0.3.0
25
+
26
+ - name: typos
27
+ uses: crate-ci/typos@bb4666ad77b539a6b4ce4eda7ebb6de553704021 # v1.42.0
28
+
29
+ - name: Ruff
30
+ uses: astral-sh/ruff-action@57714a7c8a2e59f32539362ba31877a1957dded1 # v3.5.1
31
+
32
+ build:
33
+ name: Build distribution
34
+ runs-on: ubuntu-latest
35
+ needs: lint
36
+ permissions:
37
+ contents: read # Fetch repository
38
+ actions: write # Upload artifact
39
+ steps:
40
+ - name: Checkout code
41
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
42
+ with:
43
+ persist-credentials: false
44
+
45
+ - name: Install uv
46
+ uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # v7.2.0
47
+ with:
48
+ version: "latest"
49
+ enable-cache: false
50
+
51
+ - name: Build wheel
52
+ run: uv build
53
+
54
+ - name: Upload dists
55
+ uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
56
+ with:
57
+ name: python-package-distributions
58
+ path: dist/
59
+
60
+ publish-to-pypi:
61
+ name: Publish to PyPI
62
+ runs-on: ubuntu-latest
63
+ needs: build
64
+ environment:
65
+ name: publish-to-pypi
66
+ permissions:
67
+ actions: read # Download artifact
68
+ id-token: write # Needed for trusted publishing
69
+ steps:
70
+ - name: Download all dists
71
+ uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
72
+ with:
73
+ name: python-package-distributions
74
+ path: dist/
75
+
76
+ - name: Install uv
77
+ uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # v7.2.0
78
+ with:
79
+ version: "latest"
80
+ enable-cache: false
81
+
82
+ - name: Upload to PyPI
83
+ # run: uv publish --trusted-publishing always
84
+ run: uv publish -t ${{ secrets.PYPI_TOKEN }}
85
+
86
+
87
+ github-release:
88
+ name: GitHub release
89
+ runs-on: ubuntu-latest
90
+ needs: build
91
+ permissions:
92
+ actions: read # Download artifact
93
+ contents: write # Create GitHub release
94
+ steps:
95
+ - name: Checkout code
96
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
97
+ with:
98
+ persist-credentials: false
99
+
100
+ - name: Download all dists
101
+ uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
102
+ with:
103
+ name: python-package-distributions
104
+ path: ./dist/
105
+
106
+ - name: Create release
107
+ env:
108
+ GITHUB_TOKEN: ${{ github.token }}
109
+ run: gh release create "${GITHUB_REF#refs/tags/}" ./dist/*
@@ -0,0 +1,7 @@
1
+ __pycache__
2
+ *.py[oc]
3
+ build/
4
+ dist/
5
+ *.egg-info
6
+ .venv
7
+ tipa-mapping-*.pdf
@@ -0,0 +1,17 @@
1
+ set quiet
2
+
3
+ [private]
4
+ @default:
5
+ just --list
6
+
7
+ # Test
8
+ test:
9
+ uv run --group test pytest
10
+
11
+ # Lint and format
12
+ lint:
13
+ uv run --dev ruff check
14
+ uv run --dev ruff format
15
+ uv run --dev tombi format
16
+ uv run --dev typos
17
+ uv run --dev ty check src
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 CoML
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,83 @@
1
+ Metadata-Version: 2.4
2
+ Name: discophon
3
+ Version: 0.0.3
4
+ Summary: The Phoneme Discovery Benchmark
5
+ Author-email: CoML <dev@cognitive-ml.fr>
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.12
9
+ Requires-Dist: filelock>=3.20.2
10
+ Requires-Dist: httpx>=0.28.1
11
+ Requires-Dist: joblib>=1.5.3
12
+ Requires-Dist: numba>=0.63.1
13
+ Requires-Dist: numpy>=2.3.5
14
+ Requires-Dist: polars>=1.36.1
15
+ Requires-Dist: praat-textgrids>=1.4.0
16
+ Requires-Dist: soundfile>=0.13.1
17
+ Requires-Dist: soxr>=1.0.0
18
+ Requires-Dist: tqdm>=4.67.1
19
+ Requires-Dist: xarray>=2025.12.0
20
+ Provides-Extra: abx
21
+ Requires-Dist: fastabx>=0.6.1; extra == 'abx'
22
+ Description-Content-Type: text/markdown
23
+
24
+ # The Phoneme Discovery benchmark
25
+
26
+ [💾 [Website](https://benchmarks.cognitive-ml.fr/phoneme_discovery)] [📜 [Paper]()] [📖 [BibTex](https://github.com/bootphon/phoneme_discovery?tab=readme-ov-file#citation)]
27
+
28
+ ## Introduction
29
+
30
+ The last several years have seen revolutionary improvements in both speech processing and textual natural language
31
+ processing. In both cases, unsupervised or self-supervised pre-training has been the key to models autonomously
32
+ discovering representations that are tremendously useful for doing language tasks. Yet, central to the study of human
33
+ speech processing is the phoneme inventory, a small set of discrete units that abstract away from massive pronunciation
34
+ variability in the signal.
35
+
36
+ Discovering the correct set of phonemes for a language is crucial: encode the wrong categories, and contrasts between
37
+ words are distorted or disappear; fail to categorize at all, and contrasts between words are hidden behind semantically
38
+ irrelevant variation in the signal. While much attention has been paid to whether unsupervised speech models’
39
+ (continuous or discrete) representations are predictive of phonemes, this benchmark, for the first time, explicitly
40
+ fixes the goal of learning a discrete set of categories that are in one-to-one correspondence with the phoneme
41
+ inventory of a language.
42
+
43
+ Infants appear to learn the phoneme inventory of their language effortlessly, before they can speak. They benefit from
44
+ millions of years of evolution of the human brain and body, giving them a learning architecture that allows them to
45
+ thrive in the face of scarce and noisy language data, preparing them to learn the phoneme inventory of any human
46
+ language.
47
+
48
+ The Phoneme Discovery benchmark is aimed at building models that discover phoneme inventories across various languages,
49
+ using only small amounts of speech data, and without textual data during training.
50
+
51
+ ## Installation
52
+
53
+ ```bash
54
+ pip install discophon
55
+ ```
56
+
57
+ To be able to compute ABX discriminabilities: `pip install discophon[abx]`.
58
+
59
+ If you want to run baselines and have access to the utility scripts, clone this repository:
60
+
61
+ ```bash
62
+ git clone https://github.com/bootphon/phoneme_discovery
63
+ cd phoneme_discovery
64
+ uv sync
65
+ # uv sync --all-extras --all-groups # If you want all dependencies
66
+ ```
67
+
68
+ ## Usage
69
+
70
+ Check out the documentation:
71
+
72
+ - [Data preparation](https://github.com/bootphon/phoneme_discovery/blob/main/docs/prepare.md)
73
+ - [Simple evaluation](https://github.com/bootphon/phoneme_discovery/blob/main/docs/evaluate.md)
74
+ - [Run the benchmark](https://github.com/bootphon/phoneme_discovery/blob/main/benchmark.md)
75
+ - [Use the baseline systems](https://github.com/bootphon/phoneme_discovery/blob/main/docs/baselines.md)
76
+
77
+ ### Citation
78
+
79
+ ```bibtex
80
+
81
+ ```
82
+
83
+ Contact: `benchmarks [at] cognitive-ml [dot] fr`
@@ -0,0 +1,60 @@
1
+ # The Phoneme Discovery benchmark
2
+
3
+ [💾 [Website](https://benchmarks.cognitive-ml.fr/phoneme_discovery)] [📜 [Paper]()] [📖 [BibTex](https://github.com/bootphon/phoneme_discovery?tab=readme-ov-file#citation)]
4
+
5
+ ## Introduction
6
+
7
+ The last several years have seen revolutionary improvements in both speech processing and textual natural language
8
+ processing. In both cases, unsupervised or self-supervised pre-training has been the key to models autonomously
9
+ discovering representations that are tremendously useful for doing language tasks. Yet, central to the study of human
10
+ speech processing is the phoneme inventory, a small set of discrete units that abstract away from massive pronunciation
11
+ variability in the signal.
12
+
13
+ Discovering the correct set of phonemes for a language is crucial: encode the wrong categories, and contrasts between
14
+ words are distorted or disappear; fail to categorize at all, and contrasts between words are hidden behind semantically
15
+ irrelevant variation in the signal. While much attention has been paid to whether unsupervised speech models’
16
+ (continuous or discrete) representations are predictive of phonemes, this benchmark, for the first time, explicitly
17
+ fixes the goal of learning a discrete set of categories that are in one-to-one correspondence with the phoneme
18
+ inventory of a language.
19
+
20
+ Infants appear to learn the phoneme inventory of their language effortlessly, before they can speak. They benefit from
21
+ millions of years of evolution of the human brain and body, giving them a learning architecture that allows them to
22
+ thrive in the face of scarce and noisy language data, preparing them to learn the phoneme inventory of any human
23
+ language.
24
+
25
+ The Phoneme Discovery benchmark is aimed at building models that discover phoneme inventories across various languages,
26
+ using only small amounts of speech data, and without textual data during training.
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install discophon
32
+ ```
33
+
34
+ To be able to compute ABX discriminabilities: `pip install discophon[abx]`.
35
+
36
+ If you want to run baselines and have access to the utility scripts, clone this repository:
37
+
38
+ ```bash
39
+ git clone https://github.com/bootphon/phoneme_discovery
40
+ cd phoneme_discovery
41
+ uv sync
42
+ # uv sync --all-extras --all-groups # If you want all dependencies
43
+ ```
44
+
45
+ ## Usage
46
+
47
+ Check out the documentation:
48
+
49
+ - [Data preparation](https://github.com/bootphon/phoneme_discovery/blob/main/docs/prepare.md)
50
+ - [Simple evaluation](https://github.com/bootphon/phoneme_discovery/blob/main/docs/evaluate.md)
51
+ - [Run the benchmark](https://github.com/bootphon/phoneme_discovery/blob/main/benchmark.md)
52
+ - [Use the baseline systems](https://github.com/bootphon/phoneme_discovery/blob/main/docs/baselines.md)
53
+
54
+ ### Citation
55
+
56
+ ```bibtex
57
+
58
+ ```
59
+
60
+ Contact: `benchmarks [at] cognitive-ml [dot] fr`
discophon-0.0.3/TODO ADDED
@@ -0,0 +1,4 @@
1
+ TODO:
2
+ - Replace g -> ɡ in Wolof only
3
+ - Normalize ç
4
+ - Replace r -> ɹ in English only
File without changes
@@ -0,0 +1,77 @@
1
+ # Evaluation
2
+
3
+ ## Phoneme discovery
4
+
5
+ You can use the `phoneme_discovery` function with `units: dict[str, list[int]]`, and `phones: dict[str, list[str]]`.
6
+ You also need to set the number of units `n_units`, of phonemes `n_phones`, and the step (in ms) between consecutive
7
+ units `step_units`.
8
+
9
+ Example:
10
+
11
+ ```python
12
+ from discophon.core import read_gold_annotations, read_submitted_units
13
+ from discophon.evaluate import phoneme_discovery
14
+
15
+ phones = read_gold_annotations("/path/to/alignments/dataset.align")
16
+ units = read_submitted_units("/path/to/predictions/units.jsonl")
17
+ result = phoneme_discovery(units, phones, n_units=256, n_phones=40, step_units=20)
18
+ print(result)
19
+ ```
20
+
21
+ Or via the CLI:
22
+
23
+ ```console
24
+ ❯ python -m discophon.evaluate --help
25
+ usage: discophon.evaluate [-h] [--n-units N_UNITS] [--n-phones N_PHONES] [--step-units STEP_UNITS] units phones
26
+
27
+ Evaluate predicted units on phoneme discovery
28
+
29
+ positional arguments:
30
+ units path to predicted units
31
+ phones path to gold alignments
32
+
33
+ options:
34
+ -h, --help show this help message and exit
35
+ --n-units N_UNITS number of units
36
+ --n-phones N_PHONES number of phonemes
37
+ --step-units STEP_UNITS
38
+ step between units (in ms)
39
+ ```
40
+
41
+ ## ABX
42
+
43
+ The ABX evaluation is done separately. First, install this package with the `abx` optional dependencies:
44
+
45
+ ```bash
46
+ pip install discophon[abx]
47
+ ```
48
+
49
+ Then, either run it in Python:
50
+
51
+ ```python
52
+ from discophon.evaluate.abx import discrete_abx, continuous_abx
53
+
54
+ result_discrete = discrete_abx("/path/to/item/dataset.item", "/path/to/predictions/units.jsonl", frequency=50)
55
+ print("Discrete: ", result_discrete)
56
+
57
+ result_continuous = continuous_abx("/path/to/item/dataset.item", "/path/to/features", frequency=50)
58
+ print("Continuous: ", result_continuous)
59
+ ```
60
+
61
+ Or via the CLI:
62
+
63
+ ```console
64
+ ❯ python -m discophon.evaluate.abx --help
65
+ usage: discophon.evaluate.abx [-h] --frequency FREQUENCY item root
66
+
67
+ Continuous or discrete ABX
68
+
69
+ positional arguments:
70
+ item Path to the item file
71
+ root Path to the JSONL with units or directory with continuous features
72
+
73
+ options:
74
+ -h, --help show this help message and exit
75
+ --frequency FREQUENCY
76
+ Units frequency in Hz
77
+ ```
@@ -0,0 +1,87 @@
1
+ # Data preparation
2
+
3
+ You need the `sox` binary available in your `$PATH` for pre-processing audio files.
4
+
5
+ Let's say you want to install the benchmark data and assets in a directory `$DATA`.
6
+
7
+ ## Download Common Voice data
8
+
9
+ You first need to download audio data from CommonVoice. You can use their API if you don't want to download large
10
+ files on your local computer.
11
+
12
+ Download everything in `$DATA/raw`.
13
+
14
+ Dev languages:
15
+
16
+ - [Common Voice Scripted Speech 23.0 - German](https://datacollective.mozillafoundation.org/datasets/cmflnuzw5p0q7ydlq4k8skhqi) (34.41 GB)
17
+ - [Common Voice Scripted Speech 23.0 - Swahili](https://datacollective.mozillafoundation.org/datasets/cmflnuzw7mjcay14kmowc4y96) (21.23 GB)
18
+ - [Common Voice Scripted Speech 23.0 - Tamil](https://datacollective.mozillafoundation.org/datasets/cmflnuzw73r9g1avrbu6bwkfx) (8.56 GB)
19
+ - [Common Voice Scripted Speech 23.0 - Thai](https://datacollective.mozillafoundation.org/datasets/cmflnuzw7fwn4fc969r5owufz) (8.35 GB)
20
+ - [Common Voice Scripted Speech 23.0 - Turkish](https://datacollective.mozillafoundation.org/datasets/cmflnuzw71qkz8x3kil3tgjvk) (2.73 GB)
21
+ - [Common Voice Scripted Speech 23.0 - Ukrainian](https://datacollective.mozillafoundation.org/datasets/cmflnuzw7ijdv5oe9u7ky0zrc) (2.55 GB)
22
+
23
+ Test languages:
24
+
25
+ - [Common Voice Scripted Speech 23.0 - Basque](https://datacollective.mozillafoundation.org/datasets/cmflnuzw5qoauo49kpf8y1gzp) (14.58 GB)
26
+ - [Common Voice Scripted Speech 23.0 - Chinese (China)](https://datacollective.mozillafoundation.org/datasets/cmflnuzw8fvgv2vdgt6f52qvh) (21.26 GB)
27
+ - [Common Voice Scripted Speech 23.0 - English](https://datacollective.mozillafoundation.org/datasets/cmflnuzw52mzok78yz6woemc1) (86.83 GB)
28
+ - [Common Voice Scripted Speech 23.0 - French](https://datacollective.mozillafoundation.org/datasets/cmflnuzw5ahjms0zbrcl0vg4e) (27.87 GB)
29
+ - [Common Voice Scripted Speech 23.0 - Japanese](https://datacollective.mozillafoundation.org/datasets/cmflnuzw5lv4n3cd25tbavjb9) (11.80 GB)
30
+ - Wolof data comes from a different source, and will be downloaded with the other assets in the following section.
31
+
32
+ Extract each archive, with `tar --strip-components=1 -xvf ...`.
33
+
34
+ For example, let's say your archive is named `mcv-scripted-uk-v23.0.tar.gz`.
35
+ Extract it with `tar --strip-components=1 -xvf mcv-scripted-uk-v23.0.tar.gz`, and move the output directory to
36
+ `$DATA/raw`.
37
+
38
+ You can delete the archives afterwards. You should have the following structure:
39
+
40
+ ```bash
41
+ ❯ tree -L 2 $DATA
42
+ $DATA
43
+ └── raw
44
+ ├── de
45
+ ├── en
46
+ ├── eu
47
+ ├── fr
48
+ ├── ja
49
+ ├── sw
50
+ ├── ta
51
+ ├── th
52
+ ├── tr
53
+ ├── uk
54
+ └── zh-CN
55
+ ```
56
+
57
+ ## Download benchmark assets
58
+
59
+ Now download the benchmark assets with the following command:
60
+
61
+ ```bash
62
+ python -m discophon.prepare download $DATA
63
+ ```
64
+
65
+ This will download:
66
+ - Symlinks to audio files for each split in each language
67
+ - Manifests
68
+ - Alignments and item files
69
+
70
+ ## Preprocess selected audio files
71
+
72
+ Now resample audio files and convert them to WAV with the command:
73
+
74
+ ```bash
75
+ for code in swa tam tha tur ukr cmn eus jpn; do
76
+ python -m discophon.prepare audio $DATA $code
77
+ done
78
+ ```
79
+
80
+ This will create directories `$DATA/audio/cmn/all`, `$DATA/audio/deu/all`, `$DATA/audio/eng/all`, etc. with
81
+ resampled audio files. The directories corresponding to each split contain symlinks to those files.
82
+
83
+ You should parallelize this loop if you can. If you are on a SLURM cluster, you should also parallelize each dataset
84
+ processing across tasks or array jobs. The `discophon.prepare` package will automatically handle the distribution of
85
+ files to process across jobs.
86
+
87
+ You can delete the `$DATA/raw` folder afterwards.
File without changes
@@ -0,0 +1,116 @@
1
+ [project]
2
+ name = "discophon"
3
+ description = "The Phoneme Discovery Benchmark"
4
+ readme = "README.md"
5
+ requires-python = ">=3.12"
6
+ license = "MIT"
7
+ authors = [{ name = "CoML", email = "dev@cognitive-ml.fr" }]
8
+ dependencies = [
9
+ "filelock>=3.20.2",
10
+ "httpx>=0.28.1",
11
+ "joblib>=1.5.3",
12
+ "numba>=0.63.1",
13
+ "numpy>=2.3.5",
14
+ "polars>=1.36.1",
15
+ "praat-textgrids>=1.4.0",
16
+ "soundfile>=0.13.1",
17
+ "soxr>=1.0.0",
18
+ "tqdm>=4.67.1",
19
+ "xarray>=2025.12.0",
20
+ ]
21
+ dynamic = ["version"]
22
+
23
+ [project.optional-dependencies]
24
+ abx = [
25
+ "fastabx>=0.6.1",
26
+ ]
27
+
28
+ [dependency-groups]
29
+ dev = [
30
+ "ipykernel>=7.1.0",
31
+ "ruff>=0.14.10",
32
+ "tombi>=0.7.16",
33
+ "ty>=0.0.11",
34
+ "typos>=1.42.0",
35
+ ]
36
+ baselines = [
37
+ "minimal-hubert>=0.0.1",
38
+ "spidr[train]>=0.1.3",
39
+ ]
40
+ plot = [
41
+ "altair>=6.0.0",
42
+ "matplotlib>=3.10.8",
43
+ "seaborn>=0.13.2",
44
+ "vegafusion>=2.0.3",
45
+ "vl-convert-python>=1.9.0",
46
+ ]
47
+ test = [
48
+ "matplotlib>=3.10.8",
49
+ "pytest>=9.0.2",
50
+ ]
51
+
52
+ [build-system]
53
+ requires = ["hatch-vcs", "hatchling"]
54
+ build-backend = "hatchling.build"
55
+
56
+ [tool.hatch.version]
57
+ source = "vcs"
58
+
59
+ [tool.pytest]
60
+ log_cli = true
61
+ log_cli_level = "INFO"
62
+
63
+ [tool.ruff]
64
+ line-length = 119
65
+
66
+ [tool.ruff.lint]
67
+ select = [
68
+ "A", # flake8-builtins
69
+ "ARG", # flake8-unused-arguments
70
+ "ANN", # flake8-annotations
71
+ "B", # flake8-bugbear
72
+ "BLE", # flake8-blind-except
73
+ "C4", # flake8-comprehensions
74
+ "C90", # mccabe
75
+ "E", # pycodestyle error
76
+ "EXE", # flake8-executable
77
+ "F", # Pyflakes
78
+ "FBT", # flake8-boolean-trap
79
+ "FLY", # flynt
80
+ "FURB", # refurb
81
+ "G", # flake8-logging-format
82
+ "I", # isort
83
+ "ICN", # flake8-import-conventions
84
+ "ISC", # flake8-implicit-str-concat
85
+ "LOG", # flake8-logging
86
+ "N", # pep8-naming
87
+ "NPY", # NumPy-specific rules
88
+ "PERF", # Perflint
89
+ "PIE", # flake8-pie
90
+ "PGH", # pygrep-hooks
91
+ "PL", # Pylint
92
+ "PTH", # flake8-use-pathlib
93
+ "Q", # flake8-quotes
94
+ "RET", # flake8-return
95
+ "RSE", # flake8-raise
96
+ "RUF", # Ruff-specific rules
97
+ "SIM", # flake8-simplify
98
+ "SLF", # flake8-slots
99
+ "T10", # flake8-debugger
100
+ "TC", # flake8-type-checking
101
+ "TD", # flake8-todos
102
+ "TID", # flake8-tidy-imports
103
+ "UP", # pyupgrade
104
+ "W", # pycodestyle warning
105
+ ]
106
+
107
+ [tool.ruff.lint.flake8-tidy-imports]
108
+ banned-module-level-imports = ["discophon.evaluate.abx"]
109
+
110
+ [tool.ruff.lint.pylint]
111
+ allow-magic-value-types = ["int", "str"]
112
+ max-args = 10
113
+ max-statements = 75
114
+
115
+ [tool.typos.default.extend-words]
116
+ tha = "tha"
@@ -0,0 +1,65 @@
1
+ #!/bin/bash
2
+ #SBATCH -J discophon
3
+ #SBATCH --cpus-per-task=10
4
+ #SBATCH --time=05:00:00
5
+ #SBATCH --array=1-12
6
+
7
+ set -euo pipefail
8
+
9
+ usage() {
10
+ cat << EOF
11
+ Usage: $0 DATASET UNITS FEATURES OUTPUT N_UNITS [STEP_UNITS]
12
+
13
+ Arguments:
14
+ DATASET Path to the dataset
15
+ UNITS Path to the discrete units (used for benchmarks 'discovery' and 'abx-discrete')
16
+ FEATURES Path to the continuous features (used for benchmark 'abx-continuous')
17
+ OUTPUT Output JSONL file
18
+ N_UNITS Number of units
19
+ STEP_UNITS Step size for units (default: 20)
20
+
21
+ Description:
22
+ Runs all tasks in the Phoneme Discovery benchmark, across all languages and splits combinations.
23
+ Array tasks 1-4: phoneme discovery
24
+ Array tasks 5-8: ABX discrete
25
+ Array tasks 9-12: ABX continuous
26
+
27
+ Example:
28
+ sbatch $0 ./path/to/dataset ./path/to/units ./path/to/features ./path/to/output 256
29
+ EOF
30
+ }
31
+
32
+ if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
33
+ usage
34
+ exit 0
35
+ fi
36
+
37
+ if [[ $# -lt 5 ]]; then
38
+ echo "Error: Missing required arguments." >&2
39
+ usage
40
+ exit 1
41
+ fi
42
+
43
+ DATASET=$1
44
+ UNITS=$2
45
+ FEATURES=$3
46
+ OUTPUT=$4
47
+ N_UNITS=$5
48
+ STEP_UNITS=${6:-20}
49
+
50
+ benchmark=$(( (SLURM_ARRAY_TASK_ID - 1) / 4 ))
51
+ benchmark=$(echo "discovery abx-discrete abx-continuous" | cut -d' ' -f$((benchmark + 1)))
52
+ [[ "$benchmark" == "abx-continuous" ]] && predictions="$FEATURES" || predictions="$UNITS"
53
+ task=$(( (SLURM_ARRAY_TASK_ID - 1) % 4 ))
54
+ [[ $task -lt 2 ]] && split="dev" || split="test"
55
+ [[ $((task % 2)) -eq 0 ]] && languages="dev" || languages="test"
56
+
57
+ srun python -m discophon.benchmark \
58
+ "$DATASET" \
59
+ "$predictions" \
60
+ "$OUTPUT" \
61
+ --languages "$languages" \
62
+ --split "$split" \
63
+ --benchmark "$benchmark" \
64
+ --n-units "$N_UNITS" \
65
+ --step-units "$STEP_UNITS"