discophon 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- discophon-0.0.3/.github/workflows/release.yml +109 -0
- discophon-0.0.3/.gitignore +7 -0
- discophon-0.0.3/.justfile +17 -0
- discophon-0.0.3/.python-version +1 -0
- discophon-0.0.3/LICENSE +21 -0
- discophon-0.0.3/PKG-INFO +83 -0
- discophon-0.0.3/README.md +60 -0
- discophon-0.0.3/TODO +4 -0
- discophon-0.0.3/docs/baselines.md +0 -0
- discophon-0.0.3/docs/evaluate.md +77 -0
- discophon-0.0.3/docs/prepare.md +87 -0
- discophon-0.0.3/docs/submission.md +0 -0
- discophon-0.0.3/pyproject.toml +116 -0
- discophon-0.0.3/scripts/benchmark.slurm +65 -0
- discophon-0.0.3/scripts/finetune_spidr.py +125 -0
- discophon-0.0.3/src/discophon/__init__.py +0 -0
- discophon-0.0.3/src/discophon/benchmark.py +137 -0
- discophon-0.0.3/src/discophon/core/__init__.py +16 -0
- discophon-0.0.3/src/discophon/core/assets/phonology.json +452 -0
- discophon-0.0.3/src/discophon/core/assets/sonority.json +139 -0
- discophon-0.0.3/src/discophon/core/assets/tipa.json +139 -0
- discophon-0.0.3/src/discophon/core/data.py +103 -0
- discophon-0.0.3/src/discophon/core/languages.py +103 -0
- discophon-0.0.3/src/discophon/core/validation.py +91 -0
- discophon-0.0.3/src/discophon/evaluate/__init__.py +5 -0
- discophon-0.0.3/src/discophon/evaluate/__main__.py +27 -0
- discophon-0.0.3/src/discophon/evaluate/abx.py +69 -0
- discophon-0.0.3/src/discophon/evaluate/boundaries.py +151 -0
- discophon-0.0.3/src/discophon/evaluate/evaluate.py +42 -0
- discophon-0.0.3/src/discophon/evaluate/per.py +51 -0
- discophon-0.0.3/src/discophon/evaluate/pnmi.py +191 -0
- discophon-0.0.3/src/discophon/prepare/__init__.py +3 -0
- discophon-0.0.3/src/discophon/prepare/__main__.py +26 -0
- discophon-0.0.3/src/discophon/prepare/core.py +94 -0
- discophon-0.0.3/src/discophon/py.typed +0 -0
- discophon-0.0.3/tests/__init__.py +0 -0
- discophon-0.0.3/tests/test_benchmark_integrity.py +0 -0
- discophon-0.0.3/tests/test_tipa_mapping.py +39 -0
- discophon-0.0.3/uv.lock +2623 -0
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
name: Release Workflow
|
|
2
|
+
permissions: {}
|
|
3
|
+
|
|
4
|
+
on:
|
|
5
|
+
push:
|
|
6
|
+
tags:
|
|
7
|
+
- "*"
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
name: Linters
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
permissions:
|
|
14
|
+
security-events: write
|
|
15
|
+
contents: read # Fetch repository
|
|
16
|
+
actions: read
|
|
17
|
+
steps:
|
|
18
|
+
- name: Checkout code
|
|
19
|
+
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
|
20
|
+
with:
|
|
21
|
+
persist-credentials: false
|
|
22
|
+
|
|
23
|
+
# - name: zizmor
|
|
24
|
+
# uses: zizmorcore/zizmor-action@e639db99335bc9038abc0e066dfcd72e23d26fb4 # v0.3.0
|
|
25
|
+
|
|
26
|
+
- name: typos
|
|
27
|
+
uses: crate-ci/typos@bb4666ad77b539a6b4ce4eda7ebb6de553704021 # v1.42.0
|
|
28
|
+
|
|
29
|
+
- name: Ruff
|
|
30
|
+
uses: astral-sh/ruff-action@57714a7c8a2e59f32539362ba31877a1957dded1 # v3.5.1
|
|
31
|
+
|
|
32
|
+
build:
|
|
33
|
+
name: Build distribution
|
|
34
|
+
runs-on: ubuntu-latest
|
|
35
|
+
needs: lint
|
|
36
|
+
permissions:
|
|
37
|
+
contents: read # Fetch repository
|
|
38
|
+
actions: write # Upload artifact
|
|
39
|
+
steps:
|
|
40
|
+
- name: Checkout code
|
|
41
|
+
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
|
42
|
+
with:
|
|
43
|
+
persist-credentials: false
|
|
44
|
+
|
|
45
|
+
- name: Install uv
|
|
46
|
+
uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # v7.2.0
|
|
47
|
+
with:
|
|
48
|
+
version: "latest"
|
|
49
|
+
enable-cache: false
|
|
50
|
+
|
|
51
|
+
- name: Build wheel
|
|
52
|
+
run: uv build
|
|
53
|
+
|
|
54
|
+
- name: Upload dists
|
|
55
|
+
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
|
|
56
|
+
with:
|
|
57
|
+
name: python-package-distributions
|
|
58
|
+
path: dist/
|
|
59
|
+
|
|
60
|
+
publish-to-pypi:
|
|
61
|
+
name: Publish to PyPI
|
|
62
|
+
runs-on: ubuntu-latest
|
|
63
|
+
needs: build
|
|
64
|
+
environment:
|
|
65
|
+
name: publish-to-pypi
|
|
66
|
+
permissions:
|
|
67
|
+
actions: read # Download artifact
|
|
68
|
+
id-token: write # Needed for trusted publishing
|
|
69
|
+
steps:
|
|
70
|
+
- name: Download all dists
|
|
71
|
+
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
|
|
72
|
+
with:
|
|
73
|
+
name: python-package-distributions
|
|
74
|
+
path: dist/
|
|
75
|
+
|
|
76
|
+
- name: Install uv
|
|
77
|
+
uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # v7.2.0
|
|
78
|
+
with:
|
|
79
|
+
version: "latest"
|
|
80
|
+
enable-cache: false
|
|
81
|
+
|
|
82
|
+
- name: Upload to PyPI
|
|
83
|
+
# run: uv publish --trusted-publishing always
|
|
84
|
+
run: uv publish -t ${{ secrets.PYPI_TOKEN }}
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
github-release:
|
|
88
|
+
name: GitHub release
|
|
89
|
+
runs-on: ubuntu-latest
|
|
90
|
+
needs: build
|
|
91
|
+
permissions:
|
|
92
|
+
actions: read # Download artifact
|
|
93
|
+
contents: write # Create GitHub release
|
|
94
|
+
steps:
|
|
95
|
+
- name: Checkout code
|
|
96
|
+
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
|
97
|
+
with:
|
|
98
|
+
persist-credentials: false
|
|
99
|
+
|
|
100
|
+
- name: Download all dists
|
|
101
|
+
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
|
|
102
|
+
with:
|
|
103
|
+
name: python-package-distributions
|
|
104
|
+
path: ./dist/
|
|
105
|
+
|
|
106
|
+
- name: Create release
|
|
107
|
+
env:
|
|
108
|
+
GITHUB_TOKEN: ${{ github.token }}
|
|
109
|
+
run: gh release create "${GITHUB_REF#refs/tags/}" ./dist/*
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
set quiet
|
|
2
|
+
|
|
3
|
+
[private]
|
|
4
|
+
@default:
|
|
5
|
+
just --list
|
|
6
|
+
|
|
7
|
+
# Test
|
|
8
|
+
test:
|
|
9
|
+
uv run --group test pytest
|
|
10
|
+
|
|
11
|
+
# Lint and format
|
|
12
|
+
lint:
|
|
13
|
+
uv run --dev ruff check
|
|
14
|
+
uv run --dev ruff format
|
|
15
|
+
uv run --dev tombi format
|
|
16
|
+
uv run --dev typos
|
|
17
|
+
uv run --dev ty check src
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
discophon-0.0.3/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 CoML
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
discophon-0.0.3/PKG-INFO
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: discophon
|
|
3
|
+
Version: 0.0.3
|
|
4
|
+
Summary: The Phoneme Discovery Benchmark
|
|
5
|
+
Author-email: CoML <dev@cognitive-ml.fr>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: >=3.12
|
|
9
|
+
Requires-Dist: filelock>=3.20.2
|
|
10
|
+
Requires-Dist: httpx>=0.28.1
|
|
11
|
+
Requires-Dist: joblib>=1.5.3
|
|
12
|
+
Requires-Dist: numba>=0.63.1
|
|
13
|
+
Requires-Dist: numpy>=2.3.5
|
|
14
|
+
Requires-Dist: polars>=1.36.1
|
|
15
|
+
Requires-Dist: praat-textgrids>=1.4.0
|
|
16
|
+
Requires-Dist: soundfile>=0.13.1
|
|
17
|
+
Requires-Dist: soxr>=1.0.0
|
|
18
|
+
Requires-Dist: tqdm>=4.67.1
|
|
19
|
+
Requires-Dist: xarray>=2025.12.0
|
|
20
|
+
Provides-Extra: abx
|
|
21
|
+
Requires-Dist: fastabx>=0.6.1; extra == 'abx'
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# The Phoneme Discovery benchmark
|
|
25
|
+
|
|
26
|
+
[💾 [Website](https://benchmarks.cognitive-ml.fr/phoneme_discovery)] [📜 [Paper]()] [📖 [BibTex](https://github.com/bootphon/phoneme_discovery?tab=readme-ov-file#citation)]
|
|
27
|
+
|
|
28
|
+
## Introduction
|
|
29
|
+
|
|
30
|
+
The last several years have seen revolutionary improvements in both speech processing and textual natural language
|
|
31
|
+
processing. In both cases, unsupervised or self-supervised pre-training has been the key to models autonomously
|
|
32
|
+
discovering representations that are tremendously useful for doing language tasks. Yet, central to the study of human
|
|
33
|
+
speech processing is the phoneme inventory, a small set of discrete units that abstract away from massive pronunciation
|
|
34
|
+
variability in the signal.
|
|
35
|
+
|
|
36
|
+
Discovering the correct set of phonemes for a language is crucial: encode the wrong categories, and contrasts between
|
|
37
|
+
words are distorted or disappear; fail to categorize at all, and contrasts between words are hidden behind semantically
|
|
38
|
+
irrelevant variation in the signal. While much attention has been paid to whether unsupervised speech models’
|
|
39
|
+
(continuous or discrete) representations are predictive of phonemes, this benchmark, for the first time, explicitly
|
|
40
|
+
fixes the goal of learning a discrete set of categories that are in one-to-one correspondence with the phoneme
|
|
41
|
+
inventory of a language.
|
|
42
|
+
|
|
43
|
+
Infants appear to learn the phoneme inventory of their language effortlessly, before they can speak. They benefit from
|
|
44
|
+
millions of years of evolution of the human brain and body, giving them a learning architecture that allows them to
|
|
45
|
+
thrive in the face of scarce and noisy language data, preparing them to learn the phoneme inventory of any human
|
|
46
|
+
language.
|
|
47
|
+
|
|
48
|
+
The Phoneme Discovery benchmark is aimed at building models that discover phoneme inventories across various languages,
|
|
49
|
+
using only small amounts of speech data, and without textual data during training.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install discophon
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
To be able to compute ABX discriminabilities: `pip install discophon[abx]`.
|
|
58
|
+
|
|
59
|
+
If you want to run baselines and have access to the utility scripts, clone this repository:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
git clone https://github.com/bootphon/phoneme_discovery
|
|
63
|
+
cd phoneme_discovery
|
|
64
|
+
uv sync
|
|
65
|
+
# uv sync --all-extras --all-groups # If you want all dependencies
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Usage
|
|
69
|
+
|
|
70
|
+
Check out the documentation:
|
|
71
|
+
|
|
72
|
+
- [Data preparation](https://github.com/bootphon/phoneme_discovery/blob/main/docs/prepare.md)
|
|
73
|
+
- [Simple evaluation](https://github.com/bootphon/phoneme_discovery/blob/main/docs/evaluate.md)
|
|
74
|
+
- [Run the benchmark](https://github.com/bootphon/phoneme_discovery/blob/main/benchmark.md)
|
|
75
|
+
- [Use the baseline systems](https://github.com/bootphon/phoneme_discovery/blob/main/docs/baselines.md)
|
|
76
|
+
|
|
77
|
+
### Citation
|
|
78
|
+
|
|
79
|
+
```bibtex
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Contact: `benchmarks [at] cognitive-ml [dot] fr`
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# The Phoneme Discovery benchmark
|
|
2
|
+
|
|
3
|
+
[💾 [Website](https://benchmarks.cognitive-ml.fr/phoneme_discovery)] [📜 [Paper]()] [📖 [BibTex](https://github.com/bootphon/phoneme_discovery?tab=readme-ov-file#citation)]
|
|
4
|
+
|
|
5
|
+
## Introduction
|
|
6
|
+
|
|
7
|
+
The last several years have seen revolutionary improvements in both speech processing and textual natural language
|
|
8
|
+
processing. In both cases, unsupervised or self-supervised pre-training has been the key to models autonomously
|
|
9
|
+
discovering representations that are tremendously useful for doing language tasks. Yet, central to the study of human
|
|
10
|
+
speech processing is the phoneme inventory, a small set of discrete units that abstract away from massive pronunciation
|
|
11
|
+
variability in the signal.
|
|
12
|
+
|
|
13
|
+
Discovering the correct set of phonemes for a language is crucial: encode the wrong categories, and contrasts between
|
|
14
|
+
words are distorted or disappear; fail to categorize at all, and contrasts between words are hidden behind semantically
|
|
15
|
+
irrelevant variation in the signal. While much attention has been paid to whether unsupervised speech models’
|
|
16
|
+
(continuous or discrete) representations are predictive of phonemes, this benchmark, for the first time, explicitly
|
|
17
|
+
fixes the goal of learning a discrete set of categories that are in one-to-one correspondence with the phoneme
|
|
18
|
+
inventory of a language.
|
|
19
|
+
|
|
20
|
+
Infants appear to learn the phoneme inventory of their language effortlessly, before they can speak. They benefit from
|
|
21
|
+
millions of years of evolution of the human brain and body, giving them a learning architecture that allows them to
|
|
22
|
+
thrive in the face of scarce and noisy language data, preparing them to learn the phoneme inventory of any human
|
|
23
|
+
language.
|
|
24
|
+
|
|
25
|
+
The Phoneme Discovery benchmark is aimed at building models that discover phoneme inventories across various languages,
|
|
26
|
+
using only small amounts of speech data, and without textual data during training.
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install discophon
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
To be able to compute ABX discriminabilities: `pip install discophon[abx]`.
|
|
35
|
+
|
|
36
|
+
If you want to run baselines and have access to the utility scripts, clone this repository:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
git clone https://github.com/bootphon/phoneme_discovery
|
|
40
|
+
cd phoneme_discovery
|
|
41
|
+
uv sync
|
|
42
|
+
# uv sync --all-extras --all-groups # If you want all dependencies
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Usage
|
|
46
|
+
|
|
47
|
+
Check out the documentation:
|
|
48
|
+
|
|
49
|
+
- [Data preparation](https://github.com/bootphon/phoneme_discovery/blob/main/docs/prepare.md)
|
|
50
|
+
- [Simple evaluation](https://github.com/bootphon/phoneme_discovery/blob/main/docs/evaluate.md)
|
|
51
|
+
- [Run the benchmark](https://github.com/bootphon/phoneme_discovery/blob/main/benchmark.md)
|
|
52
|
+
- [Use the baseline systems](https://github.com/bootphon/phoneme_discovery/blob/main/docs/baselines.md)
|
|
53
|
+
|
|
54
|
+
### Citation
|
|
55
|
+
|
|
56
|
+
```bibtex
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Contact: `benchmarks [at] cognitive-ml [dot] fr`
|
discophon-0.0.3/TODO
ADDED
|
File without changes
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Evaluation
|
|
2
|
+
|
|
3
|
+
## Phoneme discovery
|
|
4
|
+
|
|
5
|
+
You can use the `phoneme_discovery` function with `units: dict[str, list[int]]`, and `phones: dict[str, list[str]]`.
|
|
6
|
+
You also need to set the number of units `n_units`, of phonemes `n_phones`, and the step (in ms) between consecutive
|
|
7
|
+
units `step_units`.
|
|
8
|
+
|
|
9
|
+
Example:
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
from discophon.core import read_gold_annotations, read_submitted_units
|
|
13
|
+
from discophon.evaluate import phoneme_discovery
|
|
14
|
+
|
|
15
|
+
phones = read_gold_annotations("/path/to/alignments/dataset.align")
|
|
16
|
+
units = read_submitted_units("/path/to/predictions/units.jsonl")
|
|
17
|
+
result = phoneme_discovery(units, phones, n_units=256, n_phones=40, step_units=20)
|
|
18
|
+
print(result)
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Or via the CLI:
|
|
22
|
+
|
|
23
|
+
```console
|
|
24
|
+
❯ python -m discophon.evaluate --help
|
|
25
|
+
usage: discophon.evaluate [-h] [--n-units N_UNITS] [--n-phones N_PHONES] [--step-units STEP_UNITS] units phones
|
|
26
|
+
|
|
27
|
+
Evaluate predicted units on phoneme discovery
|
|
28
|
+
|
|
29
|
+
positional arguments:
|
|
30
|
+
units path to predicted units
|
|
31
|
+
phones path to gold alignments
|
|
32
|
+
|
|
33
|
+
options:
|
|
34
|
+
-h, --help show this help message and exit
|
|
35
|
+
--n-units N_UNITS number of units
|
|
36
|
+
--n-phones N_PHONES number of phonemes
|
|
37
|
+
--step-units STEP_UNITS
|
|
38
|
+
step between units (in ms)
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## ABX
|
|
42
|
+
|
|
43
|
+
The ABX evaluation is done separately. First, install this package with the `abx` optional dependencies:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install discophon[abx]
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Then, either run it in Python:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from discophon.evaluate.abx import discrete_abx, continuous_abx
|
|
53
|
+
|
|
54
|
+
result_discrete = discrete_abx("/path/to/item/dataset.item", "/path/to/predictions/units.jsonl", frequency=50)
|
|
55
|
+
print("Discrete: ", result_discrete)
|
|
56
|
+
|
|
57
|
+
result_continuous = continuous_abx("/path/to/item/dataset.item", "/path/to/features", frequency=50)
|
|
58
|
+
print("Continuous: ", result_continuous)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Or via the CLI:
|
|
62
|
+
|
|
63
|
+
```console
|
|
64
|
+
❯ python -m discophon.evaluate.abx --help
|
|
65
|
+
usage: discophon.evaluate.abx [-h] --frequency FREQUENCY item root
|
|
66
|
+
|
|
67
|
+
Continuous or discrete ABX
|
|
68
|
+
|
|
69
|
+
positional arguments:
|
|
70
|
+
item Path to the item file
|
|
71
|
+
root Path to the JSONL with units or directory with continuous features
|
|
72
|
+
|
|
73
|
+
options:
|
|
74
|
+
-h, --help show this help message and exit
|
|
75
|
+
--frequency FREQUENCY
|
|
76
|
+
Units frequency in Hz
|
|
77
|
+
```
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Data preparation
|
|
2
|
+
|
|
3
|
+
You need the `sox` binary available in your `$PATH` for pre-processing audio files.
|
|
4
|
+
|
|
5
|
+
Let's say you want to install the benchmark data and assets in a directory `$DATA`.
|
|
6
|
+
|
|
7
|
+
## Download Common Voice data
|
|
8
|
+
|
|
9
|
+
You first need to download audio data from CommonVoice. You can use their API if you don't want to download large
|
|
10
|
+
files on your local computer.
|
|
11
|
+
|
|
12
|
+
Download everything in `$DATA/raw`.
|
|
13
|
+
|
|
14
|
+
Dev languages:
|
|
15
|
+
|
|
16
|
+
- [Common Voice Scripted Speech 23.0 - German](https://datacollective.mozillafoundation.org/datasets/cmflnuzw5p0q7ydlq4k8skhqi) (34.41 GB)
|
|
17
|
+
- [Common Voice Scripted Speech 23.0 - Swahili](https://datacollective.mozillafoundation.org/datasets/cmflnuzw7mjcay14kmowc4y96) (21.23 GB)
|
|
18
|
+
- [Common Voice Scripted Speech 23.0 - Tamil](https://datacollective.mozillafoundation.org/datasets/cmflnuzw73r9g1avrbu6bwkfx) (8.56 GB)
|
|
19
|
+
- [Common Voice Scripted Speech 23.0 - Thai](https://datacollective.mozillafoundation.org/datasets/cmflnuzw7fwn4fc969r5owufz) (8.35 GB)
|
|
20
|
+
- [Common Voice Scripted Speech 23.0 - Turkish](https://datacollective.mozillafoundation.org/datasets/cmflnuzw71qkz8x3kil3tgjvk) (2.73 GB)
|
|
21
|
+
- [Common Voice Scripted Speech 23.0 - Ukrainian](https://datacollective.mozillafoundation.org/datasets/cmflnuzw7ijdv5oe9u7ky0zrc) (2.55 GB)
|
|
22
|
+
|
|
23
|
+
Test languages:
|
|
24
|
+
|
|
25
|
+
- [Common Voice Scripted Speech 23.0 - Basque](https://datacollective.mozillafoundation.org/datasets/cmflnuzw5qoauo49kpf8y1gzp) (14.58 GB)
|
|
26
|
+
- [Common Voice Scripted Speech 23.0 - Chinese (China)](https://datacollective.mozillafoundation.org/datasets/cmflnuzw8fvgv2vdgt6f52qvh) (21.26 GB)
|
|
27
|
+
- [Common Voice Scripted Speech 23.0 - English](https://datacollective.mozillafoundation.org/datasets/cmflnuzw52mzok78yz6woemc1) (86.83 GB)
|
|
28
|
+
- [Common Voice Scripted Speech 23.0 - French](https://datacollective.mozillafoundation.org/datasets/cmflnuzw5ahjms0zbrcl0vg4e) (27.87 GB)
|
|
29
|
+
- [Common Voice Scripted Speech 23.0 - Japanese](https://datacollective.mozillafoundation.org/datasets/cmflnuzw5lv4n3cd25tbavjb9) (11.80 GB)
|
|
30
|
+
- Wolof data comes from a different source, and will be downloaded with the other assets in the following section.
|
|
31
|
+
|
|
32
|
+
Extract each archive, with `tar --strip-components=1 -xvf ...`.
|
|
33
|
+
|
|
34
|
+
For example, let's say your archive is named `mcv-scripted-uk-v23.0.tar.gz`.
|
|
35
|
+
Extract it with `tar --strip-components=1 -xvf mcv-scripted-uk-v23.0.tar.gz`, and move the output directory to
|
|
36
|
+
`$DATA/raw`.
|
|
37
|
+
|
|
38
|
+
You can delete the archives afterwards. You should have the following structure:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
❯ tree -L 2 $DATA
|
|
42
|
+
$DATA
|
|
43
|
+
└── raw
|
|
44
|
+
├── de
|
|
45
|
+
├── en
|
|
46
|
+
├── eu
|
|
47
|
+
├── fr
|
|
48
|
+
├── ja
|
|
49
|
+
├── sw
|
|
50
|
+
├── ta
|
|
51
|
+
├── th
|
|
52
|
+
├── tr
|
|
53
|
+
├── uk
|
|
54
|
+
└── zh-CN
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Download benchmark assets
|
|
58
|
+
|
|
59
|
+
Now download the benchmark assets with the following command:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
python -m discophon.prepare download $DATA
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
This will download:
|
|
66
|
+
- Symlinks to audio files for each split in each language
|
|
67
|
+
- Manifests
|
|
68
|
+
- Alignments and item files
|
|
69
|
+
|
|
70
|
+
## Preprocess selected audio files
|
|
71
|
+
|
|
72
|
+
Now resample audio files and convert them to WAV with the command:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
for code in swa tam tha tur ukr cmn eus jpn; do
|
|
76
|
+
python -m discophon.prepare audio $DATA $code
|
|
77
|
+
done
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
This will create directories `$DATA/audio/cmn/all`, `$DATA/audio/deu/all`, `$DATA/audio/eng/all`, etc. with
|
|
81
|
+
resampled audio files. The directories corresponding to each split contain symlinks to those files.
|
|
82
|
+
|
|
83
|
+
You should parallelize this loop if you can. If you are in a SLURM cluster, you should also parallelize each dataset
|
|
84
|
+
processing across tasks or array jobs. The `discophon.prepare` package will automatically handle the distribution of
|
|
85
|
+
files to process across jobs.
|
|
86
|
+
|
|
87
|
+
You can delete the `$DATA/raw` folder afterwards.
|
|
File without changes
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "discophon"
|
|
3
|
+
description = "The Phoneme Discovery Benchmark"
|
|
4
|
+
readme = "README.md"
|
|
5
|
+
requires-python = ">=3.12"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
authors = [{ name = "CoML", email = "dev@cognitive-ml.fr" }]
|
|
8
|
+
dependencies = [
|
|
9
|
+
"filelock>=3.20.2",
|
|
10
|
+
"httpx>=0.28.1",
|
|
11
|
+
"joblib>=1.5.3",
|
|
12
|
+
"numba>=0.63.1",
|
|
13
|
+
"numpy>=2.3.5",
|
|
14
|
+
"polars>=1.36.1",
|
|
15
|
+
"praat-textgrids>=1.4.0",
|
|
16
|
+
"soundfile>=0.13.1",
|
|
17
|
+
"soxr>=1.0.0",
|
|
18
|
+
"tqdm>=4.67.1",
|
|
19
|
+
"xarray>=2025.12.0",
|
|
20
|
+
]
|
|
21
|
+
dynamic = ["version"]
|
|
22
|
+
|
|
23
|
+
[project.optional-dependencies]
|
|
24
|
+
abx = [
|
|
25
|
+
"fastabx>=0.6.1",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[dependency-groups]
|
|
29
|
+
dev = [
|
|
30
|
+
"ipykernel>=7.1.0",
|
|
31
|
+
"ruff>=0.14.10",
|
|
32
|
+
"tombi>=0.7.16",
|
|
33
|
+
"ty>=0.0.11",
|
|
34
|
+
"typos>=1.42.0",
|
|
35
|
+
]
|
|
36
|
+
baselines = [
|
|
37
|
+
"minimal-hubert>=0.0.1",
|
|
38
|
+
"spidr[train]>=0.1.3",
|
|
39
|
+
]
|
|
40
|
+
plot = [
|
|
41
|
+
"altair>=6.0.0",
|
|
42
|
+
"matplotlib>=3.10.8",
|
|
43
|
+
"seaborn>=0.13.2",
|
|
44
|
+
"vegafusion>=2.0.3",
|
|
45
|
+
"vl-convert-python>=1.9.0",
|
|
46
|
+
]
|
|
47
|
+
test = [
|
|
48
|
+
"matplotlib>=3.10.8",
|
|
49
|
+
"pytest>=9.0.2",
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
[build-system]
|
|
53
|
+
requires = ["hatch-vcs", "hatchling"]
|
|
54
|
+
build-backend = "hatchling.build"
|
|
55
|
+
|
|
56
|
+
[tool.hatch.version]
|
|
57
|
+
source = "vcs"
|
|
58
|
+
|
|
59
|
+
[tool.pytest]
|
|
60
|
+
log_cli = true
|
|
61
|
+
log_cli_level = "INFO"
|
|
62
|
+
|
|
63
|
+
[tool.ruff]
|
|
64
|
+
line-length = 119
|
|
65
|
+
|
|
66
|
+
[tool.ruff.lint]
|
|
67
|
+
select = [
|
|
68
|
+
"A", # flake8-builtins
|
|
69
|
+
"ARG", # flake8-unused-arguments
|
|
70
|
+
"ANN", # flake8-annotations
|
|
71
|
+
"B", # flake8-bugbear
|
|
72
|
+
"BLE", # flake8-blind-except
|
|
73
|
+
"C4", # flake8-comprehensions
|
|
74
|
+
"C90", # mccabe
|
|
75
|
+
"E", # pycodestyle error
|
|
76
|
+
"EXE", # flake8-executable
|
|
77
|
+
"F", # Pyflakes
|
|
78
|
+
"FBT", # flake8-boolean-trap
|
|
79
|
+
"FLY", # flynt
|
|
80
|
+
"FURB", # refurb
|
|
81
|
+
"G", # flake8-logging-format
|
|
82
|
+
"I", # isort
|
|
83
|
+
"ICN", # flake8-import-conventions
|
|
84
|
+
"ISC", # flake8-implicit-str-concat
|
|
85
|
+
"LOG", # flake8-logging
|
|
86
|
+
"N", # pep8-naming
|
|
87
|
+
"NPY", # NumPy-specific rules
|
|
88
|
+
"PERF", # Perflint
|
|
89
|
+
"PIE", # flake8-pie
|
|
90
|
+
"PGH", # pygrep-hooks
|
|
91
|
+
"PL", # Pylint
|
|
92
|
+
"PTH", # flake8-use-pathlib
|
|
93
|
+
"Q", # flake8-quotes
|
|
94
|
+
"RET", # flake8-return
|
|
95
|
+
"RSE", # flake8-raise
|
|
96
|
+
"RUF", # Ruff-specific rules
|
|
97
|
+
"SIM", # flake8-simplify
|
|
98
|
+
"SLF", # flake8-slots
|
|
99
|
+
"T10", # flake8-debugger
|
|
100
|
+
"TC", # flake8-type-checking
|
|
101
|
+
"TD", # flake8-todos
|
|
102
|
+
"TID", # flake8-tidy-imports
|
|
103
|
+
"UP", # pyupgrade
|
|
104
|
+
"W", # pycodestyle warning
|
|
105
|
+
]
|
|
106
|
+
|
|
107
|
+
[tool.ruff.lint.flake8-tidy-imports]
|
|
108
|
+
banned-module-level-imports = ["discophon.evaluate.abx"]
|
|
109
|
+
|
|
110
|
+
[tool.ruff.lint.pylint]
|
|
111
|
+
allow-magic-value-types = ["int", "str"]
|
|
112
|
+
max-args = 10
|
|
113
|
+
max-statements = 75
|
|
114
|
+
|
|
115
|
+
[tool.typos.default.extend-words]
|
|
116
|
+
tha = "tha"
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
#SBATCH -J discophon
|
|
3
|
+
#SBATCH --cpus-per-task=10
|
|
4
|
+
#SBATCH --time=05:00:00
|
|
5
|
+
#SBATCH --array=1-12
|
|
6
|
+
|
|
7
|
+
set -euo pipefail
|
|
8
|
+
|
|
9
|
+
usage() {
|
|
10
|
+
cat << EOF
|
|
11
|
+
Usage: $0 DATASET UNITS FEATURES OUTPUT N_UNITS [STEP_UNITS]
|
|
12
|
+
|
|
13
|
+
Arguments:
|
|
14
|
+
DATASET Path to the dataset
|
|
15
|
+
UNITS Path to the discrete units (used for benchmarks 'discovery' and 'abx-discrete')
|
|
16
|
+
FEATURES Path to the continuous features (used for benchmark 'abx-continuous')
|
|
17
|
+
OUTPUT Output JSONL file
|
|
18
|
+
N_UNITS Number of units
|
|
19
|
+
STEP_UNITS Step size for units (default: 20)
|
|
20
|
+
|
|
21
|
+
Description:
|
|
22
|
+
Runs all tasks in the Phoneme Discovery benchmark, across all languages and splits combinations.
|
|
23
|
+
Array tasks 1-4: phoneme discovery
|
|
24
|
+
Array tasks 5-8: ABX discrete
|
|
25
|
+
Array tasks 9-12: ABX continuous
|
|
26
|
+
|
|
27
|
+
Example:
|
|
28
|
+
sbatch $0 ./path/to/dataset ./path/to/units ./path/to/features ./path/to/output 256
|
|
29
|
+
EOF
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
|
|
33
|
+
usage
|
|
34
|
+
exit 0
|
|
35
|
+
fi
|
|
36
|
+
|
|
37
|
+
if [[ $# -lt 5 ]]; then
|
|
38
|
+
echo "Error: Missing required arguments." >&2
|
|
39
|
+
usage
|
|
40
|
+
exit 1
|
|
41
|
+
fi
|
|
42
|
+
|
|
43
|
+
DATASET=$1
|
|
44
|
+
UNITS=$2
|
|
45
|
+
FEATURES=$3
|
|
46
|
+
OUTPUT=$4
|
|
47
|
+
N_UNITS=$5
|
|
48
|
+
STEP_UNITS=${6:-20}
|
|
49
|
+
|
|
50
|
+
benchmark=$(( (SLURM_ARRAY_TASK_ID - 1) / 4 ))
|
|
51
|
+
benchmark=$(echo "discovery abx-discrete abx-continuous" | cut -d' ' -f$((benchmark + 1)))
|
|
52
|
+
[[ "$benchmark" == "abx-continuous" ]] && predictions="$FEATURES" || predictions="$UNITS"
|
|
53
|
+
task=$(( (SLURM_ARRAY_TASK_ID - 1) % 4 ))
|
|
54
|
+
[[ $task -lt 2 ]] && split="dev" || split="test"
|
|
55
|
+
[[ $((task % 2)) -eq 0 ]] && languages="dev" || languages="test"
|
|
56
|
+
|
|
57
|
+
srun python -m discophon.benchmark \
|
|
58
|
+
"$DATASET" \
|
|
59
|
+
"$predictions" \
|
|
60
|
+
"$OUTPUT" \
|
|
61
|
+
--languages "$languages" \
|
|
62
|
+
--split "$split" \
|
|
63
|
+
--benchmark "$benchmark" \
|
|
64
|
+
--n-units "$N_UNITS" \
|
|
65
|
+
--step-units "$STEP_UNITS"
|