transcribe-studio 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. transcribe_studio-0.2.0/LICENSE +21 -0
  2. transcribe_studio-0.2.0/PKG-INFO +128 -0
  3. transcribe_studio-0.2.0/README.md +95 -0
  4. transcribe_studio-0.2.0/app/__init__.py +0 -0
  5. transcribe_studio-0.2.0/app/analytics.py +131 -0
  6. transcribe_studio-0.2.0/app/config/evaluation.toml +45 -0
  7. transcribe_studio-0.2.0/app/config/languages/en.toml +242 -0
  8. transcribe_studio-0.2.0/app/config/transcript_formats.toml +47 -0
  9. transcribe_studio-0.2.0/app/database.py +126 -0
  10. transcribe_studio-0.2.0/app/evaluation/__init__.py +18 -0
  11. transcribe_studio-0.2.0/app/evaluation/config.py +197 -0
  12. transcribe_studio-0.2.0/app/evaluation/engine.py +289 -0
  13. transcribe_studio-0.2.0/app/evaluation/matchers/__init__.py +7 -0
  14. transcribe_studio-0.2.0/app/evaluation/matchers/base.py +44 -0
  15. transcribe_studio-0.2.0/app/evaluation/matchers/semantic.py +244 -0
  16. transcribe_studio-0.2.0/app/evaluation/metrics/__init__.py +7 -0
  17. transcribe_studio-0.2.0/app/evaluation/metrics/base.py +61 -0
  18. transcribe_studio-0.2.0/app/evaluation/metrics/wer.py +195 -0
  19. transcribe_studio-0.2.0/app/evaluation/models.py +133 -0
  20. transcribe_studio-0.2.0/app/evaluation/normalizers/__init__.py +7 -0
  21. transcribe_studio-0.2.0/app/evaluation/normalizers/base.py +49 -0
  22. transcribe_studio-0.2.0/app/evaluation/normalizers/en.py +137 -0
  23. transcribe_studio-0.2.0/app/export_formats.py +178 -0
  24. transcribe_studio-0.2.0/app/main.py +854 -0
  25. transcribe_studio-0.2.0/app/navigation.py +103 -0
  26. transcribe_studio-0.2.0/app/paths.py +22 -0
  27. transcribe_studio-0.2.0/app/services/__init__.py +1 -0
  28. transcribe_studio-0.2.0/app/services/projects.py +182 -0
  29. transcribe_studio-0.2.0/app/static/css/style.css +1892 -0
  30. transcribe_studio-0.2.0/app/static/js/editor.js +1001 -0
  31. transcribe_studio-0.2.0/app/static/js/evaluation.js +361 -0
  32. transcribe_studio-0.2.0/app/templates/analysis.html +97 -0
  33. transcribe_studio-0.2.0/app/templates/base.html +31 -0
  34. transcribe_studio-0.2.0/app/templates/dashboard.html +77 -0
  35. transcribe_studio-0.2.0/app/templates/editor.html +223 -0
  36. transcribe_studio-0.2.0/app/templates/evaluation.html +129 -0
  37. transcribe_studio-0.2.0/app/templates/partials/breadcrumbs.html +15 -0
  38. transcribe_studio-0.2.0/app/templates/partials/recording_header.html +32 -0
  39. transcribe_studio-0.2.0/app/templates/partials/recording_tabs.html +10 -0
  40. transcribe_studio-0.2.0/app/templates/partials/sidebar.html +47 -0
  41. transcribe_studio-0.2.0/app/templates/partials/upload_form.html +26 -0
  42. transcribe_studio-0.2.0/app/templates/recordings.html +66 -0
  43. transcribe_studio-0.2.0/app/templates/screens/dashboard.html +80 -0
  44. transcribe_studio-0.2.0/app/templates/screens/project_detail.html +89 -0
  45. transcribe_studio-0.2.0/app/templates/screens/project_form.html +98 -0
  46. transcribe_studio-0.2.0/app/templates/screens/project_upload.html +47 -0
  47. transcribe_studio-0.2.0/app/templates/upload.html +64 -0
  48. transcribe_studio-0.2.0/app/transcript_formats/__init__.py +21 -0
  49. transcribe_studio-0.2.0/app/transcript_formats/align.py +77 -0
  50. transcribe_studio-0.2.0/app/transcript_formats/config.py +86 -0
  51. transcribe_studio-0.2.0/app/transcript_formats/models.py +42 -0
  52. transcribe_studio-0.2.0/app/transcript_formats/parsers/__init__.py +9 -0
  53. transcribe_studio-0.2.0/app/transcript_formats/parsers/base.py +49 -0
  54. transcribe_studio-0.2.0/app/transcript_formats/parsers/json_segments.py +81 -0
  55. transcribe_studio-0.2.0/app/transcript_formats/parsers/plain_text.py +21 -0
  56. transcribe_studio-0.2.0/app/transcript_formats/parsers/timestamp_speaker.py +80 -0
  57. transcribe_studio-0.2.0/app/transcript_formats/registry.py +103 -0
  58. transcribe_studio-0.2.0/app/web/__init__.py +1 -0
  59. transcribe_studio-0.2.0/app/web/context.py +87 -0
  60. transcribe_studio-0.2.0/app/web/deps.py +43 -0
  61. transcribe_studio-0.2.0/app/web/routes/__init__.py +16 -0
  62. transcribe_studio-0.2.0/app/web/routes/dashboard.py +34 -0
  63. transcribe_studio-0.2.0/app/web/routes/legacy.py +66 -0
  64. transcribe_studio-0.2.0/app/web/routes/projects.py +116 -0
  65. transcribe_studio-0.2.0/app/web/routes/recordings_pages.py +86 -0
  66. transcribe_studio-0.2.0/pyproject.toml +52 -0
  67. transcribe_studio-0.2.0/setup.cfg +4 -0
  68. transcribe_studio-0.2.0/tests/test_evaluation.py +81 -0
  69. transcribe_studio-0.2.0/tests/test_transcript_formats.py +66 -0
  70. transcribe_studio-0.2.0/transcribe_studio.egg-info/PKG-INFO +128 -0
  71. transcribe_studio-0.2.0/transcribe_studio.egg-info/SOURCES.txt +73 -0
  72. transcribe_studio-0.2.0/transcribe_studio.egg-info/dependency_links.txt +1 -0
  73. transcribe_studio-0.2.0/transcribe_studio.egg-info/entry_points.txt +2 -0
  74. transcribe_studio-0.2.0/transcribe_studio.egg-info/requires.txt +10 -0
  75. transcribe_studio-0.2.0/transcribe_studio.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mishkat Quantum Labs
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,128 @@
1
+ Metadata-Version: 2.4
2
+ Name: transcribe-studio
3
+ Version: 0.2.0
4
+ Summary: Local classroom audio transcription with projects, WER evaluation, and pluggable transcript formats
5
+ Author: Mishkat Quantum Labs
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Mishkat-Quantum-Labs/transcribe-studio
8
+ Project-URL: Repository, https://github.com/Mishkat-Quantum-Labs/transcribe-studio
9
+ Project-URL: Issues, https://github.com/Mishkat-Quantum-Labs/transcribe-studio/issues
10
+ Project-URL: Changelog, https://github.com/Mishkat-Quantum-Labs/transcribe-studio/blob/main/CHANGELOG.md
11
+ Keywords: transcription,classroom,audio,wer,speech,evaluation
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Web Environment
14
+ Classifier: Framework :: FastAPI
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
19
+ Classifier: Topic :: Scientific/Engineering
20
+ Requires-Python: >=3.11
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: fastapi>=0.115.0
24
+ Requires-Dist: uvicorn[standard]>=0.32.0
25
+ Requires-Dist: jinja2>=3.1.0
26
+ Requires-Dist: python-multipart>=0.0.12
27
+ Requires-Dist: aiofiles>=24.1.0
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=8.0; extra == "dev"
30
+ Requires-Dist: build>=1.0; extra == "dev"
31
+ Requires-Dist: twine>=5.0; extra == "dev"
32
+ Dynamic: license-file
33
+
34
+ # Transcribe Studio
35
+
36
+ [![CI](https://github.com/Mishkat-Quantum-Labs/transcribe-studio/actions/workflows/ci.yml/badge.svg)](https://github.com/Mishkat-Quantum-Labs/transcribe-studio/actions/workflows/ci.yml)
37
+
38
+ A local, browser-based tool for classroom audio transcription. Organize work by **project**, split audio into timed chunks, label speakers in free text, and evaluate human transcripts against LLM output (WER + semantic WER).
39
+
40
+ Built for researchers and annotators who need millisecond timestamps and exportable data — without Label Studio complexity.
41
+
42
+ ## Features
43
+
44
+ - **Projects** — group recordings by class, session, or study
45
+ - **Waveform editor** — divide audio into chunks (by duration or count), overlap speakers at the same timestamp
46
+ - **Chunk playback** — play one chunk at a time with **speed up/down** (0.25×–2×, keys `,` / `.`)
47
+ - **Exports** — TXT, Markdown, JSON, CSV, SRT, WebVTT
48
+ - **LLM evaluation** — paste or upload hypothesis transcripts; strict + semantic WER
49
+ - **Pluggable formats** — timestamp/speaker lines, JSON segments, plain text (TOML-driven)
50
+
51
+ ## Quick start
52
+
53
+ ### With uv (recommended)
54
+
55
+ ```bash
56
+ git clone https://github.com/Mishkat-Quantum-Labs/transcribe-studio.git
57
+ cd transcribe-studio
58
+ uv venv
59
+ uv pip install -e ".[dev]"
60
+ uv run transcribe-studio
61
+ ```
62
+
63
+ Open **http://127.0.0.1:8082**
64
+
65
+ ### With pip
66
+
67
+ ```bash
68
+ pip install transcribe-studio
69
+ transcribe-studio
70
+ ```
71
+
72
+ ## Usage
73
+
74
+ 1. Create a **project** from the dashboard
75
+ 2. **Upload** an MP3/WAV/M4A/OGG/FLAC recording
76
+ 3. **Divide** the waveform into chunks, then transcribe each segment
77
+ 4. Use **Evaluation** to compare your transcript against an LLM upload
78
+ 5. **Export** when done
79
+
80
+ Data is stored under `~/.transcribe-studio/` (override with `TRANSCRIBE_STUDIO_DATA`).
81
+
82
+ ## Development
83
+
84
+ ```bash
85
+ uv pip install -e ".[dev]"
86
+ uv run pytest
87
+ ```
88
+
89
+ ## Publishing
90
+
91
+ ### PyPI via uv (recommended)
92
+
93
+ ```bash
94
+ uv build
95
+ uv publish # uses UV_PUBLISH_TOKEN or prompts for PyPI credentials
96
+ ```
97
+
98
+ ### PyPI via pip/twine
99
+
100
+ ```bash
101
+ pip install build twine
102
+ python -m build
103
+ twine upload dist/*
104
+ ```
105
+
106
+ ### GitHub release
107
+
108
+ ```bash
109
+ git tag v0.2.0
110
+ git push origin v0.2.0
111
+ gh release create v0.2.0 dist/*
112
+ ```
113
+
114
+ ## Configuration
115
+
116
+ Evaluation and transcript import settings ship inside the package:
117
+
118
+ - `app/config/evaluation.toml`
119
+ - `app/config/transcript_formats.toml`
120
+ - `app/config/languages/en.toml`
121
+
122
+ ## License
123
+
124
+ MIT — see [LICENSE](LICENSE).
125
+
126
+ ## Contributing
127
+
128
+ Issues and PRs welcome at [github.com/Mishkat-Quantum-Labs/transcribe-studio](https://github.com/Mishkat-Quantum-Labs/transcribe-studio).
@@ -0,0 +1,95 @@
1
+ # Transcribe Studio
2
+
3
+ [![CI](https://github.com/Mishkat-Quantum-Labs/transcribe-studio/actions/workflows/ci.yml/badge.svg)](https://github.com/Mishkat-Quantum-Labs/transcribe-studio/actions/workflows/ci.yml)
4
+
5
+ A local, browser-based tool for classroom audio transcription. Organize work by **project**, split audio into timed chunks, label speakers in free text, and evaluate human transcripts against LLM output (WER + semantic WER).
6
+
7
+ Built for researchers and annotators who need millisecond timestamps and exportable data — without Label Studio complexity.
8
+
9
+ ## Features
10
+
11
+ - **Projects** — group recordings by class, session, or study
12
+ - **Waveform editor** — divide audio into chunks (by duration or count), overlap speakers at the same timestamp
13
+ - **Chunk playback** — play one chunk at a time with **speed up/down** (0.25×–2×, keys `,` / `.`)
14
+ - **Exports** — TXT, Markdown, JSON, CSV, SRT, WebVTT
15
+ - **LLM evaluation** — paste or upload hypothesis transcripts; strict + semantic WER
16
+ - **Pluggable formats** — timestamp/speaker lines, JSON segments, plain text (TOML-driven)
17
+
18
+ ## Quick start
19
+
20
+ ### With uv (recommended)
21
+
22
+ ```bash
23
+ git clone https://github.com/Mishkat-Quantum-Labs/transcribe-studio.git
24
+ cd transcribe-studio
25
+ uv venv
26
+ uv pip install -e ".[dev]"
27
+ uv run transcribe-studio
28
+ ```
29
+
30
+ Open **http://127.0.0.1:8082**
31
+
32
+ ### With pip
33
+
34
+ ```bash
35
+ pip install transcribe-studio
36
+ transcribe-studio
37
+ ```
38
+
39
+ ## Usage
40
+
41
+ 1. Create a **project** from the dashboard
42
+ 2. **Upload** an MP3/WAV/M4A/OGG/FLAC recording
43
+ 3. **Divide** the waveform into chunks, then transcribe each segment
44
+ 4. Use **Evaluation** to compare your transcript against an LLM upload
45
+ 5. **Export** when done
46
+
47
+ Data is stored under `~/.transcribe-studio/` (override with `TRANSCRIBE_STUDIO_DATA`).
48
+
49
+ ## Development
50
+
51
+ ```bash
52
+ uv pip install -e ".[dev]"
53
+ uv run pytest
54
+ ```
55
+
56
+ ## Publishing
57
+
58
+ ### PyPI via uv (recommended)
59
+
60
+ ```bash
61
+ uv build
62
+ uv publish # uses UV_PUBLISH_TOKEN or prompts for PyPI credentials
63
+ ```
64
+
65
+ ### PyPI via pip/twine
66
+
67
+ ```bash
68
+ pip install build twine
69
+ python -m build
70
+ twine upload dist/*
71
+ ```
72
+
73
+ ### GitHub release
74
+
75
+ ```bash
76
+ git tag v0.2.0
77
+ git push origin v0.2.0
78
+ gh release create v0.2.0 dist/*
79
+ ```
80
+
81
+ ## Configuration
82
+
83
+ Evaluation and transcript import settings ship inside the package:
84
+
85
+ - `app/config/evaluation.toml`
86
+ - `app/config/transcript_formats.toml`
87
+ - `app/config/languages/en.toml`
88
+
89
+ ## License
90
+
91
+ MIT — see [LICENSE](LICENSE).
92
+
93
+ ## Contributing
94
+
95
+ Issues and PRs welcome at [github.com/Mishkat-Quantum-Labs/transcribe-studio](https://github.com/Mishkat-Quantum-Labs/transcribe-studio).
File without changes
@@ -0,0 +1,131 @@
1
+ """Dashboard and recording-level transcription analytics."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from typing import Any
6
+
7
+
8
+ def _word_count(text: str) -> int:
9
+ return len(re.findall(r"\S+", text or ""))
10
+
11
+
12
+ def _segment_duration(seg: dict) -> int:
13
+ return max(0, seg["end_ms"] - seg["start_ms"])
14
+
15
+
16
def analyze_segments(segments: list[dict], duration_ms: int | None) -> dict[str, Any]:
    """Summarise a list of segment dicts into counts, percentages and per-speaker totals.

    Args:
        segments: Segment dicts. ``start_ms`` and ``end_ms`` are required;
            ``speaker`` and ``transcript`` are optional and may be ``None``.
        duration_ms: Length of the parent recording, used as the denominator
            for ``coverage_pct``. ``None`` or ``0`` yields a coverage of 0.

    Returns:
        A dict with segment/word counts, the summed segment duration, rounded
        percentages (``coverage_pct`` is capped at 100), the average segment
        length, and ``speakers``: one entry per speaker name, ordered by
        descending ``duration_ms``. Segments with a blank speaker are grouped
        under ``"Unlabeled"``.
    """

    def pct(part: float, whole: float) -> int:
        # Rounded percentage that reports 0 instead of dividing by zero.
        return round(part / whole * 100) if whole else 0

    total = len(segments)
    transcribed = 0
    labeled = 0
    seg_ms = 0
    words = 0
    per_speaker: dict[str, dict[str, int]] = {}

    # One pass gathers both the global totals and the per-speaker buckets.
    for seg in segments:
        text = seg.get("transcript") or ""
        speaker = (seg.get("speaker") or "").strip()
        length = _segment_duration(seg)
        word_total = _word_count(text)

        if text.strip():
            transcribed += 1
        if speaker:
            labeled += 1
        seg_ms += length
        words += word_total

        bucket = per_speaker.setdefault(
            speaker or "Unlabeled",
            {"segments": 0, "words": 0, "duration_ms": 0},
        )
        bucket["segments"] += 1
        bucket["words"] += word_total
        bucket["duration_ms"] += length

    # The sort is stable, so speakers with equal duration keep first-seen order.
    ranked = sorted(
        per_speaker.items(),
        key=lambda item: item[1]["duration_ms"],
        reverse=True,
    )
    speakers = [{"name": name, **totals} for name, totals in ranked]

    return {
        "segment_count": total,
        "transcribed_segments": transcribed,
        "speaker_labeled_segments": labeled,
        "empty_segments": total - transcribed,
        "total_words": words,
        "segmented_duration_ms": seg_ms,
        "coverage_pct": min(100, pct(seg_ms, duration_ms or 0)),
        "transcript_pct": pct(transcribed, total),
        "speaker_label_pct": pct(labeled, total),
        "avg_segment_ms": round(seg_ms / total) if total else 0,
        "speakers": speakers,
    }
56
+
57
+
58
def analyze_recording(rec: dict, segments: list[dict]) -> dict[str, Any]:
    """Combine a recording's identifying fields with its segment statistics.

    Args:
        rec: Recording dict. ``id`` and ``title`` are required;
            ``duration_ms``, ``created_at`` and ``notes`` are optional and
            may be ``None``.
        segments: The segment dicts belonging to this recording.

    Returns:
        A dict holding ``id``, ``title``, ``duration_ms``, ``created_at``,
        ``notes`` and every key produced by ``analyze_segments``.

    Raises:
        KeyError: If ``rec`` has no ``id`` or ``title``.
    """
    stats = analyze_segments(segments, rec.get("duration_ms"))
    return {
        "id": rec["id"],
        "title": rec["title"],
        "duration_ms": rec.get("duration_ms"),
        # ``or ""`` (rather than a .get default) also covers a key that is
        # present but None, which would otherwise fail on slicing.
        # Only the first ten characters are kept - presumably the YYYY-MM-DD
        # part of an ISO timestamp; confirm against the schema.
        "created_at": (rec.get("created_at") or "")[:10],
        "notes": rec.get("notes") or "",
        **stats,
    }
68
+
69
+
70
def dashboard_stats(conn) -> dict[str, Any]:
    """Compute library-wide totals plus a per-recording breakdown.

    Args:
        conn: Database connection exposing ``execute(sql).fetchall()`` and
            ``.fetchone()``. Rows are passed to ``dict()`` and indexed by
            column name, so they must be mapping-like (presumably
            ``sqlite3.Row`` - confirm against the connection factory).

    Returns:
        A dict with recording/segment counts, total and segmented duration,
        transcribed-segment and word counts, the number of distinct speaker
        names, overall ``transcript_pct`` and ``coverage_pct`` (capped at
        100), and ``recordings``: one ``analyze_recording`` result per
        recording, newest id first.
    """
    recordings = conn.execute(
        "SELECT id, title, duration_ms, created_at FROM recordings ORDER BY id DESC"
    ).fetchall()

    total_segments = conn.execute("SELECT COUNT(*) FROM segments").fetchone()[0]
    total_duration = conn.execute(
        "SELECT COALESCE(SUM(duration_ms), 0) FROM recordings"
    ).fetchone()[0]

    all_segments = conn.execute(
        "SELECT recording_id, start_ms, end_ms, speaker, transcript FROM segments"
    ).fetchall()
    seg_list = [dict(s) for s in all_segments]
    transcribed = sum(1 for s in seg_list if (s.get("transcript") or "").strip())
    words = sum(_word_count(s.get("transcript") or "") for s in seg_list)
    segmented_ms = sum(_segment_duration(s) for s in seg_list)

    # A speaker only counts when the segment carries some content; blank
    # speaker names on such segments collapse into a single "Unlabeled" entry.
    speakers = {
        (s.get("speaker") or "").strip() or "Unlabeled"
        for s in seg_list
        if (s.get("transcript") or "").strip() or (s.get("speaker") or "").strip()
    }

    # Group segments by recording once, instead of re-scanning the whole
    # segment list for every recording (which was O(recordings x segments)).
    # Row order within each group is unchanged.
    segments_by_recording: dict[Any, list[dict]] = {}
    for s in seg_list:
        segments_by_recording.setdefault(s["recording_id"], []).append(s)

    recording_stats = [
        analyze_recording(dict(rec), segments_by_recording.get(rec["id"], []))
        for rec in recordings
    ]

    overall_transcript_pct = (
        round(transcribed / total_segments * 100) if total_segments else 0
    )
    overall_coverage_pct = (
        min(100, round(segmented_ms / total_duration * 100)) if total_duration else 0
    )

    return {
        "recording_count": len(recordings),
        "segment_count": total_segments,
        "total_duration_ms": total_duration,
        "segmented_duration_ms": segmented_ms,
        "transcribed_segments": transcribed,
        "total_words": words,
        "unique_speakers": len(speakers),
        "transcript_pct": overall_transcript_pct,
        "coverage_pct": overall_coverage_pct,
        "recordings": recording_stats,
    }
118
+
119
+
120
+ def fmt_duration(ms: int | None) -> str:
121
+ if not ms:
122
+ return "—"
123
+ s = ms / 1000
124
+ h = int(s // 3600)
125
+ m = int((s % 3600) // 60)
126
+ sec = s % 60
127
+ if h:
128
+ return f"{h}h {m}m"
129
+ if m:
130
+ return f"{m}m {sec:.0f}s"
131
+ return f"{sec:.1f}s"
@@ -0,0 +1,45 @@
1
+ # Transcribe Studio - Evaluation Configuration
2
+ # https://github.com/Mishkat-Quantum-Labs/transcribe-studio
3
+
4
+ [evaluation]
5
+ version = "1.0"
6
+ default_language = "en"
7
+
8
+ # Metrics to compute
9
+ # Set enabled = false to skip a metric
10
+ # weight is used for weighted averaging in overall score
11
+
12
+ [evaluation.metrics]
13
+
14
+ [evaluation.metrics.wer]
15
+ enabled = true
16
+ weight = 1.0
17
+ case_sensitive = false
18
+ description = "Word Error Rate - standard ASR metric"
19
+
20
+ [evaluation.metrics.cer]
21
+ enabled = false
22
+ weight = 0.0
23
+ description = "Character Error Rate - useful for character-level languages"
24
+
25
+ [evaluation.metrics.semantic_score]
26
+ enabled = true
27
+ weight = 0.5
28
+ description = "Semantic equivalence score - partial credit for meaning"
29
+
30
+ # Text normalization settings
31
+ # These apply before metric calculation
32
+
33
+ [evaluation.normalization]
34
+ lowercase = true
35
+ trim_whitespace = true
36
+ remove_punctuation = false
37
+ normalize_quotes = true
38
+ remove_special_chars = false
39
+
40
+ # UI Settings
41
+
42
+ [evaluation.ui]
43
+ show_detailed_breakdown = true
44
+ highlight_errors = true
45
+ color_scheme = "auto"
@@ -0,0 +1,242 @@
1
+ # English Language Configuration
2
+ # Semantic equivalence rules for English
3
+
4
+ [language]
5
+ code = "en"
6
+ name = "English"
7
+ normalizer_class = "en"
8
+
9
+ # ============================================================
10
+ # SEMANTIC MATCHING RULES
11
+ # ============================================================
12
+ # These rules define phrases that are semantically equivalent
13
+ # even when they differ in exact wording.
14
+ #
15
+ # Each rule has:
16
+ # - variants: list of alternative phrasings
17
+ # - canonical: the "standard" form to compare against
18
+ # - weight: 0.0-1.0, confidence of equivalence
19
+ #
20
+ # Matching works bidirectionally:
21
+ # "gonna" matches "going to" and vice versa
22
+ # ============================================================
23
+
24
+ [[semantic_matchers.group]]
25
+ name = "contractions_informal"
26
+ description = "Contractions and informal speech → formal forms"
27
+ enabled = true
28
+
29
+ [[semantic_matchers.group.rule]]
30
+ variants = ["gonna", "gon na", "gunna"]
31
+ canonical = "going to"
32
+ weight = 0.95
33
+
34
+ [[semantic_matchers.group.rule]]
35
+ variants = ["wanna", "wanner"]
36
+ canonical = "want to"
37
+ weight = 0.95
38
+
39
+ [[semantic_matchers.group.rule]]
40
+ variants = ["gotta", "got a"]
41
+ canonical = "got to"
42
+ weight = 0.9
43
+
44
+ [[semantic_matchers.group.rule]]
45
+ variants = ["kinda", "kind of"]
46
+ canonical = "kind of"
47
+ weight = 0.9
48
+
49
+ [[semantic_matchers.group.rule]]
50
+ variants = ["sorta", "sort of"]
51
+ canonical = "sort of"
52
+ weight = 0.9
53
+
54
+ [[semantic_matchers.group.rule]]
55
+ variants = ["outta", "out of"]
56
+ canonical = "out of"
57
+ weight = 0.95
58
+
59
+ [[semantic_matchers.group.rule]]
60
+ variants = ["lemme", "let me"]
61
+ canonical = "let me"
62
+ weight = 0.95
63
+
64
+ [[semantic_matchers.group.rule]]
65
+ variants = ["gimme", "give me"]
66
+ canonical = "give me"
67
+ weight = 0.95
68
+
69
+ [[semantic_matchers.group.rule]]
70
+ variants = ["dunno", "dont know", "do not know", "don't know"]
71
+ canonical = "do not know"
72
+ weight = 0.9
73
+
74
+ [[semantic_matchers.group.rule]]
75
+ variants = ["coulda", "could have", "could've"]
76
+ canonical = "could have"
77
+ weight = 0.95
78
+
79
+ [[semantic_matchers.group.rule]]
80
+ variants = ["woulda", "would have", "would've"]
81
+ canonical = "would have"
82
+ weight = 0.95
83
+
84
+ [[semantic_matchers.group.rule]]
85
+ variants = ["shoulda", "should have", "should've"]
86
+ canonical = "should have"
87
+ weight = 0.95
88
+
89
+ [[semantic_matchers.group.rule]]
90
+ variants = ["lotsa", "lots of"]
91
+ canonical = "lots of"
92
+ weight = 0.95
93
+
94
+ [[semantic_matchers.group.rule]]
95
+ variants = ["cause", "cos", "cuz"]
96
+ canonical = "because"
97
+ weight = 0.85
98
+
99
+ [[semantic_matchers.group.rule]]
100
+ variants = ["nvm", "nvr", "nevermind", "never mind"]
101
+ canonical = "never mind"
102
+ weight = 0.95
103
+
104
+ [[semantic_matchers.group.rule]]
105
+ variants = ["thru", "through"]
106
+ canonical = "through"
107
+ weight = 0.98
108
+
109
+ [[semantic_matchers.group.rule]]
110
+ variants = ["u", "you"]
111
+ canonical = "you"
112
+ weight = 0.8
113
+
114
+ [[semantic_matchers.group.rule]]
115
+ variants = ["ur", "you're", "your"]
116
+ canonical = "your"
117
+ weight = 0.7
118
+
119
+ [[semantic_matchers.group.rule]]
120
+ variants = ["ok", "okay"]
121
+ canonical = "okay"
122
+ weight = 1.0
123
+
124
+ [[semantic_matchers.group.rule]]
125
+ variants = ["yeah", "yes", "yea", "yah"]
126
+ canonical = "yes"
127
+ weight = 0.95
128
+
129
+ [[semantic_matchers.group.rule]]
130
+ variants = ["nope", "no", "nah"]
131
+ canonical = "no"
132
+ weight = 0.95
133
+
134
+ [[semantic_matchers.group.rule]]
135
+ variants = ["alright", "all right", "allright"]
136
+ canonical = "all right"
137
+ weight = 1.0
138
+
139
+ [[semantic_matchers.group.rule]]
140
+ variants = ["gonna", "goin", "goin to", "going"]
141
+ canonical = "going"
142
+ weight = 0.8
143
+
144
+ [[semantic_matchers.group]]
145
+ name = "repeated_sounds"
146
+ description = "Filler sounds and hesitations (um, uh, er) - common in spontaneous speech"
147
+ enabled = true
148
+
149
+ [[semantic_matchers.group.rule]]
150
+ variants = ["um", "uh", "er", "erm"]
151
+ canonical = ""
152
+ weight = 0.5
153
+
154
+ [[semantic_matchers.group]]
155
+ name = "common_confusions"
156
+ description = "Common ASR/LLM transcription confusions"
157
+ enabled = true
158
+
159
+ [[semantic_matchers.group.rule]]
160
+ variants = ["i am", "i'm", "im"]
161
+ canonical = "i am"
162
+ weight = 1.0
163
+
164
+ [[semantic_matchers.group.rule]]
165
+ variants = ["you know", "yknow", "y'know"]
166
+ canonical = "you know"
167
+ weight = 0.9
168
+
169
+ [[semantic_matchers.group.rule]]
170
+ variants = ["like", "like like"]
171
+ canonical = "like"
172
+ weight = 0.7
173
+
174
+ [[semantic_matchers.group]]
175
+ name = "numbers"
176
+ description = "Number words and their homophones (for/four, to/two/too)"
177
+ enabled = true
178
+
179
+ [[semantic_matchers.group.rule]]
180
+ variants = ["for", "four"]
181
+ canonical = "four"
182
+ weight = 0.9
183
+
184
+ [[semantic_matchers.group.rule]]
185
+ variants = ["to", "two", "too"]
186
+ canonical = "two"
187
+ weight = 0.8
188
+
189
+ [[semantic_matchers.group]]
190
+ name = "contractions"
191
+ description = "Standard English contractions"
192
+ enabled = true
193
+
194
+ [[semantic_matchers.group.rule]]
195
+ variants = ["don't", "do not"]
196
+ canonical = "do not"
197
+ weight = 1.0
198
+
199
+ [[semantic_matchers.group.rule]]
200
+ variants = ["can't", "cannot"]
201
+ canonical = "cannot"
202
+ weight = 1.0
203
+
204
+ [[semantic_matchers.group.rule]]
205
+ variants = ["won't", "will not"]
206
+ canonical = "will not"
207
+ weight = 1.0
208
+
209
+ [[semantic_matchers.group.rule]]
210
+ variants = ["i've", "i have"]
211
+ canonical = "i have"
212
+ weight = 1.0
213
+
214
+ [[semantic_matchers.group.rule]]
215
+ variants = ["i'll", "i will"]
216
+ canonical = "i will"
217
+ weight = 1.0
218
+
219
+ [[semantic_matchers.group.rule]]
220
+ variants = ["it's", "it is", "its"]
221
+ canonical = "it is"
222
+ weight = 0.95
223
+
224
+ [[semantic_matchers.group.rule]]
225
+ variants = ["that's", "that is"]
226
+ canonical = "that is"
227
+ weight = 1.0
228
+
229
+ [[semantic_matchers.group.rule]]
230
+ variants = ["there's", "there is"]
231
+ canonical = "there is"
232
+ weight = 1.0
233
+
234
+ [[semantic_matchers.group.rule]]
235
+ variants = ["here's", "here is"]
236
+ canonical = "here is"
237
+ weight = 1.0
238
+
239
+ [[semantic_matchers.group.rule]]
240
+ variants = ["what's", "what is"]
241
+ canonical = "what is"
242
+ weight = 1.0