transcribe-studio 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. app/__init__.py +0 -0
  2. app/analytics.py +131 -0
  3. app/config/evaluation.toml +45 -0
  4. app/config/languages/en.toml +242 -0
  5. app/config/transcript_formats.toml +47 -0
  6. app/database.py +126 -0
  7. app/evaluation/__init__.py +18 -0
  8. app/evaluation/config.py +197 -0
  9. app/evaluation/engine.py +289 -0
  10. app/evaluation/matchers/__init__.py +7 -0
  11. app/evaluation/matchers/base.py +44 -0
  12. app/evaluation/matchers/semantic.py +244 -0
  13. app/evaluation/metrics/__init__.py +7 -0
  14. app/evaluation/metrics/base.py +61 -0
  15. app/evaluation/metrics/wer.py +195 -0
  16. app/evaluation/models.py +133 -0
  17. app/evaluation/normalizers/__init__.py +7 -0
  18. app/evaluation/normalizers/base.py +49 -0
  19. app/evaluation/normalizers/en.py +137 -0
  20. app/export_formats.py +178 -0
  21. app/main.py +854 -0
  22. app/navigation.py +103 -0
  23. app/paths.py +22 -0
  24. app/services/__init__.py +1 -0
  25. app/services/projects.py +182 -0
  26. app/static/css/style.css +1892 -0
  27. app/static/js/editor.js +1001 -0
  28. app/static/js/evaluation.js +361 -0
  29. app/templates/analysis.html +97 -0
  30. app/templates/base.html +31 -0
  31. app/templates/dashboard.html +77 -0
  32. app/templates/editor.html +223 -0
  33. app/templates/evaluation.html +129 -0
  34. app/templates/partials/breadcrumbs.html +15 -0
  35. app/templates/partials/recording_header.html +32 -0
  36. app/templates/partials/recording_tabs.html +10 -0
  37. app/templates/partials/sidebar.html +47 -0
  38. app/templates/partials/upload_form.html +26 -0
  39. app/templates/recordings.html +66 -0
  40. app/templates/screens/dashboard.html +80 -0
  41. app/templates/screens/project_detail.html +89 -0
  42. app/templates/screens/project_form.html +98 -0
  43. app/templates/screens/project_upload.html +47 -0
  44. app/templates/upload.html +64 -0
  45. app/transcript_formats/__init__.py +21 -0
  46. app/transcript_formats/align.py +77 -0
  47. app/transcript_formats/config.py +86 -0
  48. app/transcript_formats/models.py +42 -0
  49. app/transcript_formats/parsers/__init__.py +9 -0
  50. app/transcript_formats/parsers/base.py +49 -0
  51. app/transcript_formats/parsers/json_segments.py +81 -0
  52. app/transcript_formats/parsers/plain_text.py +21 -0
  53. app/transcript_formats/parsers/timestamp_speaker.py +80 -0
  54. app/transcript_formats/registry.py +103 -0
  55. app/web/__init__.py +1 -0
  56. app/web/context.py +87 -0
  57. app/web/deps.py +43 -0
  58. app/web/routes/__init__.py +16 -0
  59. app/web/routes/dashboard.py +34 -0
  60. app/web/routes/legacy.py +66 -0
  61. app/web/routes/projects.py +116 -0
  62. app/web/routes/recordings_pages.py +86 -0
  63. transcribe_studio-0.2.0.dist-info/METADATA +128 -0
  64. transcribe_studio-0.2.0.dist-info/RECORD +68 -0
  65. transcribe_studio-0.2.0.dist-info/WHEEL +5 -0
  66. transcribe_studio-0.2.0.dist-info/entry_points.txt +2 -0
  67. transcribe_studio-0.2.0.dist-info/licenses/LICENSE +21 -0
  68. transcribe_studio-0.2.0.dist-info/top_level.txt +1 -0
app/__init__.py ADDED
File without changes
app/analytics.py ADDED
@@ -0,0 +1,131 @@
1
+ """Dashboard and recording-level transcription analytics."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from typing import Any
6
+
7
+
8
+ def _word_count(text: str) -> int:
9
+ return len(re.findall(r"\S+", text or ""))
10
+
11
+
12
+ def _segment_duration(seg: dict) -> int:
13
+ return max(0, seg["end_ms"] - seg["start_ms"])
14
+
15
+
16
def analyze_segments(segments: list[dict], duration_ms: int | None) -> dict[str, Any]:
    """Summarise a list of segment dicts into counts, percentages and speaker totals.

    Each segment is read through its ``transcript`` and ``speaker`` keys (both
    optional) and through ``_segment_duration`` / ``_word_count``.

    Args:
        segments: Segment dicts to summarise.
        duration_ms: Length of the whole recording; ``None`` or 0 yields a
            coverage of 0.

    Returns:
        A dict of totals, whole-number percentages and a ``speakers`` list
        ordered by descending speaking time. Segments without a speaker are
        grouped under "Unlabeled".
    """

    def pct(part: int, whole: int) -> int:
        # Whole-number percentage; an empty denominator yields 0.
        return round(part / whole * 100) if whole else 0

    segment_total = len(segments)
    transcribed_total = 0
    labeled_total = 0
    covered_ms = 0
    word_total = 0
    per_speaker: dict[str, dict[str, int]] = {}

    for segment in segments:
        text = segment.get("transcript") or ""
        label = (segment.get("speaker") or "").strip()
        length_ms = _segment_duration(segment)
        n_words = _word_count(text)

        if text.strip():
            transcribed_total += 1
        if label:
            labeled_total += 1
        covered_ms += length_ms
        word_total += n_words

        bucket = per_speaker.setdefault(
            label or "Unlabeled", {"segments": 0, "words": 0, "duration_ms": 0}
        )
        bucket["segments"] += 1
        bucket["words"] += n_words
        bucket["duration_ms"] += length_ms

    # Longest-speaking first; ties keep first-appearance order (stable sort).
    ranked = sorted(per_speaker.items(), key=lambda item: -item[1]["duration_ms"])
    speakers = [{"name": name, **totals} for name, totals in ranked]

    return {
        "segment_count": segment_total,
        "transcribed_segments": transcribed_total,
        "speaker_labeled_segments": labeled_total,
        "empty_segments": segment_total - transcribed_total,
        "total_words": word_total,
        "segmented_duration_ms": covered_ms,
        # Capped at 100; the segment durations are simply summed.
        "coverage_pct": min(100, pct(covered_ms, duration_ms or 0)),
        "transcript_pct": pct(transcribed_total, segment_total),
        "speaker_label_pct": pct(labeled_total, segment_total),
        "avg_segment_ms": round(covered_ms / segment_total) if segment_total else 0,
        "speakers": speakers,
    }
56
+
57
+
58
def analyze_recording(rec: dict, segments: list[dict]) -> dict[str, Any]:
    """Combine a recording's own fields with the statistics of its segments.

    Args:
        rec: Recording dict. ``id`` and ``title`` are required;
            ``duration_ms``, ``created_at`` and ``notes`` are optional.
        segments: The segment dicts belonging to this recording.

    Returns:
        A dict with ``id``, ``title``, ``duration_ms``, ``created_at``
        (first 10 characters only) and ``notes``, followed by every key
        produced by ``analyze_segments``.

    Raises:
        KeyError: If ``rec`` has no ``id`` or ``title``.
    """
    stats = analyze_segments(segments, rec.get("duration_ms"))
    return {
        "id": rec["id"],
        "title": rec["title"],
        "duration_ms": rec.get("duration_ms"),
        # `or ""` (not a .get default) so a key that is present but None,
        # e.g. a NULL column copied from a DB row, does not crash the slice.
        # The slice keeps the first 10 characters, presumably the YYYY-MM-DD part.
        "created_at": (rec.get("created_at") or "")[:10],
        "notes": rec.get("notes") or "",
        **stats,
    }
68
+
69
+
70
def dashboard_stats(conn) -> dict[str, Any]:
    """Compute library-wide totals plus per-recording statistics.

    Args:
        conn: Database connection exposing ``execute(...)``. Rows must support
            access by column name and ``dict(row)`` (presumably a
            ``sqlite3.Connection`` whose ``row_factory`` is ``sqlite3.Row``;
            confirm against the caller).

    Returns:
        A dict of overall counters and whole-number percentages, with
        ``recordings`` holding one ``analyze_recording`` result per recording,
        newest id first.
    """
    recordings = conn.execute(
        "SELECT id, title, duration_ms, created_at FROM recordings ORDER BY id DESC"
    ).fetchall()

    total_segments = conn.execute("SELECT COUNT(*) FROM segments").fetchone()[0]
    total_duration = conn.execute(
        "SELECT COALESCE(SUM(duration_ms), 0) FROM recordings"
    ).fetchone()[0]

    all_segments = conn.execute(
        "SELECT recording_id, start_ms, end_ms, speaker, transcript FROM segments"
    ).fetchall()
    seg_list = [dict(s) for s in all_segments]
    transcribed = sum(1 for s in seg_list if (s.get("transcript") or "").strip())
    words = sum(_word_count(s.get("transcript") or "") for s in seg_list)
    segmented_ms = sum(_segment_duration(s) for s in seg_list)

    # Distinct speaker names; segments with neither a transcript nor a
    # speaker are ignored, and unnamed speakers collapse into "Unlabeled".
    speakers = {
        (s.get("speaker") or "").strip() or "Unlabeled"
        for s in seg_list
        if (s.get("transcript") or "").strip() or (s.get("speaker") or "").strip()
    }

    # Group segments by recording once, instead of rescanning the whole
    # segment list for every recording (which was O(recordings * segments)).
    # Row order within each recording is preserved.
    segs_by_recording: dict[Any, list[dict]] = {}
    for s in seg_list:
        segs_by_recording.setdefault(s["recording_id"], []).append(s)

    recording_stats = [
        analyze_recording(dict(rec), segs_by_recording.get(rec["id"], []))
        for rec in recordings
    ]

    overall_transcript_pct = (
        round(transcribed / total_segments * 100) if total_segments else 0
    )
    overall_coverage_pct = (
        min(100, round(segmented_ms / total_duration * 100)) if total_duration else 0
    )

    return {
        "recording_count": len(recordings),
        "segment_count": total_segments,
        "total_duration_ms": total_duration,
        "segmented_duration_ms": segmented_ms,
        "transcribed_segments": transcribed,
        "total_words": words,
        "unique_speakers": len(speakers),
        "transcript_pct": overall_transcript_pct,
        "coverage_pct": overall_coverage_pct,
        "recordings": recording_stats,
    }
118
+
119
+
120
+ def fmt_duration(ms: int | None) -> str:
121
+ if not ms:
122
+ return "—"
123
+ s = ms / 1000
124
+ h = int(s // 3600)
125
+ m = int((s % 3600) // 60)
126
+ sec = s % 60
127
+ if h:
128
+ return f"{h}h {m}m"
129
+ if m:
130
+ return f"{m}m {sec:.0f}s"
131
+ return f"{sec:.1f}s"
@@ -0,0 +1,45 @@
1
+ # Transcribe Studio - Evaluation Configuration
2
+ # https://github.com/Mishkat-Quantum-Labs/transcribe-studio
3
+
4
+ [evaluation]
5
+ version = "1.0"
6
+ default_language = "en"
7
+
8
+ # Metrics to compute
9
+ # Set enabled = false to skip a metric
10
+ # weight is used for weighted averaging in overall score
11
+
12
+ [evaluation.metrics]
13
+
14
+ [evaluation.metrics.wer]
15
+ enabled = true
16
+ weight = 1.0
17
+ case_sensitive = false
18
+ description = "Word Error Rate - standard ASR metric"
19
+
20
+ [evaluation.metrics.cer]
21
+ enabled = false
22
+ weight = 0.0
23
+ description = "Character Error Rate - useful for character-level languages"
24
+
25
+ [evaluation.metrics.semantic_score]
26
+ enabled = true
27
+ weight = 0.5
28
+ description = "Semantic equivalence score - partial credit for meaning"
29
+
30
+ # Text normalization settings
31
+ # These apply before metric calculation
32
+
33
+ [evaluation.normalization]
34
+ lowercase = true
35
+ trim_whitespace = true
36
+ remove_punctuation = false
37
+ normalize_quotes = true
38
+ remove_special_chars = false
39
+
40
+ # UI Settings
41
+
42
+ [evaluation.ui]
43
+ show_detailed_breakdown = true
44
+ highlight_errors = true
45
+ color_scheme = "auto"
@@ -0,0 +1,242 @@
1
+ # English Language Configuration
2
+ # Semantic equivalence rules for English
3
+
4
+ [language]
5
+ code = "en"
6
+ name = "English"
7
+ normalizer_class = "en"
8
+
9
+ # ============================================================
10
+ # SEMANTIC MATCHING RULES
11
+ # ============================================================
12
+ # These rules define phrases that are semantically equivalent
13
+ # even when they differ in exact wording.
14
+ #
15
+ # Each rule has:
16
+ # - variants: list of alternative phrasings
17
+ # - canonical: the "standard" form to compare against
18
+ # - weight: 0.0-1.0, confidence of equivalence
19
+ #
20
+ # Matching works bidirectionally:
21
+ # "gonna" matches "going to" and vice versa
22
+ # ============================================================
23
+
24
+ [[semantic_matchers.group]]
25
+ name = "contractions_informal"
26
+ description = "Contractions and informal speech → formal forms"
27
+ enabled = true
28
+
29
+ [[semantic_matchers.group.rule]]
30
+ variants = ["gonna", "gon na", "gunna", "gonna"]
31
+ canonical = "going to"
32
+ weight = 0.95
33
+
34
+ [[semantic_matchers.group.rule]]
35
+ variants = ["wanna", "wanner"]
36
+ canonical = "want to"
37
+ weight = 0.95
38
+
39
+ [[semantic_matchers.group.rule]]
40
+ variants = ["gotta", "got a"]
41
+ canonical = "got to"
42
+ weight = 0.9
43
+
44
+ [[semantic_matchers.group.rule]]
45
+ variants = ["kinda", "kind of"]
46
+ canonical = "kind of"
47
+ weight = 0.9
48
+
49
+ [[semantic_matchers.group.rule]]
50
+ variants = ["sorta", "sort of"]
51
+ canonical = "sort of"
52
+ weight = 0.9
53
+
54
+ [[semantic_matchers.group.rule]]
55
+ variants = ["outta", "out of"]
56
+ canonical = "out of"
57
+ weight = 0.95
58
+
59
+ [[semantic_matchers.group.rule]]
60
+ variants = ["lemme", "let me"]
61
+ canonical = "let me"
62
+ weight = 0.95
63
+
64
+ [[semantic_matchers.group.rule]]
65
+ variants = ["gimme", "give me"]
66
+ canonical = "give me"
67
+ weight = 0.95
68
+
69
+ [[semantic_matchers.group.rule]]
70
+ variants = ["dunno", "dont know", "do not know", "don't know"]
71
+ canonical = "do not know"
72
+ weight = 0.9
73
+
74
+ [[semantic_matchers.group.rule]]
75
+ variants = ["coulda", "could have", "could've"]
76
+ canonical = "could have"
77
+ weight = 0.95
78
+
79
+ [[semantic_matchers.group.rule]]
80
+ variants = ["woulda", "would have", "would've"]
81
+ canonical = "would have"
82
+ weight = 0.95
83
+
84
+ [[semantic_matchers.group.rule]]
85
+ variants = ["shoulda", "should have", "should've"]
86
+ canonical = "should have"
87
+ weight = 0.95
88
+
89
+ [[semantic_matchers.group.rule]]
90
+ variants = ["lotsa", "lots of"]
91
+ canonical = "lots of"
92
+ weight = 0.95
93
+
94
+ [[semantic_matchers.group.rule]]
95
+ variants = ["cause", "cos", "cuz"]
96
+ canonical = "because"
97
+ weight = 0.85
98
+
99
+ [[semantic_matchers.group.rule]]
100
+ variants = ["nvm", "nvr", "nevermind", "never mind"]
101
+ canonical = "never mind"
102
+ weight = 0.95
103
+
104
+ [[semantic_matchers.group.rule]]
105
+ variants = ["thru", "through"]
106
+ canonical = "through"
107
+ weight = 0.98
108
+
109
+ [[semantic_matchers.group.rule]]
110
+ variants = ["u", "you"]
111
+ canonical = "you"
112
+ weight = 0.8
113
+
114
+ [[semantic_matchers.group.rule]]
115
+ variants = ["ur", "you're", "your"]
116
+ canonical = "your"
117
+ weight = 0.7
118
+
119
+ [[semantic_matchers.group.rule]]
120
+ variants = ["ok", "okay", "ok"]
121
+ canonical = "okay"
122
+ weight = 1.0
123
+
124
+ [[semantic_matchers.group.rule]]
125
+ variants = ["yeah", "yes", "yea", "yah"]
126
+ canonical = "yes"
127
+ weight = 0.95
128
+
129
+ [[semantic_matchers.group.rule]]
130
+ variants = ["nope", "no", "nah"]
131
+ canonical = "no"
132
+ weight = 0.95
133
+
134
+ [[semantic_matchers.group.rule]]
135
+ variants = ["alright", "all right", "allright"]
136
+ canonical = "all right"
137
+ weight = 1.0
138
+
139
+ [[semantic_matchers.group.rule]]
140
+ variants = ["gonna", "goin", "goin to", "going"]
141
+ canonical = "going"
142
+ weight = 0.8
143
+
144
+ [[semantic_matchers.group]]
145
+ name = "repeated_sounds"
146
+ description = "Stuttered/repeated sounds - common in spontaneous speech"
147
+ enabled = true
148
+
149
+ [[semantic_matchers.group.rule]]
150
+ variants = ["um", "uh", "er", "erm"]
151
+ canonical = ""
152
+ weight = 0.5
153
+
154
+ [[semantic_matchers.group]]
155
+ name = "common_confusions"
156
+ description = "Common ASR/LLM transcription confusions"
157
+ enabled = true
158
+
159
+ [[semantic_matchers.group.rule]]
160
+ variants = ["i am", "i'm", "im"]
161
+ canonical = "i am"
162
+ weight = 1.0
163
+
164
+ [[semantic_matchers.group.rule]]
165
+ variants = ["you know", "yknow", "y'know"]
166
+ canonical = "you know"
167
+ weight = 0.9
168
+
169
+ [[semantic_matchers.group.rule]]
170
+ variants = ["like", "like like"]
171
+ canonical = "like"
172
+ weight = 0.7
173
+
174
+ [[semantic_matchers.group]]
175
+ name = "numbers"
176
+ description = "Number word ↔ digit equivalence"
177
+ enabled = true
178
+
179
+ [[semantic_matchers.group.rule]]
180
+ variants = ["for", "four"]
181
+ canonical = "four"
182
+ weight = 0.9
183
+
184
+ [[semantic_matchers.group.rule]]
185
+ variants = ["to", "two", "too"]
186
+ canonical = "two"
187
+ weight = 0.8
188
+
189
+ [[semantic_matchers.group]]
190
+ name = "contractions"
191
+ description = "Standard English contractions"
192
+ enabled = true
193
+
194
+ [[semantic_matchers.group.rule]]
195
+ variants = ["don't", "do not"]
196
+ canonical = "do not"
197
+ weight = 1.0
198
+
199
+ [[semantic_matchers.group.rule]]
200
+ variants = ["can't", "cannot"]
201
+ canonical = "cannot"
202
+ weight = 1.0
203
+
204
+ [[semantic_matchers.group.rule]]
205
+ variants = ["won't", "will not"]
206
+ canonical = "will not"
207
+ weight = 1.0
208
+
209
+ [[semantic_matchers.group.rule]]
210
+ variants = ["i've", "i have"]
211
+ canonical = "i have"
212
+ weight = 1.0
213
+
214
+ [[semantic_matchers.group.rule]]
215
+ variants = ["i'll", "i will"]
216
+ canonical = "i will"
217
+ weight = 1.0
218
+
219
+ [[semantic_matchers.group.rule]]
220
+ variants = ["it's", "it is", "its"]
221
+ canonical = "it is"
222
+ weight = 0.95
223
+
224
+ [[semantic_matchers.group.rule]]
225
+ variants = ["that's", "that is"]
226
+ canonical = "that is"
227
+ weight = 1.0
228
+
229
+ [[semantic_matchers.group.rule]]
230
+ variants = ["there's", "there is"]
231
+ canonical = "there is"
232
+ weight = 1.0
233
+
234
+ [[semantic_matchers.group.rule]]
235
+ variants = ["here's", "here is"]
236
+ canonical = "here is"
237
+ weight = 1.0
238
+
239
+ [[semantic_matchers.group.rule]]
240
+ variants = ["what's", "what is"]
241
+ canonical = "what is"
242
+ weight = 1.0
@@ -0,0 +1,47 @@
1
+ # Transcribe Studio — transcript import formats
2
+ # Contributors: add a parser module + entry here
3
+
4
+ [transcript_formats]
5
+ # First matching parser wins (auto-detect)
6
+ detection_order = ["timestamp_speaker", "json_segments", "plain_text"]
7
+
8
+ # File upload only — paste accepts any text (format auto-detected from content)
9
+ [transcript_formats.upload]
10
+ accepted_extensions = [".json", ".txt", ".transcript"]
11
+ max_bytes = 5242880
12
+
13
+ [transcript_formats.alignment]
14
+ # When aligning [MM:SS] lines to audio chunks, should the speaker name be added as a prefix?
15
+ include_speaker = false
16
+
17
+ [transcript_formats.formats.timestamp_speaker]
18
+ name = "Timestamp + Speaker"
19
+ description = "Lines like [MM:SS] Speaker: text — common in LLM / classroom exports"
20
+ enabled = true
21
+ extensions = [".txt", ".transcript"]
22
+ example = "[00:07] Teacher: At the beginning of the year..."
23
+
24
+ [transcript_formats.formats.json_segments]
25
+ name = "JSON segments"
26
+ description = "Structured JSON with segment id or start_ms"
27
+ enabled = true
28
+ extensions = [".json"]
29
+ example = '{"segments": [{"id": 1, "text": "Hello"}]}'
30
+
31
+ [transcript_formats.formats.plain_text]
32
+ name = "Plain text"
33
+ description = "Single block of text (applied to every chunk — quick test only)"
34
+ enabled = true
35
+ extensions = [".txt"]
36
+ example = "Full transcript as one paragraph..."
37
+
38
+ # Future formats (enable when parser exists):
39
+ # [transcript_formats.formats.srt]
40
+ # name = "SubRip subtitles"
41
+ # enabled = false
42
+ # extensions = [".srt"]
43
+
44
+ # [transcript_formats.formats.vtt]
45
+ # name = "WebVTT"
46
+ # enabled = false
47
+ # extensions = [".vtt"]
app/database.py ADDED
@@ -0,0 +1,126 @@
1
+ import sqlite3
2
+
3
+ from app.paths import data_dir
4
+
5
+ DATA_DIR = data_dir()
6
+ DB_PATH = DATA_DIR / "transcribe.db"
7
+
8
+
9
def get_conn() -> sqlite3.Connection:
    """Open and return a new connection to the application's SQLite database.

    The data directory is created on demand. The returned connection yields
    ``sqlite3.Row`` rows, has foreign-key enforcement switched on, and is
    opened with ``check_same_thread=False``. The caller is responsible for
    closing it.
    """
    # The database file lives inside DATA_DIR, so the directory must exist
    # before sqlite3 tries to open or create the file.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(DB_PATH, check_same_thread=False)
    connection.row_factory = sqlite3.Row
    # SQLite only enforces REFERENCES clauses when asked to, per connection.
    connection.execute("PRAGMA foreign_keys = ON")
    return connection
15
+
16
+
17
def init_db() -> None:
    """Create the ``projects``, ``recordings`` and ``segments`` tables if missing.

    Safe to call repeatedly: every statement is ``CREATE TABLE IF NOT EXISTS``.
    The connection is closed even when schema creation fails.
    """
    conn = get_conn()
    try:
        conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS projects (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                description TEXT DEFAULT '',
                created_at TEXT DEFAULT (datetime('now'))
            );

            CREATE TABLE IF NOT EXISTS recordings (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                project_id INTEGER REFERENCES projects(id) ON DELETE SET NULL,
                title TEXT NOT NULL,
                filename TEXT NOT NULL,
                duration_ms INTEGER,
                notes TEXT DEFAULT '',
                llm_transcript_file TEXT DEFAULT '',
                llm_transcript_lang TEXT DEFAULT 'en',
                llm_transcript_format TEXT DEFAULT '',
                created_at TEXT DEFAULT (datetime('now'))
            );

            CREATE TABLE IF NOT EXISTS segments (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                recording_id INTEGER NOT NULL REFERENCES recordings(id) ON DELETE CASCADE,
                start_ms INTEGER NOT NULL,
                end_ms INTEGER NOT NULL,
                speaker TEXT NOT NULL DEFAULT '',
                transcript TEXT NOT NULL DEFAULT '',
                llm_transcript TEXT NOT NULL DEFAULT '',
                sort_order INTEGER NOT NULL DEFAULT 0,
                created_at TEXT DEFAULT (datetime('now')),
                updated_at TEXT DEFAULT (datetime('now'))
            );
            """
        )
        conn.commit()
    finally:
        # Release the connection on the failure path as well.
        conn.close()
57
+
58
+
59
def migrate_add_llm_transcript() -> None:
    """Add the ``llm_transcript`` column to ``segments`` when it is missing.

    Best-effort: if SQLite rejects the statement the migration is skipped
    silently. The connection is always closed.
    """
    add_column_sql = (
        "ALTER TABLE segments ADD COLUMN llm_transcript TEXT NOT NULL DEFAULT ''"
    )
    db = get_conn()
    try:
        db.execute(add_column_sql)
        db.commit()
    except sqlite3.OperationalError:
        # Expected when the column already exists.
        # NOTE(review): any other OperationalError is hidden here too.
        pass
    finally:
        db.close()
69
+
70
+
71
def migrate_add_recording_llm_fields() -> None:
    """Add the LLM transcript columns to ``recordings`` when they are missing.

    Each column is added and committed independently, so one column that
    already exists does not prevent the others from being added. The
    connection is closed even if an unexpected error interrupts the loop.
    """
    conn = get_conn()
    try:
        for sql in (
            "ALTER TABLE recordings ADD COLUMN llm_transcript_file TEXT DEFAULT ''",
            "ALTER TABLE recordings ADD COLUMN llm_transcript_lang TEXT DEFAULT 'en'",
            "ALTER TABLE recordings ADD COLUMN llm_transcript_format TEXT DEFAULT ''",
        ):
            try:
                conn.execute(sql)
                conn.commit()
            except sqlite3.OperationalError:
                # Best-effort: expected when the column already exists.
                # NOTE(review): any other OperationalError is hidden here too.
                pass
    finally:
        conn.close()
85
+
86
+
87
def migrate_add_projects() -> None:
    """Ensure the projects table exists and every recording belongs to a project.

    Steps:
      1. Create ``projects`` if it is missing.
      2. Add ``recordings.project_id`` if it is missing (best-effort).
      3. Find or create the project named "Default project".
      4. Assign every recording without a project to that default project.

    The connection is closed even when one of the steps fails.
    """
    conn = get_conn()
    try:
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS projects (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                description TEXT DEFAULT '',
                created_at TEXT DEFAULT (datetime('now'))
            )
            """
        )
        conn.commit()
        try:
            # Same foreign-key action as the column declared in init_db, so
            # migrated and freshly created databases behave alike when a
            # project is deleted.
            conn.execute(
                "ALTER TABLE recordings ADD COLUMN project_id INTEGER "
                "REFERENCES projects(id) ON DELETE SET NULL"
            )
            conn.commit()
        except sqlite3.OperationalError:
            # Best-effort: expected when the column already exists.
            # NOTE(review): any other OperationalError is hidden here too.
            pass

        default = conn.execute(
            "SELECT id FROM projects WHERE name = 'Default project' LIMIT 1"
        ).fetchone()
        if not default:
            cur = conn.execute(
                "INSERT INTO projects (name, description) VALUES (?, ?)",
                ("Default project", "Imported and new recordings"),
            )
            default_id = cur.lastrowid
        else:
            default_id = default["id"]

        # Adopt recordings that have no project yet.
        conn.execute(
            "UPDATE recordings SET project_id = ? WHERE project_id IS NULL",
            (default_id,),
        )
        conn.commit()
    finally:
        conn.close()
@@ -0,0 +1,18 @@
1
+ """
2
+ Transcribe Studio Evaluation Module.
3
+
4
+ Provides WER, semantic matching, and extensible evaluation metrics
5
+ for comparing human transcriptions against LLM-generated transcripts.
6
+ """
7
+ from app.evaluation.config import EvaluationConfig, get_config, reload_config
8
+ from app.evaluation.engine import EvaluationEngine
9
+ from app.evaluation.models import EvaluationResult, SegmentResult
10
+
11
+ __all__ = [
12
+ "EvaluationEngine",
13
+ "EvaluationConfig",
14
+ "EvaluationResult",
15
+ "SegmentResult",
16
+ "get_config",
17
+ "reload_config",
18
+ ]