karaoke-gen 0.90.1__py3-none-any.whl → 0.96.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- backend/.coveragerc +20 -0
- backend/.gitignore +37 -0
- backend/Dockerfile +43 -0
- backend/Dockerfile.base +74 -0
- backend/README.md +242 -0
- backend/__init__.py +0 -0
- backend/api/__init__.py +0 -0
- backend/api/dependencies.py +457 -0
- backend/api/routes/__init__.py +0 -0
- backend/api/routes/admin.py +742 -0
- backend/api/routes/audio_search.py +903 -0
- backend/api/routes/auth.py +348 -0
- backend/api/routes/file_upload.py +2076 -0
- backend/api/routes/health.py +344 -0
- backend/api/routes/internal.py +435 -0
- backend/api/routes/jobs.py +1610 -0
- backend/api/routes/review.py +652 -0
- backend/api/routes/themes.py +162 -0
- backend/api/routes/users.py +1014 -0
- backend/config.py +172 -0
- backend/main.py +133 -0
- backend/middleware/__init__.py +5 -0
- backend/middleware/audit_logging.py +124 -0
- backend/models/__init__.py +0 -0
- backend/models/job.py +519 -0
- backend/models/requests.py +123 -0
- backend/models/theme.py +153 -0
- backend/models/user.py +254 -0
- backend/models/worker_log.py +164 -0
- backend/pyproject.toml +29 -0
- backend/quick-check.sh +93 -0
- backend/requirements.txt +29 -0
- backend/run_tests.sh +60 -0
- backend/services/__init__.py +0 -0
- backend/services/audio_analysis_service.py +243 -0
- backend/services/audio_editing_service.py +278 -0
- backend/services/audio_search_service.py +702 -0
- backend/services/auth_service.py +630 -0
- backend/services/credential_manager.py +792 -0
- backend/services/discord_service.py +172 -0
- backend/services/dropbox_service.py +301 -0
- backend/services/email_service.py +1093 -0
- backend/services/encoding_interface.py +454 -0
- backend/services/encoding_service.py +405 -0
- backend/services/firestore_service.py +512 -0
- backend/services/flacfetch_client.py +573 -0
- backend/services/gce_encoding/README.md +72 -0
- backend/services/gce_encoding/__init__.py +22 -0
- backend/services/gce_encoding/main.py +589 -0
- backend/services/gce_encoding/requirements.txt +16 -0
- backend/services/gdrive_service.py +356 -0
- backend/services/job_logging.py +258 -0
- backend/services/job_manager.py +842 -0
- backend/services/job_notification_service.py +271 -0
- backend/services/local_encoding_service.py +590 -0
- backend/services/local_preview_encoding_service.py +407 -0
- backend/services/lyrics_cache_service.py +216 -0
- backend/services/metrics.py +413 -0
- backend/services/packaging_service.py +287 -0
- backend/services/rclone_service.py +106 -0
- backend/services/storage_service.py +209 -0
- backend/services/stripe_service.py +275 -0
- backend/services/structured_logging.py +254 -0
- backend/services/template_service.py +330 -0
- backend/services/theme_service.py +469 -0
- backend/services/tracing.py +543 -0
- backend/services/user_service.py +721 -0
- backend/services/worker_service.py +558 -0
- backend/services/youtube_service.py +112 -0
- backend/services/youtube_upload_service.py +445 -0
- backend/tests/__init__.py +4 -0
- backend/tests/conftest.py +224 -0
- backend/tests/emulator/__init__.py +7 -0
- backend/tests/emulator/conftest.py +88 -0
- backend/tests/emulator/test_e2e_cli_backend.py +1053 -0
- backend/tests/emulator/test_emulator_integration.py +356 -0
- backend/tests/emulator/test_style_loading_direct.py +436 -0
- backend/tests/emulator/test_worker_logs_direct.py +229 -0
- backend/tests/emulator/test_worker_logs_subcollection.py +443 -0
- backend/tests/requirements-test.txt +10 -0
- backend/tests/requirements.txt +6 -0
- backend/tests/test_admin_email_endpoints.py +411 -0
- backend/tests/test_api_integration.py +460 -0
- backend/tests/test_api_routes.py +93 -0
- backend/tests/test_audio_analysis_service.py +294 -0
- backend/tests/test_audio_editing_service.py +386 -0
- backend/tests/test_audio_search.py +1398 -0
- backend/tests/test_audio_services.py +378 -0
- backend/tests/test_auth_firestore.py +231 -0
- backend/tests/test_config_extended.py +68 -0
- backend/tests/test_credential_manager.py +377 -0
- backend/tests/test_dependencies.py +54 -0
- backend/tests/test_discord_service.py +244 -0
- backend/tests/test_distribution_services.py +820 -0
- backend/tests/test_dropbox_service.py +472 -0
- backend/tests/test_email_service.py +492 -0
- backend/tests/test_emulator_integration.py +322 -0
- backend/tests/test_encoding_interface.py +412 -0
- backend/tests/test_file_upload.py +1739 -0
- backend/tests/test_flacfetch_client.py +632 -0
- backend/tests/test_gdrive_service.py +524 -0
- backend/tests/test_instrumental_api.py +431 -0
- backend/tests/test_internal_api.py +343 -0
- backend/tests/test_job_creation_regression.py +583 -0
- backend/tests/test_job_manager.py +339 -0
- backend/tests/test_job_manager_notifications.py +329 -0
- backend/tests/test_job_notification_service.py +443 -0
- backend/tests/test_jobs_api.py +273 -0
- backend/tests/test_local_encoding_service.py +423 -0
- backend/tests/test_local_preview_encoding_service.py +567 -0
- backend/tests/test_main.py +87 -0
- backend/tests/test_models.py +918 -0
- backend/tests/test_packaging_service.py +382 -0
- backend/tests/test_requests.py +201 -0
- backend/tests/test_routes_jobs.py +282 -0
- backend/tests/test_routes_review.py +337 -0
- backend/tests/test_services.py +556 -0
- backend/tests/test_services_extended.py +112 -0
- backend/tests/test_storage_service.py +448 -0
- backend/tests/test_style_upload.py +261 -0
- backend/tests/test_template_service.py +295 -0
- backend/tests/test_theme_service.py +516 -0
- backend/tests/test_unicode_sanitization.py +522 -0
- backend/tests/test_upload_api.py +256 -0
- backend/tests/test_validate.py +156 -0
- backend/tests/test_video_worker_orchestrator.py +847 -0
- backend/tests/test_worker_log_subcollection.py +509 -0
- backend/tests/test_worker_logging.py +365 -0
- backend/tests/test_workers.py +1116 -0
- backend/tests/test_workers_extended.py +178 -0
- backend/tests/test_youtube_service.py +247 -0
- backend/tests/test_youtube_upload_service.py +568 -0
- backend/validate.py +173 -0
- backend/version.py +27 -0
- backend/workers/README.md +597 -0
- backend/workers/__init__.py +11 -0
- backend/workers/audio_worker.py +618 -0
- backend/workers/lyrics_worker.py +683 -0
- backend/workers/render_video_worker.py +483 -0
- backend/workers/screens_worker.py +525 -0
- backend/workers/style_helper.py +198 -0
- backend/workers/video_worker.py +1277 -0
- backend/workers/video_worker_orchestrator.py +701 -0
- backend/workers/worker_logging.py +278 -0
- karaoke_gen/instrumental_review/static/index.html +7 -4
- karaoke_gen/karaoke_finalise/karaoke_finalise.py +6 -1
- karaoke_gen/utils/__init__.py +163 -8
- karaoke_gen/video_background_processor.py +9 -4
- {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.96.0.dist-info}/METADATA +1 -1
- {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.96.0.dist-info}/RECORD +186 -41
- lyrics_transcriber/correction/agentic/providers/config.py +9 -5
- lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +1 -51
- lyrics_transcriber/correction/corrector.py +192 -130
- lyrics_transcriber/correction/operations.py +24 -9
- lyrics_transcriber/frontend/package-lock.json +2 -2
- lyrics_transcriber/frontend/package.json +1 -1
- lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +1 -1
- lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +11 -7
- lyrics_transcriber/frontend/src/components/EditActionBar.tsx +31 -5
- lyrics_transcriber/frontend/src/components/EditModal.tsx +28 -10
- lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +123 -27
- lyrics_transcriber/frontend/src/components/EditWordList.tsx +112 -60
- lyrics_transcriber/frontend/src/components/Header.tsx +90 -76
- lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +53 -31
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +44 -13
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +66 -50
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +124 -30
- lyrics_transcriber/frontend/src/components/ReferenceView.tsx +1 -1
- lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +12 -5
- lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +3 -3
- lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +1 -1
- lyrics_transcriber/frontend/src/components/WordDivider.tsx +11 -7
- lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +4 -2
- lyrics_transcriber/frontend/src/hooks/useManualSync.ts +103 -1
- lyrics_transcriber/frontend/src/theme.ts +42 -15
- lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
- lyrics_transcriber/frontend/vite.config.js +5 -0
- lyrics_transcriber/frontend/web_assets/assets/{index-BECn1o8Q.js → index-BSMgOq4Z.js} +6959 -5782
- lyrics_transcriber/frontend/web_assets/assets/index-BSMgOq4Z.js.map +1 -0
- lyrics_transcriber/frontend/web_assets/index.html +6 -2
- lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.svg +5 -0
- lyrics_transcriber/output/generator.py +17 -3
- lyrics_transcriber/output/video.py +60 -95
- lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js.map +0 -1
- {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.96.0.dist-info}/WHEEL +0 -0
- {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.96.0.dist-info}/entry_points.txt +0 -0
- {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.96.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,522 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Comprehensive tests for Unicode and special character handling.
|
|
3
|
+
|
|
4
|
+
This test file verifies that artist/title with Unicode characters (curly quotes,
|
|
5
|
+
em dashes, non-ASCII characters) are properly sanitized throughout the codebase
|
|
6
|
+
to prevent:
|
|
7
|
+
1. HTTP header encoding failures (Content-Disposition, email subjects)
|
|
8
|
+
2. API query failures (Google Drive)
|
|
9
|
+
3. Filename encoding issues (Modal API, filesystem)
|
|
10
|
+
|
|
11
|
+
The root cause was job d49efab1 which had title "Mama Says (You Can’t Back Down)"
|
|
12
|
+
with a curly apostrophe (U+2019) that caused the audio separation to fail because
|
|
13
|
+
the Modal API couldn't handle the non-latin-1 character in HTTP headers.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import pytest
|
|
17
|
+
from karaoke_gen.utils import (
|
|
18
|
+
sanitize_filename,
|
|
19
|
+
normalize_text,
|
|
20
|
+
UNICODE_REPLACEMENTS,
|
|
21
|
+
TEXT_NORMALIZATIONS,
|
|
22
|
+
APOSTROPHE_REPLACEMENTS,
|
|
23
|
+
DOUBLE_QUOTE_REPLACEMENTS,
|
|
24
|
+
DASH_REPLACEMENTS,
|
|
25
|
+
WHITESPACE_REPLACEMENTS,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TestSanitizeFilename:
    """Test that sanitize_filename handles Unicode punctuation and filesystem edge cases.

    Problematic characters are written as explicit ``\\uXXXX`` escapes so that
    an editor or extraction step cannot silently replace them with their ASCII
    look-alikes, which would make the assertions vacuous.
    """

    def test_curly_single_quotes(self):
        """Test that curly single quotes are converted to straight quotes."""
        # U+2019 is the exact character that caused job d49efab1 to fail
        assert sanitize_filename("Can\u2019t") == "Can't"
        assert sanitize_filename("It\u2019s") == "It's"
        # Left single quote (U+2018) paired with a right single quote (U+2019)
        assert sanitize_filename("\u2018Hello\u2019") == "'Hello'"

    def test_curly_double_quotes(self):
        """Test that curly double quotes are converted to underscores.

        Note: Double quotes are filesystem-unsafe so they become underscores.
        """
        # Curly quotes first become straight, then straight quotes become underscore
        assert sanitize_filename("\u201cHello\u201d") == "_Hello_"

    def test_em_dash(self):
        """Test that em dashes (U+2014) are converted to regular hyphens."""
        assert sanitize_filename("Artist \u2014 Title") == "Artist - Title"
        assert sanitize_filename("Song\u2014Name") == "Song-Name"

    def test_en_dash(self):
        """Test that en dashes (U+2013) are converted to regular hyphens."""
        assert sanitize_filename("1990\u20132000") == "1990-2000"

    def test_ellipsis(self):
        """Test that horizontal ellipsis is converted to three dots."""
        # Note: The ellipsis becomes "..." but trailing dots are stripped
        assert sanitize_filename("Wait\u2026") == "Wait"
        # But in the middle of a string they're preserved
        assert sanitize_filename("Wait\u2026Here") == "Wait...Here"

    def test_non_breaking_space(self):
        """Test that non-breaking spaces are converted to regular spaces."""
        # U+00A0 is non-breaking space
        assert sanitize_filename("Hello\u00A0World") == "Hello World"

    def test_filesystem_unsafe_characters(self):
        """Test that each filesystem-unsafe character is replaced by an underscore."""
        for unsafe in '/\\:*?"<>|':
            assert sanitize_filename(f"file{unsafe}name") == "file_name"

    def test_trailing_periods_and_spaces(self):
        """Test that trailing periods and spaces are removed."""
        assert sanitize_filename("filename.") == "filename"
        assert sanitize_filename("filename...") == "filename"
        assert sanitize_filename("filename ") == "filename"
        assert sanitize_filename("filename . ") == "filename"

    def test_leading_periods_and_spaces(self):
        """Test that leading periods and spaces are removed."""
        assert sanitize_filename(".filename") == "filename"
        assert sanitize_filename("...filename") == "filename"
        assert sanitize_filename(" filename") == "filename"
        assert sanitize_filename(" . filename") == "filename"

    def test_multiple_underscores_collapsed(self):
        """Test that multiple consecutive underscores are collapsed to one."""
        assert sanitize_filename("file___name") == "file_name"
        # Multiple unsafe chars in a row
        assert sanitize_filename("file?*:name") == "file_name"

    def test_multiple_spaces_collapsed(self):
        """Test that multiple consecutive spaces are collapsed to one."""
        # The input must really contain a run of spaces for this to test anything
        assert sanitize_filename("file   name") == "file name"

    def test_none_input(self):
        """Test that None input returns None."""
        assert sanitize_filename(None) is None

    def test_empty_string(self):
        """Test that empty string returns empty string."""
        assert sanitize_filename("") == ""

    def test_real_world_examples(self):
        """Test real-world examples that have caused issues."""
        # The exact title from job d49efab1 (curly apostrophe, U+2019)
        assert (
            sanitize_filename("Mama Says (You Can\u2019t Back Down)")
            == "Mama Says (You Can't Back Down)"
        )

        # Broadway cast with smart quotes
        assert (
            sanitize_filename("Footloose (Broadway Cast) \u2014 \u201cFinal Song\u201d")
            == "Footloose (Broadway Cast) - _Final Song_"
        )

        # Japanese artist with em dash
        assert sanitize_filename("宇多田ヒカル \u2014 First Love") == "宇多田ヒカル - First Love"

        # Korean title (should pass through unchanged)
        assert sanitize_filename("아이유 - 좋은 날") == "아이유 - 좋은 날"

        # Mixed content: curly apostrophes next to tildes that must be kept
        assert (
            sanitize_filename("L\u2019Arc~en~Ciel - Driver\u2019s High")
            == "L'Arc~en~Ciel - Driver's High"
        )

    def test_combination_of_issues(self):
        """Test strings with multiple problematic characters."""
        result = sanitize_filename("It\u2019s \u201cMy\u201d Song \u2014 Volume\u20261")
        # Curly ' -> ', curly " -> _ (via filesystem check), em dash -> -, ellipsis -> ...
        assert result == "It's _My_ Song - Volume...1"

    def test_unicode_replacements_dict_complete(self):
        """Verify all expected Unicode characters are in the replacements dict."""
        expected_chars = (
            # Curly quotes
            "\u2018",  # LEFT SINGLE QUOTATION MARK
            "\u2019",  # RIGHT SINGLE QUOTATION MARK
            "\u201A",  # SINGLE LOW-9 QUOTATION MARK
            "\u201B",  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
            "\u201C",  # LEFT DOUBLE QUOTATION MARK
            "\u201D",  # RIGHT DOUBLE QUOTATION MARK
            "\u201E",  # DOUBLE LOW-9 QUOTATION MARK
            "\u201F",  # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
            # Dashes
            "\u2013",  # EN DASH
            "\u2014",  # EM DASH
            # Other
            "\u2026",  # HORIZONTAL ELLIPSIS
            "\u00A0",  # NON-BREAKING SPACE
        )
        for char in expected_chars:
            assert char in UNICODE_REPLACEMENTS
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class TestSanitizationInContext:
    """Test that sanitization is applied correctly in various contexts.

    Each test rebuilds, inline, the kind of string a consumer of artist/title
    data would produce (HTTP header value, email subject, Drive query,
    Dropbox path) and checks the sanitized result is safe for that use.
    """

    @staticmethod
    def _is_latin1_encodable(text):
        """Return True when *text* can be encoded as latin-1, else False."""
        try:
            text.encode("latin-1")
        except UnicodeEncodeError:
            return False
        return True

    def test_artist_title_format_for_audio_separation(self):
        """Verify artist-title formatting is sanitized for Modal API."""
        # Simulate what audio_worker.py does
        artist = "Footloose (Broadway Cast)"
        title = "Mama Says (You Can\u2019t Back Down)"  # Curly apostrophe U+2019

        safe_artist = sanitize_filename(artist) if artist else "Unknown"
        safe_title = sanitize_filename(title) if title else "Unknown"
        artist_title = f"{safe_artist} - {safe_title}"

        # The curly apostrophe should be converted to straight
        assert "'" in artist_title  # Straight apostrophe
        assert "\u2019" not in artist_title  # No curly apostrophe (U+2019)

    def test_email_subject_safe_for_latin1(self):
        """Verify email subjects can be encoded to latin-1 after sanitization."""
        artist = "Café del Mar"  # Has accented e
        title = "It\u2019s a \u201cBeautiful\u201d Day \u2014 Remix"

        safe_artist = sanitize_filename(artist) if artist else None
        safe_title = sanitize_filename(title) if title else None
        subject = f"{safe_artist} - {safe_title}"

        # Accented characters should pass through (they're valid latin-1)
        assert "é" in subject
        # But curly quotes and em dash should be converted
        assert "'" in subject  # Straight apostrophe
        assert "-" in subject  # Regular hyphen
        # The " - " separator always contributes a hyphen, so also check that
        # none of the characters outside latin-1 survived.
        for char in ("\u2019", "\u201c", "\u201d", "\u2014"):
            assert char not in subject
        # The property this test is named for
        assert self._is_latin1_encodable(subject)

    def test_content_disposition_safe(self):
        """Verify filenames are safe for HTTP Content-Disposition headers."""
        artist = "Artist\u2019s \u201cName\u201d"
        title = "Song\u2014Title"

        safe_artist = sanitize_filename(artist) if artist else None
        safe_title = sanitize_filename(title) if title else None
        filename = f"{safe_artist} - {safe_title} (Final Karaoke).mp4"

        # Should be able to encode to latin-1 for HTTP headers; smart quotes
        # and em dashes would not encode if they were left in place.
        assert self._is_latin1_encodable(filename)

    def test_google_drive_query_safe(self):
        """Verify filenames don't break Google Drive API queries."""
        base_name = "Artist\u2019s \u201cSong\u201d"

        safe_base_name = sanitize_filename(base_name) if base_name else base_name
        filename = f"NOMAD-1234 - {safe_base_name}.mp4"

        # Google Drive queries use single quotes - our sanitized string
        # should have straight single quotes, not curly.
        # The query escaping handles straight quotes: '
        escaped = filename.replace("'", "\\'")
        query = f"name='{escaped}'"

        # Should be a valid query string (no curly quotes to break syntax)
        assert "\u2019" not in query  # No curly single quote (U+2019)
        assert "\u2018" not in query  # No curly single quote (U+2018)

    def test_dropbox_path_safe(self):
        """Verify Dropbox paths don't have problematic characters."""
        artist = "Don\u2019t Stop"
        title = "Believin\u2019"

        safe_artist = sanitize_filename(artist) if artist else "Unknown"
        safe_title = sanitize_filename(title) if title else "Unknown"
        folder_name = f"NOMAD-1234 - {safe_artist} - {safe_title}"
        remote_path = f"/Karaoke/{folder_name}"

        # Path should not have curly quotes (U+2018 and U+2019)
        assert "\u2018" not in remote_path
        assert "\u2019" not in remote_path
        # But should have the converted straight apostrophe
        assert "'" in remote_path
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class TestInternationalCharacters:
    """Check that legitimate non-ASCII text survives sanitize_filename untouched.

    Sanitization is meant to swap out problematic punctuation only; it must
    not strip text written in other scripts or with accented letters.
    """

    def test_japanese_preserved(self):
        """Japanese text comes back exactly as given."""
        for text in ("宇多田ヒカル", "君が代"):
            assert sanitize_filename(text) == text

    def test_korean_preserved(self):
        """Korean text comes back exactly as given."""
        for text in ("방탄소년단", "좋은 날"):
            assert sanitize_filename(text) == text

    def test_chinese_preserved(self):
        """Chinese text comes back exactly as given."""
        for text in ("周杰倫", "青花瓷"):
            assert sanitize_filename(text) == text

    def test_cyrillic_preserved(self):
        """Cyrillic text comes back exactly as given."""
        for text in ("Тату", "Не верь, не бойся"):
            assert sanitize_filename(text) == text

    def test_arabic_preserved(self):
        """Arabic text comes back exactly as given."""
        arabic = "فيروز"
        assert sanitize_filename(arabic) == arabic

    def test_accented_latin_preserved(self):
        """Accented Latin letters come back exactly as given."""
        for text in ("Café", "Señorita", "Naïve", "Björk"):
            assert sanitize_filename(text) == text

    def test_mixed_script_preserved(self):
        """Strings mixing scripts come back exactly as given."""
        mixed_samples = (
            "宇多田ヒカル - First Love",  # Japanese artist, English title
            "BTS 방탄소년단 - Dynamite",  # K-pop with English
        )
        for text in mixed_samples:
            assert sanitize_filename(text) == text
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
class TestEdgeCasesAndRegression:
    """Regression and boundary checks for sanitize_filename."""

    def test_job_d49efab1_exact_title(self):
        """Reproduce the artist/title pair from the job d49efab1 failure.

        The artist is "Footloose (Broadway Cast)" (from a display_artist
        override) and the title contains a curly apostrophe (U+2019). The
        sanitized "artist - title" string must use a straight apostrophe and
        be encodable as latin-1, which is what HTTP headers require.
        """
        safe_artist = sanitize_filename("Footloose (Broadway Cast)")
        safe_title = sanitize_filename("Mama Says (You Can\u2019t Back Down)")
        combined = f"{safe_artist} - {safe_title}"

        # Straight apostrophe in, curly apostrophe out
        assert "Can't" in combined
        assert "\u2019" not in combined

        # HTTP-header safety: the string must encode as latin-1
        try:
            combined.encode('latin-1')
        except UnicodeEncodeError:
            encodable = False
        else:
            encodable = True
        assert encodable

    def test_double_sanitization_idempotent(self):
        """A second sanitization pass must not change the first pass's output."""
        first_pass = sanitize_filename("It\u2019s \u201cMy\u201d Song \u2014 Test\u2026")
        second_pass = sanitize_filename(first_pass)
        assert first_pass == second_pass

    def test_only_problematic_chars_string(self):
        """A string built only from replaced characters must not sanitize to empty."""
        only_problematic = "\u2018\u2019\u201c\u201d\u2014\u2026"
        # Only non-emptiness is asserted; the exact output is not pinned here.
        assert sanitize_filename(only_problematic)

    def test_very_long_filename(self):
        """A 1000-character name is returned unchanged by sanitize_filename."""
        long_name = "A" * 1000
        # Any length limit is expected to be enforced elsewhere, not here
        assert sanitize_filename(long_name) == long_name

    def test_special_musical_characters(self):
        """Musical symbols are not in the replacement list, so they are kept."""
        cases = (
            ("♪", "♪ Intro ♪"),
            ("♫", "♫ Music ♫"),
            ("♯", "C♯ Minor"),
            ("♭", "B♭ Major"),
        )
        for symbol, text in cases:
            assert symbol in sanitize_filename(text)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
class TestNormalizeText:
    """Test the normalize_text function for data consistency normalization.

    Unlike the filename sanitizer, the expectations here keep straight double
    quotes and trailing dots; only look-alike Unicode punctuation and
    whitespace are folded to their ASCII forms.
    """

    def test_curly_single_quotes_normalized(self):
        """Test that curly single quotes are converted to straight apostrophe."""
        assert normalize_text("Can\u2019t") == "Can't"
        assert normalize_text("It\u2018s") == "It's"
        assert normalize_text("\u201Aquote\u201B") == "'quote'"

    def test_curly_double_quotes_normalized(self):
        """Test that curly double quotes are converted to straight double quotes."""
        assert normalize_text("\u201CHello\u201D") == '"Hello"'
        assert normalize_text("\u201Equote\u201F") == '"quote"'

    def test_backticks_normalized(self):
        """Test that backticks and similar marks are converted to apostrophe."""
        assert normalize_text("code`here") == "code'here"
        assert normalize_text("acute\u00B4accent") == "acute'accent"
        assert normalize_text("prime\u2032mark") == "prime'mark"

    def test_dashes_normalized(self):
        """Test that various dashes are converted to hyphen-minus."""
        # EN DASH
        assert normalize_text("1990\u20132000") == "1990-2000"
        # EM DASH
        assert normalize_text("word\u2014word") == "word-word"
        # MINUS SIGN
        assert normalize_text("a\u2212b") == "a-b"
        # FIGURE DASH
        assert normalize_text("phone\u2012number") == "phone-number"

    def test_whitespace_normalized(self):
        """Test that various whitespace characters are normalized."""
        # NON-BREAKING SPACE
        assert normalize_text("hello\u00A0world") == "hello world"
        # EM SPACE
        assert normalize_text("hello\u2003world") == "hello world"
        # IDEOGRAPHIC SPACE (CJK full-width)
        assert normalize_text("hello\u3000world") == "hello world"
        # ZERO WIDTH SPACE (removed entirely)
        assert normalize_text("hello\u200Bworld") == "helloworld"

    def test_ellipsis_normalized(self):
        """Test that ellipsis character is converted to three dots."""
        assert normalize_text("Wait\u2026") == "Wait..."
        assert normalize_text("Loading\u2026please wait") == "Loading...please wait"

    def test_multiple_spaces_collapsed(self):
        """Test that multiple spaces are collapsed to one."""
        # The input must really contain a run of spaces for this to test anything
        assert normalize_text("hello    world") == "hello world"
        # After normalizing multiple whitespace chars
        assert normalize_text("hello\u00A0\u00A0\u00A0world") == "hello world"

    def test_leading_trailing_whitespace_stripped(self):
        """Test that leading/trailing whitespace is stripped."""
        assert normalize_text("  hello  ") == "hello"
        assert normalize_text("\u00A0hello\u00A0") == "hello"

    def test_none_input(self):
        """Test that None input returns None."""
        assert normalize_text(None) is None

    def test_non_string_input(self):
        """Test that non-string input is returned unchanged."""
        assert normalize_text(123) == 123
        assert normalize_text(["list"]) == ["list"]

    def test_international_characters_preserved(self):
        """Test that international characters are NOT normalized away."""
        for text in ("日本語", "한국어", "Café", "Björk"):
            assert normalize_text(text) == text

    def test_real_world_examples(self):
        """Test real-world examples with mixed content."""
        # Job d49efab1's title
        assert (
            normalize_text("Mama Says (You Can\u2019t Back Down)")
            == "Mama Says (You Can't Back Down)"
        )
        # Broadway cast with smart quotes and em dash
        assert (
            normalize_text("Footloose \u2014 \u201CFinal Song\u201D")
            == 'Footloose - "Final Song"'
        )
        # Japanese with em dash
        assert normalize_text("宇多田ヒカル \u2014 First Love") == "宇多田ヒカル - First Love"

    def test_idempotent(self):
        """Test that normalizing twice gives the same result."""
        original = "It\u2019s \u201CMy\u201D Song \u2014 Test\u2026"
        once = normalize_text(original)
        twice = normalize_text(once)
        assert once == twice

    def test_text_normalizations_dict_complete(self):
        """Verify TEXT_NORMALIZATIONS includes all expected categories."""
        # Every key of each per-category table must also be a key of the
        # combined table: apostrophes, double quotes, dashes, whitespace.
        category_tables = (
            APOSTROPHE_REPLACEMENTS,
            DOUBLE_QUOTE_REPLACEMENTS,
            DASH_REPLACEMENTS,
            WHITESPACE_REPLACEMENTS,
        )
        for table in category_tables:
            for char in table:
                assert char in TEXT_NORMALIZATIONS
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
class TestModelValidatorNormalization:
    """Check that the request models normalize text fields on construction.

    The models are imported inside each test rather than at module level.
    """

    def test_job_create_normalizes_artist_title(self):
        """JobCreate folds curly apostrophes and em dashes in artist/title."""
        from backend.models.job import JobCreate

        created = JobCreate(artist="Don\u2019t Stop", title="Believin\u2019 \u2014 Live")

        assert created.artist == "Don't Stop"
        assert created.title == "Believin' - Live"

    def test_job_create_preserves_international_chars(self):
        """JobCreate leaves Japanese text in the artist field untouched."""
        from backend.models.job import JobCreate

        created = JobCreate(artist="宇多田ヒカル", title="First Love")

        assert created.artist == "宇多田ヒカル"
        assert created.title == "First Love"

    def test_job_create_handles_none(self):
        """JobCreate accepts artist=None and keeps it as None."""
        from backend.models.job import JobCreate

        created = JobCreate(artist=None, title="Some Title")

        assert created.artist is None
        assert created.title == "Some Title"

    def test_audio_search_normalizes_fields(self):
        """AudioSearchRequest normalizes artist, title and both display fields."""
        from backend.api.routes.audio_search import AudioSearchRequest

        raw_fields = {
            "artist": "Artist\u2019s Name",
            "title": "Song \u2014 Remix",
            "display_artist": "Display\u2019s Artist",
            "display_title": "Display\u2019s Title",
        }
        search_request = AudioSearchRequest(**raw_fields)

        assert search_request.artist == "Artist's Name"
        assert search_request.title == "Song - Remix"
        assert search_request.display_artist == "Display's Artist"
        assert search_request.display_title == "Display's Title"

    def test_audio_search_preserves_international_chars(self):
        """AudioSearchRequest leaves Korean text in the artist field untouched."""
        from backend.api.routes.audio_search import AudioSearchRequest

        search_request = AudioSearchRequest(artist="방탄소년단", title="Dynamite")

        assert search_request.artist == "방탄소년단"
        assert search_request.title == "Dynamite"
|