karaoke-gen 0.90.1__py3-none-any.whl → 0.96.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187) hide show
  1. backend/.coveragerc +20 -0
  2. backend/.gitignore +37 -0
  3. backend/Dockerfile +43 -0
  4. backend/Dockerfile.base +74 -0
  5. backend/README.md +242 -0
  6. backend/__init__.py +0 -0
  7. backend/api/__init__.py +0 -0
  8. backend/api/dependencies.py +457 -0
  9. backend/api/routes/__init__.py +0 -0
  10. backend/api/routes/admin.py +742 -0
  11. backend/api/routes/audio_search.py +903 -0
  12. backend/api/routes/auth.py +348 -0
  13. backend/api/routes/file_upload.py +2076 -0
  14. backend/api/routes/health.py +344 -0
  15. backend/api/routes/internal.py +435 -0
  16. backend/api/routes/jobs.py +1610 -0
  17. backend/api/routes/review.py +652 -0
  18. backend/api/routes/themes.py +162 -0
  19. backend/api/routes/users.py +1014 -0
  20. backend/config.py +172 -0
  21. backend/main.py +133 -0
  22. backend/middleware/__init__.py +5 -0
  23. backend/middleware/audit_logging.py +124 -0
  24. backend/models/__init__.py +0 -0
  25. backend/models/job.py +519 -0
  26. backend/models/requests.py +123 -0
  27. backend/models/theme.py +153 -0
  28. backend/models/user.py +254 -0
  29. backend/models/worker_log.py +164 -0
  30. backend/pyproject.toml +29 -0
  31. backend/quick-check.sh +93 -0
  32. backend/requirements.txt +29 -0
  33. backend/run_tests.sh +60 -0
  34. backend/services/__init__.py +0 -0
  35. backend/services/audio_analysis_service.py +243 -0
  36. backend/services/audio_editing_service.py +278 -0
  37. backend/services/audio_search_service.py +702 -0
  38. backend/services/auth_service.py +630 -0
  39. backend/services/credential_manager.py +792 -0
  40. backend/services/discord_service.py +172 -0
  41. backend/services/dropbox_service.py +301 -0
  42. backend/services/email_service.py +1093 -0
  43. backend/services/encoding_interface.py +454 -0
  44. backend/services/encoding_service.py +405 -0
  45. backend/services/firestore_service.py +512 -0
  46. backend/services/flacfetch_client.py +573 -0
  47. backend/services/gce_encoding/README.md +72 -0
  48. backend/services/gce_encoding/__init__.py +22 -0
  49. backend/services/gce_encoding/main.py +589 -0
  50. backend/services/gce_encoding/requirements.txt +16 -0
  51. backend/services/gdrive_service.py +356 -0
  52. backend/services/job_logging.py +258 -0
  53. backend/services/job_manager.py +842 -0
  54. backend/services/job_notification_service.py +271 -0
  55. backend/services/local_encoding_service.py +590 -0
  56. backend/services/local_preview_encoding_service.py +407 -0
  57. backend/services/lyrics_cache_service.py +216 -0
  58. backend/services/metrics.py +413 -0
  59. backend/services/packaging_service.py +287 -0
  60. backend/services/rclone_service.py +106 -0
  61. backend/services/storage_service.py +209 -0
  62. backend/services/stripe_service.py +275 -0
  63. backend/services/structured_logging.py +254 -0
  64. backend/services/template_service.py +330 -0
  65. backend/services/theme_service.py +469 -0
  66. backend/services/tracing.py +543 -0
  67. backend/services/user_service.py +721 -0
  68. backend/services/worker_service.py +558 -0
  69. backend/services/youtube_service.py +112 -0
  70. backend/services/youtube_upload_service.py +445 -0
  71. backend/tests/__init__.py +4 -0
  72. backend/tests/conftest.py +224 -0
  73. backend/tests/emulator/__init__.py +7 -0
  74. backend/tests/emulator/conftest.py +88 -0
  75. backend/tests/emulator/test_e2e_cli_backend.py +1053 -0
  76. backend/tests/emulator/test_emulator_integration.py +356 -0
  77. backend/tests/emulator/test_style_loading_direct.py +436 -0
  78. backend/tests/emulator/test_worker_logs_direct.py +229 -0
  79. backend/tests/emulator/test_worker_logs_subcollection.py +443 -0
  80. backend/tests/requirements-test.txt +10 -0
  81. backend/tests/requirements.txt +6 -0
  82. backend/tests/test_admin_email_endpoints.py +411 -0
  83. backend/tests/test_api_integration.py +460 -0
  84. backend/tests/test_api_routes.py +93 -0
  85. backend/tests/test_audio_analysis_service.py +294 -0
  86. backend/tests/test_audio_editing_service.py +386 -0
  87. backend/tests/test_audio_search.py +1398 -0
  88. backend/tests/test_audio_services.py +378 -0
  89. backend/tests/test_auth_firestore.py +231 -0
  90. backend/tests/test_config_extended.py +68 -0
  91. backend/tests/test_credential_manager.py +377 -0
  92. backend/tests/test_dependencies.py +54 -0
  93. backend/tests/test_discord_service.py +244 -0
  94. backend/tests/test_distribution_services.py +820 -0
  95. backend/tests/test_dropbox_service.py +472 -0
  96. backend/tests/test_email_service.py +492 -0
  97. backend/tests/test_emulator_integration.py +322 -0
  98. backend/tests/test_encoding_interface.py +412 -0
  99. backend/tests/test_file_upload.py +1739 -0
  100. backend/tests/test_flacfetch_client.py +632 -0
  101. backend/tests/test_gdrive_service.py +524 -0
  102. backend/tests/test_instrumental_api.py +431 -0
  103. backend/tests/test_internal_api.py +343 -0
  104. backend/tests/test_job_creation_regression.py +583 -0
  105. backend/tests/test_job_manager.py +339 -0
  106. backend/tests/test_job_manager_notifications.py +329 -0
  107. backend/tests/test_job_notification_service.py +443 -0
  108. backend/tests/test_jobs_api.py +273 -0
  109. backend/tests/test_local_encoding_service.py +423 -0
  110. backend/tests/test_local_preview_encoding_service.py +567 -0
  111. backend/tests/test_main.py +87 -0
  112. backend/tests/test_models.py +918 -0
  113. backend/tests/test_packaging_service.py +382 -0
  114. backend/tests/test_requests.py +201 -0
  115. backend/tests/test_routes_jobs.py +282 -0
  116. backend/tests/test_routes_review.py +337 -0
  117. backend/tests/test_services.py +556 -0
  118. backend/tests/test_services_extended.py +112 -0
  119. backend/tests/test_storage_service.py +448 -0
  120. backend/tests/test_style_upload.py +261 -0
  121. backend/tests/test_template_service.py +295 -0
  122. backend/tests/test_theme_service.py +516 -0
  123. backend/tests/test_unicode_sanitization.py +522 -0
  124. backend/tests/test_upload_api.py +256 -0
  125. backend/tests/test_validate.py +156 -0
  126. backend/tests/test_video_worker_orchestrator.py +847 -0
  127. backend/tests/test_worker_log_subcollection.py +509 -0
  128. backend/tests/test_worker_logging.py +365 -0
  129. backend/tests/test_workers.py +1116 -0
  130. backend/tests/test_workers_extended.py +178 -0
  131. backend/tests/test_youtube_service.py +247 -0
  132. backend/tests/test_youtube_upload_service.py +568 -0
  133. backend/validate.py +173 -0
  134. backend/version.py +27 -0
  135. backend/workers/README.md +597 -0
  136. backend/workers/__init__.py +11 -0
  137. backend/workers/audio_worker.py +618 -0
  138. backend/workers/lyrics_worker.py +683 -0
  139. backend/workers/render_video_worker.py +483 -0
  140. backend/workers/screens_worker.py +525 -0
  141. backend/workers/style_helper.py +198 -0
  142. backend/workers/video_worker.py +1277 -0
  143. backend/workers/video_worker_orchestrator.py +701 -0
  144. backend/workers/worker_logging.py +278 -0
  145. karaoke_gen/instrumental_review/static/index.html +7 -4
  146. karaoke_gen/karaoke_finalise/karaoke_finalise.py +6 -1
  147. karaoke_gen/utils/__init__.py +163 -8
  148. karaoke_gen/video_background_processor.py +9 -4
  149. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.96.0.dist-info}/METADATA +1 -1
  150. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.96.0.dist-info}/RECORD +186 -41
  151. lyrics_transcriber/correction/agentic/providers/config.py +9 -5
  152. lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +1 -51
  153. lyrics_transcriber/correction/corrector.py +192 -130
  154. lyrics_transcriber/correction/operations.py +24 -9
  155. lyrics_transcriber/frontend/package-lock.json +2 -2
  156. lyrics_transcriber/frontend/package.json +1 -1
  157. lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +1 -1
  158. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +11 -7
  159. lyrics_transcriber/frontend/src/components/EditActionBar.tsx +31 -5
  160. lyrics_transcriber/frontend/src/components/EditModal.tsx +28 -10
  161. lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +123 -27
  162. lyrics_transcriber/frontend/src/components/EditWordList.tsx +112 -60
  163. lyrics_transcriber/frontend/src/components/Header.tsx +90 -76
  164. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +53 -31
  165. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +44 -13
  166. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +66 -50
  167. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +124 -30
  168. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +1 -1
  169. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +12 -5
  170. lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +3 -3
  171. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +1 -1
  172. lyrics_transcriber/frontend/src/components/WordDivider.tsx +11 -7
  173. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +4 -2
  174. lyrics_transcriber/frontend/src/hooks/useManualSync.ts +103 -1
  175. lyrics_transcriber/frontend/src/theme.ts +42 -15
  176. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
  177. lyrics_transcriber/frontend/vite.config.js +5 -0
  178. lyrics_transcriber/frontend/web_assets/assets/{index-BECn1o8Q.js → index-BSMgOq4Z.js} +6959 -5782
  179. lyrics_transcriber/frontend/web_assets/assets/index-BSMgOq4Z.js.map +1 -0
  180. lyrics_transcriber/frontend/web_assets/index.html +6 -2
  181. lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.svg +5 -0
  182. lyrics_transcriber/output/generator.py +17 -3
  183. lyrics_transcriber/output/video.py +60 -95
  184. lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js.map +0 -1
  185. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.96.0.dist-info}/WHEEL +0 -0
  186. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.96.0.dist-info}/entry_points.txt +0 -0
  187. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.96.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,522 @@
1
+ """
2
+ Comprehensive tests for Unicode and special character handling.
3
+
4
+ This test file verifies that artist/title with Unicode characters (curly quotes,
5
+ em dashes, non-ASCII characters) are properly sanitized throughout the codebase
6
+ to prevent:
7
+ 1. HTTP header encoding failures (Content-Disposition, email subjects)
8
+ 2. API query failures (Google Drive)
9
+ 3. Filename encoding issues (Modal API, filesystem)
10
+
11
+ The root cause was job d49efab1 which had title "Mama Says (You Can’t Back Down)"
12
+ with a curly apostrophe (U+2019) that caused the audio separation to fail because
13
+ the Modal API couldn't handle the non-latin-1 character in HTTP headers.
14
+ """
15
+
16
+ import pytest
17
+ from karaoke_gen.utils import (
18
+ sanitize_filename,
19
+ normalize_text,
20
+ UNICODE_REPLACEMENTS,
21
+ TEXT_NORMALIZATIONS,
22
+ APOSTROPHE_REPLACEMENTS,
23
+ DOUBLE_QUOTE_REPLACEMENTS,
24
+ DASH_REPLACEMENTS,
25
+ WHITESPACE_REPLACEMENTS,
26
+ )
27
+
28
+
29
class TestSanitizeFilename:
    """Exercise sanitize_filename across Unicode and filesystem edge cases.

    Problematic characters are written as explicit Unicode escapes so the
    inputs cannot be silently turned into their ASCII look-alikes by an
    editor, formatter, or diff viewer. If that happened, input and expected
    output would be identical and the tests would pass without testing
    anything.
    """

    def test_curly_single_quotes(self):
        """Curly single quotes are converted to straight apostrophes."""
        # U+2019 is the exact character that caused job d49efab1 to fail.
        assert sanitize_filename("Can\u2019t") == "Can't"
        assert sanitize_filename("It\u2019s") == "It's"
        # U+2018 (left single quote) paired with U+2019 (right single quote).
        assert sanitize_filename("\u2018Hello\u2019") == "'Hello'"

    def test_curly_double_quotes(self):
        """Curly double quotes end up as underscores.

        Double quotes are filesystem-unsafe: curly quotes first become
        straight quotes, and straight double quotes then become underscores.
        """
        assert sanitize_filename("\u201cHello\u201d") == "_Hello_"

    def test_em_dash(self):
        """Em dashes (U+2014) are converted to regular hyphens."""
        assert sanitize_filename("Artist \u2014 Title") == "Artist - Title"
        assert sanitize_filename("Song\u2014Name") == "Song-Name"

    def test_en_dash(self):
        """En dashes (U+2013) are converted to regular hyphens."""
        assert sanitize_filename("1990\u20132000") == "1990-2000"

    def test_ellipsis(self):
        """Horizontal ellipsis (U+2026) is converted to three dots."""
        # The ellipsis becomes "..." but trailing dots are then stripped.
        assert sanitize_filename("Wait\u2026") == "Wait"
        # In the middle of a string the three dots are preserved.
        assert sanitize_filename("Wait\u2026Here") == "Wait...Here"

    def test_non_breaking_space(self):
        """Non-breaking spaces (U+00A0) are converted to regular spaces."""
        assert sanitize_filename("Hello\u00A0World") == "Hello World"

    def test_filesystem_unsafe_characters(self):
        """Each filesystem-unsafe character is replaced by an underscore."""
        for unsafe in '/\\:*?"<>|':
            result = sanitize_filename(f"file{unsafe}name")
            assert result == "file_name", f"unsafe character {unsafe!r} gave {result!r}"

    def test_trailing_periods_and_spaces(self):
        """Trailing periods and spaces are removed."""
        for raw in ("filename.", "filename...", "filename ", "filename . "):
            assert sanitize_filename(raw) == "filename", repr(raw)

    def test_leading_periods_and_spaces(self):
        """Leading periods and spaces are removed."""
        for raw in (".filename", "...filename", " filename", " . filename"):
            assert sanitize_filename(raw) == "filename", repr(raw)

    def test_multiple_underscores_collapsed(self):
        """Multiple consecutive underscores are collapsed to one."""
        assert sanitize_filename("file___name") == "file_name"
        # Several unsafe characters in a row produce a single underscore.
        assert sanitize_filename("file?*:name") == "file_name"

    def test_multiple_spaces_collapsed(self):
        """Multiple consecutive spaces are collapsed to one."""
        # The input must really contain a run of spaces for this to test anything.
        assert sanitize_filename("file    name") == "file name"

    def test_none_input(self):
        """None input returns None."""
        assert sanitize_filename(None) is None

    def test_empty_string(self):
        """Empty string returns empty string."""
        assert sanitize_filename("") == ""

    def test_real_world_examples(self):
        """Real-world examples that have caused issues."""
        # The exact title from job d49efab1 (apostrophe is U+2019).
        assert (
            sanitize_filename("Mama Says (You Can\u2019t Back Down)")
            == "Mama Says (You Can't Back Down)"
        )

        # Broadway cast with em dash and smart double quotes.
        assert (
            sanitize_filename("Footloose (Broadway Cast) \u2014 \u201cFinal Song\u201d")
            == "Footloose (Broadway Cast) - _Final Song_"
        )

        # Japanese artist with em dash.
        assert sanitize_filename("宇多田ヒカル \u2014 First Love") == "宇多田ヒカル - First Love"

        # Korean title (should pass through unchanged).
        assert sanitize_filename("아이유 - 좋은 날") == "아이유 - 좋은 날"

        # Mixed content: curly apostrophes next to ASCII punctuation.
        assert (
            sanitize_filename("L\u2019Arc~en~Ciel - Driver\u2019s High")
            == "L'Arc~en~Ciel - Driver's High"
        )

    def test_combination_of_issues(self):
        """Strings with several problematic characters at once."""
        result = sanitize_filename("It\u2019s \u201cMy\u201d Song \u2014 Volume\u20261")
        # Curly ' -> ', curly " -> _ (via filesystem check), em dash -> -, ellipsis -> ...
        assert result == "It's _My_ Song - Volume...1"

    def test_unicode_replacements_dict_complete(self):
        """All expected Unicode characters are keys of UNICODE_REPLACEMENTS."""
        expected_keys = (
            # Curly quotes
            ("\u2018", "LEFT SINGLE QUOTATION MARK"),
            ("\u2019", "RIGHT SINGLE QUOTATION MARK"),
            ("\u201A", "SINGLE LOW-9 QUOTATION MARK"),
            ("\u201B", "SINGLE HIGH-REVERSED-9 QUOTATION MARK"),
            ("\u201C", "LEFT DOUBLE QUOTATION MARK"),
            ("\u201D", "RIGHT DOUBLE QUOTATION MARK"),
            ("\u201E", "DOUBLE LOW-9 QUOTATION MARK"),
            ("\u201F", "DOUBLE HIGH-REVERSED-9 QUOTATION MARK"),
            # Dashes
            ("\u2013", "EN DASH"),
            ("\u2014", "EM DASH"),
            # Other
            ("\u2026", "HORIZONTAL ELLIPSIS"),
            ("\u00A0", "NON-BREAKING SPACE"),
        )
        for char, name in expected_keys:
            assert char in UNICODE_REPLACEMENTS, f"missing U+{ord(char):04X} {name}"
152
+
153
+
154
class TestSanitizationInContext:
    """Check sanitization in the ways artist/title data is actually used.

    Each test rebuilds the string a consumer would build (Modal filename,
    email subject, Content-Disposition filename, Google Drive query, Dropbox
    path) from sanitized parts and checks the property that consumer needs.
    """

    def test_artist_title_format_for_audio_separation(self):
        """Artist-title formatting is sanitized for the Modal API."""
        # Simulate what audio_worker.py does.
        artist = "Footloose (Broadway Cast)"
        title = "Mama Says (You Can\u2019t Back Down)"  # Curly apostrophe U+2019

        safe_artist = sanitize_filename(artist) if artist else "Unknown"
        safe_title = sanitize_filename(title) if title else "Unknown"
        artist_title = f"{safe_artist} - {safe_title}"

        # The curly apostrophe should be converted to a straight one.
        assert "'" in artist_title
        assert "\u2019" not in artist_title

    def test_email_subject_safe_for_latin1(self):
        """Email subjects can be encoded to latin-1 after sanitization."""
        artist = "Café del Mar"  # Accented e is valid latin-1
        title = "It\u2019s a \u201cBeautiful\u201d Day \u2014 Remix"

        safe_artist = sanitize_filename(artist) if artist else None
        safe_title = sanitize_filename(title) if title else None
        subject = f"{safe_artist} - {safe_title}"

        # Accented characters should pass through (they are valid latin-1).
        assert "é" in subject
        # Curly quotes and em dash should be converted.
        assert "'" in subject
        assert "-" in subject
        assert "\u2019" not in subject
        # Actually perform the encoding the docstring promises; a leftover
        # non-latin-1 character raises UnicodeEncodeError naming the culprit.
        subject.encode("latin-1")

    def test_content_disposition_safe(self):
        """Filenames are safe for HTTP Content-Disposition headers."""
        artist = "Artist\u2019s \u201cName\u201d"
        title = "Song\u2014Title"

        safe_artist = sanitize_filename(artist) if artist else None
        safe_title = sanitize_filename(title) if title else None
        filename = f"{safe_artist} - {safe_title} (Final Karaoke).mp4"

        # HTTP headers are latin-1 encoded. Encoding directly (instead of
        # catching the error into a flag) keeps the offending character and
        # its position in the failure output.
        filename.encode("latin-1")

    def test_google_drive_query_safe(self):
        """Filenames do not break Google Drive API queries."""
        base_name = "Artist\u2019s \u201cSong\u201d"

        safe_base_name = sanitize_filename(base_name) if base_name else base_name
        filename = f"NOMAD-1234 - {safe_base_name}.mp4"

        # Google Drive queries delimit values with single quotes; the query
        # escaping handles straight quotes only.
        escaped = filename.replace("'", "\\'")
        query = f"name='{escaped}'"

        # No curly quote of any kind may reach the query string.
        for curly in ("\u2018", "\u2019", "\u201c", "\u201d"):
            assert curly not in query, f"U+{ord(curly):04X} leaked into query"

    def test_dropbox_path_safe(self):
        """Dropbox paths do not contain problematic characters."""
        artist = "Don\u2019t Stop"
        title = "Believin\u2019"

        safe_artist = sanitize_filename(artist) if artist else "Unknown"
        safe_title = sanitize_filename(title) if title else "Unknown"
        folder_name = f"NOMAD-1234 - {safe_artist} - {safe_title}"
        remote_path = f"/Karaoke/{folder_name}"

        # Path should not have curly quotes (U+2018 and U+2019)...
        assert "\u2018" not in remote_path
        assert "\u2019" not in remote_path
        # ...but should have the converted straight apostrophe.
        assert "'" in remote_path
244
+
245
+
246
class TestInternationalCharacters:
    """International / non-ASCII text must survive sanitization.

    The goal is NOT to strip all non-ASCII, but to convert problematic
    Unicode to safe equivalents while preserving legitimate international text.
    """

    def test_japanese_preserved(self):
        """Japanese text round-trips unchanged."""
        for text in ("宇多田ヒカル", "君が代"):
            assert sanitize_filename(text) == text

    def test_korean_preserved(self):
        """Korean text round-trips unchanged."""
        for text in ("방탄소년단", "좋은 날"):
            assert sanitize_filename(text) == text

    def test_chinese_preserved(self):
        """Chinese text round-trips unchanged."""
        for text in ("周杰倫", "青花瓷"):
            assert sanitize_filename(text) == text

    def test_cyrillic_preserved(self):
        """Cyrillic text round-trips unchanged."""
        for text in ("Тату", "Не верь, не бойся"):
            assert sanitize_filename(text) == text

    def test_arabic_preserved(self):
        """Arabic text round-trips unchanged."""
        text = "فيروز"
        assert sanitize_filename(text) == text

    def test_accented_latin_preserved(self):
        """Accented Latin text round-trips unchanged."""
        for text in ("Café", "Señorita", "Naïve", "Björk"):
            assert sanitize_filename(text) == text

    def test_mixed_script_preserved(self):
        """Strings mixing scripts round-trip unchanged."""
        mixed = (
            "宇多田ヒカル - First Love",  # Japanese artist, English title
            "BTS 방탄소년단 - Dynamite",  # K-pop with English
        )
        for text in mixed:
            assert sanitize_filename(text) == text
290
+
291
+
292
class TestEdgeCasesAndRegression:
    """Edge cases and regression tests for specific bugs."""

    def test_job_d49efab1_exact_title(self):
        """Exact reproduction of the job d49efab1 failure case.

        The job had:
        - artist: "Footloose (Broadway Cast)" (from display_artist override)
        - title: "Mama Says (You Can't Back Down)", where the apostrophe was
          the curly U+2019 rather than the ASCII one

        This caused Stage 2 audio separation to fail because the filename
        with the curly apostrophe couldn't be encoded in HTTP headers.
        """
        artist = "Footloose (Broadway Cast)"
        title = "Mama Says (You Can\u2019t Back Down)"  # Explicit U+2019

        safe_artist = sanitize_filename(artist)
        safe_title = sanitize_filename(title)
        artist_title = f"{safe_artist} - {safe_title}"

        # The result should use a straight apostrophe.
        assert "Can't" in artist_title
        assert "\u2019" not in artist_title

        # Should be HTTP-header safe. Encoding directly lets a failure report
        # the exact character and position instead of a bare "assert False".
        artist_title.encode("latin-1")

    def test_double_sanitization_idempotent(self):
        """Sanitizing twice gives the same result as sanitizing once."""
        original = "It\u2019s \u201cMy\u201d Song \u2014 Test\u2026"
        once = sanitize_filename(original)
        twice = sanitize_filename(once)
        assert once == twice

    def test_only_problematic_chars_string(self):
        """A string made entirely of problematic characters is still handled."""
        problematic = "\u2018\u2019\u201c\u201d\u2014\u2026"
        result = sanitize_filename(problematic)
        # The exact output depends on the order of the replacement, collapse
        # and strip steps, so only order-independent properties are pinned:
        # something is left, and none of the original characters survive.
        assert result
        for char in problematic:
            assert char not in result, f"U+{ord(char):04X} survived sanitization"

    def test_very_long_filename(self):
        """Very long filenames are passed through without truncation."""
        long_name = "A" * 1000
        result = sanitize_filename(long_name)
        assert result == long_name  # No truncation in sanitize_filename itself

    def test_special_musical_characters(self):
        """Musical symbols pass through; they are not in the replacement list."""
        samples = (
            ("♪", "♪ Intro ♪"),
            ("♫", "♫ Music ♫"),
            ("♯", "C♯ Minor"),
            ("♭", "B♭ Major"),
        )
        for symbol, text in samples:
            assert symbol in sanitize_filename(text), f"{symbol!r} was dropped from {text!r}"
351
+
352
+
353
class TestNormalizeText:
    """Test normalize_text, the data-consistency normalization function.

    The assertions below show that, unlike sanitize_filename, it keeps
    straight double quotes and trailing dots; it only maps look-alike
    punctuation and whitespace to their plain equivalents.
    """

    def test_curly_single_quotes_normalized(self):
        """Curly single quotes are converted to a straight apostrophe."""
        assert normalize_text("Can\u2019t") == "Can't"
        assert normalize_text("It\u2018s") == "It's"
        assert normalize_text("\u201Aquote\u201B") == "'quote'"

    def test_curly_double_quotes_normalized(self):
        """Curly double quotes are converted to straight double quotes."""
        assert normalize_text("\u201CHello\u201D") == '"Hello"'
        assert normalize_text("\u201Equote\u201F") == '"quote"'

    def test_backticks_normalized(self):
        """Backticks and similar marks are converted to an apostrophe."""
        assert normalize_text("code`here") == "code'here"
        assert normalize_text("acute\u00B4accent") == "acute'accent"
        assert normalize_text("prime\u2032mark") == "prime'mark"

    def test_dashes_normalized(self):
        """Various dashes are converted to hyphen-minus."""
        # EN DASH
        assert normalize_text("1990\u20132000") == "1990-2000"
        # EM DASH
        assert normalize_text("word\u2014word") == "word-word"
        # MINUS SIGN
        assert normalize_text("a\u2212b") == "a-b"
        # FIGURE DASH
        assert normalize_text("phone\u2012number") == "phone-number"

    def test_whitespace_normalized(self):
        """Various whitespace characters are normalized."""
        # NON-BREAKING SPACE
        assert normalize_text("hello\u00A0world") == "hello world"
        # EM SPACE
        assert normalize_text("hello\u2003world") == "hello world"
        # IDEOGRAPHIC SPACE (CJK full-width)
        assert normalize_text("hello\u3000world") == "hello world"
        # ZERO WIDTH SPACE (removed entirely)
        assert normalize_text("hello\u200Bworld") == "helloworld"

    def test_ellipsis_normalized(self):
        """The ellipsis character is converted to three dots."""
        assert normalize_text("Wait\u2026") == "Wait..."
        assert normalize_text("Loading\u2026please wait") == "Loading...please wait"

    def test_multiple_spaces_collapsed(self):
        """Runs of spaces are collapsed to one."""
        # The input must really contain a run of spaces for this to test anything.
        assert normalize_text("hello     world") == "hello world"
        # Runs produced by normalizing other whitespace characters collapse too.
        assert normalize_text("hello\u00A0\u00A0\u00A0world") == "hello world"

    def test_leading_trailing_whitespace_stripped(self):
        """Leading/trailing whitespace is stripped."""
        assert normalize_text(" hello ") == "hello"
        assert normalize_text("\u00A0hello\u00A0") == "hello"

    def test_none_input(self):
        """None input returns None."""
        assert normalize_text(None) is None

    def test_non_string_input(self):
        """Non-string input is returned unchanged."""
        assert normalize_text(123) == 123
        assert normalize_text(["list"]) == ["list"]

    def test_international_characters_preserved(self):
        """International characters are NOT normalized away."""
        for text in ("日本語", "한국어", "Café", "Björk"):
            assert normalize_text(text) == text

    def test_real_world_examples(self):
        """Real-world examples with mixed content."""
        # Job d49efab1's title
        assert (
            normalize_text("Mama Says (You Can\u2019t Back Down)")
            == "Mama Says (You Can't Back Down)"
        )
        # Broadway cast with smart quotes and em dash
        assert (
            normalize_text("Footloose \u2014 \u201CFinal Song\u201D")
            == 'Footloose - "Final Song"'
        )
        # Japanese with em dash
        assert normalize_text("宇多田ヒカル \u2014 First Love") == "宇多田ヒカル - First Love"

    def test_idempotent(self):
        """Normalizing twice gives the same result as normalizing once."""
        original = "It\u2019s \u201CMy\u201D Song \u2014 Test\u2026"
        once = normalize_text(original)
        twice = normalize_text(once)
        assert once == twice

    def test_text_normalizations_dict_complete(self):
        """TEXT_NORMALIZATIONS includes every key of each category mapping."""
        categories = (
            ("apostrophe", APOSTROPHE_REPLACEMENTS),
            ("double quote", DOUBLE_QUOTE_REPLACEMENTS),
            ("dash", DASH_REPLACEMENTS),
            ("whitespace", WHITESPACE_REPLACEMENTS),
        )
        for label, replacements in categories:
            for char in replacements:
                assert char in TEXT_NORMALIZATIONS, f"{label} character {char!r} is missing"
460
+
461
+
462
class TestModelValidatorNormalization:
    """Model validators should normalize incoming text fields."""

    def test_job_create_normalizes_artist_title(self):
        """JobCreate normalizes curly punctuation in artist and title."""
        from backend.models.job import JobCreate

        payload = {
            "artist": "Don\u2019t Stop",
            "title": "Believin\u2019 \u2014 Live",
        }
        job = JobCreate(**payload)

        assert job.artist == "Don't Stop"
        assert job.title == "Believin' - Live"

    def test_job_create_preserves_international_chars(self):
        """JobCreate leaves international characters untouched."""
        from backend.models.job import JobCreate

        payload = {"artist": "宇多田ヒカル", "title": "First Love"}
        job = JobCreate(**payload)

        assert job.artist == "宇多田ヒカル"
        assert job.title == "First Love"

    def test_job_create_handles_none(self):
        """JobCreate accepts None for artist and keeps it as None."""
        from backend.models.job import JobCreate

        payload = {"artist": None, "title": "Some Title"}
        job = JobCreate(**payload)

        assert job.artist is None
        assert job.title == "Some Title"

    def test_audio_search_normalizes_fields(self):
        """AudioSearchRequest normalizes all four text fields."""
        from backend.api.routes.audio_search import AudioSearchRequest

        payload = {
            "artist": "Artist\u2019s Name",
            "title": "Song \u2014 Remix",
            "display_artist": "Display\u2019s Artist",
            "display_title": "Display\u2019s Title",
        }
        request = AudioSearchRequest(**payload)

        assert request.artist == "Artist's Name"
        assert request.title == "Song - Remix"
        assert request.display_artist == "Display's Artist"
        assert request.display_title == "Display's Title"

    def test_audio_search_preserves_international_chars(self):
        """AudioSearchRequest leaves international characters untouched."""
        from backend.api.routes.audio_search import AudioSearchRequest

        payload = {"artist": "방탄소년단", "title": "Dynamite"}
        request = AudioSearchRequest(**payload)

        assert request.artist == "방탄소년단"
        assert request.title == "Dynamite"