karaoke-lyrics-processor 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- karaoke_lyrics_processor/karaoke_lyrics_processor.py +101 -24
- {karaoke_lyrics_processor-0.3.0.dist-info → karaoke_lyrics_processor-0.4.0.dist-info}/METADATA +1 -1
- karaoke_lyrics_processor-0.4.0.dist-info/RECORD +8 -0
- karaoke_lyrics_processor-0.3.0.dist-info/RECORD +0 -8
- {karaoke_lyrics_processor-0.3.0.dist-info → karaoke_lyrics_processor-0.4.0.dist-info}/LICENSE +0 -0
- {karaoke_lyrics_processor-0.3.0.dist-info → karaoke_lyrics_processor-0.4.0.dist-info}/WHEEL +0 -0
- {karaoke_lyrics_processor-0.3.0.dist-info → karaoke_lyrics_processor-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -5,6 +5,7 @@ import unicodedata
|
|
5
5
|
import docx2txt
|
6
6
|
from striprtf.striprtf import rtf_to_text
|
7
7
|
import os
|
8
|
+
import codecs
|
8
9
|
|
9
10
|
|
10
11
|
class KaraokeLyricsProcessor:
|
@@ -46,6 +47,8 @@ class KaraokeLyricsProcessor:
|
|
46
47
|
def read_input_file(self):
|
47
48
|
file_extension = os.path.splitext(self.input_filename)[1].lower()
|
48
49
|
|
50
|
+
self.logger.debug(f"Reading input file: {self.input_filename}")
|
51
|
+
|
49
52
|
if file_extension == ".txt":
|
50
53
|
return self.read_txt_file()
|
51
54
|
elif file_extension in [".docx", ".doc"]:
|
@@ -56,8 +59,14 @@ class KaraokeLyricsProcessor:
|
|
56
59
|
raise ValueError(f"Unsupported file format: {file_extension}")
|
57
60
|
|
58
61
|
def read_txt_file(self):
|
59
|
-
with open(self.input_filename, "r", encoding="utf-8") as infile:
|
60
|
-
|
62
|
+
with codecs.open(self.input_filename, "r", encoding="utf-8") as infile:
|
63
|
+
content = infile.read()
|
64
|
+
self.logger.debug(f"Raw content read from file: {repr(content)}")
|
65
|
+
lines = content.splitlines()
|
66
|
+
self.logger.debug(f"Number of lines read: {len(lines)}")
|
67
|
+
for i, line in enumerate(lines):
|
68
|
+
self.logger.debug(f"Line {i}: {repr(line)}")
|
69
|
+
return self.clean_text(content).splitlines()
|
61
70
|
|
62
71
|
def read_doc_file(self):
|
63
72
|
text = docx2txt.process(self.input_filename)
|
@@ -70,13 +79,17 @@ class KaraokeLyricsProcessor:
|
|
70
79
|
return self.clean_text(plain_text).splitlines()
|
71
80
|
|
72
81
|
def clean_text(self, text):
|
73
|
-
|
74
|
-
|
82
|
+
self.logger.debug(f"Cleaning text: {repr(text)}")
|
83
|
+
# Remove any non-printable characters except newlines and U+2005 (four-per-em space)
|
84
|
+
cleaned = "".join(char for char in text if char.isprintable() or char in ["\n", "\u2005"])
|
85
|
+
self.logger.debug(f"Text after removing non-printable characters: {repr(cleaned)}")
|
75
86
|
# Replace multiple newlines with a single newline
|
76
|
-
|
87
|
+
cleaned = re.sub(r"\n{2,}", "\n", cleaned)
|
88
|
+
self.logger.debug(f"Text after replacing multiple newlines: {repr(cleaned)}")
|
77
89
|
# Remove leading/trailing whitespace from each line
|
78
|
-
|
79
|
-
|
90
|
+
cleaned = "\n".join(line.strip() for line in cleaned.splitlines())
|
91
|
+
self.logger.debug(f"Final cleaned text: {repr(cleaned)}")
|
92
|
+
return cleaned
|
80
93
|
|
81
94
|
def find_best_split_point(self, line):
|
82
95
|
"""
|
@@ -135,15 +148,28 @@ class KaraokeLyricsProcessor:
|
|
135
148
|
Replace non-printable space-like characters, tabs, and other whitespace with regular spaces,
|
136
149
|
excluding newline characters.
|
137
150
|
"""
|
138
|
-
self.logger.debug(f"Replacing non-printable spaces in: {text}")
|
151
|
+
self.logger.debug(f"Replacing non-printable spaces in: {repr(text)}")
|
152
|
+
|
153
|
+
# Log each character and its Unicode code point
|
154
|
+
# for i, char in enumerate(text):
|
155
|
+
# self.logger.debug(f"Character at position {i}: {repr(char)} (Unicode: U+{ord(char):04X})")
|
156
|
+
|
139
157
|
# Define a pattern for space-like characters, including tabs and other whitespace, but excluding newlines
|
140
158
|
space_pattern = r"[^\S\n\r]|\u00A0|\u1680|\u2000-\u200A|\u202F|\u205F|\u3000"
|
159
|
+
|
141
160
|
# Replace matched characters with a regular space
|
142
161
|
cleaned_text = re.sub(space_pattern, " ", text)
|
162
|
+
|
163
|
+
# Log the result of the replacement
|
164
|
+
self.logger.debug(f"Text after replacing non-printable spaces: {repr(cleaned_text)}")
|
165
|
+
|
143
166
|
# Remove leading/trailing spaces and collapse multiple spaces into one, preserving newlines
|
144
|
-
|
145
|
-
|
146
|
-
|
167
|
+
final_text = re.sub(r" +", " ", cleaned_text).strip()
|
168
|
+
|
169
|
+
# Log the final result
|
170
|
+
self.logger.debug(f"Final text after cleaning: {repr(final_text)}")
|
171
|
+
|
172
|
+
return final_text
|
147
173
|
|
148
174
|
def clean_punctuation_spacing(self, text):
|
149
175
|
"""
|
@@ -155,36 +181,52 @@ class KaraokeLyricsProcessor:
|
|
155
181
|
self.logger.debug(f"Text after cleaning punctuation spacing: {cleaned_text}")
|
156
182
|
return cleaned_text
|
157
183
|
|
184
|
+
def fix_commas_inside_quotes(self, text):
|
185
|
+
"""
|
186
|
+
Move commas inside quotes to after the closing quote.
|
187
|
+
"""
|
188
|
+
self.logger.debug(f"Fixing commas inside quotes in: {text}")
|
189
|
+
# Use regex to find patterns where a comma is inside quotes and move it outside
|
190
|
+
fixed_text = re.sub(r'(".*?)(,)(\s*")', r"\1\3\2", text)
|
191
|
+
self.logger.debug(f"Text after fixing commas inside quotes: {fixed_text}")
|
192
|
+
return fixed_text
|
193
|
+
|
158
194
|
def process_line(self, line):
|
159
195
|
"""
|
160
196
|
Process a single line to ensure it's within the maximum length,
|
161
197
|
handle parentheses, and replace non-printable spaces.
|
162
198
|
"""
|
163
|
-
# Replace non-printable spaces at the beginning
|
164
199
|
line = self.replace_non_printable_spaces(line)
|
165
|
-
# Clean up punctuation spacing
|
166
200
|
line = self.clean_punctuation_spacing(line)
|
201
|
+
line = self.fix_commas_inside_quotes(line)
|
167
202
|
|
168
203
|
processed_lines = []
|
169
204
|
iteration_count = 0
|
170
205
|
max_iterations = 100 # Failsafe limit
|
171
206
|
|
172
|
-
while len(line) > self.max_line_length:
|
173
|
-
if iteration_count > max_iterations:
|
174
|
-
self.logger.error(f"Maximum iterations exceeded in process_line for line: {line}")
|
175
|
-
break
|
176
|
-
|
207
|
+
while len(line) > self.max_line_length and iteration_count < max_iterations:
|
177
208
|
# Check if the line contains parentheses
|
178
209
|
if "(" in line and ")" in line:
|
179
210
|
start_paren = line.find("(")
|
180
|
-
end_paren =
|
211
|
+
end_paren = self.find_matching_paren(line, start_paren)
|
181
212
|
if end_paren < len(line) and line[end_paren] == ",":
|
182
213
|
end_paren += 1
|
183
214
|
|
215
|
+
# Process text before parentheses if it exists
|
184
216
|
if start_paren > 0:
|
185
|
-
|
186
|
-
|
187
|
-
|
217
|
+
before_paren = line[:start_paren].strip()
|
218
|
+
processed_lines.extend(self.split_line(before_paren))
|
219
|
+
|
220
|
+
# Process text within parentheses
|
221
|
+
paren_content = line[start_paren : end_paren + 1].strip()
|
222
|
+
if len(paren_content) > self.max_line_length:
|
223
|
+
# Split the content within parentheses if it's too long
|
224
|
+
split_paren_content = self.split_line(paren_content)
|
225
|
+
processed_lines.extend(split_paren_content)
|
226
|
+
else:
|
227
|
+
processed_lines.append(paren_content)
|
228
|
+
|
229
|
+
line = line[end_paren + 1 :].strip()
|
188
230
|
else:
|
189
231
|
split_point = self.find_best_split_point(line)
|
190
232
|
processed_lines.append(line[:split_point].strip())
|
@@ -192,11 +234,46 @@ class KaraokeLyricsProcessor:
|
|
192
234
|
|
193
235
|
iteration_count += 1
|
194
236
|
|
195
|
-
if line: # Add
|
196
|
-
processed_lines.
|
237
|
+
if line: # Add any remaining part
|
238
|
+
processed_lines.extend(self.split_line(line))
|
239
|
+
|
240
|
+
if iteration_count >= max_iterations:
|
241
|
+
self.logger.error(f"Maximum iterations exceeded in process_line for line: {line}")
|
197
242
|
|
198
243
|
return processed_lines
|
199
244
|
|
245
|
+
def find_matching_paren(self, line, start_index):
|
246
|
+
"""
|
247
|
+
Find the index of the matching closing parenthesis for the opening parenthesis at start_index.
|
248
|
+
"""
|
249
|
+
stack = 0
|
250
|
+
for i in range(start_index, len(line)):
|
251
|
+
if line[i] == "(":
|
252
|
+
stack += 1
|
253
|
+
elif line[i] == ")":
|
254
|
+
stack -= 1
|
255
|
+
if stack == 0:
|
256
|
+
return i
|
257
|
+
return -1 # No matching parenthesis found
|
258
|
+
|
259
|
+
def split_line(self, line):
|
260
|
+
"""
|
261
|
+
Split a line into multiple lines if it exceeds the maximum length.
|
262
|
+
"""
|
263
|
+
if len(line) <= self.max_line_length:
|
264
|
+
return [line]
|
265
|
+
|
266
|
+
split_lines = []
|
267
|
+
while len(line) > self.max_line_length:
|
268
|
+
split_point = self.find_best_split_point(line)
|
269
|
+
split_lines.append(line[:split_point].strip())
|
270
|
+
line = line[split_point:].strip()
|
271
|
+
|
272
|
+
if line:
|
273
|
+
split_lines.append(line)
|
274
|
+
|
275
|
+
return split_lines
|
276
|
+
|
200
277
|
def process(self):
|
201
278
|
self.logger.info(f"Processing input lyrics from {self.input_filename}")
|
202
279
|
|
{karaoke_lyrics_processor-0.3.0.dist-info → karaoke_lyrics_processor-0.4.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: karaoke-lyrics-processor
|
3
|
-
Version: 0.3.0
|
3
|
+
Version: 0.4.0
|
4
4
|
Summary: Process song lyrics to prepare them for karaoke video production, e.g. by splitting long lines
|
5
5
|
Home-page: https://github.com/karaokenerds/karaoke-lyrics-processor
|
6
6
|
License: MIT
|
@@ -0,0 +1,8 @@
|
|
1
|
+
karaoke_lyrics_processor/__init__.py,sha256=rLRkJQi61qkRiNXdlTleE3ahJ1oBKcghYVkz64x7IIg,62
|
2
|
+
karaoke_lyrics_processor/cli.py,sha256=bdtseRI2jcChb1bMr92pc5mpSWpHXh4TSzA2tknbyjU,2522
|
3
|
+
karaoke_lyrics_processor/karaoke_lyrics_processor.py,sha256=3x5Ev9xzJ_FHarHDiYHtxftpXH0PaWZGJl9GjYeeIg0,13510
|
4
|
+
karaoke_lyrics_processor-0.4.0.dist-info/LICENSE,sha256=BiPihPDxhxIPEx6yAxVfAljD5Bhm_XG2teCbPEj_m0Y,1069
|
5
|
+
karaoke_lyrics_processor-0.4.0.dist-info/METADATA,sha256=_tWFNbjNUj4WoA79ZqVCsRwH2J5KlMqXQ8j8ZthzoGI,4264
|
6
|
+
karaoke_lyrics_processor-0.4.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
7
|
+
karaoke_lyrics_processor-0.4.0.dist-info/entry_points.txt,sha256=hjFp6CUxl1p-1WJYfB6TbNcI_DHEnVzX3BXAs4y_0O8,78
|
8
|
+
karaoke_lyrics_processor-0.4.0.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
karaoke_lyrics_processor/__init__.py,sha256=rLRkJQi61qkRiNXdlTleE3ahJ1oBKcghYVkz64x7IIg,62
|
2
|
-
karaoke_lyrics_processor/cli.py,sha256=bdtseRI2jcChb1bMr92pc5mpSWpHXh4TSzA2tknbyjU,2522
|
3
|
-
karaoke_lyrics_processor/karaoke_lyrics_processor.py,sha256=LmsciDtBS1-apCbvya2RBmYCSH4S-svrIDpOb8Ut0Gw,10387
|
4
|
-
karaoke_lyrics_processor-0.3.0.dist-info/LICENSE,sha256=BiPihPDxhxIPEx6yAxVfAljD5Bhm_XG2teCbPEj_m0Y,1069
|
5
|
-
karaoke_lyrics_processor-0.3.0.dist-info/METADATA,sha256=JuGcHlIyUvoesSQbFxN1iu-JP-B4mmGQIIrgIVv72pE,4264
|
6
|
-
karaoke_lyrics_processor-0.3.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
7
|
-
karaoke_lyrics_processor-0.3.0.dist-info/entry_points.txt,sha256=hjFp6CUxl1p-1WJYfB6TbNcI_DHEnVzX3BXAs4y_0O8,78
|
8
|
-
karaoke_lyrics_processor-0.3.0.dist-info/RECORD,,
|
{karaoke_lyrics_processor-0.3.0.dist-info → karaoke_lyrics_processor-0.4.0.dist-info}/LICENSE
RENAMED
File without changes
|
File without changes
|
File without changes
|