karaoke-lyrics-processor 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- karaoke_lyrics_processor/cli.py +2 -1
- karaoke_lyrics_processor/karaoke_lyrics_processor.py +98 -14
- {karaoke_lyrics_processor-0.2.0.dist-info → karaoke_lyrics_processor-0.3.1.dist-info}/METADATA +4 -1
- karaoke_lyrics_processor-0.3.1.dist-info/RECORD +8 -0
- {karaoke_lyrics_processor-0.2.0.dist-info → karaoke_lyrics_processor-0.3.1.dist-info}/WHEEL +1 -1
- karaoke_lyrics_processor-0.2.0.dist-info/RECORD +0 -8
- {karaoke_lyrics_processor-0.2.0.dist-info → karaoke_lyrics_processor-0.3.1.dist-info}/LICENSE +0 -0
- {karaoke_lyrics_processor-0.2.0.dist-info → karaoke_lyrics_processor-0.3.1.dist-info}/entry_points.txt +0 -0
karaoke_lyrics_processor/cli.py
CHANGED
@@ -61,7 +61,8 @@ def main():
|
|
61
61
|
processor.process()
|
62
62
|
processor.write_to_output_file()
|
63
63
|
|
64
|
-
|
64
|
+
output_file = processor.output_filename
|
65
|
+
logger.info(f"Lyrics processing complete, lyrics written to output file: {output_file}")
|
65
66
|
|
66
67
|
|
67
68
|
if __name__ == "__main__":
|
karaoke_lyrics_processor/karaoke_lyrics_processor.py
CHANGED
@@ -2,6 +2,10 @@ import re
|
|
2
2
|
import logging
|
3
3
|
import pyperclip
|
4
4
|
import unicodedata
|
5
|
+
import docx2txt
|
6
|
+
from striprtf.striprtf import rtf_to_text
|
7
|
+
import os
|
8
|
+
import codecs
|
5
9
|
|
6
10
|
|
7
11
|
class KaraokeLyricsProcessor:
|
@@ -36,13 +40,56 @@ class KaraokeLyricsProcessor:
|
|
36
40
|
if input_lyrics_text is not None and input_filename is None:
|
37
41
|
self.input_lyrics_lines = input_lyrics_text.splitlines()
|
38
42
|
elif input_filename is not None and input_lyrics_text is None:
|
39
|
-
self.input_lyrics_lines = self.
|
43
|
+
self.input_lyrics_lines = self.read_input_file()
|
40
44
|
else:
|
41
45
|
raise ValueError("Either input_lyrics or input_filename must be set, but not both.")
|
42
46
|
|
43
|
-
def
|
44
|
-
|
45
|
-
|
47
|
+
def read_input_file(self):
|
48
|
+
file_extension = os.path.splitext(self.input_filename)[1].lower()
|
49
|
+
|
50
|
+
self.logger.debug(f"Reading input file: {self.input_filename}")
|
51
|
+
|
52
|
+
if file_extension == ".txt":
|
53
|
+
return self.read_txt_file()
|
54
|
+
elif file_extension in [".docx", ".doc"]:
|
55
|
+
return self.read_doc_file()
|
56
|
+
elif file_extension == ".rtf":
|
57
|
+
return self.read_rtf_file()
|
58
|
+
else:
|
59
|
+
raise ValueError(f"Unsupported file format: {file_extension}")
|
60
|
+
|
61
|
+
def read_txt_file(self):
|
62
|
+
with codecs.open(self.input_filename, "r", encoding="utf-8") as infile:
|
63
|
+
content = infile.read()
|
64
|
+
self.logger.debug(f"Raw content read from file: {repr(content)}")
|
65
|
+
lines = content.splitlines()
|
66
|
+
self.logger.debug(f"Number of lines read: {len(lines)}")
|
67
|
+
for i, line in enumerate(lines):
|
68
|
+
self.logger.debug(f"Line {i}: {repr(line)}")
|
69
|
+
return self.clean_text(content).splitlines()
|
70
|
+
|
71
|
+
def read_doc_file(self):
|
72
|
+
text = docx2txt.process(self.input_filename)
|
73
|
+
return self.clean_text(text).splitlines()
|
74
|
+
|
75
|
+
def read_rtf_file(self):
|
76
|
+
with open(self.input_filename, "r", encoding="utf-8") as file:
|
77
|
+
rtf_text = file.read()
|
78
|
+
plain_text = rtf_to_text(rtf_text)
|
79
|
+
return self.clean_text(plain_text).splitlines()
|
80
|
+
|
81
|
+
def clean_text(self, text):
|
82
|
+
self.logger.debug(f"Cleaning text: {repr(text)}")
|
83
|
+
# Remove any non-printable characters except newlines and U+2005 (four-per-em space)
|
84
|
+
cleaned = "".join(char for char in text if char.isprintable() or char in ["\n", "\u2005"])
|
85
|
+
self.logger.debug(f"Text after removing non-printable characters: {repr(cleaned)}")
|
86
|
+
# Replace multiple newlines with a single newline
|
87
|
+
cleaned = re.sub(r"\n{2,}", "\n", cleaned)
|
88
|
+
self.logger.debug(f"Text after replacing multiple newlines: {repr(cleaned)}")
|
89
|
+
# Remove leading/trailing whitespace from each line
|
90
|
+
cleaned = "\n".join(line.strip() for line in cleaned.splitlines())
|
91
|
+
self.logger.debug(f"Final cleaned text: {repr(cleaned)}")
|
92
|
+
return cleaned
|
46
93
|
|
47
94
|
def find_best_split_point(self, line):
|
48
95
|
"""
|
@@ -82,25 +129,56 @@ class KaraokeLyricsProcessor:
|
|
82
129
|
self.logger.debug(f"Splitting at middle word index: {mid_word_index}")
|
83
130
|
return split_at_middle
|
84
131
|
|
85
|
-
# If the line is still too long,
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
132
|
+
# If the line is still too long, find the last space before max_line_length
|
133
|
+
if len(line) > self.max_line_length:
|
134
|
+
last_space = line.rfind(" ", 0, self.max_line_length)
|
135
|
+
if last_space != -1:
|
136
|
+
self.logger.debug(f"Splitting at last space before max_line_length: {last_space}")
|
137
|
+
return last_space
|
138
|
+
else:
|
139
|
+
# If no space is found, split at max_line_length
|
140
|
+
self.logger.debug(f"No space found, forcibly splitting at max_line_length: {self.max_line_length}")
|
141
|
+
return self.max_line_length
|
142
|
+
|
143
|
+
# If the line is shorter than max_line_length, return its length
|
144
|
+
return len(line)
|
90
145
|
|
91
146
|
def replace_non_printable_spaces(self, text):
|
92
147
|
"""
|
93
148
|
Replace non-printable space-like characters, tabs, and other whitespace with regular spaces,
|
94
149
|
excluding newline characters.
|
95
150
|
"""
|
96
|
-
self.logger.debug(f"Replacing non-printable spaces in: {text}")
|
151
|
+
self.logger.debug(f"Replacing non-printable spaces in: {repr(text)}")
|
152
|
+
|
153
|
+
# Log each character and its Unicode code point
|
154
|
+
# for i, char in enumerate(text):
|
155
|
+
# self.logger.debug(f"Character at position {i}: {repr(char)} (Unicode: U+{ord(char):04X})")
|
156
|
+
|
97
157
|
# Define a pattern for space-like characters, including tabs and other whitespace, but excluding newlines
|
98
158
|
space_pattern = r"[^\S\n\r]|\u00A0|\u1680|\u2000-\u200A|\u202F|\u205F|\u3000"
|
159
|
+
|
99
160
|
# Replace matched characters with a regular space
|
100
161
|
cleaned_text = re.sub(space_pattern, " ", text)
|
162
|
+
|
163
|
+
# Log the result of the replacement
|
164
|
+
self.logger.debug(f"Text after replacing non-printable spaces: {repr(cleaned_text)}")
|
165
|
+
|
101
166
|
# Remove leading/trailing spaces and collapse multiple spaces into one, preserving newlines
|
102
|
-
|
103
|
-
|
167
|
+
final_text = re.sub(r" +", " ", cleaned_text).strip()
|
168
|
+
|
169
|
+
# Log the final result
|
170
|
+
self.logger.debug(f"Final text after cleaning: {repr(final_text)}")
|
171
|
+
|
172
|
+
return final_text
|
173
|
+
|
174
|
+
def clean_punctuation_spacing(self, text):
|
175
|
+
"""
|
176
|
+
Remove unnecessary spaces before punctuation marks.
|
177
|
+
"""
|
178
|
+
self.logger.debug(f"Cleaning punctuation spacing in: {text}")
|
179
|
+
# Remove space before comma, period, exclamation mark, question mark, colon, and semicolon
|
180
|
+
cleaned_text = re.sub(r"\s+([,\.!?:;])", r"\1", text)
|
181
|
+
self.logger.debug(f"Text after cleaning punctuation spacing: {cleaned_text}")
|
104
182
|
return cleaned_text
|
105
183
|
|
106
184
|
def process_line(self, line):
|
@@ -110,6 +188,8 @@ class KaraokeLyricsProcessor:
|
|
110
188
|
"""
|
111
189
|
# Replace non-printable spaces at the beginning
|
112
190
|
line = self.replace_non_printable_spaces(line)
|
191
|
+
# Clean up punctuation spacing
|
192
|
+
line = self.clean_punctuation_spacing(line)
|
113
193
|
|
114
194
|
processed_lines = []
|
115
195
|
iteration_count = 0
|
@@ -172,8 +252,9 @@ class KaraokeLyricsProcessor:
|
|
172
252
|
|
173
253
|
processed_lyrics_text = "\n".join(lyrics_lines)
|
174
254
|
|
175
|
-
# Final pass to replace any remaining non-printable spaces
|
255
|
+
# Final pass to replace any remaining non-printable spaces and clean punctuation
|
176
256
|
processed_lyrics_text = self.replace_non_printable_spaces(processed_lyrics_text)
|
257
|
+
processed_lyrics_text = self.clean_punctuation_spacing(processed_lyrics_text)
|
177
258
|
|
178
259
|
self.processed_lyrics_text = processed_lyrics_text
|
179
260
|
pyperclip.copy(processed_lyrics_text)
|
@@ -183,8 +264,11 @@ class KaraokeLyricsProcessor:
|
|
183
264
|
return processed_lyrics_text
|
184
265
|
|
185
266
|
def write_to_output_file(self):
|
267
|
+
# Ensure the output filename has a .txt extension
|
268
|
+
base, _ = os.path.splitext(self.output_filename)
|
269
|
+
self.output_filename = f"{base}.txt"
|
186
270
|
|
187
|
-
with open(self.output_filename, "w") as outfile:
|
271
|
+
with open(self.output_filename, "w", encoding="utf-8") as outfile:
|
188
272
|
outfile.write(self.processed_lyrics_text)
|
189
273
|
|
190
274
|
self.logger.info(f"Processed lyrics written to output file {self.output_filename}")
|
{karaoke_lyrics_processor-0.2.0.dist-info → karaoke_lyrics_processor-0.3.1.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: karaoke-lyrics-processor
|
3
|
-
Version: 0.2.0
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: Process song lyrics to prepare them for karaoke video production, e.g. by splitting long lines
|
5
5
|
Home-page: https://github.com/karaokenerds/karaoke-lyrics-processor
|
6
6
|
License: MIT
|
@@ -13,7 +13,10 @@ Classifier: Programming Language :: Python :: 3.9
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.10
|
14
14
|
Classifier: Programming Language :: Python :: 3.11
|
15
15
|
Classifier: Programming Language :: Python :: 3.12
|
16
|
+
Requires-Dist: docx2txt (>=0.8)
|
16
17
|
Requires-Dist: pyperclip (>=1.8)
|
18
|
+
Requires-Dist: python-docx (>=1)
|
19
|
+
Requires-Dist: striprtf (>=0.0.27)
|
17
20
|
Project-URL: Documentation, https://github.com/karaokenerds/karaoke-lyrics-processor/blob/main/README.md
|
18
21
|
Project-URL: Repository, https://github.com/karaokenerds/karaoke-lyrics-processor
|
19
22
|
Description-Content-Type: text/markdown
|
karaoke_lyrics_processor-0.3.1.dist-info/RECORD
@@ -0,0 +1,8 @@
|
|
1
|
+
karaoke_lyrics_processor/__init__.py,sha256=rLRkJQi61qkRiNXdlTleE3ahJ1oBKcghYVkz64x7IIg,62
|
2
|
+
karaoke_lyrics_processor/cli.py,sha256=bdtseRI2jcChb1bMr92pc5mpSWpHXh4TSzA2tknbyjU,2522
|
3
|
+
karaoke_lyrics_processor/karaoke_lyrics_processor.py,sha256=qMITsW0SbYCaacvEi9WzuZNeBMJhl2wlMck4WktzTEY,11513
|
4
|
+
karaoke_lyrics_processor-0.3.1.dist-info/LICENSE,sha256=BiPihPDxhxIPEx6yAxVfAljD5Bhm_XG2teCbPEj_m0Y,1069
|
5
|
+
karaoke_lyrics_processor-0.3.1.dist-info/METADATA,sha256=rCLkfq2I2QkdQlEGI0kiecLb_v5HdcBzuKT0VaNdjJw,4264
|
6
|
+
karaoke_lyrics_processor-0.3.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
7
|
+
karaoke_lyrics_processor-0.3.1.dist-info/entry_points.txt,sha256=hjFp6CUxl1p-1WJYfB6TbNcI_DHEnVzX3BXAs4y_0O8,78
|
8
|
+
karaoke_lyrics_processor-0.3.1.dist-info/RECORD,,
|
karaoke_lyrics_processor-0.2.0.dist-info/RECORD
@@ -1,8 +0,0 @@
|
|
1
|
-
karaoke_lyrics_processor/__init__.py,sha256=rLRkJQi61qkRiNXdlTleE3ahJ1oBKcghYVkz64x7IIg,62
|
2
|
-
karaoke_lyrics_processor/cli.py,sha256=84utSfU-AZZU3okHS8tBFSucJ9-59hXJugfMn48oAKQ,2482
|
3
|
-
karaoke_lyrics_processor/karaoke_lyrics_processor.py,sha256=3UOEHFU61aZveDMbPDYcIhAwGc6qKeHRLhwHdY9akLM,7813
|
4
|
-
karaoke_lyrics_processor-0.2.0.dist-info/LICENSE,sha256=BiPihPDxhxIPEx6yAxVfAljD5Bhm_XG2teCbPEj_m0Y,1069
|
5
|
-
karaoke_lyrics_processor-0.2.0.dist-info/METADATA,sha256=6axf30tfSxbLylh6YF_kfVF2B6CytMoVPKYq2qhB8Tc,4164
|
6
|
-
karaoke_lyrics_processor-0.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
7
|
-
karaoke_lyrics_processor-0.2.0.dist-info/entry_points.txt,sha256=hjFp6CUxl1p-1WJYfB6TbNcI_DHEnVzX3BXAs4y_0O8,78
|
8
|
-
karaoke_lyrics_processor-0.2.0.dist-info/RECORD,,
|
{karaoke_lyrics_processor-0.2.0.dist-info → karaoke_lyrics_processor-0.3.1.dist-info}/LICENSE
RENAMED
File without changes
|
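{karaoke_lyrics_processor-0.2.0.dist-info → karaoke_lyrics_processor-0.3.1.dist-info}/entry_points.txt
RENAMED
File without changes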
|