karaoke-lyrics-processor 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -61,7 +61,8 @@ def main():
61
61
  processor.process()
62
62
  processor.write_to_output_file()
63
63
 
64
- logger.info(f"Lyrics processing complete, lyrics written to output file: {output_filename}")
64
+ output_file = processor.output_filename
65
+ logger.info(f"Lyrics processing complete, lyrics written to output file: {output_file}")
65
66
 
66
67
 
67
68
  if __name__ == "__main__":
@@ -2,6 +2,10 @@ import re
2
2
  import logging
3
3
  import pyperclip
4
4
  import unicodedata
5
+ import docx2txt
6
+ from striprtf.striprtf import rtf_to_text
7
+ import os
8
+ import codecs
5
9
 
6
10
 
7
11
  class KaraokeLyricsProcessor:
@@ -36,13 +40,56 @@ class KaraokeLyricsProcessor:
36
40
  if input_lyrics_text is not None and input_filename is None:
37
41
  self.input_lyrics_lines = input_lyrics_text.splitlines()
38
42
  elif input_filename is not None and input_lyrics_text is None:
39
- self.input_lyrics_lines = self.read_input_lyrics_file()
43
+ self.input_lyrics_lines = self.read_input_file()
40
44
  else:
41
45
  raise ValueError("Either input_lyrics or input_filename must be set, but not both.")
42
46
 
43
- def read_input_lyrics_file(self):
44
- with open(self.input_filename, "r") as infile:
45
- return infile.readlines()
47
+ def read_input_file(self):
48
+ file_extension = os.path.splitext(self.input_filename)[1].lower()
49
+
50
+ self.logger.debug(f"Reading input file: {self.input_filename}")
51
+
52
+ if file_extension == ".txt":
53
+ return self.read_txt_file()
54
+ elif file_extension in [".docx", ".doc"]:
55
+ return self.read_doc_file()
56
+ elif file_extension == ".rtf":
57
+ return self.read_rtf_file()
58
+ else:
59
+ raise ValueError(f"Unsupported file format: {file_extension}")
60
+
61
+ def read_txt_file(self):
62
+ with codecs.open(self.input_filename, "r", encoding="utf-8") as infile:
63
+ content = infile.read()
64
+ self.logger.debug(f"Raw content read from file: {repr(content)}")
65
+ lines = content.splitlines()
66
+ self.logger.debug(f"Number of lines read: {len(lines)}")
67
+ for i, line in enumerate(lines):
68
+ self.logger.debug(f"Line {i}: {repr(line)}")
69
+ return self.clean_text(content).splitlines()
70
+
71
+ def read_doc_file(self):
72
+ text = docx2txt.process(self.input_filename)
73
+ return self.clean_text(text).splitlines()
74
+
75
+ def read_rtf_file(self):
76
+ with open(self.input_filename, "r", encoding="utf-8") as file:
77
+ rtf_text = file.read()
78
+ plain_text = rtf_to_text(rtf_text)
79
+ return self.clean_text(plain_text).splitlines()
80
+
81
+ def clean_text(self, text):
82
+ self.logger.debug(f"Cleaning text: {repr(text)}")
83
+ # Remove any non-printable characters except newlines and U+2005 (four-per-em space)
84
+ cleaned = "".join(char for char in text if char.isprintable() or char in ["\n", "\u2005"])
85
+ self.logger.debug(f"Text after removing non-printable characters: {repr(cleaned)}")
86
+ # Replace multiple newlines with a single newline
87
+ cleaned = re.sub(r"\n{2,}", "\n", cleaned)
88
+ self.logger.debug(f"Text after replacing multiple newlines: {repr(cleaned)}")
89
+ # Remove leading/trailing whitespace from each line
90
+ cleaned = "\n".join(line.strip() for line in cleaned.splitlines())
91
+ self.logger.debug(f"Final cleaned text: {repr(cleaned)}")
92
+ return cleaned
46
93
 
47
94
  def find_best_split_point(self, line):
48
95
  """
@@ -82,25 +129,56 @@ class KaraokeLyricsProcessor:
82
129
  self.logger.debug(f"Splitting at middle word index: {mid_word_index}")
83
130
  return split_at_middle
84
131
 
85
- # If the line is still too long, forcibly split at the maximum length
86
- forced_split_point = self.max_line_length
87
- if len(line) > forced_split_point:
88
- self.logger.debug(f"Line is still too long, forcibly splitting at position {forced_split_point}")
89
- return forced_split_point
132
+ # If the line is still too long, find the last space before max_line_length
133
+ if len(line) > self.max_line_length:
134
+ last_space = line.rfind(" ", 0, self.max_line_length)
135
+ if last_space != -1:
136
+ self.logger.debug(f"Splitting at last space before max_line_length: {last_space}")
137
+ return last_space
138
+ else:
139
+ # If no space is found, split at max_line_length
140
+ self.logger.debug(f"No space found, forcibly splitting at max_line_length: {self.max_line_length}")
141
+ return self.max_line_length
142
+
143
+ # If the line is shorter than max_line_length, return its length
144
+ return len(line)
90
145
 
91
146
  def replace_non_printable_spaces(self, text):
92
147
  """
93
148
  Replace non-printable space-like characters, tabs, and other whitespace with regular spaces,
94
149
  excluding newline characters.
95
150
  """
96
- self.logger.debug(f"Replacing non-printable spaces in: {text}")
151
+ self.logger.debug(f"Replacing non-printable spaces in: {repr(text)}")
152
+
153
+ # Log each character and its Unicode code point
154
+ # for i, char in enumerate(text):
155
+ # self.logger.debug(f"Character at position {i}: {repr(char)} (Unicode: U+{ord(char):04X})")
156
+
97
157
  # Define a pattern for space-like characters, including tabs and other whitespace, but excluding newlines
98
158
  space_pattern = r"[^\S\n\r]|\u00A0|\u1680|\u2000-\u200A|\u202F|\u205F|\u3000"
159
+
99
160
  # Replace matched characters with a regular space
100
161
  cleaned_text = re.sub(space_pattern, " ", text)
162
+
163
+ # Log the result of the replacement
164
+ self.logger.debug(f"Text after replacing non-printable spaces: {repr(cleaned_text)}")
165
+
101
166
  # Remove leading/trailing spaces and collapse multiple spaces into one, preserving newlines
102
- cleaned_text = re.sub(r" +", " ", cleaned_text).strip()
103
- self.logger.debug(f"Text after replacing non-printable spaces: {cleaned_text}")
167
+ final_text = re.sub(r" +", " ", cleaned_text).strip()
168
+
169
+ # Log the final result
170
+ self.logger.debug(f"Final text after cleaning: {repr(final_text)}")
171
+
172
+ return final_text
173
+
174
+ def clean_punctuation_spacing(self, text):
175
+ """
176
+ Remove unnecessary spaces before punctuation marks.
177
+ """
178
+ self.logger.debug(f"Cleaning punctuation spacing in: {text}")
179
+ # Remove space before comma, period, exclamation mark, question mark, colon, and semicolon
180
+ cleaned_text = re.sub(r"\s+([,\.!?:;])", r"\1", text)
181
+ self.logger.debug(f"Text after cleaning punctuation spacing: {cleaned_text}")
104
182
  return cleaned_text
105
183
 
106
184
  def process_line(self, line):
@@ -110,6 +188,8 @@ class KaraokeLyricsProcessor:
110
188
  """
111
189
  # Replace non-printable spaces at the beginning
112
190
  line = self.replace_non_printable_spaces(line)
191
+ # Clean up punctuation spacing
192
+ line = self.clean_punctuation_spacing(line)
113
193
 
114
194
  processed_lines = []
115
195
  iteration_count = 0
@@ -172,8 +252,9 @@ class KaraokeLyricsProcessor:
172
252
 
173
253
  processed_lyrics_text = "\n".join(lyrics_lines)
174
254
 
175
- # Final pass to replace any remaining non-printable spaces
255
+ # Final pass to replace any remaining non-printable spaces and clean punctuation
176
256
  processed_lyrics_text = self.replace_non_printable_spaces(processed_lyrics_text)
257
+ processed_lyrics_text = self.clean_punctuation_spacing(processed_lyrics_text)
177
258
 
178
259
  self.processed_lyrics_text = processed_lyrics_text
179
260
  pyperclip.copy(processed_lyrics_text)
@@ -183,8 +264,11 @@ class KaraokeLyricsProcessor:
183
264
  return processed_lyrics_text
184
265
 
185
266
  def write_to_output_file(self):
267
+ # Ensure the output filename has a .txt extension
268
+ base, _ = os.path.splitext(self.output_filename)
269
+ self.output_filename = f"{base}.txt"
186
270
 
187
- with open(self.output_filename, "w") as outfile:
271
+ with open(self.output_filename, "w", encoding="utf-8") as outfile:
188
272
  outfile.write(self.processed_lyrics_text)
189
273
 
190
274
  self.logger.info(f"Processed lyrics written to output file {self.output_filename}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: karaoke-lyrics-processor
3
- Version: 0.2.0
3
+ Version: 0.3.1
4
4
  Summary: Process song lyrics to prepare them for karaoke video production, e.g. by splitting long lines
5
5
  Home-page: https://github.com/karaokenerds/karaoke-lyrics-processor
6
6
  License: MIT
@@ -13,7 +13,10 @@ Classifier: Programming Language :: Python :: 3.9
13
13
  Classifier: Programming Language :: Python :: 3.10
14
14
  Classifier: Programming Language :: Python :: 3.11
15
15
  Classifier: Programming Language :: Python :: 3.12
16
+ Requires-Dist: docx2txt (>=0.8)
16
17
  Requires-Dist: pyperclip (>=1.8)
18
+ Requires-Dist: python-docx (>=1)
19
+ Requires-Dist: striprtf (>=0.0.27)
17
20
  Project-URL: Documentation, https://github.com/karaokenerds/karaoke-lyrics-processor/blob/main/README.md
18
21
  Project-URL: Repository, https://github.com/karaokenerds/karaoke-lyrics-processor
19
22
  Description-Content-Type: text/markdown
@@ -0,0 +1,8 @@
1
+ karaoke_lyrics_processor/__init__.py,sha256=rLRkJQi61qkRiNXdlTleE3ahJ1oBKcghYVkz64x7IIg,62
2
+ karaoke_lyrics_processor/cli.py,sha256=bdtseRI2jcChb1bMr92pc5mpSWpHXh4TSzA2tknbyjU,2522
3
+ karaoke_lyrics_processor/karaoke_lyrics_processor.py,sha256=qMITsW0SbYCaacvEi9WzuZNeBMJhl2wlMck4WktzTEY,11513
4
+ karaoke_lyrics_processor-0.3.1.dist-info/LICENSE,sha256=BiPihPDxhxIPEx6yAxVfAljD5Bhm_XG2teCbPEj_m0Y,1069
5
+ karaoke_lyrics_processor-0.3.1.dist-info/METADATA,sha256=rCLkfq2I2QkdQlEGI0kiecLb_v5HdcBzuKT0VaNdjJw,4264
6
+ karaoke_lyrics_processor-0.3.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
7
+ karaoke_lyrics_processor-0.3.1.dist-info/entry_points.txt,sha256=hjFp6CUxl1p-1WJYfB6TbNcI_DHEnVzX3BXAs4y_0O8,78
8
+ karaoke_lyrics_processor-0.3.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.0
2
+ Generator: poetry-core 1.9.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,8 +0,0 @@
1
- karaoke_lyrics_processor/__init__.py,sha256=rLRkJQi61qkRiNXdlTleE3ahJ1oBKcghYVkz64x7IIg,62
2
- karaoke_lyrics_processor/cli.py,sha256=84utSfU-AZZU3okHS8tBFSucJ9-59hXJugfMn48oAKQ,2482
3
- karaoke_lyrics_processor/karaoke_lyrics_processor.py,sha256=3UOEHFU61aZveDMbPDYcIhAwGc6qKeHRLhwHdY9akLM,7813
4
- karaoke_lyrics_processor-0.2.0.dist-info/LICENSE,sha256=BiPihPDxhxIPEx6yAxVfAljD5Bhm_XG2teCbPEj_m0Y,1069
5
- karaoke_lyrics_processor-0.2.0.dist-info/METADATA,sha256=6axf30tfSxbLylh6YF_kfVF2B6CytMoVPKYq2qhB8Tc,4164
6
- karaoke_lyrics_processor-0.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
7
- karaoke_lyrics_processor-0.2.0.dist-info/entry_points.txt,sha256=hjFp6CUxl1p-1WJYfB6TbNcI_DHEnVzX3BXAs4y_0O8,78
8
- karaoke_lyrics_processor-0.2.0.dist-info/RECORD,,