py2ls 0.1.10.1__py3-none-any.whl → 0.1.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/translator.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import re
2
2
  import json
3
- import docx # pip install python-docx
3
+ import docx # pip install python-docx
4
4
  from PyPDF2 import PdfReader
5
5
  from langdetect import detect
6
6
  from googletrans import Translator as GoogleTranslator
@@ -14,70 +14,79 @@ import numpy as np
14
14
  from nltk.tokenize import sent_tokenize
15
15
  from itertools import pairwise
16
16
  from tqdm import tqdm
17
- from fuzzywuzzy import fuzz,process
17
+ from fuzzywuzzy import fuzz, process
18
18
 
19
19
 
def split_by_sent_n(text, n=10):
    """Split ``text`` into consecutive groups of at most ``n`` sentences.

    Args:
        text (str): Text to tokenize with ``sent_tokenize``.
        n (int): Maximum number of sentences per group (default 10).

    Returns:
        list[list[str]]: Sentence groups in document order. The last group
        may hold fewer than ``n`` sentences. An empty list is returned when
        the tokenizer finds no sentences.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")
    sentences = sent_tokenize(text)
    # Plain slicing replaces the previous np.arange/pairwise bookkeeping,
    # which indexed an empty array (IndexError) when there were no sentences.
    return [sentences[start : start + n] for start in range(0, len(sentences), n)]
def account_letters(text, n=10):
    """Count the characters in the first ``n``-sentence group of ``text``.

    Args:
        text (str): Text that is split with ``split_by_sent_n``.
        n (int): Number of sentences per group (default 10).

    Returns:
        int: Sum of ``len(sentence)`` over the sentences of the first group,
        or 0 when ``split_by_sent_n`` returns no groups.
    """
    groups = split_by_sent_n(text, n)
    if not groups:
        # Nothing to measure; avoids indexing an empty list.
        return 0
    return sum(len(sentence) for sentence in groups[0])
def auto_chunk_size(txt, verbose=False):
    """Pick how many sentences to put in one translation chunk.

    Tries 1 to 49 sentences per chunk and returns the first count whose
    leading chunk (measured by ``account_letters``) is strictly between
    4000 and 4700 characters.

    Args:
        txt (str): Text that will later be split into chunks.
        verbose (bool): Print the selected chunk size.

    Returns:
        int: Sentences per chunk. When no count lands inside the window, the
        largest tried count whose leading chunk stays at or below 4000
        characters is returned (at least 1) instead of raising IndexError.
    """
    fallback = 1
    for i in range(1, 50):
        letters = account_letters(txt, n=i)
        if 4000 < letters < 4700:
            if verbose:
                print(f"the optimal chunk_size is {i} sentences")
            return i
        if letters >= 4700:
            # The leading chunk only grows as i grows, so no later count
            # can fall back inside the window.
            break
        fallback = i
    if verbose:
        print(f"no chunk size fits the 4000-4700 window, using {fallback} sentences")
    return fallback
50
+
51
+
46
52
  # import pathlib
47
53
  # import argostranslate.package
48
54
  # import argostranslate.translate
def get_lang_code_iso639():
    """Scrape the Wikipedia ISO 639 table into a ``{name: code}`` dict.

    Fetches the ``tr`` rows of the language-code page with
    ``JFL.netfinder.fetch`` and keeps every row that is longer than the row
    after it, provided the next two rows share their first two characters.

    Returns:
        dict: Maps each kept row (treated as the full language name) to the
        row that follows it (treated as its code).
    """
    from JFL import netfinder

    url = "https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes"
    rows = netfinder.fetch(url, where="tr", extend=0)

    lang_code_iso639 = {}
    # Scanning starts at row 6 and stops two rows before the end so the
    # two-row look-ahead always stays in range.
    for idx in range(6, len(rows) - 2):
        name = rows[idx]
        code = rows[idx + 1]
        if len(name) > len(code) and code[:2] == rows[idx + 2][:2]:
            lang_code_iso639[name] = code
    return lang_code_iso639
61
68
 
def detect_lang(text, output="lang", verbose=False):
    """Detect the language of ``text`` and return its name or its code.

    The lookup table is read from ``data/lang_code_iso639.json`` next to this
    module; its keys are used as language names and its values as codes.

    Args:
        text (str): Text to analyse. It must pass ``is_text``.
        output (str): Any value containing the letter "c" (e.g. "code")
            returns the code; anything else returns the language name.
        verbose (bool): Forwarded to ``strcmp``.

    Returns:
        str: The matched language name or code, or "no" when ``text`` is not
        supported or detection fails.
    """
    table_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "data", "lang_code_iso639.json"
    )
    with open(table_path, "r", encoding="utf-8") as file:
        lang_code_iso639 = json.load(file)
    l_lang = list(lang_code_iso639.keys())
    l_code = list(lang_code_iso639.values())
    try:
        if not is_text(text):
            print(f"{text} is not supported")
            return "no"
        code_detect = detect(text)
        want_code = "c" in output.lower()
        # Fuzzy-match the detected code against the known codes; the index of
        # the best match selects the paired entry in either list.
        match_idx = strcmp(code_detect, l_code, verbose=verbose)[1]
        return l_code[match_idx] if want_code else l_lang[match_idx]
    except Exception:
        # Best-effort detection: any failure is reported as "no". Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return "no"
89
+
81
90
 
82
91
  def is_text(s):
83
92
  has_alpha = any(char.isalpha() for char in s)
@@ -85,7 +94,8 @@ def is_text(s):
85
94
  # no_special = not re.search(r'[^A-Za-z0-9\s]', s)
86
95
  return has_alpha and has_non_alpha
87
96
 
88
- def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer='WR'):
97
+
98
+ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
89
99
  """
90
100
  Compares a search term with a list of candidate strings and finds the best match based on similarity score.
91
101
 
@@ -98,21 +108,23 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer='WR'
98
108
  Returns:
99
109
  tuple: A tuple containing the best match and its index in the candidates list.
100
110
  """
111
+
101
112
  def to_lower(s, ignore_case=True):
102
- #Converts a string or list of strings to lowercase if ignore_case is True.
113
+ # Converts a string or list of strings to lowercase if ignore_case is True.
103
114
  if ignore_case:
104
115
  if isinstance(s, str):
105
116
  return s.lower()
106
117
  elif isinstance(s, list):
107
118
  return [elem.lower() for elem in s]
108
119
  return s
109
- str1_,str2_ = to_lower(search_term, ignore_case),to_lower(candidates, ignore_case)
120
+
121
+ str1_, str2_ = to_lower(search_term, ignore_case), to_lower(candidates, ignore_case)
110
122
  if isinstance(str2_, list):
111
- if 'part' in scorer.lower():
123
+ if "part" in scorer.lower():
112
124
  similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
113
- elif 'W' in scorer.lower():
125
+ elif "W" in scorer.lower():
114
126
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
115
- elif 'Ratio' in scorer.lower():
127
+ elif "Ratio" in scorer.lower():
116
128
  similarity_scores = [fuzz.Ratio(str1_, word) for word in str2_]
117
129
  else:
118
130
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
@@ -120,11 +132,11 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer='WR'
120
132
  best_match_score = similarity_scores[best_match_index]
121
133
  else:
122
134
  best_match_index = 0
123
- if 'part' in scorer.lower():
135
+ if "part" in scorer.lower():
124
136
  best_match_score = fuzz.partial_ratio(str1_, str2_)
125
- elif 'W' in scorer.lower():
137
+ elif "W" in scorer.lower():
126
138
  best_match_score = fuzz.WRatio(str1_, str2_)
127
- elif 'Ratio' in scorer.lower():
139
+ elif "Ratio" in scorer.lower():
128
140
  best_match_score = fuzz.Ratio(str1_, str2_)
129
141
  else:
130
142
  best_match_score = fuzz.WRatio(str1_, str2_)
@@ -136,12 +148,15 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer='WR'
136
148
 
137
149
 
def methods(idx=0):
    """Return the name of a supported translation method by position.

    Args:
        idx (int): Index into the list of supported methods; 0 (the default)
            selects "GoogleTrans (default)".

    Returns:
        str: The method name stored at ``idx``.
    """
    supported = ["GoogleTrans (default)", "DeepL", "Argos"]
    selected = supported[idx]
    return selected
143
155
 
144
- DEFAULT_SERVICE_URLS = ('translate.google.de','translate.google.fr')
156
+
157
+ DEFAULT_SERVICE_URLS = ("translate.google.de", "translate.google.fr")
158
+
159
+
145
160
  def user_agent():
146
161
  # Example of generating a random user-agent string
147
162
  user_agents = [
@@ -179,24 +194,259 @@ def user_agent():
179
194
  ]
180
195
  agents = random.choice(user_agents)
181
196
  return agents
197
+
198
+
# Language-name -> code tables, built once at import time instead of on
# every call.
_DEEPL_LANGUAGES = {
    "English": "EN",
    "German": "DE",
    "French": "FR",
    "Spanish": "ES",
    "Italian": "IT",
    "Dutch": "NL",
    "Polish": "PL",
    "Russian": "RU",
    "Japanese": "JA",
    "Chinese": "ZH",
}
_GOOGLE_LANGUAGES = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Amharic": "am",
    "Arabic": "ar",
    "Armenian": "hy",
    "Azerbaijani": "az",
    "Basque": "eu",
    "Belarusian": "be",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Bulgarian": "bg",
    "Catalan": "ca",
    "Cebuano": "ceb",
    "Chichewa": "ny",
    "Chinese": "zh-CN",
    "Corsican": "co",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Esperanto": "eo",
    "Estonian": "et",
    "Filipino": "tl",
    "Finnish": "fi",
    "French": "fr",
    "Frisian": "fy",
    "Galician": "gl",
    "Georgian": "ka",
    "German": "de",
    "Greek": "el",
    "Gujarati": "gu",
    "HaitianCreole": "ht",
    "Hausa": "ha",
    "Hawaiian": "haw",
    "Hebrew": "he",
    "Hindi": "hi",
    "Hmong": "hmn",
    "Hungarian": "hu",
    "Icelandic": "is",
    "Igbo": "ig",
    "Indonesian": "id",
    "Irish": "ga",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Kannada": "kn",
    "Kazakh": "kk",
    "Khmer": "km",
    "Kinyarwanda": "rw",
    "Korean": "ko",
    "Kurdish": "ku",
    "Kyrgyz": "ky",
    "Lao": "lo",
    "Latin": "la",
    "Latvian": "lv",
    "Lithuanian": "lt",
    "Luxembourgish": "lb",
    "Macedonian": "mk",
    "Malagasy": "mg",
    "Malay": "ms",
    "Malayalam": "ml",
    "Maltese": "mt",
    "Maori": "mi",
    "Marathi": "mr",
    "Mongolian": "mn",
    "Myanmar": "my",
    "Nepali": "ne",
    "Norwegian": "no",
    "Odia": "or",
    "Oriya": "or",
    "Pashto": "ps",
    "Persian": "fa",
    "Polish": "pl",
    "Portuguese": "pt",
    "Punjabi": "pa",
    "Romanian": "ro",
    "Russian": "ru",
    "Samoan": "sm",
    "ScotsGaelic": "gd",
    "Serbian": "sr",
    "Sesotho": "st",
    "Shona": "sn",
    "Sindhi": "sd",
    "Sinhala": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Spanish": "es",
    "Sundanese": "su",
    "Swahili": "sw",
    "Swedish": "sv",
    "Tajik": "tg",
    "Tamil": "ta",
    "Tatar": "tt",
    "Telugu": "te",
    "Thai": "th",
    "Turkish": "tr",
    "Turkmen": "tk",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uyghur": "ug",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Xhosa": "xh",
    "Yiddish": "yi",
    "Yoruba": "yo",
    "Zulu": "zu",
}
# The Argos table is the Google table except for Chinese ("zh" instead of
# "zh-CN"). Deriving it keeps key order identical and corrects the earlier
# copy, which mapped Esperanto to "es" (the Spanish code) instead of "eo".
_ARGOS_LANGUAGES = {**_GOOGLE_LANGUAGES, "Chinese": "zh"}


def get_language_code(language, translator="google"):
    """Get the language code used by a translation service.

    Args:
        language (str): Language name; it is fuzzy-matched against the
            service's known names with ``strcmp``.
        translator (str): Service selector, matched case-insensitively by
            substring: "deep" selects DeepL, "goo" selects Google, "ar"
            selects Argos (checked in that order).

    Returns:
        str | None: The service-specific code, or None (after printing a
        message) when the translator is not recognised or no name matches.
    """
    service = translator.lower()
    if "deep" in service:
        langs = _DEEPL_LANGUAGES
    elif "goo" in service:
        langs = _GOOGLE_LANGUAGES
    elif "ar" in service:
        langs = _ARGOS_LANGUAGES
    else:
        # Previously ``langs`` was left unbound here and the lookup crashed
        # with UnboundLocalError.
        print(f"fail to find the {language} code in translator {translator}")
        return None

    lang_found = strcmp(language, list(langs))[0]
    if lang_found in langs:
        return langs[lang_found]
    print(f"fail to find the {language} code in translator {translator}")
    return None
449
+
200
450
 
201
451
  # language = "chinese"
202
452
  # # Example usage:
@@ -216,6 +466,8 @@ def load_docx(filename):
216
466
  for paragraph in doc.paragraphs:
217
467
  text.append(paragraph.text)
218
468
  return text
469
+
470
+
219
471
  # # usage
220
472
  # filename = "example.docx" # Change to the path of your .docx file
221
473
  # text = load_docx(filename)
@@ -223,6 +475,7 @@ def load_docx(filename):
223
475
  # print("Text from the document:")
224
476
  # print(text)
225
477
 
478
+
226
479
  def load_pdf(filename, page="all", verbose=False):
227
480
  from PyPDF2 import PdfReader
228
481
  import numpy as np
@@ -289,12 +542,14 @@ def split_text(text, method="sentence", limit=4500):
289
542
  else:
290
543
  return re.split(r"\{}".format(method), text)
291
544
 
545
+
def filter_errors(text):
    """Rewrite substrings that are known to break translation.

    Currently handles one case noted by the original author: ".com" cannot
    be translated but "..com" works, so every ".com" becomes "..com".

    Args:
        text (str): Text about to be sent for translation.

    Returns:
        str: ``text`` with each ".com" replaced by "..com".
    """
    # The replacement used to be "..come", which contradicted the note above
    # and appended a stray "e" to every domain.
    return text.replace(".com", "..com")
297
551
 
552
+
298
553
  def merge_text(input, robust=True):
299
554
  """
300
555
  Convert a list of strings, tuple of strings, or numpy array of strings into a single concatenated string.
@@ -318,6 +573,7 @@ def merge_text(input, robust=True):
318
573
  else:
319
574
  return str(input)
320
575
 
576
+
321
577
  def replace_text(text, dict_replace=None, robust=True):
322
578
  """
323
579
  Replace specified substrings in the input text with provided replacements.
@@ -362,19 +618,21 @@ def replace_text(text, dict_replace=None, robust=True):
362
618
  text = text.replace(k, v)
363
619
  return text
364
620
 
621
+
365
622
  # # usage:
366
623
  # a = "kjkjk (a, b, c)"
367
624
  # replace_text(a, {"(": "", ")": "", " ": " "}, robust=False)
368
625
 
626
+
def merge_strings_every_n(strings_list, n=10):
    """Concatenate every ``n`` consecutive strings into one string.

    Args:
        strings_list (list[str]): Strings to merge, in order.
        n (int): Group size. When it is not greater than 0, the input list
            is returned unchanged.

    Returns:
        tuple: ``(merged_list, n)`` where each element of ``merged_list`` is
        the plain concatenation of up to ``n`` consecutive input strings.
    """
    if not n > 0:
        return strings_list, n
    group_starts = range(0, len(strings_list), n)
    merged_list = ["".join(strings_list[start : start + n]) for start in group_starts]
    return merged_list, n
378
636
 
379
637
 
380
638
  def translate(
@@ -386,7 +644,7 @@ def translate(
386
644
  user_agent=user_agent(),
387
645
  verbose=True,
388
646
  error_verbose=True,
389
- limit=5000
647
+ limit=5000,
390
648
  ):
391
649
  """
392
650
  Translate text to the target language using the specified translation method (Google Translate or DeepL).
@@ -394,19 +652,47 @@ def translate(
394
652
  """
395
653
  # error_verbose = verbose or False
396
654
 
397
- if isinstance(text,list):
398
- text=merge_text(text)
655
+ if isinstance(text, list):
656
+ # text=merge_text(text)
657
+ text_list = [
658
+ translate(
659
+ i,
660
+ lang=lang,
661
+ lang_src=lang_src,
662
+ method=method,
663
+ service_urls=service_urls,
664
+ user_agent=user_agent,
665
+ verbose=verbose,
666
+ error_verbose=error_verbose,
667
+ limit=limit,
668
+ )
669
+ for i in tqdm(
670
+ text,
671
+ desc="is translating",
672
+ colour="green",
673
+ bar_format="{l_bar}{bar} {n_fmt}/{total_fmt}",
674
+ )
675
+ ]
676
+ return text_list
399
677
  text = replace_text(text)
400
678
  if lang_src is None:
401
- lang_src = detect_lang(text)
679
+ lang_src = detect_lang(text)
402
680
  try:
403
681
  if len(text) > limit:
404
- n=auto_chunk_size(text)
405
- text_segments = split_by_sent_n(text,n)
682
+ n = auto_chunk_size(text)
683
+ text_segments = split_by_sent_n(text, n)
406
684
  translations = ""
407
- for segment in tqdm(text_segments,desc='is translating', colour="green"):
685
+ for segment in tqdm(text_segments, desc="is translating", colour="green"):
408
686
  segment = replace_text(merge_text(segment))
409
- translated_segment = translate_segment(text=segment, lang=lang, lang_src=lang_src, method=method, user_agent=user_agent,service_urls=service_urls, verbose=verbose,error_verbose=error_verbose
687
+ translated_segment = translate_segment(
688
+ text=segment,
689
+ lang=lang,
690
+ lang_src=lang_src,
691
+ method=method,
692
+ user_agent=user_agent,
693
+ service_urls=service_urls,
694
+ verbose=verbose,
695
+ error_verbose=error_verbose,
410
696
  )
411
697
  time.sleep(1)
412
698
  if translated_segment:
@@ -416,12 +702,22 @@ def translate(
416
702
  translations += ""
417
703
  return translations
418
704
  else:
419
- return translate_segment(text=text, lang=lang, lang_src=lang_src, method=method, user_agent=user_agent,service_urls=service_urls, verbose=verbose,error_verbose=error_verbose)
705
+ return translate_segment(
706
+ text=text,
707
+ lang=lang,
708
+ lang_src=lang_src,
709
+ method=method,
710
+ user_agent=user_agent,
711
+ service_urls=service_urls,
712
+ verbose=verbose,
713
+ error_verbose=error_verbose,
714
+ )
420
715
  except Exception as e:
421
716
  if error_verbose:
422
717
  print("(translate)Error during translation :", e)
423
718
  return ""
424
719
 
720
+
425
721
  def translate_segment(
426
722
  text,
427
723
  lang="chinese",
@@ -430,39 +726,42 @@ def translate_segment(
430
726
  service_urls=DEFAULT_SERVICE_URLS,
431
727
  user_agent=user_agent(),
432
728
  verbose=False,
433
- error_verbose=True
729
+ error_verbose=True,
434
730
  ):
435
731
  """
436
732
  Translate a text segment to the target language using the specified translation method (Google Translate or DeepL).
437
733
  """
438
-
734
+
439
735
  text_clean = filter_errors(text)
440
736
  text_clean = replace_text(text_clean)
441
737
  if lang_src is None:
442
738
  lang_src = detect_lang(text_clean)
443
739
  try:
444
- lang_src = get_language_code(lang_src, 'google')
445
- lang_tgt = get_language_code(lang, 'google')
740
+ lang_src = get_language_code(lang_src, "google")
741
+ lang_tgt = get_language_code(lang, "google")
446
742
  if "goog" in method.lower():
447
- Trstor = GoogleTranslator(service_urls=service_urls,user_agent=user_agent)
743
+ Trstor = GoogleTranslator(service_urls=service_urls, user_agent=user_agent)
448
744
  txt = Trstor.translate(text_clean, src=lang_src, dest=lang_tgt).text
449
745
  elif "trans" in method.lower():
450
- lang_src = get_language_code(lang_src, 'google')
451
- lang_tgt = get_language_code(lang, 'google')
452
- translator = TranslateTranslator(from_lang=lang_src,
453
- to_lang=lang_tgt,
454
- provider='LibreTranslate',
455
- secret_access_key=None,
456
- base_url='https://translate.astian.org/')
746
+ lang_src = get_language_code(lang_src, "google")
747
+ lang_tgt = get_language_code(lang, "google")
748
+ translator = TranslateTranslator(
749
+ from_lang=lang_src,
750
+ to_lang=lang_tgt,
751
+ provider="LibreTranslate",
752
+ secret_access_key=None,
753
+ base_url="https://translate.astian.org/",
754
+ )
457
755
  txt = translator.translate(text_clean)
458
- elif 'ar' in method.lower():
756
+ elif "ar" in method.lower():
459
757
  lang_src = get_language_code(language=lang_src, translator="argos")
460
758
  lang_tgt = get_language_code(language=lang, translator="argos")
461
759
  argostranslate.package.update_package_index()
462
760
  available_packages = argostranslate.package.get_available_packages()
463
761
  package_to_install = next(
464
762
  filter(
465
- lambda x: x.from_code == lang_src and x.to_code == lang_tgt, available_packages
763
+ lambda x: x.from_code == lang_src and x.to_code == lang_tgt,
764
+ available_packages,
466
765
  )
467
766
  )
468
767
  argostranslate.package.install_from_path(package_to_install.download())
@@ -475,15 +774,19 @@ def translate_segment(
475
774
  print(txt)
476
775
  return txt
477
776
  except Exception as e:
478
- txt=translate_with_retry(
479
- text_clean,
480
- lang=lang,
481
- lang_src=lang_src,
482
- method=method,
483
- verbose=verbose,
484
- error_verbose=error_verbose,
485
- user_agent=user_agent, service_urls=service_urls)
777
+ txt = translate_with_retry(
778
+ text_clean,
779
+ lang=lang,
780
+ lang_src=lang_src,
781
+ method=method,
782
+ verbose=verbose,
783
+ error_verbose=error_verbose,
784
+ user_agent=user_agent,
785
+ service_urls=service_urls,
786
+ )
486
787
  return txt
788
+
789
+
487
790
  def translate_with_retry(
488
791
  text,
489
792
  lang="chinese",
@@ -491,15 +794,20 @@ def translate_with_retry(
491
794
  method=methods(),
492
795
  verbose=False,
493
796
  error_verbose=True,
494
- user_agent=user_agent(), service_urls=DEFAULT_SERVICE_URLS):
797
+ user_agent=user_agent(),
798
+ service_urls=DEFAULT_SERVICE_URLS,
799
+ ):
495
800
  """
496
- Translate a text to the target language, retrying with alternative service URLs on connection errors.
801
+ Translate a text to the target language, retrying with alternative service URLs on connection errors.
497
802
  """
498
- def try_translate(text,lang,lang_src,user_agent,service_url):
803
+
804
+ def try_translate(text, lang, lang_src, user_agent, service_url):
499
805
  try:
500
- translator_ = GoogleTranslator(user_agent=user_agent, service_urls=[service_url])
806
+ translator_ = GoogleTranslator(
807
+ user_agent=user_agent, service_urls=[service_url]
808
+ )
501
809
  result = translator_.translate(text, dest=lang, src=lang_src)
502
- if result and hasattr(result, 'text'):
810
+ if result and hasattr(result, "text"):
503
811
  return result.text
504
812
  else:
505
813
  raise ValueError(f"Invalid response from {service_url}: {result}")
@@ -507,32 +815,52 @@ def translate_with_retry(
507
815
  raise RuntimeError(f"Error using {service_url}: {e}")
508
816
 
509
817
  if lang_src is None:
510
- lang_src = detect_lang(text)
818
+ lang_src = detect_lang(text)
511
819
  lang_src = get_language_code(language=lang_src)
512
820
  lang = get_language_code(language=lang)
513
821
  try:
514
- return try_translate(text,lang=lang,lang_src=lang_src,user_agent=user_agent,service_url=service_urls[0])
822
+ return try_translate(
823
+ text,
824
+ lang=lang,
825
+ lang_src=lang_src,
826
+ user_agent=user_agent,
827
+ service_url=service_urls[0],
828
+ )
515
829
  except Exception as e:
516
830
  if error_verbose:
517
831
  print("Connection error:", e)
518
- try:
832
+ try:
519
833
  time.sleep(1)
520
- return try_translate(text,lang=lang,lang_src=lang_src,user_agent=user_agent,service_url=service_urls[1])
834
+ return try_translate(
835
+ text,
836
+ lang=lang,
837
+ lang_src=lang_src,
838
+ user_agent=user_agent,
839
+ service_url=service_urls[1],
840
+ )
521
841
  except Exception as e:
522
842
  if error_verbose:
523
- print(f"(translate_with_retry):Connection error with {service_urls}: {e}")
843
+ print(
844
+ f"(translate_with_retry):Connection error with {service_urls}: {e}"
845
+ )
524
846
  if error_verbose:
525
847
  print("All service URLs failed. Unable to translate the text.")
526
848
  return text
527
849
 
528
850
 
529
- def trans_docx(filename, lang="english", lang_src=None, method=methods(),service_urls=[
530
- 'translate.google.de'],verbose=False):
851
+ def trans_docx(
852
+ filename,
853
+ lang="english",
854
+ lang_src=None,
855
+ method=methods(),
856
+ service_urls=["translate.google.de"],
857
+ verbose=False,
858
+ ):
531
859
  """
532
- load the docx file and translated it into target lang "lang",
533
- verbose: (default 'False', no display)to display the translated text in for loop
534
- Return (list):
535
- the translated text as a list
860
+ load the docx file and translated it into target lang "lang",
861
+ verbose: (default 'False', no display)to display the translated text in for loop
862
+ Return (list):
863
+ the translated text as a list
536
864
  """
537
865
  txt = load_docx(filename)
538
866
  trans_text = []
@@ -543,7 +871,14 @@ def trans_docx(filename, lang="english", lang_src=None, method=methods(),service
543
871
  pass
544
872
  else:
545
873
  i.join(j)
546
- trans_text_ = translate(i, lang=lang,lang_src=lang_src, method=method,service_urls=service_urls,verbose=verbose)
874
+ trans_text_ = translate(
875
+ i,
876
+ lang=lang,
877
+ lang_src=lang_src,
878
+ method=method,
879
+ service_urls=service_urls,
880
+ verbose=verbose,
881
+ )
547
882
  trans_text.append(trans_text_)
548
883
  # if verbose:
549
884
  # print(trans_text_)
@@ -552,15 +887,30 @@ def trans_docx(filename, lang="english", lang_src=None, method=methods(),service
552
887
  else:
553
888
  return None
554
889
 
def trans_pdf(
    filename,
    page="all",
    lang="english",
    lang_src=None,
    method="google",
    service_urls=None,
    verbose=False,
):
    """Load a PDF file and translate its text into ``lang``.

    Args:
        filename (str): Path of the PDF, passed to ``load_pdf``.
        page: Page selection passed to ``load_pdf`` (default "all").
        lang (str): Target language.
        lang_src (str | None): Source language, passed through to
            ``translate`` unchanged.
        method (str): Translation method passed to ``translate``.
        service_urls (list[str] | None): Service hosts to use; defaults to
            ``["translate.google.de"]`` when None.
        verbose (bool): Forwarded to ``load_pdf`` only; ``translate`` is
            always called with ``verbose=False``.

    Returns:
        Whatever ``translate`` returns for the loaded text.
    """
    if service_urls is None:
        # Build the default per call rather than sharing one mutable list
        # object across every invocation.
        service_urls = ["translate.google.de"]
    txt = load_pdf(filename, page=page, verbose=verbose)
    trans_text = translate(
        txt,
        lang=lang,
        lang_src=lang_src,
        method=method,
        service_urls=service_urls,
        verbose=False,
    )
    return trans_text
565
915
 
566
916
 
@@ -578,7 +928,8 @@ def save_content(fpath, content):
578
928
  with open(fpath, "w") as file:
579
929
  file.write(content)
580
930
 
581
- def save_file(fpath, content, kind=None, font_name="Arial", font_size=10,spacing=6):
931
+
932
+ def save_file(fpath, content, kind=None, font_name="Arial", font_size=10, spacing=6):
582
933
  """
583
934
  Save content into a file with specified file type and formatting.
584
935
 
@@ -592,7 +943,7 @@ def save_file(fpath, content, kind=None, font_name="Arial", font_size=10,spacing
592
943
  Returns:
593
944
  None
594
945
  """
595
- file_types = [".docx", ".txt", ".md", ".html", ".pdf"]
946
+ file_types = [".docx", ".txt", ".md", ".html", ".pdf"]
596
947
  if kind is None:
597
948
  # Extract the file extension from fpath
598
949
  _, kind = os.path.splitext(fpath)
@@ -601,8 +952,8 @@ def save_file(fpath, content, kind=None, font_name="Arial", font_size=10,spacing
601
952
  if kind.lower() not in file_types:
602
953
  raise ValueError(f"Error:\n{kind} is not in the supported list {file_types}")
603
954
  if "docx" in kind.lower():
604
- if isinstance(content,str):
605
- content = split_text(content,'sentence')
955
+ if isinstance(content, str):
956
+ content = split_text(content, "sentence")
606
957
  doc = docx.Document()
607
958
  for i, paragraph_text in enumerate(content):
608
959
  paragraph = doc.add_paragraph()
@@ -644,4 +995,4 @@ def save_file(fpath, content, kind=None, font_name="Arial", font_size=10,spacing
644
995
  # lang = "chinese"
645
996
  # translated_text = translate(text_to_translate, lang=lang)
646
997
  # print(f"Detected language:{detected_language} \ntranslated into {lang}")
647
- # print("Translated text:\n", translated_text)
998
+ # print("Translated text:\n", translated_text)