py2ls 0.1.10.1__py3-none-any.whl → 0.1.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/ips.py +770 -2
- py2ls/netfinder.py +33 -8
- py2ls/ocr.py +258 -94
- py2ls/translator.py +470 -119
- {py2ls-0.1.10.1.dist-info → py2ls-0.1.10.2.dist-info}/METADATA +1 -1
- {py2ls-0.1.10.1.dist-info → py2ls-0.1.10.2.dist-info}/RECORD +7 -7
- {py2ls-0.1.10.1.dist-info → py2ls-0.1.10.2.dist-info}/WHEEL +1 -1
py2ls/translator.py
CHANGED
@@ -1,6 +1,6 @@
 import re
 import json
-import docx
+import docx  # pip install python-docx
 from PyPDF2 import PdfReader
 from langdetect import detect
 from googletrans import Translator as GoogleTranslator
@@ -14,70 +14,79 @@ import numpy as np
 from nltk.tokenize import sent_tokenize
 from itertools import pairwise
 from tqdm import tqdm
-from fuzzywuzzy import fuzz,process
+from fuzzywuzzy import fuzz, process
 
 
-def split_by_sent_n(text,n=10):
+def split_by_sent_n(text, n=10):
     # split text into sentences
-    text_split_by_sent=sent_tokenize(text)
-    cut_loc_array=np.arange(0,len(text_split_by_sent),n)
-    if cut_loc_array[-1]!=len(text_split_by_sent):
-        cut_loc=np.append(cut_loc_array,len(text_split_by_sent))
+    text_split_by_sent = sent_tokenize(text)
+    cut_loc_array = np.arange(0, len(text_split_by_sent), n)
+    if cut_loc_array[-1] != len(text_split_by_sent):
+        cut_loc = np.append(cut_loc_array, len(text_split_by_sent))
     else:
         cut_loc = cut_loc_array
     # get text in section (e.g., every 10 sentences)
-    text_section=[]
-    for i,j in pairwise(cut_loc):
+    text_section = []
+    for i, j in pairwise(cut_loc):
         text_section.append(text_split_by_sent[i:j])
     return text_section
-def account_letters(text,n=10):
-    len_=[]
-    [len_.append(len(i)) for i in split_by_sent_n(text,n)[0]]
+
+
+def account_letters(text, n=10):
+    len_ = []
+    [len_.append(len(i)) for i in split_by_sent_n(text, n)[0]]
     return np.sum(len_)
-def auto_chunk_size(txt,verbose=False):
-    chunk_size=[]
-    for i in range(1,50):
-        while 4000<account_letters(txt,n=i)<4700:
+
+
+def auto_chunk_size(txt, verbose=False):
+    chunk_size = []
+    for i in range(1, 50):
+        while 4000 < account_letters(txt, n=i) < 4700:
             if verbose:
-                print(f"the optimal chunk_size is {i} sentences")
+                print(f"the optimal chunk_size is {i} sentences")
             chunk_size.append(i)
             break
     return chunk_size[0]
+
+
 # import pathlib
 # import argostranslate.package
 # import argostranslate.translate
 def get_lang_code_iso639():
     from JFL import netfinder
-    url="https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes"
+
+    url = "https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes"
     # res=netfinder.fetch(url,where="table",what="wikitable sortable jquery-tablesorter")
-    res=netfinder.fetch(url,where="tr",extend=0)
-    fullname,shortcut=[],[]
-    for i in range(6,len(res)-2):
-        if len(res[i])>len(res[i+1]) and res[i+1][:2]==res[i+2][:2]:
+    res = netfinder.fetch(url, where="tr", extend=0)
+    fullname, shortcut = [], []
+    for i in range(6, len(res) - 2):
+        if len(res[i]) > len(res[i + 1]) and res[i + 1][:2] == res[i + 2][:2]:
             fullname.append(res[i])
-            shortcut.append(res[i+1])
-    lang_code_iso639=dict([*zip(fullname,shortcut)])
+            shortcut.append(res[i + 1])
+    lang_code_iso639 = dict([*zip(fullname, shortcut)])
     return lang_code_iso639
 
-def detect_lang(text, output="lang",verbose=False):
-    dir_curr_script=os.path.dirname(os.path.abspath(__file__))
-    dir_lang_code=dir_curr_script+"/data/lang_code_iso639.json"
+
+def detect_lang(text, output="lang", verbose=False):
+    dir_curr_script = os.path.dirname(os.path.abspath(__file__))
+    dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
     with open(dir_lang_code, "r") as file:
-        lang_code_iso639=json.load(file)
-    l_lang,l_code = [],[]
-    [[l_lang.append(v),l_code.append(k)] for v,k in lang_code_iso639.items()]
+        lang_code_iso639 = json.load(file)
+    l_lang, l_code = [], []
+    [[l_lang.append(v), l_code.append(k)] for v, k in lang_code_iso639.items()]
     try:
         if is_text(text):
-            code_detect=detect(text)
-            if 'c' in output.lower(): # return code
-                return l_code[strcmp(code_detect,l_code, verbose=verbose)[1]]
+            code_detect = detect(text)
+            if "c" in output.lower():  # return code
+                return l_code[strcmp(code_detect, l_code, verbose=verbose)[1]]
             else:
-                return l_lang[strcmp(code_detect,l_code, verbose=verbose)[1]]
+                return l_lang[strcmp(code_detect, l_code, verbose=verbose)[1]]
         else:
             print(f"{text} is not supported")
-            return
+            return "no"
     except:
-        return
+        return "no"
+
 
 def is_text(s):
     has_alpha = any(char.isalpha() for char in s)
@@ -85,7 +94,8 @@ def is_text(s):
     # no_special = not re.search(r'[^A-Za-z0-9\s]', s)
     return has_alpha and has_non_alpha
 
-def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer='WR'):
+
+def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
     """
     Compares a search term with a list of candidate strings and finds the best match based on similarity score.
 
@@ -98,21 +108,23 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer='WR'
     Returns:
     tuple: A tuple containing the best match and its index in the candidates list.
     """
+
     def to_lower(s, ignore_case=True):
-        #Converts a string or list of strings to lowercase if ignore_case is True.
+        # Converts a string or list of strings to lowercase if ignore_case is True.
         if ignore_case:
            if isinstance(s, str):
                return s.lower()
            elif isinstance(s, list):
                return [elem.lower() for elem in s]
        return s
-    str1_, str2_ = to_lower(search_term, ignore_case), to_lower(candidates, ignore_case)
+
+    str1_, str2_ = to_lower(search_term, ignore_case), to_lower(candidates, ignore_case)
    if isinstance(str2_, list):
-        if 'part' in scorer.lower():
+        if "part" in scorer.lower():
            similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
-        elif 'W' in scorer.lower():
+        elif "W" in scorer.lower():
            similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
-        elif 'Ratio' in scorer.lower():
+        elif "Ratio" in scorer.lower():
            similarity_scores = [fuzz.Ratio(str1_, word) for word in str2_]
        else:
            similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
@@ -120,11 +132,11 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer='WR'
        best_match_score = similarity_scores[best_match_index]
    else:
        best_match_index = 0
-        if 'part' in scorer.lower():
+        if "part" in scorer.lower():
            best_match_score = fuzz.partial_ratio(str1_, str2_)
-        elif 'W' in scorer.lower():
+        elif "W" in scorer.lower():
            best_match_score = fuzz.WRatio(str1_, str2_)
-        elif 'Ratio' in scorer.lower():
+        elif "Ratio" in scorer.lower():
            best_match_score = fuzz.Ratio(str1_, str2_)
        else:
            best_match_score = fuzz.WRatio(str1_, str2_)
@@ -136,12 +148,15 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer='WR'
 
 
 def methods(idx=0):
-    methods_=["GoogleTrans (default)","DeepL","Argos"]
+    methods_ = ["GoogleTrans (default)", "DeepL", "Argos"]
     # print(f"supported methods: {methods_}")
     # print(f"return the selected is: {methods_[idx]}")
     return methods_[idx]
 
-DEFAULT_SERVICE_URLS=("translate.google.de","translate.google.fr")
+
+DEFAULT_SERVICE_URLS = ("translate.google.de", "translate.google.fr")
+
+
 def user_agent():
     # Example of generating a random user-agent string
     user_agents = [
@@ -179,24 +194,259 @@ def user_agent():
     ]
     agents = random.choice(user_agents)
     return agents
+
+
 def get_language_code(language, translator="google"):
     """
     Get language code for translation services (Google Translate, DeepL).
     """
-    deepl_languages = {
-
-
+    deepl_languages = {
+        "English": "EN",
+        "German": "DE",
+        "French": "FR",
+        "Spanish": "ES",
+        "Italian": "IT",
+        "Dutch": "NL",
+        "Polish": "PL",
+        "Russian": "RU",
+        "Japanese": "JA",
+        "Chinese": "ZH",
+    }
+    google_languages = {
+        "Afrikaans": "af",
+        "Albanian": "sq",
+        "Amharic": "am",
+        "Arabic": "ar",
+        "Armenian": "hy",
+        "Azerbaijani": "az",
+        "Basque": "eu",
+        "Belarusian": "be",
+        "Bengali": "bn",
+        "Bosnian": "bs",
+        "Bulgarian": "bg",
+        "Catalan": "ca",
+        "Cebuano": "ceb",
+        "Chichewa": "ny",
+        "Chinese": "zh-CN",
+        "Corsican": "co",
+        "Croatian": "hr",
+        "Czech": "cs",
+        "Danish": "da",
+        "Dutch": "nl",
+        "English": "en",
+        "Esperanto": "eo",
+        "Estonian": "et",
+        "Filipino": "tl",
+        "Finnish": "fi",
+        "French": "fr",
+        "Frisian": "fy",
+        "Galician": "gl",
+        "Georgian": "ka",
+        "German": "de",
+        "Greek": "el",
+        "Gujarati": "gu",
+        "HaitianCreole": "ht",
+        "Hausa": "ha",
+        "Hawaiian": "haw",
+        "Hebrew": "he",
+        "Hindi": "hi",
+        "Hmong": "hmn",
+        "Hungarian": "hu",
+        "Icelandic": "is",
+        "Igbo": "ig",
+        "Indonesian": "id",
+        "Irish": "ga",
+        "Italian": "it",
+        "Japanese": "ja",
+        "Javanese": "jv",
+        "Kannada": "kn",
+        "Kazakh": "kk",
+        "Khmer": "km",
+        "Kinyarwanda": "rw",
+        "Korean": "ko",
+        "Kurdish": "ku",
+        "Kyrgyz": "ky",
+        "Lao": "lo",
+        "Latin": "la",
+        "Latvian": "lv",
+        "Lithuanian": "lt",
+        "Luxembourgish": "lb",
+        "Macedonian": "mk",
+        "Malagasy": "mg",
+        "Malay": "ms",
+        "Malayalam": "ml",
+        "Maltese": "mt",
+        "Maori": "mi",
+        "Marathi": "mr",
+        "Mongolian": "mn",
+        "Myanmar": "my",
+        "Nepali": "ne",
+        "Norwegian": "no",
+        "Odia": "or",
+        "Oriya": "or",
+        "Pashto": "ps",
+        "Persian": "fa",
+        "Polish": "pl",
+        "Portuguese": "pt",
+        "Punjabi": "pa",
+        "Romanian": "ro",
+        "Russian": "ru",
+        "Samoan": "sm",
+        "ScotsGaelic": "gd",
+        "Serbian": "sr",
+        "Sesotho": "st",
+        "Shona": "sn",
+        "Sindhi": "sd",
+        "Sinhala": "si",
+        "Slovak": "sk",
+        "Slovenian": "sl",
+        "Somali": "so",
+        "Spanish": "es",
+        "Sundanese": "su",
+        "Swahili": "sw",
+        "Swedish": "sv",
+        "Tajik": "tg",
+        "Tamil": "ta",
+        "Tatar": "tt",
+        "Telugu": "te",
+        "Thai": "th",
+        "Turkish": "tr",
+        "Turkmen": "tk",
+        "Ukrainian": "uk",
+        "Urdu": "ur",
+        "Uyghur": "ug",
+        "Uzbek": "uz",
+        "Vietnamese": "vi",
+        "Welsh": "cy",
+        "Xhosa": "xh",
+        "Yiddish": "yi",
+        "Yoruba": "yo",
+        "Zulu": "zu",
+    }
+    argos_languages = {
+        "Afrikaans": "af",
+        "Albanian": "sq",
+        "Amharic": "am",
+        "Arabic": "ar",
+        "Armenian": "hy",
+        "Azerbaijani": "az",
+        "Basque": "eu",
+        "Belarusian": "be",
+        "Bengali": "bn",
+        "Bosnian": "bs",
+        "Bulgarian": "bg",
+        "Catalan": "ca",
+        "Cebuano": "ceb",
+        "Chichewa": "ny",
+        "Chinese": "zh",
+        "Corsican": "co",
+        "Croatian": "hr",
+        "Czech": "cs",
+        "Danish": "da",
+        "Dutch": "nl",
+        "English": "en",
+        "Esperanto": "es",
+        "Estonian": "et",
+        "Filipino": "tl",
+        "Finnish": "fi",
+        "French": "fr",
+        "Frisian": "fy",
+        "Galician": "gl",
+        "Georgian": "ka",
+        "German": "de",
+        "Greek": "el",
+        "Gujarati": "gu",
+        "HaitianCreole": "ht",
+        "Hausa": "ha",
+        "Hawaiian": "haw",
+        "Hebrew": "he",
+        "Hindi": "hi",
+        "Hmong": "hmn",
+        "Hungarian": "hu",
+        "Icelandic": "is",
+        "Igbo": "ig",
+        "Indonesian": "id",
+        "Irish": "ga",
+        "Italian": "it",
+        "Japanese": "ja",
+        "Javanese": "jv",
+        "Kannada": "kn",
+        "Kazakh": "kk",
+        "Khmer": "km",
+        "Kinyarwanda": "rw",
+        "Korean": "ko",
+        "Kurdish": "ku",
+        "Kyrgyz": "ky",
+        "Lao": "lo",
+        "Latin": "la",
+        "Latvian": "lv",
+        "Lithuanian": "lt",
+        "Luxembourgish": "lb",
+        "Macedonian": "mk",
+        "Malagasy": "mg",
+        "Malay": "ms",
+        "Malayalam": "ml",
+        "Maltese": "mt",
+        "Maori": "mi",
+        "Marathi": "mr",
+        "Mongolian": "mn",
+        "Myanmar": "my",
+        "Nepali": "ne",
+        "Norwegian": "no",
+        "Odia": "or",
+        "Oriya": "or",
+        "Pashto": "ps",
+        "Persian": "fa",
+        "Polish": "pl",
+        "Portuguese": "pt",
+        "Punjabi": "pa",
+        "Romanian": "ro",
+        "Russian": "ru",
+        "Samoan": "sm",
+        "ScotsGaelic": "gd",
+        "Serbian": "sr",
+        "Sesotho": "st",
+        "Shona": "sn",
+        "Sindhi": "sd",
+        "Sinhala": "si",
+        "Slovak": "sk",
+        "Slovenian": "sl",
+        "Somali": "so",
+        "Spanish": "es",
+        "Sundanese": "su",
+        "Swahili": "sw",
+        "Swedish": "sv",
+        "Tajik": "tg",
+        "Tamil": "ta",
+        "Tatar": "tt",
+        "Telugu": "te",
+        "Thai": "th",
+        "Turkish": "tr",
+        "Turkmen": "tk",
+        "Ukrainian": "uk",
+        "Urdu": "ur",
+        "Uyghur": "ug",
+        "Uzbek": "uz",
+        "Vietnamese": "vi",
+        "Welsh": "cy",
+        "Xhosa": "xh",
+        "Yiddish": "yi",
+        "Yoruba": "yo",
+        "Zulu": "zu",
+    }
     if "deep" in translator.lower():
         langs = deepl_languages
-    elif 'goo' in translator.lower():
+    elif "goo" in translator.lower():
         langs = google_languages
-    elif 'ar' in translator.lower():
+    elif "ar" in translator.lower():
         langs = argos_languages
-
-
-
-
-
+    lang_found = strcmp(language, list(langs.keys()))[0]
+    if lang_found in list(langs.keys()):
+        return langs[lang_found]
+    else:
+        print(f"fail to find the {language} code in translator {translator}")
+        return None
+
 
 # language = "chinese"
 # # Example usage:
@@ -216,6 +466,8 @@ def load_docx(filename):
     for paragraph in doc.paragraphs:
         text.append(paragraph.text)
     return text
+
+
 # # usage
 # filename = "example.docx" # Change to the path of your .docx file
 # text = load_docx(filename)
@@ -223,6 +475,7 @@ def load_docx(filename):
 # print("Text from the document:")
 # print(text)
 
+
 def load_pdf(filename, page="all", verbose=False):
     from PyPDF2 import PdfReader
     import numpy as np
@@ -289,12 +542,14 @@ def split_text(text, method="sentence", limit=4500):
     else:
         return re.split(r"\{}".format(method), text)
 
+
 def filter_errors(text):
     # handle bugs:
     # bug1: ".com" cannot be translated, but '..com' works
-    text=text.replace(".com", "..come")
+    text = text.replace(".com", "..come")
     return text
 
+
 def merge_text(input, robust=True):
     """
     Convert a list of strings, tuple of strings, or numpy array of strings into a single concatenated string.
@@ -318,6 +573,7 @@ def merge_text(input, robust=True):
     else:
         return str(input)
 
+
 def replace_text(text, dict_replace=None, robust=True):
     """
     Replace specified substrings in the input text with provided replacements.
@@ -362,19 +618,21 @@ def replace_text(text, dict_replace=None, robust=True):
             text = text.replace(k, v)
     return text
 
+
 # # usage:
 # a = "kjkjk (a, b, c)"
 # replace_text(a, {"(": "", ")": "", " ": " "}, robust=False)
 
+
 def merge_strings_every_n(strings_list, n=10):
     merged_list = []
-    if n>0:
+    if n > 0:
         for i in range(0, len(strings_list), n):
             merged_string = "".join(strings_list[i : i + n])
             merged_list.append(merged_string)
-        return merged_list,n
+        return merged_list, n
     else:
-        return strings_list,n
+        return strings_list, n
 
 
 def translate(
@@ -386,7 +644,7 @@ def translate(
     user_agent=user_agent(),
     verbose=True,
     error_verbose=True,
-    limit=5000
+    limit=5000,
 ):
     """
     Translate text to the target language using the specified translation method (Google Translate or DeepL).
@@ -394,19 +652,47 @@ def translate(
     """
     # error_verbose = verbose or False
 
-    if isinstance(text,list):
-        text=merge_text(text)
+    if isinstance(text, list):
+        # text=merge_text(text)
+        text_list = [
+            translate(
+                i,
+                lang=lang,
+                lang_src=lang_src,
+                method=method,
+                service_urls=service_urls,
+                user_agent=user_agent,
+                verbose=verbose,
+                error_verbose=error_verbose,
+                limit=limit,
+            )
+            for i in tqdm(
+                text,
+                desc="is translating",
+                colour="green",
+                bar_format="{l_bar}{bar} {n_fmt}/{total_fmt}",
+            )
+        ]
+        return text_list
     text = replace_text(text)
     if lang_src is None:
-        lang_src =
+        lang_src = detect_lang(text)
     try:
         if len(text) > limit:
-            n=auto_chunk_size(text)
-            text_segments = split_by_sent_n(text,n)
+            n = auto_chunk_size(text)
+            text_segments = split_by_sent_n(text, n)
             translations = ""
-            for segment in tqdm(text_segments,desc="is translating",colour="green"):
+            for segment in tqdm(text_segments, desc="is translating", colour="green"):
                 segment = replace_text(merge_text(segment))
-                translated_segment = translate_segment(text=segment,lang=lang,lang_src=lang_src,method=method,user_agent=user_agent,service_urls=service_urls,verbose=verbose,error_verbose=error_verbose)
+                translated_segment = translate_segment(
+                    text=segment,
+                    lang=lang,
+                    lang_src=lang_src,
+                    method=method,
+                    user_agent=user_agent,
+                    service_urls=service_urls,
+                    verbose=verbose,
+                    error_verbose=error_verbose,
+                )
                 time.sleep(1)
                 if translated_segment:
@@ -416,12 +702,22 @@ def translate(
                     translations += ""
             return translations
         else:
-            return translate_segment(text=text,lang=lang,lang_src=lang_src,method=method,user_agent=user_agent,service_urls=service_urls,verbose=verbose,error_verbose=error_verbose)
+            return translate_segment(
+                text=text,
+                lang=lang,
+                lang_src=lang_src,
+                method=method,
+                user_agent=user_agent,
+                service_urls=service_urls,
+                verbose=verbose,
+                error_verbose=error_verbose,
+            )
     except Exception as e:
         if error_verbose:
             print("(translate)Error during translation :", e)
         return ""
 
+
 def translate_segment(
     text,
     lang="chinese",
@@ -430,39 +726,42 @@ def translate_segment(
     service_urls=DEFAULT_SERVICE_URLS,
     user_agent=user_agent(),
     verbose=False,
-    error_verbose=True
+    error_verbose=True,
 ):
     """
     Translate a text segment to the target language using the specified translation method (Google Translate or DeepL).
     """
-
+
     text_clean = filter_errors(text)
     text_clean = replace_text(text_clean)
     if lang_src is None:
         lang_src = detect_lang(text_clean)
     try:
-        lang_src = get_language_code(lang_src,"google")
-        lang_tgt = get_language_code(lang,"google")
+        lang_src = get_language_code(lang_src, "google")
+        lang_tgt = get_language_code(lang, "google")
        if "goog" in method.lower():
-            Trstor = GoogleTranslator(service_urls=service_urls,user_agent=user_agent)
+            Trstor = GoogleTranslator(service_urls=service_urls, user_agent=user_agent)
            txt = Trstor.translate(text_clean, src=lang_src, dest=lang_tgt).text
        elif "trans" in method.lower():
-            lang_src = get_language_code(lang_src,"google")
-            lang_tgt = get_language_code(lang,"google")
-            translator = TranslateTranslator(
-                from_lang=lang_src,
-                to_lang=lang_tgt,
-                provider="LibreTranslate",
-                secret_access_key=None, base_url="https://translate.astian.org/")
+            lang_src = get_language_code(lang_src, "google")
+            lang_tgt = get_language_code(lang, "google")
+            translator = TranslateTranslator(
+                from_lang=lang_src,
+                to_lang=lang_tgt,
+                provider="LibreTranslate",
+                secret_access_key=None,
+                base_url="https://translate.astian.org/",
+            )
            txt = translator.translate(text_clean)
-        elif 'ar' in method.lower():
+        elif "ar" in method.lower():
            lang_src = get_language_code(language=lang_src, translator="argos")
            lang_tgt = get_language_code(language=lang, translator="argos")
            argostranslate.package.update_package_index()
            available_packages = argostranslate.package.get_available_packages()
            package_to_install = next(
                filter(
-                    lambda x: x.from_code == lang_src and x.to_code == lang_tgt, available_packages
+                    lambda x: x.from_code == lang_src and x.to_code == lang_tgt,
+                    available_packages,
                )
            )
            argostranslate.package.install_from_path(package_to_install.download())
@@ -475,15 +774,19 @@ def translate_segment(
         print(txt)
         return txt
     except Exception as e:
-        txt=translate_with_retry(
-            text_clean,
-            lang=lang,
-            lang_src=lang_src,
-            method=method,
-            verbose=verbose,
-            error_verbose=error_verbose,
-            user_agent=user_agent,service_urls=service_urls)
+        txt = translate_with_retry(
+            text_clean,
+            lang=lang,
+            lang_src=lang_src,
+            method=method,
+            verbose=verbose,
+            error_verbose=error_verbose,
+            user_agent=user_agent,
+            service_urls=service_urls,
+        )
         return txt
+
+
 def translate_with_retry(
     text,
     lang="chinese",
@@ -491,15 +794,20 @@ def translate_with_retry(
     method=methods(),
     verbose=False,
     error_verbose=True,
-    user_agent=user_agent(),
+    user_agent=user_agent(),
+    service_urls=DEFAULT_SERVICE_URLS,
+):
     """
-
+    Translate a text to the target language, retrying with alternative service URLs on connection errors.
     """
-    def try_translate(text, lang, lang_src, user_agent, service_url):
+
+    def try_translate(text, lang, lang_src, user_agent, service_url):
         try:
-            translator_ = GoogleTranslator(user_agent=user_agent, service_urls=[service_url])
+            translator_ = GoogleTranslator(
+                user_agent=user_agent, service_urls=[service_url]
+            )
            result = translator_.translate(text, dest=lang, src=lang_src)
-            if result and hasattr(result,"text"):
+            if result and hasattr(result, "text"):
                return result.text
            else:
                raise ValueError(f"Invalid response from {service_url}: {result}")
@@ -507,32 +815,52 @@ def translate_with_retry(
             raise RuntimeError(f"Error using {service_url}: {e}")
 
     if lang_src is None:
-        lang_src = detect_lang(text)
+        lang_src = detect_lang(text)
     lang_src = get_language_code(language=lang_src)
     lang = get_language_code(language=lang)
     try:
-        return try_translate(text, lang=lang, lang_src=lang_src, user_agent=user_agent, service_url=service_urls[0])
+        return try_translate(
+            text,
+            lang=lang,
+            lang_src=lang_src,
+            user_agent=user_agent,
+            service_url=service_urls[0],
+        )
     except Exception as e:
         if error_verbose:
             print("Connection error:", e)
-        try:
+        try:
             time.sleep(1)
-            return try_translate(text, lang=lang, lang_src=lang_src, user_agent=user_agent, service_url=service_urls[1])
+            return try_translate(
+                text,
+                lang=lang,
+                lang_src=lang_src,
+                user_agent=user_agent,
+                service_url=service_urls[1],
+            )
        except Exception as e:
            if error_verbose:
-                print(f"(translate_with_retry):Connection error with {service_urls}: {e}")
+                print(
+                    f"(translate_with_retry):Connection error with {service_urls}: {e}"
+                )
        if error_verbose:
            print("All service URLs failed. Unable to translate the text.")
        return text
 
 
-def trans_docx(filename, lang="english", lang_src=None, method=methods(),service_urls=["translate.google.de"],
-               verbose=False):
+def trans_docx(
+    filename,
+    lang="english",
+    lang_src=None,
+    method=methods(),
+    service_urls=["translate.google.de"],
+    verbose=False,
+):
     """
-
-
-
-
+    load the docx file and translated it into target lang "lang",
+    verbose: (default 'False', no display)to display the translated text in for loop
+    Return (list):
+        the translated text as a list
     """
     txt = load_docx(filename)
     trans_text = []
@@ -543,7 +871,14 @@ def trans_docx(filename, lang="english", lang_src=None, method=methods(),service
             pass
         else:
             i.join(j)
-        trans_text_ = translate(i,lang=lang,lang_src=lang_src,method=method,service_urls=service_urls,verbose=verbose)
+        trans_text_ = translate(
+            i,
+            lang=lang,
+            lang_src=lang_src,
+            method=method,
+            service_urls=service_urls,
+            verbose=verbose,
+        )
         trans_text.append(trans_text_)
         # if verbose:
         #     print(trans_text_)
@@ -552,15 +887,30 @@ def trans_docx(filename, lang="english", lang_src=None, method=methods(),service
     else:
         return None
 
-
-def trans_pdf(filename,page="all",lang="english",lang_src=None,method="google",service_urls=["translate.google.de"],verbose=False):
-    """load the pdf file and translated it into target lang "lang",
+
+def trans_pdf(
+    filename,
+    page="all",
+    lang="english",
+    lang_src=None,
+    method="google",
+    service_urls=["translate.google.de"],
+    verbose=False,
+):
+    """load the pdf file and translated it into target lang "lang",
     verbose: (default 'False', no display)to display the translated text in for loop
     Return (list):
         the translated text as a list
     """
-    txt = load_pdf(filename,page=page,verbose=verbose)
-    trans_text = translate(txt,lang=lang,lang_src=lang_src,method=method,service_urls=service_urls,verbose=False)
+    txt = load_pdf(filename, page=page, verbose=verbose)
+    trans_text = translate(
+        txt,
+        lang=lang,
+        lang_src=lang_src,
+        method=method,
+        service_urls=service_urls,
+        verbose=False,
+    )
     return trans_text
 
 
@@ -578,7 +928,8 @@ def save_content(fpath, content):
     with open(fpath, "w") as file:
         file.write(content)
 
-def save_file(fpath, content, kind=None, font_name="Arial", font_size=10,spacing=6):
+
+def save_file(fpath, content, kind=None, font_name="Arial", font_size=10, spacing=6):
     """
     Save content into a file with specified file type and formatting.
 
@@ -592,7 +943,7 @@ def save_file(fpath, content, kind=None, font_name="Arial", font_size=10,spacing
     Returns:
     None
     """
-    file_types = [".docx", ".txt", ".md", ".html", ".pdf"]
+    file_types = [".docx", ".txt", ".md", ".html", ".pdf"]
     if kind is None:
         # Extract the file extension from fpath
         _, kind = os.path.splitext(fpath)
@@ -601,8 +952,8 @@ def save_file(fpath, content, kind=None, font_name="Arial", font_size=10,spacing
     if kind.lower() not in file_types:
         raise ValueError(f"Error:\n{kind} is not in the supported list {file_types}")
     if "docx" in kind.lower():
-        if isinstance(content,str):
-            content = split_text(content,"sentence")
+        if isinstance(content, str):
+            content = split_text(content, "sentence")
         doc = docx.Document()
         for i, paragraph_text in enumerate(content):
             paragraph = doc.add_paragraph()
@@ -644,4 +995,4 @@ def save_file(fpath, content, kind=None, font_name="Arial", font_size=10,spacing
 # lang = "chinese"
 # translated_text = translate(text_to_translate, lang=lang)
 # print(f"Detected language:{detected_language} \ntranslated into {lang}")
-# print("Translated text:\n", translated_text)
+# print("Translated text:\n", translated_text)
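
The most visible behavioral change in this diff is in `translate()`: a list input is no longer merged into one string via `merge_text()` but is translated item by item with a `tqdm` progress bar, returning a list. A minimal usage sketch follows (a hypothetical example, not taken from the package itself; it assumes py2ls 0.1.10.2 and its dependencies are installed and that the Google endpoints are reachable):

```python
# Hypothetical usage sketch for the translate() API shown in the diff above.
from py2ls.translator import translate

# A single string is translated as before; texts longer than `limit`
# are split into sentence chunks via auto_chunk_size()/split_by_sent_n().
print(translate("Das ist ein kurzer Test.", lang="english", lang_src="german"))

# New in 0.1.10.2: a list is translated element-wise (with a tqdm
# progress bar), and a list of translated strings is returned instead
# of the items being merged into one text first.
paragraphs = ["Erster Absatz.", "Zweiter Absatz."]
print(translate(paragraphs, lang="english", lang_src="german"))
```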