py2ls 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/COMMIT_EDITMSG +1 -0
- py2ls/.git/FETCH_HEAD +1 -0
- py2ls/.git/HEAD +1 -0
- py2ls/.git/config +15 -0
- py2ls/.git/description +1 -0
- py2ls/.git/hooks/applypatch-msg.sample +15 -0
- py2ls/.git/hooks/commit-msg.sample +24 -0
- py2ls/.git/hooks/fsmonitor-watchman.sample +174 -0
- py2ls/.git/hooks/post-update.sample +8 -0
- py2ls/.git/hooks/pre-applypatch.sample +14 -0
- py2ls/.git/hooks/pre-commit.sample +49 -0
- py2ls/.git/hooks/pre-merge-commit.sample +13 -0
- py2ls/.git/hooks/pre-push.sample +53 -0
- py2ls/.git/hooks/pre-rebase.sample +169 -0
- py2ls/.git/hooks/pre-receive.sample +24 -0
- py2ls/.git/hooks/prepare-commit-msg.sample +42 -0
- py2ls/.git/hooks/push-to-checkout.sample +78 -0
- py2ls/.git/hooks/update.sample +128 -0
- py2ls/.git/index +0 -0
- py2ls/.git/info/exclude +6 -0
- py2ls/.git/logs/HEAD +1 -0
- py2ls/.git/logs/refs/heads/main +1 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/logs/refs/remotes/origin/main +1 -0
- py2ls/.git/objects/25/b796accd261b9135fd32a2c00785f68edf6c46 +0 -0
- py2ls/.git/objects/36/b4a1b7403abc6c360f8fe2cb656ab945254971 +0 -0
- py2ls/.git/objects/3f/d6561300938afbb3d11976cf9c8f29549280d9 +0 -0
- py2ls/.git/objects/58/20a729045d4dc7e37ccaf8aa8eec126850afe2 +0 -0
- py2ls/.git/objects/60/f273eb1c412d916fa3f11318a7da7a9911b52a +0 -0
- py2ls/.git/objects/61/570cec8c061abe74121f27f5face6c69b98f99 +0 -0
- py2ls/.git/objects/69/13c452ca319f7cbf6a0836dc10a5bb033c84e4 +0 -0
- py2ls/.git/objects/78/3d4167bc95c9d2175e0df03ef1c1c880ba75ab +0 -0
- py2ls/.git/objects/79/7ae089b2212a937840e215276005ce76881307 +0 -0
- py2ls/.git/objects/7e/5956c806b5edc344d46dab599dec337891ba1f +1 -0
- py2ls/.git/objects/8e/55a7d2b96184030211f20c9b9af201eefcac82 +0 -0
- py2ls/.git/objects/91/c69ad88fe0ba94aa7859fb5f7edac5e6f1a3f7 +0 -0
- py2ls/.git/objects/b0/56be4be89ba6b76949dd641df45bb7036050c8 +0 -0
- py2ls/.git/objects/b0/9cd7856d58590578ee1a4f3ad45d1310a97f87 +0 -0
- py2ls/.git/objects/d9/005f2cc7fc4e65f14ed5518276007c08cf2fd0 +0 -0
- py2ls/.git/objects/df/e0770424b2a19faf507a501ebfc23be8f54e7b +0 -0
- py2ls/.git/objects/e9/391ffe371f1cc43b42ef09b705d9c767c2e14f +0 -0
- py2ls/.git/objects/fc/292e793ecfd42240ac43be407023bd731fa9e7 +0 -0
- py2ls/.git/refs/heads/main +1 -0
- py2ls/.git/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/refs/remotes/origin/main +1 -0
- py2ls/.gitattributes +2 -0
- py2ls/.gitignore +152 -0
- py2ls/LICENSE +201 -0
- py2ls/README.md +409 -0
- py2ls/__init__.py +17 -0
- py2ls/brain_atlas.py +145 -0
- py2ls/correlators.py +475 -0
- py2ls/dbhandler.py +97 -0
- py2ls/freqanalysis.py +800 -0
- py2ls/internet_finder.py +405 -0
- py2ls/ips.py +2844 -0
- py2ls/netfinder.py +780 -0
- py2ls/sleep_events_detectors.py +1350 -0
- py2ls/translator.py +686 -0
- py2ls/version.py +1 -0
- py2ls/wb_detector.py +169 -0
- py2ls-0.1.0.dist-info/METADATA +12 -0
- py2ls-0.1.0.dist-info/RECORD +64 -0
- py2ls-0.1.0.dist-info/WHEEL +4 -0
py2ls/translator.py
ADDED
@@ -0,0 +1,686 @@
|
|
1
|
+
import re
|
2
|
+
import docx # pip install python-docx
|
3
|
+
from PyPDF2 import PdfReader
|
4
|
+
from langdetect import detect
|
5
|
+
from googletrans import Translator as GoogleTranslator
|
6
|
+
import os
|
7
|
+
import docx
|
8
|
+
from fpdf import FPDF
|
9
|
+
import random
|
10
|
+
import time
|
11
|
+
from translate import Translator as TranslateTranslator
|
12
|
+
import numpy as np
|
13
|
+
from nltk.tokenize import sent_tokenize
|
14
|
+
from itertools import pairwise
|
15
|
+
from tqdm import tqdm
|
16
|
+
|
17
|
+
|
18
|
+
def split_by_sent_n(text, n=10):
    """Split *text* into chunks of at most *n* sentences each.

    Args:
        text (str): The text to split.
        n (int): Maximum number of sentences per chunk.

    Returns:
        list[list[str]]: Consecutive groups of sentences (each group a list
        of sentence strings), covering the whole text in order.
    """
    sentences = sent_tokenize(text)
    # Chunk boundaries every n sentences; append the final partial boundary
    # when the sentence count is not an exact multiple of n.
    boundaries = np.arange(0, len(sentences), n)
    if boundaries[-1] != len(sentences):
        boundaries = np.append(boundaries, len(sentences))
    return [sentences[start:stop] for start, stop in pairwise(boundaries)]
|
31
|
+
def account_letters(text, n=10):
    """Return the total character count of the first *n*-sentence chunk of *text*.

    Args:
        text (str): The text to measure.
        n (int): Chunk size in sentences (passed to split_by_sent_n).

    Returns:
        int: Sum of the lengths of the sentences in the first chunk.
    """
    # The original built a list via a side-effecting list comprehension
    # ([len_.append(...)]) and np.sum'd it; a generator with sum() is the
    # idiomatic, allocation-free equivalent.
    return sum(len(sentence) for sentence in split_by_sent_n(text, n)[0])
|
35
|
+
def auto_chunk_size(txt, verbose=False):
    """Find the smallest sentence-chunk size whose first chunk is 4000-4700 chars.

    The 4000-4700 character window keeps each chunk under the ~5000-character
    limit of the translation services used elsewhere in this module.

    Args:
        txt (str): The text to size chunks for.
        verbose (bool): If True, print the chosen chunk size.

    Returns:
        int: The first n in 1..49 whose first n-sentence chunk falls in range.

    Raises:
        ValueError: If no chunk size in 1..49 satisfies the window.
    """
    # The original used `while <cond>: ... break`, which is just an `if`,
    # collected every matching size in a list, and returned element [0] —
    # raising a bare IndexError when nothing matched. Return the first
    # match directly and fail with an explicit error instead.
    for size in range(1, 50):
        if 4000 < account_letters(txt, n=size) < 4700:
            if verbose:
                print(f"the optimal chunk_size is {size} sentences")
            return size
    raise ValueError("no chunk size in 1..49 yields a 4000-4700 character first chunk")
|
44
|
+
# import pathlib
|
45
|
+
# import argostranslate.package
|
46
|
+
# import argostranslate.translate
|
47
|
+
def get_lang_code_iso639():
    """Scrape Wikipedia's ISO 639 table and return {full name: 2-letter code}.

    Fetches the table rows from the Wikipedia page via the project-local
    JFL.netfinder helper, then pairs language names with their codes.
    Requires network access; the module-level `lang_code_iso639` constant
    below caches one such result.
    """
    from JFL import netfinder  # project-local scraper; imported lazily
    url="https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes"
    # res=netfinder.fetch(url,where="table",what="wikitable sortable jquery-tablesorter")
    # Fetch raw <tr> row texts from the page.
    res=netfinder.fetch(url,where="tr",extend=0)
    fullname,shortcut=[],[]
    # Heuristic pairing: a name row is longer than the code row that follows
    # it, and consecutive code rows share the same 2-character prefix.
    # NOTE(review): the range bounds (6, len-2) skip header/footer rows of the
    # scraped table — fragile against page layout changes; verify on refresh.
    for i in range(6,len(res)-2):
        if len(res[i])>len(res[i+1]) and res[i+1][:2]==res[i+2][:2]:
            fullname.append(res[i])
            shortcut.append(res[i+1])
    lang_code_iso639=dict([*zip(fullname,shortcut)])
    return lang_code_iso639
|
59
|
+
|
60
|
+
# get_lang_code_iso639()
|
61
|
+
# Cached ISO 639-1 mapping {English language name: 2-letter code}, a snapshot
# of what get_lang_code_iso639() scrapes from Wikipedia. Used by
# search_iso639_fullname() to turn detected language codes back into names.
lang_code_iso639={'Abkhazian': 'ab',
 'Afar': 'aa',
 'Afrikaans': 'af',
 'Akan': 'ak',
 'Albanian': 'sq',
 'Amharic': 'am',
 'Arabic': 'ar',
 'Armenian': 'hy',
 'Assamese': 'as',
 # 'Avaric': 'av',
 'Aymara': 'ay',
 'Azerbaijani': 'az',
 'Bashkir': 'ba',
 'Basque': 'eu',
 'Belarusian': 'be',
 'Bislama': 'bi',
 'Breton': 'br',
 'Burmese': 'my',
 'Catalan, Valencian': 'ca',
 'Chamorro': 'ch',
 'Chichewa, Chewa, Nyanja': 'ny',
 'Chinese': 'zh',
 'Corsican': 'co',
 'Cree': 'cr',
 'Croatian': 'hr',
 'Danish': 'da',
 'Dutch, Flemish': 'nl',
 'Dzongkha': 'dz',
 'English': 'en',
 'Finnish': 'fi',
 'French': 'fr',
 'Galician': 'gl',
 'Georgian': 'ka',
 'German': 'de',
 'Greek, Modern (1453–)': 'el',
 'Gujarati': 'gu',
 'Hausa': 'ha',
 'Hebrew': 'he',
 'Hindi': 'hi',
 'Hungarian': 'hu',
 'Icelandic': 'is',
 'Italian': 'it',
 'Kikuyu, Gikuyu': 'ki',
 'Korean': 'ko',
 'Kurdish': 'ku',
 'Latin': 'la',
 'Limburgan, Limburger, Limburgish': 'li',
 'Luba-Katanga': 'lu',
 'Macedonian': 'mk',
 'Malay': 'ms',
 'Nauru': 'na',
 'North Ndebele': 'nd',
 'Nepali': 'ne',
 'Norwegian': 'no',
 'Norwegian Nynorsk': 'nn',
 'Sichuan Yi, Nuosu': 'ii',
 'Occitan': 'oc',
 'Ojibwa': 'oj',
 'Oriya': 'or',
 'Ossetian, Ossetic': 'os',
 'Persian': 'fa',
 'Punjabi, Panjabi': 'pa',
 'Quechua': 'qu',
 'Romanian, Moldavian, Moldovan': 'ro',
 'Russian': 'ru',
 'Samoan': 'sm',
 'Sanskrit': 'sa',
 'Serbian': 'sr',
 'Shona': 'sn',
 'Sinhala, Sinhalese': 'si',
 'Slovenian': 'sl',
 'Somali': 'so',
 'Sundanese': 'su',
 'Swahili': 'sw',
 'Swati': 'ss',
 'Tajik': 'tg',
 'Tamil': 'ta',
 'Telugu': 'te',
 'Thai': 'th',
 'Tibetan': 'bo',
 'Tigrinya': 'ti',
 'Tonga (Tonga Islands)': 'to',
 'Tsonga': 'ts',
 'Twi': 'tw',
 'Ukrainian': 'uk',
 'Urdu': 'ur',
 'Uzbek': 'uz',
 'Venda': 've',
 'Vietnamese': 'vi',
 'Volapük': 'vo',
 'Welsh': 'cy',
 'Wolof': 'wo',
 'Xhosa': 'xh',
 'Yiddish': 'yi',
 'Yoruba': 'yo',
 'Zulu': 'zu'}
|
157
|
+
def search_iso639_fullname(val):
    """Return the full English language name for a 2-letter ISO 639-1 code.

    Args:
        val (str): A 2-letter code such as 'de' or 'fr'.

    Returns:
        str | None: The matching language name, or None if the code is unknown.
    """
    # BUG FIX: the original tested `'de' in v` — ignoring `val` entirely and
    # returning 'German' for every input. Compare against the argument instead.
    for k, v in lang_code_iso639.items():
        if v == val:
            return k
    return None
|
161
|
+
|
162
|
+
|
163
|
+
def methods(idx=0):
    """Return the name of a supported translation backend by index.

    Args:
        idx (int): 0 = "GoogleTrans (default)", 1 = "DeepL", 2 = "Argos".

    Returns:
        str: The backend name at the given index.
    """
    supported = ["GoogleTrans (default)", "DeepL", "Argos"]
    return supported[idx]
|
168
|
+
|
169
|
+
# Google Translate mirrors tried in order (primary, then fallback) by
# translate_segment() / translate_with_retry().
DEFAULT_SERVICE_URLS = ('translate.google.de','translate.google.fr')
|
170
|
+
def user_agent():
    """Return a randomly chosen browser User-Agent string.

    A fresh random pick from a fixed pool of desktop, mobile, TV, and
    console identifiers, used to vary the headers sent to translation
    services.
    """
    candidate_agents = (
        # Windows (Intel)
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
        # Windows (ARM)
        "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
        # Linux (x86_64)
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
        # macOS (Intel)
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
        # macOS (ARM)
        "Mozilla/5.0 (Macintosh; ARM Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; ARM Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
        # iOS Devices
        "Mozilla/5.0 (iPad; CPU OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
        # Android Devices
        "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Mobile Safari/537.36",
        # Smart TVs
        "Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/537.36 (KHTML, like Gecko) SmartTV/1.0",
        "Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/537.36 (KHTML, like Gecko) WebAppManager/1.0",
        # Game Consoles
        "Mozilla/5.0 (PlayStation 5 3.01) AppleWebKit/605.1.15 (KHTML, like Gecko)",
        "Mozilla/5.0 (Xbox One 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edge/44.18363.8740",
    )
    return random.choice(candidate_agents)
|
207
|
+
def get_language_code(language, translator="google"):
    """
    Get the language code for a translation service.

    Args:
        language (str): Language name (case-insensitive substring match,
            e.g. "chinese", "german").
        translator (str): Service selector — matches "deepl" (substring
            "deep"), "google" ("goo"), or "argos" ("ar"); anything else
            falls back to the Google table.

    Returns:
        str | None: The service-specific language code, or None when the
        language is not found.
    """
    deepl_languages = {"English":"EN","German":"DE","French":"FR","Spanish":"ES","Italian":"IT","Dutch":"NL","Polish":"PL","Russian":"RU","Japanese":"JA","Chinese":"ZH",}
    google_languages = {"Afrikaans":"af","Albanian":"sq","Amharic":"am","Arabic":"ar","Armenian":"hy","Azerbaijani":"az","Basque":"eu","Belarusian":"be","Bengali":"bn","Bosnian":"bs","Bulgarian":"bg","Catalan":"ca","Cebuano":"ceb","Chichewa":"ny","Chinese":"zh-CN","Corsican":"co","Croatian":"hr","Czech":"cs","Danish":"da","Dutch":"nl","English":"en","Esperanto":"eo","Estonian":"et","Filipino":"tl","Finnish":"fi","French":"fr","Frisian":"fy","Galician":"gl","Georgian":"ka","German":"de","Greek":"el","Gujarati":"gu","HaitianCreole":"ht","Hausa":"ha","Hawaiian":"haw","Hebrew":"he","Hindi":"hi","Hmong":"hmn","Hungarian":"hu","Icelandic":"is","Igbo":"ig","Indonesian":"id","Irish":"ga","Italian":"it","Japanese":"ja","Javanese":"jv","Kannada":"kn","Kazakh":"kk","Khmer":"km","Kinyarwanda":"rw","Korean":"ko","Kurdish":"ku","Kyrgyz":"ky","Lao":"lo","Latin":"la","Latvian":"lv","Lithuanian":"lt","Luxembourgish":"lb","Macedonian":"mk","Malagasy":"mg","Malay":"ms","Malayalam":"ml","Maltese":"mt","Maori":"mi","Marathi":"mr","Mongolian":"mn","Myanmar":"my","Nepali":"ne","Norwegian":"no","Odia":"or","Oriya":"or","Pashto":"ps","Persian":"fa","Polish":"pl","Portuguese":"pt","Punjabi":"pa","Romanian":"ro","Russian":"ru","Samoan":"sm","ScotsGaelic":"gd","Serbian":"sr","Sesotho":"st","Shona":"sn","Sindhi":"sd","Sinhala":"si","Slovak":"sk","Slovenian":"sl","Somali":"so","Spanish":"es","Sundanese":"su","Swahili":"sw","Swedish":"sv","Tajik":"tg","Tamil":"ta","Tatar":"tt","Telugu":"te","Thai":"th","Turkish":"tr","Turkmen":"tk","Ukrainian":"uk","Urdu":"ur","Uyghur":"ug","Uzbek":"uz","Vietnamese":"vi","Welsh":"cy","Xhosa":"xh","Yiddish":"yi","Yoruba":"yo","Zulu":"zu"}
    # BUG FIX: Esperanto was "es" here (that is Spanish); ISO 639-1 Esperanto is "eo".
    argos_languages = {"Afrikaans":"af","Albanian":"sq","Amharic":"am","Arabic":"ar","Armenian":"hy","Azerbaijani":"az","Basque":"eu","Belarusian":"be","Bengali":"bn","Bosnian":"bs","Bulgarian":"bg","Catalan":"ca","Cebuano":"ceb","Chichewa":"ny","Chinese":"zh","Corsican":"co","Croatian":"hr","Czech":"cs","Danish":"da","Dutch":"nl","English":"en","Esperanto":"eo","Estonian":"et","Filipino":"tl","Finnish":"fi","French":"fr","Frisian":"fy","Galician":"gl","Georgian":"ka","German":"de","Greek":"el","Gujarati":"gu","HaitianCreole":"ht","Hausa":"ha","Hawaiian":"haw","Hebrew":"he","Hindi":"hi","Hmong":"hmn","Hungarian":"hu","Icelandic":"is","Igbo":"ig","Indonesian":"id","Irish":"ga","Italian":"it","Japanese":"ja","Javanese":"jv","Kannada":"kn","Kazakh":"kk","Khmer":"km","Kinyarwanda":"rw","Korean":"ko","Kurdish":"ku","Kyrgyz":"ky","Lao":"lo","Latin":"la","Latvian":"lv","Lithuanian":"lt","Luxembourgish":"lb","Macedonian":"mk","Malagasy":"mg","Malay":"ms","Malayalam":"ml","Maltese":"mt","Maori":"mi","Marathi":"mr","Mongolian":"mn","Myanmar":"my","Nepali":"ne","Norwegian":"no","Odia":"or","Oriya":"or","Pashto":"ps","Persian":"fa","Polish":"pl","Portuguese":"pt","Punjabi":"pa","Romanian":"ro","Russian":"ru","Samoan":"sm","ScotsGaelic":"gd","Serbian":"sr","Sesotho":"st","Shona":"sn","Sindhi":"sd","Sinhala":"si","Slovak":"sk","Slovenian":"sl","Somali":"so","Spanish":"es","Sundanese":"su","Swahili":"sw","Swedish":"sv","Tajik":"tg","Tamil":"ta","Tatar":"tt","Telugu":"te","Thai":"th","Turkish":"tr","Turkmen":"tk","Ukrainian":"uk","Urdu":"ur","Uyghur":"ug","Uzbek":"uz","Vietnamese":"vi","Welsh":"cy","Xhosa":"xh","Yiddish":"yi","Yoruba":"yo","Zulu":"zu"}
    if "deep" in translator.lower():
        langs = deepl_languages
    elif 'goo' in translator.lower():
        langs = google_languages
    elif 'ar' in translator.lower():
        langs = argos_languages
    else:
        # BUG FIX: previously `langs` was left unbound here → NameError.
        langs = google_languages
    for lang, code in langs.items():
        if language.lower() in lang.lower():
            return code
    print(f"fail to find the {language} code in translator {translator}")
    return None
|
225
|
+
|
226
|
+
# language = "chinese"
|
227
|
+
# # Example usage:
|
228
|
+
# google_lang_code = get_language_code(language, "google")
|
229
|
+
# deepl_lang_code = get_language_code(language, "deepl")
|
230
|
+
|
231
|
+
# print(f"Google Translate Language Code for '{language}': {google_lang_code}")
|
232
|
+
# print(f"DeepL Translator Language Code for '{language}': {deepl_lang_code}")
|
233
|
+
|
234
|
+
def detect_language(text):
    """
    Detect the language of *text* and return its full English name.

    Very short inputs (< 3 non-space characters) cannot be detected
    reliably, so they fall back to "english".
    """
    if len(text.strip()) < 3:
        print("Error: Input text is too short for language detection.")
        return "english"
    # langdetect returns an ISO 639-1 code; map it back to a full name.
    code = detect(text)
    full_name = search_iso639_fullname(code)
    print(full_name)
    return full_name
|
246
|
+
|
247
|
+
|
248
|
+
# text_to_detect = "Bonjour, comment ça va?"
|
249
|
+
# detected_language = detect_language(text_to_detect)
|
250
|
+
# print("Detected language:", detected_language)
|
251
|
+
|
252
|
+
def load_docx(filename):
    """
    Load a .docx file and return its paragraphs as a list of strings.

    Args:
        filename (str): Path to the .docx file.

    Returns:
        list[str]: One entry per paragraph, in document order.
    """
    document = docx.Document(filename)
    return [paragraph.text for paragraph in document.paragraphs]
|
261
|
+
# # usage
|
262
|
+
# filename = "example.docx" # Change to the path of your .docx file
|
263
|
+
# text = load_docx(filename)
|
264
|
+
# print("Document loaded successfully.")
|
265
|
+
# print("Text from the document:")
|
266
|
+
# print(text)
|
267
|
+
|
268
|
+
def load_pdf(filename, page="all", verbose=False):
    """
    Extract text from a PDF file.

    Parameters:
        filename (str): Path to the PDF file.
        page: "all" (default) to return the text of every page, an int for a
            single page (1-based), or a list / numpy array of ints for
            several pages concatenated in the given order.
        verbose (bool): If True, print the total number of pages.

    Returns:
        str: The extracted text. Missing pages in a list yield "";
        a single missing page yields "Page is not found".
    """
    from PyPDF2 import PdfReader
    import numpy as np

    # Page number (1-based) -> extracted text.
    text_dict = {}
    with open(filename, "rb") as file:
        pdf_reader = PdfReader(file)
        num_pages = len(pdf_reader.pages)
        for page_num in range(num_pages):
            text_dict[page_num + 1] = pdf_reader.pages[page_num].extract_text()
    # Normalize the `page` selector to a list where possible.
    if isinstance(page, int):
        page = [page]
    elif isinstance(page, np.ndarray):
        page = page.tolist()
    if verbose:
        # BUG FIX: previously printed `page_num` (the last 0-based index),
        # under-reporting the count by one.
        print(f"total pages: {num_pages}")
    if isinstance(page, list):
        return "".join(text_dict.get(p, "") for p in page)
    elif "all" in page.lower():
        return "".join(text_dict.values())
    else:
        return text_dict.get(int(page), "Page is not found")
|
319
|
+
|
320
|
+
|
321
|
+
def split_text(text, method="sentence", limit=4500):
    """
    Split text into segments.

    Args:
        text (str): The text to split.
        method (str): "sentence" (regex-based sentence split, URLs kept
            whole), "length" (fixed-size slices of `limit` characters), or
            any other string, which is treated as a literal delimiter.
        limit (int): Slice size for the "length" method.

    Returns:
        list[str]: The segments.
    """
    if "sent" in method.lower():
        # URLs are matched first so their dots don't end a "sentence".
        res = re.findall(r"https?://\S+|[^.!?]+(?:[.!?](?:\s|$))?", text)
        print(f"There are {len(res)} sentences.")
        return res
    elif "len" in method.lower():
        return [text[i : i + limit] for i in range(0, len(text), limit)]
    else:
        # BUG FIX: the old r"\{}".format(method) escaped only the FIRST
        # character — a delimiter like "ab" became "\ab" (BEL + "b").
        # re.escape escapes every metacharacter in the delimiter.
        return re.split(re.escape(method), text)
|
333
|
+
|
334
|
+
def filter_errors(text):
    """Work around strings known to break the translation backends."""
    # handle bugs:
    # bug1: ".com" cannot be translated, but '..com' works
    # NOTE(review): the actual replacement is "..come" (trailing "e"), which
    # does not match the "..com" described above — confirm whether the extra
    # "e" is intentional before changing it.
    text=text.replace(".com", "..come")
    return text
|
339
|
+
|
340
|
+
def merge_text(input, robust=True):
    """
    Convert a list of strings, tuple of strings, or numpy array of strings into a single concatenated string.

    Args:
        input (list, tuple, numpy.ndarray): A list, tuple, or numpy array of strings to be concatenated.
        robust (bool, optional): If True, handles non-supported element types by converting them to string.
            If False, directly converts the whole input to a string. Default is True.

    Returns:
        str: The concatenated string.

    Raises:
        TypeError: If `input` is not a list, tuple, or numpy array.
    """
    supported_types = (list, tuple, np.ndarray)
    if not isinstance(input, supported_types):
        # BUG FIX: the original f-string referenced the undefined name
        # `Input`, raising NameError instead of the intended TypeError.
        raise TypeError(f"input must be one of {supported_types}.")
    if robust:
        # Coerce each element to str, then concatenate.
        return "".join(str(item) for item in input)
    else:
        return str(input)
|
362
|
+
|
363
|
+
def replace_text(text, dict_replace=None, robust=True):
    """
    Replace specified substrings in the input text with provided replacements.

    Args:
        text (str): The input text where replacements will be made.
        dict_replace (dict, optional): Substrings to replace (keys) mapped to
            their replacements (values). Applied before the defaults.
        robust (bool, optional): If True (default), additionally strip control
            characters (\\n, \\t, \\r, ...), stray backslashes, replacement
            glyphs, and collapse double spaces.

    Returns:
        str: The text after replacements have been made.
    """
    # Default cleanup applied when robust=True.
    default_replacements = {
        "\a": "",
        "\b": "",
        "\f": "",
        "\n": "",
        "\r": "",
        "\t": "",
        "\v": "",
        "\\": "",
        # "\?": "",
        "�": "",
        "\\x": "",
        "\\x hhhh": "",
        "\\ ooo": "",
        "\xa0": "",
        "  ": " ",
    }

    # BUG FIX: the original called dict_replace.update(...) on the caller's
    # dict, mutating it as a side effect. Work on a copy instead.
    replacements = dict(dict_replace) if dict_replace else {}
    if robust:
        replacements.update(default_replacements)

    # User-supplied replacements run first (insertion order), then defaults.
    for k, v in replacements.items():
        text = text.replace(k, v)
    return text
|
406
|
+
|
407
|
+
# # usage:
|
408
|
+
# a = "kjkjk (a, b, c)"
|
409
|
+
# replace_text(a, {"(": "", ")": "", " ": " "}, robust=False)
|
410
|
+
|
411
|
+
def merge_strings_every_n(strings_list, n=10):
    """
    Concatenate every *n* consecutive strings of *strings_list*.

    Args:
        strings_list (list[str]): Strings to group and join.
        n (int): Group size; if n <= 0 the list is returned unchanged.

    Returns:
        tuple[list[str], int]: The merged list and the group size used.
    """
    if n <= 0:
        return strings_list, n
    merged = [
        "".join(strings_list[start : start + n])
        for start in range(0, len(strings_list), n)
    ]
    return merged, n
|
420
|
+
|
421
|
+
|
422
|
+
def translate(
    text,
    lang="chinese",
    lang_src=None,
    method=methods(),
    service_urls=DEFAULT_SERVICE_URLS,
    user_agent=user_agent(),
    verbose=True,
    error_verbose=True,
    limit=5000
):
    """
    Translate text to the target language using the selected backend.

    Long inputs (> limit characters) are split into sentence chunks sized by
    auto_chunk_size() and translated one chunk at a time, with a 1-second
    pause between requests to avoid rate limiting.

    Args:
        text (str | list): Text to translate; a list is concatenated first.
        lang (str): Target language name, e.g. "chinese".
        lang_src (str): Source language name; auto-detected when None.
            Must be given when the text mixes two languages.
        method (str): Backend name (see methods()).
        service_urls: Google Translate mirrors to use.
        user_agent (str): User-Agent header value.
            NOTE(review): both `method` and `user_agent` defaults are
            evaluated once at import time, so every call shares the same
            randomly chosen agent unless one is passed explicitly.
        verbose (bool): Print translated text as it is produced.
        error_verbose (bool): Print errors instead of failing silently.
        limit (int): Character threshold above which chunking kicks in.

    Returns:
        str: The translated text, or "" if translation failed.
    """
    if isinstance(text,list):
        text=merge_text(text)
    # Normalize whitespace/control characters before detection & translation.
    text = replace_text(text)
    if lang_src is None:
        lang_src = detect_language(text)
    try:
        if len(text) > limit:
            # Chunked path: translate ~n sentences at a time.
            n=auto_chunk_size(text)
            text_segments = split_by_sent_n(text,n)
            translations = ""
            for segment in tqdm(text_segments,desc='is translating'):
                segment = replace_text(merge_text(segment))
                translated_segment = translate_segment(text=segment, lang=lang, lang_src=lang_src, method=method, user_agent=user_agent,service_urls=service_urls, verbose=verbose,error_verbose=error_verbose
                )
                # Throttle between requests to stay under rate limits.
                time.sleep(1)
                if translated_segment:
                    translations += translated_segment
                else:
                    # Best-effort: keep going, a failed chunk contributes "".
                    print("Error: Translation of one of the segments failed.")
                    translations += ""
            return translations
        else:
            # Short input: translate in a single request.
            return translate_segment(text=text, lang=lang, lang_src=lang_src, method=method, user_agent=user_agent,service_urls=service_urls, verbose=verbose,error_verbose=error_verbose)
    except Exception as e:
        if error_verbose:
            print("(translate)Error during translation :", e)
        return ""
|
464
|
+
|
465
|
+
def translate_segment(
    text,
    lang="chinese",
    lang_src=None,
    method=methods(),
    service_urls=DEFAULT_SERVICE_URLS,
    user_agent=user_agent(),
    verbose=False,
    error_verbose=True
):
    """
    Translate a single text segment using the selected backend.

    Args:
        text (str): Segment to translate (should fit one request, ~5000 chars).
        lang (str): Target language name.
        lang_src (str): Source language name; auto-detected when None.
        method (str): Backend: matches "google", "translate" (LibreTranslate),
            or "argos".
        service_urls: Google Translate mirrors.
        user_agent (str): User-Agent header value.
        verbose (bool): Print the translated text.
        error_verbose (bool): Passed to the retry fallback for error output.

    Returns:
        str: The translated text; on failure, falls back to
        translate_with_retry() and returns its result.
    """
    text_clean = filter_errors(text)
    text_clean = replace_text(text_clean)
    if lang_src is None:
        lang_src = detect_language(text_clean)
    try:
        lang_src = get_language_code(lang_src, 'google')
        lang_tgt = get_language_code(lang, 'google')
        if "goog" in method.lower():
            Trstor = GoogleTranslator(service_urls=service_urls,user_agent=user_agent)
            txt = Trstor.translate(text_clean, src=lang_src, dest=lang_tgt).text
        elif "trans" in method.lower():
            lang_src = get_language_code(lang_src, 'google')
            lang_tgt = get_language_code(lang, 'google')
            translator = TranslateTranslator(from_lang=lang_src,
                                             to_lang=lang_tgt,
                                             provider='LibreTranslate',
                                             secret_access_key=None,
                                             base_url='https://translate.astian.org/')
            txt = translator.translate(text_clean)
        elif 'ar' in method.lower():
            lang_src = get_language_code(language=lang_src, translator="argos")
            lang_tgt = get_language_code(language=lang, translator="argos")
            # NOTE(review): the `import argostranslate` lines are commented out
            # at the top of the file, so this branch raises NameError until
            # they are restored.
            argostranslate.package.update_package_index()
            available_packages = argostranslate.package.get_available_packages()
            package_to_install = next(
                filter(
                    lambda x: x.from_code == lang_src and x.to_code == lang_tgt, available_packages
                )
            )
            argostranslate.package.install_from_path(package_to_install.download())
            # BUG FIX: previously translated the hard-coded string
            # "Hello World" instead of the caller's text.
            txt = argostranslate.translate.translate(text_clean, lang_src, lang_tgt)
        else:
            print("Error: Invalid translation method. supported: 'google' or 'deepl'.")
            return ""
        if verbose:
            print(txt)
        return txt
    except Exception as e:
        # Fall back to the retry helper, which cycles through service URLs.
        txt=translate_with_retry(
            text_clean,
            lang=lang,
            lang_src=lang_src,
            method=method,
            verbose=verbose,
            error_verbose=error_verbose,
            user_agent=user_agent, service_urls=service_urls)
        return txt
|
527
|
+
def translate_with_retry(
    text,
    lang="chinese",
    lang_src=None,
    method=methods(),
    verbose=False,
    error_verbose=True,
    user_agent=user_agent(), service_urls=DEFAULT_SERVICE_URLS):
    """
    Translate a text to the target language, retrying with alternative service URLs on connection errors.

    Tries service_urls[0] first; on any failure waits one second and tries
    service_urls[1]. If both fail, the ORIGINAL text is returned unchanged.

    Args:
        text (str): Text to translate.
        lang (str): Target language name (converted to a Google code).
        lang_src (str): Source language name; auto-detected when None.
        method (str): Accepted for interface parity; only Google Translate
            is actually used here.
        verbose, error_verbose (bool): Accepted for interface parity.
        user_agent (str): User-Agent header value.
        service_urls (tuple): Exactly the first two entries are used.

    Returns:
        str: The translated text, or the untranslated input on total failure.
    """
    # Single attempt against one mirror; wraps all failures in RuntimeError.
    def try_translate(text,lang,lang_src,user_agent,service_url):
        try:
            translator_ = GoogleTranslator(user_agent=user_agent, service_urls=[service_url])
            result = translator_.translate(text, dest=lang, src=lang_src)
            if result and hasattr(result, 'text'):
                return result.text
            else:
                raise ValueError(f"Invalid response from {service_url}: {result}")
        except Exception as e:
            raise RuntimeError(f"Error using {service_url}: {e}")

    if lang_src is None:
        lang_src = detect_language(text)
    lang_src = get_language_code(language=lang_src)
    lang = get_language_code(language=lang)
    # NOTE(review): the two prints below look like leftover debug output.
    print(f"lang:{lang},lang_src:{lang_src}")
    try:
        print(len(text))
        return try_translate(text,lang=lang,lang_src=lang_src,user_agent=user_agent,service_url=service_urls[0])
    except Exception as e:
        print("Connection error:", e)
        try:
            # Brief back-off before the fallback mirror.
            time.sleep(1)
            return try_translate(text,lang=lang,lang_src=lang_src,user_agent=user_agent,service_url=service_urls[1])
        except Exception as e:
            print(f"(translate_with_retry):Connection error with {service_urls}: {e}")
            print("All service URLs failed. Unable to translate the text.")
            return text
|
566
|
+
|
567
|
+
|
568
|
+
def trans_docx(filename, lang="english", lang_src=None, method=methods(),service_urls=[
    'translate.google.de'],verbose=False):
    """
    Load a .docx file and translate each paragraph into the target language.

    Paragraphs shorter than 3 non-space characters are skipped.

    Args:
        filename (str): Path to the .docx file.
        lang (str): Target language name.
        lang_src (str): Source language name; auto-detected when None.
        method (str): Translation backend (see methods()).
        service_urls (list): Google Translate mirrors.
        verbose (bool): If True, print each translated paragraph.

    Returns:
        list[str] | None: Translated paragraphs, or None if nothing was
        translated.
    """
    paragraphs = load_docx(filename)
    trans_text = []
    for paragraph in paragraphs:
        if len(paragraph.strip()) < 3:
            continue  # skip empty / near-empty paragraphs
        # (removed the original dead code: j=""; i.join(j) — str.join on an
        # empty iterable returns "" and the result was discarded)
        trans_text.append(
            translate(paragraph, lang=lang, lang_src=lang_src, method=method,
                      service_urls=service_urls, verbose=verbose)
        )
    if trans_text:
        return trans_text
    else:
        return None
|
593
|
+
|
594
|
+
def trans_pdf(filename, page="all",lang="english", lang_src=None, method="google",service_urls=[
    'translate.google.de'],verbose=False):
    """
    Load a PDF file and translate its text into the target language.

    Args:
        filename (str): Path to the PDF file.
        page: Page selector passed to load_pdf ("all", int, or list of ints).
        lang (str): Target language name.
        lang_src (str): Source language name; auto-detected when None.
        method (str): Translation backend.
        service_urls (list): Google Translate mirrors.
        verbose (bool): Passed to load_pdf (prints page count).

    Returns:
        str: The translated text.
    """
    raw_text = load_pdf(filename, page=page, verbose=verbose)
    translated = translate(raw_text, lang=lang, lang_src=lang_src,
                           method=method, service_urls=service_urls, verbose=False)
    return translated
|
604
|
+
|
605
|
+
|
606
|
+
def save_content(fpath, content):
    """
    Save content to a text file.

    Parameters:
        fpath (str): The file path where content will be saved.
        content (str): The content to be saved.

    Returns:
        None
    """
    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) would
    # raise or garble non-ASCII translated text.
    with open(fpath, "w", encoding="utf-8") as file:
        file.write(content)
|
619
|
+
|
620
|
+
def save_file(fpath, content, kind=None, font_name="Arial", font_size=10,spacing=6):
    """
    Save content into a file with specified file type and formatting.

    Parameters:
        fpath (str): The file path where content will be saved.
        content (list of str | str): Content to save; each string is a paragraph.
        kind (str): File type: 'docx', 'txt', 'md', 'html' or 'pdf', with or
            without the leading dot. Inferred from fpath's extension when None.
        font_name (str): Font name ('docx', 'html' and 'pdf' only).
        font_size (int): Font size ('docx', 'html' and 'pdf' only).
        spacing (int): Paragraph spacing in points ('docx' only).

    Returns:
        None

    Raises:
        ValueError: If the resolved kind is not a supported file type.
    """
    file_types = [".docx", ".txt", ".md", ".html", ".pdf"]
    if kind is None:
        # Infer the type from the path's extension.
        _, kind = os.path.splitext(fpath)
    kind = kind.lower()
    # BUG FIX: the docstring advertises dot-less names ('docx'), but the
    # membership test below requires the dot — accept both forms.
    if not kind.startswith("."):
        kind = "." + kind
    if kind not in file_types:
        raise ValueError(f"Error:\n{kind} is not in the supported list {file_types}")
    # BUG FIX: a bare string used to reach the txt/md/html/pdf branches and
    # be iterated character by character; wrap it as a single paragraph
    # (docx keeps the original sentence-splitting behavior).
    if isinstance(content, str):
        content = split_text(content, "sentence") if kind == ".docx" else [content]
    if kind == ".docx":
        doc = docx.Document()
        for i, paragraph_text in enumerate(content):
            paragraph = doc.add_paragraph()
            run = paragraph.add_run(paragraph_text)
            font = run.font
            font.name = font_name
            font.size = docx.shared.Pt(font_size)
            if i != len(content) - 1:  # spacing between paragraphs, none after the last
                paragraph.space_after = docx.shared.Pt(spacing)
        doc.save(fpath)
    elif kind == ".txt":
        save_content(fpath, "\n".join(content))
    elif kind == ".md":
        save_content(fpath, "\n\n".join(content))
    elif kind == ".html":
        html_content = "<html><body>"
        for paragraph_text in content:
            html_content += f'<p style="font-family:{font_name}; font-size:{font_size}px;">{paragraph_text}</p>'
        html_content += "</body></html>"
        save_content(fpath, html_content)
    elif kind == ".pdf":
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font(font_name, size=font_size)
        for paragraph_text in content:
            pdf.cell(200, 10, txt=paragraph_text, ln=True)
        # BUG FIX: classic fpdf's output(dest="S") returns a latin-1 str (the
        # original comment even said so); encoding it as UTF-8 corrupted any
        # byte >= 0x80 in the binary PDF stream.
        pdf_bytes = pdf.output(dest="S").encode("latin-1")
        with open(fpath, "wb") as file:
            file.write(pdf_bytes)
|
679
|
+
|
680
|
+
|
681
|
+
# if __name__ == "__main__":
|
682
|
+
# text_to_translate = "Hello, how are you?"
|
683
|
+
# lang = "chinese"
|
684
|
+
# translated_text = translate(text_to_translate, lang=lang)
|
685
|
+
# print(f"Detected language:{detected_language} \ntranslated into {lang}")
|
686
|
+
# print("Translated text:\n", translated_text)
|