py2ls 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. py2ls/.git/COMMIT_EDITMSG +1 -0
  2. py2ls/.git/FETCH_HEAD +1 -0
  3. py2ls/.git/HEAD +1 -0
  4. py2ls/.git/config +15 -0
  5. py2ls/.git/description +1 -0
  6. py2ls/.git/hooks/applypatch-msg.sample +15 -0
  7. py2ls/.git/hooks/commit-msg.sample +24 -0
  8. py2ls/.git/hooks/fsmonitor-watchman.sample +174 -0
  9. py2ls/.git/hooks/post-update.sample +8 -0
  10. py2ls/.git/hooks/pre-applypatch.sample +14 -0
  11. py2ls/.git/hooks/pre-commit.sample +49 -0
  12. py2ls/.git/hooks/pre-merge-commit.sample +13 -0
  13. py2ls/.git/hooks/pre-push.sample +53 -0
  14. py2ls/.git/hooks/pre-rebase.sample +169 -0
  15. py2ls/.git/hooks/pre-receive.sample +24 -0
  16. py2ls/.git/hooks/prepare-commit-msg.sample +42 -0
  17. py2ls/.git/hooks/push-to-checkout.sample +78 -0
  18. py2ls/.git/hooks/update.sample +128 -0
  19. py2ls/.git/index +0 -0
  20. py2ls/.git/info/exclude +6 -0
  21. py2ls/.git/logs/HEAD +1 -0
  22. py2ls/.git/logs/refs/heads/main +1 -0
  23. py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
  24. py2ls/.git/logs/refs/remotes/origin/main +1 -0
  25. py2ls/.git/objects/25/b796accd261b9135fd32a2c00785f68edf6c46 +0 -0
  26. py2ls/.git/objects/36/b4a1b7403abc6c360f8fe2cb656ab945254971 +0 -0
  27. py2ls/.git/objects/3f/d6561300938afbb3d11976cf9c8f29549280d9 +0 -0
  28. py2ls/.git/objects/58/20a729045d4dc7e37ccaf8aa8eec126850afe2 +0 -0
  29. py2ls/.git/objects/60/f273eb1c412d916fa3f11318a7da7a9911b52a +0 -0
  30. py2ls/.git/objects/61/570cec8c061abe74121f27f5face6c69b98f99 +0 -0
  31. py2ls/.git/objects/69/13c452ca319f7cbf6a0836dc10a5bb033c84e4 +0 -0
  32. py2ls/.git/objects/78/3d4167bc95c9d2175e0df03ef1c1c880ba75ab +0 -0
  33. py2ls/.git/objects/79/7ae089b2212a937840e215276005ce76881307 +0 -0
  34. py2ls/.git/objects/7e/5956c806b5edc344d46dab599dec337891ba1f +1 -0
  35. py2ls/.git/objects/8e/55a7d2b96184030211f20c9b9af201eefcac82 +0 -0
  36. py2ls/.git/objects/91/c69ad88fe0ba94aa7859fb5f7edac5e6f1a3f7 +0 -0
  37. py2ls/.git/objects/b0/56be4be89ba6b76949dd641df45bb7036050c8 +0 -0
  38. py2ls/.git/objects/b0/9cd7856d58590578ee1a4f3ad45d1310a97f87 +0 -0
  39. py2ls/.git/objects/d9/005f2cc7fc4e65f14ed5518276007c08cf2fd0 +0 -0
  40. py2ls/.git/objects/df/e0770424b2a19faf507a501ebfc23be8f54e7b +0 -0
  41. py2ls/.git/objects/e9/391ffe371f1cc43b42ef09b705d9c767c2e14f +0 -0
  42. py2ls/.git/objects/fc/292e793ecfd42240ac43be407023bd731fa9e7 +0 -0
  43. py2ls/.git/refs/heads/main +1 -0
  44. py2ls/.git/refs/remotes/origin/HEAD +1 -0
  45. py2ls/.git/refs/remotes/origin/main +1 -0
  46. py2ls/.gitattributes +2 -0
  47. py2ls/.gitignore +152 -0
  48. py2ls/LICENSE +201 -0
  49. py2ls/README.md +409 -0
  50. py2ls/__init__.py +17 -0
  51. py2ls/brain_atlas.py +145 -0
  52. py2ls/correlators.py +475 -0
  53. py2ls/dbhandler.py +97 -0
  54. py2ls/freqanalysis.py +800 -0
  55. py2ls/internet_finder.py +405 -0
  56. py2ls/ips.py +2844 -0
  57. py2ls/netfinder.py +780 -0
  58. py2ls/sleep_events_detectors.py +1350 -0
  59. py2ls/translator.py +686 -0
  60. py2ls/version.py +1 -0
  61. py2ls/wb_detector.py +169 -0
  62. py2ls-0.1.0.dist-info/METADATA +12 -0
  63. py2ls-0.1.0.dist-info/RECORD +64 -0
  64. py2ls-0.1.0.dist-info/WHEEL +4 -0
py2ls/translator.py ADDED
@@ -0,0 +1,686 @@
1
+ import re
2
+ import docx # pip install python-docx
3
+ from PyPDF2 import PdfReader
4
+ from langdetect import detect
5
+ from googletrans import Translator as GoogleTranslator
6
+ import os
7
+ import docx
8
+ from fpdf import FPDF
9
+ import random
10
+ import time
11
+ from translate import Translator as TranslateTranslator
12
+ import numpy as np
13
+ from nltk.tokenize import sent_tokenize
14
+ from itertools import pairwise
15
+ from tqdm import tqdm
16
+
17
+
18
def split_by_sent_n(text, n=10):
    """Split *text* into chunks of n consecutive sentences.

    Args:
        text (str): text to be chunked.
        n (int): number of sentences per chunk.

    Returns:
        list[list[str]]: sentence chunks in original order; each inner list
        holds up to n sentences.
    """
    sentences = sent_tokenize(text)
    # Cut positions every n sentences; append the final boundary when the
    # sentence count is not an exact multiple of n.
    boundaries = np.arange(0, len(sentences), n)
    if boundaries[-1] != len(sentences):
        boundaries = np.append(boundaries, len(sentences))
    return [sentences[start:stop] for start, stop in pairwise(boundaries)]
def account_letters(text, n=10):
    """Return the total character count of the first n-sentence chunk of *text*.

    Used by auto_chunk_size() to estimate how many sentences fit under a
    translation service's character limit.

    Args:
        text (str): the text to measure.
        n (int): chunk size in sentences.

    Returns:
        numpy integer: summed sentence lengths of the first chunk.
    """
    # Idiom fix: the original built a list via a comprehension executed purely
    # for its side effect (len_.append(...)); sum the lengths directly instead.
    return np.sum([len(sentence) for sentence in split_by_sent_n(text, n)[0]])
def auto_chunk_size(txt, verbose=False):
    """Choose a chunk size (in sentences) whose first chunk is 4000-4700 chars.

    Scans candidate sizes 1..49 and returns the first one whose leading chunk
    lands inside the (4000, 4700) character window — a size that stays safely
    under the translation service's ~5000-character request limit.

    Args:
        txt (str): text to be chunked.
        verbose (bool): print the chosen size.

    Returns:
        int: sentences per chunk (1 when no candidate fits the window; the
        original raised IndexError on an empty result list in that case).
    """
    # Idiom fix: the original used `while ...: ... break`, which is just an
    # `if`, and collected results in a list only to return the first element.
    for size in range(1, 50):
        if 4000 < account_letters(txt, n=size) < 4700:
            if verbose:
                print(f"the optimal chunk_size is {size} sentences")
            return size
    # Robustness: fall back to one sentence per chunk instead of crashing.
    return 1
+ # import pathlib
45
+ # import argostranslate.package
46
+ # import argostranslate.translate
47
def get_lang_code_iso639():
    """Scrape Wikipedia's ISO 639 table into {full language name: 2-letter code}.

    Network helper used once to produce the hard-coded lang_code_iso639
    mapping; requires the project's netfinder module.
    """
    from JFL import netfinder
    url = "https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes"
    # res=netfinder.fetch(url,where="table",what="wikitable sortable jquery-tablesorter")
    rows = netfinder.fetch(url, where="tr", extend=0)
    pairs = []
    for idx in range(6, len(rows) - 2):
        # Heuristic for a (name, code) pair: the name cell is longer than the
        # code cell after it, and the two following code cells share a
        # 2-character prefix.
        if len(rows[idx]) > len(rows[idx + 1]) and rows[idx + 1][:2] == rows[idx + 2][:2]:
            pairs.append((rows[idx], rows[idx + 1]))
    return dict(pairs)
+
60
+ # get_lang_code_iso639()
61
# ISO 639-1 mapping of full language names to 2-letter codes.
# Hard-coded snapshot of get_lang_code_iso639() output so the module does not
# need network access at import time. Consumed by search_iso639_fullname().
lang_code_iso639={'Abkhazian': 'ab',
    'Afar': 'aa',
    'Afrikaans': 'af',
    'Akan': 'ak',
    'Albanian': 'sq',
    'Amharic': 'am',
    'Arabic': 'ar',
    'Armenian': 'hy',
    'Assamese': 'as',
    # 'Avaric': 'av',
    'Aymara': 'ay',
    'Azerbaijani': 'az',
    'Bashkir': 'ba',
    'Basque': 'eu',
    'Belarusian': 'be',
    'Bislama': 'bi',
    'Breton': 'br',
    'Burmese': 'my',
    'Catalan, Valencian': 'ca',
    'Chamorro': 'ch',
    'Chichewa, Chewa, Nyanja': 'ny',
    'Chinese': 'zh',
    'Corsican': 'co',
    'Cree': 'cr',
    'Croatian': 'hr',
    'Danish': 'da',
    'Dutch, Flemish': 'nl',
    'Dzongkha': 'dz',
    'English': 'en',
    'Finnish': 'fi',
    'French': 'fr',
    'Galician': 'gl',
    'Georgian': 'ka',
    'German': 'de',
    'Greek, Modern (1453–)': 'el',
    'Gujarati': 'gu',
    'Hausa': 'ha',
    'Hebrew': 'he',
    'Hindi': 'hi',
    'Hungarian': 'hu',
    'Icelandic': 'is',
    'Italian': 'it',
    'Kikuyu, Gikuyu': 'ki',
    'Korean': 'ko',
    'Kurdish': 'ku',
    'Latin': 'la',
    'Limburgan, Limburger, Limburgish': 'li',
    'Luba-Katanga': 'lu',
    'Macedonian': 'mk',
    'Malay': 'ms',
    'Nauru': 'na',
    'North Ndebele': 'nd',
    'Nepali': 'ne',
    'Norwegian': 'no',
    'Norwegian Nynorsk': 'nn',
    'Sichuan Yi, Nuosu': 'ii',
    'Occitan': 'oc',
    'Ojibwa': 'oj',
    'Oriya': 'or',
    'Ossetian, Ossetic': 'os',
    'Persian': 'fa',
    'Punjabi, Panjabi': 'pa',
    'Quechua': 'qu',
    'Romanian, Moldavian, Moldovan': 'ro',
    'Russian': 'ru',
    'Samoan': 'sm',
    'Sanskrit': 'sa',
    'Serbian': 'sr',
    'Shona': 'sn',
    'Sinhala, Sinhalese': 'si',
    'Slovenian': 'sl',
    'Somali': 'so',
    'Sundanese': 'su',
    'Swahili': 'sw',
    'Swati': 'ss',
    'Tajik': 'tg',
    'Tamil': 'ta',
    'Telugu': 'te',
    'Thai': 'th',
    'Tibetan': 'bo',
    'Tigrinya': 'ti',
    'Tonga (Tonga Islands)': 'to',
    'Tsonga': 'ts',
    'Twi': 'tw',
    'Ukrainian': 'uk',
    'Urdu': 'ur',
    'Uzbek': 'uz',
    'Venda': 've',
    'Vietnamese': 'vi',
    'Volapük': 'vo',
    'Welsh': 'cy',
    'Wolof': 'wo',
    'Xhosa': 'xh',
    'Yiddish': 'yi',
    'Yoruba': 'yo',
    'Zulu': 'zu'}
def search_iso639_fullname(val):
    """Return the full language name for a 2-letter ISO 639-1 code.

    Args:
        val (str): two-letter code, e.g. 'de'.

    Returns:
        str or None: the matching full name, or None for an unknown code.
    """
    # Bug fix: the original tested `'de' in v` — a hard-coded literal instead
    # of the argument — so every lookup returned 'German'.
    for fullname, code in lang_code_iso639.items():
        if code == val:
            return fullname
    return None
def methods(idx=0):
    """Return the name of the idx-th supported translation backend.

    Args:
        idx (int): backend index (0 = Google default, 1 = DeepL, 2 = Argos).
    """
    supported = ("GoogleTrans (default)", 'DeepL', 'Argos')
    return supported[idx]
# Google Translate mirrors tried in order; translate_with_retry() falls back
# from the first to the second on connection errors.
DEFAULT_SERVICE_URLS = ('translate.google.de','translate.google.fr')
def user_agent():
    """Return one browser User-Agent string chosen at random.

    Rotating the User-Agent makes repeated translation requests look less
    bot-like to the service.
    """
    candidates = [
        # Windows (Intel)
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
        # Windows (ARM)
        "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
        # Linux (x86_64)
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
        # macOS (Intel)
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
        # macOS (ARM)
        "Mozilla/5.0 (Macintosh; ARM Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; ARM Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
        # iOS Devices
        "Mozilla/5.0 (iPad; CPU OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
        # Android Devices
        "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Mobile Safari/537.36",
        # Smart TVs
        "Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/537.36 (KHTML, like Gecko) SmartTV/1.0",
        "Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/537.36 (KHTML, like Gecko) WebAppManager/1.0",
        # Game Consoles
        "Mozilla/5.0 (PlayStation 5 3.01) AppleWebKit/605.1.15 (KHTML, like Gecko)",
        "Mozilla/5.0 (Xbox One 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edge/44.18363.8740",
    ]
    return random.choice(candidates)
def get_language_code(language, translator="google"):
    """
    Get the language code used by a translation service.

    Args:
        language (str): full or partial language name, e.g. 'chinese'
            (matched case-insensitively as a substring of the table key).
        translator (str): service name; substrings 'deep', 'goo' and 'ar'
            select the DeepL, Google and Argos tables respectively.

    Returns:
        str or None: the service-specific code, or None when the language is
        not found in the selected table.
    """
    deepl_languages = {"English":"EN","German":"DE","French":"FR","Spanish":"ES","Italian":"IT","Dutch":"NL","Polish":"PL","Russian":"RU","Japanese":"JA","Chinese":"ZH",}
    google_languages = {"Afrikaans":"af","Albanian":"sq","Amharic":"am","Arabic":"ar","Armenian":"hy","Azerbaijani":"az","Basque":"eu","Belarusian":"be","Bengali":"bn","Bosnian":"bs","Bulgarian":"bg","Catalan":"ca","Cebuano":"ceb","Chichewa":"ny","Chinese":"zh-CN","Corsican":"co","Croatian":"hr","Czech":"cs","Danish":"da","Dutch":"nl","English":"en","Esperanto":"eo","Estonian":"et","Filipino":"tl","Finnish":"fi","French":"fr","Frisian":"fy","Galician":"gl","Georgian":"ka","German":"de","Greek":"el","Gujarati":"gu","HaitianCreole":"ht","Hausa":"ha","Hawaiian":"haw","Hebrew":"he","Hindi":"hi","Hmong":"hmn","Hungarian":"hu","Icelandic":"is","Igbo":"ig","Indonesian":"id","Irish":"ga","Italian":"it","Japanese":"ja","Javanese":"jv","Kannada":"kn","Kazakh":"kk","Khmer":"km","Kinyarwanda":"rw","Korean":"ko","Kurdish":"ku","Kyrgyz":"ky","Lao":"lo","Latin":"la","Latvian":"lv","Lithuanian":"lt","Luxembourgish":"lb","Macedonian":"mk","Malagasy":"mg","Malay":"ms","Malayalam":"ml","Maltese":"mt","Maori":"mi","Marathi":"mr","Mongolian":"mn","Myanmar":"my","Nepali":"ne","Norwegian":"no","Odia":"or","Oriya":"or","Pashto":"ps","Persian":"fa","Polish":"pl","Portuguese":"pt","Punjabi":"pa","Romanian":"ro","Russian":"ru","Samoan":"sm","ScotsGaelic":"gd","Serbian":"sr","Sesotho":"st","Shona":"sn","Sindhi":"sd","Sinhala":"si","Slovak":"sk","Slovenian":"sl","Somali":"so","Spanish":"es","Sundanese":"su","Swahili":"sw","Swedish":"sv","Tajik":"tg","Tamil":"ta","Tatar":"tt","Telugu":"te","Thai":"th","Turkish":"tr","Turkmen":"tk","Ukrainian":"uk","Urdu":"ur","Uyghur":"ug","Uzbek":"uz","Vietnamese":"vi","Welsh":"cy","Xhosa":"xh","Yiddish":"yi","Yoruba":"yo","Zulu":"zu"}
    argos_languages = {"Afrikaans":"af","Albanian":"sq","Amharic":"am","Arabic":"ar","Armenian":"hy","Azerbaijani":"az","Basque":"eu","Belarusian":"be","Bengali":"bn","Bosnian":"bs","Bulgarian":"bg","Catalan":"ca","Cebuano":"ceb","Chichewa":"ny","Chinese":"zh","Corsican":"co","Croatian":"hr","Czech":"cs","Danish":"da","Dutch":"nl","English":"en","Esperanto":"es","Estonian":"et","Filipino":"tl","Finnish":"fi","French":"fr","Frisian":"fy","Galician":"gl","Georgian":"ka","German":"de","Greek":"el","Gujarati":"gu","HaitianCreole":"ht","Hausa":"ha","Hawaiian":"haw","Hebrew":"he","Hindi":"hi","Hmong":"hmn","Hungarian":"hu","Icelandic":"is","Igbo":"ig","Indonesian":"id","Irish":"ga","Italian":"it","Japanese":"ja","Javanese":"jv","Kannada":"kn","Kazakh":"kk","Khmer":"km","Kinyarwanda":"rw","Korean":"ko","Kurdish":"ku","Kyrgyz":"ky","Lao":"lo","Latin":"la","Latvian":"lv","Lithuanian":"lt","Luxembourgish":"lb","Macedonian":"mk","Malagasy":"mg","Malay":"ms","Malayalam":"ml","Maltese":"mt","Maori":"mi","Marathi":"mr","Mongolian":"mn","Myanmar":"my","Nepali":"ne","Norwegian":"no","Odia":"or","Oriya":"or","Pashto":"ps","Persian":"fa","Polish":"pl","Portuguese":"pt","Punjabi":"pa","Romanian":"ro","Russian":"ru","Samoan":"sm","ScotsGaelic":"gd","Serbian":"sr","Sesotho":"st","Shona":"sn","Sindhi":"sd","Sinhala":"si","Slovak":"sk","Slovenian":"sl","Somali":"so","Spanish":"es","Sundanese":"su","Swahili":"sw","Swedish":"sv","Tajik":"tg","Tamil":"ta","Tatar":"tt","Telugu":"te","Thai":"th","Turkish":"tr","Turkmen":"tk","Ukrainian":"uk","Urdu":"ur","Uyghur":"ug","Uzbek":"uz","Vietnamese":"vi","Welsh":"cy","Xhosa":"xh","Yiddish":"yi","Yoruba":"yo","Zulu":"zu"}
    # NOTE(review): argos "Esperanto":"es" collides with Spanish — looks like a
    # typo for "eo"; left as-is pending confirmation.
    service = translator.lower()
    if "deep" in service:
        langs = deepl_languages
    elif 'goo' in service:
        langs = google_languages
    elif 'ar' in service:
        langs = argos_languages
    else:
        # Bug fix: an unrecognized translator previously left `langs` unbound
        # and raised NameError; fall back to the Google table instead.
        langs = google_languages
    for lang, code in langs.items():
        if language.lower() in lang.lower():
            return code
    print(f"fail to find the {language} code in translator {translator}")
    return None
+
226
+ # language = "chinese"
227
+ # # Example usage:
228
+ # google_lang_code = get_language_code(language, "google")
229
+ # deepl_lang_code = get_language_code(language, "deepl")
230
+
231
+ # print(f"Google Translate Language Code for '{language}': {google_lang_code}")
232
+ # print(f"DeepL Translator Language Code for '{language}': {deepl_lang_code}")
233
+
234
def detect_language(text):
    """
    Detect the language of *text* and return its full English name.

    Falls back to "english" when the input is too short (< 3 non-space
    characters) for reliable detection.
    """
    if len(text.strip()) < 3:
        print("Error: Input text is too short for language detection.")
        return "english"
    iso_code = detect(text)
    fullname = search_iso639_fullname(iso_code)
    print(fullname)
    return fullname
+
247
+
248
+ # text_to_detect = "Bonjour, comment ça va?"
249
+ # detected_language = detect_language(text_to_detect)
250
+ # print("Detected language:", detected_language)
251
+
252
def load_docx(filename):
    """
    Load a .docx file and return its paragraphs as a list of strings.

    Args:
        filename (str): path to the .docx file.

    Returns:
        list[str]: one entry per paragraph, in document order.
    """
    document = docx.Document(filename)
    return [paragraph.text for paragraph in document.paragraphs]
+ # # usage
262
+ # filename = "example.docx" # Change to the path of your .docx file
263
+ # text = load_docx(filename)
264
+ # print("Document loaded successfully.")
265
+ # print("Text from the document:")
266
+ # print(text)
267
+
268
def load_pdf(filename, page="all", verbose=False):
    """
    Extract text from a PDF file.

    Args:
        filename (str): path to the PDF file.
        page: "all" (default) to return the text of every page concatenated;
            an int for a single 1-based page number; or a list / numpy array
            of ints for several pages concatenated.
        verbose (bool): print the total number of pages processed.

    Returns:
        str: the extracted text ("" for requested pages that do not exist),
        or "Page is not found" for an unknown single page given as a string.
    """
    from PyPDF2 import PdfReader
    import numpy as np

    # Extract every page once; keys are 1-based page numbers.
    text_dict = {}
    with open(filename, "rb") as file:
        pdf_reader = PdfReader(file)
        num_pages = len(pdf_reader.pages)
        for page_num in range(num_pages):
            text_dict[page_num + 1] = pdf_reader.pages[page_num].extract_text()
    # Normalize the page selector to a list where possible.
    if isinstance(page, int):
        page = [page]
    elif isinstance(page, np.ndarray):
        page = page.tolist()
    if verbose:
        # Bug fix: the original printed `page_num`, the last 0-based index
        # (i.e. num_pages - 1), not the actual page count.
        print(f"total pages: {num_pages}")
    if isinstance(page, list):
        return "".join(text_dict.get(p, "") for p in page)
    elif "all" in page.lower():
        return "".join(text_dict.values())
    else:
        return text_dict.get(int(page), "Page is not found")
def split_text(text, method="sentence", limit=4500):
    """
    Split text into segments by sentence boundaries, fixed length, or a
    custom delimiter.

    Args:
        text (str): the text to split.
        method (str): contains 'sent' → sentence split (URLs kept whole);
            contains 'len' → fixed-size slices of *limit* characters;
            anything else is treated as a literal delimiter for re.split.
        limit (int): slice size for the length-based mode.

    Returns:
        list[str]: the segments.
    """
    mode = method.lower()
    if "sent" in mode:
        # Keep URLs intact; otherwise cut after ., ! or ? plus trailing space.
        segments = re.findall(r"https?://\S+|[^.!?]+(?:[.!?](?:\s|$))?", text)
        print(f"There are {len(segments)} sentences.")
        return segments
    if "len" in mode:
        return [text[start : start + limit] for start in range(0, len(text), limit)]
    return re.split(r"\{}".format(method), text)
def filter_errors(text):
    """Work around known translation-service quirks.

    The service rejects the literal ".com" while "..come" passes through,
    so it is rewritten before the request is sent.
    """
    return text.replace(".com", "..come")
def merge_text(input, robust=True):
    """
    Concatenate a list, tuple, or numpy array of strings into one string.

    Args:
        input (list, tuple, numpy.ndarray): items to be concatenated.
        robust (bool, optional): if True, coerce each item to str before
            joining; if False, return str(input) directly. Default is True.

    Returns:
        str: the concatenated string.

    Raises:
        TypeError: when *input* is not a list, tuple, or numpy array.
    """
    supported_types = (list, tuple, np.ndarray)
    if not isinstance(input, supported_types):
        # Bug fix: the original f-string interpolated the undefined name
        # `Input` (capital I), so callers got a NameError instead of the
        # intended TypeError.
        raise TypeError(f"{input} must be {supported_types}.")
    if robust:
        # Coerce each element to str, then concatenate.
        return "".join(str(item) for item in input)
    else:
        return str(input)
def replace_text(text, dict_replace=None, robust=True):
    """
    Apply substring replacements to *text*.

    Args:
        text (str): the input text.
        dict_replace (dict, optional): {old: new} replacements, applied in
            insertion order. Defaults to None (no custom replacements).
        robust (bool, optional): when True (default), also strip a built-in
            set of control / escape / whitespace artifacts (newlines, tabs,
            backslashes, replacement characters, non-breaking spaces, ...).

    Returns:
        str: the text after all replacements have been applied.
    """
    # Built-in clean-up table used when robust=True.
    default_replacements = {
        "\a": "",
        "\b": "",
        "\f": "",
        "\n": "",
        "\r": "",
        "\t": "",
        "\v": "",
        "\\": "",
        # "\?": "",
        "�": "",
        "\\x": "",
        "\\x hhhh": "",
        "\\ ooo": "",
        "\xa0": "",
        " ": " ",
    }
    if dict_replace is None:
        dict_replace = {}
    if robust:
        # Custom entries first, defaults appended after (insertion order).
        dict_replace.update(default_replacements)
    for old, new in dict_replace.items():
        text = text.replace(old, new)
    return text
+ # # usage:
408
+ # a = "kjkjk (a, b, c)"
409
+ # replace_text(a, {"(": "", ")": "", " ": " "}, robust=False)
410
+
411
def merge_strings_every_n(strings_list, n=10):
    """
    Join every run of n consecutive strings into one.

    Args:
        strings_list (list[str]): strings to merge.
        n (int): run length; when n <= 0 the input is returned unchanged.

    Returns:
        tuple: (merged list, n).
    """
    if n <= 0:
        return strings_list, n
    merged = [
        "".join(strings_list[start : start + n])
        for start in range(0, len(strings_list), n)
    ]
    return merged, n
def translate(
    text,
    lang="chinese",
    lang_src=None,
    method=methods(),
    service_urls=DEFAULT_SERVICE_URLS,
    user_agent=user_agent(),
    verbose=True,
    error_verbose=True,
    limit=5000
):
    """
    Translate text into the target language, chunking long inputs.

    Args:
        text (str or list): text to translate; a list is concatenated first.
        lang (str): target language name, e.g. "chinese".
        lang_src (str, optional): source language name; auto-detected when
            None. Must be given when the text mixes two languages.
        method (str): translation backend name (see methods()).
        service_urls (tuple): Google Translate mirrors to use.
        user_agent (str): User-Agent header sent with requests.
        verbose (bool): print translated segments.
        error_verbose (bool): print error details.
        limit (int): character threshold above which the text is split into
            sentence chunks and translated piecewise.

    Returns:
        str: the translated text, or "" when translation fails.
    """
    # NOTE(review): method=methods() and user_agent=user_agent() are evaluated
    # once at import time, so all calls share the same default values.
    if isinstance(text,list):
        text=merge_text(text)
    text = replace_text(text)  # strip control characters the service rejects
    if lang_src is None:
        lang_src = detect_language(text)
    try:
        if len(text) > limit:
            # Long input: choose a chunk size whose chunks stay under the
            # service's request limit, then translate chunk by chunk.
            n=auto_chunk_size(text)
            text_segments = split_by_sent_n(text,n)
            translations = ""
            for segment in tqdm(text_segments,desc='is translating'):
                # Each chunk is a list of sentences; flatten and re-clean it.
                segment = replace_text(merge_text(segment))
                translated_segment = translate_segment(text=segment, lang=lang, lang_src=lang_src, method=method, user_agent=user_agent,service_urls=service_urls, verbose=verbose,error_verbose=error_verbose
                )
                time.sleep(1)  # throttle between requests to avoid rate limits
                if translated_segment:
                    translations += translated_segment
                else:
                    # Best-effort: keep going when a single chunk fails.
                    print("Error: Translation of one of the segments failed.")
                    translations += ""
            return translations
        else:
            return translate_segment(text=text, lang=lang, lang_src=lang_src, method=method, user_agent=user_agent,service_urls=service_urls, verbose=verbose,error_verbose=error_verbose)
    except Exception as e:
        if error_verbose:
            print("(translate)Error during translation :", e)
        return ""
def translate_segment(
    text,
    lang="chinese",
    lang_src=None,
    method=methods(),
    service_urls=DEFAULT_SERVICE_URLS,
    user_agent=user_agent(),
    verbose=False,
    error_verbose=True
):
    """
    Translate a single text segment with the selected backend.

    Args:
        text (str): segment to translate.
        lang (str): target language name.
        lang_src (str, optional): source language name; auto-detected when None.
        method (str): backend selector — 'goog*' for googletrans, '*trans*'
            for the `translate` package (LibreTranslate), '*ar*' for Argos.
        service_urls (tuple): Google Translate mirrors.
        user_agent (str): User-Agent header for the request.
        verbose (bool): print the translated text.
        error_verbose (bool): forwarded to the retry fallback.

    Returns:
        str: translated text, "" for an unknown method; on any exception the
        call falls back to translate_with_retry().
    """
    text_clean = filter_errors(text)
    text_clean = replace_text(text_clean)
    if lang_src is None:
        lang_src = detect_language(text_clean)
    try:
        lang_src = get_language_code(lang_src, 'google')
        lang_tgt = get_language_code(lang, 'google')
        if "goog" in method.lower():
            Trstor = GoogleTranslator(service_urls=service_urls,user_agent=user_agent)
            txt = Trstor.translate(text_clean, src=lang_src, dest=lang_tgt).text
        elif "trans" in method.lower():
            lang_src = get_language_code(lang_src, 'google')
            lang_tgt = get_language_code(lang, 'google')
            translator = TranslateTranslator(from_lang=lang_src,
                                             to_lang=lang_tgt,
                                             provider='LibreTranslate',
                                             secret_access_key=None,
                                             base_url='https://translate.astian.org/')
            txt = translator.translate(text_clean)
        elif 'ar' in method.lower():
            # NOTE(review): the argostranslate imports are commented out at
            # the top of the file, so this branch raises NameError (and falls
            # through to translate_with_retry) unless they are restored.
            lang_src = get_language_code(language=lang_src, translator="argos")
            lang_tgt = get_language_code(language=lang, translator="argos")
            argostranslate.package.update_package_index()
            available_packages = argostranslate.package.get_available_packages()
            package_to_install = next(
                filter(
                    lambda x: x.from_code == lang_src and x.to_code == lang_tgt, available_packages
                )
            )
            argostranslate.package.install_from_path(package_to_install.download())
            # Bug fix: the original translated the hard-coded string
            # "Hello World" instead of the actual segment.
            txt = argostranslate.translate.translate(text_clean, lang_src, lang_tgt)
        else:
            print("Error: Invalid translation method. supported: 'google' or 'deepl'.")
            return ""
        if verbose:
            print(txt)
        return txt
    except Exception as e:
        # Any failure (network error, bad response, missing backend) falls
        # back to the retry path that rotates through the service URLs.
        txt=translate_with_retry(
            text_clean,
            lang=lang,
            lang_src=lang_src,
            method=method,
            verbose=verbose,
            error_verbose=error_verbose,
            user_agent=user_agent, service_urls=service_urls)
        return txt
def translate_with_retry(
    text,
    lang="chinese",
    lang_src=None,
    method=methods(),
    verbose=False,
    error_verbose=True,
    user_agent=user_agent(), service_urls=DEFAULT_SERVICE_URLS):
    """
    Translate *text*, retrying with an alternative Google service URL on
    connection errors.

    Args:
        text (str): text to translate.
        lang (str): target language name (converted to a Google code below).
        lang_src (str, optional): source language name; auto-detected when None.
        method (str): accepted for signature compatibility; only the
            googletrans backend is actually used here.
        verbose (bool): unused in this function's body.
        error_verbose (bool): unused in this function's body.
        user_agent (str): User-Agent header for the requests.
        service_urls (tuple): mirrors tried in order (index 0, then index 1).

    Returns:
        str: the translated text, or the original *text* when every service
        URL fails.
    """
    def try_translate(text,lang,lang_src,user_agent,service_url):
        # One attempt against a single mirror; any failure is re-raised as
        # RuntimeError so the caller can fall through to the next mirror.
        try:
            translator_ = GoogleTranslator(user_agent=user_agent, service_urls=[service_url])
            result = translator_.translate(text, dest=lang, src=lang_src)
            if result and hasattr(result, 'text'):
                return result.text
            else:
                raise ValueError(f"Invalid response from {service_url}: {result}")
        except Exception as e:
            raise RuntimeError(f"Error using {service_url}: {e}")

    if lang_src is None:
        lang_src = detect_language(text)
    # Convert full language names to Google codes (the default table).
    lang_src = get_language_code(language=lang_src)
    lang = get_language_code(language=lang)
    print(f"lang:{lang},lang_src:{lang_src}")
    try:
        print(len(text))  # NOTE(review): debug leftover — prints input length
        return try_translate(text,lang=lang,lang_src=lang_src,user_agent=user_agent,service_url=service_urls[0])
    except Exception as e:
        print("Connection error:", e)
        try:
            time.sleep(1)  # brief back-off before trying the fallback mirror
            return try_translate(text,lang=lang,lang_src=lang_src,user_agent=user_agent,service_url=service_urls[1])
        except Exception as e:
            print(f"(translate_with_retry):Connection error with {service_urls}: {e}")
            print("All service URLs failed. Unable to translate the text.")
            return text
def trans_docx(filename, lang="english", lang_src=None, method=methods(),service_urls=[
    'translate.google.de'],verbose=False):
    """
    Load a .docx file and translate its paragraphs into the target language.

    Args:
        filename (str): path to the .docx file.
        lang (str): target language name.
        lang_src (str, optional): source language name; auto-detected when None.
        method (str): translation backend name (see methods()).
        service_urls (list): Google Translate mirrors to use.
        verbose (bool): display each translated paragraph while looping.

    Returns:
        list[str] or None: translated paragraphs, or None when nothing was
        translated.
    """
    paragraphs = load_docx(filename)
    trans_text = []
    for paragraph in paragraphs:
        # Skip empty or trivially short paragraphs.
        if len(paragraph.strip()) < 3:
            continue
        # Dead code removed: the original built j = "" and called i.join(j),
        # whose return value was discarded (a no-op).
        trans_text.append(
            translate(paragraph, lang=lang, lang_src=lang_src, method=method,
                      service_urls=service_urls, verbose=verbose)
        )
    if trans_text:
        return trans_text
    else:
        return None
def trans_pdf(filename, page="all",lang="english", lang_src=None, method="google",service_urls=[
    'translate.google.de'],verbose=False):
    """
    Load a PDF file and translate the selected pages into the target language.

    Args:
        filename (str): path to the PDF file.
        page: page selector forwarded to load_pdf ("all", int, or list).
        lang (str): target language name.
        lang_src (str, optional): source language name; auto-detected when None.
        method (str): translation backend name.
        service_urls (list): Google Translate mirrors to use.
        verbose (bool): forwarded to load_pdf (page-count printout).

    Returns:
        str: the translated text.
    """
    raw_text = load_pdf(filename, page=page, verbose=verbose)
    return translate(raw_text, lang=lang, lang_src=lang_src, method=method,
                     service_urls=service_urls, verbose=False)
def save_content(fpath, content):
    """
    Write *content* to the text file at *fpath*, overwriting any existing file.

    Parameters:
        fpath (str): destination file path.
        content (str): text to write.

    Returns:
        None
    """
    with open(fpath, "w") as out:
        out.write(content)
def save_file(fpath, content, kind=None, font_name="Arial", font_size=10,spacing=6):
    """
    Save content into a file with the specified file type and formatting.

    Parameters:
        fpath (str): destination file path.
        content (list of str or str): paragraphs to save; a plain string is
            split into sentences first for the docx format.
        kind (str): file type; inferred from fpath's extension when None.
            Supported: '.docx', '.txt', '.md', '.html', '.pdf'.
        font_name (str): font for text formatting (docx/html/pdf only).
        font_size (int): font size (docx/html/pdf only).
        spacing (int): inter-paragraph spacing in points (docx only).

    Returns:
        None

    Raises:
        ValueError: when *kind* is not a supported file type.
    """
    file_types = [".docx", ".txt", ".md", ".html", ".pdf"]
    if kind is None:
        # Infer the type from the file extension.
        _, kind = os.path.splitext(fpath)
        kind = kind.lower()
    # Accept both ".docx" and "docx" (the docstring advertises the dotless
    # form, which the original membership check rejected).
    if kind.lower() not in file_types and f".{kind.lower()}" not in file_types:
        raise ValueError(f"Error:\n{kind} is not in the supported list {file_types}")
    if "docx" in kind.lower():
        if isinstance(content,str):
            content = split_text(content,'sentence')
        doc = docx.Document()
        for i, paragraph_text in enumerate(content):
            paragraph = doc.add_paragraph()
            run = paragraph.add_run(paragraph_text)
            font = run.font
            font.name = font_name
            font.size = docx.shared.Pt(font_size)
            if i != len(content) - 1:  # spacing for all but the last paragraph
                # Bug fix: python-docx exposes spacing via paragraph_format;
                # assigning paragraph.space_after only set a stray attribute
                # and had no effect on the document.
                paragraph.paragraph_format.space_after = docx.shared.Pt(spacing)
        doc.save(fpath)
    elif "txt" in kind.lower():
        save_content(fpath, "\n".join(content))
    elif "md" in kind.lower():
        save_content(fpath, "\n\n".join(content))
    elif "html" in kind.lower():
        html_content = "<html><body>"
        for paragraph_text in content:
            html_content += f'<p style="font-family:{font_name}; font-size:{font_size}px;">{paragraph_text}</p>'
        html_content += "</body></html>"
        save_content(fpath, html_content)
    elif "pdf" in kind.lower():
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font(font_name, size=font_size)
        for paragraph_text in content:
            pdf.cell(200, 10, txt=paragraph_text, ln=True)
        # Bug fix: FPDF.output(dest="S") returns a latin-1 string; encoding it
        # as UTF-8 corrupted bytes > 0x7F and produced invalid PDF files.
        pdf_bytes = pdf.output(dest="S").encode("latin-1")
        with open(fpath, "wb") as file:
            file.write(pdf_bytes)
    else:
        raise ValueError(f"Error:\n{kind} is not in the supported list {file_types}")
+ # if __name__ == "__main__":
682
+ # text_to_translate = "Hello, how are you?"
683
+ # lang = "chinese"
684
+ # translated_text = translate(text_to_translate, lang=lang)
685
+ # print(f"Detected language:{detected_language} \ntranslated into {lang}")
686
+ # print("Translated text:\n", translated_text)