py2ls 0.1.4.6__py3-none-any.whl → 0.1.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/translator.py CHANGED
@@ -13,6 +13,7 @@ import numpy as np
  from nltk.tokenize import sent_tokenize
  from itertools import pairwise
  from tqdm import tqdm
+ from fuzzywuzzy import fuzz,process


  def split_by_sent_n(text,n=10):
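
Note: the only new import in this release is fuzzywuzzy's fuzz/process pair, which backs the strcmp helper added below. A minimal sketch of what those calls return (assuming pip install fuzzywuzzy; the optional python-Levenshtein package silences its slow-matcher warning):

    from fuzzywuzzy import fuzz, process

    fuzz.WRatio("de", "deu")                   # weighted similarity score, 0-100
    fuzz.partial_ratio("de", "german")         # best substring alignment, 0-100
    process.extract("de", ["de", "nl", "da"])  # ranked [(candidate, score), ...]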
@@ -58,106 +59,173 @@ def get_lang_code_iso639():
      return lang_code_iso639

  # get_lang_code_iso639()
- lang_code_iso639={'Abkhazian': 'ab',
- 'Afar': 'aa',
- 'Afrikaans': 'af',
- 'Akan': 'ak',
- 'Albanian': 'sq',
- 'Amharic': 'am',
- 'Arabic': 'ar',
- 'Armenian': 'hy',
- 'Assamese': 'as',
- # 'Avaric': 'av',
- 'Aymara': 'ay',
- 'Azerbaijani': 'az',
- 'Bashkir': 'ba',
- 'Basque': 'eu',
- 'Belarusian': 'be',
- 'Bislama': 'bi',
- 'Breton': 'br',
- 'Burmese': 'my',
- 'Catalan, Valencian': 'ca',
- 'Chamorro': 'ch',
- 'Chichewa, Chewa, Nyanja': 'ny',
- 'Chinese': 'zh',
- 'Corsican': 'co',
- 'Cree': 'cr',
- 'Croatian': 'hr',
- 'Danish': 'da',
- 'Dutch, Flemish': 'nl',
- 'Dzongkha': 'dz',
- 'English': 'en',
- 'Finnish': 'fi',
- 'French': 'fr',
- 'Galician': 'gl',
- 'Georgian': 'ka',
- 'German': 'de',
- 'Greek, Modern (1453–)': 'el',
- 'Gujarati': 'gu',
- 'Hausa': 'ha',
- 'Hebrew': 'he',
- 'Hindi': 'hi',
- 'Hungarian': 'hu',
- 'Icelandic': 'is',
- 'Italian': 'it',
- 'Kikuyu, Gikuyu': 'ki',
- 'Korean': 'ko',
- 'Kurdish': 'ku',
- 'Latin': 'la',
- 'Limburgan, Limburger, Limburgish': 'li',
- 'Luba-Katanga': 'lu',
- 'Macedonian': 'mk',
- 'Malay': 'ms',
- 'Nauru': 'na',
- 'North Ndebele': 'nd',
- 'Nepali': 'ne',
- 'Norwegian': 'no',
- 'Norwegian Nynorsk': 'nn',
- 'Sichuan Yi, Nuosu': 'ii',
- 'Occitan': 'oc',
- 'Ojibwa': 'oj',
- 'Oriya': 'or',
- 'Ossetian, Ossetic': 'os',
- 'Persian': 'fa',
- 'Punjabi, Panjabi': 'pa',
- 'Quechua': 'qu',
- 'Romanian, Moldavian, Moldovan': 'ro',
- 'Russian': 'ru',
- 'Samoan': 'sm',
- 'Sanskrit': 'sa',
- 'Serbian': 'sr',
- 'Shona': 'sn',
- 'Sinhala, Sinhalese': 'si',
- 'Slovenian': 'sl',
- 'Somali': 'so',
- 'Sundanese': 'su',
- 'Swahili': 'sw',
- 'Swati': 'ss',
- 'Tajik': 'tg',
- 'Tamil': 'ta',
- 'Telugu': 'te',
- 'Thai': 'th',
- 'Tibetan': 'bo',
- 'Tigrinya': 'ti',
- 'Tonga (Tonga Islands)': 'to',
- 'Tsonga': 'ts',
- 'Twi': 'tw',
- 'Ukrainian': 'uk',
- 'Urdu': 'ur',
- 'Uzbek': 'uz',
- 'Venda': 've',
- 'Vietnamese': 'vi',
- 'Volapük': 'vo',
- 'Welsh': 'cy',
- 'Wolof': 'wo',
- 'Xhosa': 'xh',
- 'Yiddish': 'yi',
- 'Yoruba': 'yo',
- 'Zulu': 'zu'}
- def search_iso639_fullname(val):
-     for k,v in lang_code_iso639.items():
-         if 'de' in v:
-             return k
+
+ def detect_lang(text, output='lang',verbose=False):
+     lang_code_iso639={'Abkhazian': 'ab',
+     'Afar': 'aa',
+     'Afrikaans': 'af',
+     'Akan': 'ak',
+     'Albanian': 'sq',
+     'Amharic': 'am',
+     'Arabic': 'ar',
+     'Armenian': 'hy',
+     'Assamese': 'as',
+     # 'Avaric': 'av',
+     'Aymara': 'ay',
+     'Azerbaijani': 'az',
+     'Bashkir': 'ba',
+     'Basque': 'eu',
+     'Belarusian': 'be',
+     'Bislama': 'bi',
+     'Breton': 'br',
+     'Burmese': 'my',
+     'Catalan, Valencian': 'ca',
+     'Chamorro': 'ch',
+     'Chichewa, Chewa, Nyanja': 'ny',
+     'Chinese': 'zh',
+     'Corsican': 'co',
+     'Cree': 'cr',
+     'Croatian': 'hr',
+     'Danish': 'da',
+     'Dutch, Flemish': 'nl',
+     'Dzongkha': 'dz',
+     'English': 'en',
+     'Finnish': 'fi',
+     'French': 'fr',
+     'Galician': 'gl',
+     'Georgian': 'ka',
+     'German': 'de',
+     'Greek, Modern (1453–)': 'el',
+     'Gujarati': 'gu',
+     'Hausa': 'ha',
+     'Hebrew': 'he',
+     'Hindi': 'hi',
+     'Hungarian': 'hu',
+     'Icelandic': 'is',
+     'Italian': 'it',
+     'Kikuyu, Gikuyu': 'ki',
+     'Korean': 'ko',
+     'Kurdish': 'ku',
+     'Latin': 'la',
+     'Limburgan, Limburger, Limburgish': 'li',
+     'Luba-Katanga': 'lu',
+     'Macedonian': 'mk',
+     'Malay': 'ms',
+     'Nauru': 'na',
+     'North Ndebele': 'nd',
+     'Nepali': 'ne',
+     'Norwegian': 'no',
+     'Norwegian Nynorsk': 'nn',
+     'Sichuan Yi, Nuosu': 'ii',
+     'Occitan': 'oc',
+     'Ojibwa': 'oj',
+     'Oriya': 'or',
+     'Ossetian, Ossetic': 'os',
+     'Persian': 'fa',
+     'Punjabi, Panjabi': 'pa',
+     'Quechua': 'qu',
+     'Romanian, Moldavian, Moldovan': 'ro',
+     'Russian': 'ru',
+     'Samoan': 'sm',
+     'Sanskrit': 'sa',
+     'Serbian': 'sr',
+     'Shona': 'sn',
+     'Sinhala, Sinhalese': 'si',
+     'Slovenian': 'sl',
+     'Somali': 'so',
+     'Sundanese': 'su',
+     'Swahili': 'sw',
+     'Swati': 'ss',
+     'Tajik': 'tg',
+     'Tamil': 'ta',
+     'Telugu': 'te',
+     'Thai': 'th',
+     'Tibetan': 'bo',
+     'Tigrinya': 'ti',
+     'Tonga (Tonga Islands)': 'to',
+     'Tsonga': 'ts',
+     'Twi': 'tw',
+     'Ukrainian': 'uk',
+     'Urdu': 'ur',
+     'Uzbek': 'uz',
+     'Venda': 've',
+     'Vietnamese': 'vi',
+     'Volapük': 'vo',
+     'Welsh': 'cy',
+     'Wolof': 'wo',
+     'Xhosa': 'xh',
+     'Yiddish': 'yi',
+     'Yoruba': 'yo',
+     'Zulu': 'zu'}
+     l_lang,l_code = [],[]
+     [[l_lang.append(v),l_code.append(k)] for v,k in lang_code_iso639.items()]
+     try:
+         if is_text(text):
+             code_detect=detect(text)
+             if 'c' in output.lower(): # return code
+                 return l_code[strcmp(code_detect,l_code, verbose=verbose)[1]]
+             else:
+                 return l_lang[strcmp(code_detect,l_code, verbose=verbose)[1]]
+         else:
+             print(f"{text} is not supported")
+             return 'no'
+     except:
+         return 'no'
+
+ def is_text(s):
+     has_alpha = any(char.isalpha() for char in s)
+     has_non_alpha = any(not char.isalpha() for char in s)
+     # no_special = not re.search(r'[^A-Za-z0-9\s]', s)
+     return has_alpha and has_non_alpha
+
+ def strcmp(search_term, candidates, ignore_case=True, verbose=True, scorer='WR'):
+     """
+     Compares a search term with a list of candidate strings and finds the best match based on similarity score.
+
+     Parameters:
+     search_term (str): The term to be searched for.
+     candidates (list of str): A list of candidate strings to compare against the search term.
+     ignore_case (bool): If True, the comparison ignores case differences.
+     verbose (bool): If True, prints the similarity score and the best match.
+
+     Returns:
+     tuple: A tuple containing the best match and its index in the candidates list.
+     """
+     def to_lower(s, ignore_case=True):
+         # Converts a string or list of strings to lowercase if ignore_case is True.
+         if ignore_case:
+             if isinstance(s, str):
+                 return s.lower()
+             elif isinstance(s, list):
+                 return [elem.lower() for elem in s]
+         return s
+     str1_,str2_ = to_lower(search_term, ignore_case),to_lower(candidates, ignore_case)
+     if isinstance(str2_, list):
+         if 'part' in scorer.lower():
+             similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
+         elif 'W' in scorer.lower():
+             similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
+         elif 'Ratio' in scorer.lower():
+             similarity_scores = [fuzz.Ratio(str1_, word) for word in str2_]
+         else:
+             similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
+         best_match_index = similarity_scores.index(max(similarity_scores))
+         best_match_score = similarity_scores[best_match_index]
+     else:
+         best_match_index = 0
+         if 'part' in scorer.lower():
+             best_match_score = fuzz.partial_ratio(str1_, str2_)
+         elif 'W' in scorer.lower():
+             best_match_score = fuzz.WRatio(str1_, str2_)
+         elif 'Ratio' in scorer.lower():
+             best_match_score = fuzz.Ratio(str1_, str2_)
+         else:
+             best_match_score = fuzz.WRatio(str1_, str2_)
+     if verbose:
+         print(f"\nbest_match is: {candidates[best_match_index],best_match_score}")
+         best_match = process.extract(search_term, candidates)
+         print(f"建议: {best_match}")
+     return candidates[best_match_index], best_match_index


  def methods(idx=0):
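
Note: taken together, this hunk swaps the module-level language table and the broken search_iso639_fullname (its lookup hard-coded 'de' instead of using the val argument, so it could only ever resolve codes containing 'de') for detect_lang, which runs langdetect's detect() and fuzzy-matches the returned ISO 639-1 code against the table via strcmp. Two quirks of strcmp as released: the 'W' and 'Ratio' branch tests compare uppercase literals against scorer.lower(), so they can never match and every non-'part' scorer falls through to fuzz.WRatio; and fuzz.Ratio is not a fuzzywuzzy attribute anyway (the plain scorer is fuzz.ratio). A rough usage sketch, assuming the package and langdetect are installed:

    from py2ls.translator import detect_lang, strcmp

    detect_lang("Guten Tag, wie geht es Ihnen?")                 # -> 'German' (full name)
    detect_lang("Guten Tag, wie geht es Ihnen?", output='code')  # -> 'de'
    strcmp("de", ["de", "nl", "da"], verbose=False)              # -> ('de', 0)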
@@ -231,23 +299,6 @@ def get_language_code(language, translator="google"):
      # print(f"Google Translate Language Code for '{language}': {google_lang_code}")
      # print(f"DeepL Translator Language Code for '{language}': {deepl_lang_code}")

- def detect_language(text):
-     """
-     Detect the language of the given text.
-     """
-     if len(text.strip()) < 3:
-         print("Error: Input text is too short for language detection.")
-         return "english"
-     else:
-         lang_code = detect(text)
-         detected_language=search_iso639_fullname(lang_code)
-         print(detected_language)
-         return detected_language
-
-
- # text_to_detect = "Bonjour, comment ça va?"
- # detected_language = detect_language(text_to_detect)
- # print("Detected language:", detected_language)

  def load_docx(filename):
      """
@@ -438,13 +489,13 @@ def translate(
      text=merge_text(text)
      text = replace_text(text)
      if lang_src is None:
-         lang_src = detect_language(text)
+         lang_src = detect_lang(text)
      try:
          if len(text) > limit:
              n=auto_chunk_size(text)
              text_segments = split_by_sent_n(text,n)
              translations = ""
-             for segment in tqdm(text_segments,desc='is translating'):
+             for segment in tqdm(text_segments,desc='is translating', colour="green"):
                  segment = replace_text(merge_text(segment))
                  translated_segment = translate_segment(text=segment, lang=lang, lang_src=lang_src, method=method, user_agent=user_agent,service_urls=service_urls, verbose=verbose,error_verbose=error_verbose
                  )
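
Note: besides the detect_language -> detect_lang rename, the chunk loop now tints its progress bar with tqdm's colour keyword. That argument exists only in reasonably recent tqdm releases (older ones reject unknown keyword arguments), so a standalone sketch under that assumption:

    import time
    from tqdm import tqdm

    # colour accepts colour names or hex strings on tqdm versions that support it
    for segment in tqdm(range(3), desc='is translating', colour="green"):
        time.sleep(0.1)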
@@ -479,7 +530,7 @@ def translate_segment(
      text_clean = filter_errors(text)
      text_clean = replace_text(text_clean)
      if lang_src is None:
-         lang_src = detect_language(text_clean)
+         lang_src = detect_lang(text_clean)
      try:
          lang_src = get_language_code(lang_src, 'google')
          lang_tgt = get_language_code(lang, 'google')
@@ -547,7 +598,7 @@ def translate_with_retry(
          raise RuntimeError(f"Error using {service_url}: {e}")

      if lang_src is None:
-         lang_src = detect_language(text)
+         lang_src = detect_lang(text)
      lang_src = get_language_code(language=lang_src)
      lang = get_language_code(language=lang)
      print(f"lang:{lang},lang_src:{lang_src}")
py2ls-0.1.4.6.dist-info/METADATA → py2ls-0.1.4.8.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: py2ls
- Version: 0.1.4.6
+ Version: 0.1.4.8
  Summary: py(thon)2(too)ls
  Author: Jianfeng
  Author-email: Jianfeng.Liu0413@gmail.com
py2ls-0.1.4.6.dist-info/RECORD → py2ls-0.1.4.8.dist-info/RECORD RENAMED
@@ -2,7 +2,7 @@ py2ls/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
  py2ls/.git/COMMIT_EDITMSG,sha256=5xj-jWMbrdOc9m7gSn-UcsAQ9FMNvWSbLWSsrOUIO5w,7
  py2ls/.git/FETCH_HEAD,sha256=1FfG9FtKEzbthC4ygl5ci0pnEm7ZaF3ZY7njNqkjz2I,100
  py2ls/.git/HEAD,sha256=KNJb-Cr0wOK3L1CVmyvrhZ4-YLljCl6MYD2tTdsrboA,21
- py2ls/.git/config,sha256=XswTg1Ts7_7IBDlKHh4OF_0Tq7v4wW7BXb6xSVInSec,345
+ py2ls/.git/config,sha256=CL7WR7jU8VRchJwRooCBkXUMvuRoPdf3FWIBAOlap1c,378
  py2ls/.git/description,sha256=ZzMxc0Ca26m45Twn1DDnOHqin5VHEZ9uOTBrScIXSjE,16
  py2ls/.git/hooks/applypatch-msg.sample,sha256=AiNJeguLAzqlijpSG4YphpOGz3qw4vEBlj0yiqYhk_c,478
  py2ls/.git/hooks/commit-msg.sample,sha256=H3TV6SkpebVz69WXQdRsuT_zkazdCD00C5Q3B1PZJDc,896
@@ -100,12 +100,12 @@ py2ls/brain_atlas.py,sha256=w1o5EelRjq89zuFJUNSz4Da8HnTCwAwDAZ4NU4a-bAY,5486
  py2ls/correlators.py,sha256=RbOaJIPLCHJtUm5SFi_4dCJ7VFUPWR0PErfK3K26ad4,18243
  py2ls/dbhandler.py,sha256=i9dNrpHyx0oIaFieHI4X4tsrCdN-aFxudPTDOgy9Ppo,3574
  py2ls/freqanalysis.py,sha256=F4218VSPbgL5tnngh6xNCYuNnfR-F_QjECUUxrPYZss,32594
- py2ls/ips.py,sha256=wcA7UITz2Nx5bmDkQvGyZ9mNCvt9ZE9JTRpgCvExNPs,124868
- py2ls/netfinder.py,sha256=dt6hkYeH-ivCHInoUi92MhJMLlXtjRXT3ewKzOwGtWk,31506
+ py2ls/ips.py,sha256=XBKhLvg613WL18wgm5pc4xwe-EI6MOGlCp4AZCQlW60,141873
+ py2ls/netfinder.py,sha256=ZsLWGYMeRuGvxj2nqE0Z8ANoaVl18Necfw0HQfh2q7I,45548
  py2ls/setuptools-70.1.0-py3-none-any.whl,sha256=2bi3cUVal8ip86s0SOvgspteEF8SKLukECi-EWmFomc,882588
  py2ls/sleep_events_detectors.py,sha256=36MCuRrpurn0Uvzpo3p3b3_JlVsRNHSWCXbJxCGM3mg,51546
- py2ls/translator.py,sha256=QfDUO0-pXHGMBFZBefiBHzOrC93-__N5sUQY_VP4wes,29734
+ py2ls/translator.py,sha256=OEYljlmICGb9pO06tMWJCoRfZTHBJ8g-tNkF_zLlzgg,34118
  py2ls/wb_detector.py,sha256=7y6TmBUj9exCZeIgBAJ_9hwuhkDh1x_-yg4dvNY1_GQ,6284
- py2ls-0.1.4.6.dist-info/METADATA,sha256=M9tLANmcFhRhKeppFawPAZ4tOTn7lrFw99JBp0Mso2A,17943
- py2ls-0.1.4.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- py2ls-0.1.4.6.dist-info/RECORD,,
+ py2ls-0.1.4.8.dist-info/METADATA,sha256=LlicemgTJqMy8o5i8Nu8jT7Pp1g-Q3BF-zLc5Wnrr3I,17943
+ py2ls-0.1.4.8.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+ py2ls-0.1.4.8.dist-info/RECORD,,
py2ls-0.1.4.6.dist-info/WHEEL → py2ls-0.1.4.8.dist-info/WHEEL RENAMED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 1.9.0
+ Generator: poetry-core 1.8.1
  Root-Is-Purelib: true
  Tag: py3-none-any