dateparser 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dateparser/__init__.py +82 -0
- dateparser/calendars/__init__.py +144 -0
- dateparser/calendars/hijri.py +6 -0
- dateparser/calendars/hijri_parser.py +60 -0
- dateparser/calendars/jalali.py +9 -0
- dateparser/calendars/jalali_parser.py +184 -0
- dateparser/conf.py +267 -0
- dateparser/custom_language_detection/__init__.py +0 -0
- dateparser/custom_language_detection/fasttext.py +43 -0
- dateparser/custom_language_detection/langdetect.py +37 -0
- dateparser/custom_language_detection/language_mapping.py +18 -0
- dateparser/data/__init__.py +2 -0
- dateparser/data/date_translation_data/__init__.py +0 -0
- dateparser/data/date_translation_data/af.py +242 -0
- dateparser/data/date_translation_data/agq.py +169 -0
- dateparser/data/date_translation_data/ak.py +169 -0
- dateparser/data/date_translation_data/am.py +222 -0
- dateparser/data/date_translation_data/ar.py +574 -0
- dateparser/data/date_translation_data/as.py +164 -0
- dateparser/data/date_translation_data/asa.py +168 -0
- dateparser/data/date_translation_data/ast.py +280 -0
- dateparser/data/date_translation_data/az-Cyrl.py +168 -0
- dateparser/data/date_translation_data/az-Latn.py +217 -0
- dateparser/data/date_translation_data/az.py +217 -0
- dateparser/data/date_translation_data/bas.py +169 -0
- dateparser/data/date_translation_data/be.py +340 -0
- dateparser/data/date_translation_data/bem.py +161 -0
- dateparser/data/date_translation_data/bez.py +169 -0
- dateparser/data/date_translation_data/bg.py +345 -0
- dateparser/data/date_translation_data/bm.py +167 -0
- dateparser/data/date_translation_data/bn.py +241 -0
- dateparser/data/date_translation_data/bo.py +185 -0
- dateparser/data/date_translation_data/br.py +226 -0
- dateparser/data/date_translation_data/brx.py +157 -0
- dateparser/data/date_translation_data/bs-Cyrl.py +226 -0
- dateparser/data/date_translation_data/bs-Latn.py +248 -0
- dateparser/data/date_translation_data/bs.py +248 -0
- dateparser/data/date_translation_data/ca.py +313 -0
- dateparser/data/date_translation_data/ce.py +225 -0
- dateparser/data/date_translation_data/cgg.py +169 -0
- dateparser/data/date_translation_data/chr.py +240 -0
- dateparser/data/date_translation_data/ckb.py +154 -0
- dateparser/data/date_translation_data/cs.py +316 -0
- dateparser/data/date_translation_data/cy.py +217 -0
- dateparser/data/date_translation_data/da.py +296 -0
- dateparser/data/date_translation_data/dav.py +169 -0
- dateparser/data/date_translation_data/de.py +357 -0
- dateparser/data/date_translation_data/dje.py +167 -0
- dateparser/data/date_translation_data/dsb.py +270 -0
- dateparser/data/date_translation_data/dua.py +169 -0
- dateparser/data/date_translation_data/dyo.py +168 -0
- dateparser/data/date_translation_data/dz.py +225 -0
- dateparser/data/date_translation_data/ebu.py +169 -0
- dateparser/data/date_translation_data/ee.py +233 -0
- dateparser/data/date_translation_data/el.py +279 -0
- dateparser/data/date_translation_data/en.py +851 -0
- dateparser/data/date_translation_data/eo.py +169 -0
- dateparser/data/date_translation_data/es.py +499 -0
- dateparser/data/date_translation_data/et.py +233 -0
- dateparser/data/date_translation_data/eu.py +219 -0
- dateparser/data/date_translation_data/ewo.py +169 -0
- dateparser/data/date_translation_data/fa.py +270 -0
- dateparser/data/date_translation_data/ff.py +179 -0
- dateparser/data/date_translation_data/fi.py +345 -0
- dateparser/data/date_translation_data/fil.py +223 -0
- dateparser/data/date_translation_data/fo.py +256 -0
- dateparser/data/date_translation_data/fr.py +520 -0
- dateparser/data/date_translation_data/fur.py +223 -0
- dateparser/data/date_translation_data/fy.py +223 -0
- dateparser/data/date_translation_data/ga.py +238 -0
- dateparser/data/date_translation_data/gd.py +277 -0
- dateparser/data/date_translation_data/gl.py +253 -0
- dateparser/data/date_translation_data/gsw.py +179 -0
- dateparser/data/date_translation_data/gu.py +216 -0
- dateparser/data/date_translation_data/guz.py +170 -0
- dateparser/data/date_translation_data/gv.py +166 -0
- dateparser/data/date_translation_data/ha.py +176 -0
- dateparser/data/date_translation_data/haw.py +168 -0
- dateparser/data/date_translation_data/he.py +371 -0
- dateparser/data/date_translation_data/hi.py +261 -0
- dateparser/data/date_translation_data/hr.py +378 -0
- dateparser/data/date_translation_data/hsb.py +271 -0
- dateparser/data/date_translation_data/hu.py +297 -0
- dateparser/data/date_translation_data/hy.py +246 -0
- dateparser/data/date_translation_data/id.py +272 -0
- dateparser/data/date_translation_data/ig.py +168 -0
- dateparser/data/date_translation_data/ii.py +157 -0
- dateparser/data/date_translation_data/is.py +242 -0
- dateparser/data/date_translation_data/it.py +282 -0
- dateparser/data/date_translation_data/ja.py +286 -0
- dateparser/data/date_translation_data/jgo.py +188 -0
- dateparser/data/date_translation_data/jmc.py +168 -0
- dateparser/data/date_translation_data/ka.py +241 -0
- dateparser/data/date_translation_data/kab.py +169 -0
- dateparser/data/date_translation_data/kam.py +169 -0
- dateparser/data/date_translation_data/kde.py +169 -0
- dateparser/data/date_translation_data/kea.py +230 -0
- dateparser/data/date_translation_data/khq.py +167 -0
- dateparser/data/date_translation_data/ki.py +169 -0
- dateparser/data/date_translation_data/kk.py +228 -0
- dateparser/data/date_translation_data/kl.py +213 -0
- dateparser/data/date_translation_data/kln.py +171 -0
- dateparser/data/date_translation_data/km.py +198 -0
- dateparser/data/date_translation_data/kn.py +225 -0
- dateparser/data/date_translation_data/ko.py +207 -0
- dateparser/data/date_translation_data/kok.py +157 -0
- dateparser/data/date_translation_data/ks.py +152 -0
- dateparser/data/date_translation_data/ksb.py +168 -0
- dateparser/data/date_translation_data/ksf.py +169 -0
- dateparser/data/date_translation_data/ksh.py +192 -0
- dateparser/data/date_translation_data/kw.py +169 -0
- dateparser/data/date_translation_data/ky.py +240 -0
- dateparser/data/date_translation_data/lag.py +169 -0
- dateparser/data/date_translation_data/lb.py +233 -0
- dateparser/data/date_translation_data/lg.py +169 -0
- dateparser/data/date_translation_data/lkt.py +194 -0
- dateparser/data/date_translation_data/ln.py +179 -0
- dateparser/data/date_translation_data/lo.py +228 -0
- dateparser/data/date_translation_data/lrc.py +154 -0
- dateparser/data/date_translation_data/lt.py +263 -0
- dateparser/data/date_translation_data/lu.py +169 -0
- dateparser/data/date_translation_data/luo.py +169 -0
- dateparser/data/date_translation_data/luy.py +168 -0
- dateparser/data/date_translation_data/lv.py +257 -0
- dateparser/data/date_translation_data/mas.py +173 -0
- dateparser/data/date_translation_data/mer.py +168 -0
- dateparser/data/date_translation_data/mfe.py +166 -0
- dateparser/data/date_translation_data/mg.py +168 -0
- dateparser/data/date_translation_data/mgh.py +169 -0
- dateparser/data/date_translation_data/mgo.py +151 -0
- dateparser/data/date_translation_data/mk.py +234 -0
- dateparser/data/date_translation_data/ml.py +217 -0
- dateparser/data/date_translation_data/mn.py +224 -0
- dateparser/data/date_translation_data/mr.py +229 -0
- dateparser/data/date_translation_data/ms.py +242 -0
- dateparser/data/date_translation_data/mt.py +175 -0
- dateparser/data/date_translation_data/mua.py +169 -0
- dateparser/data/date_translation_data/my.py +203 -0
- dateparser/data/date_translation_data/mzn.py +199 -0
- dateparser/data/date_translation_data/naq.py +169 -0
- dateparser/data/date_translation_data/nb.py +261 -0
- dateparser/data/date_translation_data/nd.py +169 -0
- dateparser/data/date_translation_data/ne.py +207 -0
- dateparser/data/date_translation_data/nl.py +273 -0
- dateparser/data/date_translation_data/nmg.py +169 -0
- dateparser/data/date_translation_data/nn.py +231 -0
- dateparser/data/date_translation_data/nnh.py +150 -0
- dateparser/data/date_translation_data/nus.py +166 -0
- dateparser/data/date_translation_data/nyn.py +169 -0
- dateparser/data/date_translation_data/om.py +173 -0
- dateparser/data/date_translation_data/or.py +157 -0
- dateparser/data/date_translation_data/os.py +203 -0
- dateparser/data/date_translation_data/pa-Arab.py +150 -0
- dateparser/data/date_translation_data/pa-Guru.py +221 -0
- dateparser/data/date_translation_data/pa.py +221 -0
- dateparser/data/date_translation_data/pl.py +416 -0
- dateparser/data/date_translation_data/ps.py +150 -0
- dateparser/data/date_translation_data/pt.py +981 -0
- dateparser/data/date_translation_data/qu.py +176 -0
- dateparser/data/date_translation_data/rm.py +166 -0
- dateparser/data/date_translation_data/rn.py +169 -0
- dateparser/data/date_translation_data/ro.py +270 -0
- dateparser/data/date_translation_data/rof.py +157 -0
- dateparser/data/date_translation_data/ru.py +442 -0
- dateparser/data/date_translation_data/rw.py +169 -0
- dateparser/data/date_translation_data/rwk.py +168 -0
- dateparser/data/date_translation_data/sah.py +219 -0
- dateparser/data/date_translation_data/saq.py +169 -0
- dateparser/data/date_translation_data/sbp.py +169 -0
- dateparser/data/date_translation_data/se.py +280 -0
- dateparser/data/date_translation_data/seh.py +169 -0
- dateparser/data/date_translation_data/ses.py +167 -0
- dateparser/data/date_translation_data/sg.py +169 -0
- dateparser/data/date_translation_data/shi-Latn.py +169 -0
- dateparser/data/date_translation_data/shi-Tfng.py +169 -0
- dateparser/data/date_translation_data/shi.py +169 -0
- dateparser/data/date_translation_data/si.py +220 -0
- dateparser/data/date_translation_data/sk.py +327 -0
- dateparser/data/date_translation_data/sl.py +244 -0
- dateparser/data/date_translation_data/smn.py +176 -0
- dateparser/data/date_translation_data/sn.py +169 -0
- dateparser/data/date_translation_data/so.py +179 -0
- dateparser/data/date_translation_data/sq.py +237 -0
- dateparser/data/date_translation_data/sr-Cyrl.py +306 -0
- dateparser/data/date_translation_data/sr-Latn.py +306 -0
- dateparser/data/date_translation_data/sr.py +255 -0
- dateparser/data/date_translation_data/sv.py +309 -0
- dateparser/data/date_translation_data/sw.py +231 -0
- dateparser/data/date_translation_data/ta.py +264 -0
- dateparser/data/date_translation_data/te.py +239 -0
- dateparser/data/date_translation_data/teo.py +173 -0
- dateparser/data/date_translation_data/th.py +300 -0
- dateparser/data/date_translation_data/ti.py +173 -0
- dateparser/data/date_translation_data/tl.py +137 -0
- dateparser/data/date_translation_data/to.py +216 -0
- dateparser/data/date_translation_data/tr.py +259 -0
- dateparser/data/date_translation_data/twq.py +167 -0
- dateparser/data/date_translation_data/tzm.py +169 -0
- dateparser/data/date_translation_data/ug.py +203 -0
- dateparser/data/date_translation_data/uk.py +502 -0
- dateparser/data/date_translation_data/ur.py +256 -0
- dateparser/data/date_translation_data/uz-Arab.py +167 -0
- dateparser/data/date_translation_data/uz-Cyrl.py +210 -0
- dateparser/data/date_translation_data/uz-Latn.py +216 -0
- dateparser/data/date_translation_data/uz.py +216 -0
- dateparser/data/date_translation_data/vi.py +260 -0
- dateparser/data/date_translation_data/vun.py +168 -0
- dateparser/data/date_translation_data/wae.py +224 -0
- dateparser/data/date_translation_data/xog.py +169 -0
- dateparser/data/date_translation_data/yav.py +169 -0
- dateparser/data/date_translation_data/yi.py +178 -0
- dateparser/data/date_translation_data/yo.py +263 -0
- dateparser/data/date_translation_data/yue.py +203 -0
- dateparser/data/date_translation_data/zgh.py +169 -0
- dateparser/data/date_translation_data/zh-Hans.py +240 -0
- dateparser/data/date_translation_data/zh-Hant.py +402 -0
- dateparser/data/date_translation_data/zh.py +273 -0
- dateparser/data/date_translation_data/zu.py +196 -0
- dateparser/data/languages_info.py +826 -0
- dateparser/date.py +599 -0
- dateparser/date_parser.py +55 -0
- dateparser/freshness_date_parser.py +156 -0
- dateparser/languages/__init__.py +2 -0
- dateparser/languages/dictionary.py +352 -0
- dateparser/languages/loader.py +224 -0
- dateparser/languages/locale.py +625 -0
- dateparser/languages/validation.py +467 -0
- dateparser/parser.py +742 -0
- dateparser/search/__init__.py +71 -0
- dateparser/search/detection.py +78 -0
- dateparser/search/search.py +297 -0
- dateparser/search/text_detection.py +89 -0
- dateparser/timezone_parser.py +91 -0
- dateparser/timezones.py +469 -0
- dateparser/utils/__init__.py +257 -0
- dateparser/utils/strptime.py +108 -0
- dateparser-1.2.1.dist-info/AUTHORS.rst +17 -0
- dateparser-1.2.1.dist-info/LICENSE +12 -0
- dateparser-1.2.1.dist-info/METADATA +864 -0
- dateparser-1.2.1.dist-info/RECORD +256 -0
- dateparser-1.2.1.dist-info/WHEEL +5 -0
- dateparser-1.2.1.dist-info/entry_points.txt +2 -0
- dateparser-1.2.1.dist-info/top_level.txt +4 -0
- dateparser_cli/__init__.py +0 -0
- dateparser_cli/cli.py +36 -0
- dateparser_cli/exceptions.py +2 -0
- dateparser_cli/fasttext_manager.py +42 -0
- dateparser_cli/utils.py +27 -0
- dateparser_data/__init__.py +0 -0
- dateparser_data/settings.py +33 -0
- dateparser_scripts/__init__.py +0 -0
- dateparser_scripts/get_cldr_data.py +567 -0
- dateparser_scripts/order_languages.py +217 -0
- dateparser_scripts/update_supported_languages_and_locales.py +48 -0
- dateparser_scripts/utils.py +73 -0
- dateparser_scripts/write_complete_data.py +129 -0
|
@@ -0,0 +1,625 @@
|
|
|
1
|
+
from collections import OrderedDict
|
|
2
|
+
from itertools import chain
|
|
3
|
+
|
|
4
|
+
import regex as re
|
|
5
|
+
from dateutil import parser
|
|
6
|
+
|
|
7
|
+
from dateparser.timezone_parser import pop_tz_offset_from_string, word_is_tz
|
|
8
|
+
from dateparser.utils import combine_dicts, normalize_unicode
|
|
9
|
+
|
|
10
|
+
from .dictionary import ALWAYS_KEEP_TOKENS, Dictionary, NormalizedDictionary
|
|
11
|
+
|
|
12
|
+
NUMERAL_PATTERN = re.compile(r"(\d+)", re.U)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Locale:
    """
    Class that deals with applicability and translation from a locale.

    :param shortname:
        A locale code, e.g. 'fr-PF', 'qu-EC', 'af-NA'.
    :type shortname: str

    :param language_info:
        Language info (translation data) of the language the locale belongs to.
    :type language_info: dict

    :return: A Locale instance
    """

    # Lazy caches, default None at class level and filled in on first use by
    # the corresponding _get_*/_generate_* helpers (assignment happens via
    # self, so populated caches live on the instance).
    _dictionary = None
    _normalized_dictionary = None
    _simplifications = None
    _normalized_simplifications = None
    _splitters = None
    _wordchars = None
    _relative_translations = None
    _normalized_relative_translations = None
    _abbreviations = None
    _split_dictionary = None
    _wordchars_for_detection = None
|
|
41
|
+
|
|
42
|
+
def __init__(self, shortname, language_info):
    """Store the locale code and merge locale-specific overrides into the language data."""
    self.shortname = shortname
    per_locale = language_info.get("locale_specific", {})
    overrides = per_locale.get(shortname, {})
    merged = combine_dicts(language_info, overrides)
    merged.pop("locale_specific", None)
    self.info = merged
|
|
49
|
+
|
|
50
|
+
def is_applicable(self, date_string, strip_timezone=False, settings=None):
    """
    Check if the locale is applicable to translate date string.

    :param date_string:
        A string representing date and/or time in a recognizably valid format.
    :type date_string: str

    :param strip_timezone:
        If True, timezone is stripped from date string.
    :type strip_timezone: bool

    :return: boolean value representing if the locale is applicable for the date string or not.
    """
    if strip_timezone:
        date_string, _ = pop_tz_offset_from_string(date_string, as_offset=False)

    candidate = self._translate_numerals(date_string)
    if settings.NORMALIZE:
        candidate = normalize_unicode(candidate)
    candidate = self._simplify(candidate, settings=settings)
    vocab = self._get_dictionary(settings)
    return vocab.are_tokens_valid(vocab.split(candidate))
|
|
74
|
+
|
|
75
|
+
def count_applicability(self, text, strip_timezone=False, settings=None):
    """Return [known, skipped] word counts for *text* against this locale's dictionary."""
    if strip_timezone:
        text, _ = pop_tz_offset_from_string(text, as_offset=False)

    simplified = self._simplify(text, settings=settings)
    tokens = [
        token
        for sentence in self._sentence_split(simplified, settings=settings)
        for token in self._split(sentence, keep_formatting=False, settings=settings)
    ]
    return self._count_words_present_in_the_dictionary(tokens, settings)
|
|
85
|
+
|
|
86
|
+
def _count_words_present_in_the_dictionary(self, words, settings=None):
|
|
87
|
+
dictionary = self.clean_dictionary(
|
|
88
|
+
self._get_split_dictionary(settings=settings)
|
|
89
|
+
)
|
|
90
|
+
dict_cnt = 0
|
|
91
|
+
skip_cnt = 0
|
|
92
|
+
for word in set(words):
|
|
93
|
+
if word in dictionary:
|
|
94
|
+
if dictionary[word]:
|
|
95
|
+
dict_cnt += 1
|
|
96
|
+
else:
|
|
97
|
+
skip_cnt += 1
|
|
98
|
+
elif word.isdigit():
|
|
99
|
+
skip_cnt += 1
|
|
100
|
+
return [dict_cnt, skip_cnt]
|
|
101
|
+
|
|
102
|
+
@staticmethod
|
|
103
|
+
def clean_dictionary(dictionary, threshold=2):
|
|
104
|
+
del_keys = []
|
|
105
|
+
for key in dictionary:
|
|
106
|
+
if len(key) < threshold:
|
|
107
|
+
del_keys.append(key)
|
|
108
|
+
for del_key in del_keys:
|
|
109
|
+
del dictionary[del_key]
|
|
110
|
+
return dictionary
|
|
111
|
+
|
|
112
|
+
def translate(self, date_string, keep_formatting=False, settings=None):
    """
    Translate the date string to its English equivalent.

    :param date_string:
        A string representing date and/or time in a recognizably valid format.
    :type date_string: str

    :param keep_formatting:
        If True, retain formatting of the date string after translation.
    :type keep_formatting: bool

    :return: translated date string.
    """
    date_string = self._translate_numerals(date_string)
    if settings.NORMALIZE:
        date_string = normalize_unicode(date_string)
    date_string = self._simplify(date_string, settings=settings)
    dictionary = self._get_dictionary(settings)
    date_string_tokens = dictionary.split(date_string, keep_formatting)

    relative_translations = self._get_relative_translations(settings=settings)

    for i, word in enumerate(date_string_tokens):
        word = word.lower()
        # Relative-date patterns are tried first; the first one that
        # matches wins and the plain dictionary lookup is skipped.
        for pattern, replacement in relative_translations.items():
            if pattern.match(word):
                date_string_tokens[i] = pattern.sub(replacement, word)
                break
        else:
            # for/else: reached only when no relative pattern matched.
            if word in dictionary:
                # When preserving formatting, keep non-alphabetic tokens
                # (digits/punctuation) instead of dropping tokens whose
                # translation is empty.
                fallback = word if keep_formatting and not word.isalpha() else ""
                date_string_tokens[i] = dictionary[word] or fallback
    if "in" in date_string_tokens:
        date_string_tokens = self._clear_future_words(date_string_tokens)

    return self._join(
        list(filter(bool, date_string_tokens)),
        separator="" if keep_formatting else " ",
        settings=settings,
    )
|
|
153
|
+
|
|
154
|
+
def _translate_numerals(self, date_string):
    """Rewrite decimal digit runs as ASCII digits (int() parses any Unicode
    decimal digits), preserving the original width via zero-padding."""
    pieces = NUMERAL_PATTERN.split(date_string)
    converted = [
        str(int(piece)).zfill(len(piece)) if piece.isdecimal() else piece
        for piece in pieces
    ]
    return "".join(converted)
|
|
160
|
+
|
|
161
|
+
def _get_relative_translations(self, settings=None):
|
|
162
|
+
if settings.NORMALIZE:
|
|
163
|
+
if self._normalized_relative_translations is None:
|
|
164
|
+
self._normalized_relative_translations = (
|
|
165
|
+
self._generate_relative_translations(normalize=True)
|
|
166
|
+
)
|
|
167
|
+
return self._normalized_relative_translations
|
|
168
|
+
else:
|
|
169
|
+
if self._relative_translations is None:
|
|
170
|
+
self._relative_translations = self._generate_relative_translations(
|
|
171
|
+
normalize=False
|
|
172
|
+
)
|
|
173
|
+
return self._relative_translations
|
|
174
|
+
|
|
175
|
+
def _generate_relative_translations(self, normalize=False):
|
|
176
|
+
relative_translations = self.info.get("relative-type-regex", {})
|
|
177
|
+
relative_dictionary = OrderedDict()
|
|
178
|
+
for key, value in relative_translations.items():
|
|
179
|
+
if normalize:
|
|
180
|
+
value = list(map(normalize_unicode, value))
|
|
181
|
+
pattern = "|".join(sorted(value, key=len, reverse=True))
|
|
182
|
+
pattern = pattern.replace(r"(\d+", r"(?P<n>\d+")
|
|
183
|
+
pattern = re.compile(
|
|
184
|
+
r"^(?:{})$".format(pattern), re.UNICODE | re.IGNORECASE
|
|
185
|
+
)
|
|
186
|
+
relative_dictionary[pattern] = key
|
|
187
|
+
return relative_dictionary
|
|
188
|
+
|
|
189
|
+
def translate_search(self, search_string, settings=None):
    """Translate *search_string* chunk-by-chunk for date searching.

    Returns (translated, original): parallel lists of chunk strings, where
    each chunk is a maximal run of tokens recognised as date-related
    (dictionary words, digit tokens, timezone names).
    """
    dashes = ["-", "——", "—", "~"]
    word_joint_unsupported_languages = ["zh", "ja"]
    sentences = self._sentence_split(search_string, settings=settings)
    dictionary = self._get_dictionary(settings=settings)
    translated = []
    original = []
    for sentence in sentences:
        original_tokens, simplified_tokens = self._simplify_split_align(
            sentence, settings=settings
        )
        translated_chunk = []
        original_chunk = []
        last_token_index = len(simplified_tokens) - 1
        skip_next_token = False
        for i, word in enumerate(simplified_tokens):
            next_word = simplified_tokens[i + 1] if i < last_token_index else ""
            current_and_next_joined = self._join_chunk(
                [word, next_word], settings=settings
            )
            if skip_next_token:
                # The previous iteration already consumed this token as the
                # second half of a two-token phrase.
                skip_next_token = False
                continue

            if word == "" or word == " ":
                translated_chunk.append(word)
                original_chunk.append(original_tokens[i])
            elif (
                current_and_next_joined in dictionary
                and word not in dashes
                and self.shortname not in word_joint_unsupported_languages
            ):
                # Two adjacent tokens form a single dictionary phrase;
                # translate them together and skip the next token.
                translated_chunk.append(dictionary[current_and_next_joined])
                original_chunk.append(
                    self._join_chunk(
                        [original_tokens[i], original_tokens[i + 1]],
                        settings=settings,
                    )
                )
                skip_next_token = True
            elif word in dictionary and word not in dashes:
                translated_chunk.append(dictionary[word])
                original_chunk.append(original_tokens[i])
            elif word.strip("()\"'{}[],.،") in dictionary and word not in dashes:
                # Word is known once surrounding punctuation is stripped;
                # re-attach trailing punctuation to a non-empty translation.
                punct = word[len(word.strip("()\"'{}[],.،")) :]
                if punct and dictionary[word.strip("()\"'{}[],.،")]:
                    translated_chunk.append(
                        dictionary[word.strip("()\"'{}[],.،")] + punct
                    )
                else:
                    translated_chunk.append(dictionary[word.strip("()\"'{}[],.،")])
                original_chunk.append(original_tokens[i])
            elif self._token_with_digits_is_ok(word):
                translated_chunk.append(word)
                original_chunk.append(original_tokens[i])
            # Use original token because word_is_tz is case sensitive
            elif translated_chunk and word_is_tz(original_tokens[i]):
                translated_chunk.append(word)
                original_chunk.append(original_tokens[i])
            else:
                # Unrecognised token: close the current chunk, if any.
                if translated_chunk:
                    translated.append(translated_chunk)
                    translated_chunk = []
                    original.append(original_chunk)
                    original_chunk = []
        if translated_chunk:
            translated.append(translated_chunk)
            original.append(original_chunk)
    for i in range(len(translated)):
        if "in" in translated[i]:
            translated[i] = self._clear_future_words(translated[i])
        translated[i] = self._join_chunk(
            list(filter(bool, translated[i])), settings=settings
        )
        original[i] = self._join_chunk(
            list(filter(bool, original[i])), settings=settings
        )
    return translated, original
|
|
267
|
+
|
|
268
|
+
def _get_abbreviations(self, settings):
|
|
269
|
+
dictionary = self._get_dictionary(settings=settings)
|
|
270
|
+
abbreviations = []
|
|
271
|
+
if self._abbreviations is None:
|
|
272
|
+
for item in dictionary:
|
|
273
|
+
if item.endswith(".") and len(item) > 1:
|
|
274
|
+
abbreviations.append(item)
|
|
275
|
+
self._abbreviations = abbreviations
|
|
276
|
+
return self._abbreviations
|
|
277
|
+
|
|
278
|
+
def _sentence_split(self, string, settings):
    """Split *string* into sentences using locale-aware punctuation rules.

    Negative lookbehinds are prepended to the splitter regex so that
    periods belonging to known abbreviations (and, for some languages,
    digits) do not end a sentence. Returns a filter iterator of non-empty
    sentence strings.
    """
    abbreviations = self._get_abbreviations(settings=settings)
    digit_abbreviations = ["[0-9]"]  # numeric date with full stop
    abbreviation_string = ""

    for abbreviation in abbreviations:
        abbreviation_string += (
            "(?<! " + abbreviation[:-1] + ")"
        )  # negative lookbehind
    # For these languages a digit directly before "." is treated as part of
    # a numeric date (see digit_abbreviations), not as a sentence end.
    if self.shortname in ["fi", "cs", "hu", "de", "da"]:
        for digit_abbreviation in digit_abbreviations:
            abbreviation_string += (
                "(?<!" + digit_abbreviation + ")"
            )  # negative lookbehind

    # Splitter group chosen via the locale's "sentence_splitter_group" info;
    # group 1 is the default.
    splitters_dict = {
        1: r"[\.!?;…\r\n]+(?:\s|$)*",  # most European, Tagalog, Hebrew, Georgian,
        # Indonesian, Vietnamese
        2: r"[\.!?;…\r\n]+(\s*[¡¿]*|$)|[¡¿]+",  # Spanish
        3: r"[|!?;\r\n]+(?:\s|$)+",  # Hindi and Bangla
        4: r"[。…‥\.!??!;\r\n]+(?:\s|$)+",  # Japanese and Chinese
        5: r"[\r\n]+",  # Thai
        6: r"[\r\n؟!\.…]+(?:\s|$)+",
    }  # Arabic and Farsi
    if "sentence_splitter_group" not in self.info:
        split_reg = abbreviation_string + splitters_dict[1]
        sentences = re.split(split_reg, string)
    else:
        split_reg = (
            abbreviation_string
            + splitters_dict[self.info["sentence_splitter_group"]]
        )
        sentences = re.split(split_reg, string)

    sentences = filter(None, sentences)
    return sentences
|
|
314
|
+
|
|
315
|
+
def _simplify_split_align(self, original, settings):
    """Tokenize *original* and its simplified form, aligning the two lists.

    Simplification can merge, split, or rewrite tokens, so the shorter list
    is padded with empty strings until both lists have the same length and
    tokens at equal indices correspond to each other where possible.
    Returns (original_tokens, simplified_tokens).
    """
    # TODO: Switch to new split method.
    original_tokens = self._word_split(original, settings=settings)
    simplified_tokens = self._word_split(
        self._simplify(normalize_unicode(original), settings=settings),
        settings=settings,
    )
    if len(original_tokens) == len(simplified_tokens):
        return original_tokens, simplified_tokens

    elif len(original_tokens) < len(simplified_tokens):
        add_empty = False
        for i, token in enumerate(simplified_tokens):
            if i < len(original_tokens):
                if token == normalize_unicode(original_tokens[i].lower()):
                    add_empty = False
                else:
                    # NOTE(review): the first mismatch only toggles add_empty
                    # and is skipped — presumably tolerating a token rewritten
                    # in place by simplification; a second consecutive
                    # mismatch inserts a placeholder. Confirm intent.
                    if not add_empty:
                        add_empty = True
                        continue
                    else:
                        original_tokens.insert(i, "")
            else:
                original_tokens.insert(i, "")
    else:
        # Mirror image of the branch above, padding simplified_tokens.
        add_empty = False
        for i, token in enumerate(original_tokens):
            if i < len(simplified_tokens):
                if normalize_unicode(token.lower()) == simplified_tokens[i]:
                    add_empty = False
                else:
                    if not add_empty:
                        add_empty = True
                        continue
                    else:
                        simplified_tokens.insert(i, "")
            else:
                simplified_tokens.insert(i, "")

    # Lengths may still differ; drop surplus placeholders (first "" found)
    # until the lists are the same length.
    while len(original_tokens) != len(simplified_tokens):
        if len(original_tokens) > len(simplified_tokens):
            original_tokens.remove("")
        else:
            simplified_tokens.remove("")
    return original_tokens, simplified_tokens
|
|
360
|
+
|
|
361
|
+
def _get_split_dictionary(self, settings):
    """Return (and lazily cache) a dictionary keyed by single words.

    Multi-word dictionary keys are expanded via _split_dict so each
    individual word maps to the original entry's value.

    NOTE(review): on a cold cache this sets settings.NORMALIZE = True as a
    side effect, mutating the caller's settings object — confirm this is
    intentional before relying on settings after this call.
    """
    if self._split_dictionary is None:
        settings.NORMALIZE = True
        dictionary = self._get_dictionary(settings=settings)
        self._split_dictionary = self._split_dict(dictionary)
    return self._split_dictionary
|
|
367
|
+
|
|
368
|
+
def _split_dict(self, dictionary):
|
|
369
|
+
newdict = {}
|
|
370
|
+
for item in dictionary:
|
|
371
|
+
if " " in item:
|
|
372
|
+
items = item.split()
|
|
373
|
+
for i in items:
|
|
374
|
+
newdict[i] = dictionary[item]
|
|
375
|
+
else:
|
|
376
|
+
newdict[item] = dictionary[item]
|
|
377
|
+
return newdict
|
|
378
|
+
|
|
379
|
+
def _word_split(self, string, settings):
|
|
380
|
+
if "no_word_spacing" in self.info:
|
|
381
|
+
return self._split(string, keep_formatting=True, settings=settings)
|
|
382
|
+
else:
|
|
383
|
+
return string.split()
|
|
384
|
+
|
|
385
|
+
def _split(self, date_string, keep_formatting, settings=None):
    """Tokenize *date_string*: first on digit runs, then on known dictionary words."""
    digit_split = list(self._split_tokens_with_regex([date_string], r"(\d+)"))
    return list(
        self._split_tokens_by_known_words(
            digit_split, keep_formatting, settings=settings
        )
    )
|
|
394
|
+
|
|
395
|
+
def _split_tokens_with_regex(self, tokens, regex):
|
|
396
|
+
tokens = tokens[:]
|
|
397
|
+
for i, token in enumerate(tokens):
|
|
398
|
+
tokens[i] = re.split(regex, token)
|
|
399
|
+
return filter(bool, chain.from_iterable(tokens))
|
|
400
|
+
|
|
401
|
+
def _split_tokens_by_known_words(self, tokens, keep_formatting, settings=None):
    """Split every token on dictionary-known words and flatten the result.

    Note: *tokens* is mutated in place (each slot becomes a list of pieces)
    before flattening, mirroring the historical behavior.
    """
    vocab = self._get_dictionary(settings)
    tokens[:] = [vocab.split(token, keep_formatting) for token in tokens]
    return list(chain.from_iterable(tokens))
|
|
406
|
+
|
|
407
|
+
def _join_chunk(self, chunk, settings):
|
|
408
|
+
if "no_word_spacing" in self.info:
|
|
409
|
+
return self._join(chunk, separator="", settings=settings)
|
|
410
|
+
else:
|
|
411
|
+
return re.sub(r"\s{2,}", " ", " ".join(chunk))
|
|
412
|
+
|
|
413
|
+
def _token_with_digits_is_ok(self, token):
|
|
414
|
+
if "no_word_spacing" in self.info:
|
|
415
|
+
if re.search(r"[\d\.:\-/]+", token) is not None:
|
|
416
|
+
return True
|
|
417
|
+
else:
|
|
418
|
+
return False
|
|
419
|
+
|
|
420
|
+
else:
|
|
421
|
+
if re.search(r"\d+", token) is not None:
|
|
422
|
+
return True
|
|
423
|
+
else:
|
|
424
|
+
return False
|
|
425
|
+
|
|
426
|
+
def _simplify(self, date_string, settings=None):
|
|
427
|
+
date_string = date_string.lower()
|
|
428
|
+
simplifications = self._get_simplifications(settings=settings)
|
|
429
|
+
for simplification in simplifications:
|
|
430
|
+
pattern, replacement = list(simplification.items())[0]
|
|
431
|
+
date_string = pattern.sub(replacement, date_string).lower()
|
|
432
|
+
return date_string
|
|
433
|
+
|
|
434
|
+
def _get_simplifications(self, settings=None):
|
|
435
|
+
no_word_spacing = eval(self.info.get("no_word_spacing", "False"))
|
|
436
|
+
if settings.NORMALIZE:
|
|
437
|
+
if self._normalized_simplifications is None:
|
|
438
|
+
self._normalized_simplifications = []
|
|
439
|
+
simplifications = self._generate_simplifications(normalize=True)
|
|
440
|
+
for simplification in simplifications:
|
|
441
|
+
pattern, replacement = list(simplification.items())[0]
|
|
442
|
+
if not no_word_spacing:
|
|
443
|
+
pattern = r"(?<=\A|\W|_)%s(?=\Z|\W|_)" % pattern
|
|
444
|
+
pattern = re.compile(pattern, flags=re.I | re.U)
|
|
445
|
+
self._normalized_simplifications.append({pattern: replacement})
|
|
446
|
+
return self._normalized_simplifications
|
|
447
|
+
|
|
448
|
+
else:
|
|
449
|
+
if self._simplifications is None:
|
|
450
|
+
self._simplifications = []
|
|
451
|
+
simplifications = self._generate_simplifications(normalize=False)
|
|
452
|
+
for simplification in simplifications:
|
|
453
|
+
pattern, replacement = list(simplification.items())[0]
|
|
454
|
+
if not no_word_spacing:
|
|
455
|
+
pattern = r"(?<=\A|\W|_)%s(?=\Z|\W|_)" % pattern
|
|
456
|
+
pattern = re.compile(pattern, flags=re.I | re.U)
|
|
457
|
+
self._simplifications.append({pattern: replacement})
|
|
458
|
+
return self._simplifications
|
|
459
|
+
|
|
460
|
+
def _generate_simplifications(self, normalize=False):
|
|
461
|
+
simplifications = []
|
|
462
|
+
for simplification in self.info.get("simplifications", []):
|
|
463
|
+
c_simplification = {}
|
|
464
|
+
key, value = list(simplification.items())[0]
|
|
465
|
+
if normalize:
|
|
466
|
+
key = normalize_unicode(key)
|
|
467
|
+
|
|
468
|
+
if isinstance(value, int):
|
|
469
|
+
c_simplification[key] = str(value)
|
|
470
|
+
else:
|
|
471
|
+
c_simplification[key] = normalize_unicode(value) if normalize else value
|
|
472
|
+
|
|
473
|
+
simplifications.append(c_simplification)
|
|
474
|
+
return simplifications
|
|
475
|
+
|
|
476
|
+
def _clear_future_words(self, words):
|
|
477
|
+
freshness_words = {"day", "week", "month", "year", "hour", "minute", "second"}
|
|
478
|
+
if set(words).isdisjoint(freshness_words):
|
|
479
|
+
words.remove("in")
|
|
480
|
+
return words
|
|
481
|
+
|
|
482
|
+
def _join(self, tokens, separator=" ", settings=None):
|
|
483
|
+
if not tokens:
|
|
484
|
+
return ""
|
|
485
|
+
|
|
486
|
+
capturing_splitters = self._get_splitters(settings)["capturing"]
|
|
487
|
+
joined = tokens[0]
|
|
488
|
+
for i in range(1, len(tokens)):
|
|
489
|
+
left, right = tokens[i - 1], tokens[i]
|
|
490
|
+
if left not in capturing_splitters and right not in capturing_splitters:
|
|
491
|
+
joined += separator
|
|
492
|
+
joined += right
|
|
493
|
+
|
|
494
|
+
return joined
|
|
495
|
+
|
|
496
|
+
def _get_dictionary(self, settings=None):
|
|
497
|
+
if not settings.NORMALIZE:
|
|
498
|
+
if self._dictionary is None:
|
|
499
|
+
self._generate_dictionary()
|
|
500
|
+
self._dictionary._settings = settings
|
|
501
|
+
return self._dictionary
|
|
502
|
+
else:
|
|
503
|
+
if self._normalized_dictionary is None:
|
|
504
|
+
self._generate_normalized_dictionary()
|
|
505
|
+
self._normalized_dictionary._settings = settings
|
|
506
|
+
return self._normalized_dictionary
|
|
507
|
+
|
|
508
|
+
def _get_wordchars(self, settings=None):
|
|
509
|
+
if self._wordchars is None:
|
|
510
|
+
self._set_wordchars(settings)
|
|
511
|
+
return self._wordchars
|
|
512
|
+
|
|
513
|
+
def _get_splitters(self, settings=None):
|
|
514
|
+
if self._splitters is None:
|
|
515
|
+
self._set_splitters(settings)
|
|
516
|
+
return self._splitters
|
|
517
|
+
|
|
518
|
+
def _set_splitters(self, settings=None):
    """Compute and cache the splitter-token sets for this locale."""
    # Tokens that survive in the token stream after splitting:
    capturing = set(ALWAYS_KEEP_TOKENS)
    # Tokens that split a string only when not flanked by letters on
    # both sides:
    wordchar_splitters = set()

    wordchars = self._get_wordchars(settings)
    for token in set(self.info.get("skip", [])) | capturing:
        # Only tokens made purely of non-word characters can split, and
        # only when they also occur inside dictionary words.
        if re.match(r"^\W+$", token, re.UNICODE) and token in wordchars:
            wordchar_splitters.add(token)

    self._splitters = {"wordchars": wordchar_splitters, "capturing": capturing}
def _set_wordchars(self, settings=None):
|
|
538
|
+
wordchars = set()
|
|
539
|
+
for word in self._get_dictionary(settings):
|
|
540
|
+
if re.match(r"^[\W\d_]+$", word, re.UNICODE):
|
|
541
|
+
continue
|
|
542
|
+
for char in word:
|
|
543
|
+
wordchars.add(char.lower())
|
|
544
|
+
|
|
545
|
+
self._wordchars = wordchars - {" "} | {
|
|
546
|
+
"0",
|
|
547
|
+
"1",
|
|
548
|
+
"2",
|
|
549
|
+
"3",
|
|
550
|
+
"4",
|
|
551
|
+
"5",
|
|
552
|
+
"6",
|
|
553
|
+
"7",
|
|
554
|
+
"8",
|
|
555
|
+
"9",
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
def get_wordchars_for_detection(self, settings):
    """Return (and cache) the character set used for language detection.

    Digits, time punctuation, parentheses, the apostrophe, the letters
    of "am"/"pm"/"q" and the space are excluded: they are too common
    across languages to help tell them apart.
    """
    if self._wordchars_for_detection is None:
        chars = set()
        for word in self._get_dictionary(settings):
            # Entries of only punctuation/digits/underscores carry no
            # language signal.
            if re.match(r"^[\W\d_]+$", word, re.UNICODE):
                continue
            chars.update(char.lower() for char in word)
        self._wordchars_for_detection = chars - (
            set("0123456789") | {":", "(", ")", "'", "q", "a", "m", "p", " "}
        )
    return self._wordchars_for_detection
def _generate_dictionary(self, settings=None):
    # Build and cache the case-preserving translation dictionary from
    # this locale's data (see _get_dictionary for the lazy access path).
    self._dictionary = Dictionary(self.info, settings=settings)
def _generate_normalized_dictionary(self, settings=None):
    # Build and cache the Unicode-normalized translation dictionary,
    # used when settings.NORMALIZE is enabled.
    self._normalized_dictionary = NormalizedDictionary(self.info, settings=settings)
def to_parserinfo(self, base_cls=parser.parserinfo):
    """Build a dateutil ``parserinfo`` subclass from this locale's data.

    Returns a dynamically created class (not an instance) whose JUMP,
    PERTAIN, WEEKDAYS, MONTHS and HMS attributes are taken from
    ``self.info``.
    """
    attributes = {
        "JUMP": self.info.get("skip", []),
        "PERTAIN": self.info.get("pertain", []),
        "WEEKDAYS": [
            self.info["monday"],
            self.info["tuesday"],
            self.info["wednesday"],
            self.info["thursday"],
            self.info["friday"],
            self.info["saturday"],
            self.info["sunday"],
        ],
        "MONTHS": [
            self.info["january"],
            self.info["february"],
            self.info["march"],
            self.info["april"],
            self.info["may"],
            self.info["june"],
            self.info["july"],
            self.info["august"],
            self.info["september"],
            self.info["october"],
            self.info["november"],
            self.info["december"],
        ],
        "HMS": [self.info["hour"], self.info["minute"], self.info["second"]],
    }
    name = "{language}ParserInfo".format(language=self.info["name"])
    # Bug fix: type() takes bases and the namespace positionally —
    # `type(name, bases=[...], dict=...)` raises TypeError, and bases
    # must be a tuple rather than a list.
    return type(name, (base_cls,), attributes)