dateparser 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dateparser/__init__.py +82 -0
- dateparser/calendars/__init__.py +144 -0
- dateparser/calendars/hijri.py +6 -0
- dateparser/calendars/hijri_parser.py +60 -0
- dateparser/calendars/jalali.py +9 -0
- dateparser/calendars/jalali_parser.py +184 -0
- dateparser/conf.py +267 -0
- dateparser/custom_language_detection/__init__.py +0 -0
- dateparser/custom_language_detection/fasttext.py +43 -0
- dateparser/custom_language_detection/langdetect.py +37 -0
- dateparser/custom_language_detection/language_mapping.py +18 -0
- dateparser/data/__init__.py +2 -0
- dateparser/data/date_translation_data/__init__.py +0 -0
- dateparser/data/date_translation_data/af.py +242 -0
- dateparser/data/date_translation_data/agq.py +169 -0
- dateparser/data/date_translation_data/ak.py +169 -0
- dateparser/data/date_translation_data/am.py +222 -0
- dateparser/data/date_translation_data/ar.py +574 -0
- dateparser/data/date_translation_data/as.py +164 -0
- dateparser/data/date_translation_data/asa.py +168 -0
- dateparser/data/date_translation_data/ast.py +280 -0
- dateparser/data/date_translation_data/az-Cyrl.py +168 -0
- dateparser/data/date_translation_data/az-Latn.py +217 -0
- dateparser/data/date_translation_data/az.py +217 -0
- dateparser/data/date_translation_data/bas.py +169 -0
- dateparser/data/date_translation_data/be.py +340 -0
- dateparser/data/date_translation_data/bem.py +161 -0
- dateparser/data/date_translation_data/bez.py +169 -0
- dateparser/data/date_translation_data/bg.py +345 -0
- dateparser/data/date_translation_data/bm.py +167 -0
- dateparser/data/date_translation_data/bn.py +241 -0
- dateparser/data/date_translation_data/bo.py +185 -0
- dateparser/data/date_translation_data/br.py +226 -0
- dateparser/data/date_translation_data/brx.py +157 -0
- dateparser/data/date_translation_data/bs-Cyrl.py +226 -0
- dateparser/data/date_translation_data/bs-Latn.py +248 -0
- dateparser/data/date_translation_data/bs.py +248 -0
- dateparser/data/date_translation_data/ca.py +313 -0
- dateparser/data/date_translation_data/ce.py +225 -0
- dateparser/data/date_translation_data/cgg.py +169 -0
- dateparser/data/date_translation_data/chr.py +240 -0
- dateparser/data/date_translation_data/ckb.py +154 -0
- dateparser/data/date_translation_data/cs.py +316 -0
- dateparser/data/date_translation_data/cy.py +217 -0
- dateparser/data/date_translation_data/da.py +296 -0
- dateparser/data/date_translation_data/dav.py +169 -0
- dateparser/data/date_translation_data/de.py +357 -0
- dateparser/data/date_translation_data/dje.py +167 -0
- dateparser/data/date_translation_data/dsb.py +270 -0
- dateparser/data/date_translation_data/dua.py +169 -0
- dateparser/data/date_translation_data/dyo.py +168 -0
- dateparser/data/date_translation_data/dz.py +225 -0
- dateparser/data/date_translation_data/ebu.py +169 -0
- dateparser/data/date_translation_data/ee.py +233 -0
- dateparser/data/date_translation_data/el.py +279 -0
- dateparser/data/date_translation_data/en.py +851 -0
- dateparser/data/date_translation_data/eo.py +169 -0
- dateparser/data/date_translation_data/es.py +499 -0
- dateparser/data/date_translation_data/et.py +233 -0
- dateparser/data/date_translation_data/eu.py +219 -0
- dateparser/data/date_translation_data/ewo.py +169 -0
- dateparser/data/date_translation_data/fa.py +270 -0
- dateparser/data/date_translation_data/ff.py +179 -0
- dateparser/data/date_translation_data/fi.py +345 -0
- dateparser/data/date_translation_data/fil.py +223 -0
- dateparser/data/date_translation_data/fo.py +256 -0
- dateparser/data/date_translation_data/fr.py +520 -0
- dateparser/data/date_translation_data/fur.py +223 -0
- dateparser/data/date_translation_data/fy.py +223 -0
- dateparser/data/date_translation_data/ga.py +238 -0
- dateparser/data/date_translation_data/gd.py +277 -0
- dateparser/data/date_translation_data/gl.py +253 -0
- dateparser/data/date_translation_data/gsw.py +179 -0
- dateparser/data/date_translation_data/gu.py +216 -0
- dateparser/data/date_translation_data/guz.py +170 -0
- dateparser/data/date_translation_data/gv.py +166 -0
- dateparser/data/date_translation_data/ha.py +176 -0
- dateparser/data/date_translation_data/haw.py +168 -0
- dateparser/data/date_translation_data/he.py +371 -0
- dateparser/data/date_translation_data/hi.py +261 -0
- dateparser/data/date_translation_data/hr.py +378 -0
- dateparser/data/date_translation_data/hsb.py +271 -0
- dateparser/data/date_translation_data/hu.py +297 -0
- dateparser/data/date_translation_data/hy.py +246 -0
- dateparser/data/date_translation_data/id.py +272 -0
- dateparser/data/date_translation_data/ig.py +168 -0
- dateparser/data/date_translation_data/ii.py +157 -0
- dateparser/data/date_translation_data/is.py +242 -0
- dateparser/data/date_translation_data/it.py +282 -0
- dateparser/data/date_translation_data/ja.py +286 -0
- dateparser/data/date_translation_data/jgo.py +188 -0
- dateparser/data/date_translation_data/jmc.py +168 -0
- dateparser/data/date_translation_data/ka.py +241 -0
- dateparser/data/date_translation_data/kab.py +169 -0
- dateparser/data/date_translation_data/kam.py +169 -0
- dateparser/data/date_translation_data/kde.py +169 -0
- dateparser/data/date_translation_data/kea.py +230 -0
- dateparser/data/date_translation_data/khq.py +167 -0
- dateparser/data/date_translation_data/ki.py +169 -0
- dateparser/data/date_translation_data/kk.py +228 -0
- dateparser/data/date_translation_data/kl.py +213 -0
- dateparser/data/date_translation_data/kln.py +171 -0
- dateparser/data/date_translation_data/km.py +198 -0
- dateparser/data/date_translation_data/kn.py +225 -0
- dateparser/data/date_translation_data/ko.py +207 -0
- dateparser/data/date_translation_data/kok.py +157 -0
- dateparser/data/date_translation_data/ks.py +152 -0
- dateparser/data/date_translation_data/ksb.py +168 -0
- dateparser/data/date_translation_data/ksf.py +169 -0
- dateparser/data/date_translation_data/ksh.py +192 -0
- dateparser/data/date_translation_data/kw.py +169 -0
- dateparser/data/date_translation_data/ky.py +240 -0
- dateparser/data/date_translation_data/lag.py +169 -0
- dateparser/data/date_translation_data/lb.py +233 -0
- dateparser/data/date_translation_data/lg.py +169 -0
- dateparser/data/date_translation_data/lkt.py +194 -0
- dateparser/data/date_translation_data/ln.py +179 -0
- dateparser/data/date_translation_data/lo.py +228 -0
- dateparser/data/date_translation_data/lrc.py +154 -0
- dateparser/data/date_translation_data/lt.py +263 -0
- dateparser/data/date_translation_data/lu.py +169 -0
- dateparser/data/date_translation_data/luo.py +169 -0
- dateparser/data/date_translation_data/luy.py +168 -0
- dateparser/data/date_translation_data/lv.py +257 -0
- dateparser/data/date_translation_data/mas.py +173 -0
- dateparser/data/date_translation_data/mer.py +168 -0
- dateparser/data/date_translation_data/mfe.py +166 -0
- dateparser/data/date_translation_data/mg.py +168 -0
- dateparser/data/date_translation_data/mgh.py +169 -0
- dateparser/data/date_translation_data/mgo.py +151 -0
- dateparser/data/date_translation_data/mk.py +234 -0
- dateparser/data/date_translation_data/ml.py +217 -0
- dateparser/data/date_translation_data/mn.py +224 -0
- dateparser/data/date_translation_data/mr.py +229 -0
- dateparser/data/date_translation_data/ms.py +242 -0
- dateparser/data/date_translation_data/mt.py +175 -0
- dateparser/data/date_translation_data/mua.py +169 -0
- dateparser/data/date_translation_data/my.py +203 -0
- dateparser/data/date_translation_data/mzn.py +199 -0
- dateparser/data/date_translation_data/naq.py +169 -0
- dateparser/data/date_translation_data/nb.py +261 -0
- dateparser/data/date_translation_data/nd.py +169 -0
- dateparser/data/date_translation_data/ne.py +207 -0
- dateparser/data/date_translation_data/nl.py +273 -0
- dateparser/data/date_translation_data/nmg.py +169 -0
- dateparser/data/date_translation_data/nn.py +231 -0
- dateparser/data/date_translation_data/nnh.py +150 -0
- dateparser/data/date_translation_data/nus.py +166 -0
- dateparser/data/date_translation_data/nyn.py +169 -0
- dateparser/data/date_translation_data/om.py +173 -0
- dateparser/data/date_translation_data/or.py +157 -0
- dateparser/data/date_translation_data/os.py +203 -0
- dateparser/data/date_translation_data/pa-Arab.py +150 -0
- dateparser/data/date_translation_data/pa-Guru.py +221 -0
- dateparser/data/date_translation_data/pa.py +221 -0
- dateparser/data/date_translation_data/pl.py +416 -0
- dateparser/data/date_translation_data/ps.py +150 -0
- dateparser/data/date_translation_data/pt.py +981 -0
- dateparser/data/date_translation_data/qu.py +176 -0
- dateparser/data/date_translation_data/rm.py +166 -0
- dateparser/data/date_translation_data/rn.py +169 -0
- dateparser/data/date_translation_data/ro.py +270 -0
- dateparser/data/date_translation_data/rof.py +157 -0
- dateparser/data/date_translation_data/ru.py +442 -0
- dateparser/data/date_translation_data/rw.py +169 -0
- dateparser/data/date_translation_data/rwk.py +168 -0
- dateparser/data/date_translation_data/sah.py +219 -0
- dateparser/data/date_translation_data/saq.py +169 -0
- dateparser/data/date_translation_data/sbp.py +169 -0
- dateparser/data/date_translation_data/se.py +280 -0
- dateparser/data/date_translation_data/seh.py +169 -0
- dateparser/data/date_translation_data/ses.py +167 -0
- dateparser/data/date_translation_data/sg.py +169 -0
- dateparser/data/date_translation_data/shi-Latn.py +169 -0
- dateparser/data/date_translation_data/shi-Tfng.py +169 -0
- dateparser/data/date_translation_data/shi.py +169 -0
- dateparser/data/date_translation_data/si.py +220 -0
- dateparser/data/date_translation_data/sk.py +327 -0
- dateparser/data/date_translation_data/sl.py +244 -0
- dateparser/data/date_translation_data/smn.py +176 -0
- dateparser/data/date_translation_data/sn.py +169 -0
- dateparser/data/date_translation_data/so.py +179 -0
- dateparser/data/date_translation_data/sq.py +237 -0
- dateparser/data/date_translation_data/sr-Cyrl.py +306 -0
- dateparser/data/date_translation_data/sr-Latn.py +306 -0
- dateparser/data/date_translation_data/sr.py +255 -0
- dateparser/data/date_translation_data/sv.py +309 -0
- dateparser/data/date_translation_data/sw.py +231 -0
- dateparser/data/date_translation_data/ta.py +264 -0
- dateparser/data/date_translation_data/te.py +239 -0
- dateparser/data/date_translation_data/teo.py +173 -0
- dateparser/data/date_translation_data/th.py +300 -0
- dateparser/data/date_translation_data/ti.py +173 -0
- dateparser/data/date_translation_data/tl.py +137 -0
- dateparser/data/date_translation_data/to.py +216 -0
- dateparser/data/date_translation_data/tr.py +259 -0
- dateparser/data/date_translation_data/twq.py +167 -0
- dateparser/data/date_translation_data/tzm.py +169 -0
- dateparser/data/date_translation_data/ug.py +203 -0
- dateparser/data/date_translation_data/uk.py +502 -0
- dateparser/data/date_translation_data/ur.py +256 -0
- dateparser/data/date_translation_data/uz-Arab.py +167 -0
- dateparser/data/date_translation_data/uz-Cyrl.py +210 -0
- dateparser/data/date_translation_data/uz-Latn.py +216 -0
- dateparser/data/date_translation_data/uz.py +216 -0
- dateparser/data/date_translation_data/vi.py +260 -0
- dateparser/data/date_translation_data/vun.py +168 -0
- dateparser/data/date_translation_data/wae.py +224 -0
- dateparser/data/date_translation_data/xog.py +169 -0
- dateparser/data/date_translation_data/yav.py +169 -0
- dateparser/data/date_translation_data/yi.py +178 -0
- dateparser/data/date_translation_data/yo.py +263 -0
- dateparser/data/date_translation_data/yue.py +203 -0
- dateparser/data/date_translation_data/zgh.py +169 -0
- dateparser/data/date_translation_data/zh-Hans.py +240 -0
- dateparser/data/date_translation_data/zh-Hant.py +402 -0
- dateparser/data/date_translation_data/zh.py +273 -0
- dateparser/data/date_translation_data/zu.py +196 -0
- dateparser/data/languages_info.py +826 -0
- dateparser/date.py +599 -0
- dateparser/date_parser.py +55 -0
- dateparser/freshness_date_parser.py +156 -0
- dateparser/languages/__init__.py +2 -0
- dateparser/languages/dictionary.py +352 -0
- dateparser/languages/loader.py +224 -0
- dateparser/languages/locale.py +625 -0
- dateparser/languages/validation.py +467 -0
- dateparser/parser.py +742 -0
- dateparser/search/__init__.py +71 -0
- dateparser/search/detection.py +78 -0
- dateparser/search/search.py +297 -0
- dateparser/search/text_detection.py +89 -0
- dateparser/timezone_parser.py +91 -0
- dateparser/timezones.py +469 -0
- dateparser/utils/__init__.py +257 -0
- dateparser/utils/strptime.py +108 -0
- dateparser-1.2.1.dist-info/AUTHORS.rst +17 -0
- dateparser-1.2.1.dist-info/LICENSE +12 -0
- dateparser-1.2.1.dist-info/METADATA +864 -0
- dateparser-1.2.1.dist-info/RECORD +256 -0
- dateparser-1.2.1.dist-info/WHEEL +5 -0
- dateparser-1.2.1.dist-info/entry_points.txt +2 -0
- dateparser-1.2.1.dist-info/top_level.txt +4 -0
- dateparser_cli/__init__.py +0 -0
- dateparser_cli/cli.py +36 -0
- dateparser_cli/exceptions.py +2 -0
- dateparser_cli/fasttext_manager.py +42 -0
- dateparser_cli/utils.py +27 -0
- dateparser_data/__init__.py +0 -0
- dateparser_data/settings.py +33 -0
- dateparser_scripts/__init__.py +0 -0
- dateparser_scripts/get_cldr_data.py +567 -0
- dateparser_scripts/order_languages.py +217 -0
- dateparser_scripts/update_supported_languages_and_locales.py +48 -0
- dateparser_scripts/utils.py +73 -0
- dateparser_scripts/write_complete_data.py +129 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from dateparser.search.search import DateSearchWithDetection
|
|
2
|
+
|
|
3
|
+
_search_with_detection = DateSearchWithDetection()
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def search_dates(
    text,
    languages=None,
    settings=None,
    add_detected_language=False,
    detect_languages_function=None,
):
    """Locate every date/time expression inside *text* and parse each one.

    :param text:
        Natural-language string that may contain date and/or time expressions.
    :type text: str

    :param languages:
        List of two-letter language codes, e.g. ``['en', 'es']``. When given,
        no language detection is attempted.
    :type languages: list

    :param settings:
        Custom behaviour, using the settings defined in
        :mod:`dateparser.conf.Settings`.
    :type settings: dict

    :param add_detected_language:
        When true, the detected language is appended to every returned tuple.
    :type add_detected_language: bool

    :param detect_languages_function:
        Callable taking a ``text`` and a ``confidence_threshold`` and returning
        a list of detected language codes.
        Note: it is only used if ``languages`` is not provided.
    :type detect_languages_function: function

    :return: List of tuples holding the matched substring, the corresponding
        :mod:`datetime.datetime` object and, if *add_detected_language* is
        true, the detected language.
        ``None`` when no parsable date was found.
    :rtype: list
    :raises: ValueError - Unknown Language

    >>> from dateparser.search import search_dates
    >>> search_dates('The first artificial Earth satellite was launched on 4 October 1957.')
    [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0))]

    >>> search_dates('The first artificial Earth satellite was launched on 4 October 1957.',
    ...              add_detected_language=True)
    [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0), 'en')]

    >>> search_dates("The client arrived to the office for the first time in March 3rd, 2004 "
    ...              "and got serviced, after a couple of months, on May 6th 2004, the customer "
    ...              "returned indicating a defect on the part")
    [('in March 3rd, 2004 and', datetime.datetime(2004, 3, 3, 0, 0)),
     ('on May 6th 2004', datetime.datetime(2004, 5, 6, 0, 0))]

    """
    outcome = _search_with_detection.search_dates(
        text=text,
        languages=languages,
        settings=settings,
        detect_languages_function=detect_languages_function,
    )

    found = outcome.get("Dates")
    if not found:
        # Covers both "Dates" being None and an empty list of matches.
        return None
    if not add_detected_language:
        return found

    detected = outcome.get("Language")
    return [entry + (detected,) for entry in found]
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from functools import wraps
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _restore_languages_on_generator_exit(method):
    """Wrap a generator method so ``self.languages`` is reset after full iteration.

    A copy of ``self.languages`` is taken when the wrapped generator starts
    running. Once the underlying generator has been consumed to the end, the
    list's contents are put back in place (slice assignment, so the list
    object itself is kept). If the consumer stops iterating early, the
    restore step is never reached and any changes made so far remain.
    """

    @wraps(method)
    def wrapped(self, *args, **kwargs):
        snapshot = self.languages[:]
        for language in method(self, *args, **kwargs):
            yield language
        # Only reached when the loop above ran to exhaustion.
        self.languages[:] = snapshot

    return wrapped
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BaseLanguageDetector:
    """Yield, from a list of language objects, those applicable to a date string."""

    def __init__(self, languages):
        # Work on a copy so the caller's sequence is never mutated.
        self.languages = languages[:]

    @_restore_languages_on_generator_exit
    def iterate_applicable_languages(self, date_string, settings=None, modify=False):
        """Yield each language whose ``is_applicable`` accepts *date_string*.

        With ``modify=True`` the filtering consumes ``self.languages`` itself;
        otherwise it consumes a throw-away copy.
        """
        candidates = self.languages if modify else self.languages[:]
        yield from self._filter_languages(date_string, candidates, settings)

    @staticmethod
    def _filter_languages(date_string, languages, settings=None):
        """Consume *languages* front to back, yielding the applicable ones.

        The list passed in is emptied as iteration proceeds: each entry is
        popped after it has been examined (and yielded, if applicable).
        """
        while languages:
            candidate = languages[0]
            # Try with the timezone left in place first; fall back to the
            # stripped variant only if that fails.
            if candidate.is_applicable(
                date_string, strip_timezone=False, settings=settings
            ) or candidate.is_applicable(
                date_string, strip_timezone=True, settings=settings
            ):
                yield candidate

            languages.pop(0)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class AutoDetectLanguage(BaseLanguageDetector):
    """Language detector that can optionally retry with the rest of its pool."""

    def __init__(self, languages, allow_redetection=False):
        super().__init__(languages=languages[:])
        # Full set of languages originally supplied; used for the second pass.
        self.language_pool = languages[:]
        self.allow_redetection = allow_redetection

    @_restore_languages_on_generator_exit
    def iterate_applicable_languages(self, date_string, modify=False, settings=None):
        """Yield applicable languages, then (optionally) untried pool languages.

        The first pass filters the current languages. When
        ``allow_redetection`` is set, a second pass filters every pool
        language that was not part of the first pass.
        """
        first_pass = self.languages if modify else self.languages[:]
        already_tried = first_pass[:]
        yield from self._filter_languages(date_string, first_pass, settings=settings)

        if self.allow_redetection:
            # Try the languages that were not tried before with this date_string.
            second_pass = [
                candidate
                for candidate in self.language_pool
                if candidate not in already_tried
            ]
            if modify:
                self.languages = second_pass

            yield from self._filter_languages(
                date_string, second_pass, settings=settings
            )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class ExactLanguages(BaseLanguageDetector):
    """Detector restricted to an explicitly supplied list of languages."""

    def __init__(self, languages):
        if languages is None:
            raise ValueError("language cannot be None for ExactLanguages")
        super().__init__(languages=languages)

    @_restore_languages_on_generator_exit
    def iterate_applicable_languages(self, date_string, modify=False, settings=None):
        """Yield the applicable languages without ever consuming the stored list.

        The *modify* argument is accepted for signature compatibility but is
        not forwarded: the parent implementation is always called with
        ``modify=False``.
        """
        applicable = super().iterate_applicable_languages(
            date_string, modify=False, settings=settings
        )
        yield from applicable
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
from collections.abc import Set
|
|
2
|
+
|
|
3
|
+
import regex as re
|
|
4
|
+
|
|
5
|
+
from dateparser.conf import Settings, apply_settings, check_settings
|
|
6
|
+
from dateparser.custom_language_detection.language_mapping import map_languages
|
|
7
|
+
from dateparser.date import DateDataParser
|
|
8
|
+
from dateparser.languages.loader import LocaleDataLoader
|
|
9
|
+
from dateparser.search.text_detection import FullTextLanguageDetector
|
|
10
|
+
|
|
11
|
+
# Keywords whose presence marks a translated date expression as relative.
# The pattern has no word boundaries, so it also matches inside longer words.
RELATIVE_REG = re.compile("(ago|in|from now|tomorrow|today|yesterday)")


def date_is_relative(translation):
    """Return True when *translation* contains any of the relative-date keywords."""
    return RELATIVE_REG.search(translation) is not None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class _ExactLanguageSearch:
    """Find and parse date expressions in a text for one given language."""

    def __init__(self, loader):
        self.loader = loader
        # Locale object for the most recently requested shortname.
        self.language = None

    def get_current_language(self, shortname):
        """Point ``self.language`` at the locale for *shortname*.

        The loader is only asked for a locale when none is cached yet or the
        cached one has a different shortname.
        """
        cached = self.language
        if cached is not None and cached.shortname == shortname:
            return
        self.language = self.loader.get_locale(shortname)

    def search(self, shortname, text, settings):
        """Return whatever the locale's ``translate_search`` yields for *text*."""
        self.get_current_language(shortname)
        return self.language.translate_search(text, settings=settings)

    @staticmethod
    def set_relative_base(substring, already_parsed):
        """Pick the date of the latest non-relative entry as the relative base.

        *already_parsed* holds ``(date_data, is_relative)`` pairs. The result
        is ``(substring, base)``, where ``base`` is the ``"date_obj"`` of the
        last entry whose flag is falsy, or ``None`` when there is no such
        entry.
        """
        for entry in reversed(already_parsed):
            if not entry[1]:
                return substring, entry[0]["date_obj"]
        return substring, None

    def choose_best_split(self, possible_parsed_splits, possible_substrings_splits):
        """Return the ``(parsed, substrings)`` pair of the best-rated split.

        Splits are ranked by, in order: the fraction of pieces that did not
        parse, the number of substrings, and the fraction of substrings
        containing no digit. The lowest ranking wins; ties go to the earliest
        split.
        """
        sort_keys = []
        for index, parsed_split in enumerate(possible_parsed_splits):
            fragments = possible_substrings_splits[index]
            total = len(fragments)
            unparsed = 0
            digitless = 0
            for position, entry in enumerate(parsed_split):
                if entry[0]["date_obj"] is None:
                    unparsed += 1
                if not any(char.isdigit() for char in fragments[position]):
                    digitless += 1
            # A zero count short-circuits the division, which also keeps an
            # empty split (total == 0) from dividing by zero.
            unparsed_ratio = unparsed / total if unparsed else 0
            digitless_ratio = digitless / total if digitless else 0
            sort_keys.append((unparsed_ratio, total, digitless_ratio))

        best_index = min(range(len(sort_keys)), key=sort_keys.__getitem__)
        return (
            possible_parsed_splits[best_index],
            possible_substrings_splits[best_index],
        )

    def split_by(self, item, original, splitter):
        """Build candidate splits of *item* and *original* on *splitter*.

        Each candidate is ``[translated_pieces, original_pieces]``. The full
        split always comes first. When the splitter occurs more than twice,
        two coarser candidates follow, made by re-joining consecutive pieces
        in groups of two and of three.
        """
        translated_parts = item.split(splitter)
        original_parts = original.split(splitter)
        candidates = [[translated_parts, original_parts]]
        if item.count(splitter) <= 2:
            return candidates

        for group_size in (2, 3):
            starts = range(0, len(translated_parts), group_size)
            candidates.append(
                [
                    [
                        splitter.join(translated_parts[start : start + group_size])
                        for start in starts
                    ],
                    [
                        splitter.join(original_parts[start : start + group_size])
                        for start in starts
                    ],
                ]
            )
        return candidates

    def split_if_not_parsed(self, item, original):
        """Collect candidate splits for every usable splitter.

        A splitter is used only when it occurs in *item* and occurs the same
        number of times in *item* and *original*, so the pieces line up.
        """
        candidates = []
        for splitter in (",", "،", "——", "—", "–", ".", " "):
            if splitter not in item:
                continue
            if item.count(splitter) != original.count(splitter):
                continue
            candidates.extend(self.split_by(item, original, splitter))
        return candidates

    def parse_item(self, parser, item, translated_item, parsed, need_relative_base):
        """Parse one candidate string and report whether it looks relative.

        Returns ``(date_data, is_relative)``. When a relative base can be
        taken from *parsed*, it is written to the parser's settings and the
        string is parsed a second time.
        """
        # Occurrences of these two substrings are removed before parsing.
        item = item.replace("ngày", "").replace("am", "")
        parsed_item = parser.get_date_data(item)
        is_relative = date_is_relative(translated_item)

        relative_base = None
        if need_relative_base:
            _, relative_base = self.set_relative_base(item, parsed)

        if relative_base:
            # NOTE(review): this mutates the parser's settings object in
            # place; whether that object is shared with other callers is not
            # visible from here and should be confirmed.
            parser._settings.RELATIVE_BASE = relative_base
            parsed_item = parser.get_date_data(item)
        return parsed_item, is_relative

    def parse_found_objects(self, parser, to_parse, original, translated, settings):
        """Parse every candidate, splitting those that fail to parse whole.

        Returns ``(parsed, substrings)``: the ``(date_data, is_relative)``
        pairs that produced a date, and the matching original substrings.
        """
        parsed = []
        substrings = []
        # A relative base is only derived from earlier matches when the
        # settings do not already provide one.
        need_relative_base = not settings.RELATIVE_BASE

        for index, item in enumerate(to_parse):
            if len(item) <= 2:
                continue

            parsed_item, is_relative = self.parse_item(
                parser, item, translated[index], parsed, need_relative_base
            )
            if parsed_item["date_obj"]:
                parsed.append((parsed_item, is_relative))
                substrings.append(original[index].strip(" .,:()[]-'"))
                continue

            possible_splits = self.split_if_not_parsed(item, original[index])
            if not possible_splits:
                continue

            possible_parsed = []
            possible_substrings = []
            for split_translated, split_original in possible_splits:
                current_parsed = []
                current_substrings = []
                for position, piece in enumerate(split_translated or ()):
                    if len(piece) <= 2:
                        continue
                    parsed_piece, piece_is_relative = self.parse_item(
                        parser,
                        piece,
                        split_translated[position],
                        current_parsed,
                        need_relative_base,
                    )
                    # Kept even when unparsed: choose_best_split rates a
                    # split by how many of its pieces failed.
                    current_parsed.append((parsed_piece, piece_is_relative))
                    current_substrings.append(
                        split_original[position].strip(" .,:()[]-")
                    )
                possible_parsed.append(current_parsed)
                possible_substrings.append(current_substrings)

            parsed_best, substrings_best = self.choose_best_split(
                possible_parsed, possible_substrings
            )
            for entry, fragment in zip(parsed_best, substrings_best):
                if entry[0]["date_obj"]:
                    parsed.append(entry)
                    substrings.append(fragment)
        return parsed, substrings

    def search_parse(self, shortname, text, settings):
        """Return ``(substring, date_obj)`` tuples for the dates found in *text*."""
        translated, original = self.search(shortname, text, settings)

        # splitting done by spaces and some dictionary items contain spaces
        bad_translate_with_search = ("vi", "hu")
        if shortname in bad_translate_with_search:
            languages = [shortname]
            to_parse = original
        else:
            languages = ["en"]
            to_parse = translated

        parser = DateDataParser(languages=languages, settings=settings)
        parsed, substrings = self.parse_found_objects(
            parser=parser,
            to_parse=to_parse,
            original=original,
            translated=translated,
            settings=settings,
        )
        parser._settings = Settings()
        return [
            (fragment, entry[0]["date_obj"])
            for fragment, entry in zip(substrings, parsed)
        ]
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class DateSearchWithDetection:
    """
    Detect the language of a natural-language text, translate it, find the
    substrings that represent a date and/or time, and parse those substrings.
    """

    def __init__(self):
        self.loader = LocaleDataLoader()
        self.available_language_map = self.loader.get_locale_map()
        self.search = _ExactLanguageSearch(self.loader)

    @apply_settings
    def detect_language(
        self, text, languages, settings=None, detect_languages_function=None
    ):
        """Return the language to use for *text*, or ``None`` if none is found.

        When *detect_languages_function* is given and *languages* is empty,
        the first language it reports (after mapping, with
        ``settings.DEFAULT_LANGUAGES`` as fallback) is returned. Otherwise the
        given languages are validated and the full-text detector chooses
        among them, or among all available languages when none were given.

        :raises ValueError: if any entry of *languages* is not a known code.
        :raises TypeError: if *languages* is neither ``None`` nor a list,
            tuple or set.
        """
        if detect_languages_function and not languages:
            threshold = settings.LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD
            candidates = detect_languages_function(
                text,
                confidence_threshold=threshold,
            )
            candidates = map_languages(candidates) or settings.DEFAULT_LANGUAGES
            if candidates:
                return candidates[0]
            return None

        if isinstance(languages, (list, tuple, Set)):
            known = self.available_language_map
            unknown_codes = [code for code in languages if code not in known]
            if unknown_codes:
                unsupported_languages = set(languages) - set(known.keys())
                raise ValueError(
                    "Unknown language(s): %s"
                    % ", ".join(map(repr, unsupported_languages))
                )
            languages = [known[code] for code in languages]
        elif languages is not None:
            raise TypeError(
                "languages argument must be a list (%r given)" % type(languages)
            )

        # An empty or missing selection means "consider every known language".
        pool = languages if languages else list(self.available_language_map.values())
        self.language_detector = FullTextLanguageDetector(languages=pool)

        detected_language = self.language_detector._best_language(text)
        if detected_language:
            return detected_language
        if settings.DEFAULT_LANGUAGES:
            return settings.DEFAULT_LANGUAGES[0]
        return None

    @apply_settings
    def search_dates(
        self, text, languages=None, settings=None, detect_languages_function=None
    ):
        """
        Find all substrings of the given string which represent date and/or
        time and parse them.

        :param text:
            A string in a natural language which may contain date and/or time
            expressions.
        :type text: str

        :param languages:
            A list of two-letter language codes, e.g. ['en', 'es']. If
            languages are given, no language detection is attempted.
        :type languages: list

        :param settings:
            Configure customized behavior using settings defined in
            :mod:`dateparser.conf.Settings`.
        :type settings: dict

        :param detect_languages_function:
            A function for language detection that takes as input a `text`
            and a `confidence_threshold`, and returns a list of detected
            language codes.
        :type detect_languages_function: function

        :return: a dict with the detected two-letter language code under
            ``'Language'`` and, under ``'Dates'``, a list of tuples pairing
            each date substring with its :mod:`datetime.datetime` object.
            For example:
            {'Language': 'en', 'Dates': [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0))]}
            If the language of the string isn't recognised, returns:
            {'Language': None, 'Dates': None}
        :raises: ValueError - Unknown Language
        """
        check_settings(settings)

        language_shortname = self.detect_language(
            text=text,
            languages=languages,
            settings=settings,
            detect_languages_function=detect_languages_function,
        )
        if not language_shortname:
            return {"Language": None, "Dates": None}

        dates = self.search.search_parse(language_shortname, text, settings=settings)
        return {"Language": language_shortname, "Dates": dates}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from dateparser.conf import apply_settings
|
|
2
|
+
from dateparser.search.detection import BaseLanguageDetector
|
|
3
|
+
from dateparser.utils import normalize_unicode
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FullTextLanguageDetector(BaseLanguageDetector):
    """Pick the most likely language of a whole text from a list of candidates.

    Candidates are first narrowed by the characters present in the text
    (:meth:`character_check`) and then ranked by the counts returned from
    each language's ``count_applicability`` (:meth:`_best_language`).
    """

    # Characters that say nothing about the language: digits, the space and
    # common date punctuation.
    _SYMBOL_CHARS = frozenset("0123456789 /-)(.:\\,'")

    def __init__(self, languages):
        # NOTE(review): the two-argument form starts the MRO lookup *after*
        # BaseLanguageDetector, so BaseLanguageDetector.__init__ is not run.
        # Presumably deliberate because ``languages`` is assigned right
        # below; confirm before changing.
        super(BaseLanguageDetector, self).__init__()
        self.languages = languages[:]  # copy: the list is narrowed later
        self.language_unique_chars = []
        self.language_chars = []

    def get_unique_characters(self, settings):
        """Compute the character sets used by :meth:`character_check`.

        Fills ``self.language_chars`` with one character set per entry of
        ``self.languages`` (as returned by ``get_wordchars_for_detection``
        with ``NORMALIZE=False``) and ``self.language_unique_chars`` with the
        characters of each set that appear in no *different* set.

        Both lists are rebuilt on every call so that they always line up
        index-for-index with the current ``self.languages``, even when the
        detector is reused after ``self.languages`` has been narrowed.

        :param settings: settings object; must provide ``replace``.
        """
        settings = settings.replace(NORMALIZE=False)

        language_chars = [
            language.get_wordchars_for_detection(settings=settings)
            for language in self.languages
        ]

        language_unique_chars = []
        for char_set in language_chars:
            unique_chars = char_set
            for other_char_set in language_chars:
                # Equal sets (including the set itself) are not subtracted.
                if other_char_set != char_set:
                    unique_chars = unique_chars - other_char_set
            language_unique_chars.append(unique_chars)

        self.language_chars = language_chars
        self.language_unique_chars = language_unique_chars

    def character_check(self, date_string, settings):
        """Narrow ``self.languages`` using the characters of ``date_string``.

        * If the text consists only of digits/punctuation, keep the first
          candidate language.
        * Otherwise, if the text contains a character unique to one
          candidate, keep only that candidate (first match wins).
        * Otherwise, drop every candidate that shares no character with the
          text.

        :param date_string: text to inspect.
        :param settings: settings object forwarded to
            :meth:`get_unique_characters`.
        """
        lowered = date_string.lower()
        date_string_set = set(lowered)

        if date_string_set <= self._SYMBOL_CHARS:
            self.languages = [self.languages[0]]
            return

        self.get_unique_characters(settings=settings)

        for language, unique_chars in zip(
            self.languages, self.language_unique_chars
        ):
            if any(char.lower() in lowered for char in unique_chars):
                self.languages = [language]
                return

        self.languages = [
            language
            for language, chars in zip(self.languages, self.language_chars)
            if date_string_set & chars
        ]

    @apply_settings
    def _best_language(self, date_string, settings=None):
        """Return the shortname of the best matching language, or ``None``.

        After :meth:`character_check`, a single remaining candidate is
        returned directly. Otherwise every candidate is scored with
        ``count_applicability`` (first with ``strip_timezone=False`` and, if
        both counts are zero, again with ``strip_timezone=True``); the
        candidate with the largest ``(count[0], count[1])`` pair wins.

        :param date_string: text whose language should be detected.
        :param settings: settings object (supplied through ``apply_settings``).
        :return: language shortname, or ``None`` when no candidate applies.
        """
        self.character_check(date_string, settings)
        date_string = normalize_unicode(date_string.lower())

        if len(self.languages) == 1:
            return self.languages[0].shortname

        applicable_languages = []
        for language in self.languages:
            for strip_timezone in (False, True):
                num_words = language.count_applicability(
                    date_string, strip_timezone=strip_timezone, settings=settings
                )
                if num_words[0] > 0 or num_words[1] > 0:
                    applicable_languages.append((language.shortname, num_words))
                    break

        if not applicable_languages:
            return None
        return max(applicable_languages, key=lambda p: (p[1][0], p[1][1]))[0]
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from datetime import datetime, timedelta, timezone, tzinfo
|
|
2
|
+
|
|
3
|
+
import regex as re
|
|
4
|
+
|
|
5
|
+
from .timezones import timezone_info_list
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class StaticTzInfo(tzinfo):
    """A ``tzinfo`` with a fixed name and a fixed UTC offset.

    The offset does not depend on the datetime it is asked about and the
    reported daylight-saving adjustment is always zero.
    """

    def __init__(self, name, offset):
        self.__offset = offset
        self.__name = name

    def tzname(self, dt):
        """Return the stored name; ``dt`` is ignored."""
        return self.__name

    def utcoffset(self, dt):
        """Return the stored offset; ``dt`` is ignored."""
        return self.__offset

    def dst(self, dt):
        """Return a zero ``timedelta``; ``dt`` is ignored."""
        return timedelta()

    def __repr__(self):
        class_name = self.__class__.__name__
        return "<%s '%s'>" % (class_name, self.__name)

    def localize(self, dt, is_dst=False):
        """Return ``dt`` with this object attached as its ``tzinfo``.

        ``is_dst`` is accepted but not used.

        :raises ValueError: if ``dt`` already has a ``tzinfo``.
        """
        if dt.tzinfo is None:
            return dt.replace(tzinfo=self)
        raise ValueError("Not naive datetime (tzinfo is already set)")

    def __getinitargs__(self):
        """Return the ``(name, offset)`` pair given to the constructor."""
        return self.__name, self.__offset
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def pop_tz_offset_from_string(date_string, as_offset=True):
    """Remove the first recognised timezone from ``date_string``.

    :param date_string: text that may contain a timezone.
    :param as_offset: when true the timezone is returned as a
        ``StaticTzInfo``; otherwise its name is returned.
    :return: ``(remaining_string, timezone)``. ``timezone`` is ``None`` and
        the string is unchanged when nothing matched.
    """
    # One combined search first, so the per-timezone patterns are only tried
    # when some timezone name occurs in the text.
    if not _search_regex_ignorecase.search(date_string):
        return date_string, None

    for tz_name, tz_data in _tz_offsets:
        found = tz_data["regex"].search(date_string)
        if not found:
            continue

        begin, end = found.span()
        # The first matched character is kept; the rest of the match is cut.
        remainder = date_string[: begin + 1] + date_string[end:]
        if as_offset:
            tz_value = StaticTzInfo(tz_name, tz_data["offset"])
        else:
            tz_value = tz_name
        return remainder, tz_value

    return date_string, None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def word_is_tz(word):
    """Return ``True`` when ``_search_regex`` matches at the start of ``word``."""
    tz_match = _search_regex.match(word)
    return True if tz_match else False
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def convert_to_local_tz(datetime_obj, datetime_tz_offset):
    """Shift ``datetime_obj`` from its own offset to the local offset.

    :param datetime_obj: value to shift.
    :param datetime_tz_offset: offset that is subtracted from ``datetime_obj``.
    :return: ``datetime_obj`` minus ``datetime_tz_offset`` plus the
        module-level ``local_tz_offset``.
    """
    without_source_offset = datetime_obj - datetime_tz_offset
    return without_source_offset + local_tz_offset
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def build_tz_offsets(search_regex_parts):
    """Yield ``(name, {"regex": ..., "offset": ...})`` for the known timezones.

    Iterates ``timezone_info_list`` and, for every regex pattern / timezone
    combination, yields the timezone name together with a compiled
    case-insensitive pattern and its offset as a ``timedelta``.

    Side effect: every timezone name (and every alternate spelling) is
    appended to ``search_regex_parts`` while the generator is consumed.

    :param search_regex_parts: list that receives the pattern fragments.
    """

    def get_offset(tz_obj, regex, repl="", replw=""):
        # tz_obj is indexed as (name, offset in seconds). ``regex`` is a
        # %-template that receives the name; ``repl``/``replw`` rewrite the
        # resulting pattern before it is compiled.
        return (
            tz_obj[0],
            {
                "regex": re.compile(
                    re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE
                ),
                "offset": timedelta(seconds=tz_obj[1]),
            },
        )

    # NOTE(review): the indentation of this block was reconstructed from a
    # flattened listing. Confirm against the released file whether the
    # "alternate patterns" loop belongs at this level or inside
    # ``for tz_obj``; as written it uses the last ``tz_obj`` of the loop above.
    for tz_info in timezone_info_list:
        for regex in tz_info["regex_patterns"]:
            for tz_obj in tz_info["timezones"]:
                search_regex_parts.append(tz_obj[0])
                yield get_offset(tz_obj, regex)

            # alternate patterns
            for replace, replacewith in tz_info.get("replace", []):
                search_regex_parts.append(re.sub(replace, replacewith, tz_obj[0]))
                yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_local_tz_offset():
    """Return the current local UTC offset as a ``timedelta``.

    The offset is the difference between the naive local time and the naive
    UTC time. Its seconds component is rounded to the nearest multiple of
    ten and microseconds are dropped, which hides the tiny gap between the
    two clock reads.
    """
    local_now = datetime.now()
    utc_now = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    raw_offset = local_now - utc_now
    return timedelta(days=raw_offset.days, seconds=round(raw_offset.seconds, -1))
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# Pattern fragments collected while build_tz_offsets() is consumed below.
_search_regex_parts = []
# Consuming the generator with list() also fills _search_regex_parts, so this
# line has to run before the combined patterns are compiled.
_tz_offsets = list(build_tz_offsets(_search_regex_parts))
# Alternation of every collected fragment, compiled without IGNORECASE.
_search_regex = re.compile("|".join(_search_regex_parts))
# Same alternation compiled with IGNORECASE.
_search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)
# Local UTC offset, computed once when the module is imported.
local_tz_offset = get_local_tz_offset()
|