dateparser 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dateparser/__init__.py +82 -0
- dateparser/calendars/__init__.py +144 -0
- dateparser/calendars/hijri.py +6 -0
- dateparser/calendars/hijri_parser.py +60 -0
- dateparser/calendars/jalali.py +9 -0
- dateparser/calendars/jalali_parser.py +184 -0
- dateparser/conf.py +267 -0
- dateparser/custom_language_detection/__init__.py +0 -0
- dateparser/custom_language_detection/fasttext.py +43 -0
- dateparser/custom_language_detection/langdetect.py +37 -0
- dateparser/custom_language_detection/language_mapping.py +18 -0
- dateparser/data/__init__.py +2 -0
- dateparser/data/date_translation_data/__init__.py +0 -0
- dateparser/data/date_translation_data/af.py +242 -0
- dateparser/data/date_translation_data/agq.py +169 -0
- dateparser/data/date_translation_data/ak.py +169 -0
- dateparser/data/date_translation_data/am.py +222 -0
- dateparser/data/date_translation_data/ar.py +574 -0
- dateparser/data/date_translation_data/as.py +164 -0
- dateparser/data/date_translation_data/asa.py +168 -0
- dateparser/data/date_translation_data/ast.py +280 -0
- dateparser/data/date_translation_data/az-Cyrl.py +168 -0
- dateparser/data/date_translation_data/az-Latn.py +217 -0
- dateparser/data/date_translation_data/az.py +217 -0
- dateparser/data/date_translation_data/bas.py +169 -0
- dateparser/data/date_translation_data/be.py +340 -0
- dateparser/data/date_translation_data/bem.py +161 -0
- dateparser/data/date_translation_data/bez.py +169 -0
- dateparser/data/date_translation_data/bg.py +345 -0
- dateparser/data/date_translation_data/bm.py +167 -0
- dateparser/data/date_translation_data/bn.py +241 -0
- dateparser/data/date_translation_data/bo.py +185 -0
- dateparser/data/date_translation_data/br.py +226 -0
- dateparser/data/date_translation_data/brx.py +157 -0
- dateparser/data/date_translation_data/bs-Cyrl.py +226 -0
- dateparser/data/date_translation_data/bs-Latn.py +248 -0
- dateparser/data/date_translation_data/bs.py +248 -0
- dateparser/data/date_translation_data/ca.py +313 -0
- dateparser/data/date_translation_data/ce.py +225 -0
- dateparser/data/date_translation_data/cgg.py +169 -0
- dateparser/data/date_translation_data/chr.py +240 -0
- dateparser/data/date_translation_data/ckb.py +154 -0
- dateparser/data/date_translation_data/cs.py +316 -0
- dateparser/data/date_translation_data/cy.py +217 -0
- dateparser/data/date_translation_data/da.py +296 -0
- dateparser/data/date_translation_data/dav.py +169 -0
- dateparser/data/date_translation_data/de.py +357 -0
- dateparser/data/date_translation_data/dje.py +167 -0
- dateparser/data/date_translation_data/dsb.py +270 -0
- dateparser/data/date_translation_data/dua.py +169 -0
- dateparser/data/date_translation_data/dyo.py +168 -0
- dateparser/data/date_translation_data/dz.py +225 -0
- dateparser/data/date_translation_data/ebu.py +169 -0
- dateparser/data/date_translation_data/ee.py +233 -0
- dateparser/data/date_translation_data/el.py +279 -0
- dateparser/data/date_translation_data/en.py +851 -0
- dateparser/data/date_translation_data/eo.py +169 -0
- dateparser/data/date_translation_data/es.py +499 -0
- dateparser/data/date_translation_data/et.py +233 -0
- dateparser/data/date_translation_data/eu.py +219 -0
- dateparser/data/date_translation_data/ewo.py +169 -0
- dateparser/data/date_translation_data/fa.py +270 -0
- dateparser/data/date_translation_data/ff.py +179 -0
- dateparser/data/date_translation_data/fi.py +345 -0
- dateparser/data/date_translation_data/fil.py +223 -0
- dateparser/data/date_translation_data/fo.py +256 -0
- dateparser/data/date_translation_data/fr.py +520 -0
- dateparser/data/date_translation_data/fur.py +223 -0
- dateparser/data/date_translation_data/fy.py +223 -0
- dateparser/data/date_translation_data/ga.py +238 -0
- dateparser/data/date_translation_data/gd.py +277 -0
- dateparser/data/date_translation_data/gl.py +253 -0
- dateparser/data/date_translation_data/gsw.py +179 -0
- dateparser/data/date_translation_data/gu.py +216 -0
- dateparser/data/date_translation_data/guz.py +170 -0
- dateparser/data/date_translation_data/gv.py +166 -0
- dateparser/data/date_translation_data/ha.py +176 -0
- dateparser/data/date_translation_data/haw.py +168 -0
- dateparser/data/date_translation_data/he.py +371 -0
- dateparser/data/date_translation_data/hi.py +261 -0
- dateparser/data/date_translation_data/hr.py +378 -0
- dateparser/data/date_translation_data/hsb.py +271 -0
- dateparser/data/date_translation_data/hu.py +297 -0
- dateparser/data/date_translation_data/hy.py +246 -0
- dateparser/data/date_translation_data/id.py +272 -0
- dateparser/data/date_translation_data/ig.py +168 -0
- dateparser/data/date_translation_data/ii.py +157 -0
- dateparser/data/date_translation_data/is.py +242 -0
- dateparser/data/date_translation_data/it.py +282 -0
- dateparser/data/date_translation_data/ja.py +286 -0
- dateparser/data/date_translation_data/jgo.py +188 -0
- dateparser/data/date_translation_data/jmc.py +168 -0
- dateparser/data/date_translation_data/ka.py +241 -0
- dateparser/data/date_translation_data/kab.py +169 -0
- dateparser/data/date_translation_data/kam.py +169 -0
- dateparser/data/date_translation_data/kde.py +169 -0
- dateparser/data/date_translation_data/kea.py +230 -0
- dateparser/data/date_translation_data/khq.py +167 -0
- dateparser/data/date_translation_data/ki.py +169 -0
- dateparser/data/date_translation_data/kk.py +228 -0
- dateparser/data/date_translation_data/kl.py +213 -0
- dateparser/data/date_translation_data/kln.py +171 -0
- dateparser/data/date_translation_data/km.py +198 -0
- dateparser/data/date_translation_data/kn.py +225 -0
- dateparser/data/date_translation_data/ko.py +207 -0
- dateparser/data/date_translation_data/kok.py +157 -0
- dateparser/data/date_translation_data/ks.py +152 -0
- dateparser/data/date_translation_data/ksb.py +168 -0
- dateparser/data/date_translation_data/ksf.py +169 -0
- dateparser/data/date_translation_data/ksh.py +192 -0
- dateparser/data/date_translation_data/kw.py +169 -0
- dateparser/data/date_translation_data/ky.py +240 -0
- dateparser/data/date_translation_data/lag.py +169 -0
- dateparser/data/date_translation_data/lb.py +233 -0
- dateparser/data/date_translation_data/lg.py +169 -0
- dateparser/data/date_translation_data/lkt.py +194 -0
- dateparser/data/date_translation_data/ln.py +179 -0
- dateparser/data/date_translation_data/lo.py +228 -0
- dateparser/data/date_translation_data/lrc.py +154 -0
- dateparser/data/date_translation_data/lt.py +263 -0
- dateparser/data/date_translation_data/lu.py +169 -0
- dateparser/data/date_translation_data/luo.py +169 -0
- dateparser/data/date_translation_data/luy.py +168 -0
- dateparser/data/date_translation_data/lv.py +257 -0
- dateparser/data/date_translation_data/mas.py +173 -0
- dateparser/data/date_translation_data/mer.py +168 -0
- dateparser/data/date_translation_data/mfe.py +166 -0
- dateparser/data/date_translation_data/mg.py +168 -0
- dateparser/data/date_translation_data/mgh.py +169 -0
- dateparser/data/date_translation_data/mgo.py +151 -0
- dateparser/data/date_translation_data/mk.py +234 -0
- dateparser/data/date_translation_data/ml.py +217 -0
- dateparser/data/date_translation_data/mn.py +224 -0
- dateparser/data/date_translation_data/mr.py +229 -0
- dateparser/data/date_translation_data/ms.py +242 -0
- dateparser/data/date_translation_data/mt.py +175 -0
- dateparser/data/date_translation_data/mua.py +169 -0
- dateparser/data/date_translation_data/my.py +203 -0
- dateparser/data/date_translation_data/mzn.py +199 -0
- dateparser/data/date_translation_data/naq.py +169 -0
- dateparser/data/date_translation_data/nb.py +261 -0
- dateparser/data/date_translation_data/nd.py +169 -0
- dateparser/data/date_translation_data/ne.py +207 -0
- dateparser/data/date_translation_data/nl.py +273 -0
- dateparser/data/date_translation_data/nmg.py +169 -0
- dateparser/data/date_translation_data/nn.py +231 -0
- dateparser/data/date_translation_data/nnh.py +150 -0
- dateparser/data/date_translation_data/nus.py +166 -0
- dateparser/data/date_translation_data/nyn.py +169 -0
- dateparser/data/date_translation_data/om.py +173 -0
- dateparser/data/date_translation_data/or.py +157 -0
- dateparser/data/date_translation_data/os.py +203 -0
- dateparser/data/date_translation_data/pa-Arab.py +150 -0
- dateparser/data/date_translation_data/pa-Guru.py +221 -0
- dateparser/data/date_translation_data/pa.py +221 -0
- dateparser/data/date_translation_data/pl.py +416 -0
- dateparser/data/date_translation_data/ps.py +150 -0
- dateparser/data/date_translation_data/pt.py +981 -0
- dateparser/data/date_translation_data/qu.py +176 -0
- dateparser/data/date_translation_data/rm.py +166 -0
- dateparser/data/date_translation_data/rn.py +169 -0
- dateparser/data/date_translation_data/ro.py +270 -0
- dateparser/data/date_translation_data/rof.py +157 -0
- dateparser/data/date_translation_data/ru.py +442 -0
- dateparser/data/date_translation_data/rw.py +169 -0
- dateparser/data/date_translation_data/rwk.py +168 -0
- dateparser/data/date_translation_data/sah.py +219 -0
- dateparser/data/date_translation_data/saq.py +169 -0
- dateparser/data/date_translation_data/sbp.py +169 -0
- dateparser/data/date_translation_data/se.py +280 -0
- dateparser/data/date_translation_data/seh.py +169 -0
- dateparser/data/date_translation_data/ses.py +167 -0
- dateparser/data/date_translation_data/sg.py +169 -0
- dateparser/data/date_translation_data/shi-Latn.py +169 -0
- dateparser/data/date_translation_data/shi-Tfng.py +169 -0
- dateparser/data/date_translation_data/shi.py +169 -0
- dateparser/data/date_translation_data/si.py +220 -0
- dateparser/data/date_translation_data/sk.py +327 -0
- dateparser/data/date_translation_data/sl.py +244 -0
- dateparser/data/date_translation_data/smn.py +176 -0
- dateparser/data/date_translation_data/sn.py +169 -0
- dateparser/data/date_translation_data/so.py +179 -0
- dateparser/data/date_translation_data/sq.py +237 -0
- dateparser/data/date_translation_data/sr-Cyrl.py +306 -0
- dateparser/data/date_translation_data/sr-Latn.py +306 -0
- dateparser/data/date_translation_data/sr.py +255 -0
- dateparser/data/date_translation_data/sv.py +309 -0
- dateparser/data/date_translation_data/sw.py +231 -0
- dateparser/data/date_translation_data/ta.py +264 -0
- dateparser/data/date_translation_data/te.py +239 -0
- dateparser/data/date_translation_data/teo.py +173 -0
- dateparser/data/date_translation_data/th.py +300 -0
- dateparser/data/date_translation_data/ti.py +173 -0
- dateparser/data/date_translation_data/tl.py +137 -0
- dateparser/data/date_translation_data/to.py +216 -0
- dateparser/data/date_translation_data/tr.py +259 -0
- dateparser/data/date_translation_data/twq.py +167 -0
- dateparser/data/date_translation_data/tzm.py +169 -0
- dateparser/data/date_translation_data/ug.py +203 -0
- dateparser/data/date_translation_data/uk.py +502 -0
- dateparser/data/date_translation_data/ur.py +256 -0
- dateparser/data/date_translation_data/uz-Arab.py +167 -0
- dateparser/data/date_translation_data/uz-Cyrl.py +210 -0
- dateparser/data/date_translation_data/uz-Latn.py +216 -0
- dateparser/data/date_translation_data/uz.py +216 -0
- dateparser/data/date_translation_data/vi.py +260 -0
- dateparser/data/date_translation_data/vun.py +168 -0
- dateparser/data/date_translation_data/wae.py +224 -0
- dateparser/data/date_translation_data/xog.py +169 -0
- dateparser/data/date_translation_data/yav.py +169 -0
- dateparser/data/date_translation_data/yi.py +178 -0
- dateparser/data/date_translation_data/yo.py +263 -0
- dateparser/data/date_translation_data/yue.py +203 -0
- dateparser/data/date_translation_data/zgh.py +169 -0
- dateparser/data/date_translation_data/zh-Hans.py +240 -0
- dateparser/data/date_translation_data/zh-Hant.py +402 -0
- dateparser/data/date_translation_data/zh.py +273 -0
- dateparser/data/date_translation_data/zu.py +196 -0
- dateparser/data/languages_info.py +826 -0
- dateparser/date.py +599 -0
- dateparser/date_parser.py +55 -0
- dateparser/freshness_date_parser.py +156 -0
- dateparser/languages/__init__.py +2 -0
- dateparser/languages/dictionary.py +352 -0
- dateparser/languages/loader.py +224 -0
- dateparser/languages/locale.py +625 -0
- dateparser/languages/validation.py +467 -0
- dateparser/parser.py +742 -0
- dateparser/search/__init__.py +71 -0
- dateparser/search/detection.py +78 -0
- dateparser/search/search.py +297 -0
- dateparser/search/text_detection.py +89 -0
- dateparser/timezone_parser.py +91 -0
- dateparser/timezones.py +469 -0
- dateparser/utils/__init__.py +257 -0
- dateparser/utils/strptime.py +108 -0
- dateparser-1.2.1.dist-info/AUTHORS.rst +17 -0
- dateparser-1.2.1.dist-info/LICENSE +12 -0
- dateparser-1.2.1.dist-info/METADATA +864 -0
- dateparser-1.2.1.dist-info/RECORD +256 -0
- dateparser-1.2.1.dist-info/WHEEL +5 -0
- dateparser-1.2.1.dist-info/entry_points.txt +2 -0
- dateparser-1.2.1.dist-info/top_level.txt +4 -0
- dateparser_cli/__init__.py +0 -0
- dateparser_cli/cli.py +36 -0
- dateparser_cli/exceptions.py +2 -0
- dateparser_cli/fasttext_manager.py +42 -0
- dateparser_cli/utils.py +27 -0
- dateparser_data/__init__.py +0 -0
- dateparser_data/settings.py +33 -0
- dateparser_scripts/__init__.py +0 -0
- dateparser_scripts/get_cldr_data.py +567 -0
- dateparser_scripts/order_languages.py +217 -0
- dateparser_scripts/update_supported_languages_and_locales.py +48 -0
- dateparser_scripts/utils.py +73 -0
- dateparser_scripts/write_complete_data.py +129 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
from datetime import datetime, time, timezone
|
|
2
|
+
|
|
3
|
+
import regex as re
|
|
4
|
+
from dateutil.relativedelta import relativedelta
|
|
5
|
+
from tzlocal import get_localzone
|
|
6
|
+
|
|
7
|
+
from dateparser.utils import apply_timezone, localize_timezone, strip_braces
|
|
8
|
+
|
|
9
|
+
from .parser import time_parser
|
|
10
|
+
from .timezone_parser import pop_tz_offset_from_string
|
|
11
|
+
|
|
12
|
+
# Alternation of the (singular, English) unit names this parser understands.
_UNITS = r"decade|year|month|week|day|hour|minute|second"

# A quantity such as "3", "1.5" or "1,5", optional whitespace, then a unit name.
# Compiled case-insensitively, so the captured unit may be in any letter case.
PATTERN = re.compile(r"(\d+[.,]?\d*)\s*(%s)\b" % _UNITS, re.I | re.S | re.U)

# What a word may *start* with for a string to count as a freshness
# expression: a unit name, "ago"/"in", digits, ":" or "am"/"pm".
# Built once here instead of being re-joined on every call.
_ALLOWED_WORD_PATTERN = re.compile("|".join([_UNITS, r"ago|in|\d+", r":|[ap]m"]))


class FreshnessDateDataParser:
    """Parses date string like "1 year, 2 months ago" and "3 hours, 50 minutes ago" """

    def _are_all_words_units(self, date_string):
        """
        Check that ``date_string`` consists only of freshness vocabulary.

        The string is split on non-word characters and every resulting word
        must match ``_ALLOWED_WORD_PATTERN`` at its start (a prefix match,
        case-sensitive).

        :param date_string: the date string to inspect.
        :type date_string: str

        :return: True if no word falls outside the vocabulary (also True when
            there are no words at all), False otherwise.
        """
        date_string = re.sub(r"\s+", " ", date_string.strip())
        words = (word for word in re.split(r"\W", date_string) if word)
        return all(_ALLOWED_WORD_PATTERN.match(word) for word in words)

    def _parse_time(self, date_string, settings):
        """
        Attempts to parse time part of date strings like '1 day ago, 2 PM'

        All "<number> <unit>" groups and the words "ago"/"in" are removed and
        the remainder is handed to ``time_parser``.

        :param date_string: the date string to inspect.
        :type date_string: str

        :param settings: accepted for signature compatibility; not used here.

        :return: whatever ``time_parser`` returns, or None if it raises.
        """
        date_string = PATTERN.sub("", date_string)
        date_string = re.sub(r"\b(?:ago|in)\b", "", date_string)
        try:
            return time_parser(date_string)
        except Exception:
            # Deliberate best effort: a remainder that cannot be parsed as a
            # time simply means there is no time component to apply.
            return None

    def get_local_tz(self):
        """Return the local timezone as reported by ``tzlocal.get_localzone``."""
        return get_localzone()

    @staticmethod
    def _apply_time(dateobj, timeobj):
        """
        Return ``dateobj`` with its time-of-day fields taken from ``timeobj``.

        ``dateobj`` is returned unchanged when ``timeobj`` is not a
        ``datetime.time`` instance (e.g. None when no time was parsed).
        """
        if not isinstance(timeobj, time):
            return dateobj

        return dateobj.replace(
            hour=timeobj.hour,
            minute=timeobj.minute,
            second=timeobj.second,
            microsecond=timeobj.microsecond,
        )

    def _get_now(self, settings, ptz):
        """
        Work out the reference datetime that the relative offset is applied to.

        :param settings: object providing ``TIMEZONE`` and ``RELATIVE_BASE``.

        :param ptz: timezone popped from the date string, or a falsy value
            when the string carried none.

        :return: the reference datetime. It comes from
            ``settings.RELATIVE_BASE`` when that is set, otherwise from the
            current clock.
        """
        uses_local_tz = "local" in settings.TIMEZONE.lower()

        if settings.RELATIVE_BASE:
            now = settings.RELATIVE_BASE

            if not uses_local_tz:
                now = localize_timezone(now, settings.TIMEZONE)

            if ptz:
                if now.tzinfo:
                    now = now.astimezone(ptz)
                elif hasattr(ptz, "localize"):
                    # Timezone objects exposing ``localize`` are attached
                    # through that method instead of ``replace``.
                    now = ptz.localize(now)
                else:
                    now = now.replace(tzinfo=ptz)

            if not now.tzinfo:
                now = now.replace(tzinfo=self.get_local_tz())
            return now

        if ptz:
            localized_now = datetime.now(ptz)
            if uses_local_tz:
                return localized_now
            return apply_timezone(localized_now, settings.TIMEZONE)

        if uses_local_tz:
            return datetime.now(self.get_local_tz())
        return apply_timezone(datetime.now(tz=timezone.utc), settings.TIMEZONE)

    def parse(self, date_string, settings):
        """
        Parse a relative date string into a datetime and a period name.

        :param date_string: the date string to parse, e.g. "2 hour ago".
        :type date_string: str

        :param settings: object providing ``TIMEZONE``, ``RELATIVE_BASE``,
            ``PREFER_DATES_FROM``, ``RETURN_TIME_AS_PERIOD``, ``TO_TIMEZONE``
            and ``RETURN_AS_TIMEZONE_AWARE``.

        :return: a ``(date, period)`` tuple; ``(None, None)`` when the string
            is not a freshness expression.
        """
        date_string = strip_braces(date_string)
        date_string, ptz = pop_tz_offset_from_string(date_string)
        _time = self._parse_time(date_string, settings)
        now = self._get_now(settings, ptz)

        date, period = self._parse_date(date_string, now, settings.PREFER_DATES_FROM)
        if not date:
            return date, period

        date_before_time = date
        date = self._apply_time(date, _time)
        if settings.RETURN_TIME_AS_PERIOD and date_before_time != date:
            period = "time"

        if settings.TO_TIMEZONE:
            date = apply_timezone(date, settings.TO_TIMEZONE)

        # Drop tzinfo when awareness is switched off, or when it is left at
        # "default" and the input string itself carried no timezone.
        tz_aware = settings.RETURN_AS_TIMEZONE_AWARE
        if not tz_aware or (tz_aware == "default" and not ptz):
            date = date.replace(tzinfo=None)

        return date, period

    def _parse_date(self, date_string, now, prefer_dates_from):
        """
        Apply the offset described by ``date_string`` to ``now``.

        :param date_string: the date string to parse.
        :type date_string: str

        :param now: reference datetime the offset is added to or subtracted
            from.

        :param prefer_dates_from: string searched for the word "future".
        :type prefer_dates_from: str

        :return: a ``(date, period)`` tuple, or ``(None, None)`` when the
            string holds foreign words or no "<number> <unit>" group.
        """
        if not self._are_all_words_units(date_string):
            return None, None

        kwargs = self.get_kwargs(date_string)
        if not kwargs:
            return None, None

        # "day" unless days are absent and a coarser unit is present; the
        # first of weeks/months/years found then names the period.
        period = "day"
        if "days" not in kwargs:
            period = next(
                (key[:-1] for key in ("weeks", "months", "years") if key in kwargs),
                "day",
            )
        td = relativedelta(**kwargs)

        # An explicit "in" always means the future; otherwise the future is
        # chosen only when preferred and the string does not say "ago".
        is_future = re.search(r"\bin\b", date_string) or (
            re.search(r"\bfuture\b", prefer_dates_from)
            and not re.search(r"\bago\b", date_string)
        )
        date = now + td if is_future else now - td
        return date, period

    def get_kwargs(self, date_string):
        """
        Extract ``relativedelta`` keyword arguments from ``date_string``.

        Every "<number> <unit>" group becomes ``{"<unit>s": float(number)}``,
        with "," accepted as decimal separator. Decades are folded into
        ``years`` (ten years each). When a unit occurs more than once the last
        occurrence wins.

        :param date_string: the date string to inspect.
        :type date_string: str

        :return: dict of keyword arguments; empty when nothing matched.
        """
        kwargs = {}
        for num, unit in PATTERN.findall(date_string):
            # PATTERN is case-insensitive; normalise the unit so that the key
            # is always a valid lower-case keyword name such as "hours".
            kwargs[unit.lower() + "s"] = float(num.replace(",", "."))

        if "decades" in kwargs:
            kwargs["years"] = 10 * kwargs.pop("decades") + kwargs.get("years", 0)
        return kwargs

    def get_date_data(self, date_string, settings=None):
        """
        Parse ``date_string`` and wrap the result in a ``DateData`` object.

        :param date_string: the date string to parse.
        :type date_string: str

        :param settings: settings object forwarded to :meth:`parse`, which
            reads attributes such as ``settings.TIMEZONE`` from it, so the
            ``None`` default cannot actually be parsed with.

        :return: ``DateData(date_obj=date, period=period)``.
        """
        # Imported here rather than at module level, presumably to avoid a
        # circular import with dateparser.date.
        from dateparser.date import DateData

        date, period = self.parse(date_string, settings)
        return DateData(date_obj=date, period=period)


freshness_date_parser = FreshnessDateDataParser()
|
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
from itertools import chain, zip_longest
|
|
2
|
+
from operator import methodcaller
|
|
3
|
+
|
|
4
|
+
import regex as re
|
|
5
|
+
|
|
6
|
+
from dateparser.utils import normalize_unicode
|
|
7
|
+
|
|
8
|
+
# Separator characters; together with "+" they form ALWAYS_KEEP_TOKENS.
PARSER_HARDCODED_TOKENS = [":", ".", " ", "-", "/"]

# Canonical spellings of tokens; Dictionary maps their lower-cased forms
# back to these spellings.
PARSER_KNOWN_TOKENS = ["am", "pm", "UTC", "GMT", "Z"]

# Tokens that are captured when splitting regardless of other rules.
ALWAYS_KEEP_TOKENS = ["+", *PARSER_HARDCODED_TOKENS]

# English words that Dictionary looks up as keys in a locale's translation
# data; the order of this list determines the order translations are added.
KNOWN_WORD_TOKENS = [
    # weekdays
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    # months
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
    # units of time
    "decade", "year", "month", "week", "day", "hour", "minute", "second",
    # relative-date and time-of-day markers
    "ago", "in", "am", "pm",
]  # fmt: skip

# Literal parentheses, stripped from relative-type regex strings.
PARENTHESES_PATTERN = re.compile(r"[\(\)]")

# A run of digits; the capture group keeps the digits when used with split().
NUMERAL_PATTERN = re.compile(r"(\d+)")

# Matches a token containing at least one letter or digit (a word character
# other than "_").
KEEP_TOKEN_PATTERN = re.compile(r"^.*[^\W_].*$", flags=re.U)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class UnknownTokenError(Exception):
    """Exception type for unrecognised tokens (presumably; it is not raised in this module)."""
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Dictionary:
    """
    Class that modifies and stores translations and handles splitting of date string.

    :param locale_info:
        Locale info (translation data) of the locale. Must contain a
        ``"name"`` entry, which is used as cache key.
    :type locale_info: dict

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
        The object is read through its ``SKIP_TOKENS``, ``registry_key`` and
        ``CACHE_SIZE_LIMIT`` attributes, so lookups and splitting do not work
        with the ``None`` default.

    :return: a Dictionary instance.
    """

    # Class-level caches shared by all instances. Each one is a two-level
    # mapping: settings.registry_key -> locale name -> cached value.
    _split_regex_cache = {}
    _sorted_words_cache = {}
    _split_relative_regex_cache = {}
    _sorted_relative_strings_cache = {}
    _match_relative_regex_cache = {}

    def __init__(self, locale_info, settings=None):
        self._settings = settings
        self.info = locale_info

        dictionary = {}

        # "skip" and "pertain" words are known words that translate to None.
        for section in ("skip", "pertain"):
            if section in locale_info:
                dictionary.update(
                    dict.fromkeys(word.lower() for word in locale_info[section])
                )

        # Every translation of a known English word maps back to that word.
        for word in KNOWN_WORD_TOKENS:
            if word in locale_info:
                dictionary.update(
                    dict.fromkeys((t.lower() for t in locale_info[word]), word)
                )

        dictionary.update({token: token for token in ALWAYS_KEEP_TOKENS})
        dictionary.update({token.lower(): token for token in PARSER_KNOWN_TOKENS})

        for key, value in locale_info.get("relative-type", {}).items():
            dictionary.update(dict.fromkeys((t.lower() for t in value), key))

        self._dictionary = dictionary

        self._no_word_spacing = self._parse_bool_flag(
            locale_info.get("no_word_spacing", "False")
        )

        relative_type_regex = locale_info.get("relative-type-regex", {})
        self._relative_strings = list(chain.from_iterable(relative_type_regex.values()))

    @staticmethod
    def _parse_bool_flag(value):
        """
        Convert a flag from the locale data to a bool.

        String values (the default is the text ``"False"``) are read as
        Python literals with ``ast.literal_eval``, so the data is never
        executed as code. Non-string values are passed to ``bool`` directly.

        :raises ValueError, SyntaxError: if a string is not a valid literal.
        """
        import ast  # local import: the only place in this class that needs it

        if isinstance(value, str):
            value = ast.literal_eval(value.strip())
        return bool(value)

    def __contains__(self, key):
        # Tokens listed in settings.SKIP_TOKENS count as known words too.
        return key in self._settings.SKIP_TOKENS or key in self._dictionary

    def __getitem__(self, key):
        # Skip tokens translate to None, like the locale's own skip words.
        if key in self._settings.SKIP_TOKENS:
            return None
        return self._dictionary[key]

    def __iter__(self):
        return chain(self._settings.SKIP_TOKENS, iter(self._dictionary))

    def are_tokens_valid(self, tokens):
        """
        Check if tokens are valid tokens for the locale.

        A token is valid when it is all digits, fully matches one of the
        locale's relative-type regexes, or is a known word of this
        dictionary. A list made up solely of ``ALWAYS_KEEP_TOKENS`` (or an
        empty list) is rejected.

        :param tokens:
            a list of string tokens.
        :type tokens: list

        :return: True if tokens are valid, False otherwise.
        """
        if not set(tokens) - set(ALWAYS_KEEP_TOKENS):
            return False

        match_relative_regex = self._get_match_relative_regex_cache()
        return all(
            token.isdigit() or match_relative_regex.match(token) or token in self
            for token in tokens
        )

    def split(self, string, keep_formatting=False):
        """
        Split the date string using translations in locale info.

        :param string:
            Date string to be split.
        :type string:
            str

        :param keep_formatting:
            If True, retain formatting of the date string.
        :type keep_formatting: bool

        :return: A list of string tokens formed after splitting the date string.
            A falsy ``string`` is returned unchanged.
        """
        if not string:
            return string

        split_relative_regex = self._get_split_relative_regex_cache()
        match_relative_regex = self._get_match_relative_regex_cache()

        # First cut out whole relative expressions, then break everything in
        # between into known words.
        token_groups = []
        for piece in split_relative_regex.split(string):
            if match_relative_regex.match(piece):
                token_groups.append([piece])
            else:
                token_groups.append(self._split_by_known_words(piece, keep_formatting))

        return list(filter(bool, chain.from_iterable(token_groups)))

    def _add_to_cache(self, value, cache):
        """
        Store ``value`` in ``cache`` under this settings key and locale name.

        If ``settings.CACHE_SIZE_LIMIT`` is set and the number of settings
        keys in ``cache`` exceeds it, the oldest-inserted settings key is
        evicted.
        """
        cache.setdefault(self._settings.registry_key, {})[self.info["name"]] = value
        limit = self._settings.CACHE_SIZE_LIMIT
        if limit and len(cache) > limit:
            cache.pop(next(iter(cache)))

    def _is_cached(self, cache):
        """Tell whether ``cache`` has an entry for this settings key and locale name."""
        return self.info["name"] in cache.get(self._settings.registry_key, {})

    def _read_cache(self, cache):
        """Return the entry of ``cache`` for this settings key and locale name."""
        return cache[self._settings.registry_key][self.info["name"]]

    def _split_by_known_words(self, string: str, keep_formatting: bool):
        """
        Break ``string`` into known words and the text between them.

        The split regex is applied repeatedly: each match yields the text
        before the first known word, the word itself and the remainder, which
        becomes the input of the next round.

        :return: list of tokens in their original order.
        """
        regex = self._get_split_regex_cache()
        splitted = []
        unknown = string

        while unknown:
            match = regex.match(string)

            if not match:
                # No known word left: what remains is split by numerals only.
                curr_split = (
                    self._split_by_numerals(string, keep_formatting)
                    if self._should_capture(string, keep_formatting)
                    else []
                )
                unknown = ""
            else:
                unparsed, known, unknown = match.groups()
                curr_split = (
                    [known] if self._should_capture(known, keep_formatting) else []
                )
                if unparsed and self._should_capture(unparsed, keep_formatting):
                    curr_split = (
                        self._split_by_numerals(unparsed, keep_formatting) + curr_split
                    )
            if unknown:
                # Continue with the remainder; if it did not shrink, feed an
                # empty string so that the loop cannot spin forever.
                string = unknown if string != unknown else ""

            splitted.extend(curr_split)
        return splitted

    def _split_by_numerals(self, string, keep_formatting):
        """Split ``string`` around digit runs, keeping only tokens worth capturing."""
        return [
            token
            for token in NUMERAL_PATTERN.split(string)
            if self._should_capture(token, keep_formatting)
        ]

    def _should_capture(self, token, keep_formatting):
        """
        Decide whether ``token`` belongs in the split result.

        :return: a truthy value when formatting is kept, when the token is
            one of ``ALWAYS_KEEP_TOKENS``, or when it matches
            ``KEEP_TOKEN_PATTERN``; a falsy value otherwise.
        """
        return (
            keep_formatting
            or token in ALWAYS_KEEP_TOKENS
            or KEEP_TOKEN_PATTERN.match(token)
        )

    def _get_sorted_words_from_cache(self):
        """Return all known words sorted longest first, computing them on a cache miss."""
        cache = self._sorted_words_cache
        if not self._is_cached(cache):
            # Longest first, so that in a regex alternation a longer word is
            # tried before any shorter word it starts with.
            self._add_to_cache(
                cache=cache,
                value=sorted(self, key=len, reverse=True),
            )
        return self._read_cache(cache)

    def _get_split_regex_cache(self):
        """Return the compiled known-word split regex, building it on a cache miss."""
        if not self._is_cached(self._split_regex_cache):
            self._construct_split_regex()
        return self._read_cache(self._split_regex_cache)

    def _construct_split_regex(self):
        """
        Compile and cache the regex that isolates the first known word.

        The regex has three groups: text before the word, the word, and the
        rest. Unless the locale has no word spacing, the word must be
        delimited by the string edge, a non-word character, "_" or a digit.
        """
        known_words_group = "|".join(
            map(re.escape, self._get_sorted_words_from_cache())
        )
        if self._no_word_spacing:
            regex = r"^(.*?)({})(.*)$".format(known_words_group)
        else:
            regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(
                known_words_group
            )
        self._add_to_cache(
            cache=self._split_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_sorted_relative_strings_from_cache(self):
        """
        Return the relative-type regex strings with their parentheses removed,
        sorted longest first, computing them on a cache miss.
        """
        cache = self._sorted_relative_strings_cache
        if not self._is_cached(cache):
            self._add_to_cache(
                cache=cache,
                value=sorted(
                    [
                        PARENTHESES_PATTERN.sub("", key)
                        for key in self._relative_strings
                    ],
                    key=len,
                    reverse=True,
                ),
            )
        return self._read_cache(cache)

    def _get_split_relative_regex_cache(self):
        """Return the compiled relative-string split regex, building it on a cache miss."""
        if not self._is_cached(self._split_relative_regex_cache):
            self._construct_split_relative_regex()
        return self._read_cache(self._split_relative_regex_cache)

    def _construct_split_relative_regex(self):
        """
        Compile and cache the regex used to cut relative expressions out of a string.

        Unless the locale has no word spacing, an expression must be
        delimited on both sides by the string edge, a non-word character or "_".
        """
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        if self._no_word_spacing:
            regex = "({})".format(known_relative_strings_group)
        else:
            regex = "(?<=(?:\\A|\\W|_))({})(?=(?:\\Z|\\W|_))".format(
                known_relative_strings_group
            )
        self._add_to_cache(
            cache=self._split_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_match_relative_regex_cache(self):
        """Return the compiled whole-token relative regex, building it on a cache miss."""
        if not self._is_cached(self._match_relative_regex_cache):
            self._construct_match_relative_regex()
        return self._read_cache(self._match_relative_regex_cache)

    def _construct_match_relative_regex(self):
        """Compile and cache the regex that matches a token that is exactly one relative expression."""
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        regex = "^({})$".format(known_relative_strings_group)
        self._add_to_cache(
            cache=self._match_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
class NormalizedDictionary(Dictionary):
    """Dictionary whose keys and relative strings are passed through ``normalize_unicode``."""

    def __init__(self, locale_info, settings=None):
        super().__init__(locale_info, settings)
        self._normalize()

    def _normalize(self):
        """
        Replace the dictionary keys and relative strings by their normalized forms.

        A key whose normalized form differs from it and already exists as a
        key of its own is dropped in favour of that existing key, unless the
        key is listed verbatim under "skip" or "pertain" in the locale info,
        in which case its value takes over the normalized key.
        """
        source = self._dictionary
        normalized_dict = {}
        shadowed_keys = []

        for original_key, translation in source.items():
            plain_key = normalize_unicode(original_key)
            if original_key == plain_key or plain_key not in source:
                normalized_dict[plain_key] = translation
            else:
                # Decided in a second pass, once all plain keys are in place.
                shadowed_keys.append(original_key)

        for original_key in shadowed_keys:
            plain_key = normalize_unicode(original_key)
            skip_and_pertain = self.info.get("skip", []) + self.info.get("pertain", [])
            if original_key in skip_and_pertain:
                normalized_dict[plain_key] = source[original_key]

        self._dictionary = normalized_dict
        self._relative_strings = [
            normalize_unicode(relative) for relative in self._relative_strings
        ]
|