dateparser 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (256) hide show
  1. dateparser/__init__.py +82 -0
  2. dateparser/calendars/__init__.py +144 -0
  3. dateparser/calendars/hijri.py +6 -0
  4. dateparser/calendars/hijri_parser.py +60 -0
  5. dateparser/calendars/jalali.py +9 -0
  6. dateparser/calendars/jalali_parser.py +184 -0
  7. dateparser/conf.py +267 -0
  8. dateparser/custom_language_detection/__init__.py +0 -0
  9. dateparser/custom_language_detection/fasttext.py +43 -0
  10. dateparser/custom_language_detection/langdetect.py +37 -0
  11. dateparser/custom_language_detection/language_mapping.py +18 -0
  12. dateparser/data/__init__.py +2 -0
  13. dateparser/data/date_translation_data/__init__.py +0 -0
  14. dateparser/data/date_translation_data/af.py +242 -0
  15. dateparser/data/date_translation_data/agq.py +169 -0
  16. dateparser/data/date_translation_data/ak.py +169 -0
  17. dateparser/data/date_translation_data/am.py +222 -0
  18. dateparser/data/date_translation_data/ar.py +574 -0
  19. dateparser/data/date_translation_data/as.py +164 -0
  20. dateparser/data/date_translation_data/asa.py +168 -0
  21. dateparser/data/date_translation_data/ast.py +280 -0
  22. dateparser/data/date_translation_data/az-Cyrl.py +168 -0
  23. dateparser/data/date_translation_data/az-Latn.py +217 -0
  24. dateparser/data/date_translation_data/az.py +217 -0
  25. dateparser/data/date_translation_data/bas.py +169 -0
  26. dateparser/data/date_translation_data/be.py +340 -0
  27. dateparser/data/date_translation_data/bem.py +161 -0
  28. dateparser/data/date_translation_data/bez.py +169 -0
  29. dateparser/data/date_translation_data/bg.py +345 -0
  30. dateparser/data/date_translation_data/bm.py +167 -0
  31. dateparser/data/date_translation_data/bn.py +241 -0
  32. dateparser/data/date_translation_data/bo.py +185 -0
  33. dateparser/data/date_translation_data/br.py +226 -0
  34. dateparser/data/date_translation_data/brx.py +157 -0
  35. dateparser/data/date_translation_data/bs-Cyrl.py +226 -0
  36. dateparser/data/date_translation_data/bs-Latn.py +248 -0
  37. dateparser/data/date_translation_data/bs.py +248 -0
  38. dateparser/data/date_translation_data/ca.py +313 -0
  39. dateparser/data/date_translation_data/ce.py +225 -0
  40. dateparser/data/date_translation_data/cgg.py +169 -0
  41. dateparser/data/date_translation_data/chr.py +240 -0
  42. dateparser/data/date_translation_data/ckb.py +154 -0
  43. dateparser/data/date_translation_data/cs.py +316 -0
  44. dateparser/data/date_translation_data/cy.py +217 -0
  45. dateparser/data/date_translation_data/da.py +296 -0
  46. dateparser/data/date_translation_data/dav.py +169 -0
  47. dateparser/data/date_translation_data/de.py +357 -0
  48. dateparser/data/date_translation_data/dje.py +167 -0
  49. dateparser/data/date_translation_data/dsb.py +270 -0
  50. dateparser/data/date_translation_data/dua.py +169 -0
  51. dateparser/data/date_translation_data/dyo.py +168 -0
  52. dateparser/data/date_translation_data/dz.py +225 -0
  53. dateparser/data/date_translation_data/ebu.py +169 -0
  54. dateparser/data/date_translation_data/ee.py +233 -0
  55. dateparser/data/date_translation_data/el.py +279 -0
  56. dateparser/data/date_translation_data/en.py +851 -0
  57. dateparser/data/date_translation_data/eo.py +169 -0
  58. dateparser/data/date_translation_data/es.py +499 -0
  59. dateparser/data/date_translation_data/et.py +233 -0
  60. dateparser/data/date_translation_data/eu.py +219 -0
  61. dateparser/data/date_translation_data/ewo.py +169 -0
  62. dateparser/data/date_translation_data/fa.py +270 -0
  63. dateparser/data/date_translation_data/ff.py +179 -0
  64. dateparser/data/date_translation_data/fi.py +345 -0
  65. dateparser/data/date_translation_data/fil.py +223 -0
  66. dateparser/data/date_translation_data/fo.py +256 -0
  67. dateparser/data/date_translation_data/fr.py +520 -0
  68. dateparser/data/date_translation_data/fur.py +223 -0
  69. dateparser/data/date_translation_data/fy.py +223 -0
  70. dateparser/data/date_translation_data/ga.py +238 -0
  71. dateparser/data/date_translation_data/gd.py +277 -0
  72. dateparser/data/date_translation_data/gl.py +253 -0
  73. dateparser/data/date_translation_data/gsw.py +179 -0
  74. dateparser/data/date_translation_data/gu.py +216 -0
  75. dateparser/data/date_translation_data/guz.py +170 -0
  76. dateparser/data/date_translation_data/gv.py +166 -0
  77. dateparser/data/date_translation_data/ha.py +176 -0
  78. dateparser/data/date_translation_data/haw.py +168 -0
  79. dateparser/data/date_translation_data/he.py +371 -0
  80. dateparser/data/date_translation_data/hi.py +261 -0
  81. dateparser/data/date_translation_data/hr.py +378 -0
  82. dateparser/data/date_translation_data/hsb.py +271 -0
  83. dateparser/data/date_translation_data/hu.py +297 -0
  84. dateparser/data/date_translation_data/hy.py +246 -0
  85. dateparser/data/date_translation_data/id.py +272 -0
  86. dateparser/data/date_translation_data/ig.py +168 -0
  87. dateparser/data/date_translation_data/ii.py +157 -0
  88. dateparser/data/date_translation_data/is.py +242 -0
  89. dateparser/data/date_translation_data/it.py +282 -0
  90. dateparser/data/date_translation_data/ja.py +286 -0
  91. dateparser/data/date_translation_data/jgo.py +188 -0
  92. dateparser/data/date_translation_data/jmc.py +168 -0
  93. dateparser/data/date_translation_data/ka.py +241 -0
  94. dateparser/data/date_translation_data/kab.py +169 -0
  95. dateparser/data/date_translation_data/kam.py +169 -0
  96. dateparser/data/date_translation_data/kde.py +169 -0
  97. dateparser/data/date_translation_data/kea.py +230 -0
  98. dateparser/data/date_translation_data/khq.py +167 -0
  99. dateparser/data/date_translation_data/ki.py +169 -0
  100. dateparser/data/date_translation_data/kk.py +228 -0
  101. dateparser/data/date_translation_data/kl.py +213 -0
  102. dateparser/data/date_translation_data/kln.py +171 -0
  103. dateparser/data/date_translation_data/km.py +198 -0
  104. dateparser/data/date_translation_data/kn.py +225 -0
  105. dateparser/data/date_translation_data/ko.py +207 -0
  106. dateparser/data/date_translation_data/kok.py +157 -0
  107. dateparser/data/date_translation_data/ks.py +152 -0
  108. dateparser/data/date_translation_data/ksb.py +168 -0
  109. dateparser/data/date_translation_data/ksf.py +169 -0
  110. dateparser/data/date_translation_data/ksh.py +192 -0
  111. dateparser/data/date_translation_data/kw.py +169 -0
  112. dateparser/data/date_translation_data/ky.py +240 -0
  113. dateparser/data/date_translation_data/lag.py +169 -0
  114. dateparser/data/date_translation_data/lb.py +233 -0
  115. dateparser/data/date_translation_data/lg.py +169 -0
  116. dateparser/data/date_translation_data/lkt.py +194 -0
  117. dateparser/data/date_translation_data/ln.py +179 -0
  118. dateparser/data/date_translation_data/lo.py +228 -0
  119. dateparser/data/date_translation_data/lrc.py +154 -0
  120. dateparser/data/date_translation_data/lt.py +263 -0
  121. dateparser/data/date_translation_data/lu.py +169 -0
  122. dateparser/data/date_translation_data/luo.py +169 -0
  123. dateparser/data/date_translation_data/luy.py +168 -0
  124. dateparser/data/date_translation_data/lv.py +257 -0
  125. dateparser/data/date_translation_data/mas.py +173 -0
  126. dateparser/data/date_translation_data/mer.py +168 -0
  127. dateparser/data/date_translation_data/mfe.py +166 -0
  128. dateparser/data/date_translation_data/mg.py +168 -0
  129. dateparser/data/date_translation_data/mgh.py +169 -0
  130. dateparser/data/date_translation_data/mgo.py +151 -0
  131. dateparser/data/date_translation_data/mk.py +234 -0
  132. dateparser/data/date_translation_data/ml.py +217 -0
  133. dateparser/data/date_translation_data/mn.py +224 -0
  134. dateparser/data/date_translation_data/mr.py +229 -0
  135. dateparser/data/date_translation_data/ms.py +242 -0
  136. dateparser/data/date_translation_data/mt.py +175 -0
  137. dateparser/data/date_translation_data/mua.py +169 -0
  138. dateparser/data/date_translation_data/my.py +203 -0
  139. dateparser/data/date_translation_data/mzn.py +199 -0
  140. dateparser/data/date_translation_data/naq.py +169 -0
  141. dateparser/data/date_translation_data/nb.py +261 -0
  142. dateparser/data/date_translation_data/nd.py +169 -0
  143. dateparser/data/date_translation_data/ne.py +207 -0
  144. dateparser/data/date_translation_data/nl.py +273 -0
  145. dateparser/data/date_translation_data/nmg.py +169 -0
  146. dateparser/data/date_translation_data/nn.py +231 -0
  147. dateparser/data/date_translation_data/nnh.py +150 -0
  148. dateparser/data/date_translation_data/nus.py +166 -0
  149. dateparser/data/date_translation_data/nyn.py +169 -0
  150. dateparser/data/date_translation_data/om.py +173 -0
  151. dateparser/data/date_translation_data/or.py +157 -0
  152. dateparser/data/date_translation_data/os.py +203 -0
  153. dateparser/data/date_translation_data/pa-Arab.py +150 -0
  154. dateparser/data/date_translation_data/pa-Guru.py +221 -0
  155. dateparser/data/date_translation_data/pa.py +221 -0
  156. dateparser/data/date_translation_data/pl.py +416 -0
  157. dateparser/data/date_translation_data/ps.py +150 -0
  158. dateparser/data/date_translation_data/pt.py +981 -0
  159. dateparser/data/date_translation_data/qu.py +176 -0
  160. dateparser/data/date_translation_data/rm.py +166 -0
  161. dateparser/data/date_translation_data/rn.py +169 -0
  162. dateparser/data/date_translation_data/ro.py +270 -0
  163. dateparser/data/date_translation_data/rof.py +157 -0
  164. dateparser/data/date_translation_data/ru.py +442 -0
  165. dateparser/data/date_translation_data/rw.py +169 -0
  166. dateparser/data/date_translation_data/rwk.py +168 -0
  167. dateparser/data/date_translation_data/sah.py +219 -0
  168. dateparser/data/date_translation_data/saq.py +169 -0
  169. dateparser/data/date_translation_data/sbp.py +169 -0
  170. dateparser/data/date_translation_data/se.py +280 -0
  171. dateparser/data/date_translation_data/seh.py +169 -0
  172. dateparser/data/date_translation_data/ses.py +167 -0
  173. dateparser/data/date_translation_data/sg.py +169 -0
  174. dateparser/data/date_translation_data/shi-Latn.py +169 -0
  175. dateparser/data/date_translation_data/shi-Tfng.py +169 -0
  176. dateparser/data/date_translation_data/shi.py +169 -0
  177. dateparser/data/date_translation_data/si.py +220 -0
  178. dateparser/data/date_translation_data/sk.py +327 -0
  179. dateparser/data/date_translation_data/sl.py +244 -0
  180. dateparser/data/date_translation_data/smn.py +176 -0
  181. dateparser/data/date_translation_data/sn.py +169 -0
  182. dateparser/data/date_translation_data/so.py +179 -0
  183. dateparser/data/date_translation_data/sq.py +237 -0
  184. dateparser/data/date_translation_data/sr-Cyrl.py +306 -0
  185. dateparser/data/date_translation_data/sr-Latn.py +306 -0
  186. dateparser/data/date_translation_data/sr.py +255 -0
  187. dateparser/data/date_translation_data/sv.py +309 -0
  188. dateparser/data/date_translation_data/sw.py +231 -0
  189. dateparser/data/date_translation_data/ta.py +264 -0
  190. dateparser/data/date_translation_data/te.py +239 -0
  191. dateparser/data/date_translation_data/teo.py +173 -0
  192. dateparser/data/date_translation_data/th.py +300 -0
  193. dateparser/data/date_translation_data/ti.py +173 -0
  194. dateparser/data/date_translation_data/tl.py +137 -0
  195. dateparser/data/date_translation_data/to.py +216 -0
  196. dateparser/data/date_translation_data/tr.py +259 -0
  197. dateparser/data/date_translation_data/twq.py +167 -0
  198. dateparser/data/date_translation_data/tzm.py +169 -0
  199. dateparser/data/date_translation_data/ug.py +203 -0
  200. dateparser/data/date_translation_data/uk.py +502 -0
  201. dateparser/data/date_translation_data/ur.py +256 -0
  202. dateparser/data/date_translation_data/uz-Arab.py +167 -0
  203. dateparser/data/date_translation_data/uz-Cyrl.py +210 -0
  204. dateparser/data/date_translation_data/uz-Latn.py +216 -0
  205. dateparser/data/date_translation_data/uz.py +216 -0
  206. dateparser/data/date_translation_data/vi.py +260 -0
  207. dateparser/data/date_translation_data/vun.py +168 -0
  208. dateparser/data/date_translation_data/wae.py +224 -0
  209. dateparser/data/date_translation_data/xog.py +169 -0
  210. dateparser/data/date_translation_data/yav.py +169 -0
  211. dateparser/data/date_translation_data/yi.py +178 -0
  212. dateparser/data/date_translation_data/yo.py +263 -0
  213. dateparser/data/date_translation_data/yue.py +203 -0
  214. dateparser/data/date_translation_data/zgh.py +169 -0
  215. dateparser/data/date_translation_data/zh-Hans.py +240 -0
  216. dateparser/data/date_translation_data/zh-Hant.py +402 -0
  217. dateparser/data/date_translation_data/zh.py +273 -0
  218. dateparser/data/date_translation_data/zu.py +196 -0
  219. dateparser/data/languages_info.py +826 -0
  220. dateparser/date.py +599 -0
  221. dateparser/date_parser.py +55 -0
  222. dateparser/freshness_date_parser.py +156 -0
  223. dateparser/languages/__init__.py +2 -0
  224. dateparser/languages/dictionary.py +352 -0
  225. dateparser/languages/loader.py +224 -0
  226. dateparser/languages/locale.py +625 -0
  227. dateparser/languages/validation.py +467 -0
  228. dateparser/parser.py +742 -0
  229. dateparser/search/__init__.py +71 -0
  230. dateparser/search/detection.py +78 -0
  231. dateparser/search/search.py +297 -0
  232. dateparser/search/text_detection.py +89 -0
  233. dateparser/timezone_parser.py +91 -0
  234. dateparser/timezones.py +469 -0
  235. dateparser/utils/__init__.py +257 -0
  236. dateparser/utils/strptime.py +108 -0
  237. dateparser-1.2.1.dist-info/AUTHORS.rst +17 -0
  238. dateparser-1.2.1.dist-info/LICENSE +12 -0
  239. dateparser-1.2.1.dist-info/METADATA +864 -0
  240. dateparser-1.2.1.dist-info/RECORD +256 -0
  241. dateparser-1.2.1.dist-info/WHEEL +5 -0
  242. dateparser-1.2.1.dist-info/entry_points.txt +2 -0
  243. dateparser-1.2.1.dist-info/top_level.txt +4 -0
  244. dateparser_cli/__init__.py +0 -0
  245. dateparser_cli/cli.py +36 -0
  246. dateparser_cli/exceptions.py +2 -0
  247. dateparser_cli/fasttext_manager.py +42 -0
  248. dateparser_cli/utils.py +27 -0
  249. dateparser_data/__init__.py +0 -0
  250. dateparser_data/settings.py +33 -0
  251. dateparser_scripts/__init__.py +0 -0
  252. dateparser_scripts/get_cldr_data.py +567 -0
  253. dateparser_scripts/order_languages.py +217 -0
  254. dateparser_scripts/update_supported_languages_and_locales.py +48 -0
  255. dateparser_scripts/utils.py +73 -0
  256. dateparser_scripts/write_complete_data.py +129 -0
@@ -0,0 +1,156 @@
1
+ from datetime import datetime, time, timezone
2
+
3
+ import regex as re
4
+ from dateutil.relativedelta import relativedelta
5
+ from tzlocal import get_localzone
6
+
7
+ from dateparser.utils import apply_timezone, localize_timezone, strip_braces
8
+
9
+ from .parser import time_parser
10
+ from .timezone_parser import pop_tz_offset_from_string
11
+
12
# Alternation of the singular unit names accepted in freshness expressions;
# interpolated into PATTERN and reused by _are_all_words_units.
_UNITS = r"decade|year|month|week|day|hour|minute|second"

# Captures (amount, unit) pairs such as "2 week" or "1,5 hour". Matching is
# case-insensitive and the trailing \b rejects longer words such as "days".
PATTERN = re.compile(
    r"(\d+[.,]?\d*)\s*(%s)\b" % _UNITS,
    flags=re.IGNORECASE | re.DOTALL | re.UNICODE,
)


class FreshnessDateDataParser:
    """Parses date string like "1 year, 2 months ago" and "3 hours, 50 minutes ago" """

    def _are_all_words_units(self, date_string):
        """Return True when every word of *date_string* begins with a known token.

        Words are obtained by splitting on non-word characters. A word is
        accepted when it *starts with* a unit name, ``ago``, ``in``, digits,
        ``:`` or ``am``/``pm`` (``re.match`` anchors only at the start).
        """
        skip = [_UNITS, r"ago|in|\d+", r":|[ap]m"]

        date_string = re.sub(r"\s+", " ", date_string.strip())

        words = [x for x in re.split(r"\W", date_string) if x]
        words = [x for x in words if not re.match(r"%s" % "|".join(skip), x)]
        return not words

    def _parse_time(self, date_string, settings):
        """Attempts to parse time part of date strings like '1 day ago, 2 PM'

        The "<amount> <unit>" pairs and the words ``ago``/``in`` are removed
        and the remainder is handed to ``time_parser``. Returns whatever
        ``time_parser`` returns, or ``None`` if it raises. *settings* is
        accepted for interface compatibility but not used here.
        """
        date_string = PATTERN.sub("", date_string)
        date_string = re.sub(r"\b(?:ago|in)\b", "", date_string)
        try:
            return time_parser(date_string)
        except Exception:
            # Best effort: a string without a parsable time simply has no
            # time component.
            return None

    def get_local_tz(self):
        """Return the local timezone as reported by ``tzlocal.get_localzone``."""
        return get_localzone()

    def parse(self, date_string, settings):
        """Parse a relative date string into a ``(date, period)`` pair.

        :param date_string:
            Relative date expression, e.g. "2 hour ago".
        :param settings:
            Settings object; this method reads ``TIMEZONE``, ``RELATIVE_BASE``,
            ``PREFER_DATES_FROM``, ``RETURN_TIME_AS_PERIOD``, ``TO_TIMEZONE``
            and ``RETURN_AS_TIMEZONE_AWARE`` from it.

        :return: ``(date, period)`` as produced by :meth:`_parse_date`, with
            the optional time-of-day and timezone settings applied; both are
            ``None`` when the string is not a freshness expression.
        """
        date_string = strip_braces(date_string)
        date_string, ptz = pop_tz_offset_from_string(date_string)
        _time = self._parse_time(date_string, settings)

        _settings_tz = settings.TIMEZONE.lower()

        def apply_time(dateobj, timeobj):
            # Only override the time of day when a real time was parsed.
            if not isinstance(timeobj, time):
                return dateobj

            return dateobj.replace(
                hour=timeobj.hour,
                minute=timeobj.minute,
                second=timeobj.second,
                microsecond=timeobj.microsecond,
            )

        # Work out the reference instant ("now") the offset is applied to.
        if settings.RELATIVE_BASE:
            now = settings.RELATIVE_BASE

            if "local" not in _settings_tz:
                now = localize_timezone(now, settings.TIMEZONE)

            if ptz:
                if now.tzinfo:
                    now = now.astimezone(ptz)
                elif hasattr(ptz, "localize"):
                    # Timezone objects offering ``localize`` attach themselves
                    # through that method rather than ``replace``.
                    now = ptz.localize(now)
                else:
                    now = now.replace(tzinfo=ptz)

            if not now.tzinfo:
                now = now.replace(tzinfo=self.get_local_tz())

        elif ptz:
            localized_now = datetime.now(ptz)

            if "local" in _settings_tz:
                now = localized_now
            else:
                now = apply_timezone(localized_now, settings.TIMEZONE)

        elif "local" not in _settings_tz:
            utc_dt = datetime.now(tz=timezone.utc)
            now = apply_timezone(utc_dt, settings.TIMEZONE)
        else:
            now = datetime.now(self.get_local_tz())

        date, period = self._parse_date(date_string, now, settings.PREFER_DATES_FROM)

        if date:
            old_date = date
            date = apply_time(date, _time)
            if settings.RETURN_TIME_AS_PERIOD and old_date != date:
                period = "time"

            if settings.TO_TIMEZONE:
                date = apply_timezone(date, settings.TO_TIMEZONE)

            # Strip tzinfo unless awareness was requested; in "default" mode
            # keep it only when the string itself carried a timezone.
            aware = settings.RETURN_AS_TIMEZONE_AWARE
            if not aware or ("default" == aware and not ptz):
                date = date.replace(tzinfo=None)

        return date, period

    def _parse_date(self, date_string, now, prefer_dates_from):
        """Apply the offset described by *date_string* to *now*.

        :return: ``(date, period)``; ``(None, None)`` when the string contains
            words that are not freshness tokens or no "<amount> <unit>" pair.
            *period* is ``"day"`` unless no day amount is present and a week,
            month or year amount is (checked in that order).
        """
        if not self._are_all_words_units(date_string):
            return None, None

        kwargs = self.get_kwargs(date_string)
        if not kwargs:
            return None, None

        period = "day"
        if "days" not in kwargs:
            for k in ["weeks", "months", "years"]:
                if k in kwargs:
                    period = k[:-1]
                    break
        td = relativedelta(**kwargs)

        # "in ..." always points to the future; otherwise the future is used
        # only when preferred and the string does not say "ago".
        is_future = re.search(r"\bin\b", date_string) or (
            re.search(r"\bfuture\b", prefer_dates_from)
            and not re.search(r"\bago\b", date_string)
        )
        date = now + td if is_future else now - td
        return date, period

    def get_kwargs(self, date_string):
        """Extract ``relativedelta`` keyword arguments from *date_string*.

        :return: dict mapping plural unit names (``"days"``, ``"hours"``, ...)
            to float amounts; decades are folded into ``"years"``. Empty when
            no "<amount> <unit>" pair is found.
        """
        m = PATTERN.findall(date_string)
        if not m:
            return {}

        kwargs = {}
        for num, unit in m:
            # PATTERN is case-insensitive, so the captured unit may be in any
            # case; the keyword names built here must be lowercase.
            kwargs[unit.lower() + "s"] = float(num.replace(",", "."))
        if "decades" in kwargs:
            kwargs["years"] = 10 * kwargs["decades"] + kwargs.get("years", 0)
            del kwargs["decades"]
        return kwargs

    def get_date_data(self, date_string, settings=None):
        """Parse *date_string* and wrap the result in a ``DateData`` object.

        Despite the ``None`` default, :meth:`parse` reads attributes from
        *settings*, so a settings object has to be supplied.
        """
        # Imported here rather than at module level, as in the original code.
        from dateparser.date import DateData

        date, period = self.parse(date_string, settings)
        return DateData(date_obj=date, period=period)


# Module-level instance of the parser.
freshness_date_parser = FreshnessDateDataParser()
@@ -0,0 +1,2 @@
1
+ from .loader import default_loader
2
+ from .locale import Locale
@@ -0,0 +1,352 @@
1
+ from itertools import chain, zip_longest
2
+ from operator import methodcaller
3
+
4
+ import regex as re
5
+
6
+ from dateparser.utils import normalize_unicode
7
+
8
# Punctuation and whitespace tokens; together with "+" they form
# ALWAYS_KEEP_TOKENS below.
PARSER_HARDCODED_TOKENS = [":", ".", " ", "-", "/"]

# Tokens whose original casing is stored as the dictionary value (the
# dictionary keys are their lowercase forms).
PARSER_KNOWN_TOKENS = ["am", "pm", "UTC", "GMT", "Z"]

# Tokens that are kept when splitting regardless of ``keep_formatting``.
ALWAYS_KEEP_TOKENS = ["+"] + PARSER_HARDCODED_TOKENS

# Keys looked up in the locale info; each translation found there is mapped
# back to the corresponding word in this list.
KNOWN_WORD_TOKENS = [
    # weekdays
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    # months
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
    # units
    "decade", "year", "month", "week", "day", "hour", "minute", "second",
    # direction words and meridiem markers
    "ago", "in", "am", "pm",
]  # fmt: skip

# Matches a single opening or closing parenthesis.
PARENTHESES_PATTERN = re.compile(r"[\(\)]")
# Captures runs of digits (the capture group keeps them when splitting).
NUMERAL_PATTERN = re.compile(r"(\d+)")
# Matches strings containing at least one word character other than "_".
KEEP_TOKEN_PATTERN = re.compile(r"^.*[^\W_].*$", flags=re.UNICODE)
48
+
49
+
50
class UnknownTokenError(Exception):
    """Exception type for unknown tokens; adds nothing to :class:`Exception`."""
52
+
53
+
54
class Dictionary:
    """
    Class that modifies and stores translations and handles splitting of date string.

    :param locale_info:
        Locale info (translation data) of the locale.
    :type locale_info: dict

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
        The object must expose ``SKIP_TOKENS``, ``registry_key`` and
        ``CACHE_SIZE_LIMIT`` attributes, which are read by this class.

    :return: a Dictionary instance.
    """

    # Class-level caches shared by every instance. Each one is keyed first by
    # ``settings.registry_key`` and then by the locale name (``info["name"]``).
    _split_regex_cache = {}
    _sorted_words_cache = {}
    _split_relative_regex_cache = {}
    _sorted_relative_strings_cache = {}
    _match_relative_regex_cache = {}

    def __init__(self, locale_info, settings=None):
        dictionary = {}
        self._settings = settings
        self.info = locale_info

        lower = methodcaller("lower")

        # "skip" and "pertain" words are known but translate to nothing.
        if "skip" in locale_info:
            dictionary.update(dict.fromkeys(map(lower, locale_info["skip"])))
        if "pertain" in locale_info:
            dictionary.update(dict.fromkeys(map(lower, locale_info["pertain"])))
        # Every translation of a known word maps back to that word.
        for word in KNOWN_WORD_TOKENS:
            if word in locale_info:
                dictionary.update(dict.fromkeys(map(lower, locale_info[word]), word))
        dictionary.update(zip(ALWAYS_KEEP_TOKENS, ALWAYS_KEEP_TOKENS))
        dictionary.update((token.lower(), token) for token in PARSER_KNOWN_TOKENS)

        relative_type = locale_info.get("relative-type", {})
        for key, value in relative_type.items():
            dictionary.update(dict.fromkeys(map(lower, value), key))

        self._dictionary = dictionary

        self._no_word_spacing = self._parse_flag(
            locale_info.get("no_word_spacing", "False")
        )

        relative_type_regex = locale_info.get("relative-type-regex", {})
        self._relative_strings = list(chain.from_iterable(relative_type_regex.values()))

    @staticmethod
    def _parse_flag(value):
        """Convert a flag such as ``"True"``/``"False"`` to a bool without ``eval``.

        Strings are parsed as Python literals only, so arbitrary expressions
        are rejected with ``ValueError``/``SyntaxError`` instead of being
        executed; non-string values are converted with ``bool()``.
        """
        if isinstance(value, str):
            import ast

            return bool(ast.literal_eval(value))
        return bool(value)

    def __contains__(self, key):
        if key in self._settings.SKIP_TOKENS:
            return True
        return key in self._dictionary

    def __getitem__(self, key):
        # Tokens listed in the settings' SKIP_TOKENS translate to nothing.
        if key in self._settings.SKIP_TOKENS:
            return None
        return self._dictionary[key]

    def __iter__(self):
        return chain(self._settings.SKIP_TOKENS, iter(self._dictionary))

    def are_tokens_valid(self, tokens):
        """
        Check if tokens are valid tokens for the locale.

        :param tokens:
            a list of string tokens.
        :type tokens: list

        :return: True if tokens are valid, False otherwise.
        """
        # A token list made up solely of separators carries no date content.
        has_only_keep_tokens = not set(tokens) - set(ALWAYS_KEEP_TOKENS)
        if has_only_keep_tokens:
            return False
        match_relative_regex = self._get_match_relative_regex_cache()
        return all(
            token.isdigit() or match_relative_regex.match(token) or token in self
            for token in tokens
        )

    def split(self, string, keep_formatting=False):
        """
        Split the date string using translations in locale info.

        :param string:
            Date string to be splitted.
        :type string:
            str

        :param keep_formatting:
            If True, retain formatting of the date string.
        :type keep_formatting: bool

        :return: A list of string tokens formed after splitting the date string.
            A falsy *string* is returned unchanged.
        """
        if not string:
            return string

        split_relative_regex = self._get_split_relative_regex_cache()
        match_relative_regex = self._get_match_relative_regex_cache()

        tokens = split_relative_regex.split(string)

        for i, token in enumerate(tokens):
            # Relative expressions stay whole; everything else is split
            # further by the known words.
            if match_relative_regex.match(token):
                tokens[i] = [token]
                continue
            tokens[i] = self._split_by_known_words(token, keep_formatting)

        return list(filter(bool, chain.from_iterable(tokens)))

    def _add_to_cache(self, value, cache):
        """Store *value* in *cache* for this settings/locale pair.

        When ``CACHE_SIZE_LIMIT`` is set and exceeded, the oldest registry key
        other than the one just written is evicted, so the fresh entry is
        always still present when the caller reads it back.
        """
        registry_key = self._settings.registry_key
        cache.setdefault(registry_key, {})[self.info["name"]] = value

        limit = self._settings.CACHE_SIZE_LIMIT
        if limit and len(cache) > limit:
            for stale_key in cache:
                if stale_key != registry_key:
                    del cache[stale_key]
                    break  # stop right after mutating the dict

    def _is_cached(self, cache):
        """Tell whether *cache* holds an entry for this settings/locale pair."""
        return self.info["name"] in cache.get(self._settings.registry_key, {})

    def _from_cache(self, cache):
        """Return the entry of *cache* for this settings/locale pair."""
        return cache[self._settings.registry_key][self.info["name"]]

    def _split_by_known_words(self, string: str, keep_formatting: bool):
        """Split *string* around the words known to this dictionary.

        Repeatedly matches the split regex, collecting the text before a
        known word (further split by numerals), the known word itself and
        then continuing with the remainder until nothing is left.
        """
        regex = self._get_split_regex_cache()
        splitted = []
        unknown = string

        while unknown:
            match = regex.match(string)

            if not match:
                curr_split = (
                    self._split_by_numerals(string, keep_formatting)
                    if self._should_capture(string, keep_formatting)
                    else []
                )
                unknown = ""
            else:
                unparsed, known, unknown = match.groups()
                curr_split = (
                    [known] if self._should_capture(known, keep_formatting) else []
                )
                if unparsed and self._should_capture(unparsed, keep_formatting):
                    curr_split = (
                        self._split_by_numerals(unparsed, keep_formatting) + curr_split
                    )
                if unknown:
                    # Guard against looping forever when no progress is made.
                    string = unknown if string != unknown else ""

            splitted.extend(curr_split)
        return splitted

    def _split_by_numerals(self, string, keep_formatting):
        """Split *string* around digit runs, keeping only capturable pieces."""
        return [
            token
            for token in NUMERAL_PATTERN.split(string)
            if self._should_capture(token, keep_formatting)
        ]

    def _should_capture(self, token, keep_formatting):
        """Return a truthy value when *token* should be kept in the split output."""
        return (
            keep_formatting
            or token in ALWAYS_KEEP_TOKENS
            or KEEP_TOKEN_PATTERN.match(token)
        )

    def _get_sorted_words_from_cache(self):
        """Return all dictionary words sorted longest first (cached)."""
        if not self._is_cached(self._sorted_words_cache):
            self._add_to_cache(
                cache=self._sorted_words_cache,
                value=sorted(self, key=len, reverse=True),
            )
        return self._from_cache(self._sorted_words_cache)

    def _get_split_regex_cache(self):
        """Return the compiled known-word split regex (cached)."""
        if not self._is_cached(self._split_regex_cache):
            self._construct_split_regex()
        return self._from_cache(self._split_regex_cache)

    def _construct_split_regex(self):
        """Compile and cache the regex that isolates one known word."""
        known_words_group = "|".join(
            map(re.escape, self._get_sorted_words_from_cache())
        )
        if self._no_word_spacing:
            regex = r"^(.*?)({})(.*)$".format(known_words_group)
        else:
            # With word spacing, a known word must be delimited by a
            # non-word character, underscore, digit or the string boundary.
            regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(
                known_words_group
            )
        self._add_to_cache(
            cache=self._split_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_sorted_relative_strings_from_cache(self):
        """Return the relative strings, parentheses removed, longest first (cached)."""
        if not self._is_cached(self._sorted_relative_strings_cache):
            self._add_to_cache(
                cache=self._sorted_relative_strings_cache,
                value=sorted(
                    [
                        PARENTHESES_PATTERN.sub("", key)
                        for key in self._relative_strings
                    ],
                    key=len,
                    reverse=True,
                ),
            )
        return self._from_cache(self._sorted_relative_strings_cache)

    def _get_split_relative_regex_cache(self):
        """Return the compiled regex used to split on relative strings (cached)."""
        if not self._is_cached(self._split_relative_regex_cache):
            self._construct_split_relative_regex()
        return self._from_cache(self._split_relative_regex_cache)

    def _construct_split_relative_regex(self):
        """Compile and cache the regex that splits a string on relative strings."""
        # The relative strings are joined unescaped, i.e. used as regex
        # fragments.
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        if self._no_word_spacing:
            regex = "({})".format(known_relative_strings_group)
        else:
            regex = "(?<=(?:\\A|\\W|_))({})(?=(?:\\Z|\\W|_))".format(
                known_relative_strings_group
            )
        self._add_to_cache(
            cache=self._split_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_match_relative_regex_cache(self):
        """Return the compiled regex matching a whole relative string (cached)."""
        if not self._is_cached(self._match_relative_regex_cache):
            self._construct_match_relative_regex()
        return self._from_cache(self._match_relative_regex_cache)

    def _construct_match_relative_regex(self):
        """Compile and cache the regex that matches exactly one relative string."""
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        regex = "^({})$".format(known_relative_strings_group)
        self._add_to_cache(
            cache=self._match_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )
331
+
332
+
333
class NormalizedDictionary(Dictionary):
    """Dictionary whose keys and relative strings go through ``normalize_unicode``."""

    def __init__(self, locale_info, settings=None):
        super().__init__(locale_info, settings)
        self._normalize()

    def _normalize(self):
        """Rebuild the dictionary with normalized keys.

        A key whose normalized form differs from it and is already a key of
        the original dictionary is a conflict: it is left out, unless it is
        one of the locale's ``skip``/``pertain`` words, in which case its
        value takes over the normalized key.
        """
        original = self._dictionary
        rebuilt = {}
        clashing = []

        for word, translation in original.items():
            plain = normalize_unicode(word)
            if word == plain or plain not in original:
                rebuilt[plain] = translation
            else:
                clashing.append(word)

        for word in clashing:
            plain = normalize_unicode(word)
            untranslated = self.info.get("skip", []) + self.info.get("pertain", [])
            if word in untranslated:
                rebuilt[plain] = original[word]

        self._dictionary = rebuilt
        self._relative_strings = [
            normalize_unicode(relative) for relative in self._relative_strings
        ]