dateparser 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (256) hide show
  1. dateparser/__init__.py +82 -0
  2. dateparser/calendars/__init__.py +144 -0
  3. dateparser/calendars/hijri.py +6 -0
  4. dateparser/calendars/hijri_parser.py +60 -0
  5. dateparser/calendars/jalali.py +9 -0
  6. dateparser/calendars/jalali_parser.py +184 -0
  7. dateparser/conf.py +267 -0
  8. dateparser/custom_language_detection/__init__.py +0 -0
  9. dateparser/custom_language_detection/fasttext.py +43 -0
  10. dateparser/custom_language_detection/langdetect.py +37 -0
  11. dateparser/custom_language_detection/language_mapping.py +18 -0
  12. dateparser/data/__init__.py +2 -0
  13. dateparser/data/date_translation_data/__init__.py +0 -0
  14. dateparser/data/date_translation_data/af.py +242 -0
  15. dateparser/data/date_translation_data/agq.py +169 -0
  16. dateparser/data/date_translation_data/ak.py +169 -0
  17. dateparser/data/date_translation_data/am.py +222 -0
  18. dateparser/data/date_translation_data/ar.py +574 -0
  19. dateparser/data/date_translation_data/as.py +164 -0
  20. dateparser/data/date_translation_data/asa.py +168 -0
  21. dateparser/data/date_translation_data/ast.py +280 -0
  22. dateparser/data/date_translation_data/az-Cyrl.py +168 -0
  23. dateparser/data/date_translation_data/az-Latn.py +217 -0
  24. dateparser/data/date_translation_data/az.py +217 -0
  25. dateparser/data/date_translation_data/bas.py +169 -0
  26. dateparser/data/date_translation_data/be.py +340 -0
  27. dateparser/data/date_translation_data/bem.py +161 -0
  28. dateparser/data/date_translation_data/bez.py +169 -0
  29. dateparser/data/date_translation_data/bg.py +345 -0
  30. dateparser/data/date_translation_data/bm.py +167 -0
  31. dateparser/data/date_translation_data/bn.py +241 -0
  32. dateparser/data/date_translation_data/bo.py +185 -0
  33. dateparser/data/date_translation_data/br.py +226 -0
  34. dateparser/data/date_translation_data/brx.py +157 -0
  35. dateparser/data/date_translation_data/bs-Cyrl.py +226 -0
  36. dateparser/data/date_translation_data/bs-Latn.py +248 -0
  37. dateparser/data/date_translation_data/bs.py +248 -0
  38. dateparser/data/date_translation_data/ca.py +313 -0
  39. dateparser/data/date_translation_data/ce.py +225 -0
  40. dateparser/data/date_translation_data/cgg.py +169 -0
  41. dateparser/data/date_translation_data/chr.py +240 -0
  42. dateparser/data/date_translation_data/ckb.py +154 -0
  43. dateparser/data/date_translation_data/cs.py +316 -0
  44. dateparser/data/date_translation_data/cy.py +217 -0
  45. dateparser/data/date_translation_data/da.py +296 -0
  46. dateparser/data/date_translation_data/dav.py +169 -0
  47. dateparser/data/date_translation_data/de.py +357 -0
  48. dateparser/data/date_translation_data/dje.py +167 -0
  49. dateparser/data/date_translation_data/dsb.py +270 -0
  50. dateparser/data/date_translation_data/dua.py +169 -0
  51. dateparser/data/date_translation_data/dyo.py +168 -0
  52. dateparser/data/date_translation_data/dz.py +225 -0
  53. dateparser/data/date_translation_data/ebu.py +169 -0
  54. dateparser/data/date_translation_data/ee.py +233 -0
  55. dateparser/data/date_translation_data/el.py +279 -0
  56. dateparser/data/date_translation_data/en.py +851 -0
  57. dateparser/data/date_translation_data/eo.py +169 -0
  58. dateparser/data/date_translation_data/es.py +499 -0
  59. dateparser/data/date_translation_data/et.py +233 -0
  60. dateparser/data/date_translation_data/eu.py +219 -0
  61. dateparser/data/date_translation_data/ewo.py +169 -0
  62. dateparser/data/date_translation_data/fa.py +270 -0
  63. dateparser/data/date_translation_data/ff.py +179 -0
  64. dateparser/data/date_translation_data/fi.py +345 -0
  65. dateparser/data/date_translation_data/fil.py +223 -0
  66. dateparser/data/date_translation_data/fo.py +256 -0
  67. dateparser/data/date_translation_data/fr.py +520 -0
  68. dateparser/data/date_translation_data/fur.py +223 -0
  69. dateparser/data/date_translation_data/fy.py +223 -0
  70. dateparser/data/date_translation_data/ga.py +238 -0
  71. dateparser/data/date_translation_data/gd.py +277 -0
  72. dateparser/data/date_translation_data/gl.py +253 -0
  73. dateparser/data/date_translation_data/gsw.py +179 -0
  74. dateparser/data/date_translation_data/gu.py +216 -0
  75. dateparser/data/date_translation_data/guz.py +170 -0
  76. dateparser/data/date_translation_data/gv.py +166 -0
  77. dateparser/data/date_translation_data/ha.py +176 -0
  78. dateparser/data/date_translation_data/haw.py +168 -0
  79. dateparser/data/date_translation_data/he.py +371 -0
  80. dateparser/data/date_translation_data/hi.py +261 -0
  81. dateparser/data/date_translation_data/hr.py +378 -0
  82. dateparser/data/date_translation_data/hsb.py +271 -0
  83. dateparser/data/date_translation_data/hu.py +297 -0
  84. dateparser/data/date_translation_data/hy.py +246 -0
  85. dateparser/data/date_translation_data/id.py +272 -0
  86. dateparser/data/date_translation_data/ig.py +168 -0
  87. dateparser/data/date_translation_data/ii.py +157 -0
  88. dateparser/data/date_translation_data/is.py +242 -0
  89. dateparser/data/date_translation_data/it.py +282 -0
  90. dateparser/data/date_translation_data/ja.py +286 -0
  91. dateparser/data/date_translation_data/jgo.py +188 -0
  92. dateparser/data/date_translation_data/jmc.py +168 -0
  93. dateparser/data/date_translation_data/ka.py +241 -0
  94. dateparser/data/date_translation_data/kab.py +169 -0
  95. dateparser/data/date_translation_data/kam.py +169 -0
  96. dateparser/data/date_translation_data/kde.py +169 -0
  97. dateparser/data/date_translation_data/kea.py +230 -0
  98. dateparser/data/date_translation_data/khq.py +167 -0
  99. dateparser/data/date_translation_data/ki.py +169 -0
  100. dateparser/data/date_translation_data/kk.py +228 -0
  101. dateparser/data/date_translation_data/kl.py +213 -0
  102. dateparser/data/date_translation_data/kln.py +171 -0
  103. dateparser/data/date_translation_data/km.py +198 -0
  104. dateparser/data/date_translation_data/kn.py +225 -0
  105. dateparser/data/date_translation_data/ko.py +207 -0
  106. dateparser/data/date_translation_data/kok.py +157 -0
  107. dateparser/data/date_translation_data/ks.py +152 -0
  108. dateparser/data/date_translation_data/ksb.py +168 -0
  109. dateparser/data/date_translation_data/ksf.py +169 -0
  110. dateparser/data/date_translation_data/ksh.py +192 -0
  111. dateparser/data/date_translation_data/kw.py +169 -0
  112. dateparser/data/date_translation_data/ky.py +240 -0
  113. dateparser/data/date_translation_data/lag.py +169 -0
  114. dateparser/data/date_translation_data/lb.py +233 -0
  115. dateparser/data/date_translation_data/lg.py +169 -0
  116. dateparser/data/date_translation_data/lkt.py +194 -0
  117. dateparser/data/date_translation_data/ln.py +179 -0
  118. dateparser/data/date_translation_data/lo.py +228 -0
  119. dateparser/data/date_translation_data/lrc.py +154 -0
  120. dateparser/data/date_translation_data/lt.py +263 -0
  121. dateparser/data/date_translation_data/lu.py +169 -0
  122. dateparser/data/date_translation_data/luo.py +169 -0
  123. dateparser/data/date_translation_data/luy.py +168 -0
  124. dateparser/data/date_translation_data/lv.py +257 -0
  125. dateparser/data/date_translation_data/mas.py +173 -0
  126. dateparser/data/date_translation_data/mer.py +168 -0
  127. dateparser/data/date_translation_data/mfe.py +166 -0
  128. dateparser/data/date_translation_data/mg.py +168 -0
  129. dateparser/data/date_translation_data/mgh.py +169 -0
  130. dateparser/data/date_translation_data/mgo.py +151 -0
  131. dateparser/data/date_translation_data/mk.py +234 -0
  132. dateparser/data/date_translation_data/ml.py +217 -0
  133. dateparser/data/date_translation_data/mn.py +224 -0
  134. dateparser/data/date_translation_data/mr.py +229 -0
  135. dateparser/data/date_translation_data/ms.py +242 -0
  136. dateparser/data/date_translation_data/mt.py +175 -0
  137. dateparser/data/date_translation_data/mua.py +169 -0
  138. dateparser/data/date_translation_data/my.py +203 -0
  139. dateparser/data/date_translation_data/mzn.py +199 -0
  140. dateparser/data/date_translation_data/naq.py +169 -0
  141. dateparser/data/date_translation_data/nb.py +261 -0
  142. dateparser/data/date_translation_data/nd.py +169 -0
  143. dateparser/data/date_translation_data/ne.py +207 -0
  144. dateparser/data/date_translation_data/nl.py +273 -0
  145. dateparser/data/date_translation_data/nmg.py +169 -0
  146. dateparser/data/date_translation_data/nn.py +231 -0
  147. dateparser/data/date_translation_data/nnh.py +150 -0
  148. dateparser/data/date_translation_data/nus.py +166 -0
  149. dateparser/data/date_translation_data/nyn.py +169 -0
  150. dateparser/data/date_translation_data/om.py +173 -0
  151. dateparser/data/date_translation_data/or.py +157 -0
  152. dateparser/data/date_translation_data/os.py +203 -0
  153. dateparser/data/date_translation_data/pa-Arab.py +150 -0
  154. dateparser/data/date_translation_data/pa-Guru.py +221 -0
  155. dateparser/data/date_translation_data/pa.py +221 -0
  156. dateparser/data/date_translation_data/pl.py +416 -0
  157. dateparser/data/date_translation_data/ps.py +150 -0
  158. dateparser/data/date_translation_data/pt.py +981 -0
  159. dateparser/data/date_translation_data/qu.py +176 -0
  160. dateparser/data/date_translation_data/rm.py +166 -0
  161. dateparser/data/date_translation_data/rn.py +169 -0
  162. dateparser/data/date_translation_data/ro.py +270 -0
  163. dateparser/data/date_translation_data/rof.py +157 -0
  164. dateparser/data/date_translation_data/ru.py +442 -0
  165. dateparser/data/date_translation_data/rw.py +169 -0
  166. dateparser/data/date_translation_data/rwk.py +168 -0
  167. dateparser/data/date_translation_data/sah.py +219 -0
  168. dateparser/data/date_translation_data/saq.py +169 -0
  169. dateparser/data/date_translation_data/sbp.py +169 -0
  170. dateparser/data/date_translation_data/se.py +280 -0
  171. dateparser/data/date_translation_data/seh.py +169 -0
  172. dateparser/data/date_translation_data/ses.py +167 -0
  173. dateparser/data/date_translation_data/sg.py +169 -0
  174. dateparser/data/date_translation_data/shi-Latn.py +169 -0
  175. dateparser/data/date_translation_data/shi-Tfng.py +169 -0
  176. dateparser/data/date_translation_data/shi.py +169 -0
  177. dateparser/data/date_translation_data/si.py +220 -0
  178. dateparser/data/date_translation_data/sk.py +327 -0
  179. dateparser/data/date_translation_data/sl.py +244 -0
  180. dateparser/data/date_translation_data/smn.py +176 -0
  181. dateparser/data/date_translation_data/sn.py +169 -0
  182. dateparser/data/date_translation_data/so.py +179 -0
  183. dateparser/data/date_translation_data/sq.py +237 -0
  184. dateparser/data/date_translation_data/sr-Cyrl.py +306 -0
  185. dateparser/data/date_translation_data/sr-Latn.py +306 -0
  186. dateparser/data/date_translation_data/sr.py +255 -0
  187. dateparser/data/date_translation_data/sv.py +309 -0
  188. dateparser/data/date_translation_data/sw.py +231 -0
  189. dateparser/data/date_translation_data/ta.py +264 -0
  190. dateparser/data/date_translation_data/te.py +239 -0
  191. dateparser/data/date_translation_data/teo.py +173 -0
  192. dateparser/data/date_translation_data/th.py +300 -0
  193. dateparser/data/date_translation_data/ti.py +173 -0
  194. dateparser/data/date_translation_data/tl.py +137 -0
  195. dateparser/data/date_translation_data/to.py +216 -0
  196. dateparser/data/date_translation_data/tr.py +259 -0
  197. dateparser/data/date_translation_data/twq.py +167 -0
  198. dateparser/data/date_translation_data/tzm.py +169 -0
  199. dateparser/data/date_translation_data/ug.py +203 -0
  200. dateparser/data/date_translation_data/uk.py +502 -0
  201. dateparser/data/date_translation_data/ur.py +256 -0
  202. dateparser/data/date_translation_data/uz-Arab.py +167 -0
  203. dateparser/data/date_translation_data/uz-Cyrl.py +210 -0
  204. dateparser/data/date_translation_data/uz-Latn.py +216 -0
  205. dateparser/data/date_translation_data/uz.py +216 -0
  206. dateparser/data/date_translation_data/vi.py +260 -0
  207. dateparser/data/date_translation_data/vun.py +168 -0
  208. dateparser/data/date_translation_data/wae.py +224 -0
  209. dateparser/data/date_translation_data/xog.py +169 -0
  210. dateparser/data/date_translation_data/yav.py +169 -0
  211. dateparser/data/date_translation_data/yi.py +178 -0
  212. dateparser/data/date_translation_data/yo.py +263 -0
  213. dateparser/data/date_translation_data/yue.py +203 -0
  214. dateparser/data/date_translation_data/zgh.py +169 -0
  215. dateparser/data/date_translation_data/zh-Hans.py +240 -0
  216. dateparser/data/date_translation_data/zh-Hant.py +402 -0
  217. dateparser/data/date_translation_data/zh.py +273 -0
  218. dateparser/data/date_translation_data/zu.py +196 -0
  219. dateparser/data/languages_info.py +826 -0
  220. dateparser/date.py +599 -0
  221. dateparser/date_parser.py +55 -0
  222. dateparser/freshness_date_parser.py +156 -0
  223. dateparser/languages/__init__.py +2 -0
  224. dateparser/languages/dictionary.py +352 -0
  225. dateparser/languages/loader.py +224 -0
  226. dateparser/languages/locale.py +625 -0
  227. dateparser/languages/validation.py +467 -0
  228. dateparser/parser.py +742 -0
  229. dateparser/search/__init__.py +71 -0
  230. dateparser/search/detection.py +78 -0
  231. dateparser/search/search.py +297 -0
  232. dateparser/search/text_detection.py +89 -0
  233. dateparser/timezone_parser.py +91 -0
  234. dateparser/timezones.py +469 -0
  235. dateparser/utils/__init__.py +257 -0
  236. dateparser/utils/strptime.py +108 -0
  237. dateparser-1.2.1.dist-info/AUTHORS.rst +17 -0
  238. dateparser-1.2.1.dist-info/LICENSE +12 -0
  239. dateparser-1.2.1.dist-info/METADATA +864 -0
  240. dateparser-1.2.1.dist-info/RECORD +256 -0
  241. dateparser-1.2.1.dist-info/WHEEL +5 -0
  242. dateparser-1.2.1.dist-info/entry_points.txt +2 -0
  243. dateparser-1.2.1.dist-info/top_level.txt +4 -0
  244. dateparser_cli/__init__.py +0 -0
  245. dateparser_cli/cli.py +36 -0
  246. dateparser_cli/exceptions.py +2 -0
  247. dateparser_cli/fasttext_manager.py +42 -0
  248. dateparser_cli/utils.py +27 -0
  249. dateparser_data/__init__.py +0 -0
  250. dateparser_data/settings.py +33 -0
  251. dateparser_scripts/__init__.py +0 -0
  252. dateparser_scripts/get_cldr_data.py +567 -0
  253. dateparser_scripts/order_languages.py +217 -0
  254. dateparser_scripts/update_supported_languages_and_locales.py +48 -0
  255. dateparser_scripts/utils.py +73 -0
  256. dateparser_scripts/write_complete_data.py +129 -0
@@ -0,0 +1,71 @@
1
+ from dateparser.search.search import DateSearchWithDetection
2
+
3
+ _search_with_detection = DateSearchWithDetection()
4
+
5
+
6
def search_dates(
    text,
    languages=None,
    settings=None,
    add_detected_language=False,
    detect_languages_function=None,
):
    """Find all substrings of the given string which represent date and/or time and parse them.

    :param text:
        A string in a natural language which may contain date and/or time expressions.
    :type text: str

    :param languages:
        A list of two-letter language codes, e.g. ['en', 'es']. If languages are given, it will
        not attempt to detect the language.
    :type languages: list

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
    :type settings: dict

    :param add_detected_language:
        Indicates if we want the detected language returned in the tuple.
    :type add_detected_language: bool

    :param detect_languages_function:
        A function for language detection that takes as input a `text` and a `confidence_threshold`,
        and returns a list of detected language codes.
        Note: detect_languages_function is only used if `languages` are not provided.
    :type detect_languages_function: function

    :return: Returns list of tuples containing:
             substrings representing date and/or time, corresponding :mod:`datetime.datetime`
             object and detected language if *add_detected_language* is True.
             Returns None if no dates that can be parsed are found.
    :rtype: list
    :raises: ValueError - Unknown Language

    >>> from dateparser.search import search_dates
    >>> search_dates('The first artificial Earth satellite was launched on 4 October 1957.')
    [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0))]

    >>> search_dates('The first artificial Earth satellite was launched on 4 October 1957.',
    ...              add_detected_language=True)
    [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0), 'en')]

    >>> search_dates("The client arrived to the office for the first time in March 3rd, 2004 "
    ...              "and got serviced, after a couple of months, on May 6th 2004, the customer "
    ...              "returned indicating a defect on the part")
    [('in March 3rd, 2004 and', datetime.datetime(2004, 3, 3, 0, 0)),
     ('on May 6th 2004', datetime.datetime(2004, 5, 6, 0, 0))]

    """
    result = _search_with_detection.search_dates(
        text=text,
        languages=languages,
        settings=settings,
        detect_languages_function=detect_languages_function,
    )
    dates = result.get("Dates")
    if not dates:
        # Both a missing "Dates" entry and an empty list mean "nothing found";
        # the documented contract for that case is None.
        return None

    if add_detected_language:
        # Each entry is a (substring, datetime) tuple; extend it with the language code.
        language = result.get("Language")
        dates = [date + (language,) for date in dates]
    return dates
@@ -0,0 +1,78 @@
1
+ from functools import wraps
2
+
3
+
4
def _restore_languages_on_generator_exit(method):
    """Decorate a generator method so ``self.languages`` is reset once it is exhausted.

    A copy of ``self.languages`` is taken when iteration of the returned
    generator starts. After the wrapped generator has yielded its last item,
    the contents of ``self.languages`` are replaced in place with that copy.

    The reset happens only when the generator runs to completion. If the
    consumer stops early (or the wrapped generator raises), ``self.languages``
    keeps whatever state the wrapped method left it in.
    NOTE(review): presumably intentional, so that a ``modify=True`` caller
    that stops at the first match keeps the narrowed list - confirm against
    callers.

    :param method: generator method of an object exposing a ``languages`` list.
    :return: the wrapping generator function (metadata preserved via ``wraps``).
    """

    @wraps(method)
    def wrapped(self, *args, **kwargs):
        snapshot = self.languages[:]
        for language in method(self, *args, **kwargs):
            yield language
        # The loop has no ``break``, so this point is reached exactly when the
        # wrapped generator is exhausted. Slice assignment keeps the identity
        # of whichever list ``self.languages`` is bound to at that moment.
        self.languages[:] = snapshot

    return wrapped
14
+
15
+
16
class BaseLanguageDetector:
    """Yield, in order, the languages from a fixed list that apply to a date string."""

    def __init__(self, languages):
        # Work on a copy so that popping entries never touches the caller's sequence.
        self.languages = languages[:]

    @_restore_languages_on_generator_exit
    def iterate_applicable_languages(self, date_string, settings=None, modify=False):
        """Yield every language in ``self.languages`` applicable to *date_string*.

        :param date_string: the text to test each language against.
        :param settings: forwarded unchanged to ``is_applicable``.
        :param modify: when true, ``self.languages`` itself is consumed while
            iterating; otherwise a throwaway copy is consumed.
        """
        if modify:
            candidates = self.languages
        else:
            candidates = self.languages[:]
        yield from self._filter_languages(date_string, candidates, settings)

    @staticmethod
    def _filter_languages(date_string, languages, settings=None):
        """Consume *languages* front to back, yielding the applicable ones.

        A language is tried first with ``strip_timezone=False`` and, only if
        that fails, with ``strip_timezone=True``. The head of the list is
        removed after each language has been handled, so *languages* is
        mutated in place.
        """
        while languages:
            head = languages[0]
            applicable = head.is_applicable(
                date_string, strip_timezone=False, settings=settings
            ) or head.is_applicable(
                date_string, strip_timezone=True, settings=settings
            )
            if applicable:
                yield head

            languages.pop(0)
39
+
40
+
41
class AutoDetectLanguage(BaseLanguageDetector):
    """Language detector that can fall back to the rest of its language pool."""

    def __init__(self, languages, allow_redetection=False):
        super().__init__(languages=languages[:])
        # Full set of languages given at construction; used for the second pass.
        self.language_pool = languages[:]
        self.allow_redetection = allow_redetection

    @_restore_languages_on_generator_exit
    def iterate_applicable_languages(self, date_string, modify=False, settings=None):
        """Yield applicable languages, optionally retrying with untried pool members.

        First pass: the current ``self.languages`` (or a copy of it when
        *modify* is false). Second pass, only when ``allow_redetection`` is
        set: every language of ``language_pool`` that was not part of the
        first pass. With *modify* true, ``self.languages`` is rebound to that
        second list before it is consumed.
        """
        candidates = self.languages if modify else self.languages[:]
        first_pass = candidates[:]
        yield from self._filter_languages(date_string, candidates, settings=settings)

        if self.allow_redetection:
            # Try languages that were not tried before with this date_string.
            untried = [
                language
                for language in self.language_pool
                if language not in first_pass
            ]
            if modify:
                self.languages = untried

            yield from self._filter_languages(date_string, untried, settings=settings)
66
+
67
+
68
class ExactLanguages(BaseLanguageDetector):
    """Language detector restricted to an explicitly supplied list of languages."""

    def __init__(self, languages):
        if languages is None:
            raise ValueError("language cannot be None for ExactLanguages")
        super().__init__(languages=languages)

    @_restore_languages_on_generator_exit
    def iterate_applicable_languages(self, date_string, modify=False, settings=None):
        """Yield the applicable languages from the configured list.

        The *modify* argument is accepted but not forwarded: the parent
        implementation is always invoked with ``modify=False``.
        """
        parent_languages = super().iterate_applicable_languages(
            date_string, modify=False, settings=settings
        )
        yield from parent_languages
@@ -0,0 +1,297 @@
1
+ from collections.abc import Set
2
+
3
+ import regex as re
4
+
5
+ from dateparser.conf import Settings, apply_settings, check_settings
6
+ from dateparser.custom_language_detection.language_mapping import map_languages
7
+ from dateparser.date import DateDataParser
8
+ from dateparser.languages.loader import LocaleDataLoader
9
+ from dateparser.search.text_detection import FullTextLanguageDetector
10
+
11
# Words that mark a translated date expression as relative to another moment.
RELATIVE_REG = re.compile("(ago|in|from now|tomorrow|today|yesterday)")


def date_is_relative(translation):
    """Return True when *translation* contains any of the relative-date markers.

    The pattern is unanchored, so a marker is matched anywhere in the string,
    including inside a longer word.
    """
    match = RELATIVE_REG.search(translation)
    return match is not None
16
+
17
+
18
class _ExactLanguageSearch:
    """Find and parse date expressions in a text for one explicitly chosen locale.

    The locale object is obtained from ``loader`` and kept on the instance
    until a different shortname is requested.
    """

    def __init__(self, loader):
        self.loader = loader
        # Most recently loaded locale; None until the first lookup.
        self.language = None

    def get_current_language(self, shortname):
        """Ensure ``self.language`` is the locale for *shortname* (loads only on change)."""
        needs_reload = self.language is None or self.language.shortname != shortname
        if needs_reload:
            self.language = self.loader.get_locale(shortname)

    def search(self, shortname, text, settings):
        """Return whatever the locale's ``translate_search`` yields for *text*."""
        self.get_current_language(shortname)
        return self.language.translate_search(text, settings=settings)

    @staticmethod
    def set_relative_base(substring, already_parsed):
        """Pick a relative base from the entries parsed so far.

        *already_parsed* holds ``(date_data, is_relative)`` pairs. The list is
        scanned from the end and the ``date_obj`` of the first entry whose
        ``is_relative`` flag is falsy is used.

        :return: ``(substring, relative_base)``; the base is None when the
            list is empty or every entry is relative.
        """
        for entry in reversed(already_parsed):
            if not entry[1]:
                return substring, entry[0]["date_obj"]
        return substring, None

    def choose_best_split(self, possible_parsed_splits, possible_substrings_splits):
        """Select the candidate split with the lowest rating.

        Candidates are ordered by, in turn: share of pieces whose ``date_obj``
        is None, number of substrings, share of substrings without any digit.
        On a tie the earliest candidate wins.

        :return: ``(parsed_split, substrings_split)`` of the winning candidate.
        """
        ratings = []
        for index, parsed_split in enumerate(possible_parsed_splits):
            substrings_split = possible_substrings_splits[index]
            total = len(substrings_split)
            unparsed = 0
            digitless = 0
            for position, entry in enumerate(parsed_split):
                if entry[0]["date_obj"] is None:
                    unparsed += 1
                if not any(char.isdigit() for char in substrings_split[position]):
                    digitless += 1
            # Ratios are only computed for non-zero counts, which also avoids
            # dividing by zero for an empty split.
            unparsed_ratio = float(unparsed) / float(total) if unparsed != 0 else 0
            digitless_ratio = float(digitless) / float(total) if digitless != 0 else 0
            ratings.append((unparsed_ratio, total, digitless_ratio))

        best_index = min(range(len(ratings)), key=ratings.__getitem__)
        return (
            possible_parsed_splits[best_index],
            possible_substrings_splits[best_index],
        )

    def split_by(self, item, original, splitter):
        """Build candidate splits of *item* and *original* on *splitter*.

        The full split is always the first candidate. When *splitter* occurs
        more than twice in *item*, two more candidates are added in which the
        pieces are re-joined in consecutive groups of two and of three.

        :return: list of ``[item_pieces, original_pieces]`` pairs.
        """
        item_pieces = item.split(splitter)
        original_pieces = original.split(splitter)
        candidates = [[item_pieces, original_pieces]]
        if item.count(splitter) <= 2:
            return candidates

        for group_size in (2, 3):
            starts = range(0, len(item_pieces), group_size)
            grouped_item = [
                splitter.join(item_pieces[start : start + group_size])
                for start in starts
            ]
            grouped_original = [
                splitter.join(original_pieces[start : start + group_size])
                for start in starts
            ]
            candidates.append([grouped_item, grouped_original])
        return candidates

    def split_if_not_parsed(self, item, original):
        """Collect candidate splits for every splitter usable on both strings.

        A splitter is used only if it occurs in *item* and occurs equally
        often in *item* and *original*.
        """
        possible_splits = []
        for splitter in (",", "،", "——", "—", "–", ".", " "):
            if splitter not in item:
                continue
            if item.count(splitter) != original.count(splitter):
                continue
            possible_splits.extend(self.split_by(item, original, splitter))
        return possible_splits

    def parse_item(self, parser, item, translated_item, parsed, need_relative_base):
        """Parse one candidate substring.

        The substrings "ngày" and "am" are removed from *item* before parsing
        (NOTE(review): presumably tokens that confuse the parser - confirm).
        When *need_relative_base* is set and an earlier non-relative entry of
        *parsed* provides a base date, that date is stored on the parser's
        settings as ``RELATIVE_BASE`` and the item is parsed a second time.

        :return: ``(date_data, is_relative)`` where ``is_relative`` comes from
            ``date_is_relative(translated_item)``.
        """
        item = item.replace("ngày", "").replace("am", "")
        parsed_item = parser.get_date_data(item)
        is_relative = date_is_relative(translated_item)

        relative_base = None
        if need_relative_base:
            item, relative_base = self.set_relative_base(item, parsed)

        if relative_base:
            parser._settings.RELATIVE_BASE = relative_base
            parsed_item = parser.get_date_data(item)
        return parsed_item, is_relative

    def _parse_split(self, parser, split_translated, split_original, need_relative_base):
        """Parse each piece of one candidate split; pieces of length <= 2 are skipped."""
        pieces_parsed = []
        pieces_substrings = []
        if split_translated:
            for position, piece in enumerate(split_translated):
                if len(piece) <= 2:
                    continue
                parsed_piece, piece_is_relative = self.parse_item(
                    parser,
                    piece,
                    piece,
                    pieces_parsed,
                    need_relative_base,
                )
                pieces_parsed.append((parsed_piece, piece_is_relative))
                pieces_substrings.append(split_original[position].strip(" .,:()[]-"))
        return pieces_parsed, pieces_substrings

    def parse_found_objects(self, parser, to_parse, original, translated, settings):
        """Parse every found substring, splitting those that fail as a whole.

        Items of length <= 2 are ignored. An item that parses is recorded
        together with its stripped original text. Otherwise candidate splits
        are generated, each is parsed piece by piece, the best-rated split is
        chosen and its successfully parsed pieces are recorded.

        :return: ``(parsed, substrings)``: parallel lists of
            ``(date_data, is_relative)`` pairs and original substrings.
        """
        parsed = []
        substrings = []
        # A relative base is derived from earlier results only when the
        # settings do not already provide one.
        need_relative_base = not settings.RELATIVE_BASE

        for index, item in enumerate(to_parse):
            if len(item) <= 2:
                continue

            parsed_item, is_relative = self.parse_item(
                parser, item, translated[index], parsed, need_relative_base
            )
            if parsed_item["date_obj"]:
                parsed.append((parsed_item, is_relative))
                substrings.append(original[index].strip(" .,:()[]-'"))
                continue

            possible_splits = self.split_if_not_parsed(item, original[index])
            if not possible_splits:
                continue

            possible_parsed = []
            possible_substrings = []
            for split_translated, split_original in possible_splits:
                pieces_parsed, pieces_substrings = self._parse_split(
                    parser, split_translated, split_original, need_relative_base
                )
                possible_parsed.append(pieces_parsed)
                possible_substrings.append(pieces_substrings)

            parsed_best, substrings_best = self.choose_best_split(
                possible_parsed, possible_substrings
            )
            for entry, substring in zip(parsed_best, substrings_best):
                if entry[0]["date_obj"]:
                    parsed.append(entry)
                    substrings.append(substring)
        return parsed, substrings

    def search_parse(self, shortname, text, settings):
        """Search *text* for date expressions and return ``(substring, date_obj)`` pairs.

        For every locale except "vi" and "hu" the translated substrings are
        parsed with an English parser; for those two the original substrings
        are parsed with the locale itself.
        """
        translated, original = self.search(shortname, text, settings)
        # splitting done by spaces and some dictionary items contain spaces
        bad_translate_with_search = ["vi", "hu"]
        if shortname in bad_translate_with_search:
            languages = [shortname]
            to_parse = original
        else:
            languages = ["en"]
            to_parse = translated

        parser = DateDataParser(languages=languages, settings=settings)
        parsed, substrings = self.parse_found_objects(
            parser=parser,
            to_parse=to_parse,
            original=original,
            translated=translated,
            settings=settings,
        )
        parser._settings = Settings()
        date_objects = [entry[0]["date_obj"] for entry in parsed]
        return list(zip(substrings, date_objects))
190
+
191
+
192
class DateSearchWithDetection:
    """
    Class which executes language detection of string in a natural language, translation of a given string,
    search of substrings which represent date and/or time and parsing of these substrings.

    """

    def __init__(self):
        self.loader = LocaleDataLoader()
        self.available_language_map = self.loader.get_locale_map()
        self.search = _ExactLanguageSearch(self.loader)

    @apply_settings
    def detect_language(
        self, text, languages, settings=None, detect_languages_function=None
    ):
        """Determine which language should be used to search *text*.

        When *detect_languages_function* is given and *languages* is empty, the
        function's result is passed through ``map_languages`` and the first
        entry is returned, falling back to ``settings.DEFAULT_LANGUAGES``.
        Otherwise *languages* is validated against the available locales and a
        ``FullTextLanguageDetector`` (stored as ``self.language_detector``)
        picks the language, again falling back to the first default language.

        :raises ValueError: if *languages* contains a code with no available locale.
        :raises TypeError: if *languages* is neither None nor a list, tuple or set.
        :return: the detected language, or None when nothing could be determined.
        """
        if detect_languages_function and not languages:
            detected_languages = detect_languages_function(
                text,
                confidence_threshold=settings.LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD,
            )
            detected_languages = (
                map_languages(detected_languages) or settings.DEFAULT_LANGUAGES
            )
            if detected_languages:
                return detected_languages[0]
            return None

        if isinstance(languages, (list, tuple, Set)):
            known = self.available_language_map
            if not all(language in known for language in languages):
                unsupported_languages = set(languages) - set(known.keys())
                raise ValueError(
                    "Unknown language(s): %s"
                    % ", ".join(map(repr, unsupported_languages))
                )
            # Swap the language codes for their locale objects.
            languages = [known[language] for language in languages]
        elif languages is not None:
            raise TypeError(
                "languages argument must be a list (%r given)" % type(languages)
            )

        if languages:
            detector_languages = languages
        else:
            # No restriction given: consider every available locale.
            detector_languages = list(self.available_language_map.values())
        self.language_detector = FullTextLanguageDetector(detector_languages)

        detected_language = self.language_detector._best_language(text)
        if detected_language:
            return detected_language
        if settings.DEFAULT_LANGUAGES:
            return settings.DEFAULT_LANGUAGES[0]
        return None

    @apply_settings
    def search_dates(
        self, text, languages=None, settings=None, detect_languages_function=None
    ):
        """
        Find all substrings of the given string which represent date and/or time and parse them.

        :param text:
            A string in a natural language which may contain date and/or time expressions.
        :type text: str

        :param languages:
            A list of two-letter language codes, e.g. ['en', 'es']. If languages are given, it will not attempt
            to detect the language.
        :type languages: list

        :param settings:
            Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
        :type settings: dict

        :param detect_languages_function:
            A function for language detection that takes as input a `text` and a `confidence_threshold`,
            returns a list of detected language codes.
        :type detect_languages_function: function

        :return: a dict mapping keys to two letter language code and a list of tuples of pairs:
                substring representing date expressions and corresponding :mod:`datetime.datetime` object.
            For example:
            {'Language': 'en', 'Dates': [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0))]}
            If language of the string isn't recognised returns:
            {'Language': None, 'Dates': None}
        :raises: ValueError - Unknown Language
        """

        check_settings(settings)

        language_shortname = self.detect_language(
            text=text,
            languages=languages,
            settings=settings,
            detect_languages_function=detect_languages_function,
        )
        if not language_shortname:
            return {"Language": None, "Dates": None}

        found_dates = self.search.search_parse(
            language_shortname, text, settings=settings
        )
        return {"Language": language_shortname, "Dates": found_dates}
@@ -0,0 +1,89 @@
1
+ from dateparser.conf import apply_settings
2
+ from dateparser.search.detection import BaseLanguageDetector
3
+ from dateparser.utils import normalize_unicode
4
+
5
+
6
class FullTextLanguageDetector(BaseLanguageDetector):
    """
    Choose the most applicable language for a piece of text.

    Candidates are language objects exposing ``shortname``,
    ``get_wordchars_for_detection(settings=...)`` and
    ``count_applicability(text, strip_timezone=..., settings=...)``.
    The candidate list is narrowed first by the characters present in the
    text and then, if several remain, by how many words each one recognises.
    """

    # Characters that carry no language information (digits, separators,
    # punctuation commonly found in numeric dates).
    _SYMBOLS = frozenset("0123456789 /-)(.:\\,'")

    def __init__(self, languages):
        # NOTE(review): super(BaseLanguageDetector, self) starts the MRO
        # lookup *after* BaseLanguageDetector, so BaseLanguageDetector's own
        # __init__ is not invoked here - confirm this is intentional.
        super(BaseLanguageDetector, self).__init__()
        # Work on a copy: detection narrows this list in place.
        self.languages = languages[:]
        self.language_unique_chars = []
        self.language_chars = []

    def get_unique_characters(self, settings):
        """
        Compute, for every candidate language, its word characters and the
        subset of those characters that no other candidate shares.

        Results are stored in ``self.language_chars`` and
        ``self.language_unique_chars``, index-aligned with ``self.languages``.
        Both lists are rebuilt from scratch on every call so that repeated
        calls never leave stale entries misaligned with ``self.languages``.

        :param settings: settings object; ``NORMALIZE`` is forced to False
            for the character lookup.
        """
        settings = settings.replace(NORMALIZE=False)

        self.language_chars = [
            language.get_wordchars_for_detection(settings=settings)
            for language in self.languages
        ]

        self.language_unique_chars = []
        for char_set in self.language_chars:
            unique_chars = char_set
            for other_char_set in self.language_chars:
                # Compared by value: a language's own set (and any identical
                # set from another language) is not subtracted.
                if other_char_set != char_set:
                    unique_chars = unique_chars - other_char_set
            self.language_unique_chars.append(unique_chars)

    def character_check(self, date_string, settings):
        """
        Narrow ``self.languages`` using only the characters of ``date_string``.

        * Text made solely of digits/punctuation keeps just the first
          candidate (or stays empty when there are no candidates).
        * If the text contains a character unique to one candidate, that
          candidate becomes the only one.
        * Otherwise candidates sharing no character with the text are dropped.

        :param date_string: text to inspect.
        :param settings: settings object forwarded to
            :meth:`get_unique_characters`.
        """
        lowered = date_string.lower()
        date_string_set = set(lowered)

        if date_string_set <= self._SYMBOLS:
            # Slicing (rather than indexing) keeps an empty candidate list
            # empty instead of raising IndexError.
            self.languages = self.languages[:1]
            return

        self.get_unique_characters(settings=settings)

        for language, unique_chars in zip(
            self.languages, self.language_unique_chars
        ):
            if any(char.lower() in lowered for char in unique_chars):
                self.languages = [language]
                return

        self.languages = [
            language
            for language, chars in zip(self.languages, self.language_chars)
            if date_string_set & chars
        ]

    @apply_settings
    def _best_language(self, date_string, settings=None):
        """
        Return the ``shortname`` of the best matching candidate language,
        or ``None`` when no candidate recognises anything in the text.

        :param date_string: text whose language should be determined.
        :param settings: settings object (injected by ``apply_settings``).
        """
        self.character_check(date_string, settings)
        date_string = normalize_unicode(date_string.lower())
        if len(self.languages) == 1:
            return self.languages[0].shortname

        applicable_languages = []
        for language in self.languages:
            # Try with timezones left in place first; only if nothing is
            # recognised, retry with timezones stripped.
            for strip_timezone in (False, True):
                num_words = language.count_applicability(
                    date_string, strip_timezone=strip_timezone, settings=settings
                )
                if num_words[0] > 0 or num_words[1] > 0:
                    applicable_languages.append((language.shortname, num_words))
                    break

        if not applicable_languages:
            return None
        # Rank by the first count, then the second; ties keep the earliest
        # candidate.
        return max(applicable_languages, key=lambda p: (p[1][0], p[1][1]))[0]
@@ -0,0 +1,91 @@
1
+ from datetime import datetime, timedelta, timezone, tzinfo
2
+
3
+ import regex as re
4
+
5
+ from .timezones import timezone_info_list
6
+
7
+
8
class StaticTzInfo(tzinfo):
    """A ``tzinfo`` with a fixed name and a fixed UTC offset (never any DST)."""

    def __init__(self, name, offset):
        self.__name = name
        self.__offset = offset

    def __repr__(self):
        return "<{0} '{1}'>".format(type(self).__name__, self.__name)

    def __getinitargs__(self):
        # Constructor arguments, in ``__init__`` order.
        return self.__name, self.__offset

    def tzname(self, dt):
        """Return the fixed zone name, regardless of ``dt``."""
        return self.__name

    def utcoffset(self, dt):
        """Return the fixed offset, regardless of ``dt``."""
        return self.__offset

    def dst(self, dt):
        """Return a zero timedelta: this zone has no daylight saving."""
        return timedelta(0)

    def localize(self, dt, is_dst=False):
        """
        Attach this timezone to a naive datetime.

        :param dt: naive ``datetime`` to localize.
        :param is_dst: accepted but not used by this implementation.
        :return: ``dt`` with ``tzinfo`` set to this object.
        :raises ValueError: if ``dt`` already carries a ``tzinfo``.
        """
        if dt.tzinfo is None:
            return dt.replace(tzinfo=self)
        raise ValueError("Not naive datetime (tzinfo is already set)")
32
+
33
+
34
def pop_tz_offset_from_string(date_string, as_offset=True):
    """
    Remove the first recognised timezone from ``date_string``.

    :param date_string: text that may contain a timezone.
    :param as_offset: when true, the timezone is returned as a
        ``StaticTzInfo``; otherwise its name is returned.
    :return: ``(remaining_string, timezone)``; the timezone is ``None`` and
        the string is unchanged when nothing matches.
    """
    # Cheap pre-check with the combined pattern before trying each timezone.
    if not _search_regex_ignorecase.search(date_string):
        return date_string, None

    for name, info in _tz_offsets:
        timezone_match = info["regex"].search(date_string)
        if not timezone_match:
            continue

        start, stop = timezone_match.span()
        # The first character of the match is kept (``start + 1``);
        # everything else up to ``stop`` is cut out.
        remainder = date_string[: start + 1] + date_string[stop:]
        found_tz = StaticTzInfo(name, info["offset"]) if as_offset else name
        return remainder, found_tz

    return date_string, None
47
+
48
+
49
def word_is_tz(word):
    """Return True when the combined timezone pattern matches at the start of ``word``."""
    return _search_regex.match(word) is not None
51
+
52
+
53
def convert_to_local_tz(datetime_obj, datetime_tz_offset):
    """
    Shift ``datetime_obj`` from its own offset to the local offset.

    :param datetime_obj: datetime expressed in the source timezone.
    :param datetime_tz_offset: ``timedelta`` offset of that source timezone.
    :return: ``datetime_obj`` minus ``datetime_tz_offset`` plus the
        module-level ``local_tz_offset``.
    """
    without_source_offset = datetime_obj - datetime_tz_offset
    return without_source_offset + local_tz_offset
55
+
56
+
57
def build_tz_offsets(search_regex_parts):
    """
    Generate ``(name, info)`` pairs for every entry of ``timezone_info_list``.

    ``info`` is a dict with a compiled, case-insensitive ``"regex"`` and an
    ``"offset"`` timedelta. As a side effect, every timezone name pattern
    that is yielded is also appended to ``search_regex_parts``, so that list
    is only complete once this generator has been fully consumed.

    :param search_regex_parts: list that receives the name patterns.
    """

    def get_offset(tz_obj, regex, repl="", replw=""):
        # tz_obj is indexed as (name pattern, offset in seconds). The name is
        # substituted into the ``regex`` template, then ``repl`` -> ``replw``
        # is applied to the resulting pattern before compiling.
        return (
            tz_obj[0],
            {
                "regex": re.compile(
                    re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE
                ),
                "offset": timedelta(seconds=tz_obj[1]),
            },
        )

    for tz_info in timezone_info_list:
        for regex in tz_info["regex_patterns"]:
            for tz_obj in tz_info["timezones"]:
                search_regex_parts.append(tz_obj[0])
                yield get_offset(tz_obj, regex)

                # alternate patterns
                # NOTE(review): the nesting level of this loop was
                # reconstructed; it is assumed to run once per tz_obj -
                # confirm against the packaged file.
                for replace, replacewith in tz_info.get("replace", []):
                    search_regex_parts.append(re.sub(replace, replacewith, tz_obj[0]))
                    yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)
79
+
80
+
81
def get_local_tz_offset():
    """
    Return the difference between local wall-clock time and UTC.

    The two clock readings are taken one after the other, so the seconds
    component of their difference is rounded to the nearest ten and the
    microseconds are discarded.

    :return: ``timedelta`` offset of local time relative to UTC.
    """
    local_now = datetime.now()
    utc_now_naive = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    raw_delta = local_now - utc_now_naive
    return timedelta(days=raw_delta.days, seconds=round(raw_delta.seconds, -1))
85
+
86
+
87
# Module-level lookup tables, built once at import time.

# Filled as a side effect of consuming build_tz_offsets() on the next line.
_search_regex_parts = []
# list() drains the generator, which also populates _search_regex_parts.
_tz_offsets = list(build_tz_offsets(_search_regex_parts))
# These joins must stay after the line above: before it the parts list is empty.
_search_regex = re.compile("|".join(_search_regex_parts))
_search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)
# Local-vs-UTC offset captured once at import; read by convert_to_local_tz().
local_tz_offset = get_local_tz_offset()