dateparser 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dateparser/__init__.py +82 -0
- dateparser/calendars/__init__.py +144 -0
- dateparser/calendars/hijri.py +6 -0
- dateparser/calendars/hijri_parser.py +60 -0
- dateparser/calendars/jalali.py +9 -0
- dateparser/calendars/jalali_parser.py +184 -0
- dateparser/conf.py +267 -0
- dateparser/custom_language_detection/__init__.py +0 -0
- dateparser/custom_language_detection/fasttext.py +43 -0
- dateparser/custom_language_detection/langdetect.py +37 -0
- dateparser/custom_language_detection/language_mapping.py +18 -0
- dateparser/data/__init__.py +2 -0
- dateparser/data/date_translation_data/__init__.py +0 -0
- dateparser/data/date_translation_data/af.py +242 -0
- dateparser/data/date_translation_data/agq.py +169 -0
- dateparser/data/date_translation_data/ak.py +169 -0
- dateparser/data/date_translation_data/am.py +222 -0
- dateparser/data/date_translation_data/ar.py +574 -0
- dateparser/data/date_translation_data/as.py +164 -0
- dateparser/data/date_translation_data/asa.py +168 -0
- dateparser/data/date_translation_data/ast.py +280 -0
- dateparser/data/date_translation_data/az-Cyrl.py +168 -0
- dateparser/data/date_translation_data/az-Latn.py +217 -0
- dateparser/data/date_translation_data/az.py +217 -0
- dateparser/data/date_translation_data/bas.py +169 -0
- dateparser/data/date_translation_data/be.py +340 -0
- dateparser/data/date_translation_data/bem.py +161 -0
- dateparser/data/date_translation_data/bez.py +169 -0
- dateparser/data/date_translation_data/bg.py +345 -0
- dateparser/data/date_translation_data/bm.py +167 -0
- dateparser/data/date_translation_data/bn.py +241 -0
- dateparser/data/date_translation_data/bo.py +185 -0
- dateparser/data/date_translation_data/br.py +226 -0
- dateparser/data/date_translation_data/brx.py +157 -0
- dateparser/data/date_translation_data/bs-Cyrl.py +226 -0
- dateparser/data/date_translation_data/bs-Latn.py +248 -0
- dateparser/data/date_translation_data/bs.py +248 -0
- dateparser/data/date_translation_data/ca.py +313 -0
- dateparser/data/date_translation_data/ce.py +225 -0
- dateparser/data/date_translation_data/cgg.py +169 -0
- dateparser/data/date_translation_data/chr.py +240 -0
- dateparser/data/date_translation_data/ckb.py +154 -0
- dateparser/data/date_translation_data/cs.py +316 -0
- dateparser/data/date_translation_data/cy.py +217 -0
- dateparser/data/date_translation_data/da.py +296 -0
- dateparser/data/date_translation_data/dav.py +169 -0
- dateparser/data/date_translation_data/de.py +357 -0
- dateparser/data/date_translation_data/dje.py +167 -0
- dateparser/data/date_translation_data/dsb.py +270 -0
- dateparser/data/date_translation_data/dua.py +169 -0
- dateparser/data/date_translation_data/dyo.py +168 -0
- dateparser/data/date_translation_data/dz.py +225 -0
- dateparser/data/date_translation_data/ebu.py +169 -0
- dateparser/data/date_translation_data/ee.py +233 -0
- dateparser/data/date_translation_data/el.py +279 -0
- dateparser/data/date_translation_data/en.py +851 -0
- dateparser/data/date_translation_data/eo.py +169 -0
- dateparser/data/date_translation_data/es.py +499 -0
- dateparser/data/date_translation_data/et.py +233 -0
- dateparser/data/date_translation_data/eu.py +219 -0
- dateparser/data/date_translation_data/ewo.py +169 -0
- dateparser/data/date_translation_data/fa.py +270 -0
- dateparser/data/date_translation_data/ff.py +179 -0
- dateparser/data/date_translation_data/fi.py +345 -0
- dateparser/data/date_translation_data/fil.py +223 -0
- dateparser/data/date_translation_data/fo.py +256 -0
- dateparser/data/date_translation_data/fr.py +520 -0
- dateparser/data/date_translation_data/fur.py +223 -0
- dateparser/data/date_translation_data/fy.py +223 -0
- dateparser/data/date_translation_data/ga.py +238 -0
- dateparser/data/date_translation_data/gd.py +277 -0
- dateparser/data/date_translation_data/gl.py +253 -0
- dateparser/data/date_translation_data/gsw.py +179 -0
- dateparser/data/date_translation_data/gu.py +216 -0
- dateparser/data/date_translation_data/guz.py +170 -0
- dateparser/data/date_translation_data/gv.py +166 -0
- dateparser/data/date_translation_data/ha.py +176 -0
- dateparser/data/date_translation_data/haw.py +168 -0
- dateparser/data/date_translation_data/he.py +371 -0
- dateparser/data/date_translation_data/hi.py +261 -0
- dateparser/data/date_translation_data/hr.py +378 -0
- dateparser/data/date_translation_data/hsb.py +271 -0
- dateparser/data/date_translation_data/hu.py +297 -0
- dateparser/data/date_translation_data/hy.py +246 -0
- dateparser/data/date_translation_data/id.py +272 -0
- dateparser/data/date_translation_data/ig.py +168 -0
- dateparser/data/date_translation_data/ii.py +157 -0
- dateparser/data/date_translation_data/is.py +242 -0
- dateparser/data/date_translation_data/it.py +282 -0
- dateparser/data/date_translation_data/ja.py +286 -0
- dateparser/data/date_translation_data/jgo.py +188 -0
- dateparser/data/date_translation_data/jmc.py +168 -0
- dateparser/data/date_translation_data/ka.py +241 -0
- dateparser/data/date_translation_data/kab.py +169 -0
- dateparser/data/date_translation_data/kam.py +169 -0
- dateparser/data/date_translation_data/kde.py +169 -0
- dateparser/data/date_translation_data/kea.py +230 -0
- dateparser/data/date_translation_data/khq.py +167 -0
- dateparser/data/date_translation_data/ki.py +169 -0
- dateparser/data/date_translation_data/kk.py +228 -0
- dateparser/data/date_translation_data/kl.py +213 -0
- dateparser/data/date_translation_data/kln.py +171 -0
- dateparser/data/date_translation_data/km.py +198 -0
- dateparser/data/date_translation_data/kn.py +225 -0
- dateparser/data/date_translation_data/ko.py +207 -0
- dateparser/data/date_translation_data/kok.py +157 -0
- dateparser/data/date_translation_data/ks.py +152 -0
- dateparser/data/date_translation_data/ksb.py +168 -0
- dateparser/data/date_translation_data/ksf.py +169 -0
- dateparser/data/date_translation_data/ksh.py +192 -0
- dateparser/data/date_translation_data/kw.py +169 -0
- dateparser/data/date_translation_data/ky.py +240 -0
- dateparser/data/date_translation_data/lag.py +169 -0
- dateparser/data/date_translation_data/lb.py +233 -0
- dateparser/data/date_translation_data/lg.py +169 -0
- dateparser/data/date_translation_data/lkt.py +194 -0
- dateparser/data/date_translation_data/ln.py +179 -0
- dateparser/data/date_translation_data/lo.py +228 -0
- dateparser/data/date_translation_data/lrc.py +154 -0
- dateparser/data/date_translation_data/lt.py +263 -0
- dateparser/data/date_translation_data/lu.py +169 -0
- dateparser/data/date_translation_data/luo.py +169 -0
- dateparser/data/date_translation_data/luy.py +168 -0
- dateparser/data/date_translation_data/lv.py +257 -0
- dateparser/data/date_translation_data/mas.py +173 -0
- dateparser/data/date_translation_data/mer.py +168 -0
- dateparser/data/date_translation_data/mfe.py +166 -0
- dateparser/data/date_translation_data/mg.py +168 -0
- dateparser/data/date_translation_data/mgh.py +169 -0
- dateparser/data/date_translation_data/mgo.py +151 -0
- dateparser/data/date_translation_data/mk.py +234 -0
- dateparser/data/date_translation_data/ml.py +217 -0
- dateparser/data/date_translation_data/mn.py +224 -0
- dateparser/data/date_translation_data/mr.py +229 -0
- dateparser/data/date_translation_data/ms.py +242 -0
- dateparser/data/date_translation_data/mt.py +175 -0
- dateparser/data/date_translation_data/mua.py +169 -0
- dateparser/data/date_translation_data/my.py +203 -0
- dateparser/data/date_translation_data/mzn.py +199 -0
- dateparser/data/date_translation_data/naq.py +169 -0
- dateparser/data/date_translation_data/nb.py +261 -0
- dateparser/data/date_translation_data/nd.py +169 -0
- dateparser/data/date_translation_data/ne.py +207 -0
- dateparser/data/date_translation_data/nl.py +273 -0
- dateparser/data/date_translation_data/nmg.py +169 -0
- dateparser/data/date_translation_data/nn.py +231 -0
- dateparser/data/date_translation_data/nnh.py +150 -0
- dateparser/data/date_translation_data/nus.py +166 -0
- dateparser/data/date_translation_data/nyn.py +169 -0
- dateparser/data/date_translation_data/om.py +173 -0
- dateparser/data/date_translation_data/or.py +157 -0
- dateparser/data/date_translation_data/os.py +203 -0
- dateparser/data/date_translation_data/pa-Arab.py +150 -0
- dateparser/data/date_translation_data/pa-Guru.py +221 -0
- dateparser/data/date_translation_data/pa.py +221 -0
- dateparser/data/date_translation_data/pl.py +416 -0
- dateparser/data/date_translation_data/ps.py +150 -0
- dateparser/data/date_translation_data/pt.py +981 -0
- dateparser/data/date_translation_data/qu.py +176 -0
- dateparser/data/date_translation_data/rm.py +166 -0
- dateparser/data/date_translation_data/rn.py +169 -0
- dateparser/data/date_translation_data/ro.py +270 -0
- dateparser/data/date_translation_data/rof.py +157 -0
- dateparser/data/date_translation_data/ru.py +442 -0
- dateparser/data/date_translation_data/rw.py +169 -0
- dateparser/data/date_translation_data/rwk.py +168 -0
- dateparser/data/date_translation_data/sah.py +219 -0
- dateparser/data/date_translation_data/saq.py +169 -0
- dateparser/data/date_translation_data/sbp.py +169 -0
- dateparser/data/date_translation_data/se.py +280 -0
- dateparser/data/date_translation_data/seh.py +169 -0
- dateparser/data/date_translation_data/ses.py +167 -0
- dateparser/data/date_translation_data/sg.py +169 -0
- dateparser/data/date_translation_data/shi-Latn.py +169 -0
- dateparser/data/date_translation_data/shi-Tfng.py +169 -0
- dateparser/data/date_translation_data/shi.py +169 -0
- dateparser/data/date_translation_data/si.py +220 -0
- dateparser/data/date_translation_data/sk.py +327 -0
- dateparser/data/date_translation_data/sl.py +244 -0
- dateparser/data/date_translation_data/smn.py +176 -0
- dateparser/data/date_translation_data/sn.py +169 -0
- dateparser/data/date_translation_data/so.py +179 -0
- dateparser/data/date_translation_data/sq.py +237 -0
- dateparser/data/date_translation_data/sr-Cyrl.py +306 -0
- dateparser/data/date_translation_data/sr-Latn.py +306 -0
- dateparser/data/date_translation_data/sr.py +255 -0
- dateparser/data/date_translation_data/sv.py +309 -0
- dateparser/data/date_translation_data/sw.py +231 -0
- dateparser/data/date_translation_data/ta.py +264 -0
- dateparser/data/date_translation_data/te.py +239 -0
- dateparser/data/date_translation_data/teo.py +173 -0
- dateparser/data/date_translation_data/th.py +300 -0
- dateparser/data/date_translation_data/ti.py +173 -0
- dateparser/data/date_translation_data/tl.py +137 -0
- dateparser/data/date_translation_data/to.py +216 -0
- dateparser/data/date_translation_data/tr.py +259 -0
- dateparser/data/date_translation_data/twq.py +167 -0
- dateparser/data/date_translation_data/tzm.py +169 -0
- dateparser/data/date_translation_data/ug.py +203 -0
- dateparser/data/date_translation_data/uk.py +502 -0
- dateparser/data/date_translation_data/ur.py +256 -0
- dateparser/data/date_translation_data/uz-Arab.py +167 -0
- dateparser/data/date_translation_data/uz-Cyrl.py +210 -0
- dateparser/data/date_translation_data/uz-Latn.py +216 -0
- dateparser/data/date_translation_data/uz.py +216 -0
- dateparser/data/date_translation_data/vi.py +260 -0
- dateparser/data/date_translation_data/vun.py +168 -0
- dateparser/data/date_translation_data/wae.py +224 -0
- dateparser/data/date_translation_data/xog.py +169 -0
- dateparser/data/date_translation_data/yav.py +169 -0
- dateparser/data/date_translation_data/yi.py +178 -0
- dateparser/data/date_translation_data/yo.py +263 -0
- dateparser/data/date_translation_data/yue.py +203 -0
- dateparser/data/date_translation_data/zgh.py +169 -0
- dateparser/data/date_translation_data/zh-Hans.py +240 -0
- dateparser/data/date_translation_data/zh-Hant.py +402 -0
- dateparser/data/date_translation_data/zh.py +273 -0
- dateparser/data/date_translation_data/zu.py +196 -0
- dateparser/data/languages_info.py +826 -0
- dateparser/date.py +599 -0
- dateparser/date_parser.py +55 -0
- dateparser/freshness_date_parser.py +156 -0
- dateparser/languages/__init__.py +2 -0
- dateparser/languages/dictionary.py +352 -0
- dateparser/languages/loader.py +224 -0
- dateparser/languages/locale.py +625 -0
- dateparser/languages/validation.py +467 -0
- dateparser/parser.py +742 -0
- dateparser/search/__init__.py +71 -0
- dateparser/search/detection.py +78 -0
- dateparser/search/search.py +297 -0
- dateparser/search/text_detection.py +89 -0
- dateparser/timezone_parser.py +91 -0
- dateparser/timezones.py +469 -0
- dateparser/utils/__init__.py +257 -0
- dateparser/utils/strptime.py +108 -0
- dateparser-1.2.1.dist-info/AUTHORS.rst +17 -0
- dateparser-1.2.1.dist-info/LICENSE +12 -0
- dateparser-1.2.1.dist-info/METADATA +864 -0
- dateparser-1.2.1.dist-info/RECORD +256 -0
- dateparser-1.2.1.dist-info/WHEEL +5 -0
- dateparser-1.2.1.dist-info/entry_points.txt +2 -0
- dateparser-1.2.1.dist-info/top_level.txt +4 -0
- dateparser_cli/__init__.py +0 -0
- dateparser_cli/cli.py +36 -0
- dateparser_cli/exceptions.py +2 -0
- dateparser_cli/fasttext_manager.py +42 -0
- dateparser_cli/utils.py +27 -0
- dateparser_data/__init__.py +0 -0
- dateparser_data/settings.py +33 -0
- dateparser_scripts/__init__.py +0 -0
- dateparser_scripts/get_cldr_data.py +567 -0
- dateparser_scripts/order_languages.py +217 -0
- dateparser_scripts/update_supported_languages_and_locales.py +48 -0
- dateparser_scripts/utils.py +73 -0
- dateparser_scripts/write_complete_data.py +129 -0
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from collections import OrderedDict
|
|
4
|
+
|
|
5
|
+
import regex as re
|
|
6
|
+
import requests
|
|
7
|
+
from parsel import Selector
|
|
8
|
+
|
|
9
|
+
from dateparser_scripts.utils import get_raw_data
|
|
10
|
+
|
|
11
|
+
# Switch to this script's own directory so the relative "../..." paths used
# by the functions below resolve regardless of where the script is launched.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
|
|
12
|
+
|
|
13
|
+
# Languages with insufficient translation data are excluded
|
|
14
|
+
# This set is removed from the language/locale mapping built in this module
# and is also imported by write_complete_data.py to filter the CLDR languages.
avoid_languages = {"cu", "kkj", "nds", "prg", "tk", "vai", "vai-Latn", "vai-Vaii", "vo"}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_language_locale_dict():
    """Map every CLDR language name to the list of its regional locales.

    The entries of ``../raw_data/cldr_dates_full/main/`` are split in two
    groups: names ending in ``-<UPPERCASE/DIGITS>`` are treated as locales,
    every other name (except ``root``) is treated as a language.  Languages
    listed in ``avoid_languages`` are left out of the result.

    :returns: dict mapping language name to a list of locale names
    :raises ValueError: if the directory has no ``root`` entry
    """
    locale_names = os.listdir("../raw_data/cldr_dates_full/main/")
    language_names = [
        name for name in locale_names if re.search(r"-[A-Z0-9]+$", name) is None
    ]
    language_names.remove("root")

    language_locale_dict = {
        language: [
            locale
            for locale in locale_names
            if re.match(language + "-[A-Z0-9]+$", locale)
        ]
        for language in language_names
    }

    for excluded in avoid_languages:
        language_locale_dict.pop(excluded, None)
    return language_locale_dict
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _get_language_order(language_locale_dict):
    """Return all available language codes, most common languages first.

    The order is built from three sources, in decreasing priority:

    1. the content-language ranking published by w3techs.com (or a stored
       copy of it when the site cannot be fetched or parsed),
    2. the speaker population per language computed from CLDR's
       ``territoryInfo.json``,
    3. alphabetical order for every language found in neither source.

    Languages that are not ranked but whose parent language is (for example
    a script variant such as ``xx-Latn`` of ``xx``) are placed directly after
    the parent, in ascending alphabetical order.

    :param language_locale_dict: mapping whose keys are the CLDR languages
    :returns: list of language codes as ``str``
    """

    def get_most_common_locales():
        """Return the w3techs ranking, falling back to the stored copy."""
        # Order from https://w3techs.com/technologies/overview/content_language
        # Last updated on 03.10.2022
        old_common_locales = [
            "en",
            "ru",
            "es",
            "de",
            "tr",
            "fr",
            "fa",
            "ja",
            "zh",
            "vi",
            "it",
            "nl",
            "pt",
            "ar",
            "pl",
            "id",
            "ko",
            "uk",
            "th",
            "he",
            "cs",
            "sv",
            "ro",
            "el",
            "da",
            "hu",
            "fi",
            "sr",
            "sk",
            "bg",
            "nb",
            "hr",
            "lt",
            "no",
            "hi",
            "sl",
            "ca",
            "et",
        ]

        # A timeout keeps the script from hanging forever, and a connection
        # problem uses the same fallback as an unsuccessful HTTP status.
        try:
            response = requests.get(
                "https://w3techs.com/technologies/overview/content_language",
                timeout=30,
            )
        except requests.RequestException as e:
            print(e)
            print("The website could not be reached, using the stored order")
            return old_common_locales

        if not response.ok:
            return old_common_locales

        try:
            sel = Selector(text=response.text)
            bars = sel.xpath("//table[@class='bars']//a/@href").getall()
            if not bars:
                raise ValueError("No bars found")
            new_most_common_locales = [
                i.replace("https://w3techs.com/technologies/details/cl", "").strip(
                    "-"
                )
                for i in bars
            ]
            # Sanity check that the page still has the expected layout.
            if new_most_common_locales[0] != "en":
                raise ValueError("English is not the first language")
        except Exception as e:
            print(e)
            print("The website could have changed, please update the code")
            return old_common_locales
        return new_most_common_locales

    territory_info_file = "../raw_data/cldr_core/supplemental/territoryInfo.json"
    with open(territory_info_file, encoding="utf-8") as f:
        territory_content = json.load(f)
    territory_info_data = territory_content["supplemental"]["territoryInfo"]

    # Sum, over all territories, population * "_populationPercent" per language.
    language_population_dict = {}
    for territory_data in territory_info_data.values():
        population = int(territory_data["_population"])
        # Territories without a per-language breakdown contribute nothing.
        lang_dict = territory_data.get("languagePopulation", {})
        for language, language_info in lang_dict.items():
            try:
                percent = float(language_info["_populationPercent"])
            except (KeyError, TypeError, ValueError):
                # Skip entries without a usable percentage instead of aborting.
                continue
            language_population_dict[language] = (
                language_population_dict.get(language, 0) + percent * population
            )

    most_common_locales = get_most_common_locales()
    language_order_with_duplicates = most_common_locales + sorted(
        language_population_dict.keys(),
        key=lambda x: (language_population_dict[x], x),
        reverse=True,
    )
    # Keep the first occurrence of every code, preserving the ranking order.
    language_order = list(dict.fromkeys(language_order_with_duplicates))

    # territoryInfo uses "_" as separator (e.g. "xx_Latn"); the rest of the
    # pipeline uses "-".
    language_order = [code.replace("_", "-") for code in language_order]

    supplementary_date_directory = (
        "../dateparser_data/supplementary_language_data/date_translation_data"
    )
    # [:-5] strips the five-character file extension of each entry.
    supplementary_languages = [x[:-5] for x in os.listdir(supplementary_date_directory)]
    available_languages = set(language_locale_dict).union(supplementary_languages)
    language_order = [
        shortname for shortname in language_order if shortname in available_languages
    ]

    absent_languages = available_languages - set(language_order)
    remaining_languages = []
    # Iterate in a fixed (reverse alphabetical) order: each variant is
    # inserted right after its parent, so siblings end up in ascending order
    # and the result no longer depends on set iteration order.
    for language in sorted(absent_languages, reverse=True):
        parent_language = re.sub(r"-\w+", "", language)
        if parent_language in language_order:
            language_order.insert(language_order.index(parent_language) + 1, language)
        else:
            remaining_languages.append(language)

    language_order = language_order + sorted(remaining_languages)
    return [str(code) for code in language_order]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def generate_language_map(language_order):
    """Group language codes under their base language.

    Codes are processed in sorted order.  A code without ``-`` starts a new
    group containing itself; a code with ``-`` is appended to the group of
    the part before the first ``-``.

    :param language_order: iterable of language codes
    :returns: dict mapping base language to the list of codes in its group
    :raises KeyError: if a hyphenated code has no base language in the input
    """
    language_map = {}
    for code in sorted(language_order):
        if "-" in code:
            base = code.split("-")[0]
            language_map[base].append(code)
            continue
        language_map[code] = [code]
    return language_map
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def main():
    """Regenerate ``../dateparser/data/languages_info.py``.

    Downloads the raw CLDR data, computes the language order and the
    language/locale mapping, and writes three assignments
    (``language_order``, ``language_map``, ``language_locale_dict``) to the
    output module as JSON-formatted literals.
    """
    get_raw_data()
    language_locale_dict = _get_language_locale_dict()
    language_order = _get_language_order(language_locale_dict)

    output_directory = "../dateparser/data/"
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    def dump(value):
        # Shared JSON layout for the three generated literals.
        return json.dumps(value, separators=(",", ": "), indent=4)

    # Every ordered language gets an entry; languages without CLDR locales
    # get an empty list.
    complete_language_locale_dict = OrderedDict(
        (
            code,
            sorted(language_locale_dict[code]) if code in language_locale_dict else [],
        )
        for code in language_order
    )

    sections = [
        "language_order = " + dump(language_order),
        "language_map = " + dump(generate_language_map(language_order)),
        "language_locale_dict = " + dump(complete_language_locale_dict),
    ]
    with open("../dateparser/data/languages_info.py", "w") as f:
        f.write("\n\n".join(sections) + "\n")
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
if __name__ == "__main__":
|
|
217
|
+
# Runs only under the ``__name__ == "__main__"`` guard above: regenerate languages_info.py.
main()
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from tempfile import TemporaryFile
|
|
5
|
+
|
|
6
|
+
from dateparser.data.languages_info import language_locale_dict
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def to_string(data):
    """Render the language/locale mapping as table rows.

    One line is produced per language, in sorted order.  A language with
    locales is padded with spaces to an 18-character column and followed by
    its sorted locales, each wrapped in single quotes and separated by
    ``", "``.  A language without locales is written on its own.

    :param data: mapping of language code to an iterable of locale names
    :returns: the concatenated rows, each terminated by a newline
    """
    column_width = 18
    rows = []
    for code in sorted(data):
        row = code
        names = data[code]
        if names:
            padding = " " * (column_width - len(code))
            quoted = ["'{}'".format(name) for name in sorted(names)]
            row = code + padding + ", ".join(quoted)
        rows.append(row + "\n")
    return "".join(rows)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def main():
    """Replace the locale table body in ``docs/supported_locales.rst``.

    The file is copied line by line into a temporary file.  Lines equal to
    the table delimiter are counted; everything between the second and the
    third delimiter is dropped and replaced by the freshly rendered rows
    from ``to_string(language_locale_dict)``.  The result is then written
    back over the original file.
    """
    readme_path = os.path.join(
        os.path.dirname(__file__), "..", "docs", "supported_locales.rst"
    )
    new_data = to_string(language_locale_dict)

    # NOTE(review): this literal must match the table border line in
    # supported_locales.rst character for character (including the run of
    # spaces between the two column borders) - confirm against the document.
    delimiter = "============ ================================================================\n"
    delimiters_seen = 0
    is_inside_table = False

    # The context manager guarantees the temporary file is closed even when
    # reading or writing the document fails.
    with TemporaryFile("w+") as temporary_file:
        with open(readme_path) as readme_file:
            for line in readme_file:
                if line == delimiter:
                    delimiters_seen += 1
                    # Only the span after the second delimiter is replaced.
                    is_inside_table = delimiters_seen == 2
                elif is_inside_table:
                    # Old table row: drop it.
                    continue
                temporary_file.write(line)
                if is_inside_table:
                    # Just wrote the second delimiter: emit the new rows.
                    temporary_file.write(new_data)

        temporary_file.seek(0)
        with open(readme_path, "w") as readme_file:
            readme_file.write(temporary_file.read())
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
if __name__ == "__main__":
|
|
48
|
+
# Runs only under the ``__name__ == "__main__"`` guard above: refresh docs/supported_locales.rst.
main()
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
from collections import OrderedDict
|
|
4
|
+
|
|
5
|
+
from git import Repo
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_raw_data():
    """Download a fresh copy of the CLDR repositories into ``../raw_data``.

    Any existing ``../raw_data`` directory is deleted first.  Each of the
    three repositories (dates-full, core, rbnf) is cloned from its
    ``master`` branch and then switched to the revision named by
    ``cldr_version``.

    The paths are relative, so the result depends on the current working
    directory of the calling script.
    """
    cldr_version = "31.0.1"
    raw_data_directory = "../raw_data"

    cldr_data = {
        "dates_full": {
            "url": "https://github.com/unicode-cldr/cldr-dates-full.git",
            "dir": "{}/cldr_dates_full/".format(raw_data_directory),
        },
        "core": {
            "url": "https://github.com/unicode-cldr/cldr-core.git",
            "dir": "{}/cldr_core/".format(raw_data_directory),
        },
        "rbnf": {
            "url": "https://github.com/unicode-cldr/cldr-rbnf.git",
            "dir": "{}/cldr_rbnf/".format(raw_data_directory),
        },
    }

    if os.path.isdir(raw_data_directory):
        # remove current raw data
        shutil.rmtree(raw_data_directory)
    os.mkdir(raw_data_directory)

    for name, data in cldr_data.items():
        print('Clonning "{}" from: {}'.format(name, data["url"]))
        repo = Repo.clone_from(data["url"], data["dir"], branch="master")
        # Use the real git subcommand: "co" only works when the user happens
        # to have a personal alias for it configured.
        repo.git.checkout(cldr_version)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def get_dict_difference(parent_dict, child_dict):
    """Return the part of ``child_dict`` that is not already in ``parent_dict``.

    For every key of the child:

    * if the parent has no (truthy) value for it, the child value is kept;
    * lists keep only the items missing from the parent list, sorted;
    * dicts are compared recursively;
    * any other value is kept when it differs from the parent value.

    Keys whose resulting value is falsy (empty or ``None``) are omitted.

    :returns: ``OrderedDict`` in the child's key order
    """
    difference_dict = OrderedDict()
    for key in child_dict:
        in_child = child_dict[key]
        in_parent = parent_dict.get(key)

        if not in_parent:
            specific = in_child
        elif isinstance(in_child, list):
            specific = sorted(set(in_child) - set(in_parent))
        elif isinstance(in_child, dict):
            specific = get_dict_difference(in_parent, in_child)
        else:
            specific = in_child if in_child != in_parent else None

        if specific:
            difference_dict[key] = specific
    return difference_dict
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def combine_dicts(primary_dict, supplementary_dict):
    """Merge ``supplementary_dict`` into a copy of ``primary_dict``.

    For keys present in both: lists are concatenated (primary first), dicts
    are merged recursively, and any other value is replaced by the
    supplementary one.  Keys found only in one input are copied as they are.

    :returns: ``OrderedDict`` with the primary keys first, followed by the
        keys that exist only in the supplementary dict
    """
    merged = OrderedDict()
    for key, primary_value in primary_dict.items():
        if key not in supplementary_dict:
            merged[key] = primary_value
            continue

        extra_value = supplementary_dict[key]
        if isinstance(primary_value, list):
            merged[key] = primary_value + extra_value
        elif isinstance(primary_value, dict):
            merged[key] = combine_dicts(primary_value, extra_value)
        else:
            merged[key] = extra_value

    for key in supplementary_dict.keys():
        if key not in primary_dict.keys():
            merged[key] = supplementary_dict[key]
    return merged
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
from collections import OrderedDict
|
|
5
|
+
|
|
6
|
+
import regex as re
|
|
7
|
+
from ruamel.yaml import RoundTripLoader
|
|
8
|
+
|
|
9
|
+
from dateparser_scripts.order_languages import avoid_languages
|
|
10
|
+
from dateparser_scripts.utils import combine_dicts
|
|
11
|
+
|
|
12
|
+
# Input locations (relative to this script's directory, see os.chdir below).
cldr_date_directory = "../dateparser_data/cldr_language_data/date_translation_data/"
supplementary_directory = "../dateparser_data/supplementary_language_data/"
supplementary_date_directory = supplementary_directory + "date_translation_data/"

# Output locations of the generated Python modules.
translation_data_directory = "../dateparser/data/"
date_translation_directory = translation_data_directory + "date_translation_data/"

os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Language names are the file names minus their five-character extension
# (".json" for the CLDR data, ".yaml" for the supplementary data).
cldr_languages = list(
    {filename[:-5] for filename in os.listdir(cldr_date_directory)} - avoid_languages
)
supplementary_languages = [
    filename[:-5] for filename in os.listdir(supplementary_date_directory)
]
all_languages = set(cldr_languages) | set(supplementary_languages)

# Matches the literal "{0}" placeholder used in relative-date patterns.
RELATIVE_PATTERN = re.compile(r"\{0\}")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _modify_relative_data(relative_data):
    """Turn ``{0}`` placeholders into a number-matching regex group.

    Every string in every value list has its ``RELATIVE_PATTERN`` matches
    replaced by the text ``(\\d+[.,]?\\d*)``.  The lists are updated in
    place, so the caller's data changes as well.

    :param relative_data: mapping of key to a mutable list of pattern strings
    :returns: ``OrderedDict`` with the same keys and the (same) updated lists
    """
    modified_relative_data = OrderedDict()
    for unit, patterns in relative_data.items():
        for position, pattern in enumerate(patterns):
            patterns[position] = RELATIVE_PATTERN.sub(r"(\\d+[.,]?\\d*)", pattern)
        modified_relative_data[unit] = patterns
    return modified_relative_data
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _modify_data(language_data):
    """Rewrite the relative-date patterns of ``language_data`` in place.

    Applies ``_modify_relative_data`` to the top-level
    ``"relative-type-regex"`` entry and to the same entry of every item
    under ``"locale_specific"``.  The helper's return value is not needed:
    it updates the pattern lists in place.  Missing entries are treated as
    empty.  Returns ``None``.
    """
    _modify_relative_data(language_data.get("relative-type-regex", {}))
    for locale_info in language_data.get("locale_specific", {}).values():
        _modify_relative_data(locale_info.get("relative-type-regex", {}))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _get_complete_date_translation_data(language):
    """Load and merge the CLDR and supplementary data of one language.

    The CLDR JSON file is read when ``language`` is in ``cldr_languages``
    and the supplementary YAML file when it is in
    ``supplementary_languages``; whichever is missing counts as empty.  The
    two are merged with ``combine_dicts`` (CLDR data as primary).  If the
    merged data has no ``"name"`` key, it is set to ``language``.

    Both files are opened as UTF-8 so that reading does not depend on the
    platform's default encoding (the generated output is written as UTF-8
    as well).

    :param language: language code, also the base name of the data files
    :returns: the merged data as an ``OrderedDict``
    """
    cldr_data = {}
    supplementary_data = {}
    if language in cldr_languages:
        with open(cldr_date_directory + language + ".json", encoding="utf-8") as f:
            cldr_data = json.load(f, object_pairs_hook=OrderedDict)
    if language in supplementary_languages:
        with open(
            supplementary_date_directory + language + ".yaml", encoding="utf-8"
        ) as g:
            supplementary_data = OrderedDict(RoundTripLoader(g).get_data())
    complete_data = combine_dicts(cldr_data, supplementary_data)
    if "name" not in complete_data:
        complete_data["name"] = language
    return complete_data
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _write_file(filename, text, mode, in_memory, in_memory_result):
|
|
66
|
+
if in_memory:
|
|
67
|
+
in_memory_result[filename] = text
|
|
68
|
+
else:
|
|
69
|
+
with open(filename, mode) as out:
|
|
70
|
+
out.write(text)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def write_complete_data(in_memory=False):
    """
    Generate the needed py files from the CLDR files (JSON format) and the
    supplementary language data (YAML format).

    For every language the merged translation data is combined with
    ``base_data.yaml``, its relative-date patterns are rewritten, and the
    result is serialised as ``info = <JSON>`` encoded as UTF-8.

    Use it with in_memory=True to avoid writing real files and get a
    dictionary mapping the language file names to their content instead
    (used when testing).  In that mode nothing is created on disk; the
    package ``__init__.py`` files are only written in the default mode.

    :param in_memory: collect the generated files in a dict instead of
        writing them
    :returns: dict of file name to UTF-8 encoded content in memory mode,
        otherwise an empty dict
    """
    in_memory_result = {}

    if not in_memory:
        if not os.path.isdir(translation_data_directory):
            os.mkdir(translation_data_directory)
        # Start from an empty output directory so stale files disappear.
        if os.path.isdir(date_translation_directory):
            shutil.rmtree(date_translation_directory)
        os.mkdir(date_translation_directory)

    # Read as UTF-8 explicitly, matching the encoding of the generated output.
    with open(supplementary_directory + "base_data.yaml", encoding="utf-8") as f:
        base_data = RoundTripLoader(f).get_data()

    for language in all_languages:
        date_translation_data = _get_complete_date_translation_data(language)
        date_translation_data = combine_dicts(date_translation_data, base_data)
        _modify_data(date_translation_data)
        translation_data = json.dumps(
            date_translation_data, indent=4, separators=(",", ": "), ensure_ascii=False
        )
        out_text = ("info = " + translation_data + "\n").encode("utf-8")
        _write_file(
            date_translation_directory + language + ".py",
            out_text,
            "wb",
            in_memory,
            in_memory_result,
        )

    # The package files are always written to disk, never recorded in the
    # result dict.  Skip them in memory mode: the output directories are not
    # created there, and the mode is documented as not writing real files.
    if not in_memory:
        init_text = (
            "from dateparser.data import date_translation_data\n"
            "from .languages_info import language_order, language_locale_dict\n"
        )
        _write_file(
            translation_data_directory + "__init__.py",
            init_text,
            "w",
            False,
            in_memory_result,
        )
        _write_file(
            date_translation_directory + "__init__.py", "", "w", False, in_memory_result
        )

    return in_memory_result
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
if __name__ == "__main__":
|
|
129
|
+
# Runs only under the ``__name__ == "__main__"`` guard above: write the generated modules to disk.
write_complete_data()
|