rara-tools 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -0,0 +1,248 @@
1
+ import regex as re
2
+
3
+
4
class Transliterate():
    """
    Transliterate Russian (Cyrillic) text into the Estonian Latin alphabet.

    Ruleset: http://www.eki.ee/books/ekk09/index.php?id=37&p=2&p1=6
    'Vene keele tähestikust eesti tähestikku' 2005.

    Usage:
        Initialize the class:
            foo = Transliterate()
        Call with a list of sentences you'd like to transliterate:
            foo(['Дженни играет с мяц.', 'Яне говорит на телефоне.'])
            # This Returns -- ['Dženni igrajet s mjats.', 'Jane govorit na telefone.']

    Notes:
        Sentences are split on single spaces only, so punctuation stays
        attached to its word. Word-final rules therefore do not fire for a
        word that is directly followed by a punctuation mark.
    """

    def __call__(self, sentences):
        """Transliterate a list of sentences; see ``_transliterate``."""
        return self._transliterate(sentences)

    def _transliterate(self, sentences):
        """The main transliteration processor

        Arguments:
            sentences {list of strings} -- A list of string sentences to transliterate

        Returns:
            [list of strings] -- A transliterated version of the input sentences
        """
        return [
            ' '.join(
                self._transliterate_word(word)
                for word in sentence.split(' ')
            )
            for sentence in sentences
        ]

    def _transliterate_word(self, word):
        """Transliterates a single space-delimited word

        Arguments:
            word {string} -- The word to transliterate

        Returns:
            [string] -- The transliterated word
        """
        converted = self._check_doublechar_rules(word)
        # When the final 'ий' was collapsed to a Latin 'i', the context rules
        # must still see the original Cyrillic vowel in that position
        # (e.g. 'Тихий' -> 'Tihhi'), so neighbours are looked up in a
        # same-length slice of the original word.
        context = word[:len(converted)]

        pieces = []
        for char_index, char in enumerate(converted):
            if char in self._translit_table:
                pieces.append(self._translit_table[char])
            elif char in self._chars_with_rules:
                pieces.append(
                    self._check_char_rules(context, char, char_index))
            else:
                # Anything that is not a known Cyrillic letter is kept as is.
                pieces.append(char)
        return ''.join(pieces)

    def _check_doublechar_rules(self, word):
        """Checks for doublechar rules, currently only for the cyr characters ij

        A word longer than three characters that ends in 'ий' gets that
        ending replaced by a single Latin 'i' (or 'I' when the 'и' is upper
        case).

        Arguments:
            word {string} -- The word in which to check the rule in

        Returns:
            [string] -- The modified version of the input word relative to the rules, if no rules were found, returns the unchanged input word.
        """
        if len(word) > 3 and word[-2:].lower() == 'ий':
            return word[:-2] + self._check_if_upper_and_return(word[-2], 'I')
        return word

    def _check_char_rules(self, word, char, char_index):
        """Checks for specific rules for the given transliteration

        Arguments:
            word {string} -- The word in which the character originated from.
            char {string} -- The character about to be transliterated.
            char_index {int} -- The index of the character in the word.

        Returns:
            [string] -- The transliteration chosen by the rule for ``char``.
        """
        handler_name = self._rule_handlers.get(char)
        if handler_name is None:
            # No context rule registered: fall back to the plain mapping.
            return self._chars_with_rules.get(char, char)
        return getattr(self, handler_name)(word, char, char_index)

    def _is_vowel(self, char):
        """Returns True when char is a Russian vowel, regardless of case"""
        return char.lower() in self._vocals_rus

    def _double_after_vowel(self, word, char, char_index, doubled):
        """Shared rule for the cyr s and h characters

        The letter is doubled when it follows a vowel and is either the last
        character of the word or is itself followed by a vowel.

        Arguments:
            word {string} -- The word in which the character originated from.
            char {string} -- The character about to be transliterated.
            char_index {int} -- The index of the character in the word.
            doubled {string} -- The doubled form, first letter uppercase, ex: 'Ss'

        Returns:
            [string] -- The doubled or the plain transliteration of ``char``.
        """
        if char_index > 0 and self._is_vowel(word[char_index - 1]):
            # Slicing yields '' at the end of the word instead of raising.
            following = word[char_index + 1:char_index + 2]
            if not following or self._is_vowel(following):
                return self._check_if_upper_and_return(char, doubled)
        return self._chars_with_rules[char]

    def _rules_for_s(self, word, char, char_index):
        """Checks rules for the cyr s character
        Returns: [string] -- Returns 'Ss/ss/S/s' based on the rules
        """
        return self._double_after_vowel(word, char, char_index, 'Ss')

    def _rules_for_soft(self, word, char, char_index):
        """Checks rules for the cyr softening character
        Returns: [string] -- Returns 'J/j//' based on the rules
        """
        following = word[char_index + 1:char_index + 2]
        # Before е/ё/ю/я the 'j' is already produced by that vowel's own
        # transliteration, so the soft sign only yields 'j' before the
        # remaining vowels.
        if self._is_vowel(following) and following.lower() not in self._iotated:
            return self._check_if_upper_and_return(char, 'J')
        return self._chars_with_rules[char]

    def _rules_for_e(self, word, char, char_index):
        """Checks rules for the cyr character 'e'
        Returns: [string] -- Returns 'Je/je/E/e' based on the rules
        """
        if char_index == 0:
            return self._check_if_upper_and_return(char, 'Je')
        previous = word[char_index - 1]
        if self._is_vowel(previous) or previous in 'ЬьЪъ':
            return self._check_if_upper_and_return(char, 'Je')
        return self._chars_with_rules[char]

    def _rules_for_jo(self, word, char, char_index):
        """Checks rules for the cyr character 'jo'
        Returns: [string] -- Returns 'O/o/Jo/jo' based on the rules
        """
        if char_index != 0 and word[char_index - 1] in 'ЖжЧчШшЩщ':
            return self._check_if_upper_and_return(char, 'O')
        return self._chars_with_rules[char]

    def _rules_for_i(self, word, char, char_index):
        """Checks rules for the cyr character 'i'
        Returns: [string] -- Returns 'J/j/I/i' based on the rules
        """
        if char_index == 0 and len(word) > 1 and self._is_vowel(word[1]):
            return self._check_if_upper_and_return(char, 'J')
        return self._chars_with_rules[char]

    def _rules_for_h(self, word, char, char_index):
        """Checks rules for the cyr character 'h'
        Returns: [string] -- Returns 'Hh/hh/H/h' based on the rules
        """
        return self._double_after_vowel(word, char, char_index, 'Hh')

    @staticmethod
    def _check_if_upper_and_return(char_in_index, char):
        """Checks whether the character is supposed to be lower or upper case

        Arguments:
            char_in_index {string} -- [The value of the char index in the word]
            char {string} -- [The character to return, should be first letter uppercase, ex: 'Je']

        Returns:
            [string] -- [Upper or lower case version of the character]
        """
        if char_in_index.isupper():
            return char
        return char.lower()

    # Ugly constants too big for init method
    _vocals_est = ['i', 'ü', 'u', 'e', 'ö', 'õ', 'o', 'ä', 'a']
    # Lowercase only; lookups go through _is_vowel, which lowercases first.
    _vocals_rus = ['а', 'э', 'ы', 'у', 'о', 'я', 'е', 'ё', 'ю', 'и']
    # Vowels whose own transliteration already starts with 'j'.
    _iotated = ('е', 'ё', 'ю', 'я')
    # Letters with a fixed, context-free transliteration.
    _translit_table = {
        'А': 'A', 'а': 'a',
        'Б': 'B', 'б': 'b',
        'В': 'V', 'в': 'v',
        'Г': 'G', 'г': 'g',
        'Д': 'D', 'д': 'd',
        'Ж': 'Ž', 'ж': 'ž',
        'З': 'Z', 'з': 'z',
        'К': 'K', 'к': 'k',
        'Л': 'L', 'л': 'l',
        'М': 'M', 'м': 'm',
        'Н': 'N', 'н': 'n',
        'О': 'O', 'о': 'o',
        'П': 'P', 'п': 'p',
        'Р': 'R', 'р': 'r',
        'Т': 'T', 'т': 't',
        'У': 'U', 'у': 'u',
        'Ф': 'F', 'ф': 'f',
        'Ц': 'Ts', 'ц': 'ts',
        'Ч': 'Tš', 'ч': 'tš',
        'Ш': 'Š', 'ш': 'š',
        'Щ': 'Štš', 'щ': 'štš',
        'Ы': 'Õ', 'ы': 'õ',
        'Ъ': "", 'ъ': "",
        'Э': 'E', 'э': 'e',
        'Ю': 'Ju', 'ю': 'ju',
        'Я': 'Ja', 'я': 'ja',
    }
    # Default transliteration of the letters that also have a context rule.
    _chars_with_rules = {
        'Е': 'E', 'е': 'e',
        'Ё': 'Jo', 'ё': 'jo',
        'И': 'I', 'и': 'i',
        'Й': 'I', 'й': 'i',
        'Х': 'H', 'х': 'h',
        'Ь': "", 'ь': "",
        'С': 'S', 'с': 's',
    }
    # Name of the context-rule method responsible for each such letter.
    _rule_handlers = {
        'Е': '_rules_for_e', 'е': '_rules_for_e',
        'Ё': '_rules_for_jo', 'ё': '_rules_for_jo',
        'И': '_rules_for_i', 'и': '_rules_for_i',
        'Й': '_rules_for_i', 'й': '_rules_for_i',
        'Х': '_rules_for_h', 'х': '_rules_for_h',
        'Ь': '_rules_for_soft', 'ь': '_rules_for_soft',
        'С': '_rules_for_s', 'с': '_rules_for_s',
    }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -13,10 +13,13 @@ License-File: LICENSE.md
13
13
  Requires-Dist: elasticsearch==8.*
14
14
  Requires-Dist: elasticsearch_dsl==8.*
15
15
  Requires-Dist: minio==7.*
16
- Requires-Dist: rara-norm-linker==1.*
16
+ Requires-Dist: estnltk==1.7.3
17
+ Requires-Dist: nltk
18
+ Requires-Dist: jsonlines
17
19
  Requires-Dist: requests
18
20
  Requires-Dist: iso639-lang
19
21
  Requires-Dist: pymarc
22
+ Requires-Dist: regex
20
23
  Requires-Dist: glom
21
24
  Provides-Extra: testing
22
25
  Requires-Dist: pytest>=8.0; extra == "testing"
@@ -0,0 +1,37 @@
1
+ rara_tools/converters.py,sha256=_1ZRH4ACLOolI1G5b_aSssN68rWOvan-q2dTq7D7-j4,2794
2
+ rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
3
+ rara_tools/digar_schema_converter.py,sha256=k95U2iRlEA3sh772-v6snhHW6fju6qSTMnvWJ6DpzZk,14254
4
+ rara_tools/elastic.py,sha256=MgPHxZ3UbSTIL8_sT9gU5V4PLKJjo3aQ8CGyhXjRz6M,13065
5
+ rara_tools/exceptions.py,sha256=YQyaueUbXeTkJYFDEuN6iWTXMI3eCv5l7PxGp87vg5I,550
6
+ rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
7
+ rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
8
+ rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
9
+ rara_tools/constants/__init__.py,sha256=r78laM9vyRDAvzDhPvzDlhaX6qPwUUBBtwf1WosrW3o,27
10
+ rara_tools/constants/digitizer.py,sha256=MND0dUQySBAOVWzuUBxQGZWv_Ckdz2jCp25F2_oHGi8,496
11
+ rara_tools/constants/general.py,sha256=aVUQTMss89atAkTDZKJXNdnsBHPX-RSrlBOtt-wdPGU,195
12
+ rara_tools/constants/normalizers.py,sha256=GmWY89kYfX7_YJ8sdy1vb8ABJc_ABdw_zVVOxd9UZgY,171
13
+ rara_tools/constants/parsers.py,sha256=stXOyA1dEOgxdCUT4Mp4pvvGLmdE7DAjTe8Jq71tcS4,5453
14
+ rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
15
+ rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
16
+ rara_tools/normalizers/base.py,sha256=taOboGURQF_ACPVWHX_wMsaDEo8gYdAkiOw0yT0zzR8,10910
17
+ rara_tools/normalizers/bibs.py,sha256=4DTS6k37z8qR5B3n7aiCXsT5Z49rLTvQ60lKKr5dyLs,2352
18
+ rara_tools/normalizers/viaf.py,sha256=9uTyEadSaoFedUbUfY_iWPJtgrt04jP71i_6MLPM08I,6919
19
+ rara_tools/parsers/marc_parsers/base_parser.py,sha256=wzCccZaiN4p2iUms3PAOfXihNgEeg1cGRzRx26ytJeA,1661
20
+ rara_tools/parsers/marc_parsers/ems_parser.py,sha256=70WdxnbxZmOqla7EAUapCiCWZl0zRPwghKy25sSKFiY,1801
21
+ rara_tools/parsers/marc_parsers/location_parser.py,sha256=18HExO2BLs1ZJodjNF8YOkB5CJ7bd6_zmyIIzALL7aI,1658
22
+ rara_tools/parsers/marc_parsers/organization_parser.py,sha256=faqQEYsut_ZF3kX1QycTnbRIqC7W8sULxmG75ICfya8,1629
23
+ rara_tools/parsers/marc_parsers/person_parser.py,sha256=iMycHSlgfvgB0axE_rneB5sImVlc920FcBnTsUsmVW4,1582
24
+ rara_tools/parsers/marc_parsers/title_parser.py,sha256=0FnX1kl9InELlSqMGECjswEbhP-sKl55TuhV05RhWSw,14
25
+ rara_tools/parsers/marc_records/base_record.py,sha256=oDp4yjPMEmSD3F_dWIdx7IRtZfKwD7ydMFUW9YXAhSQ,4322
26
+ rara_tools/parsers/marc_records/ems_record.py,sha256=B2YZLEeDd-GmmYqxhczbMsSEB7-x6ZLjB8OeDnzOxww,9376
27
+ rara_tools/parsers/marc_records/organization_record.py,sha256=HmDqAqAL_Tw7ppEsS5HfogrfNuQMNChCkrdPu6K-SUE,9141
28
+ rara_tools/parsers/marc_records/person_record.py,sha256=BZrXqd7hCOqm-c-sjmsOfaAI4L7lLSjIUWtxHqPjhTs,7863
29
+ rara_tools/parsers/marc_records/title_record.py,sha256=0FnX1kl9InELlSqMGECjswEbhP-sKl55TuhV05RhWSw,14
30
+ rara_tools/parsers/tools/entity_normalizers.py,sha256=afOMqJoL4aeq0cfsohIuxkxzvqNdZ_ba7U32eyogbzk,8722
31
+ rara_tools/parsers/tools/marc_converter.py,sha256=PUbggzJ_wHfke_bHTF2LOZyzX1t0wRM8qIFL36Dl3AI,414
32
+ rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
33
+ rara_tools-0.4.0.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
34
+ rara_tools-0.4.0.dist-info/METADATA,sha256=93dBS6GW3Q9CoA3vyXvsjqrZauntruPKuT8vF_jHSm4,4054
35
+ rara_tools-0.4.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
36
+ rara_tools-0.4.0.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
37
+ rara_tools-0.4.0.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- rara_tools/converters.py,sha256=_1ZRH4ACLOolI1G5b_aSssN68rWOvan-q2dTq7D7-j4,2794
2
- rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
3
- rara_tools/digar_schema_converter.py,sha256=k95U2iRlEA3sh772-v6snhHW6fju6qSTMnvWJ6DpzZk,14254
4
- rara_tools/elastic.py,sha256=MgPHxZ3UbSTIL8_sT9gU5V4PLKJjo3aQ8CGyhXjRz6M,13065
5
- rara_tools/exceptions.py,sha256=YQyaueUbXeTkJYFDEuN6iWTXMI3eCv5l7PxGp87vg5I,550
6
- rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
7
- rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
8
- rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
9
- rara_tools/constants/__init__.py,sha256=r78laM9vyRDAvzDhPvzDlhaX6qPwUUBBtwf1WosrW3o,27
10
- rara_tools/constants/digitizer.py,sha256=MND0dUQySBAOVWzuUBxQGZWv_Ckdz2jCp25F2_oHGi8,496
11
- rara_tools/constants/general.py,sha256=aVUQTMss89atAkTDZKJXNdnsBHPX-RSrlBOtt-wdPGU,195
12
- rara_tools/constants/normalizers.py,sha256=eM-REyHen8MdBRYD0s2fQcYrvWxDwWfZlYGpBvdLog0,494
13
- rara_tools-0.2.0.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
14
- rara_tools-0.2.0.dist-info/METADATA,sha256=YgPsOKoNplzOs4PVlgJX9eaw65iTSfD9C-Ba374fK2A,3995
15
- rara_tools-0.2.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
16
- rara_tools-0.2.0.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
17
- rara_tools-0.2.0.dist-info/RECORD,,