rara-tools 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -0,0 +1,217 @@
1
+ from typing import List, NoReturn
2
+ from pymarc.record import Record
3
+ from rara_tools.parsers.tools.entity_normalizers import PersonNormalizer
4
+ from rara_tools.parsers.marc_records.base_record import BaseRecord
5
+ from rara_tools.constants.parsers import PersonMarcIDs, LOGGER
6
+ import regex as re
7
+ import json
8
+ import logging
9
+
10
+
11
class PersonRecord(BaseRecord):
    """ Generates a simplified person JSON record
    from a pymarc MARC record.
    """
    def __init__(self, record: Record, add_variations: bool = False) -> None:
        """ Initializes PersonRecord object.

        Parameters
        -----------
        record: Record
            pymarc.record.Record object.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        super().__init__(record=record, add_variations=add_variations)

        self.__name_field_id: List[str] = PersonMarcIDs.NAME
        self.__name_variations_field_id: List[str] = PersonMarcIDs.NAME_VARIATIONS
        self.__source_field_id: List[str] = PersonMarcIDs.SOURCE
        self.__description_field_id: List[str] = PersonMarcIDs.DESCRIPTION
        # Value used for a year that is present but cannot be parsed.
        self.__default_year: int | None = None

        self.__name: str = ""
        self.__original_name: dict = {}
        self.__name_specification: str = ""
        self.__life_years: str = ""
        # -1 means "not extracted"; it is also what callers get when the
        # life years are missing or cannot be split into two parts.
        self.__birth_year: int | None = -1
        self.__death_year: int | None = -1
        # Ensures the life years are parsed (and a failure logged) only once.
        self.__life_years_parsed: bool = False
        self.__name_variations: List[str] = []
        self.__source: str = ""
        self.__description: str = ""
        self.__full_record: dict = {}
        self.__name_in_cyrillic: bool | None = None
        self.__variations: List[str] = []
        self.__person_normalizer: PersonNormalizer = PersonNormalizer(self.name)

    def _parse_year(self, year: str) -> int:
        """ Extracts a year from one half of a life years string.

        Parameters
        -----------
        year: str
            Raw year string, e.g. '1920' or '1920?'.

        Returns
        -----------
        The first four characters as an integer if they are all digits,
        or the whole value as an integer if it is exactly three digits.
        Otherwise the default year (None).
        """
        year = year.strip()
        _year = self.__default_year
        if len(year) >= 4:
            # isdecimal() accepts only characters that int() can convert;
            # isnumeric() also accepts e.g. superscripts and fractions.
            if year[:4].isdecimal():
                _year = int(year[:4])
        elif len(year) == 3 and year.isdecimal():
            _year = int(year)
        return _year

    def _parse_life_years(self) -> None:
        """ Fills birth and death year from the life years string.

        Runs at most once per record. When the life years are missing
        nothing is logged and both years keep their initial value (-1).
        When the string cannot be split into exactly two parts, an error
        is logged and both years keep their initial value (-1).
        """
        if self.__life_years_parsed:
            return
        self.__life_years_parsed = True

        life_years = self.life_years
        if not life_years:
            # A missing field is not a parsing failure.
            return
        try:
            birth_year, death_year = life_years.split("-")
            parsed_birth_year = self._parse_year(birth_year)
            parsed_death_year = self._parse_year(death_year)
        except Exception as e:
            LOGGER.error(
                f"Failed extracting birth and/or death year " \
                f"from '{self.life_years}' with the following " \
                f"exception: '{e}'."
            )
            return
        self.__birth_year = parsed_birth_year
        self.__death_year = parsed_death_year

    @property
    def original_name(self) -> dict:
        """ Cleaned subfields 'a' and 'b' of the first name field,
        or an empty dict if the record has no name field.
        """
        if not self.__original_name:
            values = self.get_values(
                marc_ids=self.__name_field_id,
                subfield_id=["a", "b"]
            )
            if values:
                self.__original_name = {
                    "a": self._clean_value(values[0].get("a", "")),
                    "b": self._clean_value(values[0].get("b", ""))
                }
        return self.__original_name

    @property
    def name(self) -> str:
        """ Name merged from subfields 'a' and 'b' of the name field. """
        if not self.__name:
            self.__name = self._merge_and_clean(self.original_name, ["a", "b"])
        return self.__name

    @property
    def name_specification(self) -> str:
        """ Cleaned subfield 'c' of the name field, or '' if missing. """
        if not self.__name_specification:
            values = self.get_values(
                marc_ids=self.__name_field_id,
                subfield_id="c"
            )
            self.__name_specification = self._clean_value(values[0]) if values else ""
        return self.__name_specification

    @property
    def life_years(self) -> str:
        """ Cleaned subfield 'd' of the name field, or '' if missing. """
        if not self.__life_years:
            values = self.get_values(
                marc_ids=self.__name_field_id,
                subfield_id="d"
            )
            self.__life_years = self._clean_value(values[0]) if values else ""
        return self.__life_years

    @property
    def birth_year(self) -> int:
        """ Birth year parsed from the life years.

        Returns -1 if the life years are missing or malformed, and the
        default year (None) if only the birth part cannot be parsed.
        """
        self._parse_life_years()
        return self.__birth_year

    @property
    def death_year(self) -> int:
        """ Death year parsed from the life years.

        Returns -1 if the life years are missing or malformed, and the
        default year (None) if only the death part cannot be parsed.
        """
        self._parse_life_years()
        return self.__death_year

    @property
    def name_variations(self) -> List[str]:
        """ Names merged from subfields 'a' and 'b' of every
        name variations field.
        """
        if not self.__name_variations:
            values = self.get_values(
                marc_ids=self.__name_variations_field_id,
                subfield_id=["a", "b"]
            )
            if values:
                raw_variations = [
                    {
                        "a": self._clean_value(value.get("a", "")),
                        "b": self._clean_value(value.get("b", ""))
                    }
                    for value in values
                ]
                self.__name_variations = [
                    self._merge_and_clean(value, ["a", "b"])
                    for value in raw_variations
                ]
        return self.__name_variations

    @property
    def source(self) -> str:
        """ Cleaned subfield 'a' of the source field, or '' if missing. """
        if not self.__source:
            values = self.get_values(
                marc_ids=self.__source_field_id,
                subfield_id="a"
            )
            self.__source = self._clean_value(values[0]) if values else ""
        return self.__source

    @property
    def description(self) -> str:
        """ Cleaned subfield 'i' of the description field, or '' if missing. """
        if not self.__description:
            values = self.get_values(
                marc_ids=self.__description_field_id,
                subfield_id="i"
            )
            self.__description = self._clean_value(values[0]) if values else ""
        return self.__description

    @property
    def name_in_cyrillic(self) -> bool:
        """ Whether the name contains cyrillic characters. """
        if self.__name_in_cyrillic is None:
            self.__name_in_cyrillic = PersonNormalizer.has_cyrillic(self.name)
        return self.__name_in_cyrillic

    @property
    def variations(self) -> List[str]:
        """ Lowercased, de-duplicated variations generated for the name
        and for every name variation.
        """
        if not self.__variations:
            # Copy first: the normalizer hands out its own cached list and
            # extending it in place would corrupt that cache.
            variations_ = list(self.__person_normalizer.variations)
            for name in self.name_variations:
                variations_.extend(PersonNormalizer(name).variations)
            # Lowercase before de-duplicating so that forms differing only
            # in case collapse into one.
            self.__variations = list({v.lower() for v in variations_})
        return self.__variations

    @property
    def full_record(self) -> dict:
        """ The simplified record as a dict. The key 'link_variations'
        is included only if add_variations is enabled.
        """
        if not self.__full_record:
            self.__full_record = {
                "name": self.name,
                "life_year": self.life_years,
                "source": self.source,
                "birth_year": self.birth_year,
                "death_year": self.death_year,
                "identifier": self.identifier,
                "identifier_source": self.identifier_source,
                "name_variations": self.name_variations,
                "name_specification": self.name_specification,
                "description": self.description,
                "name_in_cyrillic": self.name_in_cyrillic,
                "full_record_marc": str(self.marc_record),
                "full_record_json": json.dumps(self.marc_json_record)
            }
            if self.add_variations:
                self.__full_record.update(
                    {"link_variations": self.variations}
                )

        return self.__full_record
@@ -0,0 +1 @@
1
+ # Coming soon
@@ -0,0 +1,256 @@
1
+ import regex as re
2
+ import estnltk
3
+ import nltk
4
+ import logging
5
+ from rara_tools.parsers.tools.russian_transliterator import Transliterate
6
+ from rara_tools.constants.parsers import KeywordType, LOGGER
7
+ from typing import List, NoReturn
8
+ from abc import abstractmethod
9
+
10
+
11
+ nltk.download("punkt_tab")  # Top-level statement: runs whenever this module is imported. NOTE(review): presumably fetches tokenizer data needed for lemmatization - confirm.
12
+
13
+
14
class PersonalName:
    """ Wraps generating and accessing main name forms.
    """
    def __init__(self, name: str) -> None:
        """ Initializes PersonalName object.

        Parameters
        -----------
        name: str
            Personal name. Expects one of the following formats:
            '<first name> <last name>' or '<last name>, <first_name>', e.g:
            'Uku Tamm' or 'Tamm, Uku'.
        """
        self.__original_name: str = name
        self.__name: dict = {}
        self.__last_comma_first: str = ""
        self.__first_last: str = ""

    @property
    def first_name(self) -> str:
        """ First name(s), or '' if the name could not be split. """
        return self.name.get("first_name")

    @property
    def last_name(self) -> str:
        """ Last name, or '' if the name could not be split. """
        return self.name.get("last_name")

    @property
    def name(self) -> dict:
        """ The name split into 'first_name' and 'last_name'.

        A name containing a comma is split on the first comma only, so
        'Tamm, Uku, Jr.' gives last name 'Tamm' and first name 'Uku, Jr.'.
        A name without a comma uses its last whitespace-separated token
        as the last name. A single-token name gives two empty strings.
        """
        if not self.__name:
            last_name = ""
            first_name = ""
            if "," in self.__original_name:
                # partition() never raises, unlike unpacking split(","),
                # which fails on names with more than one comma.
                last_name, _, first_name = self.__original_name.partition(",")
            else:
                name_tokens = self.__original_name.split()
                if len(name_tokens) > 1:
                    last_name = name_tokens[-1]
                    first_name = " ".join(name_tokens[:-1])
            self.__name = {
                "first_name": first_name.strip(),
                "last_name": last_name.strip()
            }
        return self.__name

    @property
    def last_comma_first(self) -> str:
        """ The name as '<last name>, <first name>'.

        If only one part is known, that part is returned without a
        dangling comma. Returns '' if neither part is known.
        """
        if not self.__last_comma_first:
            parts = [part for part in (self.last_name, self.first_name) if part]
            self.__last_comma_first = ", ".join(parts)
        return self.__last_comma_first

    @property
    def first_last(self) -> str:
        """ The name as '<first name> <last name>', or '' if neither
        part is known.
        """
        if not self.__first_last:
            parts = [part for part in (self.first_name, self.last_name) if part]
            self.__first_last = " ".join(parts)
        return self.__first_last
80
+
81
+
82
class Normalizer:
    """ Class for handling general methods for string
    normalizations and variations generation.
    """
    def __init__(self, entity: str) -> None:
        """ Initializes Normalizer object.

        Parameters
        -----------
        entity: str
            Entity (keyword, person etc) to normalize.
        """
        self.__entity: str = entity
        self.__lemmatized_entity: str = ""
        self.__cleaned_entity: str = ""

    @staticmethod
    def has_cyrillic(entity: str) -> bool:
        """ Checks if the entity contains at least one Russian
        cyrillic letter.
        """
        # 'ё' and 'Ё' lie outside the а-я / А-Я ranges and have to be
        # listed explicitly.
        return bool(re.search(r"[а-яА-ЯёЁ]", entity))

    @staticmethod
    def transliterate(entity: str) -> str:
        """ Returns the transliteration of the entity produced
        by Transliterate.
        """
        transliterator = Transliterate()
        transliteration = transliterator([entity])[0]
        return transliteration

    @staticmethod
    def lemmatize(entity: str) -> str:
        """ Returns the entity with every word replaced by the first
        lemma estnltk proposes for it, joined with single spaces.
        """
        layer = estnltk.Text(entity).tag_layer()
        lemma_list = [lemmas[0] for lemmas in layer.lemma]
        lemmatized_entity = " ".join(lemma_list)
        return lemmatized_entity

    @staticmethod
    def remove_parenthesized_info(entity: str) -> str:
        """ Removes every non-empty '(...)' group from the entity
        and strips surrounding whitespace.
        """
        clean_entity = re.sub(r"[(][^)]+[)]", "", entity)
        return clean_entity.strip()

    @staticmethod
    def clean_entity(entity: str) -> str:
        """ Returns the entity without parenthesized information. """
        clean_entity = Normalizer.remove_parenthesized_info(entity)
        return clean_entity

    @property
    def lemmatized_entity(self) -> str:
        """ Lemmatized form of the entity. """
        if not self.__lemmatized_entity:
            self.__lemmatized_entity = Normalizer.lemmatize(self.__entity)
        return self.__lemmatized_entity

    @property
    def cleaned_entity(self) -> str:
        """ Cleaned form of the entity. """
        if not self.__cleaned_entity:
            self.__cleaned_entity = Normalizer.clean_entity(self.__entity)
        return self.__cleaned_entity

    # NOTE: this class does not use ABCMeta, so the decorator below is
    # not enforced; it only marks what subclasses are expected to provide.
    @abstractmethod
    def variations(self) -> List[str]:
        """ Variations of the entity; implemented by subclasses. """
        pass
141
+
142
+
143
class PersonNormalizer(Normalizer):
    """ Class for handling person-specific methods for string
    normalizations and variations generation.
    """
    def __init__(self, name: str) -> None:
        """ Initializes PersonNormalizer object.

        Parameters
        -----------
        name: str
            Personal name to normalize / generate variations for.
        """
        super().__init__(entity=name)
        self.__name: str = name
        self.__name_object: PersonalName = PersonalName(name)
        self.__variations: List[str] = []

    @property
    def variations(self) -> List[str]:
        """ De-duplicated variations of the name.

        Contains the '<last name>, <first name>' and
        '<first name> <last name>' forms plus the original name. If the
        original name contains cyrillic, a transliteration of each of
        these forms is added as well.
        """
        if not self.__variations:
            LOGGER.debug(f"Generating variations for name {self.__name}.")
            candidates = [
                self.__name_object.last_comma_first,
                self.__name_object.first_last,
                # The original name goes in before transliteration, so
                # that one-word names also get a transliterated form.
                self.__name,
            ]
            # Drop empty forms so they are not sent to the transliterator.
            variations = [v.strip() for v in candidates if v.strip()]

            if Normalizer.has_cyrillic(self.__name):
                LOGGER.debug(
                    f"Detected cyrillic in the original name '{self.__name}'. " \
                    f"Generating a transliterated latin version."
                )
                # Build the full list first; extending a list while
                # iterating over it would never terminate.
                transliterations = [
                    Normalizer.transliterate(name)
                    for name in variations
                ]
                variations.extend(transliterations)

            _variations = [v.strip() for v in variations if v.strip()]
            self.__variations = list(set(_variations))
            LOGGER.debug(
                f"Generated the following variations for name '{self.__name}': " \
                f"{self.__variations}."
            )
        return self.__variations
190
+
191
+
192
+
193
class KeywordNormalizer(Normalizer):
    """ Keyword-specific string normalization and
    generation of keyword variations.
    """
    def __init__(self, keyword: str, keyword_type: str = "") -> NoReturn:
        """ Sets up a KeywordNormalizer instance.

        Parameters
        -----------
        keyword: str
            The keyword to normalize / produce variations for.
        keyword_type: str
            Type of the keyword. Expected to be one of the types in
            rara_tools.constants.parsers.KeywordType or "".
        """
        super().__init__(entity=keyword)
        self.__keyword: str = keyword
        self.__variations: List[str] = []
        self.__keyword_type: str = keyword_type
        # Character replacements applied to location keywords.
        self.__loc_substitutions_map: dict = {"v": "w", "V": "W"}

    def _transform_v_into_w(self, entity: str) -> str:
        """ Applies every replacement of the location substitutions
        map to the entity, one after another.
        """
        result = entity
        for pattern, replacement in self.__loc_substitutions_map.items():
            result = re.sub(pattern, replacement, result)
        return result

    @property
    def loc_substitutions_as_str(self) -> str:
        """ Human-readable listing of the location substitutions. """
        return ", ".join(
            f"'{old_val}' -> '{new_val}'"
            for old_val, new_val in self.__loc_substitutions_map.items()
        )

    @property
    def variations(self) -> List[str]:
        """ De-duplicated variations of the keyword: the keyword itself,
        its lemmatized form, its cleaned form and the lemmatized cleaned
        form. For location keywords the v -> w replaced version of each
        of these is added as well.
        """
        if self.__variations:
            return self.__variations

        LOGGER.debug(f"Generating variations for keyword {self.__keyword}.")
        candidates = [self.__keyword, self.lemmatized_entity]
        cleaned = self.cleaned_entity
        candidates += [cleaned, Normalizer.lemmatize(cleaned)]

        # Location keywords additionally get the v -> w replaced forms.
        if self.__keyword_type == KeywordType.LOC:
            LOGGER.debug(
                f"Detected keyword type = '{KeywordType.LOC}' -> " \
                f"Adding variations with the following character " \
                f"replacements: {self.loc_substitutions_as_str}."
            )
            candidates = candidates + [
                self._transform_v_into_w(candidate)
                for candidate in candidates
            ]

        self.__variations = list(set(candidates))
        LOGGER.debug(
            f"Generated the following variations for keyword '{self.__keyword}': " \
            f"{self.__variations}."
        )
        return self.__variations
@@ -0,0 +1,15 @@
1
+ import pymarc
2
+ from typing import NoReturn
3
+
4
+
5
class MarcConveter:
    """ Collection of helpers for converting MARC files
    from one format into another.
    """
    def __init__(self) -> None:
        """ Initializes MarcConveter object. The object holds no state. """

    @staticmethod
    def marc21xml_to_mrc(input_file: str, output_file: str) -> None:
        """ Converts Marc21XML file into a MRC file.

        Parameters
        -----------
        input_file: str
            Path to the Marc21XML file to read.
        output_file: str
            Path to the MRC file to write. An existing file
            is overwritten.
        """
        with open(output_file, "wb") as f:
            writer = pymarc.MARCWriter(f)
            # map_xml hands each record parsed from the input file to
            # writer.write; its return value is not needed.
            pymarc.marcxml.map_xml(writer.write, input_file)