rara-tools 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -0,0 +1,267 @@
1
+ from typing import List, NoReturn
2
+ from pymarc.record import Record
3
+ from rara_tools.parsers.tools.entity_normalizers import KeywordNormalizer
4
+ from rara_tools.parsers.marc_records.base_record import BaseRecord
5
+ from rara_tools.constants.parsers import (
6
+ EMSMarcIDs, KeywordType,
7
+ EN_SUBJECT_FIELDS, ET_SUBJECT_FIELDS
8
+ )
9
+ import regex as re
10
+ import json
11
+
12
+
13
+
14
class EMSRecord(BaseRecord):
    """ Generates a simplified EMS JSON record
    from a pymarc MARC record.

    Every public property is computed lazily on first access and kept on
    the instance once it holds a non-empty value; an empty result is
    looked up again on the next access.
    """

    def __init__(self, record: Record, add_variations: bool = False) -> None:
        """ Initializes EMSRecord object.

        Parameters
        -----------
        record: Record
            pymarc.record.Record object.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        super().__init__(record=record, add_variations=add_variations)

        # Lookup tables: subject field ID -> human readable label.
        self.__en_subject_field_mapping: dict = EN_SUBJECT_FIELDS
        self.__et_subject_field_mapping: dict = ET_SUBJECT_FIELDS

        # MARC field ID -> keyword type for every field that may carry
        # the keyword itself.
        self.__keyword_fields = {
            EMSMarcIDs.TIME_KEYWORD: KeywordType.TIME,
            EMSMarcIDs.TOPIC_KEYWORD: KeywordType.TOPIC,
            EMSMarcIDs.LOC_KEYWORD: KeywordType.LOC,
            EMSMarcIDs.GENRE_KEYWORD: KeywordType.GENRE
        }

        # MARC field IDs the individual properties read from.
        self.__ems_url_ids = EMSMarcIDs.URL
        self.__synonym_ids = EMSMarcIDs.SYNONYMS
        self.__related_ids = EMSMarcIDs.RELATED
        self.__category_ids = EMSMarcIDs.CATEGORY
        self.__notes_ids = EMSMarcIDs.NOTES

        # Lazily filled caches backing the public properties.
        self.__keyword: str = ""
        self.__keyword_en: str = ""
        self.__keyword_type: str = ""
        self.__keyword_variations: List[str] = []
        self.__synonyms: List[str] = []
        self.__synonyms_en: List[str] = []
        self.__subject_field_ids: List[str] = []
        self.__subject_fields_et: List[str] = []
        self.__subject_fields_en: List[str] = []
        self.__ems_url: str = ""
        self.__narrower: List[str] = []
        self.__broader: List[str] = []
        self.__related: List[str] = []
        self.__narrower_ems_urls: List[str] = []
        self.__broader_ems_urls: List[str] = []
        self.__related_ems_urls: List[str] = []
        self.__variations: List[str] = []
        self.__variations_en: List[str] = []
        # None means "not computed yet"; False is a valid final value.
        self.__use_with_others: bool | None = None
        self.__full_record: dict = {}

    @property
    def keyword(self) -> str:
        """ First subfield "a" value of the keyword fields, or an empty
        string if the record contains no such value.
        """
        if not self.__keyword:
            values = self.get_values(
                marc_ids=self.__keyword_fields,
                subfield_id="a"
            )
            # A record without a keyword field must not raise IndexError.
            self.__keyword = values[0] if values else ""
        return self.__keyword

    @property
    def keyword_en(self) -> str:
        """ First entry of `synonyms_en`, or an empty string if there
        is none.
        """
        if not self.__keyword_en:
            synonyms_en = self.synonyms_en
            self.__keyword_en = synonyms_en[0] if synonyms_en else ""
        return self.__keyword_en

    @property
    def synonyms(self) -> List[str]:
        """ Subfield "a" values of the synonym fields. """
        if not self.__synonyms:
            self.__synonyms = self.get_values(
                marc_ids=self.__synonym_ids,
                subfield_id="a"
            )
        return self.__synonyms

    @property
    def synonyms_en(self) -> List[str]:
        """ Subfield "a" values of the synonym fields, queried with
        ind2="9".

        NOTE(review): presumably ind2 "9" marks the English synonyms -
        confirm against BaseRecord.get_values and the EMS data.
        """
        if not self.__synonyms_en:
            self.__synonyms_en = self.get_values(
                marc_ids=self.__synonym_ids,
                subfield_id="a",
                ind2="9"
            )
        return self.__synonyms_en

    @property
    def subject_field_ids(self) -> List[str]:
        """ Subfield "a" values of the category fields, queried with
        ind2="7".
        """
        if not self.__subject_field_ids:
            self.__subject_field_ids = self.get_values(
                marc_ids=self.__category_ids,
                subfield_id="a",
                ind2="7"
            )
        return self.__subject_field_ids

    @property
    def subject_fields_et(self) -> List[str]:
        """ ET_SUBJECT_FIELDS labels for `subject_field_ids`, in the
        same order.

        Raises
        -----------
        KeyError
            If an ID is missing from the ET_SUBJECT_FIELDS mapping.
        """
        if not self.__subject_fields_et:
            mapping = self.__et_subject_field_mapping
            self.__subject_fields_et = [
                mapping[_id] for _id in self.subject_field_ids
            ]
        return self.__subject_fields_et

    @property
    def subject_fields_en(self) -> List[str]:
        """ EN_SUBJECT_FIELDS labels for `subject_field_ids`, in the
        same order.

        Raises
        -----------
        KeyError
            If an ID is missing from the EN_SUBJECT_FIELDS mapping.
        """
        if not self.__subject_fields_en:
            mapping = self.__en_subject_field_mapping
            self.__subject_fields_en = [
                mapping[_id] for _id in self.subject_field_ids
            ]
        return self.__subject_fields_en

    @property
    def ems_url(self) -> str:
        """ First subfield "0" value of the URL fields (queried with
        ind2="8"), or an empty string if the record contains none.
        """
        if not self.__ems_url:
            values = self.get_values(
                marc_ids=self.__ems_url_ids,
                subfield_id="0",
                ind2="8"
            )
            # A record without an URL field must not raise IndexError.
            self.__ems_url = values[0] if values else ""
        return self.__ems_url

    @property
    def broader(self) -> List[str]:
        """ Subfield "a" values of the related-term fields selected
        with subfield_restriction=("w", "g").

        NOTE(review): presumably this keeps only fields whose "w"
        subfield equals "g" - confirm against BaseRecord.get_values.
        """
        if not self.__broader:
            self.__broader = self.get_values(
                marc_ids=self.__related_ids,
                subfield_id="a",
                subfield_restriction=("w", "g")
            )
        return self.__broader

    @property
    def narrower(self) -> List[str]:
        """ Subfield "a" values of the related-term fields selected
        with subfield_restriction=("w", "h").
        """
        if not self.__narrower:
            self.__narrower = self.get_values(
                marc_ids=self.__related_ids,
                subfield_id="a",
                subfield_restriction=("w", "h")
            )
        return self.__narrower

    @property
    def related(self) -> List[str]:
        """ Subfield "a" values of the related-term fields selected
        with subfield_to_ignore="w".

        NOTE(review): presumably this skips fields that carry a "w"
        subfield, i.e. the broader/narrower ones - confirm against
        BaseRecord.get_values.
        """
        if not self.__related:
            self.__related = self.get_values(
                marc_ids=self.__related_ids,
                subfield_id="a",
                subfield_to_ignore="w"
            )
        return self.__related

    @property
    def broader_ems_urls(self) -> List[str]:
        """ Subfield "0" values of the related-term fields selected
        with subfield_restriction=("w", "g").
        """
        if not self.__broader_ems_urls:
            self.__broader_ems_urls = self.get_values(
                marc_ids=self.__related_ids,
                subfield_id="0",
                subfield_restriction=("w", "g")
            )
        return self.__broader_ems_urls

    @property
    def narrower_ems_urls(self) -> List[str]:
        """ Subfield "0" values of the related-term fields selected
        with subfield_restriction=("w", "h").
        """
        if not self.__narrower_ems_urls:
            self.__narrower_ems_urls = self.get_values(
                marc_ids=self.__related_ids,
                subfield_id="0",
                subfield_restriction=("w", "h")
            )
        return self.__narrower_ems_urls

    @property
    def related_ems_urls(self) -> List[str]:
        """ Subfield "0" values of the related-term fields selected
        with subfield_to_ignore="w".
        """
        if not self.__related_ems_urls:
            self.__related_ems_urls = self.get_values(
                marc_ids=self.__related_ids,
                subfield_id="0",
                subfield_to_ignore="w"
            )
        return self.__related_ems_urls

    @property
    def keyword_type(self) -> str:
        """ Keyword type of the record, derived from which keyword
        field is present in `dict_record`. Empty string if none is.
        """
        if not self.__keyword_type:
            for field in self.dict_record:
                # Each entry is keyed by its field ID; only the first
                # key is inspected.
                field_id = next(iter(field.keys()), None)
                if field_id in self.__keyword_fields:
                    # No early exit on purpose: when several keyword
                    # fields are present, the last one wins.
                    self.__keyword_type = self.__keyword_fields[field_id]
        return self.__keyword_type

    @property
    def use_with_others(self) -> bool:
        """ True if the first subfield "i" value of the notes fields
        contains the phrase "Kasutada koos teise", otherwise False.
        """
        # Identity check: False is a legitimate cached value and must
        # not trigger a recomputation.
        if self.__use_with_others is None:
            notes = self.get_values(marc_ids=self.__notes_ids, subfield_id="i")
            self.__use_with_others = bool(
                notes and re.search(r"Kasutada koos teise", notes[0])
            )
        return self.__use_with_others

    @property
    def variations(self) -> List[str]:
        """ Sorted, lower-cased, de-duplicated KeywordNormalizer
        variations of the keyword and all of its synonyms.
        """
        if not self.__variations:
            # Skip empty values (e.g. a missing keyword) so that the
            # normalizer is only fed real keywords.
            sources = [kw for kw in self.synonyms + [self.keyword] if kw]
            keyword_type = self.keyword_type
            collected = set()
            for kw in sources:
                normalizer = KeywordNormalizer(kw, keyword_type=keyword_type)
                # Lower-case BEFORE de-duplicating, otherwise values
                # differing only by case end up as duplicates.
                collected.update(v.lower() for v in normalizer.variations)
            # Sorted for a deterministic output order.
            self.__variations = sorted(collected)
        return self.__variations

    @property
    def variations_en(self) -> List[str]:
        """ Placeholder: generation of English variations is not
        implemented yet, so this is always an empty list.
        """
        return self.__variations_en

    @property
    def full_record(self) -> dict:
        """ The simplified record as a dict.

        The "link_variations" key is only added if `add_variations`
        is enabled.
        """
        if not self.__full_record:
            self.__full_record = {
                "keyword": self.keyword,
                "keyword_en": self.keyword_en,
                "keyword_type": self.keyword_type,
                "use_with_others": self.use_with_others,
                "subject_field_ids": self.subject_field_ids,
                "subject_fields_et": self.subject_fields_et,
                "subject_fields_en": self.subject_fields_en,
                "synonyms": self.synonyms,
                "synonyms_en": self.synonyms_en,
                "narrower": self.narrower,
                "broader": self.broader,
                "related": self.related,
                "narrower_ems_urls": self.narrower_ems_urls,
                "broader_ems_urls": self.broader_ems_urls,
                "related_ems_urls": self.related_ems_urls,
                "ems_id": self.identifier,
                "ems_url": self.ems_url,
                "identifier_source": self.identifier_source,
                "full_record_marc": str(self.marc_record),
                "full_record_json": json.dumps(self.marc_json_record)
            }
            if self.add_variations:
                self.__full_record["link_variations"] = self.variations
        return self.__full_record
@@ -0,0 +1,245 @@
1
+ from typing import List, NoReturn
2
+ from pymarc.record import Record
3
+ from rara_tools.parsers.marc_records.base_record import BaseRecord
4
+ from rara_tools.constants.parsers import OrganizationMarcIDs
5
+ import regex as re
6
+ import json
7
+
8
+ # TODO: indicators (ind1) on fields 100 and 400?
9
+
10
+
11
class OrganizationRecord(BaseRecord):
    """ Generates a simplified organization JSON record
    from a pymarc MARC record.

    Every public property is computed lazily on first access and kept on
    the instance once it holds a non-empty value; an empty result is
    looked up again on the next access.
    """

    def __init__(self, record: Record, add_variations: bool = False) -> None:
        """ Initializes OrganizationRecord object.

        Parameters
        -----------
        record: Record
            pymarc.record.Record object.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        super().__init__(record=record, add_variations=add_variations)

        # MARC field IDs the individual properties read from.
        self.__name_field_id: List[str] = OrganizationMarcIDs.NAME
        self.__name_variations_field_id: List[str] = OrganizationMarcIDs.NAME_VARIATIONS
        self.__related_names_field_id: List[str] = OrganizationMarcIDs.RELATED_NAMES
        self.__source_field_id: List[str] = OrganizationMarcIDs.SOURCE
        self.__description_field_id: List[str] = OrganizationMarcIDs.DESCRIPTION
        self.__area_code_id: List[str] = OrganizationMarcIDs.AREA_CODE
        self.__default_year: int | None = None

        # Lazily filled caches backing the public properties.
        self.__name: str = ""
        self.__original_name: dict = {}
        self.__name_specification: str = ""
        self.__life_years: str = ""
        self.__birth_year: int = -1
        self.__death_year: int = -1
        self.__name_variations: List[str] = []
        self.__source: str = ""
        self.__description: str = ""
        self.__area_code: str = ""
        self.__acronyms: List[str] = []
        self.__alternative_names: List[str] = []
        self.__related_acronyms: List[str] = []
        self.__old_names: List[str] = []
        self.__new_names: List[str] = []
        self.__related_old_names: List[str] = []
        self.__related_new_names: List[str] = []
        self.__full_record: dict = {}
        self.__variations: List[str] = []

    def _clean_value(self, value: str) -> str:
        """ Strips leading/trailing dots, commas and spaces from `value`. """
        return value.strip("., ")

    def _merge_and_clean(self, value: dict, keys: List[str]) -> str:
        """ Joins the cleaned, non-empty values of `keys` with a space.

        Parameters
        -----------
        value: dict
            Mapping of subfield ID -> subfield value; missing keys
            are treated as empty.
        keys: List[str]
            Subfield IDs to merge, in output order.

        Returns
        -----------
        str
            The merged value; empty string if every part is empty.
        """
        cleaned = (self._clean_value(value.get(key, "")) for key in keys)
        return " ".join(part for part in cleaned if part)

    def __merged_names(self, marc_ids: List[str], **selectors) -> List[str]:
        """ Fetches subfields "a" and "b" of `marc_ids` and merges each
        hit into a single cleaned name. `selectors` are passed on to
        `get_values` unchanged.
        """
        values = self.get_values(
            marc_ids=marc_ids,
            subfield_id=["a", "b"],
            **selectors
        )
        return [self._merge_and_clean(value, ["a", "b"]) for value in values]

    def __cleaned_values(self, marc_ids: List[str], subfield_id: str,
                         **selectors) -> List[str]:
        """ Fetches a single subfield of `marc_ids` and cleans every hit. """
        values = self.get_values(
            marc_ids=marc_ids,
            subfield_id=subfield_id,
            **selectors
        )
        return [self._clean_value(value) for value in values]

    def __first_cleaned_value(self, marc_ids: List[str], subfield_id: str) -> str:
        """ First cleaned hit of a single subfield, or an empty string. """
        values = self.get_values(marc_ids=marc_ids, subfield_id=subfield_id)
        return self._clean_value(values[0]) if values else ""

    @property
    def original_name(self) -> dict:
        """ Cleaned subfields "a" and "b" of the first name field as
        {"a": ..., "b": ...}; an empty dict if the record has no name
        field.
        """
        if not self.__original_name:
            values = self.get_values(
                marc_ids=self.__name_field_id,
                subfield_id=["a", "b"]
            )
            if values:
                first = values[0]
                self.__original_name = {
                    "a": self._clean_value(first.get("a", "")),
                    "b": self._clean_value(first.get("b", ""))
                }
            else:
                # Report the offending record through logging instead of
                # printing to stdout from library code.
                import logging
                logging.getLogger(__name__).warning(
                    "No name field found in record: %s", self.marc_record
                )
        return self.__original_name

    @property
    def name(self) -> str:
        """ Subfields "a" and "b" of `original_name` joined with a
        space; empty string if the record has no name field.
        """
        if not self.__name:
            self.__name = self._merge_and_clean(self.original_name, ["a", "b"])
        return self.__name

    @property
    def acronyms(self) -> List[str]:
        """ Cleaned subfield "a" values of the name variation fields
        selected with subfield_restriction=("w", "d").

        NOTE(review): presumably this keeps only fields whose "w"
        subfield equals "d" - confirm against BaseRecord.get_values.
        """
        if not self.__acronyms:
            self.__acronyms = self.__cleaned_values(
                self.__name_variations_field_id, "a",
                subfield_restriction=("w", "d")
            )
        return self.__acronyms

    @property
    def new_names(self) -> List[str]:
        """ Merged names of the name variation fields selected with
        subfield_restriction=("w", "b").
        """
        if not self.__new_names:
            self.__new_names = self.__merged_names(
                self.__name_variations_field_id,
                subfield_restriction=("w", "b")
            )
        return self.__new_names

    @property
    def old_names(self) -> List[str]:
        """ Merged names of the name variation fields selected with
        subfield_restriction=("w", "a").
        """
        if not self.__old_names:
            self.__old_names = self.__merged_names(
                self.__name_variations_field_id,
                subfield_restriction=("w", "a")
            )
        return self.__old_names

    @property
    def alternative_names(self) -> List[str]:
        """ Merged names of the name variation fields selected with
        subfield_to_ignore="w".
        """
        if not self.__alternative_names:
            self.__alternative_names = self.__merged_names(
                self.__name_variations_field_id,
                subfield_to_ignore="w"
            )
        return self.__alternative_names

    @property
    def related_acronyms(self) -> List[str]:
        """ Cleaned subfield "a" values of the related name fields
        selected with subfield_restriction=("w", "d").
        """
        if not self.__related_acronyms:
            self.__related_acronyms = self.__cleaned_values(
                self.__related_names_field_id, "a",
                subfield_restriction=("w", "d")
            )
        return self.__related_acronyms

    @property
    def related_new_names(self) -> List[str]:
        """ Merged names of the related name fields selected with
        subfield_restriction=("w", "b").
        """
        if not self.__related_new_names:
            self.__related_new_names = self.__merged_names(
                self.__related_names_field_id,
                subfield_restriction=("w", "b")
            )
        return self.__related_new_names

    @property
    def related_old_names(self) -> List[str]:
        """ Merged names of the related name fields selected with
        subfield_restriction=("w", "a").
        """
        if not self.__related_old_names:
            self.__related_old_names = self.__merged_names(
                self.__related_names_field_id,
                subfield_restriction=("w", "a")
            )
        return self.__related_old_names

    @property
    def source(self) -> str:
        """ First cleaned subfield "a" value of the source fields,
        or an empty string.
        """
        if not self.__source:
            self.__source = self.__first_cleaned_value(
                self.__source_field_id, "a"
            )
        return self.__source

    @property
    def area_code(self) -> str:
        """ First cleaned subfield "c" value of the area code fields,
        or an empty string.
        """
        if not self.__area_code:
            self.__area_code = self.__first_cleaned_value(
                self.__area_code_id, "c"
            )
        return self.__area_code

    @property
    def description(self) -> str:
        """ First cleaned subfield "i" value of the description fields,
        or an empty string.
        """
        if not self.__description:
            self.__description = self.__first_cleaned_value(
                self.__description_field_id, "i"
            )
        return self.__description

    @property
    def variations(self) -> List[str]:
        """ Sorted, lower-cased, de-duplicated combination of the name
        and all new, old, alternative and related new/old names.
        Acronyms are not included.
        """
        if not self.__variations:
            candidates = [
                self.name,
                *self.new_names,
                *self.old_names,
                *self.alternative_names,
                *self.related_old_names,
                *self.related_new_names,
            ]
            # Lower-case BEFORE de-duplicating so values differing only
            # by case collapse into one; drop empty names (e.g. a record
            # without a name field). Sorted for a deterministic order.
            self.__variations = sorted({c.lower() for c in candidates if c})
        return self.__variations

    @property
    def full_record(self) -> dict:
        """ The simplified record as a dict.

        The "link_variations" and "link_acronyms" keys are only added
        if `add_variations` is enabled.
        """
        if not self.__full_record:
            self.__full_record = {
                "name": self.name,
                "original_name": self.original_name,
                "acronyms": self.acronyms,
                "new_names": self.new_names,
                "old_names": self.old_names,
                "source": self.source,
                "description": self.description,
                "area_code": self.area_code,
                "alternative_names": self.alternative_names,
                # NOTE(review): the key is misspelt ("acryonyms"). It is
                # part of the emitted schema, so it is kept as-is here;
                # renaming it needs a coordinated change in consumers.
                "related_acryonyms": self.related_acronyms,
                "related_new_names": self.related_new_names,
                "related_old_names": self.related_old_names,
                "identifier": self.identifier,
                "identifier_source": self.identifier_source,
                "full_record_marc": str(self.marc_record),
                "full_record_json": json.dumps(self.marc_json_record)
            }
            if self.add_variations:
                self.__full_record.update(
                    {
                        "link_variations": self.variations,
                        "link_acronyms": [a.lower() for a in self.acronyms]
                    }
                )
        return self.__full_record