rara-tools 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- rara_tools/constants/digitizer.py +2 -0
- rara_tools/constants/general.py +1 -0
- rara_tools/constants/subject_indexer.py +4 -0
- rara_tools/normalizers/base.py +77 -20
- rara_tools/normalizers/reader.py +45 -0
- {rara_tools-0.7.0.dist-info → rara_tools-0.7.2.dist-info}/METADATA +1 -1
- {rara_tools-0.7.0.dist-info → rara_tools-0.7.2.dist-info}/RECORD +10 -9
- {rara_tools-0.7.0.dist-info → rara_tools-0.7.2.dist-info}/WHEEL +0 -0
- {rara_tools-0.7.0.dist-info → rara_tools-0.7.2.dist-info}/licenses/LICENSE.md +0 -0
- {rara_tools-0.7.0.dist-info → rara_tools-0.7.2.dist-info}/top_level.txt +0 -0
rara_tools/constants/subject_indexer.py
CHANGED
|
@@ -6,10 +6,12 @@ COMPONENT_KEY = "subject_indexer"
|
|
|
6
6
|
class Tasks:
|
|
7
7
|
SINGLE = "run_subject_indexer_process"
|
|
8
8
|
PIPELINE = "run_subject_indexer_with_core_logic"
|
|
9
|
+
PURGE_MODELS = "purge_unused_subjectindexer_models"
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class Queue:
|
|
12
13
|
MAIN = "subject-indexer"
|
|
14
|
+
UTILITY = "subjectindexer-utility"
|
|
13
15
|
|
|
14
16
|
|
|
15
17
|
class StatusKeys:
|
|
@@ -21,6 +23,7 @@ class URLSource:
|
|
|
21
23
|
SIERRA = "Sierra"
|
|
22
24
|
EMS = "EMS"
|
|
23
25
|
|
|
26
|
+
|
|
24
27
|
class KeywordType:
|
|
25
28
|
LOC = "Kohamärksõnad"
|
|
26
29
|
TIME = "Ajamärksõnad"
|
|
@@ -45,6 +48,7 @@ class KeywordMARC:
|
|
|
45
48
|
EVENT = 611
|
|
46
49
|
TITLE = 630
|
|
47
50
|
|
|
51
|
+
|
|
48
52
|
class KeywordSource:
|
|
49
53
|
EMS = "EMS"
|
|
50
54
|
SIERRA = "SIERRA"
|
rara_tools/normalizers/base.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from datetime import datetime
|
|
2
2
|
from pymarc import (Field, Subfield, JSONReader, Record)
|
|
3
3
|
from typing import List, Optional, Iterator
|
|
4
|
+
from rara_tools.normalizers.reader import SafeJSONReader
|
|
4
5
|
|
|
5
6
|
from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
|
|
6
7
|
from rara_tools.constants.normalizers import (
|
|
@@ -35,7 +36,6 @@ class RecordNormalizer:
|
|
|
35
36
|
self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
|
|
36
37
|
# include, if should be added alongside existing fields
|
|
37
38
|
self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
|
|
38
|
-
|
|
39
39
|
self.records_extra_data = []
|
|
40
40
|
self.records = self._setup_records(linking_results, sierra_data)
|
|
41
41
|
self.sierra_data = sierra_data
|
|
@@ -92,8 +92,56 @@ class RecordNormalizer:
|
|
|
92
92
|
|
|
93
93
|
all_records = linked_records + (sierra_data or [])
|
|
94
94
|
|
|
95
|
-
return
|
|
96
|
-
|
|
95
|
+
return SafeJSONReader(
|
|
96
|
+
json.dumps(all_records, ensure_ascii=False),
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
    """Build the reader over the initial MARC records and register their metadata.

    For every dict in ``linking_results`` whose ``linked_info`` is a list
    holding exactly one dict, the item's ``json`` value (``{}`` if absent)
    is collected as a record and a matching ``"linked"`` entry is appended
    to ``self.records_extra_data``. Results that are not dicts, or whose
    ``linked_info`` is missing, empty, not a list, or holds more than one
    item, contribute nothing.

    Every dict in ``sierra_data`` gets a ``"sierra"`` entry in
    ``self.records_extra_data``; the ``sierra_data`` items themselves are
    all passed on as records after the linked ones.

    Args:
        linking_results: Linking output; ``None`` or empty is treated as
            no results.
        sierra_data: Sierra records; ``None`` or empty is treated as no
            records.

    Returns:
        A ``SafeJSONReader`` over the JSON dump of the linked records
        followed by the Sierra records.
    """
    sierra_items = sierra_data or []
    marc_dicts = []

    for result in linking_results or []:
        if not isinstance(result, dict):
            continue

        matches = result.get("linked_info", [])
        # Only an unambiguous match (exactly one candidate) is used.
        if not isinstance(matches, list) or len(matches) != 1:
            continue

        match = matches[0]
        if not isinstance(match, dict):
            continue

        marc_dicts.append(match.get("json", {}))
        self.records_extra_data.append({
            "entity": result.get("original_entity"),
            "viaf": match.get("viaf", {}),
            "type": "linked",
            "edited": True
        })

    # Extra data is registered only for dict items, while the record list
    # below keeps every Sierra item as given.
    for item in sierra_items:
        if isinstance(item, dict):
            self.records_extra_data.append({
                "sierraID": item.get("sierraID"),
                "type": "sierra",
                "edited": True
            })

    serialized = json.dumps(marc_dicts + sierra_items, ensure_ascii=False)
    return SafeJSONReader(serialized)
|
|
97
145
|
|
|
98
146
|
@staticmethod
|
|
99
147
|
def current_timestamp():
|
|
@@ -230,11 +278,15 @@ class RecordNormalizer:
|
|
|
230
278
|
|
|
231
279
|
def _get_viaf_search_term(self, record: Record, entity: Optional[str]) -> Optional[str]:
    """Pick the term used for the VIAF search.

    The entity name is preferred. Without one, fields 100, 110 and 111 are
    checked in that order and the first subfield ``a`` value found is used.
    A field that is present but has no subfield ``a`` no longer ends the
    search: the remaining fields are still tried.

    Args:
        record: MARC record to read the author fields from.
        entity: Entity name; used as-is when truthy.

    Returns:
        The search term, or ``None`` (after logging a warning) when neither
        an entity nor an author name is available.
    """
    if entity:
        return entity

    for tag in ("100", "110", "111"):
        author_field = record.get(tag)
        if not author_field:
            continue
        names = author_field.get_subfields("a")
        if names:
            return names[0]

    logger.warning(
        "No entity or author name found for VIAF search. Skipping VIAF enrichment.")
    return None
|
|
238
290
|
|
|
239
291
|
def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None,
|
|
240
292
|
entity: Optional[str] = None, viaf_field: str = DEFAULT_VIAF_FIELD,
|
|
@@ -252,21 +304,26 @@ class RecordNormalizer:
|
|
|
252
304
|
viaf_record = viaf_records[0]
|
|
253
305
|
else:
|
|
254
306
|
search_term = self._get_viaf_search_term(record, entity)
|
|
255
|
-
if not verify:
|
|
256
|
-
logger.warning(
|
|
257
|
-
f"Record verification is turned off. If multiple records are " \
|
|
258
|
-
f"detected for search term '{search_term}', the first " \
|
|
259
|
-
f"result is automatically returned. This might lead to " \
|
|
260
|
-
f"some inaccuracies!"
|
|
261
|
-
)
|
|
262
307
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
verify
|
|
268
|
-
|
|
269
|
-
|
|
308
|
+
if search_term:
|
|
309
|
+
logger.info(
|
|
310
|
+
f"Searching for VIAF record with search term: {search_term}")
|
|
311
|
+
|
|
312
|
+
if not verify:
|
|
313
|
+
logger.warning(
|
|
314
|
+
f"Record verification is turned off. If multiple records are " \
|
|
315
|
+
f"detected for search term '{search_term}', the first " \
|
|
316
|
+
f"result is automatically returned. This might lead to " \
|
|
317
|
+
f"some inaccuracies!"
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
viaf_record = viaf_client.get_normalized_data_by_search_term(
|
|
321
|
+
search_term=search_term,
|
|
322
|
+
field=viaf_field,
|
|
323
|
+
max_records=max_records,
|
|
324
|
+
verify=verify,
|
|
325
|
+
threshold=threshold
|
|
326
|
+
)
|
|
270
327
|
|
|
271
328
|
except Exception as e:
|
|
272
329
|
logger.error(
|
|
rara_tools/normalizers/reader.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from pymarc import Record, Field, Subfield, Leader, JSONReader
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
logger = logging.getLogger(__name__)
|
|
5
|
+
|
|
6
|
+
DEFAULT_LEADER = '01682nz a2200349n 4500'
|
|
7
|
+
|
|
8
|
+
class SafeJSONReader(JSONReader):
    """JSONReader variant that skips records it cannot convert.

    A record without a (truthy) ``leader`` value gets ``DEFAULT_LEADER``
    instead, and a field dict without ``ind1``/``ind2`` gets a blank
    indicator. Any other error raised while building a record is logged
    and the reader moves on to the next one.
    """

    def __next__(self):
        """Return the next record that converts cleanly.

        Returns:
            A ``Record`` built from the next usable JSON object.

        Raises:
            StopIteration: When the underlying iterator is exhausted.
        """
        while True:
            try:
                # NOTE(review): self.iter is presumably set up by the parent
                # JSONReader — confirm against the pymarc version in use.
                raw = next(self.iter)
                record = Record()

                # Fall back to the module default when the leader is absent.
                leader_value = raw.get("leader")
                if not leader_value:
                    logger.warning("Missing leader in record. Using DEFAULT_LEADER.")
                    leader_value = DEFAULT_LEADER
                record.leader = Leader(leader_value)

                for entry in raw["fields"]:
                    # Only the first key/value pair of each entry is used.
                    tag, payload = list(entry.items())[0]

                    has_subfields = isinstance(payload, dict) and "subfields" in payload
                    if not has_subfields:
                        record.add_field(Field(tag=tag, data=payload))
                        continue

                    # Flatten the list of {code: value} dicts into Subfields.
                    subfields = [
                        Subfield(code, value)
                        for pair in payload["subfields"]
                        for code, value in pair.items()
                    ]
                    indicators = [payload.get("ind1", " "), payload.get("ind2", " ")]
                    record.add_field(
                        Field(tag=tag, indicators=indicators, subfields=subfields)
                    )

                return record

            except StopIteration:
                # End of input must reach the caller, not be logged as a bad record.
                raise
            except Exception as e:
                logger.error(f"Skipping invalid record: {e}")
|
|
@@ -7,22 +7,23 @@ rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
|
|
|
7
7
|
rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
|
|
8
8
|
rara_tools/utils.py,sha256=1UrxOzo3cxe4juMkDlKWv1VKWMYay5v1pivGci1ajiM,3003
|
|
9
9
|
rara_tools/constants/__init__.py,sha256=r78laM9vyRDAvzDhPvzDlhaX6qPwUUBBtwf1WosrW3o,27
|
|
10
|
-
rara_tools/constants/digitizer.py,sha256=
|
|
11
|
-
rara_tools/constants/general.py,sha256=
|
|
10
|
+
rara_tools/constants/digitizer.py,sha256=9aQkJj8C5a_HLgCayrz3PpGYJMLoO4Ph9_U28Q-C1T4,633
|
|
11
|
+
rara_tools/constants/general.py,sha256=uCSIfE0o-drRDDqnaJALMGHecUOQIRhdr2g5M5NQqMY,1102
|
|
12
12
|
rara_tools/constants/language_evaluator.py,sha256=3sCSaoS-zXQRY0vJ7UUMuZqbtYQD_quVVbdpgvJjE7I,124
|
|
13
13
|
rara_tools/constants/linker.py,sha256=yBN9NpUhB3ENz8BapoIfpSHY_xNqwYdqutgQFdc_Cd8,3240
|
|
14
14
|
rara_tools/constants/meta_extractor.py,sha256=adYH8cQqH0ZWYO7clGMiObclXRTGsxWgk3pC1oiHxHE,242
|
|
15
15
|
rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
|
|
16
16
|
rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
|
|
17
|
-
rara_tools/constants/subject_indexer.py,sha256=
|
|
17
|
+
rara_tools/constants/subject_indexer.py,sha256=i0xRdqwasyb6d6WZZKPgyuEUd2JeO_qwWYoG6UeBo5U,2064
|
|
18
18
|
rara_tools/core_formatters/core_formatter.py,sha256=HJX7jOi9kaFie_zm0Wzjk0nKF8dRleJpVWbCplFFquo,2760
|
|
19
19
|
rara_tools/core_formatters/formatted_keyword.py,sha256=1-B9IQTycFt69pTy8WZNnfJ2WIMRow3kpEub6igyNQc,7865
|
|
20
20
|
rara_tools/core_formatters/formatted_meta.py,sha256=Zd0oQFLbn6m_wHaWtgxBsu9J7wGyWIpZxb2-8PrR3Wk,5240
|
|
21
21
|
rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
|
|
22
22
|
rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
|
|
23
23
|
rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
|
|
24
|
-
rara_tools/normalizers/base.py,sha256=
|
|
24
|
+
rara_tools/normalizers/base.py,sha256=e2Ql_JNrpRiVDeb9Iqf6mug3BHNjKasjwmiIh5oWgWc,14276
|
|
25
25
|
rara_tools/normalizers/bibs.py,sha256=4DTS6k37z8qR5B3n7aiCXsT5Z49rLTvQ60lKKr5dyLs,2352
|
|
26
|
+
rara_tools/normalizers/reader.py,sha256=_usCY2jDC_1KvEJEdthPEldVz-v4-2-S8deNJCzQVoI,1573
|
|
26
27
|
rara_tools/normalizers/viaf.py,sha256=LIeqbJoKtVt_0H1o7XMmhSE0BjF4l-jdAJgX_8Gg9Z4,24218
|
|
27
28
|
rara_tools/parsers/marc_parsers/base_parser.py,sha256=Kdw4aivJf2FkWgIK7pJtHtVXF_G1pjHVQ7IcFItSqy8,1649
|
|
28
29
|
rara_tools/parsers/marc_parsers/ems_parser.py,sha256=LFuhZcVwmHMcJknX9p4ZkO8RdjPdQZ4APGbw8KV6BIs,2024
|
|
@@ -38,8 +39,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
|
|
|
38
39
|
rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
|
|
39
40
|
rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
|
|
40
41
|
rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
|
|
41
|
-
rara_tools-0.7.
|
|
42
|
-
rara_tools-0.7.
|
|
43
|
-
rara_tools-0.7.
|
|
44
|
-
rara_tools-0.7.
|
|
45
|
-
rara_tools-0.7.
|
|
42
|
+
rara_tools-0.7.2.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
43
|
+
rara_tools-0.7.2.dist-info/METADATA,sha256=x8GYrcQQCpMdhCUlfLr0Nxd1a2fNH3Fg5sUq2nK5x6Q,4079
|
|
44
|
+
rara_tools-0.7.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
45
|
+
rara_tools-0.7.2.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
46
|
+
rara_tools-0.7.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|