rara-tools 0.7.0__tar.gz → 0.7.2__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- {rara_tools-0.7.0/rara_tools.egg-info → rara_tools-0.7.2}/PKG-INFO +1 -1
- rara_tools-0.7.2/VERSION +1 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/digitizer.py +2 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/general.py +1 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/subject_indexer.py +4 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/normalizers/base.py +77 -20
- rara_tools-0.7.2/rara_tools/normalizers/reader.py +45 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2/rara_tools.egg-info}/PKG-INFO +1 -1
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools.egg-info/SOURCES.txt +1 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_normalization.py +28 -1
- rara_tools-0.7.0/VERSION +0 -1
- {rara_tools-0.7.0 → rara_tools-0.7.2}/LICENSE.md +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/README.md +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/pyproject.toml +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/__init__.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/language_evaluator.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/linker.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/meta_extractor.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/normalizers.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/parsers.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/converters.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/core_formatters/core_formatter.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/core_formatters/formatted_keyword.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/core_formatters/formatted_meta.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/core_formatters/formatted_object.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/decorators.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/digar_schema_converter.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/elastic.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/exceptions.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/normalizers/__init__.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/normalizers/authorities.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/normalizers/bibs.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/normalizers/viaf.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_records/base_record.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_records/ems_record.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_records/organization_record.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_records/person_record.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_records/title_record.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/tools/marc_converter.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/s3.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/task_reporter.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/utils.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools.egg-info/dependency_links.txt +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools.egg-info/requires.txt +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools.egg-info/top_level.txt +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/requirements.txt +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/setup.cfg +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_digar_schema_converter.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_elastic.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_elastic_vector_and_search_operations.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_entity_normalizers.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_formatters.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_marc_parsers.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_s3_exceptions.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_s3_file_operations.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_sierra_converters.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_task_reporter.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_utils.py +0 -0
- {rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_viaf_client.py +0 -0
rara_tools-0.7.2/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.7.2
|
|
@@ -6,10 +6,12 @@ COMPONENT_KEY = "subject_indexer"
|
|
|
6
6
|
class Tasks:
|
|
7
7
|
SINGLE = "run_subject_indexer_process"
|
|
8
8
|
PIPELINE = "run_subject_indexer_with_core_logic"
|
|
9
|
+
PURGE_MODELS = "purge_unused_subjectindexer_models"
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class Queue:
|
|
12
13
|
MAIN = "subject-indexer"
|
|
14
|
+
UTILITY = "subjectindexer-utility"
|
|
13
15
|
|
|
14
16
|
|
|
15
17
|
class StatusKeys:
|
|
@@ -21,6 +23,7 @@ class URLSource:
|
|
|
21
23
|
SIERRA = "Sierra"
|
|
22
24
|
EMS = "EMS"
|
|
23
25
|
|
|
26
|
+
|
|
24
27
|
class KeywordType:
|
|
25
28
|
LOC = "Kohamärksõnad"
|
|
26
29
|
TIME = "Ajamärksõnad"
|
|
@@ -45,6 +48,7 @@ class KeywordMARC:
|
|
|
45
48
|
EVENT = 611
|
|
46
49
|
TITLE = 630
|
|
47
50
|
|
|
51
|
+
|
|
48
52
|
class KeywordSource:
|
|
49
53
|
EMS = "EMS"
|
|
50
54
|
SIERRA = "SIERRA"
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from datetime import datetime
|
|
2
2
|
from pymarc import (Field, Subfield, JSONReader, Record)
|
|
3
3
|
from typing import List, Optional, Iterator
|
|
4
|
+
from rara_tools.normalizers.reader import SafeJSONReader
|
|
4
5
|
|
|
5
6
|
from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
|
|
6
7
|
from rara_tools.constants.normalizers import (
|
|
@@ -35,7 +36,6 @@ class RecordNormalizer:
|
|
|
35
36
|
self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
|
|
36
37
|
# include, if should be added alongside existing fields
|
|
37
38
|
self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
|
|
38
|
-
|
|
39
39
|
self.records_extra_data = []
|
|
40
40
|
self.records = self._setup_records(linking_results, sierra_data)
|
|
41
41
|
self.sierra_data = sierra_data
|
|
@@ -92,8 +92,56 @@ class RecordNormalizer:
|
|
|
92
92
|
|
|
93
93
|
all_records = linked_records + (sierra_data or [])
|
|
94
94
|
|
|
95
|
-
return
|
|
96
|
-
|
|
95
|
+
return SafeJSONReader(
|
|
96
|
+
json.dumps(all_records, ensure_ascii=False),
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
|
|
100
|
+
"""Setup initial MARC records and data.
|
|
101
|
+
|
|
102
|
+
For linked entities:
|
|
103
|
+
1. Try to get single linked normalized record from KATA elastic. If more than one found, skip.
|
|
104
|
+
2. If 0 matches, search from VIAF and if 1 result found, create a new authority record from the data.
|
|
105
|
+
3. If none or more than one responses found, use only Classificator data (coming from Linker?).
|
|
106
|
+
"""
|
|
107
|
+
linked_records = []
|
|
108
|
+
|
|
109
|
+
for linked in linking_results or []:
|
|
110
|
+
if not isinstance(linked, dict):
|
|
111
|
+
continue
|
|
112
|
+
|
|
113
|
+
entity = linked.get("original_entity")
|
|
114
|
+
linked_info = linked.get("linked_info", [])
|
|
115
|
+
|
|
116
|
+
if not isinstance(linked_info, list) or not linked_info:
|
|
117
|
+
continue
|
|
118
|
+
|
|
119
|
+
if len(linked_info) == 1:
|
|
120
|
+
linked_item = linked_info[0]
|
|
121
|
+
if not isinstance(linked_item, dict):
|
|
122
|
+
continue
|
|
123
|
+
|
|
124
|
+
linked_records.append(linked_item.get("json", {}))
|
|
125
|
+
self.records_extra_data.append({
|
|
126
|
+
"entity": entity,
|
|
127
|
+
"viaf": linked_item.get("viaf", {}),
|
|
128
|
+
"type": "linked",
|
|
129
|
+
"edited": True
|
|
130
|
+
})
|
|
131
|
+
|
|
132
|
+
self.records_extra_data.extend(
|
|
133
|
+
{
|
|
134
|
+
"sierraID": obj.get("sierraID"),
|
|
135
|
+
"type": "sierra",
|
|
136
|
+
"edited": True
|
|
137
|
+
}
|
|
138
|
+
for obj in (sierra_data or [])
|
|
139
|
+
if isinstance(obj, dict)
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
all_records = linked_records + (sierra_data or [])
|
|
143
|
+
|
|
144
|
+
return SafeJSONReader(json.dumps(all_records, ensure_ascii=False))
|
|
97
145
|
|
|
98
146
|
@staticmethod
|
|
99
147
|
def current_timestamp():
|
|
@@ -230,11 +278,15 @@ class RecordNormalizer:
|
|
|
230
278
|
|
|
231
279
|
def _get_viaf_search_term(self, record: Record, entity: Optional[str]) -> Optional[str]:
|
|
232
280
|
""" prioritize entity name, if not available, use author name. """
|
|
233
|
-
|
|
234
281
|
if entity:
|
|
235
282
|
return entity
|
|
236
|
-
|
|
237
|
-
|
|
283
|
+
|
|
284
|
+
author_field = record.get("100") or record.get("110") or record.get("111")
|
|
285
|
+
if author_field:
|
|
286
|
+
return author_field.get_subfields("a")[0] if author_field.get_subfields("a") else None
|
|
287
|
+
|
|
288
|
+
logger.warning(
|
|
289
|
+
"No entity or author name found for VIAF search. Skipping VIAF enrichment.")
|
|
238
290
|
|
|
239
291
|
def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None,
|
|
240
292
|
entity: Optional[str] = None, viaf_field: str = DEFAULT_VIAF_FIELD,
|
|
@@ -252,21 +304,26 @@ class RecordNormalizer:
|
|
|
252
304
|
viaf_record = viaf_records[0]
|
|
253
305
|
else:
|
|
254
306
|
search_term = self._get_viaf_search_term(record, entity)
|
|
255
|
-
if not verify:
|
|
256
|
-
logger.warning(
|
|
257
|
-
f"Record verification is turned off. If multiple records are " \
|
|
258
|
-
f"detected for search term '{search_term}', the first " \
|
|
259
|
-
f"result is automatically returned. This might lead to " \
|
|
260
|
-
f"some inaccuracies!"
|
|
261
|
-
)
|
|
262
307
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
verify
|
|
268
|
-
|
|
269
|
-
|
|
308
|
+
if search_term:
|
|
309
|
+
logger.info(
|
|
310
|
+
f"Searching for VIAF record with search term: {search_term}")
|
|
311
|
+
|
|
312
|
+
if not verify:
|
|
313
|
+
logger.warning(
|
|
314
|
+
f"Record verification is turned off. If multiple records are " \
|
|
315
|
+
f"detected for search term '{search_term}', the first " \
|
|
316
|
+
f"result is automatically returned. This might lead to " \
|
|
317
|
+
f"some inaccuracies!"
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
viaf_record = viaf_client.get_normalized_data_by_search_term(
|
|
321
|
+
search_term=search_term,
|
|
322
|
+
field=viaf_field,
|
|
323
|
+
max_records=max_records,
|
|
324
|
+
verify=verify,
|
|
325
|
+
threshold=threshold
|
|
326
|
+
)
|
|
270
327
|
|
|
271
328
|
except Exception as e:
|
|
272
329
|
logger.error(
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from pymarc import Record, Field, Subfield, Leader, JSONReader
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
logger = logging.getLogger(__name__)
|
|
5
|
+
|
|
6
|
+
DEFAULT_LEADER = '01682nz a2200349n 4500'
|
|
7
|
+
|
|
8
|
+
class SafeJSONReader(JSONReader):
|
|
9
|
+
|
|
10
|
+
def __next__(self):
|
|
11
|
+
while True:
|
|
12
|
+
try:
|
|
13
|
+
jobj = next(self.iter)
|
|
14
|
+
rec = Record()
|
|
15
|
+
|
|
16
|
+
# Use custom default leader if missing
|
|
17
|
+
leader_str = jobj.get("leader")
|
|
18
|
+
if leader_str:
|
|
19
|
+
rec.leader = Leader(leader_str)
|
|
20
|
+
else:
|
|
21
|
+
logger.warning("Missing leader in record. Using DEFAULT_LEADER.")
|
|
22
|
+
rec.leader = Leader(DEFAULT_LEADER)
|
|
23
|
+
|
|
24
|
+
for field in jobj["fields"]:
|
|
25
|
+
k, v = list(field.items())[0]
|
|
26
|
+
|
|
27
|
+
if isinstance(v, dict) and "subfields" in v:
|
|
28
|
+
subfields = []
|
|
29
|
+
for sub in v["subfields"]:
|
|
30
|
+
for code, value in sub.items():
|
|
31
|
+
subfields.append(Subfield(code, value))
|
|
32
|
+
ind1 = v.get("ind1", " ")
|
|
33
|
+
ind2 = v.get("ind2", " ")
|
|
34
|
+
fld = Field(tag=k, indicators=[ind1, ind2], subfields=subfields)
|
|
35
|
+
else:
|
|
36
|
+
fld = Field(tag=k, data=v)
|
|
37
|
+
rec.add_field(fld)
|
|
38
|
+
|
|
39
|
+
return rec
|
|
40
|
+
|
|
41
|
+
except StopIteration:
|
|
42
|
+
raise
|
|
43
|
+
except Exception as e:
|
|
44
|
+
logger.error(f"Skipping invalid record: {e}")
|
|
45
|
+
continue
|
|
@@ -33,6 +33,7 @@ rara_tools/normalizers/__init__.py
|
|
|
33
33
|
rara_tools/normalizers/authorities.py
|
|
34
34
|
rara_tools/normalizers/base.py
|
|
35
35
|
rara_tools/normalizers/bibs.py
|
|
36
|
+
rara_tools/normalizers/reader.py
|
|
36
37
|
rara_tools/normalizers/viaf.py
|
|
37
38
|
rara_tools/parsers/marc_parsers/base_parser.py
|
|
38
39
|
rara_tools/parsers/marc_parsers/ems_parser.py
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from rara_tools.constants import linker
|
|
1
2
|
from rara_tools.normalizers import BibRecordNormalizer, AuthoritiesRecordNormalizer
|
|
2
3
|
from tests.test_utils import (get_linker_res_example, get_formatted_sierra_response,
|
|
3
4
|
check_record_tags_sorted, check_no_dupe_tag_values, check_record_tags_have_values)
|
|
@@ -281,7 +282,8 @@ def test_authority_normrecord_not_found_in_es_and_viaf():
|
|
|
281
282
|
linking_results = [linker_res]
|
|
282
283
|
|
|
283
284
|
normalizer = AuthoritiesRecordNormalizer(
|
|
284
|
-
linking_results=linking_results
|
|
285
|
+
linking_results=linking_results
|
|
286
|
+
)
|
|
285
287
|
|
|
286
288
|
data = normalizer.data
|
|
287
289
|
|
|
@@ -302,6 +304,31 @@ def test_authority_normrecord_not_found_in_es_and_viaf():
|
|
|
302
304
|
# should create new normalized record in the future, none for now
|
|
303
305
|
assert len(data) == 0
|
|
304
306
|
|
|
307
|
+
def _run_normalizer(linked_data):
|
|
308
|
+
normalizer = AuthoritiesRecordNormalizer(
|
|
309
|
+
linking_results=linked_data
|
|
310
|
+
)
|
|
311
|
+
return normalizer.data
|
|
312
|
+
|
|
313
|
+
def test_normalizer_handles_bad_inputs():
|
|
314
|
+
linker_res = get_linker_res_example(
|
|
315
|
+
"oneFound.json")
|
|
316
|
+
|
|
317
|
+
# pop the leader field to simulate record without leader
|
|
318
|
+
linker_res["linked_info"][0]["json"].pop("leader", None)
|
|
319
|
+
_run_normalizer([linker_res])
|
|
320
|
+
|
|
321
|
+
# make fields empty to simulate a record with no fields
|
|
322
|
+
linker_res["linked_info"][0]["json"]["fields"] = []
|
|
323
|
+
_run_normalizer([linker_res])
|
|
324
|
+
|
|
325
|
+
# pop the fields to simulate a record with no fields
|
|
326
|
+
linker_res["linked_info"][0]["json"].pop("fields", None)
|
|
327
|
+
_run_normalizer([linker_res])
|
|
328
|
+
|
|
329
|
+
inputs = ["", None, [], {}, 123]
|
|
330
|
+
|
|
331
|
+
_run_normalizer(inputs)
|
|
305
332
|
|
|
306
333
|
def test_matching_sierra_record_viaf_id_found():
|
|
307
334
|
"""normkirjelt leitakse VIAF ID, vajadusel normi asukoht, kus see ID sisaldub."""
|
rara_tools-0.7.0/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.7.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_parsers/organization_parser.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/parsers/marc_records/organization_record.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|