rara-tools 0.7.7__py3-none-any.whl → 0.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- rara_tools/constants/normalizers.py +3 -1
- rara_tools/digar_schema_converter.py +29 -4
- rara_tools/normalizers/authorities.py +32 -25
- rara_tools/normalizers/base.py +111 -88
- rara_tools/normalizers/bibs.py +13 -26
- rara_tools/normalizers/reader.py +1 -1
- rara_tools/normalizers/viaf.py +5 -2
- {rara_tools-0.7.7.dist-info → rara_tools-0.7.9.dist-info}/METADATA +1 -1
- {rara_tools-0.7.7.dist-info → rara_tools-0.7.9.dist-info}/RECORD +12 -12
- {rara_tools-0.7.7.dist-info → rara_tools-0.7.9.dist-info}/WHEEL +0 -0
- {rara_tools-0.7.7.dist-info → rara_tools-0.7.9.dist-info}/licenses/LICENSE.md +0 -0
- {rara_tools-0.7.7.dist-info → rara_tools-0.7.9.dist-info}/top_level.txt +0 -0
|
@@ -241,8 +241,8 @@ class DIGARSchemaConverter:
|
|
|
241
241
|
min_language_ratio: float = 0.2,
|
|
242
242
|
convert_ratio: bool = False
|
|
243
243
|
) -> NoReturn:
|
|
244
|
-
""" Initialize DIGARSchemaConverter object.
|
|
245
|
-
|
|
244
|
+
""" Initialize DIGARSchemaConverter object.
|
|
245
|
+
|
|
246
246
|
Parameters
|
|
247
247
|
----------
|
|
248
248
|
digitizer_output: dict
|
|
@@ -261,7 +261,7 @@ class DIGARSchemaConverter:
|
|
|
261
261
|
be added to the final output.
|
|
262
262
|
convert_ratio: bool
|
|
263
263
|
If enabled, all ratios are converted into percentages.
|
|
264
|
-
|
|
264
|
+
|
|
265
265
|
"""
|
|
266
266
|
self.__digitizer_output: dict = digitizer_output
|
|
267
267
|
self.__min_language_ratio: float = min_language_ratio
|
|
@@ -280,6 +280,7 @@ class DIGARSchemaConverter:
|
|
|
280
280
|
self.__dc_origin: dict = {}
|
|
281
281
|
self.__dc_identifier: List[dict] = []
|
|
282
282
|
self.__doc_id: str = ""
|
|
283
|
+
self.__page_count: int = None
|
|
283
284
|
|
|
284
285
|
self.__doc_schemas = DocSchemas(
|
|
285
286
|
doc_meta=self.doc_meta,
|
|
@@ -303,6 +304,28 @@ class DIGARSchemaConverter:
|
|
|
303
304
|
page_number = _first_segment.get("page")
|
|
304
305
|
return page_number
|
|
305
306
|
|
|
307
|
+
def _add_dummy_pages(self, docs: List[dict]):
|
|
308
|
+
for doc in docs:
|
|
309
|
+
if not doc.get("page"):
|
|
310
|
+
doc["page"] = self.dummy_page
|
|
311
|
+
return docs
|
|
312
|
+
|
|
313
|
+
@property
|
|
314
|
+
def dummy_page(self) -> int:
|
|
315
|
+
""" Get page number to add for images,
|
|
316
|
+
if actual page is missing. Currently returns
|
|
317
|
+
a new (non-existing) final page.
|
|
318
|
+
"""
|
|
319
|
+
return self.page_count+1
|
|
320
|
+
|
|
321
|
+
@property
|
|
322
|
+
def page_count(self) -> int:
|
|
323
|
+
""" Returns total page count of the document.
|
|
324
|
+
"""
|
|
325
|
+
if not self.__page_count:
|
|
326
|
+
self.__page_count = self.__digitizer_output.get("doc_meta", {}).get("pages", {}).get("count", 0)
|
|
327
|
+
return self.__page_count
|
|
328
|
+
|
|
306
329
|
@property
|
|
307
330
|
def doc_id(self) -> str:
|
|
308
331
|
""" Retrieves document ID to use for generating
|
|
@@ -327,7 +350,8 @@ class DIGARSchemaConverter:
|
|
|
327
350
|
@property
|
|
328
351
|
def images(self) -> List[dict]:
|
|
329
352
|
if not self.__images:
|
|
330
|
-
|
|
353
|
+
images = self.__digitizer_output.get("images")
|
|
354
|
+
self.__images = self._add_dummy_pages(images)
|
|
331
355
|
return self.__images
|
|
332
356
|
|
|
333
357
|
@property
|
|
@@ -344,6 +368,7 @@ class DIGARSchemaConverter:
|
|
|
344
368
|
mapped[text["start_page"]]["texts"].append(text)
|
|
345
369
|
for img in self.images:
|
|
346
370
|
mapped[img["page"]]["images"].append(img)
|
|
371
|
+
#print(mapped.items())
|
|
347
372
|
|
|
348
373
|
self.__page_mappings = [
|
|
349
374
|
v for k, v in sorted(list(mapped.items()), key=lambda x: x[0])
|
|
@@ -11,13 +11,15 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
|
|
|
11
11
|
""" Normalize authorities records """
|
|
12
12
|
|
|
13
13
|
def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
|
|
14
|
-
ALLOW_EDIT_FIELDS: List[str] = [
|
|
15
|
-
|
|
16
|
-
REPEATABLE_FIELDS: List[str] = ["024", "035", "400", "670"]):
|
|
14
|
+
ALLOW_EDIT_FIELDS: List[str] = ["008", "925"],
|
|
15
|
+
REPEATABLE_FIELDS: List[str] = ["024", "035", "400", "670", "667"]):
|
|
17
16
|
|
|
18
17
|
super().__init__(linking_results, sierra_data)
|
|
19
18
|
self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
|
|
20
19
|
self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
|
|
20
|
+
self.records_extra_data = []
|
|
21
|
+
self.sierra_data = sierra_data
|
|
22
|
+
self.records = self._setup_records(linking_results, sierra_data)
|
|
21
23
|
|
|
22
24
|
def _normalize_sierra(self, record: Record, sierraID: str) -> None:
|
|
23
25
|
|
|
@@ -26,7 +28,6 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
|
|
|
26
28
|
fields = [
|
|
27
29
|
Field(
|
|
28
30
|
tag="008",
|
|
29
|
-
indicators=EMPTY_INDICATORS,
|
|
30
31
|
data=f"{self.current_timestamp()}{suffix_008}"
|
|
31
32
|
),
|
|
32
33
|
|
|
@@ -50,21 +51,24 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
|
|
|
50
51
|
return record
|
|
51
52
|
|
|
52
53
|
def _add_birth_and_death_dates(self, record: Record, viaf_record: VIAFRecord) -> None:
|
|
54
|
+
|
|
55
|
+
formatted_birth_date = self._format_date(viaf_record.birth_date)
|
|
56
|
+
formatted_death_date = self._format_date(viaf_record.death_date) if viaf_record.death_date != 0 else ""
|
|
57
|
+
|
|
58
|
+
birth_date = self.get_subfield(
|
|
59
|
+
record, "046", "f", formatted_birth_date)
|
|
60
|
+
death_date = self.get_subfield(
|
|
61
|
+
record, "046", "g", formatted_death_date)
|
|
62
|
+
|
|
53
63
|
subfields_046 = [
|
|
54
|
-
Subfield("f",
|
|
55
|
-
|
|
56
|
-
Subfield("g", self.get_subfield(
|
|
57
|
-
record, "046", "g", viaf_record.death_date)),
|
|
58
|
-
Subfield("s", self.get_subfield(
|
|
59
|
-
record, "046", "s", viaf_record.activity_start)),
|
|
60
|
-
Subfield("t", self.get_subfield(
|
|
61
|
-
record, "046", "t", viaf_record.activity_end)),
|
|
64
|
+
Subfield("f", birth_date),
|
|
65
|
+
Subfield("g", death_date),
|
|
62
66
|
]
|
|
63
67
|
|
|
64
68
|
self._add_fields_to_record(
|
|
65
69
|
record, [Field(tag="046", indicators=EMPTY_INDICATORS, subfields=subfields_046)])
|
|
66
70
|
|
|
67
|
-
def
|
|
71
|
+
def _add_viaf_url_or_isni(self, record: Record, viaf_record: VIAFRecord) -> None:
|
|
68
72
|
# TODO 024. will be used to store KRATT KATA ID. Just generate one?
|
|
69
73
|
viaf_url = f"https://viaf.org/viaf/{viaf_record.viaf_id}"
|
|
70
74
|
|
|
@@ -80,17 +84,20 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
|
|
|
80
84
|
self._add_fields_to_record(record, [field])
|
|
81
85
|
|
|
82
86
|
def _add_nationality(self, record: Record, viaf_record: VIAFRecord) -> None:
|
|
87
|
+
""" Non-repeatable field 043 - adds ee only if is estonian nationality and
|
|
88
|
+
the records does not have the field already."""
|
|
89
|
+
|
|
90
|
+
is_person_est = self._is_person_est_nationality(viaf_record)
|
|
91
|
+
|
|
92
|
+
if is_person_est:
|
|
93
|
+
fields = [
|
|
94
|
+
Field(
|
|
95
|
+
tag="043",
|
|
96
|
+
indicators=EMPTY_INDICATORS,
|
|
97
|
+
subfields=[Subfield("c", "ee")])
|
|
98
|
+
]
|
|
83
99
|
|
|
84
|
-
|
|
85
|
-
Field(
|
|
86
|
-
tag="043",
|
|
87
|
-
indicators=EMPTY_INDICATORS,
|
|
88
|
-
subfields=[
|
|
89
|
-
Subfield("c", "ee")
|
|
90
|
-
] if self._is_person_est_nationality(viaf_record) else []
|
|
91
|
-
)]
|
|
92
|
-
|
|
93
|
-
self._add_fields_to_record(record, fields)
|
|
100
|
+
self._add_fields_to_record(record, fields)
|
|
94
101
|
|
|
95
102
|
def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord) -> None:
|
|
96
103
|
""""
|
|
@@ -107,13 +114,13 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
|
|
|
107
114
|
return
|
|
108
115
|
|
|
109
116
|
self._add_nationality(record, viaf_record)
|
|
110
|
-
self.
|
|
117
|
+
self._add_viaf_url_or_isni(record, viaf_record)
|
|
111
118
|
self._add_birth_and_death_dates(record, viaf_record)
|
|
112
119
|
self._add_author(record, viaf_record)
|
|
113
120
|
|
|
114
121
|
def _normalize_record(self, record: Record, sierraID: str,
|
|
115
122
|
viaf_record: VIAFRecord, is_editing_existing_record: bool) -> Record:
|
|
116
|
-
|
|
123
|
+
|
|
117
124
|
self._normalize_sierra(record, sierraID)
|
|
118
125
|
self._normalize_viaf(record, viaf_record)
|
|
119
126
|
|
rara_tools/normalizers/base.py
CHANGED
|
@@ -7,9 +7,12 @@ from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
|
|
|
7
7
|
from rara_tools.constants.normalizers import (
|
|
8
8
|
DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
|
|
9
9
|
VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY,
|
|
10
|
-
EMPTY_INDICATORS
|
|
10
|
+
EMPTY_INDICATORS, YYMMDD_FORMAT, YY_DD_FORMAT
|
|
11
11
|
)
|
|
12
12
|
from glom import glom
|
|
13
|
+
from dateutil import parser
|
|
14
|
+
from datetime import date
|
|
15
|
+
|
|
13
16
|
import logging
|
|
14
17
|
import json
|
|
15
18
|
|
|
@@ -18,7 +21,7 @@ logger = logging.getLogger(__name__)
|
|
|
18
21
|
|
|
19
22
|
class RecordNormalizer:
|
|
20
23
|
"""
|
|
21
|
-
Base class
|
|
24
|
+
Base class for normalizing different record types corresponding classes have been created.
|
|
22
25
|
By default existing record fields will not be changed, unless included in ALLOW_EDIT_FIELDS. If a field
|
|
23
26
|
included in the normalization is not present, it will be added to the record. If under REPEATABLE_FIELDS.
|
|
24
27
|
a new record field is added.
|
|
@@ -30,72 +33,15 @@ class RecordNormalizer:
|
|
|
30
33
|
"""
|
|
31
34
|
|
|
32
35
|
def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
|
|
33
|
-
ALLOW_EDIT_FIELDS: List[str] = ["
|
|
36
|
+
ALLOW_EDIT_FIELDS: List[str] = ["925"], REPEATABLE_FIELDS: List[str] = ["667"]):
|
|
34
37
|
|
|
35
38
|
# Include, if will replace existing field
|
|
36
39
|
self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
|
|
37
40
|
# include, if should be added alongside existing fields
|
|
38
41
|
self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
|
|
39
|
-
|
|
40
|
-
self.
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
|
|
44
|
-
"""Setup initial MARC records and data.
|
|
45
|
-
|
|
46
|
-
For linked entities:
|
|
47
|
-
1. Try to get single linked normalized record from KATA elastic. If more than one found, skip.
|
|
48
|
-
2. If 0 matches, search from VIAF and if 1 result found, create a new authority record from the data.
|
|
49
|
-
3. If none or more than one responses found, use only Classificator data (coming from Linker?).
|
|
50
|
-
|
|
51
|
-
for SIERRA records: normalize.
|
|
52
|
-
"""
|
|
53
|
-
linked_records = []
|
|
54
|
-
|
|
55
|
-
for linked in linking_results:
|
|
56
|
-
entity = linked.get("original_entity")
|
|
57
|
-
try:
|
|
58
|
-
linked_info = linked.get("linked_info", [])
|
|
59
|
-
linked_num = len(linked_info)
|
|
60
|
-
|
|
61
|
-
if not linked_info:
|
|
62
|
-
# new record will be created
|
|
63
|
-
logger.info(
|
|
64
|
-
f"No linked entities found for {entity}")
|
|
65
|
-
continue
|
|
66
|
-
|
|
67
|
-
if linked_num == 1:
|
|
68
|
-
linked = linked_info[0]
|
|
69
|
-
linked_records.append(linked.get("json", {}))
|
|
70
|
-
self.records_extra_data.append({
|
|
71
|
-
"entity": entity,
|
|
72
|
-
"viaf": linked.get("viaf", {}),
|
|
73
|
-
"type": "linked",
|
|
74
|
-
"edited": True
|
|
75
|
-
})
|
|
76
|
-
else:
|
|
77
|
-
# new record will be created
|
|
78
|
-
logger.info(
|
|
79
|
-
f"Multiple linked entities found for {entity}")
|
|
80
|
-
|
|
81
|
-
except Exception as e:
|
|
82
|
-
logger.error(f"Error processing entity {entity}: {e}")
|
|
83
|
-
|
|
84
|
-
self.records_extra_data.extend(
|
|
85
|
-
{
|
|
86
|
-
"sierraID": obj.get("sierraID"),
|
|
87
|
-
"type": "sierra",
|
|
88
|
-
"edited": True
|
|
89
|
-
}
|
|
90
|
-
for obj in (sierra_data or [])
|
|
91
|
-
)
|
|
92
|
-
|
|
93
|
-
all_records = linked_records + (sierra_data or [])
|
|
94
|
-
|
|
95
|
-
return SafeJSONReader(
|
|
96
|
-
json.dumps(all_records, ensure_ascii=False),
|
|
97
|
-
)
|
|
98
|
-
|
|
42
|
+
# leader applied to new records
|
|
43
|
+
self.DEFAULT_LEADER = "01682nz a2200349n 4500" # must be 24 digits
|
|
44
|
+
|
|
99
45
|
def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
|
|
100
46
|
"""Setup initial MARC records and data.
|
|
101
47
|
|
|
@@ -114,9 +60,34 @@ class RecordNormalizer:
|
|
|
114
60
|
linked_info = linked.get("linked_info", [])
|
|
115
61
|
|
|
116
62
|
if not isinstance(linked_info, list) or not linked_info:
|
|
63
|
+
# No linked entities found, create new record
|
|
64
|
+
logger.info(
|
|
65
|
+
f"No linked entities found for {entity}, Creating new record.")
|
|
66
|
+
linked_records.append({
|
|
67
|
+
"leader": self.DEFAULT_LEADER,
|
|
68
|
+
"fields": []
|
|
69
|
+
})
|
|
70
|
+
self.records_extra_data.append({
|
|
71
|
+
"entity": entity,
|
|
72
|
+
"edited": False
|
|
73
|
+
})
|
|
117
74
|
continue
|
|
75
|
+
|
|
76
|
+
if len(linked_info) > 1:
|
|
77
|
+
# Multiple linked entities found, create new record
|
|
78
|
+
logger.info(
|
|
79
|
+
f"Multiple linked entities found for {entity}. Creating new record.")
|
|
80
|
+
linked_records.append({
|
|
81
|
+
"leader": self.DEFAULT_LEADER,
|
|
82
|
+
"fields": []
|
|
83
|
+
})
|
|
84
|
+
self.records_extra_data.append({
|
|
85
|
+
"entity": entity,
|
|
86
|
+
"edited": False
|
|
87
|
+
})
|
|
88
|
+
continue
|
|
118
89
|
|
|
119
|
-
|
|
90
|
+
elif len(linked_info) == 1:
|
|
120
91
|
linked_item = linked_info[0]
|
|
121
92
|
if not isinstance(linked_item, dict):
|
|
122
93
|
continue
|
|
@@ -128,7 +99,8 @@ class RecordNormalizer:
|
|
|
128
99
|
"type": "linked",
|
|
129
100
|
"edited": True
|
|
130
101
|
})
|
|
131
|
-
|
|
102
|
+
continue
|
|
103
|
+
|
|
132
104
|
self.records_extra_data.extend(
|
|
133
105
|
{
|
|
134
106
|
"sierraID": obj.get("sierraID"),
|
|
@@ -138,25 +110,25 @@ class RecordNormalizer:
|
|
|
138
110
|
for obj in (sierra_data or [])
|
|
139
111
|
if isinstance(obj, dict)
|
|
140
112
|
)
|
|
141
|
-
|
|
113
|
+
|
|
142
114
|
all_records = linked_records + (sierra_data or [])
|
|
143
|
-
|
|
115
|
+
|
|
144
116
|
return SafeJSONReader(json.dumps(all_records, ensure_ascii=False))
|
|
145
117
|
|
|
146
118
|
@staticmethod
|
|
147
119
|
def current_timestamp():
|
|
148
|
-
"""6 digit timestamp
|
|
149
|
-
return datetime.now().strftime(
|
|
120
|
+
"""6 digit timestamp, format YYMMDD"""
|
|
121
|
+
return datetime.now().strftime(YYMMDD_FORMAT)
|
|
150
122
|
|
|
151
123
|
@staticmethod
|
|
152
124
|
def current_yyyy_dd():
|
|
153
125
|
"""format of 2025-03"""
|
|
154
|
-
return datetime.now().strftime(
|
|
126
|
+
return datetime.now().strftime(YY_DD_FORMAT)
|
|
155
127
|
|
|
156
128
|
@staticmethod
|
|
157
129
|
def _is_person_est_nationality(viaf_record: VIAFRecord) -> bool:
|
|
158
|
-
return viaf_record.nationality == "ee"
|
|
159
|
-
|
|
130
|
+
return hasattr(viaf_record, 'nationality') and viaf_record.nationality == "ee"
|
|
131
|
+
|
|
160
132
|
def _is_nxx(self, field: Field, n: str):
|
|
161
133
|
""" Check if fields tag is in nxx range. """
|
|
162
134
|
return field.tag.startswith(n)
|
|
@@ -173,6 +145,22 @@ class RecordNormalizer:
|
|
|
173
145
|
def _filter_equivalent_field_not_in_record(self, record: Record, fields: List[Field]) -> bool:
|
|
174
146
|
""" filter out fields, that do not have an equivalent in the record. """
|
|
175
147
|
return filter(lambda field: not self._field_in_record(field, record), fields)
|
|
148
|
+
|
|
149
|
+
def _format_date(self, value: str) -> str:
|
|
150
|
+
if value is None:
|
|
151
|
+
return ""
|
|
152
|
+
|
|
153
|
+
if isinstance(value, (datetime, date)):
|
|
154
|
+
return value.strftime(YYMMDD_FORMAT)
|
|
155
|
+
|
|
156
|
+
try:
|
|
157
|
+
dt = parser.parse(str(value), fuzzy=True)
|
|
158
|
+
formatted_date = dt.strftime(YYMMDD_FORMAT)
|
|
159
|
+
logger.info(f"Formatted date '{formatted_date}' from value '{value}'")
|
|
160
|
+
return formatted_date
|
|
161
|
+
except Exception as e:
|
|
162
|
+
logger.info(f"Failed to format date string '{value}': {e}")
|
|
163
|
+
return ""
|
|
176
164
|
|
|
177
165
|
def get_subfield(self, record: Record, tag: str, subfield: str, default: str) -> str:
|
|
178
166
|
""" get record existing subfield value or assign a fallback value. """
|
|
@@ -220,7 +208,10 @@ class RecordNormalizer:
|
|
|
220
208
|
)
|
|
221
209
|
|
|
222
210
|
def _add_fields_to_record(self, record: Record, fields: List[Field]) -> Record:
|
|
223
|
-
|
|
211
|
+
# filter out subfields that are empty, or 0, as VIAF returns 0 for unknown dates
|
|
212
|
+
for field in fields:
|
|
213
|
+
field.subfields = [sub for sub in field.subfields if sub.value and sub.value not in ["0", 0]]
|
|
214
|
+
|
|
224
215
|
self._handle_repeatable_fields(record, *fields)
|
|
225
216
|
self._handle_editable_fields(record, *fields)
|
|
226
217
|
self._handle_default_fields(record, *fields)
|
|
@@ -247,31 +238,63 @@ class RecordNormalizer:
|
|
|
247
238
|
indicators=EMPTY_INDICATORS,
|
|
248
239
|
subfields=[
|
|
249
240
|
Subfield("a", viaf_record.name),
|
|
250
|
-
Subfield("b", viaf_record.name_type), # Is this correct??
|
|
251
|
-
Subfield("c", viaf_record.name_type) # Is this correct??
|
|
252
241
|
]
|
|
253
242
|
)
|
|
254
243
|
]
|
|
255
244
|
|
|
256
245
|
self._add_fields_to_record(record, fields)
|
|
246
|
+
|
|
247
|
+
def _move680_fields_to_667(self, record: Record) -> None:
|
|
248
|
+
""" Move existing 680 fields to 667, if any. """
|
|
249
|
+
fields_680 = record.get_fields("680")
|
|
250
|
+
if not fields_680:
|
|
251
|
+
return
|
|
252
|
+
|
|
253
|
+
fields_667 = [
|
|
254
|
+
Field(
|
|
255
|
+
tag="667",
|
|
256
|
+
indicators=EMPTY_INDICATORS,
|
|
257
|
+
subfields=field.subfields
|
|
258
|
+
) for field in fields_680
|
|
259
|
+
]
|
|
257
260
|
|
|
258
|
-
|
|
259
|
-
|
|
261
|
+
record.remove_fields("680")
|
|
262
|
+
self._add_fields_to_record(record, fields_667)
|
|
260
263
|
|
|
264
|
+
def _normalize_common(self, record: Record, is_editing_existing_record: bool) -> None:
|
|
265
|
+
"""Common logic for all normalizations.
|
|
266
|
+
- Includes note about record being created/edited.
|
|
267
|
+
- include date note with a different subfield, depending on if record is new or edited.
|
|
268
|
+
- move existing 680 fields to 667
|
|
269
|
+
"""
|
|
270
|
+
# before adding new notes
|
|
271
|
+
self._move680_fields_to_667(record)
|
|
272
|
+
|
|
261
273
|
note = "Muudetud AI poolt" if is_editing_existing_record else "Loodud AI poolt"
|
|
262
274
|
date_note = f"KRATT {self.current_yyyy_dd()}"
|
|
263
275
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
276
|
+
field_667 = Field(tag="667",
|
|
277
|
+
indicators=EMPTY_INDICATORS,
|
|
278
|
+
subfields=[Subfield("a", note)])
|
|
279
|
+
|
|
280
|
+
fields = [field_667]
|
|
281
|
+
|
|
282
|
+
if is_editing_existing_record:
|
|
283
|
+
field_925 = Field(tag="925",
|
|
284
|
+
indicators=EMPTY_INDICATORS,
|
|
285
|
+
subfields=[
|
|
286
|
+
Subfield("p", self.get_subfield(record, "925", "p", date_note))
|
|
287
|
+
])
|
|
288
|
+
fields.append(field_925)
|
|
289
|
+
|
|
290
|
+
else:
|
|
291
|
+
field_925 = Field(tag="925",
|
|
292
|
+
indicators=EMPTY_INDICATORS,
|
|
293
|
+
subfields=[
|
|
294
|
+
Subfield("t", self.get_subfield(record, "925", "t", date_note))
|
|
295
|
+
])
|
|
296
|
+
fields.append(field_925)
|
|
297
|
+
|
|
275
298
|
self._add_fields_to_record(record, fields)
|
|
276
299
|
|
|
277
300
|
return record
|
rara_tools/normalizers/bibs.py
CHANGED
|
@@ -11,32 +11,25 @@ class BibRecordNormalizer(RecordNormalizer):
|
|
|
11
11
|
""" Normalize bib records. """
|
|
12
12
|
|
|
13
13
|
def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
|
|
14
|
-
ALLOW_EDIT_FIELDS: List[str] = ["
|
|
15
|
-
REPEATABLE_FIELDS: List[str] = []):
|
|
14
|
+
ALLOW_EDIT_FIELDS: List[str] = ["008", "925"],
|
|
15
|
+
REPEATABLE_FIELDS: List[str] = ["667"]):
|
|
16
16
|
super().__init__(linking_results, sierra_data)
|
|
17
|
+
self.DEFAULT_LEADER = "00399nz a2200145n 4500" # must be 24 digits
|
|
17
18
|
self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
|
|
18
19
|
self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
|
|
20
|
+
|
|
21
|
+
self.records_extra_data = []
|
|
22
|
+
self.sierra_data = sierra_data
|
|
23
|
+
self.records = self._setup_records(linking_results, sierra_data)
|
|
19
24
|
|
|
20
25
|
def _normalize_sierra(self, record: Record) -> Record:
|
|
26
|
+
|
|
27
|
+
suffix_008 = "|||aznnnaabn || ||| "
|
|
28
|
+
|
|
21
29
|
fields = [
|
|
22
30
|
Field(
|
|
23
31
|
tag="008",
|
|
24
|
-
|
|
25
|
-
data=f"{self.current_timestamp()} | | | aznnnaabn | | | | |"
|
|
26
|
-
),
|
|
27
|
-
Field(
|
|
28
|
-
tag="046",
|
|
29
|
-
indicators=EMPTY_INDICATORS,
|
|
30
|
-
subfields=[
|
|
31
|
-
Subfield("k", "Pub date")
|
|
32
|
-
]
|
|
33
|
-
),
|
|
34
|
-
Field(
|
|
35
|
-
tag="245",
|
|
36
|
-
indicators=Indicators("1", "0"),
|
|
37
|
-
subfields=[
|
|
38
|
-
Subfield("a", "Title")
|
|
39
|
-
]
|
|
32
|
+
data=f"{self.current_timestamp()}{suffix_008}"
|
|
40
33
|
),
|
|
41
34
|
]
|
|
42
35
|
|
|
@@ -55,14 +48,8 @@ class BibRecordNormalizer(RecordNormalizer):
|
|
|
55
48
|
subfields=[
|
|
56
49
|
Subfield("a", viaf_id)
|
|
57
50
|
]
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
tag="100",
|
|
61
|
-
indicators=EMPTY_INDICATORS,
|
|
62
|
-
subfields=[
|
|
63
|
-
Subfield("a", "?")
|
|
64
|
-
]
|
|
65
|
-
)]
|
|
51
|
+
)
|
|
52
|
+
]
|
|
66
53
|
|
|
67
54
|
self._add_fields_to_record(record, fields)
|
|
68
55
|
self._add_author(record, viaf_record)
|
rara_tools/normalizers/reader.py
CHANGED
rara_tools/normalizers/viaf.py
CHANGED
|
@@ -10,6 +10,7 @@ from rara_tools.constants.normalizers import (
|
|
|
10
10
|
DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
|
|
11
11
|
VIAF_SIMILARITY_THRESHOLD, VIAF_ALLOWED_SOURCES
|
|
12
12
|
)
|
|
13
|
+
from glom import glom
|
|
13
14
|
|
|
14
15
|
import logging
|
|
15
16
|
logger = logging.getLogger(__name__)
|
|
@@ -598,11 +599,13 @@ class VIAFClient:
|
|
|
598
599
|
"""
|
|
599
600
|
logger.debug("Extracting VIAF IDs from VIAF search query results.")
|
|
600
601
|
try:
|
|
601
|
-
|
|
602
|
+
res_json = search_query_response.json()
|
|
603
|
+
records = glom(res_json, "queryResult.records.record", default=[])
|
|
604
|
+
|
|
602
605
|
except Exception as e:
|
|
603
606
|
logger.error(
|
|
604
607
|
f"Parsing records from search query " \
|
|
605
|
-
f"
|
|
608
|
+
f"failed with error: {e}."
|
|
606
609
|
)
|
|
607
610
|
records = []
|
|
608
611
|
viaf_ids = []
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
rara_tools/converters.py,sha256=a1dEMa0TwcO9UmjuSBkiuc7LGmH0d_dB6wwoTLpdZhI,4040
|
|
2
2
|
rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
|
|
3
|
-
rara_tools/digar_schema_converter.py,sha256=
|
|
3
|
+
rara_tools/digar_schema_converter.py,sha256=usrNwlbN63wTE5U56vbmyzT_SxGLXO6ZF4JwY3Lnkqg,15061
|
|
4
4
|
rara_tools/elastic.py,sha256=4D9yoyMy6AJIKwhSi2H1usffDHAh2A_IZfv5BtYnBKg,13992
|
|
5
5
|
rara_tools/exceptions.py,sha256=YQyaueUbXeTkJYFDEuN6iWTXMI3eCv5l7PxGp87vg5I,550
|
|
6
6
|
rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
|
|
@@ -12,7 +12,7 @@ rara_tools/constants/general.py,sha256=dLomRopLiHv_J_liSIGzK1A3XByydsKGIyVN8KuuN
|
|
|
12
12
|
rara_tools/constants/language_evaluator.py,sha256=3sCSaoS-zXQRY0vJ7UUMuZqbtYQD_quVVbdpgvJjE7I,124
|
|
13
13
|
rara_tools/constants/linker.py,sha256=WnOmJFTkoBMZUbBaW1uY45NTQB7FGG-dc9a_6qYTtwk,3381
|
|
14
14
|
rara_tools/constants/meta_extractor.py,sha256=iVyxycKScbrjFWLv50dRmdeHfTLOKbdyEhgUF3DyBrY,1053
|
|
15
|
-
rara_tools/constants/normalizers.py,sha256=
|
|
15
|
+
rara_tools/constants/normalizers.py,sha256=Qyi6eSCp4Gnz45xF-vOPExGXasyAoVimOAAlLj1t74s,1383
|
|
16
16
|
rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
|
|
17
17
|
rara_tools/constants/subject_indexer.py,sha256=0snyyB8IMCWXOYPXR_c0Kavq4nBiww559rdNOKjawx8,2133
|
|
18
18
|
rara_tools/core_formatters/core_formatter.py,sha256=u_Cdgv9qBcyF-XddjaRGUqAFik9OMAdSzAulXpYR7vE,4997
|
|
@@ -20,11 +20,11 @@ rara_tools/core_formatters/formatted_keyword.py,sha256=hhi6wh4ErFionjBqYsEeKGbf1
|
|
|
20
20
|
rara_tools/core_formatters/formatted_meta.py,sha256=r0RPG4eM-REPIR1DrIJnvYPQtQrzkgdvX9tvhNWjQ0Y,5250
|
|
21
21
|
rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
|
|
22
22
|
rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
|
|
23
|
-
rara_tools/normalizers/authorities.py,sha256=
|
|
24
|
-
rara_tools/normalizers/base.py,sha256=
|
|
25
|
-
rara_tools/normalizers/bibs.py,sha256=
|
|
26
|
-
rara_tools/normalizers/reader.py,sha256=
|
|
27
|
-
rara_tools/normalizers/viaf.py,sha256=
|
|
23
|
+
rara_tools/normalizers/authorities.py,sha256=wUnusRRcgIbAVoOrxiT-D0dAbIlpPph69i2DvCU7lCE,4815
|
|
24
|
+
rara_tools/normalizers/base.py,sha256=W68goAtez5TgF-7SUou77eirkepatBUSL1Aghp8Gj84,15211
|
|
25
|
+
rara_tools/normalizers/bibs.py,sha256=mxRSYq6adCwrCeJgWXJbLGKrG48Dz93vAt8Is2HzMak,2030
|
|
26
|
+
rara_tools/normalizers/reader.py,sha256=GYCkAtnsNx135w5lD-_MqCZzdHQHHPDF-pDxYj839Vo,1595
|
|
27
|
+
rara_tools/normalizers/viaf.py,sha256=C-NfbvL83ZcHVB9ICMw43wAMYKTqDTHU3ZT2mXKec00,24288
|
|
28
28
|
rara_tools/parsers/marc_parsers/base_parser.py,sha256=Kdw4aivJf2FkWgIK7pJtHtVXF_G1pjHVQ7IcFItSqy8,1649
|
|
29
29
|
rara_tools/parsers/marc_parsers/ems_parser.py,sha256=LFuhZcVwmHMcJknX9p4ZkO8RdjPdQZ4APGbw8KV6BIs,2024
|
|
30
30
|
rara_tools/parsers/marc_parsers/location_parser.py,sha256=dSU9dQoGV5z0ajhLI1bn3AAghkOr79qKIrX7sO0_4lA,1873
|
|
@@ -39,8 +39,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
|
|
|
39
39
|
rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
|
|
40
40
|
rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
|
|
41
41
|
rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
|
|
42
|
-
rara_tools-0.7.
|
|
43
|
-
rara_tools-0.7.
|
|
44
|
-
rara_tools-0.7.
|
|
45
|
-
rara_tools-0.7.
|
|
46
|
-
rara_tools-0.7.
|
|
42
|
+
rara_tools-0.7.9.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
43
|
+
rara_tools-0.7.9.dist-info/METADATA,sha256=4AzmBZEA0nqC1i1ypTLMDgz8wSzQvsCqKPsKBNaGEi8,4079
|
|
44
|
+
rara_tools-0.7.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
45
|
+
rara_tools-0.7.9.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
46
|
+
rara_tools-0.7.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|