rara-tools 0.7.12__tar.gz → 0.7.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of rara-tools might be problematic.
- {rara_tools-0.7.12/rara_tools.egg-info → rara_tools-0.7.14}/PKG-INFO +1 -1
- rara_tools-0.7.14/VERSION +1 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/constants/normalizers.py +1 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/core_formatters/formatted_meta.py +11 -1
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/normalizers/authorities.py +34 -25
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/normalizers/base.py +112 -19
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/normalizers/bibs.py +25 -34
- {rara_tools-0.7.12 → rara_tools-0.7.14/rara_tools.egg-info}/PKG-INFO +1 -1
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_normalization.py +116 -5
- rara_tools-0.7.12/VERSION +0 -1
- {rara_tools-0.7.12 → rara_tools-0.7.14}/LICENSE.md +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/README.md +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/pyproject.toml +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/constants/__init__.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/constants/digitizer.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/constants/general.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/constants/language_evaluator.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/constants/linker.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/constants/meta_extractor.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/constants/parsers.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/constants/subject_indexer.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/converters.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/core_formatters/core_formatter.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/core_formatters/formatted_keyword.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/core_formatters/formatted_object.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/decorators.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/digar_schema_converter.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/elastic.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/exceptions.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/normalizers/__init__.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/normalizers/reader.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/normalizers/viaf.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_records/base_record.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_records/ems_record.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_records/organization_record.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_records/person_record.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/marc_records/title_record.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/tools/marc_converter.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/s3.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/task_reporter.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools/utils.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools.egg-info/SOURCES.txt +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools.egg-info/dependency_links.txt +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools.egg-info/requires.txt +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/rara_tools.egg-info/top_level.txt +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/requirements.txt +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/setup.cfg +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_digar_schema_converter.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_elastic.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_elastic_vector_and_search_operations.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_entity_normalizers.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_formatters.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_marc_parsers.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_s3_exceptions.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_s3_file_operations.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_sierra_converters.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_task_reporter.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_utils.py +0 -0
- {rara_tools-0.7.12 → rara_tools-0.7.14}/tests/test_viaf_client.py +0 -0
rara_tools-0.7.14/VERSION (new file)
@@ -0,0 +1 @@
+0.7.14
rara_tools/core_formatters/formatted_meta.py
@@ -56,7 +56,17 @@ class FormattedAuthor(FormattedObject):
 
     @property
     def name(self) -> str:
-
+        """ Force all names into format <last_name>, <first_name>.
+        """
+        if "," in self.entity:
+            name = self.entity
+        else:
+            name_tokens = self.entity.rsplit(" ", 1)
+            if len(name_tokens) == 2:
+                name = f"{name_tokens[1]}, {name_tokens[0]}"
+            else:
+                name = self.entity
+        return name
 
     @property
     def name_order(self) -> str:
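The new name property in FormattedAuthor forces every author name into "<last_name>, <first_name>" order. A minimal standalone sketch of the same splitting rule, for illustration only (format_name is not a function in the package):

def format_name(entity: str) -> str:
    """Force a name into '<last_name>, <first_name>' order (same rule as FormattedAuthor.name)."""
    if "," in entity:
        return entity  # already last-name-first
    name_tokens = entity.rsplit(" ", 1)  # split off the final token as the surname
    if len(name_tokens) == 2:
        return f"{name_tokens[1]}, {name_tokens[0]}"
    return entity  # single-token names are left as-is

assert format_name("Jaan Tamm") == "Tamm, Jaan"
assert format_name("Tamm, Jaan") == "Tamm, Jaan"
assert format_name("Koidula") == "Koidula"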
rara_tools/normalizers/authorities.py
@@ -11,45 +11,54 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
     """ Normalize authorities records """
 
     def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
+                 classified_fields: List[str] = [],
                  ALLOW_EDIT_FIELDS: List[str] = ["008", "925"],
-                 REPEATABLE_FIELDS: List[str] = ["024", "035", "400", "
+                 REPEATABLE_FIELDS: List[str] = ["024", "035", "400", "667"]):
 
-        super().__init__(linking_results, sierra_data)
+        super().__init__(linking_results, sierra_data, classified_fields)
         self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
         self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
         self.records_extra_data = []
         self.sierra_data = sierra_data
-        self.records = self._setup_records(linking_results, sierra_data)
+        self.records = self._setup_records(linking_results, sierra_data, classified_fields)
 
-    def _normalize_sierra(self, record: Record, sierraID: str) ->
+    def _normalize_sierra(self, record: Record, sierraID: str, is_editing_existing_record: bool) -> Record:
+        """008 updated only for new records, unless editing where prefix is preserved."""
 
         suffix_008 = "|n|adnnnaabn || |a| "
 
+        if is_editing_existing_record:
+            # Try to reuse prefix from existing 008 field if present
+            existing_008 = next((f for f in record.fields if f.tag == "008" and hasattr(f, "data")), None)
+            if existing_008 and len(existing_008.data) >= 6:
+                prefix = existing_008.data[:6]
+            else:
+                prefix = self.current_timestamp()  # fallback if missing
+        else:
+            prefix = self.current_timestamp()
+
         fields = [
             Field(
                 tag="008",
-                data=f"{
-            )
-
-            Field(
-                tag="040",
-                indicators=EMPTY_INDICATORS,
-                subfields=[
-                    # if record subfield exists already, use that value. if not, use hardcoded value
-                    Subfield("a", self.get_subfield(
-                        record, "040", "a", "ErESTER")),
-                    Subfield("b", self.get_subfield(
-                        record, "040", "b", "est")),
-                    Subfield("c", self.get_subfield(
-                        record, "040", "c", "ErEster")),
-                ]
-            ),
+                data=f"{prefix}{suffix_008}"
+            )
         ]
 
-
+        field_040 = Field(
+            tag="040",
+            indicators=EMPTY_INDICATORS,
+            subfields=[
+                Subfield("a", self.get_subfield(record, "040", "a", "ErESTER")),
+                Subfield("b", self.get_subfield(record, "040", "b", "est")),
+                Subfield("c", self.get_subfield(record, "040", "c", "ErEster")),
+            ]
+        )
+        fields.append(field_040)
 
+        self._add_fields_to_record(record, fields)
+
         return record
-
+
     def _add_birth_and_death_dates(self, record: Record, viaf_record: VIAFRecord) -> None:
 
         formatted_birth_date = self._format_date(viaf_record.birth_date)
@@ -72,7 +81,7 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
             record, [Field(tag="046", indicators=EMPTY_INDICATORS, subfields=subfields_046)])
 
     def _add_viaf_url_or_isni(self, record: Record, viaf_record: VIAFRecord) -> None:
-        viaf_url =
+        viaf_url = viaf_record.viaf_url
 
         subfields = [Subfield("0", self.get_subfield(
             record, "024", "0", viaf_url))]
@@ -102,7 +111,7 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
         self._add_fields_to_record(record, fields)
 
     def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord) -> None:
-        """
+        """
         Attempts to enrich the record with VIAF data.
 
         024 - repeatable field, add VIAF URL to subfield 0. If ISNI found, add to subfield 2
@@ -124,7 +133,7 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
                           is_editing_existing_record: bool,
                           original_entity: str) -> Record:
 
-        self._normalize_sierra(record, sierraID)
+        self._normalize_sierra(record, sierraID, is_editing_existing_record)
        self._normalize_viaf(record, viaf_record)
 
         return record
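Both normalizers now assemble the 008 control field from a six-character date prefix plus a fixed suffix; when an existing record is edited, the prefix is copied from the record's current 008 so the original creation date survives. A rough sketch of that decision, assuming pymarc-style control fields that expose a .data string. build_008 and ControlField are illustrative stand-ins, and current_timestamp here mirrors the class method of the same name:

from datetime import datetime

def current_timestamp() -> str:
    # YYMMDD, used for positions 00-05 of the 008 field
    return datetime.now().strftime("%y%m%d")

class ControlField:
    """Tiny stand-in for a pymarc control field (illustrative only)."""
    def __init__(self, tag: str, data: str):
        self.tag, self.data = tag, data

def build_008(record_fields, suffix_008: str, is_editing_existing_record: bool) -> str:
    existing_008 = next((f for f in record_fields if f.tag == "008" and hasattr(f, "data")), None)
    if is_editing_existing_record and existing_008 and len(existing_008.data) >= 6:
        prefix = existing_008.data[:6]  # keep the original creation date
    else:
        prefix = current_timestamp()    # new record, or nothing usable to reuse
    return f"{prefix}{suffix_008}"

# Editing: the 990107 prefix from the stored record is preserved.
existing = [ControlField("008", "990107|||aznnnaabn")]
assert build_008(existing, "|||aznnnaabn", is_editing_existing_record=True).startswith("990107")
# New record: the prefix is today's date.
assert build_008([], "|||aznnnaabn", is_editing_existing_record=False)[:6] == current_timestamp()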
rara_tools/normalizers/base.py
@@ -7,7 +7,7 @@ from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
 from rara_tools.constants.normalizers import (
     DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
     VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY,
-    EMPTY_INDICATORS, YYMMDD_FORMAT, YY_DD_FORMAT
+    EMPTY_INDICATORS, YYMMDD_FORMAT, YY_DD_FORMAT, YYYYMMDD_FORMAT
 )
 from glom import glom
 from dateutil import parser
@@ -29,10 +29,12 @@ class RecordNormalizer:
     Args:
         sierra_data: Optionally, can normalize records from SIERRA. Must be in specific format,
             e.g converted with SierraResponseConverter. examples at: tests/sierra/output
+        classified_fields: Optionally can include marc fields, will follow the rules of the tag number.
+            Useful to send classified data from core.
         entities: List of Full names (str). If included, will use NormLinker to match with normalized records on KATA elastic.
     """
 
-    def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
+    def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [], classified_fields: List[str] = [],
                  ALLOW_EDIT_FIELDS: List[str] = ["925"], REPEATABLE_FIELDS: List[str] = ["667"]):
 
         # Include, if will replace existing field
@@ -41,17 +43,16 @@ class RecordNormalizer:
         self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
         # leader applied to new records
         self.DEFAULT_LEADER = "01682nz a2200349n 4500"  # must be 24 digits
-
-    def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
-        """Setup initial MARC records and data.
 
-
-
-
-
+    def _setup_records(self, linking_results: List[dict], sierra_data: List[dict], classified_fields: List[str] = []) -> JSONReader:
+        """Setup initial MARC records and data.
+
+        If no linked entities or more than one linked entity found, we create a new record.
+        If one linked entity found, we create an updated record from the linked entity data.
         """
+
         linked_records = []
-
+
         for linked in linking_results or []:
             if not isinstance(linked, dict):
                 continue
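The _setup_records docstring above spells out the branching rule: zero or several linked entities mean a brand-new record, while exactly one linked entity means the existing record is updated from the linked data. A tiny illustrative helper (not part of the package) capturing just that rule:

def should_update_existing(linked_info: list) -> bool:
    """Only a single linked entity leads to updating an existing record."""
    return isinstance(linked_info, list) and len(linked_info) == 1

assert should_update_existing([{"json": {}, "viaf": {}}]) is True   # one match -> update
assert should_update_existing([]) is False                          # nothing linked -> new record
assert should_update_existing([{}, {}]) is False                    # ambiguous -> new record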
@@ -69,6 +70,7 @@ class RecordNormalizer:
             })
             self.records_extra_data.append({
                 "entity": entity,
+                "classified_fields": classified_fields,
                 "edited": False
             })
             continue
@@ -83,6 +85,7 @@ class RecordNormalizer:
             })
             self.records_extra_data.append({
                 "entity": entity,
+                "classified_fields": classified_fields,
                 "edited": False
             })
             continue
@@ -91,11 +94,13 @@ class RecordNormalizer:
             linked_item = linked_info[0]
             if not isinstance(linked_item, dict):
                 continue
-
+
             linked_records.append(linked_item.get("json", {}))
+
             self.records_extra_data.append({
                 "entity": entity,
                 "viaf": linked_item.get("viaf", {}),
+                "classified_fields": classified_fields,
                 "type": "linked",
                 "edited": True
             })
@@ -128,6 +133,20 @@ class RecordNormalizer:
     @staticmethod
     def _is_person_est_nationality(viaf_record: VIAFRecord) -> bool:
         return hasattr(viaf_record, 'nationality') and viaf_record.nationality == "ee"
+
+    def get_formatted_dates(self, viaf_record: VIAFRecord) -> str | None:
+        """ Get birth and death date in the form 1878-1940. If only birth date is present, return 1878-.
+        If no dates, return empty string.
+        """
+        birth_date = self._extract_year(viaf_record.birth_date)
+        death_date = self._extract_year(viaf_record.death_date) if viaf_record.death_date != 0 else ""
+
+        if birth_date and death_date:
+            return f"{birth_date}-{death_date}"
+        elif birth_date:
+            return f"{birth_date}-"
+        else:
+            return None
 
     def _is_nxx(self, field: Field, n: str):
         """ Check if fields tag is in nxx range. """
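get_formatted_dates collapses the VIAF birth and death dates into the display form later written to 100$d: "1878-1940" when both years are known, "1878-" when only the birth year is, and None when neither is. A small standalone illustration of that formatting, using plain year strings instead of a VIAFRecord (format_life_dates is an illustrative name):

def format_life_dates(birth_year: str, death_year: str) -> str | None:
    """Combine extracted years the way get_formatted_dates does for 100$d."""
    if birth_year and death_year:
        return f"{birth_year}-{death_year}"
    if birth_year:
        return f"{birth_year}-"
    return None

assert format_life_dates("1878", "1940") == "1878-1940"
assert format_life_dates("1986", "") == "1986-"   # matches the "1986-" expectation in the tests below
assert format_life_dates("", "") is None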
@@ -146,13 +165,29 @@ class RecordNormalizer:
         """ filter out fields, that do not have an equivalent in the record. """
         return filter(lambda field: not self._field_in_record(field, record), fields)
 
+    def _extract_year(self, value: str) -> str:
+        if value is None:
+            return ""
+
+        if isinstance(value, (datetime, date)):
+            return str(value.year)
+
+        try:
+            dt = parser.parse(str(value), fuzzy=True)
+            parsed_year = str(dt.year)
+            logger.info(f"Extracted year '{parsed_year}' from value '{value}'")
+            return parsed_year
+        except Exception as e:
+            logger.info(f"Failed to extract year string '{value}': {e}")
+            return ""
+
     def _format_date(self, value: str) -> str:
 
         if not value:
             return ""
 
         if isinstance(value, (datetime, date)):
-            return value.strftime(
+            return value.strftime(YYYYMMDD_FORMAT)
 
         val = str(value).strip()
 
@@ -165,7 +200,7 @@ class RecordNormalizer:
             return dt.strftime("%Y")  # YYYY
         if len(val) in (6, 7):  # YYYYMM or YYYY-MM
             return dt.strftime("%Y%m")  # YYYYMM
-        return dt.strftime(
+        return dt.strftime(YYYYMMDD_FORMAT)  # YYYYMMDD
 
     def get_subfield(self, record: Record, tag: str, subfield: str, default: str) -> str:
         """ get record existing subfield value or assign a fallback value. """
@@ -179,7 +214,7 @@ class RecordNormalizer:
         return subfields[0] if subfields else default
 
     def _handle_default_fields(self, record: Record, *fields: List[Field]) -> Record:
-        """ add field to record iff not present already """
+        """ Default behavior - add field to record iff not present already """
         record.add_field(
             *filter(lambda field: field.tag not in [
                 f.tag for f in record.get_fields()], fields)
@@ -213,6 +248,7 @@ class RecordNormalizer:
         )
 
     def _add_fields_to_record(self, record: Record, fields: List[Field]) -> Record:
+
         cleaned_fields = []
 
         for field in fields:
@@ -242,7 +278,7 @@ class RecordNormalizer:
         return record
 
     def _add_author(self, record: Record, viaf_record: VIAFRecord) -> Optional[Field]:
-
+
         existing_author: Optional[Field] = record.get(
             "100") or record.get("110") or record.get("111")
         if existing_author:
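_extract_year relies on dateutil's fuzzy parser, so it accepts full ISO dates, bare years and datetime/date objects alike, falling back to an empty string when parsing fails. The same idea outside the class, with the logging omitted (extract_year is an illustrative name):

from datetime import date, datetime
from dateutil import parser

def extract_year(value) -> str:
    if value is None:
        return ""
    if isinstance(value, (datetime, date)):
        return str(value.year)
    try:
        return str(parser.parse(str(value), fuzzy=True).year)
    except (ValueError, OverflowError):
        return ""

assert extract_year("1986-11-26") == "1986"
assert extract_year(date(1878, 2, 2)) == "1878"
assert extract_year(None) == ""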
@@ -266,9 +302,42 @@ class RecordNormalizer:
             ]
         )
         ]
+
+        if viaf_record:
+            author_dates = self.get_formatted_dates(viaf_record)
+            if author_dates:
+                fields[0].add_subfield("d", author_dates)
 
         self._add_fields_to_record(record, fields)
 
+        if viaf_record:
+            self._include_name_variations(record, viaf_record)
+
+    def _include_name_variations(self, record: Record, viaf_record: VIAFRecord) -> None:
+        """ Include name variations from VIAF record as 400|t fields """
+
+        if not viaf_record or not viaf_record.name_variations:
+            return
+
+        existing_name_variations = record.get_fields("400")
+        existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("t")]
+
+        fields = []
+
+        for variation in viaf_record.name_variations:
+            if variation not in existing_variations:
+                fields.append(
+                    Field(
+                        tag="400",
+                        indicators=EMPTY_INDICATORS,
+                        subfields=[
+                            Subfield("t", variation)
+                        ]
+                    )
+                )
+
+        self._add_fields_to_record(record, fields)
+
     def _move680_fields_to_667(self, record: Record) -> None:
         """ Move existing 680 fields to 667, if any. """
         fields_680 = record.get_fields("680")
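_include_name_variations, now shared via the base class (the copy in bibs.py is removed below), adds each VIAF name variant as a repeatable 400 field with the variant in subfield t, skipping variants already present on the record. The dedupe step in isolation, as an illustrative helper:

def new_variation_fields(name_variations, existing_t_values):
    """Variants that still need a 400|t field (mirrors the skip-if-already-present check)."""
    return [v for v in name_variations if v not in existing_t_values]

assert new_variation_fields(["Tamm, J.", "Tamm, Jaan"], ["Tamm, Jaan"]) == ["Tamm, J."]
assert new_variation_fields([], ["Tamm, Jaan"]) == []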
@@ -285,13 +354,36 @@ class RecordNormalizer:
 
         record.remove_fields("680")
         self._add_fields_to_record(record, fields_667)
+
+
+    def _include_classified_fields(self, record: Record, classified_fields: list[dict]) -> None:
+        """Include classified fields from core, if any.
+        e.g. classified_fields=[{'670': {'ind1': ' ', 'ind2': '0', 'subfields': [{'a': 'Päikesekiri, 2021'}]}}]
+        """
+        if not classified_fields:
+            return
 
-
+        fields = [
+            Field(
+                tag=str(tag),
+                indicators=v.get("indicators", [v.get("ind1", " "), v.get("ind2", " ")]),
+                subfields=[Subfield(code, value) for sub in v.get("subfields", []) for code, value in sub.items()]
+            )
+            for field_dict in classified_fields
+            for tag, v in field_dict.items()
+        ]
+
+        logger.info(f"Adding classified fields: {[f.tag for f in fields]}")
+        self._add_fields_to_record(record, fields)
+
+    def _normalize_common(self, record: Record, is_editing_existing_record: bool, classified_fields: List[dict]) -> None:
         """Common logic for all normalizations.
         - Includes note about record being created/edited.
         - include date note with a different subfield, depending on if record is new or edited.
         - move existing 680 fields to 667
         """
+        self._include_classified_fields(record, classified_fields)
+
         # before adding new notes
         self._move680_fields_to_667(record)
 
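Each classified field arrives from core as a one-key dict mapping a MARC tag to its indicators and subfields, exactly as in the docstring example above. A pymarc-free sketch that just flattens that payload, to make the expected input shape explicit:

classified_fields = [
    {"670": {"ind1": " ", "ind2": "0", "subfields": [{"a": "Päikesekiri, 2021"}]}}
]

for field_dict in classified_fields:
    for tag, v in field_dict.items():
        indicators = v.get("indicators", [v.get("ind1", " "), v.get("ind2", " ")])
        subfields = [(code, value) for sub in v.get("subfields", []) for code, value in sub.items()]
        print(tag, indicators, subfields)
        # -> 670 [' ', '0'] [('a', 'Päikesekiri, 2021')]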
@@ -404,15 +496,16 @@ class RecordNormalizer:
         sierra_id_path = "sierraID"
 
         for record, extra_data in zip(self.records, self.records_extra_data):
-
+
             sierra_id = glom(extra_data, sierra_id_path, default="")
             viaf_id = glom(extra_data, viaf_id_path, default=None)
-
+            classified_fields = extra_data.get("classified_fields", [])
+
             entity = extra_data.get("entity")
             is_editing_existing_record = extra_data.get("edited") == True
 
             viaf_record = self._get_viaf_record(record, viaf_id, entity)
-            record = self._normalize_common(record, is_editing_existing_record)
+            record = self._normalize_common(record, is_editing_existing_record, classified_fields)
 
             normalized_record = self._normalize_record(
                 record, sierra_id, viaf_record, is_editing_existing_record, original_entity=entity)
rara_tools/normalizers/bibs.py
@@ -12,55 +12,40 @@ class BibRecordNormalizer(RecordNormalizer):
     """ Normalize bib records. """
 
     def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
+                 classified_fields: List[str] = [],
                  ALLOW_EDIT_FIELDS: List[str] = ["008", "925"],
                  REPEATABLE_FIELDS: List[str] = ["667"]):
-        super().__init__(linking_results, sierra_data)
+        super().__init__(linking_results, sierra_data, classified_fields)
         self.DEFAULT_LEADER = "00399nz a2200145n 4500"  # must be 24 digits
         self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
         self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
 
         self.records_extra_data = []
         self.sierra_data = sierra_data
-        self.records = self._setup_records(linking_results, sierra_data)
+        self.records = self._setup_records(linking_results, sierra_data, classified_fields)
 
-    def _normalize_sierra(self, record: Record) -> Record:
-
+    def _normalize_sierra(self, record: Record, is_editing_existing_record: bool) -> Record:
         suffix_008 = "|||aznnnaabn || ||| "
-
+
+        if is_editing_existing_record:
+            # Try to reuse prefix from existing 008 field if present
+            existing_008 = next((f for f in record.fields if f.tag == "008" and hasattr(f, "data")), None)
+            if existing_008 and len(existing_008.data) >= 6:
+                prefix = existing_008.data[:6]  # keep existing timestamp
+            else:
+                prefix = self.current_timestamp()  # fallback if no valid existing data
+        else:
+            prefix = self.current_timestamp()
+
         fields = [
             Field(
                 tag="008",
-                data=f"{
+                data=f"{prefix}{suffix_008}"
             ),
         ]
 
         self._add_fields_to_record(record, fields)
-
-    def _include_name_variations(self, record: Record, viaf_record: VIAFRecord) -> None:
-        """ Include name variations from VIAF record as 400|t fields """
-
-        if not viaf_record or not viaf_record.name_variations:
-            return
-
-        existing_name_variations = record.get_fields("400")
-        existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("t")]
-
-        fields = []
-
-        for variation in viaf_record.name_variations:
-            if variation not in existing_variations:
-                fields.append(
-                    Field(
-                        tag="400",
-                        indicators=EMPTY_INDICATORS,
-                        subfields=[
-                            Subfield("t", variation)
-                        ]
-                    )
-                )
-
-        self._add_fields_to_record(record, fields)
-
+
     def _add_author(self, record: Record, viaf_record: Optional[VIAFRecord], original_entity: str) -> Optional[Field]:
         if record.get("100") or record.get("110") or record.get("111"):
             return record
@@ -76,11 +61,17 @@ class BibRecordNormalizer(RecordNormalizer):
 
         fields = [Field(tag=tag, indicators=EMPTY_INDICATORS, subfields=[Subfield("t", title)])]
 
+        if viaf_record:
+            author_dates = self.get_formatted_dates(viaf_record)
+            if author_dates:
+                fields[0].add_subfield("d", author_dates)
+
         self._add_fields_to_record(record, fields)
 
         if viaf_record:
             self._include_name_variations(record, viaf_record)
-
+
+
     def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord, original_entity: str) -> None:
 
         if not viaf_record:
@@ -105,7 +96,7 @@ class BibRecordNormalizer(RecordNormalizer):
     def _normalize_record(self, record: Record, sierraID: str,
                           viaf_record: VIAFRecord, is_editing_existing_record: bool, original_entity: str) -> Record:
 
-        self._normalize_sierra(record)
+        self._normalize_sierra(record, is_editing_existing_record)
         self._normalize_viaf(record, viaf_record, original_entity=original_entity)
 
         return record
tests/test_normalization.py
@@ -1,10 +1,9 @@
 import os
 import json
 
-
-from rara_tools.constants import YYMMDD_FORMAT, YY_DD_FORMAT
+from rara_tools.constants import YYMMDD_FORMAT
 from rara_tools.normalizers import (BibRecordNormalizer, AuthoritiesRecordNormalizer)
-from tests.test_utils import (get_linker_res_example, get_formatted_sierra_response,
+from tests.test_utils import (get_linker_res_example, get_formatted_sierra_response,
                               check_record_tags_sorted, check_no_dupe_tag_values, check_record_tags_have_values)
 
 from rara_tools.constants.linker import EntityType
@@ -302,8 +301,7 @@ def test_normalized_fields_sorted():
 
 def test_authority_normrecord_found_in_es_and_normalized():
     """ KATA elastic normkirjete seast leitakse 1 vaste & normaliseerija täiendab leitud normkirjet VIAF infoga.
-    - valideeri normaliseerimise mapping, mis autori tabelis. Täiendatud väljad ja VIAFist info
-    - Valideeri märge lisatud (TODO) """
+    - valideeri normaliseerimise mapping, mis autori tabelis. Täiendatud väljad ja VIAFist info """
     # Presume, author name identified and sent to linker
     linker_res = get_linker_res_example(
         "oneFound.json")  # single result
@@ -573,6 +571,20 @@ def test_create_new_normrecord():
     assert len(data) == 1
     record = normalizer.first
 
+    # Test 100|d gets date
+    linking_results = [{
+        "original_entity": "Libe, Katariina",
+        "entity_type": EntityType.PER,
+        "linked_info": []
+    }]
+    normalizer = AuthoritiesRecordNormalizer(linking_results=linking_results)
+    record = normalizer.first
+    # Check that 100|d has date added from VIAF
+    field_100 = record.get_fields("100")[0]
+    assert field_100.get_subfields("d")[0] == "1986-"
+    field_046 = record.get_fields("046")[0]
+    assert field_046.get_subfields("f")[0] == "19861126"
+
 def test_680_field_on_existing_record_moved_to_667():
     """ 680 Should not be added for new, if exists on existing record, should be moved to 667 """
     linker_res = get_linker_res_example(
@@ -666,3 +678,102 @@ def test_new_bibrecord_title_included():
     fields_100 = record.get_fields("100")
     assert len(fields_100) == 1
     assert fields_100[0].get_subfields("t")[0] == "Roolijoodiku katastroofiline jõulusõit"
+
+def _validate_new_record_008_field(record: Record):
+    """ Validate 008 field in new record """
+    field_008 = record.get_fields("008")[0].data
+    assert len(field_008) == 40
+    # pos 00-05 is current date in YYMMDD format
+    timestamp = field_008[0:6]
+    try:
+        datetime.strptime(timestamp, YYMMDD_FORMAT)
+    except ValueError:
+        raise AssertionError(f"008 field timestamp {timestamp} is not in format {YYMMDD_FORMAT}")
+
+def test_008_field_formatting():
+    """ 00-04 position will be changed for new record, not edited on existing record """
+
+    # Case 1 - new record created, should have current date in 008 field
+
+    linking_results = [{
+        "original_entity": "Eesti Ekspress",
+        "entity_type": EntityType.TITLE,
+        "linked_info": []
+    }]
+
+    normalizer = BibRecordNormalizer(
+        linking_results=linking_results,
+    )
+    new_record = normalizer.first
+    _validate_new_record_008_field(new_record)
+
+    # Case 2 - existing record updated, 008 field should not be changed
+    linker_res = get_linker_res_example(
+        "oneFound.json")
+    linking_results = [linker_res]
+    original_record = JSONReader(
+        json.dumps([linker_res["linked_info"][0]["json"]], ensure_ascii=False)
+    )
+    record = next(iter(original_record))
+    original_008 = record.get_fields("008")[0].data
+
+    # for authorities
+    normalizer = AuthoritiesRecordNormalizer(
+        linking_results=linking_results
+    )
+    authorities_record = normalizer.first
+    field_008 = authorities_record.get_fields("008")[0].data
+    assert len(field_008) == 40
+
+    assert field_008 == original_008
+
+    # for bibs
+    normalizer = BibRecordNormalizer(
+        linking_results=linking_results
+    )
+    expected_008 = "990107|||aznnnaabn || |||" + 6 * " "
+    bibrecord = normalizer.first
+    field_008 = bibrecord.get_fields("008")[0].data
+    assert len(field_008) == 40
+    assert field_008 == expected_008
+
+def test_classified_fields_added_to_linked_record():
+    """ Test that classified fields Can be passed to normalizer & added to linked record """
+
+    classified_fields = [
+        {
+            "670": {
+                "ind1": " ",
+                "ind2": "0",
+                "subfields": [
+                    {
+                        "a": "Päikesekiri, 2021"
+                    }
+                ]
+            }
+        }
+    ]
+    # Case 1 - no 670 exists, should be added to linked record
+    for normalizer in (AuthoritiesRecordNormalizer, BibRecordNormalizer):
+        linking_results = [MOCK_LINKER_NOT_FOUND]
+        normalizer = normalizer(linking_results=linking_results, classified_fields=classified_fields)
+
+        record = normalizer.first
+        fields_670 = record.get_fields("670")
+        assert len(fields_670) == 1
+        assert fields_670[0].get_subfields("a")[0] == "Päikesekiri, 2021"
+
+    # Case 1 - existing record with 670 should not update (same behavior for both normalizers)
+    linker_res = get_linker_res_example(
+        "oneFound.json")
+    linking_results = [linker_res]
+
+    for normalizer in (AuthoritiesRecordNormalizer, BibRecordNormalizer):
+        normalizer = normalizer(
+            linking_results=linking_results,
+            classified_fields=classified_fields
+        )
+        record = normalizer.first
+        fields_670 = record.get_fields("670")
+        assert len(fields_670) == 1
+        assert fields_670[0].get_subfields("a")[0] == "Eesti kirjarahva leksikon, 1995."
rara_tools-0.7.12/VERSION (deleted)
@@ -1 +0,0 @@
-0.7.12