rara-tools 0.7.15__tar.gz → 0.7.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of rara-tools might be problematic.
- {rara_tools-0.7.15/rara_tools.egg-info → rara_tools-0.7.17}/PKG-INFO +1 -1
- rara_tools-0.7.17/VERSION +1 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/normalizers/base.py +51 -32
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/normalizers/bibs.py +2 -15
- rara_tools-0.7.17/rara_tools/parsers/tools/validators.py +54 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17/rara_tools.egg-info}/PKG-INFO +1 -1
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools.egg-info/SOURCES.txt +2 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_normalization.py +110 -3
- rara_tools-0.7.17/tests/test_validators.py +55 -0
- rara_tools-0.7.15/VERSION +0 -1
- {rara_tools-0.7.15 → rara_tools-0.7.17}/LICENSE.md +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/README.md +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/pyproject.toml +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/constants/__init__.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/constants/digitizer.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/constants/general.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/constants/language_evaluator.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/constants/linker.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/constants/meta_extractor.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/constants/normalizers.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/constants/parsers.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/constants/subject_indexer.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/converters.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/core_formatters/core_formatter.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/core_formatters/formatted_keyword.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/core_formatters/formatted_meta.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/core_formatters/formatted_object.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/decorators.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/digar_schema_converter.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/elastic.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/exceptions.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/normalizers/__init__.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/normalizers/authorities.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/normalizers/reader.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/normalizers/viaf.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_records/base_record.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_records/ems_record.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_records/organization_record.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_records/person_record.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/marc_records/title_record.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/tools/marc_converter.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/s3.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/task_reporter.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/utils.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools.egg-info/dependency_links.txt +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools.egg-info/requires.txt +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools.egg-info/top_level.txt +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/requirements.txt +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/setup.cfg +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_digar_schema_converter.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_elastic.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_elastic_vector_and_search_operations.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_entity_normalizers.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_formatters.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_marc_parsers.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_s3_exceptions.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_s3_file_operations.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_sierra_converters.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_task_reporter.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_utils.py +0 -0
- {rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_viaf_client.py +0 -0
rara_tools-0.7.17/VERSION
ADDED
@@ -0,0 +1 @@
+0.7.17
{rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/normalizers/base.py
@@ -3,6 +3,8 @@ from pymarc import (Field, Subfield, JSONReader, Record)
 from typing import List, Optional, Iterator
 from rara_tools.normalizers.reader import SafeJSONReader

+from rara_tools.parsers.tools.validators import filter_names
+
 from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
 from rara_tools.constants.normalizers import (
     DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
@@ -51,10 +53,23 @@ class RecordNormalizer:
         If one linked entity found, we create an updated record from the linked entity data.
         """
         linked_records = []
+
+        def handle_create_new_record(entity, idx):
+            logger.info(f"No linked entities found for {entity}, Creating new record.")
+            linked_records.append({
+                "leader": self.DEFAULT_LEADER,
+                "fields": []
+            })
+            self.records_extra_data.append({
+                "entity": entity,
+                "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
+                "edited": False,
+            })

         for idx, linked in enumerate(linking_results or []):

             if not isinstance(linked, dict):
+                logger.debug(f"Skipping invalid linked result: {linked}")
                 continue

             entity = linked.get("original_entity")
@@ -62,46 +77,32 @@

             if not isinstance(linked_info, list) or not linked_info:
                 # No linked entities found, create new record
-                logger.info(
-                    f"No linked entities found for {entity}, Creating new record.")
-                linked_records.append({
-                    "leader": self.DEFAULT_LEADER,
-                    "fields": []
-                })
-                self.records_extra_data.append({
-                    "entity": entity,
-                    "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
-                    "edited": False
-                })
+                handle_create_new_record(entity, idx)
                 continue

             elif len(linked_info) > 1:
                 # Multiple linked entities found, create new record
-                logger.info(
-                    f"Multiple linked entities found for {entity}. Creating new record.")
-                linked_records.append({
-                    "leader": self.DEFAULT_LEADER,
-                    "fields": []
-                })
-                self.records_extra_data.append({
-                    "entity": entity,
-                    "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
-                    "edited": False
-                })
+                handle_create_new_record(entity, idx)
                 continue

             elif len(linked_info) == 1:
+                # one record match found, we update existing record
+
                 linked_item = linked_info[0]
                 if not isinstance(linked_item, dict):
                     continue

+                # handle case where we have linked an entity without a record
+                if not linked_item.get("json", None):
+                    handle_create_new_record(entity, idx)
+                    continue
+
                 linked_records.append(linked_item.get("json", {}))

                 self.records_extra_data.append({
                     "entity": entity,
                     "viaf": linked_item.get("viaf", {}),
                     "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
-                    "type": "linked",
                     "edited": True
                 })
                 continue
@@ -109,7 +110,6 @@
         self.records_extra_data.extend(
             {
                 "sierraID": obj.get("sierraID"),
-                "type": "sierra",
                 "edited": True
             }
             for obj in (sierra_data or [])
@@ -313,25 +313,34 @@
         if viaf_record:
             self._include_name_variations(record, viaf_record)

-    def _include_name_variations(self, record: Record, viaf_record: VIAFRecord) -> None:
+    def _include_name_variations(self, record: Record, viaf_record: VIAFRecord, filter_variations=True) -> None:
         """ Include name variations from VIAF record as 400|t fields """

         if not viaf_record or not viaf_record.name_variations:
             return

         existing_name_variations = record.get_fields("400")
-        existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("
+        existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("a")]
+
+        if filter_variations:
+            allowed_variations = filter_names(viaf_record.name_variations)
+            logger.debug(
+                f"filtered out {len(viaf_record.name_variations) - len(allowed_variations)} name variations for '{viaf_record.name}'"
+            )
+
+        else:
+            allowed_variations = viaf_record.name_variations

         fields = []

-        for variation in
+        for variation in allowed_variations:
             if variation not in existing_variations:
                 fields.append(
                     Field(
                         tag="400",
                         indicators=EMPTY_INDICATORS,
                         subfields=[
-                            Subfield("
+                            Subfield("a", variation)
                         ]
                     )
                 )
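In effect, only Latin/Cyrillic name forms now become 400 $a fields. A minimal standalone sketch of that flow (illustrative only: the real method also skips variations already present in the record's 400 $a values and uses the package's EMPTY_INDICATORS constant):

    from pymarc import Record, Field, Subfield
    from rara_tools.parsers.tools.validators import filter_names

    record = Record()
    name_variations = ["Liszt, Franz", "Лист, Франц", "李斯特,弗朗西斯庫斯"]

    # filter_names() keeps Latin/Cyrillic forms, mirroring filter_variations=True
    for variation in filter_names(name_variations):
        record.add_field(
            Field(
                tag="400",
                indicators=[" ", " "],  # stand-in for EMPTY_INDICATORS
                subfields=[Subfield("a", variation)],
            )
        )

    print([f.get_subfields("a")[0] for f in record.get_fields("400")])
    # ['Liszt, Franz', 'Лист, Франц'] - the CJK-only form is filtered out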
@@ -465,6 +474,8 @@
                 verify=verify,
                 threshold=threshold
             )
+            if viaf_record:
+                logger.debug(f"VIAF {search_term}, linked to ID: {viaf_record.viaf_id}")

         except Exception as e:
             logger.error(
@@ -473,7 +484,8 @@
         return viaf_record

     def _normalize_record(self, record: Record, sierraID: str,
-                          viaf_record: VIAFRecord, is_editing_existing_record: bool,
+                          viaf_record: VIAFRecord, is_editing_existing_record: bool,
+                          original_entity: str) -> Record:
         return record

     def get_record(self, index: int) -> Record:
@@ -500,19 +512,26 @@
         return next(iter(self))

     def __iter__(self) -> Iterator:
-        viaf_id_path = "viaf.queryResult.
+        # viaf_id_path = "viaf.original.queryResult.viafID"
+        viaf_id_path = "viaf.parsed.viaf_id"
+
         sierra_id_path = "sierraID"
-
+
         for record, extra_data in zip(self.records, self.records_extra_data):

             sierra_id = glom(extra_data, sierra_id_path, default="")
             viaf_id = glom(extra_data, viaf_id_path, default=None)
-            classified_fields = extra_data.get("classified_fields", [])

+            classified_fields = extra_data.get("classified_fields", [])
             entity = extra_data.get("entity")
             is_editing_existing_record = extra_data.get("edited") == True

             viaf_record = self._get_viaf_record(record, viaf_id, entity)
+            if viaf_record:
+                logger.debug(
+                    f"linked VIAF record with ID {viaf_record.viaf_id} for entity '{entity}'"
+                )
+
             record = self._normalize_common(record, is_editing_existing_record, classified_fields)

             normalized_record = self._normalize_record(
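The viaf_id_path switch means the VIAF ID is now read from the parsed VIAF payload instead of the raw query result. A small illustration of how glom resolves these dotted paths (the extra_data shape below is a simplified assumption based on the keys visible in this diff):

    from glom import glom

    # Assumed, simplified shape of one records_extra_data entry
    extra_data = {
        "sierraID": "b1234567",
        "viaf": {"parsed": {"viaf_id": "22458146"}},
    }

    viaf_id = glom(extra_data, "viaf.parsed.viaf_id", default=None)    # "22458146"
    sierra_id = glom(extra_data, "sierraID", default="")               # "b1234567"
    # A query-result style path would now simply fall back to the default
    missing = glom(extra_data, "viaf.original.queryResult.viafID", default=None)  # None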
{rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools/normalizers/bibs.py
@@ -73,26 +73,13 @@ class BibRecordNormalizer(RecordNormalizer):


     def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord, original_entity: str) -> None:
-
         if not viaf_record:
             # viaf record not found, include original entity as 100|t
             self._add_author(record, viaf_record=None, original_entity=original_entity)
             return record
-
-        viaf_id = viaf_record.viaf_id
-        fields = [
-            Field(
-                tag="035",
-                indicators=EMPTY_INDICATORS,
-                subfields=[
-                    Subfield("a", viaf_id)
-                ]
-            )
-        ]
-
-        self._add_fields_to_record(record, fields)
+
         self._add_author(record, viaf_record, original_entity=original_entity)
-
+
     def _normalize_record(self, record: Record, sierraID: str,
                           viaf_record: VIAFRecord, is_editing_existing_record: bool, original_entity: str) -> Record:

rara_tools-0.7.17/rara_tools/parsers/tools/validators.py
ADDED
@@ -0,0 +1,54 @@
+import regex as re
+from typing import List
+
+def has_valid_chars(entity: str, allow_cyrillic: bool = True) -> bool:
+    """ Checks if entity contains any valid characters in latin
+    or in cyrillic, if the latter is enabled
+
+    Parameters
+    ------------
+    entity: str
+        String to validate.
+    allow_cyrillic: bool
+        Allow strings in cyrillic?
+
+    Returns
+    ------------
+    bool
+        Boolean value indicating, if the string
+        contains any valid characters.
+
+    """
+    # Check for latin characters
+    is_valid = bool(re.search(r"[a-züõöäA-ZÜÕÖÄ]", entity))
+
+    if allow_cyrillic and not is_valid:
+        # If cyrillic characters are allowed,
+        # check for them as well
+        is_valid = bool(re.search(r"[а-яА-Я]", entity))
+
+    return is_valid
+
+
+def filter_names(names: List[str], allow_cyrillic: bool = True) -> List[str]:
+    """ Filters out names not in allowed encodings (latin / cyrillic).
+
+    Parameters
+    ------------
+    names: List[str]
+        Names to filters.
+    allow_cyrillic: bool
+        Allow strings in cyrillic?
+
+    Returns
+    ------------
+    List[str]
+        List of filtered names.
+
+    """
+    filtered_names = [
+        name for name in names
+        if has_valid_chars(entity=name, allow_cyrillic=allow_cyrillic)
+    ]
+    return filtered_names
+
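Quick usage sketch of the new validators module (the name list here is illustrative; the expected outputs follow from the character checks above, and tests/test_validators.py below exercises a fuller fixture):

    from rara_tools.parsers.tools.validators import filter_names, has_valid_chars

    names = ["Liszt, Franz", "Лист, Франц", "리스트, 프란치스코"]

    # Latin and (by default) Cyrillic forms are kept, everything else is dropped
    print(filter_names(names))                          # ['Liszt, Franz', 'Лист, Франц']
    print(filter_names(names, allow_cyrillic=False))    # ['Liszt, Franz']
    print(has_valid_chars("ᓕᔅᑦ, ᕗᕌᓐᓯᔅᑲᔅ"))              # False: neither Latin nor Cyrillic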
{rara_tools-0.7.15 → rara_tools-0.7.17}/rara_tools.egg-info/SOURCES.txt
@@ -49,6 +49,7 @@ rara_tools/parsers/marc_records/title_record.py
 rara_tools/parsers/tools/entity_normalizers.py
 rara_tools/parsers/tools/marc_converter.py
 rara_tools/parsers/tools/russian_transliterator.py
+rara_tools/parsers/tools/validators.py
 tests/test_digar_schema_converter.py
 tests/test_elastic.py
 tests/test_elastic_vector_and_search_operations.py
@@ -61,4 +62,5 @@ tests/test_s3_file_operations.py
 tests/test_sierra_converters.py
 tests/test_task_reporter.py
 tests/test_utils.py
+tests/test_validators.py
 tests/test_viaf_client.py
{rara_tools-0.7.15 → rara_tools-0.7.17}/tests/test_normalization.py
@@ -5,6 +5,7 @@ from rara_tools.constants import YYMMDD_FORMAT
 from rara_tools.normalizers import (BibRecordNormalizer, AuthoritiesRecordNormalizer)
 from tests.test_utils import (get_linker_res_example, get_formatted_sierra_response,
     check_record_tags_sorted, check_no_dupe_tag_values, check_record_tags_have_values)
+from rara_tools.normalizers.viaf import VIAFRecord

 from rara_tools.constants.linker import EntityType

@@ -212,7 +213,7 @@ def test_missing_fields_created_bibrecord_normalization():
     for record in normalizer_entities_only:
         check_record_tags_have_values(
             record, ["008",  # Sierra related, always with bibs
-
+                     "100",  # VIAf enriched
                      ] + REQUIRED_FIELDS
         )
         validate_bibrecord_normalized(record, has_viaf_data=True)
@@ -752,7 +753,7 @@ def test_classified_fields_added_to_linked_record():
                 }
             ]
         }
-        }
+        }
     ]
     ]
     # Case 1 - no 670 exists, should be added to linked record
@@ -765,7 +766,7 @@ def test_classified_fields_added_to_linked_record():
     assert len(fields_670) == 1
     assert fields_670[0].get_subfields("a")[0] == "Päikesekiri, 2021"

-    # Case
+    # Case 2 - existing record with 670 should not update (same behavior for both normalizers)
     linker_res = get_linker_res_example(
         "oneFound.json")
     linking_results = [linker_res]
@@ -779,7 +780,54 @@ def test_classified_fields_added_to_linked_record():
     fields_670 = record.get_fields("670")
     assert len(fields_670) == 1
     assert fields_670[0].get_subfields("a")[0] == "Eesti kirjarahva leksikon, 1995."
+
+    def get_046_field(year: str) -> dict:
+        return {
+            "046": {
+                "ind1": " ",
+                "ind2": " ",
+                "subfields": [
+                    {"k": year }
+                ]
+            }
+        }

+    # Case 3 - 046 $k - publication date Passed for bib
+    classified_fields = [
+        [get_046_field("2021")],
+        [get_046_field("1999")],
+        [get_046_field("2022")]
+    ]
+
+
+    mock_046_exists = MOCK_LINKER_ONE_FOUND.copy()
+    mock_046_exists["linked_info"][0]["json"]["fields"].append(get_046_field("2000"))
+
+    # for new record should get included
+    linking_results = [MOCK_LINKER_NOT_FOUND, # new record
+                       MOCK_LINKER_ONE_FOUND, # new record
+                       MOCK_LINKER_NOT_FOUND] # editing existing record
+
+    normalizer = BibRecordNormalizer(linking_results=linking_results, classified_fields=classified_fields)
+
+    # for i, record in enumerate(normalizer):
+    # first two should have 046 from classified data
+    record1 = normalizer.get_record(0)
+    fields_046 = record1.get_fields("046")
+    assert len(fields_046) == 1
+    assert fields_046[0].get_subfields("k")[0] == "2021"
+
+    record2 = normalizer.get_record(1)
+    fields_046 = record2.get_fields("046")
+    assert len(fields_046) == 1
+    # should be unchanged, aka 2000
+    assert fields_046[0].get_subfields("k")[0] == "2000"
+
+    record3 = normalizer.get_record(2)
+    fields_046 = record3.get_fields("046")
+    assert len(fields_046) == 1
+    assert fields_046[0].get_subfields("k")[0] == "2022"
+
 def test_classified_data_with_multiple_records():
     """ Test classified data with multiple records - should match by sierraID """

@@ -842,3 +890,62 @@ def test_classified_data_with_multiple_records():
     assert len(record.get_fields("670")) == 1
     fields_670 = record.get_fields("670")[0]
     assert fields_670.get_subfields("a")[0] == "Teine kirjeldus, 2022"
+
+
+def test_viaf_name_variations():
+    """ Test adding alternative name forms from VIAF to 4XX fields. Should skip some variants """
+
+    normalizer = AuthoritiesRecordNormalizer()
+    record = Record()
+
+    viaf_record: VIAFRecord = normalizer._get_viaf_record(
+        record,
+        entity="Jaan Kaplinski"
+    )
+
+    assert viaf_record is not None
+    assert len(viaf_record.name_variations) > 0
+
+    normalizer._add_author(record, viaf_record)
+
+    fields_4xx = record.get_fields("400") + record.get_fields("410") + record.get_fields("430")
+
+    unfiltered_name_variations = viaf_record.name_variations
+
+    assert len(fields_4xx) > 0
+    assert len(fields_4xx) < len(unfiltered_name_variations)
+
+def test_existing_record_linked_to_viaf_record():
+    """ Test existing record linked to VIAF record - should enrich with VIAF data """
+
+    base_path = "tests/test_data/marc_records/json/"
+    with open(os.path.join(base_path, "imbi.json"), "r", encoding="utf-8") as f, \
+         open(os.path.join(base_path, "ernits.json"), "r", encoding="utf-8") as f2, \
+         open(os.path.join(base_path, "rowling.json"), "r", encoding="utf-8") as f3:
+        imbi = json.load(f)
+        ernits = json.load(f2)
+        rowling = json.load(f3)
+
+    linking_results = [
+        imbi,
+        ernits,
+        rowling
+    ]
+
+    normalizer = AuthoritiesRecordNormalizer(
+        linking_results=linking_results,
+    )
+
+    def get_viaf_url(record: Record):
+        field_024 = record.get_fields("024")
+        if len(field_024) == 0:
+            return None
+        return field_024[0].get_subfields("0")[0]
+
+    viaf_base_url = "http://viaf.org/viaf"
+    assert get_viaf_url(normalizer.get_record(0)) == f"{viaf_base_url}/167120147/"
+    assert get_viaf_url(normalizer.get_record(1)) == f"{viaf_base_url}/22458146/"
+    assert get_viaf_url(normalizer.get_record(1)) == f"{viaf_base_url}/22458146/"
+    assert get_viaf_url(normalizer.get_record(2)) == f"{viaf_base_url}/116796842/"
+
+
rara_tools-0.7.17/tests/test_validators.py
ADDED
@@ -0,0 +1,55 @@
+from rara_tools.parsers.tools.validators import filter_names
+import pytest
+
+are_equal = lambda x, y: not bool(set(x).difference(set(y)))
+
+names_to_validate = [
+    "ליסט, פראנץ",
+    "Liszt, Franz",
+    "Lißt, Franz",
+    "ליסט, פרנץ",
+    "Liszt, Ferencz",
+    "Лист, Франц",
+    "Listz",
+    "Lißzt, Franz",
+    "Lists, Francis",
+    "List, Ferenc",
+    "List, Frants리스",
+    "List, Ferents",
+    "李斯特,弗朗西斯庫斯",
+    "ᓕᔅᑦ, ᕗᕌᓐᓯᔅᑲᔅ",
+    "리스트, 프란치스코"
+]
+
+valid_names_1 = [
+    "Liszt, Franz",
+    "Lißt, Franz",
+    "Liszt, Ferencz",
+    "Лист, Франц",
+    "Listz",
+    "Lißzt, Franz",
+    "Lists, Francis",
+    "List, Ferenc",
+    "List, Frants리스",
+    "List, Ferents"
+]
+
+valid_names_2 = [
+    "Liszt, Franz",
+    "Lißt, Franz",
+    "Liszt, Ferencz",
+    "Listz",
+    "Lißzt, Franz",
+    "Lists, Francis",
+    "List, Ferenc",
+    "List, Frants리스",
+    "List, Ferents"
+]
+
+def test_filtering_latin_cyrillic():
+    filtered_names = filter_names(names_to_validate, allow_cyrillic=True)
+    assert are_equal(filtered_names, valid_names_1)
+
+def test_filtering_latin():
+    filtered_names = filter_names(names_to_validate, allow_cyrillic=False)
+    assert are_equal(filtered_names, valid_names_2)
rara_tools-0.7.15/VERSION
DELETED
@@ -1 +0,0 @@
-0.7.15