rara-tools 0.7.16__py3-none-any.whl → 0.7.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- rara_tools/digar_schema_converter.py +2 -2
- rara_tools/normalizers/base.py +15 -4
- rara_tools/normalizers/bibs.py +2 -15
- rara_tools/parsers/tools/validators.py +54 -0
- {rara_tools-0.7.16.dist-info → rara_tools-0.7.18.dist-info}/METADATA +1 -1
- {rara_tools-0.7.16.dist-info → rara_tools-0.7.18.dist-info}/RECORD +9 -8
- {rara_tools-0.7.16.dist-info → rara_tools-0.7.18.dist-info}/WHEEL +0 -0
- {rara_tools-0.7.16.dist-info → rara_tools-0.7.18.dist-info}/licenses/LICENSE.md +0 -0
- {rara_tools-0.7.16.dist-info → rara_tools-0.7.18.dist-info}/top_level.txt +0 -0
|
@@ -77,7 +77,7 @@ class PageSchema:
|
|
|
77
77
|
self.__schema = {
|
|
78
78
|
"@type": "CreativeWork", # CONSTANT for pages
|
|
79
79
|
"@id": self.page_id,
|
|
80
|
-
"hasPart": []
|
|
80
|
+
"dcterms:hasPart": []
|
|
81
81
|
}
|
|
82
82
|
text_schemas = [
|
|
83
83
|
TextPageSchema(page).schema
|
|
@@ -91,7 +91,7 @@ class PageSchema:
|
|
|
91
91
|
page_schemas = text_schemas + image_schemas
|
|
92
92
|
page_schemas_with_ids = self._add_segment_ids(page_schemas)
|
|
93
93
|
|
|
94
|
-
self.__schema["hasPart"].extend(page_schemas_with_ids)
|
|
94
|
+
self.__schema["dcterms:hasPart"].extend(page_schemas_with_ids)
|
|
95
95
|
|
|
96
96
|
return self.__schema
|
|
97
97
|
|
rara_tools/normalizers/base.py
CHANGED
|
@@ -3,6 +3,8 @@ from pymarc import (Field, Subfield, JSONReader, Record)
|
|
|
3
3
|
from typing import List, Optional, Iterator
|
|
4
4
|
from rara_tools.normalizers.reader import SafeJSONReader
|
|
5
5
|
|
|
6
|
+
from rara_tools.parsers.tools.validators import filter_names
|
|
7
|
+
|
|
6
8
|
from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
|
|
7
9
|
from rara_tools.constants.normalizers import (
|
|
8
10
|
DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
|
|
@@ -311,25 +313,34 @@ class RecordNormalizer:
|
|
|
311
313
|
if viaf_record:
|
|
312
314
|
self._include_name_variations(record, viaf_record)
|
|
313
315
|
|
|
314
|
-
def _include_name_variations(self, record: Record, viaf_record: VIAFRecord) -> None:
|
|
316
|
+
def _include_name_variations(self, record: Record, viaf_record: VIAFRecord, filter_variations=True) -> None:
|
|
315
317
|
""" Include name variations from VIAF record as 400|t fields """
|
|
316
318
|
|
|
317
319
|
if not viaf_record or not viaf_record.name_variations:
|
|
318
320
|
return
|
|
319
321
|
|
|
320
322
|
existing_name_variations = record.get_fields("400")
|
|
321
|
-
existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("
|
|
323
|
+
existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("a")]
|
|
324
|
+
|
|
325
|
+
if filter_variations:
|
|
326
|
+
allowed_variations = filter_names(viaf_record.name_variations)
|
|
327
|
+
logger.debug(
|
|
328
|
+
f"filtered out {len(viaf_record.name_variations) - len(allowed_variations)} name variations for '{viaf_record.name}'"
|
|
329
|
+
)
|
|
330
|
+
|
|
331
|
+
else:
|
|
332
|
+
allowed_variations = viaf_record.name_variations
|
|
322
333
|
|
|
323
334
|
fields = []
|
|
324
335
|
|
|
325
|
-
for variation in
|
|
336
|
+
for variation in allowed_variations:
|
|
326
337
|
if variation not in existing_variations:
|
|
327
338
|
fields.append(
|
|
328
339
|
Field(
|
|
329
340
|
tag="400",
|
|
330
341
|
indicators=EMPTY_INDICATORS,
|
|
331
342
|
subfields=[
|
|
332
|
-
Subfield("
|
|
343
|
+
Subfield("a", variation)
|
|
333
344
|
]
|
|
334
345
|
)
|
|
335
346
|
)
|
rara_tools/normalizers/bibs.py
CHANGED
|
@@ -73,26 +73,13 @@ class BibRecordNormalizer(RecordNormalizer):
|
|
|
73
73
|
|
|
74
74
|
|
|
75
75
|
def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord, original_entity: str) -> None:
|
|
76
|
-
|
|
77
76
|
if not viaf_record:
|
|
78
77
|
# viaf record not found, include original entity as 100|t
|
|
79
78
|
self._add_author(record, viaf_record=None, original_entity=original_entity)
|
|
80
79
|
return record
|
|
81
|
-
|
|
82
|
-
viaf_id = viaf_record.viaf_id
|
|
83
|
-
fields = [
|
|
84
|
-
Field(
|
|
85
|
-
tag="035",
|
|
86
|
-
indicators=EMPTY_INDICATORS,
|
|
87
|
-
subfields=[
|
|
88
|
-
Subfield("a", viaf_id)
|
|
89
|
-
]
|
|
90
|
-
)
|
|
91
|
-
]
|
|
92
|
-
|
|
93
|
-
self._add_fields_to_record(record, fields)
|
|
80
|
+
|
|
94
81
|
self._add_author(record, viaf_record, original_entity=original_entity)
|
|
95
|
-
|
|
82
|
+
|
|
96
83
|
def _normalize_record(self, record: Record, sierraID: str,
|
|
97
84
|
viaf_record: VIAFRecord, is_editing_existing_record: bool, original_entity: str) -> Record:
|
|
98
85
|
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import regex as re
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
def has_valid_chars(entity: str, allow_cyrillic: bool = True) -> bool:
    """ Checks if entity contains at least one accepted letter.

    A string is accepted when it contains any ASCII latin letter or one of
    the letters ü, õ, ö, ä (in either case). If cyrillic is enabled, a
    string without such a latin letter is still accepted when it contains
    a letter from а-я / А-Я or the letters ё / Ё.

    Parameters
    ------------
    entity: str
        String to validate.
    allow_cyrillic: bool
        Allow strings in cyrillic?

    Returns
    ------------
    bool
        Boolean value indicating, if the string
        contains any valid characters.

    """
    # Check for latin characters first; one match is enough.
    if re.search(r"[a-züõöäA-ZÜÕÖÄ]", entity):
        return True

    if not allow_cyrillic:
        return False

    # "ё" (U+0451) and "Ё" (U+0401) lie outside the contiguous
    # а-я / А-Я code point ranges, so they are listed explicitly.
    return bool(re.search(r"[а-яёА-ЯЁ]", entity))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def filter_names(names: List[str], allow_cyrillic: bool = True) -> List[str]:
    """ Keeps only the names that pass the has_valid_chars check.

    Parameters
    ------------
    names: List[str]
        Names to filter.
    allow_cyrillic: bool
        Allow strings in cyrillic?

    Returns
    ------------
    List[str]
        New list with the accepted names, in their original order.

    """
    accepted = []
    for candidate in names:
        # Each name is validated independently; rejected names are dropped.
        if has_valid_chars(entity=candidate, allow_cyrillic=allow_cyrillic):
            accepted.append(candidate)
    return accepted
|
|
54
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
rara_tools/converters.py,sha256=a1dEMa0TwcO9UmjuSBkiuc7LGmH0d_dB6wwoTLpdZhI,4040
|
|
2
2
|
rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
|
|
3
|
-
rara_tools/digar_schema_converter.py,sha256=
|
|
3
|
+
rara_tools/digar_schema_converter.py,sha256=wd6QeSxC1nfiH5tDogfNl0zO1VnS5IiPZ5Y2UIrjOL4,15077
|
|
4
4
|
rara_tools/elastic.py,sha256=4D9yoyMy6AJIKwhSi2H1usffDHAh2A_IZfv5BtYnBKg,13992
|
|
5
5
|
rara_tools/exceptions.py,sha256=YQyaueUbXeTkJYFDEuN6iWTXMI3eCv5l7PxGp87vg5I,550
|
|
6
6
|
rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
|
|
@@ -21,8 +21,8 @@ rara_tools/core_formatters/formatted_meta.py,sha256=WEnMs8K0YeTLGjXn_mxQTpshxcz5
|
|
|
21
21
|
rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
|
|
22
22
|
rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
|
|
23
23
|
rara_tools/normalizers/authorities.py,sha256=iW3cYOqqVJKy4CcnG9_T6dN-1bBT1e-0jtLYvco-MyQ,5311
|
|
24
|
-
rara_tools/normalizers/base.py,sha256=
|
|
25
|
-
rara_tools/normalizers/bibs.py,sha256=
|
|
24
|
+
rara_tools/normalizers/base.py,sha256=tw64ZK7KXg9O2IPMxICMogYHAG6il10qQqCd4fIjQL0,20941
|
|
25
|
+
rara_tools/normalizers/bibs.py,sha256=5pOw8RsQ4eDwbREbYySeI_b7dQyGlJnfMRSS-tWGJ9c,3632
|
|
26
26
|
rara_tools/normalizers/reader.py,sha256=GYCkAtnsNx135w5lD-_MqCZzdHQHHPDF-pDxYj839Vo,1595
|
|
27
27
|
rara_tools/normalizers/viaf.py,sha256=C-NfbvL83ZcHVB9ICMw43wAMYKTqDTHU3ZT2mXKec00,24288
|
|
28
28
|
rara_tools/parsers/marc_parsers/base_parser.py,sha256=Kdw4aivJf2FkWgIK7pJtHtVXF_G1pjHVQ7IcFItSqy8,1649
|
|
@@ -39,8 +39,9 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
|
|
|
39
39
|
rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
|
|
40
40
|
rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
|
|
41
41
|
rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
|
|
42
|
-
rara_tools
|
|
43
|
-
rara_tools-0.7.
|
|
44
|
-
rara_tools-0.7.
|
|
45
|
-
rara_tools-0.7.
|
|
46
|
-
rara_tools-0.7.
|
|
42
|
+
rara_tools/parsers/tools/validators.py,sha256=JTGbfAWcLldlZrX0nb343P9RJ8QwSh3455fYap3UxxY,1335
|
|
43
|
+
rara_tools-0.7.18.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
44
|
+
rara_tools-0.7.18.dist-info/METADATA,sha256=9AR_e8-yNVW_qp6Iaxp0IP2_HxV_NU87DE_I2GQOuJg,4080
|
|
45
|
+
rara_tools-0.7.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
46
|
+
rara_tools-0.7.18.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
47
|
+
rara_tools-0.7.18.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|