rara-tools 0.5.2__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- {rara_tools-0.5.2/rara_tools.egg-info → rara_tools-0.6.0}/PKG-INFO +2 -1
- rara_tools-0.6.0/VERSION +1 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/constants/digitizer.py +1 -1
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/constants/general.py +4 -1
- rara_tools-0.6.0/rara_tools/constants/language_evaluator.py +9 -0
- rara_tools-0.6.0/rara_tools/constants/normalizers.py +44 -0
- rara_tools-0.6.0/rara_tools/constants/subject_indexer.py +9 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/normalizers/base.py +40 -24
- rara_tools-0.6.0/rara_tools/normalizers/viaf.py +653 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0/rara_tools.egg-info}/PKG-INFO +2 -1
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools.egg-info/requires.txt +1 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/requirements.txt +1 -0
- rara_tools-0.6.0/tests/test_viaf_client.py +71 -0
- rara_tools-0.5.2/VERSION +0 -1
- rara_tools-0.5.2/rara_tools/constants/language_evaluator.py +0 -1
- rara_tools-0.5.2/rara_tools/constants/normalizers.py +0 -6
- rara_tools-0.5.2/rara_tools/constants/subject_indexer.py +0 -1
- rara_tools-0.5.2/rara_tools/normalizers/viaf.py +0 -204
- rara_tools-0.5.2/tests/test_viaf_client.py +0 -19
- {rara_tools-0.5.2 → rara_tools-0.6.0}/LICENSE.md +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/README.md +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/pyproject.toml +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/constants/__init__.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/constants/linker.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/constants/meta_extractor.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/constants/parsers.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/converters.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/decorators.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/digar_schema_converter.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/elastic.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/exceptions.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/normalizers/__init__.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/normalizers/authorities.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/normalizers/bibs.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_records/base_record.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_records/ems_record.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_records/organization_record.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_records/person_record.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/marc_records/title_record.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/tools/marc_converter.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/s3.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/task_reporter.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools/utils.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools.egg-info/SOURCES.txt +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools.egg-info/dependency_links.txt +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/rara_tools.egg-info/top_level.txt +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/setup.cfg +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_digar_schema_converter.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_elastic.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_elastic_vector_and_search_operations.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_entity_normalizers.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_marc_parsers.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_normalization.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_s3_exceptions.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_s3_file_operations.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_sierra_converters.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_task_reporter.py +0 -0
- {rara_tools-0.5.2 → rara_tools-0.6.0}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rara-tools
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Tools to support Kata's work.
|
|
5
5
|
Classifier: Programming Language :: Python :: 3
|
|
6
6
|
Classifier: Programming Language :: Python :: 3.10
|
|
@@ -18,6 +18,7 @@ Requires-Dist: nltk
|
|
|
18
18
|
Requires-Dist: jsonlines
|
|
19
19
|
Requires-Dist: requests
|
|
20
20
|
Requires-Dist: iso639-lang
|
|
21
|
+
Requires-Dist: jellyfish
|
|
21
22
|
Requires-Dist: pymarc
|
|
22
23
|
Requires-Dist: regex
|
|
23
24
|
Requires-Dist: glom
|
rara_tools-0.6.0/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.6.0
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from pymarc import Indicators
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
class EntityType:
    """Namespace of plain-string labels for the supported entity types.

    The members are ordinary ``str`` class attributes (not an ``Enum``), so
    they compare equal to their literal values and can be used directly as
    dictionary keys.
    """

    PER = "PER"
    ORG = "ORG"
    KEYWORD = "EMS_KEYWORD"  # label differs from the attribute name
    LOC = "LOC"
    TITLE = "TITLE"
    UNK = "UNKNOWN"  # label differs from the attribute name
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Indicator pair made of two blank characters.
EMPTY_INDICATORS = Indicators(" ", " ")

# VIAF source codes listed as allowed.
VIAF_ALLOWED_SOURCES = ["LC", "DNB", "LNB", "NLL", "ERRR", "J9U"]

# Search field used for VIAF queries when the caller does not pick one.
DEFAULT_VIAF_FIELD = "local.names"

# VIAF search fields listed as allowed; the trailing comment on each entry
# is the human-readable meaning of the field.
ALLOWED_VIAF_FIELDS = [
    "cql.any",                        # All fields
    "local.names",                    # All headings
    "local.personalNames",            # Personal names
    "local.corporateNames",           # Corporate names
    "local.geographicNames",          # Geographic names
    "local.uniformTitleWorks",        # Works
    "local.uniformTitleExpressions",  # Expressions
    "local.mainHeadingEl",            # Preferred headings
    "Xlocal.names",                   # Exact headings
    "local.title",                    # Bibliographic titles
]
|
|
31
|
+
|
|
32
|
+
# Maps rara-linker's entity types to the corresponding VIAF search fields.
# Each value is expected to be one of the field names in ALLOWED_VIAF_FIELDS.
VIAF_ENTITY_MAP = {
    EntityType.PER: "local.personalNames",
    EntityType.ORG: "local.corporateNames",
    # Must be spelled "local." (not "loca.") to match the entry in
    # ALLOWED_VIAF_FIELDS.
    EntityType.LOC: "local.geographicNames",
    EntityType.TITLE: "local.uniformTitleWorks",
}

ALLOWED_VIAF_WIKILINK_LANGS = ["en", "et"]

# Default settings for VIAF record lookup and verification.
VIAF_SIMILARITY_THRESHOLD = 0.92
VERIFY_VIAF_RECORD = True
MAX_VIAF_RECORDS_TO_VERIFY = 10
|
|
@@ -4,7 +4,10 @@ from typing import List, Optional, Iterator
|
|
|
4
4
|
|
|
5
5
|
from rara_tools.constants import EMPTY_INDICATORS
|
|
6
6
|
from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
|
|
7
|
-
|
|
7
|
+
from rara_tools.constants.normalizers import (
|
|
8
|
+
DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
|
|
9
|
+
VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY
|
|
10
|
+
)
|
|
8
11
|
from glom import glom
|
|
9
12
|
import logging
|
|
10
13
|
import json
|
|
@@ -187,7 +190,7 @@ class RecordNormalizer:
|
|
|
187
190
|
"Collective": "111"
|
|
188
191
|
}
|
|
189
192
|
|
|
190
|
-
author_type = viaf_record.
|
|
193
|
+
author_type = viaf_record.name_type
|
|
191
194
|
tag = type_map.get(author_type, "100")
|
|
192
195
|
|
|
193
196
|
fields = [
|
|
@@ -195,9 +198,9 @@ class RecordNormalizer:
|
|
|
195
198
|
tag=tag,
|
|
196
199
|
indicators=EMPTY_INDICATORS,
|
|
197
200
|
subfields=[
|
|
198
|
-
Subfield("a", viaf_record.
|
|
199
|
-
Subfield("b", viaf_record.
|
|
200
|
-
Subfield("c", viaf_record.
|
|
201
|
+
Subfield("a", viaf_record.name),
|
|
202
|
+
Subfield("b", viaf_record.name_type), # Is this correct??
|
|
203
|
+
Subfield("c", viaf_record.name_type) # Is this correct??
|
|
201
204
|
]
|
|
202
205
|
)
|
|
203
206
|
]
|
|
@@ -231,32 +234,45 @@ class RecordNormalizer:
|
|
|
231
234
|
if entity:
|
|
232
235
|
return entity
|
|
233
236
|
else:
|
|
234
|
-
return record.
|
|
237
|
+
return record.name
|
|
238
|
+
|
|
239
|
+
def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None,
                     entity: Optional[str] = None, viaf_field: str = DEFAULT_VIAF_FIELD,
                     threshold: float = VIAF_SIMILARITY_THRESHOLD, verify: bool = VERIFY_VIAF_RECORD,
                     max_records: int = MAX_VIAF_RECORDS_TO_VERIFY
                     ) -> Optional[VIAFRecord]:
    """Look up a VIAF record by explicit ID or, failing that, by search term.

    Args:
        record: Record passed to ``_get_viaf_search_term`` to derive the
            search term when no ``viaf_id`` is given.
        viaf_id: VIAF identifier. When truthy, the record is fetched by ID
            and the search path is skipped.
        entity: Optional entity string passed to ``_get_viaf_search_term``.
        viaf_field: VIAF field forwarded to the search call.
        threshold: Similarity threshold forwarded to the search call.
        verify: Verification flag forwarded to the search call. When it is
            false a warning is logged before searching.
        max_records: Record limit forwarded to the search call.

    Returns:
        The first record returned for ``viaf_id``, or the result of the
        search-term lookup. ``None`` when the ID lookup yields nothing or
        when any exception is raised (the exception is logged, not
        re-raised).
    """
    found = None

    try:
        client = VIAFClient()

        if not viaf_id:
            search_term = self._get_viaf_search_term(record, entity)
            if not verify:
                notice = (
                    "Record verification is turned off. If multiple records are "
                    f"detected for search term '{search_term}', the first "
                    "result is automatically returned. This might lead to "
                    "some inaccuracies!"
                )
                logger.warning(notice)

            found = client.get_normalized_data_by_search_term(
                search_term=search_term,
                field=viaf_field,
                max_records=max_records,
                verify=verify,
                threshold=threshold
            )
        else:
            by_id = client.get_normalized_data_by_ids([viaf_id])
            if by_id:
                found = by_id[0]

    except Exception as err:
        # Best-effort lookup: failures are logged and the caller gets None.
        logger.error(
            f"Error fetching VIAF record with ID={viaf_id} / entity='{entity}': {err}"
        )
    return found
|
|
260
276
|
|
|
261
277
|
def _normalize_record(self, record: Record, sierraID: str,
|
|
262
278
|
viaf_record: VIAFRecord, is_editing_existing_record: bool) -> Record:
|