rara-tools 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- rara_tools/constants/normalizers.py +38 -0
- rara_tools/normalizers/base.py +40 -24
- rara_tools/normalizers/viaf.py +496 -47
- {rara_tools-0.5.2.dist-info → rara_tools-0.5.3.dist-info}/METADATA +2 -1
- {rara_tools-0.5.2.dist-info → rara_tools-0.5.3.dist-info}/RECORD +8 -8
- {rara_tools-0.5.2.dist-info → rara_tools-0.5.3.dist-info}/WHEEL +0 -0
- {rara_tools-0.5.2.dist-info → rara_tools-0.5.3.dist-info}/licenses/LICENSE.md +0 -0
- {rara_tools-0.5.2.dist-info → rara_tools-0.5.3.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,44 @@
|
|
|
1
1
|
from pymarc import Indicators
|
|
2
2
|
import os
|
|
3
3
|
|
|
4
|
+
class EntityType:
|
|
5
|
+
PER = "PER"
|
|
6
|
+
ORG = "ORG"
|
|
7
|
+
KEYWORD = "EMS_KEYWORD"
|
|
8
|
+
LOC = "LOC"
|
|
9
|
+
TITLE = "TITLE"
|
|
10
|
+
UNK = "UNKNOWN"
|
|
11
|
+
|
|
12
|
+
|
|
4
13
|
EMPTY_INDICATORS = Indicators(" ", " ")
|
|
5
14
|
VIAF_ALLOWED_SOURCES = ["LC", "DNB", "LNB", "NLL",
|
|
6
15
|
"ERRR", "J9U"]
|
|
16
|
+
|
|
17
|
+
DEFAULT_VIAF_FIELD = "local.names"
|
|
18
|
+
|
|
19
|
+
ALLOWED_VIAF_FIELDS = [
|
|
20
|
+
"cql.any", # All fields
|
|
21
|
+
"local.names", # All headings
|
|
22
|
+
"local.personalNames", # Personal names
|
|
23
|
+
"local.corporateNames", # Corporate names
|
|
24
|
+
"local.geographicNames", # Geographic names
|
|
25
|
+
"local.uniformTitleWorks", # Works
|
|
26
|
+
"local.uniformTitleExpressions", # Expressions
|
|
27
|
+
"local.mainHeadingEl", # Preferred headings
|
|
28
|
+
"Xlocal.names", # Exact headings
|
|
29
|
+
"local.title" # Bibliographic titles
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
# Maps rara-linker entity types to the VIAF search field used when looking
# that kind of entity up. Every value must be one of ALLOWED_VIAF_FIELDS.
VIAF_ENTITY_MAP = {
    EntityType.PER: "local.personalNames",
    EntityType.ORG: "local.corporateNames",
    # Was "loca.geographicNames" (typo), which is not an allowed VIAF field.
    EntityType.LOC: "local.geographicNames",
    EntityType.TITLE: "local.uniformTitleWorks",
}
|
|
41
|
+
ALLOWED_VIAF_WIKILINK_LANGS = ["en", "et"]
|
|
42
|
+
VIAF_SIMILARITY_THRESHOLD = 0.92
|
|
43
|
+
VERIFY_VIAF_RECORD = True
|
|
44
|
+
MAX_VIAF_RECORDS_TO_VERIFY = 10
|
rara_tools/normalizers/base.py
CHANGED
|
@@ -4,7 +4,10 @@ from typing import List, Optional, Iterator
|
|
|
4
4
|
|
|
5
5
|
from rara_tools.constants import EMPTY_INDICATORS
|
|
6
6
|
from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
|
|
7
|
-
|
|
7
|
+
from rara_tools.constants.normalizers import (
|
|
8
|
+
DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
|
|
9
|
+
VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY
|
|
10
|
+
)
|
|
8
11
|
from glom import glom
|
|
9
12
|
import logging
|
|
10
13
|
import json
|
|
@@ -187,7 +190,7 @@ class RecordNormalizer:
|
|
|
187
190
|
"Collective": "111"
|
|
188
191
|
}
|
|
189
192
|
|
|
190
|
-
author_type = viaf_record.
|
|
193
|
+
author_type = viaf_record.name_type
|
|
191
194
|
tag = type_map.get(author_type, "100")
|
|
192
195
|
|
|
193
196
|
fields = [
|
|
@@ -195,9 +198,9 @@ class RecordNormalizer:
|
|
|
195
198
|
tag=tag,
|
|
196
199
|
indicators=EMPTY_INDICATORS,
|
|
197
200
|
subfields=[
|
|
198
|
-
Subfield("a", viaf_record.
|
|
199
|
-
Subfield("b", viaf_record.
|
|
200
|
-
Subfield("c", viaf_record.
|
|
201
|
+
Subfield("a", viaf_record.name),
|
|
202
|
+
Subfield("b", viaf_record.name_type), # Is this correct??
|
|
203
|
+
Subfield("c", viaf_record.name_type) # Is this correct??
|
|
201
204
|
]
|
|
202
205
|
)
|
|
203
206
|
]
|
|
@@ -231,32 +234,45 @@ class RecordNormalizer:
|
|
|
231
234
|
if entity:
|
|
232
235
|
return entity
|
|
233
236
|
else:
|
|
234
|
-
return record.
|
|
237
|
+
return record.name
|
|
238
|
+
|
|
239
|
+
def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None,
|
|
240
|
+
entity: Optional[str] = None, viaf_field: str = DEFAULT_VIAF_FIELD,
|
|
241
|
+
threshold: float = VIAF_SIMILARITY_THRESHOLD, verify: bool = VERIFY_VIAF_RECORD,
|
|
242
|
+
max_records: int = MAX_VIAF_RECORDS_TO_VERIFY
|
|
243
|
+
) -> Optional[VIAFRecord]:
|
|
244
|
+
viaf_record = None
|
|
235
245
|
|
|
236
|
-
def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None, entity: Optional[str] = None) -> Optional[VIAFRecord]:
|
|
237
246
|
try:
|
|
238
247
|
viaf_client = VIAFClient()
|
|
239
248
|
|
|
240
249
|
if viaf_id:
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
250
|
+
viaf_records = viaf_client.get_normalized_data_by_ids([viaf_id])
|
|
251
|
+
if viaf_records:
|
|
252
|
+
viaf_record = viaf_records[0]
|
|
253
|
+
else:
|
|
254
|
+
search_term = self._get_viaf_search_term(record, entity)
|
|
255
|
+
if not verify:
|
|
256
|
+
logger.warning(
|
|
257
|
+
f"Record verification is turned off. If multiple records are " \
|
|
258
|
+
f"detected for search term '{search_term}', the first " \
|
|
259
|
+
f"result is automatically returned. This might lead to " \
|
|
260
|
+
f"some inaccuracies!"
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
viaf_record = viaf_client.get_normalized_data_by_search_term(
|
|
264
|
+
search_term=search_term,
|
|
265
|
+
field=viaf_field,
|
|
266
|
+
max_records=max_records,
|
|
267
|
+
verify=verify,
|
|
268
|
+
threshold=threshold
|
|
269
|
+
)
|
|
257
270
|
|
|
258
271
|
except Exception as e:
|
|
259
|
-
logger.error(
|
|
272
|
+
logger.error(
|
|
273
|
+
f"Error fetching VIAF record with ID={viaf_id} / entity='{entity}': {e}"
|
|
274
|
+
)
|
|
275
|
+
return viaf_record
|
|
260
276
|
|
|
261
277
|
def _normalize_record(self, record: Record, sierraID: str,
|
|
262
278
|
viaf_record: VIAFRecord, is_editing_existing_record: bool) -> Record:
|
rara_tools/normalizers/viaf.py
CHANGED
|
@@ -1,22 +1,44 @@
|
|
|
1
1
|
import requests
|
|
2
2
|
import json
|
|
3
|
-
|
|
3
|
+
import regex as re
|
|
4
|
+
from typing import List, Dict
|
|
4
5
|
from collections import defaultdict
|
|
6
|
+
from jellyfish import jaro_winkler_similarity as jw
|
|
7
|
+
from requests.models import Response
|
|
8
|
+
from rara_tools.parsers.tools.entity_normalizers import PersonalName, Normalizer
|
|
9
|
+
from rara_tools.constants.normalizers import (
|
|
10
|
+
DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
|
|
11
|
+
VIAF_SIMILARITY_THRESHOLD
|
|
12
|
+
)
|
|
5
13
|
|
|
6
14
|
import logging
|
|
7
15
|
logger = logging.getLogger(__name__)
|
|
8
16
|
|
|
9
|
-
|
|
10
17
|
class VIAFRecord:
|
|
18
|
+
""" Takes in a VIAF query response JSON and wraps
|
|
19
|
+
information extraction from it.
|
|
20
|
+
"""
|
|
11
21
|
def __init__(self,
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
22
|
+
record: dict,
|
|
23
|
+
allowed_sources: List[str] = [
|
|
24
|
+
"LC", "DNB", "LNB", "NLL", "ERRR", "J9U"
|
|
25
|
+
]
|
|
26
|
+
):
|
|
27
|
+
""" Initializes VIAFRecord class.
|
|
28
|
+
|
|
29
|
+
Parameters
|
|
30
|
+
-----------
|
|
31
|
+
record: dict
|
|
32
|
+
VIAF query response JSON.
|
|
33
|
+
allowed_sources: List[str]
|
|
34
|
+
Only extracts information from these sources. Other
|
|
35
|
+
sources are ignored.
|
|
36
|
+
"""
|
|
16
37
|
self.__record: dict = record
|
|
17
38
|
self.__record_data: dict = {}
|
|
18
39
|
self.__allowed_sources: List[str] = allowed_sources
|
|
19
40
|
self.__viaf_id: int = None
|
|
41
|
+
self.__viaf_url: str = ""
|
|
20
42
|
self.__name_variations: List[str] = []
|
|
21
43
|
self.__birth_date: str = None
|
|
22
44
|
self.__death_date: str = None
|
|
@@ -24,23 +46,156 @@ class VIAFRecord:
|
|
|
24
46
|
self.__all_fields: dict = {}
|
|
25
47
|
self.__nationality: str = ""
|
|
26
48
|
self.__has_isni: bool = False
|
|
27
|
-
self.
|
|
28
|
-
self.
|
|
49
|
+
self.__name: str = ""
|
|
50
|
+
self.__name_type: str = ""
|
|
29
51
|
self.__has_isni: str = ""
|
|
30
52
|
self.__activity_start: str = None
|
|
31
|
-
self.__activity_end: str = None
|
|
53
|
+
# No trailing comma here: `= None,` would store the tuple (None,), which is truthy.
self.__activity_end: str = None
|
|
54
|
+
self.__works: List[str] = []
|
|
55
|
+
self.__wikilinks: dict = {}
|
|
56
|
+
self.__all_wikilinks: List[str] = []
|
|
57
|
+
self.__has_isni: bool | None = None
|
|
58
|
+
self.__marc_400: List[dict] = []
|
|
59
|
+
self.__marc_500: List[dict] = []
|
|
60
|
+
self.__marc_main: List[dict] = []
|
|
61
|
+
self.__subfield_indicator: str = ""
|
|
62
|
+
|
|
63
|
+
self.__value_fields: List[str] = [
|
|
64
|
+
"text", "value", "title", "datafield"
|
|
65
|
+
]
|
|
66
|
+
self.__title_types: List[str] = ["UniformTitleWork"]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def __get_data(self, field_name: str, subfield_name: str = "data",
|
|
70
|
+
allowed_sources: List[str] = []
|
|
71
|
+
) -> List[str]:
|
|
72
|
+
|
|
73
|
+
if not allowed_sources:
|
|
74
|
+
allowed_sources = self.__allowed_sources
|
|
75
|
+
|
|
76
|
+
data = []
|
|
77
|
+
|
|
78
|
+
try:
|
|
79
|
+
entries = self.record_data.get(
|
|
80
|
+
field_name, {}
|
|
81
|
+
).get(subfield_name, [])
|
|
82
|
+
|
|
83
|
+
for entry in entries:
|
|
84
|
+
sources = entry.get("sources", {}).get("s", [])
|
|
85
|
+
if set(allowed_sources).intersection(set(sources)):
|
|
86
|
+
for field in self.__value_fields:
|
|
87
|
+
value = entry.get(field, "")
|
|
88
|
+
if value:
|
|
89
|
+
data.append(value)
|
|
90
|
+
break
|
|
91
|
+
except Exception as e:
|
|
92
|
+
logger.error(
|
|
93
|
+
f"Failed extracting data from field '{field_name}' with subfield " \
|
|
94
|
+
f"'{subfield_name}'. '{field_name}' dict has the following " \
|
|
95
|
+
f"structure: {self.record_data.get(field_name)}. " \
|
|
96
|
+
f"Exception reason: {e}."
|
|
97
|
+
)
|
|
98
|
+
return data
|
|
99
|
+
|
|
100
|
+
def _get_wikilink_lang(self, wikilink: str) -> str:
|
|
101
|
+
""" Parses the language of the Wikipedia page
|
|
102
|
+
from wikilink.
|
|
103
|
+
"""
|
|
104
|
+
pattern = r"(?<=https\W{3})\w+(?=[.])"
|
|
105
|
+
match = re.search(pattern, wikilink)
|
|
106
|
+
wikilink_lang = ""
|
|
107
|
+
if match:
|
|
108
|
+
wikilink_lang = match.group()
|
|
109
|
+
return wikilink_lang
|
|
110
|
+
|
|
111
|
+
def _get_marc_field(self, marc_dict: dict, subfield: str = "a") -> str:
|
|
112
|
+
value = ""
|
|
113
|
+
if marc_dict.get("dtype", "") == "MARC21":
|
|
114
|
+
subfields = marc_dict.get("subfield", [])
|
|
115
|
+
for _subfield in subfields:
|
|
116
|
+
if _subfield.get("code", "") == subfield:
|
|
117
|
+
value = _subfield.get("value", "")
|
|
118
|
+
break
|
|
119
|
+
return value
|
|
120
|
+
|
|
121
|
+
def _get_marc_tag(self, marc_dict: dict) -> str:
|
|
122
|
+
tag = ""
|
|
123
|
+
if marc_dict.get("dtype", "") == "MARC21":
|
|
124
|
+
tag = marc_dict.get("tag", "")
|
|
125
|
+
return tag
|
|
126
|
+
|
|
127
|
+
def _get_names(self, marc_dicts: List[dict]) -> List[str]:
|
|
128
|
+
names_d = defaultdict(int)
|
|
129
|
+
for marc_dict in marc_dicts:
|
|
130
|
+
name = self._get_marc_field(marc_dict, self.subfield_indicator)
|
|
131
|
+
names_d[name]+=1
|
|
132
|
+
name_list = sorted(
|
|
133
|
+
list(names_d.items()),
|
|
134
|
+
key=lambda x: x[1],
|
|
135
|
+
reverse=True
|
|
136
|
+
)
|
|
137
|
+
names = []
|
|
138
|
+
for n in name_list:
|
|
139
|
+
_name = self._strip_punctuation(n[0])
|
|
140
|
+
if _name not in names:
|
|
141
|
+
names.append(_name)
|
|
142
|
+
|
|
143
|
+
return names
|
|
144
|
+
|
|
145
|
+
def _get_name(self, marc_dicts: List[dict]) -> str:
|
|
146
|
+
names = self._get_names(marc_dicts)
|
|
147
|
+
name = ""
|
|
148
|
+
if names:
|
|
149
|
+
name = names[0]
|
|
150
|
+
return name
|
|
151
|
+
|
|
152
|
+
def _strip_punctuation(self, entity: str) -> str:
|
|
153
|
+
entity = entity.strip(",")
|
|
154
|
+
# Strip "." only if the last token is not an initial,
|
|
155
|
+
# e.g: "Meri, Lennart." -> Strip
|
|
156
|
+
# "Meri, L." -> Do not strip.
|
|
157
|
+
ent_tokens = entity.split()
|
|
158
|
+
if len(ent_tokens[-1]) > 2:
|
|
159
|
+
entity = entity.strip(".")
|
|
160
|
+
return entity
|
|
161
|
+
|
|
162
|
+
def _strip_parenthesis(self, entity: str) -> str:
|
|
163
|
+
""" Strip information in parenthesis from VIAF records
|
|
164
|
+
in order to compare the records more easily.
|
|
165
|
+
"""
|
|
166
|
+
_entity = re.sub(r"[(][^)][)]", "", entity)
|
|
167
|
+
return _entity.strip()
|
|
168
|
+
|
|
169
|
+
@property
|
|
170
|
+
def subfield_indicator(self) -> str:
|
|
171
|
+
if not self.__subfield_indicator:
|
|
172
|
+
if self.name_type in self.__title_types:
|
|
173
|
+
subfield_name = "t"
|
|
174
|
+
else:
|
|
175
|
+
subfield_name = "a"
|
|
176
|
+
self.__subfield_indicator = subfield_name
|
|
177
|
+
return self.__subfield_indicator
|
|
32
178
|
|
|
33
179
|
@property
|
|
34
|
-
def
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
180
|
+
def name(self) -> str:
|
|
181
|
+
# author -> name
|
|
182
|
+
if not self.__name:
|
|
183
|
+
if self.marc_main:
|
|
184
|
+
self.__name = self._get_name(self.marc_main)
|
|
185
|
+
else:
|
|
186
|
+
names = self.__get_data("mainHeadings", "data")
|
|
187
|
+
if names:
|
|
188
|
+
self.__name = names[0]
|
|
189
|
+
return self.__name
|
|
38
190
|
|
|
39
191
|
@property
|
|
40
|
-
def
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
192
|
+
def name_type(self) -> str:
|
|
193
|
+
# author_type -> name_type
|
|
194
|
+
""" Type of name (personal, corporate, title, etc)
|
|
195
|
+
"""
|
|
196
|
+
if not self.__name_type:
|
|
197
|
+
self.__name_type = self.record_data.get("nameType")
|
|
198
|
+
return self.__name_type
|
|
44
199
|
|
|
45
200
|
@property
|
|
46
201
|
def viaf_id(self) -> int:
|
|
@@ -49,18 +204,17 @@ class VIAFRecord:
|
|
|
49
204
|
return self.__viaf_id
|
|
50
205
|
|
|
51
206
|
@property
|
|
52
|
-
def
|
|
53
|
-
|
|
207
|
+
def viaf_url(self) -> str:
|
|
208
|
+
if not self.__viaf_url:
|
|
209
|
+
self.__viaf_url = self.record_data.get(
|
|
210
|
+
"Document", {}).get("about", "")
|
|
211
|
+
return self.__viaf_url
|
|
54
212
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
sources = entry.get("sources", {}).get("s", [])
|
|
61
|
-
if set(self.__allowed_sources).intersection(set(sources)):
|
|
62
|
-
data.append(entry.get("text", ""))
|
|
63
|
-
return data
|
|
213
|
+
@property
def has_isni(self) -> bool:
    """ Whether the record data contains a non-empty "isni" entry.

    The value is computed on first access and cached afterwards.
    """
    # `None` marks "not computed yet"; compare by identity, since a
    # cached False must not trigger a recomputation.
    if self.__has_isni is None:
        self.__has_isni = bool(self.record_data.get("isni", ""))
    return self.__has_isni
|
|
64
218
|
|
|
65
219
|
@property
|
|
66
220
|
def record_data(self) -> dict:
|
|
@@ -75,7 +229,18 @@ class VIAFRecord:
|
|
|
75
229
|
@property
|
|
76
230
|
def name_variations(self) -> List[str]:
|
|
77
231
|
if not self.__name_variations:
|
|
78
|
-
|
|
232
|
+
if self.marc_400:
|
|
233
|
+
var_1 = self._get_names(self.marc_400)
|
|
234
|
+
var_2 = self._get_names(self.marc_main)
|
|
235
|
+
_vars = var_1 + var_2
|
|
236
|
+
|
|
237
|
+
else:
|
|
238
|
+
_vars = self.__get_data("mainHeadings")
|
|
239
|
+
vars_3 = [Normalizer.clean_entity(v) for v in _vars]
|
|
240
|
+
|
|
241
|
+
vars = _vars + vars_3
|
|
242
|
+
#print(vars)
|
|
243
|
+
self.__name_variations = list(set(vars))
|
|
79
244
|
return self.__name_variations
|
|
80
245
|
|
|
81
246
|
@property
|
|
@@ -117,14 +282,75 @@ class VIAFRecord:
|
|
|
117
282
|
nationalities_dict[n.lower()] += 1
|
|
118
283
|
if nationalities:
|
|
119
284
|
self.__nationality = sorted(
|
|
120
|
-
nationalities_dict.items(),
|
|
285
|
+
nationalities_dict.items(),
|
|
286
|
+
key=lambda x: x[1],
|
|
287
|
+
reverse=True
|
|
288
|
+
)[0][0]
|
|
121
289
|
return self.__nationality
|
|
122
290
|
|
|
291
|
+
@property
|
|
292
|
+
def works(self) -> List[str]:
|
|
293
|
+
if not self.__works:
|
|
294
|
+
self.__works = list(set(self.__get_data(
|
|
295
|
+
field_name="titles",
|
|
296
|
+
subfield_name="work"
|
|
297
|
+
)))
|
|
298
|
+
return self.__works
|
|
299
|
+
|
|
300
|
+
@property
|
|
301
|
+
def all_wikilinks(self) -> List[str]:
|
|
302
|
+
if not self.__all_wikilinks:
|
|
303
|
+
self.__all_wikilinks = self.__get_data(
|
|
304
|
+
field_name="xLinks", subfield_name="xLink",
|
|
305
|
+
allowed_sources=["WKP"]
|
|
306
|
+
)
|
|
307
|
+
return self.__all_wikilinks
|
|
308
|
+
|
|
309
|
+
@property
|
|
310
|
+
def wikilinks(self) -> dict:
|
|
311
|
+
if not self.__wikilinks:
|
|
312
|
+
for wikilink in self.all_wikilinks:
|
|
313
|
+
wikilink_lang = self._get_wikilink_lang(wikilink)
|
|
314
|
+
if wikilink_lang and wikilink_lang in ALLOWED_VIAF_WIKILINK_LANGS:
|
|
315
|
+
self.__wikilinks[wikilink_lang] = wikilink
|
|
316
|
+
return self.__wikilinks
|
|
317
|
+
|
|
318
|
+
@property
|
|
319
|
+
def marc_400(self) -> List[dict]:
|
|
320
|
+
if not self.__marc_400:
|
|
321
|
+
self.__marc_400 = self.__get_data(
|
|
322
|
+
field_name="x400s",
|
|
323
|
+
subfield_name="x400"
|
|
324
|
+
)
|
|
325
|
+
return self.__marc_400
|
|
326
|
+
|
|
327
|
+
@property
|
|
328
|
+
def marc_500(self) -> List[dict]:
|
|
329
|
+
if not self.__marc_500:
|
|
330
|
+
self.__marc_500 = self.__get_data(
|
|
331
|
+
field_name="x500s",
|
|
332
|
+
subfield_name="x500"
|
|
333
|
+
)
|
|
334
|
+
return self.__marc_500
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
@property
|
|
338
|
+
def marc_main(self) -> List[dict]:
|
|
339
|
+
if not self.__marc_main:
|
|
340
|
+
self.__marc_main = self.__get_data(
|
|
341
|
+
field_name="mainHeadings",
|
|
342
|
+
subfield_name="mainHeadingEl"
|
|
343
|
+
)
|
|
344
|
+
return self.__marc_main
|
|
345
|
+
|
|
123
346
|
@property
|
|
124
347
|
def all_fields(self) -> dict:
|
|
125
348
|
if not self.__all_fields:
|
|
126
349
|
self.__all_fields = {
|
|
127
350
|
"viaf_id": self.viaf_id,
|
|
351
|
+
"viaf_url": self.viaf_url,
|
|
352
|
+
"name": self.name,
|
|
353
|
+
"name_type": self.name_type,
|
|
128
354
|
"name_variations": self.name_variations,
|
|
129
355
|
"birth_date": self.birth_date,
|
|
130
356
|
"death_date": self.death_date,
|
|
@@ -133,31 +359,170 @@ class VIAFRecord:
|
|
|
133
359
|
"activity_start": self.activity_start,
|
|
134
360
|
"activity_end": self.activity_end,
|
|
135
361
|
"has_isni": self.has_isni,
|
|
136
|
-
"
|
|
362
|
+
"works": self.works,
|
|
363
|
+
"wikilinks": self.wikilinks,
|
|
364
|
+
"marc_400": self.marc_400,
|
|
365
|
+
"marc_500": self.marc_500,
|
|
366
|
+
"marc_main": self.marc_main
|
|
137
367
|
}
|
|
138
368
|
return self.__all_fields
|
|
139
369
|
|
|
140
370
|
|
|
141
371
|
class VIAFClient:
|
|
142
372
|
def __init__(self, viaf_api_url: str = "https://viaf.org/api"):
|
|
143
|
-
self.root_url = viaf_api_url.strip("/")
|
|
144
|
-
self.record_url = f"{self.root_url}/cluster-record"
|
|
145
|
-
self.search_url = f"{self.root_url}/search"
|
|
146
|
-
self.headers = {
|
|
373
|
+
self.root_url: str = viaf_api_url.strip("/")
|
|
374
|
+
self.record_url: str = f"{self.root_url}/cluster-record"
|
|
375
|
+
self.search_url: str = f"{self.root_url}/search"
|
|
376
|
+
self.headers: dict = {
|
|
147
377
|
"Accept": "application/json",
|
|
148
378
|
"Content-Type": "application/json"
|
|
149
379
|
}
|
|
150
380
|
|
|
151
|
-
def
|
|
381
|
+
def check_search_term_query(self) -> bool:
    """ Check whether the VIAF search term query works as expected.

    Searches for the fixed test entity "Lennart Meri" (one record, no
    verification) and expects the returned record's name to be
    "Meri, Lennart".

    Returns
    -----------
    bool
        True if the expected record was returned, False otherwise
        (an error is logged in that case).
    """
    test_entity = "Lennart Meri"
    record = self.get_normalized_data_by_search_term(
        search_term=test_entity,
        max_records=1,
        verify=False
    )
    # A missing record and a record with an unexpected name both count
    # as a failure.
    success = bool(record) and record.name == "Meri, Lennart"
    if not success:
        logger.error("VIAF search term query has changed or not working!")
    return success
|
|
401
|
+
|
|
402
|
+
def check_id_query(self) -> bool:
    """ Check whether the VIAF ID query works as expected.

    Fetches the fixed test record with VIAF ID "84153775" and expects
    its name to be "Meri, Lennart".

    Returns
    -----------
    bool
        True if the expected record was returned, False otherwise
        (an error is logged in that case).
    """
    test_id = "84153775"
    records = self.get_normalized_data_by_ids([test_id])
    # No records and a first record with an unexpected name both count
    # as a failure.
    success = bool(records) and records[0].name == "Meri, Lennart"
    if not success:
        logger.error("VIAF ID query has changed or not working!")
    return success
|
|
419
|
+
|
|
420
|
+
@staticmethod
|
|
421
|
+
def verify(entity: str, viaf_record: VIAFRecord,
|
|
422
|
+
threshold: float = VIAF_SIMILARITY_THRESHOLD
|
|
423
|
+
) -> dict:
|
|
424
|
+
""" Verifies, if entity to link is sufficiently
|
|
425
|
+
similar to a VIAF Record based on name forms in
|
|
426
|
+
VIAFRecord.name_variations.
|
|
427
|
+
|
|
428
|
+
Parameters
|
|
429
|
+
------------
|
|
430
|
+
entity: str
|
|
431
|
+
Entity queried from VIAF.
|
|
432
|
+
viaf_record: VIAFRecord
|
|
433
|
+
A VIAFRecord object.
|
|
434
|
+
threshold: float
|
|
435
|
+
Min similarity threshold for a verified result
|
|
436
|
+
Should be a float between 0 and 1.
|
|
437
|
+
|
|
438
|
+
Returns
|
|
439
|
+
------------
|
|
440
|
+
dict
|
|
441
|
+
Dict with keys:
|
|
442
|
+
verified: bool
|
|
443
|
+
If the VIAFRecord was verified to be
|
|
444
|
+
sufficiently similar.
|
|
445
|
+
most_similar_record: str
|
|
446
|
+
The most similar string to entity
|
|
447
|
+
in VIAFRecord.name_variations.
|
|
448
|
+
score: float
|
|
449
|
+
Similarity score of the most similar record.
|
|
450
|
+
"""
|
|
451
|
+
# might not always be personal name, but shouldn't break anything
|
|
452
|
+
if len(entity.split()) > 1:
|
|
453
|
+
pn = PersonalName(entity)
|
|
454
|
+
name_forms = [pn.last_comma_first, pn.first_last]
|
|
455
|
+
else:
|
|
456
|
+
name_forms = [entity]
|
|
457
|
+
max_similarity = 0
|
|
458
|
+
most_similar_record = ""
|
|
459
|
+
verified = False
|
|
460
|
+
for var in viaf_record.name_variations:
|
|
461
|
+
for name_form in name_forms:
|
|
462
|
+
score = jw(name_form.lower(), var.lower())
|
|
463
|
+
if score > max_similarity:
|
|
464
|
+
max_similarity = score
|
|
465
|
+
most_similar_record = var
|
|
466
|
+
if score >= threshold:
|
|
467
|
+
logger.info(
|
|
468
|
+
f"Verification successful! '{name_form}' sufficiently " \
|
|
469
|
+
f"similar to '{var}'! Score = {score}."
|
|
470
|
+
)
|
|
471
|
+
verified = True
|
|
472
|
+
break
|
|
473
|
+
if verified:
|
|
474
|
+
break
|
|
475
|
+
out = {
|
|
476
|
+
"verified": verified,
|
|
477
|
+
"most_similar_record": most_similar_record,
|
|
478
|
+
"score": max_similarity
|
|
479
|
+
}
|
|
480
|
+
return out
|
|
481
|
+
|
|
482
|
+
@staticmethod
|
|
483
|
+
def get_verified_record(search_term: str, viaf_records: List[VIAFRecord],
|
|
484
|
+
threshold: float = VIAF_SIMILARITY_THRESHOLD
|
|
485
|
+
) -> VIAFRecord:
|
|
486
|
+
""" Takes in n VIAFRecords found while searching the term `search_term`.
|
|
487
|
+
Returns the most similar VIAFRecord.
|
|
488
|
+
"""
|
|
489
|
+
verified_record = None
|
|
490
|
+
max_score = 0
|
|
491
|
+
most_similar_record = ""
|
|
492
|
+
for record in viaf_records:
|
|
493
|
+
verified = VIAFClient.verify(search_term, record, threshold)
|
|
494
|
+
if verified.get("score") > max_score:
|
|
495
|
+
most_similar_record = verified.get("most_similar_record")
|
|
496
|
+
max_score = verified.get("score")
|
|
497
|
+
if verified.get("verified"):
|
|
498
|
+
verified_record = record
|
|
499
|
+
break
|
|
500
|
+
if not verified_record:
|
|
501
|
+
logger.error(
|
|
502
|
+
f"Verification failed. No matched record surpassed the set similarity " \
|
|
503
|
+
f"threshold ({threshold}). Closest match for search term '{search_term}' was " \
|
|
504
|
+
f"'{most_similar_record}' with similarity score {max_score} "
|
|
505
|
+
)
|
|
506
|
+
return verified_record
|
|
507
|
+
|
|
508
|
+
def _send_request(self, url: str, data: dict) -> Response:
|
|
152
509
|
return requests.post(url, data=json.dumps(data), headers=self.headers)
|
|
153
510
|
|
|
154
511
|
def get_records_by_search_term(self,
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
512
|
+
search_term: str,
|
|
513
|
+
index: str = "VIAF",
|
|
514
|
+
field: str = DEFAULT_VIAF_FIELD,
|
|
515
|
+
page_index: int = 0,
|
|
516
|
+
page_size: int = 50
|
|
517
|
+
) -> Response:
|
|
518
|
+
""" Query VIAF records by search term.
|
|
519
|
+
"""
|
|
520
|
+
if field and field not in ALLOWED_VIAF_FIELDS:
|
|
521
|
+
logger.error(
|
|
522
|
+
f"Field '{field}' is not allowed. Defaulting to '{DEFAULT_VIAF_FIELD}'. " \
|
|
523
|
+
f"Allowed VIAF fields are: {ALLOWED_VIAF_FIELDS}. "
|
|
524
|
+
)
|
|
525
|
+
field = DEFAULT_VIAF_FIELD
|
|
161
526
|
data = {
|
|
162
527
|
"reqValues": {
|
|
163
528
|
"field": field,
|
|
@@ -173,18 +538,58 @@ class VIAFClient:
|
|
|
173
538
|
response = self._send_request(url=self.search_url, data=data)
|
|
174
539
|
return response
|
|
175
540
|
|
|
176
|
-
def get_records_by_viaf_id(self, record_id: str) ->
|
|
541
|
+
def get_records_by_viaf_id(self, record_id: str) -> Response:
|
|
542
|
+
""" Query VIAF records by ID.
|
|
543
|
+
"""
|
|
177
544
|
data = {
|
|
178
545
|
"reqValues": {
|
|
179
546
|
"recordId": str(record_id)
|
|
180
547
|
}
|
|
181
548
|
}
|
|
182
549
|
response = self._send_request(url=self.record_url, data=data)
|
|
183
|
-
|
|
184
550
|
return response
|
|
185
551
|
|
|
186
|
-
def
|
|
552
|
+
def extract_viaf_ids(self, search_query_response: Response) -> List[str]:
    """ Parse VIAF ID-s from search query response.

    Parameters
    -----------
    search_query_response: Response
        Response of a VIAF search query; its JSON body is read from
        ["queryResult"]["records"]["record"].

    Returns
    -----------
    List[str]
        VIAF IDs of the records that could be parsed. Parsing problems
        are logged and never raised; an unparsable response yields [].
    """
    try:
        records = search_query_response.json()["queryResult"]["records"]["record"]
    except Exception as e:
        logger.error(
            f"Parsing records from search query " \
            f"response failed with error: {e}."
        )
        records = []

    viaf_ids = []
    # `or []` guards against a null "record" payload, which would
    # otherwise raise TypeError outside of any try block.
    for record in records or []:
        try:
            viaf_id = record["recordData"]["VIAFCluster"]["viafID"]
            viaf_ids.append(viaf_id)
        except Exception as e:
            logger.error(
                f"Extracting VIAF ID from record '{record}' " \
                f"failed with error: {e}"
            )
    return viaf_ids
|
|
574
|
+
|
|
575
|
+
def get_viaf_ids_by_search_terms(self,
|
|
576
|
+
search_term: str, field: str = DEFAULT_VIAF_FIELD,
|
|
577
|
+
viaf_index: str = "VIAF", page_size: int = 50
|
|
578
|
+
) -> List[str]:
|
|
579
|
+
""" Get all matching VIAF IDs for a search term.
|
|
580
|
+
"""
|
|
187
581
|
|
|
582
|
+
search_response = self.get_records_by_search_term(
|
|
583
|
+
search_term=search_term,
|
|
584
|
+
field=field,
|
|
585
|
+
index=viaf_index,
|
|
586
|
+
page_size=page_size
|
|
587
|
+
)
|
|
588
|
+
viaf_ids = self.extract_viaf_ids(search_response)
|
|
589
|
+
return viaf_ids
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def fetch_viaf_clusters(self, viaf_ids: List[str]) -> Dict[str, dict]:
|
|
188
593
|
results = {}
|
|
189
594
|
|
|
190
595
|
for viaf_id in viaf_ids:
|
|
@@ -198,7 +603,51 @@ class VIAFClient:
|
|
|
198
603
|
|
|
199
604
|
return results
|
|
200
605
|
|
|
201
|
-
def
|
|
606
|
+
def get_normalized_data_by_ids(self, record_ids: List[str]) -> List[VIAFRecord]:
|
|
202
607
|
""" Fetch data required for normalization from VIAF. """
|
|
203
608
|
response = self.fetch_viaf_clusters(record_ids)
|
|
204
|
-
|
|
609
|
+
viaf_records = [
|
|
610
|
+
VIAFRecord(response[record_id])
|
|
611
|
+
for record_id in record_ids
|
|
612
|
+
]
|
|
613
|
+
return viaf_records
|
|
614
|
+
|
|
615
|
+
def get_normalized_data_by_search_term(self,
        search_term: str, field: str = DEFAULT_VIAF_FIELD, max_records: int = 10,
        verify: bool = True, threshold: float = VIAF_SIMILARITY_THRESHOLD,
        viaf_index: str = "VIAF"
) -> VIAFRecord | None:
    """ Fetch data required for normalization from VIAF.

    Parameters
    -----------
    search_term: str
        Term to search from VIAF.
    field: str
        VIAF field to search from.
    max_records: int
        Number of search hits requested and, with `verify` enabled,
        the max number of records to verify.
    verify: bool
        If enabled, the first record passing
        VIAFClient.get_verified_record is returned; otherwise the
        first search hit is returned unchecked.
    threshold: float
        Min similarity threshold used for verification.
    viaf_index: str
        VIAF index to search from.

    Returns
    -----------
    VIAFRecord | None
        The selected record or None, if nothing (suitable) was found.
    """
    viaf_ids = self.get_viaf_ids_by_search_terms(
        search_term=search_term,
        field=field,
        page_size=max_records,
        viaf_index=viaf_index
    )
    if verify:
        records = self.get_normalized_data_by_ids(viaf_ids[:max_records])
        return VIAFClient.get_verified_record(
            search_term=search_term,
            viaf_records=records,
            threshold=threshold
        )

    # Without verification the first hit wins. Return None explicitly
    # when the search had no hits; previously this path raised
    # UnboundLocalError because the result variable was never assigned.
    if not viaf_ids:
        return None
    records = self.get_normalized_data_by_ids(viaf_ids[:1])
    return records[0] if records else None
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
if __name__ == "__main__":
|
|
644
|
+
from pprint import pprint
|
|
645
|
+
vc = VIAFClient()
|
|
646
|
+
entity="Kevade"
|
|
647
|
+
record = vc.get_normalized_data_by_search_term(entity, field="local.uniformTitleWorks", max_records=5, verify=True)
|
|
648
|
+
#pprint(record.record_data)
|
|
649
|
+
if record:
|
|
650
|
+
pprint(record.all_fields)
|
|
651
|
+
#pprint(record.record_data)
|
|
652
|
+
else:
|
|
653
|
+
print(f"Couldn't detect a verified record for entity '{entity}' :(.")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rara-tools
|
|
3
|
-
Version: 0.5.
|
|
3
|
+
Version: 0.5.3
|
|
4
4
|
Summary: Tools to support Kata's work.
|
|
5
5
|
Classifier: Programming Language :: Python :: 3
|
|
6
6
|
Classifier: Programming Language :: Python :: 3.10
|
|
@@ -18,6 +18,7 @@ Requires-Dist: nltk
|
|
|
18
18
|
Requires-Dist: jsonlines
|
|
19
19
|
Requires-Dist: requests
|
|
20
20
|
Requires-Dist: iso639-lang
|
|
21
|
+
Requires-Dist: jellyfish
|
|
21
22
|
Requires-Dist: pymarc
|
|
22
23
|
Requires-Dist: regex
|
|
23
24
|
Requires-Dist: glom
|
|
@@ -12,14 +12,14 @@ rara_tools/constants/general.py,sha256=i-OrySdsf05HzKWEI5CvWs3ZNsBZpZ5fhWVlU3m2Q
|
|
|
12
12
|
rara_tools/constants/language_evaluator.py,sha256=XtGAgspO2wGV4C2WhPN8zaxHkZ3d5FLgZ1PCvgZY9u0,37
|
|
13
13
|
rara_tools/constants/linker.py,sha256=XUI-fD1LfvpdMDeLmMU3siAsc0pleQ92m6Cdk3_OGmo,169
|
|
14
14
|
rara_tools/constants/meta_extractor.py,sha256=mhuRX4_I2JTnJO_d8tldClmuPx-RwmWWNLavZAJBgVU,33
|
|
15
|
-
rara_tools/constants/normalizers.py,sha256=
|
|
15
|
+
rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
|
|
16
16
|
rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
|
|
17
17
|
rara_tools/constants/subject_indexer.py,sha256=RBbUuhJM8M3GQ1p2GwDAeW5go7zkI5yiuMoL-3V2-NQ,34
|
|
18
18
|
rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
|
|
19
19
|
rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
|
|
20
|
-
rara_tools/normalizers/base.py,sha256=
|
|
20
|
+
rara_tools/normalizers/base.py,sha256=gsKG8NEOah_lwzY9kgCf68943xYoIIo6pPWZQuFHEuk,11818
|
|
21
21
|
rara_tools/normalizers/bibs.py,sha256=4DTS6k37z8qR5B3n7aiCXsT5Z49rLTvQ60lKKr5dyLs,2352
|
|
22
|
-
rara_tools/normalizers/viaf.py,sha256=
|
|
22
|
+
rara_tools/normalizers/viaf.py,sha256=XWpf_GONBGg8nsjGHoF4Vgk4S1xY3TcTIIwsTCNEyAQ,22298
|
|
23
23
|
rara_tools/parsers/marc_parsers/base_parser.py,sha256=wzCccZaiN4p2iUms3PAOfXihNgEeg1cGRzRx26ytJeA,1661
|
|
24
24
|
rara_tools/parsers/marc_parsers/ems_parser.py,sha256=LFuhZcVwmHMcJknX9p4ZkO8RdjPdQZ4APGbw8KV6BIs,2024
|
|
25
25
|
rara_tools/parsers/marc_parsers/location_parser.py,sha256=dSU9dQoGV5z0ajhLI1bn3AAghkOr79qKIrX7sO0_4lA,1873
|
|
@@ -34,8 +34,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=NyrubWvouZEb46vaoy9NHLCzn
|
|
|
34
34
|
rara_tools/parsers/tools/entity_normalizers.py,sha256=afOMqJoL4aeq0cfsohIuxkxzvqNdZ_ba7U32eyogbzk,8722
|
|
35
35
|
rara_tools/parsers/tools/marc_converter.py,sha256=PUbggzJ_wHfke_bHTF2LOZyzX1t0wRM8qIFL36Dl3AI,414
|
|
36
36
|
rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
|
|
37
|
-
rara_tools-0.5.
|
|
38
|
-
rara_tools-0.5.
|
|
39
|
-
rara_tools-0.5.
|
|
40
|
-
rara_tools-0.5.
|
|
41
|
-
rara_tools-0.5.
|
|
37
|
+
rara_tools-0.5.3.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
38
|
+
rara_tools-0.5.3.dist-info/METADATA,sha256=JoiwsfHX-dE2ZK3lo2gqaQaMFjr1bWL02Hmxa4y0J8E,4079
|
|
39
|
+
rara_tools-0.5.3.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
|
|
40
|
+
rara_tools-0.5.3.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
41
|
+
rara_tools-0.5.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|