rara-tools 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -9,3 +9,7 @@ class Status:
9
9
 
10
10
  class Queue:
11
11
  CORE = "core"
12
+
13
+
14
+ class Task:
15
+ SEND_VERSION = "send_version_to_core"
@@ -1,6 +1,44 @@
1
1
  from pymarc import Indicators
2
2
  import os
3
3
 
4
+ class EntityType:
5
+ PER = "PER"
6
+ ORG = "ORG"
7
+ KEYWORD = "EMS_KEYWORD"
8
+ LOC = "LOC"
9
+ TITLE = "TITLE"
10
+ UNK = "UNKNOWN"
11
+
12
+
4
13
  EMPTY_INDICATORS = Indicators(" ", " ")
5
14
  VIAF_ALLOWED_SOURCES = ["LC", "DNB", "LNB", "NLL",
6
15
  "ERRR", "J9U"]
16
+
17
+ DEFAULT_VIAF_FIELD = "local.names"
18
+
19
+ ALLOWED_VIAF_FIELDS = [
20
+ "cql.any", # All fields
21
+ "local.names", # All headings
22
+ "local.personalNames", # Personal names
23
+ "local.corporateNames", # Corporate names
24
+ "local.geographicNames", # Geographic names
25
+ "local.uniformTitleWorks", # Works
26
+ "local.uniformTitleExpressions", # Expressions
27
+ "local.mainHeadingEl", # Preferred headings
28
+ "Xlocal.names", # Exact headings
29
+ "local.title" # Bibliographic titles
30
+ ]
31
+
32
# Maps rara-linker entity types to the corresponding VIAF search fields.
# Every value must be one of ALLOWED_VIAF_FIELDS; a field that is not
# allowed is replaced with DEFAULT_VIAF_FIELD by the VIAF client.
VIAF_ENTITY_MAP = {
    EntityType.PER: "local.personalNames",
    EntityType.ORG: "local.corporateNames",
    EntityType.LOC: "local.geographicNames",
    EntityType.TITLE: "local.uniformTitleWorks",
}
41
+ ALLOWED_VIAF_WIKILINK_LANGS = ["en", "et"]
42
+ VIAF_SIMILARITY_THRESHOLD = 0.92
43
+ VERIFY_VIAF_RECORD = True
44
+ MAX_VIAF_RECORDS_TO_VERIFY = 10
@@ -4,7 +4,10 @@ from typing import List, Optional, Iterator
4
4
 
5
5
  from rara_tools.constants import EMPTY_INDICATORS
6
6
  from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
7
-
7
+ from rara_tools.constants.normalizers import (
8
+ DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
9
+ VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY
10
+ )
8
11
  from glom import glom
9
12
  import logging
10
13
  import json
@@ -187,7 +190,7 @@ class RecordNormalizer:
187
190
  "Collective": "111"
188
191
  }
189
192
 
190
- author_type = viaf_record.author_type
193
+ author_type = viaf_record.name_type
191
194
  tag = type_map.get(author_type, "100")
192
195
 
193
196
  fields = [
@@ -195,9 +198,9 @@ class RecordNormalizer:
195
198
  tag=tag,
196
199
  indicators=EMPTY_INDICATORS,
197
200
  subfields=[
198
- Subfield("a", viaf_record.author),
199
- Subfield("b", viaf_record.author_type),
200
- Subfield("c", viaf_record.author_type)
201
+ Subfield("a", viaf_record.name),
202
+ Subfield("b", viaf_record.name_type), # Is this correct??
203
+ Subfield("c", viaf_record.name_type) # Is this correct??
201
204
  ]
202
205
  )
203
206
  ]
@@ -231,32 +234,45 @@ class RecordNormalizer:
231
234
  if entity:
232
235
  return entity
233
236
  else:
234
- return record.author
237
+ return record.name
238
+
239
+ def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None,
240
+ entity: Optional[str] = None, viaf_field: str = DEFAULT_VIAF_FIELD,
241
+ threshold: float = VIAF_SIMILARITY_THRESHOLD, verify: bool = VERIFY_VIAF_RECORD,
242
+ max_records: int = MAX_VIAF_RECORDS_TO_VERIFY
243
+ ) -> Optional[VIAFRecord]:
244
+ viaf_record = None
235
245
 
236
- def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None, entity: Optional[str] = None) -> Optional[VIAFRecord]:
237
246
  try:
238
247
  viaf_client = VIAFClient()
239
248
 
240
249
  if viaf_id:
241
- viaf_info = viaf_client.get_records_by_viaf_id(viaf_id).json()
242
- return VIAFRecord(viaf_info)
243
-
244
- search_term = self._get_viaf_search_term(record, entity)
245
-
246
- results = viaf_client.get_records_by_search_term(
247
- search_term).json()
248
-
249
- num_records = glom(
250
- results, "queryResult.numberOfRecords.value", default=0)
251
-
252
- if num_records == 1:
253
- return VIAFRecord(results)
254
-
255
- logger.warning(
256
- f"Multiple VIAF records found for {search_term}: {num_records}. Skipping.")
250
+ viaf_records = viaf_client.get_normalized_data_by_ids([viaf_id])
251
+ if viaf_records:
252
+ viaf_record = viaf_records[0]
253
+ else:
254
+ search_term = self._get_viaf_search_term(record, entity)
255
+ if not verify:
256
+ logger.warning(
257
+ f"Record verification is turned off. If multiple records are " \
258
+ f"detected for search term '{search_term}', the first " \
259
+ f"result is automatically returned. This might lead to " \
260
+ f"some inaccuracies!"
261
+ )
262
+
263
+ viaf_record = viaf_client.get_normalized_data_by_search_term(
264
+ search_term=search_term,
265
+ field=viaf_field,
266
+ max_records=max_records,
267
+ verify=verify,
268
+ threshold=threshold
269
+ )
257
270
 
258
271
  except Exception as e:
259
- logger.error(f"Error fetching VIAF record: {e}")
272
+ logger.error(
273
+ f"Error fetching VIAF record with ID={viaf_id} / entity='{entity}': {e}"
274
+ )
275
+ return viaf_record
260
276
 
261
277
  def _normalize_record(self, record: Record, sierraID: str,
262
278
  viaf_record: VIAFRecord, is_editing_existing_record: bool) -> Record:
@@ -1,22 +1,44 @@
1
1
  import requests
2
2
  import json
3
- from typing import List
3
+ import regex as re
4
+ from typing import List, Dict
4
5
  from collections import defaultdict
6
+ from jellyfish import jaro_winkler_similarity as jw
7
+ from requests.models import Response
8
+ from rara_tools.parsers.tools.entity_normalizers import PersonalName, Normalizer
9
+ from rara_tools.constants.normalizers import (
10
+ DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
11
+ VIAF_SIMILARITY_THRESHOLD
12
+ )
5
13
 
6
14
  import logging
7
15
  logger = logging.getLogger(__name__)
8
16
 
9
-
10
17
  class VIAFRecord:
18
+ """ Takes in a VIAF query response JSON and wraps
19
+ information extraction from it.
20
+ """
11
21
  def __init__(self,
12
- record: dict,
13
- allowed_sources: List[str] = [
14
- "LC", "DNB", "LNB", "NLL", "ERRR", "J9U"]
15
- ):
22
+ record: dict,
23
+ allowed_sources: List[str] = [
24
+ "LC", "DNB", "LNB", "NLL", "ERRR", "J9U"
25
+ ]
26
+ ):
27
+ """ Initializes VIAFRecord class.
28
+
29
+ Parameters
30
+ -----------
31
+ record: dict
32
+ VIAF query response JSON.
33
+ allowed_sources: List[str]
34
+ Only extracts information from these sources. Other
35
+ sources are ignored.
36
+ """
16
37
  self.__record: dict = record
17
38
  self.__record_data: dict = {}
18
39
  self.__allowed_sources: List[str] = allowed_sources
19
40
  self.__viaf_id: int = None
41
+ self.__viaf_url: str = ""
20
42
  self.__name_variations: List[str] = []
21
43
  self.__birth_date: str = None
22
44
  self.__death_date: str = None
@@ -24,23 +46,156 @@ class VIAFRecord:
24
46
  self.__all_fields: dict = {}
25
47
  self.__nationality: str = ""
26
48
  self.__has_isni: bool = False
27
- self.__author: str = ""
28
- self.__author_type: str = None
49
+ self.__name: str = ""
50
+ self.__name_type: str = ""
29
51
  self.__has_isni: str = ""
30
52
  self.__activity_start: str = None
31
- self.__activity_end: str = None
53
+ self.__activity_end: str = None,
54
+ self.__works: List[str] = []
55
+ self.__wikilinks: dict = {}
56
+ self.__all_wikilinks: List[str] = []
57
+ self.__has_isni: bool | None = None
58
+ self.__marc_400: List[dict] = []
59
+ self.__marc_500: List[dict] = []
60
+ self.__marc_main: List[dict] = []
61
+ self.__subfield_indicator: str = ""
62
+
63
+ self.__value_fields: List[str] = [
64
+ "text", "value", "title", "datafield"
65
+ ]
66
+ self.__title_types: List[str] = ["UniformTitleWork"]
67
+
68
+
69
+ def __get_data(self, field_name: str, subfield_name: str = "data",
70
+ allowed_sources: List[str] = []
71
+ ) -> List[str]:
72
+
73
+ if not allowed_sources:
74
+ allowed_sources = self.__allowed_sources
75
+
76
+ data = []
77
+
78
+ try:
79
+ entries = self.record_data.get(
80
+ field_name, {}
81
+ ).get(subfield_name, [])
82
+
83
+ for entry in entries:
84
+ sources = entry.get("sources", {}).get("s", [])
85
+ if set(allowed_sources).intersection(set(sources)):
86
+ for field in self.__value_fields:
87
+ value = entry.get(field, "")
88
+ if value:
89
+ data.append(value)
90
+ break
91
+ except Exception as e:
92
+ logger.error(
93
+ f"Failed extracting data from field '{field_name}' with subfield " \
94
+ f"'{subfield_name}'. '{field_name}' dict has the following " \
95
+ f"structure: {self.record_data.get(field_name)}. " \
96
+ f"Exception reason: {e}."
97
+ )
98
+ return data
99
+
100
+ def _get_wikilink_lang(self, wikilink: str) -> str:
101
+ """ Parses the language of the Wikipedia page
102
+ from wikilink.
103
+ """
104
+ pattern = r"(?<=https\W{3})\w+(?=[.])"
105
+ match = re.search(pattern, wikilink)
106
+ wikilink_lang = ""
107
+ if match:
108
+ wikilink_lang = match.group()
109
+ return wikilink_lang
110
+
111
+ def _get_marc_field(self, marc_dict: dict, subfield: str = "a") -> str:
112
+ value = ""
113
+ if marc_dict.get("dtype", "") == "MARC21":
114
+ subfields = marc_dict.get("subfield", [])
115
+ for _subfield in subfields:
116
+ if _subfield.get("code", "") == subfield:
117
+ value = _subfield.get("value", "")
118
+ break
119
+ return value
120
+
121
+ def _get_marc_tag(self, marc_dict: dict) -> str:
122
+ tag = ""
123
+ if marc_dict.get("dtype", "") == "MARC21":
124
+ tag = marc_dict.get("tag", "")
125
+ return tag
126
+
127
+ def _get_names(self, marc_dicts: List[dict]) -> List[str]:
128
+ names_d = defaultdict(int)
129
+ for marc_dict in marc_dicts:
130
+ name = self._get_marc_field(marc_dict, self.subfield_indicator)
131
+ names_d[name]+=1
132
+ name_list = sorted(
133
+ list(names_d.items()),
134
+ key=lambda x: x[1],
135
+ reverse=True
136
+ )
137
+ names = []
138
+ for n in name_list:
139
+ _name = self._strip_punctuation(n[0])
140
+ if _name not in names:
141
+ names.append(_name)
142
+
143
+ return names
144
+
145
+ def _get_name(self, marc_dicts: List[dict]) -> str:
146
+ names = self._get_names(marc_dicts)
147
+ name = ""
148
+ if names:
149
+ name = names[0]
150
+ return name
151
+
152
+ def _strip_punctuation(self, entity: str) -> str:
153
+ entity = entity.strip(",")
154
+ # Strip "." only if the last token is not an initial,
155
+ # e.g: "Meri, Lennart." -> Strip
156
+ # "Meri, L." -> Do not strip.
157
+ ent_tokens = entity.split()
158
+ if len(ent_tokens[-1]) > 2:
159
+ entity = entity.strip(".")
160
+ return entity
161
+
162
+ def _strip_parenthesis(self, entity: str) -> str:
163
+ """ Strip information in parenthesis from VIAF records
164
+ in order to compare the records more easily.
165
+ """
166
+ _entity = re.sub(r"[(][^)][)]", "", entity)
167
+ return _entity.strip()
168
+
169
+ @property
170
+ def subfield_indicator(self) -> str:
171
+ if not self.__subfield_indicator:
172
+ if self.name_type in self.__title_types:
173
+ subfield_name = "t"
174
+ else:
175
+ subfield_name = "a"
176
+ self.__subfield_indicator = subfield_name
177
+ return self.__subfield_indicator
32
178
 
33
179
  @property
34
- def author(self) -> str:
35
- if not self.__author:
36
- self.__author = self.record_data.get(
37
- "mainHeading", {}).get("text", "")
180
+ def name(self) -> str:
181
+ # author -> name
182
+ if not self.__name:
183
+ if self.marc_main:
184
+ self.__name = self._get_name(self.marc_main)
185
+ else:
186
+ names = self.__get_data("mainHeadings", "data")
187
+ if names:
188
+ self.__name = names[0]
189
+ return self.__name
38
190
 
39
191
  @property
40
- def author_type(self) -> str:
41
- """type of name (personal, corporate, title, etc)"""
42
- if not self.__author_type:
43
- self.__author_type = self.record_data.get("nameType")
192
+ def name_type(self) -> str:
193
+ # author_type -> name_type
194
+ """ Type of name (personal, corporate, title, etc)
195
+ """
196
+ if not self.__name_type:
197
+ self.__name_type = self.record_data.get("nameType")
198
+ return self.__name_type
44
199
 
45
200
  @property
46
201
  def viaf_id(self) -> int:
@@ -49,18 +204,17 @@ class VIAFRecord:
49
204
  return self.__viaf_id
50
205
 
51
206
  @property
52
- def has_isni(self) -> bool:
53
- return bool(self.record_data.get("isni", ""))
207
+ def viaf_url(self) -> str:
208
+ if not self.__viaf_url:
209
+ self.__viaf_url = self.record_data.get(
210
+ "Document", {}).get("about", "")
211
+ return self.__viaf_url
54
212
 
55
- def __get_data(self, field_name: str) -> List[str]:
56
- entries = self.record_data.get(field_name, {}).get("data", [])
57
-
58
- data = []
59
- for entry in entries:
60
- sources = entry.get("sources", {}).get("s", [])
61
- if set(self.__allowed_sources).intersection(set(sources)):
62
- data.append(entry.get("text", ""))
63
- return data
213
+ @property
214
+ def has_isni(self) -> bool:
215
+ if self.__has_isni == None:
216
+ self.__has_isni = bool(self.record_data.get("isni", ""))
217
+ return self.__has_isni
64
218
 
65
219
  @property
66
220
  def record_data(self) -> dict:
@@ -75,7 +229,18 @@ class VIAFRecord:
75
229
  @property
76
230
  def name_variations(self) -> List[str]:
77
231
  if not self.__name_variations:
78
- self.__name_variations = self.__get_data("mainHeadings")
232
+ if self.marc_400:
233
+ var_1 = self._get_names(self.marc_400)
234
+ var_2 = self._get_names(self.marc_main)
235
+ _vars = var_1 + var_2
236
+
237
+ else:
238
+ _vars = self.__get_data("mainHeadings")
239
+ vars_3 = [Normalizer.clean_entity(v) for v in _vars]
240
+
241
+ vars = _vars + vars_3
242
+ #print(vars)
243
+ self.__name_variations = list(set(vars))
79
244
  return self.__name_variations
80
245
 
81
246
  @property
@@ -117,14 +282,75 @@ class VIAFRecord:
117
282
  nationalities_dict[n.lower()] += 1
118
283
  if nationalities:
119
284
  self.__nationality = sorted(
120
- nationalities_dict.items(), key=lambda x: x[1], reverse=True)[0][0]
285
+ nationalities_dict.items(),
286
+ key=lambda x: x[1],
287
+ reverse=True
288
+ )[0][0]
121
289
  return self.__nationality
122
290
 
291
+ @property
292
+ def works(self) -> List[str]:
293
+ if not self.__works:
294
+ self.__works = list(set(self.__get_data(
295
+ field_name="titles",
296
+ subfield_name="work"
297
+ )))
298
+ return self.__works
299
+
300
+ @property
301
+ def all_wikilinks(self) -> List[str]:
302
+ if not self.__all_wikilinks:
303
+ self.__all_wikilinks = self.__get_data(
304
+ field_name="xLinks", subfield_name="xLink",
305
+ allowed_sources=["WKP"]
306
+ )
307
+ return self.__all_wikilinks
308
+
309
+ @property
310
+ def wikilinks(self) -> dict:
311
+ if not self.__wikilinks:
312
+ for wikilink in self.all_wikilinks:
313
+ wikilink_lang = self._get_wikilink_lang(wikilink)
314
+ if wikilink_lang and wikilink_lang in ALLOWED_VIAF_WIKILINK_LANGS:
315
+ self.__wikilinks[wikilink_lang] = wikilink
316
+ return self.__wikilinks
317
+
318
+ @property
319
+ def marc_400(self) -> List[dict]:
320
+ if not self.__marc_400:
321
+ self.__marc_400 = self.__get_data(
322
+ field_name="x400s",
323
+ subfield_name="x400"
324
+ )
325
+ return self.__marc_400
326
+
327
+ @property
328
+ def marc_500(self) -> List[dict]:
329
+ if not self.__marc_500:
330
+ self.__marc_500 = self.__get_data(
331
+ field_name="x500s",
332
+ subfield_name="x500"
333
+ )
334
+ return self.__marc_500
335
+
336
+
337
+ @property
338
+ def marc_main(self) -> List[dict]:
339
+ if not self.__marc_main:
340
+ self.__marc_main = self.__get_data(
341
+ field_name="mainHeadings",
342
+ subfield_name="mainHeadingEl"
343
+ )
344
+ return self.__marc_main
345
+
123
346
  @property
124
347
  def all_fields(self) -> dict:
125
348
  if not self.__all_fields:
126
349
  self.__all_fields = {
127
350
  "viaf_id": self.viaf_id,
351
+ "viaf_url": self.viaf_url,
352
+ "name": self.name,
353
+ "name_type": self.name_type,
128
354
  "name_variations": self.name_variations,
129
355
  "birth_date": self.birth_date,
130
356
  "death_date": self.death_date,
@@ -133,31 +359,170 @@ class VIAFRecord:
133
359
  "activity_start": self.activity_start,
134
360
  "activity_end": self.activity_end,
135
361
  "has_isni": self.has_isni,
136
- "author": self.author
362
+ "works": self.works,
363
+ "wikilinks": self.wikilinks,
364
+ "marc_400": self.marc_400,
365
+ "marc_500": self.marc_500,
366
+ "marc_main": self.marc_main
137
367
  }
138
368
  return self.__all_fields
139
369
 
140
370
 
141
371
  class VIAFClient:
142
372
  def __init__(self, viaf_api_url: str = "https://viaf.org/api"):
143
- self.root_url = viaf_api_url.strip("/")
144
- self.record_url = f"{self.root_url}/cluster-record"
145
- self.search_url = f"{self.root_url}/search"
146
- self.headers = {
373
+ self.root_url: str = viaf_api_url.strip("/")
374
+ self.record_url: str = f"{self.root_url}/cluster-record"
375
+ self.search_url: str = f"{self.root_url}/search"
376
+ self.headers: dict = {
147
377
  "Accept": "application/json",
148
378
  "Content-Type": "application/json"
149
379
  }
150
380
 
151
- def _send_request(self, url: str, data: dict) -> dict:
381
+ def check_search_term_query(self) -> bool:
382
+ """ Function for checking, if VIAF search term
383
+ query works as expected.
384
+ """
385
+ test_entity = "Lennart Meri"
386
+ record = self.get_normalized_data_by_search_term(
387
+ search_term=test_entity,
388
+ max_records=1,
389
+ verify=False
390
+ )
391
+ success = True
392
+
393
+ if record:
394
+ if record.name != "Meri, Lennart":
395
+ success = False
396
+ else:
397
+ success = False
398
+ if not success:
399
+ logger.error(f"VIAF search term query has changed or not working!")
400
+ return success
401
+
402
def check_id_query(self) -> bool:
    """ Function for checking, if VIAF ID
    query works as expected.

    Fetches the record with a fixed, known VIAF ID and compares
    the name of the first returned record with the expected
    heading "Meri, Lennart".

    Returns
    -----------
    bool
        True, if a record was returned and its name matched the
        expected heading; False otherwise (an error is logged).
    """
    test_id = "84153775"
    records = self.get_normalized_data_by_ids([test_id])
    # Success requires both a non-empty result and the expected heading.
    success = bool(records) and records[0].name == "Meri, Lennart"
    if not success:
        logger.error("VIAF ID query has changed or not working!")
    return success
419
+
420
+ @staticmethod
421
+ def verify(entity: str, viaf_record: VIAFRecord,
422
+ threshold: float = VIAF_SIMILARITY_THRESHOLD
423
+ ) -> dict:
424
+ """ Verifies, if entity to link is sufficiently
425
+ similar to a VIAF Record based on name forms in
426
+ VIAFRecord.name_variations.
427
+
428
+ Parameters
429
+ ------------
430
+ entity: str
431
+ Entity queried from VIAF.
432
+ viaf_record: VIAFRecord
433
+ A VIAFRecord object.
434
+ threshold: float
435
+ Min similarity threshold for a verified result
436
+ Should be a float between 0 and 1.
437
+
438
+ Returns
439
+ ------------
440
+ dict
441
+ Dict with keys:
442
+ verified: bool
443
+ If the VIAFRecord was verified to be
444
+ sufficiently similar.
445
+ most_similar_record: str
446
+ The most similar string to entity
447
+ in VIAFRecord.name_variations.
448
+ score: float
449
+ Similarity score of the most similar record.
450
+ """
451
+ # might not always be personal name, but shouldn't break anything
452
+ if len(entity.split()) > 1:
453
+ pn = PersonalName(entity)
454
+ name_forms = [pn.last_comma_first, pn.first_last]
455
+ else:
456
+ name_forms = [entity]
457
+ max_similarity = 0
458
+ most_similar_record = ""
459
+ verified = False
460
+ for var in viaf_record.name_variations:
461
+ for name_form in name_forms:
462
+ score = jw(name_form.lower(), var.lower())
463
+ if score > max_similarity:
464
+ max_similarity = score
465
+ most_similar_record = var
466
+ if score >= threshold:
467
+ logger.info(
468
+ f"Verification successful! '{name_form}' sufficiently " \
469
+ f"similar to '{var}'! Score = {score}."
470
+ )
471
+ verified = True
472
+ break
473
+ if verified:
474
+ break
475
+ out = {
476
+ "verified": verified,
477
+ "most_similar_record": most_similar_record,
478
+ "score": max_similarity
479
+ }
480
+ return out
481
+
482
+ @staticmethod
483
+ def get_verified_record(search_term: str, viaf_records: List[VIAFRecord],
484
+ threshold: float = VIAF_SIMILARITY_THRESHOLD
485
+ ) -> VIAFRecord:
486
+ """ Takes in n VIAFRecords found while searching the term `search_term`.
487
+ Returns the most similar VIAFRecord.
488
+ """
489
+ verified_record = None
490
+ max_score = 0
491
+ most_similar_record = ""
492
+ for record in viaf_records:
493
+ verified = VIAFClient.verify(search_term, record, threshold)
494
+ if verified.get("score") > max_score:
495
+ most_similar_record = verified.get("most_similar_record")
496
+ max_score = verified.get("score")
497
+ if verified.get("verified"):
498
+ verified_record = record
499
+ break
500
+ if not verified_record:
501
+ logger.error(
502
+ f"Verification failed. No matched record surpassed the set similarity " \
503
+ f"threshold ({threshold}). Closest match for search term '{search_term}' was " \
504
+ f"'{most_similar_record}' with similarity score {max_score} "
505
+ )
506
+ return verified_record
507
+
508
+ def _send_request(self, url: str, data: dict) -> Response:
152
509
  return requests.post(url, data=json.dumps(data), headers=self.headers)
153
510
 
154
511
  def get_records_by_search_term(self,
155
- search_term: str,
156
- index: str = "viaf",
157
- field: str = "local.names",
158
- page_index: int = 0,
159
- page_size: int = 50
160
- ) -> dict:
512
+ search_term: str,
513
+ index: str = "VIAF",
514
+ field: str = DEFAULT_VIAF_FIELD,
515
+ page_index: int = 0,
516
+ page_size: int = 50
517
+ ) -> Response:
518
+ """ Query VIAF records by search term.
519
+ """
520
+ if field and field not in ALLOWED_VIAF_FIELDS:
521
+ logger.error(
522
+ f"Field '{field}' is not allowed. Defaulting to '{DEFAULT_VIAF_FIELD}'. " \
523
+ f"Allowed VIAF fields are: {ALLOWED_VIAF_FIELDS}. "
524
+ )
525
+ field = DEFAULT_VIAF_FIELD
161
526
  data = {
162
527
  "reqValues": {
163
528
  "field": field,
@@ -173,18 +538,58 @@ class VIAFClient:
173
538
  response = self._send_request(url=self.search_url, data=data)
174
539
  return response
175
540
 
176
- def get_records_by_viaf_id(self, record_id: str) -> dict:
541
+ def get_records_by_viaf_id(self, record_id: str) -> Response:
542
+ """ Query VIAF records by ID.
543
+ """
177
544
  data = {
178
545
  "reqValues": {
179
546
  "recordId": str(record_id)
180
547
  }
181
548
  }
182
549
  response = self._send_request(url=self.record_url, data=data)
183
-
184
550
  return response
185
551
 
186
- def fetch_viaf_clusters(self, viaf_ids):
552
+ def extract_viaf_ids(self, search_query_response: Response) -> List[str]:
553
+ """ Parse VIAF ID-s from search query response.
554
+ """
555
+ try:
556
+ records = search_query_response.json()["queryResult"]["records"]["record"]
557
+ except Exception as e:
558
+ logger.error(
559
+ f"Parsing records from search query " \
560
+ f"response failed with error: {e}."
561
+ )
562
+ records = []
563
+ viaf_ids = []
564
+ for record in records:
565
+ try:
566
+ viaf_id = record["recordData"]["VIAFCluster"]["viafID"]
567
+ viaf_ids.append(viaf_id)
568
+ except Exception as e:
569
+ logger.error(
570
+ f"Extracing VIAF ID from record '{record}' " \
571
+ f"failed with error: {e}"
572
+ )
573
+ return viaf_ids
574
+
575
+ def get_viaf_ids_by_search_terms(self,
576
+ search_term: str, field: str = DEFAULT_VIAF_FIELD,
577
+ viaf_index: str = "VIAF", page_size: int = 50
578
+ ) -> List[str]:
579
+ """ Get all matching VIAF IDs for a search term.
580
+ """
187
581
 
582
+ search_response = self.get_records_by_search_term(
583
+ search_term=search_term,
584
+ field=field,
585
+ index=viaf_index,
586
+ page_size=page_size
587
+ )
588
+ viaf_ids = self.extract_viaf_ids(search_response)
589
+ return viaf_ids
590
+
591
+
592
+ def fetch_viaf_clusters(self, viaf_ids: List[str]) -> Dict[str, dict]:
188
593
  results = {}
189
594
 
190
595
  for viaf_id in viaf_ids:
@@ -198,7 +603,51 @@ class VIAFClient:
198
603
 
199
604
  return results
200
605
 
201
- def get_normalized_data(self, record_ids: List[str]) -> List[VIAFRecord]:
606
+ def get_normalized_data_by_ids(self, record_ids: List[str]) -> List[VIAFRecord]:
202
607
  """ Fetch data required for normalization from VIAF. """
203
608
  response = self.fetch_viaf_clusters(record_ids)
204
- return [VIAFRecord(response[record_id]) for record_id in record_ids]
609
+ viaf_records = [
610
+ VIAFRecord(response[record_id])
611
+ for record_id in record_ids
612
+ ]
613
+ return viaf_records
614
+
615
def get_normalized_data_by_search_term(self,
        search_term: str, field: str = DEFAULT_VIAF_FIELD, max_records: int = 10,
        verify: bool = True, threshold: float = VIAF_SIMILARITY_THRESHOLD,
        viaf_index: str = "VIAF"
    ) -> VIAFRecord | None:
    """ Fetch data required for normalization from VIAF.

    Parameters
    -----------
    search_term: str
        Term to search from VIAF.
    field: str
        VIAF field to search from.
    max_records: int
        Used as the search page size and as the max number of
        found records to fetch for verification.
    verify: bool
        If enabled, the fetched records are passed to
        `VIAFClient.get_verified_record` and its result is returned.
        Otherwise the record of the first found VIAF ID is returned.
    threshold: float
        Similarity threshold passed on to the verification.
    viaf_index: str
        VIAF index to query.

    Returns
    -----------
    VIAFRecord | None
        The selected VIAFRecord or None, if no VIAF ID was found
        (or, with verification enabled, no record was verified).
    """
    viaf_ids = self.get_viaf_ids_by_search_terms(
        search_term=search_term,
        field=field,
        page_size=max_records,
        viaf_index=viaf_index
    )
    if verify:
        records = self.get_normalized_data_by_ids(viaf_ids[:max_records])
        return VIAFClient.get_verified_record(
            search_term=search_term,
            viaf_records=records,
            threshold=threshold
        )
    # Verification disabled: take the first hit. Return None explicitly
    # when the search found nothing, so the result is always bound.
    if not viaf_ids:
        return None
    records = self.get_normalized_data_by_ids(viaf_ids[:1])
    return records[0] if records else None
640
+
641
+
642
+
643
+ if __name__ == "__main__":
644
+ from pprint import pprint
645
+ vc = VIAFClient()
646
+ entity="Kevade"
647
+ record = vc.get_normalized_data_by_search_term(entity, field="local.uniformTitleWorks", max_records=5, verify=True)
648
+ #pprint(record.record_data)
649
+ if record:
650
+ pprint(record.all_fields)
651
+ #pprint(record.record_data)
652
+ else:
653
+ print(f"Couldn't detect a verified record for entity '{entity}' :(.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -18,6 +18,7 @@ Requires-Dist: nltk
18
18
  Requires-Dist: jsonlines
19
19
  Requires-Dist: requests
20
20
  Requires-Dist: iso639-lang
21
+ Requires-Dist: jellyfish
21
22
  Requires-Dist: pymarc
22
23
  Requires-Dist: regex
23
24
  Requires-Dist: glom
@@ -8,18 +8,18 @@ rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3
8
8
  rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
9
9
  rara_tools/constants/__init__.py,sha256=r78laM9vyRDAvzDhPvzDlhaX6qPwUUBBtwf1WosrW3o,27
10
10
  rara_tools/constants/digitizer.py,sha256=MND0dUQySBAOVWzuUBxQGZWv_Ckdz2jCp25F2_oHGi8,496
11
- rara_tools/constants/general.py,sha256=aVUQTMss89atAkTDZKJXNdnsBHPX-RSrlBOtt-wdPGU,195
11
+ rara_tools/constants/general.py,sha256=i-OrySdsf05HzKWEI5CvWs3ZNsBZpZ5fhWVlU3m2QeY,251
12
12
  rara_tools/constants/language_evaluator.py,sha256=XtGAgspO2wGV4C2WhPN8zaxHkZ3d5FLgZ1PCvgZY9u0,37
13
13
  rara_tools/constants/linker.py,sha256=XUI-fD1LfvpdMDeLmMU3siAsc0pleQ92m6Cdk3_OGmo,169
14
14
  rara_tools/constants/meta_extractor.py,sha256=mhuRX4_I2JTnJO_d8tldClmuPx-RwmWWNLavZAJBgVU,33
15
- rara_tools/constants/normalizers.py,sha256=GmWY89kYfX7_YJ8sdy1vb8ABJc_ABdw_zVVOxd9UZgY,171
15
+ rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
16
16
  rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
17
17
  rara_tools/constants/subject_indexer.py,sha256=RBbUuhJM8M3GQ1p2GwDAeW5go7zkI5yiuMoL-3V2-NQ,34
18
18
  rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
19
19
  rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
20
- rara_tools/normalizers/base.py,sha256=taOboGURQF_ACPVWHX_wMsaDEo8gYdAkiOw0yT0zzR8,10910
20
+ rara_tools/normalizers/base.py,sha256=gsKG8NEOah_lwzY9kgCf68943xYoIIo6pPWZQuFHEuk,11818
21
21
  rara_tools/normalizers/bibs.py,sha256=4DTS6k37z8qR5B3n7aiCXsT5Z49rLTvQ60lKKr5dyLs,2352
22
- rara_tools/normalizers/viaf.py,sha256=9uTyEadSaoFedUbUfY_iWPJtgrt04jP71i_6MLPM08I,6919
22
+ rara_tools/normalizers/viaf.py,sha256=XWpf_GONBGg8nsjGHoF4Vgk4S1xY3TcTIIwsTCNEyAQ,22298
23
23
  rara_tools/parsers/marc_parsers/base_parser.py,sha256=wzCccZaiN4p2iUms3PAOfXihNgEeg1cGRzRx26ytJeA,1661
24
24
  rara_tools/parsers/marc_parsers/ems_parser.py,sha256=LFuhZcVwmHMcJknX9p4ZkO8RdjPdQZ4APGbw8KV6BIs,2024
25
25
  rara_tools/parsers/marc_parsers/location_parser.py,sha256=dSU9dQoGV5z0ajhLI1bn3AAghkOr79qKIrX7sO0_4lA,1873
@@ -34,8 +34,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=NyrubWvouZEb46vaoy9NHLCzn
34
34
  rara_tools/parsers/tools/entity_normalizers.py,sha256=afOMqJoL4aeq0cfsohIuxkxzvqNdZ_ba7U32eyogbzk,8722
35
35
  rara_tools/parsers/tools/marc_converter.py,sha256=PUbggzJ_wHfke_bHTF2LOZyzX1t0wRM8qIFL36Dl3AI,414
36
36
  rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
37
- rara_tools-0.5.1.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
38
- rara_tools-0.5.1.dist-info/METADATA,sha256=vyta87EhsR-MEbYL-0jSedPQ6rL8gTR0O2p-uSXQA-g,4054
39
- rara_tools-0.5.1.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
40
- rara_tools-0.5.1.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
41
- rara_tools-0.5.1.dist-info/RECORD,,
37
+ rara_tools-0.5.3.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
38
+ rara_tools-0.5.3.dist-info/METADATA,sha256=JoiwsfHX-dE2ZK3lo2gqaQaMFjr1bWL02Hmxa4y0J8E,4079
39
+ rara_tools-0.5.3.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
40
+ rara_tools-0.5.3.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
41
+ rara_tools-0.5.3.dist-info/RECORD,,