rara-tools 0.7.15__py3-none-any.whl → 0.7.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -3,6 +3,8 @@ from pymarc import (Field, Subfield, JSONReader, Record)
3
3
  from typing import List, Optional, Iterator
4
4
  from rara_tools.normalizers.reader import SafeJSONReader
5
5
 
6
+ from rara_tools.parsers.tools.validators import filter_names
7
+
6
8
  from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
7
9
  from rara_tools.constants.normalizers import (
8
10
  DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
@@ -51,10 +53,23 @@ class RecordNormalizer:
51
53
  If one linked entity found, we create an updated record from the linked entity data.
52
54
  """
53
55
  linked_records = []
56
+
57
def handle_create_new_record(entity, idx, reason="No usable linked record found"):
    """ Queue an empty placeholder record for an entity that cannot be
    mapped onto exactly one existing linked record.

    This helper is shared by several branches of the linking loop (no linked
    entities, multiple linked entities, a single linked entity that carries
    no record JSON), so the log message must not claim one specific cause.

    Parameters
    ------------
    entity:
        The original entity the linker was asked to resolve.
    idx: int
        Position of the entity in the linking results; used to pick the
        matching entry from `classified_fields`.
    reason: str
        Short explanation written to the log. Callers may pass a more
        specific cause; the default is accurate for every branch.

    """
    logger.info(f"{reason} for {entity}. Creating new record.")

    # Empty record skeleton: only the default leader, no fields yet.
    linked_records.append({
        "leader": self.DEFAULT_LEADER,
        "fields": []
    })

    # Keep the extra data aligned one-to-one with linked_records.
    # "edited": False marks this entry as a newly created record.
    self.records_extra_data.append({
        "entity": entity,
        "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
        "edited": False,
    })
54
68
 
55
69
  for idx, linked in enumerate(linking_results or []):
56
70
 
57
71
  if not isinstance(linked, dict):
72
+ logger.debug(f"Skipping invalid linked result: {linked}")
58
73
  continue
59
74
 
60
75
  entity = linked.get("original_entity")
@@ -62,46 +77,32 @@ class RecordNormalizer:
62
77
 
63
78
  if not isinstance(linked_info, list) or not linked_info:
64
79
  # No linked entities found, create new record
65
- logger.info(
66
- f"No linked entities found for {entity}, Creating new record.")
67
- linked_records.append({
68
- "leader": self.DEFAULT_LEADER,
69
- "fields": []
70
- })
71
- self.records_extra_data.append({
72
- "entity": entity,
73
- "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
74
- "edited": False
75
- })
80
+ handle_create_new_record(entity, idx)
76
81
  continue
77
82
 
78
83
  elif len(linked_info) > 1:
79
84
  # Multiple linked entities found, create new record
80
- logger.info(
81
- f"Multiple linked entities found for {entity}. Creating new record.")
82
- linked_records.append({
83
- "leader": self.DEFAULT_LEADER,
84
- "fields": []
85
- })
86
- self.records_extra_data.append({
87
- "entity": entity,
88
- "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
89
- "edited": False
90
- })
85
+ handle_create_new_record(entity, idx)
91
86
  continue
92
87
 
93
88
  elif len(linked_info) == 1:
89
+ # one record match found, we update existing record
90
+
94
91
  linked_item = linked_info[0]
95
92
  if not isinstance(linked_item, dict):
96
93
  continue
97
94
 
95
+ # handle case where we have linked an entity without a record
96
+ if not linked_item.get("json", None):
97
+ handle_create_new_record(entity, idx)
98
+ continue
99
+
98
100
  linked_records.append(linked_item.get("json", {}))
99
101
 
100
102
  self.records_extra_data.append({
101
103
  "entity": entity,
102
104
  "viaf": linked_item.get("viaf", {}),
103
105
  "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
104
- "type": "linked",
105
106
  "edited": True
106
107
  })
107
108
  continue
@@ -109,7 +110,6 @@ class RecordNormalizer:
109
110
  self.records_extra_data.extend(
110
111
  {
111
112
  "sierraID": obj.get("sierraID"),
112
- "type": "sierra",
113
113
  "edited": True
114
114
  }
115
115
  for obj in (sierra_data or [])
@@ -313,25 +313,34 @@ class RecordNormalizer:
313
313
  if viaf_record:
314
314
  self._include_name_variations(record, viaf_record)
315
315
 
316
- def _include_name_variations(self, record: Record, viaf_record: VIAFRecord) -> None:
316
+ def _include_name_variations(self, record: Record, viaf_record: VIAFRecord, filter_variations=True) -> None:
317
317
  """ Include name variations from VIAF record as 400|t fields """
318
318
 
319
319
  if not viaf_record or not viaf_record.name_variations:
320
320
  return
321
321
 
322
322
  existing_name_variations = record.get_fields("400")
323
- existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("t")]
323
+ existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("a")]
324
+
325
+ if filter_variations:
326
+ allowed_variations = filter_names(viaf_record.name_variations)
327
+ logger.debug(
328
+ f"filtered out {len(viaf_record.name_variations) - len(allowed_variations)} name variations for '{viaf_record.name}'"
329
+ )
330
+
331
+ else:
332
+ allowed_variations = viaf_record.name_variations
324
333
 
325
334
  fields = []
326
335
 
327
- for variation in viaf_record.name_variations:
336
+ for variation in allowed_variations:
328
337
  if variation not in existing_variations:
329
338
  fields.append(
330
339
  Field(
331
340
  tag="400",
332
341
  indicators=EMPTY_INDICATORS,
333
342
  subfields=[
334
- Subfield("t", variation)
343
+ Subfield("a", variation)
335
344
  ]
336
345
  )
337
346
  )
@@ -465,6 +474,8 @@ class RecordNormalizer:
465
474
  verify=verify,
466
475
  threshold=threshold
467
476
  )
477
+ if viaf_record:
478
+ logger.debug(f"VIAF {search_term}, linked to ID: {viaf_record.viaf_id}")
468
479
 
469
480
  except Exception as e:
470
481
  logger.error(
@@ -473,7 +484,8 @@ class RecordNormalizer:
473
484
  return viaf_record
474
485
 
475
486
  def _normalize_record(self, record: Record, sierraID: str,
476
- viaf_record: VIAFRecord, is_editing_existing_record: bool, original_entity: str) -> Record:
487
+ viaf_record: VIAFRecord, is_editing_existing_record: bool,
488
+ original_entity: str) -> Record:
477
489
  return record
478
490
 
479
491
  def get_record(self, index: int) -> Record:
@@ -500,19 +512,26 @@ class RecordNormalizer:
500
512
  return next(iter(self))
501
513
 
502
514
  def __iter__(self) -> Iterator:
503
- viaf_id_path = "viaf.queryResult.records.record.0.recordData.VIAFCluster.viafID"
515
+ # viaf_id_path = "viaf.original.queryResult.viafID"
516
+ viaf_id_path = "viaf.parsed.viaf_id"
517
+
504
518
  sierra_id_path = "sierraID"
505
-
519
+
506
520
  for record, extra_data in zip(self.records, self.records_extra_data):
507
521
 
508
522
  sierra_id = glom(extra_data, sierra_id_path, default="")
509
523
  viaf_id = glom(extra_data, viaf_id_path, default=None)
510
- classified_fields = extra_data.get("classified_fields", [])
511
524
 
525
+ classified_fields = extra_data.get("classified_fields", [])
512
526
  entity = extra_data.get("entity")
513
527
  is_editing_existing_record = extra_data.get("edited") == True
514
528
 
515
529
  viaf_record = self._get_viaf_record(record, viaf_id, entity)
530
+ if viaf_record:
531
+ logger.debug(
532
+ f"linked VIAF record with ID {viaf_record.viaf_id} for entity '{entity}'"
533
+ )
534
+
516
535
  record = self._normalize_common(record, is_editing_existing_record, classified_fields)
517
536
 
518
537
  normalized_record = self._normalize_record(
@@ -73,26 +73,13 @@ class BibRecordNormalizer(RecordNormalizer):
73
73
 
74
74
 
75
75
  def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord, original_entity: str) -> None:
76
-
77
76
  if not viaf_record:
78
77
  # viaf record not found, include original entity as 100|t
79
78
  self._add_author(record, viaf_record=None, original_entity=original_entity)
80
79
  return record
81
-
82
- viaf_id = viaf_record.viaf_id
83
- fields = [
84
- Field(
85
- tag="035",
86
- indicators=EMPTY_INDICATORS,
87
- subfields=[
88
- Subfield("a", viaf_id)
89
- ]
90
- )
91
- ]
92
-
93
- self._add_fields_to_record(record, fields)
80
+
94
81
  self._add_author(record, viaf_record, original_entity=original_entity)
95
-
82
+
96
83
  def _normalize_record(self, record: Record, sierraID: str,
97
84
  viaf_record: VIAFRecord, is_editing_existing_record: bool, original_entity: str) -> Record:
98
85
 
@@ -0,0 +1,54 @@
1
+ import regex as re
2
+ from typing import List
3
+
4
def has_valid_chars(entity: str, allow_cyrillic: bool = True) -> bool:
    """ Checks if entity contains at least one valid character: a latin
    letter (ASCII or Estonian ü, õ, ö, ä) or, if enabled, a cyrillic letter.

    Parameters
    ------------
    entity: str
        String to validate. Non-string values (e.g. None) are treated
        as invalid instead of raising a TypeError.
    allow_cyrillic: bool
        Allow strings in cyrillic?

    Returns
    ------------
    bool
        Boolean value indicating whether the string
        contains any valid characters.

    """
    # A non-string cannot contain valid characters; returning False keeps a
    # single bad value from aborting a whole filtering pass.
    if not isinstance(entity, str):
        return False

    # Check for latin characters first
    if re.search(r"[a-züõöäA-ZÜÕÖÄ]", entity):
        return True

    if not allow_cyrillic:
        return False

    # "ё" / "Ё" lie outside the contiguous а-я / А-Я code point ranges,
    # so they have to be listed explicitly.
    return bool(re.search(r"[а-яА-ЯёЁ]", entity))
31
+
32
+
33
def filter_names(names: List[str], allow_cyrillic: bool = True) -> List[str]:
    """ Keeps only the names accepted by `has_valid_chars`.

    Parameters
    ------------
    names: List[str]
        Names to filter.
    allow_cyrillic: bool
        Forwarded to `has_valid_chars`: allow strings in cyrillic?

    Returns
    ------------
    List[str]
        The accepted names, in their original order.

    """
    def is_allowed(candidate: str) -> bool:
        # Same keyword call as the validator expects; one name at a time.
        return has_valid_chars(entity=candidate, allow_cyrillic=allow_cyrillic)

    return list(filter(is_allowed, names))
54
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.15
3
+ Version: 0.7.17
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -21,8 +21,8 @@ rara_tools/core_formatters/formatted_meta.py,sha256=WEnMs8K0YeTLGjXn_mxQTpshxcz5
21
21
  rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
22
22
  rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
23
23
  rara_tools/normalizers/authorities.py,sha256=iW3cYOqqVJKy4CcnG9_T6dN-1bBT1e-0jtLYvco-MyQ,5311
24
- rara_tools/normalizers/base.py,sha256=LzjQ6HZEdsnEbQzhTRJU23f16nch7ZgM6efXEY45zNY,20190
25
- rara_tools/normalizers/bibs.py,sha256=s8NGoieCjiftASUb--1YvYZ0VzW6uBt2ZidhLi_wP9A,3938
24
+ rara_tools/normalizers/base.py,sha256=tw64ZK7KXg9O2IPMxICMogYHAG6il10qQqCd4fIjQL0,20941
25
+ rara_tools/normalizers/bibs.py,sha256=5pOw8RsQ4eDwbREbYySeI_b7dQyGlJnfMRSS-tWGJ9c,3632
26
26
  rara_tools/normalizers/reader.py,sha256=GYCkAtnsNx135w5lD-_MqCZzdHQHHPDF-pDxYj839Vo,1595
27
27
  rara_tools/normalizers/viaf.py,sha256=C-NfbvL83ZcHVB9ICMw43wAMYKTqDTHU3ZT2mXKec00,24288
28
28
  rara_tools/parsers/marc_parsers/base_parser.py,sha256=Kdw4aivJf2FkWgIK7pJtHtVXF_G1pjHVQ7IcFItSqy8,1649
@@ -39,8 +39,9 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
39
39
  rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
40
40
  rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
41
41
  rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
42
- rara_tools-0.7.15.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
43
- rara_tools-0.7.15.dist-info/METADATA,sha256=_McWtiEQK0TGptlidWNxJ26zgdWB_kNk00DSrIAhtB8,4080
44
- rara_tools-0.7.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
- rara_tools-0.7.15.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
46
- rara_tools-0.7.15.dist-info/RECORD,,
42
+ rara_tools/parsers/tools/validators.py,sha256=JTGbfAWcLldlZrX0nb343P9RJ8QwSh3455fYap3UxxY,1335
43
+ rara_tools-0.7.17.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
44
+ rara_tools-0.7.17.dist-info/METADATA,sha256=of0OwoIpSaah24TRi1bIU78dL-rfQOIxcxSdZbRL5IU,4080
45
+ rara_tools-0.7.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
46
+ rara_tools-0.7.17.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
47
+ rara_tools-0.7.17.dist-info/RECORD,,