rara-tools 0.7.15__py3-none-any.whl → 0.7.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of rara-tools might be problematic.
- rara_tools/normalizers/base.py +51 -32
- rara_tools/normalizers/bibs.py +2 -15
- rara_tools/parsers/tools/validators.py +54 -0
- {rara_tools-0.7.15.dist-info → rara_tools-0.7.17.dist-info}/METADATA +1 -1
- {rara_tools-0.7.15.dist-info → rara_tools-0.7.17.dist-info}/RECORD +8 -7
- {rara_tools-0.7.15.dist-info → rara_tools-0.7.17.dist-info}/WHEEL +0 -0
- {rara_tools-0.7.15.dist-info → rara_tools-0.7.17.dist-info}/licenses/LICENSE.md +0 -0
- {rara_tools-0.7.15.dist-info → rara_tools-0.7.17.dist-info}/top_level.txt +0 -0
rara_tools/normalizers/base.py
CHANGED
@@ -3,6 +3,8 @@ from pymarc import (Field, Subfield, JSONReader, Record)
 from typing import List, Optional, Iterator
 from rara_tools.normalizers.reader import SafeJSONReader
 
+from rara_tools.parsers.tools.validators import filter_names
+
 from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
 from rara_tools.constants.normalizers import (
     DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
@@ -51,10 +53,23 @@ class RecordNormalizer:
         If one linked entity found, we create an updated record from the linked entity data.
         """
         linked_records = []
+
+        def handle_create_new_record(entity, idx):
+            logger.info(f"No linked entities found for {entity}, Creating new record.")
+            linked_records.append({
+                "leader": self.DEFAULT_LEADER,
+                "fields": []
+            })
+            self.records_extra_data.append({
+                "entity": entity,
+                "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
+                "edited": False,
+            })
 
         for idx, linked in enumerate(linking_results or []):
 
             if not isinstance(linked, dict):
+                logger.debug(f"Skipping invalid linked result: {linked}")
                 continue
 
             entity = linked.get("original_entity")
@@ -62,46 +77,32 @@ class RecordNormalizer:
 
             if not isinstance(linked_info, list) or not linked_info:
                 # No linked entities found, create new record
-                logger.info(
-                    f"No linked entities found for {entity}, Creating new record.")
-                linked_records.append({
-                    "leader": self.DEFAULT_LEADER,
-                    "fields": []
-                })
-                self.records_extra_data.append({
-                    "entity": entity,
-                    "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
-                    "edited": False
-                })
+                handle_create_new_record(entity, idx)
                 continue
 
             elif len(linked_info) > 1:
                 # Multiple linked entities found, create new record
-                logger.info(
-                    f"Multiple linked entities found for {entity}. Creating new record.")
-                linked_records.append({
-                    "leader": self.DEFAULT_LEADER,
-                    "fields": []
-                })
-                self.records_extra_data.append({
-                    "entity": entity,
-                    "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
-                    "edited": False
-                })
+                handle_create_new_record(entity, idx)
                 continue
 
             elif len(linked_info) == 1:
+                # one record match found, we update existing record
+
                 linked_item = linked_info[0]
                 if not isinstance(linked_item, dict):
                     continue
 
+                # handle case where we have linked an entity without a record
+                if not linked_item.get("json", None):
+                    handle_create_new_record(entity, idx)
+                    continue
+
                 linked_records.append(linked_item.get("json", {}))
 
                 self.records_extra_data.append({
                     "entity": entity,
                     "viaf": linked_item.get("viaf", {}),
                     "classified_fields": classified_fields[idx] if idx < len(classified_fields) else [],
-                    "type": "linked",
                     "edited": True
                 })
                 continue
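The duplicated "create new record" branches above are now funneled through the local handle_create_new_record helper. A minimal sketch of what one such call appends, with an invented entity name and a placeholder leader string (the real value comes from self.DEFAULT_LEADER):

```python
# Sketch only: DEFAULT_LEADER and the entity name below are invented placeholders.
DEFAULT_LEADER = "00000nz  a2200000n  4500"

linked_records = [{"leader": DEFAULT_LEADER, "fields": []}]   # empty stub record
records_extra_data = [{
    "entity": "Tammsaare, A. H.",   # the entity that could not be linked
    "classified_fields": [],
    "edited": False,                # marks the record as newly created, not edited
}]
```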
@@ -109,7 +110,6 @@ class RecordNormalizer:
         self.records_extra_data.extend(
             {
                 "sierraID": obj.get("sierraID"),
-                "type": "sierra",
                 "edited": True
             }
             for obj in (sierra_data or [])
@@ -313,25 +313,34 @@ class RecordNormalizer:
         if viaf_record:
             self._include_name_variations(record, viaf_record)
 
-    def _include_name_variations(self, record: Record, viaf_record: VIAFRecord) -> None:
+    def _include_name_variations(self, record: Record, viaf_record: VIAFRecord, filter_variations=True) -> None:
         """ Include name variations from VIAF record as 400|t fields """
 
         if not viaf_record or not viaf_record.name_variations:
             return
 
         existing_name_variations = record.get_fields("400")
-        existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("
+        existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("a")]
+
+        if filter_variations:
+            allowed_variations = filter_names(viaf_record.name_variations)
+            logger.debug(
+                f"filtered out {len(viaf_record.name_variations) - len(allowed_variations)} name variations for '{viaf_record.name}'"
+            )
+
+        else:
+            allowed_variations = viaf_record.name_variations
 
         fields = []
 
-        for variation in viaf_record.name_variations:
+        for variation in allowed_variations:
             if variation not in existing_variations:
                 fields.append(
                     Field(
                         tag="400",
                         indicators=EMPTY_INDICATORS,
                         subfields=[
-                            Subfield("
+                            Subfield("a", variation)
                         ]
                     )
                 )
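To see the effect of the new filter_variations flag, here is a minimal standalone sketch of the loop above, using plain lists instead of a VIAFRecord and a pymarc Record, and assuming EMPTY_INDICATORS is simply a pair of blank indicators (the sample names are invented):

```python
from pymarc import Field, Subfield

from rara_tools.parsers.tools.validators import filter_names

EMPTY_INDICATORS = [" ", " "]  # assumption; the real constant is defined in rara_tools

name_variations = ["Tammsaare, A. H.", "Таммсааре, А.", "夏目漱石"]  # invented sample data
existing_variations = ["Tammsaare, A. H."]

# filter_names() drops "夏目漱石": it contains neither latin nor cyrillic characters
allowed_variations = filter_names(name_variations)

fields = [
    Field(
        tag="400",
        indicators=EMPTY_INDICATORS,
        subfields=[Subfield("a", variation)],
    )
    for variation in allowed_variations
    if variation not in existing_variations
]
# fields now holds a single 400 field with $a "Таммсааре, А."
```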
@@ -465,6 +474,8 @@ class RecordNormalizer:
                 verify=verify,
                 threshold=threshold
             )
+            if viaf_record:
+                logger.debug(f"VIAF {search_term}, linked to ID: {viaf_record.viaf_id}")
 
         except Exception as e:
             logger.error(
@@ -473,7 +484,8 @@ class RecordNormalizer:
         return viaf_record
 
     def _normalize_record(self, record: Record, sierraID: str,
-                          viaf_record: VIAFRecord, is_editing_existing_record: bool,
+                          viaf_record: VIAFRecord, is_editing_existing_record: bool,
+                          original_entity: str) -> Record:
         return record
 
     def get_record(self, index: int) -> Record:
@@ -500,19 +512,26 @@ class RecordNormalizer:
         return next(iter(self))
 
     def __iter__(self) -> Iterator:
-        viaf_id_path = "viaf.queryResult.
+        # viaf_id_path = "viaf.original.queryResult.viafID"
+        viaf_id_path = "viaf.parsed.viaf_id"
+
         sierra_id_path = "sierraID"
-
+
        for record, extra_data in zip(self.records, self.records_extra_data):
 
             sierra_id = glom(extra_data, sierra_id_path, default="")
             viaf_id = glom(extra_data, viaf_id_path, default=None)
-            classified_fields = extra_data.get("classified_fields", [])
 
+            classified_fields = extra_data.get("classified_fields", [])
             entity = extra_data.get("entity")
             is_editing_existing_record = extra_data.get("edited") == True
 
             viaf_record = self._get_viaf_record(record, viaf_id, entity)
+            if viaf_record:
+                logger.debug(
+                    f"linked VIAF record with ID {viaf_record.viaf_id} for entity '{entity}'"
+                )
+
             record = self._normalize_common(record, is_editing_existing_record, classified_fields)
 
             normalized_record = self._normalize_record(
rara_tools/normalizers/bibs.py
CHANGED
@@ -73,26 +73,13 @@ class BibRecordNormalizer(RecordNormalizer):
 
 
     def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord, original_entity: str) -> None:
-
         if not viaf_record:
             # viaf record not found, include original entity as 100|t
             self._add_author(record, viaf_record=None, original_entity=original_entity)
             return record
-
-        viaf_id = viaf_record.viaf_id
-        fields = [
-            Field(
-                tag="035",
-                indicators=EMPTY_INDICATORS,
-                subfields=[
-                    Subfield("a", viaf_id)
-                ]
-            )
-        ]
-
-        self._add_fields_to_record(record, fields)
+
         self._add_author(record, viaf_record, original_entity=original_entity)
-
+
     def _normalize_record(self, record: Record, sierraID: str,
                           viaf_record: VIAFRecord, is_editing_existing_record: bool, original_entity: str) -> Record:
 
rara_tools/parsers/tools/validators.py
ADDED
@@ -0,0 +1,54 @@
+import regex as re
+from typing import List
+
+def has_valid_chars(entity: str, allow_cyrillic: bool = True) -> bool:
+    """ Checks if entity contains any valid characters in latin
+    or in cyrillic, if the latter is enabled
+
+    Parameters
+    ------------
+    entity: str
+        String to validate.
+    allow_cyrillic: bool
+        Allow strings in cyrillic?
+
+    Returns
+    ------------
+    bool
+        Boolean value indicating, if the string
+        contains any valid characters.
+
+    """
+    # Check for latin characters
+    is_valid = bool(re.search(r"[a-züõöäA-ZÜÕÖÄ]", entity))
+
+    if allow_cyrillic and not is_valid:
+        # If cyrillic characters are allowed,
+        # check for them as well
+        is_valid = bool(re.search(r"[а-яА-Я]", entity))
+
+    return is_valid
+
+
+def filter_names(names: List[str], allow_cyrillic: bool = True) -> List[str]:
+    """ Filters out names not in allowed encodings (latin / cyrillic).
+
+    Parameters
+    ------------
+    names: List[str]
+        Names to filters.
+    allow_cyrillic: bool
+        Allow strings in cyrillic?
+
+    Returns
+    ------------
+    List[str]
+        List of filtered names.
+
+    """
+    filtered_names = [
+        name for name in names
+        if has_valid_chars(entity=name, allow_cyrillic=allow_cyrillic)
+    ]
+    return filtered_names
+
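For reference, a quick usage sketch of the new module (the sample names are invented):

```python
from rara_tools.parsers.tools.validators import filter_names, has_valid_chars

names = ["Tammsaare, A. H.", "Толстой, Лев", "夏目漱石"]

filter_names(names)                           # ['Tammsaare, A. H.', 'Толстой, Лев']
filter_names(names, allow_cyrillic=False)     # ['Tammsaare, A. H.']
has_valid_chars("1923", allow_cyrillic=True)  # False: digits only, no letters matched
```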
{rara_tools-0.7.15.dist-info → rara_tools-0.7.17.dist-info}/RECORD
CHANGED
@@ -21,8 +21,8 @@ rara_tools/core_formatters/formatted_meta.py,sha256=WEnMs8K0YeTLGjXn_mxQTpshxcz5
 rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
 rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
 rara_tools/normalizers/authorities.py,sha256=iW3cYOqqVJKy4CcnG9_T6dN-1bBT1e-0jtLYvco-MyQ,5311
-rara_tools/normalizers/base.py,sha256=
-rara_tools/normalizers/bibs.py,sha256=
+rara_tools/normalizers/base.py,sha256=tw64ZK7KXg9O2IPMxICMogYHAG6il10qQqCd4fIjQL0,20941
+rara_tools/normalizers/bibs.py,sha256=5pOw8RsQ4eDwbREbYySeI_b7dQyGlJnfMRSS-tWGJ9c,3632
 rara_tools/normalizers/reader.py,sha256=GYCkAtnsNx135w5lD-_MqCZzdHQHHPDF-pDxYj839Vo,1595
 rara_tools/normalizers/viaf.py,sha256=C-NfbvL83ZcHVB9ICMw43wAMYKTqDTHU3ZT2mXKec00,24288
 rara_tools/parsers/marc_parsers/base_parser.py,sha256=Kdw4aivJf2FkWgIK7pJtHtVXF_G1pjHVQ7IcFItSqy8,1649
@@ -39,8 +39,9 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
 rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
 rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
 rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
-rara_tools
-rara_tools-0.7.
-rara_tools-0.7.
-rara_tools-0.7.
-rara_tools-0.7.
+rara_tools/parsers/tools/validators.py,sha256=JTGbfAWcLldlZrX0nb343P9RJ8QwSh3455fYap3UxxY,1335
+rara_tools-0.7.17.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
+rara_tools-0.7.17.dist-info/METADATA,sha256=of0OwoIpSaah24TRi1bIU78dL-rfQOIxcxSdZbRL5IU,4080
+rara_tools-0.7.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+rara_tools-0.7.17.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
+rara_tools-0.7.17.dist-info/RECORD,,
{rara_tools-0.7.15.dist-info → rara_tools-0.7.17.dist-info}/WHEEL
File without changes
{rara_tools-0.7.15.dist-info → rara_tools-0.7.17.dist-info}/licenses/LICENSE.md
File without changes
{rara_tools-0.7.15.dist-info → rara_tools-0.7.17.dist-info}/top_level.txt
File without changes