PyPI - rara-tools - Versions diffs - 0.7.0__tar.gz → 0.7.2__tar.gz - Mend

rara-tools 0.7.0.tar.gz → 0.7.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (67) [hide / show]

{rara_tools-0.7.0/rara_tools.egg-info → rara_tools-0.7.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.7.0
+Version: 0.7.2
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

rara_tools-0.7.2/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.7.2

{rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/digitizer.py RENAMED Viewed

@@ -20,7 +20,9 @@ class Queue:
     DOWNLOAD = "download"
     FINISH = "finish"
     OCR = "ocr"
+    UTILITY = "digitizer-utility"
 class Tasks:
     START_DIGITIZER_PIPELINE = "start_digitizer_pipeline"
+    PURGE_MODELS = "purge_unused_digitizer_models"

{rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/general.py RENAMED Viewed

@@ -20,6 +20,7 @@ class Tasks:
     UPDATE_TASK_VALUES = "update_task_values"
     MODEL_UPDATE = "component_model_update"
     RUN_POST_TASK_COMPLETION_TASKS = "run_post_task_completion_tasks"
+    PURGE_MODELS = "purge_unused_models"
 class Models:

{rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/constants/subject_indexer.py RENAMED Viewed

@@ -6,10 +6,12 @@ COMPONENT_KEY = "subject_indexer"
 class Tasks:
     SINGLE = "run_subject_indexer_process"
     PIPELINE = "run_subject_indexer_with_core_logic"
+    PURGE_MODELS = "purge_unused_subjectindexer_models"
 class Queue:
     MAIN = "subject-indexer"
+    UTILITY = "subjectindexer-utility"
 class StatusKeys:
@@ -21,6 +23,7 @@ class URLSource:
     SIERRA = "Sierra"
     EMS = "EMS"
 class KeywordType:
     LOC = "Kohamärksõnad"
     TIME = "Ajamärksõnad"
@@ -45,6 +48,7 @@ class KeywordMARC:
     EVENT = 611
     TITLE = 630
 class KeywordSource:
     EMS = "EMS"
     SIERRA = "SIERRA"

{rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools/normalizers/base.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from datetime import datetime
 from pymarc import (Field, Subfield, JSONReader, Record)
 from typing import List, Optional, Iterator
+from rara_tools.normalizers.reader import SafeJSONReader
 from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
 from rara_tools.constants.normalizers import (
@@ -35,7 +36,6 @@ class RecordNormalizer:
         self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
         # include, if should be added alongside existing fields
         self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
         self.records_extra_data = []
         self.records = self._setup_records(linking_results, sierra_data)
         self.sierra_data = sierra_data
@@ -92,8 +92,56 @@ class RecordNormalizer:
         all_records = linked_records + (sierra_data or [])
-        return JSONReader(json.dumps(all_records,
-                                     ensure_ascii=False), stream=False)
+        return SafeJSONReader(
+            json.dumps(all_records, ensure_ascii=False),
+        )
+    def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
+        """Setup initial MARC records and data.
+        For linked entities:
+            1. Try to get single linked normalized record from KATA elastic. If more than one found, skip.
+            2. If 0 matches, search from VIAF and if 1 result found, create a new authority record from the data.
+            3. If none or more than one responses found, use only Classificator data (coming from Linker?).
+        """
+        linked_records = []
+        for linked in linking_results or []:
+            if not isinstance(linked, dict):
+                continue
+            entity = linked.get("original_entity")
+            linked_info = linked.get("linked_info", [])
+            if not isinstance(linked_info, list) or not linked_info:
+                continue
+            if len(linked_info) == 1:
+                linked_item = linked_info[0]
+                if not isinstance(linked_item, dict):
+                    continue
+                linked_records.append(linked_item.get("json", {}))
+                self.records_extra_data.append({
+                    "entity": entity,
+                    "viaf": linked_item.get("viaf", {}),
+                    "type": "linked",
+                    "edited": True
+                })
+        self.records_extra_data.extend(
+            {
+                "sierraID": obj.get("sierraID"),
+                "type": "sierra",
+                "edited": True
+            }
+            for obj in (sierra_data or [])
+            if isinstance(obj, dict)
+        )
+        all_records = linked_records + (sierra_data or [])
+        return SafeJSONReader(json.dumps(all_records, ensure_ascii=False))
     @staticmethod
     def current_timestamp():
@@ -230,11 +278,15 @@ class RecordNormalizer:
     def _get_viaf_search_term(self, record: Record, entity: Optional[str]) -> Optional[str]:
         """ prioritize entity name, if not available, use author name. """
         if entity:
             return entity
-        else:
-            return record.name
+        author_field = record.get("100") or record.get("110") or record.get("111")
+        if author_field:
+            return author_field.get_subfields("a")[0] if author_field.get_subfields("a") else None
+        logger.warning(
+            "No entity or author name found for VIAF search. Skipping VIAF enrichment.")
     def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None,
             entity: Optional[str] = None, viaf_field: str = DEFAULT_VIAF_FIELD,
@@ -252,21 +304,26 @@ class RecordNormalizer:
                     viaf_record = viaf_records[0]
             else:
                 search_term = self._get_viaf_search_term(record, entity)
-                if not verify:
-                    logger.warning(
-                        f"Record verification is turned off. If multiple records are " \
-                        f"detected for search term '{search_term}', the first " \
-                        f"result is automatically returned. This might lead to " \
-                        f"some inaccuracies!"
-                    )
-                viaf_record = viaf_client.get_normalized_data_by_search_term(
-                    search_term=search_term,
-                    field=viaf_field,
-                    max_records=max_records,
-                    verify=verify,
-                    threshold=threshold
-                )
+                if search_term:
+                    logger.info(
+                        f"Searching for VIAF record with search term: {search_term}")
+                    if not verify:
+                        logger.warning(
+                            f"Record verification is turned off. If multiple records are " \
+                            f"detected for search term '{search_term}', the first " \
+                            f"result is automatically returned. This might lead to " \
+                            f"some inaccuracies!"
+                        )
+                    viaf_record = viaf_client.get_normalized_data_by_search_term(
+                        search_term=search_term,
+                        field=viaf_field,
+                        max_records=max_records,
+                        verify=verify,
+                        threshold=threshold
+                    )
         except Exception as e:
             logger.error(

rara_tools-0.7.2/rara_tools/normalizers/reader.py ADDED Viewed

@@ -0,0 +1,45 @@
+from pymarc import Record, Field, Subfield, Leader, JSONReader
+import logging
+logger = logging.getLogger(__name__)
+DEFAULT_LEADER = '01682nz a2200349n 4500'
+class SafeJSONReader(JSONReader):
+    def __next__(self):
+        while True:
+            try:
+                jobj = next(self.iter)
+                rec = Record()
+                # Use custom default leader if missing
+                leader_str = jobj.get("leader")
+                if leader_str:
+                    rec.leader = Leader(leader_str)
+                else:
+                    logger.warning("Missing leader in record. Using DEFAULT_LEADER.")
+                    rec.leader = Leader(DEFAULT_LEADER)
+                for field in jobj["fields"]:
+                    k, v = list(field.items())[0]
+                    if isinstance(v, dict) and "subfields" in v:
+                        subfields = []
+                        for sub in v["subfields"]:
+                            for code, value in sub.items():
+                                subfields.append(Subfield(code, value))
+                        ind1 = v.get("ind1", " ")
+                        ind2 = v.get("ind2", " ")
+                        fld = Field(tag=k, indicators=[ind1, ind2], subfields=subfields)
+                    else:
+                        fld = Field(tag=k, data=v)
+                    rec.add_field(fld)
+                return rec
+            except StopIteration:
+                raise
+            except Exception as e:
+                logger.error(f"Skipping invalid record: {e}")
+                continue

{rara_tools-0.7.0 → rara_tools-0.7.2/rara_tools.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.7.0
+Version: 0.7.2
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

{rara_tools-0.7.0 → rara_tools-0.7.2}/rara_tools.egg-info/SOURCES.txt RENAMED Viewed

@@ -33,6 +33,7 @@ rara_tools/normalizers/__init__.py
 rara_tools/normalizers/authorities.py
 rara_tools/normalizers/base.py
 rara_tools/normalizers/bibs.py
+rara_tools/normalizers/reader.py
 rara_tools/normalizers/viaf.py
 rara_tools/parsers/marc_parsers/base_parser.py
 rara_tools/parsers/marc_parsers/ems_parser.py

{rara_tools-0.7.0 → rara_tools-0.7.2}/tests/test_normalization.py RENAMED Viewed

@@ -1,3 +1,4 @@
+from rara_tools.constants import linker
 from rara_tools.normalizers import BibRecordNormalizer, AuthoritiesRecordNormalizer
 from tests.test_utils import (get_linker_res_example, get_formatted_sierra_response,
                               check_record_tags_sorted, check_no_dupe_tag_values, check_record_tags_have_values)
@@ -281,7 +282,8 @@ def test_authority_normrecord_not_found_in_es_and_viaf():
     linking_results = [linker_res]
     normalizer = AuthoritiesRecordNormalizer(
-        linking_results=linking_results)
+        linking_results=linking_results
+    )
     data = normalizer.data
@@ -302,6 +304,31 @@ def test_authority_normrecord_not_found_in_es_and_viaf():
     # should create new normalized record in the future, none for now
     assert len(data) == 0
+def _run_normalizer(linked_data):
+    normalizer = AuthoritiesRecordNormalizer(
+        linking_results=linked_data
+    )
+    return normalizer.data
+def test_normalizer_handles_bad_inputs():
+    linker_res = get_linker_res_example(
+        "oneFound.json")
+    # pop the leader field to simulate record without leader
+    linker_res["linked_info"][0]["json"].pop("leader", None)
+    _run_normalizer([linker_res])
+    # make fields empty to simulate a record with no fields
+    linker_res["linked_info"][0]["json"]["fields"] = []
+    _run_normalizer([linker_res])
+    # pop the fields to simulate a record with no fields
+    linker_res["linked_info"][0]["json"].pop("fields", None)
+    _run_normalizer([linker_res])
+    inputs = ["", None, [], {}, 123]
+    _run_normalizer(inputs)
 def test_matching_sierra_record_viaf_id_found():
     """normkirjelt leitakse VIAF ID, vajadusel normi asukoht, kus see ID sisaldub."""