PyPI - rara-tools - Versions diffs - 0.5.3__tar.gz → 0.6.1__tar.gz - Mend

rara-tools 0.5.3__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (63) hide show

{rara_tools-0.5.3/rara_tools.egg-info → rara_tools-0.6.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.5.3
+Version: 0.6.1
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

rara_tools-0.6.1/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.6.1

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/digitizer.py RENAMED Viewed

@@ -22,4 +22,4 @@ class Queue:
 class Tasks:
-    MODEL_UPDATE = "component_model_update"
+    START_DIGITIZER_PIPELINE = "start_digitizer_pipeline"

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/general.py RENAMED Viewed

@@ -11,5 +11,8 @@ class Queue:
     CORE = "core"
-class Task:
+class Tasks:
     SEND_VERSION = "send_version_to_core"
+    UPDATE_TASK_STATUS = "update_task_status"
+    UPDATE_TASK_VALUES = "update_task_values"
+    MODEL_UPDATE = "component_model_update"

rara_tools-0.6.1/rara_tools/constants/language_evaluator.py ADDED Viewed

@@ -0,0 +1,9 @@
+COMPONENT_KEY = "language_evaluator"
+class Tasks:
+    EVALUATE = "text_evaluator"
+class Queue:
+    EVALUATE = "text_evaluator"

rara_tools-0.6.1/rara_tools/constants/subject_indexer.py ADDED Viewed

@@ -0,0 +1,9 @@
+COMPONENT_KEY = "subject_indexer"
+class Tasks:
+    PIPELINE = "run_subject_indexer_pipeline"
+class Queue:
+    MAIN = "subject-indexer"

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/normalizers/base.py RENAMED Viewed

@@ -2,11 +2,11 @@ from datetime import datetime
 from pymarc import (Field, Subfield, JSONReader, Record)
 from typing import List, Optional, Iterator
-from rara_tools.constants import EMPTY_INDICATORS
 from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
 from rara_tools.constants.normalizers import (
     DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
-    VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY
+    VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY,
+    EMPTY_INDICATORS
 )
 from glom import glom
 import logging

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/normalizers/viaf.py RENAMED Viewed

@@ -8,7 +8,7 @@ from requests.models import Response
 from rara_tools.parsers.tools.entity_normalizers import PersonalName, Normalizer
 from rara_tools.constants.normalizers import (
     DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
-    VIAF_SIMILARITY_THRESHOLD
+    VIAF_SIMILARITY_THRESHOLD, VIAF_ALLOWED_SOURCES
 )
 import logging
@@ -20,9 +20,7 @@ class VIAFRecord:
     """
     def __init__(self,
             record: dict,
-            allowed_sources: List[str] = [
-                "LC", "DNB", "LNB", "NLL", "ERRR", "J9U"
-            ]
+            allowed_sources: List[str] = VIAF_ALLOWED_SOURCES
     ):
         """ Initializes VIAFRecord class.
@@ -108,14 +106,36 @@ class VIAFRecord:
             wikilink_lang = match.group()
         return wikilink_lang
-    def _get_marc_field(self, marc_dict: dict, subfield: str = "a") -> str:
+    def _get_marc_field(self, marc_dict: dict, subfield: str = "a",
+                        strict_subfield: bool = True
+    ) -> str:
+        """ Retrieve value from a MARC dict
+        Parameters
+        -----------
+        marc_dict: dict
+            MARC dictionary.
+        subfield: str
+            Subfield to extract
+        strict_subfield: bool
+            If set to True, data is extracted ONLY from
+            the subfield set with param `subfield`. If set to False,
+            data can be extracted from other subfields as well, as long as
+            there is only one subfield in the dict. This might be necessary
+            for uniformTitleWorks as sometimes the title is present in
+            subfield (t) while subfield (a) contains the author. However,
+            there are instances where the title is present in subfield (a)
+            with no author.
+        """
         value = ""
         if marc_dict.get("dtype", "") == "MARC21":
             subfields = marc_dict.get("subfield", [])
             for _subfield in subfields:
-                if _subfield.get("code", "") == subfield:
+                if len(subfields) > 1 and _subfield.get("code", "") == subfield:
                     value = _subfield.get("value", "")
                     break
+                elif len(subfields) == 1 and not strict_subfield:
+                    value = _subfield.get("value", "")
         return value
     def _get_marc_tag(self, marc_dict: dict) -> str:
@@ -127,7 +147,11 @@ class VIAFRecord:
     def _get_names(self, marc_dicts: List[dict]) -> List[str]:
         names_d = defaultdict(int)
         for marc_dict in marc_dicts:
-            name = self._get_marc_field(marc_dict, self.subfield_indicator)
+            name = self._get_marc_field(
+                marc_dict=marc_dict,
+                subfield=self.subfield_indicator,
+                strict_subfield=False
+            )
             names_d[name]+=1
         name_list = sorted(
             list(names_d.items()),
@@ -154,8 +178,8 @@ class VIAFRecord:
         # Strip "." only if the last token is not an initial,
         # e.g: "Meri, Lennart." -> Strip
         # "Meri, L." -> Do not strip.
-        ent_tokens = entity.split()
-        if len(ent_tokens[-1]) > 2:
+        ent_tokens = [t.strip() for t in entity.split() if t.strip()]
+        if ent_tokens and len(ent_tokens[-1]) > 2:
             entity = entity.strip(".")
         return entity
@@ -166,6 +190,14 @@ class VIAFRecord:
         _entity = re.sub(r"[(][^)][)]", "", entity)
         return _entity.strip()
+    @property
+    def record(self) -> dict:
+        return self.__record
+    @property
+    def record_data(self) -> dict:
+        return self.__record_data
     @property
     def subfield_indicator(self) -> str:
         if not self.__subfield_indicator:
@@ -239,7 +271,6 @@ class VIAFRecord:
             vars_3 = [Normalizer.clean_entity(v) for v in _vars]
             vars = _vars + vars_3
-            #print(vars)
             self.__name_variations = list(set(vars))
         return self.__name_variations
@@ -369,7 +400,10 @@ class VIAFRecord:
 class VIAFClient:
-    def __init__(self, viaf_api_url: str = "https://viaf.org/api"):
+    def __init__(self,
+            viaf_api_url: str = "https://viaf.org/api",
+            allowed_viaf_sources: List[str] = VIAF_ALLOWED_SOURCES
+        ):
         self.root_url: str = viaf_api_url.strip("/")
         self.record_url: str = f"{self.root_url}/cluster-record"
         self.search_url: str = f"{self.root_url}/search"
@@ -377,6 +411,7 @@ class VIAFClient:
             "Accept": "application/json",
             "Content-Type": "application/json"
         }
+        self.allowed_viaf_sources: List[str] = allowed_viaf_sources
     def check_search_term_query(self) -> bool:
         """ Function for checking, if VIAF search term
@@ -448,6 +483,9 @@ class VIAFClient:
                 score: float
                     Similarity score of the most similar record.
         """
+        logger.debug(
+            f"Verifying if '{viaf_record.name}' is sufficiently similar to '{entity}'."
+        )
         # might not always be personal name, but shouldn't break anything
         if len(entity.split()) > 1:
             pn = PersonalName(entity)
@@ -464,7 +502,7 @@ class VIAFClient:
                     max_similarity = score
                     most_similar_record = var
                 if score >= threshold:
-                    logger.info(
+                    logger.debug(
                         f"Verification successful! '{name_form}' sufficiently " \
                         f"similar to '{var}'! Score = {score}."
                     )
@@ -486,6 +524,10 @@ class VIAFClient:
         """ Takes in n VIAFRecords found while searching the term `search_term`.
         Returns the most similar VIAFRecord.
         """
+        logger.debug(
+            f"Retrieving a single verified record from VIAF search results. " \
+            f"search term = '{search_term}'."
+        )
         verified_record = None
         max_score = 0
         most_similar_record = ""
@@ -517,6 +559,7 @@ class VIAFClient:
      ) -> Response:
         """ Query VIAF records by search term.
         """
+        logger.debug(f"Retriecing VIAF records for search term '{search_term}'.")
         if field and field not in ALLOWED_VIAF_FIELDS:
             logger.error(
                 f"Field '{field}' is not allowed. Defaulting to '{DEFAULT_VIAF_FIELD}'. " \
@@ -541,6 +584,7 @@ class VIAFClient:
     def get_records_by_viaf_id(self, record_id: str) -> Response:
         """ Query VIAF records by ID.
         """
+        logger.debug(f"Retrieving VIAF records for ID {record_id}.")
         data = {
             "reqValues": {
                 "recordId": str(record_id)
@@ -552,6 +596,7 @@ class VIAFClient:
     def extract_viaf_ids(self, search_query_response: Response) -> List[str]:
         """ Parse VIAF ID-s from search query response.
         """
+        logger.debug("Extracting VIAF IDs from VIAF search query results.")
         try:
             records = search_query_response.json()["queryResult"]["records"]["record"]
         except Exception as e:
@@ -591,7 +636,6 @@ class VIAFClient:
     def fetch_viaf_clusters(self, viaf_ids: List[str]) -> Dict[str, dict]:
         results = {}
         for viaf_id in viaf_ids:
             try:
                 response = self.get_records_by_viaf_id(viaf_id)
@@ -605,9 +649,13 @@ class VIAFClient:
     def get_normalized_data_by_ids(self, record_ids: List[str]) -> List[VIAFRecord]:
         """ Fetch data required for normalization from VIAF. """
+        logger.debug(f"Fetching VIAFRecords for the following IDs: {record_ids}.")
         response = self.fetch_viaf_clusters(record_ids)
         viaf_records = [
-            VIAFRecord(response[record_id])
+            VIAFRecord(
+                record=response[record_id],
+                allowed_sources=self.allowed_viaf_sources
+            )
             for record_id in record_ids
         ]
         return viaf_records
@@ -618,6 +666,11 @@ class VIAFClient:
         viaf_index: str = "VIAF"
     ) -> VIAFRecord | None:
         """ Fetch data required for normalization from VIAF. """
+        logger.debug(
+            f"Finding VIAFRecords with search term '{search_term}' " \
+            f"using VIAF field='{field}', verify={verify}, threshold={threshold}. " \
+            f"Allowed VIAF sources are: {self.allowed_viaf_sources}."
+        )
         viaf_record = None
         viaf_ids = self.get_viaf_ids_by_search_terms(
             search_term=search_term,
@@ -637,17 +690,3 @@ class VIAFClient:
                 records = self.get_normalized_data_by_ids(viaf_ids[:1])
                 verified_record = records[0] if records else None
         return verified_record
-if __name__ == "__main__":
-    from pprint import pprint
-    vc = VIAFClient()
-    entity="Kevade"
-    record = vc.get_normalized_data_by_search_term(entity,  field="local.uniformTitleWorks", max_records=5, verify=True)
-    #pprint(record.record_data)
-    if record:
-        pprint(record.all_fields)
-        #pprint(record.record_data)
-    else:
-        print(f"Couldn't detect a verified record for entity '{entity}' :(.")

{rara_tools-0.5.3 → rara_tools-0.6.1/rara_tools.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.5.3
+Version: 0.6.1
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_viaf_client.py RENAMED Viewed

@@ -69,3 +69,20 @@ def test_subfield_based_main_field_extraction():
         verify=True
     )
     assert record.name == "Luts, Oskar"
+def test_changing_allowed_sources():
+    client = VIAFClient(allowed_viaf_sources=["PLWABN"])
+    record = client.get_normalized_data_by_search_term(
+        search_term="Anora",
+        field="local.uniformTitleWorks",
+        verify=False
+    )
+    assert record.name == "Anora (film)"
+    client = VIAFClient(allowed_viaf_sources=["LC"])
+    record = client.get_normalized_data_by_search_term(
+        search_term="Anora",
+        field="local.uniformTitleWorks",
+        verify=False
+    )
+    assert record.name == "Anora (Motion picture)"

rara_tools-0.5.3/VERSION DELETED Viewed

	@@ -1 +0,0 @@
1	- 0.5.3

rara_tools-0.5.3/rara_tools/constants/language_evaluator.py DELETED Viewed

	@@ -1 +0,0 @@
1	- COMPONENT_KEY = "language_evaluator"

rara_tools-0.5.3/rara_tools/constants/subject_indexer.py DELETED Viewed

	@@ -1 +0,0 @@
1	- COMPONENT_KEY = "subject_indexer"

{rara_tools-0.5.3 → rara_tools-0.6.1}/LICENSE.md RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/README.md RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/pyproject.toml RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/__init__.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/linker.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/meta_extractor.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/normalizers.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/parsers.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/converters.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/decorators.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/digar_schema_converter.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/elastic.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/exceptions.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/normalizers/__init__.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/normalizers/authorities.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/normalizers/bibs.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/base_parser.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/ems_parser.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/location_parser.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/organization_parser.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/person_parser.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/title_parser.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_records/base_record.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_records/ems_record.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_records/organization_record.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_records/person_record.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_records/title_record.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/tools/entity_normalizers.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/tools/marc_converter.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/tools/russian_transliterator.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/s3.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/task_reporter.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/utils.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools.egg-info/requires.txt RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools.egg-info/top_level.txt RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/requirements.txt RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/setup.cfg RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_digar_schema_converter.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_elastic.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_elastic_vector_and_search_operations.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_entity_normalizers.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_marc_parsers.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_normalization.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_s3_exceptions.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_s3_file_operations.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_sierra_converters.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_task_reporter.py RENAMED Viewed

File without changes

{rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_utils.py RENAMED Viewed

File without changes

rara-tools 0.5.3__tar.gz → 0.6.1__tar.gz

Potentially problematic release.

rara-tools 0.5.3__tar.gz → 0.6.1__tar.gz