PyPI - rara-tools - Versions diffs - 0.0.13__tar.gz → 0.2.0__tar.gz - Mend

rara-tools 0.0.13 (tar.gz) → 0.2.0 (tar.gz)

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (37) hide show

{rara_tools-0.0.13/rara_tools.egg-info → rara_tools-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.0.13
+Version: 0.2.0
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
@@ -13,11 +13,15 @@ License-File: LICENSE.md
 Requires-Dist: elasticsearch==8.*
 Requires-Dist: elasticsearch_dsl==8.*
 Requires-Dist: minio==7.*
+Requires-Dist: rara-norm-linker==1.*
 Requires-Dist: requests
 Requires-Dist: iso639-lang
+Requires-Dist: pymarc
+Requires-Dist: glom
 Provides-Extra: testing
 Requires-Dist: pytest>=8.0; extra == "testing"
 Requires-Dist: pytest-order; extra == "testing"
+Dynamic: license-file
 # RaRa Tools

rara_tools-0.2.0/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.2.0

rara_tools-0.2.0/rara_tools/constants/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .normalizers import *

{rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/constants/digitizer.py RENAMED Viewed

@@ -1,4 +1,12 @@
+COMPONENT_KEY = "digitizer"
+class ModelTypes:
+    IMAGE_PROCESSOR = "image_processor"
 class StatusKeys:
+    DOWNLOAD_MODELS = "digitizer_download_models"
     CLEAN_UP = "digitizer_clean_up"
     ELASTICSEARCH_UPLOAD = "digitizer_elasticsearch_upload"
     UPLOAD = "s3_upload"
@@ -11,3 +19,7 @@ class Queue:
     DOWNLOAD = "download"
     FINISH = "finish"
     OCR = "ocr"
+class Tasks:
+    MODEL_UPDATE = "component_model_update"

{rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/constants/general.py RENAMED Viewed

@@ -1,4 +1,5 @@
 class Status:
+    SKIPPED = "SKIPPED"
     FAILED = "FAILED"
     PENDING = "PENDING"
     RUNNING = "RUNNING"

rara_tools-0.2.0/rara_tools/constants/normalizers.py ADDED Viewed

@@ -0,0 +1,17 @@
+from pymarc import Indicators
+import os
+EMPTY_INDICATORS = Indicators(" ", " ")
+VIAF_ALLOWED_SOURCES = ["LC", "DNB", "LNB", "NLL",
+                        "ERRR", "J9U"]
+ES_HOST = os.getenv("ELASTIC_TEST_URL", "http://localhost:9200")
+LINKER_CONFIG = {
+    "add_viaf_info": True,
+    "vectorizer_data_path": "./vectorizer_data",
+    "per_config": {"es_host": ES_HOST},
+    "org_config": {"es_host": ES_HOST},
+    "loc_config": {"es_host": ES_HOST},
+    "ems_config": {"es_host": ES_HOST},
+}

{rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/converters.py RENAMED Viewed

@@ -1,19 +1,22 @@
-from .exceptions import SierraResponseConverterException
+from rara_tools.exceptions import SierraResponseConverterException
 class SierraResponseConverter:
     """Converts a JSON response from the Sierra API to MARC-in-JSON format."""
     def __init__(self, response: dict):
         if not isinstance(response, dict):
-            raise SierraResponseConverterException("Please provide a valid JSON response.")
+            raise SierraResponseConverterException(
+                "Please provide a valid JSON response.")
         self.response = response
-    def _map_control_fields(self, field: dict) -> dict:
-        # for tags < 010, no subfields, instead one str value in "value"
+    @staticmethod
+    def _map_control_fields(field: dict) -> dict:
+        # for tags < 010, no subfields, instead one str value in "value"
         return {field["tag"]: field["value"]}
-    def _map_data_fields(self, field: dict) -> dict:
+    @staticmethod
+    def _map_data_fields(field: dict) -> dict:
         """ Maps marc fields > 010.
         Args:
@@ -22,60 +25,66 @@ class SierraResponseConverter:
         Returns:
             dict: standardised marc-in-json format.
         """
         data = field["data"]
         # Order matters ind1, in2, subfields
         field_data = {
             "ind1": data.get("ind1", " "),
             "ind2": data.get("ind2", " "),
             "subfields": data.get("subfields", [])
         }
         return {field["tag"]: field_data}
-    def _is_marc21structured(self, field: dict) -> bool:
+    @staticmethod
+    def _is_marc21structured(field: dict) -> bool:
         """Checks if the field is already structured according to MARC21 in JSON"""
         return any(key.isdigit() for key in field.keys())
     def _handle_field_type(self, field: dict) -> dict:
         if self._is_marc21structured(field):
             return field
         if field.get("data"):
             return self._map_data_fields(field)
         tag = field.get("tag")
         if not tag:
-            raise SierraResponseConverterException("Field is missing MARC21 tag.")
+            raise SierraResponseConverterException(
+                "Field is missing MARC21 tag.")
         if tag < "010":
             return self._map_control_fields(field)
         else:
             return self._map_data_fields(field)
     def _convert_response(self) -> list:
         entries = self.response.get("entries")
         if not entries:
-            raise SierraResponseConverterException("No entries found in the response.")
+            raise SierraResponseConverterException(
+                "No entries found in the response.")
         try:
-            return {"fields": [
-                {e["id"]: [
-                    self._handle_field_type(f) for f in e["marc"]["fields"]
+            return [
+                {
+                    "sierraID": str(e["id"]),
+                    "leader": e["marc"]["leader"],
+                    "fields": [
+                        self._handle_field_type(f) for f in e["marc"]["fields"]
                     ]}
                 for e in entries
-            ]}
+            ]
         except KeyError as e:
-            raise SierraResponseConverterException(f"Malformed response: missing key {e}")
+            raise SierraResponseConverterException(
+                f"Malformed response: missing key {e}")
     def convert(self) -> list:
         try:
             return self._convert_response()
         except Exception as e:
-            raise SierraResponseConverterException(f"An unexpected error occurred: {e}")
+            raise SierraResponseConverterException(
+                f"An unexpected error occurred: {e}")

{rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/exceptions.py RENAMED Viewed

@@ -7,6 +7,10 @@ class S3InitException(Exception):
 class S3ConnectionException(Exception):
     """Raised S3 Bucket/Connection Error."""
+class S3DownloadException(Exception):
+    """Raised S3 Download Error."""
 class ElasticsearchException(Exception):
     """Raised Elasticsearch Error."""

{rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/s3.py RENAMED Viewed

@@ -1,11 +1,20 @@
+import logging
 import os
+import pathlib
+import time
 import uuid
 from typing import Any, Generator, List, Optional
-from minio import Minio
+from minio import Minio, S3Error
-from .exceptions import (S3ConnectionException, S3InitException,
-                         S3InputException)
+from .exceptions import (
+    S3ConnectionException,
+    S3InitException,
+    S3InputException,
+    S3DownloadException
+)
+logger = logging.getLogger("tools.s3")
 class S3Files:
@@ -76,9 +85,49 @@ class S3Files:
         list_of_objects = list(self.minio_client.list_objects(self.bucket, prefix=path, recursive=True))
         for minio_object in list_of_objects:
             full_path = os.path.join(download_dir, minio_object.object_name)
-            self.minio_client.fget_object(self.bucket, minio_object.object_name, full_path)
+            self._download_file(minio_object.object_name, full_path)
             yield full_path
+    def _download_file(self, path, download_dir=".", max_retries=3) -> str:
+        """Download a single file with retry and resume support."""
+        attempts = 0
+        while attempts < max_retries:
+            try:
+                stat = self.minio_client.stat_object(self.bucket, path)
+                file_size = stat.size
+                temp_path = download_dir + ".part"
+                pathlib.Path(temp_path).parent.mkdir(parents=True, exist_ok=True)
+                # Check if a partial file exists
+                downloaded_size = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
+                if downloaded_size >= file_size:
+                    os.rename(temp_path, download_dir)  # Rename to final filename
+                    logger.info(f"Completed: {path}")
+                    return str(pathlib.Path(download_dir) / path)
+                logger.info(f"Downloading {path} ({downloaded_size}/{file_size} bytes)...")
+                # Open file in append mode to resume download
+                with open(temp_path, "ab") as f:
+                    response = self.minio_client.get_object(self.bucket, path, offset=downloaded_size)
+                    for data in response.stream(32 * 1024):  # 32KB chunks
+                        f.write(data)
+                    response.close()
+                    response.release_conn()
+                os.rename(temp_path, download_dir)  # Rename temp to final
+                logger.info(f"Downloaded: {path}")
+                return str(pathlib.Path(download_dir) / path)
+            except S3Error as e:
+                logger.info(f"Error downloading {path}, attempt {attempts + 1}: {e}")
+                attempts += 1
+                time.sleep(2 ** attempts)  # Exponential backoff
+        raise S3DownloadException(f"Failed to download {path} after {max_retries} attempts.")
     def upload(self, path: str, prefix: Optional[str] = "") -> str:
         """Uploads file or folder to S3 bucket.
         :param: path str: Path to the file to upload in local file system.

{rara_tools-0.0.13 → rara_tools-0.2.0/rara_tools.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.0.13
+Version: 0.2.0
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
@@ -13,11 +13,15 @@ License-File: LICENSE.md
 Requires-Dist: elasticsearch==8.*
 Requires-Dist: elasticsearch_dsl==8.*
 Requires-Dist: minio==7.*
+Requires-Dist: rara-norm-linker==1.*
 Requires-Dist: requests
 Requires-Dist: iso639-lang
+Requires-Dist: pymarc
+Requires-Dist: glom
 Provides-Extra: testing
 Requires-Dist: pytest>=8.0; extra == "testing"
 Requires-Dist: pytest-order; extra == "testing"
+Dynamic: license-file
 # RaRa Tools

{rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools.egg-info/SOURCES.txt RENAMED Viewed

@@ -19,10 +19,14 @@ rara_tools.egg-info/top_level.txt
 rara_tools/constants/__init__.py
 rara_tools/constants/digitizer.py
 rara_tools/constants/general.py
-tests/test_converters.py
+rara_tools/constants/normalizers.py
 tests/test_digar_schema_converter.py
 tests/test_elastic.py
 tests/test_elastic_vector_and_search_operations.py
+tests/test_normalization.py
 tests/test_s3_exceptions.py
 tests/test_s3_file_operations.py
-tests/test_task_reporter.py
+tests/test_sierra_converters.py
+tests/test_task_reporter.py
+tests/test_utils.py
+tests/test_viaf_client.py

{rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools.egg-info/requires.txt RENAMED Viewed

@@ -1,8 +1,11 @@
 elasticsearch==8.*
 elasticsearch_dsl==8.*
 minio==7.*
+rara-norm-linker==1.*
 requests
 iso639-lang
+pymarc
+glom
 [testing]
 pytest>=8.0

{rara_tools-0.0.13 → rara_tools-0.2.0}/requirements.txt RENAMED Viewed

@@ -1,5 +1,8 @@
 elasticsearch==8.*
 elasticsearch_dsl==8.*
 minio==7.*
+rara-norm-linker==1.*
 requests
 iso639-lang
+pymarc
+glom

rara_tools-0.2.0/tests/test_normalization.py ADDED Viewed

@@ -0,0 +1,315 @@
+from rara_tools.normalizers import BibRecordNormalizer, AuthoritiesRecordNormalizer
+from tests.test_utils import (get_formatted_sierra_response,
+                              check_record_tags_sorted, check_no_dupe_tag_values, check_record_tags_have_values)
+from pymarc import Record
+import pytest
+import os
+TEST_LEVEL = os.getenv("TEST_LEVEL", "unit")
+EMPTY_SIERRA_RECORDS = [
+    {
+        "sierraID": "1",
+        "leader": "00000nz  a2200000n  4500",
+        "fields": []
+    },
+]
+REQUIRED_FIELDS = ["667", "925"]  # always included after normalization
+REASON = "Skipped because TEST_LEVEL is set to 'ci'"
+@pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
+def test_normalizers_OK():
+    """ Test field editing logic & internals"""
+    entities = [
+        "Paul Keres",  # will find multiple entities
+        "Anton Hansen Tammsaare",
+        "GIBBBERRISH",
+    ]
+    test_sierra_data = get_formatted_sierra_response("authorities.json")
+    normalizer = AuthoritiesRecordNormalizer(
+        entities=entities,
+        sierra_data=test_sierra_data,
+    )
+    assert len(normalizer.records_extra_data) == len(normalizer.data)
+    normalizer = BibRecordNormalizer(
+        entities=entities,
+        sierra_data=test_sierra_data,
+    )
+    assert len(normalizer.records_extra_data) == len(normalizer.data)
+    data = [
+        {
+            "sierraID": "1",
+            "leader": "00000nz  a2200000n  4500",
+            "fields": [
+                {
+                    "667": {
+                        "ind1": " ",
+                        "ind2": " ",
+                        "subfields": [
+                            {
+                                "a": "Val"
+                            }
+                        ]
+                    }
+                },
+            ]
+        },
+    ]
+    # default behavior - added if not in record &
+    normalizer = AuthoritiesRecordNormalizer(
+        sierra_data=data,
+        ALLOW_EDIT_FIELDS=[],
+        REPEATABLE_FIELDS=[],
+    )
+    for r in normalizer:
+        assert r.get_fields("667")[0].get_subfields("a")[0] == "Val"
+    # not edited if exists
+    normalizer = AuthoritiesRecordNormalizer(
+        sierra_data=data,
+        ALLOW_EDIT_FIELDS=[],
+        REPEATABLE_FIELDS=[]
+    )
+    for r in normalizer:
+        assert r.get_fields("667")[0].get_subfields("a")[0] == "Val"
+    # allow repeatable, new field will be added
+    normalizer = AuthoritiesRecordNormalizer(
+        sierra_data=data,
+        ALLOW_EDIT_FIELDS=[],
+        REPEATABLE_FIELDS=["667"]
+    )
+    for r in normalizer:
+        fields_667 = r.get_fields("667")
+        assert len(fields_667) == 2
+        assert fields_667[0].get_subfields("a")[0] == "Val"
+        assert fields_667[1].get_subfields("a")[0] == "Muudetud AI poolt"
+    # allow editing, field will be edited
+    normalizer = AuthoritiesRecordNormalizer(
+        sierra_data=data,
+        ALLOW_EDIT_FIELDS=["667"],
+        REPEATABLE_FIELDS=[]
+    )
+    for r in normalizer:
+        fields_667 = r.get_fields("667")
+        assert len(fields_667) == 1
+        assert fields_667[0].get_subfields("a")[0] == "Muudetud AI poolt"
+def validate_bibrecord_normalized(record: Record, has_viaf_data=False):
+    # source notes
+    assert record.get_fields("667")[0].get_subfields("a")[
+        0] == "Muudetud AI poolt"
+def validate_authorities_record_normalized(record: Record, has_viaf_data=False):
+    field_667 = record.get_fields("667")[0].get_subfields("a")[0]
+    assert field_667 == "Muudetud AI poolt" or field_667 == "Loodud AI poolt"
+    field_040_subfields = record.get_fields("040")[0]
+    # check that a, b & c subfields have values (can have default or unique)
+    assert len(field_040_subfields.get_subfields("a")) > 0
+    assert len(field_040_subfields.get_subfields("b")) > 0
+    assert len(field_040_subfields.get_subfields("c")) > 0
+    # check that 008 field has a value of length 40
+    field_008 = record.get_fields("008")[0].data
+    assert len(field_008) == 40
+    if has_viaf_data:
+        field_043 = record.get_fields("043")[0].get_subfields(
+            "c")[0]  # check that 043 has subfield c with value "ee"
+        assert field_043 == "ee"
+        field_024 = record.get_fields("024")
+        for f in field_024:
+            assert len(f.get_subfields("0")) > 0  # VIAF url
+        field_046 = record.get_fields("046")[0]
+        assert len(field_046.get_subfields("f")) > 0  # birth date
+        assert len(field_046.get_subfields("g")) > 0  # death date
+        # assert len(field_046.get_subfields("s")) > 0 # activity start
+        # assert len(field_046.get_subfields("t")) > 0 # activity end
+@pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
+def test_missing_fields_created_bibrecord_normalization():
+    normalizer_entities_only = BibRecordNormalizer(
+        entities=["Eduard Vilde", "Linda Vilde"],  # find one match
+    )
+    normalizer_sierra_data_only = BibRecordNormalizer(
+        sierra_data=EMPTY_SIERRA_RECORDS,
+    )
+    for record in normalizer_entities_only:
+        check_record_tags_have_values(
+            record, ["008", "046", "245",  # Sierra related, always with bibs
+                     "035",  "100",  # VIAf enriched
+                     ] + REQUIRED_FIELDS
+        )
+        validate_bibrecord_normalized(record, has_viaf_data=True)
+    for record in normalizer_sierra_data_only:
+        check_record_tags_have_values(
+            record, ["008", "046", "245",  # Sierra related, always with bibs
+                     ] + REQUIRED_FIELDS)
+        validate_bibrecord_normalized(record)
+@pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
+def test_missing_fields_created_authorities_normalization():
+    normalizer_entities_only = AuthoritiesRecordNormalizer(
+        entities=["Eduard Vilde"],  # find one match
+    )
+    normalizer_sierra_data_only = AuthoritiesRecordNormalizer(
+        sierra_data=EMPTY_SIERRA_RECORDS,
+    )
+    for r in normalizer_entities_only:
+        check_record_tags_have_values(r, ["008", "040",  # SIERRA related
+                                          "024", "043", "046"  # VIAF enriched
+                                          ] + REQUIRED_FIELDS)
+        validate_authorities_record_normalized(r, True)
+    for r in normalizer_sierra_data_only:
+        check_record_tags_have_values(
+            r, ["040"] + REQUIRED_FIELDS)
+        validate_authorities_record_normalized(r)
+@pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
+def test_normalized_fields_sorted():
+    unsorted_bibdata = [
+        {
+            "sierraID": "1",
+            "leader": "00000nz  a2200000n  4500",
+            "fields": [
+                {
+                        "035": {
+                            "ind1": " ",
+                            "ind2": " ",
+                            "subfields": [
+                                {
+                                    "a": "(ErESTER)<1>"
+                                }
+                            ]
+                        }
+                },
+                {
+                    "008": "220805|||aznnnaabn          || |||      nz n  "
+                },
+                {
+                    "046": {
+                        "ind1": " ",
+                        "ind2": " ",
+                        "subfields": [
+                            {
+                                "k": "1912"
+                            }
+                        ]
+                    }
+                },
+            ]
+        }
+    ]
+    normalizers = (BibRecordNormalizer, AuthoritiesRecordNormalizer)
+    for normalizer in normalizers:
+        normalizer = normalizer(
+            entities=[],
+            sierra_data=unsorted_bibdata
+        )
+        for r in normalizer:
+            check_no_dupe_tag_values(r)
+            check_record_tags_sorted(r)
+@pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
+def test_authority_normrecord_found_in_es_and_normalized():
+    """ KATA elastic normkirjete seast leitakse 1 vaste & normaliseerija täiendab leitud normkirjet VIAF infoga.
+        - valideeri normaliseerimise mapping, mis autori tabelis. Täiendatud väljad ja VIAFist info
+        - Valideeri märge lisatud (TODO) """
+    # Presume, author name identified and sent to linker
+    name = "Jaan Kross"
+    normalizer = AuthoritiesRecordNormalizer(
+        entities=[name]
+    )
+    data = normalizer.data
+    assert len(data) == 1
+    for r in normalizer:
+        check_record_tags_have_values(r, ["040"] + REQUIRED_FIELDS)
+        validate_authorities_record_normalized(r, has_viaf_data=True)
+@pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
+def test_authority_normrecord_not_found_in_es_and_viaf():
+    """KATA elastic normkirjete seast vastet ei leitud & linkija sooritab VIAFisse otsingu
+        - Üks vaste leiti - luuakse uus normkirje
+        - Ei leitud ühtegi vastet, või on leitud vasteid mitu - AI tuvastatud info põhjal uue kirje loomine(TODO)
+    """
+    # 1 result found
+    normalizer = AuthoritiesRecordNormalizer(entities=["Karl Ristikivi"])
+    data = normalizer.data
+    assert len(data) == 1  # should create new normalized record
+    # Entities not found, es & VIAF
+    normalizer = AuthoritiesRecordNormalizer(entities=["asdasd#@2"])
+    data = normalizer.data
+    assert len(data) == 0  # should create new normalized record
+    # multiple entities found, skipped
+    normalizer = AuthoritiesRecordNormalizer(entities=["Paul Keres"])
+    data = normalizer.data
+    assert len(data) == 0  # should not create anything atm
+@pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
+def test_matching_sierra_record_viaf_id_found():
+    """normkirjelt leitakse VIAF ID, vajadusel normi asukoht, kus see ID sisaldub."""
+    pass
+@pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
+def test_matching_sierra_record_viaf_id_not_found():
+    """kirjelt VIAF IDd ei leitud, soorita otsing VIAFi pihta, et leida _vastutav isik_?. Loo uus vastavalt otsingu tulemusele."""
+    pass
+@pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
+def test_authorities_normalizer_checks():
+    """
+    - kontrolli kas tuvastatud nimi on SIERRAst leitud vaste 1XX, 4XX väljadel. Kui pole, siis lisa 4XX väljale.
+    - kontrolli, kas VIAF andmete nimekujud on normkandes olemas. Kui pole, lisa need 4XX väljale.
+    - Kontrolli, kas VIAF kandes on sünni ja surma daatumid ja kas need klapivad normkandes olevaga. Kui pole, siis liiguta normkandest kogu 1XX väli 4XX väljale. Seejärel loo uute daatumitega 1XX väli.
+    - Kontrolli, et väljal 046 olevad daatumid klapiksid just 1xx väljale lisatuga. Kui andmeid muudeti, siis märgi, et baasis on normkanne muutunud
+    """
+    pass

rara_tools-0.2.0/tests/test_sierra_converters.py ADDED Viewed

@@ -0,0 +1,101 @@
+import os
+import pytest
+from rara_tools.converters import SierraResponseConverter
+from rara_tools.exceptions import SierraResponseConverterException
+from tests.const import SIERRA_OUTPUT_DIR
+from tests.test_utils import (read_json_file, get_formatted_sierra_response, compare_results)
+example_res = {
+    "total": 100,
+    "start": 50000,
+    "entries": [
+        {
+            "id": 1126963,
+            "updatedDate": "2016-02-09T08:42:52Z",
+            "createdDate": "2014-05-17T17:22:00Z",
+            "deleted": False,
+            "suppressed": False,
+            "marc": {
+                "leader": "00000nz  a2200145n  4500",
+                "fields": [
+                    {
+                        # "tag": "100",
+                        "data": {
+                            "ind1": "1",
+                                    "ind2": " ",
+                                    "subfields": [
+                                        {
+                                            "code": "a",
+                                            "data": "Viggor, Signe,"
+                                        },
+                                        {
+                                            "code": "d",
+                                            "data": "1975-"
+                                        }
+                                    ]
+                        }
+                    },
+                ]}}]}
+def test_convert_bibs_response():
+    data = get_formatted_sierra_response("bibs.json")
+    expected = read_json_file(os.path.join(SIERRA_OUTPUT_DIR, "bibs.json"))
+    assert compare_results(expected, data)
+def test_convert_keywords_response():
+    data = get_formatted_sierra_response("keywords.json")
+    expected = read_json_file(os.path.join(SIERRA_OUTPUT_DIR, "keywords.json"))
+    assert compare_results(expected, data)
+def test_convert_authorities_response():
+    data = get_formatted_sierra_response("authorities.json")
+    expected = read_json_file(os.path.join(
+        SIERRA_OUTPUT_DIR, "authorities.json"))
+    assert compare_results(expected, data)
+def test_converter_handles_marc_in_json_response():
+    """ Gracefully handle entries already in MARC-in-JSON format """
+    data = get_formatted_sierra_response("bibsmarc.json")
+    expected = read_json_file(os.path.join(SIERRA_OUTPUT_DIR, "bibsmarc.json"))
+    assert compare_results(expected, data)
+def test_convert_with_wrong_format():
+    with pytest.raises(SierraResponseConverterException):
+        SierraResponseConverter("$")
+def test_convert_missing_tag():
+    with pytest.raises(SierraResponseConverterException):
+        response = example_res.copy()
+        response["entries"][0]["marc"]["fields"][0].pop("tag", None)
+        converter = SierraResponseConverter(response)
+        converter.convert()
+def test_no_entries_in_response():
+    with pytest.raises(SierraResponseConverterException):
+        response = example_res.copy()
+        response.pop("entries", [])
+        converter = SierraResponseConverter(response)
+        converter.convert()

rara_tools-0.2.0/tests/test_utils.py ADDED Viewed

@@ -0,0 +1,77 @@
+from tests.const import SIERRA_INPUT_DIR, NORMALIZED_DIR, VIAF_TEST_DATA_DIR
+from rara_tools.constants import VIAF_ALLOWED_SOURCES
+from rara_tools.converters import SierraResponseConverter
+from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
+from rara_linker.linkers.linker import Linker
+from pymarc import Record
+from typing import List
+import json
+import os
+def read_json_file(path: str):
+    with open(path, "r") as f:
+        data = f.read()
+        return json.loads(data)
+def check_record_tags_sorted(record: Record):
+    record_tags = [field.tag for field in record.get_fields()]
+    assert record_tags == sorted(record_tags)
+def check_no_dupe_tag_values(record: Record):
+    repetable_tags = ["024", "035", "400", "670"]
+    record_tags = [field.tag for field in record.get_fields() if field.tag not in repetable_tags]
+    assert len(record_tags) == len(set(record_tags))
+def check_record_tags_have_values(record: Record, tags: List[str]):
+    for tag in tags:
+        assert record[tag] is not None
+def get_record_field_value(record: Record, tag: str):
+    """ handle control & variable fields """
+    return record.get_fields(tag)[0].value()
+def compare_results(expected: dict, results: dict):
+    return json.dumps(expected) == json.dumps(results)
+def get_formatted_sierra_response(fname: str):
+    """ Reads a mock Sierra response file and converts it to MARC in json."""
+    response = read_json_file(os.path.join(SIERRA_INPUT_DIR, fname))
+    converter = SierraResponseConverter(response)
+    return converter.convert()
+def get_viaf_record(id: str, allowed_sources: list):
+    """ Fetches VIAF record by ID and returns a VIAFRecord object """
+    client = VIAFClient()  # should use Linker instead? not ViafLinker directly
+    response = client.get_records_by_viaf_id(id)
+    viaf_record = VIAFRecord(
+        response, allowed_sources=allowed_sources)
+    return viaf_record
+def search_viaf_record(search_term: str, allowed_sources: list):
+    """ Fetches VIAF record by name and returns a VIAFRecord object """
+    client = VIAFClient()
+    response = client.get_records_by_search_term(search_term)
+    return VIAFRecord(response, allowed_sources=allowed_sources)
+def get_normalized_example(fname: str):
+    with open(os.path.join(NORMALIZED_DIR, fname), "r") as f:
+        data = f.read()
+        return json.loads(data)

rara_tools-0.2.0/tests/test_viaf_client.py ADDED Viewed

@@ -0,0 +1,19 @@
+from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
+def test_fetch_clusters_by_id_list():
+    viaf_ids = ["7432247", "456"]
+    client = VIAFClient()
+    results = client.fetch_viaf_clusters(viaf_ids)
+    assert len(results) == 2
+    assert results["456"] == {}
+    assert len(results["7432247"]) > 0
+def test_fetch_viaf_results_for_normalizer():
+    viaf_ids = ["7432247", "456"]
+    client = VIAFClient()
+    results = client.get_normalized_data(viaf_ids)
+    assert len(results) == 2

rara_tools-0.0.13/VERSION DELETED Viewed

	@@ -1 +0,0 @@
1	- 0.0.13

rara_tools-0.0.13/rara_tools/constants/__init__.py DELETED Viewed

File without changes

rara_tools-0.0.13/tests/test_converters.py DELETED Viewed

@@ -1,127 +0,0 @@
-import json
-import os
-import pytest
-from rara_tools.converters import SierraResponseConverter
-from rara_tools.exceptions import SierraResponseConverterException
-import json
-root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-SIERRA_TEST_DATA_DIR = os.path.join(root, "tests", "test_data", "sierra")
-INPUT_DIR = os.path.join(SIERRA_TEST_DATA_DIR, "input")
-OUTPUT_DIR = os.path.join(SIERRA_TEST_DATA_DIR, "output")
-def compare_results(expected, converted):
-    return json.dumps(expected) == json.dumps(converted)
-def read_json_file(file_path):
-    with open(file_path, "r") as f:
-        data = f.read()
-        return json.loads(data)
-example_res =  {
-            "total": 100,
-            "start": 50000,
-            "entries": [
-                {
-                    "id": 1126963,
-                    "updatedDate": "2016-02-09T08:42:52Z",
-                    "createdDate": "2014-05-17T17:22:00Z",
-                    "deleted": False,
-                    "suppressed": False,
-                    "marc": {
-                        "leader": "00000nz  a2200145n  4500",
-                        "fields": [
-                            {
-                                # "tag": "100",
-                                "data": {
-                                    "ind1": "1",
-                                    "ind2": " ",
-                                    "subfields": [
-                                        {
-                                            "code": "a",
-                                            "data": "Viggor, Signe,"
-                                        },
-                                        {
-                                            "code": "d",
-                                            "data": "1975-"
-                                        }
-                                    ]
-                                }
-                            },
-            ]}}]}
-def test_convert_bibs_response():
-    response = read_json_file(os.path.join(INPUT_DIR, "bibs.json"))
-    converter = SierraResponseConverter(response)
-    data = converter.convert()
-    expected = read_json_file(os.path.join(OUTPUT_DIR, "bibs.json"))
-    assert compare_results(expected, data)
-def test_convert_keywords_response():
-    with open(os.path.join(INPUT_DIR, "keywords.json"), "r") as f:
-        response = f.read()
-        response = json.loads(response)
-    converter = SierraResponseConverter(response)
-    data = converter.convert()
-    expected = read_json_file(os.path.join(OUTPUT_DIR, "keywords.json"))
-    assert compare_results(expected, data)
-def test_convert_authorities_response():
-    with open(os.path.join(INPUT_DIR, "authorities.json"), "r") as f:
-        response = f.read()
-        response = json.loads(response)
-    converter = SierraResponseConverter(response)
-    data = converter.convert()
-    expected = read_json_file(os.path.join(OUTPUT_DIR, "authorities.json"))
-    assert compare_results(expected, data)
-def test_converter_handles_marc_in_json_response():
-    """ Gracefully handle entries already in MARC-in-JSON format """
-    with open(os.path.join(INPUT_DIR, "bibsmarc.json"), "r") as f:
-        response = f.read()
-        response = json.loads(response)
-    converter = SierraResponseConverter(response)
-    data = converter.convert()
-    expected = read_json_file(os.path.join(OUTPUT_DIR, "bibsmarc.json"))
-    assert compare_results(expected, data)
-def test_convert_with_wrong_format():
-    with pytest.raises(SierraResponseConverterException):
-        SierraResponseConverter("$")
-def test_convert_missing_tag():
-    with pytest.raises(SierraResponseConverterException):
-        response = example_res.copy()
-        response["entries"][0]["marc"]["fields"][0].pop("tag", None)
-        converter = SierraResponseConverter(response)
-        converter.convert()
-def test_no_entries_in_response():
-    with pytest.raises(SierraResponseConverterException):
-        response = example_res.copy()
-        response.pop("entries", [])
-        converter = SierraResponseConverter(response)
-        converter.convert()