PyPI - rara-tools - Versions diffs - 0.0.12__tar.gz → 0.1.0__tar.gz - Mend

rara-tools 0.0.12__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (32)

{rara_tools-0.0.12/rara_tools.egg-info → rara_tools-0.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.0.12
+Version: 0.1.0
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
@@ -18,6 +18,7 @@ Requires-Dist: iso639-lang
 Provides-Extra: testing
 Requires-Dist: pytest>=8.0; extra == "testing"
 Requires-Dist: pytest-order; extra == "testing"
+Dynamic: license-file
 # RaRa Tools

rara_tools-0.1.0/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.1.0

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools/constants/digitizer.py RENAMED Viewed

@@ -1,4 +1,12 @@
+COMPONENT_KEY = "digitizer"
+class ModelTypes:
+    IMAGE_PROCESSOR = "image_processor"
 class StatusKeys:
+    DOWNLOAD_MODELS = "digitizer_download_models"
     CLEAN_UP = "digitizer_clean_up"
     ELASTICSEARCH_UPLOAD = "digitizer_elasticsearch_upload"
     UPLOAD = "s3_upload"
@@ -11,3 +19,7 @@ class Queue:
     DOWNLOAD = "download"
     FINISH = "finish"
     OCR = "ocr"
+class Tasks:
+    MODEL_UPDATE = "component_model_update"

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools/constants/general.py RENAMED Viewed

@@ -1,4 +1,5 @@
 class Status:
+    SKIPPED = "SKIPPED"
     FAILED = "FAILED"
     PENDING = "PENDING"
     RUNNING = "RUNNING"

rara_tools-0.1.0/rara_tools/converters.py ADDED Viewed

@@ -0,0 +1,81 @@
+from .exceptions import SierraResponseConverterException
+class SierraResponseConverter:
+    """Converts a JSON response from the Sierra API to MARC-in-JSON format."""
+    def __init__(self, response: dict):
+        if not isinstance(response, dict):
+            raise SierraResponseConverterException("Please provide a valid JSON response.")
+        self.response = response
+    def _map_control_fields(self, field: dict) -> dict:
+        # for tags < 010, no subfields, instead one str value in "value"
+        return {field["tag"]: field["value"]}
+    def _map_data_fields(self, field: dict) -> dict:
+        """ Maps marc fields > 010.
+        Args:
+            field (dict): Contains the marc tag and list with indicators and subfields.
+        Returns:
+            dict: standardised marc-in-json format.
+        """
+        data = field["data"]
+        # Order matters ind1, in2, subfields
+        field_data = {
+            "ind1": data.get("ind1", " "),
+            "ind2": data.get("ind2", " "),
+            "subfields": data.get("subfields", [])
+        }
+        return {field["tag"]: field_data}
+    def _is_marc21structured(self, field: dict) -> bool:
+        """Checks if the field is already structured according to MARC21 in JSON"""
+        return any(key.isdigit() for key in field.keys())
+    def _handle_field_type(self, field: dict) -> dict:
+        if self._is_marc21structured(field):
+            return field
+        if field.get("data"):
+            return self._map_data_fields(field)
+        tag = field.get("tag")
+        if not tag:
+            raise SierraResponseConverterException("Field is missing MARC21 tag.")
+        if tag < "010":
+            return self._map_control_fields(field)
+        else:
+            return self._map_data_fields(field)
+    def _convert_response(self) -> list:
+        entries = self.response.get("entries")
+        if not entries:
+            raise SierraResponseConverterException("No entries found in the response.")
+        try:
+            return {"fields": [
+                {e["id"]: [
+                    self._handle_field_type(f) for f in e["marc"]["fields"]
+                    ]}
+                for e in entries
+            ]}
+        except KeyError as e:
+            raise SierraResponseConverterException(f"Malformed response: missing key {e}")
+    def convert(self) -> list:
+        try:
+            return self._convert_response()
+        except Exception as e:
+            raise SierraResponseConverterException(f"An unexpected error occurred: {e}")

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools/exceptions.py RENAMED Viewed

@@ -7,6 +7,10 @@ class S3InitException(Exception):
 class S3ConnectionException(Exception):
     """Raised S3 Bucket/Connection Error."""
+class S3DownloadException(Exception):
+    """Raised S3 Download Error."""
 class ElasticsearchException(Exception):
     """Raised Elasticsearch Error."""

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools/s3.py RENAMED Viewed

@@ -1,11 +1,20 @@
+import logging
 import os
+import pathlib
+import time
 import uuid
 from typing import Any, Generator, List, Optional
-from minio import Minio
+from minio import Minio, S3Error
-from .exceptions import (S3ConnectionException, S3InitException,
-                         S3InputException)
+from .exceptions import (
+    S3ConnectionException,
+    S3InitException,
+    S3InputException,
+    S3DownloadException
+)
+logger = logging.getLogger("tools.s3")
 class S3Files:
@@ -76,9 +85,49 @@ class S3Files:
         list_of_objects = list(self.minio_client.list_objects(self.bucket, prefix=path, recursive=True))
         for minio_object in list_of_objects:
             full_path = os.path.join(download_dir, minio_object.object_name)
-            self.minio_client.fget_object(self.bucket, minio_object.object_name, full_path)
+            self._download_file(minio_object.object_name, full_path)
             yield full_path
+    def _download_file(self, path, download_dir=".", max_retries=3) -> str:
+        """Download a single file with retry and resume support."""
+        attempts = 0
+        while attempts < max_retries:
+            try:
+                stat = self.minio_client.stat_object(self.bucket, path)
+                file_size = stat.size
+                temp_path = download_dir + ".part"
+                pathlib.Path(temp_path).parent.mkdir(parents=True, exist_ok=True)
+                # Check if a partial file exists
+                downloaded_size = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
+                if downloaded_size >= file_size:
+                    os.rename(temp_path, download_dir)  # Rename to final filename
+                    logger.info(f"Completed: {path}")
+                    return str(pathlib.Path(download_dir) / path)
+                logger.info(f"Downloading {path} ({downloaded_size}/{file_size} bytes)...")
+                # Open file in append mode to resume download
+                with open(temp_path, "ab") as f:
+                    response = self.minio_client.get_object(self.bucket, path, offset=downloaded_size)
+                    for data in response.stream(32 * 1024):  # 32KB chunks
+                        f.write(data)
+                    response.close()
+                    response.release_conn()
+                os.rename(temp_path, download_dir)  # Rename temp to final
+                logger.info(f"Downloaded: {path}")
+                return str(pathlib.Path(download_dir) / path)
+            except S3Error as e:
+                logger.info(f"Error downloading {path}, attempt {attempts + 1}: {e}")
+                attempts += 1
+                time.sleep(2 ** attempts)  # Exponential backoff
+        raise S3DownloadException(f"Failed to download {path} after {max_retries} attempts.")
     def upload(self, path: str, prefix: Optional[str] = "") -> str:
         """Uploads file or folder to S3 bucket.
         :param: path str: Path to the file to upload in local file system.

{rara_tools-0.0.12 → rara_tools-0.1.0/rara_tools.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.0.12
+Version: 0.1.0
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
@@ -18,6 +18,7 @@ Requires-Dist: iso639-lang
 Provides-Extra: testing
 Requires-Dist: pytest>=8.0; extra == "testing"
 Requires-Dist: pytest-order; extra == "testing"
+Dynamic: license-file
 # RaRa Tools

{rara_tools-0.0.12 → rara_tools-0.1.0}/tests/test_converters.py RENAMED Viewed

@@ -5,12 +5,22 @@ import pytest
 from rara_tools.converters import SierraResponseConverter
 from rara_tools.exceptions import SierraResponseConverterException
+import json
 root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 SIERRA_TEST_DATA_DIR = os.path.join(root, "tests", "test_data", "sierra")
 INPUT_DIR = os.path.join(SIERRA_TEST_DATA_DIR, "input")
 OUTPUT_DIR = os.path.join(SIERRA_TEST_DATA_DIR, "output")
+def compare_results(expected, converted):
+    return json.dumps(expected) == json.dumps(converted)
+def read_json_file(file_path):
+    with open(file_path, "r") as f:
+        data = f.read()
+        return json.loads(data)
 example_res =  {
             "total": 100,
             "start": 50000,
@@ -27,6 +37,8 @@ example_res =  {
                             {
                                 # "tag": "100",
                                 "data": {
+                                    "ind1": "1",
+                                    "ind2": " ",
                                     "subfields": [
                                         {
                                             "code": "a",
@@ -36,18 +48,13 @@ example_res =  {
                                             "code": "d",
                                             "data": "1975-"
                                         }
-                                    ],
-                                    "ind1": "1",
-                                    "ind2": " "
+                                    ]
                                 }
                             },
             ]}}]}
-def read_json_file(file_path):
-    with open(file_path, "r") as f:
-        data = f.read()
-        return json.loads(data)
 def test_convert_bibs_response():
     response = read_json_file(os.path.join(INPUT_DIR, "bibs.json"))
@@ -55,8 +62,9 @@ def test_convert_bibs_response():
     converter = SierraResponseConverter(response)
     data = converter.convert()
-    expected = read_json_file(os.path.join(OUTPUT_DIR, "bibs.json"))
-    assert data == expected
+    expected = read_json_file(os.path.join(OUTPUT_DIR, "bibs.json"))
+    assert compare_results(expected, data)
 def test_convert_keywords_response():
@@ -67,9 +75,10 @@ def test_convert_keywords_response():
     converter = SierraResponseConverter(response)
     data = converter.convert()
     expected = read_json_file(os.path.join(OUTPUT_DIR, "keywords.json"))
-    assert data == expected
+    assert compare_results(expected, data)
 def test_convert_authorities_response():
@@ -82,7 +91,20 @@ def test_convert_authorities_response():
     expected = read_json_file(os.path.join(OUTPUT_DIR, "authorities.json"))
-    assert data == expected
+    assert compare_results(expected, data)
+def test_converter_handles_marc_in_json_response():
+    """ Gracefully handle entries already in MARC-in-JSON format """
+    with open(os.path.join(INPUT_DIR, "bibsmarc.json"), "r") as f:
+        response = f.read()
+        response = json.loads(response)
+    converter = SierraResponseConverter(response)
+    data = converter.convert()
+    expected = read_json_file(os.path.join(OUTPUT_DIR, "bibsmarc.json"))
+    assert compare_results(expected, data)
 def test_convert_with_wrong_format():
     with pytest.raises(SierraResponseConverterException):

{rara_tools-0.0.12 → rara_tools-0.1.0}/tests/test_elastic_vector_and_search_operations.py RENAMED Viewed

@@ -15,7 +15,7 @@ TEST_DOCUMENTS = load_json("./tests/test_data/elastic_vectorized_docs.json")
 TEST_VECTOR_DATA = load_json("./tests/test_data/test_vector_data.json")
 TEST_VECTOR = TEST_VECTOR_DATA.get("vector")
-es_url = os.getenv("ELASTIC_TEST_URL", "http://rara-elastic.texta.ee:9200")#http://localhost:9200")
+es_url = os.getenv("ELASTIC_TEST_URL", "http://localhost:9200")
 ELASTIC = KataElastic(es_url)
 TEST_KNN_INDEX_NAME = "tools_knn_testing_index"

rara_tools-0.0.12/VERSION DELETED Viewed

	@@ -1 +0,0 @@
1	- 0.0.12

rara_tools-0.0.12/rara_tools/converters.py DELETED Viewed

@@ -1,41 +0,0 @@
-from .exceptions import SierraResponseConverterException
-class SierraResponseConverter:
-    """  Takes a JSON response from the Sierra API (https://tester.ester.ee/iii/sierra-api/swagger/index.html)
-    and converts it to MARC-in-JSON format.
-    """
-    def __init__(self, response: dict):
-        if not isinstance(response, dict):
-            raise SierraResponseConverterException("Please provide a valid JSON response.")
-        self.response = response
-    def _map_field_data(self, field):
-        tag = field.get("tag")
-        if not tag:
-            raise SierraResponseConverterException("Field is missing a valid 'tag'.")
-        data = field.get("data", {})
-        return {tag: data}
-    def _convert_response(self):
-        response = self.response
-        entries = response.get("entries")
-        if not entries:
-            raise SierraResponseConverterException("No entries found in the response.")
-        try:
-            fields = [self._map_field_data(f) for e in entries for f in e["marc"]["fields"]]
-        except KeyError as e:
-            raise SierraResponseConverterException(f"Missing expected MARC fields in the response: {e}")
-        return {"fields": fields}
-    def convert(self):
-        """Runner method, converts the response to MARC-in-JSON format with error handling."""
-        try:
-            return self._convert_response()
-        except Exception as e:
-            raise SierraResponseConverterException(f"An unexpected error occurred during conversion: {e}")

{rara_tools-0.0.12 → rara_tools-0.1.0}/LICENSE.md RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/README.md RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/pyproject.toml RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools/constants/__init__.py RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools/decorators.py RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools/digar_schema_converter.py RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools/elastic.py RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools/task_reporter.py RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools/utils.py RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools.egg-info/requires.txt RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/rara_tools.egg-info/top_level.txt RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/requirements.txt RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/setup.cfg RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/tests/test_digar_schema_converter.py RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/tests/test_elastic.py RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/tests/test_s3_exceptions.py RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/tests/test_s3_file_operations.py RENAMED Viewed

File without changes

{rara_tools-0.0.12 → rara_tools-0.1.0}/tests/test_task_reporter.py RENAMED Viewed

File without changes

rara-tools 0.0.12__tar.gz → 0.1.0__tar.gz

Potentially problematic release.

rara-tools 0.0.12__tar.gz → 0.1.0__tar.gz