rara-tools 0.0.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -0,0 +1 @@
1
+ from .normalizers import *
@@ -1,4 +1,12 @@
1
+ COMPONENT_KEY = "digitizer"
2
+
3
+
4
+ class ModelTypes:
5
+ IMAGE_PROCESSOR = "image_processor"
6
+
7
+
1
8
  class StatusKeys:
9
+ DOWNLOAD_MODELS = "digitizer_download_models"
2
10
  CLEAN_UP = "digitizer_clean_up"
3
11
  ELASTICSEARCH_UPLOAD = "digitizer_elasticsearch_upload"
4
12
  UPLOAD = "s3_upload"
@@ -11,3 +19,7 @@ class Queue:
11
19
  DOWNLOAD = "download"
12
20
  FINISH = "finish"
13
21
  OCR = "ocr"
22
+
23
+
24
+ class Tasks:
25
+ MODEL_UPDATE = "component_model_update"
@@ -1,4 +1,5 @@
1
1
  class Status:
2
+ SKIPPED = "SKIPPED"
2
3
  FAILED = "FAILED"
3
4
  PENDING = "PENDING"
4
5
  RUNNING = "RUNNING"
@@ -0,0 +1,17 @@
1
+ from pymarc import Indicators
2
+ import os
3
+
4
+ EMPTY_INDICATORS = Indicators(" ", " ")
5
+ VIAF_ALLOWED_SOURCES = ["LC", "DNB", "LNB", "NLL",
6
+ "ERRR", "J9U"]
7
+
8
+ ES_HOST = os.getenv("ELASTIC_TEST_URL", "http://localhost:9200")
9
+
10
+ LINKER_CONFIG = {
11
+ "add_viaf_info": True,
12
+ "vectorizer_data_path": "./vectorizer_data",
13
+ "per_config": {"es_host": ES_HOST},
14
+ "org_config": {"es_host": ES_HOST},
15
+ "loc_config": {"es_host": ES_HOST},
16
+ "ems_config": {"es_host": ES_HOST},
17
+ }
rara_tools/converters.py CHANGED
@@ -1,19 +1,22 @@
1
- from .exceptions import SierraResponseConverterException
1
+ from rara_tools.exceptions import SierraResponseConverterException
2
2
 
3
3
 
4
4
  class SierraResponseConverter:
5
5
  """Converts a JSON response from the Sierra API to MARC-in-JSON format."""
6
-
6
+
7
7
  def __init__(self, response: dict):
8
8
  if not isinstance(response, dict):
9
- raise SierraResponseConverterException("Please provide a valid JSON response.")
9
+ raise SierraResponseConverterException(
10
+ "Please provide a valid JSON response.")
10
11
  self.response = response
11
-
12
- def _map_control_fields(self, field: dict) -> dict:
13
- # for tags < 010, no subfields, instead one str value in "value"
12
+
13
+ @staticmethod
14
+ def _map_control_fields(field: dict) -> dict:
15
+ # for tags < 010, no subfields, instead one str value in "value"
14
16
  return {field["tag"]: field["value"]}
15
-
16
- def _map_data_fields(self, field: dict) -> dict:
17
+
18
+ @staticmethod
19
+ def _map_data_fields(field: dict) -> dict:
17
20
  """ Maps marc fields > 010.
18
21
 
19
22
  Args:
@@ -22,60 +25,66 @@ class SierraResponseConverter:
22
25
  Returns:
23
26
  dict: standardised marc-in-json format.
24
27
  """
25
-
28
+
26
29
  data = field["data"]
27
-
30
+
28
31
  # Order matters ind1, in2, subfields
29
32
  field_data = {
30
33
  "ind1": data.get("ind1", " "),
31
34
  "ind2": data.get("ind2", " "),
32
35
  "subfields": data.get("subfields", [])
33
36
  }
34
-
37
+
35
38
  return {field["tag"]: field_data}
36
-
37
- def _is_marc21structured(self, field: dict) -> bool:
39
+
40
+ @staticmethod
41
+ def _is_marc21structured(field: dict) -> bool:
38
42
  """Checks if the field is already structured according to MARC21 in JSON"""
39
43
  return any(key.isdigit() for key in field.keys())
40
-
41
-
44
+
42
45
  def _handle_field_type(self, field: dict) -> dict:
43
-
46
+
44
47
  if self._is_marc21structured(field):
45
48
  return field
46
-
49
+
47
50
  if field.get("data"):
48
51
  return self._map_data_fields(field)
49
-
52
+
50
53
  tag = field.get("tag")
51
-
54
+
52
55
  if not tag:
53
- raise SierraResponseConverterException("Field is missing MARC21 tag.")
54
-
56
+ raise SierraResponseConverterException(
57
+ "Field is missing MARC21 tag.")
58
+
55
59
  if tag < "010":
56
60
  return self._map_control_fields(field)
57
61
  else:
58
62
  return self._map_data_fields(field)
59
-
63
+
60
64
  def _convert_response(self) -> list:
61
65
  entries = self.response.get("entries")
62
66
  if not entries:
63
- raise SierraResponseConverterException("No entries found in the response.")
64
-
67
+ raise SierraResponseConverterException(
68
+ "No entries found in the response.")
69
+
65
70
  try:
66
- return {"fields": [
67
- {e["id"]: [
68
- self._handle_field_type(f) for f in e["marc"]["fields"]
71
+ return [
72
+ {
73
+ "sierraID": str(e["id"]),
74
+ "leader": e["marc"]["leader"],
75
+ "fields": [
76
+ self._handle_field_type(f) for f in e["marc"]["fields"]
69
77
  ]}
70
78
  for e in entries
71
- ]}
72
-
79
+ ]
80
+
73
81
  except KeyError as e:
74
- raise SierraResponseConverterException(f"Malformed response: missing key {e}")
75
-
76
-
82
+ raise SierraResponseConverterException(
83
+ f"Malformed response: missing key {e}")
84
+
77
85
  def convert(self) -> list:
78
86
  try:
79
87
  return self._convert_response()
80
88
  except Exception as e:
81
- raise SierraResponseConverterException(f"An unexpected error occurred: {e}")
89
+ raise SierraResponseConverterException(
90
+ f"An unexpected error occurred: {e}")
rara_tools/exceptions.py CHANGED
@@ -7,6 +7,10 @@ class S3InitException(Exception):
7
7
  class S3ConnectionException(Exception):
8
8
  """Raised S3 Bucket/Connection Error."""
9
9
 
10
+ class S3DownloadException(Exception):
11
+ """Raised S3 Download Error."""
12
+
13
+
10
14
  class ElasticsearchException(Exception):
11
15
  """Raised Elasticsearch Error."""
12
16
 
rara_tools/s3.py CHANGED
@@ -1,11 +1,20 @@
1
+ import logging
1
2
  import os
3
+ import pathlib
4
+ import time
2
5
  import uuid
3
6
  from typing import Any, Generator, List, Optional
4
7
 
5
- from minio import Minio
8
+ from minio import Minio, S3Error
6
9
 
7
- from .exceptions import (S3ConnectionException, S3InitException,
8
- S3InputException)
10
+ from .exceptions import (
11
+ S3ConnectionException,
12
+ S3InitException,
13
+ S3InputException,
14
+ S3DownloadException
15
+ )
16
+
17
+ logger = logging.getLogger("tools.s3")
9
18
 
10
19
 
11
20
  class S3Files:
@@ -76,9 +85,49 @@ class S3Files:
76
85
  list_of_objects = list(self.minio_client.list_objects(self.bucket, prefix=path, recursive=True))
77
86
  for minio_object in list_of_objects:
78
87
  full_path = os.path.join(download_dir, minio_object.object_name)
79
- self.minio_client.fget_object(self.bucket, minio_object.object_name, full_path)
88
+ self._download_file(minio_object.object_name, full_path)
80
89
  yield full_path
81
90
 
91
+ def _download_file(self, path, download_dir=".", max_retries=3) -> str:
92
+ """Download a single file with retry and resume support."""
93
+ attempts = 0
94
+
95
+ while attempts < max_retries:
96
+ try:
97
+ stat = self.minio_client.stat_object(self.bucket, path)
98
+ file_size = stat.size
99
+ temp_path = download_dir + ".part"
100
+ pathlib.Path(temp_path).parent.mkdir(parents=True, exist_ok=True)
101
+
102
+ # Check if a partial file exists
103
+ downloaded_size = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
104
+
105
+ if downloaded_size >= file_size:
106
+ os.rename(temp_path, download_dir) # Rename to final filename
107
+ logger.info(f"Completed: {path}")
108
+ return str(pathlib.Path(download_dir) / path)
109
+
110
+ logger.info(f"Downloading {path} ({downloaded_size}/{file_size} bytes)...")
111
+
112
+ # Open file in append mode to resume download
113
+ with open(temp_path, "ab") as f:
114
+ response = self.minio_client.get_object(self.bucket, path, offset=downloaded_size)
115
+ for data in response.stream(32 * 1024): # 32KB chunks
116
+ f.write(data)
117
+ response.close()
118
+ response.release_conn()
119
+
120
+ os.rename(temp_path, download_dir) # Rename temp to final
121
+ logger.info(f"Downloaded: {path}")
122
+ return str(pathlib.Path(download_dir) / path)
123
+
124
+ except S3Error as e:
125
+ logger.info(f"Error downloading {path}, attempt {attempts + 1}: {e}")
126
+ attempts += 1
127
+ time.sleep(2 ** attempts) # Exponential backoff
128
+
129
+ raise S3DownloadException(f"Failed to download {path} after {max_retries} attempts.")
130
+
82
131
  def upload(self, path: str, prefix: Optional[str] = "") -> str:
83
132
  """Uploads file or folder to S3 bucket.
84
133
  :param: path str: Path to the file to upload in local file system.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.0.13
3
+ Version: 0.2.0
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -13,11 +13,15 @@ License-File: LICENSE.md
13
13
  Requires-Dist: elasticsearch==8.*
14
14
  Requires-Dist: elasticsearch_dsl==8.*
15
15
  Requires-Dist: minio==7.*
16
+ Requires-Dist: rara-norm-linker==1.*
16
17
  Requires-Dist: requests
17
18
  Requires-Dist: iso639-lang
19
+ Requires-Dist: pymarc
20
+ Requires-Dist: glom
18
21
  Provides-Extra: testing
19
22
  Requires-Dist: pytest>=8.0; extra == "testing"
20
23
  Requires-Dist: pytest-order; extra == "testing"
24
+ Dynamic: license-file
21
25
 
22
26
  # RaRa Tools
23
27
 
@@ -0,0 +1,17 @@
1
+ rara_tools/converters.py,sha256=_1ZRH4ACLOolI1G5b_aSssN68rWOvan-q2dTq7D7-j4,2794
2
+ rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
3
+ rara_tools/digar_schema_converter.py,sha256=k95U2iRlEA3sh772-v6snhHW6fju6qSTMnvWJ6DpzZk,14254
4
+ rara_tools/elastic.py,sha256=MgPHxZ3UbSTIL8_sT9gU5V4PLKJjo3aQ8CGyhXjRz6M,13065
5
+ rara_tools/exceptions.py,sha256=YQyaueUbXeTkJYFDEuN6iWTXMI3eCv5l7PxGp87vg5I,550
6
+ rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
7
+ rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
8
+ rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
9
+ rara_tools/constants/__init__.py,sha256=r78laM9vyRDAvzDhPvzDlhaX6qPwUUBBtwf1WosrW3o,27
10
+ rara_tools/constants/digitizer.py,sha256=MND0dUQySBAOVWzuUBxQGZWv_Ckdz2jCp25F2_oHGi8,496
11
+ rara_tools/constants/general.py,sha256=aVUQTMss89atAkTDZKJXNdnsBHPX-RSrlBOtt-wdPGU,195
12
+ rara_tools/constants/normalizers.py,sha256=eM-REyHen8MdBRYD0s2fQcYrvWxDwWfZlYGpBvdLog0,494
13
+ rara_tools-0.2.0.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
14
+ rara_tools-0.2.0.dist-info/METADATA,sha256=YgPsOKoNplzOs4PVlgJX9eaw65iTSfD9C-Ba374fK2A,3995
15
+ rara_tools-0.2.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
16
+ rara_tools-0.2.0.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
17
+ rara_tools-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.1)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,16 +0,0 @@
1
- rara_tools/converters.py,sha256=O769zNjde1VCfEUF2VU_49IAbm8NT-cG-VR0uPxixtE,2687
2
- rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
3
- rara_tools/digar_schema_converter.py,sha256=k95U2iRlEA3sh772-v6snhHW6fju6qSTMnvWJ6DpzZk,14254
4
- rara_tools/elastic.py,sha256=MgPHxZ3UbSTIL8_sT9gU5V4PLKJjo3aQ8CGyhXjRz6M,13065
5
- rara_tools/exceptions.py,sha256=BwNh4qWxau_ylr9RqZoYwd1KnExI6oWWWDno3jkh8q4,474
6
- rara_tools/s3.py,sha256=uNDu2HzMYHAWh33RcHeyPFK7gdQfQPxsdfohyIKezEY,4467
7
- rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
8
- rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
9
- rara_tools/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- rara_tools/constants/digitizer.py,sha256=gJ3jOMwuZfKcLqgOAxTyB266VYsskLabJiMUiSz3xX4,297
11
- rara_tools/constants/general.py,sha256=E9Jaw-YxocS_tOZw9QBoxO3e9KK5EMbLoM0R7D4Iflw,171
12
- rara_tools-0.0.13.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
13
- rara_tools-0.0.13.dist-info/METADATA,sha256=0Aipkuodi_CzCTUMkVqKOI__n5mN2r8hEGJ49-MjpMo,3895
14
- rara_tools-0.0.13.dist-info/WHEEL,sha256=nn6H5-ilmfVryoAQl3ZQ2l8SH5imPWFpm1A5FgEuFV4,91
15
- rara_tools-0.0.13.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
16
- rara_tools-0.0.13.dist-info/RECORD,,