rara-tools 0.0.4__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

File without changes
@@ -0,0 +1,13 @@
1
class StatusKeys:
    """String identifiers used as status keys.

    All values except ``UPLOAD`` carry a ``digitizer_`` prefix.
    """

    # Housekeeping and indexing keys.
    CLEAN_UP = "digitizer_clean_up"
    ELASTICSEARCH_UPLOAD = "digitizer_elasticsearch_upload"

    # S3 transfer keys.
    UPLOAD = "s3_upload"
    DOWNLOAD = "digitizer_s3_download"

    # Text-recognition key.
    OCR = "digitizer_ocr"
7
+
8
+
9
class Queue:
    """Queue name constants (``io``, ``download``, ``finish``, ``ocr``)."""

    IO = "io"
    DOWNLOAD = "download"
    FINISH = "finish"
    OCR = "ocr"
@@ -0,0 +1,10 @@
1
class Status:
    """Status value constants; each value equals its attribute name."""

    FAILED = "FAILED"
    PENDING = "PENDING"
    RUNNING = "RUNNING"
    COMPLETED = "COMPLETED"
    RETRYING = "RETRYING"
7
+
8
+
9
class Queue:
    """Queue name constants; only the ``core`` queue is defined here."""

    CORE = "core"
@@ -0,0 +1,41 @@
1
+ from .exceptions import SierraResponseConverterException
2
+
3
+
4
class SierraResponseConverter:
    """Convert a JSON response from the Sierra API into MARC-in-JSON format.

    API reference: https://tester.ester.ee/iii/sierra-api/swagger/index.html
    """

    def __init__(self, response: dict):
        """Store the response that is to be converted.

        :param: response dict: Parsed JSON response from the Sierra API.
        :raises SierraResponseConverterException: If response is not a dict.
        """
        if not isinstance(response, dict):
            raise SierraResponseConverterException("Please provide a valid JSON response.")
        self.response = response

    def _map_field_data(self, field):
        """Map a single MARC field to a ``{tag: data}`` dict.

        :param: field dict: Field object; its "tag" and "data" keys are read.
        :return: Dict with the tag as key and the data (``{}`` if absent) as value.
        :raises SierraResponseConverterException: If the tag is missing or empty.
        """
        tag = field.get("tag")
        if not tag:
            raise SierraResponseConverterException("Field is missing a valid 'tag'.")
        data = field.get("data", {})
        return {tag: data}

    def _convert_response(self):
        """Collect the MARC fields of every entry into one ``{"fields": [...]}`` dict.

        :return: Dict with a single "fields" key holding the mapped fields.
        :raises SierraResponseConverterException: If there are no entries or an
            entry lacks the ``marc`` / ``fields`` keys.
        """
        entries = self.response.get("entries")
        if not entries:
            raise SierraResponseConverterException("No entries found in the response.")

        try:
            fields = [
                self._map_field_data(field)
                for entry in entries
                for field in entry["marc"]["fields"]
            ]
        except KeyError as error:
            # Chain the KeyError so the traceback still shows which lookup failed.
            raise SierraResponseConverterException(
                f"Missing expected MARC fields in the response: {error}"
            ) from error

        return {"fields": fields}

    def convert(self):
        """Runner method, converts the response to MARC-in-JSON format with error handling.

        :return: Dict in the shape ``{"fields": [{tag: data}, ...]}``.
        :raises SierraResponseConverterException: On any failure during conversion;
            the underlying exception is attached as ``__cause__``.
        """
        try:
            return self._convert_response()
        except Exception as error:
            raise SierraResponseConverterException(
                f"An unexpected error occurred during conversion: {error}"
            ) from error
rara_tools/decorators.py CHANGED
@@ -1,12 +1,12 @@
1
1
  import functools
2
+ from typing import Any, Callable
3
+
2
4
  from elasticsearch import AuthenticationException
3
5
  from elasticsearch import ConnectionError as ElasticsearchConnectionError
4
6
  from elasticsearch import ConnectionTimeout, NotFoundError, RequestError
5
- from typing import Any, Callable
6
7
 
7
8
  from .exceptions import ElasticsearchException
8
9
 
9
-
10
10
  ELASTIC_NOT_FOUND_MESSAGE = 'Could not find specified data from Elasticsearch!'
11
11
  ELASTIC_REQUEST_ERROR_MESSAGE = 'Error executing Elasticsearch query! Bad query?'
12
12
  ELASTIC_CONNECTION_TIMEOUT_MESSAGE = 'Connection to Elasticsearch took too long, please try again later!'
@@ -39,4 +39,4 @@ def _elastic_connection(func: Callable) -> Callable:
39
39
  raise ElasticsearchException(ELASTIC_CONNECTION_ERROR_MESSAGE) from exception
40
40
  except Exception as exception:
41
41
  raise ElasticsearchException(ELASTIC_UNKNOWN_ERROR_MESSAGE) from exception
42
- return wrapper
42
+ return wrapper
rara_tools/elastic.py CHANGED
@@ -1,6 +1,10 @@
1
- from typing import Dict, Optional, List
1
+ from typing import Any, Dict, Iterator, Optional
2
+
3
+ import elasticsearch_dsl
4
+ from elastic_transport import ObjectApiResponse
2
5
  from elasticsearch import Elasticsearch
3
- from elasticsearch_dsl import Search
6
+ from elasticsearch.helpers import bulk
7
+ from elasticsearch_dsl import Index
4
8
 
5
9
  from .decorators import _elastic_connection
6
10
 
@@ -8,11 +12,54 @@ from .decorators import _elastic_connection
8
12
  class KataElastic:
9
13
  """A class to manage all required Elasticsearch operations for Kata.
10
14
  """
15
+
16
+ TYPE_MAPPING = {
17
+ "keyword": elasticsearch_dsl.Keyword,
18
+ "text": elasticsearch_dsl.Text,
19
+ "float": elasticsearch_dsl.Float,
20
+ "integer": elasticsearch_dsl.Integer,
21
+ "date": elasticsearch_dsl.Date,
22
+ }
23
+
24
+ DEFAULT_MAPPING = {
25
+ "text": "keyword",
26
+ "parent_id": "keyword",
27
+ "text_quality": "float",
28
+ "n_chars": "integer",
29
+ "n_words": "integer",
30
+ "language": "keyword",
31
+ "end_page": "integer",
32
+ "start_page": "integer",
33
+ "sequence_nr": "integer",
34
+ "section_title": "keyword",
35
+ "section_type": "keyword",
36
+ "section_meta": "keyword",
37
+ }
38
+
11
39
  def __init__(self, elasticsearch_url: str, timeout: Optional[int] = None):
12
40
  self.timeout = timeout
13
41
  self.elasticsearch_url = elasticsearch_url
14
42
  self.elasticsearch = Elasticsearch(self.elasticsearch_url, request_timeout=self.timeout)
15
43
 
44
+ def _produce_rollover_index(self, index_prefix: str, rollover_limit: int) -> str:
45
+ indices = self.elasticsearch.indices.get(index=f"{index_prefix}-*", expand_wildcards="open")
46
+ sorted_indices = sorted([(k, v["settings"]["index"]["creation_date"]) for k, v in indices.items()], key=lambda x: x[1], reverse=True)
47
+ sorted_indices = [i[0] for i in sorted_indices]
48
+
49
+ # new index name if none exist
50
+ if not len(sorted_indices):
51
+ last_index_name = f"{index_prefix}-0"
52
+ last_index_count = 0
53
+ else:
54
+ last_index_name = sorted_indices[0]
55
+ last_index_count = self.elasticsearch.count(index=last_index_name)["count"]
56
+ # check the size of the last index of the pipeline
57
+ if last_index_count >= rollover_limit:
58
+ new_index_number = int(last_index_name[-1]) + 1
59
+ last_index_name = f"{index_prefix}-{new_index_number}"
60
+
61
+ return last_index_name
62
+
16
63
  @_elastic_connection
17
64
  def check(self) -> bool:
18
65
  """Checks Elasticsearch connection.
@@ -22,25 +69,42 @@ class KataElastic:
22
69
  return True
23
70
  return False
24
71
 
72
def generate_mapping(self, schema: dict | None = None) -> dict:
    """Build an Elasticsearch mapping dict from a field-name to type-name schema.

    :param: schema dict: Maps field names to type names found in TYPE_MAPPING.
        DEFAULT_MAPPING is used when it is None or empty.
    :return: The mapping serialised with ``Mapping.to_dict()``.
    """
    field_types = schema or self.DEFAULT_MAPPING
    dsl_mapping = elasticsearch_dsl.Mapping()
    for name, type_name in field_types.items():
        field_class = self.TYPE_MAPPING.get(type_name)
        if field_class is None:
            # Type names without an entry in TYPE_MAPPING are left out.
            continue
        # TYPE_MAPPING stores classes, so instantiate one per field.
        dsl_mapping.field(name, field_class())
    return dsl_mapping.to_dict()
80
+
81
@_elastic_connection
def add_mapping(self, index_name: str, schema: dict):
    """Put a mapping onto an index.

    :param: index_name str: Name of the index that receives the mapping.
    :param: schema dict: Mapping body passed to ``Index.put_mapping``.
    :return: Whatever ``Index.put_mapping`` returns.
    """
    target_index = Index(name=index_name)
    result = target_index.put_mapping(body=schema, using=self.elasticsearch)
    return result
85
+
25
86
  @_elastic_connection
26
87
  def create_index(
27
88
  self,
28
89
  index: str,
29
90
  shards: int = 3,
30
91
  replicas: int = 1,
31
- settings: Optional[dict] = None
32
- ) -> Dict:
92
+ settings: Optional[dict] = None,
93
+ ) -> Dict | None:
33
94
  """Creates empty index.
34
95
  :param: index str: Name of the index to create.
35
96
  :param: shards int: Number of shards for the index.
36
97
  :param: replicas int: Number of replicas of the index.
37
98
  :param: settings dict: Overwrite settings for the index.
38
99
  """
39
- body = settings or {
40
- "number_of_shards": shards,
41
- "number_of_replicas": replicas,
42
- }
43
- return self.elasticsearch.indices.create(index=index, settings=body)
100
+
101
+ index_exists = self.elasticsearch.indices.exists(index=index).body
102
+ if index_exists is False:
103
+ setting_body = settings or {
104
+ "number_of_shards": shards,
105
+ "number_of_replicas": replicas,
106
+ }
107
+ return self.elasticsearch.indices.create(index=index, settings=setting_body)
44
108
 
45
109
  @_elastic_connection
46
110
  def delete_index(self, index: str, ignore: Optional[bool] = True) -> Dict:
@@ -49,11 +113,11 @@ class KataElastic:
49
113
  :param: ignore bool: Ignore errors because of closed/deleted index.
50
114
  :return: Dict of Elastic's acknowledgement of the action.
51
115
  """
52
- response = self.elasticsearch.indices.delete(index=index, ignore_unavailable=ignore)
116
+ response = self.elasticsearch.indices.delete(index=index, ignore_unavailable=ignore, expand_wildcards="open")
53
117
  return response
54
118
 
55
119
  @_elastic_connection
56
- def delete_document(self, index: str, document_id: str) -> Dict:
120
+ def delete_document(self, index: str, document_id: str) -> ObjectApiResponse[Any]:
57
121
  """Deletes document from index.
58
122
  :param: document_id str: ID of the document to be deleted.
59
123
  :param: index str: Index where the document is to be found.
@@ -63,6 +127,25 @@ class KataElastic:
63
127
  response = self.elasticsearch.delete(id=document_id, index=index)
64
128
  return response
65
129
 
130
@_elastic_connection
def bulk_index(
    self,
    documents: Iterator[dict],
    index_prefix: str,
    rollover_limit: int,
    refresh="false",
    create_index: bool = True
) -> tuple[int, int | list]:
    """Bulk-index documents into the current rollover index of a series.

    :param: documents Iterator[dict]: Documents to index; each becomes the ``_source`` of one action.
    :param: index_prefix str: Prefix of the index series to write to.
    :param: rollover_limit int: Document count at which a new index is started.
    :param: refresh str: Passed through to the bulk helper.
    :param: create_index bool: Create the target index (if missing) and put the default mapping on it first.
    :return: Tuple of the successful-action count and the bulk helper's second
        result (a list of errors with the helper's default ``stats_only=False``).
    """
    last_index_name = self._produce_rollover_index(index_prefix, rollover_limit)
    if create_index:
        # The responses of these two calls are not needed here.
        self.create_index(index=last_index_name)
        self.add_mapping(index_name=last_index_name, schema=self.generate_mapping())

    actions = [{"_index": last_index_name, "_source": document} for document in documents]
    successful_count, errors = bulk(actions=actions, client=self.elasticsearch, max_retries=3, refresh=refresh)
    return successful_count, errors
148
+
66
149
  @_elastic_connection
67
150
  def index_document(self, index: str, body: dict, document_id: Optional[str] = None) -> Dict:
68
151
  """Indexes document.
@@ -78,15 +161,15 @@ class KataElastic:
78
161
  return indexed
79
162
 
80
163
  @_elastic_connection
81
- def get_documents_by_key(self, index: str, document_key: str) -> List:
82
- """This method is for retrieving all texts/pages of the original document.
83
- :param: index str: Index to search the documents from.
84
- :param: document_key str: parent_id field that connects pages of document together.
85
- :return: List of matching documents.
86
- """
87
- s = Search(using=self.elasticsearch, index=index)
88
- docs = s.query("match", parent_id=document_key).execute()
89
- return docs
164
def get_documents_by_key(self, index: str, document_key: str, sort_fields=("start_page", "end_page", "sequence_nr",)):
    """Retrieve every document whose parent_id matches the given key, sorted.

    :param: index str: Index prefix; the search runs against ``<index>-*``.
    :param: document_key str: parent_id value to match.
    :param: sort_fields tuple: Attribute names the fetched documents are ordered by.
    :return: List of matching documents sorted by sort_fields.
    """
    index_pattern = f"{index}-*"
    search = elasticsearch_dsl.Search(using=self.elasticsearch, index=index_pattern)
    search = search.query("match", parent_id=document_key).sort(*sort_fields)

    def sort_key(doc):
        return [getattr(doc, field) for field in sort_fields]

    # Since scan doesn't allow for sorting, we do it manually after fetching the documents.
    found = list(search.scan())
    found.sort(key=sort_key)
    return found
90
173
 
91
174
  def __str__(self) -> str:
92
- return self.elasticsearch_url
175
+ return self.elasticsearch_url
rara_tools/exceptions.py CHANGED
@@ -12,3 +12,6 @@ class ElasticsearchException(Exception):
12
12
 
13
13
  class TaskReporterException(Exception):
14
14
  """Raised TaskReporter Error."""
15
+
16
class SierraResponseConverterException(Exception):
    """Raised when a Sierra API response cannot be converted."""
rara_tools/s3.py CHANGED
@@ -1,10 +1,11 @@
1
1
  import os
2
2
  import uuid
3
- from typing import Optional, List, Generator, Any
3
+ from typing import Any, Generator, List, Optional
4
4
 
5
5
  from minio import Minio
6
6
 
7
- from .exceptions import S3InitException, S3ConnectionException, S3InputException
7
+ from .exceptions import (S3ConnectionException, S3InitException,
8
+ S3InputException)
8
9
 
9
10
 
10
11
  class S3Files:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: rara-tools
3
- Version: 0.0.4
3
+ Version: 0.0.9
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -28,25 +28,28 @@ Requires-Dist: pytest-order; extra == "testing"
28
28
 
29
29
  ---
30
30
 
31
- ## ✨ Features
31
+ ## ✨ Features
32
32
 
33
33
  - Elasticsearch index & document operations
34
34
  - S3 file management operations
35
35
  - Task reporting to Core API
36
+ - Converting SIERRA API responses to Pymarc-compatible JSON
37
+
36
38
  ---
37
39
 
38
- ## ⚡ Quick Start
40
+ ## ⚡ Quick Start
39
41
 
40
42
  Get started with `rara-tools` in just a few steps:
41
43
 
42
44
  1. **Install the Package**
43
- Ensure you're using Python 3.10 or above, then run:
45
+ Ensure you're using Python 3.10 or above, then run:
46
+
44
47
  ```bash
45
48
  pip install rara-tools
46
49
  ```
47
50
 
48
51
  2. **Import and Use**
49
- Example usage to download a folder from S3:
52
+ Example usage to download a folder from S3:
50
53
 
51
54
  ```python
52
55
  from rara_tools.s3 import S3Files
@@ -77,22 +80,25 @@ Follow the steps below to install the `rara-tools` package, either via `pip` or
77
80
  Create or activate a Python environment using Python **3.10** or above.
78
81
 
79
82
  2. **Install the Package**
80
- Run the following command:
83
+ Run the following command:
84
+
81
85
  ```bash
82
86
  pip install rara-tools
83
87
  ```
84
- </details>
88
+
89
+ </details>
85
90
 
86
91
  ---
87
92
 
88
93
  ### Local Installation
89
94
 
90
- Follow these steps to install the `rara-tools` package locally:
95
+ Follow these steps to install the `rara-tools` package locally:
91
96
 
92
97
  <details><summary>Click to expand</summary>
93
98
 
94
99
  1. **Clone the Repository**
95
- Clone the repository and navigate into it:
100
+ Clone the repository and navigate into it:
101
+
96
102
  ```bash
97
103
  git clone <repository-url>
98
104
  cd <repository-directory>
@@ -100,25 +106,29 @@ Follow these steps to install the `rara-tools` package locally:
100
106
 
101
107
  2. **Set Up Python Environment**
102
108
  Create or activate a Python environment using Python 3.10 or above. E.g.:
109
+
103
110
  ```bash
104
111
  conda create -n py310 python==3.10
105
112
  conda activate py310
106
113
  ```
107
114
 
108
115
  3. **Install Build Package**
109
- Install the `build` package to enable local builds:
116
+ Install the `build` package to enable local builds:
117
+
110
118
  ```bash
111
119
  pip install build
112
120
  ```
113
121
 
114
122
  4. **Build the Package**
115
- Run the following command inside the repository:
123
+ Run the following command inside the repository:
124
+
116
125
  ```bash
117
126
  python -m build
118
127
  ```
119
128
 
120
129
  5. **Install the Package**
121
- Install the built package locally:
130
+ Install the built package locally:
131
+
122
132
  ```bash
123
133
  pip install .
124
134
  ```
@@ -131,13 +141,13 @@ Follow these steps to install the `rara-tools` package locally:
131
141
 
132
142
  Follow these steps to test the `rara-tools` package.
133
143
 
134
-
135
144
  ### How to Test
136
145
 
137
146
  <details><summary>Click to expand</summary>
138
147
 
139
148
  1. **Clone the Repository**
140
- Clone the repository and navigate into it:
149
+ Clone the repository and navigate into it:
150
+
141
151
  ```bash
142
152
  git clone <repository-url>
143
153
  cd <repository-directory>
@@ -147,25 +157,29 @@ Follow these steps to test the `rara-tools` package.
147
157
  Create or activate a Python environment using Python 3.10 or above.
148
158
 
149
159
  3. **Install Build Package**
150
- Install the `build` package:
160
+ Install the `build` package:
161
+
151
162
  ```bash
152
163
  pip install build
153
164
  ```
154
165
 
155
166
  4. **Build the Package**
156
- Build the package inside the repository:
167
+ Build the package inside the repository:
168
+
157
169
  ```bash
158
170
  python -m build
159
171
  ```
160
172
 
161
173
  5. **Install with Testing Dependencies**
162
- Install the package along with its testing dependencies:
174
+ Install the package along with its testing dependencies:
175
+
163
176
  ```bash
164
177
  pip install .[testing]
165
178
  ```
166
179
 
167
180
  6. **Run Tests**
168
- Run the test suite from the repository root:
181
+ Run the test suite from the repository root:
182
+
169
183
  ```bash
170
184
  python -m pytest -v tests
171
185
  ```
@@ -0,0 +1,14 @@
1
+ rara_tools/converters.py,sha256=JcS74VzV6jm12l3C6aqMJBY9nuVW_aevQeCe32KmfrE,1576
2
+ rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
3
+ rara_tools/elastic.py,sha256=vEvrbIPRtdqTdrNrPH2cewHLMfOTSf87a4JOiRQgYyA,7146
4
+ rara_tools/exceptions.py,sha256=BwNh4qWxau_ylr9RqZoYwd1KnExI6oWWWDno3jkh8q4,474
5
+ rara_tools/s3.py,sha256=uNDu2HzMYHAWh33RcHeyPFK7gdQfQPxsdfohyIKezEY,4467
6
+ rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
7
+ rara_tools/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ rara_tools/constants/digitizer.py,sha256=gJ3jOMwuZfKcLqgOAxTyB266VYsskLabJiMUiSz3xX4,297
9
+ rara_tools/constants/general.py,sha256=E9Jaw-YxocS_tOZw9QBoxO3e9KK5EMbLoM0R7D4Iflw,171
10
+ rara_tools-0.0.9.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
11
+ rara_tools-0.0.9.dist-info/METADATA,sha256=HhxVd2e_lhAizmc9p88dOVuaCygVRH5tDv3xrPZXVmk,3867
12
+ rara_tools-0.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
13
+ rara_tools-0.0.9.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
14
+ rara_tools-0.0.9.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- rara_tools/decorators.py,sha256=rYDk5CEHhCZvqeFaHku8qLMv7G7NTMWppHwLg3ZeVj4,2186
2
- rara_tools/elastic.py,sha256=MVqai6wDQlDQeHQzAKsRpxOchI29y3W1UiridgfH6d4,3718
3
- rara_tools/exceptions.py,sha256=FtuHG-2snaEfADA25HjjutGNQzNo6sTdSfqk9VrzOuE,374
4
- rara_tools/s3.py,sha256=eqMiOKbjXvXY04JJV68gmOU-4DUnwEaeYdhjQSI6crU,4440
5
- rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
6
- rara_tools-0.0.4.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
7
- rara_tools-0.0.4.dist-info/METADATA,sha256=onb2qPn7IXknjCM09yxKfuEZYJlTlUwetCrS2ZXlPio,3820
8
- rara_tools-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
9
- rara_tools-0.0.4.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
10
- rara_tools-0.0.4.dist-info/RECORD,,