rara-tools 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- rara_tools/converters.py +41 -0
- rara_tools/decorators.py +3 -3
- rara_tools/digar_schema_converter.py +409 -0
- rara_tools/elastic.py +1 -1
- rara_tools/exceptions.py +3 -0
- rara_tools/s3.py +3 -2
- rara_tools/utils.py +104 -0
- {rara_tools-0.0.8.dist-info → rara_tools-0.0.10.dist-info}/METADATA +33 -18
- rara_tools-0.0.10.dist-info/RECORD +16 -0
- rara_tools-0.0.8.dist-info/RECORD +0 -13
- {rara_tools-0.0.8.dist-info → rara_tools-0.0.10.dist-info}/LICENSE.md +0 -0
- {rara_tools-0.0.8.dist-info → rara_tools-0.0.10.dist-info}/WHEEL +0 -0
- {rara_tools-0.0.8.dist-info → rara_tools-0.0.10.dist-info}/top_level.txt +0 -0
rara_tools/converters.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from .exceptions import SierraResponseConverterException
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SierraResponseConverter:
    """Convert a Sierra API JSON response into a MARC-in-JSON style dict.

    Sierra API reference:
    https://tester.ester.ee/iii/sierra-api/swagger/index.html

    The response is read as a dict with an ``"entries"`` list; the MARC fields
    of every entry are looked up under ``entry["marc"]["fields"]``.
    """

    def __init__(self, response: dict):
        """Store the response that will be converted.

        Parameters
        ----------
        response: dict
            Parsed JSON response of the Sierra API.

        Raises
        ------
        SierraResponseConverterException
            If ``response`` is not a dict.
        """
        if not isinstance(response, dict):
            raise SierraResponseConverterException("Please provide a valid JSON response.")
        self.response = response

    def _map_field_data(self, field):
        """Map one MARC field dict to ``{tag: data}``.

        ``data`` defaults to an empty dict when the field has no ``"data"`` key.

        Raises
        ------
        SierraResponseConverterException
            If the field has no (or an empty) ``"tag"``.
        """
        tag = field.get("tag")
        if not tag:
            raise SierraResponseConverterException("Field is missing a valid 'tag'.")
        data = field.get("data", {})
        return {tag: data}

    def _convert_response(self):
        """Collect the MARC fields of all entries into ``{"fields": [...]}``.

        Raises
        ------
        SierraResponseConverterException
            If the response has no entries, or an entry lacks the
            ``"marc"``/``"fields"`` keys.
        """
        entries = self.response.get("entries")
        if not entries:
            raise SierraResponseConverterException("No entries found in the response.")

        try:
            fields = [
                self._map_field_data(field)
                for entry in entries
                for field in entry["marc"]["fields"]
            ]
        except KeyError as error:
            # Chain the KeyError so the missing key stays visible in tracebacks.
            raise SierraResponseConverterException(
                f"Missing expected MARC fields in the response: {error}"
            ) from error

        return {"fields": fields}

    def convert(self):
        """Runner method, converts the response to MARC-in-JSON format with error handling.

        Returns
        -------
        dict
            ``{"fields": [{tag: data}, ...]}``.

        Raises
        ------
        SierraResponseConverterException
            For expected validation failures (raised unchanged) as well as for
            any other error, which is wrapped with an "unexpected error" message.
        """
        try:
            return self._convert_response()
        except SierraResponseConverterException:
            # Validation failures already carry a precise message; wrapping them
            # again would mislabel them as "unexpected".
            raise
        except Exception as error:
            raise SierraResponseConverterException(
                f"An unexpected error occurred during conversion: {error}"
            ) from error
|
rara_tools/decorators.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import functools
|
|
2
|
+
from typing import Any, Callable
|
|
3
|
+
|
|
2
4
|
from elasticsearch import AuthenticationException
|
|
3
5
|
from elasticsearch import ConnectionError as ElasticsearchConnectionError
|
|
4
6
|
from elasticsearch import ConnectionTimeout, NotFoundError, RequestError
|
|
5
|
-
from typing import Any, Callable
|
|
6
7
|
|
|
7
8
|
from .exceptions import ElasticsearchException
|
|
8
9
|
|
|
9
|
-
|
|
10
10
|
ELASTIC_NOT_FOUND_MESSAGE = 'Could not find specified data from Elasticsearch!'
|
|
11
11
|
ELASTIC_REQUEST_ERROR_MESSAGE = 'Error executing Elasticsearch query! Bad query?'
|
|
12
12
|
ELASTIC_CONNECTION_TIMEOUT_MESSAGE = 'Connection to Elasticsearch took too long, please try again later!'
|
|
@@ -39,4 +39,4 @@ def _elastic_connection(func: Callable) -> Callable:
|
|
|
39
39
|
raise ElasticsearchException(ELASTIC_CONNECTION_ERROR_MESSAGE) from exception
|
|
40
40
|
except Exception as exception:
|
|
41
41
|
raise ElasticsearchException(ELASTIC_UNKNOWN_ERROR_MESSAGE) from exception
|
|
42
|
-
return wrapper
|
|
42
|
+
return wrapper
|
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from typing import List, NoReturn
|
|
3
|
+
|
|
4
|
+
from rara_tools.utils import lang_to_iso639_2, ratio_to_percentage
|
|
5
|
+
|
|
6
|
+
GENERAL_DOC_IDENTIFIER = "Filepath"
|
|
7
|
+
UNDEFINED_LANGUAGE_VALUE = "unk"
|
|
8
|
+
QUALITY_RATIO_TYPE = "Float"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ImagePageSchema:
    """Schema entry for a single image segment of a page."""

    def __init__(self, image: dict) -> None:
        """Store the image dict; its ``"label"`` and ``"page"`` keys are read later.

        Parameters
        ----------
        image: dict
            Image description from which the schema is built.
        """
        self.__image = image
        self.__schema: dict = {}

    @property
    def schema(self) -> dict:
        """Return the image schema, building it on first access.

        Missing ``"label"``/``"page"`` keys yield ``None`` values. ``"@id"`` is
        left empty here.
        """
        if not self.__schema:
            self.__schema = {
                "@type": "VisualArtwork",
                "@id": "",
                "value": self.__image.get("label"),
                "description": "",
                "schema:position": self.__image.get("page")
            }
        return self.__schema
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TextPageSchema:
    """Schema entry for a single text segment of a page."""

    def __init__(self, page: dict) -> None:
        """Store the text dict; its ``"text"`` and ``"start_page"`` keys are read later.

        Parameters
        ----------
        page: dict
            Text segment from which the schema is built.
        """
        self.__page: dict = page
        self.__schema: dict = {}

    @property
    def schema(self) -> dict:
        """Return the text schema, building it on first access.

        Missing ``"text"``/``"start_page"`` keys yield ``None`` values.
        """
        if not self.__schema:
            self.__schema = {
                "@type": "Text",  # CONSTANT
                "@id": "",  # Will be added in a later stage
                "value": "Textblock",  # CONSTANT
                "content": self.__page.get("text"),
                # NOTE(review): the original author questioned whether
                # "start_page" is the right source for the position.
                "schema:position": self.__page.get("start_page")
            }
        return self.__schema
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class PageSchema:
    """Schema of one page: a ``CreativeWork`` whose parts are its text and image segments."""

    def __init__(
        self,
        page_texts: List[dict],
        page_images: List[dict],
        page_number: int,
        doc_id: str
    ) -> None:
        """Store the segments and identifiers of the page.

        Parameters
        ----------
        page_texts: List[dict]
            Text segments of the page (each is passed to ``TextPageSchema``).
        page_images: List[dict]
            Image segments of the page (each is passed to ``ImagePageSchema``).
        page_number: int
            Page number, used as the last component of the page ID.
        doc_id: str
            Document identifier, used as the prefix of the page ID.
        """
        self.__page_texts: List[dict] = page_texts
        self.__page_images: List[dict] = page_images
        self.__page_nr: int = page_number
        self.__page_id: str = ""
        self.__doc_id: str = doc_id
        self.__schema: dict = {}

    def _add_segment_ids(self, segments: List[dict]) -> List[dict]:
        """Set ``"@id"`` of every segment to ``<page_id>/<1-based position>``.

        The segment dicts are modified in place; the same list is returned.
        """
        for position, segment in enumerate(segments, start=1):
            segment["@id"] = f"{self.page_id}/{position}"
        return segments

    @property
    def page_id(self) -> str:
        """Return the page ID ``<doc_id>/<page_number>``, building it on first access."""
        if not self.__page_id:
            self.__page_id = f"{self.__doc_id}/{self.__page_nr}"
        return self.__page_id

    @property
    def schema(self) -> dict:
        """Return the page schema, building it on first access.

        ``"hasPart"`` lists all text segments followed by all image segments;
        segment IDs are numbered across that combined list.
        """
        if not self.__schema:
            self.__schema = {
                "@type": "CreativeWork",  # CONSTANT for pages
                "@id": self.page_id,
                "hasPart": []
            }
            text_schemas = [
                TextPageSchema(page).schema
                for page in self.__page_texts
            ]
            image_schemas = [
                ImagePageSchema(image).schema
                for image in self.__page_images
            ]

            page_schemas = text_schemas + image_schemas
            page_schemas_with_ids = self._add_segment_ids(page_schemas)

            self.__schema["hasPart"].extend(page_schemas_with_ids)

        return self.__schema
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class DocSchemas:
|
|
100
|
+
def __init__(
|
|
101
|
+
self,
|
|
102
|
+
doc_meta: dict,
|
|
103
|
+
sierra_id: str = "",
|
|
104
|
+
generated_id: str = "",
|
|
105
|
+
permalink: str = "",
|
|
106
|
+
min_language_ratio: float = 0.2,
|
|
107
|
+
convert_ratio: bool = True
|
|
108
|
+
) -> NoReturn:
|
|
109
|
+
self.__convert_ratio = convert_ratio
|
|
110
|
+
self.__min_language_ratio = min_language_ratio
|
|
111
|
+
self.__sierra_id = sierra_id
|
|
112
|
+
self.__generated_id = generated_id
|
|
113
|
+
self.__permalink = permalink
|
|
114
|
+
self.__doc_meta = doc_meta
|
|
115
|
+
self.__ocr_accuracy_schema: dict = {}
|
|
116
|
+
self.__text_quality_schema: dict = {}
|
|
117
|
+
self.__language_schema: List[dict] = []
|
|
118
|
+
self.__identifier_schema: List[dict] = []
|
|
119
|
+
self.__origin_schema: dict = {}
|
|
120
|
+
self.__origin: str = ""
|
|
121
|
+
|
|
122
|
+
@property
|
|
123
|
+
def origin(self) -> str:
|
|
124
|
+
if not self.__origin:
|
|
125
|
+
if self.__doc_meta["ocr_applied"]:
|
|
126
|
+
self.__origin = "Reformatted digital"
|
|
127
|
+
else:
|
|
128
|
+
self.__origin = "Born digital"
|
|
129
|
+
return self.__origin
|
|
130
|
+
|
|
131
|
+
@property
|
|
132
|
+
def ocr_accuracy_schema(self) -> dict:
|
|
133
|
+
if not self.__ocr_accuracy_schema:
|
|
134
|
+
ocr_quality = self.__doc_meta.get("alto_text_quality")
|
|
135
|
+
if ocr_quality:
|
|
136
|
+
self.__ocr_accuracy_schema = {
|
|
137
|
+
"comment": "Estimated OCR accuracy"
|
|
138
|
+
}
|
|
139
|
+
if self.__convert_ratio:
|
|
140
|
+
type_and_value = {
|
|
141
|
+
"@type": QUALITY_RATIO_TYPE,
|
|
142
|
+
"value": ocr_quality
|
|
143
|
+
}
|
|
144
|
+
else:
|
|
145
|
+
type_and_value = {
|
|
146
|
+
"@type": "Text",
|
|
147
|
+
"value": ratio_to_percentage(ocr_quality)
|
|
148
|
+
}
|
|
149
|
+
self.__ocr_accuracy_schema.update(type_and_value)
|
|
150
|
+
return self.__ocr_accuracy_schema
|
|
151
|
+
|
|
152
|
+
@property
|
|
153
|
+
def text_quality_schema(self) -> dict:
|
|
154
|
+
if not self.__text_quality_schema:
|
|
155
|
+
text_quality = self.__doc_meta.get("text_quality")
|
|
156
|
+
self.__text_quality_schema = {
|
|
157
|
+
"comment": "Estimated n-gram-based text quality"
|
|
158
|
+
}
|
|
159
|
+
if self.__convert_ratio:
|
|
160
|
+
type_and_value = {
|
|
161
|
+
"@type": QUALITY_RATIO_TYPE,
|
|
162
|
+
"value": text_quality
|
|
163
|
+
}
|
|
164
|
+
else:
|
|
165
|
+
type_and_value = {
|
|
166
|
+
"@type": "Text",
|
|
167
|
+
"value": ratio_to_percentage(text_quality)
|
|
168
|
+
}
|
|
169
|
+
self.__text_quality_schema.update(type_and_value)
|
|
170
|
+
return self.__text_quality_schema
|
|
171
|
+
|
|
172
|
+
@property
|
|
173
|
+
def language_schema(self) -> List[dict]:
|
|
174
|
+
if not self.__language_schema:
|
|
175
|
+
self.__language_schema = [
|
|
176
|
+
{
|
|
177
|
+
"@type": "ISO 639-2",
|
|
178
|
+
"value": lang_to_iso639_2(
|
|
179
|
+
lang["language"],
|
|
180
|
+
unk_code=UNDEFINED_LANGUAGE_VALUE
|
|
181
|
+
)
|
|
182
|
+
}
|
|
183
|
+
for lang in self.__doc_meta["languages"]
|
|
184
|
+
if lang["ratio"] >= self.__min_language_ratio
|
|
185
|
+
]
|
|
186
|
+
return self.__language_schema
|
|
187
|
+
|
|
188
|
+
@property
|
|
189
|
+
def identifier_schema(self) -> List[dict]:
|
|
190
|
+
if not self.__identifier_schema:
|
|
191
|
+
identifiers = []
|
|
192
|
+
if self.__sierra_id:
|
|
193
|
+
identifiers.append(
|
|
194
|
+
{
|
|
195
|
+
"@type": "Identifier",
|
|
196
|
+
"qualifier": "OPAC",
|
|
197
|
+
"value": self.__sierra_id
|
|
198
|
+
}
|
|
199
|
+
)
|
|
200
|
+
if self.__permalink:
|
|
201
|
+
identifiers.append(
|
|
202
|
+
{
|
|
203
|
+
"@type": "Identifier",
|
|
204
|
+
"qualifier": "Permalink",
|
|
205
|
+
"value": self.__permalink
|
|
206
|
+
}
|
|
207
|
+
)
|
|
208
|
+
if self.__generated_id:
|
|
209
|
+
identifiers.append(
|
|
210
|
+
{
|
|
211
|
+
"@type": "Identifier",
|
|
212
|
+
"qualifier": GENERAL_DOC_IDENTIFIER,
|
|
213
|
+
"value": self.__generated_id
|
|
214
|
+
}
|
|
215
|
+
)
|
|
216
|
+
self.__identifier_schema = identifiers
|
|
217
|
+
|
|
218
|
+
return self.__identifier_schema
|
|
219
|
+
|
|
220
|
+
@property
|
|
221
|
+
def origin_schema(self) -> dict:
|
|
222
|
+
if not self.__origin_schema:
|
|
223
|
+
self.__origin_schema = {
|
|
224
|
+
"@type": "Text",
|
|
225
|
+
"value": self.origin,
|
|
226
|
+
"comment": "Origin"
|
|
227
|
+
}
|
|
228
|
+
return self.__origin_schema
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
class DIGARSchemaConverter:
    """Convert raw rara-digitizer output into a DIGAR schema dict.

    The final result is exposed through the ``digar_schema`` property; all
    intermediate fragments are properties that are built on first access.
    """

    def __init__(
        self,
        digitizer_output: dict,
        generated_id: str,
        sierra_id: str = "",
        permalink: str = "",
        min_language_ratio: float = 0.2,
        convert_ratio: bool = False
    ) -> None:
        """ Initialize DIGARSchemaConverter object.

        Parameters
        ----------
        digitizer_output: dict
            Raw output of rara-digitizer (https://pypi.org/project/rara-digitizer/).
            Its "texts", "images" and "doc_meta" keys are read.
        generated_id: str
            Some non-standard/generated document identifier used in ID fields.
        sierra_id: str
            Document's corresponding Sierra ID.
        permalink: str
            Permanent link, where the document can be accessed. A single
            trailing "/" is stripped.
        min_language_ratio: float
            Cutoff ratio for languages. Languages whose ratio is below
            this threshold are not added to the final output.
        convert_ratio: bool
            Forwarded unchanged to DocSchemas, where it selects whether
            quality ratios are emitted as raw values or as percentage strings.

        """
        self.__digitizer_output: dict = digitizer_output
        self.__min_language_ratio: float = min_language_ratio
        self.__convert_ratio: bool = convert_ratio
        self.__sierra_id: str = sierra_id
        self.__generated_id: str = generated_id
        self.__permalink: str = permalink.removesuffix("/")
        self.__texts: List[dict] = []
        self.__images: List[dict] = []
        self.__doc_meta: dict = {}
        self.__page_mappings: List[dict] = []
        self.__dcterms_haspart: dict = {}
        self.__dcterms_conforms_to: dict = {}
        self.__dc_language: dict = {}
        self.__dc_origin: dict = {}
        # Holds {"dc:identifier": [...]}, i.e. a dict, not a list.
        self.__dc_identifier: dict = {}
        self.__doc_id: str = ""

        self.__doc_schemas = DocSchemas(
            doc_meta=self.doc_meta,
            sierra_id=self.__sierra_id,
            generated_id=self.__generated_id,
            permalink=self.__permalink,
            min_language_ratio=self.__min_language_ratio,
            convert_ratio=self.__convert_ratio
        )
        self.__digar_schema: dict = {}

    def _get_page_number(self, page_content: dict) -> int:
        """ Retrieves page number from image or text object.

        The first segment of the page (texts first, then images) is inspected:
        its "start_page" is preferred, "page" is the fallback.

        Raises
        ------
        ValueError
            If the first segment has neither key.
        """
        _segments = page_content["texts"] + page_content["images"]
        _first_segment = _segments[0]
        for key in ("start_page", "page"):
            if key in _first_segment:
                return _first_segment[key]
        # Previously this fell through to an UnboundLocalError.
        raise ValueError("Segment has neither 'start_page' nor 'page' key.")

    @property
    def doc_id(self) -> str:
        """ Retrieves document ID to use for generating
        page and segment ids. Preference order:
        1. permalink; 2. sierra_id; 3. generated document id
        """
        if not self.__doc_id:
            self.__doc_id = (
                self.__permalink or self.__sierra_id or self.__generated_id
            )
        return self.__doc_id

    @property
    def texts(self) -> List[dict]:
        """Return the "texts" of the digitizer output (empty list if absent)."""
        if not self.__texts:
            # `or []` keeps callers that iterate the result from failing on None.
            self.__texts = self.__digitizer_output.get("texts") or []
        return self.__texts

    @property
    def images(self) -> List[dict]:
        """Return the "images" of the digitizer output (empty list if absent)."""
        if not self.__images:
            self.__images = self.__digitizer_output.get("images") or []
        return self.__images

    @property
    def doc_meta(self) -> dict:
        """Return the "doc_meta" of the digitizer output."""
        if not self.__doc_meta:
            self.__doc_meta = self.__digitizer_output.get("doc_meta")
        return self.__doc_meta

    @property
    def page_mappings(self) -> List[dict]:
        """Return the segments grouped per page, ordered by page number.

        Texts are grouped by their "start_page" and images by their "page".
        Every item maps "texts" and/or "images" to the segments of that page;
        items are ``defaultdict(list)``, so a missing kind reads as ``[]``.
        """
        if not self.__page_mappings:
            mapped = defaultdict(lambda: defaultdict(list))
            for text in self.texts:
                mapped[text["start_page"]]["texts"].append(text)
            for img in self.images:
                mapped[img["page"]]["images"].append(img)

            self.__page_mappings = [
                mapped[page_number] for page_number in sorted(mapped)
            ]
        return self.__page_mappings

    @property
    def dcterms_haspart(self) -> dict:
        """Return ``{"dcterms:hasPart": [...]}`` with one page schema per mapped page."""
        if not self.__dcterms_haspart:
            self.__dcterms_haspart = {
                "dcterms:hasPart": [
                    PageSchema(
                        page_texts=page["texts"],
                        page_images=page["images"],
                        page_number=self._get_page_number(page),
                        doc_id=self.doc_id
                    ).schema
                    for page in self.page_mappings
                ]
            }
        return self.__dcterms_haspart

    @property
    def dcterms_conforms_to(self) -> dict:
        """Return ``{"dcterms:conformsTo": [...]}`` holding the quality fragments."""
        if not self.__dcterms_conforms_to:
            schema_content = [
                self.__doc_schemas.text_quality_schema,
            ]
            # Add OCR Accuracy only when it is not empty:
            if self.__doc_schemas.ocr_accuracy_schema:
                schema_content.append(self.__doc_schemas.ocr_accuracy_schema)
            self.__dcterms_conforms_to = {
                "dcterms:conformsTo": schema_content
            }
        return self.__dcterms_conforms_to

    @property
    def dc_language(self) -> dict:
        """Return ``{"dc:language": [...]}``."""
        if not self.__dc_language:
            self.__dc_language = {
                "dc:language": self.__doc_schemas.language_schema
            }
        return self.__dc_language

    @property
    def dc_origin(self) -> dict:
        """Return ``{"dcterms:provenance": {...}}``."""
        if not self.__dc_origin:
            self.__dc_origin = {
                "dcterms:provenance": self.__doc_schemas.origin_schema
            }
        return self.__dc_origin

    @property
    def dc_identifier(self) -> dict:
        """Return ``{"dc:identifier": [...]}``."""
        if not self.__dc_identifier:
            self.__dc_identifier = {
                "dc:identifier": self.__doc_schemas.identifier_schema
            }
        return self.__dc_identifier

    @property
    def digar_schema(self) -> dict:
        """Return the complete schema: all fragments merged into a single dict."""
        if not self.__digar_schema:
            self.__digar_schema = {}
            for fragment in (
                self.dcterms_conforms_to,
                self.dcterms_haspart,
                self.dc_language,
                self.dc_origin,
                self.dc_identifier,
            ):
                self.__digar_schema.update(fragment)
        return self.__digar_schema
|
rara_tools/elastic.py
CHANGED
rara_tools/exceptions.py
CHANGED
rara_tools/s3.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import uuid
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Any, Generator, List, Optional
|
|
4
4
|
|
|
5
5
|
from minio import Minio
|
|
6
6
|
|
|
7
|
-
from .exceptions import
|
|
7
|
+
from .exceptions import (S3ConnectionException, S3InitException,
|
|
8
|
+
S3InputException)
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
class S3Files:
|
rara_tools/utils.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from iso639 import Lang
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def lang_to_iso639_1(lang: str, unk_code: str = "unk") -> str:
    """ Converts language into ISO-639-1 standard.
    Input can be any language code in a valid ISO-639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO-639 standard.

    unk_code: str
        Code to return in case of invalid/unsupported
        input language.

    Returns
    -------
    str
        Language code in ISO-639-1 standard, or `unk_code`.
    """
    try:
        iso_639_1_lang = Lang(lang).pt1
    except Exception:
        # Any lookup failure means the input is not a usable language;
        # unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        return unk_code
    # Presumably the lookup yields an empty value for languages without an
    # ISO-639-1 code (confirm against iso639-lang); report those as unknown
    # too and never hand back an empty code.
    return iso_639_1_lang or unk_code
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def lang_to_iso639_2(lang: str, unk_code: str = "unk") -> str:
    """ Converts language into ISO-639-2 standard.
    Input can be any language code in a valid ISO-639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO-639 standard.

    unk_code: str
        Code to return in case of invalid/unsupported
        input language.

    Returns
    -------
    str
        Language code in ISO-639-2 standard, or `unk_code`.
    """
    try:
        # NB! uses bibliographic identifier (e.g. "de" -> "ger")
        # opposed to terminological identifier ("de" -> "deu").
        # This can be changed by replacing .pt2b -> .pt2t
        iso_639_2_lang = Lang(lang).pt2b
    except Exception:
        # Any lookup failure means the input is not a usable language;
        # unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        return unk_code
    # Presumably the lookup yields an empty value for languages without an
    # ISO-639-2 code (confirm against iso639-lang); report those as unknown
    # too and never hand back an empty code.
    return iso_639_2_lang or unk_code
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def lang_to_iso639_3(lang: str, unk_code: str = "unk") -> str:
    """ Converts language into ISO-639-3 standard.
    Input can be any language code in a valid ISO-639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO-639 standard.

    unk_code: str
        Code to return in case of invalid/unsupported
        input language.

    Returns
    -------
    str
        Language code in ISO-639-3 standard, or `unk_code`.
    """
    try:
        iso_639_3_lang = Lang(lang).pt3
    except Exception:
        # Any lookup failure means the input is not a usable language;
        # unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        return unk_code
    # Presumably the lookup yields an empty value for languages without an
    # ISO-639-3 code (confirm against iso639-lang); report those as unknown
    # too and never hand back an empty code.
    return iso_639_3_lang or unk_code
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def ratio_to_percentage(ratio: float) -> str:
    """ Converts ratio to corresponding percentage.

    The percentage is truncated to a whole number (0.999 -> "99%").

    Parameters
    -----------
    ratio: float
        Float in range [0,1]

    Returns
    --------
    str
        Percentage corresponding to the float.

    """
    # Binary floats make e.g. 0.29 * 100 evaluate to 28.999999999999996, so a
    # plain int() would report "28%". Rounding away that representation noise
    # first keeps the truncation from dropping a whole percentage point.
    percentage = f"{int(round(ratio * 100, 9))}%"
    return percentage
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: rara-tools
|
|
3
|
-
Version: 0.0.8
|
|
3
|
+
Version: 0.0.10
|
|
4
4
|
Summary: Tools to support Kata's work.
|
|
5
5
|
Classifier: Programming Language :: Python :: 3
|
|
6
6
|
Classifier: Programming Language :: Python :: 3.10
|
|
@@ -14,6 +14,7 @@ Requires-Dist: elasticsearch==8.*
|
|
|
14
14
|
Requires-Dist: elasticsearch_dsl==8.*
|
|
15
15
|
Requires-Dist: minio==7.*
|
|
16
16
|
Requires-Dist: requests
|
|
17
|
+
Requires-Dist: iso639-lang
|
|
17
18
|
Provides-Extra: testing
|
|
18
19
|
Requires-Dist: pytest>=8.0; extra == "testing"
|
|
19
20
|
Requires-Dist: pytest-order; extra == "testing"
|
|
@@ -28,25 +29,28 @@ Requires-Dist: pytest-order; extra == "testing"
|
|
|
28
29
|
|
|
29
30
|
---
|
|
30
31
|
|
|
31
|
-
## ✨ Features
|
|
32
|
+
## ✨ Features
|
|
32
33
|
|
|
33
34
|
- Elasticsearch index & document operations
|
|
34
35
|
- S3 file management operations
|
|
35
36
|
- Task reporting to Core API
|
|
37
|
+
- Converting SIERRA API responses to Pymarc compatible JSON
|
|
38
|
+
|
|
36
39
|
---
|
|
37
40
|
|
|
38
|
-
## ⚡ Quick Start
|
|
41
|
+
## ⚡ Quick Start
|
|
39
42
|
|
|
40
43
|
Get started with `rara-tools` in just a few steps:
|
|
41
44
|
|
|
42
45
|
1. **Install the Package**
|
|
43
|
-
Ensure you're using Python 3.10 or above, then run:
|
|
46
|
+
Ensure you're using Python 3.10 or above, then run:
|
|
47
|
+
|
|
44
48
|
```bash
|
|
45
49
|
pip install rara-tools
|
|
46
50
|
```
|
|
47
51
|
|
|
48
52
|
2. **Import and Use**
|
|
49
|
-
Example usage to download a folder from S3:
|
|
53
|
+
Example usage to download a folder from S3:
|
|
50
54
|
|
|
51
55
|
```python
|
|
52
56
|
from rara_tools.s3 import S3Files
|
|
@@ -77,22 +81,25 @@ Follow the steps below to install the `rara-tools` package, either via `pip` or
|
|
|
77
81
|
Create or activate a Python environment using Python **3.10** or above.
|
|
78
82
|
|
|
79
83
|
2. **Install the Package**
|
|
80
|
-
|
|
84
|
+
Run the following command:
|
|
85
|
+
|
|
81
86
|
```bash
|
|
82
87
|
pip install rara-tools
|
|
83
88
|
```
|
|
84
|
-
|
|
89
|
+
|
|
90
|
+
</details>
|
|
85
91
|
|
|
86
92
|
---
|
|
87
93
|
|
|
88
94
|
### Local Installation
|
|
89
95
|
|
|
90
|
-
Follow these steps to install the `rara-tools` package locally:
|
|
96
|
+
Follow these steps to install the `rara-tools` package locally:
|
|
91
97
|
|
|
92
98
|
<details><summary>Click to expand</summary>
|
|
93
99
|
|
|
94
100
|
1. **Clone the Repository**
|
|
95
|
-
Clone the repository and navigate into it:
|
|
101
|
+
Clone the repository and navigate into it:
|
|
102
|
+
|
|
96
103
|
```bash
|
|
97
104
|
git clone <repository-url>
|
|
98
105
|
cd <repository-directory>
|
|
@@ -100,25 +107,29 @@ Follow these steps to install the `rara-tools` package locally:
|
|
|
100
107
|
|
|
101
108
|
2. **Set Up Python Environment**
|
|
102
109
|
Create or activate a Python environment using Python 3.10 or above. E.g:
|
|
110
|
+
|
|
103
111
|
```bash
|
|
104
112
|
conda create -n py310 python==3.10
|
|
105
113
|
conda activate py310
|
|
106
114
|
```
|
|
107
115
|
|
|
108
116
|
3. **Install Build Package**
|
|
109
|
-
Install the `build` package to enable local builds:
|
|
117
|
+
Install the `build` package to enable local builds:
|
|
118
|
+
|
|
110
119
|
```bash
|
|
111
120
|
pip install build
|
|
112
121
|
```
|
|
113
122
|
|
|
114
123
|
4. **Build the Package**
|
|
115
|
-
Run the following command inside the repository:
|
|
124
|
+
Run the following command inside the repository:
|
|
125
|
+
|
|
116
126
|
```bash
|
|
117
127
|
python -m build
|
|
118
128
|
```
|
|
119
129
|
|
|
120
130
|
5. **Install the Package**
|
|
121
|
-
Install the built package locally:
|
|
131
|
+
Install the built package locally:
|
|
132
|
+
|
|
122
133
|
```bash
|
|
123
134
|
pip install .
|
|
124
135
|
```
|
|
@@ -131,13 +142,13 @@ Follow these steps to install the `rara-tools` package locally:
|
|
|
131
142
|
|
|
132
143
|
Follow these steps to test the `rara-tools` package.
|
|
133
144
|
|
|
134
|
-
|
|
135
145
|
### How to Test
|
|
136
146
|
|
|
137
147
|
<details><summary>Click to expand</summary>
|
|
138
148
|
|
|
139
149
|
1. **Clone the Repository**
|
|
140
|
-
Clone the repository and navigate into it:
|
|
150
|
+
Clone the repository and navigate into it:
|
|
151
|
+
|
|
141
152
|
```bash
|
|
142
153
|
git clone <repository-url>
|
|
143
154
|
cd <repository-directory>
|
|
@@ -147,25 +158,29 @@ Follow these steps to test the `rara-tools` package.
|
|
|
147
158
|
Create or activate a Python environment using Python 3.10 or above.
|
|
148
159
|
|
|
149
160
|
3. **Install Build Package**
|
|
150
|
-
Install the `build` package:
|
|
161
|
+
Install the `build` package:
|
|
162
|
+
|
|
151
163
|
```bash
|
|
152
164
|
pip install build
|
|
153
165
|
```
|
|
154
166
|
|
|
155
167
|
4. **Build the Package**
|
|
156
|
-
Build the package inside the repository:
|
|
168
|
+
Build the package inside the repository:
|
|
169
|
+
|
|
157
170
|
```bash
|
|
158
171
|
python -m build
|
|
159
172
|
```
|
|
160
173
|
|
|
161
174
|
5. **Install with Testing Dependencies**
|
|
162
|
-
Install the package along with its testing dependencies:
|
|
175
|
+
Install the package along with its testing dependencies:
|
|
176
|
+
|
|
163
177
|
```bash
|
|
164
178
|
pip install .[testing]
|
|
165
179
|
```
|
|
166
180
|
|
|
167
181
|
6. **Run Tests**
|
|
168
|
-
Run the test suite from the repository root:
|
|
182
|
+
Run the test suite from the repository root:
|
|
183
|
+
|
|
169
184
|
```bash
|
|
170
185
|
python -m pytest -v tests
|
|
171
186
|
```
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
rara_tools/converters.py,sha256=JcS74VzV6jm12l3C6aqMJBY9nuVW_aevQeCe32KmfrE,1576
|
|
2
|
+
rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
|
|
3
|
+
rara_tools/digar_schema_converter.py,sha256=gGwhqdwxyTXODF0LP5Xi0u8uRoICfaIU3MRe1EVBnEc,13935
|
|
4
|
+
rara_tools/elastic.py,sha256=vEvrbIPRtdqTdrNrPH2cewHLMfOTSf87a4JOiRQgYyA,7146
|
|
5
|
+
rara_tools/exceptions.py,sha256=BwNh4qWxau_ylr9RqZoYwd1KnExI6oWWWDno3jkh8q4,474
|
|
6
|
+
rara_tools/s3.py,sha256=uNDu2HzMYHAWh33RcHeyPFK7gdQfQPxsdfohyIKezEY,4467
|
|
7
|
+
rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
|
|
8
|
+
rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
|
|
9
|
+
rara_tools/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
rara_tools/constants/digitizer.py,sha256=gJ3jOMwuZfKcLqgOAxTyB266VYsskLabJiMUiSz3xX4,297
|
|
11
|
+
rara_tools/constants/general.py,sha256=E9Jaw-YxocS_tOZw9QBoxO3e9KK5EMbLoM0R7D4Iflw,171
|
|
12
|
+
rara_tools-0.0.10.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
13
|
+
rara_tools-0.0.10.dist-info/METADATA,sha256=jV6nZKhjjwDL6TWt-fKWudWNUAViZTVDL0J39fefFtM,3895
|
|
14
|
+
rara_tools-0.0.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
15
|
+
rara_tools-0.0.10.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
16
|
+
rara_tools-0.0.10.dist-info/RECORD,,
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
rara_tools/decorators.py,sha256=rYDk5CEHhCZvqeFaHku8qLMv7G7NTMWppHwLg3ZeVj4,2186
|
|
2
|
-
rara_tools/elastic.py,sha256=nNlCmoyKfCkM_2r1jtbpSpUn4S8IrLOKak17QwhNSvs,7146
|
|
3
|
-
rara_tools/exceptions.py,sha256=FtuHG-2snaEfADA25HjjutGNQzNo6sTdSfqk9VrzOuE,374
|
|
4
|
-
rara_tools/s3.py,sha256=eqMiOKbjXvXY04JJV68gmOU-4DUnwEaeYdhjQSI6crU,4440
|
|
5
|
-
rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
|
|
6
|
-
rara_tools/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
-
rara_tools/constants/digitizer.py,sha256=gJ3jOMwuZfKcLqgOAxTyB266VYsskLabJiMUiSz3xX4,297
|
|
8
|
-
rara_tools/constants/general.py,sha256=E9Jaw-YxocS_tOZw9QBoxO3e9KK5EMbLoM0R7D4Iflw,171
|
|
9
|
-
rara_tools-0.0.8.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
10
|
-
rara_tools-0.0.8.dist-info/METADATA,sha256=TMrOrd_YtH83jCAzbNpBrHlcN7ta6VQwYBD_HqH3unM,3820
|
|
11
|
-
rara_tools-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
12
|
-
rara_tools-0.0.8.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
13
|
-
rara_tools-0.0.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|