rara-tools 0.0.9__tar.gz → 0.0.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

Files changed (30) hide show
  1. {rara_tools-0.0.9/rara_tools.egg-info → rara_tools-0.0.10}/PKG-INFO +2 -1
  2. rara_tools-0.0.10/VERSION +1 -0
  3. rara_tools-0.0.10/rara_tools/digar_schema_converter.py +409 -0
  4. rara_tools-0.0.10/rara_tools/utils.py +104 -0
  5. {rara_tools-0.0.9 → rara_tools-0.0.10/rara_tools.egg-info}/PKG-INFO +2 -1
  6. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools.egg-info/SOURCES.txt +3 -0
  7. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools.egg-info/requires.txt +1 -0
  8. {rara_tools-0.0.9 → rara_tools-0.0.10}/requirements.txt +1 -0
  9. rara_tools-0.0.10/tests/test_digar_schema_converter.py +133 -0
  10. rara_tools-0.0.9/VERSION +0 -1
  11. {rara_tools-0.0.9 → rara_tools-0.0.10}/LICENSE.md +0 -0
  12. {rara_tools-0.0.9 → rara_tools-0.0.10}/README.md +0 -0
  13. {rara_tools-0.0.9 → rara_tools-0.0.10}/pyproject.toml +0 -0
  14. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools/constants/__init__.py +0 -0
  15. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools/constants/digitizer.py +0 -0
  16. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools/constants/general.py +0 -0
  17. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools/converters.py +0 -0
  18. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools/decorators.py +0 -0
  19. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools/elastic.py +0 -0
  20. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools/exceptions.py +0 -0
  21. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools/s3.py +0 -0
  22. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools/task_reporter.py +0 -0
  23. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools.egg-info/dependency_links.txt +0 -0
  24. {rara_tools-0.0.9 → rara_tools-0.0.10}/rara_tools.egg-info/top_level.txt +0 -0
  25. {rara_tools-0.0.9 → rara_tools-0.0.10}/setup.cfg +0 -0
  26. {rara_tools-0.0.9 → rara_tools-0.0.10}/tests/test_converters.py +0 -0
  27. {rara_tools-0.0.9 → rara_tools-0.0.10}/tests/test_elastic.py +0 -0
  28. {rara_tools-0.0.9 → rara_tools-0.0.10}/tests/test_s3_exceptions.py +0 -0
  29. {rara_tools-0.0.9 → rara_tools-0.0.10}/tests/test_s3_file_operations.py +0 -0
  30. {rara_tools-0.0.9 → rara_tools-0.0.10}/tests/test_task_reporter.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: rara-tools
3
- Version: 0.0.9
3
+ Version: 0.0.10
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -14,6 +14,7 @@ Requires-Dist: elasticsearch==8.*
14
14
  Requires-Dist: elasticsearch_dsl==8.*
15
15
  Requires-Dist: minio==7.*
16
16
  Requires-Dist: requests
17
+ Requires-Dist: iso639-lang
17
18
  Provides-Extra: testing
18
19
  Requires-Dist: pytest>=8.0; extra == "testing"
19
20
  Requires-Dist: pytest-order; extra == "testing"
@@ -0,0 +1 @@
1
+ 0.0.10
@@ -0,0 +1,409 @@
1
+ from collections import defaultdict
2
+ from typing import List, NoReturn
3
+
4
+ from rara_tools.utils import lang_to_iso639_2, ratio_to_percentage
5
+
6
+ GENERAL_DOC_IDENTIFIER = "Filepath"
7
+ UNDEFINED_LANGUAGE_VALUE = "unk"
8
+ QUALITY_RATIO_TYPE = "Float"
9
+
10
+
11
class ImagePageSchema:
    """Builds the schema entry describing one image segment.

    Parameters
    ----------
    image: dict
        Image record. Only its "label" and "page" keys are read; a missing
        key results in None in the corresponding schema field.
    """

    def __init__(self, image: dict) -> None:
        self.__image: dict = image
        self.__schema: dict = {}

    @property
    def schema(self) -> dict:
        """Schema dict of the image, built on first access and then cached.

        Returns
        -------
        dict
            Entry of "@type" "VisualArtwork". "@id" and "description" are
            emitted as empty strings by this class.
        """
        if not self.__schema:
            self.__schema = {
                "@type": "VisualArtwork",
                "@id": "",
                "value": self.__image.get("label"),
                "description": "",
                "schema:position": self.__image.get("page")
            }
        return self.__schema
27
+
28
+
29
class TextPageSchema:
    """Builds the schema entry describing one text segment.

    Parameters
    ----------
    page: dict
        Text record. Only its "text" and "start_page" keys are read; a
        missing key results in None in the corresponding schema field.
    """

    def __init__(self, page: dict) -> None:
        self.__page: dict = page
        self.__schema: dict = {}

    @property
    def schema(self) -> dict:
        """Schema dict of the text block, built on first access and cached.

        Returns
        -------
        dict
            Entry of "@type" "Text" whose "content" is the segment text.
        """
        if not self.__schema:
            self.__schema = {
                "@type": "Text",  # CONSTANT
                "@id": "",  # Will be added in a later stage
                "value": "Textblock",  # CONSTANT
                "content": self.__page.get("text"),
                # The segment's start page is used as its position.
                "schema:position": self.__page.get("start_page")
            }
        return self.__schema
45
+
46
+
47
class PageSchema:
    """Builds the schema of one page from its text and image segments.

    Parameters
    ----------
    page_texts: List[dict]
        Text records located on the page.
    page_images: List[dict]
        Image records located on the page.
    page_number: int
        Page number; used as the last component of the page ID.
    doc_id: str
        Document identifier; used as the prefix of the page ID.
    """

    def __init__(
        self,
        page_texts: List[dict],
        page_images: List[dict],
        page_number: int,
        doc_id: str
    ) -> None:
        self.__page_texts: List[dict] = page_texts
        self.__page_images: List[dict] = page_images
        self.__page_nr: int = page_number
        self.__page_id: str = ""
        self.__doc_id: str = doc_id
        self.__schema: dict = {}

    def _add_segment_ids(self, segments: List[dict]) -> List[dict]:
        """Assign "<page_id>/<n>" to the "@id" of every segment.

        Numbering starts from 1 and follows the order of `segments`.
        The dicts are modified in place and the same list is returned.
        """
        for i, segment in enumerate(segments):
            segment["@id"] = f"{self.page_id}/{i + 1}"
        return segments

    @property
    def page_id(self) -> str:
        """Page identifier in the form "<doc_id>/<page_number>" (cached)."""
        if not self.__page_id:
            self.__page_id = f"{self.__doc_id}/{self.__page_nr}"
        return self.__page_id

    @property
    def schema(self) -> dict:
        """Page schema, built on first access and then cached.

        Returns
        -------
        dict
            "CreativeWork" entry whose "hasPart" lists the text segments
            followed by the image segments, each with a generated "@id".
        """
        if not self.__schema:
            self.__schema = {
                "@type": "CreativeWork",  # CONSTANT for pages
                "@id": self.page_id,
                "hasPart": []
            }
            text_schemas = [
                TextPageSchema(page).schema
                for page in self.__page_texts
            ]
            image_schemas = [
                ImagePageSchema(image).schema
                for image in self.__page_images
            ]

            # Texts come first, so they receive the lowest segment numbers.
            page_schemas = text_schemas + image_schemas
            page_schemas_with_ids = self._add_segment_ids(page_schemas)

            self.__schema["hasPart"].extend(page_schemas_with_ids)

        return self.__schema
97
+
98
+
99
class DocSchemas:
    """Builds the document-level schema fragments from document metadata.

    Every fragment is exposed as a property that is computed on first
    access and cached on the instance.

    Parameters
    ----------
    doc_meta: dict
        Document metadata. The keys read here are "ocr_applied",
        "alto_text_quality", "text_quality" and "languages".
    sierra_id: str
        Identifier emitted with the qualifier "OPAC".
    generated_id: str
        Identifier emitted with the qualifier GENERAL_DOC_IDENTIFIER.
    permalink: str
        Identifier emitted with the qualifier "Permalink".
    min_language_ratio: float
        Languages whose "ratio" is below this value are left out.
    convert_ratio: bool
        Selects how quality ratios are emitted: True keeps the raw ratio
        (typed QUALITY_RATIO_TYPE), False emits a percentage string
        (typed "Text").
        NOTE(review): this looks inverted relative to the flag's name;
        confirm the intent before flipping it, since the current
        behaviour is kept unchanged here.
    """

    def __init__(
        self,
        doc_meta: dict,
        sierra_id: str = "",
        generated_id: str = "",
        permalink: str = "",
        min_language_ratio: float = 0.2,
        convert_ratio: bool = True
    ) -> None:
        self.__convert_ratio = convert_ratio
        self.__min_language_ratio = min_language_ratio
        self.__sierra_id = sierra_id
        self.__generated_id = generated_id
        self.__permalink = permalink
        self.__doc_meta = doc_meta
        self.__ocr_accuracy_schema: dict = {}
        self.__text_quality_schema: dict = {}
        self.__language_schema: List[dict] = []
        self.__identifier_schema: List[dict] = []
        self.__origin_schema: dict = {}
        self.__origin: str = ""

    def _quality_schema(self, comment: str, ratio) -> dict:
        """Return a quality entry holding `comment` and the formatted ratio."""
        if self.__convert_ratio:
            type_and_value = {
                "@type": QUALITY_RATIO_TYPE,
                "value": ratio
            }
        else:
            type_and_value = {
                "@type": "Text",
                "value": ratio_to_percentage(ratio)
            }
        return {"comment": comment, **type_and_value}

    @property
    def origin(self) -> str:
        """ "Reformatted digital" if doc_meta["ocr_applied"] is truthy,
        otherwise "Born digital".
        """
        if not self.__origin:
            if self.__doc_meta["ocr_applied"]:
                self.__origin = "Reformatted digital"
            else:
                self.__origin = "Born digital"
        return self.__origin

    @property
    def ocr_accuracy_schema(self) -> dict:
        """Estimated OCR accuracy entry.

        Stays an empty dict when "alto_text_quality" is missing or falsy
        in the document metadata.
        """
        if not self.__ocr_accuracy_schema:
            ocr_quality = self.__doc_meta.get("alto_text_quality")
            if ocr_quality:
                self.__ocr_accuracy_schema = self._quality_schema(
                    "Estimated OCR accuracy", ocr_quality
                )
        return self.__ocr_accuracy_schema

    @property
    def text_quality_schema(self) -> dict:
        """Estimated n-gram-based text quality entry.

        Unlike the OCR accuracy, the "text_quality" value is passed on
        without checking that it is present.
        """
        if not self.__text_quality_schema:
            text_quality = self.__doc_meta.get("text_quality")
            self.__text_quality_schema = self._quality_schema(
                "Estimated n-gram-based text quality", text_quality
            )
        return self.__text_quality_schema

    @property
    def language_schema(self) -> List[dict]:
        """One ISO 639-2 entry per language in doc_meta["languages"] whose
        "ratio" is at least `min_language_ratio`.
        """
        if not self.__language_schema:
            self.__language_schema = [
                {
                    "@type": "ISO 639-2",
                    "value": lang_to_iso639_2(
                        lang["language"],
                        unk_code=UNDEFINED_LANGUAGE_VALUE
                    )
                }
                for lang in self.__doc_meta["languages"]
                if lang["ratio"] >= self.__min_language_ratio
            ]
        return self.__language_schema

    @property
    def identifier_schema(self) -> List[dict]:
        """Identifier entries for every non-empty ID, in the order
        Sierra ID ("OPAC"), permalink, generated ID.
        """
        if not self.__identifier_schema:
            identifiers = []
            if self.__sierra_id:
                identifiers.append(
                    {
                        "@type": "Identifier",
                        "qualifier": "OPAC",
                        "value": self.__sierra_id
                    }
                )
            if self.__permalink:
                identifiers.append(
                    {
                        "@type": "Identifier",
                        "qualifier": "Permalink",
                        "value": self.__permalink
                    }
                )
            if self.__generated_id:
                identifiers.append(
                    {
                        "@type": "Identifier",
                        "qualifier": GENERAL_DOC_IDENTIFIER,
                        "value": self.__generated_id
                    }
                )
            self.__identifier_schema = identifiers

        return self.__identifier_schema

    @property
    def origin_schema(self) -> dict:
        """ "Text" entry wrapping `origin` with the comment "Origin"."""
        if not self.__origin_schema:
            self.__origin_schema = {
                "@type": "Text",
                "value": self.origin,
                "comment": "Origin"
            }
        return self.__origin_schema
229
+
230
+
231
class DIGARSchemaConverter:
    """Converts raw rara-digitizer output into a DIGAR schema dict.

    The final result is available through the `digar_schema` property;
    all intermediate parts are computed lazily and cached.
    """

    def __init__(
        self,
        digitizer_output: dict,
        generated_id: str,
        sierra_id: str = "",
        permalink: str = "",
        min_language_ratio: float = 0.2,
        convert_ratio: bool = False
    ) -> None:
        """ Initialize DIGARSchemaConverter object.

        Parameters
        ----------
        digitizer_output: dict
            Raw output of rara-digitizer (https://pypi.org/project/rara-digitizer/).
        generated_id: str
            Some non-standard/generated document identifier used in ID fields.
        sierra_id: str
            Document's corresponding Sierra ID.
        permalink: str
            Permanent link, where the document can be accessed.
            A single trailing "/" is stripped.
        min_language_ratio: float
            Cutoff ratio for languages. If the ratio for some language
            is below the set threshold, the language will not
            be added to the final output.
        convert_ratio: bool
            Passed on unchanged to DocSchemas. As DocSchemas is written,
            False (the default here) emits ratios as percentage strings
            and True keeps the raw ratios.
            NOTE(review): this looks inverted relative to the flag's
            name; confirm the intent before changing it.

        """
        self.__digitizer_output: dict = digitizer_output
        self.__min_language_ratio: float = min_language_ratio
        self.__convert_ratio: bool = convert_ratio
        self.__sierra_id: str = sierra_id
        self.__generated_id: str = generated_id
        self.__permalink: str = permalink.removesuffix("/")
        self.__texts: List[dict] = []
        self.__images: List[dict] = []
        self.__doc_meta: dict = {}
        self.__page_mappings: List[dict] = []
        self.__dcterms_haspart: dict = {}
        self.__dcterms_conforms_to: dict = {}
        self.__dc_language: dict = {}
        self.__dc_origin: dict = {}
        self.__dc_identifier: dict = {}
        self.__doc_id: str = ""

        self.__doc_schemas = DocSchemas(
            doc_meta=self.doc_meta,
            sierra_id=self.__sierra_id,
            generated_id=self.__generated_id,
            permalink=self.__permalink,
            min_language_ratio=self.__min_language_ratio,
            convert_ratio=self.__convert_ratio
        )
        self.__digar_schema: dict = {}

    def _get_page_number(self, page_content: dict) -> int:
        """ Retrieves page number from image or text object.

        The first segment of the page (texts before images) is inspected;
        its "start_page" is preferred over "page".

        Raises
        ------
        ValueError
            If the first segment has neither a "start_page" nor a
            "page" key.
        """
        segments = page_content["texts"] + page_content["images"]
        first_segment = segments[0]
        for key in ("start_page", "page"):
            if key in first_segment:
                return first_segment[key]
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(
            "Could not detect page number: segment has neither "
            "'start_page' nor 'page' key."
        )

    @property
    def doc_id(self) -> str:
        """ Retrieves document ID to use for generating
        page and segment ids. Preference order:
        1. permalink; 2. sierra_id; 3. generated document id
        """
        if not self.__doc_id:
            self.__doc_id = (
                self.__permalink
                or self.__sierra_id
                or self.__generated_id
            )
        return self.__doc_id

    @property
    def texts(self) -> List[dict]:
        """Text segments of the digitizer output ([] if the key is absent)."""
        if not self.__texts:
            # Fall back to [] so that iterating never hits None.
            self.__texts = self.__digitizer_output.get("texts") or []
        return self.__texts

    @property
    def images(self) -> List[dict]:
        """Image segments of the digitizer output ([] if the key is absent)."""
        if not self.__images:
            # Fall back to [] so that iterating never hits None.
            self.__images = self.__digitizer_output.get("images") or []
        return self.__images

    @property
    def doc_meta(self) -> dict:
        """Value of the "doc_meta" key of the digitizer output."""
        if not self.__doc_meta:
            self.__doc_meta = self.__digitizer_output.get("doc_meta")
        return self.__doc_meta

    @property
    def page_mappings(self) -> List[dict]:
        """Segments grouped by page, ordered by ascending page number.

        Texts are grouped by their "start_page" and images by their
        "page". Each item maps "texts" / "images" to the segments on
        that page; a kind without segments yields an empty list on
        lookup (the items are defaultdicts).
        """
        if not self.__page_mappings:
            mapped = defaultdict(lambda: defaultdict(list))
            for text in self.texts:
                mapped[text["start_page"]]["texts"].append(text)
            for img in self.images:
                mapped[img["page"]]["images"].append(img)

            self.__page_mappings = [
                mapped[page_number] for page_number in sorted(mapped)
            ]
        return self.__page_mappings

    @property
    def dcterms_haspart(self) -> dict:
        """ "dcterms:hasPart" fragment: one page schema per mapped page."""
        if not self.__dcterms_haspart:
            self.__dcterms_haspart = {
                "dcterms:hasPart": [
                    PageSchema(
                        page_texts=page["texts"],
                        page_images=page["images"],
                        page_number=self._get_page_number(page),
                        doc_id=self.doc_id
                    ).schema
                    for page in self.page_mappings
                ]
            }
        return self.__dcterms_haspart

    @property
    def dcterms_conforms_to(self) -> dict:
        """ "dcterms:conformsTo" fragment: text quality, followed by the
        OCR accuracy when that is available.
        """
        if not self.__dcterms_conforms_to:
            schema_content = [
                self.__doc_schemas.text_quality_schema,
            ]
            # Add OCR Accuracy only when it is not empty:
            if self.__doc_schemas.ocr_accuracy_schema:
                schema_content.append(self.__doc_schemas.ocr_accuracy_schema)
            self.__dcterms_conforms_to = {
                "dcterms:conformsTo": schema_content
            }
        return self.__dcterms_conforms_to

    @property
    def dc_language(self) -> dict:
        """ "dc:language" fragment."""
        if not self.__dc_language:
            self.__dc_language = {
                "dc:language": self.__doc_schemas.language_schema
            }
        return self.__dc_language

    @property
    def dc_origin(self) -> dict:
        """ "dcterms:provenance" fragment holding the origin schema."""
        if not self.__dc_origin:
            self.__dc_origin = {
                "dcterms:provenance": self.__doc_schemas.origin_schema
            }
        return self.__dc_origin

    @property
    def dc_identifier(self) -> dict:
        """ "dc:identifier" fragment."""
        if not self.__dc_identifier:
            self.__dc_identifier = {
                "dc:identifier": self.__doc_schemas.identifier_schema
            }
        return self.__dc_identifier

    @property
    def digar_schema(self) -> dict:
        """Complete DIGAR schema: all fragments merged into a single dict."""
        if not self.__digar_schema:
            self.__digar_schema = {}
            self.__digar_schema.update(self.dcterms_conforms_to)
            self.__digar_schema.update(self.dcterms_haspart)
            self.__digar_schema.update(self.dc_language)
            self.__digar_schema.update(self.dc_origin)
            self.__digar_schema.update(self.dc_identifier)
        return self.__digar_schema
@@ -0,0 +1,104 @@
1
+ from iso639 import Lang
2
+
3
+
4
def lang_to_iso639_1(lang: str, unk_code: str = "unk") -> str:
    """ Converts language into ISO-639-1 standard.
    Input can be any language code in a valid ISO-639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO-639 standard.

    unk_code: str
        Code to return in case of invalid/unsupported
        input language.

    Returns
    -------
    str
        Language code in ISO-639-1 standard, or `unk_code` if the
        language could not be resolved or has no ISO-639-1 code.
    """
    try:
        iso_639_1_lang = Lang(lang).pt1
    except Exception:
        # Any lookup failure means the input is not a supported language.
        # Unlike a bare `except`, KeyboardInterrupt/SystemExit pass through.
        return unk_code
    # An empty code (no identifier in this part of the standard) is
    # treated as unsupported as well.
    return iso_639_1_lang or unk_code
29
+
30
+
31
def lang_to_iso639_2(lang: str, unk_code: str = "unk") -> str:
    """ Converts language into ISO-639-2 standard.
    Input can be any language code in a valid ISO-639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO-639 standard.

    unk_code: str
        Code to return in case of invalid/unsupported
        input language.

    Returns
    -------
    str
        Language code in ISO-639-2 standard, or `unk_code` if the
        language could not be resolved or has no ISO-639-2 code.
    """
    try:
        # NB! Uses the bibliographic identifier (e.g. "de" -> "ger")
        # as opposed to the terminological identifier ("de" -> "deu").
        # This can be changed by replacing lg.pt2b -> lg.pt2t
        iso_639_2_lang = Lang(lang).pt2b
    except Exception:
        # Any lookup failure means the input is not a supported language.
        # Unlike a bare `except`, KeyboardInterrupt/SystemExit pass through.
        return unk_code
    # An empty code (no identifier in this part of the standard) is
    # treated as unsupported as well.
    return iso_639_2_lang or unk_code
59
+
60
+
61
def lang_to_iso639_3(lang: str, unk_code: str = "unk") -> str:
    """ Converts language into ISO-639-3 standard.
    Input can be any language code in a valid ISO-639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO-639 standard.

    unk_code: str
        Code to return in case of invalid/unsupported
        input language.

    Returns
    -------
    str
        Language code in ISO-639-3 standard, or `unk_code` if the
        language could not be resolved or has no ISO-639-3 code.
    """
    try:
        iso_639_3_lang = Lang(lang).pt3
    except Exception:
        # Any lookup failure means the input is not a supported language.
        # Unlike a bare `except`, KeyboardInterrupt/SystemExit pass through.
        return unk_code
    # An empty code (no identifier in this part of the standard) is
    # treated as unsupported as well.
    return iso_639_3_lang or unk_code
87
+
88
+
89
def ratio_to_percentage(ratio: float) -> str:
    """ Converts ratio to corresponding percentage.

    The percentage is truncated (not rounded) to a whole number,
    e.g. 0.999 -> "99%".

    Parameters
    -----------
    ratio: float
        Float in range [0,1]

    Returns
    --------
    str
        Percentage corresponding to the float.

    """
    # Binary floats make e.g. 0.29 * 100 == 28.999999999999996, which a
    # plain int() would truncate to 28. Rounding away that representation
    # noise first keeps exact percentages exact.
    scaled = round(ratio * 100, 9)
    return f"{int(scaled)}%"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: rara-tools
3
- Version: 0.0.9
3
+ Version: 0.0.10
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -14,6 +14,7 @@ Requires-Dist: elasticsearch==8.*
14
14
  Requires-Dist: elasticsearch_dsl==8.*
15
15
  Requires-Dist: minio==7.*
16
16
  Requires-Dist: requests
17
+ Requires-Dist: iso639-lang
17
18
  Provides-Extra: testing
18
19
  Requires-Dist: pytest>=8.0; extra == "testing"
19
20
  Requires-Dist: pytest-order; extra == "testing"
@@ -5,10 +5,12 @@ pyproject.toml
5
5
  requirements.txt
6
6
  rara_tools/converters.py
7
7
  rara_tools/decorators.py
8
+ rara_tools/digar_schema_converter.py
8
9
  rara_tools/elastic.py
9
10
  rara_tools/exceptions.py
10
11
  rara_tools/s3.py
11
12
  rara_tools/task_reporter.py
13
+ rara_tools/utils.py
12
14
  rara_tools.egg-info/PKG-INFO
13
15
  rara_tools.egg-info/SOURCES.txt
14
16
  rara_tools.egg-info/dependency_links.txt
@@ -18,6 +20,7 @@ rara_tools/constants/__init__.py
18
20
  rara_tools/constants/digitizer.py
19
21
  rara_tools/constants/general.py
20
22
  tests/test_converters.py
23
+ tests/test_digar_schema_converter.py
21
24
  tests/test_elastic.py
22
25
  tests/test_s3_exceptions.py
23
26
  tests/test_s3_file_operations.py
@@ -2,6 +2,7 @@ elasticsearch==8.*
2
2
  elasticsearch_dsl==8.*
3
3
  minio==7.*
4
4
  requests
5
+ iso639-lang
5
6
 
6
7
  [testing]
7
8
  pytest>=8.0
@@ -2,3 +2,4 @@ elasticsearch==8.*
2
2
  elasticsearch_dsl==8.*
3
3
  minio==7.*
4
4
  requests
5
+ iso639-lang
@@ -0,0 +1,133 @@
1
+ import json
2
+ import pytest
3
+ import os
4
+ import sys
5
+
6
+ from rara_tools.digar_schema_converter import DIGARSchemaConverter
7
+
8
def load_json(file_path: str):
    """Load and return the parsed content of the JSON file at `file_path`.

    The file is read as UTF-8 explicitly, so that non-ASCII content does
    not depend on the platform's default encoding.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
12
+
13
+
14
+ TEST_DIGITIZER_OUTPUT_FILE = os.path.join(".", "tests", "test_data", "b1267058_test_digitizer_output.json")
15
+ TEST_DIGITIZER_OUTPUT = load_json(TEST_DIGITIZER_OUTPUT_FILE)
16
+ TEST_SIERRA_ID = "b1267058"
17
+ TEST_GENERATED_ID = "hsasaHSAHHGDhb"
18
+ TEST_PERMALINK = "https://www.digar.ee/b1267058"
19
+
20
def test_digar_schema_converstion_default():
    """ Tests the conversion with default settings.
    """
    digar_schema = DIGARSchemaConverter(
        digitizer_output=TEST_DIGITIZER_OUTPUT,
        sierra_id=TEST_SIERRA_ID,
        generated_id=TEST_GENERATED_ID
    ).digar_schema

    # Check that all necessary fields are present.
    assert "dc:language" in digar_schema
    assert "dcterms:provenance" in digar_schema
    assert "dc:identifier" in digar_schema
    assert "dcterms:hasPart" in digar_schema
    assert "dcterms:conformsTo" in digar_schema

    # Check that languages are converted into ISO 639-2
    # (i.e. every language code is three characters long).
    language_codes = [
        entry.get("value") for entry in digar_schema.get("dc:language")
    ]
    for code in language_codes:
        assert len(code) == 3

    # Check that the text quality ratio is emitted as a string.
    conforms_to = digar_schema.get("dcterms:conformsTo")
    text_quality = conforms_to[0].get("value")
    assert isinstance(text_quality, str)
44
+
45
+
46
def test_digar_schema_id_generation():
    """ Tests ID generation logic.
    """
    # (extra constructor arguments, expected prefix of the segment ID)
    cases = [
        # If permalink is given, it should be used as base ID.
        (
            {"sierra_id": TEST_SIERRA_ID, "permalink": TEST_PERMALINK},
            TEST_PERMALINK
        ),
        # If permalink is NOT given, Sierra ID should be used as base ID.
        ({"sierra_id": TEST_SIERRA_ID}, TEST_SIERRA_ID),
        # If neither permalink nor Sierra ID is given,
        # generated ID should be used as base ID.
        ({}, TEST_GENERATED_ID),
    ]

    for extra_kwargs, expected_prefix in cases:
        converter = DIGARSchemaConverter(
            digitizer_output=TEST_DIGITIZER_OUTPUT,
            generated_id=TEST_GENERATED_ID,
            **extra_kwargs
        )
        pages = converter.digar_schema["dcterms:hasPart"]
        first_segment_id = pages[0]["hasPart"][0]["@id"]
        assert first_segment_id.startswith(expected_prefix)
84
+
85
+
86
def test_restricting_languages_with_ratio():
    """ Checks that param `min_language_ratio` influences
    the number of output languages.
    """
    # (min_language_ratio, expected number of output languages)
    expected_counts = [
        (0, 7),
        (0.02, 2),
        (0.5, 1),
    ]

    for min_ratio, expected_count in expected_counts:
        converter = DIGARSchemaConverter(
            digitizer_output=TEST_DIGITIZER_OUTPUT,
            sierra_id=TEST_SIERRA_ID,
            generated_id=TEST_GENERATED_ID,
            permalink=TEST_PERMALINK,
            min_language_ratio=min_ratio
        )
        language_entries = converter.digar_schema.get("dc:language")
        languages = [entry.get("value") for entry in language_entries]
        assert len(languages) == expected_count
131
+
132
+
133
+
rara_tools-0.0.9/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.0.9
File without changes
File without changes
File without changes
File without changes
File without changes