rara-tools 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -0,0 +1,409 @@
1
+ from collections import defaultdict
2
+ from typing import List, NoReturn
3
+
4
+ from rara_tools.utils import lang_to_iso639_2, ratio_to_percentage
5
+
6
# Qualifier reported for the generated document ID in the identifier schema.
+ GENERAL_DOC_IDENTIFIER = "Filepath"
7
# Passed to lang_to_iso639_2 as the fallback code for unrecognised languages.
+ UNDEFINED_LANGUAGE_VALUE = "unk"
8
# "@type" attached to quality values that are emitted as raw ratios.
+ QUALITY_RATIO_TYPE = "Float"
9
+
10
+
11
class ImagePageSchema:
    """ Builds the "VisualArtwork" segment for a single image.

    Parameters
    ----------
    image: dict
        Image entry. Only its "label" and "page" keys are read; a missing
        key results in None in the corresponding output field.
    """

    # __init__ returns normally, so it is annotated with None
    # (NoReturn would mark the constructor call as never returning).
    def __init__(self, image: dict) -> None:
        self.__image = image
        self.__schema: dict = {}

    @property
    def schema(self) -> dict:
        """ Segment dict, built on first access and cached afterwards.

        Returns
        -------
        dict
            Segment with the image label as "value" and the image page
            as "schema:position". "@id" is left empty here.
        """
        if not self.__schema:
            self.__schema = {
                "@type": "VisualArtwork",
                "@id": "",
                "value": self.__image.get("label"),
                "description": "",
                "schema:position": self.__image.get("page")
            }
        return self.__schema
27
+
28
+
29
class TextPageSchema:
    """ Builds the "Text" segment for a single text block.

    Parameters
    ----------
    page: dict
        Text entry. Only its "text" and "start_page" keys are read; a
        missing key results in None in the corresponding output field.
    """

    # __init__ returns normally, so it is annotated with None
    # (NoReturn would mark the constructor call as never returning).
    def __init__(self, page: dict) -> None:
        self.__page: dict = page
        self.__schema: dict = {}

    @property
    def schema(self) -> dict:
        """ Segment dict, built on first access and cached afterwards.

        Returns
        -------
        dict
            Segment with the text as "content" and the entry's
            "start_page" as "schema:position". "@id" is left empty here.
        """
        if not self.__schema:
            self.__schema = {
                "@type": "Text",  # CONSTANT
                "@id": "",  # Will be added in a later stage
                "value": "Textblock",  # CONSTANT
                "content": self.__page.get("text"),
                # NOTE(review): the original author was unsure whether
                # "start_page" is the right position source - confirm.
                "schema:position": self.__page.get("start_page")
            }
        return self.__schema
45
+
46
+
47
class PageSchema:
    """ Builds the "CreativeWork" entry for one page of a document.

    Parameters
    ----------
    page_texts: List[dict]
        Text entries of the page; each one is wrapped with TextPageSchema.
    page_images: List[dict]
        Image entries of the page; each one is wrapped with ImagePageSchema.
    page_number: int
        Page number, used as the last component of the page ID.
    doc_id: str
        Document ID, used as the prefix of the page ID.
    """

    # __init__ returns normally, so it is annotated with None
    # (NoReturn would mark the constructor call as never returning).
    def __init__(
        self,
        page_texts: List[dict],
        page_images: List[dict],
        page_number: int,
        doc_id: str
    ) -> None:
        self.__page_texts: List[dict] = page_texts
        self.__page_images: List[dict] = page_images
        self.__page_nr: int = page_number
        self.__page_id: str = ""
        self.__doc_id: str = doc_id
        self.__schema: dict = {}

    def _add_segment_ids(self, segments: List[dict]) -> List[dict]:
        """ Stores "<page_id>/<n>" as the "@id" of every segment.

        Numbering starts from 1 and follows the order of the list.
        The dicts are modified in place and the same list is returned.
        """
        for position, segment in enumerate(segments, start=1):
            segment["@id"] = f"{self.page_id}/{position}"
        return segments

    @property
    def page_id(self) -> str:
        """ Page ID in the form "<doc_id>/<page_number>" (cached). """
        if not self.__page_id:
            self.__page_id = f"{self.__doc_id}/{self.__page_nr}"
        return self.__page_id

    @property
    def schema(self) -> dict:
        """ Page dict, built on first access and cached afterwards.

        Returns
        -------
        dict
            Page entry whose "hasPart" lists all text segments followed
            by all image segments, each carrying its own segment ID.
        """
        if not self.__schema:
            text_schemas = [
                TextPageSchema(page).schema
                for page in self.__page_texts
            ]
            image_schemas = [
                ImagePageSchema(image).schema
                for image in self.__page_images
            ]
            # Texts come first, so they receive the lowest segment numbers.
            segments = self._add_segment_ids(text_schemas + image_schemas)

            self.__schema = {
                "@type": "CreativeWork",  # CONSTANT for pages
                "@id": self.page_id,
                "hasPart": segments
            }
        return self.__schema
97
+
98
+
99
class DocSchemas:
    """ Builds the document-level parts of the DIGAR schema.

    Parameters
    ----------
    doc_meta: dict
        Document metadata. "ocr_applied" and "languages" are required by
        the properties that read them; "alto_text_quality" and
        "text_quality" are optional. Every entry of "languages" must have
        the keys "language" and "ratio".
    sierra_id: str
        Sierra ID; reported with qualifier "OPAC" when not empty.
    generated_id: str
        Generated document ID; reported with the qualifier
        GENERAL_DOC_IDENTIFIER when not empty.
    permalink: str
        Permanent link; reported with qualifier "Permalink" when not empty.
    min_language_ratio: float
        Languages with a ratio below this value are left out.
    convert_ratio: bool
        If enabled, quality ratios are reported as percentage strings
        (e.g. "87%"); otherwise the raw ratio is reported.
    """

    # __init__ returns normally, so it is annotated with None
    # (NoReturn would mark the constructor call as never returning).
    def __init__(
        self,
        doc_meta: dict,
        sierra_id: str = "",
        generated_id: str = "",
        permalink: str = "",
        min_language_ratio: float = 0.2,
        convert_ratio: bool = True
    ) -> None:
        self.__convert_ratio = convert_ratio
        self.__min_language_ratio = min_language_ratio
        self.__sierra_id = sierra_id
        self.__generated_id = generated_id
        self.__permalink = permalink
        self.__doc_meta = doc_meta
        self.__ocr_accuracy_schema: dict = {}
        self.__text_quality_schema: dict = {}
        self.__language_schema: List[dict] = []
        self.__identifier_schema: List[dict] = []
        self.__origin_schema: dict = {}
        self.__origin: str = ""

    def _type_and_value(self, ratio) -> dict:
        """ Returns the "@type"/"value" pair used for a quality ratio.

        With convert_ratio enabled the ratio is turned into a percentage
        string of type "Text"; otherwise the raw ratio is kept and typed
        with QUALITY_RATIO_TYPE. A missing ratio (None) is never passed
        to the conversion and stays None.
        """
        if self.__convert_ratio:
            value = None if ratio is None else ratio_to_percentage(ratio)
            return {"@type": "Text", "value": value}
        return {"@type": QUALITY_RATIO_TYPE, "value": ratio}

    @property
    def origin(self) -> str:
        """ "Reformatted digital" if OCR was applied, else "Born digital".

        Raises
        ------
        KeyError
            If doc_meta has no "ocr_applied" key.
        """
        if not self.__origin:
            if self.__doc_meta["ocr_applied"]:
                self.__origin = "Reformatted digital"
            else:
                self.__origin = "Born digital"
        return self.__origin

    @property
    def ocr_accuracy_schema(self) -> dict:
        """ OCR accuracy entry, or an empty dict when none is available.

        The entry is built from doc_meta["alto_text_quality"]. Only a
        missing (None) value is treated as "not available", so an
        accuracy of 0.0 is still reported.
        """
        if not self.__ocr_accuracy_schema:
            ocr_quality = self.__doc_meta.get("alto_text_quality")
            if ocr_quality is not None:
                self.__ocr_accuracy_schema = {
                    "comment": "Estimated OCR accuracy",
                    **self._type_and_value(ocr_quality)
                }
        return self.__ocr_accuracy_schema

    @property
    def text_quality_schema(self) -> dict:
        """ Text quality entry built from doc_meta["text_quality"].

        The entry is always produced; when the metadata has no
        "text_quality", its "value" is None.
        """
        if not self.__text_quality_schema:
            text_quality = self.__doc_meta.get("text_quality")
            self.__text_quality_schema = {
                "comment": "Estimated n-gram-based text quality",
                **self._type_and_value(text_quality)
            }
        return self.__text_quality_schema

    @property
    def language_schema(self) -> List[dict]:
        """ ISO 639-2 entries for every sufficiently frequent language.

        A language is included when its "ratio" is at least
        min_language_ratio; the order of doc_meta["languages"] is kept.

        Raises
        ------
        KeyError
            If doc_meta has no "languages" key, or an entry lacks
            "language" or "ratio".
        """
        if not self.__language_schema:
            self.__language_schema = [
                {
                    "@type": "ISO 639-2",
                    "value": lang_to_iso639_2(
                        lang["language"],
                        unk_code=UNDEFINED_LANGUAGE_VALUE
                    )
                }
                for lang in self.__doc_meta["languages"]
                if lang["ratio"] >= self.__min_language_ratio
            ]
        return self.__language_schema

    @property
    def identifier_schema(self) -> List[dict]:
        """ Identifier entries for all non-empty document identifiers.

        The order is fixed: Sierra ID, permalink, generated ID.
        """
        if not self.__identifier_schema:
            identifiers = []
            if self.__sierra_id:
                identifiers.append(
                    {
                        "@type": "Identifier",
                        "qualifier": "OPAC",
                        "value": self.__sierra_id
                    }
                )
            if self.__permalink:
                identifiers.append(
                    {
                        "@type": "Identifier",
                        "qualifier": "Permalink",
                        "value": self.__permalink
                    }
                )
            if self.__generated_id:
                identifiers.append(
                    {
                        "@type": "Identifier",
                        "qualifier": GENERAL_DOC_IDENTIFIER,
                        "value": self.__generated_id
                    }
                )
            self.__identifier_schema = identifiers

        return self.__identifier_schema

    @property
    def origin_schema(self) -> dict:
        """ "Text" entry carrying the value of the origin property. """
        if not self.__origin_schema:
            self.__origin_schema = {
                "@type": "Text",
                "value": self.origin,
                "comment": "Origin"
            }
        return self.__origin_schema
229
+
230
+
231
class DIGARSchemaConverter:
    """ Converts rara-digitizer output into a DIGAR schema dict. """

    # __init__ returns normally, so it is annotated with None
    # (NoReturn would mark the constructor call as never returning).
    def __init__(
        self,
        digitizer_output: dict,
        generated_id: str,
        sierra_id: str = "",
        permalink: str = "",
        min_language_ratio: float = 0.2,
        convert_ratio: bool = False
    ) -> None:
        """ Initialize DIGARSchemaConverter object.

        Parameters
        ----------
        digitizer_output: dict
            Raw output of rara-digitizer (https://pypi.org/project/rara-digitizer/).
            The keys "texts", "images" and "doc_meta" are read.
        generated_id: str
            Some non-standard/generated document identifier used in ID fields.
        sierra_id: str
            Document's corresponding Sierra ID.
        permalink: str
            Permanent link, where the document can be accessed.
            A single trailing "/" is removed.
        min_language_ratio: float
            Cutoff ratio for languages. Languages with a ratio below
            this value are not added to the final output.
        convert_ratio: bool
            Forwarded unchanged to DocSchemas, which uses it to choose
            between raw ratios and percentage strings.

        """
        self.__digitizer_output: dict = digitizer_output
        self.__min_language_ratio: float = min_language_ratio
        self.__convert_ratio: bool = convert_ratio
        self.__sierra_id: str = sierra_id
        self.__generated_id: str = generated_id
        self.__permalink: str = permalink.removesuffix("/")
        self.__texts: List[dict] = []
        self.__images: List[dict] = []
        self.__doc_meta: dict = {}
        self.__page_mappings: List[dict] = []
        self.__dcterms_haspart: dict = {}
        self.__dcterms_conforms_to: dict = {}
        self.__dc_language: dict = {}
        self.__dc_origin: dict = {}
        self.__dc_identifier: dict = {}
        self.__doc_id: str = ""

        self.__doc_schemas = DocSchemas(
            doc_meta=self.doc_meta,
            sierra_id=self.__sierra_id,
            generated_id=self.__generated_id,
            permalink=self.__permalink,
            min_language_ratio=self.__min_language_ratio,
            convert_ratio=self.__convert_ratio
        )
        self.__digar_schema: dict = {}

    def _get_page_number(self, page_content: dict) -> int:
        """ Retrieves page number from image or text object.

        Only the first segment is inspected (texts come before images);
        its "start_page" is preferred over "page".

        Raises
        ------
        ValueError
            If the page has no segments or the first segment has
            neither a "start_page" nor a "page" key.
        """
        segments = page_content["texts"] + page_content["images"]
        if not segments:
            raise ValueError(
                "Cannot determine page number: page has no segments."
            )
        first_segment = segments[0]
        for key in ("start_page", "page"):
            if key in first_segment:
                return first_segment[key]
        raise ValueError(
            "Cannot determine page number: first segment has neither "
            "'start_page' nor 'page'."
        )

    @property
    def doc_id(self) -> str:
        """ Retrieves document ID to use for generating
        page and segment ids. Preference order:
        1. permalink; 2. sierra_id; 3. generated document id
        """
        if not self.__doc_id:
            self.__doc_id = (
                self.__permalink or self.__sierra_id or self.__generated_id
            )
        return self.__doc_id

    @property
    def texts(self) -> List[dict]:
        """ Text entries of the digitizer output ([] when missing). """
        if not self.__texts:
            self.__texts = self.__digitizer_output.get("texts") or []
        return self.__texts

    @property
    def images(self) -> List[dict]:
        """ Image entries of the digitizer output ([] when missing). """
        if not self.__images:
            self.__images = self.__digitizer_output.get("images") or []
        return self.__images

    @property
    def doc_meta(self) -> dict:
        """ Document metadata of the digitizer output ({} when missing). """
        if not self.__doc_meta:
            self.__doc_meta = self.__digitizer_output.get("doc_meta") or {}
        return self.__doc_meta

    @property
    def page_mappings(self) -> List[dict]:
        """ Texts and images grouped by page, ordered by page number.

        Texts are grouped by their "start_page" and images by their
        "page". Each element maps "texts" and/or "images" to the
        segments of one page.
        """
        if not self.__page_mappings:
            mapped = defaultdict(lambda: defaultdict(list))
            for text in self.texts:
                mapped[text["start_page"]]["texts"].append(text)
            for img in self.images:
                mapped[img["page"]]["images"].append(img)

            self.__page_mappings = [
                content
                for _, content in sorted(
                    mapped.items(), key=lambda item: item[0]
                )
            ]
        return self.__page_mappings

    @property
    def dcterms_haspart(self) -> dict:
        """ "dcterms:hasPart" section: one PageSchema dict per page. """
        if not self.__dcterms_haspart:
            self.__dcterms_haspart = {
                "dcterms:hasPart": [
                    PageSchema(
                        page_texts=page["texts"],
                        page_images=page["images"],
                        page_number=self._get_page_number(page),
                        doc_id=self.doc_id
                    ).schema
                    for page in self.page_mappings
                ]
            }
        return self.__dcterms_haspart

    @property
    def dcterms_conforms_to(self) -> dict:
        """ "dcterms:conformsTo" section with the quality estimates. """
        if not self.__dcterms_conforms_to:
            schema_content = [
                self.__doc_schemas.text_quality_schema,
            ]
            # Add OCR Accuracy only when it is not empty:
            if self.__doc_schemas.ocr_accuracy_schema:
                schema_content.append(self.__doc_schemas.ocr_accuracy_schema)
            self.__dcterms_conforms_to = {
                "dcterms:conformsTo": schema_content
            }
        return self.__dcterms_conforms_to

    @property
    def dc_language(self) -> dict:
        """ "dc:language" section with the document languages. """
        if not self.__dc_language:
            self.__dc_language = {
                "dc:language": self.__doc_schemas.language_schema
            }
        return self.__dc_language

    @property
    def dc_origin(self) -> dict:
        """ "dcterms:provenance" section with the document origin. """
        if not self.__dc_origin:
            self.__dc_origin = {
                "dcterms:provenance": self.__doc_schemas.origin_schema
            }
        return self.__dc_origin

    @property
    def dc_identifier(self) -> dict:
        """ "dc:identifier" section with the document identifiers. """
        if not self.__dc_identifier:
            self.__dc_identifier = {
                "dc:identifier": self.__doc_schemas.identifier_schema
            }
        return self.__dc_identifier

    @property
    def digar_schema(self) -> dict:
        """ Complete DIGAR schema: all sections merged into one dict. """
        if not self.__digar_schema:
            self.__digar_schema = {}
            self.__digar_schema.update(self.dcterms_conforms_to)
            self.__digar_schema.update(self.dcterms_haspart)
            self.__digar_schema.update(self.dc_language)
            self.__digar_schema.update(self.dc_origin)
            self.__digar_schema.update(self.dc_identifier)
        return self.__digar_schema
rara_tools/utils.py ADDED
@@ -0,0 +1,104 @@
1
+ from iso639 import Lang
2
+
3
+
4
def lang_to_iso639_1(lang: str, unk_code: str = "unk") -> str:
    """ Converts language into ISO-639-1 standard.
    Input can be any language code in a valid ISO-639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO-639 standard.

    unk_code: str
        Code to return in case of invalid/unsupported
        input language, or when the language has no
        ISO-639-1 code.

    Returns
    -------
    str
        Language code in ISO-639-1 standard.
    """
    try:
        iso_639_1_lang = Lang(lang).pt1
    except Exception:
        # Any lookup failure means the input is unsupported. Unlike a
        # bare "except", this lets KeyboardInterrupt/SystemExit through.
        return unk_code
    # A language without a two-letter code yields an empty value;
    # report it as unknown instead of returning an empty code.
    return iso_639_1_lang or unk_code
29
+
30
+
31
def lang_to_iso639_2(lang: str, unk_code: str = "unk") -> str:
    """ Converts language into ISO-639-2 standard.
    Input can be any language code in a valid ISO-639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO-639 standard.

    unk_code: str
        Code to return in case of invalid/unsupported
        input language, or when the language has no
        ISO-639-2 code.

    Returns
    -------
    str
        Language code in ISO-639-2 standard.
    """
    try:
        # NB! uses bibliographic identifier (e.g. "de" -> "ger")
        # opposed to terminological identifier ("de" -> "deu").
        # This can be changed by replacing .pt2b -> .pt2t
        iso_639_2_lang = Lang(lang).pt2b
    except Exception:
        # Any lookup failure means the input is unsupported. Unlike a
        # bare "except", this lets KeyboardInterrupt/SystemExit through.
        return unk_code
    # A language without an ISO-639-2 code yields an empty value;
    # report it as unknown instead of returning an empty code.
    return iso_639_2_lang or unk_code
59
+
60
+
61
def lang_to_iso639_3(lang: str, unk_code: str = "unk") -> str:
    """ Converts language into ISO-639-3 standard.
    Input can be any language code in a valid ISO-639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO-639 standard.

    unk_code: str
        Code to return in case of invalid/unsupported
        input language, or when the language has no
        ISO-639-3 code.

    Returns
    -------
    str
        Language code in ISO-639-3 standard.
    """
    try:
        iso_639_3_lang = Lang(lang).pt3
    except Exception:
        # Any lookup failure means the input is unsupported. Unlike a
        # bare "except", this lets KeyboardInterrupt/SystemExit through.
        return unk_code
    # An entry without an ISO-639-3 code yields an empty value;
    # report it as unknown instead of returning an empty code.
    return iso_639_3_lang or unk_code
87
+
88
+
89
def ratio_to_percentage(ratio: float) -> str:
    """ Converts ratio to corresponding percentage.

    The percentage is truncated to a whole number (0.999 -> "99%").

    Parameters
    -----------
    ratio: float
        Float in range [0,1]

    Returns
    --------
    str
        Percentage corresponding to the float.

    """
    # Binary floats make 0.29 * 100 evaluate to 28.999999999999996, which
    # plain truncation would turn into "28%". Rounding away that noise
    # first keeps the truncating behaviour for genuine fractions.
    percentage = f"{int(round(ratio * 100, 9))}%"
    return percentage
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: rara-tools
3
- Version: 0.0.9
3
+ Version: 0.0.10
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -14,6 +14,7 @@ Requires-Dist: elasticsearch==8.*
14
14
  Requires-Dist: elasticsearch_dsl==8.*
15
15
  Requires-Dist: minio==7.*
16
16
  Requires-Dist: requests
17
+ Requires-Dist: iso639-lang
17
18
  Provides-Extra: testing
18
19
  Requires-Dist: pytest>=8.0; extra == "testing"
19
20
  Requires-Dist: pytest-order; extra == "testing"
@@ -1,14 +1,16 @@
1
1
  rara_tools/converters.py,sha256=JcS74VzV6jm12l3C6aqMJBY9nuVW_aevQeCe32KmfrE,1576
2
2
  rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
3
+ rara_tools/digar_schema_converter.py,sha256=gGwhqdwxyTXODF0LP5Xi0u8uRoICfaIU3MRe1EVBnEc,13935
3
4
  rara_tools/elastic.py,sha256=vEvrbIPRtdqTdrNrPH2cewHLMfOTSf87a4JOiRQgYyA,7146
4
5
  rara_tools/exceptions.py,sha256=BwNh4qWxau_ylr9RqZoYwd1KnExI6oWWWDno3jkh8q4,474
5
6
  rara_tools/s3.py,sha256=uNDu2HzMYHAWh33RcHeyPFK7gdQfQPxsdfohyIKezEY,4467
6
7
  rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
8
+ rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
7
9
  rara_tools/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
10
  rara_tools/constants/digitizer.py,sha256=gJ3jOMwuZfKcLqgOAxTyB266VYsskLabJiMUiSz3xX4,297
9
11
  rara_tools/constants/general.py,sha256=E9Jaw-YxocS_tOZw9QBoxO3e9KK5EMbLoM0R7D4Iflw,171
10
- rara_tools-0.0.9.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
11
- rara_tools-0.0.9.dist-info/METADATA,sha256=HhxVd2e_lhAizmc9p88dOVuaCygVRH5tDv3xrPZXVmk,3867
12
- rara_tools-0.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
13
- rara_tools-0.0.9.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
14
- rara_tools-0.0.9.dist-info/RECORD,,
12
+ rara_tools-0.0.10.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
13
+ rara_tools-0.0.10.dist-info/METADATA,sha256=jV6nZKhjjwDL6TWt-fKWudWNUAViZTVDL0J39fefFtM,3895
14
+ rara_tools-0.0.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
+ rara_tools-0.0.10.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
16
+ rara_tools-0.0.10.dist-info/RECORD,,