rara-tools 0.0.9__tar.gz → 0.0.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (31)
  1. {rara_tools-0.0.9/rara_tools.egg-info → rara_tools-0.0.11}/PKG-INFO +2 -1
  2. rara_tools-0.0.11/VERSION +1 -0
  3. rara_tools-0.0.11/rara_tools/digar_schema_converter.py +416 -0
  4. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/elastic.py +140 -1
  5. rara_tools-0.0.11/rara_tools/utils.py +104 -0
  6. {rara_tools-0.0.9 → rara_tools-0.0.11/rara_tools.egg-info}/PKG-INFO +2 -1
  7. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/SOURCES.txt +4 -0
  8. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/requires.txt +1 -0
  9. {rara_tools-0.0.9 → rara_tools-0.0.11}/requirements.txt +1 -0
  10. rara_tools-0.0.11/tests/test_digar_schema_converter.py +133 -0
  11. rara_tools-0.0.11/tests/test_elastic_vector_and_search_operations.py +140 -0
  12. rara_tools-0.0.9/VERSION +0 -1
  13. {rara_tools-0.0.9 → rara_tools-0.0.11}/LICENSE.md +0 -0
  14. {rara_tools-0.0.9 → rara_tools-0.0.11}/README.md +0 -0
  15. {rara_tools-0.0.9 → rara_tools-0.0.11}/pyproject.toml +0 -0
  16. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/constants/__init__.py +0 -0
  17. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/constants/digitizer.py +0 -0
  18. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/constants/general.py +0 -0
  19. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/converters.py +0 -0
  20. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/decorators.py +0 -0
  21. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/exceptions.py +0 -0
  22. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/s3.py +0 -0
  23. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/task_reporter.py +0 -0
  24. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/dependency_links.txt +0 -0
  25. {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/top_level.txt +0 -0
  26. {rara_tools-0.0.9 → rara_tools-0.0.11}/setup.cfg +0 -0
  27. {rara_tools-0.0.9 → rara_tools-0.0.11}/tests/test_converters.py +0 -0
  28. {rara_tools-0.0.9 → rara_tools-0.0.11}/tests/test_elastic.py +0 -0
  29. {rara_tools-0.0.9 → rara_tools-0.0.11}/tests/test_s3_exceptions.py +0 -0
  30. {rara_tools-0.0.9 → rara_tools-0.0.11}/tests/test_s3_file_operations.py +0 -0
  31. {rara_tools-0.0.9 → rara_tools-0.0.11}/tests/test_task_reporter.py +0 -0

{rara_tools-0.0.9/rara_tools.egg-info → rara_tools-0.0.11}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rara-tools
-Version: 0.0.9
+Version: 0.0.11
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
@@ -14,6 +14,7 @@ Requires-Dist: elasticsearch==8.*
 Requires-Dist: elasticsearch_dsl==8.*
 Requires-Dist: minio==7.*
 Requires-Dist: requests
+Requires-Dist: iso639-lang
 Provides-Extra: testing
 Requires-Dist: pytest>=8.0; extra == "testing"
 Requires-Dist: pytest-order; extra == "testing"

rara_tools-0.0.11/VERSION
@@ -0,0 +1 @@
+0.0.11

rara_tools-0.0.11/rara_tools/digar_schema_converter.py
@@ -0,0 +1,416 @@
+from collections import defaultdict
+from typing import List, NoReturn
+
+from rara_tools.utils import lang_to_iso639_2, ratio_to_percentage
+
+
+UNDEFINED_LANGUAGE_VALUE = "unk"
+QUALITY_RATIO_TYPE = "Float"
+
+
+class ImagePageSchema:
+    def __init__(self, image: dict) -> NoReturn:
+        self.__image = image
+        self.__schema: dict = {}
+
+    @property
+    def schema(self) -> dict:
+        if not self.__schema:
+            self.__schema = {
+                "@type": "VisualArtwork",
+                "@id": "",
+                "value": self.__image.get("label"),
+                "description": "",
+                "schema:position": self.__image.get("page")
+            }
+        return self.__schema
+
+
+class TextPageSchema:
+    def __init__(self, page: dict) -> NoReturn:
+        self.__page: dict = page
+        self.__schema: dict = {}
+
+    @property
+    def schema(self) -> dict:
+        if not self.__schema:
+            self.__schema = {
+                "@type": "Text",  # CONSTANT
+                "@id": "",  # Will be added in a later stage
+                "value": "Textblock",  # CONSTANT
+                "content": self.__page.get("text"),
+                "schema:position": self.__page.get("start_page")  # start_page ?
+            }
+        return self.__schema
+
+
+class PageSchema:
+    def __init__(
+        self,
+        page_texts: List[dict],
+        page_images: List[dict],
+        page_number: int,
+        doc_id: str
+    ) -> NoReturn:
+        self.__page_texts: List[dict] = page_texts
+        self.__page_images: List[dict] = page_images
+        self.__page_nr: int = page_number
+        self.__page_id: str = ""
+        self.__doc_id: str = doc_id
+        self.__schema: dict = {}
+
+    def _add_segment_ids(self, segments: List[dict]) -> List[dict]:
+        for i, segment in enumerate(segments):
+            segment_id = f"{self.page_id}/{i + 1}"
+            segment["@id"] = segment_id
+        return segments
+
+    @property
+    def page_id(self) -> str:
+        if not self.__page_id:
+            self.__page_id = f"{self.__doc_id}/{self.__page_nr}"
+        return self.__page_id
+
+    @property
+    def schema(self) -> dict:
+        if not self.__schema:
+            self.__schema = {
+                "@type": "CreativeWork",  # CONSTANT for pages
+                "@id": self.page_id,
+                "hasPart": []
+            }
+            text_schemas = [
+                TextPageSchema(page).schema
+                for page in self.__page_texts
+            ]
+            image_schemas = [
+                ImagePageSchema(image).schema
+                for image in self.__page_images
+            ]
+
+            page_schemas = text_schemas + image_schemas
+            page_schemas_with_ids = self._add_segment_ids(page_schemas)
+
+            self.__schema["hasPart"].extend(page_schemas_with_ids)
+
+        return self.__schema
+
+
+class DocSchemas:
+    def __init__(
+        self,
+        doc_meta: dict,
+        sierra_id: str = "",
+        generated_id: str = "",
+        permalink: str = "",
+        min_language_ratio: float = 0.2,
+        convert_ratio: bool = True,
+        generated_id_type: str = "CustomID"
+    ) -> NoReturn:
+        self.__convert_ratio = convert_ratio
+        self.__min_language_ratio = min_language_ratio
+        self.__sierra_id = sierra_id
+        self.__generated_id = generated_id
+        self.__permalink = permalink
+        self.__generated_id_type = generated_id_type
+        self.__doc_meta = doc_meta
+        self.__ocr_accuracy_schema: dict = {}
+        self.__text_quality_schema: dict = {}
+        self.__language_schema: List[dict] = []
+        self.__identifier_schema: List[dict] = []
+        self.__origin_schema: dict = {}
+        self.__origin: str = ""
+
+    @property
+    def origin(self) -> str:
+        if not self.__origin:
+            if self.__doc_meta["ocr_applied"]:
+                self.__origin = "Reformatted digital"
+            else:
+                self.__origin = "Born digital"
+        return self.__origin
+
+    @property
+    def ocr_accuracy_schema(self) -> dict:
+        if not self.__ocr_accuracy_schema:
+            ocr_quality = self.__doc_meta.get("alto_text_quality")
+            if ocr_quality:
+                self.__ocr_accuracy_schema = {
+                    "comment": "Estimated OCR accuracy"
+                }
+                if self.__convert_ratio:
+                    type_and_value = {
+                        "@type": QUALITY_RATIO_TYPE,
+                        "value": ocr_quality
+                    }
+                else:
+                    type_and_value = {
+                        "@type": "Text",
+                        "value": ratio_to_percentage(ocr_quality)
+                    }
+                self.__ocr_accuracy_schema.update(type_and_value)
+        return self.__ocr_accuracy_schema
+
+    @property
+    def text_quality_schema(self) -> dict:
+        if not self.__text_quality_schema:
+            text_quality = self.__doc_meta.get("text_quality")
+            self.__text_quality_schema = {
+                "comment": "Estimated n-gram-based text quality"
+            }
+            if self.__convert_ratio:
+                type_and_value = {
+                    "@type": QUALITY_RATIO_TYPE,
+                    "value": text_quality
+                }
+            else:
+                type_and_value = {
+                    "@type": "Text",
+                    "value": ratio_to_percentage(text_quality)
+                }
+            self.__text_quality_schema.update(type_and_value)
+        return self.__text_quality_schema
+
+    @property
+    def language_schema(self) -> List[dict]:
+        if not self.__language_schema:
+            self.__language_schema = [
+                {
+                    "@type": "ISO 639-2",
+                    "value": lang_to_iso639_2(
+                        lang["language"],
+                        unk_code=UNDEFINED_LANGUAGE_VALUE
+                    )
+                }
+                for lang in self.__doc_meta["languages"]
+                if lang["ratio"] >= self.__min_language_ratio
+            ]
+        return self.__language_schema
+
+    @property
+    def identifier_schema(self) -> List[dict]:
+        if not self.__identifier_schema:
+            identifiers = []
+            if self.__sierra_id:
+                identifiers.append(
+                    {
+                        "@type": "Identifier",
+                        "qualifier": "OPAC",
+                        "value": self.__sierra_id
+                    }
+                )
+            if self.__permalink:
+                identifiers.append(
+                    {
+                        "@type": "Identifier",
+                        "qualifier": "Permalink",
+                        "value": self.__permalink
+                    }
+                )
+            if self.__generated_id:
+                identifiers.append(
+                    {
+                        "@type": "Identifier",
+                        "qualifier": self.__generated_id_type,
+                        "value": self.__generated_id
+                    }
+                )
+            self.__identifier_schema = identifiers
+
+        return self.__identifier_schema
+
+    @property
+    def origin_schema(self) -> dict:
+        if not self.__origin_schema:
+            self.__origin_schema = {
+                "@type": "Text",
+                "value": self.origin,
+                "comment": "Origin"
+            }
+        return self.__origin_schema
+
+
+class DIGARSchemaConverter:
+    def __init__(
+        self,
+        digitizer_output: dict,
+        generated_id: str,
+        sierra_id: str = "",
+        permalink: str = "",
+        generated_id_type: str = "CustomID",
+        min_language_ratio: float = 0.2,
+        convert_ratio: bool = False
+    ) -> NoReturn:
+        """ Initialize a DIGARSchemaConverter object.
+
+        Parameters
+        ----------
+        digitizer_output: dict
+            Raw output of rara-digitizer (https://pypi.org/project/rara-digitizer/).
+        generated_id: str
+            A non-standard/generated document identifier used in ID fields.
+        sierra_id: str
+            The document's corresponding Sierra ID.
+        permalink: str
+            Permanent link where the document can be accessed.
+        generated_id_type: str
+            Method/type of the generated ID (e.g. 'UUID').
+        min_language_ratio: float
+            Cutoff ratio for languages. If the ratio of a language
+            does not exceed this threshold, the language is not
+            added to the final output.
+        convert_ratio: bool
+            If enabled, all ratios are converted into percentages.
+
+        """
+        self.__digitizer_output: dict = digitizer_output
+        self.__min_language_ratio: float = min_language_ratio
+        self.__convert_ratio: bool = convert_ratio
+        self.__sierra_id: str = sierra_id
+        self.__generated_id: str = generated_id
+        self.__permalink: str = permalink.removesuffix("/")
+        self.__generated_id_type: str = generated_id_type
+        self.__texts: List[dict] = []
+        self.__images: List[dict] = []
+        self.__doc_meta: dict = {}
+        self.__page_mappings: List[dict] = []
+        self.__dcterms_haspart: dict = {}
+        self.__dcterms_conforms_to: dict = {}
+        self.__dc_language: dict = {}
+        self.__dc_origin: dict = {}
+        self.__dc_identifier: List[dict] = []
+        self.__doc_id: str = ""
+
+        self.__doc_schemas = DocSchemas(
+            doc_meta=self.doc_meta,
+            sierra_id=self.__sierra_id,
+            generated_id=self.__generated_id,
+            permalink=self.__permalink,
+            min_language_ratio=self.__min_language_ratio,
+            convert_ratio=self.__convert_ratio,
+            generated_id_type=self.__generated_id_type
+        )
+        self.__digar_schema: dict = {}
+
+    def _get_page_number(self, page_content: dict) -> int:
+        """ Retrieves the page number from an image or text object.
+        """
+        _segments = page_content["texts"] + page_content["images"]
+        _first_segment = _segments[0]
+        if "start_page" in _first_segment:
+            page_number = _first_segment.get("start_page")
+        elif "page" in _first_segment:
+            page_number = _first_segment.get("page")
+        return page_number
+
+    @property
+    def doc_id(self) -> str:
+        """ Retrieves the document ID used for generating
+        page and segment IDs. Preference order:
+        1. permalink; 2. sierra_id; 3. generated document ID.
+        """
+        if not self.__doc_id:
+            if self.__permalink:
+                self.__doc_id = self.__permalink
+            elif self.__sierra_id:
+                self.__doc_id = self.__sierra_id
+            else:
+                self.__doc_id = self.__generated_id
+        return self.__doc_id
+
+    @property
+    def texts(self) -> List[dict]:
+        if not self.__texts:
+            self.__texts = self.__digitizer_output.get("texts")
+        return self.__texts
+
+    @property
+    def images(self) -> List[dict]:
+        if not self.__images:
+            self.__images = self.__digitizer_output.get("images")
+        return self.__images
+
+    @property
+    def doc_meta(self) -> dict:
+        if not self.__doc_meta:
+            self.__doc_meta = self.__digitizer_output.get("doc_meta")
+        return self.__doc_meta
+
+    @property
+    def page_mappings(self) -> List[dict]:
+        if not self.__page_mappings:
+            mapped = defaultdict(lambda: defaultdict(list))
+            for text in self.texts:
+                mapped[text["start_page"]]["texts"].append(text)
+            for img in self.images:
+                mapped[img["page"]]["images"].append(img)
+
+            self.__page_mappings = [
+                v for k, v in sorted(list(mapped.items()), key=lambda x: x[0])
+            ]
+        return self.__page_mappings
+
+    @property
+    def dcterms_haspart(self) -> dict:
+        if not self.__dcterms_haspart:
+            self.__dcterms_haspart = {
+                "dcterms:hasPart": [
+                    PageSchema(
+                        page_texts=page["texts"],
+                        page_images=page["images"],
+                        page_number=self._get_page_number(page),
+                        doc_id=self.doc_id
+                    ).schema
+                    for page in self.page_mappings
+                ]
+            }
+        return self.__dcterms_haspart
+
+    @property
+    def dcterms_conforms_to(self) -> dict:
+        if not self.__dcterms_conforms_to:
+            schema_content = [
+                self.__doc_schemas.text_quality_schema,
+            ]
+            # Add OCR accuracy only when its schema is not empty:
+            if self.__doc_schemas.ocr_accuracy_schema:
+                schema_content.append(self.__doc_schemas.ocr_accuracy_schema)
+            self.__dcterms_conforms_to = {
+                "dcterms:conformsTo": schema_content
+            }
+        return self.__dcterms_conforms_to
+
+    @property
+    def dc_language(self) -> dict:
+        if not self.__dc_language:
+            self.__dc_language = {
+                "dc:language": self.__doc_schemas.language_schema
+            }
+        return self.__dc_language
+
+    @property
+    def dc_origin(self) -> dict:
+        if not self.__dc_origin:
+            self.__dc_origin = {
+                "dcterms:provenance": self.__doc_schemas.origin_schema
+            }
+        return self.__dc_origin
+
+    @property
+    def dc_identifier(self) -> List[dict]:
+        if not self.__dc_identifier:
+            self.__dc_identifier = {
+                "dc:identifier": self.__doc_schemas.identifier_schema
+            }
+        return self.__dc_identifier
+
+    @property
+    def digar_schema(self) -> dict:
+        if not self.__digar_schema:
+            self.__digar_schema = {}
+            self.__digar_schema.update(self.dcterms_conforms_to)
+            self.__digar_schema.update(self.dcterms_haspart)
+            self.__digar_schema.update(self.dc_language)
+            self.__digar_schema.update(self.dc_origin)
+            self.__digar_schema.update(self.dc_identifier)
+        return self.__digar_schema
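
For orientation, a minimal usage sketch of the new converter. The digitizer payload below is a made-up stand-in (real rara-digitizer output is richer) and shows only the keys the converter reads: texts, images and doc_meta. The IDs and permalink are hypothetical examples:

    from rara_tools.digar_schema_converter import DIGARSchemaConverter

    # Hypothetical stand-in for rara-digitizer output
    digitizer_output = {
        "texts": [{"text": "Esimene lehekülg...", "start_page": 1}],
        "images": [{"label": "Illustration", "page": 1}],
        "doc_meta": {
            "ocr_applied": True,
            "alto_text_quality": 0.93,
            "text_quality": 0.88,
            "languages": [{"language": "et", "ratio": 0.95}],
        },
    }

    converter = DIGARSchemaConverter(
        digitizer_output=digitizer_output,
        generated_id="example-generated-id",
        sierra_id="b1234567",
        permalink="https://www.digar.ee/b1234567",
    )
    schema = converter.digar_schema
    # Since a permalink is given, page and segment IDs are permalink-based:
    print(schema["dcterms:hasPart"][0]["@id"])  # https://www.digar.ee/b1234567/1
    print(schema["dc:language"])                # [{"@type": "ISO 639-2", "value": "est"}]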

{rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/elastic.py
@@ -1,10 +1,11 @@
-from typing import Any, Dict, Iterator, Optional
+from typing import Any, Dict, Iterator, Optional, List

 import elasticsearch_dsl
 from elastic_transport import ObjectApiResponse
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
 from elasticsearch_dsl import Index
+from elasticsearch_dsl.response import Response

 from .decorators import _elastic_connection

@@ -82,6 +83,63 @@ class KataElastic:
     def add_mapping(self, index_name: str, schema: dict):
         index = Index(name=index_name)
         return index.put_mapping(body=schema, using=self.elasticsearch)
+
+
+    @_elastic_connection
+    def add_vector_mapping(
+        self,
+        index_name: str,
+        field: str,
+        schema: Optional[dict] = None,
+        dims: int = 1024
+    ) -> dict:
+        vector_mapping = {
+            "properties": {
+                field: {
+                    "type": "dense_vector",
+                    "dims": dims
+                }
+            }
+        }
+        mapping = schema or vector_mapping
+        index = Index(name=index_name)
+        return index.put_mapping(body=mapping, using=self.elasticsearch)
+
+
+    @_elastic_connection
+    def add_ann_vector_mapping(
+        self,
+        index_name: str,
+        field: str,
+        schema: Optional[dict] = None,
+        dims: int = 1024
+    ) -> dict:
+        vector_mapping = {
+            "properties": {
+                field: {
+                    "type": "dense_vector",
+                    "dims": dims,
+                    "similarity": "cosine",
+                    "index": True
+                }
+            }
+        }
+        mapping = schema or vector_mapping
+        index = Index(name=index_name)
+        return index.put_mapping(body=mapping, using=self.elasticsearch)
+
+    @_elastic_connection
+    def add_vector(
+        self,
+        index_name: str,
+        document_id: str,
+        vector: List[float],
+        field: str
+    ) -> dict:
+        schema = {"doc": {field: vector}}
+        return self.elasticsearch.update(
+            index=index_name, id=document_id, body=schema, refresh="wait_for"
+        )

 @_elastic_connection
 def create_index(
@@ -170,6 +228,87 @@ class KataElastic:
             s.scan(), key=lambda doc: [getattr(doc, field) for field in sort_fields]
         )
         return documents
+
+
+    @_elastic_connection
+    def execute_fuzzy_search(
+        self,
+        index: str,
+        field: str,
+        entity: str,
+        fuzziness: int = 2,
+        prefix_length: int = 1,
+        max_expansions: int = 50
+    ) -> Response:
+        """Executes a fuzzy search.
+        :param index: str: Index to search from.
+        :param entity: str: Entity to search matches for.
+        :param fuzziness: int: Maximum edit distance for a match.
+        :param prefix_length: int: Number of characters in the prefix that
+            should overlap with the original entity's prefix.
+        :param max_expansions: int: Maximum number of terms the fuzzy query
+            will match before halting the search.
+        :return: Response with the search results.
+        """
+        query_params = {
+            f"{field}.keyword": {
+                "value": entity,
+                "fuzziness": fuzziness,
+                "max_expansions": max_expansions,
+                "prefix_length": prefix_length
+            }
+        }
+        s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
+        s = s.query("fuzzy", **query_params)
+        response = s.execute()
+        return response
+
+    def execute_vector_search(
+        self,
+        index: str,
+        field: str,
+        query_vector: List[float],
+        k: int = 10,
+        num_candidates: int = 100,
+        n_docs: int = 10,
+        elastic_ids: List[str] = []
+    ) -> Response:
+        """ Executes a vector search.
+        NB! Works only with an ANN mapping!
+
+        :param index: str: Index to search from.
+        :param field: str: Field containing vectorized data.
+        :param query_vector: List[float]: Vector to search matches for.
+        :param k: int: Number of nearest neighbors to return.
+        :param num_candidates: int: Number of candidates considered before selecting k results.
+        :param elastic_ids: List[str]: Elastic IDs for restricting the search.
+        """
+
+        s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
+
+        # Add kNN vector search
+        s = s.extra(
+            knn={
+                "field": field,
+                "query_vector": query_vector,
+                "k": k,
+                "num_candidates": num_candidates
+            }
+        )
+
+        # Add ID filtering, if elastic_ids are specified
+        if elastic_ids:
+            s = s.query(
+                elasticsearch_dsl.Q("terms", _id=elastic_ids)
+            )

+        # Sort by score and return `n_docs` best-matching documents
+        s = s.extra(size=n_docs)
+
+        # Execute the search
+        response = s.execute()
+        return response
+

     def __str__(self) -> str:
         return self.elasticsearch_url
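
A quick sketch of how the new vector operations compose, assuming a reachable Elasticsearch instance; the index name, field name, URL and the 4-dimensional toy vector are placeholders (the mapping helpers default to dims=1024):

    from rara_tools.elastic import KataElastic

    elastic = KataElastic("http://localhost:9200")  # placeholder URL

    # Create an index and give it an ANN-capable dense_vector mapping
    elastic.create_index("demo_index")
    elastic.add_ann_vector_mapping(index_name="demo_index", field="vector", dims=4)

    # Index a document, then attach a vector to it
    doc = elastic.index_document("demo_index", {"name": "Paul Keres"})
    elastic.add_vector(
        index_name="demo_index",
        document_id=doc["_id"],
        vector=[0.1, 0.2, 0.3, 0.4],
        field="vector",
    )

    # kNN search over the ANN-mapped field
    response = elastic.execute_vector_search(
        index="demo_index",
        field="vector",
        query_vector=[0.1, 0.2, 0.3, 0.4],
        k=1,
        n_docs=1,
    )
    for hit in response:
        print(hit.meta.id, hit.meta.score)

Note that add_vector passes refresh="wait_for", so the updated document is searchable as soon as the call returns.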

rara_tools-0.0.11/rara_tools/utils.py
@@ -0,0 +1,104 @@
+from iso639 import Lang
+
+
+def lang_to_iso639_1(lang: str, unk_code: str = "unk") -> str:
+    """ Converts a language into the ISO 639-1 standard.
+    Input can be any language code in a valid ISO 639
+    standard or even the full name of the language,
+    e.g. "Estonian".
+
+    Parameters
+    -----------
+    lang: str
+        Language code in any valid ISO 639 standard.
+
+    unk_code: str
+        Code to return in case of an invalid/unsupported
+        input language.
+
+    Returns
+    -------
+    str
+        Language code in the ISO 639-1 standard.
+    """
+    try:
+        lg = Lang(lang)
+        iso_639_1_lang = lg.pt1
+    except:
+        iso_639_1_lang = unk_code
+    return iso_639_1_lang
+
+
+def lang_to_iso639_2(lang: str, unk_code: str = "unk") -> str:
+    """ Converts a language into the ISO 639-2 standard.
+    Input can be any language code in a valid ISO 639
+    standard or even the full name of the language,
+    e.g. "Estonian".
+
+    Parameters
+    -----------
+    lang: str
+        Language code in any valid ISO 639 standard.
+
+    unk_code: str
+        Code to return in case of an invalid/unsupported
+        input language.
+
+    Returns
+    -------
+    str
+        Language code in the ISO 639-2 standard.
+    """
+    try:
+        lg = Lang(lang)
+        # NB! Uses the bibliographic identifier (e.g. "de" -> "ger")
+        # as opposed to the terminological identifier ("de" -> "deu").
+        # This can be changed by replacing lg.pt2b -> lg.pt2t.
+        iso_639_2_lang = lg.pt2b
+    except:
+        iso_639_2_lang = unk_code
+    return iso_639_2_lang
+
+
+def lang_to_iso639_3(lang: str, unk_code: str = "unk") -> str:
+    """ Converts a language into the ISO 639-3 standard.
+    Input can be any language code in a valid ISO 639
+    standard or even the full name of the language,
+    e.g. "Estonian".
+
+    Parameters
+    -----------
+    lang: str
+        Language code in any valid ISO 639 standard.
+
+    unk_code: str
+        Code to return in case of an invalid/unsupported
+        input language.
+
+    Returns
+    -------
+    str
+        Language code in the ISO 639-3 standard.
+    """
+    try:
+        lg = Lang(lang)
+        iso_639_3_lang = lg.pt3
+    except:
+        iso_639_3_lang = unk_code
+    return iso_639_3_lang
+
+
+def ratio_to_percentage(ratio: float) -> str:
+    """ Converts a ratio to the corresponding percentage.
+
+    Parameters
+    -----------
+    ratio: float
+        Float in the range [0, 1].
+
+    Returns
+    --------
+    str
+        Percentage corresponding to the float.
+
+    """
+    percentage = f"{int(ratio*100)}%"
+    return percentage
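
The helpers above are thin wrappers around the new iso639-lang dependency. A short behaviour sketch, assuming iso639-lang is installed and resolves names and codes as documented:

    from rara_tools.utils import (
        lang_to_iso639_1,
        lang_to_iso639_2,
        lang_to_iso639_3,
        ratio_to_percentage,
    )

    print(lang_to_iso639_1("Estonian"))  # "et"
    print(lang_to_iso639_2("de"))        # "ger" (bibliographic, not "deu")
    print(lang_to_iso639_3("et"))        # "est"
    print(lang_to_iso639_2("bogus"))     # "unk" fallback on invalid input
    print(ratio_to_percentage(0.875))    # "87%" (int() truncates, no rounding)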

{rara_tools-0.0.9 → rara_tools-0.0.11/rara_tools.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rara-tools
-Version: 0.0.9
+Version: 0.0.11
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
@@ -14,6 +14,7 @@ Requires-Dist: elasticsearch==8.*
 Requires-Dist: elasticsearch_dsl==8.*
 Requires-Dist: minio==7.*
 Requires-Dist: requests
+Requires-Dist: iso639-lang
 Provides-Extra: testing
 Requires-Dist: pytest>=8.0; extra == "testing"
 Requires-Dist: pytest-order; extra == "testing"

{rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/SOURCES.txt
@@ -5,10 +5,12 @@ pyproject.toml
 requirements.txt
 rara_tools/converters.py
 rara_tools/decorators.py
+rara_tools/digar_schema_converter.py
 rara_tools/elastic.py
 rara_tools/exceptions.py
 rara_tools/s3.py
 rara_tools/task_reporter.py
+rara_tools/utils.py
 rara_tools.egg-info/PKG-INFO
 rara_tools.egg-info/SOURCES.txt
 rara_tools.egg-info/dependency_links.txt
@@ -18,7 +20,9 @@ rara_tools/constants/__init__.py
 rara_tools/constants/digitizer.py
 rara_tools/constants/general.py
 tests/test_converters.py
+tests/test_digar_schema_converter.py
 tests/test_elastic.py
+tests/test_elastic_vector_and_search_operations.py
 tests/test_s3_exceptions.py
 tests/test_s3_file_operations.py
 tests/test_task_reporter.py

{rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/requires.txt
@@ -2,6 +2,7 @@ elasticsearch==8.*
 elasticsearch_dsl==8.*
 minio==7.*
 requests
+iso639-lang

 [testing]
 pytest>=8.0

{rara_tools-0.0.9 → rara_tools-0.0.11}/requirements.txt
@@ -2,3 +2,4 @@ elasticsearch==8.*
 elasticsearch_dsl==8.*
 minio==7.*
 requests
+iso639-lang

rara_tools-0.0.11/tests/test_digar_schema_converter.py
@@ -0,0 +1,133 @@
+import json
+import pytest
+import os
+import sys
+
+from rara_tools.digar_schema_converter import DIGARSchemaConverter
+
+def load_json(file_path: str):
+    with open(file_path, "r") as f:
+        data = json.load(f)
+    return data
+
+
+TEST_DIGITIZER_OUTPUT_FILE = os.path.join(".", "tests", "test_data", "b1267058_test_digitizer_output.json")
+TEST_DIGITIZER_OUTPUT = load_json(TEST_DIGITIZER_OUTPUT_FILE)
+TEST_SIERRA_ID = "b1267058"
+TEST_GENERATED_ID = "hsasaHSAHHGDhb"
+TEST_PERMALINK = "https://www.digar.ee/b1267058"
+
+def test_digar_schema_converstion_default():
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID
+    )
+    digar_schema = converter.digar_schema
+
+    # check that all necessary fields are present
+    assert "dc:language" in digar_schema
+    assert "dcterms:provenance" in digar_schema
+    assert "dc:identifier" in digar_schema
+    assert "dcterms:hasPart" in digar_schema
+    assert "dcterms:conformsTo" in digar_schema
+
+    languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
+    # check that languages are converted into ISO 639-2
+    for lang in languages:
+        assert len(lang) == 3
+
+    # check that the ratio is converted into a percentage
+    text_quality = digar_schema.get("dcterms:conformsTo")[0].get("value")
+    assert isinstance(text_quality, str)
+
+
+def test_digar_schema_id_generation():
+    """ Tests ID generation logic.
+    """
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID,
+        permalink=TEST_PERMALINK
+    )
+
+    # If a permalink is given, it should be used as the base ID
+    digar_schema = converter.digar_schema
+    first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
+
+    assert first_segment_id.startswith(TEST_PERMALINK)
+
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID
+    )
+
+    # If a permalink is NOT given, the Sierra ID should be used as the base ID
+    digar_schema = converter.digar_schema
+    first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
+    assert first_segment_id.startswith(TEST_SIERRA_ID)
+
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        generated_id=TEST_GENERATED_ID
+    )
+
+    # If neither a permalink nor a Sierra ID is given, the generated ID should be used as the base ID
+    digar_schema = converter.digar_schema
+    first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
+    assert first_segment_id.startswith(TEST_GENERATED_ID)
+
+
+def test_restricting_languages_with_ratio():
+    """ Checks that the param `min_language_ratio` influences
+    the number of output languages.
+    """
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID,
+        permalink=TEST_PERMALINK,
+        min_language_ratio=0
+    )
+
+    # With min_language_ratio=0, every detected language is kept
+    digar_schema = converter.digar_schema
+    languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
+    assert len(languages) == 7
+
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID,
+        permalink=TEST_PERMALINK,
+        min_language_ratio=0.02
+    )
+
+    # A higher cutoff should reduce the number of output languages
+    digar_schema = converter.digar_schema
+    languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
+    assert len(languages) == 2
+
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID,
+        permalink=TEST_PERMALINK,
+        min_language_ratio=0.5
+    )
+
+    # An even higher cutoff should leave only the dominant language
+    digar_schema = converter.digar_schema
+    languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
+    assert len(languages) == 1

rara_tools-0.0.11/tests/test_elastic_vector_and_search_operations.py
@@ -0,0 +1,140 @@
+import json
+import os
+import pytest
+
+from time import sleep
+from rara_tools.elastic import KataElastic
+
+
+def load_json(file_path: str):
+    with open(file_path, "r") as fh:
+        data = json.load(fh)
+    return data
+
+TEST_DOCUMENTS = load_json("./tests/test_data/elastic_vectorized_docs.json")
+TEST_VECTOR_DATA = load_json("./tests/test_data/test_vector_data.json")
+TEST_VECTOR = TEST_VECTOR_DATA.get("vector")
+
+es_url = os.getenv("ELASTIC_TEST_URL", "http://localhost:9200")
+ELASTIC = KataElastic(es_url)
+
+TEST_KNN_INDEX_NAME = "tools_knn_testing_index"
+TEST_ANN_INDEX_NAME = "tools_ann_testing_index"
+
+TEST_VECTOR_FIELD = "vector"
+
+
+@pytest.mark.order(1)
+def test_index_creation_with_knn_vector_mapping():
+    """ Tests that the index is created and the vector mapping is added.
+    """
+    # Create test index
+    created = ELASTIC.create_index(TEST_KNN_INDEX_NAME)
+    assert created["acknowledged"] is True
+    result = ELASTIC.add_vector_mapping(
+        index_name=TEST_KNN_INDEX_NAME,
+        field=TEST_VECTOR_FIELD
+    )
+    assert result["acknowledged"] is True
+
+
+@pytest.mark.order(2)
+def test_index_creation_with_ann_vector_mapping():
+    """ Tests that the index is created and the ANN vector mapping is added.
+    """
+    # Create test index
+    created = ELASTIC.create_index(TEST_ANN_INDEX_NAME)
+    assert created["acknowledged"] is True
+    result = ELASTIC.add_ann_vector_mapping(
+        index_name=TEST_ANN_INDEX_NAME,
+        field=TEST_VECTOR_FIELD
+    )
+    assert result["acknowledged"] is True
+
+
+@pytest.mark.order(3)
+def test_vectorized_document_addition_knn_index():
+    """ Tests indexing vectorized documents.
+    """
+    # Add test documents
+    for document in TEST_DOCUMENTS:
+        indexed = ELASTIC.index_document(TEST_KNN_INDEX_NAME, document)
+        assert indexed["result"] == "created"
+    # let it index
+    sleep(1)
+
+@pytest.mark.order(4)
+def test_vectorized_document_addition_ann_index():
+    """ Tests indexing vectorized documents.
+    """
+    # Add test documents
+    for document in TEST_DOCUMENTS:
+        indexed = ELASTIC.index_document(TEST_ANN_INDEX_NAME, document)
+        assert indexed["result"] == "created"
+    # let it index
+    sleep(1)
+
+@pytest.mark.order(5)
+def test_fuzzy_search():
+    """ Tests fuzzy search.
+    """
+    response = ELASTIC.execute_fuzzy_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="variations",
+        entity="Paul Keres",
+        fuzziness=0
+    )
+    total_hits = response.hits.total.value
+    assert total_hits == 2
+
+    response = ELASTIC.execute_fuzzy_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="variations",
+        entity="Paul Keres",
+        fuzziness=2
+    )
+    total_hits = response.hits.total.value
+    assert total_hits == 3
+
+
+@pytest.mark.order(6)
+def test_vector_search():
+    """ Tests vector search.
+    """
+    # Execute fuzzy search to get ID restrictions
+    response = ELASTIC.execute_fuzzy_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="variations",
+        entity="Paul Keres",
+        fuzziness=2
+    )
+    total_hits = response.hits.total.value
+    assert total_hits == 3
+    elastic_ids = [hit.meta.id for hit in response]
+
+    response = ELASTIC.execute_vector_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="vector",
+        query_vector=TEST_VECTOR,
+        k=1,
+        n_docs=1,
+        num_candidates=10,
+        elastic_ids=elastic_ids
+    )
+    descriptions = [hit.description for hit in response]
+    assert len(descriptions) == 1
+    # "Estonian chess player and chess theoretician"
+    assert descriptions[0] == "Eesti maletaja ja maleteoreetik"
+
+
+@pytest.mark.order(7)
+def test_index_deleting():
+    """
+    Tests deleting indices. We delete the test indices now.
+    """
+    indices = [TEST_KNN_INDEX_NAME, TEST_ANN_INDEX_NAME]
+    for index in indices:
+        deleted = ELASTIC.delete_index(index)
+        sleep(1)
+        assert deleted["acknowledged"] is True

rara_tools-0.0.9/VERSION DELETED
@@ -1 +0,0 @@
-0.0.9