rara-tools 0.0.9__tar.gz → 0.0.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rara_tools-0.0.9/rara_tools.egg-info → rara_tools-0.0.11}/PKG-INFO +2 -1
- rara_tools-0.0.11/VERSION +1 -0
- rara_tools-0.0.11/rara_tools/digar_schema_converter.py +416 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/elastic.py +140 -1
- rara_tools-0.0.11/rara_tools/utils.py +104 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11/rara_tools.egg-info}/PKG-INFO +2 -1
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/SOURCES.txt +4 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/requires.txt +1 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/requirements.txt +1 -0
- rara_tools-0.0.11/tests/test_digar_schema_converter.py +133 -0
- rara_tools-0.0.11/tests/test_elastic_vector_and_search_operations.py +140 -0
- rara_tools-0.0.9/VERSION +0 -1
- {rara_tools-0.0.9 → rara_tools-0.0.11}/LICENSE.md +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/README.md +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/pyproject.toml +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/constants/__init__.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/constants/digitizer.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/constants/general.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/converters.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/decorators.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/exceptions.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/s3.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/task_reporter.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/dependency_links.txt +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/top_level.txt +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/setup.cfg +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/tests/test_converters.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/tests/test_elastic.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/tests/test_s3_exceptions.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/tests/test_s3_file_operations.py +0 -0
- {rara_tools-0.0.9 → rara_tools-0.0.11}/tests/test_task_reporter.py +0 -0
{rara_tools-0.0.9/rara_tools.egg-info → rara_tools-0.0.11}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rara-tools
-Version: 0.0.9
+Version: 0.0.11
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
@@ -14,6 +14,7 @@ Requires-Dist: elasticsearch==8.*
 Requires-Dist: elasticsearch_dsl==8.*
 Requires-Dist: minio==7.*
 Requires-Dist: requests
+Requires-Dist: iso639-lang
 Provides-Extra: testing
 Requires-Dist: pytest>=8.0; extra == "testing"
 Requires-Dist: pytest-order; extra == "testing"
rara_tools-0.0.11/VERSION (new file)
@@ -0,0 +1 @@
+0.0.11
rara_tools-0.0.11/rara_tools/digar_schema_converter.py (new file)
@@ -0,0 +1,416 @@
+from collections import defaultdict
+from typing import List, NoReturn
+
+from rara_tools.utils import lang_to_iso639_2, ratio_to_percentage
+
+
+UNDEFINED_LANGUAGE_VALUE = "unk"
+QUALITY_RATIO_TYPE = "Float"
+
+
+class ImagePageSchema:
+    def __init__(self, image: dict) -> NoReturn:
+        self.__image = image
+        self.__schema: dict = {}
+
+    @property
+    def schema(self) -> dict:
+        if not self.__schema:
+            self.__schema = {
+                "@type": "VisualArtwork",
+                "@id": "",
+                "value": self.__image.get("label"),
+                "description": "",
+                "schema:position": self.__image.get("page")
+            }
+        return self.__schema
+
+
+class TextPageSchema:
+    def __init__(self, page: dict) -> NoReturn:
+        self.__page: dict = page
+        self.__schema: dict = {}
+
+    @property
+    def schema(self) -> dict:
+        if not self.__schema:
+            self.__schema = {
+                "@type": "Text",  # CONSTANT
+                "@id": "",  # Will be added in a later stage
+                "value": "Textblock",  # CONSTANT
+                "content": self.__page.get("text"),
+                "schema:position": self.__page.get("start_page")  # start_page ?
+            }
+        return self.__schema
+
+
+class PageSchema:
+    def __init__(
+        self,
+        page_texts: List[dict],
+        page_images: List[dict],
+        page_number: int,
+        doc_id: str
+    ) -> NoReturn:
+        self.__page_texts: List[dict] = page_texts
+        self.__page_images: List[dict] = page_images
+        self.__page_nr: int = page_number
+        self.__page_id: str = ""
+        self.__doc_id: str = doc_id
+        self.__schema: dict = {}
+
+    def _add_segment_ids(self, segments: List[dict]) -> List[dict]:
+        for i, segment in enumerate(segments):
+            segment_id = f"{self.page_id}/{i + 1}"
+            segment["@id"] = segment_id
+        return segments
+
+    @property
+    def page_id(self) -> str:
+        if not self.__page_id:
+            self.__page_id = f"{self.__doc_id}/{self.__page_nr}"
+        return self.__page_id
+
+    @property
+    def schema(self) -> dict:
+        if not self.__schema:
+            self.__schema = {
+                "@type": "CreativeWork",  # CONSTANT for pages
+                "@id": self.page_id,
+                "hasPart": []
+            }
+            text_schemas = [
+                TextPageSchema(page).schema
+                for page in self.__page_texts
+            ]
+            image_schemas = [
+                ImagePageSchema(image).schema
+                for image in self.__page_images
+            ]
+
+            page_schemas = text_schemas + image_schemas
+            page_schemas_with_ids = self._add_segment_ids(page_schemas)
+
+            self.__schema["hasPart"].extend(page_schemas_with_ids)
+
+        return self.__schema
+
+
+class DocSchemas:
+    def __init__(
+        self,
+        doc_meta: dict,
+        sierra_id: str = "",
+        generated_id: str = "",
+        permalink: str = "",
+        min_language_ratio: float = 0.2,
+        convert_ratio: bool = True,
+        generated_id_type: str = "CustomID"
+    ) -> NoReturn:
+        self.__convert_ratio = convert_ratio
+        self.__min_language_ratio = min_language_ratio
+        self.__sierra_id = sierra_id
+        self.__generated_id = generated_id
+        self.__permalink = permalink
+        self.__generated_id_type = generated_id_type
+        self.__doc_meta = doc_meta
+        self.__ocr_accuracy_schema: dict = {}
+        self.__text_quality_schema: dict = {}
+        self.__language_schema: List[dict] = []
+        self.__identifier_schema: List[dict] = []
+        self.__origin_schema: dict = {}
+        self.__origin: str = ""
+
+    @property
+    def origin(self) -> str:
+        if not self.__origin:
+            if self.__doc_meta["ocr_applied"]:
+                self.__origin = "Reformatted digital"
+            else:
+                self.__origin = "Born digital"
+        return self.__origin
+
+    @property
+    def ocr_accuracy_schema(self) -> dict:
+        if not self.__ocr_accuracy_schema:
+            ocr_quality = self.__doc_meta.get("alto_text_quality")
+            if ocr_quality:
+                self.__ocr_accuracy_schema = {
+                    "comment": "Estimated OCR accuracy"
+                }
+                if self.__convert_ratio:
+                    type_and_value = {
+                        "@type": QUALITY_RATIO_TYPE,
+                        "value": ocr_quality
+                    }
+                else:
+                    type_and_value = {
+                        "@type": "Text",
+                        "value": ratio_to_percentage(ocr_quality)
+                    }
+                self.__ocr_accuracy_schema.update(type_and_value)
+        return self.__ocr_accuracy_schema
+
+    @property
+    def text_quality_schema(self) -> dict:
+        if not self.__text_quality_schema:
+            text_quality = self.__doc_meta.get("text_quality")
+            self.__text_quality_schema = {
+                "comment": "Estimated n-gram-based text quality"
+            }
+            if self.__convert_ratio:
+                type_and_value = {
+                    "@type": QUALITY_RATIO_TYPE,
+                    "value": text_quality
+                }
+            else:
+                type_and_value = {
+                    "@type": "Text",
+                    "value": ratio_to_percentage(text_quality)
+                }
+            self.__text_quality_schema.update(type_and_value)
+        return self.__text_quality_schema
+
+    @property
+    def language_schema(self) -> List[dict]:
+        if not self.__language_schema:
+            self.__language_schema = [
+                {
+                    "@type": "ISO 639-2",
+                    "value": lang_to_iso639_2(
+                        lang["language"],
+                        unk_code=UNDEFINED_LANGUAGE_VALUE
+                    )
+                }
+                for lang in self.__doc_meta["languages"]
+                if lang["ratio"] >= self.__min_language_ratio
+            ]
+        return self.__language_schema
+
+    @property
+    def identifier_schema(self) -> List[dict]:
+        if not self.__identifier_schema:
+            identifiers = []
+            if self.__sierra_id:
+                identifiers.append(
+                    {
+                        "@type": "Identifier",
+                        "qualifier": "OPAC",
+                        "value": self.__sierra_id
+                    }
+                )
+            if self.__permalink:
+                identifiers.append(
+                    {
+                        "@type": "Identifier",
+                        "qualifier": "Permalink",
+                        "value": self.__permalink
+                    }
+                )
+            if self.__generated_id:
+                identifiers.append(
+                    {
+                        "@type": "Identifier",
+                        "qualifier": self.__generated_id_type,
+                        "value": self.__generated_id
+                    }
+                )
+            self.__identifier_schema = identifiers
+
+        return self.__identifier_schema
+
+    @property
+    def origin_schema(self) -> dict:
+        if not self.__origin_schema:
+            self.__origin_schema = {
+                "@type": "Text",
+                "value": self.origin,
+                "comment": "Origin"
+            }
+        return self.__origin_schema
+
+
+class DIGARSchemaConverter:
+    def __init__(
+        self,
+        digitizer_output: dict,
+        generated_id: str,
+        sierra_id: str = "",
+        permalink: str = "",
+        generated_id_type: str = "CustomID",
+        min_language_ratio: float = 0.2,
+        convert_ratio: bool = False
+    ) -> NoReturn:
+        """ Initialize DIGARSchemaConverter object.
+
+        Parameters
+        ----------
+        digitizer_output: dict
+            Raw output of rara-digitizer (https://pypi.org/project/rara-digitizer/).
+        generated_id: str
+            Some non-standard/generated document identifier used in ID fields.
+        sierra_id: str
+            Document's corresponding Sierra ID.
+        permalink: str
+            Permanent link, where the document can be accessed.
+        generated_id_type: str
+            Method / type of generated ID (e.g. 'UUID')
+        min_language_ratio: float
+            Cutoff ratio for languages. If ratio for some language
+            does not exceed the set threshold, the language will not
+            be added to the final output.
+        convert_ratio: bool
+            If enabled, all ratios are converted into percentages.
+
+        """
+        self.__digitizer_output: dict = digitizer_output
+        self.__min_language_ratio: float = min_language_ratio
+        self.__convert_ratio: bool = convert_ratio
+        self.__sierra_id: str = sierra_id
+        self.__generated_id: str = generated_id
+        self.__permalink: str = permalink.removesuffix("/")
+        self.__generated_id_type: str = generated_id_type
+        self.__texts: List[dict] = []
+        self.__images: List[dict] = []
+        self.__doc_meta: dict = {}
+        self.__page_mappings: List[dict] = []
+        self.__dcterms_haspart: dict = {}
+        self.__dcterms_conforms_to: dict = {}
+        self.__dc_language: dict = {}
+        self.__dc_origin: dict = {}
+        self.__dc_identifier: List[dict] = []
+        self.__doc_id: str = ""
+
+        self.__doc_schemas = DocSchemas(
+            doc_meta=self.doc_meta,
+            sierra_id=self.__sierra_id,
+            generated_id=self.__generated_id,
+            permalink=self.__permalink,
+            min_language_ratio=self.__min_language_ratio,
+            convert_ratio=self.__convert_ratio,
+            generated_id_type=self.__generated_id_type
+        )
+        self.__digar_schema: dict = {}
+
+    def _get_page_number(self, page_content: dict) -> int:
+        """ Retrieves page number from image or text object.
+        """
+        _segments = page_content["texts"] + page_content["images"]
+        _first_segment = _segments[0]
+        if "start_page" in _first_segment:
+            page_number = _first_segment.get("start_page")
+        elif "page" in _first_segment:
+            page_number = _first_segment.get("page")
+        return page_number
+
+    @property
+    def doc_id(self) -> str:
+        """ Retrieves document ID to use for generating
+        page and segment ids. Preference order:
+        1. permalink; 2. sierra_id; 3. generated document id
+        """
+        if not self.__doc_id:
+            if self.__permalink:
+                self.__doc_id = self.__permalink
+            elif self.__sierra_id:
+                self.__doc_id = self.__sierra_id
+            else:
+                self.__doc_id = self.__generated_id
+        return self.__doc_id
+
+    @property
+    def texts(self) -> List[dict]:
+        if not self.__texts:
+            self.__texts = self.__digitizer_output.get("texts")
+        return self.__texts
+
+    @property
+    def images(self) -> List[dict]:
+        if not self.__images:
+            self.__images = self.__digitizer_output.get("images")
+        return self.__images
+
+    @property
+    def doc_meta(self) -> dict:
+        if not self.__doc_meta:
+            self.__doc_meta = self.__digitizer_output.get("doc_meta")
+        return self.__doc_meta
+
+    @property
+    def page_mappings(self) -> List[dict]:
+        if not self.__page_mappings:
+            mapped = defaultdict(lambda: defaultdict(list))
+            for text in self.texts:
+                mapped[text["start_page"]]["texts"].append(text)
+            for img in self.images:
+                mapped[img["page"]]["images"].append(img)
+
+            self.__page_mappings = [
+                v for k, v in sorted(list(mapped.items()), key=lambda x: x[0])
+            ]
+        return self.__page_mappings
+
+    @property
+    def dcterms_haspart(self) -> dict:
+        if not self.__dcterms_haspart:
+            self.__dcterms_haspart = {
+                "dcterms:hasPart": [
+                    PageSchema(
+                        page_texts=page["texts"],
+                        page_images=page["images"],
+                        page_number=self._get_page_number(page),
+                        doc_id=self.doc_id
+                    ).schema
+                    for page in self.page_mappings
+                ]
+            }
+        return self.__dcterms_haspart
+
+    @property
+    def dcterms_conforms_to(self) -> dict:
+        if not self.__dcterms_conforms_to:
+            schema_content = [
+                self.__doc_schemas.text_quality_schema,
+            ]
+            # Add OCR Accuracy only when it is not empty:
+            if self.__doc_schemas.ocr_accuracy_schema:
+                schema_content.append(self.__doc_schemas.ocr_accuracy_schema)
+            self.__dcterms_conforms_to = {
+                "dcterms:conformsTo": schema_content
+            }
+        return self.__dcterms_conforms_to
+
+    @property
+    def dc_language(self) -> dict:
+        if not self.__dc_language:
+            self.__dc_language = {
+                "dc:language": self.__doc_schemas.language_schema
+            }
+        return self.__dc_language
+
+    @property
+    def dc_origin(self) -> dict:
+        if not self.__dc_origin:
+            self.__dc_origin = {
+                "dcterms:provenance": self.__doc_schemas.origin_schema
+            }
+        return self.__dc_origin
+
+    @property
+    def dc_identifier(self) -> List[dict]:
+        if not self.__dc_identifier:
+            self.__dc_identifier = {
+                "dc:identifier": self.__doc_schemas.identifier_schema
+            }
+        return self.__dc_identifier
+
+    @property
+    def digar_schema(self) -> dict:
+        if not self.__digar_schema:
+            self.__digar_schema = {}
+            self.__digar_schema.update(self.dcterms_conforms_to)
+            self.__digar_schema.update(self.dcterms_haspart)
+            self.__digar_schema.update(self.dc_language)
+            self.__digar_schema.update(self.dc_origin)
+            self.__digar_schema.update(self.dc_identifier)
+        return self.__digar_schema
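For orientation (not part of the released diff): a minimal usage sketch of the new converter. The input dict below is a hand-made stand-in for raw rara-digitizer output, restricted to the keys the module actually reads ("doc_meta", "texts", "images"); all field values are illustrative.

from rara_tools.digar_schema_converter import DIGARSchemaConverter

# Hypothetical digitizer output covering only the keys the converter reads.
digitizer_output = {
    "doc_meta": {
        "ocr_applied": True,        # -> provenance "Reformatted digital"
        "alto_text_quality": 0.93,  # OCR accuracy estimate
        "text_quality": 0.88,       # n-gram-based quality estimate
        "languages": [
            {"language": "et", "ratio": 0.9},
            {"language": "de", "ratio": 0.1},  # below min_language_ratio=0.2, dropped
        ],
    },
    "texts": [{"text": "Tere!", "start_page": 1}],
    "images": [{"label": "Illustration", "page": 1}],
}

converter = DIGARSchemaConverter(
    digitizer_output=digitizer_output,
    generated_id="example-generated-id",
    sierra_id="b1267058",
    permalink="https://www.digar.ee/b1267058",
)
schema = converter.digar_schema
# `schema` now holds "dcterms:conformsTo", "dcterms:hasPart", "dc:language",
# "dcterms:provenance" and "dc:identifier". The permalink wins as base ID, so
# the first segment gets "@id" == "https://www.digar.ee/b1267058/1/1", and with
# the default convert_ratio=False the quality values are strings like "88%".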
{rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools/elastic.py
@@ -1,10 +1,11 @@
-from typing import Any, Dict, Iterator, Optional
+from typing import Any, Dict, Iterator, Optional, List
 
 import elasticsearch_dsl
 from elastic_transport import ObjectApiResponse
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
 from elasticsearch_dsl import Index
+from elasticsearch_dsl.response import Response
 
 from .decorators import _elastic_connection
 
@@ -82,6 +83,63 @@ class KataElastic:
     def add_mapping(self, index_name: str, schema: dict):
         index = Index(name=index_name)
         return index.put_mapping(body=schema, using=self.elasticsearch)
+
+
+    @_elastic_connection
+    def add_vector_mapping(
+        self,
+        index_name: str,
+        field: str,
+        schema: Optional[dict] = None,
+        dims: int = 1024
+    ) -> dict:
+        vector_mapping = {
+            "properties": {
+                field: {
+                    "type": "dense_vector",
+                    "dims": dims
+                }
+            }
+        }
+        mapping = schema or vector_mapping
+        index = Index(name=index_name)
+        return index.put_mapping(body=mapping, using=self.elasticsearch)
+
+
+    @_elastic_connection
+    def add_ann_vector_mapping(
+        self,
+        index_name: str,
+        field: str,
+        schema: Optional[dict] = None,
+        dims: int = 1024
+    ) -> dict:
+        vector_mapping = {
+            "properties": {
+                field: {
+                    "type": "dense_vector",
+                    "dims": dims,
+                    "similarity": "cosine",
+                    "index": True
+                }
+            }
+        }
+        mapping = schema or vector_mapping
+        index = Index(name=index_name)
+        return index.put_mapping(body=mapping, using=self.elasticsearch)
+
+    @_elastic_connection
+    def add_vector(
+        self,
+        index_name: str,
+        document_id: str,
+        vector: List[float],
+        field: str
+    ) -> dict:
+        schema = {"doc": {field: vector}}
+        return self.elasticsearch.update(
+            index=index_name, id=document_id, body=schema, refresh="wait_for"
+        )
 
     @_elastic_connection
     def create_index(
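A sketch of how the new mapping and vector helpers compose with the pre-existing create_index and index_document methods; the URL and index name are placeholders, and the assumption that index_document returns the raw Elasticsearch response (with an "_id" key) is inferred from the result checks in the new tests.

from rara_tools.elastic import KataElastic

elastic = KataElastic("http://localhost:9200")  # placeholder URL
elastic.create_index("tools_demo_index")

# ANN-ready mapping: an indexed dense_vector with cosine similarity.
# add_vector_mapping() is the plain (non-indexed) variant with the same
# signature; pick one per field, since Elasticsearch rejects conflicting
# remappings of an existing field.
elastic.add_ann_vector_mapping(index_name="tools_demo_index", field="vector", dims=1024)

# add_vector() issues a partial update, so the document must already exist:
indexed = elastic.index_document("tools_demo_index", {"name": "demo"})
elastic.add_vector(
    index_name="tools_demo_index",
    document_id=indexed["_id"],  # assumed: raw ES response carries the generated ID
    vector=[0.1] * 1024,
    field="vector",
)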
@@ -170,6 +228,87 @@ class KataElastic:
             s.scan(), key=lambda doc: [getattr(doc, field) for field in sort_fields]
         )
         return documents
+
+
+    @_elastic_connection
+    def execute_fuzzy_search(
+        self,
+        index: str,
+        field: str,
+        entity: str,
+        fuzziness: int = 2,
+        prefix_length: int = 1,
+        max_expansions: int = 50
+    ) -> Response:
+        """Executes a fuzzy search.
+        :param: index str: Index to search from.
+        :param: entity str: Entity to search matches for.
+        :param: fuzziness int: Maximum edit distance for a match.
+        :param: prefix_length int: Number of characters in the prefix that
+            should overlap with the original entity's prefix.
+        :param: max_expansion int: maximum number of terms the fuzzy query
+            will match before halting the search
+        :return: Dict on search results.
+        """
+        query_params = {
+            f"{field}.keyword": {
+                "value": entity,
+                "fuzziness": fuzziness,
+                "max_expansions": max_expansions,
+                "prefix_length": prefix_length
+            }
+        }
+        s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
+        s = s.query("fuzzy", **query_params)
+        response = s.execute()
+        return response
+
+    def execute_vector_search(
+        self,
+        index: str,
+        field: str,
+        query_vector: List[float],
+        k: int = 10,
+        num_candidates: int = 100,
+        n_docs: int = 10,
+        elastic_ids: List[str] = []
+    ) -> Response:
+        """ Execute a vector search.
+        NB! Works only with ANN mapping!
+
+        :param: index str: Index to search from.
+        :param: field str: Field containing vectorized data.
+        :param: query vector List[float]: Vector to search matches for.
+        :param: k int: Number of nearest neighbors to return.
+        :param: num_candidates int: Number of candidates considered before selecting k results.
+        :param: elastic_ids: List[str]: Elastic ID-s for restricting the search.
+        """
+
+        s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
+
+        # Add kNN vector search
+        s = s.extra(
+            knn={
+                "field": field,
+                "query_vector": query_vector,
+                "k": k,
+                "num_candidates": num_candidates
+            }
+        )
+
+        # Add ID filtering, if elastic_ids are specified
+        if elastic_ids:
+            s = s.query(
+                elasticsearch_dsl.Q("terms", _id=elastic_ids)
+            )
+
+        # Sort by score and return `n_docs` best-matching documents
+        s = s.extra(size=n_docs)
+
+        # Execute the search
+        response = s.execute()
+        return response
+
 
     def __str__(self) -> str:
         return self.elasticsearch_url
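The new tests further down exercise exactly this flow; condensed here as a usage sketch. The index name and the "variations" field are assumptions, and note that the fuzzy helper targets the field's dynamically-mapped ".keyword" sub-field.

# Fuzzy match on a keyword field to collect candidate document IDs ...
response = elastic.execute_fuzzy_search(
    index="tools_demo_index",
    field="variations",  # the helper queries "variations.keyword"
    entity="Paul Keres",
    fuzziness=2,         # allow up to two edits
)
candidate_ids = [hit.meta.id for hit in response]

# ... then run kNN over the ANN-mapped vector field, restricted to those IDs.
response = elastic.execute_vector_search(
    index="tools_demo_index",
    field="vector",
    query_vector=[0.1] * 1024,  # stand-in for a real embedding
    k=5,
    num_candidates=50,
    n_docs=5,
    elastic_ids=candidate_ids,
)
for hit in response:
    print(hit.meta.id, hit.meta.score)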
rara_tools-0.0.11/rara_tools/utils.py (new file)
@@ -0,0 +1,104 @@
+from iso639 import Lang
+
+
+def lang_to_iso639_1(lang: str, unk_code: str = "unk") -> str:
+    """ Converts language into ISO-639-1 standard.
+    Input can be any language code in a valid ISO-639
+    standard or even a full name of the language,
+    e.g. "Estonian".
+
+    Parameters
+    -----------
+    lang: str
+        Language code in any valid ISO-639 standard.
+
+    unk_code: str
+        Code to return incase of invalid/unsupported
+        input language.
+
+    Returns
+    -------
+    Language code in ISO-639-1 standard.
+    """
+    try:
+        lg = Lang(lang)
+        iso_639_1_lang = lg.pt1
+    except:
+        iso_639_1_lang = unk_code
+    return iso_639_1_lang
+
+
+def lang_to_iso639_2(lang: str, unk_code: str = "unk") -> str:
+    """ Converts language into ISO-639-2 standard.
+    Input can be any language code in a valid ISO-639
+    standard or even a full name of the language,
+    e.g. "Estonian".
+
+    Parameters
+    -----------
+    lang: str
+        Language code in any valid ISO-639 standard.
+
+    unk_code: str
+        Code to return incase of invalid/unsupported
+        input language.
+
+    Returns
+    -------
+    Language code in ISO-639-2 standard.
+    """
+    try:
+        lg = Lang(lang)
+        # NB! uses bibliographic identifier (e.g. "de" -> "ger")
+        # opposed to terminological identifier ("de" -> "deu").
+        # This can be changed by replaving lg.pt2b -> lg.pt2t
+        iso_639_2_lang = lg.pt2b
+    except:
+        iso_639_2_lang = unk_code
+    return iso_639_2_lang
+
+
+def lang_to_iso639_3(lang: str, unk_code: str = "unk") -> str:
+    """ Converts language into ISO-639-3 standard.
+    Input can be any language code in a valid ISO-639
+    standard or even a full name of the language,
+    e.g. "Estonian".
+
+    Parameters
+    -----------
+    lang: str
+        Language code in any valid ISO-639 standard.
+
+    unk_code: str
+        Code to return incase of invalid/unsupported
+        input language.
+
+    Returns
+    -------
+    str
+        Language code in ISO-639-3 standard.
+    """
+    try:
+        lg = Lang(lang)
+        iso_639_3_lang = lg.pt3
+    except:
+        iso_639_3_lang = unk_code
+    return iso_639_3_lang
+
+
+def ratio_to_percentage(ratio: float) -> str:
+    """ Converts ratio to corresponding percentage.
+
+    Parameters
+    -----------
+    ratio: float
+        Float in range [0,1]
+
+    Returns
+    --------
+    str
+        Percentage corresponding to the float.
+
+    """
+    percentage = f"{int(ratio*100)}%"
+    return percentage
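Expected behaviour of these helpers, per the iso639-lang package they wrap (return values shown as comments; an unrecognized input falls through the bare except to the unk_code default):

from rara_tools.utils import (
    lang_to_iso639_1, lang_to_iso639_2, lang_to_iso639_3, ratio_to_percentage
)

lang_to_iso639_1("Estonian")        # -> "et"
lang_to_iso639_2("de")              # -> "ger" (bibliographic code, not "deu")
lang_to_iso639_3("et")              # -> "est"
lang_to_iso639_2("not a language")  # -> "unk" (unk_code fallback)
ratio_to_percentage(0.876)          # -> "87%" (int() truncates, no rounding)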
{rara_tools-0.0.9 → rara_tools-0.0.11/rara_tools.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rara-tools
-Version: 0.0.9
+Version: 0.0.11
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
@@ -14,6 +14,7 @@ Requires-Dist: elasticsearch==8.*
 Requires-Dist: elasticsearch_dsl==8.*
 Requires-Dist: minio==7.*
 Requires-Dist: requests
+Requires-Dist: iso639-lang
 Provides-Extra: testing
 Requires-Dist: pytest>=8.0; extra == "testing"
 Requires-Dist: pytest-order; extra == "testing"
{rara_tools-0.0.9 → rara_tools-0.0.11}/rara_tools.egg-info/SOURCES.txt
@@ -5,10 +5,12 @@ pyproject.toml
 requirements.txt
 rara_tools/converters.py
 rara_tools/decorators.py
+rara_tools/digar_schema_converter.py
 rara_tools/elastic.py
 rara_tools/exceptions.py
 rara_tools/s3.py
 rara_tools/task_reporter.py
+rara_tools/utils.py
 rara_tools.egg-info/PKG-INFO
 rara_tools.egg-info/SOURCES.txt
 rara_tools.egg-info/dependency_links.txt
@@ -18,7 +20,9 @@ rara_tools/constants/__init__.py
 rara_tools/constants/digitizer.py
 rara_tools/constants/general.py
 tests/test_converters.py
+tests/test_digar_schema_converter.py
 tests/test_elastic.py
+tests/test_elastic_vector_and_search_operations.py
 tests/test_s3_exceptions.py
 tests/test_s3_file_operations.py
 tests/test_task_reporter.py
rara_tools-0.0.11/tests/test_digar_schema_converter.py (new file)
@@ -0,0 +1,133 @@
+import json
+import pytest
+import os
+import sys
+
+from rara_tools.digar_schema_converter import DIGARSchemaConverter
+
+def load_json(file_path: str):
+    with open(file_path, "r") as f:
+        data = json.load(f)
+    return data
+
+
+TEST_DIGITIZER_OUTPUT_FILE = os.path.join(".", "tests", "test_data", "b1267058_test_digitizer_output.json")
+TEST_DIGITIZER_OUTPUT = load_json(TEST_DIGITIZER_OUTPUT_FILE)
+TEST_SIERRA_ID = "b1267058"
+TEST_GENERATED_ID = "hsasaHSAHHGDhb"
+TEST_PERMALINK = "https://www.digar.ee/b1267058"
+
+def test_digar_schema_converstion_default():
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID
+    )
+    digar_schema = converter.digar_schema
+
+    # check that all neseccary fields are present
+    assert "dc:language" in digar_schema
+    assert "dcterms:provenance" in digar_schema
+    assert "dc:identifier" in digar_schema
+    assert "dcterms:hasPart" in digar_schema
+    assert "dcterms:conformsTo" in digar_schema
+
+    languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
+    # check that languages are converted into ISO-693-2
+    for lang in languages:
+        assert len(lang) == 3
+
+
+    # check that ratio is converted into percentage
+    text_quality = digar_schema.get("dcterms:conformsTo")[0].get("value")
+    assert isinstance(text_quality, str)
+
+
+def test_digar_schema_id_generation():
+    """ Tests ID generation logic.
+    """
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID,
+        permalink=TEST_PERMALINK
+
+    )
+
+    #If permalink is given, this should be used as base ID
+    digar_schema = converter.digar_schema
+    first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
+
+    assert first_segment_id.startswith(TEST_PERMALINK)
+
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID
+    )
+
+    #If permalink is NOT given, Sierra ID should be used as base ID
+    digar_schema = converter.digar_schema
+    first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
+    assert first_segment_id.startswith(TEST_SIERRA_ID)
+
+
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        generated_id=TEST_GENERATED_ID
+    )
+
+    #If neiter permalink nor Sierra ID is given, generated ID should be used as base ID
+    digar_schema = converter.digar_schema
+    first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
+    assert first_segment_id.startswith(TEST_GENERATED_ID)
+
+
+def test_restricting_languages_with_ratio():
+    """ Checks that param `min_language_ratio` influences
+    the number of output languages.
+    """
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID,
+        permalink=TEST_PERMALINK,
+        min_language_ratio=0
+
+    )
+
+    #If permalink is given, this should be used as base ID
+    digar_schema = converter.digar_schema
+    languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
+    assert len(languages) == 7
+
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID,
+        permalink=TEST_PERMALINK,
+        min_language_ratio=0.02
+
+    )
+
+    #If permalink is given, this should be used as base ID
+    digar_schema = converter.digar_schema
+    languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
+    assert len(languages) == 2
+
+    converter = DIGARSchemaConverter(
+        digitizer_output=TEST_DIGITIZER_OUTPUT,
+        sierra_id=TEST_SIERRA_ID,
+        generated_id=TEST_GENERATED_ID,
+        permalink=TEST_PERMALINK,
+        min_language_ratio=0.5
+
+    )
+
+    #If permalink is given, this should be used as base ID
+    digar_schema = converter.digar_schema
+    languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
+    assert len(languages) == 1
+
+
rara_tools-0.0.11/tests/test_elastic_vector_and_search_operations.py (new file)
@@ -0,0 +1,140 @@
+import json
+import os
+import pytest
+
+from time import sleep
+from rara_tools.elastic import KataElastic
+
+
+def load_json(file_path: str):
+    with open(file_path, "r") as fh:
+        data = json.load(fh)
+    return data
+
+TEST_DOCUMENTS = load_json("./tests/test_data/elastic_vectorized_docs.json")
+TEST_VECTOR_DATA = load_json("./tests/test_data/test_vector_data.json")
+TEST_VECTOR = TEST_VECTOR_DATA.get("vector")
+
+es_url = os.getenv("ELASTIC_TEST_URL", "http://localhost:9200")
+ELASTIC = KataElastic(es_url)
+
+TEST_KNN_INDEX_NAME = "tools_knn_testing_index"
+TEST_ANN_INDEX_NAME = "tools_ann_testing_index"
+
+TEST_VECTOR_FIELD = "vector"
+
+
+
+@pytest.mark.order(1)
+def test_index_creation_with_knn_vector_mapping():
+    """ Tests if index created and documents indexed.
+    """
+    # Create test index
+    created = ELASTIC.create_index(TEST_KNN_INDEX_NAME)
+    assert created["acknowledged"] is True
+    result = ELASTIC.add_vector_mapping(
+        index_name=TEST_KNN_INDEX_NAME,
+        field=TEST_VECTOR_FIELD
+    )
+    assert result["acknowledged"] is True
+
+
+@pytest.mark.order(2)
+def test_index_creation_with_ann_vector_mapping():
+    """ Tests if index created and documents indexed.
+    """
+    # Create test index
+    created = ELASTIC.create_index(TEST_ANN_INDEX_NAME)
+    assert created["acknowledged"] is True
+    result = ELASTIC.add_ann_vector_mapping(
+        index_name=TEST_ANN_INDEX_NAME,
+        field=TEST_VECTOR_FIELD
+    )
+    assert result["acknowledged"] is True
+
+
+@pytest.mark.order(3)
+def test_vectorized_document_addition_knn_index():
+    """ Tests indexing vectorized documents.
+    """
+    # Add test documents
+    for document in TEST_DOCUMENTS:
+        indexed = ELASTIC.index_document(TEST_KNN_INDEX_NAME, document)
+        assert indexed["result"] == "created"
+    # let it index
+    sleep(1)
+
+@pytest.mark.order(4)
+def test_vectorized_document_addition_ann_index():
+    """ Tests indexing vectorized documents.
+    """
+    # Add test documents
+    for document in TEST_DOCUMENTS:
+        indexed = ELASTIC.index_document(TEST_ANN_INDEX_NAME, document)
+        assert indexed["result"] == "created"
+    # let it index
+    sleep(1)
+
+@pytest.mark.order(5)
+def test_fuzzy_search():
+    """ Tests fuzzy search.
+    """
+    response = ELASTIC.execute_fuzzy_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="variations",
+        entity="Paul Keres",
+        fuzziness=0
+    )
+    total_hits = response.hits.total.value
+    assert total_hits == 2
+
+    response = ELASTIC.execute_fuzzy_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="variations",
+        entity="Paul Keres",
+        fuzziness=2
+    )
+    total_hits = response.hits.total.value
+    assert total_hits == 3
+
+
+@pytest.mark.order(6)
+def test_vector_search():
+    """ Tests vector search.
+    """
+    # Execut fuzzy search to get ID restrictions
+    response = ELASTIC.execute_fuzzy_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="variations",
+        entity="Paul Keres",
+        fuzziness=2
+    )
+    total_hits = response.hits.total.value
+    assert total_hits == 3
+    elastic_ids = [hit.meta.id for hit in response]
+
+    response = ELASTIC.execute_vector_search(
+        index=TEST_ANN_INDEX_NAME,
+        field="vector",
+        query_vector=TEST_VECTOR,
+        k=1,
+        n_docs=1,
+        num_candidates=10,
+        elastic_ids=elastic_ids
+    )
+    descriptions = [hit.description for hit in response]
+    assert len(descriptions) == 1
+    assert descriptions[0] == "Eesti maletaja ja maleteoreetik"
+
+
+@pytest.mark.order(7)
+def test_index_deleting():
+    """
+    Tests deleting index. We delete the test index now.
+    """
+    indices = [TEST_KNN_INDEX_NAME, TEST_ANN_INDEX_NAME]
+    for index in indices:
+        deleted = ELASTIC.delete_index(index)
+        sleep(1)
+        assert deleted["acknowledged"] is True
+
rara_tools-0.0.9/VERSION (deleted)
@@ -1 +0,0 @@
-0.0.9
All remaining files in the list above (+0 -0) are unchanged between 0.0.9 and 0.0.11.