rara-tools 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- rara_tools/digar_schema_converter.py +416 -0
- rara_tools/elastic.py +140 -1
- rara_tools/utils.py +104 -0
- {rara_tools-0.0.9.dist-info → rara_tools-0.0.11.dist-info}/METADATA +2 -1
- {rara_tools-0.0.9.dist-info → rara_tools-0.0.11.dist-info}/RECORD +8 -6
- {rara_tools-0.0.9.dist-info → rara_tools-0.0.11.dist-info}/LICENSE.md +0 -0
- {rara_tools-0.0.9.dist-info → rara_tools-0.0.11.dist-info}/WHEEL +0 -0
- {rara_tools-0.0.9.dist-info → rara_tools-0.0.11.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from typing import List, NoReturn
|
|
3
|
+
|
|
4
|
+
from rara_tools.utils import lang_to_iso639_2, ratio_to_percentage
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Fallback language code emitted when a language can't be mapped to ISO 639-2.
UNDEFINED_LANGUAGE_VALUE = "unk"
# Schema "@type" used for quality values kept as raw float ratios.
QUALITY_RATIO_TYPE = "Float"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ImagePageSchema:
    """Schema wrapper for a single image segment on a page.

    Converts a rara-digitizer image dict into a schema.org
    ``VisualArtwork`` fragment.
    """

    def __init__(self, image: dict) -> None:
        """
        Parameters
        ----------
        image: dict
            Image segment from digitizer output; keys ``label`` and
            ``page`` are read.
        """
        # NB: the return annotation was previously `NoReturn`, which means
        # "never returns"; `None` is the correct annotation for __init__.
        self.__image = image
        self.__schema: dict = {}

    @property
    def schema(self) -> dict:
        """Lazily built and cached schema fragment for this image."""
        if not self.__schema:
            self.__schema = {
                "@type": "VisualArtwork",  # CONSTANT for images
                "@id": "",  # Filled in later (see PageSchema._add_segment_ids)
                "value": self.__image.get("label"),
                "description": "",
                "schema:position": self.__image.get("page")
            }
        return self.__schema
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TextPageSchema:
    """Schema wrapper for a single text segment ("textblock") on a page."""

    def __init__(self, page: dict) -> None:
        """
        Parameters
        ----------
        page: dict
            Text segment from digitizer output; keys ``text`` and
            ``start_page`` are read.
        """
        # NB: return annotation fixed from `NoReturn` (wrong for __init__)
        # to `None`.
        self.__page: dict = page
        self.__schema: dict = {}

    @property
    def schema(self) -> dict:
        """Lazily built and cached 'Text' schema fragment for this segment."""
        if not self.__schema:
            self.__schema = {
                "@type": "Text",  # CONSTANT
                "@id": "",  # Will be added in a later stage
                "value": "Textblock",  # CONSTANT
                "content": self.__page.get("text"),
                # NOTE(review): positions the segment at its start page;
                # confirm this is intended for multi-page segments.
                "schema:position": self.__page.get("start_page")
            }
        return self.__schema
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class PageSchema:
    """Schema fragment for a single page, combining its text and image
    segments into one schema.org ``CreativeWork``.
    """

    def __init__(
        self,
        page_texts: List[dict],
        page_images: List[dict],
        page_number: int,
        doc_id: str
    ) -> None:
        """
        Parameters
        ----------
        page_texts: List[dict]
            Text segments located on this page.
        page_images: List[dict]
            Image segments located on this page.
        page_number: int
            Page number used in the generated page ID.
        doc_id: str
            Parent document ID, used as the page ID prefix.
        """
        # NB: return annotation fixed from `NoReturn` (wrong for __init__)
        # to `None`.
        self.__page_texts: List[dict] = page_texts
        self.__page_images: List[dict] = page_images
        self.__page_nr: int = page_number
        self.__page_id: str = ""
        self.__doc_id: str = doc_id
        self.__schema: dict = {}

    def _add_segment_ids(self, segments: List[dict]) -> List[dict]:
        """Assign sequential '@id' values ('<page_id>/<n>', 1-based) to
        each segment in place and return the list.
        """
        for i, segment in enumerate(segments):
            segment["@id"] = f"{self.page_id}/{i + 1}"
        return segments

    @property
    def page_id(self) -> str:
        """Page identifier of the form '<doc_id>/<page_number>'."""
        if not self.__page_id:
            self.__page_id = f"{self.__doc_id}/{self.__page_nr}"
        return self.__page_id

    @property
    def schema(self) -> dict:
        """Lazily built page schema; text segments first, then images,
        all listed under 'hasPart' with sequential segment IDs.
        """
        if not self.__schema:
            self.__schema = {
                "@type": "CreativeWork",  # CONSTANT for pages
                "@id": self.page_id,
                "hasPart": []
            }
            text_schemas = [
                TextPageSchema(page).schema
                for page in self.__page_texts
            ]
            image_schemas = [
                ImagePageSchema(image).schema
                for image in self.__page_images
            ]

            page_schemas = text_schemas + image_schemas
            page_schemas_with_ids = self._add_segment_ids(page_schemas)

            self.__schema["hasPart"].extend(page_schemas_with_ids)

        return self.__schema
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class DocSchemas:
    """Builders for document-level schema fragments (quality, language,
    identifiers, origin) derived from rara-digitizer metadata.
    """

    def __init__(
        self,
        doc_meta: dict,
        sierra_id: str = "",
        generated_id: str = "",
        permalink: str = "",
        min_language_ratio: float = 0.2,
        convert_ratio: bool = True,
        generated_id_type: str = "CustomID"
    ) -> None:
        """ Initialize DocSchemas object.

        Parameters
        ----------
        doc_meta: dict
            `doc_meta` section of rara-digitizer output. Keys read:
            `ocr_applied`, `alto_text_quality`, `text_quality`, `languages`.
        sierra_id: str
            Document's corresponding Sierra ID.
        generated_id: str
            Non-standard/generated document identifier.
        permalink: str
            Permanent link where the document can be accessed.
        min_language_ratio: float
            Cutoff ratio for languages; languages below it are dropped.
        convert_ratio: bool
            If enabled, quality ratios are rendered as percentage strings.
        generated_id_type: str
            Method / type of the generated ID (e.g. 'UUID').
        """
        # NB: return annotation fixed from `NoReturn` to `None`.
        self.__convert_ratio = convert_ratio
        self.__min_language_ratio = min_language_ratio
        self.__sierra_id = sierra_id
        self.__generated_id = generated_id
        self.__permalink = permalink
        self.__generated_id_type = generated_id_type
        self.__doc_meta = doc_meta
        # Lazily built caches:
        self.__ocr_accuracy_schema: dict = {}
        self.__text_quality_schema: dict = {}
        self.__language_schema: List[dict] = []
        self.__identifier_schema: List[dict] = []
        self.__origin_schema: dict = {}
        self.__origin: str = ""

    def _quality_type_and_value(self, quality) -> dict:
        """Return the '@type'/'value' pair for a quality ratio.

        BUGFIX: these branches were previously inverted —
        `convert_ratio=True` emitted the raw float, contradicting the
        documented behaviour ("all ratios are converted into percentages").
        """
        if self.__convert_ratio:
            return {"@type": "Text", "value": ratio_to_percentage(quality)}
        return {"@type": QUALITY_RATIO_TYPE, "value": quality}

    @staticmethod
    def _identifier(qualifier: str, value: str) -> dict:
        """Build a single 'Identifier' schema entry."""
        return {"@type": "Identifier", "qualifier": qualifier, "value": value}

    @property
    def origin(self) -> str:
        """'Reformatted digital' if OCR was applied, else 'Born digital'."""
        if not self.__origin:
            if self.__doc_meta["ocr_applied"]:
                self.__origin = "Reformatted digital"
            else:
                self.__origin = "Born digital"
        return self.__origin

    @property
    def ocr_accuracy_schema(self) -> dict:
        """OCR accuracy fragment; empty dict if `alto_text_quality`
        is missing or falsy.
        """
        if not self.__ocr_accuracy_schema:
            ocr_quality = self.__doc_meta.get("alto_text_quality")
            if ocr_quality:
                self.__ocr_accuracy_schema = {
                    "comment": "Estimated OCR accuracy"
                }
                self.__ocr_accuracy_schema.update(
                    self._quality_type_and_value(ocr_quality)
                )
        return self.__ocr_accuracy_schema

    @property
    def text_quality_schema(self) -> dict:
        """N-gram-based text quality fragment."""
        if not self.__text_quality_schema:
            text_quality = self.__doc_meta.get("text_quality")
            self.__text_quality_schema = {
                "comment": "Estimated n-gram-based text quality"
            }
            self.__text_quality_schema.update(
                self._quality_type_and_value(text_quality)
            )
        return self.__text_quality_schema

    @property
    def language_schema(self) -> List[dict]:
        """ISO 639-2 entries for languages at or above the min ratio cutoff."""
        if not self.__language_schema:
            self.__language_schema = [
                {
                    "@type": "ISO 639-2",
                    "value": lang_to_iso639_2(
                        lang["language"],
                        unk_code=UNDEFINED_LANGUAGE_VALUE
                    )
                }
                for lang in self.__doc_meta["languages"]
                if lang["ratio"] >= self.__min_language_ratio
            ]
        return self.__language_schema

    @property
    def identifier_schema(self) -> List[dict]:
        """Identifier entries for Sierra ID, permalink and generated ID
        (in that order); empty values are skipped.
        """
        if not self.__identifier_schema:
            identifiers = []
            if self.__sierra_id:
                identifiers.append(self._identifier("OPAC", self.__sierra_id))
            if self.__permalink:
                identifiers.append(
                    self._identifier("Permalink", self.__permalink)
                )
            if self.__generated_id:
                identifiers.append(
                    self._identifier(
                        self.__generated_id_type, self.__generated_id
                    )
                )
            self.__identifier_schema = identifiers

        return self.__identifier_schema

    @property
    def origin_schema(self) -> dict:
        """Provenance fragment built from `origin`."""
        if not self.__origin_schema:
            self.__origin_schema = {
                "@type": "Text",
                "value": self.origin,
                "comment": "Origin"
            }
        return self.__origin_schema
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class DIGARSchemaConverter:
    """Converts raw rara-digitizer output into a DIGAR schema dictionary."""

    def __init__(
        self,
        digitizer_output: dict,
        generated_id: str,
        sierra_id: str = "",
        permalink: str = "",
        generated_id_type: str = "CustomID",
        min_language_ratio: float = 0.2,
        convert_ratio: bool = False
    ) -> None:
        """ Initialize DIGARSchemaConverter object.

        Parameters
        ----------
        digitizer_output: dict
            Raw output of rara-digitizer (https://pypi.org/project/rara-digitizer/).
        generated_id: str
            Some non-standard/generated document identifier used in ID fields.
        sierra_id: str
            Document's corresponding Sierra ID.
        permalink: str
            Permanent link, where the document can be accessed.
        generated_id_type: str
            Method / type of generated ID (e.g. 'UUID')
        min_language_ratio: float
            Cutoff ratio for languages. If the ratio for some language
            does not exceed the set threshold, the language will not
            be added to the final output.
        convert_ratio: bool
            If enabled, all ratios are converted into percentages.
        """
        # NB: return annotation fixed from `NoReturn` to `None`.
        self.__digitizer_output: dict = digitizer_output
        self.__min_language_ratio: float = min_language_ratio
        self.__convert_ratio: bool = convert_ratio
        self.__sierra_id: str = sierra_id
        self.__generated_id: str = generated_id
        # Strip a single trailing slash so generated IDs don't contain "//".
        self.__permalink: str = permalink.removesuffix("/")
        self.__generated_id_type: str = generated_id_type
        # Lazily populated caches:
        self.__texts: List[dict] = []
        self.__images: List[dict] = []
        self.__doc_meta: dict = {}
        self.__page_mappings: List[dict] = []
        self.__dcterms_haspart: dict = {}
        self.__dcterms_conforms_to: dict = {}
        self.__dc_language: dict = {}
        self.__dc_origin: dict = {}
        # Fixed: was annotated/initialized as List[dict] but is built
        # (and returned) as a dict.
        self.__dc_identifier: dict = {}
        self.__doc_id: str = ""

        self.__doc_schemas = DocSchemas(
            doc_meta=self.doc_meta,
            sierra_id=self.__sierra_id,
            generated_id=self.__generated_id,
            permalink=self.__permalink,
            min_language_ratio=self.__min_language_ratio,
            convert_ratio=self.__convert_ratio,
            generated_id_type=self.__generated_id_type
        )
        self.__digar_schema: dict = {}

    def _get_page_number(self, page_content: dict) -> int:
        """ Retrieves the page number from the first text or image
        segment of a page mapping.

        Raises
        ------
        ValueError
            If the first segment carries neither a 'start_page' nor a
            'page' key. (Previously this fell through and raised an
            opaque UnboundLocalError.)
        """
        segments = page_content["texts"] + page_content["images"]
        first_segment = segments[0]
        if "start_page" in first_segment:
            return first_segment["start_page"]
        if "page" in first_segment:
            return first_segment["page"]
        raise ValueError(
            "Segment has neither 'start_page' nor 'page' key."
        )

    @property
    def doc_id(self) -> str:
        """ Retrieves document ID to use for generating
        page and segment ids. Preference order:
        1. permalink; 2. sierra_id; 3. generated document id
        """
        if not self.__doc_id:
            if self.__permalink:
                self.__doc_id = self.__permalink
            elif self.__sierra_id:
                self.__doc_id = self.__sierra_id
            else:
                self.__doc_id = self.__generated_id
        return self.__doc_id

    @property
    def texts(self) -> List[dict]:
        """Text segments from the digitizer output ([] if missing)."""
        if not self.__texts:
            self.__texts = self.__digitizer_output.get("texts", [])
        return self.__texts

    @property
    def images(self) -> List[dict]:
        """Image segments from the digitizer output ([] if missing)."""
        if not self.__images:
            self.__images = self.__digitizer_output.get("images", [])
        return self.__images

    @property
    def doc_meta(self) -> dict:
        """Document metadata section of the digitizer output."""
        if not self.__doc_meta:
            self.__doc_meta = self.__digitizer_output.get("doc_meta", {})
        return self.__doc_meta

    @property
    def page_mappings(self) -> List[dict]:
        """Per-page groupings of text and image segments, ordered by
        page number. Each entry maps 'texts'/'images' to segment lists.
        """
        if not self.__page_mappings:
            mapped = defaultdict(lambda: defaultdict(list))
            for text in self.texts:
                mapped[text["start_page"]]["texts"].append(text)
            for img in self.images:
                mapped[img["page"]]["images"].append(img)

            self.__page_mappings = [
                mapped[page_nr] for page_nr in sorted(mapped)
            ]
        return self.__page_mappings

    @property
    def dcterms_haspart(self) -> dict:
        """'dcterms:hasPart' entry: one PageSchema per mapped page."""
        if not self.__dcterms_haspart:
            self.__dcterms_haspart = {
                "dcterms:hasPart": [
                    PageSchema(
                        page_texts=page["texts"],
                        page_images=page["images"],
                        page_number=self._get_page_number(page),
                        doc_id=self.doc_id
                    ).schema
                    for page in self.page_mappings
                ]
            }
        return self.__dcterms_haspart

    @property
    def dcterms_conforms_to(self) -> dict:
        """'dcterms:conformsTo' entry holding the quality schemas."""
        if not self.__dcterms_conforms_to:
            schema_content = [
                self.__doc_schemas.text_quality_schema,
            ]
            # Add OCR Accuracy only when it is not empty:
            if self.__doc_schemas.ocr_accuracy_schema:
                schema_content.append(self.__doc_schemas.ocr_accuracy_schema)
            self.__dcterms_conforms_to = {
                "dcterms:conformsTo": schema_content
            }
        return self.__dcterms_conforms_to

    @property
    def dc_language(self) -> dict:
        """'dc:language' entry with ISO 639-2 language codes."""
        if not self.__dc_language:
            self.__dc_language = {
                "dc:language": self.__doc_schemas.language_schema
            }
        return self.__dc_language

    @property
    def dc_origin(self) -> dict:
        """'dcterms:provenance' entry describing the document's origin."""
        if not self.__dc_origin:
            self.__dc_origin = {
                "dcterms:provenance": self.__doc_schemas.origin_schema
            }
        return self.__dc_origin

    @property
    def dc_identifier(self) -> dict:
        """'dc:identifier' entry with all available identifiers.

        Return annotation fixed: the property returns a dict, not
        List[dict] as previously annotated.
        """
        if not self.__dc_identifier:
            self.__dc_identifier = {
                "dc:identifier": self.__doc_schemas.identifier_schema
            }
        return self.__dc_identifier

    @property
    def digar_schema(self) -> dict:
        """The full DIGAR schema; built once and cached."""
        if not self.__digar_schema:
            schema: dict = {}
            schema.update(self.dcterms_conforms_to)
            schema.update(self.dcterms_haspart)
            schema.update(self.dc_language)
            schema.update(self.dc_origin)
            schema.update(self.dc_identifier)
            self.__digar_schema = schema
        return self.__digar_schema
|
rara_tools/elastic.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
from typing import Any, Dict, Iterator, Optional
|
|
1
|
+
from typing import Any, Dict, Iterator, Optional, List
|
|
2
2
|
|
|
3
3
|
import elasticsearch_dsl
|
|
4
4
|
from elastic_transport import ObjectApiResponse
|
|
5
5
|
from elasticsearch import Elasticsearch
|
|
6
6
|
from elasticsearch.helpers import bulk
|
|
7
7
|
from elasticsearch_dsl import Index
|
|
8
|
+
from elasticsearch_dsl.response import Response
|
|
8
9
|
|
|
9
10
|
from .decorators import _elastic_connection
|
|
10
11
|
|
|
@@ -82,6 +83,63 @@ class KataElastic:
|
|
|
82
83
|
    def add_mapping(self, index_name: str, schema: dict):
        """Apply `schema` as the field mapping of index `index_name`.

        Returns the raw put-mapping acknowledgement from Elasticsearch.
        """
        index = Index(name=index_name)
        return index.put_mapping(body=schema, using=self.elasticsearch)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@_elastic_connection
|
|
89
|
+
def add_vector_mapping(
|
|
90
|
+
self,
|
|
91
|
+
index_name: str,
|
|
92
|
+
field: str,
|
|
93
|
+
schema: Optional[dict] = None,
|
|
94
|
+
dims: int = 1024
|
|
95
|
+
) -> dict:
|
|
96
|
+
vector_mapping = {
|
|
97
|
+
"properties": {
|
|
98
|
+
field: {
|
|
99
|
+
"type": "dense_vector",
|
|
100
|
+
"dims": dims
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
mapping = schema or vector_mapping
|
|
105
|
+
index = Index(name=index_name)
|
|
106
|
+
return index.put_mapping(body=mapping, using=self.elasticsearch)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@_elastic_connection
|
|
110
|
+
def add_ann_vector_mapping(
|
|
111
|
+
self,
|
|
112
|
+
index_name: str,
|
|
113
|
+
field: str,
|
|
114
|
+
schema: Optional[dict] = None,
|
|
115
|
+
dims: int = 1024
|
|
116
|
+
) -> dict:
|
|
117
|
+
vector_mapping = {
|
|
118
|
+
"properties": {
|
|
119
|
+
field: {
|
|
120
|
+
"type": "dense_vector",
|
|
121
|
+
"dims": dims,
|
|
122
|
+
"similarity": "cosine",
|
|
123
|
+
"index": True
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
mapping = schema or vector_mapping
|
|
128
|
+
index = Index(name=index_name)
|
|
129
|
+
return index.put_mapping(body=mapping, using=self.elasticsearch)
|
|
130
|
+
|
|
131
|
+
@_elastic_connection
|
|
132
|
+
def add_vector(
|
|
133
|
+
self,
|
|
134
|
+
index_name: str,
|
|
135
|
+
document_id: str,
|
|
136
|
+
vector: List[float],
|
|
137
|
+
field: str
|
|
138
|
+
) -> dict:
|
|
139
|
+
schema = {"doc": {field: vector}}
|
|
140
|
+
return self.elasticsearch.update(
|
|
141
|
+
index=index_name, id=document_id, body=schema, refresh="wait_for"
|
|
142
|
+
)
|
|
85
143
|
|
|
86
144
|
@_elastic_connection
|
|
87
145
|
def create_index(
|
|
@@ -170,6 +228,87 @@ class KataElastic:
|
|
|
170
228
|
s.scan(), key=lambda doc: [getattr(doc, field) for field in sort_fields]
|
|
171
229
|
)
|
|
172
230
|
return documents
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@_elastic_connection
|
|
234
|
+
def execute_fuzzy_search(
|
|
235
|
+
self,
|
|
236
|
+
index: str,
|
|
237
|
+
field: str,
|
|
238
|
+
entity: str,
|
|
239
|
+
fuzziness: int = 2,
|
|
240
|
+
prefix_length: int = 1,
|
|
241
|
+
max_expansions: int = 50
|
|
242
|
+
) -> Response:
|
|
243
|
+
"""Executes a fuzzy search.
|
|
244
|
+
:param: index str: Index to search from.
|
|
245
|
+
:param: entity str: Entity to search matches for.
|
|
246
|
+
:param: fuzziness int: Maximum edit distance for a match.
|
|
247
|
+
:param: prefix_length int: Number of characters in the prefix that
|
|
248
|
+
should overlap with the original entity's prefix.
|
|
249
|
+
:param: max_expansion int: maximum number of terms the fuzzy query
|
|
250
|
+
will match before halting the search
|
|
251
|
+
:return: Dict on search results.
|
|
252
|
+
"""
|
|
253
|
+
query_params = {
|
|
254
|
+
f"{field}.keyword": {
|
|
255
|
+
"value": entity,
|
|
256
|
+
"fuzziness": fuzziness,
|
|
257
|
+
"max_expansions": max_expansions,
|
|
258
|
+
"prefix_length": prefix_length
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
|
|
262
|
+
s = s.query("fuzzy", **query_params)
|
|
263
|
+
response = s.execute()
|
|
264
|
+
return response
|
|
265
|
+
|
|
266
|
+
def execute_vector_search(
|
|
267
|
+
self,
|
|
268
|
+
index: str,
|
|
269
|
+
field: str,
|
|
270
|
+
query_vector: List[float],
|
|
271
|
+
k: int = 10,
|
|
272
|
+
num_candidates: int = 100,
|
|
273
|
+
n_docs: int = 10,
|
|
274
|
+
elastic_ids: List[str] = []
|
|
275
|
+
) -> Response:
|
|
276
|
+
""" Execute a vector search.
|
|
277
|
+
NB! Works only with ANN mapping!
|
|
278
|
+
|
|
279
|
+
:param: index str: Index to search from.
|
|
280
|
+
:param: field str: Field containing vectorized data.
|
|
281
|
+
:param: query vector List[float]: Vector to search matches for.
|
|
282
|
+
:param: k int: Number of nearest neighbors to return.
|
|
283
|
+
:param: num_candidates int: Number of candidates considered before selecting k results.
|
|
284
|
+
:param: elastic_ids: List[str]: Elastic ID-s for restricting the search.
|
|
285
|
+
"""
|
|
286
|
+
|
|
287
|
+
s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
|
|
288
|
+
|
|
289
|
+
# Add kNN vector search
|
|
290
|
+
s = s.extra(
|
|
291
|
+
knn={
|
|
292
|
+
"field": field,
|
|
293
|
+
"query_vector": query_vector,
|
|
294
|
+
"k": k,
|
|
295
|
+
"num_candidates": num_candidates
|
|
296
|
+
}
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
# Add ID filtering, if elastic_ids are specified
|
|
300
|
+
if elastic_ids:
|
|
301
|
+
s = s.query(
|
|
302
|
+
elasticsearch_dsl.Q("terms", _id=elastic_ids)
|
|
303
|
+
)
|
|
304
|
+
|
|
305
|
+
# Sort by score and return `n_docs` best-matching documents
|
|
306
|
+
s = s.extra(size=n_docs)
|
|
307
|
+
|
|
308
|
+
# Execute the search
|
|
309
|
+
response = s.execute()
|
|
310
|
+
return response
|
|
311
|
+
|
|
173
312
|
|
|
174
313
|
def __str__(self) -> str:
|
|
175
314
|
return self.elasticsearch_url
|
rara_tools/utils.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from iso639 import Lang
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def lang_to_iso639_1(lang: str, unk_code: str = "unk") -> str:
    """ Converts a language into the ISO 639-1 standard.
    Input can be any language code in a valid ISO 639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO 639 standard or a language name.

    unk_code: str
        Code to return in case of an invalid/unsupported
        input language.

    Returns
    -------
    str
        Language code in the ISO 639-1 standard.
    """
    try:
        return Lang(lang).pt1
    except Exception:
        # A bare `except:` was used before; it would also have swallowed
        # SystemExit/KeyboardInterrupt.
        return unk_code
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def lang_to_iso639_2(lang: str, unk_code: str = "unk") -> str:
    """ Converts a language into the ISO 639-2 standard.
    Input can be any language code in a valid ISO 639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO 639 standard or a language name.

    unk_code: str
        Code to return in case of an invalid/unsupported
        input language.

    Returns
    -------
    str
        Language code in the ISO 639-2 standard.
    """
    try:
        # NB! Uses the bibliographic identifier (e.g. "de" -> "ger")
        # as opposed to the terminological identifier ("de" -> "deu").
        # This can be changed by replacing lg.pt2b -> lg.pt2t.
        lg = Lang(lang)
        return lg.pt2b
    except Exception:
        # A bare `except:` was used before; it would also have swallowed
        # SystemExit/KeyboardInterrupt.
        return unk_code
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def lang_to_iso639_3(lang: str, unk_code: str = "unk") -> str:
    """ Converts a language into the ISO 639-3 standard.
    Input can be any language code in a valid ISO 639
    standard or even a full name of the language,
    e.g. "Estonian".

    Parameters
    -----------
    lang: str
        Language code in any valid ISO 639 standard or a language name.

    unk_code: str
        Code to return in case of an invalid/unsupported
        input language.

    Returns
    -------
    str
        Language code in the ISO 639-3 standard.
    """
    try:
        return Lang(lang).pt3
    except Exception:
        # A bare `except:` was used before; it would also have swallowed
        # SystemExit/KeyboardInterrupt.
        return unk_code
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def ratio_to_percentage(ratio: float) -> str:
    """ Converts a ratio to the corresponding percentage string.

    Parameters
    -----------
    ratio: float
        Float in range [0, 1].

    Returns
    --------
    str
        Percentage corresponding to the ratio, e.g. 0.85 -> "85%".
    """
    # round() instead of int(): plain truncation would misreport
    # e.g. 0.999 as "99%".
    return f"{round(ratio * 100)}%"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: rara-tools
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.11
|
|
4
4
|
Summary: Tools to support Kata's work.
|
|
5
5
|
Classifier: Programming Language :: Python :: 3
|
|
6
6
|
Classifier: Programming Language :: Python :: 3.10
|
|
@@ -14,6 +14,7 @@ Requires-Dist: elasticsearch==8.*
|
|
|
14
14
|
Requires-Dist: elasticsearch_dsl==8.*
|
|
15
15
|
Requires-Dist: minio==7.*
|
|
16
16
|
Requires-Dist: requests
|
|
17
|
+
Requires-Dist: iso639-lang
|
|
17
18
|
Provides-Extra: testing
|
|
18
19
|
Requires-Dist: pytest>=8.0; extra == "testing"
|
|
19
20
|
Requires-Dist: pytest-order; extra == "testing"
|
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
rara_tools/converters.py,sha256=JcS74VzV6jm12l3C6aqMJBY9nuVW_aevQeCe32KmfrE,1576
|
|
2
2
|
rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
|
|
3
|
-
rara_tools/
|
|
3
|
+
rara_tools/digar_schema_converter.py,sha256=k95U2iRlEA3sh772-v6snhHW6fju6qSTMnvWJ6DpzZk,14254
|
|
4
|
+
rara_tools/elastic.py,sha256=LZfHZqeTDjCEb5YX4CLPJEFffRSZAcRq6AtyP49Fo0E,11575
|
|
4
5
|
rara_tools/exceptions.py,sha256=BwNh4qWxau_ylr9RqZoYwd1KnExI6oWWWDno3jkh8q4,474
|
|
5
6
|
rara_tools/s3.py,sha256=uNDu2HzMYHAWh33RcHeyPFK7gdQfQPxsdfohyIKezEY,4467
|
|
6
7
|
rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
|
|
8
|
+
rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
|
|
7
9
|
rara_tools/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
10
|
rara_tools/constants/digitizer.py,sha256=gJ3jOMwuZfKcLqgOAxTyB266VYsskLabJiMUiSz3xX4,297
|
|
9
11
|
rara_tools/constants/general.py,sha256=E9Jaw-YxocS_tOZw9QBoxO3e9KK5EMbLoM0R7D4Iflw,171
|
|
10
|
-
rara_tools-0.0.
|
|
11
|
-
rara_tools-0.0.
|
|
12
|
-
rara_tools-0.0.
|
|
13
|
-
rara_tools-0.0.
|
|
14
|
-
rara_tools-0.0.
|
|
12
|
+
rara_tools-0.0.11.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
13
|
+
rara_tools-0.0.11.dist-info/METADATA,sha256=pDcladCQ1A9O9Wh4UDSh0eHwNyqcGY1BOwPxSJKpLFk,3895
|
|
14
|
+
rara_tools-0.0.11.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
15
|
+
rara_tools-0.0.11.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
16
|
+
rara_tools-0.0.11.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|