rara-tools 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -3,7 +3,7 @@ from typing import List, NoReturn
3
3
 
4
4
  from rara_tools.utils import lang_to_iso639_2, ratio_to_percentage
5
5
 
6
- GENERAL_DOC_IDENTIFIER = "Filepath"
6
+
7
7
  UNDEFINED_LANGUAGE_VALUE = "unk"
8
8
  QUALITY_RATIO_TYPE = "Float"
9
9
 
@@ -104,13 +104,15 @@ class DocSchemas:
104
104
  generated_id: str = "",
105
105
  permalink: str = "",
106
106
  min_language_ratio: float = 0.2,
107
- convert_ratio: bool = True
107
+ convert_ratio: bool = True,
108
+ generated_id_type: str = "CustomID"
108
109
  ) -> NoReturn:
109
110
  self.__convert_ratio = convert_ratio
110
111
  self.__min_language_ratio = min_language_ratio
111
112
  self.__sierra_id = sierra_id
112
113
  self.__generated_id = generated_id
113
114
  self.__permalink = permalink
115
+ self.__generated_id_type = generated_id_type
114
116
  self.__doc_meta = doc_meta
115
117
  self.__ocr_accuracy_schema: dict = {}
116
118
  self.__text_quality_schema: dict = {}
@@ -209,7 +211,7 @@ class DocSchemas:
209
211
  identifiers.append(
210
212
  {
211
213
  "@type": "Identifier",
212
- "qualifier": GENERAL_DOC_IDENTIFIER,
214
+ "qualifier": self.__generated_id_type,
213
215
  "value": self.__generated_id
214
216
  }
215
217
  )
@@ -235,6 +237,7 @@ class DIGARSchemaConverter:
235
237
  generated_id: str,
236
238
  sierra_id: str = "",
237
239
  permalink: str = "",
240
+ generated_id_type: str = "CustomID",
238
241
  min_language_ratio: float = 0.2,
239
242
  convert_ratio: bool = False
240
243
  ) -> NoReturn:
@@ -250,6 +253,8 @@ class DIGARSchemaConverter:
250
253
  Document's corresponding Sierra ID.
251
254
  permalink: str
252
255
  Permanent link, where the document can be accessed.
256
+ generated_id_type: str
257
+ Method / type of generated ID (e.g. 'UUID')
253
258
  min_language_ratio: float
254
259
  Cutoff ratio for languages. If ratio for some language
255
260
  does not exceed the set threshold, the language will not
@@ -264,6 +269,7 @@ class DIGARSchemaConverter:
264
269
  self.__sierra_id: str = sierra_id
265
270
  self.__generated_id: str = generated_id
266
271
  self.__permalink: str = permalink.removesuffix("/")
272
+ self.__generated_id_type: str = generated_id_type
267
273
  self.__texts: List[dict] = []
268
274
  self.__images: List[dict] = []
269
275
  self.__doc_meta: dict = {}
@@ -281,7 +287,8 @@ class DIGARSchemaConverter:
281
287
  generated_id=self.__generated_id,
282
288
  permalink=self.__permalink,
283
289
  min_language_ratio=self.__min_language_ratio,
284
- convert_ratio=self.__convert_ratio
290
+ convert_ratio=self.__convert_ratio,
291
+ generated_id_type=self.__generated_id_type
285
292
  )
286
293
  self.__digar_schema: dict = {}
287
294
 
rara_tools/elastic.py CHANGED
@@ -1,10 +1,11 @@
1
- from typing import Any, Dict, Iterator, Optional
1
+ from typing import Any, Dict, Iterator, Optional, List
2
2
 
3
3
  import elasticsearch_dsl
4
4
  from elastic_transport import ObjectApiResponse
5
5
  from elasticsearch import Elasticsearch
6
6
  from elasticsearch.helpers import bulk
7
7
  from elasticsearch_dsl import Index
8
+ from elasticsearch_dsl.response import Response
8
9
 
9
10
  from .decorators import _elastic_connection
10
11
 
@@ -82,6 +83,63 @@ class KataElastic:
82
83
  def add_mapping(self, index_name: str, schema: dict):
83
84
  index = Index(name=index_name)
84
85
  return index.put_mapping(body=schema, using=self.elasticsearch)
86
+
87
+
88
@_elastic_connection
def add_vector_mapping(
        self,
        index_name: str,
        field: str,
        schema: Optional[dict] = None,
        dims: int = 1024
) -> dict:
    """Put a dense-vector mapping on an index.

    :param: index_name str: Index that receives the mapping.
    :param: field str: Name of the vector field; only used when the
        default mapping is built.
    :param: schema Optional[dict]: Mapping body to send instead of the
        default one. Any falsy value (None, empty dict) selects the default.
    :param: dims int: Number of dimensions written into the default mapping.
    :return: The result of `Index.put_mapping`.
    """
    if schema:
        body = schema
    else:
        # Default: a plain `dense_vector` field with the requested size.
        body = {
            "properties": {
                field: {"type": "dense_vector", "dims": dims}
            }
        }
    target = Index(name=index_name)
    return target.put_mapping(body=body, using=self.elasticsearch)
107
+
108
+
109
@_elastic_connection
def add_ann_vector_mapping(
        self,
        index_name: str,
        field: str,
        schema: Optional[dict] = None,
        dims: int = 1024,
        similarity: str = "cosine"
) -> dict:
    """Put an indexed dense-vector mapping (for ANN / kNN search) on an index.

    :param: index_name str: Index that receives the mapping.
    :param: field str: Name of the vector field; only used when the
        default mapping is built.
    :param: schema Optional[dict]: Mapping body to send instead of the
        default one. Any falsy value (None, empty dict) selects the default.
    :param: dims int: Number of dimensions written into the default mapping.
    :param: similarity str: Value written into the field's `similarity`
        setting in the default mapping. Defaults to "cosine", which was
        previously hard-coded.
    :return: The result of `Index.put_mapping`.
    """
    if schema:
        body = schema
    else:
        # Default: a `dense_vector` field with "index": True and an
        # explicit similarity metric.
        body = {
            "properties": {
                field: {
                    "type": "dense_vector",
                    "dims": dims,
                    "similarity": similarity,
                    "index": True
                }
            }
        }
    target = Index(name=index_name)
    return target.put_mapping(body=body, using=self.elasticsearch)
130
+
131
@_elastic_connection
def add_vector(
        self,
        index_name: str,
        document_id: str,
        vector: List[float],
        field: str
) -> dict:
    """Store a vector on an existing document via a partial update.

    :param: index_name str: Index holding the document.
    :param: document_id str: ID of the document to update.
    :param: vector List[float]: Vector to write.
    :param: field str: Name of the field the vector is written to.
    :return: The result of the client's `update` call.
    """
    partial_doc = {field: vector}
    # refresh="wait_for" is passed through unchanged to the update request.
    return self.elasticsearch.update(
        index=index_name,
        id=document_id,
        body={"doc": partial_doc},
        refresh="wait_for"
    )
85
143
 
86
144
  @_elastic_connection
87
145
  def create_index(
@@ -170,6 +228,87 @@ class KataElastic:
170
228
  s.scan(), key=lambda doc: [getattr(doc, field) for field in sort_fields]
171
229
  )
172
230
  return documents
231
+
232
+
233
@_elastic_connection
def execute_fuzzy_search(
        self,
        index: str,
        field: str,
        entity: str,
        fuzziness: int = 2,
        prefix_length: int = 1,
        max_expansions: int = 50
) -> Response:
    """Run a fuzzy query against the `.keyword` sub-field of a field.

    :param: index str: Index to search from.
    :param: field str: Field to match against; the query targets
        `<field>.keyword`.
    :param: entity str: Entity to search matches for.
    :param: fuzziness int: Maximum edit distance for a match.
    :param: prefix_length int: Number of characters in the prefix that
        should overlap with the original entity's prefix.
    :param: max_expansions int: Maximum number of terms the fuzzy query
        will match before halting the search.
    :return: Response object holding the search results.
    """
    target_field = f"{field}.keyword"
    fuzzy_options = {
        "value": entity,
        "fuzziness": fuzziness,
        "max_expansions": max_expansions,
        "prefix_length": prefix_length
    }
    search = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
    # The field name contains a dot, so it has to be passed as an
    # unpacked mapping rather than a literal keyword argument.
    search = search.query("fuzzy", **{target_field: fuzzy_options})
    return search.execute()
265
+
266
# NOTE(review): unlike the neighbouring search/mapping methods, this one is
# not wrapped in @_elastic_connection - confirm whether that is intentional.
def execute_vector_search(
        self,
        index: str,
        field: str,
        query_vector: List[float],
        k: int = 10,
        num_candidates: int = 100,
        n_docs: int = 10,
        elastic_ids: Optional[List[str]] = None
) -> Response:
    """Execute a vector (kNN) search.
    NB! Works only with ANN mapping!

    :param: index str: Index to search from.
    :param: field str: Field containing vectorized data.
    :param: query_vector List[float]: Vector to search matches for.
    :param: k int: Number of nearest neighbors to return.
    :param: num_candidates int: Number of candidates considered before
        selecting k results.
    :param: n_docs int: Value sent as the request's `size`, i.e. the
        maximum number of hits in the response.
    :param: elastic_ids Optional[List[str]]: Elastic ID-s for restricting
        the search. None or an empty list means no restriction.
    :return: Response object holding the search results.
    """
    knn_clause = {
        "field": field,
        "query_vector": query_vector,
        "k": k,
        "num_candidates": num_candidates
    }

    # The ID restriction has to live inside the kNN clause: a top-level
    # `query` next to `knn` is combined with it as a disjunction, which
    # would add documents instead of restricting the neighbours.
    if elastic_ids:
        knn_clause["filter"] = {"terms": {"_id": list(elastic_ids)}}

    s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
    s = s.extra(knn=knn_clause, size=n_docs)

    return s.execute()
311
+
173
312
 
174
313
  def __str__(self) -> str:
175
314
  return self.elasticsearch_url
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: rara-tools
3
- Version: 0.0.10
3
+ Version: 0.0.11
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -1,7 +1,7 @@
1
1
  rara_tools/converters.py,sha256=JcS74VzV6jm12l3C6aqMJBY9nuVW_aevQeCe32KmfrE,1576
2
2
  rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
3
- rara_tools/digar_schema_converter.py,sha256=gGwhqdwxyTXODF0LP5Xi0u8uRoICfaIU3MRe1EVBnEc,13935
4
- rara_tools/elastic.py,sha256=vEvrbIPRtdqTdrNrPH2cewHLMfOTSf87a4JOiRQgYyA,7146
3
+ rara_tools/digar_schema_converter.py,sha256=k95U2iRlEA3sh772-v6snhHW6fju6qSTMnvWJ6DpzZk,14254
4
+ rara_tools/elastic.py,sha256=LZfHZqeTDjCEb5YX4CLPJEFffRSZAcRq6AtyP49Fo0E,11575
5
5
  rara_tools/exceptions.py,sha256=BwNh4qWxau_ylr9RqZoYwd1KnExI6oWWWDno3jkh8q4,474
6
6
  rara_tools/s3.py,sha256=uNDu2HzMYHAWh33RcHeyPFK7gdQfQPxsdfohyIKezEY,4467
7
7
  rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
@@ -9,8 +9,8 @@ rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
9
9
  rara_tools/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  rara_tools/constants/digitizer.py,sha256=gJ3jOMwuZfKcLqgOAxTyB266VYsskLabJiMUiSz3xX4,297
11
11
  rara_tools/constants/general.py,sha256=E9Jaw-YxocS_tOZw9QBoxO3e9KK5EMbLoM0R7D4Iflw,171
12
- rara_tools-0.0.10.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
13
- rara_tools-0.0.10.dist-info/METADATA,sha256=jV6nZKhjjwDL6TWt-fKWudWNUAViZTVDL0J39fefFtM,3895
14
- rara_tools-0.0.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
- rara_tools-0.0.10.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
16
- rara_tools-0.0.10.dist-info/RECORD,,
12
+ rara_tools-0.0.11.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
13
+ rara_tools-0.0.11.dist-info/METADATA,sha256=pDcladCQ1A9O9Wh4UDSh0eHwNyqcGY1BOwPxSJKpLFk,3895
14
+ rara_tools-0.0.11.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
+ rara_tools-0.0.11.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
16
+ rara_tools-0.0.11.dist-info/RECORD,,