rara-tools 0.6.14__tar.gz → 0.6.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- {rara_tools-0.6.14/rara_tools.egg-info → rara_tools-0.6.16}/PKG-INFO +1 -1
- rara_tools-0.6.16/VERSION +1 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/elastic.py +46 -36
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/normalizers/base.py +9 -2
- {rara_tools-0.6.14 → rara_tools-0.6.16/rara_tools.egg-info}/PKG-INFO +1 -1
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_elastic.py +22 -0
- rara_tools-0.6.14/VERSION +0 -1
- {rara_tools-0.6.14 → rara_tools-0.6.16}/LICENSE.md +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/README.md +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/pyproject.toml +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/constants/__init__.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/constants/digitizer.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/constants/general.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/constants/language_evaluator.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/constants/linker.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/constants/meta_extractor.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/constants/normalizers.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/constants/parsers.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/constants/subject_indexer.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/converters.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/decorators.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/digar_schema_converter.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/exceptions.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/formatters.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/normalizers/__init__.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/normalizers/authorities.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/normalizers/bibs.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/normalizers/viaf.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_records/base_record.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_records/ems_record.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_records/organization_record.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_records/person_record.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_records/title_record.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/tools/marc_converter.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/s3.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/task_reporter.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/utils.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools.egg-info/SOURCES.txt +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools.egg-info/dependency_links.txt +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools.egg-info/requires.txt +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools.egg-info/top_level.txt +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/requirements.txt +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/setup.cfg +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_digar_schema_converter.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_elastic_vector_and_search_operations.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_entity_normalizers.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_formatters.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_marc_parsers.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_normalization.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_s3_exceptions.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_s3_file_operations.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_sierra_converters.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_task_reporter.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_utils.py +0 -0
- {rara_tools-0.6.14 → rara_tools-0.6.16}/tests/test_viaf_client.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.6.16
|
|
@@ -4,7 +4,7 @@ import elasticsearch_dsl
|
|
|
4
4
|
from elastic_transport import ObjectApiResponse
|
|
5
5
|
from elasticsearch import Elasticsearch
|
|
6
6
|
from elasticsearch.helpers import bulk
|
|
7
|
-
from elasticsearch_dsl import Index
|
|
7
|
+
from elasticsearch_dsl import Index, Search, Q
|
|
8
8
|
from elasticsearch_dsl.response import Response
|
|
9
9
|
|
|
10
10
|
from .decorators import _elastic_connection
|
|
@@ -84,20 +84,30 @@ class KataElastic:
|
|
|
84
84
|
def add_mapping(self, index_name: str, schema: dict):
|
|
85
85
|
index = Index(name=index_name)
|
|
86
86
|
return index.put_mapping(body=schema, using=self.elasticsearch)
|
|
87
|
-
|
|
88
|
-
|
|
87
|
+
|
|
88
|
+
@_elastic_connection
def delete_by_query(self, index: str, query_kwargs: dict, query_type: str = "term", wait_for_completion: bool = True):
    """Deletes all documents in an index that match a single query clause.

    :param: index str: Name of the index to delete documents from.
    :param: query_kwargs dict: Keyword arguments used to build the query
        clause, e.g. {"doc_id.keyword": "some-id"}.
    :param: query_type str: Type of the query clause built from
        `query_kwargs` (defaults to "term").
    :param: wait_for_completion bool: Forwarded to Elasticsearch's
        delete-by-query call; see the Elasticsearch documentation for the
        behaviour when it is set to False.
    :return: Response of the delete-by-query request.
    """
    query = Q(query_type, **query_kwargs)
    # Route the clause through a Search object so the request body has the
    # same shape that elasticsearch_dsl would send for a regular search.
    s = Search(using=self.elasticsearch, index=index).query(query)
    response = self.elasticsearch.delete_by_query(
        index=index,
        body={"query": s.to_dict()["query"]},
        # Pass the caller's value through instead of a hard-coded True,
        # otherwise the parameter has no effect.
        wait_for_completion=wait_for_completion
    )
    return response
|
|
98
|
+
|
|
89
99
|
@_elastic_connection
|
|
90
100
|
def add_vector_mapping(
|
|
91
|
-
self,
|
|
92
|
-
index_name: str,
|
|
93
|
-
field: str,
|
|
94
|
-
schema: Optional[dict] = None,
|
|
101
|
+
self,
|
|
102
|
+
index_name: str,
|
|
103
|
+
field: str,
|
|
104
|
+
schema: Optional[dict] = None,
|
|
95
105
|
dims: int = 1024
|
|
96
106
|
) -> dict:
|
|
97
107
|
vector_mapping = {
|
|
98
108
|
"properties": {
|
|
99
109
|
field: {
|
|
100
|
-
"type": "dense_vector",
|
|
110
|
+
"type": "dense_vector",
|
|
101
111
|
"dims": dims
|
|
102
112
|
}
|
|
103
113
|
}
|
|
@@ -105,22 +115,21 @@ class KataElastic:
|
|
|
105
115
|
mapping = schema or vector_mapping
|
|
106
116
|
index = Index(name=index_name)
|
|
107
117
|
return index.put_mapping(body=mapping, using=self.elasticsearch)
|
|
108
|
-
|
|
109
|
-
|
|
118
|
+
|
|
110
119
|
@_elastic_connection
|
|
111
120
|
def add_ann_vector_mapping(
|
|
112
|
-
self,
|
|
121
|
+
self,
|
|
113
122
|
index_name: str,
|
|
114
123
|
field: str,
|
|
115
|
-
schema: Optional[dict] = None,
|
|
124
|
+
schema: Optional[dict] = None,
|
|
116
125
|
dims: int = 1024
|
|
117
126
|
) -> dict:
|
|
118
127
|
vector_mapping = {
|
|
119
128
|
"properties": {
|
|
120
129
|
field: {
|
|
121
|
-
"type": "dense_vector",
|
|
122
|
-
"dims": dims,
|
|
123
|
-
"similarity": "cosine",
|
|
130
|
+
"type": "dense_vector",
|
|
131
|
+
"dims": dims,
|
|
132
|
+
"similarity": "cosine",
|
|
124
133
|
"index": True
|
|
125
134
|
}
|
|
126
135
|
}
|
|
@@ -131,15 +140,19 @@ class KataElastic:
|
|
|
131
140
|
|
|
132
141
|
@_elastic_connection
def add_vector(
    self,
    index_name: str,
    document_id: str,
    vector: List[float],
    field: str,
    refresh: str = "wait_for"
) -> dict:
    """Stores a vector on an existing document via a partial update.

    :param: index_name str: Index that holds the document.
    :param: document_id str: ID of the document to update.
    :param: vector List[float]: Vector values written into the document.
    :param: field str: Name of the document field that receives the vector.
    :param: refresh str: Refresh policy forwarded to the update request.
    :return: Response of the update request.
    """
    # Partial-update payload: only `field` is set on the document.
    partial_document = {"doc": {field: vector}}
    update_response = self.elasticsearch.update(
        index=index_name,
        id=document_id,
        body=partial_document,
        refresh=refresh
    )
    return update_response
|
|
144
157
|
|
|
145
158
|
@_elastic_connection
|
|
@@ -204,7 +217,7 @@ class KataElastic:
|
|
|
204
217
|
actions = [{"_index": last_index_name, "_source": document} for document in documents]
|
|
205
218
|
successful_count, error_count = bulk(actions=actions, client=self.elasticsearch, max_retries=3, refresh=refresh)
|
|
206
219
|
return successful_count, error_count
|
|
207
|
-
|
|
220
|
+
|
|
208
221
|
@_elastic_connection
|
|
209
222
|
def bulk_index_without_rollver(
|
|
210
223
|
self,
|
|
@@ -240,16 +253,15 @@ class KataElastic:
|
|
|
240
253
|
s.scan(), key=lambda doc: [getattr(doc, field) for field in sort_fields]
|
|
241
254
|
)
|
|
242
255
|
return documents
|
|
243
|
-
|
|
244
|
-
|
|
256
|
+
|
|
245
257
|
@_elastic_connection
|
|
246
258
|
def execute_fuzzy_search(
|
|
247
|
-
self,
|
|
259
|
+
self,
|
|
248
260
|
index: str,
|
|
249
261
|
field: str,
|
|
250
|
-
entity: str,
|
|
251
|
-
fuzziness: int = 2,
|
|
252
|
-
prefix_length: int = 1,
|
|
262
|
+
entity: str,
|
|
263
|
+
fuzziness: int = 2,
|
|
264
|
+
prefix_length: int = 1,
|
|
253
265
|
max_expansions: int = 50
|
|
254
266
|
) -> Response:
|
|
255
267
|
"""Executes a fuzzy search.
|
|
@@ -261,7 +273,7 @@ class KataElastic:
|
|
|
261
273
|
:param: max_expansions int: maximum number of terms the fuzzy query
|
|
262
274
|
will match before halting the search
|
|
263
275
|
:return: Response containing the search results.
|
|
264
|
-
"""
|
|
276
|
+
"""
|
|
265
277
|
query_params = {
|
|
266
278
|
f"{field}.keyword": {
|
|
267
279
|
"value": entity,
|
|
@@ -274,7 +286,7 @@ class KataElastic:
|
|
|
274
286
|
s = s.query("fuzzy", **query_params)
|
|
275
287
|
response = s.execute()
|
|
276
288
|
return response
|
|
277
|
-
|
|
289
|
+
|
|
278
290
|
def execute_ann_vector_search(
|
|
279
291
|
self,
|
|
280
292
|
index: str,
|
|
@@ -303,8 +315,8 @@ class KataElastic:
|
|
|
303
315
|
s = s.extra(
|
|
304
316
|
knn={
|
|
305
317
|
"field": field,
|
|
306
|
-
"query_vector": query_vector,
|
|
307
|
-
"k": k,
|
|
318
|
+
"query_vector": query_vector,
|
|
319
|
+
"k": k,
|
|
308
320
|
"num_candidates": num_candidates
|
|
309
321
|
}
|
|
310
322
|
)
|
|
@@ -314,15 +326,14 @@ class KataElastic:
|
|
|
314
326
|
s = s.query(
|
|
315
327
|
elasticsearch_dsl.Q("terms", _id=elastic_ids)
|
|
316
328
|
)
|
|
317
|
-
|
|
329
|
+
|
|
318
330
|
# Sort by score and return `n_docs` best-matching documents
|
|
319
331
|
s = s.extra(size=n_docs)
|
|
320
332
|
|
|
321
333
|
# Execute the search
|
|
322
334
|
response = s.execute()
|
|
323
335
|
return response
|
|
324
|
-
|
|
325
|
-
|
|
336
|
+
|
|
326
337
|
def execute_script_score_vector_search(
|
|
327
338
|
self,
|
|
328
339
|
index: str,
|
|
@@ -341,7 +352,7 @@ class KataElastic:
|
|
|
341
352
|
:param: elastic_ids: List[str]: Elastic ID-s for restricting the search.
|
|
342
353
|
"""
|
|
343
354
|
s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
|
|
344
|
-
|
|
355
|
+
|
|
345
356
|
if elastic_ids:
|
|
346
357
|
query = elasticsearch_dsl.Q("terms", _id=elastic_ids)
|
|
347
358
|
else:
|
|
@@ -364,6 +375,5 @@ class KataElastic:
|
|
|
364
375
|
response = s.execute()
|
|
365
376
|
return response
|
|
366
377
|
|
|
367
|
-
|
|
368
378
|
def __str__(self) -> str:
|
|
369
379
|
return self.elasticsearch_url
|
|
@@ -280,8 +280,15 @@ class RecordNormalizer:
|
|
|
280
280
|
|
|
281
281
|
@property
def data(self) -> List[dict]:
    """Collect every normalized record as a dict, leaving out failed ones.

    Iterates over this normalizer and converts each record with
    `as_dict()`. A record whose conversion raises is logged and skipped
    instead of aborting the whole collection.
    """
    normalized = []
    for item in self:
        try:
            as_dict = item.as_dict()
        except Exception as e:
            # Best effort: report the failure and move on to the next record.
            logger.error(f"Failed to normalize record: {e}")
        else:
            normalized.append(as_dict)
    return normalized
|
|
285
292
|
|
|
286
293
|
def __iter__(self) -> Iterator:
|
|
287
294
|
viaf_id_path = "viaf.queryResult.records.record.0.recordData.VIAFCluster.viafID"
|
|
@@ -4,7 +4,9 @@ import time
|
|
|
4
4
|
import uuid
|
|
5
5
|
from time import sleep
|
|
6
6
|
|
|
7
|
+
import elasticsearch_dsl
|
|
7
8
|
import pytest
|
|
9
|
+
|
|
8
10
|
from rara_tools.elastic import KataElastic
|
|
9
11
|
|
|
10
12
|
with open("./tests/test_data/elastic_docs.json") as fh:
|
|
@@ -28,6 +30,7 @@ def test_index_creation():
|
|
|
28
30
|
assert created["acknowledged"] is True
|
|
29
31
|
time.sleep(2)
|
|
30
32
|
|
|
33
|
+
|
|
31
34
|
@pytest.mark.order(2)
|
|
32
35
|
def test_check():
|
|
33
36
|
"""Tests health check method.
|
|
@@ -119,6 +122,25 @@ def test_document_deleting():
|
|
|
119
122
|
result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "bar")
|
|
120
123
|
assert len(result) == 0
|
|
121
124
|
|
|
125
|
+
unique_id = uuid.uuid4().hex
|
|
126
|
+
document_amount = 10
|
|
127
|
+
documents = [{"doc_id": unique_id, "page": x} for x in range(document_amount)]
|
|
128
|
+
ELASTIC.bulk_index(documents, TEST_INDEX_NAME, rollover_limit=100, refresh="wait_for")
|
|
129
|
+
query = elasticsearch_dsl.Q("term", **{"doc_id.keyword": unique_id})
|
|
130
|
+
search = elasticsearch_dsl.Search(using=ELASTIC.elasticsearch, index=TEST_DOCUMENT_INDEX).query(query)
|
|
131
|
+
count = [hit.to_dict() for hit in search.scan()]
|
|
132
|
+
assert len(count) == document_amount
|
|
133
|
+
response = ELASTIC.delete_by_query(TEST_DOCUMENT_INDEX, {"doc_id.keyword": unique_id})
|
|
134
|
+
|
|
135
|
+
attempt = 0
|
|
136
|
+
while attempt < 3 and count != 0:
|
|
137
|
+
search = elasticsearch_dsl.Search(using=ELASTIC.elasticsearch, index=TEST_DOCUMENT_INDEX).query(query)
|
|
138
|
+
count = [hit.to_dict() for hit in search.scan()]
|
|
139
|
+
time.sleep(3)
|
|
140
|
+
attempt += 1
|
|
141
|
+
|
|
142
|
+
assert len(count) == 0
|
|
143
|
+
|
|
122
144
|
|
|
123
145
|
@pytest.mark.order(8)
|
|
124
146
|
def test_index_deleting():
|
rara_tools-0.6.14/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.6.14
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_parsers/organization_parser.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rara_tools-0.6.14 → rara_tools-0.6.16}/rara_tools/parsers/marc_records/organization_record.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|