rara-tools 0.0.11__tar.gz → 0.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- {rara_tools-0.0.11/rara_tools.egg-info → rara_tools-0.0.13}/PKG-INFO +1 -1
- rara_tools-0.0.13/VERSION +1 -0
- rara_tools-0.0.13/rara_tools/converters.py +81 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools/elastic.py +44 -1
- {rara_tools-0.0.11 → rara_tools-0.0.13/rara_tools.egg-info}/PKG-INFO +1 -1
- {rara_tools-0.0.11 → rara_tools-0.0.13}/tests/test_converters.py +33 -11
- {rara_tools-0.0.11 → rara_tools-0.0.13}/tests/test_elastic_vector_and_search_operations.py +31 -4
- rara_tools-0.0.11/VERSION +0 -1
- rara_tools-0.0.11/rara_tools/converters.py +0 -41
- {rara_tools-0.0.11 → rara_tools-0.0.13}/LICENSE.md +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/README.md +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/pyproject.toml +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools/constants/__init__.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools/constants/digitizer.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools/constants/general.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools/decorators.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools/digar_schema_converter.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools/exceptions.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools/s3.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools/task_reporter.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools/utils.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools.egg-info/SOURCES.txt +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools.egg-info/dependency_links.txt +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools.egg-info/requires.txt +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/rara_tools.egg-info/top_level.txt +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/requirements.txt +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/setup.cfg +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/tests/test_digar_schema_converter.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/tests/test_elastic.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/tests/test_s3_exceptions.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/tests/test_s3_file_operations.py +0 -0
- {rara_tools-0.0.11 → rara_tools-0.0.13}/tests/test_task_reporter.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.0.13
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from .exceptions import SierraResponseConverterException
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SierraResponseConverter:
|
|
5
|
+
"""Converts a JSON response from the Sierra API to MARC-in-JSON format."""
|
|
6
|
+
|
|
7
|
+
def __init__(self, response: dict):
|
|
8
|
+
if not isinstance(response, dict):
|
|
9
|
+
raise SierraResponseConverterException("Please provide a valid JSON response.")
|
|
10
|
+
self.response = response
|
|
11
|
+
|
|
12
|
+
def _map_control_fields(self, field: dict) -> dict:
|
|
13
|
+
# for tags < 010, no subfields, instead one str value in "value"
|
|
14
|
+
return {field["tag"]: field["value"]}
|
|
15
|
+
|
|
16
|
+
def _map_data_fields(self, field: dict) -> dict:
|
|
17
|
+
""" Maps marc fields >= 010.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
field (dict): Contains the marc tag and list with indicators and subfields.
|
|
21
|
+
|
|
22
|
+
Returns:
|
|
23
|
+
dict: standardised marc-in-json format.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
data = field["data"]
|
|
27
|
+
|
|
28
|
+
# Order matters: ind1, ind2, subfields
|
|
29
|
+
field_data = {
|
|
30
|
+
"ind1": data.get("ind1", " "),
|
|
31
|
+
"ind2": data.get("ind2", " "),
|
|
32
|
+
"subfields": data.get("subfields", [])
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
return {field["tag"]: field_data}
|
|
36
|
+
|
|
37
|
+
def _is_marc21structured(self, field: dict) -> bool:
|
|
38
|
+
"""Checks if the field is already structured according to MARC21 in JSON"""
|
|
39
|
+
return any(key.isdigit() for key in field.keys())
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _handle_field_type(self, field: dict) -> dict:
|
|
43
|
+
|
|
44
|
+
if self._is_marc21structured(field):
|
|
45
|
+
return field
|
|
46
|
+
|
|
47
|
+
if field.get("data"):
|
|
48
|
+
return self._map_data_fields(field)
|
|
49
|
+
|
|
50
|
+
tag = field.get("tag")
|
|
51
|
+
|
|
52
|
+
if not tag:
|
|
53
|
+
raise SierraResponseConverterException("Field is missing MARC21 tag.")
|
|
54
|
+
|
|
55
|
+
if tag < "010":
|
|
56
|
+
return self._map_control_fields(field)
|
|
57
|
+
else:
|
|
58
|
+
return self._map_data_fields(field)
|
|
59
|
+
|
|
60
|
+
def _convert_response(self) -> list:
|
|
61
|
+
entries = self.response.get("entries")
|
|
62
|
+
if not entries:
|
|
63
|
+
raise SierraResponseConverterException("No entries found in the response.")
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
return {"fields": [
|
|
67
|
+
{e["id"]: [
|
|
68
|
+
self._handle_field_type(f) for f in e["marc"]["fields"]
|
|
69
|
+
]}
|
|
70
|
+
for e in entries
|
|
71
|
+
]}
|
|
72
|
+
|
|
73
|
+
except KeyError as e:
|
|
74
|
+
raise SierraResponseConverterException(f"Malformed response: missing key {e}")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def convert(self) -> list:
|
|
78
|
+
try:
|
|
79
|
+
return self._convert_response()
|
|
80
|
+
except Exception as e:
|
|
81
|
+
raise SierraResponseConverterException(f"An unexpected error occurred: {e}")
|
|
@@ -263,7 +263,7 @@ class KataElastic:
|
|
|
263
263
|
response = s.execute()
|
|
264
264
|
return response
|
|
265
265
|
|
|
266
|
-
def
|
|
266
|
+
def execute_ann_vector_search(
|
|
267
267
|
self,
|
|
268
268
|
index: str,
|
|
269
269
|
field: str,
|
|
@@ -281,6 +281,7 @@ class KataElastic:
|
|
|
281
281
|
:param: query vector List[float]: Vector to search matches for.
|
|
282
282
|
:param: k int: Number of nearest neighbors to return.
|
|
283
283
|
:param: num_candidates int: Number of candidates considered before selecting k results.
|
|
284
|
+
:param: n_docs: int: Number of documents to return.
|
|
284
285
|
:param: elastic_ids: List[str]: Elastic ID-s for restricting the search.
|
|
285
286
|
"""
|
|
286
287
|
|
|
@@ -308,6 +309,48 @@ class KataElastic:
|
|
|
308
309
|
# Execute the search
|
|
309
310
|
response = s.execute()
|
|
310
311
|
return response
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def execute_script_score_vector_search(
|
|
315
|
+
self,
|
|
316
|
+
index: str,
|
|
317
|
+
field: str,
|
|
318
|
+
query_vector: List[float],
|
|
319
|
+
n_docs: int = 10,
|
|
320
|
+
elastic_ids: List[str] = []
|
|
321
|
+
) -> Response:
|
|
322
|
+
""" Execute a vector search.
|
|
323
|
+
NB! Requires different mapping than ANN!
|
|
324
|
+
|
|
325
|
+
:param: index str: Index to search from.
|
|
326
|
+
:param: field str: Field containing vectorized data.
|
|
327
|
+
:param: query vector List[float]: Vector to search matches for.
|
|
328
|
+
:param: n_docs: int: Number of documents to return.
|
|
329
|
+
:param: elastic_ids: List[str]: Elastic ID-s for restricting the search.
|
|
330
|
+
"""
|
|
331
|
+
s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
|
|
332
|
+
|
|
333
|
+
if elastic_ids:
|
|
334
|
+
query = elasticsearch_dsl.Q("terms", _id=elastic_ids)
|
|
335
|
+
else:
|
|
336
|
+
query = elasticsearch_dsl.Q("match_all")
|
|
337
|
+
# Apply script_score query
|
|
338
|
+
s = s.query(
|
|
339
|
+
"script_score",
|
|
340
|
+
query=query,
|
|
341
|
+
script={
|
|
342
|
+
"source": f"1.0 + cosineSimilarity(params.query_vector, '{field}')",
|
|
343
|
+
"params": {
|
|
344
|
+
"query_vector": query_vector
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
)
|
|
348
|
+
# Limit the number of documents returned
|
|
349
|
+
s = s.extra(size=n_docs)
|
|
350
|
+
|
|
351
|
+
# Execute search
|
|
352
|
+
response = s.execute()
|
|
353
|
+
return response
|
|
311
354
|
|
|
312
355
|
|
|
313
356
|
def __str__(self) -> str:
|
|
@@ -5,12 +5,22 @@ import pytest
|
|
|
5
5
|
from rara_tools.converters import SierraResponseConverter
|
|
6
6
|
from rara_tools.exceptions import SierraResponseConverterException
|
|
7
7
|
|
|
8
|
+
import json
|
|
9
|
+
|
|
8
10
|
root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
9
11
|
|
|
10
12
|
SIERRA_TEST_DATA_DIR = os.path.join(root, "tests", "test_data", "sierra")
|
|
11
13
|
INPUT_DIR = os.path.join(SIERRA_TEST_DATA_DIR, "input")
|
|
12
14
|
OUTPUT_DIR = os.path.join(SIERRA_TEST_DATA_DIR, "output")
|
|
13
15
|
|
|
16
|
+
def compare_results(expected, converted):
|
|
17
|
+
return json.dumps(expected) == json.dumps(converted)
|
|
18
|
+
|
|
19
|
+
def read_json_file(file_path):
|
|
20
|
+
with open(file_path, "r") as f:
|
|
21
|
+
data = f.read()
|
|
22
|
+
return json.loads(data)
|
|
23
|
+
|
|
14
24
|
example_res = {
|
|
15
25
|
"total": 100,
|
|
16
26
|
"start": 50000,
|
|
@@ -27,6 +37,8 @@ example_res = {
|
|
|
27
37
|
{
|
|
28
38
|
# "tag": "100",
|
|
29
39
|
"data": {
|
|
40
|
+
"ind1": "1",
|
|
41
|
+
"ind2": " ",
|
|
30
42
|
"subfields": [
|
|
31
43
|
{
|
|
32
44
|
"code": "a",
|
|
@@ -36,18 +48,13 @@ example_res = {
|
|
|
36
48
|
"code": "d",
|
|
37
49
|
"data": "1975-"
|
|
38
50
|
}
|
|
39
|
-
]
|
|
40
|
-
"ind1": "1",
|
|
41
|
-
"ind2": " "
|
|
51
|
+
]
|
|
42
52
|
}
|
|
43
53
|
},
|
|
44
54
|
]}}]}
|
|
45
55
|
|
|
46
56
|
|
|
47
|
-
|
|
48
|
-
with open(file_path, "r") as f:
|
|
49
|
-
data = f.read()
|
|
50
|
-
return json.loads(data)
|
|
57
|
+
|
|
51
58
|
|
|
52
59
|
def test_convert_bibs_response():
|
|
53
60
|
response = read_json_file(os.path.join(INPUT_DIR, "bibs.json"))
|
|
@@ -55,8 +62,9 @@ def test_convert_bibs_response():
|
|
|
55
62
|
converter = SierraResponseConverter(response)
|
|
56
63
|
data = converter.convert()
|
|
57
64
|
|
|
58
|
-
expected = read_json_file(os.path.join(OUTPUT_DIR, "bibs.json"))
|
|
59
|
-
|
|
65
|
+
expected = read_json_file(os.path.join(OUTPUT_DIR, "bibs.json"))
|
|
66
|
+
|
|
67
|
+
assert compare_results(expected, data)
|
|
60
68
|
|
|
61
69
|
|
|
62
70
|
def test_convert_keywords_response():
|
|
@@ -67,9 +75,10 @@ def test_convert_keywords_response():
|
|
|
67
75
|
converter = SierraResponseConverter(response)
|
|
68
76
|
data = converter.convert()
|
|
69
77
|
|
|
78
|
+
|
|
70
79
|
expected = read_json_file(os.path.join(OUTPUT_DIR, "keywords.json"))
|
|
71
80
|
|
|
72
|
-
assert data
|
|
81
|
+
assert compare_results(expected, data)
|
|
73
82
|
|
|
74
83
|
|
|
75
84
|
def test_convert_authorities_response():
|
|
@@ -82,7 +91,20 @@ def test_convert_authorities_response():
|
|
|
82
91
|
|
|
83
92
|
expected = read_json_file(os.path.join(OUTPUT_DIR, "authorities.json"))
|
|
84
93
|
|
|
85
|
-
assert data
|
|
94
|
+
assert compare_results(expected, data)
|
|
95
|
+
|
|
96
|
+
def test_converter_handles_marc_in_json_response():
|
|
97
|
+
""" Gracefully handle entries already in MARC-in-JSON format """
|
|
98
|
+
with open(os.path.join(INPUT_DIR, "bibsmarc.json"), "r") as f:
|
|
99
|
+
response = f.read()
|
|
100
|
+
response = json.loads(response)
|
|
101
|
+
|
|
102
|
+
converter = SierraResponseConverter(response)
|
|
103
|
+
data = converter.convert()
|
|
104
|
+
|
|
105
|
+
expected = read_json_file(os.path.join(OUTPUT_DIR, "bibsmarc.json"))
|
|
106
|
+
|
|
107
|
+
assert compare_results(expected, data)
|
|
86
108
|
|
|
87
109
|
def test_convert_with_wrong_format():
|
|
88
110
|
with pytest.raises(SierraResponseConverterException):
|
|
@@ -99,8 +99,8 @@ def test_fuzzy_search():
|
|
|
99
99
|
|
|
100
100
|
|
|
101
101
|
@pytest.mark.order(6)
|
|
102
|
-
def test_vector_search():
|
|
103
|
-
""" Tests vector search.
|
|
102
|
+
def test_ann_vector_search():
|
|
103
|
+
""" Tests ANN vector search.
|
|
104
104
|
"""
|
|
105
105
|
# Execute fuzzy search to get ID restrictions
|
|
106
106
|
response = ELASTIC.execute_fuzzy_search(
|
|
@@ -113,7 +113,7 @@ def test_vector_search():
|
|
|
113
113
|
assert total_hits == 3
|
|
114
114
|
elastic_ids = [hit.meta.id for hit in response]
|
|
115
115
|
|
|
116
|
-
response = ELASTIC.
|
|
116
|
+
response = ELASTIC.execute_ann_vector_search(
|
|
117
117
|
index=TEST_ANN_INDEX_NAME,
|
|
118
118
|
field="vector",
|
|
119
119
|
query_vector=TEST_VECTOR,
|
|
@@ -126,8 +126,35 @@ def test_vector_search():
|
|
|
126
126
|
assert len(descriptions) == 1
|
|
127
127
|
assert descriptions[0] == "Eesti maletaja ja maleteoreetik"
|
|
128
128
|
|
|
129
|
-
|
|
129
|
+
|
|
130
130
|
@pytest.mark.order(7)
|
|
131
|
+
def test_script_score_vector_search():
|
|
132
|
+
""" Tests script score vector search.
|
|
133
|
+
"""
|
|
134
|
+
# Execute fuzzy search to get ID restrictions
|
|
135
|
+
response = ELASTIC.execute_fuzzy_search(
|
|
136
|
+
index=TEST_KNN_INDEX_NAME,
|
|
137
|
+
field="variations",
|
|
138
|
+
entity="Paul Keres",
|
|
139
|
+
fuzziness=2
|
|
140
|
+
)
|
|
141
|
+
total_hits = response.hits.total.value
|
|
142
|
+
assert total_hits == 3
|
|
143
|
+
elastic_ids = [hit.meta.id for hit in response]
|
|
144
|
+
|
|
145
|
+
response = ELASTIC.execute_script_score_vector_search(
|
|
146
|
+
index=TEST_KNN_INDEX_NAME,
|
|
147
|
+
field="vector",
|
|
148
|
+
query_vector=TEST_VECTOR,
|
|
149
|
+
n_docs=1,
|
|
150
|
+
elastic_ids=elastic_ids
|
|
151
|
+
)
|
|
152
|
+
descriptions = [hit.description for hit in response]
|
|
153
|
+
assert len(descriptions) == 1
|
|
154
|
+
assert descriptions[0] == "Eesti maletaja ja maleteoreetik"
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@pytest.mark.order(8)
|
|
131
158
|
def test_index_deleting():
|
|
132
159
|
"""
|
|
133
160
|
Tests deleting index. We delete the test index now.
|
rara_tools-0.0.11/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.0.11
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
from .exceptions import SierraResponseConverterException
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class SierraResponseConverter:
|
|
5
|
-
""" Takes a JSON response from the Sierra API (https://tester.ester.ee/iii/sierra-api/swagger/index.html)
|
|
6
|
-
and converts it to MARC-in-JSON format.
|
|
7
|
-
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
def __init__(self, response: dict):
|
|
11
|
-
if not isinstance(response, dict):
|
|
12
|
-
raise SierraResponseConverterException("Please provide a valid JSON response.")
|
|
13
|
-
self.response = response
|
|
14
|
-
|
|
15
|
-
def _map_field_data(self, field):
|
|
16
|
-
tag = field.get("tag")
|
|
17
|
-
if not tag:
|
|
18
|
-
raise SierraResponseConverterException("Field is missing a valid 'tag'.")
|
|
19
|
-
data = field.get("data", {})
|
|
20
|
-
return {tag: data}
|
|
21
|
-
|
|
22
|
-
def _convert_response(self):
|
|
23
|
-
response = self.response
|
|
24
|
-
|
|
25
|
-
entries = response.get("entries")
|
|
26
|
-
if not entries:
|
|
27
|
-
raise SierraResponseConverterException("No entries found in the response.")
|
|
28
|
-
|
|
29
|
-
try:
|
|
30
|
-
fields = [self._map_field_data(f) for e in entries for f in e["marc"]["fields"]]
|
|
31
|
-
except KeyError as e:
|
|
32
|
-
raise SierraResponseConverterException(f"Missing expected MARC fields in the response: {e}")
|
|
33
|
-
|
|
34
|
-
return {"fields": fields}
|
|
35
|
-
|
|
36
|
-
def convert(self):
|
|
37
|
-
"""Runner method, converts the response to MARC-in-JSON format with error handling."""
|
|
38
|
-
try:
|
|
39
|
-
return self._convert_response()
|
|
40
|
-
except Exception as e:
|
|
41
|
-
raise SierraResponseConverterException(f"An unexpected error occurred during conversion: {e}")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|