rara-tools 0.0.4__tar.gz → 0.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of rara-tools has been flagged as possibly problematic.
- {rara_tools-0.0.4/rara_tools.egg-info → rara_tools-0.0.9}/PKG-INFO +32 -18
- {rara_tools-0.0.4 → rara_tools-0.0.9}/README.md +31 -17
- rara_tools-0.0.9/VERSION +1 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/pyproject.toml +1 -0
- rara_tools-0.0.9/rara_tools/constants/__init__.py +0 -0
- rara_tools-0.0.9/rara_tools/constants/digitizer.py +13 -0
- rara_tools-0.0.9/rara_tools/constants/general.py +10 -0
- rara_tools-0.0.9/rara_tools/converters.py +41 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools/decorators.py +3 -3
- rara_tools-0.0.9/rara_tools/elastic.py +175 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools/exceptions.py +3 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools/s3.py +3 -2
- {rara_tools-0.0.4 → rara_tools-0.0.9/rara_tools.egg-info}/PKG-INFO +32 -18
- {rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools.egg-info/SOURCES.txt +5 -0
- rara_tools-0.0.9/tests/test_converters.py +105 -0
- rara_tools-0.0.9/tests/test_elastic.py +131 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/tests/test_s3_exceptions.py +4 -3
- {rara_tools-0.0.4 → rara_tools-0.0.9}/tests/test_task_reporter.py +1 -0
- rara_tools-0.0.4/VERSION +0 -1
- rara_tools-0.0.4/rara_tools/elastic.py +0 -92
- rara_tools-0.0.4/tests/test_elastic.py +0 -69
- {rara_tools-0.0.4 → rara_tools-0.0.9}/LICENSE.md +0 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools/task_reporter.py +0 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools.egg-info/dependency_links.txt +0 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools.egg-info/requires.txt +0 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools.egg-info/top_level.txt +0 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/requirements.txt +0 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/setup.cfg +0 -0
- {rara_tools-0.0.4 → rara_tools-0.0.9}/tests/test_s3_file_operations.py +1 -1
{rara_tools-0.0.4/rara_tools.egg-info → rara_tools-0.0.9}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rara-tools
-Version: 0.0.4
+Version: 0.0.9
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
@@ -28,25 +28,28 @@ Requires-Dist: pytest-order; extra == "testing"
 
 ---
 
-## ✨ Features
+## ✨ Features
 
 - Elasticsearch index & document operations
 - S3 file management operations
 - Task reporting to Core API
+- Converting SIERRA API responses to Pymarc compatible JSON
+
 ---
 
-## ⚡ Quick Start
+## ⚡ Quick Start
 
 Get started with `rara-tools` in just a few steps:
 
 1. **Install the Package**
-   Ensure you're using Python 3.10 or above, then run:
+   Ensure you're using Python 3.10 or above, then run:
+
    ```bash
    pip install rara-tools
    ```
 
 2. **Import and Use**
-   Example usage to download a folder from S3:
+   Example usage to download a folder from S3:
 
    ```python
   from rara_tools.s3 import S3Files
@@ -77,22 +80,25 @@ Follow the steps below to install the `rara-tools` package, either via `pip` or
    Create or activate a Python environment using Python **3.10** or above.
 
 2. **Install the Package**
-
+   Run the following command:
+
    ```bash
    pip install rara-tools
    ```
-
+
+</details>
 
 ---
 
 ### Local Installation
 
-Follow these steps to install the `rara-tools` package locally:
+Follow these steps to install the `rara-tools` package locally:
 
 <details><summary>Click to expand</summary>
 
 1. **Clone the Repository**
-   Clone the repository and navigate into it:
+   Clone the repository and navigate into it:
+
    ```bash
    git clone <repository-url>
    cd <repository-directory>
@@ -100,25 +106,29 @@ Follow these steps to install the `rara-tools` package locally:
 
 2. **Set Up Python Environment**
    Create or activate a Python environment using Python 3.10 or above. E.g:
+
    ```bash
    conda create -n py310 python==3.10
    conda activate py310
    ```
 
 3. **Install Build Package**
-   Install the `build` package to enable local builds:
+   Install the `build` package to enable local builds:
+
    ```bash
    pip install build
    ```
 
 4. **Build the Package**
-   Run the following command inside the repository:
+   Run the following command inside the repository:
+
    ```bash
    python -m build
    ```
 
 5. **Install the Package**
-   Install the built package locally:
+   Install the built package locally:
+
    ```bash
    pip install .
    ```
@@ -131,13 +141,13 @@ Follow these steps to install the `rara-tools` package locally:
 
 Follow these steps to test the `rara-tools` package.
 
-
 ### How to Test
 
 <details><summary>Click to expand</summary>
 
 1. **Clone the Repository**
-   Clone the repository and navigate into it:
+   Clone the repository and navigate into it:
+
    ```bash
    git clone <repository-url>
    cd <repository-directory>
@@ -147,25 +157,29 @@ Follow these steps to test the `rara-tools` package.
    Create or activate a Python environment using Python 3.10 or above.
 
 3. **Install Build Package**
-   Install the `build` package:
+   Install the `build` package:
+
    ```bash
    pip install build
    ```
 
 4. **Build the Package**
-   Build the package inside the repository:
+   Build the package inside the repository:
+
    ```bash
    python -m build
    ```
 
 5. **Install with Testing Dependencies**
-   Install the package along with its testing dependencies:
+   Install the package along with its testing dependencies:
+
    ```bash
    pip install .[testing]
    ```
 
 6. **Run Tests**
-   Run the test suite from the repository root:
+   Run the test suite from the repository root:
+
    ```bash
    python -m pytest -v tests
    ```
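The Quick Start hunk above is cut off right after `from rara_tools.s3 import S3Files`, so the promised "download a folder from S3" example is not visible in this diff. A minimal sketch of what such usage could look like follows; the constructor arguments and the `download` call are assumptions for illustration only, and only the S3_URL/S3_ACCESS_KEY environment-variable defaults come from tests/test_s3_exceptions.py.

```python
import os

from rara_tools.s3 import S3Files

# Hypothetical wiring: parameter names and the download() method are assumed,
# not confirmed by this diff; the env-var defaults mirror tests/test_s3_exceptions.py.
s3 = S3Files(
    url=os.getenv("S3_URL", "s3.eu-central-003.backblazeb2.com"),
    access_key=os.getenv("S3_ACCESS_KEY", ""),
    secret_key=os.getenv("S3_SECRET_KEY", ""),
    bucket="my-bucket",
)

# Download everything under a folder prefix into a local directory (assumed signature).
s3.download("digitizer/input/", "local_input/")
```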
{rara_tools-0.0.4 → rara_tools-0.0.9}/README.md
The README.md hunks repeat, verbatim, the README portion of the PKG-INFO diff above (the new SIERRA-converter feature bullet, the added "Run the following command:" line, the added closing </details> tag, and the extra blank lines), only with hunk offsets shifted up by the 20-line metadata header.
rara_tools-0.0.9/VERSION ADDED
@@ -0,0 +1 @@
0.0.9

File without changes
rara_tools-0.0.9/rara_tools/constants/digitizer.py ADDED
@@ -0,0 +1,13 @@
class StatusKeys:
    CLEAN_UP = "digitizer_clean_up"
    ELASTICSEARCH_UPLOAD = "digitizer_elasticsearch_upload"
    UPLOAD = "s3_upload"
    DOWNLOAD = "digitizer_s3_download"
    OCR = "digitizer_ocr"


class Queue:
    IO = "io"
    DOWNLOAD = "download"
    FINISH = "finish"
    OCR = "ocr"
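Because these are plain string constants, digitizer workers can refer to queue names and status keys without repeating the literals. A small illustrative sketch is below; the report structure is made up for illustration and is not an API defined by this package.

```python
from rara_tools.constants.digitizer import Queue, StatusKeys

# Illustrative only: the payload shape below is assumed, not a rara-tools API.
def build_status_report(success: bool) -> dict:
    return {
        "status_key": StatusKeys.OCR,  # which digitizer step this report is about
        "queue": Queue.OCR,            # which worker queue handled the step
        "success": success,
    }

print(build_status_report(True))
```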
rara_tools-0.0.9/rara_tools/converters.py ADDED
@@ -0,0 +1,41 @@
from .exceptions import SierraResponseConverterException


class SierraResponseConverter:
    """ Takes a JSON response from the Sierra API (https://tester.ester.ee/iii/sierra-api/swagger/index.html)
    and converts it to MARC-in-JSON format.

    """

    def __init__(self, response: dict):
        if not isinstance(response, dict):
            raise SierraResponseConverterException("Please provide a valid JSON response.")
        self.response = response

    def _map_field_data(self, field):
        tag = field.get("tag")
        if not tag:
            raise SierraResponseConverterException("Field is missing a valid 'tag'.")
        data = field.get("data", {})
        return {tag: data}

    def _convert_response(self):
        response = self.response

        entries = response.get("entries")
        if not entries:
            raise SierraResponseConverterException("No entries found in the response.")

        try:
            fields = [self._map_field_data(f) for e in entries for f in e["marc"]["fields"]]
        except KeyError as e:
            raise SierraResponseConverterException(f"Missing expected MARC fields in the response: {e}")

        return {"fields": fields}

    def convert(self):
        """Runner method, converts the response to MARC-in-JSON format with error handling."""
        try:
            return self._convert_response()
        except Exception as e:
            raise SierraResponseConverterException(f"An unexpected error occurred during conversion: {e}")
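In short, `convert()` walks every entry's `marc.fields` list, maps each field to a `{tag: data}` dict, and wraps the result in a single `{"fields": [...]}` object. A short usage sketch based on the class above; the response dict is a trimmed-down sample in the spirit of the test fixture further below.

```python
from rara_tools.converters import SierraResponseConverter

sierra_response = {
    "entries": [
        {"marc": {"fields": [
            {"tag": "100",
             "data": {"subfields": [{"code": "a", "data": "Viggor, Signe,"}],
                      "ind1": "1", "ind2": " "}},
        ]}}
    ]
}

converter = SierraResponseConverter(sierra_response)
marc_json = converter.convert()
# -> {'fields': [{'100': {'subfields': [...], 'ind1': '1', 'ind2': ' '}}]}
print(marc_json)
```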
{rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools/decorators.py
@@ -1,12 +1,12 @@
 import functools
+from typing import Any, Callable
+
 from elasticsearch import AuthenticationException
 from elasticsearch import ConnectionError as ElasticsearchConnectionError
 from elasticsearch import ConnectionTimeout, NotFoundError, RequestError
-from typing import Any, Callable
 
 from .exceptions import ElasticsearchException
 
-
 ELASTIC_NOT_FOUND_MESSAGE = 'Could not find specified data from Elasticsearch!'
 ELASTIC_REQUEST_ERROR_MESSAGE = 'Error executing Elasticsearch query! Bad query?'
 ELASTIC_CONNECTION_TIMEOUT_MESSAGE = 'Connection to Elasticsearch took too long, please try again later!'
@@ -39,4 +39,4 @@ def _elastic_connection(func: Callable) -> Callable:
             raise ElasticsearchException(ELASTIC_CONNECTION_ERROR_MESSAGE) from exception
         except Exception as exception:
             raise ElasticsearchException(ELASTIC_UNKNOWN_ERROR_MESSAGE) from exception
-    return wrapper
+    return wrapper
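Only the import block and the tail of `_elastic_connection` change here, but the visible pieces (the functools import, the per-error message constants, the chained `raise ... from exception` clauses, and the final `return wrapper`) imply a wrapper of roughly the shape sketched below. This is a hedged reconstruction for orientation, not the verbatim decorator body; the message strings marked as assumed do not appear in the hunks.

```python
import functools
from typing import Any, Callable

from elasticsearch import ConnectionError as ElasticsearchConnectionError

from rara_tools.exceptions import ElasticsearchException

# Wordings assumed; only the constant names appear in the hunks above.
ELASTIC_CONNECTION_ERROR_MESSAGE = "Could not connect to Elasticsearch!"
ELASTIC_UNKNOWN_ERROR_MESSAGE = "Unknown error while querying Elasticsearch!"


def _elastic_connection(func: Callable) -> Callable:
    @functools.wraps(func)
    def wrapper(*args, **kwargs) -> Any:
        try:
            return func(*args, **kwargs)
        except ElasticsearchConnectionError as exception:
            raise ElasticsearchException(ELASTIC_CONNECTION_ERROR_MESSAGE) from exception
        except Exception as exception:
            raise ElasticsearchException(ELASTIC_UNKNOWN_ERROR_MESSAGE) from exception
    return wrapper
```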
rara_tools-0.0.9/rara_tools/elastic.py ADDED
@@ -0,0 +1,175 @@
from typing import Any, Dict, Iterator, Optional

import elasticsearch_dsl
from elastic_transport import ObjectApiResponse
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch_dsl import Index

from .decorators import _elastic_connection


class KataElastic:
    """A class to manage all required Elasticsearch operations for Kata.
    """

    TYPE_MAPPING = {
        "keyword": elasticsearch_dsl.Keyword,
        "text": elasticsearch_dsl.Text,
        "float": elasticsearch_dsl.Float,
        "integer": elasticsearch_dsl.Integer,
        "date": elasticsearch_dsl.Date,
    }

    DEFAULT_MAPPING = {
        "text": "keyword",
        "parent_id": "keyword",
        "text_quality": "float",
        "n_chars": "integer",
        "n_words": "integer",
        "language": "keyword",
        "end_page": "integer",
        "start_page": "integer",
        "sequence_nr": "integer",
        "section_title": "keyword",
        "section_type": "keyword",
        "section_meta": "keyword",
    }

    def __init__(self, elasticsearch_url: str, timeout: Optional[int] = None):
        self.timeout = timeout
        self.elasticsearch_url = elasticsearch_url
        self.elasticsearch = Elasticsearch(self.elasticsearch_url, request_timeout=self.timeout)

    def _produce_rollover_index(self, index_prefix: str, rollover_limit: int) -> str:
        indices = self.elasticsearch.indices.get(index=f"{index_prefix}-*", expand_wildcards="open")
        sorted_indices = sorted([(k, v["settings"]["index"]["creation_date"]) for k, v in indices.items()], key=lambda x: x[1], reverse=True)
        sorted_indices = [i[0] for i in sorted_indices]

        # new index name if none exist
        if not len(sorted_indices):
            last_index_name = f"{index_prefix}-0"
            last_index_count = 0
        else:
            last_index_name = sorted_indices[0]
            last_index_count = self.elasticsearch.count(index=last_index_name)["count"]
        # check the size of the last index of the pipeline
        if last_index_count >= rollover_limit:
            new_index_number = int(last_index_name[-1]) + 1
            last_index_name = f"{index_prefix}-{new_index_number}"

        return last_index_name

    @_elastic_connection
    def check(self) -> bool:
        """Checks Elasticsearch connection.
        :return: bool: Elasticsearch alive or dead.
        """
        if self.elasticsearch.ping():
            return True
        return False

    def generate_mapping(self, schema: dict | None = None) -> dict:
        mapping_dsl = elasticsearch_dsl.Mapping()
        mapping = schema or self.DEFAULT_MAPPING
        for field_name, field_type in mapping.items():
            if field_type in self.TYPE_MAPPING:
                # We instantiate the class stored in the type mapping.
                mapping_dsl.field(field_name, self.TYPE_MAPPING[field_type]())
        return mapping_dsl.to_dict()

    @_elastic_connection
    def add_mapping(self, index_name: str, schema: dict):
        index = Index(name=index_name)
        return index.put_mapping(body=schema, using=self.elasticsearch)

    @_elastic_connection
    def create_index(
        self,
        index: str,
        shards: int = 3,
        replicas: int = 1,
        settings: Optional[dict] = None,
    ) -> Dict | None:
        """Creates empty index.
        :param: index str: Name of the index to create.
        :param: shards int: Number of shards for the index.
        :param: replicas int: Number of replicas of the index.
        :param: settings dict: Overwrite settings for the index.
        """

        index_exists = self.elasticsearch.indices.exists(index=index).body
        if index_exists is False:
            setting_body = settings or {
                "number_of_shards": shards,
                "number_of_replicas": replicas,
            }
            return self.elasticsearch.indices.create(index=index, settings=setting_body)

    @_elastic_connection
    def delete_index(self, index: str, ignore: Optional[bool] = True) -> Dict:
        """Deletes index.
        :param: index str: Name of the index to be deleted.
        :param: ignore bool: Ignore errors because of closed/deleted index.
        :return: Dict of Elastic's acknowledgement of the action.
        """
        response = self.elasticsearch.indices.delete(index=index, ignore_unavailable=ignore, expand_wildcards="open")
        return response

    @_elastic_connection
    def delete_document(self, index: str, document_id: str) -> ObjectApiResponse[Any]:
        """Deletes document fom index.
        :param: document_id str: ID of the document to be deleted.
        :param: index str: Index where the document is to be found.
        :param: ignore bool: Ignore errors because of closed/deleted index.
        :return: Dict of Elastic's acknowledgement of the action.
        """
        response = self.elasticsearch.delete(id=document_id, index=index)
        return response

    @_elastic_connection
    def bulk_index(
        self,
        documents: Iterator[dict],
        index_prefix: str,
        rollover_limit: int,
        refresh="false",
        create_index: bool = True
    ) -> (int, int):
        last_index_name = self._produce_rollover_index(index_prefix, rollover_limit)
        if create_index:
            response = self.create_index(index=last_index_name)
            response = self.add_mapping(index_name=last_index_name, schema=self.generate_mapping())
            pass

        actions = [{"_index": last_index_name, "_source": document} for document in documents]
        successful_count, error_count = bulk(actions=actions, client=self.elasticsearch, max_retries=3, refresh=refresh)
        return successful_count, error_count

    @_elastic_connection
    def index_document(self, index: str, body: dict, document_id: Optional[str] = None) -> Dict:
        """Indexes document.
        :param: index str: Index that document will be indexed into.
        :param: body dict: Document body.
        :param: document_id str: Optional id for the document. Is generated automatically if None.
        :return: Dict of Elastic's acknowledgement of the action.
        """
        if document_id:
            indexed = self.elasticsearch.index(index=index, id=document_id, body=body)
        else:
            indexed = self.elasticsearch.index(index=index, body=body)
        return indexed

    @_elastic_connection
    def get_documents_by_key(self, index: str, document_key: str, sort_fields=("start_page", "end_page", "sequence_nr",)):
        index = f"{index}-*"
        s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
        s = s.query("match", parent_id=document_key).sort(*sort_fields)
        # Since scan doesn't allow for sorting, we do it manually after fetching the documents.
        documents = sorted(
            s.scan(), key=lambda doc: [getattr(doc, field) for field in sort_fields]
        )
        return documents

    def __str__(self) -> str:
        return self.elasticsearch_url
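The main additions over the 0.0.4 client are the mapping helpers and the rollover logic: `bulk_index` resolves a `{prefix}-N` index via `_produce_rollover_index`, optionally creates it with the default mapping, and `get_documents_by_key` now searches `{prefix}-*` and sorts hits client-side. A usage sketch mirroring tests/test_elastic.py below; the URL and document shapes are illustrative.

```python
from rara_tools.elastic import KataElastic

elastic = KataElastic("http://localhost:9200")
assert elastic.check()  # ping the cluster before doing anything else

# Page-like documents sharing one parent_id; a new "pages-N" index is
# started whenever the newest one already holds >= rollover_limit docs.
docs = [
    {"parent_id": "doc-42", "start_page": i, "end_page": i, "sequence_nr": 1}
    for i in range(10)
]
indexed, errors = elastic.bulk_index(docs, "pages", rollover_limit=500, refresh="wait_for")

# Fetch all pages of the parent document back, ordered by the sort fields.
pages = elastic.get_documents_by_key("pages", "doc-42")
for page in pages:
    print(page.start_page, page.end_page)
```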
{rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools/s3.py
@@ -1,10 +1,11 @@
 import os
 import uuid
-from typing import
+from typing import Any, Generator, List, Optional
 
 from minio import Minio
 
-from .exceptions import
+from .exceptions import (S3ConnectionException, S3InitException,
+                         S3InputException)
 
 
 class S3Files:
{rara_tools-0.0.4 → rara_tools-0.0.9/rara_tools.egg-info}/PKG-INFO
Identical to the PKG-INFO diff above: the version bump from 0.0.4 to 0.0.9 plus the same README changes.
{rara_tools-0.0.4 → rara_tools-0.0.9}/rara_tools.egg-info/SOURCES.txt
@@ -3,6 +3,7 @@ README.md
 VERSION
 pyproject.toml
 requirements.txt
+rara_tools/converters.py
 rara_tools/decorators.py
 rara_tools/elastic.py
 rara_tools/exceptions.py
@@ -13,6 +14,10 @@ rara_tools.egg-info/SOURCES.txt
 rara_tools.egg-info/dependency_links.txt
 rara_tools.egg-info/requires.txt
 rara_tools.egg-info/top_level.txt
+rara_tools/constants/__init__.py
+rara_tools/constants/digitizer.py
+rara_tools/constants/general.py
+tests/test_converters.py
 tests/test_elastic.py
 tests/test_s3_exceptions.py
 tests/test_s3_file_operations.py
rara_tools-0.0.9/tests/test_converters.py ADDED
@@ -0,0 +1,105 @@
import json
import os

import pytest
from rara_tools.converters import SierraResponseConverter
from rara_tools.exceptions import SierraResponseConverterException

root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

SIERRA_TEST_DATA_DIR = os.path.join(root, "tests", "test_data", "sierra")
INPUT_DIR = os.path.join(SIERRA_TEST_DATA_DIR, "input")
OUTPUT_DIR = os.path.join(SIERRA_TEST_DATA_DIR, "output")

example_res = {
    "total": 100,
    "start": 50000,
    "entries": [
        {
            "id": 1126963,
            "updatedDate": "2016-02-09T08:42:52Z",
            "createdDate": "2014-05-17T17:22:00Z",
            "deleted": False,
            "suppressed": False,
            "marc": {
                "leader": "00000nz a2200145n 4500",
                "fields": [
                    {
                        # "tag": "100",
                        "data": {
                            "subfields": [
                                {
                                    "code": "a",
                                    "data": "Viggor, Signe,"
                                },
                                {
                                    "code": "d",
                                    "data": "1975-"
                                }
                            ],
                            "ind1": "1",
                            "ind2": " "
                        }
                    },
                ]}}]}


def read_json_file(file_path):
    with open(file_path, "r") as f:
        data = f.read()
        return json.loads(data)

def test_convert_bibs_response():
    response = read_json_file(os.path.join(INPUT_DIR, "bibs.json"))

    converter = SierraResponseConverter(response)
    data = converter.convert()

    expected = read_json_file(os.path.join(OUTPUT_DIR, "bibs.json"))
    assert data == expected


def test_convert_keywords_response():
    with open(os.path.join(INPUT_DIR, "keywords.json"), "r") as f:
        response = f.read()
        response = json.loads(response)

    converter = SierraResponseConverter(response)
    data = converter.convert()

    expected = read_json_file(os.path.join(OUTPUT_DIR, "keywords.json"))

    assert data == expected


def test_convert_authorities_response():
    with open(os.path.join(INPUT_DIR, "authorities.json"), "r") as f:
        response = f.read()
        response = json.loads(response)

    converter = SierraResponseConverter(response)
    data = converter.convert()

    expected = read_json_file(os.path.join(OUTPUT_DIR, "authorities.json"))

    assert data == expected

def test_convert_with_wrong_format():
    with pytest.raises(SierraResponseConverterException):
        SierraResponseConverter("$")

def test_convert_missing_tag():
    with pytest.raises(SierraResponseConverterException):
        response = example_res.copy()
        response["entries"][0]["marc"]["fields"][0].pop("tag", None)

        converter = SierraResponseConverter(response)
        converter.convert()

def test_no_entries_in_response():
    with pytest.raises(SierraResponseConverterException):
        response = example_res.copy()
        response.pop("entries", [])

        converter = SierraResponseConverter(response)
        converter.convert()
rara_tools-0.0.9/tests/test_elastic.py ADDED
@@ -0,0 +1,131 @@
import json
import os
import time
import uuid
from time import sleep

import pytest
from rara_tools.elastic import KataElastic

with open("./tests/test_data/elastic_docs.json") as fh:
    TEST_DOCUMENTS = json.load(fh)

es_url = os.getenv("ELASTIC_TEST_URL", "http://localhost:9200")
ELASTIC = KataElastic(es_url)
ELASTIC_BAD = KataElastic("http://locallost:9012")
TEST_INDEX_NAME = "tools_testing_index"
TEST_DOCUMENT_ID = None
TEST_DOCUMENT_INDEX = None
PARENT_ID = uuid.uuid4().hex


@pytest.mark.order(1)
def test_index_creation():
    """ Tests if index created and documents indexed.
    """
    # Create test index
    created = ELASTIC.create_index(TEST_INDEX_NAME)
    assert created["acknowledged"] is True
    time.sleep(2)

@pytest.mark.order(2)
def test_check():
    """Tests health check method.
    """
    assert ELASTIC.check() is True
    # test bad connection
    assert ELASTIC_BAD.check() is False


@pytest.mark.order(2)
def test_creating_index_again():
    """
    Test to see that running the function for index generation doesn't trigger errors
    on duplicates.
    """
    # Create test index
    created = ELASTIC.create_index(TEST_INDEX_NAME)
    assert created is None


@pytest.mark.order(3)
def test_adding_mapping_to_index():
    """Test adding mapping to an index"""
    schema = ELASTIC.generate_mapping()
    result = ELASTIC.add_mapping(TEST_INDEX_NAME, schema)
    assert result["acknowledged"] is True
    # Test adding the mapping again doesn't create errors.
    result = ELASTIC.add_mapping(TEST_INDEX_NAME, schema)
    assert result["acknowledged"] is True


@pytest.mark.order(4)
def test_document_addition():
    # Add test documents
    for document in TEST_DOCUMENTS:
        indexed = ELASTIC.index_document(TEST_INDEX_NAME, document)
        assert indexed["result"] == "created"
    # let it index
    sleep(1)


@pytest.mark.order(5)
def test_bulk_indexing_documents_cause_rollover():
    data = [{"start_page": number, "sequence_nr": 1, "end_page": number, "parent_id": PARENT_ID} for number in range(10)]
    chunks = [data[i:i + 3] for i in range(0, len(data), 3)]
    for chunk in chunks:
        success, errors = ELASTIC.bulk_index(chunk, TEST_INDEX_NAME, rollover_limit=3, refresh="wait_for")
        assert success is 3 or success is 1

    created_indices = ELASTIC.elasticsearch.indices.get(index=f"{TEST_INDEX_NAME}-*", expand_wildcards="open").body
    assert len(created_indices) == 4


@pytest.mark.order(6)
def test_bulk_indexing_and_document_fetch():
    """
    Test that the whole process of indexing a bunch of different texts and then the retrieval
    of only the requested documents works as intended.
    """
    success, errors = ELASTIC.bulk_index(TEST_DOCUMENTS, TEST_INDEX_NAME, rollover_limit=3, refresh="wait_for")

    # Test the integrity of the limiting query.
    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "foo")
    assert len(result) == 2
    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "bar")
    global TEST_DOCUMENT_ID
    global TEST_DOCUMENT_INDEX
    TEST_DOCUMENT_ID = result[0].meta.id
    TEST_DOCUMENT_INDEX = result[0].meta.index
    assert len(result) == 1
    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "loll")
    assert len(result) == 0

    # Check that sorting works as expected.
    results = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, PARENT_ID)
    for index, document in enumerate(results):
        assert document.start_page == index


@pytest.mark.order(7)
def test_document_deleting():
    """
    Tests deleting a document from index.
    """
    deleted = ELASTIC.delete_document(TEST_DOCUMENT_INDEX, TEST_DOCUMENT_ID)
    assert deleted["result"] == "deleted"
    sleep(1)
    # check if document was actually deleted
    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "bar")
    assert len(result) == 0


@pytest.mark.order(8)
def test_index_deleting():
    """
    Tests deleting index. We delete the test index now.
    """
    deleted = ELASTIC.delete_index(TEST_INDEX_NAME)
    for i in range(10):
        ELASTIC.delete_index(f"{TEST_INDEX_NAME}-{i}")
    assert deleted["acknowledged"] is True
{rara_tools-0.0.4 → rara_tools-0.0.9}/tests/test_s3_exceptions.py
@@ -1,8 +1,9 @@
-import pytest
 import os
-from rara_tools.s3 import S3Files
-from rara_tools.exceptions import S3InitException, S3ConnectionException, S3InputException
 
+import pytest
+from rara_tools.exceptions import (S3ConnectionException, S3InitException,
+                                   S3InputException)
+from rara_tools.s3 import S3Files
 
 TEST_URL = os.getenv("S3_URL", "s3.eu-central-003.backblazeb2.com")
 TEST_ACCESS_KEY = os.getenv("S3_ACCESS_KEY", "")
rara_tools-0.0.4/VERSION DELETED
@@ -1 +0,0 @@
0.0.4
rara_tools-0.0.4/rara_tools/elastic.py DELETED
@@ -1,92 +0,0 @@
from typing import Dict, Optional, List
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

from .decorators import _elastic_connection


class KataElastic:
    """A class to manage all required Elasticsearch operations for Kata.
    """
    def __init__(self, elasticsearch_url: str, timeout: Optional[int] = None):
        self.timeout = timeout
        self.elasticsearch_url = elasticsearch_url
        self.elasticsearch = Elasticsearch(self.elasticsearch_url, request_timeout=self.timeout)

    @_elastic_connection
    def check(self) -> bool:
        """Checks Elasticsearch connection.
        :return: bool: Elasticsearch alive or dead.
        """
        if self.elasticsearch.ping():
            return True
        return False

    @_elastic_connection
    def create_index(
        self,
        index: str,
        shards: int = 3,
        replicas: int = 1,
        settings: Optional[dict] = None
    ) -> Dict:
        """Creates empty index.
        :param: index str: Name of the index to create.
        :param: shards int: Number of shards for the index.
        :param: replicas int: Number of replicas of the index.
        :param: settings dict: Overwrite settings for the index.
        """
        body = settings or {
            "number_of_shards": shards,
            "number_of_replicas": replicas,
        }
        return self.elasticsearch.indices.create(index=index, settings=body)

    @_elastic_connection
    def delete_index(self, index: str, ignore: Optional[bool] = True) -> Dict:
        """Deletes index.
        :param: index str: Name of the index to be deleted.
        :param: ignore bool: Ignore errors because of closed/deleted index.
        :return: Dict of Elastic's acknowledgement of the action.
        """
        response = self.elasticsearch.indices.delete(index=index, ignore_unavailable=ignore)
        return response

    @_elastic_connection
    def delete_document(self, index: str, document_id: str) -> Dict:
        """Deletes document fom index.
        :param: document_id str: ID of the document to be deleted.
        :param: index str: Index where the document is to be found.
        :param: ignore bool: Ignore errors because of closed/deleted index.
        :return: Dict of Elastic's acknowledgement of the action.
        """
        response = self.elasticsearch.delete(id=document_id, index=index)
        return response

    @_elastic_connection
    def index_document(self, index: str, body: dict, document_id: Optional[str] = None) -> Dict:
        """Indexes document.
        :param: index str: Index that document will be indexed into.
        :param: body dict: Document body.
        :param: document_id str: Optional id for the document. Is generated automatically if None.
        :return: Dict of Elastic's acknowledgement of the action.
        """
        if document_id:
            indexed = self.elasticsearch.index(index=index, id=document_id, body=body)
        else:
            indexed = self.elasticsearch.index(index=index, body=body)
        return indexed

    @_elastic_connection
    def get_documents_by_key(self, index: str, document_key: str) -> List:
        """This method is for retrieving all texts/pages of the original document.
        :param: index str: Index to search the documents from.
        :param: document_key str: parent_id field that connects pages of document together.
        :return: List of matching documents.
        """
        s = Search(using=self.elasticsearch, index=index)
        docs = s.query("match", parent_id=document_key).execute()
        return docs

    def __str__(self) -> str:
        return self.elasticsearch_url
rara_tools-0.0.4/tests/test_elastic.py DELETED
@@ -1,69 +0,0 @@
import pytest
import json
import os
from time import sleep

from rara_tools.elastic import KataElastic

with open("./tests/test_data/elastic_docs.json") as fh:
    TEST_DOCUMENTS = json.load(fh)

es_url = os.getenv("ELASTIC_TEST_URL", "http://localhost:9200")
ELASTIC = KataElastic(es_url)
ELASTIC_BAD = KataElastic("http://locallost:9012")
TEST_INDEX_NAME = "tools_testing_index"
TEST_DOCUMENT_ID = None


@pytest.mark.order(1)
def test_index_creation_and_data_indexing():
    """ Tests if index created and documents indexed.
    """
    # Create test index
    created = ELASTIC.create_index(TEST_INDEX_NAME)
    assert created["acknowledged"] is True
    # Add test documents
    for document in TEST_DOCUMENTS:
        indexed = ELASTIC.index_document(TEST_INDEX_NAME, document)
        assert indexed["result"] == "created"
    # let it index
    sleep(1)

@pytest.mark.order(2)
def test_check():
    """Tests health check method.
    """
    assert ELASTIC.check() is True
    # test bad connection
    assert ELASTIC_BAD.check() is False

@pytest.mark.order(3)
def test_get_document_by_key():
    """Tests if correct documents fetched.
    """
    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "foo")
    assert len(result) == 2
    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "bar")
    global TEST_DOCUMENT_ID
    TEST_DOCUMENT_ID = result[0].meta.id
    assert len(result) == 1
    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "loll")
    assert len(result) == 0

@pytest.mark.order(5)
def test_document_deleting():
    """ Tests deleting a document from index.
    """
    deleted = ELASTIC.delete_document(TEST_INDEX_NAME, TEST_DOCUMENT_ID)
    assert deleted["result"] == "deleted"
    sleep(1)
    # check if document was actually deleted
    result = ELASTIC.get_documents_by_key(TEST_INDEX_NAME, "bar")
    assert len(result) == 0

@pytest.mark.order(5)
def test_index_deleting():
    """ Tests deleting index. We delete the test index now.
    """
    deleted = ELASTIC.delete_index(TEST_INDEX_NAME)
    assert deleted["acknowledged"] is True
File without changes (×7)