rara-tools 0.0.3__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- rara_tools/constants/__init__.py +0 -0
- rara_tools/constants/digitizer.py +13 -0
- rara_tools/constants/general.py +10 -0
- rara_tools/elastic.py +104 -21
- rara_tools/s3.py +4 -3
- {rara_tools-0.0.3.dist-info → rara_tools-0.0.8.dist-info}/METADATA +1 -1
- rara_tools-0.0.8.dist-info/RECORD +13 -0
- rara_tools-0.0.3.dist-info/RECORD +0 -10
- {rara_tools-0.0.3.dist-info → rara_tools-0.0.8.dist-info}/LICENSE.md +0 -0
- {rara_tools-0.0.3.dist-info → rara_tools-0.0.8.dist-info}/WHEEL +0 -0
- {rara_tools-0.0.3.dist-info → rara_tools-0.0.8.dist-info}/top_level.txt +0 -0
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
class StatusKeys:
    """String identifiers used as status keys for digitizer-related steps."""

    # Listed alphabetically; every value is unchanged.
    CLEAN_UP = "digitizer_clean_up"
    DOWNLOAD = "digitizer_s3_download"
    ELASTICSEARCH_UPLOAD = "digitizer_elasticsearch_upload"
    OCR = "digitizer_ocr"
    # NOTE(review): the only value without the "digitizer_" prefix - confirm this is intended.
    UPLOAD = "s3_upload"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Queue:
    """String constants naming queues.

    NOTE(review): presumably task-queue names consumed by the digitizer
    workers - confirm against the callers.
    """

    # Listed alphabetically; every value is unchanged.
    DOWNLOAD = "download"
    FINISH = "finish"
    IO = "io"
    OCR = "ocr"
|
rara_tools/elastic.py
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
|
-
from typing import Dict, Optional,
|
|
1
|
+
from typing import Dict, Optional, Any, Iterator
|
|
2
|
+
|
|
3
|
+
import elasticsearch_dsl
|
|
4
|
+
from elastic_transport import ObjectApiResponse
|
|
2
5
|
from elasticsearch import Elasticsearch
|
|
3
|
-
from
|
|
6
|
+
from elasticsearch.helpers import bulk
|
|
7
|
+
from elasticsearch_dsl import Index
|
|
4
8
|
|
|
5
9
|
from .decorators import _elastic_connection
|
|
6
10
|
|
|
@@ -8,11 +12,54 @@ from .decorators import _elastic_connection
|
|
|
8
12
|
class KataElastic:
|
|
9
13
|
"""A class to manage all required Elasticsearch operations for Kata.
|
|
10
14
|
"""
|
|
15
|
+
|
|
16
|
+
# Type names accepted in a schema -> elasticsearch_dsl field class that is
# instantiated for them in generate_mapping().
TYPE_MAPPING = dict(
    keyword=elasticsearch_dsl.Keyword,
    text=elasticsearch_dsl.Text,
    float=elasticsearch_dsl.Float,
    integer=elasticsearch_dsl.Integer,
    date=elasticsearch_dsl.Date,
)

# Field name -> type name (a key of TYPE_MAPPING); used by generate_mapping()
# whenever no explicit schema is passed in.
DEFAULT_MAPPING = dict(
    text="keyword",
    parent_id="keyword",
    text_quality="float",
    n_chars="integer",
    n_words="integer",
    language="keyword",
    end_page="integer",
    start_page="integer",
    sequence_nr="integer",
    section_title="keyword",
    section_type="keyword",
    section_meta="keyword",
)
|
|
38
|
+
|
|
11
39
|
def __init__(self, elasticsearch_url: str, timeout: Optional[int] = None):
    """Stores the connection settings and builds the Elasticsearch client.

    :param: elasticsearch_url str: URL handed to the Elasticsearch client.
    :param: timeout int: Forwarded to the client as ``request_timeout``.
    """
    self.elasticsearch_url = elasticsearch_url
    self.timeout = timeout
    self.elasticsearch = Elasticsearch(elasticsearch_url, request_timeout=timeout)
|
|
15
43
|
|
|
44
|
+
def _produce_rollover_index(self, index_prefix: str, rollover_limit: int) -> str:
|
|
45
|
+
indices = self.elasticsearch.indices.get(index=f"{index_prefix}-*", expand_wildcards="open")
|
|
46
|
+
sorted_indices = sorted([(k, v["settings"]["index"]["creation_date"]) for k, v in indices.items()], key=lambda x: x[1], reverse=True)
|
|
47
|
+
sorted_indices = [i[0] for i in sorted_indices]
|
|
48
|
+
|
|
49
|
+
# new index name if none exist
|
|
50
|
+
if not len(sorted_indices):
|
|
51
|
+
last_index_name = f"{index_prefix}-0"
|
|
52
|
+
last_index_count = 0
|
|
53
|
+
else:
|
|
54
|
+
last_index_name = sorted_indices[0]
|
|
55
|
+
last_index_count = self.elasticsearch.count(index=last_index_name)["count"]
|
|
56
|
+
# check the size of the last index of the pipeline
|
|
57
|
+
if last_index_count >= rollover_limit:
|
|
58
|
+
new_index_number = int(last_index_name[-1]) + 1
|
|
59
|
+
last_index_name = f"{index_prefix}-{new_index_number}"
|
|
60
|
+
|
|
61
|
+
return last_index_name
|
|
62
|
+
|
|
16
63
|
@_elastic_connection
|
|
17
64
|
def check(self) -> bool:
|
|
18
65
|
"""Checks Elasticsearch connection.
|
|
@@ -22,25 +69,42 @@ class KataElastic:
|
|
|
22
69
|
return True
|
|
23
70
|
return False
|
|
24
71
|
|
|
72
|
+
def generate_mapping(self, schema: dict | None = None) -> dict:
    """Builds an Elasticsearch mapping dict from a field-name -> type-name schema.

    :param: schema dict: Field names mapped to type names found in TYPE_MAPPING;
        DEFAULT_MAPPING is used when it is missing or empty.
    :return: The mapping as produced by ``elasticsearch_dsl.Mapping.to_dict()``.
    """
    field_types = schema or self.DEFAULT_MAPPING
    mapping_dsl = elasticsearch_dsl.Mapping()
    for field_name, field_type in field_types.items():
        field_class = self.TYPE_MAPPING.get(field_type)
        if field_class is None:
            # Type names missing from TYPE_MAPPING are left out of the mapping.
            continue
        # TYPE_MAPPING stores classes, so an instance is created per field.
        mapping_dsl.field(field_name, field_class())
    return mapping_dsl.to_dict()
|
|
80
|
+
|
|
81
|
+
@_elastic_connection
def add_mapping(self, index_name: str, schema: dict):
    """Applies a mapping to an index.

    :param: index_name str: Name of the index to update.
    :param: schema dict: Mapping body, e.g. the output of generate_mapping().
    :return: Whatever ``Index.put_mapping`` returns for the request.
    """
    return Index(name=index_name).put_mapping(body=schema, using=self.elasticsearch)
|
|
85
|
+
|
|
25
86
|
@_elastic_connection
def create_index(
    self,
    index: str,
    shards: int = 3,
    replicas: int = 1,
    settings: Optional[dict] = None,
) -> Dict | None:
    """Creates an empty index unless one with that name already exists.

    :param: index str: Name of the index to create.
    :param: shards int: Number of shards for the index.
    :param: replicas int: Number of replicas of the index.
    :param: settings dict: Replaces the shard/replica settings entirely when given.
    :return: Elastic's response to the creation, or None when nothing was created.
    """
    exists_check = self.elasticsearch.indices.exists(index=index).body
    if exists_check is not False:
        return None

    if settings:
        index_settings = settings
    else:
        index_settings = {
            "number_of_shards": shards,
            "number_of_replicas": replicas,
        }
    return self.elasticsearch.indices.create(index=index, settings=index_settings)
|
|
44
108
|
|
|
45
109
|
@_elastic_connection
|
|
46
110
|
def delete_index(self, index: str, ignore: Optional[bool] = True) -> Dict:
|
|
@@ -49,11 +113,11 @@ class KataElastic:
|
|
|
49
113
|
:param: ignore bool: Ignore errors because of closed/deleted index.
|
|
50
114
|
:return: Dict of Elastic's acknowledgement of the action.
|
|
51
115
|
"""
|
|
52
|
-
response = self.elasticsearch.indices.delete(index=index, ignore_unavailable=ignore)
|
|
116
|
+
response = self.elasticsearch.indices.delete(index=index, ignore_unavailable=ignore, expand_wildcards="open")
|
|
53
117
|
return response
|
|
54
118
|
|
|
55
119
|
@_elastic_connection
|
|
56
|
-
def delete_document(self, index: str, document_id: str) ->
|
|
120
|
+
def delete_document(self, index: str, document_id: str) -> ObjectApiResponse[Any]:
|
|
57
121
|
"""Deletes document from index.
|
|
58
122
|
:param: document_id str: ID of the document to be deleted.
|
|
59
123
|
:param: index str: Index where the document is to be found.
|
|
@@ -63,6 +127,25 @@ class KataElastic:
|
|
|
63
127
|
response = self.elasticsearch.delete(id=document_id, index=index)
|
|
64
128
|
return response
|
|
65
129
|
|
|
130
|
+
@_elastic_connection
def bulk_index(
    self,
    documents: Iterator[dict],
    index_prefix: str,
    rollover_limit: int,
    refresh="false",
    create_index: bool = True
) -> tuple[int, int | list[dict[str, Any]]]:
    """Indexes documents in bulk into the current rollover index of a series.

    The target index is chosen by ``_produce_rollover_index``; every document
    of this call goes into that single index.

    :param: documents Iterator[dict]: Documents to index; each becomes the ``_source`` of one action.
    :param: index_prefix str: Prefix of the rollover index series.
    :param: rollover_limit int: Document count at which a new index name is started.
    :param: refresh: Forwarded unchanged to the bulk helper.
    :param: create_index bool: Create the target index (if missing) and apply the default mapping first.
    :return: The bulk helper's result: the number of successfully executed actions and its error report.
        NOTE(review): ``elasticsearch.helpers.bulk`` documents the second item as a list of
        errors unless ``stats_only=True`` is passed - confirm what callers expect.
    """
    target_index = self._produce_rollover_index(index_prefix, rollover_limit)
    if create_index:
        # create_index() does nothing when the index already exists; the
        # mapping is (re-)applied on every call either way.
        self.create_index(index=target_index)
        self.add_mapping(index_name=target_index, schema=self.generate_mapping())

    actions = [{"_index": target_index, "_source": document} for document in documents]
    successful_count, errors = bulk(actions=actions, client=self.elasticsearch, max_retries=3, refresh=refresh)
    return successful_count, errors
|
|
148
|
+
|
|
66
149
|
@_elastic_connection
|
|
67
150
|
def index_document(self, index: str, body: dict, document_id: Optional[str] = None) -> Dict:
|
|
68
151
|
"""Indexes document.
|
|
@@ -78,15 +161,15 @@ class KataElastic:
|
|
|
78
161
|
return indexed
|
|
79
162
|
|
|
80
163
|
@_elastic_connection
def get_documents_by_key(self, index: str, document_key: str, sort_fields=("start_page", "end_page", "sequence_nr",)):
    """Fetches every document whose ``parent_id`` matches the given key.

    :param: index str: Index prefix; all indices matching ``{index}-*`` are searched.
    :param: document_key str: Value matched against the ``parent_id`` field.
    :param: sort_fields tuple: Attribute names the returned documents are ordered by.
    :return: List of matching documents sorted by ``sort_fields``.
    """
    def _sort_key(doc):
        return [getattr(doc, field) for field in sort_fields]

    search = elasticsearch_dsl.Search(using=self.elasticsearch, index=f"{index}-*")
    search = search.query("match", parent_id=document_key)
    search = search.sort(*sort_fields)
    # Since scan doesn't allow for sorting, the hits are ordered here after fetching.
    return sorted(search.scan(), key=_sort_key)
|
|
90
173
|
|
|
91
174
|
def __str__(self) -> str:
|
|
92
|
-
|
|
175
|
+
return self.elasticsearch_url
|
rara_tools/s3.py
CHANGED
|
@@ -45,12 +45,13 @@ class S3Files:
|
|
|
45
45
|
raise S3InputException(f"File '{file_path}' does not exist in file system!")
|
|
46
46
|
return self.minio_client.fput_object(self.bucket, s3_path_name, file_path)
|
|
47
47
|
|
|
48
|
-
def list(self, prefix: Optional[str] = "") -> List:
|
|
49
|
-
"""Lists all available files in S3 bucket.
|
|
48
|
+
def list(self, prefix: Optional[str] = "", recursive: Optional[bool] = True) -> List:
    """Lists the object names available in the S3 bucket.

    :param: prefix str: Limits the listing to a given prefix.
    :param: recursive bool: Forwarded to the client's ``list_objects`` call.
    :return: List of object names found in S3.
    """
    found_objects = self.minio_client.list_objects(self.bucket, prefix=prefix, recursive=recursive)
    return [entry.object_name for entry in found_objects]
|
|
56
57
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
rara_tools/decorators.py,sha256=rYDk5CEHhCZvqeFaHku8qLMv7G7NTMWppHwLg3ZeVj4,2186
|
|
2
|
+
rara_tools/elastic.py,sha256=nNlCmoyKfCkM_2r1jtbpSpUn4S8IrLOKak17QwhNSvs,7146
|
|
3
|
+
rara_tools/exceptions.py,sha256=FtuHG-2snaEfADA25HjjutGNQzNo6sTdSfqk9VrzOuE,374
|
|
4
|
+
rara_tools/s3.py,sha256=eqMiOKbjXvXY04JJV68gmOU-4DUnwEaeYdhjQSI6crU,4440
|
|
5
|
+
rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
|
|
6
|
+
rara_tools/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
rara_tools/constants/digitizer.py,sha256=gJ3jOMwuZfKcLqgOAxTyB266VYsskLabJiMUiSz3xX4,297
|
|
8
|
+
rara_tools/constants/general.py,sha256=E9Jaw-YxocS_tOZw9QBoxO3e9KK5EMbLoM0R7D4Iflw,171
|
|
9
|
+
rara_tools-0.0.8.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
10
|
+
rara_tools-0.0.8.dist-info/METADATA,sha256=TMrOrd_YtH83jCAzbNpBrHlcN7ta6VQwYBD_HqH3unM,3820
|
|
11
|
+
rara_tools-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
12
|
+
rara_tools-0.0.8.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
13
|
+
rara_tools-0.0.8.dist-info/RECORD,,
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
rara_tools/decorators.py,sha256=rYDk5CEHhCZvqeFaHku8qLMv7G7NTMWppHwLg3ZeVj4,2186
|
|
2
|
-
rara_tools/elastic.py,sha256=MVqai6wDQlDQeHQzAKsRpxOchI29y3W1UiridgfH6d4,3718
|
|
3
|
-
rara_tools/exceptions.py,sha256=FtuHG-2snaEfADA25HjjutGNQzNo6sTdSfqk9VrzOuE,374
|
|
4
|
-
rara_tools/s3.py,sha256=NxvY98gQdu2wAM3UOS6c2UzOCB2h96x417h7GrD23fI,4330
|
|
5
|
-
rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
|
|
6
|
-
rara_tools-0.0.3.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
7
|
-
rara_tools-0.0.3.dist-info/METADATA,sha256=Jh9IQfWDepNQxHu67JitXPMyxyYS_sOfwrPcPg6jSpc,3820
|
|
8
|
-
rara_tools-0.0.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
9
|
-
rara_tools-0.0.3.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
10
|
-
rara_tools-0.0.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|