rara-tools 0.0.4__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

File without changes
@@ -0,0 +1,13 @@
1
+ class StatusKeys:
2
+ CLEAN_UP = "digitizer_clean_up"
3
+ ELASTICSEARCH_UPLOAD = "digitizer_elasticsearch_upload"
4
+ UPLOAD = "s3_upload"
5
+ DOWNLOAD = "digitizer_s3_download"
6
+ OCR = "digitizer_ocr"
7
+
8
+
9
+ class Queue:
10
+ IO = "io"
11
+ DOWNLOAD = "download"
12
+ FINISH = "finish"
13
+ OCR = "ocr"
@@ -0,0 +1,10 @@
1
+ class Status:
2
+ FAILED = "FAILED"
3
+ PENDING = "PENDING"
4
+ RUNNING = "RUNNING"
5
+ COMPLETED = "COMPLETED"
6
+ RETRYING = "RETRYING"
7
+
8
+
9
+ class Queue:
10
+ CORE = "core"
rara_tools/elastic.py CHANGED
@@ -1,6 +1,10 @@
1
- from typing import Dict, Optional, List
1
+ from typing import Dict, Optional, Any, Iterator
2
+
3
+ import elasticsearch_dsl
4
+ from elastic_transport import ObjectApiResponse
2
5
  from elasticsearch import Elasticsearch
3
- from elasticsearch_dsl import Search
6
+ from elasticsearch.helpers import bulk
7
+ from elasticsearch_dsl import Index
4
8
 
5
9
  from .decorators import _elastic_connection
6
10
 
@@ -8,11 +12,54 @@ from .decorators import _elastic_connection
8
12
  class KataElastic:
9
13
  """A class to manage all required Elasticsearch operations for Kata.
10
14
  """
15
+
16
+ TYPE_MAPPING = {
17
+ "keyword": elasticsearch_dsl.Keyword,
18
+ "text": elasticsearch_dsl.Text,
19
+ "float": elasticsearch_dsl.Float,
20
+ "integer": elasticsearch_dsl.Integer,
21
+ "date": elasticsearch_dsl.Date,
22
+ }
23
+
24
+ DEFAULT_MAPPING = {
25
+ "text": "keyword",
26
+ "parent_id": "keyword",
27
+ "text_quality": "float",
28
+ "n_chars": "integer",
29
+ "n_words": "integer",
30
+ "language": "keyword",
31
+ "end_page": "integer",
32
+ "start_page": "integer",
33
+ "sequence_nr": "integer",
34
+ "section_title": "keyword",
35
+ "section_type": "keyword",
36
+ "section_meta": "keyword",
37
+ }
38
+
11
39
  def __init__(self, elasticsearch_url: str, timeout: Optional[int] = None):
12
40
  self.timeout = timeout
13
41
  self.elasticsearch_url = elasticsearch_url
14
42
  self.elasticsearch = Elasticsearch(self.elasticsearch_url, request_timeout=self.timeout)
15
43
 
44
+ def _produce_rollover_index(self, index_prefix: str, rollover_limit: int) -> str:
45
+ indices = self.elasticsearch.indices.get(index=f"{index_prefix}-*", expand_wildcards="open")
46
+ sorted_indices = sorted([(k, v["settings"]["index"]["creation_date"]) for k, v in indices.items()], key=lambda x: x[1], reverse=True)
47
+ sorted_indices = [i[0] for i in sorted_indices]
48
+
49
+ # new index name if none exist
50
+ if not len(sorted_indices):
51
+ last_index_name = f"{index_prefix}-0"
52
+ last_index_count = 0
53
+ else:
54
+ last_index_name = sorted_indices[0]
55
+ last_index_count = self.elasticsearch.count(index=last_index_name)["count"]
56
+ # check the size of the last index of the pipeline
57
+ if last_index_count >= rollover_limit:
58
+ new_index_number = int(last_index_name[-1]) + 1
59
+ last_index_name = f"{index_prefix}-{new_index_number}"
60
+
61
+ return last_index_name
62
+
16
63
  @_elastic_connection
17
64
  def check(self) -> bool:
18
65
  """Checks Elasticsearch connection.
@@ -22,25 +69,42 @@ class KataElastic:
22
69
  return True
23
70
  return False
24
71
 
72
+ def generate_mapping(self, schema: dict | None = None) -> dict:
73
+ mapping_dsl = elasticsearch_dsl.Mapping()
74
+ mapping = schema or self.DEFAULT_MAPPING
75
+ for field_name, field_type in mapping.items():
76
+ if field_type in self.TYPE_MAPPING:
77
+ # We instantiate the class stored in the type mapping.
78
+ mapping_dsl.field(field_name, self.TYPE_MAPPING[field_type]())
79
+ return mapping_dsl.to_dict()
80
+
81
+ @_elastic_connection
82
+ def add_mapping(self, index_name: str, schema: dict):
83
+ index = Index(name=index_name)
84
+ return index.put_mapping(body=schema, using=self.elasticsearch)
85
+
25
86
  @_elastic_connection
26
87
  def create_index(
27
88
  self,
28
89
  index: str,
29
90
  shards: int = 3,
30
91
  replicas: int = 1,
31
- settings: Optional[dict] = None
32
- ) -> Dict:
92
+ settings: Optional[dict] = None,
93
+ ) -> Dict | None:
33
94
  """Creates empty index.
34
95
  :param: index str: Name of the index to create.
35
96
  :param: shards int: Number of shards for the index.
36
97
  :param: replicas int: Number of replicas of the index.
37
98
  :param: settings dict: Overwrite settings for the index.
38
99
  """
39
- body = settings or {
40
- "number_of_shards": shards,
41
- "number_of_replicas": replicas,
42
- }
43
- return self.elasticsearch.indices.create(index=index, settings=body)
100
+
101
+ index_exists = self.elasticsearch.indices.exists(index=index).body
102
+ if index_exists is False:
103
+ setting_body = settings or {
104
+ "number_of_shards": shards,
105
+ "number_of_replicas": replicas,
106
+ }
107
+ return self.elasticsearch.indices.create(index=index, settings=setting_body)
44
108
 
45
109
  @_elastic_connection
46
110
  def delete_index(self, index: str, ignore: Optional[bool] = True) -> Dict:
@@ -49,11 +113,11 @@ class KataElastic:
49
113
  :param: ignore bool: Ignore errors because of closed/deleted index.
50
114
  :return: Dict of Elastic's acknowledgement of the action.
51
115
  """
52
- response = self.elasticsearch.indices.delete(index=index, ignore_unavailable=ignore)
116
+ response = self.elasticsearch.indices.delete(index=index, ignore_unavailable=ignore, expand_wildcards="open")
53
117
  return response
54
118
 
55
119
  @_elastic_connection
56
- def delete_document(self, index: str, document_id: str) -> Dict:
120
+ def delete_document(self, index: str, document_id: str) -> ObjectApiResponse[Any]:
57
121
  """Deletes document from index.
58
122
  :param: document_id str: ID of the document to be deleted.
59
123
  :param: index str: Index where the document is to be found.
@@ -63,6 +127,25 @@ class KataElastic:
63
127
  response = self.elasticsearch.delete(id=document_id, index=index)
64
128
  return response
65
129
 
130
+ @_elastic_connection
131
+ def bulk_index(
132
+ self,
133
+ documents: Iterator[dict],
134
+ index_prefix: str,
135
+ rollover_limit: int,
136
+ refresh="false",
137
+ create_index: bool = True
138
+ ) -> (int, int):
139
+ last_index_name = self._produce_rollover_index(index_prefix, rollover_limit)
140
+ if create_index:
141
+ response = self.create_index(index=last_index_name)
142
+ response = self.add_mapping(index_name=last_index_name, schema=self.generate_mapping())
143
+ pass
144
+
145
+ actions = [{"_index": last_index_name, "_source": document} for document in documents]
146
+ successful_count, error_count = bulk(actions=actions, client=self.elasticsearch, max_retries=3, refresh=refresh)
147
+ return successful_count, error_count
148
+
66
149
  @_elastic_connection
67
150
  def index_document(self, index: str, body: dict, document_id: Optional[str] = None) -> Dict:
68
151
  """Indexes document.
@@ -78,15 +161,15 @@ class KataElastic:
78
161
  return indexed
79
162
 
80
163
  @_elastic_connection
81
- def get_documents_by_key(self, index: str, document_key: str) -> List:
82
- """This method is for retrieving all texts/pages of the original document.
83
- :param: index str: Index to search the documents from.
84
- :param: document_key str: parent_id field that connects pages of document together.
85
- :return: List of matching documents.
86
- """
87
- s = Search(using=self.elasticsearch, index=index)
88
- docs = s.query("match", parent_id=document_key).execute()
89
- return docs
164
+ def get_documents_by_key(self, index: str, document_key: str, sort_fields=("start_page", "end_page", "sequence_nr",)):
165
+ index = f"{index}-*"
166
+ s = elasticsearch_dsl.Search(using=self.elasticsearch, index=index)
167
+ s = s.query("match", parent_id=document_key).sort(*sort_fields)
168
+ # Since scan doesn't allow for sorting, we do it manually after fetching the documents.
169
+ documents = sorted(
170
+ s.scan(), key=lambda doc: [getattr(doc, field) for field in sort_fields]
171
+ )
172
+ return documents
90
173
 
91
174
  def __str__(self) -> str:
92
- return self.elasticsearch_url
175
+ return self.elasticsearch_url
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: rara-tools
3
- Version: 0.0.4
3
+ Version: 0.0.8
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -0,0 +1,13 @@
1
+ rara_tools/decorators.py,sha256=rYDk5CEHhCZvqeFaHku8qLMv7G7NTMWppHwLg3ZeVj4,2186
2
+ rara_tools/elastic.py,sha256=nNlCmoyKfCkM_2r1jtbpSpUn4S8IrLOKak17QwhNSvs,7146
3
+ rara_tools/exceptions.py,sha256=FtuHG-2snaEfADA25HjjutGNQzNo6sTdSfqk9VrzOuE,374
4
+ rara_tools/s3.py,sha256=eqMiOKbjXvXY04JJV68gmOU-4DUnwEaeYdhjQSI6crU,4440
5
+ rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
6
+ rara_tools/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ rara_tools/constants/digitizer.py,sha256=gJ3jOMwuZfKcLqgOAxTyB266VYsskLabJiMUiSz3xX4,297
8
+ rara_tools/constants/general.py,sha256=E9Jaw-YxocS_tOZw9QBoxO3e9KK5EMbLoM0R7D4Iflw,171
9
+ rara_tools-0.0.8.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
10
+ rara_tools-0.0.8.dist-info/METADATA,sha256=TMrOrd_YtH83jCAzbNpBrHlcN7ta6VQwYBD_HqH3unM,3820
11
+ rara_tools-0.0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
12
+ rara_tools-0.0.8.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
13
+ rara_tools-0.0.8.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- rara_tools/decorators.py,sha256=rYDk5CEHhCZvqeFaHku8qLMv7G7NTMWppHwLg3ZeVj4,2186
2
- rara_tools/elastic.py,sha256=MVqai6wDQlDQeHQzAKsRpxOchI29y3W1UiridgfH6d4,3718
3
- rara_tools/exceptions.py,sha256=FtuHG-2snaEfADA25HjjutGNQzNo6sTdSfqk9VrzOuE,374
4
- rara_tools/s3.py,sha256=eqMiOKbjXvXY04JJV68gmOU-4DUnwEaeYdhjQSI6crU,4440
5
- rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
6
- rara_tools-0.0.4.dist-info/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
7
- rara_tools-0.0.4.dist-info/METADATA,sha256=onb2qPn7IXknjCM09yxKfuEZYJlTlUwetCrS2ZXlPio,3820
8
- rara_tools-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
9
- rara_tools-0.0.4.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
10
- rara_tools-0.0.4.dist-info/RECORD,,