arkindex-base-worker 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/METADATA +9 -12
  2. arkindex_base_worker-0.5.0.dist-info/RECORD +60 -0
  3. {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/WHEEL +1 -1
  4. {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/top_level.txt +1 -0
  5. arkindex_worker/__init__.py +3 -0
  6. arkindex_worker/cache.py +6 -25
  7. arkindex_worker/image.py +105 -66
  8. arkindex_worker/utils.py +2 -1
  9. arkindex_worker/worker/__init__.py +17 -31
  10. arkindex_worker/worker/base.py +16 -9
  11. arkindex_worker/worker/classification.py +36 -34
  12. arkindex_worker/worker/corpus.py +3 -3
  13. arkindex_worker/worker/dataset.py +9 -9
  14. arkindex_worker/worker/element.py +261 -231
  15. arkindex_worker/worker/entity.py +137 -206
  16. arkindex_worker/worker/image.py +3 -3
  17. arkindex_worker/worker/metadata.py +27 -38
  18. arkindex_worker/worker/task.py +9 -9
  19. arkindex_worker/worker/training.py +15 -11
  20. arkindex_worker/worker/transcription.py +77 -71
  21. examples/standalone/python/worker.py +171 -0
  22. examples/tooled/python/worker.py +50 -0
  23. tests/conftest.py +22 -36
  24. tests/test_base_worker.py +1 -1
  25. tests/test_cache.py +1 -2
  26. tests/test_dataset_worker.py +1 -1
  27. tests/test_elements_worker/test_element.py +200 -26
  28. tests/test_elements_worker/{test_entity_create.py → test_entity.py} +220 -227
  29. tests/test_elements_worker/test_metadata.py +0 -47
  30. tests/test_elements_worker/test_training.py +8 -8
  31. tests/test_elements_worker/test_worker.py +15 -14
  32. tests/test_image.py +244 -126
  33. tests/test_merge.py +0 -7
  34. tests/test_utils.py +37 -0
  35. arkindex_base_worker-0.4.0rc6.dist-info/RECORD +0 -61
  36. arkindex_worker/worker/version.py +0 -58
  37. tests/test_elements_worker/test_entity_list_and_check.py +0 -160
  38. tests/test_elements_worker/test_version.py +0 -60
  39. {arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info/licenses}/LICENSE +0 -0
@@ -146,6 +146,13 @@ class BaseWorker:
146
146
  # Define API Client
147
147
  self.setup_api_client()
148
148
 
149
+ # Known and available classes in processed corpus
150
+ self.classes = {}
151
+ # Known and available entity types in processed corpus
152
+ self.entity_types = {}
153
+ # Known and available element types in processed corpus
154
+ self.corpus_types = {}
155
+
149
156
  @property
150
157
  def corpus_id(self) -> str:
151
158
  """
@@ -268,12 +275,12 @@ class BaseWorker:
268
275
  # Retrieve initial configuration from API
269
276
  self.config = worker_version["configuration"].get("configuration", {})
270
277
  if "user_configuration" in worker_version["configuration"]:
271
- # Add default values (if set) to user_configuration
278
+ # Add missing values (using the provided default if set) to user_configuration
272
279
  for key, value in worker_version["configuration"][
273
280
  "user_configuration"
274
281
  ].items():
275
- if "default" in value and key not in self.model_configuration:
276
- self.user_configuration[key] = value["default"]
282
+ if key not in self.model_configuration:
283
+ self.user_configuration[key] = value.get("default")
277
284
 
278
285
  # Load all required secrets
279
286
  required_secrets = worker_version["configuration"].get("secrets", [])
@@ -305,9 +312,9 @@ class BaseWorker:
305
312
 
306
313
  if self.use_cache:
307
314
  if self.args.database is not None:
308
- assert (
309
- self.args.database.is_file()
310
- ), f"Database in {self.args.database} does not exist"
315
+ assert self.args.database.is_file(), (
316
+ f"Database in {self.args.database} does not exist"
317
+ )
311
318
  self.cache_path = self.args.database
312
319
  else:
313
320
  cache_dir = self.task_data_dir / self.task_id
@@ -378,9 +385,9 @@ class BaseWorker:
378
385
  gpg = gnupg.GPG()
379
386
  with path.open("rb") as gpg_file:
380
387
  decrypted = gpg.decrypt_file(gpg_file)
381
- assert (
382
- decrypted.ok
383
- ), f"GPG error: {decrypted.status} - {decrypted.stderr}"
388
+ assert decrypted.ok, (
389
+ f"GPG error: {decrypted.status} - {decrypted.stderr}"
390
+ )
384
391
  secret = decrypted.data.decode("utf-8")
385
392
  logging.info(f"Loaded local secret {name}")
386
393
  except Exception as e:
@@ -27,7 +27,7 @@ class ClassificationMixin:
27
27
  )
28
28
  self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
29
29
  logger.info(
30
- f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
30
+ f"Loaded {len(self.classes)} ML {pluralize('class', len(self.classes))} in corpus ({self.corpus_id})"
31
31
  )
32
32
 
33
33
  def get_ml_class_id(self, ml_class: str) -> str:
@@ -49,7 +49,7 @@ class ClassificationMixin:
49
49
  "CreateMLClass", id=self.corpus_id, body={"name": ml_class}
50
50
  )
51
51
  ml_class_id = self.classes[ml_class] = response["id"]
52
- logger.debug(f"Created ML class {response['id']}")
52
+ logger.debug(f"Created a new ML class {response['id']}")
53
53
  except ErrorResponse as e:
54
54
  # Only reload for 400 errors
55
55
  if e.status_code != 400:
@@ -57,12 +57,12 @@ class ClassificationMixin:
57
57
 
58
58
  # Reload and make sure we have the class
59
59
  logger.info(
60
- f"Reloading corpus classes to see if {ml_class} already exists"
60
+ f"Unable to create the ML class `{ml_class}`. Refreshing corpus classes cache."
61
61
  )
62
62
  self.load_corpus_classes()
63
- assert (
64
- ml_class in self.classes
65
- ), "Missing class {ml_class} even after reloading"
63
+ assert ml_class in self.classes, (
64
+ f"Missing ML class {ml_class} even after refreshing."
65
+ )
66
66
  ml_class_id = self.classes[ml_class]
67
67
 
68
68
  return ml_class_id
@@ -86,9 +86,9 @@ class ClassificationMixin:
86
86
  ),
87
87
  None,
88
88
  )
89
- assert (
90
- ml_class_name is not None
91
- ), f"Missing class with id ({ml_class_id}) in corpus ({self.corpus_id})"
89
+ assert ml_class_name is not None, (
90
+ f"Missing class with id ({ml_class_id}) in corpus ({self.corpus_id})"
91
+ )
92
92
  return ml_class_name
93
93
 
94
94
  def create_classification(
@@ -107,18 +107,18 @@ class ClassificationMixin:
107
107
  :param high_confidence: Whether or not the classification is of high confidence.
108
108
  :returns: The created classification, as returned by the ``CreateClassification`` API endpoint.
109
109
  """
110
- assert element and isinstance(
111
- element, Element | CachedElement
112
- ), "element shouldn't be null and should be an Element or CachedElement"
113
- assert ml_class and isinstance(
114
- ml_class, str
115
- ), "ml_class shouldn't be null and should be of type str"
116
- assert (
117
- isinstance(confidence, float) and 0 <= confidence <= 1
118
- ), "confidence shouldn't be null and should be a float in [0..1] range"
119
- assert isinstance(
120
- high_confidence, bool
121
- ), "high_confidence shouldn't be null and should be of type bool"
110
+ assert element and isinstance(element, Element | CachedElement), (
111
+ "element shouldn't be null and should be an Element or CachedElement"
112
+ )
113
+ assert ml_class and isinstance(ml_class, str), (
114
+ "ml_class shouldn't be null and should be of type str"
115
+ )
116
+ assert isinstance(confidence, float) and 0 <= confidence <= 1, (
117
+ "confidence shouldn't be null and should be a float in [0..1] range"
118
+ )
119
+ assert isinstance(high_confidence, bool), (
120
+ "high_confidence shouldn't be null and should be of type bool"
121
+ )
122
122
  if self.is_read_only:
123
123
  logger.warning(
124
124
  "Cannot create classification as this worker is in read-only mode"
@@ -198,31 +198,33 @@ class ClassificationMixin:
198
198
  :returns: List of created classifications, as returned in the ``classifications`` field by
199
199
  the ``CreateClassifications`` API endpoint.
200
200
  """
201
- assert element and isinstance(
202
- element, Element | CachedElement
203
- ), "element shouldn't be null and should be an Element or CachedElement"
204
- assert classifications and isinstance(
205
- classifications, list
206
- ), "classifications shouldn't be null and should be of type list"
201
+ assert element and isinstance(element, Element | CachedElement), (
202
+ "element shouldn't be null and should be an Element or CachedElement"
203
+ )
204
+ assert classifications and isinstance(classifications, list), (
205
+ "classifications shouldn't be null and should be of type list"
206
+ )
207
207
 
208
208
  for index, classification in enumerate(classifications):
209
209
  ml_class = classification.get("ml_class")
210
- assert (
211
- ml_class and isinstance(ml_class, str)
212
- ), f"Classification at index {index} in classifications: ml_class shouldn't be null and should be of type str"
210
+ assert ml_class and isinstance(ml_class, str), (
211
+ f"Classification at index {index} in classifications: ml_class shouldn't be null and should be of type str"
212
+ )
213
213
 
214
214
  confidence = classification.get("confidence")
215
215
  assert (
216
216
  confidence is not None
217
217
  and isinstance(confidence, float)
218
218
  and 0 <= confidence <= 1
219
- ), f"Classification at index {index} in classifications: confidence shouldn't be null and should be a float in [0..1] range"
219
+ ), (
220
+ f"Classification at index {index} in classifications: confidence shouldn't be null and should be a float in [0..1] range"
221
+ )
220
222
 
221
223
  high_confidence = classification.get("high_confidence")
222
224
  if high_confidence is not None:
223
- assert isinstance(
224
- high_confidence, bool
225
- ), f"Classification at index {index} in classifications: high_confidence should be of type bool"
225
+ assert isinstance(high_confidence, bool), (
226
+ f"Classification at index {index} in classifications: high_confidence should be of type bool"
227
+ )
226
228
 
227
229
  if self.is_read_only:
228
230
  logger.warning(
@@ -76,9 +76,9 @@ class CorpusMixin:
76
76
  key=itemgetter("updated"),
77
77
  reverse=True,
78
78
  )
79
- assert (
80
- len(exports) > 0
81
- ), f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
79
+ assert len(exports) > 0, (
80
+ f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
81
+ )
82
82
 
83
83
  # Download latest export
84
84
  export_id: str = exports[0]["id"]
@@ -113,9 +113,9 @@ class DatasetMixin:
113
113
  :param dataset_set: Set to find elements in.
114
114
  :returns: An iterator of Element built from the ``ListDatasetElements`` API endpoint.
115
115
  """
116
- assert dataset_set and isinstance(
117
- dataset_set, Set
118
- ), "dataset_set shouldn't be null and should be a Set"
116
+ assert dataset_set and isinstance(dataset_set, Set), (
117
+ "dataset_set shouldn't be null and should be a Set"
118
+ )
119
119
 
120
120
  results = self.api_client.paginate(
121
121
  "ListDatasetElements", id=dataset_set.dataset.id, set=dataset_set.name
@@ -152,12 +152,12 @@ class DatasetMixin:
152
152
  :param state: State of the dataset.
153
153
  :returns: The updated ``Dataset`` object from the ``PartialUpdateDataset`` API endpoint.
154
154
  """
155
- assert dataset and isinstance(
156
- dataset, Dataset
157
- ), "dataset shouldn't be null and should be a Dataset"
158
- assert state and isinstance(
159
- state, DatasetState
160
- ), "state shouldn't be null and should be a str from DatasetState"
155
+ assert dataset and isinstance(dataset, Dataset), (
156
+ "dataset shouldn't be null and should be a Dataset"
157
+ )
158
+ assert state and isinstance(state, DatasetState), (
159
+ "state shouldn't be null and should be a str from DatasetState"
160
+ )
161
161
 
162
162
  if self.is_read_only:
163
163
  logger.warning("Cannot update dataset as this worker is in read-only mode")