PyPI - arkindex-base-worker - Versions diffs - 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

arkindex-base-worker 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

{arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/METADATA +9 -12
arkindex_base_worker-0.5.0.dist-info/RECORD +60 -0
{arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/WHEEL +1 -1
{arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info}/top_level.txt +1 -0
arkindex_worker/__init__.py +3 -0
arkindex_worker/cache.py +6 -25
arkindex_worker/image.py +105 -66
arkindex_worker/utils.py +2 -1
arkindex_worker/worker/__init__.py +17 -31
arkindex_worker/worker/base.py +16 -9
arkindex_worker/worker/classification.py +36 -34
arkindex_worker/worker/corpus.py +3 -3
arkindex_worker/worker/dataset.py +9 -9
arkindex_worker/worker/element.py +261 -231
arkindex_worker/worker/entity.py +137 -206
arkindex_worker/worker/image.py +3 -3
arkindex_worker/worker/metadata.py +27 -38
arkindex_worker/worker/task.py +9 -9
arkindex_worker/worker/training.py +15 -11
arkindex_worker/worker/transcription.py +77 -71
examples/standalone/python/worker.py +171 -0
examples/tooled/python/worker.py +50 -0
tests/conftest.py +22 -36
tests/test_base_worker.py +1 -1
tests/test_cache.py +1 -2
tests/test_dataset_worker.py +1 -1
tests/test_elements_worker/test_element.py +200 -26
tests/test_elements_worker/{test_entity_create.py → test_entity.py} +220 -227
tests/test_elements_worker/test_metadata.py +0 -47
tests/test_elements_worker/test_training.py +8 -8
tests/test_elements_worker/test_worker.py +15 -14
tests/test_image.py +244 -126
tests/test_merge.py +0 -7
tests/test_utils.py +37 -0
arkindex_base_worker-0.4.0rc6.dist-info/RECORD +0 -61
arkindex_worker/worker/version.py +0 -58
tests/test_elements_worker/test_entity_list_and_check.py +0 -160
tests/test_elements_worker/test_version.py +0 -60
{arkindex_base_worker-0.4.0rc6.dist-info → arkindex_base_worker-0.5.0.dist-info/licenses}/LICENSE +0 -0

arkindex_worker/worker/base.py CHANGED Viewed

@@ -146,6 +146,13 @@ class BaseWorker:
         # Define API Client
         self.setup_api_client()
+        # Known and available classes in processed corpus
+        self.classes = {}
+        # Known and available entity types in processed corpus
+        self.entity_types = {}
+        # Known and available element types in processed corpus
+        self.corpus_types = {}
     @property
     def corpus_id(self) -> str:
         """
@@ -268,12 +275,12 @@ class BaseWorker:
         # Retrieve initial configuration from API
         self.config = worker_version["configuration"].get("configuration", {})
         if "user_configuration" in worker_version["configuration"]:
-            # Add default values (if set) to user_configuration
+            # Add missing values (using the provided default if set) to user_configuration
             for key, value in worker_version["configuration"][
                 "user_configuration"
             ].items():
-                if "default" in value and key not in self.model_configuration:
-                    self.user_configuration[key] = value["default"]
+                if key not in self.model_configuration:
+                    self.user_configuration[key] = value.get("default")
         # Load all required secrets
         required_secrets = worker_version["configuration"].get("secrets", [])
@@ -305,9 +312,9 @@ class BaseWorker:
         if self.use_cache:
             if self.args.database is not None:
-                assert (
-                    self.args.database.is_file()
-                ), f"Database in {self.args.database} does not exist"
+                assert self.args.database.is_file(), (
+                    f"Database in {self.args.database} does not exist"
+                )
                 self.cache_path = self.args.database
             else:
                 cache_dir = self.task_data_dir / self.task_id
@@ -378,9 +385,9 @@ class BaseWorker:
                 gpg = gnupg.GPG()
                 with path.open("rb") as gpg_file:
                     decrypted = gpg.decrypt_file(gpg_file)
-                assert (
-                    decrypted.ok
-                ), f"GPG error: {decrypted.status} - {decrypted.stderr}"
+                assert decrypted.ok, (
+                    f"GPG error: {decrypted.status} - {decrypted.stderr}"
+                )
                 secret = decrypted.data.decode("utf-8")
                 logging.info(f"Loaded local secret {name}")
             except Exception as e:

arkindex_worker/worker/classification.py CHANGED Viewed

@@ -27,7 +27,7 @@ class ClassificationMixin:
         )
         self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
         logger.info(
-            f'Loaded {len(self.classes)} ML {pluralize("class", len(self.classes))} in corpus ({self.corpus_id})'
+            f"Loaded {len(self.classes)} ML {pluralize('class', len(self.classes))} in corpus ({self.corpus_id})"
         )
     def get_ml_class_id(self, ml_class: str) -> str:
@@ -49,7 +49,7 @@ class ClassificationMixin:
                     "CreateMLClass", id=self.corpus_id, body={"name": ml_class}
                 )
                 ml_class_id = self.classes[ml_class] = response["id"]
-                logger.debug(f"Created ML class {response['id']}")
+                logger.debug(f"Created a new ML class {response['id']}")
             except ErrorResponse as e:
                 # Only reload for 400 errors
                 if e.status_code != 400:
@@ -57,12 +57,12 @@ class ClassificationMixin:
                 # Reload and make sure we have the class
                 logger.info(
-                    f"Reloading corpus classes to see if {ml_class} already exists"
+                    f"Unable to create the ML class `{ml_class}`. Refreshing corpus classes cache."
                 )
                 self.load_corpus_classes()
-                assert (
-                    ml_class in self.classes
-                ), "Missing class {ml_class} even after reloading"
+                assert ml_class in self.classes, (
+                    f"Missing ML class {ml_class} even after refreshing."
+                )
                 ml_class_id = self.classes[ml_class]
         return ml_class_id
@@ -86,9 +86,9 @@ class ClassificationMixin:
             ),
             None,
         )
-        assert (
-            ml_class_name is not None
-        ), f"Missing class with id ({ml_class_id}) in corpus ({self.corpus_id})"
+        assert ml_class_name is not None, (
+            f"Missing class with id ({ml_class_id}) in corpus ({self.corpus_id})"
+        )
         return ml_class_name
     def create_classification(
@@ -107,18 +107,18 @@ class ClassificationMixin:
         :param high_confidence: Whether or not the classification is of high confidence.
         :returns: The created classification, as returned by the ``CreateClassification`` API endpoint.
         """
-        assert element and isinstance(
-            element, Element | CachedElement
-        ), "element shouldn't be null and should be an Element or CachedElement"
-        assert ml_class and isinstance(
-            ml_class, str
-        ), "ml_class shouldn't be null and should be of type str"
-        assert (
-            isinstance(confidence, float) and 0 <= confidence <= 1
-        ), "confidence shouldn't be null and should be a float in [0..1] range"
-        assert isinstance(
-            high_confidence, bool
-        ), "high_confidence shouldn't be null and should be of type bool"
+        assert element and isinstance(element, Element | CachedElement), (
+            "element shouldn't be null and should be an Element or CachedElement"
+        )
+        assert ml_class and isinstance(ml_class, str), (
+            "ml_class shouldn't be null and should be of type str"
+        )
+        assert isinstance(confidence, float) and 0 <= confidence <= 1, (
+            "confidence shouldn't be null and should be a float in [0..1] range"
+        )
+        assert isinstance(high_confidence, bool), (
+            "high_confidence shouldn't be null and should be of type bool"
+        )
         if self.is_read_only:
             logger.warning(
                 "Cannot create classification as this worker is in read-only mode"
@@ -198,31 +198,33 @@ class ClassificationMixin:
         :returns: List of created classifications, as returned in the ``classifications`` field by
            the ``CreateClassifications`` API endpoint.
         """
-        assert element and isinstance(
-            element, Element | CachedElement
-        ), "element shouldn't be null and should be an Element or CachedElement"
-        assert classifications and isinstance(
-            classifications, list
-        ), "classifications shouldn't be null and should be of type list"
+        assert element and isinstance(element, Element | CachedElement), (
+            "element shouldn't be null and should be an Element or CachedElement"
+        )
+        assert classifications and isinstance(classifications, list), (
+            "classifications shouldn't be null and should be of type list"
+        )
         for index, classification in enumerate(classifications):
             ml_class = classification.get("ml_class")
-            assert (
-                ml_class and isinstance(ml_class, str)
-            ), f"Classification at index {index} in classifications: ml_class shouldn't be null and should be of type str"
+            assert ml_class and isinstance(ml_class, str), (
+                f"Classification at index {index} in classifications: ml_class shouldn't be null and should be of type str"
+            )
             confidence = classification.get("confidence")
             assert (
                 confidence is not None
                 and isinstance(confidence, float)
                 and 0 <= confidence <= 1
-            ), f"Classification at index {index} in classifications: confidence shouldn't be null and should be a float in [0..1] range"
+            ), (
+                f"Classification at index {index} in classifications: confidence shouldn't be null and should be a float in [0..1] range"
+            )
             high_confidence = classification.get("high_confidence")
             if high_confidence is not None:
-                assert isinstance(
-                    high_confidence, bool
-                ), f"Classification at index {index} in classifications: high_confidence should be of type bool"
+                assert isinstance(high_confidence, bool), (
+                    f"Classification at index {index} in classifications: high_confidence should be of type bool"
+                )
         if self.is_read_only:
             logger.warning(

arkindex_worker/worker/corpus.py CHANGED Viewed

@@ -76,9 +76,9 @@ class CorpusMixin:
             key=itemgetter("updated"),
             reverse=True,
         )
-        assert (
-            len(exports) > 0
-        ), f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
+        assert len(exports) > 0, (
+            f'No available exports found for the corpus ({self.corpus_id}) with state "{CorpusExportState.Done.value.capitalize()}".'
+        )
         # Download latest export
         export_id: str = exports[0]["id"]

arkindex_worker/worker/dataset.py CHANGED Viewed

@@ -113,9 +113,9 @@ class DatasetMixin:
         :param dataset_set: Set to find elements in.
         :returns: An iterator of Element built from the ``ListDatasetElements`` API endpoint.
         """
-        assert dataset_set and isinstance(
-            dataset_set, Set
-        ), "dataset_set shouldn't be null and should be a Set"
+        assert dataset_set and isinstance(dataset_set, Set), (
+            "dataset_set shouldn't be null and should be a Set"
+        )
         results = self.api_client.paginate(
             "ListDatasetElements", id=dataset_set.dataset.id, set=dataset_set.name
@@ -152,12 +152,12 @@ class DatasetMixin:
         :param state: State of the dataset.
         :returns: The updated ``Dataset`` object from the ``PartialUpdateDataset`` API endpoint.
         """
-        assert dataset and isinstance(
-            dataset, Dataset
-        ), "dataset shouldn't be null and should be a Dataset"
-        assert state and isinstance(
-            state, DatasetState
-        ), "state shouldn't be null and should be a str from DatasetState"
+        assert dataset and isinstance(dataset, Dataset), (
+            "dataset shouldn't be null and should be a Dataset"
+        )
+        assert state and isinstance(state, DatasetState), (
+            "state shouldn't be null and should be a str from DatasetState"
+        )
         if self.is_read_only:
             logger.warning("Cannot update dataset as this worker is in read-only mode")

arkindex-base-worker 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl

arkindex-base-worker 0.4.0rc6__py3-none-any.whl → 0.5.0__py3-none-any.whl