arkindex-base-worker 0.5.0b3__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,16 +11,14 @@ from peewee import IntegrityError
11
11
  from arkindex.exceptions import ErrorResponse
12
12
  from arkindex_worker import logger
13
13
  from arkindex_worker.cache import (
14
- CachedEntity,
15
14
  CachedTranscriptionEntity,
16
15
  unsupported_cache,
17
16
  )
18
- from arkindex_worker.models import Element, Transcription
17
+ from arkindex_worker.models import Transcription
19
18
  from arkindex_worker.utils import pluralize
20
19
 
21
20
 
22
21
  class Entity(TypedDict):
23
- name: str
24
22
  type_id: str
25
23
  length: int
26
24
  offset: int
@@ -126,88 +124,20 @@ class EntityMixin:
126
124
  # Create the type if non-existent
127
125
  self.create_entity_type(entity_type)
128
126
 
129
- def create_entity(
130
- self,
131
- name: str,
132
- type: str,
133
- metas=None,
134
- validated=None,
135
- ):
136
- """
137
- Create an entity on the given corpus.
138
- If cache support is enabled, a [CachedEntity][arkindex_worker.cache.CachedEntity] will also be created.
139
-
140
- :param name: Name of the entity.
141
- :param type: Type of the entity.
142
- """
143
- assert name and isinstance(name, str), (
144
- "name shouldn't be null and should be of type str"
145
- )
146
- assert type and isinstance(type, str), (
147
- "type shouldn't be null and should be of type str"
148
- )
149
- metas = metas or {}
150
- if metas:
151
- assert isinstance(metas, dict), "metas should be of type dict"
152
- if validated is not None:
153
- assert isinstance(validated, bool), "validated should be of type bool"
154
- if self.is_read_only:
155
- logger.warning("Cannot create entity as this worker is in read-only mode")
156
- return
157
-
158
- # Retrieve entity_type ID
159
- if not self.entity_types:
160
- # Load entity_types of corpus
161
- self.list_corpus_entity_types()
162
-
163
- entity_type_id = self.entity_types.get(type)
164
- assert entity_type_id, f"Entity type `{type}` not found in the corpus."
165
-
166
- entity = self.api_client.request(
167
- "CreateEntity",
168
- body={
169
- "name": name,
170
- "type_id": entity_type_id,
171
- "metas": metas,
172
- "validated": validated,
173
- "corpus": self.corpus_id,
174
- "worker_run_id": self.worker_run_id,
175
- },
176
- )
177
-
178
- if self.use_cache:
179
- # Store entity in local cache
180
- try:
181
- to_insert = [
182
- {
183
- "id": entity["id"],
184
- "type": type,
185
- "name": name,
186
- "validated": validated if validated is not None else False,
187
- "metas": metas,
188
- "worker_run_id": self.worker_run_id,
189
- }
190
- ]
191
- CachedEntity.insert_many(to_insert).execute()
192
- except IntegrityError as e:
193
- logger.warning(f"Couldn't save created entity in local cache: {e}")
194
-
195
- return entity["id"]
196
-
197
127
  def create_transcription_entity(
198
128
  self,
199
129
  transcription: Transcription,
200
- entity: str,
130
+ type_id: str,
201
131
  offset: int,
202
132
  length: int,
203
133
  confidence: float | None = None,
204
134
  ) -> dict[str, str | int] | None:
205
135
  """
206
- Create a link between an existing entity and an existing transcription.
136
+ Create an entity on an existing transcription.
207
137
  If cache support is enabled, a `CachedTranscriptionEntity` will also be created.
208
138
 
209
139
  :param transcription: Transcription to create the entity on.
210
- :param entity: UUID of the existing entity.
140
+ :param type_id: UUID of the entity type.
211
141
  :param offset: Starting position of the entity in the transcription's text,
212
142
  as a 0-based index.
213
143
  :param length: Length of the entity in the transcription's text.
@@ -218,8 +148,8 @@ class EntityMixin:
218
148
  assert transcription and isinstance(transcription, Transcription), (
219
149
  "transcription shouldn't be null and should be a Transcription"
220
150
  )
221
- assert entity and isinstance(entity, str), (
222
- "entity shouldn't be null and should be of type str"
151
+ assert type_id and isinstance(type_id, str), (
152
+ "type_id shouldn't be null and should be of type str"
223
153
  )
224
154
  assert offset is not None and isinstance(offset, int) and offset >= 0, (
225
155
  "offset shouldn't be null and should be a positive integer"
@@ -237,7 +167,7 @@ class EntityMixin:
237
167
  return
238
168
 
239
169
  body = {
240
- "entity": entity,
170
+ "type_id": type_id,
241
171
  "length": length,
242
172
  "offset": offset,
243
173
  "worker_run_id": self.worker_run_id,
@@ -245,7 +175,7 @@ class EntityMixin:
245
175
  if confidence is not None:
246
176
  body["confidence"] = confidence
247
177
 
248
- transcription_ent = self.api_client.request(
178
+ tr_entity = self.api_client.request(
249
179
  "CreateTranscriptionEntity",
250
180
  id=transcription.id,
251
181
  body=body,
@@ -256,7 +186,7 @@ class EntityMixin:
256
186
  try:
257
187
  CachedTranscriptionEntity.create(
258
188
  transcription=transcription.id,
259
- entity=entity,
189
+ type=tr_entity["type"]["name"],
260
190
  offset=offset,
261
191
  length=length,
262
192
  worker_run_id=self.worker_run_id,
@@ -267,7 +197,7 @@ class EntityMixin:
267
197
  f"Couldn't save created transcription entity in local cache: {e}"
268
198
  )
269
199
 
270
- return transcription_ent
200
+ return tr_entity
271
201
 
272
202
  @unsupported_cache
273
203
  def create_transcription_entities(
@@ -276,14 +206,11 @@ class EntityMixin:
276
206
  entities: list[Entity],
277
207
  ) -> list[dict[str, str]]:
278
208
  """
279
- Create multiple entities attached to a transcription in a single API request.
209
+ Create multiple entities on a transcription in a single API request.
280
210
 
281
211
  :param transcription: Transcription to create the entity on.
282
212
  :param entities: List of dicts, one per element. Each dict can have the following keys:
283
213
 
284
- name (str)
285
- Required. Name of the entity.
286
-
287
214
  type_id (str)
288
215
  Required. ID of the EntityType of the entity.
289
216
 
@@ -296,7 +223,7 @@ class EntityMixin:
296
223
  confidence (float or None)
297
224
  Optional confidence score, between 0.0 and 1.0.
298
225
 
299
- :return: List of dicts, with each dict having a two keys, `transcription_entity_id` and `entity_id`, holding the UUID of each created object.
226
+ :return: List of dicts, each holding the UUID of a created transcription entity.
300
227
  """
301
228
  assert transcription and isinstance(transcription, Transcription), (
302
229
  "transcription shouldn't be null and should be of type Transcription"
@@ -311,11 +238,6 @@ class EntityMixin:
311
238
  f"Entity at index {index} in entities: Should be of type dict"
312
239
  )
313
240
 
314
- name = entity.get("name")
315
- assert name and isinstance(name, str), (
316
- f"Entity at index {index} in entities: name shouldn't be null and should be of type str"
317
- )
318
-
319
241
  type_id = entity.get("type_id")
320
242
  assert type_id and isinstance(type_id, str), (
321
243
  f"Entity at index {index} in entities: type_id shouldn't be null and should be of type str"
@@ -339,7 +261,7 @@ class EntityMixin:
339
261
  )
340
262
 
341
263
  assert len(entities) == len(
342
- set(map(itemgetter("offset", "length", "name", "type_id"), entities))
264
+ set(map(itemgetter("offset", "length", "type_id"), entities))
343
265
  ), "entities should be unique"
344
266
 
345
267
  if self.is_read_only:
@@ -348,16 +270,16 @@ class EntityMixin:
348
270
  )
349
271
  return
350
272
 
351
- created_entities = self.api_client.request(
273
+ created_tr_entities = self.api_client.request(
352
274
  "CreateTranscriptionEntities",
353
275
  id=transcription.id,
354
276
  body={
355
277
  "worker_run_id": self.worker_run_id,
356
- "entities": entities,
278
+ "transcription_entities": entities,
357
279
  },
358
- )["entities"]
280
+ )["transcription_entities"]
359
281
 
360
- return created_entities
282
+ return created_tr_entities
361
283
 
362
284
  def list_transcription_entities(
363
285
  self,
@@ -412,34 +334,3 @@ class EntityMixin:
412
334
  return self.api_client.paginate(
413
335
  "ListTranscriptionEntities", id=transcription.id, **query_params
414
336
  )
415
-
416
- def list_corpus_entities(
417
- self,
418
- name: str | None = None,
419
- parent: Element | None = None,
420
- ):
421
- """
422
- List all entities in the worker's corpus and store them in the ``self.entities`` cache.
423
- :param name: Filter entities by part of their name (case-insensitive)
424
- :param parent: Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.
425
- """
426
- query_params = {}
427
-
428
- if name is not None:
429
- assert name and isinstance(name, str), "name should be of type str"
430
- query_params["name"] = name
431
-
432
- if parent is not None:
433
- assert isinstance(parent, Element), "parent should be of type Element"
434
- query_params["parent"] = parent.id
435
-
436
- self.entities = {
437
- entity["id"]: entity
438
- for entity in self.api_client.paginate(
439
- "ListCorpusEntities", id=self.corpus_id, **query_params
440
- )
441
- }
442
- count = len(self.entities)
443
- logger.info(
444
- f"Loaded {count} {pluralize('entity', count)} in corpus ({self.corpus_id})"
445
- )
@@ -20,10 +20,10 @@ class MetaType(Enum):
20
20
  A regular string with no special interpretation.
21
21
  """
22
22
 
23
- HTML = "html"
23
+ Markdown = "markdown"
24
24
  """
25
- A metadata with a string value that should be interpreted as HTML content.
26
- The allowed HTML tags are restricted for security reasons.
25
+ A metadata with a string value that should be interpreted as Markdown content.
26
+ HTML is allowed, but the allowed HTML tags are restricted for security reasons.
27
27
  """
28
28
 
29
29
  Date = "date"
@@ -64,7 +64,6 @@ class MetaDataMixin:
64
64
  type: MetaType,
65
65
  name: str,
66
66
  value: str,
67
- entity: str | None = None,
68
67
  ) -> str:
69
68
  """
70
69
  Create a metadata on the given element through API.
@@ -73,7 +72,6 @@ class MetaDataMixin:
73
72
  :param type: Type of the metadata.
74
73
  :param name: Name of the metadata.
75
74
  :param value: Value of the metadata.
76
- :param entity: UUID of an entity this metadata is related to.
77
75
  :returns: UUID of the created metadata.
78
76
  """
79
77
  assert element and isinstance(element, Element | CachedElement), (
@@ -88,8 +86,6 @@ class MetaDataMixin:
88
86
  assert value and isinstance(value, str), (
89
87
  "value shouldn't be null and should be of type str"
90
88
  )
91
- if entity:
92
- assert isinstance(entity, str), "entity should be of type str"
93
89
  if self.is_read_only:
94
90
  logger.warning("Cannot create metadata as this worker is in read-only mode")
95
91
  return
@@ -101,7 +97,6 @@ class MetaDataMixin:
101
97
  "type": type.value,
102
98
  "name": name,
103
99
  "value": value,
104
- "entity_id": entity,
105
100
  "worker_run_id": self.worker_run_id,
106
101
  },
107
102
  )
@@ -125,7 +120,6 @@ class MetaDataMixin:
125
120
  - type: MetaType
126
121
  - name: str
127
122
  - value: str | int | float
128
- - entity_id: str | None
129
123
  :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
130
124
 
131
125
  :returns: A list of dicts as returned in the ``metadata_list`` field by the ``CreateMetaDataBulk`` API endpoint.
@@ -157,16 +151,11 @@ class MetaDataMixin:
157
151
  metadata.get("value"), str | float | int
158
152
  ), "value shouldn't be null and should be of type (str or float or int)"
159
153
 
160
- assert metadata.get("entity_id") is None or isinstance(
161
- metadata.get("entity_id"), str
162
- ), "entity_id should be None or a str"
163
-
164
154
  metas.append(
165
155
  {
166
156
  "type": metadata.get("type").value,
167
157
  "name": metadata.get("name"),
168
158
  "value": metadata.get("value"),
169
- "entity_id": metadata.get("entity_id"),
170
159
  }
171
160
  )
172
161
 
tests/conftest.py CHANGED
@@ -103,12 +103,6 @@ def _mock_worker_run_api(responses):
103
103
  payload = {
104
104
  "id": "56785678-5678-5678-5678-567856785678",
105
105
  "parents": [],
106
- "worker": {
107
- "id": "deadbeef-1234-5678-1234-worker",
108
- "name": "Fake worker",
109
- "slug": "fake_worker",
110
- "type": "classifier",
111
- },
112
106
  "worker_version": {
113
107
  "id": "12341234-1234-1234-1234-123412341234",
114
108
  "configuration": {
@@ -153,6 +147,7 @@ def _mock_worker_run_api(responses):
153
147
  "train_folder_id": None,
154
148
  "validation_folder_id": None,
155
149
  "test_folder_id": None,
150
+ "skip_elements_json": False,
156
151
  },
157
152
  "summary": "Worker Fake worker @ 123412",
158
153
  }
@@ -165,6 +160,13 @@ def _mock_worker_run_api(responses):
165
160
  content_type="application/json",
166
161
  )
167
162
 
163
+ # By default, stick to classic configuration
164
+ responses.add(
165
+ responses.GET,
166
+ "http://testserver/api/v1/workers/runs/56785678-5678-5678-5678-567856785678/configuration/",
167
+ status=400,
168
+ )
169
+
168
170
 
169
171
  @pytest.fixture
170
172
  def _mock_worker_run_no_revision_api(responses):
@@ -172,12 +174,6 @@ def _mock_worker_run_no_revision_api(responses):
172
174
  payload = {
173
175
  "id": "56785678-5678-5678-5678-567856785678",
174
176
  "parents": [],
175
- "worker": {
176
- "id": "deadbeef-1234-5678-1234-worker",
177
- "name": "Fake worker",
178
- "slug": "fake_worker",
179
- "type": "classifier",
180
- },
181
177
  "worker_version": {
182
178
  "id": "12341234-1234-1234-1234-123412341234",
183
179
  "configuration": {
@@ -233,6 +229,56 @@ def _mock_worker_run_no_revision_api(responses):
233
229
  )
234
230
 
235
231
 
232
+ @pytest.fixture
233
+ def mock_base_worker_modern_conf(mocker, responses):
234
+ """
235
+ Provide a base worker to test modern configuration with (not provided in the fixture)
236
+ """
237
+ worker = BaseWorker()
238
+ mocker.patch.object(sys, "argv")
239
+ worker.args = worker.parser.parse_args()
240
+
241
+ payload = {
242
+ "id": "56785678-5678-5678-5678-567856785678",
243
+ "parents": [],
244
+ "worker_version": {
245
+ "id": "12341234-1234-1234-1234-123412341234",
246
+ "worker": {
247
+ "id": "deadbeef-1234-5678-1234-worker",
248
+ "name": "Fake worker",
249
+ "slug": "fake_worker",
250
+ "type": "classifier",
251
+ },
252
+ "revision": {"hash": "deadbeef1234"},
253
+ "configuration": {
254
+ "configuration": {"extra_key1": "not showing up"},
255
+ "user_configuration": {"extra_key2": "not showing up"},
256
+ },
257
+ },
258
+ "configuration": {
259
+ "id": "af0daaf4-983e-4703-a7ed-a10f146d6684",
260
+ "name": "my-userconfig",
261
+ "configuration": {
262
+ "extra_key3": "not showing up",
263
+ },
264
+ },
265
+ "model_version": None,
266
+ "process": {
267
+ "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
268
+ "corpus": CORPUS_ID,
269
+ },
270
+ "summary": "Worker Fake worker @ 123412",
271
+ }
272
+ responses.add(
273
+ responses.GET,
274
+ "http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
275
+ status=200,
276
+ json=payload,
277
+ )
278
+
279
+ return worker
280
+
281
+
236
282
  @pytest.fixture
237
283
  def _mock_activity_calls(responses):
238
284
  """
@@ -282,6 +328,61 @@ def mock_elements_worker_with_list(monkeypatch, responses, mock_elements_worker)
282
328
  return mock_elements_worker
283
329
 
284
330
 
331
+ @pytest.fixture
332
+ def mock_elements_worker_consume_wa(monkeypatch, responses, mock_elements_worker):
333
+ """
334
+ Mock a worker instance to use StartWorkerActivity to consume worker activities
335
+ instead of reading a JSON file
336
+ """
337
+
338
+ # Enable consume worker activities through the process configuration
339
+ responses.replace(
340
+ responses.GET,
341
+ "http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
342
+ status=200,
343
+ json={
344
+ "id": "56785678-5678-5678-5678-567856785678",
345
+ "parents": [],
346
+ "worker_version": {
347
+ "id": "12341234-1234-1234-1234-123412341234",
348
+ "configuration": {
349
+ "docker": {"image": "python:3"},
350
+ "configuration": {"someKey": "someValue"},
351
+ "secrets": [],
352
+ },
353
+ "worker": {
354
+ "id": "deadbeef-1234-5678-1234-worker",
355
+ "name": "Fake worker",
356
+ "slug": "fake_worker",
357
+ "type": "classifier",
358
+ },
359
+ },
360
+ "configuration": None,
361
+ "model_version": None,
362
+ "process": {
363
+ "name": None,
364
+ "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
365
+ "state": "running",
366
+ "mode": "workers",
367
+ "corpus": CORPUS_ID,
368
+ "use_cache": False,
369
+ "activity_state": "ready",
370
+ "model_id": None,
371
+ "train_folder_id": None,
372
+ "validation_folder_id": None,
373
+ "test_folder_id": None,
374
+ "skip_elements_json": True,
375
+ },
376
+ "summary": "Worker Fake worker @ 123412",
377
+ },
378
+ )
379
+
380
+ # Call configure again to use updated process infos
381
+ mock_elements_worker.configure()
382
+
383
+ return mock_elements_worker
384
+
385
+
285
386
  @pytest.fixture
286
387
  def mock_cache_db(tmp_path):
287
388
  cache_path = tmp_path / "db.sqlite"