arkindex-base-worker 0.3.7rc5__tar.gz → 0.3.7rc7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/PKG-INFO +2 -3
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/PKG-INFO +2 -3
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/SOURCES.txt +6 -2
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/requires.txt +1 -2
- arkindex-base-worker-0.3.7rc7/arkindex_base_worker.egg-info/top_level.txt +6 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/image.py +4 -1
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/models.py +12 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/__init__.py +112 -121
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/base.py +2 -14
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/dataset.py +19 -26
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/entity.py +4 -2
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/metadata.py +7 -7
- arkindex-base-worker-0.3.7rc7/hooks/pre_gen_project.py +3 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/pyproject.toml +5 -2
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/requirements.txt +1 -2
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/conftest.py +12 -7
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_dataset_worker.py +279 -401
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_dataset.py +99 -145
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_metadata.py +21 -21
- arkindex-base-worker-0.3.7rc7/worker-demo/tests/conftest.py +32 -0
- arkindex-base-worker-0.3.7rc7/worker-demo/tests/test_worker.py +12 -0
- arkindex-base-worker-0.3.7rc7/worker-demo/worker_demo/__init__.py +6 -0
- arkindex-base-worker-0.3.7rc7/worker-demo/worker_demo/worker.py +19 -0
- arkindex-base-worker-0.3.7rc5/arkindex_base_worker.egg-info/top_level.txt +0 -2
- arkindex-base-worker-0.3.7rc5/setup.py +0 -4
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/LICENSE +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/README.md +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/dependency_links.txt +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/__init__.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/cache.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/utils.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/classification.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/element.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/task.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/training.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/transcription.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/version.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/docs-requirements.txt +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/setup.cfg +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/__init__.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_base_worker.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_cache.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_element.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/__init__.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_classifications.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_cli.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_elements.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_entities.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_task.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_training.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_transcriptions.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_elements_worker/test_worker.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_image.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_merge.py +0 -0
- {arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/tests/test_utils.py +0 -0
{arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.3.7rc5
+Version: 0.3.7rc7
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -41,13 +41,12 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: arkindex-client==1.0.14
 Requires-Dist: peewee==3.17.0
 Requires-Dist: Pillow==10.2.0
 Requires-Dist: pymdown-extensions==10.7
 Requires-Dist: python-gnupg==0.5.2
 Requires-Dist: shapely==2.0.3
-Requires-Dist:
+Requires-Dist: teklia-toolbox==0.1.4rc3
 Requires-Dist: zstandard==0.22.0
 Provides-Extra: docs
 Requires-Dist: black==24.2.0; extra == "docs"
{arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-base-worker
-Version: 0.3.7rc5
+Version: 0.3.7rc7
 Summary: Base Worker to easily build Arkindex ML workflows
 Author-email: Teklia <contact@teklia.com>
 Maintainer-email: Teklia <contact@teklia.com>
@@ -41,13 +41,12 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: arkindex-client==1.0.14
 Requires-Dist: peewee==3.17.0
 Requires-Dist: Pillow==10.2.0
 Requires-Dist: pymdown-extensions==10.7
 Requires-Dist: python-gnupg==0.5.2
 Requires-Dist: shapely==2.0.3
-Requires-Dist:
+Requires-Dist: teklia-toolbox==0.1.4rc3
 Requires-Dist: zstandard==0.22.0
 Provides-Extra: docs
 Requires-Dist: black==24.2.0; extra == "docs"
{arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_base_worker.egg-info/SOURCES.txt
RENAMED
@@ -3,7 +3,6 @@ README.md
 docs-requirements.txt
 pyproject.toml
 requirements.txt
-setup.py
 arkindex_base_worker.egg-info/PKG-INFO
 arkindex_base_worker.egg-info/SOURCES.txt
 arkindex_base_worker.egg-info/dependency_links.txt
@@ -25,6 +24,7 @@ arkindex_worker/worker/task.py
 arkindex_worker/worker/training.py
 arkindex_worker/worker/transcription.py
 arkindex_worker/worker/version.py
+hooks/pre_gen_project.py
 tests/__init__.py
 tests/conftest.py
 tests/test_base_worker.py
@@ -44,4 +44,8 @@ tests/test_elements_worker/test_metadata.py
 tests/test_elements_worker/test_task.py
 tests/test_elements_worker/test_training.py
 tests/test_elements_worker/test_transcriptions.py
-tests/test_elements_worker/test_worker.py
+tests/test_elements_worker/test_worker.py
+worker-demo/tests/conftest.py
+worker-demo/tests/test_worker.py
+worker-demo/worker_demo/__init__.py
+worker-demo/worker_demo/worker.py
{arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/image.py
RENAMED
@@ -21,6 +21,7 @@ from tenacity import (
 )
 
 from arkindex_worker import logger
+from teklia_toolbox.requests import should_verify_cert
 
 # Avoid circular imports error when type checking
 if TYPE_CHECKING:
@@ -175,7 +176,9 @@ def _retry_log(retry_state, *args, **kwargs):
     reraise=True,
 )
 def _retried_request(url, *args, method=requests.get, **kwargs):
-    resp = method(url, *args, timeout=DOWNLOAD_TIMEOUT, **kwargs)
+    resp = method(
+        url, *args, timeout=DOWNLOAD_TIMEOUT, verify=should_verify_cert(url), **kwargs
+    )
     resp.raise_for_status()
     return resp
 
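The change above routes every retried image download through teklia-toolbox's certificate check. A minimal sketch of the same pattern, assuming only the should_verify_cert(url) helper shown in the diff (the DOWNLOAD_TIMEOUT value and the fetch() wrapper are illustrative, not the module's real definitions):

    import requests
    from teklia_toolbox.requests import should_verify_cert

    DOWNLOAD_TIMEOUT = 300  # assumed value; the real module defines its own constant

    def fetch(url: str) -> bytes:
        # verify= becomes False when the helper decides TLS verification
        # should be skipped for this host (presumably development instances)
        resp = requests.get(
            url, timeout=DOWNLOAD_TIMEOUT, verify=should_verify_cert(url)
        )
        resp.raise_for_status()
        return resp.content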
{arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/models.py
RENAMED
@@ -20,6 +20,8 @@ class MagicDict(dict):
         Automagically convert lists and dicts to MagicDicts and lists of MagicDicts
         Allows for nested access: foo.bar.baz
         """
+        if isinstance(item, Dataset):
+            return item
         if isinstance(item, list):
             return list(map(self._magify, item))
         if isinstance(item, dict):
@@ -272,6 +274,16 @@ class Dataset(ArkindexModel):
         return f"{self.id}.tar.zst"
 
 
+class Set(MagicDict):
+    """
+    Describes an Arkindex dataset set.
+    """
+
+    def __str__(self):
+        # Not using ArkindexModel.__str__ as we do not retrieve the Set ID
+        return f"{self.__class__.__name__} ({self.name}) from {self.dataset}"
+
+
 class Artifact(ArkindexModel):
     """
     Describes an Arkindex artifact.
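The new Set model pairs a set name with its parent Dataset and gets a readable string form. A minimal sketch of how it behaves, assuming a Dataset built from API-shaped data (the field values are illustrative):

    from arkindex_worker.models import Dataset, Set

    dataset = Dataset(id="12341234-1234-1234-1234-123412341234", name="My dataset")
    train_set = Set(name="train", dataset=dataset)
    # __str__ interpolates the set name and the dataset's own string form,
    # e.g. "Set (train) from Dataset (12341234-1234-1234-1234-123412341234)"
    print(train_set)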
{arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/__init__.py
RENAMED
@@ -7,26 +7,25 @@ import json
 import os
 import sys
 import uuid
+from argparse import ArgumentTypeError
 from collections.abc import Iterable, Iterator
 from enum import Enum
-from itertools import groupby
-from operator import itemgetter
 from pathlib import Path
 
 from apistar.exceptions import ErrorResponse
 
 from arkindex_worker import logger
 from arkindex_worker.cache import CachedElement
-from arkindex_worker.models import Dataset, Element
+from arkindex_worker.models import Dataset, Element, Set
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
 from arkindex_worker.worker.element import ElementMixin
-from arkindex_worker.worker.entity import EntityMixin
+from arkindex_worker.worker.entity import EntityMixin
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
 from arkindex_worker.worker.task import TaskMixin
 from arkindex_worker.worker.transcription import TranscriptionMixin
-from arkindex_worker.worker.version import WorkerVersionMixin
+from arkindex_worker.worker.version import WorkerVersionMixin
 
 
 class ActivityState(Enum):
@@ -160,6 +159,16 @@ class ElementsWorker(
         super().configure()
         super().configure_cache()
 
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
     def run(self):
         """
         Implements an Arkindex worker that goes through each element returned by
@@ -301,6 +310,21 @@ class ElementsWorker(
             return True
 
 
+def check_dataset_set(value: str) -> tuple[uuid.UUID, str]:
+    values = value.split(":")
+    if len(values) != 2:
+        raise ArgumentTypeError(
+            f"'{value}' is not in the correct format `<dataset_id>:<set_name>`"
+        )
+
+    dataset_id, set_name = values
+    try:
+        dataset_id = uuid.UUID(dataset_id)
+        return (dataset_id, set_name)
+    except (TypeError, ValueError) as e:
+        raise ArgumentTypeError(f"'{dataset_id}' should be a valid UUID") from e
+
+
 class MissingDatasetArchive(Exception):
     """
     Exception raised when the compressed archive associated to
@@ -310,7 +334,7 @@ class MissingDatasetArchive(Exception):
 
 class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
     """
-    Base class for ML workers that operate on Arkindex
+    Base class for ML workers that operate on Arkindex dataset sets.
 
     This class inherits from numerous mixin classes found in other modules of
     ``arkindex.worker``, which provide helpers to read and write to the Arkindex API.
@@ -320,24 +344,26 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         self,
         description: str = "Arkindex Dataset Worker",
         support_cache: bool = False,
-        generator: bool = False,
     ):
         """
         :param description: The worker's description.
         :param support_cache: Whether the worker supports cache.
-        :param generator: Whether the worker generates the dataset archive artifact.
         """
         super().__init__(description, support_cache)
 
+        self.downloaded_artifact: Path | None = None
+
         self.parser.add_argument(
-            "--
-            type=
+            "--set",
+            type=check_dataset_set,
             nargs="+",
-            help="
+            help="""
+            One or more Arkindex dataset sets, format is <dataset_uuid>:<set_name>
+            (e.g.: "12341234-1234-1234-1234-123412341234:train")
+            """,
+            default=[],
         )
 
-        self.generator = generator
-
     def configure(self):
         """
         Setup the worker using CLI arguments and environment variables.
@@ -351,162 +377,127 @@
         super().configure()
         super().configure_cache()
 
-    def download_dataset_artifact(self, dataset: Dataset) -> Path:
+        # Retrieve the model configuration
+        if self.model_configuration:
+            self.config.update(self.model_configuration)
+            logger.info("Model version configuration retrieved")
+
+        # Retrieve the user configuration
+        if self.user_configuration:
+            self.config.update(self.user_configuration)
+            logger.info("User configuration retrieved")
+
+    def cleanup_downloaded_artifact(self) -> None:
+        """
+        Cleanup the downloaded artifact if any
+        """
+        if not self.downloaded_artifact:
+            return
+
+        self.downloaded_artifact.unlink(missing_ok=True)
+
+    def download_dataset_artifact(self, dataset: Dataset) -> None:
         """
         Find and download the compressed archive artifact describing a dataset using
         the [list_artifacts][arkindex_worker.worker.task.TaskMixin.list_artifacts] and
         [download_artifact][arkindex_worker.worker.task.TaskMixin.download_artifact] methods.
 
         :param dataset: The dataset to retrieve the compressed archive artifact for.
-        :returns: A path to the downloaded artifact.
         :raises MissingDatasetArchive: When the dataset artifact is not found.
         """
+        extra_dir = self.find_extras_directory()
+        archive = extra_dir / dataset.filepath
+        if archive.exists():
+            return
 
-        task_id = uuid.UUID(dataset.task_id)
+        # Cleanup the dataset artifact that was downloaded previously
+        self.cleanup_downloaded_artifact()
 
+        logger.info(f"Downloading artifact for {dataset}")
+        task_id = uuid.UUID(dataset.task_id)
         for artifact in self.list_artifacts(task_id):
             if artifact.path != dataset.filepath:
                 continue
 
-            extra_dir = self.find_extras_directory()
-            archive = extra_dir / dataset.filepath
             archive.write_bytes(self.download_artifact(task_id, artifact).read())
-            return archive
+            self.downloaded_artifact = archive
+            return
 
         raise MissingDatasetArchive(
             "The dataset compressed archive artifact was not found."
         )
 
-    def
-        self, dataset: Dataset
-    ) -> Iterator[tuple[str, list[Element]]]:
-        """
-        List the elements in the dataset, grouped by split, using the
-        [list_dataset_elements][arkindex_worker.worker.dataset.DatasetMixin.list_dataset_elements] method.
-
-        :param dataset: The dataset to retrieve elements from.
-        :returns: An iterator of tuples containing the split name and the list of its elements.
-        """
-
-        def format_split(
-            split: tuple[str, Iterator[tuple[str, Element]]],
-        ) -> tuple[str, list[Element]]:
-            return (split[0], list(map(itemgetter(1), list(split[1]))))
-
-        return map(
-            format_split,
-            groupby(
-                sorted(self.list_dataset_elements(dataset), key=itemgetter(0)),
-                key=itemgetter(0),
-            ),
-        )
-
-    def process_dataset(self, dataset: Dataset):
+    def process_set(self, set: Set):
         """
-        Override this method to implement your worker and process a single Arkindex dataset at once.
+        Override this method to implement your worker and process a single Arkindex dataset set at once.
 
-        :param
+        :param set: The set to process.
         """
 
-    def
+    def list_sets(self) -> Iterator[Set]:
        """
-        List the
-        [
+        List the sets to be processed, either from the CLI arguments or using the
+        [list_process_sets][arkindex_worker.worker.dataset.DatasetMixin.list_process_sets] method.
 
-        :returns: An iterator of
-        else an iterator of ``Dataset`` objects.
+        :returns: An iterator of ``Set`` objects.
         """
-        if self.is_read_only:
-
+        if not self.is_read_only:
+            yield from self.list_process_sets()
+
+        datasets: dict[uuid.UUID, Dataset] = {}
+        for dataset_id, set_name in self.args.set:
+            # Retrieving dataset information is not already cached
+            if dataset_id not in datasets:
+                datasets[dataset_id] = Dataset(
+                    **self.request("RetrieveDataset", id=dataset_id)
+                )
 
-
+            yield Set(name=set_name, dataset=datasets[dataset_id])
 
     def run(self):
         """
-        Implements an Arkindex worker that goes through each dataset returned by
-        [
+        Implements an Arkindex worker that goes through each dataset set returned by
+        [list_sets][arkindex_worker.worker.DatasetWorker.list_sets].
 
-        It calls [
-        catching exceptions
-        when the worker is a generator.
+        It calls [process_set][arkindex_worker.worker.DatasetWorker.process_set],
+        catching exceptions.
         """
         self.configure()
 
-
-        if not
-            logger.warning("No
+        dataset_sets: list[Set] = list(self.list_sets())
+        if not dataset_sets:
+            logger.warning("No sets to process, stopping.")
             sys.exit(1)
 
-        # Process every
-        count = len(
+        # Process every set
+        count = len(dataset_sets)
         failed = 0
-        for i,
-            dataset = None
-            dataset_artifact = None
-
+        for i, dataset_set in enumerate(dataset_sets, start=1):
             try:
-
-
-
-                else:
-                    # Load dataset using the Arkindex API
-                    dataset = Dataset(**self.request("RetrieveDataset", id=item))
-
-                if self.generator:
-                    assert (
-                        dataset.state
-                        in [DatasetState.Open.value, DatasetState.Error.value]
-                    ), "When generating a new dataset, its state should be Open or Error."
-                else:
-                    assert (
-                        dataset.state == DatasetState.Complete.value
-                    ), "When processing an existing dataset, its state should be Complete."
-
-                logger.info(f"Processing {dataset} ({i}/{count})")
-
-                if self.generator:
-                    # Update the dataset state to Building
-                    logger.info(f"Building {dataset} ({i}/{count})")
-                    self.update_dataset_state(dataset, DatasetState.Building)
-                else:
-                    logger.info(f"Downloading data for {dataset} ({i}/{count})")
-                    dataset_artifact = self.download_dataset_artifact(dataset)
+                assert (
+                    dataset_set.dataset.state == DatasetState.Complete.value
+                ), "When processing a set, its dataset state should be Complete."
 
-
-                self.
+                logger.info(f"Retrieving data for {dataset_set} ({i}/{count})")
+                self.download_dataset_artifact(dataset_set.dataset)
 
-
-
-                logger.info(f"Completed {dataset} ({i}/{count})")
-                self.update_dataset_state(dataset, DatasetState.Complete)
+                logger.info(f"Processing {dataset_set} ({i}/{count})")
+                self.process_set(dataset_set)
             except Exception as e:
-                # Handle errors occurring while retrieving
+                # Handle errors occurring while retrieving or processing this dataset set
                 failed += 1
 
-                # Handle the case where we failed retrieving the dataset
-                dataset_id = dataset.id if dataset else item
-
                 if isinstance(e, ErrorResponse):
-                    message = f"An API error occurred while processing
+                    message = f"An API error occurred while processing {dataset_set}: {e.title} - {e.content}"
                 else:
-                    message = (
-                        f"Failed running worker on dataset {dataset_id}: {repr(e)}"
-                    )
+                    message = f"Failed running worker on {dataset_set}: {repr(e)}"
 
-                logger.warning(
-
-
-
-                if dataset and self.generator:
-                    # Try to update the state to Error regardless of the response
-                    with contextlib.suppress(Exception):
-                        self.update_dataset_state(dataset, DatasetState.Error)
-            finally:
-                # Cleanup the dataset artifact if it was downloaded, no matter what
-                if dataset_artifact:
-                    dataset_artifact.unlink(missing_ok=True)
+                logger.warning(message, exc_info=e if self.args.verbose else None)
+
+        # Cleanup the latest downloaded dataset artifact
+        self.cleanup_downloaded_artifact()
 
-        message = f'Ran on {count}
+        message = f'Ran on {count} set{"s"[:count>1]}: {count - failed} completed, {failed} failed'
         if failed:
             logger.error(message)
         if failed >= count:  # Everything failed!
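With this rework, a DatasetWorker subclass only needs to implement process_set; run() handles the state check, artifact download and cleanup. A hedged sketch of a minimal worker (the class name and log message are illustrative; the new worker-demo files added in this release are not shown here, so this is not their verbatim content):

    from arkindex_worker import logger
    from arkindex_worker.models import Set
    from arkindex_worker.worker import DatasetWorker

    class DemoDatasetWorker(DatasetWorker):  # hypothetical name
        def process_set(self, set: Set):
            # run() has already downloaded this set's dataset archive;
            # here we simply walk the set's elements through the API
            for element in self.list_set_elements(set):
                logger.info(f"Processing {element}")

    if __name__ == "__main__":
        DemoDatasetWorker(description="Demo dataset worker").run()

In developer mode, sets come from the new CLI argument in the format the diff defines, e.g. `--set 12341234-1234-1234-1234-123412341234:train`.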
{arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/base.py
RENAMED
@@ -21,7 +21,6 @@ from tenacity import (
     wait_exponential,
 )
 
-from arkindex import ArkindexClient, options_from_env
 from arkindex_worker import logger
 from arkindex_worker.cache import (
     check_version,
@@ -31,18 +30,7 @@ from arkindex_worker.cache import (
     merge_parents_cache,
 )
 from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
-
-
-def _is_500_error(exc: Exception) -> bool:
-    """
-    Check if an Arkindex API error has a HTTP 5xx error code.
-    Used to retry most API calls in [BaseWorker][arkindex_worker.worker.base.BaseWorker].
-    :param exc: Exception to check
-    """
-    if not isinstance(exc, ErrorResponse):
-        return False
-
-    return 500 <= exc.status_code < 600
+from teklia_toolbox.requests import _get_arkindex_client, _is_500_error
 
 
 class ExtrasDirNotFoundError(Exception):
@@ -197,7 +185,7 @@ class BaseWorker:
         Create an ArkindexClient to make API requests towards Arkindex instances.
         """
         # Build Arkindex API client from environment variables
-        self.api_client = ArkindexClient(**options_from_env())
+        self.api_client = _get_arkindex_client()
         logger.debug(f"Setup Arkindex API client on {self.api_client.document.url}")
 
     def configure_for_developers(self):
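_is_500_error is the predicate BaseWorker's retry decorator feeds to tenacity; it now lives in teklia-toolbox. A sketch of that retry pattern under assumed parameters (the wait/stop values and the call_api wrapper are illustrative, not the worker's actual settings):

    from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
    from teklia_toolbox.requests import _is_500_error

    @retry(
        retry=retry_if_exception(_is_500_error),  # retry only on HTTP 5xx API errors
        wait=wait_exponential(multiplier=2),      # assumed backoff
        stop=stop_after_attempt(5),               # assumed attempt limit
        reraise=True,
    )
    def call_api(client, operation_id, **kwargs):
        # A real worker goes through BaseWorker.request() instead
        return client.request(operation_id, **kwargs)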
{arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/dataset.py
RENAMED
@@ -7,7 +7,7 @@ from enum import Enum
 
 from arkindex_worker import logger
 from arkindex_worker.cache import unsupported_cache
-from arkindex_worker.models import Dataset, Element
+from arkindex_worker.models import Dataset, Element, Set
 
 
 class DatasetState(Enum):
@@ -37,49 +37,42 @@ class DatasetState(Enum):
 
 
 class DatasetMixin:
-    def
+    def list_process_sets(self) -> Iterator[Set]:
         """
-        List
+        List dataset sets associated to the worker's process. This helper is not available in developer mode.
 
-        :returns: An iterator of ``
+        :returns: An iterator of ``Set`` objects built from the ``ListProcessSets`` API endpoint.
         """
         assert not self.is_read_only, "This helper is not available in read-only mode."
 
         results = self.api_client.paginate(
-            "
+            "ListProcessSets", id=self.process_information["id"]
         )
 
         return map(
-            lambda result:
+            lambda result: Set(
+                name=result["set_name"], dataset=Dataset(**result["dataset"])
+            ),
             results,
         )
 
-    def
+    def list_set_elements(self, dataset_set: Set) -> Iterator[Element]:
         """
-        List elements in a dataset.
+        List elements in a dataset set.
 
-        :param
-        :returns: An iterator of
+        :param dataset_set: Set to find elements in.
+        :returns: An iterator of Element built from the ``ListDatasetElements`` API endpoint.
         """
-        assert
-
-        ), "
-
-        if dataset.sets == dataset.selected_sets:
-            results = self.api_client.paginate("ListDatasetElements", id=dataset.id)
-        else:
-            results = iter(
-                element
-                for selected_set in dataset.selected_sets
-                for element in self.api_client.paginate(
-                    "ListDatasetElements", id=dataset.id, set=selected_set
-                )
-            )
+        assert dataset_set and isinstance(
+            dataset_set, Set
+        ), "dataset_set shouldn't be null and should be a Set"
 
-
-
+        results = self.api_client.paginate(
+            "ListDatasetElements", id=dataset_set.dataset.id, set=dataset_set.name
         )
 
+        return map(lambda result: Element(**result["element"]), results)
+
     @unsupported_cache
     def update_dataset_state(self, dataset: Dataset, state: DatasetState) -> Dataset:
         """
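Taken together, the two helpers chain naturally: list_process_sets yields Set objects built from ListProcessSets payloads, and list_set_elements pages through each set's elements. A hedged sketch, assuming a configured non-read-only worker instance named worker and the payload shapes implied by the lambdas above:

    # Iterate every set attached to the worker's process, then its elements
    for dataset_set in worker.list_process_sets():
        print(f"{dataset_set.dataset.name} / {dataset_set.name}")
        for element in worker.list_set_elements(dataset_set):
            print(f"  {element.id}")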
{arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/entity.py
RENAMED
@@ -380,8 +380,9 @@ class EntityMixin:
                 "ListCorpusEntities", id=self.corpus_id, **query_params
             )
         }
+        count = len(self.entities)
         logger.info(
-            f
+            f'Loaded {count} entit{"ies" if count > 1 else "y"} in corpus ({self.corpus_id})'
         )
 
     def list_corpus_entity_types(
@@ -396,6 +397,7 @@ class EntityMixin:
                 "ListCorpusEntityTypes", id=self.corpus_id
             )
         }
+        count = len(self.entity_types)
         logger.info(
-            f
+            f'Loaded {count} entity type{"s"[:count>1]} in corpus ({self.corpus_id}).'
         )
{arkindex-base-worker-0.3.7rc5 → arkindex-base-worker-0.3.7rc7}/arkindex_worker/worker/metadata.py
RENAMED
@@ -108,17 +108,17 @@ class MetaDataMixin:
         return metadata["id"]
 
     @unsupported_cache
-    def
+    def create_metadata_bulk(
         self,
         element: Element | CachedElement,
-
+        metadata_list: list[dict[str, MetaType | str | int | float | None]],
     ) -> list[dict[str, str]]:
         """
         Create multiple metadata on an existing element.
         This method does not support cache.
 
         :param element: The element to create multiple metadata on.
-        :param
+        :param metadata_list: The list of dict whose keys are the following:
             - type: MetaType
             - name: str
             - value: str | int | float
@@ -128,13 +128,13 @@ class MetaDataMixin:
             element, Element | CachedElement
         ), "element shouldn't be null and should be of type Element or CachedElement"
 
-        assert
-
-        ), "
+        assert metadata_list and isinstance(
+            metadata_list, list
+        ), "metadata_list shouldn't be null and should be of type list of dict"
 
         # Make a copy to avoid modifying the metadata_list argument
         metas = []
-        for index, metadata in enumerate(
+        for index, metadata in enumerate(metadata_list):
             assert isinstance(
                 metadata, dict
             ), f"Element at index {index} in metadata_list: Should be of type dict"