arkindex-base-worker 0.3.6rc2__py3-none-any.whl → 0.3.6rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: arkindex-base-worker
3
- Version: 0.3.6rc2
3
+ Version: 0.3.6rc4
4
4
  Summary: Base Worker to easily build Arkindex ML workflows
5
5
  Author-email: Teklia <contact@teklia.com>
6
6
  Maintainer-email: Teklia <contact@teklia.com>
@@ -18,6 +18,14 @@ Classifier: Programming Language :: Python :: 3.11
18
18
  Classifier: Topic :: Text Processing :: Linguistic
19
19
  Requires-Python: >=3.10
20
20
  Description-Content-Type: text/markdown
21
+ Requires-Dist: arkindex-client ==1.0.14
22
+ Requires-Dist: peewee ==3.17.0
23
+ Requires-Dist: Pillow ==10.1.0
24
+ Requires-Dist: pymdown-extensions ==10.3.1
25
+ Requires-Dist: python-gnupg ==0.5.1
26
+ Requires-Dist: shapely ==2.0.2
27
+ Requires-Dist: tenacity ==8.2.3
28
+ Requires-Dist: zstandard ==0.22.0
21
29
  Provides-Extra: docs
22
30
  Requires-Dist: black ==23.11.0 ; extra == 'docs'
23
31
  Requires-Dist: doc8 ==1.1.1 ; extra == 'docs'
@@ -3,12 +3,12 @@ arkindex_worker/cache.py,sha256=ZbXJo-O24W8x6nbS0IJm32Tas9CKLaHBBeyQyvF-Nyo,1090
3
3
  arkindex_worker/image.py,sha256=uwfUE9hy0Iw-e3vU7OHmLSqouxbznWq08SykXmPD1Cs,14107
4
4
  arkindex_worker/models.py,sha256=DVrZPIurSiOoHvj3t_Szwd0j1t6pnwBx_dqwhNakzN0,9528
5
5
  arkindex_worker/utils.py,sha256=_lC1-RYvNWXEkK-AuF4_FraoggP1tYPdalNFSj4jDb4,6885
6
- arkindex_worker/worker/__init__.py,sha256=4_kDCgjFp7oUCdhnNUEoLbrpUyr2MYK5JStrUYmgUDc,19062
6
+ arkindex_worker/worker/__init__.py,sha256=Iun6jhuakKdCGKjQtgqDWEpWO1HrxK34RoxdzE5gcRs,19322
7
7
  arkindex_worker/worker/base.py,sha256=4eG4v4vejvFv9UtTRhxEZkXEBVzlFd3rILHK8lt-mbc,19397
8
8
  arkindex_worker/worker/classification.py,sha256=CoMIj7SFFlt90W1r5FQmsB80qK9Zfltcm3k-37FSHA0,10693
9
9
  arkindex_worker/worker/dataset.py,sha256=LRZU_KkOuCRkxlkdqw1PHYnu1zmoQfm_OiY8Sqt6mi0,2754
10
10
  arkindex_worker/worker/element.py,sha256=5z6yAO2jKCF4U7Tz2cGdFX34Zxdo0gBWkU-ciIZ69bQ,30705
11
- arkindex_worker/worker/entity.py,sha256=WJANdUUvFdR_6c3wYwfckWdZqeq-W8TciDPm-YcmkCE,13397
11
+ arkindex_worker/worker/entity.py,sha256=l0gCoeaoUBFU7pv2iC4pHkSQVjiIur4M15P7Mg_WlaA,13601
12
12
  arkindex_worker/worker/metadata.py,sha256=PnzyHkPyb-mtgItzRi4s-_f0dsEOM3ak8F_bFoqp3O0,6225
13
13
  arkindex_worker/worker/task.py,sha256=cz3wJNPgogZv1lm_3lm7WScitQtYQtL6H6I7Xokq208,1475
14
14
  arkindex_worker/worker/training.py,sha256=rhg4TPFo0ignnCkgbekUYmNXX8u2iZGyeM1VCs1R5kI,10140
@@ -17,7 +17,7 @@ arkindex_worker/worker/version.py,sha256=uL-OrwuFZB8TNU6ePmdKIL3g3e-GE2tqHEWBRpX
17
17
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  tests/conftest.py,sha256=BEeyspbvxwSpFvQ1taj8ysnu7wAE3d7pWUDMmmKC5Ho,21837
19
19
  tests/test_base_worker.py,sha256=AF1pjvNckN80LVyLJ4ILXJ122fEnWtztK7ZncVDI_Ms,24976
20
- tests/test_cache.py,sha256=6pdgdsU20pn-CMLbYA1Af0A2QZ_0eiQ0UqtlIMoujfU,10425
20
+ tests/test_cache.py,sha256=ii0gyr0DrG7ChEs7pmT8hMdSguAOAcCze4bRMiFQxuk,10640
21
21
  tests/test_dataset_worker.py,sha256=XxBCLRroCIq97P37_qWc9I7QiyE3zUL7fLAw1J_BI7E,27703
22
22
  tests/test_element.py,sha256=2G9M15TLxQRmvrWM9Kw2ucnElh4kSv_oF_5FYwwAxTY,13181
23
23
  tests/test_image.py,sha256=nIT0NhmuHtD9I1rcMSvqSfjQqvTE5Ko8tAQGLIkm_zo,15232
@@ -28,13 +28,13 @@ tests/test_elements_worker/test_classifications.py,sha256=0_6vryoQc2-s3CQWANsEva
28
28
  tests/test_elements_worker/test_cli.py,sha256=DdCRKobesehL61c5QwaZOZCde7bsTlmeSN2iosQ5_2s,2873
29
29
  tests/test_elements_worker/test_dataset.py,sha256=ElDPrYTTt8KzaZ_Xf5uslUD6_kiGZybntO10uqCquLo,12063
30
30
  tests/test_elements_worker/test_elements.py,sha256=F7azXQBNec4QLH62uXaGgAgg82GEz0q98pYXH3Sh640,75761
31
- tests/test_elements_worker/test_entities.py,sha256=epfiLH-cs0g8nKr4e8som7N_10lbogJIXyLyMMiz460,33792
31
+ tests/test_elements_worker/test_entities.py,sha256=ZOFB3ckKJvNG2kIPUX_kz_378k3uQrJmvYHpR_xiVuo,33789
32
32
  tests/test_elements_worker/test_metadata.py,sha256=c3kXPYRXVPDnGim28Ncg5YO4I0ejh3qyi7dBvbSYxMU,17739
33
33
  tests/test_elements_worker/test_task.py,sha256=FCpxE9UpouKXgjGvWgNHEai_Hiy2d1YmqRG-_v2s27s,6312
34
34
  tests/test_elements_worker/test_training.py,sha256=WeG-cDuJ-YhPgfKH47TtXBxyargtLuk7c8tsik2WnL8,8414
35
35
  tests/test_elements_worker/test_transcriptions.py,sha256=6UWGriQVwEORunJYW11mGcD16voZGFY41i_NIdXuqnI,68750
36
36
  tests/test_elements_worker/test_worker.py,sha256=zD8sY5yZFhuUr1txVX8z7bSgW4I2jNuzH5i1TM3qkZI,16491
37
- arkindex_base_worker-0.3.6rc2.dist-info/METADATA,sha256=UWtS8SfoCWWXb5N7BQHa8SrJqpxDed-Rb754uNVqenE,1536
38
- arkindex_base_worker-0.3.6rc2.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
39
- arkindex_base_worker-0.3.6rc2.dist-info/top_level.txt,sha256=TtagLI8LSv7GE7nG8MQqDFAJ5bNDPJn7Z5vizOgrWkA,22
40
- arkindex_base_worker-0.3.6rc2.dist-info/RECORD,,
37
+ arkindex_base_worker-0.3.6rc4.dist-info/METADATA,sha256=5ug8gbQxLwb87ArFDtHn32bekEexXmiTEysRPA3UKio,1814
38
+ arkindex_base_worker-0.3.6rc4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
39
+ arkindex_base_worker-0.3.6rc4.dist-info/top_level.txt,sha256=TtagLI8LSv7GE7nG8MQqDFAJ5bNDPJn7Z5vizOgrWkA,22
40
+ arkindex_base_worker-0.3.6rc4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.3)
2
+ Generator: bdist_wheel (0.42.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -387,7 +387,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
387
387
  """
388
388
 
389
389
  def format_split(
390
- split: tuple[str, Iterator[tuple[str, Element]]]
390
+ split: tuple[str, Iterator[tuple[str, Element]]],
391
391
  ) -> tuple[str, list[Element]]:
392
392
  return (split[0], list(map(itemgetter(1), list(split[1]))))
393
393
 
@@ -440,6 +440,8 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
440
440
  failed = 0
441
441
  for i, item in enumerate(datasets, start=1):
442
442
  dataset = None
443
+ dataset_artifact = None
444
+
443
445
  try:
444
446
  if not self.is_read_only:
445
447
  # Just use the result of list_datasets as the dataset
@@ -465,7 +467,7 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
465
467
  self.update_dataset_state(dataset, DatasetState.Building)
466
468
  else:
467
469
  logger.info(f"Downloading data for {dataset} ({i}/{count})")
468
- self.download_dataset_artifact(dataset)
470
+ dataset_artifact = self.download_dataset_artifact(dataset)
469
471
 
470
472
  # Process the dataset
471
473
  self.process_dataset(dataset)
@@ -496,6 +498,10 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
496
498
  # Try to update the state to Error regardless of the response
497
499
  with contextlib.suppress(Exception):
498
500
  self.update_dataset_state(dataset, DatasetState.Error)
501
+ finally:
502
+ # Cleanup the dataset artifact if it was downloaded, no matter what
503
+ if dataset_artifact:
504
+ dataset_artifact.unlink(missing_ok=True)
499
505
 
500
506
  if failed:
501
507
  logger.error(
@@ -331,8 +331,7 @@ class EntityMixin:
331
331
  parent: Element | None = None,
332
332
  ):
333
333
  """
334
- List all entities in the worker's corpus
335
- This method does not support cache
334
+ List all entities in the worker's corpus and store them in the ``self.entities`` cache.
336
335
  :param name: Filter entities by part of their name (case-insensitive)
337
336
  :param parent: Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.
338
337
  """
@@ -346,8 +345,14 @@ class EntityMixin:
346
345
  assert isinstance(parent, Element), "parent should be of type Element"
347
346
  query_params["parent"] = parent.id
348
347
 
349
- return self.api_client.paginate(
350
- "ListCorpusEntities", id=self.corpus_id, **query_params
348
+ self.entities = {
349
+ entity["id"]: entity
350
+ for entity in self.api_client.paginate(
351
+ "ListCorpusEntities", id=self.corpus_id, **query_params
352
+ )
353
+ }
354
+ logger.info(
355
+ f"Loaded {len(self.entities)} entities in corpus ({self.corpus_id})"
351
356
  )
352
357
 
353
358
  def list_corpus_entity_types(
tests/test_cache.py CHANGED
@@ -53,6 +53,9 @@ def test_create_tables(tmp_path):
53
53
  init_cache_db(db_path)
54
54
  create_tables()
55
55
 
56
+ # WARNING: If you are updating this schema following a development you have made
57
+ # in base-worker, make sure to upgrade the arkindex_worker.cache.SQL_VERSION in
58
+ # the same merge request as your changes.
56
59
  expected_schema = """CREATE TABLE "classifications" ("id" TEXT NOT NULL PRIMARY KEY, "element_id" TEXT NOT NULL, "class_name" TEXT NOT NULL, "confidence" REAL NOT NULL, "state" VARCHAR(10) NOT NULL, "worker_run_id" TEXT, FOREIGN KEY ("element_id") REFERENCES "elements" ("id"))
57
60
  CREATE TABLE "dataset_elements" ("id" TEXT NOT NULL PRIMARY KEY, "element_id" TEXT NOT NULL, "dataset_id" TEXT NOT NULL, "set_name" VARCHAR(255) NOT NULL, FOREIGN KEY ("element_id") REFERENCES "elements" ("id"), FOREIGN KEY ("dataset_id") REFERENCES "datasets" ("id"))
58
61
  CREATE TABLE "datasets" ("id" TEXT NOT NULL PRIMARY KEY, "name" VARCHAR(255) NOT NULL, "state" VARCHAR(255) NOT NULL DEFAULT 'open', "sets" TEXT NOT NULL)
@@ -741,12 +741,13 @@ def test_list_corpus_entities(responses, mock_elements_worker):
741
741
  },
742
742
  )
743
743
 
744
- # list is required to actually do the request
745
- assert list(mock_elements_worker.list_corpus_entities()) == [
746
- {
744
+ mock_elements_worker.list_corpus_entities()
745
+
746
+ assert mock_elements_worker.entities == {
747
+ "fake_entity_id": {
747
748
  "id": "fake_entity_id",
748
749
  }
749
- ]
750
+ }
750
751
 
751
752
  assert len(responses.calls) == len(BASE_API_CALLS) + 1
752
753
  assert [