arkindex-base-worker 0.5.0b3__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,16 +11,14 @@ from peewee import IntegrityError
11
11
  from arkindex.exceptions import ErrorResponse
12
12
  from arkindex_worker import logger
13
13
  from arkindex_worker.cache import (
14
- CachedEntity,
15
14
  CachedTranscriptionEntity,
16
15
  unsupported_cache,
17
16
  )
18
- from arkindex_worker.models import Element, Transcription
17
+ from arkindex_worker.models import Transcription
19
18
  from arkindex_worker.utils import pluralize
20
19
 
21
20
 
22
21
  class Entity(TypedDict):
23
- name: str
24
22
  type_id: str
25
23
  length: int
26
24
  offset: int
@@ -126,88 +124,20 @@ class EntityMixin:
126
124
  # Create the type if non-existent
127
125
  self.create_entity_type(entity_type)
128
126
 
129
- def create_entity(
130
- self,
131
- name: str,
132
- type: str,
133
- metas=None,
134
- validated=None,
135
- ):
136
- """
137
- Create an entity on the given corpus.
138
- If cache support is enabled, a [CachedEntity][arkindex_worker.cache.CachedEntity] will also be created.
139
-
140
- :param name: Name of the entity.
141
- :param type: Type of the entity.
142
- """
143
- assert name and isinstance(name, str), (
144
- "name shouldn't be null and should be of type str"
145
- )
146
- assert type and isinstance(type, str), (
147
- "type shouldn't be null and should be of type str"
148
- )
149
- metas = metas or {}
150
- if metas:
151
- assert isinstance(metas, dict), "metas should be of type dict"
152
- if validated is not None:
153
- assert isinstance(validated, bool), "validated should be of type bool"
154
- if self.is_read_only:
155
- logger.warning("Cannot create entity as this worker is in read-only mode")
156
- return
157
-
158
- # Retrieve entity_type ID
159
- if not self.entity_types:
160
- # Load entity_types of corpus
161
- self.list_corpus_entity_types()
162
-
163
- entity_type_id = self.entity_types.get(type)
164
- assert entity_type_id, f"Entity type `{type}` not found in the corpus."
165
-
166
- entity = self.api_client.request(
167
- "CreateEntity",
168
- body={
169
- "name": name,
170
- "type_id": entity_type_id,
171
- "metas": metas,
172
- "validated": validated,
173
- "corpus": self.corpus_id,
174
- "worker_run_id": self.worker_run_id,
175
- },
176
- )
177
-
178
- if self.use_cache:
179
- # Store entity in local cache
180
- try:
181
- to_insert = [
182
- {
183
- "id": entity["id"],
184
- "type": type,
185
- "name": name,
186
- "validated": validated if validated is not None else False,
187
- "metas": metas,
188
- "worker_run_id": self.worker_run_id,
189
- }
190
- ]
191
- CachedEntity.insert_many(to_insert).execute()
192
- except IntegrityError as e:
193
- logger.warning(f"Couldn't save created entity in local cache: {e}")
194
-
195
- return entity["id"]
196
-
197
127
  def create_transcription_entity(
198
128
  self,
199
129
  transcription: Transcription,
200
- entity: str,
130
+ type_id: str,
201
131
  offset: int,
202
132
  length: int,
203
133
  confidence: float | None = None,
204
134
  ) -> dict[str, str | int] | None:
205
135
  """
206
- Create a link between an existing entity and an existing transcription.
136
+ Create an entity on an existing transcription.
207
137
  If cache support is enabled, a `CachedTranscriptionEntity` will also be created.
208
138
 
209
139
  :param transcription: Transcription to create the entity on.
210
- :param entity: UUID of the existing entity.
140
+ :param type_id: UUID of the entity type.
211
141
  :param offset: Starting position of the entity in the transcription's text,
212
142
  as a 0-based index.
213
143
  :param length: Length of the entity in the transcription's text.
@@ -218,8 +148,8 @@ class EntityMixin:
218
148
  assert transcription and isinstance(transcription, Transcription), (
219
149
  "transcription shouldn't be null and should be a Transcription"
220
150
  )
221
- assert entity and isinstance(entity, str), (
222
- "entity shouldn't be null and should be of type str"
151
+ assert type_id and isinstance(type_id, str), (
152
+ "type_id shouldn't be null and should be of type str"
223
153
  )
224
154
  assert offset is not None and isinstance(offset, int) and offset >= 0, (
225
155
  "offset shouldn't be null and should be a positive integer"
@@ -237,7 +167,7 @@ class EntityMixin:
237
167
  return
238
168
 
239
169
  body = {
240
- "entity": entity,
170
+ "type_id": type_id,
241
171
  "length": length,
242
172
  "offset": offset,
243
173
  "worker_run_id": self.worker_run_id,
@@ -245,7 +175,7 @@ class EntityMixin:
245
175
  if confidence is not None:
246
176
  body["confidence"] = confidence
247
177
 
248
- transcription_ent = self.api_client.request(
178
+ tr_entity = self.api_client.request(
249
179
  "CreateTranscriptionEntity",
250
180
  id=transcription.id,
251
181
  body=body,
@@ -256,7 +186,7 @@ class EntityMixin:
256
186
  try:
257
187
  CachedTranscriptionEntity.create(
258
188
  transcription=transcription.id,
259
- entity=entity,
189
+ type=tr_entity["type"]["name"],
260
190
  offset=offset,
261
191
  length=length,
262
192
  worker_run_id=self.worker_run_id,
@@ -267,7 +197,7 @@ class EntityMixin:
267
197
  f"Couldn't save created transcription entity in local cache: {e}"
268
198
  )
269
199
 
270
- return transcription_ent
200
+ return tr_entity
271
201
 
272
202
  @unsupported_cache
273
203
  def create_transcription_entities(
@@ -276,14 +206,11 @@ class EntityMixin:
276
206
  entities: list[Entity],
277
207
  ) -> list[dict[str, str]]:
278
208
  """
279
- Create multiple entities attached to a transcription in a single API request.
209
+ Create multiple entities on a transcription in a single API request.
280
210
 
281
211
  :param transcription: Transcription to create the entity on.
282
212
  :param entities: List of dicts, one per element. Each dict can have the following keys:
283
213
 
284
- name (str)
285
- Required. Name of the entity.
286
-
287
214
  type_id (str)
288
215
  Required. ID of the EntityType of the entity.
289
216
 
@@ -296,7 +223,7 @@ class EntityMixin:
296
223
  confidence (float or None)
297
224
  Optional confidence score, between 0.0 and 1.0.
298
225
 
299
- :return: List of dicts, with each dict having a two keys, `transcription_entity_id` and `entity_id`, holding the UUID of each created object.
226
+ :return: List of dicts, each holding the UUID of a created transcription entity.
300
227
  """
301
228
  assert transcription and isinstance(transcription, Transcription), (
302
229
  "transcription shouldn't be null and should be of type Transcription"
@@ -311,11 +238,6 @@ class EntityMixin:
311
238
  f"Entity at index {index} in entities: Should be of type dict"
312
239
  )
313
240
 
314
- name = entity.get("name")
315
- assert name and isinstance(name, str), (
316
- f"Entity at index {index} in entities: name shouldn't be null and should be of type str"
317
- )
318
-
319
241
  type_id = entity.get("type_id")
320
242
  assert type_id and isinstance(type_id, str), (
321
243
  f"Entity at index {index} in entities: type_id shouldn't be null and should be of type str"
@@ -339,7 +261,7 @@ class EntityMixin:
339
261
  )
340
262
 
341
263
  assert len(entities) == len(
342
- set(map(itemgetter("offset", "length", "name", "type_id"), entities))
264
+ set(map(itemgetter("offset", "length", "type_id"), entities))
343
265
  ), "entities should be unique"
344
266
 
345
267
  if self.is_read_only:
@@ -348,16 +270,16 @@ class EntityMixin:
348
270
  )
349
271
  return
350
272
 
351
- created_entities = self.api_client.request(
273
+ created_tr_entities = self.api_client.request(
352
274
  "CreateTranscriptionEntities",
353
275
  id=transcription.id,
354
276
  body={
355
277
  "worker_run_id": self.worker_run_id,
356
- "entities": entities,
278
+ "transcription_entities": entities,
357
279
  },
358
- )["entities"]
280
+ )["transcription_entities"]
359
281
 
360
- return created_entities
282
+ return created_tr_entities
361
283
 
362
284
  def list_transcription_entities(
363
285
  self,
@@ -412,34 +334,3 @@ class EntityMixin:
412
334
  return self.api_client.paginate(
413
335
  "ListTranscriptionEntities", id=transcription.id, **query_params
414
336
  )
415
-
416
- def list_corpus_entities(
417
- self,
418
- name: str | None = None,
419
- parent: Element | None = None,
420
- ):
421
- """
422
- List all entities in the worker's corpus and store them in the ``self.entities`` cache.
423
- :param name: Filter entities by part of their name (case-insensitive)
424
- :param parent: Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.
425
- """
426
- query_params = {}
427
-
428
- if name is not None:
429
- assert name and isinstance(name, str), "name should be of type str"
430
- query_params["name"] = name
431
-
432
- if parent is not None:
433
- assert isinstance(parent, Element), "parent should be of type Element"
434
- query_params["parent"] = parent.id
435
-
436
- self.entities = {
437
- entity["id"]: entity
438
- for entity in self.api_client.paginate(
439
- "ListCorpusEntities", id=self.corpus_id, **query_params
440
- )
441
- }
442
- count = len(self.entities)
443
- logger.info(
444
- f"Loaded {count} {pluralize('entity', count)} in corpus ({self.corpus_id})"
445
- )
@@ -20,10 +20,10 @@ class MetaType(Enum):
20
20
  A regular string with no special interpretation.
21
21
  """
22
22
 
23
- HTML = "html"
23
+ Markdown = "markdown"
24
24
  """
25
- A metadata with a string value that should be interpreted as HTML content.
26
- The allowed HTML tags are restricted for security reasons.
25
+ A metadata with a string value that should be interpreted as Markdown content.
26
+ HTML is allowed, but the allowed HTML tags are restricted for security reasons.
27
27
  """
28
28
 
29
29
  Date = "date"
@@ -64,7 +64,6 @@ class MetaDataMixin:
64
64
  type: MetaType,
65
65
  name: str,
66
66
  value: str,
67
- entity: str | None = None,
68
67
  ) -> str:
69
68
  """
70
69
  Create a metadata on the given element through API.
@@ -73,7 +72,6 @@ class MetaDataMixin:
73
72
  :param type: Type of the metadata.
74
73
  :param name: Name of the metadata.
75
74
  :param value: Value of the metadata.
76
- :param entity: UUID of an entity this metadata is related to.
77
75
  :returns: UUID of the created metadata.
78
76
  """
79
77
  assert element and isinstance(element, Element | CachedElement), (
@@ -88,8 +86,6 @@ class MetaDataMixin:
88
86
  assert value and isinstance(value, str), (
89
87
  "value shouldn't be null and should be of type str"
90
88
  )
91
- if entity:
92
- assert isinstance(entity, str), "entity should be of type str"
93
89
  if self.is_read_only:
94
90
  logger.warning("Cannot create metadata as this worker is in read-only mode")
95
91
  return
@@ -101,7 +97,6 @@ class MetaDataMixin:
101
97
  "type": type.value,
102
98
  "name": name,
103
99
  "value": value,
104
- "entity_id": entity,
105
100
  "worker_run_id": self.worker_run_id,
106
101
  },
107
102
  )
@@ -125,7 +120,6 @@ class MetaDataMixin:
125
120
  - type: MetaType
126
121
  - name: str
127
122
  - value: str | int | float
128
- - entity_id: str | None
129
123
  :param batch_size: The size of each batch, which will be used to split the publication to avoid API errors.
130
124
 
131
125
  :returns: A list of dicts as returned in the ``metadata_list`` field by the ``CreateMetaDataBulk`` API endpoint.
@@ -157,16 +151,11 @@ class MetaDataMixin:
157
151
  metadata.get("value"), str | float | int
158
152
  ), "value shouldn't be null and should be of type (str or float or int)"
159
153
 
160
- assert metadata.get("entity_id") is None or isinstance(
161
- metadata.get("entity_id"), str
162
- ), "entity_id should be None or a str"
163
-
164
154
  metas.append(
165
155
  {
166
156
  "type": metadata.get("type").value,
167
157
  "name": metadata.get("name"),
168
158
  "value": metadata.get("value"),
169
- "entity_id": metadata.get("entity_id"),
170
159
  }
171
160
  )
172
161
 
tests/conftest.py CHANGED
@@ -103,12 +103,6 @@ def _mock_worker_run_api(responses):
103
103
  payload = {
104
104
  "id": "56785678-5678-5678-5678-567856785678",
105
105
  "parents": [],
106
- "worker": {
107
- "id": "deadbeef-1234-5678-1234-worker",
108
- "name": "Fake worker",
109
- "slug": "fake_worker",
110
- "type": "classifier",
111
- },
112
106
  "worker_version": {
113
107
  "id": "12341234-1234-1234-1234-123412341234",
114
108
  "configuration": {
@@ -153,6 +147,7 @@ def _mock_worker_run_api(responses):
153
147
  "train_folder_id": None,
154
148
  "validation_folder_id": None,
155
149
  "test_folder_id": None,
150
+ "skip_elements_json": False,
156
151
  },
157
152
  "summary": "Worker Fake worker @ 123412",
158
153
  }
@@ -165,6 +160,13 @@ def _mock_worker_run_api(responses):
165
160
  content_type="application/json",
166
161
  )
167
162
 
163
+ # By default, stick to classic configuration
164
+ responses.add(
165
+ responses.GET,
166
+ "http://testserver/api/v1/workers/runs/56785678-5678-5678-5678-567856785678/configuration/",
167
+ status=400,
168
+ )
169
+
168
170
 
169
171
  @pytest.fixture
170
172
  def _mock_worker_run_no_revision_api(responses):
@@ -172,12 +174,6 @@ def _mock_worker_run_no_revision_api(responses):
172
174
  payload = {
173
175
  "id": "56785678-5678-5678-5678-567856785678",
174
176
  "parents": [],
175
- "worker": {
176
- "id": "deadbeef-1234-5678-1234-worker",
177
- "name": "Fake worker",
178
- "slug": "fake_worker",
179
- "type": "classifier",
180
- },
181
177
  "worker_version": {
182
178
  "id": "12341234-1234-1234-1234-123412341234",
183
179
  "configuration": {
@@ -233,6 +229,56 @@ def _mock_worker_run_no_revision_api(responses):
233
229
  )
234
230
 
235
231
 
232
+ @pytest.fixture
233
+ def mock_base_worker_modern_conf(mocker, responses):
234
+ """
235
+ Provide a base worker to test modern configuration with (not provided in the fixture)
236
+ """
237
+ worker = BaseWorker()
238
+ mocker.patch.object(sys, "argv")
239
+ worker.args = worker.parser.parse_args()
240
+
241
+ payload = {
242
+ "id": "56785678-5678-5678-5678-567856785678",
243
+ "parents": [],
244
+ "worker_version": {
245
+ "id": "12341234-1234-1234-1234-123412341234",
246
+ "worker": {
247
+ "id": "deadbeef-1234-5678-1234-worker",
248
+ "name": "Fake worker",
249
+ "slug": "fake_worker",
250
+ "type": "classifier",
251
+ },
252
+ "revision": {"hash": "deadbeef1234"},
253
+ "configuration": {
254
+ "configuration": {"extra_key1": "not showing up"},
255
+ "user_configuration": {"extra_key2": "not showing up"},
256
+ },
257
+ },
258
+ "configuration": {
259
+ "id": "af0daaf4-983e-4703-a7ed-a10f146d6684",
260
+ "name": "my-userconfig",
261
+ "configuration": {
262
+ "extra_key3": "not showing up",
263
+ },
264
+ },
265
+ "model_version": None,
266
+ "process": {
267
+ "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
268
+ "corpus": CORPUS_ID,
269
+ },
270
+ "summary": "Worker Fake worker @ 123412",
271
+ }
272
+ responses.add(
273
+ responses.GET,
274
+ "http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
275
+ status=200,
276
+ json=payload,
277
+ )
278
+
279
+ return worker
280
+
281
+
236
282
  @pytest.fixture
237
283
  def _mock_activity_calls(responses):
238
284
  """
@@ -282,6 +328,61 @@ def mock_elements_worker_with_list(monkeypatch, responses, mock_elements_worker)
282
328
  return mock_elements_worker
283
329
 
284
330
 
331
+ @pytest.fixture
332
+ def mock_elements_worker_consume_wa(monkeypatch, responses, mock_elements_worker):
333
+ """
334
+ Mock a worker instance to use StartWorkerActivity to consume worker activities
335
+ instead of reading a JSON file
336
+ """
337
+
338
+ # Enable consume worker activities through the process configuration
339
+ responses.replace(
340
+ responses.GET,
341
+ "http://testserver/api/v1/process/workers/56785678-5678-5678-5678-567856785678/",
342
+ status=200,
343
+ json={
344
+ "id": "56785678-5678-5678-5678-567856785678",
345
+ "parents": [],
346
+ "worker_version": {
347
+ "id": "12341234-1234-1234-1234-123412341234",
348
+ "configuration": {
349
+ "docker": {"image": "python:3"},
350
+ "configuration": {"someKey": "someValue"},
351
+ "secrets": [],
352
+ },
353
+ "worker": {
354
+ "id": "deadbeef-1234-5678-1234-worker",
355
+ "name": "Fake worker",
356
+ "slug": "fake_worker",
357
+ "type": "classifier",
358
+ },
359
+ },
360
+ "configuration": None,
361
+ "model_version": None,
362
+ "process": {
363
+ "name": None,
364
+ "id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff",
365
+ "state": "running",
366
+ "mode": "workers",
367
+ "corpus": CORPUS_ID,
368
+ "use_cache": False,
369
+ "activity_state": "ready",
370
+ "model_id": None,
371
+ "train_folder_id": None,
372
+ "validation_folder_id": None,
373
+ "test_folder_id": None,
374
+ "skip_elements_json": True,
375
+ },
376
+ "summary": "Worker Fake worker @ 123412",
377
+ },
378
+ )
379
+
380
+ # Call configure again to use updated process infos
381
+ mock_elements_worker.configure()
382
+
383
+ return mock_elements_worker
384
+
385
+
285
386
  @pytest.fixture
286
387
  def mock_cache_db(tmp_path):
287
388
  cache_path = tmp_path / "db.sqlite"