datashare-python 0.5.0__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {datashare_python-0.5.0 → datashare_python-0.6.1}/PKG-INFO +1 -1
  2. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/cli/__init__.py +0 -2
  3. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/cli/task.py +2 -3
  4. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/config.py +5 -0
  5. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/conftest.py +9 -0
  6. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/constants.py +2 -3
  7. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/objects.py +98 -5
  8. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/utils.py +40 -38
  9. datashare_python-0.6.1/datashare_python/worker-template.tar.gz +0 -0
  10. {datashare_python-0.5.0 → datashare_python-0.6.1}/pyproject.toml +1 -1
  11. datashare_python-0.5.0/datashare_python/cli/local.py +0 -33
  12. datashare_python-0.5.0/datashare_python/local_client.py +0 -69
  13. datashare_python-0.5.0/datashare_python/worker-template.tar.gz +0 -0
  14. {datashare_python-0.5.0 → datashare_python-0.6.1}/.gitignore +0 -0
  15. {datashare_python-0.5.0 → datashare_python-0.6.1}/README.md +0 -0
  16. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/.gitignore +0 -0
  17. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/__init__.py +0 -0
  18. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/__main__.py +0 -0
  19. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/cli/project.py +0 -0
  20. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/cli/utils.py +0 -0
  21. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/cli/worker.py +0 -0
  22. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/dependencies.py +0 -0
  23. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/discovery.py +0 -0
  24. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/exceptions.py +0 -0
  25. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/task_client.py +0 -0
  26. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/template.py +0 -0
  27. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/types_.py +0 -0
  28. {datashare_python-0.5.0 → datashare_python-0.6.1}/datashare_python/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datashare-python
3
- Version: 0.5.0
3
+ Version: 0.6.1
4
4
  Summary: Manage Pythoœn tasks and local resources in Datashare
5
5
  Project-URL: Homepage, https://icij.github.io/datashare-python/
6
6
  Project-URL: Documentation, https://icij.github.io/datashare-python/
@@ -6,7 +6,6 @@ import typer
6
6
  from icij_common.logging_utils import setup_loggers
7
7
 
8
8
  import datashare_python
9
- from datashare_python.cli.local import local_app
10
9
  from datashare_python.cli.project import project_app
11
10
  from datashare_python.cli.task import task_app
12
11
  from datashare_python.cli.utils import AsyncTyper
@@ -16,7 +15,6 @@ cli_app = AsyncTyper(
16
15
  context_settings={"help_option_names": ["-h", "--help"]},
17
16
  pretty_exceptions_enable=False,
18
17
  )
19
- cli_app.add_typer(local_app)
20
18
  cli_app.add_typer(project_app)
21
19
  cli_app.add_typer(task_app)
22
20
  cli_app.add_typer(worker_app)
@@ -10,8 +10,7 @@ import typer
10
10
  from alive_progress import alive_bar
11
11
 
12
12
  from datashare_python.cli.utils import AsyncTyper, eprint
13
- from datashare_python.constants import PYTHON_TASK_GROUP
14
- from datashare_python.objects import READY_STATES, Task, TaskError, TaskState
13
+ from datashare_python.objects import READY_STATES, Task, TaskError, TaskGroup, TaskState
15
14
  from datashare_python.task_client import DatashareTaskClient
16
15
 
17
16
  logger = logging.getLogger(__name__)
@@ -41,7 +40,7 @@ async def start(
41
40
  group: Annotated[
42
41
  str | None,
43
42
  typer.Option("--group", "-g", help=_GROUP_HELP),
44
- ] = PYTHON_TASK_GROUP.name,
43
+ ] = TaskGroup.python, # noqa: F821
45
44
  ds_address: Annotated[
46
45
  str, typer.Option("--ds-address", "-a", help=_DS_URL_HELP)
47
46
  ] = DEFAULT_DS_ADDRESS,
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  from typing import ClassVar
2
3
 
3
4
  from icij_common.es import ESClient
@@ -87,6 +88,10 @@ class WorkerConfig(ICIJSettings, LogWithWorkerIDMixin, BaseModel):
87
88
 
88
89
  max_concurrent_io_activities: int = 5
89
90
 
91
+ docs_root: Path | None = None
92
+ artifacts_root: Path | None = None
93
+ workdir: Path | None = None
94
+
90
95
  def to_es_client(self) -> ESClient:
91
96
  return self.elasticsearch.to_es_client(self.datashare.api_key)
92
97
 
@@ -4,6 +4,7 @@ from collections.abc import AsyncGenerator, Generator, Iterator, Sequence
4
4
  from pathlib import Path
5
5
 
6
6
  import aiohttp
7
+ import nest_asyncio
7
8
  import pytest
8
9
  from elasticsearch._async.helpers import async_streaming_bulk
9
10
  from icij_common.es import DOC_ROOT_ID, ES_DOCUMENT_TYPE, ID, ESClient
@@ -200,6 +201,8 @@ def index_docs_ops(
200
201
  }
201
202
  doc = doc.model_dump(by_alias=True) # noqa: PLW2901
202
203
  op.update(doc)
204
+ if "path" in op:
205
+ op["path"] = str(op["path"])
203
206
  op["_id"] = doc[ID]
204
207
  op["routing"] = doc[DOC_ROOT_ID]
205
208
  op["type"] = ES_DOCUMENT_TYPE
@@ -259,6 +262,7 @@ def doc_2() -> Document:
259
262
  def doc_3() -> Document:
260
263
  return Document(
261
264
  id="doc-3",
265
+ index=TEST_PROJECT,
262
266
  root_document="root-3",
263
267
  language="SPANISH",
264
268
  content="traduce este texto al inglés",
@@ -294,3 +298,8 @@ async def all_done(task_client: DatashareTaskClient, not_done: list[str]) -> boo
294
298
  return False
295
299
  not_done.remove(t_id)
296
300
  return True
301
+
302
+
303
+ @pytest.fixture # noqa: F405
304
+ def typer_asyncio_patch() -> None:
305
+ nest_asyncio.apply()
@@ -1,11 +1,8 @@
1
1
  from pathlib import Path
2
2
 
3
- from .objects import TaskGroup
4
-
5
3
  PACKAGE_DIR = Path(__file__).parent
6
4
  PACKAGE_ROOT = PACKAGE_DIR.parent
7
5
 
8
- PYTHON_TASK_GROUP = TaskGroup(name="PYTHON")
9
6
 
10
7
  DEFAULT_TEMPORAL_ADDRESS = "temporal:7233"
11
8
 
@@ -14,3 +11,5 @@ DEFAULT_DS_ADDRESS = "http://localhost:8080"
14
11
  DEFAULT_NAMESPACE = "datashare-default"
15
12
 
16
13
  METADATA_JSON = "metadata.json"
14
+
15
+ TIKA_METADATA_RESOURCENAME = "tika_metadata_resourcename"
@@ -3,12 +3,26 @@ from collections.abc import Awaitable, Callable
3
3
  from dataclasses import dataclass
4
4
  from datetime import UTC, datetime
5
5
  from enum import StrEnum, unique
6
- from typing import Any, Literal, Self, TypeVar
6
+ from io import BytesIO
7
+ from pathlib import Path
8
+ from typing import Any, Literal, Self, TypeVar, cast
7
9
 
8
10
  from temporalio import workflow
9
11
 
12
+ from .constants import TIKA_METADATA_RESOURCENAME
13
+
10
14
  with workflow.unsafe.imports_passed_through():
11
- from icij_common.es import DOC_CONTENT, DOC_LANGUAGE, DOC_ROOT_ID, ID_, SOURCE
15
+ from icij_common.es import (
16
+ DOC_CONTENT,
17
+ DOC_CONTENT_TRANSLATED,
18
+ DOC_LANGUAGE,
19
+ DOC_METADATA,
20
+ DOC_PATH,
21
+ DOC_ROOT_ID,
22
+ ID_,
23
+ INDEX_,
24
+ SOURCE,
25
+ )
12
26
 
13
27
  from icij_common.pydantic_utils import (
14
28
  icij_config,
@@ -135,17 +149,56 @@ class Task(Message):
135
149
  class TaskGroup:
136
150
  name: str
137
151
 
152
+ @property
153
+ @classmethod
154
+ def python(cls) -> Self:
155
+ return cls(name="PYTHON")
156
+
157
+
158
+ @unique
159
+ class DocumentLocation(StrEnum):
160
+ ORIGINAL = "original"
161
+ ARTIFACTS = "artifacts"
162
+ WORKDIR = "workdir"
163
+
164
+
165
+ class FilesystemDocument(DatashareModel):
166
+ id: str
167
+ path: Path
168
+ index: str
169
+ location: DocumentLocation
170
+ resource_name: str
171
+
172
+ def locate(
173
+ self, original_root: Path, *, artifacts_root: Path, workdir: Path
174
+ ) -> Path:
175
+ from datashare_python.utils import artifacts_dir # noqa: PLC0415
176
+
177
+ project = self.index
178
+ match self.location:
179
+ case DocumentLocation.ORIGINAL:
180
+ return original_root / self.path
181
+ case DocumentLocation.ARTIFACTS:
182
+ return artifacts_root / artifacts_dir(self.id, project=project) / "raw"
183
+ case DocumentLocation.WORKDIR:
184
+ return workdir / self.path
185
+ case _:
186
+ raise ValueError(f"invalid location: {self.path}")
187
+
138
188
 
139
189
  class Document(DatashareModel):
140
190
  id: str
141
- root_document: str
142
191
  language: str
192
+ index: str | None = None
193
+ root_document: str | None = None
143
194
  content: str | None = None
144
195
  content_type: str | None = None
196
+ path: Path | None = None
145
197
  tags: list[str] = Field(default_factory=list)
146
198
  content_translated: dict[str, str] = Field(
147
199
  default_factory=dict, alias="content_translated"
148
200
  )
201
+ metadata: dict[str, Any] | None = None
149
202
  type: str = Field(default="Document", frozen=True)
150
203
 
151
204
  @classmethod
@@ -153,9 +206,49 @@ class Document(DatashareModel):
153
206
  sources = es_doc[SOURCE]
154
207
  return cls(
155
208
  id=es_doc[ID_],
156
- content=sources[DOC_CONTENT],
157
- content_translated=sources.get("content_translated", dict()),
209
+ index=es_doc.get(INDEX_),
210
+ content=sources.get(DOC_CONTENT),
211
+ content_translated=sources.get(DOC_CONTENT_TRANSLATED, dict()),
158
212
  language=sources[DOC_LANGUAGE],
159
213
  root_document=sources[DOC_ROOT_ID],
160
214
  tags=sources.get("tags", []),
215
+ path=sources.get(DOC_PATH),
216
+ metadata=sources.get(DOC_METADATA),
217
+ )
218
+
219
+ def to_filesystem(self) -> FilesystemDocument:
220
+ from .utils import artifacts_dir # noqa: PLC0415
221
+
222
+ if self.metadata is None:
223
+ raise ValueError(
224
+ "can't compute filesyste path for document withtout metadata"
225
+ )
226
+ resource_name = cast(str, self.metadata[TIKA_METADATA_RESOURCENAME])
227
+ if self.root_document is None:
228
+ path = self.path
229
+ location = DocumentLocation.ORIGINAL
230
+ else:
231
+ if self.index is None:
232
+ msg = (
233
+ f"can't compute filesystem path for embedded doc {self.id} without"
234
+ f" index"
235
+ )
236
+ raise ValueError(msg)
237
+ path = artifacts_dir(doc_id=self.id, project=self.index) / "raw"
238
+ location = DocumentLocation.ARTIFACTS
239
+ return FilesystemDocument(
240
+ id=self.id,
241
+ path=path,
242
+ index=self.index,
243
+ location=location,
244
+ resource_name=resource_name,
161
245
  )
246
+
247
+
248
+ @dataclass(frozen=True)
249
+ class DocArtifact:
250
+ project: str
251
+ doc_id: str
252
+ artifact: bytes | BytesIO
253
+ filename: str
254
+ metadata_key: str
@@ -13,6 +13,7 @@ from datetime import timedelta
13
13
  from functools import partial, wraps
14
14
  from hashlib import sha256
15
15
  from inspect import signature
16
+ from io import BytesIO
16
17
  from pathlib import Path
17
18
  from typing import Any, ParamSpec, TypeVar
18
19
  from uuid import uuid4
@@ -34,6 +35,7 @@ from temporalio.common import RetryPolicy, SearchAttributeKey
34
35
  from temporalio.exceptions import ApplicationError
35
36
 
36
37
  from .constants import METADATA_JSON
38
+ from .objects import DocArtifact
37
39
  from .types_ import ProgressRateHandler, RawProgressHandler
38
40
 
39
41
  DependencyLabel = str | None
@@ -431,49 +433,43 @@ class LogWithWorkerIDMixin:
431
433
  return handlers
432
434
 
433
435
 
434
- def safe_dir(filename: str) -> Path:
435
- filename = filename.split(".", maxsplit=1)[0]
436
- parts = (p for p in (filename[:2], filename[2:4]) if p)
436
+ def safe_dir(doc_id: str) -> Path:
437
+ if len(doc_id) < 4:
438
+ raise ValueError(f"expected doc_id to be at least 4, found {doc_id}")
439
+ parts = (p for p in (doc_id[:2], doc_id[2:4]) if p)
437
440
  return Path(*parts)
438
441
 
439
442
 
440
- def artifacts_dir(project: str, *, filename: str) -> Path:
441
- return Path(project, safe_dir(filename), filename)
443
+ def artifacts_dir(doc_id: str, *, project: str) -> Path:
444
+ return Path(project, safe_dir(doc_id), doc_id)
442
445
 
443
446
 
444
- def metadata_path(filename: str, *, project: str) -> Path:
445
- metadata_path = artifacts_dir(project, filename=filename) / METADATA_JSON
447
+ def _metadata_path(doc_id: str, *, project: str) -> Path:
448
+ metadata_path = artifacts_dir(doc_id, project=project) / METADATA_JSON
446
449
  return metadata_path
447
450
 
448
451
 
449
- def _read_artifact_metadata(root: Path, project: str, *, filename: str) -> dict:
450
- m_path = root / metadata_path(filename, project=project)
452
+ def _read_artifact_metadata(root: Path, artifact: DocArtifact) -> dict:
453
+ m_path = root / _metadata_path(artifact.filename, project=artifact.project)
451
454
  return json.loads(m_path.read_text())
452
455
 
453
456
 
454
- def write_artifact(
455
- artifact: bytes,
456
- root: Path,
457
- *,
458
- project: str,
459
- filename: str,
460
- metadata_key: str,
461
- metadata_value: str,
462
- ) -> Path:
463
- artif_dir = root / artifacts_dir(project, filename=filename)
457
+ def write_artifact(root: Path, artifact: DocArtifact) -> Path:
458
+ artif_dir = root / artifacts_dir(artifact.doc_id, project=artifact.project)
464
459
  artif_dir.mkdir(exist_ok=True, parents=True)
465
460
  # TODO: if transcriptions are too large we could also serialize them
466
461
  # as jsonl
467
- transcription_path = artif_dir / metadata_value
468
- transcription_path.write_bytes(artifact)
469
- try:
470
- meta = _read_artifact_metadata(root, project, filename=filename)
471
- except FileNotFoundError:
472
- meta = dict()
473
- meta[metadata_key] = metadata_value
474
- meta_path = root / artifacts_dir(project, filename=filename) / METADATA_JSON
462
+ artifact_path: Path = artif_dir / artifact.filename
463
+ if isinstance(artifact.artifact, bytes):
464
+ artifact_path.write_bytes(artifact.artifact)
465
+ elif isinstance(artifact_path, BytesIO):
466
+ with artifact_path.open("wb") as f:
467
+ f.write(artifact.artifact.read())
468
+ meta_path = root / _metadata_path(artifact.doc_id, project=artifact.project)
469
+ meta = _read_artifact_metadata(root, artifact) if meta_path.exists() else dict()
470
+ meta[artifact.metadata_key] = artifact.filename
475
471
  meta_path.write_text(json.dumps(meta))
476
- return transcription_path.relative_to(artif_dir)
472
+ return artifact_path.relative_to(artif_dir)
477
473
 
478
474
 
479
475
  def debuggable_name(
@@ -483,17 +479,23 @@ def debuggable_name(
483
479
  displayable_file_name = [c[:component_size_limit] for c in path.parts]
484
480
  uuid = sha256(str(path).encode()).hexdigest() if deterministic else uuid4().hex
485
481
  uuid = uuid[:20]
486
- return f"{uuid}-{'__'.join(displayable_file_name)}"
482
+ return f"{uuid}-{'--'.join(displayable_file_name)}"
487
483
 
488
484
 
489
- def activity_contextual_id(*, wf_context: bool = False) -> str:
485
+ def activity_contextual_id(
486
+ *, wf_context: bool = True, act_context: bool = False, run_context: bool = False
487
+ ) -> str:
488
+ contextual_id = []
490
489
  act_info = activity.info()
491
- act_id = act_info.activity_id
492
- act_run_id = act_info.activity_id
493
- act_type = act_info.activity_type
494
- contextual_id = f"{act_type}-{act_id}-{act_run_id}"
490
+ if not wf_context and not act_context:
491
+ raise ValueError("at least one of wf_context and act_context must be True")
495
492
  if wf_context:
496
- wf_id = act_info.workflow_id
497
- wf_run_id = act_info.workflow_run_id
498
- contextual_id += f"-{wf_id}-{wf_run_id}"
499
- return contextual_id
493
+ contextual_id.append(act_info.workflow_id)
494
+ if run_context:
495
+ contextual_id.append(act_info.workflow_run_id)
496
+ if act_context:
497
+ contextual_id.append(act_info.activity_type)
498
+ contextual_id.append(act_info.activity_id)
499
+ if run_context:
500
+ contextual_id.append(act_info.activity_run_id)
501
+ return "-".join(contextual_id)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datashare-python"
3
- version = "0.5.0"
3
+ version = "0.6.1"
4
4
  description = "Manage Pythoœn tasks and local resources in Datashare"
5
5
  authors = [
6
6
  { name = "Clément Doumouro", email = "cdoumouro@icij.org" },
@@ -1,33 +0,0 @@
1
- from typing import Annotated
2
-
3
- import typer
4
-
5
- from datashare_python.cli.utils import AsyncTyper
6
- from datashare_python.constants import DEFAULT_NAMESPACE, DEFAULT_TEMPORAL_ADDRESS
7
- from datashare_python.local_client import LocalClient
8
-
9
- _REGISTER_NAMESPACE_HELP = "register namespace"
10
- _TEMPORAL_URL_HELP = "address for temporal server"
11
- _NAMESPACE_HELP = "namespace name"
12
- _LOCAL = "local"
13
-
14
- local_app = AsyncTyper(name=_LOCAL)
15
-
16
-
17
- @local_app.async_command(help=_REGISTER_NAMESPACE_HELP)
18
- async def register_namespace(
19
- namespace: Annotated[
20
- str, typer.Option("--namespace", "-n", help=_NAMESPACE_HELP)
21
- ] = DEFAULT_NAMESPACE,
22
- temporal_address: Annotated[
23
- str, typer.Option("--temporal-address", "-a", help=_TEMPORAL_URL_HELP)
24
- ] = DEFAULT_TEMPORAL_ADDRESS,
25
- ) -> None:
26
- """Create namespace
27
-
28
- :param namespace: namespace
29
- :param temporal_address: target host
30
- """
31
- client = LocalClient()
32
-
33
- await client.register_namespace(temporal_address, namespace)
@@ -1,69 +0,0 @@
1
- import asyncio
2
- import importlib
3
- import logging
4
- from functools import partial
5
-
6
- from google.protobuf.duration_pb2 import Duration
7
- from temporalio.api.workflowservice.v1 import (
8
- ListNamespacesRequest,
9
- RegisterNamespaceRequest,
10
- )
11
- from temporalio.service import ConnectConfig, ServiceClient
12
-
13
- LOGGER = logging.getLogger(__name__)
14
-
15
-
16
- class LocalClient:
17
- @staticmethod
18
- async def start_workers(
19
- temporal_address: str, worker_paths: list[str], *args, **kwargs
20
- ) -> None:
21
- """Start worker modules, defined as a list of worker paths
22
-
23
- :param temporal_address: temporal address string (host:port)
24
- :param worker_paths: list of worker modules
25
- """
26
- workers = []
27
-
28
- for worker_path in worker_paths:
29
- try:
30
- module_parts = worker_path.split(".")
31
- module_path = ".".join(module_parts[:-1])
32
- worker_method = module_parts[-1]
33
- module = importlib.import_module(module_path)
34
- worker = getattr(module, worker_method)
35
- worker_partial = partial(worker, *args, **kwargs)
36
-
37
- workers.append(worker_partial(target_host=temporal_address))
38
-
39
- LOGGER.info("'%s' imported successfully.", worker_path)
40
- except (ModuleNotFoundError, AttributeError):
41
- LOGGER.error("'%s' not found in path. Skipping.", worker_path)
42
- continue
43
-
44
- await asyncio.gather(*workers, return_exceptions=True)
45
-
46
- @staticmethod
47
- async def register_namespace(temporal_address: str, namespace: str) -> None:
48
- """Register a temporal namespace
49
-
50
- :param temporal_address: temporal address string
51
- :param namespace: namespace string
52
- """
53
- client = await ServiceClient.connect(
54
- ConnectConfig(target_host=temporal_address)
55
- )
56
- list_resp = await client.workflow_service.list_namespaces(
57
- ListNamespacesRequest()
58
- )
59
-
60
- if namespace in [ns.namespace_info.name for ns in list_resp.namespaces]:
61
- return
62
-
63
- await client.workflow_service.register_namespace(
64
- RegisterNamespaceRequest(
65
- namespace=namespace,
66
- # retain for thirty days
67
- workflow_execution_retention_period=Duration(seconds=30 * 24 * 60 * 60),
68
- )
69
- )