datashare-python 0.4.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {datashare_python-0.4.0 → datashare_python-0.6.0}/PKG-INFO +1 -1
  2. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/cli/__init__.py +0 -2
  3. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/cli/worker.py +1 -8
  4. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/conftest.py +8 -0
  5. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/discovery.py +11 -31
  6. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/objects.py +12 -0
  7. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/utils.py +24 -28
  8. datashare_python-0.6.0/datashare_python/worker-template.tar.gz +0 -0
  9. {datashare_python-0.4.0 → datashare_python-0.6.0}/pyproject.toml +1 -1
  10. datashare_python-0.4.0/datashare_python/cli/local.py +0 -33
  11. datashare_python-0.4.0/datashare_python/local_client.py +0 -69
  12. datashare_python-0.4.0/datashare_python/worker-template.tar.gz +0 -0
  13. {datashare_python-0.4.0 → datashare_python-0.6.0}/.gitignore +0 -0
  14. {datashare_python-0.4.0 → datashare_python-0.6.0}/README.md +0 -0
  15. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/.gitignore +0 -0
  16. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/__init__.py +0 -0
  17. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/__main__.py +0 -0
  18. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/cli/project.py +0 -0
  19. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/cli/task.py +0 -0
  20. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/cli/utils.py +0 -0
  21. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/config.py +0 -0
  22. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/constants.py +0 -0
  23. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/dependencies.py +0 -0
  24. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/exceptions.py +0 -0
  25. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/task_client.py +0 -0
  26. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/template.py +0 -0
  27. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/types_.py +0 -0
  28. {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datashare-python
3
- Version: 0.4.0
3
+ Version: 0.6.0
4
4
  Summary: Manage Pythoœn tasks and local resources in Datashare
5
5
  Project-URL: Homepage, https://icij.github.io/datashare-python/
6
6
  Project-URL: Documentation, https://icij.github.io/datashare-python/
@@ -6,7 +6,6 @@ import typer
6
6
  from icij_common.logging_utils import setup_loggers
7
7
 
8
8
  import datashare_python
9
- from datashare_python.cli.local import local_app
10
9
  from datashare_python.cli.project import project_app
11
10
  from datashare_python.cli.task import task_app
12
11
  from datashare_python.cli.utils import AsyncTyper
@@ -16,7 +15,6 @@ cli_app = AsyncTyper(
16
15
  context_settings={"help_option_names": ["-h", "--help"]},
17
16
  pretty_exceptions_enable=False,
18
17
  )
19
- cli_app.add_typer(local_app)
20
18
  cli_app.add_typer(project_app)
21
19
  cli_app.add_typer(task_app)
22
20
  cli_app.add_typer(worker_app)
@@ -24,7 +24,6 @@ _LIST_ACTIVITY_NAMES_HELP = "activity names filters (supports regexes)"
24
24
  _START_WORKER_WORKFLOWS_HELP = "workflow names run by the worker (supports regexes)"
25
25
  _START_WORKER_ACTIVITIES_HELP = "activity names run by the worker (supports regexes)"
26
26
  _START_WORKER_DEPS_HELP = "worker lifetime dependencies name in the registry"
27
- _START_WORKER_CONFIG_HELP = "worker config class key the worker configs registry"
28
27
  _START_WORKER_WORKER_ID_PREFIX_HELP = "worker ID prefix"
29
28
  _START_WORKER_CONFIG_PATH_HELP = (
30
29
  "path to a worker config YAML file,"
@@ -86,9 +85,6 @@ async def start(
86
85
  dependencies: Annotated[
87
86
  str | None, typer.Option(help=_START_WORKER_DEPS_HELP)
88
87
  ] = None,
89
- worker_config_name: Annotated[
90
- str | None, typer.Option(help=_START_WORKER_CONFIG_HELP)
91
- ] = None,
92
88
  config_path: Annotated[
93
89
  Path | None,
94
90
  typer.Option(
@@ -107,10 +103,7 @@ async def start(
107
103
  ] = None,
108
104
  ) -> None:
109
105
  registered_wfs, registered_acts, registered_deps, worker_config_cls = discover(
110
- workflows,
111
- act_names=activities,
112
- deps_name=dependencies,
113
- worker_config_name=worker_config_name,
106
+ workflows, act_names=activities, deps_name=dependencies
114
107
  )
115
108
  if config_path is not None:
116
109
  with config_path.open() as f:
@@ -4,6 +4,7 @@ from collections.abc import AsyncGenerator, Generator, Iterator, Sequence
4
4
  from pathlib import Path
5
5
 
6
6
  import aiohttp
7
+ import nest_asyncio
7
8
  import pytest
8
9
  from elasticsearch._async.helpers import async_streaming_bulk
9
10
  from icij_common.es import DOC_ROOT_ID, ES_DOCUMENT_TYPE, ID, ESClient
@@ -200,6 +201,8 @@ def index_docs_ops(
200
201
  }
201
202
  doc = doc.model_dump(by_alias=True) # noqa: PLW2901
202
203
  op.update(doc)
204
+ if "path" in op:
205
+ op["path"] = str(op["path"])
203
206
  op["_id"] = doc[ID]
204
207
  op["routing"] = doc[DOC_ROOT_ID]
205
208
  op["type"] = ES_DOCUMENT_TYPE
@@ -294,3 +297,8 @@ async def all_done(task_client: DatashareTaskClient, not_done: list[str]) -> boo
294
297
  return False
295
298
  not_done.remove(t_id)
296
299
  return True
300
+
301
+
302
+ @pytest.fixture # noqa: F405
303
+ def typer_asyncio_patch() -> None:
304
+ nest_asyncio.apply()
@@ -13,11 +13,11 @@ logger = logging.getLogger(__name__)
13
13
  Activity = ActivityWithProgress | Callable | type
14
14
 
15
15
  _DEPENDENCIES = "dependencies"
16
- _WORKER_CONFIGS = "worker_configs"
16
+ _WORKER_CONFIG_CLS = "worker_config_cls"
17
17
  _WORKFLOW_GROUP = "datashare.workflows"
18
18
  _ACTIVITIES_GROUP = "datashare.activities"
19
19
  _DEPENDENCIES_GROUP = "datashare.dependencies"
20
- _WORKER_CONFIGS_GROUP = "datashare.worker_configs"
20
+ _WORKER_CONFIG_CLS_GROUP = "datashare.worker_config_cls"
21
21
 
22
22
  _RegisteredWorkflow = tuple[str, type]
23
23
  _RegisteredActivity = tuple[str, Activity]
@@ -31,11 +31,7 @@ _Discovery = tuple[
31
31
 
32
32
 
33
33
  def discover(
34
- wf_names: list[str] | None,
35
- *,
36
- act_names: list[str] | None,
37
- deps_name: str | None,
38
- worker_config_name: str | None,
34
+ wf_names: list[str] | None, *, act_names: list[str] | None, deps_name: str | None
39
35
  ) -> _Discovery:
40
36
  discovered = ""
41
37
  wfs = None
@@ -82,11 +78,8 @@ def discover(
82
78
  f"- {n_deps} dependenc{'ies' if n_deps > 1 else 'y'}:"
83
79
  f" {', '.join(deps_names)}"
84
80
  )
85
- if worker_config_name is not None:
86
- worker_config_cls = discover_worker_configs(worker_config_name)
87
- discovered += f"- worker config class: {worker_config_cls}"
88
- else:
89
- worker_config_cls = WorkerConfig
81
+ worker_config_cls = discover_worker_config_cls()
82
+ discovered += f"- worker config class: {worker_config_cls}"
90
83
  logger.info("discovered:\n%s", discovered)
91
84
  return wfs, acts, deps, worker_config_cls
92
85
 
@@ -147,28 +140,15 @@ def discover_dependencies(name: str) -> _Dependencies:
147
140
  raise LookupError(msg) from e
148
141
 
149
142
 
150
- def discover_worker_configs(name: str) -> type[WorkerConfig]:
151
- impls = entry_points(name=_WORKER_CONFIGS, group=_WORKER_CONFIGS_GROUP)
143
+ def discover_worker_config_cls() -> type[WorkerConfig]:
144
+ impls = entry_points(name=_WORKER_CONFIG_CLS, group=_WORKER_CONFIG_CLS_GROUP)
152
145
  if not impls:
153
- available_impls = entry_points(group=_WORKER_CONFIGS_GROUP)
154
- msg = (
155
- f'failed to find worker config: "{name}", '
156
- f"available dependencies: {available_impls}"
157
- )
158
- raise LookupError(msg)
146
+ return WorkerConfig
159
147
  if len(impls) > 1:
160
- msg = f'found multiple worker configs for name "{name}": {impls}'
148
+ msg = f'found multiple registered worker configs classes": {impls}'
161
149
  raise ValueError(msg)
162
- deps_registry = impls[_WORKER_CONFIGS].load()
163
- try:
164
- return deps_registry[name]
165
- except KeyError as e:
166
- available = list(deps_registry)
167
- msg = (
168
- f'failed to find worker config for name "{name}", available worker '
169
- f"configs: {available}"
170
- )
171
- raise LookupError(msg) from e
150
+ deps_registry = impls[_WORKER_CONFIG_CLS].load()
151
+ return deps_registry
172
152
 
173
153
 
174
154
  def _parse_wf_name(wf_type: type) -> str:
@@ -3,6 +3,8 @@ from collections.abc import Awaitable, Callable
3
3
  from dataclasses import dataclass
4
4
  from datetime import UTC, datetime
5
5
  from enum import StrEnum, unique
6
+ from io import BytesIO
7
+ from pathlib import Path
6
8
  from typing import Any, Literal, Self, TypeVar
7
9
 
8
10
  from temporalio import workflow
@@ -142,6 +144,7 @@ class Document(DatashareModel):
142
144
  language: str
143
145
  content: str | None = None
144
146
  content_type: str | None = None
147
+ path: Path | None = None
145
148
  tags: list[str] = Field(default_factory=list)
146
149
  content_translated: dict[str, str] = Field(
147
150
  default_factory=dict, alias="content_translated"
@@ -159,3 +162,12 @@ class Document(DatashareModel):
159
162
  root_document=sources[DOC_ROOT_ID],
160
163
  tags=sources.get("tags", []),
161
164
  )
165
+
166
+
167
+ @dataclass(frozen=True)
168
+ class DocArtifact:
169
+ project: str
170
+ doc_id: str
171
+ artifact: bytes | BytesIO
172
+ filename: str
173
+ metadata_key: str
@@ -13,6 +13,7 @@ from datetime import timedelta
13
13
  from functools import partial, wraps
14
14
  from hashlib import sha256
15
15
  from inspect import signature
16
+ from io import BytesIO
16
17
  from pathlib import Path
17
18
  from typing import Any, ParamSpec, TypeVar
18
19
  from uuid import uuid4
@@ -34,6 +35,7 @@ from temporalio.common import RetryPolicy, SearchAttributeKey
34
35
  from temporalio.exceptions import ApplicationError
35
36
 
36
37
  from .constants import METADATA_JSON
38
+ from .objects import DocArtifact
37
39
  from .types_ import ProgressRateHandler, RawProgressHandler
38
40
 
39
41
  DependencyLabel = str | None
@@ -431,49 +433,43 @@ class LogWithWorkerIDMixin:
431
433
  return handlers
432
434
 
433
435
 
434
- def safe_dir(filename: str) -> Path:
435
- filename = filename.split(".", maxsplit=1)[0]
436
- parts = (p for p in (filename[:2], filename[2:4]) if p)
436
+ def safe_dir(doc_id: str) -> Path:
437
+ if len(doc_id) < 4:
438
+ raise ValueError(f"expected doc_id to be at least 4, found {doc_id}")
439
+ parts = (p for p in (doc_id[:2], doc_id[2:4]) if p)
437
440
  return Path(*parts)
438
441
 
439
442
 
440
- def artifacts_dir(project: str, *, filename: str) -> Path:
441
- return Path(project, safe_dir(filename), filename)
443
+ def _artifacts_dir(doc_id: str, *, project: str) -> Path:
444
+ return Path(project, safe_dir(doc_id), doc_id)
442
445
 
443
446
 
444
- def metadata_path(filename: str, *, project: str) -> Path:
445
- metadata_path = artifacts_dir(project, filename=filename) / METADATA_JSON
447
+ def _metadata_path(doc_id: str, *, project: str) -> Path:
448
+ metadata_path = _artifacts_dir(doc_id, project=project) / METADATA_JSON
446
449
  return metadata_path
447
450
 
448
451
 
449
- def _read_artifact_metadata(root: Path, project: str, *, filename: str) -> dict:
450
- m_path = root / metadata_path(filename, project=project)
452
+ def _read_artifact_metadata(root: Path, artifact: DocArtifact) -> dict:
453
+ m_path = root / _metadata_path(artifact.filename, project=artifact.project)
451
454
  return json.loads(m_path.read_text())
452
455
 
453
456
 
454
- def write_artifact(
455
- artifact: bytes,
456
- root: Path,
457
- *,
458
- project: str,
459
- filename: str,
460
- metadata_key: str,
461
- metadata_value: str,
462
- ) -> Path:
463
- artif_dir = root / artifacts_dir(project, filename=filename)
457
+ def write_artifact(root: Path, artifact: DocArtifact) -> Path:
458
+ artif_dir = root / _artifacts_dir(artifact.doc_id, project=artifact.project)
464
459
  artif_dir.mkdir(exist_ok=True, parents=True)
465
460
  # TODO: if transcriptions are too large we could also serialize them
466
461
  # as jsonl
467
- transcription_path = artif_dir / metadata_value
468
- transcription_path.write_bytes(artifact)
469
- try:
470
- meta = _read_artifact_metadata(root, project, filename=filename)
471
- except FileNotFoundError:
472
- meta = dict()
473
- meta[metadata_key] = metadata_value
474
- meta_path = root / artifacts_dir(project, filename=filename) / METADATA_JSON
462
+ artifact_path: Path = artif_dir / artifact.filename
463
+ if isinstance(artifact.artifact, bytes):
464
+ artifact_path.write_bytes(artifact.artifact)
465
+ elif isinstance(artifact_path, BytesIO):
466
+ with artifact_path.open("wb") as f:
467
+ f.write(artifact.artifact.read())
468
+ meta_path = root / _metadata_path(artifact.doc_id, project=artifact.project)
469
+ meta = _read_artifact_metadata(root, artifact) if meta_path.exists() else dict()
470
+ meta[artifact.metadata_key] = artifact.filename
475
471
  meta_path.write_text(json.dumps(meta))
476
- return transcription_path.relative_to(artif_dir)
472
+ return artifact_path.relative_to(artif_dir)
477
473
 
478
474
 
479
475
  def debuggable_name(
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datashare-python"
3
- version = "0.4.0"
3
+ version = "0.6.0"
4
4
  description = "Manage Pythoœn tasks and local resources in Datashare"
5
5
  authors = [
6
6
  { name = "Clément Doumouro", email = "cdoumouro@icij.org" },
@@ -1,33 +0,0 @@
1
- from typing import Annotated
2
-
3
- import typer
4
-
5
- from datashare_python.cli.utils import AsyncTyper
6
- from datashare_python.constants import DEFAULT_NAMESPACE, DEFAULT_TEMPORAL_ADDRESS
7
- from datashare_python.local_client import LocalClient
8
-
9
- _REGISTER_NAMESPACE_HELP = "register namespace"
10
- _TEMPORAL_URL_HELP = "address for temporal server"
11
- _NAMESPACE_HELP = "namespace name"
12
- _LOCAL = "local"
13
-
14
- local_app = AsyncTyper(name=_LOCAL)
15
-
16
-
17
- @local_app.async_command(help=_REGISTER_NAMESPACE_HELP)
18
- async def register_namespace(
19
- namespace: Annotated[
20
- str, typer.Option("--namespace", "-n", help=_NAMESPACE_HELP)
21
- ] = DEFAULT_NAMESPACE,
22
- temporal_address: Annotated[
23
- str, typer.Option("--temporal-address", "-a", help=_TEMPORAL_URL_HELP)
24
- ] = DEFAULT_TEMPORAL_ADDRESS,
25
- ) -> None:
26
- """Create namespace
27
-
28
- :param namespace: namespace
29
- :param temporal_address: target host
30
- """
31
- client = LocalClient()
32
-
33
- await client.register_namespace(temporal_address, namespace)
@@ -1,69 +0,0 @@
1
- import asyncio
2
- import importlib
3
- import logging
4
- from functools import partial
5
-
6
- from google.protobuf.duration_pb2 import Duration
7
- from temporalio.api.workflowservice.v1 import (
8
- ListNamespacesRequest,
9
- RegisterNamespaceRequest,
10
- )
11
- from temporalio.service import ConnectConfig, ServiceClient
12
-
13
- LOGGER = logging.getLogger(__name__)
14
-
15
-
16
- class LocalClient:
17
- @staticmethod
18
- async def start_workers(
19
- temporal_address: str, worker_paths: list[str], *args, **kwargs
20
- ) -> None:
21
- """Start worker modules, defined as a list of worker paths
22
-
23
- :param temporal_address: temporal address string (host:port)
24
- :param worker_paths: list of worker modules
25
- """
26
- workers = []
27
-
28
- for worker_path in worker_paths:
29
- try:
30
- module_parts = worker_path.split(".")
31
- module_path = ".".join(module_parts[:-1])
32
- worker_method = module_parts[-1]
33
- module = importlib.import_module(module_path)
34
- worker = getattr(module, worker_method)
35
- worker_partial = partial(worker, *args, **kwargs)
36
-
37
- workers.append(worker_partial(target_host=temporal_address))
38
-
39
- LOGGER.info("'%s' imported successfully.", worker_path)
40
- except (ModuleNotFoundError, AttributeError):
41
- LOGGER.error("'%s' not found in path. Skipping.", worker_path)
42
- continue
43
-
44
- await asyncio.gather(*workers, return_exceptions=True)
45
-
46
- @staticmethod
47
- async def register_namespace(temporal_address: str, namespace: str) -> None:
48
- """Register a temporal namespace
49
-
50
- :param temporal_address: temporal address string
51
- :param namespace: namespace string
52
- """
53
- client = await ServiceClient.connect(
54
- ConnectConfig(target_host=temporal_address)
55
- )
56
- list_resp = await client.workflow_service.list_namespaces(
57
- ListNamespacesRequest()
58
- )
59
-
60
- if namespace in [ns.namespace_info.name for ns in list_resp.namespaces]:
61
- return
62
-
63
- await client.workflow_service.register_namespace(
64
- RegisterNamespaceRequest(
65
- namespace=namespace,
66
- # retain for thirty days
67
- workflow_execution_retention_period=Duration(seconds=30 * 24 * 60 * 60),
68
- )
69
- )