datashare-python 0.4.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datashare_python-0.4.0 → datashare_python-0.6.0}/PKG-INFO +1 -1
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/cli/__init__.py +0 -2
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/cli/worker.py +1 -8
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/conftest.py +8 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/discovery.py +11 -31
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/objects.py +12 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/utils.py +24 -28
- datashare_python-0.6.0/datashare_python/worker-template.tar.gz +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/pyproject.toml +1 -1
- datashare_python-0.4.0/datashare_python/cli/local.py +0 -33
- datashare_python-0.4.0/datashare_python/local_client.py +0 -69
- datashare_python-0.4.0/datashare_python/worker-template.tar.gz +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/.gitignore +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/README.md +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/.gitignore +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/__init__.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/__main__.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/cli/project.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/cli/task.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/cli/utils.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/config.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/constants.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/dependencies.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/exceptions.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/task_client.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/template.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/types_.py +0 -0
- {datashare_python-0.4.0 → datashare_python-0.6.0}/datashare_python/worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datashare-python
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Manage Python tasks and local resources in Datashare
|
|
5
5
|
Project-URL: Homepage, https://icij.github.io/datashare-python/
|
|
6
6
|
Project-URL: Documentation, https://icij.github.io/datashare-python/
|
|
@@ -6,7 +6,6 @@ import typer
|
|
|
6
6
|
from icij_common.logging_utils import setup_loggers
|
|
7
7
|
|
|
8
8
|
import datashare_python
|
|
9
|
-
from datashare_python.cli.local import local_app
|
|
10
9
|
from datashare_python.cli.project import project_app
|
|
11
10
|
from datashare_python.cli.task import task_app
|
|
12
11
|
from datashare_python.cli.utils import AsyncTyper
|
|
@@ -16,7 +15,6 @@ cli_app = AsyncTyper(
|
|
|
16
15
|
context_settings={"help_option_names": ["-h", "--help"]},
|
|
17
16
|
pretty_exceptions_enable=False,
|
|
18
17
|
)
|
|
19
|
-
cli_app.add_typer(local_app)
|
|
20
18
|
cli_app.add_typer(project_app)
|
|
21
19
|
cli_app.add_typer(task_app)
|
|
22
20
|
cli_app.add_typer(worker_app)
|
|
@@ -24,7 +24,6 @@ _LIST_ACTIVITY_NAMES_HELP = "activity names filters (supports regexes)"
|
|
|
24
24
|
_START_WORKER_WORKFLOWS_HELP = "workflow names run by the worker (supports regexes)"
|
|
25
25
|
_START_WORKER_ACTIVITIES_HELP = "activity names run by the worker (supports regexes)"
|
|
26
26
|
_START_WORKER_DEPS_HELP = "worker lifetime dependencies name in the registry"
|
|
27
|
-
_START_WORKER_CONFIG_HELP = "worker config class key the worker configs registry"
|
|
28
27
|
_START_WORKER_WORKER_ID_PREFIX_HELP = "worker ID prefix"
|
|
29
28
|
_START_WORKER_CONFIG_PATH_HELP = (
|
|
30
29
|
"path to a worker config YAML file,"
|
|
@@ -86,9 +85,6 @@ async def start(
|
|
|
86
85
|
dependencies: Annotated[
|
|
87
86
|
str | None, typer.Option(help=_START_WORKER_DEPS_HELP)
|
|
88
87
|
] = None,
|
|
89
|
-
worker_config_name: Annotated[
|
|
90
|
-
str | None, typer.Option(help=_START_WORKER_CONFIG_HELP)
|
|
91
|
-
] = None,
|
|
92
88
|
config_path: Annotated[
|
|
93
89
|
Path | None,
|
|
94
90
|
typer.Option(
|
|
@@ -107,10 +103,7 @@ async def start(
|
|
|
107
103
|
] = None,
|
|
108
104
|
) -> None:
|
|
109
105
|
registered_wfs, registered_acts, registered_deps, worker_config_cls = discover(
|
|
110
|
-
workflows,
|
|
111
|
-
act_names=activities,
|
|
112
|
-
deps_name=dependencies,
|
|
113
|
-
worker_config_name=worker_config_name,
|
|
106
|
+
workflows, act_names=activities, deps_name=dependencies
|
|
114
107
|
)
|
|
115
108
|
if config_path is not None:
|
|
116
109
|
with config_path.open() as f:
|
|
@@ -4,6 +4,7 @@ from collections.abc import AsyncGenerator, Generator, Iterator, Sequence
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
6
|
import aiohttp
|
|
7
|
+
import nest_asyncio
|
|
7
8
|
import pytest
|
|
8
9
|
from elasticsearch._async.helpers import async_streaming_bulk
|
|
9
10
|
from icij_common.es import DOC_ROOT_ID, ES_DOCUMENT_TYPE, ID, ESClient
|
|
@@ -200,6 +201,8 @@ def index_docs_ops(
|
|
|
200
201
|
}
|
|
201
202
|
doc = doc.model_dump(by_alias=True) # noqa: PLW2901
|
|
202
203
|
op.update(doc)
|
|
204
|
+
if "path" in op:
|
|
205
|
+
op["path"] = str(op["path"])
|
|
203
206
|
op["_id"] = doc[ID]
|
|
204
207
|
op["routing"] = doc[DOC_ROOT_ID]
|
|
205
208
|
op["type"] = ES_DOCUMENT_TYPE
|
|
@@ -294,3 +297,8 @@ async def all_done(task_client: DatashareTaskClient, not_done: list[str]) -> boo
|
|
|
294
297
|
return False
|
|
295
298
|
not_done.remove(t_id)
|
|
296
299
|
return True
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
@pytest.fixture # noqa: F405
|
|
303
|
+
def typer_asyncio_patch() -> None:
|
|
304
|
+
nest_asyncio.apply()
|
|
@@ -13,11 +13,11 @@ logger = logging.getLogger(__name__)
|
|
|
13
13
|
Activity = ActivityWithProgress | Callable | type
|
|
14
14
|
|
|
15
15
|
_DEPENDENCIES = "dependencies"
|
|
16
|
-
|
|
16
|
+
_WORKER_CONFIG_CLS = "worker_config_cls"
|
|
17
17
|
_WORKFLOW_GROUP = "datashare.workflows"
|
|
18
18
|
_ACTIVITIES_GROUP = "datashare.activities"
|
|
19
19
|
_DEPENDENCIES_GROUP = "datashare.dependencies"
|
|
20
|
-
|
|
20
|
+
_WORKER_CONFIG_CLS_GROUP = "datashare.worker_config_cls"
|
|
21
21
|
|
|
22
22
|
_RegisteredWorkflow = tuple[str, type]
|
|
23
23
|
_RegisteredActivity = tuple[str, Activity]
|
|
@@ -31,11 +31,7 @@ _Discovery = tuple[
|
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
def discover(
|
|
34
|
-
wf_names: list[str] | None,
|
|
35
|
-
*,
|
|
36
|
-
act_names: list[str] | None,
|
|
37
|
-
deps_name: str | None,
|
|
38
|
-
worker_config_name: str | None,
|
|
34
|
+
wf_names: list[str] | None, *, act_names: list[str] | None, deps_name: str | None
|
|
39
35
|
) -> _Discovery:
|
|
40
36
|
discovered = ""
|
|
41
37
|
wfs = None
|
|
@@ -82,11 +78,8 @@ def discover(
|
|
|
82
78
|
f"- {n_deps} dependenc{'ies' if n_deps > 1 else 'y'}:"
|
|
83
79
|
f" {', '.join(deps_names)}"
|
|
84
80
|
)
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
discovered += f"- worker config class: {worker_config_cls}"
|
|
88
|
-
else:
|
|
89
|
-
worker_config_cls = WorkerConfig
|
|
81
|
+
worker_config_cls = discover_worker_config_cls()
|
|
82
|
+
discovered += f"- worker config class: {worker_config_cls}"
|
|
90
83
|
logger.info("discovered:\n%s", discovered)
|
|
91
84
|
return wfs, acts, deps, worker_config_cls
|
|
92
85
|
|
|
@@ -147,28 +140,15 @@ def discover_dependencies(name: str) -> _Dependencies:
|
|
|
147
140
|
raise LookupError(msg) from e
|
|
148
141
|
|
|
149
142
|
|
|
150
|
-
def
|
|
151
|
-
impls = entry_points(name=
|
|
143
|
+
def discover_worker_config_cls() -> type[WorkerConfig]:
|
|
144
|
+
impls = entry_points(name=_WORKER_CONFIG_CLS, group=_WORKER_CONFIG_CLS_GROUP)
|
|
152
145
|
if not impls:
|
|
153
|
-
|
|
154
|
-
msg = (
|
|
155
|
-
f'failed to find worker config: "{name}", '
|
|
156
|
-
f"available dependencies: {available_impls}"
|
|
157
|
-
)
|
|
158
|
-
raise LookupError(msg)
|
|
146
|
+
return WorkerConfig
|
|
159
147
|
if len(impls) > 1:
|
|
160
|
-
msg = f'found multiple worker configs
|
|
148
|
+
msg = f'found multiple registered worker configs classes": {impls}'
|
|
161
149
|
raise ValueError(msg)
|
|
162
|
-
deps_registry = impls[
|
|
163
|
-
|
|
164
|
-
return deps_registry[name]
|
|
165
|
-
except KeyError as e:
|
|
166
|
-
available = list(deps_registry)
|
|
167
|
-
msg = (
|
|
168
|
-
f'failed to find worker config for name "{name}", available worker '
|
|
169
|
-
f"configs: {available}"
|
|
170
|
-
)
|
|
171
|
-
raise LookupError(msg) from e
|
|
150
|
+
deps_registry = impls[_WORKER_CONFIG_CLS].load()
|
|
151
|
+
return deps_registry
|
|
172
152
|
|
|
173
153
|
|
|
174
154
|
def _parse_wf_name(wf_type: type) -> str:
|
|
@@ -3,6 +3,8 @@ from collections.abc import Awaitable, Callable
|
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from datetime import UTC, datetime
|
|
5
5
|
from enum import StrEnum, unique
|
|
6
|
+
from io import BytesIO
|
|
7
|
+
from pathlib import Path
|
|
6
8
|
from typing import Any, Literal, Self, TypeVar
|
|
7
9
|
|
|
8
10
|
from temporalio import workflow
|
|
@@ -142,6 +144,7 @@ class Document(DatashareModel):
|
|
|
142
144
|
language: str
|
|
143
145
|
content: str | None = None
|
|
144
146
|
content_type: str | None = None
|
|
147
|
+
path: Path | None = None
|
|
145
148
|
tags: list[str] = Field(default_factory=list)
|
|
146
149
|
content_translated: dict[str, str] = Field(
|
|
147
150
|
default_factory=dict, alias="content_translated"
|
|
@@ -159,3 +162,12 @@ class Document(DatashareModel):
|
|
|
159
162
|
root_document=sources[DOC_ROOT_ID],
|
|
160
163
|
tags=sources.get("tags", []),
|
|
161
164
|
)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@dataclass(frozen=True)
|
|
168
|
+
class DocArtifact:
|
|
169
|
+
project: str
|
|
170
|
+
doc_id: str
|
|
171
|
+
artifact: bytes | BytesIO
|
|
172
|
+
filename: str
|
|
173
|
+
metadata_key: str
|
|
@@ -13,6 +13,7 @@ from datetime import timedelta
|
|
|
13
13
|
from functools import partial, wraps
|
|
14
14
|
from hashlib import sha256
|
|
15
15
|
from inspect import signature
|
|
16
|
+
from io import BytesIO
|
|
16
17
|
from pathlib import Path
|
|
17
18
|
from typing import Any, ParamSpec, TypeVar
|
|
18
19
|
from uuid import uuid4
|
|
@@ -34,6 +35,7 @@ from temporalio.common import RetryPolicy, SearchAttributeKey
|
|
|
34
35
|
from temporalio.exceptions import ApplicationError
|
|
35
36
|
|
|
36
37
|
from .constants import METADATA_JSON
|
|
38
|
+
from .objects import DocArtifact
|
|
37
39
|
from .types_ import ProgressRateHandler, RawProgressHandler
|
|
38
40
|
|
|
39
41
|
DependencyLabel = str | None
|
|
@@ -431,49 +433,43 @@ class LogWithWorkerIDMixin:
|
|
|
431
433
|
return handlers
|
|
432
434
|
|
|
433
435
|
|
|
434
|
-
def safe_dir(
|
|
435
|
-
|
|
436
|
-
|
|
436
|
+
def safe_dir(doc_id: str) -> Path:
|
|
437
|
+
if len(doc_id) < 4:
|
|
438
|
+
raise ValueError(f"expected doc_id to be at least 4, found {doc_id}")
|
|
439
|
+
parts = (p for p in (doc_id[:2], doc_id[2:4]) if p)
|
|
437
440
|
return Path(*parts)
|
|
438
441
|
|
|
439
442
|
|
|
440
|
-
def
|
|
441
|
-
return Path(project, safe_dir(
|
|
443
|
+
def _artifacts_dir(doc_id: str, *, project: str) -> Path:
|
|
444
|
+
return Path(project, safe_dir(doc_id), doc_id)
|
|
442
445
|
|
|
443
446
|
|
|
444
|
-
def
|
|
445
|
-
metadata_path =
|
|
447
|
+
def _metadata_path(doc_id: str, *, project: str) -> Path:
|
|
448
|
+
metadata_path = _artifacts_dir(doc_id, project=project) / METADATA_JSON
|
|
446
449
|
return metadata_path
|
|
447
450
|
|
|
448
451
|
|
|
449
|
-
def _read_artifact_metadata(root: Path,
|
|
450
|
-
m_path = root /
|
|
452
|
+
def _read_artifact_metadata(root: Path, artifact: DocArtifact) -> dict:
|
|
453
|
+
m_path = root / _metadata_path(artifact.filename, project=artifact.project)
|
|
451
454
|
return json.loads(m_path.read_text())
|
|
452
455
|
|
|
453
456
|
|
|
454
|
-
def write_artifact(
|
|
455
|
-
artifact
|
|
456
|
-
root: Path,
|
|
457
|
-
*,
|
|
458
|
-
project: str,
|
|
459
|
-
filename: str,
|
|
460
|
-
metadata_key: str,
|
|
461
|
-
metadata_value: str,
|
|
462
|
-
) -> Path:
|
|
463
|
-
artif_dir = root / artifacts_dir(project, filename=filename)
|
|
457
|
+
def write_artifact(root: Path, artifact: DocArtifact) -> Path:
|
|
458
|
+
artif_dir = root / _artifacts_dir(artifact.doc_id, project=artifact.project)
|
|
464
459
|
artif_dir.mkdir(exist_ok=True, parents=True)
|
|
465
460
|
# TODO: if transcriptions are too large we could also serialize them
|
|
466
461
|
# as jsonl
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
462
|
+
artifact_path: Path = artif_dir / artifact.filename
|
|
463
|
+
if isinstance(artifact.artifact, bytes):
|
|
464
|
+
artifact_path.write_bytes(artifact.artifact)
|
|
465
|
+
elif isinstance(artifact_path, BytesIO):
|
|
466
|
+
with artifact_path.open("wb") as f:
|
|
467
|
+
f.write(artifact.artifact.read())
|
|
468
|
+
meta_path = root / _metadata_path(artifact.doc_id, project=artifact.project)
|
|
469
|
+
meta = _read_artifact_metadata(root, artifact) if meta_path.exists() else dict()
|
|
470
|
+
meta[artifact.metadata_key] = artifact.filename
|
|
475
471
|
meta_path.write_text(json.dumps(meta))
|
|
476
|
-
return
|
|
472
|
+
return artifact_path.relative_to(artif_dir)
|
|
477
473
|
|
|
478
474
|
|
|
479
475
|
def debuggable_name(
|
|
Binary file
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from typing import Annotated
|
|
2
|
-
|
|
3
|
-
import typer
|
|
4
|
-
|
|
5
|
-
from datashare_python.cli.utils import AsyncTyper
|
|
6
|
-
from datashare_python.constants import DEFAULT_NAMESPACE, DEFAULT_TEMPORAL_ADDRESS
|
|
7
|
-
from datashare_python.local_client import LocalClient
|
|
8
|
-
|
|
9
|
-
_REGISTER_NAMESPACE_HELP = "register namespace"
|
|
10
|
-
_TEMPORAL_URL_HELP = "address for temporal server"
|
|
11
|
-
_NAMESPACE_HELP = "namespace name"
|
|
12
|
-
_LOCAL = "local"
|
|
13
|
-
|
|
14
|
-
local_app = AsyncTyper(name=_LOCAL)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@local_app.async_command(help=_REGISTER_NAMESPACE_HELP)
|
|
18
|
-
async def register_namespace(
|
|
19
|
-
namespace: Annotated[
|
|
20
|
-
str, typer.Option("--namespace", "-n", help=_NAMESPACE_HELP)
|
|
21
|
-
] = DEFAULT_NAMESPACE,
|
|
22
|
-
temporal_address: Annotated[
|
|
23
|
-
str, typer.Option("--temporal-address", "-a", help=_TEMPORAL_URL_HELP)
|
|
24
|
-
] = DEFAULT_TEMPORAL_ADDRESS,
|
|
25
|
-
) -> None:
|
|
26
|
-
"""Create namespace
|
|
27
|
-
|
|
28
|
-
:param namespace: namespace
|
|
29
|
-
:param temporal_address: target host
|
|
30
|
-
"""
|
|
31
|
-
client = LocalClient()
|
|
32
|
-
|
|
33
|
-
await client.register_namespace(temporal_address, namespace)
|
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import importlib
|
|
3
|
-
import logging
|
|
4
|
-
from functools import partial
|
|
5
|
-
|
|
6
|
-
from google.protobuf.duration_pb2 import Duration
|
|
7
|
-
from temporalio.api.workflowservice.v1 import (
|
|
8
|
-
ListNamespacesRequest,
|
|
9
|
-
RegisterNamespaceRequest,
|
|
10
|
-
)
|
|
11
|
-
from temporalio.service import ConnectConfig, ServiceClient
|
|
12
|
-
|
|
13
|
-
LOGGER = logging.getLogger(__name__)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class LocalClient:
|
|
17
|
-
@staticmethod
|
|
18
|
-
async def start_workers(
|
|
19
|
-
temporal_address: str, worker_paths: list[str], *args, **kwargs
|
|
20
|
-
) -> None:
|
|
21
|
-
"""Start worker modules, defined as a list of worker paths
|
|
22
|
-
|
|
23
|
-
:param temporal_address: temporal address string (host:port)
|
|
24
|
-
:param worker_paths: list of worker modules
|
|
25
|
-
"""
|
|
26
|
-
workers = []
|
|
27
|
-
|
|
28
|
-
for worker_path in worker_paths:
|
|
29
|
-
try:
|
|
30
|
-
module_parts = worker_path.split(".")
|
|
31
|
-
module_path = ".".join(module_parts[:-1])
|
|
32
|
-
worker_method = module_parts[-1]
|
|
33
|
-
module = importlib.import_module(module_path)
|
|
34
|
-
worker = getattr(module, worker_method)
|
|
35
|
-
worker_partial = partial(worker, *args, **kwargs)
|
|
36
|
-
|
|
37
|
-
workers.append(worker_partial(target_host=temporal_address))
|
|
38
|
-
|
|
39
|
-
LOGGER.info("'%s' imported successfully.", worker_path)
|
|
40
|
-
except (ModuleNotFoundError, AttributeError):
|
|
41
|
-
LOGGER.error("'%s' not found in path. Skipping.", worker_path)
|
|
42
|
-
continue
|
|
43
|
-
|
|
44
|
-
await asyncio.gather(*workers, return_exceptions=True)
|
|
45
|
-
|
|
46
|
-
@staticmethod
|
|
47
|
-
async def register_namespace(temporal_address: str, namespace: str) -> None:
|
|
48
|
-
"""Register a temporal namespace
|
|
49
|
-
|
|
50
|
-
:param temporal_address: temporal address string
|
|
51
|
-
:param namespace: namespace string
|
|
52
|
-
"""
|
|
53
|
-
client = await ServiceClient.connect(
|
|
54
|
-
ConnectConfig(target_host=temporal_address)
|
|
55
|
-
)
|
|
56
|
-
list_resp = await client.workflow_service.list_namespaces(
|
|
57
|
-
ListNamespacesRequest()
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
if namespace in [ns.namespace_info.name for ns in list_resp.namespaces]:
|
|
61
|
-
return
|
|
62
|
-
|
|
63
|
-
await client.workflow_service.register_namespace(
|
|
64
|
-
RegisterNamespaceRequest(
|
|
65
|
-
namespace=namespace,
|
|
66
|
-
# retain for thirty days
|
|
67
|
-
workflow_execution_retention_period=Duration(seconds=30 * 24 * 60 * 60),
|
|
68
|
-
)
|
|
69
|
-
)
|
|
Binary file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|