datashare-python 0.2.23__py3-none-any.whl → 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datashare_python/cli/worker.py +66 -43
- datashare_python/config.py +30 -2
- datashare_python/conftest.py +11 -0
- datashare_python/constants.py +2 -0
- datashare_python/dependencies.py +27 -7
- datashare_python/discovery.py +103 -4
- datashare_python/objects.py +17 -3
- datashare_python/template.py +11 -4
- datashare_python/utils.py +152 -25
- datashare_python/worker-template.tar.gz +0 -0
- datashare_python/worker.py +106 -5
- {datashare_python-0.2.23.dist-info → datashare_python-0.2.25.dist-info}/METADATA +4 -4
- datashare_python-0.2.25.dist-info/RECORD +27 -0
- datashare_python-0.2.23.dist-info/RECORD +0 -27
- {datashare_python-0.2.23.dist-info → datashare_python-0.2.25.dist-info}/WHEEL +0 -0
- {datashare_python-0.2.23.dist-info → datashare_python-0.2.25.dist-info}/entry_points.txt +0 -0
datashare_python/cli/worker.py
CHANGED
|
@@ -1,12 +1,15 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import logging
|
|
3
|
+
from pathlib import Path
|
|
2
4
|
from typing import Annotated
|
|
3
5
|
|
|
4
6
|
import typer
|
|
7
|
+
import yaml
|
|
8
|
+
from icij_common.pydantic_utils import safe_copy
|
|
5
9
|
|
|
6
|
-
from datashare_python.
|
|
7
|
-
from datashare_python.discovery import discover_activities, discover_workflows
|
|
8
|
-
from datashare_python.
|
|
9
|
-
from datashare_python.worker import datashare_worker
|
|
10
|
+
from datashare_python.config import TemporalClientConfig, WorkerConfig
|
|
11
|
+
from datashare_python.discovery import discover, discover_activities, discover_workflows
|
|
12
|
+
from datashare_python.worker import bootstrap_worker, create_worker_id
|
|
10
13
|
|
|
11
14
|
from .utils import AsyncTyper
|
|
12
15
|
|
|
@@ -20,11 +23,13 @@ _LIST_ACTIVITY_NAMES_HELP = "activity names filters (supports regexes)"
|
|
|
20
23
|
|
|
21
24
|
_START_WORKER_WORKFLOWS_HELP = "workflow names run by the worker (supports regexes)"
|
|
22
25
|
_START_WORKER_ACTIVITIES_HELP = "activity names run by the worker (supports regexes)"
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
"
|
|
26
|
+
_START_WORKER_DEPS_HELP = "worker lifetime dependencies name in the registry"
|
|
27
|
+
_START_WORKER_WORKER_ID_PREFIX_HELP = "worker ID prefix"
|
|
28
|
+
_START_WORKER_CONFIG_PATH_HELP = (
|
|
29
|
+
"path to a worker config YAML file,"
|
|
30
|
+
" if not provided will load worker configuration from env variables"
|
|
27
31
|
)
|
|
32
|
+
_WORKER_QUEUE_HELP = "worker task queue"
|
|
28
33
|
_TEMPORAL_NAMESPACE_HELP = "worker temporal namespace"
|
|
29
34
|
|
|
30
35
|
_TEMPORAL_URL_HELP = "address for temporal server"
|
|
@@ -73,42 +78,60 @@ async def start(
|
|
|
73
78
|
workflows: Annotated[list[str], typer.Option(help=_START_WORKER_WORKFLOWS_HELP)],
|
|
74
79
|
activities: Annotated[list[str], typer.Option(help=_START_WORKER_ACTIVITIES_HELP)],
|
|
75
80
|
queue: Annotated[str, typer.Option("--queue", "-q", help=_WORKER_QUEUE_HELP)],
|
|
81
|
+
dependencies: Annotated[
|
|
82
|
+
str | None, typer.Option(help=_START_WORKER_DEPS_HELP)
|
|
83
|
+
] = None,
|
|
84
|
+
config_path: Annotated[
|
|
85
|
+
Path | None,
|
|
86
|
+
typer.Option(
|
|
87
|
+
"--config-path", "--config", "-c", help=_START_WORKER_CONFIG_PATH_HELP
|
|
88
|
+
),
|
|
89
|
+
] = None,
|
|
90
|
+
worker_id_prefix: Annotated[
|
|
91
|
+
str | None, typer.Option(help=_START_WORKER_WORKER_ID_PREFIX_HELP)
|
|
92
|
+
] = None,
|
|
76
93
|
temporal_address: Annotated[
|
|
77
|
-
str, typer.Option("--temporal-address", "-a", help=_TEMPORAL_URL_HELP)
|
|
78
|
-
] =
|
|
94
|
+
str | None, typer.Option("--temporal-address", "-a", help=_TEMPORAL_URL_HELP)
|
|
95
|
+
] = None,
|
|
79
96
|
namespace: Annotated[
|
|
80
|
-
str
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
int, typer.Option("--max-activities", help=_WORKER_MAX_ACTIVITIES_HELP)
|
|
84
|
-
] = 1,
|
|
97
|
+
str | None,
|
|
98
|
+
typer.Option("--temporal-namespace", "-ns", help=_TEMPORAL_NAMESPACE_HELP),
|
|
99
|
+
] = None,
|
|
85
100
|
) -> None:
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
)
|
|
93
|
-
|
|
94
|
-
if
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
client,
|
|
105
|
-
workflows=wfs,
|
|
106
|
-
activities=acts,
|
|
107
|
-
task_queue=queue,
|
|
108
|
-
max_concurrent_activities=max_concurrent_activities,
|
|
101
|
+
if config_path is not None:
|
|
102
|
+
with config_path.open() as f:
|
|
103
|
+
bootstrap_config = WorkerConfig.model_validate(
|
|
104
|
+
yaml.load(f, Loader=yaml.Loader)
|
|
105
|
+
)
|
|
106
|
+
else:
|
|
107
|
+
bootstrap_config = WorkerConfig()
|
|
108
|
+
temporal_override = dict()
|
|
109
|
+
if temporal_address is not None:
|
|
110
|
+
temporal_override["host"] = temporal_address
|
|
111
|
+
if namespace is not None:
|
|
112
|
+
temporal_override["namespace"] = namespace
|
|
113
|
+
if temporal_override:
|
|
114
|
+
temporal_config = TemporalClientConfig(**temporal_override)
|
|
115
|
+
update = {"temporal": temporal_config}
|
|
116
|
+
bootstrap_config = safe_copy(bootstrap_config, update=update)
|
|
117
|
+
registered_wfs, registered_acts, registered_deps = discover(
|
|
118
|
+
workflows, act_names=activities, deps_name=dependencies
|
|
109
119
|
)
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
120
|
+
worker_id = create_worker_id(worker_id_prefix or "worker")
|
|
121
|
+
client = await bootstrap_config.to_temporal_client()
|
|
122
|
+
event_loop = asyncio.get_event_loop()
|
|
123
|
+
async with bootstrap_worker(
|
|
124
|
+
worker_id,
|
|
125
|
+
activities=registered_acts,
|
|
126
|
+
workflows=registered_wfs,
|
|
127
|
+
dependencies=registered_deps,
|
|
128
|
+
bootstrap_config=bootstrap_config,
|
|
129
|
+
client=client,
|
|
130
|
+
event_loop=event_loop,
|
|
131
|
+
task_queue=queue,
|
|
132
|
+
) as worker:
|
|
133
|
+
try:
|
|
134
|
+
await worker.run()
|
|
135
|
+
except Exception as e: # noqa: BLE001
|
|
136
|
+
await worker.shutdown()
|
|
137
|
+
raise e
|
datashare_python/config.py
CHANGED
|
@@ -4,7 +4,13 @@ from icij_common.es import ESClient
|
|
|
4
4
|
from icij_common.pydantic_utils import ICIJSettings
|
|
5
5
|
from pydantic import Field, PrivateAttr
|
|
6
6
|
from pydantic_settings import SettingsConfigDict
|
|
7
|
-
from temporalio.contrib.pydantic import
|
|
7
|
+
from temporalio.contrib.pydantic import PydanticJSONPlainPayloadConverter, ToJsonOptions
|
|
8
|
+
from temporalio.converter import (
|
|
9
|
+
CompositePayloadConverter,
|
|
10
|
+
DataConverter,
|
|
11
|
+
DefaultPayloadConverter,
|
|
12
|
+
JSONPlainPayloadConverter,
|
|
13
|
+
)
|
|
8
14
|
|
|
9
15
|
import datashare_python
|
|
10
16
|
|
|
@@ -64,7 +70,7 @@ class TemporalClientConfig(BaseModel):
|
|
|
64
70
|
self._client = await TemporalClient.connect(
|
|
65
71
|
target_host=self.host,
|
|
66
72
|
namespace=self.namespace,
|
|
67
|
-
data_converter=
|
|
73
|
+
data_converter=PYDANTIC_DATA_CONVERTER,
|
|
68
74
|
)
|
|
69
75
|
return self._client
|
|
70
76
|
|
|
@@ -83,6 +89,8 @@ class WorkerConfig(ICIJSettings, LogWithWorkerIDMixin, BaseModel):
|
|
|
83
89
|
elasticsearch: ESClientConfig = ESClientConfig()
|
|
84
90
|
temporal: TemporalClientConfig = TemporalClientConfig()
|
|
85
91
|
|
|
92
|
+
max_concurrent_io_activities: int = 5
|
|
93
|
+
|
|
86
94
|
def to_es_client(self) -> ESClient:
|
|
87
95
|
return self.elasticsearch.to_es_client(self.datashare.api_key)
|
|
88
96
|
|
|
@@ -91,3 +99,23 @@ class WorkerConfig(ICIJSettings, LogWithWorkerIDMixin, BaseModel):
|
|
|
91
99
|
|
|
92
100
|
async def to_temporal_client(self) -> TemporalClient:
|
|
93
101
|
return await self.temporal.to_client()
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class _PydanticPayloadConverter(CompositePayloadConverter):
|
|
105
|
+
def __init__(self) -> None:
|
|
106
|
+
json_payload_converter = PydanticJSONPlainPayloadConverter(
|
|
107
|
+
ToJsonOptions(exclude_unset=False)
|
|
108
|
+
)
|
|
109
|
+
super().__init__(
|
|
110
|
+
*(
|
|
111
|
+
c
|
|
112
|
+
if not isinstance(c, JSONPlainPayloadConverter)
|
|
113
|
+
else json_payload_converter
|
|
114
|
+
for c in DefaultPayloadConverter.default_encoding_payload_converters
|
|
115
|
+
)
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
PYDANTIC_DATA_CONVERTER = DataConverter(
|
|
120
|
+
payload_converter_class=_PydanticPayloadConverter
|
|
121
|
+
)
|
datashare_python/conftest.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
from asyncio import AbstractEventLoop
|
|
3
3
|
from collections.abc import AsyncGenerator, Generator, Iterator, Sequence
|
|
4
|
+
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
import aiohttp
|
|
6
7
|
import pytest
|
|
@@ -50,6 +51,9 @@ _INDEX_BODY = {
|
|
|
50
51
|
"language": {"type": "keyword"},
|
|
51
52
|
"documentId": {"type": "keyword"},
|
|
52
53
|
"join": {"type": "join", "relations": {"Document": "NamedEntity"}},
|
|
54
|
+
"contentType": {"type": "keyword"},
|
|
55
|
+
"content": {"type": "text"},
|
|
56
|
+
"contentTranslated": {"type": "text"},
|
|
53
57
|
}
|
|
54
58
|
}
|
|
55
59
|
}
|
|
@@ -102,6 +106,13 @@ def test_worker_config() -> WorkerConfig:
|
|
|
102
106
|
)
|
|
103
107
|
|
|
104
108
|
|
|
109
|
+
@pytest.fixture
|
|
110
|
+
def test_worker_config_path(test_worker_config: WorkerConfig, tmpdir: Path) -> Path:
|
|
111
|
+
config_path = Path(tmpdir) / "config.json"
|
|
112
|
+
config_path.write_text(test_worker_config.model_dump_json())
|
|
113
|
+
return config_path
|
|
114
|
+
|
|
115
|
+
|
|
105
116
|
@pytest.fixture(scope="session")
|
|
106
117
|
async def worker_lifetime_deps(
|
|
107
118
|
event_loop: AbstractEventLoop,
|
datashare_python/constants.py
CHANGED
datashare_python/dependencies.py
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
|
+
import inspect
|
|
1
2
|
import logging
|
|
2
3
|
from asyncio import AbstractEventLoop, iscoroutine
|
|
3
|
-
from collections.abc import AsyncGenerator
|
|
4
|
+
from collections.abc import AsyncGenerator, Callable
|
|
4
5
|
from contextlib import AsyncExitStack, asynccontextmanager
|
|
5
6
|
from contextvars import ContextVar
|
|
7
|
+
from copy import deepcopy
|
|
8
|
+
from typing import Any
|
|
6
9
|
|
|
7
10
|
from icij_common.es import ESClient
|
|
8
11
|
|
|
@@ -20,7 +23,7 @@ TASK_CLIENT: ContextVar[DatashareTaskClient] = ContextVar("task_client")
|
|
|
20
23
|
TEMPORAL_CLIENT: ContextVar[TemporalClient] = ContextVar("temporal_client")
|
|
21
24
|
|
|
22
25
|
|
|
23
|
-
def set_event_loop(event_loop: AbstractEventLoop
|
|
26
|
+
def set_event_loop(event_loop: AbstractEventLoop) -> None:
|
|
24
27
|
EVENT_LOOP.set(event_loop)
|
|
25
28
|
|
|
26
29
|
|
|
@@ -31,13 +34,13 @@ def lifespan_event_loop() -> AbstractEventLoop:
|
|
|
31
34
|
raise DependencyInjectionError("event loop") from e
|
|
32
35
|
|
|
33
36
|
|
|
34
|
-
def set_loggers(worker_config: WorkerConfig, worker_id: str
|
|
37
|
+
def set_loggers(worker_config: WorkerConfig, worker_id: str) -> None:
|
|
35
38
|
worker_config.setup_loggers(worker_id=worker_id)
|
|
36
39
|
logger.info("worker loggers ready to log 💬")
|
|
37
40
|
logger.info("app config: %s", worker_config.model_dump_json(indent=2))
|
|
38
41
|
|
|
39
42
|
|
|
40
|
-
async def set_es_client(worker_config: WorkerConfig
|
|
43
|
+
async def set_es_client(worker_config: WorkerConfig) -> ESClient:
|
|
41
44
|
client = worker_config.to_es_client()
|
|
42
45
|
ES_CLIENT.set(client)
|
|
43
46
|
return client
|
|
@@ -52,7 +55,7 @@ def lifespan_es_client() -> ESClient:
|
|
|
52
55
|
|
|
53
56
|
|
|
54
57
|
# Task client setup
|
|
55
|
-
async def set_task_client(worker_config: WorkerConfig
|
|
58
|
+
async def set_task_client(worker_config: WorkerConfig) -> DatashareTaskClient:
|
|
56
59
|
task_client = worker_config.to_task_client()
|
|
57
60
|
TASK_CLIENT.set(task_client)
|
|
58
61
|
return task_client
|
|
@@ -67,7 +70,7 @@ def lifespan_task_client() -> DatashareTaskClient:
|
|
|
67
70
|
|
|
68
71
|
|
|
69
72
|
# Temporal client setup
|
|
70
|
-
async def set_temporal_client(worker_config: WorkerConfig
|
|
73
|
+
async def set_temporal_client(worker_config: WorkerConfig) -> None:
|
|
71
74
|
client = await worker_config.to_temporal_client()
|
|
72
75
|
TEMPORAL_CLIENT.set(client)
|
|
73
76
|
|
|
@@ -86,7 +89,7 @@ async def with_dependencies(
|
|
|
86
89
|
) -> AsyncGenerator[None, None]:
|
|
87
90
|
async with AsyncExitStack() as stack:
|
|
88
91
|
for dep in dependencies:
|
|
89
|
-
cm = dep(**kwargs)
|
|
92
|
+
cm = dep(**add_missing_args(dep, dict(), **kwargs))
|
|
90
93
|
if hasattr(cm, "__aenter__"):
|
|
91
94
|
await stack.enter_async_context(cm)
|
|
92
95
|
elif hasattr(cm, "__enter__"):
|
|
@@ -94,3 +97,20 @@ async def with_dependencies(
|
|
|
94
97
|
elif iscoroutine(cm):
|
|
95
98
|
await cm
|
|
96
99
|
yield
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def add_missing_args(fn: Callable, args: dict[str, Any], **kwargs) -> dict[str, Any]:
|
|
103
|
+
# We make the choice not to raise in case of missing argument here, the error will
|
|
104
|
+
# be correctly raise when the function is called
|
|
105
|
+
from_kwargs = dict()
|
|
106
|
+
sig = inspect.signature(fn)
|
|
107
|
+
for param_name in sig.parameters:
|
|
108
|
+
if param_name in args:
|
|
109
|
+
continue
|
|
110
|
+
kwargs_value = kwargs.get(param_name)
|
|
111
|
+
if kwargs_value is not None:
|
|
112
|
+
from_kwargs[param_name] = kwargs_value
|
|
113
|
+
if from_kwargs:
|
|
114
|
+
args = deepcopy(args)
|
|
115
|
+
args.update(from_kwargs)
|
|
116
|
+
return args
|
datashare_python/discovery.py
CHANGED
|
@@ -1,18 +1,77 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
import re
|
|
2
3
|
from collections.abc import Callable, Iterable
|
|
3
4
|
from importlib.metadata import entry_points
|
|
4
5
|
|
|
6
|
+
from .types_ import ContextManagerFactory
|
|
5
7
|
from .utils import ActivityWithProgress
|
|
6
8
|
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
7
11
|
Activity = ActivityWithProgress | Callable | type
|
|
8
12
|
|
|
13
|
+
_DEPENDENCIES = "dependencies"
|
|
9
14
|
_WORKFLOW_GROUPS = "datashare.workflows"
|
|
10
15
|
_ACTIVITIES_GROUPS = "datashare.activities"
|
|
16
|
+
_DEPENDENCIES_GROUPS = "datashare.dependencies"
|
|
17
|
+
|
|
18
|
+
_RegisteredWorkflow = tuple[str, type]
|
|
19
|
+
_RegisteredActivity = tuple[str, Activity]
|
|
20
|
+
_Dependencies = list[ContextManagerFactory]
|
|
21
|
+
_Discovery = tuple[
|
|
22
|
+
Iterable[_RegisteredWorkflow] | None,
|
|
23
|
+
Iterable[_RegisteredActivity] | None,
|
|
24
|
+
_Dependencies | None,
|
|
25
|
+
]
|
|
11
26
|
|
|
12
27
|
|
|
13
|
-
def
|
|
28
|
+
def discover(
|
|
29
|
+
wf_names: list[str] | None, *, act_names: list[str] | None, deps_name: str | None
|
|
30
|
+
) -> _Discovery:
|
|
31
|
+
discovered = ""
|
|
32
|
+
wfs = None
|
|
33
|
+
if wf_names is not None:
|
|
34
|
+
discovered_wfs = discover_workflows(wf_names)
|
|
35
|
+
if discovered_wfs:
|
|
36
|
+
wf_names, wfs = zip(*discovered_wfs, strict=True)
|
|
37
|
+
if wf_names:
|
|
38
|
+
n_wfs = len(wf_names)
|
|
39
|
+
discovered += (
|
|
40
|
+
f"- {n_wfs} workflow{'s' if n_wfs > 1 else ''}:"
|
|
41
|
+
f" {', '.join(wf_names)}"
|
|
42
|
+
)
|
|
43
|
+
acts = None
|
|
44
|
+
if act_names is not None:
|
|
45
|
+
discovered_acts = discover_activities(act_names)
|
|
46
|
+
if discovered_acts:
|
|
47
|
+
act_names, acts = zip(*discovered_acts, strict=True)
|
|
48
|
+
if act_names:
|
|
49
|
+
if discovered:
|
|
50
|
+
discovered += "\n"
|
|
51
|
+
n_acts = len(act_names)
|
|
52
|
+
discovered += (
|
|
53
|
+
f"- {n_acts} activit{'ies' if n_acts > 1 else 'y'}:"
|
|
54
|
+
f" {', '.join(act_names)}"
|
|
55
|
+
)
|
|
56
|
+
if not acts and not wfs:
|
|
57
|
+
raise ValueError("Couldn't find any registered activity or workflow.")
|
|
58
|
+
deps = discover_dependencies(deps_name)
|
|
59
|
+
if deps:
|
|
60
|
+
n_deps = len(deps)
|
|
61
|
+
discovered += "\n"
|
|
62
|
+
deps_names = (d.__name__ for d in deps)
|
|
63
|
+
discovered += (
|
|
64
|
+
f"- {n_deps} dependenc{'ies' if n_deps > 1 else 'y'}:"
|
|
65
|
+
f" {', '.join(deps_names)}"
|
|
66
|
+
)
|
|
67
|
+
logger.info("discovered:\n%s", discovered)
|
|
68
|
+
return wfs, acts, deps
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def discover_workflows(names: list[str]) -> list[_RegisteredWorkflow]:
|
|
14
72
|
pattern = None if not names else re.compile(rf"^{'|'.join(names)}$")
|
|
15
73
|
impls = entry_points(group=_WORKFLOW_GROUPS)
|
|
74
|
+
registered = []
|
|
16
75
|
for wf_impls in impls:
|
|
17
76
|
wf_impls = wf_impls.load() # noqa: PLW2901
|
|
18
77
|
if not isinstance(wf_impls, list | tuple | set):
|
|
@@ -21,12 +80,14 @@ def discover_workflows(names: list[str]) -> Iterable[tuple[str, type]]:
|
|
|
21
80
|
wf_name = _parse_wf_name(wf_impl)
|
|
22
81
|
if pattern and not pattern.match(wf_name):
|
|
23
82
|
continue
|
|
24
|
-
|
|
83
|
+
registered.append((wf_name, wf_impl))
|
|
84
|
+
return registered
|
|
25
85
|
|
|
26
86
|
|
|
27
|
-
def discover_activities(names: list[str]) ->
|
|
87
|
+
def discover_activities(names: list[str]) -> list[_RegisteredActivity]:
|
|
28
88
|
pattern = None if not names else re.compile(rf"^{'|'.join(names)}$")
|
|
29
89
|
impls = entry_points(group=_ACTIVITIES_GROUPS)
|
|
90
|
+
registered = []
|
|
30
91
|
for act_impls in impls:
|
|
31
92
|
act_impls = act_impls.load() # noqa: PLW2901
|
|
32
93
|
if not isinstance(act_impls, list | tuple | set):
|
|
@@ -35,7 +96,45 @@ def discover_activities(names: list[str]) -> Iterable[tuple[str, Activity]]:
|
|
|
35
96
|
act_name = _parse_activity_name(act_impl)
|
|
36
97
|
if pattern and not pattern.match(act_name):
|
|
37
98
|
continue
|
|
38
|
-
|
|
99
|
+
registered.append((act_name, act_impl))
|
|
100
|
+
return registered
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def discover_dependencies(name: str | None) -> _Dependencies | None:
|
|
104
|
+
impls = entry_points(name=_DEPENDENCIES, group=_DEPENDENCIES_GROUPS)
|
|
105
|
+
if not impls:
|
|
106
|
+
if name is None:
|
|
107
|
+
return None
|
|
108
|
+
available_impls = entry_points(group=_DEPENDENCIES_GROUPS)
|
|
109
|
+
msg = (
|
|
110
|
+
f'failed to find dependency: "{name}", '
|
|
111
|
+
f"available dependencies: {available_impls}"
|
|
112
|
+
)
|
|
113
|
+
raise LookupError(msg)
|
|
114
|
+
if len(impls) > 1:
|
|
115
|
+
msg = f'found multiple dependencies for name "{name}": {impls}'
|
|
116
|
+
raise ValueError(msg)
|
|
117
|
+
deps_registry = impls[_DEPENDENCIES].load()
|
|
118
|
+
if name:
|
|
119
|
+
try:
|
|
120
|
+
return deps_registry[name]
|
|
121
|
+
except KeyError as e:
|
|
122
|
+
available = list(deps_registry)
|
|
123
|
+
msg = (
|
|
124
|
+
f'failed to find dependency for name "{name}", available dependencies: '
|
|
125
|
+
f"{available}"
|
|
126
|
+
)
|
|
127
|
+
raise LookupError(msg) from e
|
|
128
|
+
if not deps_registry:
|
|
129
|
+
raise ValueError("empty dependency registry !")
|
|
130
|
+
if len(deps_registry) > 1:
|
|
131
|
+
available = ", ".join('"' + d + '"' for d in deps_registry)
|
|
132
|
+
msg = (
|
|
133
|
+
f"dependency registry contains multiples entries {available},"
|
|
134
|
+
f" please select one by providing a name"
|
|
135
|
+
)
|
|
136
|
+
raise ValueError(msg)
|
|
137
|
+
return next(iter(deps_registry.values()))
|
|
39
138
|
|
|
40
139
|
|
|
41
140
|
def _parse_wf_name(wf_type: type) -> str:
|
datashare_python/objects.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from collections.abc import Callable
|
|
2
|
+
from collections.abc import Awaitable, Callable
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from datetime import UTC, datetime
|
|
5
5
|
from enum import StrEnum, unique
|
|
6
|
-
from typing import Any, Literal, Self
|
|
6
|
+
from typing import Any, Literal, Self, TypeVar
|
|
7
7
|
|
|
8
8
|
from temporalio import workflow
|
|
9
9
|
|
|
@@ -23,14 +23,26 @@ from pydantic.main import IncEx
|
|
|
23
23
|
logger = logging.getLogger(__name__)
|
|
24
24
|
|
|
25
25
|
|
|
26
|
+
T = TypeVar("T")
|
|
27
|
+
Predicate = Callable[[T], bool] | Callable[[T], Awaitable[bool]]
|
|
28
|
+
|
|
29
|
+
|
|
26
30
|
class BaseModel(_BaseModel):
|
|
27
31
|
model_config = merge_configs(icij_config(), no_enum_values_config())
|
|
28
32
|
|
|
29
33
|
|
|
34
|
+
class BasePayload(_BaseModel):
|
|
35
|
+
model_config = icij_config()
|
|
36
|
+
|
|
37
|
+
|
|
30
38
|
class DatashareModel(BaseModel):
|
|
31
39
|
model_config = merge_configs(BaseModel.model_config, lowercamel_case_config())
|
|
32
40
|
|
|
33
41
|
|
|
42
|
+
class LowerCamelCaseModel(_BaseModel):
|
|
43
|
+
model_config = merge_configs(icij_config(), lowercamel_case_config())
|
|
44
|
+
|
|
45
|
+
|
|
34
46
|
@unique
|
|
35
47
|
class TaskState(StrEnum):
|
|
36
48
|
CREATED = "CREATED"
|
|
@@ -135,12 +147,14 @@ class TaskGroup:
|
|
|
135
147
|
class Document(DatashareModel):
|
|
136
148
|
id: str
|
|
137
149
|
root_document: str
|
|
138
|
-
content: str
|
|
139
150
|
language: str
|
|
151
|
+
content: str | None = None
|
|
152
|
+
content_type: str | None = None
|
|
140
153
|
tags: list[str] = Field(default_factory=list)
|
|
141
154
|
content_translated: dict[str, str] = Field(
|
|
142
155
|
default_factory=dict, alias="content_translated"
|
|
143
156
|
)
|
|
157
|
+
type: str = Field(default="Document", frozen=True)
|
|
144
158
|
|
|
145
159
|
@classmethod
|
|
146
160
|
def from_es(cls, es_doc: dict) -> Self:
|
datashare_python/template.py
CHANGED
|
@@ -71,12 +71,12 @@ def _update_pyproject_toml(
|
|
|
71
71
|
) -> dict[str, Any]:
|
|
72
72
|
pyproject_toml = deepcopy(pyproject_toml)
|
|
73
73
|
|
|
74
|
-
pyproject_toml["tool"]["uv"].pop("sources")
|
|
75
|
-
pyproject_toml["tool"]["uv"].pop("index")
|
|
74
|
+
pyproject_toml["tool"]["uv"].pop("sources", None)
|
|
75
|
+
pyproject_toml["tool"]["uv"].pop("index", None)
|
|
76
76
|
|
|
77
77
|
project = pyproject_toml["project"]
|
|
78
78
|
project["authors"] = []
|
|
79
|
-
project.pop("urls")
|
|
79
|
+
project.pop("urls", None)
|
|
80
80
|
project["dependencies"] = sorted(
|
|
81
81
|
d
|
|
82
82
|
for d in project["dependencies"]
|
|
@@ -87,7 +87,7 @@ def _update_pyproject_toml(
|
|
|
87
87
|
for d in project["dependencies"]
|
|
88
88
|
if any(d.startswith(base) for base in _BASE_DEPS)
|
|
89
89
|
)
|
|
90
|
-
project.pop("optional-dependencies")
|
|
90
|
+
project.pop("optional-dependencies", None)
|
|
91
91
|
|
|
92
92
|
entry_points = project["entry-points"]
|
|
93
93
|
|
|
@@ -105,4 +105,11 @@ def _update_pyproject_toml(
|
|
|
105
105
|
i if i != "worker_template" else package_name for i in hatch_sdist["packages"]
|
|
106
106
|
]
|
|
107
107
|
|
|
108
|
+
hatch_sdist = pyproject_toml["tool"]["hatch"]["build"]["targets"]["sdist"]
|
|
109
|
+
if "only-include" in hatch_sdist:
|
|
110
|
+
hatch_sdist["only-include"] = [
|
|
111
|
+
i if i != "worker_template" else package_name
|
|
112
|
+
for i in hatch_sdist["only-include"]
|
|
113
|
+
]
|
|
114
|
+
|
|
108
115
|
return pyproject_toml
|
datashare_python/utils.py
CHANGED
|
@@ -1,14 +1,24 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import inspect
|
|
3
|
+
import json
|
|
3
4
|
import logging
|
|
4
5
|
import sys
|
|
5
|
-
from collections.abc import
|
|
6
|
+
from collections.abc import (
|
|
7
|
+
Callable,
|
|
8
|
+
Coroutine,
|
|
9
|
+
)
|
|
10
|
+
from copy import deepcopy
|
|
6
11
|
from dataclasses import dataclass
|
|
12
|
+
from datetime import timedelta
|
|
7
13
|
from functools import partial, wraps
|
|
14
|
+
from hashlib import sha256
|
|
8
15
|
from inspect import signature
|
|
9
|
-
from
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, ParamSpec, TypeVar
|
|
18
|
+
from uuid import uuid4
|
|
10
19
|
|
|
11
20
|
import nest_asyncio
|
|
21
|
+
import temporalio
|
|
12
22
|
from icij_common.logging_utils import (
|
|
13
23
|
DATE_FMT,
|
|
14
24
|
STREAM_HANDLER_FMT,
|
|
@@ -20,9 +30,10 @@ from pydantic.fields import FieldInfo
|
|
|
20
30
|
from pythonjsonlogger.json import JsonFormatter
|
|
21
31
|
from temporalio import activity, workflow
|
|
22
32
|
from temporalio.client import Client, WorkflowHandle
|
|
23
|
-
from temporalio.common import SearchAttributeKey
|
|
33
|
+
from temporalio.common import RetryPolicy, SearchAttributeKey
|
|
24
34
|
from temporalio.exceptions import ApplicationError
|
|
25
35
|
|
|
36
|
+
from .constants import METADATA_JSON
|
|
26
37
|
from .types_ import ProgressRateHandler, RawProgressHandler
|
|
27
38
|
|
|
28
39
|
DependencyLabel = str | None
|
|
@@ -34,6 +45,13 @@ PROGRESS_HANDLER_ARG = "progress"
|
|
|
34
45
|
P = ParamSpec("P")
|
|
35
46
|
T = TypeVar("T")
|
|
36
47
|
|
|
48
|
+
_NEVER_RETRIABLES = {
|
|
49
|
+
"ValidationError",
|
|
50
|
+
"TypeError",
|
|
51
|
+
"PydanticSchemaGenerationError",
|
|
52
|
+
"PydanticSerializationError",
|
|
53
|
+
}
|
|
54
|
+
|
|
37
55
|
|
|
38
56
|
@dataclass(frozen=True)
|
|
39
57
|
class Progress:
|
|
@@ -86,6 +104,38 @@ class WorkflowWithProgress:
|
|
|
86
104
|
workflow.upsert_search_attributes(attributes)
|
|
87
105
|
|
|
88
106
|
|
|
107
|
+
def _retry_policy_with_default(retry_policy: RetryPolicy | None) -> RetryPolicy:
|
|
108
|
+
if retry_policy is None:
|
|
109
|
+
retry_policy = RetryPolicy(non_retryable_error_types=[])
|
|
110
|
+
retry_policy = deepcopy(retry_policy)
|
|
111
|
+
non_retryable_error_types = set(retry_policy.non_retryable_error_types)
|
|
112
|
+
non_retryable_error_types.update(_NEVER_RETRIABLES)
|
|
113
|
+
retry_policy.non_retryable_error_types = list(non_retryable_error_types)
|
|
114
|
+
return retry_policy
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
async def execute_activity(
|
|
118
|
+
activity: Callable,
|
|
119
|
+
task_queue: str,
|
|
120
|
+
arg: Any = temporalio.common._arg_unset,
|
|
121
|
+
*,
|
|
122
|
+
args: list | None = None,
|
|
123
|
+
start_to_close_timeout: timedelta | None = None,
|
|
124
|
+
retry_policy: temporalio.common.RetryPolicy | None = None,
|
|
125
|
+
) -> Any:
|
|
126
|
+
if args is None:
|
|
127
|
+
args = []
|
|
128
|
+
retry_policy = _retry_policy_with_default(retry_policy)
|
|
129
|
+
return await workflow.execute_activity(
|
|
130
|
+
activity,
|
|
131
|
+
arg=arg,
|
|
132
|
+
args=args,
|
|
133
|
+
start_to_close_timeout=start_to_close_timeout,
|
|
134
|
+
task_queue=task_queue,
|
|
135
|
+
retry_policy=retry_policy,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
|
|
89
139
|
async def progress_handler(
|
|
90
140
|
progress: float,
|
|
91
141
|
handle: WorkflowHandle,
|
|
@@ -147,10 +197,7 @@ def with_progress(weight: float = 1.0) -> Callable[P, T]:
|
|
|
147
197
|
client=self._temporal_client, weight=weight
|
|
148
198
|
)
|
|
149
199
|
await handler(0.0)
|
|
150
|
-
|
|
151
|
-
res = await activity_fn(self, *args, progress=handler)
|
|
152
|
-
else:
|
|
153
|
-
res = await activity_fn(self, *args)
|
|
200
|
+
res = await activity_fn(self, *args, progress=handler)
|
|
154
201
|
await handler(1.0)
|
|
155
202
|
return res
|
|
156
203
|
|
|
@@ -170,12 +217,9 @@ def with_progress(weight: float = 1.0) -> Callable[P, T]:
|
|
|
170
217
|
client=self._temporal_client, weight=weight
|
|
171
218
|
)
|
|
172
219
|
event_loop = self._event_loop
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
else:
|
|
177
|
-
res = activity_fn(self, *args)
|
|
178
|
-
event_loop.run_until_complete(handler(1.0))
|
|
220
|
+
asyncio.run_coroutine_threadsafe(handler(0.0), event_loop).result()
|
|
221
|
+
res = activity_fn(self, *args, progress=handler)
|
|
222
|
+
asyncio.run_coroutine_threadsafe(handler(1.0), event_loop).result()
|
|
179
223
|
return res
|
|
180
224
|
|
|
181
225
|
return wrapper
|
|
@@ -197,6 +241,7 @@ def positional_args_only(activity_fn: Callable[P, T]) -> Callable[P, T]:
|
|
|
197
241
|
# recreate kwargs from pargs
|
|
198
242
|
new_args, new_kwargs = _unpack_positional_args(args, keyword_only, params)
|
|
199
243
|
return await activity_fn(*new_args, **new_kwargs, **kwargs)
|
|
244
|
+
|
|
200
245
|
else:
|
|
201
246
|
|
|
202
247
|
@wraps(activity_fn)
|
|
@@ -234,8 +279,28 @@ def with_retriables(
|
|
|
234
279
|
retriables: set[type[Exception]] = None,
|
|
235
280
|
) -> Callable[[Callable[P, T]], Callable[P, T]]:
|
|
236
281
|
if retriables is None:
|
|
237
|
-
|
|
238
|
-
|
|
282
|
+
|
|
283
|
+
def decorator(activity_fn: Callable[P, T]) -> Callable[P, T]:
|
|
284
|
+
if asyncio.iscoroutinefunction(activity_fn):
|
|
285
|
+
|
|
286
|
+
@wraps(activity_fn)
|
|
287
|
+
async def wrapper(*args, **kwargs) -> T:
|
|
288
|
+
try:
|
|
289
|
+
return await activity_fn(*args, **kwargs)
|
|
290
|
+
except Exception as e:
|
|
291
|
+
raise fatal_error_from_exception(e) from e
|
|
292
|
+
else:
|
|
293
|
+
|
|
294
|
+
@wraps(activity_fn)
|
|
295
|
+
def wrapper(*args, **kwargs) -> T:
|
|
296
|
+
try:
|
|
297
|
+
return activity_fn(*args, **kwargs)
|
|
298
|
+
except Exception as e:
|
|
299
|
+
raise fatal_error_from_exception(e) from e
|
|
300
|
+
|
|
301
|
+
return wrapper
|
|
302
|
+
|
|
303
|
+
return decorator
|
|
239
304
|
|
|
240
305
|
def decorator(activity_fn: Callable[P, T]) -> Callable[P, T]:
|
|
241
306
|
if asyncio.iscoroutinefunction(activity_fn):
|
|
@@ -248,6 +313,7 @@ def with_retriables(
|
|
|
248
313
|
raise
|
|
249
314
|
except Exception as e:
|
|
250
315
|
raise fatal_error_from_exception(e) from e
|
|
316
|
+
|
|
251
317
|
else:
|
|
252
318
|
|
|
253
319
|
@wraps(activity_fn)
|
|
@@ -268,20 +334,13 @@ def activity_defn(
|
|
|
268
334
|
name: str,
|
|
269
335
|
progress_weight: float = 1.0,
|
|
270
336
|
retriables: set[type[Exception]] = None,
|
|
271
|
-
*,
|
|
272
|
-
no_thread_cancel_exception: bool = False,
|
|
273
|
-
dynamic: bool = False,
|
|
274
337
|
) -> Callable[[Callable[P, T]], Callable[P, T]]:
|
|
275
338
|
def decorator(activity_fn: Callable[P, T]) -> Callable[P, T]:
|
|
276
339
|
activity_fn = positional_args_only(activity_fn)
|
|
277
340
|
activity_fn = with_retriables(retriables)(activity_fn)
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
name=name,
|
|
282
|
-
no_thread_cancel_exception=no_thread_cancel_exception,
|
|
283
|
-
dynamic=dynamic,
|
|
284
|
-
)
|
|
341
|
+
if supports_progress(activity_fn):
|
|
342
|
+
activity_fn = with_progress(progress_weight)(activity_fn)
|
|
343
|
+
activity_fn = activity.defn(activity_fn, name=name)
|
|
285
344
|
return activity_fn
|
|
286
345
|
|
|
287
346
|
return decorator
|
|
@@ -370,3 +429,71 @@ class LogWithWorkerIDMixin:
|
|
|
370
429
|
handler.addFilter(worker_id_filter)
|
|
371
430
|
handler.setLevel(log_level)
|
|
372
431
|
return handlers
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def safe_dir(filename: str) -> Path:
|
|
435
|
+
filename = filename.split(".", maxsplit=1)[0]
|
|
436
|
+
parts = (p for p in (filename[:2], filename[2:4]) if p)
|
|
437
|
+
return Path(*parts)
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def artifacts_dir(project: str, *, filename: str) -> Path:
|
|
441
|
+
return Path(project, safe_dir(filename), filename)
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def metadata_path(filename: str, *, project: str) -> Path:
|
|
445
|
+
metadata_path = artifacts_dir(project, filename=filename) / METADATA_JSON
|
|
446
|
+
return metadata_path
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def _read_artifact_metadata(root: Path, project: str, *, filename: str) -> dict:
|
|
450
|
+
m_path = root / metadata_path(filename, project=project)
|
|
451
|
+
return json.loads(m_path.read_text())
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def write_artifact(
|
|
455
|
+
artifact: bytes,
|
|
456
|
+
root: Path,
|
|
457
|
+
*,
|
|
458
|
+
project: str,
|
|
459
|
+
filename: str,
|
|
460
|
+
metadata_key: str,
|
|
461
|
+
metadata_value: str,
|
|
462
|
+
) -> Path:
|
|
463
|
+
artif_dir = root / artifacts_dir(project, filename=filename)
|
|
464
|
+
artif_dir.mkdir(exist_ok=True, parents=True)
|
|
465
|
+
# TODO: if transcriptions are too large we could also serialize them
|
|
466
|
+
# as jsonl
|
|
467
|
+
transcription_path = artif_dir / metadata_value
|
|
468
|
+
transcription_path.write_bytes(artifact)
|
|
469
|
+
try:
|
|
470
|
+
meta = _read_artifact_metadata(root, project, filename=filename)
|
|
471
|
+
except FileNotFoundError:
|
|
472
|
+
meta = dict()
|
|
473
|
+
meta[metadata_key] = metadata_value
|
|
474
|
+
meta_path = root / artifacts_dir(project, filename=filename) / METADATA_JSON
|
|
475
|
+
meta_path.write_text(json.dumps(meta))
|
|
476
|
+
return transcription_path.relative_to(artif_dir)
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def debuggable_name(
    path: str, component_size_limit: int = 10, *, deterministic: bool = False
) -> str:
    """Build an identifier that stays readable for a given path.

    The result is ``<token>-<components>`` where ``<token>`` is a 20
    character hexadecimal string and ``<components>`` is made of the path
    components, each truncated to ``component_size_limit`` characters and
    joined with ``__``.

    Args:
        path: path to derive the name from.
        component_size_limit: maximum number of characters kept for each
            path component.
        deterministic: when true the token is derived from the SHA-256 of
            the path, otherwise it comes from a freshly generated UUID4.

    Returns:
        The generated name.
    """
    as_path = Path(path)
    if deterministic:
        token = sha256(str(as_path).encode()).hexdigest()
    else:
        token = uuid4().hex
    readable = "__".join(part[:component_size_limit] for part in as_path.parts)
    return f"{token[:20]}-{readable}"
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def activity_contextual_id(*, wf_context: bool = False) -> str:
    """Build an identifier from the info of the currently running activity.

    The identifier is ``<activity type>-<activity id>-<activity id>``,
    optionally followed by ``-<workflow id>-<workflow run id>``.

    Args:
        wf_context: when true, append the workflow id and workflow run id.

    Returns:
        The dash separated identifier.
    """
    info = activity.info()
    activity_id = info.activity_id
    # NOTE(review): the "run id" slot repeats the activity id, so the id
    # appears twice; confirm whether a distinct run identifier was intended.
    identifier = f"{info.activity_type}-{activity_id}-{activity_id}"
    if not wf_context:
        return identifier
    return f"{identifier}-{info.workflow_id}-{info.workflow_run_id}"
|
|
Binary file
|
datashare_python/worker.py
CHANGED
|
@@ -1,13 +1,28 @@
|
|
|
1
|
+
import inspect
|
|
1
2
|
import logging
|
|
3
|
+
import os
|
|
4
|
+
import socket
|
|
5
|
+
import sys
|
|
6
|
+
import threading
|
|
7
|
+
from asyncio import AbstractEventLoop
|
|
8
|
+
from collections.abc import AsyncGenerator, Callable
|
|
2
9
|
from concurrent.futures import ThreadPoolExecutor
|
|
10
|
+
from contextlib import asynccontextmanager
|
|
11
|
+
from typing import Any
|
|
3
12
|
|
|
4
13
|
from temporalio.worker import PollerBehaviorSimpleMaximum, Worker
|
|
5
14
|
|
|
15
|
+
from .config import WorkerConfig
|
|
16
|
+
from .dependencies import with_dependencies
|
|
6
17
|
from .discovery import Activity
|
|
7
|
-
from .types_ import TemporalClient
|
|
18
|
+
from .types_ import ContextManagerFactory, TemporalClient
|
|
8
19
|
|
|
9
20
|
logger = logging.getLogger(__name__)
|
|
10
21
|
|
|
22
|
+
_TEMPORAL_CLIENT = "temporal_client"
|
|
23
|
+
_EVENT_LOOP = "event_loop"
|
|
24
|
+
_EXPECTED_INIT_ARGS = {"self", _TEMPORAL_CLIENT, _EVENT_LOOP, "args", "kwargs"}
|
|
25
|
+
|
|
11
26
|
_SEPARATE_IO_AND_CPU_WORKERS = """The worker will run sync (CPU-bound) activities as \
|
|
12
27
|
well as IO-bound workflows.
|
|
13
28
|
To avoid deadlocks due to the GIL, we advise to run all CPU-bound activities inside a \
|
|
@@ -30,20 +45,21 @@ _ACTIVITY_THREAD_NAME_PREFIX = "datashare-activity-worker-"
|
|
|
30
45
|
|
|
31
46
|
def datashare_worker(
|
|
32
47
|
client: TemporalClient,
|
|
48
|
+
worker_id: str,
|
|
33
49
|
*,
|
|
34
50
|
workflows: list[type] | None = None,
|
|
35
51
|
activities: list[Activity] | None = None,
|
|
36
52
|
task_queue: str,
|
|
37
53
|
# Scale horizontally by default for activities, each worker processes one activity
|
|
38
54
|
# at a time
|
|
39
|
-
|
|
55
|
+
max_concurrent_io_activities: int = 10,
|
|
40
56
|
) -> Worker:
|
|
41
57
|
if workflows is None:
|
|
42
58
|
workflows = []
|
|
43
59
|
if activities is None:
|
|
44
60
|
activities = []
|
|
45
61
|
are_async = [a.__temporal_activity_definition.is_async for a in activities]
|
|
46
|
-
if all(not a for a in are_async):
|
|
62
|
+
if are_async and all(not a for a in are_async):
|
|
47
63
|
activity_executor = ThreadPoolExecutor(
|
|
48
64
|
thread_name_prefix=_ACTIVITY_THREAD_NAME_PREFIX
|
|
49
65
|
)
|
|
@@ -55,11 +71,15 @@ def datashare_worker(
|
|
|
55
71
|
)
|
|
56
72
|
logger.warning(_SEPARATE_IO_AND_CPU_ACTIVITIES)
|
|
57
73
|
|
|
58
|
-
|
|
59
|
-
|
|
74
|
+
max_concurrent_activities = max_concurrent_io_activities
|
|
75
|
+
if isinstance(activity_executor, ThreadPoolExecutor):
|
|
76
|
+
max_concurrent_activities = 1
|
|
77
|
+
if workflows:
|
|
78
|
+
logger.warning(_SEPARATE_IO_AND_CPU_WORKERS)
|
|
60
79
|
|
|
61
80
|
return Worker(
|
|
62
81
|
client,
|
|
82
|
+
identity=worker_id,
|
|
63
83
|
workflows=workflows,
|
|
64
84
|
activities=activities,
|
|
65
85
|
task_queue=task_queue,
|
|
@@ -72,3 +92,84 @@ def datashare_worker(
|
|
|
72
92
|
# several of them
|
|
73
93
|
workflow_task_poller_behavior=PollerBehaviorSimpleMaximum(5),
|
|
74
94
|
)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def create_worker_id(prefix: str) -> str:
    """Build a worker identifier from the current execution context.

    Args:
        prefix: leading part of the identifier.

    Returns:
        ``<prefix>-<hostname>-<process id>-<thread id>``.
    """
    # TODO: this might not be unique when using asyncio
    components = (
        prefix,
        socket.gethostname(),
        os.getpid(),
        threading.get_ident(),
    )
    return "-".join(map(str, components))
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def init_activity(
    activity: Callable, client: TemporalClient, event_loop: AbstractEventLoop
) -> Callable:
    """Bind a class-based activity to a freshly built instance of its class.

    Module-level functions are returned untouched. For a method, the owning
    class is looked up, its ``__init__`` signature is validated against
    ``_EXPECTED_INIT_ARGS``, the class is instantiated with the injectable
    arguments its ``__init__`` accepts, and the method bound to that instance
    is returned. When ``__init__`` accepts none of the injectable arguments,
    ``activity`` is returned as is.

    Args:
        activity: activity function or method.
        client: Temporal client injected as ``temporal_client``.
        event_loop: event loop injected as ``event_loop``.

    Returns:
        ``activity`` itself, or the same method bound to a new instance.

    Raises:
        ValueError: if ``__init__`` declares a parameter which is not part of
            ``_EXPECTED_INIT_ARGS``.
    """
    # A qualified name without a dot denotes a module-level function, there
    # is no class to instantiate
    is_plain_function = "." not in activity.__qualname__
    if is_plain_function:
        return activity
    cls = _get_class_from_method(activity)
    init_args = inspect.signature(cls.__init__).parameters
    invalid = [p for p in init_args if p not in _EXPECTED_INIT_ARGS]
    if invalid:
        msg = f"invalid activity arguments: {invalid}"
        raise ValueError(msg)
    injectable = {_TEMPORAL_CLIENT: client, _EVENT_LOOP: event_loop}
    # A user-defined __init__ taking **kwargs can receive every injectable
    # argument; object.__init__ advertises **kwargs too but rejects any
    # argument, hence the explicit exclusion
    takes_any_kwarg = cls.__init__ is not object.__init__ and any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in init_args.values()
    )
    # Only pass what the constructor can actually receive: filtering against
    # _EXPECTED_INIT_ARGS would keep everything and break constructors
    # declaring a single injectable argument or none at all
    kwargs = {
        k: v for k, v in injectable.items() if takes_any_kwarg or k in init_args
    }
    if not kwargs:
        return activity
    act_instance = cls(**kwargs)
    return getattr(act_instance, activity.__name__)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@asynccontextmanager
async def bootstrap_worker(
    worker_id: str,
    *,
    activities: list[Callable[..., Any] | None] | None = None,
    workflows: list[type] | None = None,
    bootstrap_config: WorkerConfig,
    client: TemporalClient,
    event_loop: AbstractEventLoop,
    task_queue: str,
    dependencies: list[ContextManagerFactory] | None = None,
) -> AsyncGenerator[Worker, None]:
    """Set up the dependencies and yield a worker built inside them.

    When ``dependencies`` is non-empty the ``with_dependencies`` context is
    entered, otherwise a no-op context is used. Inside that context every
    activity goes through ``init_activity`` and the worker is created with
    ``datashare_worker``. The context stays entered until the caller leaves
    the ``async with`` block.

    Args:
        worker_id: identifier passed to the dependencies and to the worker.
        activities: activities to register; ``None`` is forwarded as is.
        workflows: workflow classes to register.
        bootstrap_config: worker configuration, also providing
            ``max_concurrent_io_activities``.
        client: Temporal client used by the worker and the activities.
        event_loop: event loop handed to the dependencies and activities.
        task_queue: task queue passed to the worker.
        dependencies: context manager factories to enter around the worker.

    Yields:
        The worker returned by ``datashare_worker``.
    """
    if dependencies:
        lifespan = with_dependencies(
            dependencies,
            worker_config=bootstrap_config,
            worker_id=worker_id,
            event_loop=event_loop,
        )
    else:
        lifespan = _do_nothing_cm()
    async with lifespan:
        bound_activities = None
        if activities is not None:
            bound_activities = [
                init_activity(act, client=client, event_loop=event_loop)
                for act in activities
            ]
        yield datashare_worker(
            client,
            worker_id,
            workflows=workflows,
            activities=bound_activities,
            task_queue=task_queue,
            max_concurrent_io_activities=bootstrap_config.max_concurrent_io_activities,
        )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@asynccontextmanager
|
|
168
|
+
async def _do_nothing_cm() -> AsyncGenerator[None, None]:
|
|
169
|
+
yield
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _get_class_from_method(method: Callable) -> type:
|
|
173
|
+
class_name = method.__qualname__.rsplit(".", 1)[0]
|
|
174
|
+
module = sys.modules[method.__module__]
|
|
175
|
+
return getattr(module, class_name)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datashare-python
|
|
3
|
-
Version: 0.2.23
|
|
3
|
+
Version: 0.2.25
|
|
4
4
|
Summary: Manage Python tasks and local resources in Datashare
|
|
5
5
|
Project-URL: Homepage, https://icij.github.io/datashare-python/
|
|
6
6
|
Project-URL: Documentation, https://icij.github.io/datashare-python/
|
|
@@ -9,13 +9,13 @@ Project-URL: Issues, https://github.com/ICIJ/datashare-python/issues
|
|
|
9
9
|
Author-email: Clément Doumouro <cdoumouro@icij.org>, Clément Doumouro <clement.doumouro@gmail.com>, Lion Summerbell <lsummerbell@icij.org>
|
|
10
10
|
Requires-Python: <4,>=3.11
|
|
11
11
|
Requires-Dist: aiohttp~=3.11.9
|
|
12
|
-
Requires-Dist: aiostream~=0.6.4
|
|
13
12
|
Requires-Dist: alive-progress~=3.2.0
|
|
14
|
-
Requires-Dist: datashare-worker-template~=0.1
|
|
13
|
+
Requires-Dist: datashare-worker-template[ml]~=0.1
|
|
15
14
|
Requires-Dist: hatchling~=1.27.0
|
|
16
|
-
Requires-Dist: icij-common[elasticsearch]~=0.
|
|
15
|
+
Requires-Dist: icij-common[elasticsearch]~=0.8.2
|
|
17
16
|
Requires-Dist: nest-asyncio~=1.6.0
|
|
18
17
|
Requires-Dist: python-json-logger~=4.0.0
|
|
18
|
+
Requires-Dist: pyyaml~=6.0
|
|
19
19
|
Requires-Dist: temporalio~=1.23.0
|
|
20
20
|
Requires-Dist: tomlkit~=0.14.0
|
|
21
21
|
Requires-Dist: typer~=0.15.4
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
datashare_python/.gitignore,sha256=e-SRgnvGGdsjRrqgKsTzALz6Obx8IYiOjr0yaAxT6v8,22
|
|
2
|
+
datashare_python/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
datashare_python/__main__.py,sha256=g-fvS46zl9umKmGrSpl-OG-8PSuZgjqvTCqjpsZtSps,101
|
|
4
|
+
datashare_python/config.py,sha256=u6iyOeSXzIO30Yja8Vj9LjM-cq8ESRBy3Kse6UadAMg,3763
|
|
5
|
+
datashare_python/conftest.py,sha256=BdRLjy9eJtxAKLDCcon1Nyhzn54CIw2z4s3ZOupNYGo,8256
|
|
6
|
+
datashare_python/constants.py,sha256=e6Px11OUee9GSHwTgsgFMszGCMwpW-OznHSMgINvepc,338
|
|
7
|
+
datashare_python/dependencies.py,sha256=Diu7alKGaFWyC_ajp0fKU-xp8u5f_8x1axAHVBlppD0,3707
|
|
8
|
+
datashare_python/discovery.py,sha256=R0wws-_QgiK3GyTSf3p2UV41Ok-iuKeZauh6EJAdzGc,6401
|
|
9
|
+
datashare_python/exceptions.py,sha256=bVHEAXxDPKfxeeMC0hJXEsrJkgsKO2ESAhxWU96GA4M,496
|
|
10
|
+
datashare_python/local_client.py,sha256=GP9MTcHVQ1mcb2eO6TiQ7mzQdx199lZRhK8DRuJqJVQ,2359
|
|
11
|
+
datashare_python/objects.py,sha256=GMi2hlKuWFbWWoC2r8ITQGQcMsobHHChGvm4ZfIjMl0,4537
|
|
12
|
+
datashare_python/task_client.py,sha256=oTmP8bvZW0UyhLNMi1AV3XIAx7hrdbxNRss2Mw2azEc,8435
|
|
13
|
+
datashare_python/template.py,sha256=RxKTYLXoS_EQ8Jc41JkBXppPdbCFqDWfP3BmC0gvB5o,4024
|
|
14
|
+
datashare_python/types_.py,sha256=9Hk1XqpdXbM1TnEzwvJ5G9ABbaCZW9KgBTtiPBVn_7k,649
|
|
15
|
+
datashare_python/utils.py,sha256=DQt-rBwC3Ok72u8VyerG3rqwUTx3ftLfPdMQ5cnRrgs,16801
|
|
16
|
+
datashare_python/worker-template.tar.gz,sha256=CTokaLmqZZ6tJPGT4N7YG9aDsP66yS1HvJN-IzWhh0c,142348
|
|
17
|
+
datashare_python/worker.py,sha256=A4SnmDB4y0ck6Wp_UZWdsSOyTvW54Z2Bq76gxtp-_PE,6070
|
|
18
|
+
datashare_python/cli/__init__.py,sha256=5MGSE_0SwlOiwbyPwsP8RIXlTBB2_GGP0zDg4l6UAIY,1479
|
|
19
|
+
datashare_python/cli/local.py,sha256=S-7qMpSqzi0oMvu01TCFEb8tayEvpw4pXMdCszKEYtU,986
|
|
20
|
+
datashare_python/cli/project.py,sha256=w32Gy9AOL5B00uDT4in7YUCt2g68FnNbvwg2M3a8G6o,946
|
|
21
|
+
datashare_python/cli/task.py,sha256=9If5OC7loG4C4gWWl4iOeqPJ4GOLlCWXQfuNLUHORrQ,5860
|
|
22
|
+
datashare_python/cli/utils.py,sha256=p69CQb0zfixuyBkiZprhdMCc_NuYwXyAn6vC9H1UzAw,911
|
|
23
|
+
datashare_python/cli/worker.py,sha256=tJ2xj_TCyjZVh1Jlb_AknHEg8xn9Js90Vb39slew8t4,5160
|
|
24
|
+
datashare_python-0.2.25.dist-info/METADATA,sha256=yIqCI2amsvCqNeKqnJ0RJtHUV1OnrUn7Lue5DgbYWk0,958
|
|
25
|
+
datashare_python-0.2.25.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
26
|
+
datashare_python-0.2.25.dist-info/entry_points.txt,sha256=ILE7auxabHWiu3GC-AunWnzjhOI_SbZp7D4GqZHlLw4,68
|
|
27
|
+
datashare_python-0.2.25.dist-info/RECORD,,
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
datashare_python/.gitignore,sha256=e-SRgnvGGdsjRrqgKsTzALz6Obx8IYiOjr0yaAxT6v8,22
|
|
2
|
-
datashare_python/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
datashare_python/__main__.py,sha256=g-fvS46zl9umKmGrSpl-OG-8PSuZgjqvTCqjpsZtSps,101
|
|
4
|
-
datashare_python/config.py,sha256=tTdLeBxefsLbXBy0sZS1n1wYuLPGMITm8mklo7CDraM,2954
|
|
5
|
-
datashare_python/conftest.py,sha256=TVa57J4FQcwVIYgjCiNShvu4F1dUi94qJ7Q-SGlML6k,7853
|
|
6
|
-
datashare_python/constants.py,sha256=QrtQEf-qHAkdzs1xJuyeN6XbCalTRVcgVvEZNdT-soU,305
|
|
7
|
-
datashare_python/dependencies.py,sha256=3KDZ71x3KGnik-QPfXgSPRRLYfbj0oSQ0jCG-3x1c3M,3017
|
|
8
|
-
datashare_python/discovery.py,sha256=zCTbTxFmAaY4aRwCyUAGh-HQVoZAnxVOygp5cl2Oha4,2957
|
|
9
|
-
datashare_python/exceptions.py,sha256=bVHEAXxDPKfxeeMC0hJXEsrJkgsKO2ESAhxWU96GA4M,496
|
|
10
|
-
datashare_python/local_client.py,sha256=GP9MTcHVQ1mcb2eO6TiQ7mzQdx199lZRhK8DRuJqJVQ,2359
|
|
11
|
-
datashare_python/objects.py,sha256=UDc9PDaSXADLgmnfvqGBrDTmPVL-rRpyNHvY277-1x8,4147
|
|
12
|
-
datashare_python/task_client.py,sha256=oTmP8bvZW0UyhLNMi1AV3XIAx7hrdbxNRss2Mw2azEc,8435
|
|
13
|
-
datashare_python/template.py,sha256=EaFX7A6sw-HCb3RWrRvMg61cnKzfZlIB6Liom4mJXys,3725
|
|
14
|
-
datashare_python/types_.py,sha256=9Hk1XqpdXbM1TnEzwvJ5G9ABbaCZW9KgBTtiPBVn_7k,649
|
|
15
|
-
datashare_python/utils.py,sha256=mFegsYFoIRzRtPhx12748qrahe-c0rKNx02-r8tTG8Q,12777
|
|
16
|
-
datashare_python/worker-template.tar.gz,sha256=THxySB554TAzBl2K5IIIj7yKulv0lhodM-zVN_Zk1Sg,142868
|
|
17
|
-
datashare_python/worker.py,sha256=E1cECV0q2XcGI234gZZ5eBvqeULlTK0FUb0VSWkRQUI,2924
|
|
18
|
-
datashare_python/cli/__init__.py,sha256=5MGSE_0SwlOiwbyPwsP8RIXlTBB2_GGP0zDg4l6UAIY,1479
|
|
19
|
-
datashare_python/cli/local.py,sha256=S-7qMpSqzi0oMvu01TCFEb8tayEvpw4pXMdCszKEYtU,986
|
|
20
|
-
datashare_python/cli/project.py,sha256=w32Gy9AOL5B00uDT4in7YUCt2g68FnNbvwg2M3a8G6o,946
|
|
21
|
-
datashare_python/cli/task.py,sha256=9If5OC7loG4C4gWWl4iOeqPJ4GOLlCWXQfuNLUHORrQ,5860
|
|
22
|
-
datashare_python/cli/utils.py,sha256=p69CQb0zfixuyBkiZprhdMCc_NuYwXyAn6vC9H1UzAw,911
|
|
23
|
-
datashare_python/cli/worker.py,sha256=lmxeZDhLbGb-98XplCERN3R3Yo-sU0UkhO7KIo9Zbo4,4401
|
|
24
|
-
datashare_python-0.2.23.dist-info/METADATA,sha256=yzNfCxfVgTNJLV2RigdLh8ltF64_h5JNpm6FBhru7Sg,959
|
|
25
|
-
datashare_python-0.2.23.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
26
|
-
datashare_python-0.2.23.dist-info/entry_points.txt,sha256=ILE7auxabHWiu3GC-AunWnzjhOI_SbZp7D4GqZHlLw4,68
|
|
27
|
-
datashare_python-0.2.23.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|