datashare-python 0.2.23__tar.gz → 0.2.25__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {datashare_python-0.2.23 → datashare_python-0.2.25}/PKG-INFO +4 -4
  2. datashare_python-0.2.25/datashare_python/cli/worker.py +137 -0
  3. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/config.py +30 -2
  4. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/conftest.py +11 -0
  5. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/constants.py +2 -0
  6. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/dependencies.py +27 -7
  7. datashare_python-0.2.25/datashare_python/discovery.py +178 -0
  8. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/objects.py +17 -3
  9. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/template.py +11 -4
  10. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/utils.py +152 -25
  11. datashare_python-0.2.25/datashare_python/worker-template.tar.gz +0 -0
  12. datashare_python-0.2.25/datashare_python/worker.py +175 -0
  13. {datashare_python-0.2.23 → datashare_python-0.2.25}/pyproject.toml +4 -5
  14. datashare_python-0.2.23/datashare_python/cli/worker.py +0 -114
  15. datashare_python-0.2.23/datashare_python/discovery.py +0 -79
  16. datashare_python-0.2.23/datashare_python/worker-template.tar.gz +0 -0
  17. datashare_python-0.2.23/datashare_python/worker.py +0 -74
  18. {datashare_python-0.2.23 → datashare_python-0.2.25}/.gitignore +0 -0
  19. {datashare_python-0.2.23 → datashare_python-0.2.25}/README.md +0 -0
  20. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/.gitignore +0 -0
  21. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/__init__.py +0 -0
  22. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/__main__.py +0 -0
  23. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/cli/__init__.py +0 -0
  24. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/cli/local.py +0 -0
  25. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/cli/project.py +0 -0
  26. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/cli/task.py +0 -0
  27. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/cli/utils.py +0 -0
  28. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/exceptions.py +0 -0
  29. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/local_client.py +0 -0
  30. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/task_client.py +0 -0
  31. {datashare_python-0.2.23 → datashare_python-0.2.25}/datashare_python/types_.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datashare-python
3
- Version: 0.2.23
3
+ Version: 0.2.25
4
4
  Summary: Manage Python tasks and local resources in Datashare
5
5
  Project-URL: Homepage, https://icij.github.io/datashare-python/
6
6
  Project-URL: Documentation, https://icij.github.io/datashare-python/
@@ -9,13 +9,13 @@ Project-URL: Issues, https://github.com/ICIJ/datashare-python/issues
9
9
  Author-email: Clément Doumouro <cdoumouro@icij.org>, Clément Doumouro <clement.doumouro@gmail.com>, Lion Summerbell <lsummerbell@icij.org>
10
10
  Requires-Python: <4,>=3.11
11
11
  Requires-Dist: aiohttp~=3.11.9
12
- Requires-Dist: aiostream~=0.6.4
13
12
  Requires-Dist: alive-progress~=3.2.0
14
- Requires-Dist: datashare-worker-template~=0.1
13
+ Requires-Dist: datashare-worker-template[ml]~=0.1
15
14
  Requires-Dist: hatchling~=1.27.0
16
- Requires-Dist: icij-common[elasticsearch]~=0.7.3
15
+ Requires-Dist: icij-common[elasticsearch]~=0.8.2
17
16
  Requires-Dist: nest-asyncio~=1.6.0
18
17
  Requires-Dist: python-json-logger~=4.0.0
18
+ Requires-Dist: pyyaml~=6.0
19
19
  Requires-Dist: temporalio~=1.23.0
20
20
  Requires-Dist: tomlkit~=0.14.0
21
21
  Requires-Dist: typer~=0.15.4
@@ -0,0 +1,137 @@
1
+ import asyncio
2
+ import logging
3
+ from pathlib import Path
4
+ from typing import Annotated
5
+
6
+ import typer
7
+ import yaml
8
+ from icij_common.pydantic_utils import safe_copy
9
+
10
+ from datashare_python.config import TemporalClientConfig, WorkerConfig
11
+ from datashare_python.discovery import discover, discover_activities, discover_workflows
12
+ from datashare_python.worker import bootstrap_worker, create_worker_id
13
+
14
+ from .utils import AsyncTyper
15
+
16
+ _START_WORKER_HELP = "start a datashare worker"
17
+
18
+ _LIST_WORKFLOWS_HELP = "list registered workflows"
19
+ _LIST_WORKFLOW_NAMES_HELP = "workflow names filters (supports regexes)"
20
+
21
+ _LIST_ACTIVITIES_HELP = "list registered activities"
22
+ _LIST_ACTIVITY_NAMES_HELP = "activity names filters (supports regexes)"
23
+
24
+ _START_WORKER_WORKFLOWS_HELP = "workflow names run by the worker (supports regexes)"
25
+ _START_WORKER_ACTIVITIES_HELP = "activity names run by the worker (supports regexes)"
26
+ _START_WORKER_DEPS_HELP = "worker lifetime dependencies name in the registry"
27
+ _START_WORKER_WORKER_ID_PREFIX_HELP = "worker ID prefix"
28
+ _START_WORKER_CONFIG_PATH_HELP = (
29
+ "path to a worker config YAML file,"
30
+ " if not provided will load worker configuration from env variables"
31
+ )
32
+ _WORKER_QUEUE_HELP = "worker task queue"
33
+ _TEMPORAL_NAMESPACE_HELP = "worker temporal namespace"
34
+
35
+ _TEMPORAL_URL_HELP = "address for temporal server"
36
+ _NAMESPACE_HELP = "namespace name"
37
+ _WORKER = "worker"
38
+
39
+ worker_app = AsyncTyper(name=_WORKER)
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
@worker_app.async_command(help=_LIST_WORKFLOWS_HELP)
async def list_workflows(
    names: Annotated[list[str], typer.Argument(help=_LIST_WORKFLOW_NAMES_HELP)],
) -> None:
    """Print the names of the registered workflows matching ``names``.

    Args:
        names: workflow name filters, forwarded to ``discover_workflows``.
    """
    workflow_names = [wf_name for wf_name, _ in discover_workflows(names)]
    if not workflow_names:
        out = """Couldn't find any registered workflow 🤔.
Make sure your workflow plugins correctly expose workflow entry points, refer to the \
documentation to learn how to do so."""
        print(out)
        return
    # Count the names before joining them: the joined listing is a single string
    # whose length is a number of characters, not a number of workflows.
    n_workflows = len(workflow_names)
    listing = "\n".join(f"- {wf}" for wf in workflow_names)
    out = f"Found {n_workflows} registered workflows:\n{listing}"
    print(out)
58
+
59
+
60
@worker_app.async_command(help=_LIST_ACTIVITIES_HELP)
async def list_activities(
    names: Annotated[list[str], typer.Argument(help=_LIST_ACTIVITY_NAMES_HELP)],
) -> None:
    """Print the names of the registered activities matching ``names``.

    Args:
        names: activity name filters, forwarded to ``discover_activities``.
    """
    activity_names = [act_name for act_name, _ in discover_activities(names)]
    if not activity_names:
        out = """Couldn't find any registered activity 🤔.
Make sure your activity plugins correctly expose activity entry points, refer \
to the documentation to learn how to do so."""
        print(out)
        return
    # Count the names before joining them: the joined listing is a single string
    # whose length is a number of characters, not a number of activities.
    n_activities = len(activity_names)
    listing = "\n".join(f"- {act}" for act in activity_names)
    out = f"Found {n_activities} registered activities:\n{listing}"
    print(out)
74
+
75
+
76
@worker_app.async_command(help=_START_WORKER_HELP)
async def start(
    workflows: Annotated[list[str], typer.Option(help=_START_WORKER_WORKFLOWS_HELP)],
    activities: Annotated[list[str], typer.Option(help=_START_WORKER_ACTIVITIES_HELP)],
    queue: Annotated[str, typer.Option("--queue", "-q", help=_WORKER_QUEUE_HELP)],
    dependencies: Annotated[
        str | None, typer.Option(help=_START_WORKER_DEPS_HELP)
    ] = None,
    config_path: Annotated[
        Path | None,
        typer.Option(
            "--config-path", "--config", "-c", help=_START_WORKER_CONFIG_PATH_HELP
        ),
    ] = None,
    worker_id_prefix: Annotated[
        str | None, typer.Option(help=_START_WORKER_WORKER_ID_PREFIX_HELP)
    ] = None,
    temporal_address: Annotated[
        str | None, typer.Option("--temporal-address", "-a", help=_TEMPORAL_URL_HELP)
    ] = None,
    namespace: Annotated[
        str | None,
        typer.Option("--temporal-namespace", "-ns", help=_TEMPORAL_NAMESPACE_HELP),
    ] = None,
) -> None:
    """Discover the requested workflows/activities and run a worker serving them.

    Args:
        workflows: workflow name filters forwarded to ``discover``.
        activities: activity name filters forwarded to ``discover``.
        queue: task queue the worker is bootstrapped with.
        dependencies: name of the dependencies forwarded to ``discover``.
        config_path: YAML file validated into a ``WorkerConfig``; when omitted
            ``WorkerConfig()`` is instantiated instead.
        worker_id_prefix: prefix passed to ``create_worker_id``, defaults to
            ``"worker"``.
        temporal_address: overrides the ``host`` of the temporal config.
        namespace: overrides the ``namespace`` of the temporal config.
    """
    if config_path is not None:
        with config_path.open() as f:
            # safe_load only builds plain YAML types, which is all model_validate
            # needs. The full yaml.Loader can instantiate arbitrary Python objects
            # from tags found in the file.
            bootstrap_config = WorkerConfig.model_validate(yaml.safe_load(f))
    else:
        bootstrap_config = WorkerConfig()
    temporal_override = {}
    if temporal_address is not None:
        temporal_override["host"] = temporal_address
    if namespace is not None:
        temporal_override["namespace"] = namespace
    if temporal_override:
        # Apply the CLI overrides on top of the already loaded temporal settings.
        # Building a brand new TemporalClientConfig from the overrides alone would
        # silently reset every non-overridden field to its default value.
        temporal_config = safe_copy(
            bootstrap_config.temporal, update=temporal_override
        )
        update = {"temporal": temporal_config}
        bootstrap_config = safe_copy(bootstrap_config, update=update)
    registered_wfs, registered_acts, registered_deps = discover(
        workflows, act_names=activities, deps_name=dependencies
    )
    worker_id = create_worker_id(worker_id_prefix or "worker")
    client = await bootstrap_config.to_temporal_client()
    event_loop = asyncio.get_event_loop()
    async with bootstrap_worker(
        worker_id,
        activities=registered_acts,
        workflows=registered_wfs,
        dependencies=registered_deps,
        bootstrap_config=bootstrap_config,
        client=client,
        event_loop=event_loop,
        task_queue=queue,
    ) as worker:
        try:
            await worker.run()
        except Exception:  # noqa: BLE001
            await worker.shutdown()
            # Bare raise keeps the original traceback untouched
            raise
@@ -4,7 +4,13 @@ from icij_common.es import ESClient
4
4
  from icij_common.pydantic_utils import ICIJSettings
5
5
  from pydantic import Field, PrivateAttr
6
6
  from pydantic_settings import SettingsConfigDict
7
- from temporalio.contrib.pydantic import pydantic_data_converter
7
+ from temporalio.contrib.pydantic import PydanticJSONPlainPayloadConverter, ToJsonOptions
8
+ from temporalio.converter import (
9
+ CompositePayloadConverter,
10
+ DataConverter,
11
+ DefaultPayloadConverter,
12
+ JSONPlainPayloadConverter,
13
+ )
8
14
 
9
15
  import datashare_python
10
16
 
@@ -64,7 +70,7 @@ class TemporalClientConfig(BaseModel):
64
70
  self._client = await TemporalClient.connect(
65
71
  target_host=self.host,
66
72
  namespace=self.namespace,
67
- data_converter=pydantic_data_converter,
73
+ data_converter=PYDANTIC_DATA_CONVERTER,
68
74
  )
69
75
  return self._client
70
76
 
@@ -83,6 +89,8 @@ class WorkerConfig(ICIJSettings, LogWithWorkerIDMixin, BaseModel):
83
89
  elasticsearch: ESClientConfig = ESClientConfig()
84
90
  temporal: TemporalClientConfig = TemporalClientConfig()
85
91
 
92
+ max_concurrent_io_activities: int = 5
93
+
86
94
  def to_es_client(self) -> ESClient:
87
95
  return self.elasticsearch.to_es_client(self.datashare.api_key)
88
96
 
@@ -91,3 +99,23 @@ class WorkerConfig(ICIJSettings, LogWithWorkerIDMixin, BaseModel):
91
99
 
92
100
  async def to_temporal_client(self) -> TemporalClient:
93
101
  return await self.temporal.to_client()
102
+
103
+
104
class _PydanticPayloadConverter(CompositePayloadConverter):
    """Default payload converters with the plain JSON one swapped for pydantic.

    The pydantic JSON converter is configured with ``exclude_unset=False``.
    """

    def __init__(self) -> None:
        pydantic_json = PydanticJSONPlainPayloadConverter(
            ToJsonOptions(exclude_unset=False)
        )
        converters = []
        # Keep the default converters in their original order, only the plain
        # JSON converter is substituted
        for default in DefaultPayloadConverter.default_encoding_payload_converters:
            if isinstance(default, JSONPlainPayloadConverter):
                converters.append(pydantic_json)
            else:
                converters.append(default)
        super().__init__(*converters)


# Data converter handed to the temporal client when connecting
PYDANTIC_DATA_CONVERTER = DataConverter(
    payload_converter_class=_PydanticPayloadConverter
)
@@ -1,6 +1,7 @@
1
1
  import asyncio
2
2
  from asyncio import AbstractEventLoop
3
3
  from collections.abc import AsyncGenerator, Generator, Iterator, Sequence
4
+ from pathlib import Path
4
5
 
5
6
  import aiohttp
6
7
  import pytest
@@ -50,6 +51,9 @@ _INDEX_BODY = {
50
51
  "language": {"type": "keyword"},
51
52
  "documentId": {"type": "keyword"},
52
53
  "join": {"type": "join", "relations": {"Document": "NamedEntity"}},
54
+ "contentType": {"type": "keyword"},
55
+ "content": {"type": "text"},
56
+ "contentTranslated": {"type": "text"},
53
57
  }
54
58
  }
55
59
  }
@@ -102,6 +106,13 @@ def test_worker_config() -> WorkerConfig:
102
106
  )
103
107
 
104
108
 
109
@pytest.fixture
def test_worker_config_path(test_worker_config: WorkerConfig, tmp_path: Path) -> Path:
    """Dump the test worker config as JSON in a temporary directory.

    Relies on pytest's ``tmp_path`` fixture, which is a real ``pathlib.Path``,
    rather than the legacy ``tmpdir`` one, which is not.

    Returns:
        the path of the written ``config.json`` file.
    """
    config_path = tmp_path / "config.json"
    config_path.write_text(test_worker_config.model_dump_json())
    return config_path
114
+
115
+
105
116
  @pytest.fixture(scope="session")
106
117
  async def worker_lifetime_deps(
107
118
  event_loop: AbstractEventLoop,
@@ -12,3 +12,5 @@ DEFAULT_TEMPORAL_ADDRESS = "temporal:7233"
12
12
  DEFAULT_DS_ADDRESS = "http://localhost:8080"
13
13
 
14
14
  DEFAULT_NAMESPACE = "datashare-default"
15
+
16
+ METADATA_JSON = "metadata.json"
@@ -1,8 +1,11 @@
1
+ import inspect
1
2
  import logging
2
3
  from asyncio import AbstractEventLoop, iscoroutine
3
- from collections.abc import AsyncGenerator
4
+ from collections.abc import AsyncGenerator, Callable
4
5
  from contextlib import AsyncExitStack, asynccontextmanager
5
6
  from contextvars import ContextVar
7
+ from copy import deepcopy
8
+ from typing import Any
6
9
 
7
10
  from icij_common.es import ESClient
8
11
 
@@ -20,7 +23,7 @@ TASK_CLIENT: ContextVar[DatashareTaskClient] = ContextVar("task_client")
20
23
  TEMPORAL_CLIENT: ContextVar[TemporalClient] = ContextVar("temporal_client")
21
24
 
22
25
 
23
- def set_event_loop(event_loop: AbstractEventLoop, **_) -> None:
26
+ def set_event_loop(event_loop: AbstractEventLoop) -> None:
24
27
  EVENT_LOOP.set(event_loop)
25
28
 
26
29
 
@@ -31,13 +34,13 @@ def lifespan_event_loop() -> AbstractEventLoop:
31
34
  raise DependencyInjectionError("event loop") from e
32
35
 
33
36
 
34
- def set_loggers(worker_config: WorkerConfig, worker_id: str, **_) -> None:
37
+ def set_loggers(worker_config: WorkerConfig, worker_id: str) -> None:
35
38
  worker_config.setup_loggers(worker_id=worker_id)
36
39
  logger.info("worker loggers ready to log 💬")
37
40
  logger.info("app config: %s", worker_config.model_dump_json(indent=2))
38
41
 
39
42
 
40
- async def set_es_client(worker_config: WorkerConfig, **_) -> ESClient:
43
+ async def set_es_client(worker_config: WorkerConfig) -> ESClient:
41
44
  client = worker_config.to_es_client()
42
45
  ES_CLIENT.set(client)
43
46
  return client
@@ -52,7 +55,7 @@ def lifespan_es_client() -> ESClient:
52
55
 
53
56
 
54
57
  # Task client setup
55
- async def set_task_client(worker_config: WorkerConfig, **_) -> DatashareTaskClient:
58
+ async def set_task_client(worker_config: WorkerConfig) -> DatashareTaskClient:
56
59
  task_client = worker_config.to_task_client()
57
60
  TASK_CLIENT.set(task_client)
58
61
  return task_client
@@ -67,7 +70,7 @@ def lifespan_task_client() -> DatashareTaskClient:
67
70
 
68
71
 
69
72
  # Temporal client setup
70
- async def set_temporal_client(worker_config: WorkerConfig, **_) -> None:
73
+ async def set_temporal_client(worker_config: WorkerConfig) -> None:
71
74
  client = await worker_config.to_temporal_client()
72
75
  TEMPORAL_CLIENT.set(client)
73
76
 
@@ -86,7 +89,7 @@ async def with_dependencies(
86
89
  ) -> AsyncGenerator[None, None]:
87
90
  async with AsyncExitStack() as stack:
88
91
  for dep in dependencies:
89
- cm = dep(**kwargs)
92
+ cm = dep(**add_missing_args(dep, dict(), **kwargs))
90
93
  if hasattr(cm, "__aenter__"):
91
94
  await stack.enter_async_context(cm)
92
95
  elif hasattr(cm, "__enter__"):
@@ -94,3 +97,20 @@ async def with_dependencies(
94
97
  elif iscoroutine(cm):
95
98
  await cm
96
99
  yield
100
+
101
+
102
def add_missing_args(fn: Callable, args: dict[str, Any], **kwargs) -> dict[str, Any]:
    """Complete ``args`` with the ``kwargs`` matching parameters of ``fn``.

    Args:
        fn: callable whose signature lists the accepted parameter names.
        args: arguments already selected for ``fn``, they always take precedence
            and the dict itself is never mutated.
        **kwargs: candidate values, only the ones named after a parameter of
            ``fn`` which is missing from ``args`` are used. ``None`` values are
            ignored.

    Returns:
        ``args`` itself when nothing had to be added, otherwise a new dict holding
        ``args`` plus the values picked from ``kwargs``.
    """
    # We make the choice not to raise in case of missing argument here, the error will
    # be correctly raised when the function is called
    from_kwargs = {}
    for param_name in inspect.signature(fn).parameters:
        if param_name in args:
            continue
        kwargs_value = kwargs.get(param_name)
        if kwargs_value is not None:
            from_kwargs[param_name] = kwargs_value
    if not from_kwargs:
        return args
    # A shallow copy is enough to leave the caller's dict untouched. Deep copying
    # would duplicate the argument values themselves and fails on values which
    # can't be copied (locks, clients, event loops...)
    return {**args, **from_kwargs}
@@ -0,0 +1,178 @@
1
+ import logging
2
+ import re
3
+ from collections.abc import Callable, Iterable
4
+ from importlib.metadata import entry_points
5
+
6
+ from .types_ import ContextManagerFactory
7
+ from .utils import ActivityWithProgress
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ Activity = ActivityWithProgress | Callable | type
12
+
13
+ _DEPENDENCIES = "dependencies"
14
+ _WORKFLOW_GROUPS = "datashare.workflows"
15
+ _ACTIVITIES_GROUPS = "datashare.activities"
16
+ _DEPENDENCIES_GROUPS = "datashare.dependencies"
17
+
18
+ _RegisteredWorkflow = tuple[str, type]
19
+ _RegisteredActivity = tuple[str, Activity]
20
+ _Dependencies = list[ContextManagerFactory]
21
+ _Discovery = tuple[
22
+ Iterable[_RegisteredWorkflow] | None,
23
+ Iterable[_RegisteredActivity] | None,
24
+ _Dependencies | None,
25
+ ]
26
+
27
+
28
def discover(
    wf_names: list[str] | None, *, act_names: list[str] | None, deps_name: str | None
) -> _Discovery:
    """Discover the workflows, activities and dependencies matching the filters.

    Args:
        wf_names: workflow name filters, ``None`` skips workflow discovery.
        act_names: activity name filters, ``None`` skips activity discovery.
        deps_name: dependencies name forwarded to ``discover_dependencies``.

    Returns:
        a ``(workflows, activities, dependencies)`` tuple. Workflows and
        activities are ``None`` when none was found, dependencies are whatever
        ``discover_dependencies`` returned.

    Raises:
        ValueError: if neither a workflow nor an activity was discovered.
    """
    summary = []
    wfs = None
    if wf_names is not None:
        discovered_wfs = discover_workflows(wf_names)
        # Only report names which were actually discovered, never the filters
        # provided by the caller
        if discovered_wfs:
            found_wf_names, wfs = zip(*discovered_wfs, strict=True)
            n_wfs = len(found_wf_names)
            summary.append(
                f"- {n_wfs} workflow{'s' if n_wfs > 1 else ''}:"
                f" {', '.join(found_wf_names)}"
            )
    acts = None
    if act_names is not None:
        discovered_acts = discover_activities(act_names)
        if discovered_acts:
            found_act_names, acts = zip(*discovered_acts, strict=True)
            n_acts = len(found_act_names)
            summary.append(
                f"- {n_acts} activit{'ies' if n_acts > 1 else 'y'}:"
                f" {', '.join(found_act_names)}"
            )
    if not acts and not wfs:
        raise ValueError("Couldn't find any registered activity or workflow.")
    deps = discover_dependencies(deps_name)
    if deps:
        n_deps = len(deps)
        # Some callables (partials, callable instances) have no __name__
        deps_names = (getattr(d, "__name__", repr(d)) for d in deps)
        summary.append(
            f"- {n_deps} dependenc{'ies' if n_deps > 1 else 'y'}:"
            f" {', '.join(deps_names)}"
        )
    logger.info("discovered:\n%s", "\n".join(summary))
    return wfs, acts, deps
69
+
70
+
71
def discover_workflows(names: list[str]) -> list[_RegisteredWorkflow]:
    """List the workflows exposed through the workflow entry points group.

    Args:
        names: regexes the workflow name must entirely match (any of them),
            an empty value keeps every workflow.

    Returns:
        the ``(name, implementation)`` pairs of the selected workflows.
    """
    # The alternation must be grouped, otherwise "^a|b$" only anchors the start of
    # the first alternative and the end of the last one
    pattern = re.compile(rf"^(?:{'|'.join(names)})$") if names else None
    registered = []
    for entry_point in entry_points(group=_WORKFLOW_GROUPS):
        wf_impls = entry_point.load()
        # An entry point exposes either a single workflow or a collection of them
        if not isinstance(wf_impls, list | tuple | set):
            wf_impls = [wf_impls]
        for wf_impl in wf_impls:
            wf_name = _parse_wf_name(wf_impl)
            if pattern is not None and not pattern.match(wf_name):
                continue
            registered.append((wf_name, wf_impl))
    return registered
85
+
86
+
87
def discover_activities(names: list[str]) -> list[_RegisteredActivity]:
    """List the activities exposed through the activity entry points group.

    Args:
        names: regexes the activity name must entirely match (any of them),
            an empty value keeps every activity.

    Returns:
        the ``(name, implementation)`` pairs of the selected activities.
    """
    # The alternation must be grouped, otherwise "^a|b$" only anchors the start of
    # the first alternative and the end of the last one
    pattern = re.compile(rf"^(?:{'|'.join(names)})$") if names else None
    registered = []
    for entry_point in entry_points(group=_ACTIVITIES_GROUPS):
        act_impls = entry_point.load()
        # An entry point exposes either a single activity or a collection of them
        if not isinstance(act_impls, list | tuple | set):
            act_impls = [act_impls]
        for act_impl in act_impls:
            act_name = _parse_activity_name(act_impl)
            if pattern is not None and not pattern.match(act_name):
                continue
            registered.append((act_name, act_impl))
    return registered
101
+
102
+
103
def discover_dependencies(name: str | None) -> _Dependencies | None:
    """Load dependencies from the registry exposed as an entry point.

    Args:
        name: key of the dependencies to pick in the registry. When falsy, the
            registry must hold exactly one entry, which is returned.

    Returns:
        the selected dependencies, or ``None`` when no registry entry point exists
        and ``name`` is ``None``.

    Raises:
        LookupError: if ``name`` is given but no registry entry point exists, or
            if ``name`` is missing from the registry.
        ValueError: if several registry entry points are found, or if no name is
            given while the registry is empty or holds several entries.
    """
    registries = entry_points(name=_DEPENDENCIES, group=_DEPENDENCIES_GROUPS)
    if not registries:
        if name is None:
            return None
        available_impls = entry_points(group=_DEPENDENCIES_GROUPS)
        raise LookupError(
            f'failed to find dependency: "{name}", '
            f"available dependencies: {available_impls}"
        )
    if len(registries) > 1:
        raise ValueError(
            f'found multiple dependencies for name "{name}": {registries}'
        )
    registry = registries[_DEPENDENCIES].load()
    if name:
        try:
            return registry[name]
        except KeyError as e:
            known = list(registry)
            raise LookupError(
                f'failed to find dependency for name "{name}", available dependencies: '
                f"{known}"
            ) from e
    # No name provided: the choice is only unambiguous for a single entry registry
    if not registry:
        raise ValueError("empty dependency registry !")
    if len(registry) > 1:
        quoted = ", ".join('"' + d + '"' for d in registry)
        raise ValueError(
            f"dependency registry contains multiples entries {quoted},"
            f" please select one by providing a name"
        )
    (only_deps,) = registry.values()
    return only_deps
138
+
139
+
140
def _parse_wf_name(wf_type: type) -> str:
    """Return the name found in the temporal definition attached to a workflow.

    Args:
        wf_type: registered workflow implementation, expected to be a class
            carrying a ``__temporal_workflow_definition`` attribute.

    Raises:
        TypeError: if ``wf_type`` is not a class.
        ValueError: if the class has no workflow definition or if the definition
            name is ``None``.
    """
    expectation = (
        "expected registered workflow implementation to be a temporal workflow"
        " decorated with @workflow.defn(name=<name>) class, found: "
    )
    if not isinstance(wf_type, type):
        raise TypeError(f"{expectation}{type(wf_type)}")
    definition = getattr(wf_type, "__temporal_workflow_definition", None)
    if definition is None:
        raise ValueError(f"{expectation}{wf_type}")
    name = definition.name
    if name is None:
        raise ValueError(
            "missing workflow definition name, please register your workflow"
            " with an explicit name: @workflow.defn(name=<name>)"
        )
    return name
162
+
163
+
164
def _parse_activity_name(act: Activity) -> str:
    """Return the name found in the temporal definition attached to an activity.

    Args:
        act: registered activity implementation, expected to carry a
            ``__temporal_activity_definition`` attribute.

    Raises:
        ValueError: if ``act`` has no activity definition or if the definition
            name is ``None``.
    """
    act_defn = getattr(act, "__temporal_activity_definition", None)
    if act_defn is None:
        msg = (
            f"expected registered activity implementation to be a temporal activity"
            f" decorated with @activity.defn(name=<name>), found: {act}"
        )
        raise ValueError(msg)
    if act_defn.name is None:
        msg = (
            "missing activity definition name, please register your activities"
            " with an explicit name: @activity.defn(name=<name>)"
        )
        raise ValueError(msg)
    return act_defn.name
@@ -1,9 +1,9 @@
1
1
  import logging
2
- from collections.abc import Callable
2
+ from collections.abc import Awaitable, Callable
3
3
  from dataclasses import dataclass
4
4
  from datetime import UTC, datetime
5
5
  from enum import StrEnum, unique
6
- from typing import Any, Literal, Self
6
+ from typing import Any, Literal, Self, TypeVar
7
7
 
8
8
  from temporalio import workflow
9
9
 
@@ -23,14 +23,26 @@ from pydantic.main import IncEx
23
23
  logger = logging.getLogger(__name__)
24
24
 
25
25
 
26
+ T = TypeVar("T")
27
+ Predicate = Callable[[T], bool] | Callable[[T], Awaitable[bool]]
28
+
29
+
26
30
  class BaseModel(_BaseModel):
27
31
  model_config = merge_configs(icij_config(), no_enum_values_config())
28
32
 
29
33
 
34
+ class BasePayload(_BaseModel):
35
+ model_config = icij_config()
36
+
37
+
30
38
  class DatashareModel(BaseModel):
31
39
  model_config = merge_configs(BaseModel.model_config, lowercamel_case_config())
32
40
 
33
41
 
42
+ class LowerCamelCaseModel(_BaseModel):
43
+ model_config = merge_configs(icij_config(), lowercamel_case_config())
44
+
45
+
34
46
  @unique
35
47
  class TaskState(StrEnum):
36
48
  CREATED = "CREATED"
@@ -135,12 +147,14 @@ class TaskGroup:
135
147
  class Document(DatashareModel):
136
148
  id: str
137
149
  root_document: str
138
- content: str
139
150
  language: str
151
+ content: str | None = None
152
+ content_type: str | None = None
140
153
  tags: list[str] = Field(default_factory=list)
141
154
  content_translated: dict[str, str] = Field(
142
155
  default_factory=dict, alias="content_translated"
143
156
  )
157
+ type: str = Field(default="Document", frozen=True)
144
158
 
145
159
  @classmethod
146
160
  def from_es(cls, es_doc: dict) -> Self:
@@ -71,12 +71,12 @@ def _update_pyproject_toml(
71
71
  ) -> dict[str, Any]:
72
72
  pyproject_toml = deepcopy(pyproject_toml)
73
73
 
74
- pyproject_toml["tool"]["uv"].pop("sources")
75
- pyproject_toml["tool"]["uv"].pop("index")
74
+ pyproject_toml["tool"]["uv"].pop("sources", None)
75
+ pyproject_toml["tool"]["uv"].pop("index", None)
76
76
 
77
77
  project = pyproject_toml["project"]
78
78
  project["authors"] = []
79
- project.pop("urls")
79
+ project.pop("urls", None)
80
80
  project["dependencies"] = sorted(
81
81
  d
82
82
  for d in project["dependencies"]
@@ -87,7 +87,7 @@ def _update_pyproject_toml(
87
87
  for d in project["dependencies"]
88
88
  if any(d.startswith(base) for base in _BASE_DEPS)
89
89
  )
90
- project.pop("optional-dependencies")
90
+ project.pop("optional-dependencies", None)
91
91
 
92
92
  entry_points = project["entry-points"]
93
93
 
@@ -105,4 +105,11 @@ def _update_pyproject_toml(
105
105
  i if i != "worker_template" else package_name for i in hatch_sdist["packages"]
106
106
  ]
107
107
 
108
+ hatch_sdist = pyproject_toml["tool"]["hatch"]["build"]["targets"]["sdist"]
109
+ if "only-include" in hatch_sdist:
110
+ hatch_sdist["only-include"] = [
111
+ i if i != "worker_template" else package_name
112
+ for i in hatch_sdist["only-include"]
113
+ ]
114
+
108
115
  return pyproject_toml