datashare-python 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,9 @@
1
1
  from pathlib import Path
2
- from typing import Annotated, Literal
2
+ from typing import Literal
3
3
 
4
4
  from icij_common.es import ESClient
5
5
  from icij_common.pydantic_utils import ICIJSettings
6
- from pydantic import Field, PrivateAttr
6
+ from pydantic import PrivateAttr
7
7
  from pydantic_settings import SettingsConfigDict
8
8
  from temporalio.contrib.pydantic import PydanticJSONPlainPayloadConverter, ToJsonOptions
9
9
  from temporalio.converter import (
@@ -83,12 +83,14 @@ class LoggingConfig(BaseModel):
83
83
  loggers: dict[str, LogLevel]
84
84
 
85
85
 
86
+ _DEFAULT_LOGGERS = {datashare_python.__name__: "INFO"}
87
+ _DEFAULT_LOGGING_CONFIG = LoggingConfig(log_in_json=True, loggers=_DEFAULT_LOGGERS)
88
+
89
+
86
90
  class WorkerConfig(ICIJSettings, BaseModel):
87
91
  model_config = DS_WORKER_SETTINGS_CONFIG
88
92
 
89
- logging: Annotated[LoggingConfig, Field(frozen=True)] = {
90
- datashare_python.__name__: "INFO"
91
- }
93
+ logging: LoggingConfig = _DEFAULT_LOGGING_CONFIG
92
94
 
93
95
  datashare: DatashareClientConfig = DatashareClientConfig()
94
96
  elasticsearch: ESClientConfig = ESClientConfig()
@@ -2,11 +2,7 @@ import logging
2
2
  import sys
3
3
  from copy import copy
4
4
 
5
- from icij_common.logging_utils import (
6
- DATE_FMT,
7
- STREAM_HANDLER_FMT,
8
- STREAM_HANDLER_FMT_WITH_WORKER_ID,
9
- )
5
+ from icij_common.logging_utils import DATE_FMT, STREAM_HANDLER_FMT
10
6
  from pythonjsonlogger.core import RESERVED_ATTRS, BaseJsonFormatter
11
7
  from pythonjsonlogger.orjson import OrjsonFormatter
12
8
  from temporalio import activity, workflow
@@ -26,6 +22,11 @@ _LOGGED_ATTRIBUTES = (
26
22
  )
27
23
 
28
24
 
25
+ _STREAM_HANDLER_FMT_WITH_WORKER_ID = (
26
+ "[%(levelname)s][%(asctime)s.%(msecs)03d][%(worker_id)s][%(name)s]: %(message)s"
27
+ )
28
+
29
+
29
30
  def setup_worker_loggers(
30
31
  loggers: dict[str, LogLevel], *, worker_id: str | None, in_json: bool
31
32
  ) -> None:
@@ -35,35 +36,18 @@ def setup_worker_loggers(
35
36
  logger = logging.getLogger(logger_name)
36
37
  logger.setLevel(level)
37
38
  logger.handlers = []
38
- for handler in _get_worker_handlers(level, worker_id, in_json=in_json):
39
+ for handler in _get_worker_handlers(level, worker_filter, in_json=in_json):
39
40
  logger.addHandler(handler)
40
- logger.addFilter(worker_filter)
41
-
42
-
43
- def _get_worker_handlers(
44
- level: int, worker_id: str | None, *, in_json: bool
45
- ) -> list[logging.Handler]:
46
- stream_handler = logging.StreamHandler(sys.stderr)
47
- if in_json:
48
- fmt = _json_formatter(datefmt=DATE_FMT)
49
- else:
50
- if worker_id is not None:
51
- fmt = STREAM_HANDLER_FMT_WITH_WORKER_ID
52
- else:
53
- fmt = STREAM_HANDLER_FMT
54
- fmt = logging.Formatter(fmt, DATE_FMT)
55
- stream_handler.setFormatter(fmt)
56
- stream_handler.setLevel(level)
57
- return [stream_handler]
58
41
 
59
42
 
60
43
  class WorkerFilter(logging.Filter):
61
- def __init__(self, worker_id: str) -> None:
44
+ def __init__(self, worker_id: str | None) -> None:
62
45
  super().__init__()
63
- self._worker_id = worker_id
46
+ self.worker_id = worker_id
64
47
 
65
48
  def filter(self, record: logging.LogRecord) -> bool:
66
- record.worker_id = self._worker_id
49
+ if self.worker_id is not None:
50
+ record.worker_id = self.worker_id
67
51
  if workflow.in_workflow():
68
52
  wf_info = workflow.info()
69
53
  for attr in _WF_LOGGED_ATTRS:
@@ -79,6 +63,24 @@ class WorkerFilter(logging.Filter):
79
63
  return True
80
64
 
81
65
 
66
+ def _get_worker_handlers(
67
+ level: int, worker_filter: WorkerFilter, *, in_json: bool
68
+ ) -> list[logging.Handler]:
69
+ stream_handler = logging.StreamHandler(sys.stderr)
70
+ if in_json:
71
+ fmt = _json_formatter(datefmt=DATE_FMT)
72
+ else:
73
+ if worker_filter.worker_id is not None:
74
+ fmt = _STREAM_HANDLER_FMT_WITH_WORKER_ID
75
+ else:
76
+ fmt = STREAM_HANDLER_FMT
77
+ fmt = logging.Formatter(fmt, DATE_FMT)
78
+ stream_handler.setFormatter(fmt)
79
+ stream_handler.setLevel(level)
80
+ stream_handler.addFilter(worker_filter)
81
+ return [stream_handler]
82
+
83
+
82
84
  def _json_formatter(datefmt: str) -> BaseJsonFormatter:
83
85
  fmt = OrjsonFormatter( # let's keep logging as fast as possible
84
86
  _LOGGED_ATTRIBUTES, datefmt=datefmt
datashare_python/utils.py CHANGED
@@ -1,12 +1,10 @@
1
1
  import asyncio
2
+ import contextlib
3
+ import contextvars
2
4
  import inspect
3
5
  import json
4
- import logging
5
- import sys
6
- from collections.abc import (
7
- Callable,
8
- Coroutine,
9
- )
6
+ import threading
7
+ from collections.abc import Awaitable, Callable, Coroutine
10
8
  from copy import deepcopy
11
9
  from dataclasses import dataclass
12
10
  from datetime import timedelta
@@ -20,15 +18,6 @@ from uuid import uuid4
20
18
 
21
19
  import nest_asyncio
22
20
  import temporalio
23
- from icij_common.logging_utils import (
24
- DATE_FMT,
25
- STREAM_HANDLER_FMT,
26
- STREAM_HANDLER_FMT_WITH_WORKER_ID,
27
- WorkerIdFilter,
28
- )
29
- from icij_common.pydantic_utils import get_field_default_value
30
- from pydantic.fields import FieldInfo
31
- from pythonjsonlogger.json import JsonFormatter
32
21
  from temporalio import activity, workflow
33
22
  from temporalio.client import Client, WorkflowHandle
34
23
  from temporalio.common import RetryPolicy, SearchAttributeKey
@@ -123,6 +112,7 @@ async def execute_activity(
123
112
  *,
124
113
  args: list | None = None,
125
114
  start_to_close_timeout: timedelta | None = None,
115
+ heartbeat_timeout: timedelta = timedelta(minutes=1),
126
116
  retry_policy: temporalio.common.RetryPolicy | None = None,
127
117
  ) -> Any:
128
118
  if args is None:
@@ -135,6 +125,7 @@ async def execute_activity(
135
125
  start_to_close_timeout=start_to_close_timeout,
136
126
  task_queue=task_queue,
137
127
  retry_policy=retry_policy,
128
+ heartbeat_timeout=heartbeat_timeout,
138
129
  )
139
130
 
140
131
 
@@ -150,6 +141,8 @@ async def progress_handler(
150
141
  activity_id=activity_id, run_id=run_id, progress=progress, weight=weight
151
142
  )
152
143
  await handle.signal("update_progress", signal)
144
+ with contextlib.suppress(RuntimeError, asyncio.TimeoutError):
145
+ activity.heartbeat()
153
146
 
154
147
 
155
148
  def get_activity_progress_handler_async(
@@ -229,6 +222,74 @@ def with_progress(weight: float = 1.0) -> Callable[P, T]:
229
222
  return decorator
230
223
 
231
224
 
225
+ def with_async_heartbeat(
226
+ activity_fn: Callable[P, Awaitable[T]], n_missed_before_timeout: int
227
+ ) -> Callable[P, Awaitable[T]]:
228
+ # Copied from
229
+ # https://github.com/temporalio/samples-python/blob/main/custom_decorator/activity_utils.py
230
+ @wraps(activity_fn)
231
+ async def wrapper(*args, **kwargs) -> T:
232
+ heartbeat_timeout = activity.info().heartbeat_timeout
233
+ heartbeat_task = None
234
+ if heartbeat_timeout:
235
+ period = heartbeat_timeout.total_seconds() / n_missed_before_timeout
236
+ heartbeat_task = asyncio.create_task(_async_heartbeat_every(period))
237
+ try:
238
+ activity.heartbeat()
239
+ return await activity_fn(*args, **kwargs)
240
+ finally:
241
+ if heartbeat_task:
242
+ heartbeat_task.cancel()
243
+ await asyncio.wait([heartbeat_task])
244
+
245
+ return wrapper
246
+
247
+
248
+ async def _async_heartbeat_every(period: float, *details: Any) -> None:
249
+ with contextlib.suppress(RuntimeError, asyncio.TimeoutError):
250
+ activity.heartbeat(*details)
251
+ while True:
252
+ await asyncio.sleep(period)
253
+ with contextlib.suppress(RuntimeError, asyncio.TimeoutError):
254
+ activity.heartbeat(*details)
255
+
256
+
257
+ def with_sync_heartbeat(
258
+ activity_fn: Callable[P, T], n_missed_before_timeout: int
259
+ ) -> Callable[P, T]:
260
+ @wraps(activity_fn)
261
+ def wrapper(*args, **kwargs) -> T:
262
+ heartbeat_timeout = activity.info().heartbeat_timeout
263
+ heartbeat_thread, stop_event = None, None
264
+ if heartbeat_timeout:
265
+ period = heartbeat_timeout.total_seconds() / n_missed_before_timeout
266
+ ctx = contextvars.copy_context()
267
+ run_args = (_sync_heartbeat_every, period, threading.Event())
268
+ heartbeat_thread, stop_event = (
269
+ threading.Thread(target=ctx.run, args=run_args),
270
+ run_args[-1],
271
+ )
272
+ heartbeat_thread.start()
273
+ try:
274
+ return activity_fn(*args, **kwargs)
275
+ finally:
276
+ if heartbeat_thread:
277
+ stop_event.set()
278
+ heartbeat_thread.join()
279
+
280
+ return wrapper
281
+
282
+
283
+ def _sync_heartbeat_every(
284
+ period: float, stop_event: threading.Event, *details: Any
285
+ ) -> None:
286
+ with contextlib.suppress(RuntimeError, asyncio.TimeoutError):
287
+ activity.heartbeat(*details)
288
+ while not stop_event.wait(period):
289
+ with contextlib.suppress(RuntimeError, asyncio.TimeoutError):
290
+ activity.heartbeat(*details)
291
+
292
+
232
293
  def positional_args_only(activity_fn: Callable[P, T]) -> Callable[P, T]:
233
294
  sig = inspect.signature(activity_fn)
234
295
 
@@ -336,6 +397,7 @@ def activity_defn(
336
397
  name: str,
337
398
  progress_weight: float = 1.0,
338
399
  retriables: set[type[Exception]] = None,
400
+ n_missed_heartbeats_before_timeout: int = 5,
339
401
  ) -> Callable[[Callable[P, T]], Callable[P, T]]:
340
402
  def decorator(activity_fn: Callable[P, T]) -> Callable[P, T]:
341
403
  # TODO: some of these could probably be reimplemented more elegantly using
@@ -344,6 +406,15 @@ def activity_defn(
344
406
  activity_fn = with_retriables(retriables)(activity_fn)
345
407
  if supports_progress(activity_fn):
346
408
  activity_fn = with_progress(progress_weight)(activity_fn)
409
+ is_async = asyncio.iscoroutinefunction(activity_fn)
410
+ if is_async:
411
+ activity_fn = with_async_heartbeat(
412
+ activity_fn, n_missed_heartbeats_before_timeout
413
+ )
414
+ else:
415
+ activity_fn = with_sync_heartbeat(
416
+ activity_fn, n_missed_heartbeats_before_timeout
417
+ )
347
418
  activity_fn = activity.defn(activity_fn, name=name)
348
419
  return activity_fn
349
420
 
@@ -382,59 +453,6 @@ def to_scaled_progress(
382
453
  return _scaled
383
454
 
384
455
 
385
- class LogWithWorkerIDMixin:
386
- def setup_loggers(self, worker_id: str | None = None) -> None:
387
- # Ugly work around the Pydantic V1 limitations...
388
- all_loggers = self.loggers
389
- if isinstance(all_loggers, FieldInfo):
390
- all_loggers = get_field_default_value(all_loggers)
391
- all_loggers.append(__name__)
392
- loggers = sorted(set(all_loggers))
393
- log_level = self.log_level
394
- if isinstance(log_level, FieldInfo):
395
- log_level = get_field_default_value(log_level)
396
- force_warning = getattr(self, "force_warning_loggers", [])
397
- if isinstance(force_warning, FieldInfo):
398
- force_warning = get_field_default_value(force_warning)
399
- force_warning = set(force_warning)
400
- worker_id_filter = None
401
- if worker_id is not None:
402
- worker_id_filter = WorkerIdFilter(worker_id)
403
- handlers = self._handlers(worker_id_filter, log_level)
404
- for logger_ in loggers:
405
- logger_ = logging.getLogger(logger_) # noqa: PLW2901
406
- level = getattr(logging, log_level)
407
- if logger_.name in force_warning:
408
- level = max(logging.WARNING, level)
409
- logger_.setLevel(level)
410
- logger_.handlers = []
411
- for handler in handlers:
412
- logger_.addHandler(handler)
413
-
414
- def _handlers(
415
- self, worker_id_filter: logging.Filter | None, log_level: int
416
- ) -> list[logging.Handler]:
417
- stream_handler = logging.StreamHandler(sys.stderr)
418
- if worker_id_filter is not None:
419
- fmt = STREAM_HANDLER_FMT_WITH_WORKER_ID
420
- else:
421
- fmt = STREAM_HANDLER_FMT
422
- log_in_json = getattr(self, "log_in_json", False)
423
- if isinstance(log_in_json, FieldInfo):
424
- log_in_json = get_field_default_value(log_in_json)
425
- if log_in_json:
426
- fmt = JsonFormatter(fmt, DATE_FMT)
427
- else:
428
- fmt = logging.Formatter(fmt, DATE_FMT)
429
- stream_handler.setFormatter(fmt)
430
- handlers = [stream_handler]
431
- for handler in handlers:
432
- if worker_id_filter is not None:
433
- handler.addFilter(worker_id_filter)
434
- handler.setLevel(log_level)
435
- return handlers
436
-
437
-
438
456
  def safe_dir(doc_id: str) -> Path:
439
457
  if len(doc_id) < 4:
440
458
  raise ValueError(f"expected doc_id to be at least 4, found {doc_id}")
Binary file
@@ -11,7 +11,12 @@ from contextlib import asynccontextmanager
11
11
  from copy import copy
12
12
  from typing import Any
13
13
 
14
- from temporalio.worker import PollerBehaviorSimpleMaximum, Worker
14
+ from temporalio.worker import (
15
+ PollerBehaviorSimpleMaximum,
16
+ UnsandboxedWorkflowRunner,
17
+ Worker,
18
+ )
19
+ from temporalio.worker.workflow_sandbox import SandboxedWorkflowRunner
15
20
 
16
21
  from .config import WorkerConfig
17
22
  from .dependencies import with_dependencies
@@ -62,6 +67,7 @@ def datashare_worker(
62
67
  # Scale horizontally be default for activities, each worker processes one activity
63
68
  # at a time
64
69
  max_concurrent_io_activities: int = 10,
70
+ sandboxed: bool = True,
65
71
  ) -> DatashareWorker:
66
72
  if workflows is None:
67
73
  workflows = []
@@ -86,6 +92,7 @@ def datashare_worker(
86
92
  if workflows:
87
93
  logger.warning(_SEPARATE_IO_AND_CPU_WORKERS)
88
94
  interceptors = [TraceContextInterceptor()]
95
+ wf_runner = SandboxedWorkflowRunner() if sandboxed else UnsandboxedWorkflowRunner()
89
96
  return DatashareWorker(
90
97
  client,
91
98
  interceptors=interceptors,
@@ -101,6 +108,7 @@ def datashare_worker(
101
108
  # Workflow tasks are assumed to be very lightweight and fast we can reserve
102
109
  # several of them
103
110
  workflow_task_poller_behavior=PollerBehaviorSimpleMaximum(5),
111
+ workflow_runner=wf_runner,
104
112
  )
105
113
 
106
114
 
@@ -144,6 +152,7 @@ async def worker_context(
144
152
  event_loop: AbstractEventLoop,
145
153
  task_queue: str,
146
154
  dependencies: list[ContextManagerFactory] | None = None,
155
+ sandboxed: bool = True,
147
156
  ) -> AsyncGenerator[DatashareWorker, None]:
148
157
  discovered = []
149
158
  if activities is not None:
@@ -185,6 +194,7 @@ async def worker_context(
185
194
  activities=acts,
186
195
  task_queue=task_queue,
187
196
  max_concurrent_io_activities=worker_config.max_concurrent_io_activities,
197
+ sandboxed=sandboxed,
188
198
  )
189
199
  async with worker:
190
200
  yield worker
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datashare-python
3
- Version: 0.7.1
4
- Summary: Manage Pythoœn tasks and local resources in Datashare
3
+ Version: 0.7.3
4
+ Summary: Manage Python tasks and local resources in Datashare
5
5
  Project-URL: Homepage, https://icij.github.io/datashare-python/
6
6
  Project-URL: Documentation, https://icij.github.io/datashare-python/
7
7
  Project-URL: Repository, https://github.com/ICIJ/datashare-python
@@ -1,27 +1,27 @@
1
1
  datashare_python/.gitignore,sha256=e-SRgnvGGdsjRrqgKsTzALz6Obx8IYiOjr0yaAxT6v8,22
2
2
  datashare_python/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  datashare_python/__main__.py,sha256=g-fvS46zl9umKmGrSpl-OG-8PSuZgjqvTCqjpsZtSps,101
4
- datashare_python/config.py,sha256=Q4iu3ZGaQB7npaHJWclxPIfgzZTf_-8VxxhXrB9nlpE,3928
4
+ datashare_python/config.py,sha256=_Cx4EB1yHXXcLUtw1OBlMk2SKcJRwxqJwgRu6klbxNg,3994
5
5
  datashare_python/conftest.py,sha256=MrmQKFcUipm_qn-cHsLovZMwMMtVxyK0s1lmKEx54bc,8651
6
6
  datashare_python/constants.py,sha256=a8-ceZKBVMXydcoNQ35fSjFjxeJ7dt-N6eAvqtPpf9g,320
7
7
  datashare_python/dependencies.py,sha256=KJuAp6Dmv8DQuFnGjbWiHu7StzZj97eBPDyZ_RfCQRc,4141
8
8
  datashare_python/discovery.py,sha256=BPB_Ak6d1-vcf9vAQA63IRb2U8h83_mIIi8MbKbFzQ0,7020
9
9
  datashare_python/exceptions.py,sha256=bVHEAXxDPKfxeeMC0hJXEsrJkgsKO2ESAhxWU96GA4M,496
10
10
  datashare_python/interceptors.py,sha256=Pl7GodPO4KbfflmacpW-vOUgLazjlXSlDNENbpOUt1c,6725
11
- datashare_python/logging_.py,sha256=-qHz4ztKz4mOCO2z4wunQ4M3xoVhztNvxaiozuMLFRM,2815
11
+ datashare_python/logging_.py,sha256=XUhZTtofbOqJi1gwytYpUVqvoGPhoz5p2orsXs2FaWs,2968
12
12
  datashare_python/objects.py,sha256=pE0DGNNkl1etxz5ed7T-EaGo1o9TONjH2Lg9u1qdAWU,7571
13
13
  datashare_python/task_client.py,sha256=oTmP8bvZW0UyhLNMi1AV3XIAx7hrdbxNRss2Mw2azEc,8435
14
14
  datashare_python/template.py,sha256=RxKTYLXoS_EQ8Jc41JkBXppPdbCFqDWfP3BmC0gvB5o,4024
15
15
  datashare_python/types_.py,sha256=9Hk1XqpdXbM1TnEzwvJ5G9ABbaCZW9KgBTtiPBVn_7k,649
16
- datashare_python/utils.py,sha256=inVjtlBbgL88mN0UM73SSzW76koTW5MGC0NlyopqRW4,17412
17
- datashare_python/worker-template.tar.gz,sha256=gNSDvn4Lh8iFpFk6j8nlu7pJoIDLh7SVa6EHcjXrj54,286805
18
- datashare_python/worker.py,sha256=1FdmwYKWKYUKteTM3RC6kFQHR02q8NUDe91hv68QPEo,7207
16
+ datashare_python/utils.py,sha256=gX3_RJEJS0sAYBNVfBLoWJu7_hIANhylSAohzXVW-yQ,17982
17
+ datashare_python/worker-template.tar.gz,sha256=bOqoF6xVJRyFQaRYHIXXre31WYdmEqDLGeiXRr4Inqg,287091
18
+ datashare_python/worker.py,sha256=czrN9Z0fPFX-6KHinX8Orx4vb9tpta2e7Qs6H0NiYyE,7534
19
19
  datashare_python/cli/__init__.py,sha256=9BPWtssDgsVfWMsZ1TtZCla0EC_kai4RHttr8oNLYOE,1401
20
20
  datashare_python/cli/project.py,sha256=w32Gy9AOL5B00uDT4in7YUCt2g68FnNbvwg2M3a8G6o,946
21
21
  datashare_python/cli/task.py,sha256=8mvKGS21bZ14BgZ0Uo-dfameljkaI2ZBha80ywCy-E8,5822
22
22
  datashare_python/cli/utils.py,sha256=p69CQb0zfixuyBkiZprhdMCc_NuYwXyAn6vC9H1UzAw,911
23
23
  datashare_python/cli/worker.py,sha256=I4KTpFIpXFowioFn72Rm6LBCYlY-Dhp4NBIPvtRgUXE,5283
24
- datashare_python-0.7.1.dist-info/METADATA,sha256=FF5E62UhxGi0zBseyb4b_6TQRl2JtDZlA1wLECoTblg,923
25
- datashare_python-0.7.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
26
- datashare_python-0.7.1.dist-info/entry_points.txt,sha256=ILE7auxabHWiu3GC-AunWnzjhOI_SbZp7D4GqZHlLw4,68
27
- datashare_python-0.7.1.dist-info/RECORD,,
24
+ datashare_python-0.7.3.dist-info/METADATA,sha256=sZEiq4mFYgmvkYuJOO6KWePleBYWDex722rVR3FSm3I,921
25
+ datashare_python-0.7.3.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
26
+ datashare_python-0.7.3.dist-info/entry_points.txt,sha256=ILE7auxabHWiu3GC-AunWnzjhOI_SbZp7D4GqZHlLw4,68
27
+ datashare_python-0.7.3.dist-info/RECORD,,