datatailr 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of datatailr has been flagged as potentially problematic.
- datatailr/__init__.py +1 -35
- datatailr/acl.py +35 -3
- datatailr/blob.py +13 -13
- datatailr/build/image.py +38 -2
- datatailr/dt_json.py +32 -0
- datatailr/errors.py +17 -0
- datatailr/group.py +19 -13
- datatailr/logging.py +27 -10
- datatailr/sbin/datatailr_run.py +147 -0
- datatailr/sbin/datatailr_run_app.py +28 -0
- datatailr/sbin/{run_job.py → datatailr_run_batch.py} +5 -20
- datatailr/scheduler/__init__.py +24 -8
- datatailr/scheduler/arguments_cache.py +71 -43
- datatailr/scheduler/base.py +195 -69
- datatailr/scheduler/batch.py +141 -19
- datatailr/scheduler/batch_decorator.py +53 -24
- datatailr/scheduler/constants.py +1 -1
- datatailr/scheduler/schedule.py +117 -0
- datatailr/scheduler/utils.py +3 -1
- datatailr/user.py +30 -17
- datatailr/utils.py +20 -0
- datatailr/wrapper.py +0 -6
- {datatailr-0.1.6.dist-info → datatailr-0.1.8.dist-info}/METADATA +37 -4
- datatailr-0.1.8.dist-info/RECORD +30 -0
- datatailr-0.1.8.dist-info/entry_points.txt +4 -0
- datatailr-0.1.8.dist-info/top_level.txt +1 -0
- datatailr-0.1.6.dist-info/RECORD +0 -29
- datatailr-0.1.6.dist-info/entry_points.txt +0 -2
- datatailr-0.1.6.dist-info/top_level.txt +0 -2
- test_module/__init__.py +0 -17
- test_module/test_submodule.py +0 -38
- {datatailr-0.1.6.dist-info → datatailr-0.1.8.dist-info}/WHEEL +0 -0
- {datatailr-0.1.6.dist-info → datatailr-0.1.8.dist-info}/licenses/LICENSE +0 -0
datatailr/scheduler/batch.py
CHANGED

```diff
@@ -14,9 +14,11 @@ import contextvars
 import json
 import os
 from functools import reduce
-from typing import Dict, List, Optional, Sequence, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
+import uuid
 
 from datatailr import Image
+from datatailr.dt_json import encode_json
 from datatailr.errors import BatchJobError
 from datatailr.logging import DatatailrLogger
 from datatailr.scheduler.base import (
@@ -29,9 +31,12 @@ from datatailr.scheduler.base import (
     User,
 )
 from datatailr.scheduler.constants import DEFAULT_TASK_CPU, DEFAULT_TASK_MEMORY
+from datatailr.scheduler.arguments_cache import ArgumentsCache
+from datatailr.scheduler.schedule import Schedule
 from datatailr.utils import is_dt_installed
 
 __DAG_CONTEXT__: contextvars.ContextVar = contextvars.ContextVar("dag_context")
+__ARGUMENTS_CACHE__ = ArgumentsCache()
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
 
 
@@ -39,13 +44,6 @@ def get_current_manager():
     return __DAG_CONTEXT__.get(None)
 
 
-def next_batch_job_id():
-    i = 0
-    while True:
-        yield i
-        i += 1
-
-
 class CyclicDependencyError(BatchJobError):
     """
     Exception raised when a cyclic dependency is detected in the batch job dependencies.
@@ -79,6 +77,12 @@ class MissingDagError(BatchJobError):
         )
 
 
+class CodePackageMismatchError(BatchJobError):
+    def __init__(self, message: str):
+        super().__init__(message)
+        self.message = message
+
+
 class BatchJob:
     """
     Represents a job within a batch job.
@@ -93,6 +97,7 @@ class BatchJob:
         resources: Optional[Resources] = None,
         dependencies: Sequence[Union[str, BatchJob]] = [],
         dag: Optional[Batch] = get_current_manager(),
+        argument_mapping: Dict[str, str] = {},
     ):
         self.name = name
         self.entrypoint = entrypoint
@@ -102,12 +107,14 @@ class BatchJob:
             raise MissingDagError()
         self.__id = dag.next_job_id
         self.dag = dag
+        self.__args: Dict[str, Any] = {}
         self.dag.__BATCH_JOB_NAMES__[self.name] = self.__id
         self.dependencies = self.translate_dependencies()
         assert all(
             isinstance(dep, int) for dep in self.dependencies
         ), "All dependencies must be integers representing job IDs."
         self.dag.add_job(self)
+        self.__argument_mapping = argument_mapping or {}
 
     def __call__(self, *args, **kwds) -> BatchJob:
         """
@@ -116,6 +123,22 @@ class BatchJob:
         """
         return self
 
+    @property
+    def args(self) -> Dict[str, Any]:
+        """
+        Returns the arguments for the BatchJob instance.
+        """
+        return self.__args or {}
+
+    @args.setter
+    def args(self, args: Dict[str, Any]):
+        """
+        Sets the arguments for the BatchJob instance.
+        """
+        if not isinstance(args, dict):
+            raise TypeError(f"Expected a dictionary for args, got {type(args)}")
+        self.__args = args
+
     @property
     def id(self) -> int:
         """
@@ -123,7 +146,7 @@ class BatchJob:
         """
         return self.__id
 
-    def alias(self, name: str):
+    def alias(self, name: str) -> BatchJob:
         """
         Set an alias for the BatchJob instance.
 
```
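For orientation, a hedged sketch of how the new `args` property and `argument_mapping` parameter appear to combine, based only on the signatures in this diff; the `Batch` constructor call follows the widened `__init__` shown further down, and the job bodies are illustrative placeholders, not verified usage:

```python
from datatailr.scheduler.base import EntryPoint, JobType
from datatailr.scheduler.batch import Batch, BatchJob

def load():  # placeholder job body
    return [1, 2, 3]

def transform(data):  # placeholder job body
    return sum(data)

dag = Batch(name="example")
load_job = BatchJob(name="load", entrypoint=EntryPoint(JobType.BATCH, func=load), dag=dag)
# argument_mapping renames an upstream dependency when it is recorded in args:
# the "load" job lands under the key "data" instead of its own name.
transform_job = BatchJob(
    name="transform",
    entrypoint=EntryPoint(JobType.BATCH, func=transform),
    dag=dag,
    argument_mapping={"load": "data"},
)
transform_job << load_job   # __add_dependency__ appears to store load_job in args["data"]
print(transform_job.args)   # expected shape: {"data": <BatchJob load>}
```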
```diff
@@ -136,19 +159,48 @@ class BatchJob:
         self.name = name
         return self
 
+    def set_resources(
+        self,
+        resources: Optional[Resources] = None,
+        memory: Optional[str] = None,
+        cpu: Optional[float] = None,
+    ) -> BatchJob:
+        """
+        Set the resources for the BatchJob instance.
+
+        :param resources: The Resources instance to set.
+        """
+        if resources is not None:
+            if not isinstance(resources, Resources):
+                raise TypeError(f"Expected Resources instance, got {type(resources)}")
+        else:
+            resources = Resources(
+                memory=memory or DEFAULT_TASK_MEMORY, cpu=cpu or DEFAULT_TASK_CPU
+            )
+        self.resources = resources
+        return self
+
     def __repr__(self):
         return (
             f"BatchJob(name={self.name}, entrypoint={self.entrypoint}, "
             f"resources={self.resources}) (id={self.__id})"
         )
 
+    def __getstate__(self) -> object:
+        state = self.__dict__.copy()
+        state.pop("dag", None)
+        return state
+
+    def __setstate__(self, state: dict):
+        self.__dict__.update(state)
+
     def to_dict(self):
         """
         Convert the BatchJob instance to a dictionary representation.
         """
         return {
             "display_name": self.name,
-            "
+            "child_number": self.__id,
             "entrypoint": str(self.entrypoint),
             "memory": self.resources.memory if self.resources else DEFAULT_TASK_MEMORY,
             "cpu": self.resources.cpu if self.resources else DEFAULT_TASK_CPU,
@@ -183,6 +235,9 @@ class BatchJob:
 
     def __add_dependency__(self, other):
         self.dependencies.add(other.__id)
+        arg_name = self.__argument_mapping.get(other.name, other.name)
+        if arg_name is not None:
+            self.__args[arg_name] = other
 
     def __lshift__(
         self, other: Sequence[BatchJob] | BatchJob
```
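Since both `alias` and the new `set_resources` return the job itself, they chain; a small usage sketch, where `dag` and `extract_entrypoint` are assumed to exist as in the earlier example:

```python
from datatailr.scheduler.base import Resources
from datatailr.scheduler.batch import BatchJob

# `dag` and `extract_entrypoint` are assumed, as in the earlier sketch.
job = (
    BatchJob(name="extract", entrypoint=extract_entrypoint, dag=dag)
    .alias("extract-daily")
    .set_resources(memory="2g", cpu=2.0)  # builds a Resources from the parts
)

# An explicit Resources instance is accepted too; any other type raises TypeError.
job.set_resources(resources=Resources(memory="500m", cpu=0.5))
```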
```diff
@@ -223,7 +278,13 @@ class BatchJob:
         Execute the job's entrypoint.
         """
         if isinstance(self.entrypoint, EntryPoint):
-
+            env = {
+                "DATATAILR_BATCH_ID": str(self.dag.id),
+                "DATATAILR_JOB_ID": str(self.__id),
+                "DATATAILR_JOB_NAME": self.name,
+                "DATATAILR_JOB_ARGUMENT_MAPPING": encode_json(self.__argument_mapping),
+            }
+            self.entrypoint(env=env)
         else:
             raise TypeError(f"Invalid entrypoint type: {type(self.entrypoint)}")
 
```
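The entrypoint now receives the job's identity through the `env` dictionary built above. Assuming those variables end up in the job process's environment, and that the mapping is plain JSON per `encode_json`, job code could introspect them roughly like this (the function name is hypothetical):

```python
import json
import os

def describe_current_job():
    # Keys taken verbatim from the env dict constructed in __execute__ above.
    batch_id = os.getenv("DATATAILR_BATCH_ID")
    job_id = os.getenv("DATATAILR_JOB_ID")
    job_name = os.getenv("DATATAILR_JOB_NAME")
    mapping = json.loads(os.getenv("DATATAILR_JOB_ARGUMENT_MAPPING", "{}"))
    return f"job {job_name} ({job_id}) in batch {batch_id}, argument mapping: {mapping}"
```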
```diff
@@ -237,12 +298,17 @@ class Batch(Job):
 
     def __init__(
         self,
-        environment: Optional[Environment],
         name: str,
-
-
+        environment: Optional[Environment] = Environment.DEV,
+        schedule: Optional[Schedule] = None,
+        image: Optional[Image] = None,
+        run_as: Optional[Union[str, User]] = User.signed_user(),
         resources: Resources = Resources(memory="100m", cpu=1),
         acl: Optional[ACL] = None,
+        local_run: bool = False,
+        python_requirements: str = "",
+        build_script_pre: str = "",
+        build_script_post: str = "",
     ):
         super().__init__(
             environment=environment,
@@ -251,19 +317,25 @@ class Batch(Job):
             run_as=run_as,
             resources=resources,
             acl=acl,
+            python_requirements=python_requirements,
+            build_script_pre=build_script_pre,
+            build_script_post=build_script_post,
+            type=JobType.BATCH,
         )
-        self.type = JobType.BATCH
         self.__jobs: List[BatchJob] = []
         self._auto_run = False
-        self.__next_job_id =
+        self.__next_job_id = -1
         self.__BATCH_JOB_NAMES__: Dict[str, int] = {}
+        self.__local_run = local_run
+        self.__schedule = schedule
 
     @property
     def next_job_id(self):
         """
         Returns a generator for the next job ID in the batch.
         """
-
+        self.__next_job_id += 1
+        return self.__next_job_id
 
     def add_job(self, job: BatchJob):
         """
```
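A hedged sketch of the widened constructor; every concrete value below is illustrative, and `Schedule` comes from the new module shown later in this diff:

```python
from datatailr.scheduler.batch import Batch
from datatailr.scheduler.schedule import Schedule

batch = Batch(
    name="nightly-etl",
    schedule=Schedule(at_hours=[2], at_minutes=[30], timezone="UTC"),
    local_run=True,                       # execute in-process instead of submitting
    python_requirements="pandas==2.2.0",  # presumably fed into the image build
)
```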
```diff
@@ -279,6 +351,25 @@ class Batch(Job):
             raise DuplicateJobNameError(job.name)
         # Use the batch level resource values as defaults for jobs
         job.resources = job.resources or self.resources
+        image_path_to_repo = self.image.path_to_repo
+        image_path_to_module = self.image.path_to_module
+        package_path_to_repo = job.entrypoint.path_to_repo
+        package_path_to_module = job.entrypoint.path_to_module
+
+        if image_path_to_repo is None:
+            self.image.path_to_repo = package_path_to_repo
+        elif package_path_to_repo != image_path_to_repo:
+            raise CodePackageMismatchError(
+                f"Function {job.entrypoint.function_name} is defined in a different package root: "
+                f"{package_path_to_repo} != {image_path_to_repo}"
+            )
+        if image_path_to_module is None:
+            self.image.path_to_module = package_path_to_module
+        elif package_path_to_module != image_path_to_module:
+            raise CodePackageMismatchError(
+                f"Function {job.entrypoint.function_name} is defined in a different module: "
+                f"{package_path_to_module} != {image_path_to_module}"
+            )
         self.__jobs.append(job)
 
     def is_job_in(self, job: BatchJob) -> bool:
@@ -293,6 +384,7 @@ class Batch(Job):
         """
         batch_dict = super().to_dict()
         batch_dict["jobs"] = [job.to_dict() for job in self.__jobs]
+        batch_dict["schedule"] = str(self.__schedule) if self.__schedule else None
         return batch_dict
 
     def to_json(self):
@@ -345,11 +437,41 @@ class Batch(Job):
                 "A cyclic dependency exists amongst {}".format(jobs)
             )
 
+    def get_schedule_args(self) -> Dict[str, Any]:
+        if isinstance(self.__schedule, Schedule):
+            args = {
+                "at_minute": self.__schedule.at_minutes,
+                "every_minute": self.__schedule.every_minute,
+                "at_hour": self.__schedule.at_hours,
+                "every_hour": self.__schedule.every_hour,
+                "weekdays": self.__schedule.weekdays,
+                "day_of_month": self.__schedule.day_of_month,
+                "in_month": self.__schedule.in_month,
+                "every_month": self.__schedule.every_month,
+                "timezone": self.__schedule.timezone,
+                "run_after_job_uuid": self.__schedule.run_after_job_uuid,
+                "run_after_job_name": self.__schedule.run_after_job_name,
+                "run_after_job_condition": self.__schedule.run_after_job_condition,
+            }
+            args = {key: value for key, value in args.items() if value is not None}
+            for key, value in args.items():
+                if isinstance(value, list):
+                    args[key] = ",".join(map(str, value))
+            return args
+        return {}
+
     def run(self) -> Tuple[bool, str]:
-
+        def arg_name(arg: Union[BatchJob, str]) -> str:
+            return arg.name if isinstance(arg, BatchJob) else arg
+
+        args = {
+            j.name: {k: arg_name(v) for k, v in j.args.items()} for j in self.__jobs
+        }
+        __ARGUMENTS_CACHE__.add_arguments(self.id, args)
+        if not self.__local_run and is_dt_installed():
             return super().run()
         else:
-            os.environ["DATATAILR_BATCH_RUN_ID"] =
+            os.environ["DATATAILR_BATCH_RUN_ID"] = uuid.uuid4().hex[:8]
             for step in self.__topological_sort__():
                 for job_id in step:
                     job = self.__jobs[job_id]
```
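`get_schedule_args` flattens the schedule for the backend: `None` fields are dropped and lists become comma-separated strings. A sketch of the expected shape; the printed output is inferred from the code above, not captured from a real run:

```python
from datatailr.scheduler.batch import Batch
from datatailr.scheduler.schedule import Schedule

sched = Schedule(at_minutes=[0, 30], at_hours=[2], timezone="UTC")
batch = Batch(name="nightly-etl", schedule=sched, local_run=True)
print(batch.get_schedule_args())
# expected shape: {'at_minute': '0,30', 'at_hour': '2', 'timezone': 'UTC'}
```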
datatailr/scheduler/batch_decorator.py
CHANGED

```diff
@@ -11,21 +11,25 @@
 import functools
 import inspect
 import os
+from typing import Callable
 
 from datatailr.logging import DatatailrLogger
-from datatailr.scheduler.arguments_cache import ArgumentsCache
+from datatailr.scheduler.arguments_cache import ArgumentsCache, CacheNotFoundError
 from datatailr.scheduler.base import EntryPoint, JobType, Resources
-from datatailr.scheduler.batch import
+from datatailr.scheduler.batch import (
+    BatchJob,
+    get_current_manager,
+)
 from datatailr.scheduler.constants import DEFAULT_TASK_CPU, DEFAULT_TASK_MEMORY
 from datatailr.scheduler.utils import get_available_env_args
 
 __ARGUMENTS_CACHE__ = ArgumentsCache()
-__FUNCTIONS_CREATED_IN_DAG__: dict[
+__FUNCTIONS_CREATED_IN_DAG__: dict[Callable, str] = {}
 logger = DatatailrLogger(__name__).get_logger()
 
 
 def batch_run_id() -> str:
-    return os.
+    return os.getenv("DATATAILR_BATCH_RUN_ID", "unknown")
 
 
 def dag_id(job: BatchJob) -> str:
@@ -34,7 +38,7 @@ def dag_id(job: BatchJob) -> str:
     )
 
 
-def batch_decorator(memory=DEFAULT_TASK_MEMORY, cpu=DEFAULT_TASK_CPU):
+def batch_decorator(memory: str = DEFAULT_TASK_MEMORY, cpu: float = DEFAULT_TASK_CPU):
     """
     Decorator to mark a function as a batch job.
     This decorator can be used to wrap functions that should be executed as part of batch jobs.
@@ -55,39 +59,62 @@ def batch_decorator(memory=DEFAULT_TASK_MEMORY, cpu=DEFAULT_TASK_CPU):
             # There are two possible scenarios:
             # 1. The function is called directly, not as part of a batch job. In this case, the args and kwargs should be used.
             # 2. The function is called as part of a batch job - it was constructed as part of a DAG and is now being executed.
-            if func not in __FUNCTIONS_CREATED_IN_DAG__:
-                return func(*args, **kwargs)
-            function_arguments = [v.name for v in parameters.values()]
             env_args = get_available_env_args()
+            all_function_args = [
+                v.name
+                for v in parameters.values()
+                if v.kind
+                not in (
+                    inspect.Parameter.VAR_POSITIONAL,
+                    inspect.Parameter.VAR_KEYWORD,
+                )
+            ]
             final_args = list(args)
-            final_kwargs = kwargs.copy()
 
             for name, value in env_args.items():
-                if name in
-                    if len(final_args) < len(
+                if name in all_function_args:
+                    if len(final_args) < len(all_function_args):
                         final_args.extend(
-                            [None] * (len(
+                            [None] * (len(all_function_args) - len(final_args))
                         )
-                    final_args[
-
-
+                    final_args[all_function_args.index(name)] = value
+            try:
+                final_kwargs = __ARGUMENTS_CACHE__.get_arguments(
+                    dag_id(func),
+                    os.getenv("DATATAILR_JOB_NAME", func.__name__),
+                    os.getenv("DATATAILR_BATCH_RUN_ID"),
+                )
+            except CacheNotFoundError:
+                final_kwargs = kwargs
+
+            if varargs is not None and varkw is None:
+                for key in list(final_kwargs.keys()):
+                    if key not in parameters:
+                        final_args.append(final_kwargs.pop(key))
+
+            # Some of the loaded arguments are actually args and not kwargs.
+            if len(final_args) == len(parameters.keys()):
+                for i, arg_name in enumerate(parameters.keys()):
+                    final_args[i] = final_kwargs.pop(arg_name, final_args[i])
+            result = func(*final_args, **final_kwargs)
+            __ARGUMENTS_CACHE__.add_result(
+                batch_run_id(),
+                os.getenv("DATATAILR_JOB_NAME", func.__name__),
+                result,
             )
-            result = func(**function_arguments)
-            __ARGUMENTS_CACHE__.add_result(batch_run_id(), func.__name__, result)
             return result
         else:
-
-
-
-
-
+            if varargs is not None:
+                all_args = {job.name: job for job in args}
+            else:
+                all_args = dict(zip(spec.args, args)) | kwargs
             dag.set_autorun(True)
+
             job = BatchJob(
                 name=func.__name__,
                 entrypoint=EntryPoint(
                     JobType.BATCH,
-
-                    function_name=func.__name__,
+                    func=func,
                 ),
                 resources=Resources(memory=memory, cpu=cpu),
                 dependencies=[
@@ -97,6 +124,8 @@ def batch_decorator(memory=DEFAULT_TASK_MEMORY, cpu=DEFAULT_TASK_CPU):
                 ],
                 dag=dag,
             )
+            job.args = all_args
+            __FUNCTIONS_CREATED_IN_DAG__[job.entrypoint.func] = dag.id
             return job
 
         module = inspect.getmodule(func)
```
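Reading the decorator's two branches together: during execution the wrapped function loads its kwargs from the arguments cache (falling back to the call-site kwargs on `CacheNotFoundError`) and stores its result; inside a DAG a call instead builds a `BatchJob` whose `args` record the upstream jobs. A hedged end-to-end sketch; the import path and the context-manager form of `Batch` are inferred from `__DAG_CONTEXT__`/`get_current_manager`, not shown in this diff:

```python
from datatailr.scheduler.batch import Batch
from datatailr.scheduler.batch_decorator import batch_decorator  # import path assumed

@batch_decorator(memory="1g", cpu=1.0)
def load():
    return [1, 2, 3]

@batch_decorator()
def total(load):   # parameter named after the upstream job receives its result
    return sum(load)

with Batch(name="demo", local_run=True) as dag:  # assumed context-manager usage
    total(load())  # builds two BatchJobs and wires total's args to load
```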
datatailr/scheduler/schedule.py
ADDED

```diff
@@ -0,0 +1,117 @@
+##########################################################################
+#
+# Copyright (c) 2025 - Datatailr Inc.
+# All Rights Reserved.
+#
+# This file is part of Datatailr and subject to the terms and conditions
+# defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
+# of this file, in parts or full, via any medium is strictly prohibited.
+##########################################################################
+
+from typing import Any
+from datatailr.wrapper import dt__Job
+import re
+
+
+__CLIENT__ = dt__Job()
+
+
+class Schedule:
+    """
+    Represents a schedule for batch jobs.
+    """
+
+    def __init__(
+        self,
+        cron_expression: str = "",
+        at_minutes: list[int] | None = None,
+        every_minute: int | None = None,
+        at_hours: list[int] | None = None,
+        every_hour: int | None = None,
+        weekdays: list[str] | None = None,
+        day_of_month: int | None = None,
+        in_month: list[str] | None = None,
+        every_month: int | None = None,
+        timezone: str | None = None,
+        run_after_job_uuid: str | None = None,
+        run_after_job_name: str | None = None,
+        run_after_job_condition: str | None = None,
+    ):
+        self.at_minutes = at_minutes
+        self.every_minute = every_minute
+        self.at_hours = at_hours
+        self.every_hour = every_hour
+        self.weekdays = weekdays
+        self.day_of_month = day_of_month
+        self.in_month = in_month
+        self.every_month = every_month
+        self.timezone = timezone
+        self.run_after_job_uuid = run_after_job_uuid
+        self.run_after_job_name = run_after_job_name
+        self.run_after_job_condition = run_after_job_condition
+        self.schedule_expression = None
+        self.cron_expression = cron_expression
+
+        self.__is_set__ = False
+
+    def __str__(self) -> str:
+        self.__compile__()
+        return self.cron_expression
+
+    def __repr__(self) -> str:
+        self.__compile__()
+        return f"Schedule(cron_expression={self.cron_expression}, timezone={self.timezone}) - {self.schedule_expression}"
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        super().__setattr__(name, value)
+        if name in [
+            "at_minutes",
+            "at_hours",
+            "every_minute",
+            "every_hour",
+            "weekdays",
+            "day_of_month",
+            "in_month",
+            "every_month",
+        ]:
+            self.__is_set__ = False
+
+    def __compile__(self):
+        if self.__is_set__:
+            return
+        argument_name = [
+            "at_minutes",
+            "at_hours",
+            "every_minute",
+            "every_hour",
+            "weekdays",
+            "day_of_month",
+            "in_month",
+            "every_month",
+            "run_after_job_uuid",
+            "run_after_job_name",
+            "run_after_job_condition",
+        ]
+        arguments = {}
+
+        for key in argument_name:
+            if hasattr(self, key) and getattr(self, key) is not None:
+                value = getattr(self, key)
+                if isinstance(value, list):
+                    value = ",".join(map(str, value))
+                arguments[key] = value
+
+        result = __CLIENT__.run("", cron_string=True, **arguments)
+        match = re.match(r"^(.*?)\s*\((.*?)\)$", result)
+        if match:
+            cron_expression, schedule_expression = match.groups()
+            self.cron_expression = cron_expression.strip()
+            self.schedule_expression = schedule_expression.strip()
+        self.__is_set__ = True
+
+    def get_cron_string(self) -> str:
+        """
+        Returns the compiled cron string.
+        """
+        self.__compile__()
+        return self.cron_expression
```
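`Schedule` compiles lazily: `__compile__` runs once, delegates to the platform client (`dt__Job`), and caches the result until a schedule field is reassigned. A usage sketch; the weekday spelling and the exact cron output depend on the backend and are illustrative:

```python
from datatailr.scheduler.schedule import Schedule

sched = Schedule(at_minutes=[0, 30], at_hours=[6], weekdays=["Mon", "Fri"], timezone="UTC")
print(sched.get_cron_string())   # first access triggers __compile__
sched.at_hours = [7]             # reassignment clears the cached compilation
print(sched.get_cron_string())   # recompiled on next access
```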
datatailr/scheduler/utils.py
CHANGED

```diff
@@ -16,7 +16,9 @@ from datatailr.scheduler.constants import BATCH_JOB_ARGUMENTS
 def get_available_env_args():
     """
     Get the available environment variables for batch job arguments.
-
+
+    This function retrieves the environment variables that match the keys defined in DATATAILR_BATCH_JOB_ARGUMENTS.
+
     Returns:
         dict: A dictionary of available environment variables for batch jobs.
     """
```