datatailr 0.1.5__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
Potentially problematic release. This version of datatailr might be problematic.
- datatailr/__init__.py +1 -35
- datatailr/acl.py +35 -3
- datatailr/blob.py +13 -13
- datatailr/build/image.py +38 -2
- datatailr/dt_json.py +32 -0
- datatailr/errors.py +17 -0
- datatailr/group.py +20 -12
- datatailr/logging.py +27 -10
- datatailr/sbin/datatailr_run.py +147 -0
- datatailr/sbin/datatailr_run_app.py +28 -0
- datatailr/sbin/{run_job.py → datatailr_run_batch.py} +5 -20
- datatailr/scheduler/__init__.py +24 -8
- datatailr/scheduler/arguments_cache.py +88 -45
- datatailr/scheduler/base.py +195 -69
- datatailr/scheduler/batch.py +155 -19
- datatailr/scheduler/batch_decorator.py +56 -26
- datatailr/scheduler/constants.py +1 -1
- datatailr/scheduler/schedule.py +117 -0
- datatailr/scheduler/utils.py +3 -1
- datatailr/user.py +34 -14
- datatailr/utils.py +20 -0
- datatailr/wrapper.py +10 -10
- {datatailr-0.1.5.dist-info → datatailr-0.1.8.dist-info}/METADATA +38 -5
- datatailr-0.1.8.dist-info/RECORD +30 -0
- datatailr-0.1.8.dist-info/entry_points.txt +4 -0
- datatailr-0.1.8.dist-info/top_level.txt +1 -0
- datatailr-0.1.5.dist-info/RECORD +0 -29
- datatailr-0.1.5.dist-info/entry_points.txt +0 -2
- datatailr-0.1.5.dist-info/top_level.txt +0 -2
- test_module/__init__.py +0 -17
- test_module/test_submodule.py +0 -38
- {datatailr-0.1.5.dist-info → datatailr-0.1.8.dist-info}/WHEEL +0 -0
- {datatailr-0.1.5.dist-info → datatailr-0.1.8.dist-info}/licenses/LICENSE +0 -0
datatailr/scheduler/base.py
CHANGED
```diff
@@ -8,7 +8,11 @@
 # of this file, in parts or full, via any medium is strictly prohibited.
 # *************************************************************************
 
+from __future__ import annotations
+
+from datetime import datetime
 import importlib
+import inspect
 import json
 import os
 import subprocess
@@ -18,12 +22,27 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import Callable, Optional, Tuple, Union
 
-from datatailr import ACL, Environment, User,
+from datatailr import ACL, Environment, User, is_dt_installed
+from datatailr.wrapper import dt__Job
+from datatailr.scheduler.constants import DEFAULT_TASK_MEMORY, DEFAULT_TASK_CPU
 from datatailr.build.image import Image
 from datatailr.errors import BatchJobError
 from datatailr.logging import DatatailrLogger
+from datatailr.utils import run_shell_command
 
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
+__client__ = dt__Job()
+
+
+def set_allow_unsafe_scheduling(allow: bool):
+    """
+    Set whether to allow unsafe scheduling of jobs.
+    This is a global setting that affects how jobs are scheduled.
+    """
+    if allow:
+        os.environ["DATATAILR_ALLOW_UNSAFE_SCHEDULING"] = "true"
+    else:
+        os.environ.pop("DATATAILR_ALLOW_UNSAFE_SCHEDULING", None)
 
 
 class RepoValidationError(BatchJobError):
```
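The new `set_allow_unsafe_scheduling` helper is a process-wide toggle backed by an environment variable, which `verify_repo_is_ready` (further down in this file) consults to skip the git cleanliness checks. A minimal sketch of the observable behaviour, assuming the import path of the file under diff:

```python
import os

from datatailr.scheduler.base import set_allow_unsafe_scheduling

# Enabling sets the flag to the literal string "true" ...
set_allow_unsafe_scheduling(True)
assert os.environ["DATATAILR_ALLOW_UNSAFE_SCHEDULING"] == "true"

# ... and disabling removes the variable entirely rather than setting "false".
set_allow_unsafe_scheduling(False)
assert "DATATAILR_ALLOW_UNSAFE_SCHEDULING" not in os.environ
```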
```diff
@@ -40,6 +59,7 @@ class JobType(Enum):
     BATCH = "batch"
     SERVICE = "service"
     APP = "app"
+    EXCEL = "excel"
     UNKNOWN = "unknown"
 
     def __str__(self):
@@ -55,8 +75,14 @@ class Resources:
     Represents the resources required for a job.
     """
 
-    memory: str =
-    cpu:
+    memory: str = DEFAULT_TASK_MEMORY
+    cpu: float = DEFAULT_TASK_CPU
+
+
+# TODO: create a dt_run script that will:
+# 1. create user and group if not exists
+# 2. set the correct path
+# 3. run the job based on its type
 
 
 class EntryPoint:
@@ -68,26 +94,30 @@ class EntryPoint:
     def __init__(
         self,
         type: JobType,
-        func:
-        module_name: Optional[str] = None,
-        function_name: Optional[str] = None,
+        func: Callable,
     ):
-        if func is None and (module_name is None or function_name is None):
-            raise ValueError(
-                "Either a function or module and function names must be provided."
-            )
         self.func = func
-        self.module_name = func.__module__
-        self.function_name = func.__name__
+        self.module_name = func.__module__
+        self.function_name = func.__name__
         self.type = type
 
+        # Find the absolute path to the repository and then the relative path to the module.
+        # This will be used in the creation of the code 'bundle' when building the image.
+        path_to_repo = run_shell_command("git rev-parse --show-toplevel")[0]
+        path_to_code = inspect.getfile(func)
+        package_root = path_to_code
+        module_parts = self.module_name.split(".")
+        for _ in module_parts:
+            package_root = os.path.dirname(package_root)
+        path_to_module = os.path.relpath(package_root, path_to_repo)
+        self.path_to_repo = path_to_repo
+        self.path_to_module = path_to_module
+
     def __call__(self, *args, **kwargs):
+        os.environ.update(kwargs.pop("env", {}))
         if self.type == JobType.BATCH:
-
-
-            func = getattr(module, self.function_name)
-        elif self.func is not None:
-            func = self.func
+            module = importlib.import_module(self.module_name)
+            func = getattr(module, self.function_name)
             return func(*args, **kwargs)
 
         elif self.type == JobType.SERVICE:
```
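The constructor now derives where the function's package lives relative to the git checkout: it strips one directory per dotted component of `__module__` from the source file's path, leaving the directory that has to be bundled into the image. A standalone sketch of that arithmetic, with hypothetical paths that are not part of the package:

```python
import os

# Hypothetical layout: function mypkg.tasks.etl.run lives in a src/ layout repo.
path_to_repo = "/home/dev/repo"                         # git rev-parse --show-toplevel
path_to_code = "/home/dev/repo/src/mypkg/tasks/etl.py"  # inspect.getfile(func)
module_name = "mypkg.tasks.etl"                         # func.__module__

# One os.path.dirname per module part strips etl.py, then tasks/, then mypkg/ ...
package_root = path_to_code
for _ in module_name.split("."):
    package_root = os.path.dirname(package_root)

# ... leaving the directory that must be on the import path inside the bundle.
print(os.path.relpath(package_root, path_to_repo))  # -> "src"
```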
```diff
@@ -106,13 +136,28 @@ class EntryPoint:
 class Job:
     def __init__(
         self,
-        environment: Optional[Environment],
         name: str,
-
-
+        environment: Optional[Environment] = Environment.DEV,
+        image: Optional[Image] = None,
+        run_as: Optional[Union[str, User]] = User.signed_user(),
         resources: Resources = Resources(memory="100m", cpu=1),
         acl: Optional[ACL] = None,
+        python_requirements: str = "",
+        build_script_pre: str = "",
+        build_script_post: str = "",
+        type: JobType = JobType.UNKNOWN,
+        entrypoint: Optional[EntryPoint] = None,
+        update_existing: bool = False,
     ):
+        if environment is None:
+            environment = Environment.DEV
+
+        if update_existing:
+            existing_job = self.__get_existing__(name, environment)
+            if existing_job:
+                self.from_dict(existing_job)
+                return
+
         if run_as is None:
             run_as = User.signed_user()
         if environment is None:
@@ -126,11 +171,16 @@ class Job:
         self.name = name
         self.run_as = run_as
         self.resources = resources
+        if image is None:
+            image = Image(
+                acl=self.acl,
+                python_requirements=python_requirements,
+                build_script_pre=build_script_pre,
+                build_script_post=build_script_post,
+            )
         self.image = image
-
-
-        self.type: JobType = JobType.UNKNOWN
-        self.entrypoint = None
+        self.type = type
+        self.entrypoint = entrypoint
         self.__id = str(uuid.uuid4())
 
     @property
@@ -140,6 +190,25 @@ class Job:
         """
         return self.__id
 
+    @classmethod
+    def __get_existing__(
+        cls, job_name: str, environment: Environment
+    ) -> Optional[dict]:
+        """
+        Retrieve an existing job instance from the DataTailr platform.
+        Based on the job name and environment.
+        """
+        job_list = __client__.ls(filter=f"name={job_name},environment={environment}")
+        if not isinstance(job_list, list):
+            return None
+        if len(job_list) == 0:
+            return None
+        if len(job_list) > 1:
+            raise BatchJobError(
+                f"Multiple jobs found with name '{job_name}' in environment '{environment}'."
+            )
+        return job_list[0]
+
     def __repr__(self):
         return (
             f"Job(name={self.name}, environment={self.environment}, "
@@ -169,31 +238,64 @@ class Job:
         job_dict["cpu"] = self.resources.cpu
         return job_dict
 
+    def from_dict(self, job_dict: dict):
+        self.name = job_dict["name"]
+        self.image = job_dict["image"]
+
+        environment = job_dict.get("environment", "dev")
+        environment = Environment(environment.lower())
+        self.environment = environment
+
+        user = job_dict["run_as"]["name"]
+        user = User(user.lower())
+        self.run_as = user
+
+        self.resources = Resources(memory=job_dict["memory"], cpu=job_dict["num_cpus"])
+        acl = job_dict.get("acl", None)
+        if acl is None:
+            acl = ACL(user=self.run_as)
+        else:
+            acl = ACL.from_dict(acl)
+        self.acl = acl
+        self.python_requirements = (job_dict.get("python_requirements", ""),)
+        self.build_script_pre = (job_dict.get("build_script_pre", ""),)
+        self.build_script_post = (job_dict.get("build_script_post", ""),)
+        self.type = JobType(job_dict.get("type", "unknown").lower())
+        self.state = job_dict["state"]
+        self.create_time = datetime.fromtimestamp(job_dict["create_time"] * 1e-6)
+        self.version = job_dict["version"]
+        self.__id = job_dict["id"]
+
     def to_json(self):
         """
         Convert the Job instance to a JSON string representation.
         """
         return json.dumps(self.to_dict())
 
-    def verify_repo_is_ready(self) -> Tuple[
-
-
-
-
-
-    )
+    def verify_repo_is_ready(self) -> Tuple[str, str]:
+        """
+        Verify if the repository is ready for job execution.
+        The check consists of two parts:
+        1. Check if there are uncommitted changes in the repository.
+        2. Check if the local commit matches the remote HEAD (the repo is synced with the remote).
+        Returns a tuple of (branch: str, commit_hash: str).
+        """
+        local_commit = run_shell_command("git rev-parse HEAD")[0]
+        branch_name = run_shell_command("git rev-parse --abbrev-ref HEAD")[0]
+
+        if os.getenv("DATATAILR_ALLOW_UNSAFE_SCHEDULING", "false").lower() == "true":
+            return branch_name, local_commit
+        return_code = run_shell_command("git diff --exit-code")[1]
+        is_committed = return_code == 0
+
         if not is_committed:
-
-
-                "Uncommitted changes detected. Please commit your changes before running the job.",
+            raise RepoValidationError(
+                "Please commit your changes before running the job."
             )
 
-        local_commit = subprocess.run(
-            ("git rev-parse HEAD"), shell=True, capture_output=True, text=True
-        ).stdout.strip()
         remote_commit = (
             subprocess.run(
-                ("git ls-remote origin HEAD"),
+                ("remote_commit = $(git ls-remote origin HEAD)"),
                 shell=True,
                 capture_output=True,
                 text=True,
@@ -203,43 +305,67 @@ class Job:
         )
 
         if local_commit != remote_commit:
-
-
-                "Local commit does not match remote HEAD. Please pull the latest changes before running the job.",
+            raise RepoValidationError(
+                "Please sync your local repository with the remote before running the job."
             )
 
-
-            ("git rev-parse --abbrev-ref HEAD"),
-            shell=True,
-            capture_output=True,
-            text=True,
-        ).stdout.strip()
-        return True, ""
+        return branch_name, local_commit
 
-    def
-
-
-
-
-
-
-
-        check_result = self.verify_repo_is_ready()
-        if not check_result[0]:
-            raise RepoValidationError(check_result[1])
-        logger.info(
-            f"Running job '{self.name}' in environment '{self.environment}' as '{self.run_as}'"
-        )
+    def __prepare__(self) -> str:
+        branch_name, local_commit = self.verify_repo_is_ready()
+        self.image.update(
+            branch_name=branch_name,
+            commit_hash=local_commit,
+        )
+        logger.info(
+            f"Running job '{self.name}' in environment '{self.environment}' as '{self.run_as}'"
+        )
 
-
-
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as temp_file:
+            temp_file.write(self.to_json().encode())
+            return temp_file.name
 
-
-
+    def get_schedule_args(self) -> dict:
+        """
+        Returns additional arguments for scheduling the job.
+        Override or extend this method as needed.
+        """
+        return {}
 
-
-
+    def __run_command__(self, command: str) -> Tuple[bool, str]:
+        """
+        Run a command in the context of the job.
+        This is used to execute the job's entry point.
+        """
+        if not is_dt_installed():
             raise NotImplementedError(
                 "DataTailr is not installed. Please install DataTailr to run this job."
             )
+        try:
+            temp_file_name = self.__prepare__()
+
+            if command == "run":
+                __client__.run(f"file://{temp_file_name}", **self.get_schedule_args())
+            elif command == "save":
+                __client__.save(f"file://{temp_file_name}", **self.get_schedule_args())
+            else:
+                raise ValueError(f"Unknown command: {command}")
+            os.remove(temp_file_name)
+        except Exception as e:
+            logger.error(f"Error running command '{command}': {e}")
+            return False, str(e)
+        return True, f"Job '{self.name}' {command}d successfully."
+
+    def save(self) -> Tuple[bool, str]:
+        """
+        Save the job to the DataTailr platform.
+        If the job already exists, it will be updated.
+        """
+        return self.__run_command__("save")
+
+    def run(self) -> Tuple[bool, str]:
+        """
+        Run the job. This method should be implemented to execute the job logic.
+        It verifies the repository state and prepares the job for execution.
+        """
+        return self.__run_command__("run")
```
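Taken together, the new methods give `Job` a uniform lifecycle: `__prepare__` validates the repo, stamps the image with the branch and commit, and serializes the job to a temporary JSON file whose `file://` URI is handed to the platform client by `save()` or `run()`. A hedged usage sketch (the job name and requirements string are hypothetical; every keyword appears in the new `__init__` signature above):

```python
from datatailr import Environment
from datatailr.scheduler.base import Job

# Hypothetical job definition; must run inside a clean, pushed git checkout
# unless unsafe scheduling has been enabled (see set_allow_unsafe_scheduling).
job = Job(
    name="nightly-report",
    environment=Environment.DEV,
    python_requirements="pandas==2.2.*",
    update_existing=True,  # reuse and update the platform copy if one exists
)

ok, message = job.save()  # -> __run_command__("save") -> __client__.save("file://...")
if ok:
    ok, message = job.run()  # same path, through __client__.run(...)
print(message)
```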
datatailr/scheduler/batch.py
CHANGED
```diff
@@ -14,9 +14,11 @@ import contextvars
 import json
 import os
 from functools import reduce
-from typing import Dict, List, Optional, Sequence, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
+import uuid
 
 from datatailr import Image
+from datatailr.dt_json import encode_json
 from datatailr.errors import BatchJobError
 from datatailr.logging import DatatailrLogger
 from datatailr.scheduler.base import (
@@ -29,9 +31,12 @@ from datatailr.scheduler.base import (
     User,
 )
 from datatailr.scheduler.constants import DEFAULT_TASK_CPU, DEFAULT_TASK_MEMORY
+from datatailr.scheduler.arguments_cache import ArgumentsCache
+from datatailr.scheduler.schedule import Schedule
 from datatailr.utils import is_dt_installed
 
 __DAG_CONTEXT__: contextvars.ContextVar = contextvars.ContextVar("dag_context")
+__ARGUMENTS_CACHE__ = ArgumentsCache()
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
 
 
@@ -39,13 +44,6 @@ def get_current_manager():
     return __DAG_CONTEXT__.get(None)
 
 
-def next_batch_job_id():
-    i = 0
-    while True:
-        yield i
-        i += 1
-
-
 class CyclicDependencyError(BatchJobError):
     """
     Exception raised when a cyclic dependency is detected in the batch job dependencies.
@@ -79,6 +77,12 @@ class MissingDagError(BatchJobError):
     )
 
 
+class CodePackageMismatchError(BatchJobError):
+    def __init__(self, message: str):
+        super().__init__(message)
+        self.message = message
+
+
 class BatchJob:
     """
     Represents a job within a batch job.
@@ -93,6 +97,7 @@ class BatchJob:
         resources: Optional[Resources] = None,
         dependencies: Sequence[Union[str, BatchJob]] = [],
         dag: Optional[Batch] = get_current_manager(),
+        argument_mapping: Dict[str, str] = {},
     ):
         self.name = name
         self.entrypoint = entrypoint
@@ -102,14 +107,46 @@ class BatchJob:
             raise MissingDagError()
         self.__id = dag.next_job_id
         self.dag = dag
+        self.__args: Dict[str, Any] = {}
         self.dag.__BATCH_JOB_NAMES__[self.name] = self.__id
         self.dependencies = self.translate_dependencies()
         assert all(
             isinstance(dep, int) for dep in self.dependencies
         ), "All dependencies must be integers representing job IDs."
         self.dag.add_job(self)
+        self.__argument_mapping = argument_mapping or {}
+
+    def __call__(self, *args, **kwds) -> BatchJob:
+        """
+        Allows the BatchJob instance to be called like a function, returning itself.
+        This is useful for chaining or functional-style programming.
+        """
+        return self
+
+    @property
+    def args(self) -> Dict[str, Any]:
+        """
+        Returns the arguments for the BatchJob instance.
+        """
+        return self.__args or {}
+
+    @args.setter
+    def args(self, args: Dict[str, Any]):
+        """
+        Sets the arguments for the BatchJob instance.
+        """
+        if not isinstance(args, dict):
+            raise TypeError(f"Expected a dictionary for args, got {type(args)}")
+        self.__args = args
+
+    @property
+    def id(self) -> int:
+        """
+        Returns the unique identifier of the BatchJob instance.
+        """
+        return self.__id
 
-    def alias(self, name: str):
+    def alias(self, name: str) -> BatchJob:
         """
         Set an alias for the BatchJob instance.
 
@@ -122,19 +159,48 @@ class BatchJob:
         self.name = name
         return self
 
+    def set_resources(
+        self,
+        resources: Optional[Resources] = None,
+        memory: Optional[str] = None,
+        cpu: Optional[float] = None,
+    ) -> BatchJob:
+        """
+        Set the resources for the BatchJob instance.
+
+        :param resources: The Resources instance to set.
+        """
+        if resources is not None:
+            if not isinstance(resources, Resources):
+                raise TypeError(f"Expected Resources instance, got {type(resources)}")
+        else:
+            resources = Resources(
+                memory=memory or DEFAULT_TASK_MEMORY, cpu=cpu or DEFAULT_TASK_CPU
+            )
+        self.resources = resources
+        return self
+
     def __repr__(self):
         return (
             f"BatchJob(name={self.name}, entrypoint={self.entrypoint}, "
             f"resources={self.resources}) (id={self.__id})"
         )
 
+    def __getstate__(self) -> object:
+        state = self.__dict__.copy()
+        state.pop("dag", None)
+        return state
+
+    def __setstate__(self, state: dict):
+        self.__dict__.update(state)
+
     def to_dict(self):
         """
         Convert the BatchJob instance to a dictionary representation.
         """
         return {
             "display_name": self.name,
-            "
+            "child_number": self.__id,
             "entrypoint": str(self.entrypoint),
             "memory": self.resources.memory if self.resources else DEFAULT_TASK_MEMORY,
             "cpu": self.resources.cpu if self.resources else DEFAULT_TASK_CPU,
```
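`alias` is now annotated to return the job, and the new `set_resources` does the same, so per-job tuning chains in one expression. A short fragment under that assumption (`tune` and the job name are hypothetical):

```python
from datatailr.scheduler.base import Resources

def tune(job):  # job: a BatchJob built as in the hunks above
    # Keyword form falls back to DEFAULT_TASK_MEMORY / DEFAULT_TASK_CPU
    # for whichever of memory/cpu is omitted.
    job.alias("daily-extract").set_resources(memory="512m", cpu=2)

    # Explicit form; a non-Resources value raises TypeError.
    return job.set_resources(resources=Resources(memory="1g", cpu=4))
```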
```diff
@@ -169,6 +235,9 @@ class BatchJob:
 
     def __add_dependency__(self, other):
         self.dependencies.add(other.__id)
+        arg_name = self.__argument_mapping.get(other.name, other.name)
+        if arg_name is not None:
+            self.__args[arg_name] = other
 
     def __lshift__(
         self, other: Sequence[BatchJob] | BatchJob
@@ -209,7 +278,13 @@ class BatchJob:
         Execute the job's entrypoint.
         """
         if isinstance(self.entrypoint, EntryPoint):
-
+            env = {
+                "DATATAILR_BATCH_ID": str(self.dag.id),
+                "DATATAILR_JOB_ID": str(self.__id),
+                "DATATAILR_JOB_NAME": self.name,
+                "DATATAILR_JOB_ARGUMENT_MAPPING": encode_json(self.__argument_mapping),
+            }
+            self.entrypoint(env=env)
         else:
             raise TypeError(f"Invalid entrypoint type: {type(self.entrypoint)}")
 
```
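At execution time `__execute__` now passes these identifiers through `EntryPoint.__call__`, which pops the `env` keyword and merges it into `os.environ` before importing and invoking the target function, so a running task can introspect its batch context. A sketch of what a task body could read (hypothetical task; decoding the mapping with the standard `json` module is an assumption, since the value is written with `datatailr.dt_json.encode_json`):

```python
import json
import os

def my_task():
    # Hypothetical task body; the variable names match the env dict above.
    batch_id = os.environ.get("DATATAILR_BATCH_ID")
    job_name = os.environ.get("DATATAILR_JOB_NAME")
    mapping = json.loads(os.environ.get("DATATAILR_JOB_ARGUMENT_MAPPING", "{}"))
    print(f"job {job_name} of batch {batch_id}, argument mapping: {mapping}")
```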
```diff
@@ -223,12 +298,17 @@ class Batch(Job):
 
     def __init__(
         self,
-        environment: Optional[Environment],
         name: str,
-
-
+        environment: Optional[Environment] = Environment.DEV,
+        schedule: Optional[Schedule] = None,
+        image: Optional[Image] = None,
+        run_as: Optional[Union[str, User]] = User.signed_user(),
         resources: Resources = Resources(memory="100m", cpu=1),
         acl: Optional[ACL] = None,
+        local_run: bool = False,
+        python_requirements: str = "",
+        build_script_pre: str = "",
+        build_script_post: str = "",
     ):
         super().__init__(
             environment=environment,
@@ -237,19 +317,25 @@ class Batch(Job):
             run_as=run_as,
             resources=resources,
             acl=acl,
+            python_requirements=python_requirements,
+            build_script_pre=build_script_pre,
+            build_script_post=build_script_post,
+            type=JobType.BATCH,
         )
-        self.type = JobType.BATCH
         self.__jobs: List[BatchJob] = []
         self._auto_run = False
-        self.__next_job_id =
+        self.__next_job_id = -1
         self.__BATCH_JOB_NAMES__: Dict[str, int] = {}
+        self.__local_run = local_run
+        self.__schedule = schedule
 
     @property
     def next_job_id(self):
         """
         Returns a generator for the next job ID in the batch.
         """
-
+        self.__next_job_id += 1
+        return self.__next_job_id
 
     def add_job(self, job: BatchJob):
         """
@@ -265,6 +351,25 @@ class Batch(Job):
             raise DuplicateJobNameError(job.name)
         # Use the batch level resource values as defaults for jobs
         job.resources = job.resources or self.resources
+        image_path_to_repo = self.image.path_to_repo
+        image_path_to_module = self.image.path_to_module
+        package_path_to_repo = job.entrypoint.path_to_repo
+        package_path_to_module = job.entrypoint.path_to_module
+
+        if image_path_to_repo is None:
+            self.image.path_to_repo = package_path_to_repo
+        elif package_path_to_repo != image_path_to_repo:
+            raise CodePackageMismatchError(
+                f"Function {job.entrypoint.function_name} is defined in a different package root: "
+                f"{package_path_to_repo} != {image_path_to_repo}"
+            )
+        if image_path_to_module is None:
+            self.image.path_to_module = package_path_to_module
+        elif package_path_to_module != image_path_to_module:
+            raise CodePackageMismatchError(
+                f"Function {job.entrypoint.function_name} is defined in a different module: "
+                f"{package_path_to_module} != {image_path_to_module}"
+            )
         self.__jobs.append(job)
 
     def is_job_in(self, job: BatchJob) -> bool:
@@ -279,6 +384,7 @@ class Batch(Job):
         """
         batch_dict = super().to_dict()
         batch_dict["jobs"] = [job.to_dict() for job in self.__jobs]
+        batch_dict["schedule"] = str(self.__schedule) if self.__schedule else None
         return batch_dict
 
     def to_json(self):
@@ -331,11 +437,41 @@ class Batch(Job):
             "A cyclic dependency exists amongst {}".format(jobs)
         )
 
+    def get_schedule_args(self) -> Dict[str, Any]:
+        if isinstance(self.__schedule, Schedule):
+            args = {
+                "at_minute": self.__schedule.at_minutes,
+                "every_minute": self.__schedule.every_minute,
+                "at_hour": self.__schedule.at_hours,
+                "every_hour": self.__schedule.every_hour,
+                "weekdays": self.__schedule.weekdays,
+                "day_of_month": self.__schedule.day_of_month,
+                "in_month": self.__schedule.in_month,
+                "every_month": self.__schedule.every_month,
+                "timezone": self.__schedule.timezone,
+                "run_after_job_uuid": self.__schedule.run_after_job_uuid,
+                "run_after_job_name": self.__schedule.run_after_job_name,
+                "run_after_job_condition": self.__schedule.run_after_job_condition,
+            }
+            args = {key: value for key, value in args.items() if value is not None}
+            for key, value in args.items():
+                if isinstance(value, list):
+                    args[key] = ",".join(map(str, value))
+            return args
+        return {}
+
     def run(self) -> Tuple[bool, str]:
-
+        def arg_name(arg: Union[BatchJob, str]) -> str:
+            return arg.name if isinstance(arg, BatchJob) else arg
+
+        args = {
+            j.name: {k: arg_name(v) for k, v in j.args.items()} for j in self.__jobs
+        }
+        __ARGUMENTS_CACHE__.add_arguments(self.id, args)
+        if not self.__local_run and is_dt_installed():
             return super().run()
         else:
-            os.environ["DATATAILR_BATCH_RUN_ID"] =
+            os.environ["DATATAILR_BATCH_RUN_ID"] = uuid.uuid4().hex[:8]
             for step in self.__topological_sort__():
                 for job_id in step:
                     job = self.__jobs[job_id]
```