datatailr-0.1.6-py3-none-any.whl → datatailr-0.1.10-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of datatailr might be problematic.
- datatailr/__init__.py +1 -35
- datatailr/acl.py +35 -3
- datatailr/blob.py +13 -13
- datatailr/build/image.py +38 -2
- datatailr/dt_json.py +32 -0
- datatailr/errors.py +17 -0
- datatailr/group.py +18 -14
- datatailr/logging.py +21 -10
- datatailr/sbin/datatailr_run.py +147 -0
- datatailr/sbin/datatailr_run_app.py +37 -0
- datatailr/sbin/{run_job.py → datatailr_run_batch.py} +5 -20
- datatailr/sbin/datatailr_run_excel.py +34 -0
- datatailr/sbin/datatailr_run_service.py +34 -0
- datatailr/scheduler/__init__.py +24 -8
- datatailr/scheduler/arguments_cache.py +71 -43
- datatailr/scheduler/base.py +195 -79
- datatailr/scheduler/batch.py +141 -19
- datatailr/scheduler/batch_decorator.py +53 -24
- datatailr/scheduler/constants.py +1 -1
- datatailr/scheduler/schedule.py +117 -0
- datatailr/scheduler/utils.py +3 -1
- datatailr/user.py +21 -21
- datatailr/utils.py +20 -0
- datatailr/wrapper.py +0 -6
- {datatailr-0.1.6.dist-info → datatailr-0.1.10.dist-info}/METADATA +37 -4
- datatailr-0.1.10.dist-info/RECORD +32 -0
- datatailr-0.1.10.dist-info/entry_points.txt +6 -0
- datatailr-0.1.10.dist-info/top_level.txt +1 -0
- datatailr-0.1.6.dist-info/RECORD +0 -29
- datatailr-0.1.6.dist-info/entry_points.txt +0 -2
- datatailr-0.1.6.dist-info/top_level.txt +0 -2
- test_module/__init__.py +0 -17
- test_module/test_submodule.py +0 -38
- {datatailr-0.1.6.dist-info → datatailr-0.1.10.dist-info}/WHEEL +0 -0
- {datatailr-0.1.6.dist-info → datatailr-0.1.10.dist-info}/licenses/LICENSE +0 -0
datatailr/scheduler/base.py
CHANGED
@@ -8,22 +8,40 @@
 # of this file, in parts or full, via any medium is strictly prohibited.
 # *************************************************************************
 
+from __future__ import annotations
+
+from datetime import datetime
 import importlib
+import inspect
 import json
 import os
-import subprocess
 import tempfile
 import uuid
 from dataclasses import dataclass
 from enum import Enum
 from typing import Callable, Optional, Tuple, Union
 
-from datatailr import ACL, Environment, User,
+from datatailr import ACL, Environment, User, is_dt_installed
+from datatailr.wrapper import dt__Job
+from datatailr.scheduler.constants import DEFAULT_TASK_MEMORY, DEFAULT_TASK_CPU
 from datatailr.build.image import Image
 from datatailr.errors import BatchJobError
 from datatailr.logging import DatatailrLogger
+from datatailr.utils import run_shell_command
 
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
+__client__ = dt__Job()
+
+
+def set_allow_unsafe_scheduling(allow: bool):
+    """
+    Set whether to allow unsafe scheduling of jobs.
+    This is a global setting that affects how jobs are scheduled.
+    """
+    if allow:
+        os.environ["DATATAILR_ALLOW_UNSAFE_SCHEDULING"] = "true"
+    else:
+        os.environ.pop("DATATAILR_ALLOW_UNSAFE_SCHEDULING", None)
 
 
 class RepoValidationError(BatchJobError):
@@ -40,6 +58,7 @@ class JobType(Enum):
     BATCH = "batch"
     SERVICE = "service"
     APP = "app"
+    EXCEL = "excel"
     UNKNOWN = "unknown"
 
     def __str__(self):
@@ -55,8 +74,14 @@ class Resources:
     Represents the resources required for a job.
     """
 
-    memory: str =
-    cpu:
+    memory: str = DEFAULT_TASK_MEMORY
+    cpu: float = DEFAULT_TASK_CPU
+
+
+# TODO: create a dt_run script that will:
+# 1. create user and group if not exists
+# 2. set the correct path
+# 3. run the job based on its type
 
 
 class EntryPoint:
@@ -68,26 +93,30 @@ class EntryPoint:
     def __init__(
         self,
         type: JobType,
-        func:
-        module_name: Optional[str] = None,
-        function_name: Optional[str] = None,
+        func: Callable,
     ):
-        if func is None and (module_name is None or function_name is None):
-            raise ValueError(
-                "Either a function or module and function names must be provided."
-            )
         self.func = func
-        self.module_name = func.__module__
-        self.function_name = func.__name__
+        self.module_name = func.__module__
+        self.function_name = func.__name__
        self.type = type
 
+        # Find the absolute path to the repository and then the relative path to the module.
+        # This will be used in the creation of the code 'bundle' when building the image.
+        path_to_repo = run_shell_command("git rev-parse --show-toplevel")[0]
+        path_to_code = inspect.getfile(func)
+        package_root = path_to_code
+        module_parts = self.module_name.split(".")
+        for _ in module_parts:
+            package_root = os.path.dirname(package_root)
+        path_to_module = os.path.relpath(package_root, path_to_repo)
+        self.path_to_repo = path_to_repo
+        self.path_to_module = path_to_module
+
     def __call__(self, *args, **kwargs):
+        os.environ.update(kwargs.pop("env", {}))
        if self.type == JobType.BATCH:
-
-
-            func = getattr(module, self.function_name)
-        elif self.func is not None:
-            func = self.func
+            module = importlib.import_module(self.module_name)
+            func = getattr(module, self.function_name)
            return func(*args, **kwargs)
 
        elif self.type == JobType.SERVICE:
@@ -106,13 +135,28 @@ class EntryPoint:
 class Job:
     def __init__(
         self,
-        environment: Optional[Environment],
         name: str,
-
-
+        environment: Optional[Environment] = Environment.DEV,
+        image: Optional[Image] = None,
+        run_as: Optional[Union[str, User]] = None,
         resources: Resources = Resources(memory="100m", cpu=1),
         acl: Optional[ACL] = None,
+        python_requirements: str = "",
+        build_script_pre: str = "",
+        build_script_post: str = "",
+        type: JobType = JobType.UNKNOWN,
+        entrypoint: Optional[EntryPoint] = None,
+        update_existing: bool = False,
     ):
+        if environment is None:
+            environment = Environment.DEV
+
+        if update_existing:
+            existing_job = self.__get_existing__(name, environment)
+            if existing_job:
+                self.from_dict(existing_job)
+                return
+
         if run_as is None:
             run_as = User.signed_user()
         if environment is None:
@@ -126,11 +170,16 @@ class Job:
         self.name = name
         self.run_as = run_as
         self.resources = resources
+        if image is None:
+            image = Image(
+                acl=self.acl,
+                python_requirements=python_requirements,
+                build_script_pre=build_script_pre,
+                build_script_post=build_script_post,
+            )
         self.image = image
-
-
-        self.type: JobType = JobType.UNKNOWN
-        self.entrypoint = None
+        self.type = type
+        self.entrypoint = entrypoint
         self.__id = str(uuid.uuid4())
 
     @property
@@ -140,6 +189,25 @@ class Job:
         """
         return self.__id
 
+    @classmethod
+    def __get_existing__(
+        cls, job_name: str, environment: Environment
+    ) -> Optional[dict]:
+        """
+        Retrieve an existing job instance from the DataTailr platform.
+        Based on the job name and environment.
+        """
+        job_list = __client__.ls(filter=f"name={job_name},environment={environment}")
+        if not isinstance(job_list, list):
+            return None
+        if len(job_list) == 0:
+            return None
+        if len(job_list) > 1:
+            raise BatchJobError(
+                f"Multiple jobs found with name '{job_name}' in environment '{environment}'."
+            )
+        return job_list[0]
+
     def __repr__(self):
         return (
             f"Job(name={self.name}, environment={self.environment}, "
@@ -169,77 +237,125 @@ class Job:
         job_dict["cpu"] = self.resources.cpu
         return job_dict
 
+    def from_dict(self, job_dict: dict):
+        self.name = job_dict["name"]
+        self.image = job_dict["image"]
+
+        environment = job_dict.get("environment", "dev")
+        environment = Environment(environment.lower())
+        self.environment = environment
+
+        user = job_dict["run_as"]["name"]
+        user = User(user.lower())
+        self.run_as = user
+
+        self.resources = Resources(memory=job_dict["memory"], cpu=job_dict["num_cpus"])
+        acl = job_dict.get("acl", None)
+        if acl is None:
+            acl = ACL(user=self.run_as)
+        else:
+            acl = ACL.from_dict(acl)
+        self.acl = acl
+        self.python_requirements = (job_dict.get("python_requirements", ""),)
+        self.build_script_pre = (job_dict.get("build_script_pre", ""),)
+        self.build_script_post = (job_dict.get("build_script_post", ""),)
+        self.type = JobType(job_dict.get("type", "unknown").lower())
+        self.state = job_dict["state"]
+        self.create_time = datetime.fromtimestamp(job_dict["create_time"] * 1e-6)
+        self.version = job_dict["version"]
+        self.__id = job_dict["id"]
+
     def to_json(self):
         """
         Convert the Job instance to a JSON string representation.
         """
         return json.dumps(self.to_dict())
 
-    def verify_repo_is_ready(self) -> Tuple[
-
-
-
-
-
-    )
+    def verify_repo_is_ready(self) -> Tuple[str, str]:
+        """
+        Verify if the repository is ready for job execution.
+        The check consists of two parts:
+        1. Check if there are uncommitted changes in the repository.
+        2. Check if the local commit matches the remote HEAD (the repo is synced with the remote).
+        Returns a tuple of (branch: str, commit_hash: str).
+        """
+        local_commit = run_shell_command("git rev-parse HEAD")[0]
+        branch_name = run_shell_command("git rev-parse --abbrev-ref HEAD")[0]
+
+        if os.getenv("DATATAILR_ALLOW_UNSAFE_SCHEDULING", "false").lower() == "true":
+            return branch_name, local_commit
+        return_code = run_shell_command("git diff --exit-code")[1]
+        is_committed = return_code == 0
+
         if not is_committed:
-
-
-                "Uncommitted changes detected. Please commit your changes before running the job.",
+            raise RepoValidationError(
+                "Please commit your changes before running the job."
            )
 
-
-            ("git rev-parse HEAD"), shell=True, capture_output=True, text=True
-        ).stdout.strip()
-        remote_commit = (
-            subprocess.run(
-                ("git ls-remote origin HEAD"),
-                shell=True,
-                capture_output=True,
-                text=True,
-            )
-            .stdout.strip()
-            .split("\t")[0]
-        )
+        remote_commit = run_shell_command("git ls-remote origin HEAD")[0].split("\t")[0]
 
         if local_commit != remote_commit:
-
-
-                "Local commit does not match remote HEAD. Please pull the latest changes before running the job.",
+            raise RepoValidationError(
+                "Please sync your local repository with the remote before running the job."
            )
 
-
-            ("git rev-parse --abbrev-ref HEAD"),
-            shell=True,
-            capture_output=True,
-            text=True,
-        ).stdout.strip()
-        return True, ""
+        return branch_name, local_commit
 
-    def
-
-
-
-
-
-
-
-        check_result = self.verify_repo_is_ready()
-        if not check_result[0]:
-            raise RepoValidationError(check_result[1])
-        logger.info(
-            f"Running job '{self.name}' in environment '{self.environment}' as '{self.run_as}'"
-        )
+    def __prepare__(self) -> str:
+        branch_name, local_commit = self.verify_repo_is_ready()
+        self.image.update(
+            branch_name=branch_name,
+            commit_hash=local_commit,
+        )
+        logger.info(
+            f"Running job '{self.name}' in environment '{self.environment}' as '{self.run_as}'"
+        )
 
-
-
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as temp_file:
+            temp_file.write(self.to_json().encode())
+            return temp_file.name
 
-
-
+    def get_schedule_args(self) -> dict:
+        """
+        Returns additional arguments for scheduling the job.
+        Override or extend this method as needed.
+        """
+        return {}
 
-
-
+    def __run_command__(self, command: str) -> Tuple[bool, str]:
+        """
+        Run a command in the context of the job.
+        This is used to execute the job's entry point.
+        """
+        if not is_dt_installed():
            raise NotImplementedError(
                "DataTailr is not installed. Please install DataTailr to run this job."
            )
+        try:
+            temp_file_name = self.__prepare__()
+
+            if command == "run":
+                __client__.run(f"file://{temp_file_name}", **self.get_schedule_args())
+            elif command == "save":
+                __client__.save(f"file://{temp_file_name}", **self.get_schedule_args())
+            else:
+                raise ValueError(f"Unknown command: {command}")
+            os.remove(temp_file_name)
+        except Exception as e:
+            logger.error(f"Error running command '{command}': {e}")
+            return False, str(e)
+        return True, f"Job '{self.name}' {command}d successfully."
+
+    def save(self) -> Tuple[bool, str]:
+        """
+        Save the job to the DataTailr platform.
+        If the job already exists, it will be updated.
+        """
+        return self.__run_command__("save")
+
+    def run(self) -> Tuple[bool, str]:
+        """
+        Run the job. This method should be implemented to execute the job logic.
+        It verifies the repository state and prepares the job for execution.
+        """
+        return self.__run_command__("run")
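Example: the sketch below exercises the new Job API from this release (set_allow_unsafe_scheduling, the update_existing flag, and the save/run pair). It is a minimal sketch, not taken from the package docs: the import paths follow the file layout above (whether these names are re-exported elsewhere is an assumption), the job name and resource values are made up, and it assumes a machine with git and the DataTailr platform available.

from datatailr import Environment
from datatailr.scheduler.base import Job, Resources, set_allow_unsafe_scheduling

# Skip the clean-repo check in verify_repo_is_ready() by setting
# DATATAILR_ALLOW_UNSAFE_SCHEDULING (new in this release).
set_allow_unsafe_scheduling(True)

job = Job(
    name="nightly-report",  # hypothetical job name
    environment=Environment.DEV,  # now the default when omitted
    resources=Resources(memory="512m", cpu=2),
    update_existing=True,  # reuse an existing job with this name, if any
)

# Both save() and run() serialize the job to a temporary JSON file and hand
# a file:// URI to the platform client; each returns (success, message).
ok, message = job.save()
print(ok, message)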
datatailr/scheduler/batch.py
CHANGED
@@ -14,9 +14,11 @@ import contextvars
 import json
 import os
 from functools import reduce
-from typing import Dict, List, Optional, Sequence, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
+import uuid
 
 from datatailr import Image
+from datatailr.dt_json import encode_json
 from datatailr.errors import BatchJobError
 from datatailr.logging import DatatailrLogger
 from datatailr.scheduler.base import (
@@ -29,9 +31,12 @@ from datatailr.scheduler.base import (
     User,
 )
 from datatailr.scheduler.constants import DEFAULT_TASK_CPU, DEFAULT_TASK_MEMORY
+from datatailr.scheduler.arguments_cache import ArgumentsCache
+from datatailr.scheduler.schedule import Schedule
 from datatailr.utils import is_dt_installed
 
 __DAG_CONTEXT__: contextvars.ContextVar = contextvars.ContextVar("dag_context")
+__ARGUMENTS_CACHE__ = ArgumentsCache()
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
 
 
@@ -39,13 +44,6 @@ def get_current_manager():
     return __DAG_CONTEXT__.get(None)
 
 
-def next_batch_job_id():
-    i = 0
-    while True:
-        yield i
-        i += 1
-
-
 class CyclicDependencyError(BatchJobError):
     """
     Exception raised when a cyclic dependency is detected in the batch job dependencies.
@@ -79,6 +77,12 @@ class MissingDagError(BatchJobError):
         )
 
 
+class CodePackageMismatchError(BatchJobError):
+    def __init__(self, message: str):
+        super().__init__(message)
+        self.message = message
+
+
 class BatchJob:
     """
     Represents a job within a batch job.
@@ -93,6 +97,7 @@ class BatchJob:
         resources: Optional[Resources] = None,
         dependencies: Sequence[Union[str, BatchJob]] = [],
         dag: Optional[Batch] = get_current_manager(),
+        argument_mapping: Dict[str, str] = {},
     ):
         self.name = name
         self.entrypoint = entrypoint
@@ -102,12 +107,14 @@ class BatchJob:
             raise MissingDagError()
         self.__id = dag.next_job_id
         self.dag = dag
+        self.__args: Dict[str, Any] = {}
         self.dag.__BATCH_JOB_NAMES__[self.name] = self.__id
         self.dependencies = self.translate_dependencies()
         assert all(
             isinstance(dep, int) for dep in self.dependencies
         ), "All dependencies must be integers representing job IDs."
         self.dag.add_job(self)
+        self.__argument_mapping = argument_mapping or {}
 
     def __call__(self, *args, **kwds) -> BatchJob:
         """
@@ -116,6 +123,22 @@ class BatchJob:
         """
         return self
 
+    @property
+    def args(self) -> Dict[str, Any]:
+        """
+        Returns the arguments for the BatchJob instance.
+        """
+        return self.__args or {}
+
+    @args.setter
+    def args(self, args: Dict[str, Any]):
+        """
+        Sets the arguments for the BatchJob instance.
+        """
+        if not isinstance(args, dict):
+            raise TypeError(f"Expected a dictionary for args, got {type(args)}")
+        self.__args = args
+
     @property
     def id(self) -> int:
         """
@@ -123,7 +146,7 @@ class BatchJob:
         """
         return self.__id
 
-    def alias(self, name: str):
+    def alias(self, name: str) -> BatchJob:
         """
         Set an alias for the BatchJob instance.
 
@@ -136,19 +159,48 @@ class BatchJob:
         self.name = name
         return self
 
+    def set_resources(
+        self,
+        resources: Optional[Resources] = None,
+        memory: Optional[str] = None,
+        cpu: Optional[float] = None,
+    ) -> BatchJob:
+        """
+        Set the resources for the BatchJob instance.
+
+        :param resources: The Resources instance to set.
+        """
+        if resources is not None:
+            if not isinstance(resources, Resources):
+                raise TypeError(f"Expected Resources instance, got {type(resources)}")
+        else:
+            resources = Resources(
+                memory=memory or DEFAULT_TASK_MEMORY, cpu=cpu or DEFAULT_TASK_CPU
+            )
+        self.resources = resources
+        return self
+
     def __repr__(self):
         return (
             f"BatchJob(name={self.name}, entrypoint={self.entrypoint}, "
             f"resources={self.resources}) (id={self.__id})"
         )
 
+    def __getstate__(self) -> object:
+        state = self.__dict__.copy()
+        state.pop("dag", None)
+        return state
+
+    def __setstate__(self, state: dict):
+        self.__dict__.update(state)
+
     def to_dict(self):
         """
         Convert the BatchJob instance to a dictionary representation.
         """
         return {
             "display_name": self.name,
-            "
+            "child_number": self.__id,
             "entrypoint": str(self.entrypoint),
             "memory": self.resources.memory if self.resources else DEFAULT_TASK_MEMORY,
             "cpu": self.resources.cpu if self.resources else DEFAULT_TASK_CPU,
@@ -183,6 +235,9 @@ class BatchJob:
 
     def __add_dependency__(self, other):
         self.dependencies.add(other.__id)
+        arg_name = self.__argument_mapping.get(other.name, other.name)
+        if arg_name is not None:
+            self.__args[arg_name] = other
 
     def __lshift__(
         self, other: Sequence[BatchJob] | BatchJob
@@ -223,7 +278,13 @@ class BatchJob:
         Execute the job's entrypoint.
         """
         if isinstance(self.entrypoint, EntryPoint):
-
+            env = {
+                "DATATAILR_BATCH_ID": str(self.dag.id),
+                "DATATAILR_JOB_ID": str(self.__id),
+                "DATATAILR_JOB_NAME": self.name,
+                "DATATAILR_JOB_ARGUMENT_MAPPING": encode_json(self.__argument_mapping),
+            }
+            self.entrypoint(env=env)
         else:
             raise TypeError(f"Invalid entrypoint type: {type(self.entrypoint)}")
 
@@ -237,12 +298,17 @@ class Batch(Job):
 
     def __init__(
         self,
-        environment: Optional[Environment],
         name: str,
-
-
+        environment: Optional[Environment] = Environment.DEV,
+        schedule: Optional[Schedule] = None,
+        image: Optional[Image] = None,
+        run_as: Optional[Union[str, User]] = None,
         resources: Resources = Resources(memory="100m", cpu=1),
         acl: Optional[ACL] = None,
+        local_run: bool = False,
+        python_requirements: str = "",
+        build_script_pre: str = "",
+        build_script_post: str = "",
     ):
         super().__init__(
             environment=environment,
@@ -251,19 +317,25 @@ class Batch(Job):
             run_as=run_as,
             resources=resources,
             acl=acl,
+            python_requirements=python_requirements,
+            build_script_pre=build_script_pre,
+            build_script_post=build_script_post,
+            type=JobType.BATCH,
         )
-        self.type = JobType.BATCH
         self.__jobs: List[BatchJob] = []
         self._auto_run = False
-        self.__next_job_id =
+        self.__next_job_id = -1
         self.__BATCH_JOB_NAMES__: Dict[str, int] = {}
+        self.__local_run = local_run
+        self.__schedule = schedule
 
     @property
     def next_job_id(self):
         """
         Returns a generator for the next job ID in the batch.
         """
-
+        self.__next_job_id += 1
+        return self.__next_job_id
 
     def add_job(self, job: BatchJob):
         """
@@ -279,6 +351,25 @@ class Batch(Job):
             raise DuplicateJobNameError(job.name)
         # Use the batch level resource values as defaults for jobs
         job.resources = job.resources or self.resources
+        image_path_to_repo = self.image.path_to_repo
+        image_path_to_module = self.image.path_to_module
+        package_path_to_repo = job.entrypoint.path_to_repo
+        package_path_to_module = job.entrypoint.path_to_module
+
+        if image_path_to_repo is None:
+            self.image.path_to_repo = package_path_to_repo
+        elif package_path_to_repo != image_path_to_repo:
+            raise CodePackageMismatchError(
+                f"Function {job.entrypoint.function_name} is defined in a different package root: "
+                f"{package_path_to_repo} != {image_path_to_repo}"
+            )
+        if image_path_to_module is None:
+            self.image.path_to_module = package_path_to_module
+        elif package_path_to_module != image_path_to_module:
+            raise CodePackageMismatchError(
+                f"Function {job.entrypoint.function_name} is defined in a different module: "
+                f"{package_path_to_module} != {image_path_to_module}"
+            )
         self.__jobs.append(job)
 
     def is_job_in(self, job: BatchJob) -> bool:
@@ -293,6 +384,7 @@ class Batch(Job):
         """
         batch_dict = super().to_dict()
         batch_dict["jobs"] = [job.to_dict() for job in self.__jobs]
+        batch_dict["schedule"] = str(self.__schedule) if self.__schedule else None
         return batch_dict
 
     def to_json(self):
@@ -345,11 +437,41 @@ class Batch(Job):
                 "A cyclic dependency exists amongst {}".format(jobs)
             )
 
+    def get_schedule_args(self) -> Dict[str, Any]:
+        if isinstance(self.__schedule, Schedule):
+            args = {
+                "at_minutes": self.__schedule.at_minutes,
+                "every_minute": self.__schedule.every_minute,
+                "at_hours": self.__schedule.at_hours,
+                "every_hour": self.__schedule.every_hour,
+                "weekdays": self.__schedule.weekdays,
+                "day_of_month": self.__schedule.day_of_month,
+                "in_month": self.__schedule.in_month,
+                "every_month": self.__schedule.every_month,
+                "timezone": self.__schedule.timezone,
+                "run_after_job_uuid": self.__schedule.run_after_job_uuid,
+                "run_after_job_name": self.__schedule.run_after_job_name,
+                "run_after_job_condition": self.__schedule.run_after_job_condition,
+            }
+            args = {key: value for key, value in args.items() if value is not None}
+            for key, value in args.items():
+                if isinstance(value, list):
+                    args[key] = ",".join(map(str, value))
+            return args
+        return {}
+
     def run(self) -> Tuple[bool, str]:
-
+        def arg_name(arg: Union[BatchJob, str]) -> str:
+            return arg.name if isinstance(arg, BatchJob) else arg
+
+        args = {
+            j.name: {k: arg_name(v) for k, v in j.args.items()} for j in self.__jobs
+        }
+        __ARGUMENTS_CACHE__.add_arguments(self.id, args)
+        if not self.__local_run and is_dt_installed():
             return super().run()
         else:
-            os.environ["DATATAILR_BATCH_RUN_ID"] =
+            os.environ["DATATAILR_BATCH_RUN_ID"] = uuid.uuid4().hex[:8]
             for step in self.__topological_sort__():
                 for job_id in step:
                     job = self.__jobs[job_id]