datatailr 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datatailr might be problematic.
- datatailr/__init__.py +1 -35
- datatailr/acl.py +35 -3
- datatailr/blob.py +13 -13
- datatailr/build/image.py +38 -2
- datatailr/dt_json.py +32 -0
- datatailr/errors.py +17 -0
- datatailr/group.py +19 -13
- datatailr/logging.py +27 -10
- datatailr/sbin/datatailr_run.py +147 -0
- datatailr/sbin/datatailr_run_app.py +28 -0
- datatailr/sbin/{run_job.py → datatailr_run_batch.py} +5 -20
- datatailr/scheduler/__init__.py +24 -8
- datatailr/scheduler/arguments_cache.py +71 -43
- datatailr/scheduler/base.py +195 -69
- datatailr/scheduler/batch.py +141 -19
- datatailr/scheduler/batch_decorator.py +53 -24
- datatailr/scheduler/constants.py +1 -1
- datatailr/scheduler/schedule.py +117 -0
- datatailr/scheduler/utils.py +3 -1
- datatailr/user.py +30 -17
- datatailr/utils.py +20 -0
- datatailr/wrapper.py +0 -6
- {datatailr-0.1.6.dist-info → datatailr-0.1.8.dist-info}/METADATA +37 -4
- datatailr-0.1.8.dist-info/RECORD +30 -0
- datatailr-0.1.8.dist-info/entry_points.txt +4 -0
- datatailr-0.1.8.dist-info/top_level.txt +1 -0
- datatailr-0.1.6.dist-info/RECORD +0 -29
- datatailr-0.1.6.dist-info/entry_points.txt +0 -2
- datatailr-0.1.6.dist-info/top_level.txt +0 -2
- test_module/__init__.py +0 -17
- test_module/test_submodule.py +0 -38
- {datatailr-0.1.6.dist-info → datatailr-0.1.8.dist-info}/WHEEL +0 -0
- {datatailr-0.1.6.dist-info → datatailr-0.1.8.dist-info}/licenses/LICENSE +0 -0
datatailr/sbin/datatailr_run_app.py
ADDED
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+# *************************************************************************
+#
+# Copyright (c) 2025 - Datatailr Inc.
+# All Rights Reserved.
+#
+# This file is part of Datatailr and subject to the terms and conditions
+# defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
+# of this file, in parts or full, via any medium is strictly prohibited.
+# *************************************************************************
+
+import os
+
+from datatailr.logging import DatatailrLogger
+
+logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
+
+
+def run():
+    logger.info("Starting Datatailr app...")
+    entrypoint = os.environ.get("DATATAILR_ENTRYPOINT")
+
+    if entrypoint is None:
+        raise ValueError("Environment variable 'DATATAILR_ENTRYPOINT' is not set.")
+
+    os.system(entrypoint)
+    logger.info(f"Running entrypoint: {entrypoint}")
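The new app runner does little more than read DATATAILR_ENTRYPOINT and hand it to os.system. A minimal sketch of exercising it locally, assuming the module is importable as datatailr.sbin.datatailr_run_app and using an invented Streamlit command as the entrypoint value:

    import os

    # Assumed entrypoint command; on the platform this variable is set for the container.
    os.environ["DATATAILR_ENTRYPOINT"] = "streamlit run dashboard.py"

    from datatailr.sbin import datatailr_run_app

    # run() logs, checks that the variable is set, then executes it via os.system().
    datatailr_run_app.run()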
datatailr/sbin/{run_job.py → datatailr_run_batch.py}
RENAMED
@@ -12,19 +12,19 @@
 
 import importlib
 import os
-import pickle
 
-from datatailr import dt__Blob
 from datatailr.logging import DatatailrLogger
 
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
 
 
-def
+def run():
+    logger.info("Running Datatailr batch job")
     entry_point = os.environ.get("DATATAILR_BATCH_ENTRYPOINT")
     batch_run_id = os.environ.get("DATATAILR_BATCH_RUN_ID")
     batch_id = os.environ.get("DATATAILR_BATCH_ID")
     job_id = os.environ.get("DATATAILR_JOB_ID")
+    logger.info(f"Batch run ID: {batch_run_id}, Batch ID: {batch_id}, Job ID: {job_id}")
 
     if entry_point is None:
         raise ValueError(
@@ -44,20 +44,5 @@ def main():
         raise ValueError(
             f"The function '{func_name}' in module '{module_name}' is not callable."
         )
-
-
-    with open(result_path, "wb") as f:
-        pickle.dump(result, f)
-    blob = dt__Blob()
-    blob.cp(result_path, "blob://")
-    logger.info(f"{result_path} copied to blob storage.")
-
-
-if __name__ == "__main__":
-    try:
-        logger.debug("Starting job execution...")
-        main()
-        logger.debug("Job executed successfully.")
-    except Exception as e:
-        logger.error(f"Error during job execution: {e}")
-        raise
+    function()
+    logger.info("Datatailr batch job completed successfully.")
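After the rename, the batch runner resolves everything from environment variables and delegates result persistence to the arguments cache (see arguments_cache.py below) instead of pickling and copying to blob storage itself. A rough sketch of the variables it reads, with invented placeholder values; the exact format expected in DATATAILR_BATCH_ENTRYPOINT is not visible in this diff:

    import os

    # Placeholder values; on the platform these are injected into the job's container.
    os.environ["DATATAILR_BATCH_ENTRYPOINT"] = "my_package.jobs.daily_report"  # assumed module/function reference
    os.environ["DATATAILR_BATCH_RUN_ID"] = "run-0001"
    os.environ["DATATAILR_BATCH_ID"] = "batch-0001"
    os.environ["DATATAILR_JOB_ID"] = "job-0001"

    from datatailr.sbin import datatailr_run_batch

    datatailr_run_batch.run()  # imports the target function, calls it, and logs completion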
datatailr/scheduler/__init__.py
CHANGED
@@ -8,31 +8,47 @@
 # of this file, in parts or full, via any medium is strictly prohibited.
 # *************************************************************************
 
-
+r"""
+Datatailr Scheduler Module
+==========================
+
+The `datatailr.scheduler` module provides a framework for scheduling and managing batch jobs.
+
+The main job types are:
+_______________________
+
+- **Batch**: Represents a batch job that can be scheduled and executed.
+  The job can include multiple tasks which can be run in parallel or sequentially.
+- **Service**: Represents a service job that runs continuously.
+- **App**: Represents a web app or a dashboard, which can be built using one of the supported frameworks,
+  such as `Streamlit <https://streamlit.io/>`_, `Dash <https://dash.plotly.com/>`_, or `Panel <https://panel.holoviz.org/>`_.
+- **Excel**: Represents an Excel add-in.
+"""
+
+from datatailr.errors import BatchJobError
 from datatailr.scheduler.base import (
-    ACL,
     EntryPoint,
     Environment,
     Job,
     JobType,
     Resources,
-
+    set_allow_unsafe_scheduling,
 )
 from datatailr.scheduler.batch import Batch, BatchJob, DuplicateJobNameError
-from datatailr.scheduler.batch_decorator import batch_decorator as
+from datatailr.scheduler.batch_decorator import batch_decorator as batch_job
+from datatailr.scheduler.schedule import Schedule
 
 __all__ = [
     "Job",
     "JobType",
     "Environment",
-    "User",
     "Resources",
-    "ACL",
     "EntryPoint",
     "Batch",
     "BatchJob",
-    "
-    "DatatailrError",
+    "batch_job",
     "BatchJobError",
     "DuplicateJobNameError",
+    "set_allow_unsafe_scheduling",
+    "Schedule",
 ]
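For orientation, the re-exports above make the decorator, the new Schedule class, and the unsafe-scheduling toggle importable straight from the subpackage. A small sketch of the new import surface; the Schedule and batch_job signatures are not shown in this diff, so only the names and the toggle are exercised:

    from datatailr.scheduler import (
        Batch,
        Schedule,
        batch_job,
        set_allow_unsafe_scheduling,
    )

    # Allow scheduling from a checkout with uncommitted or unpushed changes
    # (sets DATATAILR_ALLOW_UNSAFE_SCHEDULING; see base.py below).
    set_allow_unsafe_scheduling(True)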
datatailr/scheduler/arguments_cache.py
CHANGED
@@ -21,17 +21,26 @@ and the inner dictionaries contain the arguments.
 This module is for internal use of the datatailr package.
 """
 
-from
+from datatailr.dt_json import json, decode_json
+import os
 import pickle
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 from datatailr import is_dt_installed, Blob
-from datatailr.
+from datatailr.errors import DatatailrError
 
 
 __BLOB_STORAGE__ = Blob()
 
 
+class CacheNotFoundError(DatatailrError):
+    """Custom error for cache operations."""
+
+    def __init__(self, message: str):
+        super().__init__(message)
+        self.message = message
+
+
 class ArgumentsCache:
     def __init__(self, use_persistent_cache: bool = is_dt_installed()):
         """
@@ -40,11 +49,12 @@ class ArgumentsCache:
         :param use_persistent_cache: If True, use the persistent cache backend. Otherwise, use in-memory cache.
         """
         self.use_persistent_cache = use_persistent_cache
-        self.
-
-
+        if not self.use_persistent_cache:
+            # Create a temp folder, for local caching
+            os.makedirs("/tmp/datatailr/batch/arguments", exist_ok=True)
+            os.makedirs("/tmp/datatailr/batch/results", exist_ok=True)
 
-    def add_arguments(self,
+    def add_arguments(self, batch_id: str, arguments: Dict[str, Any]):
         """
         Add arguments to the cache for a specific job and batch run.
 
@@ -52,13 +62,16 @@ class ArgumentsCache:
         :param job_name: Name of the job.
         :param arguments: Dictionary of arguments to store.
         """
-
-
+        path = f"/tmp/datatailr/batch/arguments/{batch_id}.pkl"
+        if self.use_persistent_cache:
             self._add_to_persistent_cache(path, arguments)
         else:
-
+            with open(path, "wb") as f:
+                pickle.dump(arguments, f)
 
-    def get_arguments(
+    def get_arguments(
+        self, batch_id: str, job: str, batch_run_id: Optional[str]
+    ) -> Dict[str, Any]:
         """
         Retrieve arguments from the cache for a specific job and batch run.
 
@@ -66,27 +79,37 @@ class ArgumentsCache:
         :param job_name: Name of the job.
         :return: Dictionary of arguments.
         """
+        path = f"/tmp/datatailr/batch/arguments/{batch_id}.pkl"
         if self.use_persistent_cache and isinstance(job, str):
-
-
-
-
-                    f"Expected a dictionary for arguments, got {type(arg_keys)}"
-                )
+            try:
+                arg_keys = self._get_from_persistent_cache(path)
+            except RuntimeError:
+                return {}
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if not os.path.exists(path):
+                raise CacheNotFoundError(
+                    f"Cache file not found: {path}. Ensure that the arguments have been cached."
+                )
+            with open(path, "rb") as f:
+                try:
+                    arg_keys = pickle.load(f)
+                except EOFError:
+                    return {}
+        if not isinstance(arg_keys, dict):
+            raise TypeError(
+                f"Expected a dictionary for arguments, got {type(arg_keys)}"
+            )
+        if batch_run_id is None:
+            return arg_keys[job]
+        arguments_mapping = decode_json(
+            os.getenv("DATATAILR_JOB_ARGUMENT_MAPPING", "{}")
+        )
+        arguments_mapping = {value: key for key, value in arguments_mapping.items()}
+        args = {
+            arguments_mapping.get(name, name): self.get_result(batch_run_id, value)
+            for name, value in arg_keys[job].items()
+        }
+        return args
 
     def add_result(self, batch_run_id: str, job: str, result: Any):
         """
@@ -96,13 +119,14 @@ class ArgumentsCache:
         :param job: Name of the job.
         :param result: Result of the batch job.
         """
+        path = f"/tmp/datatailr/batch/results/{batch_run_id}_{job}.pkl"
         if self.use_persistent_cache and isinstance(job, str):
-            path = f"{batch_run_id}/{job}/result"
             self._add_to_persistent_cache(path, result)
         else:
-
+            with open(path, "wb") as f:
+                pickle.dump(result, f)
 
-    def get_result(self, batch_run_id: str, job:
+    def get_result(self, batch_run_id: str, job: Any) -> Any:
         """
         Retrieve the result of a batch job from the cache.
 
@@ -110,10 +134,17 @@ class ArgumentsCache:
         :param job: Name of the job.
         :return: Result of the batch job.
         """
+        path = f"/tmp/datatailr/batch/results/{batch_run_id}_{job}.pkl"
         if self.use_persistent_cache and isinstance(job, str):
-            path = f"{batch_run_id}/{job}/result"
             return self._get_from_persistent_cache(path)
-
+        else:
+            if not os.path.exists(path):
+                return job
+            with open(path, "rb") as f:
+                try:
+                    return pickle.load(f)
+                except EOFError:
+                    return None
 
     def _add_to_persistent_cache(self, path: str, blob: Any):
         """
@@ -124,9 +155,8 @@ class ArgumentsCache:
         :raises TypeError: If the blob cannot be pickled.
 
         """
-
-
-        )
+        path = path.replace("/tmp/", "")
+        __BLOB_STORAGE__.put_blob(path, json.dumps(blob))
 
     def _get_from_persistent_cache(self, path: str) -> Any:
         """
@@ -134,8 +164,6 @@ class ArgumentsCache:
 
         :param path: Path in the Blob storage where the blob is stored.
         """
-
-
-
-        except (TypeError, EOFError):
-            return {}
+        path = path.replace("/tmp/", "")
+        data = __BLOB_STORAGE__.get_blob(path)
+        return json.loads(data)
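Taken together, the non-persistent path of ArgumentsCache now round-trips arguments and results through pickle files under /tmp/datatailr/batch/, while the persistent path JSON-encodes values into Blob storage. A hedged local sketch of the pickle-backed flow, using invented batch and run identifiers and assuming the module imports cleanly without the platform installed:

    from datatailr.scheduler.arguments_cache import ArgumentsCache

    cache = ArgumentsCache(use_persistent_cache=False)  # force the /tmp pickle backend

    # Arguments are keyed by batch id, then by job name.
    cache.add_arguments("batch-0001", {"load": {"n_rows": 100}})
    cache.add_result("run-0001", "load", [1, 2, 3])

    # With batch_run_id=None the raw argument dict for the job is returned.
    print(cache.get_arguments("batch-0001", "load", None))  # {'n_rows': 100}
    print(cache.get_result("run-0001", "load"))             # [1, 2, 3]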
datatailr/scheduler/base.py
CHANGED
@@ -8,7 +8,11 @@
 # of this file, in parts or full, via any medium is strictly prohibited.
 # *************************************************************************
 
+from __future__ import annotations
+
+from datetime import datetime
 import importlib
+import inspect
 import json
 import os
 import subprocess
@@ -18,12 +22,27 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import Callable, Optional, Tuple, Union
 
-from datatailr import ACL, Environment, User,
+from datatailr import ACL, Environment, User, is_dt_installed
+from datatailr.wrapper import dt__Job
+from datatailr.scheduler.constants import DEFAULT_TASK_MEMORY, DEFAULT_TASK_CPU
 from datatailr.build.image import Image
 from datatailr.errors import BatchJobError
 from datatailr.logging import DatatailrLogger
+from datatailr.utils import run_shell_command
 
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
+__client__ = dt__Job()
+
+
+def set_allow_unsafe_scheduling(allow: bool):
+    """
+    Set whether to allow unsafe scheduling of jobs.
+    This is a global setting that affects how jobs are scheduled.
+    """
+    if allow:
+        os.environ["DATATAILR_ALLOW_UNSAFE_SCHEDULING"] = "true"
+    else:
+        os.environ.pop("DATATAILR_ALLOW_UNSAFE_SCHEDULING", None)
 
 
 class RepoValidationError(BatchJobError):
@@ -40,6 +59,7 @@ class JobType(Enum):
     BATCH = "batch"
     SERVICE = "service"
     APP = "app"
+    EXCEL = "excel"
     UNKNOWN = "unknown"
 
     def __str__(self):
@@ -55,8 +75,14 @@ class Resources:
     Represents the resources required for a job.
     """
 
-    memory: str =
-    cpu:
+    memory: str = DEFAULT_TASK_MEMORY
+    cpu: float = DEFAULT_TASK_CPU
+
+
+# TODO: create a dt_run script that will:
+# 1. create user and group if not exists
+# 2. set the correct path
+# 3. run the job based on its type
 
 
 class EntryPoint:
@@ -68,26 +94,30 @@ class EntryPoint:
     def __init__(
         self,
         type: JobType,
-        func:
-        module_name: Optional[str] = None,
-        function_name: Optional[str] = None,
+        func: Callable,
     ):
-        if func is None and (module_name is None or function_name is None):
-            raise ValueError(
-                "Either a function or module and function names must be provided."
-            )
         self.func = func
-        self.module_name = func.__module__
-        self.function_name = func.__name__
+        self.module_name = func.__module__
+        self.function_name = func.__name__
         self.type = type
 
+        # Find the absolute path to the repository and then the relative path to the module.
+        # This will be used in the creation of the code 'bundle' when building the image.
+        path_to_repo = run_shell_command("git rev-parse --show-toplevel")[0]
+        path_to_code = inspect.getfile(func)
+        package_root = path_to_code
+        module_parts = self.module_name.split(".")
+        for _ in module_parts:
+            package_root = os.path.dirname(package_root)
+        path_to_module = os.path.relpath(package_root, path_to_repo)
+        self.path_to_repo = path_to_repo
+        self.path_to_module = path_to_module
+
     def __call__(self, *args, **kwargs):
+        os.environ.update(kwargs.pop("env", {}))
         if self.type == JobType.BATCH:
-
-
-            func = getattr(module, self.function_name)
-        elif self.func is not None:
-            func = self.func
+            module = importlib.import_module(self.module_name)
+            func = getattr(module, self.function_name)
             return func(*args, **kwargs)
 
         elif self.type == JobType.SERVICE:
@@ -106,13 +136,28 @@ class EntryPoint:
 class Job:
     def __init__(
         self,
-        environment: Optional[Environment],
         name: str,
-
-
+        environment: Optional[Environment] = Environment.DEV,
+        image: Optional[Image] = None,
+        run_as: Optional[Union[str, User]] = User.signed_user(),
         resources: Resources = Resources(memory="100m", cpu=1),
         acl: Optional[ACL] = None,
+        python_requirements: str = "",
+        build_script_pre: str = "",
+        build_script_post: str = "",
+        type: JobType = JobType.UNKNOWN,
+        entrypoint: Optional[EntryPoint] = None,
+        update_existing: bool = False,
     ):
+        if environment is None:
+            environment = Environment.DEV
+
+        if update_existing:
+            existing_job = self.__get_existing__(name, environment)
+            if existing_job:
+                self.from_dict(existing_job)
+                return
+
         if run_as is None:
             run_as = User.signed_user()
         if environment is None:
@@ -126,11 +171,16 @@ class Job:
         self.name = name
         self.run_as = run_as
         self.resources = resources
+        if image is None:
+            image = Image(
+                acl=self.acl,
+                python_requirements=python_requirements,
+                build_script_pre=build_script_pre,
+                build_script_post=build_script_post,
+            )
         self.image = image
-
-
-        self.type: JobType = JobType.UNKNOWN
-        self.entrypoint = None
+        self.type = type
+        self.entrypoint = entrypoint
         self.__id = str(uuid.uuid4())
 
     @property
@@ -140,6 +190,25 @@ class Job:
         """
         return self.__id
 
+    @classmethod
+    def __get_existing__(
+        cls, job_name: str, environment: Environment
+    ) -> Optional[dict]:
+        """
+        Retrieve an existing job instance from the DataTailr platform.
+        Based on the job name and environment.
+        """
+        job_list = __client__.ls(filter=f"name={job_name},environment={environment}")
+        if not isinstance(job_list, list):
+            return None
+        if len(job_list) == 0:
+            return None
+        if len(job_list) > 1:
+            raise BatchJobError(
+                f"Multiple jobs found with name '{job_name}' in environment '{environment}'."
+            )
+        return job_list[0]
+
     def __repr__(self):
         return (
             f"Job(name={self.name}, environment={self.environment}, "
@@ -169,31 +238,64 @@ class Job:
         job_dict["cpu"] = self.resources.cpu
         return job_dict
 
+    def from_dict(self, job_dict: dict):
+        self.name = job_dict["name"]
+        self.image = job_dict["image"]
+
+        environment = job_dict.get("environment", "dev")
+        environment = Environment(environment.lower())
+        self.environment = environment
+
+        user = job_dict["run_as"]["name"]
+        user = User(user.lower())
+        self.run_as = user
+
+        self.resources = Resources(memory=job_dict["memory"], cpu=job_dict["num_cpus"])
+        acl = job_dict.get("acl", None)
+        if acl is None:
+            acl = ACL(user=self.run_as)
+        else:
+            acl = ACL.from_dict(acl)
+        self.acl = acl
+        self.python_requirements = (job_dict.get("python_requirements", ""),)
+        self.build_script_pre = (job_dict.get("build_script_pre", ""),)
+        self.build_script_post = (job_dict.get("build_script_post", ""),)
+        self.type = JobType(job_dict.get("type", "unknown").lower())
+        self.state = job_dict["state"]
+        self.create_time = datetime.fromtimestamp(job_dict["create_time"] * 1e-6)
+        self.version = job_dict["version"]
+        self.__id = job_dict["id"]
+
     def to_json(self):
         """
         Convert the Job instance to a JSON string representation.
         """
        return json.dumps(self.to_dict())
 
-    def verify_repo_is_ready(self) -> Tuple[
-
-
-
-
-
-        )
+    def verify_repo_is_ready(self) -> Tuple[str, str]:
+        """
+        Verify if the repository is ready for job execution.
+        The check consists of two parts:
+        1. Check if there are uncommitted changes in the repository.
+        2. Check if the local commit matches the remote HEAD (the repo is synced with the remote).
+        Returns a tuple of (branch: str, commit_hash: str).
+        """
+        local_commit = run_shell_command("git rev-parse HEAD")[0]
+        branch_name = run_shell_command("git rev-parse --abbrev-ref HEAD")[0]
+
+        if os.getenv("DATATAILR_ALLOW_UNSAFE_SCHEDULING", "false").lower() == "true":
+            return branch_name, local_commit
+        return_code = run_shell_command("git diff --exit-code")[1]
+        is_committed = return_code == 0
+
         if not is_committed:
-
-
-                "Uncommitted changes detected. Please commit your changes before running the job.",
+            raise RepoValidationError(
+                "Please commit your changes before running the job."
             )
 
-        local_commit = subprocess.run(
-            ("git rev-parse HEAD"), shell=True, capture_output=True, text=True
-        ).stdout.strip()
         remote_commit = (
             subprocess.run(
-                ("git ls-remote origin HEAD"),
+                ("remote_commit = $(git ls-remote origin HEAD)"),
                 shell=True,
                 capture_output=True,
                 text=True,
@@ -203,43 +305,67 @@ class Job:
         )
 
         if local_commit != remote_commit:
-
-
-                "Local commit does not match remote HEAD. Please pull the latest changes before running the job.",
+            raise RepoValidationError(
+                "Please sync your local repository with the remote before running the job."
             )
 
-
-            ("git rev-parse --abbrev-ref HEAD"),
-            shell=True,
-            capture_output=True,
-            text=True,
-        ).stdout.strip()
-        return True, ""
+        return branch_name, local_commit
 
-    def
-
-
-
-
-
-
-
-
-        check_result = self.verify_repo_is_ready()
-        if not check_result[0]:
-            raise RepoValidationError(check_result[1])
-        logger.info(
-            f"Running job '{self.name}' in environment '{self.environment}' as '{self.run_as}'"
-        )
+    def __prepare__(self) -> str:
+        branch_name, local_commit = self.verify_repo_is_ready()
+        self.image.update(
+            branch_name=branch_name,
+            commit_hash=local_commit,
+        )
+        logger.info(
+            f"Running job '{self.name}' in environment '{self.environment}' as '{self.run_as}'"
+        )
 
-
-
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as temp_file:
+            temp_file.write(self.to_json().encode())
+            return temp_file.name
 
-
-
+    def get_schedule_args(self) -> dict:
+        """
+        Returns additional arguments for scheduling the job.
+        Override or extend this method as needed.
+        """
+        return {}
 
-
-
+    def __run_command__(self, command: str) -> Tuple[bool, str]:
+        """
+        Run a command in the context of the job.
+        This is used to execute the job's entry point.
+        """
+        if not is_dt_installed():
             raise NotImplementedError(
                 "DataTailr is not installed. Please install DataTailr to run this job."
             )
+        try:
+            temp_file_name = self.__prepare__()
+
+            if command == "run":
+                __client__.run(f"file://{temp_file_name}", **self.get_schedule_args())
+            elif command == "save":
+                __client__.save(f"file://{temp_file_name}", **self.get_schedule_args())
+            else:
+                raise ValueError(f"Unknown command: {command}")
+            os.remove(temp_file_name)
+        except Exception as e:
+            logger.error(f"Error running command '{command}': {e}")
+            return False, str(e)
+        return True, f"Job '{self.name}' {command}d successfully."
+
+    def save(self) -> Tuple[bool, str]:
+        """
+        Save the job to the DataTailr platform.
+        If the job already exists, it will be updated.
+        """
+        return self.__run_command__("save")
+
+    def run(self) -> Tuple[bool, str]:
+        """
+        Run the job. This method should be implemented to execute the job logic.
+        It verifies the repository state and prepares the job for execution.
+        """
+        return self.__run_command__("run")