datatailr 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datatailr might be problematic. Click here for more details.
- datatailr/__init__.py +63 -0
- datatailr/acl.py +80 -0
- datatailr/blob.py +103 -0
- datatailr/build/__init__.py +11 -0
- datatailr/build/image.py +87 -0
- datatailr/dt_json.py +42 -0
- datatailr/errors.py +10 -0
- datatailr/group.py +136 -0
- datatailr/logging.py +93 -0
- datatailr/sbin/run_job.py +63 -0
- datatailr/scheduler/__init__.py +38 -0
- datatailr/scheduler/arguments_cache.py +126 -0
- datatailr/scheduler/base.py +238 -0
- datatailr/scheduler/batch.py +347 -0
- datatailr/scheduler/batch_decorator.py +112 -0
- datatailr/scheduler/constants.py +20 -0
- datatailr/scheduler/utils.py +28 -0
- datatailr/user.py +201 -0
- datatailr/utils.py +35 -0
- datatailr/version.py +14 -0
- datatailr/wrapper.py +204 -0
- datatailr-0.1.2.dist-info/METADATA +24 -0
- datatailr-0.1.2.dist-info/RECORD +29 -0
- {datatailr-0.1.0.dist-info → datatailr-0.1.2.dist-info}/WHEEL +1 -1
- datatailr-0.1.2.dist-info/entry_points.txt +2 -0
- {datatailr-0.1.0.dist-info → datatailr-0.1.2.dist-info}/licenses/LICENSE +0 -1
- datatailr-0.1.2.dist-info/top_level.txt +2 -0
- test_module/__init__.py +17 -0
- test_module/test_submodule.py +38 -0
- datatailr-0.1.0.dist-info/METADATA +0 -17
- datatailr-0.1.0.dist-info/RECORD +0 -6
- datatailr-0.1.0.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# *************************************************************************
|
|
2
|
+
#
|
|
3
|
+
# Copyright (c) 2025 - Datatailr Inc.
|
|
4
|
+
# All Rights Reserved.
|
|
5
|
+
#
|
|
6
|
+
# This file is part of Datatailr and subject to the terms and conditions
|
|
7
|
+
# defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
|
|
8
|
+
# of this file, in parts or full, via any medium is strictly prohibited.
|
|
9
|
+
# *************************************************************************
|
|
10
|
+
|
|
11
|
+
from datatailr.errors import BatchJobError, DatatailrError
|
|
12
|
+
from datatailr.scheduler.base import (
|
|
13
|
+
ACL,
|
|
14
|
+
EntryPoint,
|
|
15
|
+
Environment,
|
|
16
|
+
Job,
|
|
17
|
+
JobType,
|
|
18
|
+
Resources,
|
|
19
|
+
User,
|
|
20
|
+
)
|
|
21
|
+
from datatailr.scheduler.batch import Batch, BatchJob, DuplicateJobNameError
|
|
22
|
+
from datatailr.scheduler.batch_decorator import batch_decorator as batch
|
|
23
|
+
|
|
24
|
+
# Public API of the package (PEP 8 __all__ convention); order is preserved
# as-is since it is part of the observable `import *` behavior.
__all__ = [
    "Job",
    "JobType",
    "Environment",
    "User",
    "Resources",
    "ACL",
    "EntryPoint",
    "Batch",
    "BatchJob",
    "batch",
    "DatatailrError",
    "BatchJobError",
    "DuplicateJobNameError",
]
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
##########################################################################
|
|
2
|
+
#
|
|
3
|
+
# Copyright (c) 2025 - Datatailr Inc.
|
|
4
|
+
# All Rights Reserved.
|
|
5
|
+
#
|
|
6
|
+
# This file is part of Datatailr and subject to the terms and conditions
|
|
7
|
+
# defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
|
|
8
|
+
# of this file, in parts or full, via any medium is strictly prohibited.
|
|
9
|
+
##########################################################################
|
|
10
|
+
|
|
11
|
+
"""
|
|
12
|
+
Module for caching arguments passed to batch jobs.
|
|
13
|
+
|
|
14
|
+
This module provides two backends for caching:
|
|
15
|
+
1. In-memory cache for local runs.
|
|
16
|
+
2. Persistent cache using the dt__Blob module for remote runs.
|
|
17
|
+
|
|
18
|
+
The cache stores arguments as a dictionary of dictionaries, where the outer dictionary's keys are job names
|
|
19
|
+
and the inner dictionaries contain the arguments.
|
|
20
|
+
|
|
21
|
+
This module is for internal use of the datatailr package.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from collections import defaultdict
|
|
25
|
+
from typing import Any, Dict
|
|
26
|
+
|
|
27
|
+
from datatailr import is_dt_installed
|
|
28
|
+
from datatailr.scheduler import BatchJob
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ArgumentsCache:
    """
    Cache of batch-job arguments and results, keyed by batch run id and job name.

    Two backends are supported:
      * an in-memory nested dict for local runs, and
      * a persistent blob-backed store for remote runs (stubbed out below).

    This class is for internal use of the datatailr package.
    """

    def __init__(self, use_persistent_cache: Optional[bool] = None):
        """
        Initialize the ArgumentsCache.

        :param use_persistent_cache: If True, use the persistent cache backend.
            If False, use the in-memory cache. If None (the default), decide at
            construction time based on whether datatailr is installed.
        """
        # BUG FIX: the default used to be ``is_dt_installed()`` written as a
        # default argument, i.e. evaluated exactly once at import time.
        # Resolving it here keeps the decision current per instance.
        self.use_persistent_cache = (
            is_dt_installed() if use_persistent_cache is None else use_persistent_cache
        )
        # Layout: batch_run_id -> job name -> {"args": {...}, "result": ...}
        self.in_memory_cache: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(
            lambda: defaultdict(dict)
        )

    def add_arguments(self, batch_run_id: str, job: str, arguments: Dict[str, Any]):
        """
        Add arguments to the cache for a specific job and batch run.

        :param batch_run_id: Identifier for the batch run.
        :param job: Name of the job.
        :param arguments: Dictionary of arguments to store.
        """
        if self.use_persistent_cache and isinstance(job, str):
            self._add_to_persistent_cache(batch_run_id, job, arguments)
        else:
            self.in_memory_cache[batch_run_id][job]["args"] = arguments

    def get_arguments(self, batch_run_id: str, job: str) -> Dict[str, Any]:
        """
        Retrieve arguments from the cache for a specific job and batch run.

        BatchJob values are replaced by their job name in the returned dict.

        :param batch_run_id: Identifier for the batch run.
        :param job: Name of the job.
        :return: Dictionary of arguments.
        """
        if self.use_persistent_cache and isinstance(job, str):
            return self._get_from_persistent_cache(batch_run_id, job)
        arguments = {}
        for key, value in (
            self.in_memory_cache.get(batch_run_id, {})
            .get(job, {})
            .get("args", {})
            .items()
        ):
            arguments[key] = value.name if isinstance(value, BatchJob) else value
        return arguments

    def add_result(self, batch_run_id: str, job: str, result: Any):
        """
        Add the result of a batch job to the cache.

        :param batch_run_id: Identifier for the batch run.
        :param job: Name of the job.
        :param result: Result of the batch job.
        """
        if self.use_persistent_cache and isinstance(job, str):
            self._add_to_persistent_cache(batch_run_id, job, {"result": result})
        else:
            self.in_memory_cache[batch_run_id][job]["result"] = result

    def get_result(self, batch_run_id: str, job: str) -> Any:
        """
        Retrieve the result of a batch job from the cache.

        :param batch_run_id: Identifier for the batch run.
        :param job: Name of the job.
        :return: Result of the batch job, or None if not present.
        """
        if self.use_persistent_cache and isinstance(job, str):
            return self._get_from_persistent_cache(batch_run_id, job).get("result")
        return self.in_memory_cache[batch_run_id][job].get("result")

    def _add_to_persistent_cache(
        self, batch_run_id: str, job_name: str, arguments: Dict[str, Any]
    ):
        """
        Add arguments to the persistent cache.

        :param batch_run_id: Identifier for the batch run.
        :param job_name: Name of the job.
        :param arguments: Dictionary of arguments to store.
        """
        # TODO: implement via the dt__Blob backend (see module docstring).
        pass

    def _get_from_persistent_cache(
        self, batch_run_id: str, job_name: str
    ) -> Dict[str, Any]:
        """
        Retrieve arguments from the persistent cache.

        :param batch_run_id: Identifier for the batch run.
        :param job_name: Name of the job.
        :return: Dictionary of arguments.
        """
        # TODO: implement via the dt__Blob backend (see module docstring).
        return {}
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
# *************************************************************************
|
|
2
|
+
#
|
|
3
|
+
# Copyright (c) 2025 - Datatailr Inc.
|
|
4
|
+
# All Rights Reserved.
|
|
5
|
+
#
|
|
6
|
+
# This file is part of Datatailr and subject to the terms and conditions
|
|
7
|
+
# defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
|
|
8
|
+
# of this file, in parts or full, via any medium is strictly prohibited.
|
|
9
|
+
# *************************************************************************
|
|
10
|
+
|
|
11
|
+
import importlib
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import subprocess
|
|
15
|
+
import tempfile
|
|
16
|
+
import uuid
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from enum import Enum
|
|
19
|
+
from typing import Callable, Optional, Tuple, Union
|
|
20
|
+
|
|
21
|
+
from datatailr import ACL, Environment, User, dt__Job, is_dt_installed
|
|
22
|
+
from datatailr.build.image import Image
|
|
23
|
+
from datatailr.logging import DatatailrLogger
|
|
24
|
+
|
|
25
|
+
logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class JobType(Enum):
    """Kinds of DataTailr jobs the scheduler knows about."""

    BATCH = "batch"
    SERVICE = "service"
    APP = "app"
    UNKNOWN = "unknown"

    def __str__(self) -> str:
        # The textual form is just the enum's value (used in serialization).
        return self.value

    def __repr__(self) -> str:
        return "JobType.{}('{}')".format(self.name, self.value)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class Resources:
    """
    Resources requested for a job.

    memory is a size string (e.g. "100m" -- presumably a docker-style limit,
    TODO confirm); cpu is a core count.
    """

    memory: str = "100m"
    cpu: int = 1
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class EntryPoint:
|
|
56
|
+
"""
|
|
57
|
+
Represents an entry point for a DataTailr job.
|
|
58
|
+
This can be a function or a callable object.
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
def __init__(
|
|
62
|
+
self,
|
|
63
|
+
type: JobType,
|
|
64
|
+
func: Optional[Callable] = None,
|
|
65
|
+
module_name: Optional[str] = None,
|
|
66
|
+
function_name: Optional[str] = None,
|
|
67
|
+
):
|
|
68
|
+
if func is None and (module_name is None or function_name is None):
|
|
69
|
+
raise ValueError(
|
|
70
|
+
"Either a function or module and function names must be provided."
|
|
71
|
+
)
|
|
72
|
+
self.func = func
|
|
73
|
+
self.module_name = func.__module__ if func else module_name
|
|
74
|
+
self.function_name = func.__name__ if func else function_name
|
|
75
|
+
self.type = type
|
|
76
|
+
|
|
77
|
+
def __call__(self, *args, **kwargs):
|
|
78
|
+
if self.type == JobType.BATCH:
|
|
79
|
+
if self.module_name and self.function_name:
|
|
80
|
+
module = importlib.import_module(self.module_name)
|
|
81
|
+
func = getattr(module, self.function_name)
|
|
82
|
+
elif self.func is not None:
|
|
83
|
+
func = self.func
|
|
84
|
+
return func(*args, **kwargs)
|
|
85
|
+
|
|
86
|
+
elif self.type == JobType.SERVICE:
|
|
87
|
+
raise NotImplementedError("Service jobs are not yet implemented.")
|
|
88
|
+
|
|
89
|
+
elif self.type == JobType.APP:
|
|
90
|
+
raise NotImplementedError("App jobs are not yet implemented.")
|
|
91
|
+
|
|
92
|
+
def __repr__(self):
|
|
93
|
+
return f"EntryPoint({self.function_name} from {self.module_name}, type={self.type})"
|
|
94
|
+
|
|
95
|
+
def __str__(self):
|
|
96
|
+
return f"{self.module_name}.{self.function_name}"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class Job:
    """
    Base class for DataTailr jobs.

    Holds the metadata common to every job -- environment, owner, ACL, image
    and resources -- and knows how to serialize itself and hand the spec to
    the installed DataTailr runtime. ``type`` and ``entrypoint`` are
    placeholders overridden by derived classes.
    """

    def __init__(
        self,
        environment: Optional[Environment],
        name: str,
        image: Image,
        run_as: Optional[Union[str, User]],
        resources: Optional[Resources] = None,
        acl: Optional[ACL] = None,
    ):
        """
        :param environment: Target environment; strings are coerced via
            ``Environment(value.lower())`` and None falls back to DEV.
        :param name: Human-readable job name.
        :param image: Image the job runs in.
        :param run_as: User (or user name) the job executes as; None means
            the currently signed-in user.
        :param resources: Requested resources; None means ``Resources()``.
        :param acl: Access control list; defaults to one owned by *run_as*.
        """
        if run_as is None:
            run_as = User.signed_user()
        if environment is None:
            environment = Environment.DEV
        elif isinstance(environment, str):
            # Lower-casing makes e.g. "DEV" and "dev" equivalent.
            environment = Environment(environment.lower())
        self.acl = acl or ACL(user=run_as)
        self.environment = environment
        self.name = name
        self.run_as = run_as
        # BUG FIX: ``resources=Resources(...)`` as a default argument was one
        # shared mutable instance across every Job; create one per instance.
        self.resources = resources if resources is not None else Resources()
        self.image = image

        # Placeholders, to be set in derived classes
        self.type: JobType = JobType.UNKNOWN
        self.entrypoint = None
        self.__id = str(uuid.uuid4())

    @property
    def id(self) -> str:
        """
        Unique identifier for the job (a UUID4 string fixed at construction).
        """
        return self.__id

    def __repr__(self):
        return (
            f"Job(name={self.name}, environment={self.environment}, "
            f"run_as={self.run_as}, resources={self.resources}, "
            f"acl={self.acl}, type={self.type}, "
            f"entrypoint={self.entrypoint}, image={self.image})"
        )

    def to_dict(self):
        """
        Convert the Job instance to a dictionary representation.
        """
        job_dict = {
            "environment": str(self.environment),
            "image": self.image.to_dict(),
            "type": str(self.type) if self.type else None,
            "name": self.name,
            "run_as": self.run_as.name
            if isinstance(self.run_as, User)
            else self.run_as,
            "acl": self.acl.to_dict(),
        }
        if self.type != JobType.BATCH:
            job_dict["entrypoint"] = str(self.entrypoint) if self.entrypoint else None
            # BUG FIX: trailing commas previously made "image" and "memory"
            # accidental 1-tuples; store the plain (JSON-serializable) values.
            job_dict["image"] = self.image.to_dict()
            job_dict["memory"] = self.resources.memory
            job_dict["cpu"] = self.resources.cpu
        return job_dict

    def to_json(self):
        """
        Convert the Job instance to a JSON string representation.
        """
        return json.dumps(self.to_dict())

    def verify_repo_is_ready(self) -> Tuple[bool, str]:
        """
        Check that the working tree is committed and in sync with origin/HEAD.

        :return: (ok, message) -- message is empty when ok.
        """
        # argv lists with shell=False avoid any shell-injection surface.
        is_committed = (
            subprocess.run(
                ["git", "diff", "--exit-code"], capture_output=True
            ).returncode
            == 0
        )
        if not is_committed:
            return (
                False,
                "Uncommitted changes detected. Please commit your changes before running the job.",
            )

        local_commit = subprocess.run(
            ["git", "rev-parse", "HEAD"], capture_output=True, text=True
        ).stdout.strip()
        remote_commit = (
            subprocess.run(
                ["git", "ls-remote", "origin", "HEAD"],
                capture_output=True,
                text=True,
            )
            .stdout.strip()
            .split("\t")[0]
        )

        if local_commit != remote_commit:
            return (
                False,
                "Local commit does not match remote HEAD. Please pull the latest changes before running the job.",
            )

        # NOTE: a previous revision also queried the current branch name here
        # but never used it; the dead call was removed.
        return True, ""

    def run(self) -> Tuple[bool, str]:
        """
        Run the job. This method should be implemented to execute the job logic.
        It verifies the repository state and prepares the job for execution.
        Returns a tuple of (success: bool, message: str).
        If the repository is not ready, it returns False with an error message.
        If the job runs successfully, it returns True with an empty message.

        :raises NotImplementedError: if DataTailr is not installed.
        """
        if not is_dt_installed():
            raise NotImplementedError(
                "DataTailr is not installed. Please install DataTailr to run this job."
            )

        ok, message = self.verify_repo_is_ready()
        if not ok:
            return False, message
        logger.info(
            f"Running job '{self.name}' in environment '{self.environment}' as '{self.run_as}'"
        )

        # Write the job spec first, then hand it to the runtime only after the
        # file is closed so the JSON is fully flushed to disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as temp_file:
            temp_file.write(self.to_json().encode())
        try:
            dt__Job().run(f"file://{temp_file.name}")
        finally:
            # delete=False above means we own the cleanup.
            os.remove(temp_file.name)

        return True, ""
|