datatailr 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datatailr might be problematic. Click here for more details.

@@ -0,0 +1,38 @@
1
+ # *************************************************************************
2
+ #
3
+ # Copyright (c) 2025 - Datatailr Inc.
4
+ # All Rights Reserved.
5
+ #
6
+ # This file is part of Datatailr and subject to the terms and conditions
7
+ # defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
8
+ # of this file, in parts or full, via any medium is strictly prohibited.
9
+ # *************************************************************************
10
+
11
+ from datatailr.errors import BatchJobError, DatatailrError
12
+ from datatailr.scheduler.base import (
13
+ ACL,
14
+ EntryPoint,
15
+ Environment,
16
+ Job,
17
+ JobType,
18
+ Resources,
19
+ User,
20
+ )
21
+ from datatailr.scheduler.batch import Batch, BatchJob, DuplicateJobNameError
22
+ from datatailr.scheduler.batch_decorator import batch_decorator as batch
23
+
24
# Public API of the datatailr package: names re-exported from
# datatailr.errors and datatailr.scheduler for `from datatailr import ...`.
__all__ = [
    "Job",
    "JobType",
    "Environment",
    "User",
    "Resources",
    "ACL",
    "EntryPoint",
    "Batch",
    "BatchJob",
    "batch",
    "DatatailrError",
    "BatchJobError",
    "DuplicateJobNameError",
]
@@ -0,0 +1,126 @@
1
+ ##########################################################################
2
+ #
3
+ # Copyright (c) 2025 - Datatailr Inc.
4
+ # All Rights Reserved.
5
+ #
6
+ # This file is part of Datatailr and subject to the terms and conditions
7
+ # defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
8
+ # of this file, in parts or full, via any medium is strictly prohibited.
9
+ ##########################################################################
10
+
11
+ """
12
+ Module for caching arguments passed to batch jobs.
13
+
14
+ This module provides two backends for caching:
15
+ 1. In-memory cache for local runs.
16
+ 2. Persistent cache using the dt__Blob module for remote runs.
17
+
18
+ The cache stores arguments as a dictionary of dictionaries, where the outer dictionary's keys are job names
19
+ and the inner dictionaries contain the arguments.
20
+
21
+ This module is for internal use of the datatailr package.
22
+ """
23
+
24
+ from collections import defaultdict
25
from typing import Any, Dict, Optional
26
+
27
+ from datatailr import is_dt_installed
28
+ from datatailr.scheduler import BatchJob
29
+
30
+
31
class ArgumentsCache:
    """
    Cache for batch-job arguments and results, keyed by batch run and job.

    Two backends are supported:
    1. An in-memory nested ``defaultdict`` for local runs.
    2. A persistent cache for remote runs (the private ``_add_to_persistent_cache``
       / ``_get_from_persistent_cache`` hooks are stubs here; presumably backed by
       the dt__Blob module elsewhere — TODO confirm).
    """

    def __init__(self, use_persistent_cache: Optional[bool] = None):
        """
        Initialize the ArgumentsCache.

        :param use_persistent_cache: If True, use the persistent cache backend;
            if False, use the in-memory cache. If None (the default), decide at
            construction time via ``is_dt_installed()``.

        .. note:: The original signature used ``use_persistent_cache: bool =
            is_dt_installed()``, which evaluated ``is_dt_installed()`` once at
            class-definition (import) time and froze that answer for every
            instance. Using a ``None`` sentinel defers the check to call time.
        """
        if use_persistent_cache is None:
            use_persistent_cache = is_dt_installed()
        self.use_persistent_cache = use_persistent_cache
        # Layout: batch_run_id -> job -> {"args": {...}, "result": ...}
        self.in_memory_cache: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(
            lambda: defaultdict(dict)
        )

    def add_arguments(self, batch_run_id: str, job: str, arguments: Dict[str, Any]):
        """
        Add arguments to the cache for a specific job and batch run.

        :param batch_run_id: Identifier for the batch run.
        :param job: Name of the job.
        :param arguments: Dictionary of arguments to store.
        """
        # The isinstance guard routes non-string job handles (e.g. BatchJob
        # objects used in local runs) to the in-memory backend.
        if self.use_persistent_cache and isinstance(job, str):
            self._add_to_persistent_cache(batch_run_id, job, arguments)
        else:
            self.in_memory_cache[batch_run_id][job]["args"] = arguments

    def get_arguments(self, batch_run_id: str, job: str) -> Dict[str, Any]:
        """
        Retrieve arguments from the cache for a specific job and batch run.

        :param batch_run_id: Identifier for the batch run.
        :param job: Name of the job.
        :return: Dictionary of arguments; BatchJob values are replaced by their
            job names. Missing entries yield an empty dict.
        """
        if self.use_persistent_cache and isinstance(job, str):
            return self._get_from_persistent_cache(batch_run_id, job)
        arguments = {}
        # .get() chain avoids materializing empty defaultdict entries on reads.
        for key, value in (
            self.in_memory_cache.get(batch_run_id, {})
            .get(job, {})
            .get("args", {})
            .items()
        ):
            if isinstance(value, BatchJob):
                arguments[key] = value.name
            else:
                arguments[key] = value
        return arguments

    def add_result(self, batch_run_id: str, job: str, result: Any):
        """
        Add the result of a batch job to the cache.

        :param batch_run_id: Identifier for the batch run.
        :param job: Name of the job.
        :param result: Result of the batch job.
        """
        if self.use_persistent_cache and isinstance(job, str):
            self._add_to_persistent_cache(batch_run_id, job, {"result": result})
        else:
            self.in_memory_cache[batch_run_id][job]["result"] = result

    def get_result(self, batch_run_id: str, job: str) -> Any:
        """
        Retrieve the result of a batch job from the cache.

        :param batch_run_id: Identifier for the batch run.
        :param job: Name of the job.
        :return: Result of the batch job, or None if absent (in-memory backend).
        """
        if self.use_persistent_cache and isinstance(job, str):
            return self._get_from_persistent_cache(batch_run_id, job).get("result")
        return self.in_memory_cache[batch_run_id][job].get("result")

    def _add_to_persistent_cache(
        self, batch_run_id: str, job_name: str, arguments: Dict[str, Any]
    ):
        """
        Add arguments to the persistent cache.

        :param batch_run_id: Identifier for the batch run.
        :param job_name: Name of the job.
        :param arguments: Dictionary of arguments to store.

        NOTE(review): stub — no-op in this release.
        """
        pass

    def _get_from_persistent_cache(
        self, batch_run_id: str, job_name: str
    ) -> Dict[str, Any]:
        """
        Retrieve arguments from the persistent cache.

        :param batch_run_id: Identifier for the batch run.
        :param job_name: Name of the job.
        :return: Dictionary of arguments.

        NOTE(review): stub — always returns an empty dict in this release.
        """
        return {}
@@ -0,0 +1,238 @@
1
+ # *************************************************************************
2
+ #
3
+ # Copyright (c) 2025 - Datatailr Inc.
4
+ # All Rights Reserved.
5
+ #
6
+ # This file is part of Datatailr and subject to the terms and conditions
7
+ # defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
8
+ # of this file, in parts or full, via any medium is strictly prohibited.
9
+ # *************************************************************************
10
+
11
+ import importlib
12
+ import json
13
+ import os
14
+ import subprocess
15
+ import tempfile
16
+ import uuid
17
+ from dataclasses import dataclass
18
+ from enum import Enum
19
+ from typing import Callable, Optional, Tuple, Union
20
+
21
+ from datatailr import ACL, Environment, User, dt__Job, is_dt_installed
22
+ from datatailr.build.image import Image
23
+ from datatailr.logging import DatatailrLogger
24
+
25
+ logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
26
+
27
+
28
class JobType(Enum):
    """
    Closed set of DataTailr job kinds.

    The member value is the plain string form used in serialized job dicts.
    """

    BATCH = "batch"
    SERVICE = "service"
    APP = "app"
    UNKNOWN = "unknown"

    def __str__(self):
        # Serialize as the bare wire value, e.g. "batch".
        return self.value

    def __repr__(self):
        return "JobType.{0}('{1}')".format(self.name, self.value)
43
+
44
+
45
@dataclass
class Resources:
    """
    Represents the resources required for a job.

    Plain value object; instances are mutable (regular dataclass).
    """

    # Memory limit as a size string; presumably a container-style spec like
    # "100m" — TODO confirm the accepted units against the scheduler backend.
    memory: str = "100m"
    # Number of CPUs allocated to the job.
    cpu: int = 1
53
+
54
+
55
class EntryPoint:
    """
    Represents an entry point for a DataTailr job.

    Wraps either a callable or a (module name, function name) pair; calling the
    instance resolves and invokes the underlying function for BATCH jobs.
    """

    def __init__(
        self,
        type: JobType,
        func: Optional[Callable] = None,
        module_name: Optional[str] = None,
        function_name: Optional[str] = None,
    ):
        """
        :param type: Kind of job this entry point belongs to.
        :param func: The callable itself, when available in-process.
        :param module_name: Importable module path, used when ``func`` is absent.
        :param function_name: Attribute name of the function in ``module_name``.
        :raises ValueError: If neither ``func`` nor both names are provided.
        """
        if func is None and (module_name is None or function_name is None):
            raise ValueError(
                "Either a function or module and function names must be provided."
            )
        self.func = func
        # When a callable is given, derive the name pair from it so the entry
        # point can still be resolved by name in a remote process.
        self.module_name = func.__module__ if func else module_name
        self.function_name = func.__name__ if func else function_name
        self.type = type

    def __call__(self, *args, **kwargs):
        """
        Invoke the entry point.

        :raises NotImplementedError: For SERVICE and APP job types.
        """
        if self.type == JobType.BATCH:
            # Prefer the stored callable when we have one: re-importing by name
            # fails for lambdas, closures and functions defined in __main__.
            # (The original always re-imported whenever names were available,
            # which — since names are derived from ``func`` — was always.)
            if self.func is not None:
                func = self.func
            else:
                module = importlib.import_module(self.module_name)
                func = getattr(module, self.function_name)
            return func(*args, **kwargs)

        elif self.type == JobType.SERVICE:
            raise NotImplementedError("Service jobs are not yet implemented.")

        elif self.type == JobType.APP:
            raise NotImplementedError("App jobs are not yet implemented.")

    def __repr__(self):
        return f"EntryPoint({self.function_name} from {self.module_name}, type={self.type})"

    def __str__(self):
        return f"{self.module_name}.{self.function_name}"
97
+
98
+
99
class Job:
    """
    Base class for DataTailr jobs.

    Holds the execution environment, identity, image and resource requirements;
    derived classes set ``type`` and ``entrypoint``.
    """

    def __init__(
        self,
        environment: Optional[Environment],
        name: str,
        image: Image,
        run_as: Optional[Union[str, User]],
        resources: Optional[Resources] = None,
        acl: Optional[ACL] = None,
    ):
        """
        :param environment: Target environment; a string is coerced via
            ``Environment(value.lower())``; None defaults to ``Environment.DEV``.
        :param name: Human-readable job name.
        :param image: Build image the job runs in.
        :param run_as: User (or user name) to run as; None means the signed-in user.
        :param resources: Resource requirements; None means the default
            ``Resources(memory="100m", cpu=1)``.
        :param acl: Access control list; None derives one from ``run_as``.
        """
        if resources is None:
            # A fresh instance per job. The original used a shared default
            # instance (`resources=Resources(...)` in the signature) — a mutable
            # default argument, so mutating one job's resources leaked into all.
            resources = Resources(memory="100m", cpu=1)
        if run_as is None:
            run_as = User.signed_user()
        if environment is None:
            environment = Environment.DEV
        elif isinstance(environment, str):
            environment = Environment(environment.lower())
        # (A second, unreachable `isinstance(environment, str)` conversion from
        # the original was removed: the elif above already handles strings.)
        self.acl = acl or ACL(user=run_as)
        self.environment = environment
        self.name = name
        self.run_as = run_as
        self.resources = resources
        self.image = image

        # Placeholders, to be set in derived classes
        self.type: JobType = JobType.UNKNOWN
        self.entrypoint = None
        self.__id = str(uuid.uuid4())

    @property
    def id(self) -> str:
        """
        Unique identifier for the job (random UUID4, fixed at construction).
        """
        return self.__id

    def __repr__(self):
        return (
            f"Job(name={self.name}, environment={self.environment}, "
            f"run_as={self.run_as}, resources={self.resources}, "
            f"acl={self.acl}, type={self.type}, "
            f"entrypoint={self.entrypoint}, image={self.image})"
        )

    def to_dict(self):
        """
        Convert the Job instance to a dictionary representation.

        Non-BATCH jobs additionally carry entrypoint, memory and cpu fields.
        """
        job_dict = {
            "environment": str(self.environment),
            "image": self.image.to_dict(),
            "type": str(self.type) if self.type else None,
            "name": self.name,
            "run_as": self.run_as.name
            if isinstance(self.run_as, User)
            else self.run_as,
            "acl": self.acl.to_dict(),
        }
        if self.type != JobType.BATCH:
            job_dict["entrypoint"] = str(self.entrypoint) if self.entrypoint else None
            # Fixed: the original assigned 1-tuples here (stray trailing commas,
            # e.g. `(self.resources.memory,)`) and replaced the serializable
            # image dict with the raw Image object, which broke to_json().
            job_dict["memory"] = self.resources.memory
            job_dict["cpu"] = self.resources.cpu
        return job_dict

    def to_json(self):
        """
        Convert the Job instance to a JSON string representation.
        """
        return json.dumps(self.to_dict())

    def verify_repo_is_ready(self) -> Tuple[bool, str]:
        """
        Check that the working tree is committed and in sync with origin/HEAD.

        :return: (ok, message); message is empty when ok is True.
        """
        # List-form argv with shell=False: these are fixed commands, no shell
        # parsing needed (the original used shell=True on string commands).
        is_committed = (
            subprocess.run(
                ["git", "diff", "--exit-code"], capture_output=True
            ).returncode
            == 0
        )
        if not is_committed:
            return (
                False,
                "Uncommitted changes detected. Please commit your changes before running the job.",
            )

        local_commit = subprocess.run(
            ["git", "rev-parse", "HEAD"], capture_output=True, text=True
        ).stdout.strip()
        remote_commit = (
            subprocess.run(
                ["git", "ls-remote", "origin", "HEAD"],
                capture_output=True,
                text=True,
            )
            .stdout.strip()
            .split("\t")[0]
        )

        if local_commit != remote_commit:
            return (
                False,
                "Local commit does not match remote HEAD. Please pull the latest changes before running the job.",
            )

        # (Removed an unused `branch` lookup present in the original.)
        return True, ""

    def run(self) -> Tuple[bool, str]:
        """
        Run the job. This method should be implemented to execute the job logic.
        It verifies the repository state and prepares the job for execution.
        Returns a tuple of (success: bool, message: str).
        If the repository is not ready, it returns False with an error message.
        If the job runs successfully, it returns True with an empty message.

        :raises NotImplementedError: If DataTailr is not installed.
        """
        # Guard clause keeps the happy path flat.
        if not is_dt_installed():
            raise NotImplementedError(
                "DataTailr is not installed. Please install DataTailr to run this job."
            )

        check_result = self.verify_repo_is_ready()
        if not check_result[0]:
            return False, check_result[1]
        logger.info(
            f"Running job '{self.name}' in environment '{self.environment}' as '{self.run_as}'"
        )

        # delete=False so the file survives the `with`; the scheduler reads it
        # by path. try/finally guarantees cleanup even if run() raises.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as temp_file:
            temp_file.write(self.to_json().encode())
        try:
            dt__Job().run(f"file://{temp_file.name}")
        finally:
            os.remove(temp_file.name)

        return True, ""