datatailr 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the registry.

Potentially problematic release: this version of datatailr might be problematic.

@@ -0,0 +1,350 @@
+ # *************************************************************************
+ #
+ # Copyright (c) 2025 - Datatailr Inc.
+ # All Rights Reserved.
+ #
+ # This file is part of Datatailr and subject to the terms and conditions
+ # defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
+ # of this file, in parts or full, via any medium is strictly prohibited.
+ # *************************************************************************
+
+ from __future__ import annotations
+
+ import contextvars
+ import json
+ import os
+ from functools import reduce
+ from typing import Dict, List, Optional, Sequence, Set, Tuple, Union
+
+ from datatailr import Image
+ from datatailr.errors import BatchJobError
+ from datatailr.logging import DatatailrLogger
+ from datatailr.scheduler.base import (
+     ACL,
+     EntryPoint,
+     Environment,
+     Job,
+     JobType,
+     Resources,
+     User,
+ )
+ from datatailr.scheduler.constants import DEFAULT_TASK_CPU, DEFAULT_TASK_MEMORY
+ from datatailr.utils import is_dt_installed
+
+ __DAG_CONTEXT__: contextvars.ContextVar = contextvars.ContextVar("dag_context")
+ logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
+
+
+ def get_current_manager():
+     return __DAG_CONTEXT__.get(None)
+
+
+ def next_batch_job_id():
+     i = 0
+     while True:
+         yield i
+         i += 1
+
+
+ class CyclicDependencyError(BatchJobError):
+     """
+     Exception raised when a cyclic dependency is detected in the batch job dependencies.
+     """
+
+     def __init__(self, message: str):
+         super().__init__(message)
+         self.message = message
+
+
+ class DuplicateJobNameError(BatchJobError):
+     """
+     Exception raised when a job with a duplicate name is added to the batch.
+     """
+
+     def __init__(self, job_name: str):
+         super().__init__(
+             f"A job with the name '{job_name}' already exists in the batch."
+         )
+         self.job_name = job_name
+
+
+ class MissingDagError(BatchJobError):
+     """
+     Exception raised when a BatchJob is created outside the context of a Batch.
+     """
+
+     def __init__(self):
+         super().__init__(
+             "A BatchJob must either be created within the context of a Batch, "
+             "or a Batch object must be provided as the dag argument."
+         )
+
+
+ class BatchJob:
+     """
+     Represents a single job within a batch.
+
+     This class can be extended to define specific configurations for each job in the batch.
+     """
+
+     def __init__(
+         self,
+         name: str,
+         entrypoint: EntryPoint,
+         resources: Optional[Resources] = None,
+         dependencies: Sequence[Union[str, BatchJob]] = (),
+         dag: Optional[Batch] = None,
+     ):
+         self.name = name
+         self.entrypoint = entrypoint
+         self.resources = resources
+         self.dependencies: set = set(dependencies)
+         # Fall back to the active Batch context at call time, not at import time.
+         dag = dag if dag is not None else get_current_manager()
+         if dag is None:
+             raise MissingDagError()
+         self.__id = dag.next_job_id
+         self.dag = dag
+         self.dag.__BATCH_JOB_NAMES__[self.name] = self.__id
+         self.dependencies = self.translate_dependencies()
+         assert all(
+             isinstance(dep, int) for dep in self.dependencies
+         ), "All dependencies must be integers representing job IDs."
+         self.dag.add_job(self)
+
+     def alias(self, name: str):
+         """
+         Rename the BatchJob instance to the given alias.
+
+         :param name: The alias name to set.
+         """
+         if name in self.dag.__BATCH_JOB_NAMES__:
+             raise DuplicateJobNameError(name)
+         # Keep the side effect out of the assert so it survives python -O.
+         removed_id = self.dag.__BATCH_JOB_NAMES__.pop(self.name)
+         assert removed_id == self.__id
+         self.dag.__BATCH_JOB_NAMES__[name] = self.__id
+         self.name = name
+         return self
+
+     def __repr__(self):
+         return (
+             f"BatchJob(name={self.name}, entrypoint={self.entrypoint}, "
+             f"resources={self.resources}) (id={self.__id})"
+         )
+
+     def to_dict(self):
+         """
+         Convert the BatchJob instance to a dictionary representation.
+         """
+         return {
+             "display_name": self.name,
+             "name": self.__id,
+             "entrypoint": str(self.entrypoint),
+             "memory": self.resources.memory if self.resources else DEFAULT_TASK_MEMORY,
+             "cpu": self.resources.cpu if self.resources else DEFAULT_TASK_CPU,
+             "depends_on": list(self.dependencies),
+         }
+
+     def to_json(self):
+         """
+         Convert the BatchJob instance to a JSON string representation.
+         """
+         return json.dumps(self.to_dict())
+
+     def translate_dependencies(self) -> Set[int]:
+         """
+         Translate the dependencies of the BatchJob instance (job names or
+         BatchJob objects) into the integer job IDs used by the batch.
+         """
+
+         def get_dependency_name(dep):
+             if isinstance(dep, str):
+                 return dep
+             elif isinstance(dep, BatchJob):
+                 return dep.name
+             else:
+                 raise TypeError(f"Unsupported dependency type: {type(dep)}")
+
+         return {
+             self.dag.__BATCH_JOB_NAMES__[get_dependency_name(dep)]
+             for dep in self.dependencies
+         }
+
+     def __add_dependency__(self, other):
+         self.dependencies.add(other.__id)
+
+     def __lshift__(
+         self, other: Sequence[BatchJob] | BatchJob
+     ) -> Sequence[BatchJob] | BatchJob:
+         # Accept any sequence of jobs, consistent with __rshift__.
+         if isinstance(other, Sequence):
+             for task in other:
+                 self.__add_dependency__(task)
+         else:
+             self.__add_dependency__(other)
+         return other
+
+     def __rshift__(
+         self, other: Sequence[BatchJob] | BatchJob
+     ) -> Sequence[BatchJob] | BatchJob:
+         if isinstance(other, Sequence):
+             for task in other:
+                 task.__add_dependency__(self)
+         else:
+             other.__add_dependency__(self)
+         return other
+
+     def __rrshift__(self, other: Sequence[BatchJob] | BatchJob) -> BatchJob:
+         self.__lshift__(other)
+         return self
+
+     def __rlshift__(self, other: Sequence[BatchJob] | BatchJob) -> BatchJob:
+         self.__rshift__(other)
+         return self
+
+     def __hash__(self):
+         return self.__id
+
+     def __lt__(self, other: BatchJob) -> bool:
+         return self.__id < other.__id
+
+     def run(self):
+         """
+         Execute the job's entrypoint.
+         """
+         if isinstance(self.entrypoint, EntryPoint):
+             self.entrypoint()
+         else:
+             raise TypeError(f"Invalid entrypoint type: {type(self.entrypoint)}")
+
+
+ class Batch(Job):
+     """
+     Represents a batch job in the scheduler.
+
+     Inherits from Job and is used to define batch jobs with specific configurations.
+     """
+
+     def __init__(
+         self,
+         environment: Optional[Environment],
+         name: str,
+         image: Image,
+         run_as: Optional[Union[str, User]],
+         resources: Resources = Resources(memory=DEFAULT_TASK_MEMORY, cpu=DEFAULT_TASK_CPU),
+         acl: Optional[ACL] = None,
+     ):
+         super().__init__(
+             environment=environment,
+             name=name,
+             image=image,
+             run_as=run_as,
+             resources=resources,
+             acl=acl,
+         )
+         self.type = JobType.BATCH
+         self.__jobs: List[BatchJob] = []
+         self._auto_run = False
+         self.__next_job_id = next_batch_job_id()
+         self.__BATCH_JOB_NAMES__: Dict[str, int] = {}
+
+     @property
+     def next_job_id(self):
+         """
+         Return the next job ID in the batch.
+         """
+         return next(self.__next_job_id)
+
+     def add_job(self, job: BatchJob):
+         """
+         Add a job to the batch.
+
+         :param job: The BatchJob instance to add.
+         """
+         if not isinstance(job, BatchJob):
+             raise TypeError(
+                 f"Only BatchJob instances can be added to a Batch. Got {type(job)} instead."
+             )
+         if self.get_job_by_name(job.name) is not None:
+             raise DuplicateJobNameError(job.name)
+         # Use the batch-level resource values as defaults for jobs
+         job.resources = job.resources or self.resources
+         self.__jobs.append(job)
+
+     def is_job_in(self, job: BatchJob) -> bool:
+         return job in self.__jobs
+
+     def get_job_by_name(self, job_name: str) -> Optional[BatchJob]:
+         return next((job for job in self.__jobs if job.name == job_name), None)
+
+     def to_dict(self):
+         """
+         Convert the Batch instance to a dictionary representation.
+         """
+         batch_dict = super().to_dict()
+         batch_dict["jobs"] = [job.to_dict() for job in self.__jobs]
+         return batch_dict
+
+     def to_json(self):
+         """
+         Convert the Batch instance to a JSON string representation.
+         """
+         return json.dumps(self.to_dict(), indent=4)
+
+     def __repr__(self):
+         jobs_repr = ", ".join(repr(job) for job in self.__jobs)
+         return (
+             f"Batch(name={self.name}, environment={self.environment}, "
+             f"run_as={self.run_as}, resources={self.resources}, "
+             f"acl={self.acl}, jobs=[{jobs_repr}])"
+         )
+
+     def set_autorun(self, auto_run):
+         self._auto_run = auto_run
+
+     def __enter__(self):
+         self._token = __DAG_CONTEXT__.set(self)
+         return self
+
+     def __exit__(self, exc_type, exc_value, exc_traceback):
+         __DAG_CONTEXT__.reset(self._token)
+         if self._auto_run:
+             self.run()
+
+     def __topological_sort__(self):
+         jobs = {
+             hash(job): set(hash(dep) for dep in job.dependencies)
+             for job in self.__jobs
+         }
+
+         for k, v in jobs.items():
+             v.discard(k)  # ignore self-dependencies
+         # The set() initializer keeps reduce() from raising on an empty batch.
+         extra_items_in_deps = reduce(set.union, jobs.values(), set()) - set(jobs.keys())
+         jobs.update({item: set() for item in extra_items_in_deps})
+         while True:
+             ordered = set(item for item, dep in jobs.items() if not dep)
+             if not ordered:
+                 break
+             yield sorted(ordered)
+             jobs = {
+                 item: (dep - ordered)
+                 for item, dep in jobs.items()
+                 if item not in ordered
+             }
+         if jobs:
+             raise CyclicDependencyError(f"A cyclic dependency exists amongst {jobs}")
+
+     def run(self) -> Tuple[bool, str]:
+         if is_dt_installed():
+             return super().run()
+         else:
+             os.environ["DATATAILR_BATCH_RUN_ID"] = "1"
+             for step in self.__topological_sort__():
+                 for job_id in step:
+                     job = self.__jobs[job_id]
+                     logger.info(
+                         f"Batch {self.name}, running job '{job.name}' in environment '{self.environment}' as '{self.run_as}'"
+                     )
+                     job.run()
+             from datatailr.scheduler.batch_decorator import __FUNCTIONS_CREATED_IN_DAG__
+
+             __FUNCTIONS_CREATED_IN_DAG__.clear()
+             return True, ""
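
For orientation, a minimal usage sketch of the DAG-building API above. It is not taken from the package: the image reference, job names, and entrypoint targets are hypothetical placeholders, and it assumes the call-time fallback to the active Batch context noted in BatchJob.__init__.

from datatailr import Image
from datatailr.scheduler.base import EntryPoint, JobType, Resources
from datatailr.scheduler.batch import Batch, BatchJob

image = Image("example/batch:latest")  # hypothetical image reference

with Batch(environment=None, name="nightly-etl", image=image, run_as="svc-etl") as dag:
    extract = BatchJob(
        name="extract",
        entrypoint=EntryPoint(JobType.BATCH, module_name="etl", function_name="extract"),
    )
    transform = BatchJob(
        name="transform",
        entrypoint=EntryPoint(JobType.BATCH, module_name="etl", function_name="transform"),
        resources=Resources(memory="512m", cpu=2),
    )
    load = BatchJob(
        name="load",
        entrypoint=EntryPoint(JobType.BATCH, module_name="etl", function_name="load"),
    )
    # __rshift__ wires dependencies Airflow-style: transform runs after
    # extract, and load runs after transform.
    extract >> transform >> load

print(dag.to_json())  # serialize the batch and its jobs via to_dict()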
@@ -0,0 +1,112 @@
+ # *************************************************************************
+ #
+ # Copyright (c) 2025 - Datatailr Inc.
+ # All Rights Reserved.
+ #
+ # This file is part of Datatailr and subject to the terms and conditions
+ # defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
+ # of this file, in parts or full, via any medium is strictly prohibited.
+ # *************************************************************************
+
+ import functools
+ import inspect
+ import os
+ from typing import Callable
+
+ from datatailr.logging import DatatailrLogger
+ from datatailr.scheduler.arguments_cache import ArgumentsCache
+ from datatailr.scheduler.base import EntryPoint, JobType, Resources
+ from datatailr.scheduler.batch import BatchJob, get_current_manager
+ from datatailr.scheduler.constants import DEFAULT_TASK_CPU, DEFAULT_TASK_MEMORY
+ from datatailr.scheduler.utils import get_available_env_args
+
+ __ARGUMENTS_CACHE__ = ArgumentsCache()
+ # Keys are the decorated functions; values are the ID of the DAG they were created in.
+ __FUNCTIONS_CREATED_IN_DAG__: dict[Callable, str] = {}
+ logger = DatatailrLogger(__name__).get_logger()
+
+
+ def batch_run_id() -> str:
+     return os.environ["DATATAILR_BATCH_RUN_ID"]
+
+
+ def dag_id(job: Callable) -> str:
+     return os.getenv(
+         "DATATAILR_BATCH_ID", __FUNCTIONS_CREATED_IN_DAG__.get(job, "unknown")
+     )
+
+
+ def batch_decorator(memory=DEFAULT_TASK_MEMORY, cpu=DEFAULT_TASK_CPU):
+     """
+     Decorator to mark a function as a batch job.
+     This decorator can be used to wrap functions that should be executed as part of batch jobs.
+     """
+
+     def decorator(func):
+         spec = inspect.getfullargspec(func)
+         signature = inspect.signature(func)
+         varargs = spec.varargs
+         varkw = spec.varkw
+         parameters = signature.parameters
+
+         @functools.wraps(func)
+         def batch_main(*args, **kwargs):
+             dag = get_current_manager()
+             if dag is None:
+                 logger.info(f'Function "{func.__name__}" is being executed.')
+                 # There are two possible scenarios:
+                 # 1. The function is called directly, not as part of a batch job. In this case, the args and kwargs should be used.
+                 # 2. The function is called as part of a batch job - it was constructed as part of a DAG and is now being executed.
+                 if func not in __FUNCTIONS_CREATED_IN_DAG__:
+                     return func(*args, **kwargs)
+                 function_arguments = [v.name for v in parameters.values()]
+                 env_args = get_available_env_args()
+                 final_args = list(args)
+                 final_kwargs = kwargs.copy()
+
+                 for name, value in env_args.items():
+                     if name in function_arguments:
+                         if len(final_args) < len(function_arguments):
+                             final_args.extend(
+                                 [None] * (len(function_arguments) - len(final_args))
+                             )
+                         final_args[function_arguments.index(name)] = value
+                 function_arguments = __ARGUMENTS_CACHE__.get_arguments(
+                     dag_id(func), func.__name__
+                 )
+                 result = func(**function_arguments)
+                 __ARGUMENTS_CACHE__.add_result(batch_run_id(), func.__name__, result)
+                 return result
+             else:
+                 __FUNCTIONS_CREATED_IN_DAG__[func] = dag.id
+                 all_args = dict(zip(spec.args, args)) | kwargs
+
+                 __ARGUMENTS_CACHE__.add_arguments(dag.id, func.__name__, all_args)
+
+                 dag.set_autorun(True)
+                 job = BatchJob(
+                     name=func.__name__,
+                     entrypoint=EntryPoint(
+                         JobType.BATCH,
+                         module_name=func.__module__,
+                         function_name=func.__name__,
+                     ),
+                     resources=Resources(memory=memory, cpu=cpu),
+                     dependencies=[
+                         value.name
+                         for value in all_args.values()
+                         if isinstance(value, BatchJob)
+                     ],
+                     dag=dag,
+                 )
+                 return job
+
+         module = inspect.getmodule(func)
+         if hasattr(module, "__batch_main__"):
+             if func.__name__ in getattr(module, "__batch_main__"):
+                 raise ValueError(f"Duplicate batch main function {func.__name__}")
+             module.__batch_main__[func.__name__] = batch_main  # type: ignore
+         else:
+             setattr(module, "__batch_main__", {func.__name__: batch_main})
+
+         return batch_main
+
+     return decorator
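
The decorator builds the same structure implicitly: calling a decorated function inside a Batch context creates a BatchJob rather than running the function, BatchJob-typed arguments are recorded as dependencies, and set_autorun(True) makes the DAG run when the context exits. A hedged sketch, with the same hypothetical Batch arguments as the previous example:

from datatailr import Image
from datatailr.scheduler.batch import Batch
from datatailr.scheduler.batch_decorator import batch_decorator

@batch_decorator(memory="256m", cpu=1)
def extract():
    return [1, 2, 3]

@batch_decorator()
def transform(rows):
    return [r * 2 for r in rows]

image = Image("example/batch:latest")  # hypothetical image reference

with Batch(environment=None, name="decorated-dag", image=image, run_as="svc-etl"):
    rows = extract()  # inside the context this returns a BatchJob, not data
    transform(rows)   # a BatchJob argument becomes a dependency on 'extract'
# set_autorun(True) was called by the decorator, so the batch runs on exit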
@@ -0,0 +1,20 @@
+ # *************************************************************************
+ #
+ # Copyright (c) 2025 - Datatailr Inc.
+ # All Rights Reserved.
+ #
+ # This file is part of Datatailr and subject to the terms and conditions
+ # defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
+ # of this file, in parts or full, via any medium is strictly prohibited.
+ # *************************************************************************
+
+ DEFAULT_TASK_MEMORY = "100m"
+ DEFAULT_TASK_CPU = 1
+
+ BATCH_JOB_ARGUMENTS = (
+     "rundate",
+     "scheduled_time",
+     "started_at",
+     "batch_name",
+     "job_name",
+ )
@@ -0,0 +1,28 @@
+ # *************************************************************************
+ #
+ # Copyright (c) 2025 - Datatailr Inc.
+ # All Rights Reserved.
+ #
+ # This file is part of Datatailr and subject to the terms and conditions
+ # defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
+ # of this file, in parts or full, via any medium is strictly prohibited.
+ # *************************************************************************
+
+ import os
+
+ from datatailr.scheduler.constants import BATCH_JOB_ARGUMENTS
+
+
+ def get_available_env_args():
+     """
+     Get the available environment variables for batch job arguments.
+     Environment variables of the form DATATAILR_BATCH_ARG_<NAME> are matched
+     against the keys defined in BATCH_JOB_ARGUMENTS.
+
+     Returns:
+         dict: A dictionary of available environment variables for batch jobs.
+     """
+     available_args = {}
+     for key, value in os.environ.items():
+         # Require the prefix explicitly; a bare replace() would also match
+         # unprefixed variables such as RUNDATE.
+         if not key.startswith("DATATAILR_BATCH_ARG_"):
+             continue
+         arg_key = key.replace("DATATAILR_BATCH_ARG_", "", 1).lower()
+         if arg_key in BATCH_JOB_ARGUMENTS:
+             available_args[arg_key] = value
+     return available_args
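
A small worked example of the prefix convention (the values are illustrative; in a real run the scheduler would set these variables):

import os

from datatailr.scheduler.utils import get_available_env_args

os.environ["DATATAILR_BATCH_ARG_RUNDATE"] = "2025-01-31"
os.environ["DATATAILR_BATCH_ARG_JOB_NAME"] = "extract"
os.environ["UNRELATED_VAR"] = "ignored"  # no prefix, so it is skipped

print(get_available_env_args())
# {'rundate': '2025-01-31', 'job_name': 'extract'}  (ordering may vary)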