datatailr 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Note: this version of datatailr has been flagged as potentially problematic.

@@ -14,9 +14,11 @@ import contextvars
 import json
 import os
 from functools import reduce
-from typing import Dict, List, Optional, Sequence, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
+import uuid
 
 from datatailr import Image
+from datatailr.dt_json import encode_json
 from datatailr.errors import BatchJobError
 from datatailr.logging import DatatailrLogger
 from datatailr.scheduler.base import (
@@ -29,9 +31,12 @@ from datatailr.scheduler.base import (
     User,
 )
 from datatailr.scheduler.constants import DEFAULT_TASK_CPU, DEFAULT_TASK_MEMORY
+from datatailr.scheduler.arguments_cache import ArgumentsCache
+from datatailr.scheduler.schedule import Schedule
 from datatailr.utils import is_dt_installed
 
 __DAG_CONTEXT__: contextvars.ContextVar = contextvars.ContextVar("dag_context")
+__ARGUMENTS_CACHE__ = ArgumentsCache()
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
 
 
@@ -39,13 +44,6 @@ def get_current_manager():
     return __DAG_CONTEXT__.get(None)
 
 
-def next_batch_job_id():
-    i = 0
-    while True:
-        yield i
-        i += 1
-
-
 class CyclicDependencyError(BatchJobError):
     """
     Exception raised when a cyclic dependency is detected in the batch job dependencies.
@@ -79,6 +77,12 @@ class MissingDagError(BatchJobError):
         )
 
 
+class CodePackageMismatchError(BatchJobError):
+    def __init__(self, message: str):
+        super().__init__(message)
+        self.message = message
+
+
 class BatchJob:
     """
     Represents a job within a batch job.
@@ -93,6 +97,7 @@ class BatchJob:
         resources: Optional[Resources] = None,
         dependencies: Sequence[Union[str, BatchJob]] = [],
         dag: Optional[Batch] = get_current_manager(),
+        argument_mapping: Dict[str, str] = {},
     ):
         self.name = name
         self.entrypoint = entrypoint
@@ -102,12 +107,14 @@ class BatchJob:
             raise MissingDagError()
         self.__id = dag.next_job_id
         self.dag = dag
+        self.__args: Dict[str, Any] = {}
         self.dag.__BATCH_JOB_NAMES__[self.name] = self.__id
         self.dependencies = self.translate_dependencies()
         assert all(
             isinstance(dep, int) for dep in self.dependencies
         ), "All dependencies must be integers representing job IDs."
         self.dag.add_job(self)
+        self.__argument_mapping = argument_mapping or {}
 
     def __call__(self, *args, **kwds) -> BatchJob:
         """
@@ -116,6 +123,22 @@ class BatchJob:
         """
         return self
 
+    @property
+    def args(self) -> Dict[str, Any]:
+        """
+        Returns the arguments for the BatchJob instance.
+        """
+        return self.__args or {}
+
+    @args.setter
+    def args(self, args: Dict[str, Any]):
+        """
+        Sets the arguments for the BatchJob instance.
+        """
+        if not isinstance(args, dict):
+            raise TypeError(f"Expected a dictionary for args, got {type(args)}")
+        self.__args = args
+
     @property
     def id(self) -> int:
         """
@@ -123,7 +146,7 @@ class BatchJob:
         """
         return self.__id
 
-    def alias(self, name: str):
+    def alias(self, name: str) -> BatchJob:
         """
         Set an alias for the BatchJob instance.
 
@@ -136,19 +159,48 @@ class BatchJob:
         self.name = name
         return self
 
+    def set_resources(
+        self,
+        resources: Optional[Resources] = None,
+        memory: Optional[str] = None,
+        cpu: Optional[float] = None,
+    ) -> BatchJob:
+        """
+        Set the resources for the BatchJob instance.
+
+        :param resources: The Resources instance to set.
+        """
+        if resources is not None:
+            if not isinstance(resources, Resources):
+                raise TypeError(f"Expected Resources instance, got {type(resources)}")
+        else:
+            resources = Resources(
+                memory=memory or DEFAULT_TASK_MEMORY, cpu=cpu or DEFAULT_TASK_CPU
+            )
+        self.resources = resources
+        return self
+
     def __repr__(self):
         return (
             f"BatchJob(name={self.name}, entrypoint={self.entrypoint}, "
             f"resources={self.resources}) (id={self.__id})"
         )
 
+    def __getstate__(self) -> object:
+        state = self.__dict__.copy()
+        state.pop("dag", None)
+        return state
+
+    def __setstate__(self, state: dict):
+        self.__dict__.update(state)
+
     def to_dict(self):
         """
         Convert the BatchJob instance to a dictionary representation.
         """
         return {
             "display_name": self.name,
-            "name": self.__id,
+            "child_number": self.__id,
             "entrypoint": str(self.entrypoint),
             "memory": self.resources.memory if self.resources else DEFAULT_TASK_MEMORY,
             "cpu": self.resources.cpu if self.resources else DEFAULT_TASK_CPU,
@@ -183,6 +235,9 @@ class BatchJob:
 
     def __add_dependency__(self, other):
         self.dependencies.add(other.__id)
+        arg_name = self.__argument_mapping.get(other.name, other.name)
+        if arg_name is not None:
+            self.__args[arg_name] = other
 
     def __lshift__(
         self, other: Sequence[BatchJob] | BatchJob
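`__add_dependency__` now also records the upstream job in the dependent job's `args`, keyed by the upstream job's name or by its alias from `argument_mapping`; this is how an upstream result gets bound to a named parameter of the downstream entrypoint. An illustrative sketch (the dependency direction of the shift operators is assumed here; only `__lshift__`'s signature is visible in this diff):

```python
# Hypothetical jobs in the same DAG; `transform`'s entrypoint takes a `raw_data` parameter.
transform = BatchJob(
    name="transform",
    entrypoint=transform_entrypoint,        # hypothetical EntryPoint
    argument_mapping={"load": "raw_data"},  # upstream job name -> parameter name
    dag=dag,
)
transform << load  # assumed: records `load` as a dependency and sets args["raw_data"] = load
```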
@@ -223,7 +278,13 @@ class BatchJob:
         Execute the job's entrypoint.
         """
         if isinstance(self.entrypoint, EntryPoint):
-            self.entrypoint()
+            env = {
+                "DATATAILR_BATCH_ID": str(self.dag.id),
+                "DATATAILR_JOB_ID": str(self.__id),
+                "DATATAILR_JOB_NAME": self.name,
+                "DATATAILR_JOB_ARGUMENT_MAPPING": encode_json(self.__argument_mapping),
+            }
+            self.entrypoint(env=env)
         else:
             raise TypeError(f"Invalid entrypoint type: {type(self.entrypoint)}")
 
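Entrypoint execution now injects identifying environment variables, which is what lets the decorator module below resolve the right cached arguments at run time. A sketch of what a job body can read back (the variable names come from the `env` dict above):

```python
import os

# Inside a running batch job (illustrative).
batch_id = os.getenv("DATATAILR_BATCH_ID")
job_id = os.getenv("DATATAILR_JOB_ID")
job_name = os.getenv("DATATAILR_JOB_NAME")
print(f"job {job_name} (#{job_id}) of batch {batch_id}")
```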
@@ -237,12 +298,17 @@ class Batch(Job):
 
     def __init__(
         self,
-        environment: Optional[Environment],
         name: str,
-        image: Image,
-        run_as: Optional[Union[str, User]],
+        environment: Optional[Environment] = Environment.DEV,
+        schedule: Optional[Schedule] = None,
+        image: Optional[Image] = None,
+        run_as: Optional[Union[str, User]] = User.signed_user(),
         resources: Resources = Resources(memory="100m", cpu=1),
         acl: Optional[ACL] = None,
+        local_run: bool = False,
+        python_requirements: str = "",
+        build_script_pre: str = "",
+        build_script_post: str = "",
     ):
         super().__init__(
             environment=environment,
@@ -251,19 +317,25 @@ class Batch(Job):
             run_as=run_as,
             resources=resources,
             acl=acl,
+            python_requirements=python_requirements,
+            build_script_pre=build_script_pre,
+            build_script_post=build_script_post,
+            type=JobType.BATCH,
         )
-        self.type = JobType.BATCH
         self.__jobs: List[BatchJob] = []
         self._auto_run = False
-        self.__next_job_id = next_batch_job_id()
+        self.__next_job_id = -1
         self.__BATCH_JOB_NAMES__: Dict[str, int] = {}
+        self.__local_run = local_run
+        self.__schedule = schedule
 
     @property
     def next_job_id(self):
         """
        Returns a generator for the next job ID in the batch.
         """
-        return next(self.__next_job_id)
+        self.__next_job_id += 1
+        return self.__next_job_id
 
     def add_job(self, job: BatchJob):
         """
@@ -279,6 +351,25 @@ class Batch(Job):
             raise DuplicateJobNameError(job.name)
         # Use the batch level resource values as defaults for jobs
         job.resources = job.resources or self.resources
+        image_path_to_repo = self.image.path_to_repo
+        image_path_to_module = self.image.path_to_module
+        package_path_to_repo = job.entrypoint.path_to_repo
+        package_path_to_module = job.entrypoint.path_to_module
+
+        if image_path_to_repo is None:
+            self.image.path_to_repo = package_path_to_repo
+        elif package_path_to_repo != image_path_to_repo:
+            raise CodePackageMismatchError(
+                f"Function {job.entrypoint.function_name} is defined in a different package root: "
+                f"{package_path_to_repo} != {image_path_to_repo}"
+            )
+        if image_path_to_module is None:
+            self.image.path_to_module = package_path_to_module
+        elif package_path_to_module != image_path_to_module:
+            raise CodePackageMismatchError(
+                f"Function {job.entrypoint.function_name} is defined in a different module: "
+                f"{package_path_to_module} != {image_path_to_module}"
+            )
         self.__jobs.append(job)
 
     def is_job_in(self, job: BatchJob) -> bool:
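`add_job` now pins the batch image to the first job's package root and module path and rejects entrypoints from anywhere else, so all jobs in one batch must come from the same code package. Conceptually:

```python
# Illustrative: entrypoints from two different package roots cannot share a batch.
batch.add_job(job_from_repo_a)  # first job sets image.path_to_repo / path_to_module
batch.add_job(job_from_repo_b)  # raises CodePackageMismatchError if the paths differ
```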
@@ -293,6 +384,7 @@ class Batch(Job):
         """
         batch_dict = super().to_dict()
         batch_dict["jobs"] = [job.to_dict() for job in self.__jobs]
+        batch_dict["schedule"] = str(self.__schedule) if self.__schedule else None
         return batch_dict
 
     def to_json(self):
@@ -345,11 +437,41 @@ class Batch(Job):
                 "A cyclic dependency exists amongst {}".format(jobs)
             )
 
+    def get_schedule_args(self) -> Dict[str, Any]:
+        if isinstance(self.__schedule, Schedule):
+            args = {
+                "at_minute": self.__schedule.at_minutes,
+                "every_minute": self.__schedule.every_minute,
+                "at_hour": self.__schedule.at_hours,
+                "every_hour": self.__schedule.every_hour,
+                "weekdays": self.__schedule.weekdays,
+                "day_of_month": self.__schedule.day_of_month,
+                "in_month": self.__schedule.in_month,
+                "every_month": self.__schedule.every_month,
+                "timezone": self.__schedule.timezone,
+                "run_after_job_uuid": self.__schedule.run_after_job_uuid,
+                "run_after_job_name": self.__schedule.run_after_job_name,
+                "run_after_job_condition": self.__schedule.run_after_job_condition,
+            }
+            args = {key: value for key, value in args.items() if value is not None}
+            for key, value in args.items():
+                if isinstance(value, list):
+                    args[key] = ",".join(map(str, value))
+            return args
+        return {}
+
     def run(self) -> Tuple[bool, str]:
-        if is_dt_installed():
+        def arg_name(arg: Union[BatchJob, str]) -> str:
+            return arg.name if isinstance(arg, BatchJob) else arg
+
+        args = {
+            j.name: {k: arg_name(v) for k, v in j.args.items()} for j in self.__jobs
+        }
+        __ARGUMENTS_CACHE__.add_arguments(self.id, args)
+        if not self.__local_run and is_dt_installed():
             return super().run()
         else:
-            os.environ["DATATAILR_BATCH_RUN_ID"] = "1"
+            os.environ["DATATAILR_BATCH_RUN_ID"] = uuid.uuid4().hex[:8]
             for step in self.__topological_sort__():
                 for job_id in step:
                     job = self.__jobs[job_id]
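Two behavioral changes land in `run`: every job's argument mapping is snapshotted into the shared `ArgumentsCache` before dispatch, and local runs get a random 8-hex-digit `DATATAILR_BATCH_RUN_ID` instead of the hard-coded `"1"`, which avoids reusing the same run ID across local runs. `get_schedule_args` flattens an attached `Schedule` into plain keyword arguments. A sketch of its expected shape, continuing the hypothetical `batch` from the earlier sketch:

```python
# Illustrative output; exact keys depend on which Schedule fields were set.
print(batch.get_schedule_args())
# {'at_hour': '2', 'timezone': 'UTC'}  (lists comma-joined, None values dropped)
```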
@@ -11,21 +11,25 @@
 import functools
 import inspect
 import os
+from typing import Callable
 
 from datatailr.logging import DatatailrLogger
-from datatailr.scheduler.arguments_cache import ArgumentsCache
+from datatailr.scheduler.arguments_cache import ArgumentsCache, CacheNotFoundError
 from datatailr.scheduler.base import EntryPoint, JobType, Resources
-from datatailr.scheduler.batch import BatchJob, get_current_manager
+from datatailr.scheduler.batch import (
+    BatchJob,
+    get_current_manager,
+)
 from datatailr.scheduler.constants import DEFAULT_TASK_CPU, DEFAULT_TASK_MEMORY
 from datatailr.scheduler.utils import get_available_env_args
 
 __ARGUMENTS_CACHE__ = ArgumentsCache()
-__FUNCTIONS_CREATED_IN_DAG__: dict[BatchJob, str] = {}
+__FUNCTIONS_CREATED_IN_DAG__: dict[Callable, str] = {}
 logger = DatatailrLogger(__name__).get_logger()
 
 
 def batch_run_id() -> str:
-    return os.environ["DATATAILR_BATCH_RUN_ID"]
+    return os.getenv("DATATAILR_BATCH_RUN_ID", "unknown")
 
 
 def dag_id(job: BatchJob) -> str:
@@ -34,7 +38,7 @@ def dag_id(job: BatchJob) -> str:
     )
 
 
-def batch_decorator(memory=DEFAULT_TASK_MEMORY, cpu=DEFAULT_TASK_CPU):
+def batch_decorator(memory: str = DEFAULT_TASK_MEMORY, cpu: float = DEFAULT_TASK_CPU):
     """
     Decorator to mark a function as a batch job.
     This decorator can be used to wrap functions that should be executed as part of batch jobs.
@@ -55,39 +59,62 @@ def batch_decorator(memory=DEFAULT_TASK_MEMORY, cpu=DEFAULT_TASK_CPU):
                 # There are two possible scenarios:
                 # 1. The function is called directly, not as part of a batch job. In this case, the args and kwargs should be used.
                 # 2. The function is called as part of a batch job - it was constructed as part of a DAG and is now being executed.
-                if func not in __FUNCTIONS_CREATED_IN_DAG__:
-                    return func(*args, **kwargs)
-                function_arguments = [v.name for v in parameters.values()]
                 env_args = get_available_env_args()
+                all_function_args = [
+                    v.name
+                    for v in parameters.values()
+                    if v.kind
+                    not in (
+                        inspect.Parameter.VAR_POSITIONAL,
+                        inspect.Parameter.VAR_KEYWORD,
+                    )
+                ]
                 final_args = list(args)
-                final_kwargs = kwargs.copy()
 
                 for name, value in env_args.items():
-                    if name in function_arguments:
-                        if len(final_args) < len(function_arguments):
+                    if name in all_function_args:
+                        if len(final_args) < len(all_function_args):
                             final_args.extend(
-                                [None] * (len(function_arguments) - len(final_args))
+                                [None] * (len(all_function_args) - len(final_args))
                             )
-                        final_args[function_arguments.index(name)] = value
-                function_arguments = __ARGUMENTS_CACHE__.get_arguments(
-                    dag_id(func), func.__name__
+                        final_args[all_function_args.index(name)] = value
+                try:
+                    final_kwargs = __ARGUMENTS_CACHE__.get_arguments(
+                        dag_id(func),
+                        os.getenv("DATATAILR_JOB_NAME", func.__name__),
+                        os.getenv("DATATAILR_BATCH_RUN_ID"),
+                    )
+                except CacheNotFoundError:
+                    final_kwargs = kwargs
+
+                if varargs is not None and varkw is None:
+                    for key in list(final_kwargs.keys()):
+                        if key not in parameters:
+                            final_args.append(final_kwargs.pop(key))
+
+                # Some of the loaded arguments are actually args and not kwargs.
+                if len(final_args) == len(parameters.keys()):
+                    for i, arg_name in enumerate(parameters.keys()):
+                        final_args[i] = final_kwargs.pop(arg_name, final_args[i])
+                result = func(*final_args, **final_kwargs)
+                __ARGUMENTS_CACHE__.add_result(
+                    batch_run_id(),
+                    os.getenv("DATATAILR_JOB_NAME", func.__name__),
+                    result,
                 )
-                result = func(**function_arguments)
-                __ARGUMENTS_CACHE__.add_result(batch_run_id(), func.__name__, result)
                 return result
             else:
-                __FUNCTIONS_CREATED_IN_DAG__[func] = dag.id
-                all_args = dict(zip(spec.args, args)) | kwargs
-
-                __ARGUMENTS_CACHE__.add_arguments(dag.id, func.__name__, all_args)
-
+                if varargs is not None:
+                    all_args = {job.name: job for job in args}
+                else:
+                    all_args = dict(zip(spec.args, args)) | kwargs
                 dag.set_autorun(True)
+
                 job = BatchJob(
                     name=func.__name__,
                     entrypoint=EntryPoint(
                         JobType.BATCH,
-                        module_name=func.__module__,
-                        function_name=func.__name__,
+                        func=func,
                     ),
                     resources=Resources(memory=memory, cpu=cpu),
                     dependencies=[
@@ -97,6 +124,8 @@ def batch_decorator(memory=DEFAULT_TASK_MEMORY, cpu=DEFAULT_TASK_CPU):
                     ],
                     dag=dag,
                 )
+                job.args = all_args
+                __FUNCTIONS_CREATED_IN_DAG__[job.entrypoint.func] = dag.id
                 return job
 
     module = inspect.getmodule(func)
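The rebuilt wrapper has two paths: when executed as a job it assembles `final_args`/`final_kwargs` from environment variables and the arguments cache (falling back to the caller's kwargs on `CacheNotFoundError`) and records the return value with `add_result`; when called during DAG construction it builds a `BatchJob` whose `EntryPoint` now receives the function object itself (`func=func`). A hedged sketch of how decorated functions compose, assuming `batch_decorator` from this file is in scope:

```python
# Illustrative sketch of the decorator flow shown in this diff.
@batch_decorator(memory="200m", cpu=1.0)
def load():
    return {"rows": 42}

@batch_decorator()
def transform(load):
    # At execution time, `load` is resolved from the arguments cache,
    # where the upstream job's result was stored via add_result().
    return load["rows"] * 2

# During DAG construction, calls return BatchJob objects instead of running,
# so transform(load()) wires the dependency load -> transform.
```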
@@ -9,7 +9,7 @@
 # *************************************************************************
 
 DEFAULT_TASK_MEMORY = "100m"
-DEFAULT_TASK_CPU = 1
+DEFAULT_TASK_CPU = 1.0
 
 BATCH_JOB_ARGUMENTS = (
     "rundate",
@@ -0,0 +1,117 @@
+##########################################################################
+#
+# Copyright (c) 2025 - Datatailr Inc.
+# All Rights Reserved.
+#
+# This file is part of Datatailr and subject to the terms and conditions
+# defined in 'LICENSE.txt'. Unauthorized copying and/or distribution
+# of this file, in parts or full, via any medium is strictly prohibited.
+##########################################################################
+
+from typing import Any
+from datatailr.wrapper import dt__Job
+import re
+
+
+__CLIENT__ = dt__Job()
+
+
+class Schedule:
+    """
+    Represents a schedule for batch jobs.
+    """
+
+    def __init__(
+        self,
+        cron_expression: str = "",
+        at_minutes: list[int] | None = None,
+        every_minute: int | None = None,
+        at_hours: list[int] | None = None,
+        every_hour: int | None = None,
+        weekdays: list[str] | None = None,
+        day_of_month: int | None = None,
+        in_month: list[str] | None = None,
+        every_month: int | None = None,
+        timezone: str | None = None,
+        run_after_job_uuid: str | None = None,
+        run_after_job_name: str | None = None,
+        run_after_job_condition: str | None = None,
+    ):
+        self.at_minutes = at_minutes
+        self.every_minute = every_minute
+        self.at_hours = at_hours
+        self.every_hour = every_hour
+        self.weekdays = weekdays
+        self.day_of_month = day_of_month
+        self.in_month = in_month
+        self.every_month = every_month
+        self.timezone = timezone
+        self.run_after_job_uuid = run_after_job_uuid
+        self.run_after_job_name = run_after_job_name
+        self.run_after_job_condition = run_after_job_condition
+        self.schedule_expression = None
+        self.cron_expression = cron_expression
+
+        self.__is_set__ = False
+
+    def __str__(self) -> str:
+        self.__compile__()
+        return self.cron_expression
+
+    def __repr__(self) -> str:
+        self.__compile__()
+        return f"Schedule(cron_expression={self.cron_expression}, timezone={self.timezone}) - {self.schedule_expression}"
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        super().__setattr__(name, value)
+        if name in [
+            "at_minutes",
+            "at_hours",
+            "every_minute",
+            "every_hour",
+            "weekdays",
+            "day_of_month",
+            "in_month",
+            "every_month",
+        ]:
+            self.__is_set__ = False
+
+    def __compile__(self):
+        if self.__is_set__:
+            return
+        argument_name = [
+            "at_minutes",
+            "at_hours",
+            "every_minute",
+            "every_hour",
+            "weekdays",
+            "day_of_month",
+            "in_month",
+            "every_month",
+            "run_after_job_uuid",
+            "run_after_job_name",
+            "run_after_job_condition",
+        ]
+        arguments = {}
+
+        for key in argument_name:
+            if hasattr(self, key) and getattr(self, key) is not None:
+                value = getattr(self, key)
+                if isinstance(value, list):
+                    value = ",".join(map(str, value))
+                arguments[key] = value
+
+        result = __CLIENT__.run("", cron_string=True, **arguments)
+        match = re.match(r"^(.*?)\s*\((.*?)\)$", result)
+        if match:
+            cron_expression, schedule_expression = match.groups()
+            self.cron_expression = cron_expression.strip()
+            self.schedule_expression = schedule_expression.strip()
+        self.__is_set__ = True
+
+    def get_cron_string(self) -> str:
+        """
+        Returns the compiled cron string.
+        """
+        self.__compile__()
+        return self.cron_expression
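`Schedule` compiles its fields lazily: the first `str()`, `repr()`, or `get_cron_string()` call sends the non-`None` fields (lists comma-joined) to the `dt__Job` client and caches the returned cron string until a schedule field changes. A hedged usage sketch; the printed cron string is illustrative, since the actual format comes from the client:

```python
from datatailr.scheduler.schedule import Schedule  # module path as imported in this diff

sched = Schedule(at_minutes=[30], at_hours=[6, 18], timezone="UTC")

print(sched.get_cron_string())  # e.g. "30 6,18 * * *" (exact output depends on dt__Job)

sched.every_hour = 2  # __setattr__ invalidates the cached compilation
print(str(sched))     # triggers a fresh __compile__() round-trip
```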
@@ -16,7 +16,9 @@ from datatailr.scheduler.constants import BATCH_JOB_ARGUMENTS
 def get_available_env_args():
     """
     Get the available environment variables for batch job arguments.
-    This function retrieves the environment variables that match the keys defined in BATCH_JOB_ARGUMENTS.
+
+    This function retrieves the environment variables that match the keys defined in DATATAILR_BATCH_JOB_ARGUMENTS.
+
     Returns:
         dict: A dictionary of available environment variables for batch jobs.
     """