datatailr 0.1.20__tar.gz → 0.1.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datatailr-0.1.20/src/datatailr.egg-info → datatailr-0.1.22}/PKG-INFO +1 -1
- {datatailr-0.1.20 → datatailr-0.1.22}/pyproject.toml +1 -1
- datatailr-0.1.22/src/datatailr/excel.py +19 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/arguments_cache.py +12 -6
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/base.py +20 -11
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/batch.py +18 -5
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/user.py +1 -1
- {datatailr-0.1.20 → datatailr-0.1.22/src/datatailr.egg-info}/PKG-INFO +1 -1
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr.egg-info/SOURCES.txt +1 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/sbin/datatailr_run.py +1 -7
- {datatailr-0.1.20 → datatailr-0.1.22}/LICENSE +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/README.md +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/setup.cfg +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/setup.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/__init__.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/acl.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/blob.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/build/__init__.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/build/image.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/dt_json.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/errors.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/group.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/logging.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/__init__.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/batch_decorator.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/constants.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/schedule.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/utils.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/utils.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/version.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/wrapper.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr.egg-info/dependency_links.txt +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr.egg-info/entry_points.txt +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr.egg-info/requires.txt +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr.egg-info/top_level.txt +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/sbin/datatailr_run_app.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/sbin/datatailr_run_batch.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/sbin/datatailr_run_excel.py +0 -0
- {datatailr-0.1.20 → datatailr-0.1.22}/src/sbin/datatailr_run_service.py +0 -0
datatailr-0.1.22/src/datatailr/excel.py (new file)

@@ -0,0 +1,19 @@
+try:
+    from dt.excel import Addin  # type: ignore
+except ImportError:
+
+    class DummyAddin:
+        def __init__(self, name: str, description: str) -> None:
+            self.name = name
+            self.description = description
+
+        def expose(self, **kwargs):
+            def decorator(func):
+                return func
+
+            return decorator
+
+        def run(self):
+            pass
+
+    Addin = DummyAddin
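The new excel.py makes `datatailr.excel` import-safe outside the Datatailr runtime: when the proprietary `dt.excel` module is absent, `Addin` resolves to a stub whose `expose` decorator returns the function unchanged and whose `run` is a no-op. A minimal usage sketch (the add-in name and the `double` function are hypothetical, not from this release; the real `dt.excel` Addin may accept further arguments):

    from datatailr.excel import Addin

    addin = Addin(name="demo-addin", description="Example add-in")

    @addin.expose()  # a no-op decorator when dt.excel is unavailable
    def double(x: float) -> float:
        return 2 * x

    # With the real dt runtime this would register and serve the exposed
    # functions; under the stub it simply does nothing.
    addin.run()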
{datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/arguments_cache.py

@@ -21,16 +21,18 @@ and the inner dictionaries contain the arguments.
 This module is for internal use of the datatailr package.
 """
 
-from datatailr.dt_json import json
+from datatailr.dt_json import json
 import os
 import pickle
 from typing import Any, Dict, Optional
 
 from datatailr import is_dt_installed, Blob
 from datatailr.errors import DatatailrError
+from datatailr.logging import DatatailrLogger
 
 
 __BLOB_STORAGE__ = Blob()
+logger = DatatailrLogger(__name__).get_logger()
 
 
 class CacheNotFoundError(DatatailrError):

@@ -79,6 +81,9 @@ class ArgumentsCache:
         :param job_name: Name of the job.
         :return: Dictionary of arguments.
         """
+        logger.info(
+            f"Retrieving arguments for {batch_id=}, {job=}, {batch_run_id=}, {self.use_persistent_cache=}"
+        )
         path = f"/tmp/datatailr/batch/arguments/{batch_id}.pkl"
         if self.use_persistent_cache and isinstance(job, str):
             try:

@@ -101,12 +106,8 @@ class ArgumentsCache:
             )
             if batch_run_id is None:
                 return arg_keys[job]
-        arguments_mapping = decode_json(
-            os.getenv("DATATAILR_JOB_ARGUMENT_MAPPING", "{}")
-        )
-        arguments_mapping = {value: key for key, value in arguments_mapping.items()}
         args = {
-
+            name: self.get_result(batch_run_id, value)
             for name, value in arg_keys[job].items()
         }
         return args

@@ -120,6 +121,9 @@ class ArgumentsCache:
         :param result: Result of the batch job.
         """
         path = f"/tmp/datatailr/batch/results/{batch_run_id}_{job}.pkl"
+        logger.info(
+            f"Adding result for {batch_run_id=}, {job=}, {result=}, {self.use_persistent_cache=}"
+        )
         if self.use_persistent_cache and isinstance(job, str):
             self._add_to_persistent_cache(path, result)
         else:

@@ -156,6 +160,7 @@ class ArgumentsCache:
 
         """
         path = path.replace("/tmp/", "")
+        logger.info(f"Adding arguments to persistent cache for {path=}")
         __BLOB_STORAGE__.put_blob(path, json.dumps(blob))
 
     def _get_from_persistent_cache(self, path: str) -> Any:

@@ -165,5 +170,6 @@ class ArgumentsCache:
         :param path: Path in the Blob storage where the blob is stored.
         """
         path = path.replace("/tmp/", "")
+        logger.info(f"Retrieving arguments from persistent cache for {path=}")
        data = __BLOB_STORAGE__.get_blob(path)
         return json.loads(data)
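Net effect of the arguments_cache.py changes: argument resolution no longer reads the DATATAILR_JOB_ARGUMENT_MAPPING environment variable; the cached per-job mapping already carries the final parameter names, and each value is resolved through get_result. A rough sketch of the new resolution step (the sample data and the standalone get_result are hypothetical stand-ins for the cached mapping and the ArgumentsCache method):

    # arg_keys maps job name -> {parameter name -> upstream job name}
    arg_keys = {"report": {"prices": "load_prices", "fx": "load_fx"}}

    def get_result(batch_run_id: str, job_name: str) -> str:
        # stand-in for ArgumentsCache.get_result, which loads the upstream
        # job's cached result for this run
        return f"<result of {job_name} in {batch_run_id}>"

    args = {
        name: get_result("run-42", value)
        for name, value in arg_keys["report"].items()
    }
    # {'prices': '<result of load_prices in run-42>',
    #  'fx': '<result of load_fx in run-42>'}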
{datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/base.py

@@ -11,8 +11,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-import importlib
-import inspect
+import importlib.util
 import json
 import os
 import tempfile

@@ -96,12 +95,17 @@ class EntryPoint:
 
         # Find the absolute path to the repository and then the relative path to the module.
         # This will be used in the creation of the code 'bundle' when building the image.
-
-
-
+        module_spec = importlib.util.find_spec(func.__module__)
+        if module_spec is not None and module_spec.origin is not None:
+            package_root = module_spec.origin
+        else:
+            package_root = "."
         module_parts = self.module_name.split(".")
         for _ in module_parts:
             package_root = os.path.dirname(package_root)
+        path_to_repo = run_shell_command(
+            f"cd {package_root} && git rev-parse --show-toplevel"
+        )[0]
         path_to_module = os.path.relpath(package_root, path_to_repo)
         self.path_to_repo = path_to_repo
         self.path_to_module = path_to_module

@@ -123,7 +127,7 @@ class EntryPoint:
         return f"EntryPoint({self.function_name} from {self.module_name}, type={self.type})"
 
     def __str__(self):
-        return f"{self.module_name}
+        return f"{self.module_name}:{self.function_name}"
 
 
 class Job:

@@ -277,20 +281,25 @@ class Job:
         2. Check if the local commit matches the remote HEAD (the repo is synced with the remote).
         Returns a tuple of (branch: str, commit_hash: str).
         """
-
-
+        path_to_repo = self.image.path_to_repo or "."
+        local_commit = run_shell_command(f"cd {path_to_repo} && git rev-parse HEAD")[0]
+        branch_name = run_shell_command(
+            f"cd {path_to_repo} && git rev-parse --abbrev-ref HEAD"
+        )[0]
 
         if os.getenv("DATATAILR_ALLOW_UNSAFE_SCHEDULING", "false").lower() == "true":
             return branch_name, local_commit
-        return_code = run_shell_command("git diff --exit-code")
-        is_committed = return_code == 0
+        return_code = run_shell_command(f"cd {path_to_repo} && git diff --exit-code")
+        is_committed = return_code is not None and return_code[1] == 0
 
         if not is_committed:
             raise RepoValidationError(
                 "Please commit your changes before running the job."
             )
 
-        remote_commit = run_shell_command(
+        remote_commit = run_shell_command(
+            f"cd {path_to_repo} && git ls-remote origin HEAD"
+        )[0].split("\t")[0]
 
         if local_commit != remote_commit:
             raise RepoValidationError(
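The revised repository validation runs every git command from the repository root (`cd {path_to_repo} && ...`) rather than the current working directory, and `is_committed` now indexes into the command result. That implies run_shell_command returns something like (stdout, return_code); the sketch below is inferred from how the diff indexes the result and is not the package's actual helper:

    import subprocess

    def run_shell_command(command: str) -> tuple[str, int]:
        # assumed shape: index 0 is stripped stdout, index 1 is the exit code
        proc = subprocess.run(command, shell=True, capture_output=True, text=True)
        return proc.stdout.strip(), proc.returncode

    # e.g. branch = run_shell_command("git rev-parse --abbrev-ref HEAD")[0]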
{datatailr-0.1.20 → datatailr-0.1.22}/src/datatailr/scheduler/batch.py

@@ -18,7 +18,6 @@ from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
 import uuid
 
 from datatailr import Image
-from datatailr.dt_json import encode_json
 from datatailr.errors import BatchJobError
 from datatailr.logging import DatatailrLogger
 from datatailr.scheduler.base import (

@@ -114,7 +113,7 @@ class BatchJob:
             isinstance(dep, int) for dep in self.dependencies
         ), "All dependencies must be integers representing job IDs."
         self.dag.add_job(self)
-        self.
+        self.argument_mapping = argument_mapping or {}
 
     def __call__(self, *args, **kwds) -> BatchJob:
         """

@@ -200,6 +199,7 @@ class BatchJob:
         """
         return {
             "display_name": self.name,
+            "name": self.name,
             "child_number": self.__id,
             "entrypoint": str(self.entrypoint),
             "memory": self.resources.memory if self.resources else DEFAULT_TASK_MEMORY,

@@ -235,7 +235,7 @@ class BatchJob:
 
     def __add_dependency__(self, other):
         self.dependencies.add(other.__id)
-        arg_name = self.
+        arg_name = self.argument_mapping.get(other.name, other.name)
         if arg_name is not None:
             self.__args[arg_name] = other

@@ -282,7 +282,6 @@ class BatchJob:
                 "DATATAILR_BATCH_ID": str(self.dag.id),
                 "DATATAILR_JOB_ID": str(self.__id),
                 "DATATAILR_JOB_NAME": self.name,
-                "DATATAILR_JOB_ARGUMENT_MAPPING": encode_json(self.__argument_mapping),
             }
             self.entrypoint(env=env)
         else:

@@ -464,9 +463,23 @@ class Batch(Job):
         def arg_name(arg: Union[BatchJob, str]) -> str:
             return arg.name if isinstance(arg, BatchJob) else arg
 
+        def merged(dst: dict[str, str], src: dict[str, str]) -> dict[str, str]:
+            # copy so we don't mutate the original mapping
+            out = dict(dst)
+            seen_vals = set(out.values())
+            for k, v in src.items():
+                if v not in seen_vals:
+                    out[k] = v
+                    seen_vals.add(v)
+            return out
+
         args = {
-            j.name:
+            j.name: merged(
+                j.argument_mapping, {k: arg_name(v) for k, v in j.args.items()}
+            )
+            for j in self.__jobs
         }
+
         __ARGUMENTS_CACHE__.add_arguments(self.id, args)
         if not self.__local_run and is_dt_installed():
             return super().run()
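The batch.py changes move the job-to-parameter mapping off the DATATAILR_JOB_ARGUMENT_MAPPING environment variable and onto the BatchJob instance (`self.argument_mapping`), which __add_dependency__ and Batch.run now consult directly. A small sketch of the renaming rule, with invented job names:

    # argument_mapping maps an upstream job's name to the parameter name
    # the downstream function expects; unmapped jobs keep their own name.
    argument_mapping = {"load_prices": "prices"}

    def wire(upstream_name: str) -> str:
        return argument_mapping.get(upstream_name, upstream_name)

    assert wire("load_prices") == "prices"   # renamed parameter
    assert wire("load_fx") == "load_fx"      # falls through to the job name

The `merged` helper in Batch.run then folds these explicit mappings together with the names inferred from j.args, skipping inferred entries whose upstream job is already mapped.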
{datatailr-0.1.20 → datatailr-0.1.22}/src/sbin/datatailr_run.py

@@ -27,7 +27,6 @@
 # DATATAILR_GID - the group ID of the group as it is defined in the system.
 # DATATAILR_JOB_TYPE - the type of job to run. (batch\service\app\excel\IDE)
 # Job environment variables (not all are always relevant, depending on the job type):
-# DATATAILR_JOB_ARGUMENT_MAPPING - a JSON string mapping job argument names to their
 # DATATAILR_BATCH_RUN_ID - the unique identifier for the batch run.
 # DATATAILR_BATCH_ID - the unique identifier for the batch.
 # DATATAILR_JOB_ID - the unique identifier for the job.

@@ -37,7 +36,6 @@ import os
 import sys
 from typing import Tuple
 from datatailr.logging import DatatailrLogger
-from datatailr.dt_json import encode_json
 
 
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()

@@ -83,7 +81,7 @@ def run_command_as_user(command: str, user: str, env_vars: dict):
     Run a command as a specific user with the given environment variables.
     """
     env_vars.update({"PATH": get_env_var("PATH")})
-    env_vars.update({"PYTHONPATH": get_env_var("PYTHONPATH")})
+    env_vars.update({"PYTHONPATH": get_env_var("PYTHONPATH", "")})
     env_vars_str = " ".join(f"{key}='{value}'" for key, value in env_vars.items())
     full_command = f"sudo -u {user} {env_vars_str} {command}"
     logger.debug(f"Running command: {full_command}")

@@ -103,16 +101,12 @@ def main():
     if job_type == "batch":
         run_id = get_env_var("DATATAILR_BATCH_RUN_ID")
         batch_id = get_env_var("DATATAILR_BATCH_ID")
-        job_argument_mapping = get_env_var(
-            "DATATAILR_JOB_ARGUMENT_MAPPING", encode_json({})
-        )
         entrypoint = get_env_var("DATATAILR_BATCH_ENTRYPOINT")
         env = {
             "DATATAILR_BATCH_RUN_ID": run_id,
             "DATATAILR_BATCH_ID": batch_id,
             "DATATAILR_JOB_ID": job_id,
             "DATATAILR_BATCH_ENTRYPOINT": entrypoint,
-            "DATATAILR_JOB_ARGUMENT_MAPPING": job_argument_mapping,
         }
         run_command_as_user("datatailr_run_batch", user, env)
     elif job_type == "service":
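In datatailr_run.py, the launcher no longer forwards DATATAILR_JOB_ARGUMENT_MAPPING, and PYTHONPATH is now looked up with an empty-string default so a missing variable cannot abort the launch. A sketch of the command assembly with a hypothetical get_env_var (the real helper's failure mode on a missing variable is assumed):

    import os

    def get_env_var(name: str, default: str | None = None) -> str:
        value = os.environ.get(name, default)
        if value is None:
            raise KeyError(f"{name} is not set")  # assumed failure mode
        return value

    env_vars = {
        "PATH": get_env_var("PATH"),
        "PYTHONPATH": get_env_var("PYTHONPATH", ""),  # "" when unset
    }
    env_vars_str = " ".join(f"{key}='{value}'" for key, value in env_vars.items())
    print(f"sudo -u someuser {env_vars_str} datatailr_run_batch")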