datatailr 0.1.73__tar.gz → 0.1.81__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datatailr-0.1.73/src/datatailr.egg-info → datatailr-0.1.81}/PKG-INFO +19 -15
- {datatailr-0.1.73 → datatailr-0.1.81}/README.md +18 -14
- {datatailr-0.1.73 → datatailr-0.1.81}/pyproject.toml +2 -1
- {datatailr-0.1.73 → datatailr-0.1.81}/setup.py +3 -1
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/__init__.py +14 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/build/image.py +6 -4
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/excel/addin.py +35 -8
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/logging.py +85 -4
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/__init__.py +8 -2
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/base.py +28 -15
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/batch.py +32 -6
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/batch_decorator.py +12 -3
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/constants.py +2 -2
- datatailr-0.1.81/src/datatailr/scheduler/job.py +112 -0
- datatailr-0.1.81/src/datatailr/scheduler/workflow.py +84 -0
- {datatailr-0.1.73 → datatailr-0.1.81/src/datatailr.egg-info}/PKG-INFO +19 -15
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr.egg-info/SOURCES.txt +6 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr.egg-info/entry_points.txt +1 -0
- datatailr-0.1.81/src/datatailr.egg-info/top_level.txt +2 -0
- datatailr-0.1.81/src/datatailr_demo/README.md +112 -0
- datatailr-0.1.81/src/datatailr_demo/__init__.py +15 -0
- datatailr-0.1.81/src/datatailr_demo/examples.py +47 -0
- datatailr-0.1.81/src/sbin/datatailr_cli.py +195 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/sbin/datatailr_run.py +147 -35
- {datatailr-0.1.73 → datatailr-0.1.81}/src/sbin/datatailr_run_excel.py +2 -2
- datatailr-0.1.73/src/datatailr.egg-info/top_level.txt +0 -1
- {datatailr-0.1.73 → datatailr-0.1.81}/LICENSE +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/setup.cfg +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/acl.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/blob.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/build/__init__.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/dt_json.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/errors.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/excel/__init__.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/excel/stubs.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/group.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/arguments_cache.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/schedule.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/utils.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/tag.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/user.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/utils.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/version.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/wrapper.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr.egg-info/dependency_links.txt +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr.egg-info/requires.txt +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/sbin/datatailr_run_app.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/sbin/datatailr_run_batch.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/sbin/datatailr_run_service.py +0 -0
The hunks below are reconstructed as unified diffs. Pre-change lines that the registry viewer truncated are reproduced as-is (they break off mid-token).

````diff
--- datatailr-0.1.73/src/datatailr.egg-info/PKG-INFO
+++ datatailr-0.1.81/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datatailr
-Version: 0.1.73
+Version: 0.1.81
 Summary: Ready-to-Use Platform That Drives Business Insights
 Author-email: Datatailr <info@datatailr.com>
 License-Expression: MIT
@@ -84,25 +84,27 @@ print(datatailr.__provider__)
 The following example shows how to create a simple data pipeline using the Datatailr Python package.
 
 ```python
-from datatailr
+from datatailr import workflow, task
 
-@
+@task()
 def func_no_args() -> str:
     return "no_args"
 
 
-@
+@task()
 def func_with_args(a: int, b: float) -> str:
     return f"args: {a}, {b}"
 
-
+@workflow(name="MY test DAG")
+def my_workflow():
     for n in range(2):
         res1 = func_no_args().alias(f"func_{n}")
         res2 = func_with_args(1, res1).alias(f"func_with_args_{n}")
+my_workflow(local_run=True)
 ```
 
 Running this code will create a graph of jobs and execute it.
-Each node on the graph represents a job, which in turn is a call to a function decorated with `@
+Each node on the graph represents a job, which in turn is a call to a function decorated with `@task()`.
 
 Since this is a local run then the execution of each node will happen sequentially in the same process.
 
@@ -117,14 +119,14 @@ You will first need to separate your function definitions from the DAG definitio
 ```python
 # my_module.py
 
-from datatailr
+from datatailr import task
 
-@
+@task()
 def func_no_args() -> str:
     return "no_args"
 
 
-@
+@task()
 def func_with_args(a: int, b: float) -> str:
     return f"args: {a}, {b}"
 ```
@@ -133,18 +135,20 @@ To use these functions in a batch job, you just need to import them and run in a
 
 ```python
 from my_module import func_no_args, func_with_args
-from datatailr
+from datatailr import workflow
 
-
-
-with Batch(name="MY test DAG", schedule=schedule) as dag:
+@workflow(name="MY test DAG")
+def my_workflow():
     for n in range(2):
         res1 = func_no_args().alias(f"func_{n}")
         res2 = func_with_args(1, res1).alias(f"func_with_args_{n}")
+
+schedule = Schedule(at_hours=0)
+my_workflow(schedule=schedule)
 ```
 
-This will submit the entire
-The
+This will submit the entire workflow for execution, and the scheduler will take care of running the jobs in parallel and managing the resources.
+The workflow in the example above will be scheduled to run daily at 00:00.
 
 ___
 Visit [our website](https://www.datatailr.com/) for more!
````
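Assembled from the added lines above, the post-change pipeline example reads as one piece:

```python
from datatailr import workflow, task

@task()
def func_no_args() -> str:
    return "no_args"

@task()
def func_with_args(a: int, b: float) -> str:
    return f"args: {a}, {b}"

@workflow(name="MY test DAG")
def my_workflow():
    for n in range(2):
        res1 = func_no_args().alias(f"func_{n}")
        res2 = func_with_args(1, res1).alias(f"func_with_args_{n}")

# local_run=True executes each node sequentially in the current process,
# per the README text above.
my_workflow(local_run=True)
```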
README.md receives the same three hunks (@@ -47,25 +47,27 @@, @@ -80,14 +82,14 @@, and @@ -96,18 +98,20 @@): the PKG-INFO shown above embeds the README verbatim, so the content changes are identical.
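One wrinkle in the new scheduled example: it calls `Schedule(at_hours=0)` without importing `Schedule`. Judging by the `scheduler/__init__.py` hunk further down, which keeps `"Schedule"` exported, a self-contained version would presumably look like this (the extra import is an assumption, not part of the diff):

```python
from my_module import func_no_args, func_with_args
from datatailr import workflow
from datatailr.scheduler import Schedule  # assumed import; Schedule stays exported below

@workflow(name="MY test DAG")
def my_workflow():
    for n in range(2):
        res1 = func_no_args().alias(f"func_{n}")
        res2 = func_with_args(1, res1).alias(f"func_with_args_{n}")

schedule = Schedule(at_hours=0)  # daily at 00:00, per the README text
my_workflow(schedule=schedule)
```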
```diff
--- datatailr-0.1.73/pyproject.toml
+++ datatailr-0.1.81/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "datatailr"
-version = "0.1.73"
+version = "0.1.81"
 description = "Ready-to-Use Platform That Drives Business Insights"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -39,6 +39,7 @@ datatailr_run_batch = "datatailr.sbin.datatailr_run_batch:run"
 datatailr_run_app = "datatailr.sbin.datatailr_run_app:run"
 datatailr_run_excel = "datatailr.sbin.datatailr_run_excel:run"
 datatailr_run_service = "datatailr.sbin.datatailr_run_service:run"
+datatailr = "datatailr.sbin.datatailr_cli:main"
 
 [project.optional-dependencies]
 dev = [
```
```diff
--- datatailr-0.1.73/setup.py
+++ datatailr-0.1.81/setup.py
@@ -10,12 +10,14 @@ setup(
         (
             "/datatailr/sbin",
             [
+                "src/sbin/datatailr_cli.py",
                 "src/sbin/datatailr_run.py",
                 "src/sbin/datatailr_run_batch.py",
                 "src/sbin/datatailr_run_app.py",
                 "src/sbin/datatailr_run_excel.py",
                 "src/sbin/datatailr_run_service.py",
             ],
-        )
+        ),
+        ("datatailr_demo", ["src/datatailr_demo/README.md"]),
     ],
 )
```
```diff
--- datatailr-0.1.73/src/datatailr/__init__.py
+++ datatailr-0.1.81/src/datatailr/__init__.py
@@ -16,6 +16,14 @@ from datatailr.blob import Blob
 from datatailr.build import Image
 from datatailr.utils import Environment, is_dt_installed
 from datatailr.version import __version__
+from datatailr.scheduler import (
+    App,
+    Service,
+    ExcelAddin,
+    workflow,
+    task,
+    set_allow_unsafe_scheduling,
+)
 
 system = dt__System()
 if isinstance(system, mock_cli_tool):
@@ -33,4 +41,10 @@ __all__ = [
     "__version__",
     "__provider__",
     "is_dt_installed",
+    "App",
+    "Service",
+    "ExcelAddin",
+    "workflow",
+    "task",
+    "set_allow_unsafe_scheduling",
 ]
```
```diff
--- datatailr-0.1.73/src/datatailr/build/image.py
+++ datatailr-0.1.81/src/datatailr/build/image.py
@@ -10,7 +10,7 @@
 
 import json
 import os
-import
+import sys
 from typing import Optional
 
 from datatailr import ACL, User
@@ -26,7 +26,7 @@ class Image:
     def __init__(
         self,
         acl: Optional[ACL] = None,
-        python_version: str = "
+        python_version: str = "auto",
         python_requirements: str | list[str] = "",
         build_script_pre: str = "",
         build_script_post: str = "",
@@ -56,8 +56,10 @@ class Image:
     def python_version(self, value: str):
         if not isinstance(value, str):
             raise TypeError("python_version must be a string.")
-        if
-
+        if value.lower() == "auto":
+            value = f"{sys.version_info.major}.{sys.version_info.minor}"
+        if value not in ["3.10", "3.11", "3.12", "3.13", "3.14"]:
+            raise ValueError(f"Invalid python_version: {value}")
         self._python_version = value
 
     @property
```
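`Image.python_version` now defaults to `"auto"`, which resolves to the interpreter doing the scheduling. A standalone sketch of the setter's new logic (same checks, lifted out of the class):

```python
import sys

SUPPORTED_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.14"]

def resolve_python_version(value: str = "auto") -> str:
    # Mirror of the Image.python_version setter logic in the hunk above.
    if not isinstance(value, str):
        raise TypeError("python_version must be a string.")
    if value.lower() == "auto":
        # "auto" resolves to the interpreter running this code, e.g. "3.12".
        value = f"{sys.version_info.major}.{sys.version_info.minor}"
    if value not in SUPPORTED_VERSIONS:
        raise ValueError(f"Invalid python_version: {value}")
    return value

print(resolve_python_version("3.11"))  # -> 3.11
print(resolve_python_version())        # -> version of the current interpreter
```

One observable consequence: pyproject.toml still declares `requires-python = ">=3.9"`, but `"auto"` on a 3.9 interpreter resolves to `"3.9"`, which is not in the allow-list, so constructing an `Image` there would raise.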
```diff
--- datatailr-0.1.73/src/datatailr/excel/addin.py
+++ datatailr-0.1.81/src/datatailr/excel/addin.py
@@ -12,6 +12,8 @@ import sys
 import importlib
 import subprocess
 import inspect
+from urllib.parse import urlparse
+
 import numpy as np
 
 try:
@@ -45,11 +47,32 @@ def get_package_root(mod):
     return mod_path
 
 
+def matches_annotation(value, annotation):
+    if isinstance(value, np.ndarray):
+        return True
+    if annotation is bool:
+        return isinstance(value, bool) or (type(value) is int and value in (0, 1))
+    if annotation is float:
+        return isinstance(value, float) or (type(value) is int)
+    return isinstance(value, annotation)
+
+
+def extract_hostname(url: str) -> str | None:
+    url = url if url else ""
+    if "://" not in url:
+        url = "//" + url
+    return urlparse(url).hostname
+
+
 class Addin(AddinBase):
     def __init__(self, *args, **kwargs):
         super(Addin, self).__init__(*args, **kwargs)
+        f = inspect.currentframe().f_back
+        mod = inspect.getmodule(f)
+        if mod is not None:
+            setattr(mod, "__dt_addin__", self)
 
-    def run(self, port):
+    def run(self, port, ws_port, ide=True):
         # Excel addin executable will try to import an object literally called "addin"
         # from a module passed to dt-excel.sh as an argument. So to find which module
         # to pass to dt-excel.sh, we walk the callstack until a module with "addin"
@@ -67,14 +90,14 @@ class Addin(AddinBase):
         finally:
             sys.path.pop(0)
 
-            addin_obj = getattr(imported_mod, "
+            addin_obj = getattr(imported_mod, "__dt_addin__", None)
             if addin_obj is self or id(addin_obj) == id(self):
                 found_module = mod
                 break
 
         if not found_module:
             raise ValueError(
-                "'
+                "'__dt_addin__' not found."
             )
 
         if found_module.__name__ != "__main__":
@@ -91,11 +114,14 @@ class Addin(AddinBase):
         module_name = os.path.splitext(os.path.basename(filename))[0]
         dir_name = os.path.dirname(os.path.abspath(filename))
 
+        ide_flag = "-i" if ide else ""
+        hostname = extract_hostname(os.environ.get("VSCODE_PROXY_URI"))
+
         subprocess.run(
             [
                 "bash",
                 "-c",
-                f'PYTHONPATH="{dir_name}:$PYTHONPATH" /opt/datatailr/bin/dt-excel.sh -n -H
+                f'PYTHONPATH="{dir_name}:$PYTHONPATH" /opt/datatailr/bin/dt-excel.sh {ide_flag} -n -H {hostname} -p {port} -w {ws_port} {module_name}',
             ]
         )
@@ -115,17 +141,18 @@ class Addin(AddinBase):
         # be called directly from python code without requiring positional argument for _id
         _id = args[0]
 
+        bound = signature.bind_partial(**kwargs)
+        bound.apply_defaults()
         for arg in signature.parameters.values():
             if streaming and arg.name == "queue":
                 continue
 
-            if not (
-
-                or isinstance(kwargs[arg.name], np.ndarray)
+            if not matches_annotation(
+                bound.arguments[arg.name], arg.annotation
             ):
                 raise ValueError(
                     "excel/python/dt/excel.py: Got argument of wrong type, expected %s or numpy.ndarray, got %s"
-                    % (arg.annotation, type(
+                    % (arg.annotation, type(bound.arguments[arg.name]))
                 )
         queue = Queue(self.name.lower() + "." + func.__name__, _id)
         if not streaming:
```
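The new `extract_hostname` helper works around a `urlparse` quirk: without a scheme, `urlparse` treats the whole string as a path and returns no hostname, so the helper prefixes bare strings with `//`. A standard-library-only demonstration (the example URLs are hypothetical):

```python
from urllib.parse import urlparse

def extract_hostname(url: str) -> str | None:
    # Same logic as the helper in the diff: scheme-less strings are pushed
    # into netloc position with a leading "//" so urlparse finds the host.
    url = url if url else ""
    if "://" not in url:
        url = "//" + url
    return urlparse(url).hostname

print(extract_hostname("https://ide.example.com:8443/proxy/"))  # ide.example.com
print(extract_hostname("ide.example.com:8443"))                 # ide.example.com
print(extract_hostname(None))                                   # None
print(urlparse("ide.example.com:8443").hostname)                # None -- why the "//" is needed
```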
```diff
--- datatailr-0.1.73/src/datatailr/logging.py
+++ datatailr-0.1.81/src/datatailr/logging.py
@@ -33,6 +33,70 @@ def get_log_level() -> int:
     return logging.INFO
 
 
+def ansi_symbols_supported() -> bool:
+    """Check if the terminal supports ANSI symbols."""
+    if sys.platform.startswith("win"):
+        return (
+            os.getenv("ANSICON") is not None
+            or os.getenv("WT_SESSION") is not None
+            or "TERM" in os.environ
+            and os.environ["TERM"] == "xterm-256color"
+        )
+    else:
+        return sys.stdout.isatty()
+
+
+ANSI_AVAILABLE = ansi_symbols_supported()
+
+
+def color_text(text: str, color_name: str) -> str:
+    """Wrap text with ANSI color codes if supported."""
+    if not ANSI_AVAILABLE:
+        return text
+
+    colors = {
+        "red": "\033[31m",
+        "green": "\033[32m",
+        "yellow": "\033[33m",
+        "blue": "\033[34m",
+        "magenta": "\033[35m",
+        "cyan": "\033[36m",
+        "bold": "\033[1m",
+        "reset": "\033[0m",
+    }
+    color_code = colors.get(color_name.lower(), "")
+    reset_code = colors["reset"] if color_code else ""
+    return f"{color_code}{text}{reset_code}"
+
+
+def RED(text: str) -> str:
+    return color_text(text, "red")
+
+
+def GREEN(text: str) -> str:
+    return color_text(text, "green")
+
+
+def YELLOW(text: str) -> str:
+    return color_text(text, "yellow")
+
+
+def BLUE(text: str) -> str:
+    return color_text(text, "blue")
+
+
+def MAGENTA(text: str) -> str:
+    return color_text(text, "magenta")
+
+
+def CYAN(text: str) -> str:
+    return color_text(text, "cyan")
+
+
+def BOLD(text: str) -> str:
+    return color_text(text, "bold")
+
+
 class MaxLevelFilter(logging.Filter):
     """Allow only log records at or below a given level."""
 
```
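Assuming the package is importable, the new helpers compose naturally, and they degrade to plain text whenever ANSI is unavailable (non-TTY stdout, redirected logs):

```python
from datatailr.logging import BOLD, CYAN, GREEN

# Each helper wraps its argument in an ANSI escape only when ANSI_AVAILABLE
# is true, so the same call is safe on terminals and in captured log files.
print(GREEN("build succeeded"))
print(BOLD(CYAN("scheduling 'my_job' ...")))
```

The logging.py diff continues with a formatter built on the same escape codes: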
```diff
@@ -55,6 +119,26 @@ class MinLevelFilter(logging.Filter):
         return record.levelno >= self.level
 
 
+class ColoredFormatter(logging.Formatter):
+    COLORS = {
+        logging.DEBUG: "\033[34m",  # Blue
+        logging.INFO: "\033[32m",  # Green
+        logging.WARNING: "\033[33m",  # Yellow
+        logging.ERROR: "\033[31m",  # Red
+        logging.CRITICAL: "\033[41m",  # Red background
+    }
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+
+    def format(self, record):
+        color = self.COLORS.get(record.levelno, self.RESET)
+        timestamp = f"{self.BOLD}{self.formatTime(record)}{self.RESET}"
+        level = f"{color}{record.levelname}{self.RESET}"
+        message = f"{color}{record.getMessage()}{self.RESET}"
+        LOG_FORMAT = f"{timestamp} - {level} - {node_name}:{node_ip} - {user} - {job_name} - {record.name} - [Line {record.lineno}]: {message}"
+        return LOG_FORMAT
+
+
 tag = dt__Tag()
 node_name = tag.get("node_name") or "local"
 node_ip = tag.get("node_ip")
@@ -67,8 +151,6 @@ except Exception:
 
 user = getpass.getuser()
 
-LOG_FORMAT = f"%(asctime)s - %(levelname)s - {node_name}:{node_ip} - {user} - {job_name} - %(name)s - [Line %(lineno)d]: %(message)s"
-
 
 class DatatailrLogger:
     def __init__(
@@ -76,7 +158,6 @@ class DatatailrLogger:
         name: str,
         log_file: Optional[str] = None,
         log_level: int = get_log_level(),
-        log_format: str = LOG_FORMAT,
     ):
         """
         Initialize the DatatailrLogger.
@@ -88,7 +169,7 @@ class DatatailrLogger:
         self.logger = logging.getLogger(name)
         self.logger.setLevel(log_level)
 
-        formatter =
+        formatter = ColoredFormatter()
 
         # stdout handler (DEBUG/INFO only)
         stdout_handler = logging.StreamHandler(sys.stdout)
```
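Because `ColoredFormatter.format` builds the final string itself, the old %-style `LOG_FORMAT` constant and the `log_format` constructor argument became dead weight and were removed. A trimmed, self-contained sketch of the same idea (the Datatailr-specific node/user/job fields are omitted, since they come from `dt__Tag` at import time):

```python
import logging

class MiniColoredFormatter(logging.Formatter):
    """Trimmed sketch of the ColoredFormatter added above."""

    COLORS = {
        logging.DEBUG: "\033[34m",
        logging.INFO: "\033[32m",
        logging.WARNING: "\033[33m",
        logging.ERROR: "\033[31m",
        logging.CRITICAL: "\033[41m",
    }
    RESET = "\033[0m"

    def format(self, record):
        color = self.COLORS.get(record.levelno, self.RESET)
        return (
            f"{self.formatTime(record)} - {color}{record.levelname}{self.RESET}"
            f" - {record.name} - [Line {record.lineno}]: "
            f"{color}{record.getMessage()}{self.RESET}"
        )

handler = logging.StreamHandler()
handler.setFormatter(MiniColoredFormatter())
log = logging.getLogger("demo")
log.addHandler(handler)
log.warning("colored when the stream is a terminal")
```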
```diff
--- datatailr-0.1.73/src/datatailr/scheduler/__init__.py
+++ datatailr-0.1.81/src/datatailr/scheduler/__init__.py
@@ -35,8 +35,10 @@ from datatailr.scheduler.base import (
     set_allow_unsafe_scheduling,
 )
 from datatailr.scheduler.batch import Batch, BatchJob, DuplicateJobNameError
-from datatailr.scheduler.batch_decorator import batch_decorator as
+from datatailr.scheduler.batch_decorator import batch_decorator as task
 from datatailr.scheduler.schedule import Schedule
+from datatailr.scheduler.job import App, Service, ExcelAddin
+from datatailr.scheduler.workflow import workflow
 
 __all__ = [
     "Job",
@@ -46,9 +48,13 @@ __all__ = [
     "EntryPoint",
     "Batch",
     "BatchJob",
-    "
+    "task",
     "BatchJobError",
     "DuplicateJobNameError",
     "set_allow_unsafe_scheduling",
     "Schedule",
+    "App",
+    "Service",
+    "ExcelAddin",
+    "workflow",
 ]
```
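Combined with the `src/datatailr/__init__.py` hunk earlier, everything a pipeline needs is now importable from the package root. A minimal smoke test of the new surface (assuming datatailr 0.1.81 is installed; the names below are exactly the ones added to both `__all__` lists):

```python
from datatailr import App, ExcelAddin, Service, task, workflow

@task()
def hello() -> str:
    return "hello"

@workflow(name="hello_flow")
def flow():
    hello()

# flow(local_run=True) would execute the single-node graph in-process.
```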
```diff
--- datatailr-0.1.73/src/datatailr/scheduler/base.py
+++ datatailr-0.1.81/src/datatailr/scheduler/base.py
@@ -14,18 +14,19 @@ from datetime import datetime
 import importlib.util
 import json
 import os
+import re
 import tempfile
 import uuid
 from dataclasses import dataclass
 from enum import Enum
-from typing import Callable, Dict, Optional, Tuple, Union
+from typing import Callable, Dict, Optional, Tuple, Union, List
 
 from datatailr import ACL, Environment, User, is_dt_installed
 from datatailr.wrapper import dt__Job
 from datatailr.scheduler.constants import DEFAULT_TASK_MEMORY, DEFAULT_TASK_CPU
 from datatailr.build.image import Image
 from datatailr.errors import BatchJobError
-from datatailr.logging import DatatailrLogger
+from datatailr.logging import CYAN, DatatailrLogger
 from datatailr.utils import run_shell_command, dict_to_env_vars
 
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
@@ -142,10 +143,10 @@ class Job:
         environment: Optional[Environment] = Environment.DEV,
         image: Optional[Image] = None,
         run_as: Optional[Union[str, User]] = None,
-        resources: Resources = Resources(
+        resources: Resources = Resources(),
         acl: Optional[ACL] = None,
-        python_version: str = "
-        python_requirements: str = "",
+        python_version: str = "auto",
+        python_requirements: str | List[str] = "",
         build_script_pre: str = "",
         build_script_post: str = "",
         env_vars: Dict[str, str | int | float | bool] = {},
@@ -153,6 +154,12 @@ class Job:
         entrypoint: Optional[EntryPoint] = None,
         update_existing: bool = False,
     ):
+        # valid names must be lowercase, alphanumeric and underscores only
+        if not re.match(r"^[a-z0-9_]+$", name):
+            raise ValueError(
+                f"Invalid job name: {name}. Only lowercase letters, numbers, and underscores are allowed."
+            )
+
         if environment is None:
             environment = Environment.DEV
 
```
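The name guard is a plain anchored regex; a quick standalone check of what passes:

```python
import re

JOB_NAME_RE = re.compile(r"^[a-z0-9_]+$")  # pattern from the new Job.__init__ guard

for name in ["daily_load_2", "Daily Load", "my-job", "étl"]:
    print(f"{name!r}: {bool(JOB_NAME_RE.match(name))}")
# 'daily_load_2': True; the other three would raise ValueError in Job.__init__
```

The remaining base.py hunks: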
```diff
@@ -245,7 +252,6 @@ class Job:
         if self.type == JobType.EXCEL:
             if "DATATAILR_LOCAL" not in self.__env_vars:
                 self.__env_vars.update({"DATATAILR_LOCAL": "false"})
-            job_dict["per_user_job"] = True
         if self.type != JobType.BATCH:
             job_dict["entrypoint"] = str(self.entrypoint) if self.entrypoint else None
         job_dict["env"] = dict_to_env_vars(self.__env_vars)
@@ -294,6 +300,7 @@ class Job:
         Returns a tuple of (branch: str, commit_hash: str).
         """
         path_to_repo = self.image.path_to_repo or "."
+        branch_name, local_commit, return_code = "unknown", "unknown", None
         try:
             local_commit = run_shell_command(
                 f"cd {path_to_repo} && git rev-parse HEAD"
@@ -301,6 +308,13 @@ class Job:
             branch_name = run_shell_command(
                 f"cd {path_to_repo} && git rev-parse --abbrev-ref HEAD"
             )[0]
+
+            if (
+                os.getenv("DATATAILR_ALLOW_UNSAFE_SCHEDULING", "false").lower()
+                == "true"
+            ):
+                return branch_name, local_commit
+
             return_code = run_shell_command(
                 f"cd {path_to_repo} && git diff --exit-code"
             )
@@ -309,15 +323,11 @@ class Job:
             logger.warning(
                 "Git is not installed or not found in PATH. Repository validation is not possible."
             )
-            branch_name, local_commit, return_code = "unknown", "unknown", None
         else:
             raise RepoValidationError(
                 f"Error accessing git repository at {path_to_repo}: {e}"
             ) from e
 
-        if os.getenv("DATATAILR_ALLOW_UNSAFE_SCHEDULING", "false").lower() == "true":
-            return branch_name, local_commit
-
         is_committed = return_code is not None and return_code[1] == 0
 
         if not is_committed:
@@ -342,10 +352,6 @@ class Job:
             branch_name=branch_name,
             commit_hash=local_commit,
         )
-        logger.info(
-            f"Running job '{self.name}' in environment '{self.environment}' as '{self.run_as}'"
-        )
-
         with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as temp_file:
             temp_file.write(self.to_json().encode())
             return temp_file.name
@@ -375,7 +381,10 @@ class Job:
         )
         try:
             temp_file_name = self.__prepare__()
-
+            action = {"run": "Running", "save": "Saving", "start": "Starting"}.get(
+                command, "Processing"
+            )
+            print(CYAN(f"{action} '{self.name}' as {self.run_as} ..."))
             if command == "run":
                 result = __client__.run(
                     f"file://{temp_file_name}", **self.get_schedule_args()
@@ -393,6 +402,10 @@ class Job:
             logger.error(f"Error running command '{command}': {e}")
             return False, str(e)
         self.__set_existing_id__(result)
+        action = {"run": "ran", "save": "saved", "start": "started"}.get(
+            command, "processed"
+        )
+        print(CYAN(f"Job '{self.name}' {action} successfully."))
         return True, result
 
     def save(self) -> Tuple[bool, str]:
```