datatailr 0.1.73__tar.gz → 0.1.81__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datatailr-0.1.73/src/datatailr.egg-info → datatailr-0.1.81}/PKG-INFO +19 -15
- {datatailr-0.1.73 → datatailr-0.1.81}/README.md +18 -14
- {datatailr-0.1.73 → datatailr-0.1.81}/pyproject.toml +2 -1
- {datatailr-0.1.73 → datatailr-0.1.81}/setup.py +3 -1
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/__init__.py +14 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/build/image.py +6 -4
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/excel/addin.py +35 -8
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/logging.py +85 -4
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/__init__.py +8 -2
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/base.py +28 -15
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/batch.py +32 -6
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/batch_decorator.py +12 -3
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/constants.py +2 -2
- datatailr-0.1.81/src/datatailr/scheduler/job.py +112 -0
- datatailr-0.1.81/src/datatailr/scheduler/workflow.py +84 -0
- {datatailr-0.1.73 → datatailr-0.1.81/src/datatailr.egg-info}/PKG-INFO +19 -15
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr.egg-info/SOURCES.txt +6 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr.egg-info/entry_points.txt +1 -0
- datatailr-0.1.81/src/datatailr.egg-info/top_level.txt +2 -0
- datatailr-0.1.81/src/datatailr_demo/README.md +112 -0
- datatailr-0.1.81/src/datatailr_demo/__init__.py +15 -0
- datatailr-0.1.81/src/datatailr_demo/examples.py +47 -0
- datatailr-0.1.81/src/sbin/datatailr_cli.py +195 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/sbin/datatailr_run.py +147 -35
- {datatailr-0.1.73 → datatailr-0.1.81}/src/sbin/datatailr_run_excel.py +2 -2
- datatailr-0.1.73/src/datatailr.egg-info/top_level.txt +0 -1
- {datatailr-0.1.73 → datatailr-0.1.81}/LICENSE +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/setup.cfg +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/acl.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/blob.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/build/__init__.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/dt_json.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/errors.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/excel/__init__.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/excel/stubs.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/group.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/arguments_cache.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/schedule.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/scheduler/utils.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/tag.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/user.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/utils.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/version.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr/wrapper.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr.egg-info/dependency_links.txt +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/datatailr.egg-info/requires.txt +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/sbin/datatailr_run_app.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/sbin/datatailr_run_batch.py +0 -0
- {datatailr-0.1.73 → datatailr-0.1.81}/src/sbin/datatailr_run_service.py +0 -0
The hunks below are reconstructed as unified diffs. Pre-change lines that the registry viewer truncated are reproduced as-is (they break off mid-token).

````diff
--- datatailr-0.1.73/src/datatailr.egg-info/PKG-INFO
+++ datatailr-0.1.81/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datatailr
-Version: 0.1.73
+Version: 0.1.81
 Summary: Ready-to-Use Platform That Drives Business Insights
 Author-email: Datatailr <info@datatailr.com>
 License-Expression: MIT
@@ -84,25 +84,27 @@ print(datatailr.__provider__)
 The following example shows how to create a simple data pipeline using the Datatailr Python package.
 
 ```python
-from datatailr
+from datatailr import workflow, task
 
-@
+@task()
 def func_no_args() -> str:
     return "no_args"
 
 
-@
+@task()
 def func_with_args(a: int, b: float) -> str:
     return f"args: {a}, {b}"
 
-
+@workflow(name="MY test DAG")
+def my_workflow():
     for n in range(2):
         res1 = func_no_args().alias(f"func_{n}")
         res2 = func_with_args(1, res1).alias(f"func_with_args_{n}")
+my_workflow(local_run=True)
 ```
 
 Running this code will create a graph of jobs and execute it.
-Each node on the graph represents a job, which in turn is a call to a function decorated with `@
+Each node on the graph represents a job, which in turn is a call to a function decorated with `@task()`.
 
 Since this is a local run then the execution of each node will happen sequentially in the same process.
 
@@ -117,14 +119,14 @@ You will first need to separate your function definitions from the DAG definitio
 ```python
 # my_module.py
 
-from datatailr
+from datatailr import task
 
-@
+@task()
 def func_no_args() -> str:
     return "no_args"
 
 
-@
+@task()
 def func_with_args(a: int, b: float) -> str:
     return f"args: {a}, {b}"
 ```
@@ -133,18 +135,20 @@ To use these functions in a batch job, you just need to import them and run in a
 
 ```python
 from my_module import func_no_args, func_with_args
-from datatailr
+from datatailr import workflow
 
-
-
-with Batch(name="MY test DAG", schedule=schedule) as dag:
+@workflow(name="MY test DAG")
+def my_workflow():
     for n in range(2):
         res1 = func_no_args().alias(f"func_{n}")
         res2 = func_with_args(1, res1).alias(f"func_with_args_{n}")
+
+schedule = Schedule(at_hours=0)
+my_workflow(schedule=schedule)
 ```
 
-This will submit the entire
-The
+This will submit the entire workflow for execution, and the scheduler will take care of running the jobs in parallel and managing the resources.
+The workflow in the example above will be scheduled to run daily at 00:00.
 
 ___
 Visit [our website](https://www.datatailr.com/) for more!
````
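Assembled from the added lines above, the post-change pipeline example reads as one piece:

```python
from datatailr import workflow, task

@task()
def func_no_args() -> str:
    return "no_args"

@task()
def func_with_args(a: int, b: float) -> str:
    return f"args: {a}, {b}"

@workflow(name="MY test DAG")
def my_workflow():
    for n in range(2):
        res1 = func_no_args().alias(f"func_{n}")
        res2 = func_with_args(1, res1).alias(f"func_with_args_{n}")

# local_run=True executes each node sequentially in the current process,
# per the README text above.
my_workflow(local_run=True)
```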
README.md receives the same three hunks (@@ -47,25 +47,27 @@, @@ -80,14 +82,14 @@, and @@ -96,18 +98,20 @@): the PKG-INFO shown above embeds the README verbatim, so the content changes are identical.
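One wrinkle in the new scheduled example: it calls `Schedule(at_hours=0)` without importing `Schedule`. Judging by the `scheduler/__init__.py` hunk further down, which keeps `"Schedule"` exported, a self-contained version would presumably look like this (the extra import is an assumption, not part of the diff):

```python
from my_module import func_no_args, func_with_args
from datatailr import workflow
from datatailr.scheduler import Schedule  # assumed import; Schedule stays exported below

@workflow(name="MY test DAG")
def my_workflow():
    for n in range(2):
        res1 = func_no_args().alias(f"func_{n}")
        res2 = func_with_args(1, res1).alias(f"func_with_args_{n}")

schedule = Schedule(at_hours=0)  # daily at 00:00, per the README text
my_workflow(schedule=schedule)
```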
```diff
--- datatailr-0.1.73/pyproject.toml
+++ datatailr-0.1.81/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "datatailr"
-version = "0.1.73"
+version = "0.1.81"
 description = "Ready-to-Use Platform That Drives Business Insights"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -39,6 +39,7 @@ datatailr_run_batch = "datatailr.sbin.datatailr_run_batch:run"
 datatailr_run_app = "datatailr.sbin.datatailr_run_app:run"
 datatailr_run_excel = "datatailr.sbin.datatailr_run_excel:run"
 datatailr_run_service = "datatailr.sbin.datatailr_run_service:run"
+datatailr = "datatailr.sbin.datatailr_cli:main"
 
 [project.optional-dependencies]
 dev = [
```
```diff
--- datatailr-0.1.73/setup.py
+++ datatailr-0.1.81/setup.py
@@ -10,12 +10,14 @@ setup(
         (
             "/datatailr/sbin",
             [
+                "src/sbin/datatailr_cli.py",
                 "src/sbin/datatailr_run.py",
                 "src/sbin/datatailr_run_batch.py",
                 "src/sbin/datatailr_run_app.py",
                 "src/sbin/datatailr_run_excel.py",
                 "src/sbin/datatailr_run_service.py",
             ],
-        )
+        ),
+        ("datatailr_demo", ["src/datatailr_demo/README.md"]),
     ],
 )
```
```diff
--- datatailr-0.1.73/src/datatailr/__init__.py
+++ datatailr-0.1.81/src/datatailr/__init__.py
@@ -16,6 +16,14 @@ from datatailr.blob import Blob
 from datatailr.build import Image
 from datatailr.utils import Environment, is_dt_installed
 from datatailr.version import __version__
+from datatailr.scheduler import (
+    App,
+    Service,
+    ExcelAddin,
+    workflow,
+    task,
+    set_allow_unsafe_scheduling,
+)
 
 system = dt__System()
 if isinstance(system, mock_cli_tool):
@@ -33,4 +41,10 @@ __all__ = [
     "__version__",
     "__provider__",
     "is_dt_installed",
+    "App",
+    "Service",
+    "ExcelAddin",
+    "workflow",
+    "task",
+    "set_allow_unsafe_scheduling",
 ]
```
```diff
--- datatailr-0.1.73/src/datatailr/build/image.py
+++ datatailr-0.1.81/src/datatailr/build/image.py
@@ -10,7 +10,7 @@
 
 import json
 import os
-import
+import sys
 from typing import Optional
 
 from datatailr import ACL, User
@@ -26,7 +26,7 @@ class Image:
     def __init__(
         self,
         acl: Optional[ACL] = None,
-        python_version: str = "
+        python_version: str = "auto",
         python_requirements: str | list[str] = "",
         build_script_pre: str = "",
         build_script_post: str = "",
@@ -56,8 +56,10 @@ class Image:
     def python_version(self, value: str):
         if not isinstance(value, str):
             raise TypeError("python_version must be a string.")
-        if
-
+        if value.lower() == "auto":
+            value = f"{sys.version_info.major}.{sys.version_info.minor}"
+        if value not in ["3.10", "3.11", "3.12", "3.13", "3.14"]:
+            raise ValueError(f"Invalid python_version: {value}")
         self._python_version = value
 
     @property
```
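`Image.python_version` now defaults to `"auto"`, which resolves to the interpreter doing the scheduling. A standalone sketch of the setter's new logic (same checks, lifted out of the class):

```python
import sys

SUPPORTED_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.14"]

def resolve_python_version(value: str = "auto") -> str:
    # Mirror of the Image.python_version setter logic in the hunk above.
    if not isinstance(value, str):
        raise TypeError("python_version must be a string.")
    if value.lower() == "auto":
        # "auto" resolves to the interpreter running this code, e.g. "3.12".
        value = f"{sys.version_info.major}.{sys.version_info.minor}"
    if value not in SUPPORTED_VERSIONS:
        raise ValueError(f"Invalid python_version: {value}")
    return value

print(resolve_python_version("3.11"))  # -> 3.11
print(resolve_python_version())        # -> version of the current interpreter
```

One observable consequence: pyproject.toml still declares `requires-python = ">=3.9"`, but `"auto"` on a 3.9 interpreter resolves to `"3.9"`, which is not in the allow-list, so constructing an `Image` there would raise.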
```diff
--- datatailr-0.1.73/src/datatailr/excel/addin.py
+++ datatailr-0.1.81/src/datatailr/excel/addin.py
@@ -12,6 +12,8 @@ import sys
 import importlib
 import subprocess
 import inspect
+from urllib.parse import urlparse
+
 import numpy as np
 
 try:
@@ -45,11 +47,32 @@ def get_package_root(mod):
     return mod_path
 
 
+def matches_annotation(value, annotation):
+    if isinstance(value, np.ndarray):
+        return True
+    if annotation is bool:
+        return isinstance(value, bool) or (type(value) is int and value in (0, 1))
+    if annotation is float:
+        return isinstance(value, float) or (type(value) is int)
+    return isinstance(value, annotation)
+
+
+def extract_hostname(url: str) -> str | None:
+    url = url if url else ""
+    if "://" not in url:
+        url = "//" + url
+    return urlparse(url).hostname
+
+
 class Addin(AddinBase):
     def __init__(self, *args, **kwargs):
         super(Addin, self).__init__(*args, **kwargs)
+        f = inspect.currentframe().f_back
+        mod = inspect.getmodule(f)
+        if mod is not None:
+            setattr(mod, "__dt_addin__", self)
 
-    def run(self, port):
+    def run(self, port, ws_port, ide=True):
         # Excel addin executable will try to import an object literally called "addin"
         # from a module passed to dt-excel.sh as an argument. So to find which module
         # to pass to dt-excel.sh, we walk the callstack until a module with "addin"
@@ -67,14 +90,14 @@ class Addin(AddinBase):
         finally:
             sys.path.pop(0)
 
-            addin_obj = getattr(imported_mod, "
+            addin_obj = getattr(imported_mod, "__dt_addin__", None)
             if addin_obj is self or id(addin_obj) == id(self):
                 found_module = mod
                 break
 
         if not found_module:
             raise ValueError(
-                "'
+                "'__dt_addin__' not found."
             )
 
         if found_module.__name__ != "__main__":
@@ -91,11 +114,14 @@ class Addin(AddinBase):
         module_name = os.path.splitext(os.path.basename(filename))[0]
         dir_name = os.path.dirname(os.path.abspath(filename))
 
+        ide_flag = "-i" if ide else ""
+        hostname = extract_hostname(os.environ.get("VSCODE_PROXY_URI"))
+
         subprocess.run(
             [
                 "bash",
                 "-c",
-                f'PYTHONPATH="{dir_name}:$PYTHONPATH" /opt/datatailr/bin/dt-excel.sh -n -H
+                f'PYTHONPATH="{dir_name}:$PYTHONPATH" /opt/datatailr/bin/dt-excel.sh {ide_flag} -n -H {hostname} -p {port} -w {ws_port} {module_name}',
             ]
         )
@@ -115,17 +141,18 @@ class Addin(AddinBase):
         # be called directly from python code without requiring positional argument for _id
         _id = args[0]
 
+        bound = signature.bind_partial(**kwargs)
+        bound.apply_defaults()
         for arg in signature.parameters.values():
             if streaming and arg.name == "queue":
                 continue
 
-            if not (
-
-                or isinstance(kwargs[arg.name], np.ndarray)
+            if not matches_annotation(
+                bound.arguments[arg.name], arg.annotation
             ):
                 raise ValueError(
                     "excel/python/dt/excel.py: Got argument of wrong type, expected %s or numpy.ndarray, got %s"
-                    % (arg.annotation, type(
+                    % (arg.annotation, type(bound.arguments[arg.name]))
                 )
         queue = Queue(self.name.lower() + "." + func.__name__, _id)
         if not streaming:
```
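The new `extract_hostname` helper works around a `urlparse` quirk: without a scheme, `urlparse` treats the whole string as a path and returns no hostname, so the helper prefixes bare strings with `//`. A standard-library-only demonstration (the example URLs are hypothetical):

```python
from urllib.parse import urlparse

def extract_hostname(url: str) -> str | None:
    # Same logic as the helper in the diff: scheme-less strings are pushed
    # into netloc position with a leading "//" so urlparse finds the host.
    url = url if url else ""
    if "://" not in url:
        url = "//" + url
    return urlparse(url).hostname

print(extract_hostname("https://ide.example.com:8443/proxy/"))  # ide.example.com
print(extract_hostname("ide.example.com:8443"))                 # ide.example.com
print(extract_hostname(None))                                   # None
print(urlparse("ide.example.com:8443").hostname)                # None -- why the "//" is needed
```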
```diff
--- datatailr-0.1.73/src/datatailr/logging.py
+++ datatailr-0.1.81/src/datatailr/logging.py
@@ -33,6 +33,70 @@ def get_log_level() -> int:
     return logging.INFO
 
 
+def ansi_symbols_supported() -> bool:
+    """Check if the terminal supports ANSI symbols."""
+    if sys.platform.startswith("win"):
+        return (
+            os.getenv("ANSICON") is not None
+            or os.getenv("WT_SESSION") is not None
+            or "TERM" in os.environ
+            and os.environ["TERM"] == "xterm-256color"
+        )
+    else:
+        return sys.stdout.isatty()
+
+
+ANSI_AVAILABLE = ansi_symbols_supported()
+
+
+def color_text(text: str, color_name: str) -> str:
+    """Wrap text with ANSI color codes if supported."""
+    if not ANSI_AVAILABLE:
+        return text
+
+    colors = {
+        "red": "\033[31m",
+        "green": "\033[32m",
+        "yellow": "\033[33m",
+        "blue": "\033[34m",
+        "magenta": "\033[35m",
+        "cyan": "\033[36m",
+        "bold": "\033[1m",
+        "reset": "\033[0m",
+    }
+    color_code = colors.get(color_name.lower(), "")
+    reset_code = colors["reset"] if color_code else ""
+    return f"{color_code}{text}{reset_code}"
+
+
+def RED(text: str) -> str:
+    return color_text(text, "red")
+
+
+def GREEN(text: str) -> str:
+    return color_text(text, "green")
+
+
+def YELLOW(text: str) -> str:
+    return color_text(text, "yellow")
+
+
+def BLUE(text: str) -> str:
+    return color_text(text, "blue")
+
+
+def MAGENTA(text: str) -> str:
+    return color_text(text, "magenta")
+
+
+def CYAN(text: str) -> str:
+    return color_text(text, "cyan")
+
+
+def BOLD(text: str) -> str:
+    return color_text(text, "bold")
+
+
 class MaxLevelFilter(logging.Filter):
     """Allow only log records at or below a given level."""
 
```
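Assuming the package is importable, the new helpers compose naturally, and they degrade to plain text whenever ANSI is unavailable (non-TTY stdout, redirected logs):

```python
from datatailr.logging import BOLD, CYAN, GREEN

# Each helper wraps its argument in an ANSI escape only when ANSI_AVAILABLE
# is true, so the same call is safe on terminals and in captured log files.
print(GREEN("build succeeded"))
print(BOLD(CYAN("scheduling 'my_job' ...")))
```

The logging.py diff continues with a formatter built on the same escape codes: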
```diff
@@ -55,6 +119,26 @@ class MinLevelFilter(logging.Filter):
         return record.levelno >= self.level
 
 
+class ColoredFormatter(logging.Formatter):
+    COLORS = {
+        logging.DEBUG: "\033[34m",  # Blue
+        logging.INFO: "\033[32m",  # Green
+        logging.WARNING: "\033[33m",  # Yellow
+        logging.ERROR: "\033[31m",  # Red
+        logging.CRITICAL: "\033[41m",  # Red background
+    }
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+
+    def format(self, record):
+        color = self.COLORS.get(record.levelno, self.RESET)
+        timestamp = f"{self.BOLD}{self.formatTime(record)}{self.RESET}"
+        level = f"{color}{record.levelname}{self.RESET}"
+        message = f"{color}{record.getMessage()}{self.RESET}"
+        LOG_FORMAT = f"{timestamp} - {level} - {node_name}:{node_ip} - {user} - {job_name} - {record.name} - [Line {record.lineno}]: {message}"
+        return LOG_FORMAT
+
+
 tag = dt__Tag()
 node_name = tag.get("node_name") or "local"
 node_ip = tag.get("node_ip")
@@ -67,8 +151,6 @@ except Exception:
 
 user = getpass.getuser()
 
-LOG_FORMAT = f"%(asctime)s - %(levelname)s - {node_name}:{node_ip} - {user} - {job_name} - %(name)s - [Line %(lineno)d]: %(message)s"
-
 
 class DatatailrLogger:
     def __init__(
@@ -76,7 +158,6 @@ class DatatailrLogger:
         name: str,
         log_file: Optional[str] = None,
         log_level: int = get_log_level(),
-        log_format: str = LOG_FORMAT,
     ):
         """
         Initialize the DatatailrLogger.
@@ -88,7 +169,7 @@ class DatatailrLogger:
         self.logger = logging.getLogger(name)
         self.logger.setLevel(log_level)
 
-        formatter =
+        formatter = ColoredFormatter()
 
         # stdout handler (DEBUG/INFO only)
         stdout_handler = logging.StreamHandler(sys.stdout)
```
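Because `ColoredFormatter.format` builds the final string itself, the old %-style `LOG_FORMAT` constant and the `log_format` constructor argument became dead weight and were removed. A trimmed, self-contained sketch of the same idea (the Datatailr-specific node/user/job fields are omitted, since they come from `dt__Tag` at import time):

```python
import logging

class MiniColoredFormatter(logging.Formatter):
    """Trimmed sketch of the ColoredFormatter added above."""

    COLORS = {
        logging.DEBUG: "\033[34m",
        logging.INFO: "\033[32m",
        logging.WARNING: "\033[33m",
        logging.ERROR: "\033[31m",
        logging.CRITICAL: "\033[41m",
    }
    RESET = "\033[0m"

    def format(self, record):
        color = self.COLORS.get(record.levelno, self.RESET)
        return (
            f"{self.formatTime(record)} - {color}{record.levelname}{self.RESET}"
            f" - {record.name} - [Line {record.lineno}]: "
            f"{color}{record.getMessage()}{self.RESET}"
        )

handler = logging.StreamHandler()
handler.setFormatter(MiniColoredFormatter())
log = logging.getLogger("demo")
log.addHandler(handler)
log.warning("colored when the stream is a terminal")
```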
```diff
--- datatailr-0.1.73/src/datatailr/scheduler/__init__.py
+++ datatailr-0.1.81/src/datatailr/scheduler/__init__.py
@@ -35,8 +35,10 @@ from datatailr.scheduler.base import (
     set_allow_unsafe_scheduling,
 )
 from datatailr.scheduler.batch import Batch, BatchJob, DuplicateJobNameError
-from datatailr.scheduler.batch_decorator import batch_decorator as
+from datatailr.scheduler.batch_decorator import batch_decorator as task
 from datatailr.scheduler.schedule import Schedule
+from datatailr.scheduler.job import App, Service, ExcelAddin
+from datatailr.scheduler.workflow import workflow
 
 __all__ = [
     "Job",
@@ -46,9 +48,13 @@ __all__ = [
     "EntryPoint",
     "Batch",
     "BatchJob",
-    "
+    "task",
     "BatchJobError",
     "DuplicateJobNameError",
     "set_allow_unsafe_scheduling",
     "Schedule",
+    "App",
+    "Service",
+    "ExcelAddin",
+    "workflow",
 ]
```
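Combined with the `src/datatailr/__init__.py` hunk earlier, everything a pipeline needs is now importable from the package root. A minimal smoke test of the new surface (assuming datatailr 0.1.81 is installed; the names below are exactly the ones added to both `__all__` lists):

```python
from datatailr import App, ExcelAddin, Service, task, workflow

@task()
def hello() -> str:
    return "hello"

@workflow(name="hello_flow")
def flow():
    hello()

# flow(local_run=True) would execute the single-node graph in-process.
```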
```diff
--- datatailr-0.1.73/src/datatailr/scheduler/base.py
+++ datatailr-0.1.81/src/datatailr/scheduler/base.py
@@ -14,18 +14,19 @@ from datetime import datetime
 import importlib.util
 import json
 import os
+import re
 import tempfile
 import uuid
 from dataclasses import dataclass
 from enum import Enum
-from typing import Callable, Dict, Optional, Tuple, Union
+from typing import Callable, Dict, Optional, Tuple, Union, List
 
 from datatailr import ACL, Environment, User, is_dt_installed
 from datatailr.wrapper import dt__Job
 from datatailr.scheduler.constants import DEFAULT_TASK_MEMORY, DEFAULT_TASK_CPU
 from datatailr.build.image import Image
 from datatailr.errors import BatchJobError
-from datatailr.logging import DatatailrLogger
+from datatailr.logging import CYAN, DatatailrLogger
 from datatailr.utils import run_shell_command, dict_to_env_vars
 
 logger = DatatailrLogger(os.path.abspath(__file__)).get_logger()
@@ -142,10 +143,10 @@ class Job:
         environment: Optional[Environment] = Environment.DEV,
         image: Optional[Image] = None,
         run_as: Optional[Union[str, User]] = None,
-        resources: Resources = Resources(
+        resources: Resources = Resources(),
         acl: Optional[ACL] = None,
-        python_version: str = "
-        python_requirements: str = "",
+        python_version: str = "auto",
+        python_requirements: str | List[str] = "",
         build_script_pre: str = "",
         build_script_post: str = "",
         env_vars: Dict[str, str | int | float | bool] = {},
@@ -153,6 +154,12 @@ class Job:
         entrypoint: Optional[EntryPoint] = None,
         update_existing: bool = False,
     ):
+        # valid names must be lowercase, alphanumeric and underscores only
+        if not re.match(r"^[a-z0-9_]+$", name):
+            raise ValueError(
+                f"Invalid job name: {name}. Only lowercase letters, numbers, and underscores are allowed."
+            )
+
         if environment is None:
             environment = Environment.DEV
 
```
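The name guard is a plain anchored regex; a quick standalone check of what passes:

```python
import re

JOB_NAME_RE = re.compile(r"^[a-z0-9_]+$")  # pattern from the new Job.__init__ guard

for name in ["daily_load_2", "Daily Load", "my-job", "étl"]:
    print(f"{name!r}: {bool(JOB_NAME_RE.match(name))}")
# 'daily_load_2': True; the other three would raise ValueError in Job.__init__
```

The remaining base.py hunks: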
```diff
@@ -245,7 +252,6 @@ class Job:
         if self.type == JobType.EXCEL:
             if "DATATAILR_LOCAL" not in self.__env_vars:
                 self.__env_vars.update({"DATATAILR_LOCAL": "false"})
-            job_dict["per_user_job"] = True
         if self.type != JobType.BATCH:
             job_dict["entrypoint"] = str(self.entrypoint) if self.entrypoint else None
         job_dict["env"] = dict_to_env_vars(self.__env_vars)
@@ -294,6 +300,7 @@ class Job:
         Returns a tuple of (branch: str, commit_hash: str).
         """
         path_to_repo = self.image.path_to_repo or "."
+        branch_name, local_commit, return_code = "unknown", "unknown", None
         try:
             local_commit = run_shell_command(
                 f"cd {path_to_repo} && git rev-parse HEAD"
@@ -301,6 +308,13 @@ class Job:
             branch_name = run_shell_command(
                 f"cd {path_to_repo} && git rev-parse --abbrev-ref HEAD"
             )[0]
+
+            if (
+                os.getenv("DATATAILR_ALLOW_UNSAFE_SCHEDULING", "false").lower()
+                == "true"
+            ):
+                return branch_name, local_commit
+
             return_code = run_shell_command(
                 f"cd {path_to_repo} && git diff --exit-code"
             )
@@ -309,15 +323,11 @@ class Job:
             logger.warning(
                 "Git is not installed or not found in PATH. Repository validation is not possible."
             )
-            branch_name, local_commit, return_code = "unknown", "unknown", None
         else:
             raise RepoValidationError(
                 f"Error accessing git repository at {path_to_repo}: {e}"
             ) from e
 
-        if os.getenv("DATATAILR_ALLOW_UNSAFE_SCHEDULING", "false").lower() == "true":
-            return branch_name, local_commit
-
         is_committed = return_code is not None and return_code[1] == 0
 
         if not is_committed:
@@ -342,10 +352,6 @@ class Job:
             branch_name=branch_name,
             commit_hash=local_commit,
         )
-        logger.info(
-            f"Running job '{self.name}' in environment '{self.environment}' as '{self.run_as}'"
-        )
-
         with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as temp_file:
             temp_file.write(self.to_json().encode())
             return temp_file.name
@@ -375,7 +381,10 @@ class Job:
         )
         try:
             temp_file_name = self.__prepare__()
-
+            action = {"run": "Running", "save": "Saving", "start": "Starting"}.get(
+                command, "Processing"
+            )
+            print(CYAN(f"{action} '{self.name}' as {self.run_as} ..."))
             if command == "run":
                 result = __client__.run(
                     f"file://{temp_file_name}", **self.get_schedule_args()
@@ -393,6 +402,10 @@ class Job:
             logger.error(f"Error running command '{command}': {e}")
             return False, str(e)
         self.__set_existing_id__(result)
+        action = {"run": "ran", "save": "saved", "start": "started"}.get(
+            command, "processed"
+        )
+        print(CYAN(f"Job '{self.name}' {action} successfully."))
         return True, result
 
     def save(self) -> Tuple[bool, str]:
```