datablade 0.0.0__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datablade/__init__.py +41 -1
- datablade/blade.py +153 -0
- datablade/core/__init__.py +28 -7
- datablade/core/frames.py +23 -236
- datablade/core/json.py +5 -10
- datablade/core/lists.py +5 -10
- datablade/core/messages.py +23 -11
- datablade/core/strings.py +5 -43
- datablade/core/zip.py +5 -24
- datablade/dataframes/__init__.py +43 -0
- datablade/dataframes/frames.py +485 -0
- datablade/dataframes/readers.py +540 -0
- datablade/io/__init__.py +15 -0
- datablade/io/json.py +33 -0
- datablade/io/zip.py +73 -0
- datablade/sql/__init__.py +32 -0
- datablade/sql/bulk_load.py +405 -0
- datablade/sql/ddl.py +227 -0
- datablade/sql/ddl_pyarrow.py +287 -0
- datablade/sql/dialects.py +10 -0
- datablade/sql/quoting.py +42 -0
- datablade/utils/__init__.py +37 -0
- datablade/utils/lists.py +29 -0
- datablade/utils/logging.py +159 -0
- datablade/utils/messages.py +29 -0
- datablade/utils/strings.py +86 -0
- datablade-0.0.5.dist-info/METADATA +351 -0
- datablade-0.0.5.dist-info/RECORD +31 -0
- {datablade-0.0.0.dist-info → datablade-0.0.5.dist-info}/WHEEL +1 -1
- {datablade-0.0.0.dist-info → datablade-0.0.5.dist-info/licenses}/LICENSE +20 -20
- datablade-0.0.0.dist-info/METADATA +0 -13
- datablade-0.0.0.dist-info/RECORD +0 -13
- {datablade-0.0.0.dist-info → datablade-0.0.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
|
|
6
|
+
from ..utils.messages import print_verbose
|
|
7
|
+
from .ddl import _qualify_name
|
|
8
|
+
from .dialects import Dialect
|
|
9
|
+
from .quoting import quote_identifier
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger("datablade")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _require_pyarrow():
|
|
15
|
+
try:
|
|
16
|
+
import pyarrow as pa # type: ignore
|
|
17
|
+
import pyarrow.parquet as pq # type: ignore
|
|
18
|
+
except ImportError as exc: # pragma: no cover
|
|
19
|
+
raise ImportError(
|
|
20
|
+
"Parquet DDL generation requires 'pyarrow'. Install with: pip install pyarrow"
|
|
21
|
+
) from exc
|
|
22
|
+
|
|
23
|
+
return pa, pq
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _sql_type_from_arrow(data_type, dialect: Dialect) -> Optional[str]:  # noqa: C901
    """Map a pyarrow.DataType to a SQL type string.

    Args:
        data_type: A pyarrow.DataType (pyarrow is imported lazily).
        dialect: Target SQL dialect controlling which type names are emitted.

    Returns None when there is no clean mapping and the caller should drop the column.

    Raises:
        NotImplementedError: If the dialect is not one of the supported values.
        ImportError: If pyarrow is not installed.
    """

    pa, _ = _require_pyarrow()

    # Dictionary-encoded columns behave like their value type for DDL purposes.
    if pa.types.is_dictionary(data_type):
        return _sql_type_from_arrow(data_type.value_type, dialect)

    # Nested/complex types: no clean general mapping across dialects.
    if (
        pa.types.is_struct(data_type)
        or pa.types.is_list(data_type)
        or pa.types.is_large_list(data_type)
        or pa.types.is_fixed_size_list(data_type)
        or pa.types.is_map(data_type)
        or pa.types.is_union(data_type)
    ):
        return None

    if dialect == Dialect.SQLSERVER:
        if pa.types.is_boolean(data_type):
            return "bit"
        if pa.types.is_int8(data_type) or pa.types.is_int16(data_type):
            return "smallint"
        if pa.types.is_int32(data_type):
            return "int"
        if pa.types.is_int64(data_type):
            return "bigint"
        # SQL Server has no unsigned integers: widen to the next signed type.
        if pa.types.is_uint8(data_type) or pa.types.is_uint16(data_type):
            return "int"
        if pa.types.is_uint32(data_type):
            return "bigint"
        if pa.types.is_uint64(data_type):
            return "decimal(20, 0)"
        if pa.types.is_float16(data_type) or pa.types.is_float32(data_type):
            return "real"
        if pa.types.is_float64(data_type):
            return "float"
        if pa.types.is_decimal(data_type):
            # SQL Server caps decimal precision at 38.
            precision = min(int(data_type.precision), 38)
            scale = int(data_type.scale)
            return f"decimal({precision}, {scale})"
        if pa.types.is_date(data_type):
            return "date"
        if pa.types.is_time(data_type):
            return "time"
        if pa.types.is_timestamp(data_type):
            # SQL Server has datetimeoffset for tz-aware values.
            return "datetimeoffset" if data_type.tz is not None else "datetime2"
        if pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type):
            return "varbinary(max)"
        if pa.types.is_fixed_size_binary(data_type):
            # varbinary(n) only supports n <= 8000; fall back to max beyond that.
            return (
                f"varbinary({int(data_type.byte_width)})"
                if int(data_type.byte_width) <= 8000
                else "varbinary(max)"
            )
        if pa.types.is_string(data_type) or pa.types.is_large_string(data_type):
            return "nvarchar(max)"

        # Anything else (including null) is not reliably representable.
        return None

    if dialect == Dialect.POSTGRES:
        if pa.types.is_boolean(data_type):
            return "boolean"
        if pa.types.is_int8(data_type) or pa.types.is_int16(data_type):
            return "smallint"
        if pa.types.is_int32(data_type):
            return "integer"
        if pa.types.is_int64(data_type):
            return "bigint"
        if pa.types.is_unsigned_integer(data_type):
            # Postgres has no unsigned ints; use a wider signed or numeric.
            if pa.types.is_uint8(data_type) or pa.types.is_uint16(data_type):
                return "integer"
            if pa.types.is_uint32(data_type):
                return "bigint"
            if pa.types.is_uint64(data_type):
                return "numeric(20, 0)"
        if pa.types.is_float16(data_type) or pa.types.is_float32(data_type):
            return "real"
        if pa.types.is_float64(data_type):
            return "double precision"
        if pa.types.is_decimal(data_type):
            precision = int(data_type.precision)
            scale = int(data_type.scale)
            return f"numeric({precision}, {scale})"
        if pa.types.is_date(data_type):
            return "date"
        if pa.types.is_time(data_type):
            return "time"
        if pa.types.is_timestamp(data_type):
            return "timestamptz" if data_type.tz is not None else "timestamp"
        if pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type):
            return "bytea"
        if pa.types.is_fixed_size_binary(data_type):
            return "bytea"
        if pa.types.is_string(data_type) or pa.types.is_large_string(data_type):
            return "text"

        return None

    if dialect == Dialect.MYSQL:
        if pa.types.is_boolean(data_type):
            return "TINYINT(1)"
        if pa.types.is_int8(data_type) or pa.types.is_int16(data_type):
            return "SMALLINT"
        if pa.types.is_int32(data_type):
            return "INT"
        if pa.types.is_int64(data_type):
            return "BIGINT"
        if pa.types.is_unsigned_integer(data_type):
            # MySQL supports UNSIGNED, but we keep mappings consistent with the existing
            # pandas-based DDL generator (signed types).
            if pa.types.is_uint8(data_type) or pa.types.is_uint16(data_type):
                return "INT"
            if pa.types.is_uint32(data_type):
                return "BIGINT"
            if pa.types.is_uint64(data_type):
                return "DECIMAL(20, 0)"
        if pa.types.is_float16(data_type) or pa.types.is_float32(data_type):
            return "FLOAT"
        if pa.types.is_float64(data_type):
            return "DOUBLE"
        if pa.types.is_decimal(data_type):
            precision = int(data_type.precision)
            scale = int(data_type.scale)
            return f"DECIMAL({precision}, {scale})"
        if pa.types.is_date(data_type):
            return "DATE"
        if pa.types.is_time(data_type):
            return "TIME"
        if pa.types.is_timestamp(data_type):
            return "DATETIME"
        if pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type):
            return "LONGBLOB"
        if pa.types.is_fixed_size_binary(data_type):
            width = int(data_type.byte_width)
            return f"VARBINARY({width})" if width <= 65535 else "LONGBLOB"
        if pa.types.is_string(data_type) or pa.types.is_large_string(data_type):
            return "TEXT"

        return None

    if dialect == Dialect.DUCKDB:
        # DuckDB: coarser mapping — all signed ints to BIGINT, all floats to DOUBLE.
        if pa.types.is_boolean(data_type):
            return "BOOLEAN"
        if pa.types.is_signed_integer(data_type):
            return "BIGINT"
        if pa.types.is_unsigned_integer(data_type):
            return "UBIGINT"
        if pa.types.is_floating(data_type):
            return "DOUBLE"
        if pa.types.is_decimal(data_type):
            precision = int(data_type.precision)
            scale = int(data_type.scale)
            return f"DECIMAL({precision}, {scale})"
        if pa.types.is_date(data_type):
            return "DATE"
        if pa.types.is_time(data_type):
            return "TIME"
        if pa.types.is_timestamp(data_type):
            return "TIMESTAMPTZ" if data_type.tz is not None else "TIMESTAMP"
        if pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type):
            return "BLOB"
        if pa.types.is_fixed_size_binary(data_type):
            return "BLOB"
        if pa.types.is_string(data_type) or pa.types.is_large_string(data_type):
            return "VARCHAR"

        return None

    raise NotImplementedError(f"Dialect not supported: {dialect}")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def generate_create_table_from_parquet(
    parquet_path: str,
    catalog: Optional[str] = None,
    schema: Optional[str] = None,
    table: str = "table",
    drop_existing: bool = True,
    dialect: Dialect = Dialect.SQLSERVER,
    verbose: bool = False,
) -> str:
    """Generate a CREATE TABLE statement from a Parquet file schema.

    This reads the Parquet schema only (via PyArrow) and does not materialize data.

    Columns whose Parquet types have no clean mapping for the chosen dialect are
    dropped, and a warning is logged under logger name 'datablade'.

    Args:
        parquet_path: Path to the Parquet file whose schema is inspected.
        catalog: Optional catalog/database name for the qualified table name.
        schema: Optional schema name for the qualified table name.
        table: Table name to create.
        drop_existing: If True, prepend a dialect-appropriate DROP TABLE clause.
        dialect: Target SQL dialect.
        verbose: If True, log a confirmation message when the statement is built.

    Returns:
        The full SQL statement (optional drop clause + CREATE TABLE).

    Raises:
        ValueError: If any name argument is invalid, or if no column of the
            Parquet schema maps to a supported SQL type for the dialect.
        ImportError: If pyarrow is not installed.
    """

    # isinstance(None, str) is False, so the None case is covered here too.
    if not isinstance(parquet_path, str) or not parquet_path.strip():
        raise ValueError("parquet_path must be a non-empty string")
    if not isinstance(table, str) or not table.strip():
        raise ValueError("table must be a non-empty string")
    if catalog is not None and (not isinstance(catalog, str) or not catalog.strip()):
        raise ValueError("catalog, if provided, must be a non-empty string")
    if schema is not None and (not isinstance(schema, str) or not schema.strip()):
        raise ValueError("schema, if provided, must be a non-empty string")

    _, pq = _require_pyarrow()

    # Schema-only read: no row groups are materialized.
    arrow_schema = pq.ParquetFile(parquet_path).schema_arrow

    qualified_name = _qualify_name(catalog, schema, table, dialect)
    lines: List[str] = []

    for field in arrow_schema:
        sql_type = _sql_type_from_arrow(field.type, dialect)
        if sql_type is None:
            # Unsupported type for this dialect: drop the column, keep going.
            logger.warning(
                "Dropping Parquet column %r (type=%s) for dialect=%s: unsupported type",
                field.name,
                str(field.type),
                dialect.value,
            )
            continue

        null_str = "NULL" if field.nullable else "NOT NULL"
        lines.append(
            f"    {quote_identifier(str(field.name), dialect)} {sql_type} {null_str}"
        )

    if not lines:
        raise ValueError(
            "No supported columns found in Parquet schema for the selected dialect"
        )

    body = ",\n".join(lines)

    drop_clause = ""
    if drop_existing:
        if dialect == Dialect.SQLSERVER:
            # OBJECT_ID-guarded drop; a leading USE switches catalog when given.
            drop_clause = (
                f"IF OBJECT_ID('{qualified_name}') IS NOT NULL "
                f"DROP TABLE {qualified_name};\n"
            )
            if catalog:
                drop_clause = (
                    f"USE {quote_identifier(catalog, dialect)};\n" + drop_clause
                )
        else:
            drop_clause = f"DROP TABLE IF EXISTS {qualified_name};\n"

    statement = f"{drop_clause}CREATE TABLE {qualified_name} (\n{body}\n);"
    print_verbose(
        f"Generated CREATE TABLE from Parquet schema for {qualified_name}", verbose
    )
    return statement
|
datablade/sql/quoting.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from .dialects import Dialect
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def quote_identifier(name: Optional[str], dialect: Dialect = Dialect.SQLSERVER) -> str:
    """
    Quote an identifier for the given SQL dialect.

    SQL Server identifiers have any literal brackets removed and are wrapped
    in []; Postgres/DuckDB identifiers are double-quoted with embedded quotes
    doubled; MySQL identifiers are backtick-quoted with embedded backticks
    doubled.

    Args:
        name: Identifier to quote; must be non-empty string.
        dialect: Target SQL dialect.

    Returns:
        Quoted identifier string.

    Raises:
        ValueError: If name is missing/empty.
        TypeError: If name is not a string.
        NotImplementedError: If dialect is unsupported.
    """
    if name is None:
        raise ValueError("name must be provided")
    if not isinstance(name, str):
        raise TypeError("name must be a string")
    stripped = name.strip()
    if not stripped:
        raise ValueError("name must be a non-empty string")

    if dialect == Dialect.SQLSERVER:
        return "[" + stripped.replace("[", "").replace("]", "") + "]"
    if dialect in (Dialect.POSTGRES, Dialect.DUCKDB):
        return '"' + stripped.replace('"', '""') + '"'
    if dialect == Dialect.MYSQL:
        return "`" + stripped.replace("`", "``") + "`"

    raise NotImplementedError(f"Dialect not supported: {dialect}")
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""
|
|
2
|
+
General utility functions for common operations.
|
|
3
|
+
|
|
4
|
+
This module provides functions for:
|
|
5
|
+
- String manipulation and SQL name quoting
|
|
6
|
+
- List operations (flattening)
|
|
7
|
+
- Logging and messaging
|
|
8
|
+
- Path standardization
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .lists import flatten
|
|
12
|
+
from .logging import print_verbose # backward compatibility
|
|
13
|
+
from .logging import (
|
|
14
|
+
configure_logging,
|
|
15
|
+
get_logger,
|
|
16
|
+
log,
|
|
17
|
+
log_debug,
|
|
18
|
+
log_error,
|
|
19
|
+
log_info,
|
|
20
|
+
log_warning,
|
|
21
|
+
)
|
|
22
|
+
from .strings import pathing, sql_quotename
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"sql_quotename",
|
|
26
|
+
"pathing",
|
|
27
|
+
"flatten",
|
|
28
|
+
# Logging
|
|
29
|
+
"get_logger",
|
|
30
|
+
"configure_logging",
|
|
31
|
+
"log",
|
|
32
|
+
"log_debug",
|
|
33
|
+
"log_info",
|
|
34
|
+
"log_warning",
|
|
35
|
+
"log_error",
|
|
36
|
+
"print_verbose",
|
|
37
|
+
]
|
datablade/utils/lists.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from typing import Any, List
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def flatten(nest: List[Any]) -> List[Any]:
    """
    Collapse an arbitrarily nested list into a single flat list.

    Args:
        nest: A potentially nested list structure.

    Returns:
        A flat list containing all elements from the nested structure,
        in left-to-right order.

    Raises:
        TypeError: If nest is not a list.

    Examples:
        >>> flatten([1, [2, 3], [[4], 5]])
        [1, 2, 3, 4, 5]
        >>> flatten([1, 2, 3])
        [1, 2, 3]
    """
    if not isinstance(nest, list):
        raise TypeError("nest must be a list")

    def _walk(items):
        # Depth-first traversal preserving element order.
        for item in items:
            if isinstance(item, list):
                yield from _walk(item)
            else:
                yield item

    return list(_walk(nest))
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Logging utilities for datablade.
|
|
3
|
+
|
|
4
|
+
Provides a configurable logger that can be used across all modules.
|
|
5
|
+
By default, logs to console at INFO level. Users can configure
|
|
6
|
+
handlers, levels, and formatters as needed.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import pathlib
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
# Create the datablade logger
|
|
14
|
+
_logger = logging.getLogger("datablade")
|
|
15
|
+
_logger.setLevel(logging.DEBUG) # Allow all levels; handlers control output
|
|
16
|
+
|
|
17
|
+
# Default console handler (can be replaced by user)
|
|
18
|
+
_default_handler: Optional[logging.Handler] = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _ensure_handler() -> None:
    """Attach a default INFO-level StreamHandler when none is configured."""
    global _default_handler
    # Respect user-installed handlers and avoid double-installing our default.
    if _logger.handlers or _default_handler is not None:
        return
    handler = logging.StreamHandler()
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    )
    _logger.addHandler(handler)
    _default_handler = handler
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_logger() -> logging.Logger:
    """
    Return the shared 'datablade' logger.

    A default console handler is attached first if none is configured.

    Returns:
        The configured datablade logger.
    """
    _ensure_handler()
    return _logger
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def configure_logging(
    level: int = logging.INFO,
    handler: Optional[logging.Handler] = None,
    format_string: Optional[str] = None,
    *,
    log_file: Optional[str | pathlib.Path] = None,
    format: Optional[str] = None,
) -> logging.Logger:
    """
    Configure the datablade logger, replacing any previously installed handlers.

    Args:
        level: Logging level applied to the handler (e.g., logging.DEBUG).
        handler: Optional custom handler. If None, a FileHandler is created
            when log_file is given, otherwise a StreamHandler.
        format_string: Optional format string for log messages.
        log_file: Optional path for a log file; parent directories are created
            as needed. Ignored when an explicit handler is supplied.
        format: Keyword-only alias for format_string.

    Returns:
        The configured logger instance.

    Raises:
        ValueError: If both format_string and format are provided.
    """
    global _default_handler

    # `format` is accepted as an alias, but specifying both is ambiguous.
    if format is not None:
        if format_string is not None:
            raise ValueError("Provide only one of format_string or format")
        format_string = format

    # Remove existing handlers so repeated configuration is idempotent.
    for existing in _logger.handlers[:]:
        _logger.removeHandler(existing)
    _default_handler = None

    # Build the replacement handler when the caller did not supply one.
    if handler is None:
        if log_file is not None:
            log_path = pathlib.Path(log_file)
            if log_path.parent:
                log_path.parent.mkdir(parents=True, exist_ok=True)
            handler = logging.FileHandler(log_path, encoding="utf-8")
        else:
            handler = logging.StreamHandler()

    handler.setLevel(level)

    # An empty format_string falls back to the default format, matching the
    # truthiness check used elsewhere in this module.
    fmt = format_string or "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    handler.setFormatter(logging.Formatter(fmt))

    _logger.addHandler(handler)
    _default_handler = handler

    return _logger
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def log(
    message: Any,
    level: int = logging.INFO,
    verbose: bool = True,
) -> None:
    """
    Emit a message through the datablade logger at the given level.

    Args:
        message: The message to log (converted to string).
        level: Logging level (default: INFO).
        verbose: When False, the call is a no-op.

    Returns:
        None
    """
    if verbose:
        _ensure_handler()
        _logger.log(level, str(message))
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def log_debug(message: Any, verbose: bool = True) -> None:
    """Log message at DEBUG level (no-op when verbose is False)."""
    log(message, level=logging.DEBUG, verbose=verbose)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def log_info(message: Any, verbose: bool = True) -> None:
    """Log message at INFO level (no-op when verbose is False)."""
    log(message, level=logging.INFO, verbose=verbose)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def log_warning(message: Any, verbose: bool = True) -> None:
    """Log message at WARNING level (no-op when verbose is False)."""
    log(message, level=logging.WARNING, verbose=verbose)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def log_error(message: Any, verbose: bool = True) -> None:
    """Log message at ERROR level (no-op when verbose is False)."""
    log(message, level=logging.ERROR, verbose=verbose)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Backward compatibility alias
|
|
146
|
+
def print_verbose(message: Any, verbose: bool = True) -> None:
    """
    Backward-compatible alias for log_info.

    Args:
        message: The message to print (converted to string).
        verbose: If True, the message will be logged at INFO level.

    Returns:
        None
    """
    log_info(message, verbose)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Messaging utilities for datablade.
|
|
3
|
+
|
|
4
|
+
This module provides backward-compatible message functions.
|
|
5
|
+
For new code, prefer using datablade.utils.logging directly.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Re-export from logging module for backward compatibility
|
|
9
|
+
from .logging import (
|
|
10
|
+
configure_logging,
|
|
11
|
+
get_logger,
|
|
12
|
+
log,
|
|
13
|
+
log_debug,
|
|
14
|
+
log_error,
|
|
15
|
+
log_info,
|
|
16
|
+
log_warning,
|
|
17
|
+
print_verbose,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"print_verbose",
|
|
22
|
+
"log",
|
|
23
|
+
"log_debug",
|
|
24
|
+
"log_info",
|
|
25
|
+
"log_warning",
|
|
26
|
+
"log_error",
|
|
27
|
+
"get_logger",
|
|
28
|
+
"configure_logging",
|
|
29
|
+
]
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
from typing import Optional, Union
|
|
3
|
+
|
|
4
|
+
from .messages import print_verbose
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def sql_quotename(
    name: Optional[str] = None,
    brackets: bool = True,
    ticks: bool = False,
    verbose: bool = False,
) -> str:
    """
    Quote a SQL Server name string with brackets and/or single-quote ticks.

    Literal square brackets inside name are stripped before quoting. With
    brackets=True the name is wrapped as [name]; with ticks=True (or when
    brackets=False) the result is additionally wrapped in single quotes, so
    both flags together yield a tick-quoted bracketed name.

    Args:
        name: The name to quote. Must be a non-empty string.
        brackets: If True, wraps the name in square brackets [name].
        ticks: If True, wraps the (possibly bracketed) name in single quotes.
        verbose: If True, logs a message when name is missing.

    Returns:
        The quoted name string.

    Raises:
        ValueError: If name is None or empty after stripping.
        TypeError: If name is not a string.

    Examples:
        >>> sql_quotename('table_name')
        '[table_name]'
        >>> sql_quotename('table_name', brackets=False, ticks=True)
        "'table_name'"
    """
    if name is None:
        print_verbose("No name provided; exiting sql_quotename.", verbose)
        raise ValueError("name must be provided")
    if not isinstance(name, str):
        raise TypeError("name must be a string")
    stripped = name.strip()
    if not stripped:
        raise ValueError("name must be a non-empty string")

    quoted = stripped.replace("[", "").replace("]", "")
    if brackets:
        quoted = "[" + quoted + "]"
    if ticks or not brackets:
        quoted = "'" + quoted + "'"
    return quoted
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def pathing(
    input: Optional[Union[str, pathlib.Path]], verbose: bool = False
) -> pathlib.Path:
    """
    Standardize a path-like value and verify that it exists on disk.

    String inputs have backslashes converted to forward slashes before being
    turned into a pathlib.Path; Path inputs are used as-is.

    Args:
        input: The path to standardize (string or pathlib.Path). Must not be None.
        verbose: If True, logs a message when validation fails.

    Returns:
        A pathlib.Path object if the path exists.

    Raises:
        ValueError: If input is None or the path does not exist.
        TypeError: If input is not a string or pathlib.Path.
    """
    if input is None:
        print_verbose("No path provided; exiting pathing.", verbose)
        raise ValueError("path input must be provided")

    if isinstance(input, pathlib.Path):
        candidate = input
    elif isinstance(input, str):
        candidate = pathlib.Path(input.replace("\\", "/"))
    else:
        raise TypeError("input must be a string or pathlib.Path")

    if not candidate.exists():
        print_verbose(f"Path {candidate} does not exist; exiting pathing.", verbose)
        raise ValueError(f"Path does not exist: {candidate}")

    return candidate
|