datablade-0.0.0-py3-none-any.whl → datablade-0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. datablade/__init__.py +49 -1
  2. datablade/blade.py +322 -0
  3. datablade/core/__init__.py +28 -7
  4. datablade/core/frames.py +23 -236
  5. datablade/core/json.py +5 -10
  6. datablade/core/lists.py +5 -10
  7. datablade/core/messages.py +23 -11
  8. datablade/core/strings.py +5 -43
  9. datablade/core/zip.py +5 -24
  10. datablade/dataframes/__init__.py +51 -0
  11. datablade/dataframes/frames.py +585 -0
  12. datablade/dataframes/readers.py +1367 -0
  13. datablade/docs/ARCHITECTURE.md +102 -0
  14. datablade/docs/OBJECT_REGISTRY.md +194 -0
  15. datablade/docs/README.md +57 -0
  16. datablade/docs/TESTING.md +37 -0
  17. datablade/docs/USAGE.md +409 -0
  18. datablade/docs/__init__.py +87 -0
  19. datablade/docs/__main__.py +6 -0
  20. datablade/io/__init__.py +15 -0
  21. datablade/io/json.py +70 -0
  22. datablade/io/zip.py +111 -0
  23. datablade/registry.py +581 -0
  24. datablade/sql/__init__.py +56 -0
  25. datablade/sql/bulk_load.py +665 -0
  26. datablade/sql/ddl.py +402 -0
  27. datablade/sql/ddl_pyarrow.py +411 -0
  28. datablade/sql/dialects.py +12 -0
  29. datablade/sql/quoting.py +44 -0
  30. datablade/sql/schema_spec.py +65 -0
  31. datablade/sql/sqlserver.py +390 -0
  32. datablade/utils/__init__.py +38 -0
  33. datablade/utils/lists.py +32 -0
  34. datablade/utils/logging.py +204 -0
  35. datablade/utils/messages.py +29 -0
  36. datablade/utils/strings.py +249 -0
  37. datablade-0.0.6.dist-info/METADATA +406 -0
  38. datablade-0.0.6.dist-info/RECORD +41 -0
  39. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
  40. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info/licenses}/LICENSE +20 -20
  41. datablade-0.0.0.dist-info/METADATA +0 -13
  42. datablade-0.0.0.dist-info/RECORD +0 -13
  43. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/sql/ddl.py ADDED
@@ -0,0 +1,402 @@
"""Pandas-driven DDL generation for multiple SQL dialects."""

from typing import Any, List, Mapping, Optional

import pandas as pd

from ..utils.messages import print_verbose
from .dialects import Dialect
from .quoting import quote_identifier
from .schema_spec import resolve_column_spec, resolve_string_policy

_VALID_PREFER_LENGTH = {"estimate", "minimum", "maximum"}


def _coerce_positive_int(value: Any, label: str) -> Optional[int]:
    if value is None:
        return None
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"{label} must be a positive integer")
    return int(value)


def _coerce_non_negative_int(value: Any, label: str) -> int:
    if value is None:
        return 0
    if isinstance(value, bool) or not isinstance(value, int) or value < 0:
        raise ValueError(f"{label} must be a non-negative integer")
    return int(value)


def _coerce_optional_bool(value: Any, label: str) -> Optional[bool]:
    if value is None:
        return None
    if not isinstance(value, bool):
        raise TypeError(f"{label} must be a boolean")
    return value


def _normalize_string_policy(policy: Optional[Mapping[str, Any]]) -> dict:
    policy = {} if policy is None else dict(policy)
    if "defined_pad" in policy and "pad" not in policy:
        policy["pad"] = policy["defined_pad"]

    prefer_length = policy.get("prefer_length", "estimate")
    if prefer_length not in _VALID_PREFER_LENGTH:
        raise ValueError(
            "prefer_length must be one of 'estimate', 'minimum', or 'maximum'"
        )

    min_length = _coerce_positive_int(policy.get("min_length"), "min_length")
    max_length = _coerce_positive_int(policy.get("max_length"), "max_length")
    pad = _coerce_non_negative_int(policy.get("pad"), "pad")
    empty_as_null = (
        _coerce_optional_bool(policy.get("empty_as_null"), "empty_as_null") or False
    )
    allow_null = _coerce_optional_bool(policy.get("allow_null"), "allow_null")

    return {
        "prefer_length": prefer_length,
        "min_length": min_length,
        "max_length": max_length,
        "pad": pad,
        "empty_as_null": empty_as_null,
        "allow_null": allow_null,
    }


def _string_series_stats(series: pd.Series, empty_as_null: bool) -> tuple[int, bool]:
    non_null = series.dropna()
    if non_null.empty:
        return 0, False

    as_str = non_null.astype(str)
    empty_mask = as_str == ""
    any_empty = bool(empty_mask.any())
    if empty_as_null:
        as_str = as_str[~empty_mask]
        if as_str.empty:
            return 0, any_empty

    lengths = as_str.map(len)
    max_length = int(lengths.max()) if not lengths.empty else 0
    return max_length, any_empty


def _select_string_length(
    max_length: int,
    *,
    prefer_length: str,
    pad: int,
    min_length: Optional[int],
    max_length_bound: Optional[int],
) -> int:
    if prefer_length == "minimum" and min_length is not None:
        length = min_length
    elif prefer_length == "maximum" and max_length_bound is not None:
        length = max_length_bound
    else:
        length = max_length + pad

    if min_length is not None:
        length = max(length, min_length)
    if max_length_bound is not None:
        length = min(length, max_length_bound)

    return max(1, int(length))


def _infer_sql_type(  # noqa: C901
    series: pd.Series,
    dialect: Dialect,
    *,
    string_policy: Optional[Mapping[str, Any]] = None,
) -> str:
    """Infer a SQL column type for a pandas Series given a dialect."""
    dtype = series.dtype

    def _is_bytes_like(value: Any) -> bool:
        return isinstance(value, (bytes, bytearray, memoryview))

    def _is_bytes_like_series(s: pd.Series) -> bool:
        if not pd.api.types.is_object_dtype(s.dtype):
            return False
        non_null = s.dropna()
        if non_null.empty:
            return False
        sample = non_null.iloc[:100]
        # require all sampled values to be bytes-like
        return bool(sample.map(_is_bytes_like).all())

    if dialect == Dialect.SQLSERVER:
        # Use SQL Server's tiered integer sizes for best-fit types.
        if pd.api.types.is_integer_dtype(dtype):
            non_null = series.dropna()
            if non_null.empty:
                return "bigint"
            min_value = non_null.min()
            max_value = non_null.max()
            if min_value >= 0 and max_value <= 255:
                return "tinyint"
            if min_value >= -32768 and max_value <= 32767:
                return "smallint"
            if min_value >= -2147483648 and max_value <= 2147483647:
                return "int"
            return "bigint"
        if pd.api.types.is_float_dtype(dtype):
            return "float"
        if pd.api.types.is_bool_dtype(dtype):
            return "bit"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "datetime2"
        if _is_bytes_like_series(series):
            return "varbinary(max)"
        # strings / objects
        policy = _normalize_string_policy(string_policy)
        max_length, _ = _string_series_stats(series, policy["empty_as_null"])
        max_length = _select_string_length(
            max_length,
            prefer_length=policy["prefer_length"],
            pad=policy["pad"],
            min_length=policy["min_length"],
            max_length_bound=policy["max_length"],
        )
        if (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            return f"nvarchar({max_length if max_length <= 4000 else 'max'})"
        return "nvarchar(max)"

    if dialect == Dialect.POSTGRES:
        # PostgreSQL integer sizes are narrower than SQL Server's tinyint.
        if pd.api.types.is_integer_dtype(dtype):
            non_null = series.dropna()
            if non_null.empty:
                return "bigint"
            min_value = non_null.min()
            max_value = non_null.max()
            if min_value >= -32768 and max_value <= 32767:
                return "smallint"
            if min_value >= -2147483648 and max_value <= 2147483647:
                return "integer"
            return "bigint"
        if pd.api.types.is_float_dtype(dtype):
            return "double precision"
        if pd.api.types.is_bool_dtype(dtype):
            return "boolean"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "timestamp"
        if _is_bytes_like_series(series):
            return "bytea"
        policy = _normalize_string_policy(string_policy)
        max_length, _ = _string_series_stats(series, policy["empty_as_null"])
        max_length = _select_string_length(
            max_length,
            prefer_length=policy["prefer_length"],
            pad=policy["pad"],
            min_length=policy["min_length"],
            max_length_bound=policy["max_length"],
        )
        return f"varchar({max_length})" if max_length <= 65535 else "text"

    if dialect == Dialect.MYSQL:
        # Keep MySQL type names consistent with the existing DDL outputs.
        if pd.api.types.is_integer_dtype(dtype):
            non_null = series.dropna()
            if non_null.empty:
                return "BIGINT"
            min_value = non_null.min()
            max_value = non_null.max()
            if min_value >= -32768 and max_value <= 32767:
                return "SMALLINT"
            if min_value >= -2147483648 and max_value <= 2147483647:
                return "INT"
            return "BIGINT"
        if pd.api.types.is_float_dtype(dtype):
            return "DOUBLE"
        if pd.api.types.is_bool_dtype(dtype):
            return "TINYINT(1)"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "DATETIME"
        if _is_bytes_like_series(series):
            return "LONGBLOB"
        policy = _normalize_string_policy(string_policy)
        max_length, _ = _string_series_stats(series, policy["empty_as_null"])
        max_length = _select_string_length(
            max_length,
            prefer_length=policy["prefer_length"],
            pad=policy["pad"],
            min_length=policy["min_length"],
            max_length_bound=policy["max_length"],
        )
        return f"VARCHAR({max_length})" if max_length <= 65535 else "TEXT"

    if dialect == Dialect.DUCKDB:
        # DuckDB has simplified type names and distinguishes signed/unsigned.
        if pd.api.types.is_integer_dtype(dtype):
            return (
                "BIGINT" if pd.api.types.is_signed_integer_dtype(dtype) else "UBIGINT"
            )
        if pd.api.types.is_float_dtype(dtype):
            return "DOUBLE"
        if pd.api.types.is_bool_dtype(dtype):
            return "BOOLEAN"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "TIMESTAMP"
        if _is_bytes_like_series(series):
            return "BLOB"
        return "VARCHAR"

    raise NotImplementedError(f"Dialect not supported: {dialect}")


def _qualify_name(
    catalog: Optional[str], schema: Optional[str], table: str, dialect: Dialect
) -> str:
    """Build a fully-qualified table name for the selected dialect."""
    if dialect == Dialect.SQLSERVER:
        # catalog and schema are both used when provided
        if catalog:
            return (
                f"{quote_identifier(catalog, dialect)}."
                f"{quote_identifier(schema or 'dbo', dialect)}."
                f"{quote_identifier(table, dialect)}"
            )
        return (
            f"{quote_identifier(schema or 'dbo', dialect)}."
            f"{quote_identifier(table, dialect)}"
        )

    if schema:
        return f"{quote_identifier(schema, dialect)}.{quote_identifier(table, dialect)}"
    return quote_identifier(table, dialect)


def generate_create_table(
    df: pd.DataFrame,
    catalog: Optional[str] = None,
    schema: Optional[str] = None,
    table: str = "table",
    drop_existing: bool = True,
    dialect: Dialect = Dialect.SQLSERVER,
    use_go: bool = False,
    schema_spec: Optional[Mapping[str, Any]] = None,
    verbose: bool = False,
) -> str:
    """
    Generate a CREATE TABLE statement for the given dialect.

    Args:
        df: Source DataFrame.
        catalog: Optional catalog/database name.
        schema: Optional schema name (defaults per dialect).
        table: Target table name.
        drop_existing: If True, include a DROP TABLE IF EXISTS stanza.
        dialect: SQL dialect.
        use_go: If True and dialect is SQL Server, insert a GO batch separator
            after USE when a catalog is provided.
        schema_spec: Optional schema overrides for column types and string sizing.
        verbose: If True, prints progress messages.

    Returns:
        CREATE TABLE statement as string.

    Raises:
        ValueError: On missing/invalid inputs.
        TypeError: If df is not a DataFrame.
        NotImplementedError: If dialect unsupported.
    """
    if df is None:
        raise ValueError("df must be provided")
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    if not isinstance(table, str) or not table.strip():
        raise ValueError("table must be a non-empty string")
    if catalog is not None and (not isinstance(catalog, str) or not catalog.strip()):
        raise ValueError("catalog, if provided, must be a non-empty string")
    if schema is not None and (not isinstance(schema, str) or not schema.strip()):
        raise ValueError("schema, if provided, must be a non-empty string")
    if not isinstance(use_go, bool):
        raise TypeError("use_go must be a boolean")

    qualified_name = _qualify_name(catalog, schema, table, dialect)
    lines: List[str] = []

    for column in df.columns:
        series = df[column]
        column_name = str(column)
        defaults, column_spec = resolve_column_spec(column_name, schema_spec)
        string_policy = resolve_string_policy(column_name, defaults, column_spec)
        normalized_policy = _normalize_string_policy(string_policy)

        nullable = series.isnull().any()
        if normalized_policy["empty_as_null"] and (
            pd.api.types.is_object_dtype(series.dtype)
            or isinstance(series.dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(series.dtype)
        ):
            _, any_empty = _string_series_stats(series, True)
            if any_empty:
                nullable = True

        nullable_override = _coerce_optional_bool(
            column_spec.get("nullable"), "nullable"
        )
        if nullable_override is None:
            nullable_override = _coerce_optional_bool(
                column_spec.get("allow_null"), "allow_null"
            )
        if nullable_override is None:
            nullable_override = normalized_policy["allow_null"]
        if nullable_override is None:
            nullable_override = _coerce_optional_bool(
                defaults.get("nullable"), "defaults.nullable"
            )
        if nullable_override is None:
            nullable_override = _coerce_optional_bool(
                defaults.get("allow_null"), "defaults.allow_null"
            )
        if nullable_override is not None:
            nullable = nullable_override

        sql_type_override = column_spec.get("sql_type")
        if sql_type_override is not None:
            if not isinstance(sql_type_override, str) or not sql_type_override.strip():
                raise ValueError(
                    f"schema_spec.columns['{column_name}'].sql_type must be a non-empty string"
                )
            sql_type = sql_type_override.strip()
        else:
            sql_type = _infer_sql_type(series, dialect, string_policy=normalized_policy)

        null_str = "NULL" if nullable else "NOT NULL"
        lines.append(
            f"    {quote_identifier(column_name, dialect)} {sql_type} {null_str}"
        )

    body = ",\n".join(lines)

    drop_clause = ""
    if drop_existing:
        if dialect == Dialect.SQLSERVER:
            object_id_name = qualified_name.replace("'", "''")
            if catalog:
                batch_sep = "GO\n" if use_go else ""
                drop_clause = (
                    f"USE {quote_identifier(catalog, dialect)};\n"
                    f"{batch_sep}IF OBJECT_ID('{object_id_name}') IS NOT NULL "
                    f"DROP TABLE {qualified_name};\n"
                )
            else:
                drop_clause = (
                    f"IF OBJECT_ID('{object_id_name}') IS NOT NULL "
                    f"DROP TABLE {qualified_name};\n"
                )
        else:
            drop_clause = f"DROP TABLE IF EXISTS {qualified_name};\n"

    statement = f"{drop_clause}CREATE TABLE {qualified_name} (\n{body}\n);"
    print_verbose(f"Generated CREATE TABLE for {qualified_name}", verbose)
    return statement
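
For orientation, a minimal usage sketch for the module above. It is not part of the package diff: the import paths follow the file list (datablade.sql.ddl, datablade.sql.dialects), bracket-style quoting of SQL Server identifiers is an assumption about quote_identifier, and the schema_spec shape is inferred from the ValueError message in generate_create_table, since schema_spec.py's resolvers are not shown here.

import pandas as pd

from datablade.sql.ddl import generate_create_table
from datablade.sql.dialects import Dialect

df = pd.DataFrame(
    {
        "id": [1, 2, 3],  # small non-negative ints: tinyint on SQL Server
        "score": [0.5, 1.25, None],  # float dtype with a null: float NULL
        "name": ["ada", "grace", "katherine"],  # longest value is 9 chars
    }
)

ddl = generate_create_table(
    df,
    schema="dbo",
    table="people",
    dialect=Dialect.SQLSERVER,
    # Per-column overrides appear to use this shape (an assumption):
    # schema_spec={"columns": {"name": {"sql_type": "nvarchar(200)"}}},
)
print(ddl)

Assuming bracket quoting, the output would look roughly like:

IF OBJECT_ID('[dbo].[people]') IS NOT NULL DROP TABLE [dbo].[people];
CREATE TABLE [dbo].[people] (
    [id] tinyint NOT NULL,
    [score] float NULL,
    [name] nvarchar(9) NOT NULL
);

The nvarchar(9) follows from the default string policy: prefer_length="estimate" with pad=0 sizes a column to its longest observed value, min_length/max_length clamp that estimate, and prefer_length="minimum" or "maximum" pins the length to the corresponding bound instead.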