datablade 0.0.0-py3-none-any.whl → 0.0.6-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- datablade/__init__.py +49 -1
- datablade/blade.py +322 -0
- datablade/core/__init__.py +28 -7
- datablade/core/frames.py +23 -236
- datablade/core/json.py +5 -10
- datablade/core/lists.py +5 -10
- datablade/core/messages.py +23 -11
- datablade/core/strings.py +5 -43
- datablade/core/zip.py +5 -24
- datablade/dataframes/__init__.py +51 -0
- datablade/dataframes/frames.py +585 -0
- datablade/dataframes/readers.py +1367 -0
- datablade/docs/ARCHITECTURE.md +102 -0
- datablade/docs/OBJECT_REGISTRY.md +194 -0
- datablade/docs/README.md +57 -0
- datablade/docs/TESTING.md +37 -0
- datablade/docs/USAGE.md +409 -0
- datablade/docs/__init__.py +87 -0
- datablade/docs/__main__.py +6 -0
- datablade/io/__init__.py +15 -0
- datablade/io/json.py +70 -0
- datablade/io/zip.py +111 -0
- datablade/registry.py +581 -0
- datablade/sql/__init__.py +56 -0
- datablade/sql/bulk_load.py +665 -0
- datablade/sql/ddl.py +402 -0
- datablade/sql/ddl_pyarrow.py +411 -0
- datablade/sql/dialects.py +12 -0
- datablade/sql/quoting.py +44 -0
- datablade/sql/schema_spec.py +65 -0
- datablade/sql/sqlserver.py +390 -0
- datablade/utils/__init__.py +38 -0
- datablade/utils/lists.py +32 -0
- datablade/utils/logging.py +204 -0
- datablade/utils/messages.py +29 -0
- datablade/utils/strings.py +249 -0
- datablade-0.0.6.dist-info/METADATA +406 -0
- datablade-0.0.6.dist-info/RECORD +41 -0
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info/licenses}/LICENSE +20 -20
- datablade-0.0.0.dist-info/METADATA +0 -13
- datablade-0.0.0.dist-info/RECORD +0 -13
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/dataframes/frames.py
@@ -0,0 +1,585 @@
"""DataFrame transformation and export helpers.

This module focuses on schema-aware DataFrame cleanup, type inference,
and downstream serialization helpers (Parquet schema/table generation
and SQL Server-compatible DDL helpers).
"""

import os
import pathlib
import shutil
import subprocess
from typing import Any, Optional, Union

import numpy as np
import pandas as pd
import pyarrow as pa

from ..utils.logging import log_debug, log_error, log_info, log_warning
from ..utils.strings import coerce_path, ensure_directory

_BYTES_LIKE_TYPES = (bytes, bytearray, memoryview)


def _is_bytes_like(value: Any) -> bool:
    """Return True when a value should be treated as binary data."""
    return isinstance(value, _BYTES_LIKE_TYPES)


def _infer_object_pa_type(col_data: pd.Series) -> pa.DataType:
    """Infer a PyArrow type for object-dtype Series.

    We sample non-null values and prefer "narrower" types (binary, string,
    bool, integer, float) before falling back to pyarrow's inference.
    """
    non_null = col_data.dropna()
    if non_null.empty:
        return pa.string()

    # Sample to avoid scanning very large object columns.
    sample = non_null.iloc[:100].tolist()

    if all(_is_bytes_like(v) for v in sample):
        return pa.binary()
    if all(isinstance(v, str) for v in sample):
        return pa.string()
    if all(isinstance(v, (bool, np.bool_)) for v in sample):
        return pa.bool_()
    if all(
        isinstance(v, (int, np.integer)) and not isinstance(v, (bool, np.bool_))
        for v in sample
    ):
        min_value = min(sample)
        max_value = max(sample)
        if min_value >= np.iinfo(np.int8).min and max_value <= np.iinfo(np.int8).max:
            return pa.int8()
        if min_value >= np.iinfo(np.int16).min and max_value <= np.iinfo(np.int16).max:
            return pa.int16()
        if min_value >= np.iinfo(np.int32).min and max_value <= np.iinfo(np.int32).max:
            return pa.int32()
        return pa.int64()
    if all(isinstance(v, (float, np.floating)) for v in sample):
        return pa.float64()

    try:
        # PyArrow can infer types for mixed object values; we normalize into
        # the closest "primitive" type for DDL/Parquet stability.
        inferred = pa.infer_type(sample)
        if pa.types.is_binary(inferred) or pa.types.is_large_binary(inferred):
            return pa.binary()
        if pa.types.is_string(inferred) or pa.types.is_large_string(inferred):
            return pa.string()
        if pa.types.is_boolean(inferred):
            return pa.bool_()
        if pa.types.is_integer(inferred):
            return pa.int64()
        if pa.types.is_floating(inferred):
            return pa.float64()
        if pa.types.is_timestamp(inferred) or pa.types.is_date(inferred):
            return inferred
    except Exception:
        pass

    return pa.string()
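To make the narrowing behaviour concrete, here is a small illustrative sketch (assuming datablade 0.0.6 and its pandas/pyarrow dependencies are installed, and importing the private helper directly from `datablade.dataframes.frames` purely for demonstration):

```python
import pandas as pd

from datablade.dataframes import frames

# Object-dtype integers narrow to the smallest PyArrow integer type that
# holds the sampled values; 300 exceeds int8, so int16 is selected.
print(frames._infer_object_pa_type(pd.Series([1, 2, 300], dtype=object)))  # int16

# Bytes-like values map to binary, plain strings to string.
print(frames._infer_object_pa_type(pd.Series([b"\x00", b"\x01"])))  # binary
print(frames._infer_object_pa_type(pd.Series(["a", "b"], dtype=object)))  # string
```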
def try_cast_string_columns_to_numeric(
    df: Optional[pd.DataFrame] = None,
    convert_partial: bool = False,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Attempt to cast DataFrame string columns to numeric values where possible.

    Args:
        df: The DataFrame to process. If None, returns None.
        convert_partial: If True, columns with some values convertible to numeric types
            will be converted to numeric types with NaNs where conversion failed.
            If False, only columns where all values can be converted will be converted.
        verbose: If True, prints progress messages.

    Returns:
        DataFrame with string columns converted to numeric types where possible,
        or None if no DataFrame is provided.
    """
    if df is None:
        log_warning(
            "No DataFrame provided; exiting try_cast_string_columns_to_numeric.",
            verbose,
        )
        raise ValueError("df must be provided")
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")

    for col in df.columns:
        dtype = df[col].dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            non_null = df[col].dropna()
            if (
                pd.api.types.is_object_dtype(dtype)
                and not non_null.empty
                and non_null.iloc[:100].map(_is_bytes_like).any()
            ):
                log_debug(
                    f"Column '{col}' contains bytes-like values; skipping numeric coercion.",
                    verbose,
                )
                continue
            converted = pd.to_numeric(df[col], errors="coerce")
            has_nan = converted.isnull().any()
            if not has_nan:
                df[col] = converted
                log_info(f"Column '{col}' successfully converted to numeric.", verbose)
            else:
                if convert_partial:
                    df[col] = converted
                    log_info(
                        f"Column '{col}' partially converted to numeric with NaNs where conversion failed.",
                        verbose,
                    )
                else:
                    log_debug(
                        f"Column '{col}' could not be fully converted to numeric; leaving as is.",
                        verbose,
                    )
    return df
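A minimal usage sketch for the coercion helper above (assuming datablade 0.0.6 is installed; the frame below is a placeholder):

```python
import pandas as pd

from datablade.dataframes.frames import try_cast_string_columns_to_numeric

df = pd.DataFrame({"a": ["1", "2", "3"], "b": ["1.5", "x", "2.5"]})
out = try_cast_string_columns_to_numeric(df)

# "a" converts cleanly and becomes numeric; "b" contains "x", so it is left
# unchanged unless convert_partial=True (which would coerce "x" to NaN).
print(out.dtypes)
```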
def clean_dataframe_columns(
    df: Optional[pd.DataFrame] = None, verbose: bool = False
) -> pd.DataFrame:
    """
    Clean the DataFrame columns by flattening MultiIndex, converting to strings,
    and removing duplicates.

    Args:
        df: The DataFrame to clean. If None, returns None.
        verbose: If True, prints progress messages.

    Returns:
        The cleaned DataFrame with:
        - Flattened MultiIndex columns
        - String column names
        - Duplicate columns removed (keeping first occurrence)
        Returns None if no DataFrame is provided.
    """
    if df is None:
        log_warning("No DataFrame provided; exiting clean_dataframe_columns.", verbose)
        raise ValueError("df must be provided")
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    # Step 1: Flatten MultiIndex columns.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ["_".join(map(str, col)).strip() for col in df.columns.values]
        log_info("Flattened MultiIndex columns.", verbose)

    # Step 2: Convert non-string column names to strings.
    df.columns = df.columns.map(str)
    log_debug("Converted column names to strings.", verbose)

    # Step 3: Remove duplicate columns, keeping the first occurrence.
    duplicates = df.columns.duplicated()
    if duplicates.any():
        duplicate_cols = df.columns[duplicates]
        log_warning(f"Duplicate columns found: {list(duplicate_cols)}", verbose)
        df = df.loc[:, ~duplicates]
        log_info("Removed duplicate columns, keeping the first occurrence.", verbose)

    return df
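A short sketch of the column cleanup above (same installation assumption; the MultiIndex frame is a placeholder):

```python
import pandas as pd

from datablade.dataframes.frames import clean_dataframe_columns

df = pd.DataFrame(
    [[1, 2, 3]],
    columns=pd.MultiIndex.from_tuples(
        [("price", "open"), ("price", "close"), ("price", "open")]
    ),
)

# MultiIndex levels are joined with "_" and the duplicate "price_open" is dropped.
print(list(clean_dataframe_columns(df).columns))  # ['price_open', 'price_close']
```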
def generate_parquet_schema(
    df: Optional[pd.DataFrame] = None, verbose: bool = False
) -> pa.Schema:
    """
    Generate a PyArrow Schema from a pandas DataFrame with optimized data types.

    Args:
        df: The DataFrame to generate the schema from. If None, returns None.
        verbose: If True, prints progress messages.

    Returns:
        PyArrow Schema object with optimized types (smallest integer type that fits the data),
        or None if no DataFrame is provided.
    """
    if df is None:
        log_warning("No DataFrame provided; exiting generate_parquet_schema.", verbose)
        raise ValueError("df must be provided")
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")

    fields = []
    for column in df.columns:
        col_data = df[column]
        col_name = column
        dtype = col_data.dtype

        # Determine if the column contains any nulls.
        nullable = col_data.isnull().any()

        # Map pandas dtype to PyArrow type.
        pa_type = None

        if pd.api.types.is_integer_dtype(dtype):
            # Check the range to determine the smallest integer type.
            non_null = col_data.dropna()
            if non_null.empty:
                pa_type = pa.int64()
            else:
                min_value = non_null.min()
                max_value = non_null.max()
                if (
                    min_value >= np.iinfo(np.int8).min
                    and max_value <= np.iinfo(np.int8).max
                ):
                    pa_type = pa.int8()
                elif (
                    min_value >= np.iinfo(np.int16).min
                    and max_value <= np.iinfo(np.int16).max
                ):
                    pa_type = pa.int16()
                elif (
                    min_value >= np.iinfo(np.int32).min
                    and max_value <= np.iinfo(np.int32).max
                ):
                    pa_type = pa.int32()
                else:
                    pa_type = pa.int64()

        elif pd.api.types.is_float_dtype(dtype):
            pa_type = pa.float64()

        elif pd.api.types.is_bool_dtype(dtype):
            pa_type = pa.bool_()

        elif isinstance(dtype, pd.DatetimeTZDtype):
            tz = getattr(getattr(col_data.dt, "tz", None), "zone", None) or str(
                col_data.dt.tz
            )
            pa_type = pa.timestamp("ms", tz=tz)

        elif pd.api.types.is_datetime64_any_dtype(dtype):
            pa_type = pa.timestamp("ms")

        elif pd.api.types.is_timedelta64_dtype(dtype):
            pa_type = pa.duration("ns")

        elif isinstance(dtype, pd.CategoricalDtype):
            pa_type = pa.string()

        elif pd.api.types.is_object_dtype(dtype):
            pa_type = _infer_object_pa_type(col_data)

        else:
            pa_type = pa.string()

        # Create a field.
        field = pa.field(col_name, pa_type, nullable=nullable)
        fields.append(field)

    schema = pa.schema(fields)
    return schema
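A quick sketch of the schema generation above (same installation assumption; the data is a placeholder):

```python
import pandas as pd

from datablade.dataframes.frames import generate_parquet_schema

df = pd.DataFrame(
    {
        "id": [1, 2, 3],                 # fits int8, so the field narrows
        "score": [0.5, None, 1.25],      # float column -> double, nullable
        "name": ["alice", "bob", None],  # object of strings -> string
    }
)
print(generate_parquet_schema(df))
```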
def pandas_to_parquet_table(
    df: Optional[pd.DataFrame] = None,
    convert: bool = True,
    partial: bool = False,
    preserve_index: bool = False,
    verbose: bool = False,
) -> pa.Table:
    """
    Generate a PyArrow Table from a pandas DataFrame with automatic type conversion.

    Args:
        df: The DataFrame to convert. If None, returns None.
        convert: If True, attempts to cast string columns to numeric types.
        partial: If True with convert, allows partial conversion with NaNs.
        preserve_index: If True, preserves the DataFrame index in the table.
        verbose: If True, prints progress messages.

    Returns:
        PyArrow Table object with optimized schema, or None if DataFrame is None
        or conversion fails.
    """
    if df is None:
        log_warning("No DataFrame provided; exiting pandas_to_parquet_table.", verbose)
        raise ValueError("df must be provided")
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")

    def _unique_col_name(existing: set[str], desired: str) -> str:
        """Return a column name that does not collide with existing names."""
        if desired not in existing:
            return desired
        i = 1
        while f"{desired}_{i}" in existing:
            i += 1
        return f"{desired}_{i}"

    def _materialize_index_columns(input_df: pd.DataFrame) -> pd.DataFrame:
        """Convert Index/MultiIndex into explicit columns."""
        if isinstance(input_df.index, pd.MultiIndex):
            index_names: list[str] = []
            for i, name in enumerate(input_df.index.names):
                index_names.append(name or f"__index_level_{i}__")

            existing = set(map(str, input_df.columns))
            index_names = [_unique_col_name(existing, str(n)) for n in index_names]

            idx_df = input_df.index.to_frame(index=False)
            idx_df.columns = index_names
            out_df = pd.concat([idx_df, input_df.reset_index(drop=True)], axis=1)
            return out_df

        index_name = (
            str(input_df.index.name)
            if input_df.index.name is not None
            else "__index_level_0__"
        )
        existing = set(map(str, input_df.columns))
        index_name = _unique_col_name(existing, index_name)

        out_df = input_df.copy()
        out_df.insert(0, index_name, out_df.index.to_numpy(copy=False))
        out_df = out_df.reset_index(drop=True)
        return out_df

    # Clean columns before schema inference to avoid duplicate/invalid names.
    df = clean_dataframe_columns(df=df, verbose=verbose)

    if preserve_index:
        # Preserve index by materializing it into real columns.
        df = _materialize_index_columns(df)

    if convert:
        # Attempt numeric coercion so Parquet schema uses numeric types.
        df = try_cast_string_columns_to_numeric(
            df=df, convert_partial=partial, verbose=verbose
        )

    # Build schema explicitly so the Parquet table uses stable, optimized types.
    schema = generate_parquet_schema(df=df, verbose=verbose)
    try:
        # We materialize index into regular columns above so that the schema can
        # fully describe the resulting table and the index is preserved even
        # when an explicit schema is supplied.
        table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
        return table
    except Exception as e:
        log_error(f"Error generating PyArrow Table: {e}", verbose)
        raise
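A minimal end-to-end sketch for the table builder above, writing the result with pyarrow.parquet (same installation assumption; the output path is a placeholder):

```python
import pandas as pd
import pyarrow.parquet as pq

from datablade.dataframes.frames import pandas_to_parquet_table

df = pd.DataFrame({"id": ["1", "2"], "label": ["x", "y"]})

# convert=True coerces "id" to a numeric column before the schema is built,
# so the Parquet file stores it as an integer rather than a string.
table = pandas_to_parquet_table(df, convert=True)
pq.write_table(table, "example.parquet")
```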
def generate_sql_server_create_table_string(
    df: Optional[pd.DataFrame] = None,
    catalog: str = "database",
    schema: str = "dbo",
    table: str = "table",
    dropexisting: bool = True,
    use_go: bool = False,
    schema_spec: Optional[dict] = None,
    verbose: bool = False,
) -> str:
    """
    Generate a SQL Server CREATE TABLE statement from a pandas DataFrame.

    Args:
        df: The DataFrame to generate the schema from. If None, returns None.
        catalog: The database catalog name.
        schema: The schema name (default: 'dbo').
        table: The table name.
        dropexisting: If True, includes DROP TABLE IF EXISTS statement.
        use_go: If True, inserts GO after USE when a catalog is supplied.
        verbose: If True, prints progress messages.

    Returns:
        SQL Server CREATE TABLE statement string with optimized column types,
        or None if no DataFrame is provided.
    """
    if df is None:
        log_warning(
            "No DataFrame provided; exiting generate_sql_server_create_table_string.",
            verbose,
        )
        raise ValueError("df must be provided")
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    if not isinstance(catalog, str) or not catalog.strip():
        raise ValueError("catalog must be a non-empty string")
    if not isinstance(schema, str) or not schema.strip():
        raise ValueError("schema must be a non-empty string")
    if not isinstance(table, str) or not table.strip():
        raise ValueError("table must be a non-empty string")

    from ..sql.ddl import generate_create_table
    from ..sql.dialects import Dialect

    return generate_create_table(
        df=df,
        catalog=catalog,
        schema=schema,
        table=table,
        drop_existing=dropexisting,
        dialect=Dialect.SQLSERVER,
        use_go=use_go,
        schema_spec=schema_spec,
        verbose=verbose,
    )
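A usage sketch for the DDL helper above (same installation assumption; the catalog, schema, and table names are placeholders, and the exact statement text comes from datablade.sql.ddl.generate_create_table):

```python
import pandas as pd

from datablade.dataframes.frames import generate_sql_server_create_table_string

df = pd.DataFrame({"id": [1, 2], "name": ["alice", "bob"]})
ddl = generate_sql_server_create_table_string(
    df,
    catalog="analytics",
    schema="dbo",
    table="customers",
    dropexisting=True,
)
print(ddl)  # CREATE TABLE statement (with optional DROP) targeting analytics.dbo.customers
```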
def write_to_file_and_sql(
    df: pd.DataFrame,
    file_path: Union[str, pathlib.Path],
    table_name: str,
    sql_server: str,
    database: str,
    username: Optional[str] = None,
    password: Optional[str] = None,
    use_trusted_connection: bool = False,
    use_azure_ad: bool = False,
    use_env_credentials: bool = True,
    verbose: bool = False,
) -> None:
    """
    Write a DataFrame to a CSV file and import it to SQL Server using BCP.

    Args:
        df: The DataFrame to write.
        file_path: Path where the CSV file will be saved.
        table_name: Name of the SQL Server table.
        sql_server: SQL Server instance name.
        database: Database name.
        username: SQL Server username.
        password: SQL Server password.
        use_trusted_connection: If True, use integrated authentication (-T).
        use_azure_ad: If True, use Azure AD authentication (-G).
        use_env_credentials: If True, fall back to DATABLADE_SQLSERVER_USERNAME
            and DATABLADE_SQLSERVER_PASSWORD when username/password not provided.
        verbose: If True, prints progress messages.

    Raises:
        subprocess.CalledProcessError: If BCP command fails.
    """
    if df is None or not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    path_obj = coerce_path(
        file_path,
        must_exist=False,
        verbose=verbose,
        label="file_path",
    )
    if not isinstance(table_name, str) or not table_name.strip():
        raise ValueError("table_name must be a non-empty string")
    if not isinstance(sql_server, str) or not sql_server.strip():
        raise ValueError("sql_server must be a non-empty string")
    if not isinstance(database, str) or not database.strip():
        raise ValueError("database must be a non-empty string")
    if use_trusted_connection and use_azure_ad:
        raise ValueError(
            "use_trusted_connection and use_azure_ad are mutually exclusive"
        )
    if use_env_credentials:
        if not username:
            username = os.getenv("DATABLADE_SQLSERVER_USERNAME")
        if not password:
            password = os.getenv("DATABLADE_SQLSERVER_PASSWORD")
    if not use_trusted_connection:
        if not isinstance(username, str) or not username.strip():
            raise ValueError("username must be a non-empty string")
        if not password and not use_azure_ad:
            raise ValueError("password must be provided")

    ensure_directory(path_obj.parent, verbose=verbose, label="output_dir")

    df.to_csv(path_obj, index=False)
    log_info(f"DataFrame written to file {path_obj}.", verbose)

    qualified_table = f"{database}.dbo.{table_name}"
    bcp_args = [
        "bcp",
        qualified_table,
        "in",
        str(path_obj),
        "-c",
        "-t,",
        "-S",
        sql_server,
    ]
    bcp_preview = bcp_args[:-1] + ["***REDACTED***"]
    bcp_path = shutil.which("bcp")
    if not bcp_path:
        install_steps = (
            "Install the SQL Server command line utilities (bcp) and ensure the "
            "binary is on PATH. For example: "
            "macOS (Homebrew): brew install msodbcsql17 mssql-tools; "
            "Linux (Debian/Ubuntu): install mssql-tools; "
            "Windows: install SQL Server Command Line Utilities and restart your shell."
        )
        path_env = os.environ.get("PATH", "")
        message = (
            "BCP executable was not found on PATH. "
            f"PATH={path_env}. {install_steps} "
            f"Command preview: {bcp_preview}."
        )
        log_error(message, verbose)
        raise FileNotFoundError(message)

    if use_trusted_connection:
        bcp_args.append("-T")
    else:
        if use_azure_ad:
            bcp_args.append("-G")
        if username:
            bcp_args.extend(["-U", username])
        if password:
            bcp_args.extend(["-P", password])

    log_debug(
        f"Executing BCP load to {qualified_table} on server {sql_server} as user {username}.",
        verbose,
    )
    redacted_args = []
    redact_next = False
    for arg in bcp_args:
        if redact_next:
            redacted_args.append("***REDACTED***")
            redact_next = False
            continue
        redacted_args.append(arg)
        if arg == "-P":
            redact_next = True
    if "-P" in bcp_args:
        log_warning(
            "BCP authentication uses -P with a plaintext password. "
            "Consider using trusted connection (-T) or Azure AD (-G).",
            verbose,
        )
    log_debug(f"BCP args: {redacted_args}", verbose)
    process = subprocess.run(
        bcp_args,
        shell=False,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if process.returncode == 0:
        log_info(
            f"DataFrame successfully written to SQL Server table {table_name}.", verbose
        )
    else:
        error_msg = process.stderr.decode()
        log_error(
            "Error writing DataFrame to SQL Server table "
            f"{table_name}: {error_msg} "
            f"PATH={os.environ.get('PATH', '')}. "
            "Ensure BCP is installed (SQL Server command line utilities) and on PATH. "
            f"Command preview: {bcp_preview}.",
            verbose,
        )
        raise subprocess.CalledProcessError(
            process.returncode,
            bcp_args,
            output=process.stdout,
            stderr=process.stderr,
        )
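Finally, a hedged sketch of the BCP-based loader above; it requires the bcp utility on PATH and a reachable SQL Server instance, and every connection detail below is a placeholder:

```python
import pandas as pd

from datablade.dataframes.frames import write_to_file_and_sql

df = pd.DataFrame({"id": [1, 2], "name": ["alice", "bob"]})

# Writes staging/customers.csv, then shells out to
# "bcp analytics.dbo.customers in staging/customers.csv -c -t, -S sql.example.com -T".
write_to_file_and_sql(
    df,
    file_path="staging/customers.csv",
    table_name="customers",
    sql_server="sql.example.com",
    database="analytics",
    use_trusted_connection=True,
)
```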