batch-analytics 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ """
2
+ Batch analytics pipeline: Extract, Transform, Log stages + analytics modules.
3
+
4
+ Stages:
5
+ - Extract: Load data from ClickHouse via Spark ClickHouse connector or JDBC
6
+ - Transform: Deduplicate and stage data (parquet/delta/clickhouse)
7
+ - Log: Persist run metadata and analytics results
8
+
9
+ Analytics modules:
10
+ - Module 1: Linear regression (XY) with slope comparison across groups
11
+ - Module 2: Multi-feature correlation
12
+ - Module 3: PCA + KMeans clustering
+ - Module 4: T-test to compare means of two groups
13
+ """
14
+
15
+ from .config import BatchAnalyticsConfig, SparkK8sConfig
16
+ from .extract import extract_all, extract_table, extract_unified
17
+ from .transform import (
18
+ extract_anchor_id,
19
+ load_staged,
20
+ remove_duplicates,
21
+ stage_to_clickhouse,
22
+ transform,
23
+ transform_and_stage,
24
+ )
25
+ from .log import log_analytics_artifacts, log_run
26
+ from .job_runner import run_pipeline, create_spark_session
27
+
28
+ __all__ = [
29
+ "BatchAnalyticsConfig",
30
+ "SparkK8sConfig",
31
+ "extract_anchor_id",
32
+ "extract_all",
33
+ "extract_table",
34
+ "extract_unified",
35
+ "remove_duplicates",
36
+ "stage_to_clickhouse",
37
+ "transform",
38
+ "transform_and_stage",
39
+ "load_staged",
40
+ "log_run",
41
+ "log_analytics_artifacts",
42
+ "run_pipeline",
43
+ "create_spark_session",
44
+ ]
@@ -0,0 +1,5 @@
1
+ """Allow python -m batch_analytics to run the job runner."""
2
+
3
+ from .job_runner import main
4
+ import sys
5
+ sys.exit(main())
@@ -0,0 +1,19 @@
1
+ """
2
+ Analytics modules for batch analytics pipeline.
3
+ - Module 1: Linear regression (XY) and slope comparison
4
+ - Module 2: Multi-feature correlation
5
+ - Module 3: PCA and clustering
6
+ - Module 4: T-test to compare means of two groups
7
+ """
8
+
9
+ from .linear_regression import run_linear_regression
10
+ from .correlation import run_correlation
11
+ from .pca_clustering import run_pca_clustering
12
+ from .t_test import run_t_test
13
+
14
+ __all__ = [
15
+ "run_linear_regression",
16
+ "run_correlation",
17
+ "run_pca_clustering",
18
+ "run_t_test",
19
+ ]
@@ -0,0 +1,113 @@
1
+ """
2
+ Module 2: Multi-feature correlation analysis.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.ml.feature import VectorAssembler
9
+ from pyspark.ml.stat import Correlation
10
+ from pyspark.sql import DataFrame, SparkSession
11
+
12
+ from ..config import BatchAnalyticsConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def run_correlation(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Compute a Pearson correlation matrix over multiple numeric features.

    Feature columns come from ``config.analytics.corr_features``
    (comma-separated); when empty, numeric columns are auto-selected from
    the schema. Pairs whose absolute correlation reaches
    ``config.analytics.corr_threshold`` are reported as potential
    collinearity.

    Args:
        spark: Active Spark session (part of the uniform module signature).
        df: Input DataFrame containing the feature columns.
        config: Pipeline configuration.

    Returns:
        dict with:
        - correlation_matrix: full matrix (list of lists)
        - feature_names: column order
        - high_corr_pairs: pairs with |corr| >= threshold
        - threshold: used threshold

    Raises:
        ValueError: if a configured feature column is missing from ``df``.
    """
    feature_cols = [
        c.strip()
        for c in config.analytics.corr_features.split(",")
        if c.strip()
    ]

    if not feature_cols:
        # Auto-select numeric columns by their Spark type name.
        numeric_markers = ("double", "int", "long", "float")
        feature_cols = [
            f.name
            for f in df.schema.fields
            if any(m in str(f.dataType).lower() for m in numeric_markers)
        ]
        logger.info("Auto-selected %d numeric columns for correlation", len(feature_cols))

    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(
            f"Correlation features not found: {missing}. "
            f"Available: {df.columns[:15]}..."
        )

    threshold = config.analytics.corr_threshold

    if len(feature_cols) < 2:
        # BUGFIX: with zero features there is no 1x1 matrix to report;
        # only a single surviving feature yields the trivial [[1.0]].
        return {
            "correlation_matrix": [[1.0]] if feature_cols else [],
            "feature_names": feature_cols,
            "high_corr_pairs": [],
            "threshold": threshold,
        }

    # Cast to double and drop nulls so the assembler sees clean values.
    from pyspark.sql.functions import col
    from pyspark.sql.types import DoubleType

    df_num = df.select(
        *[col(c).cast(DoubleType()).alias(c) for c in feature_cols]
    ).dropna()

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features",
        handleInvalid="skip",
    )
    df_vec = assembler.transform(df_num)

    # Correlation.corr returns a single-row DataFrame holding the matrix.
    corr_df = Correlation.corr(df_vec, "features", "pearson")
    corr_row = corr_df.head()
    if corr_row is None:
        # No rows survived the null filtering; report an empty matrix.
        return {
            "correlation_matrix": [],
            "feature_names": feature_cols,
            "high_corr_pairs": [],
            "threshold": threshold,
        }

    import numpy as np

    matrix = np.asarray(corr_row[0].toArray())

    # Scan the upper triangle for pairs at or above the threshold.
    high_pairs: list[dict] = []
    n = len(feature_cols)
    for i in range(n):
        for j in range(i + 1, n):
            val = float(matrix[i, j])
            if abs(val) >= threshold:
                high_pairs.append({
                    "feature_a": feature_cols[i],
                    "feature_b": feature_cols[j],
                    "correlation": val,
                })

    return {
        "correlation_matrix": matrix.tolist(),
        "feature_names": feature_cols,
        "high_corr_pairs": high_pairs,
        "threshold": threshold,
    }
@@ -0,0 +1,136 @@
1
+ """
2
+ Module 1: Simple linear regression on XY data with slope comparison across groups.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.ml.feature import VectorAssembler
9
+ from pyspark.ml.regression import LinearRegression
10
+ from pyspark.sql import DataFrame, SparkSession
11
+ from pyspark.sql.functions import col, concat_ws
12
+ from pyspark.sql.types import DoubleType
13
+
14
+ from ..config import BatchAnalyticsConfig
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def _ensure_numeric(df: DataFrame, x_col: str, y_col: str) -> DataFrame:
    """Cast the X/Y columns to double and drop rows where either is null."""
    passthrough = [c for c in df.columns if c not in (x_col, y_col)]
    casted = df.select(
        col(x_col).cast(DoubleType()).alias(x_col),
        col(y_col).cast(DoubleType()).alias(y_col),
        *passthrough,
    )
    return casted.filter(col(x_col).isNotNull() & col(y_col).isNotNull())
26
+
27
+
28
def run_linear_regression(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Run simple linear regression: Y ~ X.

    If group columns are configured, fit separate models per group and
    compare slopes pairwise across groups.

    Args:
        spark: Active Spark session (part of the uniform module signature).
        df: Input DataFrame with the X/Y columns (and optional group columns).
        config: Pipeline configuration (lr_x_column, lr_y_column,
            lr_group_columns).

    Returns:
        dict with:
        - global: single regression over all data (slope, intercept, r2)
        - slopes: per-group slope/intercept/r2/n, keyed by "a|b" group key
        - slope_comparison: pairwise slope differences (when multiple groups)

    Raises:
        ValueError: if no usable X/Y columns can be determined.
    """
    x_col = config.analytics.lr_x_column
    y_col = config.analytics.lr_y_column
    group_cols = [
        c.strip()
        for c in config.analytics.lr_group_columns.split(",")
        if c.strip()
    ]

    # Ensure required columns exist; fall back to the first two numeric
    # columns. CONSISTENCY FIX: include "long"/"float" type markers like the
    # other analytics modules do (previously only double/int were detected).
    if x_col not in df.columns or y_col not in df.columns:
        numeric_markers = ("double", "int", "long", "float")
        numeric_cols = [
            f.name
            for f in df.schema.fields
            if any(m in str(f.dataType).lower() for m in numeric_markers)
        ]
        if len(numeric_cols) >= 2:
            x_col, y_col = numeric_cols[0], numeric_cols[1]
            logger.warning(
                "LR columns not found, using first two numeric: %s, %s",
                x_col,
                y_col,
            )
        else:
            raise ValueError(
                f"Could not find LR columns (x={x_col}, y={y_col}). "
                "Specify BATCH_LR_X_COLUMN and BATCH_LR_Y_COLUMN."
            )

    df = _ensure_numeric(df, x_col, y_col)

    assembler = VectorAssembler(inputCols=[x_col], outputCol="features", handleInvalid="skip")
    df_vec = assembler.transform(df).withColumnRenamed(y_col, "label")

    # Global regression (all data).
    lr = LinearRegression(featuresCol="features", labelCol="label")
    global_model = lr.fit(df_vec)

    result: dict[str, Any] = {
        "global": {
            "slope": float(global_model.coefficients[0]),
            "intercept": float(global_model.intercept),
            "r2": float(global_model.summary.r2),
        },
        "slopes": {},
        "slope_comparison": [],
    }

    if not group_cols or not all(g in df.columns for g in group_cols):
        return result

    # Per-group regression: collapse multi-column groups into one string key
    # so a single distinct() pass enumerates the groups.
    group_key = "_group_key"
    df_grouped = df_vec.withColumn(
        group_key,
        concat_ws("|", *[col(g).cast("string") for g in group_cols]),
    )
    groups = df_grouped.select(group_key).distinct().collect()

    slopes_by_group: dict[str, dict] = {}
    for row in groups:
        key_str = row[group_key]
        if key_str is None:
            continue

        sub_df = df_grouped.filter(col(group_key) == key_str).drop(group_key)
        # PERF FIX: count once instead of triggering two Spark jobs per
        # group (the original called sub_df.count() twice).
        n_rows = sub_df.count()
        if n_rows < 2:
            continue

        model = lr.fit(sub_df)
        slopes_by_group[key_str] = {
            "slope": float(model.coefficients[0]),
            "intercept": float(model.intercept),
            "r2": float(model.summary.r2),
            "n": n_rows,
            "group": dict(zip(group_cols, key_str.split("|"))),
        }

    result["slopes"] = slopes_by_group

    # Slope comparison: pairwise differences across all fitted groups.
    keys = list(slopes_by_group.keys())
    for i in range(len(keys)):
        for j in range(i + 1, len(keys)):
            s1 = slopes_by_group[keys[i]]["slope"]
            s2 = slopes_by_group[keys[j]]["slope"]
            result["slope_comparison"].append({
                "group_a": keys[i],
                "group_b": keys[j],
                "slope_a": s1,
                "slope_b": s2,
                "slope_diff": s1 - s2,
            })

    return result
@@ -0,0 +1,143 @@
1
+ """
2
+ Module 3: PCA for key feature identification and clustering on staged data.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.ml.clustering import KMeans
9
+ from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
10
+ from pyspark.sql import DataFrame, SparkSession
11
+
12
+ from ..config import BatchAnalyticsConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def run_pca_clustering(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Run PCA to identify key features and KMeans clustering on staged data.

    Features are standardized (zero mean, unit variance) before PCA; KMeans
    runs on the PCA-projected features with a fixed seed for reproducibility.

    Args:
        spark: Active Spark session (part of the uniform module signature).
        df: Input DataFrame containing the feature columns.
        config: Pipeline configuration (pca_features, pca_variance_threshold,
            cluster_k).

    Returns:
        dict with:
        - pca: explained/cumulative variance, per-component feature loadings,
          and how many components reach the configured variance threshold
        - clustering: k, cluster sizes, centroids (in PCA space), and the
          within-cluster sum of squared distances

    Raises:
        ValueError: if a configured feature column is missing from ``df``.
    """
    feature_cols = [
        c.strip()
        for c in config.analytics.pca_features.split(",")
        if c.strip()
    ]

    if not feature_cols:
        # Auto-select numeric columns by their Spark type name.
        numeric_markers = ("double", "int", "long", "float")
        feature_cols = [
            f.name
            for f in df.schema.fields
            if any(m in str(f.dataType).lower() for m in numeric_markers)
        ]
        logger.info("Auto-selected %d numeric columns for PCA/clustering", len(feature_cols))

    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(
            f"PCA features not found: {missing}. Available: {df.columns[:15]}..."
        )

    if len(feature_cols) < 2:
        return {
            "pca": {"explained_variance": [1.0], "feature_names": feature_cols},
            "clustering": {"k": config.analytics.cluster_k, "sizes": []},
        }

    from pyspark.sql.functions import col
    from pyspark.sql.types import DoubleType

    df_num = df.select(
        *[col(c).cast(DoubleType()).alias(c) for c in feature_cols]
    ).dropna()

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features_raw",
        handleInvalid="skip",
    )
    df_vec = assembler.transform(df_num)

    # Standardize so PCA is not dominated by large-scale features.
    scaler = StandardScaler(
        inputCol="features_raw",
        outputCol="features",
        withStd=True,
        withMean=True,
    )
    scaler_model = scaler.fit(df_vec)
    df_scaled = scaler_model.transform(df_vec)

    # PCA: fit up to 20 components, then report how many reach the target
    # cumulative variance.
    variance_threshold = config.analytics.pca_variance_threshold
    n_comp_max = min(len(feature_cols), 20)
    pca = PCA(k=n_comp_max, inputCol="features", outputCol="pca_features")
    pca_model = pca.fit(df_scaled)

    explained = pca_model.explainedVariance.toArray().tolist()
    cumsum = []
    running = 0.0
    for v in explained:
        running += v
        cumsum.append(running)

    # FIX: previously variance_threshold was read but never applied; report
    # the number of leading components needed to reach it.
    n_for_threshold = next(
        (i + 1 for i, c in enumerate(cumsum) if c >= variance_threshold),
        len(cumsum),
    )

    # Feature loadings per principal component.
    # BUGFIX: pca_model.pc is a (numFeatures x k) matrix whose COLUMNS are
    # the components. The original iterated rows (features), mislabeling
    # the loadings; iterate columns instead.
    components = pca_model.pc.toArray()  # shape: (n_features, k)
    loadings: list[dict] = []
    for pc_idx in range(components.shape[1]):
        ranked = sorted(
            zip(feature_cols, components[:, pc_idx].tolist()),
            key=lambda x: abs(x[1]),
            reverse=True,
        )
        loadings.append({
            "pc": pc_idx + 1,
            "top_features": [{"name": n, "loading": float(v)} for n, v in ranked[:5]],
        })

    k = config.analytics.cluster_k
    kmeans = KMeans(k=k, seed=42, featuresCol="pca_features", predictionCol="cluster")
    kmeans_model = kmeans.fit(df_scaled)

    df_clustered = kmeans_model.transform(df_scaled)
    cluster_sizes = (
        df_clustered.groupBy("cluster")
        .count()
        .orderBy("cluster")
        .collect()
    )
    sizes = {int(r["cluster"]): int(r["count"]) for r in cluster_sizes}

    # PERF: clusterCenters() was called inside the loop; fetch it once.
    centroids = [
        {"cluster": i, "centroid": center.tolist()}
        for i, center in enumerate(kmeans_model.clusterCenters())
    ]

    return {
        "pca": {
            "explained_variance": explained,
            "cumulative_variance": cumsum,
            "feature_names": feature_cols,
            "component_loadings": loadings,
            "n_components": len(explained),
            "variance_threshold": variance_threshold,
            "n_components_for_threshold": n_for_threshold,
        },
        "clustering": {
            "k": k,
            "sizes": sizes,
            "centroids": centroids,
            "within_cluster_sum_of_squared_distances": float(
                kmeans_model.summary.trainingCost
            ),
        },
    }
@@ -0,0 +1,184 @@
1
+ """
2
+ Module 4: T-test to compare means of two sets of data.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.sql import DataFrame, SparkSession
9
+ from pyspark.sql.functions import col, avg, stddev, count
10
+ from pyspark.sql.types import DoubleType
11
+
12
+ from ..config import BatchAnalyticsConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def run_t_test(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Perform an independent samples t-test to compare the means of two groups.

    Supports two modes:
    1. Value + group: one numeric column, one categorical column with 2 levels
    2. Two columns: two numeric columns, compare their means

    Uses Welch's t-test (does not assume equal variances). When neither mode
    is configured, falls back to the first numeric column paired with the
    first string column having exactly two distinct values.

    Args:
        spark: Active Spark session (part of the uniform module signature).
        df: Input DataFrame.
        config: Pipeline configuration (ttest_* settings).

    Returns:
        dict with per-group summaries (name/mean/std/n), t_statistic,
        p_value, and mean_difference (mean_a - mean_b).

    Raises:
        ValueError: if no usable column combination can be determined.
    """
    value_col = (config.analytics.ttest_value_column or "").strip()
    group_col = (config.analytics.ttest_group_column or "").strip()
    col_a = (config.analytics.ttest_col_a or "").strip()
    col_b = (config.analytics.ttest_col_b or "").strip()

    # Mode 1: value column + group column (both required).
    if value_col and group_col and value_col in df.columns and group_col in df.columns:
        return _run_t_test_by_group(df, value_col, group_col)

    # Mode 2: two numeric columns.
    if col_a and col_b and col_a in df.columns and col_b in df.columns:
        return _run_t_test_two_columns(df, col_a, col_b)

    # Fallback: first numeric column + first string column with 2 levels.
    numeric_markers = ("double", "int", "float")
    numeric_cols = [
        f.name for f in df.schema.fields
        if any(m in str(f.dataType).lower() for m in numeric_markers)
    ]
    string_cols = [
        f.name for f in df.schema.fields
        if "string" in str(f.dataType).lower()
    ]
    # PERF FIX: the distinct count of a string column does not depend on the
    # numeric column, so scan string columns once instead of re-running the
    # distinct() job for every (numeric, string) pair as before.
    if numeric_cols:
        for sc in string_cols:
            if df.select(sc).distinct().count() == 2:
                logger.info(
                    "Auto-selected t-test: value=%s, group=%s", numeric_cols[0], sc
                )
                return _run_t_test_by_group(df, numeric_cols[0], sc)

    raise ValueError(
        "T-test requires either (BATCH_TTEST_VALUE_COLUMN + BATCH_TTEST_GROUP_COLUMN) "
        "or (BATCH_TTEST_COL_A + BATCH_TTEST_COL_B). "
        f"Available columns: {df.columns}"
    )
72
+
73
+
74
def _run_t_test_by_group(
    df: DataFrame,
    value_col: str,
    group_col: str,
) -> dict[str, Any]:
    """
    T-test: compare the mean of ``value_col`` across two levels of ``group_col``.

    Args:
        df: Input DataFrame.
        value_col: Numeric column whose means are compared.
        group_col: Categorical column; must have exactly two non-null levels.

    Returns:
        Welch's t-test result dict (see _compute_t_test_result).

    Raises:
        ValueError: if ``group_col`` does not have exactly two groups.
    """
    df_num = df.select(
        col(value_col).cast(DoubleType()).alias("_val"),
        col(group_col).cast("string").alias("_grp"),
    ).filter(col("_val").isNotNull() & col("_grp").isNotNull())

    stats = (
        df_num.groupBy("_grp")
        .agg(
            avg("_val").alias("mean"),
            stddev("_val").alias("std"),
            count("_val").alias("n"),
        )
        # FIX: collect() order was nondeterministic, so group_a/group_b (and
        # the sign of the t-statistic) could flip between runs; sort by the
        # group label for reproducible output.
        .orderBy("_grp")
        .collect()
    )

    if len(stats) != 2:
        raise ValueError(
            f"T-test requires exactly 2 groups in {group_col}. Found: {[r['_grp'] for r in stats]}"
        )

    r0, r1 = stats[0], stats[1]
    return _compute_t_test_result(
        group_a=r0["_grp"],
        mean_a=float(r0["mean"]),
        # stddev is null for single-row groups; fall back to 0.0.
        std_a=float(r0["std"] or 0.0),
        n_a=int(r0["n"]),
        group_b=r1["_grp"],
        mean_b=float(r1["mean"]),
        std_b=float(r1["std"] or 0.0),
        n_b=int(r1["n"]),
    )
111
+
112
+
113
def _run_t_test_two_columns(df: DataFrame, col_a: str, col_b: str) -> dict[str, Any]:
    """T-test: compare the means of two numeric columns of the same rows."""
    # Cast both columns to double and keep only rows where both are present.
    pair = df.select(
        col(col_a).cast(DoubleType()).alias("_a"),
        col(col_b).cast(DoubleType()).alias("_b"),
    ).filter(col("_a").isNotNull() & col("_b").isNotNull())

    # One aggregation pass computes mean/std/count for both columns.
    aggregates = [
        avg("_a").alias("mean_a"),
        stddev("_a").alias("std_a"),
        count("_a").alias("n_a"),
        avg("_b").alias("mean_b"),
        stddev("_b").alias("std_b"),
        count("_b").alias("n_b"),
    ]
    summary = pair.agg(*aggregates).collect()[0]

    return _compute_t_test_result(
        group_a=col_a,
        mean_a=float(summary["mean_a"]),
        std_a=float(summary["std_a"] or 0.0),
        n_a=int(summary["n_a"]),
        group_b=col_b,
        mean_b=float(summary["mean_b"]),
        std_b=float(summary["std_b"] or 0.0),
        n_b=int(summary["n_b"]),
    )
139
+
140
+
141
+ def _compute_t_test_result(
142
+ group_a: str,
143
+ mean_a: float,
144
+ std_a: float,
145
+ n_a: int,
146
+ group_b: str,
147
+ mean_b: float,
148
+ std_b: float,
149
+ n_b: int,
150
+ ) -> dict[str, Any]:
151
+ """Compute Welch's t-test from summary statistics."""
152
+ try:
153
+ from scipy import stats
154
+ except ImportError:
155
+ raise ImportError("T-test requires scipy. Install with: pip install scipy")
156
+
157
+ t_stat, p_value = stats.ttest_ind_from_stats(
158
+ mean1=mean_a,
159
+ std1=std_a,
160
+ nobs1=n_a,
161
+ mean2=mean_b,
162
+ std2=std_b,
163
+ nobs2=n_b,
164
+ equal_var=False, # Welch's t-test
165
+ )
166
+
167
+ return {
168
+ "group_a": {
169
+ "name": group_a,
170
+ "mean": mean_a,
171
+ "std": std_a,
172
+ "n": n_a,
173
+ },
174
+ "group_b": {
175
+ "name": group_b,
176
+ "mean": mean_b,
177
+ "std": std_b,
178
+ "n": n_b,
179
+ },
180
+ "t_statistic": float(t_stat),
181
+ "p_value": float(p_value),
182
+ "mean_difference": mean_a - mean_b,
183
+ "test": "Welch",
184
+ }