batch-analytics 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ """
2
+ Batch analytics pipeline: Extract, Transform, Log stages + analytics modules.
3
+
4
+ Stages:
5
+ - Extract: Load data from ClickHouse via Spark ClickHouse connector or JDBC
6
+ - Transform: Deduplicate and stage data (parquet/delta/clickhouse)
7
+ - Log: Persist run metadata and analytics results
8
+
9
+ Analytics modules:
10
+ - Module 1: Linear regression (XY) with slope comparison across groups
11
+ - Module 2: Multi-feature correlation
12
+ - Module 3: PCA + KMeans clustering
+ - Module 4: T-test to compare means of two groups
13
+ """
14
+
15
+ from .config import BatchAnalyticsConfig, SparkK8sConfig
16
+ from .extract import extract_all, extract_table, extract_unified
17
+ from .transform import (
18
+ extract_anchor_id,
19
+ load_staged,
20
+ remove_duplicates,
21
+ stage_to_clickhouse,
22
+ transform,
23
+ transform_and_stage,
24
+ )
25
+ from .log import log_analytics_artifacts, log_run
26
+ from .job_runner import run_pipeline, create_spark_session
27
+
28
+ __all__ = [
29
+ "BatchAnalyticsConfig",
30
+ "SparkK8sConfig",
31
+ "extract_anchor_id",
32
+ "extract_all",
33
+ "extract_table",
34
+ "extract_unified",
35
+ "remove_duplicates",
36
+ "stage_to_clickhouse",
37
+ "transform",
38
+ "transform_and_stage",
39
+ "load_staged",
40
+ "log_run",
41
+ "log_analytics_artifacts",
42
+ "run_pipeline",
43
+ "create_spark_session",
44
+ ]
@@ -0,0 +1,5 @@
1
+ """Allow python -m batch_analytics to run the job runner."""
2
+
3
+ from .job_runner import main
4
+ import sys
5
+ sys.exit(main())
@@ -0,0 +1,19 @@
1
+ """
2
+ Analytics modules for batch analytics pipeline.
3
+ - Module 1: Linear regression (XY) and slope comparison
4
+ - Module 2: Multi-feature correlation
5
+ - Module 3: PCA and clustering
6
+ - Module 4: T-test to compare means of two groups
7
+ """
8
+
9
+ from .linear_regression import run_linear_regression
10
+ from .correlation import run_correlation
11
+ from .pca_clustering import run_pca_clustering
12
+ from .t_test import run_t_test
13
+
14
+ __all__ = [
15
+ "run_linear_regression",
16
+ "run_correlation",
17
+ "run_pca_clustering",
18
+ "run_t_test",
19
+ ]
@@ -0,0 +1,113 @@
1
+ """
2
+ Module 2: Multi-feature correlation analysis.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.ml.feature import VectorAssembler
9
+ from pyspark.ml.stat import Correlation
10
+ from pyspark.sql import DataFrame, SparkSession
11
+
12
+ from ..config import BatchAnalyticsConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def run_correlation(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Compute a Pearson correlation matrix over multiple numeric features.

    Feature columns come from ``config.analytics.corr_features``
    (comma-separated); when empty, numeric columns are auto-selected from
    the schema. Pairs whose absolute correlation reaches
    ``config.analytics.corr_threshold`` are reported as potential
    collinearity.

    Args:
        spark: Active Spark session (part of the uniform module signature).
        df: Input DataFrame containing the feature columns.
        config: Pipeline configuration.

    Returns:
        dict with:
        - correlation_matrix: full matrix (list of lists)
        - feature_names: column order
        - high_corr_pairs: pairs with |corr| >= threshold
        - threshold: used threshold

    Raises:
        ValueError: if a configured feature column is missing from ``df``.
    """
    feature_cols = [
        c.strip()
        for c in config.analytics.corr_features.split(",")
        if c.strip()
    ]

    if not feature_cols:
        # Auto-select numeric columns by their Spark type name.
        numeric_markers = ("double", "int", "long", "float")
        feature_cols = [
            f.name
            for f in df.schema.fields
            if any(m in str(f.dataType).lower() for m in numeric_markers)
        ]
        logger.info("Auto-selected %d numeric columns for correlation", len(feature_cols))

    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(
            f"Correlation features not found: {missing}. "
            f"Available: {df.columns[:15]}..."
        )

    threshold = config.analytics.corr_threshold

    if len(feature_cols) < 2:
        # BUGFIX: with zero features there is no 1x1 matrix to report;
        # only a single surviving feature yields the trivial [[1.0]].
        return {
            "correlation_matrix": [[1.0]] if feature_cols else [],
            "feature_names": feature_cols,
            "high_corr_pairs": [],
            "threshold": threshold,
        }

    # Cast to double and drop nulls so the assembler sees clean values.
    from pyspark.sql.functions import col
    from pyspark.sql.types import DoubleType

    df_num = df.select(
        *[col(c).cast(DoubleType()).alias(c) for c in feature_cols]
    ).dropna()

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features",
        handleInvalid="skip",
    )
    df_vec = assembler.transform(df_num)

    # Correlation.corr returns a single-row DataFrame holding the matrix.
    corr_df = Correlation.corr(df_vec, "features", "pearson")
    corr_row = corr_df.head()
    if corr_row is None:
        # No rows survived the null filtering; report an empty matrix.
        return {
            "correlation_matrix": [],
            "feature_names": feature_cols,
            "high_corr_pairs": [],
            "threshold": threshold,
        }

    import numpy as np

    matrix = np.asarray(corr_row[0].toArray())

    # Scan the upper triangle for pairs at or above the threshold.
    high_pairs: list[dict] = []
    n = len(feature_cols)
    for i in range(n):
        for j in range(i + 1, n):
            val = float(matrix[i, j])
            if abs(val) >= threshold:
                high_pairs.append({
                    "feature_a": feature_cols[i],
                    "feature_b": feature_cols[j],
                    "correlation": val,
                })

    return {
        "correlation_matrix": matrix.tolist(),
        "feature_names": feature_cols,
        "high_corr_pairs": high_pairs,
        "threshold": threshold,
    }
@@ -0,0 +1,136 @@
1
+ """
2
+ Module 1: Simple linear regression on XY data with slope comparison across groups.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.ml.feature import VectorAssembler
9
+ from pyspark.ml.regression import LinearRegression
10
+ from pyspark.sql import DataFrame, SparkSession
11
+ from pyspark.sql.functions import col, concat_ws
12
+ from pyspark.sql.types import DoubleType
13
+
14
+ from ..config import BatchAnalyticsConfig
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def _ensure_numeric(df: DataFrame, x_col: str, y_col: str) -> DataFrame:
    """Cast the X/Y columns to double and drop rows where either is null."""
    passthrough = [c for c in df.columns if c not in (x_col, y_col)]
    casted = df.select(
        col(x_col).cast(DoubleType()).alias(x_col),
        col(y_col).cast(DoubleType()).alias(y_col),
        *passthrough,
    )
    return casted.filter(col(x_col).isNotNull() & col(y_col).isNotNull())
26
+
27
+
28
def run_linear_regression(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Run simple linear regression: Y ~ X.

    If group columns are configured, fit separate models per group and
    compare slopes pairwise across groups.

    Args:
        spark: Active Spark session (part of the uniform module signature).
        df: Input DataFrame with the X/Y columns (and optional group columns).
        config: Pipeline configuration (lr_x_column, lr_y_column,
            lr_group_columns).

    Returns:
        dict with:
        - global: single regression over all data (slope, intercept, r2)
        - slopes: per-group slope/intercept/r2/n, keyed by "a|b" group key
        - slope_comparison: pairwise slope differences (when multiple groups)

    Raises:
        ValueError: if no usable X/Y columns can be determined.
    """
    x_col = config.analytics.lr_x_column
    y_col = config.analytics.lr_y_column
    group_cols = [
        c.strip()
        for c in config.analytics.lr_group_columns.split(",")
        if c.strip()
    ]

    # Ensure required columns exist; fall back to the first two numeric
    # columns. CONSISTENCY FIX: include "long"/"float" type markers like the
    # other analytics modules do (previously only double/int were detected).
    if x_col not in df.columns or y_col not in df.columns:
        numeric_markers = ("double", "int", "long", "float")
        numeric_cols = [
            f.name
            for f in df.schema.fields
            if any(m in str(f.dataType).lower() for m in numeric_markers)
        ]
        if len(numeric_cols) >= 2:
            x_col, y_col = numeric_cols[0], numeric_cols[1]
            logger.warning(
                "LR columns not found, using first two numeric: %s, %s",
                x_col,
                y_col,
            )
        else:
            raise ValueError(
                f"Could not find LR columns (x={x_col}, y={y_col}). "
                "Specify BATCH_LR_X_COLUMN and BATCH_LR_Y_COLUMN."
            )

    df = _ensure_numeric(df, x_col, y_col)

    assembler = VectorAssembler(inputCols=[x_col], outputCol="features", handleInvalid="skip")
    df_vec = assembler.transform(df).withColumnRenamed(y_col, "label")

    # Global regression (all data).
    lr = LinearRegression(featuresCol="features", labelCol="label")
    global_model = lr.fit(df_vec)

    result: dict[str, Any] = {
        "global": {
            "slope": float(global_model.coefficients[0]),
            "intercept": float(global_model.intercept),
            "r2": float(global_model.summary.r2),
        },
        "slopes": {},
        "slope_comparison": [],
    }

    if not group_cols or not all(g in df.columns for g in group_cols):
        return result

    # Per-group regression: collapse multi-column groups into one string key
    # so a single distinct() pass enumerates the groups.
    group_key = "_group_key"
    df_grouped = df_vec.withColumn(
        group_key,
        concat_ws("|", *[col(g).cast("string") for g in group_cols]),
    )
    groups = df_grouped.select(group_key).distinct().collect()

    slopes_by_group: dict[str, dict] = {}
    for row in groups:
        key_str = row[group_key]
        if key_str is None:
            continue

        sub_df = df_grouped.filter(col(group_key) == key_str).drop(group_key)
        # PERF FIX: count once instead of triggering two Spark jobs per
        # group (the original called sub_df.count() twice).
        n_rows = sub_df.count()
        if n_rows < 2:
            continue

        model = lr.fit(sub_df)
        slopes_by_group[key_str] = {
            "slope": float(model.coefficients[0]),
            "intercept": float(model.intercept),
            "r2": float(model.summary.r2),
            "n": n_rows,
            "group": dict(zip(group_cols, key_str.split("|"))),
        }

    result["slopes"] = slopes_by_group

    # Slope comparison: pairwise differences across all fitted groups.
    keys = list(slopes_by_group.keys())
    for i in range(len(keys)):
        for j in range(i + 1, len(keys)):
            s1 = slopes_by_group[keys[i]]["slope"]
            s2 = slopes_by_group[keys[j]]["slope"]
            result["slope_comparison"].append({
                "group_a": keys[i],
                "group_b": keys[j],
                "slope_a": s1,
                "slope_b": s2,
                "slope_diff": s1 - s2,
            })

    return result
@@ -0,0 +1,143 @@
1
+ """
2
+ Module 3: PCA for key feature identification and clustering on staged data.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.ml.clustering import KMeans
9
+ from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
10
+ from pyspark.sql import DataFrame, SparkSession
11
+
12
+ from ..config import BatchAnalyticsConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def run_pca_clustering(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Run PCA to identify key features and KMeans clustering on staged data.

    Features are standardized (zero mean, unit variance) before PCA; KMeans
    runs on the PCA-projected features with a fixed seed for reproducibility.

    Args:
        spark: Active Spark session (part of the uniform module signature).
        df: Input DataFrame containing the feature columns.
        config: Pipeline configuration (pca_features, pca_variance_threshold,
            cluster_k).

    Returns:
        dict with:
        - pca: explained/cumulative variance, per-component feature loadings,
          and how many components reach the configured variance threshold
        - clustering: k, cluster sizes, centroids (in PCA space), and the
          within-cluster sum of squared distances

    Raises:
        ValueError: if a configured feature column is missing from ``df``.
    """
    feature_cols = [
        c.strip()
        for c in config.analytics.pca_features.split(",")
        if c.strip()
    ]

    if not feature_cols:
        # Auto-select numeric columns by their Spark type name.
        numeric_markers = ("double", "int", "long", "float")
        feature_cols = [
            f.name
            for f in df.schema.fields
            if any(m in str(f.dataType).lower() for m in numeric_markers)
        ]
        logger.info("Auto-selected %d numeric columns for PCA/clustering", len(feature_cols))

    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(
            f"PCA features not found: {missing}. Available: {df.columns[:15]}..."
        )

    if len(feature_cols) < 2:
        return {
            "pca": {"explained_variance": [1.0], "feature_names": feature_cols},
            "clustering": {"k": config.analytics.cluster_k, "sizes": []},
        }

    from pyspark.sql.functions import col
    from pyspark.sql.types import DoubleType

    df_num = df.select(
        *[col(c).cast(DoubleType()).alias(c) for c in feature_cols]
    ).dropna()

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features_raw",
        handleInvalid="skip",
    )
    df_vec = assembler.transform(df_num)

    # Standardize so PCA is not dominated by large-scale features.
    scaler = StandardScaler(
        inputCol="features_raw",
        outputCol="features",
        withStd=True,
        withMean=True,
    )
    scaler_model = scaler.fit(df_vec)
    df_scaled = scaler_model.transform(df_vec)

    # PCA: fit up to 20 components, then report how many reach the target
    # cumulative variance.
    variance_threshold = config.analytics.pca_variance_threshold
    n_comp_max = min(len(feature_cols), 20)
    pca = PCA(k=n_comp_max, inputCol="features", outputCol="pca_features")
    pca_model = pca.fit(df_scaled)

    explained = pca_model.explainedVariance.toArray().tolist()
    cumsum = []
    running = 0.0
    for v in explained:
        running += v
        cumsum.append(running)

    # FIX: previously variance_threshold was read but never applied; report
    # the number of leading components needed to reach it.
    n_for_threshold = next(
        (i + 1 for i, c in enumerate(cumsum) if c >= variance_threshold),
        len(cumsum),
    )

    # Feature loadings per principal component.
    # BUGFIX: pca_model.pc is a (numFeatures x k) matrix whose COLUMNS are
    # the components. The original iterated rows (features), mislabeling
    # the loadings; iterate columns instead.
    components = pca_model.pc.toArray()  # shape: (n_features, k)
    loadings: list[dict] = []
    for pc_idx in range(components.shape[1]):
        ranked = sorted(
            zip(feature_cols, components[:, pc_idx].tolist()),
            key=lambda x: abs(x[1]),
            reverse=True,
        )
        loadings.append({
            "pc": pc_idx + 1,
            "top_features": [{"name": n, "loading": float(v)} for n, v in ranked[:5]],
        })

    k = config.analytics.cluster_k
    kmeans = KMeans(k=k, seed=42, featuresCol="pca_features", predictionCol="cluster")
    kmeans_model = kmeans.fit(df_scaled)

    df_clustered = kmeans_model.transform(df_scaled)
    cluster_sizes = (
        df_clustered.groupBy("cluster")
        .count()
        .orderBy("cluster")
        .collect()
    )
    sizes = {int(r["cluster"]): int(r["count"]) for r in cluster_sizes}

    # PERF: clusterCenters() was called inside the loop; fetch it once.
    centroids = [
        {"cluster": i, "centroid": center.tolist()}
        for i, center in enumerate(kmeans_model.clusterCenters())
    ]

    return {
        "pca": {
            "explained_variance": explained,
            "cumulative_variance": cumsum,
            "feature_names": feature_cols,
            "component_loadings": loadings,
            "n_components": len(explained),
            "variance_threshold": variance_threshold,
            "n_components_for_threshold": n_for_threshold,
        },
        "clustering": {
            "k": k,
            "sizes": sizes,
            "centroids": centroids,
            "within_cluster_sum_of_squared_distances": float(
                kmeans_model.summary.trainingCost
            ),
        },
    }
@@ -0,0 +1,184 @@
1
+ """
2
+ Module 4: T-test to compare means of two sets of data.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.sql import DataFrame, SparkSession
9
+ from pyspark.sql.functions import col, avg, stddev, count
10
+ from pyspark.sql.types import DoubleType
11
+
12
+ from ..config import BatchAnalyticsConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def run_t_test(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Perform an independent samples t-test to compare the means of two groups.

    Supports two modes:
    1. Value + group: one numeric column, one categorical column with 2 levels
    2. Two columns: two numeric columns, compare their means

    Uses Welch's t-test (does not assume equal variances). When neither mode
    is configured, falls back to the first numeric column paired with the
    first string column having exactly two distinct values.

    Args:
        spark: Active Spark session (part of the uniform module signature).
        df: Input DataFrame.
        config: Pipeline configuration (ttest_* settings).

    Returns:
        dict with per-group summaries (name/mean/std/n), t_statistic,
        p_value, and mean_difference (mean_a - mean_b).

    Raises:
        ValueError: if no usable column combination can be determined.
    """
    value_col = (config.analytics.ttest_value_column or "").strip()
    group_col = (config.analytics.ttest_group_column or "").strip()
    col_a = (config.analytics.ttest_col_a or "").strip()
    col_b = (config.analytics.ttest_col_b or "").strip()

    # Mode 1: value column + group column (both required).
    if value_col and group_col and value_col in df.columns and group_col in df.columns:
        return _run_t_test_by_group(df, value_col, group_col)

    # Mode 2: two numeric columns.
    if col_a and col_b and col_a in df.columns and col_b in df.columns:
        return _run_t_test_two_columns(df, col_a, col_b)

    # Fallback: first numeric column + first string column with 2 levels.
    numeric_markers = ("double", "int", "float")
    numeric_cols = [
        f.name for f in df.schema.fields
        if any(m in str(f.dataType).lower() for m in numeric_markers)
    ]
    string_cols = [
        f.name for f in df.schema.fields
        if "string" in str(f.dataType).lower()
    ]
    # PERF FIX: the distinct count of a string column does not depend on the
    # numeric column, so scan string columns once instead of re-running the
    # distinct() job for every (numeric, string) pair as before.
    if numeric_cols:
        for sc in string_cols:
            if df.select(sc).distinct().count() == 2:
                logger.info(
                    "Auto-selected t-test: value=%s, group=%s", numeric_cols[0], sc
                )
                return _run_t_test_by_group(df, numeric_cols[0], sc)

    raise ValueError(
        "T-test requires either (BATCH_TTEST_VALUE_COLUMN + BATCH_TTEST_GROUP_COLUMN) "
        "or (BATCH_TTEST_COL_A + BATCH_TTEST_COL_B). "
        f"Available columns: {df.columns}"
    )
72
+
73
+
74
def _run_t_test_by_group(
    df: DataFrame,
    value_col: str,
    group_col: str,
) -> dict[str, Any]:
    """
    T-test: compare the mean of ``value_col`` across two levels of ``group_col``.

    Args:
        df: Input DataFrame.
        value_col: Numeric column whose means are compared.
        group_col: Categorical column; must have exactly two non-null levels.

    Returns:
        Welch's t-test result dict (see _compute_t_test_result).

    Raises:
        ValueError: if ``group_col`` does not have exactly two groups.
    """
    df_num = df.select(
        col(value_col).cast(DoubleType()).alias("_val"),
        col(group_col).cast("string").alias("_grp"),
    ).filter(col("_val").isNotNull() & col("_grp").isNotNull())

    stats = (
        df_num.groupBy("_grp")
        .agg(
            avg("_val").alias("mean"),
            stddev("_val").alias("std"),
            count("_val").alias("n"),
        )
        # FIX: collect() order was nondeterministic, so group_a/group_b (and
        # the sign of the t-statistic) could flip between runs; sort by the
        # group label for reproducible output.
        .orderBy("_grp")
        .collect()
    )

    if len(stats) != 2:
        raise ValueError(
            f"T-test requires exactly 2 groups in {group_col}. Found: {[r['_grp'] for r in stats]}"
        )

    r0, r1 = stats[0], stats[1]
    return _compute_t_test_result(
        group_a=r0["_grp"],
        mean_a=float(r0["mean"]),
        # stddev is null for single-row groups; fall back to 0.0.
        std_a=float(r0["std"] or 0.0),
        n_a=int(r0["n"]),
        group_b=r1["_grp"],
        mean_b=float(r1["mean"]),
        std_b=float(r1["std"] or 0.0),
        n_b=int(r1["n"]),
    )
111
+
112
+
113
def _run_t_test_two_columns(df: DataFrame, col_a: str, col_b: str) -> dict[str, Any]:
    """T-test: compare the means of two numeric columns of the same rows."""
    # Cast both columns to double and keep only rows where both are present.
    pair = df.select(
        col(col_a).cast(DoubleType()).alias("_a"),
        col(col_b).cast(DoubleType()).alias("_b"),
    ).filter(col("_a").isNotNull() & col("_b").isNotNull())

    # One aggregation pass computes mean/std/count for both columns.
    aggregates = [
        avg("_a").alias("mean_a"),
        stddev("_a").alias("std_a"),
        count("_a").alias("n_a"),
        avg("_b").alias("mean_b"),
        stddev("_b").alias("std_b"),
        count("_b").alias("n_b"),
    ]
    summary = pair.agg(*aggregates).collect()[0]

    return _compute_t_test_result(
        group_a=col_a,
        mean_a=float(summary["mean_a"]),
        std_a=float(summary["std_a"] or 0.0),
        n_a=int(summary["n_a"]),
        group_b=col_b,
        mean_b=float(summary["mean_b"]),
        std_b=float(summary["std_b"] or 0.0),
        n_b=int(summary["n_b"]),
    )
139
+
140
+
141
+ def _compute_t_test_result(
142
+ group_a: str,
143
+ mean_a: float,
144
+ std_a: float,
145
+ n_a: int,
146
+ group_b: str,
147
+ mean_b: float,
148
+ std_b: float,
149
+ n_b: int,
150
+ ) -> dict[str, Any]:
151
+ """Compute Welch's t-test from summary statistics."""
152
+ try:
153
+ from scipy import stats
154
+ except ImportError:
155
+ raise ImportError("T-test requires scipy. Install with: pip install scipy")
156
+
157
+ t_stat, p_value = stats.ttest_ind_from_stats(
158
+ mean1=mean_a,
159
+ std1=std_a,
160
+ nobs1=n_a,
161
+ mean2=mean_b,
162
+ std2=std_b,
163
+ nobs2=n_b,
164
+ equal_var=False, # Welch's t-test
165
+ )
166
+
167
+ return {
168
+ "group_a": {
169
+ "name": group_a,
170
+ "mean": mean_a,
171
+ "std": std_a,
172
+ "n": n_a,
173
+ },
174
+ "group_b": {
175
+ "name": group_b,
176
+ "mean": mean_b,
177
+ "std": std_b,
178
+ "n": n_b,
179
+ },
180
+ "t_statistic": float(t_stat),
181
+ "p_value": float(p_value),
182
+ "mean_difference": mean_a - mean_b,
183
+ "test": "Welch",
184
+ }