batch-analytics 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. batch_analytics-0.1.0/PKG-INFO +80 -0
  2. batch_analytics-0.1.0/README.md +61 -0
  3. batch_analytics-0.1.0/pyproject.toml +31 -0
  4. batch_analytics-0.1.0/setup.cfg +4 -0
  5. batch_analytics-0.1.0/src/batch_analytics/__init__.py +44 -0
  6. batch_analytics-0.1.0/src/batch_analytics/__main__.py +5 -0
  7. batch_analytics-0.1.0/src/batch_analytics/analytics/__init__.py +19 -0
  8. batch_analytics-0.1.0/src/batch_analytics/analytics/correlation.py +113 -0
  9. batch_analytics-0.1.0/src/batch_analytics/analytics/linear_regression.py +136 -0
  10. batch_analytics-0.1.0/src/batch_analytics/analytics/pca_clustering.py +143 -0
  11. batch_analytics-0.1.0/src/batch_analytics/analytics/t_test.py +184 -0
  12. batch_analytics-0.1.0/src/batch_analytics/config.py +169 -0
  13. batch_analytics-0.1.0/src/batch_analytics/extract.py +118 -0
  14. batch_analytics-0.1.0/src/batch_analytics/job_runner.py +300 -0
  15. batch_analytics-0.1.0/src/batch_analytics/log.py +101 -0
  16. batch_analytics-0.1.0/src/batch_analytics/modules.py +24 -0
  17. batch_analytics-0.1.0/src/batch_analytics/output/__init__.py +22 -0
  18. batch_analytics-0.1.0/src/batch_analytics/output/base.py +97 -0
  19. batch_analytics-0.1.0/src/batch_analytics/output/clickhouse.py +89 -0
  20. batch_analytics-0.1.0/src/batch_analytics/output/local.py +36 -0
  21. batch_analytics-0.1.0/src/batch_analytics/output/s3.py +82 -0
  22. batch_analytics-0.1.0/src/batch_analytics/transform.py +184 -0
  23. batch_analytics-0.1.0/src/batch_analytics.egg-info/PKG-INFO +80 -0
  24. batch_analytics-0.1.0/src/batch_analytics.egg-info/SOURCES.txt +26 -0
  25. batch_analytics-0.1.0/src/batch_analytics.egg-info/dependency_links.txt +1 -0
  26. batch_analytics-0.1.0/src/batch_analytics.egg-info/entry_points.txt +2 -0
  27. batch_analytics-0.1.0/src/batch_analytics.egg-info/requires.txt +14 -0
  28. batch_analytics-0.1.0/src/batch_analytics.egg-info/top_level.txt +1 -0
@@ -0,0 +1,80 @@
1
+ Metadata-Version: 2.4
2
+ Name: batch-analytics
3
+ Version: 0.1.0
4
+ Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
+ Author: Analytics Team
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: pyspark<3.6,>=3.4
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=7.0; extra == "dev"
12
+ Provides-Extra: s3
13
+ Requires-Dist: boto3>=1.28; extra == "s3"
14
+ Provides-Extra: clickhouse
15
+ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
16
+ Provides-Extra: output
17
+ Requires-Dist: boto3>=1.28; extra == "output"
18
+ Requires-Dist: clickhouse-connect>=0.7; extra == "output"
19
+
20
+ # Batch Analytics
21
+
22
+ PySpark-based analytics pipeline for ClickHouse data: **Extract** → **Transform** → **Stage** → **Analytics**. Designed to run as the main application inside a Spark driver container (invoked by `analytics_runners` via SparkApplication CRD).
23
+
24
+ ## Bundle contents
25
+
26
+ Only the files required for the batch analytics job runner:
27
+
28
+ ```
29
+ analytics/
30
+ ├── pyproject.toml
31
+ ├── requirements-batch.txt
32
+ ├── README.md
33
+ └── src/
34
+ └── batch_analytics/
35
+ ├── __init__.py
36
+ ├── __main__.py # python -m batch_analytics
37
+ ├── job_runner.py # Entry point
38
+ ├── config.py
39
+ ├── extract.py
40
+ ├── transform.py
41
+ ├── log.py
42
+ ├── README.md
43
+ └── analytics/
44
+ ├── __init__.py
45
+ ├── linear_regression.py
46
+ ├── correlation.py
47
+ ├── pca_clustering.py
48
+ └── t_test.py
49
+ ```
50
+
51
+ ## Install
52
+
53
+ ```bash
54
+ pip install -e .
55
+ # or: pip install -r requirements-batch.txt && pip install -e .
56
+ ```
57
+
58
+ ## Run
59
+
60
+ ```bash
61
+ # Via module
62
+ python -m batch_analytics
63
+
64
+ # Via CLI (after pip install -e .)
65
+ batch-analytics
66
+
67
+ # Full pipeline
68
+ batch-analytics
69
+
70
+ # Analytics only (from staged ClickHouse table)
71
+ batch-analytics --from-stage --modules lr corr pca ttest
72
+ ```
73
+
74
+ ## Configuration
75
+
76
+ See `src/batch_analytics/README.md` for environment variables and usage.
77
+
78
+ ## Docker image
79
+
80
+ For Spark on Kubernetes, build an image that includes this package and exposes `job_runner.py` at the path used by `mainApplicationFile` (e.g. `local:///opt/analytics/job_runner.py`).
@@ -0,0 +1,61 @@
1
+ # Batch Analytics
2
+
3
+ PySpark-based analytics pipeline for ClickHouse data: **Extract** → **Transform** → **Stage** → **Analytics**. Designed to run as the main application inside a Spark driver container (invoked by `analytics_runners` via SparkApplication CRD).
4
+
5
+ ## Bundle contents
6
+
7
+ Only the files required for the batch analytics job runner:
8
+
9
+ ```
10
+ analytics/
11
+ ├── pyproject.toml
12
+ ├── requirements-batch.txt
13
+ ├── README.md
14
+ └── src/
15
+ └── batch_analytics/
16
+ ├── __init__.py
17
+ ├── __main__.py # python -m batch_analytics
18
+ ├── job_runner.py # Entry point
19
+ ├── config.py
20
+ ├── extract.py
21
+ ├── transform.py
22
+ ├── log.py
23
+ ├── README.md
24
+ └── analytics/
25
+ ├── __init__.py
26
+ ├── linear_regression.py
27
+ ├── correlation.py
28
+ ├── pca_clustering.py
29
+ └── t_test.py
30
+ ```
31
+
32
+ ## Install
33
+
34
+ ```bash
35
+ pip install -e .
36
+ # or: pip install -r requirements-batch.txt && pip install -e .
37
+ ```
38
+
39
+ ## Run
40
+
41
+ ```bash
42
+ # Via module
43
+ python -m batch_analytics
44
+
45
+ # Via CLI (after pip install -e .)
46
+ batch-analytics
47
+
48
+ # Full pipeline
49
+ batch-analytics
50
+
51
+ # Analytics only (from staged ClickHouse table)
52
+ batch-analytics --from-stage --modules lr corr pca ttest
53
+ ```
54
+
55
+ ## Configuration
56
+
57
+ See `src/batch_analytics/README.md` for environment variables and usage.
58
+
59
+ ## Docker image
60
+
61
+ For Spark on Kubernetes, build an image that includes this package and exposes `job_runner.py` at the path used by `mainApplicationFile` (e.g. `local:///opt/analytics/job_runner.py`).
@@ -0,0 +1,31 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "batch-analytics"
7
+ version = "0.1.0"
8
+ description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ dependencies = [
12
+ "pyspark>=3.4,<3.6",
13
+ ]
14
+ authors = [{ name = "Analytics Team" }]
15
+ license = { text = "MIT" }
16
+
17
+ [project.optional-dependencies]
18
+ dev = ["pytest>=7.0"]
19
+ s3 = ["boto3>=1.28"]
20
+ clickhouse = ["clickhouse-connect>=0.7"]
21
+ output = ["boto3>=1.28", "clickhouse-connect>=0.7"]
22
+
23
+ [project.scripts]
24
+ batch-analytics = "batch_analytics.job_runner:main"
25
+
26
+ [tool.setuptools]
27
+ package-dir = { "" = "src" }
28
+
29
+ [tool.setuptools.packages.find]
30
+ where = ["src"]
31
+ include = ["batch_analytics*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,44 @@
1
+ """
2
+ Batch analytics pipeline: Extract, Transform, Log stages + analytics modules.
3
+
4
+ Stages:
5
+ - Extract: Load data from ClickHouse via Spark ClickHouse connector or JDBC
6
+ - Transform: Deduplicate and stage data (parquet/delta/clickhouse)
7
+ - Log: Persist run metadata and analytics results
8
+
9
+ Analytics modules:
10
+ - Module 1: Linear regression (XY) with slope comparison across groups
11
+ - Module 2: Multi-feature correlation
12
+ - Module 3: PCA + KMeans clustering
13
+ """
14
+
15
+ from .config import BatchAnalyticsConfig, SparkK8sConfig
16
+ from .extract import extract_all, extract_table, extract_unified
17
+ from .transform import (
18
+ extract_anchor_id,
19
+ load_staged,
20
+ remove_duplicates,
21
+ stage_to_clickhouse,
22
+ transform,
23
+ transform_and_stage,
24
+ )
25
+ from .log import log_analytics_artifacts, log_run
26
+ from .job_runner import run_pipeline, create_spark_session
27
+
28
+ __all__ = [
29
+ "BatchAnalyticsConfig",
30
+ "SparkK8sConfig",
31
+ "extract_anchor_id",
32
+ "extract_all",
33
+ "extract_table",
34
+ "extract_unified",
35
+ "remove_duplicates",
36
+ "stage_to_clickhouse",
37
+ "transform",
38
+ "transform_and_stage",
39
+ "load_staged",
40
+ "log_run",
41
+ "log_analytics_artifacts",
42
+ "run_pipeline",
43
+ "create_spark_session",
44
+ ]
@@ -0,0 +1,5 @@
1
+ """Allow python -m batch_analytics to run the job runner."""
2
+
3
+ from .job_runner import main
4
+ import sys
5
+ sys.exit(main())
@@ -0,0 +1,19 @@
1
+ """
2
+ Analytics modules for batch analytics pipeline.
3
+ - Module 1: Linear regression (XY) and slope comparison
4
+ - Module 2: Multi-feature correlation
5
+ - Module 3: PCA and clustering
6
+ - Module 4: T-test to compare means of two groups
7
+ """
8
+
9
+ from .linear_regression import run_linear_regression
10
+ from .correlation import run_correlation
11
+ from .pca_clustering import run_pca_clustering
12
+ from .t_test import run_t_test
13
+
14
+ __all__ = [
15
+ "run_linear_regression",
16
+ "run_correlation",
17
+ "run_pca_clustering",
18
+ "run_t_test",
19
+ ]
@@ -0,0 +1,113 @@
1
+ """
2
+ Module 2: Multi-feature correlation analysis.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.ml.feature import VectorAssembler
9
+ from pyspark.ml.stat import Correlation
10
+ from pyspark.sql import DataFrame, SparkSession
11
+
12
+ from ..config import BatchAnalyticsConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def run_correlation(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Compute a Pearson correlation matrix over multiple numeric features.

    Feature columns come from ``config.analytics.corr_features`` (comma
    separated); when empty, all numeric columns of ``df`` are auto-selected.
    Pairs with ``|corr| >= config.analytics.corr_threshold`` are flagged
    as potentially collinear.

    Returns:
        - correlation_matrix: full matrix (list of lists)
        - feature_names: column order of the matrix
        - high_corr_pairs: pairs with |corr| >= threshold
        - threshold: the threshold that was applied

    Raises:
        ValueError: if any configured feature column is missing from ``df``.
    """
    threshold = config.analytics.corr_threshold

    feature_cols = [
        c.strip()
        for c in config.analytics.corr_features.split(",")
        if c.strip()
    ]

    if not feature_cols:
        # Auto-select numeric columns. isinstance against NumericType covers
        # every numeric Spark type (incl. short/byte/decimal), unlike the
        # previous substring matching on the type name.
        from pyspark.sql.types import NumericType

        feature_cols = [
            f.name for f in df.schema.fields if isinstance(f.dataType, NumericType)
        ]
        logger.info("Auto-selected %d numeric columns for correlation", len(feature_cols))

    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(
            f"Correlation features not found: {missing}. "
            f"Available: {df.columns[:15]}..."
        )

    if len(feature_cols) < 2:
        # Degenerate input: one feature correlates perfectly with itself;
        # zero features yield an empty matrix (not a bogus [[1.0]]).
        return {
            "correlation_matrix": [[1.0]] if feature_cols else [],
            "feature_names": feature_cols,
            "high_corr_pairs": [],
            "threshold": threshold,
        }

    # Cast to double and drop rows with nulls so the assembler sees clean input.
    from pyspark.sql.functions import col
    from pyspark.sql.types import DoubleType

    df_num = df.select(
        *[col(c).cast(DoubleType()).alias(c) for c in feature_cols]
    ).dropna()

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features",
        handleInvalid="skip",
    )
    df_vec = assembler.transform(df_num)

    corr_matrix = Correlation.corr(df_vec, "features", "pearson")
    corr_row = corr_matrix.head()
    if corr_row is None:
        # No rows survived null filtering.
        return {
            "correlation_matrix": [],
            "feature_names": feature_cols,
            "high_corr_pairs": [],
            "threshold": threshold,
        }

    # DenseMatrix.toArray() already returns an ndarray; the former
    # np.asarray() wrap was redundant.
    matrix = corr_row[0].toArray()

    # Collect upper-triangle pairs above the collinearity threshold.
    high_pairs: list[dict] = []
    n = len(feature_cols)
    for i in range(n):
        for j in range(i + 1, n):
            val = float(matrix[i, j])
            if abs(val) >= threshold:
                high_pairs.append({
                    "feature_a": feature_cols[i],
                    "feature_b": feature_cols[j],
                    "correlation": val,
                })

    return {
        "correlation_matrix": matrix.tolist(),
        "feature_names": feature_cols,
        "high_corr_pairs": high_pairs,
        "threshold": threshold,
    }
@@ -0,0 +1,136 @@
1
+ """
2
+ Module 1: Simple linear regression on XY data with slope comparison across groups.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.ml.feature import VectorAssembler
9
+ from pyspark.ml.regression import LinearRegression
10
+ from pyspark.sql import DataFrame, SparkSession
11
+ from pyspark.sql.functions import col, concat_ws
12
+ from pyspark.sql.types import DoubleType
13
+
14
+ from ..config import BatchAnalyticsConfig
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def _ensure_numeric(df: DataFrame, x_col: str, y_col: str) -> DataFrame:
    """Cast X and Y columns to double; keep other columns; drop null X/Y rows."""
    passthrough = [c for c in df.columns if c not in (x_col, y_col)]
    casted = df.select(
        col(x_col).cast(DoubleType()).alias(x_col),
        col(y_col).cast(DoubleType()).alias(y_col),
        *passthrough,
    )
    return casted.filter(col(x_col).isNotNull() & col(y_col).isNotNull())
26
+
27
+
28
def run_linear_regression(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Run simple linear regression: Y ~ X.
    If group columns are configured, fit separate models per group and compare slopes.

    Returns:
        - global: single regression over all data (slope, intercept, r2)
        - slopes: per-group slope/intercept/r2/row-count keyed by "a|b" group key
        - slope_comparison: pairwise slope differences (when multiple groups)

    Raises:
        ValueError: if neither configured nor fallback X/Y columns can be found.
    """
    x_col = config.analytics.lr_x_column
    y_col = config.analytics.lr_y_column
    group_cols = [
        c.strip()
        for c in config.analytics.lr_group_columns.split(",")
        if c.strip()
    ]

    # Fall back to the first two numeric columns when the configured ones are
    # absent. isinstance against NumericType also covers float/long/decimal;
    # the previous "double"/"int" substring check skipped float columns and
    # disagreed with the auto-selection in the other analytics modules.
    if x_col not in df.columns or y_col not in df.columns:
        from pyspark.sql.types import NumericType

        numeric_cols = [
            f.name for f in df.schema.fields if isinstance(f.dataType, NumericType)
        ]
        if len(numeric_cols) >= 2:
            x_col, y_col = numeric_cols[0], numeric_cols[1]
            logger.warning(
                "LR columns not found, using first two numeric: %s, %s",
                x_col,
                y_col,
            )
        else:
            raise ValueError(
                f"Could not find LR columns (x={x_col}, y={y_col}). "
                "Specify BATCH_LR_X_COLUMN and BATCH_LR_Y_COLUMN."
            )

    df = _ensure_numeric(df, x_col, y_col)

    assembler = VectorAssembler(inputCols=[x_col], outputCol="features", handleInvalid="skip")
    df_vec = assembler.transform(df).withColumnRenamed(y_col, "label")

    # Global regression over all data.
    lr = LinearRegression(featuresCol="features", labelCol="label")
    global_model = lr.fit(df_vec)

    result: dict[str, Any] = {
        "global": {
            "slope": float(global_model.coefficients[0]),
            "intercept": float(global_model.intercept),
            "r2": float(global_model.summary.r2),
        },
        "slopes": {},
        "slope_comparison": [],
    }

    if not group_cols or not all(g in df.columns for g in group_cols):
        return result

    # Per-group regression: build a composite string key "val1|val2|...".
    group_key = "_group_key"
    df_grouped = df_vec.withColumn(
        group_key,
        concat_ws("|", *[col(g).cast("string") for g in group_cols]),
    )
    groups = df_grouped.select(group_key).distinct().collect()

    slopes_by_group: dict[str, dict] = {}
    for row in groups:
        key_str = row[group_key]
        if key_str is None:
            continue
        key_parts = key_str.split("|")

        sub_df = df_grouped.filter(col(group_key) == key_str).drop(group_key)
        # Count once and reuse; the original called sub_df.count() twice,
        # triggering two Spark actions per group.
        n_rows = sub_df.count()
        if n_rows < 2:
            continue

        model = lr.fit(sub_df)
        slopes_by_group[key_str] = {
            "slope": float(model.coefficients[0]),
            "intercept": float(model.intercept),
            "r2": float(model.summary.r2),
            "n": n_rows,
            "group": dict(zip(group_cols, key_parts)),
        }

    result["slopes"] = slopes_by_group

    # Slope comparison: pairwise differences between every pair of groups.
    keys = list(slopes_by_group.keys())
    for i in range(len(keys)):
        for j in range(i + 1, len(keys)):
            s1 = slopes_by_group[keys[i]]["slope"]
            s2 = slopes_by_group[keys[j]]["slope"]
            result["slope_comparison"].append({
                "group_a": keys[i],
                "group_b": keys[j],
                "slope_a": s1,
                "slope_b": s2,
                "slope_diff": s1 - s2,
            })

    return result
@@ -0,0 +1,143 @@
1
+ """
2
+ Module 3: PCA for key feature identification and clustering on staged data.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.ml.clustering import KMeans
9
+ from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
10
+ from pyspark.sql import DataFrame, SparkSession
11
+
12
+ from ..config import BatchAnalyticsConfig
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def run_pca_clustering(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Run PCA to identify key features (principal components) and KMeans clustering.

    Features are standardized (zero mean, unit variance) before PCA; KMeans
    runs on the PCA-projected features.

    Returns:
        - pca: explained/cumulative variance, feature loadings per PC, number
          of components, and how many leading components reach the configured
          variance threshold
        - clustering: cluster sizes, centroids (in PCA space), and the
          within-cluster sum of squared distances

    Raises:
        ValueError: if any configured feature column is missing from ``df``.
    """
    feature_cols = [
        c.strip()
        for c in config.analytics.pca_features.split(",")
        if c.strip()
    ]

    if not feature_cols:
        # Auto-select numeric columns; isinstance against NumericType is
        # robust to all numeric Spark types, unlike name substring checks.
        from pyspark.sql.types import NumericType

        feature_cols = [
            f.name for f in df.schema.fields if isinstance(f.dataType, NumericType)
        ]
        logger.info("Auto-selected %d numeric columns for PCA/clustering", len(feature_cols))

    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(
            f"PCA features not found: {missing}. Available: {df.columns[:15]}..."
        )

    if len(feature_cols) < 2:
        # Degenerate input: one feature explains all variance; zero features
        # yield an empty result instead of a bogus [1.0].
        return {
            "pca": {
                "explained_variance": [1.0] if feature_cols else [],
                "feature_names": feature_cols,
            },
            "clustering": {"k": config.analytics.cluster_k, "sizes": []},
        }

    from pyspark.sql.functions import col
    from pyspark.sql.types import DoubleType

    df_num = df.select(
        *[col(c).cast(DoubleType()).alias(c) for c in feature_cols]
    ).dropna()

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features_raw",
        handleInvalid="skip",
    )
    df_vec = assembler.transform(df_num)

    # Standardize so no single feature dominates the principal components.
    scaler = StandardScaler(
        inputCol="features_raw",
        outputCol="features",
        withStd=True,
        withMean=True,
    )
    scaler_model = scaler.fit(df_vec)
    df_scaled = scaler_model.transform(df_vec)

    # Fit PCA with as many components as available (capped at 20).
    n_comp_max = min(len(feature_cols), 20)
    pca = PCA(k=n_comp_max, inputCol="features", outputCol="pca_features")
    pca_model = pca.fit(df_scaled)

    explained = pca_model.explainedVariance.toArray().tolist()
    cumsum = []
    running = 0.0
    for v in explained:
        running += v
        cumsum.append(running)

    # The threshold was previously read but never used; report how many
    # leading components are needed to reach the target variance.
    variance_threshold = config.analytics.pca_variance_threshold
    n_components_for_threshold = next(
        (i + 1 for i, c in enumerate(cumsum) if c >= variance_threshold),
        len(cumsum),
    )

    # Feature loadings per PC. pc.toArray() is (numFeatures x k) with the
    # COMPONENTS as columns, so iterate the transpose; the original iterated
    # rows, mislabeling per-feature rows (truncated by zip) as components.
    components = pca_model.pc.toArray()
    loadings: list[dict] = []
    for idx, comp in enumerate(components.T):
        ranked = sorted(
            zip(feature_cols, comp.tolist()),
            key=lambda pair: abs(pair[1]),
            reverse=True,
        )
        loadings.append({
            "pc": idx + 1,
            "top_features": [{"name": n, "loading": float(v)} for n, v in ranked[:5]],
        })

    k = config.analytics.cluster_k
    kmeans = KMeans(k=k, seed=42, featuresCol="pca_features", predictionCol="cluster")
    kmeans_model = kmeans.fit(df_scaled)

    df_clustered = kmeans_model.transform(df_scaled)
    cluster_sizes = (
        df_clustered.groupBy("cluster")
        .count()
        .orderBy("cluster")
        .collect()
    )
    sizes = {int(r["cluster"]): int(r["count"]) for r in cluster_sizes}

    # Fetch the centers once instead of calling clusterCenters() per iteration.
    centers = kmeans_model.clusterCenters()
    centroids = [
        {"cluster": i, "centroid": center.tolist()} for i, center in enumerate(centers)
    ]

    return {
        "pca": {
            "explained_variance": explained,
            "cumulative_variance": cumsum,
            "feature_names": feature_cols,
            "component_loadings": loadings,
            "n_components": len(explained),
            "n_components_for_threshold": n_components_for_threshold,
            "variance_threshold": variance_threshold,
        },
        "clustering": {
            "k": k,
            "sizes": sizes,
            "centroids": centroids,
            "within_cluster_sum_of_squared_distances": float(
                kmeans_model.summary.trainingCost
            ),
        },
    }