batch-analytics 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- batch_analytics-0.1.0/PKG-INFO +80 -0
- batch_analytics-0.1.0/README.md +61 -0
- batch_analytics-0.1.0/pyproject.toml +31 -0
- batch_analytics-0.1.0/setup.cfg +4 -0
- batch_analytics-0.1.0/src/batch_analytics/__init__.py +44 -0
- batch_analytics-0.1.0/src/batch_analytics/__main__.py +5 -0
- batch_analytics-0.1.0/src/batch_analytics/analytics/__init__.py +19 -0
- batch_analytics-0.1.0/src/batch_analytics/analytics/correlation.py +113 -0
- batch_analytics-0.1.0/src/batch_analytics/analytics/linear_regression.py +136 -0
- batch_analytics-0.1.0/src/batch_analytics/analytics/pca_clustering.py +143 -0
- batch_analytics-0.1.0/src/batch_analytics/analytics/t_test.py +184 -0
- batch_analytics-0.1.0/src/batch_analytics/config.py +169 -0
- batch_analytics-0.1.0/src/batch_analytics/extract.py +118 -0
- batch_analytics-0.1.0/src/batch_analytics/job_runner.py +300 -0
- batch_analytics-0.1.0/src/batch_analytics/log.py +101 -0
- batch_analytics-0.1.0/src/batch_analytics/modules.py +24 -0
- batch_analytics-0.1.0/src/batch_analytics/output/__init__.py +22 -0
- batch_analytics-0.1.0/src/batch_analytics/output/base.py +97 -0
- batch_analytics-0.1.0/src/batch_analytics/output/clickhouse.py +89 -0
- batch_analytics-0.1.0/src/batch_analytics/output/local.py +36 -0
- batch_analytics-0.1.0/src/batch_analytics/output/s3.py +82 -0
- batch_analytics-0.1.0/src/batch_analytics/transform.py +184 -0
- batch_analytics-0.1.0/src/batch_analytics.egg-info/PKG-INFO +80 -0
- batch_analytics-0.1.0/src/batch_analytics.egg-info/SOURCES.txt +26 -0
- batch_analytics-0.1.0/src/batch_analytics.egg-info/dependency_links.txt +1 -0
- batch_analytics-0.1.0/src/batch_analytics.egg-info/entry_points.txt +2 -0
- batch_analytics-0.1.0/src/batch_analytics.egg-info/requires.txt +14 -0
- batch_analytics-0.1.0/src/batch_analytics.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: batch-analytics
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
|
+
Author: Analytics Team
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pyspark<3.6,>=3.4
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
12
|
+
Provides-Extra: s3
|
|
13
|
+
Requires-Dist: boto3>=1.28; extra == "s3"
|
|
14
|
+
Provides-Extra: clickhouse
|
|
15
|
+
Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
16
|
+
Provides-Extra: output
|
|
17
|
+
Requires-Dist: boto3>=1.28; extra == "output"
|
|
18
|
+
Requires-Dist: clickhouse-connect>=0.7; extra == "output"
|
|
19
|
+
|
|
20
|
+
# Batch Analytics
|
|
21
|
+
|
|
22
|
+
PySpark-based analytics pipeline for ClickHouse data: **Extract** → **Transform** → **Stage** → **Analytics**. Designed to run as the main application inside a Spark driver container (invoked by `analytics_runners` via SparkApplication CRD).
|
|
23
|
+
|
|
24
|
+
## Bundle contents
|
|
25
|
+
|
|
26
|
+
Only the files required for the batch analytics job runner:
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
analytics/
|
|
30
|
+
├── pyproject.toml
|
|
31
|
+
├── requirements-batch.txt
|
|
32
|
+
├── README.md
|
|
33
|
+
└── src/
|
|
34
|
+
└── batch_analytics/
|
|
35
|
+
├── __init__.py
|
|
36
|
+
├── __main__.py # python -m batch_analytics
|
|
37
|
+
├── job_runner.py # Entry point
|
|
38
|
+
├── config.py
|
|
39
|
+
├── extract.py
|
|
40
|
+
├── transform.py
|
|
41
|
+
├── log.py
|
|
42
|
+
├── README.md
|
|
43
|
+
└── analytics/
|
|
44
|
+
├── __init__.py
|
|
45
|
+
├── linear_regression.py
|
|
46
|
+
├── correlation.py
|
|
47
|
+
├── pca_clustering.py
|
|
48
|
+
└── t_test.py
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Install
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install -e .
|
|
55
|
+
# or: pip install -r requirements-batch.txt && pip install -e .
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Run
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
# Via module
|
|
62
|
+
python -m batch_analytics
|
|
63
|
+
|
|
64
|
+
# Via CLI (after pip install -e .)
|
|
65
|
+
batch-analytics
|
|
66
|
+
|
|
67
|
+
# Full pipeline
|
|
68
|
+
batch-analytics
|
|
69
|
+
|
|
70
|
+
# Analytics only (from staged ClickHouse table)
|
|
71
|
+
batch-analytics --from-stage --modules lr corr pca ttest
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Configuration
|
|
75
|
+
|
|
76
|
+
See `src/batch_analytics/README.md` for environment variables and usage.
|
|
77
|
+
|
|
78
|
+
## Docker image
|
|
79
|
+
|
|
80
|
+
For Spark on Kubernetes, build an image that includes this package and exposes `job_runner.py` at the path used by `mainApplicationFile` (e.g. `local:///opt/analytics/job_runner.py`).
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Batch Analytics
|
|
2
|
+
|
|
3
|
+
PySpark-based analytics pipeline for ClickHouse data: **Extract** → **Transform** → **Stage** → **Analytics**. Designed to run as the main application inside a Spark driver container (invoked by `analytics_runners` via SparkApplication CRD).
|
|
4
|
+
|
|
5
|
+
## Bundle contents
|
|
6
|
+
|
|
7
|
+
Only the files required for the batch analytics job runner:
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
analytics/
|
|
11
|
+
├── pyproject.toml
|
|
12
|
+
├── requirements-batch.txt
|
|
13
|
+
├── README.md
|
|
14
|
+
└── src/
|
|
15
|
+
└── batch_analytics/
|
|
16
|
+
├── __init__.py
|
|
17
|
+
├── __main__.py # python -m batch_analytics
|
|
18
|
+
├── job_runner.py # Entry point
|
|
19
|
+
├── config.py
|
|
20
|
+
├── extract.py
|
|
21
|
+
├── transform.py
|
|
22
|
+
├── log.py
|
|
23
|
+
├── README.md
|
|
24
|
+
└── analytics/
|
|
25
|
+
├── __init__.py
|
|
26
|
+
├── linear_regression.py
|
|
27
|
+
├── correlation.py
|
|
28
|
+
├── pca_clustering.py
|
|
29
|
+
└── t_test.py
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install -e .
|
|
36
|
+
# or: pip install -r requirements-batch.txt && pip install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Run
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Via module
|
|
43
|
+
python -m batch_analytics
|
|
44
|
+
|
|
45
|
+
# Via CLI (after pip install -e .)
|
|
46
|
+
batch-analytics
|
|
47
|
+
|
|
48
|
+
# Full pipeline
|
|
49
|
+
batch-analytics
|
|
50
|
+
|
|
51
|
+
# Analytics only (from staged ClickHouse table)
|
|
52
|
+
batch-analytics --from-stage --modules lr corr pca ttest
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Configuration
|
|
56
|
+
|
|
57
|
+
See `src/batch_analytics/README.md` for environment variables and usage.
|
|
58
|
+
|
|
59
|
+
## Docker image
|
|
60
|
+
|
|
61
|
+
For Spark on Kubernetes, build an image that includes this package and exposes `job_runner.py` at the path used by `mainApplicationFile` (e.g. `local:///opt/analytics/job_runner.py`).
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "batch-analytics"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"pyspark>=3.4,<3.6",
|
|
13
|
+
]
|
|
14
|
+
authors = [{ name = "Analytics Team" }]
|
|
15
|
+
license = { text = "MIT" }
|
|
16
|
+
|
|
17
|
+
[project.optional-dependencies]
|
|
18
|
+
dev = ["pytest>=7.0"]
|
|
19
|
+
s3 = ["boto3>=1.28"]
|
|
20
|
+
clickhouse = ["clickhouse-connect>=0.7"]
|
|
21
|
+
output = ["boto3>=1.28", "clickhouse-connect>=0.7"]
|
|
22
|
+
|
|
23
|
+
[project.scripts]
|
|
24
|
+
batch-analytics = "batch_analytics.job_runner:main"
|
|
25
|
+
|
|
26
|
+
[tool.setuptools]
|
|
27
|
+
package-dir = { "" = "src" }
|
|
28
|
+
|
|
29
|
+
[tool.setuptools.packages.find]
|
|
30
|
+
where = ["src"]
|
|
31
|
+
include = ["batch_analytics*"]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Batch analytics pipeline: Extract, Transform, Log stages + analytics modules.
|
|
3
|
+
|
|
4
|
+
Stages:
|
|
5
|
+
- Extract: Load data from ClickHouse via Spark ClickHouse connector or JDBC
|
|
6
|
+
- Transform: Deduplicate and stage data (parquet/delta/clickhouse)
|
|
7
|
+
- Log: Persist run metadata and analytics results
|
|
8
|
+
|
|
9
|
+
Analytics modules:
|
|
10
|
+
- Module 1: Linear regression (XY) with slope comparison across groups
|
|
11
|
+
- Module 2: Multi-feature correlation
|
|
12
|
+
- Module 3: PCA + KMeans clustering
- Module 4: T-test to compare means of two groups
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from .config import BatchAnalyticsConfig, SparkK8sConfig
|
|
16
|
+
from .extract import extract_all, extract_table, extract_unified
|
|
17
|
+
from .transform import (
|
|
18
|
+
extract_anchor_id,
|
|
19
|
+
load_staged,
|
|
20
|
+
remove_duplicates,
|
|
21
|
+
stage_to_clickhouse,
|
|
22
|
+
transform,
|
|
23
|
+
transform_and_stage,
|
|
24
|
+
)
|
|
25
|
+
from .log import log_analytics_artifacts, log_run
|
|
26
|
+
from .job_runner import run_pipeline, create_spark_session
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"BatchAnalyticsConfig",
|
|
30
|
+
"SparkK8sConfig",
|
|
31
|
+
"extract_anchor_id",
|
|
32
|
+
"extract_all",
|
|
33
|
+
"extract_table",
|
|
34
|
+
"extract_unified",
|
|
35
|
+
"remove_duplicates",
|
|
36
|
+
"stage_to_clickhouse",
|
|
37
|
+
"transform",
|
|
38
|
+
"transform_and_stage",
|
|
39
|
+
"load_staged",
|
|
40
|
+
"log_run",
|
|
41
|
+
"log_analytics_artifacts",
|
|
42
|
+
"run_pipeline",
|
|
43
|
+
"create_spark_session",
|
|
44
|
+
]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Analytics modules for batch analytics pipeline.
|
|
3
|
+
- Module 1: Linear regression (XY) and slope comparison
|
|
4
|
+
- Module 2: Multi-feature correlation
|
|
5
|
+
- Module 3: PCA and clustering
|
|
6
|
+
- Module 4: T-test to compare means of two groups
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .linear_regression import run_linear_regression
|
|
10
|
+
from .correlation import run_correlation
|
|
11
|
+
from .pca_clustering import run_pca_clustering
|
|
12
|
+
from .t_test import run_t_test
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"run_linear_regression",
|
|
16
|
+
"run_correlation",
|
|
17
|
+
"run_pca_clustering",
|
|
18
|
+
"run_t_test",
|
|
19
|
+
]
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module 2: Multi-feature correlation analysis.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pyspark.ml.feature import VectorAssembler
|
|
9
|
+
from pyspark.ml.stat import Correlation
|
|
10
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
11
|
+
|
|
12
|
+
from ..config import BatchAnalyticsConfig
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def run_correlation(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Compute a Pearson correlation matrix over multiple numeric features.

    Feature columns come from ``config.analytics.corr_features``
    (comma-separated); when empty, all numeric columns of ``df`` are
    auto-selected from the schema.

    Args:
        spark: Active session (not used directly; kept so all analytics
            modules share the same signature).
        df: Input DataFrame containing the feature columns.
        config: Pipeline configuration (``corr_features``, ``corr_threshold``).

    Returns:
        dict with:
        - correlation_matrix: full matrix (list of lists)
        - feature_names: column order
        - high_corr_pairs: pairs with |corr| >= threshold (collinearity candidates)
        - threshold: used threshold

    Raises:
        ValueError: if any configured feature column is missing from ``df``.
    """
    feature_cols = [
        c.strip()
        for c in config.analytics.corr_features.split(",")
        if c.strip()
    ]

    if not feature_cols:
        # Auto-select numeric columns by inspecting the schema type names.
        numeric_markers = ("double", "int", "long", "float")
        feature_cols = [
            f.name
            for f in df.schema.fields
            if any(m in str(f.dataType).lower() for m in numeric_markers)
        ]
        logger.info("Auto-selected %d numeric columns for correlation", len(feature_cols))

    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(
            f"Correlation features not found: {missing}. "
            f"Available: {df.columns[:15]}..."
        )

    if len(feature_cols) < 2:
        # Degenerate input: a 1x1 identity matrix for a single feature, and an
        # empty matrix when no numeric column was found. (The previous version
        # returned [[1.0]] even with zero features, which contradicted the
        # empty feature_names list.)
        return {
            "correlation_matrix": [[1.0]] if feature_cols else [],
            "feature_names": feature_cols,
            "high_corr_pairs": [],
            "threshold": config.analytics.corr_threshold,
        }

    # Cast to double and drop rows containing nulls so the assembler sees
    # clean, uniformly typed input.
    from pyspark.sql.functions import col
    from pyspark.sql.types import DoubleType

    df_num = df.select(
        *[col(c).cast(DoubleType()).alias(c) for c in feature_cols]
    ).dropna()

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features",
        handleInvalid="skip",
    )
    df_vec = assembler.transform(df_num)

    corr_matrix = Correlation.corr(df_vec, "features", "pearson")
    corr_row = corr_matrix.head()
    if corr_row is None:
        # No rows survived the null filter; report an empty result.
        return {
            "correlation_matrix": [],
            "feature_names": feature_cols,
            "high_corr_pairs": [],
            "threshold": config.analytics.corr_threshold,
        }

    import numpy as np

    matrix = np.asarray(corr_row[0].toArray())

    threshold = config.analytics.corr_threshold
    high_pairs: list[dict] = []
    n = len(feature_cols)
    # Upper triangle only: each unordered pair reported once.
    for i in range(n):
        for j in range(i + 1, n):
            val = float(matrix[i, j])
            if abs(val) >= threshold:
                high_pairs.append({
                    "feature_a": feature_cols[i],
                    "feature_b": feature_cols[j],
                    "correlation": val,
                })

    return {
        "correlation_matrix": matrix.tolist(),
        "feature_names": feature_cols,
        "high_corr_pairs": high_pairs,
        "threshold": threshold,
    }
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module 1: Simple linear regression on XY data with slope comparison across groups.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pyspark.ml.feature import VectorAssembler
|
|
9
|
+
from pyspark.ml.regression import LinearRegression
|
|
10
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
11
|
+
from pyspark.sql.functions import col, concat_ws
|
|
12
|
+
from pyspark.sql.types import DoubleType
|
|
13
|
+
|
|
14
|
+
from ..config import BatchAnalyticsConfig
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _ensure_numeric(df: DataFrame, x_col: str, y_col: str) -> DataFrame:
    """Cast the X and Y columns to double and drop rows where either is null."""
    passthrough = [c for c in df.columns if c not in (x_col, y_col)]
    casted = df.select(
        col(x_col).cast(DoubleType()).alias(x_col),
        col(y_col).cast(DoubleType()).alias(y_col),
        *passthrough,
    )
    return casted.filter(col(x_col).isNotNull() & col(y_col).isNotNull())
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def run_linear_regression(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Run simple linear regression: Y ~ X.

    If group columns are configured, fit separate models per group and
    compare slopes pairwise.

    Args:
        spark: Active session (not used directly; kept so all analytics
            modules share the same signature).
        df: Input DataFrame.
        config: Pipeline configuration (``lr_x_column``, ``lr_y_column``,
            ``lr_group_columns``).

    Returns:
        dict with:
        - global: slope/intercept/r2 of a single regression over all data
        - slopes: per-group slope, intercept, r2, row count, and group values
        - slope_comparison: pairwise slope differences (when multiple groups)

    Raises:
        ValueError: if the configured X/Y columns are absent and no two
            numeric fallback columns exist.
    """
    x_col = config.analytics.lr_x_column
    y_col = config.analytics.lr_y_column
    group_cols = [
        c.strip()
        for c in config.analytics.lr_group_columns.split(",")
        if c.strip()
    ]

    # Fall back to the first two numeric columns when the configured ones are
    # absent. Numeric detection now also covers long/float, matching the
    # correlation and PCA modules.
    if x_col not in df.columns or y_col not in df.columns:
        numeric_markers = ("double", "int", "long", "float")
        numeric_cols = [
            f.name
            for f in df.schema.fields
            if any(m in str(f.dataType).lower() for m in numeric_markers)
        ]
        if len(numeric_cols) >= 2:
            x_col, y_col = numeric_cols[0], numeric_cols[1]
            logger.warning(
                "LR columns not found, using first two numeric: %s, %s",
                x_col,
                y_col,
            )
        else:
            raise ValueError(
                f"Could not find LR columns (x={x_col}, y={y_col}). "
                "Specify BATCH_LR_X_COLUMN and BATCH_LR_Y_COLUMN."
            )

    df = _ensure_numeric(df, x_col, y_col)

    assembler = VectorAssembler(inputCols=[x_col], outputCol="features", handleInvalid="skip")
    df_vec = assembler.transform(df).withColumnRenamed(y_col, "label")

    # Global regression over all data.
    lr = LinearRegression(featuresCol="features", labelCol="label")
    global_model = lr.fit(df_vec)

    result: dict[str, Any] = {
        "global": {
            "slope": float(global_model.coefficients[0]),
            "intercept": float(global_model.intercept),
            "r2": float(global_model.summary.r2),
        },
        "slopes": {},
        "slope_comparison": [],
    }

    if not group_cols or not all(g in df.columns for g in group_cols):
        return result

    # Per-group regression: collapse multi-column groups into one composite
    # string key so a single distinct() pass enumerates them.
    group_key = "_group_key"
    df_grouped = df_vec.withColumn(
        group_key,
        concat_ws("|", *[col(g).cast("string") for g in group_cols]),
    )
    groups = df_grouped.select(group_key).distinct().collect()

    slopes_by_group: dict[str, dict] = {}
    for row in groups:
        key_str = row[group_key]
        if key_str is None:
            continue
        key_parts = key_str.split("|")

        sub_df = df_grouped.filter(col(group_key) == key_str).drop(group_key)
        # count() is a Spark action: run it once and reuse the value
        # (previously it was executed twice per group).
        n_rows = sub_df.count()
        if n_rows < 2:
            continue

        model = lr.fit(sub_df)
        slopes_by_group[key_str] = {
            "slope": float(model.coefficients[0]),
            "intercept": float(model.intercept),
            "r2": float(model.summary.r2),
            "n": n_rows,
            "group": dict(zip(group_cols, key_parts)),
        }

    result["slopes"] = slopes_by_group

    # Slope comparison: pairwise differences across groups (upper triangle).
    keys = list(slopes_by_group.keys())
    for i in range(len(keys)):
        for j in range(i + 1, len(keys)):
            s1 = slopes_by_group[keys[i]]["slope"]
            s2 = slopes_by_group[keys[j]]["slope"]
            result["slope_comparison"].append({
                "group_a": keys[i],
                "group_b": keys[j],
                "slope_a": s1,
                "slope_b": s2,
                "slope_diff": s1 - s2,
            })

    return result
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module 3: PCA for key feature identification and clustering on staged data.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pyspark.ml.clustering import KMeans
|
|
9
|
+
from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
|
|
10
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
11
|
+
|
|
12
|
+
from ..config import BatchAnalyticsConfig
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def run_pca_clustering(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> dict[str, Any]:
    """
    Run PCA to identify key features (principal components) and KMeans clustering.

    Features are standardized (zero mean, unit variance) before PCA;
    KMeans runs on the PCA-projected features.

    Args:
        spark: Active session (not used directly; kept so all analytics
            modules share the same signature).
        df: Input DataFrame.
        config: Pipeline configuration (``pca_features``, ``cluster_k``).

    Returns:
        dict with:
        - pca: explained variance, cumulative variance, feature loadings per PC
        - clustering: cluster sizes, centroids (in PCA space), training cost

    Raises:
        ValueError: if any configured feature column is missing from ``df``.
    """
    feature_cols = [
        c.strip()
        for c in config.analytics.pca_features.split(",")
        if c.strip()
    ]

    if not feature_cols:
        # Auto-select numeric columns by inspecting the schema type names.
        numeric_markers = ("double", "int", "long", "float")
        feature_cols = [
            f.name
            for f in df.schema.fields
            if any(m in str(f.dataType).lower() for m in numeric_markers)
        ]
        logger.info("Auto-selected %d numeric columns for PCA/clustering", len(feature_cols))

    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(
            f"PCA features not found: {missing}. Available: {df.columns[:15]}..."
        )

    if len(feature_cols) < 2:
        # Degenerate input: with 0 or 1 features there is nothing to decompose.
        return {
            "pca": {
                "explained_variance": [1.0] if feature_cols else [],
                "feature_names": feature_cols,
            },
            "clustering": {"k": config.analytics.cluster_k, "sizes": []},
        }

    from pyspark.sql.functions import col
    from pyspark.sql.types import DoubleType

    df_num = df.select(
        *[col(c).cast(DoubleType()).alias(c) for c in feature_cols]
    ).dropna()

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features_raw",
        handleInvalid="skip",
    )
    df_vec = assembler.transform(df_num)

    # Standardize so PCA/KMeans are not dominated by large-scale features.
    scaler = StandardScaler(
        inputCol="features_raw",
        outputCol="features",
        withStd=True,
        withMean=True,
    )
    df_scaled = scaler.fit(df_vec).transform(df_vec)

    # PCA: keep up to 20 components. NOTE(review): the original read
    # config.analytics.pca_variance_threshold but never applied it; the
    # cumulative variance is reported so callers can truncate themselves.
    n_comp_max = min(len(feature_cols), 20)
    pca = PCA(k=n_comp_max, inputCol="features", outputCol="pca_features")
    pca_model = pca.fit(df_scaled)

    explained = pca_model.explainedVariance.toArray().tolist()
    running = 0.0
    cumulative: list[float] = []
    for v in explained:
        running += v
        cumulative.append(running)

    # Feature loadings per PC (which original features contribute most).
    # pca_model.pc is (n_features x n_components) with principal components
    # as COLUMNS; the original iterated rows, mislabeling per-feature rows
    # as components.
    components = pca_model.pc.toArray()
    loadings: list[dict] = []
    for pc_idx in range(components.shape[1]):
        ranked = sorted(
            zip(feature_cols, components[:, pc_idx].tolist()),
            key=lambda pair: abs(pair[1]),
            reverse=True,
        )
        loadings.append({
            "pc": pc_idx + 1,
            "top_features": [{"name": name, "loading": float(v)} for name, v in ranked[:5]],
        })

    # Project onto the principal components before clustering. The original
    # fit KMeans on df_scaled with featuresCol="pca_features", but that
    # column only exists after pca_model.transform — it would fail at runtime.
    df_pca = pca_model.transform(df_scaled)

    k = config.analytics.cluster_k
    kmeans = KMeans(k=k, seed=42, featuresCol="pca_features", predictionCol="cluster")
    kmeans_model = kmeans.fit(df_pca)

    df_clustered = kmeans_model.transform(df_pca)
    cluster_sizes = (
        df_clustered.groupBy("cluster")
        .count()
        .orderBy("cluster")
        .collect()
    )
    sizes = {int(r["cluster"]): int(r["count"]) for r in cluster_sizes}

    # Fetch the centroid list once (the original called clusterCenters()
    # inside the loop on every iteration).
    centers = kmeans_model.clusterCenters()
    centroids = [
        {"cluster": i, "centroid": center.tolist()}
        for i, center in enumerate(centers)
    ]

    return {
        "pca": {
            "explained_variance": explained,
            "cumulative_variance": cumulative,
            "feature_names": feature_cols,
            "component_loadings": loadings,
            "n_components": len(explained),
        },
        "clustering": {
            "k": k,
            "sizes": sizes,
            "centroids": centroids,
            "within_cluster_sum_of_squared_distances": float(
                kmeans_model.summary.trainingCost
            ),
        },
    }
|