batch-analytics 0.3.5__tar.gz → 0.3.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/PKG-INFO +3 -2
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/README.md +1 -1
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/pyproject.toml +4 -2
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/analytics/t_test.py +114 -17
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics.egg-info/PKG-INFO +3 -2
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics.egg-info/requires.txt +1 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/setup.cfg +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.5
|
|
3
|
+
Version: 0.3.7
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -8,6 +8,7 @@ Requires-Python: >=3.8
|
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: pyspark<3.6,>=3.4
|
|
10
10
|
Requires-Dist: numpy>=1.19.0
|
|
11
|
+
Requires-Dist: scipy>=1.5.0
|
|
11
12
|
Provides-Extra: dev
|
|
12
13
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
13
14
|
Provides-Extra: ttest
|
|
@@ -65,7 +66,7 @@ analytics/
|
|
|
65
66
|
pip install -e .
|
|
66
67
|
# or install every runtime dependency used anywhere in the package, then editable:
|
|
67
68
|
pip install -r requirements.txt && pip install -e .
|
|
68
|
-
# PyPI
|
|
69
|
+
# PyPI install includes numpy and scipy (t-test); extras: s3, clickhouse, output, full
|
|
69
70
|
pip install "batch-analytics[full]"
|
|
70
71
|
```
|
|
71
72
|
|
|
@@ -36,7 +36,7 @@ analytics/
|
|
|
36
36
|
pip install -e .
|
|
37
37
|
# or install every runtime dependency used anywhere in the package, then editable:
|
|
38
38
|
pip install -r requirements.txt && pip install -e .
|
|
39
|
-
# PyPI
|
|
39
|
+
# PyPI install includes numpy and scipy (t-test); extras: s3, clickhouse, output, full
|
|
40
40
|
pip install "batch-analytics[full]"
|
|
41
41
|
```
|
|
42
42
|
|
|
@@ -4,20 +4,22 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.5"
|
|
7
|
+
version = "0.3.7"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
11
11
|
dependencies = [
|
|
12
12
|
"pyspark>=3.4,<3.6",
|
|
13
13
|
"numpy>=1.19.0",
|
|
14
|
+
# Welch t-test (t_test.py); keep on core deps so `pip install batch-analytics` works in minimal driver images
|
|
15
|
+
"scipy>=1.5.0",
|
|
14
16
|
]
|
|
15
17
|
authors = [{ name = "Litewave Analytics Team" }]
|
|
16
18
|
license = { text = "MIT" }
|
|
17
19
|
|
|
18
20
|
[project.optional-dependencies]
|
|
19
21
|
dev = ["pytest>=7.0"]
|
|
20
|
-
#
|
|
22
|
+
# Legacy: scipy is a core dependency; kept so `pip install "batch-analytics[ttest]"` still resolves.
|
|
21
23
|
ttest = ["scipy>=1.5.0"]
|
|
22
24
|
s3 = ["boto3>=1.28"]
|
|
23
25
|
# 0.9+ uses list[...] etc. and breaks on Python 3.8; 3.9+ can take current clickhouse-connect.
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Module 4: T-test
|
|
2
|
+
Module 4: T-test and one-way ANOVA for comparing means across groups.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any, Dict
|
|
6
|
+
from typing import Any, Dict, List
|
|
7
7
|
|
|
8
8
|
from pyspark.sql import DataFrame, SparkSession
|
|
9
9
|
from pyspark.sql.functions import col, avg, stddev, count
|
|
@@ -20,18 +20,17 @@ def run_t_test(
|
|
|
20
20
|
config: BatchAnalyticsConfig,
|
|
21
21
|
) -> Dict[str, Any]:
|
|
22
22
|
"""
|
|
23
|
-
|
|
23
|
+
Compare means across groups or two numeric columns.
|
|
24
24
|
|
|
25
|
-
Supports
|
|
26
|
-
1. Value + group: one numeric column, one categorical column
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
25
|
+
Supports:
|
|
26
|
+
1. Value + group: one numeric column, one categorical column.
|
|
27
|
+
- **2 groups:** Welch's t-test (unequal variances).
|
|
28
|
+
- **3+ groups:** one-way ANOVA (F-test on equal means).
|
|
29
|
+
2. Two columns: two numeric columns, Welch t-test on their column means.
|
|
30
30
|
|
|
31
31
|
Returns:
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
- t_statistic, p_value, difference (mean_a - mean_b)
|
|
32
|
+
For Welch: ``group_a``, ``group_b``, ``t_statistic``, ``p_value``, ``test`` = ``\"Welch\"``.
|
|
33
|
+
For ANOVA: ``groups``, ``f_statistic``, ``p_value``, ``test`` = ``\"one_way_anova\"``, SS/df.
|
|
35
34
|
"""
|
|
36
35
|
value_col = (config.analytics.ttest_value_column or "").strip()
|
|
37
36
|
group_col = (config.analytics.ttest_group_column or "").strip()
|
|
@@ -46,7 +45,7 @@ def run_t_test(
|
|
|
46
45
|
if col_a and col_b and col_a in df.columns and col_b in df.columns:
|
|
47
46
|
return _run_t_test_two_columns(df, col_a, col_b)
|
|
48
47
|
|
|
49
|
-
# Fallback:
|
|
48
|
+
# Fallback: first numeric + string-like column with at least 2 distinct groups
|
|
50
49
|
numeric_cols = [
|
|
51
50
|
f.name for f in df.schema.fields
|
|
52
51
|
if "double" in str(f.dataType).lower()
|
|
@@ -60,8 +59,10 @@ def run_t_test(
|
|
|
60
59
|
for nc in numeric_cols:
|
|
61
60
|
for sc in string_cols:
|
|
62
61
|
distinct = df.select(sc).distinct().count()
|
|
63
|
-
if distinct
|
|
64
|
-
logger.info(
|
|
62
|
+
if distinct >= 2:
|
|
63
|
+
logger.info(
|
|
64
|
+
"Auto-selected value=%s, group=%s (%d groups)", nc, sc, distinct
|
|
65
|
+
)
|
|
65
66
|
return _run_t_test_by_group(df, nc, sc)
|
|
66
67
|
|
|
67
68
|
raise ValueError(
|
|
@@ -76,7 +77,7 @@ def _run_t_test_by_group(
|
|
|
76
77
|
value_col: str,
|
|
77
78
|
group_col: str,
|
|
78
79
|
) -> Dict[str, Any]:
|
|
79
|
-
"""
|
|
80
|
+
"""Compare mean of value_col across groups: Welch (2 groups) or one-way ANOVA (3+)."""
|
|
80
81
|
df_num = df.select(
|
|
81
82
|
col(value_col).cast(DoubleType()).alias("_val"),
|
|
82
83
|
col(group_col).cast("string").alias("_grp"),
|
|
@@ -92,11 +93,15 @@ def _run_t_test_by_group(
|
|
|
92
93
|
.collect()
|
|
93
94
|
)
|
|
94
95
|
|
|
95
|
-
|
|
96
|
+
k = len(stats)
|
|
97
|
+
if k < 2:
|
|
96
98
|
raise ValueError(
|
|
97
|
-
f"
|
|
99
|
+
f"Need at least 2 groups in {group_col} for comparison. Found: {[r['_grp'] for r in stats]}"
|
|
98
100
|
)
|
|
99
101
|
|
|
102
|
+
if k > 2:
|
|
103
|
+
return _run_one_way_anova(stats, value_col, group_col)
|
|
104
|
+
|
|
100
105
|
r0, r1 = stats[0], stats[1]
|
|
101
106
|
return _compute_t_test_result(
|
|
102
107
|
group_a=r0["_grp"],
|
|
@@ -110,6 +115,98 @@ def _run_t_test_by_group(
|
|
|
110
115
|
)
|
|
111
116
|
|
|
112
117
|
|
|
118
|
+
def _run_one_way_anova(
|
|
119
|
+
stats_rows: List[Any],
|
|
120
|
+
value_col: str,
|
|
121
|
+
group_col: str,
|
|
122
|
+
) -> Dict[str, Any]:
|
|
123
|
+
"""
|
|
124
|
+
One-way ANOVA from per-group mean, sample stddev, and n (Spark ``stddev`` uses ddof=1).
|
|
125
|
+
|
|
126
|
+
SS_within = sum_i (n_i - 1) * s_i^2
|
|
127
|
+
SS_between = sum_i n_i * (mean_i - grand_mean)^2
|
|
128
|
+
"""
|
|
129
|
+
try:
|
|
130
|
+
from scipy import stats as scipy_stats
|
|
131
|
+
except ImportError:
|
|
132
|
+
raise ImportError("ANOVA requires scipy. Install with: pip install scipy")
|
|
133
|
+
|
|
134
|
+
groups: List[Dict[str, Any]] = []
|
|
135
|
+
for r in stats_rows:
|
|
136
|
+
name = r["_grp"]
|
|
137
|
+
mean = float(r["mean"])
|
|
138
|
+
std_raw = r["std"]
|
|
139
|
+
std = 0.0 if std_raw is None else float(std_raw)
|
|
140
|
+
n = int(r["n"])
|
|
141
|
+
groups.append(
|
|
142
|
+
{"name": name, "mean": mean, "std": std, "n": n}
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
k = len(groups)
|
|
146
|
+
N = sum(g["n"] for g in groups)
|
|
147
|
+
if N <= k:
|
|
148
|
+
raise ValueError(
|
|
149
|
+
f"ANOVA needs more observations than groups (N={N}, k={k})"
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
grand_mean = sum(g["n"] * g["mean"] for g in groups) / N
|
|
153
|
+
ss_between = sum(g["n"] * (g["mean"] - grand_mean) ** 2 for g in groups)
|
|
154
|
+
ss_within = sum(
|
|
155
|
+
(g["n"] - 1) * (g["std"] ** 2) for g in groups if g["n"] > 1
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
df_between = k - 1
|
|
159
|
+
df_within = N - k
|
|
160
|
+
if df_within <= 0:
|
|
161
|
+
raise ValueError(
|
|
162
|
+
f"ANOVA: df_within must be positive (N={N}, k={k})"
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
ms_between = ss_between / df_between
|
|
166
|
+
ms_within = ss_within / df_within if df_within > 0 else 0.0
|
|
167
|
+
|
|
168
|
+
if ms_within <= 0.0:
|
|
169
|
+
if ms_between <= 0.0:
|
|
170
|
+
f_stat = 0.0
|
|
171
|
+
p_value = 1.0
|
|
172
|
+
else:
|
|
173
|
+
f_stat = float("inf")
|
|
174
|
+
p_value = 0.0
|
|
175
|
+
else:
|
|
176
|
+
f_stat = ms_between / ms_within
|
|
177
|
+
p_value = float(scipy_stats.f.sf(f_stat, df_between, df_within))
|
|
178
|
+
|
|
179
|
+
out: Dict[str, Any] = {
|
|
180
|
+
"test": "one_way_anova",
|
|
181
|
+
"value_column": value_col,
|
|
182
|
+
"group_column": group_col,
|
|
183
|
+
"k_groups": k,
|
|
184
|
+
"n_total": N,
|
|
185
|
+
"grand_mean": grand_mean,
|
|
186
|
+
"f_statistic": f_stat,
|
|
187
|
+
"p_value": p_value,
|
|
188
|
+
"df_between": df_between,
|
|
189
|
+
"df_within": df_within,
|
|
190
|
+
"ss_between": ss_between,
|
|
191
|
+
"ss_within": ss_within,
|
|
192
|
+
"ms_between": ms_between,
|
|
193
|
+
"ms_within": ms_within,
|
|
194
|
+
"groups": [
|
|
195
|
+
{
|
|
196
|
+
"name": g["name"],
|
|
197
|
+
"mean": g["mean"],
|
|
198
|
+
"std": g["std"],
|
|
199
|
+
"n": g["n"],
|
|
200
|
+
}
|
|
201
|
+
for g in sorted(groups, key=lambda x: str(x["name"]))
|
|
202
|
+
],
|
|
203
|
+
}
|
|
204
|
+
if f_stat == float("inf"):
|
|
205
|
+
out["f_statistic"] = None
|
|
206
|
+
out["f_statistic_note"] = "infinite (MS_within == 0, reject equal means if SS_between > 0)"
|
|
207
|
+
return out
|
|
208
|
+
|
|
209
|
+
|
|
113
210
|
def _run_t_test_two_columns(df: DataFrame, col_a: str, col_b: str) -> Dict[str, Any]:
|
|
114
211
|
"""T-test: compare means of two numeric columns."""
|
|
115
212
|
df_num = df.select(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.5
|
|
3
|
+
Version: 0.3.7
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -8,6 +8,7 @@ Requires-Python: >=3.8
|
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: pyspark<3.6,>=3.4
|
|
10
10
|
Requires-Dist: numpy>=1.19.0
|
|
11
|
+
Requires-Dist: scipy>=1.5.0
|
|
11
12
|
Provides-Extra: dev
|
|
12
13
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
13
14
|
Provides-Extra: ttest
|
|
@@ -65,7 +66,7 @@ analytics/
|
|
|
65
66
|
pip install -e .
|
|
66
67
|
# or install every runtime dependency used anywhere in the package, then editable:
|
|
67
68
|
pip install -r requirements.txt && pip install -e .
|
|
68
|
-
# PyPI
|
|
69
|
+
# PyPI install includes numpy and scipy (t-test); extras: s3, clickhouse, output, full
|
|
69
70
|
pip install "batch-analytics[full]"
|
|
70
71
|
```
|
|
71
72
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.5 → batch_analytics-0.3.7}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|