batch-analytics 0.3.3__tar.gz → 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/PKG-INFO +1 -1
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/pyproject.toml +1 -1
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/analytics/pca_clustering.py +3 -2
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/README.md +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/setup.cfg +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.3"
|
|
7
|
+
version = "0.3.5"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
{batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
@@ -84,6 +84,7 @@ def run_pca_clustering(
|
|
|
84
84
|
n_comp_max = min(len(feature_cols), 20)
|
|
85
85
|
pca = PCA(k=n_comp_max, inputCol="features", outputCol="pca_features")
|
|
86
86
|
pca_model = pca.fit(df_scaled)
|
|
87
|
+
df_pca = pca_model.transform(df_scaled)
|
|
87
88
|
|
|
88
89
|
explained = pca_model.explainedVariance.toArray().tolist()
|
|
89
90
|
cumsum = []
|
|
@@ -108,9 +109,9 @@ def run_pca_clustering(
|
|
|
108
109
|
|
|
109
110
|
k = config.analytics.cluster_k
|
|
110
111
|
kmeans = KMeans(k=k, seed=42, featuresCol="pca_features", predictionCol="cluster")
|
|
111
|
-
kmeans_model = kmeans.fit(
|
|
112
|
+
kmeans_model = kmeans.fit(df_pca)
|
|
112
113
|
|
|
113
|
-
df_clustered = kmeans_model.transform(
|
|
114
|
+
df_clustered = kmeans_model.transform(df_pca)
|
|
114
115
|
cluster_sizes = (
|
|
115
116
|
df_clustered.groupBy("cluster")
|
|
116
117
|
.count()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.3 → batch_analytics-0.3.5}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|