batch-analytics 0.3.2__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/PKG-INFO +1 -1
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/pyproject.toml +1 -1
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/pca_clustering.py +3 -2
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/job_runner.py +14 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/README.md +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/setup.cfg +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
{batch_analytics-0.3.2 → batch_analytics-0.3.4}/pyproject.toml
RENAMED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.2"
|
|
7
|
+
version = "0.3.4"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
{batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
@@ -84,6 +84,7 @@ def run_pca_clustering(
|
|
|
84
84
|
n_comp_max = min(len(feature_cols), 20)
|
|
85
85
|
pca = PCA(k=n_comp_max, inputCol="features", outputCol="pca_features")
|
|
86
86
|
pca_model = pca.fit(df_scaled)
|
|
87
|
+
df_pca = pca_model.transform(df_scaled)
|
|
87
88
|
|
|
88
89
|
explained = pca_model.explainedVariance.toArray().tolist()
|
|
89
90
|
cumsum = []
|
|
@@ -108,9 +109,9 @@ def run_pca_clustering(
|
|
|
108
109
|
|
|
109
110
|
k = config.analytics.cluster_k
|
|
110
111
|
kmeans = KMeans(k=k, seed=42, featuresCol="pca_features", predictionCol="cluster")
|
|
111
|
-
kmeans_model = kmeans.fit(
|
|
112
|
+
kmeans_model = kmeans.fit(df_pca)
|
|
112
113
|
|
|
113
|
-
df_clustered = kmeans_model.transform(
|
|
114
|
+
df_clustered = kmeans_model.transform(df_pca)
|
|
114
115
|
cluster_sizes = (
|
|
115
116
|
df_clustered.groupBy("cluster")
|
|
116
117
|
.count()
|
|
{batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/job_runner.py
RENAMED
|
@@ -113,6 +113,20 @@ def create_spark_session(
|
|
|
113
113
|
|
|
114
114
|
if jar_list:
|
|
115
115
|
builder = builder.config("spark.jars", ",".join(jar_list))
|
|
116
|
+
elif os.environ.get("BATCH_ALLOW_INHERITED_SPARK_JARS", "").strip().lower() not in (
|
|
117
|
+
"1",
|
|
118
|
+
"true",
|
|
119
|
+
"yes",
|
|
120
|
+
):
|
|
121
|
+
# spark-submit / image ENV often set SPARK_JARS → --jars; Python never sees it in jar_list
|
|
122
|
+
# above, but Spark still adds them ("Added JAR /opt/spark/jars/...") and K8s executors fail.
|
|
123
|
+
# Empty spark.jars overrides inherited submit --jars; $SPARK_HOME/jars stay on JVM classpath.
|
|
124
|
+
builder = builder.config("spark.jars", "")
|
|
125
|
+
logger.info(
|
|
126
|
+
"spark.jars cleared (override spark-submit --jars / SPARK_JARS); "
|
|
127
|
+
"JARs under $SPARK_HOME/jars remain on the classpath. "
|
|
128
|
+
"Set BATCH_ALLOW_INHERITED_SPARK_JARS=1 to keep submit-inherited spark.jars."
|
|
129
|
+
)
|
|
116
130
|
if packages:
|
|
117
131
|
builder = builder.config("spark.jars.packages", ",".join(packages))
|
|
118
132
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|