batch-analytics 0.3.2__tar.gz → 0.3.4__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/pca_clustering.py +3 -2
  4. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/job_runner.py +14 -0
  5. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  6. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/README.md +0 -0
  7. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/setup.cfg +0 -0
  8. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/__init__.py +0 -0
  9. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/__main__.py +0 -0
  10. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/__init__.py +0 -0
  11. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/correlation.py +0 -0
  12. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/linear_regression.py +0 -0
  13. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/analytics/t_test.py +0 -0
  14. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/config.py +0 -0
  15. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/extract.py +0 -0
  16. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/log.py +0 -0
  17. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/modules.py +0 -0
  18. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/output/__init__.py +0 -0
  19. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/output/base.py +0 -0
  20. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/output/clickhouse.py +0 -0
  21. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/output/local.py +0 -0
  22. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/output/s3.py +0 -0
  23. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics/transform.py +0 -0
  24. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  25. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  26. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  27. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/requires.txt +0 -0
  28. {batch_analytics-0.3.2 → batch_analytics-0.3.4}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.2"
7
+ version = "0.3.4"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -84,6 +84,7 @@ def run_pca_clustering(
84
84
  n_comp_max = min(len(feature_cols), 20)
85
85
  pca = PCA(k=n_comp_max, inputCol="features", outputCol="pca_features")
86
86
  pca_model = pca.fit(df_scaled)
87
+ df_pca = pca_model.transform(df_scaled)
87
88
 
88
89
  explained = pca_model.explainedVariance.toArray().tolist()
89
90
  cumsum = []
@@ -108,9 +109,9 @@ def run_pca_clustering(
108
109
 
109
110
  k = config.analytics.cluster_k
110
111
  kmeans = KMeans(k=k, seed=42, featuresCol="pca_features", predictionCol="cluster")
111
- kmeans_model = kmeans.fit(df_scaled)
112
+ kmeans_model = kmeans.fit(df_pca)
112
113
 
113
- df_clustered = kmeans_model.transform(df_scaled)
114
+ df_clustered = kmeans_model.transform(df_pca)
114
115
  cluster_sizes = (
115
116
  df_clustered.groupBy("cluster")
116
117
  .count()
@@ -113,6 +113,20 @@ def create_spark_session(
113
113
 
114
114
  if jar_list:
115
115
  builder = builder.config("spark.jars", ",".join(jar_list))
116
+ elif os.environ.get("BATCH_ALLOW_INHERITED_SPARK_JARS", "").strip().lower() not in (
117
+ "1",
118
+ "true",
119
+ "yes",
120
+ ):
121
+ # spark-submit / image ENV often set SPARK_JARS → --jars; Python never sees it in jar_list
122
+ # above, but Spark still adds them ("Added JAR /opt/spark/jars/...") and K8s executors fail.
123
+ # Empty spark.jars overrides inherited submit --jars; $SPARK_HOME/jars stay on JVM classpath.
124
+ builder = builder.config("spark.jars", "")
125
+ logger.info(
126
+ "spark.jars cleared (override spark-submit --jars / SPARK_JARS); "
127
+ "JARs under $SPARK_HOME/jars remain on the classpath. "
128
+ "Set BATCH_ALLOW_INHERITED_SPARK_JARS=1 to keep submit-inherited spark.jars."
129
+ )
116
130
  if packages:
117
131
  builder = builder.config("spark.jars.packages", ",".join(packages))
118
132
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT