batch-analytics 0.2.9__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/PKG-INFO +1 -1
  2. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/pyproject.toml +1 -1
  3. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/job_runner.py +22 -3
  4. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  5. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/README.md +0 -0
  6. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/setup.cfg +0 -0
  7. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/__init__.py +0 -0
  8. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/__main__.py +0 -0
  9. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/analytics/__init__.py +0 -0
  10. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/analytics/correlation.py +0 -0
  11. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/analytics/linear_regression.py +0 -0
  12. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  13. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/analytics/t_test.py +0 -0
  14. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/config.py +0 -0
  15. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/extract.py +0 -0
  16. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/log.py +0 -0
  17. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/modules.py +0 -0
  18. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/output/__init__.py +0 -0
  19. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/output/base.py +0 -0
  20. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/output/clickhouse.py +0 -0
  21. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/output/local.py +0 -0
  22. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/output/s3.py +0 -0
  23. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics/transform.py +0 -0
  24. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  25. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  26. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  27. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/requires.txt +0 -0
  28. {batch_analytics-0.2.9 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.2.9
3
+ Version: 0.3.0
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.2.9"
7
+ version = "0.3.0"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -13,7 +13,7 @@ from typing import Dict, List, Optional
13
13
 
14
14
  from pyspark.sql import SparkSession
15
15
 
16
- from .config import BatchAnalyticsConfig
16
+ from .config import BatchAnalyticsConfig, SparkK8sConfig
17
17
  from .extract import extract_unified
18
18
  from .log import log_dataframe_summary, log_run
19
19
  from .modules import DEFAULT_MODULES, MODULE_REGISTRY, VALID_MODULES
@@ -27,6 +27,21 @@ logging.basicConfig(
27
27
  logger = logging.getLogger(__name__)
28
28
 
29
29
 
30
+ def _spark_on_kubernetes_for_classpath(cfg: SparkK8sConfig) -> bool:
31
+ """
32
+ True when Spark executors run on Kubernetes.
33
+
34
+ Spark Operator cluster mode passes --master k8s://... to spark-submit; SPARK_MASTER is often
35
+ unset, so SparkK8sConfig.master defaults to local[*] while the JVM still uses Kubernetes.
36
+ Image-baked JARs must go on extraClassPath in that case (spark.jars breaks executors).
37
+ """
38
+ if cfg.master.startswith("k8s://"):
39
+ return True
40
+ if os.environ.get("SPARK_MASTER", "").strip().startswith("k8s://"):
41
+ return True
42
+ return bool(os.environ.get("KUBERNETES_SERVICE_HOST"))
43
+
44
+
30
45
  def create_spark_session(
31
46
  app_name: str = "BatchAnalytics",
32
47
  clickhouse_jars: Optional[str] = None,
@@ -64,13 +79,13 @@ def create_spark_session(
64
79
  else:
65
80
  packages.append(part)
66
81
 
67
- if cfg.master.startswith("k8s://"):
82
+ if _spark_on_kubernetes_for_classpath(cfg):
68
83
  packages.append("org.apache.hadoop:hadoop-aws:3.3.4")
69
84
 
70
85
  # Spark-on-K8s: absolute paths in spark.jars are re-sent to executors as ./basename.jar and
71
86
  # fail there ("Unable to create executor due to ./clickhouse-spark-runtime-..."). JARs baked
72
87
  # into the driver/executor image belong on the JVM classpath instead.
73
- if cfg.master.startswith("k8s://") and jar_list:
88
+ if _spark_on_kubernetes_for_classpath(cfg) and jar_list:
74
89
  local_cp: List[str] = []
75
90
  remote_jars: List[str] = []
76
91
  for p in jar_list:
@@ -83,6 +98,10 @@ def create_spark_session(
83
98
  remote_jars.append(p)
84
99
  if local_cp:
85
100
  joined = ":".join(local_cp)
101
+ logger.info(
102
+ "Spark on Kubernetes: image JARs on driver/executor extraClassPath (not spark.jars): %s",
103
+ joined,
104
+ )
86
105
  builder = (
87
106
  builder.config("spark.driver.extraClassPath", joined)
88
107
  .config("spark.executor.extraClassPath", joined)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.2.9
3
+ Version: 0.3.0
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT