batch-analytics 0.3.0__tar.gz → 0.3.2__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (28)
  1. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/job_runner.py +47 -50
  4. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  5. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/README.md +0 -0
  6. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/setup.cfg +0 -0
  7. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/__init__.py +0 -0
  8. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/__main__.py +0 -0
  9. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/__init__.py +0 -0
  10. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/correlation.py +0 -0
  11. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/linear_regression.py +0 -0
  12. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  13. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/t_test.py +0 -0
  14. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/config.py +0 -0
  15. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/extract.py +0 -0
  16. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/log.py +0 -0
  17. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/modules.py +0 -0
  18. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/output/__init__.py +0 -0
  19. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/output/base.py +0 -0
  20. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/output/clickhouse.py +0 -0
  21. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/output/local.py +0 -0
  22. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/output/s3.py +0 -0
  23. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/transform.py +0 -0
  24. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  25. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  26. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  27. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/requires.txt +0 -0
  28. {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.0"
7
+ version = "0.3.2"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -10,10 +10,11 @@ import sys
10
10
  import uuid
11
11
  from pathlib import Path
12
12
  from typing import Dict, List, Optional
13
+ from urllib.parse import urlparse
13
14
 
14
15
  from pyspark.sql import SparkSession
15
16
 
16
- from .config import BatchAnalyticsConfig, SparkK8sConfig
17
+ from .config import BatchAnalyticsConfig
17
18
  from .extract import extract_unified
18
19
  from .log import log_dataframe_summary, log_run
19
20
  from .modules import DEFAULT_MODULES, MODULE_REGISTRY, VALID_MODULES
@@ -27,19 +28,48 @@ logging.basicConfig(
27
28
  logger = logging.getLogger(__name__)
28
29
 
29
30
 
30
- def _spark_on_kubernetes_for_classpath(cfg: SparkK8sConfig) -> bool:
31
+ def _local_jar_path_for_match(p: str) -> str:
32
+ """Normalize file:/path or /path for prefix checks."""
33
+ p = p.strip()
34
+ if not p:
35
+ return p
36
+ if p.startswith("file:"):
37
+ parsed = urlparse(p)
38
+ return parsed.path if parsed.path else p[5:]
39
+ return p
40
+
41
+
42
+ def _omit_spark_distrib_jars(jar_list: List[str]) -> List[str]:
31
43
  """
32
- True when Spark executors run on Kubernetes.
44
+ Drop paths under $SPARK_HOME/jars from spark.jars.
45
+
46
+ Those JARs are already on the driver and executor JVM classpath (Spark launch scripts add
47
+ $SPARK_HOME/jars/*). Listing them again in spark.jars makes Spark distribute them to executors
48
+ as ./basename.jar and breaks Kubernetes executors.
33
49
 
34
- Spark Operator cluster mode passes --master k8s://... to spark-submit; SPARK_MASTER is often
35
- unset, so SparkK8sConfig.master defaults to local[*] while the JVM still uses Kubernetes.
36
- Image-baked JARs must go on extraClassPath in that case (spark.jars breaks executors).
50
+ SPARK_HOME may be unset or empty in some pods; always treat /opt/spark/jars/ as Spark distro
51
+ (apache/spark images).
37
52
  """
38
- if cfg.master.startswith("k8s://"):
39
- return True
40
- if os.environ.get("SPARK_MASTER", "").strip().startswith("k8s://"):
41
- return True
42
- return bool(os.environ.get("KUBERNETES_SERVICE_HOST"))
53
+ spark_home = (os.environ.get("SPARK_HOME") or "/opt/spark").rstrip("/")
54
+ prefixes = (f"{spark_home}/jars/", "/opt/spark/jars/")
55
+ out: List[str] = []
56
+ skipped: List[str] = []
57
+ for p in jar_list:
58
+ p = p.strip()
59
+ if not p:
60
+ continue
61
+ norm = _local_jar_path_for_match(p)
62
+ if norm.endswith(".jar") and any(norm.startswith(pref) for pref in prefixes):
63
+ skipped.append(p)
64
+ else:
65
+ out.append(p)
66
+ if skipped:
67
+ logger.info(
68
+ "Omitting spark.jars for JARs already on Spark classpath (%s): %s",
69
+ prefixes[0],
70
+ ",".join(skipped),
71
+ )
72
+ return out
43
73
 
44
74
 
45
75
  def create_spark_session(
@@ -79,36 +109,7 @@ def create_spark_session(
79
109
  else:
80
110
  packages.append(part)
81
111
 
82
- if _spark_on_kubernetes_for_classpath(cfg):
83
- packages.append("org.apache.hadoop:hadoop-aws:3.3.4")
84
-
85
- # Spark-on-K8s: absolute paths in spark.jars are re-sent to executors as ./basename.jar and
86
- # fail there ("Unable to create executor due to ./clickhouse-spark-runtime-..."). JARs baked
87
- # into the driver/executor image belong on the JVM classpath instead.
88
- if _spark_on_kubernetes_for_classpath(cfg) and jar_list:
89
- local_cp: List[str] = []
90
- remote_jars: List[str] = []
91
- for p in jar_list:
92
- p = p.strip()
93
- if not p:
94
- continue
95
- if p.startswith("/") and p.endswith(".jar"):
96
- local_cp.append(p)
97
- else:
98
- remote_jars.append(p)
99
- if local_cp:
100
- joined = ":".join(local_cp)
101
- logger.info(
102
- "Spark on Kubernetes: image JARs on driver/executor extraClassPath (not spark.jars): %s",
103
- joined,
104
- )
105
- builder = (
106
- builder.config("spark.driver.extraClassPath", joined)
107
- .config("spark.executor.extraClassPath", joined)
108
- .config("spark.kubernetes.driver.extraClassPath", joined)
109
- .config("spark.kubernetes.executor.extraClassPath", joined)
110
- )
111
- jar_list = remote_jars
112
+ jar_list = _omit_spark_distrib_jars(jar_list)
112
113
 
113
114
  if jar_list:
114
115
  builder = builder.config("spark.jars", ",".join(jar_list))
@@ -217,16 +218,12 @@ def run_pipeline(
217
218
  # Native format("clickhouse") needs clickhouse-spark-runtime; JDBC needs shaded clickhouse-jdbc (*-all),
218
219
  # not the thin Maven artifact: thin JAR lacks HttpClient 5 (ClassicHttpRequest).
219
220
  # Override: BATCH_SPARK_CLICKHOUSE_PACKAGES=maven coords / https jar URLs (comma-sep) or "" for SPARK_JARS only.
221
+ # Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
222
+ # spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
223
+ # For ad-hoc runs without the image, set e.g.
224
+ # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
220
225
  _raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
221
- if _raw_ch is None:
222
- # Shaded *-all.jar; pin matches Docker image / SPARK_JARS (0.9.x; no 0.10.x on Central for this artifact).
223
- _ch_jdbc = os.environ.get("BATCH_CLICKHOUSE_JDBC_VERSION", "0.9.8").strip()
224
- ch_pkgs = (
225
- "com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,"
226
- f"https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/{_ch_jdbc}/"
227
- f"clickhouse-jdbc-{_ch_jdbc}-all.jar"
228
- )
229
- elif not _raw_ch.strip():
226
+ if _raw_ch is None or not _raw_ch.strip():
230
227
  ch_pkgs = None
231
228
  else:
232
229
  ch_pkgs = _raw_ch.strip()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT