batch-analytics 0.3.1__tar.gz → 0.3.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/job_runner.py +39 -13
  4. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  5. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/README.md +0 -0
  6. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/setup.cfg +0 -0
  7. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/__init__.py +0 -0
  8. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/__main__.py +0 -0
  9. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/analytics/__init__.py +0 -0
  10. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/analytics/correlation.py +0 -0
  11. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/analytics/linear_regression.py +0 -0
  12. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  13. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/analytics/t_test.py +0 -0
  14. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/config.py +0 -0
  15. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/extract.py +0 -0
  16. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/log.py +0 -0
  17. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/modules.py +0 -0
  18. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/output/__init__.py +0 -0
  19. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/output/base.py +0 -0
  20. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/output/clickhouse.py +0 -0
  21. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/output/local.py +0 -0
  22. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/output/s3.py +0 -0
  23. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics/transform.py +0 -0
  24. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  25. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  26. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  27. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics.egg-info/requires.txt +0 -0
  28. {batch_analytics-0.3.1 → batch_analytics-0.3.3}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.1
3
+ Version: 0.3.3
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.1"
7
+ version = "0.3.3"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -10,6 +10,7 @@ import sys
10
10
  import uuid
11
11
  from pathlib import Path
12
12
  from typing import Dict, List, Optional
13
+ from urllib.parse import urlparse
13
14
 
14
15
  from pyspark.sql import SparkSession
15
16
 
@@ -27,6 +28,17 @@ logging.basicConfig(
27
28
  logger = logging.getLogger(__name__)
28
29
 
29
30
 
31
+ def _local_jar_path_for_match(p: str) -> str:
32
+ """Normalize file:/path or /path for prefix checks."""
33
+ p = p.strip()
34
+ if not p:
35
+ return p
36
+ if p.startswith("file:"):
37
+ parsed = urlparse(p)
38
+ return parsed.path if parsed.path else p[5:]
39
+ return p
40
+
41
+
30
42
  def _omit_spark_distrib_jars(jar_list: List[str]) -> List[str]:
31
43
  """
32
44
  Drop paths under $SPARK_HOME/jars from spark.jars.
@@ -34,23 +46,27 @@ def _omit_spark_distrib_jars(jar_list: List[str]) -> List[str]:
34
46
  Those JARs are already on the driver and executor JVM classpath (Spark launch scripts add
35
47
  $SPARK_HOME/jars/*). Listing them again in spark.jars makes Spark distribute them to executors
36
48
  as ./basename.jar and breaks Kubernetes executors.
49
+
50
+ SPARK_HOME may be unset or empty in some pods; always treat /opt/spark/jars/ as Spark distro
51
+ (apache/spark images).
37
52
  """
38
- spark_home = os.environ.get("SPARK_HOME", "/opt/spark").rstrip("/")
39
- prefix = f"{spark_home}/jars/"
53
+ spark_home = (os.environ.get("SPARK_HOME") or "/opt/spark").rstrip("/")
54
+ prefixes = (f"{spark_home}/jars/", "/opt/spark/jars/")
40
55
  out: List[str] = []
41
56
  skipped: List[str] = []
42
57
  for p in jar_list:
43
58
  p = p.strip()
44
59
  if not p:
45
60
  continue
46
- if p.startswith(prefix) and p.endswith(".jar"):
61
+ norm = _local_jar_path_for_match(p)
62
+ if norm.endswith(".jar") and any(norm.startswith(pref) for pref in prefixes):
47
63
  skipped.append(p)
48
64
  else:
49
65
  out.append(p)
50
66
  if skipped:
51
67
  logger.info(
52
68
  "Omitting spark.jars for JARs already on Spark classpath (%s): %s",
53
- prefix,
69
+ prefixes[0],
54
70
  ",".join(skipped),
55
71
  )
56
72
  return out
@@ -97,6 +113,20 @@ def create_spark_session(
97
113
 
98
114
  if jar_list:
99
115
  builder = builder.config("spark.jars", ",".join(jar_list))
116
+ elif os.environ.get("BATCH_ALLOW_INHERITED_SPARK_JARS", "").strip().lower() not in (
117
+ "1",
118
+ "true",
119
+ "yes",
120
+ ):
121
+ # spark-submit / image ENV often set SPARK_JARS → --jars; Python never sees it in jar_list
122
+ # above, but Spark still adds them ("Added JAR /opt/spark/jars/...") and K8s executors fail.
123
+ # Empty spark.jars overrides inherited submit --jars; $SPARK_HOME/jars stay on JVM classpath.
124
+ builder = builder.config("spark.jars", "")
125
+ logger.info(
126
+ "spark.jars cleared (override spark-submit --jars / SPARK_JARS); "
127
+ "JARs under $SPARK_HOME/jars remain on the classpath. "
128
+ "Set BATCH_ALLOW_INHERITED_SPARK_JARS=1 to keep submit-inherited spark.jars."
129
+ )
100
130
  if packages:
101
131
  builder = builder.config("spark.jars.packages", ",".join(packages))
102
132
 
@@ -202,16 +232,12 @@ def run_pipeline(
202
232
  # Native format("clickhouse") needs clickhouse-spark-runtime; JDBC needs shaded clickhouse-jdbc (*-all),
203
233
  # not the thin Maven artifact: thin JAR lacks HttpClient 5 (ClassicHttpRequest).
204
234
  # Override: BATCH_SPARK_CLICKHOUSE_PACKAGES=maven coords / https jar URLs (comma-sep) or "" for SPARK_JARS only.
235
+ # Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
236
+ # spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
237
+ # For ad-hoc runs without the image, set e.g.
238
+ # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
205
239
  _raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
206
- if _raw_ch is None:
207
- # Shaded *-all.jar; pin matches Docker image / SPARK_JARS (0.9.x; no 0.10.x on Central for this artifact).
208
- _ch_jdbc = os.environ.get("BATCH_CLICKHOUSE_JDBC_VERSION", "0.9.8").strip()
209
- ch_pkgs = (
210
- "com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,"
211
- f"https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/{_ch_jdbc}/"
212
- f"clickhouse-jdbc-{_ch_jdbc}-all.jar"
213
- )
214
- elif not _raw_ch.strip():
240
+ if _raw_ch is None or not _raw_ch.strip():
215
241
  ch_pkgs = None
216
242
  else:
217
243
  ch_pkgs = _raw_ch.strip()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.1
3
+ Version: 0.3.3
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT