batch-analytics 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/PKG-INFO +1 -1
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/pyproject.toml +1 -1
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/job_runner.py +47 -50
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/README.md +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/setup.cfg +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.3.2"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -10,10 +10,11 @@ import sys
|
|
|
10
10
|
import uuid
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import Dict, List, Optional
|
|
13
|
+
from urllib.parse import urlparse
|
|
13
14
|
|
|
14
15
|
from pyspark.sql import SparkSession
|
|
15
16
|
|
|
16
|
-
from .config import BatchAnalyticsConfig
|
|
17
|
+
from .config import BatchAnalyticsConfig
|
|
17
18
|
from .extract import extract_unified
|
|
18
19
|
from .log import log_dataframe_summary, log_run
|
|
19
20
|
from .modules import DEFAULT_MODULES, MODULE_REGISTRY, VALID_MODULES
|
|
@@ -27,19 +28,48 @@ logging.basicConfig(
|
|
|
27
28
|
logger = logging.getLogger(__name__)
|
|
28
29
|
|
|
29
30
|
|
|
30
|
-
def
|
|
31
|
+
def _local_jar_path_for_match(p: str) -> str:
|
|
32
|
+
"""Normalize file:/path or /path for prefix checks."""
|
|
33
|
+
p = p.strip()
|
|
34
|
+
if not p:
|
|
35
|
+
return p
|
|
36
|
+
if p.startswith("file:"):
|
|
37
|
+
parsed = urlparse(p)
|
|
38
|
+
return parsed.path if parsed.path else p[5:]
|
|
39
|
+
return p
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _omit_spark_distrib_jars(jar_list: List[str]) -> List[str]:
|
|
31
43
|
"""
|
|
32
|
-
|
|
44
|
+
Drop paths under $SPARK_HOME/jars from spark.jars.
|
|
45
|
+
|
|
46
|
+
Those JARs are already on the driver and executor JVM classpath (Spark launch scripts add
|
|
47
|
+
$SPARK_HOME/jars/*). Listing them again in spark.jars makes Spark distribute them to executors
|
|
48
|
+
as ./basename.jar and breaks Kubernetes executors.
|
|
33
49
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
Image-baked JARs must go on extraClassPath in that case (spark.jars breaks executors).
|
|
50
|
+
SPARK_HOME may be unset or empty in some pods; always treat /opt/spark/jars/ as Spark distro
|
|
51
|
+
(apache/spark images).
|
|
37
52
|
"""
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
53
|
+
spark_home = (os.environ.get("SPARK_HOME") or "/opt/spark").rstrip("/")
|
|
54
|
+
prefixes = (f"{spark_home}/jars/", "/opt/spark/jars/")
|
|
55
|
+
out: List[str] = []
|
|
56
|
+
skipped: List[str] = []
|
|
57
|
+
for p in jar_list:
|
|
58
|
+
p = p.strip()
|
|
59
|
+
if not p:
|
|
60
|
+
continue
|
|
61
|
+
norm = _local_jar_path_for_match(p)
|
|
62
|
+
if norm.endswith(".jar") and any(norm.startswith(pref) for pref in prefixes):
|
|
63
|
+
skipped.append(p)
|
|
64
|
+
else:
|
|
65
|
+
out.append(p)
|
|
66
|
+
if skipped:
|
|
67
|
+
logger.info(
|
|
68
|
+
"Omitting spark.jars for JARs already on Spark classpath (%s): %s",
|
|
69
|
+
prefixes[0],
|
|
70
|
+
",".join(skipped),
|
|
71
|
+
)
|
|
72
|
+
return out
|
|
43
73
|
|
|
44
74
|
|
|
45
75
|
def create_spark_session(
|
|
@@ -79,36 +109,7 @@ def create_spark_session(
|
|
|
79
109
|
else:
|
|
80
110
|
packages.append(part)
|
|
81
111
|
|
|
82
|
-
|
|
83
|
-
packages.append("org.apache.hadoop:hadoop-aws:3.3.4")
|
|
84
|
-
|
|
85
|
-
# Spark-on-K8s: absolute paths in spark.jars are re-sent to executors as ./basename.jar and
|
|
86
|
-
# fail there ("Unable to create executor due to ./clickhouse-spark-runtime-..."). JARs baked
|
|
87
|
-
# into the driver/executor image belong on the JVM classpath instead.
|
|
88
|
-
if _spark_on_kubernetes_for_classpath(cfg) and jar_list:
|
|
89
|
-
local_cp: List[str] = []
|
|
90
|
-
remote_jars: List[str] = []
|
|
91
|
-
for p in jar_list:
|
|
92
|
-
p = p.strip()
|
|
93
|
-
if not p:
|
|
94
|
-
continue
|
|
95
|
-
if p.startswith("/") and p.endswith(".jar"):
|
|
96
|
-
local_cp.append(p)
|
|
97
|
-
else:
|
|
98
|
-
remote_jars.append(p)
|
|
99
|
-
if local_cp:
|
|
100
|
-
joined = ":".join(local_cp)
|
|
101
|
-
logger.info(
|
|
102
|
-
"Spark on Kubernetes: image JARs on driver/executor extraClassPath (not spark.jars): %s",
|
|
103
|
-
joined,
|
|
104
|
-
)
|
|
105
|
-
builder = (
|
|
106
|
-
builder.config("spark.driver.extraClassPath", joined)
|
|
107
|
-
.config("spark.executor.extraClassPath", joined)
|
|
108
|
-
.config("spark.kubernetes.driver.extraClassPath", joined)
|
|
109
|
-
.config("spark.kubernetes.executor.extraClassPath", joined)
|
|
110
|
-
)
|
|
111
|
-
jar_list = remote_jars
|
|
112
|
+
jar_list = _omit_spark_distrib_jars(jar_list)
|
|
112
113
|
|
|
113
114
|
if jar_list:
|
|
114
115
|
builder = builder.config("spark.jars", ",".join(jar_list))
|
|
@@ -217,16 +218,12 @@ def run_pipeline(
|
|
|
217
218
|
# Native format("clickhouse") needs clickhouse-spark-runtime; JDBC needs shaded clickhouse-jdbc (*-all),
|
|
218
219
|
# not the thin Maven artifact: thin JAR lacks HttpClient 5 (ClassicHttpRequest).
|
|
219
220
|
# Override: BATCH_SPARK_CLICKHOUSE_PACKAGES=maven coords / https jar URLs (comma-sep) or "" for SPARK_JARS only.
|
|
221
|
+
# Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
|
|
222
|
+
# spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
|
|
223
|
+
# For ad-hoc runs without the image, set e.g.
|
|
224
|
+
# BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
|
|
220
225
|
_raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
|
|
221
|
-
if _raw_ch is None:
|
|
222
|
-
# Shaded *-all.jar; pin matches Docker image / SPARK_JARS (0.9.x; no 0.10.x on Central for this artifact).
|
|
223
|
-
_ch_jdbc = os.environ.get("BATCH_CLICKHOUSE_JDBC_VERSION", "0.9.8").strip()
|
|
224
|
-
ch_pkgs = (
|
|
225
|
-
"com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,"
|
|
226
|
-
f"https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/{_ch_jdbc}/"
|
|
227
|
-
f"clickhouse-jdbc-{_ch_jdbc}-all.jar"
|
|
228
|
-
)
|
|
229
|
-
elif not _raw_ch.strip():
|
|
226
|
+
if _raw_ch is None or not _raw_ch.strip():
|
|
230
227
|
ch_pkgs = None
|
|
231
228
|
else:
|
|
232
229
|
ch_pkgs = _raw_ch.strip()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.0 → batch_analytics-0.3.2}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|