batch-analytics 0.2.9__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/PKG-INFO +1 -1
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/pyproject.toml +1 -1
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/job_runner.py +30 -26
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/README.md +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/setup.cfg +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.2.9"
|
|
7
|
+
version = "0.3.1"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -27,6 +27,35 @@ logging.basicConfig(
|
|
|
27
27
|
logger = logging.getLogger(__name__)
|
|
28
28
|
|
|
29
29
|
|
|
30
|
+
def _omit_spark_distrib_jars(jar_list: List[str]) -> List[str]:
|
|
31
|
+
"""
|
|
32
|
+
Drop paths under $SPARK_HOME/jars from spark.jars.
|
|
33
|
+
|
|
34
|
+
Those JARs are already on the driver and executor JVM classpath (Spark launch scripts add
|
|
35
|
+
$SPARK_HOME/jars/*). Listing them again in spark.jars makes Spark distribute them to executors
|
|
36
|
+
as ./basename.jar and breaks Kubernetes executors.
|
|
37
|
+
"""
|
|
38
|
+
spark_home = os.environ.get("SPARK_HOME", "/opt/spark").rstrip("/")
|
|
39
|
+
prefix = f"{spark_home}/jars/"
|
|
40
|
+
out: List[str] = []
|
|
41
|
+
skipped: List[str] = []
|
|
42
|
+
for p in jar_list:
|
|
43
|
+
p = p.strip()
|
|
44
|
+
if not p:
|
|
45
|
+
continue
|
|
46
|
+
if p.startswith(prefix) and p.endswith(".jar"):
|
|
47
|
+
skipped.append(p)
|
|
48
|
+
else:
|
|
49
|
+
out.append(p)
|
|
50
|
+
if skipped:
|
|
51
|
+
logger.info(
|
|
52
|
+
"Omitting spark.jars for JARs already on Spark classpath (%s): %s",
|
|
53
|
+
prefix,
|
|
54
|
+
",".join(skipped),
|
|
55
|
+
)
|
|
56
|
+
return out
|
|
57
|
+
|
|
58
|
+
|
|
30
59
|
def create_spark_session(
|
|
31
60
|
app_name: str = "BatchAnalytics",
|
|
32
61
|
clickhouse_jars: Optional[str] = None,
|
|
@@ -64,32 +93,7 @@ def create_spark_session(
|
|
|
64
93
|
else:
|
|
65
94
|
packages.append(part)
|
|
66
95
|
|
|
67
|
-
|
|
68
|
-
packages.append("org.apache.hadoop:hadoop-aws:3.3.4")
|
|
69
|
-
|
|
70
|
-
# Spark-on-K8s: absolute paths in spark.jars are re-sent to executors as ./basename.jar and
|
|
71
|
-
# fail there ("Unable to create executor due to ./clickhouse-spark-runtime-..."). JARs baked
|
|
72
|
-
# into the driver/executor image belong on the JVM classpath instead.
|
|
73
|
-
if cfg.master.startswith("k8s://") and jar_list:
|
|
74
|
-
local_cp: List[str] = []
|
|
75
|
-
remote_jars: List[str] = []
|
|
76
|
-
for p in jar_list:
|
|
77
|
-
p = p.strip()
|
|
78
|
-
if not p:
|
|
79
|
-
continue
|
|
80
|
-
if p.startswith("/") and p.endswith(".jar"):
|
|
81
|
-
local_cp.append(p)
|
|
82
|
-
else:
|
|
83
|
-
remote_jars.append(p)
|
|
84
|
-
if local_cp:
|
|
85
|
-
joined = ":".join(local_cp)
|
|
86
|
-
builder = (
|
|
87
|
-
builder.config("spark.driver.extraClassPath", joined)
|
|
88
|
-
.config("spark.executor.extraClassPath", joined)
|
|
89
|
-
.config("spark.kubernetes.driver.extraClassPath", joined)
|
|
90
|
-
.config("spark.kubernetes.executor.extraClassPath", joined)
|
|
91
|
-
)
|
|
92
|
-
jar_list = remote_jars
|
|
96
|
+
jar_list = _omit_spark_distrib_jars(jar_list)
|
|
93
97
|
|
|
94
98
|
if jar_list:
|
|
95
99
|
builder = builder.config("spark.jars", ",".join(jar_list))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.2.9 → batch_analytics-0.3.1}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|