batch-analytics 0.2.5__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/PKG-INFO +1 -1
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/pyproject.toml +1 -1
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/config.py +8 -1
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/job_runner.py +14 -2
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/README.md +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/setup.cfg +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.2.5"
|
|
7
|
+
version = "0.2.7"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -29,7 +29,14 @@ class ClickHouseConfig:
|
|
|
29
29
|
|
|
30
30
|
@property
|
|
31
31
|
def jdbc_properties(self) -> dict:
|
|
32
|
-
props = {
|
|
32
|
+
props = {
|
|
33
|
+
"user": self.user,
|
|
34
|
+
"driver": "com.clickhouse.jdbc.ClickHouseDriver",
|
|
35
|
+
# Match Spark read codec default: avoids JDBC LZ4/gzip mismatches with server HTTP compression
|
|
36
|
+
"compress_algorithm": os.environ.get(
|
|
37
|
+
"CLICKHOUSE_JDBC_COMPRESS_ALGORITHM", "none"
|
|
38
|
+
),
|
|
39
|
+
}
|
|
33
40
|
if self.password:
|
|
34
41
|
props["password"] = self.password
|
|
35
42
|
return props
|
|
@@ -92,6 +92,16 @@ def create_spark_session(
|
|
|
92
92
|
builder = builder.config(f"spark.sql.catalog.{ch_cat}.password", ch.password)
|
|
93
93
|
if ch.protocol.lower() == "https":
|
|
94
94
|
builder = builder.config(f"spark.sql.catalog.{ch_cat}.option.ssl", "true")
|
|
95
|
+
# Avoid Lz4InputStream "Magic is not correct" when server HTTP compression != client expectation
|
|
96
|
+
# (see clickhouse-java#1449 / server enable_http_compression user defaults).
|
|
97
|
+
read_codec = os.environ.get(
|
|
98
|
+
"SPARK_CLICKHOUSE_READ_COMPRESSION_CODEC", "none"
|
|
99
|
+
).strip()
|
|
100
|
+
if read_codec:
|
|
101
|
+
builder = builder.config(
|
|
102
|
+
"spark.clickhouse.read.compression.codec",
|
|
103
|
+
read_codec,
|
|
104
|
+
)
|
|
95
105
|
|
|
96
106
|
if cfg.master.startswith("k8s://"):
|
|
97
107
|
driver_host = socket.gethostbyname(socket.gethostname())
|
|
@@ -166,10 +176,12 @@ def run_pipeline(
|
|
|
166
176
|
# Override: BATCH_SPARK_CLICKHOUSE_PACKAGES=maven coords / https jar URLs (comma-sep) or "" for SPARK_JARS only.
|
|
167
177
|
_raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
|
|
168
178
|
if _raw_ch is None:
|
|
179
|
+
# Shaded *-all.jar; pin matches Docker image / SPARK_JARS (0.9.x; no 0.10.x on Central for this artifact).
|
|
180
|
+
_ch_jdbc = os.environ.get("BATCH_CLICKHOUSE_JDBC_VERSION", "0.9.8").strip()
|
|
169
181
|
ch_pkgs = (
|
|
170
182
|
"com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,"
|
|
171
|
-
"https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/
|
|
172
|
-
"clickhouse-jdbc-
|
|
183
|
+
f"https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/{_ch_jdbc}/"
|
|
184
|
+
f"clickhouse-jdbc-{_ch_jdbc}-all.jar"
|
|
173
185
|
)
|
|
174
186
|
elif not _raw_ch.strip():
|
|
175
187
|
ch_pkgs = None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.2.5 → batch_analytics-0.2.7}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|