batch-analytics 0.2.3__tar.gz → 0.2.5__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/PKG-INFO +1 -1
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/pyproject.toml +1 -1
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/extract.py +2 -1
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/job_runner.py +26 -3
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/transform.py +21 -20
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/README.md +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/setup.cfg +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.2.3"
|
|
7
|
+
version = "0.2.5"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -3,6 +3,7 @@ Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDB
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
import os
|
|
6
7
|
from typing import Dict, List, Optional
|
|
7
8
|
|
|
8
9
|
from pyspark.sql import DataFrame, SparkSession
|
|
@@ -59,7 +60,7 @@ def extract_table(
|
|
|
59
60
|
Uses native connector if configured, otherwise JDBC.
|
|
60
61
|
"""
|
|
61
62
|
if config.extract.use_native_connector:
|
|
62
|
-
df =
|
|
63
|
+
df = _read_via_catalog(spark, config, table)
|
|
63
64
|
if df is None:
|
|
64
65
|
df = _read_via_jdbc(spark, config, table)
|
|
65
66
|
else:
|
|
@@ -72,6 +72,27 @@ def create_spark_session(
|
|
|
72
72
|
if packages:
|
|
73
73
|
builder = builder.config("spark.jars.packages", ",".join(packages))
|
|
74
74
|
|
|
75
|
+
# clickhouse-spark-runtime does not register legacy clickhouse.DefaultSource; the connector
|
|
76
|
+
# expects a Spark catalog (see ClickHouse docs). Enables spark.table("catalog.db.table").
|
|
77
|
+
ch_cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
|
|
78
|
+
if ch_cat:
|
|
79
|
+
ch = config.clickhouse
|
|
80
|
+
builder = (
|
|
81
|
+
builder.config(
|
|
82
|
+
f"spark.sql.catalog.{ch_cat}",
|
|
83
|
+
"com.clickhouse.spark.ClickHouseCatalog",
|
|
84
|
+
)
|
|
85
|
+
.config(f"spark.sql.catalog.{ch_cat}.host", ch.host)
|
|
86
|
+
.config(f"spark.sql.catalog.{ch_cat}.protocol", ch.protocol)
|
|
87
|
+
.config(f"spark.sql.catalog.{ch_cat}.http_port", str(ch.port))
|
|
88
|
+
.config(f"spark.sql.catalog.{ch_cat}.user", ch.user)
|
|
89
|
+
.config(f"spark.sql.catalog.{ch_cat}.database", ch.database)
|
|
90
|
+
)
|
|
91
|
+
if ch.password:
|
|
92
|
+
builder = builder.config(f"spark.sql.catalog.{ch_cat}.password", ch.password)
|
|
93
|
+
if ch.protocol.lower() == "https":
|
|
94
|
+
builder = builder.config(f"spark.sql.catalog.{ch_cat}.option.ssl", "true")
|
|
95
|
+
|
|
75
96
|
if cfg.master.startswith("k8s://"):
|
|
76
97
|
driver_host = socket.gethostbyname(socket.gethostname())
|
|
77
98
|
builder = (
|
|
@@ -140,13 +161,15 @@ def run_pipeline(
|
|
|
140
161
|
run_id = str(uuid.uuid4())[:8]
|
|
141
162
|
|
|
142
163
|
if spark is None:
|
|
143
|
-
# Native format("clickhouse") needs clickhouse-spark-runtime; JDBC
|
|
144
|
-
#
|
|
164
|
+
# Native format("clickhouse") needs clickhouse-spark-runtime; JDBC needs shaded clickhouse-jdbc (*-all),
|
|
165
|
+
# not the thin Maven artifact: thin JAR lacks HttpClient 5 (ClassicHttpRequest).
|
|
166
|
+
# Override: BATCH_SPARK_CLICKHOUSE_PACKAGES=maven coords / https jar URLs (comma-sep) or "" for SPARK_JARS only.
|
|
145
167
|
_raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
|
|
146
168
|
if _raw_ch is None:
|
|
147
169
|
ch_pkgs = (
|
|
148
170
|
"com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,"
|
|
149
|
-
"com
|
|
171
|
+
"https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.6.2/"
|
|
172
|
+
"clickhouse-jdbc-0.6.2-all.jar"
|
|
150
173
|
)
|
|
151
174
|
elif not _raw_ch.strip():
|
|
152
175
|
ch_pkgs = None
|
|
@@ -3,6 +3,7 @@ Transform stage: Clean data (remove duplicates), extract add_dimension, and stag
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
import os
|
|
6
7
|
from typing import Optional, Sequence
|
|
7
8
|
|
|
8
9
|
from pyspark.sql import DataFrame, SparkSession
|
|
@@ -170,24 +171,24 @@ def load_staged(
|
|
|
170
171
|
if fmt == "delta":
|
|
171
172
|
return spark.read.format("delta").load(staging_path)
|
|
172
173
|
if fmt == "clickhouse":
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
.
|
|
179
|
-
|
|
180
|
-
.
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
174
|
+
ch = config.clickhouse
|
|
175
|
+
tbl = config.transform.staging_table
|
|
176
|
+
cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
|
|
177
|
+
if cat:
|
|
178
|
+
try:
|
|
179
|
+
return spark.table(f"{cat}.{ch.database}.{tbl}")
|
|
180
|
+
except Exception as e:
|
|
181
|
+
logger.warning(
|
|
182
|
+
"load_staged: catalog table %s.%s.%s failed (%s), using JDBC",
|
|
183
|
+
cat,
|
|
184
|
+
ch.database,
|
|
185
|
+
tbl,
|
|
186
|
+
e,
|
|
187
|
+
)
|
|
188
|
+
dbtable = f"(SELECT * FROM `{ch.database}`.`{tbl}`) AS _stg"
|
|
189
|
+
return spark.read.jdbc(
|
|
190
|
+
ch.jdbc_url,
|
|
191
|
+
dbtable,
|
|
192
|
+
properties=ch.jdbc_properties,
|
|
193
|
+
)
|
|
193
194
|
return spark.read.format(fmt).load(staging_path)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.2.3 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|