batch-analytics 0.3.13__tar.gz → 0.3.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/PKG-INFO +1 -1
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/pyproject.toml +1 -1
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/config.py +13 -4
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/extract.py +48 -2
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/transform.py +17 -5
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/README.md +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/setup.cfg +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.13"
|
|
7
|
+
version = "0.3.14"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -29,16 +29,23 @@ class ClickHouseConfig:
|
|
|
29
29
|
|
|
30
30
|
@property
|
|
31
31
|
def jdbc_properties(self) -> dict:
|
|
32
|
+
"""JDBC connection properties for Spark.
|
|
33
|
+
|
|
34
|
+
clickhouse-jdbc (clickhouse-java v0.6+) rejects legacy keys such as
|
|
35
|
+
``compress_algorithm`` (ClientMisconfigurationException). Prefer JDBC URL
|
|
36
|
+
query parameters for compression behavior. To force the old property for
|
|
37
|
+
legacy stacks, set CLICKHOUSE_JDBC_LEGACY_COMPRESS_ALGORITHM (e.g. ``none``).
|
|
38
|
+
"""
|
|
32
39
|
props = {
|
|
33
40
|
"user": self.user,
|
|
34
41
|
"driver": "com.clickhouse.jdbc.ClickHouseDriver",
|
|
35
|
-
# Match Spark read codec default: avoids JDBC LZ4/gzip mismatches with server HTTP compression
|
|
36
|
-
"compress_algorithm": os.environ.get(
|
|
37
|
-
"CLICKHOUSE_JDBC_COMPRESS_ALGORITHM", "none"
|
|
38
|
-
),
|
|
39
42
|
}
|
|
40
43
|
if self.password:
|
|
41
44
|
props["password"] = self.password
|
|
45
|
+
# Opt-in legacy property for older shaded JDBC stacks only.
|
|
46
|
+
legacy = os.environ.get("CLICKHOUSE_JDBC_LEGACY_COMPRESS_ALGORITHM", "").strip()
|
|
47
|
+
if legacy:
|
|
48
|
+
props["compress_algorithm"] = legacy
|
|
42
49
|
return props
|
|
43
50
|
|
|
44
51
|
|
|
@@ -78,6 +85,8 @@ class TransformConfig:
|
|
|
78
85
|
staging_format: str = os.environ.get("BATCH_STAGING_FORMAT", "clickhouse")
|
|
79
86
|
# Staging table name in ClickHouse (when format=clickhouse)
|
|
80
87
|
staging_table: str = os.environ.get("BATCH_STAGING_TABLE", "analytics_staging")
|
|
88
|
+
# Spark save mode for ClickHouse staging (and path staging): overwrite | append
|
|
89
|
+
staging_write_mode: str = os.environ.get("BATCH_STAGING_WRITE_MODE", "overwrite")
|
|
81
90
|
# Source column holding a JSON object or Python dict string; every top-level key becomes a new String column
|
|
82
91
|
# (see transform.expand_kv_blob_column). Example: add_dimensions {'anchor_id':'...','lot':'A1'}
|
|
83
92
|
add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
|
|
@@ -4,7 +4,8 @@ Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDB
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
|
-
|
|
7
|
+
import os
|
|
8
|
+
from typing import Dict, List, Optional, Tuple
|
|
8
9
|
|
|
9
10
|
from pyspark.sql import DataFrame, SparkSession
|
|
10
11
|
from pyspark.sql.functions import col
|
|
@@ -59,6 +60,48 @@ def _apply_extract_filter(df: DataFrame, config: BatchAnalyticsConfig) -> DataFr
|
|
|
59
60
|
return filtered
|
|
60
61
|
|
|
61
62
|
|
|
63
|
+
def _parse_ch_database_table(table: str, default_database: str) -> Tuple[str, str]:
|
|
64
|
+
"""
|
|
65
|
+
Resolve ``table`` reference to (database, table_name).
|
|
66
|
+
|
|
67
|
+
``batch_metric_facts`` → (default_database, batch_metric_facts)
|
|
68
|
+
``analytics.batch_metric_facts`` → (analytics, batch_metric_facts)
|
|
69
|
+
"""
|
|
70
|
+
t = (table or "").strip()
|
|
71
|
+
if not t:
|
|
72
|
+
return default_database, t
|
|
73
|
+
if "." in t and not t.startswith("("):
|
|
74
|
+
db, tbl = t.split(".", 1)
|
|
75
|
+
db, tbl = db.strip(), tbl.strip()
|
|
76
|
+
if db and tbl:
|
|
77
|
+
return db, tbl
|
|
78
|
+
return default_database, t
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _read_via_catalog(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
|
|
82
|
+
"""
|
|
83
|
+
Read via Spark SQL catalog (ClickHouseCatalog), registered in job_runner.create_spark_session.
|
|
84
|
+
|
|
85
|
+
clickhouse-spark-runtime does **not** register legacy short name ``format(\"clickhouse\")`` /
|
|
86
|
+
``clickhouse.DefaultSource``; catalog + ``spark.table(catalog.db.table)`` is the supported path.
|
|
87
|
+
"""
|
|
88
|
+
cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
|
|
89
|
+
if not cat:
|
|
90
|
+
return None
|
|
91
|
+
db, tbl = _parse_ch_database_table(table, cfg.clickhouse.database)
|
|
92
|
+
ident = f"{cat}.{db}.{tbl}"
|
|
93
|
+
try:
|
|
94
|
+
return spark.table(ident)
|
|
95
|
+
except Exception as e:
|
|
96
|
+
logger.warning(
|
|
97
|
+
"Catalog read failed for %s (%s): %s. Trying other readers.",
|
|
98
|
+
ident,
|
|
99
|
+
table,
|
|
100
|
+
e,
|
|
101
|
+
)
|
|
102
|
+
return None
|
|
103
|
+
|
|
104
|
+
|
|
62
105
|
def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
|
|
63
106
|
"""
|
|
64
107
|
Read from ClickHouse using the native format API (clickhouse-spark-runtime).
|
|
@@ -106,7 +149,10 @@ def extract_table(
|
|
|
106
149
|
Uses native connector if configured, otherwise JDBC.
|
|
107
150
|
"""
|
|
108
151
|
if config.extract.use_native_connector:
|
|
109
|
-
|
|
152
|
+
# Prefer catalog (matches clickhouse-spark-runtime); avoid legacy DefaultSource path.
|
|
153
|
+
df = _read_via_catalog(spark, config, table)
|
|
154
|
+
if df is None:
|
|
155
|
+
df = _read_via_format(spark, config, table)
|
|
110
156
|
if df is None:
|
|
111
157
|
df = _read_via_jdbc(spark, config, table)
|
|
112
158
|
else:
|
|
@@ -183,6 +183,15 @@ def transform(
|
|
|
183
183
|
return remove_duplicates(transformed, key_columns=dedup_cols)
|
|
184
184
|
|
|
185
185
|
|
|
186
|
+
def _normalize_staging_write_mode(raw: str) -> str:
|
|
187
|
+
"""Spark DataFrameWriter mode: overwrite (replace table contents) or append."""
|
|
188
|
+
m = (raw or "overwrite").strip().lower()
|
|
189
|
+
if m in ("overwrite", "append"):
|
|
190
|
+
return m
|
|
191
|
+
logger.warning("Invalid BATCH_STAGING_WRITE_MODE=%r; using overwrite", raw)
|
|
192
|
+
return "overwrite"
|
|
193
|
+
|
|
194
|
+
|
|
186
195
|
def stage_to_clickhouse(
|
|
187
196
|
spark: SparkSession,
|
|
188
197
|
df: DataFrame,
|
|
@@ -192,8 +201,10 @@ def stage_to_clickhouse(
|
|
|
192
201
|
Write transformed data to ClickHouse staging table.
|
|
193
202
|
Separate job from transform; must complete before analytics can run.
|
|
194
203
|
Uses native connector if available, else JDBC.
|
|
204
|
+
Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
|
|
195
205
|
"""
|
|
196
206
|
n = df.count()
|
|
207
|
+
mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
|
|
197
208
|
try:
|
|
198
209
|
ch = config.clickhouse
|
|
199
210
|
writer = (
|
|
@@ -204,7 +215,7 @@ def stage_to_clickhouse(
|
|
|
204
215
|
.option("database", ch.database)
|
|
205
216
|
.option("table", config.transform.staging_table)
|
|
206
217
|
.option("user", ch.user)
|
|
207
|
-
.mode("overwrite")
|
|
218
|
+
.mode(mode)
|
|
208
219
|
)
|
|
209
220
|
if ch.password:
|
|
210
221
|
writer = writer.option("password", ch.password)
|
|
@@ -214,7 +225,7 @@ def stage_to_clickhouse(
|
|
|
214
225
|
df.write.jdbc(
|
|
215
226
|
config.clickhouse.jdbc_url,
|
|
216
227
|
config.transform.staging_table,
|
|
217
|
-
mode="overwrite",
|
|
228
|
+
mode=mode,
|
|
218
229
|
properties=config.clickhouse.jdbc_properties,
|
|
219
230
|
)
|
|
220
231
|
logger.info(
|
|
@@ -233,14 +244,15 @@ def stage_to_path(
|
|
|
233
244
|
"""Write transformed data to parquet/delta (for local dev or intermediate storage)."""
|
|
234
245
|
path = config.transform.staging_path
|
|
235
246
|
fmt = config.transform.staging_format
|
|
247
|
+
mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
|
|
236
248
|
if fmt == "parquet":
|
|
237
|
-
df.write.mode("overwrite").parquet(path)
|
|
249
|
+
df.write.mode(mode).parquet(path)
|
|
238
250
|
logger.info("Staged data to %s (parquet)", path)
|
|
239
251
|
elif fmt == "delta":
|
|
240
|
-
df.write.format("delta").mode("overwrite").save(path)
|
|
252
|
+
df.write.format("delta").mode(mode).save(path)
|
|
241
253
|
logger.info("Staged data to %s (delta)", path)
|
|
242
254
|
else:
|
|
243
|
-
df.write.format(fmt).mode("overwrite").save(path)
|
|
255
|
+
df.write.format(fmt).mode(mode).save(path)
|
|
244
256
|
logger.info("Staged data to %s (%s)", path, fmt)
|
|
245
257
|
|
|
246
258
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|