batch-analytics 0.3.21__tar.gz → 0.3.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/PKG-INFO +3 -1
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/pyproject.toml +3 -1
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/analytics/gluon_autogluon_train.py +7 -2
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/transform.py +126 -7
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/PKG-INFO +3 -1
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/requires.txt +2 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/README.md +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/setup.cfg +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/utils/__init__.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.21
|
|
3
|
+
Version: 0.3.23
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -25,6 +25,7 @@ Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra ==
|
|
|
25
25
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
26
26
|
Provides-Extra: autogluon
|
|
27
27
|
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "autogluon"
|
|
28
|
+
Requires-Dist: typing-extensions>=4.8.0; extra == "autogluon"
|
|
28
29
|
Requires-Dist: pandas>=1.3.0; extra == "autogluon"
|
|
29
30
|
Requires-Dist: boto3>=1.28; extra == "autogluon"
|
|
30
31
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
|
|
@@ -37,6 +38,7 @@ Requires-Dist: boto3>=1.28; extra == "full"
|
|
|
37
38
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
38
39
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
39
40
|
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "full"
|
|
41
|
+
Requires-Dist: typing-extensions>=4.8.0; extra == "full"
|
|
40
42
|
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
|
|
41
43
|
|
|
42
44
|
# Batch Analytics
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.21"
|
|
7
|
+
version = "0.3.23"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -34,6 +34,7 @@ output = [
|
|
|
34
34
|
# autogluon-tabular[lightgbm]: tabular AutoML with LightGBM only (no PyTorch from full autogluon metapackage)
|
|
35
35
|
autogluon = [
|
|
36
36
|
"autogluon-tabular[lightgbm]>=1.0,<2.0",
|
|
37
|
+
"typing-extensions>=4.8.0",
|
|
37
38
|
"pandas>=1.3.0",
|
|
38
39
|
"boto3>=1.28",
|
|
39
40
|
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
@@ -47,6 +48,7 @@ full = [
|
|
|
47
48
|
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
48
49
|
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
49
50
|
"autogluon-tabular[lightgbm]>=1.0,<2.0",
|
|
51
|
+
"typing-extensions>=4.8.0",
|
|
50
52
|
"pyarrow>=10.0.0; python_version >= '3.8'",
|
|
51
53
|
]
|
|
52
54
|
|
|
@@ -97,8 +97,13 @@ def main() -> None:
|
|
|
97
97
|
|
|
98
98
|
try:
|
|
99
99
|
from autogluon.tabular import TabularPredictor
|
|
100
|
-
except ImportError:
|
|
101
|
-
|
|
100
|
+
except ImportError as e:
|
|
101
|
+
if isinstance(e, ModuleNotFoundError) and e.name == "typing_extensions":
|
|
102
|
+
logger.exception(
|
|
103
|
+
"Install typing-extensions (required by AutoGluon): pip install 'typing-extensions>=4.8'"
|
|
104
|
+
)
|
|
105
|
+
else:
|
|
106
|
+
logger.exception("autogluon is not installed; use pip install 'batch-analytics[autogluon]'")
|
|
102
107
|
sys.exit(4)
|
|
103
108
|
|
|
104
109
|
problem_type = os.environ.get("AUTOGLUON_PROBLEM_TYPE", "binary").strip() or "binary"
|
|
@@ -192,6 +192,69 @@ def _normalize_staging_write_mode(raw: str) -> str:
|
|
|
192
192
|
return "overwrite"
|
|
193
193
|
|
|
194
194
|
|
|
195
|
+
def _ch_quoted_ident(name: str) -> str:
|
|
196
|
+
"""ClickHouse identifier in ORDER BY (escape backticks)."""
|
|
197
|
+
return "`" + name.replace("`", "``") + "`"
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _ch_order_by_key_expr(col: str, df: DataFrame) -> str:
|
|
201
|
+
"""
|
|
202
|
+
Build one ORDER BY key expression. Spark nullable columns map to Nullable in CH; CH 25
|
|
203
|
+
rejects nullable sort keys unless ``allow_nullable_key`` is applied — the Spark connector
|
|
204
|
+
often does not forward ``tableProperty("settings.allow_nullable_key")`` into DDL.
|
|
205
|
+
|
|
206
|
+
Using ``assumeNotNull(col)`` yields a non-nullable key expression so CREATE TABLE succeeds.
|
|
207
|
+
Only wrap when Spark marks the field nullable (``assumeNotNull`` requires Nullable input).
|
|
208
|
+
"""
|
|
209
|
+
q = _ch_quoted_ident(col)
|
|
210
|
+
for f in df.schema.fields:
|
|
211
|
+
if f.name == col:
|
|
212
|
+
if f.nullable:
|
|
213
|
+
return f"assumeNotNull({q})"
|
|
214
|
+
return q
|
|
215
|
+
return q
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _strip_outer_tuple_parens(s: str) -> str:
|
|
219
|
+
"""clickhouse-spark-runtime wraps ``order_by`` in parentheses; do not also wrap here."""
|
|
220
|
+
s = s.strip()
|
|
221
|
+
if len(s) >= 2 and s[0] == "(" and s[-1] == ")":
|
|
222
|
+
return s[1:-1].strip()
|
|
223
|
+
return s
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _merge_tree_order_by_for_staging(df: DataFrame, config: BatchAnalyticsConfig) -> str:
|
|
227
|
+
"""
|
|
228
|
+
ClickHouse 25+ rejects MergeTree DDL without ORDER BY. The Spark ClickHouse catalog
|
|
229
|
+
passes this via DataFrameWriterV2.tableProperty("order_by", ...).
|
|
230
|
+
|
|
231
|
+
**Important:** The connector emits ``ORDER BY (<order_by>)``. Do **not** wrap the value
|
|
232
|
+
in an extra ``(...)`` or ClickHouse sees ``ORDER BY ((...))`` and raises a syntax error.
|
|
233
|
+
|
|
234
|
+
Resolution order:
|
|
235
|
+
1. BATCH_CLICKHOUSE_STAGING_ORDER_BY — comma-separated key expressions (no outer parens);
|
|
236
|
+
one layer of surrounding ``(...)`` is stripped if present.
|
|
237
|
+
2. BATCH_DEDUP_COLUMNS / transform.dedup_columns — comma-separated, ``assumeNotNull`` when nullable.
|
|
238
|
+
3. First column of the staging DataFrame schema (same nullable rule).
|
|
239
|
+
"""
|
|
240
|
+
explicit = os.environ.get("BATCH_CLICKHOUSE_STAGING_ORDER_BY", "").strip()
|
|
241
|
+
if explicit:
|
|
242
|
+
return _strip_outer_tuple_parens(explicit)
|
|
243
|
+
dedup = (config.transform.dedup_columns or "").strip()
|
|
244
|
+
if dedup:
|
|
245
|
+
cols = [c.strip() for c in dedup.split(",") if c.strip()]
|
|
246
|
+
if cols:
|
|
247
|
+
parts = [_ch_order_by_key_expr(c, df) for c in cols]
|
|
248
|
+
return ", ".join(parts)
|
|
249
|
+
names = [f.name for f in df.schema.fields]
|
|
250
|
+
if names:
|
|
251
|
+
return _ch_order_by_key_expr(names[0], df)
|
|
252
|
+
raise ValueError(
|
|
253
|
+
"Cannot derive MergeTree ORDER BY for staging: set BATCH_CLICKHOUSE_STAGING_ORDER_BY, "
|
|
254
|
+
"or BATCH_DEDUP_COLUMNS / dedup_columns, or ensure the DataFrame has columns."
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
|
|
195
258
|
def stage_to_clickhouse(
|
|
196
259
|
spark: SparkSession,
|
|
197
260
|
df: DataFrame,
|
|
@@ -200,20 +263,76 @@ def stage_to_clickhouse(
|
|
|
200
263
|
"""
|
|
201
264
|
Write transformed data to ClickHouse staging table.
|
|
202
265
|
Separate job from transform; must complete before analytics can run.
|
|
203
|
-
|
|
266
|
+
|
|
267
|
+
Preferred path: Spark SQL **catalog** API (``DataFrame.writeTo``), matching
|
|
268
|
+
``job_runner.create_spark_session`` registration of ``ClickHouseCatalog``
|
|
269
|
+
(``BATCH_CLICKHOUSE_CATALOG``, default ``batch_ch``). The clickhouse-spark-runtime
|
|
270
|
+
0.8.x connector does **not** register the legacy short name ``format("clickhouse")``
|
|
271
|
+
/ ``clickhouse.DefaultSource``.
|
|
272
|
+
|
|
273
|
+
Fallback: legacy ``format("clickhouse")`` (older stacks), then JDBC (may fail on
|
|
274
|
+
ClickHouse 25+ auto-DDL without ORDER BY).
|
|
275
|
+
|
|
276
|
+
**MergeTree ORDER BY** (required on ClickHouse 25+): set ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``,
|
|
277
|
+
or rely on ``BATCH_DEDUP_COLUMNS`` / ``dedup_columns`` (comma-separated keys; no extra outer parens).
|
|
278
|
+
|
|
279
|
+
**Nullable sort keys**: (1) Dedup-derived ``ORDER BY`` uses ``assumeNotNull(`col`)`` when
|
|
280
|
+
Spark marks the field nullable. (2) The Spark connector often **does not** pass
|
|
281
|
+
``tableProperty("settings.allow_nullable_key")`` into DDL, so we also set
|
|
282
|
+
``engine`` to ``MergeTree() SETTINGS allow_nullable_key = 1``, which ClickHouse
|
|
283
|
+
applies to the created table.
|
|
284
|
+
|
|
204
285
|
Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
|
|
205
286
|
"""
|
|
206
287
|
n = df.count()
|
|
207
288
|
mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
|
|
289
|
+
ch = config.clickhouse
|
|
290
|
+
tbl = config.transform.staging_table
|
|
291
|
+
cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
|
|
292
|
+
|
|
293
|
+
if cat:
|
|
294
|
+
try:
|
|
295
|
+
full_name = f"{cat}.{ch.database}.{tbl}"
|
|
296
|
+
order_by = _merge_tree_order_by_for_staging(df, config)
|
|
297
|
+
logger.info(
|
|
298
|
+
"Staging to ClickHouse via catalog %s (mode=%s, order_by=%s)",
|
|
299
|
+
full_name,
|
|
300
|
+
mode,
|
|
301
|
+
order_by,
|
|
302
|
+
)
|
|
303
|
+
# MergeTree SETTINGS in ENGINE: connector often ignores tableProperty("settings.*")
|
|
304
|
+
# for CREATE TABLE; CH 25 then reports allow_nullable_key disabled.
|
|
305
|
+
_mt_engine = "MergeTree() SETTINGS allow_nullable_key = 1"
|
|
306
|
+
w2 = (
|
|
307
|
+
df.writeTo(full_name)
|
|
308
|
+
.tableProperty("engine", _mt_engine)
|
|
309
|
+
.tableProperty("order_by", order_by)
|
|
310
|
+
)
|
|
311
|
+
if mode == "overwrite":
|
|
312
|
+
w2.createOrReplace()
|
|
313
|
+
else:
|
|
314
|
+
w2.append()
|
|
315
|
+
logger.info(
|
|
316
|
+
"Staged data to ClickHouse %s.%s (%d rows)",
|
|
317
|
+
ch.database,
|
|
318
|
+
tbl,
|
|
319
|
+
n,
|
|
320
|
+
)
|
|
321
|
+
return
|
|
322
|
+
except Exception as e:
|
|
323
|
+
logger.warning(
|
|
324
|
+
"ClickHouse catalog write failed (%s), trying legacy format/jdbc",
|
|
325
|
+
e,
|
|
326
|
+
)
|
|
327
|
+
|
|
208
328
|
try:
|
|
209
|
-
ch = config.clickhouse
|
|
210
329
|
writer = (
|
|
211
330
|
df.write.format("clickhouse")
|
|
212
331
|
.option("host", ch.host)
|
|
213
332
|
.option("protocol", ch.protocol)
|
|
214
333
|
.option("http_port", str(ch.port))
|
|
215
334
|
.option("database", ch.database)
|
|
216
|
-
.option("table",
|
|
335
|
+
.option("table", tbl)
|
|
217
336
|
.option("user", ch.user)
|
|
218
337
|
.mode(mode)
|
|
219
338
|
)
|
|
@@ -221,17 +340,17 @@ def stage_to_clickhouse(
|
|
|
221
340
|
writer = writer.option("password", ch.password)
|
|
222
341
|
writer.save()
|
|
223
342
|
except Exception as e:
|
|
224
|
-
logger.warning("ClickHouse
|
|
343
|
+
logger.warning("ClickHouse legacy format failed (%s), using JDBC", e)
|
|
225
344
|
df.write.jdbc(
|
|
226
345
|
config.clickhouse.jdbc_url,
|
|
227
|
-
|
|
346
|
+
tbl,
|
|
228
347
|
mode=mode,
|
|
229
348
|
properties=config.clickhouse.jdbc_properties,
|
|
230
349
|
)
|
|
231
350
|
logger.info(
|
|
232
351
|
"Staged data to ClickHouse %s.%s (%d rows)",
|
|
233
|
-
|
|
234
|
-
|
|
352
|
+
ch.database,
|
|
353
|
+
tbl,
|
|
235
354
|
n,
|
|
236
355
|
)
|
|
237
356
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.21
|
|
3
|
+
Version: 0.3.23
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -25,6 +25,7 @@ Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra ==
|
|
|
25
25
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
26
26
|
Provides-Extra: autogluon
|
|
27
27
|
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "autogluon"
|
|
28
|
+
Requires-Dist: typing-extensions>=4.8.0; extra == "autogluon"
|
|
28
29
|
Requires-Dist: pandas>=1.3.0; extra == "autogluon"
|
|
29
30
|
Requires-Dist: boto3>=1.28; extra == "autogluon"
|
|
30
31
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
|
|
@@ -37,6 +38,7 @@ Requires-Dist: boto3>=1.28; extra == "full"
|
|
|
37
38
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
38
39
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
39
40
|
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "full"
|
|
41
|
+
Requires-Dist: typing-extensions>=4.8.0; extra == "full"
|
|
40
42
|
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
|
|
41
43
|
|
|
42
44
|
# Batch Analytics
|
|
@@ -2,6 +2,7 @@ numpy>=1.19.0
|
|
|
2
2
|
|
|
3
3
|
[autogluon]
|
|
4
4
|
autogluon-tabular[lightgbm]<2.0,>=1.0
|
|
5
|
+
typing-extensions>=4.8.0
|
|
5
6
|
pandas>=1.3.0
|
|
6
7
|
boto3>=1.28
|
|
7
8
|
|
|
@@ -30,6 +31,7 @@ pyspark<3.6,>=3.4
|
|
|
30
31
|
scipy>=1.5.0
|
|
31
32
|
boto3>=1.28
|
|
32
33
|
autogluon-tabular[lightgbm]<2.0,>=1.0
|
|
34
|
+
typing-extensions>=4.8.0
|
|
33
35
|
|
|
34
36
|
[full:python_version < "3.9"]
|
|
35
37
|
clickhouse-connect<0.9,>=0.7
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.21 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|