batch-analytics 0.3.23__tar.gz → 0.3.24__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/PKG-INFO +1 -1
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/pyproject.toml +1 -1
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/job_runner.py +1 -1
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/transform.py +25 -21
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/README.md +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/setup.cfg +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/utils/__init__.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.23"
|
|
7
|
+
version = "0.3.24"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -235,7 +235,7 @@ def run_pipeline(
|
|
|
235
235
|
# Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
|
|
236
236
|
# spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
|
|
237
237
|
# For ad-hoc runs without the image, set e.g.
|
|
238
|
-
# BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.
|
|
238
|
+
# BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.10.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
|
|
239
239
|
_raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
|
|
240
240
|
if _raw_ch is None or not _raw_ch.strip():
|
|
241
241
|
ch_pkgs = None
|
|
@@ -200,8 +200,9 @@ def _ch_quoted_ident(name: str) -> str:
|
|
|
200
200
|
def _ch_order_by_key_expr(col: str, df: DataFrame) -> str:
|
|
201
201
|
"""
|
|
202
202
|
Build one ORDER BY key expression. Spark nullable columns map to Nullable in CH; CH 25
|
|
203
|
-
rejects nullable sort keys unless ``allow_nullable_key`` is
|
|
204
|
-
|
|
203
|
+
rejects nullable sort keys unless ``allow_nullable_key`` is set (via
|
|
204
|
+
``tableProperty("settings.allow_nullable_key", "1")`` in the catalog path) or keys are
|
|
205
|
+
wrapped with ``assumeNotNull``.
|
|
205
206
|
|
|
206
207
|
Using ``assumeNotNull(col)`` yields a non-nullable key expression so CREATE TABLE succeeds.
|
|
207
208
|
Only wrap when Spark marks the field nullable (``assumeNotNull`` requires Nullable input).
|
|
@@ -264,25 +265,28 @@ def stage_to_clickhouse(
|
|
|
264
265
|
Write transformed data to ClickHouse staging table.
|
|
265
266
|
Separate job from transform; must complete before analytics can run.
|
|
266
267
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
(``BATCH_CLICKHOUSE_CATALOG``, default ``batch_ch``). The clickhouse-spark-runtime
|
|
270
|
-
0.8.x connector does **not** register the legacy short name ``format("clickhouse")``
|
|
271
|
-
/ ``clickhouse.DefaultSource``.
|
|
268
|
+
**Only Spark’s native ClickHouse integration** (``clickhouse-spark-runtime`` DataSourceV2):
|
|
269
|
+
no separate Python DDL client for table creation.
|
|
272
270
|
|
|
273
|
-
|
|
274
|
-
|
|
271
|
+
1. If ``BATCH_CLICKHOUSE_CATALOG`` is set (default ``batch_ch`` when the job registers
|
|
272
|
+
``ClickHouseCatalog`` in ``job_runner.create_spark_session``): ``DataFrame.writeTo`` with
|
|
273
|
+
``tableProperty("engine", "MergeTree()")``, ``order_by``, and
|
|
274
|
+
``tableProperty("settings.allow_nullable_key", "1")``, then ``createOrReplace()`` or
|
|
275
|
+
``append()``.
|
|
276
|
+
2. Otherwise (or on catalog failure): ``format("clickhouse")`` with the same connection
|
|
277
|
+
options, then JDBC as last resort.
|
|
275
278
|
|
|
276
|
-
|
|
277
|
-
|
|
279
|
+
Pin ``clickhouse-spark-runtime-3.5_2.12`` **0.10.0+** on the Spark classpath (see
|
|
280
|
+
``analytics_runner`` Dockerfile ``CLICKHOUSE_SPARK_RUNTIME_VERSION``) for ClickHouse **25.x**
|
|
281
|
+
servers.
|
|
278
282
|
|
|
279
|
-
**
|
|
280
|
-
|
|
281
|
-
``tableProperty("settings.allow_nullable_key")`` into DDL, so we also set
|
|
282
|
-
``engine`` to ``MergeTree() SETTINGS allow_nullable_key = 1``, which ClickHouse
|
|
283
|
-
applies to the created table.
|
|
283
|
+
**MergeTree ORDER BY**: ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``, or dedup columns, or first
|
|
284
|
+
column. Do not add an extra outer ``(...)`` around ``order_by`` (the connector wraps it).
|
|
284
285
|
|
|
285
|
-
|
|
286
|
+
**Nullable keys**: ``assumeNotNull(`col`)`` for nullable Spark columns in the sort key, plus
|
|
287
|
+
``settings.allow_nullable_key`` when needed.
|
|
288
|
+
|
|
289
|
+
Write mode from ``BATCH_STAGING_WRITE_MODE`` (default overwrite).
|
|
286
290
|
"""
|
|
287
291
|
n = df.count()
|
|
288
292
|
mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
|
|
@@ -300,13 +304,13 @@ def stage_to_clickhouse(
|
|
|
300
304
|
mode,
|
|
301
305
|
order_by,
|
|
302
306
|
)
|
|
303
|
-
# MergeTree
|
|
304
|
-
#
|
|
305
|
-
_mt_engine = "MergeTree() SETTINGS allow_nullable_key = 1"
|
|
307
|
+
# Plain MergeTree() only — SETTINGS belong in tableProperty("settings.*", ...) so the
|
|
308
|
+
# connector emits them after ORDER BY; inline SETTINGS in ENGINE breaks CH 25.5 parsing.
|
|
306
309
|
w2 = (
|
|
307
310
|
df.writeTo(full_name)
|
|
308
|
-
.tableProperty("engine", _mt_engine)
|
|
311
|
+
.tableProperty("engine", "MergeTree()")
|
|
309
312
|
.tableProperty("order_by", order_by)
|
|
313
|
+
.tableProperty("settings.allow_nullable_key", "1")
|
|
310
314
|
)
|
|
311
315
|
if mode == "overwrite":
|
|
312
316
|
w2.createOrReplace()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|