PyPI - batch-analytics - Versions diffs - 0.3.23__tar.gz → 0.3.24__tar.gz - Mend

batch-analytics 0.3.23 (tar.gz) → 0.3.24 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

{batch_analytics-0.3.23 → batch_analytics-0.3.24}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.3.23
+Version: 0.3.24
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
 Author: Litewave Analytics Team
 License: MIT

{batch_analytics-0.3.23 → batch_analytics-0.3.24}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "batch-analytics"
-version = "0.3.23"
+version = "0.3.24"
 description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
 readme = "README.md"
 requires-python = ">=3.8"

{batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/job_runner.py RENAMED Viewed

@@ -235,7 +235,7 @@ def run_pipeline(
         # Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
         # spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
         # For ad-hoc runs without the image, set e.g.
-        # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
+        # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.10.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
         _raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
         if _raw_ch is None or not _raw_ch.strip():
             ch_pkgs = None

{batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/transform.py RENAMED Viewed

@@ -200,8 +200,9 @@ def _ch_quoted_ident(name: str) -> str:
 def _ch_order_by_key_expr(col: str, df: DataFrame) -> str:
     """
     Build one ORDER BY key expression. Spark nullable columns map to Nullable in CH; CH 25
-    rejects nullable sort keys unless ``allow_nullable_key`` is applied — the Spark connector
-    often does not forward ``tableProperty("settings.allow_nullable_key")`` into DDL.
+    rejects nullable sort keys unless ``allow_nullable_key`` is set (via
+    ``tableProperty("settings.allow_nullable_key", "1")`` in the catalog path) or keys are
+    wrapped with ``assumeNotNull``.
     Using ``assumeNotNull(col)`` yields a non-nullable key expression so CREATE TABLE succeeds.
     Only wrap when Spark marks the field nullable (``assumeNotNull`` requires Nullable input).
@@ -264,25 +265,28 @@ def stage_to_clickhouse(
     Write transformed data to ClickHouse staging table.
     Separate job from transform; must complete before analytics can run.
-    Preferred path: Spark SQL **catalog** API (``DataFrame.writeTo``), matching
-    ``job_runner.create_spark_session`` registration of ``ClickHouseCatalog``
-    (``BATCH_CLICKHOUSE_CATALOG``, default ``batch_ch``). The clickhouse-spark-runtime
-    0.8.x connector does **not** register the legacy short name ``format("clickhouse")``
-    / ``clickhouse.DefaultSource``.
+    **Only Spark’s native ClickHouse integration** (``clickhouse-spark-runtime`` DataSourceV2):
+    no separate Python DDL client for table creation.
-    Fallback: legacy ``format("clickhouse")`` (older stacks), then JDBC (may fail on
-    ClickHouse 25+ auto-DDL without ORDER BY).
+    1. If ``BATCH_CLICKHOUSE_CATALOG`` is set (default ``batch_ch`` when the job registers
+       ``ClickHouseCatalog`` in ``job_runner.create_spark_session``): ``DataFrame.writeTo`` with
+       ``tableProperty("engine", "MergeTree()")``, ``order_by``, and
+       ``tableProperty("settings.allow_nullable_key", "1")``, then ``createOrReplace()`` or
+       ``append()``.
+    2. Otherwise (or on catalog failure): ``format("clickhouse")`` with the same connection
+       options, then JDBC as last resort.
-    **MergeTree ORDER BY** (required on ClickHouse 25+): set ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``,
-    or rely on ``BATCH_DEDUP_COLUMNS`` / ``dedup_columns`` (comma-separated keys; no extra outer parens).
+    Pin ``clickhouse-spark-runtime-3.5_2.12`` **0.10.0+** on the Spark classpath (see
+    ``analytics_runner`` Dockerfile ``CLICKHOUSE_SPARK_RUNTIME_VERSION``) for ClickHouse **25.x**
+    servers.
-    **Nullable sort keys**: (1) Dedup-derived ``ORDER BY`` uses ``assumeNotNull(`col`)`` when
-    Spark marks the field nullable. (2) The Spark connector often **does not** pass
-    ``tableProperty("settings.allow_nullable_key")`` into DDL, so we also set
-    ``engine`` to ``MergeTree() SETTINGS allow_nullable_key = 1``, which ClickHouse
-    applies to the created table.
+    **MergeTree ORDER BY**: ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``, or dedup columns, or first
+    column. Do not add an extra outer ``(...)`` around ``order_by`` (the connector wraps it).
-    Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
+    **Nullable keys**: ``assumeNotNull(`col`)`` for nullable Spark columns in the sort key, plus
+    ``settings.allow_nullable_key`` when needed.
+    Write mode from ``BATCH_STAGING_WRITE_MODE`` (default overwrite).
     """
     n = df.count()
     mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
@@ -300,13 +304,13 @@ def stage_to_clickhouse(
                 mode,
                 order_by,
             )
-            # MergeTree SETTINGS in ENGINE: connector often ignores tableProperty("settings.*")
-            # for CREATE TABLE; CH 25 then reports allow_nullable_key disabled.
-            _mt_engine = "MergeTree() SETTINGS allow_nullable_key = 1"
+            # Plain MergeTree() only — SETTINGS belong in tableProperty("settings.*", ...) so the
+            # connector emits them after ORDER BY; inline SETTINGS in ENGINE breaks CH 25.5 parsing.
             w2 = (
                 df.writeTo(full_name)
-                .tableProperty("engine", _mt_engine)
+                .tableProperty("engine", "MergeTree()")
                 .tableProperty("order_by", order_by)
+                .tableProperty("settings.allow_nullable_key", "1")
             )
             if mode == "overwrite":
                 w2.createOrReplace()

{batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.3.23
+Version: 0.3.24
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
 Author: Litewave Analytics Team
 License: MIT