PyPI - batch-analytics - Versions diffs - 0.3.22__tar.gz → 0.3.24__tar.gz - Mend

batch-analytics 0.3.22.tar.gz → 0.3.24.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

{batch_analytics-0.3.22 → batch_analytics-0.3.24}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.3.22
+Version: 0.3.24
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
 Author: Litewave Analytics Team
 License: MIT

{batch_analytics-0.3.22 → batch_analytics-0.3.24}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "batch-analytics"
-version = "0.3.22"
+version = "0.3.24"
 description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
 readme = "README.md"
 requires-python = ">=3.8"

{batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/job_runner.py RENAMED Viewed

@@ -235,7 +235,7 @@ def run_pipeline(
         # Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
         # spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
         # For ad-hoc runs without the image, set e.g.
-        # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
+        # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.10.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
         _raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
         if _raw_ch is None or not _raw_ch.strip():
             ch_pkgs = None

{batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/transform.py RENAMED Viewed

@@ -192,6 +192,70 @@ def _normalize_staging_write_mode(raw: str) -> str:
     return "overwrite"
+def _ch_quoted_ident(name: str) -> str:
+    """ClickHouse identifier in ORDER BY (escape backticks)."""
+    return "`" + name.replace("`", "``") + "`"
+def _ch_order_by_key_expr(col: str, df: DataFrame) -> str:
+    """
+    Build one ORDER BY key expression. Spark nullable columns map to Nullable in CH; CH 25
+    rejects nullable sort keys unless ``allow_nullable_key`` is set (via
+    ``tableProperty("settings.allow_nullable_key", "1")`` in the catalog path) or keys are
+    wrapped with ``assumeNotNull``.
+    Using ``assumeNotNull(col)`` yields a non-nullable key expression so CREATE TABLE succeeds.
+    Only wrap when Spark marks the field nullable (``assumeNotNull`` requires Nullable input).
+    """
+    q = _ch_quoted_ident(col)
+    for f in df.schema.fields:
+        if f.name == col:
+            if f.nullable:
+                return f"assumeNotNull({q})"
+            return q
+    return q
+def _strip_outer_tuple_parens(s: str) -> str:
+    """clickhouse-spark-runtime wraps ``order_by`` in parentheses; do not also wrap here."""
+    s = s.strip()
+    if len(s) >= 2 and s[0] == "(" and s[-1] == ")":
+        return s[1:-1].strip()
+    return s
+def _merge_tree_order_by_for_staging(df: DataFrame, config: BatchAnalyticsConfig) -> str:
+    """
+    ClickHouse 25+ rejects MergeTree DDL without ORDER BY. The Spark ClickHouse catalog
+    passes this via DataFrameWriterV2.tableProperty("order_by", ...).
+    **Important:** The connector emits ``ORDER BY (<order_by>)``. Do **not** wrap the value
+    in an extra ``(...)`` or ClickHouse sees ``ORDER BY ((...))`` and raises a syntax error.
+    Resolution order:
+    1. BATCH_CLICKHOUSE_STAGING_ORDER_BY — comma-separated key expressions (no outer parens);
+       one layer of surrounding ``(...)`` is stripped if present.
+    2. BATCH_DEDUP_COLUMNS / transform.dedup_columns — comma-separated, ``assumeNotNull`` when nullable.
+    3. First column of the staging DataFrame schema (same nullable rule).
+    """
+    explicit = os.environ.get("BATCH_CLICKHOUSE_STAGING_ORDER_BY", "").strip()
+    if explicit:
+        return _strip_outer_tuple_parens(explicit)
+    dedup = (config.transform.dedup_columns or "").strip()
+    if dedup:
+        cols = [c.strip() for c in dedup.split(",") if c.strip()]
+        if cols:
+            parts = [_ch_order_by_key_expr(c, df) for c in cols]
+            return ", ".join(parts)
+    names = [f.name for f in df.schema.fields]
+    if names:
+        return _ch_order_by_key_expr(names[0], df)
+    raise ValueError(
+        "Cannot derive MergeTree ORDER BY for staging: set BATCH_CLICKHOUSE_STAGING_ORDER_BY, "
+        "or BATCH_DEDUP_COLUMNS / dedup_columns, or ensure the DataFrame has columns."
+    )
 def stage_to_clickhouse(
     spark: SparkSession,
     df: DataFrame,
@@ -201,16 +265,28 @@ def stage_to_clickhouse(
     Write transformed data to ClickHouse staging table.
     Separate job from transform; must complete before analytics can run.
-    Preferred path: Spark SQL **catalog** API (``DataFrame.writeTo``), matching
-    ``job_runner.create_spark_session`` registration of ``ClickHouseCatalog``
-    (``BATCH_CLICKHOUSE_CATALOG``, default ``batch_ch``). The clickhouse-spark-runtime
-    0.8.x connector does **not** register the legacy short name ``format("clickhouse")``
-    / ``clickhouse.DefaultSource``.
+    **Only Spark’s native ClickHouse integration** (``clickhouse-spark-runtime`` DataSourceV2):
+    no separate Python DDL client for table creation.
-    Fallback: legacy ``format("clickhouse")`` (older stacks), then JDBC (may fail on
-    ClickHouse 25+ auto-DDL without ORDER BY).
+    1. If ``BATCH_CLICKHOUSE_CATALOG`` is set (default ``batch_ch`` when the job registers
+       ``ClickHouseCatalog`` in ``job_runner.create_spark_session``): ``DataFrame.writeTo`` with
+       ``tableProperty("engine", "MergeTree()")``, ``order_by``, and
+       ``tableProperty("settings.allow_nullable_key", "1")``, then ``createOrReplace()`` or
+       ``append()``.
+    2. Otherwise (or on catalog failure): ``format("clickhouse")`` with the same connection
+       options, then JDBC as last resort.
-    Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
+    Pin ``clickhouse-spark-runtime-3.5_2.12`` **0.10.0+** on the Spark classpath (see
+    ``analytics_runner`` Dockerfile ``CLICKHOUSE_SPARK_RUNTIME_VERSION``) for ClickHouse **25.x**
+    servers.
+    **MergeTree ORDER BY**: ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``, or dedup columns, or first
+    column. Do not add an extra outer ``(...)`` around ``order_by`` (the connector wraps it).
+    **Nullable keys**: ``assumeNotNull(`col`)`` for nullable Spark columns in the sort key, plus
+    ``settings.allow_nullable_key`` when needed.
+    Write mode from ``BATCH_STAGING_WRITE_MODE`` (default overwrite).
     """
     n = df.count()
     mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
@@ -221,12 +297,21 @@ def stage_to_clickhouse(
     if cat:
         try:
             full_name = f"{cat}.{ch.database}.{tbl}"
+            order_by = _merge_tree_order_by_for_staging(df, config)
             logger.info(
-                "Staging to ClickHouse via catalog %s (mode=%s)",
+                "Staging to ClickHouse via catalog %s (mode=%s, order_by=%s)",
                 full_name,
                 mode,
+                order_by,
+            )
+            # Plain MergeTree() only — SETTINGS belong in tableProperty("settings.*", ...) so the
+            # connector emits them after ORDER BY; inline SETTINGS in ENGINE breaks CH 25.5 parsing.
+            w2 = (
+                df.writeTo(full_name)
+                .tableProperty("engine", "MergeTree()")
+                .tableProperty("order_by", order_by)
+                .tableProperty("settings.allow_nullable_key", "1")
             )
-            w2 = df.writeTo(full_name)
             if mode == "overwrite":
                 w2.createOrReplace()
             else:

{batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.3.22
+Version: 0.3.24
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
 Author: Litewave Analytics Team
 License: MIT