PyPI - batch-analytics - Versions diffs - 0.3.23__tar.gz → 0.3.26__tar.gz - Mend

batch-analytics 0.3.23__tar.gz → 0.3.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{batch_analytics-0.3.23 → batch_analytics-0.3.26}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.3.23
+Version: 0.3.26
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
 Author: Litewave Analytics Team
 License: MIT

{batch_analytics-0.3.23 → batch_analytics-0.3.26}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "batch-analytics"
-version = "0.3.23"
+version = "0.3.26"
 description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
 readme = "README.md"
 requires-python = ">=3.8"

{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/t_test.py RENAMED Viewed

@@ -146,7 +146,10 @@ def _run_one_way_anova(
     N = sum(g["n"] for g in groups)
     if N <= k:
         raise ValueError(
-            f"ANOVA needs more observations than groups (N={N}, k={k})"
+            f"ANOVA needs total observations N > number of groups k (got N={N}, k={k}). "
+            "Common cause: exactly one row per group (e.g. one raw-material charge per batch per material), "
+            "so there is no within-group residual. Use data with replicates per group, a different group column, "
+            "or compare batches using a table with many rows per batch (e.g. operations or equipment_usage)."
         )
     grand_mean = sum(g["n"] * g["mean"] for g in groups) / N

{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/config.py RENAMED Viewed

@@ -92,6 +92,8 @@ class TransformConfig:
     add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
     # Legacy: no longer used; output column names match JSON keys (e.g. anchor_id). Kept for env compatibility.
     anchor_id_column: str = os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
+    # JSON object: {"new_col": "Spark SQL expression"} applied after KV expansion, before dedupe.
+    expr_columns_json: str = os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
 @dataclass

{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/job_runner.py RENAMED Viewed

@@ -235,7 +235,7 @@ def run_pipeline(
         # Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
         # spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
         # For ad-hoc runs without the image, set e.g.
-        # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
+        # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.10.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
         _raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
         if _raw_ch is None or not _raw_ch.strip():
             ch_pkgs = None

{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/transform.py RENAMED Viewed

@@ -10,7 +10,7 @@ import re
 from typing import Any, Dict, List, Optional, Sequence, Set
 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import col, explode, map_keys, udf
+from pyspark.sql.functions import col, explode, expr as spark_expr, map_keys, udf
 from pyspark.sql.types import MapType, StringType
 from .config import BatchAnalyticsConfig
@@ -165,16 +165,49 @@ def remove_duplicates(
     return df_cleaned
+def apply_spark_expr_columns(
+    df: DataFrame,
+    config: BatchAnalyticsConfig,
+) -> DataFrame:
+    """
+    Add or replace columns from Spark SQL expressions (``BATCH_TRANSFORM_EXPR_COLUMNS``).
+    Value must be a JSON object mapping **output column name** → **expression** (same dialect as
+    ``selectExpr``), e.g. ``{"y": "cast(duration_minutes as double)", "x": "cast(regexp_extract(operation_id, '([0-9]+)$', 1) as double)"}`` (Java regex; prefer trailing digits to avoid hyphen/range issues in patterns like ``ETC-1-OP-…``).
+    """
+    raw = (config.transform.expr_columns_json or "").strip()
+    if not raw:
+        return df
+    try:
+        mapping = json.loads(raw)
+    except json.JSONDecodeError as e:
+        raise ValueError(
+            f"BATCH_TRANSFORM_EXPR_COLUMNS must be valid JSON object: {e}"
+        ) from e
+    if not isinstance(mapping, dict):
+        raise ValueError("BATCH_TRANSFORM_EXPR_COLUMNS must be a JSON object of column -> sql_expr")
+    out = df
+    for name, sql in mapping.items():
+        col_name = str(name).strip()
+        expr_sql = str(sql).strip()
+        if not col_name or not expr_sql:
+            continue
+        out = out.withColumn(col_name, spark_expr(expr_sql))
+    return out
 def transform(
     df: DataFrame,
     config: BatchAnalyticsConfig,
 ) -> DataFrame:
     """
     Apply transformation only: (1) expand JSON/KV blob column into one column per top-level key,
-    (2) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
+    (2) optional Spark SQL expression columns (``BATCH_TRANSFORM_EXPR_COLUMNS``),
+    (3) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
     Does not write anywhere. Use stage_to_clickhouse() separately to persist.
     """
     transformed = expand_kv_blob_column(df, config)
+    transformed = apply_spark_expr_columns(transformed, config)
     dedup_cols = (
         [c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
         if config.transform.dedup_columns
@@ -200,8 +233,9 @@ def _ch_quoted_ident(name: str) -> str:
 def _ch_order_by_key_expr(col: str, df: DataFrame) -> str:
     """
     Build one ORDER BY key expression. Spark nullable columns map to Nullable in CH; CH 25
-    rejects nullable sort keys unless ``allow_nullable_key`` is applied — the Spark connector
-    often does not forward ``tableProperty("settings.allow_nullable_key")`` into DDL.
+    rejects nullable sort keys unless ``allow_nullable_key`` is set (via
+    ``tableProperty("settings.allow_nullable_key", "1")`` in the catalog path) or keys are
+    wrapped with ``assumeNotNull``.
     Using ``assumeNotNull(col)`` yields a non-nullable key expression so CREATE TABLE succeeds.
     Only wrap when Spark marks the field nullable (``assumeNotNull`` requires Nullable input).
@@ -264,25 +298,28 @@ def stage_to_clickhouse(
     Write transformed data to ClickHouse staging table.
     Separate job from transform; must complete before analytics can run.
-    Preferred path: Spark SQL **catalog** API (``DataFrame.writeTo``), matching
-    ``job_runner.create_spark_session`` registration of ``ClickHouseCatalog``
-    (``BATCH_CLICKHOUSE_CATALOG``, default ``batch_ch``). The clickhouse-spark-runtime
-    0.8.x connector does **not** register the legacy short name ``format("clickhouse")``
-    / ``clickhouse.DefaultSource``.
+    **Only Spark’s native ClickHouse integration** (``clickhouse-spark-runtime`` DataSourceV2):
+    no separate Python DDL client for table creation.
+    1. If ``BATCH_CLICKHOUSE_CATALOG`` is set (default ``batch_ch`` when the job registers
+       ``ClickHouseCatalog`` in ``job_runner.create_spark_session``): ``DataFrame.writeTo`` with
+       ``tableProperty("engine", "MergeTree()")``, ``order_by``, and
+       ``tableProperty("settings.allow_nullable_key", "1")``, then ``createOrReplace()`` or
+       ``append()``.
+    2. Otherwise (or on catalog failure): ``format("clickhouse")`` with the same connection
+       options, then JDBC as last resort.
-    Fallback: legacy ``format("clickhouse")`` (older stacks), then JDBC (may fail on
-    ClickHouse 25+ auto-DDL without ORDER BY).
+    Pin ``clickhouse-spark-runtime-3.5_2.12`` **0.10.0+** on the Spark classpath (see
+    ``analytics_runner`` Dockerfile ``CLICKHOUSE_SPARK_RUNTIME_VERSION``) for ClickHouse **25.x**
+    servers.
-    **MergeTree ORDER BY** (required on ClickHouse 25+): set ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``,
-    or rely on ``BATCH_DEDUP_COLUMNS`` / ``dedup_columns`` (comma-separated keys; no extra outer parens).
+    **MergeTree ORDER BY**: ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``, or dedup columns, or first
+    column. Do not add an extra outer ``(...)`` around ``order_by`` (the connector wraps it).
-    **Nullable sort keys**: (1) Dedup-derived ``ORDER BY`` uses ``assumeNotNull(`col`)`` when
-    Spark marks the field nullable. (2) The Spark connector often **does not** pass
-    ``tableProperty("settings.allow_nullable_key")`` into DDL, so we also set
-    ``engine`` to ``MergeTree() SETTINGS allow_nullable_key = 1``, which ClickHouse
-    applies to the created table.
+    **Nullable keys**: ``assumeNotNull(`col`)`` for nullable Spark columns in the sort key, plus
+    ``settings.allow_nullable_key`` when needed.
-    Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
+    Write mode from ``BATCH_STAGING_WRITE_MODE`` (default overwrite).
     """
     n = df.count()
     mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
@@ -300,13 +337,13 @@ def stage_to_clickhouse(
                 mode,
                 order_by,
             )
-            # MergeTree SETTINGS in ENGINE: connector often ignores tableProperty("settings.*")
-            # for CREATE TABLE; CH 25 then reports allow_nullable_key disabled.
-            _mt_engine = "MergeTree() SETTINGS allow_nullable_key = 1"
+            # Plain MergeTree() only — SETTINGS belong in tableProperty("settings.*", ...) so the
+            # connector emits them after ORDER BY; inline SETTINGS in ENGINE breaks CH 25.5 parsing.
             w2 = (
                 df.writeTo(full_name)
-                .tableProperty("engine", _mt_engine)
+                .tableProperty("engine", "MergeTree()")
                 .tableProperty("order_by", order_by)
+                .tableProperty("settings.allow_nullable_key", "1")
             )
             if mode == "overwrite":
                 w2.createOrReplace()

{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.3.23
+Version: 0.3.26
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
 Author: Litewave Analytics Team
 License: MIT