PyPI - batch-analytics - Version diff - 0.3.6.tar.gz → 0.3.13.tar.gz - Mend

batch-analytics 0.3.6.tar.gz → 0.3.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

{batch_analytics-0.3.6 → batch_analytics-0.3.13}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.3.6
+Version: 0.3.13
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
 Author: Litewave Analytics Team
 License: MIT

{batch_analytics-0.3.6 → batch_analytics-0.3.13}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "batch-analytics"
-version = "0.3.6"
+version = "0.3.13"
 description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
 readme = "README.md"
 requires-python = ">=3.8"

{batch_analytics-0.3.6 → batch_analytics-0.3.13}/src/batch_analytics/__init__.py RENAMED Viewed

@@ -13,8 +13,14 @@ Analytics modules:
 """
 from .config import BatchAnalyticsConfig, SparkK8sConfig
-from .extract import extract_all, extract_table, extract_unified
+from .extract import (
+    extract_all,
+    extract_table,
+    extract_unified,
+    parse_extract_filter_values,
+)
 from .transform import (
+    expand_kv_blob_column,
     extract_anchor_id,
     load_staged,
     remove_duplicates,
@@ -28,10 +34,12 @@ from .job_runner import run_pipeline, create_spark_session
 __all__ = [
     "BatchAnalyticsConfig",
     "SparkK8sConfig",
+    "expand_kv_blob_column",
     "extract_anchor_id",
     "extract_all",
     "extract_table",
     "extract_unified",
+    "parse_extract_filter_values",
     "remove_duplicates",
     "stage_to_clickhouse",
     "transform",

{batch_analytics-0.3.6 → batch_analytics-0.3.13}/src/batch_analytics/analytics/t_test.py RENAMED Viewed

@@ -1,9 +1,9 @@
 """
-Module 4: T-test to compare means of two sets of data.
+Module 4: T-test and one-way ANOVA for comparing means across groups.
 """
 import logging
-from typing import Any, Dict
+from typing import Any, Dict, List
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, avg, stddev, count
@@ -20,18 +20,17 @@ def run_t_test(
     config: BatchAnalyticsConfig,
 ) -> Dict[str, Any]:
     """
-    Perform an independent samples t-test to compare the means of two groups.
+    Compare means across groups or two numeric columns.
-    Supports two modes:
-    1. Value + group: one numeric column, one categorical column with 2 levels
-    2. Two columns: two numeric columns, compare their means
-    Uses Welch's t-test (does not assume equal variances).
+    Supports:
+    1. Value + group: one numeric column, one categorical column.
+       - **2 groups:** Welch's t-test (unequal variances).
+       - **3+ groups:** one-way ANOVA (F-test on equal means).
+    2. Two columns: two numeric columns, Welch t-test on their column means.
     Returns:
-        - group_a, group_b: names/summary of each group
-        - mean_a, mean_b, std_a, std_b, n_a, n_b
-        - t_statistic, p_value, difference (mean_a - mean_b)
+        For Welch: ``group_a``, ``group_b``, ``t_statistic``, ``p_value``, ``test`` = ``\"Welch\"``.
+        For ANOVA: ``groups``, ``f_statistic``, ``p_value``, ``test`` = ``\"one_way_anova\"``, SS/df.
     """
     value_col = (config.analytics.ttest_value_column or "").strip()
     group_col = (config.analytics.ttest_group_column or "").strip()
@@ -46,7 +45,7 @@ def run_t_test(
     if col_a and col_b and col_a in df.columns and col_b in df.columns:
         return _run_t_test_two_columns(df, col_a, col_b)
-    # Fallback: find first numeric + first string-like column with 2 distinct values
+    # Fallback: first numeric + string-like column with at least 2 distinct groups
     numeric_cols = [
         f.name for f in df.schema.fields
         if "double" in str(f.dataType).lower()
@@ -60,8 +59,10 @@ def run_t_test(
     for nc in numeric_cols:
         for sc in string_cols:
             distinct = df.select(sc).distinct().count()
-            if distinct == 2:
-                logger.info("Auto-selected t-test: value=%s, group=%s", nc, sc)
+            if distinct >= 2:
+                logger.info(
+                    "Auto-selected value=%s, group=%s (%d groups)", nc, sc, distinct
+                )
                 return _run_t_test_by_group(df, nc, sc)
     raise ValueError(
@@ -76,7 +77,7 @@ def _run_t_test_by_group(
     value_col: str,
     group_col: str,
 ) -> Dict[str, Any]:
-    """T-test: compare mean of value_col across two levels of group_col."""
+    """Compare mean of value_col across groups: Welch (2 groups) or one-way ANOVA (3+)."""
     df_num = df.select(
         col(value_col).cast(DoubleType()).alias("_val"),
         col(group_col).cast("string").alias("_grp"),
@@ -92,11 +93,15 @@ def _run_t_test_by_group(
         .collect()
     )
-    if len(stats) != 2:
+    k = len(stats)
+    if k < 2:
         raise ValueError(
-            f"T-test requires exactly 2 groups in {group_col}. Found: {[r['_grp'] for r in stats]}"
+            f"Need at least 2 groups in {group_col} for comparison. Found: {[r['_grp'] for r in stats]}"
         )
+    if k > 2:
+        return _run_one_way_anova(stats, value_col, group_col)
     r0, r1 = stats[0], stats[1]
     return _compute_t_test_result(
         group_a=r0["_grp"],
@@ -110,6 +115,98 @@ def _run_t_test_by_group(
     )
+def _run_one_way_anova(
+    stats_rows: List[Any],
+    value_col: str,
+    group_col: str,
+) -> Dict[str, Any]:
+    """
+    One-way ANOVA from per-group mean, sample stddev, and n (Spark ``stddev`` uses ddof=1).
+    SS_within = sum_i (n_i - 1) * s_i^2
+    SS_between = sum_i n_i * (mean_i - grand_mean)^2
+    """
+    try:
+        from scipy import stats as scipy_stats
+    except ImportError:
+        raise ImportError("ANOVA requires scipy. Install with: pip install scipy")
+    groups: List[Dict[str, Any]] = []
+    for r in stats_rows:
+        name = r["_grp"]
+        mean = float(r["mean"])
+        std_raw = r["std"]
+        std = 0.0 if std_raw is None else float(std_raw)
+        n = int(r["n"])
+        groups.append(
+            {"name": name, "mean": mean, "std": std, "n": n}
+        )
+    k = len(groups)
+    N = sum(g["n"] for g in groups)
+    if N <= k:
+        raise ValueError(
+            f"ANOVA needs more observations than groups (N={N}, k={k})"
+        )
+    grand_mean = sum(g["n"] * g["mean"] for g in groups) / N
+    ss_between = sum(g["n"] * (g["mean"] - grand_mean) ** 2 for g in groups)
+    ss_within = sum(
+        (g["n"] - 1) * (g["std"] ** 2) for g in groups if g["n"] > 1
+    )
+    df_between = k - 1
+    df_within = N - k
+    if df_within <= 0:
+        raise ValueError(
+            f"ANOVA: df_within must be positive (N={N}, k={k})"
+        )
+    ms_between = ss_between / df_between
+    ms_within = ss_within / df_within if df_within > 0 else 0.0
+    if ms_within <= 0.0:
+        if ms_between <= 0.0:
+            f_stat = 0.0
+            p_value = 1.0
+        else:
+            f_stat = float("inf")
+            p_value = 0.0
+    else:
+        f_stat = ms_between / ms_within
+        p_value = float(scipy_stats.f.sf(f_stat, df_between, df_within))
+    out: Dict[str, Any] = {
+        "test": "one_way_anova",
+        "value_column": value_col,
+        "group_column": group_col,
+        "k_groups": k,
+        "n_total": N,
+        "grand_mean": grand_mean,
+        "f_statistic": f_stat,
+        "p_value": p_value,
+        "df_between": df_between,
+        "df_within": df_within,
+        "ss_between": ss_between,
+        "ss_within": ss_within,
+        "ms_between": ms_between,
+        "ms_within": ms_within,
+        "groups": [
+            {
+                "name": g["name"],
+                "mean": g["mean"],
+                "std": g["std"],
+                "n": g["n"],
+            }
+            for g in sorted(groups, key=lambda x: str(x["name"]))
+        ],
+    }
+    if f_stat == float("inf"):
+        out["f_statistic"] = None
+        out["f_statistic_note"] = "infinite (MS_within == 0, reject equal means if SS_between > 0)"
+    return out
 def _run_t_test_two_columns(df: DataFrame, col_a: str, col_b: str) -> Dict[str, Any]:
     """T-test: compare means of two numeric columns."""
     df_num = df.select(

{batch_analytics-0.3.6 → batch_analytics-0.3.13}/src/batch_analytics/config.py RENAMED Viewed

@@ -55,13 +55,18 @@ class ExtractConfig:
     use_native_connector: bool = os.environ.get(
         "BATCH_USE_NATIVE_CONNECTOR", "false"
     ).lower() == "true"
+    # Optional WHERE col IN (...) after read. Empty filter_column = no filter (full table).
+    # filter_values: comma-separated list, or JSON array e.g. ["a","b"] for values containing commas.
+    filter_column: str = os.environ.get("BATCH_EXTRACT_FILTER_COLUMN", "").strip()
+    filter_values: str = os.environ.get("BATCH_EXTRACT_FILTER_VALUES", "").strip()
 @dataclass
 class TransformConfig:
     """Transform stage configuration."""
-    # Columns to use for deduplication (comma-separated); empty = use all columns
+    # Order: extract anchor_id from add_dimension(s) column, then dedupe by these keys.
+    # Deduplication keys (comma-separated). Empty = dropDuplicates() on full row (all columns).
     dedup_columns: str = os.environ.get("BATCH_DEDUP_COLUMNS", "")
     # Staging output path (local or S3)
     staging_path: str = os.environ.get(
@@ -73,8 +78,10 @@ class TransformConfig:
     staging_format: str = os.environ.get("BATCH_STAGING_FORMAT", "clickhouse")
     # Staging table name in ClickHouse (when format=clickhouse)
     staging_table: str = os.environ.get("BATCH_STAGING_TABLE", "analytics_staging")
-    # Extract anchor_id from add_dimension column (e.g. {'anchor_id':'GP/GPH(D)/II(W)/250019'})
+    # Source column holding a JSON object or Python dict string; every top-level key becomes a new String column
+    # (see transform.expand_kv_blob_column). Example: add_dimensions {'anchor_id':'...','lot':'A1'}
     add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
+    # Legacy: no longer used; output column names match JSON keys (e.g. anchor_id). Kept for env compatibility.
     anchor_id_column: str = os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")

{batch_analytics-0.3.6 → batch_analytics-0.3.13}/src/batch_analytics/extract.py RENAMED Viewed

@@ -2,17 +2,63 @@
 Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDBC.
 """
+import json
 import logging
-import os
 from typing import Dict, List, Optional
 from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.functions import col
 from .config import BatchAnalyticsConfig
 logger = logging.getLogger(__name__)
+def parse_extract_filter_values(raw: str) -> List[str]:
+    """
+    Parse BATCH_EXTRACT_FILTER_VALUES: comma-separated tokens, or JSON array string.
+    Examples:
+      a,b,c -> ["a","b","c"]
+      ["GP/A","GP/B"] -> JSON list (values may contain commas)
+    """
+    text = (raw or "").strip()
+    if not text:
+        return []
+    if text.startswith("["):
+        try:
+            data = json.loads(text)
+            if isinstance(data, list):
+                out = [str(x).strip() for x in data if str(x).strip()]
+                return out
+        except json.JSONDecodeError:
+            logger.warning("BATCH_EXTRACT_FILTER_VALUES looks like JSON but failed to parse; using comma split")
+    return [p.strip() for p in text.split(",") if p.strip()]
+def _apply_extract_filter(df: DataFrame, config: BatchAnalyticsConfig) -> DataFrame:
+    """Apply col IN (values) when filter_column is set; empty column = no filter."""
+    col_name = (config.extract.filter_column or "").strip()
+    if not col_name:
+        return df
+    if col_name not in df.columns:
+        logger.warning(
+            "BATCH_EXTRACT_FILTER_COLUMN=%r not in extracted columns %s; skipping filter",
+            col_name,
+            df.columns,
+        )
+        return df
+    values = parse_extract_filter_values(config.extract.filter_values)
+    if not values:
+        logger.warning(
+            "BATCH_EXTRACT_FILTER_COLUMN=%r set but BATCH_EXTRACT_FILTER_VALUES is empty; skipping IN filter",
+            col_name,
+        )
+        return df
+    filtered = df.filter(col(col_name).isin(values))
+    return filtered
 def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
     """
     Read from ClickHouse using the native format API (clickhouse-spark-runtime).
@@ -60,12 +106,13 @@ def extract_table(
     Uses native connector if configured, otherwise JDBC.
     """
     if config.extract.use_native_connector:
-        df = _read_via_catalog(spark, config, table)
+        df = _read_via_format(spark, config, table)
         if df is None:
             df = _read_via_jdbc(spark, config, table)
     else:
         df = _read_via_jdbc(spark, config, table)
+    df = _apply_extract_filter(df, config)
     logger.info("Extracted table %s: %d rows", table, df.count())
     return df

{batch_analytics-0.3.6 → batch_analytics-0.3.13}/src/batch_analytics/job_runner.py RENAMED Viewed

@@ -270,9 +270,9 @@ def run_pipeline(
     else:
         df_transformed = df_raw  # df_raw already loaded from staged when not run_extract
         if run_extract and run_analytics:
-            from .transform import extract_anchor_id, remove_duplicates
+            from .transform import expand_kv_blob_column, remove_duplicates
-            df_transformed = extract_anchor_id(df_raw, config)
+            df_transformed = expand_kv_blob_column(df_raw, config)
             dedup_cols = (
                 [c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
                 if config.transform.dedup_columns

{batch_analytics-0.3.6 → batch_analytics-0.3.13}/src/batch_analytics/transform.py RENAMED Viewed

@@ -1,42 +1,143 @@
 """
-Transform stage: Clean data (remove duplicates), extract add_dimension, and stage.
+Transform stage: Clean data (remove duplicates), expand JSON/KV blob column, and stage.
 """
+import ast
+import json
 import logging
 import os
-from typing import Optional, Sequence
+import re
+from typing import Any, Dict, List, Optional, Sequence, Set
 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import coalesce, col, get_json_object, regexp_extract
+from pyspark.sql.functions import col, explode, map_keys, udf
+from pyspark.sql.types import MapType, StringType
 from .config import BatchAnalyticsConfig
 logger = logging.getLogger(__name__)
-def extract_anchor_id(
+def _stringify_leaf(v: Any) -> str:
+    if v is None:
+        return ""
+    if isinstance(v, (dict, list)):
+        return json.dumps(v, separators=(",", ":"))
+    return str(v)
+def parse_blob_to_strmap(s: Any) -> Dict[str, str]:
+    """
+    Parse a cell value into a flat string map (top-level keys only).
+    Accepts standard JSON objects or Python repr dicts (e.g. single-quoted).
+    Non-dict / unparsable input yields an empty map.
+    """
+    if s is None:
+        return {}
+    text = str(s).strip()
+    if not text:
+        return {}
+    obj: Any = None
+    try:
+        obj = json.loads(text)
+    except json.JSONDecodeError:
+        pass
+    if obj is None:
+        try:
+            obj = ast.literal_eval(text)
+        except (ValueError, SyntaxError, MemoryError):
+            return {}
+    if not isinstance(obj, dict):
+        return {}
+    out: Dict[str, str] = {}
+    for k, v in obj.items():
+        key = str(k).strip()
+        if not key:
+            continue
+        out[key] = _stringify_leaf(v)
+    return out
+def _spark_safe_base_name(key: str) -> str:
+    """Sanitize JSON key to a usable Spark column name."""
+    s = re.sub(r"[^0-9a-zA-Z_]", "_", key.strip())
+    s = re.sub(r"_+", "_", s).strip("_")
+    if not s:
+        return "kv_key"
+    if s[0].isdigit():
+        s = "c_" + s
+    return s
+def _unique_column_name(base: str, used: Set[str]) -> str:
+    name = base
+    n = 1
+    while name in used:
+        n += 1
+        name = f"{base}_{n}"
+    used.add(name)
+    return name
+def expand_kv_blob_column(
     df: DataFrame,
     config: BatchAnalyticsConfig,
 ) -> DataFrame:
     """
-    Extract anchor_id from add_dimension column.
-    Supports JSON format {"anchor_id":"value"} or Python-dict {"anchor_id":"value"}.
-    Creates a new column (anchor_id by default) with the extracted value.
+    Parse the configured blob column into top-level key/value pairs and add one String column per key.
+    No per-key user configuration: every distinct key observed in the column (across the dataset)
+    becomes a column; values are strings (nested dict/list serialized as JSON). Empty / null cells
+    yield nulls in those columns.
+    Source column: ``config.transform.add_dimension_column`` (env ``BATCH_ADD_DIMENSION_COLUMN``).
     """
     col_name = config.transform.add_dimension_column
-    out_col = config.transform.anchor_id_column
     if col_name not in df.columns:
-        logger.debug("Column %s not found, skipping anchor_id extraction", col_name)
+        logger.debug("KV blob column %r not found, skipping expansion", col_name)
         return df
-    # Valid JSON: {"anchor_id":"GP/GPH(D)/II(W)/250019"}
-    json_extract = get_json_object(col(col_name), "$.anchor_id")
-    # Python-dict style: {'anchor_id':'GP/GPH(D)/II(W)/250019'}
-    regex_extract = regexp_extract(col(col_name), r"'anchor_id'\s*:\s*'([^']*)'", 1)
+    parse_udf = udf(parse_blob_to_strmap, MapType(StringType(), StringType()))
+    with_map = df.withColumn("_kv_blob_map", parse_udf(col(col_name)))
+    key_rows = (
+        with_map.select(explode(map_keys(col("_kv_blob_map"))).alias("_k"))
+        .where(col("_k").isNotNull())
+        .distinct()
+        .collect()
+    )
+    all_keys: List[str] = sorted({str(r._k).strip() for r in key_rows if r._k and str(r._k).strip()})
+    if not all_keys:
+        logger.info("No keys found in KV blob column %r; dropping temporary map only", col_name)
+        return with_map.drop("_kv_blob_map")
+    used: Set[str] = set(with_map.columns)
+    out = with_map
+    added: List[str] = []
+    for k in all_keys:
+        base = _spark_safe_base_name(k)
+        target = _unique_column_name(base, used)
+        added.append(target)
+        out = out.withColumn(target, col("_kv_blob_map").getItem(k))
+    out = out.drop("_kv_blob_map")
+    logger.info(
+        "Expanded KV blob column %r into %d columns: %s",
+        col_name,
+        len(added),
+        ", ".join(added),
+    )
+    return out
-    extracted = coalesce(json_extract, regex_extract)
-    return df.withColumn(out_col, extracted)
+def extract_anchor_id(
+    df: DataFrame,
+    config: BatchAnalyticsConfig,
+) -> DataFrame:
+    """Backward-compatible name: expands all keys from the blob column (not only ``anchor_id``)."""
+    return expand_kv_blob_column(df, config)
 def remove_duplicates(
@@ -69,10 +170,11 @@ def transform(
     config: BatchAnalyticsConfig,
 ) -> DataFrame:
     """
-    Apply transformation only: extract anchor_id, remove duplicates.
+    Apply transformation only: (1) expand JSON/KV blob column into one column per top-level key,
+    (2) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
     Does not write anywhere. Use stage_to_clickhouse() separately to persist.
     """
-    transformed = extract_anchor_id(df, config)
+    transformed = expand_kv_blob_column(df, config)
     dedup_cols = (
         [c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
         if config.transform.dedup_columns

{batch_analytics-0.3.6 → batch_analytics-0.3.13}/src/batch_analytics.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.3.6
+Version: 0.3.13
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
 Author: Litewave Analytics Team
 License: MIT