batch-analytics 0.3.23__tar.gz → 0.3.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/t_test.py +4 -1
  4. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/config.py +2 -0
  5. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/job_runner.py +1 -1
  6. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/transform.py +60 -23
  7. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  8. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/README.md +0 -0
  9. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/setup.cfg +0 -0
  10. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/__init__.py +0 -0
  11. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/__main__.py +0 -0
  12. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/__init__.py +0 -0
  13. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/correlation.py +0 -0
  14. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
  15. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
  16. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/linear_regression.py +0 -0
  17. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  18. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/extract.py +0 -0
  19. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/log.py +0 -0
  20. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/modules.py +0 -0
  21. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/output/__init__.py +0 -0
  22. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/output/base.py +0 -0
  23. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/output/clickhouse.py +0 -0
  24. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/output/local.py +0 -0
  25. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/output/s3.py +0 -0
  26. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/utils/__init__.py +0 -0
  27. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  28. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  29. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  30. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  31. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/requires.txt +0 -0
  32. {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.23
3
+ Version: 0.3.26
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.23"
7
+ version = "0.3.26"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -146,7 +146,10 @@ def _run_one_way_anova(
146
146
  N = sum(g["n"] for g in groups)
147
147
  if N <= k:
148
148
  raise ValueError(
149
- f"ANOVA needs more observations than groups (N={N}, k={k})"
149
+ f"ANOVA needs total observations N > number of groups k (got N={N}, k={k}). "
150
+ "Common cause: exactly one row per group (e.g. one raw-material charge per batch per material), "
151
+ "so there is no within-group residual. Use data with replicates per group, a different group column, "
152
+ "or compare batches using a table with many rows per batch (e.g. operations or equipment_usage)."
150
153
  )
151
154
 
152
155
  grand_mean = sum(g["n"] * g["mean"] for g in groups) / N
@@ -92,6 +92,8 @@ class TransformConfig:
92
92
  add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
93
93
  # Legacy: no longer used; output column names match JSON keys (e.g. anchor_id). Kept for env compatibility.
94
94
  anchor_id_column: str = os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
95
+ # JSON object: {"new_col": "Spark SQL expression"} applied after KV expansion, before dedupe.
96
+ expr_columns_json: str = os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
95
97
 
96
98
 
97
99
  @dataclass
@@ -235,7 +235,7 @@ def run_pipeline(
235
235
  # Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
236
236
  # spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
237
237
  # For ad-hoc runs without the image, set e.g.
238
- # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
238
+ # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.10.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
239
239
  _raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
240
240
  if _raw_ch is None or not _raw_ch.strip():
241
241
  ch_pkgs = None
@@ -10,7 +10,7 @@ import re
10
10
  from typing import Any, Dict, List, Optional, Sequence, Set
11
11
 
12
12
  from pyspark.sql import DataFrame, SparkSession
13
- from pyspark.sql.functions import col, explode, map_keys, udf
13
+ from pyspark.sql.functions import col, explode, expr as spark_expr, map_keys, udf
14
14
  from pyspark.sql.types import MapType, StringType
15
15
 
16
16
  from .config import BatchAnalyticsConfig
@@ -165,16 +165,49 @@ def remove_duplicates(
165
165
  return df_cleaned
166
166
 
167
167
 
168
+ def apply_spark_expr_columns(
169
+ df: DataFrame,
170
+ config: BatchAnalyticsConfig,
171
+ ) -> DataFrame:
172
+ """
173
+ Add or replace columns from Spark SQL expressions (``BATCH_TRANSFORM_EXPR_COLUMNS``).
174
+
175
+ Value must be a JSON object mapping **output column name** → **expression** (same dialect as
176
+ ``selectExpr``), e.g. ``{"y": "cast(duration_minutes as double)", "x": "cast(regexp_extract(operation_id, '([0-9]+)$', 1) as double)"}`` (Java regex; prefer trailing digits to avoid hyphen/range issues in patterns like ``ETC-1-OP-…``).
177
+ """
178
+ raw = (config.transform.expr_columns_json or "").strip()
179
+ if not raw:
180
+ return df
181
+ try:
182
+ mapping = json.loads(raw)
183
+ except json.JSONDecodeError as e:
184
+ raise ValueError(
185
+ f"BATCH_TRANSFORM_EXPR_COLUMNS must be valid JSON object: {e}"
186
+ ) from e
187
+ if not isinstance(mapping, dict):
188
+ raise ValueError("BATCH_TRANSFORM_EXPR_COLUMNS must be a JSON object of column -> sql_expr")
189
+ out = df
190
+ for name, sql in mapping.items():
191
+ col_name = str(name).strip()
192
+ expr_sql = str(sql).strip()
193
+ if not col_name or not expr_sql:
194
+ continue
195
+ out = out.withColumn(col_name, spark_expr(expr_sql))
196
+ return out
197
+
198
+
168
199
  def transform(
169
200
  df: DataFrame,
170
201
  config: BatchAnalyticsConfig,
171
202
  ) -> DataFrame:
172
203
  """
173
204
  Apply transformation only: (1) expand JSON/KV blob column into one column per top-level key,
174
- (2) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
205
+ (2) optional Spark SQL expression columns (``BATCH_TRANSFORM_EXPR_COLUMNS``),
206
+ (3) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
175
207
  Does not write anywhere. Use stage_to_clickhouse() separately to persist.
176
208
  """
177
209
  transformed = expand_kv_blob_column(df, config)
210
+ transformed = apply_spark_expr_columns(transformed, config)
178
211
  dedup_cols = (
179
212
  [c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
180
213
  if config.transform.dedup_columns
@@ -200,8 +233,9 @@ def _ch_quoted_ident(name: str) -> str:
200
233
  def _ch_order_by_key_expr(col: str, df: DataFrame) -> str:
201
234
  """
202
235
  Build one ORDER BY key expression. Spark nullable columns map to Nullable in CH; CH 25
203
- rejects nullable sort keys unless ``allow_nullable_key`` is applied — the Spark connector
204
- often does not forward ``tableProperty("settings.allow_nullable_key")`` into DDL.
236
+ rejects nullable sort keys unless ``allow_nullable_key`` is set (via
237
+ ``tableProperty("settings.allow_nullable_key", "1")`` in the catalog path) or keys are
238
+ wrapped with ``assumeNotNull``.
205
239
 
206
240
  Using ``assumeNotNull(col)`` yields a non-nullable key expression so CREATE TABLE succeeds.
207
241
  Only wrap when Spark marks the field nullable (``assumeNotNull`` requires Nullable input).
@@ -264,25 +298,28 @@ def stage_to_clickhouse(
264
298
  Write transformed data to ClickHouse staging table.
265
299
  Separate job from transform; must complete before analytics can run.
266
300
 
267
- Preferred path: Spark SQL **catalog** API (``DataFrame.writeTo``), matching
268
- ``job_runner.create_spark_session`` registration of ``ClickHouseCatalog``
269
- (``BATCH_CLICKHOUSE_CATALOG``, default ``batch_ch``). The clickhouse-spark-runtime
270
- 0.8.x connector does **not** register the legacy short name ``format("clickhouse")``
271
- / ``clickhouse.DefaultSource``.
301
+ **Only Spark’s native ClickHouse integration** (``clickhouse-spark-runtime`` DataSourceV2):
302
+ no separate Python DDL client for table creation.
303
+
304
+ 1. If ``BATCH_CLICKHOUSE_CATALOG`` is set (default ``batch_ch`` when the job registers
305
+ ``ClickHouseCatalog`` in ``job_runner.create_spark_session``): ``DataFrame.writeTo`` with
306
+ ``tableProperty("engine", "MergeTree()")``, ``order_by``, and
307
+ ``tableProperty("settings.allow_nullable_key", "1")``, then ``createOrReplace()`` or
308
+ ``append()``.
309
+ 2. Otherwise (or on catalog failure): ``format("clickhouse")`` with the same connection
310
+ options, then JDBC as last resort.
272
311
 
273
- Fallback: legacy ``format("clickhouse")`` (older stacks), then JDBC (may fail on
274
- ClickHouse 25+ auto-DDL without ORDER BY).
312
+ Pin ``clickhouse-spark-runtime-3.5_2.12`` **0.10.0+** on the Spark classpath (see
313
+ ``analytics_runner`` Dockerfile ``CLICKHOUSE_SPARK_RUNTIME_VERSION``) for ClickHouse **25.x**
314
+ servers.
275
315
 
276
- **MergeTree ORDER BY** (required on ClickHouse 25+): set ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``,
277
- or rely on ``BATCH_DEDUP_COLUMNS`` / ``dedup_columns`` (comma-separated keys; no extra outer parens).
316
+ **MergeTree ORDER BY**: ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``, or dedup columns, or first
317
+ column. Do not add an extra outer ``(...)`` around ``order_by`` (the connector wraps it).
278
318
 
279
- **Nullable sort keys**: (1) Dedup-derived ``ORDER BY`` uses ``assumeNotNull(`col`)`` when
280
- Spark marks the field nullable. (2) The Spark connector often **does not** pass
281
- ``tableProperty("settings.allow_nullable_key")`` into DDL, so we also set
282
- ``engine`` to ``MergeTree() SETTINGS allow_nullable_key = 1``, which ClickHouse
283
- applies to the created table.
319
+ **Nullable keys**: ``assumeNotNull(`col`)`` for nullable Spark columns in the sort key, plus
320
+ ``settings.allow_nullable_key`` when needed.
284
321
 
285
- Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
322
+ Write mode from ``BATCH_STAGING_WRITE_MODE`` (default overwrite).
286
323
  """
287
324
  n = df.count()
288
325
  mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
@@ -300,13 +337,13 @@ def stage_to_clickhouse(
300
337
  mode,
301
338
  order_by,
302
339
  )
303
- # MergeTree SETTINGS in ENGINE: connector often ignores tableProperty("settings.*")
304
- # for CREATE TABLE; CH 25 then reports allow_nullable_key disabled.
305
- _mt_engine = "MergeTree() SETTINGS allow_nullable_key = 1"
340
+ # Plain MergeTree() only SETTINGS belong in tableProperty("settings.*", ...) so the
341
+ # connector emits them after ORDER BY; inline SETTINGS in ENGINE breaks CH 25.5 parsing.
306
342
  w2 = (
307
343
  df.writeTo(full_name)
308
- .tableProperty("engine", _mt_engine)
344
+ .tableProperty("engine", "MergeTree()")
309
345
  .tableProperty("order_by", order_by)
346
+ .tableProperty("settings.allow_nullable_key", "1")
310
347
  )
311
348
  if mode == "overwrite":
312
349
  w2.createOrReplace()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.23
3
+ Version: 0.3.26
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT