batch-analytics 0.3.24__tar.gz → 0.3.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/analytics/t_test.py +4 -1
  4. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/config.py +2 -0
  5. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/transform.py +35 -2
  6. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  7. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/README.md +0 -0
  8. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/setup.cfg +0 -0
  9. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/__init__.py +0 -0
  10. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/__main__.py +0 -0
  11. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/analytics/__init__.py +0 -0
  12. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/analytics/correlation.py +0 -0
  13. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
  14. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
  15. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/analytics/linear_regression.py +0 -0
  16. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  17. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/extract.py +0 -0
  18. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/job_runner.py +0 -0
  19. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/log.py +0 -0
  20. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/modules.py +0 -0
  21. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/output/__init__.py +0 -0
  22. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/output/base.py +0 -0
  23. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/output/clickhouse.py +0 -0
  24. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/output/local.py +0 -0
  25. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/output/s3.py +0 -0
  26. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/utils/__init__.py +0 -0
  27. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  28. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  29. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  30. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  31. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/requires.txt +0 -0
  32. {batch_analytics-0.3.24 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.24
3
+ Version: 0.3.26
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.24"
7
+ version = "0.3.26"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -146,7 +146,10 @@ def _run_one_way_anova(
146
146
  N = sum(g["n"] for g in groups)
147
147
  if N <= k:
148
148
  raise ValueError(
149
- f"ANOVA needs more observations than groups (N={N}, k={k})"
149
+ f"ANOVA needs total observations N > number of groups k (got N={N}, k={k}). "
150
+ "Common cause: exactly one row per group (e.g. one raw-material charge per batch per material), "
151
+ "so there is no within-group residual. Use data with replicates per group, a different group column, "
152
+ "or compare batches using a table with many rows per batch (e.g. operations or equipment_usage)."
150
153
  )
151
154
 
152
155
  grand_mean = sum(g["n"] * g["mean"] for g in groups) / N
@@ -92,6 +92,8 @@ class TransformConfig:
92
92
  add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
93
93
  # Legacy: no longer used; output column names match JSON keys (e.g. anchor_id). Kept for env compatibility.
94
94
  anchor_id_column: str = os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
95
+ # JSON object: {"new_col": "Spark SQL expression"} applied after KV expansion, before dedupe.
96
+ expr_columns_json: str = os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
95
97
 
96
98
 
97
99
  @dataclass
@@ -10,7 +10,7 @@ import re
10
10
  from typing import Any, Dict, List, Optional, Sequence, Set
11
11
 
12
12
  from pyspark.sql import DataFrame, SparkSession
13
- from pyspark.sql.functions import col, explode, map_keys, udf
13
+ from pyspark.sql.functions import col, explode, expr as spark_expr, map_keys, udf
14
14
  from pyspark.sql.types import MapType, StringType
15
15
 
16
16
  from .config import BatchAnalyticsConfig
@@ -165,16 +165,49 @@ def remove_duplicates(
165
165
  return df_cleaned
166
166
 
167
167
 
168
+ def apply_spark_expr_columns(
169
+ df: DataFrame,
170
+ config: BatchAnalyticsConfig,
171
+ ) -> DataFrame:
172
+ """
173
+ Add or replace columns from Spark SQL expressions (``BATCH_TRANSFORM_EXPR_COLUMNS``).
174
+
175
+ Value must be a JSON object mapping **output column name** → **expression** (same dialect as
176
+ ``selectExpr``), e.g. ``{"y": "cast(duration_minutes as double)", "x": "cast(regexp_extract(operation_id, '([0-9]+)$', 1) as double)"}`` (Java regex; prefer trailing digits to avoid hyphen/range issues in patterns like ``ETC-1-OP-…``).
177
+ """
178
+ raw = (config.transform.expr_columns_json or "").strip()
179
+ if not raw:
180
+ return df
181
+ try:
182
+ mapping = json.loads(raw)
183
+ except json.JSONDecodeError as e:
184
+ raise ValueError(
185
+ f"BATCH_TRANSFORM_EXPR_COLUMNS must be valid JSON object: {e}"
186
+ ) from e
187
+ if not isinstance(mapping, dict):
188
+ raise ValueError("BATCH_TRANSFORM_EXPR_COLUMNS must be a JSON object of column -> sql_expr")
189
+ out = df
190
+ for name, sql in mapping.items():
191
+ col_name = str(name).strip()
192
+ expr_sql = str(sql).strip()
193
+ if not col_name or not expr_sql:
194
+ continue
195
+ out = out.withColumn(col_name, spark_expr(expr_sql))
196
+ return out
197
+
198
+
168
199
  def transform(
169
200
  df: DataFrame,
170
201
  config: BatchAnalyticsConfig,
171
202
  ) -> DataFrame:
172
203
  """
173
204
  Apply transformation only: (1) expand JSON/KV blob column into one column per top-level key,
174
- (2) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
205
+ (2) optional Spark SQL expression columns (``BATCH_TRANSFORM_EXPR_COLUMNS``),
206
+ (3) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
175
207
  Does not write anywhere. Use stage_to_clickhouse() separately to persist.
176
208
  """
177
209
  transformed = expand_kv_blob_column(df, config)
210
+ transformed = apply_spark_expr_columns(transformed, config)
178
211
  dedup_cols = (
179
212
  [c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
180
213
  if config.transform.dedup_columns
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.24
3
+ Version: 0.3.26
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT