batch-analytics 0.3.24__tar.gz → 0.3.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/PKG-INFO +1 -1
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/pyproject.toml +1 -1
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/analytics/t_test.py +4 -1
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/config.py +4 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/transform.py +127 -2
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/README.md +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/setup.cfg +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/utils/__init__.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.24"
|
|
7
|
+
version = "0.3.27"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -146,7 +146,10 @@ def _run_one_way_anova(
|
|
|
146
146
|
N = sum(g["n"] for g in groups)
|
|
147
147
|
if N <= k:
|
|
148
148
|
raise ValueError(
|
|
149
|
-
f"ANOVA needs
|
|
149
|
+
f"ANOVA needs total observations N > number of groups k (got N={N}, k={k}). "
|
|
150
|
+
"Common cause: exactly one row per group (e.g. one raw-material charge per batch per material), "
|
|
151
|
+
"so there is no within-group residual. Use data with replicates per group, a different group column, "
|
|
152
|
+
"or compare batches using a table with many rows per batch (e.g. operations or equipment_usage)."
|
|
150
153
|
)
|
|
151
154
|
|
|
152
155
|
grand_mean = sum(g["n"] * g["mean"] for g in groups) / N
|
|
@@ -92,6 +92,10 @@ class TransformConfig:
|
|
|
92
92
|
add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
|
|
93
93
|
# Legacy: no longer used; output column names match JSON keys (e.g. anchor_id). Kept for env compatibility.
|
|
94
94
|
anchor_id_column: str = os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
|
|
95
|
+
# JSON object: {"new_col": "Spark SQL expression"} applied after KV expansion, before dedupe.
|
|
96
|
+
expr_columns_json: str = os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
|
|
97
|
+
# JSON pivot spec (group_by, pivot_column, value_column, agg, optional pivot_values, column_name_prefix).
|
|
98
|
+
pivot_json: str = os.environ.get("BATCH_TRANSFORM_PIVOT_JSON", "").strip()
|
|
95
99
|
|
|
96
100
|
|
|
97
101
|
@dataclass
|
|
@@ -10,7 +10,7 @@ import re
|
|
|
10
10
|
from typing import Any, Dict, List, Optional, Sequence, Set
|
|
11
11
|
|
|
12
12
|
from pyspark.sql import DataFrame, SparkSession
|
|
13
|
-
from pyspark.sql.functions import col, explode, map_keys, udf
|
|
13
|
+
from pyspark.sql.functions import col, explode, expr as spark_expr, map_keys, udf
|
|
14
14
|
from pyspark.sql.types import MapType, StringType
|
|
15
15
|
|
|
16
16
|
from .config import BatchAnalyticsConfig
|
|
@@ -165,16 +165,141 @@ def remove_duplicates(
|
|
|
165
165
|
return df_cleaned
|
|
166
166
|
|
|
167
167
|
|
|
168
|
+
def apply_spark_expr_columns(
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    Add or replace columns from Spark SQL expressions (``BATCH_TRANSFORM_EXPR_COLUMNS``).

    Value must be a JSON object mapping **output column name** → **expression** (same dialect as
    ``selectExpr``), e.g. ``{"y": "cast(duration_minutes as double)", "x": "cast(regexp_extract(operation_id, '([0-9]+)$', 1) as double)"}`` (Java regex; prefer trailing digits to avoid hyphen/range issues in patterns like ``ETC-1-OP-…``).

    Entries are applied one ``withColumn`` at a time in the order they appear in the JSON
    object, so a later expression may refer to a column produced by an earlier one.
    Entries whose name or expression is empty, whitespace-only, or JSON ``null`` are skipped.

    Returns ``df`` unchanged when the setting is empty.

    Raises:
        ValueError: if the setting is not valid JSON or is not a JSON object.
    """
    raw = (config.transform.expr_columns_json or "").strip()
    if not raw:
        return df
    try:
        mapping = json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"BATCH_TRANSFORM_EXPR_COLUMNS must be valid JSON object: {e}"
        ) from e
    if not isinstance(mapping, dict):
        raise ValueError("BATCH_TRANSFORM_EXPR_COLUMNS must be a JSON object of column -> sql_expr")

    out = df
    for name, sql in mapping.items():
        col_name = str(name).strip()
        # JSON null arrives as None; str(None) would become the SQL text "None",
        # which Spark would try to resolve as a column. Treat it as "no expression".
        expr_sql = "" if sql is None else str(sql).strip()
        if not col_name or not expr_sql:
            continue
        out = out.withColumn(col_name, spark_expr(expr_sql))
    return out
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _pivot_output_column_name(prefix: str, pivot_value: str) -> str:
|
|
200
|
+
"""Stable identifier for pivoted columns (e.g. imp_rm_001 from RM-001)."""
|
|
201
|
+
p = (prefix or "v").rstrip("_")
|
|
202
|
+
body = str(pivot_value).replace("-", "_").replace(" ", "_").lower()
|
|
203
|
+
return f"{p}_{body}"
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def apply_pivot(
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    GroupBy + pivot + aggregate (``BATCH_TRANSFORM_PIVOT_JSON``).

    Example::

        {
          "group_by": "batch_id",
          "pivot_column": "material_id",
          "value_column": "total_impurities_pct",
          "agg": "max",
          "pivot_values": ["RM-001", "RM-002"],
          "column_name_prefix": "imp",
          "rename_pivot_columns": true
        }

    ``group_by`` (alias ``groupBy``) may be a comma-separated string or a JSON array of
    column names. ``agg`` is one of ``max`` (default), ``min``, ``sum``, ``avg``/``mean``,
    ``first``. If ``pivot_values`` is omitted, distinct values are collected from the
    dataframe, sorted by the pivot column. Unless ``rename_pivot_columns`` is false, each
    pivoted column is renamed with ``_pivot_output_column_name`` and ``column_name_prefix``
    (default ``"v"``).

    Returns ``df`` unchanged when the setting is empty.

    Raises:
        ValueError: if the setting is not a valid JSON object, a required key is missing,
            a referenced column is absent from ``df``, ``agg`` is unsupported,
            ``pivot_values`` is not a JSON array, or two pivot values would be renamed
            to the same output column.
    """
    raw = (config.transform.pivot_json or "").strip()
    if not raw:
        return df
    try:
        spec = json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"BATCH_TRANSFORM_PIVOT_JSON must be valid JSON: {e}") from e
    if not isinstance(spec, dict):
        raise ValueError("BATCH_TRANSFORM_PIVOT_JSON must be a JSON object")

    group_by = spec.get("group_by") or spec.get("groupBy")
    # str() guards against non-string JSON scalars, which have no .strip().
    pivot_col = str(spec.get("pivot_column") or "").strip()
    value_col = str(spec.get("value_column") or "").strip()
    if isinstance(group_by, (list, tuple)):
        # A JSON array must not go through str(): that would yield names like "['batch_id'".
        gcols = [str(c).strip() for c in group_by if str(c).strip()]
    else:
        gcols = [c.strip() for c in str(group_by or "").split(",") if c.strip()]
    if not gcols or not pivot_col or not value_col:
        raise ValueError("pivot spec requires group_by, pivot_column, value_column")
    agg_name = str(spec.get("agg") or "max").strip().lower()
    prefix = str(spec.get("column_name_prefix") or "v").strip() or "v"
    rename_pivot = spec.get("rename_pivot_columns", True)

    for c in gcols + [pivot_col, value_col]:
        if c not in df.columns:
            raise ValueError(f"pivot: column {c!r} not in dataframe; have {df.columns}")

    # Accepted agg name -> name of the pyspark.sql.functions aggregate to call.
    agg_functions = {
        "max": "max",
        "min": "min",
        "sum": "sum",
        "avg": "avg",
        "mean": "avg",
        "first": "first",
    }
    if agg_name not in agg_functions:
        raise ValueError(
            f"pivot agg must be one of max,min,sum,avg,mean,first; got {agg_name!r}"
        )

    pivot_values = spec.get("pivot_values")
    if pivot_values is not None and not isinstance(pivot_values, list):
        # A bare string would otherwise be iterated character by character.
        raise ValueError(
            f"pivot_values must be a JSON array of values; got {pivot_values!r}"
        )

    # Imported only after the spec is validated, so bad configuration fails
    # before any Spark work is started.
    from pyspark.sql import functions as F

    agg_expr = getattr(F, agg_functions[agg_name])(F.col(value_col))

    pv: List[Any]
    if pivot_values is not None:
        pv = [str(v) for v in pivot_values]
    else:
        # Inferred values keep their native type (they may be ints or None).
        pv = [r[0] for r in df.select(pivot_col).distinct().sort(pivot_col).collect()]
        logger.info("pivot: inferred %d distinct %s values", len(pv), pivot_col)

    out = df.groupBy(*gcols).pivot(pivot_col, pv).agg(agg_expr)

    if rename_pivot:
        current = set(out.columns)
        for v in pv:
            # Pivot output columns are named by the value's string form, so compare as text.
            # NOTE(review): a null pivot value is assumed to surface as "null" — confirm.
            old = "null" if v is None else str(v)
            if old not in current:
                continue
            new_name = _pivot_output_column_name(prefix, old)
            if new_name == old:
                continue
            if new_name in current:
                raise ValueError(
                    f"pivot: renaming {old!r} to {new_name!r} collides with an existing column"
                )
            out = out.withColumnRenamed(old, new_name)
            current.discard(old)
            current.add(new_name)
    return out
|
|
287
|
+
|
|
288
|
+
|
|
168
289
|
def transform(
|
|
169
290
|
df: DataFrame,
|
|
170
291
|
config: BatchAnalyticsConfig,
|
|
171
292
|
) -> DataFrame:
|
|
172
293
|
"""
|
|
173
294
|
Apply transformation only: (1) expand JSON/KV blob column into one column per top-level key,
|
|
174
|
-
(2)
|
|
295
|
+
(2) optional Spark SQL expression columns (``BATCH_TRANSFORM_EXPR_COLUMNS``),
|
|
296
|
+
(3) optional groupBy+pivot (``BATCH_TRANSFORM_PIVOT_JSON``),
|
|
297
|
+
(4) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
|
|
175
298
|
Does not write anywhere. Use stage_to_clickhouse() separately to persist.
|
|
176
299
|
"""
|
|
177
300
|
transformed = expand_kv_blob_column(df, config)
|
|
301
|
+
transformed = apply_spark_expr_columns(transformed, config)
|
|
302
|
+
transformed = apply_pivot(transformed, config)
|
|
178
303
|
dedup_cols = (
|
|
179
304
|
[c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
|
|
180
305
|
if config.transform.dedup_columns
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.24 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|