batch-analytics 0.3.26__tar.gz → 0.3.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/PKG-INFO +1 -1
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/pyproject.toml +1 -1
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/config.py +2 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/transform.py +93 -1
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/README.md +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/setup.cfg +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/utils/__init__.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.26"
|
|
7
|
+
version = "0.3.27"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -94,6 +94,8 @@ class TransformConfig:
|
|
|
94
94
|
anchor_id_column: str = os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
|
|
95
95
|
# JSON object: {"new_col": "Spark SQL expression"} applied after KV expansion, before dedupe.
|
|
96
96
|
expr_columns_json: str = os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
|
|
97
|
+
# JSON pivot spec (group_by, pivot_column, value_column, agg, optional pivot_values, column_name_prefix, rename_pivot_columns).
|
|
98
|
+
pivot_json: str = os.environ.get("BATCH_TRANSFORM_PIVOT_JSON", "").strip()
|
|
97
99
|
|
|
98
100
|
|
|
99
101
|
@dataclass
|
|
@@ -196,6 +196,96 @@ def apply_spark_expr_columns(
|
|
|
196
196
|
return out
|
|
197
197
|
|
|
198
198
|
|
|
199
|
+
def _pivot_output_column_name(prefix: str, pivot_value: str) -> str:
|
|
200
|
+
"""Stable identifier for pivoted columns (e.g. imp_rm_001 from RM-001)."""
|
|
201
|
+
p = (prefix or "v").rstrip("_")
|
|
202
|
+
body = str(pivot_value).replace("-", "_").replace(" ", "_").lower()
|
|
203
|
+
return f"{p}_{body}"
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def apply_pivot(
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    GroupBy + pivot + aggregate (``BATCH_TRANSFORM_PIVOT_JSON``).

    Example::

        {
          "group_by": "batch_id",
          "pivot_column": "material_id",
          "value_column": "total_impurities_pct",
          "agg": "max",
          "pivot_values": ["RM-001", "RM-002"],
          "column_name_prefix": "imp",
          "rename_pivot_columns": true
        }

    ``group_by`` (alias ``groupBy``) is either a comma-separated string or a JSON
    array of column names. ``agg`` is one of ``max`` (default), ``min``, ``sum``,
    ``avg``/``mean``, ``first``. ``column_name_prefix`` defaults to ``"v"`` and
    ``rename_pivot_columns`` defaults to true.

    If ``pivot_values`` is omitted, the distinct values of the pivot column are
    collected to the driver, sorted by that column's own ordering.

    Args:
        df: Input DataFrame.
        config: Configuration; only ``config.transform.pivot_json`` is read.

    Returns:
        ``df`` unchanged when no pivot spec is configured, otherwise the pivoted
        DataFrame (group-by columns followed by one column per pivot value).

    Raises:
        ValueError: If the spec is not a valid JSON object, a required key is
            missing, a referenced column is absent, ``agg`` is unsupported,
            ``pivot_values`` is not an array, or two renamed pivot columns
            would end up with the same name.
    """
    raw = (config.transform.pivot_json or "").strip()
    if not raw:
        return df
    try:
        spec = json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"BATCH_TRANSFORM_PIVOT_JSON must be valid JSON: {e}") from e
    if not isinstance(spec, dict):
        raise ValueError("BATCH_TRANSFORM_PIVOT_JSON must be a JSON object")

    group_by = spec.get("group_by") or spec.get("groupBy")
    # str(...) keeps a non-string JSON value from crashing on .strip().
    pivot_col = str(spec.get("pivot_column") or "").strip()
    value_col = str(spec.get("value_column") or "").strip()
    if not group_by or not pivot_col or not value_col:
        raise ValueError("pivot spec requires group_by, pivot_column, value_column")
    agg_name = str(spec.get("agg") or "max").strip().lower()
    prefix = str(spec.get("column_name_prefix") or "v").strip() or "v"
    rename_pivot = spec.get("rename_pivot_columns", True)

    # Accept a JSON array as well as the comma-separated string form;
    # stringifying a list and splitting it would yield names like "['a'".
    if isinstance(group_by, (list, tuple)):
        gcols = [str(c).strip() for c in group_by if str(c).strip()]
    else:
        gcols = [c.strip() for c in str(group_by).split(",") if c.strip()]
    if not gcols:
        raise ValueError("pivot spec requires group_by, pivot_column, value_column")

    available = df.columns
    for c in gcols + [pivot_col, value_col]:
        if c not in available:
            raise ValueError(f"pivot: column {c!r} not in dataframe; have {available}")

    # Map each accepted agg name to the pyspark.sql.functions attribute to use,
    # so the spec can be fully validated before any Spark work starts.
    agg_functions = {
        "max": "max",
        "min": "min",
        "sum": "sum",
        "avg": "avg",
        "mean": "avg",
        "first": "first",
    }
    agg_function_name = agg_functions.get(agg_name)
    if agg_function_name is None:
        raise ValueError(
            f"pivot agg must be one of max,min,sum,avg,mean,first; got {agg_name!r}"
        )

    pivot_values = spec.get("pivot_values")
    if pivot_values is not None and not isinstance(pivot_values, (list, tuple)):
        # A bare string would otherwise be iterated character by character.
        raise ValueError(
            f"pivot spec pivot_values must be a JSON array; got {pivot_values!r}"
        )

    from pyspark.sql import functions as F

    agg_expr = getattr(F, agg_function_name)(F.col(value_col))

    if pivot_values is not None:
        pv = [str(v) for v in pivot_values]
    else:
        # Values keep the pivot column's native type here (may be non-string or None).
        pv = [r[0] for r in df.select(pivot_col).distinct().sort(pivot_col).collect()]
        logger.info("pivot: inferred %d distinct %s values", len(pv), pivot_col)

    out = df.groupBy(*gcols).pivot(pivot_col, pv).agg(agg_expr)

    if not rename_pivot:
        return out

    # Text form of each pivot value; a null value is labelled "null".
    labels = ["null" if v is None else str(v) for v in pv]
    renamed = [_pivot_output_column_name(prefix, label) for label in labels]

    current = list(out.columns)
    if len(current) == len(gcols) + len(pv):
        # Expected layout: group-by columns first, then one column per pivot
        # value in the order given. Renaming by position also works when the
        # inferred values are not strings.
        final = current[: len(gcols)] + renamed
    else:
        # Unexpected layout: fall back to matching columns by their text label.
        by_label = dict(zip(labels, renamed))
        final = [by_label.get(c, c) for c in current]

    # Two pivot values can sanitise to the same identifier (e.g. "RM-001" and
    # "rm 001"); fail loudly instead of returning duplicate column names.
    clashes = sorted({name for name in renamed if final.count(name) > 1})
    if clashes:
        raise ValueError(
            f"pivot: renamed pivot columns are not unique: {clashes}; "
            "adjust pivot_values/column_name_prefix or set rename_pivot_columns to false"
        )

    if final == current:
        return out
    # Rename every column in one positional pass.
    return out.toDF(*final)
|
|
287
|
+
|
|
288
|
+
|
|
199
289
|
def transform(
|
|
200
290
|
df: DataFrame,
|
|
201
291
|
config: BatchAnalyticsConfig,
|
|
@@ -203,11 +293,13 @@ def transform(
|
|
|
203
293
|
"""
|
|
204
294
|
Apply transformation only: (1) expand JSON/KV blob column into one column per top-level key,
|
|
205
295
|
(2) optional Spark SQL expression columns (``BATCH_TRANSFORM_EXPR_COLUMNS``),
|
|
206
|
-
(3) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
|
|
296
|
+
(3) optional groupBy+pivot (``BATCH_TRANSFORM_PIVOT_JSON``),
|
|
297
|
+
(4) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
|
|
207
298
|
Does not write anywhere. Use stage_to_clickhouse() separately to persist.
|
|
208
299
|
"""
|
|
209
300
|
transformed = expand_kv_blob_column(df, config)
|
|
210
301
|
transformed = apply_spark_expr_columns(transformed, config)
|
|
302
|
+
transformed = apply_pivot(transformed, config)
|
|
211
303
|
dedup_cols = (
|
|
212
304
|
[c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
|
|
213
305
|
if config.transform.dedup_columns
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.27}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|