batch-analytics 0.3.23__tar.gz → 0.3.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/PKG-INFO +1 -1
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/pyproject.toml +1 -1
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/t_test.py +4 -1
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/config.py +2 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/job_runner.py +1 -1
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/transform.py +60 -23
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/README.md +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/setup.cfg +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/utils/__init__.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.23"
|
|
7
|
+
version = "0.3.26"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -146,7 +146,10 @@ def _run_one_way_anova(
|
|
|
146
146
|
N = sum(g["n"] for g in groups)
|
|
147
147
|
if N <= k:
|
|
148
148
|
raise ValueError(
|
|
149
|
-
f"ANOVA needs
|
|
149
|
+
f"ANOVA needs total observations N > number of groups k (got N={N}, k={k}). "
|
|
150
|
+
"Common cause: exactly one row per group (e.g. one raw-material charge per batch per material), "
|
|
151
|
+
"so there is no within-group residual. Use data with replicates per group, a different group column, "
|
|
152
|
+
"or compare batches using a table with many rows per batch (e.g. operations or equipment_usage)."
|
|
150
153
|
)
|
|
151
154
|
|
|
152
155
|
grand_mean = sum(g["n"] * g["mean"] for g in groups) / N
|
|
@@ -92,6 +92,8 @@ class TransformConfig:
|
|
|
92
92
|
add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
|
|
93
93
|
# Legacy: no longer used; output column names match JSON keys (e.g. anchor_id). Kept for env compatibility.
|
|
94
94
|
anchor_id_column: str = os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
|
|
95
|
+
# JSON object: {"new_col": "Spark SQL expression"} applied after KV expansion, before dedupe.
|
|
96
|
+
expr_columns_json: str = os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
|
|
95
97
|
|
|
96
98
|
|
|
97
99
|
@dataclass
|
|
@@ -235,7 +235,7 @@ def run_pipeline(
|
|
|
235
235
|
# Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
|
|
236
236
|
# spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
|
|
237
237
|
# For ad-hoc runs without the image, set e.g.
|
|
238
|
-
# BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.
|
|
238
|
+
# BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.10.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
|
|
239
239
|
_raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
|
|
240
240
|
if _raw_ch is None or not _raw_ch.strip():
|
|
241
241
|
ch_pkgs = None
|
|
@@ -10,7 +10,7 @@ import re
|
|
|
10
10
|
from typing import Any, Dict, List, Optional, Sequence, Set
|
|
11
11
|
|
|
12
12
|
from pyspark.sql import DataFrame, SparkSession
|
|
13
|
-
from pyspark.sql.functions import col, explode, map_keys, udf
|
|
13
|
+
from pyspark.sql.functions import col, explode, expr as spark_expr, map_keys, udf
|
|
14
14
|
from pyspark.sql.types import MapType, StringType
|
|
15
15
|
|
|
16
16
|
from .config import BatchAnalyticsConfig
|
|
@@ -165,16 +165,49 @@ def remove_duplicates(
|
|
|
165
165
|
return df_cleaned
|
|
166
166
|
|
|
167
167
|
|
|
168
|
+
def apply_spark_expr_columns(
|
|
169
|
+
df: DataFrame,
|
|
170
|
+
config: BatchAnalyticsConfig,
|
|
171
|
+
) -> DataFrame:
|
|
172
|
+
"""
|
|
173
|
+
Add or replace columns from Spark SQL expressions (``BATCH_TRANSFORM_EXPR_COLUMNS``).
|
|
174
|
+
|
|
175
|
+
Value must be a JSON object mapping **output column name** → **expression** (same dialect as
|
|
176
|
+
``selectExpr``), e.g. ``{"y": "cast(duration_minutes as double)", "x": "cast(regexp_extract(operation_id, '([0-9]+)$', 1) as double)"}`` (Java regex; prefer trailing digits to avoid hyphen/range issues in patterns like ``ETC-1-OP-…``).
|
|
177
|
+
"""
|
|
178
|
+
raw = (config.transform.expr_columns_json or "").strip()
|
|
179
|
+
if not raw:
|
|
180
|
+
return df
|
|
181
|
+
try:
|
|
182
|
+
mapping = json.loads(raw)
|
|
183
|
+
except json.JSONDecodeError as e:
|
|
184
|
+
raise ValueError(
|
|
185
|
+
f"BATCH_TRANSFORM_EXPR_COLUMNS must be valid JSON object: {e}"
|
|
186
|
+
) from e
|
|
187
|
+
if not isinstance(mapping, dict):
|
|
188
|
+
raise ValueError("BATCH_TRANSFORM_EXPR_COLUMNS must be a JSON object of column -> sql_expr")
|
|
189
|
+
out = df
|
|
190
|
+
for name, sql in mapping.items():
|
|
191
|
+
col_name = str(name).strip()
|
|
192
|
+
expr_sql = str(sql).strip()
|
|
193
|
+
if not col_name or not expr_sql:
|
|
194
|
+
continue
|
|
195
|
+
out = out.withColumn(col_name, spark_expr(expr_sql))
|
|
196
|
+
return out
|
|
197
|
+
|
|
198
|
+
|
|
168
199
|
def transform(
|
|
169
200
|
df: DataFrame,
|
|
170
201
|
config: BatchAnalyticsConfig,
|
|
171
202
|
) -> DataFrame:
|
|
172
203
|
"""
|
|
173
204
|
Apply transformation only: (1) expand JSON/KV blob column into one column per top-level key,
|
|
174
|
-
(2)
|
|
205
|
+
(2) optional Spark SQL expression columns (``BATCH_TRANSFORM_EXPR_COLUMNS``),
|
|
206
|
+
(3) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
|
|
175
207
|
Does not write anywhere. Use stage_to_clickhouse() separately to persist.
|
|
176
208
|
"""
|
|
177
209
|
transformed = expand_kv_blob_column(df, config)
|
|
210
|
+
transformed = apply_spark_expr_columns(transformed, config)
|
|
178
211
|
dedup_cols = (
|
|
179
212
|
[c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
|
|
180
213
|
if config.transform.dedup_columns
|
|
@@ -200,8 +233,9 @@ def _ch_quoted_ident(name: str) -> str:
|
|
|
200
233
|
def _ch_order_by_key_expr(col: str, df: DataFrame) -> str:
|
|
201
234
|
"""
|
|
202
235
|
Build one ORDER BY key expression. Spark nullable columns map to Nullable in CH; CH 25
|
|
203
|
-
rejects nullable sort keys unless ``allow_nullable_key`` is
|
|
204
|
-
|
|
236
|
+
rejects nullable sort keys unless ``allow_nullable_key`` is set (via
|
|
237
|
+
``tableProperty("settings.allow_nullable_key", "1")`` in the catalog path) or keys are
|
|
238
|
+
wrapped with ``assumeNotNull``.
|
|
205
239
|
|
|
206
240
|
Using ``assumeNotNull(col)`` yields a non-nullable key expression so CREATE TABLE succeeds.
|
|
207
241
|
Only wrap when Spark marks the field nullable (``assumeNotNull`` requires Nullable input).
|
|
@@ -264,25 +298,28 @@ def stage_to_clickhouse(
|
|
|
264
298
|
Write transformed data to ClickHouse staging table.
|
|
265
299
|
Separate job from transform; must complete before analytics can run.
|
|
266
300
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
301
|
+
**Only Spark’s native ClickHouse integration** (``clickhouse-spark-runtime`` DataSourceV2):
|
|
302
|
+
no separate Python DDL client for table creation.
|
|
303
|
+
|
|
304
|
+
1. If ``BATCH_CLICKHOUSE_CATALOG`` is set (default ``batch_ch`` when the job registers
|
|
305
|
+
``ClickHouseCatalog`` in ``job_runner.create_spark_session``): ``DataFrame.writeTo`` with
|
|
306
|
+
``tableProperty("engine", "MergeTree()")``, ``order_by``, and
|
|
307
|
+
``tableProperty("settings.allow_nullable_key", "1")``, then ``createOrReplace()`` or
|
|
308
|
+
``append()``.
|
|
309
|
+
2. Otherwise (or on catalog failure): ``format("clickhouse")`` with the same connection
|
|
310
|
+
options, then JDBC as last resort.
|
|
272
311
|
|
|
273
|
-
|
|
274
|
-
|
|
312
|
+
Pin ``clickhouse-spark-runtime-3.5_2.12`` **0.10.0+** on the Spark classpath (see
|
|
313
|
+
``analytics_runner`` Dockerfile ``CLICKHOUSE_SPARK_RUNTIME_VERSION``) for ClickHouse **25.x**
|
|
314
|
+
servers.
|
|
275
315
|
|
|
276
|
-
**MergeTree ORDER BY
|
|
277
|
-
|
|
316
|
+
**MergeTree ORDER BY**: ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``, or dedup columns, or first
|
|
317
|
+
column. Do not add an extra outer ``(...)`` around ``order_by`` (the connector wraps it).
|
|
278
318
|
|
|
279
|
-
**Nullable
|
|
280
|
-
|
|
281
|
-
``tableProperty("settings.allow_nullable_key")`` into DDL, so we also set
|
|
282
|
-
``engine`` to ``MergeTree() SETTINGS allow_nullable_key = 1``, which ClickHouse
|
|
283
|
-
applies to the created table.
|
|
319
|
+
**Nullable keys**: ``assumeNotNull(`col`)`` for nullable Spark columns in the sort key, plus
|
|
320
|
+
``settings.allow_nullable_key`` when needed.
|
|
284
321
|
|
|
285
|
-
Write mode from BATCH_STAGING_WRITE_MODE (default overwrite
|
|
322
|
+
Write mode from ``BATCH_STAGING_WRITE_MODE`` (default overwrite).
|
|
286
323
|
"""
|
|
287
324
|
n = df.count()
|
|
288
325
|
mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
|
|
@@ -300,13 +337,13 @@ def stage_to_clickhouse(
|
|
|
300
337
|
mode,
|
|
301
338
|
order_by,
|
|
302
339
|
)
|
|
303
|
-
# MergeTree
|
|
304
|
-
#
|
|
305
|
-
_mt_engine = "MergeTree() SETTINGS allow_nullable_key = 1"
|
|
340
|
+
# Plain MergeTree() only — SETTINGS belong in tableProperty("settings.*", ...) so the
|
|
341
|
+
# connector emits them after ORDER BY; inline SETTINGS in ENGINE breaks CH 25.5 parsing.
|
|
306
342
|
w2 = (
|
|
307
343
|
df.writeTo(full_name)
|
|
308
|
-
.tableProperty("engine",
|
|
344
|
+
.tableProperty("engine", "MergeTree()")
|
|
309
345
|
.tableProperty("order_by", order_by)
|
|
346
|
+
.tableProperty("settings.allow_nullable_key", "1")
|
|
310
347
|
)
|
|
311
348
|
if mode == "overwrite":
|
|
312
349
|
w2.createOrReplace()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.23 → batch_analytics-0.3.26}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|