batch-analytics 0.3.26__tar.gz → 0.3.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/PKG-INFO +1 -1
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/pyproject.toml +1 -1
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/analytics/__init__.py +2 -0
- batch_analytics-0.3.28/src/batch_analytics/analytics/equipment_oee.py +201 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/config.py +39 -10
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/modules.py +2 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/transform.py +105 -3
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics.egg-info/SOURCES.txt +1 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/README.md +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/setup.cfg +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/utils/__init__.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.26"
|
|
7
|
+
version = "0.3.28"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -10,10 +10,12 @@ from .linear_regression import run_linear_regression
|
|
|
10
10
|
from .correlation import run_correlation
|
|
11
11
|
from .pca_clustering import run_pca_clustering
|
|
12
12
|
from .t_test import run_t_test
|
|
13
|
+
from .equipment_oee import run_equipment_oee
|
|
13
14
|
|
|
14
15
|
__all__ = [
|
|
15
16
|
"run_linear_regression",
|
|
16
17
|
"run_correlation",
|
|
17
18
|
"run_pca_clustering",
|
|
18
19
|
"run_t_test",
|
|
20
|
+
"run_equipment_oee",
|
|
19
21
|
]
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Equipment-level OEE from interval timing, batch quality, and equipment×operation ideal cycle.
|
|
3
|
+
|
|
4
|
+
- Interval: (equipment_id, operation_id, batch_id) — actual run + idle gaps
|
|
5
|
+
- Batch quality: (batch_id) — is_good_batch
|
|
6
|
+
- Ideal cycle: (equipment_id, operation_id) — standard minutes; batch-independent
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Any, Dict, List, Tuple
|
|
13
|
+
|
|
14
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
15
|
+
from pyspark.sql import functions as F
|
|
16
|
+
|
|
17
|
+
from ..config import BatchAnalyticsConfig
|
|
18
|
+
from ..extract import extract_table
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
# Columns that the interval staging DataFrame handed to run_equipment_oee must
# contain; checked up front via _require_columns(..., "interval staging").
_INTERVAL_REQUIRED = (
    "equipment_id",
    "batch_id",
    "operation_id",
    "planned_production_minutes",
    "actual_run_minutes",
    "idle_gap_minutes",
)

# Columns that the batch quality table must contain (see _load_batch_quality).
_BATCH_QUALITY_REQUIRED = ("batch_id", "is_good_batch")

# Columns that the ideal cycle table must contain (see _load_ideal_cycle).
_IDEAL_CYCLE_REQUIRED = (
    "equipment_id",
    "operation_id",
    "ideal_cycle_time_minutes",
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _require_columns(df: DataFrame, cols: Tuple[str, ...], label: str) -> None:
|
|
41
|
+
missing = [c for c in cols if c not in df.columns]
|
|
42
|
+
if missing:
|
|
43
|
+
raise ValueError(
|
|
44
|
+
f"equipment_oee {label} missing columns: {missing}. Available: {df.columns}"
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _load_batch_quality(spark: SparkSession, config: BatchAnalyticsConfig) -> DataFrame:
|
|
49
|
+
table = (config.analytics.equipment_oee_batch_quality_table or "").strip()
|
|
50
|
+
if not table:
|
|
51
|
+
raise ValueError(
|
|
52
|
+
"equipment_oee requires batch-level quality. Set BATCH_OEE_BATCH_QUALITY_TABLE "
|
|
53
|
+
"to the ClickHouse table (e.g. etc1_executed_bpr_batch_quality)."
|
|
54
|
+
)
|
|
55
|
+
df = extract_table(spark, table, config)
|
|
56
|
+
_require_columns(df, _BATCH_QUALITY_REQUIRED, "batch quality table")
|
|
57
|
+
return df.select(
|
|
58
|
+
"batch_id",
|
|
59
|
+
F.col("is_good_batch").cast("double").alias("is_good_batch"),
|
|
60
|
+
*(
|
|
61
|
+
[F.col("batch_disposition")]
|
|
62
|
+
if "batch_disposition" in df.columns
|
|
63
|
+
else []
|
|
64
|
+
),
|
|
65
|
+
).dropDuplicates(["batch_id"])
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _load_ideal_cycle(spark: SparkSession, config: BatchAnalyticsConfig) -> DataFrame:
|
|
69
|
+
table = (config.analytics.equipment_oee_ideal_cycle_table or "").strip()
|
|
70
|
+
if not table:
|
|
71
|
+
raise ValueError(
|
|
72
|
+
"equipment_oee requires equipment×operation ideal cycle standards. "
|
|
73
|
+
"Set BATCH_OEE_IDEAL_CYCLE_TABLE "
|
|
74
|
+
"(e.g. etc1_executed_bpr_equipment_ideal_cycle)."
|
|
75
|
+
)
|
|
76
|
+
df = extract_table(spark, table, config)
|
|
77
|
+
_require_columns(df, _IDEAL_CYCLE_REQUIRED, "ideal cycle table")
|
|
78
|
+
return df.select(
|
|
79
|
+
"equipment_id",
|
|
80
|
+
"operation_id",
|
|
81
|
+
F.col("ideal_cycle_time_minutes").cast("double").alias("ideal_cycle_time_minutes"),
|
|
82
|
+
).dropDuplicates(["equipment_id", "operation_id"])
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _staging_availability(df: DataFrame) -> DataFrame:
    """Aggregate interval rows to one availability row per (equipment_id, batch_id).

    Output columns: the two keys, ``actual_run_minutes`` (sum of run minutes),
    ``downtime_minutes`` (sum of ``idle_gap_minutes``) and
    ``planned_production_minutes``, which is derived here as run + downtime —
    any ``planned_production_minutes`` column on the input is not read.
    """
    run_total = F.sum(F.col("actual_run_minutes").cast("double")).alias("actual_run_minutes")
    idle_total = F.sum(F.col("idle_gap_minutes").cast("double")).alias("downtime_minutes")

    per_batch = df.groupBy("equipment_id", "batch_id").agg(run_total, idle_total)

    planned = F.col("actual_run_minutes") + F.col("downtime_minutes")
    return per_batch.withColumn("planned_production_minutes", planned)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _interval_with_ideal(interval_df: DataFrame, ideal_df: DataFrame) -> DataFrame:
|
|
100
|
+
return interval_df.join(ideal_df, on=["equipment_id", "operation_id"], how="inner")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _staging_performance(interval_df: DataFrame, ideal_df: DataFrame) -> DataFrame:
    """Per-operation performance staging: one row per (equipment, batch, operation).

    For each group, keeps the first ``ideal_cycle_time_minutes`` seen and the
    summed ``actual_run_minutes`` of the intervals that matched an ideal cycle.
    """
    matched = _interval_with_ideal(interval_df, ideal_df)
    aggregates = [
        F.first("ideal_cycle_time_minutes").alias("ideal_cycle_time_minutes"),
        F.sum(F.col("actual_run_minutes").cast("double")).alias("actual_run_minutes"),
    ]
    return matched.groupBy("equipment_id", "batch_id", "operation_id").agg(*aggregates)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _staging_performance_totals(interval_df: DataFrame, ideal_df: DataFrame) -> DataFrame:
    """Performance totals: one row per (equipment_id, batch_id).

    Sums ``ideal_cycle_time_minutes`` and ``actual_run_minutes`` over every
    interval row that matched an ideal cycle.

    NOTE(review): the ideal time is summed once per joined interval row, so if
    an (equipment, batch, operation) key can appear on several interval rows
    its ideal minutes are counted several times, whereas
    ``_staging_performance`` takes ``first`` per operation — confirm intended.
    """
    matched = _interval_with_ideal(interval_df, ideal_df)
    ideal_total = F.sum("ideal_cycle_time_minutes").alias("ideal_cycle_time_minutes")
    run_total = F.sum(F.col("actual_run_minutes").cast("double")).alias("actual_run_minutes")
    return matched.groupBy("equipment_id", "batch_id").agg(ideal_total, run_total)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _rows_to_dicts(df: DataFrame) -> List[Dict[str, Any]]:
|
|
120
|
+
return [row.asDict() for row in df.collect()]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def run_equipment_oee(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> Dict[str, Any]:
    """
    Compute equipment × batch OEE.

    Performance uses ideal_cycle_time_minutes from BATCH_OEE_IDEAL_CYCLE_TABLE
    (equipment_id × operation_id, batch-independent), not batch-specific BPR durations.

    Args:
        spark: Active session, used to read the batch quality and ideal cycle tables.
        df: Interval staging DataFrame; must contain every column in
            ``_INTERVAL_REQUIRED``.
        config: Job configuration; ``config.analytics`` supplies the two table names.

    Returns:
        A dict with ``row_count`` plus the collected rows (as lists of dicts) of
        each staging frame and of the final ``equipment_oee`` result, which is
        ordered by (equipment_id, batch_id).

    Raises:
        ValueError: If a required column or table setting is missing.
    """
    _require_columns(df, _INTERVAL_REQUIRED, "interval staging")

    batch_q = _load_batch_quality(spark, config)
    ideal = _load_ideal_cycle(spark, config)
    avail = _staging_availability(df)
    perf_ops = _staging_performance(df, ideal)
    # Both `avail` and the performance totals expose an `actual_run_minutes`
    # column. Joining on the key names only de-duplicates the keys, so leaving
    # both in place makes every later F.col("actual_run_minutes") an ambiguous
    # reference. Rename the performance-side copy: it covers only intervals that
    # matched an ideal cycle, i.e. the same scope as the summed ideal minutes.
    perf_tot = _staging_performance_totals(df, ideal).withColumnRenamed(
        "actual_run_minutes", "performance_run_minutes"
    )

    joined = (
        avail.join(perf_tot, on=["equipment_id", "batch_id"], how="inner")
        .join(batch_q, on="batch_id", how="inner")
        .withColumn(
            "availability_pct",
            # Guard against division by zero when nothing was planned.
            F.when(
                F.col("planned_production_minutes") > 0,
                F.col("actual_run_minutes") / F.col("planned_production_minutes"),
            ).otherwise(F.lit(0.0)),
        )
        .withColumn(
            "performance_pct",
            # Capped at 1.0: running faster than the ideal cycle is not credited.
            F.when(
                F.col("performance_run_minutes") > 0,
                F.least(
                    F.lit(1.0),
                    F.col("ideal_cycle_time_minutes") / F.col("performance_run_minutes"),
                ),
            ).otherwise(F.lit(0.0)),
        )
        .withColumn("quality_pct", F.col("is_good_batch").cast("double"))
        .withColumn(
            "oee_pct",
            F.col("availability_pct")
            * F.col("performance_pct")
            * F.col("quality_pct"),
        )
    )

    out_cols = [
        "equipment_id",
        "batch_id",
        "planned_production_minutes",
        "actual_run_minutes",
        "downtime_minutes",
        "ideal_cycle_time_minutes",
        "is_good_batch",
        "availability_pct",
        "performance_pct",
        "quality_pct",
        "oee_pct",
    ]
    if "batch_disposition" in joined.columns:
        # Place the optional disposition right after is_good_batch.
        out_cols.insert(7, "batch_disposition")

    result_df = joined.select(*out_cols).orderBy("equipment_id", "batch_id")

    # Collect once and derive the count from it instead of running the whole
    # plan twice (count() followed by collect()).
    oee_rows = _rows_to_dicts(result_df)
    n = len(oee_rows)
    logger.info("equipment_oee: %d equipment×batch rows", n)

    return {
        "row_count": n,
        "staging_availability": _rows_to_dicts(avail.orderBy("equipment_id", "batch_id")),
        "staging_batch_quality": _rows_to_dicts(batch_q.orderBy("batch_id")),
        "staging_ideal_cycle": _rows_to_dicts(ideal.orderBy("equipment_id", "operation_id")),
        "staging_performance_by_operation": _rows_to_dicts(
            perf_ops.orderBy("equipment_id", "batch_id", "operation_id")
        ),
        "equipment_oee": oee_rows,
    }
|
|
@@ -74,26 +74,47 @@ class TransformConfig:
|
|
|
74
74
|
|
|
75
75
|
# Order: extract anchor_id from add_dimension(s) column, then dedupe by these keys.
|
|
76
76
|
# Deduplication keys (comma-separated). Empty = dropDuplicates() on full row (all columns).
|
|
77
|
-
|
|
77
|
+
# Use default_factory so values reflect os.environ when BatchAnalyticsConfig() is built, not at import time.
|
|
78
|
+
dedup_columns: str = field(
|
|
79
|
+
default_factory=lambda: os.environ.get("BATCH_DEDUP_COLUMNS", "")
|
|
80
|
+
)
|
|
78
81
|
# Staging output path (local or S3)
|
|
79
|
-
staging_path: str =
|
|
80
|
-
|
|
81
|
-
|
|
82
|
+
staging_path: str = field(
|
|
83
|
+
default_factory=lambda: os.environ.get(
|
|
84
|
+
"BATCH_STAGING_PATH",
|
|
85
|
+
"/tmp/analytics_stage",
|
|
86
|
+
)
|
|
82
87
|
)
|
|
83
88
|
# Output format for load_staged when reading (parquet/delta/clickhouse).
|
|
84
89
|
# Stage job always writes to ClickHouse; use clickhouse for analytics to read from staged table.
|
|
85
|
-
staging_format: str =
|
|
90
|
+
staging_format: str = field(
|
|
91
|
+
default_factory=lambda: os.environ.get("BATCH_STAGING_FORMAT", "clickhouse")
|
|
92
|
+
)
|
|
86
93
|
# Staging table name in ClickHouse (when format=clickhouse)
|
|
87
|
-
staging_table: str =
|
|
94
|
+
staging_table: str = field(
|
|
95
|
+
default_factory=lambda: os.environ.get("BATCH_STAGING_TABLE", "analytics_staging")
|
|
96
|
+
)
|
|
88
97
|
# Spark save mode for ClickHouse staging (and path staging): overwrite | append
|
|
89
|
-
staging_write_mode: str =
|
|
98
|
+
staging_write_mode: str = field(
|
|
99
|
+
default_factory=lambda: os.environ.get("BATCH_STAGING_WRITE_MODE", "overwrite")
|
|
100
|
+
)
|
|
90
101
|
# Source column holding a JSON object or Python dict string; every top-level key becomes a new String column
|
|
91
102
|
# (see transform.expand_kv_blob_column). Example: add_dimensions {'anchor_id':'...','lot':'A1'}
|
|
92
|
-
add_dimension_column: str =
|
|
103
|
+
add_dimension_column: str = field(
|
|
104
|
+
default_factory=lambda: os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
|
|
105
|
+
)
|
|
93
106
|
# Legacy: no longer used; output column names match JSON keys (e.g. anchor_id). Kept for env compatibility.
|
|
94
|
-
anchor_id_column: str =
|
|
107
|
+
anchor_id_column: str = field(
|
|
108
|
+
default_factory=lambda: os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
|
|
109
|
+
)
|
|
95
110
|
# JSON object: {"new_col": "Spark SQL expression"} applied after KV expansion, before dedupe.
|
|
96
|
-
expr_columns_json: str =
|
|
111
|
+
expr_columns_json: str = field(
|
|
112
|
+
default_factory=lambda: os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
|
|
113
|
+
)
|
|
114
|
+
# JSON pivot spec (group_by, pivot_column, value_column, agg, optional pivot_values, column_name_prefix).
|
|
115
|
+
pivot_json: str = field(
|
|
116
|
+
default_factory=lambda: os.environ.get("BATCH_TRANSFORM_PIVOT_JSON", "").strip()
|
|
117
|
+
)
|
|
97
118
|
|
|
98
119
|
|
|
99
120
|
@dataclass
|
|
@@ -156,6 +177,14 @@ class AnalyticsConfig:
|
|
|
156
177
|
ttest_col_a: str = os.environ.get("BATCH_TTEST_COL_A", "")
|
|
157
178
|
ttest_col_b: str = os.environ.get("BATCH_TTEST_COL_B", "")
|
|
158
179
|
|
|
180
|
+
# Equipment OEE: batch-level quality table (batch_id, is_good_batch, …)
|
|
181
|
+
equipment_oee_batch_quality_table: str = os.environ.get(
|
|
182
|
+
"BATCH_OEE_BATCH_QUALITY_TABLE", ""
|
|
183
|
+
).strip()
|
|
184
|
+
equipment_oee_ideal_cycle_table: str = os.environ.get(
|
|
185
|
+
"BATCH_OEE_IDEAL_CYCLE_TABLE", ""
|
|
186
|
+
).strip()
|
|
187
|
+
|
|
159
188
|
|
|
160
189
|
@dataclass
|
|
161
190
|
class SparkK8sConfig:
|
|
@@ -10,6 +10,7 @@ from .analytics import (
|
|
|
10
10
|
run_correlation,
|
|
11
11
|
run_pca_clustering,
|
|
12
12
|
run_t_test,
|
|
13
|
+
run_equipment_oee,
|
|
13
14
|
)
|
|
14
15
|
|
|
15
16
|
# module_arg -> (run_fn, result_key)
|
|
@@ -18,6 +19,7 @@ MODULE_REGISTRY = {
|
|
|
18
19
|
"corr": (run_correlation, "correlation"),
|
|
19
20
|
"pca": (run_pca_clustering, "pca_clustering"),
|
|
20
21
|
"ttest": (run_t_test, "t_test"),
|
|
22
|
+
"oee": (run_equipment_oee, "equipment_oee"),
|
|
21
23
|
}
|
|
22
24
|
|
|
23
25
|
VALID_MODULES = list(MODULE_REGISTRY.keys())
|
|
@@ -175,7 +175,11 @@ def apply_spark_expr_columns(
|
|
|
175
175
|
Value must be a JSON object mapping **output column name** → **expression** (same dialect as
|
|
176
176
|
``selectExpr``), e.g. ``{"y": "cast(duration_minutes as double)", "x": "cast(regexp_extract(operation_id, '([0-9]+)$', 1) as double)"}`` (Java regex; prefer trailing digits to avoid hyphen/range issues in patterns like ``ETC-1-OP-…``).
|
|
177
177
|
"""
|
|
178
|
-
|
|
178
|
+
# Prefer live os.environ (Spark driver env), same as BATCH_CLICKHOUSE_STAGING_ORDER_BY; config
|
|
179
|
+
# snapshot may lag if env was set after import (dataclass defaults used to freeze at class def).
|
|
180
|
+
raw = os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
|
|
181
|
+
if not raw:
|
|
182
|
+
raw = (config.transform.expr_columns_json or "").strip()
|
|
179
183
|
if not raw:
|
|
180
184
|
return df
|
|
181
185
|
try:
|
|
@@ -196,6 +200,98 @@ def apply_spark_expr_columns(
|
|
|
196
200
|
return out
|
|
197
201
|
|
|
198
202
|
|
|
203
|
+
def _pivot_output_column_name(prefix: str, pivot_value: str) -> str:
|
|
204
|
+
"""Stable identifier for pivoted columns (e.g. imp_rm_001 from RM-001)."""
|
|
205
|
+
p = (prefix or "v").rstrip("_")
|
|
206
|
+
body = str(pivot_value).replace("-", "_").replace(" ", "_").lower()
|
|
207
|
+
return f"{p}_{body}"
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def apply_pivot(
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    GroupBy + pivot + aggregate (``BATCH_TRANSFORM_PIVOT_JSON``).

    Example::

        {
          "group_by": "batch_id",
          "pivot_column": "material_id",
          "value_column": "total_impurities_pct",
          "agg": "max",
          "pivot_values": ["RM-001", "RM-002"],
          "column_name_prefix": "imp",
          "rename_pivot_columns": true
        }

    ``group_by`` may be a comma-separated string or a JSON array of column names.
    If ``pivot_values`` is omitted, distinct values are collected, sorted by the
    pivot column's own ordering.

    The spec is read from the live ``BATCH_TRANSFORM_PIVOT_JSON`` environment
    variable first and from ``config.transform.pivot_json`` otherwise; when both
    are empty ``df`` is returned unchanged.

    Raises:
        ValueError: If the spec is not a valid JSON object, lacks a required
            key, names a column missing from ``df``, or uses an unsupported ``agg``.
    """
    raw = os.environ.get("BATCH_TRANSFORM_PIVOT_JSON", "").strip()
    if not raw:
        raw = (config.transform.pivot_json or "").strip()
    if not raw:
        return df
    try:
        spec = json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"BATCH_TRANSFORM_PIVOT_JSON must be valid JSON: {e}") from e
    if not isinstance(spec, dict):
        raise ValueError("BATCH_TRANSFORM_PIVOT_JSON must be a JSON object")

    group_by = spec.get("group_by") or spec.get("groupBy")
    pivot_col = (spec.get("pivot_column") or "").strip()
    value_col = (spec.get("value_column") or "").strip()
    if not group_by or not pivot_col or not value_col:
        raise ValueError("pivot spec requires group_by, pivot_column, value_column")
    agg_name = (spec.get("agg") or "max").strip().lower()
    prefix = (spec.get("column_name_prefix") or "v").strip() or "v"
    rename_pivot = spec.get("rename_pivot_columns", True)

    from pyspark.sql import functions as F

    # Accept both "a,b" and ["a", "b"]; str() on a list would yield bogus
    # names such as "['a'".
    if isinstance(group_by, (list, tuple)):
        raw_names = [str(c) for c in group_by]
    else:
        raw_names = str(group_by).split(",")
    gcols = [c.strip() for c in raw_names if c.strip()]
    if not gcols:
        raise ValueError("pivot spec requires group_by, pivot_column, value_column")
    for c in gcols + [pivot_col, value_col]:
        if c not in df.columns:
            raise ValueError(f"pivot: column {c!r} not in dataframe; have {df.columns}")

    aggregators = {
        "max": F.max,
        "min": F.min,
        "sum": F.sum,
        "avg": F.avg,
        "mean": F.avg,
        "first": F.first,
    }
    if agg_name not in aggregators:
        raise ValueError(
            f"pivot agg must be one of max,min,sum,avg,mean,first; got {agg_name!r}"
        )
    agg_expr = aggregators[agg_name](F.col(value_col))

    pivot_values = spec.get("pivot_values")
    if pivot_values is not None:
        pv = [str(v) for v in pivot_values]
    else:
        pv = [r[0] for r in df.select(pivot_col).distinct().sort(pivot_col).collect()]
        logger.info("pivot: inferred %d distinct %s values", len(pv), pivot_col)

    out = df.groupBy(*gcols).pivot(pivot_col, pv).agg(agg_expr)

    if rename_pivot:
        for v in pv:
            # Output column names are always strings; inferred values may be
            # ints, floats, None, ... so look them up by their string form
            # (a null pivot value is named "null" by Spark).
            old = "null" if v is None else str(v)
            if old not in out.columns:
                continue
            new_name = _pivot_output_column_name(prefix, old)
            if new_name != old:
                out = out.withColumnRenamed(old, new_name)
    return out
|
|
293
|
+
|
|
294
|
+
|
|
199
295
|
def transform(
|
|
200
296
|
df: DataFrame,
|
|
201
297
|
config: BatchAnalyticsConfig,
|
|
@@ -203,11 +299,13 @@ def transform(
|
|
|
203
299
|
"""
|
|
204
300
|
Apply transformation only: (1) expand JSON/KV blob column into one column per top-level key,
|
|
205
301
|
(2) optional Spark SQL expression columns (``BATCH_TRANSFORM_EXPR_COLUMNS``),
|
|
206
|
-
(3)
|
|
302
|
+
(3) optional groupBy+pivot (``BATCH_TRANSFORM_PIVOT_JSON``),
|
|
303
|
+
(4) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
|
|
207
304
|
Does not write anywhere. Use stage_to_clickhouse() separately to persist.
|
|
208
305
|
"""
|
|
209
306
|
transformed = expand_kv_blob_column(df, config)
|
|
210
307
|
transformed = apply_spark_expr_columns(transformed, config)
|
|
308
|
+
transformed = apply_pivot(transformed, config)
|
|
211
309
|
dedup_cols = (
|
|
212
310
|
[c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
|
|
213
311
|
if config.transform.dedup_columns
|
|
@@ -331,11 +429,15 @@ def stage_to_clickhouse(
|
|
|
331
429
|
try:
|
|
332
430
|
full_name = f"{cat}.{ch.database}.{tbl}"
|
|
333
431
|
order_by = _merge_tree_order_by_for_staging(df, config)
|
|
432
|
+
# Helps compare local vs CH 25.x prod: missing ORDER BY cols are a schema issue here, not env.
|
|
334
433
|
logger.info(
|
|
335
|
-
"Staging to ClickHouse via catalog %s (mode=%s, order_by=%s)"
|
|
434
|
+
"Staging to ClickHouse via catalog %s (mode=%s, order_by=%s); "
|
|
435
|
+
"dataframe columns (%d): %s",
|
|
336
436
|
full_name,
|
|
337
437
|
mode,
|
|
338
438
|
order_by,
|
|
439
|
+
len(df.columns),
|
|
440
|
+
sorted(df.columns),
|
|
339
441
|
)
|
|
340
442
|
# Plain MergeTree() only — SETTINGS belong in tableProperty("settings.*", ...) so the
|
|
341
443
|
# connector emits them after ORDER BY; inline SETTINGS in ENGINE breaks CH 25.5 parsing.
|
|
@@ -16,6 +16,7 @@ src/batch_analytics.egg-info/requires.txt
|
|
|
16
16
|
src/batch_analytics.egg-info/top_level.txt
|
|
17
17
|
src/batch_analytics/analytics/__init__.py
|
|
18
18
|
src/batch_analytics/analytics/correlation.py
|
|
19
|
+
src/batch_analytics/analytics/equipment_oee.py
|
|
19
20
|
src/batch_analytics/analytics/gluon_autogluon_infer.py
|
|
20
21
|
src/batch_analytics/analytics/gluon_autogluon_train.py
|
|
21
22
|
src/batch_analytics/analytics/linear_regression.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.26 → batch_analytics-0.3.28}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|