batch-analytics 0.3.27__tar.gz → 0.3.29__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/analytics/__init__.py +2 -0
  4. batch_analytics-0.3.29/src/batch_analytics/analytics/equipment_oee.py +207 -0
  5. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/config.py +38 -11
  6. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/modules.py +2 -0
  7. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/transform.py +13 -3
  8. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  9. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics.egg-info/SOURCES.txt +1 -0
  10. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/README.md +0 -0
  11. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/setup.cfg +0 -0
  12. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/__init__.py +0 -0
  13. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/__main__.py +0 -0
  14. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/analytics/correlation.py +0 -0
  15. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
  16. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
  17. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/analytics/linear_regression.py +0 -0
  18. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  19. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/analytics/t_test.py +0 -0
  20. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/extract.py +0 -0
  21. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/job_runner.py +0 -0
  22. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/log.py +0 -0
  23. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/output/__init__.py +0 -0
  24. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/output/base.py +0 -0
  25. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/output/clickhouse.py +0 -0
  26. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/output/local.py +0 -0
  27. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/output/s3.py +0 -0
  28. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/utils/__init__.py +0 -0
  29. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  30. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  31. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  32. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics.egg-info/requires.txt +0 -0
  33. {batch_analytics-0.3.27 → batch_analytics-0.3.29}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.27
3
+ Version: 0.3.29
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.27"
7
+ version = "0.3.29"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -10,10 +10,12 @@ from .linear_regression import run_linear_regression
10
10
  from .correlation import run_correlation
11
11
  from .pca_clustering import run_pca_clustering
12
12
  from .t_test import run_t_test
13
+ from .equipment_oee import run_equipment_oee
13
14
 
14
15
  __all__ = [
15
16
  "run_linear_regression",
16
17
  "run_correlation",
17
18
  "run_pca_clustering",
18
19
  "run_t_test",
20
+ "run_equipment_oee",
19
21
  ]
@@ -0,0 +1,207 @@
1
+ """
2
+ Equipment-level OEE from interval timing, batch quality, and equipment×operation ideal cycle.
3
+
4
+ - Interval: (equipment_id, operation_id, batch_id) — actual run + idle gaps
5
+ - Batch quality: (batch_id) — is_good_batch
6
+ - Ideal cycle: (equipment_id, operation_id) — standard minutes; batch-independent
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from typing import Any, Dict, List, Tuple
13
+
14
+ from pyspark.sql import DataFrame, SparkSession
15
+ from pyspark.sql import functions as F
16
+
17
+ from ..config import BatchAnalyticsConfig
18
+ from ..extract import extract_table
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ _INTERVAL_REQUIRED = (
23
+ "equipment_id",
24
+ "batch_id",
25
+ "operation_id",
26
+ "planned_production_minutes",
27
+ "actual_run_minutes",
28
+ "idle_gap_minutes",
29
+ )
30
+
31
+ _BATCH_QUALITY_REQUIRED = ("batch_id", "is_good_batch")
32
+
33
+ _IDEAL_CYCLE_REQUIRED = (
34
+ "equipment_id",
35
+ "operation_id",
36
+ "ideal_cycle_time_minutes",
37
+ )
38
+
39
+
40
+ def _require_columns(df: DataFrame, cols: Tuple[str, ...], label: str) -> None:
41
+ missing = [c for c in cols if c not in df.columns]
42
+ if missing:
43
+ raise ValueError(
44
+ f"equipment_oee {label} missing columns: {missing}. Available: {df.columns}"
45
+ )
46
+
47
+
48
def _load_batch_quality(spark: SparkSession, config: BatchAnalyticsConfig) -> DataFrame:
    """Read the batch quality table and return one row per ``batch_id``.

    The table name is taken from ``config.analytics.equipment_oee_batch_quality_table``.
    The result carries ``batch_id``, ``is_good_batch`` cast to double and, when the
    source table has it, ``batch_disposition``.

    Raises:
        ValueError: if no table is configured or a required column is missing.
    """
    table_name = (config.analytics.equipment_oee_batch_quality_table or "").strip()
    if not table_name:
        raise ValueError(
            "equipment_oee requires batch-level quality. Set BATCH_OEE_BATCH_QUALITY_TABLE "
            "to the ClickHouse table (e.g. etc1_executed_bpr_batch_quality)."
        )

    quality_df = extract_table(spark, table_name, config)
    _require_columns(quality_df, _BATCH_QUALITY_REQUIRED, "batch quality table")

    selected = [
        "batch_id",
        F.col("is_good_batch").cast("double").alias("is_good_batch"),
    ]
    if "batch_disposition" in quality_df.columns:
        # Optional column: passed through only when the source table provides it.
        selected.append(F.col("batch_disposition"))

    return quality_df.select(*selected).dropDuplicates(["batch_id"])
66
+
67
+
68
def _load_ideal_cycle(spark: SparkSession, config: BatchAnalyticsConfig) -> DataFrame:
    """Read the ideal cycle table and return one row per (equipment_id, operation_id).

    The table name is taken from ``config.analytics.equipment_oee_ideal_cycle_table``.
    ``ideal_cycle_time_minutes`` is cast to double in the result.

    Raises:
        ValueError: if no table is configured or a required column is missing.
    """
    table_name = (config.analytics.equipment_oee_ideal_cycle_table or "").strip()
    if not table_name:
        raise ValueError(
            "equipment_oee requires equipment×operation ideal cycle standards. "
            "Set BATCH_OEE_IDEAL_CYCLE_TABLE "
            "(e.g. etc1_executed_bpr_equipment_ideal_cycle)."
        )

    cycle_df = extract_table(spark, table_name, config)
    _require_columns(cycle_df, _IDEAL_CYCLE_REQUIRED, "ideal cycle table")

    ideal_minutes = F.col("ideal_cycle_time_minutes").cast("double")
    projected = cycle_df.select(
        "equipment_id",
        "operation_id",
        ideal_minutes.alias("ideal_cycle_time_minutes"),
    )
    return projected.dropDuplicates(["equipment_id", "operation_id"])
83
+
84
+
85
def _staging_availability(df: DataFrame) -> DataFrame:
    """Aggregate run and idle minutes per (equipment_id, batch_id).

    Returns one row per equipment × batch with ``actual_run_minutes``,
    ``downtime_minutes`` (the summed ``idle_gap_minutes``) and
    ``planned_production_minutes`` derived as run + downtime. The input's own
    ``planned_production_minutes`` column is not read here.
    """
    zero = F.lit(0.0)
    # Spark's sum() yields NULL when every value in a group is NULL. Without the
    # coalesce, a batch with no recorded idle gaps (or no run minutes) would get a
    # NULL planned time, and NULL + x stays NULL.
    run_total = F.coalesce(F.sum(F.col("actual_run_minutes").cast("double")), zero)
    idle_total = F.coalesce(F.sum(F.col("idle_gap_minutes").cast("double")), zero)
    return (
        df.groupBy("equipment_id", "batch_id")
        .agg(
            run_total.alias("actual_run_minutes"),
            idle_total.alias("downtime_minutes"),
        )
        .withColumn(
            "planned_production_minutes",
            F.col("actual_run_minutes") + F.col("downtime_minutes"),
        )
    )
97
+
98
+
99
def _interval_with_ideal(interval_df: DataFrame, ideal_df: DataFrame) -> DataFrame:
    """Inner-join interval rows to ideal cycle rows on equipment and operation.

    Interval rows without a matching (equipment_id, operation_id) in *ideal_df*
    are dropped by the inner join.
    """
    join_keys = ["equipment_id", "operation_id"]
    return interval_df.join(ideal_df, on=join_keys, how="inner")
101
+
102
+
103
def _staging_performance(interval_df: DataFrame, ideal_df: DataFrame) -> DataFrame:
    """Summarise run time per (equipment_id, batch_id, operation_id).

    Each output row carries the first ``ideal_cycle_time_minutes`` seen for the
    group and the summed ``actual_run_minutes`` (cast to double before summing).
    """
    ideal_minutes = F.first("ideal_cycle_time_minutes")
    run_minutes = F.sum(F.col("actual_run_minutes").cast("double"))

    per_operation = _interval_with_ideal(interval_df, ideal_df).groupBy(
        "equipment_id", "batch_id", "operation_id"
    )
    return per_operation.agg(
        ideal_minutes.alias("ideal_cycle_time_minutes"),
        run_minutes.alias("actual_run_minutes"),
    )
109
+
110
+
111
def _staging_performance_totals(interval_df: DataFrame, ideal_df: DataFrame) -> DataFrame:
    """Total ideal and actual run minutes per (equipment_id, batch_id).

    The totals are rolled up from the per-operation frame produced by
    ``_staging_performance``, so each operation's ``ideal_cycle_time_minutes``
    is counted once per (equipment, batch, operation). Summing straight over the
    joined interval rows would add the ideal cycle once per interval row and
    inflate it whenever an operation has more than one interval row.
    """
    per_operation = _staging_performance(interval_df, ideal_df)
    return per_operation.groupBy("equipment_id", "batch_id").agg(
        F.sum("ideal_cycle_time_minutes").alias("ideal_cycle_time_minutes"),
        F.sum("actual_run_minutes").alias("actual_run_minutes"),
    )
117
+
118
+
119
+ def _rows_to_dicts(df: DataFrame) -> List[Dict[str, Any]]:
120
+ return [row.asDict() for row in df.collect()]
121
+
122
+
123
def run_equipment_oee(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> Dict[str, Any]:
    """
    Compute equipment × batch OEE.

    Performance uses ideal_cycle_time_minutes from BATCH_OEE_IDEAL_CYCLE_TABLE
    (equipment_id × operation_id, batch-independent), not batch-specific BPR durations.

    Args:
        spark: Session used to read the batch quality and ideal cycle tables.
        df: Interval staging frame; must contain the ``_INTERVAL_REQUIRED`` columns.
        config: Supplies the names of the batch quality and ideal cycle tables.

    Returns:
        Dict with ``row_count``, the collected staging frames
        (``staging_availability``, ``staging_batch_quality``, ``staging_ideal_cycle``,
        ``staging_performance_by_operation``) and the final ``equipment_oee`` rows,
        each as a list of row dicts.

    Raises:
        ValueError: if a required column is missing or a source table is not configured.
    """
    _require_columns(df, _INTERVAL_REQUIRED, "interval staging")

    batch_q = _load_batch_quality(spark, config)
    ideal = _load_ideal_cycle(spark, config)
    avail = _staging_availability(df)
    perf_ops = _staging_performance(df, ideal)
    # Only ideal totals are joined here; avail already has actual_run_minutes.
    # Keeping both would make F.col("actual_run_minutes") ambiguous after the join.
    # NOTE(review): avail sums run minutes over every interval row, while the ideal
    # total only covers operations that have an ideal cycle row (inner join) —
    # confirm that comparing the two is intended when some operations lack a standard.
    perf_tot = _staging_performance_totals(df, ideal).select(
        "equipment_id",
        "batch_id",
        "ideal_cycle_time_minutes",
    )

    joined = (
        avail.join(perf_tot, on=["equipment_id", "batch_id"], how="inner")
        .join(batch_q, on="batch_id", how="inner")
        .withColumn(
            "availability_pct",
            # Guard against division by zero: no planned time means 0 availability.
            F.when(
                F.col("planned_production_minutes") > 0,
                F.col("actual_run_minutes") / F.col("planned_production_minutes"),
            ).otherwise(F.lit(0.0)),
        )
        .withColumn(
            "performance_pct",
            # Capped at 1.0 so running faster than the ideal cycle cannot exceed 100%.
            F.when(
                F.col("actual_run_minutes") > 0,
                F.least(
                    F.lit(1.0),
                    F.col("ideal_cycle_time_minutes") / F.col("actual_run_minutes"),
                ),
            ).otherwise(F.lit(0.0)),
        )
        .withColumn("quality_pct", F.col("is_good_batch").cast("double"))
        .withColumn(
            "oee_pct",
            F.col("availability_pct")
            * F.col("performance_pct")
            * F.col("quality_pct"),
        )
    )

    out_cols = [
        "equipment_id",
        "batch_id",
        "planned_production_minutes",
        "actual_run_minutes",
        "downtime_minutes",
        "ideal_cycle_time_minutes",
        "is_good_batch",
        "availability_pct",
        "performance_pct",
        "quality_pct",
        "oee_pct",
    ]
    if "batch_disposition" in joined.columns:
        # Place the optional column right after is_good_batch; the position is
        # derived rather than hard-coded so it survives edits to out_cols.
        out_cols.insert(out_cols.index("is_good_batch") + 1, "batch_disposition")

    result_df = joined.select(*out_cols).orderBy("equipment_id", "batch_id")

    # Collect once and take the length: count() followed by collect() would run
    # the whole join plan twice.
    oee_rows = _rows_to_dicts(result_df)
    n = len(oee_rows)
    logger.info("equipment_oee: %d equipment×batch rows", n)

    return {
        "row_count": n,
        "staging_availability": _rows_to_dicts(avail.orderBy("equipment_id", "batch_id")),
        "staging_batch_quality": _rows_to_dicts(batch_q.orderBy("batch_id")),
        "staging_ideal_cycle": _rows_to_dicts(ideal.orderBy("equipment_id", "operation_id")),
        "staging_performance_by_operation": _rows_to_dicts(
            perf_ops.orderBy("equipment_id", "batch_id", "operation_id")
        ),
        "equipment_oee": oee_rows,
    }
@@ -74,28 +74,47 @@ class TransformConfig:
74
74
 
75
75
  # Order: extract anchor_id from add_dimension(s) column, then dedupe by these keys.
76
76
  # Deduplication keys (comma-separated). Empty = dropDuplicates() on full row (all columns).
77
- dedup_columns: str = os.environ.get("BATCH_DEDUP_COLUMNS", "")
77
+ # Use default_factory so values reflect os.environ when BatchAnalyticsConfig() is built, not at import time.
78
+ dedup_columns: str = field(
79
+ default_factory=lambda: os.environ.get("BATCH_DEDUP_COLUMNS", "")
80
+ )
78
81
  # Staging output path (local or S3)
79
- staging_path: str = os.environ.get(
80
- "BATCH_STAGING_PATH",
81
- "/tmp/analytics_stage",
82
+ staging_path: str = field(
83
+ default_factory=lambda: os.environ.get(
84
+ "BATCH_STAGING_PATH",
85
+ "/tmp/analytics_stage",
86
+ )
82
87
  )
83
88
  # Output format for load_staged when reading (parquet/delta/clickhouse).
84
89
  # Stage job always writes to ClickHouse; use clickhouse for analytics to read from staged table.
85
- staging_format: str = os.environ.get("BATCH_STAGING_FORMAT", "clickhouse")
90
+ staging_format: str = field(
91
+ default_factory=lambda: os.environ.get("BATCH_STAGING_FORMAT", "clickhouse")
92
+ )
86
93
  # Staging table name in ClickHouse (when format=clickhouse)
87
- staging_table: str = os.environ.get("BATCH_STAGING_TABLE", "analytics_staging")
94
+ staging_table: str = field(
95
+ default_factory=lambda: os.environ.get("BATCH_STAGING_TABLE", "analytics_staging")
96
+ )
88
97
  # Spark save mode for ClickHouse staging (and path staging): overwrite | append
89
- staging_write_mode: str = os.environ.get("BATCH_STAGING_WRITE_MODE", "overwrite")
98
+ staging_write_mode: str = field(
99
+ default_factory=lambda: os.environ.get("BATCH_STAGING_WRITE_MODE", "overwrite")
100
+ )
90
101
  # Source column holding a JSON object or Python dict string; every top-level key becomes a new String column
91
102
  # (see transform.expand_kv_blob_column). Example: add_dimensions {'anchor_id':'...','lot':'A1'}
92
- add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
103
+ add_dimension_column: str = field(
104
+ default_factory=lambda: os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
105
+ )
93
106
  # Legacy: no longer used; output column names match JSON keys (e.g. anchor_id). Kept for env compatibility.
94
- anchor_id_column: str = os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
107
+ anchor_id_column: str = field(
108
+ default_factory=lambda: os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
109
+ )
95
110
  # JSON object: {"new_col": "Spark SQL expression"} applied after KV expansion, before dedupe.
96
- expr_columns_json: str = os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
111
+ expr_columns_json: str = field(
112
+ default_factory=lambda: os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
113
+ )
97
114
  # JSON pivot spec (group_by, pivot_column, value_column, agg, optional pivot_values, column_name_prefix).
98
- pivot_json: str = os.environ.get("BATCH_TRANSFORM_PIVOT_JSON", "").strip()
115
+ pivot_json: str = field(
116
+ default_factory=lambda: os.environ.get("BATCH_TRANSFORM_PIVOT_JSON", "").strip()
117
+ )
99
118
 
100
119
 
101
120
  @dataclass
@@ -158,6 +177,14 @@ class AnalyticsConfig:
158
177
  ttest_col_a: str = os.environ.get("BATCH_TTEST_COL_A", "")
159
178
  ttest_col_b: str = os.environ.get("BATCH_TTEST_COL_B", "")
160
179
 
180
# Equipment OEE: batch-level quality table (batch_id, is_good_batch, …)
# default_factory reads os.environ when the config object is built, not at import time,
# so a variable set after this module is imported is still picked up.
equipment_oee_batch_quality_table: str = field(
    default_factory=lambda: os.environ.get("BATCH_OEE_BATCH_QUALITY_TABLE", "").strip()
)
# Equipment OEE: ideal cycle table (equipment_id, operation_id, ideal_cycle_time_minutes)
equipment_oee_ideal_cycle_table: str = field(
    default_factory=lambda: os.environ.get("BATCH_OEE_IDEAL_CYCLE_TABLE", "").strip()
)
+
161
188
 
162
189
  @dataclass
163
190
  class SparkK8sConfig:
@@ -10,6 +10,7 @@ from .analytics import (
10
10
  run_correlation,
11
11
  run_pca_clustering,
12
12
  run_t_test,
13
+ run_equipment_oee,
13
14
  )
14
15
 
15
16
  # module_arg -> (run_fn, result_key)
@@ -18,6 +19,7 @@ MODULE_REGISTRY = {
18
19
  "corr": (run_correlation, "correlation"),
19
20
  "pca": (run_pca_clustering, "pca_clustering"),
20
21
  "ttest": (run_t_test, "t_test"),
22
+ "oee": (run_equipment_oee, "equipment_oee"),
21
23
  }
22
24
 
23
25
  VALID_MODULES = list(MODULE_REGISTRY.keys())
@@ -175,7 +175,11 @@ def apply_spark_expr_columns(
175
175
  Value must be a JSON object mapping **output column name** → **expression** (same dialect as
176
176
  ``selectExpr``), e.g. ``{"y": "cast(duration_minutes as double)", "x": "cast(regexp_extract(operation_id, '([0-9]+)$', 1) as double)"}`` (Java regex; prefer trailing digits to avoid hyphen/range issues in patterns like ``ETC-1-OP-…``).
177
177
  """
178
- raw = (config.transform.expr_columns_json or "").strip()
178
+ # Prefer live os.environ (Spark driver env), same as BATCH_CLICKHOUSE_STAGING_ORDER_BY; config
179
+ # snapshot may lag if env was set after import (dataclass defaults used to freeze at class def).
180
+ raw = os.environ.get("BATCH_TRANSFORM_EXPR_COLUMNS", "").strip()
181
+ if not raw:
182
+ raw = (config.transform.expr_columns_json or "").strip()
179
183
  if not raw:
180
184
  return df
181
185
  try:
@@ -224,7 +228,9 @@ def apply_pivot(
224
228
 
225
229
  If ``pivot_values`` is omitted, distinct values are collected (sorted by string order).
226
230
  """
227
- raw = (config.transform.pivot_json or "").strip()
231
+ raw = os.environ.get("BATCH_TRANSFORM_PIVOT_JSON", "").strip()
232
+ if not raw:
233
+ raw = (config.transform.pivot_json or "").strip()
228
234
  if not raw:
229
235
  return df
230
236
  try:
@@ -423,11 +429,15 @@ def stage_to_clickhouse(
423
429
  try:
424
430
  full_name = f"{cat}.{ch.database}.{tbl}"
425
431
  order_by = _merge_tree_order_by_for_staging(df, config)
432
+ # Helps compare local vs CH 25.x prod: missing ORDER BY cols are a schema issue here, not env.
426
433
  logger.info(
427
- "Staging to ClickHouse via catalog %s (mode=%s, order_by=%s)",
434
+ "Staging to ClickHouse via catalog %s (mode=%s, order_by=%s); "
435
+ "dataframe columns (%d): %s",
428
436
  full_name,
429
437
  mode,
430
438
  order_by,
439
+ len(df.columns),
440
+ sorted(df.columns),
431
441
  )
432
442
  # Plain MergeTree() only — SETTINGS belong in tableProperty("settings.*", ...) so the
433
443
  # connector emits them after ORDER BY; inline SETTINGS in ENGINE breaks CH 25.5 parsing.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.27
3
+ Version: 0.3.29
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -16,6 +16,7 @@ src/batch_analytics.egg-info/requires.txt
16
16
  src/batch_analytics.egg-info/top_level.txt
17
17
  src/batch_analytics/analytics/__init__.py
18
18
  src/batch_analytics/analytics/correlation.py
19
+ src/batch_analytics/analytics/equipment_oee.py
19
20
  src/batch_analytics/analytics/gluon_autogluon_infer.py
20
21
  src/batch_analytics/analytics/gluon_autogluon_train.py
21
22
  src/batch_analytics/analytics/linear_regression.py