batch-analytics 0.3.30__tar.gz → 0.3.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/PKG-INFO +2 -2
  2. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/pyproject.toml +2 -2
  3. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/analytics/__init__.py +2 -0
  4. batch_analytics-0.3.34/src/batch_analytics/analytics/pca.py +70 -0
  5. batch_analytics-0.3.34/src/batch_analytics/analytics/pca_core.py +124 -0
  6. batch_analytics-0.3.34/src/batch_analytics/analytics/pca_mvda.py +126 -0
  7. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/config.py +5 -1
  8. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/modules.py +4 -0
  9. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics.egg-info/PKG-INFO +2 -2
  10. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics.egg-info/SOURCES.txt +5 -1
  11. batch_analytics-0.3.34/tests/test_pca_mvda.py +86 -0
  12. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/README.md +0 -0
  13. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/setup.cfg +0 -0
  14. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/__init__.py +0 -0
  15. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/__main__.py +0 -0
  16. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/analytics/correlation.py +0 -0
  17. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/analytics/equipment_oee.py +0 -0
  18. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
  19. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
  20. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/analytics/linear_regression.py +0 -0
  21. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  22. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/analytics/t_test.py +0 -0
  23. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/extract.py +0 -0
  24. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/job_runner.py +0 -0
  25. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/log.py +0 -0
  26. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/output/__init__.py +0 -0
  27. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/output/base.py +0 -0
  28. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/output/clickhouse.py +0 -0
  29. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/output/local.py +0 -0
  30. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/output/s3.py +0 -0
  31. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/transform.py +0 -0
  32. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/utils/__init__.py +0 -0
  33. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  34. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  35. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  36. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics.egg-info/requires.txt +0 -0
  37. {batch_analytics-0.3.30 → batch_analytics-0.3.34}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.30
4
- Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
3
+ Version: 0.3.34
4
+ Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test, LLM classification).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
7
7
  Requires-Python: >=3.8
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.30"
8
- description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
7
+ version = "0.3.34"
8
+ description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test, LLM classification)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
11
11
  dependencies = [
@@ -8,6 +8,7 @@ Analytics modules for batch analytics pipeline.
8
8
 
9
9
  from .linear_regression import run_linear_regression
10
10
  from .correlation import run_correlation
11
+ from .pca import run_pca
11
12
  from .pca_clustering import run_pca_clustering
12
13
  from .t_test import run_t_test
13
14
  from .equipment_oee import run_equipment_oee
@@ -15,6 +16,7 @@ from .equipment_oee import run_equipment_oee
15
16
  __all__ = [
16
17
  "run_linear_regression",
17
18
  "run_correlation",
19
+ "run_pca",
18
20
  "run_pca_clustering",
19
21
  "run_t_test",
20
22
  "run_equipment_oee",
@@ -0,0 +1,70 @@
1
+ """
2
+ Module: MVDA PCA on staged data (litewave-analytics POST /pca compatible response).
3
+ """
4
+
5
+ import logging
6
+ from typing import Any, Dict
7
+
8
+ from pyspark.sql import DataFrame, SparkSession
9
+
10
+ from ..config import BatchAnalyticsConfig
11
+ from .pca_core import (
12
+ build_mvda_pca_response,
13
+ empty_mvda_response,
14
+ ensure_target_in_features,
15
+ fit_pca_and_collect_scores,
16
+ prepare_pca_dataframe,
17
+ resolve_n_components,
18
+ resolve_pca_feature_cols,
19
+ )
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def run_pca(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> Dict[str, Any]:
    """
    Run Principal Component Analysis and return the flat MVDA payload.

    Args:
        spark: Not used by this function; accepted so the signature matches
            the other entries of MODULE_REGISTRY.
        df: Input DataFrame containing the candidate feature columns.
        config: Batch analytics configuration. ``analytics.pca_min_rows``,
            ``analytics.pca_sample_size`` and ``analytics.pca_target_feature``
            are read here; the helpers read further ``analytics.pca_*`` values.

    Returns:
        Flat dict with keys n_components, explained_variance,
        cumulative_variance, dominant_features, loadings, scores, row_count
        and message. When fewer than ``pca_min_rows`` rows remain after
        preparation, the empty "insufficient data" payload is returned.

    Raises:
        ValueError: If fewer than two feature columns are resolved.
    """
    del spark  # SparkSession kept for MODULE_REGISTRY signature consistency

    settings = config.analytics
    required_rows = settings.pca_min_rows
    max_score_rows = settings.pca_sample_size
    requested_target = (settings.pca_target_feature or "").strip()

    resolved_cols = resolve_pca_feature_cols(df, config)
    if len(resolved_cols) < 2:
        raise ValueError("At least 2 features required for PCA")

    # The target column always takes part in the PCA feature set.
    columns, target = ensure_target_in_features(resolved_cols, requested_target)

    scaled_df, _numeric_df, n_rows = prepare_pca_dataframe(df, columns)
    if n_rows < required_rows:
        return empty_mvda_response(n_rows, required_rows)

    n_keep = resolve_n_components(len(columns), config)
    variance_ratios, components, _model, score_rows = fit_pca_and_collect_scores(
        scaled_df, columns, target, n_keep, n_rows, max_score_rows
    )
    return build_mvda_pca_response(
        columns,
        target,
        variance_ratios,
        components,
        score_rows,
        n_rows,
        required_rows,
    )
@@ -0,0 +1,124 @@
1
+ """
2
+ Shared PCA Spark pipeline helpers.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ from typing import List, Tuple
9
+
10
+ from pyspark.ml.feature import StandardScaler, VectorAssembler
11
+ from pyspark.sql import DataFrame
12
+ from pyspark.sql.functions import col
13
+ from pyspark.sql.types import DoubleType
14
+
15
+ from ..config import BatchAnalyticsConfig
16
+ from .pca_mvda import (
17
+ build_mvda_pca_response,
18
+ empty_mvda_response,
19
+ ensure_target_in_features,
20
+ build_loadings_and_dominant,
21
+ )
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ __all__ = [
26
+ "resolve_pca_feature_cols",
27
+ "ensure_target_in_features",
28
+ "empty_mvda_response",
29
+ "build_loadings_and_dominant",
30
+ "build_mvda_pca_response",
31
+ "prepare_pca_dataframe",
32
+ "resolve_n_components",
33
+ "fit_pca_and_collect_scores",
34
+ ]
35
+
36
+
37
def resolve_pca_feature_cols(df: DataFrame, config: BatchAnalyticsConfig) -> List[str]:
    """Resolve PCA feature column names from config, or auto-detect numerics.

    Args:
        df: DataFrame whose schema is inspected when no features are configured.
        config: Configuration; ``analytics.pca_features`` is a comma-separated
            list of column names (empty means "auto-detect").

    Returns:
        The configured names (stripped, blanks and repeats removed, original
        order kept), or, when none are configured, the names of all columns
        whose Spark type is double, float, integer or long.
    """
    configured = config.analytics.pca_features or ""
    # dict.fromkeys drops repeated names while keeping first-seen order; a
    # repeated name would otherwise be selected as a duplicate column later.
    feature_cols = list(
        dict.fromkeys(
            name.strip() for name in configured.split(",") if name.strip()
        )
    )
    if feature_cols:
        return feature_cols

    # Compare the exact type name rather than searching the type's string
    # form: a substring test for "int" also matches array/struct/map types
    # that merely contain an integer, and interval types ("int" in "interval").
    numeric_type_names = {"double", "float", "integer", "long"}
    feature_cols = [
        field.name
        for field in df.schema.fields
        if field.dataType.typeName() in numeric_type_names
    ]
    logger.info("Auto-selected %d numeric columns for PCA", len(feature_cols))
    return feature_cols
55
+
56
+
57
def prepare_pca_dataframe(
    df: DataFrame, feature_cols: List[str]
) -> Tuple[DataFrame, DataFrame, int]:
    """Cast features to double, drop rows with nulls, assemble and standardize.

    Args:
        df: Input DataFrame.
        feature_cols: Names of the columns to use; all must exist in ``df``.

    Returns:
        ``(df_scaled, df_num, row_count)`` where ``df_num`` holds the feature
        columns cast to double with null-containing rows removed,
        ``row_count`` is its row count, and ``df_scaled`` adds the assembled
        ``features_raw`` vector and the standardized ``features`` vector.
        When ``row_count`` is 0 no scaler is fitted and ``features`` is a
        plain copy of ``features_raw`` on the (empty) frame.

    Raises:
        ValueError: If any requested feature column is missing from ``df``.
    """
    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(
            f"PCA features not found: {missing}. Available: {df.columns[:15]}..."
        )

    df_num = df.select(
        *[col(c).cast(DoubleType()).alias(c) for c in feature_cols]
    ).dropna()
    row_count = df_num.count()

    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features_raw",
        handleInvalid="skip",
    )
    df_vec = assembler.transform(df_num)

    if row_count == 0:
        # Mean/std cannot be estimated from zero rows, so fitting the scaler
        # would fail before the caller gets a chance to report "insufficient
        # data". Return an empty frame that still carries the expected
        # "features" column instead.
        return df_vec.withColumn("features", col("features_raw")), df_num, row_count

    scaler = StandardScaler(
        inputCol="features_raw",
        outputCol="features",
        withStd=True,
        withMean=True,
    )
    scaler_model = scaler.fit(df_vec)
    df_scaled = scaler_model.transform(df_vec)
    return df_scaled, df_num, row_count
88
+
89
+
90
def resolve_n_components(n_features: int, config: BatchAnalyticsConfig) -> int:
    """Return the number of principal components to compute.

    Args:
        n_features: Number of feature columns going into PCA.
        config: Configuration; ``analytics.pca_n_components`` is the requested
            count (string or int). Empty/unset means "use every feature".

    Returns:
        The requested count capped at ``n_features``, or ``n_features`` when
        nothing is configured.

    Raises:
        ValueError: If the configured value is not an integer or is below 1.
    """
    # str() tolerates a setting supplied as an int rather than the usual string.
    raw = str(config.analytics.pca_n_components or "").strip()
    if not raw:
        return n_features
    try:
        requested = int(raw)
    except ValueError:
        raise ValueError(
            f"pca_n_components must be an integer, got {raw!r}"
        ) from None
    if requested < 1:
        # Fail here with a clear message instead of passing a non-positive
        # component count on to the PCA fit.
        raise ValueError(f"pca_n_components must be >= 1, got {requested}")
    return min(requested, n_features)
95
+
96
+
97
def fit_pca_and_collect_scores(
    df_scaled: DataFrame,
    feature_cols: List[str],
    target_feature: str,
    k: int,
    row_count: int,
    sample_limit: int,
):
    """Fit Spark PCA on ``features`` and collect rows for the scores scatter.

    Args:
        df_scaled: DataFrame with a ``features`` vector column plus the
            individual feature columns.
        feature_cols: Feature column names collected alongside the projection.
        target_feature: Not used in this function.
        k: Number of principal components to fit.
        row_count: Number of rows in ``df_scaled``; used to derive the
            sampling fraction.
        sample_limit: Maximum number of rows to collect; values <= 0 disable
            collection.

    Returns:
        ``(explained, pc_matrix, pca_model, pca_rows)``: explained-variance
        ratios as a list, the principal-component matrix as an array, the
        fitted model, and the collected rows (``pca_features`` + features).
    """
    from pyspark.ml.feature import PCA as SparkPCA

    model = SparkPCA(k=k, inputCol="features", outputCol="pca_features").fit(
        df_scaled
    )
    variance_ratios = model.explainedVariance.toArray().tolist()
    components = model.pc.toArray()

    if not sample_limit > 0:
        return variance_ratios, components, model, []

    source_df = df_scaled
    if row_count > sample_limit:
        # Oversample by 20% so the random sample is unlikely to fall short of
        # the limit; the exact cap is enforced by limit() below.
        oversampled = min(sample_limit / row_count * 1.2, 1.0)
        source_df = df_scaled.sample(fraction=oversampled, seed=42)

    projected = model.transform(source_df)
    collected = (
        projected.select("pca_features", *feature_cols)
        .limit(sample_limit)
        .collect()
    )
    return variance_ratios, components, model, collected
@@ -0,0 +1,126 @@
1
+ """
2
+ MVDA PCA response builders (no PySpark). Compatible with litewave-analytics POST /pca.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ INSUFFICIENT_DATA_TEMPLATE = (
10
+ "Insufficient data — only {row_count} data-points available (need at least {min_rows})"
11
+ )
12
+
13
+
14
def ensure_target_in_features(
    feature_cols: List[str], target_feature: str
) -> Tuple[List[str], str]:
    """Make sure the target column is part of the PCA feature set.

    Args:
        feature_cols: Feature column names; the input list is not modified.
        target_feature: Requested target name; blank or ``None`` falls back to
            the first feature (if there is one).

    Returns:
        ``(cols, target)``: a new list with the target placed first when it
        was not already present, and the resolved (stripped) target name.
    """
    columns = list(feature_cols)
    chosen = (target_feature or "").strip()
    if not chosen and feature_cols:
        chosen = feature_cols[0]
    if chosen and chosen not in columns:
        columns.insert(0, chosen)
    return columns, chosen
25
+
26
+
27
def empty_mvda_response(
    row_count: int, min_rows: int, message: Optional[str] = None
) -> Dict[str, Any]:
    """Build the MVDA PCA payload used when no analysis could be produced.

    Args:
        row_count: Number of usable rows that were available.
        min_rows: Minimum number of rows required.
        message: Optional explanation; when empty or ``None`` the standard
            "insufficient data" text is generated from the two counts.

    Returns:
        Dict with the same keys as a full PCA response, zero components and
        empty collections.
    """
    if not message:
        message = INSUFFICIENT_DATA_TEMPLATE.format(
            row_count=row_count, min_rows=min_rows
        )
    blank_scores = {
        "pc1_label": "",
        "pc1": [],
        "pc2_label": "",
        "pc2": [],
        "target_values": [],
    }
    payload: Dict[str, Any] = {
        "n_components": 0,
        "explained_variance": [],
        "cumulative_variance": [],
        "dominant_features": [],
        "loadings": [],
        "scores": blank_scores,
        "row_count": row_count,
        "message": message,
    }
    return payload
47
+
48
+
49
def build_loadings_and_dominant(
    feature_cols: List[str], pc_matrix, n_components: int
) -> Tuple[List[dict], List[dict]]:
    """Derive per-component loadings and the dominant feature of each component.

    Args:
        feature_cols: Feature names, in the row order of ``pc_matrix``.
        pc_matrix: 2-D array indexed as ``[feature, component]``.
        n_components: Number of leading columns of ``pc_matrix`` to report.

    Returns:
        ``(loadings, dominant_features)``. ``loadings`` has one
        ``{"pc", "feature", "value"}`` entry per component/feature pair with
        the value rounded to 6 decimals; ``dominant_features`` has one
        ``{"pc", "feature"}`` entry per component naming the feature with the
        largest absolute loading (first one wins on ties). Component numbers
        are 1-based.
    """
    loadings: List[dict] = []
    dominant_features: List[dict] = []
    for component_no in range(1, n_components + 1):
        column = pc_matrix[:, component_no - 1]
        magnitudes = [abs(float(weight)) for weight in column]
        strongest = max(range(len(magnitudes)), key=magnitudes.__getitem__)
        dominant_features.append(
            {"pc": component_no, "feature": feature_cols[strongest]}
        )
        loadings.extend(
            {
                "pc": component_no,
                "feature": name,
                "value": round(float(column[idx]), 6),
            }
            for idx, name in enumerate(feature_cols)
        )
    return loadings, dominant_features
64
+
65
+
66
def build_mvda_pca_response(
    feature_cols: List[str],
    target_feature: str,
    explained: List[float],
    pc_matrix,
    pca_rows: List,
    row_count: int,
    min_rows: int,
) -> Dict[str, Any]:
    """Assemble the flat MVDA PCA payload from fitted-PCA outputs.

    Args:
        feature_cols: Feature names, in the row order of ``pc_matrix``.
        target_feature: Column read from each row for ``target_values``;
            an empty value leaves ``target_values`` empty.
        explained: Explained-variance ratio per component.
        pc_matrix: 2-D array indexed as ``[feature, component]``.
        pca_rows: Rows supporting ``row["pca_features"].toArray()`` and
            ``row[target_feature]``.
        row_count: Number of rows the PCA was fitted on.
        min_rows: Minimum row count; below it the empty payload is returned.

    Returns:
        Dict with n_components, explained_variance, cumulative_variance,
        dominant_features, loadings, scores, row_count and message
        (``None`` on success).
    """
    if row_count < min_rows:
        return empty_mvda_response(row_count, min_rows)

    n_components = len(explained)

    # Running total of the explained-variance ratios, rounded per step.
    cumulative: List[float] = []
    running_total = 0.0
    for ratio in explained:
        running_total += ratio
        cumulative.append(round(running_total, 6))

    loadings, dominant_features = build_loadings_and_dominant(
        feature_cols, pc_matrix, n_components
    )

    def _pc_label(pc_idx: int) -> str:
        # Axis label: component number, its dominant feature, and % variance.
        if pc_idx < len(dominant_features):
            feat = dominant_features[pc_idx]["feature"]
        else:
            feat = ""
        if pc_idx < len(explained):
            pct = round(explained[pc_idx] * 100, 1)
        else:
            pct = 0
        return f"PC{pc_idx + 1} \u2014 {feat} ({pct}%)"

    has_second_pc = n_components >= 2
    first_label = _pc_label(0)
    second_label = _pc_label(1) if has_second_pc else ""

    pc1_scores: List[float] = []
    pc2_scores: List[float] = []
    target_values: List[Optional[float]] = []
    for row in pca_rows:
        projected = row["pca_features"].toArray()
        pc1_scores.append(round(float(projected[0]), 4))
        if has_second_pc and len(projected) > 1:
            pc2_scores.append(round(float(projected[1]), 4))
        if not target_feature:
            continue
        raw_target = row[target_feature]
        target_values.append(
            None if raw_target is None else round(float(raw_target), 4)
        )

    scores: Dict[str, Any] = {
        "pc1_label": first_label,
        "pc1": pc1_scores,
        "pc2_label": second_label,
        "pc2": pc2_scores,
        "target_values": target_values,
    }
    return {
        "n_components": n_components,
        "explained_variance": [round(ratio, 6) for ratio in explained],
        "cumulative_variance": cumulative,
        "dominant_features": dominant_features,
        "loadings": loadings,
        "scores": scores,
        "row_count": row_count,
        "message": None,
    }
@@ -162,8 +162,12 @@ class AnalyticsConfig:
162
162
  )
163
163
  corr_threshold: float = float(os.environ.get("BATCH_CORR_THRESHOLD", "0.8"))
164
164
 
165
- # Module 3: PCA + Clustering
165
+ # Module 3: PCA (MVDA) and PCA + clustering
166
166
  pca_features: str = os.environ.get("BATCH_PCA_FEATURES", "")
167
+ pca_target_feature: str = os.environ.get("BATCH_PCA_TARGET_FEATURE", "")
168
+ pca_n_components: str = os.environ.get("BATCH_PCA_N_COMPONENTS", "")
169
+ pca_sample_size: int = int(os.environ.get("BATCH_PCA_SAMPLE_SIZE", "5000"))
170
+ pca_min_rows: int = int(os.environ.get("BATCH_PCA_MIN_ROWS", "10"))
167
171
  pca_variance_threshold: float = float(
168
172
  os.environ.get("BATCH_PCA_VARIANCE_THRESHOLD", "0.95")
169
173
  )
@@ -8,16 +8,20 @@ See analytics_runner/catalog/analytics_catalog.yaml.
8
8
  from .analytics import (
9
9
  run_linear_regression,
10
10
  run_correlation,
11
+ run_pca,
11
12
  run_pca_clustering,
12
13
  run_t_test,
13
14
  run_equipment_oee,
14
15
  )
15
16
 
16
17
  # module_arg -> (run_fn, result_key)
18
+ # "pca" is unchanged for pca_clustering jobs (--modules pca → nested pca_clustering result).
19
+ # "pca_mvda" is the litewave-compatible flat PCA response (method_id pca).
17
20
  MODULE_REGISTRY = {
18
21
  "lr": (run_linear_regression, "linear_regression"),
19
22
  "corr": (run_correlation, "correlation"),
20
23
  "pca": (run_pca_clustering, "pca_clustering"),
24
+ "pca_mvda": (run_pca, "pca"),
21
25
  "ttest": (run_t_test, "t_test"),
22
26
  "oee": (run_equipment_oee, "equipment_oee"),
23
27
  }
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.30
4
- Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
3
+ Version: 0.3.34
4
+ Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test, LLM classification).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
7
7
  Requires-Python: >=3.8
@@ -20,7 +20,10 @@ src/batch_analytics/analytics/equipment_oee.py
20
20
  src/batch_analytics/analytics/gluon_autogluon_infer.py
21
21
  src/batch_analytics/analytics/gluon_autogluon_train.py
22
22
  src/batch_analytics/analytics/linear_regression.py
23
+ src/batch_analytics/analytics/pca.py
23
24
  src/batch_analytics/analytics/pca_clustering.py
25
+ src/batch_analytics/analytics/pca_core.py
26
+ src/batch_analytics/analytics/pca_mvda.py
24
27
  src/batch_analytics/analytics/t_test.py
25
28
  src/batch_analytics/output/__init__.py
26
29
  src/batch_analytics/output/base.py
@@ -28,4 +31,5 @@ src/batch_analytics/output/clickhouse.py
28
31
  src/batch_analytics/output/local.py
29
32
  src/batch_analytics/output/s3.py
30
33
  src/batch_analytics/utils/__init__.py
31
- src/batch_analytics/utils/gluon_autogluon_common.py
34
+ src/batch_analytics/utils/gluon_autogluon_common.py
35
+ tests/test_pca_mvda.py
@@ -0,0 +1,86 @@
1
+ """Unit tests for MVDA PCA response helpers (no Spark required)."""
2
+
3
+ import importlib.util
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+
8
+ _SRC = Path(__file__).resolve().parents[1] / "src" / "batch_analytics" / "analytics" / "pca_mvda.py"
9
+ _spec = importlib.util.spec_from_file_location("pca_mvda", _SRC)
10
+ pca_mvda = importlib.util.module_from_spec(_spec)
11
+ assert _spec.loader is not None
12
+ _spec.loader.exec_module(pca_mvda)
13
+
14
+ build_loadings_and_dominant = pca_mvda.build_loadings_and_dominant
15
+ build_mvda_pca_response = pca_mvda.build_mvda_pca_response
16
+ empty_mvda_response = pca_mvda.empty_mvda_response
17
+ ensure_target_in_features = pca_mvda.ensure_target_in_features
18
+
19
+
20
class _FakePcaVec:
    """Minimal stand-in for a vector object: exposes only ``toArray()``."""

    def __init__(self, arr):
        self._values = arr

    def toArray(self):
        """Return the wrapped sequence unchanged."""
        return self._values
26
+
27
+
28
class _FakeRow:
    """Dict-backed stand-in for a collected row, indexable by column name.

    Holds a ``pca_features`` vector plus one value per feature column: the
    target column gets ``target_val``, every other feature gets ``1.0``.
    """

    def __init__(self, pca_vec, target_val, feature_cols, target_feature):
        self._data = {"pca_features": _FakePcaVec(pca_vec)}
        self._data.update(
            (name, target_val if name == target_feature else 1.0)
            for name in feature_cols
        )

    def __getitem__(self, key):
        return self._data[key]
36
+
37
+
38
def test_ensure_target_prepended():
    """A target missing from the feature list is placed at the front."""
    resolved_cols, resolved_target = ensure_target_in_features(
        ["a", "b"], "actual_yield"
    )
    assert resolved_cols[0] == "actual_yield"
    assert resolved_target == "actual_yield"
44
+
45
+
46
def test_empty_mvda_insufficient_rows():
    """The empty payload reports zero components and the required row count."""
    payload = empty_mvda_response(5, 10)
    assert payload["n_components"] == 0
    assert payload["row_count"] == 5
    assert "need at least 10" in payload["message"]
51
+
52
+
53
def test_build_loadings_shape():
    """Three features x two components give six loadings and two dominants."""
    feature_names = ["x", "y", "z"]
    components = np.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]])
    loadings, dominant = build_loadings_and_dominant(feature_names, components, 2)
    assert len(loadings) == 6
    assert len(dominant) == 2
    assert all("pc" in entry and "feature" in entry for entry in dominant)
60
+
61
+
62
def test_build_mvda_response_scores_lengths():
    """Full payload: two components, one score entry per collected row."""
    target = "actual_yield"
    feature_names = [target, "feat_b"]
    variance_ratios = [0.6, 0.4]
    components = np.array([[0.7, 0.3], [0.5, 0.5]])
    collected = [
        _FakeRow(projection, target_value, feature_names, target)
        for projection, target_value in (
            ([1.0, 0.5], 100.0),
            ([-0.5, 1.0], 200.0),
        )
    ]

    out = build_mvda_pca_response(
        feature_names,
        target,
        variance_ratios,
        components,
        collected,
        row_count=102,
        min_rows=10,
    )

    assert out["n_components"] == 2
    assert out["message"] is None
    assert out["row_count"] == 102
    assert len(out["scores"]["pc1"]) == 2
    assert len(out["scores"]["pc2"]) == 2
    assert len(out["scores"]["target_values"]) == 2
    assert out["cumulative_variance"][-1] == 1.0
    assert len(out["loadings"]) == 4