tanml 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tanml might be problematic.

Files changed (62)
  1. tanml/__init__.py +1 -0
  2. tanml/check_runners/__init__.py +0 -0
  3. tanml/check_runners/base_runner.py +6 -0
  4. tanml/check_runners/cleaning_repro_runner.py +18 -0
  5. tanml/check_runners/correlation_runner.py +15 -0
  6. tanml/check_runners/data_quality_runner.py +24 -0
  7. tanml/check_runners/eda_runner.py +21 -0
  8. tanml/check_runners/explainability_runner.py +28 -0
  9. tanml/check_runners/input_cluster_runner.py +43 -0
  10. tanml/check_runners/logistic_stats_runner.py +28 -0
  11. tanml/check_runners/model_meta_runner.py +23 -0
  12. tanml/check_runners/performance_runner.py +28 -0
  13. tanml/check_runners/raw_data_runner.py +41 -0
  14. tanml/check_runners/rule_engine_runner.py +5 -0
  15. tanml/check_runners/stress_test_runner.py +26 -0
  16. tanml/check_runners/vif_runner.py +54 -0
  17. tanml/checks/__init__.py +0 -0
  18. tanml/checks/base.py +20 -0
  19. tanml/checks/cleaning_repro.py +47 -0
  20. tanml/checks/correlation.py +61 -0
  21. tanml/checks/data_quality.py +26 -0
  22. tanml/checks/eda.py +67 -0
  23. tanml/checks/explainability/shap_check.py +55 -0
  24. tanml/checks/input_cluster.py +109 -0
  25. tanml/checks/logit_stats.py +59 -0
  26. tanml/checks/model_contents.py +40 -0
  27. tanml/checks/model_meta.py +50 -0
  28. tanml/checks/performance.py +90 -0
  29. tanml/checks/raw_data.py +47 -0
  30. tanml/checks/rule_engine.py +45 -0
  31. tanml/checks/stress_test.py +64 -0
  32. tanml/checks/vif.py +51 -0
  33. tanml/cli/__init__.py +0 -0
  34. tanml/cli/arg_parser.py +31 -0
  35. tanml/cli/init_cmd.py +8 -0
  36. tanml/cli/main.py +27 -0
  37. tanml/cli/validate_cmd.py +7 -0
  38. tanml/config_templates/__init__.py +0 -0
  39. tanml/config_templates/rules_multiple_models_datasets.yaml +144 -0
  40. tanml/config_templates/rules_one_dataset_segment_column.yaml +140 -0
  41. tanml/config_templates/rules_one_model_one_dataset.yaml +143 -0
  42. tanml/engine/__init__.py +0 -0
  43. tanml/engine/check_agent_registry.py +42 -0
  44. tanml/engine/core_engine_agent.py +115 -0
  45. tanml/engine/segmentation_agent.py +118 -0
  46. tanml/engine/validation_agent.py +91 -0
  47. tanml/report/report_builder.py +230 -0
  48. tanml/report/templates/report_template.docx +0 -0
  49. tanml/utils/__init__.py +0 -0
  50. tanml/utils/data_loader.py +17 -0
  51. tanml/utils/model_loader.py +35 -0
  52. tanml/utils/r_loader.py +30 -0
  53. tanml/utils/sas_loader.py +50 -0
  54. tanml/utils/yaml_generator.py +34 -0
  55. tanml/utils/yaml_loader.py +5 -0
  56. tanml/validate.py +209 -0
  57. tanml-0.1.6.dist-info/METADATA +317 -0
  58. tanml-0.1.6.dist-info/RECORD +62 -0
  59. tanml-0.1.6.dist-info/WHEEL +5 -0
  60. tanml-0.1.6.dist-info/entry_points.txt +2 -0
  61. tanml-0.1.6.dist-info/licenses/LICENSE +21 -0
  62. tanml-0.1.6.dist-info/top_level.txt +1 -0
tanml/checks/input_cluster.py ADDED
@@ -0,0 +1,109 @@
+ # -----------------------------------------------------------------------------
+ # File: tanml/checks/input_cluster.py
+ # -----------------------------------------------------------------------------
+ from pathlib import Path
+
+ import matplotlib.pyplot as plt
+ import pandas as pd
+ from sklearn.cluster import KMeans
+ from sklearn.preprocessing import StandardScaler
+
+
+ class InputClusterCoverageCheck:
+     """Cluster the model inputs and report their distribution.
+
+     On run() this class will:
+       • auto‑select k (or respect the YAML‑supplied one)
+       • save a cluster_distribution.csv and an input_cluster_plot.png
+       • return a result‑dict consumed by the report builder.
+     """
+
+     def __init__(
+         self,
+         cleaned_df: pd.DataFrame,
+         feature_names: list[str],
+         rule_config: dict | None = None,
+     ) -> None:
+         self.cleaned_df = cleaned_df
+         self.feature_names = feature_names
+
+         cfg = (rule_config or {}).get("InputClusterCoverageCheck", {})
+         self.n_clusters: int | None = cfg.get("n_clusters")  # None → auto
+         self.max_k: int = cfg.get("max_k", 10)  # elbow upper bound
+
+
+     def _auto_select_k(self, X_scaled) -> int:
+         """Simple elbow‑method heuristic."""
+         inertias: list[float] = []
+         for k in range(1, self.max_k + 1):
+             km = KMeans(n_clusters=k, n_init=10, random_state=42)
+             km.fit(X_scaled)
+             inertias.append(km.inertia_)
+
+         deltas = pd.Series(inertias).diff().iloc[1:]
+         second_delta = deltas.diff().iloc[1:]
+         if second_delta.empty:
+             return 2  # fall‑back if data are too small
+         return second_delta.idxmax() + 1  # +1 because diff() shifts index
+
+
+     def run(self) -> dict:
+         missing = set(self.feature_names) - set(self.cleaned_df.columns)
+         if missing:
+             raise ValueError(f"Features not in cleaned_df: {missing}")
+
+         X = self.cleaned_df[self.feature_names]
+         X_scaled = StandardScaler().fit_transform(X)
+
+         if self.n_clusters is None:
+             self.n_clusters = self._auto_select_k(X_scaled)
+
+         km = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=42)
+         labels = km.fit_predict(X_scaled)
+
+         dfc = X.copy()
+         dfc["cluster"] = labels
+
+         summary = (
+             dfc["cluster"].value_counts().sort_index().reset_index(name="Count")
+         )
+         summary.columns = ["Cluster", "Count"]
+         total = len(dfc)
+         summary["Percent"] = (summary["Count"] / total * 100).round(2)
+
+         Path("reports/clusters").mkdir(parents=True, exist_ok=True)
+         Path("reports/images").mkdir(parents=True, exist_ok=True)
+
+         csv_path = "reports/clusters/cluster_distribution.csv"
+         plot_path = "reports/images/input_cluster_plot.png"
+
+         summary.to_csv(csv_path, index=False)
+         self._save_bar_chart(summary, plot_path)
+
+         return {
+             "cluster_table": summary.to_dict(orient="records"),
+             "cluster_csv": csv_path,
+             "cluster_plot_img": plot_path,
+             "n_clusters": self.n_clusters,
+         }
+
+     @staticmethod
+     def _save_bar_chart(summary: pd.DataFrame, plot_path: str) -> None:
+         """Helper used internally and by tests."""
+         fig, ax = plt.subplots(figsize=(6, 4))
+         bars = ax.barh(y=summary["Cluster"].astype(str), width=summary["Count"])
+         ax.set_xlabel("Count")
+         ax.set_ylabel("Cluster")
+         ax.set_title("Input Cluster Distribution")
+         max_count = summary["Count"].max() or 0
+         for bar in bars:
+             w = bar.get_width()
+             ax.text(
+                 w + max_count * 0.01,
+                 bar.get_y() + bar.get_height() / 2,
+                 f"{int(w)}",
+                 va="center",
+             )
+         fig.tight_layout()
+         fig.savefig(plot_path)
+         plt.close(fig)
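A minimal driver sketch for the check above, assuming the module path shown in this diff; the synthetic DataFrame, feature names, and the n_clusters override are illustrative only (omit n_clusters to exercise the elbow heuristic):

# Hypothetical driver script, not part of the wheel.
import numpy as np
import pandas as pd
from tanml.checks.input_cluster import InputClusterCoverageCheck

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "age": rng.normal(40, 10, 500),
    "income": rng.normal(50_000, 8_000, 500),
    "utilization": rng.uniform(0, 1, 500),
})

check = InputClusterCoverageCheck(
    cleaned_df=df,
    feature_names=["age", "income", "utilization"],
    rule_config={"InputClusterCoverageCheck": {"n_clusters": 3}},
)
result = check.run()  # writes reports/clusters/*.csv and reports/images/*.png under the cwd
print(result["n_clusters"], result["cluster_table"][:2])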
tanml/checks/logit_stats.py ADDED
@@ -0,0 +1,59 @@
+ # tanml/checks/logit_stats.py
+
+ import statsmodels.api as sm
+
+ class LogisticStatsCheck:
+     def __init__(self, model, X, y, config):
+         self.model = model
+         self.X = X
+         self.y = y
+         self.config = config or {}
+
+     def run(self):
+         try:
+             # 1) Add constant and fit the statsmodels Logit
+             Xc = sm.add_constant(self.X, has_constant='add')
+             res = sm.Logit(self.y, Xc).fit(disp=False)
+
+             # 2) Extract coefficient table
+             coef = res.params
+             stderr = res.bse
+             zscore = coef / stderr
+             pvals = res.pvalues
+
+             table = []
+             for feat in coef.index:
+                 label = "Intercept" if feat.lower() == "const" else feat
+                 table.append({
+                     "feature": label,
+                     "coefficient": float(coef[feat]),
+                     "std_error": float(stderr[feat]),
+                     "z_score": float(zscore[feat]),
+                     "p_value": float(pvals[feat]),
+                 })
+
+             # 3) Fit statistics
+             fit = {
+                 "log_lik": float(res.llf),
+                 "aic": float(res.aic),
+                 "bic": float(res.bic),
+                 "pseudo_r2": float(res.prsquared),
+             }
+
+             # 4) Full summary text
+             summary = res.summary().as_text()
+
+             return {
+                 "table": table,
+                 "fit": fit,
+                 "summary": summary,
+                 "object": res
+             }
+
+         except Exception as e:
+             return {
+                 "table": [],
+                 "fit": {},
+                 "summary": f"LogisticStatsCheck failed: {e}",
+                 "object": None
+             }
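A quick usage sketch. The check refits its own statsmodels Logit on (X, y), so the trained-model argument is not consulted here; passing None and the synthetic data below are assumptions for illustration:

import numpy as np
import pandas as pd
from tanml.checks.logit_stats import LogisticStatsCheck

rng = np.random.default_rng(1)
X = pd.DataFrame({"x1": rng.normal(size=300), "x2": rng.normal(size=300)})
y = (X["x1"] + 0.5 * X["x2"] + rng.normal(scale=0.5, size=300) > 0).astype(int)

out = LogisticStatsCheck(model=None, X=X, y=y, config={}).run()
print(out["fit"])        # log_lik, aic, bic, pseudo_r2
print(out["table"][0])   # intercept row of the coefficient table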
tanml/checks/model_contents.py ADDED
@@ -0,0 +1,40 @@
+ # tanml/checks/model_contents.py
+
+ import inspect
+
+ class ModelContentsCheck:
+     def __init__(self, model):
+         self.model = model
+
+     def run(self):
+         summary = {}
+
+         # 1. Model type and module
+         summary["model_class"] = type(self.model).__name__
+         summary["module"] = type(self.model).__module__
+
+         # 2. Feature names
+         if hasattr(self.model, "feature_names_in_"):
+             summary["feature_names_in"] = list(self.model.feature_names_in_)
+
+         # 3. Hyperparameters
+         if hasattr(self.model, "get_params"):
+             try:
+                 summary["hyperparameters"] = self.model.get_params()
+             except Exception:
+                 summary["hyperparameters"] = "Could not extract"
+
+         # 4. Coefficients
+         if hasattr(self.model, "coef_"):
+             try:
+                 summary["coefficients"] = self.model.coef_.tolist()
+             except Exception:
+                 summary["coefficients"] = "Could not extract"
+
+         # 5. Public attributes
+         summary["attributes"] = [
+             name for name in dir(self.model)
+             if not name.startswith("_") and not inspect.ismethod(getattr(self.model, name))
+         ]
+
+         return summary
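The contents check only needs a fitted estimator; a small sketch using an assumed scikit-learn LogisticRegression (any fitted model object with similar attributes would work, since every lookup above is guarded by hasattr):

import pandas as pd
from sklearn.linear_model import LogisticRegression
from tanml.checks.model_contents import ModelContentsCheck

X = pd.DataFrame({"a": [0.1, 0.4, 0.8, 1.2], "b": [1, 0, 1, 0]})
y = [0, 0, 1, 1]
model = LogisticRegression().fit(X, y)

summary = ModelContentsCheck(model).run()
print(summary["model_class"])       # LogisticRegression
print(summary["feature_names_in"])  # ['a', 'b']
print(summary["coefficients"])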
tanml/checks/model_meta.py ADDED
@@ -0,0 +1,50 @@
+ from .base import BaseCheck
+ import pandas as pd
+
+ class ModelMetaCheck(BaseCheck):
+     def __init__(self, model, X_train, X_test, y_train, y_test, rule_config, cleaned_data):
+         super().__init__(model, X_train, X_test, y_train, y_test, rule_config, cleaned_data)
+
+     def run(self):
+         result = {}
+         try:
+             # Basic metadata
+             result["model_type"] = type(self.model).__name__
+             result["model_class"] = type(self.model).__name__
+             result["module"] = getattr(self.model, "__module__", "Unknown")
+
+             # Features
+             result["n_features"] = self.X_train.shape[1]
+             result["feature_names"] = list(getattr(self.X_train, "columns", []))
+
+             # Training stats
+             result["n_train_rows"] = self.X_train.shape[0]
+             y_series = pd.Series(self.y_train)
+             result["target_balance"] = y_series.value_counts().to_dict()
+
+             # Hyperparameters
+             try:
+                 params = self.model.get_params()
+                 result["hyperparam_table"] = [
+                     {"param": k, "value": str(v)} for k, v in params.items()
+                 ]
+             except Exception as e:
+                 result["hyperparam_table"] = [{"param": "error", "value": str(e)}]
+
+             # Public attributes
+             try:
+                 result["attributes"] = {
+                     k: str(v)
+                     for k, v in self.model.__dict__.items()
+                     if not k.startswith("_")
+                 }
+             except Exception as e:
+                 result["attributes"] = {"error": str(e)}
+
+             result["status"] = "Model metadata extracted successfully"
+
+         except Exception as e:
+             result["error"] = str(e)
+             result["status"] = "ModelMetaCheck failed"
+
+         return result
tanml/checks/performance.py ADDED
@@ -0,0 +1,90 @@
+ from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix
+ from scipy.stats import ks_2samp
+ from .base import BaseCheck
+ import numpy as np
+
+ class PerformanceCheck(BaseCheck):
+     def __init__(self, model, X_train, X_test, y_train, y_test, rule_config, cleaned_data):
+         super().__init__(model, X_train, X_test, y_train, y_test, rule_config, cleaned_data)
+         self.model = model
+         self.X_train = X_train
+         self.X_test = X_test
+         self.y_train = y_train
+         self.y_test = y_test
+         self.rule_config = rule_config or {}
+
+     def run(self):
+         """Compute metrics from model predictions on provided test set."""
+         result = {
+             "accuracy": self.compute_accuracy(),
+             "auc_roc": self.compute_auc(),
+             "f1": self.compute_f1(),
+             "ks": self.compute_ks(),
+             "confusion_matrix": self.compute_confusion(),
+         }
+         result["auc"] = result["auc_roc"]  # alias for backward compatibility
+         return result
+
+     def compute_accuracy(self):
+         y_pred = self.model.predict(self.X_test)
+         return round(accuracy_score(self.y_test, y_pred), 4)
+
+     def compute_auc(self):
+         y_prob = self.model.predict_proba(self.X_test)[:, 1]
+         return round(roc_auc_score(self.y_test, y_prob), 4)
+
+     def compute_f1(self):
+         y_pred = self.model.predict(self.X_test)
+         return round(f1_score(self.y_test, y_pred), 4)
+
+     def compute_ks(self):
+         y_prob = self.model.predict_proba(self.X_test)[:, 1]
+         y_true = self.y_test
+
+         # Split by class
+         prob_0 = y_prob[y_true == 0]
+         prob_1 = y_prob[y_true == 1]
+
+         if len(prob_0) < 2 or len(prob_1) < 2:
+             return "Insufficient data for KS test"
+
+         return round(ks_2samp(prob_0, prob_1).statistic, 4)
+
+     def compute_confusion(self):
+         y_pred = self.model.predict(self.X_test)
+         return confusion_matrix(self.y_test, y_pred).tolist()
+
+     @staticmethod
+     def from_predictions(y_true, y_proba):
+         """
+         Compute metrics directly from true labels and predicted probs.
+         Returns a dict: accuracy, auc, f1, ks, confusion_matrix.
+         """
+         y_true_arr = np.array(y_true)
+         y_proba_arr = np.array(y_proba)
+         y_pred = (y_proba_arr >= 0.5).astype(int)
+
+         # Avoid invalid metrics when only one class present
+         if len(np.unique(y_true_arr)) < 2:
+             return {
+                 "accuracy": "N/A", "auc": "N/A", "f1": "N/A",
+                 "ks": "N/A", "confusion_matrix": []
+             }
+
+         result = {
+             "accuracy": round(accuracy_score(y_true_arr, y_pred), 4),
+             "auc_roc": round(roc_auc_score(y_true_arr, y_proba_arr), 4),
+             "f1": round(f1_score(y_true_arr, y_pred), 4),
+             "confusion_matrix": confusion_matrix(y_true_arr, y_pred).tolist(),
+         }
+
+         prob_0 = y_proba_arr[y_true_arr == 0]
+         prob_1 = y_proba_arr[y_true_arr == 1]
+
+         result["ks"] = (
+             round(ks_2samp(prob_0, prob_1).statistic, 4)
+             if len(prob_0) >= 2 and len(prob_1) >= 2 else "N/A"
+         )
+
+         result["auc"] = result["auc_roc"]  # alias for template/report
+         return result
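The static from_predictions helper scores stored predictions without a model object; a small sketch with made-up labels and probabilities (values are illustrative only):

from tanml.checks.performance import PerformanceCheck

y_true = [0, 0, 1, 1, 0, 1, 1, 0]
y_proba = [0.1, 0.3, 0.8, 0.7, 0.4, 0.9, 0.6, 0.2]

metrics = PerformanceCheck.from_predictions(y_true, y_proba)
print(metrics["accuracy"], metrics["auc"], metrics["ks"])
print(metrics["confusion_matrix"])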
tanml/checks/raw_data.py ADDED
@@ -0,0 +1,47 @@
+ # tanml/checks/raw_data.py ← make sure this is the ONLY copy on disk
+ from .base import BaseCheck
+ import pandas as pd
+
+ class RawDataCheck(BaseCheck):
+     def __init__(self,
+                  model, X_train, X_test, y_train, y_test,
+                  rule_config, cleaned_data,
+                  raw_data=None):
+         # bring in rule_config & cleaned_data
+         super().__init__(model, X_train, X_test, y_train, y_test,
+                          rule_config, cleaned_data)
+
+         if not hasattr(self, "config") or self.config is None:
+             self.config = {}
+
+         if raw_data is not None:
+             if isinstance(raw_data, (str, bytes)):
+                 raw_data = pd.read_csv(raw_data)
+             if not isinstance(raw_data, pd.DataFrame):
+                 raise ValueError("raw_data must be a pandas DataFrame or CSV path")
+             self.config["raw_data"] = raw_data
+
+     def run(self):
+         results = {}
+         try:
+             df = self.config.get("raw_data")
+             if not isinstance(df, pd.DataFrame):
+                 raise ValueError("raw_data not found or not a DataFrame")
+
+             results["total_rows"] = int(df.shape[0])
+             results["total_columns"] = int(df.shape[1])
+
+             miss = df.isnull().mean().round(4)
+             results["avg_missing"] = float(miss.mean())
+             results["columns_with_missing"] = miss[miss > 0].to_dict()
+
+             results["duplicate_rows"] = int(df.duplicated().sum())
+
+             const_cols = [c for c in df.columns if df[c].nunique(dropna=False) <= 1]
+             results["constant_columns"] = const_cols
+
+         except Exception as e:
+             results["error"] = str(e)
+
+         return {"RawDataCheck": results}
+
tanml/checks/rule_engine.py ADDED
@@ -0,0 +1,45 @@
+ from .base import BaseCheck
+
+ class RuleEngineCheck(BaseCheck):
+     def run(self):
+         result = {}
+         try:
+             perf = self.rule_config.get("check_results", {}).get("PerformanceCheck", {})
+             rules = self.rule_config.get("rules", {})
+
+             applied_rules = {}
+
+             for metric, conditions in rules.items():
+                 actual_value = perf.get(metric)
+
+                 if actual_value is None:
+                     applied_rules[metric] = "❌ Metric not found"
+                     continue
+
+                 # Attempt to cast to float
+                 try:
+                     actual_value = float(actual_value)
+                 except (ValueError, TypeError):
+                     applied_rules[metric] = f"❌ Invalid value: {actual_value}"
+                     continue
+
+                 # Evaluate min/max
+                 rule_messages = []
+                 passed = True
+
+                 if "min" in conditions and actual_value < conditions["min"]:
+                     passed = False
+                     rule_messages.append(f"{actual_value:.4f} < min {conditions['min']}")
+                 if "max" in conditions and actual_value > conditions["max"]:
+                     passed = False
+                     rule_messages.append(f"{actual_value:.4f} > max {conditions['max']}")
+
+                 applied_rules[metric] = "✅ Passed" if passed else "❌ " + " | ".join(rule_messages)
+
+             result["rules"] = applied_rules
+             result["overall_pass"] = all(v.startswith("✅") for v in applied_rules.values())
+
+         except Exception as e:
+             result["error"] = str(e)
+
+         return result
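The check above reads previously computed metrics and thresholds straight from rule_config; a sketch of the expected shape. The metric names and thresholds are illustrative, and the BaseCheck constructor arguments are assumed to match the other subclasses in this diff:

from tanml.checks.rule_engine import RuleEngineCheck

rule_config = {
    "check_results": {
        "PerformanceCheck": {"auc_roc": 0.81, "accuracy": 0.74, "ks": 0.35},
    },
    "rules": {
        "auc_roc": {"min": 0.70},
        "accuracy": {"min": 0.65},
        "ks": {"min": 0.30, "max": 0.90},
    },
}

# Assumed BaseCheck signature (model, X_train, X_test, y_train, y_test, rule_config, cleaned_data).
check = RuleEngineCheck(None, None, None, None, None, rule_config, None)
print(check.run())  # expected: every rule passes under the thresholds above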
tanml/checks/stress_test.py ADDED
@@ -0,0 +1,64 @@
+ from sklearn.metrics import roc_auc_score, accuracy_score
+ import numpy as np
+ import pandas as pd
+
+ class StressTestCheck:
+     def __init__(self, model, X, y, epsilon=0.01, perturb_fraction=0.2):
+         self.model = model
+         self.X = X.copy()
+         self.y = y
+         self.epsilon = epsilon
+         self.perturb_fraction = perturb_fraction
+
+     def run(self):
+         np.random.seed(42)
+         results = []
+
+         # Compute baseline metrics
+         try:
+             base_proba = self.model.predict_proba(self.X)[:, 1]
+             base_pred = (base_proba >= 0.5).astype(int)
+             base_auc = roc_auc_score(self.y, base_proba)
+             base_acc = accuracy_score(self.y, base_pred)
+         except Exception as e:
+             print(f"⚠️ Error computing baseline metrics: {e}")
+             return []
+
+         # Perturb each numeric feature
+         for col in self.X.columns:
+             if not pd.api.types.is_numeric_dtype(self.X[col]):
+                 continue  # skip non-numeric features
+
+             try:
+                 n_perturb = int(self.perturb_fraction * len(self.X))
+                 idx = np.random.choice(self.X.index, size=n_perturb, replace=False)
+
+                 X_perturbed = self.X.copy()
+                 X_perturbed.loc[idx, col] += self.epsilon
+
+                 perturbed_proba = self.model.predict_proba(X_perturbed)[:, 1]
+                 perturbed_pred = (perturbed_proba >= 0.5).astype(int)
+
+                 pert_auc = roc_auc_score(self.y, perturbed_proba)
+                 pert_acc = accuracy_score(self.y, perturbed_pred)
+
+                 results.append({
+                     "feature": col,
+                     "perturbation": f"±{round(self.epsilon * 100, 2)}%",
+                     "accuracy": round(pert_acc, 4),
+                     "auc": round(pert_auc, 4),
+                     "delta_accuracy": round(pert_acc - base_acc, 4),
+                     "delta_auc": round(pert_auc - base_auc, 4),
+                 })
+
+             except Exception as e:
+                 results.append({
+                     "feature": col,
+                     "perturbation": f"±{round(self.epsilon * 100, 2)}%",
+                     "accuracy": "error",
+                     "auc": "error",
+                     "delta_accuracy": f"Error: {e}",
+                     "delta_auc": f"Error: {e}",
+                 })
+
+         return pd.DataFrame(results)
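A sketch of how the stress test might be driven, assuming a fitted scikit-learn classifier and synthetic data (both are stand-ins, not part of the package):

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from tanml.checks.stress_test import StressTestCheck

rng = np.random.default_rng(2)
X = pd.DataFrame({"f1": rng.normal(size=400), "f2": rng.normal(size=400)})
y = (X["f1"] - X["f2"] + rng.normal(scale=0.3, size=400) > 0).astype(int)
model = LogisticRegression().fit(X, y)

report = StressTestCheck(model, X, y, epsilon=0.05, perturb_fraction=0.2).run()
print(report[["feature", "delta_auc", "delta_accuracy"]])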
tanml/checks/vif.py ADDED
@@ -0,0 +1,51 @@
+ import pandas as pd
+ import os
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
+
+ class VIFCheck:
+     def __init__(self, model, X_train, X_test, y_train, y_test, rule_config, cleaned_df, output_dir="reports/vif"):
+         self.cleaned_df = cleaned_df.select_dtypes(include=['float64', 'int64']).dropna()
+         self.output_dir = output_dir
+         os.makedirs(self.output_dir, exist_ok=True)
+
+     def run(self):
+         result = {}
+         try:
+             if self.cleaned_df.shape[1] < 2:
+                 return {
+                     "vif_table": [],
+                     "high_vif_features": [],
+                     "status": "Not enough numeric features"
+                 }
+
+             X = self.cleaned_df.copy()
+             X.insert(0, "Intercept", 1)  # Add constant term for VIF
+
+             vif_data = []
+             for i in range(X.shape[1]):
+                 try:
+                     vif = variance_inflation_factor(X.values, i)
+                 except Exception:
+                     vif = float("inf")
+                 vif_data.append({
+                     "feature": X.columns[i],
+                     "VIF": round(vif, 2)
+                 })
+
+             high_vif = [row["feature"] for row in vif_data if row["feature"] != "Intercept" and row["VIF"] > 5]
+
+             # Save to CSV
+             output_path = os.path.join(self.output_dir, "vif_table.csv")
+             pd.DataFrame(vif_data).to_csv(output_path, index=False)
+
+             result["vif_table"] = vif_data
+             result["high_vif_features"] = high_vif
+             result["csv_path"] = output_path
+             result["status"] = "VIF computed successfully"
+
+         except Exception as e:
+             result["vif_table"] = []
+             result["high_vif_features"] = []
+             result["status"] = f"VIFCheck failed: {str(e)}"
+
+         return result
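Only cleaned_df is actually consulted by VIFCheck, so the remaining constructor arguments can be placeholders; a sketch with two deliberately collinear columns (the data and expected output are illustrative assumptions):

import numpy as np
import pandas as pd
from tanml.checks.vif import VIFCheck

rng = np.random.default_rng(3)
df = pd.DataFrame({"x1": rng.normal(size=200)})
df["x2"] = df["x1"] * 0.9 + rng.normal(scale=0.1, size=200)  # highly collinear with x1
df["x3"] = rng.normal(size=200)

out = VIFCheck(None, None, None, None, None, rule_config=None, cleaned_df=df).run()
print(out["high_vif_features"])  # likely ['x1', 'x2']
print(out["csv_path"])           # reports/vif/vif_table.csv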
tanml/cli/__init__.py ADDED
File without changes
tanml/cli/arg_parser.py ADDED
@@ -0,0 +1,31 @@
+ import argparse
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Run TanML model validation toolkit")
+     subparsers = parser.add_subparsers(dest="command", required=True)
+
+
+     validate_parser = subparsers.add_parser("validate", help="Run validation checks and generate report")
+     validate_parser.add_argument("--model", required=False,
+                                  help="Model path: .pkl for sklearn/xgb, .csv for SAS or R logistic")
+     validate_parser.add_argument("--raw", required=False, help="Path to raw input data file")
+     validate_parser.add_argument("--cleaned", required=False, help="Path to cleaned input data file")
+     validate_parser.add_argument("--rules", required=True, help="Path to rules.yaml config file")
+     validate_parser.add_argument("--target", required=False, help="Target column name (optional)")
+     validate_parser.add_argument("--features", required=False, help="Comma-separated list of features")
+     validate_parser.add_argument(
+         "--report_path",
+         type=str,
+         default="reports/final_report.docx",
+         help="Path to output DOCX report. Example: --report_path my_reports/run1.docx"
+     )
+
+
+     init_parser = subparsers.add_parser("init", help="Generate starter rules.yaml file")
+     init_parser.add_argument("--scenario", required=True, choices=["A", "B", "C"],
+                              help="Choose validation scenario: A (single model), B (multiple segments), C (single dataset + segment column)")
+     init_parser.add_argument(
+         "--output", type=str, default="rules.yaml",
+         help="Destination path for rules YAML file (default: rules.yaml)"
+     )
+     return parser.parse_args()
tanml/cli/init_cmd.py ADDED
@@ -0,0 +1,8 @@
+ from tanml.utils.yaml_generator import generate_rules_yaml
+
+ def run_init(scenario, dest_path="rules.yaml", overwrite=False):
+     try:
+         generate_rules_yaml(scenario=scenario, dest_path=dest_path, overwrite=overwrite)
+
+     except Exception as e:
+         print(f"❌ Failed to create YAML: {e}")
tanml/cli/main.py ADDED
@@ -0,0 +1,27 @@
+ import argparse
+ from tanml.cli.validate_cmd import run_validate
+ from tanml.cli.init_cmd import run_init
+
+ def main():
+     parser = argparse.ArgumentParser(prog="tanml")
+     subparsers = parser.add_subparsers(dest="command")
+
+     # tanml validate --rules path.yaml
+     validate_parser = subparsers.add_parser("validate", help="Run model validation")
+     validate_parser.add_argument("--rules", required=True, help="Path to rules/config YAML")
+
+     # tanml init --scenario B
+     init_parser = subparsers.add_parser("init", help="Initialize rules YAML template")
+     init_parser.add_argument("--scenario", choices=["A", "B", "C"], required=True, help="Scenario type")
+     init_parser.add_argument("--overwrite", action="store_true", help="Overwrite existing rules.yaml if it exists")
+     init_parser.add_argument("--output", default="rules.yaml", help="Path where rules.yaml should be saved (default: rules.yaml)")
+
+     args = parser.parse_args()
+
+     if args.command == "validate":
+         run_validate(args.rules)
+     elif args.command == "init":
+         run_init(args.scenario, dest_path=args.output, overwrite=args.overwrite)
+
+     else:
+         parser.print_help()
tanml/cli/validate_cmd.py ADDED
@@ -0,0 +1,7 @@
+ # tanml/cli/validate_cmd.py
+
+ from tanml.validate import validate_from_yaml
+
+ def run_validate(rules_path):
+     print(f"🧪 Starting validation using rules from: {rules_path}")
+     validate_from_yaml(rules_path)
File without changes