tanml 0.1.6-py3-none-any.whl → 0.1.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)
  1. tanml/__init__.py +1 -1
  2. tanml/check_runners/cleaning_repro_runner.py +2 -2
  3. tanml/check_runners/correlation_runner.py +49 -12
  4. tanml/check_runners/explainability_runner.py +12 -22
  5. tanml/check_runners/logistic_stats_runner.py +196 -17
  6. tanml/check_runners/performance_runner.py +82 -26
  7. tanml/check_runners/raw_data_runner.py +29 -14
  8. tanml/check_runners/regression_metrics_runner.py +195 -0
  9. tanml/check_runners/stress_test_runner.py +23 -6
  10. tanml/check_runners/vif_runner.py +33 -27
  11. tanml/checks/correlation.py +241 -41
  12. tanml/checks/explainability/shap_check.py +261 -29
  13. tanml/checks/logit_stats.py +186 -54
  14. tanml/checks/performance_classification.py +305 -0
  15. tanml/checks/raw_data.py +58 -23
  16. tanml/checks/regression_metrics.py +167 -0
  17. tanml/checks/stress_test.py +157 -53
  18. tanml/cli/main.py +99 -27
  19. tanml/engine/check_agent_registry.py +20 -10
  20. tanml/engine/core_engine_agent.py +199 -37
  21. tanml/models/registry.py +329 -0
  22. tanml/report/report_builder.py +1180 -147
  23. tanml/report/templates/report_template_cls.docx +0 -0
  24. tanml/report/templates/report_template_reg.docx +0 -0
  25. tanml/ui/app.py +1205 -0
  26. tanml/utils/data_loader.py +105 -15
  27. tanml-0.1.7.dist-info/METADATA +164 -0
  28. tanml-0.1.7.dist-info/RECORD +54 -0
  29. tanml/cli/arg_parser.py +0 -31
  30. tanml/cli/init_cmd.py +0 -8
  31. tanml/cli/validate_cmd.py +0 -7
  32. tanml/config_templates/rules_multiple_models_datasets.yaml +0 -144
  33. tanml/config_templates/rules_one_dataset_segment_column.yaml +0 -140
  34. tanml/config_templates/rules_one_model_one_dataset.yaml +0 -143
  35. tanml/engine/segmentation_agent.py +0 -118
  36. tanml/engine/validation_agent.py +0 -91
  37. tanml/report/templates/report_template.docx +0 -0
  38. tanml/utils/model_loader.py +0 -35
  39. tanml/utils/r_loader.py +0 -30
  40. tanml/utils/sas_loader.py +0 -50
  41. tanml/utils/yaml_generator.py +0 -34
  42. tanml/utils/yaml_loader.py +0 -5
  43. tanml/validate.py +0 -209
  44. tanml-0.1.6.dist-info/METADATA +0 -317
  45. tanml-0.1.6.dist-info/RECORD +0 -62
  46. {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/WHEEL +0 -0
  47. {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/entry_points.txt +0 -0
  48. {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/licenses/LICENSE +0 -0
  49. {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/top_level.txt +0 -0
tanml/config_templates/rules_one_model_one_dataset.yaml DELETED
@@ -1,143 +0,0 @@
- # ============================================================
- # TanML Validation Configuration File: Scenario A
- # ------------------------------------------------------------
- # 🧪 Scenario: One model, one cleaned dataset (no segmentation)
- #
- # ✅ Required:
- #   - Choose ONE set of paths (Option A or Option B)
- #   - Choose ONE model source block (Option A or Option B)
- #   - Fill in your feature names and target column
- #   - Adjust any thresholds or check options as needed
- # ============================================================
-
-
- # ------------------------------------------
- # REQUIRED: Model Input Schema
- # ------------------------------------------
- model:
-   features:
-     - feature_0          # 👉 replace with actual feature names used in your model
-     - feature_1
-     - feature_2
-     - feature_3
-     - feature_4
-   target: default_flag   # 👉 replace with actual target column
-
- # ------------------------------------------
- # OUTPUT: Report Path
- # ------------------------------------------
- output:
-   report_path: /mnt/c/Users/you/Desktop/tanml_output/scenario_a_report.docx   # 👉 customize as needed
-
-
- # ------------------------------------------
- # ✅ OPTION A — Use a Pretrained Model (.pkl)
- # ------------------------------------------
- # Use this option when you already have a trained model saved as a `.pkl` file.
- # You must provide the path to:
- #   - The cleaned dataset used during training
- #   - The raw dataset (optional, but recommended)
- #   - The trained model file via `paths.model`
- #
- # Be sure to:
- #   - Keep `from_pickle: true` in `model_source`
- #   - Match the `path` in `model_source` and `paths.model`
- #   - Comment out OPTION B block if using this
- # ------------------------------------------
- paths:
-   raw_data: /mnt/c/Users/your_name/Desktop/data/raw.csv   # 👉 optional — use null if raw data is not available
-   cleaned_data: data/cleaned_a.csv
-   model: models/logistic/model_a.pkl
-
- model_source:
-   from_pickle: true
-
-
- # ------------------------------------------
- # 🔁 OPTION B — Retrain Model from Scratch
- # ------------------------------------------
- # Use this option if you want TanML to retrain the model for you
- # using the cleaned dataset and the specified algorithm + hyperparameters.
- #
- # You must provide:
- #   - Path to the raw dataset (optional)
- #   - Path to the cleaned dataset (required)
- #   - The full model definition: type, module, and hyperparameters
- #
- # Be sure to:
- #   - Set `from_pickle: false`
- #   - Omit or comment out the `paths.model` line
- #   - Comment out the OPTION A block above
- # ------------------------------------------
- # paths:
- #   raw_data: /mnt/c/Users/your_name/Desktop/data/raw.csv
- #   cleaned_data: data/cleaned_a.csv
-
- # model_source:
- #   from_pickle: false
- #   type: LogisticRegression
- #   module: sklearn.linear_model
- #   hyperparameters:
- #     penalty: "l2"
- #     C: 1.0
- #     solver: "liblinear"
- #     class_weight: "balanced"
- #     max_iter: 100
- #     random_state: 42
-
- # ------------------------------------------
- # PERFORMANCE THRESHOLDS
- # ------------------------------------------
- auc_roc:
-   min: 0.60
-
- f1:
-   min: 0.60
-
- ks:
-   min: 0.20
-
- # ------------------------------------------
- # VALIDATION CHECKS
- # ------------------------------------------
-
- EDACheck:
-   enabled: true
-   max_plots: -1   # -1 = all numeric; or set number of columns
-
- correlation:
-   enabled: true
-
- VIFCheck:
-   enabled: true
-
- raw_data_check:
-   enabled: true
-
- model_meta:
-   enabled: true
-
- # ------------------------------------------
- # STRESS TESTING (Robustness Check)
- # ------------------------------------------
- StressTestCheck:
-   enabled: true
-   epsilon: 0.01           # ➜ 1% noise
-   perturb_fraction: 0.2   # ➜ 20% of rows
-
- # ------------------------------------------
- # INPUT CLUSTER COVERAGE
- # ------------------------------------------
- InputClusterCoverageCheck:
-   enabled: true
-   n_clusters: 5   # ➜ fixed clusters for coverage bar chart
-   max_k: 10       # ➜ elbow method search (if needed)
-
- # ------------------------------------------
- # EXPLAINABILITY
- # ------------------------------------------
- explainability:
-   shap:
-     enabled: true
-     background_sample_size: 100   # ➜ SHAP explainer training background
-     test_sample_size: 200         # ➜ test rows to explain
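For context on how an OPTION B block was consumed: the deleted segmentation_agent.py (next section) resolved `module` and `type` via dynamic import and passed `hyperparameters` straight to the constructor. A minimal standalone sketch of that pattern, with the values copied from the commented block above and sklearn assumed available:

import importlib

# model_source as it would look after YAML parsing (OPTION B defaults above)
model_source = {
    "type": "LogisticRegression",
    "module": "sklearn.linear_model",
    "hyperparameters": {
        "penalty": "l2", "C": 1.0, "solver": "liblinear",
        "class_weight": "balanced", "max_iter": 100, "random_state": 42,
    },
}

# Resolve the class by name and instantiate it with the configured hyperparameters
model_class = getattr(importlib.import_module(model_source["module"]), model_source["type"])
model = model_class(**model_source["hyperparameters"])
# model.fit(X, y) would then retrain on the cleaned dataset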
tanml/engine/segmentation_agent.py DELETED
@@ -1,118 +0,0 @@
- from __future__ import annotations
-
- import os
- import joblib
- import importlib
- from tanml.utils.sas_loader import SASLogisticModel
- from tanml.utils.r_loader import RLogisticModel
- from tanml.engine.validation_agent import SegmentValidator
- from tanml.utils.data_loader import load_dataframe
-
-
- def handle_segmentation(segment_config, rule_config, args=None, report_output=None):
-     global_raw_path = rule_config.get("paths", {}).get("raw_data")
-     segment_col = segment_config.get("column")  # ✅ only required for Scenario C
-     report_template = report_output  # e.g., "reports/report_{segment}.docx"
-
-     print("🔍 Detected segmentation setup in rules.yaml. Running each segment run separately...")
-
-     for name, run_cfg in segment_config["runs"].items():
-         print(f"\n🔹 Validating segment: {name}")
-         model_path = run_cfg["model"]
-
-         # CASE 1: Retrain model from cleaned data (model: train)
-         if isinstance(model_path, str) and model_path.lower() == "train":
-             print(f"🛠️ Retraining model from cleaned data for segment: {name}")
-
-             if "cleaned" in run_cfg:
-                 cleaned_df_run = load_dataframe(run_cfg["cleaned"])
-             else:
-                 full_cleaned = load_dataframe(rule_config["paths"]["cleaned_data"])
-                 if segment_col:
-                     cleaned_df_run = full_cleaned[full_cleaned[segment_col] == name]
-                 else:
-                     raise ValueError("Missing segment column for slicing cleaned data.")
-
-             X = cleaned_df_run[rule_config["model"]["features"]]
-             y = cleaned_df_run[rule_config["model"]["target"]]
-
-             model_source = rule_config.get("model_source", {})
-             model_type = model_source.get("type")
-             model_module = model_source.get("module")
-             model_params = model_source.get("hyperparameters", {})
-
-             if not model_type or not model_module:
-                 raise ValueError("❌ 'model_source.type' and 'model_source.module' must be defined for retraining.")
-
-             model_class = getattr(importlib.import_module(model_module), model_type)
-             model = model_class(**model_params)
-             print(f"📦 Using model: {model}")
-
-             model.fit(X, y)
-             print(f"✅ Retrained {model_type} for segment '{name}'")
-
-         elif isinstance(model_path, str) and "r_logistic" in model_path.lower():
-             model = RLogisticModel(model_path)
-
-         elif isinstance(model_path, str) and model_path.endswith(".pkl"):
-             model = joblib.load(model_path)
-
-         elif isinstance(model_path, str) and model_path.endswith(".csv"):
-             base = os.path.splitext(model_path)[0]
-             model = SASLogisticModel(
-                 coeffs_path=model_path,
-                 intercept_path=base + "_intercept.txt",
-                 feature_order_path=base + "_features.txt"
-             )
-
-         else:
-             raise ValueError(f"❌ Unsupported model format for segment '{name}': {model_path}")
-
-         # Load and optionally slice raw data
-         raw_df_run = None
-         if global_raw_path and os.path.exists(global_raw_path):
-             full_raw = load_dataframe(global_raw_path)
-             if segment_col:
-                 raw_df_run = full_raw[full_raw[segment_col] == name]
-             else:
-                 raw_df_run = full_raw
-
-         print(f"[DEBUG] raw_df_run for {name}: {type(raw_df_run)}, rows = {len(raw_df_run) if raw_df_run is not None else 'None'}")
-
-         # Load cleaned data
-         if "cleaned" in run_cfg:
-             cleaned_df_run = load_dataframe(run_cfg["cleaned"])
-         else:
-             full_cleaned = load_dataframe(rule_config["paths"]["cleaned_data"])
-             if segment_col:
-                 cleaned_df_run = full_cleaned[full_cleaned[segment_col] == name]
-             else:
-                 cleaned_df_run = full_cleaned
-
-         # Format output path
-         report_base = report_template.format(segment=name)
-
-         # Run validation
-         validator = SegmentValidator(
-             segment_column=segment_col,
-             segment_values=[name],
-             model=model,
-             raw_df=raw_df_run,
-             cleaned_df=cleaned_df_run,
-             target_col=rule_config.get("model", {}).get("target"),
-             config=rule_config,
-             segment_name=name,
-             report_base=report_base
-         )
-         results = validator.run()
-
-         # Extract report path from SegmentValidator result
-         report_base_path = report_template.format(segment=name)
-         report_path = os.path.join(report_base_path, f"report_{name}.docx")
-
-         print(f"📄 Report saved: {report_path}")
-         print(f"✅ Segment '{name}' validated.")
-
-
-     print("✅ All segment runs completed.")
-     return True
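For reference, a hypothetical invocation of the deleted entry point; the segment names, paths, and report pattern below are illustrative, not from the package:

from tanml.engine.segmentation_agent import handle_segmentation  # tanml <= 0.1.6 only
from tanml.utils.yaml_loader import load_yaml_config             # tanml <= 0.1.6 only

rule_config = load_yaml_config("rules.yaml")  # must define model, paths, and (for "train") model_source

segment_config = {
    "column": "region",                          # segment column in the cleaned data
    "runs": {
        "east": {"model": "models/east.pkl"},    # pretrained pickle
        "west": {"model": "train"},              # retrain from rule_config["model_source"]
    },
}

# "{segment}" is filled in per run, matching the example in the source comment above
handle_segmentation(segment_config, rule_config, report_output="reports/report_{segment}.docx")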
tanml/engine/validation_agent.py DELETED
@@ -1,91 +0,0 @@
- from __future__ import annotations
- from importlib.resources import files
- from datetime import datetime
- import tzlocal
- import json
- from pathlib import Path
-
- from tanml.engine.core_engine_agent import ValidationEngine
- from tanml.report.report_builder import ReportBuilder
- from tanml.engine.check_agent_registry import CHECK_RUNNER_REGISTRY
-
-
- class SegmentValidator:
-     def __init__(
-         self,
-         model,
-         raw_df,
-         cleaned_df,
-         config,
-         target_col=None,
-         segment_column=None,
-         segment_values=None,
-         segment_name=None,
-         report_base="reports"
-     ):
-         self.model = model
-         self.raw_df = raw_df
-         self.cleaned_df = cleaned_df
-         self.config = config
-         self.target_col = target_col
-         self.segment_column = segment_column
-         self.segment_values = segment_values
-         self.segment_name = segment_name
-         self.report_base = report_base
-
-     def run(self):
-         if self.segment_name:
-             return self._run_single(self.cleaned_df, self.segment_name)
-
-         if not self.segment_column:
-             raise ValueError("Segmentation column not specified in rules.yaml")
-
-         results = {}
-         for segment in self.segment_values:
-             segment_df = self.cleaned_df[self.cleaned_df[self.segment_column] == segment]
-             print(f"🔹 Running validation for segment value: {segment}")
-             result = self._run_single(segment_df, segment)
-             results[segment] = result
-
-         return results
-
-     def _run_single(self, segment_df, segment_name):
-         if self.target_col is None or self.target_col not in segment_df.columns:
-             raise ValueError(f"❌ Target column '{self.target_col}' missing in cleaned data for segment '{segment_name}'")
-
-         y = segment_df[self.target_col]
-         cols_to_drop = [c for c in (self.target_col, self.segment_column) if c in segment_df.columns]
-         X = segment_df.drop(columns=cols_to_drop)
-
-         # Pass raw_df to ValidationEngine so CleaningReproCheck works
-         engine = ValidationEngine(self.model, X, X, y, y, self.config, segment_df, self.raw_df)
-         results = engine.run_all_checks()
-
-         local_tz = tzlocal.get_localzone()
-         now = datetime.now(local_tz)
-         results["validation_date"] = now.strftime("%Y-%m-%d %H:%M:%S %Z (UTC%z)")
-         results["model_path"] = self.model.__class__.__name__
-         results["validated_by"] = "TanML Automated Validator"
-         results["rules"] = self.config
-
-         report_base_formatted = self.report_base.format(segment=segment_name) if "{segment}" in self.report_base else self.report_base
-         output_dir = Path(report_base_formatted)
-         output_dir.mkdir(parents=True, exist_ok=True)
-
-         tpl_cfg = self.config.get("output", {}).get("template_path")  # may be None
-         template_path = (
-             Path(tpl_cfg).expanduser() if tpl_cfg
-             else files("tanml.report.templates").joinpath("report_template.docx")
-         )
-
-         output_path = output_dir / f"report_{segment_name}.docx"
-
-         builder = ReportBuilder(results, template_path, output_path)
-         builder.build()
-
-         print(f"📄 Report saved for segment '{segment_name}': {output_path}")
-         print(json.dumps(results, indent=2, default=str))
-         return results
-
-
- __all__ = ["SegmentValidator", "CHECK_RUNNER_REGISTRY"]
tanml/report/templates/report_template.docx DELETED (binary file, no diff shown)
tanml/utils/model_loader.py DELETED
@@ -1,35 +0,0 @@
- # tanml/utils/model_loader.py
-
- import os
- import joblib
- from tanml.utils.sas_loader import SASLogisticModel
- from tanml.utils.r_loader import RLogisticModel
-
- def load_model(model_path):
-     """
-     Load a model from path. Supports:
-       - sklearn/xgboost .pkl
-       - SAS .csv with _intercept.txt and _features.txt
-       - R exported logistic CSV
-     """
-     if not model_path:
-         raise ValueError("❌ No model path provided.")
-
-     if "r_logistic" in model_path.lower():
-         print("✅ Detected R Logistic Regression model")
-         return RLogisticModel(model_path)
-
-     elif model_path.endswith(".pkl"):
-         print(f"✅ Loading sklearn/XGB model from {model_path}")
-         return joblib.load(model_path)
-
-     elif model_path.endswith(".csv"):
-         base = os.path.splitext(model_path)[0]
-         return SASLogisticModel(
-             coeffs_path=model_path,
-             intercept_path=base + "_intercept.txt",
-             feature_order_path=base + "_features.txt"
-         )
-
-     else:
-         raise ValueError("❌ Unsupported model format. Use .pkl, .csv, or R model CSV")
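A usage sketch for the removed loader (paths are placeholders). Note the dispatch order: the `r_logistic` substring check runs before the `.csv` branch, so an R export named accordingly never reaches `SASLogisticModel`:

from tanml.utils.model_loader import load_model  # tanml <= 0.1.6 only

sk_model  = load_model("models/logistic/model_a.pkl")    # joblib pickle
r_model   = load_model("scores/r_logistic_export.csv")   # matched by name, wraps exported predictions
sas_model = load_model("models/sas/coeffs.csv")          # also needs coeffs_intercept.txt and coeffs_features.txt

# All three wrappers expose predict_proba(), so downstream checks can call
# model.predict_proba(X)[:, 1] regardless of the model's origin.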
tanml/utils/r_loader.py DELETED
@@ -1,30 +0,0 @@
- import pandas as pd
- import numpy as np
-
- class RLogisticModel:
-     """
-     Wrapper for logistic regression models exported from R.
-     Assumes CSV with columns: ID, y_true, y_pred_proba
-     """
-
-     def __init__(self, model_path: str):
-         self.model_path = model_path
-         self.df = pd.read_csv(model_path)
-
-         # Check required columns exist
-         expected_cols = {'y_true', 'y_pred_proba'}
-         if not expected_cols.issubset(set(self.df.columns)):
-             raise ValueError(f"R model CSV must contain columns: {expected_cols}")
-
-         self.y_true = self.df['y_true'].values
-         self.y_pred_proba = self.df['y_pred_proba'].values
-
-     def predict_proba(self, X=None):
-         """
-         Mimics sklearn's predict_proba format: n_samples x 2
-         """
-         proba = self.y_pred_proba.reshape(-1, 1)
-         return np.hstack([1 - proba, proba])
-
-     def get_true_labels(self):
-         return self.y_true
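The `hstack` above reproduces sklearn's two-column probability layout; a quick standalone check with made-up probabilities:

import numpy as np

proba = np.array([0.2, 0.9]).reshape(-1, 1)   # exported P(class 1) values
print(np.hstack([1 - proba, proba]))
# [[0.8 0.2]
#  [0.1 0.9]]  -> column 0 is P(class 0), column 1 is P(class 1)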
tanml/utils/sas_loader.py DELETED
@@ -1,50 +0,0 @@
- # File: tanml/utils/sas_loader.py
-
- import pandas as pd
- import numpy as np
-
-
- class SASLogisticModel:
-     def __init__(self, coeffs_path, intercept_path, feature_order_path):
-         self.coeffs_path = coeffs_path
-         self.intercept_path = intercept_path
-         self.feature_order_path = feature_order_path
-
-         self.coefficients = self._load_coefficients()
-         self.intercept = self._load_intercept()
-         self.feature_order = self._load_feature_order()
-
-     def _load_coefficients(self):
-         return pd.read_csv(self.coeffs_path, index_col=0).squeeze("columns")
-
-     def _load_intercept(self):
-         with open(self.intercept_path) as f:
-             return float(f.read().strip())
-
-     def _load_feature_order(self):
-         with open(self.feature_order_path) as f:
-             return [line.strip() for line in f.readlines()]
-
-     def predict_proba(self, X):
-         """
-         Return a NumPy array shaped (n_samples, 2) like sklearn:
-         [:, 0] = P(class 0), [:, 1] = P(class 1)
-         """
-         X = X[self.feature_order]
-         logits = X.dot(self.coefficients) + self.intercept
-
-         # numeric stability clamp
-         logits = logits.clip(-700, 700)
-
-         proba_1 = 1 / (1 + np.exp(-logits))
-         proba_0 = 1 - proba_1
-         return np.vstack([proba_0, proba_1]).T  # shape (n, 2)
-
-     def predict(self, X):
-         """
-         Return class labels (0/1) based on 0.5 threshold.
-         Works with the NumPy array returned by predict_proba().
-         """
-         proba_1 = self.predict_proba(X)[:, 1]  # probability of class 1
-         return (proba_1 >= 0.5).astype(int)
-
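The scoring math in `predict_proba` is plain logistic regression, p = 1 / (1 + e^(-Xβ - b)). A worked one-row example with toy numbers:

import numpy as np

x = np.array([2.0, 1.0])        # one row, two features
beta = np.array([0.5, -1.0])    # toy coefficients
b = 0.25                        # toy intercept

logit = x.dot(beta) + b         # 1.0 - 1.0 + 0.25 = 0.25
p1 = 1 / (1 + np.exp(-logit))   # ≈ 0.562
# predict() would return 1, since 0.562 >= 0.5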
tanml/utils/yaml_generator.py DELETED
@@ -1,34 +0,0 @@
- from pathlib import Path
- import shutil
-
- def generate_rules_yaml(
-     scenario: str = "A",
-     dest_path: str = "rules.yaml",
-     overwrite: bool = False
- ):
-
-     scenario = scenario.upper()
-     template_map = {
-         "A": "rules_one_model_one_dataset.yaml",
-         "B": "rules_multiple_models_datasets.yaml",
-         "C": "rules_one_dataset_segment_column.yaml"
-     }
-
-     if scenario not in template_map:
-         raise ValueError("Invalid scenario. Must be 'A', 'B', or 'C'.")
-
-     src = Path(__file__).parent.parent / "config_templates" / template_map[scenario]
-     if not src.exists():
-         raise FileNotFoundError(f"Template not found at {src}")
-
-     dst = Path(dest_path)
-     if dst.exists() and not overwrite:
-         raise FileExistsError(f"{dst} already exists. Use --overwrite to replace it.")
-
-     if dst.parent and not dst.parent.exists():
-         dst.parent.mkdir(parents=True, exist_ok=True)
-
-     shutil.copyfile(src, dst)
-
-     print(f"✅ Created: {dst.resolve()} for Scenario {scenario}")
-     print("👉 Now edit this YAML to fill in your model, data, and feature details.")
tanml/utils/yaml_loader.py DELETED
@@ -1,5 +0,0 @@
- import yaml
-
- def load_yaml_config(path):
-     with open(path, "r") as f:
-         return yaml.safe_load(f)