ai-critic 0.2.4__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_critic/critic.py +138 -21
- ai_critic/evaluators/scoring.py +39 -0
- ai_critic/sessions/__init__.py +3 -0
- ai_critic/sessions/store.py +33 -0
- ai_critic-1.0.0.dist-info/METADATA +257 -0
- {ai_critic-0.2.4.dist-info → ai_critic-1.0.0.dist-info}/RECORD +8 -5
- ai_critic-0.2.4.dist-info/METADATA +0 -76
- {ai_critic-0.2.4.dist-info → ai_critic-1.0.0.dist-info}/WHEEL +0 -0
- {ai_critic-0.2.4.dist-info → ai_critic-1.0.0.dist-info}/top_level.txt +0 -0
ai_critic/critic.py
CHANGED

```diff
@@ -5,6 +5,8 @@ from ai_critic.evaluators import (
     performance
 )
 from ai_critic.evaluators.summary import HumanSummary
+from ai_critic.sessions import CriticSessionStore
+from ai_critic.evaluators.scoring import compute_scores


 class AICritic:
@@ -19,7 +21,7 @@ class AICritic:
     - Human-readable executive and technical summaries
     """

-    def __init__(self, model, X, y, random_state=None):
+    def __init__(self, model, X, y, random_state=None, session=None):
         """
         Parameters
         ----------
@@ -30,11 +32,15 @@
             Target vector
         random_state : int or None
             Global seed for reproducibility (optional)
+        session : str or None
+            Optional session name for longitudinal comparison
         """
         self.model = model
         self.X = X
         self.y = y
         self.random_state = random_state
+        self.session = session
+        self._store = CriticSessionStore() if session else None

     def evaluate(self, view="all", plot=False):
         """
@@ -47,15 +53,10 @@
             - "executive" : executive summary only
             - "technical" : technical summary only
             - "details" : low-level evaluator outputs
-            - list : subset of views
+            - list : subset of views
         plot : bool
-            - True : generate plots
+            - True : generate plots
             - False : no plots
-
-        Returns
-        -------
-        dict
-            Evaluation payload according to selected view
         """

         # =========================
@@ -66,25 +67,23 @@
         # -------------------------
         # Data analysis
         # -------------------------
-        data_report = data.evaluate(
+        details["data"] = data.evaluate(
             self.X,
             self.y,
             plot=plot
         )
-        details["data"] = data_report

         # -------------------------
         # Model configuration sanity
         # -------------------------
         details["config"] = config.evaluate(
             self.model,
-            n_samples=
-            n_features=
+            n_samples=details["data"]["n_samples"],
+            n_features=details["data"]["n_features"]
         )

         # -------------------------
         # Performance evaluation
-        # (CV strategy inferred automatically)
         # -------------------------
         details["performance"] = performance.evaluate(
             self.model,
@@ -94,32 +93,36 @@
         )

         # -------------------------
-        # Robustness
+        # Robustness evaluation
         # -------------------------
         details["robustness"] = robustness.evaluate(
             self.model,
             self.X,
             self.y,
-            leakage_suspected=
+            leakage_suspected=details["data"]["data_leakage"]["suspected"],
             plot=plot
         )

         # =========================
-        # Human
+        # Human summaries
         # =========================
         human_summary = HumanSummary().generate(details)

-        # =========================
-        # Full payload (PUBLIC API)
-        # =========================
         payload = {
             "executive": human_summary["executive_summary"],
             "technical": human_summary["technical_summary"],
             "details": details,
-
-            "performance": details["performance"]
+            "performance": details["performance"],
         }

+        # =========================
+        # Session persistence (optional)
+        # =========================
+        if self.session:
+            scores = compute_scores(payload)
+            payload["scores"] = scores
+            self._store.save(self.session, payload)
+
         # =========================
         # View selector
         # =========================
@@ -130,3 +133,117 @@ class AICritic:
             return {k: payload[k] for k in view if k in payload}

         return payload.get(view)
+
+    def compare_with(self, previous_session: str) -> dict:
+        """
+        Compare current session with a previous one.
+        """
+
+        if not self.session:
+            raise ValueError("Current session name not set.")
+
+        current = self._store.load(self.session)
+        previous = self._store.load(previous_session)
+
+        if not previous:
+            raise FileNotFoundError(
+                f"Session '{previous_session}' not found."
+            )
+
+        diff = {
+            "global_score": {
+                "current": current["scores"]["global"],
+                "previous": previous["scores"]["global"],
+                "delta": current["scores"]["global"] - previous["scores"]["global"],
+            },
+            "components": {}
+        }
+
+        for key, value in current["scores"]["components"].items():
+            prev_value = previous["scores"]["components"].get(key)
+            if prev_value is not None:
+                diff["components"][key] = {
+                    "current": value,
+                    "previous": prev_value,
+                    "delta": value - prev_value
+                }
+
+        return {
+            "current_session": self.session,
+            "previous_session": previous_session,
+            "score_diff": diff,
+            "note": (
+                "Score deltas indicate changes in risk profile, "
+                "not absolute model quality."
+            )
+        }
+
+    def deploy_decision(self):
+        """
+        Final deployment gate.
+        """
+
+        report = self.evaluate(view="all", plot=False)
+
+        data_risk = report["details"]["data"]["data_leakage"]["suspected"]
+        perfect_cv = report["details"]["performance"]["suspiciously_perfect"]
+        robustness_verdict = report["details"]["robustness"]["verdict"]
+        structural_warnings = report["details"]["config"]["structural_warnings"]
+
+        blocking_issues = []
+        risk_level = "low"
+
+        # Hard blockers
+        if data_risk and perfect_cv:
+            blocking_issues.append(
+                "Data leakage combined with suspiciously perfect CV score"
+            )
+            risk_level = "high"
+
+        if robustness_verdict == "misleading":
+            blocking_issues.append(
+                "Robustness results are misleading due to inflated baseline performance"
+            )
+            risk_level = "high"
+
+        if data_risk:
+            blocking_issues.append(
+                "Suspected target leakage in feature set"
+            )
+            risk_level = "high"
+
+        # Soft blockers
+        if risk_level != "high":
+            if robustness_verdict == "fragile":
+                blocking_issues.append(
+                    "Model performance degrades significantly under noise"
+                )
+                risk_level = "medium"
+
+            if perfect_cv:
+                blocking_issues.append(
+                    "Suspiciously perfect cross-validation score"
+                )
+                risk_level = "medium"
+
+            if structural_warnings:
+                blocking_issues.append(
+                    "Structural complexity risks detected in model configuration"
+                )
+                risk_level = "medium"
+
+        deploy = len(blocking_issues) == 0
+
+        confidence = 1.0
+        confidence -= 0.35 if data_risk else 0
+        confidence -= 0.25 if perfect_cv else 0
+        confidence -= 0.25 if robustness_verdict in ("fragile", "misleading") else 0
+        confidence -= 0.15 if structural_warnings else 0
+        confidence = max(0.0, round(confidence, 2))
+
+        return {
+            "deploy": deploy,
+            "risk_level": risk_level,
+            "blocking_issues": blocking_issues,
+            "confidence": confidence
+        }
```
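
Read together with the new `scoring` and `sessions` modules below, the session flow added in 1.0.0 can be exercised roughly as follows. This is a sketch, not part of the diff: the toy dataset and estimator are placeholders, and only the `AICritic` call signature and payload keys are taken from the code above.

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from ai_critic import AICritic

# Placeholder data/model for illustration only.
X, y = make_classification(n_samples=500, n_features=10, random_state=0)
model = RandomForestClassifier(random_state=0)

# Passing `session` makes evaluate() compute compute_scores(payload),
# attach it as payload["scores"], and persist the payload via
# CriticSessionStore (by default under ~/.ai_critic_sessions/baseline.json).
critic = AICritic(model, X, y, session="baseline")
payload = critic.evaluate(view="all")

print(payload["scores"]["global"])   # coarse 0-100 score defined in scoring.py
print(critic.deploy_decision())      # {"deploy": ..., "risk_level": ..., ...}
```
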
ai_critic/evaluators/scoring.py
ADDED

```python
def compute_scores(report: dict) -> dict:
    """
    Converts critic signals into a coarse 0–100 score.
    Score is NOT an objective metric.
    """

    score = 100

    data_leakage = report["details"]["data"]["data_leakage"]["suspected"]
    perfect_cv = report["details"]["performance"]["suspiciously_perfect"]
    robustness = report["details"]["robustness"]["verdict"]
    structural = report["details"]["config"]["structural_warnings"]

    if data_leakage:
        score -= 30

    if perfect_cv:
        score -= 20

    if robustness == "fragile":
        score -= 15
    elif robustness == "misleading":
        score -= 25

    if structural:
        score -= 10

    return {
        "global": max(0, min(100, score)),
        "components": {
            "data_integrity": 0 if data_leakage else 100,
            "validation": 70 if perfect_cv else 100,
            "robustness": {
                "stable": 100,
                "fragile": 65,
                "misleading": 40
            }.get(robustness, 100),
        }
    }
```
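
As a quick sanity check of the scoring rules above, here is an illustrative call against a hand-built report. Only the nested key layout is taken from the diff; the flag values are made up.

```python
from ai_critic.evaluators.scoring import compute_scores

fake_report = {
    "details": {
        "data": {"data_leakage": {"suspected": False}},
        "performance": {"suspiciously_perfect": True},   # -20
        "robustness": {"verdict": "fragile"},             # -15
        "config": {"structural_warnings": []},            # no penalty
    }
}

print(compute_scores(fake_report))
# {'global': 65, 'components': {'data_integrity': 100, 'validation': 70, 'robustness': 65}}
```
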
ai_critic/sessions/store.py
ADDED

```python
import json
from pathlib import Path
from datetime import datetime


class CriticSessionStore:
    """
    Simple local persistence layer for ai-critic sessions.
    """

    def __init__(self, base_dir: str | None = None):
        self.base_dir = Path(
            base_dir or Path.home() / ".ai_critic_sessions"
        )
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def _session_path(self, name: str) -> Path:
        return self.base_dir / f"{name}.json"

    def save(self, name: str, payload: dict):
        data = {
            "timestamp": datetime.utcnow().isoformat(),
            "payload": payload
        }
        with open(self._session_path(name), "w") as f:
            json.dump(data, f, indent=2)

    def load(self, name: str) -> dict | None:
        path = self._session_path(name)
        if not path.exists():
            return None
        with open(path) as f:
            return json.load(f)["payload"]
```
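
A minimal round-trip sketch for the new store (the temporary directory is only to keep the example self-contained; by default sessions are written to `~/.ai_critic_sessions`). Note that the `str | None` annotations are evaluated at import time and require Python 3.10+, even though the package metadata still declares `Requires-Python: >=3.9`.

```python
import tempfile

from ai_critic.sessions import CriticSessionStore

with tempfile.TemporaryDirectory() as tmp:
    store = CriticSessionStore(base_dir=tmp)

    # save() wraps the payload with a UTC timestamp and writes <name>.json
    store.save("demo", {"scores": {"global": 80, "components": {}}})

    print(store.load("demo"))        # -> {'scores': {'global': 80, 'components': {}}}
    print(store.load("not-there"))   # -> None (missing sessions do not raise)
```
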
ai_critic-1.0.0.dist-info/METADATA
ADDED

Metadata-Version: 2.4
Name: ai-critic
Version: 1.0.0
Summary: Fast AI evaluator for scikit-learn models
Author-email: Luiz Seabra <filipedemarco@yahoo.com>
Requires-Python: >=3.9
Description-Content-Type: text/markdown
Requires-Dist: numpy
Requires-Dist: scikit-learn

# ai-critic 🧠: The Quality Gate for Machine Learning Models

**ai-critic** is a specialized **decision-making** tool designed to audit the reliability and deployment readiness of scikit-learn–compatible machine learning models.

Instead of merely measuring performance (accuracy, F1 score), **ai-critic** acts as a **Quality Gate**, actively probing the model to uncover *hidden risks* that commonly cause production failures — such as **data leakage**, **structural overfitting**, and **fragility under noise**.

> **ai-critic does not ask “How good is this model?”**
> It asks **“Can this model be trusted?”**

---

## 🚀 Getting Started (The Basics)

This section is ideal for beginners who need a **fast and reliable verdict** on a trained model.

### Installation

Install directly from PyPI:

```bash
pip install ai-critic
```

---

### The Quick Verdict

With just a few lines of code, you obtain an **executive-level assessment** and a **deployment recommendation**.

```python
from ai_critic import AICritic
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# 1. Prepare data and model
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
model = RandomForestClassifier(max_depth=5, random_state=42)

# 2. Initialize the Critic
critic = AICritic(model, X, y)

# 3. Run the audit (executive mode)
report = critic.evaluate(view="executive")

print(f"Verdict: {report['verdict']}")
print(f"Risk Level: {report['risk_level']}")
print(f"Main Reason: {report['main_reason']}")
```

**Expected Output (example):**

```text
Verdict: ⚠️ Risky
Risk Level: medium
Main Reason: Structural or robustness-related risks detected.
```

This output is intentionally **conservative**.
If **ai-critic** recommends deployment, it means meaningful risks were *not* detected.

---

## 💡 Understanding the Critique (The Intermediate)

For data scientists who want to understand **why** the model received a given verdict and **how to improve it**.

---

### The Four Pillars of the Audit

**ai-critic** evaluates models across four independent risk dimensions:

| Pillar | Main Risk Detected | Internal Module |
| ---------------------- | -------------------------------------- | ------------------------ |
| 📊 **Data Integrity** | Target Leakage & Correlation Artifacts | `evaluators.data` |
| 🧠 **Model Structure** | Over-complexity & Misconfiguration | `evaluators.config` |
| 📈 **Performance** | Suspicious CV or Learning Curves | `evaluators.performance` |
| 🧪 **Robustness** | Sensitivity to Noise | `evaluators.robustness` |

Each pillar contributes signals used later in the **deployment gate**.
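
For example, each pillar's raw output can be inspected directly through the `details` view. This is a small illustration reusing the `critic` object from the quick-start snippet above; the key names come from `ai_critic/critic.py`, while the fields inside each block depend on the individual evaluators.

```python
details = critic.evaluate(view="details")

for pillar in ("data", "config", "performance", "robustness"):
    print(pillar, "->", sorted(details[pillar]))
```
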
---

### Full Technical & Visual Analysis

To access **all internal diagnostics**, including plots and recommendations, use `view="all"`.

```python
full_report = critic.evaluate(view="all", plot=True)

technical_summary = full_report["technical"]

print("\n--- Key Risks Detected ---")
for i, risk in enumerate(technical_summary["key_risks"], start=1):
    print(f"{i}. {risk}")

print("\n--- Recommendations ---")
for rec in technical_summary["recommendations"]:
    print(f"- {rec}")
```

Generated plots may include:

* Feature correlation heatmaps
* Learning curves
* Robustness degradation charts

---

### Robustness Test (Noise Injection)

A model that collapses under small perturbations is **not production-safe**.

```python
robustness = full_report["details"]["robustness"]

print("\n--- Robustness Analysis ---")
print(f"Original CV Score: {robustness['cv_score_original']:.4f}")
print(f"Noisy CV Score: {robustness['cv_score_noisy']:.4f}")
print(f"Performance Drop: {robustness['performance_drop']:.4f}")
print(f"Verdict: {robustness['verdict']}")
```

**Possible Verdicts:**

* `stable` → acceptable degradation
* `fragile` → high sensitivity to noise
* `misleading` → performance likely inflated by leakage
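
Because the verdict is a plain string, it is easy to gate on. The snippet below is an example policy, not behavior built into ai-critic; it reuses the `robustness` block from the previous example.

```python
verdict = robustness["verdict"]

if verdict == "misleading":
    raise RuntimeError("Baseline performance looks inflated; investigate leakage before trusting any score.")
elif verdict == "fragile":
    print("Warning: model degrades sharply under noise; consider more data or regularization.")
else:  # "stable"
    print("Robustness check passed.")
```
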
---

## ⚙️ Integration and Governance (The Advanced)

This section targets **MLOps engineers**, **architects**, and teams operating automated pipelines.

---

### The Deployment Gate (`deploy_decision`)

The `deploy_decision()` method aggregates *all detected risks* and produces a final gate decision.

```python
decision = critic.deploy_decision()

if decision["deploy"]:
    print("✅ Deployment Approved")
else:
    print("❌ Deployment Blocked")

print(f"Risk Level: {decision['risk_level']}")
print(f"Confidence Score: {decision['confidence']:.2f}")

print("\nBlocking Issues:")
for issue in decision["blocking_issues"]:
    print(f"- {issue}")
```

**Conceptual model:**

* **Hard Blockers** → deployment denied
* **Soft Blockers** → deployment discouraged
* **Confidence Score (0–1)** → heuristic trust level
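
In a CI/CD job, the gate can be reduced to an exit code. The sketch below is illustrative; the `0.7` confidence threshold is an arbitrary team policy, not an ai-critic default.

```python
import sys

decision = critic.deploy_decision()

if not decision["deploy"] or decision["confidence"] < 0.7:
    print("Deployment blocked by ai-critic:")
    for issue in decision["blocking_issues"]:
        print(f"  - {issue}")
    sys.exit(1)

print(f"Gate passed (risk level: {decision['risk_level']}).")
```
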
---

### Modes & Views (API Design)

The `evaluate()` method supports **multiple modes** via the `view` parameter:

| View | Description |
| ------------- | ---------------------------------- |
| `"executive"` | High-level verdict (non-technical) |
| `"technical"` | Risks & recommendations |
| `"details"` | Raw evaluator outputs |
| `"all"` | Complete payload |

Example:

```python
critic.evaluate(view="technical")
critic.evaluate(view=["executive", "performance"])
```

---

### Session Tracking & Model Comparison (New in 1.0.0)

You can persist evaluations and compare model versions over time.

```python
critic_v1 = AICritic(model, X, y, session="v1")
critic_v1.evaluate()

critic_v2 = AICritic(model, X, y, session="v2")
critic_v2.evaluate()

comparison = critic_v2.compare_with("v1")
print(comparison["score_diff"])
```

This enables:

* Regression tracking
* Risk drift detection
* Governance & audit trails
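
The comparison payload is a plain dictionary, which makes it easy to log or archive. The field names below come from `compare_with()` in `critic.py`; the numeric values are purely illustrative.

```python
comparison = critic_v2.compare_with("v1")

print(comparison["score_diff"]["global_score"])
# e.g. {'current': 70, 'previous': 85, 'delta': -15}

print(comparison["score_diff"]["components"]["validation"])
# e.g. {'current': 70, 'previous': 100, 'delta': -30}

print(comparison["note"])
# "Score deltas indicate changes in risk profile, not absolute model quality."
```
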
---

### Best Practices & Use Cases

| Scenario | Recommended Usage |
| ----------------------- | -------------------------------------- |
| **CI/CD** | Block merges using `deploy_decision()` |
| **Model Tuning** | Use technical view for guidance |
| **Governance** | Persist session outputs |
| **Stakeholder Reports** | Share executive summaries |

---

## 🔒 API Stability

Starting from version **1.0.0**, the public API of **ai-critic** follows semantic versioning.
Breaking changes will only occur in major releases.

## 📄 License

Distributed under the **MIT License**.

---

## 🧠 Final Note

> **ai-critic is not a benchmarking tool.**
> It is a *decision-making system*.

A failed audit does **not** mean the model is bad — it means the model **is not ready to be trusted**.

The purpose of **ai-critic** is to introduce *structured skepticism* into machine learning workflows — exactly where it belongs.

---

{ai_critic-0.2.4.dist-info → ai_critic-1.0.0.dist-info}/RECORD
CHANGED

```diff
@@ -1,13 +1,16 @@
 ai_critic/__init__.py,sha256=H6DlPMmbcFUamhsNULPLk9vHx81XCiXuKKf63EJ8eM0,53
-ai_critic/critic.py,sha256=
+ai_critic/critic.py,sha256=ovvOX357OzIC28H0iJrtZfUyku4CA9FnGQiA8M9DDbk,7701
 ai_critic/evaluators/__init__.py,sha256=ri6InmL8_LIcO-JZpU_gEFKLO4URdqo3z6rh7fV6M8Y,169
 ai_critic/evaluators/config.py,sha256=gBXaS8Qxl14f40JnvMWgA0Z0SGEtbCuCHpTOPem0H90,1163
 ai_critic/evaluators/data.py,sha256=YAK5NkwCeJOny_UueZ5ALwvEcRDIbEck404eV2oqWnc,1871
 ai_critic/evaluators/performance.py,sha256=1CQx5DueK0XkelYyJnAGRJ3AjQtjsKeW8_1JQZqKVOI,1973
 ai_critic/evaluators/robustness.py,sha256=mfVQ67Z6t6aRvtIq-XQEQYbwvyf8UefM1myeOGVrnAE,1869
+ai_critic/evaluators/scoring.py,sha256=GBkmDa5Q6RZY4hJfzrCbxbBopsOsRjsNtzyoQHqgWHA,1046
 ai_critic/evaluators/summary.py,sha256=O9ZCrph93VV6pFcMIx2a7DizPIccRUqbGcUZ6oDmOLs,3791
 ai_critic/evaluators/validation.py,sha256=rnzRwD78Cugey33gl9geE8JoBURsKEEnqrIOhBZv0LY,904
-ai_critic
-ai_critic
-ai_critic-0.
-ai_critic-0.
+ai_critic/sessions/__init__.py,sha256=Yp7mphSPJwt8a4cJgcQNErqwqHVuP_xAJODrs0y0Abw,72
+ai_critic/sessions/store.py,sha256=65m9WXFVFWv4pPzvXV4l8zLHoHWMfCGe6eHh4X-8agY,947
+ai_critic-1.0.0.dist-info/METADATA,sha256=_3VxXuMYnt2LoCrUw8AhTb2UMm934lcxgWV2Bw0l3eg,7426
+ai_critic-1.0.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+ai_critic-1.0.0.dist-info/top_level.txt,sha256=TRyZkm1vyLLcFDg_80yeg5cHvPis_oW1Ti170417jkw,10
+ai_critic-1.0.0.dist-info/RECORD,,
```
ai_critic-0.2.4.dist-info/METADATA
REMOVED

```diff
@@ -1,76 +0,0 @@
-Metadata-Version: 2.4
-Name: ai-critic
-Version: 0.2.4
-Summary: Fast AI evaluator for scikit-learn models
-Author-email: Luiz Seabra <filipedemarco@yahoo.com>
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-Requires-Dist: numpy
-Requires-Dist: scikit-learn
-
-Performance under noise
-
-> Visualizations are optional and do not affect the decision logic.
-
----
-
-## ⚙️ Main API
-
-### `AICritic(model, X, y)`
-
-* `model`: scikit-learn compatible estimator
-* `X`: feature matrix
-* `y`: target vector
-
-### `evaluate(view="all", plot=False)`
-
-* `view`: `"executive"`, `"technical"`, `"details"`, `"all"` or custom list
-* `plot`: generates graphs when `True`
-
----
-
-## 🧠 What ai-critic Detects
-
-| Category | Risks |
-
-| ------------ | ---------------------------------------- |
-
-| 🔍 Data | Target Leakage, NaNs, Imbalance |
-
-| 🧱 Structure | Excessive Complexity, Overfitting |
-
-| 📈 Validation | Perfect or Statistically Suspicious CV |
-
-| 🧪 Robustness | Stable, Fragile, or Misleading |
-
----
-
-## 🛡️ Best Practices
-
-* **CI/CD:** Use executive output as a *quality gate*
-* **Iteration:** Use technical output during tuning
-* **Governance:** Log detailed output
-* **Skepticism:** Never blindly trust a perfect CV
-
----
-
-## 🧭 Use Cases
-
-* Pre-deployment Audit
-* ML Governance
-* CI/CD Pipelines
-* Risk Communication for Non-Technical Users
-
----
-
-## 📄 License
-
-Distributed under the **MIT License**.
-
----
-
-## 🧠 Final Note
-
-**ai-critic** is not a *benchmarking* tool. It's a **decision-making tool**.
-
-If a model fails here, it doesn't mean it's bad—it means it **shouldn't be trusted yet**.
```
{ai_critic-0.2.4.dist-info → ai_critic-1.0.0.dist-info}/WHEEL
File without changes

{ai_critic-0.2.4.dist-info → ai_critic-1.0.0.dist-info}/top_level.txt
File without changes