ai-critic 0.2.5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_critic/critic.py +87 -49
- ai_critic/evaluators/adapters.py +84 -0
- ai_critic/evaluators/scoring.py +39 -0
- ai_critic/sessions/__init__.py +3 -0
- ai_critic/sessions/store.py +33 -0
- ai_critic-1.1.0.dist-info/METADATA +289 -0
- {ai_critic-0.2.5.dist-info → ai_critic-1.1.0.dist-info}/RECORD +9 -5
- {ai_critic-0.2.5.dist-info → ai_critic-1.1.0.dist-info}/WHEEL +1 -1
- ai_critic-0.2.5.dist-info/METADATA +0 -200
- {ai_critic-0.2.5.dist-info → ai_critic-1.1.0.dist-info}/top_level.txt +0 -0
ai_critic/critic.py
CHANGED

@@ -2,14 +2,17 @@ from ai_critic.evaluators import (
     robustness,
     config,
     data,
-    performance
+    performance,
+    adapters  # <- new import
 )
 from ai_critic.evaluators.summary import HumanSummary
+from ai_critic.sessions import CriticSessionStore
+from ai_critic.evaluators.scoring import compute_scores


 class AICritic:
     """
-    Automated reviewer for scikit-learn models.
+    Automated reviewer for scikit-learn, PyTorch, or TensorFlow models.

     Produces a multi-layered risk assessment including:
     - Data integrity analysis
@@ -19,22 +22,37 @@ class AICritic:
     - Human-readable executive and technical summaries
     """

-    def __init__(self, model, X, y, random_state=None):
+    def __init__(self, model, X, y, random_state=None, session=None, framework="sklearn", adapter_kwargs=None):
         """
         Parameters
         ----------
-        model :
+        model : object
+            scikit-learn estimator, torch.nn.Module, or tf.keras.Model
         X : np.ndarray
             Feature matrix
         y : np.ndarray
             Target vector
         random_state : int or None
             Global seed for reproducibility (optional)
+        session : str or None
+            Optional session name for longitudinal comparison
+        framework : str
+            "sklearn" (default), "torch", or "tensorflow"
+        adapter_kwargs : dict
+            Extra kwargs for the adapter (e.g. epochs, lr, batch_size)
         """
-
+        adapter_kwargs = adapter_kwargs or {}
+        self.framework = framework.lower()
+        if self.framework != "sklearn":
+            self.model = adapters.ModelAdapter(model, framework=self.framework, **adapter_kwargs)
+        else:
+            self.model = model
+
         self.X = X
         self.y = y
         self.random_state = random_state
+        self.session = session
+        self._store = CriticSessionStore() if session else None

     def evaluate(self, view="all", plot=False):
         """
@@ -47,15 +65,10 @@ class AICritic:
         - "executive" : executive summary only
         - "technical" : technical summary only
         - "details" : low-level evaluator outputs
-            - list : subset of views
+        - list : subset of views
         plot : bool
-            - True : generate plots
+        - True : generate plots
         - False : no plots
-
-        Returns
-        -------
-        dict
-            Evaluation payload according to selected view
         """

         # =========================
@@ -66,25 +79,23 @@ class AICritic:
         # -------------------------
         # Data analysis
         # -------------------------
-        data_report = data.evaluate(
+        details["data"] = data.evaluate(
             self.X,
             self.y,
             plot=plot
         )
-        details["data"] = data_report

         # -------------------------
         # Model configuration sanity
         # -------------------------
         details["config"] = config.evaluate(
             self.model,
-            n_samples=
-            n_features=
+            n_samples=details["data"]["n_samples"],
+            n_features=details["data"]["n_features"]
         )

         # -------------------------
         # Performance evaluation
-        # (CV strategy inferred automatically)
         # -------------------------
         details["performance"] = performance.evaluate(
             self.model,
@@ -94,32 +105,36 @@ class AICritic:
         )

         # -------------------------
-        # Robustness
+        # Robustness evaluation
         # -------------------------
         details["robustness"] = robustness.evaluate(
             self.model,
             self.X,
             self.y,
-            leakage_suspected=
+            leakage_suspected=details["data"]["data_leakage"]["suspected"],
             plot=plot
         )

         # =========================
-        # Human
+        # Human summaries
         # =========================
         human_summary = HumanSummary().generate(details)

-        # =========================
-        # Full payload (PUBLIC API)
-        # =========================
         payload = {
             "executive": human_summary["executive_summary"],
             "technical": human_summary["technical_summary"],
             "details": details,
-
-            "performance": details["performance"]
+            "performance": details["performance"],
         }

+        # =========================
+        # Session persistence (optional)
+        # =========================
+        if self.session:
+            scores = compute_scores(payload)
+            payload["scores"] = scores
+            self._store.save(self.session, payload)
+
         # =========================
         # View selector
         # =========================
@@ -130,22 +145,56 @@ class AICritic:
             return {k: payload[k] for k in view if k in payload}

         return payload.get(view)
+
+    def compare_with(self, previous_session: str) -> dict:
+        """
+        Compare current session with a previous one.
+        """
+
+        if not self.session:
+            raise ValueError("Current session name not set.")
+
+        current = self._store.load(self.session)
+        previous = self._store.load(previous_session)
+
+        if not previous:
+            raise FileNotFoundError(
+                f"Session '{previous_session}' not found."
+            )
+
+        diff = {
+            "global_score": {
+                "current": current["scores"]["global"],
+                "previous": previous["scores"]["global"],
+                "delta": current["scores"]["global"] - previous["scores"]["global"],
+            },
+            "components": {}
+        }
+
+        for key, value in current["scores"]["components"].items():
+            prev_value = previous["scores"]["components"].get(key)
+            if prev_value is not None:
+                diff["components"][key] = {
+                    "current": value,
+                    "previous": prev_value,
+                    "delta": value - prev_value
+                }
+
+        return {
+            "current_session": self.session,
+            "previous_session": previous_session,
+            "score_diff": diff,
+            "note": (
+                "Score deltas indicate changes in risk profile, "
+                "not absolute model quality."
+            )
+        }
+
     def deploy_decision(self):
         """
         Final deployment gate.
-
-        Returns
-        -------
-        dict
-            {
-                "deploy": bool,
-                "risk_level": str,
-                "blocking_issues": list[str],
-                "confidence": float
-            }
         """

-        # Reuse the ENTIRE existing pipeline
         report = self.evaluate(view="all", plot=False)

         data_risk = report["details"]["data"]["data_leakage"]["suspected"]
@@ -156,9 +205,7 @@ class AICritic:
         blocking_issues = []
         risk_level = "low"

-        #
-        # Hard blockers (❌)
-        # =========================
+        # Hard blockers
         if data_risk and perfect_cv:
             blocking_issues.append(
                 "Data leakage combined with suspiciously perfect CV score"
@@ -177,9 +224,7 @@ class AICritic:
             )
             risk_level = "high"

-        #
-        # Soft blockers (⚠️)
-        # =========================
+        # Soft blockers
        if risk_level != "high":
            if robustness_verdict == "fragile":
                blocking_issues.append(
@@ -199,14 +244,8 @@ class AICritic:
             )
             risk_level = "medium"

-        # =========================
-        # Final decision
-        # =========================
         deploy = len(blocking_issues) == 0

-        # =========================
-        # Confidence heuristic
-        # =========================
         confidence = 1.0
         confidence -= 0.35 if data_risk else 0
         confidence -= 0.25 if perfect_cv else 0
@@ -220,4 +259,3 @@ class AICritic:
             "blocking_issues": blocking_issues,
             "confidence": confidence
         }
-
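
Taken together, the new constructor arguments (`session`, `framework`, `adapter_kwargs`) and the new `compare_with` method wire score persistence into the evaluation flow. Below is a minimal sketch of that flow, assuming a scikit-learn classifier; the session names are arbitrary:

```python
# Sketch only: exercises the session/compare_with path added in 1.1.0.
from ai_critic import AICritic
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
model = RandomForestClassifier(random_state=0)

# With session set, evaluate() also computes scores and saves the payload
# under ~/.ai_critic_sessions/<session>.json via CriticSessionStore.
AICritic(model, X, y, session="baseline").evaluate()

critic = AICritic(model, X, y, session="candidate")
critic.evaluate()

# compare_with() loads both stored payloads and reports per-component score deltas.
diff = critic.compare_with("baseline")
print(diff["score_diff"]["global_score"])
```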

ai_critic/evaluators/adapters.py
ADDED

@@ -0,0 +1,84 @@
+# evaluators/adapters.py
+import numpy as np
+
+try:
+    import torch
+    import torch.nn as nn
+except ImportError:
+    torch = None
+
+try:
+    import tensorflow as tf
+except ImportError:
+    tf = None
+
+class ModelAdapter:
+    """
+    Wraps scikit-learn, PyTorch, or TensorFlow models to provide a
+    unified fit/predict interface for AICritic.
+    """
+
+    def __init__(self, model, framework="sklearn", **kwargs):
+        """
+        Parameters
+        ----------
+        model : object
+            The original model (sklearn estimator, torch.nn.Module, or tf.keras.Model)
+        framework : str
+            One of "sklearn", "torch", "tensorflow"
+        kwargs : dict
+            Extra hyperparameters for training (epochs, batch_size, optimizer, etc)
+        """
+        self.model = model
+        self.framework = framework.lower()
+        self.kwargs = kwargs
+
+        if self.framework not in ("sklearn", "torch", "tensorflow"):
+            raise ValueError(f"Unsupported framework: {framework}")
+
+        # PyTorch default settings
+        if self.framework == "torch":
+            self.epochs = kwargs.get("epochs", 5)
+            self.lr = kwargs.get("lr", 1e-3)
+            self.loss_fn = kwargs.get("loss_fn", nn.MSELoss())
+            self.optimizer_class = kwargs.get("optimizer", torch.optim.Adam)
+            self.device = kwargs.get("device", "cpu")
+            self.model.to(self.device)
+
+        # TensorFlow default settings
+        if self.framework == "tensorflow":
+            self.epochs = kwargs.get("epochs", 5)
+            self.batch_size = kwargs.get("batch_size", 32)
+            self.loss_fn = kwargs.get("loss_fn", "mse")
+            self.optimizer = kwargs.get("optimizer", "adam")
+            self.model.compile(optimizer=self.optimizer, loss=self.loss_fn)
+
+    def fit(self, X, y):
+        if self.framework == "sklearn":
+            self.model.fit(X, y)
+        elif self.framework == "torch":
+            X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
+            y_tensor = torch.tensor(y, dtype=torch.float32).to(self.device).view(-1, 1)
+            optimizer = self.optimizer_class(self.model.parameters(), lr=self.lr)
+
+            self.model.train()
+            for epoch in range(self.epochs):
+                optimizer.zero_grad()
+                output = self.model(X_tensor)
+                loss = self.loss_fn(output, y_tensor)
+                loss.backward()
+                optimizer.step()
+        elif self.framework == "tensorflow":
+            self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
+        return self
+
+    def predict(self, X):
+        if self.framework == "sklearn":
+            return self.model.predict(X)
+        elif self.framework == "torch":
+            self.model.eval()
+            with torch.no_grad():
+                X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
+                return self.model(X_tensor).cpu().numpy().flatten()
+        elif self.framework == "tensorflow":
+            return self.model.predict(X).flatten()
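
The adapter's job is to give non-sklearn models the `fit`/`predict` surface that the evaluators expect. A small sketch of using `ModelAdapter` directly with a PyTorch regressor, assuming torch is installed; the layer sizes and hyperparameters are illustrative only:

```python
# Sketch: wrapping a torch module so it behaves like an sklearn estimator.
import numpy as np
import torch.nn as nn
from ai_critic.evaluators.adapters import ModelAdapter

X = np.random.rand(200, 4).astype("float32")
y = np.random.rand(200).astype("float32")

net = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 1))

# epochs/lr override the defaults shown above (5 epochs, lr=1e-3, MSE loss, CPU).
adapter = ModelAdapter(net, framework="torch", epochs=10, lr=1e-2)
adapter.fit(X, y)
preds = adapter.predict(X)   # 1-D numpy array, as with sklearn's predict()
print(preds.shape)           # (200,)
```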

ai_critic/evaluators/scoring.py
ADDED

@@ -0,0 +1,39 @@
+def compute_scores(report: dict) -> dict:
+    """
+    Converts critic signals into a coarse 0–100 score.
+    Score is NOT an objective metric.
+    """
+
+    score = 100
+
+    data_leakage = report["details"]["data"]["data_leakage"]["suspected"]
+    perfect_cv = report["details"]["performance"]["suspiciously_perfect"]
+    robustness = report["details"]["robustness"]["verdict"]
+    structural = report["details"]["config"]["structural_warnings"]
+
+    if data_leakage:
+        score -= 30
+
+    if perfect_cv:
+        score -= 20
+
+    if robustness == "fragile":
+        score -= 15
+    elif robustness == "misleading":
+        score -= 25
+
+    if structural:
+        score -= 10
+
+    return {
+        "global": max(0, min(100, score)),
+        "components": {
+            "data_integrity": 0 if data_leakage else 100,
+            "validation": 70 if perfect_cv else 100,
+            "robustness": {
+                "stable": 100,
+                "fragile": 65,
+                "misleading": 40
+            }.get(robustness, 100),
+        }
+    }
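
`compute_scores` reads only four fields from the report, so it can be exercised in isolation. A sketch with a hand-built report dict (the flag values are chosen purely for illustration):

```python
from ai_critic.evaluators.scoring import compute_scores

# Minimal report containing only the keys compute_scores reads.
report = {
    "details": {
        "data": {"data_leakage": {"suspected": True}},
        "performance": {"suspiciously_perfect": False},
        "robustness": {"verdict": "fragile"},
        "config": {"structural_warnings": []},
    }
}

scores = compute_scores(report)
print(scores["global"])                        # 100 - 30 (leakage) - 15 (fragile) = 55
print(scores["components"]["data_integrity"])  # 0, because leakage is suspected
```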

ai_critic/sessions/store.py
ADDED

@@ -0,0 +1,33 @@
+import json
+from pathlib import Path
+from datetime import datetime
+
+
+class CriticSessionStore:
+    """
+    Simple local persistence layer for ai-critic sessions.
+    """
+
+    def __init__(self, base_dir: str | None = None):
+        self.base_dir = Path(
+            base_dir or Path.home() / ".ai_critic_sessions"
+        )
+        self.base_dir.mkdir(parents=True, exist_ok=True)
+
+    def _session_path(self, name: str) -> Path:
+        return self.base_dir / f"{name}.json"
+
+    def save(self, name: str, payload: dict):
+        data = {
+            "timestamp": datetime.utcnow().isoformat(),
+            "payload": payload
+        }
+        with open(self._session_path(name), "w") as f:
+            json.dump(data, f, indent=2)
+
+    def load(self, name: str) -> dict | None:
+        path = self._session_path(name)
+        if not path.exists():
+            return None
+        with open(path) as f:
+            return json.load(f)["payload"]
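
The store is a thin name-to-JSON-file mapping. A sketch of a save/load round trip, pointing `base_dir` at a temporary directory instead of the default `~/.ai_critic_sessions`:

```python
import tempfile
from ai_critic.sessions import CriticSessionStore

with tempfile.TemporaryDirectory() as tmp:
    store = CriticSessionStore(base_dir=tmp)       # sessions land in <tmp>/<name>.json
    store.save("v1", {"scores": {"global": 85}})

    print(store.load("v1")["scores"]["global"])    # 85: load() returns only the payload
    print(store.load("missing"))                   # None for unknown session names
```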

ai_critic-1.1.0.dist-info/METADATA
ADDED

@@ -0,0 +1,289 @@
+Metadata-Version: 2.4
+Name: ai-critic
+Version: 1.1.0
+Summary: Fast AI evaluator for scikit-learn models
+Author-email: Luiz Seabra <filipedemarco@yahoo.com>
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: numpy
+Requires-Dist: scikit-learn
+
+# ai-critic 🧠: The Quality Gate for Machine Learning Models
+
+**ai-critic** is a specialized **decision-making** tool designed to audit the reliability and readiness for deployment of **scikit-learn**, **PyTorch**, and **TensorFlow** models.
+
+Instead of merely measuring performance (accuracy, F1 score), **ai-critic** acts as a **Quality Gate**, actively probing the model to uncover *hidden risks* that commonly cause production failures — such as **data leakage**, **structural overfitting**, and **fragility under noise**.
+
+> **ai-critic does not ask “How good is this model?”**
+> It asks **“Can this model be trusted?”**
+
+---
+
+## 🚀 Getting Started (The Basics)
+
+This section is ideal for beginners who need a **fast and reliable verdict** on a trained model.
+
+### Installation
+
+Install directly from PyPI:
+
+```bash
+pip install ai-critic
+```
+
+---
+
+### The Quick Verdict
+
+With just a few lines of code, you obtain an **executive-level assessment** and a **deployment recommendation**.
+
+```python
+from ai_critic import AICritic
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.datasets import make_classification
+
+# 1. Prepare data and model
+X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
+model = RandomForestClassifier(max_depth=5, random_state=42)
+
+# 2. Initialize the Critic
+critic = AICritic(model, X, y)
+
+# 3. Run the audit (executive mode)
+report = critic.evaluate(view="executive")
+
+print(f"Verdict: {report['verdict']}")
+print(f"Risk Level: {report['risk_level']}")
+print(f"Main Reason: {report['main_reason']}")
+```
+
+**Expected Output (example):**
+
+```text
+Verdict: ⚠️ Risky
+Risk Level: medium
+Main Reason: Structural or robustness-related risks detected.
+```
+
+This output is intentionally **conservative**.
+If **ai-critic** recommends deployment, it means meaningful risks were *not* detected.
+
+---
+
+## 💡 Understanding the Critique (The Intermediary)
+
+For data scientists who want to understand **why** the model received a given verdict and **how to improve it**.
+
+---
+
+### The Four Pillars of the Audit
+
+**ai-critic** evaluates models across four independent risk dimensions:
+
+| Pillar                 | Main Risk Detected                     | Internal Module          |
+| ---------------------- | -------------------------------------- | ------------------------ |
+| 📊 **Data Integrity**  | Target Leakage & Correlation Artifacts | `evaluators.data`        |
+| 🧠 **Model Structure** | Over-complexity & Misconfiguration     | `evaluators.config`      |
+| 📈 **Performance**     | Suspicious CV or Learning Curves       | `evaluators.performance` |
+| 🧪 **Robustness**      | Sensitivity to Noise                   | `evaluators.robustness`  |
+
+Each pillar contributes signals used later in the **deployment gate**.
+
+---
+
+### Full Technical & Visual Analysis
+
+To access **all internal diagnostics**, including plots and recommendations, use `view="all"`.
+
+```python
+full_report = critic.evaluate(view="all", plot=True)
+
+technical_summary = full_report["technical"]
+
+print("\n--- Key Risks Detected ---")
+for i, risk in enumerate(technical_summary["key_risks"], start=1):
+    print(f"{i}. {risk}")
+
+print("\n--- Recommendations ---")
+for rec in technical_summary["recommendations"]:
+    print(f"- {rec}")
+```
+
+Generated plots may include:
+
+* Feature correlation heatmaps
+* Learning curves
+* Robustness degradation charts
+
+---
+
+### Robustness Test (Noise Injection)
+
+A model that collapses under small perturbations is **not production-safe**.
+
+```python
+robustness = full_report["details"]["robustness"]
+
+print("\n--- Robustness Analysis ---")
+print(f"Original CV Score: {robustness['cv_score_original']:.4f}")
+print(f"Noisy CV Score: {robustness['cv_score_noisy']:.4f}")
+print(f"Performance Drop: {robustness['performance_drop']:.4f}")
+print(f"Verdict: {robustness['verdict']}")
+```
+
+**Possible Verdicts:**
+
+* `stable` → acceptable degradation
+* `fragile` → high sensitivity to noise
+* `misleading` → performance likely inflated by leakage
+
+---
+
+## ⚙️ Integration and Governance (The Advanced)
+
+This section targets **MLOps engineers**, **architects**, and teams operating automated pipelines.
+
+---
+
+### Multi-Framework Support
+
+**ai-critic 1.0+** supports models from multiple frameworks with the **same API**:
+
+```python
+# PyTorch Example
+import torch
+import torch.nn as nn
+from ai_critic import AICritic
+
+X = torch.randn(1000, 20)
+y = torch.randint(0, 2, (1000,))
+
+model = nn.Sequential(
+    nn.Linear(20, 32),
+    nn.ReLU(),
+    nn.Linear(32, 2)
+)
+
+critic = AICritic(model, X, y, framework="torch", adapter_kwargs={"epochs":5, "batch_size":64})
+report = critic.evaluate(view="executive")
+print(report)
+
+# TensorFlow Example
+import tensorflow as tf
+
+model = tf.keras.Sequential([
+    tf.keras.layers.Dense(32, activation="relu", input_shape=(20,)),
+    tf.keras.layers.Dense(2)
+])
+critic = AICritic(model, X.numpy(), y.numpy(), framework="tensorflow", adapter_kwargs={"epochs":5})
+report = critic.evaluate(view="executive")
+print(report)
+```
+
+> No need to rewrite evaluation code — **one Critic API works for sklearn, PyTorch, or TensorFlow**.
+
+---
+
+### The Deployment Gate (`deploy_decision`)
+
+The `deploy_decision()` method aggregates *all detected risks* and produces a final gate decision.
+
+```python
+decision = critic.deploy_decision()
+
+if decision["deploy"]:
+    print("✅ Deployment Approved")
+else:
+    print("❌ Deployment Blocked")
+
+print(f"Risk Level: {decision['risk_level']}")
+print(f"Confidence Score: {decision['confidence']:.2f}")
+
+print("\nBlocking Issues:")
+for issue in decision["blocking_issues"]:
+    print(f"- {issue}")
+```
+
+**Conceptual model:**
+
+* **Hard Blockers** → deployment denied
+* **Soft Blockers** → deployment discouraged
+* **Confidence Score (0–1)** → heuristic trust level
+
+---
+
+### Modes & Views (API Design)
+
+The `evaluate()` method supports **multiple modes** via the `view` parameter:
+
+| View          | Description                        |
+| ------------- | ---------------------------------- |
+| `"executive"` | High-level verdict (non-technical) |
+| `"technical"` | Risks & recommendations            |
+| `"details"`   | Raw evaluator outputs              |
+| `"all"`       | Complete payload                   |
+
+Example:
+
+```python
+critic.evaluate(view="technical")
+critic.evaluate(view=["executive", "performance"])
+```
+
+---
+
+### Session Tracking & Model Comparison
+
+You can persist evaluations and compare model versions over time.
+
+```python
+critic_v1 = AICritic(model, X, y, session="v1")
+critic_v1.evaluate()
+
+critic_v2 = AICritic(model, X, y, session="v2")
+critic_v2.evaluate()
+
+comparison = critic_v2.compare_with("v1")
+print(comparison["score_diff"])
+```
+
+This enables:
+
+* Regression tracking
+* Risk drift detection
+* Governance & audit trails
+
+---
+
+### Best Practices & Use Cases
+
+| Scenario                | Recommended Usage                      |
+| ----------------------- | -------------------------------------- |
+| **CI/CD**               | Block merges using `deploy_decision()` |
+| **Model Tuning**        | Use technical view for guidance        |
+| **Governance**          | Persist session outputs                |
+| **Stakeholder Reports** | Share executive summaries              |
+
+---
+
+## 🔒 API Stability
+
+Starting from version **1.0.0**, the public API of **ai-critic** follows semantic versioning.
+Breaking changes will only occur in major releases.
+
+---
+
+## 📄 License
+
+Distributed under the **MIT License**.
+
+---
+
+## 🧠 Final Note
+
+> **ai-critic is not a benchmarking tool.**
+> It is a *decision-making system*.
+
+A failed audit does **not** mean the model is bad — it means the model **is not ready to be trusted**.
+
+The purpose of **ai-critic** is to introduce *structured skepticism* into machine learning workflows — exactly where it belongs.
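
The README's CI/CD row stops at "block merges using `deploy_decision()`". One way such a gate could look in practice; this is a sketch only, and the script name and non-zero-exit convention are assumptions, not something shipped with the package:

```python
# ci_model_gate.py (hypothetical script): fail the pipeline when the critic blocks deployment.
import sys
from ai_critic import AICritic
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
decision = AICritic(RandomForestClassifier(random_state=42), X, y).deploy_decision()

print(f"deploy={decision['deploy']} risk={decision['risk_level']} confidence={decision['confidence']:.2f}")
for issue in decision["blocking_issues"]:
    print(f"blocking: {issue}")

# A non-zero exit code fails the CI job, which is what blocks the merge.
sys.exit(0 if decision["deploy"] else 1)
```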

{ai_critic-0.2.5.dist-info → ai_critic-1.1.0.dist-info}/RECORD
CHANGED

@@ -1,13 +1,17 @@
 ai_critic/__init__.py,sha256=H6DlPMmbcFUamhsNULPLk9vHx81XCiXuKKf63EJ8eM0,53
-ai_critic/critic.py,sha256=
+ai_critic/critic.py,sha256=I9MeVHVCN-lWffPm3DJCgbFVVW8VTIs_qhXd-aP3X5Q,8277
 ai_critic/evaluators/__init__.py,sha256=ri6InmL8_LIcO-JZpU_gEFKLO4URdqo3z6rh7fV6M8Y,169
+ai_critic/evaluators/adapters.py,sha256=8Xw9Ccg1iGVNwVQDGVIqhWj5-Sg6evqCZhg21u8EP20,3068
 ai_critic/evaluators/config.py,sha256=gBXaS8Qxl14f40JnvMWgA0Z0SGEtbCuCHpTOPem0H90,1163
 ai_critic/evaluators/data.py,sha256=YAK5NkwCeJOny_UueZ5ALwvEcRDIbEck404eV2oqWnc,1871
 ai_critic/evaluators/performance.py,sha256=1CQx5DueK0XkelYyJnAGRJ3AjQtjsKeW8_1JQZqKVOI,1973
 ai_critic/evaluators/robustness.py,sha256=mfVQ67Z6t6aRvtIq-XQEQYbwvyf8UefM1myeOGVrnAE,1869
+ai_critic/evaluators/scoring.py,sha256=GBkmDa5Q6RZY4hJfzrCbxbBopsOsRjsNtzyoQHqgWHA,1046
 ai_critic/evaluators/summary.py,sha256=O9ZCrph93VV6pFcMIx2a7DizPIccRUqbGcUZ6oDmOLs,3791
 ai_critic/evaluators/validation.py,sha256=rnzRwD78Cugey33gl9geE8JoBURsKEEnqrIOhBZv0LY,904
-ai_critic
-ai_critic
-ai_critic-
-ai_critic-
+ai_critic/sessions/__init__.py,sha256=Yp7mphSPJwt8a4cJgcQNErqwqHVuP_xAJODrs0y0Abw,72
+ai_critic/sessions/store.py,sha256=65m9WXFVFWv4pPzvXV4l8zLHoHWMfCGe6eHh4X-8agY,947
+ai_critic-1.1.0.dist-info/METADATA,sha256=gIxyPwmDrmkqzZdLILBkq6uOl_2UAu5QRIV7Xis2rGc,8114
+ai_critic-1.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ai_critic-1.1.0.dist-info/top_level.txt,sha256=TRyZkm1vyLLcFDg_80yeg5cHvPis_oW1Ti170417jkw,10
+ai_critic-1.1.0.dist-info/RECORD,,

ai_critic-0.2.5.dist-info/METADATA
REMOVED

@@ -1,200 +0,0 @@
-Metadata-Version: 2.4
-Name: ai-critic
-Version: 0.2.5
-Summary: Fast AI evaluator for scikit-learn models
-Author-email: Luiz Seabra <filipedemarco@yahoo.com>
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-Requires-Dist: numpy
-Requires-Dist: scikit-learn
-
-# ai-critic 🧠: The Quality Gate for Machine Learning Models
-
-**ai-critic** is a specialized **decision-making** tool designed to audit the reliability and readiness for deployment of scikit-learn compatible Machine Learning models.
-
-Instead of just measuring performance (accuracy, F1 score), **ai-critic** acts as a "Quality Gate," operating the model in search of hidden risks that can lead to production failures, such as data leaks, structural overfitting, and vulnerability to noise.
-
----
-
-## 🚀 1. Getting Started (The Basics)
-
-This section is ideal for beginners who need a quick verdict on the health of their model.
-
-### 1.1. Installation
-
-Install the library directly from PyPI:
-
-```bash
-pip install ai-critic
-```
-
-### 1.2. The Quick Verdict
-
-With just a few lines, you can get an executive evaluation and a deployment recommendation.
-
-```python
-from ai_critic import AICritic
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.datasets import make_classification
-
-# 1. Prepare your data and model
-X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
-model = RandomForestClassifier(max_depth=5, random_state=42)
-
-# 2. Initialize Criticism
-# AICritic performs all audits internally
-critic = AICritic(model, X, y)
-
-# 3. Obtain the Executive Summary
-report = critic.evaluate(view="executive")
-
-print(f"Verdict: {report['verdict']}")
-print(f"Risk: {report['risk_level']}")
-print(f"Reason Main: {report['main_reason']}")
-
-#Expected Output:
-
-# Verdict: ✅ Acceptable
-# Risk: Low
-# Main Reason: No critic risks detected.
-
-```
-
----
-
-## 💡 2. Understanding the Critique (The Intermediary)
-
-For the data scientist who needs to understand *why* the model received a verdict and what the next steps are.
-
-### 2.1. The Four Pillars of the Audit
-
-The **ai-critic** evaluates your model across four critic dimensions.
-
-| Category | Main Risk | Code Module |
-| :--- | :--- | :--- |
-| 📈 **Validation** | Suspicious CV Scores | `ai_critic.performance` |
-| 🧪 **Robustness** | Noise Vulnerability | `ai_critic.robustness` |
-
-2.2. Visual and Technical Analysis
-
-The `evaluate` method allows you to view the results and access the complete technical report.
-
-```Python
-# Continuing the previous example...
-
-# 1. Generate the full report and visualizations
-# plot=True generates Correlation, Learning Curve, and Robustness graphs
-full_report = critic.evaluate(view="all", plot=True)
-
-# 2. Access the Technical Summary for Recommendations
-technical_summary = full_report["technical"]
-
-print("\n--- Technical Recommendations ---")
-for i, risk in enumerate(technical_summary["key_risks"]):
-print(f"Risk {i+1}: {risk}")
-print(f"Recommendation: {technical_summary['recommendations'][i]}")
-
-# Example of Risk (if there were one):
-# Risk 1: The depth of the tree may be too high for the size of the dataset.
-
-# Recommendation: Reduce model complexity or adjust hyperparameters.
-
-
-###2.3. Robustness Test
-
-A robust model should maintain its performance even with small disturbances in the data. The `ai-critic` test assesses this by injecting noise into the input data.
-
-```python
-# Accessing the specific result of the Robustness module
-robustness_result = full_report["details"]["robustness"]
-
-print("\n--- Robustness Test ---")
-print(f"Original CV Score: {robustness_result['cv_score_original']:.4f}")
-print(f"CV Score with Noise: {robustness_result['cv_score_noisy']:.4f}")
-print(f"Performance Drop: {robustness_result['performance_drop']:.4f}")
-print(f"Robustness Verdict: {robustness_result['verdict']}")
-
-# Possible Verdicts:
-# - Stable: Acceptable drop.
-
-# - Fragile: Significant drop (risk).
-
-# - Misleading: Original performance inflated by leakage.
-
-```
-
----
-
-## ⚙️ 3. Integration and Governance (The Advanced)
-
-This section is for MLOps engineers and architects looking to integrate **ai-critic** into automated pipelines and create custom deployment logic.
-
-###3.1. The Deployment Gate (`deploy_decision`)
-
-The `deploy_decision()` method is the final control point. It returns a structured object that classifies problems into *Hard Blockers* (prevent deployment) and *Soft Blockers* (require attention, but can be accepted with reservations).
-
-Python
-# Example of use in a CI/CD pipeline
-decision = critic.deploy_decision()
-
-if decision["deploy"]:
-print("✅ Deployment Approved. Risk Level: Low.")
-other:
-print(f"❌ Deployment Blocked. Risk Level: {decision['risk_level'].upper()}")
-print("Blocking Issues:")
-for issue in decision["blocking_issues"]:
-print(f"- {problem}")
-
-# The decision object also includes a heuristic confidence score (0.0 to 1.0)
-print(f"Heuristic Confidence in Model: {decision['confidence']:.2f}")
-
-```
-
-###3.2. AccessFor custom *governance* rules or logic, you can access the raw data of each module through the `"details"` view.
-
-```python
-# Accessing Data Leakage Details
-data_details = critic.evaluate(view="details")["data"]
-
-if data_details["data_leakage"]["suspected"]:
-
-print("\n--- Data Leak Alert ---")
-
-for detail in data_details["data_leakage"]["details"]:
-
-print(f"Feature {detail['feature_index']} with correlation of {detail['correlation']:.4f}")
-
-# Accessing Structural Overfitting Details
-config_details = critic.evaluate(view="details")["config"]
-
-if config_details["structural_warnings"]:
-
-print("\n--- Structural Alert ---")
-
-for warning in config_details["structural_warnings"]:
-
-print(f"Warning: {warning['message']} (Max Depth: {warning['max_depth']}, Recommended: {warning['recommended_max_depth']})")
-```
-
-### 3.3. Best Practices and Use Cases
-
-| Use | Recommended Action |
-| :--- | :--- |
-| **CI/CD** | Use `deploy_decision()` as an automated quality gate. |
-| **Tuning** | Use the technical view to guide hyperparameter optimization. |
-| **Governance** | Log the details view for auditing and compliance. |
-| **Communication** | Use the executive view to report risks to non-technical stakeholders. |
-
----
-
-## 📄 License
-
-Distributed under the **MIT License**.
-
---
-
-## 🧠 Final Note
-
-> **ai-critic** is not a benchmarking tool. It's a decision-making tool.
-
-If a model fails here, it doesn't mean it's "bad," but rather that it **shouldn't be trusted yet**. The goal is to inject the necessary skepticism to build truly robust AI systems.

{ai_critic-0.2.5.dist-info → ai_critic-1.1.0.dist-info}/top_level.txt
File without changes