ai-critic 0.2.5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ai_critic/critic.py CHANGED
@@ -2,14 +2,17 @@ from ai_critic.evaluators import (
2
2
  robustness,
3
3
  config,
4
4
  data,
5
- performance
5
+ performance,
6
+ adapters # <- new import
6
7
  )
7
8
  from ai_critic.evaluators.summary import HumanSummary
9
+ from ai_critic.sessions import CriticSessionStore
10
+ from ai_critic.evaluators.scoring import compute_scores
8
11
 
9
12
 
10
13
  class AICritic:
11
14
  """
12
- Automated reviewer for scikit-learn models.
15
+ Automated reviewer for scikit-learn, PyTorch, or TensorFlow models.
13
16
 
14
17
  Produces a multi-layered risk assessment including:
15
18
  - Data integrity analysis
@@ -19,22 +22,37 @@ class AICritic:
19
22
  - Human-readable executive and technical summaries
20
23
  """
21
24
 
22
- def __init__(self, model, X, y, random_state=None):
25
+ def __init__(self, model, X, y, random_state=None, session=None, framework="sklearn", adapter_kwargs=None):
23
26
  """
24
27
  Parameters
25
28
  ----------
26
- model : sklearn-compatible estimator
29
+ model : object
30
+ scikit-learn estimator, torch.nn.Module, or tf.keras.Model
27
31
  X : np.ndarray
28
32
  Feature matrix
29
33
  y : np.ndarray
30
34
  Target vector
31
35
  random_state : int or None
32
36
  Global seed for reproducibility (optional)
37
+ session : str or None
38
+ Optional session name for longitudinal comparison
39
+ framework : str
40
+ "sklearn" (default), "torch", or "tensorflow"
41
+ adapter_kwargs : dict
42
+ Extra kwargs for the adapter (e.g. epochs, lr, batch_size)
33
43
  """
34
- self.model = model
44
+ adapter_kwargs = adapter_kwargs or {}
45
+ self.framework = framework.lower()
46
+ if self.framework != "sklearn":
47
+ self.model = adapters.ModelAdapter(model, framework=self.framework, **adapter_kwargs)
48
+ else:
49
+ self.model = model
50
+
35
51
  self.X = X
36
52
  self.y = y
37
53
  self.random_state = random_state
54
+ self.session = session
55
+ self._store = CriticSessionStore() if session else None
38
56
 
39
57
  def evaluate(self, view="all", plot=False):
40
58
  """
@@ -47,15 +65,10 @@ class AICritic:
47
65
  - "executive" : executive summary only
48
66
  - "technical" : technical summary only
49
67
  - "details" : low-level evaluator outputs
50
- - list : subset of views (e.g. ["executive", "details"])
68
+ - list : subset of views
51
69
  plot : bool
52
- - True : generate plots (learning curve, heatmap, robustness)
70
+ - True : generate plots
53
71
  - False : no plots
54
-
55
- Returns
56
- -------
57
- dict
58
- Evaluation payload according to selected view
59
72
  """
60
73
 
61
74
  # =========================
@@ -66,25 +79,23 @@ class AICritic:
66
79
  # -------------------------
67
80
  # Data analysis
68
81
  # -------------------------
69
- data_report = data.evaluate(
82
+ details["data"] = data.evaluate(
70
83
  self.X,
71
84
  self.y,
72
85
  plot=plot
73
86
  )
74
- details["data"] = data_report
75
87
 
76
88
  # -------------------------
77
89
  # Model configuration sanity
78
90
  # -------------------------
79
91
  details["config"] = config.evaluate(
80
92
  self.model,
81
- n_samples=data_report["n_samples"],
82
- n_features=data_report["n_features"]
93
+ n_samples=details["data"]["n_samples"],
94
+ n_features=details["data"]["n_features"]
83
95
  )
84
96
 
85
97
  # -------------------------
86
98
  # Performance evaluation
87
- # (CV strategy inferred automatically)
88
99
  # -------------------------
89
100
  details["performance"] = performance.evaluate(
90
101
  self.model,
@@ -94,32 +105,36 @@ class AICritic:
94
105
  )
95
106
 
96
107
  # -------------------------
97
- # Robustness & leakage analysis
108
+ # Robustness evaluation
98
109
  # -------------------------
99
110
  details["robustness"] = robustness.evaluate(
100
111
  self.model,
101
112
  self.X,
102
113
  self.y,
103
- leakage_suspected=data_report["data_leakage"]["suspected"],
114
+ leakage_suspected=details["data"]["data_leakage"]["suspected"],
104
115
  plot=plot
105
116
  )
106
117
 
107
118
  # =========================
108
- # Human-centered summaries
119
+ # Human summaries
109
120
  # =========================
110
121
  human_summary = HumanSummary().generate(details)
111
122
 
112
- # =========================
113
- # Full payload (PUBLIC API)
114
- # =========================
115
123
  payload = {
116
124
  "executive": human_summary["executive_summary"],
117
125
  "technical": human_summary["technical_summary"],
118
126
  "details": details,
119
- # Convenience shortcut (prevents KeyError in user code)
120
- "performance": details["performance"]
127
+ "performance": details["performance"],
121
128
  }
122
129
 
130
+ # =========================
131
+ # Session persistence (optional)
132
+ # =========================
133
+ if self.session:
134
+ scores = compute_scores(payload)
135
+ payload["scores"] = scores
136
+ self._store.save(self.session, payload)
137
+
123
138
  # =========================
124
139
  # View selector
125
140
  # =========================
@@ -130,22 +145,56 @@ class AICritic:
130
145
  return {k: payload[k] for k in view if k in payload}
131
146
 
132
147
  return payload.get(view)
148
+
149
+ def compare_with(self, previous_session: str) -> dict:
150
+ """
151
+ Compare current session with a previous one.
152
+ """
153
+
154
+ if not self.session:
155
+ raise ValueError("Current session name not set.")
156
+
157
+ current = self._store.load(self.session)
158
+ previous = self._store.load(previous_session)
159
+
160
+ if not previous:
161
+ raise FileNotFoundError(
162
+ f"Session '{previous_session}' not found."
163
+ )
164
+
165
+ diff = {
166
+ "global_score": {
167
+ "current": current["scores"]["global"],
168
+ "previous": previous["scores"]["global"],
169
+ "delta": current["scores"]["global"] - previous["scores"]["global"],
170
+ },
171
+ "components": {}
172
+ }
173
+
174
+ for key, value in current["scores"]["components"].items():
175
+ prev_value = previous["scores"]["components"].get(key)
176
+ if prev_value is not None:
177
+ diff["components"][key] = {
178
+ "current": value,
179
+ "previous": prev_value,
180
+ "delta": value - prev_value
181
+ }
182
+
183
+ return {
184
+ "current_session": self.session,
185
+ "previous_session": previous_session,
186
+ "score_diff": diff,
187
+ "note": (
188
+ "Score deltas indicate changes in risk profile, "
189
+ "not absolute model quality."
190
+ )
191
+ }
192
+
133
193
  def deploy_decision(self):
134
194
  """
135
195
  Final deployment gate.
136
-
137
- Returns
138
- -------
139
- dict
140
- {
141
- "deploy": bool,
142
- "risk_level": str,
143
- "blocking_issues": list[str],
144
- "confidence": float
145
- }
146
196
  """
147
197
 
148
- # Reuses the ENTIRE existing pipeline
149
198
  report = self.evaluate(view="all", plot=False)
150
199
 
151
200
  data_risk = report["details"]["data"]["data_leakage"]["suspected"]
@@ -156,9 +205,7 @@ class AICritic:
156
205
  blocking_issues = []
157
206
  risk_level = "low"
158
207
 
159
- # =========================
160
- # Hard blockers (❌)
161
- # =========================
208
+ # Hard blockers
162
209
  if data_risk and perfect_cv:
163
210
  blocking_issues.append(
164
211
  "Data leakage combined with suspiciously perfect CV score"
@@ -177,9 +224,7 @@ class AICritic:
177
224
  )
178
225
  risk_level = "high"
179
226
 
180
- # =========================
181
- # Soft blockers (⚠️)
182
- # =========================
227
+ # Soft blockers
183
228
  if risk_level != "high":
184
229
  if robustness_verdict == "fragile":
185
230
  blocking_issues.append(
@@ -199,14 +244,8 @@ class AICritic:
199
244
  )
200
245
  risk_level = "medium"
201
246
 
202
- # =========================
203
- # Final decision
204
- # =========================
205
247
  deploy = len(blocking_issues) == 0
206
248
 
207
- # =========================
208
- # Confidence heuristic
209
- # =========================
210
249
  confidence = 1.0
211
250
  confidence -= 0.35 if data_risk else 0
212
251
  confidence -= 0.25 if perfect_cv else 0
@@ -220,4 +259,3 @@ class AICritic:
220
259
  "blocking_issues": blocking_issues,
221
260
  "confidence": confidence
222
261
  }
223
-
@@ -0,0 +1,84 @@
1
+ # evaluators/adapters.py
2
+ import numpy as np
3
+
4
+ try:
5
+ import torch
6
+ import torch.nn as nn
7
+ except ImportError:
8
+ torch = None
9
+
10
+ try:
11
+ import tensorflow as tf
12
+ except ImportError:
13
+ tf = None
14
+
15
+ class ModelAdapter:
16
+ """
17
+ Wraps scikit-learn, PyTorch, or TensorFlow models to provide a
18
+ unified fit/predict interface for AICritic.
19
+ """
20
+
21
+ def __init__(self, model, framework="sklearn", **kwargs):
22
+ """
23
+ Parameters
24
+ ----------
25
+ model : object
26
+ The original model (sklearn estimator, torch.nn.Module, or tf.keras.Model)
27
+ framework : str
28
+ One of "sklearn", "torch", "tensorflow"
29
+ kwargs : dict
30
+ Extra hyperparameters for training (epochs, batch_size, optimizer, etc)
31
+ """
32
+ self.model = model
33
+ self.framework = framework.lower()
34
+ self.kwargs = kwargs
35
+
36
+ if self.framework not in ("sklearn", "torch", "tensorflow"):
37
+ raise ValueError(f"Unsupported framework: {framework}")
38
+
39
+ # PyTorch default settings
40
+ if self.framework == "torch":
41
+ self.epochs = kwargs.get("epochs", 5)
42
+ self.lr = kwargs.get("lr", 1e-3)
43
+ self.loss_fn = kwargs.get("loss_fn", nn.MSELoss())
44
+ self.optimizer_class = kwargs.get("optimizer", torch.optim.Adam)
45
+ self.device = kwargs.get("device", "cpu")
46
+ self.model.to(self.device)
47
+
48
+ # TensorFlow default settings
49
+ if self.framework == "tensorflow":
50
+ self.epochs = kwargs.get("epochs", 5)
51
+ self.batch_size = kwargs.get("batch_size", 32)
52
+ self.loss_fn = kwargs.get("loss_fn", "mse")
53
+ self.optimizer = kwargs.get("optimizer", "adam")
54
+ self.model.compile(optimizer=self.optimizer, loss=self.loss_fn)
55
+
56
+ def fit(self, X, y):
57
+ if self.framework == "sklearn":
58
+ self.model.fit(X, y)
59
+ elif self.framework == "torch":
60
+ X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
61
+ y_tensor = torch.tensor(y, dtype=torch.float32).to(self.device).view(-1, 1)
62
+ optimizer = self.optimizer_class(self.model.parameters(), lr=self.lr)
63
+
64
+ self.model.train()
65
+ for epoch in range(self.epochs):
66
+ optimizer.zero_grad()
67
+ output = self.model(X_tensor)
68
+ loss = self.loss_fn(output, y_tensor)
69
+ loss.backward()
70
+ optimizer.step()
71
+ elif self.framework == "tensorflow":
72
+ self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
73
+ return self
74
+
75
+ def predict(self, X):
76
+ if self.framework == "sklearn":
77
+ return self.model.predict(X)
78
+ elif self.framework == "torch":
79
+ self.model.eval()
80
+ with torch.no_grad():
81
+ X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
82
+ return self.model(X_tensor).cpu().numpy().flatten()
83
+ elif self.framework == "tensorflow":
84
+ return self.model.predict(X).flatten()
@@ -0,0 +1,39 @@
1
+ def compute_scores(report: dict) -> dict:
2
+ """
3
+ Converts critic signals into a coarse 0–100 score.
4
+ Score is NOT an objective metric.
5
+ """
6
+
7
+ score = 100
8
+
9
+ data_leakage = report["details"]["data"]["data_leakage"]["suspected"]
10
+ perfect_cv = report["details"]["performance"]["suspiciously_perfect"]
11
+ robustness = report["details"]["robustness"]["verdict"]
12
+ structural = report["details"]["config"]["structural_warnings"]
13
+
14
+ if data_leakage:
15
+ score -= 30
16
+
17
+ if perfect_cv:
18
+ score -= 20
19
+
20
+ if robustness == "fragile":
21
+ score -= 15
22
+ elif robustness == "misleading":
23
+ score -= 25
24
+
25
+ if structural:
26
+ score -= 10
27
+
28
+ return {
29
+ "global": max(0, min(100, score)),
30
+ "components": {
31
+ "data_integrity": 0 if data_leakage else 100,
32
+ "validation": 70 if perfect_cv else 100,
33
+ "robustness": {
34
+ "stable": 100,
35
+ "fragile": 65,
36
+ "misleading": 40
37
+ }.get(robustness, 100),
38
+ }
39
+ }
@@ -0,0 +1,3 @@
1
+ from .store import CriticSessionStore
2
+
3
+ __all__ = ["CriticSessionStore"]
@@ -0,0 +1,33 @@
1
+ import json
2
+ from pathlib import Path
3
+ from datetime import datetime
4
+
5
+
6
+ class CriticSessionStore:
7
+ """
8
+ Simple local persistence layer for ai-critic sessions.
9
+ """
10
+
11
+ def __init__(self, base_dir: str | None = None):
12
+ self.base_dir = Path(
13
+ base_dir or Path.home() / ".ai_critic_sessions"
14
+ )
15
+ self.base_dir.mkdir(parents=True, exist_ok=True)
16
+
17
+ def _session_path(self, name: str) -> Path:
18
+ return self.base_dir / f"{name}.json"
19
+
20
+ def save(self, name: str, payload: dict):
21
+ data = {
22
+ "timestamp": datetime.utcnow().isoformat(),
23
+ "payload": payload
24
+ }
25
+ with open(self._session_path(name), "w") as f:
26
+ json.dump(data, f, indent=2)
27
+
28
+ def load(self, name: str) -> dict | None:
29
+ path = self._session_path(name)
30
+ if not path.exists():
31
+ return None
32
+ with open(path) as f:
33
+ return json.load(f)["payload"]
@@ -0,0 +1,289 @@
1
+ Metadata-Version: 2.4
2
+ Name: ai-critic
3
+ Version: 1.1.0
4
+ Summary: Fast AI evaluator for scikit-learn models
5
+ Author-email: Luiz Seabra <filipedemarco@yahoo.com>
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy
9
+ Requires-Dist: scikit-learn
10
+
11
+ # ai-critic 🧠: The Quality Gate for Machine Learning Models
12
+
13
+ **ai-critic** is a specialized **decision-making** tool designed to audit the reliability and readiness for deployment of **scikit-learn**, **PyTorch**, and **TensorFlow** models.
14
+
15
+ Instead of merely measuring performance (accuracy, F1 score), **ai-critic** acts as a **Quality Gate**, actively probing the model to uncover *hidden risks* that commonly cause production failures — such as **data leakage**, **structural overfitting**, and **fragility under noise**.
16
+
17
+ > **ai-critic does not ask “How good is this model?”**
18
+ > It asks **“Can this model be trusted?”**
19
+
20
+ ---
21
+
22
+ ## 🚀 Getting Started (The Basics)
23
+
24
+ This section is ideal for beginners who need a **fast and reliable verdict** on a trained model.
25
+
26
+ ### Installation
27
+
28
+ Install directly from PyPI:
29
+
30
+ ```bash
31
+ pip install ai-critic
32
+ ```
33
+
34
+ ---
35
+
36
+ ### The Quick Verdict
37
+
38
+ With just a few lines of code, you obtain an **executive-level assessment** and a **deployment recommendation**.
39
+
40
+ ```python
41
+ from ai_critic import AICritic
42
+ from sklearn.ensemble import RandomForestClassifier
43
+ from sklearn.datasets import make_classification
44
+
45
+ # 1. Prepare data and model
46
+ X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
47
+ model = RandomForestClassifier(max_depth=5, random_state=42)
48
+
49
+ # 2. Initialize the Critic
50
+ critic = AICritic(model, X, y)
51
+
52
+ # 3. Run the audit (executive mode)
53
+ report = critic.evaluate(view="executive")
54
+
55
+ print(f"Verdict: {report['verdict']}")
56
+ print(f"Risk Level: {report['risk_level']}")
57
+ print(f"Main Reason: {report['main_reason']}")
58
+ ```
59
+
60
+ **Expected Output (example):**
61
+
62
+ ```text
63
+ Verdict: ⚠️ Risky
64
+ Risk Level: medium
65
+ Main Reason: Structural or robustness-related risks detected.
66
+ ```
67
+
68
+ This output is intentionally **conservative**.
69
+ If **ai-critic** recommends deployment, it means meaningful risks were *not* detected.
70
+
71
+ ---
72
+
73
+ ## 💡 Understanding the Critique (The Intermediate)
74
+
75
+ For data scientists who want to understand **why** the model received a given verdict and **how to improve it**.
76
+
77
+ ---
78
+
79
+ ### The Four Pillars of the Audit
80
+
81
+ **ai-critic** evaluates models across four independent risk dimensions:
82
+
83
+ | Pillar | Main Risk Detected | Internal Module |
84
+ | ---------------------- | -------------------------------------- | ------------------------ |
85
+ | 📊 **Data Integrity** | Target Leakage & Correlation Artifacts | `evaluators.data` |
86
+ | 🧠 **Model Structure** | Over-complexity & Misconfiguration | `evaluators.config` |
87
+ | 📈 **Performance** | Suspicious CV or Learning Curves | `evaluators.performance` |
88
+ | 🧪 **Robustness** | Sensitivity to Noise | `evaluators.robustness` |
89
+
90
+ Each pillar contributes signals used later in the **deployment gate**.
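The raw output of each pillar is exposed under the matching key of the `details` view. As a minimal sketch, reusing the `critic` object from the quick-start example (the exact fields inside each report vary by evaluator):

```python
# Illustrative walk-through of the four pillar reports; the keys below
# ("data", "config", "performance", "robustness") come from the details payload.
details = critic.evaluate(view="details")

for pillar in ("data", "config", "performance", "robustness"):
    print(f"--- {pillar} ---")
    print(details[pillar])  # each evaluator returns a plain dict of signals
```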
91
+
92
+ ---
93
+
94
+ ### Full Technical & Visual Analysis
95
+
96
+ To access **all internal diagnostics**, including plots and recommendations, use `view="all"`.
97
+
98
+ ```python
99
+ full_report = critic.evaluate(view="all", plot=True)
100
+
101
+ technical_summary = full_report["technical"]
102
+
103
+ print("\n--- Key Risks Detected ---")
104
+ for i, risk in enumerate(technical_summary["key_risks"], start=1):
105
+ print(f"{i}. {risk}")
106
+
107
+ print("\n--- Recommendations ---")
108
+ for rec in technical_summary["recommendations"]:
109
+ print(f"- {rec}")
110
+ ```
111
+
112
+ Generated plots may include:
113
+
114
+ * Feature correlation heatmaps
115
+ * Learning curves
116
+ * Robustness degradation charts
117
+
118
+ ---
119
+
120
+ ### Robustness Test (Noise Injection)
121
+
122
+ A model that collapses under small perturbations is **not production-safe**.
123
+
124
+ ```python
125
+ robustness = full_report["details"]["robustness"]
126
+
127
+ print("\n--- Robustness Analysis ---")
128
+ print(f"Original CV Score: {robustness['cv_score_original']:.4f}")
129
+ print(f"Noisy CV Score: {robustness['cv_score_noisy']:.4f}")
130
+ print(f"Performance Drop: {robustness['performance_drop']:.4f}")
131
+ print(f"Verdict: {robustness['verdict']}")
132
+ ```
133
+
134
+ **Possible Verdicts:**
135
+
136
+ * `stable` → acceptable degradation
137
+ * `fragile` → high sensitivity to noise
138
+ * `misleading` → performance likely inflated by leakage
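For pipelines that want to act on these verdicts automatically, a minimal illustrative guard, reusing the `robustness` dict from the example above, could look like this:

```python
# Illustrative guard: treat any non-"stable" robustness verdict as a release blocker
if robustness["verdict"] in ("fragile", "misleading"):
    raise RuntimeError(f"Robustness check failed: {robustness['verdict']}")
```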
139
+
140
+ ---
141
+
142
+ ## ⚙️ Integration and Governance (The Advanced)
143
+
144
+ This section targets **MLOps engineers**, **architects**, and teams operating automated pipelines.
145
+
146
+ ---
147
+
148
+ ### Multi-Framework Support
149
+
150
+ **ai-critic 1.0+** supports models from multiple frameworks with the **same API**:
151
+
152
+ ```python
153
+ # PyTorch Example
154
+ import torch
155
+ import torch.nn as nn
156
+ from ai_critic import AICritic
157
+
158
+ X = torch.randn(1000, 20)
159
+ y = torch.randint(0, 2, (1000,))
160
+
161
+ model = nn.Sequential(
162
+ nn.Linear(20, 32),
163
+ nn.ReLU(),
164
+ nn.Linear(32, 2)
165
+ )
166
+
167
+ critic = AICritic(model, X, y, framework="torch", adapter_kwargs={"epochs":5, "batch_size":64})
168
+ report = critic.evaluate(view="executive")
169
+ print(report)
170
+
171
+ # TensorFlow Example
172
+ import tensorflow as tf
173
+
174
+ model = tf.keras.Sequential([
175
+ tf.keras.layers.Dense(32, activation="relu", input_shape=(20,)),
176
+ tf.keras.layers.Dense(2)
177
+ ])
178
+ critic = AICritic(model, X.numpy(), y.numpy(), framework="tensorflow", adapter_kwargs={"epochs":5})
179
+ report = critic.evaluate(view="executive")
180
+ print(report)
181
+ ```
182
+
183
+ > No need to rewrite evaluation code — **one Critic API works for sklearn, PyTorch, or TensorFlow**.
184
+
185
+ ---
186
+
187
+ ### The Deployment Gate (`deploy_decision`)
188
+
189
+ The `deploy_decision()` method aggregates *all detected risks* and produces a final gate decision.
190
+
191
+ ```python
192
+ decision = critic.deploy_decision()
193
+
194
+ if decision["deploy"]:
195
+ print("✅ Deployment Approved")
196
+ else:
197
+ print("❌ Deployment Blocked")
198
+
199
+ print(f"Risk Level: {decision['risk_level']}")
200
+ print(f"Confidence Score: {decision['confidence']:.2f}")
201
+
202
+ print("\nBlocking Issues:")
203
+ for issue in decision["blocking_issues"]:
204
+ print(f"- {issue}")
205
+ ```
206
+
207
+ **Conceptual model:**
208
+
209
+ * **Hard Blockers** → deployment denied
210
+ * **Soft Blockers** → deployment discouraged
211
+ * **Confidence Score (0–1)** → heuristic trust level
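The confidence score is a simple additive penalty scheme. As a rough sketch based on the `deploy_decision()` source in this release (only the penalties visible in this diff are shown; further deductions are elided here):

```python
# Rough reconstruction of the confidence heuristic (constants taken from the
# diffed source; additional deductions are not shown in this diff)
report = critic.evaluate(view="all", plot=False)

data_risk = report["details"]["data"]["data_leakage"]["suspected"]
perfect_cv = report["details"]["performance"]["suspiciously_perfect"]

confidence = 1.0
confidence -= 0.35 if data_risk else 0   # leakage carries the heaviest penalty
confidence -= 0.25 if perfect_cv else 0  # "too perfect" CV is the next heaviest
print(f"Approximate confidence: {confidence:.2f}")
```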
212
+
213
+ ---
214
+
215
+ ### Modes & Views (API Design)
216
+
217
+ The `evaluate()` method supports **multiple modes** via the `view` parameter:
218
+
219
+ | View | Description |
220
+ | ------------- | ---------------------------------- |
221
+ | `"executive"` | High-level verdict (non-technical) |
222
+ | `"technical"` | Risks & recommendations |
223
+ | `"details"` | Raw evaluator outputs |
224
+ | `"all"` | Complete payload |
225
+
226
+ Example:
227
+
228
+ ```python
229
+ critic.evaluate(view="technical")
230
+ critic.evaluate(view=["executive", "performance"])
231
+ ```
232
+
233
+ ---
234
+
235
+ ### Session Tracking & Model Comparison
236
+
237
+ You can persist evaluations and compare model versions over time.
238
+
239
+ ```python
240
+ critic_v1 = AICritic(model, X, y, session="v1")
241
+ critic_v1.evaluate()
242
+
243
+ critic_v2 = AICritic(model, X, y, session="v2")
244
+ critic_v2.evaluate()
245
+
246
+ comparison = critic_v2.compare_with("v1")
247
+ print(comparison["score_diff"])
248
+ ```
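Based on the `compare_with()` source in this release, the comparison payload has roughly the shape below (the numbers are illustrative; component names come from the scoring module). Sessions themselves are persisted as JSON files under `~/.ai_critic_sessions` by default, one file per session name.

```python
{
    "current_session": "v2",
    "previous_session": "v1",
    "score_diff": {
        "global_score": {"current": 85, "previous": 70, "delta": 15},
        "components": {
            "data_integrity": {"current": 100, "previous": 0, "delta": 100},
            "validation": {"current": 100, "previous": 70, "delta": 30},
            "robustness": {"current": 65, "previous": 65, "delta": 0},
        },
    },
    "note": "Score deltas indicate changes in risk profile, not absolute model quality.",
}
```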
249
+
250
+ This enables:
251
+
252
+ * Regression tracking
253
+ * Risk drift detection
254
+ * Governance & audit trails
255
+
256
+ ---
257
+
258
+ ### Best Practices & Use Cases
259
+
260
+ | Scenario | Recommended Usage |
261
+ | ----------------------- | -------------------------------------- |
262
+ | **CI/CD** | Block merges using `deploy_decision()` |
263
+ | **Model Tuning** | Use technical view for guidance |
264
+ | **Governance** | Persist session outputs |
265
+ | **Stakeholder Reports** | Share executive summaries |
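For the **CI/CD** scenario in the table above, a minimal quality-gate script might look like the sketch below; the file name and exit-code convention are assumptions, not part of the package:

```python
# ci_quality_gate.py -- hypothetical CI helper built around deploy_decision()
import sys

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from ai_critic import AICritic

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
model = RandomForestClassifier(max_depth=5, random_state=42)

decision = AICritic(model, X, y).deploy_decision()

for issue in decision["blocking_issues"]:
    print(f"BLOCKER: {issue}")

print(f"Risk level: {decision['risk_level']} (confidence {decision['confidence']:.2f})")

# A non-zero exit code fails the pipeline when the gate refuses deployment
sys.exit(0 if decision["deploy"] else 1)
```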
266
+
267
+ ---
268
+
269
+ ## 🔒 API Stability
270
+
271
+ Starting from version **1.0.0**, the public API of **ai-critic** follows semantic versioning.
272
+ Breaking changes will only occur in major releases.
273
+
274
+ ---
275
+
276
+ ## 📄 License
277
+
278
+ Distributed under the **MIT License**.
279
+
280
+ ---
281
+
282
+ ## 🧠 Final Note
283
+
284
+ > **ai-critic is not a benchmarking tool.**
285
+ > It is a *decision-making system*.
286
+
287
+ A failed audit does **not** mean the model is bad — it means the model **is not ready to be trusted**.
288
+
289
+ The purpose of **ai-critic** is to introduce *structured skepticism* into machine learning workflows — exactly where it belongs.
@@ -1,13 +1,17 @@
1
1
  ai_critic/__init__.py,sha256=H6DlPMmbcFUamhsNULPLk9vHx81XCiXuKKf63EJ8eM0,53
2
- ai_critic/critic.py,sha256=Qewfu3mRRu1ORywij7zcEnL_kq_U-OUSHAG1PnTJdlA,6739
2
+ ai_critic/critic.py,sha256=I9MeVHVCN-lWffPm3DJCgbFVVW8VTIs_qhXd-aP3X5Q,8277
3
3
  ai_critic/evaluators/__init__.py,sha256=ri6InmL8_LIcO-JZpU_gEFKLO4URdqo3z6rh7fV6M8Y,169
4
+ ai_critic/evaluators/adapters.py,sha256=8Xw9Ccg1iGVNwVQDGVIqhWj5-Sg6evqCZhg21u8EP20,3068
4
5
  ai_critic/evaluators/config.py,sha256=gBXaS8Qxl14f40JnvMWgA0Z0SGEtbCuCHpTOPem0H90,1163
5
6
  ai_critic/evaluators/data.py,sha256=YAK5NkwCeJOny_UueZ5ALwvEcRDIbEck404eV2oqWnc,1871
6
7
  ai_critic/evaluators/performance.py,sha256=1CQx5DueK0XkelYyJnAGRJ3AjQtjsKeW8_1JQZqKVOI,1973
7
8
  ai_critic/evaluators/robustness.py,sha256=mfVQ67Z6t6aRvtIq-XQEQYbwvyf8UefM1myeOGVrnAE,1869
9
+ ai_critic/evaluators/scoring.py,sha256=GBkmDa5Q6RZY4hJfzrCbxbBopsOsRjsNtzyoQHqgWHA,1046
8
10
  ai_critic/evaluators/summary.py,sha256=O9ZCrph93VV6pFcMIx2a7DizPIccRUqbGcUZ6oDmOLs,3791
9
11
  ai_critic/evaluators/validation.py,sha256=rnzRwD78Cugey33gl9geE8JoBURsKEEnqrIOhBZv0LY,904
10
- ai_critic-0.2.5.dist-info/METADATA,sha256=NNLka05cHrUg1YykLfivgXFmOs_KLy0ERvSEq8OksCk,6512
11
- ai_critic-0.2.5.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
12
- ai_critic-0.2.5.dist-info/top_level.txt,sha256=TRyZkm1vyLLcFDg_80yeg5cHvPis_oW1Ti170417jkw,10
13
- ai_critic-0.2.5.dist-info/RECORD,,
12
+ ai_critic/sessions/__init__.py,sha256=Yp7mphSPJwt8a4cJgcQNErqwqHVuP_xAJODrs0y0Abw,72
13
+ ai_critic/sessions/store.py,sha256=65m9WXFVFWv4pPzvXV4l8zLHoHWMfCGe6eHh4X-8agY,947
14
+ ai_critic-1.1.0.dist-info/METADATA,sha256=gIxyPwmDrmkqzZdLILBkq6uOl_2UAu5QRIV7Xis2rGc,8114
15
+ ai_critic-1.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
16
+ ai_critic-1.1.0.dist-info/top_level.txt,sha256=TRyZkm1vyLLcFDg_80yeg5cHvPis_oW1Ti170417jkw,10
17
+ ai_critic-1.1.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.10.1)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,200 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: ai-critic
3
- Version: 0.2.5
4
- Summary: Fast AI evaluator for scikit-learn models
5
- Author-email: Luiz Seabra <filipedemarco@yahoo.com>
6
- Requires-Python: >=3.9
7
- Description-Content-Type: text/markdown
8
- Requires-Dist: numpy
9
- Requires-Dist: scikit-learn
10
-
11
- # ai-critic 🧠: The Quality Gate for Machine Learning Models
12
-
13
- **ai-critic** is a specialized **decision-making** tool designed to audit the reliability and readiness for deployment of scikit-learn compatible Machine Learning models.
14
-
15
- Instead of just measuring performance (accuracy, F1 score), **ai-critic** acts as a "Quality Gate," operating the model in search of hidden risks that can lead to production failures, such as data leaks, structural overfitting, and vulnerability to noise.
16
-
17
- ---
18
-
19
- ## 🚀 1. Getting Started (The Basics)
20
-
21
- This section is ideal for beginners who need a quick verdict on the health of their model.
22
-
23
- ### 1.1. Installation
24
-
25
- Install the library directly from PyPI:
26
-
27
- ```bash
28
- pip install ai-critic
29
- ```
30
-
31
- ### 1.2. The Quick Verdict
32
-
33
- With just a few lines, you can get an executive evaluation and a deployment recommendation.
34
-
35
- ```python
36
- from ai_critic import AICritic
37
- from sklearn.ensemble import RandomForestClassifier
38
- from sklearn.datasets import make_classification
39
-
40
- # 1. Prepare your data and model
41
- X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
42
- model = RandomForestClassifier(max_depth=5, random_state=42)
43
-
44
- # 2. Initialize Criticism
45
- # AICritic performs all audits internally
46
- critic = AICritic(model, X, y)
47
-
48
- # 3. Obtain the Executive Summary
49
- report = critic.evaluate(view="executive")
50
-
51
- print(f"Verdict: {report['verdict']}")
52
- print(f"Risk: {report['risk_level']}")
53
- print(f"Reason Main: {report['main_reason']}")
54
-
55
- #Expected Output:
56
-
57
- # Verdict: ✅ Acceptable
58
- # Risk: Low
59
- # Main Reason: No critic risks detected.
60
-
61
- ```
62
-
63
- ---
64
-
65
- ## 💡 2. Understanding the Critique (The Intermediary)
66
-
67
- For the data scientist who needs to understand *why* the model received a verdict and what the next steps are.
68
-
69
- ### 2.1. The Four Pillars of the Audit
70
-
71
- The **ai-critic** evaluates your model across four critic dimensions.
72
-
73
- | Category | Main Risk | Code Module |
74
- | :--- | :--- | :--- |
75
- | 📈 **Validation** | Suspicious CV Scores | `ai_critic.performance` |
76
- | 🧪 **Robustness** | Noise Vulnerability | `ai_critic.robustness` |
77
-
78
- 2.2. Visual and Technical Analysis
79
-
80
- The `evaluate` method allows you to view the results and access the complete technical report.
81
-
82
- ```Python
83
- # Continuing the previous example...
84
-
85
- # 1. Generate the full report and visualizations
86
- # plot=True generates Correlation, Learning Curve, and Robustness graphs
87
- full_report = critic.evaluate(view="all", plot=True)
88
-
89
- # 2. Access the Technical Summary for Recommendations
90
- technical_summary = full_report["technical"]
91
-
92
- print("\n--- Technical Recommendations ---")
93
- for i, risk in enumerate(technical_summary["key_risks"]):
94
- print(f"Risk {i+1}: {risk}")
95
- print(f"Recommendation: {technical_summary['recommendations'][i]}")
96
-
97
- # Example of Risk (if there were one):
98
- # Risk 1: The depth of the tree may be too high for the size of the dataset.
99
-
100
- # Recommendation: Reduce model complexity or adjust hyperparameters.
101
-
102
-
103
- ###2.3. Robustness Test
104
-
105
- A robust model should maintain its performance even with small disturbances in the data. The `ai-critic` test assesses this by injecting noise into the input data.
106
-
107
- ```python
108
- # Accessing the specific result of the Robustness module
109
- robustness_result = full_report["details"]["robustness"]
110
-
111
- print("\n--- Robustness Test ---")
112
- print(f"Original CV Score: {robustness_result['cv_score_original']:.4f}")
113
- print(f"CV Score with Noise: {robustness_result['cv_score_noisy']:.4f}")
114
- print(f"Performance Drop: {robustness_result['performance_drop']:.4f}")
115
- print(f"Robustness Verdict: {robustness_result['verdict']}")
116
-
117
- # Possible Verdicts:
118
- # - Stable: Acceptable drop.
119
-
120
- # - Fragile: Significant drop (risk).
121
-
122
- # - Misleading: Original performance inflated by leakage.
123
-
124
- ```
125
-
126
- ---
127
-
128
- ## ⚙️ 3. Integration and Governance (The Advanced)
129
-
130
- This section is for MLOps engineers and architects looking to integrate **ai-critic** into automated pipelines and create custom deployment logic.
131
-
132
- ###3.1. The Deployment Gate (`deploy_decision`)
133
-
134
- The `deploy_decision()` method is the final control point. It returns a structured object that classifies problems into *Hard Blockers* (prevent deployment) and *Soft Blockers* (require attention, but can be accepted with reservations).
135
-
136
- Python
137
- # Example of use in a CI/CD pipeline
138
- decision = critic.deploy_decision()
139
-
140
- if decision["deploy"]:
141
- print("✅ Deployment Approved. Risk Level: Low.")
142
- other:
143
- print(f"❌ Deployment Blocked. Risk Level: {decision['risk_level'].upper()}")
144
- print("Blocking Issues:")
145
- for issue in decision["blocking_issues"]:
146
- print(f"- {problem}")
147
-
148
- # The decision object also includes a heuristic confidence score (0.0 to 1.0)
149
- print(f"Heuristic Confidence in Model: {decision['confidence']:.2f}")
150
-
151
- ```
152
-
153
- ###3.2. AccessFor custom *governance* rules or logic, you can access the raw data of each module through the `"details"` view.
154
-
155
- ```python
156
- # Accessing Data Leakage Details
157
- data_details = critic.evaluate(view="details")["data"]
158
-
159
- if data_details["data_leakage"]["suspected"]:
160
-
161
- print("\n--- Data Leak Alert ---")
162
-
163
- for detail in data_details["data_leakage"]["details"]:
164
-
165
- print(f"Feature {detail['feature_index']} with correlation of {detail['correlation']:.4f}")
166
-
167
- # Accessing Structural Overfitting Details
168
- config_details = critic.evaluate(view="details")["config"]
169
-
170
- if config_details["structural_warnings"]:
171
-
172
- print("\n--- Structural Alert ---")
173
-
174
- for warning in config_details["structural_warnings"]:
175
-
176
- print(f"Warning: {warning['message']} (Max Depth: {warning['max_depth']}, Recommended: {warning['recommended_max_depth']})")
177
- ```
178
-
179
- ### 3.3. Best Practices and Use Cases
180
-
181
- | Use | Recommended Action |
182
- | :--- | :--- |
183
- | **CI/CD** | Use `deploy_decision()` as an automated quality gate. |
184
- | **Tuning** | Use the technical view to guide hyperparameter optimization. |
185
- | **Governance** | Log the details view for auditing and compliance. |
186
- | **Communication** | Use the executive view to report risks to non-technical stakeholders. |
187
-
188
- ---
189
-
190
- ## 📄 License
191
-
192
- Distributed under the **MIT License**.
193
-
194
- --
195
-
196
- ## 🧠 Final Note
197
-
198
- > **ai-critic** is not a benchmarking tool. It's a decision-making tool.
199
-
200
- If a model fails here, it doesn't mean it's "bad," but rather that it **shouldn't be trusted yet**. The goal is to inject the necessary skepticism to build truly robust AI systems.