ai-critic 1.2.0-py3-none-any.whl → 2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_critic/ai_suggestions/predictor.py +5 -0
- ai_critic/ai_suggestions/rules.py +3 -0
- ai_critic/cli.py +141 -0
- ai_critic/critic.py +81 -201
- ai_critic/evaluators/config.py +29 -5
- ai_critic/feedback/__init__.py +3 -0
- ai_critic/feedback/store.py +23 -0
- ai_critic/learning/__init__.py +13 -0
- ai_critic/learning/critic_model.py +25 -0
- ai_critic/learning/features.py +15 -0
- ai_critic/learning/policy.py +20 -0
- ai_critic/learning/recommender.py +26 -0
- ai_critic/learning/trainer.py +16 -0
- ai_critic/ml/suggester.py +63 -0
- ai_critic/telemetry/__init__.py +0 -0
- ai_critic/telemetry/anonymizer.py +9 -0
- ai_critic/telemetry/client.py +6 -0
- ai_critic/telemetry/event.py +15 -0
- ai_critic/telemetry/local_store.py +9 -0
- ai_critic/telemetry/schema.py +11 -0
- ai_critic/telemetry/sender.py +9 -0
- ai_critic-2.0.0.dist-info/METADATA +390 -0
- ai_critic-2.0.0.dist-info/RECORD +37 -0
- ai_critic-1.2.0.dist-info/METADATA +0 -290
- ai_critic-1.2.0.dist-info/RECORD +0 -18
- {ai_critic-1.2.0.dist-info → ai_critic-2.0.0.dist-info}/WHEEL +0 -0
- {ai_critic-1.2.0.dist-info → ai_critic-2.0.0.dist-info}/top_level.txt +0 -0

ai_critic/learning/recommender.py
@@ -0,0 +1,26 @@

```python
def recommend_changes(report):
    recs = []

    config = report["details"]["config"]
    perf = report["details"]["performance"]
    data = report["details"]["data"]

    if config["risk_level"] == "high":
        recs.append(
            "Reduce model complexity (e.g., lower max_depth, fewer estimators)."
        )

    if perf["suspiciously_perfect"]:
        recs.append(
            "Suspiciously perfect performance detected — verify data leakage."
        )

    if data["data_leakage"]["suspected"]:
        recs.append(
            "Potential target leakage — review feature engineering pipeline."
        )

    if not recs:
        recs.append("No critical changes recommended.")

    return recs
```
ai_critic/learning/trainer.py
@@ -0,0 +1,16 @@

```python
from .features import extract_features

class CriticTrainer:
    def __init__(self, critic_model, min_samples=10):
        self.model = critic_model
        self.min_samples = min_samples
        self.X = []
        self.y = []

    def add_feedback(self, report, success: bool):
        features = extract_features(report)
        self.X.append(list(features.values()))
        self.y.append(int(success))

        if len(self.y) >= self.min_samples:
            self.model.train(self.X, self.y)
```
ai_critic/ml/suggester.py
@@ -0,0 +1,63 @@

```python
# ai_critic/ml/suggester.py

def suggest_fix(event: dict) -> dict:
    """
    Lightweight ML-ready suggestion engine.
    Today: rule-based.
    Tomorrow: trained on global telemetry.
    """

    signals = event["signals"]
    score = event["score"]

    # 🔴 Critical cases
    if signals["leakage"] and signals["perfect_cv"]:
        return {
            "verdict": "critical",
            "suggestion": (
                "Strong evidence of data leakage. "
                "Audit features highly correlated with the target, "
                "remove shortcuts and re-run validation."
            )
        }

    # 🟠 Weak robustness
    if signals["robustness"] == "fragile":
        return {
            "verdict": "warning",
            "suggestion": (
                "Model is fragile under noise. "
                "Consider stronger regularization, "
                "a simpler architecture, or more data."
            )
        }

    # 🟠 Heavy structure
    if signals["structural"] == "high":
        return {
            "verdict": "warning",
            "suggestion": (
                "Model complexity may be too high for dataset size. "
                "Reduce depth, number of parameters, or features."
            )
        }

    # 🟢 Healthy case
    if score >= 85:
        return {
            "verdict": "ok",
            "suggestion": (
                "Model behavior looks consistent. "
                "No critical risks detected at this stage."
            )
        }

    # 🟡 Default
    return {
        "verdict": "review",
        "suggestion": (
            "No critical failures detected, "
            "but model could benefit from further validation "
            "and robustness checks."
        )
    }
```
ai_critic/telemetry/__init__.py
File without changes
ai_critic/telemetry/anonymizer.py
@@ -0,0 +1,9 @@

```python
def anonymize(report: dict) -> dict:
    return {
        "model_type": report["meta"]["model_type"],
        "score": report["score"]["global"],
        "signals": {
            "leakage": report["details"]["data"]["data_leakage"]["suspected"],
            "robustness": report["details"]["robustness"]["verdict"],
        }
    }
```
ai_critic/telemetry/event.py
@@ -0,0 +1,15 @@

```python
def build_event(report: dict) -> dict:
    return {
        "model_type": report["meta"]["model_type"],
        "framework": report["meta"]["framework"],
        "n_samples": report["meta"]["n_samples"],
        "n_features": report["meta"]["n_features"],
        "score": report["scores"]["global"],
        "risk_level": report["executive"]["risk_level"],
        "signals": {
            "leakage": report["details"]["data"]["data_leakage"]["suspected"],
            "perfect_cv": report["details"]["performance"]["suspiciously_perfect"],
            "robustness": report["details"]["robustness"]["verdict"],
            "structural": report["details"]["config"]["risk_level"],
        }
    }
```
ai_critic-2.0.0.dist-info/METADATA
@@ -0,0 +1,390 @@

Metadata-Version: 2.4
Name: ai-critic
Version: 2.0.0
Summary: Fast AI evaluator for scikit-learn models
Author-email: Luiz Seabra <filipedemarco@yahoo.com>
Requires-Python: >=3.9
Description-Content-Type: text/markdown
Requires-Dist: numpy
Requires-Dist: scikit-learn

# ai-critic 🧠

## The Quality Gate for Machine Learning Models

**ai-critic** is a specialized **decision-making system** designed to evaluate whether a machine learning model is **safe, reliable, and trustworthy enough** to be deployed in real-world environments.

Unlike traditional ML evaluation tools that focus almost exclusively on *performance metrics*, **ai-critic** operates as a **Quality Gate** — a final checkpoint that actively probes models to uncover **hidden risks** that frequently cause silent failures in production.

> **ai-critic does not ask *“How accurate is this model?”***
> It asks ***“Can this model be trusted in the real world?”***

---

## 🎯 What Problem Does ai-critic Solve?

In production, most ML failures are **not accuracy problems**.

They are caused by:

* Data leakage hidden inside features
* Overfitting disguised as strong validation scores
* Models that collapse under small noise
* Models that rely on a single fragile signal
* Configuration choices that look fine — but are structurally unsafe

These failures usually appear **after deployment**, when it is already expensive or dangerous to fix them.

**ai-critic exists to catch these failures *before* deployment.**

---

## 🚀 Getting Started (The Basics)

This section is intentionally designed for **beginners**, **students**, and **engineers under time pressure**.

If you only want a **fast, conservative verdict**, this is all you need.

---

### Installation

Install directly from PyPI:

```bash
pip install ai-critic
```

Python ≥ 3.9 is required.

---

### The Quick Verdict

With just a few lines of code, you can obtain:

* An **executive-level verdict**
* A **risk classification**
* A **deployment recommendation**

```python
from ai_critic import AICritic
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# 1. Prepare data and model
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    random_state=42
)

model = RandomForestClassifier(
    max_depth=5,
    random_state=42
)

# 2. Initialize the Critic
critic = AICritic(model, X, y)

# 3. Run the audit
report = critic.evaluate(view="executive")

print(f"Verdict: {report['verdict']}")
print(f"Risk Level: {report['risk_level']}")
print(f"Deploy Recommended: {report['deploy_recommended']}")
print(f"Main Reason: {report['main_reason']}")
```

**Example Output:**

```text
Verdict: ⚠️ Risky
Risk Level: medium
Deploy Recommended: False
Main Reason: Structural, robustness, or dependency-related risks detected.
```

This verdict is intentionally **conservative by design**.

> If **ai-critic approves deployment**, it means **no meaningful risks were detected** by multiple independent heuristics.

---

## 🧭 How to Read the Verdict

| Field | Meaning |
| -------------------- | ----------------------- |
| `verdict` | Human-readable summary |
| `risk_level` | low / medium / high |
| `deploy_recommended` | Final gate decision |
| `main_reason` | Primary blocking factor |

The goal is clarity, not ambiguity.

---

## 💡 Understanding the Critique (Intermediate Level)

This section is for **data scientists**, **ML engineers**, and **students** who want to understand *why* the model was flagged — and how to improve it.

---

### The Four Pillars of the Audit

**ai-critic** evaluates models across **four independent risk dimensions**.

| Pillar | What It Detects | Why It Matters |
| ------------------ | -------------------------------- | -------------------- |
| 📊 Data Integrity | Leakage, correlations, shortcuts | Inflated performance |
| 🧠 Model Structure | Over-complexity, unsafe configs | Poor generalization |
| 📈 Performance | Suspicious CV behavior | False confidence |
| 🧪 Robustness | Noise sensitivity | Production collapse |

Each pillar produces **signals**, not binary judgments.

Those signals are later aggregated by the **deployment gate**.
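
To see where each pillar surfaces in the output, inspect `report["details"]`. A minimal sketch, assuming a report that includes the `details` section (the field names follow `ai_critic/telemetry/event.py` in this release; treat the exact shape as an assumption, not a stable contract):

```python
# Field names taken from the telemetry code shipped in this release.
details = report["details"]

pillar_signals = {
    "data_integrity": details["data"]["data_leakage"]["suspected"],
    "structure": details["config"]["risk_level"],
    "performance": details["performance"]["suspiciously_perfect"],
    "robustness": details["robustness"]["verdict"],
}
print(pillar_signals)
```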

---

## 📊 Data Integrity Analysis

This pillar focuses on **the relationship between features and the target**.

It answers questions like:

* Are some features *too predictive*?
* Are there suspicious correlations?
* Does performance collapse when a single feature is disturbed?

These are classic symptoms of **data leakage** and **shortcut learning**.
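
The first of those questions can be pictured as a plain correlation scan. This is a toy sketch of the idea, reusing `X` and `y` from the quick start; the threshold is an illustrative assumption, not ai-critic's internal rule:

```python
import numpy as np

def near_perfect_predictors(X, y, threshold=0.95):
    """Flag features whose absolute correlation with the target is
    suspiciously high: a classic data-leakage smell. Illustrative only."""
    suspects = []
    for j in range(X.shape[1]):
        corr = np.corrcoef(X[:, j], y)[0, 1]
        if abs(corr) >= threshold:
            suspects.append((j, round(float(corr), 3)))
    return suspects

# A result like [(7, 0.981)] would warrant auditing feature 7.
print(near_perfect_predictors(X, y))
```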

---

## 🧠 Model Structure Analysis

A model can be accurate and still be unsafe.

Structural analysis looks for:

* Excessive depth
* Over-parameterization
* Configuration choices that amplify variance
* Inconsistent bias–variance tradeoffs

This is especially important for:

* Decision trees
* Boosting models
* Neural networks with limited data
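
As a rough intuition for why configuration alone can be a red flag, compare model capacity against data size. A toy heuristic (the thresholds are invented for illustration and are not ai-critic's actual rules):

```python
def structural_risk(max_depth, n_samples):
    """Toy heuristic: a tree of depth d can carve up to 2**d regions;
    with few samples per region, the model is likely memorizing
    rather than generalizing. Illustrative only."""
    samples_per_leaf = n_samples / (2 ** max_depth)
    if samples_per_leaf < 5:
        return "high"
    if samples_per_leaf < 20:
        return "medium"
    return "low"

print(structural_risk(max_depth=5, n_samples=1000))   # 31.25 per leaf -> "low"
print(structural_risk(max_depth=12, n_samples=1000))  # ~0.24 per leaf -> "high"
```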

---

## 📈 Performance Sanity Checks

Rather than optimizing metrics, **ai-critic questions them**.

It checks:

* Cross-validation stability
* Variance across folds
* Learning curve consistency
* Performance under perturbations

A strong score that behaves strangely is treated as **a warning, not a success**.
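
For instance, the `suspiciously_perfect` flag surfaced in the detailed report boils down to a check of this shape, sketched here with plain scikit-learn and `model`, `X`, `y` from the quick start (the thresholds are illustrative assumptions):

```python
import numpy as np
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5)

# Near-perfect mean with near-zero variance across folds is a red flag.
suspiciously_perfect = scores.mean() > 0.99 and scores.std() < 0.005
print(f"Fold scores: {np.round(scores, 4)}")
print(f"Suspiciously perfect: {suspiciously_perfect}")
```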

---

## 🧪 Robustness Testing (Noise Injection)

Production data is **never clean**.

This test injects controlled noise into inputs and measures degradation.

```python
robustness = report["details"]["robustness"]

print(f"Original CV Score: {robustness['cv_score_original']}")
print(f"Noisy CV Score: {robustness['cv_score_noisy']}")
print(f"Performance Drop: {robustness['performance_drop']}")
print(f"Verdict: {robustness['verdict']}")
```

Possible outcomes:

* `stable` → acceptable degradation
* `fragile` → high sensitivity
* `misleading` → performance likely inflated
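
Conceptually, the measurement behind these verdicts can be sketched as follows (the noise scale and cutoff are illustrative assumptions, not ai-critic's internals):

```python
import numpy as np
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(42)

# Perturb each feature with Gaussian noise at 10% of its own scale.
X_noisy = X + rng.normal(scale=0.1 * X.std(axis=0), size=X.shape)

cv_score_original = cross_val_score(model, X, y, cv=5).mean()
cv_score_noisy = cross_val_score(model, X_noisy, y, cv=5).mean()
performance_drop = cv_score_original - cv_score_noisy

verdict = "stable" if performance_drop < 0.05 else "fragile"
print(f"Drop: {performance_drop:.3f} -> {verdict}")
```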

---

## 🔍 Explainability & Feature Sensitivity

Accuracy alone hides *why* a model works.

The explainability module performs **feature sensitivity analysis** to detect:

* Feature-level leakage
* Over-reliance on a single signal
* Structural shortcuts

---

### How Explainability Works

For each feature:

1. The feature is randomly permuted.
2. The model is re-evaluated.
3. Performance drop is measured.

Large drops indicate **critical dependency**.
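
A minimal sketch of that permutation loop (scikit-learn's `permutation_importance` implements the same idea in library form; the cutoff here is illustrative):

```python
import numpy as np

model.fit(X, y)
baseline = model.score(X, y)

rng = np.random.default_rng(0)
for j in range(X.shape[1]):
    X_perm = X.copy()
    rng.shuffle(X_perm[:, j])  # break the feature/target relationship
    drop = baseline - model.score(X_perm, y)
    if drop > 0.2:             # illustrative cutoff
        print(f"Feature {j}: critical dependency (drop = {drop:.3f})")
```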

This approach is:

* Model-agnostic
* Lightweight
* Framework-independent
* Interpretable by humans

---

### Explainability Verdicts

| Verdict | Meaning |
| ---------------------- | ------------------------ |
| `stable` | Balanced feature usage |
| `feature_dependency` | Few features dominate |
| `feature_leakage_risk` | Single feature dominates |

These verdicts **directly affect**:

* Deployment decision
* Confidence score
* Recommendations

---

## 🧠 Recommendations Engine (New)

**ai-critic does not stop at “deploy or not”.**

It generates **actionable recommendations**, such as:

* “Reduce `max_depth`”
* “Increase regularization”
* “Likely feature leakage detected”
* “Model shows structural overfitting”
* “High noise sensitivity — retrain with augmentation”

These recommendations are **rule-based + data-driven**, not LLM hallucinations.
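
The rule-based layer is the `recommend_changes` helper added in `ai_critic/learning/recommender.py` (shown earlier in this diff). A usage sketch, assuming `report` includes the `details` section the recommender reads:

```python
from ai_critic.learning.recommender import recommend_changes

for rec in recommend_changes(report):
    print("-", rec)

# Possible output, given the rules shown earlier in this diff:
# - Reduce model complexity (e.g., lower max_depth, fewer estimators).
```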

---

## ⚙️ Deployment Gate

The final decision is produced by `deploy_decision()`.

```python
decision = critic.deploy_decision()

print(decision["deploy"])
print(decision["risk_level"])
print(decision["confidence"])
print(decision["blocking_issues"])
```

Conceptually:

* **Hard blockers** → deployment denied
* **Soft blockers** → deployment discouraged
* **Confidence score (0–1)** → heuristic trust
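
One way to picture that aggregation, as an illustrative sketch rather than the actual `deploy_decision()` logic:

```python
def gate(hard_blockers, soft_blockers):
    """Illustrative sketch of the hard/soft blocker idea; the weights
    and threshold are invented, not ai-critic's internals."""
    if hard_blockers:  # e.g. suspected leakage: deployment denied outright
        return {"deploy": False, "confidence": 0.0,
                "blocking_issues": hard_blockers}
    # Each soft blocker (e.g. fragile robustness) erodes confidence.
    confidence = max(0.0, 1.0 - 0.25 * len(soft_blockers))
    return {"deploy": confidence >= 0.5, "confidence": confidence,
            "blocking_issues": soft_blockers}

print(gate(hard_blockers=[], soft_blockers=["fragile_robustness"]))
# {'deploy': True, 'confidence': 0.75, 'blocking_issues': ['fragile_robustness']}
```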

---

## 🔄 Feedback Loop & Learning Critic

**ai-critic improves over time**.

Each evaluation can be stored as feedback:

* Model config
* Signals
* Final outcome
* Human override (optional)

This enables:

* Meta-learning
* Better future recommendations
* Context-aware criticism
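
The loop is driven by the `CriticTrainer` added in `ai_critic/learning/trainer.py` (shown earlier in this diff). A sketch using a stub in place of the real `CriticModel`, whose exact API this diff does not show; it also assumes `report` carries the fields `extract_features` expects:

```python
from ai_critic.learning.trainer import CriticTrainer

class StubCriticModel:
    """Stand-in for ai_critic.learning.critic_model.CriticModel;
    the trainer only requires a train(X, y) method."""
    def train(self, X, y):
        print(f"retraining on {len(y)} labeled evaluations")

trainer = CriticTrainer(StubCriticModel(), min_samples=10)

# After each audit, record whether the model actually worked out;
# once min_samples is reached, the critic model is retrained.
trainer.add_feedback(report, success=True)
```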

---

## 🧪 Session Tracking & Comparison

You can compare models over time:

```python
critic_v1 = AICritic(model, X, y, session="v1")
critic_v1.evaluate()

critic_v2 = AICritic(model, X, y, session="v2")
critic_v2.evaluate()

critic_v2.compare_with("v1")
```

Use cases:

* Regression detection
* Risk drift
* Governance audits

---

## ⚙️ Multi-Framework Support

The same API works for:

* scikit-learn
* PyTorch
* TensorFlow

Adapters handle training, evaluation, and probing internally.
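
For example, a PyTorch module should slot into the same entry point. This is a sketch extrapolated from the quick start and the "same API" claim above; the adapter selection and any extra arguments it may need are assumptions:

```python
import torch.nn as nn
from ai_critic import AICritic

torch_model = nn.Sequential(
    nn.Linear(20, 16),
    nn.ReLU(),
    nn.Linear(16, 2),
)

# Same entry point as the scikit-learn example; the PyTorch adapter
# is expected to handle fit/evaluate/probing internally.
critic = AICritic(torch_model, X, y)
report = critic.evaluate(view="executive")
```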

---

## 🧩 Design Philosophy

**ai-critic is intentionally skeptical.**

It assumes:

* Metrics can lie
* Data is imperfect
* Models fail silently
* Confidence must be earned

This makes it ideal as a **final gate**, not a tuning toy.

---

## 🛡️ What ai-critic Is NOT

* ❌ A hyperparameter optimizer
* ❌ A leaderboard benchmark tool
* ❌ A replacement for domain expertise
* ❌ A magic “approve all” system

---

## 🧠 Final Note

> **ai-critic is not here to make models look good.**
> It exists to **prevent bad models from looking good enough to deploy**.

A failed audit does **not** mean your model is bad.
It means your model is **not yet safe to trust**.

That distinction is everything.
ai_critic-2.0.0.dist-info/RECORD
@@ -0,0 +1,37 @@

```text
ai_critic/__init__.py,sha256=H6DlPMmbcFUamhsNULPLk9vHx81XCiXuKKf63EJ8eM0,53
ai_critic/cli.py,sha256=4rf9g-CjtYaS9jjLPYWI56Z_6JLHhKh4KSbELYWwsX8,4133
ai_critic/critic.py,sha256=4rf9g-CjtYaS9jjLPYWI56Z_6JLHhKh4KSbELYWwsX8,4133
ai_critic/ai_suggestions/predictor.py,sha256=pn20sG1MjXuzXSquu0IqAcSzM2Y3gROHTq8zRSle0RM,155
ai_critic/ai_suggestions/rules.py,sha256=ZbCtPc5OZgPGzqgYNXf43C2aVdGWKsjhQ-_zdsmtftw,132
ai_critic/evaluators/__init__.py,sha256=ri6InmL8_LIcO-JZpU_gEFKLO4URdqo3z6rh7fV6M8Y,169
ai_critic/evaluators/adapters.py,sha256=8Xw9Ccg1iGVNwVQDGVIqhWj5-Sg6evqCZhg21u8EP20,3068
ai_critic/evaluators/config.py,sha256=eeHOqyU-GOMFdQlhoyJnsaEZYMF1XewyoFKYP7d0o-w,1911
ai_critic/evaluators/data.py,sha256=YAK5NkwCeJOny_UueZ5ALwvEcRDIbEck404eV2oqWnc,1871
ai_critic/evaluators/explainability.py,sha256=UWbcb5uVI78d1ljfdrWd2DrjlwEz1y9CeVtkukefEfA,1759
ai_critic/evaluators/performance.py,sha256=1CQx5DueK0XkelYyJnAGRJ3AjQtjsKeW8_1JQZqKVOI,1973
ai_critic/evaluators/robustness.py,sha256=mfVQ67Z6t6aRvtIq-XQEQYbwvyf8UefM1myeOGVrnAE,1869
ai_critic/evaluators/scoring.py,sha256=9rgkCXKKm9G1Lfwn5i9HcsJTN5OUjxMycOUzhWkp_2g,1576
ai_critic/evaluators/summary.py,sha256=H9rU9tXAXqyQ34L6bOOOHrdIapSq71gcjjc8jfyJMq4,5003
ai_critic/evaluators/validation.py,sha256=rnzRwD78Cugey33gl9geE8JoBURsKEEnqrIOhBZv0LY,904
ai_critic/feedback/__init__.py,sha256=JWzTxV8ycpoQPrHuwWzwmuDlpVAL-vzUx2dXy1_7z9c,62
ai_critic/feedback/store.py,sha256=BpWAM9byj-zfFG6cjxNOvVDoEPP5TUe5DSiAgVQ_8Rg,606
ai_critic/learning/__init__.py,sha256=umxvyyh8bKdyZP5tRidJUJ-mqeRlTkNcV0HE7_0qHTU,318
ai_critic/learning/critic_model.py,sha256=KqTOYjk4DsdHjcps-IZO-Udx0KPq6fHVWD9w8OViWyo,708
ai_critic/learning/features.py,sha256=T5thN96ZBv63gSKxHARc87al2UJP5AIF4_4SAPt1fuE,663
ai_critic/learning/policy.py,sha256=c2bAtvUu6DK1KDtxTScK-C2xMcQC-HvXmPxpLwggygg,511
ai_critic/learning/recommender.py,sha256=DxL_C-oqahJJ_u69486JyxBNszYibbru1YBV-TyWgRQ,715
ai_critic/learning/trainer.py,sha256=UsSS_1QMYQNf4UCGBvbVtWr3n60qC6ZUMP9AmBilKz8,497
ai_critic/ml/suggester.py,sha256=DKq5NjT7oLBXVw2jNtDLeSjzamV7un2QPqxMHYRIh1Q,1757
ai_critic/sessions/__init__.py,sha256=Yp7mphSPJwt8a4cJgcQNErqwqHVuP_xAJODrs0y0Abw,72
ai_critic/sessions/store.py,sha256=65m9WXFVFWv4pPzvXV4l8zLHoHWMfCGe6eHh4X-8agY,947
ai_critic/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
ai_critic/telemetry/anonymizer.py,sha256=Mg40ZZ1U8vbUEar_n3ZtrDwTfLvJB-qnRlmqXEDAsnk,332
ai_critic/telemetry/client.py,sha256=CuEibfDkI5Y_Y1yV8BT0aXEjFgZurcL5p5Pk7yfpTz4,130
ai_critic/telemetry/event.py,sha256=9n2tGmrrAyFhx26Yb3wUImO72ANJc06ZM8yPs4Q48Uc,697
ai_critic/telemetry/local_store.py,sha256=zZ3CgvHYMUhwgpa_VuITI4rlPHCV4yJKxxYjIGoijfA,225
ai_critic/telemetry/schema.py,sha256=1fbZqmEmgvmM4FNmhI7O2sQfUaGEZty5SzqLYVo3y0g,200
ai_critic/telemetry/sender.py,sha256=rsxKbmbT2UvoT_oFK8mQxdcqZRfHLRgrkozHJqbsDps,228
ai_critic-2.0.0.dist-info/METADATA,sha256=MUuBILCf0XNrD0D8kjdOa7HX-ENYPqW4AGfNsBlpajQ,9293
ai_critic-2.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
ai_critic-2.0.0.dist-info/top_level.txt,sha256=TRyZkm1vyLLcFDg_80yeg5cHvPis_oW1Ti170417jkw,10
ai_critic-2.0.0.dist-info/RECORD,,
```