sage-governance 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.MD +481 -0
- package/LICENSE +21 -0
- package/README.md +319 -0
- package/bin/sage.js +55 -0
- package/claude.json +16 -0
- package/codex.json +22 -0
- package/cursor.json +27 -0
- package/docs/architecture.md +38 -0
- package/opencode.json +24 -0
- package/package.json +58 -0
- package/requirements.txt +7 -0
- package/rules/general/EU_AI_Act_Annex_III.md +29 -0
- package/rules/general/OECD_Principles.md +20 -0
- package/rules/general/UNESCO_AI_Ethics.md +237 -0
- package/rules/general/UN_Human_Rights.md +183 -0
- package/rules/index.json +145 -0
- package/sage/mcp_server.py +459 -0
- package/sage/report_gen.py +408 -0
- package/sage/sage_agent.py +710 -0
- package/sage/security_agent.py +455 -0
- package/sage/startup.py +311 -0
|
@@ -0,0 +1,710 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sage_agent.py — SAGE Intent Classifier & Ethics Evaluator
|
|
3
|
+
══════════════════════════════════════════════════════════
|
|
4
|
+
Classifies developer intent, evaluates ethics/fairness/regulatory compliance,
|
|
5
|
+
and generates structured tradeoff options.
|
|
6
|
+
|
|
7
|
+
DESIGN DECISIONS
|
|
8
|
+
────────────────
|
|
9
|
+
1. DETERMINISTIC FIRST — domain detection, risk scoring, proxy detection,
|
|
10
|
+
and tradeoff selection are all rule-based. These never fail.
|
|
11
|
+
|
|
12
|
+
2. LLM SECOND — the LLM is used ONLY to generate a human-readable reasoning
|
|
13
|
+
summary. If the LLM is unavailable or fails, a deterministic fallback
|
|
14
|
+
summary is returned. The Pydantic schema is ALWAYS satisfied.
|
|
15
|
+
|
|
16
|
+
3. PYDANTIC CONTRACT — every output goes through SageEvaluateResponse.
|
|
17
|
+
The schema is the contract Roshan builds opencode.json against.
|
|
18
|
+
It cannot silently change shape.
|
|
19
|
+
|
|
20
|
+
Author: SAGE Team / Team SAGE (Hackathon)
|
|
21
|
+
License: MIT
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import re
|
|
27
|
+
import sys
|
|
28
|
+
from typing import Any, Literal, Optional
|
|
29
|
+
|
|
30
|
+
from pydantic import BaseModel, field_validator
|
|
31
|
+
|
|
32
|
+
# startup is always imported first — it pre-loads all globals
|
|
33
|
+
from startup import (
|
|
34
|
+
EU_AI_ACT_ANNEX_III,
|
|
35
|
+
FAIRLEARN_AVAILABLE,
|
|
36
|
+
LLM_AVAILABLE,
|
|
37
|
+
LLM_CLIENT,
|
|
38
|
+
LLM_MODEL,
|
|
39
|
+
POLICY_INDEX,
|
|
40
|
+
PROTECTED_ATTRIBUTES,
|
|
41
|
+
PROXY_ATTRIBUTE_MAP,
|
|
42
|
+
UDHR_ARTICLE_MAP,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
46
|
+
# PYDANTIC MODELS — the immutable contract
|
|
47
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class FairnessOption(BaseModel):
    """One concrete fairness strategy with full tradeoff analysis.

    Instances are plain data records: every field except ``diffprivlib_api``
    is required, and no validators or derived values are attached here.
    """

    name: str  # human-readable title of the strategy
    definition: str  # what the strategy guarantees
    stakeholder_view: str  # Northpointe / ProPublica / GDPR framing
    pros: list[str]  # arguments in favour of the strategy
    cons: list[str]  # known drawbacks of the strategy
    fairlearn_api: str  # API snippet, or a "# ..." placeholder string
    diffprivlib_api: Optional[str] = None  # optional; defaults to None when omitted
    who_benefits: str  # who gains if this option is chosen
    tradeoff_cost: str  # what is given up by choosing this option
    compatible_with: list[str]  # other options it can coexist with
    incompatible_with: list[str]  # Fairness Impossibility Theorem entries
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class SageEvaluateResponse(BaseModel):
    """
    The complete SAGE evaluation result.
    Roshan: build opencode.json against THIS schema. It will not change shape.

    All fields are required. ``eu_ai_act_annex`` and ``eu_ai_act_category``
    accept ``None`` but must still be passed explicitly.
    """

    risk_level: Literal["LOW", "MEDIUM", "HIGH", "CRITICAL"]
    eu_ai_act_annex: Optional[str]
    eu_ai_act_category: Optional[str]
    udhr_articles: list[str]
    protected_attributes: list[str]
    proxy_attributes: list[str]
    detected_domain: str
    intent_summary: str  # capped at 300 characters by truncate_summary
    compliance_flags: list[str]
    fairness_options: list[FairnessOption]
    immediate_actions: list[str]
    regulations: list[str]
    requires_human_review: bool
    sage_reasoning: str  # LLM-enriched or deterministic fallback
    fairness_impossibility: bool  # True when base rates differ across groups

    @field_validator("intent_summary")
    @classmethod
    def truncate_summary(cls, v: str) -> str:
        """Keep at most the first 300 characters of the summary."""
        return v[:300]

    @field_validator("risk_level")
    @classmethod
    def validate_risk(cls, v: str) -> str:
        """Reject any risk level outside the four allowed labels.

        Raises:
            ValueError: if *v* is not LOW, MEDIUM, HIGH or CRITICAL. Pydantic
                reports this to callers as a validation error.
        """
        # An explicit raise is used instead of ``assert`` because assertions
        # are removed when Python runs with -O, which would silently disable
        # this check.
        if v not in ("LOW", "MEDIUM", "HIGH", "CRITICAL"):
            raise ValueError(
                f"risk_level must be one of LOW, MEDIUM, HIGH, CRITICAL; got {v!r}"
            )
        return v
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
101
|
+
# FAIRNESS OPTION LIBRARY
|
|
102
|
+
# Nine options: fairness and privacy frameworks from the COMPAS/SAMPLE_CASES evidence base, plus child-safeguarding and cybersecurity tradeoffs.
|
|
103
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
104
|
+
|
|
105
|
+
# Catalogue of every option SAGE can recommend, keyed by the identifiers that
# the entries' compatible_with / incompatible_with lists refer to.
_FAIRNESS_LIBRARY: dict[str, FairnessOption] = {
    # ── Group-fairness metrics ────────────────────────────────────────────────
    "equalized_odds": FairnessOption(
        name="Equalized Odds",
        definition=(
            "Equal true-positive AND false-positive rates across all groups. "
            "The model makes equally frequent mistakes for every demographic."
        ),
        stakeholder_view=(
            "ProPublica standard — 'An innocent Black defendant must not be "
            "twice as likely to be wrongly flagged as high-risk.'"
        ),
        pros=[
            "Directly addresses the ProPublica COMPAS critique",
            "Protects individuals from being misclassified at higher rates by group",
            "Supported natively by Fairlearn ExponentiatedGradient",
            "Satisfies EU AI Act Article 10 non-discrimination requirements",
        ],
        cons=[
            "Mathematically incompatible with Demographic Parity and Predictive Parity "
            "when base rates differ (Fairness Impossibility Theorem)",
            "Requires protected attributes at training time — GDPR tension",
            "Reducing FPR for one group may increase FNR for another",
        ],
        fairlearn_api="fairlearn.reductions.ExponentiatedGradient(estimator, EqualizedOdds())",
        who_benefits=(
            "Individuals from groups incorrectly classified at higher rates "
            "(e.g. Black defendants in COMPAS, women in credit scoring)"
        ),
        tradeoff_cost="Overall accuracy drop; requires demographic data collection",
        compatible_with=[],
        incompatible_with=["demographic_parity", "predictive_parity"],
    ),

    "demographic_parity": FairnessOption(
        name="Demographic Parity",
        definition=(
            "Equal fraction of positive predictions across all groups, "
            "regardless of ground-truth differences."
        ),
        stakeholder_view=(
            "Equality-of-outcome view — 'Force proportional representation now, "
            "because historical data cannot be trusted.'"
        ),
        pros=[
            "Eliminates visible representation disparity in predictions",
            "Simple to measure and communicate to non-technical stakeholders",
            "No protected attributes needed at inference time",
        ],
        cons=[
            "Ignores actual underlying differences in ground-truth rates",
            "Mathematically incompatible with Equalized Odds and Predictive Parity",
            "Can produce individually unfair outcomes (penalizes qualified individuals)",
        ],
        fairlearn_api="fairlearn.metrics.demographic_parity_difference(y_true, y_pred, sensitive_features=sf)",
        who_benefits="Groups underrepresented in positive predictions at the population level",
        tradeoff_cost="Individual fairness; overall predictive accuracy; requires base-rate trust",
        compatible_with=[],
        incompatible_with=["equalized_odds", "predictive_parity"],
    ),

    "predictive_parity": FairnessOption(
        name="Predictive Parity (Calibration)",
        definition=(
            "Equal positive predictive value (precision) across groups — "
            "a given score carries the same probability of the outcome for all groups."
        ),
        stakeholder_view=(
            "Northpointe standard — 'If a score of 7 means 70% recidivism for "
            "white defendants, it must mean the same for Black defendants.'"
        ),
        pros=[
            "Ensures score interpretability is equal across groups",
            "Does not require protected attributes at inference time",
            "Defends calibration when base rates genuinely differ in the real world",
        ],
        cons=[
            "Permits systematically higher false-positive rates for disadvantaged groups",
            "Northpointe used this argument while ProPublica demonstrated concrete harm",
            "When base rates differ, Fairness Impossibility Theorem guarantees "
            "this metric co-exists with unequal error rates",
        ],
        # Predictive parity compares precision (PPV) per group. selection_rate
        # would measure demographic parity instead, and MetricFrame also needs
        # y_true / y_pred, so the snippet spells both out.
        fairlearn_api=(
            "fairlearn.metrics.MetricFrame(metrics=sklearn.metrics.precision_score, "
            "y_true=y_true, y_pred=y_pred, sensitive_features=sf).by_group"
        ),
        who_benefits=(
            "System operators who need calibrated scores; "
            "groups with higher true base rates"
        ),
        tradeoff_cost="Equal error rates; protection from individual misclassification",
        compatible_with=[],
        incompatible_with=["equalized_odds", "demographic_parity"],
    ),

    # ── Privacy and post-processing strategies ───────────────────────────────
    "differential_privacy": FairnessOption(
        name="Differential Privacy (ε-DP)",
        definition=(
            "Formal mathematical guarantee: adding or removing any single "
            "training record changes model output by at most e^ε — "
            "individual records cannot be reconstructed."
        ),
        stakeholder_view=(
            "Privacy-first — 'Protect every individual in the training set "
            "regardless of group membership.'"
        ),
        pros=[
            "Provable protection against model inversion and membership inference attacks",
            "Satisfies GDPR data minimization principle by design",
            "Strong regulatory story under GDPR Article 25 (privacy by design)",
        ],
        cons=[
            "Noise injection disproportionately harms underrepresented groups — "
            "DP can WORSEN fairness for minorities",
            "Significant accuracy cost; privacy budget ε directly trades off with utility",
            "diffprivlib supports only Logistic Regression and select estimators",
        ],
        fairlearn_api="diffprivlib.models.LogisticRegression(epsilon=1.0)",
        diffprivlib_api="diffprivlib.models.LogisticRegression(epsilon=1.0)",
        who_benefits="All individuals in the training set; regulatory compliance teams",
        tradeoff_cost=(
            "Model accuracy; minority group fairness (noise amplification); "
            "limited model family choices"
        ),
        compatible_with=["demographic_parity"],
        incompatible_with=[],
    ),

    "threshold_optimizer": FairnessOption(
        name="Post-Processing Threshold Optimizer",
        definition=(
            "Train a race-blind model, then adjust decision thresholds "
            "per group at deployment time to satisfy Equalized Odds — "
            "protected attributes used only for threshold calibration, "
            "never as model features."
        ),
        stakeholder_view=(
            "Pragmatic compromise — 'The model itself is blind; only the "
            "deployment policy uses demographic information.'"
        ),
        pros=[
            "Model weights do not encode protected attributes — lower model inversion risk",
            "Strong accuracy (uses GradientBoosting as base model)",
            "Achieves Equalized Odds without embedding sensitive features in the model",
            "Threshold table is a small, auditable, human-readable artifact",
        ],
        cons=[
            "Requires protected attributes at inference time for threshold lookup",
            "Still technically processes demographic data — GDPR Art. 9 consideration",
            "GradientBoosting base model is a black box (EU AI Act transparency tension)",
        ],
        fairlearn_api=(
            "from fairlearn.postprocessing import ThresholdOptimizer\n"
            "ThresholdOptimizer(estimator=clf, constraints='equalized_odds').fit(X, y, sensitive_features=sf)"
        ),
        who_benefits="Teams wanting Equalized Odds without encoding race into model weights",
        tradeoff_cost="Slight accuracy drop; still requires demographic data collection",
        compatible_with=["equalized_odds"],
        incompatible_with=["demographic_parity"],
    ),

    # ── Child-safeguarding policies (fairlearn_api holds a placeholder) ──────
    "human_in_loop_escalation": FairnessOption(
        name="Human-in-the-Loop Escalation",
        definition=(
            "Flagged activities are routed to a human moderator before action. "
            "Auto-escalate is reserved only for high-confidence P0 cases."
        ),
        stakeholder_view=(
            "Respect child autonomy and prevent arbitrary censorship by enforcing human review."
        ),
        pros=[
            "Prevents false positive blocks",
            "Ensures human oversight (EU AI Act Art 14)",
            "Protects children from automated censorship",
        ],
        cons=[
            "High latency in safety intervention",
            "Moderator mental health exposure",
            "High operational costs",
        ],
        fairlearn_api="# Safeguarding: human_in_the_loop_review",
        who_benefits="Minors experiencing false-positive flags, privacy advocates",
        tradeoff_cost="Intervention latency, moderator overhead",
        compatible_with=["metadata_only_retention"],
        incompatible_with=[],
    ),

    "metadata_only_retention": FairnessOption(
        name="Metadata-Only Retention",
        definition=(
            "Log only anonymized metadata (user ID hash, timestamp, alert type) "
            "for compliance. Discard message contents immediately."
        ),
        stakeholder_view=(
            "GDPR Article 25 Privacy by Design — protect children's communication "
            "from data breaches."
        ),
        pros=[
            "Minimizes privacy violation risk",
            "Limits data breach liability",
            "Compliant with UN CRC Article 16 (privacy)",
        ],
        cons=[
            "Impossible to perform manual post-audit",
            "Cannot retrain model on false negatives/positives",
            "Limits evidence gathering for active abuse",
        ],
        fairlearn_api="# Safeguarding: metadata_only_retention",
        who_benefits="Minors in the dataset, data protection officers",
        tradeoff_cost="Model auditability, future training data collection",
        compatible_with=["human_in_loop_escalation"],
        incompatible_with=[],
    ),

    "recall_first_detection": FairnessOption(
        name="Recall-First Detection (FN vs FP)",
        definition=(
            "Optimize model threshold to minimize False Negatives (FN). "
            "Flag all potential safety issues, accepting higher False Positives (FP)."
        ),
        stakeholder_view=(
            "Safety first — a missed grooming or abuse attempt is far worse than a false flag."
        ),
        pros=[
            "Maximizes protection of child safety",
            "Captures subtle grooming patterns early",
            "Allows swift intervention",
        ],
        cons=[
            "High rate of false alerts",
            "Moderator fatigue from noise",
            "Disrupts user experience with false blocks",
        ],
        fairlearn_api="# Safeguarding: recall_first_optimization",
        who_benefits="Vulnerable children, child protection teams",
        tradeoff_cost="Moderator burden, user disruption",
        compatible_with=["human_in_loop_escalation"],
        incompatible_with=[],
    ),

    # ── Cybersecurity tradeoff ───────────────────────────────────────────────
    "cybersecurity_tradeoff": FairnessOption(
        name="Transparency vs. Gaming & Inversion",
        definition=(
            "Balance public transparency of model details (weights, features) against "
            "the risk of adversarial gaming (manipulation of features by users) "
            "and model inversion (reconstructing training data)."
        ),
        stakeholder_view=(
            "Security-first — limit public exposure of model parameters to prevent exploitation."
        ),
        pros=[
            "Prevents users from gaming the scoring system",
            "Protects proprietary model IP",
            "Reduces risk of model inversion attacks",
        ],
        cons=[
            "Violates EU AI Act Article 13 transparency requirements",
            "Prevents independent public audits of bias",
            "Restricts user right to explanation",
        ],
        fairlearn_api="# Cybersecurity: adversarial training and API rate limiting / model obfuscation",
        who_benefits="System operators, database administrators, model providers",
        tradeoff_cost="Transparency, public auditability, explaining individual predictions",
        compatible_with=["equalized_odds", "differential_privacy", "demographic_parity"],
        incompatible_with=[],
    ),
}
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
371
|
+
# DETERMINISTIC DETECTION FUNCTIONS
|
|
372
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _detect_domain(text: str) -> tuple[str, str | None, list[str]]:
    """
    Classify *text* into the first EU AI Act Annex III domain whose keywords it contains.

    Matching is a case-insensitive substring test against each domain's
    ``keywords``; domains are tried in the iteration order of
    EU_AI_ACT_ANNEX_III and the first hit wins.

    Returns (domain_key, annex, udhr_articles). When nothing matches the
    result is ("general", None, UDHR_ARTICLE_MAP["general"]).
    """
    haystack = text.lower()
    for key, entry in EU_AI_ACT_ANNEX_III.items():
        for keyword in entry["keywords"]:
            if keyword in haystack:
                # Domains without a UDHR mapping default to Article 2.
                articles = UDHR_ARTICLE_MAP.get(key, ["Article 2"])
                return key, entry["annex"], articles
    return "general", None, UDHR_ARTICLE_MAP["general"]
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def _detect_protected_attributes(text: str) -> list[str]:
|
|
388
|
+
"""Detects protected attributes mentioned directly."""
|
|
389
|
+
text_lower = text.lower()
|
|
390
|
+
found = []
|
|
391
|
+
for attr in PROTECTED_ATTRIBUTES:
|
|
392
|
+
pattern = r"\b" + re.escape(attr.replace("_", "[ _]?")) + r"\b"
|
|
393
|
+
if re.search(pattern, text_lower):
|
|
394
|
+
found.append(attr)
|
|
395
|
+
return list(set(found))
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _detect_proxy_attributes(text: str, protected_attrs: list[str]) -> list[str]:
|
|
399
|
+
"""
|
|
400
|
+
Detects proxy attributes using the semantic PROXY_ATTRIBUTE_MAP.
|
|
401
|
+
Always scans all proxy attributes to prevent missing proxy features
|
|
402
|
+
when a different protected attribute is explicitly specified.
|
|
403
|
+
"""
|
|
404
|
+
text_lower = text.lower()
|
|
405
|
+
found: list[str] = []
|
|
406
|
+
for group, proxies in PROXY_ATTRIBUTE_MAP.items():
|
|
407
|
+
for proxy in proxies:
|
|
408
|
+
if proxy.lower() in text_lower:
|
|
409
|
+
found.append(proxy)
|
|
410
|
+
return list(set(found))
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def _score_risk(
|
|
414
|
+
domain: str,
|
|
415
|
+
protected_attrs: list[str],
|
|
416
|
+
proxy_attrs: list[str],
|
|
417
|
+
annex: str | None,
|
|
418
|
+
) -> Literal["LOW", "MEDIUM", "HIGH", "CRITICAL"]:
|
|
419
|
+
"""Deterministic risk scoring. No LLM involved."""
|
|
420
|
+
score = 0
|
|
421
|
+
if annex:
|
|
422
|
+
score += 3
|
|
423
|
+
score += min(len(protected_attrs) * 2, 4)
|
|
424
|
+
score += min(len(proxy_attrs), 2)
|
|
425
|
+
if domain in ("criminal_justice", "children_safety"):
|
|
426
|
+
score += 2
|
|
427
|
+
elif domain in ("credit_scoring", "employment"):
|
|
428
|
+
score += 1
|
|
429
|
+
|
|
430
|
+
if score >= 7:
|
|
431
|
+
return "CRITICAL"
|
|
432
|
+
if score >= 5:
|
|
433
|
+
return "HIGH"
|
|
434
|
+
if score >= 3:
|
|
435
|
+
return "MEDIUM"
|
|
436
|
+
return "LOW"
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def _build_compliance_flags(
|
|
440
|
+
domain: str,
|
|
441
|
+
protected_attrs: list[str],
|
|
442
|
+
proxy_attrs: list[str],
|
|
443
|
+
annex: str | None,
|
|
444
|
+
) -> list[str]:
|
|
445
|
+
flags: list[str] = []
|
|
446
|
+
if protected_attrs:
|
|
447
|
+
flags.append(
|
|
448
|
+
f"PROTECTED ATTRIBUTES IN FEATURE SET: {', '.join(protected_attrs)} — "
|
|
449
|
+
"direct use may violate GDPR Article 9 and EU AI Act Article 10"
|
|
450
|
+
)
|
|
451
|
+
if proxy_attrs:
|
|
452
|
+
flags.append(
|
|
453
|
+
f"PROXY DISCRIMINATION RISK: {', '.join(proxy_attrs)} — "
|
|
454
|
+
"these features may encode protected characteristics indirectly"
|
|
455
|
+
)
|
|
456
|
+
if annex:
|
|
457
|
+
flags.append(
|
|
458
|
+
f"EU AI ACT HIGH-RISK CLASSIFICATION: {annex} — "
|
|
459
|
+
"mandatory conformity assessment, DPIA, and bias monitoring required pre-deployment"
|
|
460
|
+
)
|
|
461
|
+
if domain == "criminal_justice":
|
|
462
|
+
flags.append(
|
|
463
|
+
"FAIRNESS IMPOSSIBILITY THEOREM: Demographic Parity, Equalized Odds, and "
|
|
464
|
+
"Predictive Parity cannot all be satisfied simultaneously when base rates differ. "
|
|
465
|
+
"A values choice must be documented."
|
|
466
|
+
)
|
|
467
|
+
if domain == "children_safety":
|
|
468
|
+
flags.append(
|
|
469
|
+
"CHILDREN'S RIGHTS: UN CRC Articles 3, 12, 16, 19, 34 apply. "
|
|
470
|
+
"Explicit escalation policy, data retention limits, and false-negative tolerance "
|
|
471
|
+
"must be defined and documented before proceeding."
|
|
472
|
+
)
|
|
473
|
+
if domain == "employment":
|
|
474
|
+
flags.append(
|
|
475
|
+
"AD DELIVERY BIAS: Algorithm optimization for engagement may produce "
|
|
476
|
+
"gender/race disparities even with neutral targeting (Ali et al. 2019, Lambrecht & Tucker 2019). "
|
|
477
|
+
"Audit delivery outcomes, not just model inputs."
|
|
478
|
+
)
|
|
479
|
+
return flags
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def _select_fairness_options(domain: str) -> list[FairnessOption]:
    """
    Pick the three library options offered for *domain*, in presentation order.

    children_safety gets the three safeguarding policies and employment
    leads with demographic parity. Every other domain, including
    criminal_justice, credit_scoring and unknown ones, gets equalized odds,
    differential privacy and the cybersecurity tradeoff.
    """
    if domain == "children_safety":
        chosen = [
            "human_in_loop_escalation",
            "metadata_only_retention",
            "recall_first_detection",
        ]
    elif domain == "employment":
        chosen = [
            "demographic_parity",
            "differential_privacy",
            "cybersecurity_tradeoff",
        ]
    else:
        chosen = [
            "equalized_odds",
            "differential_privacy",
            "cybersecurity_tradeoff",
        ]

    selected: list[FairnessOption] = []
    for key in chosen:
        selected.append(_FAIRNESS_LIBRARY[key])
    return selected
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def _build_regulations(domain: str, annex: str | None) -> list[str]:
|
|
503
|
+
regs: list[str] = []
|
|
504
|
+
if annex:
|
|
505
|
+
domain_info = EU_AI_ACT_ANNEX_III.get(domain, {})
|
|
506
|
+
regs.append(f"EU AI Act {annex}: {domain_info.get('description', '')}")
|
|
507
|
+
regs.append("GDPR Article 9 — Processing of special category personal data")
|
|
508
|
+
regs.append("UNESCO Recommendation on AI Ethics (2021) — Principle 7: Fairness & Non-Discrimination")
|
|
509
|
+
regs.append("OECD AI Principles (2019) — Principle 1.3: Fairness & Inclusivity")
|
|
510
|
+
if domain == "criminal_justice":
|
|
511
|
+
regs += [
|
|
512
|
+
"UDHR Article 7 — Equal protection under the law",
|
|
513
|
+
"UDHR Article 10 — Right to fair and public hearing",
|
|
514
|
+
"ECHR Article 6 — Right to fair trial",
|
|
515
|
+
"State v. Loomis (2016) — Due process in algorithmic sentencing",
|
|
516
|
+
]
|
|
517
|
+
elif domain == "employment":
|
|
518
|
+
regs += [
|
|
519
|
+
"UDHR Article 23 — Right to work without discrimination",
|
|
520
|
+
"EU Charter Article 21 — Non-discrimination",
|
|
521
|
+
"Equal Employment Opportunity Act (US)",
|
|
522
|
+
]
|
|
523
|
+
elif domain == "credit_scoring":
|
|
524
|
+
regs += [
|
|
525
|
+
"UDHR Article 22 — Right to social security",
|
|
526
|
+
"Equal Credit Opportunity Act (US, 1974)",
|
|
527
|
+
"EU Charter Article 21 — Non-discrimination in financial services",
|
|
528
|
+
]
|
|
529
|
+
elif domain == "children_safety":
|
|
530
|
+
regs += [
|
|
531
|
+
"UN Convention on the Rights of the Child Articles 3, 12, 16, 19, 34",
|
|
532
|
+
"UNICEF Guidance on AI and Children v3.0 (2025)",
|
|
533
|
+
]
|
|
534
|
+
return regs
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _build_immediate_actions(
|
|
538
|
+
risk_level: str,
|
|
539
|
+
protected_attrs: list[str],
|
|
540
|
+
proxy_attrs: list[str],
|
|
541
|
+
domain: str,
|
|
542
|
+
annex: str | None,
|
|
543
|
+
) -> list[str]:
|
|
544
|
+
actions: list[str] = []
|
|
545
|
+
if risk_level in ("HIGH", "CRITICAL"):
|
|
546
|
+
actions.append(
|
|
547
|
+
"Conduct Data Protection Impact Assessment (DPIA) before deployment — "
|
|
548
|
+
"required by GDPR Article 35 for high-risk processing"
|
|
549
|
+
)
|
|
550
|
+
if protected_attrs:
|
|
551
|
+
actions.append(
|
|
552
|
+
f"Evaluate whether {', '.join(protected_attrs)} can be removed from "
|
|
553
|
+
"the feature set; audit remaining features for proxy discrimination"
|
|
554
|
+
)
|
|
555
|
+
if proxy_attrs:
|
|
556
|
+
actions.append(
|
|
557
|
+
f"Audit correlation between [{', '.join(proxy_attrs)}] and protected "
|
|
558
|
+
"characteristics using Fairlearn MetricFrame before training"
|
|
559
|
+
)
|
|
560
|
+
if annex:
|
|
561
|
+
actions.append(
|
|
562
|
+
f"Register this system under EU AI Act {annex} — conformity assessment "
|
|
563
|
+
"and bias monitoring are mandatory, not optional"
|
|
564
|
+
)
|
|
565
|
+
if domain == "children_safety":
|
|
566
|
+
actions.append(
|
|
567
|
+
"Define and document escalation policy (who reviews flagged messages), "
|
|
568
|
+
"data retention limit (how long conversations are stored), and "
|
|
569
|
+
"acceptable false-negative tolerance before writing any code"
|
|
570
|
+
)
|
|
571
|
+
if domain == "criminal_justice":
|
|
572
|
+
actions.append(
|
|
573
|
+
"Select ONE fairness metric (Equalized Odds recommended per ProPublica standard) "
|
|
574
|
+
"and document the values judgment. "
|
|
575
|
+
"The Fairness Impossibility Theorem means you cannot satisfy all three simultaneously."
|
|
576
|
+
)
|
|
577
|
+
return actions
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
581
|
+
# LLM ENRICHMENT (optional; fails gracefully)
|
|
582
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def _enrich_with_llm(prompt: str, det: dict[str, Any]) -> str:
    """
    Ask the configured LLM for a 2-3 sentence governance explanation.

    Args:
        prompt: The developer's original prompt, quoted verbatim to the LLM.
        det: Deterministic findings. Must contain the keys "domain",
            "risk_level", "protected_attributes" and "proxy_attributes";
            "annex" is optional.

    Returns:
        The LLM's explanation with surrounding whitespace removed. A
        deterministic summary built from *det* is returned instead when
        LLM_AVAILABLE is false, when the call raises, or when the reply is
        empty. The result is therefore never an empty string.
    """
    fallback = (
        f"Deterministic analysis: Domain={det['domain']}, Risk={det['risk_level']}. "
        f"Protected attributes: {det['protected_attributes'] or 'none detected directly'}. "
        f"Proxy attributes: {det['proxy_attributes'] or 'none detected'}. "
        f"Applicable regulation: {det.get('annex') or 'none (not classified as high-risk)'}."
    )

    if not LLM_AVAILABLE:
        return fallback

    system = (
        "You are SAGE, a governance agent for AI systems. Given a developer's coding "
        "prompt and a preliminary risk classification, write a concise 2-3 sentence "
        "explanation of WHY this prompt raises ethical or regulatory concerns. "
        "Reference specific laws, principles, or documented real-world cases. "
        "Be factual and precise. Output ONLY the explanation — no JSON, no preamble, "
        "no bullet points."
    )
    user = (
        f"Developer prompt: \"{prompt}\"\n\n"
        f"SAGE preliminary classification:\n"
        f"- Domain: {det['domain']}\n"
        f"- Risk level: {det['risk_level']}\n"
        f"- Protected attributes detected: {det['protected_attributes']}\n"
        f"- Proxy attributes detected: {det['proxy_attributes']}\n"
        f"- Applicable regulation: {det.get('annex', 'None')}\n\n"
        "Write 2-3 sentences explaining the governance concern."
    )

    try:
        response = LLM_CLIENT.chat.completions.create(
            model=LLM_MODEL,
            max_tokens=350,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
        )
        # content can be None (e.g. a refusal or tool-only reply); treat it
        # as empty instead of relying on an AttributeError.
        text = (response.choices[0].message.content or "").strip()
    except Exception as exc:
        # Deliberate catch-all: enrichment is best-effort and must never
        # break the deterministic pipeline.
        print(f"[sage_agent] LLM enrichment failed: {exc}", file=sys.stderr)
        return fallback

    if not text:
        # A blank reply would leave sage_reasoning empty; use the
        # deterministic summary instead.
        print(
            "[sage_agent] LLM returned empty content; using deterministic fallback",
            file=sys.stderr,
        )
        return fallback
    return text
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
635
|
+
# MAIN PUBLIC API
|
|
636
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def evaluate(
    prompt: str,
    code: str = "",
    context: str = "",
) -> SageEvaluateResponse:
    """
    Run the full SAGE evaluation pipeline over a prompt and optional code/context.

    1. Deterministic detection over prompt + code + context: domain,
       protected and proxy attributes, risk level, flags, options,
       regulations and actions.
    2. Reasoning text from the LLM, which receives only *prompt* and the
       deterministic findings and falls back to a deterministic summary.
    3. Everything is packed into a Pydantic-validated SageEvaluateResponse.

    Human review is required for HIGH and CRITICAL risk. The fairness
    impossibility flag is set for the criminal_justice, credit_scoring and
    employment domains.
    """
    corpus = f"{prompt} {code} {context}"

    # ── Step 1: Deterministic analysis ────────────────────────────────────────
    domain, annex, udhr_articles = _detect_domain(corpus)
    protected_attrs = _detect_protected_attributes(corpus)
    proxy_attrs = _detect_proxy_attributes(corpus, protected_attrs)
    risk_level = _score_risk(domain, protected_attrs, proxy_attrs, annex)

    result_fields: dict[str, Any] = {
        "risk_level": risk_level,
        "eu_ai_act_annex": annex,
        "udhr_articles": udhr_articles,
        "protected_attributes": protected_attrs,
        "proxy_attributes": proxy_attrs,
        "detected_domain": domain,
        "intent_summary": prompt[:300],
        "compliance_flags": _build_compliance_flags(
            domain, protected_attrs, proxy_attrs, annex
        ),
        "fairness_options": _select_fairness_options(domain),
        "regulations": _build_regulations(domain, annex),
        "immediate_actions": _build_immediate_actions(
            risk_level, protected_attrs, proxy_attrs, domain, annex
        ),
        # Unknown domains have no Annex III entry, so the category is None.
        "eu_ai_act_category": EU_AI_ACT_ANNEX_III.get(domain, {}).get("description"),
        "requires_human_review": risk_level in ("HIGH", "CRITICAL"),
        "fairness_impossibility": domain
        in ("criminal_justice", "credit_scoring", "employment"),
    }

    # ── Step 2: LLM enrichment ────────────────────────────────────────────────
    findings = {
        "domain": domain,
        "risk_level": risk_level,
        "protected_attributes": protected_attrs,
        "proxy_attributes": proxy_attrs,
        "annex": annex,
    }
    result_fields["sage_reasoning"] = _enrich_with_llm(prompt, findings)

    # ── Step 3: Pydantic-validated return ────────────────────────────────────
    return SageEvaluateResponse(**result_fields)
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
def log_model_metrics(metrics: dict[str, Any], dataset_info: Optional[dict[str, Any]] = None) -> str:
    """
    Record model training/evaluation metrics and dataset info in the audit trail.

    Builds a "model_trained" audit entry and passes it to
    ``startup.write_audit_entry``, returning whatever that call returns. A
    missing or empty *dataset_info* is stored as an empty dict.
    """
    # Imported inside the function, as in the original, so the name is only
    # resolved when metrics are actually logged.
    from startup import write_audit_entry

    return write_audit_entry(
        {
            "event_type": "model_trained",
            "metrics": metrics,
            "dataset_info": dataset_info if dataset_info else {},
        }
    )
|