sage-governance 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,311 @@
1
+ """
2
+ startup.py — SAGE Pre-Loader
3
+ ════════════════════════════
4
+ Imported ONCE when mcp_server.py starts. Loads every heavy dependency,
5
+ policy document, and lookup table into module-level globals.
6
+
7
+ WHY THIS FILE EXISTS
8
+ ────────────────────
9
+ MCP stdio servers run as a persistent Python process — they are NOT
10
+ respawned per tool call. However, if heavy imports live inside tool
11
+ functions they still add latency on the FIRST call of each session.
12
+ Preloading here keeps every tool call fast regardless of import order.
13
+
14
+ Author: SAGE Team / Team SAGE (Hackathon)
15
+ License: MIT
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import os
22
+ import pathlib
23
+ import sys
24
+ from datetime import datetime, timezone
25
+ from typing import Any
26
+
27
+ # ── Resolve project root regardless of CWD ────────────────────────────────────
28
+ _THIS_FILE = pathlib.Path(__file__).resolve()
29
+ PROJECT_ROOT = _THIS_FILE.parent.parent
30
+
31
+ RULES_DIR = PROJECT_ROOT / "rules"
32
+ AUDIT_FILE = PROJECT_ROOT / "audit-trail" / "decisions.jsonl"
33
+ LOGS_FILE = PROJECT_ROOT / "LOGS.md"
34
+ LOCAL_MEMORY = PROJECT_ROOT / "local_memory.md"
35
+ REPORTS_DIR = PROJECT_ROOT / "reports"
36
+
37
+ # ── Ensure required dirs & files exist ───────────────────────────────────────
38
+ for _p in (AUDIT_FILE.parent, REPORTS_DIR):
39
+ _p.mkdir(parents=True, exist_ok=True)
40
+ for _f in (AUDIT_FILE, LOGS_FILE, LOCAL_MEMORY):
41
+ _f.touch(exist_ok=True)
42
+
43
+ import hashlib
44
+
45
+ def write_audit_entry(entry: dict) -> str:
46
+ """
47
+ Append one JSON line to decisions.jsonl with SHA-256 chain link.
48
+ Reads the last line to extract the last entry's hash to serve as prev_hash,
49
+ ensuring chain integrity across separate runs/processes.
50
+ """
51
+ prev_hash = ""
52
+ session_id = entry.get("session_id")
53
+
54
+ if AUDIT_FILE.exists() and AUDIT_FILE.stat().st_size > 1:
55
+ try:
56
+ with open(AUDIT_FILE, "r", encoding="utf-8") as fh:
57
+ lines = fh.readlines()
58
+ if lines:
59
+ last_line = lines[-1].strip()
60
+ if last_line:
61
+ last_entry = json.loads(last_line)
62
+ prev_hash = last_entry.get("entry_hash", "")
63
+ if not session_id:
64
+ session_id = last_entry.get("session_id")
65
+ except Exception:
66
+ pass
67
+
68
+ if not session_id:
69
+ session_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
70
+
71
+ entry["session_id"] = session_id
72
+ entry["timestamp"] = datetime.now(timezone.utc).isoformat()
73
+
74
+ payload = json.dumps(entry, sort_keys=True, default=str) + prev_hash
75
+ current_hash = hashlib.sha256(payload.encode()).hexdigest()
76
+
77
+ entry["entry_hash"] = current_hash
78
+ entry["prev_hash"] = prev_hash
79
+
80
+ with open(AUDIT_FILE, "a", encoding="utf-8") as fh:
81
+ fh.write(json.dumps(entry, default=str) + "\n")
82
+
83
+ return current_hash
84
+
85
+
86
+ # ══════════════════════════════════════════════════════════════════════════════
87
+ # POLICY DOCUMENTS
88
+ # ══════════════════════════════════════════════════════════════════════════════
89
+
90
+ POLICY_DOCS: dict[str, str] = {}
91
+ _general_dir = RULES_DIR / "general"
92
+ if _general_dir.exists():
93
+ for _policy_file in _general_dir.glob("*.md"):
94
+ POLICY_DOCS[_policy_file.stem] = _policy_file.read_text(encoding="utf-8")
95
+
96
+ POLICY_INDEX: dict[str, Any] = {}
97
+ _index_path = RULES_DIR / "index.json"
98
+ if _index_path.exists():
99
+ try:
100
+ POLICY_INDEX = json.loads(_index_path.read_text(encoding="utf-8"))
101
+ except json.JSONDecodeError:
102
+ POLICY_INDEX = {}
103
+
104
+ # ══════════════════════════════════════════════════════════════════════════════
105
+ # LLM CLIENT (OpenAI — used only for human-readable reasoning enrichment)
106
+ # Override model with SAGE_LLM_MODEL env var. Defaults to gpt-4o-mini.
107
+ # ══════════════════════════════════════════════════════════════════════════════
108
+
109
+ LLM_AVAILABLE = False
110
+ LLM_CLIENT: Any = None
111
+ LLM_MODEL = os.environ.get("SAGE_LLM_MODEL", "gpt-4o-mini")
112
+
113
+ try:
114
+ import openai as _openai
115
+ _api_key = os.environ.get("OPENAI_API_KEY", "")
116
+ if _api_key:
117
+ LLM_CLIENT = _openai.OpenAI(api_key=_api_key)
118
+ LLM_AVAILABLE = True
119
+ else:
120
+ print(
121
+ "[SAGE startup] WARNING: OPENAI_API_KEY not set. "
122
+ "Deterministic-only mode active (no LLM enrichment).",
123
+ file=sys.stderr,
124
+ )
125
+ except ImportError:
126
+ print(
127
+ "[SAGE startup] WARNING: 'openai' package not found. "
128
+ "Run: pip install openai",
129
+ file=sys.stderr,
130
+ )
131
+
132
+ # ══════════════════════════════════════════════════════════════════════════════
133
+ # OPTIONAL HEAVY DEPS (fail gracefully if not installed)
134
+ # ══════════════════════════════════════════════════════════════════════════════
135
+
136
+ FAIRLEARN_AVAILABLE = False
137
+ try:
138
+ from fairlearn.metrics import ( # noqa: F401
139
+ demographic_parity_difference,
140
+ equalized_odds_difference,
141
+ MetricFrame,
142
+ )
143
+ FAIRLEARN_AVAILABLE = True
144
+ except ImportError:
145
+ pass
146
+
147
+ DIFFPRIVLIB_AVAILABLE = False
148
+ try:
149
+ import diffprivlib # noqa: F401
150
+ DIFFPRIVLIB_AVAILABLE = True
151
+ except ImportError:
152
+ pass
153
+
154
+ # ══════════════════════════════════════════════════════════════════════════════
155
+ # PROTECTED ATTRIBUTES (direct usage triggers P1 finding)
156
+ # ══════════════════════════════════════════════════════════════════════════════
157
+
158
+ PROTECTED_ATTRIBUTES: list[str] = [
159
+ "race", "ethnicity", "color", "sex", "gender", "age",
160
+ "religion", "national_origin", "nationality", "disability",
161
+ "pregnancy", "marital_status", "sexual_orientation",
162
+ "gender_identity", "skin_color",
163
+ ]
164
+
165
+ # ══════════════════════════════════════════════════════════════════════════════
166
+ # PROXY ATTRIBUTE MAP
167
+ # Semantic — not grep. Maps protected characteristics → known proxy variables.
168
+ # Source: ProPublica COMPAS analysis, Ali et al. 2019, Lambrecht & Tucker 2019.
169
+ # ══════════════════════════════════════════════════════════════════════════════
170
+
171
+ PROXY_ATTRIBUTE_MAP: dict[str, list[str]] = {
172
+ "race": [
173
+ "zip_code", "zip code", "zipcode", "postal_code", "postal code",
174
+ "neighborhood", "census_tract", "school_district", "ward",
175
+ "surname", "last_name", "family_name", "name",
176
+ "bank_branch", "grocery_store_distance", "church_attendance",
177
+ "prior_arrests", "arrest_history",
178
+ ],
179
+ "gender": [
180
+ "browsing_history", "page_likes", "purchase_history",
181
+ "maternity", "paternity", "childcare", "toy_preferences",
182
+ "cosmetics", "sports_interest", "car_type",
183
+ "clothing_category", "grooming_products",
184
+ ],
185
+ "age": [
186
+ "graduation_year", "years_experience", "first_job_year",
187
+ "account_age", "profile_creation_date", "class_year",
188
+ ],
189
+ "socioeconomic_status": [
190
+ "education_level", "school_name", "employment_gap",
191
+ "credit_history_length", "bank_type", "car_ownership",
192
+ "zip_code", "neighborhood", "income_bracket",
193
+ ],
194
+ "criminal_history_as_proxy_for_race": [
195
+ "prior_arrests", "priors_count", "juv_fel_count",
196
+ "family_history", "neighborhood_crime_rate",
197
+ "zip_code", "school_district",
198
+ ],
199
+ }
200
+
201
+ # ══════════════════════════════════════════════════════════════════════════════
202
+ # EU AI ACT ANNEX III — HIGH-RISK CATEGORIES
203
+ # ══════════════════════════════════════════════════════════════════════════════
204
+
205
+ EU_AI_ACT_ANNEX_III: dict[str, dict[str, Any]] = {
206
+ "criminal_justice": {
207
+ "annex": "Annex III.6.d",
208
+ "description": (
209
+ "AI systems for law enforcement assessing risk of offending "
210
+ "or re-offending (e.g. COMPAS, recidivism prediction)"
211
+ ),
212
+ "keywords": [
213
+ "recidivism", "criminal", "parole", "sentencing", "compas",
214
+ "reoffend", "bail", "arrest", "two_year_recid", "risk score",
215
+ "reoffending", "criminal justice",
216
+ ],
217
+ },
218
+ "employment": {
219
+ "annex": "Annex III.4.a",
220
+ "description": (
221
+ "AI systems for recruitment/selection, including targeted "
222
+ "job advertisements, CV filtering, candidate evaluation"
223
+ ),
224
+ "keywords": [
225
+ "job", "recruitment", "hiring", "resume", "cv", "employment",
226
+ "advertisement", "candidate", "fairjob", "job ad", "click",
227
+ "click-through", "ctr",
228
+ ],
229
+ },
230
+ "credit_scoring": {
231
+ "annex": "Annex III.5.b",
232
+ "description": (
233
+ "AI systems for creditworthiness evaluation or "
234
+ "credit score determination"
235
+ ),
236
+ "keywords": [
237
+ "credit", "loan", "default", "creditworthiness", "mortgage",
238
+ "financial risk", "lending", "apple card", "credit card",
239
+ "credit limit", "credit score",
240
+ ],
241
+ },
242
+ "education": {
243
+ "annex": "Annex III.3.a",
244
+ "description": (
245
+ "AI systems determining access to educational institutions "
246
+ "or evaluating students"
247
+ ),
248
+ "keywords": [
249
+ "education", "school", "admission", "student", "grade",
250
+ "academic", "university", "college", "exam",
251
+ ],
252
+ },
253
+ "essential_services": {
254
+ "annex": "Annex III.5.a",
255
+ "description": (
256
+ "AI systems for essential private and public services "
257
+ "(healthcare, insurance, housing, social security)"
258
+ ),
259
+ "keywords": [
260
+ "healthcare", "insurance", "benefit", "welfare",
261
+ "housing", "social security", "medical",
262
+ ],
263
+ },
264
+ "children_safety": {
265
+ "annex": "Annex III (context-dependent)",
266
+ "description": (
267
+ "AI systems impacting children's safety, privacy, or wellbeing; "
268
+ "UNICEF 10 Principles apply"
269
+ ),
270
+ "keywords": [
271
+ "child", "minor", "children", "safeguarding", "grooming",
272
+ "bullying", "moderation", "self-harm", "abuse", "distress",
273
+ "escalation", "chat safety",
274
+ ],
275
+ },
276
+ }
277
+
278
+ # ══════════════════════════════════════════════════════════════════════════════
279
+ # UDHR ARTICLE MAP (domain → relevant UDHR articles)
280
+ # ══════════════════════════════════════════════════════════════════════════════
281
+
282
+ UDHR_ARTICLE_MAP: dict[str, list[str]] = {
283
+ "criminal_justice": ["Article 7", "Article 10", "Article 11"],
284
+ "employment": ["Article 23"],
285
+ "credit_scoring": ["Article 22"],
286
+ "education": ["Article 26"],
287
+ "essential_services": ["Article 22", "Article 25"],
288
+ "children_safety": [
289
+ "UN CRC Article 3 (best interests)",
290
+ "UN CRC Article 12 (right to be heard)",
291
+ "UN CRC Article 16 (privacy)",
292
+ "UN CRC Article 19 (protection from abuse)",
293
+ "UN CRC Article 34 (sexual exploitation)",
294
+ ],
295
+ "general": ["Article 2", "Article 7"],
296
+ }
297
+
298
+ # ══════════════════════════════════════════════════════════════════════════════
299
+ # STARTUP REPORT
300
+ # ══════════════════════════════════════════════════════════════════════════════
301
+
302
+ print(
303
+ f"[SAGE startup] ✅ Policies loaded: {list(POLICY_DOCS.keys()) or 'none (add to rules/general/)'}",
304
+ file=sys.stderr,
305
+ )
306
+ print(f"[SAGE startup] ✅ LLM: {'available (' + LLM_MODEL + ')' if LLM_AVAILABLE else 'deterministic-only mode'}",
307
+ file=sys.stderr)
308
+ print(f"[SAGE startup] ✅ Fairlearn: {FAIRLEARN_AVAILABLE} | diffprivlib: {DIFFPRIVLIB_AVAILABLE}",
309
+ file=sys.stderr)
310
+ print(f"[SAGE startup] ✅ Session start: {datetime.now(timezone.utc).isoformat()}",
311
+ file=sys.stderr)