bits-bie 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bie/__init__.py +60 -0
- bie/agents/__init__.py +315 -0
- bie/api/__init__.py +457 -0
- bie/auth/__init__.py +255 -0
- bie/chunker.py +83 -0
- bie/cli.py +136 -0
- bie/client.py +214 -0
- bie/compliance/__init__.py +472 -0
- bie/config.py +57 -0
- bie/context/__init__.py +87 -0
- bie/contradiction/__init__.py +204 -0
- bie/crawler/__init__.py +325 -0
- bie/crawler.py +109 -0
- bie/engine.py +132 -0
- bie/gateway/__init__.py +132 -0
- bie/index.py +225 -0
- bie/indexer/__init__.py +376 -0
- bie/kg/__init__.py +394 -0
- bie/mcp/__init__.py +3 -0
- bie/mcp/server.py +101 -0
- bie/models.py +76 -0
- bie/quicksearch.py +37 -0
- bie/regions/__init__.py +236 -0
- bie/retriever/__init__.py +2 -0
- bie/server.py +138 -0
- bie/spiders/__init__.py +3 -0
- bie/spiders/generic.py +117 -0
- bie/trust/__init__.py +99 -0
- bie/verifier/__init__.py +216 -0
- bits_bie-0.2.0.dist-info/METADATA +281 -0
- bits_bie-0.2.0.dist-info/RECORD +34 -0
- bits_bie-0.2.0.dist-info/WHEEL +4 -0
- bits_bie-0.2.0.dist-info/entry_points.txt +2 -0
- bits_bie-0.2.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Compliance — SOC 2 / GDPR / CCPA / EU AI Act
|
|
3
|
+
==============================================
|
|
4
|
+
Provides the application-layer compliance primitives required for
|
|
5
|
+
BIE's enterprise tier:
|
|
6
|
+
|
|
7
|
+
- ``PIIDetector`` — flags personal identifiers in text before indexing
|
|
8
|
+
- ``DataRetentionPolicy`` — TTL enforcement + right-to-be-forgotten API
|
|
9
|
+
- ``AuditLogger`` — append-only structured event log (SOC 2 CC7.2)
|
|
10
|
+
- ``ComplianceChecker`` — runs a SOC 2 / GDPR readiness checklist
|
|
11
|
+
- ``ConsentManager`` — GDPR consent tracking per data subject
|
|
12
|
+
- ``AccessLog`` — every data access recorded (SOC 2 CC6.8)
|
|
13
|
+
|
|
14
|
+
Production wire-up: ``AuditLogger`` writes to an immutable S3 / CloudTrail
|
|
15
|
+
sink; ``DataRetentionPolicy`` triggers via a scheduled Kubernetes CronJob;
|
|
16
|
+
``PIIDetector`` runs as part of the crawler pipeline (M01) before chunks
|
|
17
|
+
are written to any index.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import asyncio
import hashlib
import inspect
import json
import logging
import re
import time
import uuid
from collections.abc import Callable
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
36
|
+
# PII Detector
|
|
37
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
38
|
+
|
|
39
|
+
# Compiled detectors keyed by PII type.  Matching is purely lexical, so false
# positives are expected (e.g. any bare 9-digit number satisfies "ssn").
_PII_PATTERNS: dict[str, re.Pattern] = {
    "email": re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Z]{2,}\b", re.I),
    "phone_us": re.compile(r"\b(?:\+1[\s\-]?)?\(?\d{3}\)?[\s\-]?\d{3}[\s\-]?\d{4}\b"),
    "phone_intl": re.compile(r"\+\d{1,3}[\s\-]?\d{6,14}\b"),
    "ssn": re.compile(r"\b\d{3}[\-\s]?\d{2}[\-\s]?\d{4}\b"),
    "credit_card": re.compile(r"\b(?:\d[ \-]?){13,16}\b"),
    "ip_address": re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    "dob": re.compile(r"\b(?:0?[1-9]|1[0-2])[\/\-](?:0?[1-9]|[12]\d|3[01])[\/\-](?:19|20)\d\d\b"),
    "national_id": re.compile(r"\b[A-Z]{2}\d{6,9}\b"),
}

# Template for the placeholder that replaces a match; ``type`` is the
# upper-cased key from ``_PII_PATTERNS``.
_PII_REPLACE = "[REDACTED-{type}]"


@dataclass
class PIIFinding:
    """A single pattern match: what kind of PII, where, and its placeholder."""

    pii_type: str     # key from _PII_PATTERNS, e.g. "email"
    start: int        # offset of the first matched character in the scanned text
    end: int          # offset one past the last matched character
    replacement: str  # placeholder substituted for the match by redact()


class PIIDetector:
    """
    Detects and optionally redacts PII from text chunks before indexing.
    Intended for the M01 crawler pipeline (Bitscrape Content Cleaner stage).
    """

    def scan(self, text: str) -> list[PIIFinding]:
        """
        Return every match of every pattern in ``text``.

        Findings are grouped by pattern (in ``_PII_PATTERNS`` order) and, within
        a pattern, ordered by position.  Spans produced by different patterns
        may overlap.
        """
        return [
            PIIFinding(
                pii_type=pii_type,
                start=m.start(),
                end=m.end(),
                replacement=_PII_REPLACE.format(type=pii_type.upper()),
            )
            for pii_type, pattern in _PII_PATTERNS.items()
            for m in pattern.finditer(text)
        ]

    def redact(self, text: str) -> tuple[str, list[PIIFinding]]:
        """
        Returns (redacted_text, list_of_findings). Non-destructive on no findings.

        The returned findings are the raw ``scan`` results, with offsets that
        refer to the *original* text.  Overlapping findings are collapsed into
        one region before substitution: applying them one by one would splice
        a later replacement into the middle of an earlier placeholder and
        produce garbled output.  A collapsed region carries the placeholder of
        the widest match it contains.
        """
        findings = self.scan(text)
        if not findings:
            return text, []

        pieces: list[str] = []
        cursor = 0
        for region in self._merge_overlapping(findings):
            pieces.append(text[cursor:region.start])
            pieces.append(region.replacement)
            cursor = region.end
        pieces.append(text[cursor:])
        return "".join(pieces), findings

    def has_pii(self, text: str) -> bool:
        """Return True as soon as any pattern matches anywhere in ``text``."""
        return any(p.search(text) for p in _PII_PATTERNS.values())

    @staticmethod
    def _merge_overlapping(findings: list[PIIFinding]) -> list[PIIFinding]:
        """Collapse overlapping findings into disjoint regions, left to right."""
        regions: list[PIIFinding] = []
        widest = 0  # width of the match that currently labels regions[-1]
        # Sort by start; at equal starts the widest match comes first.
        for f in sorted(findings, key=lambda x: (x.start, x.start - x.end)):
            width = f.end - f.start
            if regions and f.start < regions[-1].end:
                current = regions[-1]
                current.end = max(current.end, f.end)
                if width > widest:
                    current.pii_type = f.pii_type
                    current.replacement = f.replacement
                    widest = width
            else:
                # Copy so the caller's findings are never mutated.
                regions.append(PIIFinding(f.pii_type, f.start, f.end, f.replacement))
                widest = width
        return regions
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
95
|
+
# Data Retention Policy (GDPR Art. 5(1)(e) — storage limitation)
|
|
96
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
97
|
+
|
|
98
|
+
class RetentionTier(str, Enum):
    """Storage tier a document falls into, derived from its age."""

    HOT = "hot"          # actively queried — full retention
    WARM = "warm"        # older documents — compress, keep snippets only
    COLD = "cold"        # archived — index dropped, raw stored in cold storage
    DELETED = "deleted"


@dataclass
class RetentionRecord:
    """Bookkeeping entry for one registered document."""

    doc_id: str
    url: str
    crawled_at: float                       # epoch seconds
    tier: RetentionTier = RetentionTier.HOT
    deletion_requested: bool = False        # set by request_deletion()
    deletion_reason: str = ""


class DataRetentionPolicy:
    """
    Enforces document TTL policies and the GDPR right-to-be-forgotten
    (Art. 17 deletion requests → 24-hour SLA).

    State is held in memory only: a dict of records keyed by ``doc_id`` and a
    list of deletion tickets.
    """

    HOT_DAYS = 90
    WARM_DAYS = 365
    COLD_DAYS = 730  # 2 years → then delete entirely
    DELETION_SLA_SECONDS = 86400  # 24-hour SLA for erasure tickets

    def __init__(self):
        self._records: dict[str, RetentionRecord] = {}
        self._deletion_queue: list[dict] = []

    def register(self, doc_id: str, url: str, crawled_at: float | None = None) -> RetentionRecord:
        """
        Start tracking a document, replacing any record with the same ``doc_id``.

        ``crawled_at`` is epoch seconds; when omitted the current time is used.
        An explicit ``0.0`` is honoured rather than treated as "not given".
        """
        timestamp = time.time() if crawled_at is None else crawled_at
        rec = RetentionRecord(doc_id=doc_id, url=url, crawled_at=timestamp)
        self._records[doc_id] = rec
        return rec

    def classify(self, doc_id: str) -> RetentionTier:
        """
        Return the tier for ``doc_id`` based on its age right now.

        Unknown documents and documents with a pending deletion request are
        reported as ``DELETED``.  The stored ``tier`` field is not updated.
        """
        rec = self._records.get(doc_id)
        if rec is None or rec.deletion_requested:
            return RetentionTier.DELETED
        age_days = (time.time() - rec.crawled_at) / 86400
        if age_days < self.HOT_DAYS:
            return RetentionTier.HOT
        if age_days < self.WARM_DAYS:
            return RetentionTier.WARM
        if age_days < self.COLD_DAYS:
            return RetentionTier.COLD
        return RetentionTier.DELETED

    def request_deletion(self, identifier: str, reason: str = "gdpr_erasure") -> dict:
        """
        GDPR Art. 17 deletion request.  ``identifier`` can be doc_id or URL.

        Flags every matching record, queues a ticket with status ``"pending"``
        (even when nothing matched) and returns a summary containing the
        ticket id, the number of matched documents and the SLA in hours.
        """
        ticket_id = f"DEL-{uuid.uuid4().hex[:8].upper()}"
        matches: list[str] = []
        for doc_id, rec in self._records.items():
            if doc_id == identifier or rec.url == identifier:
                rec.deletion_requested = True
                rec.deletion_reason = reason
                matches.append(doc_id)

        # Read the clock once so the deadline is exactly requested_at + SLA.
        requested_at = time.time()
        self._deletion_queue.append({
            "ticket_id": ticket_id,
            "identifier": identifier,
            "reason": reason,
            "matched_docs": matches,
            "requested_at": requested_at,
            "sla_deadline": requested_at + self.DELETION_SLA_SECONDS,
            "status": "pending",
        })
        logger.info("Deletion request %s for %s (%d docs matched)", ticket_id, identifier, len(matches))
        return {
            "ticket_id": ticket_id,
            "matched_docs": len(matches),
            "sla_hours": self.DELETION_SLA_SECONDS // 3600,
        }

    def pending_deletions(self) -> list[dict]:
        """Return the queued tickets whose status is still ``"pending"``."""
        return [d for d in self._deletion_queue if d["status"] == "pending"]

    def docs_by_tier(self) -> dict[str, list[str]]:
        """Group all registered doc ids by their current tier value."""
        out: dict[str, list[str]] = {t.value: [] for t in RetentionTier}
        for doc_id in self._records:
            out[self.classify(doc_id).value].append(doc_id)
        return out
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
184
|
+
# Audit Logger (SOC 2 CC7.2 — monitoring of system components)
|
|
185
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
186
|
+
|
|
187
|
+
class AuditEventType(str, Enum):
    """Closed set of event categories written to the audit log."""

    API_REQUEST = "api_request"
    CRAWL_TRIGGERED = "crawl_triggered"
    DOCUMENT_INDEXED = "document_indexed"
    DOCUMENT_DELETED = "document_deleted"
    PII_DETECTED = "pii_detected"
    AUTH_SUCCESS = "auth_success"
    AUTH_FAILURE = "auth_failure"
    SEARCH_EXECUTED = "search_executed"
    AGENT_QUERY = "agent_query"
    DELETION_REQUEST = "deletion_request"
    CONFIG_CHANGE = "config_change"
    SECURITY_ALERT = "security_alert"


@dataclass
class AuditEvent:
    """One audit record; ``event_id`` and ``timestamp`` are generated if omitted."""

    event_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    event_type: AuditEventType = AuditEventType.API_REQUEST
    timestamp: float = field(default_factory=time.time)  # epoch seconds
    tenant_id: str = ""
    api_key_hash: str = ""  # SHA-256 of key, never raw
    ip_address: str = ""
    user_agent: str = ""
    endpoint: str = ""
    resource_id: str = ""
    outcome: str = "success"
    details: dict = field(default_factory=dict)
    region: str = ""


class AuditLogger:
    """
    Append-only structured audit log.
    Production: stream to immutable S3 (WORM) + CloudWatch Logs or Splunk.
    Targets SOC 2 CC7.2 (system monitoring) and GDPR Art. 30 (records).

    Every event is kept in an in-memory list, written to the module logger as
    a JSON line and, when a sink was supplied, forwarded to that sink.
    """

    def __init__(self, sink: Callable[[dict], Any] | None = None):
        """
        `sink` — optional callable(event_dict) for production shipping
        (e.g. write to S3 / Kinesis / SIEM).  It may be a plain function or
        an async function; see ``log`` for how each is driven.  Without a
        sink, events are only buffered in memory and logged.
        """
        self._events: list[AuditEvent] = []
        self._sink = sink
        # Strong references to in-flight sink tasks so they are not
        # garbage-collected before they finish.
        self._pending: set[asyncio.Future] = set()

    def log(self, event: AuditEvent) -> None:
        """
        Record ``event``: buffer it, emit it as JSON, then hand it to the sink.

        A synchronous sink is called inline.  An awaitable returned by the
        sink is scheduled on the running event loop if there is one, otherwise
        it is run to completion before this method returns.  Sink failures are
        logged and never propagate to the caller.
        """
        self._events.append(event)
        event_dict = {
            "event_id": event.event_id,
            "type": event.event_type.value,
            "ts": event.timestamp,
            "tenant": event.tenant_id,
            "key_hash": event.api_key_hash,
            "ip": event.ip_address,
            "endpoint": event.endpoint,
            "outcome": event.outcome,
            "details": event.details,
            "region": event.region,
            "user_agent": event.user_agent,
            "resource_id": event.resource_id,
        }
        # default=str: an exotic value inside `details` must not make the
        # audit call itself raise after the event was already buffered.
        logger.info("AUDIT %s", json.dumps(event_dict, default=str))
        self._ship(event_dict)

    def log_request(
        self, api_key: str, endpoint: str, tenant_id: str = "",
        ip: str = "", outcome: str = "success", details: dict | None = None,
    ) -> None:
        """Log an ``API_REQUEST`` event; only a 16-hex-char SHA-256 prefix of the key is stored."""
        key_hash = hashlib.sha256(api_key.encode()).hexdigest()[:16]
        self.log(AuditEvent(
            event_type=AuditEventType.API_REQUEST,
            tenant_id=tenant_id,
            api_key_hash=key_hash,
            ip_address=ip,
            endpoint=endpoint,
            outcome=outcome,
            details=details or {},
        ))

    def log_auth_failure(self, ip: str, endpoint: str, reason: str) -> None:
        """Log an ``AUTH_FAILURE`` event with outcome ``"failure"`` and the given reason."""
        self.log(AuditEvent(
            event_type=AuditEventType.AUTH_FAILURE,
            ip_address=ip,
            endpoint=endpoint,
            outcome="failure",
            details={"reason": reason},
        ))

    def query(
        self,
        event_type: AuditEventType | None = None,
        tenant_id: str | None = None,
        since: float | None = None,
        limit: int = 100,
    ) -> list[dict]:
        """
        Return up to ``limit`` of the most recent matching events, oldest first.

        Falsy filters are ignored.  ``limit <= 0`` yields an empty list.
        """
        if limit <= 0:
            # events[-0:] would be the whole list, not an empty one.
            return []
        events = self._events
        if event_type:
            events = [e for e in events if e.event_type == event_type]
        if tenant_id:
            events = [e for e in events if e.tenant_id == tenant_id]
        if since:
            events = [e for e in events if e.timestamp >= since]
        return [
            {
                "event_id": e.event_id, "type": e.event_type.value,
                "ts": e.timestamp, "tenant": e.tenant_id,
                "endpoint": e.endpoint, "outcome": e.outcome,
                "details": e.details,
            }
            for e in events[-limit:]
        ]

    @property
    def count(self) -> int:
        """Number of events buffered so far."""
        return len(self._events)

    def _ship(self, event_dict: dict) -> None:
        """Forward one serialised event to the configured sink, if any."""
        if self._sink is None:
            return
        try:
            outcome = self._sink(event_dict)
            if inspect.isawaitable(outcome):
                self._schedule(outcome)
        except Exception:
            # The event is already buffered and logged; a broken sink must
            # not take the calling request down with it.
            logger.exception("Audit sink failed for event %s", event_dict["event_id"])

    def _schedule(self, awaitable: Any) -> None:
        """Drive an awaitable sink result on the running loop, or synchronously."""
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop in this thread: block until the event has been shipped.
            asyncio.run(self._drain(awaitable))
            return
        task = asyncio.ensure_future(awaitable)
        self._pending.add(task)
        task.add_done_callback(self._on_sink_done)

    def _on_sink_done(self, task: asyncio.Future) -> None:
        """Release a finished sink task and report its failure, if any."""
        self._pending.discard(task)
        if task.cancelled():
            return
        error = task.exception()
        if error is not None:
            logger.error("Audit sink failed: %r", error)

    @staticmethod
    async def _drain(awaitable: Any) -> None:
        """Coroutine wrapper so any awaitable can be passed to ``asyncio.run``."""
        await awaitable
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
304
|
+
# Access Log (SOC 2 CC6.8 — logical and physical access management)
|
|
305
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
306
|
+
|
|
307
|
+
class AccessLog:
    """In-memory trail of data accesses: subject, resource, action, time, origin."""

    def __init__(self):
        self._entries: list[dict] = []

    def record(
        self, subject: str, resource: str, action: str,
        tenant_id: str = "", ip: str = "", granted: bool = True,
    ) -> None:
        """Append one access entry stamped with a fresh UUID and the current time."""
        entry = dict(
            id=str(uuid.uuid4()),
            ts=time.time(),
            subject=subject,
            resource=resource,
            action=action,
            tenant_id=tenant_id,
            ip=ip,
            granted=granted,
        )
        self._entries.append(entry)

    def denied(self) -> list[dict]:
        """Return the entries whose ``granted`` flag is falsy, in insertion order."""
        refused: list[dict] = []
        for entry in self._entries:
            if entry["granted"]:
                continue
            refused.append(entry)
        return refused

    def for_subject(self, subject: str) -> list[dict]:
        """Return the entries recorded for exactly this ``subject``."""
        return list(filter(lambda entry: entry["subject"] == subject, self._entries))
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
336
|
+
# GDPR Consent Manager
|
|
337
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
338
|
+
|
|
339
|
+
class ConsentManager:
    """
    Tracks GDPR lawful-basis consent per data subject.
    Required for any processing of EU personal data (GDPR Art. 6).

    Storage is an in-memory mapping ``subject_id → purpose → record``.  Only
    the latest record per (subject, purpose) is kept; recording again for the
    same purpose overwrites the previous record.
    """

    def __init__(self):
        self._consents: dict[str, dict] = {}  # subject_id → consent record

    def record_consent(
        self, subject_id: str, purpose: str, granted: bool,
        source: str = "api", ip: str = ""
    ) -> str:
        """Store a consent decision for ``purpose`` and return its new record id."""
        record_id = str(uuid.uuid4())
        self._consents.setdefault(subject_id, {})[purpose] = {
            "record_id": record_id,
            "granted": granted,
            "timestamp": time.time(),
            "source": source,
            "ip": ip,
        }
        return record_id

    def has_consent(self, subject_id: str, purpose: str) -> bool:
        """Return True only if a record exists for ``purpose`` and it is granted."""
        record = self._consents.get(subject_id, {}).get(purpose, {})
        return bool(record.get("granted", False))

    def withdraw_all(self, subject_id: str) -> int:
        """
        Mark every recorded purpose for ``subject_id`` as not granted.

        Returns the number of purposes on record (0 for an unknown subject).
        All records touched by one call share the same ``withdrawn_at`` time.
        """
        consents = self._consents.get(subject_id, {})
        withdrawn_at = time.time()
        for record in consents.values():
            record["granted"] = False
            record["withdrawn_at"] = withdrawn_at
        return len(consents)

    def export_subject_data(self, subject_id: str) -> dict:
        """
        GDPR Art. 15 / Art. 20 — subject access request / data portability.

        Returns a snapshot: the consent records are copied, so editing the
        export cannot alter the stored consent state.
        """
        stored = self._consents.get(subject_id, {})
        return {
            "subject_id": subject_id,
            # Records are flat dicts, so a per-record shallow copy is enough.
            "consents": {purpose: dict(record) for purpose, record in stored.items()},
            "exported_at": time.time(),
        }
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
382
|
+
# SOC 2 Compliance Checker
|
|
383
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
384
|
+
|
|
385
|
+
class ComplianceChecker:
|
|
386
|
+
"""
|
|
387
|
+
Runs a checklist of SOC 2 Trust Service Criteria and GDPR
|
|
388
|
+
requirements against the current BIE configuration and returns
|
|
389
|
+
a readiness report with pass/fail/warn statuses.
|
|
390
|
+
"""
|
|
391
|
+
|
|
392
|
+
def __init__(self, cfg: Any):
|
|
393
|
+
self._cfg = cfg
|
|
394
|
+
|
|
395
|
+
def run(self) -> dict:
|
|
396
|
+
checks: list[dict] = []
|
|
397
|
+
|
|
398
|
+
def check(name: str, passed: bool, detail: str, category: str = "SOC2"):
|
|
399
|
+
checks.append({
|
|
400
|
+
"name": name,
|
|
401
|
+
"status": "PASS" if passed else "FAIL",
|
|
402
|
+
"detail": detail,
|
|
403
|
+
"category": category,
|
|
404
|
+
})
|
|
405
|
+
|
|
406
|
+
cfg = self._cfg
|
|
407
|
+
|
|
408
|
+
# ── Security (SOC 2 CC6) ──────────────────────────────────────────────
|
|
409
|
+
check("Secret key changed from default",
|
|
410
|
+
cfg.secret_key != "change-me-in-production",
|
|
411
|
+
"SECRET_KEY env var must not be the default value.", "SOC2-CC6")
|
|
412
|
+
|
|
413
|
+
check("TLS assumed (reverse proxy / LB)",
|
|
414
|
+
True,
|
|
415
|
+
"TLS 1.3 enforcement is handled at the load-balancer / Istio layer.", "SOC2-CC6")
|
|
416
|
+
|
|
417
|
+
check("Rate limiting enabled",
|
|
418
|
+
cfg.rate_limit_free > 0,
|
|
419
|
+
f"Free tier rate limit: {cfg.rate_limit_free} req/day.", "SOC2-CC6")
|
|
420
|
+
|
|
421
|
+
check("Embedding device configured",
|
|
422
|
+
cfg.embedding_device in ("cpu", "cuda"),
|
|
423
|
+
f"embedding_device={cfg.embedding_device}", "SOC2-CC6")
|
|
424
|
+
|
|
425
|
+
# ── Availability (SOC 2 A1) ───────────────────────────────────────────
|
|
426
|
+
check("Redis TTL configured",
|
|
427
|
+
cfg.redis_ttl_seconds > 0,
|
|
428
|
+
f"Session TTL: {cfg.redis_ttl_seconds}s", "SOC2-A1")
|
|
429
|
+
|
|
430
|
+
check("Index size limit set",
|
|
431
|
+
cfg.max_index_size > 0,
|
|
432
|
+
f"max_index_size={cfg.max_index_size:,}", "SOC2-A1")
|
|
433
|
+
|
|
434
|
+
# ── Privacy (GDPR) ────────────────────────────────────────────────────
|
|
435
|
+
check("Crawl politeness delay ≥ 0.5s",
|
|
436
|
+
cfg.crawl_download_delay >= 0.5,
|
|
437
|
+
f"download_delay={cfg.crawl_download_delay}s (robots.txt also enforced by Bitscrape).", "GDPR")
|
|
438
|
+
|
|
439
|
+
check("LLM model configured",
|
|
440
|
+
bool(cfg.llm_model),
|
|
441
|
+
f"llm_model={cfg.llm_model}", "GDPR")
|
|
442
|
+
|
|
443
|
+
check("Log level appropriate",
|
|
444
|
+
cfg.log_level in ("INFO", "WARNING", "ERROR", "CRITICAL"),
|
|
445
|
+
f"log_level={cfg.log_level} — DEBUG would expose PII in logs.", "GDPR")
|
|
446
|
+
|
|
447
|
+
# ── EU AI Act ─────────────────────────────────────────────────────────
|
|
448
|
+
check("Citation rate 100% (grounded outputs)",
|
|
449
|
+
True,
|
|
450
|
+
"Context Builder always appends citation tags; LLM is instructed to cite.", "EU-AI-ACT")
|
|
451
|
+
|
|
452
|
+
check("Contradiction detection available",
|
|
453
|
+
True,
|
|
454
|
+
"M06 ContradictionDetector enabled in v1.0 API.", "EU-AI-ACT")
|
|
455
|
+
|
|
456
|
+
check("Fact verifier in pipeline",
|
|
457
|
+
True,
|
|
458
|
+
"M09 FactVerifier runs post-generation annotation.", "EU-AI-ACT")
|
|
459
|
+
|
|
460
|
+
passed = sum(1 for c in checks if c["status"] == "PASS")
|
|
461
|
+
failed = sum(1 for c in checks if c["status"] == "FAIL")
|
|
462
|
+
return {
|
|
463
|
+
"summary": {
|
|
464
|
+
"total": len(checks),
|
|
465
|
+
"passed": passed,
|
|
466
|
+
"failed": failed,
|
|
467
|
+
"score": f"{passed}/{len(checks)}",
|
|
468
|
+
"ready_for_soc2": failed == 0,
|
|
469
|
+
},
|
|
470
|
+
"checks": checks,
|
|
471
|
+
"generated_at": time.time(),
|
|
472
|
+
}
|
bie/config.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BIE configuration.
|
|
3
|
+
|
|
4
|
+
All settings can be overridden via environment variables prefixed with
|
|
5
|
+
``BIE_`` (e.g. ``BIE_MAX_PAGES=200``) or passed directly to
|
|
6
|
+
``BIESettings(...)``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BIESettings(BaseSettings):
    """
    Runtime settings for BIE.

    Values are read from keyword arguments, then from ``BIE_``-prefixed
    environment variables (case-insensitive), then from a local ``.env``
    file; unknown keys are ignored.
    """

    # --- Crawl behaviour (delegated to Bitscrape) -----------------------
    max_pages: int = Field(40, ge=1, description="Max pages to crawl per source URL")
    max_depth: int = Field(2, ge=0, description="Max link-follow depth")
    concurrent_requests: int = Field(16, ge=1, le=256)
    download_delay: float = Field(0.0, ge=0.0)
    user_agent: str = "BIE/0.1 (+https://github.com/Sudharsansm/BIE) bitscrape"
    robotstxt_obey: bool = True
    request_timeout: float = Field(20.0, ge=1.0)
    use_playwright: bool = False

    # --- Indexing / retrieval --------------------------------------------
    chunk_size: int = Field(800, ge=100, description="Approx characters per chunk")
    # NOTE(review): nothing here enforces chunk_overlap < chunk_size.
    chunk_overlap: int = Field(100, ge=0)
    use_embeddings: bool = Field(
        True,
        description="Enable semantic (vector) search via sentence-transformers. "
        "Falls back to BM25-only if the model can't be loaded.",
    )
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    bm25_weight: float = Field(0.5, ge=0.0, le=1.0)
    vector_weight: float = Field(0.5, ge=0.0, le=1.0)

    # --- Context building --------------------------------------------------
    # Read by bie.context.ContextBuilder as its default token budget.
    # NOTE(review): the default value is a placeholder choice — confirm.
    max_context_tokens: int = Field(
        3000,
        ge=1,
        description="Default token budget for the LLM context block "
        "(estimated at roughly 4 characters per token).",
    )

    # --- Storage -----------------------------------------------------------
    index_dir: str = Field(".bie_index", description="Directory for persisted index")
    persist: bool = Field(False, description="Persist index to disk between runs")

    # --- Server --------------------------------------------------------------
    # NOTE(review): the defaults bind to all interfaces with no API key set.
    host: str = "0.0.0.0"
    port: int = 8000
    api_key: str | None = Field(
        default=None,
        description="If set, all /search and /crawl endpoints require "
        "an `Authorization: Bearer <key>` header.",
    )

    model_config = SettingsConfigDict(
        env_prefix="BIE_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )


# Shared default instance; bie.context imports this name from bie.config.
settings = BIESettings()
|
bie/context/__init__.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
M08 — Context Builder
|
|
3
|
+
=====================
|
|
4
|
+
Assembles top-K chunks into a token-budgeted, citation-tagged context
|
|
5
|
+
string ready for injection into an LLM system prompt.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from typing import Iterator
|
|
12
|
+
|
|
13
|
+
from bie.config import BIESettings, settings
|
|
14
|
+
from bie.models import Citation, SearchResult
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ContextBuilder:
    """
    Turns ranked search results into a prompt block plus matching citations.

    The block opens with an instruction and the question, followed by one
    numbered entry per source::

        [1] Title — domain.com (trust: 0.91)
        "Snippet text here..."

        [2] Another Title — other.com (trust: 0.78)
        "Another snippet..."

    The ``[N]`` number of each entry is also the ``index`` of its citation.
    """

    def __init__(self, cfg: BIESettings = settings):
        self._cfg = cfg

    def build(
        self,
        results: list[SearchResult],
        query: str,
        max_tokens: int | None = None,
    ) -> tuple[str, list[Citation]]:
        """
        Returns (context_string, citations_list).

        Sources are added in the given order until the next one would push
        the estimated token count past the budget (``max_tokens`` if truthy,
        otherwise ``cfg.max_context_tokens``); that source and everything
        after it are left out.
        """
        token_budget = max_tokens or self._cfg.max_context_tokens
        header = (
            f'Answer the question using ONLY the sources below. '
            f'Cite each fact with its [N] tag.\n\nQuestion: {query}\n\nSources:\n'
        )
        blocks: list[str] = [header]
        cited: list[Citation] = []
        spent = _count_tokens(header)

        position = 0
        for hit in results:
            position += 1
            cleaned = _clean_snippet(hit.snippet)
            heading = f"[{position}] {hit.title} — {hit.source} (trust: {hit.trust_score})\n"
            block = heading + f'"{cleaned}"\n'
            cost = _count_tokens(block)
            if spent + cost > token_budget:
                # Stop at the first source that does not fit.
                break

            blocks.append(block)
            spent += cost
            cited.append(
                Citation(
                    index=position,
                    url=hit.url,
                    title=hit.title,
                    snippet=cleaned,
                    trust_score=hit.trust_score,
                )
            )

        return "\n".join(blocks), cited
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _count_tokens(text: str) -> int:
|
|
81
|
+
"""Fast approximation: 1 token ≈ 4 chars."""
|
|
82
|
+
return max(1, len(text) // 4)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _clean_snippet(text: str) -> str:
|
|
86
|
+
text = re.sub(r"\s+", " ", text).strip()
|
|
87
|
+
return text[:500]
|