bits-bie 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bie/api/__init__.py ADDED
@@ -0,0 +1,457 @@
1
+ """
2
+ M11 — BIE v1.0 Agent API
3
+ ==========================
4
+ Full v1.0 REST API: all v0.1 endpoints plus Knowledge Graph,
5
+ Multi-Agent Orchestrator, Contradiction Detector, Fact Verifier,
6
+ SSO/Enterprise auth, compliance endpoints, and multi-region status.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import logging
13
+ import time
14
+ from contextlib import asynccontextmanager
15
+ from typing import AsyncIterator
16
+
17
+ from fastapi import Depends, FastAPI, Header, HTTPException, Request, status
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from fastapi.responses import JSONResponse
20
+ from sse_starlette.sse import EventSourceResponse
21
+
22
+ from bie.agents import AgentOrchestrator, SharedMemory
23
+ from bie.auth import APIKeyStore, JWTManager, RBAC, Role
24
+ from bie.compliance import (
25
+ AuditLogger, AuditEvent, AuditEventType,
26
+ ComplianceChecker, DataRetentionPolicy, PIIDetector,
27
+ )
28
+ from bie.config import BIESettings, settings
29
+ from bie.context import ContextBuilder
30
+ from bie.contradiction import ContradictionDetector
31
+ from bie.crawler import BIECrawler
32
+ from bie.gateway import LLMGateway
33
+ from bie.indexer import HybridIndex, HybridRetriever
34
+ from bie.kg import KnowledgeGraph
35
+ from bie.models import (
36
+ AgentResponse, CrawlRequest, CrawlResponse,
37
+ HealthResponse, SearchRequest, SearchResponse, SearchResult,
38
+ )
39
+ from bie.regions import RegionRegistry, GeoRouter, ReplicationManager, ShardRouter
40
+ from bie.trust import TrustEngine
41
+ from bie.verifier import FactVerifier
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+ # ── App-wide singletons ────────────────────────────────────────────────────────
46
# Process-wide service singletons, constructed once when this module is
# imported and shared by every request handler registered in create_app().
#
# Construction order follows the dependencies visible below: the retriever
# wraps the index, the fact verifier is given the knowledge graph, and the
# orchestrator is wired with the retriever, knowledge graph, LLM gateway,
# fact verifier and shared memory. The geo router, shard router and
# replication manager all share a single region registry.
#
# _start_time records the import time; /metrics and /health subtract it from
# the current time to report uptime.
+ _index = HybridIndex()
47
+ _retriever = HybridRetriever(_index)
48
+ _crawler = BIECrawler()
49
+ _trust = TrustEngine()
50
+ _context_builder = ContextBuilder()
51
+ _llm = LLMGateway()
52
+ _kg = KnowledgeGraph()
53
+ _contradiction = ContradictionDetector()
54
+ _fact_verifier = FactVerifier(kg=_kg)
55
+ _memory = SharedMemory()
56
+ _orchestrator = AgentOrchestrator(
57
+ retriever=_retriever, kg=_kg, llm=_llm,
58
+ fact_verifier=_fact_verifier, memory=_memory,
59
+ )
60
+
61
+ # Auth + compliance singletons
62
+ _key_store = APIKeyStore()
63
+ _jwt_manager = JWTManager()
64
+ _audit = AuditLogger()
65
+ _pii = PIIDetector()
66
+ _retention = DataRetentionPolicy()
67
+ _compliance = ComplianceChecker(settings)
68
+
69
+ # Multi-region singletons
70
+ _region_registry = RegionRegistry()
71
+ _geo_router = GeoRouter(_region_registry)
72
+ _shard_router = ShardRouter(_region_registry)
73
+ _replication = ReplicationManager(_region_registry)
74
+
75
+ _start_time = time.time()
76
+
77
+
78
+ # ── Lifespan ───────────────────────────────────────────────────────────────────
79
+
80
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Manage application startup and shutdown.

    On startup, logs the configured region and looks it up in the region
    registry. On shutdown, closes the shared LLM gateway.

    Args:
        app: The FastAPI application being served. Not used directly; it is
            part of the lifespan callable signature.

    Yields:
        None. Control returns to the server while the application runs.
    """
    logger.info("BIE v1.0 starting — region=%s", settings.region)
    # The lookup result is discarded; the call exists to validate the region.
    # NOTE(review): presumably raises for an unknown region — confirm.
    _region_registry.get(settings.region)
    try:
        yield
    finally:
        # Close the gateway even when an exception or cancellation is raised
        # at the yield point, so its resources are not leaked.
        await _llm.close()
        logger.info("BIE v1.0 shut down.")
87
+
88
+
89
+ # ── App factory ────────────────────────────────────────────────────────────────
90
+
91
def create_app(cfg: BIESettings = settings) -> FastAPI:
    """Build the BIE FastAPI application and register its routes.

    Args:
        cfg: Settings object available to the dependencies and handlers
            defined inside this factory. Defaults to the module-level
            ``settings``.

    Returns:
        The configured FastAPI application.
    """
    summary = (
        "AI-native real-time retrieval — Bitscrape-powered.\n"
        "Multi-region · 10B-doc index · SOC 2 · Enterprise Auth\n"
        "Knowledge Graph · Contradiction Detection · Multi-Agent Orchestration"
    )
    app = FastAPI(
        title="BitSearch Intelligence Engine v1.0",
        description=summary,
        version="1.0.0",
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
    )

    # CORS is fully open: any origin, method and header is accepted.
    cors_options = {
        "allow_origins": ["*"],
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    app.add_middleware(CORSMiddleware, **cors_options)
111
+
112
+ # ── Auth dependency ────────────────────────────────────────────────────────
113
+
114
async def require_api_key(
    request: Request,
    x_api_key: str = Header(..., alias="X-API-Key"),
) -> tuple:
    """Authenticate a request using its ``X-API-Key`` header.

    Args:
        request: Incoming request; supplies the client address and path used
            in audit entries.
        x_api_key: Value of the ``X-API-Key`` header.

    Returns:
        The ``(key_record, tenant)`` pair returned by the key store.

    Raises:
        HTTPException: 401 when the key store does not recognise the key (an
            auth-failure audit entry is written first); 429 when the key
            store refuses to record the usage.
    """
    client_ip = request.client.host if request.client else ""
    path = str(request.url.path)

    validated = _key_store.validate(x_api_key)
    if validated is None:
        _audit.log_auth_failure(
            ip=client_ip,
            endpoint=path,
            reason="invalid_api_key",
        )
        raise HTTPException(status_code=401, detail="Invalid API key")

    key_rec, tenant = validated
    usage_recorded = _key_store.record_usage(x_api_key)
    if not usage_recorded:
        raise HTTPException(status_code=429, detail="Monthly quota exceeded")

    if cfg.audit_log_enabled:
        # NOTE(review): the raw key is handed to the audit logger — confirm
        # it is masked or hashed before being persisted.
        _audit.log_request(
            api_key=x_api_key,
            endpoint=path,
            tenant_id=tenant.tenant_id,
            ip=client_ip,
        )
    return key_rec, tenant
137
+
138
def require_role(required: Role):
    """Build a dependency that authenticates the caller and checks a permission.

    Args:
        required: Role whose mapped permission string the caller's key role
            must hold.

    Returns:
        An async dependency returning the ``(key_record, tenant)`` pair, or
        raising a 403 ``HTTPException`` when the RBAC check fails.
    """
    async def dep(auth=Depends(require_api_key)):
        key_rec, tenant = auth
        needed = _endpoint_permission(required)
        if RBAC.has_permission(key_rec.role, needed):
            return key_rec, tenant
        raise HTTPException(
            status_code=403,
            detail=f"Role '{key_rec.role}' lacks required permission.",
        )

    return dep
145
+
146
def _endpoint_permission(role: Role) -> str:
    """Map a role to the permission string used for endpoint checks.

    Roles without an entry in the table fall back to ``"search:read"``.
    """
    permission_by_role = {
        Role.VIEWER: "search:read",
        Role.DEVELOPER: "agent:read",
        Role.ADMIN: "indices:write",
        Role.OWNER: "tenant:manage",
    }
    if role in permission_by_role:
        return permission_by_role[role]
    return "search:read"
153
+
154
+ # ══════════════════════════════════════════════════════════════════════════
155
+ # Search endpoints
156
+ # ══════════════════════════════════════════════════════════════════════════
157
+
158
@app.post("/search", response_model=SearchResponse, tags=["Search"])
async def search(req: SearchRequest, auth=Depends(require_api_key)) -> SearchResponse:
    """Run a retrieval query and attach contradiction explanations to results.

    Each result's ``contradiction_flags`` is set to the explanations of every
    detected contradiction that names its chunk (an empty list when none do).
    The response also reports the handler's wall-clock latency in
    milliseconds.
    """
    started = time.perf_counter()
    hits = await _retriever.search(req.query, top_k=req.top_k, filters=req.filters)

    # A contradiction links two chunks; file its explanation under both ids.
    explanations_by_chunk: dict[str, list[str]] = {}
    for flag in _contradiction.detect(hits):
        explanations_by_chunk.setdefault(flag.chunk_id_a, []).append(flag.explanation)
        explanations_by_chunk.setdefault(flag.chunk_id_b, []).append(flag.explanation)

    for hit in hits:
        hit.contradiction_flags = explanations_by_chunk.get(hit.chunk_id, [])

    elapsed_ms = round((time.perf_counter() - started) * 1000, 1)
    return SearchResponse(
        query=req.query,
        results=hits,
        total_found=len(hits),
        latency_ms=elapsed_ms,
    )
179
+
180
@app.get("/search/stream", tags=["Search"])
async def search_stream(query: str, top_k: int = 10, auth=Depends(require_api_key)):
    """Stream search results as server-sent events.

    The search is awaited in full first; each result is then emitted as a
    ``result`` event carrying its JSON dump, followed by one ``done`` event.
    """
    async def event_source() -> AsyncIterator[dict]:
        hits = await _retriever.search(query=query, top_k=top_k)
        for hit in hits:
            payload = hit.model_dump_json()
            yield {"event": "result", "data": payload}
            # Zero-length sleep yields control to the event loop between events.
            await asyncio.sleep(0)
        yield {"event": "done", "data": "[DONE]"}

    return EventSourceResponse(event_source())
190
+
191
+ # ══════════════════════════════════════════════════════════════════════════
192
+ # Agent / RAG endpoints
193
+ # ══════════════════════════════════════════════════════════════════════════
194
+
195
@app.post("/agent/query", response_model=AgentResponse, tags=["Agent"])
async def agent_query(req: SearchRequest, auth=Depends(require_api_key)) -> AgentResponse:
    """Answer a query from retrieved evidence, then check the answer.

    Steps: retrieve, build context and citations, generate with the LLM
    gateway, verify the answer's claims, and check the answer against the
    retrieved evidence for contradictions. When retrieval returns nothing, a
    fixed "no information" response is returned without calling the LLM.
    """
    question = req.query
    evidence = await _retriever.search(question, top_k=req.top_k, filters=req.filters)
    if not evidence:
        return AgentResponse(
            query=question,
            answer="No relevant information found.",
            citations=[],
            latency_ms=0.0,
        )

    context, citations = _context_builder.build(evidence, question)
    resp = await _llm.generate(question, context, citations, evidence)

    # Post-generation fact verification: unverified claims replace the flag
    # list and the answer is swapped for its annotated form.
    verifications = await _fact_verifier.verify(resp.answer, evidence)
    unverified = [item["claim"] for item in verifications if not item["verified"]]
    if unverified:
        resp.answer = _fact_verifier.annotate_answer(resp.answer, verifications)
        resp.contradiction_flags = unverified

    # Contradiction check of the (possibly annotated) answer vs. the evidence.
    # NOTE(review): assumes resp.contradiction_flags is already a list when no
    # claim was unverified — confirm the AgentResponse default.
    conflicts = _contradiction.verify_answer(resp.answer, evidence)
    if conflicts:
        resp.contradiction_flags.extend([conflict.explanation for conflict in conflicts])

    return resp
221
+
222
@app.post("/agent/orchestrate", tags=["Agent"])
async def agent_orchestrate(
    req: SearchRequest,
    session_id: str | None = None,
    mode: str = "async",
    auth=Depends(require_api_key),
) -> dict:
    """Run the query through the multi-agent orchestrator (M07).

    Args:
        req: Search request; its ``query`` and ``top_k`` are forwarded.
        session_id: Optional session identifier passed to the orchestrator.
        mode: Execution mode string passed through unchanged.
        auth: Result of API-key authentication; required for access only.

    Returns:
        Whatever the orchestrator's ``run`` call returns.
    """
    # The token budget comes from the settings this app was created with, so
    # an app built with a custom ``cfg`` does not silently use the global one.
    return await _orchestrator.run(
        query=req.query,
        session_id=session_id,
        top_k=req.top_k,
        mode=mode,
        token_budget=cfg.agent_token_budget,
    )
241
+
242
@app.get("/agent/stream", tags=["Agent"])
async def agent_stream(query: str, top_k: int = 10, auth=Depends(require_api_key)):
    """Stream LLM output as server-sent events.

    Retrieval and context building finish before the response is created.
    Each streamed item is then sent as a ``token`` event, followed by one
    ``done`` event.
    """
    hits = await _retriever.search(query=query, top_k=top_k)
    prompt_context, _citations = _context_builder.build(hits, query)

    async def emit_tokens():
        # NOTE(review): only the built context reaches generate_stream, not
        # the query itself — confirm the context embeds the question.
        async for piece in _llm.generate_stream(prompt_context):
            yield {"event": "token", "data": piece}
        yield {"event": "done", "data": "[DONE]"}

    return EventSourceResponse(emit_tokens())
252
+
253
+ # ══════════════════════════════════════════════════════════════════════════
254
+ # Crawler endpoints
255
+ # ══════════════════════════════════════════════════════════════════════════
256
+
257
@app.post("/crawl/url", response_model=CrawlResponse, tags=["Crawler"])
async def crawl_url(req: CrawlRequest, auth=Depends(require_api_key)) -> CrawlResponse:
    """Crawl one URL on demand, then score, redact and index its content.

    Returns:
        A ``CrawlResponse`` with status ``"failed"`` when the crawler returns
        nothing, otherwise ``"indexed"`` with the chunk count and the
        region/shard the document routes to.

    Raises:
        HTTPException: 400 when the trust engine reports the URL as blocked.
    """
    if _trust.is_blocked(req.url):
        raise HTTPException(status_code=400, detail="URL domain is blocked.")

    result = await _crawler.crawl_single(req.url)
    if result is None:
        return CrawlResponse(url=req.url, status="failed", message="Could not extract content.")

    doc, chunks = result
    trust_score = _trust.score(req.url)
    doc.metadata["trust_score"] = trust_score

    for chunk in chunks:
        # Stamp the trust score on every chunk regardless of the PII setting;
        # it was previously assigned only when PII detection was enabled.
        chunk.trust_score = trust_score
        if not cfg.pii_detection_enabled:
            continue
        # Redact before indexing and leave an audit trail when PII was found.
        redacted, findings = _pii.redact(chunk.text)
        if findings:
            chunk.text = redacted
            _audit.log(AuditEvent(
                event_type=AuditEventType.PII_DETECTED,
                resource_id=chunk.chunk_id,
                details={"findings": len(findings), "url": req.url},
            ))

    count = await _index.add_documents([(doc, chunks)])
    _kg.ingest_document(doc, chunks)
    _retention.register(doc.doc_id, doc.url, doc.crawled_at)

    # Shard routing info, reported back to the caller.
    region_id, shard = _shard_router.route(doc.doc_id)

    return CrawlResponse(
        url=req.url, status="indexed",
        message=f"Indexed {count} chunks → region={region_id}, shard={shard}",
    )
295
+
296
@app.post("/crawl/batch", tags=["Crawler"])
async def crawl_batch(urls: list[str], auth=Depends(require_api_key)) -> dict:
    """Crawl, score, redact and index up to 50 URLs in one call.

    URLs the trust engine reports as blocked are skipped rather than crawled,
    matching the single-URL endpoint's block check.

    Returns:
        A dict with ``status``, ``urls_attempted`` (size of the request),
        ``urls_blocked`` (number skipped) and ``chunks_indexed``.

    Raises:
        HTTPException: 400 when more than 50 URLs are supplied.
    """
    if len(urls) > 50:
        raise HTTPException(status_code=400, detail="Max 50 URLs per batch call.")

    allowed = [url for url in urls if not _trust.is_blocked(url)]
    docs = await _crawler.crawl_urls(allowed)

    enriched = []
    for doc, chunks in docs:
        ts = _trust.score(doc.url)
        doc.metadata["trust_score"] = ts
        for chunk in chunks:
            # Stamp the trust score on every chunk regardless of the PII
            # setting; it was previously assigned only when PII detection
            # was enabled.
            chunk.trust_score = ts
            if not cfg.pii_detection_enabled:
                continue
            redacted, findings = _pii.redact(chunk.text)
            chunk.text = redacted
            if findings:
                # Same audit trail the single-URL endpoint leaves.
                _audit.log(AuditEvent(
                    event_type=AuditEventType.PII_DETECTED,
                    resource_id=chunk.chunk_id,
                    details={"findings": len(findings), "url": doc.url},
                ))
        enriched.append((doc, chunks))

    total = await _index.add_documents(enriched)
    for doc, chunks in enriched:
        _kg.ingest_document(doc, chunks)
        _retention.register(doc.doc_id, doc.url, doc.crawled_at)

    return {
        "status": "ok",
        "urls_attempted": len(urls),
        "urls_blocked": len(urls) - len(allowed),
        "chunks_indexed": total,
    }
317
+
318
+ # ══════════════════════════════════════════════════════════════════════════
319
+ # Knowledge Graph endpoints
320
+ # ══════════════════════════════════════════════════════════════════════════
321
+
322
@app.get("/kg/search", tags=["Knowledge Graph"])
async def kg_search(q: str, limit: int = 10, auth=Depends(require_api_key)) -> dict:
    """Search Knowledge Graph entities for ``q``.

    Returns:
        A dict with the original ``query``, the matching ``entities`` and
        their ``total`` count.
    """
    matches = _kg.search_entities(q, limit=limit)
    payload = {"query": q, "entities": matches}
    payload["total"] = len(matches)
    return payload
327
+
328
@app.post("/kg/query", tags=["Knowledge Graph"])
async def kg_query(
    source_type: str | None = None,
    relation: str | None = None,
    target_type: str | None = None,
    limit: int = 50,
    auth=Depends(require_api_key),
) -> dict:
    """Query the Knowledge Graph by source type, relation and target type.

    All three filters are optional and are forwarded unchanged, together
    with ``limit``, to the graph's pattern query.

    Returns:
        A dict with the matching ``results`` and their ``total`` count.
    """
    rows = _kg.query_pattern(source_type, relation, target_type, limit)
    row_count = len(rows)
    return {"results": rows, "total": row_count}
339
+
340
@app.get("/kg/entity/{entity_id}", tags=["Knowledge Graph"])
async def kg_entity(entity_id: str, auth=Depends(require_api_key)) -> dict:
    """Return the entity graph stored for ``entity_id``.

    Raises:
        HTTPException: 404 when the Knowledge Graph returns ``None``.
    """
    graph = _kg.get_entity_graph(entity_id)
    if graph is not None:
        return graph
    raise HTTPException(status_code=404, detail="Entity not found")
347
+
348
@app.get("/kg/stats", tags=["Knowledge Graph"])
async def kg_stats(auth=Depends(require_api_key)) -> dict:
    """Report the Knowledge Graph's node and edge counts."""
    node_total = _kg.node_count
    edge_total = _kg.edge_count
    return {"nodes": node_total, "edges": edge_total}
351
+
352
+ # ══════════════════════════════════════════════════════════════════════════
353
+ # Trust & feedback
354
+ # ══════════════════════════════════════════════════════════════════════════
355
+
356
@app.post("/feedback", tags=["Trust"])
async def feedback(url: str, positive: bool, auth=Depends(require_api_key)) -> dict:
    """Record positive or negative feedback for a URL with the trust engine.

    Returns:
        An acknowledgement echoing the submitted ``url`` and ``positive``.
    """
    _trust.register_feedback(url, positive)
    acknowledgement = {"status": "ok"}
    acknowledgement["url"] = url
    acknowledgement["positive"] = positive
    return acknowledgement
360
+
361
+ # ══════════════════════════════════════════════════════════════════════════
362
+ # Compliance endpoints
363
+ # ══════════════════════════════════════════════════════════════════════════
364
+
365
@app.post("/compliance/deletion", tags=["Compliance"])
async def request_deletion(identifier: str, reason: str = "gdpr_erasure", auth=Depends(require_api_key)) -> dict:
    """Submit an erasure request (GDPR Art. 17) to the retention policy.

    Returns:
        The retention policy's result for the request, unchanged.
    """
    # NOTE(review): the identifier is not scoped to the caller's tenant here —
    # confirm the retention policy enforces ownership.
    ticket = _retention.request_deletion(identifier, reason)
    return ticket
369
+
370
@app.get("/compliance/audit", tags=["Compliance"])
async def get_audit_log(limit: int = 100, auth=Depends(require_api_key)) -> dict:
    """Return audit events for the calling tenant (SOC 2 CC7.2).

    Returns:
        A dict with the queried ``events`` and their ``total`` count.
    """
    _key_rec, tenant = auth
    tenant_events = _audit.query(tenant_id=tenant.tenant_id, limit=limit)
    return {"events": tenant_events, "total": len(tenant_events)}
376
+
377
@app.get("/compliance/checklist", tags=["Compliance"])
async def compliance_checklist(auth=Depends(require_api_key)) -> dict:
    """Run the compliance checker and return its report unchanged."""
    report = _compliance.run()
    return report
381
+
382
@app.get("/compliance/retention", tags=["Compliance"])
async def retention_status(auth=Depends(require_api_key)) -> dict:
    """Return the retention policy's per-tier document breakdown."""
    tier_breakdown = _retention.docs_by_tier()
    return tier_breakdown
386
+
387
+ # ══════════════════════════════════════════════════════════════════════════
388
+ # Multi-region endpoints
389
+ # ══════════════════════════════════════════════════════════════════════════
390
+
391
@app.get("/regions", tags=["Multi-Region"])
async def list_regions(auth=Depends(require_api_key)) -> dict:
    """Return the replication manager's status report unchanged."""
    replication_status = _replication.status()
    return replication_status
394
+
395
@app.get("/regions/route", tags=["Multi-Region"])
async def route_region(lat: float | None = None, lon: float | None = None, auth=Depends(require_api_key)) -> dict:
    """Ask the geo router for a region and describe the result.

    Args:
        lat: Optional latitude forwarded to the geo router.
        lon: Optional longitude forwarded to the geo router.

    Returns:
        A dict with the chosen region's id (as ``routed_to``), name,
        endpoint and average latency.
    """
    target = _geo_router.route(lat, lon)
    # (response key, region attribute) pairs, in response order.
    field_map = (
        ("routed_to", "region_id"),
        ("name", "name"),
        ("endpoint", "endpoint"),
        ("avg_latency_ms", "avg_latency_ms"),
    )
    return {key: getattr(target, attr) for key, attr in field_map}
404
+
405
+ # ══════════════════════════════════════════════════════════════════════════
406
+ # Indices management (enterprise)
407
+ # ══════════════════════════════════════════════════════════════════════════
408
+
409
@app.post("/indices/update", tags=["Indices"])
async def indices_update(
    doc: dict,
    auth=Depends(require_role(Role.ADMIN)),
) -> dict:
    """Push a document directly into the index and Knowledge Graph.

    Access requires a key whose role passes the ``Role.ADMIN`` permission
    check.

    Args:
        doc: Field values used to construct a ``DocumentRecord``.
        auth: Result of the role-checked authentication dependency.

    Returns:
        A dict with ``status``, ``chunks_indexed`` and the ``doc_id``.

    Raises:
        HTTPException: 422 when the payload cannot be turned into a
            ``DocumentRecord``.
    """
    from bie.models import DocumentRecord
    from bie.crawler import TextChunker

    # A malformed payload is a client error, not a server fault.
    try:
        record = DocumentRecord(**doc)
    except (TypeError, ValueError) as exc:
        raise HTTPException(
            status_code=422, detail=f"Invalid document payload: {exc}"
        ) from exc

    chunks = TextChunker(chunk_size=cfg.chunk_size).chunk(record)

    # Apply the same PII redaction the crawl endpoints perform, so direct
    # pushes cannot bypass it.
    # NOTE(review): unlike the crawl endpoints, no retention entry is
    # registered for pushed documents — confirm whether one is needed.
    if cfg.pii_detection_enabled:
        for chunk in chunks:
            redacted, findings = _pii.redact(chunk.text)
            if findings:
                chunk.text = redacted
                _audit.log(AuditEvent(
                    event_type=AuditEventType.PII_DETECTED,
                    resource_id=chunk.chunk_id,
                    details={"findings": len(findings), "doc_id": record.doc_id},
                ))

    count = await _index.add_documents([(record, chunks)])
    _kg.ingest_document(record, chunks)
    return {"status": "ok", "chunks_indexed": count, "doc_id": record.doc_id}
422
+
423
+ # ══════════════════════════════════════════════════════════════════════════
424
+ # Operations
425
+ # ══════════════════════════════════════════════════════════════════════════
426
+
427
@app.get("/metrics", tags=["Operations"])
async def metrics(auth=Depends(require_api_key)) -> dict:
    """Report usage, index, graph and uptime figures for the calling tenant.

    Returns:
        A dict combining the tenant's id and tier, the key's monthly request
        count and quota status, index and Knowledge Graph sizes, the audit
        event count, the configured region and process uptime in seconds.
    """
    key_rec, tenant = auth
    quota = _key_store.quota_status(key_rec.api_key)
    uptime = round(time.time() - _start_time, 1)
    return {
        "tenant_id": tenant.tenant_id,
        "tier": tenant.tier.value,
        "api_requests_used": key_rec.requests_this_month,
        "quota": quota,
        "index_docs": _index.doc_count,
        "index_chunks": _index.chunk_count,
        "kg_nodes": _kg.node_count,
        "kg_edges": _kg.edge_count,
        "audit_events": _audit.count,
        # Region comes from the settings this app was created with, not the
        # module-level default.
        "region": cfg.region,
        "uptime_seconds": uptime,
    }
444
+
445
@app.get("/health", response_model=HealthResponse, tags=["Operations"])
async def health() -> HealthResponse:
    """Report liveness, index size and uptime; no authentication required."""
    indexed_docs = _index.doc_count
    uptime = round(time.time() - _start_time, 1)
    return HealthResponse(status="ok", index_size=indexed_docs, uptime_seconds=uptime)
453
+
454
+ return app
455
+
456
+
457
# Module-level application instance, built at import time with create_app()'s
# default configuration (the module-level settings).
+ app = create_app()