mcp-agentic-pipelines 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +93 -0
- package/README.md +258 -0
- package/package.json +70 -0
- package/packages/clinical/package.json +22 -0
- package/packages/clinical/src/index.ts +262 -0
- package/packages/clinical/tsconfig.json +13 -0
- package/packages/core/package.json +21 -0
- package/packages/core/src/config.ts +138 -0
- package/packages/core/src/errors.ts +100 -0
- package/packages/core/src/index.ts +104 -0
- package/packages/core/src/llm-config.ts +213 -0
- package/packages/core/src/logging.ts +66 -0
- package/packages/core/src/python-bridge.ts +384 -0
- package/packages/core/src/rate-limiter.ts +136 -0
- package/packages/core/src/types.ts +203 -0
- package/packages/core/src/validation.ts +101 -0
- package/packages/core/tsconfig.json +10 -0
- package/packages/deeppipe/package.json +21 -0
- package/packages/deeppipe/src/index.ts +424 -0
- package/packages/deeppipe/tsconfig.json +13 -0
- package/packages/piste/package.json +20 -0
- package/packages/piste/src/index.ts +48 -0
- package/packages/piste/tsconfig.json +13 -0
- package/packages/precis/package.json +20 -0
- package/packages/precis/src/index.ts +67 -0
- package/packages/precis/tsconfig.json +13 -0
- package/packages/server/package.json +31 -0
- package/packages/server/src/index.ts +427 -0
- package/packages/server/tsconfig.json +17 -0
- package/setup.mjs +141 -0
- package/test.mjs +337 -0
- package/vendors/clinical-intake/pipeline.mjs +349 -0
- package/vendors/clinical-intake/questions/en.txt +9 -0
- package/vendors/clinical-intake/questions/fr.txt +9 -0
- package/vendors/piste/.env.example +73 -0
- package/vendors/piste/app/core/__init__.py +4 -0
- package/vendors/piste/app/core/config.py +83 -0
- package/vendors/piste/app/core/debuglog.py +16 -0
- package/vendors/piste/app/core/middleware.py +40 -0
- package/vendors/piste/bridge_piste.py +301 -0
- package/vendors/piste/pipeline/__init__.py +4 -0
- package/vendors/piste/pipeline/compiler.py +68 -0
- package/vendors/piste/pipeline/offline/__init__.py +28 -0
- package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
- package/vendors/piste/pipeline/replay.py +15 -0
- package/vendors/piste/pipeline/replay_engine.py +249 -0
- package/vendors/piste/pipeline/signatures/__init__.py +4 -0
- package/vendors/piste/pipeline/signatures/signatures.py +136 -0
- package/vendors/piste/pipeline/stage1/__init__.py +21 -0
- package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
- package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
- package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
- package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
- package/vendors/piste/pipeline/stage2/__init__.py +34 -0
- package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
- package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
- package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
- package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
- package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
- package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
- package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
- package/vendors/piste/pipeline/stage3/__init__.py +20 -0
- package/vendors/piste/pipeline/stage3/classifier.py +79 -0
- package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
- package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
- package/vendors/piste/pipeline/stage4/__init__.py +33 -0
- package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
- package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
- package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
- package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
- package/vendors/piste/requirements.txt +53 -0
- package/vendors/precis/backend/__init__.py +6 -0
- package/vendors/precis/backend/agents/__init__.py +3 -0
- package/vendors/precis/backend/agents/data_synthesis.py +105 -0
- package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
- package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
- package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
- package/vendors/precis/backend/agents/guardrail.py +175 -0
- package/vendors/precis/backend/agents/query_expander.py +89 -0
- package/vendors/precis/backend/agents/radial_interpol.py +99 -0
- package/vendors/precis/backend/agents/report_generator.py +92 -0
- package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
- package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
- package/vendors/precis/backend/agents/vector_index.py +123 -0
- package/vendors/precis/backend/agents/veri_score.py +341 -0
- package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
- package/vendors/precis/backend/api/__init__.py +3 -0
- package/vendors/precis/backend/api/routes/__init__.py +3 -0
- package/vendors/precis/backend/config.py +88 -0
- package/vendors/precis/backend/core/__init__.py +13 -0
- package/vendors/precis/backend/core/hashing.py +22 -0
- package/vendors/precis/backend/core/metrics.py +77 -0
- package/vendors/precis/backend/core/multitoken.py +166 -0
- package/vendors/precis/backend/core/pmi.py +54 -0
- package/vendors/precis/backend/core/stemming.py +74 -0
- package/vendors/precis/backend/core/tracing.py +150 -0
- package/vendors/precis/backend/data/__init__.py +3 -0
- package/vendors/precis/backend/data/chunker.py +57 -0
- package/vendors/precis/backend/data/pdf_parser.py +42 -0
- package/vendors/precis/backend/db/__init__.py +3 -0
- package/vendors/precis/backend/db/models.py +173 -0
- package/vendors/precis/backend/db/repository.py +269 -0
- package/vendors/precis/backend/llm/__init__.py +3 -0
- package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
- package/vendors/precis/backend/llm/base.py +147 -0
- package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
- package/vendors/precis/backend/llm/factory.py +60 -0
- package/vendors/precis/backend/llm/google_provider.py +39 -0
- package/vendors/precis/backend/llm/ollama_provider.py +54 -0
- package/vendors/precis/backend/llm/openai_provider.py +50 -0
- package/vendors/precis/backend/main.py +677 -0
- package/vendors/precis/backend/orchestrator/__init__.py +3 -0
- package/vendors/precis/backend/orchestrator/planner.py +81 -0
- package/vendors/precis/backend/orchestrator/router.py +319 -0
- package/vendors/precis/backend/orchestrator/types.py +58 -0
- package/vendors/precis/bridge_precis.py +185 -0
- package/vendors/precis/data/sample_reports/README.md +8 -0
- package/vendors/precis/data/seed_data.py +115 -0
- package/vendors/precis/requirements.txt +19 -0
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Dict, List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class TrainingNode:
    """A single training sample held by ``RadialInterpolPredictor``."""

    # Feature vector; ``RadialInterpolPredictor.fit`` stores the min-max scaled row here.
    beta: np.ndarray
    # Target value observed for this sample.
    response: float
    # Caller-supplied annotations, echoed back in prediction contributions.
    metadata: Dict = field(default_factory=dict)


@dataclass
class PredictionResult:
    """Predicted value bundled with a confidence figure and contributing nodes.

    NOTE(review): nothing in this module constructs this type; it is presumably
    consumed by callers elsewhere in the package - confirm before changing it.
    """

    predicted_value: float
    confidence: float
    contributing_nodes: List[Dict]
    metadata: Dict = field(default_factory=dict)


class RadialInterpolPredictor:
    """RBF closed-form predictor.

    The prediction is the kernel-weighted mean of the stored responses::

        f_pred(x) = Σ_k w_k(x)·f(β_k) / Σ_k w_k(x),   w_k(x) = exp[-τ·K(x, β_k)]

    with ``K(x, β) = γ·‖x − β‖²`` evaluated on features that ``fit`` min-max
    scales per column. Nodes whose weight does not exceed ``1e-15`` are ignored.
    """

    def __init__(self, tau: float = 500.0, gamma: float = 1.0) -> None:
        """Store the kernel sharpness ``tau`` and distance scale ``gamma``."""
        self.tau = tau
        self.gamma = gamma
        self.nodes: List[TrainingNode] = []
        self.n: int = 0  # number of stored nodes
        self.m: int = 0  # number of feature columns seen by ``fit``
        self._X_min: Optional[np.ndarray] = None
        self._X_max: Optional[np.ndarray] = None
        # Per-node count of predictions in which the node carried weight.
        self._node_access_counts: np.ndarray = np.array([])
        self._total_predictions: int = 0

    def fit(self, X: np.ndarray, y: np.ndarray, metadata: Optional[List[Dict]] = None) -> None:
        """Replace the stored nodes with one node per row of ``X``.

        Parameters
        ----------
        X : 2-D array of features, one row per sample.
        y : responses, indexable in step with the rows of ``X``.
        metadata : optional per-row dicts; when omitted (or empty) every node
            receives its own fresh empty dict.
        """
        X = np.asarray(X)
        # A refit starts a fresh usage history; otherwise ``auto_distill`` would
        # judge the new nodes against predictions made on the old ones.
        self._total_predictions = 0

        if X.size == 0:
            # Nothing to learn from: fall back to the unfitted state.
            self._X_min = None
            self._X_max = None
            self.nodes = []
            self.n = 0
            self.m = X.shape[1] if X.ndim == 2 else 0
            self._node_access_counts = np.zeros(0)
            return

        self._X_min = X.min(axis=0)
        self._X_max = X.max(axis=0)
        X_t = self._phi_transform(X)
        # One dict per node: ``[{}] * n`` would alias a single dict across all nodes.
        meta_list = metadata if metadata else [{} for _ in range(len(X))]
        self.nodes = [
            TrainingNode(beta=X_t[i], response=y[i], metadata=meta_list[i])
            for i in range(len(X))
        ]
        self.n = len(self.nodes)
        self.m = X.shape[1]
        self._node_access_counts = np.zeros(self.n)

    def predict(self, x: np.ndarray, top_k: int = 20,
                trace=None) -> Tuple[float, List[Dict]]:
        """Predict the response at ``x``.

        Returns ``(prediction, contributions)`` where ``contributions`` lists the
        ``top_k`` heaviest nodes (index, weight, kernel distance, response,
        metadata) in descending weight order. Returns ``(0.0, [])`` when the
        predictor holds no nodes or no node weight exceeds ``1e-15``.

        ``trace``, when truthy, receives one ``event(...)`` call describing the
        prediction.
        """
        if self.n == 0:
            return 0.0, []

        x_t = self._phi_transform(np.asarray(x).reshape(1, -1))[0]
        contributions: List[Dict] = []
        w_sum = 0.0
        w_norm = 0.0
        for i, node in enumerate(self.nodes):
            K = self._kernel(x_t, node.beta)
            w = float(np.exp(-self.tau * K))
            if w <= 1e-15:
                continue  # numerically irrelevant node
            w_sum += w * node.response
            w_norm += w
            self._node_access_counts[i] += 1
            contributions.append({"node_idx": i, "weight": w, "kernel_distance": float(K),
                                  "response": float(node.response), "metadata": node.metadata})

        if w_norm == 0:
            return 0.0, []

        # Cast so callers get a plain float rather than a NumPy scalar.
        f_pred = float(w_sum / w_norm)
        self._total_predictions += 1
        contributions.sort(key=lambda c: c["weight"], reverse=True)
        if trace:
            trace.event(type("TE", (), {"value": "decision.prediction"})(), agent_name="RadialInterpol",
                        message=f"Predicted {f_pred:.4f} from {len(contributions)} active nodes",
                        data={"top_weight": contributions[0]["weight"] if contributions else 0})
        return f_pred, contributions[:top_k]

    def _kernel(self, x: np.ndarray, beta: np.ndarray) -> float:
        """Return ``gamma`` times the squared Euclidean distance between ``x`` and ``beta``."""
        diff = x - beta
        return self.gamma * float(np.dot(diff, diff))

    def _phi_transform(self, X: np.ndarray) -> np.ndarray:
        """Min-max scale ``X`` with the bounds recorded by ``fit``.

        Returns ``X`` unchanged before the first fit. Columns that were constant
        in the training data are divided by 1 instead of 0.
        """
        if self._X_min is None or self._X_max is None:
            return X
        span = self._X_max - self._X_min  # fresh array; stored bounds stay untouched
        span[span == 0] = 1.0
        return (X - self._X_min) / span

    def auto_distill(self, min_access: int = 0) -> int:
        """Drop nodes accessed fewer than ``min_access`` times; return how many were removed.

        Does nothing (returns 0) until at least one prediction has been made
        since the last ``fit``.
        """
        if self._total_predictions == 0:
            return 0
        keep = self._node_access_counts >= min_access
        removed = self.n - int(np.sum(keep))
        self.nodes = [node for node, kept in zip(self.nodes, keep) if kept]
        self._node_access_counts = self._node_access_counts[keep]
        self.n = len(self.nodes)
        return removed

    def get_weights_distribution(self, x: np.ndarray) -> np.ndarray:
        """Return the node weights at ``x``, normalised to sum to 1 when their sum is positive."""
        if self.n == 0:
            return np.array([])
        x_t = self._phi_transform(np.asarray(x).reshape(1, -1))[0]
        weights = np.array([np.exp(-self.tau * self._kernel(x_t, node.beta)) for node in self.nodes])
        total = weights.sum()
        return weights / total if total > 0 else weights
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class ReportSection:
    """One titled section of a report; every field has a default."""

    # Identifier and heading of the section.
    section_id: str = ""
    title: str = ""
    # Body text of the section.
    content: str = ""
    # Citation records attached to the section (a fresh list per instance).
    citations: List[Dict[str, Any]] = field(default_factory=list)
    # Presumably the name of the agent that produced the section - confirm with callers.
    agent_source: str = ""
    # Relevance weight; defaults to 1.0.
    relevance: float = 1.0
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ReportGenerator:
    """Assembles agent results into a structured report with citations."""

    def __init__(self) -> None:
        pass

    async def generate(self, query: str, agent_results: List[Any],
                       veriscore_report: Optional[Any] = None,
                       guardrail_result: Optional[Any] = None) -> dict:
        """Build the report dict for ``query``.

        Each item of ``agent_results`` must expose ``success`` and ``agent_name``;
        failed items additionally ``error_message``, successful ones ``data`` and
        optionally ``citations`` (an iterable of dicts).

        Returns a dict with the keys ``query``, ``sections``, ``citations``,
        ``evaluation``, ``guardrail`` and ``generated_at`` (ISO timestamp).
        Every section carries ``title``, ``content``, ``agent_source`` and
        ``citations`` (empty for failed agents).
        """
        sections: List[Dict[str, Any]] = []
        all_citations: List[Dict[str, Any]] = []

        for result in agent_results:
            if not result.success:
                # Keep the section shape uniform so consumers can always read "citations".
                sections.append({"title": f"Agent: {result.agent_name}",
                                 "content": f"Error: {result.error_message}",
                                 "agent_source": result.agent_name,
                                 "citations": []})
                continue

            # ``citations`` may be absent or explicitly None on a result object.
            citations = getattr(result, "citations", None) or []
            all_citations.extend({**citation, "agent_source": result.agent_name}
                                 for citation in citations)

            sections.append({"title": f"Findings from {result.agent_name}",
                             "content": self._render_content(result.data or {}),
                             "agent_source": result.agent_name,
                             "citations": citations})

        evaluation_summary = None
        if veriscore_report:
            evaluation_summary = {
                "relevancy": getattr(veriscore_report, "relevancy_score", 0),
                "trust": getattr(veriscore_report, "trustworthiness_score", 0),
                "hallucination_rate": getattr(veriscore_report, "hallucination_rate", 0),
                "citation_coverage": getattr(veriscore_report, "citation_coverage", 0),
            }

        guardrail_summary = None
        if guardrail_result:
            guardrail_summary = {
                "action": getattr(guardrail_result, "action", "pass"),
                "issues": getattr(guardrail_result, "issues_found", []),
            }

        return {"query": query, "sections": sections, "citations": all_citations,
                "evaluation": evaluation_summary, "guardrail": guardrail_summary,
                "generated_at": datetime.now().isoformat()}

    @staticmethod
    def _render_content(data: Any) -> str:
        """Turn one agent's structured ``data`` into human-readable text.

        Dicts are rendered from their ``synthesis``, ``results``, ``prediction``
        and ``flags`` entries; anything else is stringified and cut to 1000
        characters.
        """
        if not isinstance(data, dict):
            return str(data)[:1000]

        parts: List[str] = []
        synth = data.get("synthesis", "")
        if synth:
            parts.append(str(synth))

        # ``or []`` tolerates keys that are present but set to None.
        for item in data.get("results") or []:
            if not isinstance(item, dict):
                continue
            txt = item.get("text", "")
            if not txt:
                continue
            line = str(txt)
            src = item.get("source", "")
            if src:
                line += f" [{src}]"
            score = item.get("score", "")
            # A score of 0 is a real score; only skip when it is missing.
            if score is not None and score != "":
                line += f" [score={score}, {item.get('match_type', '')}]"
            parts.append(line)

        pred = data.get("prediction")
        if pred is not None:
            parts.append(f"Prediction: {pred}")

        parts.extend(str(flag) for flag in data.get("flags") or [])
        return "\n".join(parts) if parts else "(no readable content)"
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
|
|
2
|
+
|
|
3
|
+
No embeddings, no vector DB — the LLM reads candidate text and scores relevance 0-100.
|
|
4
|
+
Only passes the best candidates to the synthesis step.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
8
|
+
from backend.llm.base import LLMProvider
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SemanticReRanker:
    """Uses LLM semantic understanding to filter hash-retrieved candidates by true relevance."""

    # At most this many candidates are shown to the LLM in one prompt.
    _MAX_PROMPT_CANDIDATES = 20
    # Rough completion budget per scored chunk ({"index", "score", "reason"} entry).
    _TOKENS_PER_CANDIDATE = 32

    def __init__(self, llm: "LLMProvider") -> None:
        self.llm = llm

    async def rerank(self, query: str, candidates: List[Dict[str, Any]],
                     top_k: int = 5) -> List[Dict[str, Any]]:
        """Score each candidate by semantic relevance to the query. Returns top_k.

        Each candidate: {"text": str, "source": str, "score": float, "page": int, ...}

        Candidates the LLM scores are annotated in place with ``semantic_score``
        and ``relevance_reason`` and ranked first; if fewer than ``top_k`` were
        scored (or the LLM call fails), the remainder is filled from the other
        candidates in descending original ``score`` order. With ``top_k`` or
        fewer candidates the input list is returned untouched and the LLM is
        not called.
        """
        import asyncio
        import logging

        if not candidates:
            return []
        if len(candidates) <= top_k:
            return candidates

        # Build a scoring prompt with numbered candidates (use surrounding context when available)
        shown = candidates[:self._MAX_PROMPT_CANDIDATES]
        items = []
        for i, c in enumerate(shown):
            # Prefer surrounding context (full paragraph) over short n-gram text
            text = c.get('surrounding', '') or c.get('sentence', '') or c.get('text', '')
            items.append(f"[{i}] {text[:300]}")

        prompt = f"""You are a precise relevance judge. Score each text chunk below for how well
it answers this query: "{query}"

For each chunk, give a score from 0-100:
90-100: Directly answers the query with specific facts
70-89: Related and useful context
40-69: Tangentially related
0-39: Not relevant

Text chunks:
{chr(10).join(items)}

Return ONLY a JSON array: [{{"index": 0, "score": 85, "reason": "5 words"}}, ...]
Score ALL chunks. Be strict — only give high scores for truly relevant content."""

        # A fixed 200-token budget cannot hold 20 scored entries, which truncates
        # the JSON and forces the fallback; scale the budget with the prompt size.
        max_tokens = max(200, self._TOKENS_PER_CANDIDATE * len(items))
        try:
            response = await asyncio.wait_for(
                self.llm.generate(prompt, max_tokens=max_tokens, temperature=0.0),
                timeout=20,
            )
            entries = self._parse_score_entries(response)
        except Exception as exc:
            # Re-ranking is best-effort: any provider/timeout/parse failure
            # degrades to the original hash ordering, but is no longer silent.
            logging.getLogger(__name__).warning(
                "Semantic re-ranking failed, falling back to hash scores: %s", exc)
            entries = []

        # Map scores back to candidates
        scored: List[Dict[str, Any]] = []
        seen = set()
        for entry in entries:
            idx = self._as_index(entry.get("index"))
            # Only indices that were actually shown to the LLM are meaningful.
            if idx is None or not 0 <= idx < len(shown) or idx in seen:
                continue
            score = self._as_score(entry.get("score", 0))
            if score is None:
                continue
            seen.add(idx)
            candidates[idx]["semantic_score"] = score
            candidates[idx]["relevance_reason"] = entry.get("reason", "")
            scored.append(candidates[idx])

        # Sort by semantic score, then top up from the original hash ranking.
        scored.sort(key=lambda c: c["semantic_score"], reverse=True)
        if len(scored) < top_k:
            rest = [c for i, c in enumerate(candidates) if i not in seen]
            rest.sort(key=lambda c: c.get("score", 0), reverse=True)
            scored.extend(rest)
        return scored[:top_k]

    @staticmethod
    def _parse_score_entries(response: str) -> List[Dict[str, Any]]:
        """Extract the dict entries of the outermost JSON array found in ``response``."""
        import json

        start = response.find("[")
        end = response.rfind("]") + 1
        if start < 0 or end <= start:
            return []
        parsed = json.loads(response[start:end])
        if not isinstance(parsed, list):
            return []
        return [entry for entry in parsed if isinstance(entry, dict)]

    @staticmethod
    def _as_index(value: Any) -> Optional[int]:
        """Return ``value`` as an int index, or None when it is not a whole number."""
        if isinstance(value, bool):
            return None
        if isinstance(value, int):
            return value
        if isinstance(value, float) and value.is_integer():
            return int(value)
        if isinstance(value, str) and value.strip().isdigit():
            return int(value)
        return None

    @staticmethod
    def _as_score(value: Any) -> Optional[float]:
        """Return a numeric score (numbers pass through unchanged), or None if not numeric."""
        if isinstance(value, (int, float)):
            return value
        try:
            return float(value)
        except (TypeError, ValueError):
            return None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class DirectReader:
    """When hash search finds nothing, ask the LLM to directly read document text and answer."""

    def __init__(self, llm: "LLMProvider") -> None:
        self.llm = llm

    async def read_and_answer(self, query: str, doc_snippets: List[Dict[str, str]],
                              index) -> Dict[str, Any]:
        """Read document snippets directly and attempt to answer the query.

        doc_snippets: [{"text": "...", "source": "file.pdf", "page": 1}, ...]

        Only the first five snippets are used, each cut to 3000 characters.
        ``index`` is accepted for interface compatibility and is not used here.

        Always returns a dict with the keys ``found``, ``answer`` and
        ``citation``. If the LLM call itself fails, ``answer`` explains that the
        reading is unavailable; if the reply is not valid JSON, the raw reply
        (first 500 characters) is returned as the answer with ``found`` False.
        """
        import asyncio
        import json

        if not doc_snippets:
            return {"found": False, "answer": "No documents have been uploaded yet. Please upload a document to search.", "citation": ""}

        selected = doc_snippets[:5]
        doc_names = ", ".join(d.get("source", "unknown") for d in selected)

        # ``.get`` keeps a snippet with a missing key from raising before the
        # LLM is even asked (the names line above already tolerated that).
        context = "\n\n".join(
            f"[DOC: {d.get('source', 'unknown')}, page {d.get('page', 1)}]\n{str(d.get('text') or '')[:3000]}"
            for d in selected
        )

        prompt = f"""You are a precise document analyst. Read the document excerpts below
and answer this query: "{query}"

Documents available: {doc_names}

DOCUMENTS:
{context[:15000]}

INSTRUCTIONS:
- If the query mentions a section number (like "section 3.4" or just "3.4"), locate
ALL content under that section heading in the documents and extract the key points.
Section headings may appear as "3.4 Title", "§3.4", or just "3.4" followed by text.
- Summarize what that section actually says — do not just report that the heading exists.
- If the query asks for "key findings" or a "summary", provide the substantive content
even if the document doesn't explicitly label anything as "key findings."
- If you genuinely cannot find the section content anywhere in the provided text,
only then state that it's not found.

Return your answer in this JSON format:
{{"found": true/false, "answer": "your detailed answer", "citation": "document name, page X"}}"""

        try:
            response = await asyncio.wait_for(
                self.llm.generate(prompt, max_tokens=300, temperature=0.2),
                timeout=25,
            )
        except Exception as e:
            return {"found": False, "answer": f"(LLM reading unavailable: {e})", "citation": ""}

        text = response if isinstance(response, str) else str(response)
        start = text.find("{")
        end = text.rfind("}") + 1
        if start >= 0 and end > start:
            try:
                parsed = json.loads(text[start:end])
            except json.JSONDecodeError:
                # The model answered but the JSON is malformed (e.g. cut off by
                # the token limit); surface its text rather than "unavailable".
                parsed = None
            if isinstance(parsed, dict):
                parsed.setdefault("found", False)
                parsed.setdefault("answer", "")
                parsed.setdefault("citation", "")
                return parsed
        return {"found": False, "answer": text[:500], "citation": ""}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class AnomalyFlag:
    """One anomaly reported by ``StatAnomalyDetector``."""

    entity_id: str
    # Detector that raised the flag: "multi_entity" or "temporal_spike" in this module.
    flag_type: str
    # Detectors in this module cap this at 1.0.
    severity: float
    # Detector-specific supporting numbers.
    evidence: Dict[str, Any]
    timestamp: datetime
    description: str
    recommendations: List[str] = field(default_factory=list)


class StatAnomalyDetector:
    """Anomaly detector over ingested events: counterparty fan-out and temporal spikes.

    ``ingest`` also records jurisdictions (``geo_hash``) and raw values
    (``value_tracker``) per entity, but no detector in this class reads them.
    """

    def __init__(self) -> None:
        # entity -> counterparty -> number of events seen with that counterparty
        self.entity_hash: Dict[str, Dict[str, int]] = defaultdict(dict)
        # entity -> list of (timestamp, value) pairs
        self.temporal_hash: Dict[str, List[Tuple[datetime, float]]] = defaultdict(list)
        self.geo_hash: Dict[str, List[str]] = defaultdict(list)
        self.value_tracker: Dict[str, List[float]] = defaultdict(list)
        # Result of the most recent ``detect_all`` call.
        self.flags: List[AnomalyFlag] = []

    def ingest(self, entity_id: str, counterparty_id: Optional[str] = None,
               value: Optional[float] = None, timestamp: Optional[datetime] = None,
               jurisdiction: Optional[str] = None, metadata: Optional[Dict] = None) -> None:
        """Record one event for ``entity_id``.

        Each optional field updates its own tracker only when provided; the
        temporal tracker needs both ``timestamp`` and ``value``. ``metadata`` is
        accepted but not stored.
        """
        if counterparty_id:
            self.entity_hash[entity_id][counterparty_id] = self.entity_hash[entity_id].get(counterparty_id, 0) + 1
        if timestamp is not None and value is not None:
            self.temporal_hash[entity_id].append((timestamp, value))
        if jurisdiction:
            self.geo_hash[entity_id].append(jurisdiction)
        if value is not None:
            self.value_tracker[entity_id].append(value)

    def ingest_batch(self, events: List[Dict[str, Any]]) -> int:
        """Ingest every event dict that has an ``entity_id``; return how many were ingested.

        Events without an ``entity_id`` are skipped instead of being filed under
        the key ``None``.
        """
        fields = ("entity_id", "counterparty_id", "value", "timestamp", "jurisdiction", "metadata")
        ingested = 0
        for evt in events:
            if evt.get("entity_id") is None:
                continue
            self.ingest(**{k: evt.get(k) for k in fields})
            ingested += 1
        return ingested

    def detect_multi_entity_anomalies(self, max_relationships: int = 50) -> List[AnomalyFlag]:
        """Flag entities with more than ``max_relationships`` distinct counterparties.

        Severity is ``n / (2 * max_relationships)`` capped at 1.0 (1.0 outright
        when the threshold is not positive). Flags are returned most severe first.
        """
        flags = []
        scale = 2 * max_relationships
        for eid, counterparties in self.entity_hash.items():
            n = len(counterparties)
            if n <= max_relationships:
                continue
            # A non-positive threshold would otherwise divide by zero.
            severity = min(1.0, n / scale) if scale > 0 else 1.0
            flags.append(AnomalyFlag(entity_id=eid, flag_type="multi_entity",
                                     severity=severity,
                                     evidence={"n_counterparties": n, "counterparties": dict(counterparties)},
                                     timestamp=datetime.now(),
                                     description=f"Entity has {n} counterparties (threshold: {max_relationships})"))
        return sorted(flags, key=lambda f: f.severity, reverse=True)

    def detect_temporal_spikes(self, spike_threshold_sigma: float = 3.0,
                               window_hours: int = 1) -> List[AnomalyFlag]:
        """Flag time windows whose event count is unusually high for an entity.

        Events are bucketed into windows of ``window_hours`` hours (values below
        1 are treated as 1). For each entity with at least 10 events, a window is
        flagged when its count lies more than ``spike_threshold_sigma`` population
        standard deviations above the entity's mean window count. The flag's
        ``timestamp`` is the start of the window. Flags are returned most severe
        first.
        """
        from datetime import timedelta

        window = max(1, int(window_hours))
        flags = []
        for eid, events in self.temporal_hash.items():
            if len(events) < 10:
                continue
            buckets = defaultdict(list)
            for ts, amount in events:
                bucket_start = ts.replace(minute=0, second=0, microsecond=0)
                if window > 1:
                    # Align windows on a fixed grid counted in whole hours since
                    # ordinal day 0, so buckets do not depend on the first event.
                    hours = bucket_start.toordinal() * 24 + bucket_start.hour
                    bucket_start -= timedelta(hours=hours % window)
                buckets[bucket_start].append(amount)
            counts = [len(v) for v in buckets.values()]
            mean_c, std_c = float(np.mean(counts)), float(np.std(counts))
            if std_c == 0:
                continue  # perfectly even activity: nothing can stand out
            for hour, amounts in buckets.items():
                z = (len(amounts) - mean_c) / std_c
                if z <= spike_threshold_sigma:
                    continue
                severity = min(1.0, z / (2 * spike_threshold_sigma)) if spike_threshold_sigma > 0 else 1.0
                flags.append(AnomalyFlag(entity_id=eid, flag_type="temporal_spike",
                                         severity=severity,
                                         evidence={"hour": str(hour), "count": len(amounts), "total_amount": sum(amounts), "z_score": float(z)},
                                         timestamp=hour,
                                         description=f"{len(amounts)} events at {hour} ({z:.1f} above mean)"))
        return sorted(flags, key=lambda f: f.severity, reverse=True)

    def detect_all(self, trace=None) -> List[AnomalyFlag]:
        """Run both detectors with default thresholds and store/return the merged flags.

        Flags are sorted most severe first; when ``trace`` is truthy the five
        most severe are also reported through ``trace.event(...)``.
        """
        self.flags = []
        self.flags.extend(self.detect_multi_entity_anomalies())
        self.flags.extend(self.detect_temporal_spikes())
        self.flags.sort(key=lambda f: f.severity, reverse=True)
        if trace:
            for flag in self.flags[:5]:
                trace.event(type("TE", (), {"value": "anomaly.flagged"})(), agent_name="StatAnomaly",
                            message=flag.description, data={"severity": flag.severity, "type": flag.flag_type})
        return self.flags
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
|
|
2
|
+
|
|
3
|
+
Adds embedding-based retrieval alongside the hash index for multi-source fusion.
|
|
4
|
+
Uses all-MiniLM-L6-v2 (80MB model, 384-dim vectors) — runs locally, no API calls.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import faiss
|
|
12
|
+
from sentence_transformers import SentenceTransformer
|
|
13
|
+
HAS_FAISS = True
|
|
14
|
+
except ImportError:
|
|
15
|
+
HAS_FAISS = False
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class VectorIndex:
    """Semantic document retrieval using FAISS in-memory vector index.

    Complements the hash index by finding semantically similar content
    that doesn't share exact words with the query.

    Parameters
    ----------
    model_name : str
        HuggingFace sentence-transformers model name.
    encode_batch_size : int
        Batch size for model.encode(). Larger values (64–128) encode
        many chunks faster on CPU; reduce to 8–16 if GPU OOM occurs.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2",
                 encode_batch_size: int = 64) -> None:
        if not HAS_FAISS:
            raise ImportError("faiss-cpu and sentence-transformers required: pip install faiss-cpu sentence-transformers")

        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()  # 384
        self.encode_batch_size = encode_batch_size
        self.index = faiss.IndexFlatIP(self.dimension)  # Inner product (cosine with normalized vectors)
        self.chunks: List[Dict[str, Any]] = []  # {"text": ..., "source": ..., "page": ...}
        self._chunk_vectors: Optional[np.ndarray] = None

    @staticmethod
    def _split_into_chunks(text: str, source: str, chunk_size: int,
                           chunk_overlap: int) -> List[Dict[str, Any]]:
        """Split ``text`` into word windows of ``chunk_size`` sharing ``chunk_overlap`` words.

        Raises ValueError when the window cannot advance (``chunk_size`` not
        positive, or ``chunk_overlap`` not smaller than ``chunk_size``) - that
        combination would otherwise loop forever.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        words = text.split()
        step = chunk_size - chunk_overlap
        chunks: List[Dict[str, Any]] = []
        start = 0
        while start < len(words):
            end = min(start + chunk_size, len(words))
            chunks.append({"text": " ".join(words[start:end]), "source": source, "page": 1})
            if end == len(words):
                # The text is exhausted; another window would only repeat the
                # tail already contained in this chunk.
                break
            start += step
        return chunks

    @staticmethod
    def _normalise_sources(source_filter: Optional[List[str]]) -> set:
        """Return the lower-cased, stripped basenames of ``source_filter`` (blank entries dropped)."""
        import os

        allowed = set()
        for name in source_filter or []:
            key = os.path.basename(str(name).lower().strip())
            if key:
                allowed.add(key)
        return allowed

    def index_text(self, text: str, source: str = "unknown",
                   chunk_size: int = 300, chunk_overlap: int = 50) -> int:
        """Split text into overlapping chunks and index as vectors. Returns chunk count.

        Raises ValueError for a non-positive ``chunk_size`` or a
        ``chunk_overlap`` that is not smaller than ``chunk_size``.
        """
        chunks = self._split_into_chunks(text, source, chunk_size, chunk_overlap)
        if not chunks:
            return 0

        # Encode in batches for throughput — sentence-transformers uses an
        # internal batch loop; a larger batch_size reduces Python→C round-trips.
        vectors = self.model.encode(
            [c["text"] for c in chunks],
            batch_size=self.encode_batch_size,
            show_progress_bar=False,
        )
        # FAISS works on contiguous float32 data, so convert before normalising
        # (normalising first and casting afterwards is fragile).
        vectors = np.ascontiguousarray(vectors, dtype=np.float32)
        # Normalize for cosine similarity (inner product on unit vectors = cosine)
        faiss.normalize_L2(vectors)
        self.index.add(vectors)
        self.chunks.extend(chunks)
        return len(chunks)

    def search(self, query: str, top_k: int = 10,
               source_filter: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Search for semantically similar chunks. Returns scored results.

        Parameters
        ----------
        source_filter : Optional[List[str]]
            If provided, only return chunks from these source documents
            (case-insensitive basename matching).

        Returns at most ``top_k`` dicts with ``text``, ``source``, ``page``,
        ``score`` and ``match_type`` ("semantic"), best match first.
        """
        if top_k <= 0 or self.index.ntotal == 0:
            return []

        # Built once per query rather than once per hit.
        allowed = self._normalise_sources(source_filter)

        query_vec = np.ascontiguousarray(
            self.model.encode([query], show_progress_bar=False), dtype=np.float32)
        faiss.normalize_L2(query_vec)
        # With a filter, rank the whole (flat, brute-force) index so that
        # filtering can never starve the result list.
        fetch_k = self.index.ntotal if allowed else min(top_k, self.index.ntotal)
        scores, indices = self.index.search(query_vec, fetch_k)

        import os

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < 0 or idx >= len(self.chunks):
                continue
            chunk = self.chunks[idx]
            # ── Document-scope filter ────────────────────────
            if allowed and os.path.basename(chunk.get("source", "").lower().strip()) not in allowed:
                continue
            results.append({
                "text": chunk["text"],
                "source": chunk["source"],
                "page": chunk.get("page", 1),
                "score": round(float(score), 3),  # Cosine similarity, in [-1, 1]
                "match_type": "semantic",
            })
            if len(results) >= top_k:
                break
        return results

    def get_stats(self) -> Dict[str, Any]:
        """Return the vector count, chunk count and embedding dimension."""
        return {
            "total_vectors": int(self.index.ntotal),
            "total_chunks": len(self.chunks),
            "dimension": self.dimension,
        }
|