mcp-agentic-pipelines 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +93 -0
- package/README.md +258 -0
- package/package.json +70 -0
- package/packages/clinical/package.json +22 -0
- package/packages/clinical/src/index.ts +262 -0
- package/packages/clinical/tsconfig.json +13 -0
- package/packages/core/package.json +21 -0
- package/packages/core/src/config.ts +138 -0
- package/packages/core/src/errors.ts +100 -0
- package/packages/core/src/index.ts +104 -0
- package/packages/core/src/llm-config.ts +213 -0
- package/packages/core/src/logging.ts +66 -0
- package/packages/core/src/python-bridge.ts +384 -0
- package/packages/core/src/rate-limiter.ts +136 -0
- package/packages/core/src/types.ts +203 -0
- package/packages/core/src/validation.ts +101 -0
- package/packages/core/tsconfig.json +10 -0
- package/packages/deeppipe/package.json +21 -0
- package/packages/deeppipe/src/index.ts +424 -0
- package/packages/deeppipe/tsconfig.json +13 -0
- package/packages/piste/package.json +20 -0
- package/packages/piste/src/index.ts +48 -0
- package/packages/piste/tsconfig.json +13 -0
- package/packages/precis/package.json +20 -0
- package/packages/precis/src/index.ts +67 -0
- package/packages/precis/tsconfig.json +13 -0
- package/packages/server/package.json +31 -0
- package/packages/server/src/index.ts +427 -0
- package/packages/server/tsconfig.json +17 -0
- package/setup.mjs +141 -0
- package/test.mjs +337 -0
- package/vendors/clinical-intake/pipeline.mjs +349 -0
- package/vendors/clinical-intake/questions/en.txt +9 -0
- package/vendors/clinical-intake/questions/fr.txt +9 -0
- package/vendors/piste/.env.example +73 -0
- package/vendors/piste/app/core/__init__.py +4 -0
- package/vendors/piste/app/core/config.py +83 -0
- package/vendors/piste/app/core/debuglog.py +16 -0
- package/vendors/piste/app/core/middleware.py +40 -0
- package/vendors/piste/bridge_piste.py +301 -0
- package/vendors/piste/pipeline/__init__.py +4 -0
- package/vendors/piste/pipeline/compiler.py +68 -0
- package/vendors/piste/pipeline/offline/__init__.py +28 -0
- package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
- package/vendors/piste/pipeline/replay.py +15 -0
- package/vendors/piste/pipeline/replay_engine.py +249 -0
- package/vendors/piste/pipeline/signatures/__init__.py +4 -0
- package/vendors/piste/pipeline/signatures/signatures.py +136 -0
- package/vendors/piste/pipeline/stage1/__init__.py +21 -0
- package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
- package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
- package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
- package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
- package/vendors/piste/pipeline/stage2/__init__.py +34 -0
- package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
- package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
- package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
- package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
- package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
- package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
- package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
- package/vendors/piste/pipeline/stage3/__init__.py +20 -0
- package/vendors/piste/pipeline/stage3/classifier.py +79 -0
- package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
- package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
- package/vendors/piste/pipeline/stage4/__init__.py +33 -0
- package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
- package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
- package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
- package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
- package/vendors/piste/requirements.txt +53 -0
- package/vendors/precis/backend/__init__.py +6 -0
- package/vendors/precis/backend/agents/__init__.py +3 -0
- package/vendors/precis/backend/agents/data_synthesis.py +105 -0
- package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
- package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
- package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
- package/vendors/precis/backend/agents/guardrail.py +175 -0
- package/vendors/precis/backend/agents/query_expander.py +89 -0
- package/vendors/precis/backend/agents/radial_interpol.py +99 -0
- package/vendors/precis/backend/agents/report_generator.py +92 -0
- package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
- package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
- package/vendors/precis/backend/agents/vector_index.py +123 -0
- package/vendors/precis/backend/agents/veri_score.py +341 -0
- package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
- package/vendors/precis/backend/api/__init__.py +3 -0
- package/vendors/precis/backend/api/routes/__init__.py +3 -0
- package/vendors/precis/backend/config.py +88 -0
- package/vendors/precis/backend/core/__init__.py +13 -0
- package/vendors/precis/backend/core/hashing.py +22 -0
- package/vendors/precis/backend/core/metrics.py +77 -0
- package/vendors/precis/backend/core/multitoken.py +166 -0
- package/vendors/precis/backend/core/pmi.py +54 -0
- package/vendors/precis/backend/core/stemming.py +74 -0
- package/vendors/precis/backend/core/tracing.py +150 -0
- package/vendors/precis/backend/data/__init__.py +3 -0
- package/vendors/precis/backend/data/chunker.py +57 -0
- package/vendors/precis/backend/data/pdf_parser.py +42 -0
- package/vendors/precis/backend/db/__init__.py +3 -0
- package/vendors/precis/backend/db/models.py +173 -0
- package/vendors/precis/backend/db/repository.py +269 -0
- package/vendors/precis/backend/llm/__init__.py +3 -0
- package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
- package/vendors/precis/backend/llm/base.py +147 -0
- package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
- package/vendors/precis/backend/llm/factory.py +60 -0
- package/vendors/precis/backend/llm/google_provider.py +39 -0
- package/vendors/precis/backend/llm/ollama_provider.py +54 -0
- package/vendors/precis/backend/llm/openai_provider.py +50 -0
- package/vendors/precis/backend/main.py +677 -0
- package/vendors/precis/backend/orchestrator/__init__.py +3 -0
- package/vendors/precis/backend/orchestrator/planner.py +81 -0
- package/vendors/precis/backend/orchestrator/router.py +319 -0
- package/vendors/precis/backend/orchestrator/types.py +58 -0
- package/vendors/precis/bridge_precis.py +185 -0
- package/vendors/precis/data/sample_reports/README.md +8 -0
- package/vendors/precis/data/seed_data.py +115 -0
- package/vendors/precis/requirements.txt +19 -0
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class MultiToken:
    """An ordered run of stemmed words together with where it was found.

    Only ``tokens`` is required; every provenance and layout field falls back
    to a neutral default so callers can fill in just what they know.
    """

    # The stemmed words, in order. Used as the lookup path in the index.
    tokens: Tuple[str, ...]
    # Category label. The index gives the value "contextual" a trust bonus and
    # includes such tokens in contextual search.
    token_type: str = "standard"
    # Identifier of the document the tokens came from.
    source_doc: str = ""
    # Page on which the tokens were found.
    source_page: int = 0
    # Position of the tokens inside the document.
    # NOTE(review): presumably a line number — confirm against the extractor.
    source_position: int = 0
    # Font size of the originating text, when known.
    font_size: Optional[float] = None
    # Layout flags: whether the originating text was a title / a header.
    is_title: bool = False
    is_header: bool = False
    # Free-form extras; a fresh dict per instance.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class RetrievalResult:
    """One hit returned by a search: the matched MultiToken plus its scores."""

    # The indexed MultiToken that matched.
    multitoken: MultiToken
    # How well the hit matches the query; higher is better.
    relevance_score: float
    # How much the source of the hit is trusted.
    trustworthiness_score: float
    # Which search tier produced the hit:
    # "exact" | "subset" | "contextual" | "semantic_fallback"
    match_type: str
    # Query tokens that overlapped with the hit; a fresh list per instance.
    matched_tokens: List[str] = field(default_factory=list)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class NestedHashIndex:
    """Core retrieval engine: nested dict tree → O(m) exact lookup, no embeddings.

    Every MultiToken is stored along the path spelled by its tokens: one nested
    dict per token. The node at the end of the path keeps the MultiTokens that
    end there in a list under the reserved ``"_items"`` key.
    """

    # Reserved node key holding the MultiTokens stored at that node. A real
    # token spelled the same way would be indistinguishable from it, so
    # insert() and exact_search() refuse such a token.
    _ITEMS_KEY = "_items"

    def __init__(self) -> None:
        self.index: Dict[str, Any] = {}
        self.multitoken_count: int = 0
        self.unique_tokens: set = set()
        # Access counters keyed by _leaf_key(); read by auto_distill().
        self._access_counts: Dict[str, int] = {}
        self._doc_texts: Dict[str, str] = {}  # filename → full original text
        from backend.core.pmi import PMIScorer
        self.pmi: PMIScorer = PMIScorer()

    # ── Insert ─────────────────────────────────────────────────

    def insert(self, mt: MultiToken) -> None:
        """Store *mt* at the trie node reached by following ``mt.tokens``.

        A MultiToken containing the reserved ``"_items"`` token is skipped:
        inserting it would overwrite or shadow a node's item list and make
        later tree walks fail.
        """
        if self._ITEMS_KEY in mt.tokens:
            return
        node = self.index
        for token in mt.tokens:
            node = node.setdefault(token, {})
        node.setdefault(self._ITEMS_KEY, []).append(mt)
        self.multitoken_count += 1
        self.unique_tokens.update(mt.tokens)
        # Feed the shared PMI scorer for token rarity weighting. Deliberately
        # best-effort: indexing must still succeed when it is unavailable.
        try:
            from backend.core.pmi import get_pmi_scorer
            get_pmi_scorer().ingest_tokens(list(mt.tokens))
        except Exception:
            pass
        self.pmi.ingest_tokens(list(mt.tokens))  # Build PMI statistics

    def insert_batch(self, multitokens: List[MultiToken]) -> int:
        """Insert every MultiToken in *multitokens*; return ``len(multitokens)``."""
        for mt in multitokens:
            self.insert(mt)
        return len(multitokens)

    # ── Search ─────────────────────────────────────────────────

    def exact_search(self, query_tokens: Tuple[str, ...]) -> List[RetrievalResult]:
        """Return the MultiTokens stored exactly at the path *query_tokens*.

        Every hit gets ``relevance_score=1.0`` and ``match_type="exact"``.
        Returns an empty list when the path does not exist, or when the query
        contains the reserved ``"_items"`` key (following it would step into an
        item list instead of a node).
        """
        node = self.index
        for token in query_tokens:
            if token == self._ITEMS_KEY or token not in node:
                return []
            node = node[token]
        items = node.get(self._ITEMS_KEY, [])
        self._record_access(items)
        return [RetrievalResult(multitoken=mt, relevance_score=1.0,
                                trustworthiness_score=self._trust_score(mt), match_type="exact")
                for mt in items]

    def subset_search(self, query_tokens: Tuple[str, ...],
                      min_match_ratio: float = 0.75) -> List[RetrievalResult]:
        """Return items of every leaf sharing enough tokens with the query.

        A leaf qualifies when ``overlap / len(set(query_tokens))`` is at least
        *min_match_ratio*. Its score is that ratio multiplied by the share of
        the leaf's own tokens that matched. Results are sorted best-first.
        """
        results: List[RetrievalResult] = []
        query_set = set(query_tokens)
        for leaf in self._iter_leaves():
            items = leaf.get(self._ITEMS_KEY, [])
            if not items:
                continue
            # All items at one node were inserted along the same token path.
            leaf_tokens = set(items[0].tokens)
            shared = query_set & leaf_tokens
            overlap = len(shared)
            ratio = overlap / len(query_set) if query_set else 0.0
            if ratio < min_match_ratio:
                continue
            score = ratio * (overlap / len(leaf_tokens)) if leaf_tokens else ratio
            self._record_access(items)
            for mt in items:
                results.append(RetrievalResult(multitoken=mt, relevance_score=score,
                                               trustworthiness_score=self._trust_score(mt),
                                               match_type="subset",
                                               matched_tokens=list(shared)))
        results.sort(key=lambda r: r.relevance_score, reverse=True)
        return results

    def contextual_search(self, query_tokens: Tuple[str, ...]) -> List[RetrievalResult]:
        """Return title, header and "contextual" MultiTokens overlapping the query.

        The score is the matched share of the query boosted by 1.5 and capped
        at 1.0. Results are sorted best-first.
        """
        results: List[RetrievalResult] = []
        query_set = set(query_tokens)
        for leaf in self._iter_leaves():
            for mt in leaf.get(self._ITEMS_KEY, []):
                if not (mt.token_type == "contextual" or mt.is_title or mt.is_header):
                    continue
                shared = query_set & set(mt.tokens)
                if not shared:
                    continue
                score = min(len(shared) / len(query_set) * 1.5, 1.0)
                results.append(RetrievalResult(multitoken=mt, relevance_score=score,
                                               trustworthiness_score=self._trust_score(mt),
                                               match_type="contextual",
                                               matched_tokens=list(shared)))
        results.sort(key=lambda r: r.relevance_score, reverse=True)
        return results

    def hybrid_search(self, query_tokens: Tuple[str, ...],
                      include_semantic_fallback: bool = False,
                      source_filter: Optional[List[str]] = None,
                      trace=None) -> List[RetrievalResult]:
        """Multi-tier search across all indexed documents.

        Tiers run in order: exact, subset (only when fewer than 5 hits so far
        and the query has at least 2 tokens), contextual, then a broad sweep
        (only when fewer than 3 hits so far). Hits for the same MultiToken
        found by several tiers are collapsed to the best-scoring one, then
        scores are blended with PMI when the scorer is available.

        Parameters
        ----------
        include_semantic_fallback : bool
            Accepted for interface compatibility; not read by this method.
        source_filter : Optional[List[str]]
            If provided, only return results whose source_doc is in this list.
            Case-insensitive basename matching (e.g. ``["report.pdf"]``).
        trace :
            Optional object with an ``event(...)`` method; receives one event
            for each of tiers 1, 2 and 4 that runs.

        Returns
        -------
        List[RetrievalResult]
            Distinct hits sorted by descending relevance score.
        """
        # Tier 1: exact
        results = self.exact_search(query_tokens)
        self._trace_tier(trace, f"Exact search: {len(results)} results", 1, len(results))
        # Tier 2: subset fallback (lowered threshold to 0.5 for better recall)
        if len(results) < 5 and len(query_tokens) >= 2:
            subset = self.subset_search(query_tokens, min_match_ratio=0.5)
            self._trace_tier(trace, f"Subset fallback: {len(subset)} results", 2, len(subset))
            results.extend(subset)
        # Tier 3: contextual boost (titles/headers)
        results.extend(self.contextual_search(query_tokens))
        # Tier 4: broad sweep — any token overlap at all
        if len(results) < 3:
            broad = self.broad_search(query_tokens)
            self._trace_tier(trace, f"Broad sweep: {len(broad)} results", 4, len(broad))
            results.extend(broad)

        # The later tiers scan every leaf, so they re-emit hits an earlier tier
        # already produced. Collapse them only after the tier thresholds above
        # were evaluated, so which tiers run is unaffected.
        results = self._dedupe_best(results)

        # Tier 5: PMI re-ranking — rare-token matches boosted, boilerplate suppressed
        # Best-effort, but all-or-nothing: scores are computed first and applied
        # only if every one succeeded, so a failure cannot leave a mix of
        # blended and unblended scores.
        try:
            from backend.core.pmi import get_pmi_scorer
            pmi = get_pmi_scorer()
            blended = []
            for r in results:
                pmi_score = pmi.score(list(query_tokens), list(r.multitoken.tokens))
                pmi_norm = pmi.normalize_score(pmi_score)
                # Blend: 70% structural match + 30% token rarity
                blended.append(round(r.relevance_score * 0.7 + pmi_norm * 0.3, 4))
        except Exception:
            blended = None
        if blended is not None:
            for r, new_score in zip(results, blended):
                r.relevance_score = new_score

        # ── Document-scope filter ────────────────────────────────
        if source_filter:
            import os
            filter_set = set()
            for name in source_filter:
                name = os.path.basename(str(name).lower().strip())
                if name:
                    filter_set.add(name)
            before = len(results)
            if filter_set:
                results = [r for r in results
                           if os.path.basename(r.multitoken.source_doc.lower().strip()) in filter_set]
            # NOTE(review): this writes to stdout; if the host process speaks a
            # protocol over stdout, route this through logging — confirm.
            print(f"[Precis] Hash filter: source_filter={source_filter!r} filter_set={filter_set!r} before={before} after={len(results)}")

        results.sort(key=lambda r: r.relevance_score, reverse=True)
        return results

    def broad_search(self, query_tokens: Tuple[str, ...]) -> List[RetrievalResult]:
        """Last-resort search: any leaf with meaningful token overlap (≥30% of query).

        At least one token must match. Scores are the matched share of the
        query scaled by 0.8 and capped at 0.9; at most 30 hits are returned.
        """
        results: List[RetrievalResult] = []
        query_set = set(query_tokens)
        min_overlap = max(1, int(len(query_tokens) * 0.3))  # Require ≥30% query token match
        for leaf in self._iter_leaves():
            for mt in leaf.get(self._ITEMS_KEY, []):
                shared = query_set & set(mt.tokens)
                if len(shared) < min_overlap:
                    continue
                score = min(len(shared) / len(query_set) * 0.8, 0.9)
                results.append(RetrievalResult(multitoken=mt, relevance_score=score,
                                               trustworthiness_score=self._trust_score(mt),
                                               match_type="broad",
                                               matched_tokens=list(shared)))
        results.sort(key=lambda r: r.relevance_score, reverse=True)
        return results[:30]  # Cap broad results

    # ── Maintenance ─────────────────────────────────────────────

    def auto_distill(self, min_access_count: int = 3) -> int:
        """Empty every leaf whose access counter is below *min_access_count*.

        The counter consulted is the one for the leaf's first item. The leaf's
        item list is replaced by an empty list; the node itself stays in the
        tree and ``unique_tokens`` is left untouched.

        Returns the number of MultiTokens removed.
        """
        removed = 0
        # Materialise the leaves first: the loop rewrites their item lists.
        for leaf in list(self._iter_leaves()):
            items = leaf.get(self._ITEMS_KEY, [])
            if not items:
                continue
            key = self._leaf_key(items[0])
            if self._access_counts.get(key, 0) < min_access_count:
                leaf[self._ITEMS_KEY] = []
                self.multitoken_count -= len(items)
                removed += len(items)
        return removed

    def index_document(self, text: str, source: str = "uploaded_document") -> int:
        """Parse raw text into MultiTokens and insert into the index. Returns count of tokens indexed.

        The full text is cached under *source* for get_context(). Each
        non-blank line becomes one element of a single page (page 1); an
        all-uppercase line counts as a title when shorter than 80 characters
        and as a header when shorter than 60. The return value is whatever
        ``MultiTokenExtractor.index_document`` returns.
        """
        # Store original text for context retrieval
        self._doc_texts[source] = text

        from backend.core.multitoken import MultiTokenExtractor

        extractor = MultiTokenExtractor(max_token_length=7, min_token_length=2)

        parsed = [{"page_number": 1, "elements": []}]
        for line in text.strip().split("\n"):
            stripped = line.strip()
            if not stripped:
                continue
            parsed[0]["elements"].append({
                "text": stripped,
                "is_title": stripped.isupper() and len(stripped) < 80,
                "is_header": stripped.isupper() and len(stripped) < 60,
                "font_size": 14.0 if stripped.isupper() else 10.0,
            })

        # Use the proper MultiTokenExtractor API: index_document(filename, parsed, self)
        return extractor.index_document(source, parsed, self)

    def get_statistics(self) -> Dict[str, Any]:
        """Return size figures for the index (counts, tree depth, rough memory estimate)."""
        depth = self._compute_depth(self.index)
        return {"multitoken_count": self.multitoken_count, "unique_tokens": len(self.unique_tokens),
                "index_depth": depth, "memory_estimate_mb": self.multitoken_count * 0.002,
                "cached_documents": len(self._doc_texts)}

    def get_context(self, source_doc: str, source_page: int, source_position: int,
                    window: int = 5) -> Dict[str, Any]:
        """Retrieve surrounding lines from the original document for a match position.

        *source_position* is compared against 0-based line numbers of the
        cached text. NOTE(review): assumes the extractor stores such line
        numbers in ``source_position`` — confirm.

        The anchor is the first non-empty line at or after *source_position*
        (the last non-empty line when there is none). ``"surrounding"`` joins
        up to *window* non-empty lines on each side of the anchor.
        """
        text = self._doc_texts.get(source_doc, "")
        if not text:
            return {"sentence": "(source text not cached)", "surrounding": "", "page": source_page, "file": source_doc}

        # Drop blank lines but remember each kept line's original line number.
        non_empty = [(i, l.strip()) for i, l in enumerate(text.split("\n")) if l.strip()]
        if not non_empty:
            return {"sentence": "", "surrounding": "", "page": source_page, "file": source_doc}

        # Find the closest non-empty line at or after source_position.
        idx = next((i for i, (line_no, _) in enumerate(non_empty) if line_no >= source_position),
                   len(non_empty) - 1)

        start = max(0, idx - window)
        end = min(len(non_empty), idx + window + 1)

        return {
            "sentence": non_empty[idx][1],
            "surrounding": "\n".join(l for _, l in non_empty[start:end]),
            "page": source_page,
            "file": source_doc,
        }

    # ── Internal ────────────────────────────────────────────────

    @staticmethod
    def _trace_tier(trace, message: str, tier: int, count: int) -> None:
        """Emit one search-tier event on *trace*; do nothing when *trace* is falsy."""
        if not trace:
            return
        # Ad-hoc object exposing only the ``.value`` attribute passed as the
        # event type.
        event_type = type("TE", (), {"value": "decision.search_type"})()
        trace.event(event_type, agent_name="ExactHash",
                    message=message, data={"tier": tier, "count": count})

    @staticmethod
    def _dedupe_best(results: List[RetrievalResult]) -> List[RetrievalResult]:
        """Keep one result per MultiToken object: the highest score, earliest on ties."""
        best: Dict[int, RetrievalResult] = {}
        for r in results:
            # MultiToken defines __eq__ without __hash__, so key on identity.
            key = id(r.multitoken)
            kept = best.get(key)
            if kept is None or r.relevance_score > kept.relevance_score:
                best[key] = r
        return list(best.values())

    def _iter_leaves(self) -> Iterator[Dict[str, Any]]:
        """Yield every node that has an item list, parents before children."""
        def recurse(node):
            if self._ITEMS_KEY in node:
                yield node
            for k, v in node.items():
                if k != self._ITEMS_KEY and isinstance(v, dict):
                    yield from recurse(v)
        yield from recurse(self.index)

    def _trust_score(self, mt: MultiToken) -> float:
        """Score *mt* from 0.5 upward using its layout and type flags, capped at 1.0."""
        score = 0.5
        if mt.is_title:
            score += 0.2
        if mt.is_header:
            score += 0.1
        if mt.font_size and mt.font_size > 12:
            score += 0.1
        if mt.token_type == "contextual":
            score += 0.15
        return min(score, 1.0)

    def _record_access(self, items: List[MultiToken]) -> None:
        """Increment the access counter of every MultiToken in *items*."""
        for mt in items:
            key = self._leaf_key(mt)
            self._access_counts[key] = self._access_counts.get(key, 0) + 1

    @staticmethod
    def _leaf_key(mt: MultiToken) -> str:
        """Build the access-counter key for *mt* from its document, page and position."""
        return f"{mt.source_doc}|{mt.source_page}|{mt.source_position}"

    @staticmethod
    def _compute_depth(node: dict) -> int:
        """Return the nesting depth of *node*, ignoring ``"_items"`` entries; 0 for an empty dict."""
        if not isinstance(node, dict) or not node:
            return 0
        return 1 + max((NestedHashIndex._compute_depth(v) for k, v in node.items() if k != "_items"), default=0)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
|
|
2
|
+
|
|
3
|
+
Implements RRF (Reciprocal Rank Fusion): score = Σ 1/(k + rank_i)
|
|
4
|
+
where k=60 is the standard constant, and rank_i is the result's rank in each source.
|
|
5
|
+
|
|
6
|
+
Produces a single ranked list from multiple retrieval backends.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Any, Dict, List
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FusionRanker:
    """Merges ranked hit lists from several retrieval backends with RRF."""

    def __init__(self, k: int = 60) -> None:
        # Smoothing constant in the RRF denominator.
        self.k = k

    def fuse(self, sources: Dict[str, List[Dict[str, Any]]],
             top_k: int = 15) -> List[Dict[str, Any]]:
        """Combine per-backend rankings into a single best-first list.

        Hits are identified by the first 100 characters of their ``"text"``.
        Every occurrence of a hit adds ``1 / (k + rank + 1)`` (rank is 0-based)
        to its fused score. Descriptive fields come from the occurrence with
        the highest original ``"score"``.

        Args:
            sources: backend name → hits, each list already sorted best-first,
                e.g. ``{"hash": [...], "vector": [...]}``.
            top_k: maximum number of fused hits to return.

        Returns:
            Up to *top_k* dicts ordered by fused score. ``"score"`` is the
            fused score divided by the best fused score (rounded to 3 places)
            and ``"match_type"`` names the contributing backends.
        """
        merged: Dict[str, Dict[str, Any]] = {}

        for backend, hits in sources.items():
            for rank, hit in enumerate(hits):
                dedup_key = hit.get("text", "")[:100]
                contribution = 1.0 / (self.k + rank + 1)
                entry = merged.get(dedup_key)

                if entry is None:
                    # First sighting: start a record from this occurrence.
                    merged[dedup_key] = {
                        "text": hit.get("text", ""),
                        "source": hit.get("source", ""),
                        "page": hit.get("page", 1),
                        "rrf_score": contribution,
                        "original_score": hit.get("score", 0),
                        "match_type": hit.get("match_type", ""),
                        "sources": {backend},
                    }
                    continue

                entry["rrf_score"] += contribution
                entry["sources"].add(backend)
                # Let the better-scored occurrence supply the descriptive fields.
                if hit.get("score", 0) > entry.get("original_score", 0):
                    entry["original_score"] = hit.get("score", 0)
                    entry["match_type"] = hit.get("match_type", "")
                    entry["source"] = hit.get("source", "")
                    entry["page"] = hit.get("page", 1)

        ordered = sorted(merged.values(), key=lambda rec: rec["rrf_score"], reverse=True)
        ranked = ordered[:top_k]

        if not ranked:
            return ranked

        # Rescale so the top hit scores 1.0, and label each hit's provenance.
        top_score = ranked[0]["rrf_score"]
        for rec in ranked:
            if top_score > 0:
                rec["score"] = round(rec["rrf_score"] / top_score, 3)
            else:
                rec["score"] = 0
            rec["match_type"] = "fusion:" + "+".join(sorted(rec["sources"]))

        return ranked
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class GuardrailAction(str, Enum):
    """Verdict a guardrail check can return; members compare equal to their string value."""

    PASS = "pass"      # nothing found
    FLAG = "flag"      # issues found, response still delivered
    REDACT = "redact"  # PII found, scrubbed response delivered
    BLOCK = "block"    # response withheld
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class GuardrailResult:
    """Outcome of one guardrail validation run."""

    # The verdict.
    action: GuardrailAction
    # Human-readable description of every issue detected.
    issues_found: List[str] = field(default_factory=list)
    # Descriptions of what was redacted.
    redacted_content: List[str] = field(default_factory=list)
    # Response with PII scrubbed (when action=REDACT).
    redacted_response: Optional[str] = None
    # Confidence attached to the verdict.
    confidence: float = 1.0
    # Whether a human should look at this response.
    requires_human_review: bool = False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class GuardrailAgent:
    """Validates outputs before delivery. PII detection, hallucination check, content policy.

    Layer 1 — PII Detection & Redaction:
        Scans the generated response for SSNs, credit-card numbers, email
        addresses, and phone numbers. When PII is found the response is
        *redacted* rather than blocked outright, so the user still receives
        useful content.

    Layer 2 — Prompt Injection:
        Checks the original query for injection markers. These are ALWAYS
        blocked — no response is returned.

    Layer 3 — Hallucination Threshold:
        If the VeriScore hallucination rate exceeds 80 %, the response is
        blocked. If it exceeds 30 %, the response is flagged for human review
        but still delivered.
    """

    # ── PII patterns: (regex, human-readable type, replacement mask) ─
    # Applied in this order; each pattern runs on the output of the previous.
    _PII_PATTERNS: List[Tuple[str, str, str]] = [
        (r'\b\d{3}-\d{2}-\d{4}\b', "SSN", "[REDACTED-SSN]"),
        (r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b', "Credit Card", "[REDACTED-CC]"),
        # The top-level-domain class is letters only. A "|" inside [...] is a
        # literal pipe, not alternation, and made the match run on through
        # text such as "a@b.com|Sales".
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', "Email", "[REDACTED-EMAIL]"),
        (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', "Phone", "[REDACTED-PHONE]"),
    ]

    # ── Prompt injection markers: any match → immediate BLOCK ──────
    _INJECTION_MARKERS: List[str] = [
        "ignore previous", "you are now", "system prompt",
        "[/INST]", "<|im_start|>", "<|im_end|>",
        "forget all", "new instructions", "pretend you are",
    ]

    # ── Thresholds ─────────────────────────────────────────────────
    HALLUCINATION_BLOCK_THRESHOLD: float = 0.8   # > 80 % → block (extreme cases only)
    HALLUCINATION_FLAG_THRESHOLD: float = 0.3    # > 30 % → flag

    # ── Public API ──────────────────────────────────────────────────

    async def validate(
        self,
        generated_response: str,
        retrieved_sources: List[Dict[str, Any]],
        original_query: str,
        veriscore_report: Optional[Any] = None,
    ) -> GuardrailResult:
        """Run all safety layers. Returns a GuardrailResult with the
        most restrictive action warranted by any layer.

        Args:
            generated_response: text about to be delivered; scanned for PII.
            retrieved_sources: accepted for interface compatibility; not read
                by this method.
            original_query: the user's query; scanned case-insensitively for
                injection markers.
            veriscore_report: optional object whose ``hallucination_rate`` and
                ``flagged_issues`` attributes are read when present.

        Returns:
            BLOCK on a prompt-injection marker or a hallucination rate above
            the block threshold (``redacted_response`` left as ``None``);
            REDACT when PII was found; FLAG for any other issue; PASS
            otherwise.
        """
        issues: List[str] = []
        redacted: List[str] = []
        confidence: float = 1.0
        needs_review: bool = False

        # ── Layer 1: PII Detection & Redaction ──────────────────
        redacted_text, pii_found = self._redact_pii(generated_response)
        if pii_found:
            issues.extend(pii_found)
            redacted.extend(pii_found)
            confidence = 0.85
            needs_review = True

        # ── Layer 2: Prompt Injection ──────────────────────────
        query_lower = original_query.lower()
        for marker in self._INJECTION_MARKERS:
            if marker.lower() in query_lower:
                issues.append(f"Prompt injection detected: '{marker}'")
                return GuardrailResult(
                    action=GuardrailAction.BLOCK,
                    issues_found=issues,
                    redacted_content=redacted,
                    confidence=1.0,
                    requires_human_review=True,
                )

        if veriscore_report is not None:
            # ── Layer 3: Hallucination Rate Threshold ──────────────
            # A report carrying an explicit None rate is treated as 0.0
            # instead of raising on the comparisons below.
            hall_rate = getattr(veriscore_report, "hallucination_rate", 0.0) or 0.0
            if hall_rate > self.HALLUCINATION_BLOCK_THRESHOLD:
                issues.append(
                    f"Critical hallucination rate ({hall_rate:.0%} > "
                    f"{self.HALLUCINATION_BLOCK_THRESHOLD:.0%}) — response blocked"
                )
                return GuardrailResult(
                    action=GuardrailAction.BLOCK,
                    issues_found=issues,
                    redacted_content=redacted,
                    confidence=0.6,
                    requires_human_review=True,
                )
            if hall_rate > self.HALLUCINATION_FLAG_THRESHOLD:
                issues.append(
                    f"High hallucination rate ({hall_rate:.0%} > "
                    f"{self.HALLUCINATION_FLAG_THRESHOLD:.0%})"
                )
                confidence = 0.7
                needs_review = True

            # ── Also check flagged_issues from VeriScore ───────────
            for fi in getattr(veriscore_report, "flagged_issues", []):
                if fi not in issues:
                    issues.append(fi)

        # ── Decide final action ────────────────────────────────
        if not issues:
            return GuardrailResult(
                action=GuardrailAction.PASS,
                redacted_response=redacted_text or generated_response,
            )

        if redacted:
            # PII was found → redact and deliver
            return GuardrailResult(
                action=GuardrailAction.REDACT,
                issues_found=issues,
                redacted_content=redacted,
                redacted_response=redacted_text,
                confidence=confidence,
                requires_human_review=needs_review,
            )

        # Non-PII issues → flag for review but still deliver
        return GuardrailResult(
            action=GuardrailAction.FLAG,
            issues_found=issues,
            redacted_response=generated_response,
            confidence=confidence,
            requires_human_review=needs_review,
        )

    # ── PII Redaction ──────────────────────────────────────────────

    def _redact_pii(self, text: str) -> Tuple[str, List[str]]:
        """Scan *text* for PII patterns and replace matches with safe tokens.

        Returns (redacted_text, list_of_types_found), where the list holds one
        ``"PII redacted: <type> (<n> instance(s))"`` entry per pattern that
        matched, in pattern order.
        """
        found: List[str] = []
        result = text
        for pattern, pii_type, replacement in self._PII_PATTERNS:
            # subn substitutes and counts in one pass.
            result, count = re.subn(pattern, replacement, result)
            if count:
                found.append(f"PII redacted: {pii_type} ({count} instance(s))")
        return result, found
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
|
|
2
|
+
|
|
3
|
+
Transforms natural language into the domain-specific vocabulary found in indexed documents.
|
|
4
|
+
Example: "currency impact" → "foreign exchange rate exposure currency fluctuation risk"
|
|
5
|
+
No embeddings, no vector DB — just the LLM's knowledge of financial synonyms.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Optional
|
|
9
|
+
from backend.llm.base import LLMProvider
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class QueryExpander:
    """Rewrite a query with alternative domain terminology.

    The LLM is asked for rewritten queries first; if that call fails, times
    out, or yields nothing usable, a small built-in synonym table is used
    instead so callers always receive at least one query string.
    """

    # Maximum number of rewritten queries returned (the prompt asks for 3).
    _MAX_EXPANSIONS = 3
    # Seconds to wait for the LLM before falling back to the synonym table.
    _LLM_TIMEOUT_SECONDS = 20
    # Only this many failed tokens are quoted in the prompt.
    _MAX_PROMPT_TOKENS = 20
    # Punctuation ignored at word edges when looking up a synonym.
    _EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"
    # Offline fallback vocabulary: plain word -> formal alternatives.
    _SYNONYMS = {
        "currency": "foreign exchange fx rate",
        "money": "capital funds revenue cash",
        "risk": "exposure uncertainty volatility",
        "profit": "earnings income margin return",
        "loss": "impairment write-down decline decrease",
        "revenue": "sales income turnover top-line",
        "cost": "expense expenditure outlay",
        "market": "sector industry segment",
        "growth": "expansion increase appreciation",
        "rule": "regulation compliance requirement policy",
        "problem": "issue incident concern challenge",
        "impact": "effect influence exposure consequence",
    }

    def __init__(self, llm: "LLMProvider") -> None:
        """Store the provider used by :meth:`expand`.

        Args:
            llm: Object exposing an awaitable
                ``generate(prompt, max_tokens=..., temperature=...)``.
        """
        self.llm = llm

    async def expand(self, original_query: str, failed_tokens: list,
                     document_domain: str = "financial and legal") -> list:
        """Generate alternative queries with different terminology.

        Args:
            original_query: The query as the user phrased it.
            failed_tokens: Search tokens that produced no match; only the
                first 20 are included in the prompt. When empty, the
                original query is quoted in their place.
            document_domain: Domain description interpolated into the prompt.

        Returns:
            Up to three non-empty query strings in the order the LLM gave
            them, or a single-element list with the synonym-table expansion
            when the LLM call fails or returns no usable strings.
        """
        import asyncio
        import logging

        if failed_tokens:
            # str() keeps a stray non-string token from raising in join().
            token_str = ", ".join(
                str(tok) for tok in failed_tokens[:self._MAX_PROMPT_TOKENS]
            )
        else:
            token_str = original_query

        prompt = f"""You are a {document_domain} domain expert. A document search system failed
to find matches for the following query because the EXACT words don't appear in the documents.

ORIGINAL QUERY: {original_query}
FAILED SEARCH TOKENS: {token_str}

The documents contain formal {document_domain} terminology. Rewrite the original query
using alternative words, synonyms, and related {document_domain} terms that are MORE LIKELY
to appear in formal documents.

For example:
- "currency impact" → "foreign exchange rate exposure"
- "money lost" → "financial impairment write-down loss"
- "hacking problem" → "cybersecurity incident data breach unauthorized access"
- "worker shortage" → "talent attrition labor supply constraints headcount reduction"
- "green rules" → "environmental regulation climate compliance carbon emission"

Return ONLY a JSON list of 3 rewritten queries, most likely to match first:
["rewritten query 1", "rewritten query 2", "rewritten query 3"]"""

        try:
            response = await asyncio.wait_for(
                self.llm.generate(prompt, max_tokens=300, temperature=0.3),
                timeout=self._LLM_TIMEOUT_SECONDS,
            )
            expansions = self._parse_expansions(response)
        except Exception:
            # Expansion is best-effort: record the failure and fall back
            # rather than propagate provider or timeout errors.
            logging.getLogger(__name__).warning(
                "LLM query expansion failed; using synonym fallback",
                exc_info=True,
            )
            expansions = []

        if expansions:
            return expansions

        # Fallback: simple word-level expansion using common financial synonyms
        return [self._basic_expand(original_query)]

    @classmethod
    def _parse_expansions(cls, response) -> list:
        """Extract the non-empty string items of the JSON array in *response*.

        The array is taken as the span from the first ``[`` to the last
        ``]``, which tolerates prose around the JSON. Returns ``[]`` when no
        array is present, it is not valid JSON, or it holds no usable strings.
        """
        import json

        if not isinstance(response, str):
            return []
        start = response.find("[")
        end = response.rfind("]") + 1
        if start < 0 or end <= start:
            return []
        try:
            parsed = json.loads(response[start:end])
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return []
        if not isinstance(parsed, list):
            return []
        usable = [
            item.strip()
            for item in parsed
            if isinstance(item, str) and item.strip()
        ]
        return usable[:cls._MAX_EXPANSIONS]

    def _basic_expand(self, query: str) -> str:
        """Simple synonym substitution when LLM is unavailable.

        The query is lower-cased and split on whitespace. Every word is kept
        as-is and, when it (ignoring edge punctuation) is a key of the
        synonym table, followed by that key's alternatives.
        """
        expanded = []
        for word in query.lower().split():
            expanded.append(word)
            alternatives = self._SYNONYMS.get(word.strip(self._EDGE_PUNCTUATION))
            if alternatives:
                expanded.append(alternatives)
        return " ".join(expanded)
|