mcp-agentic-pipelines 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.env.example +93 -0
  2. package/README.md +258 -0
  3. package/package.json +70 -0
  4. package/packages/clinical/package.json +22 -0
  5. package/packages/clinical/src/index.ts +262 -0
  6. package/packages/clinical/tsconfig.json +13 -0
  7. package/packages/core/package.json +21 -0
  8. package/packages/core/src/config.ts +138 -0
  9. package/packages/core/src/errors.ts +100 -0
  10. package/packages/core/src/index.ts +104 -0
  11. package/packages/core/src/llm-config.ts +213 -0
  12. package/packages/core/src/logging.ts +66 -0
  13. package/packages/core/src/python-bridge.ts +384 -0
  14. package/packages/core/src/rate-limiter.ts +136 -0
  15. package/packages/core/src/types.ts +203 -0
  16. package/packages/core/src/validation.ts +101 -0
  17. package/packages/core/tsconfig.json +10 -0
  18. package/packages/deeppipe/package.json +21 -0
  19. package/packages/deeppipe/src/index.ts +424 -0
  20. package/packages/deeppipe/tsconfig.json +13 -0
  21. package/packages/piste/package.json +20 -0
  22. package/packages/piste/src/index.ts +48 -0
  23. package/packages/piste/tsconfig.json +13 -0
  24. package/packages/precis/package.json +20 -0
  25. package/packages/precis/src/index.ts +67 -0
  26. package/packages/precis/tsconfig.json +13 -0
  27. package/packages/server/package.json +31 -0
  28. package/packages/server/src/index.ts +427 -0
  29. package/packages/server/tsconfig.json +17 -0
  30. package/setup.mjs +141 -0
  31. package/test.mjs +337 -0
  32. package/vendors/clinical-intake/pipeline.mjs +349 -0
  33. package/vendors/clinical-intake/questions/en.txt +9 -0
  34. package/vendors/clinical-intake/questions/fr.txt +9 -0
  35. package/vendors/piste/.env.example +73 -0
  36. package/vendors/piste/app/core/__init__.py +4 -0
  37. package/vendors/piste/app/core/config.py +83 -0
  38. package/vendors/piste/app/core/debuglog.py +16 -0
  39. package/vendors/piste/app/core/middleware.py +40 -0
  40. package/vendors/piste/bridge_piste.py +301 -0
  41. package/vendors/piste/pipeline/__init__.py +4 -0
  42. package/vendors/piste/pipeline/compiler.py +68 -0
  43. package/vendors/piste/pipeline/offline/__init__.py +28 -0
  44. package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
  45. package/vendors/piste/pipeline/replay.py +15 -0
  46. package/vendors/piste/pipeline/replay_engine.py +249 -0
  47. package/vendors/piste/pipeline/signatures/__init__.py +4 -0
  48. package/vendors/piste/pipeline/signatures/signatures.py +136 -0
  49. package/vendors/piste/pipeline/stage1/__init__.py +21 -0
  50. package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
  51. package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
  52. package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
  53. package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
  54. package/vendors/piste/pipeline/stage2/__init__.py +34 -0
  55. package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
  56. package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
  57. package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
  58. package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
  59. package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
  60. package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
  61. package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
  62. package/vendors/piste/pipeline/stage3/__init__.py +20 -0
  63. package/vendors/piste/pipeline/stage3/classifier.py +79 -0
  64. package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
  65. package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
  66. package/vendors/piste/pipeline/stage4/__init__.py +33 -0
  67. package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
  68. package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
  69. package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
  70. package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
  71. package/vendors/piste/requirements.txt +53 -0
  72. package/vendors/precis/backend/__init__.py +6 -0
  73. package/vendors/precis/backend/agents/__init__.py +3 -0
  74. package/vendors/precis/backend/agents/data_synthesis.py +105 -0
  75. package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
  76. package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
  77. package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
  78. package/vendors/precis/backend/agents/guardrail.py +175 -0
  79. package/vendors/precis/backend/agents/query_expander.py +89 -0
  80. package/vendors/precis/backend/agents/radial_interpol.py +99 -0
  81. package/vendors/precis/backend/agents/report_generator.py +92 -0
  82. package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
  83. package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
  84. package/vendors/precis/backend/agents/vector_index.py +123 -0
  85. package/vendors/precis/backend/agents/veri_score.py +341 -0
  86. package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
  87. package/vendors/precis/backend/api/__init__.py +3 -0
  88. package/vendors/precis/backend/api/routes/__init__.py +3 -0
  89. package/vendors/precis/backend/config.py +88 -0
  90. package/vendors/precis/backend/core/__init__.py +13 -0
  91. package/vendors/precis/backend/core/hashing.py +22 -0
  92. package/vendors/precis/backend/core/metrics.py +77 -0
  93. package/vendors/precis/backend/core/multitoken.py +166 -0
  94. package/vendors/precis/backend/core/pmi.py +54 -0
  95. package/vendors/precis/backend/core/stemming.py +74 -0
  96. package/vendors/precis/backend/core/tracing.py +150 -0
  97. package/vendors/precis/backend/data/__init__.py +3 -0
  98. package/vendors/precis/backend/data/chunker.py +57 -0
  99. package/vendors/precis/backend/data/pdf_parser.py +42 -0
  100. package/vendors/precis/backend/db/__init__.py +3 -0
  101. package/vendors/precis/backend/db/models.py +173 -0
  102. package/vendors/precis/backend/db/repository.py +269 -0
  103. package/vendors/precis/backend/llm/__init__.py +3 -0
  104. package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
  105. package/vendors/precis/backend/llm/base.py +147 -0
  106. package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
  107. package/vendors/precis/backend/llm/factory.py +60 -0
  108. package/vendors/precis/backend/llm/google_provider.py +39 -0
  109. package/vendors/precis/backend/llm/ollama_provider.py +54 -0
  110. package/vendors/precis/backend/llm/openai_provider.py +50 -0
  111. package/vendors/precis/backend/main.py +677 -0
  112. package/vendors/precis/backend/orchestrator/__init__.py +3 -0
  113. package/vendors/precis/backend/orchestrator/planner.py +81 -0
  114. package/vendors/precis/backend/orchestrator/router.py +319 -0
  115. package/vendors/precis/backend/orchestrator/types.py +58 -0
  116. package/vendors/precis/bridge_precis.py +185 -0
  117. package/vendors/precis/data/sample_reports/README.md +8 -0
  118. package/vendors/precis/data/seed_data.py +115 -0
  119. package/vendors/precis/requirements.txt +19 -0
@@ -0,0 +1,327 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Dict, Iterator, List, Optional, Tuple
5
+
6
+
7
@dataclass
class MultiToken:
    """An ordered tuple of stemmed words plus the provenance of where it was found.

    Only ``tokens`` is required; every other field has a neutral default so a
    MultiToken can be built from nothing more than its word sequence.
    """

    # The word sequence itself; used as the path through the nested index.
    tokens: Tuple[str, ...]
    # Category label. The index treats the value "contextual" specially.
    token_type: str = "standard"
    # Provenance of the match.
    source_doc: str = ""
    source_page: int = 0
    source_position: int = 0  # presumably a line number in the document — confirm with the extractor
    # Layout hints that feed the trust heuristic.
    font_size: Optional[float] = None
    is_title: bool = False
    is_header: bool = False
    # Free-form extras; a fresh dict per instance.
    metadata: Dict[str, Any] = field(default_factory=dict)
19
+
20
+
21
@dataclass
class RetrievalResult:
    """One hit returned by the index: the matched MultiToken and its scores."""

    # The indexed entry that matched.
    multitoken: MultiToken
    # How well the entry matches the query.
    relevance_score: float
    # Heuristic confidence in the entry's source.
    trustworthiness_score: float
    # Which search tier produced the hit, e.g. "exact", "subset",
    # "contextual", "broad" or "semantic_fallback".
    match_type: str
    # Query tokens that were found in the entry (empty when not computed).
    matched_tokens: List[str] = field(default_factory=list)
29
+
30
+
31
class NestedHashIndex:
    """Core retrieval engine: nested dict tree → O(m) exact lookup, no embeddings.

    Every MultiToken is stored under the path spelled out by its tokens; the
    node at the end of that path keeps its entries in a list under the
    reserved key ``"_items"``.
    """

    def __init__(self) -> None:
        self.index: Dict[str, Any] = {}
        self.multitoken_count: int = 0
        self.unique_tokens: set = set()
        self._access_counts: Dict[str, int] = {}
        self._doc_texts: Dict[str, str] = {}  # filename → full original text
        from backend.core.pmi import PMIScorer
        self.pmi: PMIScorer = PMIScorer()

    # ── Insert ─────────────────────────────────────────────────

    def insert(self, mt: MultiToken) -> None:
        """Store *mt* at the node reached by walking its tokens, creating nodes as needed."""
        node = self.index
        for tok in mt.tokens:
            node = node.setdefault(tok, {})
        node.setdefault("_items", []).append(mt)
        self.multitoken_count += 1
        self.unique_tokens.update(mt.tokens)
        # Best effort: also feed the shared PMI scorer; any failure is ignored.
        try:
            from backend.core.pmi import get_pmi_scorer
            get_pmi_scorer().ingest_tokens(list(mt.tokens))
        except Exception:
            pass
        # The index's own scorer is always updated.
        self.pmi.ingest_tokens(list(mt.tokens))

    def insert_batch(self, multitokens: List[MultiToken]) -> int:
        """Insert every entry of *multitokens* and return how many were given."""
        for entry in multitokens:
            self.insert(entry)
        return len(multitokens)

    # ── Search ─────────────────────────────────────────────────

    def exact_search(self, query_tokens: Tuple[str, ...]) -> List[RetrievalResult]:
        """Return the entries stored exactly at the path *query_tokens* (relevance 1.0)."""
        node = self.index
        for tok in query_tokens:
            if tok not in node:
                return []
            node = node[tok]
        hits = node.get("_items", [])
        self._record_access(hits)
        found: List[RetrievalResult] = []
        for mt in hits:
            found.append(RetrievalResult(
                multitoken=mt,
                relevance_score=1.0,
                trustworthiness_score=self._trust_score(mt),
                match_type="exact",
            ))
        return found

    def subset_search(self, query_tokens: Tuple[str, ...],
                      min_match_ratio: float = 0.75) -> List[RetrievalResult]:
        """Return entries of leaves sharing at least *min_match_ratio* of the query tokens.

        The leaf's token set is taken from its first entry. Results are sorted
        by descending relevance.
        """
        found: List[RetrievalResult] = []
        wanted = set(query_tokens)
        for leaf in self._iter_leaves():
            entries = leaf.get("_items", [])
            if not entries:
                continue
            leaf_tokens = set(entries[0].tokens)
            shared = wanted & leaf_tokens
            overlap = len(shared)
            ratio = overlap / len(wanted) if wanted else 0.0
            if ratio < min_match_ratio:
                continue
            # Penalise leaves that contain many tokens outside the query.
            score = ratio * (overlap / len(leaf_tokens)) if leaf_tokens else ratio
            self._record_access(entries)
            for mt in entries:
                found.append(RetrievalResult(
                    multitoken=mt,
                    relevance_score=score,
                    trustworthiness_score=self._trust_score(mt),
                    match_type="subset",
                    matched_tokens=list(shared),
                ))
        return sorted(found, key=lambda r: r.relevance_score, reverse=True)

    def contextual_search(self, query_tokens: Tuple[str, ...]) -> List[RetrievalResult]:
        """Return contextual / title / header entries that share any token with the query."""
        found: List[RetrievalResult] = []
        wanted = set(query_tokens)
        for leaf in self._iter_leaves():
            for mt in leaf.get("_items", []):
                if not (mt.token_type == "contextual" or mt.is_title or mt.is_header):
                    continue
                shared = wanted & set(mt.tokens)
                overlap = len(shared)
                if overlap <= 0:
                    continue
                # 1.5x boost, capped at 1.0.
                score = min(overlap / len(wanted) * 1.5, 1.0)
                found.append(RetrievalResult(
                    multitoken=mt,
                    relevance_score=score,
                    trustworthiness_score=self._trust_score(mt),
                    match_type="contextual",
                    matched_tokens=list(shared),
                ))
        return sorted(found, key=lambda r: r.relevance_score, reverse=True)

    def hybrid_search(self, query_tokens: Tuple[str, ...],
                      include_semantic_fallback: bool = False,
                      source_filter: Optional[List[str]] = None,
                      trace=None) -> List[RetrievalResult]:
        """Multi-tier search across all indexed documents.

        Parameters
        ----------
        source_filter : Optional[List[str]]
            If provided, only return results whose source_doc is in this list.
            Case-insensitive basename matching (e.g. ``["report.pdf"]``).
        """
        def emit(message: str, tier: int, count: int) -> None:
            # Trace events are only sent when a truthy trace object was supplied.
            if trace:
                trace.event(type("TE", (), {"value": "decision.search_type"})(), agent_name="ExactHash",
                            message=message, data={"tier": tier, "count": count})

        # Tier 1: exact
        results = self.exact_search(query_tokens)
        emit(f"Exact search: {len(results)} results", 1, len(results))

        # Tier 2: subset fallback (threshold 0.5) when exact hits are scarce
        if len(results) < 5 and len(query_tokens) >= 2:
            subset_hits = self.subset_search(query_tokens, min_match_ratio=0.5)
            emit(f"Subset fallback: {len(subset_hits)} results", 2, len(subset_hits))
            results.extend(subset_hits)

        # Tier 3: contextual boost (titles/headers) — always applied
        results.extend(self.contextual_search(query_tokens))

        # Tier 4: broad sweep when still almost nothing was found
        if len(results) < 3:
            broad_hits = self.broad_search(query_tokens)
            emit(f"Broad sweep: {len(broad_hits)} results", 4, len(broad_hits))
            results.extend(broad_hits)

        # Tier 5: PMI re-ranking, best effort — any failure leaves scores as they are
        try:
            from backend.core.pmi import get_pmi_scorer
            scorer = get_pmi_scorer()
            for hit in results:
                raw = scorer.score(list(query_tokens), list(hit.multitoken.tokens))
                normalized = scorer.normalize_score(raw)
                # Blend: 70% structural match + 30% token rarity
                hit.relevance_score = round(hit.relevance_score * 0.7 + normalized * 0.3, 4)
        except Exception:
            pass

        # ── Document-scope filter ────────────────────────────────
        if source_filter:
            import os
            cleaned = (os.path.basename(str(name).lower().strip()) for name in source_filter)
            wanted_docs = {name for name in cleaned if name}
            n_before = len(results)
            if wanted_docs:
                results = [hit for hit in results
                           if os.path.basename(hit.multitoken.source_doc.lower().strip()) in wanted_docs]
            print(f"[Precis] Hash filter: source_filter={source_filter!r} filter_set={wanted_docs!r} before={n_before} after={len(results)}")

        results.sort(key=lambda r: r.relevance_score, reverse=True)
        return results

    def broad_search(self, query_tokens: Tuple[str, ...]) -> List[RetrievalResult]:
        """Last-resort search: any leaf with meaningful token overlap (≥30% of query)."""
        found: List[RetrievalResult] = []
        wanted = set(query_tokens)
        # At least one shared token, or 30 % of the query length (rounded down).
        threshold = max(1, int(len(query_tokens) * 0.3))
        for leaf in self._iter_leaves():
            for mt in leaf.get("_items", []):
                shared = wanted & set(mt.tokens)
                overlap = len(shared)
                if overlap < threshold:
                    continue
                score = min(overlap / len(wanted) * 0.8, 0.9)
                found.append(RetrievalResult(
                    multitoken=mt,
                    relevance_score=score,
                    trustworthiness_score=self._trust_score(mt),
                    match_type="broad",
                    matched_tokens=list(shared),
                ))
        ordered = sorted(found, key=lambda r: r.relevance_score, reverse=True)
        return ordered[:30]  # Cap broad results

    # ── Maintenance ─────────────────────────────────────────────

    def auto_distill(self, min_access_count: int = 3) -> int:
        """Empty every leaf whose first entry was accessed fewer than *min_access_count* times.

        Returns the number of entries removed.
        """
        removed = 0
        for leaf in list(self._iter_leaves()):
            entries = leaf.get("_items", [])
            if not entries:
                continue
            seen = self._access_counts.get(self._leaf_key(entries[0]), 0)
            if seen >= min_access_count:
                continue
            leaf["_items"] = []
            self.multitoken_count -= len(entries)
            removed += len(entries)
        return removed

    def index_document(self, text: str, source: str = "uploaded_document") -> int:
        """Parse raw text into MultiTokens and insert into the index. Returns count of tokens indexed."""
        # Keep the original text so get_context() can quote from it later.
        self._doc_texts[source] = text

        from backend.core.multitoken import MultiTokenExtractor

        extractor = MultiTokenExtractor(max_token_length=7, min_token_length=2)

        elements = []
        for raw_line in text.strip().split("\n"):
            stripped = raw_line.strip()
            if not stripped:
                continue
            # All-caps lines are treated as titles/headers with a larger font.
            shouting = stripped.isupper()
            elements.append({
                "text": stripped,
                "is_title": shouting and len(stripped) < 80,
                "is_header": shouting and len(stripped) < 60,
                "font_size": 14.0 if shouting else 10.0,
            })
        parsed = [{"page_number": 1, "elements": elements}]

        # The extractor performs the actual insertion into this index.
        return extractor.index_document(source, parsed, self)

    def get_statistics(self) -> Dict[str, Any]:
        """Return size figures for the index."""
        return {
            "multitoken_count": self.multitoken_count,
            "unique_tokens": len(self.unique_tokens),
            "index_depth": self._compute_depth(self.index),
            "memory_estimate_mb": self.multitoken_count * 0.002,
            "cached_documents": len(self._doc_texts),
        }

    def get_context(self, source_doc: str, source_page: int, source_position: int,
                    window: int = 5) -> Dict[str, Any]:
        """Retrieve surrounding lines from the original document for a match position.

        source_position is the LINE NUMBER in the full document (set during indexing).
        The first non-empty line at or after that number is used as the anchor;
        if there is none, the last non-empty line is used.
        """
        text = self._doc_texts.get(source_doc, "")
        if not text:
            return {"sentence": "(source text not cached)", "surrounding": "", "page": source_page, "file": source_doc}

        # (original line number, stripped text) for every non-blank line
        non_empty = [(no, raw.strip()) for no, raw in enumerate(text.split("\n")) if raw.strip()]
        if not non_empty:
            return {"sentence": "", "surrounding": "", "page": source_page, "file": source_doc}

        idx = next(
            (i for i, (line_no, _) in enumerate(non_empty) if line_no >= source_position),
            len(non_empty) - 1,
        )

        lo = max(0, idx - window)
        hi = min(len(non_empty), idx + window + 1)
        surrounding = "\n".join(line for _, line in non_empty[lo:hi])
        sentence = non_empty[idx][1] if idx < len(non_empty) else ""

        return {
            "sentence": sentence,
            "surrounding": surrounding,
            "page": source_page,
            "file": source_doc,
        }

    # ── Internal ────────────────────────────────────────────────

    def _iter_leaves(self) -> Iterator[Dict[str, Any]]:
        """Yield every node holding an ``"_items"`` list, parents before children."""
        pending = [self.index]
        while pending:
            node = pending.pop()
            if "_items" in node:
                yield node
            children = [child for key, child in node.items()
                        if key != "_items" and isinstance(child, dict)]
            # Reverse so that pop() visits children in insertion order.
            pending.extend(reversed(children))

    def _trust_score(self, mt: MultiToken) -> float:
        """Heuristic trust in [0.5, 1.0] derived from the entry's layout flags."""
        bonuses = (
            (mt.is_title, 0.2),
            (mt.is_header, 0.1),
            (mt.font_size and mt.font_size > 12, 0.1),
            (mt.token_type == "contextual", 0.15),
        )
        score = 0.5
        for applies, bonus in bonuses:
            if applies:
                score += bonus
        return min(score, 1.0)

    def _record_access(self, items: List[MultiToken]) -> None:
        """Bump the access counter of each entry's provenance key."""
        counts = self._access_counts
        for mt in items:
            key = self._leaf_key(mt)
            counts[key] = counts.get(key, 0) + 1

    @staticmethod
    def _leaf_key(mt: MultiToken) -> str:
        """Build the ``doc|page|position`` key used for access counting."""
        return f"{mt.source_doc}|{mt.source_page}|{mt.source_position}"

    @staticmethod
    def _compute_depth(node: dict) -> int:
        """Return the number of nested dict levels below and including *node* (0 if empty)."""
        if not isinstance(node, dict) or not node:
            return 0
        deepest_child = max(
            (NestedHashIndex._compute_depth(child) for key, child in node.items() if key != "_items"),
            default=0,
        )
        return 1 + deepest_child
@@ -0,0 +1,64 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
2
+
3
+ Implements RRF (Reciprocal Rank Fusion): score = Σ 1/(k + rank_i)
4
+ where k=60 is the standard constant, and rank_i is the result's rank in each source.
5
+
6
+ Produces a single ranked list from multiple retrieval backends.
7
+ """
8
+
9
+ from typing import Any, Dict, List
10
+
11
+
12
class FusionRanker:
    """Combines results from multiple retrieval engines using RRF.

    Each result is a dict; the first 100 characters of its ``"text"`` value
    identify it across engines. Every engine contributes at most one
    ``1 / (k + rank)`` term per result, with ranks starting at 1.
    """

    def __init__(self, k: int = 60) -> None:
        self.k = k  # RRF constant

    def fuse(self, sources: Dict[str, List[Dict[str, Any]]],
             top_k: int = 15) -> List[Dict[str, Any]]:
        """Fuse multiple ranked result lists into one.

        Args:
            sources: {"hash": [...], "vector": [...]} — each list pre-sorted by score
            top_k: Max results to return

        Returns:
            Up to ``top_k`` dicts sorted by descending fused score. Each has
            ``text``, ``source``, ``page``, ``rrf_score``, ``original_score``
            (the highest input score seen), ``sources`` (set of engine names),
            ``score`` (RRF normalised so the best result is 1.0) and
            ``match_type`` (``"fusion:"`` + the engine names joined by ``+``).
        """
        fused: Dict[str, Dict[str, Any]] = {}  # key = text[:100]

        for source_name, results in sources.items():
            # A missing/None list simply contributes nothing.
            for rank, item in enumerate(results or []):
                # Dedup key; tolerate a missing or None text.
                key = (item.get("text") or "")[:100]
                rrf_score = 1.0 / (self.k + rank + 1)

                entry = fused.get(key)
                if entry is None:
                    fused[key] = {
                        "text": item.get("text", ""),
                        "source": item.get("source", ""),
                        "page": item.get("page", 1),
                        "rrf_score": rrf_score,
                        "original_score": item.get("score", 0),
                        "match_type": item.get("match_type", ""),
                        "sources": {source_name},
                    }
                    continue

                # One RRF term per engine: a repeat inside the same list must
                # not be counted again (its best rank was seen first).
                if source_name not in entry["sources"]:
                    entry["rrf_score"] += rrf_score
                    entry["sources"].add(source_name)

                # Keep the metadata of the highest-scoring occurrence.
                if item.get("score", 0) > entry.get("original_score", 0):
                    entry["original_score"] = item.get("score", 0)
                    entry["match_type"] = item.get("match_type", "")
                    entry["source"] = item.get("source", "")
                    entry["page"] = item.get("page", 1)

        # Sort by RRF score, return top_k
        ranked = sorted(fused.values(), key=lambda x: -x["rrf_score"])[:top_k]

        # Normalize scores to 0-1 relative to the best result
        if ranked:
            max_rrf = ranked[0]["rrf_score"]
            for item in ranked:
                item["score"] = round(item["rrf_score"] / max_rrf, 3) if max_rrf > 0 else 0
                item["match_type"] = "fusion:" + "+".join(sorted(item["sources"]))

        return ranked
@@ -0,0 +1,175 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+ from typing import Any, Dict, List, Optional, Tuple
6
+ import re
7
+
8
+
9
class GuardrailAction(str, Enum):
    """Outcome of a guardrail check; each member is also its lowercase string value."""

    PASS = "pass"      # nothing found
    FLAG = "flag"      # issues noted
    REDACT = "redact"  # content was scrubbed
    BLOCK = "block"    # response withheld
14
+
15
+
16
@dataclass
class GuardrailResult:
    """Verdict of a guardrail run plus the details needed to act on it."""

    # The decision taken.
    action: GuardrailAction
    # Human-readable descriptions of everything that was detected.
    issues_found: List[str] = field(default_factory=list)
    # Descriptions of what was scrubbed from the response.
    redacted_content: List[str] = field(default_factory=list)
    # Response text to deliver; holds the PII-scrubbed text when action=REDACT.
    redacted_response: Optional[str] = None
    # Confidence attached to the verdict.
    confidence: float = 1.0
    # Whether a person should look at this result.
    requires_human_review: bool = False
24
+
25
+
26
class GuardrailAgent:
    """Validates outputs before delivery. PII detection, hallucination check, content policy.

    Layer 1 — PII Detection & Redaction:
        Scans the generated response for SSNs, credit-card numbers, email
        addresses, and phone numbers. When PII is found the response is
        *redacted* rather than blocked outright, so the user still
        receives useful content.

    Layer 2 — Prompt Injection:
        Checks the original query for injection markers. These are ALWAYS
        blocked — no response is returned.

    Layer 3 — Hallucination Threshold:
        If the VeriScore hallucination rate exceeds 80 %, the response is
        blocked. If it exceeds 30 %, the response is flagged for human
        review but still delivered (with a warning).
    """

    # ── PII patterns: (regex, human-readable type, replacement mask) ─
    # Applied in this order, each on the output of the previous one.
    _PII_PATTERNS: List[Tuple[str, str, str]] = [
        (r'\b\d{3}-\d{2}-\d{4}\b', "SSN", "[REDACTED-SSN]"),
        (r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b', "Credit Card", "[REDACTED-CC]"),
        # TLD class is letters only; a "|" inside the brackets would be a
        # literal pipe and swallow text following the address.
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', "Email", "[REDACTED-EMAIL]"),
        (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', "Phone", "[REDACTED-PHONE]"),
    ]

    # ── Prompt injection markers: any match → immediate BLOCK ──────
    # Compared case-insensitively against the query.
    _INJECTION_MARKERS: List[str] = [
        "ignore previous", "you are now", "system prompt",
        "[/INST]", "<|im_start|>", "<|im_end|>",
        "forget all", "new instructions", "pretend you are",
    ]

    # ── Thresholds ─────────────────────────────────────────────────
    HALLUCINATION_BLOCK_THRESHOLD: float = 0.8   # > 80 % → block (extreme cases only)
    HALLUCINATION_FLAG_THRESHOLD: float = 0.3    # > 30 % → flag

    # ── Public API ──────────────────────────────────────────────────

    async def validate(
        self,
        generated_response: str,
        retrieved_sources: List[Dict[str, Any]],
        original_query: str,
        veriscore_report: Optional[Any] = None,
    ) -> "GuardrailResult":
        """Run all safety layers and return the resulting verdict.

        Args:
            generated_response: Text about to be delivered; scanned for PII.
            retrieved_sources: Accepted for interface compatibility; not
                inspected by this method.
            original_query: The user's query; scanned for injection markers.
            veriscore_report: Optional object whose ``hallucination_rate``
                and ``flagged_issues`` attributes are read when present.

        Returns:
            BLOCK on prompt injection or a hallucination rate above the block
            threshold; otherwise REDACT when PII was scrubbed, FLAG when only
            other issues were found, and PASS when nothing was found.
        """
        issues: List[str] = []
        redacted: List[str] = []
        confidence: float = 1.0
        needs_review: bool = False

        # ── Layer 1: PII Detection & Redaction ──────────────────
        redacted_text, pii_found = self._redact_pii(generated_response)
        if pii_found:
            issues.extend(pii_found)
            redacted.extend(pii_found)
            confidence = 0.85
            needs_review = True

        # ── Layer 2: Prompt Injection ──────────────────────────
        query_lower = (original_query or "").lower()
        for marker in self._INJECTION_MARKERS:
            if marker.lower() in query_lower:
                issues.append(f"Prompt injection detected: '{marker}'")
                return GuardrailResult(
                    action=GuardrailAction.BLOCK,
                    issues_found=issues,
                    redacted_content=redacted,
                    confidence=1.0,
                    requires_human_review=True,
                )

        # ── Layer 3: Hallucination Rate Threshold ──────────────
        if veriscore_report is not None:
            # A present-but-None rate is treated like a missing one.
            hall_rate = getattr(veriscore_report, "hallucination_rate", 0.0) or 0.0
            if hall_rate > self.HALLUCINATION_BLOCK_THRESHOLD:
                issues.append(
                    f"Critical hallucination rate ({hall_rate:.0%} > "
                    f"{self.HALLUCINATION_BLOCK_THRESHOLD:.0%}) — response blocked"
                )
                return GuardrailResult(
                    action=GuardrailAction.BLOCK,
                    issues_found=issues,
                    redacted_content=redacted,
                    confidence=0.6,
                    requires_human_review=True,
                )
            elif hall_rate > self.HALLUCINATION_FLAG_THRESHOLD:
                issues.append(
                    f"High hallucination rate ({hall_rate:.0%} > "
                    f"{self.HALLUCINATION_FLAG_THRESHOLD:.0%})"
                )
                confidence = 0.7
                needs_review = True

            # ── Also carry over flagged_issues from VeriScore (no duplicates) ──
            for fi in getattr(veriscore_report, "flagged_issues", None) or []:
                if fi not in issues:
                    issues.append(fi)

        # ── Decide final action ────────────────────────────────
        if not issues:
            return GuardrailResult(
                action=GuardrailAction.PASS,
                redacted_response=redacted_text or generated_response,
            )

        if redacted:
            # PII was found → redact and deliver
            return GuardrailResult(
                action=GuardrailAction.REDACT,
                issues_found=issues,
                redacted_content=redacted,
                redacted_response=redacted_text,
                confidence=confidence,
                requires_human_review=needs_review,
            )

        # Non-PII issues → flag for review but still deliver
        return GuardrailResult(
            action=GuardrailAction.FLAG,
            issues_found=issues,
            redacted_response=generated_response,
            confidence=confidence,
            requires_human_review=needs_review,
        )

    # ── PII Redaction ──────────────────────────────────────────────

    def _redact_pii(self, text: str) -> Tuple[str, List[str]]:
        """Scan *text* for PII patterns and replace matches with safe tokens.

        Returns (redacted_text, list_of_types_found), where each list entry
        names the PII type and the number of instances replaced.
        """
        found: List[str] = []
        result = text
        for pattern, pii_type, replacement in self._PII_PATTERNS:
            # subn replaces and counts in a single pass.
            result, hits = re.subn(pattern, replacement, result)
            if hits:
                found.append(f"PII redacted: {pii_type} ({hits} instance(s))")
        return result, found
@@ -0,0 +1,89 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
2
+
3
+ Transforms natural language into the domain-specific vocabulary found in indexed documents.
4
+ Example: "currency impact" → "foreign exchange rate exposure currency fluctuation risk"
5
+ No embeddings, no vector DB — just the LLM's knowledge of financial synonyms.
6
+ """
7
+
8
+ from typing import Optional
9
+ from backend.llm.base import LLMProvider
10
+
11
+
12
class QueryExpander:
    """Rewrites queries using LLM knowledge of domain terminology when exact hash search fails."""

    # Fallback vocabulary used when the LLM gives no usable answer.
    _SYNONYMS = {
        "currency": "foreign exchange fx rate",
        "money": "capital funds revenue cash",
        "risk": "exposure uncertainty volatility",
        "profit": "earnings income margin return",
        "loss": "impairment write-down decline decrease",
        "revenue": "sales income turnover top-line",
        "cost": "expense expenditure outlay",
        "market": "sector industry segment",
        "growth": "expansion increase appreciation",
        "rule": "regulation compliance requirement policy",
        "problem": "issue incident concern challenge",
        "impact": "effect influence exposure consequence",
    }

    # Punctuation ignored around a word when looking it up in _SYNONYMS.
    _EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"

    def __init__(self, llm: "LLMProvider") -> None:
        self.llm = llm

    async def expand(self, original_query: str, failed_tokens: list,
                     document_domain: str = "financial and legal") -> list:
        """Generate alternative queries with different terminology.

        Args:
            original_query: The query that produced no matches.
            failed_tokens: Search tokens that failed; at most the first 20
                are shown to the LLM. When empty, the query itself is shown.
            document_domain: Domain named in the prompt.

        Returns:
            A list of up to 3 non-empty query strings taken from the LLM's
            JSON answer, in the order given. If the LLM call fails, times
            out (20 s), or yields no usable strings, a one-element list with
            the synonym-based fallback expansion is returned instead.
        """
        # str() so that non-string tokens cannot break the join.
        token_str = ", ".join(str(t) for t in failed_tokens[:20]) if failed_tokens else original_query

        prompt = f"""You are a {document_domain} domain expert. A document search system failed
to find matches for the following query because the EXACT words don't appear in the documents.

ORIGINAL QUERY: {original_query}
FAILED SEARCH TOKENS: {token_str}

The documents contain formal {document_domain} terminology. Rewrite the original query
using alternative words, synonyms, and related {document_domain} terms that are MORE LIKELY
to appear in formal documents.

For example:
- "currency impact" → "foreign exchange rate exposure"
- "money lost" → "financial impairment write-down loss"
- "hacking problem" → "cybersecurity incident data breach unauthorized access"
- "worker shortage" → "talent attrition labor supply constraints headcount reduction"
- "green rules" → "environmental regulation climate compliance carbon emission"

Return ONLY a JSON list of 3 rewritten queries, most likely to match first:
["rewritten query 1", "rewritten query 2", "rewritten query 3"]"""

        try:
            import asyncio
            import json
            response = await asyncio.wait_for(
                self.llm.generate(prompt, max_tokens=300, temperature=0.3),
                timeout=20
            )
            # The answer may be wrapped in prose: take the outermost [...] span.
            start = response.find("[")
            end = response.rfind("]") + 1
            if start >= 0 and end > start:
                expansions = json.loads(response[start:end])
                if isinstance(expansions, list):
                    # Keep only usable query strings; an empty result falls
                    # through to the fallback below.
                    usable = [q for q in expansions if isinstance(q, str) and q.strip()]
                    if usable:
                        return usable[:3]
        except Exception:
            # Deliberate best effort: any LLM, timeout or parsing failure
            # degrades to the local fallback instead of failing the search.
            pass

        # Fallback: simple word-level expansion using common financial synonyms
        return [self._basic_expand(original_query)]

    def _basic_expand(self, query: str) -> str:
        """Simple synonym substitution when LLM is unavailable.

        The query is lower-cased and split on whitespace; every word is kept,
        and words found in the synonym table (ignoring surrounding
        punctuation) are followed by their synonyms.
        """
        expanded = []
        for word in query.lower().split():
            expanded.append(word)
            extra = self._SYNONYMS.get(word.strip(self._EDGE_PUNCTUATION))
            if extra:
                expanded.append(extra)
        return " ".join(expanded)