mcp-agentic-pipelines 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.env.example +93 -0
  2. package/README.md +258 -0
  3. package/package.json +70 -0
  4. package/packages/clinical/package.json +22 -0
  5. package/packages/clinical/src/index.ts +262 -0
  6. package/packages/clinical/tsconfig.json +13 -0
  7. package/packages/core/package.json +21 -0
  8. package/packages/core/src/config.ts +138 -0
  9. package/packages/core/src/errors.ts +100 -0
  10. package/packages/core/src/index.ts +104 -0
  11. package/packages/core/src/llm-config.ts +213 -0
  12. package/packages/core/src/logging.ts +66 -0
  13. package/packages/core/src/python-bridge.ts +384 -0
  14. package/packages/core/src/rate-limiter.ts +136 -0
  15. package/packages/core/src/types.ts +203 -0
  16. package/packages/core/src/validation.ts +101 -0
  17. package/packages/core/tsconfig.json +10 -0
  18. package/packages/deeppipe/package.json +21 -0
  19. package/packages/deeppipe/src/index.ts +424 -0
  20. package/packages/deeppipe/tsconfig.json +13 -0
  21. package/packages/piste/package.json +20 -0
  22. package/packages/piste/src/index.ts +48 -0
  23. package/packages/piste/tsconfig.json +13 -0
  24. package/packages/precis/package.json +20 -0
  25. package/packages/precis/src/index.ts +67 -0
  26. package/packages/precis/tsconfig.json +13 -0
  27. package/packages/server/package.json +31 -0
  28. package/packages/server/src/index.ts +427 -0
  29. package/packages/server/tsconfig.json +17 -0
  30. package/setup.mjs +141 -0
  31. package/test.mjs +337 -0
  32. package/vendors/clinical-intake/pipeline.mjs +349 -0
  33. package/vendors/clinical-intake/questions/en.txt +9 -0
  34. package/vendors/clinical-intake/questions/fr.txt +9 -0
  35. package/vendors/piste/.env.example +73 -0
  36. package/vendors/piste/app/core/__init__.py +4 -0
  37. package/vendors/piste/app/core/config.py +83 -0
  38. package/vendors/piste/app/core/debuglog.py +16 -0
  39. package/vendors/piste/app/core/middleware.py +40 -0
  40. package/vendors/piste/bridge_piste.py +301 -0
  41. package/vendors/piste/pipeline/__init__.py +4 -0
  42. package/vendors/piste/pipeline/compiler.py +68 -0
  43. package/vendors/piste/pipeline/offline/__init__.py +28 -0
  44. package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
  45. package/vendors/piste/pipeline/replay.py +15 -0
  46. package/vendors/piste/pipeline/replay_engine.py +249 -0
  47. package/vendors/piste/pipeline/signatures/__init__.py +4 -0
  48. package/vendors/piste/pipeline/signatures/signatures.py +136 -0
  49. package/vendors/piste/pipeline/stage1/__init__.py +21 -0
  50. package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
  51. package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
  52. package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
  53. package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
  54. package/vendors/piste/pipeline/stage2/__init__.py +34 -0
  55. package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
  56. package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
  57. package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
  58. package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
  59. package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
  60. package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
  61. package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
  62. package/vendors/piste/pipeline/stage3/__init__.py +20 -0
  63. package/vendors/piste/pipeline/stage3/classifier.py +79 -0
  64. package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
  65. package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
  66. package/vendors/piste/pipeline/stage4/__init__.py +33 -0
  67. package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
  68. package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
  69. package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
  70. package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
  71. package/vendors/piste/requirements.txt +53 -0
  72. package/vendors/precis/backend/__init__.py +6 -0
  73. package/vendors/precis/backend/agents/__init__.py +3 -0
  74. package/vendors/precis/backend/agents/data_synthesis.py +105 -0
  75. package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
  76. package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
  77. package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
  78. package/vendors/precis/backend/agents/guardrail.py +175 -0
  79. package/vendors/precis/backend/agents/query_expander.py +89 -0
  80. package/vendors/precis/backend/agents/radial_interpol.py +99 -0
  81. package/vendors/precis/backend/agents/report_generator.py +92 -0
  82. package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
  83. package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
  84. package/vendors/precis/backend/agents/vector_index.py +123 -0
  85. package/vendors/precis/backend/agents/veri_score.py +341 -0
  86. package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
  87. package/vendors/precis/backend/api/__init__.py +3 -0
  88. package/vendors/precis/backend/api/routes/__init__.py +3 -0
  89. package/vendors/precis/backend/config.py +88 -0
  90. package/vendors/precis/backend/core/__init__.py +13 -0
  91. package/vendors/precis/backend/core/hashing.py +22 -0
  92. package/vendors/precis/backend/core/metrics.py +77 -0
  93. package/vendors/precis/backend/core/multitoken.py +166 -0
  94. package/vendors/precis/backend/core/pmi.py +54 -0
  95. package/vendors/precis/backend/core/stemming.py +74 -0
  96. package/vendors/precis/backend/core/tracing.py +150 -0
  97. package/vendors/precis/backend/data/__init__.py +3 -0
  98. package/vendors/precis/backend/data/chunker.py +57 -0
  99. package/vendors/precis/backend/data/pdf_parser.py +42 -0
  100. package/vendors/precis/backend/db/__init__.py +3 -0
  101. package/vendors/precis/backend/db/models.py +173 -0
  102. package/vendors/precis/backend/db/repository.py +269 -0
  103. package/vendors/precis/backend/llm/__init__.py +3 -0
  104. package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
  105. package/vendors/precis/backend/llm/base.py +147 -0
  106. package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
  107. package/vendors/precis/backend/llm/factory.py +60 -0
  108. package/vendors/precis/backend/llm/google_provider.py +39 -0
  109. package/vendors/precis/backend/llm/ollama_provider.py +54 -0
  110. package/vendors/precis/backend/llm/openai_provider.py +50 -0
  111. package/vendors/precis/backend/main.py +677 -0
  112. package/vendors/precis/backend/orchestrator/__init__.py +3 -0
  113. package/vendors/precis/backend/orchestrator/planner.py +81 -0
  114. package/vendors/precis/backend/orchestrator/router.py +319 -0
  115. package/vendors/precis/backend/orchestrator/types.py +58 -0
  116. package/vendors/precis/bridge_precis.py +185 -0
  117. package/vendors/precis/data/sample_reports/README.md +8 -0
  118. package/vendors/precis/data/seed_data.py +115 -0
  119. package/vendors/precis/requirements.txt +19 -0
@@ -0,0 +1,99 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ import numpy as np
4
+ from dataclasses import dataclass, field
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
+
8
@dataclass
class TrainingNode:
    """A stored training sample acting as one RBF centre."""

    # Feature vector as returned by the predictor's ``_phi_transform``.
    beta: np.ndarray
    # Target value observed for this sample.
    response: float
    # Caller-supplied payload, echoed back in prediction contributions.
    metadata: Dict = field(default_factory=dict)


@dataclass
class PredictionResult:
    """Container for a predicted value and the nodes that produced it.

    NOTE(review): nothing in this module constructs it; ``predict`` returns a
    plain ``(value, contributions)`` tuple instead.
    """

    predicted_value: float
    confidence: float
    contributing_nodes: List[Dict]
    metadata: Dict = field(default_factory=dict)


class RadialInterpolPredictor:
    """Kernel-weighted (RBF) predictor over stored training nodes.

    Each node ``k`` gets the weight ``w_k = exp(-tau * K(x, beta_k))`` where
    ``K`` is ``gamma`` times the squared Euclidean distance in min-max-scaled
    feature space.  The prediction is the normalised weighted mean
    ``sum(w_k * f(beta_k)) / sum(w_k)``.
    """

    def __init__(self, tau: float = 500.0, gamma: float = 1.0) -> None:
        """Create an empty predictor.

        Args:
            tau: Sharpness of the exponential weighting.
            gamma: Scale factor applied to the squared distance.
        """
        self.tau = tau
        self.gamma = gamma
        self.nodes: List[TrainingNode] = []
        self.n: int = 0
        self.m: int = 0
        self._X_min: Optional[np.ndarray] = None
        self._X_max: Optional[np.ndarray] = None
        self._node_access_counts: np.ndarray = np.array([])
        self._total_predictions: int = 0

    def fit(self, X: np.ndarray, y: np.ndarray, metadata: Optional[List[Dict]] = None) -> None:
        """Store one node per row of ``X`` and learn the min-max scaling.

        Args:
            X: 2-D array-like of shape ``(n_samples, n_features)``.
            y: Indexable of ``n_samples`` target values.
            metadata: Optional per-sample payloads.  When omitted (or empty),
                every node receives its own fresh empty dict.
        """
        X = np.asarray(X)
        self._X_min = X.min(axis=0)
        self._X_max = X.max(axis=0)
        X_t = self._phi_transform(X)
        # One dict per node: ``[{}] * n`` would alias a single dict across all
        # nodes, so mutating one node's metadata would silently change them all.
        meta_list = metadata if metadata else [{} for _ in range(len(X))]
        self.nodes = [
            TrainingNode(beta=row, response=y[i], metadata=meta_list[i])
            for i, row in enumerate(X_t)
        ]
        self.n = len(self.nodes)
        self.m = X.shape[1]
        self._node_access_counts = np.zeros(self.n)

    def predict(self, x: np.ndarray, top_k: int = 20,
                trace=None) -> Tuple[float, List[Dict]]:
        """Predict the response at ``x``.

        Args:
            x: Feature vector (array-like) in the original, unscaled space.
            top_k: Maximum number of contribution records to return.
            trace: Optional tracer; when truthy its ``event`` method is called
                once with a summary of the prediction.

        Returns:
            ``(prediction, contributions)`` where contributions are sorted by
            descending weight.  ``(0.0, [])`` is returned when there are no
            nodes or when every weight is at or below ``1e-15``.
        """
        if self.n == 0:
            return 0.0, []
        x_t = self._phi_transform(np.asarray(x).reshape(1, -1))[0]
        contributions: List[Dict] = []
        weighted_sum, weight_total = 0.0, 0.0
        for i, node in enumerate(self.nodes):
            K = self._kernel(x_t, node.beta)
            w = np.exp(-self.tau * K)
            # Negligible weights are skipped and do not count as an access.
            if w > 1e-15:
                weighted_sum += w * node.response
                weight_total += w
                self._node_access_counts[i] += 1
                contributions.append({"node_idx": i, "weight": float(w), "kernel_distance": float(K),
                                      "response": float(node.response), "metadata": node.metadata})
        if weight_total == 0:
            return 0.0, []
        f_pred = weighted_sum / weight_total
        self._total_predictions += 1
        contributions.sort(key=lambda c: c["weight"], reverse=True)
        if trace:
            # The tracer receives an ad-hoc object exposing only ``.value``.
            trace.event(type("TE", (), {"value": "decision.prediction"})(), agent_name="RadialInterpol",
                        message=f"Predicted {f_pred:.4f} from {len(contributions)} active nodes",
                        data={"top_weight": contributions[0]["weight"] if contributions else 0})
        return f_pred, contributions[:top_k]

    def _kernel(self, x: np.ndarray, beta: np.ndarray) -> float:
        """Return ``gamma`` times the squared Euclidean distance between the inputs."""
        diff = x - beta
        return self.gamma * float(np.dot(diff, diff))

    def _phi_transform(self, X: np.ndarray) -> np.ndarray:
        """Min-max scale ``X`` with the bounds learned in ``fit``.

        Returns ``X`` unchanged before ``fit`` has run.  Features with zero
        range are divided by 1 so constant columns do not divide by zero.
        """
        if self._X_min is None or self._X_max is None:
            return X
        span = self._X_max - self._X_min
        denom = np.where(span == 0, 1.0, span)
        return (X - self._X_min) / denom

    def auto_distill(self, min_access: int = 0) -> int:
        """Drop nodes accessed fewer than ``min_access`` times.

        Does nothing until at least one successful prediction has been made.
        With the default ``min_access=0`` every node is kept.

        Returns:
            The number of nodes removed.
        """
        if self._total_predictions == 0:
            return 0
        keep = self._node_access_counts >= min_access
        removed = self.n - int(np.sum(keep))
        self.nodes = [node for node, kept in zip(self.nodes, keep) if kept]
        self._node_access_counts = self._node_access_counts[keep]
        self.n = len(self.nodes)
        return removed

    def get_weights_distribution(self, x: np.ndarray) -> np.ndarray:
        """Return the normalised kernel weight of every node at ``x``.

        The weights sum to 1 unless all of them underflow to zero, in which
        case the all-zero vector is returned.  An empty array is returned when
        there are no nodes.
        """
        if self.n == 0:
            return np.array([])
        x_t = self._phi_transform(np.asarray(x).reshape(1, -1))[0]
        weights = np.array([np.exp(-self.tau * self._kernel(x_t, node.beta)) for node in self.nodes])
        total = weights.sum()
        return weights / total if total > 0 else weights
@@ -0,0 +1,92 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+ from typing import Any, Dict, List, Optional
6
+
7
+
8
+ @dataclass
9
+ class ReportSection:
10
+ section_id: str = ""
11
+ title: str = ""
12
+ content: str = ""
13
+ citations: List[Dict[str, Any]] = field(default_factory=list)
14
+ agent_source: str = ""
15
+ relevance: float = 1.0
16
+
17
+
18
+ class ReportGenerator:
19
+ """Assembles agent results into a structured report with citations."""
20
+
21
+ def __init__(self) -> None:
22
+ pass
23
+
24
+ async def generate(self, query: str, agent_results: List[Any],
25
+ veriscore_report: Optional[Any] = None,
26
+ guardrail_result: Optional[Any] = None) -> dict:
27
+ sections = []
28
+ all_citations = []
29
+
30
+ for i, result in enumerate(agent_results):
31
+ if not result.success:
32
+ sections.append({"title": f"Agent: {result.agent_name}",
33
+ "content": f"Error: {result.error_message}",
34
+ "agent_source": result.agent_name})
35
+ continue
36
+
37
+ data = result.data or {}
38
+
39
+ # Build human-readable content from structured data
40
+ if isinstance(data, dict):
41
+ parts = []
42
+ synth = data.get("synthesis", "")
43
+ if synth:
44
+ parts.append(str(synth))
45
+ for item in data.get("results", []):
46
+ if isinstance(item, dict):
47
+ txt = item.get("text", "")
48
+ src = item.get("source", "")
49
+ score = item.get("score", "")
50
+ mt = item.get("match_type", "")
51
+ if txt:
52
+ line = str(txt)
53
+ if src:
54
+ line += f" [{src}]"
55
+ if score:
56
+ line += f" [score={score}, {mt}]"
57
+ parts.append(line)
58
+ pred = data.get("prediction")
59
+ if pred is not None:
60
+ parts.append(f"Prediction: {pred}")
61
+ for flag in data.get("flags", []):
62
+ parts.append(str(flag))
63
+ content = "\n".join(parts) if parts else "(no readable content)"
64
+ else:
65
+ content = str(data)[:1000]
66
+
67
+ for citation in getattr(result, "citations", []):
68
+ all_citations.append({**citation, "agent_source": result.agent_name})
69
+
70
+ sections.append({"title": f"Findings from {result.agent_name}",
71
+ "content": content, "agent_source": result.agent_name,
72
+ "citations": getattr(result, "citations", [])})
73
+
74
+ evaluation_summary = None
75
+ if veriscore_report:
76
+ evaluation_summary = {
77
+ "relevancy": getattr(veriscore_report, "relevancy_score", 0),
78
+ "trust": getattr(veriscore_report, "trustworthiness_score", 0),
79
+ "hallucination_rate": getattr(veriscore_report, "hallucination_rate", 0),
80
+ "citation_coverage": getattr(veriscore_report, "citation_coverage", 0),
81
+ }
82
+
83
+ guardrail_summary = None
84
+ if guardrail_result:
85
+ guardrail_summary = {
86
+ "action": getattr(guardrail_result, "action", "pass"),
87
+ "issues": getattr(guardrail_result, "issues_found", []),
88
+ }
89
+
90
+ return {"query": query, "sections": sections, "citations": all_citations,
91
+ "evaluation": evaluation_summary, "guardrail": guardrail_summary,
92
+ "generated_at": datetime.now().isoformat()}
@@ -0,0 +1,135 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
2
+
3
+ No embeddings, no vector DB — the LLM reads candidate text and scores relevance 0-100.
4
+ Only passes the best candidates to the synthesis step.
5
+ """
6
+
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+ from backend.llm.base import LLMProvider
9
+
10
+
11
class SemanticReRanker:
    """Uses LLM semantic understanding to filter hash-retrieved candidates by true relevance."""

    # Only this many leading candidates are shown to (and scored by) the LLM.
    _MAX_SCORED = 20

    def __init__(self, llm: "LLMProvider") -> None:
        self.llm = llm

    async def rerank(self, query: str, candidates: List[Dict[str, Any]],
                     top_k: int = 5) -> List[Dict[str, Any]]:
        """Score candidates by semantic relevance to the query and return the best ``top_k``.

        Each candidate: {"text": str, "source": str, "score": float, "page": int, ...}

        When there are no more than ``top_k`` candidates they are returned
        unchanged without calling the LLM.  Otherwise the first 20 candidates
        are sent to the LLM; every candidate it scores is annotated in place
        with ``semantic_score`` and ``relevance_reason`` and the best-scored
        ones are returned.  If the call fails, times out, or yields no usable
        score, the candidates are ranked by their original ``score`` instead.
        """
        import asyncio
        import logging

        if not candidates:
            return []
        if len(candidates) <= top_k:
            return candidates

        shown = candidates[:self._MAX_SCORED]
        items = []
        for i, c in enumerate(shown):
            # Prefer surrounding context (full paragraph) over short n-gram text
            text = c.get('surrounding', '') or c.get('sentence', '') or c.get('text', '')
            items.append(f"[{i}] {str(text)[:300]}")

        prompt = f"""You are a precise relevance judge. Score each text chunk below for how well
it answers this query: "{query}"

For each chunk, give a score from 0-100:
90-100: Directly answers the query with specific facts
70-89: Related and useful context
40-69: Tangentially related
0-39: Not relevant

Text chunks:
{chr(10).join(items)}

Return ONLY a JSON array: [{{"index": 0, "score": 85, "reason": "5 words"}}, ...]
Score ALL chunks. Be strict — only give high scores for truly relevant content."""

        try:
            response = await asyncio.wait_for(
                self.llm.generate(prompt, max_tokens=200, temperature=0.0),
                timeout=20
            )
            scored = self._apply_scores(shown, self._extract_entries(response))
            # An empty result means the reply carried nothing usable; rank by
            # hash score below rather than returning no candidates at all.
            if scored:
                scored.sort(key=lambda c: c["semantic_score"], reverse=True)
                return scored[:top_k]
        except Exception as exc:
            # Best-effort step: record why it failed, then use the fallback.
            logging.getLogger(__name__).warning(
                "Semantic rerank failed, falling back to hash scores: %s", exc)

        # Fallback: return top by original hash score
        return sorted(candidates, key=lambda c: c.get("score", 0), reverse=True)[:top_k]

    @staticmethod
    def _extract_entries(response: str) -> List[Any]:
        """Parse the outermost ``[...]`` span of the LLM reply as a JSON list."""
        import json

        start = response.find("[")
        end = response.rfind("]") + 1
        if start < 0 or end <= start:
            return []
        parsed = json.loads(response[start:end])
        return parsed if isinstance(parsed, list) else []

    @staticmethod
    def _apply_scores(shown: List[Dict[str, Any]], entries: List[Any]) -> List[Dict[str, Any]]:
        """Annotate ``shown`` candidates with the LLM's scores; return those scored.

        Malformed entries, indices outside the shown range, and repeated
        indices are ignored (the first score for an index wins).
        """
        scored = []
        seen = set()
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            try:
                idx = int(entry["index"])
                raw = entry.get("score", 0)
                # Keep numeric scores as given; coerce strings such as "85".
                score = raw if isinstance(raw, (int, float)) else float(raw)
            except (KeyError, TypeError, ValueError):
                continue
            if idx in seen or not 0 <= idx < len(shown):
                continue
            seen.add(idx)
            shown[idx]["semantic_score"] = score
            shown[idx]["relevance_reason"] = entry.get("reason", "")
            scored.append(shown[idx])
        return scored
77
+
78
+
79
class DirectReader:
    """When hash search finds nothing, ask the LLM to directly read document text and answer."""

    def __init__(self, llm: "LLMProvider") -> None:
        self.llm = llm

    async def read_and_answer(self, query: str, doc_snippets: List[Dict[str, str]],
                              index) -> Dict[str, Any]:
        """Read document snippets directly and attempt to answer the query.

        doc_snippets: [{"text": "...", "source": "file.pdf", "page": 1}, ...]

        Only the first five snippets are used, each truncated to 3000
        characters.  ``index`` is accepted for interface compatibility and is
        not used here.

        Returns a dict that always contains ``found`` (bool), ``answer`` and
        ``citation``.  If the LLM call fails or times out, ``answer`` explains
        that the LLM was unavailable; if the reply is not a JSON object, the
        first 500 characters of the raw reply are returned as the answer.
        """
        import asyncio

        if not doc_snippets:
            return {"found": False, "answer": "No documents have been uploaded yet. Please upload a document to search.", "citation": ""}

        shown = doc_snippets[:5]
        doc_names = ", ".join(str(d.get("source", "unknown")) for d in shown)

        # Missing keys must not raise here: snippets without a source or text
        # are rendered with the same defaults used for ``doc_names``.
        context = "\n\n".join(
            f"[DOC: {d.get('source', 'unknown')}, page {d.get('page', 1)}]\n{str(d.get('text', ''))[:3000]}"
            for d in shown
        )

        prompt = f"""You are a precise document analyst. Read the document excerpts below
and answer this query: "{query}"

Documents available: {doc_names}

DOCUMENTS:
{context[:15000]}

INSTRUCTIONS:
- If the query mentions a section number (like "section 3.4" or just "3.4"), locate
ALL content under that section heading in the documents and extract the key points.
Section headings may appear as "3.4 Title", "§3.4", or just "3.4" followed by text.
- Summarize what that section actually says — do not just report that the heading exists.
- If the query asks for "key findings" or a "summary", provide the substantive content
even if the document doesn't explicitly label anything as "key findings."
- If you genuinely cannot find the section content anywhere in the provided text,
only then state that it's not found.

Return your answer in this JSON format:
{{"found": true/false, "answer": "your detailed answer", "citation": "document name, page X"}}"""

        try:
            response = await asyncio.wait_for(
                self.llm.generate(prompt, max_tokens=300, temperature=0.2),
                timeout=25
            )
        except Exception as e:
            return {"found": False, "answer": f"(LLM reading unavailable: {e})", "citation": ""}

        reply = response if isinstance(response, str) else str(response)
        parsed = self._parse_answer(reply)
        if parsed is not None:
            return parsed
        # The model answered, just not as JSON: surface its text instead of
        # reporting the LLM as unavailable.
        return {"found": False, "answer": reply[:500], "citation": ""}

    @staticmethod
    def _parse_answer(reply: str) -> Optional[Dict[str, Any]]:
        """Extract the outermost ``{...}`` JSON object from ``reply``.

        Returns ``None`` when no JSON object can be parsed.  Otherwise the
        parsed dict is returned with ``found`` coerced to a bool and with
        ``answer`` / ``citation`` defaulted to empty strings if absent.
        """
        import json

        start = reply.find("{")
        end = reply.rfind("}") + 1
        if start < 0 or end <= start:
            return None
        try:
            parsed = json.loads(reply[start:end])
        except ValueError:
            return None
        if not isinstance(parsed, dict):
            return None
        found = parsed.get("found", False)
        if isinstance(found, str):
            # A quoted "false" would otherwise be truthy.
            found = found.strip().lower() == "true"
        parsed["found"] = bool(found)
        parsed.setdefault("answer", "")
        parsed.setdefault("citation", "")
        return parsed
@@ -0,0 +1,93 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ import numpy as np
4
+ from collections import defaultdict
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+
10
+ @dataclass
11
+ class AnomalyFlag:
12
+ entity_id: str
13
+ flag_type: str
14
+ severity: float
15
+ evidence: Dict[str, Any]
16
+ timestamp: datetime
17
+ description: str
18
+ recommendations: List[str] = field(default_factory=list)
19
+
20
+
21
+ class StatAnomalyDetector:
22
+ """Multi-method anomaly detector: entity mapping, temporal spikes, geospatial, value outliers."""
23
+
24
+ def __init__(self) -> None:
25
+ self.entity_hash: Dict[str, Dict[str, int]] = defaultdict(dict)
26
+ self.temporal_hash: Dict[str, List[Tuple[datetime, float]]] = defaultdict(list)
27
+ self.geo_hash: Dict[str, List[str]] = defaultdict(list)
28
+ self.value_tracker: Dict[str, List[float]] = defaultdict(list)
29
+ self.flags: List[AnomalyFlag] = []
30
+
31
+ def ingest(self, entity_id: str, counterparty_id: Optional[str] = None,
32
+ value: Optional[float] = None, timestamp: Optional[datetime] = None,
33
+ jurisdiction: Optional[str] = None, metadata: Optional[Dict] = None) -> None:
34
+ if counterparty_id:
35
+ self.entity_hash[entity_id][counterparty_id] = self.entity_hash[entity_id].get(counterparty_id, 0) + 1
36
+ if timestamp is not None and value is not None:
37
+ self.temporal_hash[entity_id].append((timestamp, value))
38
+ if jurisdiction:
39
+ self.geo_hash[entity_id].append(jurisdiction)
40
+ if value is not None:
41
+ self.value_tracker[entity_id].append(value)
42
+
43
+ def ingest_batch(self, events: List[Dict[str, Any]]) -> int:
44
+ for evt in events:
45
+ self.ingest(**{k: evt.get(k) for k in ("entity_id", "counterparty_id", "value", "timestamp", "jurisdiction")})
46
+ return len(events)
47
+
48
+ def detect_multi_entity_anomalies(self, max_relationships: int = 50) -> List[AnomalyFlag]:
49
+ flags = []
50
+ for eid, counterparties in self.entity_hash.items():
51
+ n = len(counterparties)
52
+ if n > max_relationships:
53
+ flags.append(AnomalyFlag(entity_id=eid, flag_type="multi_entity",
54
+ severity=min(1.0, n / (2 * max_relationships)),
55
+ evidence={"n_counterparties": n, "counterparties": dict(counterparties)},
56
+ timestamp=datetime.now(),
57
+ description=f"Entity has {n} counterparties (threshold: {max_relationships})"))
58
+ return sorted(flags, key=lambda f: f.severity, reverse=True)
59
+
60
+ def detect_temporal_spikes(self, spike_threshold_sigma: float = 3.0,
61
+ window_hours: int = 1) -> List[AnomalyFlag]:
62
+ flags = []
63
+ for eid, events in self.temporal_hash.items():
64
+ if len(events) < 10:
65
+ continue
66
+ hourly = defaultdict(list)
67
+ for ts, amount in events:
68
+ hour_key = ts.replace(minute=0, second=0, microsecond=0)
69
+ hourly[hour_key].append(amount)
70
+ counts = [len(v) for v in hourly.values()]
71
+ mean_c, std_c = float(np.mean(counts)), float(np.std(counts))
72
+ if std_c == 0:
73
+ continue
74
+ for hour, amounts in hourly.items():
75
+ z = (len(amounts) - mean_c) / std_c
76
+ if z > spike_threshold_sigma:
77
+ flags.append(AnomalyFlag(entity_id=eid, flag_type="temporal_spike",
78
+ severity=min(1.0, z / (2 * spike_threshold_sigma)),
79
+ evidence={"hour": str(hour), "count": len(amounts), "total_amount": sum(amounts), "z_score": float(z)},
80
+ timestamp=hour,
81
+ description=f"{len(amounts)} events at {hour} ({z:.1f} above mean)"))
82
+ return sorted(flags, key=lambda f: f.severity, reverse=True)
83
+
84
+ def detect_all(self, trace=None) -> List[AnomalyFlag]:
85
+ self.flags = []
86
+ self.flags.extend(self.detect_multi_entity_anomalies())
87
+ self.flags.extend(self.detect_temporal_spikes())
88
+ self.flags.sort(key=lambda f: f.severity, reverse=True)
89
+ if trace:
90
+ for flag in self.flags[:5]:
91
+ trace.event(type("TE", (), {"value": "anomaly.flagged"})(), agent_name="StatAnomaly",
92
+ message=flag.description, data={"severity": flag.severity, "type": flag.flag_type})
93
+ return self.flags
@@ -0,0 +1,123 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
2
+
3
+ Adds embedding-based retrieval alongside the hash index for multi-source fusion.
4
+ Uses all-MiniLM-L6-v2 (80MB model, 384-dim vectors) — runs locally, no API calls.
5
+ """
6
+
7
+ import numpy as np
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ try:
11
+ import faiss
12
+ from sentence_transformers import SentenceTransformer
13
+ HAS_FAISS = True
14
+ except ImportError:
15
+ HAS_FAISS = False
16
+
17
+
18
class VectorIndex:
    """Semantic document retrieval using FAISS in-memory vector index.

    Complements the hash index by finding semantically similar content
    that doesn't share exact words with the query.

    Parameters
    ----------
    model_name : str
        HuggingFace sentence-transformers model name.
    encode_batch_size : int
        Batch size for model.encode(). Larger values (64–128) encode
        many chunks faster on CPU; reduce to 8–16 if GPU OOM occurs.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2",
                 encode_batch_size: int = 64) -> None:
        if not HAS_FAISS:
            raise ImportError("faiss-cpu and sentence-transformers required: pip install faiss-cpu sentence-transformers")

        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()  # 384
        self.encode_batch_size = encode_batch_size
        self.index = faiss.IndexFlatIP(self.dimension)  # Inner product (cosine with normalized vectors)
        self.chunks: List[Dict[str, Any]] = []  # {"text": ..., "source": ..., "page": ...}
        self._chunk_vectors: Optional[np.ndarray] = None

    def index_text(self, text: str, source: str = "unknown",
                   chunk_size: int = 300, chunk_overlap: int = 50) -> int:
        """Split text into overlapping chunks and index as vectors. Returns chunk count.

        ``chunk_size`` and ``chunk_overlap`` are measured in whitespace-separated
        words.  Every chunk is recorded with ``page`` set to 1.
        """
        chunks = [{"text": piece, "source": source, "page": 1}
                  for piece in self._chunk_words(text, chunk_size, chunk_overlap)]
        if not chunks:
            return 0

        # Encode in batches for throughput — sentence-transformers uses an
        # internal batch loop; a larger batch_size reduces Python→C round-trips.
        vectors = self.model.encode(
            [c["text"] for c in chunks],
            batch_size=self.encode_batch_size,
            show_progress_bar=False,
        )
        # Convert first: FAISS normalises in place and expects float32 data.
        vectors = np.ascontiguousarray(vectors, dtype=np.float32)
        # Normalize for cosine similarity (inner product on unit vectors = cosine)
        faiss.normalize_L2(vectors)
        self.index.add(vectors)
        self.chunks.extend(chunks)
        return len(chunks)

    @staticmethod
    def _chunk_words(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
        """Split ``text`` into word windows of ``chunk_size`` sharing ``chunk_overlap`` words.

        The stride is at least one word, so an overlap that is not smaller
        than the chunk size cannot stall the loop, and splitting stops at the
        window that reaches the last word so no chunk is a mere tail of the
        previous one.
        """
        words = text.split()
        size = max(1, int(chunk_size))
        stride = max(1, size - max(0, int(chunk_overlap)))
        pieces = []
        start = 0
        while start < len(words):
            end = min(start + size, len(words))
            pieces.append(" ".join(words[start:end]))
            if end == len(words):
                break
            start += stride
        return pieces

    def search(self, query: str, top_k: int = 10,
               source_filter: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Search for semantically similar chunks. Returns scored results.

        Parameters
        ----------
        source_filter : Optional[List[str]]
            If provided, only return chunks from these source documents
            (case-insensitive basename matching).
        """
        if top_k <= 0 or self.index.ntotal == 0:
            return []

        allowed = self._normalise_sources(source_filter)
        query_vec = np.ascontiguousarray(
            self.model.encode([query], show_progress_bar=False), dtype=np.float32)
        faiss.normalize_L2(query_vec)

        total = int(self.index.ntotal)
        # Fetch more candidates than needed so filtering doesn't starve results
        fetch_k = min(top_k * 3 if allowed else top_k, total)
        while True:
            scores, indices = self.index.search(query_vec, fetch_k)
            results = self._collect(scores[0], indices[0], allowed, top_k)
            if len(results) >= top_k or fetch_k >= total or not allowed:
                return results
            # Too many hits were filtered out: widen the search and retry.
            fetch_k = min(fetch_k * 2, total)

    @staticmethod
    def _normalise_sources(source_filter: Optional[List[str]]) -> set:
        """Return the lower-cased basenames named by ``source_filter`` (empty if none)."""
        import os

        allowed = set()
        for entry in source_filter or []:
            name = os.path.basename(str(entry).lower().strip())
            if name:
                allowed.add(name)
        return allowed

    def _collect(self, scores, indices, allowed: set, top_k: int) -> List[Dict[str, Any]]:
        """Turn one row of FAISS hits into at most ``top_k`` result dicts."""
        import os

        results = []
        for score, idx in zip(scores, indices):
            # Skip ids that do not map to a stored chunk (including negatives).
            if idx < 0 or idx >= len(self.chunks):
                continue
            chunk = self.chunks[idx]
            # ── Document-scope filter ────────────────────────
            if allowed and os.path.basename(chunk.get("source", "").lower().strip()) not in allowed:
                continue
            results.append({
                "text": chunk["text"],
                "source": chunk["source"],
                "page": chunk.get("page", 1),
                "score": round(float(score), 3),  # Cosine similarity, in [-1, 1]
                "match_type": "semantic",
            })
            if len(results) >= top_k:
                break
        return results

    def get_stats(self) -> Dict[str, Any]:
        """Return the vector count, chunk count and embedding dimension."""
        return {
            "total_vectors": int(self.index.ntotal),
            "total_chunks": len(self.chunks),
            "dimension": self.dimension,
        }