mcp-agentic-pipelines 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/.env.example +93 -0
  2. package/README.md +258 -0
  3. package/package.json +70 -0
  4. package/packages/clinical/package.json +22 -0
  5. package/packages/clinical/src/index.ts +262 -0
  6. package/packages/clinical/tsconfig.json +13 -0
  7. package/packages/core/package.json +21 -0
  8. package/packages/core/src/config.ts +138 -0
  9. package/packages/core/src/errors.ts +100 -0
  10. package/packages/core/src/index.ts +104 -0
  11. package/packages/core/src/llm-config.ts +213 -0
  12. package/packages/core/src/logging.ts +66 -0
  13. package/packages/core/src/python-bridge.ts +384 -0
  14. package/packages/core/src/rate-limiter.ts +136 -0
  15. package/packages/core/src/types.ts +203 -0
  16. package/packages/core/src/validation.ts +101 -0
  17. package/packages/core/tsconfig.json +10 -0
  18. package/packages/deeppipe/package.json +21 -0
  19. package/packages/deeppipe/src/index.ts +424 -0
  20. package/packages/deeppipe/tsconfig.json +13 -0
  21. package/packages/piste/package.json +20 -0
  22. package/packages/piste/src/index.ts +48 -0
  23. package/packages/piste/tsconfig.json +13 -0
  24. package/packages/precis/package.json +20 -0
  25. package/packages/precis/src/index.ts +67 -0
  26. package/packages/precis/tsconfig.json +13 -0
  27. package/packages/server/package.json +31 -0
  28. package/packages/server/src/index.ts +427 -0
  29. package/packages/server/tsconfig.json +17 -0
  30. package/setup.mjs +141 -0
  31. package/test.mjs +337 -0
  32. package/vendors/clinical-intake/pipeline.mjs +349 -0
  33. package/vendors/clinical-intake/questions/en.txt +9 -0
  34. package/vendors/clinical-intake/questions/fr.txt +9 -0
  35. package/vendors/piste/.env.example +73 -0
  36. package/vendors/piste/app/core/__init__.py +4 -0
  37. package/vendors/piste/app/core/config.py +83 -0
  38. package/vendors/piste/app/core/debuglog.py +16 -0
  39. package/vendors/piste/app/core/middleware.py +40 -0
  40. package/vendors/piste/bridge_piste.py +301 -0
  41. package/vendors/piste/pipeline/__init__.py +4 -0
  42. package/vendors/piste/pipeline/compiler.py +68 -0
  43. package/vendors/piste/pipeline/offline/__init__.py +28 -0
  44. package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
  45. package/vendors/piste/pipeline/replay.py +15 -0
  46. package/vendors/piste/pipeline/replay_engine.py +249 -0
  47. package/vendors/piste/pipeline/signatures/__init__.py +4 -0
  48. package/vendors/piste/pipeline/signatures/signatures.py +136 -0
  49. package/vendors/piste/pipeline/stage1/__init__.py +21 -0
  50. package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
  51. package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
  52. package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
  53. package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
  54. package/vendors/piste/pipeline/stage2/__init__.py +34 -0
  55. package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
  56. package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
  57. package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
  58. package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
  59. package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
  60. package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
  61. package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
  62. package/vendors/piste/pipeline/stage3/__init__.py +20 -0
  63. package/vendors/piste/pipeline/stage3/classifier.py +79 -0
  64. package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
  65. package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
  66. package/vendors/piste/pipeline/stage4/__init__.py +33 -0
  67. package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
  68. package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
  69. package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
  70. package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
  71. package/vendors/piste/requirements.txt +53 -0
  72. package/vendors/precis/backend/__init__.py +6 -0
  73. package/vendors/precis/backend/agents/__init__.py +3 -0
  74. package/vendors/precis/backend/agents/data_synthesis.py +105 -0
  75. package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
  76. package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
  77. package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
  78. package/vendors/precis/backend/agents/guardrail.py +175 -0
  79. package/vendors/precis/backend/agents/query_expander.py +89 -0
  80. package/vendors/precis/backend/agents/radial_interpol.py +99 -0
  81. package/vendors/precis/backend/agents/report_generator.py +92 -0
  82. package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
  83. package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
  84. package/vendors/precis/backend/agents/vector_index.py +123 -0
  85. package/vendors/precis/backend/agents/veri_score.py +341 -0
  86. package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
  87. package/vendors/precis/backend/api/__init__.py +3 -0
  88. package/vendors/precis/backend/api/routes/__init__.py +3 -0
  89. package/vendors/precis/backend/config.py +88 -0
  90. package/vendors/precis/backend/core/__init__.py +13 -0
  91. package/vendors/precis/backend/core/hashing.py +22 -0
  92. package/vendors/precis/backend/core/metrics.py +77 -0
  93. package/vendors/precis/backend/core/multitoken.py +166 -0
  94. package/vendors/precis/backend/core/pmi.py +54 -0
  95. package/vendors/precis/backend/core/stemming.py +74 -0
  96. package/vendors/precis/backend/core/tracing.py +150 -0
  97. package/vendors/precis/backend/data/__init__.py +3 -0
  98. package/vendors/precis/backend/data/chunker.py +57 -0
  99. package/vendors/precis/backend/data/pdf_parser.py +42 -0
  100. package/vendors/precis/backend/db/__init__.py +3 -0
  101. package/vendors/precis/backend/db/models.py +173 -0
  102. package/vendors/precis/backend/db/repository.py +269 -0
  103. package/vendors/precis/backend/llm/__init__.py +3 -0
  104. package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
  105. package/vendors/precis/backend/llm/base.py +147 -0
  106. package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
  107. package/vendors/precis/backend/llm/factory.py +60 -0
  108. package/vendors/precis/backend/llm/google_provider.py +39 -0
  109. package/vendors/precis/backend/llm/ollama_provider.py +54 -0
  110. package/vendors/precis/backend/llm/openai_provider.py +50 -0
  111. package/vendors/precis/backend/main.py +677 -0
  112. package/vendors/precis/backend/orchestrator/__init__.py +3 -0
  113. package/vendors/precis/backend/orchestrator/planner.py +81 -0
  114. package/vendors/precis/backend/orchestrator/router.py +319 -0
  115. package/vendors/precis/backend/orchestrator/types.py +58 -0
  116. package/vendors/precis/bridge_precis.py +185 -0
  117. package/vendors/precis/data/sample_reports/README.md +8 -0
  118. package/vendors/precis/data/seed_data.py +115 -0
  119. package/vendors/precis/requirements.txt +19 -0
@@ -0,0 +1,81 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ import json
4
+ from typing import Dict, List, Optional
5
+
6
+ from backend.orchestrator.types import ExecutionPlan, SubTask, TaskType
7
+ from backend.llm.base import LLMProvider
8
+ from backend.core.tracing import TraceCollector, TraceEventType
9
+
10
+
11
class PlannerAgent:
    """Decomposes natural language queries into structured execution plans via LLM.

    The LLM is asked to answer with a JSON plan. If the call fails, the reply
    cannot be parsed, or the parsed plan contains no subtasks, a fixed
    two-step fallback plan is returned instead.
    """

    def __init__(self, llm: LLMProvider) -> None:
        self.llm = llm

    async def plan(self, query: str, available_tools: Optional[List[Dict[str, str]]] = None,
                   conversation_history: Optional[List[Dict[str, str]]] = None,
                   trace: Optional[TraceCollector] = None) -> ExecutionPlan:
        """Build an execution plan for ``query``.

        Args:
            query: The user's question.
            available_tools: Optional dicts with ``name`` and ``description``
                keys, listed in the system prompt.
            conversation_history: Optional earlier turns; each needs a
                ``query`` key and may carry ``response_summary``. Only the
                last five turns are included in the prompt.
            trace: Optional collector that receives a span and a
                ``PLAN_CREATED`` event.

        Returns:
            The LLM-derived plan, or the fallback plan. When the fallback is
            used, the cause is recorded under ``metadata["planner_error"]``.
        """
        if trace:
            trace.span_start("Planner", "plan")

        try:
            # Prompt construction lives inside the try so that a malformed
            # tool or history entry degrades to the fallback plan instead of
            # raising with the trace span still open.
            system_prompt = self._build_system_prompt(available_tools)
            user_prompt = self._build_user_prompt(query, conversation_history)
            response = await self.llm.generate(user_prompt, system_prompt=system_prompt,
                                               temperature=0.0, max_tokens=300)
            plan = self._parse_response(response, query)
            if not plan.subtasks:
                # An empty plan would make the router execute nothing at all.
                raise ValueError("LLM response contained no usable subtasks")
        except Exception as exc:
            plan = self._fallback_plan(query)
            plan.metadata["planner_error"] = f"{type(exc).__name__}: {exc}"

        if trace:
            try:
                trace.event(TraceEventType.PLAN_CREATED, agent_name="Planner",
                            message=f"Created plan with {len(plan.subtasks)} subtasks",
                            data={"subtask_count": len(plan.subtasks), "reasoning": plan.reasoning})
            finally:
                trace.span_end()
        return plan

    @staticmethod
    def _build_system_prompt(available_tools: Optional[List[Dict[str, str]]]) -> str:
        """Return the system prompt, listing ``available_tools`` when given."""
        tool_desc = ""
        if available_tools:
            tool_desc = "\n".join(f"- {t['name']}: {t['description']}" for t in available_tools)

        return (
            "You are a query planning agent. Decompose the user's question into specific subtasks. "
            "Available specialized agents:\n"
            f"{tool_desc}\n\n"
            "Return ONLY valid JSON:\n"
            '{"subtasks": [{"id": "1", "type": "factual_retrieval|data_synthesis", '
            '"query": "specific sub-query", "priority": 1, "depends_on": []}], "reasoning": "why this plan"}\n'
            'Use "factual_retrieval" to search documents for facts. '
            'Use "data_synthesis" to combine multiple results into an answer.'
        )

    @staticmethod
    def _build_user_prompt(query: str,
                           conversation_history: Optional[List[Dict[str, str]]]) -> str:
        """Return the user prompt, prefixed with up to five history turns."""
        user_prompt = f"Query: {query}\n\nPlan this query into subtasks."
        if conversation_history:
            history_text = "\n".join(f"Q: {t['query']}\nA: {t.get('response_summary', '')}"
                                     for t in conversation_history[-5:])
            user_prompt = f"Conversation history:\n{history_text}\n\n{user_prompt}"
        return user_prompt

    def _parse_response(self, response: str, query: str) -> ExecutionPlan:
        """Convert the LLM's reply into an ``ExecutionPlan``.

        The reply is parsed as JSON; if that fails, the text between the
        first ``{`` and the last ``}`` is parsed instead. Entries that are
        not JSON objects are skipped, and an unknown or missing ``type``
        maps to ``FACTUAL_RETRIEVAL``.

        Raises:
            json.JSONDecodeError: If the brace-delimited excerpt is not valid
                JSON either (``plan`` turns this into the fallback plan).
        """
        try:
            data = json.loads(response)
        except json.JSONDecodeError:
            start = response.find("{")
            end = response.rfind("}") + 1
            data = json.loads(response[start:end]) if start >= 0 and end > start else {}
        if not isinstance(data, dict):
            data = {}

        raw_subtasks = data.get("subtasks", [])
        if not isinstance(raw_subtasks, list):
            raw_subtasks = []

        subtasks = []
        for s in raw_subtasks:
            if not isinstance(s, dict):
                continue
            try:
                ttype = TaskType(s.get("type"))
            except ValueError:
                ttype = TaskType.FACTUAL_RETRIEVAL

            depends_on = s.get("depends_on") or []
            if isinstance(depends_on, (str, int)):
                depends_on = [depends_on]
            elif not isinstance(depends_on, list):
                depends_on = []

            # Ids are normalised to strings on both sides: an LLM that emits
            # "id": 1 alongside "depends_on": ["1"] would otherwise never
            # have its dependency matched.
            subtasks.append(SubTask(id=str(s.get("id", len(subtasks))), type=ttype,
                                    query=s.get("query", query), priority=s.get("priority", 1),
                                    depends_on=[str(d) for d in depends_on]))
        return ExecutionPlan(original_query=query, subtasks=subtasks,
                             reasoning=data.get("reasoning", "LLM-generated plan"))

    def _fallback_plan(self, query: str) -> ExecutionPlan:
        """Return the fixed plan: retrieve facts for ``query``, then reason over them."""
        return ExecutionPlan(original_query=query, subtasks=[
            SubTask(id="1", type=TaskType.FACTUAL_RETRIEVAL, query=query),
            SubTask(id="2", type=TaskType.CREATIVE_REASONING, query=query, depends_on=["1"]),
        ], reasoning="Fallback plan: retrieve then reason")
@@ -0,0 +1,319 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
import asyncio
import logging
import os
import re
import time
from dataclasses import dataclass
from typing import Dict, Iterator, List, Optional, Type

from backend.orchestrator.types import AgentResult, SubTask, TaskType
from backend.llm.base import LLMProvider
from backend.core.tracing import TraceCollector, TraceEventType
11
+
12
+
13
@dataclass
class AgentRegistryEntry:
    """One registry record tying a task type to the agent that serves it."""

    # Agent name; used as ``agent_name`` in results and trace events.
    name: str
    # Task type this entry is registered under (the registry key).
    task_type: TaskType
    # Class instantiated with no arguments when no singleton is supplied.
    agent_class: Type
    # Human-readable description returned by ``AgentRegistry.list_tools``.
    description: str
    # When True the router does not run the agent and returns a placeholder.
    is_external_llm: bool = False
    # Pre-built instance to reuse (e.g., seeded index).
    singleton_instance: object = None
21
+
22
+
23
class AgentRegistry:
    """Registry mapping TaskType → specialized agent class.

    At most one entry is kept per task type; registering a second entry for
    the same type replaces the first.
    """

    def __init__(self) -> None:
        self._registry: Dict[TaskType, AgentRegistryEntry] = dict()

    def register(self, entry: AgentRegistryEntry) -> None:
        """Store ``entry`` under its own ``task_type``."""
        key = entry.task_type
        self._registry[key] = entry

    def get(self, task_type: TaskType) -> Optional[AgentRegistryEntry]:
        """Return the entry registered for ``task_type``, or ``None``."""
        return self._registry.get(task_type)

    def list_tools(self) -> List[Dict[str, str]]:
        """Return a name/description dict for every entry, in registration order."""
        tools: List[Dict[str, str]] = []
        for registered in self._registry.values():
            tools.append({"name": registered.name, "description": registered.description})
        return tools
37
+
38
+
39
class RouterAgent:
    """Routes SubTasks to agents. Handles parallel execution with dependency ordering.

    ``execute_subtask`` looks up the registry entry for a subtask's type and
    dispatches on the methods the agent exposes (``hybrid_search``,
    ``predict``, ``detect_all``). ``execute_plan`` runs subtasks in batches as
    soon as every id in their ``depends_on`` list has a result.
    """

    # Diagnostics go through ``logging`` instead of ``print``: bridge_precis.py
    # uses stdout as its JSON response channel, so free-form text written
    # there by in-process code would end up interleaved with protocol lines.
    _log = logging.getLogger(__name__)

    _HASH_ITEM_LIMIT = 40     # hash results converted into result items
    _VECTOR_TOP_K = 10        # hits requested from the vector index
    _RESULT_LIMIT = 15        # cap after fusion and after de-duplication
    _DEDUP_KEY_CHARS = 120    # leading chars of "surrounding" used as the dedup key
    _MIN_SCORE = 0.10         # minimum score kept by the quality filter
    _CITATION_LIMIT = 20      # hash results reported as citations
    _WINDOW_CANDIDATES = 5    # top hash results tried as section-window anchors
    _WINDOW_LINES = 300       # lines taken from an anchor position
    _WINDOW_MIN_CHARS = 200   # shorter windows are discarded

    # Semantic re-ranking: DISABLED — DeepSeek scores every n-gram
    # fragment as 0, adding ~3s latency with zero ranking benefit.
    # Re-enable when using a stronger LLM (GPT-4, Claude) that can
    # actually judge fragment relevance.
    _ENABLE_SEMANTIC_RERANKER = False

    # ── Section-number helpers ─────────────────────────────────────

    @staticmethod
    def _next_section_candidates(tokens: tuple) -> Iterator[str]:
        """Yield likely successors of the first ``major.minor`` token.

        For a section-number token like '3.4' this yields '3.5', then '4'.
        Only the first matching token is considered; nothing is yielded when
        no token has the ``<digits>.<digits>`` form.
        """
        for token in tokens:
            match = re.match(r'^(\d+)\.(\d+)$', token)
            if match:
                major, minor = int(match.group(1)), int(match.group(2))
                yield f"{major}.{minor + 1}"   # 3.4 → 3.5
                yield str(major + 1)            # 3.4 → 4
                break

    @staticmethod
    def _adhoc_event_type(value: str) -> object:
        """Return a throwaway object whose only attribute is ``value``.

        Used for trace events whose type string is not referenced through
        ``TraceEventType`` in this module.
        """
        return type("TE", (), {"value": value})()

    @staticmethod
    def _stem_query(stemmer, text: str) -> tuple:
        """Lower-case ``text``, drop one-character words and stem the rest."""
        words = [w for w in text.lower().split() if len(w) > 1]
        return tuple(stemmer.stem_tokens(words))

    def __init__(self, registry: AgentRegistry, llm: Optional[LLMProvider] = None) -> None:
        self.registry = registry
        self.llm = llm

    async def execute_subtask(self, subtask: SubTask,
                              source_filter: Optional[List[str]] = None,
                              search_mode: str = "standard",
                              trace: Optional[TraceCollector] = None) -> AgentResult:
        """Run one subtask on the agent registered for its type.

        Args:
            subtask: The subtask to execute.
            source_filter: Optional source names passed to the searches.
            search_mode: ``"fast"`` skips the vector search; ``"thorough"``
                enables query expansion on zero hits and section windows;
                any other value runs hash + vector search only.
            trace: Optional collector for spans and events.

        Returns:
            An ``AgentResult``. Exceptions raised while running the agent are
            reported as ``success=False`` with the message in
            ``error_message`` rather than propagated.
        """
        entry = self.registry.get(subtask.type)
        if entry is None:
            return AgentResult(subtask_id=subtask.id, agent_name="router",
                               success=False, error_message=f"No agent for {subtask.type.value}")

        # Data synthesis is handled by main.py with LLM — return a placeholder
        if entry.is_external_llm:
            return AgentResult(subtask_id=subtask.id, agent_name=entry.name,
                               success=True, data={"synthesis": "(pending LLM synthesis)"})

        # perf_counter is monotonic, so the measured duration cannot go
        # negative or jump when the wall clock is adjusted.
        start = time.perf_counter()
        if trace:
            trace.span_start(entry.name, "execute")
            trace.event(TraceEventType.AGENT_STARTED, agent_name=entry.name,
                        message=f"Starting: {subtask.query[:80]}...")

        try:
            # Use singleton instance if provided, otherwise create new
            agent = entry.singleton_instance
            if agent is None:
                agent = entry.agent_class()

            # Dispatch on capability only. The agent's truthiness is
            # deliberately not consulted, so an agent that defines __len__
            # and happens to be empty is still executed.
            if hasattr(agent, "hybrid_search"):
                result = await self._run_hybrid_search(agent, entry, subtask,
                                                       source_filter, search_mode, trace)
            elif hasattr(agent, "predict"):
                import numpy as np
                pred, contribs = agent.predict(np.array([0.5]), trace=trace)
                result = AgentResult(subtask_id=subtask.id, agent_name=entry.name, success=True,
                                     data={"prediction": float(pred), "contributing_nodes": contribs[:5]})
            elif hasattr(agent, "detect_all"):
                flags = agent.detect_all(trace=trace)
                result = AgentResult(subtask_id=subtask.id, agent_name=entry.name, success=True,
                                     data={"flags": [{"entity": f.entity_id, "type": f.flag_type,
                                                      "severity": f.severity} for f in flags]})
            else:
                result = AgentResult(subtask_id=subtask.id, agent_name=entry.name, success=True,
                                     data={"response": "Agent executed successfully"})
        except Exception as e:
            result = AgentResult(subtask_id=subtask.id, agent_name=entry.name,
                                 success=False, error_message=str(e))

        result.execution_time_ms = (time.perf_counter() - start) * 1000
        if trace:
            trace.event(TraceEventType.AGENT_COMPLETED, agent_name=entry.name,
                        message="Completed" if result.success else f"Failed: {result.error_message}",
                        data={"success": result.success, "duration_ms": result.execution_time_ms})
            trace.span_end()
        return result

    async def _run_hybrid_search(self, agent, entry: AgentRegistryEntry, subtask: SubTask,
                                 source_filter: Optional[List[str]], search_mode: str,
                                 trace: Optional[TraceCollector]) -> AgentResult:
        """Hash search, optional vector fusion, dedup, filtering and section windows."""
        # Stem query tokens to match the stemmed index
        from backend.core.stemming import PrecisStemmer
        stemmer = PrecisStemmer()
        stemmed_tokens = self._stem_query(stemmer, subtask.query)
        results = agent.hybrid_search(stemmed_tokens, source_filter=source_filter, trace=trace)
        self._log.debug("[Precis] Router hash search: source_filter=%r results=%d",
                        source_filter, len(results))

        # Auto-retry with query expansion when 0 results (Thorough mode only)
        if not results and self.llm and search_mode == "thorough":
            results = await self._retry_with_expansion(agent, stemmer, subtask, stemmed_tokens,
                                                       source_filter, trace, results)

        # Run hash build + vector search concurrently (skip vector in Fast mode)
        hash_task = asyncio.to_thread(self._build_hash_items, agent, results)
        vec_task = None
        if search_mode != "fast":
            try:
                import backend.main as _main
                if _main._vector_index:
                    vec_task = asyncio.to_thread(_main._vector_index.search, subtask.query,
                                                 self._VECTOR_TOP_K, source_filter)
            except Exception:
                # Best-effort: fall back to hash-only results.
                self._log.warning("[Precis] Vector index unavailable; using hash results only",
                                  exc_info=True)

        if vec_task:
            raw_items, vec_results = await asyncio.gather(hash_task, vec_task)
        else:
            raw_items = await hash_task
            vec_results = []
        self._log.debug("[Precis] After gather: raw_items=%d vec_results=%d",
                        len(raw_items), len(vec_results))

        # Fuse hash + vector results
        if vec_results:
            raw_items = self._fuse(raw_items, vec_results, trace)

        deduped = self._dedupe(raw_items)
        self._log.debug("[Precis] After dedup: deduped=%d first_source=%s", len(deduped),
                        deduped[0].get('source', '') if deduped else 'EMPTY')

        if self._ENABLE_SEMANTIC_RERANKER and self.llm and len(deduped) > 3:
            deduped = await self._rerank(subtask, deduped, trace)

        # Quality filter: keep results above minimum hash-score threshold.
        # (SemanticReRanker is disabled, so we use the original hash scores.)
        deduped = [d for d in deduped if d.get("score", 0) >= self._MIN_SCORE]
        self._log.debug("[Precis] After MIN_SCORE: deduped=%d", len(deduped))

        if search_mode == "thorough":
            self._append_section_windows(agent, results, source_filter, deduped, trace)

        return AgentResult(subtask_id=subtask.id, agent_name=entry.name, success=True,
                           data={"results": deduped},
                           citations=[{"source_doc": r.multitoken.source_doc,
                                       "source_page": r.multitoken.source_page}
                                      for r in results[:self._CITATION_LIMIT]])

    async def _retry_with_expansion(self, agent, stemmer, subtask: SubTask, stemmed_tokens: tuple,
                                    source_filter: Optional[List[str]],
                                    trace: Optional[TraceCollector], results):
        """Search again with LLM-expanded queries; return the first non-empty hit list.

        Returns ``results`` unchanged when no expansion matches or expansion fails.
        """
        try:
            from backend.agents.query_expander import QueryExpander
            expander = QueryExpander(self.llm)
            expanded_queries = await expander.expand(subtask.query, list(stemmed_tokens))
            for eq in expanded_queries:
                eq_stemmed = self._stem_query(stemmer, eq)
                retry_results = agent.hybrid_search(eq_stemmed, source_filter=source_filter, trace=trace)
                if retry_results:
                    if trace:
                        trace.event(
                            self._adhoc_event_type("decision.search_type"),
                            agent_name="QueryExpander",
                            message=f"Expanded '{subtask.query[:40]}...' → '{eq[:60]}...' → {len(retry_results)} results",
                            data={"original_query": subtask.query, "expanded_query": eq, "results": len(retry_results)}
                        )
                    return retry_results
        except Exception as ex:
            self._log.warning("[Precis] Query expansion failed: %s", ex)
            if trace:
                trace.event(
                    self._adhoc_event_type("agent.failed"),
                    agent_name="QueryExpander",
                    message=f"Expansion failed: {ex}"
                )
        return results

    @classmethod
    def _build_hash_items(cls, agent, results) -> List[dict]:
        """Turn the leading hash results into result dicts (CPU work, run in a thread)."""
        items = []
        for r in results[:cls._HASH_ITEM_LIMIT]:
            text = " ".join(r.multitoken.metadata.get("original_words", r.multitoken.tokens))
            ctx = agent.get_context(r.multitoken.source_doc, r.multitoken.source_page,
                                    r.multitoken.source_position)
            items.append({
                "text": text, "surrounding": ctx["surrounding"],
                "sentence": ctx["sentence"], "page": ctx["page"],
                "source": ctx["file"], "score": round(r.relevance_score, 3),
                "match_type": r.match_type,
            })
        return items

    def _fuse(self, raw_items: List[dict], vec_results, trace: Optional[TraceCollector]) -> List[dict]:
        """Merge hash and vector hits; return ``raw_items`` unchanged if fusion fails."""
        try:
            from backend.agents.fusion_ranker import FusionRanker
            fuser = FusionRanker()
            fused = fuser.fuse({"hash": raw_items, "vector": vec_results}, top_k=self._RESULT_LIMIT)
            # Assigned before the trace event so that a failing trace call
            # does not discard the fused list.
            raw_items = [{
                "text": f["text"],
                "surrounding": f.get("surrounding", f["text"]),
                "sentence": f.get("sentence", f["text"][:200]),
                "page": f.get("page", 1),
                "source": f.get("source", ""),
                "score": f.get("score", 0),
                "match_type": f.get("match_type", "fusion"),
            } for f in fused]
            if trace:
                trace.event(self._adhoc_event_type("decision.fusion"), agent_name="FusionRanker",
                            message=f"Fused hash+vector: {len(fused)} results",
                            data={"vector_items": len(vec_results), "fused": len(fused)})
        except Exception:
            # Best-effort: keep whatever raw_items holds at this point.
            self._log.warning("[Precis] Hash/vector fusion failed", exc_info=True)
        return raw_items

    @classmethod
    def _dedupe(cls, raw_items: List[dict]) -> List[dict]:
        """Dedup by surrounding text (same paragraph = same result).

        Keeps the highest-scoring item per key and returns the best
        ``_RESULT_LIMIT`` items, highest score first.
        """
        seen_texts = {}
        for item in raw_items:
            key = item["surrounding"][:cls._DEDUP_KEY_CHARS]
            if key not in seen_texts or item["score"] > seen_texts[key]["score"]:
                seen_texts[key] = item
        return sorted(seen_texts.values(), key=lambda x: -x["score"])[:cls._RESULT_LIMIT]

    async def _rerank(self, subtask: SubTask, deduped: List[dict],
                      trace: Optional[TraceCollector]) -> List[dict]:
        """LLM re-ranking; returns ``deduped`` unchanged when it fails or yields nothing."""
        try:
            from backend.agents.semantic_reranker import SemanticReRanker
            reranker = SemanticReRanker(self.llm)
            reranked = await reranker.rerank(subtask.query, deduped, top_k=5)
            if reranked:
                if trace:
                    trace.event(
                        self._adhoc_event_type("decision.rerank"),
                        agent_name="SemanticReRanker",
                        message=f"Re-ranked {len(deduped)} → {len(reranked)} results",
                        data={"before": len(deduped), "after": len(reranked),
                              "top_score": reranked[0].get("semantic_score", 0)}
                    )
                return reranked
        except Exception:
            # Re-ranking is best-effort; fall back to hash scores
            self._log.warning("[Precis] Semantic re-ranking failed", exc_info=True)
        return deduped

    def _append_section_windows(self, agent, results, source_filter: Optional[List[str]],
                                deduped: List[dict], trace: Optional[TraceCollector]) -> None:
        """Append one long text window per document to ``deduped`` (thorough mode).

        Thorough mode: HASH found the section heading. Both matches are TOC
        entries — "next heading" is always the adjacent TOC line, so boundary
        detection fails. Instead: extract a generous window from the BEST
        match (the one with the most surrounding content). No char cap — the
        LLM is smart enough to identify the section body.

        Errors are logged and otherwise ignored; items appended before the
        error stay in ``deduped``.
        """
        self._log.debug("[Precis] Thorough: generous window from hash position")
        try:
            filter_set = None
            if source_filter:
                filter_set = {os.path.basename(str(f).lower().strip()) for f in source_filter}

            if not hasattr(agent, '_doc_texts'):
                return

            for filename, text in agent._doc_texts.items():
                if filter_set and os.path.basename(filename.lower().strip()) not in filter_set:
                    continue
                lines = text.split("\n")
                best_excerpt = ""
                best_pos = 0

                # NOTE(review): the candidate positions are not restricted to
                # results whose source document is `filename`, so a position
                # found in one document is also applied to every other
                # document — confirm whether that is intended.
                for r in results[:self._WINDOW_CANDIDATES]:
                    pos = r.multitoken.source_position
                    end = min(len(lines), pos + self._WINDOW_LINES)
                    excerpt = "\n".join(lines[pos:end])
                    # Keep the one with the MOST content
                    if len(excerpt) > len(best_excerpt):
                        best_excerpt = excerpt
                        best_pos = pos

                if len(best_excerpt) > self._WINDOW_MIN_CHARS:
                    deduped.append({
                        "text": best_excerpt,
                        "source": filename,
                        "score": 0.7,
                        "match_type": "section_body",
                        "page": best_pos // 40 + 1,
                        "surrounding": best_excerpt,
                        "sentence": best_excerpt[:500],
                    })
                    if trace:
                        trace.event(
                            self._adhoc_event_type("decision.direct_read"),
                            agent_name="SectionExtractor",
                            message=f"Section window: {len(best_excerpt)} chars from line {best_pos}",
                            data={"from": best_pos, "chars": len(best_excerpt)},
                        )
        except Exception:
            self._log.warning("[Precis] Section window extraction failed", exc_info=True)

    async def execute_plan(self, subtasks: List[SubTask], max_parallel: int = 4,
                           source_filter: Optional[List[str]] = None,
                           search_mode: str = "standard",
                           trace: Optional[TraceCollector] = None) -> List[AgentResult]:
        """Execute ``subtasks`` in dependency order, at most ``max_parallel`` at a time.

        A subtask becomes ready once every id in its ``depends_on`` has a
        result, whether that result succeeded or failed. Subtasks that never
        become ready are reported as failed with "Dependency not met".

        Returns:
            One ``AgentResult`` per input subtask, in input order.
        """
        self._log.debug("[Precis] execute_plan: source_filter=%r mode=%s", source_filter, search_mode)
        # A batch size below 1 would produce empty batches and never drain
        # `pending`, so the loop would spin forever.
        max_parallel = max(1, max_parallel)

        results: Dict[str, AgentResult] = {}
        pending = list(subtasks)
        while pending:
            ready = [s for s in pending if all(d in results for d in s.depends_on)]
            if not ready:
                break
            batch = ready[:max_parallel]
            pending = [s for s in pending if s not in batch]
            batch_results = await asyncio.gather(
                *(self.execute_subtask(s, source_filter, search_mode, trace) for s in batch))
            for s, r in zip(batch, batch_results):
                results[s.id] = r
        return [results.get(s.id, AgentResult(subtask_id=s.id, agent_name="router", success=False,
                                              error_message="Dependency not met")) for s in subtasks]
@@ -0,0 +1,58 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+ from enum import Enum
6
+ from typing import Any, Dict, List, Optional
7
+ import uuid
8
+
9
+
10
class TaskType(str, Enum):
    """Kinds of work a subtask can request.

    Members are also ``str`` instances, so they compare equal to their
    string values.
    """

    FACTUAL_RETRIEVAL = "factual_retrieval"
    ANOMALY_DETECTION = "anomaly_detection"
    PREDICTION = "prediction"
    DATA_SYNTHESIS = "data_synthesis"
    CREATIVE_REASONING = "creative_reasoning"
    EVALUATION = "evaluation"
17
+
18
+
19
@dataclass
class SubTask:
    """A single unit of work inside an execution plan."""

    # Identifier other subtasks refer to in their ``depends_on`` lists.
    id: str
    # Kind of work requested.
    type: TaskType
    # Query text for this subtask.
    query: str
    # Free-form extra data; a fresh dict per instance.
    context: Dict[str, Any] = field(default_factory=dict)
    priority: int = 1
    # Ids of subtasks that must have a result first; a fresh list per instance.
    depends_on: List[str] = field(default_factory=list)
27
+
28
+
29
@dataclass
class ExecutionPlan:
    """The subtasks derived from one query, with the reasoning behind them."""

    # First eight hex digits of a random UUID4.
    plan_id: str = field(default_factory=lambda: uuid.uuid4().hex[:8])
    original_query: str = ""
    subtasks: List[SubTask] = field(default_factory=list)
    reasoning: str = ""
    # Timestamp from ``datetime.now()`` (no timezone attached).
    created_at: datetime = field(default_factory=datetime.now)
    metadata: Dict[str, Any] = field(default_factory=dict)
37
+
38
+
39
@dataclass
class AgentResult:
    """Outcome of running one subtask on an agent."""

    # Id of the subtask this result belongs to.
    subtask_id: str = ""
    agent_name: str = ""
    success: bool = True
    # Agent-specific payload; its shape depends on the agent.
    data: Any = None
    # A fresh list per instance.
    citations: List[Dict[str, Any]] = field(default_factory=list)
    # Empty unless the run failed.
    error_message: str = ""
    execution_time_ms: float = 0.0
48
+
49
+
50
@dataclass
class FinalReport:
    """Everything produced for one query: narrative, agent results and citations."""

    query: str = ""
    narrative: str = ""
    # A fresh list per instance.
    agent_results: List[AgentResult] = field(default_factory=list)
    evaluation: Optional[Any] = None
    # A fresh list per instance.
    citations: List[Dict[str, Any]] = field(default_factory=list)
    # Timestamp from ``datetime.now()`` (no timezone attached).
    generated_at: datetime = field(default_factory=datetime.now)
    execution_plan: Optional[ExecutionPlan] = None
@@ -0,0 +1,185 @@
1
+ """
2
+ Precis Bridge — stdin/stdout JSON worker for MCP server.
3
+
4
+ Usage: python bridge_precis.py
5
+
6
+ Reads JSON requests from stdin, processes them using the real Precis backend,
7
+ writes JSON responses to stdout.
8
+
9
+ Protocol:
10
+ Input: {"id": 1, "action": "query", "params": {"query": "...", "search_mode": "standard"}}
11
+ Output: {"id": 1, "result": {...}} or {"id": 1, "error": "message"}
12
+
13
+ Actions:
14
+ - query: Full RAG pipeline
15
+ - list_documents: List indexed documents
16
+ - debug_stem: Show stemmer output for a query
17
+ - debug_search: Direct hybrid search result
18
+ - health: Returns {"status": "ok"}
19
+ """
20
+
21
+ import sys, importlib, json, asyncio, os
22
+
23
# ── Verify dependencies (installed by MCP server on startup) ─────
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so it is imported explicitly before find_spec is used.
import importlib.util

# Maps importable module name → distribution name.
REQUIRED = {'fastapi': 'fastapi', 'uvicorn': 'uvicorn', 'pydantic': 'pydantic',
            'numpy': 'numpy', 'nltk': 'nltk', 'sqlalchemy': 'sqlalchemy',
            'httpx': 'httpx', 'dotenv': 'python-dotenv'}
_missing = [mod for mod in REQUIRED if importlib.util.find_spec(mod) is None]
if _missing:
    # Diagnostics go to stderr; stdout carries the JSON protocol.
    sys.stderr.write(f'[precis] FATAL: missing packages: {", ".join(_missing)}. '
                     f'The MCP server should have installed them.\n')
    sys.stderr.flush()
    sys.exit(1)

# Ensure the directory containing this script is on sys.path so that the
# `backend` package next to it can be imported.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Suppress excessive logging (an already-set PRECIS_LOG_LEVEL is left alone)
os.environ.setdefault('PRECIS_LOG_LEVEL', 'WARNING')

# ── Global state (initialized lazily) ──────────────────────────────────
_app = None
_loop = None
43
+
44
def get_loop():
    """Return the bridge's shared event loop, creating one when needed.

    A new loop is created, and installed as the current event loop, when
    none exists yet or the previous one has been closed.
    """
    global _loop
    usable = _loop is not None and not _loop.is_closed()
    if not usable:
        _loop = asyncio.new_event_loop()
        asyncio.set_event_loop(_loop)
    return _loop
50
+
51
def init_app():
    """Initialize the Precis FastAPI app and its lifespan (DB, indexes, LLM).

    The lifespan is entered once and intentionally left open for the life of
    the worker process, so whatever startup created stays available to later
    requests. The app is cached only after startup has succeeded, so a failed
    startup is retried on the next call instead of being silently skipped.

    Returns:
        The FastAPI ``app`` object from ``backend.main``.

    Raises:
        Exception: Whatever the import or the lifespan startup raises.
    """
    global _app
    if _app is not None:
        return _app

    from backend.main import app, lifespan

    async def startup():
        # `lifespan` is defined in backend.main and not visible here, so both
        # plausible shapes are handled: an async context manager (entered,
        # not exited) or a bare async generator (advanced to its first yield).
        ctx = lifespan(app)
        if hasattr(ctx, "__aenter__"):
            entered = await ctx.__aenter__()
            # Covers a context manager that hands back an async generator
            # which must itself be advanced once.
            if hasattr(entered, "__anext__"):
                await entered.__anext__()
        else:
            await ctx.__anext__()
        return ctx

    # Hold a reference so the suspended lifespan is not garbage-collected,
    # which would finalize it and run its shutdown half.
    init_app._lifespan_ctx = get_loop().run_until_complete(startup())

    _app = app
    return _app
68
+
69
+
70
def handle_query(params):
    """Execute a full RAG query through the real Precis pipeline.

    Reads ``query``, ``session_id``, ``source_filter`` and ``search_mode``
    from ``params`` (defaults: ``""``, ``None``, ``None``, ``"standard"``)
    and returns whatever ``process_query`` produces.
    """
    from backend.main import process_query

    loop = get_loop()
    defaults = (
        ("query", ""),
        ("session_id", None),
        ("source_filter", None),
        ("search_mode", "standard"),
    )
    query_dict = {key: params.get(key, fallback) for key, fallback in defaults}
    return loop.run_until_complete(process_query(query_dict))
84
+
85
+
86
def handle_list_documents(params):
    """List indexed documents from the database.

    ``params`` is accepted for handler-signature uniformity and is not read.
    """
    from backend.db.repository import get_all_documents
    return get_all_documents()
91
+
92
+
93
def handle_debug_stem(params):
    """Show how the PrecisStemmer processes a query.

    The query is read from ``params["q"]`` (default ``""``), lower-cased and
    split on whitespace. Returns the raw tokens alongside the stemmed ones.
    """
    from backend.core.stemming import PrecisStemmer

    stemmer = PrecisStemmer()
    words = params.get("q", "").lower().split()
    return {
        "raw_tokens": words,
        "stemmed_tokens": list(stemmer.stem_tokens(words)),
    }
100
+
101
+
102
def handle_debug_search(params):
    """Run a direct hybrid search bypassing the planner.

    The query is read from ``params["q"]`` (default ``""``), lower-cased,
    split on whitespace and stemmed before it is passed to the index held in
    ``backend.main._demo_index``.

    Returns:
        A dict with the query, the stemmed tokens, the total hit count and a
        summary of the first ten hits.

    Raises:
        RuntimeError: If ``backend.main`` has no index set. The dispatcher
            reports this as an error response.
    """
    from backend.core.stemming import PrecisStemmer
    import backend.main as _main

    index = getattr(_main, "_demo_index", None)
    if index is None:
        # Without this guard the failure is an opaque AttributeError on None.
        raise RuntimeError("Search index is not initialized (backend.main._demo_index is unset)")

    stemmer = PrecisStemmer()
    query = params.get("q", "")
    stemmed = tuple(stemmer.stem_tokens(query.lower().split()))
    results = index.hybrid_search(stemmed)

    def summarize(hit):
        # Hits lacking an attribute fall back to an empty placeholder value.
        multitoken = getattr(hit, "multitoken", None)
        has_multitoken = hasattr(hit, "multitoken")
        return {
            "tokens": list(multitoken.tokens) if has_multitoken else [],
            "source": multitoken.source_doc if has_multitoken else "",
            "score": getattr(hit, "relevance_score", 0),
            "match_type": getattr(hit, "match_type", ""),
        }

    return {
        "query": query,
        "stemmed_tokens": list(stemmed),
        "result_count": len(results),
        "results": [summarize(r) for r in results[:10]],
    }
127
+
128
+
129
+ # ── Action dispatcher ─────────────────────────────────────────────────
130
+
131
# Maps each protocol action name to the callable that serves it. Every
# handler takes the request's ``params`` and returns a JSON-serializable
# result. ``health`` answers without touching the backend.
ACTIONS = dict(
    query=handle_query,
    list_documents=handle_list_documents,
    debug_stem=handle_debug_stem,
    debug_search=handle_debug_search,
    health=lambda p: {"status": "ok", "backend": "precis"},
)
138
+
139
+
140
def main():
    """Serve JSON requests from stdin until it is closed.

    Writes ``__READY__`` first, then answers each request line with exactly
    one JSON line on stdout: ``{"id", "result"}`` on success or
    ``{"id", "error"}`` on failure. Lines that are blank, are not valid JSON,
    or are not JSON objects get no response (they carry no usable request
    id); the latter two are reported on stderr. The backend is initialized on
    the first action other than ``health``.
    """
    def warn(message):
        # stderr only: stdout is reserved for protocol lines.
        sys.stderr.write(f"[precis] {message}\n")
        sys.stderr.flush()

    # Send ready signal
    sys.stdout.write("__READY__\n")
    sys.stdout.flush()

    # Initialize app on first request, not at startup (faster initial ready signal)
    initialized = False

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        try:
            request = json.loads(line)
        except json.JSONDecodeError:
            warn("ignoring request line that is not valid JSON")
            continue
        if not isinstance(request, dict):
            # e.g. `[1, 2]` or `"x"`: calling .get() on it would crash the worker.
            warn("ignoring request that is not a JSON object")
            continue

        req_id = request.get("id")
        action = request.get("action", "")
        # `"params": null` must reach handlers as an empty dict.
        params = request.get("params") or {}

        # A non-string action (e.g. a list) is unhashable and cannot be looked up.
        handler = ACTIONS.get(action) if isinstance(action, str) else None
        if not handler:
            response = {"id": req_id, "error": f"Unknown action: {action}"}
        else:
            try:
                # Lazy init
                if not initialized and action != "health":
                    init_app()
                    initialized = True

                response = {"id": req_id, "result": handler(params)}
            except Exception as e:
                response = {"id": req_id, "error": str(e)}

        try:
            encoded = json.dumps(response, default=str)
        except (TypeError, ValueError) as e:
            # default=str cannot rescue everything (e.g. unsupported dict
            # keys, circular references); answer with an error rather than
            # letting the worker die.
            encoded = json.dumps({"id": req_id, "error": f"Unserializable result: {e}"},
                                 default=str)

        sys.stdout.write(encoded + "\n")
        sys.stdout.flush()


if __name__ == "__main__":
    main()
@@ -0,0 +1,8 @@
1
+ # Sample data for Precis demo
2
+ # Place public 10-K reports, earnings transcripts, or financial documents here.
3
+ # These are used by the seed_data.py script to populate the demo index.
4
+ #
5
+ # Suggested sources (publicly available; check each source's licensing terms before redistributing):
6
+ # - SEC EDGAR: https://www.sec.gov/edgar.shtml
7
+ # - Sample 10-K filings from large public companies
8
+ # - Earnings call transcripts