mcp-agentic-pipelines 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +93 -0
- package/README.md +258 -0
- package/package.json +70 -0
- package/packages/clinical/package.json +22 -0
- package/packages/clinical/src/index.ts +262 -0
- package/packages/clinical/tsconfig.json +13 -0
- package/packages/core/package.json +21 -0
- package/packages/core/src/config.ts +138 -0
- package/packages/core/src/errors.ts +100 -0
- package/packages/core/src/index.ts +104 -0
- package/packages/core/src/llm-config.ts +213 -0
- package/packages/core/src/logging.ts +66 -0
- package/packages/core/src/python-bridge.ts +384 -0
- package/packages/core/src/rate-limiter.ts +136 -0
- package/packages/core/src/types.ts +203 -0
- package/packages/core/src/validation.ts +101 -0
- package/packages/core/tsconfig.json +10 -0
- package/packages/deeppipe/package.json +21 -0
- package/packages/deeppipe/src/index.ts +424 -0
- package/packages/deeppipe/tsconfig.json +13 -0
- package/packages/piste/package.json +20 -0
- package/packages/piste/src/index.ts +48 -0
- package/packages/piste/tsconfig.json +13 -0
- package/packages/precis/package.json +20 -0
- package/packages/precis/src/index.ts +67 -0
- package/packages/precis/tsconfig.json +13 -0
- package/packages/server/package.json +31 -0
- package/packages/server/src/index.ts +427 -0
- package/packages/server/tsconfig.json +17 -0
- package/setup.mjs +141 -0
- package/test.mjs +337 -0
- package/vendors/clinical-intake/pipeline.mjs +349 -0
- package/vendors/clinical-intake/questions/en.txt +9 -0
- package/vendors/clinical-intake/questions/fr.txt +9 -0
- package/vendors/piste/.env.example +73 -0
- package/vendors/piste/app/core/__init__.py +4 -0
- package/vendors/piste/app/core/config.py +83 -0
- package/vendors/piste/app/core/debuglog.py +16 -0
- package/vendors/piste/app/core/middleware.py +40 -0
- package/vendors/piste/bridge_piste.py +301 -0
- package/vendors/piste/pipeline/__init__.py +4 -0
- package/vendors/piste/pipeline/compiler.py +68 -0
- package/vendors/piste/pipeline/offline/__init__.py +28 -0
- package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
- package/vendors/piste/pipeline/replay.py +15 -0
- package/vendors/piste/pipeline/replay_engine.py +249 -0
- package/vendors/piste/pipeline/signatures/__init__.py +4 -0
- package/vendors/piste/pipeline/signatures/signatures.py +136 -0
- package/vendors/piste/pipeline/stage1/__init__.py +21 -0
- package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
- package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
- package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
- package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
- package/vendors/piste/pipeline/stage2/__init__.py +34 -0
- package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
- package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
- package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
- package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
- package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
- package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
- package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
- package/vendors/piste/pipeline/stage3/__init__.py +20 -0
- package/vendors/piste/pipeline/stage3/classifier.py +79 -0
- package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
- package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
- package/vendors/piste/pipeline/stage4/__init__.py +33 -0
- package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
- package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
- package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
- package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
- package/vendors/piste/requirements.txt +53 -0
- package/vendors/precis/backend/__init__.py +6 -0
- package/vendors/precis/backend/agents/__init__.py +3 -0
- package/vendors/precis/backend/agents/data_synthesis.py +105 -0
- package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
- package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
- package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
- package/vendors/precis/backend/agents/guardrail.py +175 -0
- package/vendors/precis/backend/agents/query_expander.py +89 -0
- package/vendors/precis/backend/agents/radial_interpol.py +99 -0
- package/vendors/precis/backend/agents/report_generator.py +92 -0
- package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
- package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
- package/vendors/precis/backend/agents/vector_index.py +123 -0
- package/vendors/precis/backend/agents/veri_score.py +341 -0
- package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
- package/vendors/precis/backend/api/__init__.py +3 -0
- package/vendors/precis/backend/api/routes/__init__.py +3 -0
- package/vendors/precis/backend/config.py +88 -0
- package/vendors/precis/backend/core/__init__.py +13 -0
- package/vendors/precis/backend/core/hashing.py +22 -0
- package/vendors/precis/backend/core/metrics.py +77 -0
- package/vendors/precis/backend/core/multitoken.py +166 -0
- package/vendors/precis/backend/core/pmi.py +54 -0
- package/vendors/precis/backend/core/stemming.py +74 -0
- package/vendors/precis/backend/core/tracing.py +150 -0
- package/vendors/precis/backend/data/__init__.py +3 -0
- package/vendors/precis/backend/data/chunker.py +57 -0
- package/vendors/precis/backend/data/pdf_parser.py +42 -0
- package/vendors/precis/backend/db/__init__.py +3 -0
- package/vendors/precis/backend/db/models.py +173 -0
- package/vendors/precis/backend/db/repository.py +269 -0
- package/vendors/precis/backend/llm/__init__.py +3 -0
- package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
- package/vendors/precis/backend/llm/base.py +147 -0
- package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
- package/vendors/precis/backend/llm/factory.py +60 -0
- package/vendors/precis/backend/llm/google_provider.py +39 -0
- package/vendors/precis/backend/llm/ollama_provider.py +54 -0
- package/vendors/precis/backend/llm/openai_provider.py +50 -0
- package/vendors/precis/backend/main.py +677 -0
- package/vendors/precis/backend/orchestrator/__init__.py +3 -0
- package/vendors/precis/backend/orchestrator/planner.py +81 -0
- package/vendors/precis/backend/orchestrator/router.py +319 -0
- package/vendors/precis/backend/orchestrator/types.py +58 -0
- package/vendors/precis/bridge_precis.py +185 -0
- package/vendors/precis/data/sample_reports/README.md +8 -0
- package/vendors/precis/data/seed_data.py +115 -0
- package/vendors/precis/requirements.txt +19 -0
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
# Copyright (c) 2026 Jinan Kordab
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Stage 2b: Blind Retriever [J2]
|
|
6
|
+
================================
|
|
7
|
+
Veri-fact.ai's jewel: the retriever executes NEUTRAL search queries
|
|
8
|
+
and NEVER sees the original claim. This prevents confirmation bias
|
|
9
|
+
at the architecture level — not the prompt level.
|
|
10
|
+
|
|
11
|
+
Queries every configured provider concurrently and merges the results:
  Providers: Tavily, Serper, Google CSE
  A provider that fails is logged and skipped; the others still contribute.
|
|
14
|
+
|
|
15
|
+
Also checks FAISS Tier-1 cache before external search [J7].
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import hashlib
|
|
20
|
+
import time
|
|
21
|
+
from typing import List, Optional
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
|
|
24
|
+
import httpx
|
|
25
|
+
import numpy as np
|
|
26
|
+
from app.core.config import settings
|
|
27
|
+
from app.services.vector_store import faiss_store
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class RawSearchResult:
    """One provider hit in its raw form, i.e. before canonical mapping."""

    # Identifier of the provider that returned the hit (e.g. "tavily").
    provider: str
    # The query string that was sent to the provider.
    query_used: str
    # Title / headline as reported by the provider.
    title: str
    # Result URL as reported by the provider.
    url: str
    # Text excerpt as reported by the provider.
    snippet: str
    # Host derived from ``url``; empty when none was supplied.
    domain: str = ""
    # Untouched provider payload for this hit; a fresh dict per instance.
    raw_data: dict = field(default_factory=dict)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class BlindRetriever:
|
|
43
|
+
"""
|
|
44
|
+
Executes neutral search queries against web search providers.
|
|
45
|
+
|
|
46
|
+
CRITICAL ARCHITECTURAL CONSTRAINT [J2]:
|
|
47
|
+
This module NEVER receives the original claim text.
|
|
48
|
+
It only sees neutral, factual queries generated by SearchDecisionGenerator.
|
|
49
|
+
This prevents confirmation bias — the retriever cannot preferentially
|
|
50
|
+
return sources that support or attack the claim's framing.
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
def __init__(self):
    """Start with the default locale and no HTTP client."""
    # English until set_locale() / search() stores another locale.
    self._locale: str = "en"
    # Created on first use by _get_client().
    self.http_client: Optional[httpx.AsyncClient] = None
|
|
56
|
+
|
|
57
|
+
def set_locale(self, locale: str):
    """Store ``locale`` for the region-biased provider requests that follow [C2]."""
    # Kept on the instance; the provider methods read self._locale.
    self._locale = locale
|
|
60
|
+
|
|
61
|
+
async def _get_client(self) -> httpx.AsyncClient:
    """Return the instance's HTTP client, creating it on first use."""
    existing = self.http_client
    if existing is not None:
        return existing
    # First call: build the client (30 s timeout) and keep it for reuse.
    self.http_client = httpx.AsyncClient(timeout=30.0)
    return self.http_client
|
|
65
|
+
|
|
66
|
+
async def search(
    self,
    queries: List[str],
    top_k: int = 5,
    locale: str = "en",
) -> List[RawSearchResult]:
    """
    Run every query against the configured providers and merge the hits.

    Cache hits (if any) come first, followed by the web results in query
    order. A query whose search task fails or is cancelled is reported and
    skipped; it never aborts the remaining queries.

    Args:
        queries: Neutral search queries (NEVER the original claim).
        top_k: Max results requested per query from each provider.
        locale: Language/region for locale-biased search results [C2].

    Returns:
        RawSearchResult objects de-duplicated by URL; the first occurrence
        of a URL wins.
    """
    # NOTE(review): the locale is stored on the instance, so overlapping
    # search() calls with different locales can overwrite each other.
    self.set_locale(locale)
    all_results: List[RawSearchResult] = []

    # Check FAISS Tier-1 cache first [J7]
    cached = await self._check_faiss_cache(queries)
    if cached:
        all_results.extend(cached)

    # Execute parallel web searches
    tasks = [self._search_single_query(q, top_k) for q in queries]
    results_per_query = await asyncio.gather(*tasks, return_exceptions=True)

    for query, result in zip(queries, results_per_query):
        # asyncio.CancelledError derives from BaseException, not Exception,
        # so test for BaseException to keep it away from extend() below.
        if isinstance(result, BaseException):
            print(f"[SEARCH] Query FAILED: {query[:50]}...: {result!r}", flush=True)
            continue  # Individual query failure is non-fatal
        all_results.extend(result)

    # Deduplicate by URL, ignoring case and a trailing slash.
    seen_urls = set()
    unique_results = []
    for r in all_results:
        normalized_url = r.url.lower().rstrip("/")
        if normalized_url not in seen_urls:
            seen_urls.add(normalized_url)
            unique_results.append(r)

    return unique_results
|
|
110
|
+
|
|
111
|
+
async def _search_single_query(
    self, query: str, top_k: int
) -> List[RawSearchResult]:
    """Fan one query out to every configured provider and pool the hits."""
    http = await self._get_client()

    # (label, configured?, search method) - the order fixes the result order.
    providers = (
        ("Tavily", settings.TAVILY_API_KEY, self._search_tavily),
        ("Serper", settings.SERPER_API_KEY, self._search_serper),
        (
            "Google CSE",
            settings.GOOGLE_CSE_API_KEY and settings.GOOGLE_CSE_ID,
            self._search_google_cse,
        ),
    )
    active = [(label, fn) for label, configured, fn in providers if configured]

    if not active:
        print(f"[SEARCH] No search providers configured for query: {query[:50]}...", flush=True)
        return []

    # Run the providers side by side; a failure comes back as a value.
    outcomes = await asyncio.gather(
        *(fn(http, query, top_k) for _, fn in active),
        return_exceptions=True,
    )

    pooled: List[RawSearchResult] = []
    for (label, _), outcome in zip(active, outcomes):
        if isinstance(outcome, Exception):
            print(f"[SEARCH] {label} FAILED: {outcome}", flush=True)
        elif isinstance(outcome, list):
            print(f"[SEARCH] {label} returned {len(outcome)} results for query: {query[:50]}...", flush=True)
            pooled.extend(outcome)
        else:
            print(f"[SEARCH] {label} returned unexpected type: {type(outcome)}", flush=True)

    return pooled
|
|
151
|
+
|
|
152
|
+
async def _search_tavily(
    self, client: httpx.AsyncClient, query: str, top_k: int
) -> List[RawSearchResult]:
    """
    Search via the Tavily API.

    Args:
        client: HTTP client used for the request.
        query: Neutral search query.
        top_k: Max number of results to request and to return.

    Returns:
        One RawSearchResult per Tavily hit, with provider="tavily".

    Raises:
        httpx.HTTPStatusError: If Tavily answers with an error status.
    """
    # NOTE(review): no locale/region parameter is sent. Region params used
    # to be looked up here but were never added to the request, so that
    # dead lookup (and its import) was removed. Wire real biasing in once
    # the Tavily parameter mapping is confirmed [C2].
    response = await client.post(
        "https://api.tavily.com/search",
        json={
            "query": query,
            "max_results": top_k,
            "search_depth": "basic",
        },
        headers={
            "Authorization": f"Bearer {settings.TAVILY_API_KEY}",
            "Content-Type": "application/json",
        },
    )
    response.raise_for_status()
    data = response.json()

    results = []
    for item in data.get("results", [])[:top_k]:
        domain = self._extract_domain(item.get("url", ""))
        results.append(RawSearchResult(
            provider="tavily",
            query_used=query,
            title=item.get("title", ""),
            url=item.get("url", ""),
            snippet=item.get("content", ""),
            domain=domain,
            raw_data=item,
        ))
    return results
|
|
187
|
+
|
|
188
|
+
async def _search_serper(
    self, client: httpx.AsyncClient, query: str, top_k: int
) -> List[RawSearchResult]:
    """Query Serper.dev (Google Search API), biased by the stored locale [C2]."""
    from app.services.locale_service import get_search_region_params

    region = get_search_region_params(self._locale)
    # gl / hl fall back to US English when the lookup omits them.
    payload = {
        "q": query,
        "num": top_k,
        "gl": region.get("gl", "us"),
        "hl": region.get("hl", "en"),
    }
    reply = await client.post(
        "https://google.serper.dev/search",
        json=payload,
        headers={"X-API-KEY": settings.SERPER_API_KEY},
    )
    reply.raise_for_status()
    organic_hits = reply.json().get("organic", [])[:top_k]

    return [
        RawSearchResult(
            provider="serper",
            query_used=query,
            title=hit.get("title", ""),
            url=hit.get("link", ""),
            snippet=hit.get("snippet", ""),
            domain=self._extract_domain(hit.get("link", "")),
            raw_data=hit,
        )
        for hit in organic_hits
    ]
|
|
221
|
+
|
|
222
|
+
async def _search_google_cse(
    self, client: httpx.AsyncClient, query: str, top_k: int
) -> List[RawSearchResult]:
    """
    Search via Google Custom Search Engine with Canada/Quebec locale bias [C2].

    Args:
        client: HTTP client used for the request.
        query: Neutral search query; site restrictions are appended to it.
        top_k: Max number of results to return (at most 10 are requested).

    Returns:
        One RawSearchResult per hit, with provider="google_cse"; an empty
        list when ``top_k`` is not positive.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    # num is documented as 1..10, so a non-positive top_k cannot produce a
    # useful request; skip the round trip.
    if top_k <= 0:
        return []

    # lr takes a bare language code ("lang_fr"), so reduce tags such as
    # "fr-CA" or "fr_CA" to their primary subtag. Chinese is the exception:
    # its lr values keep the region ("lang_zh-CN", "lang_zh-TW").
    primary, _, region = self._locale.strip().replace("_", "-").partition("-")
    if primary.lower() == "zh" and region:
        language = f"zh-{region.upper()}"
    else:
        language = primary.lower()

    # Build locale-aware query with site restrictions
    locale_query = self._build_locale_query(query)

    response = await client.get(
        "https://www.googleapis.com/customsearch/v1",
        params={
            "key": settings.GOOGLE_CSE_API_KEY,
            "cx": settings.GOOGLE_CSE_ID,
            "q": locale_query,
            "num": min(top_k, 10),
            "cr": "countryCA",  # Restrict to Canada
            "lr": f"lang_{language}",  # Language restriction
        },
    )
    response.raise_for_status()
    data = response.json()

    results = []
    for item in data.get("items", [])[:top_k]:
        domain = self._extract_domain(item.get("link", ""))
        results.append(RawSearchResult(
            provider="google_cse",
            query_used=query,
            title=item.get("title", ""),
            url=item.get("link", ""),
            snippet=item.get("snippet", ""),
            domain=domain,
            raw_data=item,
        ))
    return results
|
|
256
|
+
|
|
257
|
+
def _build_locale_query(self, query: str) -> str:
    """
    Append site: restrictions that focus a query on Canadian/Quebec sources [C2].

    French locales ("fr", "fr-CA", "fr_CA", any letter case) list the Quebec
    sites first; every other locale lists the Canada-wide sites first. Only
    the first 10 sites of the combined list are used, so the ordering
    decides which sites make the cut.

    Args:
        query: Neutral search query.

    Returns:
        The query followed by an OR-joined group of site: operators, in the
        form "(<query>) (site:a OR site:b ...)".
    """
    # Authoritative Canadian sources for political fact-checking
    ca_sites = [
        "gc.ca", "canada.ca", "parl.ca", "elections.ca",
        "cbc.ca", "radio-canada.ca", "ctvnews.ca", "globalnews.ca",
        "theglobeandmail.com", "macleans.ca", "hilltimes.com",
        "policyalternatives.ca",
    ]
    qc_sites = [
        "qc.ca", "assnat.qc.ca", "electionsquebec.qc.ca",
        "lapresse.ca", "ledevoir.com",
    ]

    # Compare on the primary language subtag so regional tags such as
    # "fr-CA" are still recognised as French.
    language = self._locale.replace("_", "-").split("-")[0].strip().lower()
    if language == "fr":
        sites = qc_sites + ca_sites
    else:
        sites = ca_sites + qc_sites

    # Add site: operators (Google CSE supports this)
    site_filter = " OR ".join(f"site:{s}" for s in sites[:10])
    return f"({query}) ({site_filter})"
|
|
279
|
+
|
|
280
|
+
async def _check_faiss_cache(
    self, queries: List[str]
) -> List[RawSearchResult]:
    """Look up semantically similar verified claims in the FAISS Tier-1 cache."""
    # Placeholder: the production version is meant to embed the queries and
    # search FAISS. The cache is populated in Loop 2 (Phase 8), so for now
    # there are never any hits to report.
    no_hits: List[RawSearchResult] = []
    return no_hits
|
|
287
|
+
|
|
288
|
+
@staticmethod
def _extract_domain(url: str) -> str:
    """
    Return the lower-cased host of ``url`` without a leading "www.".

    Port numbers and ``user:password@`` credentials are not part of the
    result, so "https://www.Example.com:8443/x" yields "example.com".

    Args:
        url: Absolute URL as returned by a search provider.

    Returns:
        The host name, or "" when ``url`` has no host or cannot be parsed.
    """
    from urllib.parse import urlparse
    try:
        # .hostname (unlike .netloc) excludes userinfo and port and is
        # already lower-cased; it is None when the URL has no host.
        domain = urlparse(url).hostname or ""
        if domain.startswith("www."):
            domain = domain[4:]
        return domain
    except (TypeError, ValueError, AttributeError):
        # Malformed URL (e.g. an unbalanced IPv6 bracket) or non-text input.
        return ""
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# Module-level singleton: one BlindRetriever instance is created at import time.
blind_retriever = BlindRetriever()
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# Copyright (c) 2026 Jinan Kordab
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Stage 2e: Canonical Evidence Mapper [C6]
|
|
6
|
+
===========================================
|
|
7
|
+
Normalizes heterogeneous search provider formats into a unified
|
|
8
|
+
CanonicalEvidence schema.
|
|
9
|
+
|
|
10
|
+
The Replay Architecture's Canonical Mapping pattern applied to evidence:
|
|
11
|
+
Tavily → CanonicalEvidence
|
|
12
|
+
Serper → CanonicalEvidence
|
|
13
|
+
Google CSE → CanonicalEvidence
|
|
14
|
+
FAISS cache → CanonicalEvidence
|
|
15
|
+
|
|
16
|
+
All downstream stages (3, 4) operate on CanonicalEvidence ONLY.
|
|
17
|
+
No stage-specific format handling. Adding a new provider requires
|
|
18
|
+
ONLY a new adapter — zero changes to the verification pipeline.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from datetime import datetime
|
|
23
|
+
from typing import List, Optional
|
|
24
|
+
from uuid import UUID
|
|
25
|
+
|
|
26
|
+
from pipeline.stage2.blind_retriever import RawSearchResult
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _utc_now_iso() -> str:
    """Return the current UTC time as an offset-less ISO 8601 string."""
    # datetime.utcnow() is deprecated since Python 3.12. Taking an aware UTC
    # "now" and dropping tzinfo produces the same naive-UTC text as before,
    # so stored timestamps keep their format.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


@dataclass
class CanonicalEvidence:
    """
    Unified evidence schema — the single format for all downstream stages.

    Schema [C6]:
    - title: Source title/headline
    - url: Full URL to the source
    - excerpt: Relevant text excerpt from the source
    - source_domain: Domain name (e.g., bbc.com)
    - credibility_score: 0.0–1.0 per-domain credibility [J1b]
    - retrieval_ts: ISO 8601 timestamp of retrieval (UTC, no offset suffix);
      defaults to the moment the instance is created
    - query_used: The search query that found this source
    - provider: Which search provider returned this (tavily, serper, google_cse, faiss)
    - db_id: PK of the persisted Source row — set by Stage 2 after insert,
      consumed by Stage 3 to populate classifications.source_id (FK).
      Ephemeral, not serialized into the JSONB blob.
    """
    title: str
    url: str
    excerpt: str
    source_domain: str
    credibility_score: float
    retrieval_ts: str = field(default_factory=_utc_now_iso)
    query_used: str = ""
    provider: str = ""
    db_id: Optional[UUID] = None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class CanonicalEvidenceMapper:
|
|
59
|
+
"""
|
|
60
|
+
Normalizes raw search results from any provider into CanonicalEvidence.
|
|
61
|
+
|
|
62
|
+
This is the ONLY place that handles provider-specific formats.
|
|
63
|
+
Downstream stages (classification, verdict) never see raw provider data.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
async def map_results(
    self,
    raw_results: List[RawSearchResult],
    credibility_scorer=None,  # CredibilityScorer instance
) -> List[CanonicalEvidence]:
    """
    Convert raw provider hits to CanonicalEvidence, most credible first.

    Args:
        raw_results: Raw results from BlindRetriever.
        credibility_scorer: Object whose awaitable ``score_domain(domain)``
            yields the per-domain score; when omitted every source gets 0.5.

    Returns:
        CanonicalEvidence objects sorted by credibility_score, descending.
    """
    evidence: List[CanonicalEvidence] = []

    for hit in raw_results:
        # Without a scorer, fall back to the neutral default.
        if credibility_scorer:
            score = await credibility_scorer.score_domain(hit.domain)
        else:
            score = 0.5

        evidence.append(
            CanonicalEvidence(
                title=hit.title,
                url=hit.url,
                excerpt=hit.snippet,
                source_domain=hit.domain,
                credibility_score=score,
                query_used=hit.query_used,
                provider=hit.provider,
            )
        )

    # The sort is stable, so equally credible sources keep their input order.
    return sorted(evidence, key=lambda item: item.credibility_score, reverse=True)
|
|
104
|
+
|
|
105
|
+
@staticmethod
def to_dict(evidence: CanonicalEvidence) -> dict:
    """Flatten ``evidence`` into a plain dict for JSONB storage."""
    # db_id is deliberately left out of the serialized form.
    stored_fields = (
        "title",
        "url",
        "excerpt",
        "source_domain",
        "credibility_score",
        "retrieval_ts",
        "query_used",
        "provider",
    )
    return {name: getattr(evidence, name) for name in stored_fields}
|
|
118
|
+
|
|
119
|
+
@staticmethod
def from_dict(data: dict) -> CanonicalEvidence:
    """Rebuild a CanonicalEvidence from its JSONB dict form."""
    # Work on a copy so the caller's dict is untouched, then drop db_id:
    # it is the row PK itself and is not stored in the JSONB blob.
    kwargs = dict(data)
    kwargs.pop("db_id", None)
    return CanonicalEvidence(**kwargs)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Copyright (c) 2026 Jinan Kordab
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Stage 2c: Per-Domain Credibility Scorer [J1b]
|
|
6
|
+
================================================
|
|
7
|
+
Veracity's second jewel: continuous 0–1 credibility scoring per source domain.
|
|
8
|
+
Uses Lin et al. (2023) composite domain quality database.
|
|
9
|
+
|
|
10
|
+
Unlike Aletheia's binary blacklist, this provides a continuous, auditable metric.
|
|
11
|
+
Users see not just WHICH sources were used, but HOW credible each one is.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from typing import Optional
|
|
15
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
16
|
+
from sqlalchemy import select
|
|
17
|
+
from app.db.models import Domain
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CredibilityScorer:
|
|
21
|
+
"""
|
|
22
|
+
Scores each source domain on a continuous 0–1 credibility scale.
|
|
23
|
+
|
|
24
|
+
Jewel [J1b] — Veracity's per-domain credibility:
|
|
25
|
+
Transforms source transparency from binary (blocked/allowed) into
|
|
26
|
+
a continuous, auditable metric that educates users about source quality
|
|
27
|
+
while holding the system accountable.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
# Domain credibility scores are fetched from PostgreSQL domains table.
|
|
31
|
+
# Unknown domains get a default score of 0.5 (neutral, flagged for review).
|
|
32
|
+
|
|
33
|
+
DEFAULT_SCORE: float = 0.5
|
|
34
|
+
UNKNOWN_DOMAIN_SCORE: float = 0.5
|
|
35
|
+
|
|
36
|
+
def __init__(self, db: Optional[AsyncSession] = None):
    """Bind an optional DB session and start with an empty score cache."""
    # Scores already resolved by this instance, keyed by normalized domain.
    self._cache: dict[str, float] = {}
    # When no session is given, score_domain() skips the database lookup.
    self.db = db
|
|
39
|
+
|
|
40
|
+
async def score_domain(self, domain: str) -> float:
    """
    Resolve the credibility score of one domain.

    The domain is lower-cased and stripped, then looked up in the in-memory
    cache and, if a DB session is bound, in the domains table. Whatever is
    resolved (including the default) is cached for later calls.

    Returns:
        0.0–1.0 score, where 1.0 = highest credibility; DEFAULT_SCORE when
        the domain is unknown.
    """
    key = domain.lower().strip()

    # Fast path: already resolved by this instance.
    try:
        return self._cache[key]
    except KeyError:
        pass

    # Unknown domains keep the neutral default.
    resolved = self.DEFAULT_SCORE
    if self.db:
        # Query PostgreSQL domains table
        lookup = await self.db.execute(
            select(Domain.credibility_score).where(Domain.domain_name == key)
        )
        stored = lookup.scalar()
        if stored is not None:
            resolved = float(stored)

    self._cache[key] = resolved
    return resolved
|
|
68
|
+
|
|
69
|
+
async def score_domains(
    self, domains: list[str]
) -> dict[str, float]:
    """Score several domains, one after another, keyed by the names as given."""
    # Awaited sequentially in input order, exactly like a plain loop.
    return {name: await self.score_domain(name) for name in domains}
|
|
77
|
+
|
|
78
|
+
async def is_reliable(self, domain: str) -> bool:
    """Tell whether the domain's score reaches the 0.6 reliability cut-off."""
    credibility = await self.score_domain(domain)
    return 0.6 <= credibility
|
|
82
|
+
|
|
83
|
+
def flush_cache(self):
    """Forget every cached score (useful between sessions)."""
    # Emptied in place, so the same dict object stays bound to the instance.
    cache = self._cache
    cache.clear()
|