mcp-agentic-pipelines 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.env.example +93 -0
  2. package/README.md +258 -0
  3. package/package.json +70 -0
  4. package/packages/clinical/package.json +22 -0
  5. package/packages/clinical/src/index.ts +262 -0
  6. package/packages/clinical/tsconfig.json +13 -0
  7. package/packages/core/package.json +21 -0
  8. package/packages/core/src/config.ts +138 -0
  9. package/packages/core/src/errors.ts +100 -0
  10. package/packages/core/src/index.ts +104 -0
  11. package/packages/core/src/llm-config.ts +213 -0
  12. package/packages/core/src/logging.ts +66 -0
  13. package/packages/core/src/python-bridge.ts +384 -0
  14. package/packages/core/src/rate-limiter.ts +136 -0
  15. package/packages/core/src/types.ts +203 -0
  16. package/packages/core/src/validation.ts +101 -0
  17. package/packages/core/tsconfig.json +10 -0
  18. package/packages/deeppipe/package.json +21 -0
  19. package/packages/deeppipe/src/index.ts +424 -0
  20. package/packages/deeppipe/tsconfig.json +13 -0
  21. package/packages/piste/package.json +20 -0
  22. package/packages/piste/src/index.ts +48 -0
  23. package/packages/piste/tsconfig.json +13 -0
  24. package/packages/precis/package.json +20 -0
  25. package/packages/precis/src/index.ts +67 -0
  26. package/packages/precis/tsconfig.json +13 -0
  27. package/packages/server/package.json +31 -0
  28. package/packages/server/src/index.ts +427 -0
  29. package/packages/server/tsconfig.json +17 -0
  30. package/setup.mjs +141 -0
  31. package/test.mjs +337 -0
  32. package/vendors/clinical-intake/pipeline.mjs +349 -0
  33. package/vendors/clinical-intake/questions/en.txt +9 -0
  34. package/vendors/clinical-intake/questions/fr.txt +9 -0
  35. package/vendors/piste/.env.example +73 -0
  36. package/vendors/piste/app/core/__init__.py +4 -0
  37. package/vendors/piste/app/core/config.py +83 -0
  38. package/vendors/piste/app/core/debuglog.py +16 -0
  39. package/vendors/piste/app/core/middleware.py +40 -0
  40. package/vendors/piste/bridge_piste.py +301 -0
  41. package/vendors/piste/pipeline/__init__.py +4 -0
  42. package/vendors/piste/pipeline/compiler.py +68 -0
  43. package/vendors/piste/pipeline/offline/__init__.py +28 -0
  44. package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
  45. package/vendors/piste/pipeline/replay.py +15 -0
  46. package/vendors/piste/pipeline/replay_engine.py +249 -0
  47. package/vendors/piste/pipeline/signatures/__init__.py +4 -0
  48. package/vendors/piste/pipeline/signatures/signatures.py +136 -0
  49. package/vendors/piste/pipeline/stage1/__init__.py +21 -0
  50. package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
  51. package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
  52. package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
  53. package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
  54. package/vendors/piste/pipeline/stage2/__init__.py +34 -0
  55. package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
  56. package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
  57. package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
  58. package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
  59. package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
  60. package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
  61. package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
  62. package/vendors/piste/pipeline/stage3/__init__.py +20 -0
  63. package/vendors/piste/pipeline/stage3/classifier.py +79 -0
  64. package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
  65. package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
  66. package/vendors/piste/pipeline/stage4/__init__.py +33 -0
  67. package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
  68. package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
  69. package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
  70. package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
  71. package/vendors/piste/requirements.txt +53 -0
  72. package/vendors/precis/backend/__init__.py +6 -0
  73. package/vendors/precis/backend/agents/__init__.py +3 -0
  74. package/vendors/precis/backend/agents/data_synthesis.py +105 -0
  75. package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
  76. package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
  77. package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
  78. package/vendors/precis/backend/agents/guardrail.py +175 -0
  79. package/vendors/precis/backend/agents/query_expander.py +89 -0
  80. package/vendors/precis/backend/agents/radial_interpol.py +99 -0
  81. package/vendors/precis/backend/agents/report_generator.py +92 -0
  82. package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
  83. package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
  84. package/vendors/precis/backend/agents/vector_index.py +123 -0
  85. package/vendors/precis/backend/agents/veri_score.py +341 -0
  86. package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
  87. package/vendors/precis/backend/api/__init__.py +3 -0
  88. package/vendors/precis/backend/api/routes/__init__.py +3 -0
  89. package/vendors/precis/backend/config.py +88 -0
  90. package/vendors/precis/backend/core/__init__.py +13 -0
  91. package/vendors/precis/backend/core/hashing.py +22 -0
  92. package/vendors/precis/backend/core/metrics.py +77 -0
  93. package/vendors/precis/backend/core/multitoken.py +166 -0
  94. package/vendors/precis/backend/core/pmi.py +54 -0
  95. package/vendors/precis/backend/core/stemming.py +74 -0
  96. package/vendors/precis/backend/core/tracing.py +150 -0
  97. package/vendors/precis/backend/data/__init__.py +3 -0
  98. package/vendors/precis/backend/data/chunker.py +57 -0
  99. package/vendors/precis/backend/data/pdf_parser.py +42 -0
  100. package/vendors/precis/backend/db/__init__.py +3 -0
  101. package/vendors/precis/backend/db/models.py +173 -0
  102. package/vendors/precis/backend/db/repository.py +269 -0
  103. package/vendors/precis/backend/llm/__init__.py +3 -0
  104. package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
  105. package/vendors/precis/backend/llm/base.py +147 -0
  106. package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
  107. package/vendors/precis/backend/llm/factory.py +60 -0
  108. package/vendors/precis/backend/llm/google_provider.py +39 -0
  109. package/vendors/precis/backend/llm/ollama_provider.py +54 -0
  110. package/vendors/precis/backend/llm/openai_provider.py +50 -0
  111. package/vendors/precis/backend/main.py +677 -0
  112. package/vendors/precis/backend/orchestrator/__init__.py +3 -0
  113. package/vendors/precis/backend/orchestrator/planner.py +81 -0
  114. package/vendors/precis/backend/orchestrator/router.py +319 -0
  115. package/vendors/precis/backend/orchestrator/types.py +58 -0
  116. package/vendors/precis/bridge_precis.py +185 -0
  117. package/vendors/precis/data/sample_reports/README.md +8 -0
  118. package/vendors/precis/data/seed_data.py +115 -0
  119. package/vendors/precis/requirements.txt +19 -0
@@ -0,0 +1,303 @@
1
+ # Copyright (c) 2026 Jinan Kordab
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ """
5
+ Stage 2b: Blind Retriever [J2]
6
+ ================================
7
+ Veri-fact.ai's jewel: the retriever executes NEUTRAL search queries
8
+ and NEVER sees the original claim. This prevents confirmation bias
9
+ at the architecture level — not the prompt level.
10
+
11
+ Executes each query against every configured provider concurrently and
12
+ merges the results (a failing provider is logged and skipped, not fatal):
13
+ Providers: Tavily, Serper, Google CSE
14
+
15
+ Also checks FAISS Tier-1 cache before external search [J7].
16
+ """
17
+
18
+ import asyncio
19
+ import hashlib
20
+ import time
21
+ from typing import List, Optional
22
+ from dataclasses import dataclass, field
23
+
24
+ import httpx
25
+ import numpy as np
26
+ from app.core.config import settings
27
+ from app.services.vector_store import faiss_store
28
+
29
+
30
@dataclass
class RawSearchResult:
    """Provider-agnostic record for one search hit, prior to canonical mapping.

    The first five fields are required and positional; ``domain`` and
    ``raw_data`` are optional.
    """

    # Short identifier of the provider that produced the hit.
    provider: str
    # The search query string that produced the hit.
    query_used: str
    # Title/headline reported by the provider.
    title: str
    # Link to the source document.
    url: str
    # Text excerpt reported by the provider.
    snippet: str
    # Domain associated with ``url``; empty string when not supplied.
    domain: str = ""
    # Provider payload for this hit; each instance gets its own fresh dict.
    raw_data: dict = field(default_factory=dict)
40
+
41
+
42
class BlindRetriever:
    """
    Executes neutral search queries against web search providers.

    CRITICAL ARCHITECTURAL CONSTRAINT [J2]:
        This module NEVER receives the original claim text.
        It only sees neutral, factual queries generated by SearchDecisionGenerator.
        This prevents confirmation bias — the retriever cannot preferentially
        return sources that support or attack the claim's framing.

    For every query, each provider that has credentials in ``settings`` is
    called concurrently and the results are merged; a failing provider or
    query is logged and skipped.

    NOTE(review): ``search`` stores the locale on the instance. Overlapping
    ``search`` calls with different locales on the shared singleton can see
    each other's locale — confirm callers do not rely on per-call isolation.
    """

    def __init__(self):
        # Created lazily by _get_client(); released by close().
        self.http_client: Optional["httpx.AsyncClient"] = None
        self._locale: str = "en"  # Default locale

    def set_locale(self, locale: str):
        """Set locale for region-biased search results [C2]."""
        self._locale = locale

    def _language_code(self) -> str:
        """Return the lowercase primary language subtag of the current locale.

        ``"fr-CA"`` and ``"fr_CA"`` both yield ``"fr"``; an empty or missing
        locale yields ``"en"``.
        """
        tag = (self._locale or "en").replace("_", "-")
        primary = tag.split("-", 1)[0].strip().lower()
        return primary or "en"

    async def _get_client(self) -> "httpx.AsyncClient":
        """Return the shared HTTP client, creating it on first use."""
        if self.http_client is None:
            self.http_client = httpx.AsyncClient(timeout=30.0)
        return self.http_client

    async def close(self) -> None:
        """Close the shared HTTP client, if one was created.

        Safe to call repeatedly; a later search creates a fresh client.
        """
        if self.http_client is not None:
            client, self.http_client = self.http_client, None
            await client.aclose()

    async def search(
        self,
        queries: List[str],
        top_k: int = 5,
        locale: str = "en",
    ) -> "List[RawSearchResult]":
        """
        Execute searches across all queries and merge the results.

        Args:
            queries: Neutral search queries (NEVER the original claim).
            top_k: Max results per query and provider.
            locale: Language/region for locale-biased search results [C2].

        Returns:
            RawSearchResult objects from the cache and all providers,
            de-duplicated by URL (case-insensitive, trailing "/" ignored),
            first occurrence kept.
        """
        self.set_locale(locale)
        all_results: list = []

        # Check FAISS Tier-1 cache first [J7]
        cached = await self._check_faiss_cache(queries)
        if cached:
            all_results.extend(cached)

        # Execute parallel web searches
        tasks = [self._search_single_query(q, top_k) for q in queries]
        results_per_query = await asyncio.gather(*tasks, return_exceptions=True)

        for query, result in zip(queries, results_per_query):
            # gather(return_exceptions=True) can also hand back CancelledError,
            # which derives from BaseException, not Exception.
            if isinstance(result, BaseException):
                # Individual query failure is non-fatal, but should be visible.
                print(f"[SEARCH] Query FAILED ({query[:50]}...): {result!r}", flush=True)
                continue
            all_results.extend(result)

        # Deduplicate by URL
        seen_urls = set()
        unique_results = []
        for r in all_results:
            # A provider may return a null URL; treat it like an empty one.
            normalized_url = (r.url or "").lower().rstrip("/")
            if normalized_url not in seen_urls:
                seen_urls.add(normalized_url)
                unique_results.append(r)

        return unique_results

    async def _search_single_query(
        self, query: str, top_k: int
    ) -> "List[RawSearchResult]":
        """Search across ALL available providers concurrently, then aggregate results."""
        client = await self._get_client()
        all_results: list = []

        # Build list of coroutines for each configured provider
        tasks = []
        task_names = []

        if settings.TAVILY_API_KEY:
            tasks.append(self._search_tavily(client, query, top_k))
            task_names.append("Tavily")

        if settings.SERPER_API_KEY:
            tasks.append(self._search_serper(client, query, top_k))
            task_names.append("Serper")

        if settings.GOOGLE_CSE_API_KEY and settings.GOOGLE_CSE_ID:
            tasks.append(self._search_google_cse(client, query, top_k))
            task_names.append("Google CSE")

        if not tasks:
            print(f"[SEARCH] No search providers configured for query: {query[:50]}...", flush=True)
            return []

        # Execute all providers concurrently
        gathered = await asyncio.gather(*tasks, return_exceptions=True)

        for name, result in zip(task_names, gathered):
            if isinstance(result, BaseException):
                print(f"[SEARCH] {name} FAILED: {result}", flush=True)
            elif isinstance(result, list):
                print(f"[SEARCH] {name} returned {len(result)} results for query: {query[:50]}...", flush=True)
                all_results.extend(result)
            else:
                print(f"[SEARCH] {name} returned unexpected type: {type(result)}", flush=True)

        return all_results

    def _to_results(
        self,
        provider: str,
        query: str,
        items,
        url_key: str,
        snippet_key: str,
        top_k: int,
    ) -> "List[RawSearchResult]":
        """Convert up to ``top_k`` provider items into RawSearchResult objects.

        ``url_key`` / ``snippet_key`` name the item fields holding the link and
        the excerpt. Missing or null fields become empty strings.
        """
        results = []
        for item in (items or [])[:top_k]:
            url = item.get(url_key) or ""
            results.append(RawSearchResult(
                provider=provider,
                query_used=query,
                title=item.get("title") or "",
                url=url,
                snippet=item.get(snippet_key) or "",
                domain=self._extract_domain(url),
                raw_data=item,
            ))
        return results

    async def _search_tavily(
        self, client: "httpx.AsyncClient", query: str, top_k: int
    ) -> "List[RawSearchResult]":
        """Search via the Tavily API.

        NOTE: the request carries no locale/region parameter, so Tavily
        results are not region-biased.

        Raises:
            httpx.HTTPStatusError: on a non-2xx response.
        """
        response = await client.post(
            "https://api.tavily.com/search",
            json={
                "query": query,
                "max_results": top_k,
                "search_depth": "basic",
            },
            headers={
                "Authorization": f"Bearer {settings.TAVILY_API_KEY}",
                "Content-Type": "application/json",
            },
        )
        response.raise_for_status()
        data = response.json()
        return self._to_results(
            "tavily", query, data.get("results", []), "url", "content", top_k
        )

    async def _search_serper(
        self, client: "httpx.AsyncClient", query: str, top_k: int
    ) -> "List[RawSearchResult]":
        """Search via Serper.dev (Google Search API) with locale biasing [C2].

        Raises:
            httpx.HTTPStatusError: on a non-2xx response.
        """
        from app.services.locale_service import get_search_region_params

        # Presumably a mapping with optional "gl"/"hl" entries — verify against
        # locale_service; defaults below apply when they are absent.
        region_params = get_search_region_params(self._locale)
        response = await client.post(
            "https://google.serper.dev/search",
            json={
                "q": query,
                "num": top_k,
                "gl": region_params.get("gl", "us"),
                "hl": region_params.get("hl", "en"),
            },
            headers={"X-API-KEY": settings.SERPER_API_KEY},
        )
        response.raise_for_status()
        data = response.json()
        return self._to_results(
            "serper", query, data.get("organic", []), "link", "snippet", top_k
        )

    async def _search_google_cse(
        self, client: "httpx.AsyncClient", query: str, top_k: int
    ) -> "List[RawSearchResult]":
        """Search via Google Custom Search Engine with Canada/Quebec locale bias [C2].

        Raises:
            httpx.HTTPStatusError: on a non-2xx response.
        """
        # Build locale-aware query with site restrictions
        locale_query = self._build_locale_query(query)

        response = await client.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": settings.GOOGLE_CSE_API_KEY,
                "cx": settings.GOOGLE_CSE_ID,
                "q": locale_query,
                # Keep the page size within 1..10 whatever top_k is.
                "num": max(1, min(top_k, 10)),
                "cr": "countryCA",  # Restrict to Canada
                # Language restriction uses the bare language code, so a
                # region-tagged locale such as "fr-CA" becomes "lang_fr".
                "lr": f"lang_{self._language_code()}",
            },
        )
        response.raise_for_status()
        data = response.json()
        return self._to_results(
            "google_cse", query, data.get("items", []), "link", "snippet", top_k
        )

    def _build_locale_query(self, query: str) -> str:
        """Add locale-specific site restrictions to focus on Canadian/Quebec sources [C2].

        Returns ``"(<query>) (site:a OR site:b ...)"`` with at most ten sites;
        Quebec sites come first when the locale's language is French.
        """
        # Authoritative Canadian sources for political fact-checking
        ca_sites = [
            "gc.ca", "canada.ca", "parl.ca", "elections.ca",
            "cbc.ca", "radio-canada.ca", "ctvnews.ca", "globalnews.ca",
            "theglobeandmail.com", "macleans.ca", "hilltimes.com",
            "policyalternatives.ca",
        ]
        qc_sites = [
            "qc.ca", "assnat.qc.ca", "electionsquebec.qc.ca",
            "lapresse.ca", "ledevoir.com",
        ]

        if self._language_code() == "fr":
            sites = qc_sites + ca_sites
        else:
            sites = ca_sites + qc_sites

        # Add site: operators (Google CSE supports this)
        site_filter = " OR ".join(f"site:{s}" for s in sites[:10])
        return f"({query}) ({site_filter})"

    async def _check_faiss_cache(
        self, queries: List[str]
    ) -> "List[RawSearchResult]":
        """Check FAISS Tier-1 cache for semantically similar verified claims.

        Currently a stub that always returns an empty list.
        """
        # In production, embed queries and search FAISS.
        # For now, return empty — FAISS cache populated in Loop 2 (Phase 8).
        return []

    @staticmethod
    def _extract_domain(url: str) -> str:
        """Extract the lowercase host name from a URL, without a leading "www.".

        Port and userinfo are not part of the result. Returns "" when the
        input is empty, not a string, has no host, or cannot be parsed.
        """
        from urllib.parse import urlparse

        if not url or not isinstance(url, str):
            return ""
        try:
            # hostname (unlike netloc) drops "user:pw@" and ":port" and is
            # already lowercased.
            host = urlparse(url).hostname or ""
        except ValueError:
            return ""
        if host.startswith("www."):
            host = host[4:]
        return host


# Singleton
blind_retriever = BlindRetriever()
@@ -0,0 +1,124 @@
1
+ # Copyright (c) 2026 Jinan Kordab
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ """
5
+ Stage 2e: Canonical Evidence Mapper [C6]
6
+ ===========================================
7
+ Normalizes heterogeneous search provider formats into a unified
8
+ CanonicalEvidence schema.
9
+
10
+ The Replay Architecture's Canonical Mapping pattern applied to evidence:
11
+ Tavily → CanonicalEvidence
12
+ Serper → CanonicalEvidence
13
+ Google CSE → CanonicalEvidence
14
+ FAISS cache → CanonicalEvidence
15
+
16
+ All downstream stages (3, 4) operate on CanonicalEvidence ONLY.
17
+ No stage-specific format handling. Adding a new provider requires
18
+ ONLY a new adapter — zero changes to the verification pipeline.
19
+ """
20
+
21
+ from dataclasses import dataclass, field
22
+ from datetime import datetime
23
+ from typing import List, Optional
24
+ from uuid import UUID
25
+
26
+ from pipeline.stage2.blind_retriever import RawSearchResult
27
+
28
+
29
def _utc_now_naive_iso() -> str:
    """Return the current UTC time as an ISO 8601 string without an offset.

    Produces the same format as ``datetime.utcnow().isoformat()`` without
    calling ``utcnow()``, which is deprecated since Python 3.12.
    """
    # Local import: this module only imports ``datetime`` at the top.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


@dataclass
class CanonicalEvidence:
    """
    Unified evidence schema — the single format for all downstream stages.

    Schema [C6]:
        - title: Source title/headline
        - url: Full URL to the source
        - excerpt: Relevant text excerpt from the source
        - source_domain: Domain name (e.g., bbc.com)
        - credibility_score: 0.0–1.0 per-domain credibility [J1b]
        - retrieval_ts: ISO 8601 timestamp of retrieval, in UTC and written
          without a UTC offset; defaults to the time the object is created
        - query_used: The search query that found this source
        - provider: Which search provider returned this (tavily, serper, google_cse, faiss)
        - db_id: PK of the persisted Source row — set by Stage 2 after insert,
          consumed by Stage 3 to populate classifications.source_id (FK).
          Ephemeral, not serialized into the JSONB blob.
    """
    title: str
    url: str
    excerpt: str
    source_domain: str
    credibility_score: float
    retrieval_ts: str = field(default_factory=_utc_now_naive_iso)
    query_used: str = ""
    provider: str = ""
    db_id: Optional[UUID] = None
56
+
57
+
58
class CanonicalEvidenceMapper:
    """
    Normalizes raw search results from any provider into CanonicalEvidence.

    This is the ONLY place that handles provider-specific formats.
    Downstream stages (classification, verdict) never see raw provider data.
    """

    async def map_results(
        self,
        raw_results: List[RawSearchResult],
        credibility_scorer=None,  # CredibilityScorer instance
    ) -> List[CanonicalEvidence]:
        """
        Map all raw results to canonical format, attaching credibility scores.

        Args:
            raw_results: Raw results from BlindRetriever.
            credibility_scorer: Object exposing an awaitable
                ``score_domain(domain)``; when omitted every result gets 0.5.

        Returns:
            List of CanonicalEvidence objects, most credible first. The sort
            is stable, so results with equal scores keep their input order.
        """
        canonical_list = []

        for raw in raw_results:
            # Resolve credibility score
            cred_score = 0.5  # Default
            if credibility_scorer:
                cred_score = await credibility_scorer.score_domain(raw.domain)

            canonical_list.append(
                CanonicalEvidence(
                    title=raw.title,
                    url=raw.url,
                    excerpt=raw.snippet,
                    source_domain=raw.domain,
                    credibility_score=cred_score,
                    query_used=raw.query_used,
                    provider=raw.provider,
                )
            )

        # Sort by credibility score descending (most credible first)
        canonical_list.sort(key=lambda x: x.credibility_score, reverse=True)

        return canonical_list

    @staticmethod
    def to_dict(evidence: CanonicalEvidence) -> dict:
        """Serialize CanonicalEvidence to dict for JSONB storage.

        ``db_id`` is intentionally left out of the result.
        """
        return {
            "title": evidence.title,
            "url": evidence.url,
            "excerpt": evidence.excerpt,
            "source_domain": evidence.source_domain,
            "credibility_score": evidence.credibility_score,
            "retrieval_ts": evidence.retrieval_ts,
            "query_used": evidence.query_used,
            "provider": evidence.provider,
        }

    @staticmethod
    def from_dict(data: dict) -> CanonicalEvidence:
        """Deserialize CanonicalEvidence from JSONB dict.

        Keys that are not CanonicalEvidence fields are ignored, so a stored
        blob carrying extra keys still loads.

        Raises:
            TypeError: if a required field (title, url, excerpt,
                source_domain, credibility_score) is missing.
        """
        # Local import: the module imports only ``dataclass`` and ``field``.
        from dataclasses import fields

        # db_id is not stored in JSONB (it's the row PK itself); strip if present.
        allowed = {f.name for f in fields(CanonicalEvidence)} - {"db_id"}
        clean = {k: v for k, v in data.items() if k in allowed}
        return CanonicalEvidence(**clean)
@@ -0,0 +1,85 @@
1
+ # Copyright (c) 2026 Jinan Kordab
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ """
5
+ Stage 2c: Per-Domain Credibility Scorer [J1b]
6
+ ================================================
7
+ Veracity's second jewel: continuous 0–1 credibility scoring per source domain.
8
+ Uses Lin et al. (2023) composite domain quality database.
9
+
10
+ Unlike Aletheia's binary blacklist, this provides a continuous, auditable metric.
11
+ Users see not just WHICH sources were used, but HOW credible each one is.
12
+ """
13
+
14
+ from typing import Optional
15
+ from sqlalchemy.ext.asyncio import AsyncSession
16
+ from sqlalchemy import select
17
+ from app.db.models import Domain
18
+
19
+
20
class CredibilityScorer:
    """
    Scores each source domain on a continuous 0–1 credibility scale.

    Jewel [J1b] — Veracity's per-domain credibility:
        Transforms source transparency from binary (blocked/allowed) into
        a continuous, auditable metric that educates users about source quality
        while holding the system accountable.
    """

    # Domain credibility scores are fetched from PostgreSQL domains table.
    # Unknown domains get a default score of 0.5 (neutral, flagged for review).

    # Score returned for any domain without a stored score.
    DEFAULT_SCORE: float = 0.5
    # Kept for compatibility; not read by this class (DEFAULT_SCORE is used).
    UNKNOWN_DOMAIN_SCORE: float = 0.5
    # Minimum score for is_reliable() to answer True.
    RELIABILITY_THRESHOLD: float = 0.6

    def __init__(self, db: "Optional[AsyncSession]" = None):
        """
        Args:
            db: Session used for score lookups. Without one, every domain
                that is not already cached scores DEFAULT_SCORE.
        """
        self.db = db
        self._cache: dict[str, float] = {}  # In-memory cache per session

    async def score_domain(self, domain: str) -> float:
        """
        Get credibility score for a domain.

        The domain is lowercased and stripped before lookup. An empty or
        ``None`` domain scores DEFAULT_SCORE without touching the database.

        Returns:
            The cached or stored score as a float, or DEFAULT_SCORE when the
            domain has no stored score. Stored scores are expected to lie in
            0.0–1.0 (1.0 = highest credibility); no clamping is applied here.
        """
        domain = (domain or "").lower().strip()
        if not domain:
            # Nothing to look up; avoid a database round trip.
            return self.DEFAULT_SCORE

        # Check in-memory cache
        if domain in self._cache:
            return self._cache[domain]

        # Query PostgreSQL domains table
        if self.db:
            result = await self.db.execute(
                select(Domain.credibility_score).where(
                    Domain.domain_name == domain
                )
            )
            score = result.scalar()
            if score is not None:
                value = float(score)
                self._cache[domain] = value
                return value

        # Unknown domain → default neutral score
        self._cache[domain] = self.DEFAULT_SCORE
        return self.DEFAULT_SCORE

    async def score_domains(
        self, domains: list[str]
    ) -> dict[str, float]:
        """Batch-score multiple domains.

        Returns a dict keyed by the domain strings exactly as passed in.
        Lookups run one after another on the shared session.
        """
        scores = {}
        for domain in domains:
            scores[domain] = await self.score_domain(domain)
        return scores

    async def is_reliable(
        self, domain: str, threshold: Optional[float] = None
    ) -> bool:
        """Quick binary check: is this domain generally reliable?

        Args:
            domain: Domain to check.
            threshold: Minimum score counted as reliable; defaults to
                RELIABILITY_THRESHOLD (0.6).
        """
        if threshold is None:
            threshold = self.RELIABILITY_THRESHOLD
        score = await self.score_domain(domain)
        return score >= threshold

    def flush_cache(self):
        """Clear in-memory cache (useful between sessions)."""
        self._cache.clear()