mcp-agentic-pipelines 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +93 -0
- package/README.md +258 -0
- package/package.json +70 -0
- package/packages/clinical/package.json +22 -0
- package/packages/clinical/src/index.ts +262 -0
- package/packages/clinical/tsconfig.json +13 -0
- package/packages/core/package.json +21 -0
- package/packages/core/src/config.ts +138 -0
- package/packages/core/src/errors.ts +100 -0
- package/packages/core/src/index.ts +104 -0
- package/packages/core/src/llm-config.ts +213 -0
- package/packages/core/src/logging.ts +66 -0
- package/packages/core/src/python-bridge.ts +384 -0
- package/packages/core/src/rate-limiter.ts +136 -0
- package/packages/core/src/types.ts +203 -0
- package/packages/core/src/validation.ts +101 -0
- package/packages/core/tsconfig.json +10 -0
- package/packages/deeppipe/package.json +21 -0
- package/packages/deeppipe/src/index.ts +424 -0
- package/packages/deeppipe/tsconfig.json +13 -0
- package/packages/piste/package.json +20 -0
- package/packages/piste/src/index.ts +48 -0
- package/packages/piste/tsconfig.json +13 -0
- package/packages/precis/package.json +20 -0
- package/packages/precis/src/index.ts +67 -0
- package/packages/precis/tsconfig.json +13 -0
- package/packages/server/package.json +31 -0
- package/packages/server/src/index.ts +427 -0
- package/packages/server/tsconfig.json +17 -0
- package/setup.mjs +141 -0
- package/test.mjs +337 -0
- package/vendors/clinical-intake/pipeline.mjs +349 -0
- package/vendors/clinical-intake/questions/en.txt +9 -0
- package/vendors/clinical-intake/questions/fr.txt +9 -0
- package/vendors/piste/.env.example +73 -0
- package/vendors/piste/app/core/__init__.py +4 -0
- package/vendors/piste/app/core/config.py +83 -0
- package/vendors/piste/app/core/debuglog.py +16 -0
- package/vendors/piste/app/core/middleware.py +40 -0
- package/vendors/piste/bridge_piste.py +301 -0
- package/vendors/piste/pipeline/__init__.py +4 -0
- package/vendors/piste/pipeline/compiler.py +68 -0
- package/vendors/piste/pipeline/offline/__init__.py +28 -0
- package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
- package/vendors/piste/pipeline/replay.py +15 -0
- package/vendors/piste/pipeline/replay_engine.py +249 -0
- package/vendors/piste/pipeline/signatures/__init__.py +4 -0
- package/vendors/piste/pipeline/signatures/signatures.py +136 -0
- package/vendors/piste/pipeline/stage1/__init__.py +21 -0
- package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
- package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
- package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
- package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
- package/vendors/piste/pipeline/stage2/__init__.py +34 -0
- package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
- package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
- package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
- package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
- package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
- package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
- package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
- package/vendors/piste/pipeline/stage3/__init__.py +20 -0
- package/vendors/piste/pipeline/stage3/classifier.py +79 -0
- package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
- package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
- package/vendors/piste/pipeline/stage4/__init__.py +33 -0
- package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
- package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
- package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
- package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
- package/vendors/piste/requirements.txt +53 -0
- package/vendors/precis/backend/__init__.py +6 -0
- package/vendors/precis/backend/agents/__init__.py +3 -0
- package/vendors/precis/backend/agents/data_synthesis.py +105 -0
- package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
- package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
- package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
- package/vendors/precis/backend/agents/guardrail.py +175 -0
- package/vendors/precis/backend/agents/query_expander.py +89 -0
- package/vendors/precis/backend/agents/radial_interpol.py +99 -0
- package/vendors/precis/backend/agents/report_generator.py +92 -0
- package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
- package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
- package/vendors/precis/backend/agents/vector_index.py +123 -0
- package/vendors/precis/backend/agents/veri_score.py +341 -0
- package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
- package/vendors/precis/backend/api/__init__.py +3 -0
- package/vendors/precis/backend/api/routes/__init__.py +3 -0
- package/vendors/precis/backend/config.py +88 -0
- package/vendors/precis/backend/core/__init__.py +13 -0
- package/vendors/precis/backend/core/hashing.py +22 -0
- package/vendors/precis/backend/core/metrics.py +77 -0
- package/vendors/precis/backend/core/multitoken.py +166 -0
- package/vendors/precis/backend/core/pmi.py +54 -0
- package/vendors/precis/backend/core/stemming.py +74 -0
- package/vendors/precis/backend/core/tracing.py +150 -0
- package/vendors/precis/backend/data/__init__.py +3 -0
- package/vendors/precis/backend/data/chunker.py +57 -0
- package/vendors/precis/backend/data/pdf_parser.py +42 -0
- package/vendors/precis/backend/db/__init__.py +3 -0
- package/vendors/precis/backend/db/models.py +173 -0
- package/vendors/precis/backend/db/repository.py +269 -0
- package/vendors/precis/backend/llm/__init__.py +3 -0
- package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
- package/vendors/precis/backend/llm/base.py +147 -0
- package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
- package/vendors/precis/backend/llm/factory.py +60 -0
- package/vendors/precis/backend/llm/google_provider.py +39 -0
- package/vendors/precis/backend/llm/ollama_provider.py +54 -0
- package/vendors/precis/backend/llm/openai_provider.py +50 -0
- package/vendors/precis/backend/main.py +677 -0
- package/vendors/precis/backend/orchestrator/__init__.py +3 -0
- package/vendors/precis/backend/orchestrator/planner.py +81 -0
- package/vendors/precis/backend/orchestrator/router.py +319 -0
- package/vendors/precis/backend/orchestrator/types.py +58 -0
- package/vendors/precis/bridge_precis.py +185 -0
- package/vendors/precis/data/sample_reports/README.md +8 -0
- package/vendors/precis/data/seed_data.py +115 -0
- package/vendors/precis/requirements.txt +19 -0
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
# Copyright (c) 2026 Jinan Kordab
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Unit Tests — Stage 2: Blind Retrieval
|
|
6
|
+
=======================================
|
|
7
|
+
Tests SearchDecisionGenerator [J1], BlindRetriever [J2],
|
|
8
|
+
CredibilityScorer [J1b], QueryRefiner [J8c], CanonicalEvidenceMapper [C6].
|
|
9
|
+
|
|
10
|
+
Run: pytest pipeline/stage2/test_stage2.py -v
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import pytest
|
|
14
|
+
from unittest.mock import patch, MagicMock, AsyncMock
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# ============================================================
|
|
18
|
+
# Canonical Evidence Mapper Tests [C6]
|
|
19
|
+
# ============================================================
|
|
20
|
+
|
|
21
|
+
class TestCanonicalEvidenceMapper:
    """Checks that raw provider results are normalized into CanonicalEvidence."""

    @pytest.fixture
    def mapper(self):
        """Provide a fresh CanonicalEvidenceMapper for each test."""
        from pipeline.stage2.canonical_mapper import CanonicalEvidenceMapper

        return CanonicalEvidenceMapper()

    @pytest.fixture
    def tavily_result(self):
        """Provide one raw search hit attributed to the "tavily" provider."""
        from pipeline.stage2.blind_retriever import RawSearchResult

        fields = {
            "provider": "tavily",
            "query_used": "price of oil June 2008",
            "title": "Oil reaches record high in 2008",
            "url": "https://reuters.com/oil-2008",
            "snippet": "Crude oil prices reached a record $145 per barrel...",
            "domain": "reuters.com",
        }
        return RawSearchResult(**fields)

    def test_map_tavily_result(self, mapper, tavily_result):
        """A single Tavily hit maps to one evidence item carrying its fields."""
        import asyncio

        mapped = asyncio.run(mapper.map_results([tavily_result]))

        assert len(mapped) == 1
        evidence = mapped[0]
        expected = {
            "title": "Oil reaches record high in 2008",
            "url": "https://reuters.com/oil-2008",
            "source_domain": "reuters.com",
            "provider": "tavily",
            "query_used": "price of oil June 2008",
        }
        for attr, value in expected.items():
            assert getattr(evidence, attr) == value, attr

    def test_map_results_sorts_by_credibility(self, mapper):
        """Mapped results come back ordered by credibility, highest first."""
        from pipeline.stage2.blind_retriever import RawSearchResult
        import asyncio

        # Fixed per-domain scores handed out by the stand-in scorer below.
        scores = {"high.com": 0.95, "mid.com": 0.60, "low.com": 0.25}

        class FixedScorer:
            async def score_domain(self, domain):
                return scores[domain]

        # Deliberately supplied out of score order.
        hits = [
            RawSearchResult("tavily", "q", "Low cred", "http://a.com", "...", "low.com"),
            RawSearchResult("serper", "q", "High cred", "http://b.com", "...", "high.com"),
            RawSearchResult("tavily", "q", "Mid cred", "http://c.com", "...", "mid.com"),
        ]

        ranked = asyncio.run(mapper.map_results(hits, FixedScorer()))

        top_three = [ranked[i].source_domain for i in range(3)]
        assert top_three == ["high.com", "mid.com", "low.com"]

    def test_to_dict_from_dict_roundtrip(self, mapper):
        """to_dict followed by from_dict preserves the evidence fields."""
        from pipeline.stage2.canonical_mapper import CanonicalEvidence

        original = CanonicalEvidence(
            title="Test",
            url="https://example.com",
            excerpt="An excerpt.",
            source_domain="example.com",
            credibility_score=0.85,
            query_used="test query",
            provider="tavily",
        )

        restored = mapper.from_dict(mapper.to_dict(original))

        assert restored.title == "Test"
        assert restored.credibility_score == 0.85
        assert restored.source_domain == "example.com"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# ============================================================
|
|
99
|
+
# Credibility Scorer Tests [J1b]
|
|
100
|
+
# ============================================================
|
|
101
|
+
|
|
102
|
+
class TestCredibilityScorer:
    """Test per-domain credibility scoring."""

    def test_default_score_for_unknown_domain(self):
        """The scorer exposes 0.5 as its default score."""
        from pipeline.stage2.credibility_scorer import CredibilityScorer

        scorer = CredibilityScorer(db=None)
        # Only the constant is pinned here; no lookup is performed.
        assert scorer.DEFAULT_SCORE == 0.5

    def test_cache_hit_after_first_lookup(self):
        """A domain already present in the cache is served from it."""
        from pipeline.stage2.credibility_scorer import CredibilityScorer
        import asyncio

        scorer = CredibilityScorer(db=None)
        scorer._cache["test.com"] = 0.88
        score = asyncio.run(scorer.score_domain("test.com"))
        assert score == 0.88

    def test_is_reliable_threshold(self):
        """High-scoring domains are reliable; low-scoring ones are not."""
        from pipeline.stage2.credibility_scorer import CredibilityScorer
        # asyncio is not imported at module level, so it must be imported
        # here; without it this test failed with NameError before asserting.
        import asyncio

        scorer = CredibilityScorer(db=None)
        scorer._cache["good.com"] = 0.92
        scorer._cache["bad.com"] = 0.30
        assert asyncio.run(scorer.is_reliable("good.com")) is True
        assert asyncio.run(scorer.is_reliable("bad.com")) is False
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ============================================================
|
|
129
|
+
# Search Decision Generator Tests [J1]
|
|
130
|
+
# ============================================================
|
|
131
|
+
|
|
132
|
+
class TestSearchDecisionGenerator:
    """Structural checks for the search decision generator."""

    def test_known_fact_skips_search(self):
        """The generator exposes its decision and query-generation members.

        The actual skip/search decision depends on an LLM, so only the
        presence of the two members is verified here.
        """
        from pipeline.stage2.search_decision import SearchDecisionGenerator

        generator = SearchDecisionGenerator()
        for member in ("decide", "generate_queries"):
            assert getattr(generator, member) is not None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ============================================================
|
|
146
|
+
# Query Refiner Tests [J8c]
|
|
147
|
+
# ============================================================
|
|
148
|
+
|
|
149
|
+
class TestQueryRefiner:
    """Checks the query refiner's insufficiency analysis (Loop 1)."""

    @pytest.fixture
    def refiner(self):
        """Provide a fresh QueryRefiner for each test."""
        from pipeline.stage2.query_refiner import QueryRefiner

        return QueryRefiner()

    def test_analyze_no_results(self, refiner):
        """An empty result list yields the dedicated "no results" reason."""
        explanation = refiner.analyze_insufficiency([], "test claim")
        assert "No search results" in explanation

    def test_analyze_low_credibility(self, refiner):
        """Results that are all low-credibility are flagged as such."""

        class LowCredibilityHit:
            # Stand-in exposing only the attribute set by this test.
            credibility_score = 0.2

        hits = [LowCredibilityHit() for _ in range(5)]
        explanation = refiner.analyze_insufficiency(hits, "test claim")
        assert "low-credibility" in explanation.lower()

    def test_analyze_few_results(self, refiner):
        """Two credible results are still reported as too few."""

        class CredibleHit:
            credibility_score = 0.8

        hits = [CredibleHit(), CredibleHit()]
        explanation = refiner.analyze_insufficiency(hits, "test claim")
        assert "Only 2 results" in explanation

    def test_max_refined_queries_capped(self, refiner):
        """The refiner has a positive retry budget and a refine member.

        The cap of 3 refined queries itself is not exercised here.
        """
        assert refiner.max_retries > 0
        assert refiner.refine is not None
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# ============================================================
|
|
187
|
+
# Blind Retriever Tests [J2]
|
|
188
|
+
# ============================================================
|
|
189
|
+
|
|
190
|
+
class TestBlindRetriever:
    """Test blind retrieval architecture."""

    def test_domain_extraction(self):
        """Domain extraction strips a leading "www." and keeps subdomains."""
        from pipeline.stage2.blind_retriever import BlindRetriever

        retriever = BlindRetriever()

        cases = {
            "https://www.bbc.com/news": "bbc.com",
            "https://reuters.com/article/1": "reuters.com",
            "http://sub.domain.co.uk/path": "sub.domain.co.uk",
        }
        for url, expected in cases.items():
            assert retriever._extract_domain(url) == expected, url

    def test_deduplication_by_url(self):
        """Duplicate URLs collapse to their first occurrence.

        This test mirrors the dedup rule locally (lower-cased URL with any
        trailing slash stripped); it does not call into BlindRetriever. The
        unused retriever instance and asyncio import were removed so the test
        depends only on RawSearchResult.
        """
        from pipeline.stage2.blind_retriever import RawSearchResult

        results = [
            RawSearchResult("tavily", "q", "A", "https://example.com", "...", "example.com"),
            RawSearchResult("serper", "q", "A dup", "https://example.com", "...", "example.com"),
            RawSearchResult("tavily", "q", "B", "https://other.com", "...", "other.com"),
        ]

        # Simulate dedup logic: first result wins for each normalized URL.
        seen = set()
        unique = []
        for r in results:
            norm = r.url.lower().rstrip("/")
            if norm not in seen:
                seen.add(norm)
                unique.append(r)

        assert len(unique) == 2
        assert unique[0].url == "https://example.com"
        assert unique[1].url == "https://other.com"
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# ============================================================
|
|
229
|
+
# Stage2Result Tests
|
|
230
|
+
# ============================================================
|
|
231
|
+
|
|
232
|
+
class TestStage2Result:
    """Checks construction of the Stage2Result dataclass."""

    def test_skipped_search_result(self):
        """A skipped search carries no queries and no canonical evidence."""
        from pipeline.stage2.orchestrator import Stage2Result

        outcome = Stage2Result(
            atomic_claim="Water boils at 100°C.",
            needs_search=False,
            search_queries=[],
            search_reasoning="Well-known scientific fact.",
            skipped_search=True,
        )

        assert outcome.needs_search is False
        assert outcome.skipped_search is True
        assert len(outcome.canonical_evidence) == 0

    def test_search_with_retry_result(self):
        """A retried search keeps its retry bookkeeping and evidence."""
        from pipeline.stage2.orchestrator import Stage2Result
        from pipeline.stage2.canonical_mapper import CanonicalEvidence

        evidence = [CanonicalEvidence("S1", "http://a.com", "...", "a.com", 0.9)]
        outcome = Stage2Result(
            atomic_claim="Complex claim.",
            needs_search=True,
            search_queries=["neutral query"],
            search_reasoning="Requires external evidence.",
            canonical_evidence=evidence,
            retry_count=1,
            retry_queries=["refined query"],
            insufficient_reason="Initial results insufficient.",
        )

        assert outcome.retry_count == 1
        assert len(outcome.retry_queries) == 1
        assert len(outcome.canonical_evidence) == 1
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Copyright (c) 2026 Jinan Kordab
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
# Stage 3 — Per-Source Classification [J3][J8]
|
|
5
|
+
# Classifier: pipeline/stage3/classifier.py
|
|
6
|
+
# Orchestrator: pipeline/stage3/orchestrator.py (asyncio.gather parallelism)
|
|
7
|
+
|
|
8
|
+
from pipeline.stage3.classifier import SourceClassifier, source_classifier
|
|
9
|
+
from pipeline.stage3.orchestrator import (
|
|
10
|
+
Stage3Orchestrator, Stage3Result, ClassificationResult, stage3_orchestrator,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
# Public names re-exported by the stage3 package.
__all__ = [
    # From pipeline.stage3.classifier
    "SourceClassifier",
    "source_classifier",
    # From pipeline.stage3.orchestrator
    "Stage3Orchestrator",
    "Stage3Result",
    "ClassificationResult",
    "stage3_orchestrator",
]
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Copyright (c) 2026 Jinan Kordab
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Stage 3: Per-Source Classification [J3][J8]
|
|
6
|
+
==============================================
|
|
7
|
+
Aletheia's jewel: each evidence source is independently classified
|
|
8
|
+
as SUPPORTS, REFUTES, or UNRELATED to the claim BEFORE aggregation.
|
|
9
|
+
|
|
10
|
+
Run in parallel via asyncio.gather — N classifiers = N simultaneous LLM calls.
|
|
11
|
+
This provides:
|
|
12
|
+
- Full audit trail: every source's contribution is explicit
|
|
13
|
+
- Debuggability: errors isolated to single classifications
|
|
14
|
+
- Parallelism: independent evaluations run concurrently
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import dspy
|
|
18
|
+
from typing import Tuple
|
|
19
|
+
from pipeline.signatures.signatures import SourceClassificationSignature
|
|
20
|
+
from pipeline.stage2.canonical_mapper import CanonicalEvidence
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SourceClassifier(dspy.Module):
    """
    DSPy module that classifies a single evidence source relative to a claim.

    Jewel [J3] — Aletheia's structured per-source classification:
    - Each source evaluated independently
    - Label: SUPPORTS / REFUTES / UNRELATED
    - Returns confidence + rationale for audit trail [C5]
    """

    def __init__(self):
        super().__init__()
        self.classify = dspy.ChainOfThought(SourceClassificationSignature)

    def forward(
        self,
        claim: str,
        evidence: CanonicalEvidence,
        locale: str = "en",
    ) -> Tuple[str, float, str]:
        """
        Classify one source against the claim.

        Args:
            claim: The atomic claim being verified.
            evidence: A single CanonicalEvidence source.
            locale: Language locale for the response (en, fr).

        Returns:
            label: "SUPPORTS", "REFUTES", or "UNRELATED"
            confidence: 0.0–1.0 (0.0 when the model's value is unparseable)
            rationale: Explanation of the classification
        """
        result = self.classify(
            claim=claim,
            evidence_title=evidence.title,
            evidence_excerpt=evidence.excerpt,
            source_domain=evidence.source_domain,
            credibility_score=evidence.credibility_score,
            locale=locale,
        )

        label = self._normalize_label(result.label)
        confidence = self._parse_confidence(result.confidence)
        return label, confidence, result.rationale

    @staticmethod
    def _normalize_label(raw) -> str:
        """Map free-form model output onto one of the three valid labels."""
        # A missing label is treated as empty text instead of raising.
        label = str(raw or "").strip().upper()
        if label in ("SUPPORTS", "REFUTES", "UNRELATED"):
            return label
        # Substring match tolerates decorated output such as "Label: SUPPORTS."
        # NOTE(review): negated phrasings ("NOT SUPPORTED") also contain
        # "SUPPORT" and are mapped to SUPPORTS — confirm this is acceptable.
        if "SUPPORT" in label:
            return "SUPPORTS"
        if "REFUT" in label:
            return "REFUTES"
        return "UNRELATED"

    @staticmethod
    def _parse_confidence(raw) -> float:
        """Coerce the model's confidence into a float within [0.0, 1.0].

        Accepts numbers, numeric strings and percentages ("85%"). A value in
        (1, 100] is read as a percentage. Anything that cannot be parsed, and
        NaN, yields 0.0 rather than raising.
        """
        is_percent = False
        if isinstance(raw, str):
            raw = raw.strip()
            if raw.endswith("%"):
                is_percent = True
                raw = raw[:-1]
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return 0.0
        if value != value:  # NaN is the only value unequal to itself
            return 0.0
        if is_percent or 1.0 < value <= 100.0:
            value /= 100.0
        return min(1.0, max(0.0, value))
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# Module-level singleton: one SourceClassifier built at import time and
# imported by the Stage 3 orchestrator.
source_classifier = SourceClassifier()
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# Copyright (c) 2026 Jinan Kordab
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Stage 3 Orchestrator — Per-Source Classification
|
|
6
|
+
===================================================
|
|
7
|
+
Runs N classifiers in parallel via asyncio.gather.
|
|
8
|
+
Each source independently evaluated → SUPPORTS/REFUTES/UNRELATED.
|
|
9
|
+
|
|
10
|
+
Emits SSE events per classification for real-time frontend updates.
|
|
11
|
+
Writes APPEND-ONLY classification records to PostgreSQL [C5].
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import asyncio
import logging
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from typing import Awaitable, Callable, List, Optional

from sqlalchemy.ext.asyncio import AsyncSession

from app.db.models import StageRecord, Classification
from pipeline.stage3.classifier import source_classifier
from pipeline.stage2.canonical_mapper import CanonicalEvidence
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class ClassificationResult:
    """Verdict produced for one evidence source in Stage 3."""

    # Position of the source in the evidence list handed to Stage 3.
    source_index: int
    # URL and domain copied from the classified evidence item.
    source_url: str
    source_domain: str
    # One of: SUPPORTS | REFUTES | UNRELATED
    label: str
    # Confidence and explanation returned by the classifier.
    confidence: float
    rationale: str
    # Credibility score copied from the classified evidence item.
    credibility_score: float
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class Stage3Result:
    """Aggregate outcome of Stage 3 for a single atomic claim."""

    # The claim every source was classified against.
    atomic_claim: str
    # One entry per classified source.
    classifications: list[ClassificationResult]
    # Per-label tallies; each defaults to 0 and is supplied by the caller
    # (this dataclass does not derive them from `classifications`).
    support_count: int = 0
    refute_count: int = 0
    unrelated_count: int = 0
    # Number of evidence sources handed to the stage.
    total_sources: int = 0
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class Stage3Orchestrator:
    """
    Orchestrates Stage 3 — parallel per-source classification.

    Jewel [J3][J8] — Aletheia's structured evaluation:
    Each source classified independently BEFORE aggregation.
    Parallel execution via asyncio.gather reduces latency.

    Failure isolation: a source whose classification raises is recorded as
    UNRELATED with confidence 0.0 and the error in its rationale, so one bad
    source does not discard the others. If every source fails, the first
    error is re-raised so systemic problems still surface to the caller.
    """

    # Both limiters are class attributes: created once when the class body
    # executes and shared by every instance.
    _semaphore: asyncio.Semaphore = asyncio.Semaphore(200)
    _executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=200)

    # Run id written to the ledger when the caller does not supply one.
    _NIL_RUN_ID: uuid.UUID = uuid.UUID("00000000-0000-0000-0000-000000000000")
    # Value stored in the `model_used` column of both record types.
    _MODEL_USED: str = "dspy/source_classifier"

    _logger = logging.getLogger(__name__)

    def __init__(
        self,
        sse_callback: Optional[Callable[[str, dict], Awaitable[None]]] = None,
    ):
        """Store the optional async callback invoked as callback(event_type, data)."""
        self.sse_callback = sse_callback

    async def process(
        self,
        atomic_claim: str,
        evidence_list: List[CanonicalEvidence],
        db: Optional[AsyncSession] = None,
        run_id: Optional[uuid.UUID] = None,
        locale: str = "en",
    ) -> Stage3Result:
        """
        Classify all evidence sources in parallel against one atomic claim.

        Args:
            atomic_claim: The atomic claim to verify.
            evidence_list: CanonicalEvidence sources from Stage 2.
            db: Optional DB session for audit ledger writes.
            run_id: UUID of the analysis run for audit trail [C5]. When
                omitted, the all-zero UUID is recorded instead.
            locale: Language locale forwarded to the classifier.

        Returns:
            Stage3Result with all per-source classifications.

        Raises:
            Exception: The first classifier error, when every source failed.
        """
        await self._emit("stage_3_start", {
            "atomic_claim": atomic_claim,
            "sources_count": len(evidence_list),
        })

        if not evidence_list:
            await self._emit("stage_3_complete", {
                "atomic_claim": atomic_claim,
                "support": 0, "refute": 0, "unrelated": 0,
            })
            return Stage3Result(
                atomic_claim=atomic_claim,
                classifications=[],
                total_sources=0,
            )

        # --- Run N classifiers in parallel ---
        t0 = time.monotonic()

        async def _classify_with_limit(
            idx: int, ev: CanonicalEvidence
        ) -> ClassificationResult:
            async with self._semaphore:
                return await self._classify_single(idx, atomic_claim, ev, locale)

        # return_exceptions=True keeps one failing source from discarding the
        # results of all the others; failures are converted just below.
        outcomes = await asyncio.gather(
            *(_classify_with_limit(i, ev) for i, ev in enumerate(evidence_list)),
            return_exceptions=True,
        )
        results = self._isolate_failures(evidence_list, outcomes)

        latency_total = (time.monotonic() - t0) * 1000

        # --- Tally ---
        support_count = sum(1 for r in results if r.label == "SUPPORTS")
        refute_count = sum(1 for r in results if r.label == "REFUTES")
        unrelated_count = sum(1 for r in results if r.label == "UNRELATED")

        # --- Write to PostgreSQL (append-only) ---
        if db:
            self._record_audit(
                db,
                run_id or self._NIL_RUN_ID,
                atomic_claim,
                evidence_list,
                results,
                {
                    "support": support_count,
                    "refute": refute_count,
                    "unrelated": unrelated_count,
                },
                latency_total,
            )

        # --- Emit SSE events ---
        for r in results:
            await self._emit("source_classified", {
                "source_index": r.source_index,
                "source_url": r.source_url,
                "source_domain": r.source_domain,
                "label": r.label,
                "confidence": r.confidence,
                "credibility_score": r.credibility_score,
            })

        await self._emit("stage_3_complete", {
            "atomic_claim": atomic_claim,
            "support": support_count,
            "refute": refute_count,
            "unrelated": unrelated_count,
        })

        return Stage3Result(
            atomic_claim=atomic_claim,
            classifications=results,
            support_count=support_count,
            refute_count=refute_count,
            unrelated_count=unrelated_count,
            total_sources=len(evidence_list),
        )

    def _isolate_failures(
        self,
        evidence_list: List[CanonicalEvidence],
        outcomes: list,
    ) -> List[ClassificationResult]:
        """Turn per-source exceptions from gather into placeholder results."""
        results: List[ClassificationResult] = []
        failures: List[Exception] = []

        for index, (evidence, outcome) in enumerate(zip(evidence_list, outcomes)):
            if not isinstance(outcome, BaseException):
                results.append(outcome)
                continue
            if not isinstance(outcome, Exception):
                # Cancellation and similar must never be swallowed.
                raise outcome

            failures.append(outcome)
            self._logger.error(
                "Stage 3 classification failed for source %d (%s)",
                index,
                evidence.url,
                exc_info=outcome,
            )
            results.append(ClassificationResult(
                source_index=index,
                source_url=evidence.url,
                source_domain=evidence.source_domain,
                label="UNRELATED",
                confidence=0.0,
                rationale=(
                    f"Classification failed: {type(outcome).__name__}: {outcome}"
                ),
                credibility_score=evidence.credibility_score,
            ))

        # Nothing succeeded: surface the error instead of returning a result
        # made up entirely of placeholders.
        if failures and len(failures) == len(outcomes):
            raise failures[0]
        return results

    def _record_audit(
        self,
        db: AsyncSession,
        run_id: uuid.UUID,
        atomic_claim: str,
        evidence_list: List[CanonicalEvidence],
        results: List[ClassificationResult],
        counts: dict,
        latency_ms: float,
    ) -> None:
        """Add one Classification per result plus one StageRecord to the session."""
        for r in results:
            # Stage 2 attached the Source row PK to the in-memory evidence
            # at insertion time; reuse it here as the FK.
            ev = evidence_list[r.source_index]
            db.add(Classification(
                run_id=run_id,
                source_id=ev.db_id,
                label=r.label,
                confidence=r.confidence,
                rationale=r.rationale,
                model_used=self._MODEL_USED,
            ))

        db.add(StageRecord(
            run_id=run_id,
            stage_name="stage_3",
            input_snapshot={
                "atomic_claim": atomic_claim,
                "sources_count": len(evidence_list),
            },
            output_snapshot={
                "support": counts["support"],
                "refute": counts["refute"],
                "unrelated": counts["unrelated"],
                "classifications": [
                    {"label": r.label, "confidence": r.confidence}
                    for r in results
                ],
            },
            model_used=self._MODEL_USED,
            latency_ms=latency_ms,
            retry_attempt=0,
        ))

    async def _classify_single(
        self,
        index: int,
        atomic_claim: str,
        evidence: CanonicalEvidence,
        locale: str = "en",
    ) -> ClassificationResult:
        """Classify a single source (runs in parallel via asyncio.gather).

        The DSPy source_classifier is synchronous and would block the asyncio
        event loop on every LLM call, forcing serial execution despite the
        gather. We off-load each call to a thread so the event loop stays
        free and N classifications truly run concurrently.
        """
        loop = asyncio.get_running_loop()
        label, confidence, rationale = await loop.run_in_executor(
            self._executor,
            source_classifier,
            atomic_claim,
            evidence,
            locale,
        )
        return ClassificationResult(
            source_index=index,
            source_url=evidence.url,
            source_domain=evidence.source_domain,
            label=label,
            confidence=confidence,
            rationale=rationale,
            credibility_score=evidence.credibility_score,
        )

    async def _emit(self, event_type: str, data: dict):
        """Forward an event to the SSE callback, if one was configured."""
        if self.sse_callback:
            await self.sse_callback(event_type, data)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# Module-level singleton, built at import time with no SSE callback and
# re-exported by the stage3 package.
stage3_orchestrator = Stage3Orchestrator()
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Copyright (c) 2026 Jinan Kordab
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Unit Tests — Stage 3: Per-Source Classification
|
|
6
|
+
=================================================
|
|
7
|
+
Tests SourceClassifier [J3], parallel orchestrator, ClassificationResult.
|
|
8
|
+
|
|
9
|
+
Run: pytest pipeline/stage3/test_stage3.py -v
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import pytest
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TestClassificationResult:
    """Checks construction of ClassificationResult for each label."""

    def test_supports_classification(self):
        """A SUPPORTS result keeps its label and credibility score."""
        from pipeline.stage3.orchestrator import ClassificationResult

        fields = {
            "source_index": 0,
            "source_url": "https://reuters.com/article",
            "source_domain": "reuters.com",
            "label": "SUPPORTS",
            "confidence": 0.92,
            "rationale": "The article confirms the claim with primary data.",
            "credibility_score": 0.95,
        }
        outcome = ClassificationResult(**fields)

        assert outcome.label == "SUPPORTS"
        assert outcome.credibility_score == 0.95

    def test_refutes_classification(self):
        """A REFUTES result keeps its label."""
        from pipeline.stage3.orchestrator import ClassificationResult

        fields = {
            "source_index": 1,
            "source_url": "https://example.com",
            "source_domain": "example.com",
            "label": "REFUTES",
            "confidence": 0.78,
            "rationale": "Data contradicts the claim.",
            "credibility_score": 0.45,
        }
        outcome = ClassificationResult(**fields)

        assert outcome.label == "REFUTES"

    def test_unrelated_classification(self):
        """An UNRELATED result keeps its label."""
        from pipeline.stage3.orchestrator import ClassificationResult

        fields = {
            "source_index": 2,
            "source_url": "https://other.com",
            "source_domain": "other.com",
            "label": "UNRELATED",
            "confidence": 0.95,
            "rationale": "Source is about a different topic.",
            "credibility_score": 0.60,
        }
        outcome = ClassificationResult(**fields)

        assert outcome.label == "UNRELATED"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class TestStage3Result:
    """Test Stage3Result tallying."""

    def test_tally_counts(self):
        """Per-label tallies stored on Stage3Result add up to the source total."""
        from pipeline.stage3.orchestrator import Stage3Result, ClassificationResult

        classifications = [
            ClassificationResult(0, "a.com", "a.com", "SUPPORTS", 0.9, "...", 0.9),
            ClassificationResult(1, "b.com", "b.com", "SUPPORTS", 0.8, "...", 0.8),
            ClassificationResult(2, "c.com", "c.com", "REFUTES", 0.7, "...", 0.7),
            ClassificationResult(3, "d.com", "d.com", "UNRELATED", 0.9, "...", 0.5),
        ]

        # Manual tally, the same way the orchestrator computes it.
        support = sum(1 for c in classifications if c.label == "SUPPORTS")
        refute = sum(1 for c in classifications if c.label == "REFUTES")
        unrelated = sum(1 for c in classifications if c.label == "UNRELATED")

        assert support == 2
        assert refute == 1
        assert unrelated == 1

        # Actually build the dataclass under test; previously Stage3Result
        # was imported here but never instantiated.
        result = Stage3Result(
            atomic_claim="Test",
            classifications=classifications,
            support_count=support,
            refute_count=refute,
            unrelated_count=unrelated,
            total_sources=len(classifications),
        )
        assert result.support_count == 2
        assert result.refute_count == 1
        assert result.unrelated_count == 1
        assert (
            result.support_count + result.refute_count + result.unrelated_count
            == result.total_sources
        )

    def test_empty_evidence(self):
        """With no classifications the counters keep their zero defaults."""
        from pipeline.stage3.orchestrator import Stage3Result

        result = Stage3Result(
            atomic_claim="Test",
            classifications=[],
            total_sources=0,
        )
        assert result.total_sources == 0
        assert result.support_count == 0
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class TestStage3Orchestrator:
    """Test parallel classification orchestration."""

    def test_sse_callback_stored(self):
        """The constructor stores the given callback without invoking it."""
        from pipeline.stage3.orchestrator import Stage3Orchestrator

        calls = []

        async def cb(event, data):
            calls.append((event, data))

        orch = Stage3Orchestrator(sse_callback=cb)

        # Identity check: `is not None` would also pass if a different
        # object had been stored.
        assert orch.sse_callback is cb
        # Construction alone must not emit any event.
        assert calls == []
|