mcp-agentic-pipelines 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.env.example +93 -0
  2. package/README.md +258 -0
  3. package/package.json +70 -0
  4. package/packages/clinical/package.json +22 -0
  5. package/packages/clinical/src/index.ts +262 -0
  6. package/packages/clinical/tsconfig.json +13 -0
  7. package/packages/core/package.json +21 -0
  8. package/packages/core/src/config.ts +138 -0
  9. package/packages/core/src/errors.ts +100 -0
  10. package/packages/core/src/index.ts +104 -0
  11. package/packages/core/src/llm-config.ts +213 -0
  12. package/packages/core/src/logging.ts +66 -0
  13. package/packages/core/src/python-bridge.ts +384 -0
  14. package/packages/core/src/rate-limiter.ts +136 -0
  15. package/packages/core/src/types.ts +203 -0
  16. package/packages/core/src/validation.ts +101 -0
  17. package/packages/core/tsconfig.json +10 -0
  18. package/packages/deeppipe/package.json +21 -0
  19. package/packages/deeppipe/src/index.ts +424 -0
  20. package/packages/deeppipe/tsconfig.json +13 -0
  21. package/packages/piste/package.json +20 -0
  22. package/packages/piste/src/index.ts +48 -0
  23. package/packages/piste/tsconfig.json +13 -0
  24. package/packages/precis/package.json +20 -0
  25. package/packages/precis/src/index.ts +67 -0
  26. package/packages/precis/tsconfig.json +13 -0
  27. package/packages/server/package.json +31 -0
  28. package/packages/server/src/index.ts +427 -0
  29. package/packages/server/tsconfig.json +17 -0
  30. package/setup.mjs +141 -0
  31. package/test.mjs +337 -0
  32. package/vendors/clinical-intake/pipeline.mjs +349 -0
  33. package/vendors/clinical-intake/questions/en.txt +9 -0
  34. package/vendors/clinical-intake/questions/fr.txt +9 -0
  35. package/vendors/piste/.env.example +73 -0
  36. package/vendors/piste/app/core/__init__.py +4 -0
  37. package/vendors/piste/app/core/config.py +83 -0
  38. package/vendors/piste/app/core/debuglog.py +16 -0
  39. package/vendors/piste/app/core/middleware.py +40 -0
  40. package/vendors/piste/bridge_piste.py +301 -0
  41. package/vendors/piste/pipeline/__init__.py +4 -0
  42. package/vendors/piste/pipeline/compiler.py +68 -0
  43. package/vendors/piste/pipeline/offline/__init__.py +28 -0
  44. package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
  45. package/vendors/piste/pipeline/replay.py +15 -0
  46. package/vendors/piste/pipeline/replay_engine.py +249 -0
  47. package/vendors/piste/pipeline/signatures/__init__.py +4 -0
  48. package/vendors/piste/pipeline/signatures/signatures.py +136 -0
  49. package/vendors/piste/pipeline/stage1/__init__.py +21 -0
  50. package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
  51. package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
  52. package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
  53. package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
  54. package/vendors/piste/pipeline/stage2/__init__.py +34 -0
  55. package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
  56. package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
  57. package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
  58. package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
  59. package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
  60. package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
  61. package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
  62. package/vendors/piste/pipeline/stage3/__init__.py +20 -0
  63. package/vendors/piste/pipeline/stage3/classifier.py +79 -0
  64. package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
  65. package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
  66. package/vendors/piste/pipeline/stage4/__init__.py +33 -0
  67. package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
  68. package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
  69. package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
  70. package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
  71. package/vendors/piste/requirements.txt +53 -0
  72. package/vendors/precis/backend/__init__.py +6 -0
  73. package/vendors/precis/backend/agents/__init__.py +3 -0
  74. package/vendors/precis/backend/agents/data_synthesis.py +105 -0
  75. package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
  76. package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
  77. package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
  78. package/vendors/precis/backend/agents/guardrail.py +175 -0
  79. package/vendors/precis/backend/agents/query_expander.py +89 -0
  80. package/vendors/precis/backend/agents/radial_interpol.py +99 -0
  81. package/vendors/precis/backend/agents/report_generator.py +92 -0
  82. package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
  83. package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
  84. package/vendors/precis/backend/agents/vector_index.py +123 -0
  85. package/vendors/precis/backend/agents/veri_score.py +341 -0
  86. package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
  87. package/vendors/precis/backend/api/__init__.py +3 -0
  88. package/vendors/precis/backend/api/routes/__init__.py +3 -0
  89. package/vendors/precis/backend/config.py +88 -0
  90. package/vendors/precis/backend/core/__init__.py +13 -0
  91. package/vendors/precis/backend/core/hashing.py +22 -0
  92. package/vendors/precis/backend/core/metrics.py +77 -0
  93. package/vendors/precis/backend/core/multitoken.py +166 -0
  94. package/vendors/precis/backend/core/pmi.py +54 -0
  95. package/vendors/precis/backend/core/stemming.py +74 -0
  96. package/vendors/precis/backend/core/tracing.py +150 -0
  97. package/vendors/precis/backend/data/__init__.py +3 -0
  98. package/vendors/precis/backend/data/chunker.py +57 -0
  99. package/vendors/precis/backend/data/pdf_parser.py +42 -0
  100. package/vendors/precis/backend/db/__init__.py +3 -0
  101. package/vendors/precis/backend/db/models.py +173 -0
  102. package/vendors/precis/backend/db/repository.py +269 -0
  103. package/vendors/precis/backend/llm/__init__.py +3 -0
  104. package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
  105. package/vendors/precis/backend/llm/base.py +147 -0
  106. package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
  107. package/vendors/precis/backend/llm/factory.py +60 -0
  108. package/vendors/precis/backend/llm/google_provider.py +39 -0
  109. package/vendors/precis/backend/llm/ollama_provider.py +54 -0
  110. package/vendors/precis/backend/llm/openai_provider.py +50 -0
  111. package/vendors/precis/backend/main.py +677 -0
  112. package/vendors/precis/backend/orchestrator/__init__.py +3 -0
  113. package/vendors/precis/backend/orchestrator/planner.py +81 -0
  114. package/vendors/precis/backend/orchestrator/router.py +319 -0
  115. package/vendors/precis/backend/orchestrator/types.py +58 -0
  116. package/vendors/precis/bridge_precis.py +185 -0
  117. package/vendors/precis/data/sample_reports/README.md +8 -0
  118. package/vendors/precis/data/seed_data.py +115 -0
  119. package/vendors/precis/requirements.txt +19 -0
@@ -0,0 +1,265 @@
1
+ # Copyright (c) 2026 Jinan Kordab
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ """
5
+ Unit Tests — Stage 2: Blind Retrieval
6
+ =======================================
7
+ Tests SearchDecisionGenerator [J1], BlindRetriever [J2],
8
+ CredibilityScorer [J1b], QueryRefiner [J8c], CanonicalEvidenceMapper [C6].
9
+
10
+ Run: pytest pipeline/stage2/test_stage2.py -v
11
+ """
12
+
13
+ import pytest
14
+ from unittest.mock import patch, MagicMock, AsyncMock
15
+
16
+
17
+ # ============================================================
18
+ # Canonical Evidence Mapper Tests [C6]
19
+ # ============================================================
20
+
21
class TestCanonicalEvidenceMapper:
    """Checks that raw provider hits are normalized into CanonicalEvidence."""

    @pytest.fixture
    def mapper(self):
        """Provide a fresh CanonicalEvidenceMapper for each test."""
        from pipeline.stage2.canonical_mapper import CanonicalEvidenceMapper

        instance = CanonicalEvidenceMapper()
        return instance

    @pytest.fixture
    def tavily_result(self):
        """Provide one Tavily-sourced RawSearchResult about 2008 oil prices."""
        from pipeline.stage2.blind_retriever import RawSearchResult

        fields = {
            "provider": "tavily",
            "query_used": "price of oil June 2008",
            "title": "Oil reaches record high in 2008",
            "url": "https://reuters.com/oil-2008",
            "snippet": "Crude oil prices reached a record $145 per barrel...",
            "domain": "reuters.com",
        }
        return RawSearchResult(**fields)

    def test_map_tavily_result(self, mapper, tavily_result):
        """A single Tavily hit maps to one evidence item with matching fields."""
        import asyncio

        mapped = asyncio.run(mapper.map_results([tavily_result]))

        assert len(mapped) == 1
        evidence = mapped[0]
        assert evidence.title == "Oil reaches record high in 2008"
        assert evidence.url == "https://reuters.com/oil-2008"
        assert evidence.source_domain == "reuters.com"
        assert evidence.provider == "tavily"
        assert evidence.query_used == "price of oil June 2008"

    def test_map_results_sorts_by_credibility(self, mapper):
        """Mapped evidence comes back ordered by credibility, highest first."""
        from pipeline.stage2.blind_retriever import RawSearchResult
        import asyncio

        # (provider, title, url, domain) — deliberately not in score order.
        hits = (
            ("tavily", "Low cred", "http://a.com", "low.com"),
            ("serper", "High cred", "http://b.com", "high.com"),
            ("tavily", "Mid cred", "http://c.com", "mid.com"),
        )
        raw = [
            RawSearchResult(provider, "q", title, url, "...", domain)
            for provider, title, url, domain in hits
        ]

        domain_scores = {"high.com": 0.95, "mid.com": 0.60, "low.com": 0.25}

        class MockScorer:
            """Stand-in scorer that answers from a fixed table."""

            async def score_domain(self, domain):
                return domain_scores[domain]

        ranked = asyncio.run(mapper.map_results(raw, MockScorer()))

        expected_order = ("high.com", "mid.com", "low.com")
        for position, expected_domain in enumerate(expected_order):
            assert ranked[position].source_domain == expected_domain

    def test_to_dict_from_dict_roundtrip(self, mapper):
        """to_dict followed by from_dict preserves the checked fields."""
        from pipeline.stage2.canonical_mapper import CanonicalEvidence

        original = CanonicalEvidence(
            title="Test",
            url="https://example.com",
            excerpt="An excerpt.",
            source_domain="example.com",
            credibility_score=0.85,
            query_used="test query",
            provider="tavily",
        )

        restored = mapper.from_dict(mapper.to_dict(original))

        assert restored.title == "Test"
        assert restored.credibility_score == 0.85
        assert restored.source_domain == "example.com"
96
+
97
+
98
+ # ============================================================
99
+ # Credibility Scorer Tests [J1b]
100
+ # ============================================================
101
+
102
class TestCredibilityScorer:
    """Test per-domain credibility scoring."""

    def test_default_score_for_unknown_domain(self):
        """The scorer exposes a DEFAULT_SCORE of 0.5."""
        from pipeline.stage2.credibility_scorer import CredibilityScorer

        scorer = CredibilityScorer(db=None)
        # Only the constant is pinned here; no domain lookup is performed.
        assert scorer.DEFAULT_SCORE == 0.5

    def test_cache_hit_after_first_lookup(self):
        """A domain already present in the cache is returned as-is."""
        from pipeline.stage2.credibility_scorer import CredibilityScorer
        import asyncio

        scorer = CredibilityScorer(db=None)
        scorer._cache["test.com"] = 0.88
        score = asyncio.run(scorer.score_domain("test.com"))
        assert score == 0.88

    def test_is_reliable_threshold(self):
        """is_reliable is True for a 0.92 domain and False for a 0.30 one."""
        from pipeline.stage2.credibility_scorer import CredibilityScorer
        # asyncio is not imported at module level, so it must be imported
        # here; without it this test fails with NameError before asserting.
        import asyncio

        scorer = CredibilityScorer(db=None)
        scorer._cache["good.com"] = 0.92
        scorer._cache["bad.com"] = 0.30
        assert asyncio.run(scorer.is_reliable("good.com")) is True
        assert asyncio.run(scorer.is_reliable("bad.com")) is False
126
+
127
+
128
+ # ============================================================
129
+ # Search Decision Generator Tests [J1]
130
+ # ============================================================
131
+
132
class TestSearchDecisionGenerator:
    """Structural checks for the search decision generator."""

    def test_known_fact_skips_search(self):
        """The generator exposes its decide and generate_queries members."""
        from pipeline.stage2.search_decision import SearchDecisionGenerator

        generator = SearchDecisionGenerator()
        # The real decision comes from an LLM, so only the presence of the
        # two members is verified here.
        for member in ("decide", "generate_queries"):
            assert getattr(generator, member) is not None
143
+
144
+
145
+ # ============================================================
146
+ # Query Refiner Tests [J8c]
147
+ # ============================================================
148
+
149
class TestQueryRefiner:
    """Checks the insufficiency analysis used by query refinement (Loop 1)."""

    @pytest.fixture
    def refiner(self):
        """Provide a fresh QueryRefiner for each test."""
        from pipeline.stage2.query_refiner import QueryRefiner

        instance = QueryRefiner()
        return instance

    def test_analyze_no_results(self, refiner):
        """An empty result list is reported as having no search results."""
        message = refiner.analyze_insufficiency([], "test claim")
        assert "No search results" in message

    def test_analyze_low_credibility(self, refiner):
        """Five results all scored 0.2 are flagged as low-credibility."""

        class LowCredibilityHit:
            credibility_score = 0.2

        hits = []
        for _ in range(5):
            hits.append(LowCredibilityHit())

        message = refiner.analyze_insufficiency(hits, "test claim")
        assert "low-credibility" in message.lower()

    def test_analyze_few_results(self, refiner):
        """Two credible results are still reported as too few."""

        class CredibleHit:
            credibility_score = 0.8

        hits = [CredibleHit(), CredibleHit()]
        message = refiner.analyze_insufficiency(hits, "test claim")
        assert "Only 2 results" in message

    def test_max_refined_queries_capped(self, refiner):
        """The refiner has a positive retry budget and a refine member."""
        # The cap itself is applied inside the module; this only checks that
        # the attributes involved exist.
        assert refiner.max_retries > 0
        assert refiner.refine is not None
184
+
185
+
186
+ # ============================================================
187
+ # Blind Retriever Tests [J2]
188
+ # ============================================================
189
+
190
class TestBlindRetriever:
    """Checks URL handling around the blind retriever."""

    def test_domain_extraction(self):
        """_extract_domain drops a leading www. but keeps other subdomains."""
        from pipeline.stage2.blind_retriever import BlindRetriever

        retriever = BlindRetriever()

        cases = (
            ("https://www.bbc.com/news", "bbc.com"),
            ("https://reuters.com/article/1", "reuters.com"),
            ("http://sub.domain.co.uk/path", "sub.domain.co.uk"),
        )
        for url, expected_domain in cases:
            assert retriever._extract_domain(url) == expected_domain

    def test_deduplication_by_url(self):
        """Only the first result for each normalized URL is kept."""
        import asyncio
        from pipeline.stage2.blind_retriever import BlindRetriever, RawSearchResult

        retriever = BlindRetriever()
        hits = [
            RawSearchResult("tavily", "q", "A", "https://example.com", "...", "example.com"),
            RawSearchResult("serper", "q", "A dup", "https://example.com", "...", "example.com"),
            RawSearchResult("tavily", "q", "B", "https://other.com", "...", "other.com"),
        ]

        # Local re-creation of the dedup rule: the key is the lower-cased URL
        # without trailing slashes, and the first occurrence wins.
        first_by_url = {}
        for hit in hits:
            first_by_url.setdefault(hit.url.lower().rstrip("/"), hit)
        unique = list(first_by_url.values())

        assert len(unique) == 2
        assert unique[0].url == "https://example.com"
        assert unique[1].url == "https://other.com"
226
+
227
+
228
+ # ============================================================
229
+ # Stage2Result Tests
230
+ # ============================================================
231
+
232
class TestStage2Result:
    """Checks construction of the Stage2Result dataclass."""

    def test_skipped_search_result(self):
        """A skipped-search result holds no canonical evidence."""
        from pipeline.stage2.orchestrator import Stage2Result

        fields = {
            "atomic_claim": "Water boils at 100°C.",
            "needs_search": False,
            "search_queries": [],
            "search_reasoning": "Well-known scientific fact.",
            "skipped_search": True,
        }
        outcome = Stage2Result(**fields)

        assert outcome.needs_search is False
        assert outcome.skipped_search is True
        assert len(outcome.canonical_evidence) == 0

    def test_search_with_retry_result(self):
        """Retry metadata and evidence are stored as passed in."""
        from pipeline.stage2.orchestrator import Stage2Result
        from pipeline.stage2.canonical_mapper import CanonicalEvidence

        evidence = [CanonicalEvidence("S1", "http://a.com", "...", "a.com", 0.9)]
        outcome = Stage2Result(
            atomic_claim="Complex claim.",
            needs_search=True,
            search_queries=["neutral query"],
            search_reasoning="Requires external evidence.",
            canonical_evidence=evidence,
            retry_count=1,
            retry_queries=["refined query"],
            insufficient_reason="Initial results insufficient.",
        )

        assert outcome.retry_count == 1
        assert len(outcome.retry_queries) == 1
        assert len(outcome.canonical_evidence) == 1
@@ -0,0 +1,20 @@
1
+ # Copyright (c) 2026 Jinan Kordab
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ # Stage 3 — Per-Source Classification [J3][J8]
5
+ # Classifier: pipeline/stage3/classifier.py
6
+ # Orchestrator: pipeline/stage3/orchestrator.py (asyncio.gather parallelism)
7
+
8
+ from pipeline.stage3.classifier import SourceClassifier, source_classifier
9
+ from pipeline.stage3.orchestrator import (
10
+ Stage3Orchestrator, Stage3Result, ClassificationResult, stage3_orchestrator,
11
+ )
12
+
13
+ __all__ = [
14
+ "SourceClassifier",
15
+ "source_classifier",
16
+ "Stage3Orchestrator",
17
+ "Stage3Result",
18
+ "ClassificationResult",
19
+ "stage3_orchestrator",
20
+ ]
@@ -0,0 +1,79 @@
1
+ # Copyright (c) 2026 Jinan Kordab
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ """
5
+ Stage 3: Per-Source Classification [J3][J8]
6
+ ==============================================
7
+ Aletheia's jewel: each evidence source is independently classified
8
+ as SUPPORTS, REFUTES, or UNRELATED to the claim BEFORE aggregation.
9
+
10
+ Run in parallel via asyncio.gather — N classifiers = N simultaneous LLM calls.
11
+ This provides:
12
+ - Full audit trail: every source's contribution is explicit
13
+ - Debuggability: errors isolated to single classifications
14
+ - Parallelism: independent evaluations run concurrently
15
+ """
16
+
17
+ import dspy
18
+ from typing import Tuple
19
+ from pipeline.signatures.signatures import SourceClassificationSignature
20
+ from pipeline.stage2.canonical_mapper import CanonicalEvidence
21
+
22
+
23
class SourceClassifier(dspy.Module):
    """
    DSPy module that classifies a single evidence source relative to a claim.

    Jewel [J3] — Aletheia's structured per-source classification:
    - Each source evaluated independently
    - Label: SUPPORTS / REFUTES / UNRELATED
    - Returns confidence + rationale for audit trail [C5]
    """

    def __init__(self):
        super().__init__()
        self.classify = dspy.ChainOfThought(SourceClassificationSignature)

    def forward(
        self,
        claim: str,
        evidence: CanonicalEvidence,
        locale: str = "en",
    ) -> Tuple[str, float, str]:
        """
        Classify one source against the claim.

        Args:
            claim: The atomic claim being verified.
            evidence: A single CanonicalEvidence source.
            locale: Language locale for the response (en, fr).

        Returns:
            label: "SUPPORTS", "REFUTES", or "UNRELATED"
            confidence: 0.0–1.0 (0.0 when the model's value cannot be parsed)
            rationale: Explanation of the classification
        """
        result = self.classify(
            claim=claim,
            evidence_title=evidence.title,
            evidence_excerpt=evidence.excerpt,
            source_domain=evidence.source_domain,
            credibility_score=evidence.credibility_score,
            locale=locale,
        )

        label = self._normalize_label(result.label)
        confidence = self._parse_confidence(result.confidence)
        return label, confidence, result.rationale

    @staticmethod
    def _normalize_label(raw) -> str:
        """Map free-form model output onto one of the three valid labels."""
        # A missing label is treated as empty text and ends up UNRELATED.
        label = str(raw or "").strip().upper()
        if label in ("SUPPORTS", "REFUTES", "UNRELATED"):
            return label
        if "SUPPORT" in label:
            return "SUPPORTS"
        if "REFUT" in label:
            return "REFUTES"
        return "UNRELATED"

    @staticmethod
    def _parse_confidence(raw) -> float:
        """Coerce model output to a float in [0.0, 1.0].

        Model output is free text, so values such as "85%" or "0.9 (high)"
        are tolerated instead of raising. Anything unparseable becomes 0.0.
        """
        try:
            value = float(raw)
        except (TypeError, ValueError):
            tokens = str(raw).split()
            token = tokens[0] if tokens else ""
            is_percent = token.endswith("%")
            try:
                value = float(token.rstrip("%"))
            except ValueError:
                return 0.0
            if is_percent:
                value /= 100.0
        if value != value:  # NaN never equals itself
            return 0.0
        return max(0.0, min(1.0, value))
76
+
77
+
78
+ # Singleton
79
+ source_classifier = SourceClassifier()
@@ -0,0 +1,225 @@
1
+ # Copyright (c) 2026 Jinan Kordab
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ """
5
+ Stage 3 Orchestrator — Per-Source Classification
6
+ ===================================================
7
+ Runs N classifiers in parallel via asyncio.gather.
8
+ Each source independently evaluated → SUPPORTS/REFUTES/UNRELATED.
9
+
10
+ Emits one SSE event per classification, after all classifications have completed, for frontend updates.
11
+ Writes APPEND-ONLY classification records to PostgreSQL [C5].
12
+ """
13
+
14
+ import asyncio
15
+ import time
16
+ import uuid
17
+ from concurrent.futures import ThreadPoolExecutor
18
+ from typing import Optional, List
19
+ from dataclasses import dataclass, field
20
+
21
+ from sqlalchemy.ext.asyncio import AsyncSession
22
+
23
+ from app.db.models import StageRecord, Classification
24
+ from pipeline.stage3.classifier import source_classifier
25
+ from pipeline.stage2.canonical_mapper import CanonicalEvidence
26
+
27
+
28
@dataclass
class ClassificationResult:
    """Verdict produced for one evidence source."""

    # Position of the source within the evidence list that was classified.
    source_index: int
    # URL and domain copied from the evidence item.
    source_url: str
    source_domain: str
    # One of SUPPORTS, REFUTES or UNRELATED.
    label: str
    # Confidence reported by the classifier for this label.
    confidence: float
    # Classifier's explanation for the label.
    rationale: str
    # Credibility score copied from the evidence item.
    credibility_score: float
38
+
39
+
40
@dataclass
class Stage3Result:
    """Aggregate outcome of Stage 3 for one atomic claim."""

    # The claim the sources were classified against.
    atomic_claim: str
    # One entry per classified source.
    classifications: list[ClassificationResult]
    # Number of classifications carrying each label.
    support_count: int = 0
    refute_count: int = 0
    unrelated_count: int = 0
    # Number of evidence sources handed to the stage.
    total_sources: int = 0
49
+
50
+
51
class Stage3Orchestrator:
    """
    Coordinates Stage 3: one classification per evidence source.

    Jewel [J3][J8] — every source is classified on its own, before any
    aggregation, and all classifications are awaited together through
    asyncio.gather.
    """

    # Shared by all instances: cap on classifications in flight, and the
    # thread pool the synchronous classifier is off-loaded to.
    _semaphore: asyncio.Semaphore = asyncio.Semaphore(200)
    _executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=200)

    def __init__(self, sse_callback: Optional[callable] = None):
        # Optional async callable invoked as sse_callback(event_type, data).
        self.sse_callback = sse_callback

    async def process(
        self,
        atomic_claim: str,
        evidence_list: List[CanonicalEvidence],
        db: Optional[AsyncSession] = None,
        run_id: Optional[uuid.UUID] = None,
        locale: str = "en",
    ) -> Stage3Result:
        """
        Classify every evidence source against one atomic claim.

        Args:
            atomic_claim: The atomic claim to verify.
            evidence_list: CanonicalEvidence sources from Stage 2.
            db: Optional DB session; when truthy, audit rows are added to it.
            run_id: UUID of the analysis run for the audit trail [C5].
            locale: Locale forwarded to the classifier.

        Returns:
            Stage3Result holding the per-source classifications and tallies.
        """
        await self._emit(
            "stage_3_start",
            {"atomic_claim": atomic_claim, "sources_count": len(evidence_list)},
        )

        # Nothing to classify: report an all-zero completion and stop.
        if not evidence_list:
            await self._emit(
                "stage_3_complete",
                {"atomic_claim": atomic_claim, "support": 0, "refute": 0, "unrelated": 0},
            )
            return Stage3Result(
                atomic_claim=atomic_claim,
                classifications=[],
                total_sources=0,
            )

        # Fan out one bounded classification per source and time the batch.
        started = time.monotonic()
        outcomes: List[ClassificationResult] = await asyncio.gather(
            *(
                self._classify_bounded(position, atomic_claim, source, locale)
                for position, source in enumerate(evidence_list)
            )
        )
        elapsed_ms = (time.monotonic() - started) * 1000

        labels = [outcome.label for outcome in outcomes]
        tally = {
            "support": labels.count("SUPPORTS"),
            "refute": labels.count("REFUTES"),
            "unrelated": labels.count("UNRELATED"),
        }

        if db:
            self._record_audit(
                db, run_id, atomic_claim, evidence_list, outcomes, tally, elapsed_ms
            )

        # SSE: one event per source, then the stage summary.
        for outcome in outcomes:
            await self._emit(
                "source_classified",
                {
                    "source_index": outcome.source_index,
                    "source_url": outcome.source_url,
                    "source_domain": outcome.source_domain,
                    "label": outcome.label,
                    "confidence": outcome.confidence,
                    "credibility_score": outcome.credibility_score,
                },
            )

        await self._emit("stage_3_complete", {"atomic_claim": atomic_claim, **tally})

        return Stage3Result(
            atomic_claim=atomic_claim,
            classifications=outcomes,
            support_count=tally["support"],
            refute_count=tally["refute"],
            unrelated_count=tally["unrelated"],
            total_sources=len(evidence_list),
        )

    async def _classify_bounded(
        self, idx: int, claim: str, ev: CanonicalEvidence, loc: str
    ) -> ClassificationResult:
        """Run one classification while holding a slot of the shared semaphore."""
        async with self._semaphore:
            return await self._classify_single(idx, claim, ev, loc)

    def _record_audit(
        self,
        db,
        run_id,
        atomic_claim,
        evidence_list,
        outcomes,
        tally,
        latency_ms,
    ) -> None:
        """Add one Classification row per outcome, then one StageRecord row."""
        # A missing run_id falls back to the all-zero UUID.
        ledger_run_id = run_id or uuid.UUID("00000000-0000-0000-0000-000000000000")

        for outcome in outcomes:
            # The evidence item's db_id is used as the foreign key.
            source = evidence_list[outcome.source_index]
            db.add(Classification(
                run_id=ledger_run_id,
                source_id=source.db_id,
                label=outcome.label,
                confidence=outcome.confidence,
                rationale=outcome.rationale,
                model_used="dspy/source_classifier",
            ))

        db.add(StageRecord(
            run_id=ledger_run_id,
            stage_name="stage_3",
            input_snapshot={
                "atomic_claim": atomic_claim,
                "sources_count": len(evidence_list),
            },
            output_snapshot={
                **tally,
                "classifications": [
                    {"label": outcome.label, "confidence": outcome.confidence}
                    for outcome in outcomes
                ],
            },
            model_used="dspy/source_classifier",
            latency_ms=latency_ms,
            retry_attempt=0,
        ))

    async def _classify_single(
        self,
        index: int,
        atomic_claim: str,
        evidence: CanonicalEvidence,
        locale: str = "en",
    ) -> ClassificationResult:
        """Classify one source on the shared thread pool.

        The classifier call is handed to the executor rather than invoked
        directly in the coroutine, so the event loop is not blocked while
        it runs.
        """
        running_loop = asyncio.get_running_loop()
        verdict = await running_loop.run_in_executor(
            self._executor,
            source_classifier,
            atomic_claim,
            evidence,
            locale,
        )
        label, confidence, rationale = verdict
        return ClassificationResult(
            source_index=index,
            source_url=evidence.url,
            source_domain=evidence.source_domain,
            label=label,
            confidence=confidence,
            rationale=rationale,
            credibility_score=evidence.credibility_score,
        )

    async def _emit(self, event_type: str, data: dict):
        """Forward an event to the SSE callback, if one is configured."""
        if not self.sse_callback:
            return
        await self.sse_callback(event_type, data)
222
+
223
+
224
+ # Singleton
225
+ stage3_orchestrator = Stage3Orchestrator()
@@ -0,0 +1,101 @@
1
+ # Copyright (c) 2026 Jinan Kordab
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ """
5
+ Unit Tests — Stage 3: Per-Source Classification
6
+ =================================================
7
+ Tests SourceClassifier [J3], parallel orchestrator, ClassificationResult.
8
+
9
+ Run: pytest pipeline/stage3/test_stage3.py -v
10
+ """
11
+
12
+ import pytest
13
+
14
+
15
class TestClassificationResult:
    """Checks construction of ClassificationResult for each label."""

    def test_supports_classification(self):
        """A SUPPORTS result keeps its label and credibility score."""
        from pipeline.stage3.orchestrator import ClassificationResult

        fields = {
            "source_index": 0,
            "source_url": "https://reuters.com/article",
            "source_domain": "reuters.com",
            "label": "SUPPORTS",
            "confidence": 0.92,
            "rationale": "The article confirms the claim with primary data.",
            "credibility_score": 0.95,
        }
        verdict = ClassificationResult(**fields)

        assert verdict.label == "SUPPORTS"
        assert verdict.credibility_score == 0.95

    def test_refutes_classification(self):
        """A REFUTES result keeps its label."""
        from pipeline.stage3.orchestrator import ClassificationResult

        fields = {
            "source_index": 1,
            "source_url": "https://example.com",
            "source_domain": "example.com",
            "label": "REFUTES",
            "confidence": 0.78,
            "rationale": "Data contradicts the claim.",
            "credibility_score": 0.45,
        }
        verdict = ClassificationResult(**fields)

        assert verdict.label == "REFUTES"

    def test_unrelated_classification(self):
        """An UNRELATED result keeps its label."""
        from pipeline.stage3.orchestrator import ClassificationResult

        fields = {
            "source_index": 2,
            "source_url": "https://other.com",
            "source_domain": "other.com",
            "label": "UNRELATED",
            "confidence": 0.95,
            "rationale": "Source is about a different topic.",
            "credibility_score": 0.60,
        }
        verdict = ClassificationResult(**fields)

        assert verdict.label == "UNRELATED"
57
+
58
+
59
class TestStage3Result:
    """Checks label tallying and Stage3Result defaults."""

    def test_tally_counts(self):
        """Counting labels by hand gives 2 SUPPORTS, 1 REFUTES, 1 UNRELATED."""
        from pipeline.stage3.orchestrator import Stage3Result, ClassificationResult

        # (domain, label, confidence, credibility) for each source in order.
        rows = (
            ("a.com", "SUPPORTS", 0.9, 0.9),
            ("b.com", "SUPPORTS", 0.8, 0.8),
            ("c.com", "REFUTES", 0.7, 0.7),
            ("d.com", "UNRELATED", 0.9, 0.5),
        )
        classifications = [
            ClassificationResult(position, domain, domain, label, confidence, "...", credibility)
            for position, (domain, label, confidence, credibility) in enumerate(rows)
        ]

        # Tally by hand; Stage3Result is not involved in the counting.
        labels = [item.label for item in classifications]

        assert labels.count("SUPPORTS") == 2
        assert labels.count("REFUTES") == 1
        assert labels.count("UNRELATED") == 1

    def test_empty_evidence(self):
        """With no classifications the counters keep their zero defaults."""
        from pipeline.stage3.orchestrator import Stage3Result

        empty = Stage3Result(atomic_claim="Test", classifications=[], total_sources=0)

        assert empty.total_sources == 0
        assert empty.support_count == 0
90
+
91
+
92
class TestStage3Orchestrator:
    """Checks construction of the Stage 3 orchestrator."""

    def test_sse_callback_stored(self):
        """A callback passed to the constructor is kept on the instance."""
        from pipeline.stage3.orchestrator import Stage3Orchestrator

        received = []

        async def record_event(event, data):
            received.append((event, data))

        orchestrator = Stage3Orchestrator(sse_callback=record_event)

        assert orchestrator.sse_callback is not None