@jayarrowz/mcp-arsr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,56 @@
1
+ import type { ARSRConfig, Claim, ScoredClaim, ClaimEvidence } from "../types.js";
2
+ /**
3
+ * Generate an initial draft response to a query.
4
+ * Uses web search to ground the draft in real data — this prevents
5
+ * the "I don't know" refusal problem where the inner model punts
6
+ * on questions outside its training data.
7
+ *
8
+ * Returns the draft text and a structured is_refusal flag classified
9
+ * by the inner LLM (replacing brittle string matching).
10
+ */
11
+ export declare function generateDraft(query: string, context?: string, config?: ARSRConfig): Promise<{
12
+ draft: string;
13
+ is_refusal: boolean;
14
+ }>;
15
+ /**
16
+ * Decompose a draft into individually verifiable atomic claims.
17
+ * If the draft is a refusal/non-answer, extracts claims from the
18
+ * original query context instead so the loop can still retrieve evidence.
19
+ */
20
+ export declare function decomposeClaims(draft: string, originalQuery?: string, isRefusal?: boolean, config?: ARSRConfig): Promise<Claim[]>;
21
+ /**
22
+ * Score claims by asking the inner LLM for a per-claim confidence and entropy estimate.
23
+ * Results are labelled "semantic_entropy"; this implementation does not sample rephrasings.
24
+ */
25
+ export declare function scoreClaims(claims: Claim[], config?: ARSRConfig): Promise<ScoredClaim[]>;
26
+ /**
27
+ * For low-confidence claims, generate adversarial search queries and retrieve evidence.
28
+ * Uses the inner LLM + web search to find supporting/contradicting sources.
29
+ */
30
+ export declare function retrieveEvidence(claims: ScoredClaim[], strategy?: string, config?: ARSRConfig): Promise<ClaimEvidence[]>;
31
+ /**
32
+ * Revise the draft based on evidence, returning the new text + change log.
33
+ * If the original draft was a refusal/non-answer, writes a completely new
34
+ * response from the evidence instead of trying to edit the refusal.
35
+ */
36
+ export declare function reviseDraft(draft: string, evidence: ClaimEvidence[], scored: ScoredClaim[], originalQuery?: string, isRefusal?: boolean, config?: ARSRConfig): Promise<{
37
+ revised: string;
38
+ changes: Array<{
39
+ claim_id: string;
40
+ action: string;
41
+ original: string;
42
+ revised: string;
43
+ reason: string;
44
+ }>;
45
+ conflicts: Array<{
46
+ claim_id: string;
47
+ description: string;
48
+ }>;
49
+ }>;
50
+ /**
51
+ * Decide whether to continue the refinement loop.
52
+ */
53
+ export declare function shouldContinue(iteration: number, scored: ScoredClaim[], maxIterations: number, confidenceThreshold: number, previousAvgConfidence: number | null): Promise<{
54
+ decision: "continue" | "stop";
55
+ reason: string;
56
+ }>;
@@ -0,0 +1,361 @@
1
+ import Anthropic from "@anthropic-ai/sdk";
2
+ import { DEFAULT_CONFIG } from "../types.js";
3
/** Module-wide Anthropic SDK client; stays null until the first request needs it. */
let client = null;

/**
 * Return the shared Anthropic client, constructing it on first use so that
 * importing this module never instantiates the SDK by itself.
 */
function getClient() {
    if (client) {
        return client;
    }
    client = new Anthropic(); // Uses ANTHROPIC_API_KEY env var
    return client;
}
10
/**
 * Send a single-turn prompt to the inner model and return its text reply.
 *
 * Only the first text block of the response is used; an empty string is
 * returned when the response carries no text block at all.
 */
async function askInner(system, user, config = DEFAULT_CONFIG) {
    const { content } = await getClient().messages.create({
        model: config.inner_model,
        max_tokens: 4096,
        system,
        messages: [{ role: "user", content: user }],
    });
    for (const block of content) {
        if (block.type === "text") {
            return block.text;
        }
    }
    return "";
}
21
/**
 * Send a single-turn prompt to the inner model with the web_search tool
 * enabled.
 *
 * Returns the concatenation of every text block in the response, plus the
 * `{ url, title }` pairs of any inline citations attached to those blocks.
 */
async function askInnerWithSearch(system, user, config = DEFAULT_CONFIG) {
    const response = await getClient().messages.create({
        model: config.inner_model,
        max_tokens: 4096,
        system,
        messages: [{ role: "user", content: user }],
        tools: [{ type: "web_search_20250305", name: "web_search" }],
    });
    const textBlocks = response.content.filter((block) => block.type === "text");
    // Join the text of all text blocks in response order.
    const text = textBlocks.reduce((joined, block) => joined + block.text, "");
    // Collect inline citations, keeping only entries that expose both url and title.
    const citations = textBlocks
        .filter((block) => "citations" in block && Array.isArray(block.citations))
        .flatMap((block) => block.citations)
        .filter((cite) => "url" in cite && "title" in cite)
        .map((cite) => ({ url: cite.url, title: cite.title }));
    return { text, citations };
}
51
/**
 * Find the index of the bracket that closes the JSON value opening at `start`.
 * Brackets inside double-quoted strings are ignored. Returns -1 when the
 * value is never closed.
 */
function findBalancedJSONEnd(text, start) {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
        const ch = text[i];
        if (inString) {
            if (escaped) {
                escaped = false;
            }
            else if (ch === "\\") {
                escaped = true;
            }
            else if (ch === '"') {
                inString = false;
            }
            continue;
        }
        if (ch === '"') {
            inString = true;
        }
        else if (ch === "{" || ch === "[") {
            depth++;
        }
        else if (ch === "}" || ch === "]") {
            depth--;
            if (depth === 0) {
                return i;
            }
        }
    }
    return -1;
}
/**
 * Extract and parse the JSON object or array embedded in raw LLM output.
 *
 * Markdown code fences are stripped first. The span from the first opening
 * bracket to the last closing bracket is tried as-is; if that does not parse
 * (typically because the model surrounded the payload with prose that itself
 * contains brackets, e.g. "Sources [1] say: {...}"), every balanced
 * bracketed span is tried and the longest one that parses is returned.
 *
 * @param raw Raw text returned by the model.
 * @returns The parsed JSON value.
 * @throws {Error} When the text contains no bracketed span at all.
 * @throws {SyntaxError} When bracketed text exists but none of it is valid JSON.
 */
function extractJSON(raw) {
    // Strip markdown fences if present
    const cleaned = raw
        .replace(/```json\s*/gi, "")
        .replace(/```\s*/g, "")
        .trim();
    const greedyMatch = cleaned.match(/[\[{][\s\S]*[\]}]/);
    if (!greedyMatch) {
        throw new Error(`No JSON found in LLM output: ${cleaned.slice(0, 200)}`);
    }
    let greedyError;
    try {
        return JSON.parse(greedyMatch[0]);
    }
    catch (err) {
        // Remember the failure; it is rethrown if the fallback scan finds nothing.
        greedyError = err;
    }
    // Fallback: scan for balanced spans and keep the longest one that parses,
    // so a stray "[1]" in surrounding prose does not win over the real payload.
    let best = null;
    let bestLength = 0;
    let pos = 0;
    while (pos < cleaned.length) {
        const ch = cleaned[pos];
        if (ch !== "{" && ch !== "[") {
            pos++;
            continue;
        }
        const end = findBalancedJSONEnd(cleaned, pos);
        if (end === -1) {
            pos++;
            continue;
        }
        const candidate = cleaned.slice(pos, end + 1);
        try {
            const value = JSON.parse(candidate);
            if (candidate.length > bestLength) {
                best = { value };
                bestLength = candidate.length;
            }
            // Values nested inside a span that already parsed are not separate payloads.
            pos = end + 1;
        }
        catch {
            pos++;
        }
    }
    if (best) {
        return best.value;
    }
    throw greedyError;
}
64
/**
 * Ask the inner LLM whether a draft is a refusal / non-answer.
 *
 * Resolves to true only when the model's JSON verdict has `is_refusal`
 * strictly equal to true. If the verdict cannot be parsed, the draft is
 * treated as a real answer (false).
 */
async function classifyRefusal(draft, config = DEFAULT_CONFIG) {
    const classifierPrompt = `You are a response classifier. Determine whether the given text is a REFUSAL or NON-ANSWER.

A refusal is any response that:
- Says it cannot, does not, or is unable to provide the information
- Redirects the user to check another source instead of answering
- Provides only generic suggestions instead of a direct answer
- Hedges so heavily that no substantive information is conveyed

Respond with ONLY a JSON object: { "is_refusal": true } or { "is_refusal": false }`;
    const reply = await askInner(classifierPrompt, `Classify this response:\n\n${draft}`, config);
    try {
        const verdict = extractJSON(reply);
        return verdict.is_refusal === true;
    }
    catch {
        // Unparseable verdict: assume the draft is a genuine answer.
        return false;
    }
}
87
/**
 * Produce the first-pass answer to a query.
 *
 * The draft is generated with web search enabled and a prompt that forbids
 * deflecting answers. It is then classified by the inner LLM, and the result
 * is returned as a structured `is_refusal` flag alongside the draft text.
 */
export async function generateDraft(query, context, config = DEFAULT_CONFIG) {
    // Optional caller-supplied context is appended after the rule list.
    const contextSection = context ? `\nAdditional context:\n${context}` : "";
    const system = `You are a helpful, accurate research assistant. Your job is to ANSWER the user's question with specific, concrete facts.

CRITICAL RULES:
- You MUST search the web to find the answer. Do NOT rely on memory alone.
- You MUST provide a direct, substantive answer with specific facts, numbers, and details.
- NEVER say "I don't have access to" or "I recommend checking" or "I cannot provide". These are failures.
- NEVER give a list of places to look instead of the answer. FIND the answer yourself.
- If the question asks about specific data (vote counts, statistics, dates), SEARCH FOR IT and REPORT IT.
- It's OK to be wrong — your answer will be fact-checked and corrected later. A wrong answer is better than no answer.
- Include your best understanding even if uncertain. The refinement loop will fix errors.
${contextSection}`;
    const searchResult = await askInnerWithSearch(system, query, config);
    const draft = searchResult.text;
    const is_refusal = await classifyRefusal(draft, config);
    return { draft, is_refusal };
}
112
/**
 * Decompose a draft into individually verifiable atomic claims.
 *
 * If the draft was classified as a refusal/non-answer and the original query
 * is available, claims are extracted from the query instead so the loop can
 * still retrieve evidence.
 *
 * The model's output is validated before being returned: it must be a JSON
 * array, entries without a non-empty `text` are dropped, a missing `id` is
 * replaced with a positional "cN" id, and a missing `source_span` becomes "".
 *
 * @returns The extracted claims (possibly an empty array).
 * @throws {Error} When the model output contains no JSON or is not an array.
 */
export async function decomposeClaims(draft, originalQuery, isRefusal = false, config = DEFAULT_CONFIG) {
    let textToDecompose = draft;
    let systemAddendum = "";
    if (isRefusal && originalQuery) {
        // The draft was classified as a refusal/non-answer by the LLM.
        // Extract claims from the user's question instead.
        textToDecompose = originalQuery;
        systemAddendum = `
IMPORTANT: The original draft was a non-answer/refusal. You are now extracting the factual claims
embedded in the USER'S QUESTION instead. These are the claims that need to be verified.
For example, if the user says "Is it true that X got Y votes?", extract "X got Y votes" as a claim.`;
    }
    const system = `You are a claim extraction engine. Given a text, extract every distinct factual claim as a separate item.

Rules:
- Each claim must be a single, independently verifiable statement
- Preserve the original meaning precisely
- Skip opinions, hedges ("I think"), and meta-commentary
- Include the source_span (the exact substring from the original text)
- Give each claim a short id like "c1", "c2", etc.
${systemAddendum}

Respond ONLY with a JSON array:
[{ "id": "c1", "text": "The claim as a standalone statement", "source_span": "exact quote from draft" }, ...]`;
    const raw = await askInner(system, `Extract all factual claims from:\n\n${textToDecompose}`, config);
    const parsed = extractJSON(raw);
    if (!Array.isArray(parsed)) {
        // Later stages map/filter over the claims, so fail here with a clear message.
        throw new Error(`Expected a JSON array of claims from the LLM, got: ${JSON.stringify(parsed).slice(0, 200)}`);
    }
    return parsed
        .filter((item) => item !== null &&
        typeof item === "object" &&
        typeof item.text === "string" &&
        item.text.trim() !== "")
        .map((item, index) => ({
        ...item,
        id: typeof item.id === "string" && item.id !== "" ? item.id : `c${index + 1}`,
        source_span: typeof item.source_span === "string" ? item.source_span : "",
    }));
}
144
/**
 * Score each claim's likelihood of being correct.
 *
 * All claims are sent to the inner LLM in one prompt, and the model returns a
 * `confidence` and `entropy` estimate per claim id. Each returned score is
 * joined back to its claim by id, and the numbers are clamped to [0, 1].
 * Results are labelled with method "semantic_entropy".
 *
 * A missing or non-numeric score is replaced with the most pessimistic value
 * (confidence 0, entropy 1). Otherwise it would become NaN, and NaN compares
 * false against the confidence threshold in shouldContinue, so an unscored
 * claim would be treated as already confident.
 *
 * @param claims Claims to score; an empty list resolves to [] without calling the model.
 * @returns One scored claim per entry the model returned.
 * @throws {Error} When the model output contains no JSON or is not an array.
 */
export async function scoreClaims(claims, config = DEFAULT_CONFIG) {
    if (claims.length === 0) {
        return [];
    }
    const system = `You are an uncertainty estimation engine. For each claim, assess how likely it is to be factually correct.

Consider:
- Is this common knowledge or obscure?
- Are there well-known disputes about this?
- Does the specificity (exact numbers, dates, names) increase risk of error?
- Could this be a common misconception?

Respond ONLY with a JSON array:
[{
"id": "c1",
"confidence": 0.92,
"entropy": 0.15,
"method": "semantic_entropy",
"reasoning": "brief explanation"
}, ...]

confidence: 0.0 = certainly wrong, 1.0 = certainly correct
entropy: 0.0 = very certain, 1.0 = highly uncertain`;
    const claimsText = claims
        .map((c) => `[${c.id}] ${c.text}`)
        .join("\n");
    const raw = await askInner(system, `Score the uncertainty of each claim:\n\n${claimsText}`, config);
    const parsed = extractJSON(raw);
    if (!Array.isArray(parsed)) {
        throw new Error(`Expected a JSON array of scores from the LLM, got: ${JSON.stringify(parsed).slice(0, 200)}`);
    }
    // Coerce a model-supplied score into [0, 1], using `fallback` when it is not a number.
    const toUnitInterval = (value, fallback) => {
        const n = typeof value === "number" || typeof value === "string" ? Number(value) : NaN;
        return Number.isNaN(n) ? fallback : Math.max(0, Math.min(1, n));
    };
    return parsed
        .filter((s) => s !== null && typeof s === "object")
        .map((s) => {
        const original = claims.find((c) => c.id === s.id);
        return {
            id: s.id,
            text: original?.text ?? "",
            source_span: original?.source_span ?? "",
            confidence: toUnitInterval(s.confidence, 0),
            entropy: toUnitInterval(s.entropy, 1),
            method: "semantic_entropy",
        };
    });
}
185
/**
 * Reduce the per-document stances for one claim to a single verdict.
 * "insufficient" is returned when no document takes a side (none at all, or
 * only neutral/unclear ones), so a claim is never reported as "supported"
 * without at least one supporting document.
 */
function overallStanceOf(docs) {
    const supports = docs.filter((d) => d.stance === "supports").length;
    const contradicts = docs.filter((d) => d.stance === "contradicts").length;
    if (supports === 0 && contradicts === 0) {
        return "insufficient";
    }
    if (supports > 0 && contradicts > 0) {
        return "mixed";
    }
    return contradicts > 0 ? "contradicted" : "supported";
}
/**
 * Gather web evidence for each claim.
 *
 * For every claim this:
 *   1. asks the inner LLM for 2-3 search queries following `strategy`
 *      (falling back to the claim text itself when none are usable),
 *   2. runs up to three of them through the web-search-enabled model and
 *      collects the documents it reports, each tagged with a stance,
 *   3. reduces the stances to an overall verdict, and
 *   4. asks the inner LLM for a 1-2 sentence summary of the evidence.
 *
 * A search that throws is logged with console.error and skipped. A search
 * whose output is not a JSON array is kept as one "unclear" document holding
 * the first 300 characters of the raw text.
 *
 * @param claims Claims to investigate, processed sequentially.
 * @param strategy "adversarial", "confirmatory" or "balanced"; interpolated into the query prompt.
 * @returns One evidence record per input claim, in input order.
 */
export async function retrieveEvidence(claims, strategy = "adversarial", config = DEFAULT_CONFIG) {
    const results = [];
    for (const claim of claims) {
        // Step 1: Generate search queries for this claim
        const queryGenSystem = `You are a search query generator for fact-checking.
Strategy: ${strategy}

For "adversarial": generate queries designed to DISPROVE the claim. Search for counterexamples, corrections, or the actual facts.
For "confirmatory": generate queries to find authoritative sources that confirm the claim.
For "balanced": generate both supporting and challenging queries.

Respond ONLY with a JSON array of 2-3 search queries:
["query 1", "query 2", "query 3"]`;
        const queriesRaw = await askInner(queryGenSystem, `Generate search queries to fact-check: "${claim.text}"`, config);
        let queries;
        try {
            const parsedQueries = extractJSON(queriesRaw);
            // Keep only usable query strings; the model may return an object or mixed array.
            queries = Array.isArray(parsedQueries)
                ? parsedQueries.filter((q) => typeof q === "string" && q.trim() !== "")
                : [];
        }
        catch {
            queries = [];
        }
        if (queries.length === 0) {
            queries = [claim.text];
        }
        // Step 2: Run each query through the search-enabled model.
        // The prompt depends only on the claim, so it is built once per claim.
        const searchSystem = `You are a fact-checking research assistant. Search for information about the given query and evaluate what you find relative to this claim: "${claim.text}"

After searching, respond with a JSON array of the most relevant results:
[{
"title": "Page title",
"url": "https://...",
"snippet": "The relevant excerpt (max 100 words)",
"stance": "supports" | "contradicts" | "neutral" | "unclear"
}]

Respond ONLY with the JSON array.`;
        const allDocs = [];
        for (const query of queries.slice(0, 3)) {
            try {
                const { text } = await askInnerWithSearch(searchSystem, `Search and evaluate: ${query}`, config);
                let docs = null;
                try {
                    docs = extractJSON(text);
                }
                catch {
                    docs = null;
                }
                if (Array.isArray(docs)) {
                    // Drop null/primitive entries so later property reads cannot throw.
                    allDocs.push(...docs.filter((d) => d !== null && typeof d === "object"));
                }
                else {
                    // If parsing fails, still capture as a single doc
                    allDocs.push({
                        title: "Search result",
                        url: "",
                        snippet: text.slice(0, 300),
                        stance: "unclear",
                    });
                }
            }
            catch (err) {
                console.error(`Search failed for query "${query}":`, err);
            }
        }
        // Step 3: Summarize the evidence stance
        const overall_stance = overallStanceOf(allDocs);
        // Step 4: Generate a concise summary
        const summarySystem = `Summarize the evidence for/against this claim in 1-2 sentences. Be direct.`;
        const summaryInput = `Claim: "${claim.text}"\nEvidence:\n${allDocs.map((d) => `- [${d.stance}] ${d.snippet}`).join("\n")}`;
        const summary = allDocs.length > 0
            ? await askInner(summarySystem, summaryInput, config)
            : "No evidence found.";
        results.push({
            claim_id: claim.id,
            claim_text: claim.text,
            docs: allDocs,
            overall_stance,
            summary,
        });
    }
    return results;
}
270
/**
 * Revise the draft based on evidence, returning the new text + change log.
 *
 * If the original draft was a refusal/non-answer and the original query is
 * available, the model is told to write a completely new response from the
 * evidence instead of editing the refusal.
 *
 * The model's JSON is normalized before being returned so the result always
 * has a string `revised` (falling back to the input draft when the model
 * omits it) and array-valued `changes` and `conflicts`.
 *
 * @param draft Current draft text.
 * @param evidence Evidence records, one per investigated claim.
 * @param scored Scored claims; accepted for interface compatibility but not read here.
 * @param originalQuery The user's question; required for the refusal path.
 * @param isRefusal Whether the draft was classified as a refusal/non-answer.
 * @returns `{ revised, changes, conflicts }`.
 * @throws When the model output contains no parseable JSON.
 */
export async function reviseDraft(draft, evidence, scored, originalQuery, isRefusal = false, config = DEFAULT_CONFIG) {
    const evidenceSummary = evidence
        .map((e) => `[${e.claim_id}] "${e.claim_text}" → ${e.overall_stance}\n Evidence: ${e.summary}`)
        .join("\n\n");
    let system;
    if (isRefusal && originalQuery) {
        // The draft was a non-answer. Write a NEW response from the evidence.
        system = `You are a response generation engine. The original draft FAILED to answer the user's question — it was a refusal or redirect.

Your job: Write a COMPLETELY NEW response that DIRECTLY ANSWERS the user's question using the evidence provided.

User's original question: "${originalQuery}"

Rules:
- DIRECTLY answer the question using the evidence gathered
- Include specific facts, numbers, and details from the evidence
- If the evidence contradicts what the user claimed, say so clearly
- If the evidence supports what the user claimed, confirm it
- Add hedging ("reportedly", "according to...") only for genuinely mixed evidence
- Do NOT say "I don't have access" or redirect to other sources

Respond with JSON:
{
"revised": "The full NEW response that directly answers the question",
"changes": [
{ "claim_id": "c1", "action": "generated_from_evidence", "original": "N/A - draft was a refusal", "revised": "the new claim", "reason": "Evidence shows..." }
],
"conflicts": [
{ "claim_id": "c2", "description": "Sources disagree about..." }
]
}`;
    }
    else {
        system = `You are a response revision engine. Given an original draft and fact-checking evidence, produce a corrected version.

Rules:
- Fix claims that were contradicted by evidence
- Add hedging language ("reportedly", "according to...") for mixed evidence
- Remove claims with no supporting evidence if they are central to the answer
- Keep claims that were supported — don't weaken what's already correct
- Preserve the original tone and structure as much as possible

Respond with JSON:
{
"revised": "The full revised response text",
"changes": [
{ "claim_id": "c1", "action": "corrected|removed|hedged|kept", "original": "...", "revised": "...", "reason": "..." }
],
"conflicts": [
{ "claim_id": "c2", "description": "Sources disagree about..." }
]
}`;
    }
    const raw = await askInner(system, `Original draft:\n${draft}\n\nEvidence report:\n${evidenceSummary}`, config);
    const parsed = extractJSON(raw);
    // The model may return an array or omit fields; coerce to the documented shape.
    const result = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)
        ? parsed
        : {};
    return {
        ...result,
        // Never replace the draft with undefined/empty text when the model omits "revised".
        revised: typeof result.revised === "string" && result.revised.trim() !== ""
            ? result.revised
            : draft,
        changes: Array.isArray(result.changes) ? result.changes : [],
        conflicts: Array.isArray(result.conflicts) ? result.conflicts : [],
    };
}
331
/**
 * Decide whether the refinement loop should run another iteration.
 *
 * Stops when the iteration budget is used up, when no claim is below the
 * confidence threshold, or when the average confidence improved by less than
 * 0.02 since the previous iteration. Otherwise continues.
 *
 * @returns `{ decision, reason }` where decision is "continue" or "stop".
 */
export async function shouldContinue(iteration, scored, maxIterations, confidenceThreshold, previousAvgConfidence) {
    // Hard budget check comes first.
    if (iteration >= maxIterations) {
        return { decision: "stop", reason: `Budget exhausted (${maxIterations} iterations)` };
    }
    // One pass: accumulate the confidence total and count claims under the threshold.
    let confidenceTotal = 0;
    let belowThreshold = 0;
    for (const claim of scored) {
        confidenceTotal += claim.confidence;
        if (claim.confidence < confidenceThreshold) {
            belowThreshold += 1;
        }
    }
    // With no claims at all, the average is defined as 1.
    const avgConfidence = scored.length > 0 ? confidenceTotal / scored.length : 1;
    if (belowThreshold === 0) {
        return {
            decision: "stop",
            reason: `All ${scored.length} claims above confidence threshold (${confidenceThreshold})`,
        };
    }
    // Convergence is only checked once a previous average exists.
    const improvement = previousAvgConfidence !== null
        ? avgConfidence - previousAvgConfidence
        : null;
    if (improvement !== null && improvement < 0.02) {
        return {
            decision: "stop",
            reason: `Confidence converged (Δ=${improvement.toFixed(3)}, threshold=0.02). ${belowThreshold} claims remain below threshold.`,
        };
    }
    return {
        decision: "continue",
        reason: `${belowThreshold}/${scored.length} claims below threshold. Avg confidence: ${avgConfidence.toFixed(3)}. Continuing refinement.`,
    };
}
@@ -0,0 +1,53 @@
1
/** A single factual statement extracted from a draft (or from the user's query). */
export interface Claim {
    /** Short identifier such as "c1". */
    id: string;
    /** The claim phrased as a standalone statement. */
    text: string;
    /** Substring of the source text the claim was taken from. */
    source_span: string;
}
/** A claim together with the model's uncertainty estimate. */
export interface ScoredClaim extends Claim {
    /** 0.0 = certainly wrong, 1.0 = certainly correct. */
    confidence: number;
    /** 0.0 = very certain, 1.0 = highly uncertain. */
    entropy: number;
    /** Label of the scoring method that produced the numbers. */
    method: "semantic_entropy" | "consistency_vote";
    /** Optional alternative phrasings of the claim. */
    variants?: string[];
}
/** One search result evaluated against a claim. */
export interface EvidenceDoc {
    title: string;
    url: string;
    /** Relevant excerpt from the result. */
    snippet: string;
    /** How the result relates to the claim. */
    stance: "supports" | "contradicts" | "neutral" | "unclear";
}
/** All evidence gathered for one claim plus an aggregate verdict. */
export interface ClaimEvidence {
    claim_id: string;
    claim_text: string;
    docs: EvidenceDoc[];
    overall_stance: "supported" | "contradicted" | "mixed" | "insufficient";
    /** Short natural-language summary of the evidence. */
    summary: string;
}
/** One entry of the change log produced when a draft is revised. */
export interface RevisionChange {
    claim_id: string;
    /**
     * What was done to the claim. "generated_from_evidence" is the action the
     * revision prompt requests when the original draft was a refusal and the
     * response is written from scratch.
     */
    action: "kept" | "corrected" | "removed" | "hedged" | "generated_from_evidence";
    original: string;
    revised: string;
    reason: string;
}
/** A disagreement between sources about a claim. */
export interface Conflict {
    claim_id: string;
    description: string;
    sources_for: string[];
    sources_against: string[];
}
/** Bookkeeping for the refinement loop. */
export interface LoopState {
    iteration: number;
    max_iterations: number;
    confidence_threshold: number;
    /** Average confidence from the previous iteration, or null on the first one. */
    previous_avg_confidence: number | null;
    claims_improved: number;
    claims_degraded: number;
}
/** Tunable settings for the refinement pipeline. */
export interface ARSRConfig {
    /** Upper bound on refinement iterations. */
    max_iterations: number;
    /** Claims scoring below this confidence count as low-confidence. */
    confidence_threshold: number;
    entropy_samples: number;
    /** How fact-checking search queries are generated. */
    retrieval_strategy: "adversarial" | "confirmatory" | "balanced";
    /** Model id passed to the Anthropic Messages API. */
    inner_model: string;
}
/** Configuration used when a caller does not supply one. */
export declare const DEFAULT_CONFIG: ARSRConfig;
@@ -0,0 +1,7 @@
1
/**
 * Read a non-negative integer from an environment variable.
 * Returns `fallback` when the variable is unset, empty, not a number, or negative.
 */
function readIntEnv(name, fallback) {
    const parsed = Number.parseInt(process.env[name] || "", 10);
    return Number.isNaN(parsed) || parsed < 0 ? fallback : parsed;
}
/**
 * Read a number in [0, 1] from an environment variable.
 * Returns `fallback` when the variable is unset, empty, not a number, or out of range.
 */
function readUnitFloatEnv(name, fallback) {
    const parsed = Number.parseFloat(process.env[name] || "");
    return Number.isNaN(parsed) || parsed < 0 || parsed > 1 ? fallback : parsed;
}
/**
 * Read an environment variable that must be one of `allowed`.
 * Returns `fallback` for any other value (including unset).
 */
function readChoiceEnv(name, allowed, fallback) {
    const value = process.env[name];
    return value !== undefined && allowed.includes(value) ? value : fallback;
}
/**
 * Default pipeline configuration, overridable through ARSR_* environment
 * variables. Invalid values fall back to the built-in defaults, because a NaN
 * or unknown value here would otherwise propagate into the refinement loop.
 */
export const DEFAULT_CONFIG = {
    max_iterations: readIntEnv("ARSR_MAX_ITERATIONS", 3),
    confidence_threshold: readUnitFloatEnv("ARSR_CONFIDENCE_THRESHOLD", 0.85),
    entropy_samples: readIntEnv("ARSR_ENTROPY_SAMPLES", 3),
    retrieval_strategy: readChoiceEnv("ARSR_RETRIEVAL_STRATEGY", ["adversarial", "confirmatory", "balanced"], "adversarial"),
    inner_model: process.env.ARSR_INNER_MODEL || "claude-haiku-4-5-20251001",
};
package/glama.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "$schema": "https://glama.ai/mcp/schemas/server.json",
3
+ "maintainers": [
4
+ "JayArrowz"
5
+ ]
6
+ }
package/package.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "name": "@jayarrowz/mcp-arsr",
3
+ "version": "1.0.0",
4
+ "description": "Adaptive Retrieval-Augmented Self-Refinement MCP Server — a closed-loop system that lets LLMs iteratively verify and correct their own claims using uncertainty-guided retrieval.",
5
+ "license": "MIT",
6
+ "author": "Jay Arrowz (https://github.com/jayarrowz)",
7
+ "homepage": "https://github.com/jayarrowz/mcp-arsr",
8
+ "bugs": "https://github.com/jayarrowz/mcp-arsr/issues",
9
+ "main": "dist/src/index.js",
10
+ "bin": {
11
+ "mcp-arsr": "dist/src/index.js"
12
+ },
13
+ "scripts": {
14
+ "build": "tsc",
15
+ "start": "node dist/src/index.js",
16
+ "dev": "tsc --watch"
17
+ },
18
+ "dependencies": {
19
+ "@anthropic-ai/sdk": "^0.36.3",
20
+ "@modelcontextprotocol/sdk": "^1.27.1",
21
+ "zod": "^4.3.6"
22
+ },
23
+ "devDependencies": {
24
+ "@types/express": "^5.0.6",
25
+ "@types/node": "^22.0.0",
26
+ "ts-node": "^10.9.2",
27
+ "tsx": "^4.21.0",
28
+ "typescript": "^5.7.0"
29
+ },
30
+ "type": "module"
31
+ }
package/smithery.yaml ADDED
@@ -0,0 +1,13 @@
1
+ startCommand:
2
+ type: "stdio"
3
+ configSchema:
4
+ type: "object"
5
+ properties: {}
6
+ additionalProperties: false
7
+ commandFunction:
8
+ # A JS function that produces the CLI command based on the given config to start the MCP on stdio.
9
+ |-
10
+ (config) => ({
11
+ command: 'node',
12
+ args: ['dist/src/index.js']
13
+ })