@jayarrowz/mcp-arsr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,480 @@
1
+ import Anthropic from "@anthropic-ai/sdk";
2
+ import { DEFAULT_CONFIG } from "../types.js";
3
+ import type { ARSRConfig, Claim, ScoredClaim, EvidenceDoc, ClaimEvidence } from "../types.js";
4
+
5
+ let client: Anthropic | null = null;
6
+
7
+ function getClient(): Anthropic {
8
+ if (!client) {
9
+ client = new Anthropic(); // Uses ANTHROPIC_API_KEY env var
10
+ }
11
+ return client;
12
+ }
13
+
14
+ async function askInner(
15
+ system: string,
16
+ user: string,
17
+ config: ARSRConfig = DEFAULT_CONFIG
18
+ ): Promise<string> {
19
+ const api = getClient();
20
+ const response = await api.messages.create({
21
+ model: config.inner_model,
22
+ max_tokens: 4096,
23
+ system,
24
+ messages: [{ role: "user", content: user }],
25
+ });
26
+
27
+ const textBlock = response.content.find((b) => b.type === "text");
28
+ return textBlock ? textBlock.text : "";
29
+ }
30
+
31
+ async function askInnerWithSearch(
32
+ system: string,
33
+ user: string,
34
+ config: ARSRConfig = DEFAULT_CONFIG
35
+ ): Promise<{ text: string; citations: Array<{ url: string; title: string }> }> {
36
+ const api = getClient();
37
+ const response = await api.messages.create({
38
+ model: config.inner_model,
39
+ max_tokens: 4096,
40
+ system,
41
+ messages: [{ role: "user", content: user }],
42
+ tools: [{ type: "web_search_20250305", name: "web_search" } as unknown as Anthropic.Messages.Tool],
43
+ });
44
+
45
+ // Extract text and citations from the response
46
+ let text = "";
47
+ const citations: Array<{ url: string; title: string }> = [];
48
+
49
+ for (const block of response.content) {
50
+ if (block.type === "text") {
51
+ text += block.text;
52
+ // Extract any inline citations
53
+ if ("citations" in block && Array.isArray(block.citations)) {
54
+ for (const cite of block.citations) {
55
+ if ("url" in cite && "title" in cite) {
56
+ citations.push({
57
+ url: cite.url as string,
58
+ title: cite.title as string,
59
+ });
60
+ }
61
+ }
62
+ }
63
+ }
64
+ }
65
+
66
+ return { text, citations };
67
+ }
68
+
69
+ function extractJSON<T>(raw: string): T {
70
+ // Strip markdown fences if present
71
+ const cleaned = raw
72
+ .replace(/```json\s*/gi, "")
73
+ .replace(/```\s*/g, "")
74
+ .trim();
75
+
76
+ // Try to find JSON object or array
77
+ const jsonMatch = cleaned.match(/[\[{][\s\S]*[\]}]/);
78
+ if (!jsonMatch) {
79
+ throw new Error(`No JSON found in LLM output: ${cleaned.slice(0, 200)}`);
80
+ }
81
+ return JSON.parse(jsonMatch[0]) as T;
82
+ }
83
+
84
+ /**
85
+ * Classify whether a draft is a refusal/non-answer using the inner LLM.
86
+ * Returns true if the draft deflects, redirects, or refuses to answer.
87
+ */
88
+ async function classifyRefusal(
89
+ draft: string,
90
+ config: ARSRConfig = DEFAULT_CONFIG
91
+ ): Promise<boolean> {
92
+ const system = `You are a response classifier. Determine whether the given text is a REFUSAL or NON-ANSWER.
93
+
94
+ A refusal is any response that:
95
+ - Says it cannot, does not, or is unable to provide the information
96
+ - Redirects the user to check another source instead of answering
97
+ - Provides only generic suggestions instead of a direct answer
98
+ - Hedges so heavily that no substantive information is conveyed
99
+
100
+ Respond with ONLY a JSON object: { "is_refusal": true } or { "is_refusal": false }`;
101
+
102
+ const raw = await askInner(system, `Classify this response:\n\n${draft}`, config);
103
+ try {
104
+ const result = extractJSON<{ is_refusal: boolean }>(raw);
105
+ return result.is_refusal === true;
106
+ } catch {
107
+ return false;
108
+ }
109
+ }
110
+
111
+ /**
112
+ * Generate an initial draft response to a query.
113
+ * Uses web search to ground the draft in real data — this prevents
114
+ * the "I don't know" refusal problem where the inner model punts
115
+ * on questions outside its training data.
116
+ *
117
+ * Returns the draft text and a structured is_refusal flag classified
118
+ * by the inner LLM (replacing brittle string matching).
119
+ */
120
+ export async function generateDraft(
121
+ query: string,
122
+ context?: string,
123
+ config: ARSRConfig = DEFAULT_CONFIG
124
+ ): Promise<{ draft: string; is_refusal: boolean }> {
125
+ const system = `You are a helpful, accurate research assistant. Your job is to ANSWER the user's question with specific, concrete facts.
126
+
127
+ CRITICAL RULES:
128
+ - You MUST search the web to find the answer. Do NOT rely on memory alone.
129
+ - You MUST provide a direct, substantive answer with specific facts, numbers, and details.
130
+ - NEVER say "I don't have access to" or "I recommend checking" or "I cannot provide". These are failures.
131
+ - NEVER give a list of places to look instead of the answer. FIND the answer yourself.
132
+ - If the question asks about specific data (vote counts, statistics, dates), SEARCH FOR IT and REPORT IT.
133
+ - It's OK to be wrong — your answer will be fact-checked and corrected later. A wrong answer is better than no answer.
134
+ - Include your best understanding even if uncertain. The refinement loop will fix errors.
135
+ ${context ? `\nAdditional context:\n${context}` : ""}`;
136
+
137
+ const { text } = await askInnerWithSearch(system, query, config);
138
+ const is_refusal = await classifyRefusal(text, config);
139
+ return { draft: text, is_refusal };
140
+ }
141
+
142
+ /**
143
+ * Decompose a draft into individually verifiable atomic claims.
144
+ * If the draft is a refusal/non-answer, extracts claims from the
145
+ * original query context instead so the loop can still retrieve evidence.
146
+ */
147
+ export async function decomposeClaims(
148
+ draft: string,
149
+ originalQuery?: string,
150
+ isRefusal: boolean = false,
151
+ config: ARSRConfig = DEFAULT_CONFIG
152
+ ): Promise<Claim[]> {
153
+
154
+ let textToDecompose = draft;
155
+ let systemAddendum = "";
156
+
157
+ if (isRefusal && originalQuery) {
158
+ // The draft was classified as a refusal/non-answer by the LLM.
159
+ // Extract claims from the user's question instead.
160
+ textToDecompose = originalQuery;
161
+ systemAddendum = `
162
+ IMPORTANT: The original draft was a non-answer/refusal. You are now extracting the factual claims
163
+ embedded in the USER'S QUESTION instead. These are the claims that need to be verified.
164
+ For example, if the user says "Is it true that X got Y votes?", extract "X got Y votes" as a claim.`;
165
+ }
166
+
167
+ const system = `You are a claim extraction engine. Given a text, extract every distinct factual claim as a separate item.
168
+
169
+ Rules:
170
+ - Each claim must be a single, independently verifiable statement
171
+ - Preserve the original meaning precisely
172
+ - Skip opinions, hedges ("I think"), and meta-commentary
173
+ - Include the source_span (the exact substring from the original text)
174
+ - Give each claim a short id like "c1", "c2", etc.
175
+ ${systemAddendum}
176
+
177
+ Respond ONLY with a JSON array:
178
+ [{ "id": "c1", "text": "The claim as a standalone statement", "source_span": "exact quote from draft" }, ...]`;
179
+
180
+ const raw = await askInner(system, `Extract all factual claims from:\n\n${textToDecompose}`, config);
181
+ return extractJSON<Claim[]>(raw);
182
+ }
183
+
184
+ /**
185
+ * Score claims by generating multiple rephrasings and measuring agreement.
186
+ * Uses semantic entropy: low agreement across rephrasings = high uncertainty.
187
+ */
188
+ export async function scoreClaims(
189
+ claims: Claim[],
190
+ config: ARSRConfig = DEFAULT_CONFIG
191
+ ): Promise<ScoredClaim[]> {
192
+ const system = `You are an uncertainty estimation engine. For each claim, assess how likely it is to be factually correct.
193
+
194
+ Consider:
195
+ - Is this common knowledge or obscure?
196
+ - Are there well-known disputes about this?
197
+ - Does the specificity (exact numbers, dates, names) increase risk of error?
198
+ - Could this be a common misconception?
199
+
200
+ Respond ONLY with a JSON array:
201
+ [{
202
+ "id": "c1",
203
+ "confidence": 0.92,
204
+ "entropy": 0.15,
205
+ "method": "semantic_entropy",
206
+ "reasoning": "brief explanation"
207
+ }, ...]
208
+
209
+ confidence: 0.0 = certainly wrong, 1.0 = certainly correct
210
+ entropy: 0.0 = very certain, 1.0 = highly uncertain`;
211
+
212
+ const claimsText = claims
213
+ .map((c) => `[${c.id}] ${c.text}`)
214
+ .join("\n");
215
+
216
+ const raw = await askInner(
217
+ system,
218
+ `Score the uncertainty of each claim:\n\n${claimsText}`,
219
+ config
220
+ );
221
+
222
+ const scored = extractJSON<Array<{
223
+ id: string;
224
+ confidence: number;
225
+ entropy: number;
226
+ method: string;
227
+ reasoning?: string;
228
+ }>>(raw);
229
+ return scored.map((s) => {
230
+ const original = claims.find((c) => c.id === s.id);
231
+ return {
232
+ id: s.id,
233
+ text: original?.text ?? "",
234
+ source_span: original?.source_span ?? "",
235
+ confidence: Math.max(0, Math.min(1, s.confidence)),
236
+ entropy: Math.max(0, Math.min(1, s.entropy)),
237
+ method: "semantic_entropy" as const,
238
+ };
239
+ });
240
+ }
241
+
242
+ /**
243
+ * For low-confidence claims, generate adversarial search queries and retrieve evidence.
244
+ * Uses the inner LLM + web search to find supporting/contradicting sources.
245
+ */
246
+ export async function retrieveEvidence(
247
+ claims: ScoredClaim[],
248
+ strategy: string = "adversarial",
249
+ config: ARSRConfig = DEFAULT_CONFIG
250
+ ): Promise<ClaimEvidence[]> {
251
+ const results: ClaimEvidence[] = [];
252
+
253
+ for (const claim of claims) {
254
+ const queryGenSystem = `You are a search query generator for fact-checking.
255
+ Strategy: ${strategy}
256
+
257
+ For "adversarial": generate queries designed to DISPROVE the claim. Search for counterexamples, corrections, or the actual facts.
258
+ For "confirmatory": generate queries to find authoritative sources that confirm the claim.
259
+ For "balanced": generate both supporting and challenging queries.
260
+
261
+ Respond ONLY with a JSON array of 2-3 search queries:
262
+ ["query 1", "query 2", "query 3"]`;
263
+
264
+ const queriesRaw = await askInner(
265
+ queryGenSystem,
266
+ `Generate search queries to fact-check: "${claim.text}"`,
267
+ config
268
+ );
269
+
270
+ let queries: string[];
271
+ try {
272
+ queries = extractJSON<string[]>(queriesRaw);
273
+ } catch {
274
+ queries = [claim.text];
275
+ }
276
+
277
+ const allDocs: EvidenceDoc[] = [];
278
+
279
+ for (const query of queries.slice(0, 3)) {
280
+ try {
281
+ const searchSystem = `You are a fact-checking research assistant. Search for information about the given query and evaluate what you find relative to this claim: "${claim.text}"
282
+
283
+ After searching, respond with a JSON array of the most relevant results:
284
+ [{
285
+ "title": "Page title",
286
+ "url": "https://...",
287
+ "snippet": "The relevant excerpt (max 100 words)",
288
+ "stance": "supports" | "contradicts" | "neutral" | "unclear"
289
+ }]
290
+
291
+ Respond ONLY with the JSON array.`;
292
+
293
+ const { text } = await askInnerWithSearch(
294
+ searchSystem,
295
+ `Search and evaluate: ${query}`,
296
+ config
297
+ );
298
+
299
+ try {
300
+ const docs = extractJSON<EvidenceDoc[]>(text);
301
+ allDocs.push(...docs);
302
+ } catch {
303
+ // If parsing fails, still capture as a single doc
304
+ allDocs.push({
305
+ title: "Search result",
306
+ url: "",
307
+ snippet: text.slice(0, 300),
308
+ stance: "unclear",
309
+ });
310
+ }
311
+ } catch (err) {
312
+ console.error(`Search failed for query "${query}":`, err);
313
+ }
314
+ }
315
+
316
+ // Step 3: Summarize the evidence stance
317
+ const supports = allDocs.filter((d) => d.stance === "supports").length;
318
+ const contradicts = allDocs.filter((d) => d.stance === "contradicts").length;
319
+
320
+ let overall_stance: ClaimEvidence["overall_stance"];
321
+ if (allDocs.length === 0) overall_stance = "insufficient";
322
+ else if (supports > 0 && contradicts > 0) overall_stance = "mixed";
323
+ else if (contradicts > supports) overall_stance = "contradicted";
324
+ else overall_stance = "supported";
325
+
326
+ // Step 4: Generate a concise summary
327
+ const summarySystem = `Summarize the evidence for/against this claim in 1-2 sentences. Be direct.`;
328
+ const summaryInput = `Claim: "${claim.text}"\nEvidence:\n${allDocs.map((d) => `- [${d.stance}] ${d.snippet}`).join("\n")}`;
329
+ const summary = allDocs.length > 0
330
+ ? await askInner(summarySystem, summaryInput, config)
331
+ : "No evidence found.";
332
+
333
+ results.push({
334
+ claim_id: claim.id,
335
+ claim_text: claim.text,
336
+ docs: allDocs,
337
+ overall_stance,
338
+ summary,
339
+ });
340
+ }
341
+
342
+ return results;
343
+ }
344
+
345
/**
 * Revise the draft based on evidence, returning the new text + change log.
 * If the original draft was a refusal/non-answer, writes a completely new
 * response from the evidence instead of trying to edit the refusal.
 *
 * NOTE(review): the `scored` parameter is currently unused in the body —
 * presumably intended for weighting revisions by confidence; confirm or drop.
 * The return shape is trusted straight from extractJSON without validation.
 */
export async function reviseDraft(
  draft: string,
  evidence: ClaimEvidence[],
  scored: ScoredClaim[],
  originalQuery?: string,
  isRefusal: boolean = false,
  config: ARSRConfig = DEFAULT_CONFIG
): Promise<{
  revised: string;
  changes: Array<{
    claim_id: string;
    action: string;
    original: string;
    revised: string;
    reason: string;
  }>;
  conflicts: Array<{
    claim_id: string;
    description: string;
  }>;
}> {
  // Compact one-entry-per-claim digest of the evidence for the prompt.
  const evidenceSummary = evidence
    .map(
      (e) =>
        `[${e.claim_id}] "${e.claim_text}" → ${e.overall_stance}\n  Evidence: ${e.summary}`
    )
    .join("\n\n");

  let system: string;

  if (isRefusal && originalQuery) {
    // The draft was a non-answer. Write a NEW response from the evidence.
    system = `You are a response generation engine. The original draft FAILED to answer the user's question — it was a refusal or redirect.

Your job: Write a COMPLETELY NEW response that DIRECTLY ANSWERS the user's question using the evidence provided.

User's original question: "${originalQuery}"

Rules:
- DIRECTLY answer the question using the evidence gathered
- Include specific facts, numbers, and details from the evidence
- If the evidence contradicts what the user claimed, say so clearly
- If the evidence supports what the user claimed, confirm it
- Add hedging ("reportedly", "according to...") only for genuinely mixed evidence
- Do NOT say "I don't have access" or redirect to other sources

Respond with JSON:
{
"revised": "The full NEW response that directly answers the question",
"changes": [
{ "claim_id": "c1", "action": "generated_from_evidence", "original": "N/A - draft was a refusal", "revised": "the new claim", "reason": "Evidence shows..." }
],
"conflicts": [
{ "claim_id": "c2", "description": "Sources disagree about..." }
]
}`;
  } else {
    // Normal path: edit the existing draft against the evidence.
    system = `You are a response revision engine. Given an original draft and fact-checking evidence, produce a corrected version.

Rules:
- Fix claims that were contradicted by evidence
- Add hedging language ("reportedly", "according to...") for mixed evidence
- Remove claims with no supporting evidence if they are central to the answer
- Keep claims that were supported — don't weaken what's already correct
- Preserve the original tone and structure as much as possible

Respond with JSON:
{
"revised": "The full revised response text",
"changes": [
{ "claim_id": "c1", "action": "corrected|removed|hedged|kept", "original": "...", "revised": "...", "reason": "..." }
],
"conflicts": [
{ "claim_id": "c2", "description": "Sources disagree about..." }
]
}`;
  }

  const raw = await askInner(
    system,
    `Original draft:\n${draft}\n\nEvidence report:\n${evidenceSummary}`,
    config
  );

  // Parsed result is assumed to match the declared return shape — not validated.
  return extractJSON(raw);
}
436
+
437
+ /**
438
+ * Decide whether to continue the refinement loop.
439
+ */
440
+ export async function shouldContinue(
441
+ iteration: number,
442
+ scored: ScoredClaim[],
443
+ maxIterations: number,
444
+ confidenceThreshold: number,
445
+ previousAvgConfidence: number | null
446
+ ): Promise<{ decision: "continue" | "stop"; reason: string }> {
447
+ if (iteration >= maxIterations) {
448
+ return { decision: "stop", reason: `Budget exhausted (${maxIterations} iterations)` };
449
+ }
450
+
451
+ const avgConfidence =
452
+ scored.length > 0
453
+ ? scored.reduce((sum, c) => sum + c.confidence, 0) / scored.length
454
+ : 1;
455
+
456
+ const lowConfidence = scored.filter(
457
+ (c) => c.confidence < confidenceThreshold
458
+ );
459
+ if (lowConfidence.length === 0) {
460
+ return {
461
+ decision: "stop",
462
+ reason: `All ${scored.length} claims above confidence threshold (${confidenceThreshold})`,
463
+ };
464
+ }
465
+
466
+ if (previousAvgConfidence !== null) {
467
+ const improvement = avgConfidence - previousAvgConfidence;
468
+ if (improvement < 0.02) {
469
+ return {
470
+ decision: "stop",
471
+ reason: `Confidence converged (Δ=${improvement.toFixed(3)}, threshold=0.02). ${lowConfidence.length} claims remain below threshold.`,
472
+ };
473
+ }
474
+ }
475
+
476
+ return {
477
+ decision: "continue",
478
+ reason: `${lowConfidence.length}/${scored.length} claims below threshold. Avg confidence: ${avgConfidence.toFixed(3)}. Continuing refinement.`,
479
+ };
480
+ }
package/src/types.ts ADDED
@@ -0,0 +1,67 @@
1
/** A single atomic factual statement extracted from a draft. */
export interface Claim {
  id: string; // Short stable identifier, e.g. "c1", "c2"
  text: string; // The claim as a standalone, verifiable statement
  source_span: string; // The substring of the draft this claim came from
}

/** A claim annotated with an uncertainty estimate. */
export interface ScoredClaim extends Claim {
  confidence: number; // 0-1, higher = more confident
  entropy: number; // Semantic entropy across rephrasings (0 = certain, 1 = highly uncertain)
  method: "semantic_entropy" | "consistency_vote"; // How the score was produced
  variants?: string[]; // The rephrasings used to compute entropy
}

/** One retrieved document and its stance toward a claim. */
export interface EvidenceDoc {
  title: string;
  url: string; // May be empty when the source couldn't be attributed
  snippet: string; // Relevant excerpt from the document
  stance: "supports" | "contradicts" | "neutral" | "unclear";
}

/** Aggregated evidence gathered for a single claim. */
export interface ClaimEvidence {
  claim_id: string; // Matches Claim.id
  claim_text: string;
  docs: EvidenceDoc[];
  overall_stance: "supported" | "contradicted" | "mixed" | "insufficient";
  summary: string; // Short digest of the evidence for/against the claim
}

/** One edit applied to the draft during revision. */
export interface RevisionChange {
  claim_id: string;
  action: "kept" | "corrected" | "removed" | "hedged";
  original: string; // Text before the change
  revised: string; // Text after the change
  reason: string; // Why the change was made
}

/** A disagreement between sources about a claim. */
export interface Conflict {
  claim_id: string;
  description: string;
  sources_for: string[];
  sources_against: string[];
}

/** State tracked across refinement-loop iterations. */
export interface LoopState {
  iteration: number;
  max_iterations: number;
  confidence_threshold: number;
  previous_avg_confidence: number | null; // null on the first iteration
  claims_improved: number;
  claims_degraded: number;
}

/** Tunables for the refinement loop; see DEFAULT_CONFIG for env-driven defaults. */
export interface ARSRConfig {
  max_iterations: number; // Hard cap on refinement iterations
  confidence_threshold: number; // Claims below this trigger evidence retrieval
  entropy_samples: number; // Number of rephrasings used for entropy scoring
  retrieval_strategy: "adversarial" | "confirmatory" | "balanced";
  inner_model: string; // Model id used for all inner LLM calls
}
60
+
61
+ export const DEFAULT_CONFIG: ARSRConfig = {
62
+ max_iterations: parseInt(process.env.ARSR_MAX_ITERATIONS || "3", 10),
63
+ confidence_threshold: parseFloat(process.env.ARSR_CONFIDENCE_THRESHOLD || "0.85"),
64
+ entropy_samples: parseInt(process.env.ARSR_ENTROPY_SAMPLES || "3", 10),
65
+ retrieval_strategy: (process.env.ARSR_RETRIEVAL_STRATEGY as ARSRConfig["retrieval_strategy"]) || "adversarial",
66
+ inner_model: process.env.ARSR_INNER_MODEL || "claude-haiku-4-5-20251001",
67
+ };
package/tsconfig.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2020",
4
+ "module": "ESNext",
5
+ "moduleResolution": "bundler",
6
+ "esModuleInterop": true,
7
+ "outDir": "./dist",
8
+ "rootDir": ".",
9
+ "strict": true,
10
+ "declaration": true,
11
+ "skipLibCheck": true,
12
+ "allowSyntheticDefaultImports": true
13
+ },
14
+ "include": ["./src/**/*.ts"],
15
+ "exclude": ["node_modules", "**/*.test.ts"]
16
+ }