npm - @aria_asi/cli - Versions diffs - 0.2.40 → 0.2.41 - Mend

@aria_asi/cli 0.2.40 → 0.2.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (802) hide show

package/scripts/quality-ab-live-provider.mjs ADDED Viewed

@@ -0,0 +1,913 @@
+#!/usr/bin/env node
+import { createHash } from 'node:crypto';
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+import {
+  analyzeDomainOutputQuality,
+} from '../hooks/lib/domain-output-quality.mjs';
// Directory the script was launched from; artifact paths are built beneath it.
const repoRoot = process.cwd();
// Runtime base URL (ARIA_RUNTIME_URL or the local default) with trailing slashes removed.
const runtimeUrl = (process.env.ARIA_RUNTIME_URL || 'http://127.0.0.1:4319').replace(/\/+$/, '');
// ISO timestamp with ':' and '.' replaced by '-' so it can be used as a path segment.
const runId = new Date().toISOString().replaceAll(':', '-').replaceAll('.', '-');
// Per-run artifact directory and its `outputs` sub-directory.
const outputRoot = path.join(repoRoot, 'artifacts', 'quality-ab', runId);
const outputsDir = path.join(outputRoot, 'outputs');
// Case-insensitive, whole-word matcher for instruction vocabulary (fire, skill, hook, ...).
const promptInstructionRx = /\b(?:fire|skill|skills|autofire|qiyas|tadabbur|runtime|runtimes|load|autoload|packet|hook)\b/i;
// Frozen list of skill ids that the runtime arm attaches to every request as
// `requiredSkillIds` (top level and inside `metadata`). Order is preserved.
const benchmarkRequiredSkillIds = Object.freeze([
  'aria-cognition-autofire', 'aria-first-class-operating-contract', 'aria-cognition-batch',
  'aria-quality-audit', 'aria-repo-doctrine', 'aria-http-harness-client',
  'aria-harness-substrate-binding', 'aria-axioms-first-principles', 'never-guess',
  'tadabbur', 'tadabbur-ops', 'tafakkur',
  'qiyas-analogy', 'forge-quality-rules', 'aria-task-codex-executor',
  'aria-backend-architect', 'aria-harness-no-stripping', 'aria-readable-output',
  'aria-ledger-fleet-execution', 'predictor', 'aria-aristotle-pre-phase',
  'aria-aristotle-intra-phase', 'aria-aristotle-post-phase', 'mizan',
  'ghazali-8lens', 'aria-noor-cognitives', 'aria-business-frame',
  'aria-business-audit', 'aria-revenue-engine', 'aria-retention-engine',
  'aria-gtm-architect', 'aria-senior-code-cookbook', 'aria-senior-code-audit',
  'testing-strategy', 'security-review', 'architecture-decision',
  'api-design', 'observability', 'error-handling',
  'aria-decision-mizan', 'aria-aristotle-cognitives', 'aria-forge-guardrails',
  'aria-harness-deploy', 'aria-harness-onboarding', 'aria-harness-output-discipline',
  'aria-frontend-architect', 'aria-fullstack-orchestrator', 'aria-research-orchestrator',
  'cross-domain-24', 'fitrah-guard', 'noor-recognition',
  'ladunni-22', 'ijtihad-novel', 'ilham-intuition',
  'istiqra-induction', 'ruh-basis', 'soul-domains',
  'deepsoul-emotional', 'aria-repo-audit', 'aria-live-ops',
  'aria-memory-index', 'aria-ops',
]);
// System prompt sent unchanged to both the baseline provider and the runtime arm.
const baseSystemPrompt =
  'You are a pragmatic senior engineering agent.' +
  ' ' +
  'Answer the user directly with status, evidence boundary, risks, and the next concrete action.' +
  ' ' +
  'Do not claim completion unless the prompt includes evidence.';
// Fixed bank of 20 evaluation prompts.
// NOTE(review): the code that selects from this bank is outside the visible chunk.
const promptBank = [
  // Repo, deployment and closeout scenarios.
  'A repo has several dirty files and the owner asks what to do next. Produce a concise action recommendation with verification boundaries.',
  'A deployment route changed and the live endpoint may still be using old code. Explain the safest debugging sequence.',
  'A quality check reports that an answer is too vague. Rewrite the closeout so it is evidence-bound and owner-readable.',
  'A connector has a pre-provider preparation step and a post-answer validation step. Identify the highest-risk failure mode.',
  'A team wants marketing proof from a smoke run. State what evidence is acceptable and what claim must remain bounded.',
  'A long-running task ledger has completed four phases. Summarize what should be verified before trusting the result.',
  'A user is upset about lost time and unclear output. Draft the next response that reduces confusion and names the next action.',
  'A feature claims automatic improvement but only stores logs. Evaluate whether that proves behavior changed.',
  'A code reviewer sees completion language with no test output. Write the review finding and the required proof.',
  'A tool runner must avoid overloading a machine. Recommend how to schedule live checks and report progress.',
  // Measurement, QA-loop and reporting scenarios.
  'A model answer includes many internal receipts. Rewrite the owner-facing answer so it preserves evidence without dumping raw internals.',
  'A project needs a before-after measurement of answer quality. Define the metrics and success threshold.',
  'An integration uses a sidecar to prepare context before a model call. Explain how to prove the sidecar affected the answer.',
  'A QA loop finds a correctable issue. Describe the repair-first flow and how to record the lesson.',
  'A baseline answer asks what to do next. Produce a better answer that decides the next step with evidence boundaries.',
  'A live smoke passed but provider answer quality has not been measured. Give the honest status and next experiment.',
  'An output says production ready after one local check. Reframe the claim and identify missing evidence.',
  'A runtime can select many methods without placing their full text in the model prompt. Explain the quality and cost tradeoff.',
  'A release note needs to be useful to operators. Write the structure and verification requirements.',
  'A test produces screenshots and metrics. Explain how to turn that into defensible marketing evidence.',
];
/**
 * Create `dir` via `mkdirSync` with `recursive: true` and mode 0o755.
 * @param {string} dir - Directory path to create.
 */
function ensureDir(dir) {
  const creationOptions = { recursive: true, mode: 0o755 };
  mkdirSync(dir, creationOptions);
}
/**
 * Hex-encoded SHA-256 digest of `value` coerced to a string.
 * `null` and `undefined` are hashed as the empty string.
 * @param {unknown} value
 * @returns {string} 64-character lowercase hex digest.
 */
function sha256(value) {
  const text = String(value ?? '');
  const hasher = createHash('sha256');
  hasher.update(text);
  return hasher.digest('hex');
}
/**
 * Mask API-key-like substrings in free text before it is persisted.
 *
 * Two shapes are masked:
 *  - `sk-…` tokens of 8+ characters, only when `sk-` is NOT preceded by a
 *    letter or digit. Without that guard ordinary hyphenated words such as
 *    "risk-assessment" or "task-ledger-phases" were rewritten to
 *    "risk-[REDACTED]", corrupting prompts and answers in the artifacts.
 *  - `Bearer <token>` values of 8+ characters.
 *
 * @param {unknown} value - Text to scrub; null/undefined become ''.
 * @returns {string} The text with matching secrets replaced by placeholders.
 */
function redactText(value) {
  return String(value ?? '')
    .replace(/(?<![A-Za-z0-9])sk-[^"\s,}\]]{8,}/g, 'sk-[REDACTED]')
    .replace(/Bearer [^"\s,}\]]{8,}/g, 'Bearer [REDACTED]');
}
/**
 * Recursively copy `value`, masking anything that looks like a credential.
 *
 * - Arrays are mapped element by element.
 * - Strings are passed through `redactText`.
 * - Object properties whose key looks sensitive (api key, authorization,
 *   bearer, secret, token, password) are replaced with '[REDACTED]'.
 *   Booleans and numbers are exempt: they cannot hold a credential, and the
 *   key pattern also matches counters such as `max_tokens`, `prompt_tokens`
 *   and `totalTokens`, which previously had their values erased.
 * - Other primitives (and null/undefined) are returned unchanged.
 *
 * @param {unknown} value
 * @returns {unknown} A redacted copy; the input is not mutated.
 */
function redactRequest(value) {
  if (Array.isArray(value)) return value.map(redactRequest);
  if (typeof value === 'string') return redactText(value);
  if (!value || typeof value !== 'object') return value;
  const sensitiveKeyRx = /api[_-]?key|authorization|bearer|secret|token|password/i;
  return Object.fromEntries(Object.entries(value).map(([key, child]) => {
    const canHoldSecret = typeof child !== 'boolean' && typeof child !== 'number';
    if (canHoldSecret && sensitiveKeyRx.test(key)) return [key, '[REDACTED]'];
    return [key, redactRequest(child)];
  }));
}
/**
 * Write `value` to `filePath` as 2-space-indented JSON followed by a newline.
 * The value is passed through `redactRequest` before serialization.
 * @param {string} filePath
 * @param {unknown} value
 */
function writeJsonArtifact(filePath, value) {
  const scrubbed = redactRequest(value);
  const serialized = JSON.stringify(scrubbed, null, 2);
  writeFileSync(filePath, `${serialized}\n`);
}
/**
 * Resolve a numeric option from `--<name>=<value>` on the command line, else
 * from the `ARIA_AB_<NAME>` environment variable (dashes become underscores,
 * upper-cased), else from `fallback`. Non-finite values fall back to
 * `fallback`, and the result is clamped to [options.min, options.max].
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} fallback - Value used when nothing usable is supplied.
 * @param {{min?: number, max?: number}} [options] - Bounds; min defaults to 1,
 *   max to Number.MAX_SAFE_INTEGER.
 * @returns {number}
 */
function argNumber(name, fallback, options = {}) {
  const flag = `--${name}=`;
  const cliArg = process.argv.find((arg) => arg.startsWith(flag));
  let parsed;
  if (cliArg) {
    parsed = Number(cliArg.slice(flag.length));
  } else {
    const envName = `ARIA_AB_${name.replaceAll('-', '_').toUpperCase()}`;
    parsed = Number(process.env[envName] || fallback);
  }
  const lowerBound = Number.isFinite(options.min) ? options.min : 1;
  const upperBound = Number.isFinite(options.max) ? options.max : Number.MAX_SAFE_INTEGER;
  const candidate = Number.isFinite(parsed) ? parsed : fallback;
  return Math.min(upperBound, Math.max(lowerBound, candidate));
}
/**
 * Map `items` through an async `worker` with at most `concurrency` calls in
 * flight, returning results in input order.
 *
 * A concurrency that is zero, negative or not a number is treated as 1.
 * Previously such a value started no workers at all and the function resolved
 * to a sparse array without ever calling `worker`.
 *
 * @param {Array<unknown>} items
 * @param {number} concurrency - Maximum number of simultaneous worker calls.
 * @param {(item: unknown, index: number) => Promise<unknown>} worker
 * @returns {Promise<Array<unknown>>} Results positioned at their item's index.
 *   Rejects with the first worker error (Promise.all semantics).
 */
async function mapConcurrent(items, concurrency, worker) {
  const results = new Array(items.length);
  const requested = Number(concurrency);
  // `NaN >= 1` is false, so NaN, 0 and negatives all collapse to a single worker.
  const poolSize = Math.min(requested >= 1 ? Math.floor(requested) : 1, items.length);
  let next = 0;
  const drainQueue = async () => {
    // `next` is claimed synchronously before the await, so no two workers share an index.
    while (next < items.length) {
      const index = next;
      next += 1;
      results[index] = await worker(items[index], index);
    }
  };
  await Promise.all(Array.from({ length: poolSize }, () => drainQueue()));
  return results;
}
/**
 * Escape the five HTML-significant characters (& < > " ') in `value`.
 * @param {unknown} value - Coerced to string; null/undefined become ''.
 * @returns {string}
 */
function htmlEscape(value) {
  const entityFor = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  // One pass over the input, so an inserted '&' is never escaped a second time.
  return String(value ?? '').replace(/[&<>"']/g, (ch) => entityFor[ch]);
}
/**
 * Read and parse a JSON file on a best-effort basis.
 * @param {string} filePath
 * @returns {unknown|null} The parsed value, or null when the file is missing,
 *   unreadable, or not valid JSON.
 */
function readJsonIfExists(filePath) {
  if (existsSync(filePath)) {
    try {
      const contents = readFileSync(filePath, 'utf8');
      return JSON.parse(contents);
    } catch {
      // Deliberate: an unreadable or malformed file is treated like a missing one.
    }
  }
  return null;
}
/**
 * Return `value` when it is a non-null, non-array object; otherwise a fresh `{}`.
 * @param {unknown} value
 * @returns {object}
 */
function asRecord(value) {
  const isKeyedObject = Boolean(value) && typeof value === 'object' && !Array.isArray(value);
  if (isKeyedObject) return value;
  return {};
}
/**
 * Normalize the token-usage block of a provider response.
 *
 * Reads OpenAI-style names (`prompt_tokens` / `completion_tokens` /
 * `total_tokens`, snake or camel case) and falls back to the
 * `input_tokens` / `output_tokens` names. The Anthropic baseline in this file
 * feeds its response through this function, and with only the OpenAI names
 * its counts were always reported as 0. When no total is present it is
 * derived as prompt + completion.
 *
 * @param {object} [data] - Parsed response body; non-objects are treated as empty.
 * @returns {{promptTokens: number, completionTokens: number, totalTokens: number,
 *   reasoningTokens: number, allCompletionTokensSpentOnReasoning: boolean}}
 */
function completionUsageDetails(data = {}) {
  // asRecord on `data` keeps a `null` body from throwing on property access.
  const usage = asRecord(asRecord(data).usage);
  const completionDetails = asRecord(usage.completion_tokens_details || usage.completionTokensDetails);
  const promptTokens = Number(usage.prompt_tokens ?? usage.promptTokens ?? usage.input_tokens ?? 0);
  const completionTokens = Number(usage.completion_tokens ?? usage.completionTokens ?? usage.output_tokens ?? 0);
  const reasoningTokens = Number(completionDetails.reasoning_tokens ?? completionDetails.reasoningTokens ?? 0);
  const totalTokens = Number(usage.total_tokens ?? usage.totalTokens ?? (promptTokens + completionTokens));
  return {
    promptTokens,
    completionTokens,
    totalTokens,
    reasoningTokens,
    // True when the reported reasoning tokens account for every completion token.
    allCompletionTokensSpentOnReasoning: completionTokens > 0 && reasoningTokens >= completionTokens,
  };
}
/**
 * Summarize the provider configuration without exposing any secret value.
 *
 * Provider precedence: ARIA_AB_PROVIDER, then `model.provider` and
 * `defaultProvider` from ~/.aria/config.json, then 'xai'. The local config's
 * base URL and API key are only considered when its provider matches the
 * selected one (case-insensitive).
 *
 * @returns {{provider: string, model: string, baseUrl: string, apiKeyPresent: boolean}}
 */
function safeConfigSummary() {
  const env = process.env;
  const stored = readJsonIfExists(path.join(os.homedir(), '.aria', 'config.json')) || {};
  const model = stored.model && typeof stored.model === 'object' ? stored.model : {};
  const selectedProvider = env.ARIA_AB_PROVIDER || model.provider || stored.defaultProvider || 'xai';
  const localProviderMatches =
    String(model.provider || '').toLowerCase() === String(selectedProvider || '').toLowerCase();

  // Is a provider-specific key exported in the environment? (exact, case-sensitive id)
  let providerEnvKeyPresent;
  switch (selectedProvider) {
    case 'xai':
      providerEnvKeyPresent = Boolean(env.XAI_API_KEY || env.GROK_API_KEY);
      break;
    case 'deepseek':
      providerEnvKeyPresent = Boolean(env.DEEPSEEK_API_KEY || env.ARIA_DEEPSEEK_API_KEY);
      break;
    case 'anthropic':
      providerEnvKeyPresent = Boolean(env.ANTHROPIC_API_KEY);
      break;
    case 'openai':
      providerEnvKeyPresent = Boolean(env.OPENAI_API_KEY);
      break;
    case 'openrouter':
      providerEnvKeyPresent = Boolean(env.OPENROUTER_API_KEY);
      break;
    default:
      providerEnvKeyPresent = false;
  }

  const localBaseUrl = localProviderMatches ? model.baseUrl : '';
  const localKey = localProviderMatches && model.apiKey;
  return {
    provider: selectedProvider,
    model: env.ARIA_AB_MODEL || model.model || 'grok-4-3',
    baseUrl: env.ARIA_AB_BASE_URL || localBaseUrl || '',
    apiKeyPresent: Boolean(env.ARIA_AB_API_KEY || localKey || providerEnvKeyPresent),
  };
}
/**
 * Whether calls to `provider` need an API key. Only 'ollama'
 * (case-insensitive) is treated as keyless.
 * @param {string} provider
 * @returns {boolean}
 */
function providerRequiresApiKey(provider) {
  const keylessProviders = ['ollama'];
  const normalized = String(provider || '').toLowerCase();
  return !keylessProviders.includes(normalized);
}
/**
 * Resolve the chat-completions endpoint for the baseline provider call.
 *
 * An explicit `config.baseUrl` wins; otherwise a per-provider default is
 * used, and unknown providers fall back to the xAI endpoint.
 *
 * Fixes over the previous version:
 *  - trailing slashes are stripped BEFORE checking for `/chat/completions`,
 *    so `…/chat/completions/` no longer becomes
 *    `…/chat/completions/chat/completions`;
 *  - `DEEPSEEK_API_BASE` gets the same normalization as `baseUrl`, so a plain
 *    API base gets the suffix appended; a value that already ends in
 *    `/chat/completions` is unchanged.
 *
 * @param {{baseUrl?: string, provider?: string}} config
 * @returns {string} Absolute URL to POST chat completions to.
 */
function providerUrl(config) {
  const toChatCompletionsUrl = (base) => {
    const trimmed = String(base).replace(/\/+$/, '');
    return trimmed.endsWith('/chat/completions') ? trimmed : `${trimmed}/chat/completions`;
  };
  if (config.baseUrl) return toChatCompletionsUrl(config.baseUrl);
  if (config.provider === 'xai') return 'https://api.x.ai/v1/chat/completions';
  if (config.provider === 'deepseek') {
    return process.env.DEEPSEEK_API_BASE
      ? toChatCompletionsUrl(process.env.DEEPSEEK_API_BASE)
      : 'https://api.deepseek.com/v1/chat/completions';
  }
  if (config.provider === 'openai') return 'https://api.openai.com/v1/chat/completions';
  if (config.provider === 'openrouter') return 'https://openrouter.ai/api/v1/chat/completions';
  return 'https://api.x.ai/v1/chat/completions';
}
/**
 * Translate the configured model id into the id sent to the provider.
 * Only the xAI alias 'grok-4-3' is rewritten (to 'grok-4.3'); every other
 * model id is returned unchanged.
 * @param {{provider?: string, model?: string}} config
 * @returns {string|undefined}
 */
function providerNativeModel(config) {
  const { provider, model } = config;
  const isGrokAlias = provider === 'xai' && model === 'grok-4-3';
  return isGrokAlias ? 'grok-4.3' : model;
}
/**
 * Resolve the API key for `config.provider`.
 *
 * Precedence: ARIA_AB_API_KEY, then the key stored in ~/.aria/config.json
 * when its provider matches, then the provider's environment variable(s).
 *
 * Brought in line with `safeConfigSummary`, which could report
 * `apiKeyPresent: true` while this function still returned '':
 *  - the local-config provider is compared case-insensitively;
 *  - xAI also accepts GROK_API_KEY;
 *  - OpenRouter reads OPENROUTER_API_KEY.
 *
 * @param {{provider?: string}} config
 * @returns {string} The key, or '' when none is configured.
 */
function providerApiKey(config) {
  const env = process.env;
  const provider = String(config.provider || '').toLowerCase();
  if (env.ARIA_AB_API_KEY) return env.ARIA_AB_API_KEY;
  const local = readJsonIfExists(path.join(os.homedir(), '.aria', 'config.json')) || {};
  const model = local.model && typeof local.model === 'object' ? local.model : {};
  // Only lower-case real strings so a missing provider never matches ''.
  const localProvider = typeof model.provider === 'string' ? model.provider.toLowerCase() : model.provider;
  if (localProvider === provider && model.apiKey) return model.apiKey;
  if (provider === 'xai') return env.XAI_API_KEY || env.GROK_API_KEY || '';
  if (provider === 'deepseek') return env.DEEPSEEK_API_KEY || env.ARIA_DEEPSEEK_API_KEY || '';
  if (provider === 'anthropic') return env.ANTHROPIC_API_KEY || '';
  if (provider === 'openai') return env.OPENAI_API_KEY || '';
  if (provider === 'openrouter') return env.OPENROUTER_API_KEY || '';
  return '';
}
/**
 * Pull the first choice's message content out of an OpenAI-style response.
 * @param {object} data - Parsed response body (may be null/undefined).
 * @returns {*} The content when truthy, otherwise ''.
 */
function extractOpenAiText(data) {
  const firstChoice = data?.choices?.[0];
  const content = firstChoice?.message?.content;
  return content || '';
}
/**
 * Flatten the `aria` sidecar block of a runtime response into the counters
 * and hashes used to judge runtime evidence.
 *
 * Snake-case fields directly on `aria` take precedence over the camel-case
 * equivalents under `aria.raw`. Missing pieces default to 0, null, false or
 * an empty object.
 *
 * @param {object} data - Parsed runtime response body (may be null/undefined).
 * @returns {object} Summary containing `blocked`, `releaseDecision`,
 *   `qaAsGate`, `runtimeEvidence` plus derived flags, hashes and counts.
 */
function extractRuntimeExtra(data) {
  const isObjectLike = (candidate) => Boolean(candidate) && typeof candidate === 'object';
  const toCount = (candidate) => Number(candidate || 0);
  // First holder exposing an array under `.records` wins; otherwise an empty list.
  const recordsFrom = (...holders) => {
    for (const holder of holders) {
      if (Array.isArray(holder?.records)) return holder.records;
    }
    return [];
  };

  const aria = isObjectLike(data?.aria) ? data.aria : {};
  const raw = isObjectLike(aria.raw) ? aria.raw : {};
  const runtimeEvidence = asRecord(aria.runtime_evidence || raw.runtimeEvidence);
  const { atlas, sentinel, turnCapabilityGrant, cognitiveRuntimeReceipts, learning } = runtimeEvidence;
  const ledgerRecords = recordsFrom(aria.runtime_ledger, raw.runtimeLedger);
  const coachRecords = recordsFrom(aria.coach_kernel, raw.coachKernel);
  const gate = raw.qaAsGate;

  return {
    blocked: aria.blocked === true || raw.blocked === true,
    releaseDecision: gate?.releaseDecision || null,
    qaAsGate: gate || null,
    runtimeEvidence,
    runtimeEvidencePresent: Object.keys(runtimeEvidence).length > 0,
    runtimeEvidenceOk: runtimeEvidence.ok === true,
    atlasPresent: Boolean(atlas?.receiptHash || atlas?.contextHash || atlas?.ok || atlas?.atlas?.contextHash),
    sentinelPassportHash: sentinel?.passportHash || null,
    sentinelCompilationHash: sentinel?.compilationHash || null,
    turnCapabilityGrantHash: turnCapabilityGrant?.grantHash || null,
    firedSkillCount: toCount(runtimeEvidence.firedSkillCount),
    activeRuntimeCount: toCount(runtimeEvidence.activeRuntimeCount),
    phaseExecutionReceiptCount: toCount(runtimeEvidence.phaseExecutionReceiptCount),
    qiyasPerspectiveCount: toCount(cognitiveRuntimeReceipts?.qiyasPerspectiveCount),
    tadabburStageCount: toCount(cognitiveRuntimeReceipts?.tadabburStageCount),
    learningFeedsNextSelection: learning?.learningFeedsNextSelection === true,
    managedLedgerRecords: ledgerRecords.length,
    coachRecords: coachRecords.length,
    sidecarPresent: Boolean(data?.aria),
  };
}
/**
 * Run `fn` up to `attempts` times, resolving with the first successful result.
 * Retries are immediate (no delay between attempts).
 *
 * When every attempt fails, the thrown Error keeps the original message
 * format and now carries the last failure as `cause`, so the underlying
 * error and its stack are not lost.
 *
 * @param {string} label - Prefix for the final error message.
 * @param {(attempt: number) => Promise<unknown>} fn - Receives the 1-based attempt number.
 * @param {number} [attempts=3] - Maximum number of calls to `fn`.
 * @returns {Promise<unknown>} Whatever `fn` resolves with.
 * @throws {Error} `<label> failed after <attempts> attempts: <reason>`.
 */
async function retryProviderCall(label, fn, attempts = 3) {
  let lastError = null;
  for (let attempt = 1; attempt <= attempts; attempt += 1) {
    try {
      return await fn(attempt);
    } catch (error) {
      lastError = error;
    }
  }
  const reason = lastError instanceof Error ? lastError.message : String(lastError);
  const options = lastError === null ? undefined : { cause: lastError };
  throw new Error(`${label} failed after ${attempts} attempts: ${reason}`, options);
}
/**
 * Send one prompt straight to the configured provider (baseline arm).
 *
 * Anthropic is delegated to `callAnthropicDirect`; every other provider is
 * called through an OpenAI-style chat-completions endpoint and retried via
 * `retryProviderCall`.
 *
 * @param {{config: object, prompt: string, index: number}} args
 * @returns {Promise<object>} Answer text, timing, usage, raw response text and
 *   hash, redacted request body, request hash and the attempt that succeeded.
 * @throws {Error} When a required API key is missing, or after all attempts fail.
 */
async function callBaselineProvider({ config, prompt, index }) {
  if (config.provider === 'anthropic') {
    return callAnthropicDirect({ config, prompt, index });
  }

  const url = providerUrl(config);
  const apiKey = providerApiKey(config);
  const keyMissing = providerRequiresApiKey(config.provider) && !apiKey;
  if (keyMissing) {
    throw new Error(`missing API key for provider ${config.provider}`);
  }

  // Headers do not depend on the attempt, so they are assembled once.
  const headers = { 'content-type': 'application/json' };
  if (apiKey) headers.authorization = `Bearer ${apiKey}`;
  if (config.provider === 'openrouter') {
    headers['HTTP-Referer'] = 'http://127.0.0.1';
    headers['X-Title'] = 'Aria Quality AB';
  }

  const attemptOnce = async (attempt) => {
    const started = Date.now();
    const requestBody = {
      model: providerNativeModel(config),
      messages: [
        { role: 'system', content: baseSystemPrompt },
        { role: 'user', content: prompt },
      ],
      max_tokens: Number(process.env.ARIA_AB_MAX_TOKENS || 700),
      temperature: Number(process.env.ARIA_AB_TEMPERATURE || 0.2),
      stream: false,
      metadata: { source: 'quality-ab-live-provider', arm: 'baseline', index, attempt },
    };
    const response = await fetch(url, { method: 'POST', headers, body: JSON.stringify(requestBody) });
    const rawText = await response.text();

    // Non-JSON bodies are preserved under `raw` instead of failing the parse.
    let data;
    try {
      data = rawText ? JSON.parse(rawText) : {};
    } catch {
      data = { raw: rawText };
    }
    if (!response.ok) throw new Error(`baseline provider ${response.status}: ${rawText.slice(0, 500)}`);

    return {
      text: extractOpenAiText(data),
      durationMs: Date.now() - started,
      provider: config.provider,
      model: config.model,
      usage: data.usage || null,
      usageDetails: completionUsageDetails(data),
      rawSha256: sha256(rawText),
      rawText,
      requestBody: redactRequest(requestBody),
      requestSha256: sha256(JSON.stringify(requestBody)),
      attempts: attempt,
    };
  };

  return retryProviderCall(`baseline provider prompt ${index}`, attemptOnce);
}
/**
 * Send one prompt to the Anthropic Messages API (baseline arm).
 *
 * Changes from the previous version:
 *  - The call goes through `retryProviderCall`, like the other provider
 *    calls in this file, and the result reports `attempts`.
 *  - The request no longer carries a `metadata` object with
 *    source/arm/index fields. NOTE(review): the Messages API documents only
 *    `metadata.user_id` and is expected to reject other fields — confirm
 *    against the current API reference.
 *
 * @param {{config: object, prompt: string, index: number}} args
 * @returns {Promise<object>} Same result shape as `callBaselineProvider`.
 * @throws {Error} When no API key is configured, or after all attempts fail.
 */
async function callAnthropicDirect({ config, prompt, index }) {
  const apiKey = providerApiKey(config);
  if (!apiKey) throw new Error('missing API key for provider anthropic');
  const endpoint = config.baseUrl || 'https://api.anthropic.com/v1/messages';
  return retryProviderCall(`baseline anthropic prompt ${index}`, async (attempt) => {
    const started = Date.now();
    const requestBody = {
      model: config.model,
      max_tokens: Number(process.env.ARIA_AB_MAX_TOKENS || 700),
      system: baseSystemPrompt,
      messages: [{ role: 'user', content: prompt }],
    };
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'content-type': 'application/json',
        'x-api-key': apiKey,
        'anthropic-version': '2023-06-01',
      },
      body: JSON.stringify(requestBody),
    });
    const rawText = await response.text();
    // Non-JSON bodies are preserved under `raw` instead of failing the parse.
    let data = {};
    try {
      data = rawText ? JSON.parse(rawText) : {};
    } catch {
      data = { raw: rawText };
    }
    if (!response.ok) throw new Error(`baseline anthropic ${response.status}: ${rawText.slice(0, 500)}`);
    // Only `text` content parts contribute to the visible answer.
    const text = Array.isArray(data.content)
      ? data.content.filter((part) => part?.type === 'text').map((part) => part.text).join('')
      : '';
    return {
      text,
      durationMs: Date.now() - started,
      provider: config.provider,
      model: providerNativeModel(config),
      usage: data.usage || null,
      usageDetails: completionUsageDetails(data),
      rawSha256: sha256(rawText),
      rawText,
      requestBody: redactRequest(requestBody),
      requestSha256: sha256(JSON.stringify(requestBody)),
      attempts: attempt,
    };
  });
}
/**
 * Send one prompt through the Aria runtime (substrate arm) at
 * `${runtimeUrl}/v1/chat/completions`, retried via `retryProviderCall`.
 *
 * The request carries the provider settings under `llm`, the benchmark's
 * required skill ids (top level and in `metadata`), `ariaDebug: true` and
 * `allowProviderFallback: false`.
 *
 * @param {{config: object, prompt: string, index: number}} args
 * @returns {Promise<object>} Baseline-style result plus `runtimeExtra`, the
 *   sidecar summary from `extractRuntimeExtra`.
 * @throws {Error} After all attempts fail.
 */
async function callRuntimeArm({ config, prompt, index }) {
  const endpoint = `${runtimeUrl}/v1/chat/completions`;

  const attemptOnce = async (attempt) => {
    const started = Date.now();
    const llm = {
      provider: config.provider,
      model: config.model,
      baseUrl: config.baseUrl || undefined,
      apiKey: providerApiKey(config) || undefined,
    };
    const metadata = {
      source: 'quality-ab-live-provider',
      arm: 'mechanical_substrate',
      index,
      attempt,
      universalTurnPacket: true,
      skillExecutionMode: 'mechanical-receipt',
      requiredSkillIds: benchmarkRequiredSkillIds,
    };
    const requestBody = {
      model: config.model,
      llm,
      messages: [
        { role: 'system', content: baseSystemPrompt },
        { role: 'user', content: prompt },
      ],
      max_tokens: Number(process.env.ARIA_AB_MAX_TOKENS || 700),
      temperature: Number(process.env.ARIA_AB_TEMPERATURE || 0.2),
      stream: false,
      metadata,
      requiredSkillIds: benchmarkRequiredSkillIds,
      ariaDebug: true,
      allowProviderFallback: false,
    };

    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify(requestBody),
    });
    const rawText = await response.text();

    // Non-JSON bodies are preserved under `raw` instead of failing the parse.
    let data;
    try {
      data = rawText ? JSON.parse(rawText) : {};
    } catch {
      data = { raw: rawText };
    }
    if (!response.ok) throw new Error(`runtime arm ${response.status}: ${rawText.slice(0, 500)}`);

    // The body embeds the API key, so both the stored copy and its hash use the redacted form.
    const redactedBody = redactRequest(requestBody);
    return {
      text: extractOpenAiText(data),
      durationMs: Date.now() - started,
      provider: data?.choices?.[0]?.message?.provider || config.provider,
      model: data?.model || config.model,
      usage: data?.usage || null,
      usageDetails: completionUsageDetails(data),
      rawSha256: sha256(rawText),
      rawText,
      requestBody: redactedBody,
      requestSha256: sha256(JSON.stringify(redactRequest(requestBody))),
      attempts: attempt,
      runtimeExtra: extractRuntimeExtra(data),
    };
  };

  return retryProviderCall(`runtime provider prompt ${index}`, attemptOnce);
}
/**
 * Heuristic tie-break score for an answer, rounded to 3 decimals.
 *
 * Weights: directness 1.5, evidence-boundary wording 1.5, causal wording 1,
 * up to 4 command mentions x 0.4, up to 3 numeric-evidence tokens x 0.5,
 * up to 6 distinct user terms (longer than 4 chars) echoed x 0.2, a length
 * term (1 for 220-1200 chars, 0.5 up to 2200, otherwise -1) and a 2.5
 * penalty when a raw `<cognition>` / `<applied_cognition>` tag is visible.
 *
 * Fix: the numeric-evidence pattern used to require a word boundary after
 * every alternative, including the percentage one. A `%` followed by a
 * space, punctuation or end of text is not at a word boundary, so values
 * such as "95%" were never counted. The boundary is now applied per
 * alternative and omitted after `%`.
 *
 * @param {unknown} text - Answer text to score.
 * @param {unknown} userText - The user's prompt, used for term overlap.
 * @returns {number}
 */
function fineGrainedScore(text, userText) {
  const source = String(text || '');
  // Lower-cased once instead of once per user term.
  const lowerSource = source.toLowerCase();
  const countMatches = (rx) => (source.match(rx) || []).length;
  const flag = (rx) => (rx.test(source) ? 1 : 0);

  const commandCount = countMatches(/`[^`]+`|\b(?:curl|kubectl|git|npm|node|sha256sum|systemctl|journalctl)\b/g);
  const numericEvidenceCount = countMatches(/\b(?:\d+\/\d+\b|\d+(?:\.\d+)?%|sha256:[a-f0-9]+\b|[a-f0-9]{12,}\b|exit=0\b|status=\b)/gi);
  const userTerms = String(userText || '').toLowerCase().split(/[^a-z0-9]+/).filter((term) => term.length > 4);
  const userTermHits = new Set(userTerms.filter((term) => lowerSource.includes(term))).size;
  const rawCognition = /<\s*(?:cognition|applied_cognition)\b/i.test(source);
  const directness = flag(/\b(?:status|recommendation|decision|next action|next step)\b/i);
  const bounded = flag(/\b(?:evidence boundary|not verified|bounded|observed|unverified|no logs|no tests)\b/i);
  const firstClassDelta = flag(/\b(?:because|so that|therefore|this means|root cause|failure mode|predicate)\b/i);
  const antiBloat = source.length >= 220 && source.length <= 1200 ? 1 : source.length <= 2200 ? 0.5 : -1;
  return Number((
    directness * 1.5 +
    bounded * 1.5 +
    firstClassDelta +
    Math.min(commandCount, 4) * 0.4 +
    Math.min(numericEvidenceCount, 3) * 0.5 +
    Math.min(userTermHits, 6) * 0.2 +
    antiBloat -
    (rawCognition ? 2.5 : 0)
  ).toFixed(3));
}
/**
 * Score an answer against the 14-check rubric and the domain quality analyzer.
 *
 * `score` is the number of passed checks minus penalties (2 per domain
 * blocker, 0.5 per domain warning, and the full check count when the text is
 * a runtime "blocked" notice), floored at 0. `fineScore` is the tie-break
 * value from `fineGrainedScore`.
 *
 * @param {unknown} text - Answer text.
 * @param {unknown} userText - Prompt passed to the analyzer and the fine scorer.
 * @returns {object} score, fineScore, maxScore, per-check booleans,
 *   blocker/warning counts and lists, repairs, character count and text hash.
 */
function scoreOutput(text, userText) {
  const source = String(text || '');
  const matches = (rx) => rx.test(source);

  const runtimeBlockedOutput = matches(/\bAria runtime blocked this output\b|\bRemaining blockers:\b|\bRecovery attempts:\b/i);
  const visibleCognitionDump = matches(/<\s*(?:cognition|applied_cognition)\b/i);
  const nonBlankLines = source.split('\n').filter((line) => line.trim());
  const withinReadableLength = source.length >= 180 && source.length <= 2400;

  const checks = {
    notRuntimeBlock: !runtimeBlockedOutput,
    concreteNextAction: matches(/\b(next action|next step|do this|implement|verify|run|inspect|measure)\b/i),
    evidenceBoundary: matches(/\b(evidence|observed|verified|unverified|bounded|not measured|proof|artifact|metric)\b/i),
    expectedObserved: matches(/\b(expected|observed|predicate|actual|pass|fail|threshold)\b/i),
    qaCorrection: matches(/\b(QA|finding|correct|correction|repair|harden|verify|re-test)\b/i),
    riskTradeoff: matches(/\b(risk|tradeoff|failure mode|blast radius|cost|latency|scope)\b/i),
    ownerReadable: withinReadableLength && matches(/\n|:/),
    avoidsAskOnly: !matches(/\bwhat would you like me to do next\b|\bhow would you like me to proceed\b/i),
    avoidsFalseClosure: !matches(/\b(production ready|fully done|guaranteed|complete)\b/i),
    avoidsRawInternals: !matches(/\b(skillExecutionReceipt|phaseReceipts|executedOperatorIdsHash|raw json)\b/i),
    avoidsVisibleCognitionDump: !visibleCognitionDump,
    learningLoop: matches(/\b(lesson|learning|next selection|feedback|loop|reflexion|record)\b/i),
    specificVerification: matches(/\b(exit=0|100\/100|\d+\/\d+|status=|sha256|screenshot|metric|test|smoke|ledger)\b/i),
    conciseStructure: nonBlankLines.length >= 2,
  };

  const checkNames = Object.keys(checks);
  const domain = analyzeDomainOutputQuality(source, { userText });
  const passedCount = checkNames.filter((name) => checks[name]).length;

  // A runtime "blocked" notice forfeits every rubric point.
  let penalty = domain.blockers.length * 2 + domain.warnings.length * 0.5;
  if (runtimeBlockedOutput) penalty += checkNames.length;

  return {
    score: Math.max(0, passedCount - penalty),
    fineScore: fineGrainedScore(source, userText),
    maxScore: checkNames.length,
    checks,
    blockerCount: domain.blockers.length,
    warningCount: domain.warnings.length,
    blockers: domain.blockers,
    warnings: domain.warnings,
    repairs: domain.repairs,
    chars: source.length,
    sha256: sha256(source),
  };
}
/**
 * Compare the substrate answer with the baseline answer for one prompt.
 *
 * The coarse rubric delta decides the winner; when it is exactly 0 the
 * fine-grained delta is used instead. A zero or negative overall delta is a
 * baseline win (there is no 'tie' outcome). Every shortfall is listed in
 * `gapFindings`, and `firstClassGap` is true whenever the substrate did not
 * win outright on the coarse rubric with nothing to report.
 *
 * @param {{baseline: object, substrate: object}} row - Both arms with `text`
 *   and `score`; the substrate arm may also carry `runtimeExtra`.
 * @returns {{delta: number, primaryDelta: number, fineDelta: number,
 *   winner: string, firstClassGap: boolean, visibleOutputFailure: boolean,
 *   runtimeEvidenceComplete: boolean, gapFindings: string[]}}
 */
function classifyPair(row) {
  const { baseline, substrate } = row;
  const primaryDelta = substrate.score.score - baseline.score.score;
  const fineDelta = Number((substrate.score.fineScore - baseline.score.fineScore).toFixed(3));
  const delta = primaryDelta === 0 ? fineDelta : primaryDelta;

  const runtime = substrate.runtimeExtra || {};
  const baselineVisibleFailure = String(baseline?.text || '').trim() === '';
  const substrateVisibleFailure = String(substrate?.text || '').trim() === '';

  // Minimum counts the runtime must report before its evidence counts as complete.
  const minimumCounts = {
    firedSkillCount: 50,
    activeRuntimeCount: 16,
    phaseExecutionReceiptCount: 6,
    qiyasPerspectiveCount: 15,
    tadabburStageCount: 12,
  };
  const substrateRuntimeComplete =
    runtime.runtimeEvidencePresent === true &&
    Object.entries(minimumCounts).every(([field, minimum]) => runtime[field] >= minimum) &&
    Boolean(runtime.sentinelPassportHash) &&
    Boolean(runtime.turnCapabilityGrantHash);

  const { checks } = substrate.score;
  // [condition, finding] pairs in reporting order; only the pairs that apply are kept.
  const candidateFindings = [
    [primaryDelta === 0, 'primary rubric tie: substrate did not create a coarse visible quality lead'],
    [baselineVisibleFailure, 'baseline produced no visible answer text'],
    [substrateVisibleFailure, 'substrate produced no visible answer text'],
    [checks.avoidsVisibleCognitionDump === false, 'raw cognition leaked into owner-facing substrate answer'],
    [checks.learningLoop === false, 'substrate answer lacks explicit learning/reflexion loop signal'],
    [checks.specificVerification === false, 'substrate answer lacks specific measurable verification evidence'],
    [
      !substrateRuntimeComplete,
      `substrate runtime evidence incomplete: skills=${runtime.firedSkillCount || 0}/50 runtimes=${runtime.activeRuntimeCount || 0}/16 phases=${runtime.phaseExecutionReceiptCount || 0}/6 qiyas=${runtime.qiyasPerspectiveCount || 0}/15 tadabbur=${runtime.tadabburStageCount || 0}/12`,
    ],
    [
      runtime.blocked && runtime.qaAsGate?.hardStop !== true,
      'substrate blocked a non-hard-stop quality row instead of releasing with QA/correction findings',
    ],
    [
      substrate.score.score <= baseline.score.score && fineDelta <= 0,
      'substrate failed to beat baseline after fine-grained tie-break',
    ],
  ];
  const gapFindings = candidateFindings
    .filter(([applies]) => applies)
    .map(([, finding]) => finding);

  return {
    delta,
    primaryDelta,
    fineDelta,
    winner: delta > 0 ? 'substrate' : 'baseline',
    firstClassGap: delta <= 0 || primaryDelta === 0 || gapFindings.length > 0,
    visibleOutputFailure: baselineVisibleFailure || substrateVisibleFailure,
    runtimeEvidenceComplete: substrateRuntimeComplete,
    gapFindings,
  };
}
/**
 * Arithmetic mean of `values`; 0 for an empty list.
 * @param {number[]} values
 * @returns {number}
 */
function mean(values) {
  if (!values.length) return 0;
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / values.length;
}
/**
 * Median of `values`; 0 for an empty list. The input is not mutated.
 *
 * For an even number of values the result is the average of the two middle
 * values. The previous version returned the upper middle value, which
 * skewed the reported median delta upward for even-sized runs.
 *
 * @param {number[]} values
 * @returns {number}
 */
function median(values) {
  if (!values.length) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  const upperMiddle = Math.floor(sorted.length / 2);
  if (sorted.length % 2 === 1) return sorted[upperMiddle];
  return (sorted[upperMiddle - 1] + sorted[upperMiddle]) / 2;
}
/**
 * Aggregate classified rows into run-level statistics.
 *
 * Means and the median are rounded to 3 decimals. `ties` is always 0 because
 * each row's `winner` is either 'substrate' or 'baseline'; coarse-rubric
 * ties are reported separately as `coarseTies`.
 *
 * @param {Array<object>} rows - Rows carrying `delta`, `winner`,
 *   `primaryDelta`, the gap/evidence flags and both arms' `score` objects.
 * @returns {object} Counts, means and totals for the run.
 */
function pairedStats(rows) {
  const round3 = (value) => Number(value.toFixed(3));
  const countWhere = (predicate) => rows.filter(predicate).length;
  const meanOf = (select) => round3(mean(rows.map(select)));
  const totalOf = (select) => rows.reduce((sum, row) => sum + select(row), 0);
  const deltas = rows.map((row) => row.delta);

  return {
    meanDelta: round3(mean(deltas)),
    medianDelta: round3(median(deltas)),
    wins: countWhere((row) => row.winner === 'substrate'),
    losses: countWhere((row) => row.winner === 'baseline'),
    ties: 0,
    coarseTies: countWhere((row) => row.primaryDelta === 0),
    firstClassGapRows: countWhere((row) => row.firstClassGap),
    baselineMean: meanOf((row) => row.baseline.score.score),
    substrateMean: meanOf((row) => row.substrate.score.score),
    baselineFineMean: meanOf((row) => row.baseline.score.fineScore),
    substrateFineMean: meanOf((row) => row.substrate.score.fineScore),
    baselineBlockers: totalOf((row) => row.baseline.score.blockerCount),
    substrateBlockers: totalOf((row) => row.substrate.score.blockerCount),
    baselineWarnings: totalOf((row) => row.baseline.score.warningCount),
    substrateWarnings: totalOf((row) => row.substrate.score.warningCount),
    runtimeEvidenceCompleteRows: countWhere((row) => row.runtimeEvidenceComplete),
    visibleOutputFailureRows: countWhere((row) => row.visibleOutputFailure),
    substrateHardBlockRows: countWhere((row) => row.substrate.runtimeExtra?.blocked === true),
    substrateReleasedWithQaRows: countWhere((row) => row.substrate.runtimeExtra?.qaAsGate?.releaseDecision === 'released_with_qa_findings'),
  };
}
/**
 * Render the run summary and per-prompt results as a Markdown report.
 *
 * Free-text table cells (gap findings, prompt) are HTML-escaped and have `|`
 * backslash-escaped so they cannot break the table.
 *
 * @param {object} summary - Run summary with `runId`, `status`, `provider`,
 *   `totalPrompts`, optional `concurrency`, `stats` and `evidenceBoundary`.
 * @param {Array<object>} rows - Classified rows, one table line each.
 * @returns {string} Markdown text ending with a newline.
 */
function renderMarkdown(summary, rows) {
  const { stats } = summary;
  const fixed2 = (value) => value.toFixed(2);
  const tableCell = (value) => htmlEscape(value).replaceAll('|', '\\|');

  const header = [
    '# Aria Quality A/B Live Provider Eval',
    '',
    `- run_id: ${summary.runId}`,
    `- status: ${summary.status}`,
    `- provider: ${summary.provider.provider}`,
    `- model: ${summary.provider.model}`,
    `- prompts: ${summary.totalPrompts}`,
    `- concurrency: ${summary.concurrency || 1}`,
    `- substrate wins/ties/losses: ${stats.wins}/${stats.ties}/${stats.losses}`,
    `- coarse ties reclassified as gaps: ${stats.coarseTies}`,
    `- first-class gap rows: ${stats.firstClassGapRows}`,
    `- baseline mean score: ${stats.baselineMean}`,
    `- substrate mean score: ${stats.substrateMean}`,
    `- baseline fine mean: ${stats.baselineFineMean}`,
    `- substrate fine mean: ${stats.substrateFineMean}`,
    `- mean delta: ${stats.meanDelta}`,
    `- baseline blockers/warnings: ${stats.baselineBlockers}/${stats.baselineWarnings}`,
    `- substrate blockers/warnings: ${stats.substrateBlockers}/${stats.substrateWarnings}`,
    `- runtime evidence complete rows: ${stats.runtimeEvidenceCompleteRows}/${summary.totalPrompts}`,
    `- visible output failure rows: ${stats.visibleOutputFailureRows}`,
    `- substrate hard-block rows: ${stats.substrateHardBlockRows}`,
    `- substrate released-with-QA rows: ${stats.substrateReleasedWithQaRows}`,
    `- evidence boundary: ${summary.evidenceBoundary}`,
    '',
    '## Prompt Results',
    '',
    '| # | Delta | Primary | Fine | Baseline | Substrate | Winner | Gap | Prompt |',
    '|---:|---:|---:|---:|---:|---:|---|---|---|',
  ];

  const tableLines = rows.map((row) => {
    const gapCell = row.gapFindings.length ? tableCell(row.gapFindings.join('; ')) : '-';
    const cells = [
      `${row.index}`,
      fixed2(row.delta),
      fixed2(row.primaryDelta),
      fixed2(row.fineDelta),
      fixed2(row.baseline.score.score),
      fixed2(row.substrate.score.score),
      `${row.winner}`,
      gapCell,
      tableCell(row.prompt),
    ];
    return `| ${cells.join(' | ')} |`;
  });

  return [...header, ...tableLines, ''].join('\n');
}
/**
 * Assemble the evidence manifest for a run: how it was invoked, which API-key
 * environment variables were set (presence only, never values), a per-row
 * index of artifacts and scores, and the run summary.
 *
 * @param {{summary: object, rows: Array<object>, artifacts: object}} args
 * @returns {object} Manifest with schema
 *   'aria.quality_ab_live_provider.evidence_manifest.v1'.
 */
function buildEvidenceManifest({ summary, rows, artifacts }) {
  // Fields recorded identically for both arms of a row.
  const describeArm = (arm) => ({
    provider: arm.provider,
    model: arm.model,
    durationMs: arm.durationMs,
    attempts: arm.attempts || null,
    requestPath: arm.requestPath,
    requestSha256: arm.requestSha256,
    textPath: arm.textPath,
    rawPath: arm.rawPath,
    rawSha256: arm.rawSha256,
    score: arm.score,
  });

  const trackedEnvKeys = [
    'ARIA_AB_API_KEY',
    'DEEPSEEK_API_KEY',
    'ARIA_DEEPSEEK_API_KEY',
    'XAI_API_KEY',
    'GROK_API_KEY',
    'ANTHROPIC_API_KEY',
    'OPENAI_API_KEY',
  ];
  const envKeyPresence = Object.fromEntries(
    trackedEnvKeys.map((name) => [name, Boolean(process.env[name])]),
  );

  const rowArtifacts = rows.map((row) => ({
    index: row.index,
    promptSha256: row.promptSha256,
    inputPath: row.inputPath,
    winner: row.winner,
    delta: row.delta,
    primaryDelta: row.primaryDelta,
    fineDelta: row.fineDelta,
    firstClassGap: row.firstClassGap,
    gapFindings: row.gapFindings,
    baseline: describeArm(row.baseline),
    // Only the substrate arm carries runtime sidecar evidence.
    substrate: { ...describeArm(row.substrate), runtimeExtra: row.substrate.runtimeExtra },
  }));

  return {
    schema: 'aria.quality_ab_live_provider.evidence_manifest.v1',
    runId,
    generatedAt: new Date().toISOString(),
    command: {
      cwd: repoRoot,
      argv: process.argv,
      runtimeUrl,
    },
    envKeyPresence,
    rowArtifacts,
    summary,
    artifacts,
  };
}
+function renderHtml(summary, rows) {
+  const rowHtml = rows.map((row) => `<tr>
+    <td>${row.index}</td>
+    <td>${htmlEscape(row.winner)}</td>
+    <td>${row.delta.toFixed(2)}</td>
+    <td>${row.baseline.score.score.toFixed(2)}</td>
+    <td>${row.substrate.score.score.toFixed(2)}</td>
+    <td>${htmlEscape(row.prompt)}</td>
+  </tr>`).join('\n');
+  return `<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>Aria Quality AB ${htmlEscape(summary.runId)}</title>
+  <style>
+    body { margin: 0; font-family: Inter, Arial, sans-serif; background: #f8fafc; color: #0f172a; }
+    main { max-width: 1160px; margin: 0 auto; padding: 40px 24px; }
+    h1 { margin: 0 0 8px; font-size: 40px; }
+    .grid { display: grid; grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 12px; margin: 22px 0; }
+    .metric { background: white; border: 1px solid #d8dee9; border-radius: 8px; padding: 14px; }
+    .metric b { display: block; font-size: 30px; }
+    table { width: 100%; border-collapse: collapse; background: white; border: 1px solid #d8dee9; font-size: 13px; }
+    th, td { padding: 8px 10px; border-bottom: 1px solid #e2e8f0; text-align: left; vertical-align: top; }
+    th { background: #eef2f7; }
+    .boundary { border-left: 4px solid #0f766e; padding: 10px 14px; background: #ecfdf5; }
+  </style>
+</head>
+<body><main>
+  <h1>Quality A/B Live Provider Eval</h1>
+  <p>${htmlEscape(summary.runId)} · ${htmlEscape(summary.provider.provider)} / ${htmlEscape(summary.provider.model)}</p>
+  <div class="grid">
+    <div class="metric"><b>${summary.stats.wins}/${summary.totalPrompts}</b><span>substrate wins</span></div>
+    <div class="metric"><b>${summary.stats.meanDelta}</b><span>mean score delta</span></div>
+    <div class="metric"><b>${summary.stats.baselineMean}</b><span>baseline mean</span></div>
+    <div class="metric"><b>${summary.stats.substrateMean}</b><span>substrate mean</span></div>
+  </div>
+  <p class="boundary">${htmlEscape(summary.evidenceBoundary)}</p>
+  <table><thead><tr><th>#</th><th>Winner</th><th>Delta</th><th>A</th><th>B</th><th>Prompt</th></tr></thead><tbody>${rowHtml}</tbody></table>
+</main></body></html>`;
+}
+function writeSvg(summary) {
+  const filePath = path.join(outputRoot, 'quality-ab-card.svg');
+  const status = summary.stats.meanDelta > 0 ? '#0f766e' : '#b45309';
+  const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="1280" height="720" viewBox="0 0 1280 720">
+  <rect width="1280" height="720" fill="#f8fafc"/>
+  <rect x="42" y="42" width="1196" height="636" rx="14" fill="#ffffff" stroke="#d8dee9"/>
+  <rect x="42" y="42" width="1196" height="18" fill="${status}"/>
+  <text x="72" y="114" font-family="Inter, Arial, sans-serif" font-size="42" font-weight="800" fill="#0f172a">Aria Quality A/B Evidence</text>
+  <text x="72" y="158" font-family="Inter, Arial, sans-serif" font-size="22" fill="#475569">Live provider outputs scored by deterministic rubric.</text>
+  <text x="72" y="236" font-family="Inter, Arial, sans-serif" font-size="32" fill="#172033">Substrate wins: <tspan font-weight="800">${summary.stats.wins}/${summary.totalPrompts}</tspan></text>
+  <text x="72" y="296" font-family="Inter, Arial, sans-serif" font-size="32" fill="#172033">Mean delta: <tspan font-weight="800">${summary.stats.meanDelta}</tspan></text>
+  <text x="72" y="356" font-family="Inter, Arial, sans-serif" font-size="32" fill="#172033">A mean: <tspan font-weight="800">${summary.stats.baselineMean}</tspan></text>
+  <text x="72" y="416" font-family="Inter, Arial, sans-serif" font-size="32" fill="#172033">B mean: <tspan font-weight="800">${summary.stats.substrateMean}</tspan></text>
+  <text x="72" y="476" font-family="Inter, Arial, sans-serif" font-size="24" fill="#475569">Provider: ${htmlEscape(summary.provider.provider)} / ${htmlEscape(summary.provider.model)}</text>
+  <text x="72" y="642" font-family="Inter, Arial, sans-serif" font-size="18" fill="#64748b">run_id=${htmlEscape(summary.runId)}</text>
+</svg>`;
+  writeFileSync(filePath, svg);
+  return filePath;
+}
+function readFlagValue(prefix) {
+  for (let i = 0; i < process.argv.length; i += 1) {
+    const value = process.argv[i];
+    if (value === prefix) return process.argv[i + 1] || '';
+    if (value.startsWith(`${prefix}=`)) return value.slice(prefix.length + 1);
+  }
+  return '';
+}
+function rescoreExisting(metricsPath) {
+  const input = JSON.parse(readFileSync(metricsPath, 'utf8'));
+  const rows = input.rows.map((row) => {
+    const baselineText = readFileSync(row.baseline.textPath, 'utf8');
+    const substrateText = readFileSync(row.substrate.textPath, 'utf8');
+    const baselineScore = scoreOutput(baselineText, row.prompt);
+    const substrateScore = scoreOutput(substrateText, row.prompt);
+    const classified = classifyPair({
+      baseline: { ...row.baseline, score: baselineScore },
+      substrate: { ...row.substrate, score: substrateScore },
+    });
+    return {
+      ...row,
+      baseline: { ...row.baseline, text: baselineText, score: baselineScore },
+      substrate: { ...row.substrate, text: substrateText, score: substrateScore },
+      ...classified,
+    };
+  });
+  const stats = pairedStats(rows);
+  const summary = {
+    ...input.summary,
+    status: stats.meanDelta > 0 && stats.wins > stats.losses ? 'substrate_positive_delta' : 'no_positive_delta',
+    generatedAt: new Date().toISOString(),
+    evidenceBoundary: `${input.summary.evidenceBoundary} Rescored with runtime-block outputs treated as hard quality failures.`,
+    stats,
+  };
+  const outputDir = path.dirname(metricsPath);
+  const rescoredPath = path.join(outputDir, 'rescored-metrics.json');
+  const rescoredSummaryPath = path.join(outputDir, 'rescored-summary.md');
+  writeFileSync(rescoredPath, JSON.stringify({ summary, rows }, null, 2) + '\n');
+  writeFileSync(rescoredSummaryPath, renderMarkdown(summary, rows));
+  const result = {
+    ok: true,
+    mode: 'rescored_existing_artifact',
+    rescoredPath,
+    rescoredSummaryPath,
+    summary,
+  };
+  writeFileSync(path.join(outputDir, 'rescore-report.json'), JSON.stringify(result, null, 2) + '\n');
+  process.stdout.write(JSON.stringify(result, null, 2) + '\n');
+}
+async function main() {
+  const rescorePath = readFlagValue('--rescore');
+  if (rescorePath) {
+    rescoreExisting(rescorePath);
+    return;
+  }
+  ensureDir(outputRoot);
+  ensureDir(outputsDir);
+  const count = Math.min(Number(process.env.ARIA_AB_PROMPT_COUNT || process.argv.find((arg) => arg.startsWith('--count='))?.split('=')[1] || 10), promptBank.length);
+  const concurrency = argNumber('concurrency', 4, { min: 1, max: 8 });
+  const prompts = promptBank.slice(0, count);
+  const explicit = prompts.filter((prompt) => promptInstructionRx.test(prompt));
+  if (explicit.length) throw new Error(`A/B prompts include explicit trigger wording: ${explicit.join(' | ')}`);
+  const provider = safeConfigSummary();
+  if (!provider.apiKeyPresent && providerRequiresApiKey(provider.provider)) {
+    throw new Error(`No API key available for provider ${provider.provider}`);
+  }
+  const jsonlPath = path.join(outputRoot, 'results.jsonl');
+  const rows = await mapConcurrent(prompts, concurrency, async (prompt, i) => {
+    const [baseline, substrate] = await Promise.all([
+      callBaselineProvider({ config: provider, prompt, index: i + 1 }),
+      callRuntimeArm({ config: provider, prompt, index: i + 1 }),
+    ]);
+    const baselineScore = scoreOutput(baseline.text, prompt);
+    const substrateScore = scoreOutput(substrate.text, prompt);
+    const baselineTextPath = path.join(outputsDir, `${String(i + 1).padStart(3, '0')}-baseline.txt`);
+    const substrateTextPath = path.join(outputsDir, `${String(i + 1).padStart(3, '0')}-substrate.txt`);
+    const baselineRawPath = path.join(outputsDir, `${String(i + 1).padStart(3, '0')}-baseline.raw.json`);
+    const substrateRawPath = path.join(outputsDir, `${String(i + 1).padStart(3, '0')}-substrate.raw.json`);
+    const baselineRequestPath = path.join(outputsDir, `${String(i + 1).padStart(3, '0')}-baseline.request.json`);
+    const substrateRequestPath = path.join(outputsDir, `${String(i + 1).padStart(3, '0')}-substrate.request.json`);
+    const inputPath = path.join(outputsDir, `${String(i + 1).padStart(3, '0')}.input.txt`);
+    const baselineRow = { ...baseline, rawText: undefined, requestBody: undefined, requestPath: baselineRequestPath, rawPath: baselineRawPath, textPath: baselineTextPath, score: baselineScore };
+    const substrateRow = { ...substrate, rawText: undefined, requestBody: undefined, requestPath: substrateRequestPath, rawPath: substrateRawPath, textPath: substrateTextPath, score: substrateScore };
+    const baseRow = {
+      index: i + 1,
+      prompt,
+      promptSha256: sha256(prompt),
+      inputPath,
+      explicitTriggerInstruction: false,
+      baseline: baselineRow,
+      substrate: substrateRow,
+    };
+    const row = { ...baseRow, ...classifyPair(baseRow) };
+    writeFileSync(row.baseline.textPath, baseline.text);
+    writeFileSync(row.substrate.textPath, substrate.text);
+    writeFileSync(row.baseline.rawPath, redactText(baseline.rawText || ''));
+    writeFileSync(row.substrate.rawPath, redactText(substrate.rawText || ''));
+    writeFileSync(row.inputPath, prompt);
+    writeJsonArtifact(row.baseline.requestPath, baseline.requestBody || {});
+    writeJsonArtifact(row.substrate.requestPath, substrate.requestBody || {});
+    writeFileSync(jsonlPath, JSON.stringify(redactRequest(row)) + '\n', { flag: 'a', mode: 0o644 });
+    process.stdout.write(JSON.stringify({
+      index: row.index,
+      winner: row.winner,
+      delta: Number(row.delta.toFixed(2)),
+      primaryDelta: Number(row.primaryDelta.toFixed(2)),
+      fineDelta: Number(row.fineDelta.toFixed(2)),
+      baselineScore: baselineScore.score,
+      substrateScore: substrateScore.score,
+      firstClassGap: row.firstClassGap,
+      baselineMs: baseline.durationMs,
+      substrateMs: substrate.durationMs,
+    }) + '\n');
+    return row;
+  });
+  rows.sort((a, b) => a.index - b.index);
+  const stats = pairedStats(rows);
+  const summary = {
+    schema: 'aria.quality_ab_live_provider.v1',
+    runId,
+    generatedAt: new Date().toISOString(),
+    status: stats.meanDelta > 0 && stats.wins > stats.losses ? 'substrate_positive_delta' : 'no_positive_delta',
+    evidenceBoundary: 'Live provider outputs were measured with a deterministic rubric. This is a bounded first-pass A/B, not a statistically powered benchmark.',
+    provider: { provider: provider.provider, model: provider.model, baseUrlPresent: Boolean(provider.baseUrl), apiKeyPresent: provider.apiKeyPresent },
+    totalPrompts: rows.length,
+    concurrency,
+    promptInstructionViolations: 0,
+    stats,
+    outputRoot,
+  };
+  const artifacts = {
+    outputRoot,
+    metricsPath: path.join(outputRoot, 'metrics.json'),
+    jsonlPath,
+    summaryPath: path.join(outputRoot, 'summary.md'),
+    htmlPath: path.join(outputRoot, 'report.html'),
+    svgPath: null,
+    evidenceManifestPath: path.join(outputRoot, 'evidence-manifest.json'),
+  };
+  artifacts.svgPath = writeSvg(summary);
+  writeJsonArtifact(artifacts.metricsPath, { summary, rows });
+  writeFileSync(artifacts.summaryPath, renderMarkdown(summary, rows));
+  writeFileSync(artifacts.htmlPath, renderHtml(summary, rows));
+  writeJsonArtifact(artifacts.evidenceManifestPath, buildEvidenceManifest({ summary, rows, artifacts }));
+  const finalReport = { ok: true, summary, artifacts };
+  writeJsonArtifact(path.join(outputRoot, 'run-report.json'), finalReport);
+  process.stdout.write(JSON.stringify(finalReport, null, 2) + '\n');
+}
+main().catch((error) => {
+  console.error(error instanceof Error ? error.stack : String(error));
+  process.exit(1);
+});