npm - alys-akusa - Versions diffs - 0.1.14 → 0.1.20 - Mend

alys-akusa 0.1.14 → 0.1.20

This diff compares the contents of two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (2)

package/dist/index.cjs +697 -47
package/package.json +3 -2

package/dist/index.cjs CHANGED

@@ -1525,8 +1525,8 @@ var require_number = __commonJS({
     var isNumber = /[0-9]/;
     var isDef = (any) => any !== void 0;
     var round = (number, precision) => {
-      let factor = Math.pow(10, precision);
-      return Math.round(number * factor) / factor;
+      let factor2 = Math.pow(10, precision);
+      return Math.round(number * factor2) / factor2;
     };
     var NumberPrompt = class extends Prompt {
       constructor(opts = {}) {
@@ -3859,8 +3859,8 @@ var require_number2 = __commonJS({
     var isNumber = /[0-9]/;
     var isDef = (any) => any !== void 0;
     var round = (number, precision) => {
-      let factor = Math.pow(10, precision);
-      return Math.round(number * factor) / factor;
+      let factor2 = Math.pow(10, precision);
+      return Math.round(number * factor2) / factor2;
     };
     var NumberPrompt = class extends Prompt {
       constructor(opts = {}) {
@@ -5098,8 +5098,17 @@ async function discoverResearchSources(topic, options = {}) {
   const embeddingProvider = options.embeddingProvider ?? createEmbeddingProvider();
   const scored = await scoreSearchResults(topic, deduped, embeddingProvider);
   const semanticThreshold = options.minSemanticScore ?? (embeddingProvider.name === "local-token-hash" ? 0.08 : 0.18);
-  const semanticallyFiltered = scored.filter((source) => (source.semanticScore ?? 0) >= semanticThreshold || scored.length <= limit);
-  const ranked = (semanticallyFiltered.length ? semanticallyFiltered : scored).sort((a, b) => (b.trustScore ?? b.score) - (a.trustScore ?? a.score)).slice(0, limit);
+  const topicAligned = scored.filter((source) => passesTopicSourceGate(topic, source, { mode: "balanced" }));
+  if (!topicAligned.length && scored.length) {
+    warnings.push("No search result passed Alys domain-alignment validation.");
+  }
+  const semanticallyFiltered = topicAligned.filter(
+    (source) => (source.semanticScore ?? 0) >= semanticThreshold || (source.domainAlignmentScore ?? 0) >= 0.66
+  );
+  if (!semanticallyFiltered.length && topicAligned.length) {
+    warnings.push("Topic-aligned sources were kept below the semantic threshold for inspection.");
+  }
+  const ranked = (semanticallyFiltered.length ? semanticallyFiltered : topicAligned).sort((a, b) => (b.trustScore ?? b.score) - (a.trustScore ?? a.score)).slice(0, limit);
   return {
     sources: ranked,
     graph: buildResearchGraph(topic, ranked),
@@ -5110,6 +5119,7 @@ async function discoverResearchSources(topic, options = {}) {
 }
 function createConfiguredSearchProviders() {
   const providers = [
+    new CuratedAuthoritySearchProvider(),
     new GitHubSearchProvider(env("GITHUB_TOKEN")),
     new KaggleSearchProvider(env("KAGGLE_USERNAME"), env("KAGGLE_KEY"))
   ];
@@ -5165,7 +5175,8 @@ var GitHubSearchProvider = class {
   name = "github";
   async search(query, options = {}) {
     const url = new URL("https://api.github.com/search/repositories");
-    url.searchParams.set("q", `${query} dataset OR benchmark OR corpus OR csv OR jsonl in:name,description,readme`);
+    const coreQuery = topicCoreTokens(query).slice(0, 4).join(" ") || query;
+    url.searchParams.set("q", `${coreQuery} dataset OR csv OR jsonl OR corpus in:name,description,readme`);
     url.searchParams.set("sort", "stars");
     url.searchParams.set("order", "desc");
     url.searchParams.set("per_page", String(Math.min(20, options.limit ?? 10)));
@@ -5394,18 +5405,37 @@ var LocalHeuristicSearchProvider = class {
     });
   }
 };
+var CuratedAuthoritySearchProvider = class {
+  name = "curated-authority";
+  async search(query, options = {}) {
+    const limit = Math.max(1, options.limit ?? 10);
+    return authorityProfilesForTopic(query).flatMap((profile) => profile.sources.map((source) => ({ ...source, profileId: profile.id }))).slice(0, limit).map((source) => ({
+      title: source.title,
+      url: source.url,
+      snippet: source.snippet,
+      publishedAt: source.publishedAt,
+      score: source.score,
+      provider: this.name,
+      query
+    }));
+  }
+};
 function buildResearchQueries(topic, count = 5) {
   const normalized = topic.trim().replace(/\s+/g, " ");
+  const domainHints = domainSpecificQueryHints(normalized);
   const facets = [
     normalized,
-    `${normalized} dataset github kaggle benchmark`,
+    `${normalized} authoritative source technical reference`,
+    `${normalized} dataset kaggle github csv jsonl`,
     `${normalized} public dataset csv jsonl parquet`,
     `${normalized} official documentation standards methodology`,
     `${normalized} research paper benchmark evaluation`,
     `${normalized} case study operational data`,
     `${normalized} risks failures incidents constraints`,
     `${normalized} statistics dataset schema examples`,
-    `${normalized} regulatory guidance technical report`
+    `${normalized} regulatory guidance technical report`,
+    `${normalized} filetype:pdf manual report`,
+    ...domainHints
   ];
   return Array.from(new Set(facets)).slice(0, Math.max(1, count));
 }
@@ -5418,15 +5448,21 @@ ${result.url}`);
   const domainCounts = countDomains(results.map((result) => domainFromUrl(result.url)));
   return results.map((result, index) => {
     const domain = domainFromUrl(result.url);
-    const semanticScore = candidateEmbeddings[index] ? clamp01(cosineSimilarity(queryEmbedding, candidateEmbeddings[index])) : lexicalRelevance(topic, `${result.title} ${result.snippet}`);
-    const relevanceScore = clamp01(lexicalRelevance(topic, `${result.title} ${result.snippet}`) * 0.55 + semanticScore * 0.45);
+    const candidateText = `${result.title} ${result.snippet} ${result.url}`;
+    const semanticScore = candidateEmbeddings[index] ? clamp01(cosineSimilarity(queryEmbedding, candidateEmbeddings[index])) : lexicalRelevance(topic, candidateText);
+    const lexicalScore = lexicalRelevance(topic, candidateText);
+    const domainAlignment = domainAlignmentScore(topic, candidateText);
+    const broadPenalty = broadSourcePenalty(topic, result, domainAlignment);
+    const relevanceScore = clamp01(lexicalScore * 0.34 + semanticScore * 0.28 + domainAlignment * 0.38 - broadPenalty * 0.34);
     const authority = authorityForDomain(domain, result.url);
+    const authorityProfile = authorityProfileForSource(topic, result, domain);
+    const authorityScore = authorityProfile ? Math.max(authority.score, authorityProfile.authorityScore) : authority.score;
     const freshnessScore = freshnessForDate(result.publishedAt);
     const duplicationRisk = clamp01(Math.max(0, (domainCounts.get(domain) ?? 1) - 1) * 0.12);
     const providerScore = normalizeProviderScore(result.score);
     const sourcePreference = sourcePreferenceScore(domain, result.url, result.provider);
     const trustScore = clamp01(
-      authority.score * 0.3 + relevanceScore * 0.27 + semanticScore * 0.18 + freshnessScore * 0.1 + providerScore * 0.07 + sourcePreference * 0.05 + (1 - duplicationRisk) * 0.05
+      authorityScore * 0.3 + relevanceScore * 0.24 + semanticScore * 0.14 + domainAlignment * 0.16 + freshnessScore * 0.1 + providerScore * 0.07 + sourcePreference * 0.05 + (1 - duplicationRisk) * 0.05 - broadPenalty * 0.26
     );
     return {
       id: sourceId(result.url),
@@ -5439,15 +5475,20 @@ ${result.url}`);
       query: result.query,
       domain,
       publishedAt: result.publishedAt,
-      authorityScore: Number(authority.score.toFixed(3)),
+      authorityScore: Number(authorityScore.toFixed(3)),
       relevanceScore: Number(relevanceScore.toFixed(3)),
       freshnessScore: Number(freshnessScore.toFixed(3)),
       duplicationRisk: Number(duplicationRisk.toFixed(3)),
       semanticScore: Number(semanticScore.toFixed(3)),
+      domainAlignmentScore: Number(domainAlignment.toFixed(3)),
       trustScore: Number(trustScore.toFixed(3)),
       sourceType: authority.type,
       qualitySignals: [
         ...authority.signals,
+        ...authorityProfile ? [`authority-pack:${authorityProfile.id}`] : [],
+        ...domainAlignment >= 0.72 ? ["strong-topic-alignment"] : [],
+        ...domainAlignment < 0.34 ? ["weak-topic-alignment"] : [],
+        ...broadPenalty >= 0.5 ? ["broad-source-penalty"] : [],
         ...sourcePreference >= 0.85 ? ["preferred-source-surface"] : [],
         ...result.provider === "github" ? ["github-repository-search"] : [],
         ...result.provider === "kaggle" ? ["kaggle-dataset-search"] : []
@@ -5455,11 +5496,36 @@ ${result.url}`);
     };
   });
 }
+function passesTopicSourceGate(topic, source, options = {}) {
+  const specialized = isSpecializedTopic(topic);
+  const mode = options.mode ?? "balanced";
+  const trust = source.trustScore ?? source.score;
+  const relevance = source.relevanceScore ?? source.score;
+  const semantic = source.semanticScore ?? source.score;
+  const alignment = source.domainAlignmentScore ?? domainAlignmentScore(topic, `${source.title} ${source.snippet} ${source.url}`);
+  const duplicateRisk = source.duplicationRisk ?? 0;
+  const sourceType = source.sourceType ?? "unknown";
+  const provider = (source.provider || source.discoveredBy || "").toLowerCase();
+  const isCodeSource = sourceType === "code" || provider === "github" || domainFromUrl(source.url) === "github.com";
+  const codeTopic = isCodeOrRepositoryTopic(topic);
+  const broadPenalty = source.qualitySignals?.includes("broad-source-penalty") ? 0.7 : 0;
+  const thresholds = mode === "fast" ? { trust: 0.54, relevance: 0.42, semantic: 0.06, alignment: 0.4 } : mode === "strict" ? { trust: 0.62, relevance: 0.52, semantic: 0.12, alignment: 0.56 } : mode === "maximum-quality" ? { trust: 0.5, relevance: 0.38, semantic: 0.08, alignment: 0.36 } : { trust: 0.52, relevance: 0.4, semantic: 0.08, alignment: 0.38 };
+  if (duplicateRisk >= 0.82) return false;
+  if (!specialized) {
+    return trust >= Math.max(0.42, thresholds.trust - 0.08) && relevance >= Math.max(0.28, thresholds.relevance - 0.1);
+  }
+  if (alignment < thresholds.alignment || relevance < thresholds.relevance || trust < thresholds.trust) return false;
+  if (semantic < thresholds.semantic && alignment < 0.68) return false;
+  if (broadPenalty >= 0.5 && alignment < 0.72) return false;
+  if (isCodeSource && !codeTopic && (alignment < 0.64 || relevance < 0.56)) return false;
+  return true;
+}
 function buildResearchGraph(topic, sources) {
   const topicId = `topic:${sourceId(topic)}`;
   const clusters = buildClusters(sources);
   const entities = extractEntities([topic, ...sources.flatMap((source) => [source.title, source.snippet])]).slice(0, 12);
   const contradictions = inferContradictions(sources, clusters);
+  const relationshipEdges = buildSourceRelationshipEdges(sources);
   return {
     topic,
     generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
@@ -5527,7 +5593,8 @@ function buildResearchGraph(topic, sources) {
           weight: contradiction.severity === "high" ? 0.9 : 0.55,
           evidence: contradiction.reason
         }))
-      )
+      ),
+      ...relationshipEdges
     ],
     clusters,
     contradictions,
@@ -5537,10 +5604,53 @@ function buildResearchGraph(topic, sources) {
       averageTrust: average(sources.map((source) => source.trustScore ?? source.score)),
       averageRelevance: average(sources.map((source) => source.relevanceScore ?? source.score)),
       averageAuthority: average(sources.map((source) => source.authorityScore ?? 0.5)),
+      averageFreshness: average(sources.map((source) => source.freshnessScore ?? 0.62)),
+      corroborationEdges: relationshipEdges.filter((edge) => edge.relation === "corroborates").length,
+      contradictionEdges: contradictions.reduce((sum, contradiction) => sum + contradiction.sourceIds.length, 0),
       duplicateRisk: average(sources.map((source) => source.duplicationRisk ?? 0))
     }
   };
 }
+function buildSourceRelationshipEdges(sources) {
+  const edges = [];
+  for (let i = 0; i < sources.length; i++) {
+    for (let j = i + 1; j < sources.length; j++) {
+      const left = sources[i];
+      const right = sources[j];
+      if ((left.domain ?? domainFromUrl(left.url)) === (right.domain ?? domainFromUrl(right.url))) continue;
+      const leftText = `${left.title} ${left.snippet}`;
+      const rightText = `${right.title} ${right.snippet}`;
+      const similarity = jaccardSimilarity(leftText, rightText);
+      const sharedPack = authorityPack(left) && authorityPack(left) === authorityPack(right);
+      const alignment = Math.min(left.domainAlignmentScore ?? left.relevanceScore ?? 0, right.domainAlignmentScore ?? right.relevanceScore ?? 0);
+      const trust = Math.min(left.trustScore ?? left.score, right.trustScore ?? right.score);
+      const shouldLink = (sharedPack || similarity >= 0.14) && alignment >= 0.48 && trust >= 0.58;
+      if (!shouldLink) continue;
+      edges.push({
+        from: `source:${left.id}`,
+        to: `source:${right.id}`,
+        relation: "corroborates",
+        weight: Number(Math.min(1, similarity * 0.48 + alignment * 0.28 + trust * 0.24 + (sharedPack ? 0.16 : 0)).toFixed(3)),
+        evidence: sharedRelationshipEvidence(left, right, sharedPack ? authorityPack(left) ?? void 0 : void 0)
+      });
+    }
+  }
+  return edges.sort((a, b) => b.weight - a.weight).slice(0, 36);
+}
+function authorityPack(source) {
+  const signal = source.qualitySignals?.find((item) => item.startsWith("authority-pack:"));
+  return signal ? signal.replace("authority-pack:", "") : null;
+}
+function sharedRelationshipEvidence(left, right, pack) {
+  const terms = sharedTerms(`${left.title} ${left.snippet}`, `${right.title} ${right.snippet}`).slice(0, 6);
+  const packText = pack ? `same authority pack (${pack})` : "shared topic evidence";
+  return terms.length ? `${packText}; shared terms: ${terms.join(", ")}` : packText;
+}
+function sharedTerms(left, right) {
+  const leftTokens = tokenSet(normalizeForSearch(left));
+  const rightTokens = tokenSet(normalizeForSearch(right));
+  return [...leftTokens].filter((token) => token.length >= 5 && rightTokens.has(token) && !ENTITY_STOP_WORDS.has(token)).slice(0, 12);
+}
 function buildClusters(sources) {
   const byType = /* @__PURE__ */ new Map();
   for (const source of sources) {
@@ -5618,6 +5728,9 @@ function resultFromObject(item, provider, query, keys) {
     raw: item
   }];
 }
+function domainSpecificQueryHints(topic) {
+  return authorityProfilesForTopic(topic).flatMap((profile) => profile.queryHints.map((hint) => `${topic} ${hint}`));
+}
 async function fetchJson(url, init = {}, timeoutMs = SEARCH_TIMEOUT_MS) {
   const controller = new AbortController();
   const timeout = setTimeout(() => controller.abort(), timeoutMs);
@@ -5714,6 +5827,59 @@ function lexicalRelevance(topic, candidate) {
   const overlap = [...topicTokens].filter((token) => candidateTokens.has(token)).length;
   return clamp01(overlap / Math.max(1, topicTokens.size) * 0.85 + jaccardSimilarity(topic, candidate) * 0.15);
 }
+function domainAlignmentScore(topic, candidate) {
+  const coreTokens = topicCoreTokens(topic);
+  if (!coreTokens.length) return lexicalRelevance(topic, candidate);
+  const normalizedCandidate = normalizeForSearch(candidate);
+  const candidateTokens = tokenSet(normalizedCandidate);
+  const tokenHits = coreTokens.filter((token) => candidateTokenMatches(token, normalizedCandidate, candidateTokens));
+  const bigrams = coreTokens.slice(0, -1).map((token, index) => `${token} ${coreTokens[index + 1]}`);
+  const bigramHits = bigrams.filter((bigram) => normalizedCandidate.includes(bigram));
+  const phrase = coreTokens.join(" ");
+  const phraseScore = phrase.length > 4 && normalizedCandidate.includes(phrase) ? 1 : 0;
+  return clamp01(
+    tokenHits.length / Math.max(1, coreTokens.length) * 0.62 + bigramHits.length / Math.max(1, bigrams.length || 1) * 0.26 + phraseScore * 0.12
+  );
+}
+function topicCoreTokens(topic) {
+  return normalizeForSearch(topic).split(/\s+/).filter((token) => token.length >= 3 && !TOPIC_STOP_WORDS.has(token)).slice(0, 10);
+}
+function candidateTokenMatches(token, normalizedCandidate, candidateTokens) {
+  if (candidateTokens.has(token) || normalizedCandidate.includes(token)) return true;
+  const synonyms = TOPIC_SYNONYMS[token] ?? [];
+  return synonyms.some((synonym) => normalizedCandidate.includes(synonym));
+}
+function isSpecializedTopic(topic) {
+  const coreTokens = topicCoreTokens(topic);
+  return coreTokens.length >= 2;
+}
+function isCodeOrRepositoryTopic(topic) {
+  const normalized = normalizeForSearch(topic);
+  return CODE_TOPIC_TERMS.some((term) => normalized.includes(term));
+}
+function broadSourcePenalty(topic, result, alignment) {
+  if (!isSpecializedTopic(topic) || alignment >= 0.62) return 0;
+  const text = normalizeForSearch(`${result.title} ${result.snippet} ${result.url}`);
+  const broadHits = BROAD_SOURCE_TERMS.filter((term) => text.includes(term)).length;
+  const providerPenalty = result.provider === "github" ? 0.16 : 0;
+  return clamp01(broadHits * 0.14 + providerPenalty);
+}
+function normalizeForSearch(value) {
+  return value.toLowerCase().replace(/[^a-z0-9]+/g, " ").replace(/\s+/g, " ").trim();
+}
+function authorityProfilesForTopic(topic) {
+  const normalized = normalizeForSearch(topic);
+  return AUTHORITY_PROFILES.filter((profile) => profile.match.some((term) => normalized.includes(term)));
+}
+function authorityProfileForSource(topic, result, domain) {
+  const normalized = normalizeForSearch(`${result.title} ${result.snippet} ${result.url}`);
+  return authorityProfilesForTopic(topic).find((profile) => {
+    const domainMatch = profile.authorityDomains.some((authorityDomain) => domain === authorityDomain || domain.endsWith(`.${authorityDomain}`));
+    const sourceMatch = profile.sources.some((source) => canonicalizeUrl(source.url) === canonicalizeUrl(result.url));
+    const topicMatch = profile.match.some((term) => normalized.includes(term));
+    return (domainMatch || sourceMatch) && topicMatch;
+  }) ?? null;
+}
 function normalizeProviderScore(score) {
   if (typeof score !== "number" || Number.isNaN(score)) return 0.55;
   if (score <= 1) return clamp01(score);
@@ -5803,6 +5969,196 @@ var LOCAL_SOURCE_BASES = [
   { label: "Google Scholar", url: "https://scholar.google.com/scholar", querySuffix: "q", signal: "Academic source discovery surface." },
   { label: "OpenAlex", url: "https://openalex.org/search", querySuffix: "q", signal: "Open scholarly metadata and research graph candidates." }
 ];
+var AUTHORITY_PROFILES = [
+  {
+    id: "oil-gas",
+    label: "Oil & Gas",
+    match: ["oil", "gas", "pipeline", "pipelines", "drilling", "well", "reservoir", "scada", "pump", "hazardous liquid", "petroleum"],
+    authorityDomains: ["phmsa.dot.gov", "npms.phmsa.dot.gov", "bts.gov", "ntsb.gov", "spe.org", "api.org"],
+    queryHints: [
+      "PHMSA pipeline data report",
+      "SCADA pressure flow telemetry",
+      "API 1160 integrity management",
+      "pump station operations manual",
+      "incident report technical dataset"
+    ],
+    authorityScore: 0.94,
+    sources: [
+      {
+        title: "PHMSA Pipeline Incident 20 Year Trends",
+        url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/pipeline-incident-20-year-trends",
+        snippet: "Official PHMSA incident trend data for gas distribution, gas gathering, gas transmission, LNG, underground storage, and hazardous liquid pipeline systems, including operator-submitted incident records and flagged files.",
+        score: 0.96
+      },
+      {
+        title: "PHMSA Pipeline Source Data",
+        url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/source-data",
+        snippet: "Official PHMSA source data covering pipeline annual reports, incident reports, safety-related condition reports, integrity assurance notifications, gas systems, LNG, and hazardous liquid operators.",
+        score: 0.95
+      },
+      {
+        title: "PHMSA National Pipeline Performance Measures",
+        url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/national-pipeline-performance-measures",
+        snippet: "National pipeline performance measures for pipeline safety, integrity management, incident categories, serious incidents, significant incidents, all reported incidents, and pipeline infrastructure performance.",
+        score: 0.93
+      },
+      {
+        title: "PHMSA Pipeline Safety Data Report Index",
+        url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/pipeline-safety-data-report-index",
+        snippet: "PHMSA index of pipeline safety datasets, annual report summaries, integrity management performance, incident cause/type metrics, excavation damage, pipeline mileage, facilities, and technical resources.",
+        score: 0.93
+      },
+      {
+        title: "National Pipeline Mapping System Pipeline Data",
+        url: "https://www.npms.phmsa.dot.gov/PipelineData.aspx",
+        snippet: "National Pipeline Mapping System data for gas transmission pipelines, hazardous liquid pipelines, LNG plants, breakout tanks, operator submissions, pipeline mapping, and integrity-management context.",
+        score: 0.9
+      },
+      {
+        title: "Bureau of Transportation Statistics Pipeline Safety and Property Damage Data",
+        url: "https://www.bts.gov/content/hazardous-liquid-and-natural-gas-pipeline-safety-and-property-damage-data",
+        snippet: "BTS table for hazardous liquid and natural gas pipeline safety and property damage data sourced from PHMSA pipeline incident statistics and transportation safety records.",
+        score: 0.88
+      },
+      {
+        title: "NTSB Pipeline Investigation Reports",
+        url: "https://www.ntsb.gov/investigations/AccidentReports/Pages/Reports.aspx",
+        snippet: "National Transportation Safety Board investigation reports, including pipeline accident reports, factual records, emergency response findings, integrity management evidence, and safety recommendations.",
+        score: 0.86
+      }
+    ]
+  },
+  {
+    id: "healthcare",
+    label: "Healthcare",
+    match: ["medical", "clinical", "health", "healthcare", "patient", "biomedical", "drug", "disease", "diagnosis", "treatment"],
+    authorityDomains: ["nih.gov", "nlm.nih.gov", "pubmed.ncbi.nlm.nih.gov", "clinicaltrials.gov", "fda.gov", "open.fda.gov", "cdc.gov"],
+    queryHints: ["PubMed clinical guideline", "NIH medical terminology", "FDA open data", "CDC public health dataset"],
+    authorityScore: 0.95,
+    sources: [
+      {
+        title: "PubMed Biomedical Literature",
+        url: "https://pubmed.ncbi.nlm.nih.gov/",
+        snippet: "National Library of Medicine search surface for biomedical literature, clinical studies, medical terminology, and peer-reviewed evidence.",
+        score: 0.95
+      },
+      {
+        title: "ClinicalTrials.gov Data API",
+        url: "https://clinicaltrials.gov/data-api/about-api",
+        snippet: "Official ClinicalTrials.gov API and data access for clinical study metadata, interventions, conditions, sponsors, and study outcomes.",
+        score: 0.92
+      },
+      {
+        title: "openFDA APIs",
+        url: "https://open.fda.gov/apis/",
+        snippet: "Official FDA open data APIs for drugs, devices, foods, tobacco, and enforcement datasets.",
+        score: 0.92
+      },
+      {
+        title: "CDC Data Catalog",
+        url: "https://data.cdc.gov/",
+        snippet: "Official CDC public health datasets for surveillance, epidemiology, facilities, disease reporting, and health indicators.",
+        score: 0.9
+      }
+    ]
+  },
+  {
+    id: "legal",
+    label: "Legal & Compliance",
+    match: ["legal", "law", "compliance", "policy", "regulation", "regulatory", "contract", "privacy", "statute", "court"],
+    authorityDomains: ["govinfo.gov", "law.cornell.edu", "courtlistener.com", "federalregister.gov", "sec.gov", "justice.gov"],
+    queryHints: ["official regulation guidance", "statute case law corpus", "court opinion dataset", "federal register rule"],
+    authorityScore: 0.92,
+    sources: [
+      {
+        title: "GovInfo",
+        url: "https://www.govinfo.gov/",
+        snippet: "Official U.S. Government Publishing Office access to federal statutes, regulations, congressional documents, and official government publications.",
+        score: 0.92
+      },
+      {
+        title: "Cornell Legal Information Institute",
+        url: "https://www.law.cornell.edu/",
+        snippet: "Legal Information Institute access to U.S. Code, CFR, Supreme Court opinions, Wex legal dictionary, and legal reference material.",
+        score: 0.86
+      },
+      {
+        title: "CourtListener",
+        url: "https://www.courtlistener.com/",
+        snippet: "Public legal database for court opinions, dockets, judges, citations, and legal research datasets.",
+        score: 0.86
+      },
+      {
+        title: "Federal Register",
+        url: "https://www.federalregister.gov/",
+        snippet: "Official daily publication for U.S. federal rules, proposed rules, notices, executive orders, and regulatory actions.",
+        score: 0.9
+      }
+    ]
+  },
+  {
+    id: "finance",
+    label: "Finance",
+    match: ["finance", "financial", "banking", "market", "markets", "sec", "filing", "risk", "credit", "macroeconomic", "economic"],
+    authorityDomains: ["sec.gov", "fred.stlouisfed.org", "federalreserve.gov", "consumerfinance.gov", "treasury.gov"],
+    queryHints: ["SEC EDGAR filing data", "FRED economic dataset", "Federal Reserve data", "CFPB complaint database"],
+    authorityScore: 0.93,
+    sources: [
+      {
+        title: "SEC EDGAR",
+        url: "https://www.sec.gov/edgar",
+        snippet: "Official SEC EDGAR company filings, disclosures, financial statements, risk factors, and market regulatory documents.",
+        score: 0.94
+      },
+      {
+        title: "FRED Economic Data",
+        url: "https://fred.stlouisfed.org/",
+        snippet: "Federal Reserve Bank of St. Louis economic time series, macroeconomic indicators, rates, labor, inflation, and financial data.",
+        score: 0.92
+      },
+      {
+        title: "Federal Reserve Data",
+        url: "https://www.federalreserve.gov/data.htm",
+        snippet: "Official Federal Reserve data releases, banking data, monetary policy data, financial accounts, and regulatory reports.",
+        score: 0.91
+      },
+      {
+        title: "CFPB Consumer Complaint Database",
+        url: "https://www.consumerfinance.gov/data-research/consumer-complaints/",
+        snippet: "Consumer Financial Protection Bureau complaint database covering financial products, institutions, issues, responses, and trends.",
+        score: 0.88
+      }
+    ]
+  },
+  {
+    id: "developer-docs",
+    label: "Developer Documentation",
+    match: ["developer", "developers", "api", "sdk", "code", "repository", "github", "documentation", "docs", "package", "library", "framework"],
+    authorityDomains: ["docs.github.com", "developer.mozilla.org", "npmjs.com", "nodejs.org", "typescriptlang.org"],
+    queryHints: ["official API reference", "developer documentation examples", "SDK guide", "GitHub repository docs"],
+    authorityScore: 0.88,
+    sources: [
+      {
+        title: "GitHub Docs",
+        url: "https://docs.github.com/",
+        snippet: "Official GitHub documentation for repositories, Actions, APIs, packages, security, and developer workflows.",
+        score: 0.88
+      },
+      {
+        title: "MDN Web Docs",
+        url: "https://developer.mozilla.org/",
+        snippet: "Mozilla Developer Network reference for web platform APIs, JavaScript, HTML, CSS, browser behavior, and examples.",
+        score: 0.88
+      },
+      {
+        title: "npm Docs",
+        url: "https://docs.npmjs.com/",
+        snippet: "Official npm documentation for packages, publishing, package.json, CLI usage, registry behavior, and access control.",
+        score: 0.84
+      }
+    ]
+  }
+];
 var ENTITY_STOP_WORDS = /* @__PURE__ */ new Set([
   "about",
   "source",
@@ -5820,6 +6176,79 @@ var ENTITY_STOP_WORDS = /* @__PURE__ */ new Set([
   "example",
   "examples"
 ]);
+var TOPIC_STOP_WORDS = /* @__PURE__ */ new Set([
+  ...ENTITY_STOP_WORDS,
+  "data",
+  "records",
+  "record",
+  "row",
+  "rows",
+  "corpus",
+  "csv",
+  "json",
+  "jsonl",
+  "parquet",
+  "rag",
+  "fine",
+  "tune",
+  "tuning",
+  "training",
+  "ready",
+  "examples",
+  "example",
+  "with",
+  "from",
+  "into",
+  "about",
+  "info",
+  "information",
+  "generate",
+  "generated",
+  "synthetic",
+  "model",
+  "models",
+  "openai",
+  "anthropic",
+  "ai"
+]);
+var CODE_TOPIC_TERMS = [
+  "code",
+  "github",
+  "repository",
+  "repositories",
+  "developer",
+  "api",
+  "sdk",
+  "package",
+  "library",
+  "framework",
+  "docs",
+  "documentation",
+  "typescript",
+  "python",
+  "javascript"
+];
+var BROAD_SOURCE_TERMS = [
+  "awesome",
+  "tutorial",
+  "course",
+  "applied ml",
+  "machine learning",
+  "deep learning",
+  "data science",
+  "examples",
+  "notebook",
+  "collection",
+  "curated list",
+  "roadmap"
+];
+var TOPIC_SYNONYMS = {
+  oil: ["hazardous liquid", "petroleum", "crude", "liquid pipeline", "liquid pipelines"],
+  gas: ["natural gas", "lng", "gas transmission", "gas distribution"],
+  pipeline: ["pipelines", "transmission line", "hazardous liquid"],
+  drilling: ["wellbore", "rig", "bha", "mwd", "lwd"],
+  well: ["wellbore", "reservoir", "completion"]
+};
 // ../../packages/crawler/src/index.ts
 async function crawlSource(source) {
@@ -6015,6 +6444,7 @@ function explainRecordAcceptance(record, topic) {
   const trust = numericMetadata(record, "source_trust_score", 0);
   const authority = numericMetadata(record, "source_authority_score", 0);
   const relevance = numericMetadata(record, "source_relevance_score", 0);
+  const alignment = numericMetadata(record, "source_domain_alignment_score", 0);
   const supportCount = numericMetadata(record, "support_count", 0);
   const contradictions = numericMetadata(record, "contradiction_count", 0);
   const citationCoverage = citationCoverageScore(record);
@@ -6024,7 +6454,7 @@ function explainRecordAcceptance(record, topic) {
   else if (trust >= 0.5) reasons.push("Moderate source trust accepted with supporting quality signals.");
   else reasons.push("Source trust is weak; confidence should remain cautious.");
   if (authority >= 0.72) reasons.push("Source authority is strong enough to raise confidence.");
-  if (relevance >= 0.65 || relevanceScore >= 0.65) reasons.push("Record is semantically aligned with the requested dataset topic.");
+  if (alignment >= 0.65 || relevance >= 0.65 || relevanceScore >= 0.65) reasons.push("Record is semantically aligned with the requested dataset topic.");
   if (supportCount >= 2) reasons.push("Independent support was detected across sources.");
   if (contradictions === 0) reasons.push("No open contradiction penalty was applied.");
   if (citationCoverage >= 0.9) reasons.push("Record preserves source URL, source title, context, and confidence.");
@@ -6073,7 +6503,11 @@ function citationCoverageScore(record) {
 }
 function relevanceScoreForRecord(record, topic) {
   const metadataRelevance = numericMetadata(record, "source_relevance_score", NaN);
+  const metadataAlignment = numericMetadata(record, "source_domain_alignment_score", NaN);
   const lexical = jaccardSimilarity(topic, `${record.input} ${record.output} ${record.context}`) * 2.6;
+  if (Number.isFinite(metadataRelevance) && Number.isFinite(metadataAlignment)) {
+    return round01(metadataRelevance * 0.34 + metadataAlignment * 0.36 + Math.min(1, lexical) * 0.3);
+  }
   return round01(Number.isFinite(metadataRelevance) ? metadataRelevance * 0.55 + Math.min(1, lexical) * 0.45 : Math.min(1, lexical));
 }
 function estimateSourceDiversity(records) {
@@ -6256,6 +6690,7 @@ Source: ${result.source.url}
       freshnessScore: result.source.freshnessScore,
       duplicationRisk: result.source.duplicationRisk,
       semanticScore: result.source.semanticScore,
+      domainAlignmentScore: result.source.domainAlignmentScore,
       trustScore: result.source.trustScore
     }
   };
@@ -6735,7 +7170,7 @@ function getGroqKey() {
 function providerPrompt(options) {
   const { topic, datasetType, document, targetCount } = options;
   const sourceText = document.text.replace(/\s+/g, " ").trim().slice(0, 7e3);
-  const sourceScores = document.sourceScores ? `Trust: ${document.sourceScores.trustScore ?? "unknown"}; Authority: ${document.sourceScores.authorityScore ?? "unknown"}; Relevance: ${document.sourceScores.relevanceScore ?? "unknown"}; Freshness: ${document.sourceScores.freshnessScore ?? "unknown"}; Duplication risk: ${document.sourceScores.duplicationRisk ?? "unknown"}` : "Source quality scores unavailable.";
+  const sourceScores = document.sourceScores ? `Trust: ${document.sourceScores.trustScore ?? "unknown"}; Authority: ${document.sourceScores.authorityScore ?? "unknown"}; Relevance: ${document.sourceScores.relevanceScore ?? "unknown"}; Domain alignment: ${document.sourceScores.domainAlignmentScore ?? "unknown"}; Freshness: ${document.sourceScores.freshnessScore ?? "unknown"}; Duplication risk: ${document.sourceScores.duplicationRisk ?? "unknown"}` : "Source quality scores unavailable.";
   const mode = datasetType === "instruction" ? "Instruction tuning: input is a task/instruction, output is the ideal answer." : datasetType === "rag" ? "RAG: input is a realistic user query, output is a compact answer, context is retrieval-ready evidence text." : "QA: input is a question, output is a grounded answer.";
   const segmentBlock = options.segment ? `
 Active topic segment:
@@ -7263,16 +7698,19 @@ var StructuringAgent = class {
     const concurrency = Math.max(1, Math.min(documents.length, Number(process.env.ALYS_PROVIDER_CONCURRENCY ?? 3)));
     let totalGenerated = 0;
     const grouped = await mapLimit2(documents, concurrency, async (document, index) => {
-      const finding = findings[index] ?? findings[0];
+      const finding = findings.find((item) => item.id === document.sourceId) ?? findings[index] ?? findings[0];
       const trustScore = document.sourceScores?.trustScore ?? 0.62;
       const authorityScore = document.sourceScores?.authorityScore ?? 0.55;
       const relevanceScore = document.sourceScores?.relevanceScore ?? 0.55;
+      const alignmentScore = document.sourceScores?.domainAlignmentScore ?? relevanceScore;
       const duplicationRisk = document.sourceScores?.duplicationRisk ?? 0;
       if (trustScore < (options.minTrustScore ?? 0.42) || relevanceScore < (options.minRelevanceScore ?? 0.24)) return [];
       const sourceWeight = sourceQualityWeight(document);
       const segment2 = options.generationPlan ? segmentForSource(options.generationPlan, `${document.title} ${document.text}`, index) : void 0;
       const blueprint2 = options.generationPlan?.blueprint;
-      const baselineConfidence = clamp013((finding?.confidence ?? 0.7) * 0.55 + trustScore * 0.22 + authorityScore * 0.12 + relevanceScore * 0.11 - duplicationRisk * 0.08);
+      const baselineConfidence = clamp013(
+        (finding?.confidence ?? 0.62) * 0.42 + trustScore * 0.2 + authorityScore * 0.1 + relevanceScore * 0.12 + alignmentScore * 0.12 - duplicationRisk * 0.12
+      );
       const baseId = import_node_crypto3.default.createHash("sha1").update(`${topic}:${document.url}:${datasetType}`).digest("hex").slice(0, 14);
       const providerTarget = useProvider ? weightedRecordTarget(recordsPerDocument, options.providerRecordsPerDocument ?? recordsPerDocument, sourceWeight) : 0;
       let providerResult = null;
@@ -7289,8 +7727,10 @@ var StructuringAgent = class {
         });
       }
       if (providerResult?.records.length) {
+        const provenance = buildRecordProvenance(document, finding, documents);
         const mapped = providerResult.records.map((g, variantIndex) => {
-          const adjustedConfidence = trustWeightedConfidence(g.confidence, baselineConfidence, document, finding, sourceWeight);
+          const adjustedConfidence = trustWeightedConfidence(g.confidence, baselineConfidence, document, finding, sourceWeight, provenance.corroborationScore);
+          const confidenceFactors = confidenceFactorsForRecord(document, finding, provenance, adjustedConfidence, sourceWeight);
           return {
             id: `${baseId}-${variantIndex}`,
             input: g.input,
@@ -7313,14 +7753,24 @@ var StructuringAgent = class {
               provider: providerResult.provider,
               model: providerResult.model,
               latency_ms: providerResult.latencyMs,
-              support_count: finding?.support.length ?? 1,
+              support_count: provenance.supportUrls.length,
+              support_urls: provenance.supportUrls,
+              support_domains: provenance.supportDomains,
+              support_sources: provenance.supportSources,
+              primary_source_domain: provenance.primaryDomain,
+              corroboration_score: provenance.corroborationScore,
               contradiction_count: finding?.contradictions.length ?? 0,
+              contradiction_notes: finding?.contradictions ?? [],
+              contradiction_status: (finding?.contradictions.length ?? 0) > 0 ? "needs_review" : "clear",
+              confidence_factors: confidenceFactors,
               source_quality_weight: Number(sourceWeight.toFixed(3)),
               source_trust_score: document.sourceScores?.trustScore,
               source_authority_score: document.sourceScores?.authorityScore,
               source_relevance_score: document.sourceScores?.relevanceScore,
+              source_domain_alignment_score: document.sourceScores?.domainAlignmentScore,
+              source_freshness_score: document.sourceScores?.freshnessScore,
               source_duplication_risk: document.sourceScores?.duplicationRisk,
-              acceptance_reasons: acceptanceReasons(document, finding, segment2)
+              acceptance_reasons: acceptanceReasons(document, finding, segment2, provenance)
             },
             created_at: createdAt
           };
@@ -7374,42 +7824,102 @@ function sourceQualityWeight(document) {
   const authority = document.sourceScores?.authorityScore ?? 0.55;
   const relevance = document.sourceScores?.relevanceScore ?? 0.55;
   const semantic = document.sourceScores?.semanticScore ?? relevance;
+  const alignment = document.sourceScores?.domainAlignmentScore ?? relevance;
   const duplicateRisk = document.sourceScores?.duplicationRisk ?? 0;
-  return clamp013(trust * 0.36 + authority * 0.22 + relevance * 0.22 + semantic * 0.12 + (1 - duplicateRisk) * 0.08);
+  return clamp013(trust * 0.3 + authority * 0.18 + relevance * 0.18 + alignment * 0.18 + semantic * 0.1 + (1 - duplicateRisk) * 0.06);
 }
 function weightedRecordTarget(recordsPerDocument, providerTarget, sourceWeight) {
   const multiplier = sourceWeight >= 0.82 ? 1.45 : sourceWeight >= 0.68 ? 1.15 : sourceWeight >= 0.54 ? 0.85 : 0.45;
   return Math.max(0, Math.min(Math.ceil(recordsPerDocument * 1.7), Math.ceil(providerTarget * multiplier)));
 }
-function trustWeightedConfidence(providerConfidence, baselineConfidence, document, finding, sourceWeight) {
+function trustWeightedConfidence(providerConfidence, baselineConfidence, document, finding, sourceWeight, corroborationScore = 0) {
   const contradictions = finding?.contradictions.length ?? 0;
   const support = finding?.support.length ?? 1;
   const duplicateRisk = document.sourceScores?.duplicationRisk ?? 0;
-  const value = providerConfidence * 0.42 + baselineConfidence * 0.24 + sourceWeight * 0.22 + Math.min(0.08, support * 0.02) - Math.min(0.18, contradictions * 0.045) - duplicateRisk * 0.08;
+  const alignment = document.sourceScores?.domainAlignmentScore ?? document.sourceScores?.relevanceScore ?? 0.55;
+  const value = providerConfidence * 0.34 + baselineConfidence * 0.24 + sourceWeight * 0.18 + alignment * 0.12 + Math.min(0.08, support * 0.02) + corroborationScore * 0.08 - Math.min(0.18, contradictions * 0.045) - duplicateRisk * 0.1;
   return Number(clamp013(value).toFixed(3));
 }
-function acceptanceReasons(document, finding, segment2) {
+function acceptanceReasons(document, finding, segment2, provenance) {
   const reasons = [
     `source-trust:${document.sourceScores?.trustScore ?? "unknown"}`,
     `source-authority:${document.sourceScores?.authorityScore ?? "unknown"}`,
     `source-relevance:${document.sourceScores?.relevanceScore ?? "unknown"}`,
-    `support-count:${finding?.support.length ?? 1}`
+    `source-domain-alignment:${document.sourceScores?.domainAlignmentScore ?? "unknown"}`,
+    `source-freshness:${document.sourceScores?.freshnessScore ?? "unknown"}`,
+    `support-count:${provenance.supportUrls.length}`,
+    `corroboration-score:${provenance.corroborationScore}`
   ];
   if (segment2) reasons.push(`segment:${segment2.id}`);
+  if (provenance.supportDomains.length >= 2) reasons.push("cross-source-corroborated");
   if ((finding?.contradictions.length ?? 0) === 0) reasons.push("no-open-contradictions");
   return reasons;
 }
 function recordAcceptanceScore(record) {
   const trust = numericMetadata2(record, "source_trust_score", 0.6);
   const relevance = numericMetadata2(record, "source_relevance_score", 0.55);
+  const alignment = numericMetadata2(record, "source_domain_alignment_score", relevance);
   const qualityWeight = numericMetadata2(record, "source_quality_weight", 0.6);
+  const corroboration = numericMetadata2(record, "corroboration_score", 0);
   const contradictionCount = numericMetadata2(record, "contradiction_count", 0);
   const citation = record.source_url && record.context.trim().length > 40 ? 1 : 0;
   const base = scoreDatasetRecord(record);
   return clamp013(
-    base * 0.38 + record.confidence * 0.18 + trust * 0.14 + relevance * 0.12 + qualityWeight * 0.1 + citation * 0.08 - Math.min(0.2, contradictionCount * 0.04)
+    base * 0.32 + record.confidence * 0.16 + trust * 0.14 + relevance * 0.1 + alignment * 0.1 + qualityWeight * 0.08 + corroboration * 0.08 + citation * 0.02 - Math.min(0.2, contradictionCount * 0.04)
   );
 }
+function buildRecordProvenance(document, finding, documents = []) {
+  const primaryDomain = domainFromUrl3(document.url);
+  const supportUrls = Array.from(/* @__PURE__ */ new Set([document.url, ...finding?.support ?? []])).slice(0, 8);
+  const supportDomains = Array.from(new Set(supportUrls.map(domainFromUrl3).filter(Boolean)));
+  const supportSources = supportUrls.map((url) => {
+    const matched = documents.find((candidate) => candidate.url === url);
+    return {
+      title: matched?.title ?? (url === document.url ? document.title : domainFromUrl3(url)),
+      url,
+      domain: domainFromUrl3(url)
+    };
+  });
+  const independentDomains = supportDomains.filter((domain) => domain !== primaryDomain).length;
+  const supportCount = Math.max(0, supportUrls.length - 1);
+  const corroborationScore = clamp013(independentDomains * 0.34 + supportCount * 0.08);
+  return {
+    primaryDomain,
+    supportUrls,
+    supportDomains,
+    supportSources,
+    corroborationScore: Number(corroborationScore.toFixed(3))
+  };
+}
+function confidenceFactorsForRecord(document, finding, provenance, confidence, sourceWeight) {
+  const authority = document.sourceScores?.authorityScore ?? 0.55;
+  const trust = document.sourceScores?.trustScore ?? 0.62;
+  const relevance = document.sourceScores?.relevanceScore ?? 0.55;
+  const alignment = document.sourceScores?.domainAlignmentScore ?? relevance;
+  const freshness = document.sourceScores?.freshnessScore ?? 0.62;
+  const contradictionPenalty = Math.min(1, (finding?.contradictions.length ?? 0) * 0.22);
+  const retrievalSupport = Math.min(1, provenance.supportUrls.length / 4);
+  return {
+    overall: factor(confidence),
+    sourceAuthority: factor(authority),
+    sourceTrust: factor(trust),
+    domainAlignment: factor(alignment),
+    topicRelevance: factor(relevance),
+    sourceFreshness: factor(freshness),
+    temporalConfidence: factor(freshness * 0.6 + trust * 0.24 + authority * 0.16),
+    corroboration: factor(provenance.corroborationScore),
+    retrievalSupport: factor(retrievalSupport),
+    sourceQualityWeight: factor(sourceWeight),
+    contradictionPenalty: factor(contradictionPenalty)
+  };
+}
+function factor(score) {
+  const normalized = clamp013(score);
+  return {
+    score: Number(normalized.toFixed(3)),
+    level: normalized >= 0.86 ? "very_high" : normalized >= 0.68 ? "high" : normalized >= 0.42 ? "medium" : "low"
+  };
+}
 function numericMetadata2(record, key, fallback) {
   const value = record.metadata?.[key];
   return typeof value === "number" && Number.isFinite(value) ? value : fallback;
@@ -7575,20 +8085,34 @@ function performanceConfig(mode) {
     return {
       candidateMultiplier: 1.35,
       queryCap: 6,
-      gateMinimumRatio: 0.28,
-      minTrustScore: 0.5,
-      minRelevanceScore: 0.28,
+      gateMinimumRatio: 0.35,
+      minTrustScore: 0.54,
+      minRelevanceScore: 0.4,
+      minDomainAlignmentScore: 0.4,
       debateEnabled: false,
       datasetConcurrency: 2
     };
   }
+  if (mode === "strict") {
+    return {
+      candidateMultiplier: 1.6,
+      queryCap: 20,
+      gateMinimumRatio: 0.55,
+      minTrustScore: 0.62,
+      minRelevanceScore: 0.52,
+      minDomainAlignmentScore: 0.56,
+      debateEnabled: true,
+      datasetConcurrency: 1
+    };
+  }
   if (mode === "maximum-quality") {
     return {
       candidateMultiplier: 2.6,
       queryCap: 18,
       gateMinimumRatio: 0.42,
-      minTrustScore: 0.42,
-      minRelevanceScore: 0.24,
+      minTrustScore: 0.5,
+      minRelevanceScore: 0.36,
+      minDomainAlignmentScore: 0.36,
       debateEnabled: true,
       datasetConcurrency: 1
     };
@@ -7597,24 +8121,28 @@ function performanceConfig(mode) {
     candidateMultiplier: 1.85,
     queryCap: 12,
     gateMinimumRatio: 0.35,
-    minTrustScore: 0.44,
-    minRelevanceScore: 0.24,
+    minTrustScore: 0.52,
+    minRelevanceScore: 0.38,
+    minDomainAlignmentScore: 0.38,
     debateEnabled: true,
     datasetConcurrency: 1
   };
 }
-function gateSources(sources, mode) {
+function gateSources(sources, mode, topic) {
   const perf = performanceConfig(mode);
   const concreteSources = sources.filter(isConcreteEvidenceSource);
   const accepted = concreteSources.filter((source) => {
     const trust = source.trustScore ?? source.score;
     const relevance = source.relevanceScore ?? source.score;
+    const semantic = source.semanticScore ?? source.score;
+    const alignment = source.domainAlignmentScore ?? relevance;
     const duplicateRisk = source.duplicationRisk ?? 0;
-    const authority = source.authorityScore ?? 0.5;
-    return trust >= perf.minTrustScore && relevance >= perf.minRelevanceScore && duplicateRisk < 0.72 && (trust >= 0.52 || authority >= 0.72);
+    return trust >= perf.minTrustScore && relevance >= perf.minRelevanceScore && alignment >= perf.minDomainAlignmentScore && (semantic >= 0.08 || alignment >= 0.68) && duplicateRisk < 0.72 && isTopicAlignedSource(topic, source, mode);
   });
   const minimum = Math.min(concreteSources.length, Math.max(3, Math.ceil(concreteSources.length * perf.gateMinimumRatio)));
-  const fallback = accepted.length >= minimum ? accepted : accepted.length ? accepted : concreteSources.filter((source) => (source.authorityScore ?? 0) >= 0.78 && (source.relevanceScore ?? source.score) >= perf.minRelevanceScore).slice(0, minimum);
+  const fallback = accepted.length >= minimum ? accepted : accepted.length ? accepted : concreteSources.filter(
+    (source) => (source.authorityScore ?? 0) >= 0.78 && (source.relevanceScore ?? source.score) >= perf.minRelevanceScore && isTopicAlignedSource(topic, source, mode)
+  ).slice(0, minimum);
   return {
     sources: fallback,
     filtered: Math.max(0, sources.length - fallback.length)
@@ -7625,6 +8153,60 @@ function isConcreteEvidenceSource(source) {
   const provider = (source.provider || source.discoveredBy || "").toLowerCase();
   return provider !== "local-heuristic" && !provider.includes("heuristic");
 }
+function isTopicAlignedSource(topic, source, mode) {
+  const coreTokens = topicCoreTokens2(topic);
+  if (coreTokens.length < 2) return true;
+  const alignment = source.domainAlignmentScore ?? source.relevanceScore ?? source.score;
+  const relevance = source.relevanceScore ?? source.score;
+  const trust = source.trustScore ?? source.score;
+  const sourceType = source.sourceType ?? "unknown";
+  const provider = (source.provider || source.discoveredBy || "").toLowerCase();
+  const isGithub = provider === "github" || source.domain === "github.com" || source.url.includes("github.com/");
+  const codeTopic = /\b(code|github|repository|developer|api|sdk|package|library|framework|docs|documentation)\b/i.test(topic);
+  const minAlignment = mode === "strict" ? 0.56 : mode === "fast" ? 0.4 : 0.36;
+  if (alignment < minAlignment || relevance < 0.36 || trust < 0.48) return false;
+  if ((isGithub || sourceType === "code") && !codeTopic && (alignment < 0.64 || relevance < 0.56)) return false;
+  if (source.qualitySignals?.includes("broad-source-penalty") && alignment < 0.72) return false;
+  return true;
+}
+function topicCoreTokens2(topic) {
+  const stopWords = /* @__PURE__ */ new Set([
+    "data",
+    "dataset",
+    "datasets",
+    "records",
+    "record",
+    "rows",
+    "corpus",
+    "csv",
+    "json",
+    "jsonl",
+    "parquet",
+    "rag",
+    "fine",
+    "tune",
+    "tuning",
+    "training",
+    "ready",
+    "examples",
+    "example",
+    "with",
+    "from",
+    "into",
+    "about",
+    "info",
+    "information",
+    "generate",
+    "generated",
+    "synthetic",
+    "model",
+    "models",
+    "openai",
+    "anthropic",
+    "ai"
+  ]);
+  return topic.toLowerCase().replace(/[^a-z0-9]+/g, " ").split(/\s+/).filter((token) => token.length >= 3 && !stopWords.has(token)).slice(0, 10);
+}
 function sourceDiversityScore(sources) {
   if (!sources.length) return 0;
   const domains = new Set(sources.map((source) => source.domain || domainFromUrl4(source.url)));
@@ -7668,7 +8250,7 @@ async function generateDataset(options) {
     message: `${sources.length} ranked sources from ${research.providersUsed.join(", ") || "research engine"}`,
     metric: `trust ${Math.round(averageTrust * 100)} / 100`
   });
-  const gated = gateSources(sources, performanceMode);
+  const gated = gateSources(sources, performanceMode, options.topic);
   event(options.onEvent, {
     stage: "discovery",
     agent: discovery.name,
@@ -7799,7 +8381,7 @@ async function generateDataset(options) {
     outputDir: `${workspace.datasets}/${datasetId}`,
     formats: exportFormats,
     generationPlan,
-    sourceManifest: sources,
+    sourceManifest: gated.sources,
     researchGraph: research.graph,
     generationSummary: {
       contradictionsOpen,
@@ -7812,7 +8394,7 @@ async function generateDataset(options) {
     evaluation,
     qualityMetrics,
     metrics: {
-      sourcesDiscovered: sources.length,
+      sourcesDiscovered: gated.sources.length,
       documentsExtracted: extracted.length,
       findingsVerified: debated.length,
       duplicatesRemoved: deduped.removed,
@@ -7829,9 +8411,9 @@ async function generateDataset(options) {
   artifacts.push(await writeDatasetArtifact(workspace, datasetId, "data-dictionary.md", renderDataDictionary(manifest)));
   artifacts.push(await writeDatasetArtifact(workspace, datasetId, "source-graph.json", `${JSON.stringify(research.graph, null, 2)}
 `));
-  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.json", `${JSON.stringify(sources, null, 2)}
+  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.json", `${JSON.stringify(gated.sources, null, 2)}
 `));
-  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.md", renderSourcesMarkdown(options.topic, sources)));
+  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.md", renderSourcesMarkdown(options.topic, gated.sources)));
   artifacts.push(await writeDatasetArtifact(workspace, datasetId, "generation-plan.json", `${JSON.stringify(generationPlan, null, 2)}
 `));
   artifacts.push(await writeDatasetArtifact(workspace, datasetId, "benchmark-report.json", `${JSON.stringify(evaluation, null, 2)}
@@ -9750,7 +10332,7 @@ Flags:
   --type instruction|rag|qa
   --datasets 1
   --depth shallow|medium|deep
-  --mode fast|balanced|maximum-quality
+  --mode fast|balanced|strict|maximum-quality
   --sources 8
   --rows 125
   --workspace ~/Desktop/alys-output
@@ -9914,8 +10496,9 @@ function parseDepth(value) {
   return void 0;
 }
 function parsePerformanceMode(value) {
-  if (value === "fast" || value === "balanced" || value === "maximum-quality") return value;
+  if (value === "fast" || value === "balanced" || value === "strict" || value === "maximum-quality") return value;
   if (value === "max" || value === "quality") return "maximum-quality";
+  if (value === "trust" || value === "conservative") return "strict";
   return void 0;
 }
 function isPreparationCommand(command) {
@@ -9997,6 +10580,13 @@ function truncate(value, max = 88) {
   const normalized = value.replace(/\s+/g, " ").trim();
   return normalized.length > max ? `${normalized.slice(0, max - 1)}\u2026` : normalized;
 }
+function domainFromUrl5(url) {
+  try {
+    return new URL(url).hostname.replace(/^www\./, "");
+  } catch {
+    return url;
+  }
+}
 function getMetrics(dataset) {
   const metrics = dataset.manifest.metrics;
   return metrics && typeof metrics === "object" ? metrics : {};
@@ -10024,6 +10614,10 @@ function getSourceManifest(dataset) {
   if (!Array.isArray(sources)) return [];
   return sources.filter((source) => Boolean(source) && typeof source === "object").filter((source) => typeof source.title === "string" || typeof source.url === "string");
 }
+function getResearchGraph(dataset) {
+  const graph = dataset.manifest.researchGraph;
+  return graph && typeof graph === "object" ? graph : {};
+}
 function printStage(code, status, label, metric) {
   const tint = status === "DONE" || status === "OK" ? "green" : status === "WARN" ? "yellow" : "cyan";
   const prefix = `${paint(`[${code.padEnd(4).slice(0, 4)}]`, "gray")} ${paint(status.padEnd(4), tint)}`;
@@ -10065,11 +10659,43 @@ function previewRecord(dataset) {
     if (!input && !output) return null;
     const metadata = parsed.metadata && typeof parsed.metadata === "object" ? parsed.metadata : {};
     const explanation = Array.isArray(metadata.acceptance_explanation) ? metadata.acceptance_explanation.filter((item) => typeof item === "string") : Array.isArray(metadata.acceptance_reasons) ? metadata.acceptance_reasons.filter((item) => typeof item === "string") : [];
-    return { input: truncate(input, 92), output: truncate(output, 120), why: explanation.slice(0, 2).map((item) => truncate(item, 112)) };
+    const supportSources = Array.isArray(metadata.support_sources) ? metadata.support_sources.filter((item) => Boolean(item) && typeof item === "object").map((item) => {
+      const title = typeof item.title === "string" ? item.title : "";
+      const domain = typeof item.domain === "string" ? item.domain : "";
+      return truncate(title || domain || "source", 46);
+    }).filter(Boolean).slice(0, 3) : [];
+    const groundedBy = supportSources.length ? supportSources : Array.isArray(metadata.support_urls) ? metadata.support_urls.filter((item) => typeof item === "string").slice(0, 3).map(domainFromUrl5) : typeof parsed.source_url === "string" ? [domainFromUrl5(parsed.source_url)] : [];
+    const factors = confidenceFactorsLine(metadata.confidence_factors);
+    const cautions = Array.isArray(metadata.contradiction_notes) ? metadata.contradiction_notes.filter((item) => typeof item === "string").slice(0, 2) : [];
+    return {
+      input: truncate(input, 92),
+      output: truncate(output, 120),
+      why: explanation.slice(0, 2).map((item) => truncate(item, 112)),
+      groundedBy,
+      factors,
+      cautions: cautions.map((item) => truncate(item, 112))
+    };
   } catch {
     return null;
   }
 }
+function confidenceFactorsLine(value) {
+  if (!value || typeof value !== "object") return "";
+  const factors = value;
+  const parts = [
+    factorLabel("authority", factors.sourceAuthority),
+    factorLabel("alignment", factors.domainAlignment),
+    factorLabel("freshness", factors.sourceFreshness),
+    factorLabel("corroboration", factors.corroboration),
+    factorLabel("support", factors.retrievalSupport)
+  ].filter(Boolean);
+  return parts.join(" \xB7 ");
+}
+function factorLabel(label, value) {
+  if (!value || typeof value !== "object") return "";
+  const level = value.level;
+  return typeof level === "string" ? `${label} ${level.replace("_", " ")}` : "";
+}
 function depthMultiplier2(depth) {
   if (depth === "deep") return 1.6;
   if (depth === "shallow") return 0.75;
@@ -10148,8 +10774,13 @@ function printGenerationSummary(response, workspaceRoot) {
       acc.findings += Number(metrics.findingsVerified ?? 0);
       acc.duplicates += Number(metrics.duplicatesRemoved ?? summary.duplicatesRemoved ?? 0);
       const quality = getQualityMetrics(dataset);
+      const graph = getResearchGraph(dataset);
+      const graphMetrics = graph.metrics ?? {};
       acc.contradictions += Number(quality.contradictionResolutionCount ?? 0);
       acc.lowTrustFiltered += Number(quality.lowTrustSourceFilterRate ?? 0);
+      acc.corroborationEdges += Number(graphMetrics.corroborationEdges ?? 0);
+      acc.graphContradictions += Number(graphMetrics.contradictionEdges ?? 0);
+      acc.freshness.push(Number(graphMetrics.averageFreshness ?? 0));
       acc.citationCoverage.push(Number(quality.citationCoverage ?? 0));
       acc.uniqueness.push(Number(quality.recordUniqueness ?? 0));
       acc.relevance.push(Number(quality.relevanceScore ?? 0));
@@ -10169,8 +10800,11 @@ function printGenerationSummary(response, workspaceRoot) {
       findings: 0,
       duplicates: 0,
       contradictions: 0,
+      corroborationEdges: 0,
+      graphContradictions: 0,
       lowTrustFiltered: 0,
       confidences: [],
+      freshness: [],
       citationCoverage: [],
       uniqueness: [],
       relevance: [],
@@ -10188,9 +10822,11 @@ function printGenerationSummary(response, workspaceRoot) {
   const instructionTuning = average5(totals.instructionTuning);
   const factualGrounding = average5(totals.factualGrounding);
   const humanUsefulness = average5(totals.humanUsefulness);
+  const freshness = average5(totals.freshness);
   console.log("");
   console.log(paint("Alys run complete", "green"));
   printStage("SRC", "DONE", "Authoritative sources ranked", formatInt2(totals.sources));
+  printStage("SRC", "DONE", "Source corroboration edges", formatInt2(totals.corroborationEdges));
   printStage("SRC", "DONE", "Low-trust source filter applied", `${Math.round(totals.lowTrustFiltered / Math.max(1, response.datasets.length) * 100)}% avg filtered`);
   printStage("EXT", "DONE", "Source documents normalized", formatInt2(totals.documents));
   printStage("CHK", "DONE", "Findings verified", formatInt2(totals.findings));
@@ -10201,6 +10837,7 @@ function printGenerationSummary(response, workspaceRoot) {
   printStage("EVAL", "DONE", "Citation coverage", formatPercent2(citationCoverage));
   printStage("EVAL", "DONE", "Record uniqueness", formatPercent2(uniqueness));
   printStage("EVAL", "DONE", "Topic relevance", formatPercent2(relevance));
+  printStage("EVAL", "DONE", "Source freshness", formatPercent2(freshness));
   printStage("EVAL", "DONE", "RAG suitability", formatScore(ragSuitability));
   printStage("EVAL", "DONE", "Instruction tuning suitability", formatScore(instructionTuning));
   printStage("EVAL", "DONE", "Factual grounding", formatScore(factualGrounding));
@@ -10217,6 +10854,7 @@ function printGenerationSummary(response, workspaceRoot) {
     const sources = Number(metrics.sourcesDiscovered ?? 0);
     const confidenceValue = Number(metrics.averageConfidence ?? summary.averageConfidence ?? 0);
     const blueprint2 = getBlueprint(dataset);
+    const graphMetrics = getResearchGraph(dataset).metrics ?? {};
     const outputDir = import_node_path5.default.join(root, "datasets", dataset.id);
     console.log(`${paint("\u2022", "yellow")} ${paint(dataset.id, "white")}  ${formatInt2(records)} records  ${formatInt2(sources)} sources  ${formatPercent2(confidenceValue)} confidence`);
     console.log(`  ${truncate(dataset.topic, 110)}`);
@@ -10225,6 +10863,7 @@ function printGenerationSummary(response, workspaceRoot) {
     }
     console.log(`  ${paint(outputDir, "cyan")}`);
     console.log(`  quality ${formatPercent2(Number(quality.citationCoverage ?? 0))} citations \xB7 ${formatPercent2(Number(quality.recordUniqueness ?? 0))} unique \xB7 ${formatPercent2(Number(quality.sourceDiversity ?? 0))} source diversity`);
+    console.log(`  graph ${formatInt2(Number(graphMetrics.corroborationEdges ?? 0))} corroborations \xB7 ${formatPercent2(Number(graphMetrics.averageFreshness ?? 0))} freshness \xB7 ${formatInt2(Number(graphMetrics.providerCount ?? 0))} providers`);
     console.log(`  suitability RAG ${formatScore(Number(suitability.ragSuitability ?? 0))} \xB7 tuning ${formatScore(Number(suitability.instructionTuning ?? 0))} \xB7 usefulness ${formatScore(Number(suitability.humanUsefulness ?? 0))}`);
     const topSources = getSourceManifest(dataset).slice(0, 5);
     if (topSources.length) {
@@ -10232,9 +10871,11 @@ function printGenerationSummary(response, workspaceRoot) {
       for (const source of topSources) {
         const label = source.title || source.domain || source.provider || "source";
         const trust = Number(source.trustScore ?? source.authorityScore ?? source.relevanceScore ?? 0);
+        const alignment = Number(source.domainAlignmentScore ?? 0);
         const type = source.sourceType ? ` ${source.sourceType}` : "";
         const score = trust > 0 ? ` ${formatPercent2(trust)} trust` : "";
-        console.log(`    - ${truncate(label, 76)}${paint(`${type}${score}`, "gray")}`);
+        const alignmentLabel = alignment > 0 ? ` ${formatPercent2(alignment)} aligned` : "";
+        console.log(`    - ${truncate(label, 76)}${paint(`${type}${score}${alignmentLabel}`, "gray")}`);
         if (source.url) console.log(`      ${paint(source.url, "cyan")}`);
       }
     }
@@ -10243,6 +10884,15 @@ function printGenerationSummary(response, workspaceRoot) {
       console.log(paint("  preview", "gray"));
       if (preview.input) console.log(`    in  ${paint(preview.input, "gray")}`);
       if (preview.output) console.log(`    out ${preview.output}`);
+      if (preview.groundedBy.length) {
+        console.log(`    grounded by ${paint(preview.groundedBy.join(", "), "gray")}`);
+      }
+      if (preview.factors) {
+        console.log(`    confidence ${paint(preview.factors, "gray")}`);
+      }
+      for (const caution of preview.cautions) {
+        console.log(`    caution ${paint(caution, "yellow")}`);
+      }
       for (const reason of preview.why) {
         console.log(`    why ${paint(reason, "gray")}`);
       }

package/package.json CHANGED Viewed

@@ -1,12 +1,13 @@
 {
   "name": "alys-akusa",
-  "version": "0.1.14",
+  "version": "0.1.20",
   "private": false,
   "description": "Alys local CLI runtime for autonomous AI data preparation.",
   "license": "UNLICENSED",
   "type": "module",
   "bin": {
-    "alys": "dist/index.cjs"
+    "alys": "dist/index.cjs",
+    "alys-akusa": "dist/index.cjs"
   },
   "files": [
     "dist",