alys-akusa 0.1.14 → 0.1.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.cjs +697 -47
  2. package/package.json +1 -1
package/dist/index.cjs CHANGED
@@ -1525,8 +1525,8 @@ var require_number = __commonJS({
1525
1525
  var isNumber = /[0-9]/;
1526
1526
  var isDef = (any) => any !== void 0;
1527
1527
  var round = (number, precision) => {
1528
- let factor = Math.pow(10, precision);
1529
- return Math.round(number * factor) / factor;
1528
+ let factor2 = Math.pow(10, precision);
1529
+ return Math.round(number * factor2) / factor2;
1530
1530
  };
1531
1531
  var NumberPrompt = class extends Prompt {
1532
1532
  constructor(opts = {}) {
@@ -3859,8 +3859,8 @@ var require_number2 = __commonJS({
3859
3859
  var isNumber = /[0-9]/;
3860
3860
  var isDef = (any) => any !== void 0;
3861
3861
  var round = (number, precision) => {
3862
- let factor = Math.pow(10, precision);
3863
- return Math.round(number * factor) / factor;
3862
+ let factor2 = Math.pow(10, precision);
3863
+ return Math.round(number * factor2) / factor2;
3864
3864
  };
3865
3865
  var NumberPrompt = class extends Prompt {
3866
3866
  constructor(opts = {}) {
@@ -5098,8 +5098,17 @@ async function discoverResearchSources(topic, options = {}) {
5098
5098
  const embeddingProvider = options.embeddingProvider ?? createEmbeddingProvider();
5099
5099
  const scored = await scoreSearchResults(topic, deduped, embeddingProvider);
5100
5100
  const semanticThreshold = options.minSemanticScore ?? (embeddingProvider.name === "local-token-hash" ? 0.08 : 0.18);
5101
- const semanticallyFiltered = scored.filter((source) => (source.semanticScore ?? 0) >= semanticThreshold || scored.length <= limit);
5102
- const ranked = (semanticallyFiltered.length ? semanticallyFiltered : scored).sort((a, b) => (b.trustScore ?? b.score) - (a.trustScore ?? a.score)).slice(0, limit);
5101
+ const topicAligned = scored.filter((source) => passesTopicSourceGate(topic, source, { mode: "balanced" }));
5102
+ if (!topicAligned.length && scored.length) {
5103
+ warnings.push("No search result passed Alys domain-alignment validation.");
5104
+ }
5105
+ const semanticallyFiltered = topicAligned.filter(
5106
+ (source) => (source.semanticScore ?? 0) >= semanticThreshold || (source.domainAlignmentScore ?? 0) >= 0.66
5107
+ );
5108
+ if (!semanticallyFiltered.length && topicAligned.length) {
5109
+ warnings.push("Topic-aligned sources were kept below the semantic threshold for inspection.");
5110
+ }
5111
+ const ranked = (semanticallyFiltered.length ? semanticallyFiltered : topicAligned).sort((a, b) => (b.trustScore ?? b.score) - (a.trustScore ?? a.score)).slice(0, limit);
5103
5112
  return {
5104
5113
  sources: ranked,
5105
5114
  graph: buildResearchGraph(topic, ranked),
@@ -5110,6 +5119,7 @@ async function discoverResearchSources(topic, options = {}) {
5110
5119
  }
5111
5120
  function createConfiguredSearchProviders() {
5112
5121
  const providers = [
5122
+ new CuratedAuthoritySearchProvider(),
5113
5123
  new GitHubSearchProvider(env("GITHUB_TOKEN")),
5114
5124
  new KaggleSearchProvider(env("KAGGLE_USERNAME"), env("KAGGLE_KEY"))
5115
5125
  ];
@@ -5165,7 +5175,8 @@ var GitHubSearchProvider = class {
5165
5175
  name = "github";
5166
5176
  async search(query, options = {}) {
5167
5177
  const url = new URL("https://api.github.com/search/repositories");
5168
- url.searchParams.set("q", `${query} dataset OR benchmark OR corpus OR csv OR jsonl in:name,description,readme`);
5178
+ const coreQuery = topicCoreTokens(query).slice(0, 4).join(" ") || query;
5179
+ url.searchParams.set("q", `${coreQuery} dataset OR csv OR jsonl OR corpus in:name,description,readme`);
5169
5180
  url.searchParams.set("sort", "stars");
5170
5181
  url.searchParams.set("order", "desc");
5171
5182
  url.searchParams.set("per_page", String(Math.min(20, options.limit ?? 10)));
@@ -5394,18 +5405,37 @@ var LocalHeuristicSearchProvider = class {
5394
5405
  });
5395
5406
  }
5396
5407
  };
5408
+ var CuratedAuthoritySearchProvider = class {
5409
+ name = "curated-authority";
5410
+ async search(query, options = {}) {
5411
+ const limit = Math.max(1, options.limit ?? 10);
5412
+ return authorityProfilesForTopic(query).flatMap((profile) => profile.sources.map((source) => ({ ...source, profileId: profile.id }))).slice(0, limit).map((source) => ({
5413
+ title: source.title,
5414
+ url: source.url,
5415
+ snippet: source.snippet,
5416
+ publishedAt: source.publishedAt,
5417
+ score: source.score,
5418
+ provider: this.name,
5419
+ query
5420
+ }));
5421
+ }
5422
+ };
5397
5423
  function buildResearchQueries(topic, count = 5) {
5398
5424
  const normalized = topic.trim().replace(/\s+/g, " ");
5425
+ const domainHints = domainSpecificQueryHints(normalized);
5399
5426
  const facets = [
5400
5427
  normalized,
5401
- `${normalized} dataset github kaggle benchmark`,
5428
+ `${normalized} authoritative source technical reference`,
5429
+ `${normalized} dataset kaggle github csv jsonl`,
5402
5430
  `${normalized} public dataset csv jsonl parquet`,
5403
5431
  `${normalized} official documentation standards methodology`,
5404
5432
  `${normalized} research paper benchmark evaluation`,
5405
5433
  `${normalized} case study operational data`,
5406
5434
  `${normalized} risks failures incidents constraints`,
5407
5435
  `${normalized} statistics dataset schema examples`,
5408
- `${normalized} regulatory guidance technical report`
5436
+ `${normalized} regulatory guidance technical report`,
5437
+ `${normalized} filetype:pdf manual report`,
5438
+ ...domainHints
5409
5439
  ];
5410
5440
  return Array.from(new Set(facets)).slice(0, Math.max(1, count));
5411
5441
  }
@@ -5418,15 +5448,21 @@ ${result.url}`);
5418
5448
  const domainCounts = countDomains(results.map((result) => domainFromUrl(result.url)));
5419
5449
  return results.map((result, index) => {
5420
5450
  const domain = domainFromUrl(result.url);
5421
- const semanticScore = candidateEmbeddings[index] ? clamp01(cosineSimilarity(queryEmbedding, candidateEmbeddings[index])) : lexicalRelevance(topic, `${result.title} ${result.snippet}`);
5422
- const relevanceScore = clamp01(lexicalRelevance(topic, `${result.title} ${result.snippet}`) * 0.55 + semanticScore * 0.45);
5451
+ const candidateText = `${result.title} ${result.snippet} ${result.url}`;
5452
+ const semanticScore = candidateEmbeddings[index] ? clamp01(cosineSimilarity(queryEmbedding, candidateEmbeddings[index])) : lexicalRelevance(topic, candidateText);
5453
+ const lexicalScore = lexicalRelevance(topic, candidateText);
5454
+ const domainAlignment = domainAlignmentScore(topic, candidateText);
5455
+ const broadPenalty = broadSourcePenalty(topic, result, domainAlignment);
5456
+ const relevanceScore = clamp01(lexicalScore * 0.34 + semanticScore * 0.28 + domainAlignment * 0.38 - broadPenalty * 0.34);
5423
5457
  const authority = authorityForDomain(domain, result.url);
5458
+ const authorityProfile = authorityProfileForSource(topic, result, domain);
5459
+ const authorityScore = authorityProfile ? Math.max(authority.score, authorityProfile.authorityScore) : authority.score;
5424
5460
  const freshnessScore = freshnessForDate(result.publishedAt);
5425
5461
  const duplicationRisk = clamp01(Math.max(0, (domainCounts.get(domain) ?? 1) - 1) * 0.12);
5426
5462
  const providerScore = normalizeProviderScore(result.score);
5427
5463
  const sourcePreference = sourcePreferenceScore(domain, result.url, result.provider);
5428
5464
  const trustScore = clamp01(
5429
- authority.score * 0.3 + relevanceScore * 0.27 + semanticScore * 0.18 + freshnessScore * 0.1 + providerScore * 0.07 + sourcePreference * 0.05 + (1 - duplicationRisk) * 0.05
5465
+ authorityScore * 0.3 + relevanceScore * 0.24 + semanticScore * 0.14 + domainAlignment * 0.16 + freshnessScore * 0.1 + providerScore * 0.07 + sourcePreference * 0.05 + (1 - duplicationRisk) * 0.05 - broadPenalty * 0.26
5430
5466
  );
5431
5467
  return {
5432
5468
  id: sourceId(result.url),
@@ -5439,15 +5475,20 @@ ${result.url}`);
5439
5475
  query: result.query,
5440
5476
  domain,
5441
5477
  publishedAt: result.publishedAt,
5442
- authorityScore: Number(authority.score.toFixed(3)),
5478
+ authorityScore: Number(authorityScore.toFixed(3)),
5443
5479
  relevanceScore: Number(relevanceScore.toFixed(3)),
5444
5480
  freshnessScore: Number(freshnessScore.toFixed(3)),
5445
5481
  duplicationRisk: Number(duplicationRisk.toFixed(3)),
5446
5482
  semanticScore: Number(semanticScore.toFixed(3)),
5483
+ domainAlignmentScore: Number(domainAlignment.toFixed(3)),
5447
5484
  trustScore: Number(trustScore.toFixed(3)),
5448
5485
  sourceType: authority.type,
5449
5486
  qualitySignals: [
5450
5487
  ...authority.signals,
5488
+ ...authorityProfile ? [`authority-pack:${authorityProfile.id}`] : [],
5489
+ ...domainAlignment >= 0.72 ? ["strong-topic-alignment"] : [],
5490
+ ...domainAlignment < 0.34 ? ["weak-topic-alignment"] : [],
5491
+ ...broadPenalty >= 0.5 ? ["broad-source-penalty"] : [],
5451
5492
  ...sourcePreference >= 0.85 ? ["preferred-source-surface"] : [],
5452
5493
  ...result.provider === "github" ? ["github-repository-search"] : [],
5453
5494
  ...result.provider === "kaggle" ? ["kaggle-dataset-search"] : []
@@ -5455,11 +5496,36 @@ ${result.url}`);
5455
5496
  };
5456
5497
  });
5457
5498
  }
5499
+ function passesTopicSourceGate(topic, source, options = {}) {
5500
+ const specialized = isSpecializedTopic(topic);
5501
+ const mode = options.mode ?? "balanced";
5502
+ const trust = source.trustScore ?? source.score;
5503
+ const relevance = source.relevanceScore ?? source.score;
5504
+ const semantic = source.semanticScore ?? source.score;
5505
+ const alignment = source.domainAlignmentScore ?? domainAlignmentScore(topic, `${source.title} ${source.snippet} ${source.url}`);
5506
+ const duplicateRisk = source.duplicationRisk ?? 0;
5507
+ const sourceType = source.sourceType ?? "unknown";
5508
+ const provider = (source.provider || source.discoveredBy || "").toLowerCase();
5509
+ const isCodeSource = sourceType === "code" || provider === "github" || domainFromUrl(source.url) === "github.com";
5510
+ const codeTopic = isCodeOrRepositoryTopic(topic);
5511
+ const broadPenalty = source.qualitySignals?.includes("broad-source-penalty") ? 0.7 : 0;
5512
+ const thresholds = mode === "fast" ? { trust: 0.54, relevance: 0.42, semantic: 0.06, alignment: 0.4 } : mode === "strict" ? { trust: 0.62, relevance: 0.52, semantic: 0.12, alignment: 0.56 } : mode === "maximum-quality" ? { trust: 0.5, relevance: 0.38, semantic: 0.08, alignment: 0.36 } : { trust: 0.52, relevance: 0.4, semantic: 0.08, alignment: 0.38 };
5513
+ if (duplicateRisk >= 0.82) return false;
5514
+ if (!specialized) {
5515
+ return trust >= Math.max(0.42, thresholds.trust - 0.08) && relevance >= Math.max(0.28, thresholds.relevance - 0.1);
5516
+ }
5517
+ if (alignment < thresholds.alignment || relevance < thresholds.relevance || trust < thresholds.trust) return false;
5518
+ if (semantic < thresholds.semantic && alignment < 0.68) return false;
5519
+ if (broadPenalty >= 0.5 && alignment < 0.72) return false;
5520
+ if (isCodeSource && !codeTopic && (alignment < 0.64 || relevance < 0.56)) return false;
5521
+ return true;
5522
+ }
5458
5523
  function buildResearchGraph(topic, sources) {
5459
5524
  const topicId = `topic:${sourceId(topic)}`;
5460
5525
  const clusters = buildClusters(sources);
5461
5526
  const entities = extractEntities([topic, ...sources.flatMap((source) => [source.title, source.snippet])]).slice(0, 12);
5462
5527
  const contradictions = inferContradictions(sources, clusters);
5528
+ const relationshipEdges = buildSourceRelationshipEdges(sources);
5463
5529
  return {
5464
5530
  topic,
5465
5531
  generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
@@ -5527,7 +5593,8 @@ function buildResearchGraph(topic, sources) {
5527
5593
  weight: contradiction.severity === "high" ? 0.9 : 0.55,
5528
5594
  evidence: contradiction.reason
5529
5595
  }))
5530
- )
5596
+ ),
5597
+ ...relationshipEdges
5531
5598
  ],
5532
5599
  clusters,
5533
5600
  contradictions,
@@ -5537,10 +5604,53 @@ function buildResearchGraph(topic, sources) {
5537
5604
  averageTrust: average(sources.map((source) => source.trustScore ?? source.score)),
5538
5605
  averageRelevance: average(sources.map((source) => source.relevanceScore ?? source.score)),
5539
5606
  averageAuthority: average(sources.map((source) => source.authorityScore ?? 0.5)),
5607
+ averageFreshness: average(sources.map((source) => source.freshnessScore ?? 0.62)),
5608
+ corroborationEdges: relationshipEdges.filter((edge) => edge.relation === "corroborates").length,
5609
+ contradictionEdges: contradictions.reduce((sum, contradiction) => sum + contradiction.sourceIds.length, 0),
5540
5610
  duplicateRisk: average(sources.map((source) => source.duplicationRisk ?? 0))
5541
5611
  }
5542
5612
  };
5543
5613
  }
5614
+ function buildSourceRelationshipEdges(sources) {
5615
+ const edges = [];
5616
+ for (let i = 0; i < sources.length; i++) {
5617
+ for (let j = i + 1; j < sources.length; j++) {
5618
+ const left = sources[i];
5619
+ const right = sources[j];
5620
+ if ((left.domain ?? domainFromUrl(left.url)) === (right.domain ?? domainFromUrl(right.url))) continue;
5621
+ const leftText = `${left.title} ${left.snippet}`;
5622
+ const rightText = `${right.title} ${right.snippet}`;
5623
+ const similarity = jaccardSimilarity(leftText, rightText);
5624
+ const sharedPack = authorityPack(left) && authorityPack(left) === authorityPack(right);
5625
+ const alignment = Math.min(left.domainAlignmentScore ?? left.relevanceScore ?? 0, right.domainAlignmentScore ?? right.relevanceScore ?? 0);
5626
+ const trust = Math.min(left.trustScore ?? left.score, right.trustScore ?? right.score);
5627
+ const shouldLink = (sharedPack || similarity >= 0.14) && alignment >= 0.48 && trust >= 0.58;
5628
+ if (!shouldLink) continue;
5629
+ edges.push({
5630
+ from: `source:${left.id}`,
5631
+ to: `source:${right.id}`,
5632
+ relation: "corroborates",
5633
+ weight: Number(Math.min(1, similarity * 0.48 + alignment * 0.28 + trust * 0.24 + (sharedPack ? 0.16 : 0)).toFixed(3)),
5634
+ evidence: sharedRelationshipEvidence(left, right, sharedPack ? authorityPack(left) ?? void 0 : void 0)
5635
+ });
5636
+ }
5637
+ }
5638
+ return edges.sort((a, b) => b.weight - a.weight).slice(0, 36);
5639
+ }
5640
+ function authorityPack(source) {
5641
+ const signal = source.qualitySignals?.find((item) => item.startsWith("authority-pack:"));
5642
+ return signal ? signal.replace("authority-pack:", "") : null;
5643
+ }
5644
+ function sharedRelationshipEvidence(left, right, pack) {
5645
+ const terms = sharedTerms(`${left.title} ${left.snippet}`, `${right.title} ${right.snippet}`).slice(0, 6);
5646
+ const packText = pack ? `same authority pack (${pack})` : "shared topic evidence";
5647
+ return terms.length ? `${packText}; shared terms: ${terms.join(", ")}` : packText;
5648
+ }
5649
+ function sharedTerms(left, right) {
5650
+ const leftTokens = tokenSet(normalizeForSearch(left));
5651
+ const rightTokens = tokenSet(normalizeForSearch(right));
5652
+ return [...leftTokens].filter((token) => token.length >= 5 && rightTokens.has(token) && !ENTITY_STOP_WORDS.has(token)).slice(0, 12);
5653
+ }
5544
5654
  function buildClusters(sources) {
5545
5655
  const byType = /* @__PURE__ */ new Map();
5546
5656
  for (const source of sources) {
@@ -5618,6 +5728,9 @@ function resultFromObject(item, provider, query, keys) {
5618
5728
  raw: item
5619
5729
  }];
5620
5730
  }
5731
+ function domainSpecificQueryHints(topic) {
5732
+ return authorityProfilesForTopic(topic).flatMap((profile) => profile.queryHints.map((hint) => `${topic} ${hint}`));
5733
+ }
5621
5734
  async function fetchJson(url, init = {}, timeoutMs = SEARCH_TIMEOUT_MS) {
5622
5735
  const controller = new AbortController();
5623
5736
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
@@ -5714,6 +5827,59 @@ function lexicalRelevance(topic, candidate) {
5714
5827
  const overlap = [...topicTokens].filter((token) => candidateTokens.has(token)).length;
5715
5828
  return clamp01(overlap / Math.max(1, topicTokens.size) * 0.85 + jaccardSimilarity(topic, candidate) * 0.15);
5716
5829
  }
5830
+ function domainAlignmentScore(topic, candidate) {
5831
+ const coreTokens = topicCoreTokens(topic);
5832
+ if (!coreTokens.length) return lexicalRelevance(topic, candidate);
5833
+ const normalizedCandidate = normalizeForSearch(candidate);
5834
+ const candidateTokens = tokenSet(normalizedCandidate);
5835
+ const tokenHits = coreTokens.filter((token) => candidateTokenMatches(token, normalizedCandidate, candidateTokens));
5836
+ const bigrams = coreTokens.slice(0, -1).map((token, index) => `${token} ${coreTokens[index + 1]}`);
5837
+ const bigramHits = bigrams.filter((bigram) => normalizedCandidate.includes(bigram));
5838
+ const phrase = coreTokens.join(" ");
5839
+ const phraseScore = phrase.length > 4 && normalizedCandidate.includes(phrase) ? 1 : 0;
5840
+ return clamp01(
5841
+ tokenHits.length / Math.max(1, coreTokens.length) * 0.62 + bigramHits.length / Math.max(1, bigrams.length || 1) * 0.26 + phraseScore * 0.12
5842
+ );
5843
+ }
5844
+ function topicCoreTokens(topic) {
5845
+ return normalizeForSearch(topic).split(/\s+/).filter((token) => token.length >= 3 && !TOPIC_STOP_WORDS.has(token)).slice(0, 10);
5846
+ }
5847
+ function candidateTokenMatches(token, normalizedCandidate, candidateTokens) {
5848
+ if (candidateTokens.has(token) || normalizedCandidate.includes(token)) return true;
5849
+ const synonyms = TOPIC_SYNONYMS[token] ?? [];
5850
+ return synonyms.some((synonym) => normalizedCandidate.includes(synonym));
5851
+ }
5852
+ function isSpecializedTopic(topic) {
5853
+ const coreTokens = topicCoreTokens(topic);
5854
+ return coreTokens.length >= 2;
5855
+ }
5856
+ function isCodeOrRepositoryTopic(topic) {
5857
+ const normalized = normalizeForSearch(topic);
5858
+ return CODE_TOPIC_TERMS.some((term) => normalized.includes(term));
5859
+ }
5860
+ function broadSourcePenalty(topic, result, alignment) {
5861
+ if (!isSpecializedTopic(topic) || alignment >= 0.62) return 0;
5862
+ const text = normalizeForSearch(`${result.title} ${result.snippet} ${result.url}`);
5863
+ const broadHits = BROAD_SOURCE_TERMS.filter((term) => text.includes(term)).length;
5864
+ const providerPenalty = result.provider === "github" ? 0.16 : 0;
5865
+ return clamp01(broadHits * 0.14 + providerPenalty);
5866
+ }
5867
+ function normalizeForSearch(value) {
5868
+ return value.toLowerCase().replace(/[^a-z0-9]+/g, " ").replace(/\s+/g, " ").trim();
5869
+ }
5870
+ function authorityProfilesForTopic(topic) {
5871
+ const normalized = normalizeForSearch(topic);
5872
+ return AUTHORITY_PROFILES.filter((profile) => profile.match.some((term) => normalized.includes(term)));
5873
+ }
5874
+ function authorityProfileForSource(topic, result, domain) {
5875
+ const normalized = normalizeForSearch(`${result.title} ${result.snippet} ${result.url}`);
5876
+ return authorityProfilesForTopic(topic).find((profile) => {
5877
+ const domainMatch = profile.authorityDomains.some((authorityDomain) => domain === authorityDomain || domain.endsWith(`.${authorityDomain}`));
5878
+ const sourceMatch = profile.sources.some((source) => canonicalizeUrl(source.url) === canonicalizeUrl(result.url));
5879
+ const topicMatch = profile.match.some((term) => normalized.includes(term));
5880
+ return (domainMatch || sourceMatch) && topicMatch;
5881
+ }) ?? null;
5882
+ }
5717
5883
  function normalizeProviderScore(score) {
5718
5884
  if (typeof score !== "number" || Number.isNaN(score)) return 0.55;
5719
5885
  if (score <= 1) return clamp01(score);
@@ -5803,6 +5969,196 @@ var LOCAL_SOURCE_BASES = [
5803
5969
  { label: "Google Scholar", url: "https://scholar.google.com/scholar", querySuffix: "q", signal: "Academic source discovery surface." },
5804
5970
  { label: "OpenAlex", url: "https://openalex.org/search", querySuffix: "q", signal: "Open scholarly metadata and research graph candidates." }
5805
5971
  ];
5972
+ var AUTHORITY_PROFILES = [
5973
+ {
5974
+ id: "oil-gas",
5975
+ label: "Oil & Gas",
5976
+ match: ["oil", "gas", "pipeline", "pipelines", "drilling", "well", "reservoir", "scada", "pump", "hazardous liquid", "petroleum"],
5977
+ authorityDomains: ["phmsa.dot.gov", "npms.phmsa.dot.gov", "bts.gov", "ntsb.gov", "spe.org", "api.org"],
5978
+ queryHints: [
5979
+ "PHMSA pipeline data report",
5980
+ "SCADA pressure flow telemetry",
5981
+ "API 1160 integrity management",
5982
+ "pump station operations manual",
5983
+ "incident report technical dataset"
5984
+ ],
5985
+ authorityScore: 0.94,
5986
+ sources: [
5987
+ {
5988
+ title: "PHMSA Pipeline Incident 20 Year Trends",
5989
+ url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/pipeline-incident-20-year-trends",
5990
+ snippet: "Official PHMSA incident trend data for gas distribution, gas gathering, gas transmission, LNG, underground storage, and hazardous liquid pipeline systems, including operator-submitted incident records and flagged files.",
5991
+ score: 0.96
5992
+ },
5993
+ {
5994
+ title: "PHMSA Pipeline Source Data",
5995
+ url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/source-data",
5996
+ snippet: "Official PHMSA source data covering pipeline annual reports, incident reports, safety-related condition reports, integrity assurance notifications, gas systems, LNG, and hazardous liquid operators.",
5997
+ score: 0.95
5998
+ },
5999
+ {
6000
+ title: "PHMSA National Pipeline Performance Measures",
6001
+ url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/national-pipeline-performance-measures",
6002
+ snippet: "National pipeline performance measures for pipeline safety, integrity management, incident categories, serious incidents, significant incidents, all reported incidents, and pipeline infrastructure performance.",
6003
+ score: 0.93
6004
+ },
6005
+ {
6006
+ title: "PHMSA Pipeline Safety Data Report Index",
6007
+ url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/pipeline-safety-data-report-index",
6008
+ snippet: "PHMSA index of pipeline safety datasets, annual report summaries, integrity management performance, incident cause/type metrics, excavation damage, pipeline mileage, facilities, and technical resources.",
6009
+ score: 0.93
6010
+ },
6011
+ {
6012
+ title: "National Pipeline Mapping System Pipeline Data",
6013
+ url: "https://www.npms.phmsa.dot.gov/PipelineData.aspx",
6014
+ snippet: "National Pipeline Mapping System data for gas transmission pipelines, hazardous liquid pipelines, LNG plants, breakout tanks, operator submissions, pipeline mapping, and integrity-management context.",
6015
+ score: 0.9
6016
+ },
6017
+ {
6018
+ title: "Bureau of Transportation Statistics Pipeline Safety and Property Damage Data",
6019
+ url: "https://www.bts.gov/content/hazardous-liquid-and-natural-gas-pipeline-safety-and-property-damage-data",
6020
+ snippet: "BTS table for hazardous liquid and natural gas pipeline safety and property damage data sourced from PHMSA pipeline incident statistics and transportation safety records.",
6021
+ score: 0.88
6022
+ },
6023
+ {
6024
+ title: "NTSB Pipeline Investigation Reports",
6025
+ url: "https://www.ntsb.gov/investigations/AccidentReports/Pages/Reports.aspx",
6026
+ snippet: "National Transportation Safety Board investigation reports, including pipeline accident reports, factual records, emergency response findings, integrity management evidence, and safety recommendations.",
6027
+ score: 0.86
6028
+ }
6029
+ ]
6030
+ },
6031
+ {
6032
+ id: "healthcare",
6033
+ label: "Healthcare",
6034
+ match: ["medical", "clinical", "health", "healthcare", "patient", "biomedical", "drug", "disease", "diagnosis", "treatment"],
6035
+ authorityDomains: ["nih.gov", "nlm.nih.gov", "pubmed.ncbi.nlm.nih.gov", "clinicaltrials.gov", "fda.gov", "open.fda.gov", "cdc.gov"],
6036
+ queryHints: ["PubMed clinical guideline", "NIH medical terminology", "FDA open data", "CDC public health dataset"],
6037
+ authorityScore: 0.95,
6038
+ sources: [
6039
+ {
6040
+ title: "PubMed Biomedical Literature",
6041
+ url: "https://pubmed.ncbi.nlm.nih.gov/",
6042
+ snippet: "National Library of Medicine search surface for biomedical literature, clinical studies, medical terminology, and peer-reviewed evidence.",
6043
+ score: 0.95
6044
+ },
6045
+ {
6046
+ title: "ClinicalTrials.gov Data API",
6047
+ url: "https://clinicaltrials.gov/data-api/about-api",
6048
+ snippet: "Official ClinicalTrials.gov API and data access for clinical study metadata, interventions, conditions, sponsors, and study outcomes.",
6049
+ score: 0.92
6050
+ },
6051
+ {
6052
+ title: "openFDA APIs",
6053
+ url: "https://open.fda.gov/apis/",
6054
+ snippet: "Official FDA open data APIs for drugs, devices, foods, tobacco, and enforcement datasets.",
6055
+ score: 0.92
6056
+ },
6057
+ {
6058
+ title: "CDC Data Catalog",
6059
+ url: "https://data.cdc.gov/",
6060
+ snippet: "Official CDC public health datasets for surveillance, epidemiology, facilities, disease reporting, and health indicators.",
6061
+ score: 0.9
6062
+ }
6063
+ ]
6064
+ },
6065
+ {
6066
+ id: "legal",
6067
+ label: "Legal & Compliance",
6068
+ match: ["legal", "law", "compliance", "policy", "regulation", "regulatory", "contract", "privacy", "statute", "court"],
6069
+ authorityDomains: ["govinfo.gov", "law.cornell.edu", "courtlistener.com", "federalregister.gov", "sec.gov", "justice.gov"],
6070
+ queryHints: ["official regulation guidance", "statute case law corpus", "court opinion dataset", "federal register rule"],
6071
+ authorityScore: 0.92,
6072
+ sources: [
6073
+ {
6074
+ title: "GovInfo",
6075
+ url: "https://www.govinfo.gov/",
6076
+ snippet: "Official U.S. Government Publishing Office access to federal statutes, regulations, congressional documents, and official government publications.",
6077
+ score: 0.92
6078
+ },
6079
+ {
6080
+ title: "Cornell Legal Information Institute",
6081
+ url: "https://www.law.cornell.edu/",
6082
+ snippet: "Legal Information Institute access to U.S. Code, CFR, Supreme Court opinions, Wex legal dictionary, and legal reference material.",
6083
+ score: 0.86
6084
+ },
6085
+ {
6086
+ title: "CourtListener",
6087
+ url: "https://www.courtlistener.com/",
6088
+ snippet: "Public legal database for court opinions, dockets, judges, citations, and legal research datasets.",
6089
+ score: 0.86
6090
+ },
6091
+ {
6092
+ title: "Federal Register",
6093
+ url: "https://www.federalregister.gov/",
6094
+ snippet: "Official daily publication for U.S. federal rules, proposed rules, notices, executive orders, and regulatory actions.",
6095
+ score: 0.9
6096
+ }
6097
+ ]
6098
+ },
6099
+ {
6100
+ id: "finance",
6101
+ label: "Finance",
6102
+ match: ["finance", "financial", "banking", "market", "markets", "sec", "filing", "risk", "credit", "macroeconomic", "economic"],
6103
+ authorityDomains: ["sec.gov", "fred.stlouisfed.org", "federalreserve.gov", "consumerfinance.gov", "treasury.gov"],
6104
+ queryHints: ["SEC EDGAR filing data", "FRED economic dataset", "Federal Reserve data", "CFPB complaint database"],
6105
+ authorityScore: 0.93,
6106
+ sources: [
6107
+ {
6108
+ title: "SEC EDGAR",
6109
+ url: "https://www.sec.gov/edgar",
6110
+ snippet: "Official SEC EDGAR company filings, disclosures, financial statements, risk factors, and market regulatory documents.",
6111
+ score: 0.94
6112
+ },
6113
+ {
6114
+ title: "FRED Economic Data",
6115
+ url: "https://fred.stlouisfed.org/",
6116
+ snippet: "Federal Reserve Bank of St. Louis economic time series, macroeconomic indicators, rates, labor, inflation, and financial data.",
6117
+ score: 0.92
6118
+ },
6119
+ {
6120
+ title: "Federal Reserve Data",
6121
+ url: "https://www.federalreserve.gov/data.htm",
6122
+ snippet: "Official Federal Reserve data releases, banking data, monetary policy data, financial accounts, and regulatory reports.",
6123
+ score: 0.91
6124
+ },
6125
+ {
6126
+ title: "CFPB Consumer Complaint Database",
6127
+ url: "https://www.consumerfinance.gov/data-research/consumer-complaints/",
6128
+ snippet: "Consumer Financial Protection Bureau complaint database covering financial products, institutions, issues, responses, and trends.",
6129
+ score: 0.88
6130
+ }
6131
+ ]
6132
+ },
6133
+ {
6134
+ id: "developer-docs",
6135
+ label: "Developer Documentation",
6136
+ match: ["developer", "developers", "api", "sdk", "code", "repository", "github", "documentation", "docs", "package", "library", "framework"],
6137
+ authorityDomains: ["docs.github.com", "developer.mozilla.org", "npmjs.com", "nodejs.org", "typescriptlang.org"],
6138
+ queryHints: ["official API reference", "developer documentation examples", "SDK guide", "GitHub repository docs"],
6139
+ authorityScore: 0.88,
6140
+ sources: [
6141
+ {
6142
+ title: "GitHub Docs",
6143
+ url: "https://docs.github.com/",
6144
+ snippet: "Official GitHub documentation for repositories, Actions, APIs, packages, security, and developer workflows.",
6145
+ score: 0.88
6146
+ },
6147
+ {
6148
+ title: "MDN Web Docs",
6149
+ url: "https://developer.mozilla.org/",
6150
+ snippet: "Mozilla Developer Network reference for web platform APIs, JavaScript, HTML, CSS, browser behavior, and examples.",
6151
+ score: 0.88
6152
+ },
6153
+ {
6154
+ title: "npm Docs",
6155
+ url: "https://docs.npmjs.com/",
6156
+ snippet: "Official npm documentation for packages, publishing, package.json, CLI usage, registry behavior, and access control.",
6157
+ score: 0.84
6158
+ }
6159
+ ]
6160
+ }
6161
+ ];
5806
6162
  var ENTITY_STOP_WORDS = /* @__PURE__ */ new Set([
5807
6163
  "about",
5808
6164
  "source",
@@ -5820,6 +6176,79 @@ var ENTITY_STOP_WORDS = /* @__PURE__ */ new Set([
5820
6176
  "example",
5821
6177
  "examples"
5822
6178
  ]);
6179
+ var TOPIC_STOP_WORDS = /* @__PURE__ */ new Set([
6180
+ ...ENTITY_STOP_WORDS,
6181
+ "data",
6182
+ "records",
6183
+ "record",
6184
+ "row",
6185
+ "rows",
6186
+ "corpus",
6187
+ "csv",
6188
+ "json",
6189
+ "jsonl",
6190
+ "parquet",
6191
+ "rag",
6192
+ "fine",
6193
+ "tune",
6194
+ "tuning",
6195
+ "training",
6196
+ "ready",
6197
+ "examples",
6198
+ "example",
6199
+ "with",
6200
+ "from",
6201
+ "into",
6202
+ "about",
6203
+ "info",
6204
+ "information",
6205
+ "generate",
6206
+ "generated",
6207
+ "synthetic",
6208
+ "model",
6209
+ "models",
6210
+ "openai",
6211
+ "anthropic",
6212
+ "ai"
6213
+ ]);
6214
+ var CODE_TOPIC_TERMS = [
6215
+ "code",
6216
+ "github",
6217
+ "repository",
6218
+ "repositories",
6219
+ "developer",
6220
+ "api",
6221
+ "sdk",
6222
+ "package",
6223
+ "library",
6224
+ "framework",
6225
+ "docs",
6226
+ "documentation",
6227
+ "typescript",
6228
+ "python",
6229
+ "javascript"
6230
+ ];
6231
+ var BROAD_SOURCE_TERMS = [
6232
+ "awesome",
6233
+ "tutorial",
6234
+ "course",
6235
+ "applied ml",
6236
+ "machine learning",
6237
+ "deep learning",
6238
+ "data science",
6239
+ "examples",
6240
+ "notebook",
6241
+ "collection",
6242
+ "curated list",
6243
+ "roadmap"
6244
+ ];
6245
+ var TOPIC_SYNONYMS = {
6246
+ oil: ["hazardous liquid", "petroleum", "crude", "liquid pipeline", "liquid pipelines"],
6247
+ gas: ["natural gas", "lng", "gas transmission", "gas distribution"],
6248
+ pipeline: ["pipelines", "transmission line", "hazardous liquid"],
6249
+ drilling: ["wellbore", "rig", "bha", "mwd", "lwd"],
6250
+ well: ["wellbore", "reservoir", "completion"]
6251
+ };
5823
6252
 
5824
6253
  // ../../packages/crawler/src/index.ts
5825
6254
  async function crawlSource(source) {
@@ -6015,6 +6444,7 @@ function explainRecordAcceptance(record, topic) {
6015
6444
  const trust = numericMetadata(record, "source_trust_score", 0);
6016
6445
  const authority = numericMetadata(record, "source_authority_score", 0);
6017
6446
  const relevance = numericMetadata(record, "source_relevance_score", 0);
6447
+ const alignment = numericMetadata(record, "source_domain_alignment_score", 0);
6018
6448
  const supportCount = numericMetadata(record, "support_count", 0);
6019
6449
  const contradictions = numericMetadata(record, "contradiction_count", 0);
6020
6450
  const citationCoverage = citationCoverageScore(record);
@@ -6024,7 +6454,7 @@ function explainRecordAcceptance(record, topic) {
6024
6454
  else if (trust >= 0.5) reasons.push("Moderate source trust accepted with supporting quality signals.");
6025
6455
  else reasons.push("Source trust is weak; confidence should remain cautious.");
6026
6456
  if (authority >= 0.72) reasons.push("Source authority is strong enough to raise confidence.");
6027
- if (relevance >= 0.65 || relevanceScore >= 0.65) reasons.push("Record is semantically aligned with the requested dataset topic.");
6457
+ if (alignment >= 0.65 || relevance >= 0.65 || relevanceScore >= 0.65) reasons.push("Record is semantically aligned with the requested dataset topic.");
6028
6458
  if (supportCount >= 2) reasons.push("Independent support was detected across sources.");
6029
6459
  if (contradictions === 0) reasons.push("No open contradiction penalty was applied.");
6030
6460
  if (citationCoverage >= 0.9) reasons.push("Record preserves source URL, source title, context, and confidence.");
@@ -6073,7 +6503,11 @@ function citationCoverageScore(record) {
6073
6503
  }
6074
6504
  function relevanceScoreForRecord(record, topic) {
6075
6505
  const metadataRelevance = numericMetadata(record, "source_relevance_score", NaN);
6506
+ const metadataAlignment = numericMetadata(record, "source_domain_alignment_score", NaN);
6076
6507
  const lexical = jaccardSimilarity(topic, `${record.input} ${record.output} ${record.context}`) * 2.6;
6508
+ if (Number.isFinite(metadataRelevance) && Number.isFinite(metadataAlignment)) {
6509
+ return round01(metadataRelevance * 0.34 + metadataAlignment * 0.36 + Math.min(1, lexical) * 0.3);
6510
+ }
6077
6511
  return round01(Number.isFinite(metadataRelevance) ? metadataRelevance * 0.55 + Math.min(1, lexical) * 0.45 : Math.min(1, lexical));
6078
6512
  }
6079
6513
  function estimateSourceDiversity(records) {
@@ -6256,6 +6690,7 @@ Source: ${result.source.url}
6256
6690
  freshnessScore: result.source.freshnessScore,
6257
6691
  duplicationRisk: result.source.duplicationRisk,
6258
6692
  semanticScore: result.source.semanticScore,
6693
+ domainAlignmentScore: result.source.domainAlignmentScore,
6259
6694
  trustScore: result.source.trustScore
6260
6695
  }
6261
6696
  };
@@ -6735,7 +7170,7 @@ function getGroqKey() {
6735
7170
  function providerPrompt(options) {
6736
7171
  const { topic, datasetType, document, targetCount } = options;
6737
7172
  const sourceText = document.text.replace(/\s+/g, " ").trim().slice(0, 7e3);
6738
- const sourceScores = document.sourceScores ? `Trust: ${document.sourceScores.trustScore ?? "unknown"}; Authority: ${document.sourceScores.authorityScore ?? "unknown"}; Relevance: ${document.sourceScores.relevanceScore ?? "unknown"}; Freshness: ${document.sourceScores.freshnessScore ?? "unknown"}; Duplication risk: ${document.sourceScores.duplicationRisk ?? "unknown"}` : "Source quality scores unavailable.";
7173
+ const sourceScores = document.sourceScores ? `Trust: ${document.sourceScores.trustScore ?? "unknown"}; Authority: ${document.sourceScores.authorityScore ?? "unknown"}; Relevance: ${document.sourceScores.relevanceScore ?? "unknown"}; Domain alignment: ${document.sourceScores.domainAlignmentScore ?? "unknown"}; Freshness: ${document.sourceScores.freshnessScore ?? "unknown"}; Duplication risk: ${document.sourceScores.duplicationRisk ?? "unknown"}` : "Source quality scores unavailable.";
6739
7174
  const mode = datasetType === "instruction" ? "Instruction tuning: input is a task/instruction, output is the ideal answer." : datasetType === "rag" ? "RAG: input is a realistic user query, output is a compact answer, context is retrieval-ready evidence text." : "QA: input is a question, output is a grounded answer.";
6740
7175
  const segmentBlock = options.segment ? `
6741
7176
  Active topic segment:
@@ -7263,16 +7698,19 @@ var StructuringAgent = class {
7263
7698
  const concurrency = Math.max(1, Math.min(documents.length, Number(process.env.ALYS_PROVIDER_CONCURRENCY ?? 3)));
7264
7699
  let totalGenerated = 0;
7265
7700
  const grouped = await mapLimit2(documents, concurrency, async (document, index) => {
7266
- const finding = findings[index] ?? findings[0];
7701
+ const finding = findings.find((item) => item.id === document.sourceId) ?? findings[index] ?? findings[0];
7267
7702
  const trustScore = document.sourceScores?.trustScore ?? 0.62;
7268
7703
  const authorityScore = document.sourceScores?.authorityScore ?? 0.55;
7269
7704
  const relevanceScore = document.sourceScores?.relevanceScore ?? 0.55;
7705
+ const alignmentScore = document.sourceScores?.domainAlignmentScore ?? relevanceScore;
7270
7706
  const duplicationRisk = document.sourceScores?.duplicationRisk ?? 0;
7271
7707
  if (trustScore < (options.minTrustScore ?? 0.42) || relevanceScore < (options.minRelevanceScore ?? 0.24)) return [];
7272
7708
  const sourceWeight = sourceQualityWeight(document);
7273
7709
  const segment2 = options.generationPlan ? segmentForSource(options.generationPlan, `${document.title} ${document.text}`, index) : void 0;
7274
7710
  const blueprint2 = options.generationPlan?.blueprint;
7275
- const baselineConfidence = clamp013((finding?.confidence ?? 0.7) * 0.55 + trustScore * 0.22 + authorityScore * 0.12 + relevanceScore * 0.11 - duplicationRisk * 0.08);
7711
+ const baselineConfidence = clamp013(
7712
+ (finding?.confidence ?? 0.62) * 0.42 + trustScore * 0.2 + authorityScore * 0.1 + relevanceScore * 0.12 + alignmentScore * 0.12 - duplicationRisk * 0.12
7713
+ );
7276
7714
  const baseId = import_node_crypto3.default.createHash("sha1").update(`${topic}:${document.url}:${datasetType}`).digest("hex").slice(0, 14);
7277
7715
  const providerTarget = useProvider ? weightedRecordTarget(recordsPerDocument, options.providerRecordsPerDocument ?? recordsPerDocument, sourceWeight) : 0;
7278
7716
  let providerResult = null;
@@ -7289,8 +7727,10 @@ var StructuringAgent = class {
7289
7727
  });
7290
7728
  }
7291
7729
  if (providerResult?.records.length) {
7730
+ const provenance = buildRecordProvenance(document, finding, documents);
7292
7731
  const mapped = providerResult.records.map((g, variantIndex) => {
7293
- const adjustedConfidence = trustWeightedConfidence(g.confidence, baselineConfidence, document, finding, sourceWeight);
7732
+ const adjustedConfidence = trustWeightedConfidence(g.confidence, baselineConfidence, document, finding, sourceWeight, provenance.corroborationScore);
7733
+ const confidenceFactors = confidenceFactorsForRecord(document, finding, provenance, adjustedConfidence, sourceWeight);
7294
7734
  return {
7295
7735
  id: `${baseId}-${variantIndex}`,
7296
7736
  input: g.input,
@@ -7313,14 +7753,24 @@ var StructuringAgent = class {
7313
7753
  provider: providerResult.provider,
7314
7754
  model: providerResult.model,
7315
7755
  latency_ms: providerResult.latencyMs,
7316
- support_count: finding?.support.length ?? 1,
7756
+ support_count: provenance.supportUrls.length,
7757
+ support_urls: provenance.supportUrls,
7758
+ support_domains: provenance.supportDomains,
7759
+ support_sources: provenance.supportSources,
7760
+ primary_source_domain: provenance.primaryDomain,
7761
+ corroboration_score: provenance.corroborationScore,
7317
7762
  contradiction_count: finding?.contradictions.length ?? 0,
7763
+ contradiction_notes: finding?.contradictions ?? [],
7764
+ contradiction_status: (finding?.contradictions.length ?? 0) > 0 ? "needs_review" : "clear",
7765
+ confidence_factors: confidenceFactors,
7318
7766
  source_quality_weight: Number(sourceWeight.toFixed(3)),
7319
7767
  source_trust_score: document.sourceScores?.trustScore,
7320
7768
  source_authority_score: document.sourceScores?.authorityScore,
7321
7769
  source_relevance_score: document.sourceScores?.relevanceScore,
7770
+ source_domain_alignment_score: document.sourceScores?.domainAlignmentScore,
7771
+ source_freshness_score: document.sourceScores?.freshnessScore,
7322
7772
  source_duplication_risk: document.sourceScores?.duplicationRisk,
7323
- acceptance_reasons: acceptanceReasons(document, finding, segment2)
7773
+ acceptance_reasons: acceptanceReasons(document, finding, segment2, provenance)
7324
7774
  },
7325
7775
  created_at: createdAt
7326
7776
  };
@@ -7374,42 +7824,102 @@ function sourceQualityWeight(document) {
7374
7824
  const authority = document.sourceScores?.authorityScore ?? 0.55;
7375
7825
  const relevance = document.sourceScores?.relevanceScore ?? 0.55;
7376
7826
  const semantic = document.sourceScores?.semanticScore ?? relevance;
7827
+ const alignment = document.sourceScores?.domainAlignmentScore ?? relevance;
7377
7828
  const duplicateRisk = document.sourceScores?.duplicationRisk ?? 0;
7378
- return clamp013(trust * 0.36 + authority * 0.22 + relevance * 0.22 + semantic * 0.12 + (1 - duplicateRisk) * 0.08);
7829
+ return clamp013(trust * 0.3 + authority * 0.18 + relevance * 0.18 + alignment * 0.18 + semantic * 0.1 + (1 - duplicateRisk) * 0.06);
7379
7830
  }
7380
7831
  function weightedRecordTarget(recordsPerDocument, providerTarget, sourceWeight) {
7381
7832
  const multiplier = sourceWeight >= 0.82 ? 1.45 : sourceWeight >= 0.68 ? 1.15 : sourceWeight >= 0.54 ? 0.85 : 0.45;
7382
7833
  return Math.max(0, Math.min(Math.ceil(recordsPerDocument * 1.7), Math.ceil(providerTarget * multiplier)));
7383
7834
  }
7384
- function trustWeightedConfidence(providerConfidence, baselineConfidence, document, finding, sourceWeight) {
7835
+ function trustWeightedConfidence(providerConfidence, baselineConfidence, document, finding, sourceWeight, corroborationScore = 0) {
7385
7836
  const contradictions = finding?.contradictions.length ?? 0;
7386
7837
  const support = finding?.support.length ?? 1;
7387
7838
  const duplicateRisk = document.sourceScores?.duplicationRisk ?? 0;
7388
- const value = providerConfidence * 0.42 + baselineConfidence * 0.24 + sourceWeight * 0.22 + Math.min(0.08, support * 0.02) - Math.min(0.18, contradictions * 0.045) - duplicateRisk * 0.08;
7839
+ const alignment = document.sourceScores?.domainAlignmentScore ?? document.sourceScores?.relevanceScore ?? 0.55;
7840
+ const value = providerConfidence * 0.34 + baselineConfidence * 0.24 + sourceWeight * 0.18 + alignment * 0.12 + Math.min(0.08, support * 0.02) + corroborationScore * 0.08 - Math.min(0.18, contradictions * 0.045) - duplicateRisk * 0.1;
7389
7841
  return Number(clamp013(value).toFixed(3));
7390
7842
  }
7391
- function acceptanceReasons(document, finding, segment2) {
7843
+ function acceptanceReasons(document, finding, segment2, provenance) {
7392
7844
  const reasons = [
7393
7845
  `source-trust:${document.sourceScores?.trustScore ?? "unknown"}`,
7394
7846
  `source-authority:${document.sourceScores?.authorityScore ?? "unknown"}`,
7395
7847
  `source-relevance:${document.sourceScores?.relevanceScore ?? "unknown"}`,
7396
- `support-count:${finding?.support.length ?? 1}`
7848
+ `source-domain-alignment:${document.sourceScores?.domainAlignmentScore ?? "unknown"}`,
7849
+ `source-freshness:${document.sourceScores?.freshnessScore ?? "unknown"}`,
7850
+ `support-count:${provenance.supportUrls.length}`,
7851
+ `corroboration-score:${provenance.corroborationScore}`
7397
7852
  ];
7398
7853
  if (segment2) reasons.push(`segment:${segment2.id}`);
7854
+ if (provenance.supportDomains.length >= 2) reasons.push("cross-source-corroborated");
7399
7855
  if ((finding?.contradictions.length ?? 0) === 0) reasons.push("no-open-contradictions");
7400
7856
  return reasons;
7401
7857
  }
7402
7858
  function recordAcceptanceScore(record) {
7403
7859
  const trust = numericMetadata2(record, "source_trust_score", 0.6);
7404
7860
  const relevance = numericMetadata2(record, "source_relevance_score", 0.55);
7861
+ const alignment = numericMetadata2(record, "source_domain_alignment_score", relevance);
7405
7862
  const qualityWeight = numericMetadata2(record, "source_quality_weight", 0.6);
7863
+ const corroboration = numericMetadata2(record, "corroboration_score", 0);
7406
7864
  const contradictionCount = numericMetadata2(record, "contradiction_count", 0);
7407
7865
  const citation = record.source_url && record.context.trim().length > 40 ? 1 : 0;
7408
7866
  const base = scoreDatasetRecord(record);
7409
7867
  return clamp013(
7410
- base * 0.38 + record.confidence * 0.18 + trust * 0.14 + relevance * 0.12 + qualityWeight * 0.1 + citation * 0.08 - Math.min(0.2, contradictionCount * 0.04)
7868
+ base * 0.32 + record.confidence * 0.16 + trust * 0.14 + relevance * 0.1 + alignment * 0.1 + qualityWeight * 0.08 + corroboration * 0.08 + citation * 0.02 - Math.min(0.2, contradictionCount * 0.04)
7411
7869
  );
7412
7870
  }
7871
+ function buildRecordProvenance(document, finding, documents = []) {
7872
+ const primaryDomain = domainFromUrl3(document.url);
7873
+ const supportUrls = Array.from(/* @__PURE__ */ new Set([document.url, ...finding?.support ?? []])).slice(0, 8);
7874
+ const supportDomains = Array.from(new Set(supportUrls.map(domainFromUrl3).filter(Boolean)));
7875
+ const supportSources = supportUrls.map((url) => {
7876
+ const matched = documents.find((candidate) => candidate.url === url);
7877
+ return {
7878
+ title: matched?.title ?? (url === document.url ? document.title : domainFromUrl3(url)),
7879
+ url,
7880
+ domain: domainFromUrl3(url)
7881
+ };
7882
+ });
7883
+ const independentDomains = supportDomains.filter((domain) => domain !== primaryDomain).length;
7884
+ const supportCount = Math.max(0, supportUrls.length - 1);
7885
+ const corroborationScore = clamp013(independentDomains * 0.34 + supportCount * 0.08);
7886
+ return {
7887
+ primaryDomain,
7888
+ supportUrls,
7889
+ supportDomains,
7890
+ supportSources,
7891
+ corroborationScore: Number(corroborationScore.toFixed(3))
7892
+ };
7893
+ }
7894
+ function confidenceFactorsForRecord(document, finding, provenance, confidence, sourceWeight) {
7895
+ const authority = document.sourceScores?.authorityScore ?? 0.55;
7896
+ const trust = document.sourceScores?.trustScore ?? 0.62;
7897
+ const relevance = document.sourceScores?.relevanceScore ?? 0.55;
7898
+ const alignment = document.sourceScores?.domainAlignmentScore ?? relevance;
7899
+ const freshness = document.sourceScores?.freshnessScore ?? 0.62;
7900
+ const contradictionPenalty = Math.min(1, (finding?.contradictions.length ?? 0) * 0.22);
7901
+ const retrievalSupport = Math.min(1, provenance.supportUrls.length / 4);
7902
+ return {
7903
+ overall: factor(confidence),
7904
+ sourceAuthority: factor(authority),
7905
+ sourceTrust: factor(trust),
7906
+ domainAlignment: factor(alignment),
7907
+ topicRelevance: factor(relevance),
7908
+ sourceFreshness: factor(freshness),
7909
+ temporalConfidence: factor(freshness * 0.6 + trust * 0.24 + authority * 0.16),
7910
+ corroboration: factor(provenance.corroborationScore),
7911
+ retrievalSupport: factor(retrievalSupport),
7912
+ sourceQualityWeight: factor(sourceWeight),
7913
+ contradictionPenalty: factor(contradictionPenalty)
7914
+ };
7915
+ }
7916
+ function factor(score) {
7917
+ const normalized = clamp013(score);
7918
+ return {
7919
+ score: Number(normalized.toFixed(3)),
7920
+ level: normalized >= 0.86 ? "very_high" : normalized >= 0.68 ? "high" : normalized >= 0.42 ? "medium" : "low"
7921
+ };
7922
+ }
7413
7923
  function numericMetadata2(record, key, fallback) {
7414
7924
  const value = record.metadata?.[key];
7415
7925
  return typeof value === "number" && Number.isFinite(value) ? value : fallback;
@@ -7575,20 +8085,34 @@ function performanceConfig(mode) {
7575
8085
  return {
7576
8086
  candidateMultiplier: 1.35,
7577
8087
  queryCap: 6,
7578
- gateMinimumRatio: 0.28,
7579
- minTrustScore: 0.5,
7580
- minRelevanceScore: 0.28,
8088
+ gateMinimumRatio: 0.35,
8089
+ minTrustScore: 0.54,
8090
+ minRelevanceScore: 0.4,
8091
+ minDomainAlignmentScore: 0.4,
7581
8092
  debateEnabled: false,
7582
8093
  datasetConcurrency: 2
7583
8094
  };
7584
8095
  }
8096
+ if (mode === "strict") {
8097
+ return {
8098
+ candidateMultiplier: 1.6,
8099
+ queryCap: 20,
8100
+ gateMinimumRatio: 0.55,
8101
+ minTrustScore: 0.62,
8102
+ minRelevanceScore: 0.52,
8103
+ minDomainAlignmentScore: 0.56,
8104
+ debateEnabled: true,
8105
+ datasetConcurrency: 1
8106
+ };
8107
+ }
7585
8108
  if (mode === "maximum-quality") {
7586
8109
  return {
7587
8110
  candidateMultiplier: 2.6,
7588
8111
  queryCap: 18,
7589
8112
  gateMinimumRatio: 0.42,
7590
- minTrustScore: 0.42,
7591
- minRelevanceScore: 0.24,
8113
+ minTrustScore: 0.5,
8114
+ minRelevanceScore: 0.36,
8115
+ minDomainAlignmentScore: 0.36,
7592
8116
  debateEnabled: true,
7593
8117
  datasetConcurrency: 1
7594
8118
  };
@@ -7597,24 +8121,28 @@ function performanceConfig(mode) {
7597
8121
  candidateMultiplier: 1.85,
7598
8122
  queryCap: 12,
7599
8123
  gateMinimumRatio: 0.35,
7600
- minTrustScore: 0.44,
7601
- minRelevanceScore: 0.24,
8124
+ minTrustScore: 0.52,
8125
+ minRelevanceScore: 0.38,
8126
+ minDomainAlignmentScore: 0.38,
7602
8127
  debateEnabled: true,
7603
8128
  datasetConcurrency: 1
7604
8129
  };
7605
8130
  }
7606
- function gateSources(sources, mode) {
8131
+ function gateSources(sources, mode, topic) {
7607
8132
  const perf = performanceConfig(mode);
7608
8133
  const concreteSources = sources.filter(isConcreteEvidenceSource);
7609
8134
  const accepted = concreteSources.filter((source) => {
7610
8135
  const trust = source.trustScore ?? source.score;
7611
8136
  const relevance = source.relevanceScore ?? source.score;
8137
+ const semantic = source.semanticScore ?? source.score;
8138
+ const alignment = source.domainAlignmentScore ?? relevance;
7612
8139
  const duplicateRisk = source.duplicationRisk ?? 0;
7613
- const authority = source.authorityScore ?? 0.5;
7614
- return trust >= perf.minTrustScore && relevance >= perf.minRelevanceScore && duplicateRisk < 0.72 && (trust >= 0.52 || authority >= 0.72);
8140
+ return trust >= perf.minTrustScore && relevance >= perf.minRelevanceScore && alignment >= perf.minDomainAlignmentScore && (semantic >= 0.08 || alignment >= 0.68) && duplicateRisk < 0.72 && isTopicAlignedSource(topic, source, mode);
7615
8141
  });
7616
8142
  const minimum = Math.min(concreteSources.length, Math.max(3, Math.ceil(concreteSources.length * perf.gateMinimumRatio)));
7617
- const fallback = accepted.length >= minimum ? accepted : accepted.length ? accepted : concreteSources.filter((source) => (source.authorityScore ?? 0) >= 0.78 && (source.relevanceScore ?? source.score) >= perf.minRelevanceScore).slice(0, minimum);
8143
+ const fallback = accepted.length >= minimum ? accepted : accepted.length ? accepted : concreteSources.filter(
8144
+ (source) => (source.authorityScore ?? 0) >= 0.78 && (source.relevanceScore ?? source.score) >= perf.minRelevanceScore && isTopicAlignedSource(topic, source, mode)
8145
+ ).slice(0, minimum);
7618
8146
  return {
7619
8147
  sources: fallback,
7620
8148
  filtered: Math.max(0, sources.length - fallback.length)
@@ -7625,6 +8153,60 @@ function isConcreteEvidenceSource(source) {
7625
8153
  const provider = (source.provider || source.discoveredBy || "").toLowerCase();
7626
8154
  return provider !== "local-heuristic" && !provider.includes("heuristic");
7627
8155
  }
8156
+ function isTopicAlignedSource(topic, source, mode) {
8157
+ const coreTokens = topicCoreTokens2(topic);
8158
+ if (coreTokens.length < 2) return true;
8159
+ const alignment = source.domainAlignmentScore ?? source.relevanceScore ?? source.score;
8160
+ const relevance = source.relevanceScore ?? source.score;
8161
+ const trust = source.trustScore ?? source.score;
8162
+ const sourceType = source.sourceType ?? "unknown";
8163
+ const provider = (source.provider || source.discoveredBy || "").toLowerCase();
8164
+ const isGithub = provider === "github" || source.domain === "github.com" || source.url.includes("github.com/");
8165
+ const codeTopic = /\b(code|github|repository|developer|api|sdk|package|library|framework|docs|documentation)\b/i.test(topic);
8166
+ const minAlignment = mode === "strict" ? 0.56 : mode === "fast" ? 0.4 : 0.36;
8167
+ if (alignment < minAlignment || relevance < 0.36 || trust < 0.48) return false;
8168
+ if ((isGithub || sourceType === "code") && !codeTopic && (alignment < 0.64 || relevance < 0.56)) return false;
8169
+ if (source.qualitySignals?.includes("broad-source-penalty") && alignment < 0.72) return false;
8170
+ return true;
8171
+ }
8172
+ function topicCoreTokens2(topic) {
8173
+ const stopWords = /* @__PURE__ */ new Set([
8174
+ "data",
8175
+ "dataset",
8176
+ "datasets",
8177
+ "records",
8178
+ "record",
8179
+ "rows",
8180
+ "corpus",
8181
+ "csv",
8182
+ "json",
8183
+ "jsonl",
8184
+ "parquet",
8185
+ "rag",
8186
+ "fine",
8187
+ "tune",
8188
+ "tuning",
8189
+ "training",
8190
+ "ready",
8191
+ "examples",
8192
+ "example",
8193
+ "with",
8194
+ "from",
8195
+ "into",
8196
+ "about",
8197
+ "info",
8198
+ "information",
8199
+ "generate",
8200
+ "generated",
8201
+ "synthetic",
8202
+ "model",
8203
+ "models",
8204
+ "openai",
8205
+ "anthropic",
8206
+ "ai"
8207
+ ]);
8208
+ return topic.toLowerCase().replace(/[^a-z0-9]+/g, " ").split(/\s+/).filter((token) => token.length >= 3 && !stopWords.has(token)).slice(0, 10);
8209
+ }
7628
8210
  function sourceDiversityScore(sources) {
7629
8211
  if (!sources.length) return 0;
7630
8212
  const domains = new Set(sources.map((source) => source.domain || domainFromUrl4(source.url)));
@@ -7668,7 +8250,7 @@ async function generateDataset(options) {
7668
8250
  message: `${sources.length} ranked sources from ${research.providersUsed.join(", ") || "research engine"}`,
7669
8251
  metric: `trust ${Math.round(averageTrust * 100)} / 100`
7670
8252
  });
7671
- const gated = gateSources(sources, performanceMode);
8253
+ const gated = gateSources(sources, performanceMode, options.topic);
7672
8254
  event(options.onEvent, {
7673
8255
  stage: "discovery",
7674
8256
  agent: discovery.name,
@@ -7799,7 +8381,7 @@ async function generateDataset(options) {
7799
8381
  outputDir: `${workspace.datasets}/${datasetId}`,
7800
8382
  formats: exportFormats,
7801
8383
  generationPlan,
7802
- sourceManifest: sources,
8384
+ sourceManifest: gated.sources,
7803
8385
  researchGraph: research.graph,
7804
8386
  generationSummary: {
7805
8387
  contradictionsOpen,
@@ -7812,7 +8394,7 @@ async function generateDataset(options) {
7812
8394
  evaluation,
7813
8395
  qualityMetrics,
7814
8396
  metrics: {
7815
- sourcesDiscovered: sources.length,
8397
+ sourcesDiscovered: gated.sources.length,
7816
8398
  documentsExtracted: extracted.length,
7817
8399
  findingsVerified: debated.length,
7818
8400
  duplicatesRemoved: deduped.removed,
@@ -7829,9 +8411,9 @@ async function generateDataset(options) {
7829
8411
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "data-dictionary.md", renderDataDictionary(manifest)));
7830
8412
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "source-graph.json", `${JSON.stringify(research.graph, null, 2)}
7831
8413
  `));
7832
- artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.json", `${JSON.stringify(sources, null, 2)}
8414
+ artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.json", `${JSON.stringify(gated.sources, null, 2)}
7833
8415
  `));
7834
- artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.md", renderSourcesMarkdown(options.topic, sources)));
8416
+ artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.md", renderSourcesMarkdown(options.topic, gated.sources)));
7835
8417
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "generation-plan.json", `${JSON.stringify(generationPlan, null, 2)}
7836
8418
  `));
7837
8419
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "benchmark-report.json", `${JSON.stringify(evaluation, null, 2)}
@@ -9750,7 +10332,7 @@ Flags:
9750
10332
  --type instruction|rag|qa
9751
10333
  --datasets 1
9752
10334
  --depth shallow|medium|deep
9753
- --mode fast|balanced|maximum-quality
10335
+ --mode fast|balanced|strict|maximum-quality
9754
10336
  --sources 8
9755
10337
  --rows 125
9756
10338
  --workspace ~/Desktop/alys-output
@@ -9914,8 +10496,9 @@ function parseDepth(value) {
9914
10496
  return void 0;
9915
10497
  }
9916
10498
  function parsePerformanceMode(value) {
9917
- if (value === "fast" || value === "balanced" || value === "maximum-quality") return value;
10499
+ if (value === "fast" || value === "balanced" || value === "strict" || value === "maximum-quality") return value;
9918
10500
  if (value === "max" || value === "quality") return "maximum-quality";
10501
+ if (value === "trust" || value === "conservative") return "strict";
9919
10502
  return void 0;
9920
10503
  }
9921
10504
  function isPreparationCommand(command) {
@@ -9997,6 +10580,13 @@ function truncate(value, max = 88) {
9997
10580
  const normalized = value.replace(/\s+/g, " ").trim();
9998
10581
  return normalized.length > max ? `${normalized.slice(0, max - 1)}\u2026` : normalized;
9999
10582
  }
10583
+ function domainFromUrl5(url) {
10584
+ try {
10585
+ return new URL(url).hostname.replace(/^www\./, "");
10586
+ } catch {
10587
+ return url;
10588
+ }
10589
+ }
10000
10590
  function getMetrics(dataset) {
10001
10591
  const metrics = dataset.manifest.metrics;
10002
10592
  return metrics && typeof metrics === "object" ? metrics : {};
@@ -10024,6 +10614,10 @@ function getSourceManifest(dataset) {
10024
10614
  if (!Array.isArray(sources)) return [];
10025
10615
  return sources.filter((source) => Boolean(source) && typeof source === "object").filter((source) => typeof source.title === "string" || typeof source.url === "string");
10026
10616
  }
10617
+ function getResearchGraph(dataset) {
10618
+ const graph = dataset.manifest.researchGraph;
10619
+ return graph && typeof graph === "object" ? graph : {};
10620
+ }
10027
10621
  function printStage(code, status, label, metric) {
10028
10622
  const tint = status === "DONE" || status === "OK" ? "green" : status === "WARN" ? "yellow" : "cyan";
10029
10623
  const prefix = `${paint(`[${code.padEnd(4).slice(0, 4)}]`, "gray")} ${paint(status.padEnd(4), tint)}`;
@@ -10065,11 +10659,43 @@ function previewRecord(dataset) {
10065
10659
  if (!input && !output) return null;
10066
10660
  const metadata = parsed.metadata && typeof parsed.metadata === "object" ? parsed.metadata : {};
10067
10661
  const explanation = Array.isArray(metadata.acceptance_explanation) ? metadata.acceptance_explanation.filter((item) => typeof item === "string") : Array.isArray(metadata.acceptance_reasons) ? metadata.acceptance_reasons.filter((item) => typeof item === "string") : [];
10068
- return { input: truncate(input, 92), output: truncate(output, 120), why: explanation.slice(0, 2).map((item) => truncate(item, 112)) };
10662
+ const supportSources = Array.isArray(metadata.support_sources) ? metadata.support_sources.filter((item) => Boolean(item) && typeof item === "object").map((item) => {
10663
+ const title = typeof item.title === "string" ? item.title : "";
10664
+ const domain = typeof item.domain === "string" ? item.domain : "";
10665
+ return truncate(title || domain || "source", 46);
10666
+ }).filter(Boolean).slice(0, 3) : [];
10667
+ const groundedBy = supportSources.length ? supportSources : Array.isArray(metadata.support_urls) ? metadata.support_urls.filter((item) => typeof item === "string").slice(0, 3).map(domainFromUrl5) : typeof parsed.source_url === "string" ? [domainFromUrl5(parsed.source_url)] : [];
10668
+ const factors = confidenceFactorsLine(metadata.confidence_factors);
10669
+ const cautions = Array.isArray(metadata.contradiction_notes) ? metadata.contradiction_notes.filter((item) => typeof item === "string").slice(0, 2) : [];
10670
+ return {
10671
+ input: truncate(input, 92),
10672
+ output: truncate(output, 120),
10673
+ why: explanation.slice(0, 2).map((item) => truncate(item, 112)),
10674
+ groundedBy,
10675
+ factors,
10676
+ cautions: cautions.map((item) => truncate(item, 112))
10677
+ };
10069
10678
  } catch {
10070
10679
  return null;
10071
10680
  }
10072
10681
  }
10682
+ function confidenceFactorsLine(value) {
10683
+ if (!value || typeof value !== "object") return "";
10684
+ const factors = value;
10685
+ const parts = [
10686
+ factorLabel("authority", factors.sourceAuthority),
10687
+ factorLabel("alignment", factors.domainAlignment),
10688
+ factorLabel("freshness", factors.sourceFreshness),
10689
+ factorLabel("corroboration", factors.corroboration),
10690
+ factorLabel("support", factors.retrievalSupport)
10691
+ ].filter(Boolean);
10692
+ return parts.join(" \xB7 ");
10693
+ }
10694
+ function factorLabel(label, value) {
10695
+ if (!value || typeof value !== "object") return "";
10696
+ const level = value.level;
10697
+ return typeof level === "string" ? `${label} ${level.replace("_", " ")}` : "";
10698
+ }
10073
10699
  function depthMultiplier2(depth) {
10074
10700
  if (depth === "deep") return 1.6;
10075
10701
  if (depth === "shallow") return 0.75;
@@ -10148,8 +10774,13 @@ function printGenerationSummary(response, workspaceRoot) {
10148
10774
  acc.findings += Number(metrics.findingsVerified ?? 0);
10149
10775
  acc.duplicates += Number(metrics.duplicatesRemoved ?? summary.duplicatesRemoved ?? 0);
10150
10776
  const quality = getQualityMetrics(dataset);
10777
+ const graph = getResearchGraph(dataset);
10778
+ const graphMetrics = graph.metrics ?? {};
10151
10779
  acc.contradictions += Number(quality.contradictionResolutionCount ?? 0);
10152
10780
  acc.lowTrustFiltered += Number(quality.lowTrustSourceFilterRate ?? 0);
10781
+ acc.corroborationEdges += Number(graphMetrics.corroborationEdges ?? 0);
10782
+ acc.graphContradictions += Number(graphMetrics.contradictionEdges ?? 0);
10783
+ acc.freshness.push(Number(graphMetrics.averageFreshness ?? 0));
10153
10784
  acc.citationCoverage.push(Number(quality.citationCoverage ?? 0));
10154
10785
  acc.uniqueness.push(Number(quality.recordUniqueness ?? 0));
10155
10786
  acc.relevance.push(Number(quality.relevanceScore ?? 0));
@@ -10169,8 +10800,11 @@ function printGenerationSummary(response, workspaceRoot) {
10169
10800
  findings: 0,
10170
10801
  duplicates: 0,
10171
10802
  contradictions: 0,
10803
+ corroborationEdges: 0,
10804
+ graphContradictions: 0,
10172
10805
  lowTrustFiltered: 0,
10173
10806
  confidences: [],
10807
+ freshness: [],
10174
10808
  citationCoverage: [],
10175
10809
  uniqueness: [],
10176
10810
  relevance: [],
@@ -10188,9 +10822,11 @@ function printGenerationSummary(response, workspaceRoot) {
10188
10822
  const instructionTuning = average5(totals.instructionTuning);
10189
10823
  const factualGrounding = average5(totals.factualGrounding);
10190
10824
  const humanUsefulness = average5(totals.humanUsefulness);
10825
+ const freshness = average5(totals.freshness);
10191
10826
  console.log("");
10192
10827
  console.log(paint("Alys run complete", "green"));
10193
10828
  printStage("SRC", "DONE", "Authoritative sources ranked", formatInt2(totals.sources));
10829
+ printStage("SRC", "DONE", "Source corroboration edges", formatInt2(totals.corroborationEdges));
10194
10830
  printStage("SRC", "DONE", "Low-trust source filter applied", `${Math.round(totals.lowTrustFiltered / Math.max(1, response.datasets.length) * 100)}% avg filtered`);
10195
10831
  printStage("EXT", "DONE", "Source documents normalized", formatInt2(totals.documents));
10196
10832
  printStage("CHK", "DONE", "Findings verified", formatInt2(totals.findings));
@@ -10201,6 +10837,7 @@ function printGenerationSummary(response, workspaceRoot) {
10201
10837
  printStage("EVAL", "DONE", "Citation coverage", formatPercent2(citationCoverage));
10202
10838
  printStage("EVAL", "DONE", "Record uniqueness", formatPercent2(uniqueness));
10203
10839
  printStage("EVAL", "DONE", "Topic relevance", formatPercent2(relevance));
10840
+ printStage("EVAL", "DONE", "Source freshness", formatPercent2(freshness));
10204
10841
  printStage("EVAL", "DONE", "RAG suitability", formatScore(ragSuitability));
10205
10842
  printStage("EVAL", "DONE", "Instruction tuning suitability", formatScore(instructionTuning));
10206
10843
  printStage("EVAL", "DONE", "Factual grounding", formatScore(factualGrounding));
@@ -10217,6 +10854,7 @@ function printGenerationSummary(response, workspaceRoot) {
10217
10854
  const sources = Number(metrics.sourcesDiscovered ?? 0);
10218
10855
  const confidenceValue = Number(metrics.averageConfidence ?? summary.averageConfidence ?? 0);
10219
10856
  const blueprint2 = getBlueprint(dataset);
10857
+ const graphMetrics = getResearchGraph(dataset).metrics ?? {};
10220
10858
  const outputDir = import_node_path5.default.join(root, "datasets", dataset.id);
10221
10859
  console.log(`${paint("\u2022", "yellow")} ${paint(dataset.id, "white")} ${formatInt2(records)} records ${formatInt2(sources)} sources ${formatPercent2(confidenceValue)} confidence`);
10222
10860
  console.log(` ${truncate(dataset.topic, 110)}`);
@@ -10225,6 +10863,7 @@ function printGenerationSummary(response, workspaceRoot) {
10225
10863
  }
10226
10864
  console.log(` ${paint(outputDir, "cyan")}`);
10227
10865
  console.log(` quality ${formatPercent2(Number(quality.citationCoverage ?? 0))} citations \xB7 ${formatPercent2(Number(quality.recordUniqueness ?? 0))} unique \xB7 ${formatPercent2(Number(quality.sourceDiversity ?? 0))} source diversity`);
10866
+ console.log(` graph ${formatInt2(Number(graphMetrics.corroborationEdges ?? 0))} corroborations \xB7 ${formatPercent2(Number(graphMetrics.averageFreshness ?? 0))} freshness \xB7 ${formatInt2(Number(graphMetrics.providerCount ?? 0))} providers`);
10228
10867
  console.log(` suitability RAG ${formatScore(Number(suitability.ragSuitability ?? 0))} \xB7 tuning ${formatScore(Number(suitability.instructionTuning ?? 0))} \xB7 usefulness ${formatScore(Number(suitability.humanUsefulness ?? 0))}`);
10229
10868
  const topSources = getSourceManifest(dataset).slice(0, 5);
10230
10869
  if (topSources.length) {
@@ -10232,9 +10871,11 @@ function printGenerationSummary(response, workspaceRoot) {
10232
10871
  for (const source of topSources) {
10233
10872
  const label = source.title || source.domain || source.provider || "source";
10234
10873
  const trust = Number(source.trustScore ?? source.authorityScore ?? source.relevanceScore ?? 0);
10874
+ const alignment = Number(source.domainAlignmentScore ?? 0);
10235
10875
  const type = source.sourceType ? ` ${source.sourceType}` : "";
10236
10876
  const score = trust > 0 ? ` ${formatPercent2(trust)} trust` : "";
10237
- console.log(` - ${truncate(label, 76)}${paint(`${type}${score}`, "gray")}`);
10877
+ const alignmentLabel = alignment > 0 ? ` ${formatPercent2(alignment)} aligned` : "";
10878
+ console.log(` - ${truncate(label, 76)}${paint(`${type}${score}${alignmentLabel}`, "gray")}`);
10238
10879
  if (source.url) console.log(` ${paint(source.url, "cyan")}`);
10239
10880
  }
10240
10881
  }
@@ -10243,6 +10884,15 @@ function printGenerationSummary(response, workspaceRoot) {
10243
10884
  console.log(paint(" preview", "gray"));
10244
10885
  if (preview.input) console.log(` in ${paint(preview.input, "gray")}`);
10245
10886
  if (preview.output) console.log(` out ${preview.output}`);
10887
+ if (preview.groundedBy.length) {
10888
+ console.log(` grounded by ${paint(preview.groundedBy.join(", "), "gray")}`);
10889
+ }
10890
+ if (preview.factors) {
10891
+ console.log(` confidence ${paint(preview.factors, "gray")}`);
10892
+ }
10893
+ for (const caution of preview.cautions) {
10894
+ console.log(` caution ${paint(caution, "yellow")}`);
10895
+ }
10246
10896
  for (const reason of preview.why) {
10247
10897
  console.log(` why ${paint(reason, "gray")}`);
10248
10898
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "alys-akusa",
3
- "version": "0.1.14",
3
+ "version": "0.1.19",
4
4
  "private": false,
5
5
  "description": "Alys local CLI runtime for autonomous AI data preparation.",
6
6
  "license": "UNLICENSED",