alys-akusa 0.1.14 → 0.1.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +697 -47
- package/package.json +3 -2
package/dist/index.cjs
CHANGED
|
@@ -1525,8 +1525,8 @@ var require_number = __commonJS({
|
|
|
1525
1525
|
var isNumber = /[0-9]/;
|
|
1526
1526
|
var isDef = (any) => any !== void 0;
|
|
1527
1527
|
var round = (number, precision) => {
|
|
1528
|
-
let
|
|
1529
|
-
return Math.round(number *
|
|
1528
|
+
let factor2 = Math.pow(10, precision);
|
|
1529
|
+
return Math.round(number * factor2) / factor2;
|
|
1530
1530
|
};
|
|
1531
1531
|
var NumberPrompt = class extends Prompt {
|
|
1532
1532
|
constructor(opts = {}) {
|
|
@@ -3859,8 +3859,8 @@ var require_number2 = __commonJS({
|
|
|
3859
3859
|
var isNumber = /[0-9]/;
|
|
3860
3860
|
var isDef = (any) => any !== void 0;
|
|
3861
3861
|
var round = (number, precision) => {
|
|
3862
|
-
let
|
|
3863
|
-
return Math.round(number *
|
|
3862
|
+
let factor2 = Math.pow(10, precision);
|
|
3863
|
+
return Math.round(number * factor2) / factor2;
|
|
3864
3864
|
};
|
|
3865
3865
|
var NumberPrompt = class extends Prompt {
|
|
3866
3866
|
constructor(opts = {}) {
|
|
@@ -5098,8 +5098,17 @@ async function discoverResearchSources(topic, options = {}) {
|
|
|
5098
5098
|
const embeddingProvider = options.embeddingProvider ?? createEmbeddingProvider();
|
|
5099
5099
|
const scored = await scoreSearchResults(topic, deduped, embeddingProvider);
|
|
5100
5100
|
const semanticThreshold = options.minSemanticScore ?? (embeddingProvider.name === "local-token-hash" ? 0.08 : 0.18);
|
|
5101
|
-
const
|
|
5102
|
-
|
|
5101
|
+
const topicAligned = scored.filter((source) => passesTopicSourceGate(topic, source, { mode: "balanced" }));
|
|
5102
|
+
if (!topicAligned.length && scored.length) {
|
|
5103
|
+
warnings.push("No search result passed Alys domain-alignment validation.");
|
|
5104
|
+
}
|
|
5105
|
+
const semanticallyFiltered = topicAligned.filter(
|
|
5106
|
+
(source) => (source.semanticScore ?? 0) >= semanticThreshold || (source.domainAlignmentScore ?? 0) >= 0.66
|
|
5107
|
+
);
|
|
5108
|
+
if (!semanticallyFiltered.length && topicAligned.length) {
|
|
5109
|
+
warnings.push("Topic-aligned sources were kept below the semantic threshold for inspection.");
|
|
5110
|
+
}
|
|
5111
|
+
const ranked = (semanticallyFiltered.length ? semanticallyFiltered : topicAligned).sort((a, b) => (b.trustScore ?? b.score) - (a.trustScore ?? a.score)).slice(0, limit);
|
|
5103
5112
|
return {
|
|
5104
5113
|
sources: ranked,
|
|
5105
5114
|
graph: buildResearchGraph(topic, ranked),
|
|
@@ -5110,6 +5119,7 @@ async function discoverResearchSources(topic, options = {}) {
|
|
|
5110
5119
|
}
|
|
5111
5120
|
function createConfiguredSearchProviders() {
|
|
5112
5121
|
const providers = [
|
|
5122
|
+
new CuratedAuthoritySearchProvider(),
|
|
5113
5123
|
new GitHubSearchProvider(env("GITHUB_TOKEN")),
|
|
5114
5124
|
new KaggleSearchProvider(env("KAGGLE_USERNAME"), env("KAGGLE_KEY"))
|
|
5115
5125
|
];
|
|
@@ -5165,7 +5175,8 @@ var GitHubSearchProvider = class {
|
|
|
5165
5175
|
name = "github";
|
|
5166
5176
|
async search(query, options = {}) {
|
|
5167
5177
|
const url = new URL("https://api.github.com/search/repositories");
|
|
5168
|
-
|
|
5178
|
+
const coreQuery = topicCoreTokens(query).slice(0, 4).join(" ") || query;
|
|
5179
|
+
url.searchParams.set("q", `${coreQuery} dataset OR csv OR jsonl OR corpus in:name,description,readme`);
|
|
5169
5180
|
url.searchParams.set("sort", "stars");
|
|
5170
5181
|
url.searchParams.set("order", "desc");
|
|
5171
5182
|
url.searchParams.set("per_page", String(Math.min(20, options.limit ?? 10)));
|
|
@@ -5394,18 +5405,37 @@ var LocalHeuristicSearchProvider = class {
|
|
|
5394
5405
|
});
|
|
5395
5406
|
}
|
|
5396
5407
|
};
|
|
5408
|
+
var CuratedAuthoritySearchProvider = class {
|
|
5409
|
+
name = "curated-authority";
|
|
5410
|
+
async search(query, options = {}) {
|
|
5411
|
+
const limit = Math.max(1, options.limit ?? 10);
|
|
5412
|
+
return authorityProfilesForTopic(query).flatMap((profile) => profile.sources.map((source) => ({ ...source, profileId: profile.id }))).slice(0, limit).map((source) => ({
|
|
5413
|
+
title: source.title,
|
|
5414
|
+
url: source.url,
|
|
5415
|
+
snippet: source.snippet,
|
|
5416
|
+
publishedAt: source.publishedAt,
|
|
5417
|
+
score: source.score,
|
|
5418
|
+
provider: this.name,
|
|
5419
|
+
query
|
|
5420
|
+
}));
|
|
5421
|
+
}
|
|
5422
|
+
};
|
|
5397
5423
|
function buildResearchQueries(topic, count = 5) {
|
|
5398
5424
|
const normalized = topic.trim().replace(/\s+/g, " ");
|
|
5425
|
+
const domainHints = domainSpecificQueryHints(normalized);
|
|
5399
5426
|
const facets = [
|
|
5400
5427
|
normalized,
|
|
5401
|
-
`${normalized}
|
|
5428
|
+
`${normalized} authoritative source technical reference`,
|
|
5429
|
+
`${normalized} dataset kaggle github csv jsonl`,
|
|
5402
5430
|
`${normalized} public dataset csv jsonl parquet`,
|
|
5403
5431
|
`${normalized} official documentation standards methodology`,
|
|
5404
5432
|
`${normalized} research paper benchmark evaluation`,
|
|
5405
5433
|
`${normalized} case study operational data`,
|
|
5406
5434
|
`${normalized} risks failures incidents constraints`,
|
|
5407
5435
|
`${normalized} statistics dataset schema examples`,
|
|
5408
|
-
`${normalized} regulatory guidance technical report
|
|
5436
|
+
`${normalized} regulatory guidance technical report`,
|
|
5437
|
+
`${normalized} filetype:pdf manual report`,
|
|
5438
|
+
...domainHints
|
|
5409
5439
|
];
|
|
5410
5440
|
return Array.from(new Set(facets)).slice(0, Math.max(1, count));
|
|
5411
5441
|
}
|
|
@@ -5418,15 +5448,21 @@ ${result.url}`);
|
|
|
5418
5448
|
const domainCounts = countDomains(results.map((result) => domainFromUrl(result.url)));
|
|
5419
5449
|
return results.map((result, index) => {
|
|
5420
5450
|
const domain = domainFromUrl(result.url);
|
|
5421
|
-
const
|
|
5422
|
-
const
|
|
5451
|
+
const candidateText = `${result.title} ${result.snippet} ${result.url}`;
|
|
5452
|
+
const semanticScore = candidateEmbeddings[index] ? clamp01(cosineSimilarity(queryEmbedding, candidateEmbeddings[index])) : lexicalRelevance(topic, candidateText);
|
|
5453
|
+
const lexicalScore = lexicalRelevance(topic, candidateText);
|
|
5454
|
+
const domainAlignment = domainAlignmentScore(topic, candidateText);
|
|
5455
|
+
const broadPenalty = broadSourcePenalty(topic, result, domainAlignment);
|
|
5456
|
+
const relevanceScore = clamp01(lexicalScore * 0.34 + semanticScore * 0.28 + domainAlignment * 0.38 - broadPenalty * 0.34);
|
|
5423
5457
|
const authority = authorityForDomain(domain, result.url);
|
|
5458
|
+
const authorityProfile = authorityProfileForSource(topic, result, domain);
|
|
5459
|
+
const authorityScore = authorityProfile ? Math.max(authority.score, authorityProfile.authorityScore) : authority.score;
|
|
5424
5460
|
const freshnessScore = freshnessForDate(result.publishedAt);
|
|
5425
5461
|
const duplicationRisk = clamp01(Math.max(0, (domainCounts.get(domain) ?? 1) - 1) * 0.12);
|
|
5426
5462
|
const providerScore = normalizeProviderScore(result.score);
|
|
5427
5463
|
const sourcePreference = sourcePreferenceScore(domain, result.url, result.provider);
|
|
5428
5464
|
const trustScore = clamp01(
|
|
5429
|
-
|
|
5465
|
+
authorityScore * 0.3 + relevanceScore * 0.24 + semanticScore * 0.14 + domainAlignment * 0.16 + freshnessScore * 0.1 + providerScore * 0.07 + sourcePreference * 0.05 + (1 - duplicationRisk) * 0.05 - broadPenalty * 0.26
|
|
5430
5466
|
);
|
|
5431
5467
|
return {
|
|
5432
5468
|
id: sourceId(result.url),
|
|
@@ -5439,15 +5475,20 @@ ${result.url}`);
|
|
|
5439
5475
|
query: result.query,
|
|
5440
5476
|
domain,
|
|
5441
5477
|
publishedAt: result.publishedAt,
|
|
5442
|
-
authorityScore: Number(
|
|
5478
|
+
authorityScore: Number(authorityScore.toFixed(3)),
|
|
5443
5479
|
relevanceScore: Number(relevanceScore.toFixed(3)),
|
|
5444
5480
|
freshnessScore: Number(freshnessScore.toFixed(3)),
|
|
5445
5481
|
duplicationRisk: Number(duplicationRisk.toFixed(3)),
|
|
5446
5482
|
semanticScore: Number(semanticScore.toFixed(3)),
|
|
5483
|
+
domainAlignmentScore: Number(domainAlignment.toFixed(3)),
|
|
5447
5484
|
trustScore: Number(trustScore.toFixed(3)),
|
|
5448
5485
|
sourceType: authority.type,
|
|
5449
5486
|
qualitySignals: [
|
|
5450
5487
|
...authority.signals,
|
|
5488
|
+
...authorityProfile ? [`authority-pack:${authorityProfile.id}`] : [],
|
|
5489
|
+
...domainAlignment >= 0.72 ? ["strong-topic-alignment"] : [],
|
|
5490
|
+
...domainAlignment < 0.34 ? ["weak-topic-alignment"] : [],
|
|
5491
|
+
...broadPenalty >= 0.5 ? ["broad-source-penalty"] : [],
|
|
5451
5492
|
...sourcePreference >= 0.85 ? ["preferred-source-surface"] : [],
|
|
5452
5493
|
...result.provider === "github" ? ["github-repository-search"] : [],
|
|
5453
5494
|
...result.provider === "kaggle" ? ["kaggle-dataset-search"] : []
|
|
@@ -5455,11 +5496,36 @@ ${result.url}`);
|
|
|
5455
5496
|
};
|
|
5456
5497
|
});
|
|
5457
5498
|
}
|
|
5499
|
+
function passesTopicSourceGate(topic, source, options = {}) {
|
|
5500
|
+
const specialized = isSpecializedTopic(topic);
|
|
5501
|
+
const mode = options.mode ?? "balanced";
|
|
5502
|
+
const trust = source.trustScore ?? source.score;
|
|
5503
|
+
const relevance = source.relevanceScore ?? source.score;
|
|
5504
|
+
const semantic = source.semanticScore ?? source.score;
|
|
5505
|
+
const alignment = source.domainAlignmentScore ?? domainAlignmentScore(topic, `${source.title} ${source.snippet} ${source.url}`);
|
|
5506
|
+
const duplicateRisk = source.duplicationRisk ?? 0;
|
|
5507
|
+
const sourceType = source.sourceType ?? "unknown";
|
|
5508
|
+
const provider = (source.provider || source.discoveredBy || "").toLowerCase();
|
|
5509
|
+
const isCodeSource = sourceType === "code" || provider === "github" || domainFromUrl(source.url) === "github.com";
|
|
5510
|
+
const codeTopic = isCodeOrRepositoryTopic(topic);
|
|
5511
|
+
const broadPenalty = source.qualitySignals?.includes("broad-source-penalty") ? 0.7 : 0;
|
|
5512
|
+
const thresholds = mode === "fast" ? { trust: 0.54, relevance: 0.42, semantic: 0.06, alignment: 0.4 } : mode === "strict" ? { trust: 0.62, relevance: 0.52, semantic: 0.12, alignment: 0.56 } : mode === "maximum-quality" ? { trust: 0.5, relevance: 0.38, semantic: 0.08, alignment: 0.36 } : { trust: 0.52, relevance: 0.4, semantic: 0.08, alignment: 0.38 };
|
|
5513
|
+
if (duplicateRisk >= 0.82) return false;
|
|
5514
|
+
if (!specialized) {
|
|
5515
|
+
return trust >= Math.max(0.42, thresholds.trust - 0.08) && relevance >= Math.max(0.28, thresholds.relevance - 0.1);
|
|
5516
|
+
}
|
|
5517
|
+
if (alignment < thresholds.alignment || relevance < thresholds.relevance || trust < thresholds.trust) return false;
|
|
5518
|
+
if (semantic < thresholds.semantic && alignment < 0.68) return false;
|
|
5519
|
+
if (broadPenalty >= 0.5 && alignment < 0.72) return false;
|
|
5520
|
+
if (isCodeSource && !codeTopic && (alignment < 0.64 || relevance < 0.56)) return false;
|
|
5521
|
+
return true;
|
|
5522
|
+
}
|
|
5458
5523
|
function buildResearchGraph(topic, sources) {
|
|
5459
5524
|
const topicId = `topic:${sourceId(topic)}`;
|
|
5460
5525
|
const clusters = buildClusters(sources);
|
|
5461
5526
|
const entities = extractEntities([topic, ...sources.flatMap((source) => [source.title, source.snippet])]).slice(0, 12);
|
|
5462
5527
|
const contradictions = inferContradictions(sources, clusters);
|
|
5528
|
+
const relationshipEdges = buildSourceRelationshipEdges(sources);
|
|
5463
5529
|
return {
|
|
5464
5530
|
topic,
|
|
5465
5531
|
generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -5527,7 +5593,8 @@ function buildResearchGraph(topic, sources) {
|
|
|
5527
5593
|
weight: contradiction.severity === "high" ? 0.9 : 0.55,
|
|
5528
5594
|
evidence: contradiction.reason
|
|
5529
5595
|
}))
|
|
5530
|
-
)
|
|
5596
|
+
),
|
|
5597
|
+
...relationshipEdges
|
|
5531
5598
|
],
|
|
5532
5599
|
clusters,
|
|
5533
5600
|
contradictions,
|
|
@@ -5537,10 +5604,53 @@ function buildResearchGraph(topic, sources) {
|
|
|
5537
5604
|
averageTrust: average(sources.map((source) => source.trustScore ?? source.score)),
|
|
5538
5605
|
averageRelevance: average(sources.map((source) => source.relevanceScore ?? source.score)),
|
|
5539
5606
|
averageAuthority: average(sources.map((source) => source.authorityScore ?? 0.5)),
|
|
5607
|
+
averageFreshness: average(sources.map((source) => source.freshnessScore ?? 0.62)),
|
|
5608
|
+
corroborationEdges: relationshipEdges.filter((edge) => edge.relation === "corroborates").length,
|
|
5609
|
+
contradictionEdges: contradictions.reduce((sum, contradiction) => sum + contradiction.sourceIds.length, 0),
|
|
5540
5610
|
duplicateRisk: average(sources.map((source) => source.duplicationRisk ?? 0))
|
|
5541
5611
|
}
|
|
5542
5612
|
};
|
|
5543
5613
|
}
|
|
5614
|
+
function buildSourceRelationshipEdges(sources) {
|
|
5615
|
+
const edges = [];
|
|
5616
|
+
for (let i = 0; i < sources.length; i++) {
|
|
5617
|
+
for (let j = i + 1; j < sources.length; j++) {
|
|
5618
|
+
const left = sources[i];
|
|
5619
|
+
const right = sources[j];
|
|
5620
|
+
if ((left.domain ?? domainFromUrl(left.url)) === (right.domain ?? domainFromUrl(right.url))) continue;
|
|
5621
|
+
const leftText = `${left.title} ${left.snippet}`;
|
|
5622
|
+
const rightText = `${right.title} ${right.snippet}`;
|
|
5623
|
+
const similarity = jaccardSimilarity(leftText, rightText);
|
|
5624
|
+
const sharedPack = authorityPack(left) && authorityPack(left) === authorityPack(right);
|
|
5625
|
+
const alignment = Math.min(left.domainAlignmentScore ?? left.relevanceScore ?? 0, right.domainAlignmentScore ?? right.relevanceScore ?? 0);
|
|
5626
|
+
const trust = Math.min(left.trustScore ?? left.score, right.trustScore ?? right.score);
|
|
5627
|
+
const shouldLink = (sharedPack || similarity >= 0.14) && alignment >= 0.48 && trust >= 0.58;
|
|
5628
|
+
if (!shouldLink) continue;
|
|
5629
|
+
edges.push({
|
|
5630
|
+
from: `source:${left.id}`,
|
|
5631
|
+
to: `source:${right.id}`,
|
|
5632
|
+
relation: "corroborates",
|
|
5633
|
+
weight: Number(Math.min(1, similarity * 0.48 + alignment * 0.28 + trust * 0.24 + (sharedPack ? 0.16 : 0)).toFixed(3)),
|
|
5634
|
+
evidence: sharedRelationshipEvidence(left, right, sharedPack ? authorityPack(left) ?? void 0 : void 0)
|
|
5635
|
+
});
|
|
5636
|
+
}
|
|
5637
|
+
}
|
|
5638
|
+
return edges.sort((a, b) => b.weight - a.weight).slice(0, 36);
|
|
5639
|
+
}
|
|
5640
|
+
function authorityPack(source) {
|
|
5641
|
+
const signal = source.qualitySignals?.find((item) => item.startsWith("authority-pack:"));
|
|
5642
|
+
return signal ? signal.replace("authority-pack:", "") : null;
|
|
5643
|
+
}
|
|
5644
|
+
function sharedRelationshipEvidence(left, right, pack) {
|
|
5645
|
+
const terms = sharedTerms(`${left.title} ${left.snippet}`, `${right.title} ${right.snippet}`).slice(0, 6);
|
|
5646
|
+
const packText = pack ? `same authority pack (${pack})` : "shared topic evidence";
|
|
5647
|
+
return terms.length ? `${packText}; shared terms: ${terms.join(", ")}` : packText;
|
|
5648
|
+
}
|
|
5649
|
+
function sharedTerms(left, right) {
|
|
5650
|
+
const leftTokens = tokenSet(normalizeForSearch(left));
|
|
5651
|
+
const rightTokens = tokenSet(normalizeForSearch(right));
|
|
5652
|
+
return [...leftTokens].filter((token) => token.length >= 5 && rightTokens.has(token) && !ENTITY_STOP_WORDS.has(token)).slice(0, 12);
|
|
5653
|
+
}
|
|
5544
5654
|
function buildClusters(sources) {
|
|
5545
5655
|
const byType = /* @__PURE__ */ new Map();
|
|
5546
5656
|
for (const source of sources) {
|
|
@@ -5618,6 +5728,9 @@ function resultFromObject(item, provider, query, keys) {
|
|
|
5618
5728
|
raw: item
|
|
5619
5729
|
}];
|
|
5620
5730
|
}
|
|
5731
|
+
function domainSpecificQueryHints(topic) {
|
|
5732
|
+
return authorityProfilesForTopic(topic).flatMap((profile) => profile.queryHints.map((hint) => `${topic} ${hint}`));
|
|
5733
|
+
}
|
|
5621
5734
|
async function fetchJson(url, init = {}, timeoutMs = SEARCH_TIMEOUT_MS) {
|
|
5622
5735
|
const controller = new AbortController();
|
|
5623
5736
|
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
|
@@ -5714,6 +5827,59 @@ function lexicalRelevance(topic, candidate) {
|
|
|
5714
5827
|
const overlap = [...topicTokens].filter((token) => candidateTokens.has(token)).length;
|
|
5715
5828
|
return clamp01(overlap / Math.max(1, topicTokens.size) * 0.85 + jaccardSimilarity(topic, candidate) * 0.15);
|
|
5716
5829
|
}
|
|
5830
|
+
function domainAlignmentScore(topic, candidate) {
|
|
5831
|
+
const coreTokens = topicCoreTokens(topic);
|
|
5832
|
+
if (!coreTokens.length) return lexicalRelevance(topic, candidate);
|
|
5833
|
+
const normalizedCandidate = normalizeForSearch(candidate);
|
|
5834
|
+
const candidateTokens = tokenSet(normalizedCandidate);
|
|
5835
|
+
const tokenHits = coreTokens.filter((token) => candidateTokenMatches(token, normalizedCandidate, candidateTokens));
|
|
5836
|
+
const bigrams = coreTokens.slice(0, -1).map((token, index) => `${token} ${coreTokens[index + 1]}`);
|
|
5837
|
+
const bigramHits = bigrams.filter((bigram) => normalizedCandidate.includes(bigram));
|
|
5838
|
+
const phrase = coreTokens.join(" ");
|
|
5839
|
+
const phraseScore = phrase.length > 4 && normalizedCandidate.includes(phrase) ? 1 : 0;
|
|
5840
|
+
return clamp01(
|
|
5841
|
+
tokenHits.length / Math.max(1, coreTokens.length) * 0.62 + bigramHits.length / Math.max(1, bigrams.length || 1) * 0.26 + phraseScore * 0.12
|
|
5842
|
+
);
|
|
5843
|
+
}
|
|
5844
|
+
function topicCoreTokens(topic) {
|
|
5845
|
+
return normalizeForSearch(topic).split(/\s+/).filter((token) => token.length >= 3 && !TOPIC_STOP_WORDS.has(token)).slice(0, 10);
|
|
5846
|
+
}
|
|
5847
|
+
function candidateTokenMatches(token, normalizedCandidate, candidateTokens) {
|
|
5848
|
+
if (candidateTokens.has(token) || normalizedCandidate.includes(token)) return true;
|
|
5849
|
+
const synonyms = TOPIC_SYNONYMS[token] ?? [];
|
|
5850
|
+
return synonyms.some((synonym) => normalizedCandidate.includes(synonym));
|
|
5851
|
+
}
|
|
5852
|
+
function isSpecializedTopic(topic) {
|
|
5853
|
+
const coreTokens = topicCoreTokens(topic);
|
|
5854
|
+
return coreTokens.length >= 2;
|
|
5855
|
+
}
|
|
5856
|
+
function isCodeOrRepositoryTopic(topic) {
|
|
5857
|
+
const normalized = normalizeForSearch(topic);
|
|
5858
|
+
return CODE_TOPIC_TERMS.some((term) => normalized.includes(term));
|
|
5859
|
+
}
|
|
5860
|
+
function broadSourcePenalty(topic, result, alignment) {
|
|
5861
|
+
if (!isSpecializedTopic(topic) || alignment >= 0.62) return 0;
|
|
5862
|
+
const text = normalizeForSearch(`${result.title} ${result.snippet} ${result.url}`);
|
|
5863
|
+
const broadHits = BROAD_SOURCE_TERMS.filter((term) => text.includes(term)).length;
|
|
5864
|
+
const providerPenalty = result.provider === "github" ? 0.16 : 0;
|
|
5865
|
+
return clamp01(broadHits * 0.14 + providerPenalty);
|
|
5866
|
+
}
|
|
5867
|
+
function normalizeForSearch(value) {
|
|
5868
|
+
return value.toLowerCase().replace(/[^a-z0-9]+/g, " ").replace(/\s+/g, " ").trim();
|
|
5869
|
+
}
|
|
5870
|
+
function authorityProfilesForTopic(topic) {
|
|
5871
|
+
const normalized = normalizeForSearch(topic);
|
|
5872
|
+
return AUTHORITY_PROFILES.filter((profile) => profile.match.some((term) => normalized.includes(term)));
|
|
5873
|
+
}
|
|
5874
|
+
function authorityProfileForSource(topic, result, domain) {
|
|
5875
|
+
const normalized = normalizeForSearch(`${result.title} ${result.snippet} ${result.url}`);
|
|
5876
|
+
return authorityProfilesForTopic(topic).find((profile) => {
|
|
5877
|
+
const domainMatch = profile.authorityDomains.some((authorityDomain) => domain === authorityDomain || domain.endsWith(`.${authorityDomain}`));
|
|
5878
|
+
const sourceMatch = profile.sources.some((source) => canonicalizeUrl(source.url) === canonicalizeUrl(result.url));
|
|
5879
|
+
const topicMatch = profile.match.some((term) => normalized.includes(term));
|
|
5880
|
+
return (domainMatch || sourceMatch) && topicMatch;
|
|
5881
|
+
}) ?? null;
|
|
5882
|
+
}
|
|
5717
5883
|
function normalizeProviderScore(score) {
|
|
5718
5884
|
if (typeof score !== "number" || Number.isNaN(score)) return 0.55;
|
|
5719
5885
|
if (score <= 1) return clamp01(score);
|
|
@@ -5803,6 +5969,196 @@ var LOCAL_SOURCE_BASES = [
|
|
|
5803
5969
|
{ label: "Google Scholar", url: "https://scholar.google.com/scholar", querySuffix: "q", signal: "Academic source discovery surface." },
|
|
5804
5970
|
{ label: "OpenAlex", url: "https://openalex.org/search", querySuffix: "q", signal: "Open scholarly metadata and research graph candidates." }
|
|
5805
5971
|
];
|
|
5972
|
+
var AUTHORITY_PROFILES = [
|
|
5973
|
+
{
|
|
5974
|
+
id: "oil-gas",
|
|
5975
|
+
label: "Oil & Gas",
|
|
5976
|
+
match: ["oil", "gas", "pipeline", "pipelines", "drilling", "well", "reservoir", "scada", "pump", "hazardous liquid", "petroleum"],
|
|
5977
|
+
authorityDomains: ["phmsa.dot.gov", "npms.phmsa.dot.gov", "bts.gov", "ntsb.gov", "spe.org", "api.org"],
|
|
5978
|
+
queryHints: [
|
|
5979
|
+
"PHMSA pipeline data report",
|
|
5980
|
+
"SCADA pressure flow telemetry",
|
|
5981
|
+
"API 1160 integrity management",
|
|
5982
|
+
"pump station operations manual",
|
|
5983
|
+
"incident report technical dataset"
|
|
5984
|
+
],
|
|
5985
|
+
authorityScore: 0.94,
|
|
5986
|
+
sources: [
|
|
5987
|
+
{
|
|
5988
|
+
title: "PHMSA Pipeline Incident 20 Year Trends",
|
|
5989
|
+
url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/pipeline-incident-20-year-trends",
|
|
5990
|
+
snippet: "Official PHMSA incident trend data for gas distribution, gas gathering, gas transmission, LNG, underground storage, and hazardous liquid pipeline systems, including operator-submitted incident records and flagged files.",
|
|
5991
|
+
score: 0.96
|
|
5992
|
+
},
|
|
5993
|
+
{
|
|
5994
|
+
title: "PHMSA Pipeline Source Data",
|
|
5995
|
+
url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/source-data",
|
|
5996
|
+
snippet: "Official PHMSA source data covering pipeline annual reports, incident reports, safety-related condition reports, integrity assurance notifications, gas systems, LNG, and hazardous liquid operators.",
|
|
5997
|
+
score: 0.95
|
|
5998
|
+
},
|
|
5999
|
+
{
|
|
6000
|
+
title: "PHMSA National Pipeline Performance Measures",
|
|
6001
|
+
url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/national-pipeline-performance-measures",
|
|
6002
|
+
snippet: "National pipeline performance measures for pipeline safety, integrity management, incident categories, serious incidents, significant incidents, all reported incidents, and pipeline infrastructure performance.",
|
|
6003
|
+
score: 0.93
|
|
6004
|
+
},
|
|
6005
|
+
{
|
|
6006
|
+
title: "PHMSA Pipeline Safety Data Report Index",
|
|
6007
|
+
url: "https://www.phmsa.dot.gov/data-and-statistics/pipeline/pipeline-safety-data-report-index",
|
|
6008
|
+
snippet: "PHMSA index of pipeline safety datasets, annual report summaries, integrity management performance, incident cause/type metrics, excavation damage, pipeline mileage, facilities, and technical resources.",
|
|
6009
|
+
score: 0.93
|
|
6010
|
+
},
|
|
6011
|
+
{
|
|
6012
|
+
title: "National Pipeline Mapping System Pipeline Data",
|
|
6013
|
+
url: "https://www.npms.phmsa.dot.gov/PipelineData.aspx",
|
|
6014
|
+
snippet: "National Pipeline Mapping System data for gas transmission pipelines, hazardous liquid pipelines, LNG plants, breakout tanks, operator submissions, pipeline mapping, and integrity-management context.",
|
|
6015
|
+
score: 0.9
|
|
6016
|
+
},
|
|
6017
|
+
{
|
|
6018
|
+
title: "Bureau of Transportation Statistics Pipeline Safety and Property Damage Data",
|
|
6019
|
+
url: "https://www.bts.gov/content/hazardous-liquid-and-natural-gas-pipeline-safety-and-property-damage-data",
|
|
6020
|
+
snippet: "BTS table for hazardous liquid and natural gas pipeline safety and property damage data sourced from PHMSA pipeline incident statistics and transportation safety records.",
|
|
6021
|
+
score: 0.88
|
|
6022
|
+
},
|
|
6023
|
+
{
|
|
6024
|
+
title: "NTSB Pipeline Investigation Reports",
|
|
6025
|
+
url: "https://www.ntsb.gov/investigations/AccidentReports/Pages/Reports.aspx",
|
|
6026
|
+
snippet: "National Transportation Safety Board investigation reports, including pipeline accident reports, factual records, emergency response findings, integrity management evidence, and safety recommendations.",
|
|
6027
|
+
score: 0.86
|
|
6028
|
+
}
|
|
6029
|
+
]
|
|
6030
|
+
},
|
|
6031
|
+
{
|
|
6032
|
+
id: "healthcare",
|
|
6033
|
+
label: "Healthcare",
|
|
6034
|
+
match: ["medical", "clinical", "health", "healthcare", "patient", "biomedical", "drug", "disease", "diagnosis", "treatment"],
|
|
6035
|
+
authorityDomains: ["nih.gov", "nlm.nih.gov", "pubmed.ncbi.nlm.nih.gov", "clinicaltrials.gov", "fda.gov", "open.fda.gov", "cdc.gov"],
|
|
6036
|
+
queryHints: ["PubMed clinical guideline", "NIH medical terminology", "FDA open data", "CDC public health dataset"],
|
|
6037
|
+
authorityScore: 0.95,
|
|
6038
|
+
sources: [
|
|
6039
|
+
{
|
|
6040
|
+
title: "PubMed Biomedical Literature",
|
|
6041
|
+
url: "https://pubmed.ncbi.nlm.nih.gov/",
|
|
6042
|
+
snippet: "National Library of Medicine search surface for biomedical literature, clinical studies, medical terminology, and peer-reviewed evidence.",
|
|
6043
|
+
score: 0.95
|
|
6044
|
+
},
|
|
6045
|
+
{
|
|
6046
|
+
title: "ClinicalTrials.gov Data API",
|
|
6047
|
+
url: "https://clinicaltrials.gov/data-api/about-api",
|
|
6048
|
+
snippet: "Official ClinicalTrials.gov API and data access for clinical study metadata, interventions, conditions, sponsors, and study outcomes.",
|
|
6049
|
+
score: 0.92
|
|
6050
|
+
},
|
|
6051
|
+
{
|
|
6052
|
+
title: "openFDA APIs",
|
|
6053
|
+
url: "https://open.fda.gov/apis/",
|
|
6054
|
+
snippet: "Official FDA open data APIs for drugs, devices, foods, tobacco, and enforcement datasets.",
|
|
6055
|
+
score: 0.92
|
|
6056
|
+
},
|
|
6057
|
+
{
|
|
6058
|
+
title: "CDC Data Catalog",
|
|
6059
|
+
url: "https://data.cdc.gov/",
|
|
6060
|
+
snippet: "Official CDC public health datasets for surveillance, epidemiology, facilities, disease reporting, and health indicators.",
|
|
6061
|
+
score: 0.9
|
|
6062
|
+
}
|
|
6063
|
+
]
|
|
6064
|
+
},
|
|
6065
|
+
{
|
|
6066
|
+
id: "legal",
|
|
6067
|
+
label: "Legal & Compliance",
|
|
6068
|
+
match: ["legal", "law", "compliance", "policy", "regulation", "regulatory", "contract", "privacy", "statute", "court"],
|
|
6069
|
+
authorityDomains: ["govinfo.gov", "law.cornell.edu", "courtlistener.com", "federalregister.gov", "sec.gov", "justice.gov"],
|
|
6070
|
+
queryHints: ["official regulation guidance", "statute case law corpus", "court opinion dataset", "federal register rule"],
|
|
6071
|
+
authorityScore: 0.92,
|
|
6072
|
+
sources: [
|
|
6073
|
+
{
|
|
6074
|
+
title: "GovInfo",
|
|
6075
|
+
url: "https://www.govinfo.gov/",
|
|
6076
|
+
snippet: "Official U.S. Government Publishing Office access to federal statutes, regulations, congressional documents, and official government publications.",
|
|
6077
|
+
score: 0.92
|
|
6078
|
+
},
|
|
6079
|
+
{
|
|
6080
|
+
title: "Cornell Legal Information Institute",
|
|
6081
|
+
url: "https://www.law.cornell.edu/",
|
|
6082
|
+
snippet: "Legal Information Institute access to U.S. Code, CFR, Supreme Court opinions, Wex legal dictionary, and legal reference material.",
|
|
6083
|
+
score: 0.86
|
|
6084
|
+
},
|
|
6085
|
+
{
|
|
6086
|
+
title: "CourtListener",
|
|
6087
|
+
url: "https://www.courtlistener.com/",
|
|
6088
|
+
snippet: "Public legal database for court opinions, dockets, judges, citations, and legal research datasets.",
|
|
6089
|
+
score: 0.86
|
|
6090
|
+
},
|
|
6091
|
+
{
|
|
6092
|
+
title: "Federal Register",
|
|
6093
|
+
url: "https://www.federalregister.gov/",
|
|
6094
|
+
snippet: "Official daily publication for U.S. federal rules, proposed rules, notices, executive orders, and regulatory actions.",
|
|
6095
|
+
score: 0.9
|
|
6096
|
+
}
|
|
6097
|
+
]
|
|
6098
|
+
},
|
|
6099
|
+
{
|
|
6100
|
+
id: "finance",
|
|
6101
|
+
label: "Finance",
|
|
6102
|
+
match: ["finance", "financial", "banking", "market", "markets", "sec", "filing", "risk", "credit", "macroeconomic", "economic"],
|
|
6103
|
+
authorityDomains: ["sec.gov", "fred.stlouisfed.org", "federalreserve.gov", "consumerfinance.gov", "treasury.gov"],
|
|
6104
|
+
queryHints: ["SEC EDGAR filing data", "FRED economic dataset", "Federal Reserve data", "CFPB complaint database"],
|
|
6105
|
+
authorityScore: 0.93,
|
|
6106
|
+
sources: [
|
|
6107
|
+
{
|
|
6108
|
+
title: "SEC EDGAR",
|
|
6109
|
+
url: "https://www.sec.gov/edgar",
|
|
6110
|
+
snippet: "Official SEC EDGAR company filings, disclosures, financial statements, risk factors, and market regulatory documents.",
|
|
6111
|
+
score: 0.94
|
|
6112
|
+
},
|
|
6113
|
+
{
|
|
6114
|
+
title: "FRED Economic Data",
|
|
6115
|
+
url: "https://fred.stlouisfed.org/",
|
|
6116
|
+
snippet: "Federal Reserve Bank of St. Louis economic time series, macroeconomic indicators, rates, labor, inflation, and financial data.",
|
|
6117
|
+
score: 0.92
|
|
6118
|
+
},
|
|
6119
|
+
{
|
|
6120
|
+
title: "Federal Reserve Data",
|
|
6121
|
+
url: "https://www.federalreserve.gov/data.htm",
|
|
6122
|
+
snippet: "Official Federal Reserve data releases, banking data, monetary policy data, financial accounts, and regulatory reports.",
|
|
6123
|
+
score: 0.91
|
|
6124
|
+
},
|
|
6125
|
+
{
|
|
6126
|
+
title: "CFPB Consumer Complaint Database",
|
|
6127
|
+
url: "https://www.consumerfinance.gov/data-research/consumer-complaints/",
|
|
6128
|
+
snippet: "Consumer Financial Protection Bureau complaint database covering financial products, institutions, issues, responses, and trends.",
|
|
6129
|
+
score: 0.88
|
|
6130
|
+
}
|
|
6131
|
+
]
|
|
6132
|
+
},
|
|
6133
|
+
{
|
|
6134
|
+
id: "developer-docs",
|
|
6135
|
+
label: "Developer Documentation",
|
|
6136
|
+
match: ["developer", "developers", "api", "sdk", "code", "repository", "github", "documentation", "docs", "package", "library", "framework"],
|
|
6137
|
+
authorityDomains: ["docs.github.com", "developer.mozilla.org", "npmjs.com", "nodejs.org", "typescriptlang.org"],
|
|
6138
|
+
queryHints: ["official API reference", "developer documentation examples", "SDK guide", "GitHub repository docs"],
|
|
6139
|
+
authorityScore: 0.88,
|
|
6140
|
+
sources: [
|
|
6141
|
+
{
|
|
6142
|
+
title: "GitHub Docs",
|
|
6143
|
+
url: "https://docs.github.com/",
|
|
6144
|
+
snippet: "Official GitHub documentation for repositories, Actions, APIs, packages, security, and developer workflows.",
|
|
6145
|
+
score: 0.88
|
|
6146
|
+
},
|
|
6147
|
+
{
|
|
6148
|
+
title: "MDN Web Docs",
|
|
6149
|
+
url: "https://developer.mozilla.org/",
|
|
6150
|
+
snippet: "Mozilla Developer Network reference for web platform APIs, JavaScript, HTML, CSS, browser behavior, and examples.",
|
|
6151
|
+
score: 0.88
|
|
6152
|
+
},
|
|
6153
|
+
{
|
|
6154
|
+
title: "npm Docs",
|
|
6155
|
+
url: "https://docs.npmjs.com/",
|
|
6156
|
+
snippet: "Official npm documentation for packages, publishing, package.json, CLI usage, registry behavior, and access control.",
|
|
6157
|
+
score: 0.84
|
|
6158
|
+
}
|
|
6159
|
+
]
|
|
6160
|
+
}
|
|
6161
|
+
];
|
|
5806
6162
|
var ENTITY_STOP_WORDS = /* @__PURE__ */ new Set([
|
|
5807
6163
|
"about",
|
|
5808
6164
|
"source",
|
|
@@ -5820,6 +6176,79 @@ var ENTITY_STOP_WORDS = /* @__PURE__ */ new Set([
|
|
|
5820
6176
|
"example",
|
|
5821
6177
|
"examples"
|
|
5822
6178
|
]);
|
|
6179
|
+
// Stop-word set for topics: every entry of ENTITY_STOP_WORDS followed by the
// extra terms below (duplicates collapse because this is a Set).
// NOTE(review): consumers are outside this view — confirm where it is read.
var TOPIC_STOP_WORDS = /* @__PURE__ */ new Set([
  ...ENTITY_STOP_WORDS,
  // dataset and file-format words
  "data", "records", "record", "row", "rows", "corpus",
  "csv", "json", "jsonl", "parquet",
  // training / retrieval workflow words
  "rag", "fine", "tune", "tuning", "training", "ready",
  "examples", "example",
  // connective and filler words
  "with", "from", "into", "about", "info", "information",
  // generation and vendor words
  "generate", "generated", "synthetic", "model", "models",
  "openai", "anthropic", "ai"
]);
|
|
6214
|
+
// Code/developer vocabulary: tooling words followed by three language names.
// NOTE(review): consumers are not visible in this chunk — presumably used to
// detect code-oriented topics; confirm against the callers.
var CODE_TOPIC_TERMS = [
  "code", "github", "repository", "repositories", "developer",
  "api", "sdk", "package", "library", "framework",
  "docs", "documentation",
  "typescript", "python", "javascript"
];
|
|
6231
|
+
// Words and phrases (some multi-word) listed as "broad source" markers.
// NOTE(review): how they are matched is outside this view — presumably they
// feed the "broad-source-penalty" quality signal; confirm.
var BROAD_SOURCE_TERMS = [
  "awesome", "tutorial", "course",
  "applied ml", "machine learning", "deep learning", "data science",
  "examples", "notebook", "collection", "curated list", "roadmap"
];
|
|
6245
|
+
// Maps a single topic word to related phrases for that word.
// NOTE(review): lookup logic is outside this view — confirm how keys are matched.
var TOPIC_SYNONYMS = {
  oil: [
    "hazardous liquid",
    "petroleum",
    "crude",
    "liquid pipeline",
    "liquid pipelines"
  ],
  gas: [
    "natural gas",
    "lng",
    "gas transmission",
    "gas distribution"
  ],
  pipeline: [
    "pipelines",
    "transmission line",
    "hazardous liquid"
  ],
  drilling: [
    "wellbore",
    "rig",
    "bha",
    "mwd",
    "lwd"
  ],
  well: [
    "wellbore",
    "reservoir",
    "completion"
  ]
};
|
|
5823
6252
|
|
|
5824
6253
|
// ../../packages/crawler/src/index.ts
|
|
5825
6254
|
async function crawlSource(source) {
|
|
@@ -6015,6 +6444,7 @@ function explainRecordAcceptance(record, topic) {
|
|
|
6015
6444
|
const trust = numericMetadata(record, "source_trust_score", 0);
|
|
6016
6445
|
const authority = numericMetadata(record, "source_authority_score", 0);
|
|
6017
6446
|
const relevance = numericMetadata(record, "source_relevance_score", 0);
|
|
6447
|
+
const alignment = numericMetadata(record, "source_domain_alignment_score", 0);
|
|
6018
6448
|
const supportCount = numericMetadata(record, "support_count", 0);
|
|
6019
6449
|
const contradictions = numericMetadata(record, "contradiction_count", 0);
|
|
6020
6450
|
const citationCoverage = citationCoverageScore(record);
|
|
@@ -6024,7 +6454,7 @@ function explainRecordAcceptance(record, topic) {
|
|
|
6024
6454
|
else if (trust >= 0.5) reasons.push("Moderate source trust accepted with supporting quality signals.");
|
|
6025
6455
|
else reasons.push("Source trust is weak; confidence should remain cautious.");
|
|
6026
6456
|
if (authority >= 0.72) reasons.push("Source authority is strong enough to raise confidence.");
|
|
6027
|
-
if (relevance >= 0.65 || relevanceScore >= 0.65) reasons.push("Record is semantically aligned with the requested dataset topic.");
|
|
6457
|
+
if (alignment >= 0.65 || relevance >= 0.65 || relevanceScore >= 0.65) reasons.push("Record is semantically aligned with the requested dataset topic.");
|
|
6028
6458
|
if (supportCount >= 2) reasons.push("Independent support was detected across sources.");
|
|
6029
6459
|
if (contradictions === 0) reasons.push("No open contradiction penalty was applied.");
|
|
6030
6460
|
if (citationCoverage >= 0.9) reasons.push("Record preserves source URL, source title, context, and confidence.");
|
|
@@ -6073,7 +6503,11 @@ function citationCoverageScore(record) {
|
|
|
6073
6503
|
}
|
|
6074
6504
|
/**
 * Blend stored source scores with lexical overlap into one relevance score.
 *
 * - No finite `source_relevance_score`: lexical overlap only.
 * - Relevance but no finite `source_domain_alignment_score`: 0.55 / 0.45 blend.
 * - Both present: 0.34 relevance + 0.36 alignment + 0.30 lexical.
 *
 * @param record Record with `input`, `output`, `context` and metadata.
 * @param topic Topic text compared against the record text.
 * @returns The blended value passed through `round01`.
 */
function relevanceScoreForRecord(record, topic) {
  const sourceRelevance = numericMetadata(record, "source_relevance_score", NaN);
  const sourceAlignment = numericMetadata(record, "source_domain_alignment_score", NaN);
  const recordText = `${record.input} ${record.output} ${record.context}`;
  // Jaccard similarity is scaled by 2.6, then capped at 1.
  const lexicalOverlap = Math.min(1, jaccardSimilarity(topic, recordText) * 2.6);
  if (!Number.isFinite(sourceRelevance)) {
    return round01(lexicalOverlap);
  }
  if (!Number.isFinite(sourceAlignment)) {
    return round01(sourceRelevance * 0.55 + lexicalOverlap * 0.45);
  }
  return round01(sourceRelevance * 0.34 + sourceAlignment * 0.36 + lexicalOverlap * 0.3);
}
|
|
6079
6513
|
function estimateSourceDiversity(records) {
|
|
@@ -6256,6 +6690,7 @@ Source: ${result.source.url}
|
|
|
6256
6690
|
freshnessScore: result.source.freshnessScore,
|
|
6257
6691
|
duplicationRisk: result.source.duplicationRisk,
|
|
6258
6692
|
semanticScore: result.source.semanticScore,
|
|
6693
|
+
domainAlignmentScore: result.source.domainAlignmentScore,
|
|
6259
6694
|
trustScore: result.source.trustScore
|
|
6260
6695
|
}
|
|
6261
6696
|
};
|
|
@@ -6735,7 +7170,7 @@ function getGroqKey() {
|
|
|
6735
7170
|
function providerPrompt(options) {
|
|
6736
7171
|
const { topic, datasetType, document, targetCount } = options;
|
|
6737
7172
|
const sourceText = document.text.replace(/\s+/g, " ").trim().slice(0, 7e3);
|
|
6738
|
-
const sourceScores = document.sourceScores ? `Trust: ${document.sourceScores.trustScore ?? "unknown"}; Authority: ${document.sourceScores.authorityScore ?? "unknown"}; Relevance: ${document.sourceScores.relevanceScore ?? "unknown"}; Freshness: ${document.sourceScores.freshnessScore ?? "unknown"}; Duplication risk: ${document.sourceScores.duplicationRisk ?? "unknown"}` : "Source quality scores unavailable.";
|
|
7173
|
+
const sourceScores = document.sourceScores ? `Trust: ${document.sourceScores.trustScore ?? "unknown"}; Authority: ${document.sourceScores.authorityScore ?? "unknown"}; Relevance: ${document.sourceScores.relevanceScore ?? "unknown"}; Domain alignment: ${document.sourceScores.domainAlignmentScore ?? "unknown"}; Freshness: ${document.sourceScores.freshnessScore ?? "unknown"}; Duplication risk: ${document.sourceScores.duplicationRisk ?? "unknown"}` : "Source quality scores unavailable.";
|
|
6739
7174
|
const mode = datasetType === "instruction" ? "Instruction tuning: input is a task/instruction, output is the ideal answer." : datasetType === "rag" ? "RAG: input is a realistic user query, output is a compact answer, context is retrieval-ready evidence text." : "QA: input is a question, output is a grounded answer.";
|
|
6740
7175
|
const segmentBlock = options.segment ? `
|
|
6741
7176
|
Active topic segment:
|
|
@@ -7263,16 +7698,19 @@ var StructuringAgent = class {
|
|
|
7263
7698
|
const concurrency = Math.max(1, Math.min(documents.length, Number(process.env.ALYS_PROVIDER_CONCURRENCY ?? 3)));
|
|
7264
7699
|
let totalGenerated = 0;
|
|
7265
7700
|
const grouped = await mapLimit2(documents, concurrency, async (document, index) => {
|
|
7266
|
-
const finding = findings[index] ?? findings[0];
|
|
7701
|
+
const finding = findings.find((item) => item.id === document.sourceId) ?? findings[index] ?? findings[0];
|
|
7267
7702
|
const trustScore = document.sourceScores?.trustScore ?? 0.62;
|
|
7268
7703
|
const authorityScore = document.sourceScores?.authorityScore ?? 0.55;
|
|
7269
7704
|
const relevanceScore = document.sourceScores?.relevanceScore ?? 0.55;
|
|
7705
|
+
const alignmentScore = document.sourceScores?.domainAlignmentScore ?? relevanceScore;
|
|
7270
7706
|
const duplicationRisk = document.sourceScores?.duplicationRisk ?? 0;
|
|
7271
7707
|
if (trustScore < (options.minTrustScore ?? 0.42) || relevanceScore < (options.minRelevanceScore ?? 0.24)) return [];
|
|
7272
7708
|
const sourceWeight = sourceQualityWeight(document);
|
|
7273
7709
|
const segment2 = options.generationPlan ? segmentForSource(options.generationPlan, `${document.title} ${document.text}`, index) : void 0;
|
|
7274
7710
|
const blueprint2 = options.generationPlan?.blueprint;
|
|
7275
|
-
const baselineConfidence = clamp013(
|
|
7711
|
+
const baselineConfidence = clamp013(
|
|
7712
|
+
(finding?.confidence ?? 0.62) * 0.42 + trustScore * 0.2 + authorityScore * 0.1 + relevanceScore * 0.12 + alignmentScore * 0.12 - duplicationRisk * 0.12
|
|
7713
|
+
);
|
|
7276
7714
|
const baseId = import_node_crypto3.default.createHash("sha1").update(`${topic}:${document.url}:${datasetType}`).digest("hex").slice(0, 14);
|
|
7277
7715
|
const providerTarget = useProvider ? weightedRecordTarget(recordsPerDocument, options.providerRecordsPerDocument ?? recordsPerDocument, sourceWeight) : 0;
|
|
7278
7716
|
let providerResult = null;
|
|
@@ -7289,8 +7727,10 @@ var StructuringAgent = class {
|
|
|
7289
7727
|
});
|
|
7290
7728
|
}
|
|
7291
7729
|
if (providerResult?.records.length) {
|
|
7730
|
+
const provenance = buildRecordProvenance(document, finding, documents);
|
|
7292
7731
|
const mapped = providerResult.records.map((g, variantIndex) => {
|
|
7293
|
-
const adjustedConfidence = trustWeightedConfidence(g.confidence, baselineConfidence, document, finding, sourceWeight);
|
|
7732
|
+
const adjustedConfidence = trustWeightedConfidence(g.confidence, baselineConfidence, document, finding, sourceWeight, provenance.corroborationScore);
|
|
7733
|
+
const confidenceFactors = confidenceFactorsForRecord(document, finding, provenance, adjustedConfidence, sourceWeight);
|
|
7294
7734
|
return {
|
|
7295
7735
|
id: `${baseId}-${variantIndex}`,
|
|
7296
7736
|
input: g.input,
|
|
@@ -7313,14 +7753,24 @@ var StructuringAgent = class {
|
|
|
7313
7753
|
provider: providerResult.provider,
|
|
7314
7754
|
model: providerResult.model,
|
|
7315
7755
|
latency_ms: providerResult.latencyMs,
|
|
7316
|
-
support_count:
|
|
7756
|
+
support_count: provenance.supportUrls.length,
|
|
7757
|
+
support_urls: provenance.supportUrls,
|
|
7758
|
+
support_domains: provenance.supportDomains,
|
|
7759
|
+
support_sources: provenance.supportSources,
|
|
7760
|
+
primary_source_domain: provenance.primaryDomain,
|
|
7761
|
+
corroboration_score: provenance.corroborationScore,
|
|
7317
7762
|
contradiction_count: finding?.contradictions.length ?? 0,
|
|
7763
|
+
contradiction_notes: finding?.contradictions ?? [],
|
|
7764
|
+
contradiction_status: (finding?.contradictions.length ?? 0) > 0 ? "needs_review" : "clear",
|
|
7765
|
+
confidence_factors: confidenceFactors,
|
|
7318
7766
|
source_quality_weight: Number(sourceWeight.toFixed(3)),
|
|
7319
7767
|
source_trust_score: document.sourceScores?.trustScore,
|
|
7320
7768
|
source_authority_score: document.sourceScores?.authorityScore,
|
|
7321
7769
|
source_relevance_score: document.sourceScores?.relevanceScore,
|
|
7770
|
+
source_domain_alignment_score: document.sourceScores?.domainAlignmentScore,
|
|
7771
|
+
source_freshness_score: document.sourceScores?.freshnessScore,
|
|
7322
7772
|
source_duplication_risk: document.sourceScores?.duplicationRisk,
|
|
7323
|
-
acceptance_reasons: acceptanceReasons(document, finding, segment2)
|
|
7773
|
+
acceptance_reasons: acceptanceReasons(document, finding, segment2, provenance)
|
|
7324
7774
|
},
|
|
7325
7775
|
created_at: createdAt
|
|
7326
7776
|
};
|
|
@@ -7374,42 +7824,102 @@ function sourceQualityWeight(document) {
|
|
|
7374
7824
|
const authority = document.sourceScores?.authorityScore ?? 0.55;
|
|
7375
7825
|
const relevance = document.sourceScores?.relevanceScore ?? 0.55;
|
|
7376
7826
|
const semantic = document.sourceScores?.semanticScore ?? relevance;
|
|
7827
|
+
const alignment = document.sourceScores?.domainAlignmentScore ?? relevance;
|
|
7377
7828
|
const duplicateRisk = document.sourceScores?.duplicationRisk ?? 0;
|
|
7378
|
-
return clamp013(trust * 0.
|
|
7829
|
+
return clamp013(trust * 0.3 + authority * 0.18 + relevance * 0.18 + alignment * 0.18 + semantic * 0.1 + (1 - duplicateRisk) * 0.06);
|
|
7379
7830
|
}
|
|
7380
7831
|
/**
 * Scale the provider record target by a source-weight tier multiplier.
 *
 * Tiers: weight >= 0.82 → 1.45, >= 0.68 → 1.15, >= 0.54 → 0.85, else 0.45.
 * The result is rounded up, capped at ceil(recordsPerDocument * 1.7), and
 * never negative.
 *
 * @param recordsPerDocument Base per-document record count (sets the cap).
 * @param providerTarget Requested provider record count before weighting.
 * @param sourceWeight Source quality weight used to pick the tier.
 * @returns Non-negative integer record target.
 */
function weightedRecordTarget(recordsPerDocument, providerTarget, sourceWeight) {
  let tierMultiplier = 0.45;
  if (sourceWeight >= 0.82) {
    tierMultiplier = 1.45;
  } else if (sourceWeight >= 0.68) {
    tierMultiplier = 1.15;
  } else if (sourceWeight >= 0.54) {
    tierMultiplier = 0.85;
  }
  const upperBound = Math.ceil(recordsPerDocument * 1.7);
  const scaledTarget = Math.ceil(providerTarget * tierMultiplier);
  return Math.max(0, Math.min(upperBound, scaledTarget));
}
|
|
7384
|
-
function trustWeightedConfidence(providerConfidence, baselineConfidence, document, finding, sourceWeight) {
|
|
7835
|
+
function trustWeightedConfidence(providerConfidence, baselineConfidence, document, finding, sourceWeight, corroborationScore = 0) {
|
|
7385
7836
|
const contradictions = finding?.contradictions.length ?? 0;
|
|
7386
7837
|
const support = finding?.support.length ?? 1;
|
|
7387
7838
|
const duplicateRisk = document.sourceScores?.duplicationRisk ?? 0;
|
|
7388
|
-
const
|
|
7839
|
+
const alignment = document.sourceScores?.domainAlignmentScore ?? document.sourceScores?.relevanceScore ?? 0.55;
|
|
7840
|
+
const value = providerConfidence * 0.34 + baselineConfidence * 0.24 + sourceWeight * 0.18 + alignment * 0.12 + Math.min(0.08, support * 0.02) + corroborationScore * 0.08 - Math.min(0.18, contradictions * 0.045) - duplicateRisk * 0.1;
|
|
7389
7841
|
return Number(clamp013(value).toFixed(3));
|
|
7390
7842
|
}
|
|
7391
|
-
function acceptanceReasons(document, finding, segment2) {
|
|
7843
|
+
function acceptanceReasons(document, finding, segment2, provenance) {
|
|
7392
7844
|
const reasons = [
|
|
7393
7845
|
`source-trust:${document.sourceScores?.trustScore ?? "unknown"}`,
|
|
7394
7846
|
`source-authority:${document.sourceScores?.authorityScore ?? "unknown"}`,
|
|
7395
7847
|
`source-relevance:${document.sourceScores?.relevanceScore ?? "unknown"}`,
|
|
7396
|
-
`
|
|
7848
|
+
`source-domain-alignment:${document.sourceScores?.domainAlignmentScore ?? "unknown"}`,
|
|
7849
|
+
`source-freshness:${document.sourceScores?.freshnessScore ?? "unknown"}`,
|
|
7850
|
+
`support-count:${provenance.supportUrls.length}`,
|
|
7851
|
+
`corroboration-score:${provenance.corroborationScore}`
|
|
7397
7852
|
];
|
|
7398
7853
|
if (segment2) reasons.push(`segment:${segment2.id}`);
|
|
7854
|
+
if (provenance.supportDomains.length >= 2) reasons.push("cross-source-corroborated");
|
|
7399
7855
|
if ((finding?.contradictions.length ?? 0) === 0) reasons.push("no-open-contradictions");
|
|
7400
7856
|
return reasons;
|
|
7401
7857
|
}
|
|
7402
7858
|
function recordAcceptanceScore(record) {
|
|
7403
7859
|
const trust = numericMetadata2(record, "source_trust_score", 0.6);
|
|
7404
7860
|
const relevance = numericMetadata2(record, "source_relevance_score", 0.55);
|
|
7861
|
+
const alignment = numericMetadata2(record, "source_domain_alignment_score", relevance);
|
|
7405
7862
|
const qualityWeight = numericMetadata2(record, "source_quality_weight", 0.6);
|
|
7863
|
+
const corroboration = numericMetadata2(record, "corroboration_score", 0);
|
|
7406
7864
|
const contradictionCount = numericMetadata2(record, "contradiction_count", 0);
|
|
7407
7865
|
const citation = record.source_url && record.context.trim().length > 40 ? 1 : 0;
|
|
7408
7866
|
const base = scoreDatasetRecord(record);
|
|
7409
7867
|
return clamp013(
|
|
7410
|
-
base * 0.
|
|
7868
|
+
base * 0.32 + record.confidence * 0.16 + trust * 0.14 + relevance * 0.1 + alignment * 0.1 + qualityWeight * 0.08 + corroboration * 0.08 + citation * 0.02 - Math.min(0.2, contradictionCount * 0.04)
|
|
7411
7869
|
);
|
|
7412
7870
|
}
|
|
7871
|
+
/**
 * Collect the supporting URLs, domains and titles behind a record and derive
 * a corroboration score from them.
 *
 * @param document Primary document; `url` and `title` are read.
 * @param finding Optional finding whose `support` entries are merged with the
 *   primary URL (de-duplicated, at most 8 kept).
 * @param documents Known documents used to resolve titles for support URLs.
 * @returns `{ primaryDomain, supportUrls, supportDomains, supportSources,
 *   corroborationScore }`, with the score rounded to three decimals.
 */
function buildRecordProvenance(document, finding, documents = []) {
  const primaryDomain = domainFromUrl3(document.url);
  const uniqueUrls = /* @__PURE__ */ new Set([document.url, ...(finding?.support ?? [])]);
  const supportUrls = [...uniqueUrls].slice(0, 8);
  const supportDomains = [...new Set(supportUrls.map(domainFromUrl3).filter(Boolean))];

  const supportSources = [];
  for (const url of supportUrls) {
    const known = documents.find((candidate) => candidate.url === url);
    let title = known?.title;
    if (title == null) {
      // Fall back to the primary title for the primary URL, else the domain.
      title = url === document.url ? document.title : domainFromUrl3(url);
    }
    supportSources.push({ title, url, domain: domainFromUrl3(url) });
  }

  let independentDomains = 0;
  for (const domain of supportDomains) {
    if (domain !== primaryDomain) independentDomains += 1;
  }
  // The primary URL itself does not count as extra support.
  const extraSupport = Math.max(0, supportUrls.length - 1);
  const rawScore = clamp013(independentDomains * 0.34 + extraSupport * 0.08);

  return {
    primaryDomain,
    supportUrls,
    supportDomains,
    supportSources,
    corroborationScore: Number(rawScore.toFixed(3))
  };
}
|
|
7894
|
+
/**
 * Build the per-record confidence breakdown, one `factor(...)` entry per key.
 *
 * Missing source scores default to: authority 0.55, trust 0.62,
 * relevance 0.55, alignment = relevance, freshness 0.62.
 *
 * @param document Document whose optional `sourceScores` are read.
 * @param finding Optional finding; its contradiction count drives the penalty.
 * @param provenance Object providing `supportUrls` and `corroborationScore`.
 * @param confidence Overall confidence for the record.
 * @param sourceWeight Source quality weight for the record.
 * @returns Object of factor entries keyed by factor name.
 */
function confidenceFactorsForRecord(document, finding, provenance, confidence, sourceWeight) {
  const authority = document.sourceScores?.authorityScore ?? 0.55;
  const trust = document.sourceScores?.trustScore ?? 0.62;
  const relevance = document.sourceScores?.relevanceScore ?? 0.55;
  const alignment = document.sourceScores?.domainAlignmentScore ?? relevance;
  const freshness = document.sourceScores?.freshnessScore ?? 0.62;

  const openContradictions = finding?.contradictions.length ?? 0;
  // 0.22 per contradiction, saturating at 1.
  const contradictionPenalty = Math.min(1, openContradictions * 0.22);
  // Four support URLs count as full retrieval support.
  const retrievalSupport = Math.min(1, provenance.supportUrls.length / 4);
  const temporal = freshness * 0.6 + trust * 0.24 + authority * 0.16;

  const factors = {};
  factors.overall = factor(confidence);
  factors.sourceAuthority = factor(authority);
  factors.sourceTrust = factor(trust);
  factors.domainAlignment = factor(alignment);
  factors.topicRelevance = factor(relevance);
  factors.sourceFreshness = factor(freshness);
  factors.temporalConfidence = factor(temporal);
  factors.corroboration = factor(provenance.corroborationScore);
  factors.retrievalSupport = factor(retrievalSupport);
  factors.sourceQualityWeight = factor(sourceWeight);
  factors.contradictionPenalty = factor(contradictionPenalty);
  return factors;
}
|
|
7916
|
+
/**
 * Wrap a score as `{ score, level }`.
 *
 * The score is passed through `clamp013` (presumably a [0, 1] clamp — defined
 * elsewhere) and rounded to three decimals. Level thresholds on the clamped
 * value: >= 0.86 "very_high", >= 0.68 "high", >= 0.42 "medium", else "low".
 *
 * @param score Raw score.
 * @returns Object with numeric `score` and string `level`.
 */
function factor(score) {
  const bounded = clamp013(score);
  let level = "low";
  if (bounded >= 0.86) {
    level = "very_high";
  } else if (bounded >= 0.68) {
    level = "high";
  } else if (bounded >= 0.42) {
    level = "medium";
  }
  return { score: Number(bounded.toFixed(3)), level };
}
|
|
7413
7923
|
function numericMetadata2(record, key, fallback) {
|
|
7414
7924
|
const value = record.metadata?.[key];
|
|
7415
7925
|
return typeof value === "number" && Number.isFinite(value) ? value : fallback;
|
|
@@ -7575,20 +8085,34 @@ function performanceConfig(mode) {
|
|
|
7575
8085
|
return {
|
|
7576
8086
|
candidateMultiplier: 1.35,
|
|
7577
8087
|
queryCap: 6,
|
|
7578
|
-
gateMinimumRatio: 0.
|
|
7579
|
-
minTrustScore: 0.
|
|
7580
|
-
minRelevanceScore: 0.
|
|
8088
|
+
gateMinimumRatio: 0.35,
|
|
8089
|
+
minTrustScore: 0.54,
|
|
8090
|
+
minRelevanceScore: 0.4,
|
|
8091
|
+
minDomainAlignmentScore: 0.4,
|
|
7581
8092
|
debateEnabled: false,
|
|
7582
8093
|
datasetConcurrency: 2
|
|
7583
8094
|
};
|
|
7584
8095
|
}
|
|
8096
|
+
if (mode === "strict") {
|
|
8097
|
+
return {
|
|
8098
|
+
candidateMultiplier: 1.6,
|
|
8099
|
+
queryCap: 20,
|
|
8100
|
+
gateMinimumRatio: 0.55,
|
|
8101
|
+
minTrustScore: 0.62,
|
|
8102
|
+
minRelevanceScore: 0.52,
|
|
8103
|
+
minDomainAlignmentScore: 0.56,
|
|
8104
|
+
debateEnabled: true,
|
|
8105
|
+
datasetConcurrency: 1
|
|
8106
|
+
};
|
|
8107
|
+
}
|
|
7585
8108
|
if (mode === "maximum-quality") {
|
|
7586
8109
|
return {
|
|
7587
8110
|
candidateMultiplier: 2.6,
|
|
7588
8111
|
queryCap: 18,
|
|
7589
8112
|
gateMinimumRatio: 0.42,
|
|
7590
|
-
minTrustScore: 0.
|
|
7591
|
-
minRelevanceScore: 0.
|
|
8113
|
+
minTrustScore: 0.5,
|
|
8114
|
+
minRelevanceScore: 0.36,
|
|
8115
|
+
minDomainAlignmentScore: 0.36,
|
|
7592
8116
|
debateEnabled: true,
|
|
7593
8117
|
datasetConcurrency: 1
|
|
7594
8118
|
};
|
|
@@ -7597,24 +8121,28 @@ function performanceConfig(mode) {
|
|
|
7597
8121
|
candidateMultiplier: 1.85,
|
|
7598
8122
|
queryCap: 12,
|
|
7599
8123
|
gateMinimumRatio: 0.35,
|
|
7600
|
-
minTrustScore: 0.
|
|
7601
|
-
minRelevanceScore: 0.
|
|
8124
|
+
minTrustScore: 0.52,
|
|
8125
|
+
minRelevanceScore: 0.38,
|
|
8126
|
+
minDomainAlignmentScore: 0.38,
|
|
7602
8127
|
debateEnabled: true,
|
|
7603
8128
|
datasetConcurrency: 1
|
|
7604
8129
|
};
|
|
7605
8130
|
}
|
|
7606
|
-
function gateSources(sources, mode) {
|
|
8131
|
+
function gateSources(sources, mode, topic) {
|
|
7607
8132
|
const perf = performanceConfig(mode);
|
|
7608
8133
|
const concreteSources = sources.filter(isConcreteEvidenceSource);
|
|
7609
8134
|
const accepted = concreteSources.filter((source) => {
|
|
7610
8135
|
const trust = source.trustScore ?? source.score;
|
|
7611
8136
|
const relevance = source.relevanceScore ?? source.score;
|
|
8137
|
+
const semantic = source.semanticScore ?? source.score;
|
|
8138
|
+
const alignment = source.domainAlignmentScore ?? relevance;
|
|
7612
8139
|
const duplicateRisk = source.duplicationRisk ?? 0;
|
|
7613
|
-
|
|
7614
|
-
return trust >= perf.minTrustScore && relevance >= perf.minRelevanceScore && duplicateRisk < 0.72 && (trust >= 0.52 || authority >= 0.72);
|
|
8140
|
+
return trust >= perf.minTrustScore && relevance >= perf.minRelevanceScore && alignment >= perf.minDomainAlignmentScore && (semantic >= 0.08 || alignment >= 0.68) && duplicateRisk < 0.72 && isTopicAlignedSource(topic, source, mode);
|
|
7615
8141
|
});
|
|
7616
8142
|
const minimum = Math.min(concreteSources.length, Math.max(3, Math.ceil(concreteSources.length * perf.gateMinimumRatio)));
|
|
7617
|
-
const fallback = accepted.length >= minimum ? accepted : accepted.length ? accepted : concreteSources.filter(
|
|
8143
|
+
const fallback = accepted.length >= minimum ? accepted : accepted.length ? accepted : concreteSources.filter(
|
|
8144
|
+
(source) => (source.authorityScore ?? 0) >= 0.78 && (source.relevanceScore ?? source.score) >= perf.minRelevanceScore && isTopicAlignedSource(topic, source, mode)
|
|
8145
|
+
).slice(0, minimum);
|
|
7618
8146
|
return {
|
|
7619
8147
|
sources: fallback,
|
|
7620
8148
|
filtered: Math.max(0, sources.length - fallback.length)
|
|
@@ -7625,6 +8153,60 @@ function isConcreteEvidenceSource(source) {
|
|
|
7625
8153
|
const provider = (source.provider || source.discoveredBy || "").toLowerCase();
|
|
7626
8154
|
return provider !== "local-heuristic" && !provider.includes("heuristic");
|
|
7627
8155
|
}
|
|
8156
|
+
/**
 * Decide whether a source is aligned closely enough with the topic to keep.
 *
 * Topics with fewer than two core tokens always pass. Otherwise the source
 * must clear the mode's alignment floor (strict 0.56, fast 0.4, others 0.36),
 * relevance 0.36 and trust 0.48. GitHub/code sources on non-code topics and
 * sources flagged "broad-source-penalty" face higher alignment bars.
 *
 * @param topic Requested dataset topic.
 * @param source Candidate source with score fields, `url`, provider info.
 * @param mode Performance mode name.
 * @returns `true` when the source should be kept.
 */
function isTopicAlignedSource(topic, source, mode) {
  if (topicCoreTokens2(topic).length < 2) return true;

  const relevance = source.relevanceScore ?? source.score;
  const alignment = source.domainAlignmentScore ?? relevance;
  const trust = source.trustScore ?? source.score;

  const providerName = (source.provider || source.discoveredBy || "").toLowerCase();
  const fromGithub = providerName === "github" || source.domain === "github.com" || source.url.includes("github.com/");
  const isCodeSource = fromGithub || (source.sourceType ?? "unknown") === "code";
  const topicIsAboutCode = /\b(code|github|repository|developer|api|sdk|package|library|framework|docs|documentation)\b/i.test(topic);

  let alignmentFloor = 0.36;
  if (mode === "strict") {
    alignmentFloor = 0.56;
  } else if (mode === "fast") {
    alignmentFloor = 0.4;
  }

  // Comparisons stay in "<" form so missing (undefined/NaN) scores do not reject.
  const belowFloor = alignment < alignmentFloor || relevance < 0.36 || trust < 0.48;
  if (belowFloor) return false;

  const offTopicCode = isCodeSource && !topicIsAboutCode && (alignment < 0.64 || relevance < 0.56);
  if (offTopicCode) return false;

  const broadAndWeak = source.qualitySignals?.includes("broad-source-penalty") && alignment < 0.72;
  return !broadAndWeak;
}
|
|
8172
|
+
// Generic dataset / format / workflow / vendor words that never count as core
// topic tokens. Built once instead of on every call.
var CORE_TOKEN_STOP_WORDS2 = /* @__PURE__ */ new Set([
  "data", "dataset", "datasets", "records", "record",
  // "row" added next to "rows" so singular and plural are treated alike.
  "row", "rows",
  "corpus", "csv", "json", "jsonl", "parquet",
  "rag", "fine", "tune", "tuning", "training", "ready",
  "examples", "example",
  "with", "from", "into", "about", "info", "information",
  "generate", "generated", "synthetic",
  "model", "models", "openai", "anthropic", "ai"
]);

/**
 * Extract up to ten core tokens from a topic string.
 *
 * The topic is lower-cased, every run of non `[a-z0-9]` characters becomes a
 * separator, and tokens shorter than three characters or present in the
 * stop-word set are dropped.
 *
 * @param topic Topic text. A non-string value yields `[]` instead of throwing.
 * @returns Array of at most 10 lowercase tokens, in order of appearance.
 */
function topicCoreTokens2(topic) {
  if (typeof topic !== "string") return [];
  return topic
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, " ")
    .split(/\s+/)
    .filter((token) => token.length >= 3 && !CORE_TOKEN_STOP_WORDS2.has(token))
    .slice(0, 10);
}
|
|
7628
8210
|
function sourceDiversityScore(sources) {
|
|
7629
8211
|
if (!sources.length) return 0;
|
|
7630
8212
|
const domains = new Set(sources.map((source) => source.domain || domainFromUrl4(source.url)));
|
|
@@ -7668,7 +8250,7 @@ async function generateDataset(options) {
|
|
|
7668
8250
|
message: `${sources.length} ranked sources from ${research.providersUsed.join(", ") || "research engine"}`,
|
|
7669
8251
|
metric: `trust ${Math.round(averageTrust * 100)} / 100`
|
|
7670
8252
|
});
|
|
7671
|
-
const gated = gateSources(sources, performanceMode);
|
|
8253
|
+
const gated = gateSources(sources, performanceMode, options.topic);
|
|
7672
8254
|
event(options.onEvent, {
|
|
7673
8255
|
stage: "discovery",
|
|
7674
8256
|
agent: discovery.name,
|
|
@@ -7799,7 +8381,7 @@ async function generateDataset(options) {
|
|
|
7799
8381
|
outputDir: `${workspace.datasets}/${datasetId}`,
|
|
7800
8382
|
formats: exportFormats,
|
|
7801
8383
|
generationPlan,
|
|
7802
|
-
sourceManifest: sources,
|
|
8384
|
+
sourceManifest: gated.sources,
|
|
7803
8385
|
researchGraph: research.graph,
|
|
7804
8386
|
generationSummary: {
|
|
7805
8387
|
contradictionsOpen,
|
|
@@ -7812,7 +8394,7 @@ async function generateDataset(options) {
|
|
|
7812
8394
|
evaluation,
|
|
7813
8395
|
qualityMetrics,
|
|
7814
8396
|
metrics: {
|
|
7815
|
-
sourcesDiscovered: sources.length,
|
|
8397
|
+
sourcesDiscovered: gated.sources.length,
|
|
7816
8398
|
documentsExtracted: extracted.length,
|
|
7817
8399
|
findingsVerified: debated.length,
|
|
7818
8400
|
duplicatesRemoved: deduped.removed,
|
|
@@ -7829,9 +8411,9 @@ async function generateDataset(options) {
|
|
|
7829
8411
|
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "data-dictionary.md", renderDataDictionary(manifest)));
|
|
7830
8412
|
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "source-graph.json", `${JSON.stringify(research.graph, null, 2)}
|
|
7831
8413
|
`));
|
|
7832
|
-
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.json", `${JSON.stringify(sources, null, 2)}
|
|
8414
|
+
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.json", `${JSON.stringify(gated.sources, null, 2)}
|
|
7833
8415
|
`));
|
|
7834
|
-
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.md", renderSourcesMarkdown(options.topic, sources)));
|
|
8416
|
+
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.md", renderSourcesMarkdown(options.topic, gated.sources)));
|
|
7835
8417
|
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "generation-plan.json", `${JSON.stringify(generationPlan, null, 2)}
|
|
7836
8418
|
`));
|
|
7837
8419
|
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "benchmark-report.json", `${JSON.stringify(evaluation, null, 2)}
|
|
@@ -9750,7 +10332,7 @@ Flags:
|
|
|
9750
10332
|
--type instruction|rag|qa
|
|
9751
10333
|
--datasets 1
|
|
9752
10334
|
--depth shallow|medium|deep
|
|
9753
|
-
--mode fast|balanced|maximum-quality
|
|
10335
|
+
--mode fast|balanced|strict|maximum-quality
|
|
9754
10336
|
--sources 8
|
|
9755
10337
|
--rows 125
|
|
9756
10338
|
--workspace ~/Desktop/alys-output
|
|
@@ -9914,8 +10496,9 @@ function parseDepth(value) {
|
|
|
9914
10496
|
return void 0;
|
|
9915
10497
|
}
|
|
9916
10498
|
function parsePerformanceMode(value) {
|
|
9917
|
-
if (value === "fast" || value === "balanced" || value === "maximum-quality") return value;
|
|
10499
|
+
if (value === "fast" || value === "balanced" || value === "strict" || value === "maximum-quality") return value;
|
|
9918
10500
|
if (value === "max" || value === "quality") return "maximum-quality";
|
|
10501
|
+
if (value === "trust" || value === "conservative") return "strict";
|
|
9919
10502
|
return void 0;
|
|
9920
10503
|
}
|
|
9921
10504
|
function isPreparationCommand(command) {
|
|
@@ -9997,6 +10580,13 @@ function truncate(value, max = 88) {
|
|
|
9997
10580
|
const normalized = value.replace(/\s+/g, " ").trim();
|
|
9998
10581
|
return normalized.length > max ? `${normalized.slice(0, max - 1)}\u2026` : normalized;
|
|
9999
10582
|
}
|
|
10583
|
+
/**
 * Return the hostname of `url` with a leading "www." removed.
 *
 * @param url Absolute URL string.
 * @returns The hostname, or the input unchanged when it cannot be parsed.
 */
function domainFromUrl5(url) {
  let hostname;
  try {
    hostname = new URL(url).hostname;
  } catch {
    // Unparseable input is returned as-is.
    return url;
  }
  return hostname.replace(/^www\./, "");
}
|
|
10000
10590
|
function getMetrics(dataset) {
|
|
10001
10591
|
const metrics = dataset.manifest.metrics;
|
|
10002
10592
|
return metrics && typeof metrics === "object" ? metrics : {};
|
|
@@ -10024,6 +10614,10 @@ function getSourceManifest(dataset) {
|
|
|
10024
10614
|
if (!Array.isArray(sources)) return [];
|
|
10025
10615
|
return sources.filter((source) => Boolean(source) && typeof source === "object").filter((source) => typeof source.title === "string" || typeof source.url === "string");
|
|
10026
10616
|
}
|
|
10617
|
+
/**
 * Read `manifest.researchGraph` from a dataset.
 *
 * @param dataset Object with a `manifest` property.
 * @returns The stored graph when it is a truthy object, otherwise a new `{}`.
 */
function getResearchGraph(dataset) {
  const { researchGraph } = dataset.manifest;
  if (!researchGraph || typeof researchGraph !== "object") {
    return {};
  }
  return researchGraph;
}
|
|
10027
10621
|
function printStage(code, status, label, metric) {
|
|
10028
10622
|
const tint = status === "DONE" || status === "OK" ? "green" : status === "WARN" ? "yellow" : "cyan";
|
|
10029
10623
|
const prefix = `${paint(`[${code.padEnd(4).slice(0, 4)}]`, "gray")} ${paint(status.padEnd(4), tint)}`;
|
|
@@ -10065,11 +10659,43 @@ function previewRecord(dataset) {
|
|
|
10065
10659
|
if (!input && !output) return null;
|
|
10066
10660
|
const metadata = parsed.metadata && typeof parsed.metadata === "object" ? parsed.metadata : {};
|
|
10067
10661
|
const explanation = Array.isArray(metadata.acceptance_explanation) ? metadata.acceptance_explanation.filter((item) => typeof item === "string") : Array.isArray(metadata.acceptance_reasons) ? metadata.acceptance_reasons.filter((item) => typeof item === "string") : [];
|
|
10068
|
-
|
|
10662
|
+
const supportSources = Array.isArray(metadata.support_sources) ? metadata.support_sources.filter((item) => Boolean(item) && typeof item === "object").map((item) => {
|
|
10663
|
+
const title = typeof item.title === "string" ? item.title : "";
|
|
10664
|
+
const domain = typeof item.domain === "string" ? item.domain : "";
|
|
10665
|
+
return truncate(title || domain || "source", 46);
|
|
10666
|
+
}).filter(Boolean).slice(0, 3) : [];
|
|
10667
|
+
const groundedBy = supportSources.length ? supportSources : Array.isArray(metadata.support_urls) ? metadata.support_urls.filter((item) => typeof item === "string").slice(0, 3).map(domainFromUrl5) : typeof parsed.source_url === "string" ? [domainFromUrl5(parsed.source_url)] : [];
|
|
10668
|
+
const factors = confidenceFactorsLine(metadata.confidence_factors);
|
|
10669
|
+
const cautions = Array.isArray(metadata.contradiction_notes) ? metadata.contradiction_notes.filter((item) => typeof item === "string").slice(0, 2) : [];
|
|
10670
|
+
return {
|
|
10671
|
+
input: truncate(input, 92),
|
|
10672
|
+
output: truncate(output, 120),
|
|
10673
|
+
why: explanation.slice(0, 2).map((item) => truncate(item, 112)),
|
|
10674
|
+
groundedBy,
|
|
10675
|
+
factors,
|
|
10676
|
+
cautions: cautions.map((item) => truncate(item, 112))
|
|
10677
|
+
};
|
|
10069
10678
|
} catch {
|
|
10070
10679
|
return null;
|
|
10071
10680
|
}
|
|
10072
10681
|
}
|
|
10682
|
+
/**
 * Builds a one-line summary of a record's confidence factors.
 *
 * Each known factor is rendered through `factorLabel`; factors that render
 * to an empty string are dropped and the rest are joined with a middle dot.
 *
 * @param {unknown} value - The `confidence_factors` metadata value.
 * @returns {string} The joined summary, or "" when `value` is not an object
 *   or no factor produced a label.
 */
function confidenceFactorsLine(value) {
  const isRecord = Boolean(value) && typeof value === "object";
  if (!isRecord) {
    return "";
  }
  // Display label paired with the property it is read from, in output order.
  const labelledKeys = [
    ["authority", "sourceAuthority"],
    ["alignment", "domainAlignment"],
    ["freshness", "sourceFreshness"],
    ["corroboration", "corroboration"],
    ["support", "retrievalSupport"]
  ];
  const rendered = [];
  for (const [label, key] of labelledKeys) {
    const text = factorLabel(label, value[key]);
    if (text) {
      rendered.push(text);
    }
  }
  return rendered.join(" \xB7 ");
}
|
|
10694
|
+
/**
 * Renders a single confidence factor as "<label> <level>".
 *
 * @param {string} label - Display name of the factor (e.g. "authority").
 * @param {unknown} value - Factor object expected to carry a string `level`.
 * @returns {string} The label followed by the level with every underscore
 *   turned into a space, or "" when `value` is not an object or its `level`
 *   is not a string.
 */
function factorLabel(label, value) {
  if (!value || typeof value !== "object") return "";
  const { level } = value;
  if (typeof level !== "string") return "";
  // A string pattern passed to replace() only swaps the first match, which
  // left levels such as "very_high_trust" half-converted; use a global regex.
  return `${label} ${level.replace(/_/g, " ")}`;
}
|
|
10073
10699
|
function depthMultiplier2(depth) {
|
|
10074
10700
|
if (depth === "deep") return 1.6;
|
|
10075
10701
|
if (depth === "shallow") return 0.75;
|
|
@@ -10148,8 +10774,13 @@ function printGenerationSummary(response, workspaceRoot) {
|
|
|
10148
10774
|
acc.findings += Number(metrics.findingsVerified ?? 0);
|
|
10149
10775
|
acc.duplicates += Number(metrics.duplicatesRemoved ?? summary.duplicatesRemoved ?? 0);
|
|
10150
10776
|
const quality = getQualityMetrics(dataset);
|
|
10777
|
+
const graph = getResearchGraph(dataset);
|
|
10778
|
+
const graphMetrics = graph.metrics ?? {};
|
|
10151
10779
|
acc.contradictions += Number(quality.contradictionResolutionCount ?? 0);
|
|
10152
10780
|
acc.lowTrustFiltered += Number(quality.lowTrustSourceFilterRate ?? 0);
|
|
10781
|
+
acc.corroborationEdges += Number(graphMetrics.corroborationEdges ?? 0);
|
|
10782
|
+
acc.graphContradictions += Number(graphMetrics.contradictionEdges ?? 0);
|
|
10783
|
+
acc.freshness.push(Number(graphMetrics.averageFreshness ?? 0));
|
|
10153
10784
|
acc.citationCoverage.push(Number(quality.citationCoverage ?? 0));
|
|
10154
10785
|
acc.uniqueness.push(Number(quality.recordUniqueness ?? 0));
|
|
10155
10786
|
acc.relevance.push(Number(quality.relevanceScore ?? 0));
|
|
@@ -10169,8 +10800,11 @@ function printGenerationSummary(response, workspaceRoot) {
|
|
|
10169
10800
|
findings: 0,
|
|
10170
10801
|
duplicates: 0,
|
|
10171
10802
|
contradictions: 0,
|
|
10803
|
+
corroborationEdges: 0,
|
|
10804
|
+
graphContradictions: 0,
|
|
10172
10805
|
lowTrustFiltered: 0,
|
|
10173
10806
|
confidences: [],
|
|
10807
|
+
freshness: [],
|
|
10174
10808
|
citationCoverage: [],
|
|
10175
10809
|
uniqueness: [],
|
|
10176
10810
|
relevance: [],
|
|
@@ -10188,9 +10822,11 @@ function printGenerationSummary(response, workspaceRoot) {
|
|
|
10188
10822
|
const instructionTuning = average5(totals.instructionTuning);
|
|
10189
10823
|
const factualGrounding = average5(totals.factualGrounding);
|
|
10190
10824
|
const humanUsefulness = average5(totals.humanUsefulness);
|
|
10825
|
+
const freshness = average5(totals.freshness);
|
|
10191
10826
|
console.log("");
|
|
10192
10827
|
console.log(paint("Alys run complete", "green"));
|
|
10193
10828
|
printStage("SRC", "DONE", "Authoritative sources ranked", formatInt2(totals.sources));
|
|
10829
|
+
printStage("SRC", "DONE", "Source corroboration edges", formatInt2(totals.corroborationEdges));
|
|
10194
10830
|
printStage("SRC", "DONE", "Low-trust source filter applied", `${Math.round(totals.lowTrustFiltered / Math.max(1, response.datasets.length) * 100)}% avg filtered`);
|
|
10195
10831
|
printStage("EXT", "DONE", "Source documents normalized", formatInt2(totals.documents));
|
|
10196
10832
|
printStage("CHK", "DONE", "Findings verified", formatInt2(totals.findings));
|
|
@@ -10201,6 +10837,7 @@ function printGenerationSummary(response, workspaceRoot) {
|
|
|
10201
10837
|
printStage("EVAL", "DONE", "Citation coverage", formatPercent2(citationCoverage));
|
|
10202
10838
|
printStage("EVAL", "DONE", "Record uniqueness", formatPercent2(uniqueness));
|
|
10203
10839
|
printStage("EVAL", "DONE", "Topic relevance", formatPercent2(relevance));
|
|
10840
|
+
printStage("EVAL", "DONE", "Source freshness", formatPercent2(freshness));
|
|
10204
10841
|
printStage("EVAL", "DONE", "RAG suitability", formatScore(ragSuitability));
|
|
10205
10842
|
printStage("EVAL", "DONE", "Instruction tuning suitability", formatScore(instructionTuning));
|
|
10206
10843
|
printStage("EVAL", "DONE", "Factual grounding", formatScore(factualGrounding));
|
|
@@ -10217,6 +10854,7 @@ function printGenerationSummary(response, workspaceRoot) {
|
|
|
10217
10854
|
const sources = Number(metrics.sourcesDiscovered ?? 0);
|
|
10218
10855
|
const confidenceValue = Number(metrics.averageConfidence ?? summary.averageConfidence ?? 0);
|
|
10219
10856
|
const blueprint2 = getBlueprint(dataset);
|
|
10857
|
+
const graphMetrics = getResearchGraph(dataset).metrics ?? {};
|
|
10220
10858
|
const outputDir = import_node_path5.default.join(root, "datasets", dataset.id);
|
|
10221
10859
|
console.log(`${paint("\u2022", "yellow")} ${paint(dataset.id, "white")} ${formatInt2(records)} records ${formatInt2(sources)} sources ${formatPercent2(confidenceValue)} confidence`);
|
|
10222
10860
|
console.log(` ${truncate(dataset.topic, 110)}`);
|
|
@@ -10225,6 +10863,7 @@ function printGenerationSummary(response, workspaceRoot) {
|
|
|
10225
10863
|
}
|
|
10226
10864
|
console.log(` ${paint(outputDir, "cyan")}`);
|
|
10227
10865
|
console.log(` quality ${formatPercent2(Number(quality.citationCoverage ?? 0))} citations \xB7 ${formatPercent2(Number(quality.recordUniqueness ?? 0))} unique \xB7 ${formatPercent2(Number(quality.sourceDiversity ?? 0))} source diversity`);
|
|
10866
|
+
console.log(` graph ${formatInt2(Number(graphMetrics.corroborationEdges ?? 0))} corroborations \xB7 ${formatPercent2(Number(graphMetrics.averageFreshness ?? 0))} freshness \xB7 ${formatInt2(Number(graphMetrics.providerCount ?? 0))} providers`);
|
|
10228
10867
|
console.log(` suitability RAG ${formatScore(Number(suitability.ragSuitability ?? 0))} \xB7 tuning ${formatScore(Number(suitability.instructionTuning ?? 0))} \xB7 usefulness ${formatScore(Number(suitability.humanUsefulness ?? 0))}`);
|
|
10229
10868
|
const topSources = getSourceManifest(dataset).slice(0, 5);
|
|
10230
10869
|
if (topSources.length) {
|
|
@@ -10232,9 +10871,11 @@ function printGenerationSummary(response, workspaceRoot) {
|
|
|
10232
10871
|
for (const source of topSources) {
|
|
10233
10872
|
const label = source.title || source.domain || source.provider || "source";
|
|
10234
10873
|
const trust = Number(source.trustScore ?? source.authorityScore ?? source.relevanceScore ?? 0);
|
|
10874
|
+
const alignment = Number(source.domainAlignmentScore ?? 0);
|
|
10235
10875
|
const type = source.sourceType ? ` ${source.sourceType}` : "";
|
|
10236
10876
|
const score = trust > 0 ? ` ${formatPercent2(trust)} trust` : "";
|
|
10237
|
-
|
|
10877
|
+
const alignmentLabel = alignment > 0 ? ` ${formatPercent2(alignment)} aligned` : "";
|
|
10878
|
+
console.log(` - ${truncate(label, 76)}${paint(`${type}${score}${alignmentLabel}`, "gray")}`);
|
|
10238
10879
|
if (source.url) console.log(` ${paint(source.url, "cyan")}`);
|
|
10239
10880
|
}
|
|
10240
10881
|
}
|
|
@@ -10243,6 +10884,15 @@ function printGenerationSummary(response, workspaceRoot) {
|
|
|
10243
10884
|
console.log(paint(" preview", "gray"));
|
|
10244
10885
|
if (preview.input) console.log(` in ${paint(preview.input, "gray")}`);
|
|
10245
10886
|
if (preview.output) console.log(` out ${preview.output}`);
|
|
10887
|
+
if (preview.groundedBy.length) {
|
|
10888
|
+
console.log(` grounded by ${paint(preview.groundedBy.join(", "), "gray")}`);
|
|
10889
|
+
}
|
|
10890
|
+
if (preview.factors) {
|
|
10891
|
+
console.log(` confidence ${paint(preview.factors, "gray")}`);
|
|
10892
|
+
}
|
|
10893
|
+
for (const caution of preview.cautions) {
|
|
10894
|
+
console.log(` caution ${paint(caution, "yellow")}`);
|
|
10895
|
+
}
|
|
10246
10896
|
for (const reason of preview.why) {
|
|
10247
10897
|
console.log(` why ${paint(reason, "gray")}`);
|
|
10248
10898
|
}
|
package/package.json
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "alys-akusa",
|
|
3
|
-
"version": "0.1.14",
|
|
3
|
+
"version": "0.1.20",
|
|
4
4
|
"private": false,
|
|
5
5
|
"description": "Alys local CLI runtime for autonomous AI data preparation.",
|
|
6
6
|
"license": "UNLICENSED",
|
|
7
7
|
"type": "module",
|
|
8
8
|
"bin": {
|
|
9
|
-
"alys": "dist/index.cjs"
|
|
9
|
+
"alys": "dist/index.cjs",
|
|
10
|
+
"alys-akusa": "dist/index.cjs"
|
|
10
11
|
},
|
|
11
12
|
"files": [
|
|
12
13
|
"dist",
|