alys-akusa 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +762 -76
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -5109,7 +5109,10 @@ async function discoverResearchSources(topic, options = {}) {
|
|
|
5109
5109
|
};
|
|
5110
5110
|
}
|
|
5111
5111
|
function createConfiguredSearchProviders() {
|
|
5112
|
-
const providers = [
|
|
5112
|
+
const providers = [
|
|
5113
|
+
new GitHubSearchProvider(env("GITHUB_TOKEN")),
|
|
5114
|
+
new KaggleSearchProvider(env("KAGGLE_USERNAME"), env("KAGGLE_KEY"))
|
|
5115
|
+
];
|
|
5113
5116
|
const tavilyKey = env("TAVILY_API_KEY");
|
|
5114
5117
|
if (tavilyKey) providers.push(new TavilySearchProvider(tavilyKey));
|
|
5115
5118
|
const serpApiKey = env("SERPAPI_API_KEY");
|
|
@@ -5155,6 +5158,107 @@ var TavilySearchProvider = class {
|
|
|
5155
5158
|
}));
|
|
5156
5159
|
}
|
|
5157
5160
|
};
|
|
5161
|
+
/**
 * Search provider that queries the GitHub repository search endpoint and maps
 * each returned repository to a research-source result.
 *
 * Depends on module-level helpers defined elsewhere in this file:
 * `fetchJson`, `asArray`, `firstString`, `firstNumber` and `clamp01`.
 */
var GitHubSearchProvider = class {
  /**
   * @param {string|null} [token] Optional GitHub token. When present it is sent
   *   as a Bearer `Authorization` header; otherwise no auth header is sent.
   */
  constructor(token = null) {
    this.token = token;
  }
  name = "github";
  /**
   * Searches GitHub repositories for dataset-like projects matching `query`.
   *
   * @param {string} query Topic text; dataset-oriented keywords and `in:`
   *   qualifiers are appended before it is sent.
   * @param {{ limit?: number, timeoutMs?: number }} [options] `limit` is
   *   clamped to 1..20 (default 10); `timeoutMs` is forwarded to `fetchJson`.
   * @returns {Promise<object[]>} One result per repository that has both a
   *   `full_name` and an `html_url`; an empty array when the payload carries
   *   no usable items.
   */
  async search(query, options = {}) {
    // A limit of 0, a negative number or NaN would otherwise be sent verbatim
    // as an invalid page size, so normalise it into the 1..20 range first.
    const requested = Number(options.limit ?? 10);
    const perPage = Number.isFinite(requested) ? Math.max(1, Math.min(20, Math.trunc(requested))) : 10;

    const url = new URL("https://api.github.com/search/repositories");
    url.searchParams.set("q", `${query} dataset OR benchmark OR corpus OR csv OR jsonl in:name,description,readme`);
    url.searchParams.set("sort", "stars");
    url.searchParams.set("order", "desc");
    url.searchParams.set("per_page", String(perPage));

    const headers = {
      Accept: "application/vnd.github+json",
      "User-Agent": "AlysResearchBot/0.1",
      "X-GitHub-Api-Version": "2022-11-28"
    };
    if (this.token) headers.Authorization = `Bearer ${this.token}`;

    const payload = await fetchJson(url.toString(), { headers }, options.timeoutMs);
    // Guard against a null / non-object payload instead of throwing on `.items`,
    // matching how the Kaggle provider treats its payload.
    const items = payload && typeof payload === "object" ? asArray(payload.items) : [];

    return items.flatMap((item) => {
      if (!item || typeof item !== "object") return [];
      const object = item;
      const fullName = firstString(object, ["full_name"]);
      const htmlUrl = firstString(object, ["html_url"]);
      // Without a name and a link the result cannot be shown or cited.
      if (!fullName || !htmlUrl) return [];

      const stars = firstNumber(object, ["stargazers_count"]) ?? 0;
      const forks = firstNumber(object, ["forks_count"]) ?? 0;
      const topics = Array.isArray(object.topics) ? object.topics.map(String).slice(0, 8) : [];
      const license = object.license && typeof object.license === "object" ? firstString(object.license, ["spdx_id", "name"]) : "";
      const description = firstString(object, ["description"]);
      const language = firstString(object, ["language"]);
      // Log-scaled popularity (stars weigh more than forks) plus small bonuses
      // for a declared license and for having topics, passed through clamp01.
      const score = clamp01(Math.log10(stars + 1) / 5 * 0.72 + Math.log10(forks + 1) / 5 * 0.16 + (license ? 0.08 : 0) + (topics.length ? 0.04 : 0));

      return [{
        title: `GitHub: ${fullName}`,
        url: htmlUrl,
        // Empty parts are dropped so the snippet has no stray separators.
        snippet: [
          description,
          language ? `Language: ${language}.` : "",
          license ? `License: ${license}.` : "",
          topics.length ? `Topics: ${topics.join(", ")}.` : "",
          `Stars: ${stars}. Forks: ${forks}.`
        ].filter(Boolean).join(" "),
        publishedAt: firstString(object, ["updated_at", "pushed_at", "created_at"]),
        score,
        provider: this.name,
        query,
        raw: item
      }];
    });
  }
};
|
|
5211
|
+
/**
 * Search provider that queries the Kaggle dataset listing endpoint and maps
 * each dataset to a research-source result.
 *
 * Depends on module-level helpers defined elsewhere in this file:
 * `fetchJson`, `asArray`, `firstString`, `firstNumber` and `clamp01`.
 */
var KaggleSearchProvider = class {
  /**
   * @param {string|null} [username] Kaggle username used for Basic auth.
   * @param {string|null} [key] Kaggle API key used for Basic auth.
   */
  constructor(username = null, key = null) {
    this.username = username;
    this.key = key;
  }
  name = "kaggle";
  /**
   * Searches Kaggle datasets matching `query`.
   *
   * @param {string} query Search text, sent unchanged as the `search` parameter.
   * @param {{ limit?: number, timeoutMs?: number }} [options] `limit` is
   *   clamped to 1..20 (default 10); `timeoutMs` is forwarded to `fetchJson`.
   * @returns {Promise<object[]>} One result per dataset that has a title and a
   *   resolvable URL. Resolves to an empty array, without any request, when
   *   either credential is missing.
   */
  async search(query, options = {}) {
    // The request is always sent with Basic auth, so skip it without credentials.
    if (!this.username || !this.key) {
      return [];
    }

    // A limit of 0, a negative number or NaN would otherwise be sent verbatim
    // as an invalid page size, so normalise it into the 1..20 range first.
    const requested = Number(options.limit ?? 10);
    const pageSize = Number.isFinite(requested) ? Math.max(1, Math.min(20, Math.trunc(requested))) : 10;

    const url = new URL("https://www.kaggle.com/api/v1/datasets/list");
    url.searchParams.set("search", query);
    url.searchParams.set("sortBy", "hottest");
    url.searchParams.set("pageSize", String(pageSize));

    const auth = Buffer.from(`${this.username}:${this.key}`).toString("base64");
    const payload = await fetchJson(url.toString(), {
      headers: {
        Accept: "application/json",
        Authorization: `Basic ${auth}`,
        "User-Agent": "AlysResearchBot/0.1"
      }
    }, options.timeoutMs);

    // Accept either a bare array or an envelope with `datasets` / `results`.
    const items = Array.isArray(payload) ? payload : payload && typeof payload === "object" ? asArray(payload.datasets ?? payload.results) : [];

    return items.flatMap((item) => {
      if (!item || typeof item !== "object") return [];
      const object = item;
      // Only `ref` / `datasetRef` are used to build a dataset URL.
      // NOTE(review): `ownerName` looks like an owner name rather than an
      // owner/slug ref, so it is kept as a title fallback only — confirm
      // against the Kaggle response schema.
      const ref = firstString(object, ["ref", "datasetRef"]);
      const title = firstString(object, ["title", "subtitle", "ref"]) || ref || firstString(object, ["ownerName"]);
      const datasetUrl = firstString(object, ["url"]) || (ref ? `https://www.kaggle.com/datasets/${ref}` : "");
      if (!title || !datasetUrl) return [];

      const votes = firstNumber(object, ["voteCount", "votes"]) ?? 0;
      const downloads = firstNumber(object, ["downloadCount", "downloads"]) ?? 0;
      const usability = firstNumber(object, ["usabilityRating"]) ?? 0;
      // Log-scaled downloads and votes, usability capped at 1, plus a flat 0.1
      // base, passed through clamp01.
      const score = clamp01(Math.log10(downloads + 1) / 6 * 0.38 + Math.log10(votes + 1) / 5 * 0.24 + Math.min(1, usability) * 0.28 + 0.1);

      return [{
        title: `Kaggle: ${title}`,
        url: datasetUrl,
        // Empty parts are dropped so the snippet has no stray separators.
        snippet: [
          firstString(object, ["subtitle", "description"]),
          `Downloads: ${downloads}. Votes: ${votes}.`,
          usability ? `Usability: ${usability}.` : ""
        ].filter(Boolean).join(" "),
        publishedAt: firstString(object, ["lastUpdated", "creationDate"]),
        score,
        provider: this.name,
        query,
        raw: item
      }];
    });
  }
};
|
|
5158
5262
|
var SerpApiSearchProvider = class {
|
|
5159
5263
|
constructor(apiKey) {
|
|
5160
5264
|
this.apiKey = apiKey;
|
|
@@ -5294,6 +5398,8 @@ function buildResearchQueries(topic, count = 5) {
|
|
|
5294
5398
|
const normalized = topic.trim().replace(/\s+/g, " ");
|
|
5295
5399
|
const facets = [
|
|
5296
5400
|
normalized,
|
|
5401
|
+
`${normalized} dataset github kaggle benchmark`,
|
|
5402
|
+
`${normalized} public dataset csv jsonl parquet`,
|
|
5297
5403
|
`${normalized} official documentation standards methodology`,
|
|
5298
5404
|
`${normalized} research paper benchmark evaluation`,
|
|
5299
5405
|
`${normalized} case study operational data`,
|
|
@@ -5318,8 +5424,9 @@ ${result.url}`);
|
|
|
5318
5424
|
const freshnessScore = freshnessForDate(result.publishedAt);
|
|
5319
5425
|
const duplicationRisk = clamp01(Math.max(0, (domainCounts.get(domain) ?? 1) - 1) * 0.12);
|
|
5320
5426
|
const providerScore = normalizeProviderScore(result.score);
|
|
5427
|
+
const sourcePreference = sourcePreferenceScore(domain, result.url, result.provider);
|
|
5321
5428
|
const trustScore = clamp01(
|
|
5322
|
-
authority.score * 0.3 + relevanceScore * 0.27 + semanticScore * 0.18 + freshnessScore * 0.
|
|
5429
|
+
authority.score * 0.3 + relevanceScore * 0.27 + semanticScore * 0.18 + freshnessScore * 0.1 + providerScore * 0.07 + sourcePreference * 0.05 + (1 - duplicationRisk) * 0.05
|
|
5323
5430
|
);
|
|
5324
5431
|
return {
|
|
5325
5432
|
id: sourceId(result.url),
|
|
@@ -5339,7 +5446,12 @@ ${result.url}`);
|
|
|
5339
5446
|
semanticScore: Number(semanticScore.toFixed(3)),
|
|
5340
5447
|
trustScore: Number(trustScore.toFixed(3)),
|
|
5341
5448
|
sourceType: authority.type,
|
|
5342
|
-
qualitySignals:
|
|
5449
|
+
qualitySignals: [
|
|
5450
|
+
...authority.signals,
|
|
5451
|
+
...sourcePreference >= 0.85 ? ["preferred-source-surface"] : [],
|
|
5452
|
+
...result.provider === "github" ? ["github-repository-search"] : [],
|
|
5453
|
+
...result.provider === "kaggle" ? ["kaggle-dataset-search"] : []
|
|
5454
|
+
]
|
|
5343
5455
|
};
|
|
5344
5456
|
});
|
|
5345
5457
|
}
|
|
@@ -5549,6 +5661,14 @@ function authorityForDomain(domain, url) {
|
|
|
5549
5661
|
score = 0.78;
|
|
5550
5662
|
type = "official";
|
|
5551
5663
|
signals.push("official-documentation");
|
|
5664
|
+
} else if (host === "github.com") {
|
|
5665
|
+
score = 0.84;
|
|
5666
|
+
type = "code";
|
|
5667
|
+
signals.push("open-source-repository");
|
|
5668
|
+
} else if (host === "kaggle.com" || host.endsWith(".kaggle.com")) {
|
|
5669
|
+
score = 0.86;
|
|
5670
|
+
type = "dataset";
|
|
5671
|
+
signals.push("dataset-marketplace");
|
|
5552
5672
|
} else if (host.includes("wikipedia.org")) {
|
|
5553
5673
|
score = 0.62;
|
|
5554
5674
|
type = "community";
|
|
@@ -5564,6 +5684,18 @@ function authorityForDomain(domain, url) {
|
|
|
5564
5684
|
}
|
|
5565
5685
|
return { score, type, signals };
|
|
5566
5686
|
}
|
|
5687
|
+
/**
 * Returns a fixed preference weight for where a search result lives, based on
 * the provider that produced it, its host, and its URL.
 *
 * Rules are checked top to bottom and the first match wins.
 *
 * @param {string} domain Host name of the result (compared case-insensitively).
 * @param {string} url Full result URL (path checks are case-sensitive).
 * @param {string} provider Name of the provider that returned the result.
 * @returns {number} A weight between 0.24 and 0.96; 0.55 when nothing matches.
 */
function sourcePreferenceScore(domain, url, provider) {
  const host = String(domain ?? "").toLowerCase();
  const link = String(url ?? "");
  // Exact host or a true subdomain. A plain substring test would also accept
  // look-alike hosts such as "notkaggle.com" or "nist.gov.example.io".
  const onDomain = (name) => host === name || host.endsWith(`.${name}`);

  if (provider === "kaggle" || onDomain("kaggle.com")) return 0.96;
  if (provider === "github" || host === "github.com") return 0.93;
  // Every .gov host lands here, including nist.gov, sec.gov and data.gov.
  if (host.endsWith(".gov")) return 0.92;
  if (onDomain("arxiv.org") || host.endsWith(".edu") || onDomain("openalex.org")) return 0.88;
  // The dataset section is part of the URL path, never of the host, so it has
  // to be looked up in the URL.
  if (onDomain("huggingface.co") && link.includes("/datasets")) return 0.88;
  if (link.includes("/dataset")) return 0.84;
  if (host.includes("docs.") || link.includes("/docs/") || link.includes("/documentation/")) return 0.78;
  if (host.includes("reddit.") || host.includes("medium.") || host.includes("substack.")) return 0.24;
  return 0.55;
}
|
|
5567
5699
|
function freshnessForDate(value) {
|
|
5568
5700
|
if (!value) return 0.62;
|
|
5569
5701
|
const timestamp = Date.parse(value);
|
|
@@ -5658,6 +5790,8 @@ function env(key) {
|
|
|
5658
5790
|
return value || null;
|
|
5659
5791
|
}
|
|
5660
5792
|
var LOCAL_SOURCE_BASES = [
|
|
5793
|
+
{ label: "GitHub", url: "https://github.com/search", querySuffix: "q", signal: "Open-source repositories, examples, datasets, and benchmark code." },
|
|
5794
|
+
{ label: "Kaggle", url: "https://www.kaggle.com/datasets", querySuffix: "search", signal: "Public dataset catalog and dataset-level examples." },
|
|
5661
5795
|
{ label: "NIST", url: "https://www.nist.gov/search", querySuffix: "q", signal: "Technical guidance and standards language." },
|
|
5662
5796
|
{ label: "SEC", url: "https://www.sec.gov/search", querySuffix: "q", signal: "Regulatory filings and official disclosures." },
|
|
5663
5797
|
{ label: "PubMed", url: "https://pubmed.ncbi.nlm.nih.gov", querySuffix: "term", signal: "Academic and biomedical literature index." },
|
|
@@ -6128,13 +6262,235 @@ Source: ${result.source.url}
|
|
|
6128
6262
|
}
|
|
6129
6263
|
|
|
6130
6264
|
// ../../packages/prompts/src/index.ts
|
|
6131
|
-
|
|
6265
|
+
// Built-in dataset blueprints. Each entry is produced by blueprint() and lists
// the row fields, per-record writing instructions and quality bar for one kind
// of dataset. Entries are looked up by `id`; the first one is the generic
// instruction fine-tuning blueprint.
var DATASET_BLUEPRINTS = (() => {
  // Shorthands over field() that only fix the `required` flag.
  const req = (name, type, description, example) => field(name, type, description, true, example);
  const opt = (name, type, description, example) => field(name, type, description, false, example);

  const instructionFinetuneCorpus = blueprint({
    id: "instruction-finetune-corpus",
    label: "Instruction Fine-Tuning Corpus",
    description: "Task-and-answer examples for model fine-tuning, evaluator training, and assistant behavior shaping.",
    defaultType: "instruction",
    recommendedFormats: ["instruction", "jsonl", "csv", "markdown"],
    bestFor: ["fine-tuning", "assistant behavior", "domain task completion", "supervised training"],
    fields: [
      req("instruction", "string", "The user-facing task the model should perform.", "Handle a pricing objection from a VP of Sales."),
      req("input_context", "string", "The scenario, persona, constraints, or source-backed context for the instruction.", "Mid-market CRM buyer comparing annual contracts."),
      req("ideal_output", "string", "The high-quality answer or completion expected from the model.", "Acknowledge budget pressure, quantify missed pipeline risk, and offer a pilot path."),
      req("skill", "string", "The primary capability being trained.", "objection-handling"),
      opt("difficulty", "string", "Expected difficulty or complexity level.", "intermediate"),
      req("source_url", "string", "The source URL supporting this record.", "https://example.com/source"),
      req("confidence", "number", "Alys confidence score after source and quality checks.", "0.86")
    ],
    recordInstructions: [
      "Make the instruction actionable and directly trainable.",
      "Keep the output specific enough to teach a behavior, not a generic explanation.",
      "Include scenario/context when the answer depends on buyer role, domain, risk, or constraints."
    ],
    qualityBar: [
      "Record teaches one clear skill.",
      "Answer is useful without reading the source page.",
      "No repeated template phrasing across rows."
    ]
  });

  const ragCorpus = blueprint({
    id: "rag-corpus",
    label: "RAG Retrieval Corpus",
    description: "Retrieval-ready passages, grounded answers, and citation-friendly chunks for search or knowledge bases.",
    defaultType: "rag",
    recommendedFormats: ["rag", "jsonl", "csv", "markdown"],
    bestFor: ["RAG", "semantic search", "knowledge bases", "citation-aware QA"],
    fields: [
      req("query", "string", "A realistic retrieval query or user question.", "What evidence supports the SOC 2 access review requirement?"),
      req("answer", "string", "A compact answer grounded only in the retrieved context.", "The control requires periodic review of user access and documented exceptions."),
      req("chunk_text", "string", "The retrieval text chunk that should be embedded.", "Access reviews should be performed periodically..."),
      req("source_title", "string", "Human-readable source title.", "SOC 2 Criteria Overview"),
      req("source_url", "string", "Canonical source URL.", "https://example.com/source"),
      opt("citation_span", "string", "The source-backed phrase or section used as evidence.", "periodic review of user access"),
      req("confidence", "number", "Grounding confidence for the answer/chunk.", "0.91")
    ],
    recordInstructions: [
      "Make each context chunk standalone and retrieval-ready.",
      "Answer only what the source context supports.",
      "Prefer compact, evidence-rich chunks over long summaries."
    ],
    qualityBar: [
      "Chunk contains one coherent idea.",
      "Query is realistic, not keyword stuffing.",
      "Answer can be traced back to source context."
    ]
  });

  const evaluationQa = blueprint({
    id: "evaluation-qa",
    label: "Evaluation QA Dataset",
    description: "Question-answer records with expected answers, rubric signals, difficulty, and factual grounding.",
    defaultType: "qa",
    recommendedFormats: ["jsonl", "csv", "markdown"],
    bestFor: ["model evaluation", "benchmarking", "golden sets", "regression testing"],
    fields: [
      req("question", "string", "A precise evaluation question.", "Which mitigation should be used when duplicate records reduce retrieval diversity?"),
      req("expected_answer", "string", "The answer a model should produce.", "Apply semantic deduplication and retain records with stronger source support."),
      req("rubric", "string", "Criteria used to grade the model response.", "Must mention deduplication, source support, and diversity impact."),
      req("difficulty", "string", "Difficulty band.", "hard"),
      opt("failure_modes", "array", "Likely wrong answers to catch.", "hallucinated metric, unsupported source claim"),
      req("source_url", "string", "Evidence source URL.", "https://example.com/source"),
      req("confidence", "number", "Grounding confidence.", "0.88")
    ],
    recordInstructions: [
      "Write questions that test reasoning, not memorization only.",
      "Include rubric-like constraints in the answer or context.",
      "Add failure-mode awareness when the source contains ambiguity."
    ],
    qualityBar: [
      "Question has one defensible expected answer.",
      "Rubric exposes what a weak model would miss.",
      "Evidence is visible in context."
    ]
  });

  const b2bSaasObjections = blueprint({
    id: "b2b-saas-objections",
    label: "B2B SaaS Objection Handling",
    description: "Sales-training records for pricing, procurement, ROI, onboarding, integration, security, and competitive objections.",
    defaultType: "instruction",
    recommendedFormats: ["instruction", "jsonl", "csv", "markdown"],
    bestFor: ["sales enablement", "roleplay training", "support coaching", "fine-tuning"],
    fields: [
      req("buyer_role", "string", "The buyer or stakeholder persona.", "CFO"),
      req("company_segment", "string", "Customer segment or buying context.", "mid-market SaaS"),
      req("objection_category", "string", "Primary objection class.", "pricing"),
      req("objection", "string", "The exact buyer objection.", "This is too expensive compared to our current tool."),
      req("recommended_response", "string", "The ideal grounded response.", "Tie cost to pipeline leakage and propose a measured pilot."),
      opt("proof_point", "string", "Evidence or reasoning used in the response.", "integration time and support burden"),
      req("follow_up_question", "string", "A next-step question that advances discovery.", "What cost are you currently assigning to delayed handoffs?")
    ],
    recordInstructions: [
      "Make objections sound like real buyers, not canned sales scripts.",
      "Answer with empathy, evidence, and a concrete next step.",
      "Vary buyer role, deal stage, company size, and risk profile."
    ],
    qualityBar: [
      "Response addresses the objection directly.",
      "No fake statistics or logos.",
      "Follow-up question is useful in a real sales call."
    ]
  });

  const technicalDocsQa = blueprint({
    id: "technical-docs-qa",
    label: "Technical Documentation QA",
    description: "Developer-facing questions, answers, commands, prerequisites, errors, and implementation details grounded in docs or repos.",
    defaultType: "qa",
    recommendedFormats: ["jsonl", "csv", "markdown", "rag"],
    bestFor: ["developer docs", "support bots", "SDK QA", "RAG corpora"],
    fields: [
      req("product_area", "string", "The API, package, CLI, or feature area.", "CLI authentication"),
      req("question", "string", "Developer question or task.", "How do I authenticate the CLI?"),
      req("answer", "string", "Specific answer grounded in documentation.", "Run npx alys-akusa login and finish the browser flow."),
      opt("code_or_command", "string", "Relevant command, code, or config.", "npx alys-akusa login"),
      opt("prerequisites", "array", "Required setup before this works.", "Alys account, browser access"),
      opt("common_error", "string", "Likely failure mode or troubleshooting note.", "Expired CLI login session."),
      req("source_url", "string", "Documentation or repository URL.", "https://example.com/docs")
    ],
    recordInstructions: [
      "Prefer commands, parameters, return shapes, limits, and edge cases.",
      "Never invent API names or package names.",
      "If docs are ambiguous, make the uncertainty visible."
    ],
    qualityBar: [
      "Answer can be executed or verified.",
      "No fake APIs.",
      "Source context includes the relevant command or behavior."
    ]
  });

  const legalComplianceRag = blueprint({
    id: "legal-compliance-rag",
    label: "Legal & Compliance RAG Corpus",
    description: "Compliance-aware chunks and QA with jurisdictions, requirements, controls, exceptions, and evidence notes.",
    defaultType: "rag",
    recommendedFormats: ["rag", "jsonl", "csv", "markdown"],
    bestFor: ["compliance search", "policy QA", "audit prep", "legal retrieval"],
    fields: [
      opt("jurisdiction", "string", "Relevant jurisdiction or regulatory scope.", "United States"),
      req("requirement", "string", "The rule, obligation, or policy requirement.", "Maintain access review evidence."),
      req("control_or_action", "string", "Concrete action, control, or procedure.", "Review user access quarterly and document exceptions."),
      opt("exception_or_limit", "string", "Boundary, exception, or uncertainty.", "Frequency may vary by framework."),
      req("evidence_text", "string", "Source-backed evidence span.", "periodic access reviews"),
      req("source_url", "string", "Canonical source URL.", "https://example.com/policy"),
      req("confidence", "number", "Confidence after source checks.", "0.84")
    ],
    recordInstructions: [
      "Preserve scope and limitations.",
      "Do not convert legal text into absolute advice when the source is conditional.",
      "Separate requirement, control, and exception clearly."
    ],
    qualityBar: [
      "Jurisdiction/scope is not blurred.",
      "No invented legal conclusions.",
      "Evidence text supports the answer."
    ]
  });

  const engineeringTelemetry = blueprint({
    id: "engineering-telemetry",
    label: "Engineering Telemetry & Operations",
    description: "Operational records for engineering systems: parameters, units, ranges, anomalies, recommendations, and safety constraints.",
    defaultType: "instruction",
    recommendedFormats: ["jsonl", "csv", "instruction", "markdown"],
    bestFor: ["industrial AI", "predictive maintenance", "ops training", "engineering assistants"],
    fields: [
      req("asset_type", "string", "Equipment, system, or asset class.", "oil pipeline pump station"),
      req("operation_phase", "string", "Workflow or operating phase.", "commissioning"),
      req("parameter", "string", "Observed parameter or measurement.", "pressure differential"),
      opt("unit", "string", "Engineering unit.", "psi"),
      opt("normal_range", "string", "Source-backed or cautiously inferred expected range.", "site-specific; verify against design docs"),
      req("abnormal_signal", "string", "Failure pattern, warning, or anomaly.", "rising vibration with falling flow rate"),
      req("recommended_action", "string", "Operationally safe next action.", "inspect pump seals and verify sensor calibration"),
      req("safety_note", "string", "Boundary or caution.", "do not exceed site operating procedures")
    ],
    recordInstructions: [
      "Keep units, ranges, and recommendations physically plausible.",
      "If numeric ranges are not in source context, say they require site-specific verification.",
      "Include normal, abnormal, edge, and incident scenarios."
    ],
    qualityBar: [
      "No invented unsafe operating limits.",
      "Action follows from the signal.",
      "Record includes constraints and uncertainty."
    ]
  });

  const datasetSourceCatalog = blueprint({
    id: "dataset-source-catalog",
    label: "Dataset Source Catalog",
    description: "Catalog records for public datasets, repositories, benchmark corpora, licenses, schemas, and use constraints.",
    defaultType: "qa",
    recommendedFormats: ["jsonl", "csv", "markdown"],
    bestFor: ["dataset discovery", "benchmark planning", "source audits", "training-data procurement"],
    fields: [
      req("source_name", "string", "Dataset/repository/source name.", "SDV benchmark dataset"),
      req("source_url", "string", "Canonical URL.", "https://github.com/sdv-dev/SDV"),
      req("source_type", "string", "Dataset, repository, benchmark, paper, documentation, or registry.", "repository"),
      opt("domain", "string", "Domain or category.", "synthetic data"),
      opt("available_formats", "array", "Known available formats.", "csv, jsonl, parquet"),
      opt("license_or_terms", "string", "License or usage constraints when available.", "MIT"),
      req("schema_summary", "string", "Short description of fields/tables/tasks.", "Benchmark suite for tabular synthetic data."),
      req("trust_reason", "string", "Why this source is credible or useful.", "Primary GitHub repository with active documentation.")
    ],
    recordInstructions: [
      "Favor GitHub, Kaggle, Hugging Face datasets, official benchmark pages, and primary repositories.",
      "Do not claim license, row count, or schema details unless visible in source context.",
      "Make the catalog immediately usable for dataset selection."
    ],
    qualityBar: [
      "Every row points to a real source URL.",
      "License/format claims are source-backed or marked unknown.",
      "Trust reason is explicit."
    ]
  });

  // Order matters: index 0 is used as the last-resort default elsewhere.
  return [
    instructionFinetuneCorpus,
    ragCorpus,
    evaluationQa,
    b2bSaasObjections,
    technicalDocsQa,
    legalComplianceRag,
    engineeringTelemetry,
    datasetSourceCatalog
  ];
})();
|
|
6485
|
+
function buildDatasetGenerationPlan(topic, datasetType) {
|
|
6132
6486
|
const normalized = topic.trim().replace(/\s+/g, " ");
|
|
6133
6487
|
const lower = normalized.toLowerCase();
|
|
6134
6488
|
const segments = knownSegments(lower, normalized) ?? genericSegments(normalized);
|
|
6489
|
+
const blueprint2 = selectDatasetBlueprint(normalized, datasetType);
|
|
6135
6490
|
return {
|
|
6136
6491
|
topic: normalized,
|
|
6137
6492
|
intent: inferIntent(lower),
|
|
6493
|
+
blueprint: blueprint2,
|
|
6138
6494
|
segments,
|
|
6139
6495
|
sourceCriteria: [
|
|
6140
6496
|
"Prefer official documentation, standards bodies, academic papers, technical reports, government sources, and primary company docs.",
|
|
@@ -6149,12 +6505,41 @@ function buildDatasetGenerationPlan(topic) {
|
|
|
6149
6505
|
"Repeated phrasing across records indicates mode collapse and must be suppressed."
|
|
6150
6506
|
],
|
|
6151
6507
|
outputSchemaNotes: [
|
|
6508
|
+
`Use the "${blueprint2.label}" blueprint (${blueprint2.id}) as the canonical row contract.`,
|
|
6509
|
+
`Recommended exports: ${blueprint2.recommendedFormats.join(", ")}.`,
|
|
6510
|
+
...blueprint2.fields.map((item) => `${item.required ? "Required" : "Optional"} field: ${item.name} (${item.type}) - ${item.description}`),
|
|
6152
6511
|
"Each record needs an actionable input, specific output, source context, source URL, confidence, tags, and metadata.",
|
|
6153
6512
|
"Metadata should include source trust, source authority, source relevance, segment ID, quality signals, and benchmark notes.",
|
|
6154
6513
|
"Records should be useful without the original source page while still preserving provenance."
|
|
6155
6514
|
]
|
|
6156
6515
|
};
|
|
6157
6516
|
}
|
|
6517
|
+
/**
 * Picks the dataset blueprint that best fits a topic.
 *
 * Keyword rules are checked in priority order and the first match wins; when
 * no keyword matches, `datasetType` decides ("rag" → rag-corpus, "qa" →
 * evaluation-qa, anything else → instruction-finetune-corpus).
 *
 * Keywords are matched on word boundaries rather than as raw substrings.
 * Substring matching made "retrieval" and "medieval" hit the "eval" rule
 * (so the later retrieval rule could never fire), "climate" hit "cli",
 * "rapid" hit "api", "storage" hit "rag" and "soil" hit "oil".
 *
 * @param {string} topic Topic text; matched case-insensitively.
 * @param {string} [datasetType] Requested dataset type, used only as fallback.
 * @returns {object} The selected blueprint, or the first entry of
 *   DATASET_BLUEPRINTS if the chosen id is not registered.
 */
function selectDatasetBlueprint(topic, datasetType) {
  const lower = topic.toLowerCase();
  // Whole word, optionally pluralised. Used for short acronyms, where a prefix
  // match would still be too loose ("cli" vs "client").
  const word = (term) => new RegExp(`(?<![a-z0-9])${term}(?:e?s)?(?![a-z0-9])`).test(lower);
  // Must start a word but may continue ("eval" → "evaluation", "evals").
  const stem = (term) => new RegExp(`(?<![a-z0-9])${term}`).test(lower);

  let id;
  if ((word("b2b") || stem("sales")) && stem("objection")) {
    id = "b2b-saas-objections";
  } else if (stem("legal") || stem("compliance") || stem("soc 2") || word("hipaa")) {
    id = "legal-compliance-rag";
  } else if (stem("oil") || word("gas") || stem("pipeline") || stem("drilling") || stem("telemetry") || stem("maintenance")) {
    id = "engineering-telemetry";
  } else if (stem("documentation") || stem("developer") || word("api") || stem("openapi") || word("sdk") || word("cli")) {
    // "openapi" is listed explicitly because whole-word "api" no longer
    // matches inside it.
    id = "technical-docs-qa";
  } else if (stem("benchmark") || stem("eval")) {
    // The "eval" stem also covers "evaluation".
    id = "evaluation-qa";
  } else if (stem("source catalog") || stem("public dataset") || stem("kaggle") || stem("github dataset")) {
    id = "dataset-source-catalog";
  } else if (word("rag") || stem("retrieval") || stem("knowledge base")) {
    id = "rag-corpus";
  } else if (datasetType === "rag") {
    id = "rag-corpus";
  } else if (datasetType === "qa") {
    id = "evaluation-qa";
  } else {
    id = "instruction-finetune-corpus";
  }

  return byId(id) ?? DATASET_BLUEPRINTS[0];
}
|
|
6531
|
+
/**
 * Looks up a registered dataset blueprint by its `id`.
 *
 * @param {string} id Blueprint identifier to look for.
 * @returns {object|undefined} The first blueprint in DATASET_BLUEPRINTS whose
 *   `id` is strictly equal to `id`, or undefined when there is none.
 */
function byId(id) {
  for (const candidate of DATASET_BLUEPRINTS) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return undefined;
}
|
|
6534
|
+
/**
 * Creates a blueprint record from `input`, filling in a default version.
 *
 * @param {object} input Blueprint properties; copied shallowly, not mutated.
 * @returns {object} A new object with every own enumerable property of
 *   `input`, where `version` is "1.0.0" if `input.version` is null or undefined.
 */
function blueprint(input) {
  const copy = { ...input };
  copy.version = input.version ?? "1.0.0";
  return copy;
}
|
|
6540
|
+
/**
 * Builds one field descriptor for a dataset blueprint.
 *
 * @param {string} name Field name.
 * @param {string} type Field type label.
 * @param {string} description Human-readable meaning of the field.
 * @param {boolean} required Whether the field is mandatory.
 * @param {string} example Sample value for the field.
 * @returns {{name: string, type: string, description: string, required: boolean, example: string}}
 *   A plain object carrying the five arguments under the same names.
 */
function field(name, type, description, required, example) {
  const descriptor = {};
  descriptor.name = name;
  descriptor.type = type;
  descriptor.description = description;
  descriptor.required = required;
  descriptor.example = example;
  return descriptor;
}
|
|
6158
6543
|
function segmentForSource(plan, text, index = 0) {
|
|
6159
6544
|
const haystack = text.toLowerCase();
|
|
6160
6545
|
const scored = plan.segments.map((segment2) => {
|
|
@@ -6267,9 +6652,12 @@ Your job is to generate production-grade synthetic dataset records that can surv
|
|
|
6267
6652
|
Hard rules:
|
|
6268
6653
|
- Return only valid JSON matching the requested schema.
|
|
6269
6654
|
- Each record must be meaningfully different.
|
|
6655
|
+
- Every record must be grounded in the provided source context.
|
|
6656
|
+
- Do not introduce facts, numbers, names, benchmarks, URLs, citations, or claims that are not present in the source context.
|
|
6270
6657
|
- Preserve plausible domain physics, operational constraints, and realistic terminology.
|
|
6271
6658
|
- Prefer specific scenarios, values, failure modes, and edge cases over generic prose.
|
|
6272
|
-
-
|
|
6659
|
+
- If the source context is thin, produce cautious records about what can be inferred and what needs verification.
|
|
6660
|
+
- Do not include fake citations. Source URL and source title are attached outside your JSON.
|
|
6273
6661
|
- Never output placeholders, TODOs, markdown fences, or explanations.
|
|
6274
6662
|
`.trim();
|
|
6275
6663
|
function loadAlysEnv(cwd = process.cwd()) {
|
|
@@ -6361,6 +6749,9 @@ Active topic segment:
|
|
|
6361
6749
|
Generation plan intent:
|
|
6362
6750
|
${options.generationPlan.intent}
|
|
6363
6751
|
|
|
6752
|
+
Dataset blueprint:
|
|
6753
|
+
${renderBlueprintForPrompt(options.generationPlan)}
|
|
6754
|
+
|
|
6364
6755
|
Required source criteria:
|
|
6365
6756
|
${options.generationPlan.sourceCriteria.map((item) => `- ${item}`).join("\n")}
|
|
6366
6757
|
|
|
@@ -6394,6 +6785,9 @@ ${segmentBlock}
|
|
|
6394
6785
|
Generate exactly ${targetCount} records.
|
|
6395
6786
|
|
|
6396
6787
|
Domain quality requirements:
|
|
6788
|
+
- Ground every record in the source context below. If a detail is not in the context, do not invent it.
|
|
6789
|
+
- Use the dataset blueprint above as the semantic row contract. Do not waste tokens describing CSV/JSONL formatting; Alys handles export formatting.
|
|
6790
|
+
- Put the most important blueprint field values into input, output, and context. Add blueprint field names to metadata.signals or metadata.constraints when useful.
|
|
6397
6791
|
- Include realistic parameters, edge cases, constraints, and operational variability.
|
|
6398
6792
|
- For engineering domains, use physically plausible values and causal relationships.
|
|
6399
6793
|
- For tabular/synthetic-data domains, preserve schema-like consistency and row-level diversity.
|
|
@@ -6401,6 +6795,7 @@ Domain quality requirements:
|
|
|
6401
6795
|
- Avoid repeating the topic verbatim in every output.
|
|
6402
6796
|
- Reflect source quality: lower-confidence sources should create cautious, verification-aware records.
|
|
6403
6797
|
- If the source is low authority, write records as verification-aware training examples rather than confident factual claims.
|
|
6798
|
+
- Prefer source-backed dataset rows over broad advice. If the context only supports broad advice, lower confidence.
|
|
6404
6799
|
- Every output should contain a specific decision signal, scenario, constraint, or useful answer. Reject generic summaries.
|
|
6405
6800
|
- Prefer records that can be directly reused for fine-tuning, RAG evaluation, or QA benchmarks.
|
|
6406
6801
|
|
|
@@ -6425,8 +6820,29 @@ Return JSON with this shape:
|
|
|
6425
6820
|
|
|
6426
6821
|
Source context:
|
|
6427
6822
|
${sourceText}
|
|
6823
|
+
|
|
6824
|
+
Final grounding rule:
|
|
6825
|
+
Only use information supported by Source title, Source URL, Source quality, and Source context above.
|
|
6428
6826
|
`.trim();
|
|
6429
6827
|
}
|
|
6828
|
+
/**
 * Render the blueprint attached to a generation plan as plain text for
 * inclusion in the provider prompt.
 *
 * The output lists the blueprint header (id, label, version, purpose, best-for),
 * then one bullet per field, per record instruction, and per quality-bar item.
 *
 * @param {object} plan - Generation plan; `plan.blueprint` must be present.
 * @returns {string} Newline-joined description of the blueprint.
 */
function renderBlueprintForPrompt(plan) {
  const spec = plan.blueprint;
  const asBullet = (item) => `- ${item}`;
  const describeField = (entry) => {
    const requirement = entry.required ? ", required" : ", optional";
    // An example is only mentioned when the field carries a truthy one.
    const exampleSuffix = entry.example ? ` Example: ${entry.example}` : "";
    return `- ${entry.name} (${entry.type}${requirement}): ${entry.description}${exampleSuffix}`;
  };

  const sections = [];
  sections.push(`ID: ${spec.id}`);
  sections.push(`Label: ${spec.label}`);
  sections.push(`Version: ${spec.version}`);
  sections.push(`Purpose: ${spec.description}`);
  sections.push(`Best for: ${spec.bestFor.join(", ")}`);
  sections.push("Fields:", ...spec.fields.map((entry) => describeField(entry)));
  sections.push("Record instructions:", ...spec.recordInstructions.map((item) => asBullet(item)));
  sections.push("Quality bar:", ...spec.qualityBar.map((item) => asBullet(item)));
  return sections.join("\n");
}
|
|
6430
6846
|
async function generateWithOpenAI(options) {
|
|
6431
6847
|
const apiKey = getOpenAIKey();
|
|
6432
6848
|
const model = process.env.ALYS_GENERATOR_MODEL || DEFAULT_OPENAI_MODEL;
|
|
@@ -6447,7 +6863,7 @@ async function generateWithOpenAI(options) {
|
|
|
6447
6863
|
},
|
|
6448
6864
|
body: JSON.stringify({
|
|
6449
6865
|
model,
|
|
6450
|
-
temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.
|
|
6866
|
+
temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.28),
|
|
6451
6867
|
max_tokens: Math.min(32768, Math.max(1600, options.targetCount * 520)),
|
|
6452
6868
|
messages: [
|
|
6453
6869
|
{ role: "system", content: ALYS_RECORD_SYSTEM_PROMPT },
|
|
@@ -6473,26 +6889,29 @@ async function generateWithOpenAI(options) {
|
|
|
6473
6889
|
provider: "openai",
|
|
6474
6890
|
model,
|
|
6475
6891
|
latencyMs: Date.now() - startedAt,
|
|
6476
|
-
records:
|
|
6892
|
+
records: groundProviderRecords(
|
|
6893
|
+
parseProviderRecords(content, options.baselineConfidence, options.datasetType),
|
|
6894
|
+
options.document
|
|
6895
|
+
)
|
|
6477
6896
|
};
|
|
6478
6897
|
}
|
|
6479
6898
|
async function generateWithOpenAIBatched(options) {
|
|
6480
|
-
const batchSize = Math.max(1, Math.min(
|
|
6481
|
-
const batches =
|
|
6482
|
-
|
|
6483
|
-
|
|
6484
|
-
|
|
6485
|
-
|
|
6486
|
-
|
|
6487
|
-
|
|
6488
|
-
|
|
6899
|
+
const batchSize = Math.max(1, Math.min(48, Number(process.env.ALYS_OPENAI_RECORDS_PER_CALL ?? 8)));
|
|
6900
|
+
const batches = Array.from(
|
|
6901
|
+
{ length: Math.ceil(options.targetCount / batchSize) },
|
|
6902
|
+
(_, index) => Math.min(batchSize, Math.max(0, options.targetCount - index * batchSize))
|
|
6903
|
+
).filter((count) => count > 0);
|
|
6904
|
+
const results = await mapLimit(
|
|
6905
|
+
batches,
|
|
6906
|
+
Number(process.env.ALYS_PROVIDER_BATCH_CONCURRENCY ?? 3),
|
|
6907
|
+
(count) => generateWithOpenAI({
|
|
6489
6908
|
...options,
|
|
6490
|
-
targetCount:
|
|
6491
|
-
})
|
|
6492
|
-
|
|
6493
|
-
|
|
6494
|
-
|
|
6495
|
-
|
|
6909
|
+
targetCount: count
|
|
6910
|
+
})
|
|
6911
|
+
);
|
|
6912
|
+
const records = results.flatMap((result) => result.records);
|
|
6913
|
+
const latencyMs = results.reduce((sum, result) => sum + result.latencyMs, 0);
|
|
6914
|
+
const model = results[0]?.model ?? process.env.ALYS_GENERATOR_MODEL ?? DEFAULT_OPENAI_MODEL;
|
|
6496
6915
|
return {
|
|
6497
6916
|
provider: "openai",
|
|
6498
6917
|
model,
|
|
@@ -6520,7 +6939,7 @@ async function generateWithGroq(options) {
|
|
|
6520
6939
|
},
|
|
6521
6940
|
body: JSON.stringify({
|
|
6522
6941
|
model,
|
|
6523
|
-
temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.
|
|
6942
|
+
temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.25),
|
|
6524
6943
|
max_tokens: Math.min(8192, Math.max(1600, options.targetCount * 520)),
|
|
6525
6944
|
messages: [
|
|
6526
6945
|
{ role: "system", content: ALYS_RECORD_SYSTEM_PROMPT },
|
|
@@ -6539,26 +6958,29 @@ async function generateWithGroq(options) {
|
|
|
6539
6958
|
provider: "groq",
|
|
6540
6959
|
model,
|
|
6541
6960
|
latencyMs: Date.now() - startedAt,
|
|
6542
|
-
records:
|
|
6961
|
+
records: groundProviderRecords(
|
|
6962
|
+
parseProviderRecords(content, options.baselineConfidence, options.datasetType),
|
|
6963
|
+
options.document
|
|
6964
|
+
)
|
|
6543
6965
|
};
|
|
6544
6966
|
}
|
|
6545
6967
|
async function generateWithGroqBatched(options) {
|
|
6546
|
-
const batchSize = Math.max(1, Math.min(
|
|
6547
|
-
const batches =
|
|
6548
|
-
|
|
6549
|
-
|
|
6550
|
-
|
|
6551
|
-
|
|
6552
|
-
|
|
6553
|
-
|
|
6554
|
-
|
|
6968
|
+
const batchSize = Math.max(1, Math.min(32, Number(process.env.ALYS_GROQ_RECORDS_PER_CALL ?? 8)));
|
|
6969
|
+
const batches = Array.from(
|
|
6970
|
+
{ length: Math.ceil(options.targetCount / batchSize) },
|
|
6971
|
+
(_, index) => Math.min(batchSize, Math.max(0, options.targetCount - index * batchSize))
|
|
6972
|
+
).filter((count) => count > 0);
|
|
6973
|
+
const results = await mapLimit(
|
|
6974
|
+
batches,
|
|
6975
|
+
Number(process.env.ALYS_PROVIDER_BATCH_CONCURRENCY ?? 3),
|
|
6976
|
+
(count) => generateWithGroq({
|
|
6555
6977
|
...options,
|
|
6556
|
-
targetCount:
|
|
6557
|
-
})
|
|
6558
|
-
|
|
6559
|
-
|
|
6560
|
-
|
|
6561
|
-
|
|
6978
|
+
targetCount: count
|
|
6979
|
+
})
|
|
6980
|
+
);
|
|
6981
|
+
const records = results.flatMap((result) => result.records);
|
|
6982
|
+
const latencyMs = results.reduce((sum, result) => sum + result.latencyMs, 0);
|
|
6983
|
+
const model = results[0]?.model ?? process.env.GROQ_MODEL ?? process.env.ALYS_GENERATOR_MODEL ?? DEFAULT_GROQ_MODEL;
|
|
6562
6984
|
return {
|
|
6563
6985
|
provider: "groq",
|
|
6564
6986
|
model,
|
|
@@ -6566,6 +6988,20 @@ async function generateWithGroqBatched(options) {
|
|
|
6566
6988
|
records: records.slice(0, options.targetCount)
|
|
6567
6989
|
};
|
|
6568
6990
|
}
|
|
6991
|
+
/**
 * Map `items` through an async `worker` with a cap on concurrent calls.
 *
 * Results are stored at the index of their input, so the returned array keeps
 * input order regardless of completion order.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} limit - Maximum number of concurrent workers; falsy values
 *   (0, NaN, undefined) fall back to 1, and the count never exceeds `items.length`.
 * @param {(item: *, index: number) => Promise<*>} worker - Called once per item.
 * @returns {Promise<Array>} Worker results, index-aligned with `items`.
 */
async function mapLimit(items, limit, worker) {
  const output = new Array(items.length);
  const laneCount = Math.max(1, Math.min(items.length, Math.floor(limit || 1)));
  let cursor = 0;

  // Each lane claims the next unclaimed index before awaiting, so no item is
  // processed twice even though all lanes share `cursor`.
  const drain = async () => {
    for (;;) {
      const claimed = cursor;
      if (!(claimed < items.length)) return;
      cursor = claimed + 1;
      output[claimed] = await worker(items[claimed], claimed);
    }
  };

  const lanes = [];
  for (let lane = 0; lane < laneCount; lane += 1) {
    lanes.push(drain());
  }
  await Promise.all(lanes);
  return output;
}
|
|
6569
7005
|
function providerRecordSchema() {
|
|
6570
7006
|
return {
|
|
6571
7007
|
type: "object",
|
|
@@ -6635,6 +7071,78 @@ function normalizeMetadata(value) {
|
|
|
6635
7071
|
benchmark_notes: Array.isArray(obj.benchmark_notes) ? obj.benchmark_notes.map(String).slice(0, 12) : []
|
|
6636
7072
|
};
|
|
6637
7073
|
}
|
|
7074
|
+
/**
 * Drop provider records that share too little vocabulary with the source
 * document, and annotate the survivors with grounding metadata.
 *
 * The grounding score is the number of record tokens (taken from `input`,
 * `output` and `context`) that also occur in the source text, divided by the
 * smaller of the two token-set sizes. Records scoring below the minimum
 * (0.025 when the source text contains "fallback reason:", otherwise 0.04)
 * are discarded. Surviving records have their confidence multiplied by a
 * factor between 0.76 and 1.0 that grows with the grounding score, then
 * passed through `clamp012` (presumably a clamp to [0, 1]; defined elsewhere).
 *
 * @param {Array<object>} records - Parsed provider records.
 * @param {object} document - Source document; `text` and `url` are read.
 * @returns {Array<object>} Copies of the grounded records with updated
 *   `confidence`, `metadata.signals` and `metadata.constraints`. Empty when
 *   the source yields fewer than 8 meaningful tokens.
 */
function groundProviderRecords(records, document) {
  // A missing source text is treated as empty, which rejects every record
  // instead of throwing inside the tokenizer.
  const sourceText = String(document.text ?? "");
  const sourceTokens = meaningfulTokens(sourceText);
  if (sourceTokens.size < 8) return [];
  const sourceIsFallback = /fallback reason:/i.test(sourceText);
  const minimumGrounding = sourceIsFallback ? 0.025 : 0.04;
  const listLimit = 12;
  // Keep the list capped at `listLimit` but reserve room for the markers:
  // appending first and slicing afterwards would silently drop them whenever
  // the record already carries a full list.
  const appendMarkers = (existing, markers) => {
    const kept = Array.from(new Set(existing ?? [])).filter((item) => !markers.includes(item));
    return [...kept.slice(0, Math.max(0, listLimit - markers.length)), ...markers];
  };
  return records.flatMap((record) => {
    const recordTokens = meaningfulTokens(`${record.input} ${record.output} ${record.context}`);
    if (!recordTokens.size) return [];
    let overlap = 0;
    for (const token of recordTokens) {
      if (sourceTokens.has(token)) overlap += 1;
    }
    const groundingScore = overlap / Math.max(1, Math.min(recordTokens.size, sourceTokens.size));
    if (groundingScore < minimumGrounding) return [];
    const confidence = clamp012(record.confidence * (0.76 + Math.min(0.24, groundingScore * 2.4)));
    return [{
      ...record,
      confidence,
      metadata: {
        ...record.metadata,
        signals: appendMarkers(record.metadata?.signals, [
          `source-grounding:${groundingScore.toFixed(3)}`,
          `source-url:${document.url}`
        ]),
        constraints: appendMarkers(record.metadata?.constraints, [
          "accepted-after-source-grounding-check"
        ])
      }
    }];
  });
}
|
|
7107
|
+
/**
 * Extract the set of "meaningful" lowercase tokens from a piece of text.
 *
 * A token starts with a letter or digit and continues with letters, digits or
 * `.`, `_`, `/`, `-` (at least three characters in total). Trailing
 * punctuation is trimmed, tokens shorter than four characters and stopwords
 * are dropped, and at most the first 900 remaining occurrences are kept
 * before de-duplication.
 *
 * @param {*} value - Text to tokenize. `null`/`undefined` are treated as empty
 *   text and other non-strings are stringified, so the function never throws
 *   on missing input.
 * @returns {Set<string>} Unique tokens in order of first appearance.
 */
function meaningfulTokens(value) {
  const stopwords = /* @__PURE__ */ new Set([
    "about",
    "after",
    "also",
    "because",
    "before",
    "being",
    "between",
    "could",
    "dataset",
    "during",
    "every",
    "from",
    "have",
    "into",
    "more",
    "only",
    "other",
    "should",
    "source",
    "that",
    "their",
    "there",
    "these",
    "this",
    "through",
    "using",
    "what",
    "when",
    "where",
    "which",
    "with",
    "would"
  ]);
  const text = value === null || value === undefined ? "" : String(value);
  const matches = text.toLowerCase().match(/[a-z0-9][a-z0-9._/-]{2,}/g) ?? [];
  // Every match starts with [a-z0-9], so only trailing punctuation can need trimming.
  const tokens = matches
    .map((token) => token.replace(/[._/-]+$/, ""))
    .filter((token) => token.length >= 4 && !stopwords.has(token))
    .slice(0, 900);
  return new Set(tokens);
}
|
|
6638
7146
|
function safeJson(text) {
|
|
6639
7147
|
const trimmed = text.trim();
|
|
6640
7148
|
try {
|
|
@@ -6680,7 +7188,7 @@ var DiscoveryAgent = class {
|
|
|
6680
7188
|
var ExtractionAgent = class {
|
|
6681
7189
|
name = "ExtractionAgent";
|
|
6682
7190
|
async run(sources) {
|
|
6683
|
-
const crawls = await
|
|
7191
|
+
const crawls = await mapLimit2(
|
|
6684
7192
|
sources,
|
|
6685
7193
|
Math.max(1, Number(process.env.ALYS_CRAWL_CONCURRENCY ?? 6)),
|
|
6686
7194
|
(source) => crawlSource(source)
|
|
@@ -6754,7 +7262,7 @@ var StructuringAgent = class {
|
|
|
6754
7262
|
}
|
|
6755
7263
|
const concurrency = Math.max(1, Math.min(documents.length, Number(process.env.ALYS_PROVIDER_CONCURRENCY ?? 3)));
|
|
6756
7264
|
let totalGenerated = 0;
|
|
6757
|
-
const grouped = await
|
|
7265
|
+
const grouped = await mapLimit2(documents, concurrency, async (document, index) => {
|
|
6758
7266
|
const finding = findings[index] ?? findings[0];
|
|
6759
7267
|
const trustScore = document.sourceScores?.trustScore ?? 0.62;
|
|
6760
7268
|
const authorityScore = document.sourceScores?.authorityScore ?? 0.55;
|
|
@@ -6763,6 +7271,7 @@ var StructuringAgent = class {
|
|
|
6763
7271
|
if (trustScore < (options.minTrustScore ?? 0.42) || relevanceScore < (options.minRelevanceScore ?? 0.24)) return [];
|
|
6764
7272
|
const sourceWeight = sourceQualityWeight(document);
|
|
6765
7273
|
const segment2 = options.generationPlan ? segmentForSource(options.generationPlan, `${document.title} ${document.text}`, index) : void 0;
|
|
7274
|
+
const blueprint2 = options.generationPlan?.blueprint;
|
|
6766
7275
|
const baselineConfidence = clamp013((finding?.confidence ?? 0.7) * 0.55 + trustScore * 0.22 + authorityScore * 0.12 + relevanceScore * 0.11 - duplicationRisk * 0.08);
|
|
6767
7276
|
const baseId = import_node_crypto3.default.createHash("sha1").update(`${topic}:${document.url}:${datasetType}`).digest("hex").slice(0, 14);
|
|
6768
7277
|
const providerTarget = useProvider ? weightedRecordTarget(recordsPerDocument, options.providerRecordsPerDocument ?? recordsPerDocument, sourceWeight) : 0;
|
|
@@ -6795,6 +7304,10 @@ var StructuringAgent = class {
|
|
|
6795
7304
|
...g.metadata,
|
|
6796
7305
|
topic,
|
|
6797
7306
|
kind: datasetType,
|
|
7307
|
+
blueprint_id: blueprint2?.id,
|
|
7308
|
+
blueprint_label: blueprint2?.label,
|
|
7309
|
+
blueprint_version: blueprint2?.version,
|
|
7310
|
+
blueprint_fields: blueprint2?.fields.map((field2) => field2.name),
|
|
6798
7311
|
segment_id: segment2?.id,
|
|
6799
7312
|
segment_label: segment2?.label,
|
|
6800
7313
|
provider: providerResult.provider,
|
|
@@ -6830,7 +7343,7 @@ function domainFromUrl3(url) {
|
|
|
6830
7343
|
return "unknown";
|
|
6831
7344
|
}
|
|
6832
7345
|
}
|
|
6833
|
-
async function
|
|
7346
|
+
async function mapLimit2(items, limit, worker) {
|
|
6834
7347
|
const results = new Array(items.length);
|
|
6835
7348
|
let nextIndex = 0;
|
|
6836
7349
|
async function runWorker() {
|
|
@@ -6922,6 +7435,11 @@ function toCsv(records) {
|
|
|
6922
7435
|
"source",
|
|
6923
7436
|
"source_url",
|
|
6924
7437
|
"confidence",
|
|
7438
|
+
"blueprint_id",
|
|
7439
|
+
"segment_id",
|
|
7440
|
+
"source_trust_score",
|
|
7441
|
+
"source_authority_score",
|
|
7442
|
+
"source_relevance_score",
|
|
6925
7443
|
"tags",
|
|
6926
7444
|
"metadata",
|
|
6927
7445
|
"created_at"
|
|
@@ -6934,12 +7452,24 @@ function toCsv(records) {
|
|
|
6934
7452
|
record.source,
|
|
6935
7453
|
record.source_url,
|
|
6936
7454
|
String(record.confidence),
|
|
7455
|
+
metadataString(record, "blueprint_id"),
|
|
7456
|
+
metadataString(record, "segment_id"),
|
|
7457
|
+
metadataString(record, "source_trust_score"),
|
|
7458
|
+
metadataString(record, "source_authority_score"),
|
|
7459
|
+
metadataString(record, "source_relevance_score"),
|
|
6937
7460
|
record.tags.join("|"),
|
|
6938
7461
|
JSON.stringify(record.metadata),
|
|
6939
7462
|
record.created_at
|
|
6940
7463
|
]);
|
|
6941
7464
|
return [header, ...rows].map((row) => row.map(escapeCsv).join(",")).join("\n") + "\n";
|
|
6942
7465
|
}
|
|
7466
|
+
/**
 * Read one metadata value from a record and render it as a CSV cell string.
 *
 * @param {object} record - Dataset record; `record.metadata` may be absent.
 * @param {string} key - Metadata key to read.
 * @returns {string} "" for missing/nullish values, the value itself for
 *   strings, `String(value)` for numbers, booleans and bigints, and the JSON
 *   encoding for everything else ("" when the value has no JSON encoding,
 *   e.g. a function).
 */
function metadataString(record, key) {
  const value = record.metadata?.[key];
  if (value === null || value === undefined) return "";
  switch (typeof value) {
    case "string":
      return value;
    case "number":
    case "boolean":
    case "bigint":
      return String(value);
    default:
      // JSON.stringify yields undefined for functions and symbols; always hand back a string.
      return JSON.stringify(value) ?? "";
  }
}
|
|
6943
7473
|
function toMarkdown(records) {
|
|
6944
7474
|
return records.map((record) => {
|
|
6945
7475
|
const title = record.input || record.id;
|
|
@@ -7075,20 +7605,26 @@ function performanceConfig(mode) {
|
|
|
7075
7605
|
}
|
|
7076
7606
|
/**
 * Filter discovered sources down to the ones trusted enough to extract from.
 *
 * Sources rejected by `isConcreteEvidenceSource` are removed first. The rest
 * must meet the mode's minimum trust and relevance scores, have a duplication
 * risk below 0.72, and have either trust >= 0.52 or authority >= 0.72. If
 * nothing passes, up to `minimum` high-authority (>= 0.78) sources that still
 * meet the relevance floor are used instead.
 *
 * @param {Array<object>} sources - Candidate sources with optional score fields.
 * @param {string} mode - Performance mode passed to `performanceConfig`.
 * @returns {{sources: Array<object>, filtered: number}} The selected sources
 *   and how many of the input sources were left out.
 */
function gateSources(sources, mode) {
  const perf = performanceConfig(mode);
  const concreteSources = sources.filter(isConcreteEvidenceSource);

  const passesGate = (source) => {
    const trust = source.trustScore ?? source.score;
    const relevance = source.relevanceScore ?? source.score;
    if (!(trust >= perf.minTrustScore)) return false;
    if (!(relevance >= perf.minRelevanceScore)) return false;
    if (!((source.duplicationRisk ?? 0) < 0.72)) return false;
    return trust >= 0.52 || (source.authorityScore ?? 0.5) >= 0.72;
  };
  const accepted = concreteSources.filter(passesGate);

  let selected = accepted;
  if (accepted.length === 0) {
    // Nothing cleared the gate: fall back to a capped set of high-authority sources.
    const minimum = Math.min(
      concreteSources.length,
      Math.max(3, Math.ceil(concreteSources.length * perf.gateMinimumRatio))
    );
    const authoritative = concreteSources.filter((source) => {
      const authority = source.authorityScore ?? 0;
      const relevance = source.relevanceScore ?? source.score;
      return authority >= 0.78 && relevance >= perf.minRelevanceScore;
    });
    selected = authoritative.slice(0, minimum);
  }

  const filtered = Math.max(0, sources.length - selected.length);
  return { sources: selected, filtered };
}
|
|
7623
|
+
/**
 * Decide whether a source counts as concrete evidence rather than a locally
 * generated heuristic.
 *
 * @param {object} source - Source whose `provider` (or, failing that,
 *   `discoveredBy`) names where it came from.
 * @returns {boolean} `true` when the environment variable
 *   `ALYS_ALLOW_HEURISTIC_GENERATION` is exactly "true", or when the provider
 *   name does not contain "heuristic" (case-insensitive).
 */
function isConcreteEvidenceSource(source) {
  const heuristicsAllowed = process.env.ALYS_ALLOW_HEURISTIC_GENERATION === "true";
  if (heuristicsAllowed) return true;
  const origin = source.provider || source.discoveredBy || "";
  // "local-heuristic" is covered by the substring check as well.
  return origin.toLowerCase().indexOf("heuristic") === -1;
}
|
|
7092
7628
|
function sourceDiversityScore(sources) {
|
|
7093
7629
|
if (!sources.length) return 0;
|
|
7094
7630
|
const domains = new Set(sources.map((source) => source.domain || domainFromUrl4(source.url)));
|
|
@@ -7115,7 +7651,7 @@ async function generateDataset(options) {
|
|
|
7115
7651
|
const targetRows = Math.max(1, Math.floor(options.targetRows ?? 100));
|
|
7116
7652
|
const datasetId = import_node_crypto4.default.createHash("sha1").update(`${options.topic}:${Date.now()}`).digest("hex").slice(0, 12);
|
|
7117
7653
|
const workspace = await ensureAlysWorkspace(options.workspaceRoot);
|
|
7118
|
-
const generationPlan = buildDatasetGenerationPlan(options.topic);
|
|
7654
|
+
const generationPlan = buildDatasetGenerationPlan(options.topic, datasetType);
|
|
7119
7655
|
const expandedQueries = planQueries(generationPlan).slice(0, perf.queryCap);
|
|
7120
7656
|
const discoveryEnabledSeed = options.discoverySeed ?? 0;
|
|
7121
7657
|
const verificationEnabled = options.verificationEnabled ?? true;
|
|
@@ -7140,6 +7676,9 @@ async function generateDataset(options) {
|
|
|
7140
7676
|
message: `${gated.filtered} low-trust or low-relevance sources filtered`,
|
|
7141
7677
|
metric: `${gated.sources.length} accepted`
|
|
7142
7678
|
});
|
|
7679
|
+
if (!gated.sources.length) {
|
|
7680
|
+
throw new Error("ALYS_NO_TRUSTED_SOURCES");
|
|
7681
|
+
}
|
|
7143
7682
|
const extraction = new ExtractionAgent();
|
|
7144
7683
|
event(options.onEvent, { stage: "extraction", agent: extraction.name, status: "running", message: "Extracting source text..." });
|
|
7145
7684
|
const extracted = await extraction.run(gated.sources);
|
|
@@ -7208,6 +7747,9 @@ async function generateDataset(options) {
|
|
|
7208
7747
|
}
|
|
7209
7748
|
});
|
|
7210
7749
|
event(options.onEvent, { stage: "structuring", agent: structuring.name, status: "success", message: `${structured.length} candidate records generated`, metric: `${targetRows} target` });
|
|
7750
|
+
if (!structured.length) {
|
|
7751
|
+
throw new Error("ALYS_NO_GROUNDED_RECORDS");
|
|
7752
|
+
}
|
|
7211
7753
|
const curator = new DatasetCuratorAgent();
|
|
7212
7754
|
event(options.onEvent, { stage: "curation", agent: curator.name, status: "running", message: "Curating final dataset..." });
|
|
7213
7755
|
const records = curator.run(structured, targetRows).map((record) => ({
|
|
@@ -7282,8 +7824,14 @@ async function generateDataset(options) {
|
|
|
7282
7824
|
`));
|
|
7283
7825
|
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "metrics.json", `${JSON.stringify(qualityMetrics, null, 2)}
|
|
7284
7826
|
`));
|
|
7827
|
+
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "schema.json", `${JSON.stringify(datasetSchema(manifest), null, 2)}
|
|
7828
|
+
`));
|
|
7829
|
+
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "data-dictionary.md", renderDataDictionary(manifest)));
|
|
7285
7830
|
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "source-graph.json", `${JSON.stringify(research.graph, null, 2)}
|
|
7286
7831
|
`));
|
|
7832
|
+
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.json", `${JSON.stringify(sources, null, 2)}
|
|
7833
|
+
`));
|
|
7834
|
+
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.md", renderSourcesMarkdown(options.topic, sources)));
|
|
7287
7835
|
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "generation-plan.json", `${JSON.stringify(generationPlan, null, 2)}
|
|
7288
7836
|
`));
|
|
7289
7837
|
artifacts.push(await writeDatasetArtifact(workspace, datasetId, "benchmark-report.json", `${JSON.stringify(evaluation, null, 2)}
|
|
@@ -7292,6 +7840,101 @@ async function generateDataset(options) {
|
|
|
7292
7840
|
event(options.onEvent, { stage: "export", agent: "ArtifactStorage", status: "success", message: `Dataset written to ${manifest.outputDir}`, metric: manifest.outputDir });
|
|
7293
7841
|
return { manifest, records, artifacts };
|
|
7294
7842
|
}
|
|
7843
|
+
/**
 * Build the JSON-Schema-style description written to `schema.json`.
 *
 * When the manifest carries a blueprint, the schema mirrors its fields
 * (type, description, optional example) and lists the required ones. Without
 * a blueprint, a fixed generic record schema is returned.
 *
 * @param {object} manifest - Dataset manifest; `generationPlan.blueprint` is optional.
 * @returns {object} Schema object ready for `JSON.stringify`.
 */
function datasetSchema(manifest) {
  const spec = manifest.generationPlan?.blueprint;

  if (!spec) {
    const text = () => ({ type: "string" });
    const properties = {
      id: text(),
      input: text(),
      output: text(),
      context: text(),
      source_url: text(),
      confidence: { type: "number" }
    };
    return {
      title: "Alys Dataset Record",
      type: "object",
      properties,
      required: ["id", "input", "output", "context", "source_url", "confidence"]
    };
  }

  // One [name, definition] pair per blueprint field; examples only when truthy.
  const toProperty = (entry) => {
    const definition = { type: entry.type, description: entry.description };
    if (entry.example) definition.examples = [entry.example];
    return [entry.name, definition];
  };
  const requiredNames = spec.fields
    .filter((entry) => entry.required)
    .map((entry) => entry.name);
  const properties = Object.fromEntries(spec.fields.map((entry) => toProperty(entry)));

  return {
    $schema: "https://json-schema.org/draft/2020-12/schema",
    title: spec.label,
    description: spec.description,
    blueprintId: spec.id,
    blueprintVersion: spec.version,
    recommendedFormats: spec.recommendedFormats,
    type: "object",
    additionalProperties: false,
    required: requiredNames,
    properties
  };
}
|
|
7882
|
+
/**
 * Render the blueprint recorded in a manifest as a Markdown data dictionary.
 *
 * @param {object} manifest - Dataset manifest; `generationPlan.blueprint` is optional.
 * @returns {string} Markdown with a field table, the record instructions and
 *   the quality bar, or a short placeholder document when no blueprint exists.
 */
function renderDataDictionary(manifest) {
  const blueprint2 = manifest.generationPlan?.blueprint;
  if (!blueprint2) {
    return "# Data Dictionary\n\nNo dataset blueprint was recorded for this run.\n";
  }
  // Table cells must stay on one line and must not contain raw pipes. Values
  // are stringified first so non-string examples (e.g. numbers) do not throw.
  const cell = (value) => {
    if (value === null || value === undefined) return "";
    return String(value).replace(/\|/g, "\\|").replace(/\s*[\r\n]+\s*/g, " ");
  };
  const lines = [
    `# ${blueprint2.label} Data Dictionary`,
    "",
    blueprint2.description,
    "",
    `Blueprint: \`${blueprint2.id}@${blueprint2.version}\``,
    `Recommended formats: ${blueprint2.recommendedFormats.map((format) => `\`${format}\``).join(", ")}`,
    "",
    "## Fields",
    "",
    "| Field | Type | Required | Description | Example |",
    "| --- | --- | --- | --- | --- |",
    ...blueprint2.fields.map(
      (field2) => `| \`${field2.name}\` | ${field2.type} | ${field2.required ? "yes" : "no"} | ${cell(field2.description)} | ${cell(field2.example)} |`
    ),
    "",
    "## Record Instructions",
    "",
    ...blueprint2.recordInstructions.map((item) => `- ${item}`),
    "",
    "## Quality Bar",
    "",
    ...blueprint2.qualityBar.map((item) => `- ${item}`),
    ""
  ];
  return `${lines.join("\n")}\n`;
}
|
|
7915
|
+
/**
 * Render the ranked source pool as a numbered Markdown list.
 *
 * Each entry shows a link (or a plain label when the source has no URL)
 * followed by the domain, source type and trust percentage when present, then
 * optional snippet and quality-signal lines.
 *
 * @param {string} topic - Topic shown in the document heading.
 * @param {Array<object>} sources - Sources to list; may be empty.
 * @returns {string} Markdown document ending in a blank line.
 */
function renderSourcesMarkdown(topic, sources) {
  const lines = [
    `# Sources for ${topic}`,
    "",
    "Alys grounds generated records in the ranked source pool below. Higher trust scores influence confidence and record acceptance.",
    ""
  ];
  for (const [index, source] of sources.entries()) {
    const trust = typeof source.trustScore === "number" ? ` \xB7 trust ${Math.round(source.trustScore * 100)}%` : "";
    const type = source.sourceType ? ` \xB7 ${source.sourceType}` : "";
    const domain = source.domain ? ` \xB7 ${source.domain}` : "";
    // Sources may lack a title or a URL; never emit "[undefined](…)" or "(undefined)".
    const label = source.title || source.domain || source.url || "Untitled source";
    const link = source.url ? `[${label}](${source.url})` : label;
    lines.push(`${index + 1}. ${link}${domain}${type}${trust}`);
    if (source.snippet) lines.push(` ${source.snippet.replace(/\s+/g, " ").trim()}`);
    if (source.qualitySignals?.length) lines.push(` Signals: ${source.qualitySignals.slice(0, 6).join(", ")}`);
    lines.push("");
  }
  if (!sources.length) {
    lines.push("No sources were recorded for this run.");
    lines.push("");
  }
  return `${lines.join("\n")}\n`;
}
|
|
7295
7938
|
function artifactFilename(format) {
|
|
7296
7939
|
if (format === "markdown") return "dataset.md";
|
|
7297
7940
|
if (format === "instruction") return "instruction.jsonl";
|
|
@@ -7313,7 +7956,7 @@ async function generateDatasets(options) {
|
|
|
7313
7956
|
const verificationEnabled = options.enableVerificationSwarm ?? performanceMode !== "fast";
|
|
7314
7957
|
const debateEnabled = verificationEnabled && perf.debateEnabled;
|
|
7315
7958
|
const multiplier = depthMultiplier(options.generationDepth);
|
|
7316
|
-
const results = await
|
|
7959
|
+
const results = await mapLimit3(Array.from({ length: datasetCount }, (_, i) => i), Number(process.env.ALYS_DATASET_CONCURRENCY ?? perf.datasetConcurrency), async (i) => {
|
|
7317
7960
|
const datasetIndex = i + 1;
|
|
7318
7961
|
const datasetSourceLimit = Math.max(1, Math.floor((options.sourceLimit ?? 24) * multiplier));
|
|
7319
7962
|
const datasetTargetRows = Math.max(1, Math.floor((options.targetRows ?? 100) * multiplier));
|
|
@@ -7343,7 +7986,7 @@ async function generateDatasets(options) {
|
|
|
7343
7986
|
const datasets = results;
|
|
7344
7987
|
return { manifests, artifacts, previews, datasets };
|
|
7345
7988
|
}
|
|
7346
|
-
async function
|
|
7989
|
+
async function mapLimit3(items, limit, worker) {
|
|
7347
7990
|
const results = new Array(items.length);
|
|
7348
7991
|
let nextIndex = 0;
|
|
7349
7992
|
const workerCount = Math.max(1, Math.min(items.length, Math.floor(limit || 1)));
|
|
@@ -7436,6 +8079,13 @@ Limits:
|
|
|
7436
8079
|
1 dataset = 1 generation
|
|
7437
8080
|
max 5 datasets per run
|
|
7438
8081
|
use --benchmark for local high-volume benchmark runs
|
|
8082
|
+
|
|
8083
|
+
Concepts:
|
|
8084
|
+
RAG chunks retrieval-ready records (for search or knowledge-base apps)
|
|
8085
|
+
Instruction tuning examples for fine-tuning (teaching model behavior)
|
|
8086
|
+
JSONL one JSON object per line (standard for ML pipelines)
|
|
8087
|
+
CSV spreadsheet-friendly rows (for review and analysis)
|
|
8088
|
+
Verification quality checks (confidence, repetition, schema validity)
|
|
7439
8089
|
`);
|
|
7440
8090
|
}
|
|
7441
8091
|
function loadCliEnv(cwd = process.cwd()) {
|
|
@@ -7621,6 +8271,17 @@ function getEvaluation(dataset) {
|
|
|
7621
8271
|
const evaluation = dataset.manifest.evaluation;
|
|
7622
8272
|
return evaluation && typeof evaluation === "object" ? evaluation : {};
|
|
7623
8273
|
}
|
|
8274
|
+
/**
 * Safely read the blueprint stored in a dataset's manifest.
 *
 * @param {object} dataset - Dataset whose `manifest.generationPlan.blueprint` is read.
 * @returns {object} The blueprint when both the generation plan and the
 *   blueprint are truthy objects; otherwise a fresh empty object.
 */
function getBlueprint(dataset) {
  const isObjectLike = (candidate) => Boolean(candidate) && typeof candidate === "object";
  const plan = dataset.manifest.generationPlan;
  if (isObjectLike(plan)) {
    const candidate = plan.blueprint;
    if (isObjectLike(candidate)) return candidate;
  }
  return {};
}
|
|
8280
|
+
/**
 * Read the list of usable source entries from a dataset's manifest.
 *
 * @param {object} dataset - Dataset whose `manifest.sourceManifest` is read.
 * @returns {Array<object>} The entries that are truthy objects and have a
 *   string `title` or a string `url`; an empty array when the manifest value
 *   is not an array.
 */
function getSourceManifest(dataset) {
  const listed = dataset.manifest.sourceManifest;
  if (!Array.isArray(listed)) return [];
  return listed.filter((entry) => {
    if (!entry || typeof entry !== "object") return false;
    return typeof entry.title === "string" || typeof entry.url === "string";
  });
}
|
|
7624
8285
|
function printStage(code, status, label, metric) {
|
|
7625
8286
|
const tint = status === "DONE" || status === "OK" ? "green" : status === "WARN" ? "yellow" : "cyan";
|
|
7626
8287
|
const prefix = `${paint(`[${code.padEnd(4).slice(0, 4)}]`, "gray")} ${paint(status.padEnd(4), tint)}`;
|
|
@@ -7682,7 +8343,7 @@ function printUsage(profile) {
|
|
|
7682
8343
|
);
|
|
7683
8344
|
}
|
|
7684
8345
|
function printRunPlan(args) {
|
|
7685
|
-
const multiplier = depthMultiplier2(args.depth);
|
|
8346
|
+
const multiplier = args.performanceMode === "fast" ? 1 : depthMultiplier2(args.depth);
|
|
7686
8347
|
const effectiveSources = Math.max(1, Math.floor(args.sourceLimit * multiplier));
|
|
7687
8348
|
const effectiveRows = Math.max(1, Math.floor(args.targetRows * multiplier));
|
|
7688
8349
|
const totalRows = effectiveRows * args.datasetCount;
|
|
@@ -7813,12 +8474,28 @@ function printGenerationSummary(response, workspaceRoot) {
|
|
|
7813
8474
|
const records = Number(metrics.recordsGenerated ?? summary.recordsAccepted ?? 0);
|
|
7814
8475
|
const sources = Number(metrics.sourcesDiscovered ?? 0);
|
|
7815
8476
|
const confidenceValue = Number(metrics.averageConfidence ?? summary.averageConfidence ?? 0);
|
|
8477
|
+
const blueprint2 = getBlueprint(dataset);
|
|
7816
8478
|
const outputDir = import_node_path4.default.join(root, "datasets", dataset.id);
|
|
7817
8479
|
console.log(`${paint("\u2022", "yellow")} ${paint(dataset.id, "white")} ${formatInt(records)} records ${formatInt(sources)} sources ${formatPercent(confidenceValue)} confidence`);
|
|
7818
8480
|
console.log(` ${truncate(dataset.topic, 110)}`);
|
|
8481
|
+
if (blueprint2.label || blueprint2.id) {
|
|
8482
|
+
console.log(` blueprint ${paint(blueprint2.label ?? blueprint2.id ?? "dataset blueprint", "white")}${paint(blueprint2.id ? ` (${blueprint2.id})` : "", "gray")}`);
|
|
8483
|
+
}
|
|
7819
8484
|
console.log(` ${paint(outputDir, "cyan")}`);
|
|
7820
8485
|
console.log(` quality ${formatPercent(Number(quality.citationCoverage ?? 0))} citations \xB7 ${formatPercent(Number(quality.recordUniqueness ?? 0))} unique \xB7 ${formatPercent(Number(quality.sourceDiversity ?? 0))} source diversity`);
|
|
7821
8486
|
console.log(` suitability RAG ${formatScore(Number(suitability.ragSuitability ?? 0))} \xB7 tuning ${formatScore(Number(suitability.instructionTuning ?? 0))} \xB7 usefulness ${formatScore(Number(suitability.humanUsefulness ?? 0))}`);
|
|
8487
|
+
const topSources = getSourceManifest(dataset).slice(0, 5);
|
|
8488
|
+
if (topSources.length) {
|
|
8489
|
+
console.log(paint(" sources", "gray"));
|
|
8490
|
+
for (const source of topSources) {
|
|
8491
|
+
const label = source.title || source.domain || source.provider || "source";
|
|
8492
|
+
const trust = Number(source.trustScore ?? source.authorityScore ?? source.relevanceScore ?? 0);
|
|
8493
|
+
const type = source.sourceType ? ` ${source.sourceType}` : "";
|
|
8494
|
+
const score = trust > 0 ? ` ${formatPercent(trust)} trust` : "";
|
|
8495
|
+
console.log(` - ${truncate(label, 76)}${paint(`${type}${score}`, "gray")}`);
|
|
8496
|
+
if (source.url) console.log(` ${paint(source.url, "cyan")}`);
|
|
8497
|
+
}
|
|
8498
|
+
}
|
|
7822
8499
|
const preview = previewRecord(dataset);
|
|
7823
8500
|
if (preview) {
|
|
7824
8501
|
console.log(paint(" preview", "gray"));
|
|
@@ -8000,17 +8677,26 @@ async function handleGenerate(args, command) {
|
|
|
8000
8677
|
validate: (v) => v.trim().length ? true : "Please enter a topic."
|
|
8001
8678
|
})).topic);
|
|
8002
8679
|
if (!topic) throw new Error("Missing topic.");
|
|
8003
|
-
const datasetType = parseDatasetType(values.type) ?? (await (0, import_prompts3.default)({
|
|
8680
|
+
const datasetType = parseDatasetType(values.type) ?? (values.yes === true ? "instruction" : void 0) ?? (await (0, import_prompts3.default)({
|
|
8004
8681
|
type: "select",
|
|
8005
8682
|
name: "datasetType",
|
|
8006
|
-
message: "Dataset type?",
|
|
8683
|
+
message: "Dataset type? (choose what the output should be used for)",
|
|
8007
8684
|
choices: [
|
|
8008
|
-
{
|
|
8009
|
-
|
|
8010
|
-
|
|
8685
|
+
{
|
|
8686
|
+
title: "Instruction tuning (task + ideal answer examples)",
|
|
8687
|
+
value: "instruction"
|
|
8688
|
+
},
|
|
8689
|
+
{
|
|
8690
|
+
title: "RAG chunks (retrieval-ready context for search/knowledge bases)",
|
|
8691
|
+
value: "rag"
|
|
8692
|
+
},
|
|
8693
|
+
{
|
|
8694
|
+
title: "Question/Answer (direct QA pairs for evaluation or training)",
|
|
8695
|
+
value: "qa"
|
|
8696
|
+
}
|
|
8011
8697
|
]
|
|
8012
8698
|
})).datasetType;
|
|
8013
|
-
const requestedDatasetCount = values.datasets ? Math.max(1, Math.floor(Number(values.datasets))) : (await (0, import_prompts3.default)({
|
|
8699
|
+
const requestedDatasetCount = values.datasets ? Math.max(1, Math.floor(Number(values.datasets))) : values.yes === true ? 1 : (await (0, import_prompts3.default)({
|
|
8014
8700
|
type: "number",
|
|
8015
8701
|
name: "datasetCount",
|
|
8016
8702
|
message: "How many datasets?",
|
|
@@ -8023,56 +8709,56 @@ async function handleGenerate(args, command) {
|
|
|
8023
8709
|
return;
|
|
8024
8710
|
}
|
|
8025
8711
|
const datasetCount = requestedDatasetCount;
|
|
8026
|
-
const exportFormats = values.format ? parseFormats(values.format) : (await (0, import_prompts3.default)({
|
|
8712
|
+
const exportFormats = values.format ? parseFormats(values.format) : values.yes === true ? ["jsonl", "csv", "markdown"] : (await (0, import_prompts3.default)({
|
|
8027
8713
|
type: "multiselect",
|
|
8028
8714
|
name: "exportFormats",
|
|
8029
|
-
message: "Output formats?",
|
|
8715
|
+
message: "Output formats? (you can select multiple)",
|
|
8030
8716
|
choices: [
|
|
8031
|
-
{ title: "JSONL", value: "jsonl", selected: true },
|
|
8032
|
-
{ title: "CSV", value: "csv", selected: true },
|
|
8033
|
-
{ title: "Markdown", value: "markdown" },
|
|
8034
|
-
{ title: "Instruction dataset", value: "instruction" },
|
|
8035
|
-
{ title: "RAG chunks", value: "rag" }
|
|
8717
|
+
{ title: "JSONL (one JSON record per line, best for ML pipelines)", value: "jsonl", selected: true },
|
|
8718
|
+
{ title: "CSV (spreadsheet-friendly review format)", value: "csv", selected: true },
|
|
8719
|
+
{ title: "Markdown (readable summary for humans)", value: "markdown" },
|
|
8720
|
+
{ title: "Instruction dataset (fine-tuning JSONL)", value: "instruction" },
|
|
8721
|
+
{ title: "RAG chunks (retrieval-ready JSONL)", value: "rag" }
|
|
8036
8722
|
],
|
|
8037
8723
|
hint: "Use space to select multiple."
|
|
8038
8724
|
})).exportFormats;
|
|
8039
|
-
const depth = parseDepth(values.depth) ?? (await (0, import_prompts3.default)({
|
|
8725
|
+
const depth = parseDepth(values.depth) ?? (values.yes === true ? "medium" : void 0) ?? (await (0, import_prompts3.default)({
|
|
8040
8726
|
type: "select",
|
|
8041
8727
|
name: "depth",
|
|
8042
|
-
message: "Research depth?",
|
|
8728
|
+
message: "Research depth? (more depth can improve coverage but costs time)",
|
|
8043
8729
|
choices: [
|
|
8044
|
-
{ title: "Shallow", value: "shallow" },
|
|
8045
|
-
{ title: "Medium", value: "medium" },
|
|
8046
|
-
{ title: "Deep", value: "deep" }
|
|
8730
|
+
{ title: "Shallow (fastest, smaller context)", value: "shallow" },
|
|
8731
|
+
{ title: "Medium (balanced default)", value: "medium" },
|
|
8732
|
+
{ title: "Deep (broader coverage, slower)", value: "deep" }
|
|
8047
8733
|
]
|
|
8048
8734
|
})).depth;
|
|
8049
|
-
const sourceLimit = values.sources ? Math.min(maxSources, Math.max(1, Number(values.sources))) : (await (0, import_prompts3.default)({
|
|
8735
|
+
const sourceLimit = values.sources ? Math.min(maxSources, Math.max(1, Number(values.sources))) : values.yes === true ? benchmarkMode ? 48 : MAX_SOURCES_PER_RUN : (await (0, import_prompts3.default)({
|
|
8050
8736
|
type: "number",
|
|
8051
8737
|
name: "sourceLimit",
|
|
8052
|
-
message: "How many sources?",
|
|
8738
|
+
message: "How many sources? (more sources can improve coverage but may slow the run)",
|
|
8053
8739
|
initial: benchmarkMode ? 48 : MAX_SOURCES_PER_RUN,
|
|
8054
8740
|
min: 1,
|
|
8055
8741
|
max: maxSources
|
|
8056
8742
|
})).sourceLimit;
|
|
8057
|
-
const targetRows = values.rows ? Math.min(maxRows, Math.max(1, Number(values.rows))) : (await (0, import_prompts3.default)({
|
|
8743
|
+
const targetRows = values.rows ? Math.min(maxRows, Math.max(1, Number(values.rows))) : values.yes === true ? benchmarkMode ? 5e3 : MAX_ROWS_PER_DATASET : (await (0, import_prompts3.default)({
|
|
8058
8744
|
type: "number",
|
|
8059
8745
|
name: "targetRows",
|
|
8060
|
-
message: "Rows per dataset?",
|
|
8746
|
+
message: "Rows per dataset? (Alys aims for rows worth keeping, not raw volume)",
|
|
8061
8747
|
initial: benchmarkMode ? 5e3 : MAX_ROWS_PER_DATASET,
|
|
8062
8748
|
min: 1,
|
|
8063
8749
|
max: maxRows
|
|
8064
8750
|
})).targetRows;
|
|
8065
|
-
const workspaceRoot = (values.workspace ? String(values.workspace) : "").trim() || (await (0, import_prompts3.default)({
|
|
8751
|
+
const workspaceRoot = (values.workspace ? String(values.workspace) : "").trim() || (values.yes === true ? "~/Alys" : "") || (await (0, import_prompts3.default)({
|
|
8066
8752
|
type: "text",
|
|
8067
8753
|
name: "workspaceRoot",
|
|
8068
8754
|
message: "Export directory?",
|
|
8069
8755
|
initial: "~/Alys",
|
|
8070
8756
|
validate: (v) => v.trim().length ? true : "Enter an export directory."
|
|
8071
8757
|
})).workspaceRoot;
|
|
8072
|
-
const verificationEnabled = values.verify === true ? true : values["no-verify"] === true ? false : (await (0, import_prompts3.default)({
|
|
8758
|
+
const verificationEnabled = values.verify === true ? true : values["no-verify"] === true ? false : values.yes === true ? performanceMode !== "fast" : (await (0, import_prompts3.default)({
|
|
8073
8759
|
type: "toggle",
|
|
8074
8760
|
name: "verificationEnabled",
|
|
8075
|
-
message: "Enable verification checks?",
|
|
8761
|
+
message: "Enable verification checks? (slower, stricter about weak/repetitive records)",
|
|
8076
8762
|
initial: performanceMode !== "fast",
|
|
8077
8763
|
active: "Yes",
|
|
8078
8764
|
inactive: "No"
|
|
@@ -8119,9 +8805,9 @@ async function handleGenerate(args, command) {
|
|
|
8119
8805
|
console.log(paint("Runtime", "white"));
|
|
8120
8806
|
printStage("AUTH", "OK", "Usage linked", appUrl());
|
|
8121
8807
|
printStage("PLAN", "OK", "Generations charged only after successful completion", `${datasetCount} requested`);
|
|
8122
|
-
printStage("
|
|
8808
|
+
printStage("RUN", "RUN", "Dataset runtime starting", `${performanceMode} mode`);
|
|
8123
8809
|
const response = await withSpinner(
|
|
8124
|
-
"Alys
|
|
8810
|
+
"Alys runtime executing",
|
|
8125
8811
|
requestJson(
|
|
8126
8812
|
"/api/cli/generate",
|
|
8127
8813
|
{
|