alys-akusa 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.cjs +702 -30
  2. package/package.json +1 -1
package/dist/index.cjs CHANGED
@@ -5109,7 +5109,10 @@ async function discoverResearchSources(topic, options = {}) {
5109
5109
  };
5110
5110
  }
5111
5111
  function createConfiguredSearchProviders() {
5112
- const providers = [];
5112
+ const providers = [
5113
+ new GitHubSearchProvider(env("GITHUB_TOKEN")),
5114
+ new KaggleSearchProvider(env("KAGGLE_USERNAME"), env("KAGGLE_KEY"))
5115
+ ];
5113
5116
  const tavilyKey = env("TAVILY_API_KEY");
5114
5117
  if (tavilyKey) providers.push(new TavilySearchProvider(tavilyKey));
5115
5118
  const serpApiKey = env("SERPAPI_API_KEY");
@@ -5155,6 +5158,107 @@ var TavilySearchProvider = class {
5155
5158
  }));
5156
5159
  }
5157
5160
  };
/**
 * Search provider that queries the GitHub repository search endpoint and maps
 * each repository into the common search-result shape used by the other providers.
 *
 * The Authorization header is only sent when a token was supplied.
 */
var GitHubSearchProvider = class {
  /**
   * @param {string|null} [token] Optional GitHub token, sent as a Bearer credential.
   */
  constructor(token = null) {
    this.token = token;
  }
  name = "github";
  /**
   * Search repositories related to `query`, biased toward dataset-like repos.
   *
   * @param {string} query Free-text topic to search for.
   * @param {{limit?: number, timeoutMs?: number}} [options] `limit` is clamped to 1..20
   *   (default 10); `timeoutMs` is forwarded to `fetchJson`.
   * @returns {Promise<Array<object>>} One result per repository that has both a
   *   full name and an HTML URL; empty when the response carries no usable items.
   */
  async search(query, options = {}) {
    const url = new URL("https://api.github.com/search/repositories");
    url.searchParams.set("q", `${query} dataset OR benchmark OR corpus OR csv OR jsonl in:name,description,readme`);
    url.searchParams.set("sort", "stars");
    url.searchParams.set("order", "desc");
    // Keep per_page a whole number in 1..20 so a zero, negative, fractional or
    // non-numeric limit cannot turn into an invalid request.
    const requested = Number(options.limit ?? 10);
    const perPage = Number.isFinite(requested) ? Math.min(20, Math.max(1, Math.floor(requested))) : 10;
    url.searchParams.set("per_page", String(perPage));
    const headers = {
      Accept: "application/vnd.github+json",
      "User-Agent": "AlysResearchBot/0.1",
      "X-GitHub-Api-Version": "2022-11-28"
    };
    if (this.token) headers.Authorization = `Bearer ${this.token}`;
    const payload = await fetchJson(url.toString(), { headers }, options.timeoutMs);
    // Guard against a null / non-object payload instead of throwing on `.items`.
    const items = payload && typeof payload === "object" ? asArray(payload.items) : [];
    return items.flatMap((item) => {
      if (!item || typeof item !== "object") return [];
      const object = item;
      const fullName = firstString(object, ["full_name"]);
      const htmlUrl = firstString(object, ["html_url"]);
      if (!fullName || !htmlUrl) return [];
      const stars = firstNumber(object, ["stargazers_count"]) ?? 0;
      const forks = firstNumber(object, ["forks_count"]) ?? 0;
      const topics = Array.isArray(object.topics) ? object.topics.map(String).slice(0, 8) : [];
      const license = object.license && typeof object.license === "object" ? firstString(object.license, ["spdx_id", "name"]) : "";
      const description = firstString(object, ["description"]);
      const language = firstString(object, ["language"]);
      // Popularity-weighted score: stars dominate, forks add a little, and the
      // presence of a license / topics contributes small fixed bonuses.
      const score = clamp01(Math.log10(stars + 1) / 5 * 0.72 + Math.log10(forks + 1) / 5 * 0.16 + (license ? 0.08 : 0) + (topics.length ? 0.04 : 0));
      return [{
        title: `GitHub: ${fullName}`,
        url: htmlUrl,
        snippet: [
          description,
          language ? `Language: ${language}.` : "",
          license ? `License: ${license}.` : "",
          topics.length ? `Topics: ${topics.join(", ")}.` : "",
          `Stars: ${stars}. Forks: ${forks}.`
        ].filter(Boolean).join(" "),
        publishedAt: firstString(object, ["updated_at", "pushed_at", "created_at"]),
        score,
        provider: this.name,
        query,
        raw: item
      }];
    });
  }
};
/**
 * Search provider that lists Kaggle datasets matching a query and maps them
 * into the common search-result shape. Requires both a username and an API key;
 * without them every search resolves to an empty list and no request is made.
 */
var KaggleSearchProvider = class {
  /**
   * @param {string|null} [username] Kaggle account name used for Basic auth.
   * @param {string|null} [key] Kaggle API key used for Basic auth.
   */
  constructor(username = null, key = null) {
    this.username = username;
    this.key = key;
  }
  name = "kaggle";
  /**
   * Search Kaggle datasets for `query`.
   *
   * @param {string} query Free-text topic to search for.
   * @param {{limit?: number, timeoutMs?: number}} [options] `limit` is clamped to 1..20
   *   (default 10); `timeoutMs` is forwarded to `fetchJson`.
   * @returns {Promise<Array<object>>} One result per dataset that has a title and
   *   a usable URL; empty when credentials are missing.
   */
  async search(query, options = {}) {
    if (!this.username || !this.key) {
      return [];
    }
    const url = new URL("https://www.kaggle.com/api/v1/datasets/list");
    url.searchParams.set("search", query);
    url.searchParams.set("sortBy", "hottest");
    // Keep pageSize a whole number in 1..20 so a zero, negative, fractional or
    // non-numeric limit cannot turn into an invalid request.
    const requested = Number(options.limit ?? 10);
    const pageSize = Number.isFinite(requested) ? Math.min(20, Math.max(1, Math.floor(requested))) : 10;
    url.searchParams.set("pageSize", String(pageSize));
    const auth = Buffer.from(`${this.username}:${this.key}`).toString("base64");
    const payload = await fetchJson(url.toString(), {
      headers: {
        Accept: "application/json",
        Authorization: `Basic ${auth}`,
        "User-Agent": "AlysResearchBot/0.1"
      }
    }, options.timeoutMs);
    // The response is accepted either as a bare array or wrapped under `datasets` / `results`.
    const items = Array.isArray(payload) ? payload : payload && typeof payload === "object" ? asArray(payload.datasets ?? payload.results) : [];
    return items.flatMap((item) => {
      if (!item || typeof item !== "object") return [];
      const object = item;
      const ref = firstString(object, ["ref", "datasetRef", "ownerName"]);
      const title = firstString(object, ["title", "subtitle", "ref"]) || ref;
      // Only synthesize a link from an "owner/slug" style reference. A bare owner
      // name (the `ownerName` fallback) does not identify a dataset, so building a
      // URL from it would fabricate a link that does not point at the record.
      const explicitUrl = firstString(object, ["url"]);
      const hasDatasetRef = typeof ref === "string" && ref.includes("/");
      const datasetUrl = explicitUrl || (hasDatasetRef ? `https://www.kaggle.com/datasets/${ref}` : "");
      if (!title || !datasetUrl) return [];
      const votes = firstNumber(object, ["voteCount", "votes"]) ?? 0;
      const downloads = firstNumber(object, ["downloadCount", "downloads"]) ?? 0;
      const usability = firstNumber(object, ["usabilityRating"]) ?? 0;
      // Weighted blend of downloads, votes and usability plus a fixed 0.1 base.
      const score = clamp01(Math.log10(downloads + 1) / 6 * 0.38 + Math.log10(votes + 1) / 5 * 0.24 + Math.min(1, usability) * 0.28 + 0.1);
      return [{
        title: `Kaggle: ${title}`,
        url: datasetUrl,
        snippet: [
          firstString(object, ["subtitle", "description"]),
          `Downloads: ${downloads}. Votes: ${votes}.`,
          usability ? `Usability: ${usability}.` : ""
        ].filter(Boolean).join(" "),
        publishedAt: firstString(object, ["lastUpdated", "creationDate"]),
        score,
        provider: this.name,
        query,
        raw: item
      }];
    });
  }
};
5158
5262
  var SerpApiSearchProvider = class {
5159
5263
  constructor(apiKey) {
5160
5264
  this.apiKey = apiKey;
@@ -5294,6 +5398,8 @@ function buildResearchQueries(topic, count = 5) {
5294
5398
  const normalized = topic.trim().replace(/\s+/g, " ");
5295
5399
  const facets = [
5296
5400
  normalized,
5401
+ `${normalized} dataset github kaggle benchmark`,
5402
+ `${normalized} public dataset csv jsonl parquet`,
5297
5403
  `${normalized} official documentation standards methodology`,
5298
5404
  `${normalized} research paper benchmark evaluation`,
5299
5405
  `${normalized} case study operational data`,
@@ -5318,8 +5424,9 @@ ${result.url}`);
5318
5424
  const freshnessScore = freshnessForDate(result.publishedAt);
5319
5425
  const duplicationRisk = clamp01(Math.max(0, (domainCounts.get(domain) ?? 1) - 1) * 0.12);
5320
5426
  const providerScore = normalizeProviderScore(result.score);
5427
+ const sourcePreference = sourcePreferenceScore(domain, result.url, result.provider);
5321
5428
  const trustScore = clamp01(
5322
- authority.score * 0.3 + relevanceScore * 0.27 + semanticScore * 0.18 + freshnessScore * 0.12 + providerScore * 0.08 + (1 - duplicationRisk) * 0.05
5429
+ authority.score * 0.3 + relevanceScore * 0.27 + semanticScore * 0.18 + freshnessScore * 0.1 + providerScore * 0.07 + sourcePreference * 0.05 + (1 - duplicationRisk) * 0.05
5323
5430
  );
5324
5431
  return {
5325
5432
  id: sourceId(result.url),
@@ -5339,7 +5446,12 @@ ${result.url}`);
5339
5446
  semanticScore: Number(semanticScore.toFixed(3)),
5340
5447
  trustScore: Number(trustScore.toFixed(3)),
5341
5448
  sourceType: authority.type,
5342
- qualitySignals: authority.signals
5449
+ qualitySignals: [
5450
+ ...authority.signals,
5451
+ ...sourcePreference >= 0.85 ? ["preferred-source-surface"] : [],
5452
+ ...result.provider === "github" ? ["github-repository-search"] : [],
5453
+ ...result.provider === "kaggle" ? ["kaggle-dataset-search"] : []
5454
+ ]
5343
5455
  };
5344
5456
  });
5345
5457
  }
@@ -5549,6 +5661,14 @@ function authorityForDomain(domain, url) {
5549
5661
  score = 0.78;
5550
5662
  type = "official";
5551
5663
  signals.push("official-documentation");
5664
+ } else if (host === "github.com") {
5665
+ score = 0.84;
5666
+ type = "code";
5667
+ signals.push("open-source-repository");
5668
+ } else if (host === "kaggle.com" || host.endsWith(".kaggle.com")) {
5669
+ score = 0.86;
5670
+ type = "dataset";
5671
+ signals.push("dataset-marketplace");
5552
5672
  } else if (host.includes("wikipedia.org")) {
5553
5673
  score = 0.62;
5554
5674
  type = "community";
@@ -5564,6 +5684,18 @@ function authorityForDomain(domain, url) {
5564
5684
  }
5565
5685
  return { score, type, signals };
5566
5686
  }
/**
 * Score (0..1) how strongly a result's origin is preferred as a dataset or
 * research surface, based on the provider that returned it, its host and its URL.
 *
 * @param {string} domain Host name of the result.
 * @param {string} url Full result URL; only inspected for path hints.
 * @param {string} [provider] Name of the search provider that produced the result.
 * @returns {number} A fixed preference weight; 0.55 when nothing matches.
 */
function sourcePreferenceScore(domain, url, provider) {
  const host = String(domain ?? "").toLowerCase();
  const link = typeof url === "string" ? url : "";
  // Exact-or-subdomain match, so look-alike hosts such as "notkaggle.com" or
  // "kaggle.com.example.io" do not inherit a trusted site's preference.
  const onSite = (site) => host === site || host.endsWith(`.${site}`);
  if (provider === "kaggle" || onSite("kaggle.com")) return 0.96;
  if (provider === "github" || host === "github.com") return 0.93;
  // Covers every *.gov host (nist.gov, sec.gov, data.gov, ...).
  if (host.endsWith(".gov")) return 0.92;
  if (onSite("arxiv.org") || host.endsWith(".edu") || onSite("openalex.org")) return 0.88;
  // A host name never contains a path, so the "/datasets" part has to be
  // checked against the URL rather than the domain.
  if (onSite("huggingface.co") && link.includes("/datasets")) return 0.88;
  if (link.includes("/dataset")) return 0.84;
  if (host.includes("docs.") || link.includes("/docs/") || link.includes("/documentation/")) return 0.78;
  if (host.includes("reddit.") || host.includes("medium.") || host.includes("substack.")) return 0.24;
  return 0.55;
}
5567
5699
  function freshnessForDate(value) {
5568
5700
  if (!value) return 0.62;
5569
5701
  const timestamp = Date.parse(value);
@@ -5658,6 +5790,8 @@ function env(key) {
5658
5790
  return value || null;
5659
5791
  }
5660
5792
  var LOCAL_SOURCE_BASES = [
5793
+ { label: "GitHub", url: "https://github.com/search", querySuffix: "q", signal: "Open-source repositories, examples, datasets, and benchmark code." },
5794
+ { label: "Kaggle", url: "https://www.kaggle.com/datasets", querySuffix: "search", signal: "Public dataset catalog and dataset-level examples." },
5661
5795
  { label: "NIST", url: "https://www.nist.gov/search", querySuffix: "q", signal: "Technical guidance and standards language." },
5662
5796
  { label: "SEC", url: "https://www.sec.gov/search", querySuffix: "q", signal: "Regulatory filings and official disclosures." },
5663
5797
  { label: "PubMed", url: "https://pubmed.ncbi.nlm.nih.gov", querySuffix: "term", signal: "Academic and biomedical literature index." },
@@ -6128,13 +6262,235 @@ Source: ${result.source.url}
6128
6262
  }
6129
6263
 
6130
6264
  // ../../packages/prompts/src/index.ts
6131
- function buildDatasetGenerationPlan(topic) {
/**
 * Catalog of dataset blueprints. Each entry is built with `blueprint(...)` and
 * describes one kind of dataset: its identity, default record type, recommended
 * export formats, intended uses, the row fields (built with `field(...)`), the
 * per-record writing instructions and the quality bar.
 */
var DATASET_BLUEPRINTS = [
  // Generic supervised fine-tuning records.
  blueprint({
    id: "instruction-finetune-corpus",
    label: "Instruction Fine-Tuning Corpus",
    description: "Task-and-answer examples for model fine-tuning, evaluator training, and assistant behavior shaping.",
    defaultType: "instruction",
    recommendedFormats: ["instruction", "jsonl", "csv", "markdown"],
    bestFor: ["fine-tuning", "assistant behavior", "domain task completion", "supervised training"],
    fields: [
      field("instruction", "string", "The user-facing task the model should perform.", true, "Handle a pricing objection from a VP of Sales."),
      field("input_context", "string", "The scenario, persona, constraints, or source-backed context for the instruction.", true, "Mid-market CRM buyer comparing annual contracts."),
      field("ideal_output", "string", "The high-quality answer or completion expected from the model.", true, "Acknowledge budget pressure, quantify missed pipeline risk, and offer a pilot path."),
      field("skill", "string", "The primary capability being trained.", true, "objection-handling"),
      field("difficulty", "string", "Expected difficulty or complexity level.", false, "intermediate"),
      field("source_url", "string", "The source URL supporting this record.", true, "https://example.com/source"),
      field("confidence", "number", "Alys confidence score after source and quality checks.", true, "0.86"),
    ],
    recordInstructions: [
      "Make the instruction actionable and directly trainable.",
      "Keep the output specific enough to teach a behavior, not a generic explanation.",
      "Include scenario/context when the answer depends on buyer role, domain, risk, or constraints.",
    ],
    qualityBar: [
      "Record teaches one clear skill.",
      "Answer is useful without reading the source page.",
      "No repeated template phrasing across rows.",
    ],
  }),

  // Retrieval-ready chunks with grounded answers.
  blueprint({
    id: "rag-corpus",
    label: "RAG Retrieval Corpus",
    description: "Retrieval-ready passages, grounded answers, and citation-friendly chunks for search or knowledge bases.",
    defaultType: "rag",
    recommendedFormats: ["rag", "jsonl", "csv", "markdown"],
    bestFor: ["RAG", "semantic search", "knowledge bases", "citation-aware QA"],
    fields: [
      field("query", "string", "A realistic retrieval query or user question.", true, "What evidence supports the SOC 2 access review requirement?"),
      field("answer", "string", "A compact answer grounded only in the retrieved context.", true, "The control requires periodic review of user access and documented exceptions."),
      field("chunk_text", "string", "The retrieval text chunk that should be embedded.", true, "Access reviews should be performed periodically..."),
      field("source_title", "string", "Human-readable source title.", true, "SOC 2 Criteria Overview"),
      field("source_url", "string", "Canonical source URL.", true, "https://example.com/source"),
      field("citation_span", "string", "The source-backed phrase or section used as evidence.", false, "periodic review of user access"),
      field("confidence", "number", "Grounding confidence for the answer/chunk.", true, "0.91"),
    ],
    recordInstructions: [
      "Make each context chunk standalone and retrieval-ready.",
      "Answer only what the source context supports.",
      "Prefer compact, evidence-rich chunks over long summaries.",
    ],
    qualityBar: [
      "Chunk contains one coherent idea.",
      "Query is realistic, not keyword stuffing.",
      "Answer can be traced back to source context.",
    ],
  }),

  // Golden-set style question/answer records with rubrics.
  blueprint({
    id: "evaluation-qa",
    label: "Evaluation QA Dataset",
    description: "Question-answer records with expected answers, rubric signals, difficulty, and factual grounding.",
    defaultType: "qa",
    recommendedFormats: ["jsonl", "csv", "markdown"],
    bestFor: ["model evaluation", "benchmarking", "golden sets", "regression testing"],
    fields: [
      field("question", "string", "A precise evaluation question.", true, "Which mitigation should be used when duplicate records reduce retrieval diversity?"),
      field("expected_answer", "string", "The answer a model should produce.", true, "Apply semantic deduplication and retain records with stronger source support."),
      field("rubric", "string", "Criteria used to grade the model response.", true, "Must mention deduplication, source support, and diversity impact."),
      field("difficulty", "string", "Difficulty band.", true, "hard"),
      field("failure_modes", "array", "Likely wrong answers to catch.", false, "hallucinated metric, unsupported source claim"),
      field("source_url", "string", "Evidence source URL.", true, "https://example.com/source"),
      field("confidence", "number", "Grounding confidence.", true, "0.88"),
    ],
    recordInstructions: [
      "Write questions that test reasoning, not memorization only.",
      "Include rubric-like constraints in the answer or context.",
      "Add failure-mode awareness when the source contains ambiguity.",
    ],
    qualityBar: [
      "Question has one defensible expected answer.",
      "Rubric exposes what a weak model would miss.",
      "Evidence is visible in context.",
    ],
  }),

  // Sales-enablement objection/response pairs.
  blueprint({
    id: "b2b-saas-objections",
    label: "B2B SaaS Objection Handling",
    description: "Sales-training records for pricing, procurement, ROI, onboarding, integration, security, and competitive objections.",
    defaultType: "instruction",
    recommendedFormats: ["instruction", "jsonl", "csv", "markdown"],
    bestFor: ["sales enablement", "roleplay training", "support coaching", "fine-tuning"],
    fields: [
      field("buyer_role", "string", "The buyer or stakeholder persona.", true, "CFO"),
      field("company_segment", "string", "Customer segment or buying context.", true, "mid-market SaaS"),
      field("objection_category", "string", "Primary objection class.", true, "pricing"),
      field("objection", "string", "The exact buyer objection.", true, "This is too expensive compared to our current tool."),
      field("recommended_response", "string", "The ideal grounded response.", true, "Tie cost to pipeline leakage and propose a measured pilot."),
      field("proof_point", "string", "Evidence or reasoning used in the response.", false, "integration time and support burden"),
      field("follow_up_question", "string", "A next-step question that advances discovery.", true, "What cost are you currently assigning to delayed handoffs?"),
    ],
    recordInstructions: [
      "Make objections sound like real buyers, not canned sales scripts.",
      "Answer with empathy, evidence, and a concrete next step.",
      "Vary buyer role, deal stage, company size, and risk profile.",
    ],
    qualityBar: [
      "Response addresses the objection directly.",
      "No fake statistics or logos.",
      "Follow-up question is useful in a real sales call.",
    ],
  }),

  // Developer-documentation question/answer records.
  blueprint({
    id: "technical-docs-qa",
    label: "Technical Documentation QA",
    description: "Developer-facing questions, answers, commands, prerequisites, errors, and implementation details grounded in docs or repos.",
    defaultType: "qa",
    recommendedFormats: ["jsonl", "csv", "markdown", "rag"],
    bestFor: ["developer docs", "support bots", "SDK QA", "RAG corpora"],
    fields: [
      field("product_area", "string", "The API, package, CLI, or feature area.", true, "CLI authentication"),
      field("question", "string", "Developer question or task.", true, "How do I authenticate the CLI?"),
      field("answer", "string", "Specific answer grounded in documentation.", true, "Run npx alys-akusa login and finish the browser flow."),
      field("code_or_command", "string", "Relevant command, code, or config.", false, "npx alys-akusa login"),
      field("prerequisites", "array", "Required setup before this works.", false, "Alys account, browser access"),
      field("common_error", "string", "Likely failure mode or troubleshooting note.", false, "Expired CLI login session."),
      field("source_url", "string", "Documentation or repository URL.", true, "https://example.com/docs"),
    ],
    recordInstructions: [
      "Prefer commands, parameters, return shapes, limits, and edge cases.",
      "Never invent API names or package names.",
      "If docs are ambiguous, make the uncertainty visible.",
    ],
    qualityBar: [
      "Answer can be executed or verified.",
      "No fake APIs.",
      "Source context includes the relevant command or behavior.",
    ],
  }),

  // Compliance-oriented retrieval records.
  blueprint({
    id: "legal-compliance-rag",
    label: "Legal & Compliance RAG Corpus",
    description: "Compliance-aware chunks and QA with jurisdictions, requirements, controls, exceptions, and evidence notes.",
    defaultType: "rag",
    recommendedFormats: ["rag", "jsonl", "csv", "markdown"],
    bestFor: ["compliance search", "policy QA", "audit prep", "legal retrieval"],
    fields: [
      field("jurisdiction", "string", "Relevant jurisdiction or regulatory scope.", false, "United States"),
      field("requirement", "string", "The rule, obligation, or policy requirement.", true, "Maintain access review evidence."),
      field("control_or_action", "string", "Concrete action, control, or procedure.", true, "Review user access quarterly and document exceptions."),
      field("exception_or_limit", "string", "Boundary, exception, or uncertainty.", false, "Frequency may vary by framework."),
      field("evidence_text", "string", "Source-backed evidence span.", true, "periodic access reviews"),
      field("source_url", "string", "Canonical source URL.", true, "https://example.com/policy"),
      field("confidence", "number", "Confidence after source checks.", true, "0.84"),
    ],
    recordInstructions: [
      "Preserve scope and limitations.",
      "Do not convert legal text into absolute advice when the source is conditional.",
      "Separate requirement, control, and exception clearly.",
    ],
    qualityBar: [
      "Jurisdiction/scope is not blurred.",
      "No invented legal conclusions.",
      "Evidence text supports the answer.",
    ],
  }),

  // Operational / telemetry records for engineering systems.
  blueprint({
    id: "engineering-telemetry",
    label: "Engineering Telemetry & Operations",
    description: "Operational records for engineering systems: parameters, units, ranges, anomalies, recommendations, and safety constraints.",
    defaultType: "instruction",
    recommendedFormats: ["jsonl", "csv", "instruction", "markdown"],
    bestFor: ["industrial AI", "predictive maintenance", "ops training", "engineering assistants"],
    fields: [
      field("asset_type", "string", "Equipment, system, or asset class.", true, "oil pipeline pump station"),
      field("operation_phase", "string", "Workflow or operating phase.", true, "commissioning"),
      field("parameter", "string", "Observed parameter or measurement.", true, "pressure differential"),
      field("unit", "string", "Engineering unit.", false, "psi"),
      field("normal_range", "string", "Source-backed or cautiously inferred expected range.", false, "site-specific; verify against design docs"),
      field("abnormal_signal", "string", "Failure pattern, warning, or anomaly.", true, "rising vibration with falling flow rate"),
      field("recommended_action", "string", "Operationally safe next action.", true, "inspect pump seals and verify sensor calibration"),
      field("safety_note", "string", "Boundary or caution.", true, "do not exceed site operating procedures"),
    ],
    recordInstructions: [
      "Keep units, ranges, and recommendations physically plausible.",
      "If numeric ranges are not in source context, say they require site-specific verification.",
      "Include normal, abnormal, edge, and incident scenarios.",
    ],
    qualityBar: [
      "No invented unsafe operating limits.",
      "Action follows from the signal.",
      "Record includes constraints and uncertainty.",
    ],
  }),

  // Catalog of public datasets, repositories and benchmarks.
  blueprint({
    id: "dataset-source-catalog",
    label: "Dataset Source Catalog",
    description: "Catalog records for public datasets, repositories, benchmark corpora, licenses, schemas, and use constraints.",
    defaultType: "qa",
    recommendedFormats: ["jsonl", "csv", "markdown"],
    bestFor: ["dataset discovery", "benchmark planning", "source audits", "training-data procurement"],
    fields: [
      field("source_name", "string", "Dataset/repository/source name.", true, "SDV benchmark dataset"),
      field("source_url", "string", "Canonical URL.", true, "https://github.com/sdv-dev/SDV"),
      field("source_type", "string", "Dataset, repository, benchmark, paper, documentation, or registry.", true, "repository"),
      field("domain", "string", "Domain or category.", false, "synthetic data"),
      field("available_formats", "array", "Known available formats.", false, "csv, jsonl, parquet"),
      field("license_or_terms", "string", "License or usage constraints when available.", false, "MIT"),
      field("schema_summary", "string", "Short description of fields/tables/tasks.", true, "Benchmark suite for tabular synthetic data."),
      field("trust_reason", "string", "Why this source is credible or useful.", true, "Primary GitHub repository with active documentation."),
    ],
    recordInstructions: [
      "Favor GitHub, Kaggle, Hugging Face datasets, official benchmark pages, and primary repositories.",
      "Do not claim license, row count, or schema details unless visible in source context.",
      "Make the catalog immediately usable for dataset selection.",
    ],
    qualityBar: [
      "Every row points to a real source URL.",
      "License/format claims are source-backed or marked unknown.",
      "Trust reason is explicit.",
    ],
  }),
];
6485
+ function buildDatasetGenerationPlan(topic, datasetType) {
6132
6486
  const normalized = topic.trim().replace(/\s+/g, " ");
6133
6487
  const lower = normalized.toLowerCase();
6134
6488
  const segments = knownSegments(lower, normalized) ?? genericSegments(normalized);
6489
+ const blueprint2 = selectDatasetBlueprint(normalized, datasetType);
6135
6490
  return {
6136
6491
  topic: normalized,
6137
6492
  intent: inferIntent(lower),
6493
+ blueprint: blueprint2,
6138
6494
  segments,
6139
6495
  sourceCriteria: [
6140
6496
  "Prefer official documentation, standards bodies, academic papers, technical reports, government sources, and primary company docs.",
@@ -6149,12 +6505,41 @@ function buildDatasetGenerationPlan(topic) {
6149
6505
  "Repeated phrasing across records indicates mode collapse and must be suppressed."
6150
6506
  ],
6151
6507
  outputSchemaNotes: [
6508
+ `Use the "${blueprint2.label}" blueprint (${blueprint2.id}) as the canonical row contract.`,
6509
+ `Recommended exports: ${blueprint2.recommendedFormats.join(", ")}.`,
6510
+ ...blueprint2.fields.map((item) => `${item.required ? "Required" : "Optional"} field: ${item.name} (${item.type}) - ${item.description}`),
6152
6511
  "Each record needs an actionable input, specific output, source context, source URL, confidence, tags, and metadata.",
6153
6512
  "Metadata should include source trust, source authority, source relevance, segment ID, quality signals, and benchmark notes.",
6154
6513
  "Records should be useful without the original source page while still preserving provenance."
6155
6514
  ]
6156
6515
  };
6157
6516
  }
/**
 * Pick the dataset blueprint that best fits a topic.
 *
 * Topic keywords are checked first, in a fixed priority order; when none match,
 * the explicit `datasetType` decides ("rag" -> rag-corpus, "qa" -> evaluation-qa,
 * anything else -> instruction-finetune-corpus). If the chosen id is not present
 * in the catalog, the first catalog entry is returned.
 *
 * Keywords are matched on word boundaries rather than as raw substrings, so
 * "retrieval" no longer triggers the "eval" rule and short tokens such as
 * "api", "cli", "rag", "oil" or "gas" do not fire inside unrelated words
 * ("therapist", "climate", "storage", "soil").
 *
 * @param {string} topic Normalized topic text.
 * @param {string} [datasetType] Requested record type, used only as a fallback.
 * @returns {object} The selected blueprint.
 */
function selectDatasetBlueprint(topic, datasetType) {
  const lower = topic.toLowerCase();
  const escapeForRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Whole-word match (an optional plural "s" is allowed): for short, ambiguous tokens.
  const hasWord = (...words) => words.some(
    (word) => new RegExp(`(?<![a-z0-9])${escapeForRegExp(word)}s?(?![a-z0-9])`).test(lower)
  );
  // Word-start match: the term may be followed by a suffix ("objections", "evaluating").
  const hasStem = (...stems) => stems.some(
    (stem) => new RegExp(`(?<![a-z0-9])${escapeForRegExp(stem)}`).test(lower)
  );
  const selected = (() => {
    if ((hasWord("b2b") || hasStem("sales")) && hasStem("objection")) return byId("b2b-saas-objections");
    if (hasStem("legal", "compliance", "soc 2", "hipaa")) return byId("legal-compliance-rag");
    if (hasWord("oil", "gas") || hasStem("oilfield", "pipeline", "drilling", "telemetry", "maintenance")) return byId("engineering-telemetry");
    if (hasStem("documentation", "developer", "openapi") || hasWord("api", "sdk", "cli")) return byId("technical-docs-qa");
    if (hasStem("benchmark", "evaluat") || hasWord("eval")) return byId("evaluation-qa");
    if (hasStem("source catalog", "public dataset", "kaggle", "github dataset")) return byId("dataset-source-catalog");
    if (hasWord("rag") || hasStem("retrieval", "knowledge base")) return byId("rag-corpus");
    return datasetType === "rag" ? byId("rag-corpus") : datasetType === "qa" ? byId("evaluation-qa") : byId("instruction-finetune-corpus");
  })();
  return selected ?? DATASET_BLUEPRINTS[0];
}
/**
 * Look up a blueprint in DATASET_BLUEPRINTS by its `id`.
 *
 * @param {string} id Blueprint identifier.
 * @returns {object|undefined} The first entry whose id matches, or undefined.
 */
function byId(id) {
  for (const candidate of DATASET_BLUEPRINTS) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return undefined;
}
/**
 * Build a blueprint record from `input`, filling in version "1.0.0" when the
 * input does not carry one (null/undefined). The input object is not modified.
 *
 * @param {object} input Blueprint properties.
 * @returns {object} A shallow copy of `input` with `version` guaranteed.
 */
function blueprint(input) {
  const result = { ...input };
  result.version = input.version ?? "1.0.0";
  return result;
}
/**
 * Build a field descriptor for a blueprint.
 *
 * @param {string} name Field name.
 * @param {string} type Field type label.
 * @param {string} description What the field holds.
 * @param {boolean} required Whether the field is mandatory.
 * @param {string} example Sample value.
 * @returns {{name: string, type: string, description: string, required: boolean, example: string}}
 */
function field(name, type, description, required, example) {
  const descriptor = {};
  descriptor.name = name;
  descriptor.type = type;
  descriptor.description = description;
  descriptor.required = required;
  descriptor.example = example;
  return descriptor;
}
6158
6543
  function segmentForSource(plan, text, index = 0) {
6159
6544
  const haystack = text.toLowerCase();
6160
6545
  const scored = plan.segments.map((segment2) => {
@@ -6267,9 +6652,12 @@ Your job is to generate production-grade synthetic dataset records that can surv
6267
6652
  Hard rules:
6268
6653
  - Return only valid JSON matching the requested schema.
6269
6654
  - Each record must be meaningfully different.
6655
+ - Every record must be grounded in the provided source context.
6656
+ - Do not introduce facts, numbers, names, benchmarks, URLs, citations, or claims that are not present in the source context.
6270
6657
  - Preserve plausible domain physics, operational constraints, and realistic terminology.
6271
6658
  - Prefer specific scenarios, values, failure modes, and edge cases over generic prose.
6272
- - Do not include fake citations. Source context is provenance, not proof of factual truth.
6659
+ - If the source context is thin, produce cautious records about what can be inferred and what needs verification.
6660
+ - Do not include fake citations. Source URL and source title are attached outside your JSON.
6273
6661
  - Never output placeholders, TODOs, markdown fences, or explanations.
6274
6662
  `.trim();
6275
6663
  function loadAlysEnv(cwd = process.cwd()) {
@@ -6361,6 +6749,9 @@ Active topic segment:
6361
6749
  Generation plan intent:
6362
6750
  ${options.generationPlan.intent}
6363
6751
 
6752
+ Dataset blueprint:
6753
+ ${renderBlueprintForPrompt(options.generationPlan)}
6754
+
6364
6755
  Required source criteria:
6365
6756
  ${options.generationPlan.sourceCriteria.map((item) => `- ${item}`).join("\n")}
6366
6757
 
@@ -6394,6 +6785,9 @@ ${segmentBlock}
6394
6785
  Generate exactly ${targetCount} records.
6395
6786
 
6396
6787
  Domain quality requirements:
6788
+ - Ground every record in the source context below. If a detail is not in the context, do not invent it.
6789
+ - Use the dataset blueprint above as the semantic row contract. Do not waste tokens describing CSV/JSONL formatting; Alys handles export formatting.
6790
+ - Put the most important blueprint field values into input, output, and context. Add blueprint field names to metadata.signals or metadata.constraints when useful.
6397
6791
  - Include realistic parameters, edge cases, constraints, and operational variability.
6398
6792
  - For engineering domains, use physically plausible values and causal relationships.
6399
6793
  - For tabular/synthetic-data domains, preserve schema-like consistency and row-level diversity.
@@ -6401,6 +6795,7 @@ Domain quality requirements:
6401
6795
  - Avoid repeating the topic verbatim in every output.
6402
6796
  - Reflect source quality: lower-confidence sources should create cautious, verification-aware records.
6403
6797
  - If the source is low authority, write records as verification-aware training examples rather than confident factual claims.
6798
+ - Prefer source-backed dataset rows over broad advice. If the context only supports broad advice, lower confidence.
6404
6799
  - Every output should contain a specific decision signal, scenario, constraint, or useful answer. Reject generic summaries.
6405
6800
  - Prefer records that can be directly reused for fine-tuning, RAG evaluation, or QA benchmarks.
6406
6801
 
@@ -6425,8 +6820,29 @@ Return JSON with this shape:
6425
6820
 
6426
6821
  Source context:
6427
6822
  ${sourceText}
6823
+
6824
+ Final grounding rule:
6825
+ Only use information supported by Source title, Source URL, Source quality, and Source context above.
6428
6826
  `.trim();
6429
6827
  }
6828
/**
 * Render the dataset blueprint of a generation plan as plain text for the
 * record-generation prompt.
 *
 * The output lists the blueprint identity lines, one bullet per field, then
 * the record instructions and the quality bar, joined with newlines.
 *
 * @param {object} plan - Generation plan; `plan.blueprint` is read.
 * @returns {string} Prompt-ready blueprint description, or a short
 *   placeholder when the plan carries no blueprint.
 */
function renderBlueprintForPrompt(plan) {
  const blueprint2 = plan?.blueprint;
  // Other blueprint consumers in this file read the blueprint optionally, so a
  // plan without one must not crash prompt construction.
  if (!blueprint2 || typeof blueprint2 !== "object") {
    return "No dataset blueprint was provided for this plan.";
  }
  const listOf = (value) => Array.isArray(value) ? value : [];
  // An explicit presence check keeps falsy-but-valid examples such as 0 or false.
  const hasExample = (example) => example !== void 0 && example !== null && example !== "";
  const describeField = (field2) => {
    const requirement = field2.required ? ", required" : ", optional";
    const example = hasExample(field2.example) ? ` Example: ${field2.example}` : "";
    return `- ${field2.name} (${field2.type}${requirement}): ${field2.description}${example}`;
  };
  const bullet = (item) => `- ${item}`;
  return [
    `ID: ${blueprint2.id}`,
    `Label: ${blueprint2.label}`,
    `Version: ${blueprint2.version}`,
    `Purpose: ${blueprint2.description}`,
    `Best for: ${listOf(blueprint2.bestFor).join(", ")}`,
    "Fields:",
    ...listOf(blueprint2.fields).map(describeField),
    "Record instructions:",
    ...listOf(blueprint2.recordInstructions).map(bullet),
    "Quality bar:",
    ...listOf(blueprint2.qualityBar).map(bullet)
  ].join("\n");
}
6430
6846
  async function generateWithOpenAI(options) {
6431
6847
  const apiKey = getOpenAIKey();
6432
6848
  const model = process.env.ALYS_GENERATOR_MODEL || DEFAULT_OPENAI_MODEL;
@@ -6447,7 +6863,7 @@ async function generateWithOpenAI(options) {
6447
6863
  },
6448
6864
  body: JSON.stringify({
6449
6865
  model,
6450
- temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.45),
6866
+ temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.28),
6451
6867
  max_tokens: Math.min(32768, Math.max(1600, options.targetCount * 520)),
6452
6868
  messages: [
6453
6869
  { role: "system", content: ALYS_RECORD_SYSTEM_PROMPT },
@@ -6473,7 +6889,10 @@ async function generateWithOpenAI(options) {
6473
6889
  provider: "openai",
6474
6890
  model,
6475
6891
  latencyMs: Date.now() - startedAt,
6476
- records: parseProviderRecords(content, options.baselineConfidence, options.datasetType)
6892
+ records: groundProviderRecords(
6893
+ parseProviderRecords(content, options.baselineConfidence, options.datasetType),
6894
+ options.document
6895
+ )
6477
6896
  };
6478
6897
  }
6479
6898
  async function generateWithOpenAIBatched(options) {
@@ -6520,7 +6939,7 @@ async function generateWithGroq(options) {
6520
6939
  },
6521
6940
  body: JSON.stringify({
6522
6941
  model,
6523
- temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.35),
6942
+ temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.25),
6524
6943
  max_tokens: Math.min(8192, Math.max(1600, options.targetCount * 520)),
6525
6944
  messages: [
6526
6945
  { role: "system", content: ALYS_RECORD_SYSTEM_PROMPT },
@@ -6539,7 +6958,10 @@ async function generateWithGroq(options) {
6539
6958
  provider: "groq",
6540
6959
  model,
6541
6960
  latencyMs: Date.now() - startedAt,
6542
- records: parseProviderRecords(content, options.baselineConfidence, options.datasetType)
6961
+ records: groundProviderRecords(
6962
+ parseProviderRecords(content, options.baselineConfidence, options.datasetType),
6963
+ options.document
6964
+ )
6543
6965
  };
6544
6966
  }
6545
6967
  async function generateWithGroqBatched(options) {
@@ -6649,6 +7071,78 @@ function normalizeMetadata(value) {
6649
7071
  benchmark_notes: Array.isArray(obj.benchmark_notes) ? obj.benchmark_notes.map(String).slice(0, 12) : []
6650
7072
  };
6651
7073
  }
7074
/**
 * Keep only provider-generated records whose text overlaps with the source
 * document, and scale their confidence by the measured overlap.
 *
 * A record's grounding score is the number of its meaningful tokens that also
 * occur in the source, divided by the smaller of the two token-set sizes.
 * Records below the minimum score are dropped. Accepted records receive
 * `source-grounding:` / `source-url:` signals and an acceptance constraint.
 *
 * @param {Array<object>} records - Parsed provider records with `input`,
 *   `output`, `context`, `confidence` and `metadata`.
 * @param {object} document - Source document; `text` and `url` are read.
 * @returns {Array<object>} New record objects for the accepted records. Empty
 *   when the source yields fewer than 8 meaningful tokens.
 */
function groundProviderRecords(records, document) {
  const MAX_LIST_ENTRIES = 12;
  // A document without usable text cannot ground anything; treat it as empty
  // instead of throwing inside the tokenizer.
  const sourceText = typeof document?.text === "string" ? document.text : "";
  const sourceTokens = meaningfulTokens(sourceText);
  if (sourceTokens.size < 8) return [];
  const sourceIsFallback = /fallback reason:/i.test(sourceText);
  // Sources marked with a fallback reason get a lower acceptance threshold.
  const minimumGrounding = sourceIsFallback ? 0.025 : 0.04;

  // Merge and de-duplicate, capping at MAX_LIST_ENTRIES. When the cap is
  // exceeded, older entries are trimmed so the reserved grounding markers are
  // never the ones cut off.
  const mergeCapped = (existing, reserved) => {
    const current = Array.isArray(existing) ? existing : [];
    const merged = [...new Set([...current, ...reserved])];
    if (merged.length <= MAX_LIST_ENTRIES) return merged;
    const reservedSet = new Set(reserved);
    const kept = merged.filter((entry) => !reservedSet.has(entry));
    return [...kept.slice(0, Math.max(0, MAX_LIST_ENTRIES - reservedSet.size)), ...reservedSet];
  };

  return records.flatMap((record) => {
    const recordTokens = meaningfulTokens(`${record.input} ${record.output} ${record.context}`);
    if (!recordTokens.size) return [];
    let overlap = 0;
    for (const token of recordTokens) {
      if (sourceTokens.has(token)) overlap += 1;
    }
    const groundingScore = overlap / Math.max(1, Math.min(recordTokens.size, sourceTokens.size));
    if (groundingScore < minimumGrounding) return [];
    // Multiplier ranges from 0.76 (no overlap) to 1.0 (score >= 0.1).
    const confidence = clamp012(record.confidence * (0.76 + Math.min(0.24, groundingScore * 2.4)));
    return [{
      ...record,
      confidence,
      metadata: {
        ...record.metadata,
        signals: mergeCapped(record.metadata?.signals, [
          `source-grounding:${groundingScore.toFixed(3)}`,
          `source-url:${document.url}`
        ]),
        constraints: mergeCapped(record.metadata?.constraints, [
          "accepted-after-source-grounding-check"
        ])
      }
    }];
  });
}
7107
// Common words excluded from grounding comparisons. Built once rather than on
// every call.
const MEANINGFUL_TOKEN_STOPWORDS = /* @__PURE__ */ new Set([
  "about",
  "after",
  "also",
  "because",
  "before",
  "being",
  "between",
  "could",
  "dataset",
  "during",
  "every",
  "from",
  "have",
  "into",
  "more",
  "only",
  "other",
  "should",
  "source",
  "that",
  "their",
  "there",
  "these",
  "this",
  "through",
  "using",
  "what",
  "when",
  "where",
  "which",
  "with",
  "would"
]);
/**
 * Extract the set of distinct, lower-cased "meaningful" tokens from a text.
 *
 * Tokens start with a letter or digit and may contain `.`, `_`, `/` and `-`.
 * Leading/trailing punctuation from that set is stripped, tokens shorter than
 * four characters and stopwords are discarded, and at most the first 900
 * surviving tokens (before de-duplication) are considered.
 *
 * @param {string} value - Text to tokenize; `null`/`undefined` yield an empty
 *   set and other non-strings are stringified.
 * @returns {Set<string>} Distinct tokens.
 */
function meaningfulTokens(value) {
  const text = typeof value === "string" ? value : String(value ?? "");
  const rawTokens = text.toLowerCase().match(/[a-z0-9][a-z0-9._/-]{2,}/g) ?? [];
  const tokens = rawTokens
    .map((token) => token.replace(/^[._/-]+|[._/-]+$/g, ""))
    .filter((token) => token.length >= 4 && !MEANINGFUL_TOKEN_STOPWORDS.has(token))
    .slice(0, 900);
  return new Set(tokens);
}
6652
7146
  function safeJson(text) {
6653
7147
  const trimmed = text.trim();
6654
7148
  try {
@@ -6777,6 +7271,7 @@ var StructuringAgent = class {
6777
7271
  if (trustScore < (options.minTrustScore ?? 0.42) || relevanceScore < (options.minRelevanceScore ?? 0.24)) return [];
6778
7272
  const sourceWeight = sourceQualityWeight(document);
6779
7273
  const segment2 = options.generationPlan ? segmentForSource(options.generationPlan, `${document.title} ${document.text}`, index) : void 0;
7274
+ const blueprint2 = options.generationPlan?.blueprint;
6780
7275
  const baselineConfidence = clamp013((finding?.confidence ?? 0.7) * 0.55 + trustScore * 0.22 + authorityScore * 0.12 + relevanceScore * 0.11 - duplicationRisk * 0.08);
6781
7276
  const baseId = import_node_crypto3.default.createHash("sha1").update(`${topic}:${document.url}:${datasetType}`).digest("hex").slice(0, 14);
6782
7277
  const providerTarget = useProvider ? weightedRecordTarget(recordsPerDocument, options.providerRecordsPerDocument ?? recordsPerDocument, sourceWeight) : 0;
@@ -6809,6 +7304,10 @@ var StructuringAgent = class {
6809
7304
  ...g.metadata,
6810
7305
  topic,
6811
7306
  kind: datasetType,
7307
+ blueprint_id: blueprint2?.id,
7308
+ blueprint_label: blueprint2?.label,
7309
+ blueprint_version: blueprint2?.version,
7310
+ blueprint_fields: blueprint2?.fields.map((field2) => field2.name),
6812
7311
  segment_id: segment2?.id,
6813
7312
  segment_label: segment2?.label,
6814
7313
  provider: providerResult.provider,
@@ -6936,6 +7435,11 @@ function toCsv(records) {
6936
7435
  "source",
6937
7436
  "source_url",
6938
7437
  "confidence",
7438
+ "blueprint_id",
7439
+ "segment_id",
7440
+ "source_trust_score",
7441
+ "source_authority_score",
7442
+ "source_relevance_score",
6939
7443
  "tags",
6940
7444
  "metadata",
6941
7445
  "created_at"
@@ -6948,12 +7452,24 @@ function toCsv(records) {
6948
7452
  record.source,
6949
7453
  record.source_url,
6950
7454
  String(record.confidence),
7455
+ metadataString(record, "blueprint_id"),
7456
+ metadataString(record, "segment_id"),
7457
+ metadataString(record, "source_trust_score"),
7458
+ metadataString(record, "source_authority_score"),
7459
+ metadataString(record, "source_relevance_score"),
6951
7460
  record.tags.join("|"),
6952
7461
  JSON.stringify(record.metadata),
6953
7462
  record.created_at
6954
7463
  ]);
6955
7464
  return [header, ...rows].map((row) => row.map(escapeCsv).join(",")).join("\n") + "\n";
6956
7465
  }
7466
/**
 * Read one metadata entry of a record as a string suitable for a CSV cell.
 *
 * @param {object} record - Dataset record; `record.metadata[key]` is read.
 * @param {string} key - Metadata key to look up.
 * @returns {string} `""` for missing values, the value itself for strings,
 *   `String(value)` for numbers/booleans/bigints, and JSON for everything else.
 */
function metadataString(record, key) {
  // Records without a metadata object produce an empty cell instead of throwing.
  const value = record?.metadata?.[key];
  if (value === null || value === void 0) return "";
  switch (typeof value) {
    case "string":
      return value;
    case "number":
    case "boolean":
    case "bigint": // JSON.stringify throws on bigint
      return String(value);
    default:
      // JSON.stringify returns undefined for functions and symbols.
      return JSON.stringify(value) ?? "";
  }
}
6957
7473
  function toMarkdown(records) {
6958
7474
  return records.map((record) => {
6959
7475
  const title = record.input || record.id;
@@ -7089,20 +7605,26 @@ function performanceConfig(mode) {
7089
7605
  }
7090
7606
  function gateSources(sources, mode) {
7091
7607
  const perf = performanceConfig(mode);
7092
- const accepted = sources.filter((source) => {
7608
+ const concreteSources = sources.filter(isConcreteEvidenceSource);
7609
+ const accepted = concreteSources.filter((source) => {
7093
7610
  const trust = source.trustScore ?? source.score;
7094
7611
  const relevance = source.relevanceScore ?? source.score;
7095
7612
  const duplicateRisk = source.duplicationRisk ?? 0;
7096
7613
  const authority = source.authorityScore ?? 0.5;
7097
7614
  return trust >= perf.minTrustScore && relevance >= perf.minRelevanceScore && duplicateRisk < 0.72 && (trust >= 0.52 || authority >= 0.72);
7098
7615
  });
7099
- const minimum = Math.min(sources.length, Math.max(3, Math.ceil(sources.length * perf.gateMinimumRatio)));
7100
- const fallback = accepted.length >= minimum ? accepted : sources.slice(0, minimum);
7616
+ const minimum = Math.min(concreteSources.length, Math.max(3, Math.ceil(concreteSources.length * perf.gateMinimumRatio)));
7617
+ const fallback = accepted.length >= minimum ? accepted : accepted.length ? accepted : concreteSources.filter((source) => (source.authorityScore ?? 0) >= 0.78 && (source.relevanceScore ?? source.score) >= perf.minRelevanceScore).slice(0, minimum);
7101
7618
  return {
7102
7619
  sources: fallback,
7103
7620
  filtered: Math.max(0, sources.length - fallback.length)
7104
7621
  };
7105
7622
  }
7623
/**
 * Tell whether a source came from a real discovery provider rather than a
 * heuristic generator.
 *
 * Setting the environment variable `ALYS_ALLOW_HEURISTIC_GENERATION` to
 * `"true"` disables the check and accepts every source.
 *
 * @param {object} source - Source entry; `provider` is read, falling back to
 *   `discoveredBy`.
 * @returns {boolean} `false` when the provider name contains "heuristic"
 *   (case-insensitive), `true` otherwise.
 */
function isConcreteEvidenceSource(source) {
  if (process.env.ALYS_ALLOW_HEURISTIC_GENERATION === "true") return true;
  // String() guards against non-string provider values, which would otherwise
  // throw on toLowerCase().
  const provider = String(source?.provider || source?.discoveredBy || "").toLowerCase();
  // "local-heuristic" is already covered by the substring test.
  return !provider.includes("heuristic");
}
7106
7628
  function sourceDiversityScore(sources) {
7107
7629
  if (!sources.length) return 0;
7108
7630
  const domains = new Set(sources.map((source) => source.domain || domainFromUrl4(source.url)));
@@ -7129,7 +7651,7 @@ async function generateDataset(options) {
7129
7651
  const targetRows = Math.max(1, Math.floor(options.targetRows ?? 100));
7130
7652
  const datasetId = import_node_crypto4.default.createHash("sha1").update(`${options.topic}:${Date.now()}`).digest("hex").slice(0, 12);
7131
7653
  const workspace = await ensureAlysWorkspace(options.workspaceRoot);
7132
- const generationPlan = buildDatasetGenerationPlan(options.topic);
7654
+ const generationPlan = buildDatasetGenerationPlan(options.topic, datasetType);
7133
7655
  const expandedQueries = planQueries(generationPlan).slice(0, perf.queryCap);
7134
7656
  const discoveryEnabledSeed = options.discoverySeed ?? 0;
7135
7657
  const verificationEnabled = options.verificationEnabled ?? true;
@@ -7154,6 +7676,9 @@ async function generateDataset(options) {
7154
7676
  message: `${gated.filtered} low-trust or low-relevance sources filtered`,
7155
7677
  metric: `${gated.sources.length} accepted`
7156
7678
  });
7679
+ if (!gated.sources.length) {
7680
+ throw new Error("ALYS_NO_TRUSTED_SOURCES");
7681
+ }
7157
7682
  const extraction = new ExtractionAgent();
7158
7683
  event(options.onEvent, { stage: "extraction", agent: extraction.name, status: "running", message: "Extracting source text..." });
7159
7684
  const extracted = await extraction.run(gated.sources);
@@ -7222,6 +7747,9 @@ async function generateDataset(options) {
7222
7747
  }
7223
7748
  });
7224
7749
  event(options.onEvent, { stage: "structuring", agent: structuring.name, status: "success", message: `${structured.length} candidate records generated`, metric: `${targetRows} target` });
7750
+ if (!structured.length) {
7751
+ throw new Error("ALYS_NO_GROUNDED_RECORDS");
7752
+ }
7225
7753
  const curator = new DatasetCuratorAgent();
7226
7754
  event(options.onEvent, { stage: "curation", agent: curator.name, status: "running", message: "Curating final dataset..." });
7227
7755
  const records = curator.run(structured, targetRows).map((record) => ({
@@ -7296,8 +7824,14 @@ async function generateDataset(options) {
7296
7824
  `));
7297
7825
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "metrics.json", `${JSON.stringify(qualityMetrics, null, 2)}
7298
7826
  `));
7827
+ artifacts.push(await writeDatasetArtifact(workspace, datasetId, "schema.json", `${JSON.stringify(datasetSchema(manifest), null, 2)}
7828
+ `));
7829
+ artifacts.push(await writeDatasetArtifact(workspace, datasetId, "data-dictionary.md", renderDataDictionary(manifest)));
7299
7830
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "source-graph.json", `${JSON.stringify(research.graph, null, 2)}
7300
7831
  `));
7832
+ artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.json", `${JSON.stringify(sources, null, 2)}
7833
+ `));
7834
+ artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.md", renderSourcesMarkdown(options.topic, sources)));
7301
7835
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "generation-plan.json", `${JSON.stringify(generationPlan, null, 2)}
7302
7836
  `));
7303
7837
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "benchmark-report.json", `${JSON.stringify(evaluation, null, 2)}
@@ -7306,6 +7840,101 @@ async function generateDataset(options) {
7306
7840
  event(options.onEvent, { stage: "export", agent: "ArtifactStorage", status: "success", message: `Dataset written to ${manifest.outputDir}`, metric: manifest.outputDir });
7307
7841
  return { manifest, records, artifacts };
7308
7842
  }
7843
/**
 * Build the schema object written to `schema.json` for a dataset run.
 *
 * With a blueprint on the manifest's generation plan, the schema is derived
 * from the blueprint fields. Without one, a fixed generic record schema is
 * returned.
 *
 * @param {object} manifest - Dataset manifest; `generationPlan.blueprint` is read.
 * @returns {object} Plain, JSON-serializable schema object.
 */
function datasetSchema(manifest) {
  const blueprint2 = manifest.generationPlan?.blueprint;
  if (!blueprint2) {
    return {
      title: "Alys Dataset Record",
      type: "object",
      properties: {
        id: { type: "string" },
        input: { type: "string" },
        output: { type: "string" },
        context: { type: "string" },
        source_url: { type: "string" },
        confidence: { type: "number" }
      },
      required: ["id", "input", "output", "context", "source_url", "confidence"]
    };
  }
  const fields = Array.isArray(blueprint2.fields) ? blueprint2.fields : [];
  // An explicit presence check keeps falsy-but-valid examples such as 0 or false.
  const hasExample = (example) => example !== void 0 && example !== null && example !== "";
  const propertyFor = (field2) => {
    const property = { type: field2.type, description: field2.description };
    if (hasExample(field2.example)) property.examples = [field2.example];
    return property;
  };
  return {
    $schema: "https://json-schema.org/draft/2020-12/schema",
    title: blueprint2.label,
    description: blueprint2.description,
    blueprintId: blueprint2.id,
    blueprintVersion: blueprint2.version,
    recommendedFormats: blueprint2.recommendedFormats,
    type: "object",
    additionalProperties: false,
    required: fields.filter((field2) => field2.required).map((field2) => field2.name),
    properties: Object.fromEntries(fields.map((field2) => [field2.name, propertyFor(field2)]))
  };
}
7882
/**
 * Render the Markdown data dictionary (`data-dictionary.md`) for a dataset run.
 *
 * @param {object} manifest - Dataset manifest; `generationPlan.blueprint` is read.
 * @returns {string} Markdown document. A short notice is returned when the
 *   manifest carries no blueprint.
 */
function renderDataDictionary(manifest) {
  const blueprint2 = manifest.generationPlan?.blueprint;
  if (!blueprint2) {
    return "# Data Dictionary\n\nNo dataset blueprint was recorded for this run.\n";
  }
  const listOf = (value) => Array.isArray(value) ? value : [];
  // Table cells must stay on one line and must not contain raw pipes. String()
  // also covers non-string examples (numbers, booleans), which have no .replace().
  const cell = (value) => String(value ?? "").replace(/\r?\n/g, " ").replace(/\|/g, "\\|");
  const fieldRow = (field2) => `| \`${field2.name}\` | ${field2.type} | ${field2.required ? "yes" : "no"} | ${cell(field2.description)} | ${cell(field2.example)} |`;
  const bullet = (item) => `- ${item}`;
  const lines = [
    `# ${blueprint2.label} Data Dictionary`,
    "",
    blueprint2.description,
    "",
    `Blueprint: \`${blueprint2.id}@${blueprint2.version}\``,
    `Recommended formats: ${listOf(blueprint2.recommendedFormats).map((format) => `\`${format}\``).join(", ")}`,
    "",
    "## Fields",
    "",
    "| Field | Type | Required | Description | Example |",
    "| --- | --- | --- | --- | --- |",
    ...listOf(blueprint2.fields).map(fieldRow),
    "",
    "## Record Instructions",
    "",
    ...listOf(blueprint2.recordInstructions).map(bullet),
    "",
    "## Quality Bar",
    "",
    ...listOf(blueprint2.qualityBar).map(bullet),
    ""
  ];
  return `${lines.join("\n")}\n`;
}
7915
/**
 * Render the ranked source pool of a run as a numbered Markdown list
 * (`sources.md`).
 *
 * Each entry shows a link (or plain label when no URL is known) followed by
 * the optional domain, source type and trust percentage, then the optional
 * snippet and up to six quality signals.
 *
 * @param {string} topic - Dataset topic used in the heading.
 * @param {Array<object>} sources - Source entries to list.
 * @returns {string} Markdown document.
 */
function renderSourcesMarkdown(topic, sources) {
  const lines = [
    `# Sources for ${topic}`,
    "",
    "Alys grounds generated records in the ranked source pool below. Higher trust scores influence confidence and record acceptance.",
    ""
  ];
  for (const [index, source] of sources.entries()) {
    // Number.isFinite also rejects NaN, which would otherwise print "NaN%".
    const trust = Number.isFinite(source.trustScore) ? ` \xB7 trust ${Math.round(source.trustScore * 100)}%` : "";
    const type = source.sourceType ? ` \xB7 ${source.sourceType}` : "";
    const domain = source.domain ? ` \xB7 ${source.domain}` : "";
    // Sources may carry only a title or only a URL; never print "undefined".
    const label = source.title || source.domain || source.url || "source";
    const heading = source.url ? `[${label}](${source.url})` : label;
    lines.push(`${index + 1}. ${heading}${domain}${type}${trust}`);
    if (typeof source.snippet === "string" && source.snippet) lines.push(` ${source.snippet.replace(/\s+/g, " ").trim()}`);
    if (Array.isArray(source.qualitySignals) && source.qualitySignals.length) lines.push(` Signals: ${source.qualitySignals.slice(0, 6).join(", ")}`);
    lines.push("");
  }
  if (!sources.length) {
    lines.push("No sources were recorded for this run.");
    lines.push("");
  }
  return `${lines.join("\n")}\n`;
}
7309
7938
  function artifactFilename(format) {
7310
7939
  if (format === "markdown") return "dataset.md";
7311
7940
  if (format === "instruction") return "instruction.jsonl";
@@ -7450,6 +8079,13 @@ Limits:
7450
8079
  1 dataset = 1 generation
7451
8080
  max 5 datasets per run
7452
8081
  use --benchmark for local high-volume benchmark runs
8082
+
8083
+ Concepts:
8084
+ RAG chunks retrieval-ready records (for search or knowledge-base apps)
8085
+ Instruction tuning examples for fine-tuning (teaching model behavior)
8086
+ JSONL one JSON object per line (standard for ML pipelines)
8087
+ CSV spreadsheet-friendly rows (for review and analysis)
8088
+ Verification quality checks (confidence, repetition, schema validity)
7453
8089
  `);
7454
8090
  }
7455
8091
  function loadCliEnv(cwd = process.cwd()) {
@@ -7635,6 +8271,17 @@ function getEvaluation(dataset) {
7635
8271
  const evaluation = dataset.manifest.evaluation;
7636
8272
  return evaluation && typeof evaluation === "object" ? evaluation : {};
7637
8273
  }
8274
/**
 * Fetch the blueprint object stored under a dataset manifest's generation plan.
 *
 * @param {object} dataset - Dataset response entry with a `manifest`.
 * @returns {object} The blueprint when both the plan and the blueprint are
 *   truthy objects, otherwise a fresh empty object.
 */
function getBlueprint(dataset) {
  const isObjectLike = (candidate) => Boolean(candidate) && typeof candidate === "object";
  const plan = dataset.manifest.generationPlan;
  if (isObjectLike(plan) && isObjectLike(plan.blueprint)) {
    return plan.blueprint;
  }
  return {};
}
8280
/**
 * List the usable entries of a dataset manifest's `sourceManifest`.
 *
 * @param {object} dataset - Dataset response entry with a `manifest`.
 * @returns {Array<object>} Entries that are truthy objects and have a string
 *   `title` or a string `url`; an empty array when `sourceManifest` is not an
 *   array.
 */
function getSourceManifest(dataset) {
  const entries = dataset.manifest.sourceManifest;
  if (!Array.isArray(entries)) {
    return [];
  }
  const usable = [];
  for (const entry of entries) {
    if (!entry || typeof entry !== "object") continue;
    if (typeof entry.title === "string" || typeof entry.url === "string") {
      usable.push(entry);
    }
  }
  return usable;
}
7638
8285
  function printStage(code, status, label, metric) {
7639
8286
  const tint = status === "DONE" || status === "OK" ? "green" : status === "WARN" ? "yellow" : "cyan";
7640
8287
  const prefix = `${paint(`[${code.padEnd(4).slice(0, 4)}]`, "gray")} ${paint(status.padEnd(4), tint)}`;
@@ -7827,12 +8474,28 @@ function printGenerationSummary(response, workspaceRoot) {
7827
8474
  const records = Number(metrics.recordsGenerated ?? summary.recordsAccepted ?? 0);
7828
8475
  const sources = Number(metrics.sourcesDiscovered ?? 0);
7829
8476
  const confidenceValue = Number(metrics.averageConfidence ?? summary.averageConfidence ?? 0);
8477
+ const blueprint2 = getBlueprint(dataset);
7830
8478
  const outputDir = import_node_path4.default.join(root, "datasets", dataset.id);
7831
8479
  console.log(`${paint("\u2022", "yellow")} ${paint(dataset.id, "white")} ${formatInt(records)} records ${formatInt(sources)} sources ${formatPercent(confidenceValue)} confidence`);
7832
8480
  console.log(` ${truncate(dataset.topic, 110)}`);
8481
+ if (blueprint2.label || blueprint2.id) {
8482
+ console.log(` blueprint ${paint(blueprint2.label ?? blueprint2.id ?? "dataset blueprint", "white")}${paint(blueprint2.id ? ` (${blueprint2.id})` : "", "gray")}`);
8483
+ }
7833
8484
  console.log(` ${paint(outputDir, "cyan")}`);
7834
8485
  console.log(` quality ${formatPercent(Number(quality.citationCoverage ?? 0))} citations \xB7 ${formatPercent(Number(quality.recordUniqueness ?? 0))} unique \xB7 ${formatPercent(Number(quality.sourceDiversity ?? 0))} source diversity`);
7835
8486
  console.log(` suitability RAG ${formatScore(Number(suitability.ragSuitability ?? 0))} \xB7 tuning ${formatScore(Number(suitability.instructionTuning ?? 0))} \xB7 usefulness ${formatScore(Number(suitability.humanUsefulness ?? 0))}`);
8487
+ const topSources = getSourceManifest(dataset).slice(0, 5);
8488
+ if (topSources.length) {
8489
+ console.log(paint(" sources", "gray"));
8490
+ for (const source of topSources) {
8491
+ const label = source.title || source.domain || source.provider || "source";
8492
+ const trust = Number(source.trustScore ?? source.authorityScore ?? source.relevanceScore ?? 0);
8493
+ const type = source.sourceType ? ` ${source.sourceType}` : "";
8494
+ const score = trust > 0 ? ` ${formatPercent(trust)} trust` : "";
8495
+ console.log(` - ${truncate(label, 76)}${paint(`${type}${score}`, "gray")}`);
8496
+ if (source.url) console.log(` ${paint(source.url, "cyan")}`);
8497
+ }
8498
+ }
7836
8499
  const preview = previewRecord(dataset);
7837
8500
  if (preview) {
7838
8501
  console.log(paint(" preview", "gray"));
@@ -8017,11 +8680,20 @@ async function handleGenerate(args, command) {
8017
8680
  const datasetType = parseDatasetType(values.type) ?? (values.yes === true ? "instruction" : void 0) ?? (await (0, import_prompts3.default)({
8018
8681
  type: "select",
8019
8682
  name: "datasetType",
8020
- message: "Dataset type?",
8683
+ message: "Dataset type? (choose what the output should be used for)",
8021
8684
  choices: [
8022
- { title: "Instruction tuning", value: "instruction" },
8023
- { title: "RAG chunks", value: "rag" },
8024
- { title: "Question/Answer", value: "qa" }
8685
+ {
8686
+ title: "Instruction tuning (task + ideal answer examples)",
8687
+ value: "instruction"
8688
+ },
8689
+ {
8690
+ title: "RAG chunks (retrieval-ready context for search/knowledge bases)",
8691
+ value: "rag"
8692
+ },
8693
+ {
8694
+ title: "Question/Answer (direct QA pairs for evaluation or training)",
8695
+ value: "qa"
8696
+ }
8025
8697
  ]
8026
8698
  })).datasetType;
8027
8699
  const requestedDatasetCount = values.datasets ? Math.max(1, Math.floor(Number(values.datasets))) : values.yes === true ? 1 : (await (0, import_prompts3.default)({
@@ -8040,30 +8712,30 @@ async function handleGenerate(args, command) {
8040
8712
  const exportFormats = values.format ? parseFormats(values.format) : values.yes === true ? ["jsonl", "csv", "markdown"] : (await (0, import_prompts3.default)({
8041
8713
  type: "multiselect",
8042
8714
  name: "exportFormats",
8043
- message: "Output formats?",
8715
+ message: "Output formats? (you can select multiple)",
8044
8716
  choices: [
8045
- { title: "JSONL", value: "jsonl", selected: true },
8046
- { title: "CSV", value: "csv", selected: true },
8047
- { title: "Markdown", value: "markdown" },
8048
- { title: "Instruction dataset", value: "instruction" },
8049
- { title: "RAG chunks", value: "rag" }
8717
+ { title: "JSONL (one JSON record per line, best for ML pipelines)", value: "jsonl", selected: true },
8718
+ { title: "CSV (spreadsheet-friendly review format)", value: "csv", selected: true },
8719
+ { title: "Markdown (readable summary for humans)", value: "markdown" },
8720
+ { title: "Instruction dataset (fine-tuning JSONL)", value: "instruction" },
8721
+ { title: "RAG chunks (retrieval-ready JSONL)", value: "rag" }
8050
8722
  ],
8051
8723
  hint: "Use space to select multiple."
8052
8724
  })).exportFormats;
8053
8725
  const depth = parseDepth(values.depth) ?? (values.yes === true ? "medium" : void 0) ?? (await (0, import_prompts3.default)({
8054
8726
  type: "select",
8055
8727
  name: "depth",
8056
- message: "Research depth?",
8728
+ message: "Research depth? (more depth can improve coverage but costs time)",
8057
8729
  choices: [
8058
- { title: "Shallow", value: "shallow" },
8059
- { title: "Medium", value: "medium" },
8060
- { title: "Deep", value: "deep" }
8730
+ { title: "Shallow (fastest, smaller context)", value: "shallow" },
8731
+ { title: "Medium (balanced default)", value: "medium" },
8732
+ { title: "Deep (broader coverage, slower)", value: "deep" }
8061
8733
  ]
8062
8734
  })).depth;
8063
8735
  const sourceLimit = values.sources ? Math.min(maxSources, Math.max(1, Number(values.sources))) : values.yes === true ? benchmarkMode ? 48 : MAX_SOURCES_PER_RUN : (await (0, import_prompts3.default)({
8064
8736
  type: "number",
8065
8737
  name: "sourceLimit",
8066
- message: "How many sources?",
8738
+ message: "How many sources? (more sources can improve coverage but may slow the run)",
8067
8739
  initial: benchmarkMode ? 48 : MAX_SOURCES_PER_RUN,
8068
8740
  min: 1,
8069
8741
  max: maxSources
@@ -8071,7 +8743,7 @@ async function handleGenerate(args, command) {
8071
8743
  const targetRows = values.rows ? Math.min(maxRows, Math.max(1, Number(values.rows))) : values.yes === true ? benchmarkMode ? 5e3 : MAX_ROWS_PER_DATASET : (await (0, import_prompts3.default)({
8072
8744
  type: "number",
8073
8745
  name: "targetRows",
8074
- message: "Rows per dataset?",
8746
+ message: "Rows per dataset? (Alys aims for rows worth keeping, not raw volume)",
8075
8747
  initial: benchmarkMode ? 5e3 : MAX_ROWS_PER_DATASET,
8076
8748
  min: 1,
8077
8749
  max: maxRows
@@ -8086,7 +8758,7 @@ async function handleGenerate(args, command) {
8086
8758
  const verificationEnabled = values.verify === true ? true : values["no-verify"] === true ? false : values.yes === true ? performanceMode !== "fast" : (await (0, import_prompts3.default)({
8087
8759
  type: "toggle",
8088
8760
  name: "verificationEnabled",
8089
- message: "Enable verification checks?",
8761
+ message: "Enable verification checks? (slower, stricter about weak/repetitive records)",
8090
8762
  initial: performanceMode !== "fast",
8091
8763
  active: "Yes",
8092
8764
  inactive: "No"
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "alys-akusa",
3
- "version": "0.1.7",
3
+ "version": "0.1.8",
4
4
  "private": false,
5
5
  "description": "Alys local CLI runtime for autonomous dataset generation.",
6
6
  "license": "UNLICENSED",