alys-akusa 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.cjs +762 -76
  2. package/package.json +1 -1
package/dist/index.cjs CHANGED
@@ -5109,7 +5109,10 @@ async function discoverResearchSources(topic, options = {}) {
5109
5109
  };
5110
5110
  }
5111
5111
  function createConfiguredSearchProviders() {
5112
- const providers = [];
5112
+ const providers = [
5113
+ new GitHubSearchProvider(env("GITHUB_TOKEN")),
5114
+ new KaggleSearchProvider(env("KAGGLE_USERNAME"), env("KAGGLE_KEY"))
5115
+ ];
5113
5116
  const tavilyKey = env("TAVILY_API_KEY");
5114
5117
  if (tavilyKey) providers.push(new TavilySearchProvider(tavilyKey));
5115
5118
  const serpApiKey = env("SERPAPI_API_KEY");
@@ -5155,6 +5158,107 @@ var TavilySearchProvider = class {
5155
5158
  }));
5156
5159
  }
5157
5160
  };
5161
/**
 * Search provider that queries the GitHub repository search endpoint and
 * converts each repository hit into a research search result.
 *
 * A token is optional; when present it is sent as a Bearer credential.
 * NOTE(review): `fetchJson`, `asArray`, `firstString`, `firstNumber` and
 * `clamp01` are defined elsewhere in this file — their exact contracts
 * (e.g. what they return for a missing key) should be confirmed there.
 */
var GitHubSearchProvider = class {
  name = "github";

  /**
   * @param {string|null} [token] Optional GitHub token.
   */
  constructor(token = null) {
    this.token = token;
  }

  /**
   * Runs one repository search for `query`.
   *
   * @param {string} query Free-text topic; dataset-oriented keywords and
   *   `in:` qualifiers are appended before it is sent.
   * @param {{limit?: number, timeoutMs?: number}} [options] `limit` is capped
   *   at 20 and defaults to 10; `timeoutMs` is forwarded to `fetchJson`.
   * @returns {Promise<object[]>} One result per usable repository item.
   */
  async search(query, options = {}) {
    const pageSize = Math.min(20, options.limit ?? 10);
    const endpoint = new URL("https://api.github.com/search/repositories");
    const params = [
      ["q", `${query} dataset OR benchmark OR corpus OR csv OR jsonl in:name,description,readme`],
      ["sort", "stars"],
      ["order", "desc"],
      ["per_page", String(pageSize)]
    ];
    for (const [key, value] of params) {
      endpoint.searchParams.set(key, value);
    }

    const headers = {
      Accept: "application/vnd.github+json",
      "User-Agent": "AlysResearchBot/0.1",
      "X-GitHub-Api-Version": "2022-11-28"
    };
    if (this.token) {
      headers.Authorization = `Bearer ${this.token}`;
    }

    const payload = await fetchJson(endpoint.toString(), { headers }, options.timeoutMs);
    const results = [];
    for (const item of asArray(payload.items)) {
      // Skip anything that is not an object-shaped repository record.
      if (!item || typeof item !== "object") continue;
      const repo = item;
      const fullName = firstString(repo, ["full_name"]);
      const htmlUrl = firstString(repo, ["html_url"]);
      // A result needs both a name for the title and a link to be usable.
      if (!fullName || !htmlUrl) continue;

      const stars = firstNumber(repo, ["stargazers_count"]) ?? 0;
      const forks = firstNumber(repo, ["forks_count"]) ?? 0;
      const topics = Array.isArray(repo.topics) ? repo.topics.map(String).slice(0, 8) : [];
      let license = "";
      if (repo.license && typeof repo.license === "object") {
        license = firstString(repo.license, ["spdx_id", "name"]);
      }
      const description = firstString(repo, ["description"]);
      const language = firstString(repo, ["language"]);

      // Popularity dominates the score; license and topics add fixed bonuses.
      const starTerm = Math.log10(stars + 1) / 5 * 0.72;
      const forkTerm = Math.log10(forks + 1) / 5 * 0.16;
      const licenseBonus = license ? 0.08 : 0;
      const topicBonus = topics.length ? 0.04 : 0;
      const score = clamp01(starTerm + forkTerm + licenseBonus + topicBonus);

      const snippetParts = [];
      if (description) snippetParts.push(description);
      if (language) snippetParts.push(`Language: ${language}.`);
      if (license) snippetParts.push(`License: ${license}.`);
      if (topics.length) snippetParts.push(`Topics: ${topics.join(", ")}.`);
      snippetParts.push(`Stars: ${stars}. Forks: ${forks}.`);

      results.push({
        title: `GitHub: ${fullName}`,
        url: htmlUrl,
        snippet: snippetParts.join(" "),
        publishedAt: firstString(repo, ["updated_at", "pushed_at", "created_at"]),
        score,
        provider: this.name,
        query,
        raw: item
      });
    }
    return results;
  }
};
5211
/**
 * Search provider that lists Kaggle datasets matching a query and converts
 * each dataset entry into a research search result.
 *
 * Both a username and a key are required; without them `search` resolves to
 * an empty list instead of calling the API.
 * NOTE(review): `fetchJson`, `asArray`, `firstString`, `firstNumber` and
 * `clamp01` are defined elsewhere in this file — confirm their contracts there.
 */
var KaggleSearchProvider = class {
  /**
   * @param {string|null} [username] Kaggle account name used for Basic auth.
   * @param {string|null} [key] Kaggle API key used for Basic auth.
   */
  constructor(username = null, key = null) {
    this.username = username;
    this.key = key;
  }
  name = "kaggle";

  /**
   * Runs one dataset search for `query`.
   *
   * @param {string} query Search text sent as the `search` parameter.
   * @param {{limit?: number, timeoutMs?: number}} [options] `limit` is capped
   *   at 20 and defaults to 10; `timeoutMs` is forwarded to `fetchJson`.
   * @returns {Promise<object[]>} One result per dataset entry that has both a
   *   title and a resolvable dataset URL.
   */
  async search(query, options = {}) {
    if (!this.username || !this.key) {
      return [];
    }
    const url = new URL("https://www.kaggle.com/api/v1/datasets/list");
    url.searchParams.set("search", query);
    url.searchParams.set("sortBy", "hottest");
    url.searchParams.set("pageSize", String(Math.min(20, options.limit ?? 10)));
    const auth = Buffer.from(`${this.username}:${this.key}`).toString("base64");
    const payload = await fetchJson(url.toString(), {
      headers: {
        Accept: "application/json",
        Authorization: `Basic ${auth}`,
        "User-Agent": "AlysResearchBot/0.1"
      }
    }, options.timeoutMs);
    // The payload may be a bare array or an object wrapping the list.
    const items = Array.isArray(payload) ? payload : payload && typeof payload === "object" ? asArray(payload.datasets ?? payload.results) : [];
    return items.flatMap((item) => {
      if (!item || typeof item !== "object") return [];
      const object = item;
      // Only a dataset reference may be turned into a dataset URL. The owner
      // name identifies a person, not a dataset, so interpolating it into
      // /datasets/<...> would yield a link that does not point at the dataset.
      const ref = firstString(object, ["ref", "datasetRef"]);
      const title = firstString(object, ["title", "subtitle", "ref"]) || ref || firstString(object, ["ownerName"]);
      const datasetUrl = firstString(object, ["url"]) || (ref ? `https://www.kaggle.com/datasets/${ref}` : "");
      if (!title || !datasetUrl) return [];
      const votes = firstNumber(object, ["voteCount", "votes"]) ?? 0;
      const downloads = firstNumber(object, ["downloadCount", "downloads"]) ?? 0;
      const usability = firstNumber(object, ["usabilityRating"]) ?? 0;
      // Weighted blend of downloads, votes and usability plus a 0.1 base.
      const score = clamp01(Math.log10(downloads + 1) / 6 * 0.38 + Math.log10(votes + 1) / 5 * 0.24 + Math.min(1, usability) * 0.28 + 0.1);
      return [{
        title: `Kaggle: ${title}`,
        url: datasetUrl,
        snippet: [
          firstString(object, ["subtitle", "description"]),
          `Downloads: ${downloads}. Votes: ${votes}.`,
          usability ? `Usability: ${usability}.` : ""
        ].filter(Boolean).join(" "),
        publishedAt: firstString(object, ["lastUpdated", "creationDate"]),
        score,
        provider: this.name,
        query,
        raw: item
      }];
    });
  }
};
5158
5262
  var SerpApiSearchProvider = class {
5159
5263
  constructor(apiKey) {
5160
5264
  this.apiKey = apiKey;
@@ -5294,6 +5398,8 @@ function buildResearchQueries(topic, count = 5) {
5294
5398
  const normalized = topic.trim().replace(/\s+/g, " ");
5295
5399
  const facets = [
5296
5400
  normalized,
5401
+ `${normalized} dataset github kaggle benchmark`,
5402
+ `${normalized} public dataset csv jsonl parquet`,
5297
5403
  `${normalized} official documentation standards methodology`,
5298
5404
  `${normalized} research paper benchmark evaluation`,
5299
5405
  `${normalized} case study operational data`,
@@ -5318,8 +5424,9 @@ ${result.url}`);
5318
5424
  const freshnessScore = freshnessForDate(result.publishedAt);
5319
5425
  const duplicationRisk = clamp01(Math.max(0, (domainCounts.get(domain) ?? 1) - 1) * 0.12);
5320
5426
  const providerScore = normalizeProviderScore(result.score);
5427
+ const sourcePreference = sourcePreferenceScore(domain, result.url, result.provider);
5321
5428
  const trustScore = clamp01(
5322
- authority.score * 0.3 + relevanceScore * 0.27 + semanticScore * 0.18 + freshnessScore * 0.12 + providerScore * 0.08 + (1 - duplicationRisk) * 0.05
5429
+ authority.score * 0.3 + relevanceScore * 0.27 + semanticScore * 0.18 + freshnessScore * 0.1 + providerScore * 0.07 + sourcePreference * 0.05 + (1 - duplicationRisk) * 0.05
5323
5430
  );
5324
5431
  return {
5325
5432
  id: sourceId(result.url),
@@ -5339,7 +5446,12 @@ ${result.url}`);
5339
5446
  semanticScore: Number(semanticScore.toFixed(3)),
5340
5447
  trustScore: Number(trustScore.toFixed(3)),
5341
5448
  sourceType: authority.type,
5342
- qualitySignals: authority.signals
5449
+ qualitySignals: [
5450
+ ...authority.signals,
5451
+ ...sourcePreference >= 0.85 ? ["preferred-source-surface"] : [],
5452
+ ...result.provider === "github" ? ["github-repository-search"] : [],
5453
+ ...result.provider === "kaggle" ? ["kaggle-dataset-search"] : []
5454
+ ]
5343
5455
  };
5344
5456
  });
5345
5457
  }
@@ -5549,6 +5661,14 @@ function authorityForDomain(domain, url) {
5549
5661
  score = 0.78;
5550
5662
  type = "official";
5551
5663
  signals.push("official-documentation");
5664
+ } else if (host === "github.com") {
5665
+ score = 0.84;
5666
+ type = "code";
5667
+ signals.push("open-source-repository");
5668
+ } else if (host === "kaggle.com" || host.endsWith(".kaggle.com")) {
5669
+ score = 0.86;
5670
+ type = "dataset";
5671
+ signals.push("dataset-marketplace");
5552
5672
  } else if (host.includes("wikipedia.org")) {
5553
5673
  score = 0.62;
5554
5674
  type = "community";
@@ -5564,6 +5684,18 @@ function authorityForDomain(domain, url) {
5564
5684
  }
5565
5685
  return { score, type, signals };
5566
5686
  }
5687
/**
 * Returns a fixed preference weight for a search result based on where it
 * comes from. Rules are checked in order and the first match wins.
 *
 * @param {string} domain Host name of the result (no scheme or path).
 * @param {string} url Full result URL, used for path-based rules.
 * @param {string} [provider] Name of the search provider that produced it.
 * @returns {number} A weight between 0.24 and 0.96; 0.55 when no rule matches.
 */
function sourcePreferenceScore(domain, url, provider) {
  const host = String(domain ?? "").toLowerCase();
  const link = String(url ?? "");
  // Exact host or true subdomain, so look-alikes such as
  // "kaggle.com.evil.example" are not treated as the real site.
  const onDomain = (base) => host === base || host.endsWith(`.${base}`);

  if (provider === "kaggle" || onDomain("kaggle.com")) return 0.96;
  if (provider === "github" || host === "github.com") return 0.93;
  if (host.endsWith(".gov") || host.includes("nist.gov") || host.includes("sec.gov")) return 0.92;
  if (host.includes("arxiv.org") || host.endsWith(".edu") || host.includes("openalex.org")) return 0.88;
  // The dataset section lives in the URL path, never in the host name, so the
  // path has to be checked on the URL.
  if (onDomain("huggingface.co") && link.includes("/datasets")) return 0.88;
  if (host.includes("data.gov") || link.includes("/dataset")) return 0.84;
  if (host.includes("docs.") || link.includes("/docs/") || link.includes("/documentation/")) return 0.78;
  if (host.includes("reddit.") || host.includes("medium.") || host.includes("substack.")) return 0.24;
  return 0.55;
}
5567
5699
  function freshnessForDate(value) {
5568
5700
  if (!value) return 0.62;
5569
5701
  const timestamp = Date.parse(value);
@@ -5658,6 +5790,8 @@ function env(key) {
5658
5790
  return value || null;
5659
5791
  }
5660
5792
  var LOCAL_SOURCE_BASES = [
5793
+ { label: "GitHub", url: "https://github.com/search", querySuffix: "q", signal: "Open-source repositories, examples, datasets, and benchmark code." },
5794
+ { label: "Kaggle", url: "https://www.kaggle.com/datasets", querySuffix: "search", signal: "Public dataset catalog and dataset-level examples." },
5661
5795
  { label: "NIST", url: "https://www.nist.gov/search", querySuffix: "q", signal: "Technical guidance and standards language." },
5662
5796
  { label: "SEC", url: "https://www.sec.gov/search", querySuffix: "q", signal: "Regulatory filings and official disclosures." },
5663
5797
  { label: "PubMed", url: "https://pubmed.ncbi.nlm.nih.gov", querySuffix: "term", signal: "Academic and biomedical literature index." },
@@ -6128,13 +6262,235 @@ Source: ${result.source.url}
6128
6262
  }
6129
6263
 
6130
6264
  // ../../packages/prompts/src/index.ts
6131
- function buildDatasetGenerationPlan(topic) {
6265
+ var DATASET_BLUEPRINTS = [
6266
+ blueprint({
6267
+ id: "instruction-finetune-corpus",
6268
+ label: "Instruction Fine-Tuning Corpus",
6269
+ description: "Task-and-answer examples for model fine-tuning, evaluator training, and assistant behavior shaping.",
6270
+ defaultType: "instruction",
6271
+ recommendedFormats: ["instruction", "jsonl", "csv", "markdown"],
6272
+ bestFor: ["fine-tuning", "assistant behavior", "domain task completion", "supervised training"],
6273
+ fields: [
6274
+ field("instruction", "string", "The user-facing task the model should perform.", true, "Handle a pricing objection from a VP of Sales."),
6275
+ field("input_context", "string", "The scenario, persona, constraints, or source-backed context for the instruction.", true, "Mid-market CRM buyer comparing annual contracts."),
6276
+ field("ideal_output", "string", "The high-quality answer or completion expected from the model.", true, "Acknowledge budget pressure, quantify missed pipeline risk, and offer a pilot path."),
6277
+ field("skill", "string", "The primary capability being trained.", true, "objection-handling"),
6278
+ field("difficulty", "string", "Expected difficulty or complexity level.", false, "intermediate"),
6279
+ field("source_url", "string", "The source URL supporting this record.", true, "https://example.com/source"),
6280
+ field("confidence", "number", "Alys confidence score after source and quality checks.", true, "0.86")
6281
+ ],
6282
+ recordInstructions: [
6283
+ "Make the instruction actionable and directly trainable.",
6284
+ "Keep the output specific enough to teach a behavior, not a generic explanation.",
6285
+ "Include scenario/context when the answer depends on buyer role, domain, risk, or constraints."
6286
+ ],
6287
+ qualityBar: [
6288
+ "Record teaches one clear skill.",
6289
+ "Answer is useful without reading the source page.",
6290
+ "No repeated template phrasing across rows."
6291
+ ]
6292
+ }),
6293
+ blueprint({
6294
+ id: "rag-corpus",
6295
+ label: "RAG Retrieval Corpus",
6296
+ description: "Retrieval-ready passages, grounded answers, and citation-friendly chunks for search or knowledge bases.",
6297
+ defaultType: "rag",
6298
+ recommendedFormats: ["rag", "jsonl", "csv", "markdown"],
6299
+ bestFor: ["RAG", "semantic search", "knowledge bases", "citation-aware QA"],
6300
+ fields: [
6301
+ field("query", "string", "A realistic retrieval query or user question.", true, "What evidence supports the SOC 2 access review requirement?"),
6302
+ field("answer", "string", "A compact answer grounded only in the retrieved context.", true, "The control requires periodic review of user access and documented exceptions."),
6303
+ field("chunk_text", "string", "The retrieval text chunk that should be embedded.", true, "Access reviews should be performed periodically..."),
6304
+ field("source_title", "string", "Human-readable source title.", true, "SOC 2 Criteria Overview"),
6305
+ field("source_url", "string", "Canonical source URL.", true, "https://example.com/source"),
6306
+ field("citation_span", "string", "The source-backed phrase or section used as evidence.", false, "periodic review of user access"),
6307
+ field("confidence", "number", "Grounding confidence for the answer/chunk.", true, "0.91")
6308
+ ],
6309
+ recordInstructions: [
6310
+ "Make each context chunk standalone and retrieval-ready.",
6311
+ "Answer only what the source context supports.",
6312
+ "Prefer compact, evidence-rich chunks over long summaries."
6313
+ ],
6314
+ qualityBar: [
6315
+ "Chunk contains one coherent idea.",
6316
+ "Query is realistic, not keyword stuffing.",
6317
+ "Answer can be traced back to source context."
6318
+ ]
6319
+ }),
6320
+ blueprint({
6321
+ id: "evaluation-qa",
6322
+ label: "Evaluation QA Dataset",
6323
+ description: "Question-answer records with expected answers, rubric signals, difficulty, and factual grounding.",
6324
+ defaultType: "qa",
6325
+ recommendedFormats: ["jsonl", "csv", "markdown"],
6326
+ bestFor: ["model evaluation", "benchmarking", "golden sets", "regression testing"],
6327
+ fields: [
6328
+ field("question", "string", "A precise evaluation question.", true, "Which mitigation should be used when duplicate records reduce retrieval diversity?"),
6329
+ field("expected_answer", "string", "The answer a model should produce.", true, "Apply semantic deduplication and retain records with stronger source support."),
6330
+ field("rubric", "string", "Criteria used to grade the model response.", true, "Must mention deduplication, source support, and diversity impact."),
6331
+ field("difficulty", "string", "Difficulty band.", true, "hard"),
6332
+ field("failure_modes", "array", "Likely wrong answers to catch.", false, "hallucinated metric, unsupported source claim"),
6333
+ field("source_url", "string", "Evidence source URL.", true, "https://example.com/source"),
6334
+ field("confidence", "number", "Grounding confidence.", true, "0.88")
6335
+ ],
6336
+ recordInstructions: [
6337
+ "Write questions that test reasoning, not memorization only.",
6338
+ "Include rubric-like constraints in the answer or context.",
6339
+ "Add failure-mode awareness when the source contains ambiguity."
6340
+ ],
6341
+ qualityBar: [
6342
+ "Question has one defensible expected answer.",
6343
+ "Rubric exposes what a weak model would miss.",
6344
+ "Evidence is visible in context."
6345
+ ]
6346
+ }),
6347
+ blueprint({
6348
+ id: "b2b-saas-objections",
6349
+ label: "B2B SaaS Objection Handling",
6350
+ description: "Sales-training records for pricing, procurement, ROI, onboarding, integration, security, and competitive objections.",
6351
+ defaultType: "instruction",
6352
+ recommendedFormats: ["instruction", "jsonl", "csv", "markdown"],
6353
+ bestFor: ["sales enablement", "roleplay training", "support coaching", "fine-tuning"],
6354
+ fields: [
6355
+ field("buyer_role", "string", "The buyer or stakeholder persona.", true, "CFO"),
6356
+ field("company_segment", "string", "Customer segment or buying context.", true, "mid-market SaaS"),
6357
+ field("objection_category", "string", "Primary objection class.", true, "pricing"),
6358
+ field("objection", "string", "The exact buyer objection.", true, "This is too expensive compared to our current tool."),
6359
+ field("recommended_response", "string", "The ideal grounded response.", true, "Tie cost to pipeline leakage and propose a measured pilot."),
6360
+ field("proof_point", "string", "Evidence or reasoning used in the response.", false, "integration time and support burden"),
6361
+ field("follow_up_question", "string", "A next-step question that advances discovery.", true, "What cost are you currently assigning to delayed handoffs?")
6362
+ ],
6363
+ recordInstructions: [
6364
+ "Make objections sound like real buyers, not canned sales scripts.",
6365
+ "Answer with empathy, evidence, and a concrete next step.",
6366
+ "Vary buyer role, deal stage, company size, and risk profile."
6367
+ ],
6368
+ qualityBar: [
6369
+ "Response addresses the objection directly.",
6370
+ "No fake statistics or logos.",
6371
+ "Follow-up question is useful in a real sales call."
6372
+ ]
6373
+ }),
6374
+ blueprint({
6375
+ id: "technical-docs-qa",
6376
+ label: "Technical Documentation QA",
6377
+ description: "Developer-facing questions, answers, commands, prerequisites, errors, and implementation details grounded in docs or repos.",
6378
+ defaultType: "qa",
6379
+ recommendedFormats: ["jsonl", "csv", "markdown", "rag"],
6380
+ bestFor: ["developer docs", "support bots", "SDK QA", "RAG corpora"],
6381
+ fields: [
6382
+ field("product_area", "string", "The API, package, CLI, or feature area.", true, "CLI authentication"),
6383
+ field("question", "string", "Developer question or task.", true, "How do I authenticate the CLI?"),
6384
+ field("answer", "string", "Specific answer grounded in documentation.", true, "Run npx alys-akusa login and finish the browser flow."),
6385
+ field("code_or_command", "string", "Relevant command, code, or config.", false, "npx alys-akusa login"),
6386
+ field("prerequisites", "array", "Required setup before this works.", false, "Alys account, browser access"),
6387
+ field("common_error", "string", "Likely failure mode or troubleshooting note.", false, "Expired CLI login session."),
6388
+ field("source_url", "string", "Documentation or repository URL.", true, "https://example.com/docs")
6389
+ ],
6390
+ recordInstructions: [
6391
+ "Prefer commands, parameters, return shapes, limits, and edge cases.",
6392
+ "Never invent API names or package names.",
6393
+ "If docs are ambiguous, make the uncertainty visible."
6394
+ ],
6395
+ qualityBar: [
6396
+ "Answer can be executed or verified.",
6397
+ "No fake APIs.",
6398
+ "Source context includes the relevant command or behavior."
6399
+ ]
6400
+ }),
6401
+ blueprint({
6402
+ id: "legal-compliance-rag",
6403
+ label: "Legal & Compliance RAG Corpus",
6404
+ description: "Compliance-aware chunks and QA with jurisdictions, requirements, controls, exceptions, and evidence notes.",
6405
+ defaultType: "rag",
6406
+ recommendedFormats: ["rag", "jsonl", "csv", "markdown"],
6407
+ bestFor: ["compliance search", "policy QA", "audit prep", "legal retrieval"],
6408
+ fields: [
6409
+ field("jurisdiction", "string", "Relevant jurisdiction or regulatory scope.", false, "United States"),
6410
+ field("requirement", "string", "The rule, obligation, or policy requirement.", true, "Maintain access review evidence."),
6411
+ field("control_or_action", "string", "Concrete action, control, or procedure.", true, "Review user access quarterly and document exceptions."),
6412
+ field("exception_or_limit", "string", "Boundary, exception, or uncertainty.", false, "Frequency may vary by framework."),
6413
+ field("evidence_text", "string", "Source-backed evidence span.", true, "periodic access reviews"),
6414
+ field("source_url", "string", "Canonical source URL.", true, "https://example.com/policy"),
6415
+ field("confidence", "number", "Confidence after source checks.", true, "0.84")
6416
+ ],
6417
+ recordInstructions: [
6418
+ "Preserve scope and limitations.",
6419
+ "Do not convert legal text into absolute advice when the source is conditional.",
6420
+ "Separate requirement, control, and exception clearly."
6421
+ ],
6422
+ qualityBar: [
6423
+ "Jurisdiction/scope is not blurred.",
6424
+ "No invented legal conclusions.",
6425
+ "Evidence text supports the answer."
6426
+ ]
6427
+ }),
6428
+ blueprint({
6429
+ id: "engineering-telemetry",
6430
+ label: "Engineering Telemetry & Operations",
6431
+ description: "Operational records for engineering systems: parameters, units, ranges, anomalies, recommendations, and safety constraints.",
6432
+ defaultType: "instruction",
6433
+ recommendedFormats: ["jsonl", "csv", "instruction", "markdown"],
6434
+ bestFor: ["industrial AI", "predictive maintenance", "ops training", "engineering assistants"],
6435
+ fields: [
6436
+ field("asset_type", "string", "Equipment, system, or asset class.", true, "oil pipeline pump station"),
6437
+ field("operation_phase", "string", "Workflow or operating phase.", true, "commissioning"),
6438
+ field("parameter", "string", "Observed parameter or measurement.", true, "pressure differential"),
6439
+ field("unit", "string", "Engineering unit.", false, "psi"),
6440
+ field("normal_range", "string", "Source-backed or cautiously inferred expected range.", false, "site-specific; verify against design docs"),
6441
+ field("abnormal_signal", "string", "Failure pattern, warning, or anomaly.", true, "rising vibration with falling flow rate"),
6442
+ field("recommended_action", "string", "Operationally safe next action.", true, "inspect pump seals and verify sensor calibration"),
6443
+ field("safety_note", "string", "Boundary or caution.", true, "do not exceed site operating procedures")
6444
+ ],
6445
+ recordInstructions: [
6446
+ "Keep units, ranges, and recommendations physically plausible.",
6447
+ "If numeric ranges are not in source context, say they require site-specific verification.",
6448
+ "Include normal, abnormal, edge, and incident scenarios."
6449
+ ],
6450
+ qualityBar: [
6451
+ "No invented unsafe operating limits.",
6452
+ "Action follows from the signal.",
6453
+ "Record includes constraints and uncertainty."
6454
+ ]
6455
+ }),
6456
+ blueprint({
6457
+ id: "dataset-source-catalog",
6458
+ label: "Dataset Source Catalog",
6459
+ description: "Catalog records for public datasets, repositories, benchmark corpora, licenses, schemas, and use constraints.",
6460
+ defaultType: "qa",
6461
+ recommendedFormats: ["jsonl", "csv", "markdown"],
6462
+ bestFor: ["dataset discovery", "benchmark planning", "source audits", "training-data procurement"],
6463
+ fields: [
6464
+ field("source_name", "string", "Dataset/repository/source name.", true, "SDV benchmark dataset"),
6465
+ field("source_url", "string", "Canonical URL.", true, "https://github.com/sdv-dev/SDV"),
6466
+ field("source_type", "string", "Dataset, repository, benchmark, paper, documentation, or registry.", true, "repository"),
6467
+ field("domain", "string", "Domain or category.", false, "synthetic data"),
6468
+ field("available_formats", "array", "Known available formats.", false, "csv, jsonl, parquet"),
6469
+ field("license_or_terms", "string", "License or usage constraints when available.", false, "MIT"),
6470
+ field("schema_summary", "string", "Short description of fields/tables/tasks.", true, "Benchmark suite for tabular synthetic data."),
6471
+ field("trust_reason", "string", "Why this source is credible or useful.", true, "Primary GitHub repository with active documentation.")
6472
+ ],
6473
+ recordInstructions: [
6474
+ "Favor GitHub, Kaggle, Hugging Face datasets, official benchmark pages, and primary repositories.",
6475
+ "Do not claim license, row count, or schema details unless visible in source context.",
6476
+ "Make the catalog immediately usable for dataset selection."
6477
+ ],
6478
+ qualityBar: [
6479
+ "Every row points to a real source URL.",
6480
+ "License/format claims are source-backed or marked unknown.",
6481
+ "Trust reason is explicit."
6482
+ ]
6483
+ })
6484
+ ];
6485
+ function buildDatasetGenerationPlan(topic, datasetType) {
6132
6486
  const normalized = topic.trim().replace(/\s+/g, " ");
6133
6487
  const lower = normalized.toLowerCase();
6134
6488
  const segments = knownSegments(lower, normalized) ?? genericSegments(normalized);
6489
+ const blueprint2 = selectDatasetBlueprint(normalized, datasetType);
6135
6490
  return {
6136
6491
  topic: normalized,
6137
6492
  intent: inferIntent(lower),
6493
+ blueprint: blueprint2,
6138
6494
  segments,
6139
6495
  sourceCriteria: [
6140
6496
  "Prefer official documentation, standards bodies, academic papers, technical reports, government sources, and primary company docs.",
@@ -6149,12 +6505,41 @@ function buildDatasetGenerationPlan(topic) {
6149
6505
  "Repeated phrasing across records indicates mode collapse and must be suppressed."
6150
6506
  ],
6151
6507
  outputSchemaNotes: [
6508
+ `Use the "${blueprint2.label}" blueprint (${blueprint2.id}) as the canonical row contract.`,
6509
+ `Recommended exports: ${blueprint2.recommendedFormats.join(", ")}.`,
6510
+ ...blueprint2.fields.map((item) => `${item.required ? "Required" : "Optional"} field: ${item.name} (${item.type}) - ${item.description}`),
6152
6511
  "Each record needs an actionable input, specific output, source context, source URL, confidence, tags, and metadata.",
6153
6512
  "Metadata should include source trust, source authority, source relevance, segment ID, quality signals, and benchmark notes.",
6154
6513
  "Records should be useful without the original source page while still preserving provenance."
6155
6514
  ]
6156
6515
  };
6157
6516
  }
6517
/**
 * Picks the dataset blueprint that best fits a topic.
 *
 * Topic keywords are checked in a fixed priority order; the first rule that
 * matches wins. When no keyword matches, `datasetType` decides ("rag" and
 * "qa" map to their blueprints, anything else to the instruction corpus).
 * If the chosen id is not registered, the first registered blueprint is used.
 *
 * @param {string} topic Normalized topic text.
 * @param {string} [datasetType] Requested dataset type, used only as fallback.
 * @returns {object} A blueprint from DATASET_BLUEPRINTS.
 */
function selectDatasetBlueprint(topic, datasetType) {
  const lower = topic.toLowerCase();
  const words = new Set(lower.split(/[^a-z0-9]+/).filter(Boolean));
  // Substring match: fine for long, distinctive phrases.
  const mentions = (...phrases) => phrases.some((phrase) => lower.includes(phrase));
  // Whole-word match (optional plural "s") for short keywords. As substrings
  // these fire inside unrelated words: "eval" in "retrieval", "cli" in
  // "climate", "rag" in "storage", "oil" in "soil", "api" in "capital".
  const hasWord = (...terms) => terms.some((term) => words.has(term) || words.has(`${term}s`));

  let id;
  if (mentions("b2b", "sales") && mentions("objection")) {
    id = "b2b-saas-objections";
  } else if (mentions("legal", "compliance", "soc 2", "hipaa")) {
    id = "legal-compliance-rag";
  } else if (hasWord("oil", "gas") || mentions("pipeline", "drilling", "telemetry", "maintenance")) {
    id = "engineering-telemetry";
  } else if (mentions("documentation", "developer") || hasWord("api", "sdk", "cli")) {
    id = "technical-docs-qa";
  } else if (mentions("benchmark", "evaluation") || hasWord("eval")) {
    id = "evaluation-qa";
  } else if (mentions("source catalog", "public dataset", "kaggle", "github dataset")) {
    id = "dataset-source-catalog";
  } else if (hasWord("rag") || mentions("retrieval", "knowledge base")) {
    id = "rag-corpus";
  } else if (datasetType === "rag") {
    id = "rag-corpus";
  } else if (datasetType === "qa") {
    id = "evaluation-qa";
  } else {
    id = "instruction-finetune-corpus";
  }
  return byId(id) ?? DATASET_BLUEPRINTS[0];
}
6531
/**
 * Looks up a registered dataset blueprint by its id.
 *
 * @param {string} id Blueprint identifier.
 * @returns {object|undefined} The first blueprint in DATASET_BLUEPRINTS whose
 *   `id` equals the argument, or undefined when none matches.
 */
function byId(id) {
  for (const candidate of DATASET_BLUEPRINTS) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return undefined;
}
6534
/**
 * Builds a blueprint record from its definition, filling in a version.
 *
 * @param {object} input Blueprint definition; not modified.
 * @returns {object} A shallow copy of `input` whose `version` is the given
 *   one, or "1.0.0" when `input.version` is null or undefined.
 */
function blueprint(input) {
  const result = { ...input };
  result.version = input.version ?? "1.0.0";
  return result;
}
6540
/**
 * Bundles the five attributes of a blueprint field into one record.
 *
 * @param {string} name Field name.
 * @param {string} type Field type label.
 * @param {string} description What the field holds.
 * @param {boolean} required Whether the field is mandatory.
 * @param {string} example Sample value.
 * @returns {{name: *, type: *, description: *, required: *, example: *}} The
 *   arguments, unchanged, keyed by parameter name.
 */
function field(name, type, description, required, example) {
  const spec = {
    name,
    type,
    description,
    required,
    example
  };
  return spec;
}
6158
6543
  function segmentForSource(plan, text, index = 0) {
6159
6544
  const haystack = text.toLowerCase();
6160
6545
  const scored = plan.segments.map((segment2) => {
@@ -6267,9 +6652,12 @@ Your job is to generate production-grade synthetic dataset records that can surv
6267
6652
  Hard rules:
6268
6653
  - Return only valid JSON matching the requested schema.
6269
6654
  - Each record must be meaningfully different.
6655
+ - Every record must be grounded in the provided source context.
6656
+ - Do not introduce facts, numbers, names, benchmarks, URLs, citations, or claims that are not present in the source context.
6270
6657
  - Preserve plausible domain physics, operational constraints, and realistic terminology.
6271
6658
  - Prefer specific scenarios, values, failure modes, and edge cases over generic prose.
6272
- - Do not include fake citations. Source context is provenance, not proof of factual truth.
6659
+ - If the source context is thin, produce cautious records about what can be inferred and what needs verification.
6660
+ - Do not include fake citations. Source URL and source title are attached outside your JSON.
6273
6661
  - Never output placeholders, TODOs, markdown fences, or explanations.
6274
6662
  `.trim();
6275
6663
  function loadAlysEnv(cwd = process.cwd()) {
@@ -6361,6 +6749,9 @@ Active topic segment:
6361
6749
  Generation plan intent:
6362
6750
  ${options.generationPlan.intent}
6363
6751
 
6752
+ Dataset blueprint:
6753
+ ${renderBlueprintForPrompt(options.generationPlan)}
6754
+
6364
6755
  Required source criteria:
6365
6756
  ${options.generationPlan.sourceCriteria.map((item) => `- ${item}`).join("\n")}
6366
6757
 
@@ -6394,6 +6785,9 @@ ${segmentBlock}
6394
6785
  Generate exactly ${targetCount} records.
6395
6786
 
6396
6787
  Domain quality requirements:
6788
+ - Ground every record in the source context below. If a detail is not in the context, do not invent it.
6789
+ - Use the dataset blueprint above as the semantic row contract. Do not waste tokens describing CSV/JSONL formatting; Alys handles export formatting.
6790
+ - Put the most important blueprint field values into input, output, and context. Add blueprint field names to metadata.signals or metadata.constraints when useful.
6397
6791
  - Include realistic parameters, edge cases, constraints, and operational variability.
6398
6792
  - For engineering domains, use physically plausible values and causal relationships.
6399
6793
  - For tabular/synthetic-data domains, preserve schema-like consistency and row-level diversity.
@@ -6401,6 +6795,7 @@ Domain quality requirements:
6401
6795
  - Avoid repeating the topic verbatim in every output.
6402
6796
  - Reflect source quality: lower-confidence sources should create cautious, verification-aware records.
6403
6797
  - If the source is low authority, write records as verification-aware training examples rather than confident factual claims.
6798
+ - Prefer source-backed dataset rows over broad advice. If the context only supports broad advice, lower confidence.
6404
6799
  - Every output should contain a specific decision signal, scenario, constraint, or useful answer. Reject generic summaries.
6405
6800
  - Prefer records that can be directly reused for fine-tuning, RAG evaluation, or QA benchmarks.
6406
6801
 
@@ -6425,8 +6820,29 @@ Return JSON with this shape:
6425
6820
 
6426
6821
  Source context:
6427
6822
  ${sourceText}
6823
+
6824
+ Final grounding rule:
6825
+ Only use information supported by Source title, Source URL, Source quality, and Source context above.
6428
6826
  `.trim();
6429
6827
  }
6828
/**
 * Render the dataset blueprint of a generation plan as the plain-text block
 * that the provider prompt embeds under "Dataset blueprint:".
 *
 * @param {object} plan - Generation plan; only `plan.blueprint` is read.
 * @returns {string} Newline-joined summary (id, label, version, purpose,
 *   best-for list, one line per field, record instructions, quality bar), or
 *   an empty string when the plan carries no blueprint.
 */
function renderBlueprintForPrompt(plan) {
  const blueprint2 = plan?.blueprint;
  // Other blueprint consumers in this file tolerate a missing blueprint; do the same here.
  if (!blueprint2) return "";
  const listOf = (value) => Array.isArray(value) ? value : [];
  // Explicit presence check so falsy-but-valid examples (0, false) are still shown.
  const hasExample = (example) => example !== void 0 && example !== null && example !== "";
  const fieldLines = listOf(blueprint2.fields).map((field2) => {
    const requirement = field2.required ? ", required" : ", optional";
    const example = hasExample(field2.example) ? ` Example: ${field2.example}` : "";
    return `- ${field2.name} (${field2.type}${requirement}): ${field2.description}${example}`;
  });
  return [
    `ID: ${blueprint2.id}`,
    `Label: ${blueprint2.label}`,
    `Version: ${blueprint2.version}`,
    `Purpose: ${blueprint2.description}`,
    `Best for: ${listOf(blueprint2.bestFor).join(", ")}`,
    "Fields:",
    ...fieldLines,
    "Record instructions:",
    ...listOf(blueprint2.recordInstructions).map((item) => `- ${item}`),
    "Quality bar:",
    ...listOf(blueprint2.qualityBar).map((item) => `- ${item}`)
  ].join("\n");
}
6430
6846
  async function generateWithOpenAI(options) {
6431
6847
  const apiKey = getOpenAIKey();
6432
6848
  const model = process.env.ALYS_GENERATOR_MODEL || DEFAULT_OPENAI_MODEL;
@@ -6447,7 +6863,7 @@ async function generateWithOpenAI(options) {
6447
6863
  },
6448
6864
  body: JSON.stringify({
6449
6865
  model,
6450
- temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.45),
6866
+ temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.28),
6451
6867
  max_tokens: Math.min(32768, Math.max(1600, options.targetCount * 520)),
6452
6868
  messages: [
6453
6869
  { role: "system", content: ALYS_RECORD_SYSTEM_PROMPT },
@@ -6473,26 +6889,29 @@ async function generateWithOpenAI(options) {
6473
6889
  provider: "openai",
6474
6890
  model,
6475
6891
  latencyMs: Date.now() - startedAt,
6476
- records: parseProviderRecords(content, options.baselineConfidence, options.datasetType)
6892
+ records: groundProviderRecords(
6893
+ parseProviderRecords(content, options.baselineConfidence, options.datasetType),
6894
+ options.document
6895
+ )
6477
6896
  };
6478
6897
  }
6479
6898
  async function generateWithOpenAIBatched(options) {
6480
- const batchSize = Math.max(1, Math.min(24, Number(process.env.ALYS_OPENAI_RECORDS_PER_CALL ?? 8)));
6481
- const batches = Math.ceil(options.targetCount / batchSize);
6482
- const records = [];
6483
- let latencyMs = 0;
6484
- let model = process.env.ALYS_GENERATOR_MODEL || DEFAULT_OPENAI_MODEL;
6485
- for (let i = 0; i < batches; i++) {
6486
- const remaining = options.targetCount - records.length;
6487
- if (remaining <= 0) break;
6488
- const result = await generateWithOpenAI({
6899
+ const batchSize = Math.max(1, Math.min(48, Number(process.env.ALYS_OPENAI_RECORDS_PER_CALL ?? 8)));
6900
+ const batches = Array.from(
6901
+ { length: Math.ceil(options.targetCount / batchSize) },
6902
+ (_, index) => Math.min(batchSize, Math.max(0, options.targetCount - index * batchSize))
6903
+ ).filter((count) => count > 0);
6904
+ const results = await mapLimit(
6905
+ batches,
6906
+ Number(process.env.ALYS_PROVIDER_BATCH_CONCURRENCY ?? 3),
6907
+ (count) => generateWithOpenAI({
6489
6908
  ...options,
6490
- targetCount: Math.min(batchSize, remaining)
6491
- });
6492
- latencyMs += result.latencyMs;
6493
- model = result.model;
6494
- records.push(...result.records);
6495
- }
6909
+ targetCount: count
6910
+ })
6911
+ );
6912
+ const records = results.flatMap((result) => result.records);
6913
+ const latencyMs = results.reduce((sum, result) => sum + result.latencyMs, 0);
6914
+ const model = results[0]?.model ?? process.env.ALYS_GENERATOR_MODEL ?? DEFAULT_OPENAI_MODEL;
6496
6915
  return {
6497
6916
  provider: "openai",
6498
6917
  model,
@@ -6520,7 +6939,7 @@ async function generateWithGroq(options) {
6520
6939
  },
6521
6940
  body: JSON.stringify({
6522
6941
  model,
6523
- temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.35),
6942
+ temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.25),
6524
6943
  max_tokens: Math.min(8192, Math.max(1600, options.targetCount * 520)),
6525
6944
  messages: [
6526
6945
  { role: "system", content: ALYS_RECORD_SYSTEM_PROMPT },
@@ -6539,26 +6958,29 @@ async function generateWithGroq(options) {
6539
6958
  provider: "groq",
6540
6959
  model,
6541
6960
  latencyMs: Date.now() - startedAt,
6542
- records: parseProviderRecords(content, options.baselineConfidence, options.datasetType)
6961
+ records: groundProviderRecords(
6962
+ parseProviderRecords(content, options.baselineConfidence, options.datasetType),
6963
+ options.document
6964
+ )
6543
6965
  };
6544
6966
  }
6545
6967
  async function generateWithGroqBatched(options) {
6546
- const batchSize = Math.max(1, Math.min(16, Number(process.env.ALYS_GROQ_RECORDS_PER_CALL ?? 8)));
6547
- const batches = Math.ceil(options.targetCount / batchSize);
6548
- const records = [];
6549
- let latencyMs = 0;
6550
- let model = process.env.GROQ_MODEL || process.env.ALYS_GENERATOR_MODEL || DEFAULT_GROQ_MODEL;
6551
- for (let i = 0; i < batches; i++) {
6552
- const remaining = options.targetCount - records.length;
6553
- if (remaining <= 0) break;
6554
- const result = await generateWithGroq({
6968
+ const batchSize = Math.max(1, Math.min(32, Number(process.env.ALYS_GROQ_RECORDS_PER_CALL ?? 8)));
6969
+ const batches = Array.from(
6970
+ { length: Math.ceil(options.targetCount / batchSize) },
6971
+ (_, index) => Math.min(batchSize, Math.max(0, options.targetCount - index * batchSize))
6972
+ ).filter((count) => count > 0);
6973
+ const results = await mapLimit(
6974
+ batches,
6975
+ Number(process.env.ALYS_PROVIDER_BATCH_CONCURRENCY ?? 3),
6976
+ (count) => generateWithGroq({
6555
6977
  ...options,
6556
- targetCount: Math.min(batchSize, remaining)
6557
- });
6558
- latencyMs += result.latencyMs;
6559
- model = result.model;
6560
- records.push(...result.records);
6561
- }
6978
+ targetCount: count
6979
+ })
6980
+ );
6981
+ const records = results.flatMap((result) => result.records);
6982
+ const latencyMs = results.reduce((sum, result) => sum + result.latencyMs, 0);
6983
+ const model = results[0]?.model ?? process.env.GROQ_MODEL ?? process.env.ALYS_GENERATOR_MODEL ?? DEFAULT_GROQ_MODEL;
6562
6984
  return {
6563
6985
  provider: "groq",
6564
6986
  model,
@@ -6566,6 +6988,20 @@ async function generateWithGroqBatched(options) {
6566
6988
  records: records.slice(0, options.targetCount)
6567
6989
  };
6568
6990
  }
6991
/**
 * Run `worker` over `items` with at most `limit` calls in flight and return
 * the results in input order.
 *
 * @param {Array} items - Inputs; each is passed to `worker` with its index.
 * @param {number} limit - Desired concurrency. Floored, falsy values (0, NaN)
 *   fall back to 1, and the result is clamped to the range 1..items.length.
 * @param {(item: any, index: number) => any} worker - Callback, awaited per item.
 * @returns {Promise<Array>} Results positioned by the index of their input.
 * @throws Rejects with the first worker error. Once a worker has failed no
 *   further items are started, so a failing batch does not keep issuing calls
 *   whose results would be thrown away.
 */
async function mapLimit(items, limit, worker) {
  const results = new Array(items.length);
  const workerCount = Math.max(1, Math.min(items.length, Math.floor(limit || 1)));
  let nextIndex = 0;
  let failed = false;
  async function runWorker() {
    while (!failed && nextIndex < items.length) {
      // Claim the index synchronously so no two loops process the same item.
      const index = nextIndex;
      nextIndex += 1;
      try {
        results[index] = await worker(items[index], index);
      } catch (error) {
        // Tell the sibling loops to stop picking up new items, then propagate.
        failed = true;
        throw error;
      }
    }
  }
  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
  return results;
}
6569
7005
  function providerRecordSchema() {
6570
7006
  return {
6571
7007
  type: "object",
@@ -6635,6 +7071,78 @@ function normalizeMetadata(value) {
6635
7071
  benchmark_notes: Array.isArray(obj.benchmark_notes) ? obj.benchmark_notes.map(String).slice(0, 12) : []
6636
7072
  };
6637
7073
  }
7074
/**
 * Keep only provider records whose wording overlaps the source document, and
 * scale their confidence by how strongly they overlap.
 *
 * The grounding score is the number of record tokens (from input, output and
 * context) that also occur in the source text, divided by the smaller of the
 * two token-set sizes. Records below the threshold are dropped; the threshold
 * is 0.025 when the source text contains "fallback reason:" and 0.04 otherwise.
 *
 * @param {Array<object>} records - Parsed provider records.
 * @param {object} document - Source document; `text` and `url` are read.
 * @returns {Array<object>} Accepted records with adjusted `confidence` and
 *   grounding markers added to `metadata.signals` / `metadata.constraints`.
 *   Empty when the source yields fewer than 8 meaningful tokens.
 */
function groundProviderRecords(records, document) {
  // A document without usable text cannot ground anything; treat it as empty.
  const sourceText = typeof document?.text === "string" ? document.text : "";
  const sourceTokens = meaningfulTokens(sourceText);
  if (sourceTokens.size < 8) return [];
  const sourceIsFallback = /fallback reason:/i.test(sourceText);
  const minimumGrounding = sourceIsFallback ? 0.025 : 0.04;
  const maxEntries = 12;
  // Deduplicate existing entries and trim them first, so the markers added here
  // always survive the 12-entry cap instead of being sliced off the end.
  const appendMarkers = (existing, markers) => {
    const own = Array.isArray(existing) ? existing : [];
    const kept = Array.from(new Set(own)).filter((entry) => !markers.includes(entry));
    return [...kept.slice(0, Math.max(0, maxEntries - markers.length)), ...markers];
  };
  return records.flatMap((record) => {
    const recordTokens = meaningfulTokens(`${record.input} ${record.output} ${record.context}`);
    if (!recordTokens.size) return [];
    let overlap = 0;
    for (const token of recordTokens) {
      if (sourceTokens.has(token)) overlap += 1;
    }
    const groundingScore = overlap / Math.max(1, Math.min(recordTokens.size, sourceTokens.size));
    if (groundingScore < minimumGrounding) return [];
    // Multiplier ranges from 0.76 (no overlap) up to 1.0 (score of 0.1 or more).
    const confidence = clamp012(record.confidence * (0.76 + Math.min(0.24, groundingScore * 2.4)));
    return [{
      ...record,
      confidence,
      metadata: {
        ...record.metadata,
        signals: appendMarkers(record.metadata?.signals, [
          `source-grounding:${groundingScore.toFixed(3)}`,
          `source-url:${document.url}`
        ]),
        constraints: appendMarkers(record.metadata?.constraints, [
          "accepted-after-source-grounding-check"
        ])
      }
    }];
  });
}
7107
// Common words excluded from grounding comparisons. Built once at module load
// instead of on every call.
const MEANINGFUL_TOKEN_STOPWORDS = /* @__PURE__ */ new Set([
  "about",
  "after",
  "also",
  "because",
  "before",
  "being",
  "between",
  "could",
  "dataset",
  "during",
  "every",
  "from",
  "have",
  "into",
  "more",
  "only",
  "other",
  "should",
  "source",
  "that",
  "their",
  "there",
  "these",
  "this",
  "through",
  "using",
  "what",
  "when",
  "where",
  "which",
  "with",
  "would"
]);
/**
 * Extract the set of distinctive lowercase tokens from a piece of text.
 *
 * A token starts with a letter or digit and continues with letters, digits or
 * `. _ / -`; surrounding punctuation is trimmed. Tokens shorter than four
 * characters and stopwords are discarded, and only the first 900 surviving
 * tokens are considered.
 *
 * @param {unknown} value - Text to tokenize; null/undefined yield an empty set,
 *   other non-strings are stringified.
 * @returns {Set<string>} Unique tokens.
 */
function meaningfulTokens(value) {
  if (value === null || value === void 0) return /* @__PURE__ */ new Set();
  const text = typeof value === "string" ? value : String(value);
  const candidates = text.toLowerCase().match(/[a-z0-9][a-z0-9._/-]{2,}/g) ?? [];
  const kept = [];
  for (const candidate of candidates) {
    const token = candidate.replace(/^[._/-]+|[._/-]+$/g, "");
    if (token.length < 4 || MEANINGFUL_TOKEN_STOPWORDS.has(token)) continue;
    kept.push(token);
    // Cap the work on very long documents.
    if (kept.length === 900) break;
  }
  return new Set(kept);
}
6638
7146
  function safeJson(text) {
6639
7147
  const trimmed = text.trim();
6640
7148
  try {
@@ -6680,7 +7188,7 @@ var DiscoveryAgent = class {
6680
7188
  var ExtractionAgent = class {
6681
7189
  name = "ExtractionAgent";
6682
7190
  async run(sources) {
6683
- const crawls = await mapLimit(
7191
+ const crawls = await mapLimit2(
6684
7192
  sources,
6685
7193
  Math.max(1, Number(process.env.ALYS_CRAWL_CONCURRENCY ?? 6)),
6686
7194
  (source) => crawlSource(source)
@@ -6754,7 +7262,7 @@ var StructuringAgent = class {
6754
7262
  }
6755
7263
  const concurrency = Math.max(1, Math.min(documents.length, Number(process.env.ALYS_PROVIDER_CONCURRENCY ?? 3)));
6756
7264
  let totalGenerated = 0;
6757
- const grouped = await mapLimit(documents, concurrency, async (document, index) => {
7265
+ const grouped = await mapLimit2(documents, concurrency, async (document, index) => {
6758
7266
  const finding = findings[index] ?? findings[0];
6759
7267
  const trustScore = document.sourceScores?.trustScore ?? 0.62;
6760
7268
  const authorityScore = document.sourceScores?.authorityScore ?? 0.55;
@@ -6763,6 +7271,7 @@ var StructuringAgent = class {
6763
7271
  if (trustScore < (options.minTrustScore ?? 0.42) || relevanceScore < (options.minRelevanceScore ?? 0.24)) return [];
6764
7272
  const sourceWeight = sourceQualityWeight(document);
6765
7273
  const segment2 = options.generationPlan ? segmentForSource(options.generationPlan, `${document.title} ${document.text}`, index) : void 0;
7274
+ const blueprint2 = options.generationPlan?.blueprint;
6766
7275
  const baselineConfidence = clamp013((finding?.confidence ?? 0.7) * 0.55 + trustScore * 0.22 + authorityScore * 0.12 + relevanceScore * 0.11 - duplicationRisk * 0.08);
6767
7276
  const baseId = import_node_crypto3.default.createHash("sha1").update(`${topic}:${document.url}:${datasetType}`).digest("hex").slice(0, 14);
6768
7277
  const providerTarget = useProvider ? weightedRecordTarget(recordsPerDocument, options.providerRecordsPerDocument ?? recordsPerDocument, sourceWeight) : 0;
@@ -6795,6 +7304,10 @@ var StructuringAgent = class {
6795
7304
  ...g.metadata,
6796
7305
  topic,
6797
7306
  kind: datasetType,
7307
+ blueprint_id: blueprint2?.id,
7308
+ blueprint_label: blueprint2?.label,
7309
+ blueprint_version: blueprint2?.version,
7310
+ blueprint_fields: blueprint2?.fields.map((field2) => field2.name),
6798
7311
  segment_id: segment2?.id,
6799
7312
  segment_label: segment2?.label,
6800
7313
  provider: providerResult.provider,
@@ -6830,7 +7343,7 @@ function domainFromUrl3(url) {
6830
7343
  return "unknown";
6831
7344
  }
6832
7345
  }
6833
- async function mapLimit(items, limit, worker) {
7346
+ async function mapLimit2(items, limit, worker) {
6834
7347
  const results = new Array(items.length);
6835
7348
  let nextIndex = 0;
6836
7349
  async function runWorker() {
@@ -6922,6 +7435,11 @@ function toCsv(records) {
6922
7435
  "source",
6923
7436
  "source_url",
6924
7437
  "confidence",
7438
+ "blueprint_id",
7439
+ "segment_id",
7440
+ "source_trust_score",
7441
+ "source_authority_score",
7442
+ "source_relevance_score",
6925
7443
  "tags",
6926
7444
  "metadata",
6927
7445
  "created_at"
@@ -6934,12 +7452,24 @@ function toCsv(records) {
6934
7452
  record.source,
6935
7453
  record.source_url,
6936
7454
  String(record.confidence),
7455
+ metadataString(record, "blueprint_id"),
7456
+ metadataString(record, "segment_id"),
7457
+ metadataString(record, "source_trust_score"),
7458
+ metadataString(record, "source_authority_score"),
7459
+ metadataString(record, "source_relevance_score"),
6937
7460
  record.tags.join("|"),
6938
7461
  JSON.stringify(record.metadata),
6939
7462
  record.created_at
6940
7463
  ]);
6941
7464
  return [header, ...rows].map((row) => row.map(escapeCsv).join(",")).join("\n") + "\n";
6942
7465
  }
7466
/**
 * Read one metadata entry of a record as a CSV-ready string.
 *
 * @param {object} record - Dataset record; `record.metadata[key]` is read.
 * @param {string} key - Metadata key to look up.
 * @returns {string} The value as text: strings unchanged, numbers, booleans and
 *   bigints via `String`, objects and arrays as JSON. Missing records, missing
 *   metadata, absent keys and non-serialisable values (functions, symbols)
 *   yield "".
 */
function metadataString(record, key) {
  const value = record?.metadata?.[key];
  if (value === null || value === void 0) return "";
  switch (typeof value) {
    case "string":
      return value;
    case "number":
    case "boolean":
    case "bigint":
      // JSON.stringify throws on bigint, so it is stringified directly.
      return String(value);
    case "object":
      // JSON.stringify yields undefined when toJSON() returns undefined.
      return JSON.stringify(value) ?? "";
    default:
      return "";
  }
}
6943
7473
  function toMarkdown(records) {
6944
7474
  return records.map((record) => {
6945
7475
  const title = record.input || record.id;
@@ -7075,20 +7605,26 @@ function performanceConfig(mode) {
7075
7605
  }
7076
7606
/**
 * Select the sources that are allowed to feed extraction and generation.
 *
 * Sources rejected by `isConcreteEvidenceSource` are removed first. The rest
 * pass the gate when trust and relevance meet the thresholds of the
 * performance mode, duplication risk is below 0.72, and either trust is at
 * least 0.52 or authority is at least 0.72. If nothing passes, high-authority
 * (>= 0.78) sources that still meet the relevance threshold are used instead,
 * capped at the gate minimum.
 *
 * @param {Array<object>} sources - Ranked candidate sources.
 * @param {string} mode - Performance mode passed to `performanceConfig`.
 * @returns {{sources: Array<object>, filtered: number}} Gated sources and the
 *   number of input sources that were left out.
 */
function gateSources(sources, mode) {
  const perf = performanceConfig(mode);
  const concreteSources = sources.filter(isConcreteEvidenceSource);
  const accepted = concreteSources.filter((source) => {
    const trust = source.trustScore ?? source.score;
    const relevance = source.relevanceScore ?? source.score;
    const duplicateRisk = source.duplicationRisk ?? 0;
    const authority = source.authorityScore ?? 0.5;
    return trust >= perf.minTrustScore && relevance >= perf.minRelevanceScore && duplicateRisk < 0.72 && (trust >= 0.52 || authority >= 0.72);
  });
  let gated = accepted;
  if (!accepted.length) {
    // Any non-empty accepted list is used as-is, so the minimum only matters
    // as the cap of the authority-based fallback.
    const minimum = Math.min(concreteSources.length, Math.max(3, Math.ceil(concreteSources.length * perf.gateMinimumRatio)));
    gated = concreteSources.filter((source) => (source.authorityScore ?? 0) >= 0.78 && (source.relevanceScore ?? source.score) >= perf.minRelevanceScore).slice(0, minimum);
  }
  return {
    sources: gated,
    filtered: Math.max(0, sources.length - gated.length)
  };
}
7623
/**
 * Decide whether a source counts as concrete evidence rather than a
 * heuristic placeholder.
 *
 * @param {object} source - Candidate source; `provider` is read, falling back
 *   to `discoveredBy` when `provider` is falsy.
 * @returns {boolean} Always true when the ALYS_ALLOW_HEURISTIC_GENERATION
 *   environment variable is exactly "true"; otherwise false only if the
 *   provider name contains "heuristic" (case-insensitive).
 */
function isConcreteEvidenceSource(source) {
  if (process.env.ALYS_ALLOW_HEURISTIC_GENERATION === "true") return true;
  // String() guards against non-string provider values; the substring test
  // already covers the exact "local-heuristic" name.
  const provider = String(source.provider || source.discoveredBy || "").toLowerCase();
  return !provider.includes("heuristic");
}
7092
7628
  function sourceDiversityScore(sources) {
7093
7629
  if (!sources.length) return 0;
7094
7630
  const domains = new Set(sources.map((source) => source.domain || domainFromUrl4(source.url)));
@@ -7115,7 +7651,7 @@ async function generateDataset(options) {
7115
7651
  const targetRows = Math.max(1, Math.floor(options.targetRows ?? 100));
7116
7652
  const datasetId = import_node_crypto4.default.createHash("sha1").update(`${options.topic}:${Date.now()}`).digest("hex").slice(0, 12);
7117
7653
  const workspace = await ensureAlysWorkspace(options.workspaceRoot);
7118
- const generationPlan = buildDatasetGenerationPlan(options.topic);
7654
+ const generationPlan = buildDatasetGenerationPlan(options.topic, datasetType);
7119
7655
  const expandedQueries = planQueries(generationPlan).slice(0, perf.queryCap);
7120
7656
  const discoveryEnabledSeed = options.discoverySeed ?? 0;
7121
7657
  const verificationEnabled = options.verificationEnabled ?? true;
@@ -7140,6 +7676,9 @@ async function generateDataset(options) {
7140
7676
  message: `${gated.filtered} low-trust or low-relevance sources filtered`,
7141
7677
  metric: `${gated.sources.length} accepted`
7142
7678
  });
7679
+ if (!gated.sources.length) {
7680
+ throw new Error("ALYS_NO_TRUSTED_SOURCES");
7681
+ }
7143
7682
  const extraction = new ExtractionAgent();
7144
7683
  event(options.onEvent, { stage: "extraction", agent: extraction.name, status: "running", message: "Extracting source text..." });
7145
7684
  const extracted = await extraction.run(gated.sources);
@@ -7208,6 +7747,9 @@ async function generateDataset(options) {
7208
7747
  }
7209
7748
  });
7210
7749
  event(options.onEvent, { stage: "structuring", agent: structuring.name, status: "success", message: `${structured.length} candidate records generated`, metric: `${targetRows} target` });
7750
+ if (!structured.length) {
7751
+ throw new Error("ALYS_NO_GROUNDED_RECORDS");
7752
+ }
7211
7753
  const curator = new DatasetCuratorAgent();
7212
7754
  event(options.onEvent, { stage: "curation", agent: curator.name, status: "running", message: "Curating final dataset..." });
7213
7755
  const records = curator.run(structured, targetRows).map((record) => ({
@@ -7282,8 +7824,14 @@ async function generateDataset(options) {
7282
7824
  `));
7283
7825
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "metrics.json", `${JSON.stringify(qualityMetrics, null, 2)}
7284
7826
  `));
7827
+ artifacts.push(await writeDatasetArtifact(workspace, datasetId, "schema.json", `${JSON.stringify(datasetSchema(manifest), null, 2)}
7828
+ `));
7829
+ artifacts.push(await writeDatasetArtifact(workspace, datasetId, "data-dictionary.md", renderDataDictionary(manifest)));
7285
7830
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "source-graph.json", `${JSON.stringify(research.graph, null, 2)}
7286
7831
  `));
7832
+ artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.json", `${JSON.stringify(sources, null, 2)}
7833
+ `));
7834
+ artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.md", renderSourcesMarkdown(options.topic, sources)));
7287
7835
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "generation-plan.json", `${JSON.stringify(generationPlan, null, 2)}
7288
7836
  `));
7289
7837
  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "benchmark-report.json", `${JSON.stringify(evaluation, null, 2)}
@@ -7292,6 +7840,101 @@ async function generateDataset(options) {
7292
7840
  event(options.onEvent, { stage: "export", agent: "ArtifactStorage", status: "success", message: `Dataset written to ${manifest.outputDir}`, metric: manifest.outputDir });
7293
7841
  return { manifest, records, artifacts };
7294
7842
  }
7843
/**
 * Build the JSON Schema document written to `schema.json` for a dataset run.
 *
 * @param {object} manifest - Dataset manifest; `generationPlan.blueprint` is read.
 * @returns {object} A schema derived from the blueprint fields (required list,
 *   per-field type, description and optional example), or a generic record
 *   schema when the manifest has no blueprint.
 */
function datasetSchema(manifest) {
  const schemaDialect = "https://json-schema.org/draft/2020-12/schema";
  const blueprint2 = manifest.generationPlan?.blueprint;
  if (!blueprint2) {
    return {
      // Declared here too so both branches identify the same dialect.
      $schema: schemaDialect,
      title: "Alys Dataset Record",
      type: "object",
      properties: {
        id: { type: "string" },
        input: { type: "string" },
        output: { type: "string" },
        context: { type: "string" },
        source_url: { type: "string" },
        confidence: { type: "number" }
      },
      required: ["id", "input", "output", "context", "source_url", "confidence"]
    };
  }
  // Explicit presence check so falsy-but-valid examples (0, false) are kept.
  const hasExample = (example) => example !== void 0 && example !== null && example !== "";
  const properties = {};
  for (const field2 of blueprint2.fields) {
    properties[field2.name] = {
      type: field2.type,
      description: field2.description,
      ...hasExample(field2.example) ? { examples: [field2.example] } : {}
    };
  }
  return {
    $schema: schemaDialect,
    title: blueprint2.label,
    description: blueprint2.description,
    blueprintId: blueprint2.id,
    blueprintVersion: blueprint2.version,
    recommendedFormats: blueprint2.recommendedFormats,
    type: "object",
    additionalProperties: false,
    required: blueprint2.fields.filter((field2) => field2.required).map((field2) => field2.name),
    properties
  };
}
7882
/**
 * Render the Markdown data dictionary (`data-dictionary.md`) for a dataset run.
 *
 * @param {object} manifest - Dataset manifest; `generationPlan.blueprint` is read.
 * @returns {string} Markdown with a field table, record instructions and the
 *   quality bar, or a short notice when no blueprint was recorded.
 */
function renderDataDictionary(manifest) {
  const blueprint2 = manifest.generationPlan?.blueprint;
  if (!blueprint2) {
    return "# Data Dictionary\n\nNo dataset blueprint was recorded for this run.\n";
  }
  const listOf = (value) => Array.isArray(value) ? value : [];
  // Table cells must stay on one line and must not contain a bare pipe. Values
  // are stringified first because examples may be numbers or booleans.
  const cell = (value) => String(value ?? "").replace(/\s*\r?\n\s*/g, " ").replace(/\|/g, "\\|");
  const fieldRows = listOf(blueprint2.fields).map(
    (field2) => `| \`${field2.name}\` | ${field2.type} | ${field2.required ? "yes" : "no"} | ${cell(field2.description)} | ${cell(field2.example)} |`
  );
  const lines = [
    `# ${blueprint2.label} Data Dictionary`,
    "",
    blueprint2.description,
    "",
    `Blueprint: \`${blueprint2.id}@${blueprint2.version}\``,
    `Recommended formats: ${listOf(blueprint2.recommendedFormats).map((format) => `\`${format}\``).join(", ")}`,
    "",
    "## Fields",
    "",
    "| Field | Type | Required | Description | Example |",
    "| --- | --- | --- | --- | --- |",
    ...fieldRows,
    "",
    "## Record Instructions",
    "",
    ...listOf(blueprint2.recordInstructions).map((item) => `- ${item}`),
    "",
    "## Quality Bar",
    "",
    ...listOf(blueprint2.qualityBar).map((item) => `- ${item}`),
    ""
  ];
  return `${lines.join("\n")}\n`;
}
7915
/**
 * Render the ranked source pool as a numbered Markdown list (`sources.md`).
 *
 * @param {string} topic - Dataset topic shown in the heading.
 * @param {Array<object>} sources - Sources; `title`, `url`, `domain`,
 *   `sourceType`, `trustScore`, `snippet` and `qualitySignals` are read.
 * @returns {string} Markdown document; states that no sources were recorded
 *   when the list is empty.
 */
function renderSourcesMarkdown(topic, sources) {
  const lines = [
    `# Sources for ${topic}`,
    "",
    "Alys grounds generated records in the ranked source pool below. Higher trust scores influence confidence and record acceptance.",
    ""
  ];
  const entries = Array.isArray(sources) ? sources : [];
  entries.forEach((source, index) => {
    // Fall back to other identifying fields so a missing title never prints
    // "undefined"; flatten whitespace and escape brackets to keep the link valid.
    const label = String(source.title || source.domain || source.url || "source").replace(/\s+/g, " ").trim().replace(/([\[\]])/g, "\\$1");
    const link = source.url ? `[${label}](${source.url})` : label;
    const trust = typeof source.trustScore === "number" ? ` \xB7 trust ${Math.round(source.trustScore * 100)}%` : "";
    const type = source.sourceType ? ` \xB7 ${source.sourceType}` : "";
    const domain = source.domain ? ` \xB7 ${source.domain}` : "";
    lines.push(`${index + 1}. ${link}${domain}${type}${trust}`);
    // Three-space indent keeps continuation lines inside the numbered list item.
    if (source.snippet) lines.push(`   ${String(source.snippet).replace(/\s+/g, " ").trim()}`);
    if (source.qualitySignals?.length) lines.push(`   Signals: ${source.qualitySignals.slice(0, 6).join(", ")}`);
    lines.push("");
  });
  if (!entries.length) {
    lines.push("No sources were recorded for this run.");
    lines.push("");
  }
  return `${lines.join("\n")}\n`;
}
7295
7938
  function artifactFilename(format) {
7296
7939
  if (format === "markdown") return "dataset.md";
7297
7940
  if (format === "instruction") return "instruction.jsonl";
@@ -7313,7 +7956,7 @@ async function generateDatasets(options) {
7313
7956
  const verificationEnabled = options.enableVerificationSwarm ?? performanceMode !== "fast";
7314
7957
  const debateEnabled = verificationEnabled && perf.debateEnabled;
7315
7958
  const multiplier = depthMultiplier(options.generationDepth);
7316
- const results = await mapLimit2(Array.from({ length: datasetCount }, (_, i) => i), Number(process.env.ALYS_DATASET_CONCURRENCY ?? perf.datasetConcurrency), async (i) => {
7959
+ const results = await mapLimit3(Array.from({ length: datasetCount }, (_, i) => i), Number(process.env.ALYS_DATASET_CONCURRENCY ?? perf.datasetConcurrency), async (i) => {
7317
7960
  const datasetIndex = i + 1;
7318
7961
  const datasetSourceLimit = Math.max(1, Math.floor((options.sourceLimit ?? 24) * multiplier));
7319
7962
  const datasetTargetRows = Math.max(1, Math.floor((options.targetRows ?? 100) * multiplier));
@@ -7343,7 +7986,7 @@ async function generateDatasets(options) {
7343
7986
  const datasets = results;
7344
7987
  return { manifests, artifacts, previews, datasets };
7345
7988
  }
7346
- async function mapLimit2(items, limit, worker) {
7989
+ async function mapLimit3(items, limit, worker) {
7347
7990
  const results = new Array(items.length);
7348
7991
  let nextIndex = 0;
7349
7992
  const workerCount = Math.max(1, Math.min(items.length, Math.floor(limit || 1)));
@@ -7436,6 +8079,13 @@ Limits:
7436
8079
  1 dataset = 1 generation
7437
8080
  max 5 datasets per run
7438
8081
  use --benchmark for local high-volume benchmark runs
8082
+
8083
+ Concepts:
8084
+ RAG chunks retrieval-ready records (for search or knowledge-base apps)
8085
+ Instruction tuning examples for fine-tuning (teaching model behavior)
8086
+ JSONL one JSON object per line (standard for ML pipelines)
8087
+ CSV spreadsheet-friendly rows (for review and analysis)
8088
+ Verification quality checks (confidence, repetition, schema validity)
7439
8089
  `);
7440
8090
  }
7441
8091
  function loadCliEnv(cwd = process.cwd()) {
@@ -7621,6 +8271,17 @@ function getEvaluation(dataset) {
7621
8271
  const evaluation = dataset.manifest.evaluation;
7622
8272
  return evaluation && typeof evaluation === "object" ? evaluation : {};
7623
8273
  }
8274
/**
 * Read the dataset blueprint recorded in a dataset's manifest.
 *
 * @param {object} dataset - Dataset summary; `manifest.generationPlan.blueprint` is read.
 * @returns {object} The blueprint object, or an empty object when the manifest,
 *   the generation plan or the blueprint is missing or is not a plain object
 *   value (arrays are rejected).
 */
function getBlueprint(dataset) {
  const isRecord = (value) => Boolean(value) && typeof value === "object" && !Array.isArray(value);
  const generationPlan = dataset?.manifest?.generationPlan;
  if (!isRecord(generationPlan)) return {};
  const blueprint2 = generationPlan.blueprint;
  return isRecord(blueprint2) ? blueprint2 : {};
}
8280
/**
 * Collect the usable entries of a dataset's recorded source manifest.
 *
 * @param {object} dataset - Dataset summary; `manifest.sourceManifest` is read.
 * @returns {Array<object>} Object entries that carry a string `title` or a
 *   string `url`; an empty array when no manifest array is present.
 */
function getSourceManifest(dataset) {
  const { sourceManifest } = dataset.manifest;
  if (!Array.isArray(sourceManifest)) return [];
  const usable = [];
  for (const entry of sourceManifest) {
    if (!entry || typeof entry !== "object") continue;
    if (typeof entry.title === "string" || typeof entry.url === "string") {
      usable.push(entry);
    }
  }
  return usable;
}
7624
8285
  function printStage(code, status, label, metric) {
7625
8286
  const tint = status === "DONE" || status === "OK" ? "green" : status === "WARN" ? "yellow" : "cyan";
7626
8287
  const prefix = `${paint(`[${code.padEnd(4).slice(0, 4)}]`, "gray")} ${paint(status.padEnd(4), tint)}`;
@@ -7682,7 +8343,7 @@ function printUsage(profile) {
7682
8343
  );
7683
8344
  }
7684
8345
  function printRunPlan(args) {
7685
- const multiplier = depthMultiplier2(args.depth);
8346
+ const multiplier = args.performanceMode === "fast" ? 1 : depthMultiplier2(args.depth);
7686
8347
  const effectiveSources = Math.max(1, Math.floor(args.sourceLimit * multiplier));
7687
8348
  const effectiveRows = Math.max(1, Math.floor(args.targetRows * multiplier));
7688
8349
  const totalRows = effectiveRows * args.datasetCount;
@@ -7813,12 +8474,28 @@ function printGenerationSummary(response, workspaceRoot) {
7813
8474
  const records = Number(metrics.recordsGenerated ?? summary.recordsAccepted ?? 0);
7814
8475
  const sources = Number(metrics.sourcesDiscovered ?? 0);
7815
8476
  const confidenceValue = Number(metrics.averageConfidence ?? summary.averageConfidence ?? 0);
8477
+ const blueprint2 = getBlueprint(dataset);
7816
8478
  const outputDir = import_node_path4.default.join(root, "datasets", dataset.id);
7817
8479
  console.log(`${paint("\u2022", "yellow")} ${paint(dataset.id, "white")} ${formatInt(records)} records ${formatInt(sources)} sources ${formatPercent(confidenceValue)} confidence`);
7818
8480
  console.log(` ${truncate(dataset.topic, 110)}`);
8481
+ if (blueprint2.label || blueprint2.id) {
8482
+ console.log(` blueprint ${paint(blueprint2.label ?? blueprint2.id ?? "dataset blueprint", "white")}${paint(blueprint2.id ? ` (${blueprint2.id})` : "", "gray")}`);
8483
+ }
7819
8484
  console.log(` ${paint(outputDir, "cyan")}`);
7820
8485
  console.log(` quality ${formatPercent(Number(quality.citationCoverage ?? 0))} citations \xB7 ${formatPercent(Number(quality.recordUniqueness ?? 0))} unique \xB7 ${formatPercent(Number(quality.sourceDiversity ?? 0))} source diversity`);
7821
8486
  console.log(` suitability RAG ${formatScore(Number(suitability.ragSuitability ?? 0))} \xB7 tuning ${formatScore(Number(suitability.instructionTuning ?? 0))} \xB7 usefulness ${formatScore(Number(suitability.humanUsefulness ?? 0))}`);
8487
+ const topSources = getSourceManifest(dataset).slice(0, 5);
8488
+ if (topSources.length) {
8489
+ console.log(paint(" sources", "gray"));
8490
+ for (const source of topSources) {
8491
+ const label = source.title || source.domain || source.provider || "source";
8492
+ const trust = Number(source.trustScore ?? source.authorityScore ?? source.relevanceScore ?? 0);
8493
+ const type = source.sourceType ? ` ${source.sourceType}` : "";
8494
+ const score = trust > 0 ? ` ${formatPercent(trust)} trust` : "";
8495
+ console.log(` - ${truncate(label, 76)}${paint(`${type}${score}`, "gray")}`);
8496
+ if (source.url) console.log(` ${paint(source.url, "cyan")}`);
8497
+ }
8498
+ }
7822
8499
  const preview = previewRecord(dataset);
7823
8500
  if (preview) {
7824
8501
  console.log(paint(" preview", "gray"));
@@ -8000,17 +8677,26 @@ async function handleGenerate(args, command) {
8000
8677
  validate: (v) => v.trim().length ? true : "Please enter a topic."
8001
8678
  })).topic);
8002
8679
  if (!topic) throw new Error("Missing topic.");
8003
- const datasetType = parseDatasetType(values.type) ?? (await (0, import_prompts3.default)({
8680
+ const datasetType = parseDatasetType(values.type) ?? (values.yes === true ? "instruction" : void 0) ?? (await (0, import_prompts3.default)({
8004
8681
  type: "select",
8005
8682
  name: "datasetType",
8006
- message: "Dataset type?",
8683
+ message: "Dataset type? (choose what the output should be used for)",
8007
8684
  choices: [
8008
- { title: "Instruction tuning", value: "instruction" },
8009
- { title: "RAG chunks", value: "rag" },
8010
- { title: "Question/Answer", value: "qa" }
8685
+ {
8686
+ title: "Instruction tuning (task + ideal answer examples)",
8687
+ value: "instruction"
8688
+ },
8689
+ {
8690
+ title: "RAG chunks (retrieval-ready context for search/knowledge bases)",
8691
+ value: "rag"
8692
+ },
8693
+ {
8694
+ title: "Question/Answer (direct QA pairs for evaluation or training)",
8695
+ value: "qa"
8696
+ }
8011
8697
  ]
8012
8698
  })).datasetType;
8013
- const requestedDatasetCount = values.datasets ? Math.max(1, Math.floor(Number(values.datasets))) : (await (0, import_prompts3.default)({
8699
+ const requestedDatasetCount = values.datasets ? Math.max(1, Math.floor(Number(values.datasets))) : values.yes === true ? 1 : (await (0, import_prompts3.default)({
8014
8700
  type: "number",
8015
8701
  name: "datasetCount",
8016
8702
  message: "How many datasets?",
@@ -8023,56 +8709,56 @@ async function handleGenerate(args, command) {
8023
8709
  return;
8024
8710
  }
8025
8711
  const datasetCount = requestedDatasetCount;
8026
- const exportFormats = values.format ? parseFormats(values.format) : (await (0, import_prompts3.default)({
8712
+ const exportFormats = values.format ? parseFormats(values.format) : values.yes === true ? ["jsonl", "csv", "markdown"] : (await (0, import_prompts3.default)({
8027
8713
  type: "multiselect",
8028
8714
  name: "exportFormats",
8029
- message: "Output formats?",
8715
+ message: "Output formats? (you can select multiple)",
8030
8716
  choices: [
8031
- { title: "JSONL", value: "jsonl", selected: true },
8032
- { title: "CSV", value: "csv", selected: true },
8033
- { title: "Markdown", value: "markdown" },
8034
- { title: "Instruction dataset", value: "instruction" },
8035
- { title: "RAG chunks", value: "rag" }
8717
+ { title: "JSONL (one JSON record per line, best for ML pipelines)", value: "jsonl", selected: true },
8718
+ { title: "CSV (spreadsheet-friendly review format)", value: "csv", selected: true },
8719
+ { title: "Markdown (readable summary for humans)", value: "markdown" },
8720
+ { title: "Instruction dataset (fine-tuning JSONL)", value: "instruction" },
8721
+ { title: "RAG chunks (retrieval-ready JSONL)", value: "rag" }
8036
8722
  ],
8037
8723
  hint: "Use space to select multiple."
8038
8724
  })).exportFormats;
8039
- const depth = parseDepth(values.depth) ?? (await (0, import_prompts3.default)({
8725
+ const depth = parseDepth(values.depth) ?? (values.yes === true ? "medium" : void 0) ?? (await (0, import_prompts3.default)({
8040
8726
  type: "select",
8041
8727
  name: "depth",
8042
- message: "Research depth?",
8728
+ message: "Research depth? (more depth can improve coverage but costs time)",
8043
8729
  choices: [
8044
- { title: "Shallow", value: "shallow" },
8045
- { title: "Medium", value: "medium" },
8046
- { title: "Deep", value: "deep" }
8730
+ { title: "Shallow (fastest, smaller context)", value: "shallow" },
8731
+ { title: "Medium (balanced default)", value: "medium" },
8732
+ { title: "Deep (broader coverage, slower)", value: "deep" }
8047
8733
  ]
8048
8734
  })).depth;
8049
- const sourceLimit = values.sources ? Math.min(maxSources, Math.max(1, Number(values.sources))) : (await (0, import_prompts3.default)({
8735
+ const sourceLimit = values.sources ? Math.min(maxSources, Math.max(1, Number(values.sources))) : values.yes === true ? benchmarkMode ? 48 : MAX_SOURCES_PER_RUN : (await (0, import_prompts3.default)({
8050
8736
  type: "number",
8051
8737
  name: "sourceLimit",
8052
- message: "How many sources?",
8738
+ message: "How many sources? (more sources can improve coverage but may slow the run)",
8053
8739
  initial: benchmarkMode ? 48 : MAX_SOURCES_PER_RUN,
8054
8740
  min: 1,
8055
8741
  max: maxSources
8056
8742
  })).sourceLimit;
8057
- const targetRows = values.rows ? Math.min(maxRows, Math.max(1, Number(values.rows))) : (await (0, import_prompts3.default)({
8743
+ const targetRows = values.rows ? Math.min(maxRows, Math.max(1, Number(values.rows))) : values.yes === true ? benchmarkMode ? 5e3 : MAX_ROWS_PER_DATASET : (await (0, import_prompts3.default)({
8058
8744
  type: "number",
8059
8745
  name: "targetRows",
8060
- message: "Rows per dataset?",
8746
+ message: "Rows per dataset? (Alys aims for rows worth keeping, not raw volume)",
8061
8747
  initial: benchmarkMode ? 5e3 : MAX_ROWS_PER_DATASET,
8062
8748
  min: 1,
8063
8749
  max: maxRows
8064
8750
  })).targetRows;
8065
- const workspaceRoot = (values.workspace ? String(values.workspace) : "").trim() || (await (0, import_prompts3.default)({
8751
+ const workspaceRoot = (values.workspace ? String(values.workspace) : "").trim() || (values.yes === true ? "~/Alys" : "") || (await (0, import_prompts3.default)({
8066
8752
  type: "text",
8067
8753
  name: "workspaceRoot",
8068
8754
  message: "Export directory?",
8069
8755
  initial: "~/Alys",
8070
8756
  validate: (v) => v.trim().length ? true : "Enter an export directory."
8071
8757
  })).workspaceRoot;
8072
- const verificationEnabled = values.verify === true ? true : values["no-verify"] === true ? false : (await (0, import_prompts3.default)({
8758
+ const verificationEnabled = values.verify === true ? true : values["no-verify"] === true ? false : values.yes === true ? performanceMode !== "fast" : (await (0, import_prompts3.default)({
8073
8759
  type: "toggle",
8074
8760
  name: "verificationEnabled",
8075
- message: "Enable verification checks?",
8761
+ message: "Enable verification checks? (slower, stricter about weak/repetitive records)",
8076
8762
  initial: performanceMode !== "fast",
8077
8763
  active: "Yes",
8078
8764
  inactive: "No"
@@ -8119,9 +8805,9 @@ async function handleGenerate(args, command) {
8119
8805
  console.log(paint("Runtime", "white"));
8120
8806
  printStage("AUTH", "OK", "Usage linked", appUrl());
8121
8807
  printStage("PLAN", "OK", "Generations charged only after successful completion", `${datasetCount} requested`);
8122
- printStage("SRC", "RUN", "Research pipeline starting", `${sourceLimit} source target`);
8808
+ printStage("RUN", "RUN", "Dataset runtime starting", `${performanceMode} mode`);
8123
8809
  const response = await withSpinner(
8124
- "Alys research runtime executing",
8810
+ "Alys runtime executing",
8125
8811
  requestJson(
8126
8812
  "/api/cli/generate",
8127
8813
  {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "alys-akusa",
3
- "version": "0.1.6",
3
+ "version": "0.1.8",
4
4
  "private": false,
5
5
  "description": "Alys local CLI runtime for autonomous dataset generation.",
6
6
  "license": "UNLICENSED",