npm - alys-akusa - Versions diffs - 0.1.7 → 0.1.8 - Mend

alys-akusa 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/dist/index.cjs +702 -30
package/package.json +1 -1

package/dist/index.cjs CHANGED Viewed

@@ -5109,7 +5109,10 @@ async function discoverResearchSources(topic, options = {}) {
   };
 }
 function createConfiguredSearchProviders() {
-  const providers = [];
+  const providers = [
+    new GitHubSearchProvider(env("GITHUB_TOKEN")),
+    new KaggleSearchProvider(env("KAGGLE_USERNAME"), env("KAGGLE_KEY"))
+  ];
   const tavilyKey = env("TAVILY_API_KEY");
   if (tavilyKey) providers.push(new TavilySearchProvider(tavilyKey));
   const serpApiKey = env("SERPAPI_API_KEY");
@@ -5155,6 +5158,107 @@ var TavilySearchProvider = class {
     }));
   }
 };
+/**
+ * Search provider that queries GitHub's repository search endpoint for dataset-like repositories.
+ * Requests go out unauthenticated unless a token is supplied.
+ */
+var GitHubSearchProvider = class {
+  /**
+   * @param {string|null} [token] Sent as a Bearer credential when truthy.
+   */
+  constructor(token = null) {
+    this.token = token;
+  }
+  name = "github";
+  /**
+   * Searches repositories and maps each hit to a provider result.
+   * @param {string} query Search text; dataset-oriented keywords and `in:` qualifiers are appended.
+   * @param {{limit?: number, timeoutMs?: number}} [options] `limit` is bounded to 1..20 (default 10).
+   * @returns {Promise<object[]>} One result per item that carries both `full_name` and `html_url`.
+   */
+  async search(query, options = {}) {
+    // Keep per_page a whole number in 1..20 so a zero, negative, or non-numeric limit is never sent verbatim.
+    const requested = Number(options.limit ?? 10);
+    const perPage = Number.isFinite(requested) ? Math.min(20, Math.max(1, Math.trunc(requested))) : 10;
+    const url = new URL("https://api.github.com/search/repositories");
+    url.searchParams.set("q", `${query} dataset OR benchmark OR corpus OR csv OR jsonl in:name,description,readme`);
+    url.searchParams.set("sort", "stars");
+    url.searchParams.set("order", "desc");
+    url.searchParams.set("per_page", String(perPage));
+    const headers = {
+      Accept: "application/vnd.github+json",
+      "User-Agent": "AlysResearchBot/0.1",
+      "X-GitHub-Api-Version": "2022-11-28"
+    };
+    if (this.token) headers.Authorization = `Bearer ${this.token}`;
+    const payload = await fetchJson(url.toString(), { headers }, options.timeoutMs);
+    return asArray(payload.items).flatMap((item) => {
+      if (!item || typeof item !== "object") return [];
+      const object = item;
+      const fullName = firstString(object, ["full_name"]);
+      const htmlUrl = firstString(object, ["html_url"]);
+      if (!fullName || !htmlUrl) return [];
+      const stars = firstNumber(object, ["stargazers_count"]) ?? 0;
+      const forks = firstNumber(object, ["forks_count"]) ?? 0;
+      const topics = Array.isArray(object.topics) ? object.topics.map(String).slice(0, 8) : [];
+      const reportedLicense = object.license && typeof object.license === "object" ? firstString(object.license, ["spdx_id", "name"]) : "";
+      // GitHub uses the SPDX placeholder "NOASSERTION" when it cannot identify a license;
+      // that must not earn the license bonus or be shown as a license name.
+      const license = reportedLicense === "NOASSERTION" ? "" : reportedLicense;
+      const description = firstString(object, ["description"]);
+      const language = firstString(object, ["language"]);
+      // Log-scaled stars and forks dominate; a license and topics add small fixed bonuses.
+      const score = clamp01(Math.log10(stars + 1) / 5 * 0.72 + Math.log10(forks + 1) / 5 * 0.16 + (license ? 0.08 : 0) + (topics.length ? 0.04 : 0));
+      return [{
+        title: `GitHub: ${fullName}`,
+        url: htmlUrl,
+        snippet: [
+          description,
+          language ? `Language: ${language}.` : "",
+          license ? `License: ${license}.` : "",
+          topics.length ? `Topics: ${topics.join(", ")}.` : "",
+          `Stars: ${stars}. Forks: ${forks}.`
+        ].filter(Boolean).join(" "),
+        publishedAt: firstString(object, ["updated_at", "pushed_at", "created_at"]),
+        score,
+        provider: this.name,
+        query,
+        raw: item
+      }];
+    });
+  }
+};
+/**
+ * Search provider that lists Kaggle datasets matching a query.
+ * Uses HTTP Basic auth built from a username/key pair and is a no-op without both.
+ */
+var KaggleSearchProvider = class {
+  /**
+   * @param {string|null} [username] Kaggle account name.
+   * @param {string|null} [key] Kaggle API key.
+   */
+  constructor(username = null, key = null) {
+    this.username = username;
+    this.key = key;
+  }
+  name = "kaggle";
+  /**
+   * Searches Kaggle datasets and maps each hit to a provider result.
+   * @param {string} query Search text passed as the `search` parameter.
+   * @param {{limit?: number, timeoutMs?: number}} [options] `limit` is bounded to 1..20 (default 10).
+   * @returns {Promise<object[]>} Empty when credentials are missing; otherwise one result per
+   *   item that has a title and a resolvable dataset URL.
+   */
+  async search(query, options = {}) {
+    if (!this.username || !this.key) {
+      return [];
+    }
+    // Keep pageSize a whole number in 1..20 so a zero, negative, or non-numeric limit is never sent verbatim.
+    const requested = Number(options.limit ?? 10);
+    const pageSize = Number.isFinite(requested) ? Math.min(20, Math.max(1, Math.trunc(requested))) : 10;
+    const url = new URL("https://www.kaggle.com/api/v1/datasets/list");
+    url.searchParams.set("search", query);
+    url.searchParams.set("sortBy", "hottest");
+    url.searchParams.set("pageSize", String(pageSize));
+    const auth = Buffer.from(`${this.username}:${this.key}`).toString("base64");
+    const payload = await fetchJson(url.toString(), {
+      headers: {
+        Accept: "application/json",
+        Authorization: `Basic ${auth}`,
+        "User-Agent": "AlysResearchBot/0.1"
+      }
+    }, options.timeoutMs);
+    // The response is accepted either as a bare array or wrapped under `datasets` / `results`.
+    const items = Array.isArray(payload) ? payload : payload && typeof payload === "object" ? asArray(payload.datasets ?? payload.results) : [];
+    return items.flatMap((item) => {
+      if (!item || typeof item !== "object") return [];
+      const object = item;
+      // Only a dataset ref may be turned into a dataset URL; an owner name alone
+      // identifies an account, so it is kept purely as a last-resort title.
+      const ref = firstString(object, ["ref", "datasetRef"]);
+      const owner = firstString(object, ["ownerName"]);
+      const title = firstString(object, ["title", "subtitle", "ref"]) || ref || owner;
+      const datasetUrl = firstString(object, ["url"]) || (ref ? `https://www.kaggle.com/datasets/${ref}` : "");
+      if (!title || !datasetUrl) return [];
+      const votes = firstNumber(object, ["voteCount", "votes"]) ?? 0;
+      const downloads = firstNumber(object, ["downloadCount", "downloads"]) ?? 0;
+      const usability = firstNumber(object, ["usabilityRating"]) ?? 0;
+      // Usability is bounded to 0..1 before weighting so an out-of-range rating cannot skew the blend.
+      const boundedUsability = Math.min(1, Math.max(0, usability));
+      const score = clamp01(Math.log10(downloads + 1) / 6 * 0.38 + Math.log10(votes + 1) / 5 * 0.24 + boundedUsability * 0.28 + 0.1);
+      return [{
+        title: `Kaggle: ${title}`,
+        url: datasetUrl,
+        snippet: [
+          firstString(object, ["subtitle", "description"]),
+          `Downloads: ${downloads}. Votes: ${votes}.`,
+          usability ? `Usability: ${usability}.` : ""
+        ].filter(Boolean).join(" "),
+        publishedAt: firstString(object, ["lastUpdated", "creationDate"]),
+        score,
+        provider: this.name,
+        query,
+        raw: item
+      }];
+    });
+  }
+};
 var SerpApiSearchProvider = class {
   constructor(apiKey) {
     this.apiKey = apiKey;
@@ -5294,6 +5398,8 @@ function buildResearchQueries(topic, count = 5) {
   const normalized = topic.trim().replace(/\s+/g, " ");
   const facets = [
     normalized,
+    `${normalized} dataset github kaggle benchmark`,
+    `${normalized} public dataset csv jsonl parquet`,
     `${normalized} official documentation standards methodology`,
     `${normalized} research paper benchmark evaluation`,
     `${normalized} case study operational data`,
@@ -5318,8 +5424,9 @@ ${result.url}`);
     const freshnessScore = freshnessForDate(result.publishedAt);
     const duplicationRisk = clamp01(Math.max(0, (domainCounts.get(domain) ?? 1) - 1) * 0.12);
     const providerScore = normalizeProviderScore(result.score);
+    const sourcePreference = sourcePreferenceScore(domain, result.url, result.provider);
     const trustScore = clamp01(
-      authority.score * 0.3 + relevanceScore * 0.27 + semanticScore * 0.18 + freshnessScore * 0.12 + providerScore * 0.08 + (1 - duplicationRisk) * 0.05
+      authority.score * 0.3 + relevanceScore * 0.27 + semanticScore * 0.18 + freshnessScore * 0.1 + providerScore * 0.07 + sourcePreference * 0.05 + (1 - duplicationRisk) * 0.05
     );
     return {
       id: sourceId(result.url),
@@ -5339,7 +5446,12 @@ ${result.url}`);
       semanticScore: Number(semanticScore.toFixed(3)),
       trustScore: Number(trustScore.toFixed(3)),
       sourceType: authority.type,
-      qualitySignals: authority.signals
+      qualitySignals: [
+        ...authority.signals,
+        ...sourcePreference >= 0.85 ? ["preferred-source-surface"] : [],
+        ...result.provider === "github" ? ["github-repository-search"] : [],
+        ...result.provider === "kaggle" ? ["kaggle-dataset-search"] : []
+      ]
     };
   });
 }
@@ -5549,6 +5661,14 @@ function authorityForDomain(domain, url) {
     score = 0.78;
     type = "official";
     signals.push("official-documentation");
+  } else if (host === "github.com") {
+    score = 0.84;
+    type = "code";
+    signals.push("open-source-repository");
+  } else if (host === "kaggle.com" || host.endsWith(".kaggle.com")) {
+    score = 0.86;
+    type = "dataset";
+    signals.push("dataset-marketplace");
   } else if (host.includes("wikipedia.org")) {
     score = 0.62;
     type = "community";
@@ -5564,6 +5684,18 @@ function authorityForDomain(domain, url) {
   }
   return { score, type, signals };
 }
+/**
+ * Rates how preferred a result's hosting surface is for dataset research.
+ * Checks run from most to least preferred and the first match wins.
+ * @param {string} domain Host name of the result.
+ * @param {string} url Full result URL; a non-string value is treated as empty.
+ * @param {string} provider Name of the search provider that produced the result.
+ * @returns {number} Preference between 0.24 and 0.96; 0.55 when nothing matches.
+ */
+function sourcePreferenceScore(domain, url, provider) {
+  const host = domain.toLowerCase();
+  const link = typeof url === "string" ? url : "";
+  // Exact-or-subdomain match so look-alike hosts such as "kaggle.com.example.io" do not qualify.
+  const onDomain = (name) => host === name || host.endsWith(`.${name}`);
+  if (provider === "kaggle" || onDomain("kaggle.com")) return 0.96;
+  if (provider === "github" || host === "github.com") return 0.93;
+  // nist.gov and sec.gov are already covered by the .gov suffix.
+  if (host.endsWith(".gov")) return 0.92;
+  if (onDomain("arxiv.org") || host.endsWith(".edu") || onDomain("openalex.org")) return 0.88;
+  // A host never contains a path, so the dataset section has to be looked up in the URL.
+  if (onDomain("huggingface.co") && link.includes("/datasets")) return 0.88;
+  if (host.includes("data.gov") || link.includes("/dataset")) return 0.84;
+  if (host.includes("docs.") || link.includes("/docs/") || link.includes("/documentation/")) return 0.78;
+  if (host.includes("reddit.") || host.includes("medium.") || host.includes("substack.")) return 0.24;
+  return 0.55;
+}
 function freshnessForDate(value) {
   if (!value) return 0.62;
   const timestamp = Date.parse(value);
@@ -5658,6 +5790,8 @@ function env(key) {
   return value || null;
 }
 var LOCAL_SOURCE_BASES = [
+  { label: "GitHub", url: "https://github.com/search", querySuffix: "q", signal: "Open-source repositories, examples, datasets, and benchmark code." },
+  { label: "Kaggle", url: "https://www.kaggle.com/datasets", querySuffix: "search", signal: "Public dataset catalog and dataset-level examples." },
   { label: "NIST", url: "https://www.nist.gov/search", querySuffix: "q", signal: "Technical guidance and standards language." },
   { label: "SEC", url: "https://www.sec.gov/search", querySuffix: "q", signal: "Regulatory filings and official disclosures." },
   { label: "PubMed", url: "https://pubmed.ncbi.nlm.nih.gov", querySuffix: "term", signal: "Academic and biomedical literature index." },
@@ -6128,13 +6262,235 @@ Source: ${result.source.url}
 }
 // ../../packages/prompts/src/index.ts
-function buildDatasetGenerationPlan(topic) {
+// Catalog of dataset blueprints. Every entry is built with blueprint(), which fills in a default
+// version, and declares its row fields via field(name, type, description, required, example).
+// selectDatasetBlueprint() resolves entries from this array by id and falls back to the first entry.
+var DATASET_BLUEPRINTS = [
+  blueprint({
+    id: "instruction-finetune-corpus",
+    label: "Instruction Fine-Tuning Corpus",
+    description: "Task-and-answer examples for model fine-tuning, evaluator training, and assistant behavior shaping.",
+    defaultType: "instruction",
+    recommendedFormats: ["instruction", "jsonl", "csv", "markdown"],
+    bestFor: ["fine-tuning", "assistant behavior", "domain task completion", "supervised training"],
+    fields: [
+      field("instruction", "string", "The user-facing task the model should perform.", true, "Handle a pricing objection from a VP of Sales."),
+      field("input_context", "string", "The scenario, persona, constraints, or source-backed context for the instruction.", true, "Mid-market CRM buyer comparing annual contracts."),
+      field("ideal_output", "string", "The high-quality answer or completion expected from the model.", true, "Acknowledge budget pressure, quantify missed pipeline risk, and offer a pilot path."),
+      field("skill", "string", "The primary capability being trained.", true, "objection-handling"),
+      field("difficulty", "string", "Expected difficulty or complexity level.", false, "intermediate"),
+      field("source_url", "string", "The source URL supporting this record.", true, "https://example.com/source"),
+      field("confidence", "number", "Alys confidence score after source and quality checks.", true, "0.86")
+    ],
+    recordInstructions: [
+      "Make the instruction actionable and directly trainable.",
+      "Keep the output specific enough to teach a behavior, not a generic explanation.",
+      "Include scenario/context when the answer depends on buyer role, domain, risk, or constraints."
+    ],
+    qualityBar: [
+      "Record teaches one clear skill.",
+      "Answer is useful without reading the source page.",
+      "No repeated template phrasing across rows."
+    ]
+  }),
+  blueprint({
+    id: "rag-corpus",
+    label: "RAG Retrieval Corpus",
+    description: "Retrieval-ready passages, grounded answers, and citation-friendly chunks for search or knowledge bases.",
+    defaultType: "rag",
+    recommendedFormats: ["rag", "jsonl", "csv", "markdown"],
+    bestFor: ["RAG", "semantic search", "knowledge bases", "citation-aware QA"],
+    fields: [
+      field("query", "string", "A realistic retrieval query or user question.", true, "What evidence supports the SOC 2 access review requirement?"),
+      field("answer", "string", "A compact answer grounded only in the retrieved context.", true, "The control requires periodic review of user access and documented exceptions."),
+      field("chunk_text", "string", "The retrieval text chunk that should be embedded.", true, "Access reviews should be performed periodically..."),
+      field("source_title", "string", "Human-readable source title.", true, "SOC 2 Criteria Overview"),
+      field("source_url", "string", "Canonical source URL.", true, "https://example.com/source"),
+      field("citation_span", "string", "The source-backed phrase or section used as evidence.", false, "periodic review of user access"),
+      field("confidence", "number", "Grounding confidence for the answer/chunk.", true, "0.91")
+    ],
+    recordInstructions: [
+      "Make each context chunk standalone and retrieval-ready.",
+      "Answer only what the source context supports.",
+      "Prefer compact, evidence-rich chunks over long summaries."
+    ],
+    qualityBar: [
+      "Chunk contains one coherent idea.",
+      "Query is realistic, not keyword stuffing.",
+      "Answer can be traced back to source context."
+    ]
+  }),
+  blueprint({
+    id: "evaluation-qa",
+    label: "Evaluation QA Dataset",
+    description: "Question-answer records with expected answers, rubric signals, difficulty, and factual grounding.",
+    defaultType: "qa",
+    recommendedFormats: ["jsonl", "csv", "markdown"],
+    bestFor: ["model evaluation", "benchmarking", "golden sets", "regression testing"],
+    fields: [
+      field("question", "string", "A precise evaluation question.", true, "Which mitigation should be used when duplicate records reduce retrieval diversity?"),
+      field("expected_answer", "string", "The answer a model should produce.", true, "Apply semantic deduplication and retain records with stronger source support."),
+      field("rubric", "string", "Criteria used to grade the model response.", true, "Must mention deduplication, source support, and diversity impact."),
+      field("difficulty", "string", "Difficulty band.", true, "hard"),
+      field("failure_modes", "array", "Likely wrong answers to catch.", false, "hallucinated metric, unsupported source claim"),
+      field("source_url", "string", "Evidence source URL.", true, "https://example.com/source"),
+      field("confidence", "number", "Grounding confidence.", true, "0.88")
+    ],
+    recordInstructions: [
+      "Write questions that test reasoning, not memorization only.",
+      "Include rubric-like constraints in the answer or context.",
+      "Add failure-mode awareness when the source contains ambiguity."
+    ],
+    qualityBar: [
+      "Question has one defensible expected answer.",
+      "Rubric exposes what a weak model would miss.",
+      "Evidence is visible in context."
+    ]
+  }),
+  blueprint({
+    id: "b2b-saas-objections",
+    label: "B2B SaaS Objection Handling",
+    description: "Sales-training records for pricing, procurement, ROI, onboarding, integration, security, and competitive objections.",
+    defaultType: "instruction",
+    recommendedFormats: ["instruction", "jsonl", "csv", "markdown"],
+    bestFor: ["sales enablement", "roleplay training", "support coaching", "fine-tuning"],
+    fields: [
+      field("buyer_role", "string", "The buyer or stakeholder persona.", true, "CFO"),
+      field("company_segment", "string", "Customer segment or buying context.", true, "mid-market SaaS"),
+      field("objection_category", "string", "Primary objection class.", true, "pricing"),
+      field("objection", "string", "The exact buyer objection.", true, "This is too expensive compared to our current tool."),
+      field("recommended_response", "string", "The ideal grounded response.", true, "Tie cost to pipeline leakage and propose a measured pilot."),
+      field("proof_point", "string", "Evidence or reasoning used in the response.", false, "integration time and support burden"),
+      field("follow_up_question", "string", "A next-step question that advances discovery.", true, "What cost are you currently assigning to delayed handoffs?")
+    ],
+    recordInstructions: [
+      "Make objections sound like real buyers, not canned sales scripts.",
+      "Answer with empathy, evidence, and a concrete next step.",
+      "Vary buyer role, deal stage, company size, and risk profile."
+    ],
+    qualityBar: [
+      "Response addresses the objection directly.",
+      "No fake statistics or logos.",
+      "Follow-up question is useful in a real sales call."
+    ]
+  }),
+  blueprint({
+    id: "technical-docs-qa",
+    label: "Technical Documentation QA",
+    description: "Developer-facing questions, answers, commands, prerequisites, errors, and implementation details grounded in docs or repos.",
+    defaultType: "qa",
+    recommendedFormats: ["jsonl", "csv", "markdown", "rag"],
+    bestFor: ["developer docs", "support bots", "SDK QA", "RAG corpora"],
+    fields: [
+      field("product_area", "string", "The API, package, CLI, or feature area.", true, "CLI authentication"),
+      field("question", "string", "Developer question or task.", true, "How do I authenticate the CLI?"),
+      field("answer", "string", "Specific answer grounded in documentation.", true, "Run npx alys-akusa login and finish the browser flow."),
+      field("code_or_command", "string", "Relevant command, code, or config.", false, "npx alys-akusa login"),
+      field("prerequisites", "array", "Required setup before this works.", false, "Alys account, browser access"),
+      field("common_error", "string", "Likely failure mode or troubleshooting note.", false, "Expired CLI login session."),
+      field("source_url", "string", "Documentation or repository URL.", true, "https://example.com/docs")
+    ],
+    recordInstructions: [
+      "Prefer commands, parameters, return shapes, limits, and edge cases.",
+      "Never invent API names or package names.",
+      "If docs are ambiguous, make the uncertainty visible."
+    ],
+    qualityBar: [
+      "Answer can be executed or verified.",
+      "No fake APIs.",
+      "Source context includes the relevant command or behavior."
+    ]
+  }),
+  blueprint({
+    id: "legal-compliance-rag",
+    label: "Legal & Compliance RAG Corpus",
+    description: "Compliance-aware chunks and QA with jurisdictions, requirements, controls, exceptions, and evidence notes.",
+    defaultType: "rag",
+    recommendedFormats: ["rag", "jsonl", "csv", "markdown"],
+    bestFor: ["compliance search", "policy QA", "audit prep", "legal retrieval"],
+    fields: [
+      field("jurisdiction", "string", "Relevant jurisdiction or regulatory scope.", false, "United States"),
+      field("requirement", "string", "The rule, obligation, or policy requirement.", true, "Maintain access review evidence."),
+      field("control_or_action", "string", "Concrete action, control, or procedure.", true, "Review user access quarterly and document exceptions."),
+      field("exception_or_limit", "string", "Boundary, exception, or uncertainty.", false, "Frequency may vary by framework."),
+      field("evidence_text", "string", "Source-backed evidence span.", true, "periodic access reviews"),
+      field("source_url", "string", "Canonical source URL.", true, "https://example.com/policy"),
+      field("confidence", "number", "Confidence after source checks.", true, "0.84")
+    ],
+    recordInstructions: [
+      "Preserve scope and limitations.",
+      "Do not convert legal text into absolute advice when the source is conditional.",
+      "Separate requirement, control, and exception clearly."
+    ],
+    qualityBar: [
+      "Jurisdiction/scope is not blurred.",
+      "No invented legal conclusions.",
+      "Evidence text supports the answer."
+    ]
+  }),
+  blueprint({
+    id: "engineering-telemetry",
+    label: "Engineering Telemetry & Operations",
+    description: "Operational records for engineering systems: parameters, units, ranges, anomalies, recommendations, and safety constraints.",
+    defaultType: "instruction",
+    recommendedFormats: ["jsonl", "csv", "instruction", "markdown"],
+    bestFor: ["industrial AI", "predictive maintenance", "ops training", "engineering assistants"],
+    fields: [
+      field("asset_type", "string", "Equipment, system, or asset class.", true, "oil pipeline pump station"),
+      field("operation_phase", "string", "Workflow or operating phase.", true, "commissioning"),
+      field("parameter", "string", "Observed parameter or measurement.", true, "pressure differential"),
+      field("unit", "string", "Engineering unit.", false, "psi"),
+      field("normal_range", "string", "Source-backed or cautiously inferred expected range.", false, "site-specific; verify against design docs"),
+      field("abnormal_signal", "string", "Failure pattern, warning, or anomaly.", true, "rising vibration with falling flow rate"),
+      field("recommended_action", "string", "Operationally safe next action.", true, "inspect pump seals and verify sensor calibration"),
+      field("safety_note", "string", "Boundary or caution.", true, "do not exceed site operating procedures")
+    ],
+    recordInstructions: [
+      "Keep units, ranges, and recommendations physically plausible.",
+      "If numeric ranges are not in source context, say they require site-specific verification.",
+      "Include normal, abnormal, edge, and incident scenarios."
+    ],
+    qualityBar: [
+      "No invented unsafe operating limits.",
+      "Action follows from the signal.",
+      "Record includes constraints and uncertainty."
+    ]
+  }),
+  blueprint({
+    id: "dataset-source-catalog",
+    label: "Dataset Source Catalog",
+    description: "Catalog records for public datasets, repositories, benchmark corpora, licenses, schemas, and use constraints.",
+    defaultType: "qa",
+    recommendedFormats: ["jsonl", "csv", "markdown"],
+    bestFor: ["dataset discovery", "benchmark planning", "source audits", "training-data procurement"],
+    fields: [
+      field("source_name", "string", "Dataset/repository/source name.", true, "SDV benchmark dataset"),
+      field("source_url", "string", "Canonical URL.", true, "https://github.com/sdv-dev/SDV"),
+      field("source_type", "string", "Dataset, repository, benchmark, paper, documentation, or registry.", true, "repository"),
+      field("domain", "string", "Domain or category.", false, "synthetic data"),
+      field("available_formats", "array", "Known available formats.", false, "csv, jsonl, parquet"),
+      field("license_or_terms", "string", "License or usage constraints when available.", false, "MIT"),
+      field("schema_summary", "string", "Short description of fields/tables/tasks.", true, "Benchmark suite for tabular synthetic data."),
+      field("trust_reason", "string", "Why this source is credible or useful.", true, "Primary GitHub repository with active documentation.")
+    ],
+    recordInstructions: [
+      "Favor GitHub, Kaggle, Hugging Face datasets, official benchmark pages, and primary repositories.",
+      "Do not claim license, row count, or schema details unless visible in source context.",
+      "Make the catalog immediately usable for dataset selection."
+    ],
+    qualityBar: [
+      "Every row points to a real source URL.",
+      "License/format claims are source-backed or marked unknown.",
+      "Trust reason is explicit."
+    ]
+  })
+];
+function buildDatasetGenerationPlan(topic, datasetType) {
   const normalized = topic.trim().replace(/\s+/g, " ");
   const lower = normalized.toLowerCase();
   const segments = knownSegments(lower, normalized) ?? genericSegments(normalized);
+  const blueprint2 = selectDatasetBlueprint(normalized, datasetType);
   return {
     topic: normalized,
     intent: inferIntent(lower),
+    blueprint: blueprint2,
     segments,
     sourceCriteria: [
       "Prefer official documentation, standards bodies, academic papers, technical reports, government sources, and primary company docs.",
@@ -6149,12 +6505,41 @@ function buildDatasetGenerationPlan(topic) {
       "Repeated phrasing across records indicates mode collapse and must be suppressed."
     ],
     outputSchemaNotes: [
+      `Use the "${blueprint2.label}" blueprint (${blueprint2.id}) as the canonical row contract.`,
+      `Recommended exports: ${blueprint2.recommendedFormats.join(", ")}.`,
+      ...blueprint2.fields.map((item) => `${item.required ? "Required" : "Optional"} field: ${item.name} (${item.type}) - ${item.description}`),
       "Each record needs an actionable input, specific output, source context, source URL, confidence, tags, and metadata.",
       "Metadata should include source trust, source authority, source relevance, segment ID, quality signals, and benchmark notes.",
       "Records should be useful without the original source page while still preserving provenance."
     ]
   };
 }
+/**
+ * Picks the dataset blueprint for a topic.
+ * Topic keywords are checked in a fixed priority order; when none match, `datasetType`
+ * decides ("rag" -> rag-corpus, "qa" -> evaluation-qa, anything else -> instruction-finetune-corpus).
+ * @param {string} topic Topic text; matching is case-insensitive.
+ * @param {string} [datasetType] Requested dataset type, used only when no keyword matches.
+ * @returns {object} The chosen blueprint, or the first catalog entry if the lookup comes back empty.
+ */
+function selectDatasetBlueprint(topic, datasetType) {
+  const lower = topic.toLowerCase();
+  const has = (phrase) => lower.includes(phrase);
+  // Short keywords must match as whole words (optionally plural): a bare substring test
+  // misroutes topics such as "clinical" (cli), "storage" (rag), "soil" (oil) or "capital" (api).
+  const hasWord = (word) => new RegExp(`(?:^|[^a-z0-9])${word}s?(?![a-z0-9])`).test(lower);
+  // "eval" only counts at the start of a word, so "retrieval" no longer lands on the
+  // evaluation blueprint ahead of the RAG check further down.
+  const startsWord = (prefix) => new RegExp(`(?:^|[^a-z0-9])${prefix}`).test(lower);
+  const selected = (() => {
+    if ((has("b2b") || has("sales")) && has("objection")) return byId("b2b-saas-objections");
+    if (has("legal") || has("compliance") || has("soc 2") || has("hipaa")) return byId("legal-compliance-rag");
+    if (hasWord("oil") || hasWord("gas") || has("pipeline") || has("drilling") || has("telemetry") || has("maintenance")) return byId("engineering-telemetry");
+    if (has("documentation") || has("developer") || hasWord("api") || has("openapi") || hasWord("sdk") || hasWord("cli")) return byId("technical-docs-qa");
+    if (has("benchmark") || startsWord("eval")) return byId("evaluation-qa");
+    if (has("source catalog") || has("public dataset") || has("kaggle") || has("github dataset")) return byId("dataset-source-catalog");
+    if (hasWord("rag") || has("retrieval") || has("knowledge base")) return byId("rag-corpus");
+    return datasetType === "rag" ? byId("rag-corpus") : datasetType === "qa" ? byId("evaluation-qa") : byId("instruction-finetune-corpus");
+  })();
+  return selected ?? DATASET_BLUEPRINTS[0];
+}
+// Returns the catalog blueprint whose id equals `id`, or undefined when no entry matches.
+function byId(id) {
+  for (const candidate of DATASET_BLUEPRINTS) {
+    if (candidate.id === id) return candidate;
+  }
+  return undefined;
+}
+// Copies a blueprint definition and stamps version "1.0.0" on it when the input has no version.
+function blueprint(input) {
+  const definition = { ...input };
+  definition.version = input.version ?? "1.0.0";
+  return definition;
+}
+// Bundles the five positional arguments into a single field descriptor object.
+function field(name, type, description, required, example) {
+  const descriptor = {};
+  descriptor.name = name;
+  descriptor.type = type;
+  descriptor.description = description;
+  descriptor.required = required;
+  descriptor.example = example;
+  return descriptor;
+}
 function segmentForSource(plan, text, index = 0) {
   const haystack = text.toLowerCase();
   const scored = plan.segments.map((segment2) => {
@@ -6267,9 +6652,12 @@ Your job is to generate production-grade synthetic dataset records that can surv
 Hard rules:
 - Return only valid JSON matching the requested schema.
 - Each record must be meaningfully different.
+- Every record must be grounded in the provided source context.
+- Do not introduce facts, numbers, names, benchmarks, URLs, citations, or claims that are not present in the source context.
 - Preserve plausible domain physics, operational constraints, and realistic terminology.
 - Prefer specific scenarios, values, failure modes, and edge cases over generic prose.
-- Do not include fake citations. Source context is provenance, not proof of factual truth.
+- If the source context is thin, produce cautious records about what can be inferred and what needs verification.
+- Do not include fake citations. Source URL and source title are attached outside your JSON.
 - Never output placeholders, TODOs, markdown fences, or explanations.
 `.trim();
 function loadAlysEnv(cwd = process.cwd()) {
@@ -6361,6 +6749,9 @@ Active topic segment:
 Generation plan intent:
 ${options.generationPlan.intent}
+Dataset blueprint:
+${renderBlueprintForPrompt(options.generationPlan)}
 Required source criteria:
 ${options.generationPlan.sourceCriteria.map((item) => `- ${item}`).join("\n")}
@@ -6394,6 +6785,9 @@ ${segmentBlock}
 Generate exactly ${targetCount} records.
 Domain quality requirements:
+- Ground every record in the source context below. If a detail is not in the context, do not invent it.
+- Use the dataset blueprint above as the semantic row contract. Do not waste tokens describing CSV/JSONL formatting; Alys handles export formatting.
+- Put the most important blueprint field values into input, output, and context. Add blueprint field names to metadata.signals or metadata.constraints when useful.
 - Include realistic parameters, edge cases, constraints, and operational variability.
 - For engineering domains, use physically plausible values and causal relationships.
 - For tabular/synthetic-data domains, preserve schema-like consistency and row-level diversity.
@@ -6401,6 +6795,7 @@ Domain quality requirements:
 - Avoid repeating the topic verbatim in every output.
 - Reflect source quality: lower-confidence sources should create cautious, verification-aware records.
 - If the source is low authority, write records as verification-aware training examples rather than confident factual claims.
+- Prefer source-backed dataset rows over broad advice. If the context only supports broad advice, lower confidence.
 - Every output should contain a specific decision signal, scenario, constraint, or useful answer. Reject generic summaries.
 - Prefer records that can be directly reused for fine-tuning, RAG evaluation, or QA benchmarks.
@@ -6425,8 +6820,29 @@ Return JSON with this shape:
 Source context:
 ${sourceText}
+Final grounding rule:
+Only use information supported by Source title, Source URL, Source quality, and Source context above.
 `.trim();
 }
+// Renders plan.blueprint as newline-separated prompt text: a header block, one bullet per
+// field, then the record instructions and the quality bar as bullet lists.
+function renderBlueprintForPrompt(plan) {
+  const spec = plan.blueprint;
+  const asBullet = (text) => `- ${text}`;
+  const lines = [];
+  lines.push(`ID: ${spec.id}`);
+  lines.push(`Label: ${spec.label}`);
+  lines.push(`Version: ${spec.version}`);
+  lines.push(`Purpose: ${spec.description}`);
+  lines.push(`Best for: ${spec.bestFor.join(", ")}`);
+  lines.push("Fields:");
+  for (const entry of spec.fields) {
+    const requirement = entry.required ? ", required" : ", optional";
+    // The example suffix is dropped entirely when the field carries no example.
+    const exampleSuffix = entry.example ? ` Example: ${entry.example}` : "";
+    lines.push(`- ${entry.name} (${entry.type}${requirement}): ${entry.description}${exampleSuffix}`);
+  }
+  lines.push("Record instructions:");
+  for (const instruction of spec.recordInstructions) {
+    lines.push(asBullet(instruction));
+  }
+  lines.push("Quality bar:");
+  for (const criterion of spec.qualityBar) {
+    lines.push(asBullet(criterion));
+  }
+  return lines.join("\n");
+}
 async function generateWithOpenAI(options) {
   const apiKey = getOpenAIKey();
   const model = process.env.ALYS_GENERATOR_MODEL || DEFAULT_OPENAI_MODEL;
@@ -6447,7 +6863,7 @@ async function generateWithOpenAI(options) {
     },
     body: JSON.stringify({
       model,
-      temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.45),
+      temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.28),
       max_tokens: Math.min(32768, Math.max(1600, options.targetCount * 520)),
       messages: [
         { role: "system", content: ALYS_RECORD_SYSTEM_PROMPT },
@@ -6473,7 +6889,10 @@ async function generateWithOpenAI(options) {
     provider: "openai",
     model,
     latencyMs: Date.now() - startedAt,
-    records: parseProviderRecords(content, options.baselineConfidence, options.datasetType)
+    records: groundProviderRecords(
+      parseProviderRecords(content, options.baselineConfidence, options.datasetType),
+      options.document
+    )
   };
 }
 async function generateWithOpenAIBatched(options) {
@@ -6520,7 +6939,7 @@ async function generateWithGroq(options) {
     },
     body: JSON.stringify({
       model,
-      temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.35),
+      temperature: Number(process.env.ALYS_GENERATOR_TEMPERATURE ?? 0.25),
       max_tokens: Math.min(8192, Math.max(1600, options.targetCount * 520)),
       messages: [
         { role: "system", content: ALYS_RECORD_SYSTEM_PROMPT },
@@ -6539,7 +6958,10 @@ async function generateWithGroq(options) {
     provider: "groq",
     model,
     latencyMs: Date.now() - startedAt,
-    records: parseProviderRecords(content, options.baselineConfidence, options.datasetType)
+    records: groundProviderRecords(
+      parseProviderRecords(content, options.baselineConfidence, options.datasetType),
+      options.document
+    )
   };
 }
 async function generateWithGroqBatched(options) {
@@ -6649,6 +7071,78 @@ function normalizeMetadata(value) {
     benchmark_notes: Array.isArray(obj.benchmark_notes) ? obj.benchmark_notes.map(String).slice(0, 12) : []
   };
 }
+function groundProviderRecords(records, document) { // Keeps only records whose tokens overlap document.text enough, and rescales the confidence of the records it keeps.
+  const sourceTokens = meaningfulTokens(document.text);
+  if (sourceTokens.size < 8) return []; // fewer than 8 meaningful source tokens: every record is rejected
+  const sourceIsFallback = /fallback reason:/i.test(document.text); // source text containing "fallback reason:" (any case) gets the lower threshold
+  const minimumGrounding = sourceIsFallback ? 0.025 : 0.04;
+  return records.flatMap((record) => { // returning [] drops the record, returning [x] keeps it
+    const recordTokens = meaningfulTokens(`${record.input} ${record.output} ${record.context}`);
+    if (!recordTokens.size) return []; // a record without meaningful tokens cannot be scored and is dropped
+    let overlap = 0;
+    for (const token of recordTokens) {
+      if (sourceTokens.has(token)) overlap += 1;
+    }
+    const groundingScore = overlap / Math.max(1, Math.min(recordTokens.size, sourceTokens.size)); // overlap as a share of the smaller token set; overlap cannot exceed either size, so the score stays within 0..1
+    if (groundingScore < minimumGrounding) return [];
+    const confidence = clamp012(record.confidence * (0.76 + Math.min(0.24, groundingScore * 2.4))); // multiplier is 0.76 plus up to 0.24, reaching 1.0 once groundingScore >= 0.1; clamp012 is defined elsewhere (presumably clamps to [0, 1])
+    return [{
+      ...record,
+      confidence,
+      metadata: {
+        ...record.metadata,
+        signals: Array.from(/* @__PURE__ */ new Set([ // Set removes duplicate entries
+          ...record.metadata.signals, // metadata.signals must be iterable or the spread throws
+          `source-grounding:${groundingScore.toFixed(3)}`,
+          `source-url:${document.url}`
+        ])).slice(0, 12), // cap is applied after appending, so the two entries added here are the first to be cut when the list is full
+        constraints: Array.from(/* @__PURE__ */ new Set([
+          ...record.metadata.constraints,
+          "accepted-after-source-grounding-check"
+        ])).slice(0, 12) // same cap: the marker is lost when 12 or more distinct constraints already exist
+      }
+    }];
+  });
+}
+function meaningfulTokens(value) { // Returns a Set of distinct lowercase tokens from value that are at least 4 characters long and not stopwords; value must be a string (toLowerCase is called on it directly).
+  const stopwords = /* @__PURE__ */ new Set([ // rebuilt on every call
+    "about",
+    "after",
+    "also",
+    "because",
+    "before",
+    "being",
+    "between",
+    "could",
+    "dataset",
+    "during",
+    "every",
+    "from",
+    "have",
+    "into",
+    "more",
+    "only",
+    "other",
+    "should",
+    "source",
+    "that",
+    "their",
+    "there",
+    "these",
+    "this",
+    "through",
+    "using",
+    "what",
+    "when",
+    "where",
+    "which",
+    "with",
+    "would"
+  ]);
+  return new Set(
+    value.toLowerCase().match(/[a-z0-9][a-z0-9._/-]{2,}/g)?.map((token) => token.replace(/^[._/-]+|[._/-]+$/g, "")).filter((token) => token.length >= 4 && !stopwords.has(token)).slice(0, 900) ?? [] // tokens start with [a-z0-9] and continue with [a-z0-9._/-]; edge punctuation is trimmed; the 900 cap applies before Set de-duplication; no match at all yields an empty Set
+  );
+}
 function safeJson(text) {
   const trimmed = text.trim();
   try {
@@ -6777,6 +7271,7 @@ var StructuringAgent = class {
       if (trustScore < (options.minTrustScore ?? 0.42) || relevanceScore < (options.minRelevanceScore ?? 0.24)) return [];
       const sourceWeight = sourceQualityWeight(document);
       const segment2 = options.generationPlan ? segmentForSource(options.generationPlan, `${document.title} ${document.text}`, index) : void 0;
+      const blueprint2 = options.generationPlan?.blueprint;
       const baselineConfidence = clamp013((finding?.confidence ?? 0.7) * 0.55 + trustScore * 0.22 + authorityScore * 0.12 + relevanceScore * 0.11 - duplicationRisk * 0.08);
       const baseId = import_node_crypto3.default.createHash("sha1").update(`${topic}:${document.url}:${datasetType}`).digest("hex").slice(0, 14);
       const providerTarget = useProvider ? weightedRecordTarget(recordsPerDocument, options.providerRecordsPerDocument ?? recordsPerDocument, sourceWeight) : 0;
@@ -6809,6 +7304,10 @@ var StructuringAgent = class {
               ...g.metadata,
               topic,
               kind: datasetType,
+              blueprint_id: blueprint2?.id,
+              blueprint_label: blueprint2?.label,
+              blueprint_version: blueprint2?.version,
+              blueprint_fields: blueprint2?.fields.map((field2) => field2.name),
               segment_id: segment2?.id,
               segment_label: segment2?.label,
               provider: providerResult.provider,
@@ -6936,6 +7435,11 @@ function toCsv(records) {
     "source",
     "source_url",
     "confidence",
+    "blueprint_id",
+    "segment_id",
+    "source_trust_score",
+    "source_authority_score",
+    "source_relevance_score",
     "tags",
     "metadata",
     "created_at"
@@ -6948,12 +7452,24 @@ function toCsv(records) {
     record.source,
     record.source_url,
     String(record.confidence),
+    metadataString(record, "blueprint_id"),
+    metadataString(record, "segment_id"),
+    metadataString(record, "source_trust_score"),
+    metadataString(record, "source_authority_score"),
+    metadataString(record, "source_relevance_score"),
     record.tags.join("|"),
     JSON.stringify(record.metadata),
     record.created_at
   ]);
   return [header, ...rows].map((row) => row.map(escapeCsv).join(",")).join("\n") + "\n";
 }
+function metadataString(record, key) { // Returns record.metadata[key] rendered as a string; toCsv uses it to fill the metadata-derived columns.
+  const value = record.metadata[key];
+  if (value === null || value === void 0) return ""; // missing value becomes an empty cell
+  if (typeof value === "string") return value;
+  if (typeof value === "number" || typeof value === "boolean") return String(value);
+  return JSON.stringify(value); // anything else (objects, arrays, ...) is JSON-encoded
+}
 function toMarkdown(records) {
   return records.map((record) => {
     const title = record.input || record.id;
@@ -7089,20 +7605,26 @@ function performanceConfig(mode) {
 }
 function gateSources(sources, mode) {
   const perf = performanceConfig(mode);
-  const accepted = sources.filter((source) => {
+  const concreteSources = sources.filter(isConcreteEvidenceSource); // sources rejected by isConcreteEvidenceSource are removed before any score is checked
+  const accepted = concreteSources.filter((source) => {
     const trust = source.trustScore ?? source.score;
     const relevance = source.relevanceScore ?? source.score;
     const duplicateRisk = source.duplicationRisk ?? 0;
     const authority = source.authorityScore ?? 0.5;
     return trust >= perf.minTrustScore && relevance >= perf.minRelevanceScore && duplicateRisk < 0.72 && (trust >= 0.52 || authority >= 0.72);
   });
-  const minimum = Math.min(sources.length, Math.max(3, Math.ceil(sources.length * perf.gateMinimumRatio)));
-  const fallback = accepted.length >= minimum ? accepted : sources.slice(0, minimum);
+  const minimum = Math.min(concreteSources.length, Math.max(3, Math.ceil(concreteSources.length * perf.gateMinimumRatio))); // the larger of 3 and the gateMinimumRatio share of the concrete pool, never more than the pool size
+  const fallback = accepted.length >= minimum ? accepted : accepted.length ? accepted : concreteSources.filter((source) => (source.authorityScore ?? 0) >= 0.78 && (source.relevanceScore ?? source.score) >= perf.minRelevanceScore).slice(0, minimum); // a non-empty accepted list is always used as-is; only when nothing passed are concrete sources with authorityScore >= 0.78 and sufficient relevance taken, capped at minimum
   return {
     sources: fallback,
     filtered: Math.max(0, sources.length - fallback.length)
   };
 }
+function isConcreteEvidenceSource(source) { // True when the source's provider/discoveredBy label does not contain "heuristic", or when the env override below is set.
+  if (process.env.ALYS_ALLOW_HEURISTIC_GENERATION === "true") return true; // override: only the exact string "true" admits every source
+  const provider = (source.provider || source.discoveredBy || "").toLowerCase(); // provider wins over discoveredBy; when both are missing the label is "" and the source passes
+  return provider !== "local-heuristic" && !provider.includes("heuristic"); // the includes() test already covers "local-heuristic"
+}
 function sourceDiversityScore(sources) {
   if (!sources.length) return 0;
   const domains = new Set(sources.map((source) => source.domain || domainFromUrl4(source.url)));
@@ -7129,7 +7651,7 @@ async function generateDataset(options) {
   const targetRows = Math.max(1, Math.floor(options.targetRows ?? 100));
   const datasetId = import_node_crypto4.default.createHash("sha1").update(`${options.topic}:${Date.now()}`).digest("hex").slice(0, 12);
   const workspace = await ensureAlysWorkspace(options.workspaceRoot);
-  const generationPlan = buildDatasetGenerationPlan(options.topic);
+  const generationPlan = buildDatasetGenerationPlan(options.topic, datasetType);
   const expandedQueries = planQueries(generationPlan).slice(0, perf.queryCap);
   const discoveryEnabledSeed = options.discoverySeed ?? 0;
   const verificationEnabled = options.verificationEnabled ?? true;
@@ -7154,6 +7676,9 @@ async function generateDataset(options) {
     message: `${gated.filtered} low-trust or low-relevance sources filtered`,
     metric: `${gated.sources.length} accepted`
   });
+  if (!gated.sources.length) {
+    throw new Error("ALYS_NO_TRUSTED_SOURCES");
+  }
   const extraction = new ExtractionAgent();
   event(options.onEvent, { stage: "extraction", agent: extraction.name, status: "running", message: "Extracting source text..." });
   const extracted = await extraction.run(gated.sources);
@@ -7222,6 +7747,9 @@ async function generateDataset(options) {
     }
   });
   event(options.onEvent, { stage: "structuring", agent: structuring.name, status: "success", message: `${structured.length} candidate records generated`, metric: `${targetRows} target` });
+  if (!structured.length) {
+    throw new Error("ALYS_NO_GROUNDED_RECORDS");
+  }
   const curator = new DatasetCuratorAgent();
   event(options.onEvent, { stage: "curation", agent: curator.name, status: "running", message: "Curating final dataset..." });
   const records = curator.run(structured, targetRows).map((record) => ({
@@ -7296,8 +7824,14 @@ async function generateDataset(options) {
 `));
   artifacts.push(await writeDatasetArtifact(workspace, datasetId, "metrics.json", `${JSON.stringify(qualityMetrics, null, 2)}
 `));
+  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "schema.json", `${JSON.stringify(datasetSchema(manifest), null, 2)}
+`));
+  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "data-dictionary.md", renderDataDictionary(manifest)));
   artifacts.push(await writeDatasetArtifact(workspace, datasetId, "source-graph.json", `${JSON.stringify(research.graph, null, 2)}
 `));
+  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.json", `${JSON.stringify(sources, null, 2)}
+`));
+  artifacts.push(await writeDatasetArtifact(workspace, datasetId, "sources.md", renderSourcesMarkdown(options.topic, sources)));
   artifacts.push(await writeDatasetArtifact(workspace, datasetId, "generation-plan.json", `${JSON.stringify(generationPlan, null, 2)}
 `));
   artifacts.push(await writeDatasetArtifact(workspace, datasetId, "benchmark-report.json", `${JSON.stringify(evaluation, null, 2)}
@@ -7306,6 +7840,101 @@ async function generateDataset(options) {
   event(options.onEvent, { stage: "export", agent: "ArtifactStorage", status: "success", message: `Dataset written to ${manifest.outputDir}`, metric: manifest.outputDir });
   return { manifest, records, artifacts };
 }
+function datasetSchema(manifest) { // Builds the schema object that generateDataset serializes to schema.json, based on manifest.generationPlan.blueprint.
+  const blueprint2 = manifest.generationPlan?.blueprint;
+  if (!blueprint2) { // no blueprint recorded: return a fixed six-field record schema (without a $schema key)
+    return {
+      title: "Alys Dataset Record",
+      type: "object",
+      properties: {
+        id: { type: "string" },
+        input: { type: "string" },
+        output: { type: "string" },
+        context: { type: "string" },
+        source_url: { type: "string" },
+        confidence: { type: "number" }
+      },
+      required: ["id", "input", "output", "context", "source_url", "confidence"]
+    };
+  }
+  return {
+    $schema: "https://json-schema.org/draft/2020-12/schema",
+    title: blueprint2.label,
+    description: blueprint2.description,
+    blueprintId: blueprint2.id,
+    blueprintVersion: blueprint2.version,
+    recommendedFormats: blueprint2.recommendedFormats,
+    type: "object",
+    additionalProperties: false, // properties not listed in the blueprint are disallowed by this schema
+    required: blueprint2.fields.filter((field2) => field2.required).map((field2) => field2.name), // names of the fields flagged required
+    properties: Object.fromEntries(
+      blueprint2.fields.map((field2) => [
+        field2.name,
+        {
+          type: field2.type, // copied through unchanged — NOTE(review): confirm blueprint types are valid JSON Schema type names
+          description: field2.description,
+          ...field2.example ? { examples: [field2.example] } : {} // the examples key is present only when field2.example is truthy
+        }
+      ])
+    )
+  };
+}
+function renderDataDictionary(manifest) { // Renders the Markdown that generateDataset writes to data-dictionary.md, based on manifest.generationPlan.blueprint.
+  const blueprint2 = manifest.generationPlan?.blueprint;
+  if (!blueprint2) { // no blueprint recorded: return a short placeholder document
+    return "# Data Dictionary\n\nNo dataset blueprint was recorded for this run.\n";
+  }
+  const lines = [
+    `# ${blueprint2.label} Data Dictionary`,
+    "",
+    blueprint2.description,
+    "",
+    `Blueprint: \`${blueprint2.id}@${blueprint2.version}\``,
+    `Recommended formats: ${blueprint2.recommendedFormats.map((format) => `\`${format}\``).join(", ")}`,
+    "",
+    "## Fields",
+    "",
+    "| Field | Type | Required | Description | Example |",
+    "| --- | --- | --- | --- | --- |",
+    ...blueprint2.fields.map( // one table row per blueprint field
+      (field2) => `| \`${field2.name}\` | ${field2.type} | ${field2.required ? "yes" : "no"} | ${field2.description.replace(/\|/g, "\\|")} | ${field2.example?.replace(/\|/g, "\\|") ?? ""} |`
+    ), // only "|" is escaped in description and example; example?.replace assumes example is a string whenever it is present
+    "",
+    "## Record Instructions",
+    "",
+    ...blueprint2.recordInstructions.map((item) => `- ${item}`),
+    "",
+    "## Quality Bar",
+    "",
+    ...blueprint2.qualityBar.map((item) => `- ${item}`),
+    ""
+  ];
+  return `${lines.join("\n")}
+`;
+}
+function renderSourcesMarkdown(topic, sources) { // Renders the Markdown that generateDataset writes to sources.md: a heading, an intro line, then one numbered entry per source.
+  const lines = [
+    `# Sources for ${topic}`,
+    "",
+    "Alys grounds generated records in the ranked source pool below. Higher trust scores influence confidence and record acceptance.",
+    ""
+  ];
+  for (const [index, source] of sources.entries()) {
+    const trust = typeof source.trustScore === "number" ? ` \xB7 trust ${Math.round(source.trustScore * 100)}%` : ""; // shown as a rounded percentage, and only when trustScore is a number; \xB7 is the middle-dot separator
+    const type = source.sourceType ? ` \xB7 ${source.sourceType}` : "";
+    const domain = source.domain ? ` \xB7 ${source.domain}` : "";
+    lines.push(`${index + 1}. [${source.title}](${source.url})${domain}${type}${trust}`); // title and url are interpolated as-is: no Markdown escaping, and a missing title renders as "undefined"
+    if (source.snippet) lines.push(`   ${source.snippet.replace(/\s+/g, " ").trim()}`); // whitespace runs are collapsed so the snippet stays on one line
+    if (source.qualitySignals?.length) lines.push(`   Signals: ${source.qualitySignals.slice(0, 6).join(", ")}`); // at most the first 6 signals are listed
+    lines.push("");
+  }
+  if (!sources.length) { // empty source list: emit a placeholder line instead of entries
+    lines.push("No sources were recorded for this run.");
+    lines.push("");
+  }
+  return `${lines.join("\n")}
+`;
+}
 function artifactFilename(format) {
   if (format === "markdown") return "dataset.md";
   if (format === "instruction") return "instruction.jsonl";
@@ -7450,6 +8079,13 @@ Limits:
   1 dataset = 1 generation
   max 5 datasets per run
   use --benchmark for local high-volume benchmark runs
+Concepts:
+  RAG chunks          retrieval-ready records (for search or knowledge-base apps)
+  Instruction tuning examples for fine-tuning (teaching model behavior)
+  JSONL              one JSON object per line (standard for ML pipelines)
+  CSV                spreadsheet-friendly rows (for review and analysis)
+  Verification       quality checks (confidence, repetition, schema validity)
 `);
 }
 function loadCliEnv(cwd = process.cwd()) {
@@ -7635,6 +8271,17 @@ function getEvaluation(dataset) {
   const evaluation = dataset.manifest.evaluation;
   return evaluation && typeof evaluation === "object" ? evaluation : {};
 }
+function getBlueprint(dataset) { // Returns dataset.manifest.generationPlan.blueprint when both the plan and the blueprint are non-null objects, otherwise {}.
+  const generationPlan = dataset.manifest.generationPlan;
+  if (!generationPlan || typeof generationPlan !== "object") return {}; // the truthiness test also rules out null, which typeof reports as "object"
+  const blueprint2 = generationPlan.blueprint;
+  return blueprint2 && typeof blueprint2 === "object" ? blueprint2 : {};
+}
+function getSourceManifest(dataset) { // Returns the usable entries of dataset.manifest.sourceManifest, or [] when that value is not an array.
+  const sources = dataset.manifest.sourceManifest;
+  if (!Array.isArray(sources)) return [];
+  return sources.filter((source) => Boolean(source) && typeof source === "object").filter((source) => typeof source.title === "string" || typeof source.url === "string"); // keeps non-null objects that carry a string title or a string url
+}
 function printStage(code, status, label, metric) {
   const tint = status === "DONE" || status === "OK" ? "green" : status === "WARN" ? "yellow" : "cyan";
   const prefix = `${paint(`[${code.padEnd(4).slice(0, 4)}]`, "gray")} ${paint(status.padEnd(4), tint)}`;
@@ -7827,12 +8474,28 @@ function printGenerationSummary(response, workspaceRoot) {
     const records = Number(metrics.recordsGenerated ?? summary.recordsAccepted ?? 0);
     const sources = Number(metrics.sourcesDiscovered ?? 0);
     const confidenceValue = Number(metrics.averageConfidence ?? summary.averageConfidence ?? 0);
+    const blueprint2 = getBlueprint(dataset);
     const outputDir = import_node_path4.default.join(root, "datasets", dataset.id);
     console.log(`${paint("\u2022", "yellow")} ${paint(dataset.id, "white")}  ${formatInt(records)} records  ${formatInt(sources)} sources  ${formatPercent(confidenceValue)} confidence`);
     console.log(`  ${truncate(dataset.topic, 110)}`);
+    if (blueprint2.label || blueprint2.id) {
+      console.log(`  blueprint ${paint(blueprint2.label ?? blueprint2.id ?? "dataset blueprint", "white")}${paint(blueprint2.id ? ` (${blueprint2.id})` : "", "gray")}`);
+    }
     console.log(`  ${paint(outputDir, "cyan")}`);
     console.log(`  quality ${formatPercent(Number(quality.citationCoverage ?? 0))} citations \xB7 ${formatPercent(Number(quality.recordUniqueness ?? 0))} unique \xB7 ${formatPercent(Number(quality.sourceDiversity ?? 0))} source diversity`);
     console.log(`  suitability RAG ${formatScore(Number(suitability.ragSuitability ?? 0))} \xB7 tuning ${formatScore(Number(suitability.instructionTuning ?? 0))} \xB7 usefulness ${formatScore(Number(suitability.humanUsefulness ?? 0))}`);
+    const topSources = getSourceManifest(dataset).slice(0, 5);
+    if (topSources.length) {
+      console.log(paint("  sources", "gray"));
+      for (const source of topSources) {
+        const label = source.title || source.domain || source.provider || "source";
+        const trust = Number(source.trustScore ?? source.authorityScore ?? source.relevanceScore ?? 0);
+        const type = source.sourceType ? ` ${source.sourceType}` : "";
+        const score = trust > 0 ? ` ${formatPercent(trust)} trust` : "";
+        console.log(`    - ${truncate(label, 76)}${paint(`${type}${score}`, "gray")}`);
+        if (source.url) console.log(`      ${paint(source.url, "cyan")}`);
+      }
+    }
     const preview = previewRecord(dataset);
     if (preview) {
       console.log(paint("  preview", "gray"));
@@ -8017,11 +8680,20 @@ async function handleGenerate(args, command) {
   const datasetType = parseDatasetType(values.type) ?? (values.yes === true ? "instruction" : void 0) ?? (await (0, import_prompts3.default)({
     type: "select",
     name: "datasetType",
-    message: "Dataset type?",
+    message: "Dataset type? (choose what the output should be used for)",
     choices: [
-      { title: "Instruction tuning", value: "instruction" },
-      { title: "RAG chunks", value: "rag" },
-      { title: "Question/Answer", value: "qa" }
+      {
+        title: "Instruction tuning (task + ideal answer examples)",
+        value: "instruction"
+      },
+      {
+        title: "RAG chunks (retrieval-ready context for search/knowledge bases)",
+        value: "rag"
+      },
+      {
+        title: "Question/Answer (direct QA pairs for evaluation or training)",
+        value: "qa"
+      }
     ]
   })).datasetType;
   const requestedDatasetCount = values.datasets ? Math.max(1, Math.floor(Number(values.datasets))) : values.yes === true ? 1 : (await (0, import_prompts3.default)({
@@ -8040,30 +8712,30 @@ async function handleGenerate(args, command) {
   const exportFormats = values.format ? parseFormats(values.format) : values.yes === true ? ["jsonl", "csv", "markdown"] : (await (0, import_prompts3.default)({
     type: "multiselect",
     name: "exportFormats",
-    message: "Output formats?",
+    message: "Output formats? (you can select multiple)",
     choices: [
-      { title: "JSONL", value: "jsonl", selected: true },
-      { title: "CSV", value: "csv", selected: true },
-      { title: "Markdown", value: "markdown" },
-      { title: "Instruction dataset", value: "instruction" },
-      { title: "RAG chunks", value: "rag" }
+      { title: "JSONL (one JSON record per line, best for ML pipelines)", value: "jsonl", selected: true },
+      { title: "CSV (spreadsheet-friendly review format)", value: "csv", selected: true },
+      { title: "Markdown (readable summary for humans)", value: "markdown" },
+      { title: "Instruction dataset (fine-tuning JSONL)", value: "instruction" },
+      { title: "RAG chunks (retrieval-ready JSONL)", value: "rag" }
     ],
     hint: "Use space to select multiple."
   })).exportFormats;
   const depth = parseDepth(values.depth) ?? (values.yes === true ? "medium" : void 0) ?? (await (0, import_prompts3.default)({
     type: "select",
     name: "depth",
-    message: "Research depth?",
+    message: "Research depth? (more depth can improve coverage but costs time)",
     choices: [
-      { title: "Shallow", value: "shallow" },
-      { title: "Medium", value: "medium" },
-      { title: "Deep", value: "deep" }
+      { title: "Shallow (fastest, smaller context)", value: "shallow" },
+      { title: "Medium (balanced default)", value: "medium" },
+      { title: "Deep (broader coverage, slower)", value: "deep" }
     ]
   })).depth;
   const sourceLimit = values.sources ? Math.min(maxSources, Math.max(1, Number(values.sources))) : values.yes === true ? benchmarkMode ? 48 : MAX_SOURCES_PER_RUN : (await (0, import_prompts3.default)({
     type: "number",
     name: "sourceLimit",
-    message: "How many sources?",
+    message: "How many sources? (more sources can improve coverage but may slow the run)",
     initial: benchmarkMode ? 48 : MAX_SOURCES_PER_RUN,
     min: 1,
     max: maxSources
@@ -8071,7 +8743,7 @@ async function handleGenerate(args, command) {
   const targetRows = values.rows ? Math.min(maxRows, Math.max(1, Number(values.rows))) : values.yes === true ? benchmarkMode ? 5e3 : MAX_ROWS_PER_DATASET : (await (0, import_prompts3.default)({
     type: "number",
     name: "targetRows",
-    message: "Rows per dataset?",
+    message: "Rows per dataset? (Alys aims for rows worth keeping, not raw volume)",
     initial: benchmarkMode ? 5e3 : MAX_ROWS_PER_DATASET,
     min: 1,
     max: maxRows
@@ -8086,7 +8758,7 @@ async function handleGenerate(args, command) {
   const verificationEnabled = values.verify === true ? true : values["no-verify"] === true ? false : values.yes === true ? performanceMode !== "fast" : (await (0, import_prompts3.default)({
     type: "toggle",
     name: "verificationEnabled",
-    message: "Enable verification checks?",
+    message: "Enable verification checks? (slower, stricter about weak/repetitive records)",
     initial: performanceMode !== "fast",
     active: "Yes",
     inactive: "No"

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "alys-akusa",
-  "version": "0.1.7",
+  "version": "0.1.8",
   "private": false,
   "description": "Alys local CLI runtime for autonomous dataset generation.",
   "license": "UNLICENSED",