npm - @tryformation/querylight-cli - Versions diffs - 0.1.1 → 0.2.1 - Mend

@tryformation/querylight-cli 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/README.md +64 -11
package/dist/chunk/chunker.d.ts +3 -1
package/dist/cli/main.js +1163 -285
package/dist/cli/run-cli.d.ts +4 -1
package/dist/core/concurrency.d.ts +1 -0
package/dist/core/constants.d.ts +3 -1
package/dist/core/gzip-json.d.ts +3 -0
package/dist/core/progress.d.ts +4 -0
package/dist/core/urls.d.ts +1 -0
package/dist/index/index-store.d.ts +3 -0
package/dist/index/querylight-indexer.d.ts +3 -1
package/dist/index.js +540 -141
package/dist/ingest/adapters/website-adapter.d.ts +6 -1
package/dist/ingest/adapters/website-feed-discovery.d.ts +6 -0
package/dist/ingest/extractors/html-extractor.d.ts +1 -0
package/dist/ingest/ingest-service.d.ts +5 -2
package/dist/types/models.d.ts +2 -2
package/dist/vector/dense.d.ts +3 -1
package/dist/vector/runtime.d.ts +2 -0
package/dist/vector/service.d.ts +20 -2
package/dist/vector/sparse.d.ts +3 -1
package/dist/vector/store.d.ts +8 -2
package/package.json +1 -1

package/dist/cli/main.js CHANGED

@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 // src/cli/run-cli.ts
-import { Command } from "commander";
+import { Command, Option } from "commander";
 import { stat as stat4 } from "fs/promises";
 import path21 from "path";
@@ -14,6 +14,17 @@ import path4 from "path";
 import { readFile, writeFile } from "fs/promises";
 import path from "path";
 import YAML from "yaml";
+// src/core/constants.ts
+var PACKAGE_VERSION = "0.2.1";
+var DEFAULT_WORKSPACE = ".kb";
+var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
+var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
+// src/core/config.ts
+function normalizeModelCacheDir(configuredPath) {
+  return configuredPath === LEGACY_WORKSPACE_MODEL_CACHE_DIR ? DEFAULT_SHARED_MODEL_CACHE_DIR : configuredPath;
+}
 var defaultConfig = () => ({
   workspaceVersion: 1,
   index: {
@@ -41,17 +52,17 @@ var defaultConfig = () => ({
   retrieval: {
     defaultMode: "lexical",
     dense: {
-      enabled: false,
+      enabled: true,
       modelId: "Xenova/all-MiniLM-L6-v2",
-      cacheDir: ".kb/models/huggingface",
+      cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
       indexHashTables: 8,
       indexRandomSeed: 42,
       chunkTextMode: "title-heading-text"
     },
     sparse: {
-      enabled: false,
+      enabled: true,
       modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
-      cacheDir: ".kb/models/huggingface",
+      cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
       documentTopTokens: 128,
       queryEncoding: "tokenizer-token-weights",
       documentEncoding: "masked-lm-max-log1p-relu",
@@ -62,6 +73,7 @@ var defaultConfig = () => ({
     defaultUserAgent: "querylight-cli/0.1",
     obeyRobotsTxt: true,
     rateLimitMs: 1e3,
+    maxConcurrentRequests: 5,
     renderJs: false,
     retentionDays: 365,
     fetchArticles: true
@@ -112,11 +124,13 @@ async function loadConfig(workspacePath, configPath) {
       ...parsed.retrieval ?? {},
       dense: {
         ...defaults.retrieval.dense,
-        ...parsed.retrieval?.dense ?? {}
+        ...parsed.retrieval?.dense ?? {},
+        cacheDir: normalizeModelCacheDir(parsed.retrieval?.dense?.cacheDir ?? defaults.retrieval.dense.cacheDir)
       },
       sparse: {
         ...defaults.retrieval.sparse,
-        ...parsed.retrieval?.sparse ?? {}
+        ...parsed.retrieval?.sparse ?? {},
+        cacheDir: normalizeModelCacheDir(parsed.retrieval?.sparse?.cacheDir ?? defaults.retrieval.sparse.cacheDir)
       }
     },
     crawler: {
@@ -162,6 +176,14 @@ async function writeJsonl(filePath, records) {
 ` : "", "utf8");
 }
+// src/core/progress.ts
+function reportProgress(progress, message) {
+  progress?.("info", message);
+}
+function reportProgressDetail(progress, message) {
+  progress?.("detail", message);
+}
 // src/chunk/chunk-store.ts
 import path3 from "path";
 function chunksFile(workspacePath) {
@@ -269,11 +291,13 @@ function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__
 async function chunkDocuments({
   workspacePath,
   sourceId,
-  documentId
+  documentId,
+  progress
 }) {
   const config = await loadConfig(workspacePath);
   const documents = await readJsonl(path4.join(workspacePath, "documents", "documents.jsonl"));
   const filtered = documents.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId));
+  reportProgress(progress, `Chunking ${filtered.length} document${filtered.length === 1 ? "" : "s"}`);
   const targetedDocumentIds = new Set(filtered.map((document) => document.id));
   const existingChunks = await loadChunks(workspacePath);
   const prior = new Map(existingChunks.map((chunk) => [chunk.id, chunk]));
@@ -281,19 +305,17 @@ async function chunkDocuments({
     existingChunks.filter((chunk) => !targetedDocumentIds.has(chunk.documentId)).map((chunk) => [chunk.id, chunk])
   );
   for (const document of filtered) {
+    reportProgressDetail(progress, `Chunking ${document.id} (${document.title})`);
     const raw = await readFile3(document.normalizedPath, "utf8");
     for (const chunk of buildChunksForDocument(document, raw, config, prior)) {
       nextChunks.set(chunk.id, chunk);
     }
   }
   await saveChunks(workspacePath, [...nextChunks.values()]);
+  reportProgress(progress, `Chunking complete: ${nextChunks.size} chunk${nextChunks.size === 1 ? "" : "s"} written`);
   return { chunksWritten: nextChunks.size };
 }
-// src/core/constants.ts
-var PACKAGE_VERSION = "0.1.0";
-var DEFAULT_WORKSPACE = ".kb";
 // src/core/errors.ts
 var CliError = class extends Error {
   constructor(message, code, exitCode, details) {
@@ -319,8 +341,6 @@ var DIRS = [
   "normalized",
   "indexes",
   "vectors",
-  "models",
-  "models/huggingface",
   "runs",
   "logs"
 ];
@@ -358,11 +378,12 @@ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, Ranking
 import path11 from "path";
 // src/vector/dense.ts
-import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
+import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
 import { mkdir as mkdir4 } from "fs/promises";
 import path8 from "path";
 // src/vector/runtime.ts
+import os from "os";
 import path6 from "path";
 import { fileURLToPath } from "url";
 import { execFile, execFileSync } from "child_process";
@@ -379,7 +400,22 @@ async function fileExists(filePath) {
 }
 // src/vector/runtime.ts
+function resolveQliHomeDir() {
+  return path6.resolve(process.env.QLI_HOME ?? path6.join(os.homedir(), ".qli"));
+}
 function resolveCacheDir(workspacePath, configuredPath) {
+  if (configuredPath === "~/.qli") {
+    return resolveQliHomeDir();
+  }
+  if (configuredPath.startsWith("~/.qli/")) {
+    return path6.join(resolveQliHomeDir(), configuredPath.slice("~/.qli/".length));
+  }
+  if (configuredPath === "~") {
+    return os.homedir();
+  }
+  if (configuredPath.startsWith("~/")) {
+    return path6.join(os.homedir(), configuredPath.slice(2));
+  }
   return path6.isAbsolute(configuredPath) ? configuredPath : path6.resolve(workspacePath, configuredPath.replace(/^\.kb\//, ""));
 }
 function packageRootFromImportMeta(importMetaUrl) {
@@ -403,6 +439,14 @@ async function ensureUvAvailable() {
     execFile("uv", ["--version"], (error) => error ? reject(error) : resolve2());
   });
 }
+async function isUvAvailable() {
+  try {
+    await ensureUvAvailable();
+    return true;
+  } catch {
+    return false;
+  }
+}
 async function runSparsePython({
   workspacePath,
   config,
@@ -446,55 +490,114 @@ async function getDenseTransformersRuntime(cacheDir) {
 }
 // src/vector/store.ts
-import { mkdir as mkdir3, readFile as readFile4, writeFile as writeFile3 } from "fs/promises";
+import { mkdir as mkdir3, rm, writeFile as writeFile4 } from "fs/promises";
 import path7 from "path";
+// src/core/gzip-json.ts
+import { readFile as readFile4, writeFile as writeFile3 } from "fs/promises";
+import { promisify } from "util";
+import { gunzip, gzip } from "zlib";
+var gzipAsync = promisify(gzip);
+var gunzipAsync = promisify(gunzip);
+async function writeGzipJson(filePath, value) {
+  const payload = JSON.stringify(value, null, 2);
+  await writeFile3(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
+}
+async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
+  if (await fileExists(gzipPath)) {
+    const payload = await readFile4(gzipPath);
+    return JSON.parse((await gunzipAsync(payload)).toString("utf8"));
+  }
+  if (legacyPath && await fileExists(legacyPath)) {
+    return JSON.parse(await readFile4(legacyPath, "utf8"));
+  }
+  return JSON.parse(await readFile4(gzipPath, "utf8"));
+}
+async function resolveExistingGzipOrFilePath(gzipPath, legacyPath) {
+  if (await fileExists(gzipPath)) {
+    return gzipPath;
+  }
+  if (legacyPath && await fileExists(legacyPath)) {
+    return legacyPath;
+  }
+  return gzipPath;
+}
+// src/vector/store.ts
 function vectorsDir(workspacePath) {
   return path7.join(workspacePath, "vectors");
 }
-function modelsDir(workspacePath) {
-  return path7.join(workspacePath, "models");
+function sharedModelStateDir() {
+  return path7.join(resolveQliHomeDir(), "models", "status");
 }
 function denseVectorPath(workspacePath) {
-  return path7.join(vectorsDir(workspacePath), "dense.latest.json");
+  return path7.join(vectorsDir(workspacePath), "dense.latest.json.gz");
 }
 function denseMetaPath(workspacePath) {
-  return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json");
+  return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json.gz");
 }
 function sparseVectorPath(workspacePath) {
-  return path7.join(vectorsDir(workspacePath), "sparse.latest.json");
+  return path7.join(vectorsDir(workspacePath), "sparse.latest.json.gz");
 }
 function sparseMetaPath(workspacePath) {
+  return path7.join(vectorsDir(workspacePath), "sparse.latest.meta.json.gz");
+}
+function legacyDenseVectorPath(workspacePath) {
+  return path7.join(vectorsDir(workspacePath), "dense.latest.json");
+}
+function legacyDenseMetaPath(workspacePath) {
+  return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json");
+}
+function legacySparseVectorPath(workspacePath) {
+  return path7.join(vectorsDir(workspacePath), "sparse.latest.json");
+}
+function legacySparseMetaPath(workspacePath) {
   return path7.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
 }
-function densePullMarker(workspacePath) {
-  return path7.join(modelsDir(workspacePath), "dense.pulled.json");
+function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
+  const resolvedCacheDir = resolveCacheDir(workspacePath, cacheDir);
+  const cacheKey = sha256(resolvedCacheDir).slice(0, 16);
+  return path7.join(sharedModelStateDir(), type, `${encodeURIComponent(modelId)}.${cacheKey}.json`);
+}
+function densePullMarker(workspacePath, modelId, cacheDir) {
+  return pullMarkerPath("dense", workspacePath, modelId, cacheDir);
 }
-function sparsePullMarker(workspacePath) {
-  return path7.join(modelsDir(workspacePath), "sparse.pulled.json");
+function sparsePullMarker(workspacePath, modelId, cacheDir) {
+  return pullMarkerPath("sparse", workspacePath, modelId, cacheDir);
 }
 async function writeDensePayload(workspacePath, payload) {
   await mkdir3(vectorsDir(workspacePath), { recursive: true });
-  await writeFile3(denseVectorPath(workspacePath), JSON.stringify(payload, null, 2), "utf8");
-  await writeFile3(denseMetaPath(workspacePath), JSON.stringify(payload.metadata, null, 2), "utf8");
+  await writeGzipJson(denseVectorPath(workspacePath), payload);
+  await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
+  await Promise.all([
+    rm(legacyDenseVectorPath(workspacePath), { force: true }),
+    rm(legacyDenseMetaPath(workspacePath), { force: true })
+  ]);
 }
 async function readDensePayload(workspacePath) {
-  return JSON.parse(await readFile4(denseVectorPath(workspacePath), "utf8"));
+  return readJsonFromGzipOrFile(denseVectorPath(workspacePath), legacyDenseVectorPath(workspacePath));
 }
 async function writeSparsePayload(workspacePath, payload) {
   await mkdir3(vectorsDir(workspacePath), { recursive: true });
-  await writeFile3(sparseVectorPath(workspacePath), JSON.stringify(payload, null, 2), "utf8");
-  await writeFile3(sparseMetaPath(workspacePath), JSON.stringify(payload.metadata, null, 2), "utf8");
+  await writeGzipJson(sparseVectorPath(workspacePath), payload);
+  await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
+  await Promise.all([
+    rm(legacySparseVectorPath(workspacePath), { force: true }),
+    rm(legacySparseMetaPath(workspacePath), { force: true })
+  ]);
 }
 async function readSparsePayload(workspacePath) {
-  return JSON.parse(await readFile4(sparseVectorPath(workspacePath), "utf8"));
+  return readJsonFromGzipOrFile(sparseVectorPath(workspacePath), legacySparseVectorPath(workspacePath));
 }
-async function writeDensePullMarker(workspacePath, value) {
-  await mkdir3(modelsDir(workspacePath), { recursive: true });
-  await writeFile3(densePullMarker(workspacePath), JSON.stringify(value, null, 2), "utf8");
+async function writeDensePullMarker(workspacePath, model, value) {
+  const markerPath = densePullMarker(workspacePath, model.modelId, model.cacheDir);
+  await mkdir3(path7.dirname(markerPath), { recursive: true });
+  await writeFile4(markerPath, JSON.stringify(value, null, 2), "utf8");
 }
-async function writeSparsePullMarker(workspacePath, value) {
-  await mkdir3(modelsDir(workspacePath), { recursive: true });
-  await writeFile3(sparsePullMarker(workspacePath), JSON.stringify(value, null, 2), "utf8");
+async function writeSparsePullMarker(workspacePath, model, value) {
+  const markerPath = sparsePullMarker(workspacePath, model.modelId, model.cacheDir);
+  await mkdir3(path7.dirname(markerPath), { recursive: true });
+  await writeFile4(markerPath, JSON.stringify(value, null, 2), "utf8");
 }
 async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
   const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
@@ -504,30 +607,72 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
       configured: dense.enabled,
       modelId: dense.modelId,
       cacheDir: denseCacheDir,
-      available: await fileExists(densePullMarker(workspacePath)),
-      artifactExists: await fileExists(denseVectorPath(workspacePath))
+      available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
+      artifactExists: await fileExists(denseVectorPath(workspacePath)) || await fileExists(legacyDenseVectorPath(workspacePath))
     },
     sparse: {
       configured: sparse.enabled,
       modelId: sparse.modelId,
       cacheDir: sparseCacheDir,
       uvAvailable,
-      available: await fileExists(sparsePullMarker(workspacePath)),
-      artifactExists: await fileExists(sparseVectorPath(workspacePath))
+      available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
+      artifactExists: await fileExists(sparseVectorPath(workspacePath)) || await fileExists(legacySparseVectorPath(workspacePath))
     }
   };
 }
 // src/vector/text.ts
+var LOW_SIGNAL_HEADINGS = /* @__PURE__ */ new Set([
+  "choose this instead of",
+  "how xyz runs it",
+  "naechste schritte",
+  "next steps",
+  "overview",
+  "passend wenn",
+  "problem",
+  "right fit",
+  "waehlen sie das stattdessen",
+  "was sie bekommen",
+  "what you get",
+  "wie xyz es umsetzt",
+  "uberblick",
+  "\xFCberblick"
+]);
+function normalizeHeading(value) {
+  return value.trim().toLowerCase();
+}
+function isLowSignalHeading(value) {
+  return LOW_SIGNAL_HEADINGS.has(normalizeHeading(value));
+}
+function stripLeadingHeading(text, heading) {
+  const lines = text.split("\n");
+  const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
+  if (firstContentIndex < 0) {
+    return text;
+  }
+  const match = /^(#{1,6})\s+(.+)$/.exec(lines[firstContentIndex] ?? "");
+  if (!match?.[2] || normalizeHeading(match[2]) !== normalizeHeading(heading)) {
+    return text;
+  }
+  const next = [...lines.slice(0, firstContentIndex), ...lines.slice(firstContentIndex + 1)].join("\n").trim();
+  return next;
+}
+function createVectorText(chunk) {
+  const meaningfulHeadings = chunk.headingPath.filter((heading) => !isLowSignalHeading(heading) && normalizeHeading(heading) !== normalizeHeading(chunk.title));
+  const textHeading = [...chunk.headingPath].reverse().find((heading) => isLowSignalHeading(heading) || normalizeHeading(heading) === normalizeHeading(chunk.title));
+  const body = textHeading ? stripLeadingHeading(chunk.text, textHeading) : chunk.text.trim();
+  return [chunk.title, ...meaningfulHeadings, body].filter(Boolean).join("\n\n");
+}
 function createDenseChunkText(chunk) {
-  return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
+  return createVectorText(chunk);
 }
 function createSparseChunkText(chunk) {
-  return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
+  return createVectorText(chunk);
 }
 // src/vector/dense.ts
 var denseEmbedderFactory = null;
+var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
 async function createEmbedder(cacheDir, modelId) {
   if (denseEmbedderFactory) {
     return denseEmbedderFactory(cacheDir, modelId);
@@ -539,6 +684,9 @@ async function createEmbedder(cacheDir, modelId) {
     return output.tolist()[0];
   };
 }
+function exactDenseQuery(payload, vector, topK) {
+  return payload.chunks.map((chunk) => [chunk.chunkId, cosineSimilarity(vector, chunk.embedding)]).sort((left, right) => right[1] - left[1]).slice(0, topK);
+}
 async function pullDenseModel(workspacePath, config) {
   const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
   await mkdir4(cacheDir, { recursive: true });
@@ -547,7 +695,8 @@ async function pullDenseModel(workspacePath, config) {
 }
 async function buildDenseVectors({
   workspacePath,
-  config
+  config,
+  progress
 }) {
   const chunks = await readJsonl(path8.join(workspacePath, "chunks", "chunks.jsonl"));
   const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
@@ -555,6 +704,7 @@ async function buildDenseVectors({
   const embed = await createEmbedder(cacheDir, config.modelId);
   const records = [];
   let dimensions = 0;
+  reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
   for (const chunk of chunks) {
     const embedding = await embed(createDenseChunkText(chunk));
     dimensions ||= embedding.length;
@@ -568,7 +718,11 @@ async function buildDenseVectors({
       text: chunk.text,
       embedding
     });
+    if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
+      reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
+    }
   }
+  reportProgress(progress, "Building dense vector index");
   const index = new VectorFieldIndex({
     numHashTables: config.indexHashTables,
     dimensions,
@@ -592,6 +746,7 @@ async function buildDenseVectors({
     chunks: records
   };
   await writeDensePayload(workspacePath, payload);
+  reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
   return payload;
 }
 async function denseQuery({
@@ -604,12 +759,19 @@ async function denseQuery({
   const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
   const embed = await createEmbedder(cacheDir, config.modelId);
   const vector = await embed(query);
+  if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
+    return exactDenseQuery(payload, vector, topK);
+  }
   const index = new VectorFieldIndex({
     numHashTables: payload.metadata.hashTables,
     dimensions: payload.metadata.dimensions,
     random: createSeededRandom(payload.metadata.randomSeed)
   }).loadState(payload.indexState);
-  return index.query(vector, topK);
+  const approximateHits = index.query(vector, topK);
+  if (approximateHits.length >= topK) {
+    return approximateHits;
+  }
+  return exactDenseQuery(payload, vector, topK);
 }
 // src/vector/sparse.ts
@@ -717,10 +879,13 @@ async function buildSparseDocuments(workspacePath, config, chunks) {
 }
 async function buildSparseVectors({
   workspacePath,
-  config
+  config,
+  progress
 }) {
   const chunks = await readJsonl(path9.join(workspacePath, "chunks", "chunks.jsonl"));
+  reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
   const built = await buildSparseDocuments(workspacePath, config, chunks);
+  reportProgress(progress, "Building sparse vector index");
   const index = new SparseVectorFieldIndex();
   for (const record of built.chunks) {
     index.insert(record.chunkId, [record.vector]);
@@ -742,6 +907,7 @@ async function buildSparseVectors({
     queryTokenWeights: built.queryTokenWeights
   };
   await writeSparsePayload(workspacePath, payload);
+  reportProgress(progress, `Sparse vectors written for ${built.chunks.length} chunk${built.chunks.length === 1 ? "" : "s"}`);
   return payload;
 }
 async function sparseQuery({
@@ -759,6 +925,7 @@ async function sparseQuery({
 }
 // src/vector/service.ts
+var pullModelsOverrideForTests = null;
 function resolveModelPullPlan({
   pullDenseFlag,
   pullSparseFlag,
@@ -775,90 +942,136 @@ function resolveModelPullPlan({
     pullSparse: uvAvailable
   };
 }
+function resolveMissingConfiguredModelPullPlan({
+  config,
+  status
+}) {
+  return {
+    pullDense: config.retrieval.dense.enabled && !status.dense.available,
+    pullSparse: config.retrieval.sparse.enabled && status.sparse.uvAvailable && !status.sparse.available
+  };
+}
 async function buildVectorArtifacts({
   workspacePath,
   config,
   denseOverride,
   sparseOverride,
-  buildAvailableModels = false
+  buildAvailableModels = false,
+  progress
 }) {
-  const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, await (async () => {
-    try {
-      await ensureUvAvailable();
-      return true;
-    } catch {
-      return false;
-    }
-  })()) : null;
+  const uvAvailable = await isUvAvailable();
+  const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable) : null;
   const denseEnabled = denseOverride ?? (buildAvailableModels ? config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available) : config.retrieval.dense.enabled);
-  const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled);
-  const result2 = {};
+  const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled && uvAvailable);
+  const result = {};
   if (denseEnabled) {
-    result2.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense });
+    reportProgress(progress, `Building dense vectors with ${config.retrieval.dense.modelId}`);
+    result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense, progress });
+  }
+  if ((sparseOverride || config.retrieval.sparse.enabled) && !uvAvailable) {
+    reportProgress(progress, "Skipping sparse vectors because uv is not available");
   }
   if (sparseEnabled) {
-    result2.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse });
+    reportProgress(progress, `Building sparse vectors with ${config.retrieval.sparse.modelId}`);
+    result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse, progress });
   }
-  return result2;
+  return result;
 }
 async function pullModels({
   workspacePath,
   config,
   pullDense,
-  pullSparse
+  pullSparse,
+  progress
 }) {
+  if (pullModelsOverrideForTests) {
+    await pullModelsOverrideForTests({ workspacePath, config, pullDense, pullSparse, progress });
+    return;
+  }
   if (pullDense) {
+    reportProgress(progress, `Pulling dense model ${config.retrieval.dense.modelId}`);
     await pullDenseModel(workspacePath, config.retrieval.dense);
-    await writeDensePullMarker(workspacePath, {
+    await writeDensePullMarker(workspacePath, config.retrieval.dense, {
       pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
-      modelId: config.retrieval.dense.modelId
+      modelId: config.retrieval.dense.modelId,
+      cacheDir: config.retrieval.dense.cacheDir
     });
+    reportProgress(progress, `Dense model ready: ${config.retrieval.dense.modelId}`);
   }
   if (pullSparse) {
+    reportProgress(progress, `Pulling sparse model ${config.retrieval.sparse.modelId}`);
     await pullSparseModel(workspacePath, config.retrieval.sparse);
-    await writeSparsePullMarker(workspacePath, {
+    await writeSparsePullMarker(workspacePath, config.retrieval.sparse, {
       pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
-      modelId: config.retrieval.sparse.modelId
+      modelId: config.retrieval.sparse.modelId,
+      cacheDir: config.retrieval.sparse.cacheDir
     });
+    reportProgress(progress, `Sparse model ready: ${config.retrieval.sparse.modelId}`);
   }
 }
 async function getModelStatus(workspacePath, config) {
-  let uvAvailable = false;
-  try {
-    await ensureUvAvailable();
-    uvAvailable = true;
-  } catch {
-    uvAvailable = false;
-  }
+  const uvAvailable = await isUvAvailable();
   return buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable);
 }
 // src/index/index-store.ts
-import { readFile as readFile5, writeFile as writeFile4 } from "fs/promises";
+import { mkdir as mkdir6, rm as rm2 } from "fs/promises";
 import path10 from "path";
+function versionedIndexPath(workspacePath, stamp) {
+  return path10.join(workspacePath, "indexes", `${stamp}.json.gz`);
+}
+function versionedLegacyIndexPath(workspacePath, stamp) {
+  return path10.join(workspacePath, "indexes", `${stamp}.json`);
+}
+function versionedMetaPath(workspacePath, stamp) {
+  return path10.join(workspacePath, "indexes", `${stamp}.meta.json.gz`);
+}
+function versionedLegacyMetaPath(workspacePath, stamp) {
+  return path10.join(workspacePath, "indexes", `${stamp}.meta.json`);
+}
+function latestIndexPath(workspacePath) {
+  return path10.join(workspacePath, "indexes", "latest.json.gz");
+}
+function legacyLatestIndexPath(workspacePath) {
+  return path10.join(workspacePath, "indexes", "latest.json");
+}
+function latestMetaPath(workspacePath) {
+  return path10.join(workspacePath, "indexes", "latest.meta.json.gz");
+}
+function legacyLatestMetaPath(workspacePath) {
+  return path10.join(workspacePath, "indexes", "latest.meta.json");
+}
 async function writeIndexArtifacts({
   workspacePath,
   indexState,
   metadata
 }) {
   const stamp = metadata.createdAt.replace(/[:.]/g, "-");
-  const indexPath = path10.join(workspacePath, "indexes", `${stamp}.json`);
-  const metaPath = path10.join(workspacePath, "indexes", `${stamp}.meta.json`);
-  const latestIndexPath = path10.join(workspacePath, "indexes", "latest.json");
-  const latestMetaPath = path10.join(workspacePath, "indexes", "latest.meta.json");
-  const indexPayload = JSON.stringify(indexState, null, 2);
-  const metaPayload = JSON.stringify(metadata, null, 2);
-  await writeFile4(indexPath, indexPayload, "utf8");
-  await writeFile4(metaPath, metaPayload, "utf8");
-  await writeFile4(latestIndexPath, indexPayload, "utf8");
-  await writeFile4(latestMetaPath, metaPayload, "utf8");
-  return { indexPath: latestIndexPath, metadataPath: latestMetaPath };
+  const indexPath = versionedIndexPath(workspacePath, stamp);
+  const metaPath = versionedMetaPath(workspacePath, stamp);
+  const latestIndexArtifactPath = latestIndexPath(workspacePath);
+  const latestMetadataArtifactPath = latestMetaPath(workspacePath);
+  await mkdir6(path10.join(workspacePath, "indexes"), { recursive: true });
+  await writeGzipJson(indexPath, indexState);
+  await writeGzipJson(metaPath, metadata);
+  await writeGzipJson(latestIndexArtifactPath, indexState);
+  await writeGzipJson(latestMetadataArtifactPath, metadata);
+  await Promise.all([
+    rm2(legacyLatestIndexPath(workspacePath), { force: true }),
+    rm2(legacyLatestMetaPath(workspacePath), { force: true }),
+    rm2(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
+    rm2(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
+  ]);
+  return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
 }
 async function readLatestIndexState(workspacePath) {
-  return JSON.parse(await readFile5(path10.join(workspacePath, "indexes", "latest.json"), "utf8"));
+  return readJsonFromGzipOrFile(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
 }
 async function readLatestIndexMetadata(workspacePath) {
-  return JSON.parse(await readFile5(path10.join(workspacePath, "indexes", "latest.meta.json"), "utf8"));
+  return readJsonFromGzipOrFile(latestMetaPath(workspacePath), legacyLatestMetaPath(workspacePath));
+}
+async function resolveLatestIndexArtifactPath(workspacePath) {
+  return resolveExistingGzipOrFilePath(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
 }
 // src/index/querylight-indexer.ts
@@ -900,14 +1113,17 @@ async function buildIndex({
   workspacePath,
   denseOverride,
   sparseOverride,
-  buildAvailableModels = false
+  buildAvailableModels = false,
+  progress
 }) {
   const config = await loadConfig(workspacePath);
+  reportProgress(progress, "Loading documents, chunks, and sources");
   const chunks = await readJsonl(path11.join(workspacePath, "chunks", "chunks.jsonl"));
   const documents = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
   const sources = await readJsonl(path11.join(workspacePath, "sources", "sources.jsonl"));
   const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
   const index = new DocumentIndex(createIndexMapping(metadataFields));
+  reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
   for (const chunk of chunks) {
     index.index({
       id: chunk.id,
@@ -922,6 +1138,7 @@ async function buildIndex({
       }
     });
   }
+  reportProgressDetail(progress, `Indexed ${documents.length} document${documents.length === 1 ? "" : "s"} across ${sources.length} source${sources.length === 1 ? "" : "s"}`);
   const createdAt = (/* @__PURE__ */ new Date()).toISOString();
   const metadata = {
     id: `index_${createdAt.replace(/[:.]/g, "-")}`,
@@ -934,14 +1151,17 @@ async function buildIndex({
     fields: Object.keys(index.mapping),
     indexHash: sha256(JSON.stringify(index.indexState))
   };
+  reportProgress(progress, "Writing lexical index artifacts");
   const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
   const vectors = await buildVectorArtifacts({
     workspacePath,
     config,
     denseOverride,
     sparseOverride,
-    buildAvailableModels
+    buildAvailableModels,
+    progress
   });
+  reportProgress(progress, `Index build complete: dense=${Boolean(vectors.dense)}, sparse=${Boolean(vectors.sparse)}`);
   return {
     metadata,
     indexPath: artifacts.indexPath,
@@ -953,6 +1173,27 @@ async function buildIndex({
 // src/ingest/ingest-service.ts
 import path17 from "path";
+// src/core/concurrency.ts
+/**
+ * Runs `worker` over every item with at most `limit` calls in flight.
+ *
+ * Results are not collected: the returned promise resolves to `undefined`
+ * once every item has been processed, and rejects if any worker rejects.
+ *
+ * @param {Array} items - Values to process; an empty array resolves immediately.
+ * @param {number} limit - Desired parallelism. Fractions are floored, values
+ *   below 1 are raised to 1, and a missing or non-numeric limit falls back to 1.
+ * @param {(item: any, index: number) => Promise<void>} worker - Called once per item.
+ * @returns {Promise<void>}
+ */
+async function mapWithConcurrency(items, limit, worker) {
+  if (items.length === 0) {
+    return;
+  }
+  // Math.max(1, NaN) is NaN, which Array.from treats as length 0 and would
+  // silently skip every item, so a non-numeric limit is coerced to 1 first.
+  const requested = Math.floor(limit);
+  const concurrency = Number.isNaN(requested) ? 1 : Math.max(1, requested);
+  let nextIndex = 0;
+  const runLane = async () => {
+    // Claiming an index is synchronous, so two lanes never pick the same item.
+    while (nextIndex < items.length) {
+      const index = nextIndex;
+      nextIndex += 1;
+      await worker(items[index], index);
+    }
+  };
+  await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, () => runLane()));
+}
 // src/core/runs.ts
 import path12 from "path";
 async function writeRun(workspacePath, run) {
@@ -1021,7 +1262,7 @@ async function removeSource(workspacePath, sourceId) {
 }
 // src/ingest/document-utils.ts
-import { mkdir as mkdir6, rm, writeFile as writeFile5 } from "fs/promises";
+import { mkdir as mkdir7, rm as rm3, writeFile as writeFile5 } from "fs/promises";
 import path14 from "path";
 // src/normalize/normalize-markdown.ts
@@ -1074,7 +1315,7 @@ async function writeNormalizedDocument({
   normalizedPath,
   markdown
 }) {
-  await mkdir6(path14.dirname(normalizedPath), { recursive: true });
+  await mkdir7(path14.dirname(normalizedPath), { recursive: true });
   await writeFile5(
     normalizedPath,
     withFrontmatter(
@@ -1097,8 +1338,8 @@ async function writeNormalizedDocument({
 }
 async function deleteDocumentArtifacts(document) {
   await Promise.all([
-    document.rawPath ? rm(document.rawPath, { force: true }) : Promise.resolve(),
-    rm(document.normalizedPath, { force: true })
+    document.rawPath ? rm3(document.rawPath, { force: true }) : Promise.resolve(),
+    rm3(document.normalizedPath, { force: true })
   ]);
 }
@@ -1122,13 +1363,13 @@ async function listDirectoryFiles(source) {
 // src/ingest/adapters/file-adapter.ts
 import { basename, extname, resolve } from "path";
-import { mkdir as mkdir7, readFile as readFile9, stat as stat3, writeFile as writeFile6 } from "fs/promises";
+import { mkdir as mkdir8, readFile as readFile8, stat as stat3, writeFile as writeFile6 } from "fs/promises";
 // src/ingest/extractors/docx-extractor.ts
 import mammoth from "mammoth";
 async function extractDocx(filePath) {
-  const result2 = await mammoth.extractRawText({ path: filePath });
-  return result2.value;
+  const result = await mammoth.extractRawText({ path: filePath });
+  return result.value;
 }
 // src/ingest/extractors/html-extractor.ts
@@ -1142,9 +1383,41 @@ function stripBoilerplate(html) {
 // src/ingest/extractors/html-extractor.ts
 var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
+// Comma-joined selector list of elements that pruneLowSignalContent removes
+// from the parsed page: non-content tags plus two data-attribute sections.
+var LOW_SIGNAL_SECTION_SELECTORS = [
+  "script",
+  "style",
+  "noscript",
+  "template",
+  "[data-blog-service-recommendations]",
+  "[data-blog-related-posts]"
+].join(", ");
 /**
  * Collapses every run of whitespace to a single space and trims both ends.
  */
 function cleanText(value) {
   const collapsed = value.replace(/\s+/g, " ");
   return collapsed.trim();
 }
+/**
+ * Removes low-signal markup from a loaded document in place.
+ *
+ * Everything matching LOW_SIGNAL_SECTION_SELECTORS is dropped, and so is the
+ * closest enclosing <section> of any form whose action points at a Substack
+ * subscribe endpoint.
+ *
+ * @param $ - Document handle used as a selector function.
+ */
+function pruneLowSignalContent($) {
+  $(LOW_SIGNAL_SECTION_SELECTORS).remove();
+  $("form").each((_, formElement) => {
+    const form = $(formElement);
+    const actionUrl = cleanText(form.attr("action") ?? "");
+    if (!actionUrl.includes("substack.com/subscribe")) {
+      return;
+    }
+    form.closest("section").remove();
+  });
+}
+/**
+ * Drops very long Markdown lines that look like serialized JSON payloads.
+ *
+ * A line is removed only when its trimmed form is longer than 300 characters
+ * and either starts like an (optionally escaped) JSON array of objects or
+ * contains both escaped "permalink" and "title" keys. Afterwards, runs of
+ * three or more newlines are collapsed to two and the result is trimmed.
+ *
+ * @param {string} markdown
+ * @returns {string}
+ */
+function stripEscapedJsonPayloads(markdown) {
+  const keptLines = [];
+  for (const line of markdown.split("\n")) {
+    const text = line.trim();
+    const isLong = text.length > 300;
+    const startsLikeJsonArray = isLong && /^"?\\?\[\{\\?"[a-z0-9_]+\\?":/i.test(text);
+    const hasEscapedPostFields = isLong && text.includes('\\"permalink\\":') && text.includes('\\"title\\":');
+    if (!startsLikeJsonArray && !hasEscapedPostFields) {
+      keptLines.push(line);
+    }
+  }
+  return keptLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
+}
 function chooseMeaningfulTitle($, fallbackTitle) {
   const candidates = [
     cleanText($("meta[property='og:title']").attr("content") ?? ""),
@@ -1181,14 +1454,27 @@ ${parts.join("\n\n")}
 function extractHtmlToMarkdown(html) {
   const cleaned = stripBoilerplate(html);
   const $ = load(cleaned);
+  pruneLowSignalContent($);
   const fallbackTitle = cleanText($("title").first().text()) || "Untitled";
   const title = chooseMeaningfulTitle($, fallbackTitle);
   const root = $("main").first().html() ?? $.root().html() ?? cleaned;
   return {
-    markdown: turndown.turndown(root),
+    markdown: stripEscapedJsonPayloads(turndown.turndown(root)),
     title
   };
 }
+/**
+ * Reads the first <link rel="canonical"> href from the HTML and resolves it
+ * against `baseUrl`.
+ *
+ * @returns The absolute canonical URL, or null when the link is missing,
+ *   empty after trimming, or cannot be resolved as a URL.
+ */
+function extractCanonicalUriFromHtml(html, baseUrl) {
+  const canonicalLink = load(html)("link[rel='canonical']").first();
+  const rawHref = canonicalLink.attr("href")?.trim();
+  if (!rawHref) {
+    return null;
+  }
+  let resolved = null;
+  try {
+    resolved = new URL(rawHref, baseUrl).href;
+  } catch {
+    resolved = null;
+  }
+  return resolved;
+}
 function parseDateCandidate(value) {
   const trimmed = value.trim();
   if (!trimmed) {
@@ -1251,16 +1537,16 @@ function extractPublicationDateFromHtml(html) {
 }
 // src/ingest/extractors/markdown-extractor.ts
-import { readFile as readFile6 } from "fs/promises";
+import { readFile as readFile5 } from "fs/promises";
 async function extractMarkdown(filePath) {
-  return readFile6(filePath, "utf8");
+  return readFile5(filePath, "utf8");
 }
 // src/ingest/extractors/pdf-extractor.ts
-import { readFile as readFile7 } from "fs/promises";
+import { readFile as readFile6 } from "fs/promises";
 import { PDFParse } from "pdf-parse";
 async function extractPdf(filePath) {
-  const buffer = await readFile7(filePath);
+  const buffer = await readFile6(filePath);
   const parser = new PDFParse({ data: buffer });
   try {
     const parsed = await parser.getText();
@@ -1271,9 +1557,9 @@ async function extractPdf(filePath) {
 }
 // src/ingest/extractors/text-extractor.ts
-import { readFile as readFile8 } from "fs/promises";
+import { readFile as readFile7 } from "fs/promises";
 async function extractText(filePath) {
-  return readFile8(filePath, "utf8");
+  return readFile7(filePath, "utf8");
 }
 // src/ingest/adapters/file-adapter.ts
@@ -1308,7 +1594,7 @@ async function extractFileContent(filePath, mimeType) {
 ${text}`, raw: text };
   }
   if (mimeType === "text/html") {
-    const raw = await readFile9(filePath, "utf8");
+    const raw = await readFile8(filePath, "utf8");
     const extracted = extractHtmlToMarkdown(raw);
     return { title: extracted.title, markdown: `# ${extracted.title}
@@ -1364,8 +1650,8 @@ async function ingestFile({
   const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
   const indexedAt = now;
   const crawledAt = now;
-  await mkdir7(resolve(workspacePath, "normalized"), { recursive: true });
-  await mkdir7(resolve(workspacePath, "raw", source.id), { recursive: true });
+  await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
+  await mkdir8(resolve(workspacePath, "raw", source.id), { recursive: true });
   if (extracted.raw) {
     await writeFile6(rawPath, extracted.raw, "utf8");
   }
@@ -1430,7 +1716,7 @@ ${content}`;
   const now = (/* @__PURE__ */ new Date()).toISOString();
   const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
   const indexedAt = now;
-  await mkdir7(resolve(workspacePath, "normalized"), { recursive: true });
+  await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
   await writeNormalizedDocument({
     documentId,
     sourceId: source.id,
@@ -1474,7 +1760,7 @@ async function reprocessStoredDocument(document, source) {
   if (!document.rawPath) {
     return null;
   }
-  const raw = await readFile9(document.rawPath, "utf8");
+  const raw = await readFile8(document.rawPath, "utf8");
   const fallbackTitle = document.title || basename(document.uri);
   const extracted = await extractRawContent(raw, document.mimeType, fallbackTitle);
   const contentHash = sha256(extracted.markdown);
@@ -1591,8 +1877,21 @@ async function parseRssFeedDocument(xml, source) {
 }
 // src/ingest/adapters/url-adapter.ts
-import { mkdir as mkdir8, readFile as readFile10, writeFile as writeFile7 } from "fs/promises";
+import { mkdir as mkdir9, readFile as readFile9, writeFile as writeFile7 } from "fs/promises";
 import path16 from "path";
+// src/core/urls.ts
+/**
+ * Returns the serialized URL with its fragment removed.
+ * Input that does not parse as an absolute URL is returned unchanged.
+ */
+function normalizeRemoteUrl(uri) {
+  let parsed;
+  try {
+    parsed = new URL(uri);
+  } catch {
+    return uri;
+  }
+  parsed.hash = "";
+  return parsed.href;
+}
+// src/ingest/adapters/url-adapter.ts
 function buildHttpCache(response2, validatedAt) {
   return {
     etag: response2.headers.get("etag") ?? void 0,
@@ -1617,25 +1916,26 @@ async function normalizeRemoteDocument({
   responseStatus
 }) {
   const extracted = extractHtmlToMarkdown(body);
+  const canonicalUri = normalizeRemoteUrl(extractCanonicalUriFromHtml(body, url) ?? url);
   const markdown = `# ${extracted.title}
 ${extracted.markdown}`;
-  const documentId = stableId("doc", source.id, url);
+  const documentId = stableId("doc", source.id, canonicalUri);
   const normalizedPath = path16.resolve(workspacePath, "normalized", `${documentId}.md`);
-  const rawPath = path16.resolve(workspacePath, "raw", source.id, `${sha256(url).slice(0, 12)}.html`);
+  const rawPath = path16.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
   const contentHash = sha256(markdown);
   const now = (/* @__PURE__ */ new Date()).toISOString();
   const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
   const indexedAt = now;
   const crawledAt = now;
   const resolvedPublicationDate = choosePublicationDate(publicationDate, extractPublicationDateFromHtml(body), previous?.publicationDate);
-  await mkdir8(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
+  await mkdir9(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
   await writeFile7(rawPath, body, "utf8");
   await writeNormalizedDocument({
     documentId,
     sourceId: source.id,
     title: extracted.title,
-    uri: url,
+    uri: canonicalUri,
     sourceUri,
     publicationDate: resolvedPublicationDate,
     crawledAt,
@@ -1650,8 +1950,9 @@ ${extracted.markdown}`;
     sourceId: source.id,
     sourceType: source.type,
     title: extracted.title,
-    uri: url,
+    uri: canonicalUri,
     sourceUri,
+    canonicalUri,
     mimeType: "text/html",
     rawPath,
     normalizedPath,
@@ -1749,7 +2050,7 @@ async function reprocessRemoteDocument(document, source) {
   if (!document.rawPath || !await fileExists(document.rawPath)) {
     return null;
   }
-  const raw = await readFile10(document.rawPath, "utf8");
+  const raw = await readFile9(document.rawPath, "utf8");
   const extracted = extractHtmlToMarkdown(raw);
   const markdown = `# ${extracted.title}
@@ -1825,6 +2126,18 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
   if (url.origin !== baseUrl.origin) {
     return false;
   }
+  if (url.search.length > 0) {
+    return false;
+  }
+  if (url.pathname.endsWith(".xml")) {
+    return false;
+  }
+  if (url.pathname.includes("/cdn-cgi/")) {
+    return false;
+  }
+  if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
+    return false;
+  }
   if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
     return false;
   }
@@ -1837,56 +2150,75 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
   }
   return true;
 }
-async function crawlWebsite(source) {
+/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
+function delay(ms) {
+  return new Promise((wake) => {
+    setTimeout(wake, ms);
+  });
+}
+async function crawlWebsite(source, defaults, progress) {
   const baseUrl = new URL(source.uri);
-  const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
+  const userAgent = source.crawl?.userAgent ?? defaults.userAgent;
   const includePatterns = source.crawl?.includePatterns ?? [];
   const excludePatterns = source.crawl?.excludePatterns ?? [];
   const maxDepth = source.crawl?.maxDepth ?? 2;
   const maxPages = source.crawl?.maxPages ?? 100;
-  const rateLimitMs = source.crawl?.rateLimitMs ?? 1e3;
+  const rateLimitMs = source.crawl?.rateLimitMs ?? defaults.rateLimitMs;
+  const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaults.maxConcurrentRequests;
   const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
-  const queue = [{ url: source.uri, depth: 0 }];
   const seen = /* @__PURE__ */ new Set();
   const results = [];
+  let currentLevel = [normalizeRemoteUrl(source.uri)];
   if (source.crawl?.useSitemap !== false) {
-    for (const url of await fetchSitemapUrls(baseUrl, userAgent)) {
-      queue.push({ url, depth: 1 });
-    }
-  }
-  while (queue.length > 0 && results.length < maxPages) {
-    const next = queue.shift();
-    if (!next || seen.has(next.url)) {
-      continue;
-    }
-    seen.add(next.url);
-    const url = new URL(next.url);
-    if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
-      continue;
+    const sitemapUrls = (await fetchSitemapUrls(baseUrl, userAgent)).map((url) => normalizeRemoteUrl(url));
+    reportProgress(progress, `Discovered ${sitemapUrls.length} sitemap URL${sitemapUrls.length === 1 ? "" : "s"} for ${source.uri}`);
+    currentLevel = [
+      ...currentLevel,
+      ...sitemapUrls
+    ];
+  }
+  for (let depth = 0; depth <= maxDepth && currentLevel.length > 0 && results.length < maxPages; depth += 1) {
+    reportProgress(progress, `Crawl depth ${depth}: evaluating ${currentLevel.length} candidate URL${currentLevel.length === 1 ? "" : "s"}`);
+    const nextLevelCandidates = [];
+    const allowedUrls = [];
+    for (const candidate of currentLevel) {
+      const normalizedCandidate = normalizeRemoteUrl(candidate);
+      if (seen.has(normalizedCandidate)) {
+        continue;
+      }
+      seen.add(normalizedCandidate);
+      const url = new URL(normalizedCandidate);
+      if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
+        continue;
+      }
+      allowedUrls.push(normalizedCandidate);
+      results.push(normalizedCandidate);
+      reportProgress(progress, `Discovered ${normalizedCandidate}`);
+      if (results.length >= maxPages) {
+        break;
+      }
     }
-    results.push(url.href);
-    if (next.depth >= maxDepth) {
-      continue;
+    reportProgress(progress, `Crawl depth ${depth}: queued ${allowedUrls.length} page${allowedUrls.length === 1 ? "" : "s"} for link extraction`);
+    if (depth >= maxDepth || results.length >= maxPages) {
+      break;
     }
-    const response2 = await fetch(url, { headers: { "user-agent": userAgent } });
-    const html = await response2.text();
-    const $ = load2(html);
-    $("a[href]").each((_, element) => {
-      const href = $(element).attr("href");
-      if (!href) {
-        return;
-      }
-      try {
-        const target = new URL(href, url);
-        if (!seen.has(target.href)) {
-          queue.push({ url: target.href, depth: next.depth + 1 });
+    await mapWithConcurrency(allowedUrls, maxConcurrentRequests, async (pageUrl) => {
+      const page = new URL(pageUrl);
+      const response2 = await fetch(page, { headers: { "user-agent": userAgent } });
+      const html = await response2.text();
+      const $ = load2(html);
+      $("a[href]").each((_, element) => {
+        const href = $(element).attr("href");
+        if (!href) {
+          return;
         }
-      } catch {
+        try {
+          nextLevelCandidates.push(normalizeRemoteUrl(new URL(href, page).href));
+        } catch {
+        }
+      });
+      if (rateLimitMs > 0) {
+        await delay(rateLimitMs);
       }
     });
-    if (rateLimitMs > 0) {
-      await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
-    }
+    currentLevel = nextLevelCandidates;
   }
   return results;
 }
@@ -1961,6 +2293,8 @@ async function ingestRssSource({
   source,
   previous,
   nextDocuments,
+  maxConcurrentRequests,
+  onDocumentProcessed,
   onFailure
 }) {
   if (source.crawl?.fetchArticles === false) {
@@ -1968,11 +2302,12 @@ async function ingestRssSource({
   }
   const xml = await fetchFeedText(source);
   const items = await parseRssFeedDocument(xml, source);
+  const processedDocumentIds = /* @__PURE__ */ new Set();
   let added = 0;
   let changed = 0;
   let unchanged = 0;
   let failed = 0;
-  for (const item of items) {
+  await mapWithConcurrency(items, maxConcurrentRequests, async (item) => {
     try {
       const probe = previous.get(stableId("doc", source.id, item.url));
       const document = await fetchUrlDocument({
@@ -1983,28 +2318,40 @@ async function ingestRssSource({
         sourceUri: source.uri,
         publicationDate: item.publicationDate
       });
+      if (processedDocumentIds.has(document.id)) {
+        return;
+      }
+      processedDocumentIds.add(document.id);
+      const existingDocument = probe ?? previous.get(document.id);
       nextDocuments.set(document.id, document);
-      if (!probe) {
+      if (!existingDocument) {
         added += 1;
-      } else if (probe.contentHash !== document.contentHash) {
+        onDocumentProcessed?.(document.uri, "added");
+      } else if (existingDocument.contentHash !== document.contentHash) {
         changed += 1;
+        onDocumentProcessed?.(document.uri, "changed");
       } else {
         unchanged += 1;
+        onDocumentProcessed?.(document.uri, "unchanged");
       }
     } catch (error) {
       failed += 1;
       onFailure(item.url, error);
     }
-  }
+  });
   return { added, changed, unchanged, failed };
 }
 async function ingestSources({
   workspacePath,
   sourceIds,
-  changedOnly = false
+  changedOnly = false,
+  progress
 }) {
   const config = await loadConfig(workspacePath);
   const defaultRetentionDays = config.crawler.retentionDays;
+  const defaultUserAgent = config.crawler.defaultUserAgent;
+  const defaultRateLimitMs = config.crawler.rateLimitMs;
+  const defaultMaxConcurrentRequests = config.crawler.maxConcurrentRequests;
   const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
   const existing = await loadDocuments(workspacePath);
   const previous = previousMap(existing);
@@ -2014,20 +2361,38 @@ async function ingestSources({
   let unchanged = 0;
   let failed = 0;
   const failures = [];
+  reportProgress(progress, `Ingesting ${sources.length} source${sources.length === 1 ? "" : "s"}`);
   for (const source of sources) {
+    const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaultMaxConcurrentRequests;
+    const sourceBefore = { added, changed, unchanged, failed };
+    const processedDocumentIds = /* @__PURE__ */ new Set();
+    const reportDocumentOutcome = (uri, outcome) => {
+      const label = outcome === "unchanged" ? "Unchanged" : outcome === "changed" ? "Updated" : "Added";
+      reportProgress(progress, `${label} ${uri}`);
+    };
     const ingestOne = async (uri, producer) => {
       try {
         const probeId = stableId("doc", source.id, uri);
         const earlier = previous.get(probeId);
         const document = await producer();
+        if (processedDocumentIds.has(document.id)) {
+          reportProgressDetail(progress, `Skipped duplicate alias ${uri} -> ${document.uri}`);
+          return null;
+        }
+        processedDocumentIds.add(document.id);
+        const existingDocument = earlier ?? previous.get(document.id);
         nextDocuments.set(document.id, document);
-        if (!earlier) {
+        if (!existingDocument) {
           added += 1;
-        } else if (earlier.contentHash !== document.contentHash) {
+          reportDocumentOutcome(document.uri, "added");
+        } else if (existingDocument.contentHash !== document.contentHash) {
           changed += 1;
+          reportDocumentOutcome(document.uri, "changed");
         } else {
           unchanged += 1;
+          reportDocumentOutcome(document.uri, "unchanged");
         }
+        return document;
       } catch (error) {
         failed += 1;
         failures.push({
@@ -2035,50 +2400,69 @@ async function ingestSources({
           uri,
           message: error instanceof Error ? error.message : String(error)
         });
+        reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
+        return null;
       }
     };
     try {
+      reportProgress(progress, `Source ${source.name} (${source.type})`);
       if (source.type === "file") {
+        reportProgress(progress, `Reading file ${source.uri}`);
         await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
-        continue;
-      }
-      if (source.type === "directory") {
-        for (const filePath of await listDirectoryFiles(source)) {
+      } else if (source.type === "directory") {
+        const files = await listDirectoryFiles(source);
+        reportProgress(progress, `Scanning ${files.length} file${files.length === 1 ? "" : "s"} from ${source.uri}`);
+        for (const filePath of files) {
+          reportProgress(progress, `Reading file ${filePath}`);
           await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
         }
-        continue;
-      }
-      if (source.type === "url") {
+      } else if (source.type === "url") {
+        reportProgress(progress, `Fetching ${source.uri}`);
         await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
-        continue;
-      }
-      if (source.type === "website") {
-        for (const url of await crawlWebsite(source)) {
-          await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
-        }
-        continue;
-      }
-      if (source.type === "rss") {
-        const result2 = await ingestRssSource({
+      } else if (source.type === "website") {
+        reportProgress(progress, `Crawling ${source.uri}`);
+        const urls = await crawlWebsite(source, {
+          userAgent: defaultUserAgent,
+          rateLimitMs: defaultRateLimitMs,
+          maxConcurrentRequests
+        }, progress);
+        reportProgress(progress, `Fetched ${urls.length} page${urls.length === 1 ? "" : "s"} from crawl`);
+        const seenCanonicalUrls = /* @__PURE__ */ new Set();
+        await mapWithConcurrency(urls, maxConcurrentRequests, async (url) => {
+          if (seenCanonicalUrls.has(url)) {
+            reportProgressDetail(progress, `Skipped canonical duplicate ${url}`);
+            return;
+          }
+          reportProgress(progress, `Fetching ${url}`);
+          const document = await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
+          if (document) {
+            seenCanonicalUrls.add(document.uri);
+          }
+        });
+      } else if (source.type === "rss") {
+        reportProgress(progress, `Fetching feed ${source.uri}`);
+        const result = await ingestRssSource({
           workspacePath,
           source,
           previous,
           nextDocuments,
+          maxConcurrentRequests,
+          onDocumentProcessed: reportDocumentOutcome,
           onFailure: (uri, error) => {
             failures.push({
               sourceId: source.id,
               uri,
               message: error instanceof Error ? error.message : String(error)
             });
+            reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
           }
         });
-        added += result2.added;
-        changed += result2.changed;
-        unchanged += result2.unchanged;
-        failed += result2.failed;
-        continue;
-      }
-      if (source.type === "markdown" || source.type === "text") {
+        added += result.added;
+        changed += result.changed;
+        unchanged += result.unchanged;
+        failed += result.failed;
+      } else if (source.type === "markdown" || source.type === "text") {
+        reportProgress(progress, `Processing inline ${source.type} source ${source.id}`);
         await ingestOne(source.uri, () => ingestInlineContent({
           workspacePath,
           source,
@@ -2095,13 +2479,19 @@ async function ingestSources({
         uri: source.uri,
         message: error instanceof Error ? error.message : String(error)
       });
+      reportProgressDetail(progress, `Failed source ${source.name}: ${error instanceof Error ? error.message : String(error)}`);
     }
+    reportProgress(
+      progress,
+      `Finished ${source.name}: +${added - sourceBefore.added} added, ${changed - sourceBefore.changed} changed, ${unchanged - sourceBefore.unchanged} unchanged, ${failed - sourceBefore.failed} failed`
+    );
   }
   const expiringDocuments = [...nextDocuments.values()].filter((document) => {
     const source = sources.find((candidate) => candidate.id === document.sourceId);
     return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
   });
   if (expiringDocuments.length > 0) {
+    reportProgress(progress, `Removing ${expiringDocuments.length} expired RSS document${expiringDocuments.length === 1 ? "" : "s"}`);
     const expiredIds = new Set(expiringDocuments.map((document) => document.id));
     for (const document of expiringDocuments) {
       nextDocuments.delete(document.id);
@@ -2128,6 +2518,7 @@ async function ingestSources({
     documentsSnapshot: documentSnapshot(finalDocuments)
   };
   await writeRun(workspacePath, run);
+  reportProgress(progress, `Ingest complete: ${added} added, ${changed} changed, ${unchanged} unchanged, ${failed} failed`);
   return {
     runId: id,
     documents: { added, changed, unchanged, failed },
@@ -2137,7 +2528,8 @@ async function ingestSources({
 async function reprocessDocuments({
   workspacePath,
   sourceId,
-  documentId
+  documentId,
+  progress
 }) {
   const documents = await loadDocuments(workspacePath);
   const sources = await listSources(workspacePath);
@@ -2145,15 +2537,20 @@ async function reprocessDocuments({
   const nextDocuments = new Map(documents.map((document) => [document.id, document]));
   let documentsReprocessed = 0;
   let documentsSkipped = 0;
-  for (const document of documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId))) {
+  const targets = documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId));
+  reportProgress(progress, `Reprocessing ${targets.length} document${targets.length === 1 ? "" : "s"}`);
+  for (const document of targets) {
+    reportProgressDetail(progress, `Reprocessing ${document.id} (${document.title})`);
     const source = sourceMap.get(document.sourceId);
     if (!source || !document.rawPath || !await fileExists(document.rawPath)) {
       documentsSkipped += 1;
+      reportProgressDetail(progress, `Skipped ${document.id}: raw source not available`);
       continue;
     }
     const updated = source.type === "url" || source.type === "website" || source.type === "rss" ? await reprocessRemoteDocument(document, source) : await reprocessStoredDocument(document, source);
     if (!updated) {
       documentsSkipped += 1;
+      reportProgressDetail(progress, `Skipped ${document.id}: source type could not be reprocessed`);
       continue;
     }
     nextDocuments.set(updated.id, updated);
@@ -2173,15 +2570,217 @@ async function reprocessDocuments({
     },
     documentsSnapshot: documentSnapshot(finalDocuments)
   });
+  reportProgress(progress, `Reprocess complete: ${documentsReprocessed} updated, ${documentsSkipped} skipped`);
   return { runId: id, documentsReprocessed, documentsSkipped };
 }
+// src/ingest/adapters/website-feed-discovery.ts
+import { load as load3 } from "cheerio";
+// Conventional feed paths that buildCommonFeedCandidates resolves against the
+// site URL; the position in this array becomes each candidate's `order`.
+var COMMON_FEED_PATHS = [
+  "/feed",
+  "/feed.xml",
+  "/rss",
+  "/rss.xml",
+  "/atom.xml",
+  "/index.xml",
+  "/blog/feed",
+  "/blog/feed.xml",
+  "/blog/rss.xml",
+  "/blog/atom.xml",
+  "/blog/index.xml",
+  "/news/feed",
+  "/news/feed.xml",
+  "/news/rss.xml",
+  "/news/atom.xml",
+  "/news/index.xml"
+];
+/**
+ * Resolves `href` against `baseUrl` and returns the absolute URL string.
+ * Returns null when resolution fails or the scheme is not http/https.
+ */
+function normalizeCandidateUrl(href, baseUrl) {
+  let resolved;
+  try {
+    resolved = new URL(href, baseUrl);
+  } catch {
+    return null;
+  }
+  const isWebScheme = resolved.protocol === "http:" || resolved.protocol === "https:";
+  return isWebScheme ? resolved.href : null;
+}
+/**
+ * Heuristic for whether a <link> points at a feed: either its type hint
+ * mentions rss/atom/xml, or its href contains /feed, /rss, /atom or ends
+ * in .xml. Both checks are case-insensitive.
+ */
+function looksLikeFeedLink(typeHint, href) {
+  const type = typeHint?.toLowerCase() ?? "";
+  const lowerHref = href.toLowerCase();
+  if (["rss", "atom", "xml"].some((marker) => type.includes(marker))) {
+    return true;
+  }
+  if (["/feed", "/rss", "/atom"].some((fragment) => lowerHref.includes(fragment))) {
+    return true;
+  }
+  return lowerHref.endsWith(".xml");
+}
+/**
+ * Collects feed candidates declared in the page via <link rel="alternate">.
+ *
+ * A link qualifies when its rel tokens include "alternate", it passes
+ * looksLikeFeedLink, and its href resolves to an http(s) URL. `order` is the
+ * link's index among all <link href> elements in the document.
+ *
+ * @returns Array of { url, discoveredBy: "declared", order, typeHint }.
+ */
+function extractDeclaredFeedCandidates(html, baseUrl) {
+  const $ = load3(html);
+  const declared = [];
+  $("link[href]").each((order, linkElement) => {
+    const link = $(linkElement);
+    const relTokens = (link.attr("rel") ?? "").split(/\s+/).map((token) => token.trim().toLowerCase()).filter(Boolean);
+    const href = link.attr("href");
+    if (!href || !relTokens.includes("alternate")) {
+      return;
+    }
+    const typeHint = link.attr("type") ?? void 0;
+    const url = looksLikeFeedLink(typeHint, href) ? normalizeCandidateUrl(href, baseUrl) : null;
+    if (url) {
+      declared.push({ url, discoveredBy: "declared", order, typeHint });
+    }
+  });
+  return declared;
+}
+/**
+ * Builds one "common" candidate per COMMON_FEED_PATHS entry, resolved against
+ * `baseUrl`, with `order` set to the entry's position in that list.
+ */
+function buildCommonFeedCandidates(baseUrl) {
+  const candidates = [];
+  for (const [order, pathname] of COMMON_FEED_PATHS.entries()) {
+    const { href } = new URL(pathname, baseUrl);
+    candidates.push({ url: href, discoveredBy: "common", order });
+  }
+  return candidates;
+}
+/**
+ * Returns the candidates with duplicate URLs removed, keeping the first
+ * occurrence of each URL and preserving the input order.
+ */
+function dedupeCandidates(candidates) {
+  const firstByUrl = /* @__PURE__ */ new Map();
+  for (const candidate of candidates) {
+    if (!firstByUrl.has(candidate.url)) {
+      firstByUrl.set(candidate.url, candidate);
+    }
+  }
+  return [...firstByUrl.values()];
+}
+/**
+ * Heuristic for whether a fetched response is a feed document.
+ *
+ * True when the content type mentions rss or atom, or when the body contains
+ * an <rss, <feed or <rdf:rdf marker (case-insensitive). A generic XML content
+ * type does not decide the outcome by itself; the body markers do.
+ */
+function looksLikeFeedDocument(contentType, body) {
+  const type = contentType?.toLowerCase() ?? "";
+  const lowerBody = body.toLowerCase();
+  if (type.includes("rss") || type.includes("atom")) {
+    return true;
+  }
+  return ["<rss", "<feed", "<rdf:rdf"].some((marker) => lowerBody.includes(marker));
+}
+/**
+ * True when `segment` is a non-empty string containing at least one ASCII
+ * letter; purely numeric segments such as "2024" do not qualify.
+ */
+function hasStablePrefixSegment(segment) {
+  if (typeof segment !== "string" || segment.length === 0) {
+    return false;
+  }
+  return /[a-z]/i.test(segment);
+}
+/**
+ * Derives the path prefix shared by the feed's same-origin item URLs.
+ *
+ * Item URLs that fail to parse or belong to another origin are ignored. The
+ * prefix grows one path segment at a time while every remaining URL has the
+ * same segment at that position and the segment passes hasStablePrefixSegment.
+ *
+ * @param {string[]} itemUrls - Item links taken from the feed.
+ * @param {string} websiteOrigin - Origin the items must match.
+ * @returns {string|undefined} "/seg/.../" with leading and trailing slash, or
+ *   undefined when fewer than two URLs qualify or no segment is shared.
+ */
+function deriveExcludePrefix(itemUrls, websiteOrigin) {
+  const segmentLists = [];
+  for (const itemUrl of itemUrls) {
+    try {
+      const parsed = new URL(itemUrl);
+      if (parsed.origin === websiteOrigin) {
+        segmentLists.push(parsed.pathname.split("/").filter(Boolean));
+      }
+    } catch {
+      // Unparseable item URLs do not take part in the prefix.
+    }
+  }
+  if (segmentLists.length < 2) {
+    return void 0;
+  }
+  const [reference] = segmentLists;
+  const sharedSegments = [];
+  for (const segment of reference) {
+    const position = sharedSegments.length;
+    const isShared = hasStablePrefixSegment(segment) && segmentLists.every((segments) => segments[position] === segment);
+    if (!isShared) {
+      break;
+    }
+    sharedSegments.push(segment);
+  }
+  return sharedSegments.length === 0 ? void 0 : `/${sharedSegments.join("/")}/`;
+}
+/**
+ * Ranks a feed candidate; higher scores are tried first.
+ *
+ * Starts at 1000 for declared candidates and 100 otherwise, subtracts the
+ * candidate's order and 10 per path segment, adds 25 for an rss/atom type
+ * hint and 50 for a root-level well-known feed path, and subtracts 200 when
+ * the path mentions "comments".
+ */
+function scoreCandidate(candidate) {
+  const { pathname } = new URL(candidate.url);
+  const depth = pathname.split("/").filter(Boolean).length;
+  const typeHint = candidate.typeHint?.toLowerCase();
+  const rootFeedPaths = ["/feed", "/feed.xml", "/rss", "/rss.xml", "/atom.xml", "/index.xml"];
+  const base = candidate.discoveredBy === "declared" ? 1e3 : 100;
+  let score = base - candidate.order;
+  score -= depth * 10;
+  if (typeHint?.includes("rss") || typeHint?.includes("atom")) {
+    score += 25;
+  }
+  if (rootFeedPaths.includes(pathname)) {
+    score += 50;
+  }
+  if (pathname.includes("comments")) {
+    score -= 200;
+  }
+  return score;
+}
+/**
+ * Fetches a feed candidate and checks that it really is a feed.
+ *
+ * The candidate is rejected (null) when the request fails, the response is
+ * not ok, the body does not pass looksLikeFeedDocument, or parsing throws.
+ * Parsing uses a placeholder rss source record built only for this call.
+ *
+ * @param candidate - Candidate with `url` and `discoveredBy`.
+ * @param {URL} websiteUrl - Site URL; its origin scopes the exclude prefix.
+ * @param {string} userAgent - Sent as the user-agent request header.
+ * @returns {Promise<{feedUrl, discoveredBy, excludePrefix}|null>}
+ */
+async function validateCandidate(candidate, websiteUrl, userAgent) {
+  try {
+    const { url: feedUrl } = candidate;
+    const response2 = await fetch(feedUrl, { headers: { "user-agent": userAgent } });
+    const body = response2.ok ? await response2.text() : null;
+    if (body === null || !looksLikeFeedDocument(response2.headers.get("content-type"), body)) {
+      return null;
+    }
+    const placeholderTimestamp = "1970-01-01T00:00:00.000Z";
+    const items = await parseRssFeedDocument(body, {
+      id: "src_detected_feed",
+      type: "rss",
+      uri: feedUrl,
+      name: "Detected Feed",
+      enabled: true,
+      tags: [],
+      metadata: {},
+      createdAt: placeholderTimestamp,
+      updatedAt: placeholderTimestamp
+    });
+    const itemUrls = items.map((item) => item.url);
+    return {
+      feedUrl,
+      discoveredBy: candidate.discoveredBy,
+      excludePrefix: deriveExcludePrefix(itemUrls, websiteUrl.origin)
+    };
+  } catch {
+    return null;
+  }
+}
+/**
+ * Finds the best feed for a website.
+ *
+ * Fetches the site URL, gathers declared and well-known feed candidates,
+ * de-duplicates them, and validates them one at a time from highest to
+ * lowest score, returning the first that validates.
+ *
+ * @returns The validated feed description, or null when the page cannot be
+ *   fetched, no candidate validates, or any step throws.
+ */
+async function discoverWebsiteFeed(websiteUrl, userAgent) {
+  try {
+    const baseUrl = new URL(websiteUrl);
+    const homepage = await fetch(baseUrl, { headers: { "user-agent": userAgent } });
+    if (!homepage.ok) {
+      return null;
+    }
+    const html = await homepage.text();
+    const declared = extractDeclaredFeedCandidates(html, baseUrl);
+    const common = buildCommonFeedCandidates(baseUrl);
+    const ranked = dedupeCandidates([...declared, ...common]);
+    ranked.sort((left, right) => scoreCandidate(right) - scoreCandidate(left));
+    for (const candidate of ranked) {
+      const validated = await validateCandidate(candidate, baseUrl, userAgent);
+      if (validated) {
+        return validated;
+      }
+    }
+    return null;
+  } catch {
+    return null;
+  }
+}
 // src/query/search-service.ts
-import { readFile as readFile11 } from "fs/promises";
+import { readFile as readFile10 } from "fs/promises";
 import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
 import path18 from "path";
 async function loadHydratedIndex(workspacePath) {
-  const state = await readLatestIndexState(workspacePath);
+  let state;
+  try {
+    state = await readLatestIndexState(workspacePath);
+  } catch (error) {
+    if (error.code === "ENOENT") {
+      throw new CliError("lexical index is not built; run `qli rebuild` or `qli chunk` followed by `qli index build`", "INDEX_MISSING", 7 /* QueryError */);
+    }
+    throw error;
+  }
   const mapping = createIndexMapping(Object.keys(state.fieldState ?? {}).filter((field) => field.startsWith("metadata.")));
   return new (await import("@tryformation/querylight-ts")).DocumentIndex(mapping).loadState(state);
 }
@@ -2399,7 +2998,7 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
     if (!await fileExists(document.normalizedPath)) {
       return buildSnippet(chunk.text, query);
     }
-    const raw = await readFile11(document.normalizedPath, "utf8");
+    const raw = await readFile10(document.normalizedPath, "utf8");
     orderedChunks = buildChunksForDocument(document, raw, config);
     orderedChunkCache.set(document.id, orderedChunks);
   }
@@ -2417,9 +3016,25 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
 /**
  * Clean a title for display.
  *
  * Removes a trailing "| Querylight TS Demo" suffix (case-insensitive),
  * collapses every whitespace run to a single space, and trims both ends.
  *
  * @param title Raw title string.
  * @returns The cleaned title.
  */
 function normalizeDisplayTitle(title) {
   const withoutSiteSuffix = title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "");
   const singleSpaced = withoutSiteSuffix.replace(/\s+/g, " ");
   return singleSpaced.trim();
 }
+// Headings that chooseResultTitle drops from a chunk's heading path before it
+// picks a result title. Lookups use `heading.toLowerCase()`, so every entry
+// here must be lowercase. Both "uberblick" and "\xFCberblick" (with u-umlaut)
+// are listed.
+var LOW_SIGNAL_RESULT_TITLES = /* @__PURE__ */ new Set([
+  "choose this instead of",
+  "how xyz runs it",
+  "naechste schritte",
+  "next steps",
+  "overview",
+  "passend wenn",
+  "problem",
+  "right fit",
+  "waehlen sie das stattdessen",
+  "was sie bekommen",
+  "what you get",
+  "wie xyz es umsetzt",
+  "uberblick",
+  "\xFCberblick"
+]);
 function chooseResultTitle(chunk) {
   const documentTitle = normalizeDisplayTitle(chunk.title);
-  const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter(Boolean);
+  const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter((heading) => heading.length > 0 && !LOW_SIGNAL_RESULT_TITLES.has(heading.toLowerCase()));
   const leafHeading = headings.at(-1);
   if (leafHeading && leafHeading.toLowerCase() !== documentTitle.toLowerCase()) {
     return leafHeading;
@@ -2441,6 +3056,9 @@ function normalizeUriPath(uri) {
     return uri.toLowerCase().replace(/\/+$/, "");
   }
 }
+/**
+ * Build a comparison key for a URI: the output of `normalizeRemoteUrl`,
+ * lower-cased, with all trailing slashes removed.
+ *
+ * @param uri URI to normalize.
+ * @returns The normalized identity string.
+ */
+function normalizeUriIdentity(uri) {
+  const canonical = normalizeRemoteUrl(uri);
+  const lowered = canonical.toLowerCase();
+  return lowered.replace(/\/+$/, "");
+}
 function uriSpecificity(uri) {
   const normalized = normalizeUriPath(uri);
   if (normalized === "/") {
@@ -2457,6 +3075,11 @@ function isMoreSpecificDuplicate(candidate, existing) {
   if (!candidateTitle || candidateTitle !== existingTitle) {
     return false;
   }
+  const candidateIdentity = normalizeUriIdentity(candidate.uri);
+  const existingIdentity = normalizeUriIdentity(existing.uri);
+  if (candidateIdentity === existingIdentity) {
+    return candidate.uri.length < existing.uri.length;
+  }
   const candidatePath = normalizeUriPath(candidate.uri);
   const existingPath = normalizeUriPath(existing.uri);
   if (candidatePath === existingPath) {
@@ -2471,28 +3094,28 @@ function isMoreSpecificDuplicate(candidate, existing) {
 }
+/**
+ * Collapse results that `isMoreSpecificDuplicate` treats as duplicates of one
+ * another, keeping one entry per duplicate group.
+ *
+ * Results are processed in input order. A result with no duplicate among the
+ * kept entries is appended. Otherwise it replaces the kept entry only when it
+ * is the more specific of the two, and it takes that entry's position.
+ *
+ * @param results Results to de-duplicate, in ranking order.
+ * @param topK Maximum number of entries to return.
+ * @returns At most `topK` de-duplicated results.
+ */
 function collapseAggregateDuplicates(results, topK) {
   const deduped = [];
-  for (const result2 of results) {
+  for (const result of results) {
+    // The duplicate relation is checked in both directions.
     const duplicateIndex = deduped.findIndex(
-      (existing) => isMoreSpecificDuplicate(result2, existing) || isMoreSpecificDuplicate(existing, result2)
+      (existing) => isMoreSpecificDuplicate(result, existing) || isMoreSpecificDuplicate(existing, result)
     );
     if (duplicateIndex < 0) {
-      deduped.push(result2);
+      deduped.push(result);
       continue;
     }
-    if (isMoreSpecificDuplicate(result2, deduped[duplicateIndex])) {
-      deduped[duplicateIndex] = result2;
+    if (isMoreSpecificDuplicate(result, deduped[duplicateIndex])) {
+      deduped[duplicateIndex] = result;
     }
   }
   return deduped.slice(0, topK);
 }
 function rerankResultsByDocument(results, topK) {
   const byDocument = /* @__PURE__ */ new Map();
-  for (const result2 of results) {
-    const existing = byDocument.get(result2.documentId);
+  for (const result of results) {
+    const existing = byDocument.get(result.documentId);
     if (existing) {
-      existing.push(result2);
+      existing.push(result);
     } else {
-      byDocument.set(result2.documentId, [result2]);
+      byDocument.set(result.documentId, [result]);
     }
   }
   const reranked = [...byDocument.values()].flatMap((group) => {
@@ -2501,7 +3124,7 @@ function rerankResultsByDocument(results, topK) {
     if (!best) {
       return [];
     }
-    const tailScore = rest.reduce((sum, result2) => sum + result2.score, 0);
+    const tailScore = rest.reduce((sum, result) => sum + result.score, 0);
     const aggregateScore = best.score + tailScore * 0.35 + (group.length - 1) * 0.2;
     return [{ ...best, score: aggregateScore }];
   }).sort((left, right) => right.score - left.score);
@@ -2569,7 +3192,6 @@ async function searchIndex({
           score: 0,
           title: chooseResultTitle(chunk),
           uri: chunk.uri,
-          headingPath: chunk.headingPath,
           snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
             document,
             config,
@@ -2584,7 +3206,7 @@ async function searchIndex({
         };
       })
     );
-    return { retrievalMode: "lexical", results: latestResults.filter((result2) => result2 != null) };
+    return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
   }
   const lexicalHits = async () => {
     const index = await loadHydratedIndex(workspacePath);
@@ -2633,7 +3255,6 @@ async function searchIndex({
       score,
       title: chooseResultTitle(chunk),
       uri: chunk.uri,
-      headingPath: chunk.headingPath,
       snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
         document: documents.get(chunk.documentId),
         config,
@@ -2647,13 +3268,13 @@ async function searchIndex({
       metadata: chunk.metadata
     };
   }));
-  const results = rawResults.filter((result2) => result2 != null);
+  const results = rawResults.filter((result) => result != null);
   return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
 }
 // src/query/related-service.ts
 import path19 from "path";
-function cosineSimilarity(left, right) {
+function cosineSimilarity2(left, right) {
   let dot = 0;
   let leftNorm = 0;
   let rightNorm = 0;
@@ -2739,7 +3360,7 @@ async function findRelatedDocuments({
   const results = [...vectors.values()].filter((candidate) => candidate.document.id !== selected.id).map((candidate) => ({
     documentId: candidate.document.id,
     sourceId: candidate.document.sourceId,
-    score: cosineSimilarity(sourceVector.embedding, candidate.embedding),
+    score: cosineSimilarity2(sourceVector.embedding, candidate.embedding),
     title: candidate.document.title,
     uri: candidate.document.uri,
     metadata: candidate.document.metadata
@@ -2767,21 +3388,20 @@ async function createContext({
   const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
   const sources = [];
   let total = 0;
-  for (const result2 of search.results) {
-    const text = result2.text ?? "";
+  for (const result of search.results) {
+    const text = result.text ?? "";
     if (total + text.length > maxChars && sources.length > 0) {
       break;
     }
     total += text.length;
     sources.push({
-      chunkId: result2.chunkId,
-      documentId: result2.documentId,
-      sourceId: result2.sourceId,
-      title: result2.title,
-      uri: result2.uri,
-      headingPath: result2.headingPath,
+      chunkId: result.chunkId,
+      documentId: result.documentId,
+      sourceId: result.sourceId,
+      title: result.title,
+      uri: result.uri,
       text,
-      metadata: result2.metadata
+      metadata: result.metadata
     });
   }
   const markdown = [
@@ -2792,7 +3412,6 @@ async function createContext({
       `Title: ${source.title}`,
       `URL: ${source.uri}`,
       `Chunk ID: ${source.chunkId}`,
-      source.headingPath.length > 0 ? `Heading Path: ${source.headingPath.join(" > ")}` : "",
       "",
       source.text,
       ""
@@ -2871,27 +3490,30 @@ function formatSourcesTable(sources) {
   return table.toString();
 }
+/**
+ * Render search results as human-readable text.
+ *
+ * Each result becomes a numbered title passed through `colors.bold`, a
+ * "URL:" line, one line combining source type, publication date ("n/a" when
+ * nullish) and score to three decimals, a blank line, and then the snippet.
+ * Non-empty snippet lines are indented by three spaces; empty lines stay
+ * empty. Results are separated by a line containing `colors.dim("---")`.
+ *
+ * @param results Search results to format.
+ * @returns The formatted text; an empty string for an empty array.
+ */
 function formatSearchResults(results) {
-  return results.map((result2, index) => [
-    `${index + 1}. ${colors.bold(result2.title)}`,
-    `   ${result2.uri}`,
-    `   Source type: ${result2.sourceType}`,
-    `   Published: ${result2.publicationDate ?? "n/a"}`,
-    `   Score: ${result2.score.toFixed(3)}`,
-    `   ${result2.snippet}`
-  ].join("\n")).join("\n\n");
+  return results.map((result, index) => [
+    `${index + 1}. ${colors.bold(result.title)}`,
+    `   URL: ${result.uri}`,
+    `   Source: ${result.sourceType} | Published: ${result.publicationDate ?? "n/a"} | Score: ${result.score.toFixed(3)}`,
+    "",
+    ...result.snippet.split("\n").map((line) => line.length > 0 ? `   ${line}` : "")
+  ].join("\n")).join(`
+${colors.dim("---")}
+`);
 }
+/**
+ * Render related-document results as human-readable text.
+ *
+ * Each result becomes three lines: a numbered title passed through
+ * `colors.bold`, the URI, and the similarity score to three decimals.
+ * Results are separated by a blank line.
+ *
+ * @param results Related-document results to format.
+ * @returns The formatted text; an empty string for an empty array.
+ */
 function formatRelatedDocuments(results) {
-  return results.map((result2, index) => [
-    `${index + 1}. ${colors.bold(result2.title)}`,
-    `   ${result2.uri}`,
-    `   Similarity: ${result2.score.toFixed(3)}`
+  return results.map((result, index) => [
+    `${index + 1}. ${colors.bold(result.title)}`,
+    `   ${result.uri}`,
+    `   Similarity: ${result.score.toFixed(3)}`
   ].join("\n")).join("\n\n");
 }
 // src/cli/run-cli.ts
+// Source types accepted after normalization; membership is checked by
+// parseSourceType and by the `source add` action.
 var SOURCE_TYPES = /* @__PURE__ */ new Set(["url", "website", "rss", "file", "directory", "markdown", "text"]);
+// Retrieval modes, as a Set and (below) as an ordered list.
 var RETRIEVAL_MODES = /* @__PURE__ */ new Set(["lexical", "dense", "sparse", "hybrid"]);
-var SOURCE_TYPE_LIST = ["url", "website", "rss", "file", "directory", "markdown", "text"];
+// Source type names shown in the `source add` help text. This list shows
+// "page" where SOURCE_TYPES has "url"; parseSourceType maps "page" to "url".
+var SOURCE_TYPE_LIST = ["page", "website", "rss", "file", "directory", "markdown", "text"];
 var RETRIEVAL_MODE_LIST = ["lexical", "dense", "sparse", "hybrid"];
+// Date field names for search. NOTE(review): where these are consumed is not
+// visible in this part of the file; confirm before relying on it.
 var SEARCH_DATE_FIELDS = ["publicationDate", "firstSeenAt", "lastSeenAt", "lastChangedAt", "crawledAt"];
 function parseKeyValue(input) {
@@ -2914,11 +3536,46 @@ function parseOptionalNumber(input, optionName) {
   }
   return value;
 }
+/**
+ * Parse an optional CLI value that must be an integer of at least 1.
+ *
+ * Parsing is delegated to `parseOptionalNumber`; an `undefined` result from
+ * it is passed through unchanged.
+ *
+ * @param input Raw option value.
+ * @param optionName Flag name used in the error message.
+ * @returns The parsed integer, or `undefined` when no value was parsed.
+ * @throws CliError with code `INVALID_ARGUMENT` when the parsed value is not
+ *   an integer or is below 1.
+ */
+function parseOptionalPositiveInteger(input, optionName) {
+  const parsed = parseOptionalNumber(input, optionName);
+  if (parsed === void 0) {
+    return void 0;
+  }
+  const isPositiveInteger = Number.isInteger(parsed) && parsed >= 1;
+  if (isPositiveInteger) {
+    return parsed;
+  }
+  throw new CliError(`invalid positive integer for ${optionName}: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
+}
 /**
  * Assign `value` to `target[key]` unless the value is `undefined`.
  *
  * `null` and other falsy values are still assigned.
  *
  * @param target Object to mutate.
  * @param key Property name to set.
  * @param value Value to store; skipped when `undefined`.
  */
 function setWhenDefined(target, key, value) {
   if (value === void 0) {
     return;
   }
   target[key] = value;
 }
+/**
+ * Return a copy of `existing` with `extra` appended when it is truthy and
+ * not already in the list.
+ *
+ * The input array is never mutated.
+ *
+ * @param existing Current patterns; may be nullish.
+ * @param extra Optional pattern to add.
+ * @returns The merged array, or `undefined` when it would be empty.
+ */
+function mergePatterns(existing, extra) {
+  const combined = [...(existing ?? [])];
+  const shouldAppend = Boolean(extra) && !combined.includes(extra);
+  if (shouldAppend) {
+    combined.push(extra);
+  }
+  if (combined.length === 0) {
+    return void 0;
+  }
+  return combined;
+}
+/**
+ * Build the human-readable summary printed after adding a website source.
+ *
+ * The first line always names the primary source. The second line reports
+ * the feed outcome: none detected, detected and added as a source, detected
+ * with an existing source, or detected with no source attached. When an
+ * exclude prefix is present, a third line reports it.
+ *
+ * @param result Object with `primarySource.id` and a nullable `detectedFeed`
+ *   carrying `url`, `source`, `wasAdded` and `excludePrefix`.
+ * @returns The summary lines joined with newlines.
+ */
+function formatWebsiteSourceAdd(result) {
+  const header = `Added source ${result.primarySource.id}`;
+  const feed = result.detectedFeed;
+  if (!feed) {
+    return [header, "No feed detected during website registration."].join("\n");
+  }
+  let feedLine;
+  if (!feed.source) {
+    feedLine = `Detected feed ${feed.url}.`;
+  } else if (feed.wasAdded) {
+    feedLine = `Detected feed ${feed.url} and added source ${feed.source.id}.`;
+  } else {
+    feedLine = `Detected feed ${feed.url}. Source ${feed.source.id} already exists.`;
+  }
+  const output = [header, feedLine];
+  if (feed.excludePrefix) {
+    output.push(`Excluded ${feed.excludePrefix} from the website crawl.`);
+  }
+  return output.join("\n");
+}
 function createSourceCrawlConfig(type, options, defaults) {
   if (!["url", "website", "directory", "rss"].includes(type)) {
     return void 0;
@@ -2926,6 +3583,7 @@ function createSourceCrawlConfig(type, options, defaults) {
   const crawl = {};
   setWhenDefined(crawl, "maxDepth", parseOptionalNumber(options.maxDepth, "--max-depth"));
   setWhenDefined(crawl, "maxPages", parseOptionalNumber(options.maxPages, "--max-pages"));
+  setWhenDefined(crawl, "maxConcurrentRequests", parseOptionalPositiveInteger(options.maxConcurrentRequests, "--max-concurrent-requests"));
   setWhenDefined(crawl, "includePatterns", options.include);
   setWhenDefined(crawl, "excludePatterns", options.exclude);
   setWhenDefined(crawl, "obeyRobotsTxt", options.robots);
@@ -2944,14 +3602,48 @@ function createSourceCrawlConfig(type, options, defaults) {
   }
   return Object.keys(crawl).length > 0 ? crawl : void 0;
 }
+/**
+ * Reject `source add` options that do not apply to the chosen source type.
+ *
+ * Rules are checked in the order listed, so the first offending option is
+ * the one reported.
+ *
+ * @param type Normalized source type.
+ * @param options Parsed command options.
+ * @throws CliError with code `INVALID_ARGUMENT` naming the unsupported flag.
+ */
+function validateSourceAddOptions(type, options) {
+  const websiteOnly = ["website"];
+  const rules = [
+    { flag: "--max-depth", isSet: options.maxDepth !== void 0, allowedTypes: websiteOnly },
+    { flag: "--max-pages", isSet: options.maxPages !== void 0, allowedTypes: websiteOnly },
+    { flag: "--max-concurrent-requests", isSet: options.maxConcurrentRequests !== void 0, allowedTypes: ["website", "rss"] },
+    // --render-js counts as set only when truthy.
+    { flag: "--render-js", isSet: Boolean(options.renderJs), allowedTypes: websiteOnly },
+    // --no-robots counts as set only when robots is exactly false.
+    { flag: "--no-robots", isSet: options.robots === false, allowedTypes: websiteOnly },
+    { flag: "--rate-limit-ms", isSet: options.rateLimitMs !== void 0, allowedTypes: websiteOnly },
+    { flag: "--include", isSet: options.include !== void 0, allowedTypes: ["website", "directory"] },
+    { flag: "--exclude", isSet: options.exclude !== void 0, allowedTypes: ["website", "directory"] },
+    { flag: "--retention-days", isSet: options.retentionDays !== void 0, allowedTypes: ["rss"] }
+  ];
+  for (const { flag, isSet, allowedTypes } of rules) {
+    if (isSet && !allowedTypes.includes(type)) {
+      throw new CliError(`${flag} is not supported for source type ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
+    }
+  }
+}
 function allowedSourceConfigFields(source) {
   const fields = /* @__PURE__ */ new Set(["name", "tag", "metadata"]);
   if (source.type === "rss") {
     fields.add("retentionDays");
+    fields.add("maxConcurrentRequests");
   }
   if (source.type === "website") {
     fields.add("maxDepth");
     fields.add("maxPages");
+    fields.add("maxConcurrentRequests");
     fields.add("include");
     fields.add("exclude");
   }
@@ -2987,6 +3679,10 @@ function buildSourceConfigPatch(source, options) {
     checkAllowed("maxPages", "--max-pages");
     crawlPatch.maxPages = parseOptionalNumber(options.maxPages, "--max-pages");
   }
+  if (options.maxConcurrentRequests !== void 0) {
+    checkAllowed("maxConcurrentRequests", "--max-concurrent-requests");
+    crawlPatch.maxConcurrentRequests = parseOptionalPositiveInteger(options.maxConcurrentRequests, "--max-concurrent-requests");
+  }
   if (options.include !== void 0) {
     checkAllowed("include", "--include");
     crawlPatch.includePatterns = options.include;
@@ -3016,6 +3712,50 @@ function response(command, workspace, data, error) {
 }
+/**
+ * Record one output value and forward it to an optional listener.
+ *
+ * The value is pushed onto `capture.stderr` or `capture.stdout`, and then
+ * `capture.onStderr` or `capture.onStdout` is called with it, if defined.
+ *
+ * @param capture Capture object holding the `stdout` / `stderr` arrays and
+ *   the optional `onStdout` / `onStderr` callbacks.
+ * @param value Output value to record.
+ * @param stderr When true the value goes to the stderr side; default false.
+ */
 function writeOutput(capture, value, stderr = false) {
   (stderr ? capture.stderr : capture.stdout).push(value);
+  // The listeners are optional, hence the optional calls.
+  if (stderr) {
+    capture.onStderr?.(value);
+    return;
+  }
+  capture.onStdout?.(value);
+}
+/**
+ * Create the progress callback for long-running commands.
+ *
+ * @param capture Capture object handed to `writeOutput`.
+ * @param options Global CLI options (`json`, `silent`, `quiet`, `verbose`).
+ * @returns `undefined` when `json`, `silent` or `quiet` is set. Otherwise a
+ *   `(level, message)` function that writes the message to stderr, skipping
+ *   "detail" messages unless `verbose` is set.
+ */
+function createProgressHandler(capture, options) {
+  const progressDisabled = Boolean(options.json || options.silent || options.quiet);
+  if (progressDisabled) {
+    return void 0;
+  }
+  const emitProgress = (level, message) => {
+    const isDetail = level === "detail";
+    if (!isDetail || options.verbose) {
+      writeOutput(capture, message, true);
+    }
+  };
+  return emitProgress;
+}
+/**
+ * Run the full ingest pipeline: ingest sources, chunk documents, build index.
+ *
+ * The three steps run strictly in sequence. `progress`, when provided, gets
+ * an "info" message before each step and after the last one, and is also
+ * passed to each step.
+ *
+ * @param workspace Workspace path passed to every step.
+ * @param sourceId Optional source id; limits ingest to that source and is
+ *   also passed to `chunkDocuments`.
+ * @param changedOnly Passed through to `ingestSources`.
+ * @param dense When truthy, passes `denseOverride: true` to `buildIndex`;
+ *   otherwise the override is left `undefined`.
+ * @param sparse Same as `dense`, for `sparseOverride`.
+ * @param progress Optional `(level, message)` callback.
+ * @returns `{ ingest, chunk, indexPath, metadata }` collected from the steps.
+ */
+async function runIngestCommand({
+  workspace,
+  sourceId,
+  changedOnly,
+  dense,
+  sparse,
+  progress
+}) {
+  progress?.("info", "Ingest step 1/3: fetch and normalize");
+  const ingest = await ingestSources({
+    workspacePath: workspace,
+    sourceIds: sourceId ? [sourceId] : void 0,
+    changedOnly,
+    progress
+  });
+  progress?.("info", "Ingest step 2/3: chunk affected documents");
+  const chunk = await chunkDocuments({ workspacePath: workspace, sourceId, progress });
+  progress?.("info", "Ingest step 3/3: refresh index");
+  // A false flag becomes undefined rather than false.
+  const indexBuild = await buildIndex({
+    workspacePath: workspace,
+    denseOverride: dense ? true : void 0,
+    sparseOverride: sparse ? true : void 0,
+    buildAvailableModels: true,
+    progress
+  });
+  progress?.("info", "Ingest complete");
+  return { ingest, chunk, indexPath: indexBuild.indexPath, metadata: indexBuild.metadata };
 }
 function parseRetrievalMode(input) {
   if (!input) {
@@ -3030,10 +3770,11 @@ function parseSourceType(input) {
   if (!input) {
     return void 0;
   }
-  if (!SOURCE_TYPES.has(input)) {
+  const normalized = input === "page" ? "url" : input;
+  if (!SOURCE_TYPES.has(normalized)) {
     throw new CliError(`unsupported source type: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
   }
-  return input;
+  return normalized;
 }
 function parseCommaSeparatedList(input) {
   const values = (input ?? "").split(",").map((value) => value.trim()).filter(Boolean);
@@ -3094,56 +3835,96 @@ function workspaceFromArgv(argv) {
   }
   return path21.resolve(DEFAULT_WORKSPACE);
 }
-async function runCli(argv) {
-  const capture = { stdout: [], stderr: [] };
+async function runCli(argv, io = {}) {
+  const capture = { stdout: [], stderr: [], ...io };
   const program = new Command();
-  program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--verbose", "Print more operational detail when a command supports it.").option("--quiet", "Suppress non-essential human-readable output.");
+  program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--silent", "Suppress progress logging for long-running commands.").option("--verbose", "Print more operational detail when a command supports it.").addOption(new Option("--quiet", "Deprecated alias for --silent.").hideHelp());
   program.addHelpText("after", `
 Workflow:
   1. Initialize a workspace with qli init
   2. Register one or more sources with qli source add
-  3. Build or refresh the workspace with qli rebuild
+  3. Refresh the workspace with qli ingest
   4. Query it with qli search, qli related, or qli context
 Examples:
   qli init
   qli source add directory ./docs --name "Product Docs" --tag docs
-  qli rebuild
+  qli ingest
+  qli rebuild --silent
   qli search "api authentication" --top-k 8
   qli context "How do API keys work?" --top-k 8 --max-chars 8000
+Long-running commands print progress to stderr by default. Use --silent to suppress it.
+Use --json when another tool needs stable structured output.
 Use qli <command> --help for command-specific options and examples.`);
-  program.command("init").description("Create a new workspace with the default directory layout and config.").option("--force").addHelpText("after", `
+  program.command("init").description("Create a new workspace with the default directory layout and config, then pull missing retrieval models.").option("--force").addHelpText("after", `
 Examples:
   qli init
   qli init --workspace ./kb
-  qli init --workspace /tmp/querylight --force`).action(async function command(options) {
+  qli init --workspace /tmp/querylight --force
+Notes:
+  init enables dense and sparse retrieval in new workspaces.
+  init pulls missing model assets for enabled retrieval modes.
+  Sparse model downloads require uv. If uv is not available, init skips the sparse pull.`).action(async function command(options) {
+    const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: this.optsWithGlobals().workspace });
-    const result2 = await ensureWorkspace({ workspacePath: workspace, force: Boolean(options.force) });
-    emit(this.optsWithGlobals().json, capture, response("init", workspace, result2), `Initialized workspace at ${workspace}`);
+    const result = await ensureWorkspace({ workspacePath: workspace, force: Boolean(options.force) });
+    const config = await loadConfig(workspace, global.config);
+    const status = await getModelStatus(workspace, config);
+    const { pullDense, pullSparse } = resolveMissingConfiguredModelPullPlan({ config, status });
+    if (pullDense || pullSparse) {
+      await pullModels({ workspacePath: workspace, config, pullDense, pullSparse, progress: createProgressHandler(capture, global) });
+    }
+    emit(this.optsWithGlobals().json, capture, response("init", workspace, result), `Initialized workspace at ${workspace}`);
   });
   const source = program.command("source");
   source.description("Register, inspect, and manage workspace sources.");
-  source.command("add").description("Add a source definition. The source is enabled immediately.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
+  source.command("add").description("Add a source definition. The source is enabled immediately. Use `page` for one page and `website` for multi-page crawling and feed detection.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--max-concurrent-requests <n>", "Maximum remote requests in flight for a website or feed source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
 Examples:
   qli source add directory ./docs --name "Local Docs" --tag docs
   qli source add file ./docs/auth.md --name "Auth Guide"
-  qli source add url https://example.com/docs/auth --name "Auth Page"
+  qli source add page https://example.com/docs/auth --name "Auth Page"
   qli source add website https://example.com --name "Docs Site" --max-depth 2 --max-pages 50 --include /docs/
+  qli source add website https://example.com --name "Docs Site" --max-concurrent-requests 8
+  qli source add website https://example.com --name "Example Site" --json
   qli source add rss https://example.com/feed.xml --name "Release Feed"
+  qli source add rss https://example.com/feed.xml --name "Release Feed" --max-concurrent-requests 3
   qli source add rss https://example.com/feed.xml --name "Release Feed" --retention-days 30
 Notes:
+  page stores one page. It does not crawl links or detect feeds.
+  Website sources may detect one blog or news feed during registration.
+  When a feed is added, qli also excludes the feed item prefix from the website crawl when it can infer one.
+  Website and RSS sources default to 5 remote requests in flight per source unless config.yaml or source settings override it.
+  Use --json when automation needs the full list of created sources.
   RSS sources store retention per feed.
-  When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(type, uri, options) {
+  When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(typeInput, uri, options) {
+    const type = parseSourceType(typeInput);
+    if (!type) {
+      throw new CliError(`unsupported source type: ${typeInput}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
+    }
     if (!SOURCE_TYPES.has(type)) {
       throw new CliError(`unsupported source type: ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
     }
+    validateSourceAddOptions(type, options);
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
     const config = await loadConfig(workspace, global.config);
     const now = (/* @__PURE__ */ new Date()).toISOString();
-    const crawl = createSourceCrawlConfig(type, options, { retentionDays: config.crawler.retentionDays });
+    const initialCrawl = createSourceCrawlConfig(type, options, { retentionDays: config.crawler.retentionDays });
+    let crawl = initialCrawl;
+    let detectedFeed = null;
+    if (type === "website") {
+      detectedFeed = await discoverWebsiteFeed(uri, config.crawler.defaultUserAgent);
+      if (detectedFeed?.excludePrefix) {
+        crawl = {
+          ...crawl ?? {},
+          excludePatterns: mergePatterns(crawl?.excludePatterns, detectedFeed.excludePrefix)
+        };
+      }
+    }
     const stored = await addSource(workspace, {
       type,
       uri: ["file", "directory"].includes(type) ? path21.resolve(uri) : uri,
@@ -3155,11 +3936,50 @@ Notes:
       createdAt: now,
       updatedAt: now
     });
-    emit(global.json, capture, response("source add", workspace, stored), `Added source ${stored.id}`);
+    if (type !== "website") {
+      emit(global.json, capture, response("source add", workspace, stored), `Added source ${stored.id}`);
+      return;
+    }
+    let feedSource;
+    let feedWasAdded = false;
+    if (detectedFeed) {
+      const existingSources = await listSources(workspace);
+      feedSource = existingSources.find((source2) => source2.uri === detectedFeed?.feedUrl);
+      if (!feedSource) {
+        feedSource = await addSource(workspace, {
+          type: "rss",
+          uri: detectedFeed.feedUrl,
+          name: `${options.name} Feed`,
+          enabled: true,
+          tags: options.tag ?? [],
+          metadata: normalizeMetadata(options.metadata),
+          crawl: {
+            retentionDays: config.crawler.retentionDays,
+            fetchArticles: true
+          },
+          createdAt: now,
+          updatedAt: now
+        });
+        feedWasAdded = true;
+      }
+    }
+    const result = {
+      primarySource: stored,
+      addedSources: [stored, ...feedWasAdded && feedSource ? [feedSource] : []],
+      detectedFeed: detectedFeed ? {
+        url: detectedFeed.feedUrl,
+        discoveredBy: detectedFeed.discoveredBy,
+        excludePrefix: detectedFeed.excludePrefix,
+        source: feedSource,
+        wasAdded: feedWasAdded
+      } : null
+    };
+    emit(global.json, capture, response("source add", workspace, result), formatWebsiteSourceAdd(result));
   });
-  source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
+  source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--max-concurrent-requests <n>", "Set the remote request concurrency limit for website or feed sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
 Examples:
   qli source config src_123 --retention-days 30
+  qli source config src_123 --max-concurrent-requests 2
   qli source config src_123 --name "Docs Feed" --tag rss docs
   qli source config src_123 --include /docs/ --exclude /docs/archive/
   qli source config src_123 --metadata team=docs owner=platform --json
@@ -3218,35 +4038,56 @@ Examples:
     const updated = await updateSource(workspace, sourceId, { enabled: true, updatedAt: (/* @__PURE__ */ new Date()).toISOString() });
     emit(global.json, capture, response("source enable", workspace, updated), `Enabled source ${sourceId}`);
   });
-  program.command("ingest").description("Fetch and normalize source content into workspace documents.").option("--source <sourceId>", "Only ingest one source.").option("--changed-only", "Skip content that has not changed since the last run.").addHelpText("after", `
+  program.command("ingest").description("Fetch source content, update affected chunks, and refresh retrieval indexes.").option("--source <sourceId>", "Only ingest one source.").option("--changed-only", "Skip content that has not changed since the last run.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
 Examples:
   qli ingest
   qli ingest --source src_123
-  qli ingest --changed-only`).action(async function command(options) {
+  qli ingest --changed-only
+  qli ingest --dense --sparse
+  qli ingest --silent`).action(async function command(options) {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
-    const result2 = await ingestSources({ workspacePath: workspace, sourceIds: options.source ? [options.source] : void 0, changedOnly: Boolean(options.changedOnly) });
-    emit(global.json, capture, response("ingest", workspace, result2), `Ingested ${result2.processedSources} sources`);
+    const result = await runIngestCommand({
+      workspace,
+      sourceId: options.source,
+      changedOnly: Boolean(options.changedOnly),
+      dense: Boolean(options.dense),
+      sparse: Boolean(options.sparse),
+      progress: createProgressHandler(capture, global)
+    });
+    emit(global.json, capture, response("ingest", workspace, result), `Processed ${result.ingest.processedSources} sources, wrote ${result.chunk.chunksWritten} chunks`);
   });
   program.command("chunk").description("Split normalized documents into retrieval chunks.").option("--source <sourceId>", "Only chunk documents from one source.").option("--document <documentId>", "Only chunk one document.").addHelpText("after", `
 Examples:
   qli chunk
   qli chunk --source src_123
-  qli chunk --document doc_123`).action(async function command(options) {
+  qli chunk --document doc_123
+  qli chunk --silent`).action(async function command(options) {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
-    const result2 = await chunkDocuments({ workspacePath: workspace, sourceId: options.source, documentId: options.document });
-    emit(global.json, capture, response("chunk", workspace, result2), `Wrote ${result2.chunksWritten} chunks`);
+    const result = await chunkDocuments({
+      workspacePath: workspace,
+      sourceId: options.source,
+      documentId: options.document,
+      progress: createProgressHandler(capture, global)
+    });
+    emit(global.json, capture, response("chunk", workspace, result), `Wrote ${result.chunksWritten} chunks`);
   });
   program.command("reprocess").description("Re-run normalization for existing documents without fetching sources again.").option("--source <sourceId>", "Only reprocess documents from one source.").option("--document <documentId>", "Only reprocess one document.").addHelpText("after", `
 Examples:
   qli reprocess
   qli reprocess --source src_123
-  qli reprocess --document doc_123`).action(async function command(options) {
+  qli reprocess --document doc_123
+  qli reprocess --silent`).action(async function command(options) {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
-    const result2 = await reprocessDocuments({ workspacePath: workspace, sourceId: options.source, documentId: options.document });
-    emit(global.json, capture, response("reprocess", workspace, result2), `Reprocessed ${result2.documentsReprocessed} documents`);
+    const result = await reprocessDocuments({
+      workspacePath: workspace,
+      sourceId: options.source,
+      documentId: options.document,
+      progress: createProgressHandler(capture, global)
+    });
+    emit(global.json, capture, response("reprocess", workspace, result), `Reprocessed ${result.documentsReprocessed} documents`);
   });
   const index = program.command("index");
   index.description("Build and inspect retrieval indexes.");
@@ -3254,33 +4095,47 @@ Examples:
 Examples:
   qli index build
   qli index build --dense
-  qli index build --dense --sparse`).action(async function command(options) {
+  qli index build --dense --sparse
+  qli index build --silent`).action(async function command(options) {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
-    const result2 = await buildIndex({
+    const result = await buildIndex({
       workspacePath: workspace,
       denseOverride: options.dense ? true : void 0,
-      sparseOverride: options.sparse ? true : void 0
+      sparseOverride: options.sparse ? true : void 0,
+      progress: createProgressHandler(capture, global)
     });
-    emit(global.json, capture, response("index build", workspace, result2), `Built index at ${result2.indexPath}`);
+    emit(global.json, capture, response("index build", workspace, result), `Built index at ${result.indexPath}`);
   });
   program.command("rebuild").description("Run ingest, chunk, and index build in one command.").option("--source <sourceId>", "Only rebuild data for one source.").option("--changed-only", "Only ingest changed content before chunking and indexing.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
 Examples:
   qli rebuild
   qli rebuild --changed-only
   qli rebuild --source src_123
-  qli rebuild --dense --sparse`).action(async function command(options) {
+  qli rebuild --dense --sparse
+  qli rebuild --silent`).action(async function command(options) {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
-    const ingest = await ingestSources({ workspacePath: workspace, sourceIds: options.source ? [options.source] : void 0, changedOnly: Boolean(options.changedOnly) });
-    const chunk = await chunkDocuments({ workspacePath: workspace, sourceId: options.source });
+    const progress = createProgressHandler(capture, global);
+    progress?.("info", "Rebuild step 1/3: ingest");
+    const ingest = await ingestSources({
+      workspacePath: workspace,
+      sourceIds: options.source ? [options.source] : void 0,
+      changedOnly: Boolean(options.changedOnly),
+      progress
+    });
+    progress?.("info", "Rebuild step 2/3: chunk");
+    const chunk = await chunkDocuments({ workspacePath: workspace, sourceId: options.source, progress });
+    progress?.("info", "Rebuild step 3/3: index");
     const indexBuild = await buildIndex({
       workspacePath: workspace,
       denseOverride: options.dense ? true : void 0,
       sparseOverride: options.sparse ? true : void 0,
-      buildAvailableModels: true
+      buildAvailableModels: true,
+      progress
     });
     const data = { ingest, chunk, indexPath: indexBuild.indexPath, metadata: indexBuild.metadata };
+    progress?.("info", "Rebuild complete");
     emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
   });
   program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this 
date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
@@ -3291,7 +4146,7 @@ Examples:
   qli search --source-name "Release Feed,Company Blog" --uri-prefix https://example.com/news,https://example.com/blog
   qli search "billing" --metadata team=support
   qli search "embedding model" --retrieval hybrid --show-chunks
-  qli search --source-type rss,url --top-k 25 --json
+  qli search --source-type rss,page --top-k 25 --json
 Notes:
   lexical works without vector models.
@@ -3299,7 +4154,7 @@ Notes:
   When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
-    const result2 = await searchIndex({
+    const result = await searchIndex({
       workspacePath: workspace,
       query: query ?? "",
       topK: Number(options.topK),
@@ -3314,7 +4169,7 @@ Notes:
       retrievalMode: parseRetrievalMode(options.retrieval),
       showChunks: Boolean(options.showChunks)
     });
-    emit(global.json, capture, response("search", workspace, result2), formatSearchResults(result2.results));
+    emit(global.json, capture, response("search", workspace, result), formatSearchResults(result.results));
   });
   program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
 Examples:
@@ -3326,12 +4181,12 @@ Dense vectors usually produce better related-document results. Pull models and r
   qli rebuild --dense`).action(async function command(document, options) {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
-    const result2 = await findRelatedDocuments({
+    const result = await findRelatedDocuments({
       workspacePath: workspace,
       document,
       topK: Number(options.topK)
     });
-    emit(global.json, capture, response("related", workspace, result2), formatRelatedDocuments(result2.results));
+    emit(global.json, capture, response("related", workspace, result), formatRelatedDocuments(result.results));
   });
   program.command("context").description("Assemble retrieval context for an external LLM, agent, or prompt pipeline.").argument("<query>").option("--top-k <n>", "Maximum number of source passages to consider.", "12").option("--max-chars <n>", "Maximum output length for the rendered context block.", "12000").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).addHelpText("after", `
 Examples:
@@ -3342,14 +4197,14 @@ Examples:
 Use --json when another tool needs structured access to the raw passages and metadata.`).action(async function command(query, options) {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
-    const result2 = await createContext({
+    const result = await createContext({
       workspacePath: workspace,
       query,
       topK: Number(options.topK),
       maxChars: Number(options.maxChars),
       retrievalMode: parseRetrievalMode(options.retrieval)
     });
-    emit(global.json, capture, response("context", workspace, result2), result2.markdown);
+    emit(global.json, capture, response("context", workspace, result), result.markdown);
   });
   const models = program.command("models");
   models.description("Inspect and download retrieval model assets.");
@@ -3358,7 +4213,9 @@ Examples:
   qli models pull
   qli models pull --dense
   qli models pull --sparse
+  qli models pull --silent
+Pulled model assets are shared under ~/.qli by default.
 If you plan to use related, dense search, or hybrid retrieval, pull the models and rebuild the index first.`).action(async function command(options) {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
@@ -3369,17 +4226,27 @@ If you plan to use related, dense search, or hybrid retrieval, pull the models a
       pullSparseFlag: Boolean(options.sparse),
       uvAvailable: status.sparse.uvAvailable
     });
-    await pullModels({ workspacePath: workspace, config, pullDense, pullSparse });
+    await pullModels({ workspacePath: workspace, config, pullDense, pullSparse, progress: createProgressHandler(capture, global) });
     const data = {
-      dense: pullDense ? { pulled: true, modelId: config.retrieval.dense.modelId, cacheDir: config.retrieval.dense.cacheDir } : void 0,
-      sparse: pullSparse ? { pulled: true, modelId: config.retrieval.sparse.modelId, cacheDir: config.retrieval.sparse.cacheDir } : void 0
+      dense: pullDense ? {
+        pulled: true,
+        modelId: config.retrieval.dense.modelId,
+        cacheDir: resolveCacheDir(workspace, config.retrieval.dense.cacheDir)
+      } : void 0,
+      sparse: pullSparse ? {
+        pulled: true,
+        modelId: config.retrieval.sparse.modelId,
+        cacheDir: resolveCacheDir(workspace, config.retrieval.sparse.cacheDir)
+      } : void 0
     };
     emit(global.json, capture, response("models pull", workspace, data), "Pulled available models");
   });
-  models.command("status").description("Show whether model runtimes and artifacts are available in the workspace.").addHelpText("after", `
+  models.command("status").description("Show whether shared model assets, runtimes, and workspace vector artifacts are available.").addHelpText("after", `
 Examples:
   qli models status
-  qli models status --json`).action(async function command() {
+  qli models status --json
+The cacheDir fields show the resolved model cache path for the current workspace config.`).action(async function command() {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
     const config = await loadConfig(workspace, global.config);
@@ -3394,8 +4261,8 @@ Examples:
   qli diff --since 2026-05-01`).action(async function command(options) {
     const global = this.optsWithGlobals();
     const workspace = await resolveWorkspace({ workspace: global.workspace });
-    const result2 = await diffWorkspace({ workspacePath: workspace, sourceId: options.source, documentId: options.document, since: options.since });
-    emit(global.json, capture, response("diff", workspace, result2), JSON.stringify(result2, null, 2));
+    const result = await diffWorkspace({ workspacePath: workspace, sourceId: options.source, documentId: options.document, since: options.since });
+    emit(global.json, capture, response("diff", workspace, result), JSON.stringify(result, null, 2));
   });
   const report = program.command("report");
   report.description("Render higher-level reports from workspace data.");
@@ -3427,7 +4294,7 @@ Examples:
     try {
       const meta = await readLatestIndexMetadata(workspace);
       latestIndex = meta.createdAt;
-      indexSize = (await stat4(`${workspace}/indexes/latest.json`)).size;
+      indexSize = (await stat4(await resolveLatestIndexArtifactPath(workspace))).size;
     } catch {
       latestIndex = void 0;
     }
@@ -3476,8 +4343,11 @@ Examples:
       checks.push("dense runtime importable");
     }
     if (config.retrieval.sparse.enabled) {
-      await ensureUvAvailable();
-      checks.push("uv available for sparse runtime");
+      if (await isUvAvailable()) {
+        checks.push("uv available for sparse runtime");
+      } else {
+        checks.push("uv missing for sparse runtime");
+      }
     }
     try {
       await readLatestIndexMetadata(workspace);
@@ -3511,13 +4381,21 @@ function emit(asJson, capture, body, human) {
 }
 // src/cli/main.ts
-var result = await runCli(process.argv.slice(2));
-if (result.stdout) {
-  process.stdout.write(`${result.stdout}
+try {
+  const result = await runCli(process.argv.slice(2), {
+    onStdout(value) {
+      process.stdout.write(`${value}
 `);
-}
-if (result.stderr) {
-  process.stderr.write(`${result.stderr}
+    },
+    onStderr(value) {
+      process.stderr.write(`${value}
+`);
+    }
+  });
+  process.exitCode = result.exitCode;
+} catch (error) {
+  const message = error instanceof Error ? error.stack ?? error.message : String(error);
+  process.stderr.write(`${message}
 `);
+  process.exitCode = 1;
 }
-process.exit(result.exitCode);