@arabold/docs-mcp-server 1.26.2 → 1.27.1

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
package/dist/index.js CHANGED
@@ -6,7 +6,7 @@ import { VertexAIEmbeddings } from "@langchain/google-vertexai";
6
6
  import { AzureOpenAIEmbeddings, OpenAIEmbeddings } from "@langchain/openai";
7
7
  import { Embeddings } from "@langchain/core/embeddings";
8
8
  import { PostHog } from "posthog-node";
9
- import { randomUUID } from "node:crypto";
9
+ import crypto, { randomUUID } from "node:crypto";
10
10
  import fs, { existsSync, readFileSync } from "node:fs";
11
11
  import path from "node:path";
12
12
  import { fileURLToPath, URL as URL$1 } from "node:url";
@@ -27,6 +27,7 @@ import psl from "psl";
27
27
  import { HeaderGenerator } from "header-generator";
28
28
  import fs$1 from "node:fs/promises";
29
29
  import axios from "axios";
30
+ import { minimatch } from "minimatch";
30
31
  import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
31
32
  import remarkGfm from "remark-gfm";
32
33
  import remarkHtml from "remark-html";
@@ -40,7 +41,6 @@ import * as cheerio from "cheerio";
40
41
  import "node:vm";
41
42
  import { gfm } from "@joplin/turndown-plugin-gfm";
42
43
  import iconv from "iconv-lite";
43
- import { minimatch } from "minimatch";
44
44
  import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
45
45
  import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
46
46
  import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
@@ -113,21 +113,6 @@ class MissingCredentialsError extends StoreError {
113
113
  }
114
114
  }
115
115
  const VECTOR_DIMENSION = 1536;
116
- function mapDbDocumentToDocument(doc) {
117
- const chunkMetadata = JSON.parse(doc.metadata);
118
- return {
119
- id: doc.id,
120
- pageContent: doc.content,
121
- metadata: {
122
- ...chunkMetadata,
123
- // Page-level fields are always available from joined queries
124
- url: doc.url,
125
- title: doc.title || "",
126
- // Convert null to empty string for consistency
127
- ...doc.content_type && { contentType: doc.content_type }
128
- }
129
- };
130
- }
131
116
  var VersionStatus = /* @__PURE__ */ ((VersionStatus2) => {
132
117
  VersionStatus2["NOT_INDEXED"] = "not_indexed";
133
118
  VersionStatus2["QUEUED"] = "queued";
@@ -784,16 +769,16 @@ function extractProtocol(urlOrPath) {
784
769
  }
785
770
  }
786
771
  const name = "@arabold/docs-mcp-server";
787
- const version = "1.26.1";
772
+ const version = "1.27.0";
788
773
  const description = "MCP server for fetching and searching documentation";
789
774
  const type = "module";
790
775
  const bin = { "docs-mcp-server": "dist/index.js" };
791
776
  const license = "MIT";
792
777
  const repository = { "type": "git", "url": "git+https://github.com/arabold/docs-mcp-server.git" };
793
778
  const files = ["dist", "public", "db", "README.md", "LICENSE", "package.json"];
794
- const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev": "npm-run-all --parallel dev:server dev:web", "dev:server": "vite-node --watch src/index.ts", "dev:web": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "test:e2e": "vitest run --config test/vitest.config.ts", "test:e2e:watch": "vitest --config test/vitest.config.ts", "lint": "biome check .", "lint:fix": "biome check . --fix", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. See README.md for details.'" };
795
- const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.2.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", "@langchain/aws": "^0.1.13", "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", "@modelcontextprotocol/sdk": "^1.17.1", "@trpc/client": "^11.4.4", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", "axios": "^1.11.0", "axios-retry": "^4.5.0", "better-sqlite3": "^12.2.0", "cheerio": "^1.1.2", "commander": "^14.0.0", "dompurify": "^3.2.6", "dotenv": "^17.2.1", "env-paths": "^3.0.0", "fastify": "^5.4.0", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", "header-generator": "^2.1.69", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", "jose": "^6.0.12", "jsdom": "^26.1.0", "langchain": "^0.3.30", "mime": "^4.0.7", "minimatch": "^10.0.1", "playwright": "^1.52.0", "posthog-node": "^5.7.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", "semver": "^7.7.2", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", "turndown": "^7.2.0", "zod": "^4.0.14" };
796
- const devDependencies = { "@biomejs/biome": "^2.1.3", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^11.0.3", "@semantic-release/npm": "^12.0.2", "@tailwindcss/postcss": "^4.1.11", "@tailwindcss/vite": "^4.1.11", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", "@types/lint-staged": "~13.3.0", "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", "@types/semver": "^7.7.0", "@types/turndown": "^5.0.5", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", "lint-staged": "^16.1.2", "memfs": "^4.34.0", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", "semantic-release": "^24.2.7", "tailwindcss": "^4.1.4", "typescript": "^5.9.2", "vite": "^6.3.5", "vite-node": "^3.1.2", "vite-plugin-dts": "^4.5.4", "vitest": "^3.2.4" };
779
+ const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev": "npm-run-all --parallel dev:server dev:web", "dev:server": "vite-node --watch src/index.ts", "dev:web": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "test:unit": "vitest run src", "test:e2e": "vitest run test", "test:live": "vitest run --exclude= test/html-pipeline-live-e2e.test.ts", "lint": "biome check .", "lint:fix": "biome check . --fix", "typecheck": "npx tsc --noEmit", "typecheck:build": "npx tsc --noEmit --project tsconfig.build.json", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. See README.md for details.'" };
780
+ const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.3.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", "@langchain/aws": "^0.1.13", "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", "@modelcontextprotocol/sdk": "^1.20.2", "@trpc/client": "^11.7.1", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", "axios": "^1.13.1", "axios-retry": "^4.5.0", "better-sqlite3": "^12.4.1", "cheerio": "^1.1.2", "commander": "^14.0.0", "dompurify": "^3.3.0", "dotenv": "^17.2.3", "env-paths": "^3.0.0", "fastify": "^5.6.1", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", "header-generator": "^2.1.76", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", "jose": "^6.1.0", "jsdom": "^26.1.0", "langchain": "^0.3.30", "mime": "^4.1.0", "minimatch": "^10.0.1", "playwright": "^1.52.0", "posthog-node": "^5.11.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", "semver": "^7.7.3", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", "turndown": "^7.2.2", "zod": "^4.1.12" };
781
+ const devDependencies = { "@biomejs/biome": "^2.3.2", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^11.0.6", "@semantic-release/npm": "^12.0.2", "@tailwindcss/postcss": "^4.1.16", "@tailwindcss/vite": "^4.1.16", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", "@types/lint-staged": "~13.3.0", "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", "@types/semver": "^7.7.1", "@types/turndown": "^5.0.6", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", "lint-staged": "^16.2.6", "memfs": "^4.50.0", "msw": "^2.12.2", "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", "semantic-release": "^24.2.9", "tailwindcss": "^4.1.4", "typescript": "^5.9.3", "vite": "^6.3.5", "vite-node": "^3.2.4", "vite-plugin-dts": "^4.5.4", "vitest": "^3.2.4" };
797
782
  const engines = { "node": ">=20.0.0" };
798
783
  const packageJson = {
799
784
  name,
@@ -1288,10 +1273,10 @@ class PipelineClient {
1288
1273
  this.activePolling.clear();
1289
1274
  logger.debug("PipelineClient stopped");
1290
1275
  }
1291
- async enqueueJob(library, version2, options) {
1276
+ async enqueueScrapeJob(library, version2, options) {
1292
1277
  try {
1293
1278
  const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
1294
- const result = await this.client.enqueueJob.mutate({
1279
+ const result = await this.client.enqueueScrapeJob.mutate({
1295
1280
  library,
1296
1281
  version: normalizedVersion,
1297
1282
  options
@@ -1304,6 +1289,21 @@ class PipelineClient {
1304
1289
  );
1305
1290
  }
1306
1291
  }
1292
+ async enqueueRefreshJob(library, version2) {
1293
+ try {
1294
+ const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
1295
+ const result = await this.client.enqueueRefreshJob.mutate({
1296
+ library,
1297
+ version: normalizedVersion
1298
+ });
1299
+ logger.debug(`Refresh job ${result.jobId} enqueued successfully`);
1300
+ return result.jobId;
1301
+ } catch (error) {
1302
+ throw new Error(
1303
+ `Failed to enqueue refresh job: ${error instanceof Error ? error.message : String(error)}`
1304
+ );
1305
+ }
1306
+ }
1307
1307
  async getJob(jobId) {
1308
1308
  try {
1309
1309
  const serializedJob = await this.client.getJob.query({ id: jobId });
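The hunk above renames `enqueueJob` to `enqueueScrapeJob` and adds `enqueueRefreshJob`; both are thin wrappers over tRPC mutations of the same names and resolve to a job id (shown for the refresh variant, assumed for the scrape variant). A minimal usage sketch follows; `PipelineClient` is internal to dist/index.js, so its relevant surface is modeled as an interface, and the option fields and URL are illustrative assumptions.

```ts
// Sketch only: method names and argument order mirror the diff; the options
// fields and the URL are assumptions, not the package's documented API.
interface PipelineClientLike {
  enqueueScrapeJob(
    library: string,
    version: string | null,
    options: { url: string; maxPages?: number }, // assumed ScraperOptions subset
  ): Promise<string>;
  enqueueRefreshJob(library: string, version: string | null): Promise<string>;
}

async function indexThenRefresh(client: PipelineClientLike): Promise<void> {
  // Initial indexing: a full scrape of the documentation site.
  const scrapeJobId = await client.enqueueScrapeJob("react", "18.2.0", {
    url: "https://react.dev/reference", // hypothetical URL
    maxPages: 100,
  });
  console.log(`scrape job queued: ${scrapeJobId}`);

  // Later: re-check already indexed pages; unchanged pages (ETag match / 304)
  // are skipped by the refresh pipeline.
  const refreshJobId = await client.enqueueRefreshJob("react", "18.2.0");
  console.log(`refresh job queued: ${refreshJobId}`);
}
```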
@@ -1753,6 +1753,12 @@ class FingerprintGenerator {
1753
1753
  return this.headerGenerator.getHeaders();
1754
1754
  }
1755
1755
  }
1756
+ var FetchStatus = /* @__PURE__ */ ((FetchStatus2) => {
1757
+ FetchStatus2["SUCCESS"] = "success";
1758
+ FetchStatus2["NOT_MODIFIED"] = "not_modified";
1759
+ FetchStatus2["NOT_FOUND"] = "not_found";
1760
+ return FetchStatus2;
1761
+ })(FetchStatus || {});
1756
1762
  class BrowserFetcher {
1757
1763
  browser = null;
1758
1764
  page = null;
@@ -1792,13 +1798,16 @@ class BrowserFetcher {
1792
1798
  const contentBuffer = Buffer.from(content, "utf-8");
1793
1799
  const contentType = response.headers()["content-type"] || "text/html";
1794
1800
  const { mimeType, charset } = MimeTypeUtils.parseContentType(contentType);
1801
+ const etag = response.headers().etag;
1795
1802
  return {
1796
1803
  content: contentBuffer,
1797
1804
  mimeType,
1798
1805
  charset,
1799
1806
  encoding: void 0,
1800
1807
  // Browser handles encoding automatically
1801
- source: finalUrl
1808
+ source: finalUrl,
1809
+ etag,
1810
+ status: FetchStatus.SUCCESS
1802
1811
  };
1803
1812
  } catch (error) {
1804
1813
  if (options?.signal?.aborted) {
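The new `FetchStatus` enum distinguishes a successful fetch from a conditional-fetch short circuit (`NOT_MODIFIED`) and a deleted resource (`NOT_FOUND`), and every fetcher now attaches a `status` to its result. A small sketch of how a consumer might branch on it; the enum values are copied from the diff, while the `RawContentLike` shape is reduced to the fields visible here.

```ts
// Values copied from the diff; the enum itself is not exported by the bundle.
enum FetchStatus {
  SUCCESS = "success",
  NOT_MODIFIED = "not_modified",
  NOT_FOUND = "not_found",
}

interface RawContentLike {
  content: Buffer;
  mimeType: string;
  source: string;
  status: FetchStatus;
  etag?: string;
  lastModified?: string;
}

function describeFetch(result: RawContentLike): string {
  switch (result.status) {
    case FetchStatus.SUCCESS:
      return `fetched ${result.content.length} bytes (etag: ${result.etag ?? "none"})`;
    case FetchStatus.NOT_MODIFIED:
      return "unchanged since last index; existing chunks can be kept";
    case FetchStatus.NOT_FOUND:
      return "gone (404/ENOENT); the page can be marked as deleted";
  }
}
```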
@@ -1859,24 +1868,48 @@ class FileFetcher {
1859
1868
  /**
1860
1869
  * Fetches the content of a file given a file:// URL, decoding percent-encoded paths as needed.
1861
1870
  * Uses enhanced MIME type detection for better source code file recognition.
1871
+ * Supports conditional fetching via ETag comparison for efficient refresh operations.
1862
1872
  */
1863
- async fetch(source, _options) {
1873
+ async fetch(source, options) {
1864
1874
  let filePath = source.replace(/^file:\/\/\/?/, "");
1865
1875
  filePath = decodeURIComponent(filePath);
1866
1876
  if (!filePath.startsWith("/") && process.platform !== "win32") {
1867
1877
  filePath = `/${filePath}`;
1868
1878
  }
1869
1879
  try {
1880
+ const stats = await fs$1.stat(filePath);
1881
+ const currentEtag = crypto.createHash("md5").update(stats.mtime.toISOString()).digest("hex");
1882
+ if (options?.etag && options.etag === currentEtag) {
1883
+ return {
1884
+ content: Buffer.from(""),
1885
+ mimeType: "text/plain",
1886
+ source,
1887
+ etag: currentEtag,
1888
+ lastModified: stats.mtime.toISOString(),
1889
+ status: FetchStatus.NOT_MODIFIED
1890
+ };
1891
+ }
1870
1892
  const content = await fs$1.readFile(filePath);
1871
1893
  const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
1872
1894
  const mimeType = detectedMimeType || "application/octet-stream";
1873
1895
  return {
1874
1896
  content,
1875
1897
  mimeType,
1876
- source
1898
+ source,
1899
+ etag: currentEtag,
1900
+ lastModified: stats.mtime.toISOString(),
1901
+ status: FetchStatus.SUCCESS
1877
1902
  // Don't assume charset for text files - let the pipeline detect it
1878
1903
  };
1879
1904
  } catch (error) {
1905
+ if (error.code === "ENOENT") {
1906
+ return {
1907
+ content: Buffer.from(""),
1908
+ mimeType: "text/plain",
1909
+ source,
1910
+ status: FetchStatus.NOT_FOUND
1911
+ };
1912
+ }
1880
1913
  throw new ScraperError(
1881
1914
  `Failed to read file ${filePath}: ${error.message ?? "Unknown error"}`,
1882
1915
  false,
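For `file://` sources the fetcher now derives a pseudo-ETag from the file's modification time (an MD5 hash of `mtime.toISOString()`), so a refresh can skip unchanged files without re-reading them. A standalone sketch of that check using the same hashing scheme as the diff; the function names are illustrative.

```ts
import crypto from "node:crypto";
import { promises as fs } from "node:fs";

// Same scheme as the diff: hash the mtime string, not the file contents.
async function fileEtag(filePath: string): Promise<string> {
  const stats = await fs.stat(filePath);
  return crypto.createHash("md5").update(stats.mtime.toISOString()).digest("hex");
}

// True when the stored etag still matches, i.e. the file is unchanged and a
// NOT_MODIFIED result would be returned instead of re-reading and re-splitting it.
async function isUnchanged(filePath: string, storedEtag?: string): Promise<boolean> {
  if (!storedEtag) return false;
  return (await fileEtag(filePath)) === storedEtag;
}
```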
@@ -1982,6 +2015,12 @@ class HttpFetcher {
1982
2015
  ...options?.headers
1983
2016
  // User-provided headers override generated ones
1984
2017
  };
2018
+ if (options?.etag) {
2019
+ headers["If-None-Match"] = options.etag;
2020
+ logger.debug(
2021
+ `Conditional request for ${source} with If-None-Match: ${options.etag}`
2022
+ );
2023
+ }
1985
2024
  const config = {
1986
2025
  responseType: "arraybuffer",
1987
2026
  headers: {
@@ -1995,9 +2034,22 @@ class HttpFetcher {
1995
2034
  // Pass signal to axios
1996
2035
  // Axios follows redirects by default, we need to explicitly disable it if needed
1997
2036
  maxRedirects: followRedirects ? 5 : 0,
1998
- decompress: true
2037
+ decompress: true,
2038
+ // Allow 304 responses to be handled as successful responses
2039
+ validateStatus: (status) => {
2040
+ return status >= 200 && status < 300 || status === 304;
2041
+ }
1999
2042
  };
2000
2043
  const response = await axios.get(source, config);
2044
+ if (response.status === 304) {
2045
+ logger.debug(`HTTP 304 Not Modified for ${source}`);
2046
+ return {
2047
+ content: Buffer.from(""),
2048
+ mimeType: "text/plain",
2049
+ source,
2050
+ status: FetchStatus.NOT_MODIFIED
2051
+ };
2052
+ }
2001
2053
  const contentTypeHeader = response.headers["content-type"];
2002
2054
  const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader);
2003
2055
  const contentEncoding = response.headers["content-encoding"];
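The HTTP fetcher now sends `If-None-Match` when a stored ETag is available and widens `validateStatus` so axios treats `304 Not Modified` as a success rather than throwing. A minimal standalone axios sketch of the same round trip; the URL and variable names are illustrative.

```ts
import axios from "axios";

// Conditional GET: send the previously stored ETag and accept 304 as success,
// mirroring the validateStatus override added in this release.
async function conditionalGet(url: string, storedEtag?: string) {
  const response = await axios.get(url, {
    responseType: "arraybuffer",
    headers: storedEtag ? { "If-None-Match": storedEtag } : {},
    validateStatus: (status) => (status >= 200 && status < 300) || status === 304,
  });

  if (response.status === 304) {
    return { status: "not_modified" as const };
  }
  return {
    status: "success" as const,
    content: Buffer.from(response.data),
    etag: response.headers.etag as string | undefined,
  };
}
```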
@@ -2017,12 +2069,21 @@ class HttpFetcher {
2017
2069
  response.request?.responseUrl || // Fallback to axios recorded config URL
2018
2070
  response.config?.url || source
2019
2071
  );
2072
+ const etag = response.headers.etag || response.headers.ETag;
2073
+ if (etag) {
2074
+ logger.debug(`Received ETag for ${source}: ${etag}`);
2075
+ }
2076
+ const lastModified = response.headers["last-modified"];
2077
+ const lastModifiedISO = lastModified ? new Date(lastModified).toISOString() : void 0;
2020
2078
  return {
2021
2079
  content,
2022
2080
  mimeType,
2023
2081
  charset,
2024
2082
  encoding: contentEncoding,
2025
- source: finalUrl
2083
+ source: finalUrl,
2084
+ etag,
2085
+ lastModified: lastModifiedISO,
2086
+ status: FetchStatus.SUCCESS
2026
2087
  };
2027
2088
  } catch (error) {
2028
2089
  const axiosError = error;
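Besides the payload, successful HTTP responses now carry `etag` and `lastModified` (normalized to ISO 8601), which are persisted per page so the next refresh can issue conditional requests. A small sketch of that bookkeeping; the in-memory map is a stand-in for the real SQLite-backed store.

```ts
interface PageValidators {
  etag?: string;
  lastModified?: string; // ISO 8601, as normalized by the fetcher
}

// Stand-in for the real per-page store; keyed by URL.
const pageCache = new Map<string, PageValidators>();

function rememberValidators(url: string, v: PageValidators): void {
  pageCache.set(url, v);
}

function validatorsFor(url: string): PageValidators {
  // Fed back into the fetcher (e.g. as options.etag) on the next refresh run.
  return pageCache.get(url) ?? {};
}
```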
@@ -2031,6 +2092,15 @@ class HttpFetcher {
2031
2092
  if (options?.signal?.aborted || code === "ERR_CANCELED") {
2032
2093
  throw new CancellationError("HTTP fetch cancelled");
2033
2094
  }
2095
+ if (status === 404) {
2096
+ logger.debug(`Resource not found (404): ${source}`);
2097
+ return {
2098
+ content: Buffer.from(""),
2099
+ mimeType: "text/plain",
2100
+ source,
2101
+ status: FetchStatus.NOT_FOUND
2102
+ };
2103
+ }
2034
2104
  if (!followRedirects && status && status >= 300 && status < 400) {
2035
2105
  const location = axiosError.response?.headers?.location;
2036
2106
  if (location) {
@@ -2125,101 +2195,522 @@ class AutoDetectFetcher {
2125
2195
  ]);
2126
2196
  }
2127
2197
  }
2128
- class SplitterError extends Error {
2129
- }
2130
- class MinimumChunkSizeError extends SplitterError {
2131
- constructor(size, maxSize) {
2132
- super(
2133
- `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
2134
- );
2198
+ const DEFAULT_FILE_EXCLUSIONS = [
2199
+ // CHANGELOG files (case variations)
2200
+ "**/CHANGELOG.md",
2201
+ "**/changelog.md",
2202
+ "**/CHANGELOG.mdx",
2203
+ "**/changelog.mdx",
2204
+ // LICENSE files (case variations)
2205
+ "**/LICENSE",
2206
+ "**/LICENSE.md",
2207
+ "**/license.md",
2208
+ // CODE_OF_CONDUCT files (case variations)
2209
+ "**/CODE_OF_CONDUCT.md",
2210
+ "**/code_of_conduct.md",
2211
+ // Test files
2212
+ "**/*.test.*",
2213
+ "**/*.spec.*",
2214
+ "**/*_test.py",
2215
+ "**/*_test.go",
2216
+ // Package manager lock files
2217
+ "**/*.lock",
2218
+ "**/package-lock.json",
2219
+ "**/yarn.lock",
2220
+ "**/pnpm-lock.yaml",
2221
+ "**/go.sum",
2222
+ // Build artifacts
2223
+ "**/*.min.js",
2224
+ "**/*.min.css",
2225
+ "**/*.map",
2226
+ "**/*.d.ts",
2227
+ // IDE/System files
2228
+ "**/.DS_Store",
2229
+ "**/Thumbs.db",
2230
+ "**/*.swp",
2231
+ "**/*.swo",
2232
+ // Internal config files (using regex pattern)
2233
+ "/.*\\.(ini|cfg|conf|log|pid)$/"
2234
+ ];
2235
+ const DEFAULT_FOLDER_EXCLUSIONS = [
2236
+ // Archive and deprecated content (matches anywhere in path)
2237
+ "**/archive/**",
2238
+ "**/archived/**",
2239
+ "**/deprecated/**",
2240
+ "**/legacy/**",
2241
+ "**/old/**",
2242
+ "**/outdated/**",
2243
+ "**/previous/**",
2244
+ "**/superseded/**",
2245
+ // Specific paths that don't follow the general pattern
2246
+ "docs/old/**",
2247
+ // Test directories
2248
+ "**/test/**",
2249
+ "**/tests/**",
2250
+ "**/__tests__/**",
2251
+ "**/spec/**",
2252
+ // Build output directories
2253
+ "**/dist/**",
2254
+ "**/build/**",
2255
+ "**/out/**",
2256
+ "**/target/**",
2257
+ "**/.next/**",
2258
+ "**/.nuxt/**",
2259
+ // IDE directories
2260
+ "**/.vscode/**",
2261
+ "**/.idea/**",
2262
+ // Internationalization folders - non-English locales
2263
+ "**/i18n/ar*/**",
2264
+ "**/i18n/de*/**",
2265
+ "**/i18n/es*/**",
2266
+ "**/i18n/fr*/**",
2267
+ "**/i18n/hi*/**",
2268
+ "**/i18n/it*/**",
2269
+ "**/i18n/ja*/**",
2270
+ "**/i18n/ko*/**",
2271
+ "**/i18n/nl*/**",
2272
+ "**/i18n/pl*/**",
2273
+ "**/i18n/pt*/**",
2274
+ "**/i18n/ru*/**",
2275
+ "**/i18n/sv*/**",
2276
+ "**/i18n/th*/**",
2277
+ "**/i18n/tr*/**",
2278
+ "**/i18n/vi*/**",
2279
+ "**/i18n/zh*/**",
2280
+ // Common locale folder patterns
2281
+ "**/zh-cn/**",
2282
+ "**/zh-hk/**",
2283
+ "**/zh-mo/**",
2284
+ "**/zh-sg/**",
2285
+ "**/zh-tw/**"
2286
+ ];
2287
+ const DEFAULT_EXCLUSION_PATTERNS = [
2288
+ ...DEFAULT_FILE_EXCLUSIONS,
2289
+ ...DEFAULT_FOLDER_EXCLUSIONS
2290
+ ];
2291
+ function getEffectiveExclusionPatterns(userPatterns) {
2292
+ if (userPatterns !== void 0) {
2293
+ return userPatterns;
2135
2294
  }
2295
+ return DEFAULT_EXCLUSION_PATTERNS;
2136
2296
  }
2137
- class ContentSplitterError extends SplitterError {
2297
+ function isRegexPattern(pattern) {
2298
+ return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
2138
2299
  }
2139
- class GreedySplitter {
2140
- baseSplitter;
2141
- minChunkSize;
2142
- preferredChunkSize;
2143
- /**
2144
- * Combines a base document splitter with size constraints to produce optimally-sized chunks.
2145
- * The base splitter handles the initial semantic splitting, while this class handles
2146
- * the concatenation strategy.
2147
- */
2148
- constructor(baseSplitter, minChunkSize, preferredChunkSize) {
2149
- this.baseSplitter = baseSplitter;
2150
- this.minChunkSize = minChunkSize;
2151
- this.preferredChunkSize = preferredChunkSize;
2300
+ function patternToRegExp(pattern) {
2301
+ if (isRegexPattern(pattern)) {
2302
+ return new RegExp(pattern.slice(1, -1));
2152
2303
  }
2153
- /**
2154
- * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
2155
- * are combined until they reach the minimum size, but splits are preserved at major
2156
- * section boundaries to maintain document structure. This balances the need for
2157
- * context with semantic coherence.
2158
- */
2159
- async splitText(markdown, contentType) {
2160
- const initialChunks = await this.baseSplitter.splitText(markdown, contentType);
2161
- const concatenatedChunks = [];
2162
- let currentChunk = null;
2163
- for (const nextChunk of initialChunks) {
2164
- if (currentChunk) {
2165
- if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
2166
- concatenatedChunks.push(currentChunk);
2167
- currentChunk = this.cloneChunk(nextChunk);
2168
- continue;
2169
- }
2170
- if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
2171
- concatenatedChunks.push(currentChunk);
2172
- currentChunk = this.cloneChunk(nextChunk);
2173
- continue;
2174
- }
2175
- currentChunk.content += `${currentChunk.content.endsWith("\n") ? "" : "\n"}${nextChunk.content}`;
2176
- currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
2177
- currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
2178
- } else {
2179
- currentChunk = this.cloneChunk(nextChunk);
2180
- }
2181
- }
2182
- if (currentChunk) {
2183
- concatenatedChunks.push(currentChunk);
2304
+ const re = minimatch.makeRe(pattern, { dot: true });
2305
+ if (!re) throw new Error(`Invalid glob pattern: ${pattern}`);
2306
+ return re;
2307
+ }
2308
+ function matchesAnyPattern(path2, patterns) {
2309
+ if (!patterns || patterns.length === 0) return false;
2310
+ const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
2311
+ return patterns.some((pattern) => {
2312
+ if (isRegexPattern(pattern)) {
2313
+ return patternToRegExp(pattern).test(normalizedPath);
2184
2314
  }
2185
- return concatenatedChunks;
2186
- }
2187
- cloneChunk(chunk) {
2188
- return {
2189
- types: [...chunk.types],
2190
- content: chunk.content,
2191
- section: {
2192
- level: chunk.section.level,
2193
- path: [...chunk.section.path]
2194
- }
2195
- };
2196
- }
2197
- /**
2198
- * H1 and H2 headings represent major conceptual breaks in the document.
2199
- * Preserving these splits helps maintain the document's logical structure.
2200
- */
2201
- startsNewMajorSection(chunk) {
2202
- return chunk.section.level === 1 || chunk.section.level === 2;
2315
+ const pathForMatch = normalizedPath.replace(/^\//, "");
2316
+ const patternForMatch = pattern.startsWith("/") ? pattern.slice(1) : pattern;
2317
+ return minimatch(pathForMatch, patternForMatch, { dot: true });
2318
+ });
2319
+ }
2320
+ function extractPathAndQuery(url) {
2321
+ try {
2322
+ const u = new URL(url);
2323
+ return u.pathname + (u.search || "");
2324
+ } catch {
2325
+ return url;
2203
2326
  }
2204
- /**
2205
- * Size limit check to ensure chunks remain within embedding model constraints.
2206
- * Essential for maintaining consistent embedding quality and avoiding truncation.
2207
- */
2208
- wouldExceedMaxSize(currentChunk, nextChunk) {
2209
- if (!currentChunk) {
2210
- return false;
2327
+ }
2328
+ function shouldIncludeUrl(url, includePatterns, excludePatterns) {
2329
+ const path2 = extractPathAndQuery(url);
2330
+ const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
2331
+ let basename;
2332
+ if (url.startsWith("file://")) {
2333
+ try {
2334
+ const u = new URL(url);
2335
+ basename = u.pathname ? u.pathname.split("/").pop() : void 0;
2336
+ } catch {
2211
2337
  }
2212
- return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
2213
2338
  }
2214
- /**
2215
- * Checks if one path is a prefix of another path, indicating a parent-child relationship
2216
- */
2217
- isPathIncluded(parentPath, childPath) {
2218
- if (parentPath.length >= childPath.length) return false;
2219
- return parentPath.every((part, i) => part === childPath[i]);
2339
+ const stripSlash = (patterns) => patterns?.map((p) => p.startsWith("/") ? p.slice(1) : p);
2340
+ const effectiveExcludePatterns = getEffectiveExclusionPatterns(excludePatterns);
2341
+ if (matchesAnyPattern(url, effectiveExcludePatterns) || matchesAnyPattern(normalizedPath, effectiveExcludePatterns) || basename && matchesAnyPattern(basename, stripSlash(effectiveExcludePatterns)))
2342
+ return false;
2343
+ if (!includePatterns || includePatterns.length === 0) return true;
2344
+ return matchesAnyPattern(url, includePatterns) || matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
2345
+ }
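Exclusion patterns accept two syntaxes: ordinary globs (matched with `minimatch`, `dot: true`) and regex literals written as `"/.../"` strings, as in `isRegexPattern` above. Note that `shouldIncludeUrl` applies the default exclusions only when the caller passes no `excludePatterns` at all; even an empty array replaces the defaults. A simplified sketch of how the two syntaxes behave against the default list; the sample paths are illustrative and the path normalization is reduced compared to `matchesAnyPattern`.

```ts
import { minimatch } from "minimatch";

// Patterns wrapped in slashes are treated as regular expressions, everything
// else as a glob - the same convention as isRegexPattern() above (simplified).
function matchesPattern(path: string, pattern: string): boolean {
  const isRegex = pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
  if (isRegex) {
    const normalized = path.startsWith("/") ? path : `/${path}`;
    return new RegExp(pattern.slice(1, -1)).test(normalized);
  }
  return minimatch(path.replace(/^\//, ""), pattern, { dot: true });
}

// Illustrative checks against entries from the default exclusion list:
matchesPattern("/docs/CHANGELOG.md", "**/CHANGELOG.md"); // true  -> excluded
matchesPattern("/docs/guide.md", "**/CHANGELOG.md");     // false -> kept
matchesPattern("/app/settings.ini", "/.*\\.(ini|cfg|conf|log|pid)$/"); // true -> excluded
```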
2346
+ function computeBaseDirectory(pathname) {
2347
+ if (pathname === "") return "/";
2348
+ if (pathname.endsWith("/")) return pathname;
2349
+ const lastSegment = pathname.split("/").at(-1) || "";
2350
+ const looksLikeFile = lastSegment.includes(".");
2351
+ if (looksLikeFile) {
2352
+ return pathname.replace(/\/[^/]*$/, "/");
2220
2353
  }
2221
- /**
2222
- * Merges section metadata when concatenating chunks, following these rules:
2354
+ return `${pathname}/`;
2355
+ }
2356
+ function isInScope(baseUrl, targetUrl, scope) {
2357
+ if (baseUrl.protocol !== targetUrl.protocol) return false;
2358
+ switch (scope) {
2359
+ case "subpages": {
2360
+ if (baseUrl.hostname !== targetUrl.hostname) return false;
2361
+ const baseDir = computeBaseDirectory(baseUrl.pathname);
2362
+ return targetUrl.pathname.startsWith(baseDir);
2363
+ }
2364
+ case "hostname":
2365
+ return baseUrl.hostname === targetUrl.hostname;
2366
+ case "domain": {
2367
+ return extractPrimaryDomain(baseUrl.hostname) === extractPrimaryDomain(targetUrl.hostname);
2368
+ }
2369
+ default:
2370
+ return false;
2371
+ }
2372
+ }
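`isInScope` gates link-following by the configured crawl scope: `subpages` keeps URLs under the starting page's directory (via `computeBaseDirectory`), `hostname` requires an exact host match, and `domain` only requires the same registrable domain. A self-contained restatement with a worked example; the real `extractPrimaryDomain` helper is not shown in this hunk, so a naive two-label suffix stands in for it, and the URLs are hypothetical.

```ts
type CrawlScope = "subpages" | "hostname" | "domain";

// Naive stand-in for extractPrimaryDomain (the real helper is not shown here).
function naivePrimaryDomain(hostname: string): string {
  return hostname.split(".").slice(-2).join(".");
}

function inScope(base: URL, target: URL, scope: CrawlScope): boolean {
  if (base.protocol !== target.protocol) return false;
  switch (scope) {
    case "subpages": {
      // Simplified computeBaseDirectory: strip a trailing file-like segment.
      const baseDir = base.pathname.endsWith("/")
        ? base.pathname
        : base.pathname.replace(/\/[^/]*$/, "/");
      return base.hostname === target.hostname && target.pathname.startsWith(baseDir);
    }
    case "hostname":
      return base.hostname === target.hostname;
    case "domain":
      return naivePrimaryDomain(base.hostname) === naivePrimaryDomain(target.hostname);
  }
}

const base = new URL("https://example.com/docs/v2/index.html"); // hypothetical
inScope(base, new URL("https://example.com/docs/v2/api/"), "subpages"); // true
inScope(base, new URL("https://example.com/blog/"), "subpages");        // false
inScope(base, new URL("https://api.example.com/docs/"), "hostname");    // false
inScope(base, new URL("https://api.example.com/docs/"), "domain");      // true
```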
2373
+ const DEFAULT_MAX_DEPTH = 3;
2374
+ const DEFAULT_CONCURRENCY = 3;
2375
+ class BaseScraperStrategy {
2376
+ /**
2377
+ * Set of normalized URLs that have been marked for processing.
2378
+ *
2379
+ * IMPORTANT: URLs are added to this set BEFORE they are actually processed, not after.
2380
+ * This prevents the same URL from being queued multiple times when discovered from different sources.
2381
+ *
2382
+ * Usage flow:
2383
+ * 1. Initial queue setup: Root URL and initialQueue items are added to visited
2384
+ * 2. During processing: When a page returns links, each link is checked against visited
2385
+ * 3. In processBatch deduplication: Only links NOT in visited are added to the queue AND to visited
2386
+ *
2387
+ * This approach ensures:
2388
+ * - No URL is processed more than once
2389
+ * - No URL appears in the queue multiple times
2390
+ * - Efficient deduplication across concurrent processing
2391
+ */
2392
+ visited = /* @__PURE__ */ new Set();
2393
+ pageCount = 0;
2394
+ totalDiscovered = 0;
2395
+ // Track total URLs discovered (unlimited)
2396
+ effectiveTotal = 0;
2397
+ // Track effective total (limited by maxPages)
2398
+ canonicalBaseUrl;
2399
+ options;
2400
+ constructor(options = {}) {
2401
+ this.options = options;
2402
+ }
2403
+ /**
2404
+ * Determines if a URL should be processed based on scope and include/exclude patterns in ScraperOptions.
2405
+ * Scope is checked first, then patterns.
2406
+ */
2407
+ shouldProcessUrl(url, options) {
2408
+ if (options.scope) {
2409
+ try {
2410
+ const base = this.canonicalBaseUrl ?? new URL$1(options.url);
2411
+ const target = new URL$1(url);
2412
+ if (!isInScope(base, target, options.scope)) return false;
2413
+ } catch {
2414
+ return false;
2415
+ }
2416
+ }
2417
+ return shouldIncludeUrl(url, options.includePatterns, options.excludePatterns);
2418
+ }
2419
+ async processBatch(batch, baseUrl, options, progressCallback, signal) {
2420
+ const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
2421
+ const results = await Promise.all(
2422
+ batch.map(async (item) => {
2423
+ if (signal?.aborted) {
2424
+ throw new CancellationError("Scraping cancelled during batch processing");
2425
+ }
2426
+ const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
2427
+ if (item.depth > maxDepth) {
2428
+ return [];
2429
+ }
2430
+ try {
2431
+ const result = await this.processItem(item, options, signal);
2432
+ const shouldCount = item.pageId !== void 0 || result.content !== void 0;
2433
+ let currentPageCount = this.pageCount;
2434
+ if (shouldCount) {
2435
+ currentPageCount = ++this.pageCount;
2436
+ logger.info(
2437
+ `🌐 Scraping page ${currentPageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`
2438
+ );
2439
+ }
2440
+ if (result.status === FetchStatus.NOT_MODIFIED) {
2441
+ logger.debug(`Page unchanged (304): ${item.url}`);
2442
+ if (shouldCount) {
2443
+ await progressCallback({
2444
+ pagesScraped: currentPageCount,
2445
+ totalPages: this.effectiveTotal,
2446
+ totalDiscovered: this.totalDiscovered,
2447
+ currentUrl: item.url,
2448
+ depth: item.depth,
2449
+ maxDepth,
2450
+ result: null,
2451
+ pageId: item.pageId
2452
+ });
2453
+ }
2454
+ return [];
2455
+ }
2456
+ if (result.status === FetchStatus.NOT_FOUND) {
2457
+ logger.debug(`Page deleted (404): ${item.url}`);
2458
+ if (shouldCount) {
2459
+ await progressCallback({
2460
+ pagesScraped: currentPageCount,
2461
+ totalPages: this.effectiveTotal,
2462
+ totalDiscovered: this.totalDiscovered,
2463
+ currentUrl: item.url,
2464
+ depth: item.depth,
2465
+ maxDepth,
2466
+ result: null,
2467
+ pageId: item.pageId,
2468
+ deleted: true
2469
+ });
2470
+ }
2471
+ return [];
2472
+ }
2473
+ if (result.status !== FetchStatus.SUCCESS) {
2474
+ logger.error(`Unknown fetch status: ${result.status}`);
2475
+ return [];
2476
+ }
2477
+ const finalUrl = result.url || item.url;
2478
+ if (result.content) {
2479
+ await progressCallback({
2480
+ pagesScraped: currentPageCount,
2481
+ totalPages: this.effectiveTotal,
2482
+ totalDiscovered: this.totalDiscovered,
2483
+ currentUrl: finalUrl,
2484
+ depth: item.depth,
2485
+ maxDepth,
2486
+ result: {
2487
+ url: finalUrl,
2488
+ title: result.content.title?.trim() || result.title?.trim() || "",
2489
+ contentType: result.contentType || "",
2490
+ textContent: result.content.textContent || "",
2491
+ links: result.content.links || [],
2492
+ errors: result.content.errors || [],
2493
+ chunks: result.content.chunks || [],
2494
+ etag: result.etag || null,
2495
+ lastModified: result.lastModified || null
2496
+ },
2497
+ pageId: item.pageId
2498
+ });
2499
+ }
2500
+ const nextItems = result.links || [];
2501
+ const linkBaseUrl = finalUrl ? new URL$1(finalUrl) : baseUrl;
2502
+ return nextItems.map((value) => {
2503
+ try {
2504
+ const targetUrl = new URL$1(value, linkBaseUrl);
2505
+ if (!this.shouldProcessUrl(targetUrl.href, options)) {
2506
+ return null;
2507
+ }
2508
+ return {
2509
+ url: targetUrl.href,
2510
+ depth: item.depth + 1
2511
+ };
2512
+ } catch (_error) {
2513
+ logger.warn(`❌ Invalid URL: ${value}`);
2514
+ }
2515
+ return null;
2516
+ }).filter((item2) => item2 !== null);
2517
+ } catch (error) {
2518
+ if (options.ignoreErrors) {
2519
+ logger.error(`❌ Failed to process ${item.url}: ${error}`);
2520
+ return [];
2521
+ }
2522
+ throw error;
2523
+ }
2524
+ })
2525
+ );
2526
+ const allLinks = results.flat();
2527
+ const uniqueLinks = [];
2528
+ for (const item of allLinks) {
2529
+ const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
2530
+ if (!this.visited.has(normalizedUrl)) {
2531
+ this.visited.add(normalizedUrl);
2532
+ uniqueLinks.push(item);
2533
+ this.totalDiscovered++;
2534
+ if (this.effectiveTotal < maxPages) {
2535
+ this.effectiveTotal++;
2536
+ }
2537
+ }
2538
+ }
2539
+ return uniqueLinks;
2540
+ }
2541
+ async scrape(options, progressCallback, signal) {
2542
+ this.visited.clear();
2543
+ this.pageCount = 0;
2544
+ const initialQueue = options.initialQueue || [];
2545
+ const isRefreshMode = initialQueue.length > 0;
2546
+ this.canonicalBaseUrl = new URL$1(options.url);
2547
+ let baseUrl = this.canonicalBaseUrl;
2548
+ const queue = [];
2549
+ const normalizedRootUrl = normalizeUrl(
2550
+ options.url,
2551
+ this.options.urlNormalizerOptions
2552
+ );
2553
+ if (isRefreshMode) {
2554
+ logger.debug(
2555
+ `Starting refresh mode with ${initialQueue.length} pre-populated pages`
2556
+ );
2557
+ for (const item of initialQueue) {
2558
+ const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
2559
+ if (!this.visited.has(normalizedUrl)) {
2560
+ this.visited.add(normalizedUrl);
2561
+ queue.push(item);
2562
+ }
2563
+ }
2564
+ }
2565
+ if (!this.visited.has(normalizedRootUrl)) {
2566
+ this.visited.add(normalizedRootUrl);
2567
+ queue.unshift({ url: options.url, depth: 0 });
2568
+ }
2569
+ this.totalDiscovered = queue.length;
2570
+ this.effectiveTotal = queue.length;
2571
+ const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
2572
+ const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
2573
+ while (queue.length > 0 && this.pageCount < maxPages) {
2574
+ if (signal?.aborted) {
2575
+ logger.debug(`${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal.`);
2576
+ throw new CancellationError(
2577
+ `${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal`
2578
+ );
2579
+ }
2580
+ const remainingPages = maxPages - this.pageCount;
2581
+ if (remainingPages <= 0) {
2582
+ break;
2583
+ }
2584
+ const batchSize = Math.min(maxConcurrency, remainingPages, queue.length);
2585
+ const batch = queue.splice(0, batchSize);
2586
+ baseUrl = this.canonicalBaseUrl ?? baseUrl;
2587
+ const newUrls = await this.processBatch(
2588
+ batch,
2589
+ baseUrl,
2590
+ options,
2591
+ progressCallback,
2592
+ signal
2593
+ );
2594
+ queue.push(...newUrls);
2595
+ }
2596
+ }
2597
+ /**
2598
+ * Cleanup resources used by this strategy.
2599
+ * Default implementation does nothing - override in derived classes as needed.
2600
+ */
2601
+ async cleanup() {
2602
+ }
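`scrape()` now supports a refresh mode: when `options.initialQueue` is pre-populated with previously indexed pages, those items are seeded into the visited set and queue, and the progress callback reports unchanged pages with `result: null` and deleted pages with `deleted: true` so the caller can keep or drop their stored chunks. A sketch of the option and callback shapes; `url`, `depth`, `pageId`, `result`, and `deleted` appear in this hunk, while the `etag` field on queue items and the persistence actions are assumptions.

```ts
// Shapes reduced to what this hunk shows; `etag` here is an assumption about
// how stored validators reach the fetcher during a refresh.
interface QueueItem {
  url: string;
  depth: number;
  pageId?: number;
  etag?: string; // assumed: stored validator passed through to the fetcher
}

interface ScrapeProgress {
  currentUrl: string;
  pageId?: number;
  deleted?: boolean;
  result: { url: string; title: string; etag: string | null } | null;
}

// Hypothetical refresh invocation: seed the queue with pages already indexed.
const refreshOptions = {
  url: "https://example.com/docs/", // canonical base URL of the indexed site
  initialQueue: [
    { url: "https://example.com/docs/intro", depth: 1, pageId: 42, etag: 'W/"abc"' },
    { url: "https://example.com/docs/api", depth: 1, pageId: 43, etag: 'W/"def"' },
  ] satisfies QueueItem[],
};

function onProgress(p: ScrapeProgress): void {
  if (p.deleted) {
    console.log(`delete page ${p.pageId}: ${p.currentUrl}`); // 404 during refresh
  } else if (p.result === null) {
    console.log(`unchanged: ${p.currentUrl}`); // 304: keep existing chunks
  } else {
    console.log(`re-indexed: ${p.result.url} (${p.result.title})`);
  }
}
```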
2603
+ }
2604
+ class SplitterError extends Error {
2605
+ }
2606
+ class MinimumChunkSizeError extends SplitterError {
2607
+ constructor(size, maxSize) {
2608
+ super(
2609
+ `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
2610
+ );
2611
+ }
2612
+ }
2613
+ class ContentSplitterError extends SplitterError {
2614
+ }
2615
+ class GreedySplitter {
2616
+ baseSplitter;
2617
+ minChunkSize;
2618
+ preferredChunkSize;
2619
+ maxChunkSize;
2620
+ /**
2621
+ * Combines a base document splitter with size constraints to produce optimally-sized chunks.
2622
+ * The base splitter handles the initial semantic splitting, while this class handles
2623
+ * the concatenation strategy.
2624
+ */
2625
+ constructor(baseSplitter, minChunkSize, preferredChunkSize, maxChunkSize) {
2626
+ this.baseSplitter = baseSplitter;
2627
+ this.minChunkSize = minChunkSize;
2628
+ this.preferredChunkSize = preferredChunkSize;
2629
+ this.maxChunkSize = maxChunkSize;
2630
+ }
2631
+ /**
2632
+ * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
2633
+ * are combined until they reach the minimum size, but splits are preserved at major
2634
+ * section boundaries to maintain document structure. This balances the need for
2635
+ * context with semantic coherence.
2636
+ */
2637
+ async splitText(markdown, contentType) {
2638
+ const initialChunks = await this.baseSplitter.splitText(markdown, contentType);
2639
+ const concatenatedChunks = [];
2640
+ let currentChunk = null;
2641
+ for (const nextChunk of initialChunks) {
2642
+ if (nextChunk.content.length > this.maxChunkSize) {
2643
+ logger.warn(
2644
+ `⚠ Chunk from base splitter exceeds max size: ${nextChunk.content.length} > ${this.maxChunkSize}`
2645
+ );
2646
+ }
2647
+ if (currentChunk) {
2648
+ const combinedSize = currentChunk.content.length + nextChunk.content.length;
2649
+ if (combinedSize > this.maxChunkSize) {
2650
+ concatenatedChunks.push(currentChunk);
2651
+ currentChunk = this.cloneChunk(nextChunk);
2652
+ continue;
2653
+ }
2654
+ if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk) && !this.isSameSection(currentChunk, nextChunk)) {
2655
+ concatenatedChunks.push(currentChunk);
2656
+ currentChunk = this.cloneChunk(nextChunk);
2657
+ continue;
2658
+ }
2659
+ if (combinedSize > this.preferredChunkSize && currentChunk.content.length >= this.minChunkSize && nextChunk.content.length >= this.minChunkSize) {
2660
+ concatenatedChunks.push(currentChunk);
2661
+ currentChunk = this.cloneChunk(nextChunk);
2662
+ continue;
2663
+ }
2664
+ currentChunk.content += `${currentChunk.content.endsWith("\n") ? "" : "\n"}${nextChunk.content}`;
2665
+ currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
2666
+ currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
2667
+ } else {
2668
+ currentChunk = this.cloneChunk(nextChunk);
2669
+ }
2670
+ }
2671
+ if (currentChunk) {
2672
+ concatenatedChunks.push(currentChunk);
2673
+ }
2674
+ return concatenatedChunks;
2675
+ }
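`GreedySplitter` now takes an explicit `maxChunkSize` in addition to `minChunkSize` and `preferredChunkSize`. The loop above merges adjacent chunks until one of three guards fires: never exceed `maxChunkSize`; break at a new H1/H2 section once the current chunk has reached `minChunkSize`, unless both chunks share a section path; and stop padding past `preferredChunkSize` when both sides are already at least `minChunkSize`. A compact restatement of just that decision, useful for reasoning about the thresholds; this is a sketch, not the class API.

```ts
interface ChunkLike {
  content: string;
  section: { level: number; path: string[] };
}

type MergeDecision = "merge" | "split";

// Mirrors the three guards in splitText() above, in the same order.
function decide(
  current: ChunkLike,
  next: ChunkLike,
  sizes: { min: number; preferred: number; max: number },
): MergeDecision {
  const combined = current.content.length + next.content.length;
  const isPrefix = (a: string[], b: string[]) =>
    a.length <= b.length && a.every((part, i) => part === b[i]);
  const sameSection =
    isPrefix(current.section.path, next.section.path) ||
    isPrefix(next.section.path, current.section.path);
  const startsMajorSection = next.section.level === 1 || next.section.level === 2;

  if (combined > sizes.max) return "split";
  if (current.content.length >= sizes.min && startsMajorSection && !sameSection) return "split";
  if (
    combined > sizes.preferred &&
    current.content.length >= sizes.min &&
    next.content.length >= sizes.min
  ) {
    return "split";
  }
  return "merge";
}

// With hypothetical sizes { min: 500, preferred: 1500, max: 2000 }: two
// 900-character chunks in the same section are split (1800 > 1500, both >= 500),
// while a 300-character follow-on chunk in the same section is merged.
```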
2676
+ cloneChunk(chunk) {
2677
+ return {
2678
+ types: [...chunk.types],
2679
+ content: chunk.content,
2680
+ section: {
2681
+ level: chunk.section.level,
2682
+ path: [...chunk.section.path]
2683
+ }
2684
+ };
2685
+ }
2686
+ /**
2687
+ * H1 and H2 headings represent major conceptual breaks in the document.
2688
+ * Preserving these splits helps maintain the document's logical structure.
2689
+ */
2690
+ startsNewMajorSection(chunk) {
2691
+ return chunk.section.level === 1 || chunk.section.level === 2;
2692
+ }
2693
+ /**
2694
+ * Checks if two chunks belong to the same section by comparing their paths.
2695
+ * Returns true if the paths are identical or if one is a parent of the other.
2696
+ */
2697
+ isSameSection(chunk1, chunk2) {
2698
+ const path1 = chunk1.section.path;
2699
+ const path2 = chunk2.section.path;
2700
+ if (path1.length === path2.length && path1.every((part, i) => part === path2[i])) {
2701
+ return true;
2702
+ }
2703
+ return this.isPathIncluded(path1, path2) || this.isPathIncluded(path2, path1);
2704
+ }
2705
+ /**
2706
+ * Checks if one path is a prefix of another path, indicating a parent-child relationship
2707
+ */
2708
+ isPathIncluded(parentPath, childPath) {
2709
+ if (parentPath.length >= childPath.length) return false;
2710
+ return parentPath.every((part, i) => part === childPath[i]);
2711
+ }
2712
+ /**
2713
+ * Merges section metadata when concatenating chunks, following these rules:
2223
2714
  * 1. Level: Always uses the lowest (most general) level between chunks
2224
2715
  * 2. Path selection:
2225
2716
  * - For parent-child relationships (one path includes the other), uses the child's path
@@ -4195,7 +4686,7 @@ class HtmlMetadataExtractorMiddleware {
4195
4686
  }
4196
4687
  title = title || "Untitled";
4197
4688
  title = title.replace(/\s+/g, " ").trim();
4198
- context.metadata.title = title;
4689
+ context.title = title;
4199
4690
  logger.debug(`Extracted title: "${title}" from ${context.source}`);
4200
4691
  } catch (error) {
4201
4692
  logger.error(`❌ Error extracting metadata from ${context.source}: ${error}`);
@@ -4653,7 +5144,7 @@ ${frame.content}
4653
5144
  * @param next The next middleware function in the pipeline.
4654
5145
  */
4655
5146
  async process(context, next) {
4656
- const contentType = context.options?.headers?.["content-type"] || context.metadata?.contentType || context.metadata?.mimeType;
5147
+ const contentType = context.options?.headers?.["content-type"] || context.contentType;
4657
5148
  if (contentType && typeof contentType === "string" && !MimeTypeUtils.isHtml(contentType)) {
4658
5149
  logger.debug(
4659
5150
  `Skipping Playwright rendering for ${context.source} - content type '${contentType}' is not HTML`
@@ -5014,6 +5505,7 @@ ${text.replace(/^\n+|\n+$/g, "")}
5014
5505
  context.content = markdown;
5015
5506
  logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
5016
5507
  }
5508
+ context.contentType = "text/markdown";
5017
5509
  } catch (error) {
5018
5510
  logger.error(
5019
5511
  `❌ Error converting HTML to Markdown for ${context.source}: ${error}`
@@ -5053,7 +5545,7 @@ class MarkdownMetadataExtractorMiddleware {
5053
5545
  if (match?.[1]) {
5054
5546
  title = match[1].trim();
5055
5547
  }
5056
- context.metadata.title = title;
5548
+ context.title = title;
5057
5549
  } catch (error) {
5058
5550
  context.errors.push(
5059
5551
  new Error(
@@ -5225,10 +5717,10 @@ function convertToString(content, charset) {
5225
5717
  }
5226
5718
  class BasePipeline {
5227
5719
  /**
5228
- * Determines if this pipeline can process the given content.
5720
+ * Determines if this pipeline can process content with the given MIME type.
5229
5721
  * Must be implemented by derived classes.
5230
5722
  */
5231
- canProcess(_rawContent) {
5723
+ canProcess(_mimeType, _content) {
5232
5724
  throw new Error("Method not implemented.");
5233
5725
  }
5234
5726
  /**
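`canProcess` now receives the MIME type (and optionally the raw bytes) instead of the whole raw-content object, so a pipeline can be chosen before the body is decoded. A sketch of the selection loop implied by that signature; the interface is reduced to what the diff shows, and the `process` signature here omits the options and fetcher arguments used by the real pipelines.

```ts
interface ContentPipelineLike {
  canProcess(mimeType: string, content?: Buffer): boolean;
  process(raw: { content: Buffer; mimeType: string; source: string }): Promise<unknown>;
}

// First pipeline that accepts the MIME type wins; TextPipeline (which also
// sniffs the bytes for binary data) acts as the universal fallback at the end.
function selectPipeline(
  pipelines: ContentPipelineLike[],
  mimeType: string,
  content: Buffer,
): ContentPipelineLike | undefined {
  return pipelines.find((p) => p.canProcess(mimeType, content));
}
```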
@@ -5289,11 +5781,12 @@ class HtmlPipeline extends BasePipeline {
5289
5781
  this.greedySplitter = new GreedySplitter(
5290
5782
  semanticSplitter,
5291
5783
  SPLITTER_MIN_CHUNK_SIZE,
5292
- preferredChunkSize
5784
+ preferredChunkSize,
5785
+ maxChunkSize
5293
5786
  );
5294
5787
  }
5295
- canProcess(rawContent) {
5296
- return MimeTypeUtils.isHtml(rawContent.mimeType);
5788
+ canProcess(mimeType) {
5789
+ return MimeTypeUtils.isHtml(mimeType);
5297
5790
  }
5298
5791
  async process(rawContent, options, fetcher) {
5299
5792
  const resolvedCharset = resolveCharset(
@@ -5304,8 +5797,9 @@ class HtmlPipeline extends BasePipeline {
5304
5797
  const contentString = convertToString(rawContent.content, resolvedCharset);
5305
5798
  const context = {
5306
5799
  content: contentString,
5800
+ contentType: rawContent.mimeType || "text/html",
5307
5801
  source: rawContent.source,
5308
- metadata: {},
5802
+ // metadata: {},
5309
5803
  links: [],
5310
5804
  errors: [],
5311
5805
  options,
@@ -5320,8 +5814,9 @@ class HtmlPipeline extends BasePipeline {
5320
5814
  typeof context.content === "string" ? context.content : ""
5321
5815
  );
5322
5816
  return {
5323
- textContent: typeof context.content === "string" ? context.content : "",
5324
- metadata: context.metadata,
5817
+ title: context.title,
5818
+ contentType: context.contentType,
5819
+ textContent: context.content,
5325
5820
  links: context.links,
5326
5821
  errors: context.errors,
5327
5822
  chunks
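Pipelines no longer return a free-form `metadata` object; the processed result now carries explicit `title` and `contentType` fields, which `BaseScraperStrategy.processBatch` forwards to the progress callback together with `etag`/`lastModified` from the fetch. A reduced view of the result shape as it appears in this hunk; the optional markers and element types are assumptions where the diff does not pin them down.

```ts
// Reduced to the fields returned by HtmlPipeline.process() in this hunk.
interface ProcessedContentLike {
  title?: string;
  contentType?: string;
  textContent: string;
  links: string[];
  errors: Error[];
  chunks: unknown[]; // chunk type not shown in this part of the diff
}
```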
@@ -5345,9 +5840,9 @@ class JsonPipeline extends BasePipeline {
5345
5840
  preserveFormatting: true
5346
5841
  });
5347
5842
  }
5348
- canProcess(rawContent) {
5349
- if (!rawContent.mimeType) return false;
5350
- return MimeTypeUtils.isJson(rawContent.mimeType);
5843
+ canProcess(mimeType) {
5844
+ if (!mimeType) return false;
5845
+ return MimeTypeUtils.isJson(mimeType);
5351
5846
  }
5352
5847
  async process(rawContent, options, fetcher) {
5353
5848
  const contentString = convertToString(rawContent.content, rawContent.charset);
@@ -5362,22 +5857,25 @@ class JsonPipeline extends BasePipeline {
5362
5857
  const fallbackChunks = await this.splitter.splitText(contentString);
5363
5858
  return {
5364
5859
  textContent: contentString,
5365
- metadata: {
5366
- isValidJson: false
5367
- },
5860
+ // metadata: {
5861
+ // isValidJson: false,
5862
+ // },
5368
5863
  links: [],
5369
5864
  errors: [],
5370
5865
  chunks: fallbackChunks
5371
5866
  };
5372
5867
  }
5868
+ const metadata = this.extractMetadata(parsedJson);
5373
5869
  const context = {
5374
5870
  content: contentString,
5375
5871
  source: rawContent.source,
5376
- metadata: {
5377
- ...this.extractMetadata(parsedJson),
5378
- isValidJson,
5379
- jsonStructure: this.analyzeJsonStructure(parsedJson)
5380
- },
5872
+ title: metadata.title,
5873
+ contentType: rawContent.mimeType || "application/json",
5874
+ // metadata: {
5875
+ // ...this.extractMetadata(parsedJson),
5876
+ // isValidJson,
5877
+ // jsonStructure: this.analyzeJsonStructure(parsedJson),
5878
+ // },
5381
5879
  links: [],
5382
5880
  // JSON files typically don't contain links
5383
5881
  errors: [],
@@ -5387,8 +5885,9 @@ class JsonPipeline extends BasePipeline {
5387
5885
  await this.executeMiddlewareStack(this.middleware, context);
5388
5886
  const chunks = await this.splitter.splitText(context.content);
5389
5887
  return {
5888
+ title: context.title,
5889
+ contentType: context.contentType,
5390
5890
  textContent: context.content,
5391
- metadata: context.metadata,
5392
5891
  links: context.links,
5393
5892
  errors: context.errors,
5394
5893
  chunks
@@ -5418,30 +5917,6 @@ class JsonPipeline extends BasePipeline {
5418
5917
  }
5419
5918
  return metadata;
5420
5919
  }
5421
- /**
5422
- * Analyzes the structure of valid JSON for metadata
5423
- */
5424
- analyzeJsonStructure(parsedJson) {
5425
- if (Array.isArray(parsedJson)) {
5426
- return {
5427
- type: "array",
5428
- depth: this.calculateDepth(parsedJson),
5429
- itemCount: parsedJson.length
5430
- };
5431
- } else if (typeof parsedJson === "object" && parsedJson !== null) {
5432
- const obj = parsedJson;
5433
- return {
5434
- type: "object",
5435
- depth: this.calculateDepth(parsedJson),
5436
- propertyCount: Object.keys(obj).length
5437
- };
5438
- } else {
5439
- return {
5440
- type: typeof parsedJson,
5441
- depth: 1
5442
- };
5443
- }
5444
- }
5445
5920
  /**
5446
5921
  * Calculates the maximum nesting depth of a JSON structure
5447
5922
  */
@@ -5482,19 +5957,20 @@ class MarkdownPipeline extends BasePipeline {
5482
5957
  this.greedySplitter = new GreedySplitter(
5483
5958
  semanticSplitter,
5484
5959
  SPLITTER_MIN_CHUNK_SIZE,
5485
- preferredChunkSize
5960
+ preferredChunkSize,
5961
+ maxChunkSize
5486
5962
  );
5487
5963
  }
5488
- canProcess(rawContent) {
5489
- if (!rawContent.mimeType) return false;
5490
- return MimeTypeUtils.isMarkdown(rawContent.mimeType);
5964
+ canProcess(mimeType) {
5965
+ if (!mimeType) return false;
5966
+ return MimeTypeUtils.isMarkdown(mimeType);
5491
5967
  }
5492
5968
  async process(rawContent, options, fetcher) {
5493
5969
  const contentString = convertToString(rawContent.content, rawContent.charset);
5494
5970
  const context = {
5971
+ contentType: rawContent.mimeType || "text/markdown",
5495
5972
  content: contentString,
5496
5973
  source: rawContent.source,
5497
- metadata: {},
5498
5974
  links: [],
5499
5975
  errors: [],
5500
5976
  options,
@@ -5506,8 +5982,9 @@ class MarkdownPipeline extends BasePipeline {
5506
5982
  rawContent.mimeType
5507
5983
  );
5508
5984
  return {
5985
+ title: context.title,
5986
+ contentType: context.contentType,
5509
5987
  textContent: typeof context.content === "string" ? context.content : "",
5510
- metadata: context.metadata,
5511
5988
  links: context.links,
5512
5989
  errors: context.errors,
5513
5990
  chunks
@@ -5517,24 +5994,27 @@ class MarkdownPipeline extends BasePipeline {
5517
5994
  class SourceCodePipeline extends BasePipeline {
5518
5995
  middleware;
5519
5996
  splitter;
5520
- constructor(chunkSize = SPLITTER_PREFERRED_CHUNK_SIZE) {
5997
+ constructor(_preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE) {
5521
5998
  super();
5522
5999
  this.middleware = [];
5523
- this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize: chunkSize });
6000
+ this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize });
5524
6001
  }
5525
- canProcess(rawContent) {
5526
- if (!rawContent.mimeType) return false;
5527
- return MimeTypeUtils.isSourceCode(rawContent.mimeType);
6002
+ canProcess(mimeType) {
6003
+ if (!mimeType) return false;
6004
+ return MimeTypeUtils.isSourceCode(mimeType);
5528
6005
  }
5529
6006
  async process(rawContent, options, fetcher) {
5530
6007
  const contentString = convertToString(rawContent.content, rawContent.charset);
5531
6008
  const context = {
6009
+ contentType: rawContent.mimeType || "text/plain",
5532
6010
  content: contentString,
5533
6011
  source: rawContent.source,
5534
- metadata: {
5535
- language: rawContent.mimeType ? MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType) : "text",
5536
- isSourceCode: true
5537
- },
6012
+ // metadata: {
6013
+ // language: rawContent.mimeType
6014
+ // ? MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType)
6015
+ // : "text",
6016
+ // isSourceCode: true,
6017
+ // },
5538
6018
  links: [],
5539
6019
  // Source code files typically don't contain web links
5540
6020
  errors: [],
@@ -5544,8 +6024,10 @@ class SourceCodePipeline extends BasePipeline {
5544
6024
  await this.executeMiddlewareStack(this.middleware, context);
5545
6025
  const chunks = await this.splitter.splitText(context.content, rawContent.mimeType);
5546
6026
  return {
6027
+ title: context.title,
6028
+ contentType: context.contentType,
5547
6029
  textContent: context.content,
5548
- metadata: context.metadata,
6030
+ // metadata: context.metadata,
5549
6031
  links: context.links,
5550
6032
  errors: context.errors,
5551
6033
  chunks
@@ -5594,17 +6076,22 @@ class TextDocumentSplitter {
5594
6076
  class TextPipeline extends BasePipeline {
5595
6077
  middleware;
5596
6078
  splitter;
5597
- constructor(chunkSize = SPLITTER_PREFERRED_CHUNK_SIZE) {
6079
+ constructor(preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE) {
5598
6080
  super();
5599
6081
  this.middleware = [];
5600
- const textSplitter = new TextDocumentSplitter({ maxChunkSize: chunkSize });
5601
- this.splitter = new GreedySplitter(textSplitter, SPLITTER_MIN_CHUNK_SIZE, chunkSize);
6082
+ const textSplitter = new TextDocumentSplitter({ maxChunkSize });
6083
+ this.splitter = new GreedySplitter(
6084
+ textSplitter,
6085
+ SPLITTER_MIN_CHUNK_SIZE,
6086
+ preferredChunkSize,
6087
+ maxChunkSize
6088
+ );
5602
6089
  }
5603
- canProcess(rawContent) {
5604
- if (!MimeTypeUtils.isSafeForTextProcessing(rawContent.mimeType)) {
6090
+ canProcess(mimeType, content) {
6091
+ if (!MimeTypeUtils.isSafeForTextProcessing(mimeType)) {
5605
6092
  return false;
5606
6093
  }
5607
- if (MimeTypeUtils.isBinary(rawContent.content)) {
6094
+ if (content && MimeTypeUtils.isBinary(content)) {
5608
6095
  return false;
5609
6096
  }
5610
6097
  return true;
@@ -5612,12 +6099,11 @@ class TextPipeline extends BasePipeline {
5612
6099
  async process(rawContent, options, fetcher) {
5613
6100
  const contentString = convertToString(rawContent.content, rawContent.charset);
5614
6101
  const context = {
6102
+ title: "",
6103
+ // Title extraction can be added in middleware if needed
6104
+ contentType: rawContent.mimeType || "text/plain",
5615
6105
  content: contentString,
5616
6106
  source: rawContent.source,
5617
- metadata: {
5618
- contentType: rawContent.mimeType || "text/plain",
5619
- isGenericText: true
5620
- },
5621
6107
  links: [],
5622
6108
  // Generic text content typically doesn't contain structured links
5623
6109
  errors: [],
@@ -5627,394 +6113,283 @@ class TextPipeline extends BasePipeline {
5627
6113
  await this.executeMiddlewareStack(this.middleware, context);
5628
6114
  const chunks = await this.splitter.splitText(context.content, rawContent.mimeType);
5629
6115
  return {
6116
+ title: context.title,
6117
+ contentType: context.contentType,
5630
6118
  textContent: context.content,
5631
- metadata: context.metadata,
5632
6119
  links: context.links,
5633
6120
  errors: context.errors,
5634
- chunks
5635
- };
5636
- }
5637
- }
5638
- let PipelineFactory$1 = class PipelineFactory {
5639
- /**
5640
- * Creates the standard set of content pipelines used by all scraper strategies.
5641
- * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
5642
- * Each pipeline now handles both preprocessing and content-specific splitting.
5643
- * TextPipeline is placed last as the universal fallback for unknown content types.
5644
- *
5645
- * @param config - Optional configuration for pipeline chunk sizes
5646
- * @returns Array of content pipelines in processing order
5647
- */
5648
- static createStandardPipelines(config) {
5649
- const preferredChunkSize = config?.chunkSizes?.preferred ?? SPLITTER_PREFERRED_CHUNK_SIZE;
5650
- const maxChunkSize = config?.chunkSizes?.max ?? 2e3;
5651
- return [
5652
- new JsonPipeline(preferredChunkSize),
5653
- new SourceCodePipeline(preferredChunkSize),
5654
- new HtmlPipeline(preferredChunkSize, maxChunkSize),
5655
- new MarkdownPipeline(preferredChunkSize, maxChunkSize),
5656
- new TextPipeline(preferredChunkSize)
5657
- // Universal fallback - must be last
5658
- ];
5659
- }
5660
- };
5661
- const DEFAULT_FILE_EXCLUSIONS = [
5662
- // CHANGELOG files (case variations)
5663
- "**/CHANGELOG.md",
5664
- "**/changelog.md",
5665
- "**/CHANGELOG.mdx",
5666
- "**/changelog.mdx",
5667
- // LICENSE files (case variations)
5668
- "**/LICENSE",
5669
- "**/LICENSE.md",
5670
- "**/license.md",
5671
- // CODE_OF_CONDUCT files (case variations)
5672
- "**/CODE_OF_CONDUCT.md",
5673
- "**/code_of_conduct.md",
5674
- // Test files
5675
- "**/*.test.*",
5676
- "**/*.spec.*",
5677
- "**/*_test.py",
5678
- "**/*_test.go",
5679
- // Package manager lock files
5680
- "**/*.lock",
5681
- "**/package-lock.json",
5682
- "**/yarn.lock",
5683
- "**/pnpm-lock.yaml",
5684
- "**/go.sum",
5685
- // Build artifacts
5686
- "**/*.min.js",
5687
- "**/*.min.css",
5688
- "**/*.map",
5689
- "**/*.d.ts",
5690
- // IDE/System files
5691
- "**/.DS_Store",
5692
- "**/Thumbs.db",
5693
- "**/*.swp",
5694
- "**/*.swo",
5695
- // Internal config files (using regex pattern)
5696
- "/.*\\.(ini|cfg|conf|log|pid)$/"
5697
- ];
5698
- const DEFAULT_FOLDER_EXCLUSIONS = [
5699
- // Archive and deprecated content (matches anywhere in path)
5700
- "**/archive/**",
5701
- "**/archived/**",
5702
- "**/deprecated/**",
5703
- "**/legacy/**",
5704
- "**/old/**",
5705
- "**/outdated/**",
5706
- "**/previous/**",
5707
- "**/superseded/**",
5708
- // Specific paths that don't follow the general pattern
5709
- "docs/old/**",
5710
- // Test directories
5711
- "**/test/**",
5712
- "**/tests/**",
5713
- "**/__tests__/**",
5714
- "**/spec/**",
5715
- // Build output directories
5716
- "**/dist/**",
5717
- "**/build/**",
5718
- "**/out/**",
5719
- "**/target/**",
5720
- "**/.next/**",
5721
- "**/.nuxt/**",
5722
- // IDE directories
5723
- "**/.vscode/**",
5724
- "**/.idea/**",
5725
- // Internationalization folders - non-English locales
5726
- "**/i18n/ar*/**",
5727
- "**/i18n/de*/**",
5728
- "**/i18n/es*/**",
5729
- "**/i18n/fr*/**",
5730
- "**/i18n/hi*/**",
5731
- "**/i18n/it*/**",
5732
- "**/i18n/ja*/**",
5733
- "**/i18n/ko*/**",
5734
- "**/i18n/nl*/**",
5735
- "**/i18n/pl*/**",
5736
- "**/i18n/pt*/**",
5737
- "**/i18n/ru*/**",
5738
- "**/i18n/sv*/**",
5739
- "**/i18n/th*/**",
5740
- "**/i18n/tr*/**",
5741
- "**/i18n/vi*/**",
5742
- "**/i18n/zh*/**",
5743
- // Common locale folder patterns
5744
- "**/zh-cn/**",
5745
- "**/zh-hk/**",
5746
- "**/zh-mo/**",
5747
- "**/zh-sg/**",
5748
- "**/zh-tw/**"
5749
- ];
5750
- const DEFAULT_EXCLUSION_PATTERNS = [
5751
- ...DEFAULT_FILE_EXCLUSIONS,
5752
- ...DEFAULT_FOLDER_EXCLUSIONS
5753
- ];
5754
- function getEffectiveExclusionPatterns(userPatterns) {
5755
- if (userPatterns !== void 0) {
5756
- return userPatterns;
6121
+ chunks
6122
+ };
5757
6123
  }
5758
- return DEFAULT_EXCLUSION_PATTERNS;
5759
- }
5760
- function isRegexPattern(pattern) {
5761
- return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
5762
6124
  }
5763
- function patternToRegExp(pattern) {
5764
- if (isRegexPattern(pattern)) {
5765
- return new RegExp(pattern.slice(1, -1));
6125
+ let PipelineFactory$1 = class PipelineFactory {
6126
+ /**
6127
+ * Creates the standard set of content pipelines used by all scraper strategies.
6128
+ * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
6129
+ * Each pipeline now handles both preprocessing and content-specific splitting.
6130
+ * TextPipeline is placed last as the universal fallback for unknown content types.
6131
+ *
6132
+ * @param config - Optional configuration for pipeline chunk sizes
6133
+ * @returns Array of content pipelines in processing order
6134
+ */
6135
+ static createStandardPipelines(config) {
6136
+ const preferredChunkSize = config?.chunkSizes?.preferred ?? SPLITTER_PREFERRED_CHUNK_SIZE;
6137
+ const maxChunkSize = config?.chunkSizes?.max ?? 2e3;
6138
+ return [
6139
+ new JsonPipeline(preferredChunkSize),
6140
+ new SourceCodePipeline(preferredChunkSize, maxChunkSize),
6141
+ new HtmlPipeline(preferredChunkSize, maxChunkSize),
6142
+ new MarkdownPipeline(preferredChunkSize, maxChunkSize),
6143
+ new TextPipeline(preferredChunkSize, maxChunkSize)
6144
+ // Universal fallback - must be last
6145
+ ];
5766
6146
  }
5767
- const re = minimatch.makeRe(pattern, { dot: true });
5768
- if (!re) throw new Error(`Invalid glob pattern: ${pattern}`);
5769
- return re;
5770
- }
5771
- function matchesAnyPattern(path2, patterns) {
5772
- if (!patterns || patterns.length === 0) return false;
5773
- const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
5774
- return patterns.some((pattern) => {
5775
- if (isRegexPattern(pattern)) {
5776
- return patternToRegExp(pattern).test(normalizedPath);
5777
- }
5778
- return minimatch(normalizedPath.replace(/^\//, ""), pattern, { dot: true });
5779
- });
5780
- }
5781
- function extractPathAndQuery(url) {
5782
- try {
5783
- const u = new URL(url);
5784
- return u.pathname + (u.search || "");
5785
- } catch {
5786
- return url;
6147
+ };
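In the rewritten factory above, the max chunk size is now threaded into the SourceCodePipeline and TextPipeline as well, not only the HTML and Markdown pipelines. A minimal usage sketch, assuming access to the internal (unexported) PipelineFactory class bundled in dist/index.js:

    // Illustrative only; PipelineFactory is not a documented public export.
    const pipelines = PipelineFactory.createStandardPipelines({
      chunkSizes: { preferred: 1000, max: 2000 },
    });
    // Pipelines are tried in declaration order; TextPipeline is the universal fallback.
    const buffer = Buffer.from("# Hello");
    const pipeline = pipelines.find((p) => p.canProcess("text/markdown", buffer));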
6148
+ class GitHubRepoProcessor {
6149
+ httpFetcher = new HttpFetcher();
6150
+ pipelines;
6151
+ constructor() {
6152
+ this.pipelines = PipelineFactory$1.createStandardPipelines();
5787
6153
  }
5788
- }
5789
- function shouldIncludeUrl(url, includePatterns, excludePatterns) {
5790
- const path2 = extractPathAndQuery(url);
5791
- const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
5792
- let basename;
5793
- if (url.startsWith("file://")) {
5794
- try {
5795
- const u = new URL(url);
5796
- basename = u.pathname ? u.pathname.split("/").pop() : void 0;
5797
- } catch {
6154
+ /**
6155
+ * Parses an HTTPS blob URL to extract repository information.
6156
+ * Format: https://github.com/owner/repo/blob/branch/filepath
6157
+ */
6158
+ parseHttpsBlobUrl(url) {
6159
+ const parsedUrl = new URL(url);
6160
+ const segments = parsedUrl.pathname.split("/").filter(Boolean);
6161
+ if (segments.length < 5 || segments[2] !== "blob") {
6162
+ throw new Error(
6163
+ `Invalid GitHub blob URL format. Expected: https://github.com/owner/repo/blob/branch/filepath. Got: ${url}`
6164
+ );
5798
6165
  }
6166
+ const owner = segments[0];
6167
+ const repo = segments[1];
6168
+ const branch = segments[3];
6169
+ const filePath = segments.slice(4).join("/");
6170
+ return { owner, repo, branch, filePath };
5799
6171
  }
5800
- const stripSlash = (patterns) => patterns?.map((p) => p.startsWith("/") ? p.slice(1) : p);
5801
- const effectiveExcludePatterns = getEffectiveExclusionPatterns(excludePatterns);
5802
- if (matchesAnyPattern(normalizedPath, effectiveExcludePatterns) || basename && matchesAnyPattern(basename, stripSlash(effectiveExcludePatterns)))
5803
- return false;
5804
- if (!includePatterns || includePatterns.length === 0) return true;
5805
- return matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
5806
- }
5807
- function computeBaseDirectory(pathname) {
5808
- if (pathname === "") return "/";
5809
- if (pathname.endsWith("/")) return pathname;
5810
- const lastSegment = pathname.split("/").at(-1) || "";
5811
- const looksLikeFile = lastSegment.includes(".");
5812
- if (looksLikeFile) {
5813
- return pathname.replace(/\/[^/]*$/, "/");
6172
+ /**
6173
+ * Fetches the raw content of a file from GitHub.
6174
+ */
6175
+ async fetchFileContent(repoInfo, filePath, etag, signal) {
6176
+ const { owner, repo, branch } = repoInfo;
6177
+ const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
6178
+ const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag });
6179
+ const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
6180
+ if (detectedMimeType && rawContent.mimeType === "text/plain") {
6181
+ return {
6182
+ ...rawContent,
6183
+ mimeType: detectedMimeType
6184
+ };
6185
+ }
6186
+ return rawContent;
5814
6187
  }
5815
- return `${pathname}/`;
5816
- }
5817
- function isInScope(baseUrl, targetUrl, scope) {
5818
- if (baseUrl.protocol !== targetUrl.protocol) return false;
5819
- switch (scope) {
5820
- case "subpages": {
5821
- if (baseUrl.hostname !== targetUrl.hostname) return false;
5822
- const baseDir = computeBaseDirectory(baseUrl.pathname);
5823
- return targetUrl.pathname.startsWith(baseDir);
6188
+ /**
6189
+ * Processes a single GitHub repository file from an HTTPS blob URL.
6190
+ */
6191
+ async process(item, options, signal) {
6192
+ const repoInfo = this.parseHttpsBlobUrl(item.url);
6193
+ const { owner, repo, branch, filePath } = repoInfo;
6194
+ const rawContent = await this.fetchFileContent(
6195
+ { owner, repo, branch },
6196
+ filePath,
6197
+ item.etag,
6198
+ signal
6199
+ );
6200
+ if (rawContent.status !== FetchStatus.SUCCESS) {
6201
+ return { url: item.url, links: [], status: rawContent.status };
5824
6202
  }
5825
- case "hostname":
5826
- return baseUrl.hostname === targetUrl.hostname;
5827
- case "domain": {
5828
- return extractPrimaryDomain(baseUrl.hostname) === extractPrimaryDomain(targetUrl.hostname);
6203
+ let processed;
6204
+ for (const pipeline of this.pipelines) {
6205
+ const contentBuffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
6206
+ if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
6207
+ logger.debug(
6208
+ `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
6209
+ );
6210
+ const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6211
+ processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
6212
+ break;
6213
+ }
5829
6214
  }
5830
- default:
5831
- return false;
6215
+ if (!processed) {
6216
+ logger.warn(
6217
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
6218
+ );
6219
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
6220
+ }
6221
+ for (const err of processed.errors ?? []) {
6222
+ logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
6223
+ }
6224
+ const githubUrl = `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`;
6225
+ const filename = filePath.split("/").pop() || "Untitled";
6226
+ return {
6227
+ url: githubUrl,
6228
+ title: processed.title?.trim() || filename || "Untitled",
6229
+ etag: rawContent.etag,
6230
+ lastModified: rawContent.lastModified,
6231
+ contentType: rawContent.mimeType,
6232
+ content: processed,
6233
+ links: [],
6234
+ // Always return empty links array for individual files
6235
+ status: FetchStatus.SUCCESS
6236
+ };
6237
+ }
6238
+ /**
6239
+ * Cleanup resources used by this processor.
6240
+ */
6241
+ async cleanup() {
6242
+ await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
5832
6243
  }
5833
6244
  }
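For reference, the blob-URL parsing above maps a GitHub HTTPS URL onto repository parts as follows (hypothetical values):

    // parseHttpsBlobUrl("https://github.com/owner/repo/blob/main/docs/guide.md")
    //   -> { owner: "owner", repo: "repo", branch: "main", filePath: "docs/guide.md" }
    // URLs with fewer than five path segments, or whose third segment is not "blob",
    // throw the "Invalid GitHub blob URL format" error shown above.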
5834
- const DEFAULT_MAX_DEPTH = 3;
5835
- const DEFAULT_CONCURRENCY = 3;
5836
- class BaseScraperStrategy {
5837
- visited = /* @__PURE__ */ new Set();
5838
- pageCount = 0;
5839
- totalDiscovered = 0;
5840
- // Track total URLs discovered (unlimited)
5841
- effectiveTotal = 0;
5842
- // Track effective total (limited by maxPages)
5843
- canonicalBaseUrl;
5844
- options;
5845
- constructor(options = {}) {
5846
- this.options = options;
6245
+ class GitHubWikiProcessor {
6246
+ httpFetcher = new HttpFetcher();
6247
+ pipelines;
6248
+ constructor() {
6249
+ this.pipelines = PipelineFactory$1.createStandardPipelines();
5847
6250
  }
5848
6251
  /**
5849
- * Determines if a URL should be processed based on scope and include/exclude patterns in ScraperOptions.
5850
- * Scope is checked first, then patterns.
6252
+ * Parses a GitHub wiki URL to extract repository information.
6253
+ */
6254
+ parseGitHubWikiUrl(url) {
6255
+ const parsedUrl = new URL(url);
6256
+ const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/);
6257
+ if (!match) {
6258
+ throw new Error(`Invalid GitHub wiki URL: ${url}`);
6259
+ }
6260
+ const [, owner, repo] = match;
6261
+ return { owner, repo };
6262
+ }
6263
+ /**
6264
+ * Determines if a URL should be processed within the wiki scope.
5851
6265
  */
5852
6266
  shouldProcessUrl(url, options) {
5853
- if (options.scope) {
5854
- try {
5855
- const base = this.canonicalBaseUrl ?? new URL$1(options.url);
5856
- const target = new URL$1(url);
5857
- if (!isInScope(base, target, options.scope)) return false;
5858
- } catch {
6267
+ try {
6268
+ const parsedUrl = new URL(url);
6269
+ const baseWikiInfo = this.parseGitHubWikiUrl(options.url);
6270
+ const expectedWikiPath = `/${baseWikiInfo.owner}/${baseWikiInfo.repo}/wiki`;
6271
+ if (!parsedUrl.pathname.startsWith(expectedWikiPath)) {
5859
6272
  return false;
5860
6273
  }
6274
+ const wikiPagePath = parsedUrl.pathname.replace(expectedWikiPath, "").replace(/^\//, "");
6275
+ return shouldIncludeUrl(
6276
+ wikiPagePath || "Home",
6277
+ options.includePatterns,
6278
+ options.excludePatterns
6279
+ );
6280
+ } catch {
6281
+ return false;
5861
6282
  }
5862
- return shouldIncludeUrl(url, options.includePatterns, options.excludePatterns);
5863
6283
  }
5864
- // Removed getProcessor method as processing is now handled by strategies using middleware pipelines
5865
- async processBatch(batch, baseUrl, options, progressCallback, signal) {
5866
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
5867
- const results = await Promise.all(
5868
- batch.map(async (item) => {
5869
- if (signal?.aborted) {
5870
- throw new CancellationError("Scraping cancelled during batch processing");
5871
- }
5872
- const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
5873
- if (item.depth > maxDepth) {
5874
- return [];
5875
- }
5876
- try {
5877
- const result = await this.processItem(item, options, void 0, signal);
5878
- if (item.depth === 0 && !this.canonicalBaseUrl && result?.finalUrl) {
5879
- try {
5880
- const finalUrlStr = result.finalUrl;
5881
- const original = new URL$1(options.url);
5882
- const finalUrlObj = new URL$1(finalUrlStr);
5883
- if (finalUrlObj.href !== original.href && (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:")) {
5884
- this.canonicalBaseUrl = finalUrlObj;
5885
- logger.debug(
5886
- `Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`
5887
- );
5888
- } else {
5889
- this.canonicalBaseUrl = original;
5890
- }
5891
- } catch {
5892
- this.canonicalBaseUrl = new URL$1(options.url);
5893
- }
5894
- }
5895
- if (result.document) {
5896
- this.pageCount++;
5897
- logger.info(
5898
- `🌐 Scraping page ${this.pageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`
5899
- );
5900
- await progressCallback({
5901
- pagesScraped: this.pageCount,
5902
- totalPages: this.effectiveTotal,
5903
- totalDiscovered: this.totalDiscovered,
5904
- currentUrl: item.url,
5905
- depth: item.depth,
5906
- maxDepth,
5907
- document: result.document
5908
- });
5909
- }
5910
- const nextItems = result.links || [];
5911
- return nextItems.map((value) => {
5912
- try {
5913
- const targetUrl = new URL$1(value, baseUrl);
5914
- if (!this.shouldProcessUrl(targetUrl.href, options)) {
5915
- return null;
5916
- }
5917
- return {
5918
- url: targetUrl.href,
5919
- depth: item.depth + 1
5920
- };
5921
- } catch (_error) {
5922
- logger.warn(`❌ Invalid URL: ${value}`);
5923
- }
5924
- return null;
5925
- }).filter((item2) => item2 !== null);
5926
- } catch (error) {
5927
- if (options.ignoreErrors) {
5928
- logger.error(`❌ Failed to process ${item.url}: ${error}`);
5929
- return [];
5930
- }
5931
- throw error;
5932
- }
5933
- })
5934
- );
5935
- const allLinks = results.flat();
5936
- const uniqueLinks = [];
5937
- for (const item of allLinks) {
5938
- const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
5939
- if (!this.visited.has(normalizedUrl)) {
5940
- this.visited.add(normalizedUrl);
5941
- uniqueLinks.push(item);
5942
- this.totalDiscovered++;
5943
- if (this.effectiveTotal < maxPages) {
5944
- this.effectiveTotal++;
6284
+ /**
6285
+ * Processes a single GitHub wiki page.
6286
+ */
6287
+ async process(item, options, signal) {
6288
+ const currentUrl = item.url;
6289
+ try {
6290
+ const rawContent = await this.httpFetcher.fetch(currentUrl, {
6291
+ signal,
6292
+ etag: item.etag
6293
+ });
6294
+ if (rawContent.status !== FetchStatus.SUCCESS) {
6295
+ return { url: currentUrl, links: [], status: rawContent.status };
6296
+ }
6297
+ let processed;
6298
+ for (const pipeline of this.pipelines) {
6299
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
6300
+ logger.debug(
6301
+ `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`
6302
+ );
6303
+ const wikiOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6304
+ processed = await pipeline.process(rawContent, wikiOptions, this.httpFetcher);
6305
+ break;
5945
6306
  }
5946
6307
  }
5947
- }
5948
- return uniqueLinks;
5949
- }
5950
- async scrape(options, progressCallback, signal) {
5951
- this.visited.clear();
5952
- this.pageCount = 0;
5953
- this.totalDiscovered = 1;
5954
- this.effectiveTotal = 1;
5955
- this.canonicalBaseUrl = new URL$1(options.url);
5956
- let baseUrl = this.canonicalBaseUrl;
5957
- const queue = [{ url: options.url, depth: 0 }];
5958
- this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
5959
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
5960
- const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
5961
- while (queue.length > 0 && this.pageCount < maxPages) {
5962
- if (signal?.aborted) {
5963
- logger.debug("Scraping cancelled by signal.");
5964
- throw new CancellationError("Scraping cancelled by signal");
6308
+ if (!processed) {
6309
+ logger.warn(
6310
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`
6311
+ );
6312
+ return { url: currentUrl, links: [], status: FetchStatus.SUCCESS };
5965
6313
  }
5966
- const remainingPages = maxPages - this.pageCount;
5967
- if (remainingPages <= 0) {
5968
- break;
6314
+ for (const err of processed.errors ?? []) {
6315
+ logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`);
5969
6316
  }
5970
- const batchSize = Math.min(
5971
- maxConcurrency,
5972
- // Use variable
5973
- remainingPages,
5974
- queue.length
5975
- );
5976
- const batch = queue.splice(0, batchSize);
5977
- baseUrl = this.canonicalBaseUrl ?? baseUrl;
5978
- const newUrls = await this.processBatch(
5979
- batch,
5980
- baseUrl,
5981
- options,
5982
- progressCallback,
5983
- signal
5984
- );
5985
- queue.push(...newUrls);
6317
+ const parsedUrl = new URL(currentUrl);
6318
+ const wikiInfo = this.parseGitHubWikiUrl(currentUrl);
6319
+ const wikiPagePath = parsedUrl.pathname.replace(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`, "").replace(/^\//, "");
6320
+ const pageTitle = wikiPagePath || "Home";
6321
+ const links = processed.links || [];
6322
+ const wikiLinks = links.filter((link) => {
6323
+ if (!link || link.trim() === "" || link === "invalid-url" || link === "not-a-url-at-all") {
6324
+ return false;
6325
+ }
6326
+ return true;
6327
+ }).map((link) => {
6328
+ try {
6329
+ return new URL(link, currentUrl).href;
6330
+ } catch {
6331
+ return null;
6332
+ }
6333
+ }).filter((link) => link !== null).filter((link) => {
6334
+ try {
6335
+ const linkUrl = new URL(link);
6336
+ return linkUrl.hostname === parsedUrl.hostname && linkUrl.pathname.startsWith(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`);
6337
+ } catch {
6338
+ return false;
6339
+ }
6340
+ });
6341
+ return {
6342
+ url: currentUrl,
6343
+ title: pageTitle,
6344
+ etag: rawContent.etag,
6345
+ lastModified: rawContent.lastModified,
6346
+ contentType: rawContent.mimeType,
6347
+ content: processed,
6348
+ links: wikiLinks,
6349
+ status: FetchStatus.SUCCESS
6350
+ };
6351
+ } catch (error) {
6352
+ logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`);
6353
+ return { url: currentUrl, links: [], status: FetchStatus.SUCCESS };
5986
6354
  }
5987
6355
  }
5988
6356
  /**
5989
- * Cleanup resources used by this strategy.
5990
- * Default implementation does nothing - override in derived classes as needed.
6357
+ * Cleanup resources used by this processor.
5991
6358
  */
5992
6359
  async cleanup() {
6360
+ await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
5993
6361
  }
5994
6362
  }
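A worked example of the wiki scoping implemented above, using a hypothetical URL:

    // For "https://github.com/owner/repo/wiki/Getting-Started":
    //   parseGitHubWikiUrl(url) -> { owner: "owner", repo: "repo" }
    //   page title              -> "Getting-Started" (a bare ".../wiki" URL maps to "Home")
    // Only links on the same host whose path stays under /owner/repo/wiki are followed.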
5995
- class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6363
+ class GitHubScraperStrategy extends BaseScraperStrategy {
5996
6364
  httpFetcher = new HttpFetcher();
5997
- pipelines;
5998
- resolvedBranch;
5999
- // Cache the resolved default branch
6000
- constructor() {
6001
- super();
6002
- this.pipelines = PipelineFactory$1.createStandardPipelines();
6003
- }
6365
+ wikiProcessor = new GitHubWikiProcessor();
6366
+ repoProcessor = new GitHubRepoProcessor();
6004
6367
  canHandle(url) {
6005
- const { hostname } = new URL(url);
6006
- return ["github.com", "www.github.com"].includes(hostname);
6007
- }
6008
- /**
6009
- * Override shouldProcessUrl to handle github-file:// URLs specially.
6010
- * These URLs bypass scope checking since they're internal file references.
6011
- */
6012
- shouldProcessUrl(url, options) {
6013
6368
  if (url.startsWith("github-file://")) {
6014
- const filePath = url.replace("github-file://", "");
6015
- return shouldIncludeUrl(filePath, options.includePatterns, options.excludePatterns);
6369
+ return true;
6370
+ }
6371
+ try {
6372
+ const parsedUrl = new URL(url);
6373
+ const { hostname, pathname } = parsedUrl;
6374
+ if (!["github.com", "www.github.com"].includes(hostname)) {
6375
+ return false;
6376
+ }
6377
+ const baseMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
6378
+ if (baseMatch) {
6379
+ return true;
6380
+ }
6381
+ const treeMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/tree\//);
6382
+ if (treeMatch) {
6383
+ return true;
6384
+ }
6385
+ const blobMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/blob\//);
6386
+ if (blobMatch) {
6387
+ return true;
6388
+ }
6389
+ return false;
6390
+ } catch {
6391
+ return false;
6016
6392
  }
6017
- return super.shouldProcessUrl(url, options);
6018
6393
  }
6019
6394
  /**
6020
6395
  * Parses a GitHub URL to extract repository information.
@@ -6028,20 +6403,19 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6028
6403
  const [, owner, repo] = match;
6029
6404
  const segments = parsedUrl.pathname.split("/").filter(Boolean);
6030
6405
  if (segments.length >= 4 && segments[2] === "blob") {
6031
- const branch2 = segments[3];
6406
+ const branch = segments[3];
6032
6407
  const filePath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
6033
- return { owner, repo, branch: branch2, filePath, isBlob: true };
6408
+ return { owner, repo, branch, filePath, isBlob: true };
6034
6409
  }
6035
- if (segments.length < 4 || segments[2] !== "tree") {
6036
- return { owner, repo };
6410
+ if (segments.length >= 4 && segments[2] === "tree") {
6411
+ const branch = segments[3];
6412
+ const subPath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
6413
+ return { owner, repo, branch, subPath };
6037
6414
  }
6038
- const branch = segments[3];
6039
- const subPath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
6040
- return { owner, repo, branch, subPath };
6415
+ return { owner, repo };
6041
6416
  }
6042
6417
  /**
6043
6418
  * Fetches the repository tree structure from GitHub API.
6044
- * Uses 'HEAD' to get the default branch if no branch is specified.
6045
6419
  */
6046
6420
  async fetchRepositoryTree(repoInfo, signal) {
6047
6421
  const { owner, repo, branch } = repoInfo;
@@ -6060,7 +6434,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6060
6434
  targetBranch = "main";
6061
6435
  }
6062
6436
  }
6063
- this.resolvedBranch = targetBranch;
6064
6437
  const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
6065
6438
  logger.debug(`Fetching repository tree: ${treeUrl}`);
6066
6439
  const rawContent = await this.httpFetcher.fetch(treeUrl, { signal });
@@ -6082,14 +6455,12 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6082
6455
  }
6083
6456
  const path2 = item.path;
6084
6457
  const textExtensions = [
6085
- // Documentation
6086
6458
  ".md",
6087
6459
  ".mdx",
6088
6460
  ".txt",
6089
6461
  ".rst",
6090
6462
  ".adoc",
6091
6463
  ".asciidoc",
6092
- // Web technologies
6093
6464
  ".html",
6094
6465
  ".htm",
6095
6466
  ".xml",
@@ -6097,7 +6468,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6097
6468
  ".scss",
6098
6469
  ".sass",
6099
6470
  ".less",
6100
- // Programming languages
6101
6471
  ".js",
6102
6472
  ".jsx",
6103
6473
  ".ts",
@@ -6133,7 +6503,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6133
6503
  ".ps1",
6134
6504
  ".bat",
6135
6505
  ".cmd",
6136
- // Configuration and data
6137
6506
  ".json",
6138
6507
  ".yaml",
6139
6508
  ".yml",
@@ -6147,7 +6516,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6147
6516
  ".dockerignore",
6148
6517
  ".gitattributes",
6149
6518
  ".editorconfig",
6150
- // Build and package management
6151
6519
  ".gradle",
6152
6520
  ".pom",
6153
6521
  ".sbt",
@@ -6156,10 +6524,7 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6156
6524
  ".make",
6157
6525
  ".dockerfile",
6158
6526
  ".mod",
6159
- // Go modules (go.mod)
6160
6527
  ".sum",
6161
- // Go checksums (go.sum)
6162
- // Other text formats
6163
6528
  ".sql",
6164
6529
  ".graphql",
6165
6530
  ".gql",
@@ -6172,20 +6537,16 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6172
6537
  ];
6173
6538
  const pathLower = path2.toLowerCase();
6174
6539
  const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext));
6175
- const hasCompoundExtension = pathLower.includes(".env.") || // .env.example, .env.local, etc.
6176
- pathLower.endsWith(".env") || pathLower.includes(".config.") || // webpack.config.js, etc.
6177
- pathLower.includes(".lock");
6540
+ const hasCompoundExtension = pathLower.includes(".env.") || pathLower.endsWith(".env") || pathLower.includes(".config.") || pathLower.includes(".lock");
6178
6541
  const fileName = path2.split("/").pop() || "";
6179
6542
  const fileNameLower = fileName.toLowerCase();
6180
6543
  const commonTextFiles = [
6181
- // Documentation files without extensions
6182
6544
  "readme",
6183
6545
  "license",
6184
6546
  "changelog",
6185
6547
  "contributing",
6186
6548
  "authors",
6187
6549
  "maintainers",
6188
- // Build files without extensions
6189
6550
  "dockerfile",
6190
6551
  "makefile",
6191
6552
  "rakefile",
@@ -6193,374 +6554,125 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6193
6554
  "podfile",
6194
6555
  "cartfile",
6195
6556
  "brewfile",
6196
- "procfile",
6197
- "vagrantfile",
6198
- "gulpfile",
6199
- "gruntfile",
6200
- // Configuration files (dotfiles)
6201
- ".prettierrc",
6202
- ".eslintrc",
6203
- ".babelrc",
6204
- ".nvmrc",
6205
- ".npmrc"
6206
- ];
6207
- const isCommonTextFile = commonTextFiles.some((name2) => {
6208
- if (name2.startsWith(".")) {
6209
- return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6210
- }
6211
- return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6212
- });
6213
- if (!hasTextExtension && !hasCompoundExtension && !isCommonTextFile) {
6214
- return false;
6215
- }
6216
- return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
6217
- }
6218
- /**
6219
- * Fetches the raw content of a file from GitHub.
6220
- */
6221
- async fetchFileContent(repoInfo, filePath, signal) {
6222
- const { owner, repo } = repoInfo;
6223
- const branch = this.resolvedBranch || repoInfo.branch || "main";
6224
- const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
6225
- const rawContent = await this.httpFetcher.fetch(rawUrl, { signal });
6226
- const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
6227
- if (detectedMimeType && rawContent.mimeType === "text/plain") {
6228
- return {
6229
- ...rawContent,
6230
- mimeType: detectedMimeType
6231
- };
6232
- }
6233
- return rawContent;
6234
- }
6235
- async processItem(item, options, _progressCallback, signal) {
6236
- const repoInfo = this.parseGitHubUrl(options.url);
6237
- if (item.depth === 0) {
6238
- if ("isBlob" in repoInfo && repoInfo.isBlob) {
6239
- if (repoInfo.filePath) {
6240
- logger.info(
6241
- `📄 Processing single file: ${repoInfo.owner}/${repoInfo.repo}/${repoInfo.filePath}`
6242
- );
6243
- return { links: [`github-file://${repoInfo.filePath}`] };
6244
- } else {
6245
- logger.warn(
6246
- `⚠️ Blob URL without file path: ${options.url}. No files to process.`
6247
- );
6248
- return { links: [] };
6249
- }
6250
- }
6251
- logger.info(
6252
- `🗂️ Discovering repository structure for ${repoInfo.owner}/${repoInfo.repo}`
6253
- );
6254
- const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
6255
- const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
6256
- logger.info(
6257
- `📁 Found ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
6258
- );
6259
- const links = fileItems.map((treeItem) => `github-file://${treeItem.path}`);
6260
- return { links };
6261
- }
6262
- if (item.url.startsWith("github-file://")) {
6263
- const filePath = item.url.replace("github-file://", "");
6264
- logger.info(
6265
- `🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`
6266
- );
6267
- const rawContent = await this.fetchFileContent(repoInfo, filePath, signal);
6268
- let processed;
6269
- for (const pipeline of this.pipelines) {
6270
- if (pipeline.canProcess(rawContent)) {
6271
- logger.debug(
6272
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
6273
- );
6274
- const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6275
- processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
6276
- break;
6277
- }
6278
- }
6279
- if (!processed) {
6280
- logger.warn(
6281
- `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
6282
- );
6283
- return { document: void 0, links: [] };
6284
- }
6285
- for (const err of processed.errors) {
6286
- logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
6287
- }
6288
- const githubUrl = `https://github.com/${repoInfo.owner}/${repoInfo.repo}/blob/${this.resolvedBranch || repoInfo.branch || "main"}/${filePath}`;
6289
- const processedTitle = processed.metadata.title;
6290
- const hasValidTitle = typeof processedTitle === "string" && processedTitle.trim() !== "";
6291
- const fallbackTitle = filePath.split("/").pop() || "Untitled";
6292
- return {
6293
- document: {
6294
- content: typeof processed.textContent === "string" ? processed.textContent : "",
6295
- metadata: {
6296
- url: githubUrl,
6297
- title: hasValidTitle ? processedTitle : fallbackTitle,
6298
- library: options.library,
6299
- version: options.version
6300
- },
6301
- contentType: rawContent.mimeType
6302
- // Preserve the detected MIME type
6303
- },
6304
- links: []
6305
- // Always return empty links array for individual files
6306
- };
6307
- }
6308
- return { document: void 0, links: [] };
6309
- }
6310
- /**
6311
- * Normalize a path by removing leading and trailing slashes.
6312
- */
6313
- normalizePath(path2) {
6314
- return path2.replace(/^\/+/, "").replace(/\/+$/, "");
6315
- }
6316
- isWithinSubPath(path2, subPath) {
6317
- if (!subPath) {
6318
- return true;
6319
- }
6320
- const trimmedSubPath = this.normalizePath(subPath);
6321
- if (trimmedSubPath.length === 0) {
6322
- return true;
6323
- }
6324
- const normalizedPath = this.normalizePath(path2);
6325
- if (normalizedPath === trimmedSubPath) {
6326
- return true;
6327
- }
6328
- return normalizedPath.startsWith(`${trimmedSubPath}/`);
6329
- }
6330
- async scrape(options, progressCallback, signal) {
6331
- const url = new URL(options.url);
6332
- if (!url.hostname.includes("github.com")) {
6333
- throw new Error("URL must be a GitHub URL");
6334
- }
6335
- return super.scrape(options, progressCallback, signal);
6336
- }
6337
- /**
6338
- * Cleanup resources used by this strategy, specifically the pipeline browser instances.
6339
- */
6340
- async cleanup() {
6341
- await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
6342
- }
6343
- }
6344
- class GitHubWikiScraperStrategy extends BaseScraperStrategy {
6345
- httpFetcher = new HttpFetcher();
6346
- pipelines;
6347
- constructor() {
6348
- super();
6349
- this.pipelines = PipelineFactory$1.createStandardPipelines();
6350
- }
6351
- canHandle(url) {
6352
- try {
6353
- const parsedUrl = new URL(url);
6354
- const { hostname, pathname } = parsedUrl;
6355
- return ["github.com", "www.github.com"].includes(hostname) && pathname.includes("/wiki") && pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/) !== null;
6356
- } catch {
6357
- return false;
6358
- }
6359
- }
6360
- /**
6361
- * Parses a GitHub wiki URL to extract repository information.
6362
- */
6363
- parseGitHubWikiUrl(url) {
6364
- const parsedUrl = new URL(url);
6365
- const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/);
6366
- if (!match) {
6367
- throw new Error(`Invalid GitHub wiki URL: ${url}`);
6368
- }
6369
- const [, owner, repo] = match;
6370
- return { owner, repo };
6371
- }
6372
- /**
6373
- * Override shouldProcessUrl to only process URLs within the wiki scope.
6374
- */
6375
- shouldProcessUrl(url, options) {
6376
- try {
6377
- const parsedUrl = new URL(url);
6378
- const wikiInfo = this.parseGitHubWikiUrl(options.url);
6379
- const expectedWikiPath = `/${wikiInfo.owner}/${wikiInfo.repo}/wiki`;
6380
- if (!parsedUrl.pathname.startsWith(expectedWikiPath)) {
6381
- return false;
6382
- }
6383
- const wikiPagePath = parsedUrl.pathname.replace(expectedWikiPath, "").replace(/^\//, "");
6384
- return shouldIncludeUrl(
6385
- wikiPagePath || "Home",
6386
- options.includePatterns,
6387
- options.excludePatterns
6388
- );
6389
- } catch {
6390
- return false;
6391
- }
6392
- }
6393
- async processItem(item, options, _progressCallback, signal) {
6394
- const currentUrl = item.url;
6395
- logger.info(
6396
- `📖 Processing wiki page ${this.pageCount}/${options.maxPages}: ${currentUrl}`
6397
- );
6398
- try {
6399
- const rawContent = await this.httpFetcher.fetch(currentUrl, { signal });
6400
- let processed;
6401
- for (const pipeline of this.pipelines) {
6402
- if (pipeline.canProcess(rawContent)) {
6403
- logger.debug(
6404
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`
6405
- );
6406
- const wikiOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6407
- processed = await pipeline.process(rawContent, wikiOptions, this.httpFetcher);
6408
- break;
6409
- }
6410
- }
6411
- if (!processed) {
6412
- logger.warn(
6413
- `⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`
6414
- );
6415
- return { document: void 0, links: [] };
6416
- }
6417
- for (const err of processed.errors) {
6418
- logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`);
6557
+ "procfile",
6558
+ "vagrantfile",
6559
+ "gulpfile",
6560
+ "gruntfile",
6561
+ ".prettierrc",
6562
+ ".eslintrc",
6563
+ ".babelrc",
6564
+ ".nvmrc",
6565
+ ".npmrc"
6566
+ ];
6567
+ const isCommonTextFile = commonTextFiles.some((name2) => {
6568
+ if (name2.startsWith(".")) {
6569
+ return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6419
6570
  }
6420
- const parsedUrl = new URL(currentUrl);
6421
- const wikiInfo = this.parseGitHubWikiUrl(currentUrl);
6422
- const wikiPagePath = parsedUrl.pathname.replace(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`, "").replace(/^\//, "");
6423
- const pageTitle = wikiPagePath || "Home";
6424
- const document2 = {
6425
- content: typeof processed.textContent === "string" ? processed.textContent : "",
6426
- metadata: {
6427
- url: currentUrl,
6428
- title: typeof processed.metadata.title === "string" && processed.metadata.title.trim() !== "" ? processed.metadata.title : pageTitle,
6429
- library: options.library,
6430
- version: options.version
6431
- },
6432
- contentType: rawContent.mimeType
6433
- };
6434
- const links = processed.links || [];
6435
- const wikiLinks = links.filter((link) => {
6436
- if (!link || link.trim() === "" || link === "invalid-url" || link === "not-a-url-at-all") {
6437
- return false;
6438
- }
6439
- return true;
6440
- }).map((link) => {
6441
- try {
6442
- return new URL(link, currentUrl).href;
6443
- } catch {
6444
- return null;
6445
- }
6446
- }).filter((link) => link !== null).filter((link) => {
6447
- try {
6448
- const linkUrl = new URL(link);
6449
- return linkUrl.hostname === parsedUrl.hostname && linkUrl.pathname.startsWith(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`);
6450
- } catch {
6451
- return false;
6452
- }
6453
- });
6454
- return { document: document2, links: wikiLinks };
6455
- } catch (error) {
6456
- logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`);
6457
- return { document: void 0, links: [] };
6458
- }
6459
- }
6460
- async scrape(options, progressCallback, signal) {
6461
- const url = new URL(options.url);
6462
- if (!url.hostname.includes("github.com") || !url.pathname.includes("/wiki")) {
6463
- throw new Error("URL must be a GitHub wiki URL");
6571
+ return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6572
+ });
6573
+ if (hasTextExtension || hasCompoundExtension || isCommonTextFile) {
6574
+ return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
6464
6575
  }
6465
- let startUrl = options.url;
6466
- if (url.pathname.endsWith("/wiki") || url.pathname.endsWith("/wiki/")) {
6467
- startUrl = url.pathname.endsWith("/") ? `${options.url}Home` : `${options.url}/Home`;
6576
+ const mimeType = mime.getType(path2);
6577
+ if (mimeType?.startsWith("text/")) {
6578
+ logger.debug(`Including file with text MIME type: ${path2} (${mimeType})`);
6579
+ return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
6468
6580
  }
6469
- const wikiOptions = { ...options, url: startUrl };
6470
- return super.scrape(wikiOptions, progressCallback, signal);
6581
+ return false;
6471
6582
  }
6472
6583
  /**
6473
- * Cleanup resources used by this strategy.
6584
+ * Checks if a path is within the specified subpath.
6474
6585
  */
6475
- async cleanup() {
6476
- await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
6586
+ isWithinSubPath(path2, subPath) {
6587
+ if (!subPath) {
6588
+ return true;
6589
+ }
6590
+ const trimmedSubPath = subPath.replace(/^\/+/, "").replace(/\/+$/, "");
6591
+ if (trimmedSubPath.length === 0) {
6592
+ return true;
6593
+ }
6594
+ const normalizedPath = path2.replace(/^\/+/, "").replace(/\/+$/, "");
6595
+ if (normalizedPath === trimmedSubPath) {
6596
+ return true;
6597
+ }
6598
+ return normalizedPath.startsWith(`${trimmedSubPath}/`);
6477
6599
  }
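Worked examples for the subpath check above (illustrative inputs):

    // isWithinSubPath("docs/guide.md", "docs")      -> true  (prefix ends at a "/" boundary)
    // isWithinSubPath("docs", "/docs/")             -> true  (slashes are trimmed before comparing)
    // isWithinSubPath("documentation/a.md", "docs") -> false (a plain string prefix is not enough)
    // isWithinSubPath("anything.md", undefined)     -> true  (no subpath restriction)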
6478
- }
6479
- class GitHubScraperStrategy {
6480
- repoStrategy = new GitHubRepoScraperStrategy();
6481
- wikiStrategy = new GitHubWikiScraperStrategy();
6482
- canHandle(url) {
6600
+ async processItem(item, options, signal) {
6601
+ if (item.url.startsWith("github-file://")) {
6602
+ logger.info(
6603
+ `🗑️ Legacy github-file:// URL detected, marking as deleted: ${item.url}`
6604
+ );
6605
+ return {
6606
+ url: item.url,
6607
+ links: [],
6608
+ status: FetchStatus.NOT_FOUND
6609
+ };
6610
+ }
6483
6611
  try {
6484
- const parsedUrl = new URL(url);
6485
- const { hostname, pathname } = parsedUrl;
6486
- if (!["github.com", "www.github.com"].includes(hostname)) {
6487
- return false;
6612
+ const parsedUrl = new URL(item.url);
6613
+ if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) {
6614
+ return await this.wikiProcessor.process(item, options, signal);
6488
6615
  }
6489
- const pathMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
6490
- return pathMatch !== null;
6491
6616
  } catch {
6492
- return false;
6493
- }
6494
- }
6495
- async scrape(options, progressCallback, signal) {
6496
- const url = new URL(options.url);
6497
- if (!url.hostname.includes("github.com")) {
6498
- throw new Error("URL must be a GitHub URL");
6499
6617
  }
6500
- const pathMatch = url.pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
6501
- if (!pathMatch) {
6502
- throw new Error("URL must be a base GitHub repository URL");
6503
- }
6504
- const [, owner, repo] = pathMatch;
6505
- logger.info(`🚀 Starting comprehensive GitHub scraping for ${owner}/${repo}`);
6506
- let totalPagesDiscovered = 0;
6507
- let wikiPagesScraped = 0;
6508
- let wikiCompleted = false;
6509
- let repoCompleted = false;
6510
- const mergedProgressCallback = async (progress) => {
6511
- if (!wikiCompleted) {
6512
- totalPagesDiscovered = progress.totalDiscovered;
6513
- wikiPagesScraped = progress.pagesScraped;
6514
- } else if (!repoCompleted) {
6515
- progress = {
6516
- ...progress,
6517
- pagesScraped: wikiPagesScraped + progress.pagesScraped,
6518
- totalPages: wikiPagesScraped + progress.totalPages,
6519
- totalDiscovered: totalPagesDiscovered + progress.totalDiscovered
6618
+ if (item.depth === 0) {
6619
+ const repoInfo = this.parseGitHubUrl(options.url);
6620
+ const { owner, repo } = repoInfo;
6621
+ logger.debug(`Discovering GitHub repository ${owner}/${repo}`);
6622
+ const discoveredLinks = [];
6623
+ if ("isBlob" in repoInfo && repoInfo.isBlob && repoInfo.filePath) {
6624
+ const { branch = "main", filePath } = repoInfo;
6625
+ logger.debug(
6626
+ `Single file URL detected: ${owner}/${repo}/${filePath} - indexing file only`
6627
+ );
6628
+ discoveredLinks.push(
6629
+ `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`
6630
+ );
6631
+ return {
6632
+ url: item.url,
6633
+ links: discoveredLinks,
6634
+ status: FetchStatus.SUCCESS
6520
6635
  };
6521
6636
  }
6522
- await progressCallback(progress);
6523
- };
6524
- try {
6525
6637
  const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
6526
- const wikiOptions = { ...options, url: wikiUrl };
6527
- logger.info(`📖 Attempting to scrape wiki for ${owner}/${repo}`);
6528
- try {
6529
- await this.wikiStrategy.scrape(wikiOptions, mergedProgressCallback, signal);
6530
- wikiCompleted = true;
6531
- logger.info(
6532
- `✅ Completed wiki scraping for ${owner}/${repo} (${wikiPagesScraped} pages)`
6533
- );
6534
- } catch (error) {
6535
- wikiCompleted = true;
6536
- logger.info(`ℹ️ Wiki not available or accessible for ${owner}/${repo}: ${error}`);
6537
- }
6538
- const maxPages = options.maxPages || 1e3;
6539
- const remainingPages = Math.max(0, maxPages - wikiPagesScraped);
6540
- if (remainingPages > 0) {
6541
- logger.info(
6542
- `📂 Scraping repository code for ${owner}/${repo} (${remainingPages} pages remaining)`
6543
- );
6544
- const repoOptions = { ...options, maxPages: remainingPages };
6545
- await this.repoStrategy.scrape(repoOptions, mergedProgressCallback, signal);
6546
- repoCompleted = true;
6547
- logger.info(`✅ Completed repository code scraping for ${owner}/${repo}`);
6548
- } else {
6549
- logger.info(
6550
- `ℹ️ Skipping repository code scraping - page limit reached with wiki content`
6551
- );
6638
+ discoveredLinks.push(wikiUrl);
6639
+ logger.debug(`Discovered wiki URL: ${wikiUrl}`);
6640
+ const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
6641
+ const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
6642
+ logger.debug(
6643
+ `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
6644
+ );
6645
+ const fileUrls = fileItems.map(
6646
+ (treeItem) => `https://github.com/${owner}/${repo}/blob/${resolvedBranch}/${treeItem.path}`
6647
+ );
6648
+ discoveredLinks.push(...fileUrls);
6649
+ logger.debug(
6650
+ `Discovery complete: ${fileUrls.length} repo file(s) + 1 wiki URL = ${discoveredLinks.length} total URLs`
6651
+ );
6652
+ return { url: item.url, links: discoveredLinks, status: FetchStatus.SUCCESS };
6653
+ }
6654
+ try {
6655
+ const parsedUrl = new URL(item.url);
6656
+ if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) {
6657
+ logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
6658
+ return await this.repoProcessor.process(item, options, signal);
6552
6659
  }
6553
- logger.info(`🎉 Comprehensive GitHub scraping completed for ${owner}/${repo}`);
6554
6660
  } catch (error) {
6555
- logger.error(`❌ GitHub scraping failed for ${owner}/${repo}: ${error}`);
6556
- throw error;
6661
+ logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
6662
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
6557
6663
  }
6664
+ logger.debug(`No further processing for URL at depth ${item.depth}: ${item.url}`);
6665
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
6666
+ }
6667
+ async scrape(options, progressCallback, signal) {
6668
+ const url = new URL(options.url);
6669
+ if (!url.hostname.includes("github.com")) {
6670
+ throw new Error("URL must be a GitHub URL");
6671
+ }
6672
+ await super.scrape(options, progressCallback, signal);
6558
6673
  }
6559
- /**
6560
- * Cleanup resources used by both underlying strategies.
6561
- */
6562
6674
  async cleanup() {
6563
- await Promise.allSettled([this.repoStrategy.cleanup(), this.wikiStrategy.cleanup()]);
6675
+ await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
6564
6676
  }
6565
6677
  }
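With the consolidated strategy above, a single depth-0 pass discovers both the wiki and the repository files; a sketch of the resulting queue for a hypothetical repository:

    // Discovery for https://github.com/owner/repo emits:
    //   https://github.com/owner/repo/wiki
    //   https://github.com/owner/repo/blob/<resolvedBranch>/<path>   (one per processable file)
    // Wiki URLs are then routed to GitHubWikiProcessor, blob URLs to GitHubRepoProcessor,
    // and legacy github-file:// entries from older indexes come back as NOT_FOUND.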
6566
6678
  class LocalFileStrategy extends BaseScraperStrategy {
@@ -6573,23 +6685,41 @@ class LocalFileStrategy extends BaseScraperStrategy {
6573
6685
  canHandle(url) {
6574
6686
  return url.startsWith("file://");
6575
6687
  }
6576
- async processItem(item, options, _progressCallback, _signal) {
6688
+ async processItem(item, options, _signal) {
6577
6689
  let filePath = item.url.replace(/^file:\/\/\/?/, "");
6578
6690
  filePath = decodeURIComponent(filePath);
6579
6691
  if (!filePath.startsWith("/") && process.platform !== "win32") {
6580
6692
  filePath = `/${filePath}`;
6581
6693
  }
6582
- const stats = await fs$1.stat(filePath);
6694
+ let stats;
6695
+ try {
6696
+ stats = await fs$1.stat(filePath);
6697
+ } catch (error) {
6698
+ if (error.code === "ENOENT") {
6699
+ logger.info(`✓ File deleted or not available: ${filePath}`);
6700
+ return {
6701
+ url: item.url,
6702
+ links: [],
6703
+ status: FetchStatus.NOT_FOUND
6704
+ };
6705
+ }
6706
+ throw error;
6707
+ }
6583
6708
  if (stats.isDirectory()) {
6584
6709
  const contents = await fs$1.readdir(filePath);
6585
6710
  const links = contents.map((name2) => `file://${path.join(filePath, name2)}`).filter((url) => this.shouldProcessUrl(url, options));
6586
- return { links };
6711
+ return { url: item.url, links, status: FetchStatus.SUCCESS };
6712
+ }
6713
+ const rawContent = await this.fileFetcher.fetch(item.url, {
6714
+ etag: item.etag
6715
+ });
6716
+ if (rawContent.status === FetchStatus.NOT_MODIFIED) {
6717
+ logger.debug(`✓ File unchanged: ${filePath}`);
6718
+ return { url: rawContent.source, links: [], status: FetchStatus.NOT_MODIFIED };
6587
6719
  }
6588
- logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
6589
- const rawContent = await this.fileFetcher.fetch(item.url);
6590
6720
  let processed;
6591
6721
  for (const pipeline of this.pipelines) {
6592
- if (pipeline.canProcess(rawContent)) {
6722
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
6593
6723
  logger.debug(
6594
6724
  `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
6595
6725
  );
@@ -6601,22 +6731,22 @@ class LocalFileStrategy extends BaseScraperStrategy {
6601
6731
  logger.warn(
6602
6732
  `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
6603
6733
  );
6604
- return { document: void 0, links: [] };
6734
+ return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
6605
6735
  }
6606
- for (const err of processed.errors) {
6736
+ for (const err of processed.errors ?? []) {
6607
6737
  logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
6608
6738
  }
6739
+ const filename = path.basename(filePath);
6740
+ const title = processed.title?.trim() || filename || null;
6609
6741
  return {
6610
- document: {
6611
- content: typeof processed.textContent === "string" ? processed.textContent : "",
6612
- contentType: rawContent.mimeType,
6613
- metadata: {
6614
- url: rawContent.source,
6615
- title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
6616
- library: options.library,
6617
- version: options.version
6618
- }
6619
- }
6742
+ url: rawContent.source,
6743
+ title,
6744
+ etag: rawContent.etag,
6745
+ lastModified: rawContent.lastModified,
6746
+ contentType: rawContent.mimeType,
6747
+ content: processed,
6748
+ links: [],
6749
+ status: FetchStatus.SUCCESS
6620
6750
  };
6621
6751
  }
6622
6752
  /**
@@ -6652,19 +6782,32 @@ class WebScraperStrategy extends BaseScraperStrategy {
6652
6782
  * @param signal - Optional abort signal for request cancellation.
6653
6783
  * @returns An object containing the processed document and extracted links.
6654
6784
  */
6655
- async processItem(item, options, _progressCallback, signal) {
6785
+ async processItem(item, options, signal) {
6656
6786
  const { url } = item;
6657
6787
  try {
6788
+ if (item.etag) {
6789
+ logger.debug(`Processing ${url} with stored ETag: ${item.etag}`);
6790
+ }
6658
6791
  const fetchOptions = {
6659
6792
  signal,
6660
6793
  followRedirects: options.followRedirects,
6661
- headers: options.headers
6794
+ headers: options.headers,
6662
6795
  // Forward custom headers
6796
+ etag: item.etag
6797
+ // Pass ETag for conditional requests
6663
6798
  };
6664
6799
  const rawContent = await this.fetcher.fetch(url, fetchOptions);
6800
+ logger.debug(
6801
+ `Fetch result for ${url}: status=${rawContent.status}, etag=${rawContent.etag || "none"}`
6802
+ );
6803
+ if (rawContent.status !== FetchStatus.SUCCESS) {
6804
+ logger.debug(`Skipping pipeline for ${url} due to status: ${rawContent.status}`);
6805
+ return { url: rawContent.source, links: [], status: rawContent.status };
6806
+ }
6665
6807
  let processed;
6666
6808
  for (const pipeline of this.pipelines) {
6667
- if (pipeline.canProcess(rawContent)) {
6809
+ const contentBuffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
6810
+ if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
6668
6811
  logger.debug(
6669
6812
  `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`
6670
6813
  );
@@ -6676,40 +6819,47 @@ class WebScraperStrategy extends BaseScraperStrategy {
6676
6819
  logger.warn(
6677
6820
  `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
6678
6821
  );
6679
- return { document: void 0, links: [] };
6822
+ return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
6680
6823
  }
6681
- for (const err of processed.errors) {
6824
+ for (const err of processed.errors ?? []) {
6682
6825
  logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
6683
6826
  }
6684
6827
  if (!processed.textContent || !processed.textContent.trim()) {
6685
6828
  logger.warn(
6686
6829
  `⚠️ No processable content found for ${url} after pipeline execution.`
6687
6830
  );
6688
- return { document: void 0, links: processed.links };
6831
+ return {
6832
+ url: rawContent.source,
6833
+ links: processed.links,
6834
+ status: FetchStatus.SUCCESS
6835
+ };
6689
6836
  }
6690
- const baseUrl = item.depth === 0 ? new URL(rawContent.source) : this.canonicalBaseUrl ?? new URL(options.url);
6691
- const filteredLinks = processed.links.filter((link) => {
6837
+ if (item.depth === 0) {
6838
+ this.canonicalBaseUrl = new URL(rawContent.source);
6839
+ }
6840
+ const filteredLinks = processed.links?.filter((link) => {
6692
6841
  try {
6693
6842
  const targetUrl = new URL(link);
6694
- const scope = options.scope || "subpages";
6695
- return isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
6843
+ if (!this.shouldProcessUrl(targetUrl.href, options)) {
6844
+ return false;
6845
+ }
6846
+ if (this.shouldFollowLinkFn) {
6847
+ const baseUrl = this.canonicalBaseUrl ?? new URL(options.url);
6848
+ return this.shouldFollowLinkFn(baseUrl, targetUrl);
6849
+ }
6850
+ return true;
6696
6851
  } catch {
6697
6852
  return false;
6698
6853
  }
6699
- });
6854
+ }) ?? [];
6700
6855
  return {
6701
- document: {
6702
- content: processed.textContent,
6703
- metadata: {
6704
- url,
6705
- title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
6706
- library: options.library,
6707
- version: options.version,
6708
- ...processed.metadata
6709
- }
6710
- },
6856
+ url: rawContent.source,
6857
+ etag: rawContent.etag,
6858
+ lastModified: rawContent.lastModified,
6859
+ contentType: processed.contentType || rawContent.mimeType,
6860
+ content: processed,
6711
6861
  links: filteredLinks,
6712
- finalUrl: rawContent.source
6862
+ status: FetchStatus.SUCCESS
6713
6863
  };
6714
6864
  } catch (error) {
6715
6865
  logger.error(`❌ Failed processing page ${url}: ${error}`);
@@ -6786,7 +6936,6 @@ class ScraperRegistry {
6786
6936
  this.strategies = [
6787
6937
  new NpmScraperStrategy(),
6788
6938
  new PyPiScraperStrategy(),
6789
- new GitHubWikiScraperStrategy(),
6790
6939
  new GitHubScraperStrategy(),
6791
6940
  new WebScraperStrategy(),
6792
6941
  new LocalFileStrategy()
@@ -6848,55 +6997,64 @@ class PipelineWorker {
6848
6997
  * @param callbacks - Callbacks provided by the manager for reporting.
6849
6998
  */
6850
6999
  async executeJob(job, callbacks) {
6851
- const {
6852
- id: jobId,
6853
- library,
6854
- version: version2,
6855
- sourceUrl,
6856
- scraperOptions,
6857
- abortController
6858
- } = job;
7000
+ const { id: jobId, library, version: version2, scraperOptions, abortController } = job;
6859
7001
  const signal = abortController.signal;
6860
7002
  logger.debug(`[${jobId}] Worker starting job for ${library}@${version2}`);
6861
7003
  try {
6862
- await this.store.removeAllDocuments(library, version2);
6863
- logger.info(
6864
- `💾 Cleared store for ${library}@${version2 || "[no version]"} before scraping.`
6865
- );
6866
- const runtimeOptions = {
6867
- url: sourceUrl ?? "",
6868
- library,
6869
- version: version2,
6870
- ...scraperOptions
6871
- };
7004
+ if (!scraperOptions.isRefresh) {
7005
+ await this.store.removeAllDocuments(library, version2);
7006
+ logger.info(
7007
+ `💾 Cleared store for ${library}@${version2 || "[no version]"} before scraping.`
7008
+ );
7009
+ } else {
7010
+ logger.info(
7011
+ `🔄 Refresh operation - preserving existing data for ${library}@${version2 || "[no version]"}.`
7012
+ );
7013
+ }
6872
7014
  await this.scraperService.scrape(
6873
- runtimeOptions,
7015
+ scraperOptions,
6874
7016
  async (progress) => {
6875
7017
  if (signal.aborted) {
6876
7018
  throw new CancellationError("Job cancelled during scraping progress");
6877
7019
  }
6878
7020
  await callbacks.onJobProgress?.(job, progress);
6879
- if (progress.document) {
7021
+ if (progress.deleted && progress.pageId) {
6880
7022
  try {
6881
- await this.store.addDocument(library, version2, {
6882
- pageContent: progress.document.content,
6883
- metadata: {
6884
- ...progress.document.metadata,
6885
- mimeType: progress.document.contentType
6886
- // Pass contentType as mimeType in metadata
6887
- }
6888
- });
7023
+ await this.store.deletePage(progress.pageId);
6889
7024
  logger.debug(
6890
- `[${jobId}] Stored document: ${progress.document.metadata.url}`
7025
+ `[${jobId}] Deleted page ${progress.pageId}: ${progress.currentUrl}`
7026
+ );
7027
+ } catch (docError) {
7028
+ logger.error(
7029
+ `❌ [${jobId}] Failed to delete page ${progress.pageId}: ${docError}`
7030
+ );
7031
+ const error = docError instanceof Error ? docError : new Error(String(docError));
7032
+ await callbacks.onJobError?.(job, error);
7033
+ throw error;
7034
+ }
7035
+ } else if (progress.result) {
7036
+ try {
7037
+ if (progress.pageId) {
7038
+ await this.store.deletePage(progress.pageId);
7039
+ logger.debug(
7040
+ `[${jobId}] Refreshing page ${progress.pageId}: ${progress.currentUrl}`
7041
+ );
7042
+ }
7043
+ await this.store.addScrapeResult(
7044
+ library,
7045
+ version2,
7046
+ progress.depth,
7047
+ progress.result
6891
7048
  );
7049
+ logger.debug(`[${jobId}] Stored processed content: ${progress.currentUrl}`);
6892
7050
  } catch (docError) {
6893
7051
  logger.error(
6894
- `❌ [${jobId}] Failed to store document ${progress.document.metadata.url}: ${docError}`
7052
+ `❌ [${jobId}] Failed to process content ${progress.currentUrl}: ${docError}`
6895
7053
  );
6896
7054
  await callbacks.onJobError?.(
6897
7055
  job,
6898
7056
  docError instanceof Error ? docError : new Error(String(docError)),
6899
- progress.document
7057
+ progress.result
6900
7058
  );
6901
7059
  }
6902
7060
  }
@@ -7108,15 +7266,8 @@ class PipelineManager {
7108
7266
  /**
7109
7267
  * Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned).
7110
7268
  */
7111
- async enqueueJob(library, version2, options) {
7269
+ async enqueueScrapeJob(library, version2, options) {
7112
7270
  const normalizedVersion = version2 ?? "";
7113
- const {
7114
- url,
7115
- library: _library,
7116
- version: _version,
7117
- signal: _signal,
7118
- ...versionOptions
7119
- } = options;
7120
7271
  const allJobs = await this.getJobs();
7121
7272
  const duplicateJobs = allJobs.filter(
7122
7273
  (job2) => job2.library === library && (job2.version ?? "") === normalizedVersion && // Normalize null to empty string for comparison
@@ -7158,8 +7309,8 @@ class PipelineManager {
7158
7309
  progressMaxPages: 0,
7159
7310
  errorMessage: null,
7160
7311
  updatedAt: /* @__PURE__ */ new Date(),
7161
- sourceUrl: url,
7162
- scraperOptions: versionOptions
7312
+ sourceUrl: options.url,
7313
+ scraperOptions: options
7163
7314
  };
7164
7315
  this.jobMap.set(jobId, job);
7165
7316
  this.jobQueue.push(jobId);
@@ -7174,6 +7325,78 @@ class PipelineManager {
7174
7325
  }
7175
7326
  return jobId;
7176
7327
  }
7328
+ /**
7329
+ * Enqueues a refresh job for an existing library version by re-scraping all pages
7330
+ * and using ETag comparison to skip unchanged content.
7331
+ *
7332
+ * If the version was never completed (interrupted or failed scrape), performs a
7333
+ * full re-scrape from scratch instead of a refresh to ensure completeness.
7334
+ */
7335
+ async enqueueRefreshJob(library, version2) {
7336
+ const normalizedVersion = version2 ?? "";
7337
+ try {
7338
+ const versionId = await this.store.ensureVersion({
7339
+ library,
7340
+ version: normalizedVersion
7341
+ });
7342
+ const versionInfo = await this.store.getVersionById(versionId);
7343
+ if (!versionInfo) {
7344
+ throw new Error(`Version ID ${versionId} not found`);
7345
+ }
7346
+ const libraryInfo = await this.store.getLibraryById(versionInfo.library_id);
7347
+ if (!libraryInfo) {
7348
+ throw new Error(`Library ID ${versionInfo.library_id} not found`);
7349
+ }
7350
+ if (versionInfo && versionInfo.status !== VersionStatus.COMPLETED) {
7351
+ logger.info(
7352
+ `⚠️ Version ${library}@${normalizedVersion || "unversioned"} has status "${versionInfo.status}". Performing full re-scrape instead of refresh.`
7353
+ );
7354
+ return this.enqueueJobWithStoredOptions(library, normalizedVersion);
7355
+ }
7356
+ const pages = await this.store.getPagesByVersionId(versionId);
7357
+ if (pages.length > 0) {
7358
+ logger.debug(
7359
+ `Sample page data: url=${pages[0].url}, etag=${pages[0].etag}, depth=${pages[0].depth}`
7360
+ );
7361
+ }
7362
+ if (pages.length === 0) {
7363
+ throw new Error(
7364
+ `No pages found for ${library}@${normalizedVersion || "unversioned"}. Use scrape_docs to index it first.`
7365
+ );
7366
+ }
7367
+ logger.info(
7368
+ `🔄 Preparing refresh job for ${library}@${normalizedVersion || "unversioned"} with ${pages.length} page(s)`
7369
+ );
7370
+ const initialQueue = pages.map((page) => ({
7371
+ url: page.url,
7372
+ depth: page.depth ?? 0,
7373
+ // Use original depth, fallback to 0 for old data
7374
+ pageId: page.id,
7375
+ etag: page.etag
7376
+ }));
7377
+ const storedOptions = await this.store.getScraperOptions(versionId);
7378
+ const scraperOptions = {
7379
+ url: storedOptions?.sourceUrl || pages[0].url,
7380
+ // Required but not used when initialQueue is set
7381
+ library,
7382
+ version: normalizedVersion,
7383
+ ...storedOptions?.options || {},
7384
+ // Include stored options if available (spread first)
7385
+ // Override with refresh-specific options (these must come after the spread)
7386
+ initialQueue,
7387
+ // Pre-populated queue with existing pages
7388
+ isRefresh: true
7389
+ // Mark this as a refresh operation
7390
+ };
7391
+ logger.info(
7392
+ `📝 Enqueueing refresh job for ${library}@${normalizedVersion || "unversioned"}`
7393
+ );
7394
+ return this.enqueueScrapeJob(library, normalizedVersion, scraperOptions);
7395
+ } catch (error) {
7396
+ logger.error(`❌ Failed to enqueue refresh job: ${error}`);
7397
+ throw error;
7398
+ }
7399
+ }
7177
7400
  /**
7178
7401
  * Enqueues a job using stored scraper options from a previous indexing run.
7179
7402
  * If no stored options are found, throws an error.
@@ -7201,7 +7424,7 @@ class PipelineManager {
7201
7424
  logger.info(
7202
7425
  `🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${stored.sourceUrl}`
7203
7426
  );
7204
- return this.enqueueJob(library, normalizedVersion, completeOptions);
7427
+ return this.enqueueScrapeJob(library, normalizedVersion, completeOptions);
7205
7428
  } catch (error) {
7206
7429
  logger.error(`❌ Failed to enqueue job with stored options: ${error}`);
7207
7430
  throw error;
@@ -7418,13 +7641,7 @@ class PipelineManager {
7418
7641
  await this.store.updateVersionStatus(versionId, dbStatus, errorMessage);
7419
7642
  if (newStatus === PipelineJobStatus.QUEUED && job.scraperOptions) {
7420
7643
  try {
7421
- const fullOptions = {
7422
- url: job.sourceUrl ?? "",
7423
- library: job.library,
7424
- version: job.version,
7425
- ...job.scraperOptions
7426
- };
7427
- await this.store.storeScraperOptions(versionId, fullOptions);
7644
+ await this.store.storeScraperOptions(versionId, job.scraperOptions);
7428
7645
  logger.debug(
7429
7646
  `Stored scraper options for ${job.library}@${job.version}: ${job.sourceUrl}`
7430
7647
  );
@@ -7882,7 +8099,7 @@ async function createPipelineWithCallbacks(docService, options = {}) {
7882
8099
  },
7883
8100
  onJobError: async (job, error, document2) => {
7884
8101
  logger.warn(
7885
- `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.metadata.url}` : ""}: ${error.message}`
8102
+ `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.url}` : ""}: ${error.message}`
7886
8103
  );
7887
8104
  }
7888
8105
  });
@@ -8113,6 +8330,45 @@ function createMcpServerInstance(tools, readOnly = false) {
8113
8330
  }
8114
8331
  }
8115
8332
  );
8333
+ server.tool(
8334
+ "refresh_version",
8335
+ "Re-scrape a previously indexed library version, updating only changed pages.",
8336
+ {
8337
+ library: z.string().trim().describe("Library name."),
8338
+ version: z.string().trim().optional().describe("Library version (optional, refreshes unversioned if omitted).")
8339
+ },
8340
+ {
8341
+ title: "Refresh Library Version",
8342
+ destructiveHint: false,
8343
+ // Only updates changed content
8344
+ openWorldHint: true
8345
+ // requires internet access
8346
+ },
8347
+ async ({ library, version: version2 }) => {
8348
+ analytics.track(TelemetryEvent.TOOL_USED, {
8349
+ tool: "refresh_version",
8350
+ context: "mcp_server",
8351
+ library,
8352
+ version: version2
8353
+ });
8354
+ try {
8355
+ const result = await tools.refresh.execute({
8356
+ library,
8357
+ version: version2,
8358
+ waitForCompletion: false
8359
+ // Don't wait for completion
8360
+ });
8361
+ if ("jobId" in result) {
8362
+ return createResponse(`🔄 Refresh job started with ID: ${result.jobId}.`);
8363
+ }
8364
+ return createResponse(
8365
+ `Refresh finished immediately (unexpectedly) with ${result.pagesRefreshed} pages.`
8366
+ );
8367
+ } catch (error) {
8368
+ return createError(error);
8369
+ }
8370
+ }
8371
+ );
8116
8372
  }
8117
8373
  server.tool(
8118
8374
  "search_docs",
@@ -8638,7 +8894,7 @@ class FetchUrlTool {
8638
8894
  logger.info("🔄 Processing content...");
8639
8895
  let processed;
8640
8896
  for (const pipeline of this.pipelines) {
8641
- if (pipeline.canProcess(rawContent)) {
8897
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
8642
8898
  processed = await pipeline.process(
8643
8899
  rawContent,
8644
8900
  {
@@ -8673,7 +8929,7 @@ class FetchUrlTool {
8673
8929
  const contentString = convertToString(rawContent.content, resolvedCharset);
8674
8930
  return contentString;
8675
8931
  }
8676
- for (const err of processed.errors) {
8932
+ for (const err of processed.errors ?? []) {
8677
8933
  logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
8678
8934
  }
8679
8935
  if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
@@ -8851,6 +9107,61 @@ class ListLibrariesTool {
8851
9107
  return { libraries };
8852
9108
  }
8853
9109
  }
9110
+ class RefreshVersionTool {
9111
+ pipeline;
9112
+ constructor(pipeline) {
9113
+ this.pipeline = pipeline;
9114
+ }
9115
+ async execute(options) {
9116
+ const { library, version: version2, waitForCompletion = true } = options;
9117
+ let internalVersion;
9118
+ const partialVersionRegex = /^\d+(\.\d+)?$/;
9119
+ if (version2 === null || version2 === void 0) {
9120
+ internalVersion = "";
9121
+ } else {
9122
+ const validFullVersion = semver.valid(version2);
9123
+ if (validFullVersion) {
9124
+ internalVersion = validFullVersion;
9125
+ } else if (partialVersionRegex.test(version2)) {
9126
+ const coercedVersion = semver.coerce(version2);
9127
+ if (coercedVersion) {
9128
+ internalVersion = coercedVersion.version;
9129
+ } else {
9130
+ throw new ValidationError(
9131
+ `Invalid version format for refreshing: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
9132
+ "RefreshVersionTool"
9133
+ );
9134
+ }
9135
+ } else {
9136
+ throw new ValidationError(
9137
+ `Invalid version format for refreshing: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
9138
+ "RefreshVersionTool"
9139
+ );
9140
+ }
9141
+ }
9142
+ internalVersion = internalVersion.toLowerCase();
9143
+ const pipeline = this.pipeline;
9144
+ const refreshVersion = internalVersion === "" ? null : internalVersion;
9145
+ const jobId = await pipeline.enqueueRefreshJob(library, refreshVersion);
9146
+ if (waitForCompletion) {
9147
+ try {
9148
+ await pipeline.waitForJobCompletion(jobId);
9149
+ const finalJob = await pipeline.getJob(jobId);
9150
+ const finalPagesRefreshed = finalJob?.progress?.pagesScraped ?? 0;
9151
+ logger.debug(
9152
+ `Refresh job ${jobId} finished with status ${finalJob?.status}. Pages refreshed: ${finalPagesRefreshed}`
9153
+ );
9154
+ return {
9155
+ pagesRefreshed: finalPagesRefreshed
9156
+ };
9157
+ } catch (error) {
9158
+ logger.error(`❌ Refresh job ${jobId} failed or was cancelled: ${error}`);
9159
+ throw error;
9160
+ }
9161
+ }
9162
+ return { jobId };
9163
+ }
9164
+ }
8854
9165
  class RemoveTool {
8855
9166
  constructor(documentManagementService, pipeline) {
8856
9167
  this.documentManagementService = documentManagementService;
@@ -8871,19 +9182,7 @@ class RemoveTool {
8871
9182
  }
8872
9183
  logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
8873
9184
  try {
8874
- const result = await this.documentManagementService.findBestVersion(
8875
- library,
8876
- version2
8877
- );
8878
- const normalizedVersion = version2 && version2.trim() !== "" ? version2 : null;
8879
- const versionExists = result.bestMatch === normalizedVersion || result.hasUnversioned && normalizedVersion === null;
8880
- if (!versionExists) {
8881
- const versionText = normalizedVersion ? `Version ${normalizedVersion}` : "Version";
8882
- throw new ToolError(
8883
- `${versionText} not found for library ${library}. Cannot remove non-existent version.`,
8884
- this.constructor.name
8885
- );
8886
- }
9185
+ await this.documentManagementService.validateLibraryExists(library);
8887
9186
  const allJobs = await this.pipeline.getJobs();
8888
9187
  const jobs = allJobs.filter(
8889
9188
  (job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
@@ -8950,7 +9249,7 @@ class ScrapeTool {
8950
9249
  internalVersion = internalVersion.toLowerCase();
8951
9250
  const pipeline = this.pipeline;
8952
9251
  const enqueueVersion = internalVersion === "" ? null : internalVersion;
8953
- const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
9252
+ const jobId = await pipeline.enqueueScrapeJob(library, enqueueVersion, {
8954
9253
  url,
8955
9254
  library,
8956
9255
  version: internalVersion,
@@ -8997,7 +9296,18 @@ class DocumentManagementClient {
8997
9296
  logger.debug(`DocumentManagementClient (tRPC) created for: ${this.baseUrl}`);
8998
9297
  }
8999
9298
  async initialize() {
9000
- await this.client.ping.query();
9299
+ try {
9300
+ await this.client.ping.query();
9301
+ } catch (error) {
9302
+ logger.debug(
9303
+ `Failed to connect to DocumentManagement server at ${this.baseUrl}: ${error}`
9304
+ );
9305
+ throw new Error(
9306
+ `Failed to connect to server at ${this.baseUrl}.
9307
+
9308
+ Please verify the server URL includes the correct port (default 8080) and ends with '/api' (e.g., 'http://localhost:8080/api').`
9309
+ );
9310
+ }
9001
9311
  }
9002
9312
  async shutdown() {
9003
9313
  }
@@ -9069,7 +9379,7 @@ class HierarchicalAssemblyStrategy {
9069
9379
  try {
9070
9380
  const chunksByDocument = /* @__PURE__ */ new Map();
9071
9381
  for (const chunk of initialChunks) {
9072
- const url = chunk.metadata.url;
9382
+ const url = chunk.url;
9073
9383
  if (!chunksByDocument.has(url)) {
9074
9384
  chunksByDocument.set(url, []);
9075
9385
  }
@@ -9163,10 +9473,10 @@ class HierarchicalAssemblyStrategy {
9163
9473
  if (debug) {
9164
9474
  return chunks.map(
9165
9475
  (chunk) => `=== #${chunk.id} ${chunk.metadata.path?.join("/")} [${chunk.metadata.level}] ===
9166
- ` + chunk.pageContent
9476
+ ` + chunk.content
9167
9477
  ).join("");
9168
9478
  }
9169
- return chunks.map((chunk) => chunk.pageContent).join("");
9479
+ return chunks.map((chunk) => chunk.content).join("");
9170
9480
  }
9171
9481
  /**
9172
9482
  * Walks up the parent hierarchy from a chunk to collect the complete parent chain.
@@ -9192,42 +9502,17 @@ class HierarchicalAssemblyStrategy {
9192
9502
  visited.add(currentId);
9193
9503
  chainIds.push(currentId);
9194
9504
  depth++;
9195
- try {
9196
- const parentChunk = await documentStore.findParentChunk(
9505
+ let parentChunk = await documentStore.findParentChunk(library, version2, currentId);
9506
+ if (!parentChunk) {
9507
+ parentChunk = await this.findAncestorWithGaps(
9197
9508
  library,
9198
9509
  version2,
9199
- currentId
9510
+ currentChunk.url,
9511
+ currentChunk.metadata.path ?? [],
9512
+ documentStore
9200
9513
  );
9201
- if (parentChunk) {
9202
- currentChunk = parentChunk;
9203
- } else {
9204
- currentChunk = await this.findAncestorWithGaps(
9205
- library,
9206
- version2,
9207
- currentChunk.metadata,
9208
- documentStore
9209
- );
9210
- }
9211
- } catch (error) {
9212
- try {
9213
- const currentMetadata = currentChunk?.metadata;
9214
- if (currentMetadata) {
9215
- currentChunk = await this.findAncestorWithGaps(
9216
- library,
9217
- version2,
9218
- currentMetadata,
9219
- documentStore
9220
- );
9221
- } else {
9222
- currentChunk = null;
9223
- }
9224
- } catch (gapError) {
9225
- logger.warn(
9226
- `Parent lookup failed for chunk ${currentId}: ${error}. Gap search also failed: ${gapError}`
9227
- );
9228
- break;
9229
- }
9230
9514
  }
9515
+ currentChunk = parentChunk;
9231
9516
  }
9232
9517
  if (depth >= maxDepth) {
9233
9518
  logger.warn(
@@ -9240,9 +9525,7 @@ class HierarchicalAssemblyStrategy {
9240
9525
  * Attempts to find ancestors when there are gaps in the hierarchy.
9241
9526
  * Tries progressively shorter path prefixes to find existing ancestor chunks.
9242
9527
  */
9243
- async findAncestorWithGaps(library, version2, metadata, documentStore) {
9244
- const path2 = metadata.path || [];
9245
- const url = metadata.url;
9528
+ async findAncestorWithGaps(library, version2, url, path2, documentStore) {
9246
9529
  if (path2.length <= 1) {
9247
9530
  return null;
9248
9531
  }
@@ -9279,7 +9562,7 @@ class HierarchicalAssemblyStrategy {
9279
9562
  }
9280
9563
  const matchingChunks = allChunks.filter((chunk) => {
9281
9564
  const chunkPath = chunk.metadata.path || [];
9282
- const chunkUrl = chunk.metadata.url;
9565
+ const chunkUrl = chunk.url;
9283
9566
  if (chunkUrl !== url) return false;
9284
9567
  if (chunkPath.length !== targetPath.length) return false;
9285
9568
  return chunkPath.every((part, index) => part === targetPath[index]);
@@ -9301,11 +9584,7 @@ class HierarchicalAssemblyStrategy {
9301
9584
  return current;
9302
9585
  }
9303
9586
  while (true) {
9304
- const parent = await documentStore.findParentChunk(
9305
- library,
9306
- version2,
9307
- current.id
9308
- );
9587
+ const parent = await documentStore.findParentChunk(library, version2, current.id);
9309
9588
  if (!parent) {
9310
9589
  return null;
9311
9590
  }
@@ -9387,7 +9666,7 @@ class HierarchicalAssemblyStrategy {
9387
9666
  const ancestorChunks = await this.findChunksByExactPath(
9388
9667
  library,
9389
9668
  version2,
9390
- referenceChunk.metadata.url,
9669
+ referenceChunk.url,
9391
9670
  ancestorPath,
9392
9671
  documentStore
9393
9672
  );
@@ -9465,13 +9744,9 @@ class HierarchicalAssemblyStrategy {
9465
9744
  for (const chunk of initialChunks) {
9466
9745
  const id = chunk.id;
9467
9746
  chunkIds.add(id);
9468
- try {
9469
- const parent = await documentStore.findParentChunk(library, version2, id);
9470
- if (parent) {
9471
- chunkIds.add(parent.id);
9472
- }
9473
- } catch (error) {
9474
- logger.warn(`Failed to find parent for chunk ${id}: ${error}`);
9747
+ const parent = await documentStore.findParentChunk(library, version2, id);
9748
+ if (parent) {
9749
+ chunkIds.add(parent.id);
9475
9750
  }
9476
9751
  try {
9477
9752
  const children = await documentStore.findChildChunks(library, version2, id, 3);
@@ -9539,7 +9814,7 @@ class MarkdownAssemblyStrategy {
9539
9814
  * Assembles chunks using simple "\n\n" joining (current behavior).
9540
9815
  */
9541
9816
  assembleContent(chunks) {
9542
- return chunks.map((chunk) => chunk.pageContent).join("\n\n");
9817
+ return chunks.map((chunk) => chunk.content).join("\n\n");
9543
9818
  }
9544
9819
  /**
9545
9820
  * Collects related chunk IDs for a single chunk using current context expansion logic.
@@ -9638,7 +9913,7 @@ class DocumentRetrieverService {
9638
9913
  groupResultsByUrl(results) {
9639
9914
  const resultsByUrl = /* @__PURE__ */ new Map();
9640
9915
  for (const result of results) {
9641
- const url = result.metadata.url;
9916
+ const url = result.url;
9642
9917
  if (!resultsByUrl.has(url)) {
9643
9918
  resultsByUrl.set(url, []);
9644
9919
  }
@@ -9653,10 +9928,8 @@ class DocumentRetrieverService {
9653
9928
  * Processes a group of search results from the same URL using appropriate strategy.
9654
9929
  */
9655
9930
  async processUrlGroup(library, version2, url, initialChunks) {
9656
- const mimeType = initialChunks.length > 0 ? initialChunks[0].metadata.mimeType : void 0;
9657
- const maxScore = Math.max(
9658
- ...initialChunks.map((chunk) => chunk.metadata.score)
9659
- );
9931
+ const mimeType = initialChunks.length > 0 ? initialChunks[0].content_type : void 0;
9932
+ const maxScore = Math.max(...initialChunks.map((chunk) => chunk.score));
9660
9933
  const strategy = createContentAssemblyStrategy(mimeType);
9661
9934
  const selectedChunks = await strategy.selectChunks(
9662
9935
  library,
@@ -9845,7 +10118,7 @@ class DocumentStore {
9845
10118
  prepareStatements() {
9846
10119
  const statements = {
9847
10120
  getById: this.db.prepare(
9848
- `SELECT d.*, p.url, p.title, p.content_type
10121
+ `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type
9849
10122
  FROM documents d
9850
10123
  JOIN pages p ON d.page_id = p.id
9851
10124
  WHERE d.id = ?`
@@ -9858,7 +10131,7 @@ class DocumentStore {
9858
10131
  "UPDATE documents SET embedding = ? WHERE id = ?"
9859
10132
  ),
9860
10133
  insertPage: this.db.prepare(
9861
- "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type) VALUES (?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type"
10134
+ "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type, depth) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type, etag = excluded.etag, last_modified = excluded.last_modified, depth = excluded.depth"
9862
10135
  ),
9863
10136
  getPageId: this.db.prepare(
9864
10137
  "SELECT id FROM pages WHERE version_id = ? AND url = ?"
@@ -9869,12 +10142,13 @@ class DocumentStore {
9869
10142
  getLibraryIdByName: this.db.prepare(
9870
10143
  "SELECT id FROM libraries WHERE name = ?"
9871
10144
  ),
10145
+ getLibraryById: this.db.prepare("SELECT * FROM libraries WHERE id = ?"),
9872
10146
  // New version-related statements
9873
10147
  insertVersion: this.db.prepare(
9874
10148
  "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
9875
10149
  ),
9876
10150
  resolveVersionId: this.db.prepare(
9877
- "SELECT id FROM versions WHERE library_id = ? AND name IS ?"
10151
+ "SELECT id FROM versions WHERE library_id = ? AND name = ?"
9878
10152
  ),
9879
10153
  getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
9880
10154
  queryVersionsByLibraryId: this.db.prepare(
@@ -9889,15 +10163,10 @@ class DocumentStore {
9889
10163
  WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
9890
10164
  )`
9891
10165
  ),
9892
- deleteDocumentsByUrl: this.db.prepare(
9893
- `DELETE FROM documents
9894
- WHERE page_id IN (
9895
- SELECT p.id FROM pages p
9896
- JOIN versions v ON p.version_id = v.id
9897
- JOIN libraries l ON v.library_id = l.id
9898
- WHERE p.url = ? AND l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
9899
- )`
10166
+ deleteDocumentsByPageId: this.db.prepare(
10167
+ "DELETE FROM documents WHERE page_id = ?"
9900
10168
  ),
10169
+ deletePage: this.db.prepare("DELETE FROM pages WHERE id = ?"),
9901
10170
  deletePages: this.db.prepare(
9902
10171
  `DELETE FROM pages
9903
10172
  WHERE version_id IN (
@@ -9953,7 +10222,7 @@ class DocumentStore {
9953
10222
  ORDER BY l.name, version`
9954
10223
  ),
9955
10224
  getChildChunks: this.db.prepare(`
9956
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10225
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9957
10226
  JOIN pages p ON d.page_id = p.id
9958
10227
  JOIN versions v ON p.version_id = v.id
9959
10228
  JOIN libraries l ON v.library_id = l.id
@@ -9967,7 +10236,7 @@ class DocumentStore {
9967
10236
  LIMIT ?
9968
10237
  `),
9969
10238
  getPrecedingSiblings: this.db.prepare(`
9970
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10239
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9971
10240
  JOIN pages p ON d.page_id = p.id
9972
10241
  JOIN versions v ON p.version_id = v.id
9973
10242
  JOIN libraries l ON v.library_id = l.id
@@ -9980,7 +10249,7 @@ class DocumentStore {
9980
10249
  LIMIT ?
9981
10250
  `),
9982
10251
  getSubsequentSiblings: this.db.prepare(`
9983
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10252
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9984
10253
  JOIN pages p ON d.page_id = p.id
9985
10254
  JOIN versions v ON p.version_id = v.id
9986
10255
  JOIN libraries l ON v.library_id = l.id
@@ -9993,7 +10262,7 @@ class DocumentStore {
9993
10262
  LIMIT ?
9994
10263
  `),
9995
10264
  getParentChunk: this.db.prepare(`
9996
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10265
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9997
10266
  JOIN pages p ON d.page_id = p.id
9998
10267
  JOIN versions v ON p.version_id = v.id
9999
10268
  JOIN libraries l ON v.library_id = l.id
@@ -10035,6 +10304,9 @@ class DocumentStore {
10035
10304
  `SELECT v.id, v.library_id FROM versions v
10036
10305
  JOIN libraries l ON v.library_id = l.id
10037
10306
  WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`
10307
+ ),
10308
+ getPagesByVersionId: this.db.prepare(
10309
+ "SELECT * FROM pages WHERE version_id = ?"
10038
10310
  )
10039
10311
  };
10040
10312
  this.statements = statements;
@@ -10176,7 +10448,7 @@ class DocumentStore {
10176
10448
  this.statements.insertVersion.run(libraryId, normalizedVersion);
10177
10449
  const versionIdRow = this.statements.resolveVersionId.get(
10178
10450
  libraryId,
10179
- normalizedVersion === null ? "" : normalizedVersion
10451
+ normalizedVersion
10180
10452
  );
10181
10453
  if (!versionIdRow || typeof versionIdRow.id !== "number") {
10182
10454
  throw new StoreError(
@@ -10238,6 +10510,32 @@ class DocumentStore {
10238
10510
  throw new StoreError(`Failed to get versions by status: ${error}`);
10239
10511
  }
10240
10512
  }
10513
+ /**
10514
+ * Retrieves a version by its ID.
10515
+ * @param versionId The version ID to retrieve
10516
+ * @returns The version record, or null if not found
10517
+ */
10518
+ async getVersionById(versionId) {
10519
+ try {
10520
+ const row = this.statements.getVersionById.get(versionId);
10521
+ return row || null;
10522
+ } catch (error) {
10523
+ throw new StoreError(`Failed to get version by ID: ${error}`);
10524
+ }
10525
+ }
10526
+ /**
10527
+ * Retrieves a library by its ID.
10528
+ * @param libraryId The library ID to retrieve
10529
+ * @returns The library record, or null if not found
10530
+ */
10531
+ async getLibraryById(libraryId) {
10532
+ try {
10533
+ const row = this.statements.getLibraryById.get(libraryId);
10534
+ return row || null;
10535
+ } catch (error) {
10536
+ throw new StoreError(`Failed to get library by ID: ${error}`);
10537
+ }
10538
+ }
10241
10539
  /**
10242
10540
  * Stores scraper options for a version to enable reproducible indexing.
10243
10541
  * @param versionId The version ID to update
@@ -10245,7 +10543,15 @@ class DocumentStore {
10245
10543
  */
10246
10544
  async storeScraperOptions(versionId, options) {
10247
10545
  try {
10248
- const { url: source_url, library, version: version2, signal, ...scraper_options } = options;
10546
+ const {
10547
+ url: source_url,
10548
+ library: _library,
10549
+ version: _version,
10550
+ signal: _signal,
10551
+ initialQueue: _initialQueue,
10552
+ isRefresh: _isRefresh,
10553
+ ...scraper_options
10554
+ } = options;
10249
10555
  const optionsJson = JSON.stringify(scraper_options);
10250
10556
  this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
10251
10557
  } catch (error) {
@@ -10356,36 +10662,96 @@ class DocumentStore {
10356
10662
  throw new ConnectionError("Failed to query library versions", error);
10357
10663
  }
10358
10664
  }
10665
+ /**
10666
+ * Helper method to detect if an error is related to input size limits.
10667
+ * Checks for common error messages from various embedding providers.
10668
+ */
10669
+ isInputSizeError(error) {
10670
+ if (!(error instanceof Error)) return false;
10671
+ const message = error.message.toLowerCase();
10672
+ return message.includes("maximum context length") || message.includes("too long") || message.includes("token limit") || message.includes("input is too large") || message.includes("exceeds") || message.includes("max") && message.includes("token");
10673
+ }
10674
+ /**
10675
+ * Creates embeddings for an array of texts with automatic retry logic for size-related errors.
10676
+ * If a batch fails due to size limits:
10677
+ * - Batches with multiple texts are split in half and retried recursively
10678
+ * - Single texts that are too large are truncated and retried once
10679
+ *
10680
+ * @param texts Array of texts to embed
10681
+ * @returns Array of embedding vectors
10682
+ */
10683
+ async embedDocumentsWithRetry(texts) {
10684
+ if (texts.length === 0) {
10685
+ return [];
10686
+ }
10687
+ try {
10688
+ return await this.embeddings.embedDocuments(texts);
10689
+ } catch (error) {
10690
+ if (this.isInputSizeError(error)) {
10691
+ if (texts.length > 1) {
10692
+ const midpoint = Math.floor(texts.length / 2);
10693
+ const firstHalf = texts.slice(0, midpoint);
10694
+ const secondHalf = texts.slice(midpoint);
10695
+ logger.warn(
10696
+ `⚠️ Batch of ${texts.length} texts exceeded size limit, splitting into ${firstHalf.length} + ${secondHalf.length}`
10697
+ );
10698
+ const [firstEmbeddings, secondEmbeddings] = await Promise.all([
10699
+ this.embedDocumentsWithRetry(firstHalf),
10700
+ this.embedDocumentsWithRetry(secondHalf)
10701
+ ]);
10702
+ return [...firstEmbeddings, ...secondEmbeddings];
10703
+ } else {
10704
+ const text = texts[0];
10705
+ const midpoint = Math.floor(text.length / 2);
10706
+ const firstHalf = text.substring(0, midpoint);
10707
+ logger.warn(
10708
+ `⚠️ Single text exceeded embedding size limit (${text.length} chars). Truncating at ${firstHalf.length} chars.`
10709
+ );
10710
+ try {
10711
+ const embedding = await this.embedDocumentsWithRetry([firstHalf]);
10712
+ logger.info(
10713
+ `✓ Using embedding from first half of split text (${firstHalf.length} chars)`
10714
+ );
10715
+ return embedding;
10716
+ } catch (retryError) {
10717
+ logger.error(
10718
+ `❌ Failed to embed even after splitting. Original length: ${text.length}`
10719
+ );
10720
+ throw retryError;
10721
+ }
10722
+ }
10723
+ }
10724
+ throw error;
10725
+ }
10726
+ }
10359
10727
  /**
10360
10728
  * Stores documents with library and version metadata, generating embeddings
10361
10729
  * for vector similarity search. Uses the new pages table to normalize page-level
10362
10730
  * metadata and avoid duplication across document chunks.
10363
10731
  */
10364
- async addDocuments(library, version2, documents) {
10732
+ async addDocuments(library, version2, depth, result) {
10365
10733
  try {
10366
- if (documents.length === 0) {
10734
+ const { title, url, chunks } = result;
10735
+ if (chunks.length === 0) {
10367
10736
  return;
10368
10737
  }
10369
- const documentsByUrl = /* @__PURE__ */ new Map();
10370
- for (const doc of documents) {
10371
- const url = doc.metadata.url;
10372
- if (!url || typeof url !== "string" || !url.trim()) {
10373
- throw new StoreError("Document metadata must include a valid URL");
10374
- }
10375
- if (!documentsByUrl.has(url)) {
10376
- documentsByUrl.set(url, []);
10377
- }
10378
- documentsByUrl.get(url)?.push(doc);
10379
- }
10380
10738
  let paddedEmbeddings = [];
10381
10739
  if (this.isVectorSearchEnabled) {
10382
- const texts = documents.map((doc) => {
10383
- const header = `<title>${doc.metadata.title}</title>
10384
- <url>${doc.metadata.url}</url>
10385
- <path>${(doc.metadata.path || []).join(" / ")}</path>
10740
+ const texts = chunks.map((chunk) => {
10741
+ const header = `<title>${title}</title>
10742
+ <url>${url}</url>
10743
+ <path>${(chunk.section.path || []).join(" / ")}</path>
10386
10744
  `;
10387
- return `${header}${doc.pageContent}`;
10745
+ return `${header}${chunk.content}`;
10388
10746
  });
10747
+ for (let i = 0; i < texts.length; i++) {
10748
+ const textSize = texts[i].length;
10749
+ if (textSize > SPLITTER_MAX_CHUNK_SIZE) {
10750
+ logger.warn(
10751
+ `⚠️ Chunk ${i + 1}/${texts.length} exceeds max size: ${textSize} > ${SPLITTER_MAX_CHUNK_SIZE} chars (URL: ${url})`
10752
+ );
10753
+ }
10754
+ }
10389
10755
  const maxBatchChars = EMBEDDING_BATCH_CHARS;
10390
10756
  const rawEmbeddings = [];
10391
10757
  let currentBatch = [];
@@ -10398,7 +10764,7 @@ class DocumentStore {
10398
10764
  logger.debug(
10399
10765
  `Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
10400
10766
  );
10401
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
10767
+ const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
10402
10768
  rawEmbeddings.push(...batchEmbeddings);
10403
10769
  currentBatch = [];
10404
10770
  currentBatchSize = 0;
@@ -10410,7 +10776,7 @@ class DocumentStore {
10410
10776
  logger.debug(
10411
10777
  `Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
10412
10778
  );
10413
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
10779
+ const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
10414
10780
  rawEmbeddings.push(...batchEmbeddings);
10415
10781
  currentBatch = [];
10416
10782
  currentBatchSize = 0;
@@ -10421,110 +10787,115 @@ class DocumentStore {
10421
10787
  logger.debug(
10422
10788
  `Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
10423
10789
  );
10424
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
10790
+ const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
10425
10791
  rawEmbeddings.push(...batchEmbeddings);
10426
10792
  }
10427
10793
  paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
10428
10794
  }
10429
10795
  const versionId = await this.resolveVersionId(library, version2);
10430
- for (const url of documentsByUrl.keys()) {
10431
- const deletedCount = await this.deleteDocumentsByUrl(library, version2, url);
10432
- if (deletedCount > 0) {
10433
- logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`);
10434
- }
10435
- }
10436
- const transaction = this.db.transaction((docsByUrl) => {
10437
- const pageIds = /* @__PURE__ */ new Map();
10438
- for (const [url, urlDocs] of docsByUrl) {
10439
- const firstDoc = urlDocs[0];
10440
- const title = firstDoc.metadata.title || "";
10441
- const contentType = firstDoc.metadata.contentType || null;
10442
- this.statements.insertPage.run(
10443
- versionId,
10444
- url,
10445
- title,
10446
- null,
10447
- // etag - will be populated during scraping
10448
- null,
10449
- // last_modified - will be populated during scraping
10450
- contentType
10451
- );
10452
- const existingPage = this.statements.getPageId.get(versionId, url);
10453
- if (!existingPage) {
10454
- throw new StoreError(`Failed to get page ID for URL: ${url}`);
10455
- }
10456
- const pageId = existingPage.id;
10457
- pageIds.set(url, pageId);
10796
+ const existingPage = this.statements.getPageId.get(versionId, url);
10797
+ if (existingPage) {
10798
+ const result2 = this.statements.deleteDocumentsByPageId.run(existingPage.id);
10799
+ if (result2.changes > 0) {
10800
+ logger.debug(`Deleted ${result2.changes} existing documents for URL: ${url}`);
10801
+ }
10802
+ }
10803
+ const transaction = this.db.transaction(() => {
10804
+ const contentType = result.contentType || null;
10805
+ const etag = result.etag || null;
10806
+ const lastModified = result.lastModified || null;
10807
+ this.statements.insertPage.run(
10808
+ versionId,
10809
+ url,
10810
+ title || "",
10811
+ etag,
10812
+ lastModified,
10813
+ contentType,
10814
+ depth
10815
+ );
10816
+ const existingPage2 = this.statements.getPageId.get(versionId, url);
10817
+ if (!existingPage2) {
10818
+ throw new StoreError(`Failed to get page ID for URL: ${url}`);
10458
10819
  }
10820
+ const pageId = existingPage2.id;
10459
10821
  let docIndex = 0;
10460
- for (const [url, urlDocs] of docsByUrl) {
10461
- const pageId = pageIds.get(url);
10462
- if (!pageId) {
10463
- throw new StoreError(`Failed to get page ID for URL: ${url}`);
10464
- }
10465
- for (let i = 0; i < urlDocs.length; i++) {
10466
- const doc = urlDocs[i];
10467
- const {
10468
- url: _,
10469
- title: __,
10470
- library: ___,
10471
- version: ____,
10472
- ...chunkMetadata
10473
- } = doc.metadata;
10474
- const result = this.statements.insertDocument.run(
10475
- pageId,
10476
- doc.pageContent,
10477
- JSON.stringify(chunkMetadata),
10478
- i
10479
- // sort_order within this page
10822
+ for (let i = 0; i < chunks.length; i++) {
10823
+ const chunk = chunks[i];
10824
+ const result2 = this.statements.insertDocument.run(
10825
+ pageId,
10826
+ chunk.content,
10827
+ JSON.stringify({
10828
+ types: chunk.types,
10829
+ level: chunk.section.level,
10830
+ path: chunk.section.path
10831
+ }),
10832
+ i
10833
+ // sort_order within this page
10834
+ );
10835
+ const rowId = result2.lastInsertRowid;
10836
+ if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
10837
+ this.statements.insertEmbedding.run(
10838
+ BigInt(rowId),
10839
+ JSON.stringify(paddedEmbeddings[docIndex])
10480
10840
  );
10481
- const rowId = result.lastInsertRowid;
10482
- if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
10483
- this.statements.insertEmbedding.run(
10484
- BigInt(rowId),
10485
- JSON.stringify(paddedEmbeddings[docIndex])
10486
- );
10487
- }
10488
- docIndex++;
10489
10841
  }
10842
+ docIndex++;
10490
10843
  }
10491
10844
  });
10492
- transaction(documentsByUrl);
10845
+ transaction();
10493
10846
  } catch (error) {
10494
10847
  throw new ConnectionError("Failed to add documents to store", error);
10495
10848
  }
10496
10849
  }
10497
10850
  /**
10498
- * Removes documents matching specified library and version
10851
+ * Removes documents and pages matching specified library and version.
10852
+ * This consolidated method deletes both documents and their associated pages.
10499
10853
  * @returns Number of documents deleted
10500
10854
  */
10501
- async deleteDocuments(library, version2) {
10855
+ async deletePages(library, version2) {
10502
10856
  try {
10503
10857
  const normalizedVersion = version2.toLowerCase();
10504
10858
  const result = this.statements.deleteDocuments.run(
10505
10859
  library.toLowerCase(),
10506
10860
  normalizedVersion
10507
10861
  );
10862
+ this.statements.deletePages.run(library.toLowerCase(), normalizedVersion);
10508
10863
  return result.changes;
10509
10864
  } catch (error) {
10510
10865
  throw new ConnectionError("Failed to delete documents", error);
10511
10866
  }
10512
10867
  }
10513
10868
  /**
10514
- * Removes documents for a specific URL within a library and version
10515
- * @returns Number of documents deleted
10869
+ * Deletes a page and all its associated document chunks.
10870
+ * Performs manual deletion in the correct order to satisfy foreign key constraints:
10871
+ * 1. Delete document chunks (page_id references pages.id)
10872
+ * 2. Delete page record
10873
+ *
10874
+ * This method is used during refresh operations when a page returns 404 Not Found.
10516
10875
  */
10517
- async deleteDocumentsByUrl(library, version2, url) {
10876
+ async deletePage(pageId) {
10518
10877
  try {
10519
- const normalizedVersion = version2.toLowerCase();
10520
- const result = this.statements.deleteDocumentsByUrl.run(
10521
- url,
10522
- library.toLowerCase(),
10523
- normalizedVersion
10524
- );
10525
- return result.changes;
10878
+ const docResult = this.statements.deleteDocumentsByPageId.run(pageId);
10879
+ logger.debug(`Deleted ${docResult.changes} document(s) for page ID ${pageId}`);
10880
+ const pageResult = this.statements.deletePage.run(pageId);
10881
+ if (pageResult.changes > 0) {
10882
+ logger.debug(`Deleted page record for page ID ${pageId}`);
10883
+ }
10884
+ } catch (error) {
10885
+ throw new ConnectionError(`Failed to delete page ${pageId}`, error);
10886
+ }
10887
+ }
10888
+ /**
10889
+ * Retrieves all pages for a specific version ID with their metadata.
10890
+ * Used for refresh operations to get existing pages with their ETags and depths.
10891
+ * @returns Array of page records
10892
+ */
10893
+ async getPagesByVersionId(versionId) {
10894
+ try {
10895
+ const result = this.statements.getPagesByVersionId.all(versionId);
10896
+ return result;
10526
10897
  } catch (error) {
10527
- throw new ConnectionError("Failed to delete documents by URL", error);
10898
+ throw new ConnectionError("Failed to get pages by version ID", error);
10528
10899
  }
10529
10900
  }
10530
10901
  /**
@@ -10547,7 +10918,7 @@ class DocumentStore {
10547
10918
  return { documentsDeleted: 0, versionDeleted: false, libraryDeleted: false };
10548
10919
  }
10549
10920
  const { id: versionId, library_id: libraryId } = versionResult;
10550
- const documentsDeleted = await this.deleteDocuments(library, version2);
10921
+ const documentsDeleted = await this.deletePages(library, version2);
10551
10922
  this.statements.deletePages.run(normalizedLibrary, normalizedVersion);
10552
10923
  const versionDeleteResult = this.statements.deleteVersionById.run(versionId);
10553
10924
  const versionDeleted = versionDeleteResult.changes > 0;
@@ -10565,6 +10936,27 @@ class DocumentStore {
10565
10936
  throw new ConnectionError("Failed to remove version", error);
10566
10937
  }
10567
10938
  }
10939
+ /**
10940
+ * Parses the metadata field from a JSON string to an object.
10941
+ * This is necessary because better-sqlite3's json() function returns a string, not an object.
10942
+ */
10943
+ parseMetadata(row) {
10944
+ if (row.metadata && typeof row.metadata === "string") {
10945
+ try {
10946
+ row.metadata = JSON.parse(row.metadata);
10947
+ } catch (error) {
10948
+ logger.warn(`Failed to parse metadata JSON: ${error}`);
10949
+ row.metadata = {};
10950
+ }
10951
+ }
10952
+ return row;
10953
+ }
10954
+ /**
10955
+ * Parses metadata for an array of rows.
10956
+ */
10957
+ parseMetadataArray(rows) {
10958
+ return rows.map((row) => this.parseMetadata(row));
10959
+ }
10568
10960
  /**
10569
10961
  * Retrieves a document by its ID.
10570
10962
  * @param id The ID of the document.
@@ -10572,13 +10964,11 @@ class DocumentStore {
10572
10964
  */
10573
10965
  async getById(id) {
10574
10966
  try {
10575
- const row = this.statements.getById.get(
10576
- BigInt(id)
10577
- );
10967
+ const row = this.statements.getById.get(BigInt(id));
10578
10968
  if (!row) {
10579
10969
  return null;
10580
10970
  }
10581
- return mapDbDocumentToDocument(row);
10971
+ return this.parseMetadata(row);
10582
10972
  } catch (error) {
10583
10973
  throw new ConnectionError(`Failed to get document by ID ${id}`, error);
10584
10974
  }
@@ -10662,26 +11052,20 @@ class DocumentStore {
10662
11052
  );
10663
11053
  const rankedResults = this.assignRanks(rawResults);
10664
11054
  const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
10665
- return topResults.map((row) => ({
10666
- ...mapDbDocumentToDocument({
11055
+ return topResults.map((row) => {
11056
+ const result = {
10667
11057
  ...row,
10668
11058
  url: row.url || "",
10669
11059
  // Ensure url is never undefined
10670
- title: row.title,
10671
- content_type: row.content_type
10672
- }),
10673
- metadata: {
10674
- ...JSON.parse(row.metadata),
10675
- id: row.id,
11060
+ title: row.title || null,
11061
+ content_type: row.content_type || null
11062
+ };
11063
+ return Object.assign(result, {
10676
11064
  score: row.rrf_score,
10677
11065
  vec_rank: row.vec_rank,
10678
- fts_rank: row.fts_rank,
10679
- // Explicitly add page fields if they exist
10680
- url: row.url || "",
10681
- title: row.title || "",
10682
- ...row.content_type && { contentType: row.content_type }
10683
- }
10684
- }));
11066
+ fts_rank: row.fts_rank
11067
+ });
11068
+ });
10685
11069
  } else {
10686
11070
  const stmt = this.db.prepare(`
10687
11071
  SELECT
@@ -10713,28 +11097,21 @@ class DocumentStore {
10713
11097
  ftsQuery,
10714
11098
  limit
10715
11099
  );
10716
- return rawResults.map((row, index) => ({
10717
- ...mapDbDocumentToDocument({
11100
+ return rawResults.map((row, index) => {
11101
+ const result = {
10718
11102
  ...row,
10719
11103
  url: row.url || "",
10720
11104
  // Ensure url is never undefined
10721
- title: row.title,
10722
- content_type: row.content_type
10723
- }),
10724
- metadata: {
10725
- ...JSON.parse(row.metadata),
10726
- id: row.id,
11105
+ title: row.title || null,
11106
+ content_type: row.content_type || null
11107
+ };
11108
+ return Object.assign(result, {
10727
11109
  score: -row.fts_score,
10728
11110
  // Convert BM25 score to positive value for consistency
10729
- fts_rank: index + 1,
11111
+ fts_rank: index + 1
10730
11112
  // Assign rank based on order (1-based)
10731
- // Explicitly ensure vec_rank is not included in FTS-only mode
10732
- // Explicitly add page fields
10733
- url: row.url || "",
10734
- title: row.title || "",
10735
- ...row.content_type && { contentType: row.content_type }
10736
- }
10737
- }));
11113
+ });
11114
+ });
10738
11115
  }
10739
11116
  } catch (error) {
10740
11117
  throw new ConnectionError(
@@ -10753,18 +11130,17 @@ class DocumentStore {
10753
11130
  return [];
10754
11131
  }
10755
11132
  const parentPath = parent.metadata.path ?? [];
10756
- const parentUrl = parent.metadata.url;
10757
11133
  const normalizedVersion = version2.toLowerCase();
10758
11134
  const result = this.statements.getChildChunks.all(
10759
11135
  library.toLowerCase(),
10760
11136
  normalizedVersion,
10761
- parentUrl,
11137
+ parent.url,
10762
11138
  parentPath.length + 1,
10763
11139
  JSON.stringify(parentPath),
10764
11140
  BigInt(id),
10765
11141
  limit
10766
11142
  );
10767
- return result.map((row) => mapDbDocumentToDocument(row));
11143
+ return this.parseMetadataArray(result);
10768
11144
  } catch (error) {
10769
11145
  throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
10770
11146
  }
@@ -10778,17 +11154,16 @@ class DocumentStore {
10778
11154
  if (!reference) {
10779
11155
  return [];
10780
11156
  }
10781
- const refMetadata = reference.metadata;
10782
11157
  const normalizedVersion = version2.toLowerCase();
10783
11158
  const result = this.statements.getPrecedingSiblings.all(
10784
11159
  library.toLowerCase(),
10785
11160
  normalizedVersion,
10786
- refMetadata.url,
11161
+ reference.url,
10787
11162
  BigInt(id),
10788
- JSON.stringify(refMetadata.path),
11163
+ JSON.stringify(reference.metadata.path),
10789
11164
  limit
10790
11165
  );
10791
- return result.reverse().map((row) => mapDbDocumentToDocument(row));
11166
+ return this.parseMetadataArray(result).reverse();
10792
11167
  } catch (error) {
10793
11168
  throw new ConnectionError(
10794
11169
  `Failed to find preceding sibling chunks for ID ${id}`,
@@ -10805,17 +11180,16 @@ class DocumentStore {
10805
11180
  if (!reference) {
10806
11181
  return [];
10807
11182
  }
10808
- const refMetadata = reference.metadata;
10809
11183
  const normalizedVersion = version2.toLowerCase();
10810
11184
  const result = this.statements.getSubsequentSiblings.all(
10811
11185
  library.toLowerCase(),
10812
11186
  normalizedVersion,
10813
- refMetadata.url,
11187
+ reference.url,
10814
11188
  BigInt(id),
10815
- JSON.stringify(refMetadata.path),
11189
+ JSON.stringify(reference.metadata.path),
10816
11190
  limit
10817
11191
  );
10818
- return result.map((row) => mapDbDocumentToDocument(row));
11192
+ return this.parseMetadataArray(result);
10819
11193
  } catch (error) {
10820
11194
  throw new ConnectionError(
10821
11195
  `Failed to find subsequent sibling chunks for ID ${id}`,
@@ -10825,6 +11199,8 @@ class DocumentStore {
10825
11199
  }
10826
11200
  /**
10827
11201
  * Finds the parent chunk of a given document.
11202
+ * Returns null if no parent is found or if there's a database error.
11203
+ * Database errors are logged but not thrown to maintain consistent behavior.
10828
11204
  */
10829
11205
  async findParentChunk(library, version2, id) {
10830
11206
  try {
@@ -10832,8 +11208,7 @@ class DocumentStore {
10832
11208
  if (!child) {
10833
11209
  return null;
10834
11210
  }
10835
- const childMetadata = child.metadata;
10836
- const path2 = childMetadata.path ?? [];
11211
+ const path2 = child.metadata.path ?? [];
10837
11212
  const parentPath = path2.slice(0, -1);
10838
11213
  if (parentPath.length === 0) {
10839
11214
  return null;
@@ -10842,21 +11217,22 @@ class DocumentStore {
10842
11217
  const result = this.statements.getParentChunk.get(
10843
11218
  library.toLowerCase(),
10844
11219
  normalizedVersion,
10845
- childMetadata.url,
11220
+ child.url,
10846
11221
  JSON.stringify(parentPath),
10847
11222
  BigInt(id)
10848
11223
  );
10849
11224
  if (!result) {
10850
11225
  return null;
10851
11226
  }
10852
- return mapDbDocumentToDocument(result);
11227
+ return this.parseMetadata(result);
10853
11228
  } catch (error) {
10854
- throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
11229
+ logger.warn(`Failed to find parent chunk for ID ${id}: ${error}`);
11230
+ return null;
10855
11231
  }
10856
11232
  }
10857
11233
  /**
10858
11234
  * Fetches multiple documents by their IDs in a single call.
10859
- * Returns an array of Document objects, sorted by their sort_order.
11235
+ * Returns an array of DbPageChunk objects, sorted by their sort_order.
10860
11236
  */
10861
11237
  async findChunksByIds(library, version2, ids) {
10862
11238
  if (!ids.length) return [];
@@ -10864,7 +11240,7 @@ class DocumentStore {
10864
11240
  const normalizedVersion = version2.toLowerCase();
10865
11241
  const placeholders = ids.map(() => "?").join(",");
10866
11242
  const stmt = this.db.prepare(
10867
- `SELECT d.*, p.url, p.title, p.content_type FROM documents d
11243
+ `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
10868
11244
  JOIN pages p ON d.page_id = p.id
10869
11245
  JOIN versions v ON p.version_id = v.id
10870
11246
  JOIN libraries l ON v.library_id = l.id
@@ -10878,20 +11254,20 @@ class DocumentStore {
10878
11254
  normalizedVersion,
10879
11255
  ...ids
10880
11256
  );
10881
- return rows.map((row) => mapDbDocumentToDocument(row));
11257
+ return this.parseMetadataArray(rows);
10882
11258
  } catch (error) {
10883
11259
  throw new ConnectionError("Failed to fetch documents by IDs", error);
10884
11260
  }
10885
11261
  }
10886
11262
  /**
10887
11263
  * Fetches all document chunks for a specific URL within a library and version.
10888
- * Returns documents sorted by their sort_order for proper reassembly.
11264
+ * Returns DbPageChunk objects sorted by their sort_order for proper reassembly.
10889
11265
  */
10890
11266
  async findChunksByUrl(library, version2, url) {
10891
11267
  try {
10892
11268
  const normalizedVersion = version2.toLowerCase();
10893
11269
  const stmt = this.db.prepare(
10894
- `SELECT d.*, p.url, p.title, p.content_type FROM documents d
11270
+ `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
10895
11271
  JOIN pages p ON d.page_id = p.id
10896
11272
  JOIN versions v ON p.version_id = v.id
10897
11273
  JOIN libraries l ON v.library_id = l.id
@@ -10905,7 +11281,7 @@ class DocumentStore {
10905
11281
  normalizedVersion,
10906
11282
  url
10907
11283
  );
10908
- return rows.map((row) => mapDbDocumentToDocument(row));
11284
+ return this.parseMetadataArray(rows);
10909
11285
  } catch (error) {
10910
11286
  throw new ConnectionError(`Failed to fetch documents by URL ${url}`, error);
10911
11287
  }
@@ -10923,9 +11299,8 @@ class DocumentManagementService {
10923
11299
  return (version2 ?? "").toLowerCase();
10924
11300
  }
10925
11301
  constructor(storePath, embeddingConfig, pipelineConfig) {
10926
- const dbDir = storePath;
10927
- const dbPath = path.join(dbDir, "documents.db");
10928
- logger.debug(`Using database directory: ${dbDir}`);
11302
+ const dbPath = storePath === ":memory:" ? ":memory:" : path.join(storePath, "documents.db");
11303
+ logger.debug(`Using database path: ${dbPath}`);
10929
11304
  this.store = new DocumentStore(dbPath, embeddingConfig);
10930
11305
  this.documentRetriever = new DocumentRetrieverService(this.store);
10931
11306
  this.pipelines = PipelineFactory$1.createStandardPipelines(pipelineConfig);
@@ -11136,9 +11511,24 @@ class DocumentManagementService {
11136
11511
  logger.info(
11137
11512
  `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
11138
11513
  );
11139
- const count = await this.store.deleteDocuments(library, normalizedVersion);
11514
+ const count = await this.store.deletePages(library, normalizedVersion);
11140
11515
  logger.info(`🗑️ Deleted ${count} documents`);
11141
11516
  }
11517
+ /**
11518
+ * Deletes a page and all its associated document chunks.
11519
+ * This is used during refresh operations when a page returns 404 Not Found.
11520
+ */
11521
+ async deletePage(pageId) {
11522
+ logger.debug(`Deleting page ID: ${pageId}`);
11523
+ await this.store.deletePage(pageId);
11524
+ }
11525
+ /**
11526
+ * Retrieves all pages for a specific version ID with their metadata.
11527
+ * Used for refresh operations to get existing pages with their ETags and depths.
11528
+ */
11529
+ async getPagesByVersionId(versionId) {
11530
+ return this.store.getPagesByVersionId(versionId);
11531
+ }
11142
11532
  /**
11143
11533
  * Completely removes a library version and all associated documents.
11144
11534
  * Also removes the library if no other versions remain.
@@ -11147,15 +11537,13 @@ class DocumentManagementService {
11147
11537
  */
11148
11538
  async removeVersion(library, version2) {
11149
11539
  const normalizedVersion = this.normalizeVersion(version2);
11150
- logger.info(`🗑️ Removing version: ${library}@${normalizedVersion || "[no version]"}`);
11540
+ logger.debug(`Removing version: ${library}@${normalizedVersion || "[no version]"}`);
11151
11541
  const result = await this.store.removeVersion(library, normalizedVersion, true);
11152
- logger.info(
11153
- `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`
11154
- );
11542
+ logger.info(`🗑️ Removed ${result.documentsDeleted} documents`);
11155
11543
  if (result.versionDeleted && result.libraryDeleted) {
11156
- logger.info(`✅ Completely removed library ${library} (was last version)`);
11544
+ logger.info(`🗑️ Completely removed library ${library} (was last version)`);
11157
11545
  } else if (result.versionDeleted) {
11158
- logger.info(`✅ Removed version ${library}@${normalizedVersion || "[no version]"}`);
11546
+ logger.info(`🗑️ Removed version ${library}@${normalizedVersion || "[no version]"}`);
11159
11547
  } else {
11160
11548
  logger.warn(
11161
11549
  `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`
@@ -11163,91 +11551,68 @@ class DocumentManagementService {
11163
11551
  }
11164
11552
  }
11165
11553
  /**
11166
- * Adds a document to the store, splitting it into smaller chunks for better search results.
11167
- * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
11168
- * Preserves hierarchical structure of documents and distinguishes between text and code segments.
11169
- * If version is omitted, the document is added without a specific version.
11554
+ * Adds pre-processed content directly to the store.
11555
+ * This method is used when content has already been processed by a pipeline,
11556
+ * avoiding redundant processing. Used primarily by the scraping pipeline.
11557
+ *
11558
+ * @param library Library name
11559
+ * @param version Version string (null/undefined for unversioned)
11560
+ * @param depth Crawl depth at which the page was scraped
11561
+ * @param result Scrape result containing the pre-split chunks
11170
11562
  */
11171
- async addDocument(library, version2, document2) {
11563
+ async addScrapeResult(library, version2, depth, result) {
11172
11564
  const processingStart = performance.now();
11173
11565
  const normalizedVersion = this.normalizeVersion(version2);
11174
- const url = document2.metadata.url;
11175
- if (!url || typeof url !== "string" || !url.trim()) {
11176
- throw new StoreError("Document metadata must include a valid URL");
11566
+ const { url, title, chunks, contentType } = result;
11567
+ if (!url) {
11568
+ throw new StoreError("Processed content metadata must include a valid URL");
11177
11569
  }
11178
- logger.info(`📚 Adding document: ${document2.metadata.title}`);
11179
- if (!document2.pageContent.trim()) {
11180
- throw new Error("Document content cannot be empty");
11570
+ logger.info(`📚 Adding processed content: ${title || url}`);
11571
+ if (chunks.length === 0) {
11572
+ logger.warn(`⚠️ No chunks in processed content for ${url}. Skipping.`);
11573
+ return;
11181
11574
  }
11182
- const contentType = document2.metadata.mimeType;
11183
11575
  try {
11184
- const rawContent = {
11185
- source: url,
11186
- content: document2.pageContent,
11187
- mimeType: contentType || "text/plain"
11188
- };
11189
- const pipeline = this.pipelines.find((p) => p.canProcess(rawContent));
11190
- if (!pipeline) {
11191
- logger.warn(
11192
- `⚠️ Unsupported content type "${rawContent.mimeType}" for document ${url}. Skipping processing.`
11193
- );
11194
- return;
11195
- }
11196
- logger.debug(
11197
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`
11198
- );
11199
- const scraperOptions = {
11200
- url,
11201
- library,
11202
- version: normalizedVersion,
11203
- scrapeMode: ScrapeMode.Fetch,
11204
- ignoreErrors: false,
11205
- maxConcurrency: 1
11206
- };
11207
- const processed = await pipeline.process(rawContent, scraperOptions);
11208
- const chunks = processed.chunks;
11209
- const splitDocs = chunks.map((chunk) => ({
11210
- pageContent: chunk.content,
11211
- metadata: {
11212
- ...document2.metadata,
11213
- level: chunk.section.level,
11214
- path: chunk.section.path
11215
- }
11216
- }));
11217
- logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
11218
- await this.store.addDocuments(library, normalizedVersion, splitDocs);
11576
+ logger.info(`✂️ Storing ${chunks.length} pre-split chunks`);
11577
+ await this.store.addDocuments(library, normalizedVersion, depth, result);
11219
11578
  const processingTime = performance.now() - processingStart;
11579
+ const totalContentSize = chunks.reduce(
11580
+ (sum, chunk) => sum + chunk.content.length,
11581
+ 0
11582
+ );
11220
11583
  analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, {
11221
11584
  // Content characteristics (privacy-safe)
11222
- mimeType: contentType || "unknown",
11223
- contentSizeBytes: document2.pageContent.length,
11585
+ mimeType: contentType,
11586
+ contentSizeBytes: totalContentSize,
11224
11587
  // Processing metrics
11225
11588
  processingTimeMs: Math.round(processingTime),
11226
- chunksCreated: splitDocs.length,
11589
+ chunksCreated: chunks.length,
11227
11590
  // Document characteristics
11228
- hasTitle: !!document2.metadata.title,
11229
- hasDescription: !!document2.metadata.description,
11591
+ hasTitle: !!title,
11230
11592
  urlDomain: extractHostname(url),
11231
- depth: document2.metadata.depth,
11593
+ depth,
11232
11594
  // Library context
11233
11595
  library,
11234
11596
  libraryVersion: normalizedVersion || null,
11235
11597
  // Processing efficiency
11236
- avgChunkSizeBytes: Math.round(document2.pageContent.length / splitDocs.length),
11598
+ avgChunkSizeBytes: Math.round(totalContentSize / chunks.length),
11237
11599
  processingSpeedKbPerSec: Math.round(
11238
- document2.pageContent.length / 1024 / (processingTime / 1e3)
11600
+ totalContentSize / 1024 / (processingTime / 1e3)
11239
11601
  )
11240
11602
  });
11241
11603
  } catch (error) {
11242
11604
  const processingTime = performance.now() - processingStart;
11243
11605
  if (error instanceof Error) {
11244
11606
  analytics.captureException(error, {
11245
- mimeType: contentType || "unknown",
11246
- contentSizeBytes: document2.pageContent.length,
11607
+ mimeType: contentType,
11608
+ contentSizeBytes: chunks.reduce(
11609
+ (sum, chunk) => sum + chunk.content.length,
11610
+ 0
11611
+ ),
11247
11612
  processingTimeMs: Math.round(processingTime),
11248
11613
  library,
11249
11614
  libraryVersion: normalizedVersion || null,
11250
- context: "document_processing",
11615
+ context: "processed_content_storage",
11251
11616
  component: DocumentManagementService.constructor.name
11252
11617
  });
11253
11618
  }
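
The hunk above replaces addDocument(library, version, document) with addScrapeResult(library, version, depth, result): the store now receives content that the scraper pipeline has already split into chunks, so pipeline selection and splitting no longer happen inside this method. Below is a minimal usage sketch. The result and chunk shapes are inferred from the destructuring in the diff ({ url, title, chunks, contentType } and chunk.content / chunk.section); the names storePage and ScrapeResultLike, and the exact field types, are illustrative rather than taken from the package.

// Sketch only: shapes inferred from the diff above, not from published types.
interface SectionInfo {
  level: number;
  path: string[]; // assumed; only chunk.section.path is visible in the old code
}

interface ContentChunk {
  content: string;
  section: SectionInfo;
}

interface ScrapeResultLike {
  url: string;
  title?: string;
  contentType?: string;
  chunks: ContentChunk[];
}

// Hypothetical caller: the scraper pipeline hands pre-split chunks to the
// document management service instead of raw page content.
async function storePage(docService: {
  addScrapeResult(
    library: string,
    version: string | null,
    depth: number,
    result: ScrapeResultLike,
  ): Promise<void>;
}): Promise<void> {
  const result: ScrapeResultLike = {
    url: "https://example.com/docs/getting-started",
    title: "Getting Started",
    contentType: "text/markdown",
    chunks: [
      {
        content: "# Getting Started\nInstall the package and run the server.",
        section: { level: 1, path: ["Getting Started"] },
      },
    ],
  };
  // depth is the crawl depth of the page (0 for the root URL).
  await docService.addScrapeResult("example-lib", "1.0.0", 0, result);
}
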
@@ -11277,6 +11642,18 @@ class DocumentManagementService {
11277
11642
  );
11278
11643
  return versionId;
11279
11644
  }
11645
+ /**
11646
+ * Retrieves a version by its ID from the database.
11647
+ */
11648
+ async getVersionById(versionId) {
11649
+ return this.store.getVersionById(versionId);
11650
+ }
11651
+ /**
11652
+ * Retrieves a library by its ID from the database.
11653
+ */
11654
+ async getLibraryById(libraryId) {
11655
+ return this.store.getLibraryById(libraryId);
11656
+ }
11280
11657
  }
11281
11658
  async function createDocumentManagement(options = {}) {
11282
11659
  if (options.serverUrl) {
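
The same hunk also adds two thin accessors, getVersionById and getLibraryById, which delegate straight to the store. A rough sketch of how a caller such as a refresh job might resolve the records behind a stored page; the numeric ID and the library_id and name fields on the returned records are assumptions, since their shapes are not shown in this diff.

// Illustrative only: the record shapes (library_id, name) are assumed.
async function describeVersion(
  docService: {
    getVersionById(id: number): Promise<{ library_id: number; name: string | null } | null>;
    getLibraryById(id: number): Promise<{ name: string } | null>;
  },
  versionId: number,
): Promise<string | null> {
  const version = await docService.getVersionById(versionId);
  if (!version) return null;
  const library = await docService.getLibraryById(version.library_id);
  return `${library?.name ?? "unknown"}@${version.name ?? "unversioned"}`;
}
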
@@ -11368,6 +11745,7 @@ async function initializeTools(docService, pipeline) {
11368
11745
  listLibraries: new ListLibrariesTool(docService),
11369
11746
  findVersion: new FindVersionTool(docService),
11370
11747
  scrape: new ScrapeTool(pipeline),
11748
+ refresh: new RefreshVersionTool(pipeline),
11371
11749
  search: new SearchTool(docService),
11372
11750
  listJobs: new ListJobsTool(pipeline),
11373
11751
  getJobInfo: new GetJobInfoTool(pipeline),
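
With refresh registered alongside scrape, any caller holding the initialized tools map can trigger a refresh the same way the CLI action later in this diff does. A short sketch reusing the execute() arguments and result branches visible in the refreshAction hunk below; the triggerRefresh wrapper and its loose typing are illustrative.

// Assumes `tools` is the object returned by initializeTools(docService, pipeline);
// the argument names and result branches mirror refreshAction further below.
async function triggerRefresh(tools: {
  refresh: {
    execute(args: {
      library: string;
      version?: string;
      waitForCompletion?: boolean;
    }): Promise<{ pagesRefreshed: number } | { jobId: string }>;
  };
}): Promise<void> {
  const result = await tools.refresh.execute({
    library: "react",
    version: "18.0.0",
    waitForCompletion: false, // return a job ID instead of blocking
  });
  if ("pagesRefreshed" in result) {
    console.log(`Refreshed ${result.pagesRefreshed} pages`);
  } else {
    console.log(`Refresh job queued: ${result.jobId}`);
  }
}
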
@@ -11480,11 +11858,15 @@ const optionalTrimmed = z$1.preprocess(
11480
11858
  (v) => typeof v === "string" ? v.trim() : v,
11481
11859
  z$1.string().min(1).optional().nullable()
11482
11860
  );
11483
- const enqueueInput = z$1.object({
11861
+ const enqueueScrapeInput = z$1.object({
11484
11862
  library: nonEmptyTrimmed,
11485
11863
  version: optionalTrimmed,
11486
11864
  options: z$1.custom()
11487
11865
  });
11866
+ const enqueueRefreshInput = z$1.object({
11867
+ library: nonEmptyTrimmed,
11868
+ version: optionalTrimmed
11869
+ });
11488
11870
  const jobIdInput = z$1.object({ id: z$1.string().min(1) });
11489
11871
  const getJobsInput = z$1.object({
11490
11872
  status: z$1.nativeEnum(PipelineJobStatus).optional()
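
The new enqueueRefreshInput deliberately omits the options field that scrape jobs accept, since a refresh reuses the configuration of the original scrape. A standalone sketch of the two input schemas written against zod directly; the definition of nonEmptyTrimmed is assumed from its name, as only optionalTrimmed appears in this hunk, and the payload values are made up.

import { z } from "zod";

// Rough equivalents of the input schemas shown in the diff.
const nonEmptyTrimmed = z.preprocess(
  (v) => (typeof v === "string" ? v.trim() : v),
  z.string().min(1),
);
const optionalTrimmed = z.preprocess(
  (v) => (typeof v === "string" ? v.trim() : v),
  z.string().min(1).optional().nullable(),
);

const enqueueScrapeInput = z.object({
  library: nonEmptyTrimmed,
  version: optionalTrimmed,
  options: z.custom(), // scraper options pass through unchecked
});
const enqueueRefreshInput = z.object({
  library: nonEmptyTrimmed,
  version: optionalTrimmed,
});

// Scrape jobs carry scraper options; refresh jobs only identify the target version.
enqueueScrapeInput.parse({ library: "react", version: "18.0.0", options: { maxDepth: 2 } });
enqueueRefreshInput.parse({ library: "react", version: "18.0.0" });
enqueueRefreshInput.parse({ library: "react", version: null }); // unversioned library
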
@@ -11492,12 +11874,12 @@ const getJobsInput = z$1.object({
11492
11874
  function createPipelineRouter(trpc) {
11493
11875
  const tt = trpc;
11494
11876
  return tt.router({
11495
- enqueueJob: tt.procedure.input(enqueueInput).mutation(
11877
+ enqueueScrapeJob: tt.procedure.input(enqueueScrapeInput).mutation(
11496
11878
  async ({
11497
11879
  ctx,
11498
11880
  input
11499
11881
  }) => {
11500
- const jobId = await ctx.pipeline.enqueueJob(
11882
+ const jobId = await ctx.pipeline.enqueueScrapeJob(
11501
11883
  input.library,
11502
11884
  input.version ?? null,
11503
11885
  input.options
@@ -11517,6 +11899,18 @@ function createPipelineRouter(trpc) {
11517
11899
  return { jobId };
11518
11900
  }
11519
11901
  ),
11902
+ enqueueRefreshJob: tt.procedure.input(enqueueRefreshInput).mutation(
11903
+ async ({
11904
+ ctx,
11905
+ input
11906
+ }) => {
11907
+ const jobId = await ctx.pipeline.enqueueRefreshJob(
11908
+ input.library,
11909
+ input.version ?? null
11910
+ );
11911
+ return { jobId };
11912
+ }
11913
+ ),
11520
11914
  getJob: tt.procedure.input(jobIdInput).query(
11521
11915
  async ({
11522
11916
  ctx,
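
For RPC clients, renaming enqueueJob to enqueueScrapeJob and adding enqueueRefreshJob is a breaking change at the procedure level. A sketch of what the client-side calls might look like with a generic @trpc/client proxy, assuming the pipeline procedures are reachable at the root of the router (in the deployed server they may be nested); the AppRouter import path, URL, and scraper options are placeholders.

import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
// Placeholder: the package does not document an exported router type under this path.
import type { AppRouter } from "./worker-router";

async function enqueueJobs() {
  const client = createTRPCProxyClient<AppRouter>({
    links: [httpBatchLink({ url: "http://localhost:8080/api" })],
  });

  // Previously a single enqueueJob procedure handled this call.
  const scrape = await client.enqueueScrapeJob.mutate({
    library: "react",
    version: "18.0.0",
    options: { url: "https://react.dev", maxDepth: 2 }, // illustrative scraper options
  });

  // New procedure: refresh an already-indexed version, no scraper options.
  const refresh = await client.enqueueRefreshJob.mutate({
    library: "react",
    version: "18.0.0",
  });

  return { scrapeJobId: scrape.jobId, refreshJobId: refresh.jobId };
}
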
@@ -13456,7 +13850,7 @@ async function registerWorkerService(pipeline) {
13456
13850
  },
13457
13851
  onJobError: async (job, error, document2) => {
13458
13852
  logger.warn(
13459
- `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.metadata.url}` : ""}: ${error.message}`
13853
+ `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.url}` : ""}: ${error.message}`
13460
13854
  );
13461
13855
  analytics.captureException(error, {
13462
13856
  jobId: job.id,
@@ -13996,7 +14390,7 @@ async function findVersionAction(library, options, command) {
13996
14390
  function createFindVersionCommand(program) {
13997
14391
  return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").option(
13998
14392
  "--server-url <url>",
13999
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14393
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14000
14394
  ).action(findVersionAction);
14001
14395
  }
14002
14396
  async function listAction(options, command) {
@@ -14022,7 +14416,7 @@ async function listAction(options, command) {
14022
14416
  function createListCommand(program) {
14023
14417
  return program.command("list").description("List all available libraries and their versions").option(
14024
14418
  "--server-url <url>",
14025
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14419
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14026
14420
  ).action(listAction);
14027
14421
  }
14028
14422
  function createMcpCommand(program) {
@@ -14045,7 +14439,7 @@ function createMcpCommand(program) {
14045
14439
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14046
14440
  ).option(
14047
14441
  "--server-url <url>",
14048
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14442
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14049
14443
  ).option(
14050
14444
  "--read-only",
14051
14445
  "Run in read-only mode (only expose read tools, disable write/job tools)",
@@ -14169,6 +14563,68 @@ function createMcpCommand(program) {
14169
14563
  }
14170
14564
  );
14171
14565
  }
14566
+ async function refreshAction(library, options, command) {
14567
+ await analytics.track(TelemetryEvent.CLI_COMMAND, {
14568
+ command: "refresh",
14569
+ library,
14570
+ version: options.version,
14571
+ useServerUrl: !!options.serverUrl
14572
+ });
14573
+ const serverUrl = options.serverUrl;
14574
+ const globalOptions = getGlobalOptions(command);
14575
+ const embeddingConfig = resolveEmbeddingContext(options.embeddingModel);
14576
+ if (!serverUrl && !embeddingConfig) {
14577
+ throw new Error(
14578
+ "Embedding configuration is required for local refresh operations. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
14579
+ );
14580
+ }
14581
+ const docService = await createDocumentManagement({
14582
+ serverUrl,
14583
+ embeddingConfig,
14584
+ storePath: globalOptions.storePath
14585
+ });
14586
+ let pipeline = null;
14587
+ try {
14588
+ const pipelineOptions = {
14589
+ recoverJobs: false,
14590
+ concurrency: 1,
14591
+ serverUrl
14592
+ };
14593
+ pipeline = await createPipelineWithCallbacks(
14594
+ serverUrl ? void 0 : docService,
14595
+ pipelineOptions
14596
+ );
14597
+ await pipeline.start();
14598
+ const refreshTool = new RefreshVersionTool(pipeline);
14599
+ const result = await refreshTool.execute({
14600
+ library,
14601
+ version: options.version,
14602
+ waitForCompletion: true
14603
+ // Always wait for completion in CLI
14604
+ });
14605
+ if ("pagesRefreshed" in result) {
14606
+ console.log(`✅ Successfully refreshed ${result.pagesRefreshed} pages`);
14607
+ } else {
14608
+ console.log(`🚀 Refresh job started with ID: ${result.jobId}`);
14609
+ }
14610
+ } finally {
14611
+ if (pipeline) await pipeline.stop();
14612
+ await docService.shutdown();
14613
+ }
14614
+ }
14615
+ function createRefreshCommand(program) {
14616
+ return program.command("refresh <library>").description(
14617
+ "Re-scrape an existing library version, updating only changed pages.\n\nUses HTTP ETags to efficiently skip unchanged pages and only re-process\ncontent that has been modified or deleted since the last scrape.\n\nExamples:\n refresh react --version 18.0.0\n refresh mylib\n\nNote: The library and version must already be indexed. Use 'scrape' to index a new library/version."
14618
+ ).option("-v, --version <string>", "Version of the library (optional)").addOption(
14619
+ new Option(
14620
+ "--embedding-model <model>",
14621
+ "Embedding model configuration (e.g., 'openai:text-embedding-3-small')"
14622
+ ).env("DOCS_MCP_EMBEDDING_MODEL")
14623
+ ).option(
14624
+ "--server-url <url>",
14625
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14626
+ ).action(refreshAction);
14627
+ }
14172
14628
  async function removeAction(library, options, command) {
14173
14629
  await analytics.track(TelemetryEvent.CLI_COMMAND, {
14174
14630
  command: "remove",
@@ -14203,7 +14659,7 @@ function createRemoveCommand(program) {
14203
14659
  "Version to remove (optional, removes unversioned if omitted)"
14204
14660
  ).option(
14205
14661
  "--server-url <url>",
14206
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14662
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14207
14663
  ).action(removeAction);
14208
14664
  }
14209
14665
  async function scrapeAction(library, url, options, command) {
@@ -14343,7 +14799,7 @@ function createScrapeCommand(program) {
14343
14799
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14344
14800
  ).option(
14345
14801
  "--server-url <url>",
14346
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14802
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14347
14803
  ).action(scrapeAction);
14348
14804
  }
14349
14805
  async function searchAction(library, query, options, command) {
@@ -14396,7 +14852,7 @@ function createSearchCommand(program) {
14396
14852
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14397
14853
  ).option(
14398
14854
  "--server-url <url>",
14399
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14855
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14400
14856
  ).action(searchAction);
14401
14857
  }
14402
14858
  function createWebCommand(program) {
@@ -14417,7 +14873,7 @@ function createWebCommand(program) {
14417
14873
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14418
14874
  ).option(
14419
14875
  "--server-url <url>",
14420
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14876
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14421
14877
  ).action(
14422
14878
  async (cmdOptions, command) => {
14423
14879
  await analytics.track(TelemetryEvent.CLI_COMMAND, {
@@ -14612,6 +15068,7 @@ function createCliProgram() {
14612
15068
  createWebCommand(program);
14613
15069
  createWorkerCommand(program);
14614
15070
  createScrapeCommand(program);
15071
+ createRefreshCommand(program);
14615
15072
  createSearchCommand(program);
14616
15073
  createListCommand(program);
14617
15074
  createFindVersionCommand(program);