@arabold/docs-mcp-server 1.26.1 → 1.27.0

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,4 +1,4 @@
- #!/usr/bin/env node
+ #!/usr/bin/env node --enable-source-maps
  import "dotenv/config";
  import { BedrockEmbeddings } from "@langchain/aws";
  import { GoogleGenerativeAIEmbeddings } from "@langchain/google-genai";
@@ -6,7 +6,7 @@ import { VertexAIEmbeddings } from "@langchain/google-vertexai";
  import { AzureOpenAIEmbeddings, OpenAIEmbeddings } from "@langchain/openai";
  import { Embeddings } from "@langchain/core/embeddings";
  import { PostHog } from "posthog-node";
- import { randomUUID } from "node:crypto";
+ import crypto, { randomUUID } from "node:crypto";
  import fs, { existsSync, readFileSync } from "node:fs";
  import path from "node:path";
  import { fileURLToPath, URL as URL$1 } from "node:url";
@@ -27,6 +27,7 @@ import psl from "psl";
  import { HeaderGenerator } from "header-generator";
  import fs$1 from "node:fs/promises";
  import axios from "axios";
+ import { minimatch } from "minimatch";
  import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
  import remarkGfm from "remark-gfm";
  import remarkHtml from "remark-html";
@@ -40,7 +41,6 @@ import * as cheerio from "cheerio";
  import "node:vm";
  import { gfm } from "@joplin/turndown-plugin-gfm";
  import iconv from "iconv-lite";
- import { minimatch } from "minimatch";
  import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
  import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
  import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
@@ -113,21 +113,6 @@ class MissingCredentialsError extends StoreError {
  }
  }
  const VECTOR_DIMENSION = 1536;
- function mapDbDocumentToDocument(doc) {
- const chunkMetadata = JSON.parse(doc.metadata);
- return {
- id: doc.id,
- pageContent: doc.content,
- metadata: {
- ...chunkMetadata,
- // Page-level fields are always available from joined queries
- url: doc.url,
- title: doc.title || "",
- // Convert null to empty string for consistency
- ...doc.content_type && { contentType: doc.content_type }
- }
- };
- }
  var VersionStatus = /* @__PURE__ */ ((VersionStatus2) => {
  VersionStatus2["NOT_INDEXED"] = "not_indexed";
  VersionStatus2["QUEUED"] = "queued";
@@ -784,16 +769,16 @@ function extractProtocol(urlOrPath) {
  }
  }
  const name = "@arabold/docs-mcp-server";
- const version = "1.26.0";
+ const version = "1.26.2";
  const description = "MCP server for fetching and searching documentation";
  const type = "module";
  const bin = { "docs-mcp-server": "dist/index.js" };
  const license = "MIT";
  const repository = { "type": "git", "url": "git+https://github.com/arabold/docs-mcp-server.git" };
  const files = ["dist", "public", "db", "README.md", "LICENSE", "package.json"];
- const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev": "npm-run-all --parallel dev:server dev:web", "dev:server": "vite-node --watch src/index.ts", "dev:web": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "test:e2e": "vitest run --config test/vitest.config.ts", "test:e2e:watch": "vitest --config test/vitest.config.ts", "lint": "biome check .", "lint:fix": "biome check . --fix", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. See README.md for details.'" };
- const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.2.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", "@langchain/aws": "^0.1.13", "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", "@modelcontextprotocol/sdk": "^1.17.1", "@trpc/client": "^11.4.4", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", "axios": "^1.11.0", "axios-retry": "^4.5.0", "better-sqlite3": "^12.2.0", "cheerio": "^1.1.2", "commander": "^14.0.0", "dompurify": "^3.2.6", "dotenv": "^17.2.1", "env-paths": "^3.0.0", "fastify": "^5.4.0", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", "header-generator": "^2.1.69", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", "jose": "^6.0.12", "jsdom": "^26.1.0", "langchain": "^0.3.30", "mime": "^4.0.7", "minimatch": "^10.0.1", "playwright": "^1.52.0", "posthog-node": "^5.7.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", "semver": "^7.7.2", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", "turndown": "^7.2.0", "zod": "^4.0.14" };
- const devDependencies = { "@biomejs/biome": "^2.1.3", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^11.0.3", "@semantic-release/npm": "^12.0.2", "@tailwindcss/postcss": "^4.1.11", "@tailwindcss/vite": "^4.1.11", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", "@types/lint-staged": "~13.3.0", "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", "@types/semver": "^7.7.0", "@types/turndown": "^5.0.5", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", "lint-staged": "^16.1.2", "memfs": "^4.34.0", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", "semantic-release": "^24.2.7", "tailwindcss": "^4.1.4", "typescript": "^5.9.2", "vite": "^6.3.5", "vite-node": "^3.1.2", "vite-plugin-dts": "^4.5.4", "vitest": "^3.2.4" };
+ const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev": "npm-run-all --parallel dev:server dev:web", "dev:server": "vite-node --watch src/index.ts", "dev:web": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "test:unit": "vitest run src", "test:e2e": "vitest run test", "test:live": "vitest run --exclude= test/html-pipeline-live-e2e.test.ts", "lint": "biome check .", "lint:fix": "biome check . --fix", "typecheck": "npx tsc --noEmit", "typecheck:build": "npx tsc --noEmit --project tsconfig.build.json", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. See README.md for details.'" };
+ const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.3.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", "@langchain/aws": "^0.1.13", "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", "@modelcontextprotocol/sdk": "^1.20.2", "@trpc/client": "^11.7.1", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", "axios": "^1.13.1", "axios-retry": "^4.5.0", "better-sqlite3": "^12.4.1", "cheerio": "^1.1.2", "commander": "^14.0.0", "dompurify": "^3.3.0", "dotenv": "^17.2.3", "env-paths": "^3.0.0", "fastify": "^5.6.1", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", "header-generator": "^2.1.76", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", "jose": "^6.1.0", "jsdom": "^26.1.0", "langchain": "^0.3.30", "mime": "^4.1.0", "minimatch": "^10.0.1", "playwright": "^1.52.0", "posthog-node": "^5.11.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", "semver": "^7.7.3", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", "turndown": "^7.2.2", "zod": "^4.1.12" };
+ const devDependencies = { "@biomejs/biome": "^2.3.2", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^11.0.6", "@semantic-release/npm": "^12.0.2", "@tailwindcss/postcss": "^4.1.16", "@tailwindcss/vite": "^4.1.16", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", "@types/lint-staged": "~13.3.0", "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", "@types/semver": "^7.7.1", "@types/turndown": "^5.0.6", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", "lint-staged": "^16.2.6", "memfs": "^4.50.0", "msw": "^2.12.2", "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", "semantic-release": "^24.2.9", "tailwindcss": "^4.1.4", "typescript": "^5.9.3", "vite": "^6.3.5", "vite-node": "^3.2.4", "vite-plugin-dts": "^4.5.4", "vitest": "^3.2.4" };
  const engines = { "node": ">=20.0.0" };
  const packageJson = {
  name,
@@ -1288,10 +1273,10 @@ class PipelineClient {
  this.activePolling.clear();
  logger.debug("PipelineClient stopped");
  }
- async enqueueJob(library, version2, options) {
+ async enqueueScrapeJob(library, version2, options) {
  try {
  const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
- const result = await this.client.enqueueJob.mutate({
+ const result = await this.client.enqueueScrapeJob.mutate({
  library,
  version: normalizedVersion,
  options
@@ -1304,6 +1289,21 @@ class PipelineClient {
  );
  }
  }
+ async enqueueRefreshJob(library, version2) {
+ try {
+ const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
+ const result = await this.client.enqueueRefreshJob.mutate({
+ library,
+ version: normalizedVersion
+ });
+ logger.debug(`Refresh job ${result.jobId} enqueued successfully`);
+ return result.jobId;
+ } catch (error) {
+ throw new Error(
+ `Failed to enqueue refresh job: ${error instanceof Error ? error.message : String(error)}`
+ );
+ }
+ }
  async getJob(jobId) {
  try {
  const serializedJob = await this.client.getJob.query({ id: jobId });
@@ -1753,6 +1753,12 @@ class FingerprintGenerator {
  return this.headerGenerator.getHeaders();
  }
  }
+ var FetchStatus = /* @__PURE__ */ ((FetchStatus2) => {
+ FetchStatus2["SUCCESS"] = "success";
+ FetchStatus2["NOT_MODIFIED"] = "not_modified";
+ FetchStatus2["NOT_FOUND"] = "not_found";
+ return FetchStatus2;
+ })(FetchStatus || {});
  class BrowserFetcher {
  browser = null;
  page = null;
@@ -1792,13 +1798,16 @@ class BrowserFetcher {
  const contentBuffer = Buffer.from(content, "utf-8");
  const contentType = response.headers()["content-type"] || "text/html";
  const { mimeType, charset } = MimeTypeUtils.parseContentType(contentType);
+ const etag = response.headers().etag;
  return {
  content: contentBuffer,
  mimeType,
  charset,
  encoding: void 0,
  // Browser handles encoding automatically
- source: finalUrl
+ source: finalUrl,
+ etag,
+ status: FetchStatus.SUCCESS
  };
  } catch (error) {
  if (options?.signal?.aborted) {
@@ -1859,24 +1868,48 @@ class FileFetcher {
1859
1868
  /**
1860
1869
  * Fetches the content of a file given a file:// URL, decoding percent-encoded paths as needed.
1861
1870
  * Uses enhanced MIME type detection for better source code file recognition.
1871
+ * Supports conditional fetching via ETag comparison for efficient refresh operations.
1862
1872
  */
1863
- async fetch(source, _options) {
1873
+ async fetch(source, options) {
1864
1874
  let filePath = source.replace(/^file:\/\/\/?/, "");
1865
1875
  filePath = decodeURIComponent(filePath);
1866
1876
  if (!filePath.startsWith("/") && process.platform !== "win32") {
1867
1877
  filePath = `/${filePath}`;
1868
1878
  }
1869
1879
  try {
1880
+ const stats = await fs$1.stat(filePath);
1881
+ const currentEtag = crypto.createHash("md5").update(stats.mtime.toISOString()).digest("hex");
1882
+ if (options?.etag && options.etag === currentEtag) {
1883
+ return {
1884
+ content: Buffer.from(""),
1885
+ mimeType: "text/plain",
1886
+ source,
1887
+ etag: currentEtag,
1888
+ lastModified: stats.mtime.toISOString(),
1889
+ status: FetchStatus.NOT_MODIFIED
1890
+ };
1891
+ }
1870
1892
  const content = await fs$1.readFile(filePath);
1871
1893
  const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
1872
1894
  const mimeType = detectedMimeType || "application/octet-stream";
1873
1895
  return {
1874
1896
  content,
1875
1897
  mimeType,
1876
- source
1898
+ source,
1899
+ etag: currentEtag,
1900
+ lastModified: stats.mtime.toISOString(),
1901
+ status: FetchStatus.SUCCESS
1877
1902
  // Don't assume charset for text files - let the pipeline detect it
1878
1903
  };
1879
1904
  } catch (error) {
1905
+ if (error.code === "ENOENT") {
1906
+ return {
1907
+ content: Buffer.from(""),
1908
+ mimeType: "text/plain",
1909
+ source,
1910
+ status: FetchStatus.NOT_FOUND
1911
+ };
1912
+ }
1880
1913
  throw new ScraperError(
1881
1914
  `Failed to read file ${filePath}: ${error.message ?? "Unknown error"}`,
1882
1915
  false,
@@ -1982,6 +2015,12 @@ class HttpFetcher {
1982
2015
  ...options?.headers
1983
2016
  // User-provided headers override generated ones
1984
2017
  };
2018
+ if (options?.etag) {
2019
+ headers["If-None-Match"] = options.etag;
2020
+ logger.debug(
2021
+ `Conditional request for ${source} with If-None-Match: ${options.etag}`
2022
+ );
2023
+ }
1985
2024
  const config = {
1986
2025
  responseType: "arraybuffer",
1987
2026
  headers: {
@@ -1995,9 +2034,22 @@ class HttpFetcher {
1995
2034
  // Pass signal to axios
1996
2035
  // Axios follows redirects by default, we need to explicitly disable it if needed
1997
2036
  maxRedirects: followRedirects ? 5 : 0,
1998
- decompress: true
2037
+ decompress: true,
2038
+ // Allow 304 responses to be handled as successful responses
2039
+ validateStatus: (status) => {
2040
+ return status >= 200 && status < 300 || status === 304;
2041
+ }
1999
2042
  };
2000
2043
  const response = await axios.get(source, config);
2044
+ if (response.status === 304) {
2045
+ logger.debug(`HTTP 304 Not Modified for ${source}`);
2046
+ return {
2047
+ content: Buffer.from(""),
2048
+ mimeType: "text/plain",
2049
+ source,
2050
+ status: FetchStatus.NOT_MODIFIED
2051
+ };
2052
+ }
2001
2053
  const contentTypeHeader = response.headers["content-type"];
2002
2054
  const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader);
2003
2055
  const contentEncoding = response.headers["content-encoding"];
@@ -2017,12 +2069,21 @@ class HttpFetcher {
2017
2069
  response.request?.responseUrl || // Fallback to axios recorded config URL
2018
2070
  response.config?.url || source
2019
2071
  );
2072
+ const etag = response.headers.etag || response.headers.ETag;
2073
+ if (etag) {
2074
+ logger.debug(`Received ETag for ${source}: ${etag}`);
2075
+ }
2076
+ const lastModified = response.headers["last-modified"];
2077
+ const lastModifiedISO = lastModified ? new Date(lastModified).toISOString() : void 0;
2020
2078
  return {
2021
2079
  content,
2022
2080
  mimeType,
2023
2081
  charset,
2024
2082
  encoding: contentEncoding,
2025
- source: finalUrl
2083
+ source: finalUrl,
2084
+ etag,
2085
+ lastModified: lastModifiedISO,
2086
+ status: FetchStatus.SUCCESS
2026
2087
  };
2027
2088
  } catch (error) {
2028
2089
  const axiosError = error;
@@ -2031,6 +2092,15 @@ class HttpFetcher {
2031
2092
  if (options?.signal?.aborted || code === "ERR_CANCELED") {
2032
2093
  throw new CancellationError("HTTP fetch cancelled");
2033
2094
  }
2095
+ if (status === 404) {
2096
+ logger.debug(`Resource not found (404): ${source}`);
2097
+ return {
2098
+ content: Buffer.from(""),
2099
+ mimeType: "text/plain",
2100
+ source,
2101
+ status: FetchStatus.NOT_FOUND
2102
+ };
2103
+ }
2034
2104
  if (!followRedirects && status && status >= 300 && status < 400) {
2035
2105
  const location = axiosError.response?.headers?.location;
2036
2106
  if (location) {
@@ -2125,101 +2195,522 @@ class AutoDetectFetcher {
2125
2195
  ]);
2126
2196
  }
2127
2197
  }
2128
- class SplitterError extends Error {
2129
- }
2130
- class MinimumChunkSizeError extends SplitterError {
2131
- constructor(size, maxSize) {
2132
- super(
2133
- `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
2134
- );
2198
+ const DEFAULT_FILE_EXCLUSIONS = [
2199
+ // CHANGELOG files (case variations)
2200
+ "**/CHANGELOG.md",
2201
+ "**/changelog.md",
2202
+ "**/CHANGELOG.mdx",
2203
+ "**/changelog.mdx",
2204
+ // LICENSE files (case variations)
2205
+ "**/LICENSE",
2206
+ "**/LICENSE.md",
2207
+ "**/license.md",
2208
+ // CODE_OF_CONDUCT files (case variations)
2209
+ "**/CODE_OF_CONDUCT.md",
2210
+ "**/code_of_conduct.md",
2211
+ // Test files
2212
+ "**/*.test.*",
2213
+ "**/*.spec.*",
2214
+ "**/*_test.py",
2215
+ "**/*_test.go",
2216
+ // Package manager lock files
2217
+ "**/*.lock",
2218
+ "**/package-lock.json",
2219
+ "**/yarn.lock",
2220
+ "**/pnpm-lock.yaml",
2221
+ "**/go.sum",
2222
+ // Build artifacts
2223
+ "**/*.min.js",
2224
+ "**/*.min.css",
2225
+ "**/*.map",
2226
+ "**/*.d.ts",
2227
+ // IDE/System files
2228
+ "**/.DS_Store",
2229
+ "**/Thumbs.db",
2230
+ "**/*.swp",
2231
+ "**/*.swo",
2232
+ // Internal config files (using regex pattern)
2233
+ "/.*\\.(ini|cfg|conf|log|pid)$/"
2234
+ ];
2235
+ const DEFAULT_FOLDER_EXCLUSIONS = [
2236
+ // Archive and deprecated content (matches anywhere in path)
2237
+ "**/archive/**",
2238
+ "**/archived/**",
2239
+ "**/deprecated/**",
2240
+ "**/legacy/**",
2241
+ "**/old/**",
2242
+ "**/outdated/**",
2243
+ "**/previous/**",
2244
+ "**/superseded/**",
2245
+ // Specific paths that don't follow the general pattern
2246
+ "docs/old/**",
2247
+ // Test directories
2248
+ "**/test/**",
2249
+ "**/tests/**",
2250
+ "**/__tests__/**",
2251
+ "**/spec/**",
2252
+ // Build output directories
2253
+ "**/dist/**",
2254
+ "**/build/**",
2255
+ "**/out/**",
2256
+ "**/target/**",
2257
+ "**/.next/**",
2258
+ "**/.nuxt/**",
2259
+ // IDE directories
2260
+ "**/.vscode/**",
2261
+ "**/.idea/**",
2262
+ // Internationalization folders - non-English locales
2263
+ "**/i18n/ar*/**",
2264
+ "**/i18n/de*/**",
2265
+ "**/i18n/es*/**",
2266
+ "**/i18n/fr*/**",
2267
+ "**/i18n/hi*/**",
2268
+ "**/i18n/it*/**",
2269
+ "**/i18n/ja*/**",
2270
+ "**/i18n/ko*/**",
2271
+ "**/i18n/nl*/**",
2272
+ "**/i18n/pl*/**",
2273
+ "**/i18n/pt*/**",
2274
+ "**/i18n/ru*/**",
2275
+ "**/i18n/sv*/**",
2276
+ "**/i18n/th*/**",
2277
+ "**/i18n/tr*/**",
2278
+ "**/i18n/vi*/**",
2279
+ "**/i18n/zh*/**",
2280
+ // Common locale folder patterns
2281
+ "**/zh-cn/**",
2282
+ "**/zh-hk/**",
2283
+ "**/zh-mo/**",
2284
+ "**/zh-sg/**",
2285
+ "**/zh-tw/**"
2286
+ ];
2287
+ const DEFAULT_EXCLUSION_PATTERNS = [
2288
+ ...DEFAULT_FILE_EXCLUSIONS,
2289
+ ...DEFAULT_FOLDER_EXCLUSIONS
2290
+ ];
2291
+ function getEffectiveExclusionPatterns(userPatterns) {
2292
+ if (userPatterns !== void 0) {
2293
+ return userPatterns;
2135
2294
  }
2295
+ return DEFAULT_EXCLUSION_PATTERNS;
2136
2296
  }
2137
- class ContentSplitterError extends SplitterError {
2297
+ function isRegexPattern(pattern) {
2298
+ return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
2138
2299
  }
2139
- class GreedySplitter {
2140
- baseSplitter;
2141
- minChunkSize;
2142
- preferredChunkSize;
2143
- /**
2144
- * Combines a base document splitter with size constraints to produce optimally-sized chunks.
2145
- * The base splitter handles the initial semantic splitting, while this class handles
2146
- * the concatenation strategy.
2147
- */
2148
- constructor(baseSplitter, minChunkSize, preferredChunkSize) {
2149
- this.baseSplitter = baseSplitter;
2150
- this.minChunkSize = minChunkSize;
2151
- this.preferredChunkSize = preferredChunkSize;
2300
+ function patternToRegExp(pattern) {
2301
+ if (isRegexPattern(pattern)) {
2302
+ return new RegExp(pattern.slice(1, -1));
2152
2303
  }
2153
- /**
2154
- * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
2155
- * are combined until they reach the minimum size, but splits are preserved at major
2156
- * section boundaries to maintain document structure. This balances the need for
2157
- * context with semantic coherence.
2158
- */
2159
- async splitText(markdown, contentType) {
2160
- const initialChunks = await this.baseSplitter.splitText(markdown, contentType);
2161
- const concatenatedChunks = [];
2162
- let currentChunk = null;
2163
- for (const nextChunk of initialChunks) {
2164
- if (currentChunk) {
2165
- if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
2166
- concatenatedChunks.push(currentChunk);
2167
- currentChunk = this.cloneChunk(nextChunk);
2168
- continue;
2169
- }
2170
- if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
2171
- concatenatedChunks.push(currentChunk);
2172
- currentChunk = this.cloneChunk(nextChunk);
2173
- continue;
2174
- }
2175
- currentChunk.content += `${currentChunk.content.endsWith("\n") ? "" : "\n"}${nextChunk.content}`;
2176
- currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
2177
- currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
2178
- } else {
2179
- currentChunk = this.cloneChunk(nextChunk);
2180
- }
2181
- }
2182
- if (currentChunk) {
2183
- concatenatedChunks.push(currentChunk);
2304
+ const re = minimatch.makeRe(pattern, { dot: true });
2305
+ if (!re) throw new Error(`Invalid glob pattern: ${pattern}`);
2306
+ return re;
2307
+ }
2308
+ function matchesAnyPattern(path2, patterns) {
2309
+ if (!patterns || patterns.length === 0) return false;
2310
+ const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
2311
+ return patterns.some((pattern) => {
2312
+ if (isRegexPattern(pattern)) {
2313
+ return patternToRegExp(pattern).test(normalizedPath);
2184
2314
  }
2185
- return concatenatedChunks;
2186
- }
2187
- cloneChunk(chunk) {
2188
- return {
2189
- types: [...chunk.types],
2190
- content: chunk.content,
2191
- section: {
2192
- level: chunk.section.level,
2193
- path: [...chunk.section.path]
2194
- }
2195
- };
2196
- }
2197
- /**
2198
- * H1 and H2 headings represent major conceptual breaks in the document.
2199
- * Preserving these splits helps maintain the document's logical structure.
2200
- */
2201
- startsNewMajorSection(chunk) {
2202
- return chunk.section.level === 1 || chunk.section.level === 2;
2315
+ const pathForMatch = normalizedPath.replace(/^\//, "");
2316
+ const patternForMatch = pattern.startsWith("/") ? pattern.slice(1) : pattern;
2317
+ return minimatch(pathForMatch, patternForMatch, { dot: true });
2318
+ });
2319
+ }
2320
+ function extractPathAndQuery(url) {
2321
+ try {
2322
+ const u = new URL(url);
2323
+ return u.pathname + (u.search || "");
2324
+ } catch {
2325
+ return url;
2203
2326
  }
2204
- /**
2205
- * Size limit check to ensure chunks remain within embedding model constraints.
2206
- * Essential for maintaining consistent embedding quality and avoiding truncation.
2207
- */
2208
- wouldExceedMaxSize(currentChunk, nextChunk) {
2209
- if (!currentChunk) {
2210
- return false;
2327
+ }
2328
+ function shouldIncludeUrl(url, includePatterns, excludePatterns) {
2329
+ const path2 = extractPathAndQuery(url);
2330
+ const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
2331
+ let basename;
2332
+ if (url.startsWith("file://")) {
2333
+ try {
2334
+ const u = new URL(url);
2335
+ basename = u.pathname ? u.pathname.split("/").pop() : void 0;
2336
+ } catch {
2211
2337
  }
2212
- return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
2213
2338
  }
2214
- /**
2215
- * Checks if one path is a prefix of another path, indicating a parent-child relationship
2216
- */
2217
- isPathIncluded(parentPath, childPath) {
2218
- if (parentPath.length >= childPath.length) return false;
2219
- return parentPath.every((part, i) => part === childPath[i]);
2339
+ const stripSlash = (patterns) => patterns?.map((p) => p.startsWith("/") ? p.slice(1) : p);
2340
+ const effectiveExcludePatterns = getEffectiveExclusionPatterns(excludePatterns);
2341
+ if (matchesAnyPattern(url, effectiveExcludePatterns) || matchesAnyPattern(normalizedPath, effectiveExcludePatterns) || basename && matchesAnyPattern(basename, stripSlash(effectiveExcludePatterns)))
2342
+ return false;
2343
+ if (!includePatterns || includePatterns.length === 0) return true;
2344
+ return matchesAnyPattern(url, includePatterns) || matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
2345
+ }
2346
+ function computeBaseDirectory(pathname) {
2347
+ if (pathname === "") return "/";
2348
+ if (pathname.endsWith("/")) return pathname;
2349
+ const lastSegment = pathname.split("/").at(-1) || "";
2350
+ const looksLikeFile = lastSegment.includes(".");
2351
+ if (looksLikeFile) {
2352
+ return pathname.replace(/\/[^/]*$/, "/");
2220
2353
  }
2221
- /**
2222
- * Merges section metadata when concatenating chunks, following these rules:
2354
+ return `${pathname}/`;
2355
+ }
2356
+ function isInScope(baseUrl, targetUrl, scope) {
2357
+ if (baseUrl.protocol !== targetUrl.protocol) return false;
2358
+ switch (scope) {
2359
+ case "subpages": {
2360
+ if (baseUrl.hostname !== targetUrl.hostname) return false;
2361
+ const baseDir = computeBaseDirectory(baseUrl.pathname);
2362
+ return targetUrl.pathname.startsWith(baseDir);
2363
+ }
2364
+ case "hostname":
2365
+ return baseUrl.hostname === targetUrl.hostname;
2366
+ case "domain": {
2367
+ return extractPrimaryDomain(baseUrl.hostname) === extractPrimaryDomain(targetUrl.hostname);
2368
+ }
2369
+ default:
2370
+ return false;
2371
+ }
2372
+ }
2373
+ const DEFAULT_MAX_DEPTH = 3;
2374
+ const DEFAULT_CONCURRENCY = 3;
2375
+ class BaseScraperStrategy {
2376
+ /**
2377
+ * Set of normalized URLs that have been marked for processing.
2378
+ *
2379
+ * IMPORTANT: URLs are added to this set BEFORE they are actually processed, not after.
2380
+ * This prevents the same URL from being queued multiple times when discovered from different sources.
2381
+ *
2382
+ * Usage flow:
2383
+ * 1. Initial queue setup: Root URL and initialQueue items are added to visited
2384
+ * 2. During processing: When a page returns links, each link is checked against visited
2385
+ * 3. In processBatch deduplication: Only links NOT in visited are added to the queue AND to visited
2386
+ *
2387
+ * This approach ensures:
2388
+ * - No URL is processed more than once
2389
+ * - No URL appears in the queue multiple times
2390
+ * - Efficient deduplication across concurrent processing
2391
+ */
2392
+ visited = /* @__PURE__ */ new Set();
2393
+ pageCount = 0;
2394
+ totalDiscovered = 0;
2395
+ // Track total URLs discovered (unlimited)
2396
+ effectiveTotal = 0;
2397
+ // Track effective total (limited by maxPages)
2398
+ canonicalBaseUrl;
2399
+ options;
2400
+ constructor(options = {}) {
2401
+ this.options = options;
2402
+ }
2403
+ /**
2404
+ * Determines if a URL should be processed based on scope and include/exclude patterns in ScraperOptions.
2405
+ * Scope is checked first, then patterns.
2406
+ */
2407
+ shouldProcessUrl(url, options) {
2408
+ if (options.scope) {
2409
+ try {
2410
+ const base = this.canonicalBaseUrl ?? new URL$1(options.url);
2411
+ const target = new URL$1(url);
2412
+ if (!isInScope(base, target, options.scope)) return false;
2413
+ } catch {
2414
+ return false;
2415
+ }
2416
+ }
2417
+ return shouldIncludeUrl(url, options.includePatterns, options.excludePatterns);
2418
+ }
2419
+ async processBatch(batch, baseUrl, options, progressCallback, signal) {
2420
+ const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
2421
+ const results = await Promise.all(
2422
+ batch.map(async (item) => {
2423
+ if (signal?.aborted) {
2424
+ throw new CancellationError("Scraping cancelled during batch processing");
2425
+ }
2426
+ const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
2427
+ if (item.depth > maxDepth) {
2428
+ return [];
2429
+ }
2430
+ try {
2431
+ const result = await this.processItem(item, options, signal);
2432
+ const shouldCount = item.pageId !== void 0 || result.content !== void 0;
2433
+ let currentPageCount = this.pageCount;
2434
+ if (shouldCount) {
2435
+ currentPageCount = ++this.pageCount;
2436
+ logger.info(
2437
+ `🌐 Scraping page ${currentPageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`
2438
+ );
2439
+ }
2440
+ if (result.status === FetchStatus.NOT_MODIFIED) {
2441
+ logger.debug(`Page unchanged (304): ${item.url}`);
2442
+ if (shouldCount) {
2443
+ await progressCallback({
2444
+ pagesScraped: currentPageCount,
2445
+ totalPages: this.effectiveTotal,
2446
+ totalDiscovered: this.totalDiscovered,
2447
+ currentUrl: item.url,
2448
+ depth: item.depth,
2449
+ maxDepth,
2450
+ result: null,
2451
+ pageId: item.pageId
2452
+ });
2453
+ }
2454
+ return [];
2455
+ }
2456
+ if (result.status === FetchStatus.NOT_FOUND) {
2457
+ logger.debug(`Page deleted (404): ${item.url}`);
2458
+ if (shouldCount) {
2459
+ await progressCallback({
2460
+ pagesScraped: currentPageCount,
2461
+ totalPages: this.effectiveTotal,
2462
+ totalDiscovered: this.totalDiscovered,
2463
+ currentUrl: item.url,
2464
+ depth: item.depth,
2465
+ maxDepth,
2466
+ result: null,
2467
+ pageId: item.pageId,
2468
+ deleted: true
2469
+ });
2470
+ }
2471
+ return [];
2472
+ }
2473
+ if (result.status !== FetchStatus.SUCCESS) {
2474
+ logger.error(`Unknown fetch status: ${result.status}`);
2475
+ return [];
2476
+ }
2477
+ const finalUrl = result.url || item.url;
2478
+ if (result.content) {
2479
+ await progressCallback({
2480
+ pagesScraped: currentPageCount,
2481
+ totalPages: this.effectiveTotal,
2482
+ totalDiscovered: this.totalDiscovered,
2483
+ currentUrl: finalUrl,
2484
+ depth: item.depth,
2485
+ maxDepth,
2486
+ result: {
2487
+ url: finalUrl,
2488
+ title: result.content.title?.trim() || result.title?.trim() || "",
2489
+ contentType: result.contentType || "",
2490
+ textContent: result.content.textContent || "",
2491
+ links: result.content.links || [],
2492
+ errors: result.content.errors || [],
2493
+ chunks: result.content.chunks || [],
2494
+ etag: result.etag || null,
2495
+ lastModified: result.lastModified || null
2496
+ },
2497
+ pageId: item.pageId
2498
+ });
2499
+ }
2500
+ const nextItems = result.links || [];
2501
+ const linkBaseUrl = finalUrl ? new URL$1(finalUrl) : baseUrl;
2502
+ return nextItems.map((value) => {
2503
+ try {
2504
+ const targetUrl = new URL$1(value, linkBaseUrl);
2505
+ if (!this.shouldProcessUrl(targetUrl.href, options)) {
2506
+ return null;
2507
+ }
2508
+ return {
2509
+ url: targetUrl.href,
2510
+ depth: item.depth + 1
2511
+ };
2512
+ } catch (_error) {
2513
+ logger.warn(`❌ Invalid URL: ${value}`);
2514
+ }
2515
+ return null;
2516
+ }).filter((item2) => item2 !== null);
2517
+ } catch (error) {
2518
+ if (options.ignoreErrors) {
2519
+ logger.error(`❌ Failed to process ${item.url}: ${error}`);
2520
+ return [];
2521
+ }
2522
+ throw error;
2523
+ }
2524
+ })
2525
+ );
2526
+ const allLinks = results.flat();
2527
+ const uniqueLinks = [];
2528
+ for (const item of allLinks) {
2529
+ const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
2530
+ if (!this.visited.has(normalizedUrl)) {
2531
+ this.visited.add(normalizedUrl);
2532
+ uniqueLinks.push(item);
2533
+ this.totalDiscovered++;
2534
+ if (this.effectiveTotal < maxPages) {
2535
+ this.effectiveTotal++;
2536
+ }
2537
+ }
2538
+ }
2539
+ return uniqueLinks;
2540
+ }
2541
+ async scrape(options, progressCallback, signal) {
2542
+ this.visited.clear();
2543
+ this.pageCount = 0;
2544
+ const initialQueue = options.initialQueue || [];
2545
+ const isRefreshMode = initialQueue.length > 0;
2546
+ this.canonicalBaseUrl = new URL$1(options.url);
2547
+ let baseUrl = this.canonicalBaseUrl;
2548
+ const queue = [];
2549
+ const normalizedRootUrl = normalizeUrl(
2550
+ options.url,
2551
+ this.options.urlNormalizerOptions
2552
+ );
2553
+ if (isRefreshMode) {
2554
+ logger.debug(
2555
+ `Starting refresh mode with ${initialQueue.length} pre-populated pages`
2556
+ );
2557
+ for (const item of initialQueue) {
2558
+ const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
2559
+ if (!this.visited.has(normalizedUrl)) {
2560
+ this.visited.add(normalizedUrl);
2561
+ queue.push(item);
2562
+ }
2563
+ }
2564
+ }
2565
+ if (!this.visited.has(normalizedRootUrl)) {
2566
+ this.visited.add(normalizedRootUrl);
2567
+ queue.unshift({ url: options.url, depth: 0 });
2568
+ }
2569
+ this.totalDiscovered = queue.length;
2570
+ this.effectiveTotal = queue.length;
2571
+ const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
2572
+ const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
2573
+ while (queue.length > 0 && this.pageCount < maxPages) {
2574
+ if (signal?.aborted) {
2575
+ logger.debug(`${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal.`);
2576
+ throw new CancellationError(
2577
+ `${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal`
2578
+ );
2579
+ }
2580
+ const remainingPages = maxPages - this.pageCount;
2581
+ if (remainingPages <= 0) {
2582
+ break;
2583
+ }
2584
+ const batchSize = Math.min(maxConcurrency, remainingPages, queue.length);
2585
+ const batch = queue.splice(0, batchSize);
2586
+ baseUrl = this.canonicalBaseUrl ?? baseUrl;
2587
+ const newUrls = await this.processBatch(
2588
+ batch,
2589
+ baseUrl,
2590
+ options,
2591
+ progressCallback,
2592
+ signal
2593
+ );
2594
+ queue.push(...newUrls);
2595
+ }
2596
+ }
2597
+ /**
2598
+ * Cleanup resources used by this strategy.
2599
+ * Default implementation does nothing - override in derived classes as needed.
2600
+ */
2601
+ async cleanup() {
2602
+ }
2603
+ }
2604
+ class SplitterError extends Error {
2605
+ }
2606
+ class MinimumChunkSizeError extends SplitterError {
2607
+ constructor(size, maxSize) {
2608
+ super(
2609
+ `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
2610
+ );
2611
+ }
2612
+ }
2613
+ class ContentSplitterError extends SplitterError {
2614
+ }
2615
+ class GreedySplitter {
2616
+ baseSplitter;
2617
+ minChunkSize;
2618
+ preferredChunkSize;
2619
+ maxChunkSize;
2620
+ /**
2621
+ * Combines a base document splitter with size constraints to produce optimally-sized chunks.
2622
+ * The base splitter handles the initial semantic splitting, while this class handles
2623
+ * the concatenation strategy.
2624
+ */
2625
+ constructor(baseSplitter, minChunkSize, preferredChunkSize, maxChunkSize) {
2626
+ this.baseSplitter = baseSplitter;
2627
+ this.minChunkSize = minChunkSize;
2628
+ this.preferredChunkSize = preferredChunkSize;
2629
+ this.maxChunkSize = maxChunkSize;
2630
+ }
2631
+ /**
2632
+ * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
2633
+ * are combined until they reach the minimum size, but splits are preserved at major
2634
+ * section boundaries to maintain document structure. This balances the need for
2635
+ * context with semantic coherence.
2636
+ */
2637
+ async splitText(markdown, contentType) {
2638
+ const initialChunks = await this.baseSplitter.splitText(markdown, contentType);
2639
+ const concatenatedChunks = [];
2640
+ let currentChunk = null;
2641
+ for (const nextChunk of initialChunks) {
2642
+ if (nextChunk.content.length > this.maxChunkSize) {
2643
+ logger.warn(
2644
+ `⚠ Chunk from base splitter exceeds max size: ${nextChunk.content.length} > ${this.maxChunkSize}`
2645
+ );
2646
+ }
2647
+ if (currentChunk) {
2648
+ const combinedSize = currentChunk.content.length + nextChunk.content.length;
2649
+ if (combinedSize > this.maxChunkSize) {
2650
+ concatenatedChunks.push(currentChunk);
2651
+ currentChunk = this.cloneChunk(nextChunk);
2652
+ continue;
2653
+ }
2654
+ if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk) && !this.isSameSection(currentChunk, nextChunk)) {
2655
+ concatenatedChunks.push(currentChunk);
2656
+ currentChunk = this.cloneChunk(nextChunk);
2657
+ continue;
2658
+ }
2659
+ if (combinedSize > this.preferredChunkSize && currentChunk.content.length >= this.minChunkSize && nextChunk.content.length >= this.minChunkSize) {
2660
+ concatenatedChunks.push(currentChunk);
2661
+ currentChunk = this.cloneChunk(nextChunk);
2662
+ continue;
2663
+ }
2664
+ currentChunk.content += `${currentChunk.content.endsWith("\n") ? "" : "\n"}${nextChunk.content}`;
2665
+ currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
2666
+ currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
2667
+ } else {
2668
+ currentChunk = this.cloneChunk(nextChunk);
2669
+ }
2670
+ }
2671
+ if (currentChunk) {
2672
+ concatenatedChunks.push(currentChunk);
2673
+ }
2674
+ return concatenatedChunks;
2675
+ }
2676
+ cloneChunk(chunk) {
2677
+ return {
2678
+ types: [...chunk.types],
2679
+ content: chunk.content,
2680
+ section: {
2681
+ level: chunk.section.level,
2682
+ path: [...chunk.section.path]
2683
+ }
2684
+ };
2685
+ }
2686
+ /**
2687
+ * H1 and H2 headings represent major conceptual breaks in the document.
2688
+ * Preserving these splits helps maintain the document's logical structure.
2689
+ */
2690
+ startsNewMajorSection(chunk) {
2691
+ return chunk.section.level === 1 || chunk.section.level === 2;
2692
+ }
2693
+ /**
2694
+ * Checks if two chunks belong to the same section by comparing their paths.
2695
+ * Returns true if the paths are identical or if one is a parent of the other.
2696
+ */
2697
+ isSameSection(chunk1, chunk2) {
2698
+ const path1 = chunk1.section.path;
2699
+ const path2 = chunk2.section.path;
2700
+ if (path1.length === path2.length && path1.every((part, i) => part === path2[i])) {
2701
+ return true;
2702
+ }
2703
+ return this.isPathIncluded(path1, path2) || this.isPathIncluded(path2, path1);
2704
+ }
2705
+ /**
2706
+ * Checks if one path is a prefix of another path, indicating a parent-child relationship
2707
+ */
2708
+ isPathIncluded(parentPath, childPath) {
2709
+ if (parentPath.length >= childPath.length) return false;
2710
+ return parentPath.every((part, i) => part === childPath[i]);
2711
+ }
2712
+ /**
2713
+ * Merges section metadata when concatenating chunks, following these rules:
2223
2714
  * 1. Level: Always uses the lowest (most general) level between chunks
2224
2715
  * 2. Path selection:
2225
2716
  * - For parent-child relationships (one path includes the other), uses the child's path
@@ -4195,7 +4686,7 @@ class HtmlMetadataExtractorMiddleware {
4195
4686
  }
4196
4687
  title = title || "Untitled";
4197
4688
  title = title.replace(/\s+/g, " ").trim();
4198
- context.metadata.title = title;
4689
+ context.title = title;
4199
4690
  logger.debug(`Extracted title: "${title}" from ${context.source}`);
4200
4691
  } catch (error) {
4201
4692
  logger.error(`❌ Error extracting metadata from ${context.source}: ${error}`);
@@ -4653,7 +5144,7 @@ ${frame.content}
4653
5144
  * @param next The next middleware function in the pipeline.
4654
5145
  */
4655
5146
  async process(context, next) {
4656
- const contentType = context.options?.headers?.["content-type"] || context.metadata?.contentType || context.metadata?.mimeType;
5147
+ const contentType = context.options?.headers?.["content-type"] || context.contentType;
4657
5148
  if (contentType && typeof contentType === "string" && !MimeTypeUtils.isHtml(contentType)) {
4658
5149
  logger.debug(
4659
5150
  `Skipping Playwright rendering for ${context.source} - content type '${contentType}' is not HTML`
@@ -5014,6 +5505,7 @@ ${text.replace(/^\n+|\n+$/g, "")}
5014
5505
  context.content = markdown;
5015
5506
  logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
5016
5507
  }
5508
+ context.contentType = "text/markdown";
5017
5509
  } catch (error) {
5018
5510
  logger.error(
5019
5511
  `❌ Error converting HTML to Markdown for ${context.source}: ${error}`
@@ -5053,7 +5545,7 @@ class MarkdownMetadataExtractorMiddleware {
5053
5545
  if (match?.[1]) {
5054
5546
  title = match[1].trim();
5055
5547
  }
5056
- context.metadata.title = title;
5548
+ context.title = title;
5057
5549
  } catch (error) {
5058
5550
  context.errors.push(
5059
5551
  new Error(
@@ -5225,10 +5717,10 @@ function convertToString(content, charset) {
5225
5717
  }
5226
5718
  class BasePipeline {
5227
5719
  /**
5228
- * Determines if this pipeline can process the given content.
5720
+ * Determines if this pipeline can process content with the given MIME type.
5229
5721
  * Must be implemented by derived classes.
5230
5722
  */
5231
- canProcess(_rawContent) {
5723
+ canProcess(_mimeType, _content) {
5232
5724
  throw new Error("Method not implemented.");
5233
5725
  }
5234
5726
  /**
@@ -5289,11 +5781,12 @@ class HtmlPipeline extends BasePipeline {
5289
5781
  this.greedySplitter = new GreedySplitter(
5290
5782
  semanticSplitter,
5291
5783
  SPLITTER_MIN_CHUNK_SIZE,
5292
- preferredChunkSize
5784
+ preferredChunkSize,
5785
+ maxChunkSize
5293
5786
  );
5294
5787
  }
5295
- canProcess(rawContent) {
5296
- return MimeTypeUtils.isHtml(rawContent.mimeType);
5788
+ canProcess(mimeType) {
5789
+ return MimeTypeUtils.isHtml(mimeType);
5297
5790
  }
5298
5791
  async process(rawContent, options, fetcher) {
5299
5792
  const resolvedCharset = resolveCharset(
@@ -5304,8 +5797,9 @@ class HtmlPipeline extends BasePipeline {
5304
5797
  const contentString = convertToString(rawContent.content, resolvedCharset);
5305
5798
  const context = {
5306
5799
  content: contentString,
5800
+ contentType: rawContent.mimeType || "text/html",
5307
5801
  source: rawContent.source,
5308
- metadata: {},
5802
+ // metadata: {},
5309
5803
  links: [],
5310
5804
  errors: [],
5311
5805
  options,
@@ -5320,8 +5814,9 @@ class HtmlPipeline extends BasePipeline {
5320
5814
  typeof context.content === "string" ? context.content : ""
5321
5815
  );
5322
5816
  return {
5323
- textContent: typeof context.content === "string" ? context.content : "",
5324
- metadata: context.metadata,
5817
+ title: context.title,
5818
+ contentType: context.contentType,
5819
+ textContent: context.content,
5325
5820
  links: context.links,
5326
5821
  errors: context.errors,
5327
5822
  chunks
@@ -5345,9 +5840,9 @@ class JsonPipeline extends BasePipeline {
5345
5840
  preserveFormatting: true
5346
5841
  });
5347
5842
  }
5348
- canProcess(rawContent) {
5349
- if (!rawContent.mimeType) return false;
5350
- return MimeTypeUtils.isJson(rawContent.mimeType);
5843
+ canProcess(mimeType) {
5844
+ if (!mimeType) return false;
5845
+ return MimeTypeUtils.isJson(mimeType);
5351
5846
  }
5352
5847
  async process(rawContent, options, fetcher) {
5353
5848
  const contentString = convertToString(rawContent.content, rawContent.charset);
@@ -5362,22 +5857,25 @@ class JsonPipeline extends BasePipeline {
5362
5857
  const fallbackChunks = await this.splitter.splitText(contentString);
5363
5858
  return {
5364
5859
  textContent: contentString,
5365
- metadata: {
5366
- isValidJson: false
5367
- },
5860
+ // metadata: {
5861
+ // isValidJson: false,
5862
+ // },
5368
5863
  links: [],
5369
5864
  errors: [],
5370
5865
  chunks: fallbackChunks
5371
5866
  };
5372
5867
  }
5868
+ const metadata = this.extractMetadata(parsedJson);
5373
5869
  const context = {
5374
5870
  content: contentString,
5375
5871
  source: rawContent.source,
5376
- metadata: {
5377
- ...this.extractMetadata(parsedJson),
5378
- isValidJson,
5379
- jsonStructure: this.analyzeJsonStructure(parsedJson)
5380
- },
5872
+ title: metadata.title,
5873
+ contentType: rawContent.mimeType || "application/json",
5874
+ // metadata: {
5875
+ // ...this.extractMetadata(parsedJson),
5876
+ // isValidJson,
5877
+ // jsonStructure: this.analyzeJsonStructure(parsedJson),
5878
+ // },
5381
5879
  links: [],
5382
5880
  // JSON files typically don't contain links
5383
5881
  errors: [],
@@ -5387,8 +5885,9 @@ class JsonPipeline extends BasePipeline {
5387
5885
  await this.executeMiddlewareStack(this.middleware, context);
5388
5886
  const chunks = await this.splitter.splitText(context.content);
5389
5887
  return {
5888
+ title: context.title,
5889
+ contentType: context.contentType,
5390
5890
  textContent: context.content,
5391
- metadata: context.metadata,
5392
5891
  links: context.links,
5393
5892
  errors: context.errors,
5394
5893
  chunks
@@ -5418,30 +5917,6 @@ class JsonPipeline extends BasePipeline {
5418
5917
  }
5419
5918
  return metadata;
5420
5919
  }
5421
- /**
5422
- * Analyzes the structure of valid JSON for metadata
5423
- */
5424
- analyzeJsonStructure(parsedJson) {
5425
- if (Array.isArray(parsedJson)) {
5426
- return {
5427
- type: "array",
5428
- depth: this.calculateDepth(parsedJson),
5429
- itemCount: parsedJson.length
5430
- };
5431
- } else if (typeof parsedJson === "object" && parsedJson !== null) {
5432
- const obj = parsedJson;
5433
- return {
5434
- type: "object",
5435
- depth: this.calculateDepth(parsedJson),
5436
- propertyCount: Object.keys(obj).length
5437
- };
5438
- } else {
5439
- return {
5440
- type: typeof parsedJson,
5441
- depth: 1
5442
- };
5443
- }
5444
- }
5445
5920
  /**
5446
5921
  * Calculates the maximum nesting depth of a JSON structure
5447
5922
  */
@@ -5482,19 +5957,20 @@ class MarkdownPipeline extends BasePipeline {
5482
5957
  this.greedySplitter = new GreedySplitter(
5483
5958
  semanticSplitter,
5484
5959
  SPLITTER_MIN_CHUNK_SIZE,
5485
- preferredChunkSize
5960
+ preferredChunkSize,
5961
+ maxChunkSize
5486
5962
  );
5487
5963
  }
5488
- canProcess(rawContent) {
5489
- if (!rawContent.mimeType) return false;
5490
- return MimeTypeUtils.isMarkdown(rawContent.mimeType);
5964
+ canProcess(mimeType) {
5965
+ if (!mimeType) return false;
5966
+ return MimeTypeUtils.isMarkdown(mimeType);
5491
5967
  }
5492
5968
  async process(rawContent, options, fetcher) {
5493
5969
  const contentString = convertToString(rawContent.content, rawContent.charset);
5494
5970
  const context = {
5971
+ contentType: rawContent.mimeType || "text/markdown",
5495
5972
  content: contentString,
5496
5973
  source: rawContent.source,
5497
- metadata: {},
5498
5974
  links: [],
5499
5975
  errors: [],
5500
5976
  options,
@@ -5506,8 +5982,9 @@ class MarkdownPipeline extends BasePipeline {
5506
5982
  rawContent.mimeType
5507
5983
  );
5508
5984
  return {
5985
+ title: context.title,
5986
+ contentType: context.contentType,
5509
5987
  textContent: typeof context.content === "string" ? context.content : "",
5510
- metadata: context.metadata,
5511
5988
  links: context.links,
5512
5989
  errors: context.errors,
5513
5990
  chunks
@@ -5517,24 +5994,27 @@ class MarkdownPipeline extends BasePipeline {
5517
5994
  class SourceCodePipeline extends BasePipeline {
5518
5995
  middleware;
5519
5996
  splitter;
5520
- constructor(chunkSize = SPLITTER_PREFERRED_CHUNK_SIZE) {
5997
+ constructor(_preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE) {
5521
5998
  super();
5522
5999
  this.middleware = [];
5523
- this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize: chunkSize });
6000
+ this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize });
5524
6001
  }
5525
- canProcess(rawContent) {
5526
- if (!rawContent.mimeType) return false;
5527
- return MimeTypeUtils.isSourceCode(rawContent.mimeType);
6002
+ canProcess(mimeType) {
6003
+ if (!mimeType) return false;
6004
+ return MimeTypeUtils.isSourceCode(mimeType);
5528
6005
  }
5529
6006
  async process(rawContent, options, fetcher) {
5530
6007
  const contentString = convertToString(rawContent.content, rawContent.charset);
5531
6008
  const context = {
6009
+ contentType: rawContent.mimeType || "text/plain",
5532
6010
  content: contentString,
5533
6011
  source: rawContent.source,
5534
- metadata: {
5535
- language: rawContent.mimeType ? MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType) : "text",
5536
- isSourceCode: true
5537
- },
6012
+ // metadata: {
6013
+ // language: rawContent.mimeType
6014
+ // ? MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType)
6015
+ // : "text",
6016
+ // isSourceCode: true,
6017
+ // },
5538
6018
  links: [],
5539
6019
  // Source code files typically don't contain web links
5540
6020
  errors: [],
@@ -5544,8 +6024,10 @@ class SourceCodePipeline extends BasePipeline {
5544
6024
  await this.executeMiddlewareStack(this.middleware, context);
5545
6025
  const chunks = await this.splitter.splitText(context.content, rawContent.mimeType);
5546
6026
  return {
6027
+ title: context.title,
6028
+ contentType: context.contentType,
5547
6029
  textContent: context.content,
5548
- metadata: context.metadata,
6030
+ // metadata: context.metadata,
5549
6031
  links: context.links,
5550
6032
  errors: context.errors,
5551
6033
  chunks
@@ -5594,17 +6076,22 @@ class TextDocumentSplitter {
5594
6076
  class TextPipeline extends BasePipeline {
5595
6077
  middleware;
5596
6078
  splitter;
5597
- constructor(chunkSize = SPLITTER_PREFERRED_CHUNK_SIZE) {
6079
+ constructor(preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE) {
5598
6080
  super();
5599
6081
  this.middleware = [];
5600
- const textSplitter = new TextDocumentSplitter({ maxChunkSize: chunkSize });
5601
- this.splitter = new GreedySplitter(textSplitter, SPLITTER_MIN_CHUNK_SIZE, chunkSize);
6082
+ const textSplitter = new TextDocumentSplitter({ maxChunkSize });
6083
+ this.splitter = new GreedySplitter(
6084
+ textSplitter,
6085
+ SPLITTER_MIN_CHUNK_SIZE,
6086
+ preferredChunkSize,
6087
+ maxChunkSize
6088
+ );
5602
6089
  }
5603
- canProcess(rawContent) {
5604
- if (!MimeTypeUtils.isSafeForTextProcessing(rawContent.mimeType)) {
6090
+ canProcess(mimeType, content) {
6091
+ if (!MimeTypeUtils.isSafeForTextProcessing(mimeType)) {
5605
6092
  return false;
5606
6093
  }
5607
- if (MimeTypeUtils.isBinary(rawContent.content)) {
6094
+ if (content && MimeTypeUtils.isBinary(content)) {
5608
6095
  return false;
5609
6096
  }
5610
6097
  return true;
@@ -5612,12 +6099,11 @@ class TextPipeline extends BasePipeline {
5612
6099
  async process(rawContent, options, fetcher) {
5613
6100
  const contentString = convertToString(rawContent.content, rawContent.charset);
5614
6101
  const context = {
6102
+ title: "",
6103
+ // Title extraction can be added in middleware if needed
6104
+ contentType: rawContent.mimeType || "text/plain",
5615
6105
  content: contentString,
5616
6106
  source: rawContent.source,
5617
- metadata: {
5618
- contentType: rawContent.mimeType || "text/plain",
5619
- isGenericText: true
5620
- },
5621
6107
  links: [],
5622
6108
  // Generic text content typically doesn't contain structured links
5623
6109
  errors: [],
@@ -5627,394 +6113,283 @@ class TextPipeline extends BasePipeline {
5627
6113
  await this.executeMiddlewareStack(this.middleware, context);
5628
6114
  const chunks = await this.splitter.splitText(context.content, rawContent.mimeType);
5629
6115
  return {
6116
+ title: context.title,
6117
+ contentType: context.contentType,
5630
6118
  textContent: context.content,
5631
- metadata: context.metadata,
5632
6119
  links: context.links,
5633
6120
  errors: context.errors,
5634
- chunks
5635
- };
5636
- }
5637
- }
5638
- let PipelineFactory$1 = class PipelineFactory {
5639
- /**
5640
- * Creates the standard set of content pipelines used by all scraper strategies.
5641
- * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
5642
- * Each pipeline now handles both preprocessing and content-specific splitting.
5643
- * TextPipeline is placed last as the universal fallback for unknown content types.
5644
- *
5645
- * @param config - Optional configuration for pipeline chunk sizes
5646
- * @returns Array of content pipelines in processing order
5647
- */
5648
- static createStandardPipelines(config) {
5649
- const preferredChunkSize = config?.chunkSizes?.preferred ?? SPLITTER_PREFERRED_CHUNK_SIZE;
5650
- const maxChunkSize = config?.chunkSizes?.max ?? 2e3;
5651
- return [
5652
- new JsonPipeline(preferredChunkSize),
5653
- new SourceCodePipeline(preferredChunkSize),
5654
- new HtmlPipeline(preferredChunkSize, maxChunkSize),
5655
- new MarkdownPipeline(preferredChunkSize, maxChunkSize),
5656
- new TextPipeline(preferredChunkSize)
5657
- // Universal fallback - must be last
5658
- ];
5659
- }
5660
- };
5661
- const DEFAULT_FILE_EXCLUSIONS = [
5662
- // CHANGELOG files (case variations)
5663
- "**/CHANGELOG.md",
5664
- "**/changelog.md",
5665
- "**/CHANGELOG.mdx",
5666
- "**/changelog.mdx",
5667
- // LICENSE files (case variations)
5668
- "**/LICENSE",
5669
- "**/LICENSE.md",
5670
- "**/license.md",
5671
- // CODE_OF_CONDUCT files (case variations)
5672
- "**/CODE_OF_CONDUCT.md",
5673
- "**/code_of_conduct.md",
5674
- // Test files
5675
- "**/*.test.*",
5676
- "**/*.spec.*",
5677
- "**/*_test.py",
5678
- "**/*_test.go",
5679
- // Package manager lock files
5680
- "**/*.lock",
5681
- "**/package-lock.json",
5682
- "**/yarn.lock",
5683
- "**/pnpm-lock.yaml",
5684
- "**/go.sum",
5685
- // Build artifacts
5686
- "**/*.min.js",
5687
- "**/*.min.css",
5688
- "**/*.map",
5689
- "**/*.d.ts",
5690
- // IDE/System files
5691
- "**/.DS_Store",
5692
- "**/Thumbs.db",
5693
- "**/*.swp",
5694
- "**/*.swo",
5695
- // Internal config files (using regex pattern)
5696
- "/.*\\.(ini|cfg|conf|log|pid)$/"
5697
- ];
5698
- const DEFAULT_FOLDER_EXCLUSIONS = [
5699
- // Archive and deprecated content (matches anywhere in path)
5700
- "**/archive/**",
5701
- "**/archived/**",
5702
- "**/deprecated/**",
5703
- "**/legacy/**",
5704
- "**/old/**",
5705
- "**/outdated/**",
5706
- "**/previous/**",
5707
- "**/superseded/**",
5708
- // Specific paths that don't follow the general pattern
5709
- "docs/old/**",
5710
- // Test directories
5711
- "**/test/**",
5712
- "**/tests/**",
5713
- "**/__tests__/**",
5714
- "**/spec/**",
5715
- // Build output directories
5716
- "**/dist/**",
5717
- "**/build/**",
5718
- "**/out/**",
5719
- "**/target/**",
5720
- "**/.next/**",
5721
- "**/.nuxt/**",
5722
- // IDE directories
5723
- "**/.vscode/**",
5724
- "**/.idea/**",
5725
- // Internationalization folders - non-English locales
5726
- "**/i18n/ar*/**",
5727
- "**/i18n/de*/**",
5728
- "**/i18n/es*/**",
5729
- "**/i18n/fr*/**",
5730
- "**/i18n/hi*/**",
5731
- "**/i18n/it*/**",
5732
- "**/i18n/ja*/**",
5733
- "**/i18n/ko*/**",
5734
- "**/i18n/nl*/**",
5735
- "**/i18n/pl*/**",
5736
- "**/i18n/pt*/**",
5737
- "**/i18n/ru*/**",
5738
- "**/i18n/sv*/**",
5739
- "**/i18n/th*/**",
5740
- "**/i18n/tr*/**",
5741
- "**/i18n/vi*/**",
5742
- "**/i18n/zh*/**",
5743
- // Common locale folder patterns
5744
- "**/zh-cn/**",
5745
- "**/zh-hk/**",
5746
- "**/zh-mo/**",
5747
- "**/zh-sg/**",
5748
- "**/zh-tw/**"
5749
- ];
5750
- const DEFAULT_EXCLUSION_PATTERNS = [
5751
- ...DEFAULT_FILE_EXCLUSIONS,
5752
- ...DEFAULT_FOLDER_EXCLUSIONS
5753
- ];
5754
- function getEffectiveExclusionPatterns(userPatterns) {
5755
- if (userPatterns !== void 0) {
5756
- return userPatterns;
6121
+ chunks
6122
+ };
5757
6123
  }
5758
- return DEFAULT_EXCLUSION_PATTERNS;
5759
- }
5760
- function isRegexPattern(pattern) {
5761
- return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
5762
6124
  }
5763
- function patternToRegExp(pattern) {
5764
- if (isRegexPattern(pattern)) {
5765
- return new RegExp(pattern.slice(1, -1));
6125
+ let PipelineFactory$1 = class PipelineFactory {
6126
+ /**
6127
+ * Creates the standard set of content pipelines used by all scraper strategies.
6128
+ * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
6129
+ * Each pipeline now handles both preprocessing and content-specific splitting.
6130
+ * TextPipeline is placed last as the universal fallback for unknown content types.
6131
+ *
6132
+ * @param config - Optional configuration for pipeline chunk sizes
6133
+ * @returns Array of content pipelines in processing order
6134
+ */
6135
+ static createStandardPipelines(config) {
6136
+ const preferredChunkSize = config?.chunkSizes?.preferred ?? SPLITTER_PREFERRED_CHUNK_SIZE;
6137
+ const maxChunkSize = config?.chunkSizes?.max ?? 2e3;
6138
+ return [
6139
+ new JsonPipeline(preferredChunkSize),
6140
+ new SourceCodePipeline(preferredChunkSize, maxChunkSize),
6141
+ new HtmlPipeline(preferredChunkSize, maxChunkSize),
6142
+ new MarkdownPipeline(preferredChunkSize, maxChunkSize),
6143
+ new TextPipeline(preferredChunkSize, maxChunkSize)
6144
+ // Universal fallback - must be last
6145
+ ];
5766
6146
  }
5767
- const re = minimatch.makeRe(pattern, { dot: true });
5768
- if (!re) throw new Error(`Invalid glob pattern: ${pattern}`);
5769
- return re;
5770
- }
5771
- function matchesAnyPattern(path2, patterns) {
5772
- if (!patterns || patterns.length === 0) return false;
5773
- const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
5774
- return patterns.some((pattern) => {
5775
- if (isRegexPattern(pattern)) {
5776
- return patternToRegExp(pattern).test(normalizedPath);
5777
- }
5778
- return minimatch(normalizedPath.replace(/^\//, ""), pattern, { dot: true });
5779
- });
5780
- }
5781
- function extractPathAndQuery(url) {
5782
- try {
5783
- const u = new URL(url);
5784
- return u.pathname + (u.search || "");
5785
- } catch {
5786
- return url;
6147
+ };
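The rewritten PipelineFactory above now passes the maximum chunk size to SourceCodePipeline and TextPipeline as well, and keeps TextPipeline last as the universal fallback. The processors below pick the first pipeline whose canProcess(mimeType, content) returns true. A minimal standalone sketch of that ordering with stub pipelines; the real pipeline classes are internal to this bundle and their acceptance rules are broader than shown here:

    // Stub pipelines standing in for the Json/SourceCode/Html/Markdown/Text pipelines.
    const pipelines = [
      { name: "JsonPipeline", canProcess: (mimeType) => mimeType === "application/json" },
      { name: "MarkdownPipeline", canProcess: (mimeType) => mimeType === "text/markdown" },
      { name: "TextPipeline", canProcess: (mimeType) => mimeType.startsWith("text/") }, // fallback - must be last
    ];

    function selectPipeline(mimeType) {
      return pipelines.find((p) => p.canProcess(mimeType));
    }

    console.log(selectPipeline("text/markdown").name);  // MarkdownPipeline
    console.log(selectPipeline("text/x-python")?.name); // TextPipeline (fallback)
    console.log(selectPipeline("image/png"));           // undefined -> "Unsupported content type" path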
6148
+ class GitHubRepoProcessor {
6149
+ httpFetcher = new HttpFetcher();
6150
+ pipelines;
6151
+ constructor() {
6152
+ this.pipelines = PipelineFactory$1.createStandardPipelines();
5787
6153
  }
5788
- }
5789
- function shouldIncludeUrl(url, includePatterns, excludePatterns) {
5790
- const path2 = extractPathAndQuery(url);
5791
- const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
5792
- let basename;
5793
- if (url.startsWith("file://")) {
5794
- try {
5795
- const u = new URL(url);
5796
- basename = u.pathname ? u.pathname.split("/").pop() : void 0;
5797
- } catch {
6154
+ /**
6155
+ * Parses an HTTPS blob URL to extract repository information.
6156
+ * Format: https://github.com/owner/repo/blob/branch/filepath
6157
+ */
6158
+ parseHttpsBlobUrl(url) {
6159
+ const parsedUrl = new URL(url);
6160
+ const segments = parsedUrl.pathname.split("/").filter(Boolean);
6161
+ if (segments.length < 5 || segments[2] !== "blob") {
6162
+ throw new Error(
6163
+ `Invalid GitHub blob URL format. Expected: https://github.com/owner/repo/blob/branch/filepath. Got: ${url}`
6164
+ );
5798
6165
  }
6166
+ const owner = segments[0];
6167
+ const repo = segments[1];
6168
+ const branch = segments[3];
6169
+ const filePath = segments.slice(4).join("/");
6170
+ return { owner, repo, branch, filePath };
5799
6171
  }
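parseHttpsBlobUrl requires at least five path segments with "blob" in third position; everything after the branch segment becomes the file path. A quick standalone check of that shape (the URL below is only an example):

    // Mirrors the parsing above; the URL and file path are illustrative.
    const url = new URL("https://github.com/arabold/docs-mcp-server/blob/main/docs/guide.md");
    const segments = url.pathname.split("/").filter(Boolean);
    if (segments.length < 5 || segments[2] !== "blob") throw new Error("Not a blob URL");
    const info = {
      owner: segments[0],                    // "arabold"
      repo: segments[1],                     // "docs-mcp-server"
      branch: segments[3],                   // "main"
      filePath: segments.slice(4).join("/"), // "docs/guide.md"
    };
    console.log(info);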
5800
- const stripSlash = (patterns) => patterns?.map((p) => p.startsWith("/") ? p.slice(1) : p);
5801
- const effectiveExcludePatterns = getEffectiveExclusionPatterns(excludePatterns);
5802
- if (matchesAnyPattern(normalizedPath, effectiveExcludePatterns) || basename && matchesAnyPattern(basename, stripSlash(effectiveExcludePatterns)))
5803
- return false;
5804
- if (!includePatterns || includePatterns.length === 0) return true;
5805
- return matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
5806
- }
5807
- function computeBaseDirectory(pathname) {
5808
- if (pathname === "") return "/";
5809
- if (pathname.endsWith("/")) return pathname;
5810
- const lastSegment = pathname.split("/").at(-1) || "";
5811
- const looksLikeFile = lastSegment.includes(".");
5812
- if (looksLikeFile) {
5813
- return pathname.replace(/\/[^/]*$/, "/");
6172
+ /**
6173
+ * Fetches the raw content of a file from GitHub.
6174
+ */
6175
+ async fetchFileContent(repoInfo, filePath, etag, signal) {
6176
+ const { owner, repo, branch } = repoInfo;
6177
+ const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
6178
+ const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag });
6179
+ const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
6180
+ if (detectedMimeType && rawContent.mimeType === "text/plain") {
6181
+ return {
6182
+ ...rawContent,
6183
+ mimeType: detectedMimeType
6184
+ };
6185
+ }
6186
+ return rawContent;
5814
6187
  }
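fetchFileContent downloads via raw.githubusercontent.com and, because GitHub serves most raw files as text/plain, swaps in an extension-derived MIME type when one can be detected; the stored ETag is forwarded so unchanged files come back with a non-success status. A rough standalone equivalent using the Fetch API and the mime package; HttpFetcher and MimeTypeUtils are internal here, so the header handling and status names are stand-ins:

    import mime from "mime";

    async function fetchRawGitHubFile({ owner, repo, branch }, filePath, etag) {
      const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
      const res = await fetch(rawUrl, { headers: etag ? { "If-None-Match": etag } : {} });
      if (res.status === 304) return { status: "not_modified", etag };
      const served = res.headers.get("content-type") || "text/plain";
      const detected = mime.getType(filePath); // e.g. "text/markdown" for *.md
      return {
        status: "success",
        content: Buffer.from(await res.arrayBuffer()),
        mimeType: detected && served.startsWith("text/plain") ? detected : served,
        etag: res.headers.get("etag") ?? undefined,
      };
    }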
5815
- return `${pathname}/`;
5816
- }
5817
- function isInScope(baseUrl, targetUrl, scope) {
5818
- if (baseUrl.protocol !== targetUrl.protocol) return false;
5819
- switch (scope) {
5820
- case "subpages": {
5821
- if (baseUrl.hostname !== targetUrl.hostname) return false;
5822
- const baseDir = computeBaseDirectory(baseUrl.pathname);
5823
- return targetUrl.pathname.startsWith(baseDir);
6188
+ /**
6189
+ * Processes a single GitHub repository file from an HTTPS blob URL.
6190
+ */
6191
+ async process(item, options, signal) {
6192
+ const repoInfo = this.parseHttpsBlobUrl(item.url);
6193
+ const { owner, repo, branch, filePath } = repoInfo;
6194
+ const rawContent = await this.fetchFileContent(
6195
+ { owner, repo, branch },
6196
+ filePath,
6197
+ item.etag,
6198
+ signal
6199
+ );
6200
+ if (rawContent.status !== FetchStatus.SUCCESS) {
6201
+ return { url: item.url, links: [], status: rawContent.status };
5824
6202
  }
5825
- case "hostname":
5826
- return baseUrl.hostname === targetUrl.hostname;
5827
- case "domain": {
5828
- return extractPrimaryDomain(baseUrl.hostname) === extractPrimaryDomain(targetUrl.hostname);
6203
+ let processed;
6204
+ for (const pipeline of this.pipelines) {
6205
+ const contentBuffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
6206
+ if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
6207
+ logger.debug(
6208
+ `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
6209
+ );
6210
+ const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6211
+ processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
6212
+ break;
6213
+ }
5829
6214
  }
5830
- default:
5831
- return false;
6215
+ if (!processed) {
6216
+ logger.warn(
6217
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
6218
+ );
6219
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
6220
+ }
6221
+ for (const err of processed.errors ?? []) {
6222
+ logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
6223
+ }
6224
+ const githubUrl = `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`;
6225
+ const filename = filePath.split("/").pop() || "Untitled";
6226
+ return {
6227
+ url: githubUrl,
6228
+ title: processed.title?.trim() || filename || "Untitled",
6229
+ etag: rawContent.etag,
6230
+ lastModified: rawContent.lastModified,
6231
+ contentType: rawContent.mimeType,
6232
+ content: processed,
6233
+ links: [],
6234
+ // Always return empty links array for individual files
6235
+ status: FetchStatus.SUCCESS
6236
+ };
6237
+ }
6238
+ /**
6239
+ * Cleanup resources used by this processor.
6240
+ */
6241
+ async cleanup() {
6242
+ await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
5832
6243
  }
5833
6244
  }
5834
- const DEFAULT_MAX_DEPTH = 3;
5835
- const DEFAULT_CONCURRENCY = 3;
5836
- class BaseScraperStrategy {
5837
- visited = /* @__PURE__ */ new Set();
5838
- pageCount = 0;
5839
- totalDiscovered = 0;
5840
- // Track total URLs discovered (unlimited)
5841
- effectiveTotal = 0;
5842
- // Track effective total (limited by maxPages)
5843
- canonicalBaseUrl;
5844
- options;
5845
- constructor(options = {}) {
5846
- this.options = options;
6245
+ class GitHubWikiProcessor {
6246
+ httpFetcher = new HttpFetcher();
6247
+ pipelines;
6248
+ constructor() {
6249
+ this.pipelines = PipelineFactory$1.createStandardPipelines();
5847
6250
  }
5848
6251
  /**
5849
- * Determines if a URL should be processed based on scope and include/exclude patterns in ScraperOptions.
5850
- * Scope is checked first, then patterns.
6252
+ * Parses a GitHub wiki URL to extract repository information.
6253
+ */
6254
+ parseGitHubWikiUrl(url) {
6255
+ const parsedUrl = new URL(url);
6256
+ const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/);
6257
+ if (!match) {
6258
+ throw new Error(`Invalid GitHub wiki URL: ${url}`);
6259
+ }
6260
+ const [, owner, repo] = match;
6261
+ return { owner, repo };
6262
+ }
6263
+ /**
6264
+ * Determines if a URL should be processed within the wiki scope.
5851
6265
  */
5852
6266
  shouldProcessUrl(url, options) {
5853
- if (options.scope) {
5854
- try {
5855
- const base = this.canonicalBaseUrl ?? new URL$1(options.url);
5856
- const target = new URL$1(url);
5857
- if (!isInScope(base, target, options.scope)) return false;
5858
- } catch {
6267
+ try {
6268
+ const parsedUrl = new URL(url);
6269
+ const baseWikiInfo = this.parseGitHubWikiUrl(options.url);
6270
+ const expectedWikiPath = `/${baseWikiInfo.owner}/${baseWikiInfo.repo}/wiki`;
6271
+ if (!parsedUrl.pathname.startsWith(expectedWikiPath)) {
5859
6272
  return false;
5860
6273
  }
6274
+ const wikiPagePath = parsedUrl.pathname.replace(expectedWikiPath, "").replace(/^\//, "");
6275
+ return shouldIncludeUrl(
6276
+ wikiPagePath || "Home",
6277
+ options.includePatterns,
6278
+ options.excludePatterns
6279
+ );
6280
+ } catch {
6281
+ return false;
5861
6282
  }
5862
- return shouldIncludeUrl(url, options.includePatterns, options.excludePatterns);
5863
6283
  }
5864
- // Removed getProcessor method as processing is now handled by strategies using middleware pipelines
5865
- async processBatch(batch, baseUrl, options, progressCallback, signal) {
5866
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
5867
- const results = await Promise.all(
5868
- batch.map(async (item) => {
5869
- if (signal?.aborted) {
5870
- throw new CancellationError("Scraping cancelled during batch processing");
5871
- }
5872
- const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
5873
- if (item.depth > maxDepth) {
5874
- return [];
5875
- }
5876
- try {
5877
- const result = await this.processItem(item, options, void 0, signal);
5878
- if (item.depth === 0 && !this.canonicalBaseUrl && result?.finalUrl) {
5879
- try {
5880
- const finalUrlStr = result.finalUrl;
5881
- const original = new URL$1(options.url);
5882
- const finalUrlObj = new URL$1(finalUrlStr);
5883
- if (finalUrlObj.href !== original.href && (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:")) {
5884
- this.canonicalBaseUrl = finalUrlObj;
5885
- logger.debug(
5886
- `Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`
5887
- );
5888
- } else {
5889
- this.canonicalBaseUrl = original;
5890
- }
5891
- } catch {
5892
- this.canonicalBaseUrl = new URL$1(options.url);
5893
- }
5894
- }
5895
- if (result.document) {
5896
- this.pageCount++;
5897
- logger.info(
5898
- `🌐 Scraping page ${this.pageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`
5899
- );
5900
- await progressCallback({
5901
- pagesScraped: this.pageCount,
5902
- totalPages: this.effectiveTotal,
5903
- totalDiscovered: this.totalDiscovered,
5904
- currentUrl: item.url,
5905
- depth: item.depth,
5906
- maxDepth,
5907
- document: result.document
5908
- });
5909
- }
5910
- const nextItems = result.links || [];
5911
- return nextItems.map((value) => {
5912
- try {
5913
- const targetUrl = new URL$1(value, baseUrl);
5914
- if (!this.shouldProcessUrl(targetUrl.href, options)) {
5915
- return null;
5916
- }
5917
- return {
5918
- url: targetUrl.href,
5919
- depth: item.depth + 1
5920
- };
5921
- } catch (_error) {
5922
- logger.warn(`❌ Invalid URL: ${value}`);
5923
- }
5924
- return null;
5925
- }).filter((item2) => item2 !== null);
5926
- } catch (error) {
5927
- if (options.ignoreErrors) {
5928
- logger.error(`❌ Failed to process ${item.url}: ${error}`);
5929
- return [];
5930
- }
5931
- throw error;
5932
- }
5933
- })
5934
- );
5935
- const allLinks = results.flat();
5936
- const uniqueLinks = [];
5937
- for (const item of allLinks) {
5938
- const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
5939
- if (!this.visited.has(normalizedUrl)) {
5940
- this.visited.add(normalizedUrl);
5941
- uniqueLinks.push(item);
5942
- this.totalDiscovered++;
5943
- if (this.effectiveTotal < maxPages) {
5944
- this.effectiveTotal++;
6284
+ /**
6285
+ * Processes a single GitHub wiki page.
6286
+ */
6287
+ async process(item, options, signal) {
6288
+ const currentUrl = item.url;
6289
+ try {
6290
+ const rawContent = await this.httpFetcher.fetch(currentUrl, {
6291
+ signal,
6292
+ etag: item.etag
6293
+ });
6294
+ if (rawContent.status !== FetchStatus.SUCCESS) {
6295
+ return { url: currentUrl, links: [], status: rawContent.status };
6296
+ }
6297
+ let processed;
6298
+ for (const pipeline of this.pipelines) {
6299
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
6300
+ logger.debug(
6301
+ `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`
6302
+ );
6303
+ const wikiOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6304
+ processed = await pipeline.process(rawContent, wikiOptions, this.httpFetcher);
6305
+ break;
5945
6306
  }
5946
6307
  }
5947
- }
5948
- return uniqueLinks;
5949
- }
5950
- async scrape(options, progressCallback, signal) {
5951
- this.visited.clear();
5952
- this.pageCount = 0;
5953
- this.totalDiscovered = 1;
5954
- this.effectiveTotal = 1;
5955
- this.canonicalBaseUrl = new URL$1(options.url);
5956
- let baseUrl = this.canonicalBaseUrl;
5957
- const queue = [{ url: options.url, depth: 0 }];
5958
- this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
5959
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
5960
- const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
5961
- while (queue.length > 0 && this.pageCount < maxPages) {
5962
- if (signal?.aborted) {
5963
- logger.debug("Scraping cancelled by signal.");
5964
- throw new CancellationError("Scraping cancelled by signal");
6308
+ if (!processed) {
6309
+ logger.warn(
6310
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`
6311
+ );
6312
+ return { url: currentUrl, links: [], status: FetchStatus.SUCCESS };
5965
6313
  }
5966
- const remainingPages = maxPages - this.pageCount;
5967
- if (remainingPages <= 0) {
5968
- break;
6314
+ for (const err of processed.errors ?? []) {
6315
+ logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`);
5969
6316
  }
5970
- const batchSize = Math.min(
5971
- maxConcurrency,
5972
- // Use variable
5973
- remainingPages,
5974
- queue.length
5975
- );
5976
- const batch = queue.splice(0, batchSize);
5977
- baseUrl = this.canonicalBaseUrl ?? baseUrl;
5978
- const newUrls = await this.processBatch(
5979
- batch,
5980
- baseUrl,
5981
- options,
5982
- progressCallback,
5983
- signal
5984
- );
5985
- queue.push(...newUrls);
6317
+ const parsedUrl = new URL(currentUrl);
6318
+ const wikiInfo = this.parseGitHubWikiUrl(currentUrl);
6319
+ const wikiPagePath = parsedUrl.pathname.replace(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`, "").replace(/^\//, "");
6320
+ const pageTitle = wikiPagePath || "Home";
6321
+ const links = processed.links || [];
6322
+ const wikiLinks = links.filter((link) => {
6323
+ if (!link || link.trim() === "" || link === "invalid-url" || link === "not-a-url-at-all") {
6324
+ return false;
6325
+ }
6326
+ return true;
6327
+ }).map((link) => {
6328
+ try {
6329
+ return new URL(link, currentUrl).href;
6330
+ } catch {
6331
+ return null;
6332
+ }
6333
+ }).filter((link) => link !== null).filter((link) => {
6334
+ try {
6335
+ const linkUrl = new URL(link);
6336
+ return linkUrl.hostname === parsedUrl.hostname && linkUrl.pathname.startsWith(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`);
6337
+ } catch {
6338
+ return false;
6339
+ }
6340
+ });
6341
+ return {
6342
+ url: currentUrl,
6343
+ title: pageTitle,
6344
+ etag: rawContent.etag,
6345
+ lastModified: rawContent.lastModified,
6346
+ contentType: rawContent.mimeType,
6347
+ content: processed,
6348
+ links: wikiLinks,
6349
+ status: FetchStatus.SUCCESS
6350
+ };
6351
+ } catch (error) {
6352
+ logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`);
6353
+ return { url: currentUrl, links: [], status: FetchStatus.SUCCESS };
5986
6354
  }
5987
6355
  }
5988
6356
  /**
5989
- * Cleanup resources used by this strategy.
5990
- * Default implementation does nothing - override in derived classes as needed.
6357
+ * Cleanup resources used by this processor.
5991
6358
  */
5992
6359
  async cleanup() {
6360
+ await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
5993
6361
  }
5994
6362
  }
5995
- class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6363
+ class GitHubScraperStrategy extends BaseScraperStrategy {
5996
6364
  httpFetcher = new HttpFetcher();
5997
- pipelines;
5998
- resolvedBranch;
5999
- // Cache the resolved default branch
6000
- constructor() {
6001
- super();
6002
- this.pipelines = PipelineFactory$1.createStandardPipelines();
6003
- }
6365
+ wikiProcessor = new GitHubWikiProcessor();
6366
+ repoProcessor = new GitHubRepoProcessor();
6004
6367
  canHandle(url) {
6005
- const { hostname } = new URL(url);
6006
- return ["github.com", "www.github.com"].includes(hostname);
6007
- }
6008
- /**
6009
- * Override shouldProcessUrl to handle github-file:// URLs specially.
6010
- * These URLs bypass scope checking since they're internal file references.
6011
- */
6012
- shouldProcessUrl(url, options) {
6013
6368
  if (url.startsWith("github-file://")) {
6014
- const filePath = url.replace("github-file://", "");
6015
- return shouldIncludeUrl(filePath, options.includePatterns, options.excludePatterns);
6369
+ return true;
6370
+ }
6371
+ try {
6372
+ const parsedUrl = new URL(url);
6373
+ const { hostname, pathname } = parsedUrl;
6374
+ if (!["github.com", "www.github.com"].includes(hostname)) {
6375
+ return false;
6376
+ }
6377
+ const baseMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
6378
+ if (baseMatch) {
6379
+ return true;
6380
+ }
6381
+ const treeMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/tree\//);
6382
+ if (treeMatch) {
6383
+ return true;
6384
+ }
6385
+ const blobMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/blob\//);
6386
+ if (blobMatch) {
6387
+ return true;
6388
+ }
6389
+ return false;
6390
+ } catch {
6391
+ return false;
6016
6392
  }
6017
- return super.shouldProcessUrl(url, options);
6018
6393
  }
6019
6394
  /**
6020
6395
  * Parses a GitHub URL to extract repository information.
@@ -6028,20 +6403,19 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6028
6403
  const [, owner, repo] = match;
6029
6404
  const segments = parsedUrl.pathname.split("/").filter(Boolean);
6030
6405
  if (segments.length >= 4 && segments[2] === "blob") {
6031
- const branch2 = segments[3];
6406
+ const branch = segments[3];
6032
6407
  const filePath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
6033
- return { owner, repo, branch: branch2, filePath, isBlob: true };
6408
+ return { owner, repo, branch, filePath, isBlob: true };
6034
6409
  }
6035
- if (segments.length < 4 || segments[2] !== "tree") {
6036
- return { owner, repo };
6410
+ if (segments.length >= 4 && segments[2] === "tree") {
6411
+ const branch = segments[3];
6412
+ const subPath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
6413
+ return { owner, repo, branch, subPath };
6037
6414
  }
6038
- const branch = segments[3];
6039
- const subPath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
6040
- return { owner, repo, branch, subPath };
6415
+ return { owner, repo };
6041
6416
  }
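With the reordered branches above, parseGitHubUrl distinguishes three URL shapes: a blob URL (single file), a tree URL (branch plus optional subPath), and a bare repository root. A small runnable illustration (owner/repo values are examples):

    const cases = [
      "https://github.com/arabold/docs-mcp-server",                     // -> { owner, repo }
      "https://github.com/arabold/docs-mcp-server/tree/main/docs",      // -> { owner, repo, branch: "main", subPath: "docs" }
      "https://github.com/arabold/docs-mcp-server/blob/main/README.md", // -> { owner, repo, branch: "main", filePath: "README.md", isBlob: true }
    ];
    for (const href of cases) {
      const segments = new URL(href).pathname.split("/").filter(Boolean);
      const kind = segments[2] === "blob" ? "blob" : segments[2] === "tree" ? "tree" : "repo root";
      console.log(kind, segments.slice(0, 2).join("/"));
    }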
6042
6417
  /**
6043
6418
  * Fetches the repository tree structure from GitHub API.
6044
- * Uses 'HEAD' to get the default branch if no branch is specified.
6045
6419
  */
6046
6420
  async fetchRepositoryTree(repoInfo, signal) {
6047
6421
  const { owner, repo, branch } = repoInfo;
@@ -6060,7 +6434,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6060
6434
  targetBranch = "main";
6061
6435
  }
6062
6436
  }
6063
- this.resolvedBranch = targetBranch;
6064
6437
  const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
6065
6438
  logger.debug(`Fetching repository tree: ${treeUrl}`);
6066
6439
  const rawContent = await this.httpFetcher.fetch(treeUrl, { signal });
@@ -6082,14 +6455,12 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6082
6455
  }
6083
6456
  const path2 = item.path;
6084
6457
  const textExtensions = [
6085
- // Documentation
6086
6458
  ".md",
6087
6459
  ".mdx",
6088
6460
  ".txt",
6089
6461
  ".rst",
6090
6462
  ".adoc",
6091
6463
  ".asciidoc",
6092
- // Web technologies
6093
6464
  ".html",
6094
6465
  ".htm",
6095
6466
  ".xml",
@@ -6097,7 +6468,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6097
6468
  ".scss",
6098
6469
  ".sass",
6099
6470
  ".less",
6100
- // Programming languages
6101
6471
  ".js",
6102
6472
  ".jsx",
6103
6473
  ".ts",
@@ -6133,7 +6503,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6133
6503
  ".ps1",
6134
6504
  ".bat",
6135
6505
  ".cmd",
6136
- // Configuration and data
6137
6506
  ".json",
6138
6507
  ".yaml",
6139
6508
  ".yml",
@@ -6147,7 +6516,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6147
6516
  ".dockerignore",
6148
6517
  ".gitattributes",
6149
6518
  ".editorconfig",
6150
- // Build and package management
6151
6519
  ".gradle",
6152
6520
  ".pom",
6153
6521
  ".sbt",
@@ -6156,10 +6524,7 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6156
6524
  ".make",
6157
6525
  ".dockerfile",
6158
6526
  ".mod",
6159
- // Go modules (go.mod)
6160
6527
  ".sum",
6161
- // Go checksums (go.sum)
6162
- // Other text formats
6163
6528
  ".sql",
6164
6529
  ".graphql",
6165
6530
  ".gql",
@@ -6172,20 +6537,16 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6172
6537
  ];
6173
6538
  const pathLower = path2.toLowerCase();
6174
6539
  const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext));
6175
- const hasCompoundExtension = pathLower.includes(".env.") || // .env.example, .env.local, etc.
6176
- pathLower.endsWith(".env") || pathLower.includes(".config.") || // webpack.config.js, etc.
6177
- pathLower.includes(".lock");
6540
+ const hasCompoundExtension = pathLower.includes(".env.") || pathLower.endsWith(".env") || pathLower.includes(".config.") || pathLower.includes(".lock");
6178
6541
  const fileName = path2.split("/").pop() || "";
6179
6542
  const fileNameLower = fileName.toLowerCase();
6180
6543
  const commonTextFiles = [
6181
- // Documentation files without extensions
6182
6544
  "readme",
6183
6545
  "license",
6184
6546
  "changelog",
6185
6547
  "contributing",
6186
6548
  "authors",
6187
6549
  "maintainers",
6188
- // Build files without extensions
6189
6550
  "dockerfile",
6190
6551
  "makefile",
6191
6552
  "rakefile",
@@ -6193,374 +6554,125 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6193
6554
  "podfile",
6194
6555
  "cartfile",
6195
6556
  "brewfile",
6196
- "procfile",
6197
- "vagrantfile",
6198
- "gulpfile",
6199
- "gruntfile",
6200
- // Configuration files (dotfiles)
6201
- ".prettierrc",
6202
- ".eslintrc",
6203
- ".babelrc",
6204
- ".nvmrc",
6205
- ".npmrc"
6206
- ];
6207
- const isCommonTextFile = commonTextFiles.some((name2) => {
6208
- if (name2.startsWith(".")) {
6209
- return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6210
- }
6211
- return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6212
- });
6213
- if (!hasTextExtension && !hasCompoundExtension && !isCommonTextFile) {
6214
- return false;
6215
- }
6216
- return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
6217
- }
6218
- /**
6219
- * Fetches the raw content of a file from GitHub.
6220
- */
6221
- async fetchFileContent(repoInfo, filePath, signal) {
6222
- const { owner, repo } = repoInfo;
6223
- const branch = this.resolvedBranch || repoInfo.branch || "main";
6224
- const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
6225
- const rawContent = await this.httpFetcher.fetch(rawUrl, { signal });
6226
- const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
6227
- if (detectedMimeType && rawContent.mimeType === "text/plain") {
6228
- return {
6229
- ...rawContent,
6230
- mimeType: detectedMimeType
6231
- };
6232
- }
6233
- return rawContent;
6234
- }
6235
- async processItem(item, options, _progressCallback, signal) {
6236
- const repoInfo = this.parseGitHubUrl(options.url);
6237
- if (item.depth === 0) {
6238
- if ("isBlob" in repoInfo && repoInfo.isBlob) {
6239
- if (repoInfo.filePath) {
6240
- logger.info(
6241
- `📄 Processing single file: ${repoInfo.owner}/${repoInfo.repo}/${repoInfo.filePath}`
6242
- );
6243
- return { links: [`github-file://${repoInfo.filePath}`] };
6244
- } else {
6245
- logger.warn(
6246
- `⚠️ Blob URL without file path: ${options.url}. No files to process.`
6247
- );
6248
- return { links: [] };
6249
- }
6250
- }
6251
- logger.info(
6252
- `🗂️ Discovering repository structure for ${repoInfo.owner}/${repoInfo.repo}`
6253
- );
6254
- const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
6255
- const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
6256
- logger.info(
6257
- `📁 Found ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
6258
- );
6259
- const links = fileItems.map((treeItem) => `github-file://${treeItem.path}`);
6260
- return { links };
6261
- }
6262
- if (item.url.startsWith("github-file://")) {
6263
- const filePath = item.url.replace("github-file://", "");
6264
- logger.info(
6265
- `🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`
6266
- );
6267
- const rawContent = await this.fetchFileContent(repoInfo, filePath, signal);
6268
- let processed;
6269
- for (const pipeline of this.pipelines) {
6270
- if (pipeline.canProcess(rawContent)) {
6271
- logger.debug(
6272
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
6273
- );
6274
- const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6275
- processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
6276
- break;
6277
- }
6278
- }
6279
- if (!processed) {
6280
- logger.warn(
6281
- `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
6282
- );
6283
- return { document: void 0, links: [] };
6284
- }
6285
- for (const err of processed.errors) {
6286
- logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
6287
- }
6288
- const githubUrl = `https://github.com/${repoInfo.owner}/${repoInfo.repo}/blob/${this.resolvedBranch || repoInfo.branch || "main"}/${filePath}`;
6289
- const processedTitle = processed.metadata.title;
6290
- const hasValidTitle = typeof processedTitle === "string" && processedTitle.trim() !== "";
6291
- const fallbackTitle = filePath.split("/").pop() || "Untitled";
6292
- return {
6293
- document: {
6294
- content: typeof processed.textContent === "string" ? processed.textContent : "",
6295
- metadata: {
6296
- url: githubUrl,
6297
- title: hasValidTitle ? processedTitle : fallbackTitle,
6298
- library: options.library,
6299
- version: options.version
6300
- },
6301
- contentType: rawContent.mimeType
6302
- // Preserve the detected MIME type
6303
- },
6304
- links: []
6305
- // Always return empty links array for individual files
6306
- };
6307
- }
6308
- return { document: void 0, links: [] };
6309
- }
6310
- /**
6311
- * Normalize a path by removing leading and trailing slashes.
6312
- */
6313
- normalizePath(path2) {
6314
- return path2.replace(/^\/+/, "").replace(/\/+$/, "");
6315
- }
6316
- isWithinSubPath(path2, subPath) {
6317
- if (!subPath) {
6318
- return true;
6319
- }
6320
- const trimmedSubPath = this.normalizePath(subPath);
6321
- if (trimmedSubPath.length === 0) {
6322
- return true;
6323
- }
6324
- const normalizedPath = this.normalizePath(path2);
6325
- if (normalizedPath === trimmedSubPath) {
6326
- return true;
6327
- }
6328
- return normalizedPath.startsWith(`${trimmedSubPath}/`);
6329
- }
6330
- async scrape(options, progressCallback, signal) {
6331
- const url = new URL(options.url);
6332
- if (!url.hostname.includes("github.com")) {
6333
- throw new Error("URL must be a GitHub URL");
6334
- }
6335
- return super.scrape(options, progressCallback, signal);
6336
- }
6337
- /**
6338
- * Cleanup resources used by this strategy, specifically the pipeline browser instances.
6339
- */
6340
- async cleanup() {
6341
- await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
6342
- }
6343
- }
6344
- class GitHubWikiScraperStrategy extends BaseScraperStrategy {
6345
- httpFetcher = new HttpFetcher();
6346
- pipelines;
6347
- constructor() {
6348
- super();
6349
- this.pipelines = PipelineFactory$1.createStandardPipelines();
6350
- }
6351
- canHandle(url) {
6352
- try {
6353
- const parsedUrl = new URL(url);
6354
- const { hostname, pathname } = parsedUrl;
6355
- return ["github.com", "www.github.com"].includes(hostname) && pathname.includes("/wiki") && pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/) !== null;
6356
- } catch {
6357
- return false;
6358
- }
6359
- }
6360
- /**
6361
- * Parses a GitHub wiki URL to extract repository information.
6362
- */
6363
- parseGitHubWikiUrl(url) {
6364
- const parsedUrl = new URL(url);
6365
- const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/);
6366
- if (!match) {
6367
- throw new Error(`Invalid GitHub wiki URL: ${url}`);
6368
- }
6369
- const [, owner, repo] = match;
6370
- return { owner, repo };
6371
- }
6372
- /**
6373
- * Override shouldProcessUrl to only process URLs within the wiki scope.
6374
- */
6375
- shouldProcessUrl(url, options) {
6376
- try {
6377
- const parsedUrl = new URL(url);
6378
- const wikiInfo = this.parseGitHubWikiUrl(options.url);
6379
- const expectedWikiPath = `/${wikiInfo.owner}/${wikiInfo.repo}/wiki`;
6380
- if (!parsedUrl.pathname.startsWith(expectedWikiPath)) {
6381
- return false;
6382
- }
6383
- const wikiPagePath = parsedUrl.pathname.replace(expectedWikiPath, "").replace(/^\//, "");
6384
- return shouldIncludeUrl(
6385
- wikiPagePath || "Home",
6386
- options.includePatterns,
6387
- options.excludePatterns
6388
- );
6389
- } catch {
6390
- return false;
6391
- }
6392
- }
6393
- async processItem(item, options, _progressCallback, signal) {
6394
- const currentUrl = item.url;
6395
- logger.info(
6396
- `📖 Processing wiki page ${this.pageCount}/${options.maxPages}: ${currentUrl}`
6397
- );
6398
- try {
6399
- const rawContent = await this.httpFetcher.fetch(currentUrl, { signal });
6400
- let processed;
6401
- for (const pipeline of this.pipelines) {
6402
- if (pipeline.canProcess(rawContent)) {
6403
- logger.debug(
6404
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`
6405
- );
6406
- const wikiOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6407
- processed = await pipeline.process(rawContent, wikiOptions, this.httpFetcher);
6408
- break;
6409
- }
6410
- }
6411
- if (!processed) {
6412
- logger.warn(
6413
- `⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`
6414
- );
6415
- return { document: void 0, links: [] };
6416
- }
6417
- for (const err of processed.errors) {
6418
- logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`);
6557
+ "procfile",
6558
+ "vagrantfile",
6559
+ "gulpfile",
6560
+ "gruntfile",
6561
+ ".prettierrc",
6562
+ ".eslintrc",
6563
+ ".babelrc",
6564
+ ".nvmrc",
6565
+ ".npmrc"
6566
+ ];
6567
+ const isCommonTextFile = commonTextFiles.some((name2) => {
6568
+ if (name2.startsWith(".")) {
6569
+ return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6419
6570
  }
6420
- const parsedUrl = new URL(currentUrl);
6421
- const wikiInfo = this.parseGitHubWikiUrl(currentUrl);
6422
- const wikiPagePath = parsedUrl.pathname.replace(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`, "").replace(/^\//, "");
6423
- const pageTitle = wikiPagePath || "Home";
6424
- const document2 = {
6425
- content: typeof processed.textContent === "string" ? processed.textContent : "",
6426
- metadata: {
6427
- url: currentUrl,
6428
- title: typeof processed.metadata.title === "string" && processed.metadata.title.trim() !== "" ? processed.metadata.title : pageTitle,
6429
- library: options.library,
6430
- version: options.version
6431
- },
6432
- contentType: rawContent.mimeType
6433
- };
6434
- const links = processed.links || [];
6435
- const wikiLinks = links.filter((link) => {
6436
- if (!link || link.trim() === "" || link === "invalid-url" || link === "not-a-url-at-all") {
6437
- return false;
6438
- }
6439
- return true;
6440
- }).map((link) => {
6441
- try {
6442
- return new URL(link, currentUrl).href;
6443
- } catch {
6444
- return null;
6445
- }
6446
- }).filter((link) => link !== null).filter((link) => {
6447
- try {
6448
- const linkUrl = new URL(link);
6449
- return linkUrl.hostname === parsedUrl.hostname && linkUrl.pathname.startsWith(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`);
6450
- } catch {
6451
- return false;
6452
- }
6453
- });
6454
- return { document: document2, links: wikiLinks };
6455
- } catch (error) {
6456
- logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`);
6457
- return { document: void 0, links: [] };
6458
- }
6459
- }
6460
- async scrape(options, progressCallback, signal) {
6461
- const url = new URL(options.url);
6462
- if (!url.hostname.includes("github.com") || !url.pathname.includes("/wiki")) {
6463
- throw new Error("URL must be a GitHub wiki URL");
6571
+ return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6572
+ });
6573
+ if (hasTextExtension || hasCompoundExtension || isCommonTextFile) {
6574
+ return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
6464
6575
  }
6465
- let startUrl = options.url;
6466
- if (url.pathname.endsWith("/wiki") || url.pathname.endsWith("/wiki/")) {
6467
- startUrl = url.pathname.endsWith("/") ? `${options.url}Home` : `${options.url}/Home`;
6576
+ const mimeType = mime.getType(path2);
6577
+ if (mimeType?.startsWith("text/")) {
6578
+ logger.debug(`Including file with text MIME type: ${path2} (${mimeType})`);
6579
+ return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
6468
6580
  }
6469
- const wikiOptions = { ...options, url: startUrl };
6470
- return super.scrape(wikiOptions, progressCallback, signal);
6581
+ return false;
6471
6582
  }
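shouldProcessFile now has a fourth chance to accept a file: when neither the extension allowlist, the compound-extension check (.env*, *.config.*, *.lock), nor the well-known-filename list matches, it falls back to a MIME lookup and keeps anything resolving to text/*; only then are the include/exclude patterns applied. A condensed standalone sketch of that decision order; the short arrays stand in for the much longer allowlists above, and the pattern step is omitted:

    import mime from "mime";

    const textExtensions = [".md", ".ts", ".json", ".yaml"];     // abbreviated
    const commonTextFiles = ["readme", "license", "dockerfile"]; // abbreviated

    function looksLikeTextFile(path) {
      const lower = path.toLowerCase();
      const fileName = lower.split("/").pop() || "";
      if (textExtensions.some((ext) => lower.endsWith(ext))) return true;
      if (lower.includes(".env.") || lower.endsWith(".env") ||
          lower.includes(".config.") || lower.includes(".lock")) return true;
      if (commonTextFiles.some((name) => fileName === name || fileName.startsWith(`${name}.`))) return true;
      // New in 1.27.0: fall back to a MIME lookup for anything else.
      return (mime.getType(path) ?? "").startsWith("text/");
    }

    console.log(looksLikeTextFile("docs/guide.md"));   // true  (extension allowlist)
    console.log(looksLikeTextFile("README"));          // true  (well-known filename)
    console.log(looksLikeTextFile("assets/logo.png")); // false (image/png)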
6472
6583
  /**
6473
- * Cleanup resources used by this strategy.
6584
+ * Checks if a path is within the specified subpath.
6474
6585
  */
6475
- async cleanup() {
6476
- await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
6586
+ isWithinSubPath(path2, subPath) {
6587
+ if (!subPath) {
6588
+ return true;
6589
+ }
6590
+ const trimmedSubPath = subPath.replace(/^\/+/, "").replace(/\/+$/, "");
6591
+ if (trimmedSubPath.length === 0) {
6592
+ return true;
6593
+ }
6594
+ const normalizedPath = path2.replace(/^\/+/, "").replace(/\/+$/, "");
6595
+ if (normalizedPath === trimmedSubPath) {
6596
+ return true;
6597
+ }
6598
+ return normalizedPath.startsWith(`${trimmedSubPath}/`);
6477
6599
  }
6478
- }
6479
- class GitHubScraperStrategy {
6480
- repoStrategy = new GitHubRepoScraperStrategy();
6481
- wikiStrategy = new GitHubWikiScraperStrategy();
6482
- canHandle(url) {
6600
+ async processItem(item, options, signal) {
6601
+ if (item.url.startsWith("github-file://")) {
6602
+ logger.info(
6603
+ `🗑️ Legacy github-file:// URL detected, marking as deleted: ${item.url}`
6604
+ );
6605
+ return {
6606
+ url: item.url,
6607
+ links: [],
6608
+ status: FetchStatus.NOT_FOUND
6609
+ };
6610
+ }
6483
6611
  try {
6484
- const parsedUrl = new URL(url);
6485
- const { hostname, pathname } = parsedUrl;
6486
- if (!["github.com", "www.github.com"].includes(hostname)) {
6487
- return false;
6612
+ const parsedUrl = new URL(item.url);
6613
+ if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) {
6614
+ return await this.wikiProcessor.process(item, options, signal);
6488
6615
  }
6489
- const pathMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
6490
- return pathMatch !== null;
6491
6616
  } catch {
6492
- return false;
6493
- }
6494
- }
6495
- async scrape(options, progressCallback, signal) {
6496
- const url = new URL(options.url);
6497
- if (!url.hostname.includes("github.com")) {
6498
- throw new Error("URL must be a GitHub URL");
6499
6617
  }
6500
- const pathMatch = url.pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
6501
- if (!pathMatch) {
6502
- throw new Error("URL must be a base GitHub repository URL");
6503
- }
6504
- const [, owner, repo] = pathMatch;
6505
- logger.info(`🚀 Starting comprehensive GitHub scraping for ${owner}/${repo}`);
6506
- let totalPagesDiscovered = 0;
6507
- let wikiPagesScraped = 0;
6508
- let wikiCompleted = false;
6509
- let repoCompleted = false;
6510
- const mergedProgressCallback = async (progress) => {
6511
- if (!wikiCompleted) {
6512
- totalPagesDiscovered = progress.totalDiscovered;
6513
- wikiPagesScraped = progress.pagesScraped;
6514
- } else if (!repoCompleted) {
6515
- progress = {
6516
- ...progress,
6517
- pagesScraped: wikiPagesScraped + progress.pagesScraped,
6518
- totalPages: wikiPagesScraped + progress.totalPages,
6519
- totalDiscovered: totalPagesDiscovered + progress.totalDiscovered
6618
+ if (item.depth === 0) {
6619
+ const repoInfo = this.parseGitHubUrl(options.url);
6620
+ const { owner, repo } = repoInfo;
6621
+ logger.debug(`Discovering GitHub repository ${owner}/${repo}`);
6622
+ const discoveredLinks = [];
6623
+ if ("isBlob" in repoInfo && repoInfo.isBlob && repoInfo.filePath) {
6624
+ const { branch = "main", filePath } = repoInfo;
6625
+ logger.debug(
6626
+ `Single file URL detected: ${owner}/${repo}/${filePath} - indexing file only`
6627
+ );
6628
+ discoveredLinks.push(
6629
+ `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`
6630
+ );
6631
+ return {
6632
+ url: item.url,
6633
+ links: discoveredLinks,
6634
+ status: FetchStatus.SUCCESS
6520
6635
  };
6521
6636
  }
6522
- await progressCallback(progress);
6523
- };
6524
- try {
6525
6637
  const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
6526
- const wikiOptions = { ...options, url: wikiUrl };
6527
- logger.info(`📖 Attempting to scrape wiki for ${owner}/${repo}`);
6528
- try {
6529
- await this.wikiStrategy.scrape(wikiOptions, mergedProgressCallback, signal);
6530
- wikiCompleted = true;
6531
- logger.info(
6532
- `✅ Completed wiki scraping for ${owner}/${repo} (${wikiPagesScraped} pages)`
6533
- );
6534
- } catch (error) {
6535
- wikiCompleted = true;
6536
- logger.info(`ℹ️ Wiki not available or accessible for ${owner}/${repo}: ${error}`);
6537
- }
6538
- const maxPages = options.maxPages || 1e3;
6539
- const remainingPages = Math.max(0, maxPages - wikiPagesScraped);
6540
- if (remainingPages > 0) {
6541
- logger.info(
6542
- `📂 Scraping repository code for ${owner}/${repo} (${remainingPages} pages remaining)`
6543
- );
6544
- const repoOptions = { ...options, maxPages: remainingPages };
6545
- await this.repoStrategy.scrape(repoOptions, mergedProgressCallback, signal);
6546
- repoCompleted = true;
6547
- logger.info(`✅ Completed repository code scraping for ${owner}/${repo}`);
6548
- } else {
6549
- logger.info(
6550
- `ℹ️ Skipping repository code scraping - page limit reached with wiki content`
6551
- );
6638
+ discoveredLinks.push(wikiUrl);
6639
+ logger.debug(`Discovered wiki URL: ${wikiUrl}`);
6640
+ const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
6641
+ const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
6642
+ logger.debug(
6643
+ `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
6644
+ );
6645
+ const fileUrls = fileItems.map(
6646
+ (treeItem) => `https://github.com/${owner}/${repo}/blob/${resolvedBranch}/${treeItem.path}`
6647
+ );
6648
+ discoveredLinks.push(...fileUrls);
6649
+ logger.debug(
6650
+ `Discovery complete: ${fileUrls.length} repo file(s) + 1 wiki URL = ${discoveredLinks.length} total URLs`
6651
+ );
6652
+ return { url: item.url, links: discoveredLinks, status: FetchStatus.SUCCESS };
6653
+ }
6654
+ try {
6655
+ const parsedUrl = new URL(item.url);
6656
+ if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) {
6657
+ logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
6658
+ return await this.repoProcessor.process(item, options, signal);
6552
6659
  }
6553
- logger.info(`🎉 Comprehensive GitHub scraping completed for ${owner}/${repo}`);
6554
6660
  } catch (error) {
6555
- logger.error(`❌ GitHub scraping failed for ${owner}/${repo}: ${error}`);
6556
- throw error;
6661
+ logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
6662
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
6557
6663
  }
6664
+ logger.debug(`No further processing for URL at depth ${item.depth}: ${item.url}`);
6665
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
6666
+ }
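The combined strategy now does all discovery at depth 0: it emits the repository's wiki URL plus one https blob URL per file that passes shouldProcessFile, and wiki or blob URLs at later depths are routed to the matching processor by pathname; legacy github-file:// entries left over from 1.26.x are reported as NOT_FOUND so a refresh removes them. A sketch of just the routing predicate, mirroring the regular expressions above (the processor classes themselves are internal):

    function routeGitHubUrl(href) {
      if (href.startsWith("github-file://")) return "legacy -> NOT_FOUND";
      const { pathname } = new URL(href);
      if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(pathname)) return "GitHubWikiProcessor";
      if (/^\/[^/]+\/[^/]+\/blob\//.test(pathname)) return "GitHubRepoProcessor";
      return "no further processing";
    }

    console.log(routeGitHubUrl("https://github.com/owner/repo/wiki/Home"));          // GitHubWikiProcessor
    console.log(routeGitHubUrl("https://github.com/owner/repo/blob/main/README.md")); // GitHubRepoProcessor
    console.log(routeGitHubUrl("github-file://docs/intro.md"));                       // legacy -> NOT_FOUND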
6667
+ async scrape(options, progressCallback, signal) {
6668
+ const url = new URL(options.url);
6669
+ if (!url.hostname.includes("github.com")) {
6670
+ throw new Error("URL must be a GitHub URL");
6671
+ }
6672
+ await super.scrape(options, progressCallback, signal);
6558
6673
  }
6559
- /**
6560
- * Cleanup resources used by both underlying strategies.
6561
- */
6562
6674
  async cleanup() {
6563
- await Promise.allSettled([this.repoStrategy.cleanup(), this.wikiStrategy.cleanup()]);
6675
+ await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
6564
6676
  }
6565
6677
  }
6566
6678
  class LocalFileStrategy extends BaseScraperStrategy {
@@ -6573,23 +6685,41 @@ class LocalFileStrategy extends BaseScraperStrategy {
6573
6685
  canHandle(url) {
6574
6686
  return url.startsWith("file://");
6575
6687
  }
6576
- async processItem(item, options, _progressCallback, _signal) {
6688
+ async processItem(item, options, _signal) {
6577
6689
  let filePath = item.url.replace(/^file:\/\/\/?/, "");
6578
6690
  filePath = decodeURIComponent(filePath);
6579
6691
  if (!filePath.startsWith("/") && process.platform !== "win32") {
6580
6692
  filePath = `/${filePath}`;
6581
6693
  }
6582
- const stats = await fs$1.stat(filePath);
6694
+ let stats;
6695
+ try {
6696
+ stats = await fs$1.stat(filePath);
6697
+ } catch (error) {
6698
+ if (error.code === "ENOENT") {
6699
+ logger.info(`✓ File deleted or not available: ${filePath}`);
6700
+ return {
6701
+ url: item.url,
6702
+ links: [],
6703
+ status: FetchStatus.NOT_FOUND
6704
+ };
6705
+ }
6706
+ throw error;
6707
+ }
6583
6708
  if (stats.isDirectory()) {
6584
6709
  const contents = await fs$1.readdir(filePath);
6585
6710
  const links = contents.map((name2) => `file://${path.join(filePath, name2)}`).filter((url) => this.shouldProcessUrl(url, options));
6586
- return { links };
6711
+ return { url: item.url, links, status: FetchStatus.SUCCESS };
6712
+ }
6713
+ const rawContent = await this.fileFetcher.fetch(item.url, {
6714
+ etag: item.etag
6715
+ });
6716
+ if (rawContent.status === FetchStatus.NOT_MODIFIED) {
6717
+ logger.debug(`✓ File unchanged: ${filePath}`);
6718
+ return { url: rawContent.source, links: [], status: FetchStatus.NOT_MODIFIED };
6587
6719
  }
6588
- logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
6589
- const rawContent = await this.fileFetcher.fetch(item.url);
6590
6720
  let processed;
6591
6721
  for (const pipeline of this.pipelines) {
6592
- if (pipeline.canProcess(rawContent)) {
6722
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
6593
6723
  logger.debug(
6594
6724
  `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
6595
6725
  );
@@ -6601,22 +6731,22 @@ class LocalFileStrategy extends BaseScraperStrategy {
6601
6731
  logger.warn(
6602
6732
  `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
6603
6733
  );
6604
- return { document: void 0, links: [] };
6734
+ return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
6605
6735
  }
6606
- for (const err of processed.errors) {
6736
+ for (const err of processed.errors ?? []) {
6607
6737
  logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
6608
6738
  }
6739
+ const filename = path.basename(filePath);
6740
+ const title = processed.title?.trim() || filename || null;
6609
6741
  return {
6610
- document: {
6611
- content: typeof processed.textContent === "string" ? processed.textContent : "",
6612
- contentType: rawContent.mimeType,
6613
- metadata: {
6614
- url: rawContent.source,
6615
- title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
6616
- library: options.library,
6617
- version: options.version
6618
- }
6619
- }
6742
+ url: rawContent.source,
6743
+ title,
6744
+ etag: rawContent.etag,
6745
+ lastModified: rawContent.lastModified,
6746
+ contentType: rawContent.mimeType,
6747
+ content: processed,
6748
+ links: [],
6749
+ status: FetchStatus.SUCCESS
6620
6750
  };
6621
6751
  }
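LocalFileStrategy now degrades gracefully during refreshes: a missing file (ENOENT) is reported as NOT_FOUND so the stored page can be deleted, and the stored ETag is passed to the file fetcher so unchanged files short-circuit with NOT_MODIFIED before any pipeline runs. A reduced stand-in for the stat portion of that logic; the status strings mirror the FetchStatus values used in this bundle:

    import fs from "node:fs/promises";

    async function statOrDeleted(filePath) {
      try {
        return { status: "success", stats: await fs.stat(filePath) };
      } catch (error) {
        if (error.code === "ENOENT") {
          // Missing file: report a deletion instead of failing the whole job.
          return { status: "not_found" };
        }
        throw error; // any other error is still fatal
      }
    }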
6622
6752
  /**
@@ -6652,19 +6782,32 @@ class WebScraperStrategy extends BaseScraperStrategy {
6652
6782
  * @param signal - Optional abort signal for request cancellation.
6653
6783
  * @returns An object containing the processed document and extracted links.
6654
6784
  */
6655
- async processItem(item, options, _progressCallback, signal) {
6785
+ async processItem(item, options, signal) {
6656
6786
  const { url } = item;
6657
6787
  try {
6788
+ if (item.etag) {
6789
+ logger.debug(`Processing ${url} with stored ETag: ${item.etag}`);
6790
+ }
6658
6791
  const fetchOptions = {
6659
6792
  signal,
6660
6793
  followRedirects: options.followRedirects,
6661
- headers: options.headers
6794
+ headers: options.headers,
6662
6795
  // Forward custom headers
6796
+ etag: item.etag
6797
+ // Pass ETag for conditional requests
6663
6798
  };
6664
6799
  const rawContent = await this.fetcher.fetch(url, fetchOptions);
6800
+ logger.debug(
6801
+ `Fetch result for ${url}: status=${rawContent.status}, etag=${rawContent.etag || "none"}`
6802
+ );
6803
+ if (rawContent.status !== FetchStatus.SUCCESS) {
6804
+ logger.debug(`Skipping pipeline for ${url} due to status: ${rawContent.status}`);
6805
+ return { url: rawContent.source, links: [], status: rawContent.status };
6806
+ }
6665
6807
  let processed;
6666
6808
  for (const pipeline of this.pipelines) {
6667
- if (pipeline.canProcess(rawContent)) {
6809
+ const contentBuffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
6810
+ if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
6668
6811
  logger.debug(
6669
6812
  `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`
6670
6813
  );
@@ -6676,40 +6819,47 @@ class WebScraperStrategy extends BaseScraperStrategy {
6676
6819
  logger.warn(
6677
6820
  `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
6678
6821
  );
6679
- return { document: void 0, links: [] };
6822
+ return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
6680
6823
  }
6681
- for (const err of processed.errors) {
6824
+ for (const err of processed.errors ?? []) {
6682
6825
  logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
6683
6826
  }
6684
6827
  if (!processed.textContent || !processed.textContent.trim()) {
6685
6828
  logger.warn(
6686
6829
  `⚠️ No processable content found for ${url} after pipeline execution.`
6687
6830
  );
6688
- return { document: void 0, links: processed.links };
6831
+ return {
6832
+ url: rawContent.source,
6833
+ links: processed.links,
6834
+ status: FetchStatus.SUCCESS
6835
+ };
6689
6836
  }
6690
- const baseUrl = item.depth === 0 ? new URL(rawContent.source) : this.canonicalBaseUrl ?? new URL(options.url);
6691
- const filteredLinks = processed.links.filter((link) => {
6837
+ if (item.depth === 0) {
6838
+ this.canonicalBaseUrl = new URL(rawContent.source);
6839
+ }
6840
+ const filteredLinks = processed.links?.filter((link) => {
6692
6841
  try {
6693
6842
  const targetUrl = new URL(link);
6694
- const scope = options.scope || "subpages";
6695
- return isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
6843
+ if (!this.shouldProcessUrl(targetUrl.href, options)) {
6844
+ return false;
6845
+ }
6846
+ if (this.shouldFollowLinkFn) {
6847
+ const baseUrl = this.canonicalBaseUrl ?? new URL(options.url);
6848
+ return this.shouldFollowLinkFn(baseUrl, targetUrl);
6849
+ }
6850
+ return true;
6696
6851
  } catch {
6697
6852
  return false;
6698
6853
  }
6699
- });
6854
+ }) ?? [];
6700
6855
  return {
6701
- document: {
6702
- content: processed.textContent,
6703
- metadata: {
6704
- url,
6705
- title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
6706
- library: options.library,
6707
- version: options.version,
6708
- ...processed.metadata
6709
- }
6710
- },
6856
+ url: rawContent.source,
6857
+ etag: rawContent.etag,
6858
+ lastModified: rawContent.lastModified,
6859
+ contentType: processed.contentType || rawContent.mimeType,
6860
+ content: processed,
6711
6861
  links: filteredLinks,
6712
- finalUrl: rawContent.source
6862
+ status: FetchStatus.SUCCESS
6713
6863
  };
6714
6864
  } catch (error) {
6715
6865
  logger.error(`❌ Failed processing page ${url}: ${error}`);
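Two things changed in the web strategy's processItem: the stored ETag rides along as a conditional request, so anything other than SUCCESS (for example NOT_MODIFIED) returns before the pipelines run, and extracted links are now filtered through shouldProcessUrl (scope plus include/exclude patterns from the base strategy) followed by the optional shouldFollowLinkFn hook against the canonical base URL. A sketch of that two-stage link filter with stand-in predicates:

    // The two predicates stand in for shouldProcessUrl and shouldFollowLinkFn.
    function filterLinks(links, baseHref, shouldProcessUrl, shouldFollowLinkFn) {
      return links.filter((link) => {
        try {
          const targetUrl = new URL(link);
          if (!shouldProcessUrl(targetUrl.href)) return false;
          return shouldFollowLinkFn ? shouldFollowLinkFn(new URL(baseHref), targetUrl) : true;
        } catch {
          return false; // unparseable links are dropped
        }
      });
    }

    console.log(
      filterLinks(["https://example.com/docs/a", "not a url"], "https://example.com/docs/", () => true),
    ); // -> ["https://example.com/docs/a"]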
@@ -6786,7 +6936,6 @@ class ScraperRegistry {
6786
6936
  this.strategies = [
6787
6937
  new NpmScraperStrategy(),
6788
6938
  new PyPiScraperStrategy(),
6789
- new GitHubWikiScraperStrategy(),
6790
6939
  new GitHubScraperStrategy(),
6791
6940
  new WebScraperStrategy(),
6792
6941
  new LocalFileStrategy()
@@ -6848,55 +6997,64 @@ class PipelineWorker {
6848
6997
  * @param callbacks - Callbacks provided by the manager for reporting.
6849
6998
  */
6850
6999
  async executeJob(job, callbacks) {
6851
- const {
6852
- id: jobId,
6853
- library,
6854
- version: version2,
6855
- sourceUrl,
6856
- scraperOptions,
6857
- abortController
6858
- } = job;
7000
+ const { id: jobId, library, version: version2, scraperOptions, abortController } = job;
6859
7001
  const signal = abortController.signal;
6860
7002
  logger.debug(`[${jobId}] Worker starting job for ${library}@${version2}`);
6861
7003
  try {
6862
- await this.store.removeAllDocuments(library, version2);
6863
- logger.info(
6864
- `💾 Cleared store for ${library}@${version2 || "[no version]"} before scraping.`
6865
- );
6866
- const runtimeOptions = {
6867
- url: sourceUrl ?? "",
6868
- library,
6869
- version: version2,
6870
- ...scraperOptions
6871
- };
7004
+ if (!scraperOptions.isRefresh) {
7005
+ await this.store.removeAllDocuments(library, version2);
7006
+ logger.info(
7007
+ `💾 Cleared store for ${library}@${version2 || "[no version]"} before scraping.`
7008
+ );
7009
+ } else {
7010
+ logger.info(
7011
+ `🔄 Refresh operation - preserving existing data for ${library}@${version2 || "[no version]"}.`
7012
+ );
7013
+ }
6872
7014
  await this.scraperService.scrape(
6873
- runtimeOptions,
7015
+ scraperOptions,
6874
7016
  async (progress) => {
6875
7017
  if (signal.aborted) {
6876
7018
  throw new CancellationError("Job cancelled during scraping progress");
6877
7019
  }
6878
7020
  await callbacks.onJobProgress?.(job, progress);
6879
- if (progress.document) {
7021
+ if (progress.deleted && progress.pageId) {
6880
7022
  try {
6881
- await this.store.addDocument(library, version2, {
6882
- pageContent: progress.document.content,
6883
- metadata: {
6884
- ...progress.document.metadata,
6885
- mimeType: progress.document.contentType
6886
- // Pass contentType as mimeType in metadata
6887
- }
6888
- });
7023
+ await this.store.deletePage(progress.pageId);
6889
7024
  logger.debug(
6890
- `[${jobId}] Stored document: ${progress.document.metadata.url}`
7025
+ `[${jobId}] Deleted page ${progress.pageId}: ${progress.currentUrl}`
7026
+ );
7027
+ } catch (docError) {
7028
+ logger.error(
7029
+ `❌ [${jobId}] Failed to delete page ${progress.pageId}: ${docError}`
7030
+ );
7031
+ const error = docError instanceof Error ? docError : new Error(String(docError));
7032
+ await callbacks.onJobError?.(job, error);
7033
+ throw error;
7034
+ }
7035
+ } else if (progress.result) {
7036
+ try {
7037
+ if (progress.pageId) {
7038
+ await this.store.deletePage(progress.pageId);
7039
+ logger.debug(
7040
+ `[${jobId}] Refreshing page ${progress.pageId}: ${progress.currentUrl}`
7041
+ );
7042
+ }
7043
+ await this.store.addScrapeResult(
7044
+ library,
7045
+ version2,
7046
+ progress.depth,
7047
+ progress.result
6891
7048
  );
7049
+ logger.debug(`[${jobId}] Stored processed content: ${progress.currentUrl}`);
6892
7050
  } catch (docError) {
6893
7051
  logger.error(
6894
- `❌ [${jobId}] Failed to store document ${progress.document.metadata.url}: ${docError}`
7052
+ `❌ [${jobId}] Failed to process content ${progress.currentUrl}: ${docError}`
6895
7053
  );
6896
7054
  await callbacks.onJobError?.(
6897
7055
  job,
6898
7056
  docError instanceof Error ? docError : new Error(String(docError)),
6899
- progress.document
7057
+ progress.result
6900
7058
  );
6901
7059
  }
6902
7060
  }
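The worker's progress handler now covers three cases instead of one: a deletion notice (progress.deleted plus pageId) removes the stored page, a refreshed page (progress.result plus pageId) is deleted and re-added, and a new page is simply persisted through addScrapeResult; refresh jobs also skip the upfront removeAllDocuments wipe. A compact sketch of that branching with a stubbed store, using method names that mirror the calls above:

    async function handleProgress(store, library, version, progress) {
      if (progress.deleted && progress.pageId) {
        await store.deletePage(progress.pageId);   // page disappeared upstream
        return;
      }
      if (progress.result) {
        if (progress.pageId) {
          await store.deletePage(progress.pageId); // refresh: replace the existing row
        }
        await store.addScrapeResult(library, version, progress.depth, progress.result);
      }
    }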
@@ -7108,15 +7266,8 @@ class PipelineManager {
7108
7266
  /**
7109
7267
  * Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned).
7110
7268
  */
7111
- async enqueueJob(library, version2, options) {
7269
+ async enqueueScrapeJob(library, version2, options) {
7112
7270
  const normalizedVersion = version2 ?? "";
7113
- const {
7114
- url,
7115
- library: _library,
7116
- version: _version,
7117
- signal: _signal,
7118
- ...versionOptions
7119
- } = options;
7120
7271
  const allJobs = await this.getJobs();
7121
7272
  const duplicateJobs = allJobs.filter(
7122
7273
  (job2) => job2.library === library && (job2.version ?? "") === normalizedVersion && // Normalize null to empty string for comparison
@@ -7158,8 +7309,8 @@ class PipelineManager {
7158
7309
  progressMaxPages: 0,
7159
7310
  errorMessage: null,
7160
7311
  updatedAt: /* @__PURE__ */ new Date(),
7161
- sourceUrl: url,
7162
- scraperOptions: versionOptions
7312
+ sourceUrl: options.url,
7313
+ scraperOptions: options
7163
7314
  };
7164
7315
  this.jobMap.set(jobId, job);
7165
7316
  this.jobQueue.push(jobId);
@@ -7174,6 +7325,78 @@ class PipelineManager {
7174
7325
  }
7175
7326
  return jobId;
7176
7327
  }
7328
+ /**
7329
+ * Enqueues a refresh job for an existing library version by re-scraping all pages
7330
+ * and using ETag comparison to skip unchanged content.
7331
+ *
7332
+ * If the version was never completed (interrupted or failed scrape), performs a
7333
+ * full re-scrape from scratch instead of a refresh to ensure completeness.
7334
+ */
7335
+ async enqueueRefreshJob(library, version2) {
7336
+ const normalizedVersion = version2 ?? "";
7337
+ try {
7338
+ const versionId = await this.store.ensureVersion({
7339
+ library,
7340
+ version: normalizedVersion
7341
+ });
7342
+ const versionInfo = await this.store.getVersionById(versionId);
7343
+ if (!versionInfo) {
7344
+ throw new Error(`Version ID ${versionId} not found`);
7345
+ }
7346
+ const libraryInfo = await this.store.getLibraryById(versionInfo.library_id);
7347
+ if (!libraryInfo) {
7348
+ throw new Error(`Library ID ${versionInfo.library_id} not found`);
7349
+ }
7350
+ if (versionInfo && versionInfo.status !== VersionStatus.COMPLETED) {
7351
+ logger.info(
7352
+ `⚠️ Version ${library}@${normalizedVersion || "unversioned"} has status "${versionInfo.status}". Performing full re-scrape instead of refresh.`
7353
+ );
7354
+ return this.enqueueJobWithStoredOptions(library, normalizedVersion);
7355
+ }
7356
+ const pages = await this.store.getPagesByVersionId(versionId);
7357
+ if (pages.length > 0) {
7358
+ logger.debug(
7359
+ `Sample page data: url=${pages[0].url}, etag=${pages[0].etag}, depth=${pages[0].depth}`
7360
+ );
7361
+ }
7362
+ if (pages.length === 0) {
7363
+ throw new Error(
7364
+ `No pages found for ${library}@${normalizedVersion || "unversioned"}. Use scrape_docs to index it first.`
7365
+ );
7366
+ }
7367
+ logger.info(
7368
+ `🔄 Preparing refresh job for ${library}@${normalizedVersion || "unversioned"} with ${pages.length} page(s)`
7369
+ );
7370
+ const initialQueue = pages.map((page) => ({
7371
+ url: page.url,
7372
+ depth: page.depth ?? 0,
7373
+ // Use original depth, fallback to 0 for old data
7374
+ pageId: page.id,
7375
+ etag: page.etag
7376
+ }));
7377
+ const storedOptions = await this.store.getScraperOptions(versionId);
7378
+ const scraperOptions = {
7379
+ url: storedOptions?.sourceUrl || pages[0].url,
7380
+ // Required but not used when initialQueue is set
7381
+ library,
7382
+ version: normalizedVersion,
7383
+ ...storedOptions?.options || {},
7384
+ // Include stored options if available (spread first)
7385
+ // Override with refresh-specific options (these must come after the spread)
7386
+ initialQueue,
7387
+ // Pre-populated queue with existing pages
7388
+ isRefresh: true
7389
+ // Mark this as a refresh operation
7390
+ };
7391
+ logger.info(
7392
+ `📝 Enqueueing refresh job for ${library}@${normalizedVersion || "unversioned"}`
7393
+ );
7394
+ return this.enqueueScrapeJob(library, normalizedVersion, scraperOptions);
7395
+ } catch (error) {
7396
+ logger.error(`❌ Failed to enqueue refresh job: ${error}`);
7397
+ throw error;
7398
+ }
7399
+ }
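(For orientation only, not part of the package diff: a minimal sketch of what the refresh path added above boils down to. Each stored page row becomes an initialQueue entry that keeps its original depth and ETag so unchanged content can be skipped by ETag comparison, and the queue is handed back to enqueueScrapeJob with isRefresh set. It assumes an already-constructed PipelineManager instance named pipeline; the page rows and library/version values are made up.)

    // Hypothetical rows as returned by getPagesByVersionId()
    const pages = [
      { id: 42, url: "https://example.com/docs/intro", etag: '"abc123"', depth: 0 },
      { id: 43, url: "https://example.com/docs/api", etag: '"def456"', depth: 1 },
    ];

    // Mirrors the mapping in enqueueRefreshJob: preserve depth and ETag per page
    const initialQueue = pages.map((page) => ({
      url: page.url,
      depth: page.depth ?? 0, // fall back to 0 for old rows without a depth
      pageId: page.id,
      etag: page.etag,
    }));

    // A refresh is then just a scrape job with a pre-populated queue
    await pipeline.enqueueScrapeJob("example-lib", "1.0.0", {
      url: pages[0].url, // required but not used when initialQueue is set
      library: "example-lib",
      version: "1.0.0",
      initialQueue,
      isRefresh: true,
    });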
7177
7400
  /**
7178
7401
  * Enqueues a job using stored scraper options from a previous indexing run.
7179
7402
  * If no stored options are found, throws an error.
@@ -7201,7 +7424,7 @@ class PipelineManager {
7201
7424
  logger.info(
7202
7425
  `🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${stored.sourceUrl}`
7203
7426
  );
7204
- return this.enqueueJob(library, normalizedVersion, completeOptions);
7427
+ return this.enqueueScrapeJob(library, normalizedVersion, completeOptions);
7205
7428
  } catch (error) {
7206
7429
  logger.error(`❌ Failed to enqueue job with stored options: ${error}`);
7207
7430
  throw error;
@@ -7418,13 +7641,7 @@ class PipelineManager {
7418
7641
  await this.store.updateVersionStatus(versionId, dbStatus, errorMessage);
7419
7642
  if (newStatus === PipelineJobStatus.QUEUED && job.scraperOptions) {
7420
7643
  try {
7421
- const fullOptions = {
7422
- url: job.sourceUrl ?? "",
7423
- library: job.library,
7424
- version: job.version,
7425
- ...job.scraperOptions
7426
- };
7427
- await this.store.storeScraperOptions(versionId, fullOptions);
7644
+ await this.store.storeScraperOptions(versionId, job.scraperOptions);
7428
7645
  logger.debug(
7429
7646
  `Stored scraper options for ${job.library}@${job.version}: ${job.sourceUrl}`
7430
7647
  );
@@ -7882,7 +8099,7 @@ async function createPipelineWithCallbacks(docService, options = {}) {
7882
8099
  },
7883
8100
  onJobError: async (job, error, document2) => {
7884
8101
  logger.warn(
7885
- `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.metadata.url}` : ""}: ${error.message}`
8102
+ `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.url}` : ""}: ${error.message}`
7886
8103
  );
7887
8104
  }
7888
8105
  });
@@ -8113,6 +8330,45 @@ function createMcpServerInstance(tools, readOnly = false) {
8113
8330
  }
8114
8331
  }
8115
8332
  );
8333
+ server.tool(
8334
+ "refresh_version",
8335
+ "Re-scrape a previously indexed library version, updating only changed pages.",
8336
+ {
8337
+ library: z.string().trim().describe("Library name."),
8338
+ version: z.string().trim().optional().describe("Library version (optional, refreshes unversioned if omitted).")
8339
+ },
8340
+ {
8341
+ title: "Refresh Library Version",
8342
+ destructiveHint: false,
8343
+ // Only updates changed content
8344
+ openWorldHint: true
8345
+ // requires internet access
8346
+ },
8347
+ async ({ library, version: version2 }) => {
8348
+ analytics.track(TelemetryEvent.TOOL_USED, {
8349
+ tool: "refresh_version",
8350
+ context: "mcp_server",
8351
+ library,
8352
+ version: version2
8353
+ });
8354
+ try {
8355
+ const result = await tools.refresh.execute({
8356
+ library,
8357
+ version: version2,
8358
+ waitForCompletion: false
8359
+ // Don't wait for completion
8360
+ });
8361
+ if ("jobId" in result) {
8362
+ return createResponse(`🔄 Refresh job started with ID: ${result.jobId}.`);
8363
+ }
8364
+ return createResponse(
8365
+ `Refresh finished immediately (unexpectedly) with ${result.pagesRefreshed} pages.`
8366
+ );
8367
+ } catch (error) {
8368
+ return createError(error);
8369
+ }
8370
+ }
8371
+ );
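(Usage illustration only, not part of the package diff: roughly how an MCP client would invoke the refresh_version tool registered above. The library/version values are made up; the response text is the string produced by the handler when the job is queued.)

    // Hypothetical MCP tools/call request
    const request = {
      method: "tools/call",
      params: {
        name: "refresh_version",
        arguments: { library: "react", version: "18.2.0" }, // version may be omitted
      },
    };
    // Expected text response while the job runs in the background:
    //   "🔄 Refresh job started with ID: <jobId>."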
8116
8372
  }
8117
8373
  server.tool(
8118
8374
  "search_docs",
@@ -8638,7 +8894,7 @@ class FetchUrlTool {
8638
8894
  logger.info("🔄 Processing content...");
8639
8895
  let processed;
8640
8896
  for (const pipeline of this.pipelines) {
8641
- if (pipeline.canProcess(rawContent)) {
8897
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
8642
8898
  processed = await pipeline.process(
8643
8899
  rawContent,
8644
8900
  {
@@ -8673,7 +8929,7 @@ class FetchUrlTool {
8673
8929
  const contentString = convertToString(rawContent.content, resolvedCharset);
8674
8930
  return contentString;
8675
8931
  }
8676
- for (const err of processed.errors) {
8932
+ for (const err of processed.errors ?? []) {
8677
8933
  logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
8678
8934
  }
8679
8935
  if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
@@ -8851,6 +9107,61 @@ class ListLibrariesTool {
8851
9107
  return { libraries };
8852
9108
  }
8853
9109
  }
9110
+ class RefreshVersionTool {
9111
+ pipeline;
9112
+ constructor(pipeline) {
9113
+ this.pipeline = pipeline;
9114
+ }
9115
+ async execute(options) {
9116
+ const { library, version: version2, waitForCompletion = true } = options;
9117
+ let internalVersion;
9118
+ const partialVersionRegex = /^\d+(\.\d+)?$/;
9119
+ if (version2 === null || version2 === void 0) {
9120
+ internalVersion = "";
9121
+ } else {
9122
+ const validFullVersion = semver.valid(version2);
9123
+ if (validFullVersion) {
9124
+ internalVersion = validFullVersion;
9125
+ } else if (partialVersionRegex.test(version2)) {
9126
+ const coercedVersion = semver.coerce(version2);
9127
+ if (coercedVersion) {
9128
+ internalVersion = coercedVersion.version;
9129
+ } else {
9130
+ throw new ValidationError(
9131
+ `Invalid version format for refreshing: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
9132
+ "RefreshVersionTool"
9133
+ );
9134
+ }
9135
+ } else {
9136
+ throw new ValidationError(
9137
+ `Invalid version format for refreshing: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
9138
+ "RefreshVersionTool"
9139
+ );
9140
+ }
9141
+ }
9142
+ internalVersion = internalVersion.toLowerCase();
9143
+ const pipeline = this.pipeline;
9144
+ const refreshVersion = internalVersion === "" ? null : internalVersion;
9145
+ const jobId = await pipeline.enqueueRefreshJob(library, refreshVersion);
9146
+ if (waitForCompletion) {
9147
+ try {
9148
+ await pipeline.waitForJobCompletion(jobId);
9149
+ const finalJob = await pipeline.getJob(jobId);
9150
+ const finalPagesRefreshed = finalJob?.progress?.pagesScraped ?? 0;
9151
+ logger.debug(
9152
+ `Refresh job ${jobId} finished with status ${finalJob?.status}. Pages refreshed: ${finalPagesRefreshed}`
9153
+ );
9154
+ return {
9155
+ pagesRefreshed: finalPagesRefreshed
9156
+ };
9157
+ } catch (error) {
9158
+ logger.error(`❌ Refresh job ${jobId} failed or was cancelled: ${error}`);
9159
+ throw error;
9160
+ }
9161
+ }
9162
+ return { jobId };
9163
+ }
9164
+ }
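(Illustration only, not part of the package diff: the version handling in RefreshVersionTool.execute above accepts full semver strings as-is, coerces partial 'X' or 'X.Y' inputs, and rejects anything else. A rough sketch of the accepted inputs, assuming an instance named tool and a made-up library name.)

    await tool.execute({ library: "example-lib" });                    // omitted -> unversioned ("")
    await tool.execute({ library: "example-lib", version: "18" });     // coerced to "18.0.0"
    await tool.execute({ library: "example-lib", version: "18.2" });   // coerced to "18.2.0"
    await tool.execute({ library: "example-lib", version: "18.2.0" }); // valid semver, used as-is
    // "v18.2" or "latest" -> ValidationError (invalid version format for refreshing)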
8854
9165
  class RemoveTool {
8855
9166
  constructor(documentManagementService, pipeline) {
8856
9167
  this.documentManagementService = documentManagementService;
@@ -8871,19 +9182,7 @@ class RemoveTool {
8871
9182
  }
8872
9183
  logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
8873
9184
  try {
8874
- const result = await this.documentManagementService.findBestVersion(
8875
- library,
8876
- version2
8877
- );
8878
- const normalizedVersion = version2 && version2.trim() !== "" ? version2 : null;
8879
- const versionExists = result.bestMatch === normalizedVersion || result.hasUnversioned && normalizedVersion === null;
8880
- if (!versionExists) {
8881
- const versionText = normalizedVersion ? `Version ${normalizedVersion}` : "Version";
8882
- throw new ToolError(
8883
- `${versionText} not found for library ${library}. Cannot remove non-existent version.`,
8884
- this.constructor.name
8885
- );
8886
- }
9185
+ await this.documentManagementService.validateLibraryExists(library);
8887
9186
  const allJobs = await this.pipeline.getJobs();
8888
9187
  const jobs = allJobs.filter(
8889
9188
  (job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
@@ -8950,7 +9249,7 @@ class ScrapeTool {
8950
9249
  internalVersion = internalVersion.toLowerCase();
8951
9250
  const pipeline = this.pipeline;
8952
9251
  const enqueueVersion = internalVersion === "" ? null : internalVersion;
8953
- const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
9252
+ const jobId = await pipeline.enqueueScrapeJob(library, enqueueVersion, {
8954
9253
  url,
8955
9254
  library,
8956
9255
  version: internalVersion,
@@ -8997,7 +9296,18 @@ class DocumentManagementClient {
8997
9296
  logger.debug(`DocumentManagementClient (tRPC) created for: ${this.baseUrl}`);
8998
9297
  }
8999
9298
  async initialize() {
9000
- await this.client.ping.query();
9299
+ try {
9300
+ await this.client.ping.query();
9301
+ } catch (error) {
9302
+ logger.debug(
9303
+ `Failed to connect to DocumentManagement server at ${this.baseUrl}: ${error}`
9304
+ );
9305
+ throw new Error(
9306
+ `Failed to connect to server at ${this.baseUrl}.
9307
+
9308
+ Please verify the server URL includes the correct port (default 8080) and ends with '/api' (e.g., 'http://localhost:8080/api').`
9309
+ );
9310
+ }
9001
9311
  }
9002
9312
  async shutdown() {
9003
9313
  }
@@ -9069,7 +9379,7 @@ class HierarchicalAssemblyStrategy {
9069
9379
  try {
9070
9380
  const chunksByDocument = /* @__PURE__ */ new Map();
9071
9381
  for (const chunk of initialChunks) {
9072
- const url = chunk.metadata.url;
9382
+ const url = chunk.url;
9073
9383
  if (!chunksByDocument.has(url)) {
9074
9384
  chunksByDocument.set(url, []);
9075
9385
  }
@@ -9163,10 +9473,10 @@ class HierarchicalAssemblyStrategy {
9163
9473
  if (debug) {
9164
9474
  return chunks.map(
9165
9475
  (chunk) => `=== #${chunk.id} ${chunk.metadata.path?.join("/")} [${chunk.metadata.level}] ===
9166
- ` + chunk.pageContent
9476
+ ` + chunk.content
9167
9477
  ).join("");
9168
9478
  }
9169
- return chunks.map((chunk) => chunk.pageContent).join("");
9479
+ return chunks.map((chunk) => chunk.content).join("");
9170
9480
  }
9171
9481
  /**
9172
9482
  * Walks up the parent hierarchy from a chunk to collect the complete parent chain.
@@ -9192,42 +9502,17 @@ class HierarchicalAssemblyStrategy {
9192
9502
  visited.add(currentId);
9193
9503
  chainIds.push(currentId);
9194
9504
  depth++;
9195
- try {
9196
- const parentChunk = await documentStore.findParentChunk(
9505
+ let parentChunk = await documentStore.findParentChunk(library, version2, currentId);
9506
+ if (!parentChunk) {
9507
+ parentChunk = await this.findAncestorWithGaps(
9197
9508
  library,
9198
9509
  version2,
9199
- currentId
9510
+ currentChunk.url,
9511
+ currentChunk.metadata.path ?? [],
9512
+ documentStore
9200
9513
  );
9201
- if (parentChunk) {
9202
- currentChunk = parentChunk;
9203
- } else {
9204
- currentChunk = await this.findAncestorWithGaps(
9205
- library,
9206
- version2,
9207
- currentChunk.metadata,
9208
- documentStore
9209
- );
9210
- }
9211
- } catch (error) {
9212
- try {
9213
- const currentMetadata = currentChunk?.metadata;
9214
- if (currentMetadata) {
9215
- currentChunk = await this.findAncestorWithGaps(
9216
- library,
9217
- version2,
9218
- currentMetadata,
9219
- documentStore
9220
- );
9221
- } else {
9222
- currentChunk = null;
9223
- }
9224
- } catch (gapError) {
9225
- logger.warn(
9226
- `Parent lookup failed for chunk ${currentId}: ${error}. Gap search also failed: ${gapError}`
9227
- );
9228
- break;
9229
- }
9230
9514
  }
9515
+ currentChunk = parentChunk;
9231
9516
  }
9232
9517
  if (depth >= maxDepth) {
9233
9518
  logger.warn(
@@ -9240,9 +9525,7 @@ class HierarchicalAssemblyStrategy {
9240
9525
  * Attempts to find ancestors when there are gaps in the hierarchy.
9241
9526
  * Tries progressively shorter path prefixes to find existing ancestor chunks.
9242
9527
  */
9243
- async findAncestorWithGaps(library, version2, metadata, documentStore) {
9244
- const path2 = metadata.path || [];
9245
- const url = metadata.url;
9528
+ async findAncestorWithGaps(library, version2, url, path2, documentStore) {
9246
9529
  if (path2.length <= 1) {
9247
9530
  return null;
9248
9531
  }
@@ -9279,7 +9562,7 @@ class HierarchicalAssemblyStrategy {
9279
9562
  }
9280
9563
  const matchingChunks = allChunks.filter((chunk) => {
9281
9564
  const chunkPath = chunk.metadata.path || [];
9282
- const chunkUrl = chunk.metadata.url;
9565
+ const chunkUrl = chunk.url;
9283
9566
  if (chunkUrl !== url) return false;
9284
9567
  if (chunkPath.length !== targetPath.length) return false;
9285
9568
  return chunkPath.every((part, index) => part === targetPath[index]);
@@ -9301,11 +9584,7 @@ class HierarchicalAssemblyStrategy {
9301
9584
  return current;
9302
9585
  }
9303
9586
  while (true) {
9304
- const parent = await documentStore.findParentChunk(
9305
- library,
9306
- version2,
9307
- current.id
9308
- );
9587
+ const parent = await documentStore.findParentChunk(library, version2, current.id);
9309
9588
  if (!parent) {
9310
9589
  return null;
9311
9590
  }
@@ -9387,7 +9666,7 @@ class HierarchicalAssemblyStrategy {
9387
9666
  const ancestorChunks = await this.findChunksByExactPath(
9388
9667
  library,
9389
9668
  version2,
9390
- referenceChunk.metadata.url,
9669
+ referenceChunk.url,
9391
9670
  ancestorPath,
9392
9671
  documentStore
9393
9672
  );
@@ -9465,13 +9744,9 @@ class HierarchicalAssemblyStrategy {
9465
9744
  for (const chunk of initialChunks) {
9466
9745
  const id = chunk.id;
9467
9746
  chunkIds.add(id);
9468
- try {
9469
- const parent = await documentStore.findParentChunk(library, version2, id);
9470
- if (parent) {
9471
- chunkIds.add(parent.id);
9472
- }
9473
- } catch (error) {
9474
- logger.warn(`Failed to find parent for chunk ${id}: ${error}`);
9747
+ const parent = await documentStore.findParentChunk(library, version2, id);
9748
+ if (parent) {
9749
+ chunkIds.add(parent.id);
9475
9750
  }
9476
9751
  try {
9477
9752
  const children = await documentStore.findChildChunks(library, version2, id, 3);
@@ -9539,7 +9814,7 @@ class MarkdownAssemblyStrategy {
9539
9814
  * Assembles chunks using simple "\n\n" joining (current behavior).
9540
9815
  */
9541
9816
  assembleContent(chunks) {
9542
- return chunks.map((chunk) => chunk.pageContent).join("\n\n");
9817
+ return chunks.map((chunk) => chunk.content).join("\n\n");
9543
9818
  }
9544
9819
  /**
9545
9820
  * Collects related chunk IDs for a single chunk using current context expansion logic.
@@ -9638,7 +9913,7 @@ class DocumentRetrieverService {
9638
9913
  groupResultsByUrl(results) {
9639
9914
  const resultsByUrl = /* @__PURE__ */ new Map();
9640
9915
  for (const result of results) {
9641
- const url = result.metadata.url;
9916
+ const url = result.url;
9642
9917
  if (!resultsByUrl.has(url)) {
9643
9918
  resultsByUrl.set(url, []);
9644
9919
  }
@@ -9653,10 +9928,8 @@ class DocumentRetrieverService {
9653
9928
  * Processes a group of search results from the same URL using appropriate strategy.
9654
9929
  */
9655
9930
  async processUrlGroup(library, version2, url, initialChunks) {
9656
- const mimeType = initialChunks.length > 0 ? initialChunks[0].metadata.mimeType : void 0;
9657
- const maxScore = Math.max(
9658
- ...initialChunks.map((chunk) => chunk.metadata.score)
9659
- );
9931
+ const mimeType = initialChunks.length > 0 ? initialChunks[0].content_type : void 0;
9932
+ const maxScore = Math.max(...initialChunks.map((chunk) => chunk.score));
9660
9933
  const strategy = createContentAssemblyStrategy(mimeType);
9661
9934
  const selectedChunks = await strategy.selectChunks(
9662
9935
  library,
@@ -9845,7 +10118,7 @@ class DocumentStore {
9845
10118
  prepareStatements() {
9846
10119
  const statements = {
9847
10120
  getById: this.db.prepare(
9848
- `SELECT d.*, p.url, p.title, p.content_type
10121
+ `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type
9849
10122
  FROM documents d
9850
10123
  JOIN pages p ON d.page_id = p.id
9851
10124
  WHERE d.id = ?`
@@ -9858,7 +10131,7 @@ class DocumentStore {
9858
10131
  "UPDATE documents SET embedding = ? WHERE id = ?"
9859
10132
  ),
9860
10133
  insertPage: this.db.prepare(
9861
- "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type) VALUES (?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type"
10134
+ "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type, depth) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type, etag = excluded.etag, last_modified = excluded.last_modified, depth = excluded.depth"
9862
10135
  ),
9863
10136
  getPageId: this.db.prepare(
9864
10137
  "SELECT id FROM pages WHERE version_id = ? AND url = ?"
@@ -9869,12 +10142,13 @@ class DocumentStore {
9869
10142
  getLibraryIdByName: this.db.prepare(
9870
10143
  "SELECT id FROM libraries WHERE name = ?"
9871
10144
  ),
10145
+ getLibraryById: this.db.prepare("SELECT * FROM libraries WHERE id = ?"),
9872
10146
  // New version-related statements
9873
10147
  insertVersion: this.db.prepare(
9874
10148
  "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
9875
10149
  ),
9876
10150
  resolveVersionId: this.db.prepare(
9877
- "SELECT id FROM versions WHERE library_id = ? AND name IS ?"
10151
+ "SELECT id FROM versions WHERE library_id = ? AND name = ?"
9878
10152
  ),
9879
10153
  getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
9880
10154
  queryVersionsByLibraryId: this.db.prepare(
@@ -9889,13 +10163,16 @@ class DocumentStore {
9889
10163
  WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
9890
10164
  )`
9891
10165
  ),
9892
- deleteDocumentsByUrl: this.db.prepare(
9893
- `DELETE FROM documents
9894
- WHERE page_id IN (
9895
- SELECT p.id FROM pages p
9896
- JOIN versions v ON p.version_id = v.id
10166
+ deleteDocumentsByPageId: this.db.prepare(
10167
+ "DELETE FROM documents WHERE page_id = ?"
10168
+ ),
10169
+ deletePage: this.db.prepare("DELETE FROM pages WHERE id = ?"),
10170
+ deletePages: this.db.prepare(
10171
+ `DELETE FROM pages
10172
+ WHERE version_id IN (
10173
+ SELECT v.id FROM versions v
9897
10174
  JOIN libraries l ON v.library_id = l.id
9898
- WHERE p.url = ? AND l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
10175
+ WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
9899
10176
  )`
9900
10177
  ),
9901
10178
  getDocumentBySort: this.db.prepare(
@@ -9945,7 +10222,7 @@ class DocumentStore {
9945
10222
  ORDER BY l.name, version`
9946
10223
  ),
9947
10224
  getChildChunks: this.db.prepare(`
9948
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10225
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9949
10226
  JOIN pages p ON d.page_id = p.id
9950
10227
  JOIN versions v ON p.version_id = v.id
9951
10228
  JOIN libraries l ON v.library_id = l.id
@@ -9959,7 +10236,7 @@ class DocumentStore {
9959
10236
  LIMIT ?
9960
10237
  `),
9961
10238
  getPrecedingSiblings: this.db.prepare(`
9962
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10239
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9963
10240
  JOIN pages p ON d.page_id = p.id
9964
10241
  JOIN versions v ON p.version_id = v.id
9965
10242
  JOIN libraries l ON v.library_id = l.id
@@ -9972,7 +10249,7 @@ class DocumentStore {
9972
10249
  LIMIT ?
9973
10250
  `),
9974
10251
  getSubsequentSiblings: this.db.prepare(`
9975
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10252
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9976
10253
  JOIN pages p ON d.page_id = p.id
9977
10254
  JOIN versions v ON p.version_id = v.id
9978
10255
  JOIN libraries l ON v.library_id = l.id
@@ -9985,7 +10262,7 @@ class DocumentStore {
9985
10262
  LIMIT ?
9986
10263
  `),
9987
10264
  getParentChunk: this.db.prepare(`
9988
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10265
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9989
10266
  JOIN pages p ON d.page_id = p.id
9990
10267
  JOIN versions v ON p.version_id = v.id
9991
10268
  JOIN libraries l ON v.library_id = l.id
@@ -10027,6 +10304,9 @@ class DocumentStore {
10027
10304
  `SELECT v.id, v.library_id FROM versions v
10028
10305
  JOIN libraries l ON v.library_id = l.id
10029
10306
  WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`
10307
+ ),
10308
+ getPagesByVersionId: this.db.prepare(
10309
+ "SELECT * FROM pages WHERE version_id = ?"
10030
10310
  )
10031
10311
  };
10032
10312
  this.statements = statements;
@@ -10168,7 +10448,7 @@ class DocumentStore {
10168
10448
  this.statements.insertVersion.run(libraryId, normalizedVersion);
10169
10449
  const versionIdRow = this.statements.resolveVersionId.get(
10170
10450
  libraryId,
10171
- normalizedVersion === null ? "" : normalizedVersion
10451
+ normalizedVersion
10172
10452
  );
10173
10453
  if (!versionIdRow || typeof versionIdRow.id !== "number") {
10174
10454
  throw new StoreError(
@@ -10230,6 +10510,32 @@ class DocumentStore {
10230
10510
  throw new StoreError(`Failed to get versions by status: ${error}`);
10231
10511
  }
10232
10512
  }
10513
+ /**
10514
+ * Retrieves a version by its ID.
10515
+ * @param versionId The version ID to retrieve
10516
+ * @returns The version record, or null if not found
10517
+ */
10518
+ async getVersionById(versionId) {
10519
+ try {
10520
+ const row = this.statements.getVersionById.get(versionId);
10521
+ return row || null;
10522
+ } catch (error) {
10523
+ throw new StoreError(`Failed to get version by ID: ${error}`);
10524
+ }
10525
+ }
10526
+ /**
10527
+ * Retrieves a library by its ID.
10528
+ * @param libraryId The library ID to retrieve
10529
+ * @returns The library record, or null if not found
10530
+ */
10531
+ async getLibraryById(libraryId) {
10532
+ try {
10533
+ const row = this.statements.getLibraryById.get(libraryId);
10534
+ return row || null;
10535
+ } catch (error) {
10536
+ throw new StoreError(`Failed to get library by ID: ${error}`);
10537
+ }
10538
+ }
10233
10539
  /**
10234
10540
  * Stores scraper options for a version to enable reproducible indexing.
10235
10541
  * @param versionId The version ID to update
@@ -10237,7 +10543,15 @@ class DocumentStore {
10237
10543
  */
10238
10544
  async storeScraperOptions(versionId, options) {
10239
10545
  try {
10240
- const { url: source_url, library, version: version2, signal, ...scraper_options } = options;
10546
+ const {
10547
+ url: source_url,
10548
+ library: _library,
10549
+ version: _version,
10550
+ signal: _signal,
10551
+ initialQueue: _initialQueue,
10552
+ isRefresh: _isRefresh,
10553
+ ...scraper_options
10554
+ } = options;
10241
10555
  const optionsJson = JSON.stringify(scraper_options);
10242
10556
  this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
10243
10557
  } catch (error) {
@@ -10348,36 +10662,96 @@ class DocumentStore {
10348
10662
  throw new ConnectionError("Failed to query library versions", error);
10349
10663
  }
10350
10664
  }
10665
+ /**
10666
+ * Helper method to detect if an error is related to input size limits.
10667
+ * Checks for common error messages from various embedding providers.
10668
+ */
10669
+ isInputSizeError(error) {
10670
+ if (!(error instanceof Error)) return false;
10671
+ const message = error.message.toLowerCase();
10672
+ return message.includes("maximum context length") || message.includes("too long") || message.includes("token limit") || message.includes("input is too large") || message.includes("exceeds") || message.includes("max") && message.includes("token");
10673
+ }
10674
+ /**
10675
+ * Creates embeddings for an array of texts with automatic retry logic for size-related errors.
10676
+ * If a batch fails due to size limits:
10677
+ * - Batches with multiple texts are split in half and retried recursively
10678
+ * - Single texts that are too large are truncated and retried once
10679
+ *
10680
+ * @param texts Array of texts to embed
10681
+ * @returns Array of embedding vectors
10682
+ */
10683
+ async embedDocumentsWithRetry(texts) {
10684
+ if (texts.length === 0) {
10685
+ return [];
10686
+ }
10687
+ try {
10688
+ return await this.embeddings.embedDocuments(texts);
10689
+ } catch (error) {
10690
+ if (this.isInputSizeError(error)) {
10691
+ if (texts.length > 1) {
10692
+ const midpoint = Math.floor(texts.length / 2);
10693
+ const firstHalf = texts.slice(0, midpoint);
10694
+ const secondHalf = texts.slice(midpoint);
10695
+ logger.warn(
10696
+ `⚠️ Batch of ${texts.length} texts exceeded size limit, splitting into ${firstHalf.length} + ${secondHalf.length}`
10697
+ );
10698
+ const [firstEmbeddings, secondEmbeddings] = await Promise.all([
10699
+ this.embedDocumentsWithRetry(firstHalf),
10700
+ this.embedDocumentsWithRetry(secondHalf)
10701
+ ]);
10702
+ return [...firstEmbeddings, ...secondEmbeddings];
10703
+ } else {
10704
+ const text = texts[0];
10705
+ const midpoint = Math.floor(text.length / 2);
10706
+ const firstHalf = text.substring(0, midpoint);
10707
+ logger.warn(
10708
+ `⚠️ Single text exceeded embedding size limit (${text.length} chars). Truncating at ${firstHalf.length} chars.`
10709
+ );
10710
+ try {
10711
+ const embedding = await this.embedDocumentsWithRetry([firstHalf]);
10712
+ logger.info(
10713
+ `✓ Using embedding from first half of split text (${firstHalf.length} chars)`
10714
+ );
10715
+ return embedding;
10716
+ } catch (retryError) {
10717
+ logger.error(
10718
+ `❌ Failed to embed even after splitting. Original length: ${text.length}`
10719
+ );
10720
+ throw retryError;
10721
+ }
10722
+ }
10723
+ }
10724
+ throw error;
10725
+ }
10726
+ }
10351
10727
  /**
10352
10728
  * Stores documents with library and version metadata, generating embeddings
10353
10729
  * for vector similarity search. Uses the new pages table to normalize page-level
10354
10730
  * metadata and avoid duplication across document chunks.
10355
10731
  */
10356
- async addDocuments(library, version2, documents) {
10732
+ async addDocuments(library, version2, depth, result) {
10357
10733
  try {
10358
- if (documents.length === 0) {
10734
+ const { title, url, chunks } = result;
10735
+ if (chunks.length === 0) {
10359
10736
  return;
10360
10737
  }
10361
- const documentsByUrl = /* @__PURE__ */ new Map();
10362
- for (const doc of documents) {
10363
- const url = doc.metadata.url;
10364
- if (!url || typeof url !== "string" || !url.trim()) {
10365
- throw new StoreError("Document metadata must include a valid URL");
10366
- }
10367
- if (!documentsByUrl.has(url)) {
10368
- documentsByUrl.set(url, []);
10369
- }
10370
- documentsByUrl.get(url)?.push(doc);
10371
- }
10372
10738
  let paddedEmbeddings = [];
10373
10739
  if (this.isVectorSearchEnabled) {
10374
- const texts = documents.map((doc) => {
10375
- const header = `<title>${doc.metadata.title}</title>
10376
- <url>${doc.metadata.url}</url>
10377
- <path>${(doc.metadata.path || []).join(" / ")}</path>
10740
+ const texts = chunks.map((chunk) => {
10741
+ const header = `<title>${title}</title>
10742
+ <url>${url}</url>
10743
+ <path>${(chunk.section.path || []).join(" / ")}</path>
10378
10744
  `;
10379
- return `${header}${doc.pageContent}`;
10745
+ return `${header}${chunk.content}`;
10380
10746
  });
10747
+ for (let i = 0; i < texts.length; i++) {
10748
+ const textSize = texts[i].length;
10749
+ if (textSize > SPLITTER_MAX_CHUNK_SIZE) {
10750
+ logger.warn(
10751
+ `⚠️ Chunk ${i + 1}/${texts.length} exceeds max size: ${textSize} > ${SPLITTER_MAX_CHUNK_SIZE} chars (URL: ${url})`
10752
+ );
10753
+ }
10754
+ }
10381
10755
  const maxBatchChars = EMBEDDING_BATCH_CHARS;
10382
10756
  const rawEmbeddings = [];
10383
10757
  let currentBatch = [];
@@ -10390,7 +10764,7 @@ class DocumentStore {
10390
10764
  logger.debug(
10391
10765
  `Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
10392
10766
  );
10393
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
10767
+ const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
10394
10768
  rawEmbeddings.push(...batchEmbeddings);
10395
10769
  currentBatch = [];
10396
10770
  currentBatchSize = 0;
@@ -10402,7 +10776,7 @@ class DocumentStore {
10402
10776
  logger.debug(
10403
10777
  `Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
10404
10778
  );
10405
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
10779
+ const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
10406
10780
  rawEmbeddings.push(...batchEmbeddings);
10407
10781
  currentBatch = [];
10408
10782
  currentBatchSize = 0;
@@ -10413,110 +10787,115 @@ class DocumentStore {
10413
10787
  logger.debug(
10414
10788
  `Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
10415
10789
  );
10416
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
10790
+ const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
10417
10791
  rawEmbeddings.push(...batchEmbeddings);
10418
10792
  }
10419
10793
  paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
10420
10794
  }
10421
10795
  const versionId = await this.resolveVersionId(library, version2);
10422
- for (const url of documentsByUrl.keys()) {
10423
- const deletedCount = await this.deleteDocumentsByUrl(library, version2, url);
10424
- if (deletedCount > 0) {
10425
- logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`);
10426
- }
10427
- }
10428
- const transaction = this.db.transaction((docsByUrl) => {
10429
- const pageIds = /* @__PURE__ */ new Map();
10430
- for (const [url, urlDocs] of docsByUrl) {
10431
- const firstDoc = urlDocs[0];
10432
- const title = firstDoc.metadata.title || "";
10433
- const contentType = firstDoc.metadata.contentType || null;
10434
- this.statements.insertPage.run(
10435
- versionId,
10436
- url,
10437
- title,
10438
- null,
10439
- // etag - will be populated during scraping
10440
- null,
10441
- // last_modified - will be populated during scraping
10442
- contentType
10443
- );
10444
- const existingPage = this.statements.getPageId.get(versionId, url);
10445
- if (!existingPage) {
10446
- throw new StoreError(`Failed to get page ID for URL: ${url}`);
10447
- }
10448
- const pageId = existingPage.id;
10449
- pageIds.set(url, pageId);
10796
+ const existingPage = this.statements.getPageId.get(versionId, url);
10797
+ if (existingPage) {
10798
+ const result2 = this.statements.deleteDocumentsByPageId.run(existingPage.id);
10799
+ if (result2.changes > 0) {
10800
+ logger.debug(`Deleted ${result2.changes} existing documents for URL: ${url}`);
10801
+ }
10802
+ }
10803
+ const transaction = this.db.transaction(() => {
10804
+ const contentType = result.contentType || null;
10805
+ const etag = result.etag || null;
10806
+ const lastModified = result.lastModified || null;
10807
+ this.statements.insertPage.run(
10808
+ versionId,
10809
+ url,
10810
+ title || "",
10811
+ etag,
10812
+ lastModified,
10813
+ contentType,
10814
+ depth
10815
+ );
10816
+ const existingPage2 = this.statements.getPageId.get(versionId, url);
10817
+ if (!existingPage2) {
10818
+ throw new StoreError(`Failed to get page ID for URL: ${url}`);
10450
10819
  }
10820
+ const pageId = existingPage2.id;
10451
10821
  let docIndex = 0;
10452
- for (const [url, urlDocs] of docsByUrl) {
10453
- const pageId = pageIds.get(url);
10454
- if (!pageId) {
10455
- throw new StoreError(`Failed to get page ID for URL: ${url}`);
10456
- }
10457
- for (let i = 0; i < urlDocs.length; i++) {
10458
- const doc = urlDocs[i];
10459
- const {
10460
- url: _,
10461
- title: __,
10462
- library: ___,
10463
- version: ____,
10464
- ...chunkMetadata
10465
- } = doc.metadata;
10466
- const result = this.statements.insertDocument.run(
10467
- pageId,
10468
- doc.pageContent,
10469
- JSON.stringify(chunkMetadata),
10470
- i
10471
- // sort_order within this page
10822
+ for (let i = 0; i < chunks.length; i++) {
10823
+ const chunk = chunks[i];
10824
+ const result2 = this.statements.insertDocument.run(
10825
+ pageId,
10826
+ chunk.content,
10827
+ JSON.stringify({
10828
+ types: chunk.types,
10829
+ level: chunk.section.level,
10830
+ path: chunk.section.path
10831
+ }),
10832
+ i
10833
+ // sort_order within this page
10834
+ );
10835
+ const rowId = result2.lastInsertRowid;
10836
+ if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
10837
+ this.statements.insertEmbedding.run(
10838
+ BigInt(rowId),
10839
+ JSON.stringify(paddedEmbeddings[docIndex])
10472
10840
  );
10473
- const rowId = result.lastInsertRowid;
10474
- if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
10475
- this.statements.insertEmbedding.run(
10476
- BigInt(rowId),
10477
- JSON.stringify(paddedEmbeddings[docIndex])
10478
- );
10479
- }
10480
- docIndex++;
10481
10841
  }
10842
+ docIndex++;
10482
10843
  }
10483
10844
  });
10484
- transaction(documentsByUrl);
10845
+ transaction();
10485
10846
  } catch (error) {
10486
10847
  throw new ConnectionError("Failed to add documents to store", error);
10487
10848
  }
10488
10849
  }
10489
10850
  /**
10490
- * Removes documents matching specified library and version
10851
+ * Removes documents and pages matching specified library and version.
10852
+ * This consolidated method deletes both documents and their associated pages.
10491
10853
  * @returns Number of documents deleted
10492
10854
  */
10493
- async deleteDocuments(library, version2) {
10855
+ async deletePages(library, version2) {
10494
10856
  try {
10495
10857
  const normalizedVersion = version2.toLowerCase();
10496
10858
  const result = this.statements.deleteDocuments.run(
10497
10859
  library.toLowerCase(),
10498
10860
  normalizedVersion
10499
10861
  );
10862
+ this.statements.deletePages.run(library.toLowerCase(), normalizedVersion);
10500
10863
  return result.changes;
10501
10864
  } catch (error) {
10502
10865
  throw new ConnectionError("Failed to delete documents", error);
10503
10866
  }
10504
10867
  }
10505
10868
  /**
10506
- * Removes documents for a specific URL within a library and version
10507
- * @returns Number of documents deleted
10869
+ * Deletes a page and all its associated document chunks.
10870
+ * Performs manual deletion in the correct order to satisfy foreign key constraints:
10871
+ * 1. Delete document chunks (page_id references pages.id)
10872
+ * 2. Delete page record
10873
+ *
10874
+ * This method is used during refresh operations when a page returns 404 Not Found.
10508
10875
  */
10509
- async deleteDocumentsByUrl(library, version2, url) {
10876
+ async deletePage(pageId) {
10510
10877
  try {
10511
- const normalizedVersion = version2.toLowerCase();
10512
- const result = this.statements.deleteDocumentsByUrl.run(
10513
- url,
10514
- library.toLowerCase(),
10515
- normalizedVersion
10516
- );
10517
- return result.changes;
10878
+ const docResult = this.statements.deleteDocumentsByPageId.run(pageId);
10879
+ logger.debug(`Deleted ${docResult.changes} document(s) for page ID ${pageId}`);
10880
+ const pageResult = this.statements.deletePage.run(pageId);
10881
+ if (pageResult.changes > 0) {
10882
+ logger.debug(`Deleted page record for page ID ${pageId}`);
10883
+ }
10884
+ } catch (error) {
10885
+ throw new ConnectionError(`Failed to delete page ${pageId}`, error);
10886
+ }
10887
+ }
10888
+ /**
10889
+ * Retrieves all pages for a specific version ID with their metadata.
10890
+ * Used for refresh operations to get existing pages with their ETags and depths.
10891
+ * @returns Array of page records
10892
+ */
10893
+ async getPagesByVersionId(versionId) {
10894
+ try {
10895
+ const result = this.statements.getPagesByVersionId.all(versionId);
10896
+ return result;
10518
10897
  } catch (error) {
10519
- throw new ConnectionError("Failed to delete documents by URL", error);
10898
+ throw new ConnectionError("Failed to get pages by version ID", error);
10520
10899
  }
10521
10900
  }
10522
10901
  /**
@@ -10539,7 +10918,8 @@ class DocumentStore {
10539
10918
  return { documentsDeleted: 0, versionDeleted: false, libraryDeleted: false };
10540
10919
  }
10541
10920
  const { id: versionId, library_id: libraryId } = versionResult;
10542
- const documentsDeleted = await this.deleteDocuments(library, version2);
10921
+ const documentsDeleted = await this.deletePages(library, version2);
10922
+ this.statements.deletePages.run(normalizedLibrary, normalizedVersion);
10543
10923
  const versionDeleteResult = this.statements.deleteVersionById.run(versionId);
10544
10924
  const versionDeleted = versionDeleteResult.changes > 0;
10545
10925
  let libraryDeleted = false;
@@ -10556,6 +10936,27 @@ class DocumentStore {
10556
10936
  throw new ConnectionError("Failed to remove version", error);
10557
10937
  }
10558
10938
  }
10939
+ /**
10940
+ * Parses the metadata field from a JSON string to an object.
10941
+ * This is necessary because better-sqlite3's json() function returns a string, not an object.
10942
+ */
10943
+ parseMetadata(row) {
10944
+ if (row.metadata && typeof row.metadata === "string") {
10945
+ try {
10946
+ row.metadata = JSON.parse(row.metadata);
10947
+ } catch (error) {
10948
+ logger.warn(`Failed to parse metadata JSON: ${error}`);
10949
+ row.metadata = {};
10950
+ }
10951
+ }
10952
+ return row;
10953
+ }
10954
+ /**
10955
+ * Parses metadata for an array of rows.
10956
+ */
10957
+ parseMetadataArray(rows) {
10958
+ return rows.map((row) => this.parseMetadata(row));
10959
+ }
10559
10960
  /**
10560
10961
  * Retrieves a document by its ID.
10561
10962
  * @param id The ID of the document.
@@ -10563,13 +10964,11 @@ class DocumentStore {
10563
10964
  */
10564
10965
  async getById(id) {
10565
10966
  try {
10566
- const row = this.statements.getById.get(
10567
- BigInt(id)
10568
- );
10967
+ const row = this.statements.getById.get(BigInt(id));
10569
10968
  if (!row) {
10570
10969
  return null;
10571
10970
  }
10572
- return mapDbDocumentToDocument(row);
10971
+ return this.parseMetadata(row);
10573
10972
  } catch (error) {
10574
10973
  throw new ConnectionError(`Failed to get document by ID ${id}`, error);
10575
10974
  }
@@ -10653,26 +11052,20 @@ class DocumentStore {
10653
11052
  );
10654
11053
  const rankedResults = this.assignRanks(rawResults);
10655
11054
  const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
10656
- return topResults.map((row) => ({
10657
- ...mapDbDocumentToDocument({
11055
+ return topResults.map((row) => {
11056
+ const result = {
10658
11057
  ...row,
10659
11058
  url: row.url || "",
10660
11059
  // Ensure url is never undefined
10661
- title: row.title,
10662
- content_type: row.content_type
10663
- }),
10664
- metadata: {
10665
- ...JSON.parse(row.metadata),
10666
- id: row.id,
11060
+ title: row.title || null,
11061
+ content_type: row.content_type || null
11062
+ };
11063
+ return Object.assign(result, {
10667
11064
  score: row.rrf_score,
10668
11065
  vec_rank: row.vec_rank,
10669
- fts_rank: row.fts_rank,
10670
- // Explicitly add page fields if they exist
10671
- url: row.url || "",
10672
- title: row.title || "",
10673
- ...row.content_type && { contentType: row.content_type }
10674
- }
10675
- }));
11066
+ fts_rank: row.fts_rank
11067
+ });
11068
+ });
10676
11069
  } else {
10677
11070
  const stmt = this.db.prepare(`
10678
11071
  SELECT
@@ -10704,28 +11097,21 @@ class DocumentStore {
10704
11097
  ftsQuery,
10705
11098
  limit
10706
11099
  );
10707
- return rawResults.map((row, index) => ({
10708
- ...mapDbDocumentToDocument({
11100
+ return rawResults.map((row, index) => {
11101
+ const result = {
10709
11102
  ...row,
10710
11103
  url: row.url || "",
10711
11104
  // Ensure url is never undefined
10712
- title: row.title,
10713
- content_type: row.content_type
10714
- }),
10715
- metadata: {
10716
- ...JSON.parse(row.metadata),
10717
- id: row.id,
11105
+ title: row.title || null,
11106
+ content_type: row.content_type || null
11107
+ };
11108
+ return Object.assign(result, {
10718
11109
  score: -row.fts_score,
10719
11110
  // Convert BM25 score to positive value for consistency
10720
- fts_rank: index + 1,
11111
+ fts_rank: index + 1
10721
11112
  // Assign rank based on order (1-based)
10722
- // Explicitly ensure vec_rank is not included in FTS-only mode
10723
- // Explicitly add page fields
10724
- url: row.url || "",
10725
- title: row.title || "",
10726
- ...row.content_type && { contentType: row.content_type }
10727
- }
10728
- }));
11113
+ });
11114
+ });
10729
11115
  }
10730
11116
  } catch (error) {
10731
11117
  throw new ConnectionError(
@@ -10744,18 +11130,17 @@ class DocumentStore {
10744
11130
  return [];
10745
11131
  }
10746
11132
  const parentPath = parent.metadata.path ?? [];
10747
- const parentUrl = parent.metadata.url;
10748
11133
  const normalizedVersion = version2.toLowerCase();
10749
11134
  const result = this.statements.getChildChunks.all(
10750
11135
  library.toLowerCase(),
10751
11136
  normalizedVersion,
10752
- parentUrl,
11137
+ parent.url,
10753
11138
  parentPath.length + 1,
10754
11139
  JSON.stringify(parentPath),
10755
11140
  BigInt(id),
10756
11141
  limit
10757
11142
  );
10758
- return result.map((row) => mapDbDocumentToDocument(row));
11143
+ return this.parseMetadataArray(result);
10759
11144
  } catch (error) {
10760
11145
  throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
10761
11146
  }
@@ -10769,17 +11154,16 @@ class DocumentStore {
10769
11154
  if (!reference) {
10770
11155
  return [];
10771
11156
  }
10772
- const refMetadata = reference.metadata;
10773
11157
  const normalizedVersion = version2.toLowerCase();
10774
11158
  const result = this.statements.getPrecedingSiblings.all(
10775
11159
  library.toLowerCase(),
10776
11160
  normalizedVersion,
10777
- refMetadata.url,
11161
+ reference.url,
10778
11162
  BigInt(id),
10779
- JSON.stringify(refMetadata.path),
11163
+ JSON.stringify(reference.metadata.path),
10780
11164
  limit
10781
11165
  );
10782
- return result.reverse().map((row) => mapDbDocumentToDocument(row));
11166
+ return this.parseMetadataArray(result).reverse();
10783
11167
  } catch (error) {
10784
11168
  throw new ConnectionError(
10785
11169
  `Failed to find preceding sibling chunks for ID ${id}`,
@@ -10796,17 +11180,16 @@ class DocumentStore {
10796
11180
  if (!reference) {
10797
11181
  return [];
10798
11182
  }
10799
- const refMetadata = reference.metadata;
10800
11183
  const normalizedVersion = version2.toLowerCase();
10801
11184
  const result = this.statements.getSubsequentSiblings.all(
10802
11185
  library.toLowerCase(),
10803
11186
  normalizedVersion,
10804
- refMetadata.url,
11187
+ reference.url,
10805
11188
  BigInt(id),
10806
- JSON.stringify(refMetadata.path),
11189
+ JSON.stringify(reference.metadata.path),
10807
11190
  limit
10808
11191
  );
10809
- return result.map((row) => mapDbDocumentToDocument(row));
11192
+ return this.parseMetadataArray(result);
10810
11193
  } catch (error) {
10811
11194
  throw new ConnectionError(
10812
11195
  `Failed to find subsequent sibling chunks for ID ${id}`,
@@ -10816,6 +11199,8 @@ class DocumentStore {
10816
11199
  }
10817
11200
  /**
10818
11201
  * Finds the parent chunk of a given document.
11202
+ * Returns null if no parent is found or if there's a database error.
11203
+ * Database errors are logged but not thrown to maintain consistent behavior.
10819
11204
  */
10820
11205
  async findParentChunk(library, version2, id) {
10821
11206
  try {
@@ -10823,8 +11208,7 @@ class DocumentStore {
10823
11208
  if (!child) {
10824
11209
  return null;
10825
11210
  }
10826
- const childMetadata = child.metadata;
10827
- const path2 = childMetadata.path ?? [];
11211
+ const path2 = child.metadata.path ?? [];
10828
11212
  const parentPath = path2.slice(0, -1);
10829
11213
  if (parentPath.length === 0) {
10830
11214
  return null;
@@ -10833,21 +11217,22 @@ class DocumentStore {
10833
11217
  const result = this.statements.getParentChunk.get(
10834
11218
  library.toLowerCase(),
10835
11219
  normalizedVersion,
10836
- childMetadata.url,
11220
+ child.url,
10837
11221
  JSON.stringify(parentPath),
10838
11222
  BigInt(id)
10839
11223
  );
10840
11224
  if (!result) {
10841
11225
  return null;
10842
11226
  }
10843
- return mapDbDocumentToDocument(result);
11227
+ return this.parseMetadata(result);
10844
11228
  } catch (error) {
10845
- throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
11229
+ logger.warn(`Failed to find parent chunk for ID ${id}: ${error}`);
11230
+ return null;
10846
11231
  }
10847
11232
  }
10848
11233
  /**
10849
11234
  * Fetches multiple documents by their IDs in a single call.
10850
- * Returns an array of Document objects, sorted by their sort_order.
11235
+ * Returns an array of DbPageChunk objects, sorted by their sort_order.
10851
11236
  */
10852
11237
  async findChunksByIds(library, version2, ids) {
10853
11238
  if (!ids.length) return [];
@@ -10855,7 +11240,7 @@ class DocumentStore {
10855
11240
  const normalizedVersion = version2.toLowerCase();
10856
11241
  const placeholders = ids.map(() => "?").join(",");
10857
11242
  const stmt = this.db.prepare(
10858
- `SELECT d.*, p.url, p.title, p.content_type FROM documents d
11243
+ `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
10859
11244
  JOIN pages p ON d.page_id = p.id
10860
11245
  JOIN versions v ON p.version_id = v.id
10861
11246
  JOIN libraries l ON v.library_id = l.id
@@ -10869,20 +11254,20 @@ class DocumentStore {
10869
11254
  normalizedVersion,
10870
11255
  ...ids
10871
11256
  );
10872
- return rows.map((row) => mapDbDocumentToDocument(row));
11257
+ return this.parseMetadataArray(rows);
10873
11258
  } catch (error) {
10874
11259
  throw new ConnectionError("Failed to fetch documents by IDs", error);
10875
11260
  }
10876
11261
  }
10877
11262
  /**
10878
11263
  * Fetches all document chunks for a specific URL within a library and version.
10879
- * Returns documents sorted by their sort_order for proper reassembly.
11264
+ * Returns DbPageChunk objects sorted by their sort_order for proper reassembly.
10880
11265
  */
10881
11266
  async findChunksByUrl(library, version2, url) {
10882
11267
  try {
10883
11268
  const normalizedVersion = version2.toLowerCase();
10884
11269
  const stmt = this.db.prepare(
10885
- `SELECT d.*, p.url, p.title, p.content_type FROM documents d
11270
+ `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
10886
11271
  JOIN pages p ON d.page_id = p.id
10887
11272
  JOIN versions v ON p.version_id = v.id
10888
11273
  JOIN libraries l ON v.library_id = l.id
@@ -10896,7 +11281,7 @@ class DocumentStore {
10896
11281
  normalizedVersion,
10897
11282
  url
10898
11283
  );
10899
- return rows.map((row) => mapDbDocumentToDocument(row));
11284
+ return this.parseMetadataArray(rows);
10900
11285
  } catch (error) {
10901
11286
  throw new ConnectionError(`Failed to fetch documents by URL ${url}`, error);
10902
11287
  }
@@ -10914,9 +11299,8 @@ class DocumentManagementService {
10914
11299
  return (version2 ?? "").toLowerCase();
10915
11300
  }
10916
11301
  constructor(storePath, embeddingConfig, pipelineConfig) {
10917
- const dbDir = storePath;
10918
- const dbPath = path.join(dbDir, "documents.db");
10919
- logger.debug(`Using database directory: ${dbDir}`);
11302
+ const dbPath = storePath === ":memory:" ? ":memory:" : path.join(storePath, "documents.db");
11303
+ logger.debug(`Using database path: ${dbPath}`);
10920
11304
  this.store = new DocumentStore(dbPath, embeddingConfig);
10921
11305
  this.documentRetriever = new DocumentRetrieverService(this.store);
10922
11306
  this.pipelines = PipelineFactory$1.createStandardPipelines(pipelineConfig);
@@ -11127,9 +11511,24 @@ class DocumentManagementService {
11127
11511
  logger.info(
11128
11512
  `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
11129
11513
  );
11130
- const count = await this.store.deleteDocuments(library, normalizedVersion);
11514
+ const count = await this.store.deletePages(library, normalizedVersion);
11131
11515
  logger.info(`🗑️ Deleted ${count} documents`);
11132
11516
  }
11517
+ /**
11518
+ * Deletes a page and all its associated document chunks.
11519
+ * This is used during refresh operations when a page returns 404 Not Found.
11520
+ */
11521
+ async deletePage(pageId) {
11522
+ logger.debug(`Deleting page ID: ${pageId}`);
11523
+ await this.store.deletePage(pageId);
11524
+ }
11525
+ /**
11526
+ * Retrieves all pages for a specific version ID with their metadata.
11527
+ * Used for refresh operations to get existing pages with their ETags and depths.
11528
+ */
11529
+ async getPagesByVersionId(versionId) {
11530
+ return this.store.getPagesByVersionId(versionId);
11531
+ }
11133
11532
  /**
11134
11533
  * Completely removes a library version and all associated documents.
11135
11534
  * Also removes the library if no other versions remain.
@@ -11138,15 +11537,13 @@ class DocumentManagementService {
11138
11537
  */
11139
11538
  async removeVersion(library, version2) {
11140
11539
  const normalizedVersion = this.normalizeVersion(version2);
11141
- logger.info(`🗑️ Removing version: ${library}@${normalizedVersion || "[no version]"}`);
11540
+ logger.debug(`Removing version: ${library}@${normalizedVersion || "[no version]"}`);
11142
11541
  const result = await this.store.removeVersion(library, normalizedVersion, true);
11143
- logger.info(
11144
- `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`
11145
- );
11542
+ logger.info(`🗑️ Removed ${result.documentsDeleted} documents`);
11146
11543
  if (result.versionDeleted && result.libraryDeleted) {
11147
- logger.info(`✅ Completely removed library ${library} (was last version)`);
11544
+ logger.info(`🗑️ Completely removed library ${library} (was last version)`);
11148
11545
  } else if (result.versionDeleted) {
11149
- logger.info(`✅ Removed version ${library}@${normalizedVersion || "[no version]"}`);
11546
+ logger.info(`🗑️ Removed version ${library}@${normalizedVersion || "[no version]"}`);
11150
11547
  } else {
11151
11548
  logger.warn(
11152
11549
  `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`
@@ -11154,91 +11551,68 @@ class DocumentManagementService {
11154
11551
  }
11155
11552
  }
11156
11553
  /**
11157
- * Adds a document to the store, splitting it into smaller chunks for better search results.
11158
- * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
11159
- * Preserves hierarchical structure of documents and distinguishes between text and code segments.
11160
- * If version is omitted, the document is added without a specific version.
11554
+ * Adds pre-processed content directly to the store.
11555
+ * This method is used when content has already been processed by a pipeline,
11556
+ * avoiding redundant processing. Used primarily by the scraping pipeline.
11557
+ *
11558
+ * @param library Library name
11559
+ * @param version Version string (null/undefined for unversioned)
11560
+ * @param processed Pre-processed content with chunks already created
11561
+ * @param pageId Optional page ID for refresh operations
11161
11562
  */
11162
- async addDocument(library, version2, document2) {
11563
+ async addScrapeResult(library, version2, depth, result) {
11163
11564
  const processingStart = performance.now();
11164
11565
  const normalizedVersion = this.normalizeVersion(version2);
11165
- const url = document2.metadata.url;
11166
- if (!url || typeof url !== "string" || !url.trim()) {
11167
- throw new StoreError("Document metadata must include a valid URL");
11566
+ const { url, title, chunks, contentType } = result;
11567
+ if (!url) {
11568
+ throw new StoreError("Processed content metadata must include a valid URL");
11168
11569
  }
11169
- logger.info(`📚 Adding document: ${document2.metadata.title}`);
11170
- if (!document2.pageContent.trim()) {
11171
- throw new Error("Document content cannot be empty");
11570
+ logger.info(`📚 Adding processed content: ${title || url}`);
11571
+ if (chunks.length === 0) {
11572
+ logger.warn(`⚠️ No chunks in processed content for ${url}. Skipping.`);
11573
+ return;
11172
11574
  }
11173
- const contentType = document2.metadata.mimeType;
11174
11575
  try {
11175
- const rawContent = {
11176
- source: url,
11177
- content: document2.pageContent,
11178
- mimeType: contentType || "text/plain"
11179
- };
11180
- const pipeline = this.pipelines.find((p) => p.canProcess(rawContent));
11181
- if (!pipeline) {
11182
- logger.warn(
11183
- `⚠️ Unsupported content type "${rawContent.mimeType}" for document ${url}. Skipping processing.`
11184
- );
11185
- return;
11186
- }
11187
- logger.debug(
11188
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`
11189
- );
11190
- const scraperOptions = {
11191
- url,
11192
- library,
11193
- version: normalizedVersion,
11194
- scrapeMode: ScrapeMode.Fetch,
11195
- ignoreErrors: false,
11196
- maxConcurrency: 1
11197
- };
11198
- const processed = await pipeline.process(rawContent, scraperOptions);
11199
- const chunks = processed.chunks;
11200
- const splitDocs = chunks.map((chunk) => ({
11201
- pageContent: chunk.content,
11202
- metadata: {
11203
- ...document2.metadata,
11204
- level: chunk.section.level,
11205
- path: chunk.section.path
11206
- }
11207
- }));
11208
- logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
11209
- await this.store.addDocuments(library, normalizedVersion, splitDocs);
11576
+ logger.info(`✂️ Storing ${chunks.length} pre-split chunks`);
11577
+ await this.store.addDocuments(library, normalizedVersion, depth, result);
11210
11578
  const processingTime = performance.now() - processingStart;
11579
+ const totalContentSize = chunks.reduce(
11580
+ (sum, chunk) => sum + chunk.content.length,
11581
+ 0
11582
+ );
11211
11583
  analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, {
11212
11584
  // Content characteristics (privacy-safe)
11213
- mimeType: contentType || "unknown",
11214
- contentSizeBytes: document2.pageContent.length,
11585
+ mimeType: contentType,
11586
+ contentSizeBytes: totalContentSize,
11215
11587
  // Processing metrics
11216
11588
  processingTimeMs: Math.round(processingTime),
11217
- chunksCreated: splitDocs.length,
11589
+ chunksCreated: chunks.length,
11218
11590
  // Document characteristics
11219
- hasTitle: !!document2.metadata.title,
11220
- hasDescription: !!document2.metadata.description,
11591
+ hasTitle: !!title,
11221
11592
  urlDomain: extractHostname(url),
11222
- depth: document2.metadata.depth,
11593
+ depth,
11223
11594
  // Library context
11224
11595
  library,
11225
11596
  libraryVersion: normalizedVersion || null,
11226
11597
  // Processing efficiency
11227
- avgChunkSizeBytes: Math.round(document2.pageContent.length / splitDocs.length),
11598
+ avgChunkSizeBytes: Math.round(totalContentSize / chunks.length),
11228
11599
  processingSpeedKbPerSec: Math.round(
11229
- document2.pageContent.length / 1024 / (processingTime / 1e3)
11600
+ totalContentSize / 1024 / (processingTime / 1e3)
11230
11601
  )
11231
11602
  });
11232
11603
  } catch (error) {
11233
11604
  const processingTime = performance.now() - processingStart;
11234
11605
  if (error instanceof Error) {
11235
11606
  analytics.captureException(error, {
11236
- mimeType: contentType || "unknown",
11237
- contentSizeBytes: document2.pageContent.length,
11607
+ mimeType: contentType,
11608
+ contentSizeBytes: chunks.reduce(
11609
+ (sum, chunk) => sum + chunk.content.length,
11610
+ 0
11611
+ ),
11238
11612
  processingTimeMs: Math.round(processingTime),
11239
11613
  library,
11240
11614
  libraryVersion: normalizedVersion || null,
11241
- context: "document_processing",
11615
+ context: "processed_content_storage",
11242
11616
  component: DocumentManagementService.constructor.name
11243
11617
  });
11244
11618
  }
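For orientation, here is a minimal sketch of how the new addScrapeResult() API might be called (an editor's illustration, not part of the published package). The result shape is inferred from the destructuring above ({ url, title, chunks, contentType }); docService stands in for a DocumentManagementService instance, and every literal value is made up.

// Hedged sketch: storing a page that has already been split by a scraper
// pipeline. Only the fields actually read by addScrapeResult are shown;
// chunk objects may carry additional metadata in practice.
const result = {
  url: "https://example.com/docs/getting-started",
  title: "Getting Started",
  contentType: "text/markdown",
  chunks: [
    { content: "# Getting Started\n\nInstall the package and start the server." },
    { content: "## Configuration\n\nSet DOCS_MCP_EMBEDDING_MODEL before indexing." },
  ],
};
// depth is the crawl depth at which the page was found (0 for the start URL).
await docService.addScrapeResult("example-lib", "1.0.0", 0, result);

Note that a result with an empty chunks array is logged as a warning and skipped rather than treated as an error.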
@@ -11268,6 +11642,18 @@ class DocumentManagementService {
11268
11642
  );
11269
11643
  return versionId;
11270
11644
  }
11645
+ /**
11646
+ * Retrieves a version by its ID from the database.
11647
+ */
11648
+ async getVersionById(versionId) {
11649
+ return this.store.getVersionById(versionId);
11650
+ }
11651
+ /**
11652
+ * Retrieves a library by its ID from the database.
11653
+ */
11654
+ async getLibraryById(libraryId) {
11655
+ return this.store.getLibraryById(libraryId);
11656
+ }
11271
11657
  }
11272
11658
  async function createDocumentManagement(options = {}) {
11273
11659
  if (options.serverUrl) {
@@ -11359,6 +11745,7 @@ async function initializeTools(docService, pipeline) {
11359
11745
  listLibraries: new ListLibrariesTool(docService),
11360
11746
  findVersion: new FindVersionTool(docService),
11361
11747
  scrape: new ScrapeTool(pipeline),
11748
+ refresh: new RefreshVersionTool(pipeline),
11362
11749
  search: new SearchTool(docService),
11363
11750
  listJobs: new ListJobsTool(pipeline),
11364
11751
  getJobInfo: new GetJobInfoTool(pipeline),
@@ -11471,11 +11858,15 @@ const optionalTrimmed = z$1.preprocess(
11471
11858
  (v) => typeof v === "string" ? v.trim() : v,
11472
11859
  z$1.string().min(1).optional().nullable()
11473
11860
  );
11474
- const enqueueInput = z$1.object({
11861
+ const enqueueScrapeInput = z$1.object({
11475
11862
  library: nonEmptyTrimmed,
11476
11863
  version: optionalTrimmed,
11477
11864
  options: z$1.custom()
11478
11865
  });
11866
+ const enqueueRefreshInput = z$1.object({
11867
+ library: nonEmptyTrimmed,
11868
+ version: optionalTrimmed
11869
+ });
11479
11870
  const jobIdInput = z$1.object({ id: z$1.string().min(1) });
11480
11871
  const getJobsInput = z$1.object({
11481
11872
  status: z$1.nativeEnum(PipelineJobStatus).optional()
@@ -11483,12 +11874,12 @@ const getJobsInput = z$1.object({
11483
11874
  function createPipelineRouter(trpc) {
11484
11875
  const tt = trpc;
11485
11876
  return tt.router({
11486
- enqueueJob: tt.procedure.input(enqueueInput).mutation(
11877
+ enqueueScrapeJob: tt.procedure.input(enqueueScrapeInput).mutation(
11487
11878
  async ({
11488
11879
  ctx,
11489
11880
  input
11490
11881
  }) => {
11491
- const jobId = await ctx.pipeline.enqueueJob(
11882
+ const jobId = await ctx.pipeline.enqueueScrapeJob(
11492
11883
  input.library,
11493
11884
  input.version ?? null,
11494
11885
  input.options
@@ -11508,6 +11899,18 @@ function createPipelineRouter(trpc) {
11508
11899
  return { jobId };
11509
11900
  }
11510
11901
  ),
11902
+ enqueueRefreshJob: tt.procedure.input(enqueueRefreshInput).mutation(
11903
+ async ({
11904
+ ctx,
11905
+ input
11906
+ }) => {
11907
+ const jobId = await ctx.pipeline.enqueueRefreshJob(
11908
+ input.library,
11909
+ input.version ?? null
11910
+ );
11911
+ return { jobId };
11912
+ }
11913
+ ),
11511
11914
  getJob: tt.procedure.input(jobIdInput).query(
11512
11915
  async ({
11513
11916
  ctx,
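A hedged client-side sketch of the new refresh procedure (editor's illustration, not part of the diff). It assumes a standard tRPC client created against createPipelineRouter and mounted at the client root; the client setup itself is omitted.

// Hedged sketch: enqueue a refresh for an already-indexed library/version,
// then look up the job by id. Unlike enqueueScrapeJob, no options object is
// accepted; only library and version are part of the input schema.
const { jobId } = await client.enqueueRefreshJob.mutate({
  library: "react",
  version: "18.0.0", // optional; omit or pass null to target the unversioned entry
});
const job = await client.getJob.query({ id: jobId });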
@@ -13447,7 +13850,7 @@ async function registerWorkerService(pipeline) {
13447
13850
  },
13448
13851
  onJobError: async (job, error, document2) => {
13449
13852
  logger.warn(
13450
- `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.metadata.url}` : ""}: ${error.message}`
13853
+ `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.url}` : ""}: ${error.message}`
13451
13854
  );
13452
13855
  analytics.captureException(error, {
13453
13856
  jobId: job.id,
@@ -13987,7 +14390,7 @@ async function findVersionAction(library, options, command) {
13987
14390
  function createFindVersionCommand(program) {
13988
14391
  return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").option(
13989
14392
  "--server-url <url>",
13990
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14393
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
13991
14394
  ).action(findVersionAction);
13992
14395
  }
13993
14396
  async function listAction(options, command) {
@@ -14013,7 +14416,7 @@ async function listAction(options, command) {
14013
14416
  function createListCommand(program) {
14014
14417
  return program.command("list").description("List all available libraries and their versions").option(
14015
14418
  "--server-url <url>",
14016
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14419
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14017
14420
  ).action(listAction);
14018
14421
  }
14019
14422
  function createMcpCommand(program) {
@@ -14036,7 +14439,7 @@ function createMcpCommand(program) {
14036
14439
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14037
14440
  ).option(
14038
14441
  "--server-url <url>",
14039
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14442
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14040
14443
  ).option(
14041
14444
  "--read-only",
14042
14445
  "Run in read-only mode (only expose read tools, disable write/job tools)",
@@ -14160,6 +14563,68 @@ function createMcpCommand(program) {
14160
14563
  }
14161
14564
  );
14162
14565
  }
14566
+ async function refreshAction(library, options, command) {
14567
+ await analytics.track(TelemetryEvent.CLI_COMMAND, {
14568
+ command: "refresh",
14569
+ library,
14570
+ version: options.version,
14571
+ useServerUrl: !!options.serverUrl
14572
+ });
14573
+ const serverUrl = options.serverUrl;
14574
+ const globalOptions = getGlobalOptions(command);
14575
+ const embeddingConfig = resolveEmbeddingContext(options.embeddingModel);
14576
+ if (!serverUrl && !embeddingConfig) {
14577
+ throw new Error(
14578
+ "Embedding configuration is required for local refresh operations. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
14579
+ );
14580
+ }
14581
+ const docService = await createDocumentManagement({
14582
+ serverUrl,
14583
+ embeddingConfig,
14584
+ storePath: globalOptions.storePath
14585
+ });
14586
+ let pipeline = null;
14587
+ try {
14588
+ const pipelineOptions = {
14589
+ recoverJobs: false,
14590
+ concurrency: 1,
14591
+ serverUrl
14592
+ };
14593
+ pipeline = await createPipelineWithCallbacks(
14594
+ serverUrl ? void 0 : docService,
14595
+ pipelineOptions
14596
+ );
14597
+ await pipeline.start();
14598
+ const refreshTool = new RefreshVersionTool(pipeline);
14599
+ const result = await refreshTool.execute({
14600
+ library,
14601
+ version: options.version,
14602
+ waitForCompletion: true
14603
+ // Always wait for completion in CLI
14604
+ });
14605
+ if ("pagesRefreshed" in result) {
14606
+ console.log(`✅ Successfully refreshed ${result.pagesRefreshed} pages`);
14607
+ } else {
14608
+ console.log(`🚀 Refresh job started with ID: ${result.jobId}`);
14609
+ }
14610
+ } finally {
14611
+ if (pipeline) await pipeline.stop();
14612
+ await docService.shutdown();
14613
+ }
14614
+ }
14615
+ function createRefreshCommand(program) {
14616
+ return program.command("refresh <library>").description(
14617
+ "Re-scrape an existing library version, updating only changed pages.\n\nUses HTTP ETags to efficiently skip unchanged pages and only re-process\ncontent that has been modified or deleted since the last scrape.\n\nExamples:\n refresh react --version 18.0.0\n refresh mylib\n\nNote: The library and version must already be indexed. Use 'scrape' to index a new library/version."
14618
+ ).option("-v, --version <string>", "Version of the library (optional)").addOption(
14619
+ new Option(
14620
+ "--embedding-model <model>",
14621
+ "Embedding model configuration (e.g., 'openai:text-embedding-3-small')"
14622
+ ).env("DOCS_MCP_EMBEDDING_MODEL")
14623
+ ).option(
14624
+ "--server-url <url>",
14625
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14626
+ ).action(refreshAction);
14627
+ }
14163
14628
  async function removeAction(library, options, command) {
14164
14629
  await analytics.track(TelemetryEvent.CLI_COMMAND, {
14165
14630
  command: "remove",
@@ -14194,7 +14659,7 @@ function createRemoveCommand(program) {
14194
14659
  "Version to remove (optional, removes unversioned if omitted)"
14195
14660
  ).option(
14196
14661
  "--server-url <url>",
14197
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14662
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14198
14663
  ).action(removeAction);
14199
14664
  }
14200
14665
  async function scrapeAction(library, url, options, command) {
@@ -14334,7 +14799,7 @@ function createScrapeCommand(program) {
14334
14799
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14335
14800
  ).option(
14336
14801
  "--server-url <url>",
14337
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14802
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14338
14803
  ).action(scrapeAction);
14339
14804
  }
14340
14805
  async function searchAction(library, query, options, command) {
@@ -14387,7 +14852,7 @@ function createSearchCommand(program) {
14387
14852
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14388
14853
  ).option(
14389
14854
  "--server-url <url>",
14390
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14855
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14391
14856
  ).action(searchAction);
14392
14857
  }
14393
14858
  function createWebCommand(program) {
@@ -14408,7 +14873,7 @@ function createWebCommand(program) {
14408
14873
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14409
14874
  ).option(
14410
14875
  "--server-url <url>",
14411
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14876
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14412
14877
  ).action(
14413
14878
  async (cmdOptions, command) => {
14414
14879
  await analytics.track(TelemetryEvent.CLI_COMMAND, {
@@ -14603,6 +15068,7 @@ function createCliProgram() {
14603
15068
  createWebCommand(program);
14604
15069
  createWorkerCommand(program);
14605
15070
  createScrapeCommand(program);
15071
+ createRefreshCommand(program);
14606
15072
  createSearchCommand(program);
14607
15073
  createListCommand(program);
14608
15074
  createFindVersionCommand(program);