@arabold/docs-mcp-server 1.26.2 → 1.27.1

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
package/dist/index.js CHANGED
@@ -6,7 +6,7 @@ import { VertexAIEmbeddings } from "@langchain/google-vertexai";
6
6
  import { AzureOpenAIEmbeddings, OpenAIEmbeddings } from "@langchain/openai";
7
7
  import { Embeddings } from "@langchain/core/embeddings";
8
8
  import { PostHog } from "posthog-node";
9
- import { randomUUID } from "node:crypto";
9
+ import crypto, { randomUUID } from "node:crypto";
10
10
  import fs, { existsSync, readFileSync } from "node:fs";
11
11
  import path from "node:path";
12
12
  import { fileURLToPath, URL as URL$1 } from "node:url";
@@ -27,6 +27,7 @@ import psl from "psl";
27
27
  import { HeaderGenerator } from "header-generator";
28
28
  import fs$1 from "node:fs/promises";
29
29
  import axios from "axios";
30
+ import { minimatch } from "minimatch";
30
31
  import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
31
32
  import remarkGfm from "remark-gfm";
32
33
  import remarkHtml from "remark-html";
@@ -40,7 +41,6 @@ import * as cheerio from "cheerio";
40
41
  import "node:vm";
41
42
  import { gfm } from "@joplin/turndown-plugin-gfm";
42
43
  import iconv from "iconv-lite";
43
- import { minimatch } from "minimatch";
44
44
  import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
45
45
  import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
46
46
  import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
@@ -113,21 +113,6 @@ class MissingCredentialsError extends StoreError {
113
113
  }
114
114
  }
115
115
  const VECTOR_DIMENSION = 1536;
116
- function mapDbDocumentToDocument(doc) {
117
- const chunkMetadata = JSON.parse(doc.metadata);
118
- return {
119
- id: doc.id,
120
- pageContent: doc.content,
121
- metadata: {
122
- ...chunkMetadata,
123
- // Page-level fields are always available from joined queries
124
- url: doc.url,
125
- title: doc.title || "",
126
- // Convert null to empty string for consistency
127
- ...doc.content_type && { contentType: doc.content_type }
128
- }
129
- };
130
- }
131
116
  var VersionStatus = /* @__PURE__ */ ((VersionStatus2) => {
132
117
  VersionStatus2["NOT_INDEXED"] = "not_indexed";
133
118
  VersionStatus2["QUEUED"] = "queued";
@@ -784,16 +769,16 @@ function extractProtocol(urlOrPath) {
784
769
  }
785
770
  }
786
771
  const name = "@arabold/docs-mcp-server";
787
- const version = "1.26.1";
772
+ const version = "1.27.0";
788
773
  const description = "MCP server for fetching and searching documentation";
789
774
  const type = "module";
790
775
  const bin = { "docs-mcp-server": "dist/index.js" };
791
776
  const license = "MIT";
792
777
  const repository = { "type": "git", "url": "git+https://github.com/arabold/docs-mcp-server.git" };
793
778
  const files = ["dist", "public", "db", "README.md", "LICENSE", "package.json"];
794
- const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev": "npm-run-all --parallel dev:server dev:web", "dev:server": "vite-node --watch src/index.ts", "dev:web": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "test:e2e": "vitest run --config test/vitest.config.ts", "test:e2e:watch": "vitest --config test/vitest.config.ts", "lint": "biome check .", "lint:fix": "biome check . --fix", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. See README.md for details.'" };
795
- const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.2.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", "@langchain/aws": "^0.1.13", "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", "@modelcontextprotocol/sdk": "^1.17.1", "@trpc/client": "^11.4.4", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", "axios": "^1.11.0", "axios-retry": "^4.5.0", "better-sqlite3": "^12.2.0", "cheerio": "^1.1.2", "commander": "^14.0.0", "dompurify": "^3.2.6", "dotenv": "^17.2.1", "env-paths": "^3.0.0", "fastify": "^5.4.0", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", "header-generator": "^2.1.69", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", "jose": "^6.0.12", "jsdom": "^26.1.0", "langchain": "^0.3.30", "mime": "^4.0.7", "minimatch": "^10.0.1", "playwright": "^1.52.0", "posthog-node": "^5.7.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", "semver": "^7.7.2", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", "turndown": "^7.2.0", "zod": "^4.0.14" };
796
- const devDependencies = { "@biomejs/biome": "^2.1.3", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^11.0.3", "@semantic-release/npm": "^12.0.2", "@tailwindcss/postcss": "^4.1.11", "@tailwindcss/vite": "^4.1.11", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", "@types/lint-staged": "~13.3.0", "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", "@types/semver": "^7.7.0", "@types/turndown": "^5.0.5", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", "lint-staged": "^16.1.2", "memfs": "^4.34.0", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", "semantic-release": "^24.2.7", "tailwindcss": "^4.1.4", "typescript": "^5.9.2", "vite": "^6.3.5", "vite-node": "^3.1.2", "vite-plugin-dts": "^4.5.4", "vitest": "^3.2.4" };
779
+ const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev": "npm-run-all --parallel dev:server dev:web", "dev:server": "vite-node --watch src/index.ts", "dev:web": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "test:unit": "vitest run src", "test:e2e": "vitest run test", "test:live": "vitest run --exclude= test/html-pipeline-live-e2e.test.ts", "lint": "biome check .", "lint:fix": "biome check . --fix", "typecheck": "npx tsc --noEmit", "typecheck:build": "npx tsc --noEmit --project tsconfig.build.json", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. See README.md for details.'" };
780
+ const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.3.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", "@langchain/aws": "^0.1.13", "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", "@modelcontextprotocol/sdk": "^1.20.2", "@trpc/client": "^11.7.1", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", "axios": "^1.13.1", "axios-retry": "^4.5.0", "better-sqlite3": "^12.4.1", "cheerio": "^1.1.2", "commander": "^14.0.0", "dompurify": "^3.3.0", "dotenv": "^17.2.3", "env-paths": "^3.0.0", "fastify": "^5.6.1", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", "header-generator": "^2.1.76", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", "jose": "^6.1.0", "jsdom": "^26.1.0", "langchain": "^0.3.30", "mime": "^4.1.0", "minimatch": "^10.0.1", "playwright": "^1.52.0", "posthog-node": "^5.11.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", "semver": "^7.7.3", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", "turndown": "^7.2.2", "zod": "^4.1.12" };
781
+ const devDependencies = { "@biomejs/biome": "^2.3.2", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^11.0.6", "@semantic-release/npm": "^12.0.2", "@tailwindcss/postcss": "^4.1.16", "@tailwindcss/vite": "^4.1.16", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", "@types/lint-staged": "~13.3.0", "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", "@types/semver": "^7.7.1", "@types/turndown": "^5.0.6", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", "lint-staged": "^16.2.6", "memfs": "^4.50.0", "msw": "^2.12.2", "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", "semantic-release": "^24.2.9", "tailwindcss": "^4.1.4", "typescript": "^5.9.3", "vite": "^6.3.5", "vite-node": "^3.2.4", "vite-plugin-dts": "^4.5.4", "vitest": "^3.2.4" };
797
782
  const engines = { "node": ">=20.0.0" };
798
783
  const packageJson = {
799
784
  name,
@@ -1288,10 +1273,10 @@ class PipelineClient {
1288
1273
  this.activePolling.clear();
1289
1274
  logger.debug("PipelineClient stopped");
1290
1275
  }
1291
- async enqueueJob(library, version2, options) {
1276
+ async enqueueScrapeJob(library, version2, options) {
1292
1277
  try {
1293
1278
  const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
1294
- const result = await this.client.enqueueJob.mutate({
1279
+ const result = await this.client.enqueueScrapeJob.mutate({
1295
1280
  library,
1296
1281
  version: normalizedVersion,
1297
1282
  options
@@ -1304,6 +1289,21 @@ class PipelineClient {
1304
1289
  );
1305
1290
  }
1306
1291
  }
1292
+ async enqueueRefreshJob(library, version2) {
1293
+ try {
1294
+ const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
1295
+ const result = await this.client.enqueueRefreshJob.mutate({
1296
+ library,
1297
+ version: normalizedVersion
1298
+ });
1299
+ logger.debug(`Refresh job ${result.jobId} enqueued successfully`);
1300
+ return result.jobId;
1301
+ } catch (error) {
1302
+ throw new Error(
1303
+ `Failed to enqueue refresh job: ${error instanceof Error ? error.message : String(error)}`
1304
+ );
1305
+ }
1306
+ }
1307
1307
  async getJob(jobId) {
1308
1308
  try {
1309
1309
  const serializedJob = await this.client.getJob.query({ id: jobId });
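The hunk above renames `enqueueJob` to `enqueueScrapeJob` and adds `enqueueRefreshJob`; both are thin wrappers over tRPC mutations of the same names and resolve to a job id (shown for the refresh variant, assumed for the scrape variant). A minimal usage sketch follows; `PipelineClient` is internal to dist/index.js, so its relevant surface is modeled as an interface, and the option fields and URL are illustrative assumptions.

```ts
// Sketch only: method names and argument order mirror the diff; the options
// fields and the URL are assumptions, not the package's documented API.
interface PipelineClientLike {
  enqueueScrapeJob(
    library: string,
    version: string | null,
    options: { url: string; maxPages?: number }, // assumed ScraperOptions subset
  ): Promise<string>;
  enqueueRefreshJob(library: string, version: string | null): Promise<string>;
}

async function indexThenRefresh(client: PipelineClientLike): Promise<void> {
  // Initial indexing: a full scrape of the documentation site.
  const scrapeJobId = await client.enqueueScrapeJob("react", "18.2.0", {
    url: "https://react.dev/reference", // hypothetical URL
    maxPages: 100,
  });
  console.log(`scrape job queued: ${scrapeJobId}`);

  // Later: re-check already indexed pages; unchanged pages (ETag match / 304)
  // are skipped by the refresh pipeline.
  const refreshJobId = await client.enqueueRefreshJob("react", "18.2.0");
  console.log(`refresh job queued: ${refreshJobId}`);
}
```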
@@ -1753,6 +1753,12 @@ class FingerprintGenerator {
1753
1753
  return this.headerGenerator.getHeaders();
1754
1754
  }
1755
1755
  }
1756
+ var FetchStatus = /* @__PURE__ */ ((FetchStatus2) => {
1757
+ FetchStatus2["SUCCESS"] = "success";
1758
+ FetchStatus2["NOT_MODIFIED"] = "not_modified";
1759
+ FetchStatus2["NOT_FOUND"] = "not_found";
1760
+ return FetchStatus2;
1761
+ })(FetchStatus || {});
1756
1762
  class BrowserFetcher {
1757
1763
  browser = null;
1758
1764
  page = null;
@@ -1792,13 +1798,16 @@ class BrowserFetcher {
1792
1798
  const contentBuffer = Buffer.from(content, "utf-8");
1793
1799
  const contentType = response.headers()["content-type"] || "text/html";
1794
1800
  const { mimeType, charset } = MimeTypeUtils.parseContentType(contentType);
1801
+ const etag = response.headers().etag;
1795
1802
  return {
1796
1803
  content: contentBuffer,
1797
1804
  mimeType,
1798
1805
  charset,
1799
1806
  encoding: void 0,
1800
1807
  // Browser handles encoding automatically
1801
- source: finalUrl
1808
+ source: finalUrl,
1809
+ etag,
1810
+ status: FetchStatus.SUCCESS
1802
1811
  };
1803
1812
  } catch (error) {
1804
1813
  if (options?.signal?.aborted) {
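The new `FetchStatus` enum distinguishes a successful fetch from a conditional-fetch short circuit (`NOT_MODIFIED`) and a deleted resource (`NOT_FOUND`), and every fetcher now attaches a `status` to its result. A small sketch of how a consumer might branch on it; the enum values are copied from the diff, while the `RawContentLike` shape is reduced to the fields visible here.

```ts
// Values copied from the diff; the enum itself is not exported by the bundle.
enum FetchStatus {
  SUCCESS = "success",
  NOT_MODIFIED = "not_modified",
  NOT_FOUND = "not_found",
}

interface RawContentLike {
  content: Buffer;
  mimeType: string;
  source: string;
  status: FetchStatus;
  etag?: string;
  lastModified?: string;
}

function describeFetch(result: RawContentLike): string {
  switch (result.status) {
    case FetchStatus.SUCCESS:
      return `fetched ${result.content.length} bytes (etag: ${result.etag ?? "none"})`;
    case FetchStatus.NOT_MODIFIED:
      return "unchanged since last index; existing chunks can be kept";
    case FetchStatus.NOT_FOUND:
      return "gone (404/ENOENT); the page can be marked as deleted";
  }
}
```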
@@ -1859,24 +1868,48 @@ class FileFetcher {
1859
1868
  /**
1860
1869
  * Fetches the content of a file given a file:// URL, decoding percent-encoded paths as needed.
1861
1870
  * Uses enhanced MIME type detection for better source code file recognition.
1871
+ * Supports conditional fetching via ETag comparison for efficient refresh operations.
1862
1872
  */
1863
- async fetch(source, _options) {
1873
+ async fetch(source, options) {
1864
1874
  let filePath = source.replace(/^file:\/\/\/?/, "");
1865
1875
  filePath = decodeURIComponent(filePath);
1866
1876
  if (!filePath.startsWith("/") && process.platform !== "win32") {
1867
1877
  filePath = `/${filePath}`;
1868
1878
  }
1869
1879
  try {
1880
+ const stats = await fs$1.stat(filePath);
1881
+ const currentEtag = crypto.createHash("md5").update(stats.mtime.toISOString()).digest("hex");
1882
+ if (options?.etag && options.etag === currentEtag) {
1883
+ return {
1884
+ content: Buffer.from(""),
1885
+ mimeType: "text/plain",
1886
+ source,
1887
+ etag: currentEtag,
1888
+ lastModified: stats.mtime.toISOString(),
1889
+ status: FetchStatus.NOT_MODIFIED
1890
+ };
1891
+ }
1870
1892
  const content = await fs$1.readFile(filePath);
1871
1893
  const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
1872
1894
  const mimeType = detectedMimeType || "application/octet-stream";
1873
1895
  return {
1874
1896
  content,
1875
1897
  mimeType,
1876
- source
1898
+ source,
1899
+ etag: currentEtag,
1900
+ lastModified: stats.mtime.toISOString(),
1901
+ status: FetchStatus.SUCCESS
1877
1902
  // Don't assume charset for text files - let the pipeline detect it
1878
1903
  };
1879
1904
  } catch (error) {
1905
+ if (error.code === "ENOENT") {
1906
+ return {
1907
+ content: Buffer.from(""),
1908
+ mimeType: "text/plain",
1909
+ source,
1910
+ status: FetchStatus.NOT_FOUND
1911
+ };
1912
+ }
1880
1913
  throw new ScraperError(
1881
1914
  `Failed to read file ${filePath}: ${error.message ?? "Unknown error"}`,
1882
1915
  false,
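For `file://` sources the fetcher now derives a pseudo-ETag from the file's modification time (an MD5 hash of `mtime.toISOString()`), so a refresh can skip unchanged files without re-reading them. A standalone sketch of that check using the same hashing scheme as the diff; the function names are illustrative.

```ts
import crypto from "node:crypto";
import { promises as fs } from "node:fs";

// Same scheme as the diff: hash the mtime string, not the file contents.
async function fileEtag(filePath: string): Promise<string> {
  const stats = await fs.stat(filePath);
  return crypto.createHash("md5").update(stats.mtime.toISOString()).digest("hex");
}

// True when the stored etag still matches, i.e. the file is unchanged and a
// NOT_MODIFIED result would be returned instead of re-reading and re-splitting it.
async function isUnchanged(filePath: string, storedEtag?: string): Promise<boolean> {
  if (!storedEtag) return false;
  return (await fileEtag(filePath)) === storedEtag;
}
```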
@@ -1982,6 +2015,12 @@ class HttpFetcher {
1982
2015
  ...options?.headers
1983
2016
  // User-provided headers override generated ones
1984
2017
  };
2018
+ if (options?.etag) {
2019
+ headers["If-None-Match"] = options.etag;
2020
+ logger.debug(
2021
+ `Conditional request for ${source} with If-None-Match: ${options.etag}`
2022
+ );
2023
+ }
1985
2024
  const config = {
1986
2025
  responseType: "arraybuffer",
1987
2026
  headers: {
@@ -1995,9 +2034,22 @@ class HttpFetcher {
1995
2034
  // Pass signal to axios
1996
2035
  // Axios follows redirects by default, we need to explicitly disable it if needed
1997
2036
  maxRedirects: followRedirects ? 5 : 0,
1998
- decompress: true
2037
+ decompress: true,
2038
+ // Allow 304 responses to be handled as successful responses
2039
+ validateStatus: (status) => {
2040
+ return status >= 200 && status < 300 || status === 304;
2041
+ }
1999
2042
  };
2000
2043
  const response = await axios.get(source, config);
2044
+ if (response.status === 304) {
2045
+ logger.debug(`HTTP 304 Not Modified for ${source}`);
2046
+ return {
2047
+ content: Buffer.from(""),
2048
+ mimeType: "text/plain",
2049
+ source,
2050
+ status: FetchStatus.NOT_MODIFIED
2051
+ };
2052
+ }
2001
2053
  const contentTypeHeader = response.headers["content-type"];
2002
2054
  const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader);
2003
2055
  const contentEncoding = response.headers["content-encoding"];
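The HTTP fetcher now sends `If-None-Match` when a stored ETag is available and widens `validateStatus` so axios treats `304 Not Modified` as a success rather than throwing. A minimal standalone axios sketch of the same round trip; the URL and variable names are illustrative.

```ts
import axios from "axios";

// Conditional GET: send the previously stored ETag and accept 304 as success,
// mirroring the validateStatus override added in this release.
async function conditionalGet(url: string, storedEtag?: string) {
  const response = await axios.get(url, {
    responseType: "arraybuffer",
    headers: storedEtag ? { "If-None-Match": storedEtag } : {},
    validateStatus: (status) => (status >= 200 && status < 300) || status === 304,
  });

  if (response.status === 304) {
    return { status: "not_modified" as const };
  }
  return {
    status: "success" as const,
    content: Buffer.from(response.data),
    etag: response.headers.etag as string | undefined,
  };
}
```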
@@ -2017,12 +2069,21 @@ class HttpFetcher {
2017
2069
  response.request?.responseUrl || // Fallback to axios recorded config URL
2018
2070
  response.config?.url || source
2019
2071
  );
2072
+ const etag = response.headers.etag || response.headers.ETag;
2073
+ if (etag) {
2074
+ logger.debug(`Received ETag for ${source}: ${etag}`);
2075
+ }
2076
+ const lastModified = response.headers["last-modified"];
2077
+ const lastModifiedISO = lastModified ? new Date(lastModified).toISOString() : void 0;
2020
2078
  return {
2021
2079
  content,
2022
2080
  mimeType,
2023
2081
  charset,
2024
2082
  encoding: contentEncoding,
2025
- source: finalUrl
2083
+ source: finalUrl,
2084
+ etag,
2085
+ lastModified: lastModifiedISO,
2086
+ status: FetchStatus.SUCCESS
2026
2087
  };
2027
2088
  } catch (error) {
2028
2089
  const axiosError = error;
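Besides the payload, successful HTTP responses now carry `etag` and `lastModified` (normalized to ISO 8601), which are persisted per page so the next refresh can issue conditional requests. A small sketch of that bookkeeping; the in-memory map is a stand-in for the real SQLite-backed store.

```ts
interface PageValidators {
  etag?: string;
  lastModified?: string; // ISO 8601, as normalized by the fetcher
}

// Stand-in for the real per-page store; keyed by URL.
const pageCache = new Map<string, PageValidators>();

function rememberValidators(url: string, v: PageValidators): void {
  pageCache.set(url, v);
}

function validatorsFor(url: string): PageValidators {
  // Fed back into the fetcher (e.g. as options.etag) on the next refresh run.
  return pageCache.get(url) ?? {};
}
```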
@@ -2031,6 +2092,15 @@ class HttpFetcher {
2031
2092
  if (options?.signal?.aborted || code === "ERR_CANCELED") {
2032
2093
  throw new CancellationError("HTTP fetch cancelled");
2033
2094
  }
2095
+ if (status === 404) {
2096
+ logger.debug(`Resource not found (404): ${source}`);
2097
+ return {
2098
+ content: Buffer.from(""),
2099
+ mimeType: "text/plain",
2100
+ source,
2101
+ status: FetchStatus.NOT_FOUND
2102
+ };
2103
+ }
2034
2104
  if (!followRedirects && status && status >= 300 && status < 400) {
2035
2105
  const location = axiosError.response?.headers?.location;
2036
2106
  if (location) {
@@ -2125,101 +2195,522 @@ class AutoDetectFetcher {
2125
2195
  ]);
2126
2196
  }
2127
2197
  }
2128
- class SplitterError extends Error {
2129
- }
2130
- class MinimumChunkSizeError extends SplitterError {
2131
- constructor(size, maxSize) {
2132
- super(
2133
- `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
2134
- );
2198
+ const DEFAULT_FILE_EXCLUSIONS = [
2199
+ // CHANGELOG files (case variations)
2200
+ "**/CHANGELOG.md",
2201
+ "**/changelog.md",
2202
+ "**/CHANGELOG.mdx",
2203
+ "**/changelog.mdx",
2204
+ // LICENSE files (case variations)
2205
+ "**/LICENSE",
2206
+ "**/LICENSE.md",
2207
+ "**/license.md",
2208
+ // CODE_OF_CONDUCT files (case variations)
2209
+ "**/CODE_OF_CONDUCT.md",
2210
+ "**/code_of_conduct.md",
2211
+ // Test files
2212
+ "**/*.test.*",
2213
+ "**/*.spec.*",
2214
+ "**/*_test.py",
2215
+ "**/*_test.go",
2216
+ // Package manager lock files
2217
+ "**/*.lock",
2218
+ "**/package-lock.json",
2219
+ "**/yarn.lock",
2220
+ "**/pnpm-lock.yaml",
2221
+ "**/go.sum",
2222
+ // Build artifacts
2223
+ "**/*.min.js",
2224
+ "**/*.min.css",
2225
+ "**/*.map",
2226
+ "**/*.d.ts",
2227
+ // IDE/System files
2228
+ "**/.DS_Store",
2229
+ "**/Thumbs.db",
2230
+ "**/*.swp",
2231
+ "**/*.swo",
2232
+ // Internal config files (using regex pattern)
2233
+ "/.*\\.(ini|cfg|conf|log|pid)$/"
2234
+ ];
2235
+ const DEFAULT_FOLDER_EXCLUSIONS = [
2236
+ // Archive and deprecated content (matches anywhere in path)
2237
+ "**/archive/**",
2238
+ "**/archived/**",
2239
+ "**/deprecated/**",
2240
+ "**/legacy/**",
2241
+ "**/old/**",
2242
+ "**/outdated/**",
2243
+ "**/previous/**",
2244
+ "**/superseded/**",
2245
+ // Specific paths that don't follow the general pattern
2246
+ "docs/old/**",
2247
+ // Test directories
2248
+ "**/test/**",
2249
+ "**/tests/**",
2250
+ "**/__tests__/**",
2251
+ "**/spec/**",
2252
+ // Build output directories
2253
+ "**/dist/**",
2254
+ "**/build/**",
2255
+ "**/out/**",
2256
+ "**/target/**",
2257
+ "**/.next/**",
2258
+ "**/.nuxt/**",
2259
+ // IDE directories
2260
+ "**/.vscode/**",
2261
+ "**/.idea/**",
2262
+ // Internationalization folders - non-English locales
2263
+ "**/i18n/ar*/**",
2264
+ "**/i18n/de*/**",
2265
+ "**/i18n/es*/**",
2266
+ "**/i18n/fr*/**",
2267
+ "**/i18n/hi*/**",
2268
+ "**/i18n/it*/**",
2269
+ "**/i18n/ja*/**",
2270
+ "**/i18n/ko*/**",
2271
+ "**/i18n/nl*/**",
2272
+ "**/i18n/pl*/**",
2273
+ "**/i18n/pt*/**",
2274
+ "**/i18n/ru*/**",
2275
+ "**/i18n/sv*/**",
2276
+ "**/i18n/th*/**",
2277
+ "**/i18n/tr*/**",
2278
+ "**/i18n/vi*/**",
2279
+ "**/i18n/zh*/**",
2280
+ // Common locale folder patterns
2281
+ "**/zh-cn/**",
2282
+ "**/zh-hk/**",
2283
+ "**/zh-mo/**",
2284
+ "**/zh-sg/**",
2285
+ "**/zh-tw/**"
2286
+ ];
2287
+ const DEFAULT_EXCLUSION_PATTERNS = [
2288
+ ...DEFAULT_FILE_EXCLUSIONS,
2289
+ ...DEFAULT_FOLDER_EXCLUSIONS
2290
+ ];
2291
+ function getEffectiveExclusionPatterns(userPatterns) {
2292
+ if (userPatterns !== void 0) {
2293
+ return userPatterns;
2135
2294
  }
2295
+ return DEFAULT_EXCLUSION_PATTERNS;
2136
2296
  }
2137
- class ContentSplitterError extends SplitterError {
2297
+ function isRegexPattern(pattern) {
2298
+ return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
2138
2299
  }
2139
- class GreedySplitter {
2140
- baseSplitter;
2141
- minChunkSize;
2142
- preferredChunkSize;
2143
- /**
2144
- * Combines a base document splitter with size constraints to produce optimally-sized chunks.
2145
- * The base splitter handles the initial semantic splitting, while this class handles
2146
- * the concatenation strategy.
2147
- */
2148
- constructor(baseSplitter, minChunkSize, preferredChunkSize) {
2149
- this.baseSplitter = baseSplitter;
2150
- this.minChunkSize = minChunkSize;
2151
- this.preferredChunkSize = preferredChunkSize;
2300
+ function patternToRegExp(pattern) {
2301
+ if (isRegexPattern(pattern)) {
2302
+ return new RegExp(pattern.slice(1, -1));
2152
2303
  }
2153
- /**
2154
- * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
2155
- * are combined until they reach the minimum size, but splits are preserved at major
2156
- * section boundaries to maintain document structure. This balances the need for
2157
- * context with semantic coherence.
2158
- */
2159
- async splitText(markdown, contentType) {
2160
- const initialChunks = await this.baseSplitter.splitText(markdown, contentType);
2161
- const concatenatedChunks = [];
2162
- let currentChunk = null;
2163
- for (const nextChunk of initialChunks) {
2164
- if (currentChunk) {
2165
- if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
2166
- concatenatedChunks.push(currentChunk);
2167
- currentChunk = this.cloneChunk(nextChunk);
2168
- continue;
2169
- }
2170
- if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
2171
- concatenatedChunks.push(currentChunk);
2172
- currentChunk = this.cloneChunk(nextChunk);
2173
- continue;
2174
- }
2175
- currentChunk.content += `${currentChunk.content.endsWith("\n") ? "" : "\n"}${nextChunk.content}`;
2176
- currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
2177
- currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
2178
- } else {
2179
- currentChunk = this.cloneChunk(nextChunk);
2180
- }
2181
- }
2182
- if (currentChunk) {
2183
- concatenatedChunks.push(currentChunk);
2304
+ const re = minimatch.makeRe(pattern, { dot: true });
2305
+ if (!re) throw new Error(`Invalid glob pattern: ${pattern}`);
2306
+ return re;
2307
+ }
2308
+ function matchesAnyPattern(path2, patterns) {
2309
+ if (!patterns || patterns.length === 0) return false;
2310
+ const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
2311
+ return patterns.some((pattern) => {
2312
+ if (isRegexPattern(pattern)) {
2313
+ return patternToRegExp(pattern).test(normalizedPath);
2184
2314
  }
2185
- return concatenatedChunks;
2186
- }
2187
- cloneChunk(chunk) {
2188
- return {
2189
- types: [...chunk.types],
2190
- content: chunk.content,
2191
- section: {
2192
- level: chunk.section.level,
2193
- path: [...chunk.section.path]
2194
- }
2195
- };
2196
- }
2197
- /**
2198
- * H1 and H2 headings represent major conceptual breaks in the document.
2199
- * Preserving these splits helps maintain the document's logical structure.
2200
- */
2201
- startsNewMajorSection(chunk) {
2202
- return chunk.section.level === 1 || chunk.section.level === 2;
2315
+ const pathForMatch = normalizedPath.replace(/^\//, "");
2316
+ const patternForMatch = pattern.startsWith("/") ? pattern.slice(1) : pattern;
2317
+ return minimatch(pathForMatch, patternForMatch, { dot: true });
2318
+ });
2319
+ }
2320
+ function extractPathAndQuery(url) {
2321
+ try {
2322
+ const u = new URL(url);
2323
+ return u.pathname + (u.search || "");
2324
+ } catch {
2325
+ return url;
2203
2326
  }
2204
- /**
2205
- * Size limit check to ensure chunks remain within embedding model constraints.
2206
- * Essential for maintaining consistent embedding quality and avoiding truncation.
2207
- */
2208
- wouldExceedMaxSize(currentChunk, nextChunk) {
2209
- if (!currentChunk) {
2210
- return false;
2327
+ }
2328
+ function shouldIncludeUrl(url, includePatterns, excludePatterns) {
2329
+ const path2 = extractPathAndQuery(url);
2330
+ const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
2331
+ let basename;
2332
+ if (url.startsWith("file://")) {
2333
+ try {
2334
+ const u = new URL(url);
2335
+ basename = u.pathname ? u.pathname.split("/").pop() : void 0;
2336
+ } catch {
2211
2337
  }
2212
- return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
2213
2338
  }
2214
- /**
2215
- * Checks if one path is a prefix of another path, indicating a parent-child relationship
2216
- */
2217
- isPathIncluded(parentPath, childPath) {
2218
- if (parentPath.length >= childPath.length) return false;
2219
- return parentPath.every((part, i) => part === childPath[i]);
2339
+ const stripSlash = (patterns) => patterns?.map((p) => p.startsWith("/") ? p.slice(1) : p);
2340
+ const effectiveExcludePatterns = getEffectiveExclusionPatterns(excludePatterns);
2341
+ if (matchesAnyPattern(url, effectiveExcludePatterns) || matchesAnyPattern(normalizedPath, effectiveExcludePatterns) || basename && matchesAnyPattern(basename, stripSlash(effectiveExcludePatterns)))
2342
+ return false;
2343
+ if (!includePatterns || includePatterns.length === 0) return true;
2344
+ return matchesAnyPattern(url, includePatterns) || matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
2345
+ }
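Exclusion patterns accept two syntaxes: ordinary globs (matched with `minimatch`, `dot: true`) and regex literals written as `"/.../"` strings, as in `isRegexPattern` above. Note that `shouldIncludeUrl` applies the default exclusions only when the caller passes no `excludePatterns` at all; even an empty array replaces the defaults. A simplified sketch of how the two syntaxes behave against the default list; the sample paths are illustrative and the path normalization is reduced compared to `matchesAnyPattern`.

```ts
import { minimatch } from "minimatch";

// Patterns wrapped in slashes are treated as regular expressions, everything
// else as a glob - the same convention as isRegexPattern() above (simplified).
function matchesPattern(path: string, pattern: string): boolean {
  const isRegex = pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
  if (isRegex) {
    const normalized = path.startsWith("/") ? path : `/${path}`;
    return new RegExp(pattern.slice(1, -1)).test(normalized);
  }
  return minimatch(path.replace(/^\//, ""), pattern, { dot: true });
}

// Illustrative checks against entries from the default exclusion list:
matchesPattern("/docs/CHANGELOG.md", "**/CHANGELOG.md"); // true  -> excluded
matchesPattern("/docs/guide.md", "**/CHANGELOG.md");     // false -> kept
matchesPattern("/app/settings.ini", "/.*\\.(ini|cfg|conf|log|pid)$/"); // true -> excluded
```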
2346
+ function computeBaseDirectory(pathname) {
2347
+ if (pathname === "") return "/";
2348
+ if (pathname.endsWith("/")) return pathname;
2349
+ const lastSegment = pathname.split("/").at(-1) || "";
2350
+ const looksLikeFile = lastSegment.includes(".");
2351
+ if (looksLikeFile) {
2352
+ return pathname.replace(/\/[^/]*$/, "/");
2220
2353
  }
2221
- /**
2222
- * Merges section metadata when concatenating chunks, following these rules:
2354
+ return `${pathname}/`;
2355
+ }
2356
+ function isInScope(baseUrl, targetUrl, scope) {
2357
+ if (baseUrl.protocol !== targetUrl.protocol) return false;
2358
+ switch (scope) {
2359
+ case "subpages": {
2360
+ if (baseUrl.hostname !== targetUrl.hostname) return false;
2361
+ const baseDir = computeBaseDirectory(baseUrl.pathname);
2362
+ return targetUrl.pathname.startsWith(baseDir);
2363
+ }
2364
+ case "hostname":
2365
+ return baseUrl.hostname === targetUrl.hostname;
2366
+ case "domain": {
2367
+ return extractPrimaryDomain(baseUrl.hostname) === extractPrimaryDomain(targetUrl.hostname);
2368
+ }
2369
+ default:
2370
+ return false;
2371
+ }
2372
+ }
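`isInScope` gates link-following by the configured crawl scope: `subpages` keeps URLs under the starting page's directory (via `computeBaseDirectory`), `hostname` requires an exact host match, and `domain` only requires the same registrable domain. A self-contained restatement with a worked example; the real `extractPrimaryDomain` helper is not shown in this hunk, so a naive two-label suffix stands in for it, and the URLs are hypothetical.

```ts
type CrawlScope = "subpages" | "hostname" | "domain";

// Naive stand-in for extractPrimaryDomain (the real helper is not shown here).
function naivePrimaryDomain(hostname: string): string {
  return hostname.split(".").slice(-2).join(".");
}

function inScope(base: URL, target: URL, scope: CrawlScope): boolean {
  if (base.protocol !== target.protocol) return false;
  switch (scope) {
    case "subpages": {
      // Simplified computeBaseDirectory: strip a trailing file-like segment.
      const baseDir = base.pathname.endsWith("/")
        ? base.pathname
        : base.pathname.replace(/\/[^/]*$/, "/");
      return base.hostname === target.hostname && target.pathname.startsWith(baseDir);
    }
    case "hostname":
      return base.hostname === target.hostname;
    case "domain":
      return naivePrimaryDomain(base.hostname) === naivePrimaryDomain(target.hostname);
  }
}

const base = new URL("https://example.com/docs/v2/index.html"); // hypothetical
inScope(base, new URL("https://example.com/docs/v2/api/"), "subpages"); // true
inScope(base, new URL("https://example.com/blog/"), "subpages");        // false
inScope(base, new URL("https://api.example.com/docs/"), "hostname");    // false
inScope(base, new URL("https://api.example.com/docs/"), "domain");      // true
```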
2373
+ const DEFAULT_MAX_DEPTH = 3;
2374
+ const DEFAULT_CONCURRENCY = 3;
2375
+ class BaseScraperStrategy {
2376
+ /**
2377
+ * Set of normalized URLs that have been marked for processing.
2378
+ *
2379
+ * IMPORTANT: URLs are added to this set BEFORE they are actually processed, not after.
2380
+ * This prevents the same URL from being queued multiple times when discovered from different sources.
2381
+ *
2382
+ * Usage flow:
2383
+ * 1. Initial queue setup: Root URL and initialQueue items are added to visited
2384
+ * 2. During processing: When a page returns links, each link is checked against visited
2385
+ * 3. In processBatch deduplication: Only links NOT in visited are added to the queue AND to visited
2386
+ *
2387
+ * This approach ensures:
2388
+ * - No URL is processed more than once
2389
+ * - No URL appears in the queue multiple times
2390
+ * - Efficient deduplication across concurrent processing
2391
+ */
2392
+ visited = /* @__PURE__ */ new Set();
2393
+ pageCount = 0;
2394
+ totalDiscovered = 0;
2395
+ // Track total URLs discovered (unlimited)
2396
+ effectiveTotal = 0;
2397
+ // Track effective total (limited by maxPages)
2398
+ canonicalBaseUrl;
2399
+ options;
2400
+ constructor(options = {}) {
2401
+ this.options = options;
2402
+ }
2403
+ /**
2404
+ * Determines if a URL should be processed based on scope and include/exclude patterns in ScraperOptions.
2405
+ * Scope is checked first, then patterns.
2406
+ */
2407
+ shouldProcessUrl(url, options) {
2408
+ if (options.scope) {
2409
+ try {
2410
+ const base = this.canonicalBaseUrl ?? new URL$1(options.url);
2411
+ const target = new URL$1(url);
2412
+ if (!isInScope(base, target, options.scope)) return false;
2413
+ } catch {
2414
+ return false;
2415
+ }
2416
+ }
2417
+ return shouldIncludeUrl(url, options.includePatterns, options.excludePatterns);
2418
+ }
2419
+ async processBatch(batch, baseUrl, options, progressCallback, signal) {
2420
+ const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
2421
+ const results = await Promise.all(
2422
+ batch.map(async (item) => {
2423
+ if (signal?.aborted) {
2424
+ throw new CancellationError("Scraping cancelled during batch processing");
2425
+ }
2426
+ const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
2427
+ if (item.depth > maxDepth) {
2428
+ return [];
2429
+ }
2430
+ try {
2431
+ const result = await this.processItem(item, options, signal);
2432
+ const shouldCount = item.pageId !== void 0 || result.content !== void 0;
2433
+ let currentPageCount = this.pageCount;
2434
+ if (shouldCount) {
2435
+ currentPageCount = ++this.pageCount;
2436
+ logger.info(
2437
+ `🌐 Scraping page ${currentPageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`
2438
+ );
2439
+ }
2440
+ if (result.status === FetchStatus.NOT_MODIFIED) {
2441
+ logger.debug(`Page unchanged (304): ${item.url}`);
2442
+ if (shouldCount) {
2443
+ await progressCallback({
2444
+ pagesScraped: currentPageCount,
2445
+ totalPages: this.effectiveTotal,
2446
+ totalDiscovered: this.totalDiscovered,
2447
+ currentUrl: item.url,
2448
+ depth: item.depth,
2449
+ maxDepth,
2450
+ result: null,
2451
+ pageId: item.pageId
2452
+ });
2453
+ }
2454
+ return [];
2455
+ }
2456
+ if (result.status === FetchStatus.NOT_FOUND) {
2457
+ logger.debug(`Page deleted (404): ${item.url}`);
2458
+ if (shouldCount) {
2459
+ await progressCallback({
2460
+ pagesScraped: currentPageCount,
2461
+ totalPages: this.effectiveTotal,
2462
+ totalDiscovered: this.totalDiscovered,
2463
+ currentUrl: item.url,
2464
+ depth: item.depth,
2465
+ maxDepth,
2466
+ result: null,
2467
+ pageId: item.pageId,
2468
+ deleted: true
2469
+ });
2470
+ }
2471
+ return [];
2472
+ }
2473
+ if (result.status !== FetchStatus.SUCCESS) {
2474
+ logger.error(`Unknown fetch status: ${result.status}`);
2475
+ return [];
2476
+ }
2477
+ const finalUrl = result.url || item.url;
2478
+ if (result.content) {
2479
+ await progressCallback({
2480
+ pagesScraped: currentPageCount,
2481
+ totalPages: this.effectiveTotal,
2482
+ totalDiscovered: this.totalDiscovered,
2483
+ currentUrl: finalUrl,
2484
+ depth: item.depth,
2485
+ maxDepth,
2486
+ result: {
2487
+ url: finalUrl,
2488
+ title: result.content.title?.trim() || result.title?.trim() || "",
2489
+ contentType: result.contentType || "",
2490
+ textContent: result.content.textContent || "",
2491
+ links: result.content.links || [],
2492
+ errors: result.content.errors || [],
2493
+ chunks: result.content.chunks || [],
2494
+ etag: result.etag || null,
2495
+ lastModified: result.lastModified || null
2496
+ },
2497
+ pageId: item.pageId
2498
+ });
2499
+ }
2500
+ const nextItems = result.links || [];
2501
+ const linkBaseUrl = finalUrl ? new URL$1(finalUrl) : baseUrl;
2502
+ return nextItems.map((value) => {
2503
+ try {
2504
+ const targetUrl = new URL$1(value, linkBaseUrl);
2505
+ if (!this.shouldProcessUrl(targetUrl.href, options)) {
2506
+ return null;
2507
+ }
2508
+ return {
2509
+ url: targetUrl.href,
2510
+ depth: item.depth + 1
2511
+ };
2512
+ } catch (_error) {
2513
+ logger.warn(`❌ Invalid URL: ${value}`);
2514
+ }
2515
+ return null;
2516
+ }).filter((item2) => item2 !== null);
2517
+ } catch (error) {
2518
+ if (options.ignoreErrors) {
2519
+ logger.error(`❌ Failed to process ${item.url}: ${error}`);
2520
+ return [];
2521
+ }
2522
+ throw error;
2523
+ }
2524
+ })
2525
+ );
2526
+ const allLinks = results.flat();
2527
+ const uniqueLinks = [];
2528
+ for (const item of allLinks) {
2529
+ const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
2530
+ if (!this.visited.has(normalizedUrl)) {
2531
+ this.visited.add(normalizedUrl);
2532
+ uniqueLinks.push(item);
2533
+ this.totalDiscovered++;
2534
+ if (this.effectiveTotal < maxPages) {
2535
+ this.effectiveTotal++;
2536
+ }
2537
+ }
2538
+ }
2539
+ return uniqueLinks;
2540
+ }
2541
+ async scrape(options, progressCallback, signal) {
2542
+ this.visited.clear();
2543
+ this.pageCount = 0;
2544
+ const initialQueue = options.initialQueue || [];
2545
+ const isRefreshMode = initialQueue.length > 0;
2546
+ this.canonicalBaseUrl = new URL$1(options.url);
2547
+ let baseUrl = this.canonicalBaseUrl;
2548
+ const queue = [];
2549
+ const normalizedRootUrl = normalizeUrl(
2550
+ options.url,
2551
+ this.options.urlNormalizerOptions
2552
+ );
2553
+ if (isRefreshMode) {
2554
+ logger.debug(
2555
+ `Starting refresh mode with ${initialQueue.length} pre-populated pages`
2556
+ );
2557
+ for (const item of initialQueue) {
2558
+ const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
2559
+ if (!this.visited.has(normalizedUrl)) {
2560
+ this.visited.add(normalizedUrl);
2561
+ queue.push(item);
2562
+ }
2563
+ }
2564
+ }
2565
+ if (!this.visited.has(normalizedRootUrl)) {
2566
+ this.visited.add(normalizedRootUrl);
2567
+ queue.unshift({ url: options.url, depth: 0 });
2568
+ }
2569
+ this.totalDiscovered = queue.length;
2570
+ this.effectiveTotal = queue.length;
2571
+ const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
2572
+ const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
2573
+ while (queue.length > 0 && this.pageCount < maxPages) {
2574
+ if (signal?.aborted) {
2575
+ logger.debug(`${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal.`);
2576
+ throw new CancellationError(
2577
+ `${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal`
2578
+ );
2579
+ }
2580
+ const remainingPages = maxPages - this.pageCount;
2581
+ if (remainingPages <= 0) {
2582
+ break;
2583
+ }
2584
+ const batchSize = Math.min(maxConcurrency, remainingPages, queue.length);
2585
+ const batch = queue.splice(0, batchSize);
2586
+ baseUrl = this.canonicalBaseUrl ?? baseUrl;
2587
+ const newUrls = await this.processBatch(
2588
+ batch,
2589
+ baseUrl,
2590
+ options,
2591
+ progressCallback,
2592
+ signal
2593
+ );
2594
+ queue.push(...newUrls);
2595
+ }
2596
+ }
2597
+ /**
2598
+ * Cleanup resources used by this strategy.
2599
+ * Default implementation does nothing - override in derived classes as needed.
2600
+ */
2601
+ async cleanup() {
2602
+ }
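`scrape()` now supports a refresh mode: when `options.initialQueue` is pre-populated with previously indexed pages, those items are seeded into the visited set and queue, and the progress callback reports unchanged pages with `result: null` and deleted pages with `deleted: true` so the caller can keep or drop their stored chunks. A sketch of the option and callback shapes; `url`, `depth`, `pageId`, `result`, and `deleted` appear in this hunk, while the `etag` field on queue items and the persistence actions are assumptions.

```ts
// Shapes reduced to what this hunk shows; `etag` here is an assumption about
// how stored validators reach the fetcher during a refresh.
interface QueueItem {
  url: string;
  depth: number;
  pageId?: number;
  etag?: string; // assumed: stored validator passed through to the fetcher
}

interface ScrapeProgress {
  currentUrl: string;
  pageId?: number;
  deleted?: boolean;
  result: { url: string; title: string; etag: string | null } | null;
}

// Hypothetical refresh invocation: seed the queue with pages already indexed.
const refreshOptions = {
  url: "https://example.com/docs/", // canonical base URL of the indexed site
  initialQueue: [
    { url: "https://example.com/docs/intro", depth: 1, pageId: 42, etag: 'W/"abc"' },
    { url: "https://example.com/docs/api", depth: 1, pageId: 43, etag: 'W/"def"' },
  ] satisfies QueueItem[],
};

function onProgress(p: ScrapeProgress): void {
  if (p.deleted) {
    console.log(`delete page ${p.pageId}: ${p.currentUrl}`); // 404 during refresh
  } else if (p.result === null) {
    console.log(`unchanged: ${p.currentUrl}`); // 304: keep existing chunks
  } else {
    console.log(`re-indexed: ${p.result.url} (${p.result.title})`);
  }
}
```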
2603
+ }
2604
+ class SplitterError extends Error {
2605
+ }
2606
+ class MinimumChunkSizeError extends SplitterError {
2607
+ constructor(size, maxSize) {
2608
+ super(
2609
+ `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
2610
+ );
2611
+ }
2612
+ }
2613
+ class ContentSplitterError extends SplitterError {
2614
+ }
2615
+ class GreedySplitter {
2616
+ baseSplitter;
2617
+ minChunkSize;
2618
+ preferredChunkSize;
2619
+ maxChunkSize;
2620
+ /**
2621
+ * Combines a base document splitter with size constraints to produce optimally-sized chunks.
2622
+ * The base splitter handles the initial semantic splitting, while this class handles
2623
+ * the concatenation strategy.
2624
+ */
2625
+ constructor(baseSplitter, minChunkSize, preferredChunkSize, maxChunkSize) {
2626
+ this.baseSplitter = baseSplitter;
2627
+ this.minChunkSize = minChunkSize;
2628
+ this.preferredChunkSize = preferredChunkSize;
2629
+ this.maxChunkSize = maxChunkSize;
2630
+ }
2631
+ /**
2632
+ * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
2633
+ * are combined until they reach the minimum size, but splits are preserved at major
2634
+ * section boundaries to maintain document structure. This balances the need for
2635
+ * context with semantic coherence.
2636
+ */
2637
+ async splitText(markdown, contentType) {
2638
+ const initialChunks = await this.baseSplitter.splitText(markdown, contentType);
2639
+ const concatenatedChunks = [];
2640
+ let currentChunk = null;
2641
+ for (const nextChunk of initialChunks) {
2642
+ if (nextChunk.content.length > this.maxChunkSize) {
2643
+ logger.warn(
2644
+ `⚠ Chunk from base splitter exceeds max size: ${nextChunk.content.length} > ${this.maxChunkSize}`
2645
+ );
2646
+ }
2647
+ if (currentChunk) {
2648
+ const combinedSize = currentChunk.content.length + nextChunk.content.length;
2649
+ if (combinedSize > this.maxChunkSize) {
2650
+ concatenatedChunks.push(currentChunk);
2651
+ currentChunk = this.cloneChunk(nextChunk);
2652
+ continue;
2653
+ }
2654
+ if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk) && !this.isSameSection(currentChunk, nextChunk)) {
2655
+ concatenatedChunks.push(currentChunk);
2656
+ currentChunk = this.cloneChunk(nextChunk);
2657
+ continue;
2658
+ }
2659
+ if (combinedSize > this.preferredChunkSize && currentChunk.content.length >= this.minChunkSize && nextChunk.content.length >= this.minChunkSize) {
2660
+ concatenatedChunks.push(currentChunk);
2661
+ currentChunk = this.cloneChunk(nextChunk);
2662
+ continue;
2663
+ }
2664
+ currentChunk.content += `${currentChunk.content.endsWith("\n") ? "" : "\n"}${nextChunk.content}`;
2665
+ currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
2666
+ currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
2667
+ } else {
2668
+ currentChunk = this.cloneChunk(nextChunk);
2669
+ }
2670
+ }
2671
+ if (currentChunk) {
2672
+ concatenatedChunks.push(currentChunk);
2673
+ }
2674
+ return concatenatedChunks;
2675
+ }
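`GreedySplitter` now takes an explicit `maxChunkSize` in addition to `minChunkSize` and `preferredChunkSize`. The loop above merges adjacent chunks until one of three guards fires: never exceed `maxChunkSize`; break at a new H1/H2 section once the current chunk has reached `minChunkSize`, unless both chunks share a section path; and stop padding past `preferredChunkSize` when both sides are already at least `minChunkSize`. A compact restatement of just that decision, useful for reasoning about the thresholds; this is a sketch, not the class API.

```ts
interface ChunkLike {
  content: string;
  section: { level: number; path: string[] };
}

type MergeDecision = "merge" | "split";

// Mirrors the three guards in splitText() above, in the same order.
function decide(
  current: ChunkLike,
  next: ChunkLike,
  sizes: { min: number; preferred: number; max: number },
): MergeDecision {
  const combined = current.content.length + next.content.length;
  const isPrefix = (a: string[], b: string[]) =>
    a.length <= b.length && a.every((part, i) => part === b[i]);
  const sameSection =
    isPrefix(current.section.path, next.section.path) ||
    isPrefix(next.section.path, current.section.path);
  const startsMajorSection = next.section.level === 1 || next.section.level === 2;

  if (combined > sizes.max) return "split";
  if (current.content.length >= sizes.min && startsMajorSection && !sameSection) return "split";
  if (
    combined > sizes.preferred &&
    current.content.length >= sizes.min &&
    next.content.length >= sizes.min
  ) {
    return "split";
  }
  return "merge";
}

// With hypothetical sizes { min: 500, preferred: 1500, max: 2000 }: two
// 900-character chunks in the same section are split (1800 > 1500, both >= 500),
// while a 300-character follow-on chunk in the same section is merged.
```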
2676
+ cloneChunk(chunk) {
2677
+ return {
2678
+ types: [...chunk.types],
2679
+ content: chunk.content,
2680
+ section: {
2681
+ level: chunk.section.level,
2682
+ path: [...chunk.section.path]
2683
+ }
2684
+ };
2685
+ }
2686
+ /**
2687
+ * H1 and H2 headings represent major conceptual breaks in the document.
2688
+ * Preserving these splits helps maintain the document's logical structure.
2689
+ */
2690
+ startsNewMajorSection(chunk) {
2691
+ return chunk.section.level === 1 || chunk.section.level === 2;
2692
+ }
2693
+ /**
2694
+ * Checks if two chunks belong to the same section by comparing their paths.
2695
+ * Returns true if the paths are identical or if one is a parent of the other.
2696
+ */
2697
+ isSameSection(chunk1, chunk2) {
2698
+ const path1 = chunk1.section.path;
2699
+ const path2 = chunk2.section.path;
2700
+ if (path1.length === path2.length && path1.every((part, i) => part === path2[i])) {
2701
+ return true;
2702
+ }
2703
+ return this.isPathIncluded(path1, path2) || this.isPathIncluded(path2, path1);
2704
+ }
2705
+ /**
2706
+ * Checks if one path is a prefix of another path, indicating a parent-child relationship
2707
+ */
2708
+ isPathIncluded(parentPath, childPath) {
2709
+ if (parentPath.length >= childPath.length) return false;
2710
+ return parentPath.every((part, i) => part === childPath[i]);
2711
+ }
2712
+ /**
2713
+ * Merges section metadata when concatenating chunks, following these rules:
2223
2714
  * 1. Level: Always uses the lowest (most general) level between chunks
2224
2715
  * 2. Path selection:
2225
2716
  * - For parent-child relationships (one path includes the other), uses the child's path
@@ -4195,7 +4686,7 @@ class HtmlMetadataExtractorMiddleware {
4195
4686
  }
4196
4687
  title = title || "Untitled";
4197
4688
  title = title.replace(/\s+/g, " ").trim();
4198
- context.metadata.title = title;
4689
+ context.title = title;
4199
4690
  logger.debug(`Extracted title: "${title}" from ${context.source}`);
4200
4691
  } catch (error) {
4201
4692
  logger.error(`❌ Error extracting metadata from ${context.source}: ${error}`);
@@ -4653,7 +5144,7 @@ ${frame.content}
4653
5144
  * @param next The next middleware function in the pipeline.
4654
5145
  */
4655
5146
  async process(context, next) {
4656
- const contentType = context.options?.headers?.["content-type"] || context.metadata?.contentType || context.metadata?.mimeType;
5147
+ const contentType = context.options?.headers?.["content-type"] || context.contentType;
4657
5148
  if (contentType && typeof contentType === "string" && !MimeTypeUtils.isHtml(contentType)) {
4658
5149
  logger.debug(
4659
5150
  `Skipping Playwright rendering for ${context.source} - content type '${contentType}' is not HTML`
@@ -5014,6 +5505,7 @@ ${text.replace(/^\n+|\n+$/g, "")}
5014
5505
  context.content = markdown;
5015
5506
  logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
5016
5507
  }
5508
+ context.contentType = "text/markdown";
5017
5509
  } catch (error) {
5018
5510
  logger.error(
5019
5511
  `❌ Error converting HTML to Markdown for ${context.source}: ${error}`
@@ -5053,7 +5545,7 @@ class MarkdownMetadataExtractorMiddleware {
5053
5545
  if (match?.[1]) {
5054
5546
  title = match[1].trim();
5055
5547
  }
5056
- context.metadata.title = title;
5548
+ context.title = title;
5057
5549
  } catch (error) {
5058
5550
  context.errors.push(
5059
5551
  new Error(
@@ -5225,10 +5717,10 @@ function convertToString(content, charset) {
5225
5717
  }
5226
5718
  class BasePipeline {
5227
5719
  /**
5228
- * Determines if this pipeline can process the given content.
5720
+ * Determines if this pipeline can process content with the given MIME type.
5229
5721
  * Must be implemented by derived classes.
5230
5722
  */
5231
- canProcess(_rawContent) {
5723
+ canProcess(_mimeType, _content) {
5232
5724
  throw new Error("Method not implemented.");
5233
5725
  }
5234
5726
  /**
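`canProcess` now receives the MIME type (and optionally the raw bytes) instead of the whole raw-content object, so a pipeline can be chosen before the body is decoded. A sketch of the selection loop implied by that signature; the interface is reduced to what the diff shows, and the `process` signature here omits the options and fetcher arguments used by the real pipelines.

```ts
interface ContentPipelineLike {
  canProcess(mimeType: string, content?: Buffer): boolean;
  process(raw: { content: Buffer; mimeType: string; source: string }): Promise<unknown>;
}

// First pipeline that accepts the MIME type wins; TextPipeline (which also
// sniffs the bytes for binary data) acts as the universal fallback at the end.
function selectPipeline(
  pipelines: ContentPipelineLike[],
  mimeType: string,
  content: Buffer,
): ContentPipelineLike | undefined {
  return pipelines.find((p) => p.canProcess(mimeType, content));
}
```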
@@ -5289,11 +5781,12 @@ class HtmlPipeline extends BasePipeline {
5289
5781
  this.greedySplitter = new GreedySplitter(
5290
5782
  semanticSplitter,
5291
5783
  SPLITTER_MIN_CHUNK_SIZE,
5292
- preferredChunkSize
5784
+ preferredChunkSize,
5785
+ maxChunkSize
5293
5786
  );
5294
5787
  }
5295
- canProcess(rawContent) {
5296
- return MimeTypeUtils.isHtml(rawContent.mimeType);
5788
+ canProcess(mimeType) {
5789
+ return MimeTypeUtils.isHtml(mimeType);
5297
5790
  }
5298
5791
  async process(rawContent, options, fetcher) {
5299
5792
  const resolvedCharset = resolveCharset(
@@ -5304,8 +5797,9 @@ class HtmlPipeline extends BasePipeline {
5304
5797
  const contentString = convertToString(rawContent.content, resolvedCharset);
5305
5798
  const context = {
5306
5799
  content: contentString,
5800
+ contentType: rawContent.mimeType || "text/html",
5307
5801
  source: rawContent.source,
5308
- metadata: {},
5802
+ // metadata: {},
5309
5803
  links: [],
5310
5804
  errors: [],
5311
5805
  options,
@@ -5320,8 +5814,9 @@ class HtmlPipeline extends BasePipeline {
5320
5814
  typeof context.content === "string" ? context.content : ""
5321
5815
  );
5322
5816
  return {
5323
- textContent: typeof context.content === "string" ? context.content : "",
5324
- metadata: context.metadata,
5817
+ title: context.title,
5818
+ contentType: context.contentType,
5819
+ textContent: context.content,
5325
5820
  links: context.links,
5326
5821
  errors: context.errors,
5327
5822
  chunks
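Pipelines no longer return a free-form `metadata` object; the processed result now carries explicit `title` and `contentType` fields, which `BaseScraperStrategy.processBatch` forwards to the progress callback together with `etag`/`lastModified` from the fetch. A reduced view of the result shape as it appears in this hunk; the optional markers and element types are assumptions where the diff does not pin them down.

```ts
// Reduced to the fields returned by HtmlPipeline.process() in this hunk.
interface ProcessedContentLike {
  title?: string;
  contentType?: string;
  textContent: string;
  links: string[];
  errors: Error[];
  chunks: unknown[]; // chunk type not shown in this part of the diff
}
```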
@@ -5345,9 +5840,9 @@ class JsonPipeline extends BasePipeline {
5345
5840
  preserveFormatting: true
5346
5841
  });
5347
5842
  }
5348
- canProcess(rawContent) {
5349
- if (!rawContent.mimeType) return false;
5350
- return MimeTypeUtils.isJson(rawContent.mimeType);
5843
+ canProcess(mimeType) {
5844
+ if (!mimeType) return false;
5845
+ return MimeTypeUtils.isJson(mimeType);
5351
5846
  }
5352
5847
  async process(rawContent, options, fetcher) {
5353
5848
  const contentString = convertToString(rawContent.content, rawContent.charset);
@@ -5362,22 +5857,25 @@ class JsonPipeline extends BasePipeline {
5362
5857
  const fallbackChunks = await this.splitter.splitText(contentString);
5363
5858
  return {
5364
5859
  textContent: contentString,
5365
- metadata: {
5366
- isValidJson: false
5367
- },
5860
+ // metadata: {
5861
+ // isValidJson: false,
5862
+ // },
5368
5863
  links: [],
5369
5864
  errors: [],
5370
5865
  chunks: fallbackChunks
5371
5866
  };
5372
5867
  }
5868
+ const metadata = this.extractMetadata(parsedJson);
5373
5869
  const context = {
5374
5870
  content: contentString,
5375
5871
  source: rawContent.source,
5376
- metadata: {
5377
- ...this.extractMetadata(parsedJson),
5378
- isValidJson,
5379
- jsonStructure: this.analyzeJsonStructure(parsedJson)
5380
- },
5872
+ title: metadata.title,
5873
+ contentType: rawContent.mimeType || "application/json",
5874
+ // metadata: {
5875
+ // ...this.extractMetadata(parsedJson),
5876
+ // isValidJson,
5877
+ // jsonStructure: this.analyzeJsonStructure(parsedJson),
5878
+ // },
5381
5879
  links: [],
5382
5880
  // JSON files typically don't contain links
5383
5881
  errors: [],
@@ -5387,8 +5885,9 @@ class JsonPipeline extends BasePipeline {
5387
5885
  await this.executeMiddlewareStack(this.middleware, context);
5388
5886
  const chunks = await this.splitter.splitText(context.content);
5389
5887
  return {
5888
+ title: context.title,
5889
+ contentType: context.contentType,
5390
5890
  textContent: context.content,
5391
- metadata: context.metadata,
5392
5891
  links: context.links,
5393
5892
  errors: context.errors,
5394
5893
  chunks
@@ -5418,30 +5917,6 @@ class JsonPipeline extends BasePipeline {
5418
5917
  }
5419
5918
  return metadata;
5420
5919
  }
5421
- /**
5422
- * Analyzes the structure of valid JSON for metadata
5423
- */
5424
- analyzeJsonStructure(parsedJson) {
5425
- if (Array.isArray(parsedJson)) {
5426
- return {
5427
- type: "array",
5428
- depth: this.calculateDepth(parsedJson),
5429
- itemCount: parsedJson.length
5430
- };
5431
- } else if (typeof parsedJson === "object" && parsedJson !== null) {
5432
- const obj = parsedJson;
5433
- return {
5434
- type: "object",
5435
- depth: this.calculateDepth(parsedJson),
5436
- propertyCount: Object.keys(obj).length
5437
- };
5438
- } else {
5439
- return {
5440
- type: typeof parsedJson,
5441
- depth: 1
5442
- };
5443
- }
5444
- }
5445
5920
  /**
5446
5921
  * Calculates the maximum nesting depth of a JSON structure
5447
5922
  */
@@ -5482,19 +5957,20 @@ class MarkdownPipeline extends BasePipeline {
5482
5957
  this.greedySplitter = new GreedySplitter(
5483
5958
  semanticSplitter,
5484
5959
  SPLITTER_MIN_CHUNK_SIZE,
5485
- preferredChunkSize
5960
+ preferredChunkSize,
5961
+ maxChunkSize
5486
5962
  );
5487
5963
  }
5488
- canProcess(rawContent) {
5489
- if (!rawContent.mimeType) return false;
5490
- return MimeTypeUtils.isMarkdown(rawContent.mimeType);
5964
+ canProcess(mimeType) {
5965
+ if (!mimeType) return false;
5966
+ return MimeTypeUtils.isMarkdown(mimeType);
5491
5967
  }
5492
5968
  async process(rawContent, options, fetcher) {
5493
5969
  const contentString = convertToString(rawContent.content, rawContent.charset);
5494
5970
  const context = {
5971
+ contentType: rawContent.mimeType || "text/markdown",
5495
5972
  content: contentString,
5496
5973
  source: rawContent.source,
5497
- metadata: {},
5498
5974
  links: [],
5499
5975
  errors: [],
5500
5976
  options,
@@ -5506,8 +5982,9 @@ class MarkdownPipeline extends BasePipeline {
5506
5982
  rawContent.mimeType
5507
5983
  );
5508
5984
  return {
5985
+ title: context.title,
5986
+ contentType: context.contentType,
5509
5987
  textContent: typeof context.content === "string" ? context.content : "",
5510
- metadata: context.metadata,
5511
5988
  links: context.links,
5512
5989
  errors: context.errors,
5513
5990
  chunks
@@ -5517,24 +5994,27 @@ class MarkdownPipeline extends BasePipeline {
5517
5994
  class SourceCodePipeline extends BasePipeline {
5518
5995
  middleware;
5519
5996
  splitter;
5520
- constructor(chunkSize = SPLITTER_PREFERRED_CHUNK_SIZE) {
5997
+ constructor(_preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE) {
5521
5998
  super();
5522
5999
  this.middleware = [];
5523
- this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize: chunkSize });
6000
+ this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize });
5524
6001
  }
5525
- canProcess(rawContent) {
5526
- if (!rawContent.mimeType) return false;
5527
- return MimeTypeUtils.isSourceCode(rawContent.mimeType);
6002
+ canProcess(mimeType) {
6003
+ if (!mimeType) return false;
6004
+ return MimeTypeUtils.isSourceCode(mimeType);
5528
6005
  }
5529
6006
  async process(rawContent, options, fetcher) {
5530
6007
  const contentString = convertToString(rawContent.content, rawContent.charset);
5531
6008
  const context = {
6009
+ contentType: rawContent.mimeType || "text/plain",
5532
6010
  content: contentString,
5533
6011
  source: rawContent.source,
5534
- metadata: {
5535
- language: rawContent.mimeType ? MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType) : "text",
5536
- isSourceCode: true
5537
- },
6012
+ // metadata: {
6013
+ // language: rawContent.mimeType
6014
+ // ? MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType)
6015
+ // : "text",
6016
+ // isSourceCode: true,
6017
+ // },
5538
6018
  links: [],
5539
6019
  // Source code files typically don't contain web links
5540
6020
  errors: [],
@@ -5544,8 +6024,10 @@ class SourceCodePipeline extends BasePipeline {
5544
6024
  await this.executeMiddlewareStack(this.middleware, context);
5545
6025
  const chunks = await this.splitter.splitText(context.content, rawContent.mimeType);
5546
6026
  return {
6027
+ title: context.title,
6028
+ contentType: context.contentType,
5547
6029
  textContent: context.content,
5548
- metadata: context.metadata,
6030
+ // metadata: context.metadata,
5549
6031
  links: context.links,
5550
6032
  errors: context.errors,
5551
6033
  chunks
@@ -5594,17 +6076,22 @@ class TextDocumentSplitter {
5594
6076
  class TextPipeline extends BasePipeline {
5595
6077
  middleware;
5596
6078
  splitter;
5597
- constructor(chunkSize = SPLITTER_PREFERRED_CHUNK_SIZE) {
6079
+ constructor(preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE) {
5598
6080
  super();
5599
6081
  this.middleware = [];
5600
- const textSplitter = new TextDocumentSplitter({ maxChunkSize: chunkSize });
5601
- this.splitter = new GreedySplitter(textSplitter, SPLITTER_MIN_CHUNK_SIZE, chunkSize);
6082
+ const textSplitter = new TextDocumentSplitter({ maxChunkSize });
6083
+ this.splitter = new GreedySplitter(
6084
+ textSplitter,
6085
+ SPLITTER_MIN_CHUNK_SIZE,
6086
+ preferredChunkSize,
6087
+ maxChunkSize
6088
+ );
5602
6089
  }
5603
- canProcess(rawContent) {
5604
- if (!MimeTypeUtils.isSafeForTextProcessing(rawContent.mimeType)) {
6090
+ canProcess(mimeType, content) {
6091
+ if (!MimeTypeUtils.isSafeForTextProcessing(mimeType)) {
5605
6092
  return false;
5606
6093
  }
5607
- if (MimeTypeUtils.isBinary(rawContent.content)) {
6094
+ if (content && MimeTypeUtils.isBinary(content)) {
5608
6095
  return false;
5609
6096
  }
5610
6097
  return true;
@@ -5612,12 +6099,11 @@ class TextPipeline extends BasePipeline {
5612
6099
  async process(rawContent, options, fetcher) {
5613
6100
  const contentString = convertToString(rawContent.content, rawContent.charset);
5614
6101
  const context = {
6102
+ title: "",
6103
+ // Title extraction can be added in middleware if needed
6104
+ contentType: rawContent.mimeType || "text/plain",
5615
6105
  content: contentString,
5616
6106
  source: rawContent.source,
5617
- metadata: {
5618
- contentType: rawContent.mimeType || "text/plain",
5619
- isGenericText: true
5620
- },
5621
6107
  links: [],
5622
6108
  // Generic text content typically doesn't contain structured links
5623
6109
  errors: [],
@@ -5627,394 +6113,283 @@ class TextPipeline extends BasePipeline {
5627
6113
  await this.executeMiddlewareStack(this.middleware, context);
5628
6114
  const chunks = await this.splitter.splitText(context.content, rawContent.mimeType);
5629
6115
  return {
6116
+ title: context.title,
6117
+ contentType: context.contentType,
5630
6118
  textContent: context.content,
5631
- metadata: context.metadata,
5632
6119
  links: context.links,
5633
6120
  errors: context.errors,
5634
- chunks
5635
- };
5636
- }
5637
- }
5638
- let PipelineFactory$1 = class PipelineFactory {
5639
- /**
5640
- * Creates the standard set of content pipelines used by all scraper strategies.
5641
- * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
5642
- * Each pipeline now handles both preprocessing and content-specific splitting.
5643
- * TextPipeline is placed last as the universal fallback for unknown content types.
5644
- *
5645
- * @param config - Optional configuration for pipeline chunk sizes
5646
- * @returns Array of content pipelines in processing order
5647
- */
5648
- static createStandardPipelines(config) {
5649
- const preferredChunkSize = config?.chunkSizes?.preferred ?? SPLITTER_PREFERRED_CHUNK_SIZE;
5650
- const maxChunkSize = config?.chunkSizes?.max ?? 2e3;
5651
- return [
5652
- new JsonPipeline(preferredChunkSize),
5653
- new SourceCodePipeline(preferredChunkSize),
5654
- new HtmlPipeline(preferredChunkSize, maxChunkSize),
5655
- new MarkdownPipeline(preferredChunkSize, maxChunkSize),
5656
- new TextPipeline(preferredChunkSize)
5657
- // Universal fallback - must be last
5658
- ];
5659
- }
5660
- };
5661
- const DEFAULT_FILE_EXCLUSIONS = [
5662
- // CHANGELOG files (case variations)
5663
- "**/CHANGELOG.md",
5664
- "**/changelog.md",
5665
- "**/CHANGELOG.mdx",
5666
- "**/changelog.mdx",
5667
- // LICENSE files (case variations)
5668
- "**/LICENSE",
5669
- "**/LICENSE.md",
5670
- "**/license.md",
5671
- // CODE_OF_CONDUCT files (case variations)
5672
- "**/CODE_OF_CONDUCT.md",
5673
- "**/code_of_conduct.md",
5674
- // Test files
5675
- "**/*.test.*",
5676
- "**/*.spec.*",
5677
- "**/*_test.py",
5678
- "**/*_test.go",
5679
- // Package manager lock files
5680
- "**/*.lock",
5681
- "**/package-lock.json",
5682
- "**/yarn.lock",
5683
- "**/pnpm-lock.yaml",
5684
- "**/go.sum",
5685
- // Build artifacts
5686
- "**/*.min.js",
5687
- "**/*.min.css",
5688
- "**/*.map",
5689
- "**/*.d.ts",
5690
- // IDE/System files
5691
- "**/.DS_Store",
5692
- "**/Thumbs.db",
5693
- "**/*.swp",
5694
- "**/*.swo",
5695
- // Internal config files (using regex pattern)
5696
- "/.*\\.(ini|cfg|conf|log|pid)$/"
5697
- ];
5698
- const DEFAULT_FOLDER_EXCLUSIONS = [
5699
- // Archive and deprecated content (matches anywhere in path)
5700
- "**/archive/**",
5701
- "**/archived/**",
5702
- "**/deprecated/**",
5703
- "**/legacy/**",
5704
- "**/old/**",
5705
- "**/outdated/**",
5706
- "**/previous/**",
5707
- "**/superseded/**",
5708
- // Specific paths that don't follow the general pattern
5709
- "docs/old/**",
5710
- // Test directories
5711
- "**/test/**",
5712
- "**/tests/**",
5713
- "**/__tests__/**",
5714
- "**/spec/**",
5715
- // Build output directories
5716
- "**/dist/**",
5717
- "**/build/**",
5718
- "**/out/**",
5719
- "**/target/**",
5720
- "**/.next/**",
5721
- "**/.nuxt/**",
5722
- // IDE directories
5723
- "**/.vscode/**",
5724
- "**/.idea/**",
5725
- // Internationalization folders - non-English locales
5726
- "**/i18n/ar*/**",
5727
- "**/i18n/de*/**",
5728
- "**/i18n/es*/**",
5729
- "**/i18n/fr*/**",
5730
- "**/i18n/hi*/**",
5731
- "**/i18n/it*/**",
5732
- "**/i18n/ja*/**",
5733
- "**/i18n/ko*/**",
5734
- "**/i18n/nl*/**",
5735
- "**/i18n/pl*/**",
5736
- "**/i18n/pt*/**",
5737
- "**/i18n/ru*/**",
5738
- "**/i18n/sv*/**",
5739
- "**/i18n/th*/**",
5740
- "**/i18n/tr*/**",
5741
- "**/i18n/vi*/**",
5742
- "**/i18n/zh*/**",
5743
- // Common locale folder patterns
5744
- "**/zh-cn/**",
5745
- "**/zh-hk/**",
5746
- "**/zh-mo/**",
5747
- "**/zh-sg/**",
5748
- "**/zh-tw/**"
5749
- ];
5750
- const DEFAULT_EXCLUSION_PATTERNS = [
5751
- ...DEFAULT_FILE_EXCLUSIONS,
5752
- ...DEFAULT_FOLDER_EXCLUSIONS
5753
- ];
5754
- function getEffectiveExclusionPatterns(userPatterns) {
5755
- if (userPatterns !== void 0) {
5756
- return userPatterns;
6121
+ chunks
6122
+ };
5757
6123
  }
5758
- return DEFAULT_EXCLUSION_PATTERNS;
5759
- }
5760
- function isRegexPattern(pattern) {
5761
- return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
5762
6124
  }
5763
- function patternToRegExp(pattern) {
5764
- if (isRegexPattern(pattern)) {
5765
- return new RegExp(pattern.slice(1, -1));
6125
+ let PipelineFactory$1 = class PipelineFactory {
6126
+ /**
6127
+ * Creates the standard set of content pipelines used by all scraper strategies.
6128
+ * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
6129
+ * Each pipeline now handles both preprocessing and content-specific splitting.
6130
+ * TextPipeline is placed last as the universal fallback for unknown content types.
6131
+ *
6132
+ * @param config - Optional configuration for pipeline chunk sizes
6133
+ * @returns Array of content pipelines in processing order
6134
+ */
6135
+ static createStandardPipelines(config) {
6136
+ const preferredChunkSize = config?.chunkSizes?.preferred ?? SPLITTER_PREFERRED_CHUNK_SIZE;
6137
+ const maxChunkSize = config?.chunkSizes?.max ?? 2e3;
6138
+ return [
6139
+ new JsonPipeline(preferredChunkSize),
6140
+ new SourceCodePipeline(preferredChunkSize, maxChunkSize),
6141
+ new HtmlPipeline(preferredChunkSize, maxChunkSize),
6142
+ new MarkdownPipeline(preferredChunkSize, maxChunkSize),
6143
+ new TextPipeline(preferredChunkSize, maxChunkSize)
6144
+ // Universal fallback - must be last
6145
+ ];
5766
6146
  }
5767
- const re = minimatch.makeRe(pattern, { dot: true });
5768
- if (!re) throw new Error(`Invalid glob pattern: ${pattern}`);
5769
- return re;
5770
- }
5771
- function matchesAnyPattern(path2, patterns) {
5772
- if (!patterns || patterns.length === 0) return false;
5773
- const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
5774
- return patterns.some((pattern) => {
5775
- if (isRegexPattern(pattern)) {
5776
- return patternToRegExp(pattern).test(normalizedPath);
5777
- }
5778
- return minimatch(normalizedPath.replace(/^\//, ""), pattern, { dot: true });
5779
- });
5780
- }
5781
- function extractPathAndQuery(url) {
5782
- try {
5783
- const u = new URL(url);
5784
- return u.pathname + (u.search || "");
5785
- } catch {
5786
- return url;
6147
+ };
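In the rewritten factory above, the max chunk size is now threaded into the SourceCodePipeline and TextPipeline as well, not only the HTML and Markdown pipelines. A minimal usage sketch, assuming access to the internal (unexported) PipelineFactory class bundled in dist/index.js:

    // Illustrative only; PipelineFactory is not a documented public export.
    const pipelines = PipelineFactory.createStandardPipelines({
      chunkSizes: { preferred: 1000, max: 2000 },
    });
    // Pipelines are tried in declaration order; TextPipeline is the universal fallback.
    const buffer = Buffer.from("# Hello");
    const pipeline = pipelines.find((p) => p.canProcess("text/markdown", buffer));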
6148
+ class GitHubRepoProcessor {
6149
+ httpFetcher = new HttpFetcher();
6150
+ pipelines;
6151
+ constructor() {
6152
+ this.pipelines = PipelineFactory$1.createStandardPipelines();
5787
6153
  }
5788
- }
5789
- function shouldIncludeUrl(url, includePatterns, excludePatterns) {
5790
- const path2 = extractPathAndQuery(url);
5791
- const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
5792
- let basename;
5793
- if (url.startsWith("file://")) {
5794
- try {
5795
- const u = new URL(url);
5796
- basename = u.pathname ? u.pathname.split("/").pop() : void 0;
5797
- } catch {
6154
+ /**
6155
+ * Parses an HTTPS blob URL to extract repository information.
6156
+ * Format: https://github.com/owner/repo/blob/branch/filepath
6157
+ */
6158
+ parseHttpsBlobUrl(url) {
6159
+ const parsedUrl = new URL(url);
6160
+ const segments = parsedUrl.pathname.split("/").filter(Boolean);
6161
+ if (segments.length < 5 || segments[2] !== "blob") {
6162
+ throw new Error(
6163
+ `Invalid GitHub blob URL format. Expected: https://github.com/owner/repo/blob/branch/filepath. Got: ${url}`
6164
+ );
5798
6165
  }
6166
+ const owner = segments[0];
6167
+ const repo = segments[1];
6168
+ const branch = segments[3];
6169
+ const filePath = segments.slice(4).join("/");
6170
+ return { owner, repo, branch, filePath };
5799
6171
  }
5800
- const stripSlash = (patterns) => patterns?.map((p) => p.startsWith("/") ? p.slice(1) : p);
5801
- const effectiveExcludePatterns = getEffectiveExclusionPatterns(excludePatterns);
5802
- if (matchesAnyPattern(normalizedPath, effectiveExcludePatterns) || basename && matchesAnyPattern(basename, stripSlash(effectiveExcludePatterns)))
5803
- return false;
5804
- if (!includePatterns || includePatterns.length === 0) return true;
5805
- return matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
5806
- }
5807
- function computeBaseDirectory(pathname) {
5808
- if (pathname === "") return "/";
5809
- if (pathname.endsWith("/")) return pathname;
5810
- const lastSegment = pathname.split("/").at(-1) || "";
5811
- const looksLikeFile = lastSegment.includes(".");
5812
- if (looksLikeFile) {
5813
- return pathname.replace(/\/[^/]*$/, "/");
6172
+ /**
6173
+ * Fetches the raw content of a file from GitHub.
6174
+ */
6175
+ async fetchFileContent(repoInfo, filePath, etag, signal) {
6176
+ const { owner, repo, branch } = repoInfo;
6177
+ const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
6178
+ const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag });
6179
+ const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
6180
+ if (detectedMimeType && rawContent.mimeType === "text/plain") {
6181
+ return {
6182
+ ...rawContent,
6183
+ mimeType: detectedMimeType
6184
+ };
6185
+ }
6186
+ return rawContent;
5814
6187
  }
5815
- return `${pathname}/`;
5816
- }
5817
- function isInScope(baseUrl, targetUrl, scope) {
5818
- if (baseUrl.protocol !== targetUrl.protocol) return false;
5819
- switch (scope) {
5820
- case "subpages": {
5821
- if (baseUrl.hostname !== targetUrl.hostname) return false;
5822
- const baseDir = computeBaseDirectory(baseUrl.pathname);
5823
- return targetUrl.pathname.startsWith(baseDir);
6188
+ /**
6189
+ * Processes a single GitHub repository file from an HTTPS blob URL.
6190
+ */
6191
+ async process(item, options, signal) {
6192
+ const repoInfo = this.parseHttpsBlobUrl(item.url);
6193
+ const { owner, repo, branch, filePath } = repoInfo;
6194
+ const rawContent = await this.fetchFileContent(
6195
+ { owner, repo, branch },
6196
+ filePath,
6197
+ item.etag,
6198
+ signal
6199
+ );
6200
+ if (rawContent.status !== FetchStatus.SUCCESS) {
6201
+ return { url: item.url, links: [], status: rawContent.status };
5824
6202
  }
5825
- case "hostname":
5826
- return baseUrl.hostname === targetUrl.hostname;
5827
- case "domain": {
5828
- return extractPrimaryDomain(baseUrl.hostname) === extractPrimaryDomain(targetUrl.hostname);
6203
+ let processed;
6204
+ for (const pipeline of this.pipelines) {
6205
+ const contentBuffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
6206
+ if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
6207
+ logger.debug(
6208
+ `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
6209
+ );
6210
+ const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6211
+ processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
6212
+ break;
6213
+ }
5829
6214
  }
5830
- default:
5831
- return false;
6215
+ if (!processed) {
6216
+ logger.warn(
6217
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
6218
+ );
6219
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
6220
+ }
6221
+ for (const err of processed.errors ?? []) {
6222
+ logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
6223
+ }
6224
+ const githubUrl = `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`;
6225
+ const filename = filePath.split("/").pop() || "Untitled";
6226
+ return {
6227
+ url: githubUrl,
6228
+ title: processed.title?.trim() || filename || "Untitled",
6229
+ etag: rawContent.etag,
6230
+ lastModified: rawContent.lastModified,
6231
+ contentType: rawContent.mimeType,
6232
+ content: processed,
6233
+ links: [],
6234
+ // Always return empty links array for individual files
6235
+ status: FetchStatus.SUCCESS
6236
+ };
6237
+ }
6238
+ /**
6239
+ * Cleanup resources used by this processor.
6240
+ */
6241
+ async cleanup() {
6242
+ await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
5832
6243
  }
5833
6244
  }
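For reference, the blob-URL parsing above maps a GitHub HTTPS URL onto repository parts as follows (hypothetical values):

    // parseHttpsBlobUrl("https://github.com/owner/repo/blob/main/docs/guide.md")
    //   -> { owner: "owner", repo: "repo", branch: "main", filePath: "docs/guide.md" }
    // URLs with fewer than five path segments, or whose third segment is not "blob",
    // throw the "Invalid GitHub blob URL format" error shown above.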
5834
- const DEFAULT_MAX_DEPTH = 3;
5835
- const DEFAULT_CONCURRENCY = 3;
5836
- class BaseScraperStrategy {
5837
- visited = /* @__PURE__ */ new Set();
5838
- pageCount = 0;
5839
- totalDiscovered = 0;
5840
- // Track total URLs discovered (unlimited)
5841
- effectiveTotal = 0;
5842
- // Track effective total (limited by maxPages)
5843
- canonicalBaseUrl;
5844
- options;
5845
- constructor(options = {}) {
5846
- this.options = options;
6245
+ class GitHubWikiProcessor {
6246
+ httpFetcher = new HttpFetcher();
6247
+ pipelines;
6248
+ constructor() {
6249
+ this.pipelines = PipelineFactory$1.createStandardPipelines();
5847
6250
  }
5848
6251
  /**
5849
- * Determines if a URL should be processed based on scope and include/exclude patterns in ScraperOptions.
5850
- * Scope is checked first, then patterns.
6252
+ * Parses a GitHub wiki URL to extract repository information.
6253
+ */
6254
+ parseGitHubWikiUrl(url) {
6255
+ const parsedUrl = new URL(url);
6256
+ const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/);
6257
+ if (!match) {
6258
+ throw new Error(`Invalid GitHub wiki URL: ${url}`);
6259
+ }
6260
+ const [, owner, repo] = match;
6261
+ return { owner, repo };
6262
+ }
6263
+ /**
6264
+ * Determines if a URL should be processed within the wiki scope.
5851
6265
  */
5852
6266
  shouldProcessUrl(url, options) {
5853
- if (options.scope) {
5854
- try {
5855
- const base = this.canonicalBaseUrl ?? new URL$1(options.url);
5856
- const target = new URL$1(url);
5857
- if (!isInScope(base, target, options.scope)) return false;
5858
- } catch {
6267
+ try {
6268
+ const parsedUrl = new URL(url);
6269
+ const baseWikiInfo = this.parseGitHubWikiUrl(options.url);
6270
+ const expectedWikiPath = `/${baseWikiInfo.owner}/${baseWikiInfo.repo}/wiki`;
6271
+ if (!parsedUrl.pathname.startsWith(expectedWikiPath)) {
5859
6272
  return false;
5860
6273
  }
6274
+ const wikiPagePath = parsedUrl.pathname.replace(expectedWikiPath, "").replace(/^\//, "");
6275
+ return shouldIncludeUrl(
6276
+ wikiPagePath || "Home",
6277
+ options.includePatterns,
6278
+ options.excludePatterns
6279
+ );
6280
+ } catch {
6281
+ return false;
5861
6282
  }
5862
- return shouldIncludeUrl(url, options.includePatterns, options.excludePatterns);
5863
6283
  }
5864
- // Removed getProcessor method as processing is now handled by strategies using middleware pipelines
5865
- async processBatch(batch, baseUrl, options, progressCallback, signal) {
5866
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
5867
- const results = await Promise.all(
5868
- batch.map(async (item) => {
5869
- if (signal?.aborted) {
5870
- throw new CancellationError("Scraping cancelled during batch processing");
5871
- }
5872
- const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
5873
- if (item.depth > maxDepth) {
5874
- return [];
5875
- }
5876
- try {
5877
- const result = await this.processItem(item, options, void 0, signal);
5878
- if (item.depth === 0 && !this.canonicalBaseUrl && result?.finalUrl) {
5879
- try {
5880
- const finalUrlStr = result.finalUrl;
5881
- const original = new URL$1(options.url);
5882
- const finalUrlObj = new URL$1(finalUrlStr);
5883
- if (finalUrlObj.href !== original.href && (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:")) {
5884
- this.canonicalBaseUrl = finalUrlObj;
5885
- logger.debug(
5886
- `Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`
5887
- );
5888
- } else {
5889
- this.canonicalBaseUrl = original;
5890
- }
5891
- } catch {
5892
- this.canonicalBaseUrl = new URL$1(options.url);
5893
- }
5894
- }
5895
- if (result.document) {
5896
- this.pageCount++;
5897
- logger.info(
5898
- `🌐 Scraping page ${this.pageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`
5899
- );
5900
- await progressCallback({
5901
- pagesScraped: this.pageCount,
5902
- totalPages: this.effectiveTotal,
5903
- totalDiscovered: this.totalDiscovered,
5904
- currentUrl: item.url,
5905
- depth: item.depth,
5906
- maxDepth,
5907
- document: result.document
5908
- });
5909
- }
5910
- const nextItems = result.links || [];
5911
- return nextItems.map((value) => {
5912
- try {
5913
- const targetUrl = new URL$1(value, baseUrl);
5914
- if (!this.shouldProcessUrl(targetUrl.href, options)) {
5915
- return null;
5916
- }
5917
- return {
5918
- url: targetUrl.href,
5919
- depth: item.depth + 1
5920
- };
5921
- } catch (_error) {
5922
- logger.warn(`❌ Invalid URL: ${value}`);
5923
- }
5924
- return null;
5925
- }).filter((item2) => item2 !== null);
5926
- } catch (error) {
5927
- if (options.ignoreErrors) {
5928
- logger.error(`❌ Failed to process ${item.url}: ${error}`);
5929
- return [];
5930
- }
5931
- throw error;
5932
- }
5933
- })
5934
- );
5935
- const allLinks = results.flat();
5936
- const uniqueLinks = [];
5937
- for (const item of allLinks) {
5938
- const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
5939
- if (!this.visited.has(normalizedUrl)) {
5940
- this.visited.add(normalizedUrl);
5941
- uniqueLinks.push(item);
5942
- this.totalDiscovered++;
5943
- if (this.effectiveTotal < maxPages) {
5944
- this.effectiveTotal++;
6284
+ /**
6285
+ * Processes a single GitHub wiki page.
6286
+ */
6287
+ async process(item, options, signal) {
6288
+ const currentUrl = item.url;
6289
+ try {
6290
+ const rawContent = await this.httpFetcher.fetch(currentUrl, {
6291
+ signal,
6292
+ etag: item.etag
6293
+ });
6294
+ if (rawContent.status !== FetchStatus.SUCCESS) {
6295
+ return { url: currentUrl, links: [], status: rawContent.status };
6296
+ }
6297
+ let processed;
6298
+ for (const pipeline of this.pipelines) {
6299
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
6300
+ logger.debug(
6301
+ `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`
6302
+ );
6303
+ const wikiOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6304
+ processed = await pipeline.process(rawContent, wikiOptions, this.httpFetcher);
6305
+ break;
5945
6306
  }
5946
6307
  }
5947
- }
5948
- return uniqueLinks;
5949
- }
5950
- async scrape(options, progressCallback, signal) {
5951
- this.visited.clear();
5952
- this.pageCount = 0;
5953
- this.totalDiscovered = 1;
5954
- this.effectiveTotal = 1;
5955
- this.canonicalBaseUrl = new URL$1(options.url);
5956
- let baseUrl = this.canonicalBaseUrl;
5957
- const queue = [{ url: options.url, depth: 0 }];
5958
- this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
5959
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
5960
- const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
5961
- while (queue.length > 0 && this.pageCount < maxPages) {
5962
- if (signal?.aborted) {
5963
- logger.debug("Scraping cancelled by signal.");
5964
- throw new CancellationError("Scraping cancelled by signal");
6308
+ if (!processed) {
6309
+ logger.warn(
6310
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`
6311
+ );
6312
+ return { url: currentUrl, links: [], status: FetchStatus.SUCCESS };
5965
6313
  }
5966
- const remainingPages = maxPages - this.pageCount;
5967
- if (remainingPages <= 0) {
5968
- break;
6314
+ for (const err of processed.errors ?? []) {
6315
+ logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`);
5969
6316
  }
5970
- const batchSize = Math.min(
5971
- maxConcurrency,
5972
- // Use variable
5973
- remainingPages,
5974
- queue.length
5975
- );
5976
- const batch = queue.splice(0, batchSize);
5977
- baseUrl = this.canonicalBaseUrl ?? baseUrl;
5978
- const newUrls = await this.processBatch(
5979
- batch,
5980
- baseUrl,
5981
- options,
5982
- progressCallback,
5983
- signal
5984
- );
5985
- queue.push(...newUrls);
6317
+ const parsedUrl = new URL(currentUrl);
6318
+ const wikiInfo = this.parseGitHubWikiUrl(currentUrl);
6319
+ const wikiPagePath = parsedUrl.pathname.replace(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`, "").replace(/^\//, "");
6320
+ const pageTitle = wikiPagePath || "Home";
6321
+ const links = processed.links || [];
6322
+ const wikiLinks = links.filter((link) => {
6323
+ if (!link || link.trim() === "" || link === "invalid-url" || link === "not-a-url-at-all") {
6324
+ return false;
6325
+ }
6326
+ return true;
6327
+ }).map((link) => {
6328
+ try {
6329
+ return new URL(link, currentUrl).href;
6330
+ } catch {
6331
+ return null;
6332
+ }
6333
+ }).filter((link) => link !== null).filter((link) => {
6334
+ try {
6335
+ const linkUrl = new URL(link);
6336
+ return linkUrl.hostname === parsedUrl.hostname && linkUrl.pathname.startsWith(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`);
6337
+ } catch {
6338
+ return false;
6339
+ }
6340
+ });
6341
+ return {
6342
+ url: currentUrl,
6343
+ title: pageTitle,
6344
+ etag: rawContent.etag,
6345
+ lastModified: rawContent.lastModified,
6346
+ contentType: rawContent.mimeType,
6347
+ content: processed,
6348
+ links: wikiLinks,
6349
+ status: FetchStatus.SUCCESS
6350
+ };
6351
+ } catch (error) {
6352
+ logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`);
6353
+ return { url: currentUrl, links: [], status: FetchStatus.SUCCESS };
5986
6354
  }
5987
6355
  }
5988
6356
  /**
5989
- * Cleanup resources used by this strategy.
5990
- * Default implementation does nothing - override in derived classes as needed.
6357
+ * Cleanup resources used by this processor.
5991
6358
  */
5992
6359
  async cleanup() {
6360
+ await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
5993
6361
  }
5994
6362
  }
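A worked example of the wiki scoping implemented above, using a hypothetical URL:

    // For "https://github.com/owner/repo/wiki/Getting-Started":
    //   parseGitHubWikiUrl(url) -> { owner: "owner", repo: "repo" }
    //   page title              -> "Getting-Started" (a bare ".../wiki" URL maps to "Home")
    // Only links on the same host whose path stays under /owner/repo/wiki are followed.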
5995
- class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6363
+ class GitHubScraperStrategy extends BaseScraperStrategy {
5996
6364
  httpFetcher = new HttpFetcher();
5997
- pipelines;
5998
- resolvedBranch;
5999
- // Cache the resolved default branch
6000
- constructor() {
6001
- super();
6002
- this.pipelines = PipelineFactory$1.createStandardPipelines();
6003
- }
6365
+ wikiProcessor = new GitHubWikiProcessor();
6366
+ repoProcessor = new GitHubRepoProcessor();
6004
6367
  canHandle(url) {
6005
- const { hostname } = new URL(url);
6006
- return ["github.com", "www.github.com"].includes(hostname);
6007
- }
6008
- /**
6009
- * Override shouldProcessUrl to handle github-file:// URLs specially.
6010
- * These URLs bypass scope checking since they're internal file references.
6011
- */
6012
- shouldProcessUrl(url, options) {
6013
6368
  if (url.startsWith("github-file://")) {
6014
- const filePath = url.replace("github-file://", "");
6015
- return shouldIncludeUrl(filePath, options.includePatterns, options.excludePatterns);
6369
+ return true;
6370
+ }
6371
+ try {
6372
+ const parsedUrl = new URL(url);
6373
+ const { hostname, pathname } = parsedUrl;
6374
+ if (!["github.com", "www.github.com"].includes(hostname)) {
6375
+ return false;
6376
+ }
6377
+ const baseMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
6378
+ if (baseMatch) {
6379
+ return true;
6380
+ }
6381
+ const treeMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/tree\//);
6382
+ if (treeMatch) {
6383
+ return true;
6384
+ }
6385
+ const blobMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/blob\//);
6386
+ if (blobMatch) {
6387
+ return true;
6388
+ }
6389
+ return false;
6390
+ } catch {
6391
+ return false;
6016
6392
  }
6017
- return super.shouldProcessUrl(url, options);
6018
6393
  }
6019
6394
  /**
6020
6395
  * Parses a GitHub URL to extract repository information.
@@ -6028,20 +6403,19 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6028
6403
  const [, owner, repo] = match;
6029
6404
  const segments = parsedUrl.pathname.split("/").filter(Boolean);
6030
6405
  if (segments.length >= 4 && segments[2] === "blob") {
6031
- const branch2 = segments[3];
6406
+ const branch = segments[3];
6032
6407
  const filePath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
6033
- return { owner, repo, branch: branch2, filePath, isBlob: true };
6408
+ return { owner, repo, branch, filePath, isBlob: true };
6034
6409
  }
6035
- if (segments.length < 4 || segments[2] !== "tree") {
6036
- return { owner, repo };
6410
+ if (segments.length >= 4 && segments[2] === "tree") {
6411
+ const branch = segments[3];
6412
+ const subPath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
6413
+ return { owner, repo, branch, subPath };
6037
6414
  }
6038
- const branch = segments[3];
6039
- const subPath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
6040
- return { owner, repo, branch, subPath };
6415
+ return { owner, repo };
6041
6416
  }
6042
6417
  /**
6043
6418
  * Fetches the repository tree structure from GitHub API.
6044
- * Uses 'HEAD' to get the default branch if no branch is specified.
6045
6419
  */
6046
6420
  async fetchRepositoryTree(repoInfo, signal) {
6047
6421
  const { owner, repo, branch } = repoInfo;
@@ -6060,7 +6434,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6060
6434
  targetBranch = "main";
6061
6435
  }
6062
6436
  }
6063
- this.resolvedBranch = targetBranch;
6064
6437
  const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
6065
6438
  logger.debug(`Fetching repository tree: ${treeUrl}`);
6066
6439
  const rawContent = await this.httpFetcher.fetch(treeUrl, { signal });
@@ -6082,14 +6455,12 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6082
6455
  }
6083
6456
  const path2 = item.path;
6084
6457
  const textExtensions = [
6085
- // Documentation
6086
6458
  ".md",
6087
6459
  ".mdx",
6088
6460
  ".txt",
6089
6461
  ".rst",
6090
6462
  ".adoc",
6091
6463
  ".asciidoc",
6092
- // Web technologies
6093
6464
  ".html",
6094
6465
  ".htm",
6095
6466
  ".xml",
@@ -6097,7 +6468,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6097
6468
  ".scss",
6098
6469
  ".sass",
6099
6470
  ".less",
6100
- // Programming languages
6101
6471
  ".js",
6102
6472
  ".jsx",
6103
6473
  ".ts",
@@ -6133,7 +6503,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6133
6503
  ".ps1",
6134
6504
  ".bat",
6135
6505
  ".cmd",
6136
- // Configuration and data
6137
6506
  ".json",
6138
6507
  ".yaml",
6139
6508
  ".yml",
@@ -6147,7 +6516,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6147
6516
  ".dockerignore",
6148
6517
  ".gitattributes",
6149
6518
  ".editorconfig",
6150
- // Build and package management
6151
6519
  ".gradle",
6152
6520
  ".pom",
6153
6521
  ".sbt",
@@ -6156,10 +6524,7 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6156
6524
  ".make",
6157
6525
  ".dockerfile",
6158
6526
  ".mod",
6159
- // Go modules (go.mod)
6160
6527
  ".sum",
6161
- // Go checksums (go.sum)
6162
- // Other text formats
6163
6528
  ".sql",
6164
6529
  ".graphql",
6165
6530
  ".gql",
@@ -6172,20 +6537,16 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6172
6537
  ];
6173
6538
  const pathLower = path2.toLowerCase();
6174
6539
  const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext));
6175
- const hasCompoundExtension = pathLower.includes(".env.") || // .env.example, .env.local, etc.
6176
- pathLower.endsWith(".env") || pathLower.includes(".config.") || // webpack.config.js, etc.
6177
- pathLower.includes(".lock");
6540
+ const hasCompoundExtension = pathLower.includes(".env.") || pathLower.endsWith(".env") || pathLower.includes(".config.") || pathLower.includes(".lock");
6178
6541
  const fileName = path2.split("/").pop() || "";
6179
6542
  const fileNameLower = fileName.toLowerCase();
6180
6543
  const commonTextFiles = [
6181
- // Documentation files without extensions
6182
6544
  "readme",
6183
6545
  "license",
6184
6546
  "changelog",
6185
6547
  "contributing",
6186
6548
  "authors",
6187
6549
  "maintainers",
6188
- // Build files without extensions
6189
6550
  "dockerfile",
6190
6551
  "makefile",
6191
6552
  "rakefile",
@@ -6193,374 +6554,125 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
6193
6554
  "podfile",
6194
6555
  "cartfile",
6195
6556
  "brewfile",
6196
- "procfile",
6197
- "vagrantfile",
6198
- "gulpfile",
6199
- "gruntfile",
6200
- // Configuration files (dotfiles)
6201
- ".prettierrc",
6202
- ".eslintrc",
6203
- ".babelrc",
6204
- ".nvmrc",
6205
- ".npmrc"
6206
- ];
6207
- const isCommonTextFile = commonTextFiles.some((name2) => {
6208
- if (name2.startsWith(".")) {
6209
- return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6210
- }
6211
- return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6212
- });
6213
- if (!hasTextExtension && !hasCompoundExtension && !isCommonTextFile) {
6214
- return false;
6215
- }
6216
- return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
6217
- }
6218
- /**
6219
- * Fetches the raw content of a file from GitHub.
6220
- */
6221
- async fetchFileContent(repoInfo, filePath, signal) {
6222
- const { owner, repo } = repoInfo;
6223
- const branch = this.resolvedBranch || repoInfo.branch || "main";
6224
- const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
6225
- const rawContent = await this.httpFetcher.fetch(rawUrl, { signal });
6226
- const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
6227
- if (detectedMimeType && rawContent.mimeType === "text/plain") {
6228
- return {
6229
- ...rawContent,
6230
- mimeType: detectedMimeType
6231
- };
6232
- }
6233
- return rawContent;
6234
- }
6235
- async processItem(item, options, _progressCallback, signal) {
6236
- const repoInfo = this.parseGitHubUrl(options.url);
6237
- if (item.depth === 0) {
6238
- if ("isBlob" in repoInfo && repoInfo.isBlob) {
6239
- if (repoInfo.filePath) {
6240
- logger.info(
6241
- `📄 Processing single file: ${repoInfo.owner}/${repoInfo.repo}/${repoInfo.filePath}`
6242
- );
6243
- return { links: [`github-file://${repoInfo.filePath}`] };
6244
- } else {
6245
- logger.warn(
6246
- `⚠️ Blob URL without file path: ${options.url}. No files to process.`
6247
- );
6248
- return { links: [] };
6249
- }
6250
- }
6251
- logger.info(
6252
- `🗂️ Discovering repository structure for ${repoInfo.owner}/${repoInfo.repo}`
6253
- );
6254
- const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
6255
- const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
6256
- logger.info(
6257
- `📁 Found ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
6258
- );
6259
- const links = fileItems.map((treeItem) => `github-file://${treeItem.path}`);
6260
- return { links };
6261
- }
6262
- if (item.url.startsWith("github-file://")) {
6263
- const filePath = item.url.replace("github-file://", "");
6264
- logger.info(
6265
- `🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`
6266
- );
6267
- const rawContent = await this.fetchFileContent(repoInfo, filePath, signal);
6268
- let processed;
6269
- for (const pipeline of this.pipelines) {
6270
- if (pipeline.canProcess(rawContent)) {
6271
- logger.debug(
6272
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
6273
- );
6274
- const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6275
- processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
6276
- break;
6277
- }
6278
- }
6279
- if (!processed) {
6280
- logger.warn(
6281
- `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
6282
- );
6283
- return { document: void 0, links: [] };
6284
- }
6285
- for (const err of processed.errors) {
6286
- logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
6287
- }
6288
- const githubUrl = `https://github.com/${repoInfo.owner}/${repoInfo.repo}/blob/${this.resolvedBranch || repoInfo.branch || "main"}/${filePath}`;
6289
- const processedTitle = processed.metadata.title;
6290
- const hasValidTitle = typeof processedTitle === "string" && processedTitle.trim() !== "";
6291
- const fallbackTitle = filePath.split("/").pop() || "Untitled";
6292
- return {
6293
- document: {
6294
- content: typeof processed.textContent === "string" ? processed.textContent : "",
6295
- metadata: {
6296
- url: githubUrl,
6297
- title: hasValidTitle ? processedTitle : fallbackTitle,
6298
- library: options.library,
6299
- version: options.version
6300
- },
6301
- contentType: rawContent.mimeType
6302
- // Preserve the detected MIME type
6303
- },
6304
- links: []
6305
- // Always return empty links array for individual files
6306
- };
6307
- }
6308
- return { document: void 0, links: [] };
6309
- }
6310
- /**
6311
- * Normalize a path by removing leading and trailing slashes.
6312
- */
6313
- normalizePath(path2) {
6314
- return path2.replace(/^\/+/, "").replace(/\/+$/, "");
6315
- }
6316
- isWithinSubPath(path2, subPath) {
6317
- if (!subPath) {
6318
- return true;
6319
- }
6320
- const trimmedSubPath = this.normalizePath(subPath);
6321
- if (trimmedSubPath.length === 0) {
6322
- return true;
6323
- }
6324
- const normalizedPath = this.normalizePath(path2);
6325
- if (normalizedPath === trimmedSubPath) {
6326
- return true;
6327
- }
6328
- return normalizedPath.startsWith(`${trimmedSubPath}/`);
6329
- }
6330
- async scrape(options, progressCallback, signal) {
6331
- const url = new URL(options.url);
6332
- if (!url.hostname.includes("github.com")) {
6333
- throw new Error("URL must be a GitHub URL");
6334
- }
6335
- return super.scrape(options, progressCallback, signal);
6336
- }
6337
- /**
6338
- * Cleanup resources used by this strategy, specifically the pipeline browser instances.
6339
- */
6340
- async cleanup() {
6341
- await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
6342
- }
6343
- }
6344
- class GitHubWikiScraperStrategy extends BaseScraperStrategy {
6345
- httpFetcher = new HttpFetcher();
6346
- pipelines;
6347
- constructor() {
6348
- super();
6349
- this.pipelines = PipelineFactory$1.createStandardPipelines();
6350
- }
6351
- canHandle(url) {
6352
- try {
6353
- const parsedUrl = new URL(url);
6354
- const { hostname, pathname } = parsedUrl;
6355
- return ["github.com", "www.github.com"].includes(hostname) && pathname.includes("/wiki") && pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/) !== null;
6356
- } catch {
6357
- return false;
6358
- }
6359
- }
6360
- /**
6361
- * Parses a GitHub wiki URL to extract repository information.
6362
- */
6363
- parseGitHubWikiUrl(url) {
6364
- const parsedUrl = new URL(url);
6365
- const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/);
6366
- if (!match) {
6367
- throw new Error(`Invalid GitHub wiki URL: ${url}`);
6368
- }
6369
- const [, owner, repo] = match;
6370
- return { owner, repo };
6371
- }
6372
- /**
6373
- * Override shouldProcessUrl to only process URLs within the wiki scope.
6374
- */
6375
- shouldProcessUrl(url, options) {
6376
- try {
6377
- const parsedUrl = new URL(url);
6378
- const wikiInfo = this.parseGitHubWikiUrl(options.url);
6379
- const expectedWikiPath = `/${wikiInfo.owner}/${wikiInfo.repo}/wiki`;
6380
- if (!parsedUrl.pathname.startsWith(expectedWikiPath)) {
6381
- return false;
6382
- }
6383
- const wikiPagePath = parsedUrl.pathname.replace(expectedWikiPath, "").replace(/^\//, "");
6384
- return shouldIncludeUrl(
6385
- wikiPagePath || "Home",
6386
- options.includePatterns,
6387
- options.excludePatterns
6388
- );
6389
- } catch {
6390
- return false;
6391
- }
6392
- }
6393
- async processItem(item, options, _progressCallback, signal) {
6394
- const currentUrl = item.url;
6395
- logger.info(
6396
- `📖 Processing wiki page ${this.pageCount}/${options.maxPages}: ${currentUrl}`
6397
- );
6398
- try {
6399
- const rawContent = await this.httpFetcher.fetch(currentUrl, { signal });
6400
- let processed;
6401
- for (const pipeline of this.pipelines) {
6402
- if (pipeline.canProcess(rawContent)) {
6403
- logger.debug(
6404
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`
6405
- );
6406
- const wikiOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
6407
- processed = await pipeline.process(rawContent, wikiOptions, this.httpFetcher);
6408
- break;
6409
- }
6410
- }
6411
- if (!processed) {
6412
- logger.warn(
6413
- `⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`
6414
- );
6415
- return { document: void 0, links: [] };
6416
- }
6417
- for (const err of processed.errors) {
6418
- logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`);
6557
+ "procfile",
6558
+ "vagrantfile",
6559
+ "gulpfile",
6560
+ "gruntfile",
6561
+ ".prettierrc",
6562
+ ".eslintrc",
6563
+ ".babelrc",
6564
+ ".nvmrc",
6565
+ ".npmrc"
6566
+ ];
6567
+ const isCommonTextFile = commonTextFiles.some((name2) => {
6568
+ if (name2.startsWith(".")) {
6569
+ return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6419
6570
  }
6420
- const parsedUrl = new URL(currentUrl);
6421
- const wikiInfo = this.parseGitHubWikiUrl(currentUrl);
6422
- const wikiPagePath = parsedUrl.pathname.replace(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`, "").replace(/^\//, "");
6423
- const pageTitle = wikiPagePath || "Home";
6424
- const document2 = {
6425
- content: typeof processed.textContent === "string" ? processed.textContent : "",
6426
- metadata: {
6427
- url: currentUrl,
6428
- title: typeof processed.metadata.title === "string" && processed.metadata.title.trim() !== "" ? processed.metadata.title : pageTitle,
6429
- library: options.library,
6430
- version: options.version
6431
- },
6432
- contentType: rawContent.mimeType
6433
- };
6434
- const links = processed.links || [];
6435
- const wikiLinks = links.filter((link) => {
6436
- if (!link || link.trim() === "" || link === "invalid-url" || link === "not-a-url-at-all") {
6437
- return false;
6438
- }
6439
- return true;
6440
- }).map((link) => {
6441
- try {
6442
- return new URL(link, currentUrl).href;
6443
- } catch {
6444
- return null;
6445
- }
6446
- }).filter((link) => link !== null).filter((link) => {
6447
- try {
6448
- const linkUrl = new URL(link);
6449
- return linkUrl.hostname === parsedUrl.hostname && linkUrl.pathname.startsWith(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`);
6450
- } catch {
6451
- return false;
6452
- }
6453
- });
6454
- return { document: document2, links: wikiLinks };
6455
- } catch (error) {
6456
- logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`);
6457
- return { document: void 0, links: [] };
6458
- }
6459
- }
6460
- async scrape(options, progressCallback, signal) {
6461
- const url = new URL(options.url);
6462
- if (!url.hostname.includes("github.com") || !url.pathname.includes("/wiki")) {
6463
- throw new Error("URL must be a GitHub wiki URL");
6571
+ return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
6572
+ });
6573
+ if (hasTextExtension || hasCompoundExtension || isCommonTextFile) {
6574
+ return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
6464
6575
  }
6465
- let startUrl = options.url;
6466
- if (url.pathname.endsWith("/wiki") || url.pathname.endsWith("/wiki/")) {
6467
- startUrl = url.pathname.endsWith("/") ? `${options.url}Home` : `${options.url}/Home`;
6576
+ const mimeType = mime.getType(path2);
6577
+ if (mimeType?.startsWith("text/")) {
6578
+ logger.debug(`Including file with text MIME type: ${path2} (${mimeType})`);
6579
+ return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
6468
6580
  }
6469
- const wikiOptions = { ...options, url: startUrl };
6470
- return super.scrape(wikiOptions, progressCallback, signal);
6581
+ return false;
6471
6582
  }
6472
6583
  /**
6473
- * Cleanup resources used by this strategy.
6584
+ * Checks if a path is within the specified subpath.
6474
6585
  */
6475
- async cleanup() {
6476
- await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
6586
+ isWithinSubPath(path2, subPath) {
6587
+ if (!subPath) {
6588
+ return true;
6589
+ }
6590
+ const trimmedSubPath = subPath.replace(/^\/+/, "").replace(/\/+$/, "");
6591
+ if (trimmedSubPath.length === 0) {
6592
+ return true;
6593
+ }
6594
+ const normalizedPath = path2.replace(/^\/+/, "").replace(/\/+$/, "");
6595
+ if (normalizedPath === trimmedSubPath) {
6596
+ return true;
6597
+ }
6598
+ return normalizedPath.startsWith(`${trimmedSubPath}/`);
6477
6599
  }
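Worked examples for the subpath check above (illustrative inputs):

    // isWithinSubPath("docs/guide.md", "docs")      -> true  (prefix ends at a "/" boundary)
    // isWithinSubPath("docs", "/docs/")             -> true  (slashes are trimmed before comparing)
    // isWithinSubPath("documentation/a.md", "docs") -> false (a plain string prefix is not enough)
    // isWithinSubPath("anything.md", undefined)     -> true  (no subpath restriction)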
6478
- }
6479
- class GitHubScraperStrategy {
6480
- repoStrategy = new GitHubRepoScraperStrategy();
6481
- wikiStrategy = new GitHubWikiScraperStrategy();
6482
- canHandle(url) {
6600
+ async processItem(item, options, signal) {
6601
+ if (item.url.startsWith("github-file://")) {
6602
+ logger.info(
6603
+ `🗑️ Legacy github-file:// URL detected, marking as deleted: ${item.url}`
6604
+ );
6605
+ return {
6606
+ url: item.url,
6607
+ links: [],
6608
+ status: FetchStatus.NOT_FOUND
6609
+ };
6610
+ }
6483
6611
  try {
6484
- const parsedUrl = new URL(url);
6485
- const { hostname, pathname } = parsedUrl;
6486
- if (!["github.com", "www.github.com"].includes(hostname)) {
6487
- return false;
6612
+ const parsedUrl = new URL(item.url);
6613
+ if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) {
6614
+ return await this.wikiProcessor.process(item, options, signal);
6488
6615
  }
6489
- const pathMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
6490
- return pathMatch !== null;
6491
6616
  } catch {
6492
- return false;
6493
- }
6494
- }
6495
- async scrape(options, progressCallback, signal) {
6496
- const url = new URL(options.url);
6497
- if (!url.hostname.includes("github.com")) {
6498
- throw new Error("URL must be a GitHub URL");
6499
6617
  }
6500
- const pathMatch = url.pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
6501
- if (!pathMatch) {
6502
- throw new Error("URL must be a base GitHub repository URL");
6503
- }
6504
- const [, owner, repo] = pathMatch;
6505
- logger.info(`🚀 Starting comprehensive GitHub scraping for ${owner}/${repo}`);
6506
- let totalPagesDiscovered = 0;
6507
- let wikiPagesScraped = 0;
6508
- let wikiCompleted = false;
6509
- let repoCompleted = false;
6510
- const mergedProgressCallback = async (progress) => {
6511
- if (!wikiCompleted) {
6512
- totalPagesDiscovered = progress.totalDiscovered;
6513
- wikiPagesScraped = progress.pagesScraped;
6514
- } else if (!repoCompleted) {
6515
- progress = {
6516
- ...progress,
6517
- pagesScraped: wikiPagesScraped + progress.pagesScraped,
6518
- totalPages: wikiPagesScraped + progress.totalPages,
6519
- totalDiscovered: totalPagesDiscovered + progress.totalDiscovered
6618
+ if (item.depth === 0) {
6619
+ const repoInfo = this.parseGitHubUrl(options.url);
6620
+ const { owner, repo } = repoInfo;
6621
+ logger.debug(`Discovering GitHub repository ${owner}/${repo}`);
6622
+ const discoveredLinks = [];
6623
+ if ("isBlob" in repoInfo && repoInfo.isBlob && repoInfo.filePath) {
6624
+ const { branch = "main", filePath } = repoInfo;
6625
+ logger.debug(
6626
+ `Single file URL detected: ${owner}/${repo}/${filePath} - indexing file only`
6627
+ );
6628
+ discoveredLinks.push(
6629
+ `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`
6630
+ );
6631
+ return {
6632
+ url: item.url,
6633
+ links: discoveredLinks,
6634
+ status: FetchStatus.SUCCESS
6520
6635
  };
6521
6636
  }
6522
- await progressCallback(progress);
6523
- };
6524
- try {
6525
6637
  const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
6526
- const wikiOptions = { ...options, url: wikiUrl };
6527
- logger.info(`📖 Attempting to scrape wiki for ${owner}/${repo}`);
6528
- try {
6529
- await this.wikiStrategy.scrape(wikiOptions, mergedProgressCallback, signal);
6530
- wikiCompleted = true;
6531
- logger.info(
6532
- `✅ Completed wiki scraping for ${owner}/${repo} (${wikiPagesScraped} pages)`
6533
- );
6534
- } catch (error) {
6535
- wikiCompleted = true;
6536
- logger.info(`ℹ️ Wiki not available or accessible for ${owner}/${repo}: ${error}`);
6537
- }
6538
- const maxPages = options.maxPages || 1e3;
6539
- const remainingPages = Math.max(0, maxPages - wikiPagesScraped);
6540
- if (remainingPages > 0) {
6541
- logger.info(
6542
- `📂 Scraping repository code for ${owner}/${repo} (${remainingPages} pages remaining)`
6543
- );
6544
- const repoOptions = { ...options, maxPages: remainingPages };
6545
- await this.repoStrategy.scrape(repoOptions, mergedProgressCallback, signal);
6546
- repoCompleted = true;
6547
- logger.info(`✅ Completed repository code scraping for ${owner}/${repo}`);
6548
- } else {
6549
- logger.info(
6550
- `ℹ️ Skipping repository code scraping - page limit reached with wiki content`
6551
- );
6638
+ discoveredLinks.push(wikiUrl);
6639
+ logger.debug(`Discovered wiki URL: ${wikiUrl}`);
6640
+ const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
6641
+ const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
6642
+ logger.debug(
6643
+ `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
6644
+ );
6645
+ const fileUrls = fileItems.map(
6646
+ (treeItem) => `https://github.com/${owner}/${repo}/blob/${resolvedBranch}/${treeItem.path}`
6647
+ );
6648
+ discoveredLinks.push(...fileUrls);
6649
+ logger.debug(
6650
+ `Discovery complete: ${fileUrls.length} repo file(s) + 1 wiki URL = ${discoveredLinks.length} total URLs`
6651
+ );
6652
+ return { url: item.url, links: discoveredLinks, status: FetchStatus.SUCCESS };
6653
+ }
6654
+ try {
6655
+ const parsedUrl = new URL(item.url);
6656
+ if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) {
6657
+ logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
6658
+ return await this.repoProcessor.process(item, options, signal);
6552
6659
  }
6553
- logger.info(`🎉 Comprehensive GitHub scraping completed for ${owner}/${repo}`);
6554
6660
  } catch (error) {
6555
- logger.error(`❌ GitHub scraping failed for ${owner}/${repo}: ${error}`);
6556
- throw error;
6661
+ logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
6662
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
6557
6663
  }
6664
+ logger.debug(`No further processing for URL at depth ${item.depth}: ${item.url}`);
6665
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
6666
+ }
6667
+ async scrape(options, progressCallback, signal) {
6668
+ const url = new URL(options.url);
6669
+ if (!url.hostname.includes("github.com")) {
6670
+ throw new Error("URL must be a GitHub URL");
6671
+ }
6672
+ await super.scrape(options, progressCallback, signal);
6558
6673
  }
6559
- /**
6560
- * Cleanup resources used by both underlying strategies.
6561
- */
6562
6674
  async cleanup() {
6563
- await Promise.allSettled([this.repoStrategy.cleanup(), this.wikiStrategy.cleanup()]);
6675
+ await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
6564
6676
  }
6565
6677
  }
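With the consolidated strategy above, a single depth-0 pass discovers both the wiki and the repository files; a sketch of the resulting queue for a hypothetical repository:

    // Discovery for https://github.com/owner/repo emits:
    //   https://github.com/owner/repo/wiki
    //   https://github.com/owner/repo/blob/<resolvedBranch>/<path>   (one per processable file)
    // Wiki URLs are then routed to GitHubWikiProcessor, blob URLs to GitHubRepoProcessor,
    // and legacy github-file:// entries from older indexes come back as NOT_FOUND.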
6566
6678
  class LocalFileStrategy extends BaseScraperStrategy {
@@ -6573,23 +6685,41 @@ class LocalFileStrategy extends BaseScraperStrategy {
6573
6685
  canHandle(url) {
6574
6686
  return url.startsWith("file://");
6575
6687
  }
6576
- async processItem(item, options, _progressCallback, _signal) {
6688
+ async processItem(item, options, _signal) {
6577
6689
  let filePath = item.url.replace(/^file:\/\/\/?/, "");
6578
6690
  filePath = decodeURIComponent(filePath);
6579
6691
  if (!filePath.startsWith("/") && process.platform !== "win32") {
6580
6692
  filePath = `/${filePath}`;
6581
6693
  }
6582
- const stats = await fs$1.stat(filePath);
6694
+ let stats;
6695
+ try {
6696
+ stats = await fs$1.stat(filePath);
6697
+ } catch (error) {
6698
+ if (error.code === "ENOENT") {
6699
+ logger.info(`✓ File deleted or not available: ${filePath}`);
6700
+ return {
6701
+ url: item.url,
6702
+ links: [],
6703
+ status: FetchStatus.NOT_FOUND
6704
+ };
6705
+ }
6706
+ throw error;
6707
+ }
6583
6708
  if (stats.isDirectory()) {
6584
6709
  const contents = await fs$1.readdir(filePath);
6585
6710
  const links = contents.map((name2) => `file://${path.join(filePath, name2)}`).filter((url) => this.shouldProcessUrl(url, options));
6586
- return { links };
6711
+ return { url: item.url, links, status: FetchStatus.SUCCESS };
6712
+ }
6713
+ const rawContent = await this.fileFetcher.fetch(item.url, {
6714
+ etag: item.etag
6715
+ });
6716
+ if (rawContent.status === FetchStatus.NOT_MODIFIED) {
6717
+ logger.debug(`✓ File unchanged: ${filePath}`);
6718
+ return { url: rawContent.source, links: [], status: FetchStatus.NOT_MODIFIED };
6587
6719
  }
6588
- logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
6589
- const rawContent = await this.fileFetcher.fetch(item.url);
6590
6720
  let processed;
6591
6721
  for (const pipeline of this.pipelines) {
6592
- if (pipeline.canProcess(rawContent)) {
6722
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
6593
6723
  logger.debug(
6594
6724
  `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
6595
6725
  );
@@ -6601,22 +6731,22 @@ class LocalFileStrategy extends BaseScraperStrategy {
6601
6731
  logger.warn(
6602
6732
  `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
6603
6733
  );
6604
- return { document: void 0, links: [] };
6734
+ return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
6605
6735
  }
6606
- for (const err of processed.errors) {
6736
+ for (const err of processed.errors ?? []) {
6607
6737
  logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
6608
6738
  }
6739
+ const filename = path.basename(filePath);
6740
+ const title = processed.title?.trim() || filename || null;
6609
6741
  return {
6610
- document: {
6611
- content: typeof processed.textContent === "string" ? processed.textContent : "",
6612
- contentType: rawContent.mimeType,
6613
- metadata: {
6614
- url: rawContent.source,
6615
- title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
6616
- library: options.library,
6617
- version: options.version
6618
- }
6619
- }
6742
+ url: rawContent.source,
6743
+ title,
6744
+ etag: rawContent.etag,
6745
+ lastModified: rawContent.lastModified,
6746
+ contentType: rawContent.mimeType,
6747
+ content: processed,
6748
+ links: [],
6749
+ status: FetchStatus.SUCCESS
6620
6750
  };
6621
6751
  }
6622
6752
  /**
@@ -6652,19 +6782,32 @@ class WebScraperStrategy extends BaseScraperStrategy {
6652
6782
  * @param signal - Optional abort signal for request cancellation.
6653
6783
  * @returns An object containing the processed document and extracted links.
6654
6784
  */
6655
- async processItem(item, options, _progressCallback, signal) {
6785
+ async processItem(item, options, signal) {
6656
6786
  const { url } = item;
6657
6787
  try {
6788
+ if (item.etag) {
6789
+ logger.debug(`Processing ${url} with stored ETag: ${item.etag}`);
6790
+ }
6658
6791
  const fetchOptions = {
6659
6792
  signal,
6660
6793
  followRedirects: options.followRedirects,
6661
- headers: options.headers
6794
+ headers: options.headers,
6662
6795
  // Forward custom headers
6796
+ etag: item.etag
6797
+ // Pass ETag for conditional requests
6663
6798
  };
6664
6799
  const rawContent = await this.fetcher.fetch(url, fetchOptions);
6800
+ logger.debug(
6801
+ `Fetch result for ${url}: status=${rawContent.status}, etag=${rawContent.etag || "none"}`
6802
+ );
6803
+ if (rawContent.status !== FetchStatus.SUCCESS) {
6804
+ logger.debug(`Skipping pipeline for ${url} due to status: ${rawContent.status}`);
6805
+ return { url: rawContent.source, links: [], status: rawContent.status };
6806
+ }
6665
6807
  let processed;
6666
6808
  for (const pipeline of this.pipelines) {
6667
- if (pipeline.canProcess(rawContent)) {
6809
+ const contentBuffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
6810
+ if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
6668
6811
  logger.debug(
6669
6812
  `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`
6670
6813
  );
@@ -6676,40 +6819,47 @@ class WebScraperStrategy extends BaseScraperStrategy {
6676
6819
  logger.warn(
6677
6820
  `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
6678
6821
  );
6679
- return { document: void 0, links: [] };
6822
+ return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
6680
6823
  }
6681
- for (const err of processed.errors) {
6824
+ for (const err of processed.errors ?? []) {
6682
6825
  logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
6683
6826
  }
6684
6827
  if (!processed.textContent || !processed.textContent.trim()) {
6685
6828
  logger.warn(
6686
6829
  `⚠️ No processable content found for ${url} after pipeline execution.`
6687
6830
  );
6688
- return { document: void 0, links: processed.links };
6831
+ return {
6832
+ url: rawContent.source,
6833
+ links: processed.links,
6834
+ status: FetchStatus.SUCCESS
6835
+ };
6689
6836
  }
6690
- const baseUrl = item.depth === 0 ? new URL(rawContent.source) : this.canonicalBaseUrl ?? new URL(options.url);
6691
- const filteredLinks = processed.links.filter((link) => {
6837
+ if (item.depth === 0) {
6838
+ this.canonicalBaseUrl = new URL(rawContent.source);
6839
+ }
6840
+ const filteredLinks = processed.links?.filter((link) => {
6692
6841
  try {
6693
6842
  const targetUrl = new URL(link);
6694
- const scope = options.scope || "subpages";
6695
- return isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
6843
+ if (!this.shouldProcessUrl(targetUrl.href, options)) {
6844
+ return false;
6845
+ }
6846
+ if (this.shouldFollowLinkFn) {
6847
+ const baseUrl = this.canonicalBaseUrl ?? new URL(options.url);
6848
+ return this.shouldFollowLinkFn(baseUrl, targetUrl);
6849
+ }
6850
+ return true;
6696
6851
  } catch {
6697
6852
  return false;
6698
6853
  }
6699
- });
6854
+ }) ?? [];
6700
6855
  return {
6701
- document: {
6702
- content: processed.textContent,
6703
- metadata: {
6704
- url,
6705
- title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
6706
- library: options.library,
6707
- version: options.version,
6708
- ...processed.metadata
6709
- }
6710
- },
6856
+ url: rawContent.source,
6857
+ etag: rawContent.etag,
6858
+ lastModified: rawContent.lastModified,
6859
+ contentType: processed.contentType || rawContent.mimeType,
6860
+ content: processed,
6711
6861
  links: filteredLinks,
6712
- finalUrl: rawContent.source
6862
+ status: FetchStatus.SUCCESS
6713
6863
  };
6714
6864
  } catch (error) {
6715
6865
  logger.error(`❌ Failed processing page ${url}: ${error}`);
@@ -6786,7 +6936,6 @@ class ScraperRegistry {
6786
6936
  this.strategies = [
6787
6937
  new NpmScraperStrategy(),
6788
6938
  new PyPiScraperStrategy(),
6789
- new GitHubWikiScraperStrategy(),
6790
6939
  new GitHubScraperStrategy(),
6791
6940
  new WebScraperStrategy(),
6792
6941
  new LocalFileStrategy()
@@ -6848,55 +6997,64 @@ class PipelineWorker {
6848
6997
  * @param callbacks - Callbacks provided by the manager for reporting.
6849
6998
  */
6850
6999
  async executeJob(job, callbacks) {
6851
- const {
6852
- id: jobId,
6853
- library,
6854
- version: version2,
6855
- sourceUrl,
6856
- scraperOptions,
6857
- abortController
6858
- } = job;
7000
+ const { id: jobId, library, version: version2, scraperOptions, abortController } = job;
6859
7001
  const signal = abortController.signal;
6860
7002
  logger.debug(`[${jobId}] Worker starting job for ${library}@${version2}`);
6861
7003
  try {
6862
- await this.store.removeAllDocuments(library, version2);
6863
- logger.info(
6864
- `💾 Cleared store for ${library}@${version2 || "[no version]"} before scraping.`
6865
- );
6866
- const runtimeOptions = {
6867
- url: sourceUrl ?? "",
6868
- library,
6869
- version: version2,
6870
- ...scraperOptions
6871
- };
7004
+ if (!scraperOptions.isRefresh) {
7005
+ await this.store.removeAllDocuments(library, version2);
7006
+ logger.info(
7007
+ `💾 Cleared store for ${library}@${version2 || "[no version]"} before scraping.`
7008
+ );
7009
+ } else {
7010
+ logger.info(
7011
+ `🔄 Refresh operation - preserving existing data for ${library}@${version2 || "[no version]"}.`
7012
+ );
7013
+ }
6872
7014
  await this.scraperService.scrape(
6873
- runtimeOptions,
7015
+ scraperOptions,
6874
7016
  async (progress) => {
6875
7017
  if (signal.aborted) {
6876
7018
  throw new CancellationError("Job cancelled during scraping progress");
6877
7019
  }
6878
7020
  await callbacks.onJobProgress?.(job, progress);
6879
- if (progress.document) {
7021
+ if (progress.deleted && progress.pageId) {
6880
7022
  try {
6881
- await this.store.addDocument(library, version2, {
6882
- pageContent: progress.document.content,
6883
- metadata: {
6884
- ...progress.document.metadata,
6885
- mimeType: progress.document.contentType
6886
- // Pass contentType as mimeType in metadata
6887
- }
6888
- });
7023
+ await this.store.deletePage(progress.pageId);
6889
7024
  logger.debug(
6890
- `[${jobId}] Stored document: ${progress.document.metadata.url}`
7025
+ `[${jobId}] Deleted page ${progress.pageId}: ${progress.currentUrl}`
7026
+ );
7027
+ } catch (docError) {
7028
+ logger.error(
7029
+ `❌ [${jobId}] Failed to delete page ${progress.pageId}: ${docError}`
7030
+ );
7031
+ const error = docError instanceof Error ? docError : new Error(String(docError));
7032
+ await callbacks.onJobError?.(job, error);
7033
+ throw error;
7034
+ }
7035
+ } else if (progress.result) {
7036
+ try {
7037
+ if (progress.pageId) {
7038
+ await this.store.deletePage(progress.pageId);
7039
+ logger.debug(
7040
+ `[${jobId}] Refreshing page ${progress.pageId}: ${progress.currentUrl}`
7041
+ );
7042
+ }
7043
+ await this.store.addScrapeResult(
7044
+ library,
7045
+ version2,
7046
+ progress.depth,
7047
+ progress.result
6891
7048
  );
7049
+ logger.debug(`[${jobId}] Stored processed content: ${progress.currentUrl}`);
6892
7050
  } catch (docError) {
6893
7051
  logger.error(
6894
- `❌ [${jobId}] Failed to store document ${progress.document.metadata.url}: ${docError}`
7052
+ `❌ [${jobId}] Failed to process content ${progress.currentUrl}: ${docError}`
6895
7053
  );
6896
7054
  await callbacks.onJobError?.(
6897
7055
  job,
6898
7056
  docError instanceof Error ? docError : new Error(String(docError)),
6899
- progress.document
7057
+ progress.result
6900
7058
  );
6901
7059
  }
6902
7060
  }
@@ -7108,15 +7266,8 @@ class PipelineManager {
7108
7266
  /**
7109
7267
  * Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned).
7110
7268
  */
7111
- async enqueueJob(library, version2, options) {
7269
+ async enqueueScrapeJob(library, version2, options) {
7112
7270
  const normalizedVersion = version2 ?? "";
7113
- const {
7114
- url,
7115
- library: _library,
7116
- version: _version,
7117
- signal: _signal,
7118
- ...versionOptions
7119
- } = options;
7120
7271
  const allJobs = await this.getJobs();
7121
7272
  const duplicateJobs = allJobs.filter(
7122
7273
  (job2) => job2.library === library && (job2.version ?? "") === normalizedVersion && // Normalize null to empty string for comparison
@@ -7158,8 +7309,8 @@ class PipelineManager {
7158
7309
  progressMaxPages: 0,
7159
7310
  errorMessage: null,
7160
7311
  updatedAt: /* @__PURE__ */ new Date(),
7161
- sourceUrl: url,
7162
- scraperOptions: versionOptions
7312
+ sourceUrl: options.url,
7313
+ scraperOptions: options
7163
7314
  };
7164
7315
  this.jobMap.set(jobId, job);
7165
7316
  this.jobQueue.push(jobId);
@@ -7174,6 +7325,78 @@ class PipelineManager {
7174
7325
  }
7175
7326
  return jobId;
7176
7327
  }
7328
+ /**
7329
+ * Enqueues a refresh job for an existing library version by re-scraping all pages
7330
+ * and using ETag comparison to skip unchanged content.
7331
+ *
7332
+ * If the version was never completed (interrupted or failed scrape), performs a
7333
+ * full re-scrape from scratch instead of a refresh to ensure completeness.
7334
+ */
7335
+ async enqueueRefreshJob(library, version2) {
7336
+ const normalizedVersion = version2 ?? "";
7337
+ try {
7338
+ const versionId = await this.store.ensureVersion({
7339
+ library,
7340
+ version: normalizedVersion
7341
+ });
7342
+ const versionInfo = await this.store.getVersionById(versionId);
7343
+ if (!versionInfo) {
7344
+ throw new Error(`Version ID ${versionId} not found`);
7345
+ }
7346
+ const libraryInfo = await this.store.getLibraryById(versionInfo.library_id);
7347
+ if (!libraryInfo) {
7348
+ throw new Error(`Library ID ${versionInfo.library_id} not found`);
7349
+ }
7350
+ if (versionInfo && versionInfo.status !== VersionStatus.COMPLETED) {
7351
+ logger.info(
7352
+ `⚠️ Version ${library}@${normalizedVersion || "unversioned"} has status "${versionInfo.status}". Performing full re-scrape instead of refresh.`
7353
+ );
7354
+ return this.enqueueJobWithStoredOptions(library, normalizedVersion);
7355
+ }
7356
+ const pages = await this.store.getPagesByVersionId(versionId);
7357
+ if (pages.length > 0) {
7358
+ logger.debug(
7359
+ `Sample page data: url=${pages[0].url}, etag=${pages[0].etag}, depth=${pages[0].depth}`
7360
+ );
7361
+ }
7362
+ if (pages.length === 0) {
7363
+ throw new Error(
7364
+ `No pages found for ${library}@${normalizedVersion || "unversioned"}. Use scrape_docs to index it first.`
7365
+ );
7366
+ }
7367
+ logger.info(
7368
+ `🔄 Preparing refresh job for ${library}@${normalizedVersion || "unversioned"} with ${pages.length} page(s)`
7369
+ );
7370
+ const initialQueue = pages.map((page) => ({
7371
+ url: page.url,
7372
+ depth: page.depth ?? 0,
7373
+ // Use original depth, fallback to 0 for old data
7374
+ pageId: page.id,
7375
+ etag: page.etag
7376
+ }));
7377
+ const storedOptions = await this.store.getScraperOptions(versionId);
7378
+ const scraperOptions = {
7379
+ url: storedOptions?.sourceUrl || pages[0].url,
7380
+ // Required but not used when initialQueue is set
7381
+ library,
7382
+ version: normalizedVersion,
7383
+ ...storedOptions?.options || {},
7384
+ // Include stored options if available (spread first)
7385
+ // Override with refresh-specific options (these must come after the spread)
7386
+ initialQueue,
7387
+ // Pre-populated queue with existing pages
7388
+ isRefresh: true
7389
+ // Mark this as a refresh operation
7390
+ };
7391
+ logger.info(
7392
+ `📝 Enqueueing refresh job for ${library}@${normalizedVersion || "unversioned"}`
7393
+ );
7394
+ return this.enqueueScrapeJob(library, normalizedVersion, scraperOptions);
7395
+ } catch (error) {
7396
+ logger.error(`❌ Failed to enqueue refresh job: ${error}`);
7397
+ throw error;
7398
+ }
7399
+ }
7177
7400
  /**
7178
7401
  * Enqueues a job using stored scraper options from a previous indexing run.
7179
7402
  * If no stored options are found, throws an error.
@@ -7201,7 +7424,7 @@ class PipelineManager {
7201
7424
  logger.info(
7202
7425
  `🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${stored.sourceUrl}`
7203
7426
  );
7204
- return this.enqueueJob(library, normalizedVersion, completeOptions);
7427
+ return this.enqueueScrapeJob(library, normalizedVersion, completeOptions);
7205
7428
  } catch (error) {
7206
7429
  logger.error(`❌ Failed to enqueue job with stored options: ${error}`);
7207
7430
  throw error;
@@ -7418,13 +7641,7 @@ class PipelineManager {
7418
7641
  await this.store.updateVersionStatus(versionId, dbStatus, errorMessage);
7419
7642
  if (newStatus === PipelineJobStatus.QUEUED && job.scraperOptions) {
7420
7643
  try {
7421
- const fullOptions = {
7422
- url: job.sourceUrl ?? "",
7423
- library: job.library,
7424
- version: job.version,
7425
- ...job.scraperOptions
7426
- };
7427
- await this.store.storeScraperOptions(versionId, fullOptions);
7644
+ await this.store.storeScraperOptions(versionId, job.scraperOptions);
7428
7645
  logger.debug(
7429
7646
  `Stored scraper options for ${job.library}@${job.version}: ${job.sourceUrl}`
7430
7647
  );
@@ -7882,7 +8099,7 @@ async function createPipelineWithCallbacks(docService, options = {}) {
7882
8099
  },
7883
8100
  onJobError: async (job, error, document2) => {
7884
8101
  logger.warn(
7885
- `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.metadata.url}` : ""}: ${error.message}`
8102
+ `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.url}` : ""}: ${error.message}`
7886
8103
  );
7887
8104
  }
7888
8105
  });
@@ -8113,6 +8330,45 @@ function createMcpServerInstance(tools, readOnly = false) {
8113
8330
  }
8114
8331
  }
8115
8332
  );
8333
+ server.tool(
8334
+ "refresh_version",
8335
+ "Re-scrape a previously indexed library version, updating only changed pages.",
8336
+ {
8337
+ library: z.string().trim().describe("Library name."),
8338
+ version: z.string().trim().optional().describe("Library version (optional, refreshes unversioned if omitted).")
8339
+ },
8340
+ {
8341
+ title: "Refresh Library Version",
8342
+ destructiveHint: false,
8343
+ // Only updates changed content
8344
+ openWorldHint: true
8345
+ // requires internet access
8346
+ },
8347
+ async ({ library, version: version2 }) => {
8348
+ analytics.track(TelemetryEvent.TOOL_USED, {
8349
+ tool: "refresh_version",
8350
+ context: "mcp_server",
8351
+ library,
8352
+ version: version2
8353
+ });
8354
+ try {
8355
+ const result = await tools.refresh.execute({
8356
+ library,
8357
+ version: version2,
8358
+ waitForCompletion: false
8359
+ // Don't wait for completion
8360
+ });
8361
+ if ("jobId" in result) {
8362
+ return createResponse(`🔄 Refresh job started with ID: ${result.jobId}.`);
8363
+ }
8364
+ return createResponse(
8365
+ `Refresh finished immediately (unexpectedly) with ${result.pagesRefreshed} pages.`
8366
+ );
8367
+ } catch (error) {
8368
+ return createError(error);
8369
+ }
8370
+ }
8371
+ );
8116
8372
  }
8117
8373
  server.tool(
8118
8374
  "search_docs",
@@ -8638,7 +8894,7 @@ class FetchUrlTool {
8638
8894
  logger.info("🔄 Processing content...");
8639
8895
  let processed;
8640
8896
  for (const pipeline of this.pipelines) {
8641
- if (pipeline.canProcess(rawContent)) {
8897
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
8642
8898
  processed = await pipeline.process(
8643
8899
  rawContent,
8644
8900
  {
@@ -8673,7 +8929,7 @@ class FetchUrlTool {
8673
8929
  const contentString = convertToString(rawContent.content, resolvedCharset);
8674
8930
  return contentString;
8675
8931
  }
8676
- for (const err of processed.errors) {
8932
+ for (const err of processed.errors ?? []) {
8677
8933
  logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
8678
8934
  }
8679
8935
  if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
@@ -8851,6 +9107,61 @@ class ListLibrariesTool {
8851
9107
  return { libraries };
8852
9108
  }
8853
9109
  }
9110
+ class RefreshVersionTool {
9111
+ pipeline;
9112
+ constructor(pipeline) {
9113
+ this.pipeline = pipeline;
9114
+ }
9115
+ async execute(options) {
9116
+ const { library, version: version2, waitForCompletion = true } = options;
9117
+ let internalVersion;
9118
+ const partialVersionRegex = /^\d+(\.\d+)?$/;
9119
+ if (version2 === null || version2 === void 0) {
9120
+ internalVersion = "";
9121
+ } else {
9122
+ const validFullVersion = semver.valid(version2);
9123
+ if (validFullVersion) {
9124
+ internalVersion = validFullVersion;
9125
+ } else if (partialVersionRegex.test(version2)) {
9126
+ const coercedVersion = semver.coerce(version2);
9127
+ if (coercedVersion) {
9128
+ internalVersion = coercedVersion.version;
9129
+ } else {
9130
+ throw new ValidationError(
9131
+ `Invalid version format for refreshing: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
9132
+ "RefreshVersionTool"
9133
+ );
9134
+ }
9135
+ } else {
9136
+ throw new ValidationError(
9137
+ `Invalid version format for refreshing: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
9138
+ "RefreshVersionTool"
9139
+ );
9140
+ }
9141
+ }
9142
+ internalVersion = internalVersion.toLowerCase();
9143
+ const pipeline = this.pipeline;
9144
+ const refreshVersion = internalVersion === "" ? null : internalVersion;
9145
+ const jobId = await pipeline.enqueueRefreshJob(library, refreshVersion);
9146
+ if (waitForCompletion) {
9147
+ try {
9148
+ await pipeline.waitForJobCompletion(jobId);
9149
+ const finalJob = await pipeline.getJob(jobId);
9150
+ const finalPagesRefreshed = finalJob?.progress?.pagesScraped ?? 0;
9151
+ logger.debug(
9152
+ `Refresh job ${jobId} finished with status ${finalJob?.status}. Pages refreshed: ${finalPagesRefreshed}`
9153
+ );
9154
+ return {
9155
+ pagesRefreshed: finalPagesRefreshed
9156
+ };
9157
+ } catch (error) {
9158
+ logger.error(`❌ Refresh job ${jobId} failed or was cancelled: ${error}`);
9159
+ throw error;
9160
+ }
9161
+ }
9162
+ return { jobId };
9163
+ }
9164
+ }
8854
9165
  class RemoveTool {
8855
9166
  constructor(documentManagementService, pipeline) {
8856
9167
  this.documentManagementService = documentManagementService;
@@ -8871,19 +9182,7 @@ class RemoveTool {
8871
9182
  }
8872
9183
  logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
8873
9184
  try {
8874
- const result = await this.documentManagementService.findBestVersion(
8875
- library,
8876
- version2
8877
- );
8878
- const normalizedVersion = version2 && version2.trim() !== "" ? version2 : null;
8879
- const versionExists = result.bestMatch === normalizedVersion || result.hasUnversioned && normalizedVersion === null;
8880
- if (!versionExists) {
8881
- const versionText = normalizedVersion ? `Version ${normalizedVersion}` : "Version";
8882
- throw new ToolError(
8883
- `${versionText} not found for library ${library}. Cannot remove non-existent version.`,
8884
- this.constructor.name
8885
- );
8886
- }
9185
+ await this.documentManagementService.validateLibraryExists(library);
8887
9186
  const allJobs = await this.pipeline.getJobs();
8888
9187
  const jobs = allJobs.filter(
8889
9188
  (job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
@@ -8950,7 +9249,7 @@ class ScrapeTool {
8950
9249
  internalVersion = internalVersion.toLowerCase();
8951
9250
  const pipeline = this.pipeline;
8952
9251
  const enqueueVersion = internalVersion === "" ? null : internalVersion;
8953
- const jobId = await pipeline.enqueueJob(library, enqueueVersion, {
9252
+ const jobId = await pipeline.enqueueScrapeJob(library, enqueueVersion, {
8954
9253
  url,
8955
9254
  library,
8956
9255
  version: internalVersion,
@@ -8997,7 +9296,18 @@ class DocumentManagementClient {
8997
9296
  logger.debug(`DocumentManagementClient (tRPC) created for: ${this.baseUrl}`);
8998
9297
  }
8999
9298
  async initialize() {
9000
- await this.client.ping.query();
9299
+ try {
9300
+ await this.client.ping.query();
9301
+ } catch (error) {
9302
+ logger.debug(
9303
+ `Failed to connect to DocumentManagement server at ${this.baseUrl}: ${error}`
9304
+ );
9305
+ throw new Error(
9306
+ `Failed to connect to server at ${this.baseUrl}.
9307
+
9308
+ Please verify the server URL includes the correct port (default 8080) and ends with '/api' (e.g., 'http://localhost:8080/api').`
9309
+ );
9310
+ }
9001
9311
  }
9002
9312
  async shutdown() {
9003
9313
  }
@@ -9069,7 +9379,7 @@ class HierarchicalAssemblyStrategy {
9069
9379
  try {
9070
9380
  const chunksByDocument = /* @__PURE__ */ new Map();
9071
9381
  for (const chunk of initialChunks) {
9072
- const url = chunk.metadata.url;
9382
+ const url = chunk.url;
9073
9383
  if (!chunksByDocument.has(url)) {
9074
9384
  chunksByDocument.set(url, []);
9075
9385
  }
@@ -9163,10 +9473,10 @@ class HierarchicalAssemblyStrategy {
9163
9473
  if (debug) {
9164
9474
  return chunks.map(
9165
9475
  (chunk) => `=== #${chunk.id} ${chunk.metadata.path?.join("/")} [${chunk.metadata.level}] ===
9166
- ` + chunk.pageContent
9476
+ ` + chunk.content
9167
9477
  ).join("");
9168
9478
  }
9169
- return chunks.map((chunk) => chunk.pageContent).join("");
9479
+ return chunks.map((chunk) => chunk.content).join("");
9170
9480
  }
9171
9481
  /**
9172
9482
  * Walks up the parent hierarchy from a chunk to collect the complete parent chain.
@@ -9192,42 +9502,17 @@ class HierarchicalAssemblyStrategy {
9192
9502
  visited.add(currentId);
9193
9503
  chainIds.push(currentId);
9194
9504
  depth++;
9195
- try {
9196
- const parentChunk = await documentStore.findParentChunk(
9505
+ let parentChunk = await documentStore.findParentChunk(library, version2, currentId);
9506
+ if (!parentChunk) {
9507
+ parentChunk = await this.findAncestorWithGaps(
9197
9508
  library,
9198
9509
  version2,
9199
- currentId
9510
+ currentChunk.url,
9511
+ currentChunk.metadata.path ?? [],
9512
+ documentStore
9200
9513
  );
9201
- if (parentChunk) {
9202
- currentChunk = parentChunk;
9203
- } else {
9204
- currentChunk = await this.findAncestorWithGaps(
9205
- library,
9206
- version2,
9207
- currentChunk.metadata,
9208
- documentStore
9209
- );
9210
- }
9211
- } catch (error) {
9212
- try {
9213
- const currentMetadata = currentChunk?.metadata;
9214
- if (currentMetadata) {
9215
- currentChunk = await this.findAncestorWithGaps(
9216
- library,
9217
- version2,
9218
- currentMetadata,
9219
- documentStore
9220
- );
9221
- } else {
9222
- currentChunk = null;
9223
- }
9224
- } catch (gapError) {
9225
- logger.warn(
9226
- `Parent lookup failed for chunk ${currentId}: ${error}. Gap search also failed: ${gapError}`
9227
- );
9228
- break;
9229
- }
9230
9514
  }
9515
+ currentChunk = parentChunk;
9231
9516
  }
9232
9517
  if (depth >= maxDepth) {
9233
9518
  logger.warn(
@@ -9240,9 +9525,7 @@ class HierarchicalAssemblyStrategy {
9240
9525
  * Attempts to find ancestors when there are gaps in the hierarchy.
9241
9526
  * Tries progressively shorter path prefixes to find existing ancestor chunks.
9242
9527
  */
9243
- async findAncestorWithGaps(library, version2, metadata, documentStore) {
9244
- const path2 = metadata.path || [];
9245
- const url = metadata.url;
9528
+ async findAncestorWithGaps(library, version2, url, path2, documentStore) {
9246
9529
  if (path2.length <= 1) {
9247
9530
  return null;
9248
9531
  }
@@ -9279,7 +9562,7 @@ class HierarchicalAssemblyStrategy {
9279
9562
  }
9280
9563
  const matchingChunks = allChunks.filter((chunk) => {
9281
9564
  const chunkPath = chunk.metadata.path || [];
9282
- const chunkUrl = chunk.metadata.url;
9565
+ const chunkUrl = chunk.url;
9283
9566
  if (chunkUrl !== url) return false;
9284
9567
  if (chunkPath.length !== targetPath.length) return false;
9285
9568
  return chunkPath.every((part, index) => part === targetPath[index]);
@@ -9301,11 +9584,7 @@ class HierarchicalAssemblyStrategy {
9301
9584
  return current;
9302
9585
  }
9303
9586
  while (true) {
9304
- const parent = await documentStore.findParentChunk(
9305
- library,
9306
- version2,
9307
- current.id
9308
- );
9587
+ const parent = await documentStore.findParentChunk(library, version2, current.id);
9309
9588
  if (!parent) {
9310
9589
  return null;
9311
9590
  }
@@ -9387,7 +9666,7 @@ class HierarchicalAssemblyStrategy {
9387
9666
  const ancestorChunks = await this.findChunksByExactPath(
9388
9667
  library,
9389
9668
  version2,
9390
- referenceChunk.metadata.url,
9669
+ referenceChunk.url,
9391
9670
  ancestorPath,
9392
9671
  documentStore
9393
9672
  );
@@ -9465,13 +9744,9 @@ class HierarchicalAssemblyStrategy {
9465
9744
  for (const chunk of initialChunks) {
9466
9745
  const id = chunk.id;
9467
9746
  chunkIds.add(id);
9468
- try {
9469
- const parent = await documentStore.findParentChunk(library, version2, id);
9470
- if (parent) {
9471
- chunkIds.add(parent.id);
9472
- }
9473
- } catch (error) {
9474
- logger.warn(`Failed to find parent for chunk ${id}: ${error}`);
9747
+ const parent = await documentStore.findParentChunk(library, version2, id);
9748
+ if (parent) {
9749
+ chunkIds.add(parent.id);
9475
9750
  }
9476
9751
  try {
9477
9752
  const children = await documentStore.findChildChunks(library, version2, id, 3);
@@ -9539,7 +9814,7 @@ class MarkdownAssemblyStrategy {
9539
9814
  * Assembles chunks using simple "\n\n" joining (current behavior).
9540
9815
  */
9541
9816
  assembleContent(chunks) {
9542
- return chunks.map((chunk) => chunk.pageContent).join("\n\n");
9817
+ return chunks.map((chunk) => chunk.content).join("\n\n");
9543
9818
  }
9544
9819
  /**
9545
9820
  * Collects related chunk IDs for a single chunk using current context expansion logic.
@@ -9638,7 +9913,7 @@ class DocumentRetrieverService {
9638
9913
  groupResultsByUrl(results) {
9639
9914
  const resultsByUrl = /* @__PURE__ */ new Map();
9640
9915
  for (const result of results) {
9641
- const url = result.metadata.url;
9916
+ const url = result.url;
9642
9917
  if (!resultsByUrl.has(url)) {
9643
9918
  resultsByUrl.set(url, []);
9644
9919
  }
@@ -9653,10 +9928,8 @@ class DocumentRetrieverService {
9653
9928
  * Processes a group of search results from the same URL using appropriate strategy.
9654
9929
  */
9655
9930
  async processUrlGroup(library, version2, url, initialChunks) {
9656
- const mimeType = initialChunks.length > 0 ? initialChunks[0].metadata.mimeType : void 0;
9657
- const maxScore = Math.max(
9658
- ...initialChunks.map((chunk) => chunk.metadata.score)
9659
- );
9931
+ const mimeType = initialChunks.length > 0 ? initialChunks[0].content_type : void 0;
9932
+ const maxScore = Math.max(...initialChunks.map((chunk) => chunk.score));
9660
9933
  const strategy = createContentAssemblyStrategy(mimeType);
9661
9934
  const selectedChunks = await strategy.selectChunks(
9662
9935
  library,
@@ -9845,7 +10118,7 @@ class DocumentStore {
9845
10118
  prepareStatements() {
9846
10119
  const statements = {
9847
10120
  getById: this.db.prepare(
9848
- `SELECT d.*, p.url, p.title, p.content_type
10121
+ `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type
9849
10122
  FROM documents d
9850
10123
  JOIN pages p ON d.page_id = p.id
9851
10124
  WHERE d.id = ?`
@@ -9858,7 +10131,7 @@ class DocumentStore {
9858
10131
  "UPDATE documents SET embedding = ? WHERE id = ?"
9859
10132
  ),
9860
10133
  insertPage: this.db.prepare(
9861
- "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type) VALUES (?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type"
10134
+ "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type, depth) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type, etag = excluded.etag, last_modified = excluded.last_modified, depth = excluded.depth"
9862
10135
  ),
9863
10136
  getPageId: this.db.prepare(
9864
10137
  "SELECT id FROM pages WHERE version_id = ? AND url = ?"
@@ -9869,12 +10142,13 @@ class DocumentStore {
9869
10142
  getLibraryIdByName: this.db.prepare(
9870
10143
  "SELECT id FROM libraries WHERE name = ?"
9871
10144
  ),
10145
+ getLibraryById: this.db.prepare("SELECT * FROM libraries WHERE id = ?"),
9872
10146
  // New version-related statements
9873
10147
  insertVersion: this.db.prepare(
9874
10148
  "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
9875
10149
  ),
9876
10150
  resolveVersionId: this.db.prepare(
9877
- "SELECT id FROM versions WHERE library_id = ? AND name IS ?"
10151
+ "SELECT id FROM versions WHERE library_id = ? AND name = ?"
9878
10152
  ),
9879
10153
  getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
9880
10154
  queryVersionsByLibraryId: this.db.prepare(
@@ -9889,15 +10163,10 @@ class DocumentStore {
9889
10163
  WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
9890
10164
  )`
9891
10165
  ),
9892
- deleteDocumentsByUrl: this.db.prepare(
9893
- `DELETE FROM documents
9894
- WHERE page_id IN (
9895
- SELECT p.id FROM pages p
9896
- JOIN versions v ON p.version_id = v.id
9897
- JOIN libraries l ON v.library_id = l.id
9898
- WHERE p.url = ? AND l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
9899
- )`
10166
+ deleteDocumentsByPageId: this.db.prepare(
10167
+ "DELETE FROM documents WHERE page_id = ?"
9900
10168
  ),
10169
+ deletePage: this.db.prepare("DELETE FROM pages WHERE id = ?"),
9901
10170
  deletePages: this.db.prepare(
9902
10171
  `DELETE FROM pages
9903
10172
  WHERE version_id IN (
@@ -9953,7 +10222,7 @@ class DocumentStore {
9953
10222
  ORDER BY l.name, version`
9954
10223
  ),
9955
10224
  getChildChunks: this.db.prepare(`
9956
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10225
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9957
10226
  JOIN pages p ON d.page_id = p.id
9958
10227
  JOIN versions v ON p.version_id = v.id
9959
10228
  JOIN libraries l ON v.library_id = l.id
@@ -9967,7 +10236,7 @@ class DocumentStore {
9967
10236
  LIMIT ?
9968
10237
  `),
9969
10238
  getPrecedingSiblings: this.db.prepare(`
9970
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10239
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9971
10240
  JOIN pages p ON d.page_id = p.id
9972
10241
  JOIN versions v ON p.version_id = v.id
9973
10242
  JOIN libraries l ON v.library_id = l.id
@@ -9980,7 +10249,7 @@ class DocumentStore {
9980
10249
  LIMIT ?
9981
10250
  `),
9982
10251
  getSubsequentSiblings: this.db.prepare(`
9983
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10252
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9984
10253
  JOIN pages p ON d.page_id = p.id
9985
10254
  JOIN versions v ON p.version_id = v.id
9986
10255
  JOIN libraries l ON v.library_id = l.id
@@ -9993,7 +10262,7 @@ class DocumentStore {
9993
10262
  LIMIT ?
9994
10263
  `),
9995
10264
  getParentChunk: this.db.prepare(`
9996
- SELECT d.*, p.url, p.title, p.content_type FROM documents d
10265
+ SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
9997
10266
  JOIN pages p ON d.page_id = p.id
9998
10267
  JOIN versions v ON p.version_id = v.id
9999
10268
  JOIN libraries l ON v.library_id = l.id
@@ -10035,6 +10304,9 @@ class DocumentStore {
10035
10304
  `SELECT v.id, v.library_id FROM versions v
10036
10305
  JOIN libraries l ON v.library_id = l.id
10037
10306
  WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`
10307
+ ),
10308
+ getPagesByVersionId: this.db.prepare(
10309
+ "SELECT * FROM pages WHERE version_id = ?"
10038
10310
  )
10039
10311
  };
10040
10312
  this.statements = statements;
@@ -10176,7 +10448,7 @@ class DocumentStore {
10176
10448
  this.statements.insertVersion.run(libraryId, normalizedVersion);
10177
10449
  const versionIdRow = this.statements.resolveVersionId.get(
10178
10450
  libraryId,
10179
- normalizedVersion === null ? "" : normalizedVersion
10451
+ normalizedVersion
10180
10452
  );
10181
10453
  if (!versionIdRow || typeof versionIdRow.id !== "number") {
10182
10454
  throw new StoreError(
@@ -10238,6 +10510,32 @@ class DocumentStore {
10238
10510
  throw new StoreError(`Failed to get versions by status: ${error}`);
10239
10511
  }
10240
10512
  }
10513
+ /**
10514
+ * Retrieves a version by its ID.
10515
+ * @param versionId The version ID to retrieve
10516
+ * @returns The version record, or null if not found
10517
+ */
10518
+ async getVersionById(versionId) {
10519
+ try {
10520
+ const row = this.statements.getVersionById.get(versionId);
10521
+ return row || null;
10522
+ } catch (error) {
10523
+ throw new StoreError(`Failed to get version by ID: ${error}`);
10524
+ }
10525
+ }
10526
+ /**
10527
+ * Retrieves a library by its ID.
10528
+ * @param libraryId The library ID to retrieve
10529
+ * @returns The library record, or null if not found
10530
+ */
10531
+ async getLibraryById(libraryId) {
10532
+ try {
10533
+ const row = this.statements.getLibraryById.get(libraryId);
10534
+ return row || null;
10535
+ } catch (error) {
10536
+ throw new StoreError(`Failed to get library by ID: ${error}`);
10537
+ }
10538
+ }
10241
10539
  /**
10242
10540
  * Stores scraper options for a version to enable reproducible indexing.
10243
10541
  * @param versionId The version ID to update
@@ -10245,7 +10543,15 @@ class DocumentStore {
10245
10543
  */
10246
10544
  async storeScraperOptions(versionId, options) {
10247
10545
  try {
10248
- const { url: source_url, library, version: version2, signal, ...scraper_options } = options;
10546
+ const {
10547
+ url: source_url,
10548
+ library: _library,
10549
+ version: _version,
10550
+ signal: _signal,
10551
+ initialQueue: _initialQueue,
10552
+ isRefresh: _isRefresh,
10553
+ ...scraper_options
10554
+ } = options;
10249
10555
  const optionsJson = JSON.stringify(scraper_options);
10250
10556
  this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
10251
10557
  } catch (error) {
@@ -10356,36 +10662,96 @@ class DocumentStore {
10356
10662
  throw new ConnectionError("Failed to query library versions", error);
10357
10663
  }
10358
10664
  }
10665
+ /**
10666
+ * Helper method to detect if an error is related to input size limits.
10667
+ * Checks for common error messages from various embedding providers.
10668
+ */
10669
+ isInputSizeError(error) {
10670
+ if (!(error instanceof Error)) return false;
10671
+ const message = error.message.toLowerCase();
10672
+ return message.includes("maximum context length") || message.includes("too long") || message.includes("token limit") || message.includes("input is too large") || message.includes("exceeds") || message.includes("max") && message.includes("token");
10673
+ }
10674
+ /**
10675
+ * Creates embeddings for an array of texts with automatic retry logic for size-related errors.
10676
+ * If a batch fails due to size limits:
10677
+ * - Batches with multiple texts are split in half and retried recursively
10678
+ * - Single texts that are too large are truncated and retried once
10679
+ *
10680
+ * @param texts Array of texts to embed
10681
+ * @returns Array of embedding vectors
10682
+ */
10683
+ async embedDocumentsWithRetry(texts) {
10684
+ if (texts.length === 0) {
10685
+ return [];
10686
+ }
10687
+ try {
10688
+ return await this.embeddings.embedDocuments(texts);
10689
+ } catch (error) {
10690
+ if (this.isInputSizeError(error)) {
10691
+ if (texts.length > 1) {
10692
+ const midpoint = Math.floor(texts.length / 2);
10693
+ const firstHalf = texts.slice(0, midpoint);
10694
+ const secondHalf = texts.slice(midpoint);
10695
+ logger.warn(
10696
+ `⚠️ Batch of ${texts.length} texts exceeded size limit, splitting into ${firstHalf.length} + ${secondHalf.length}`
10697
+ );
10698
+ const [firstEmbeddings, secondEmbeddings] = await Promise.all([
10699
+ this.embedDocumentsWithRetry(firstHalf),
10700
+ this.embedDocumentsWithRetry(secondHalf)
10701
+ ]);
10702
+ return [...firstEmbeddings, ...secondEmbeddings];
10703
+ } else {
10704
+ const text = texts[0];
10705
+ const midpoint = Math.floor(text.length / 2);
10706
+ const firstHalf = text.substring(0, midpoint);
10707
+ logger.warn(
10708
+ `⚠️ Single text exceeded embedding size limit (${text.length} chars). Truncating at ${firstHalf.length} chars.`
10709
+ );
10710
+ try {
10711
+ const embedding = await this.embedDocumentsWithRetry([firstHalf]);
10712
+ logger.info(
10713
+ `✓ Using embedding from first half of split text (${firstHalf.length} chars)`
10714
+ );
10715
+ return embedding;
10716
+ } catch (retryError) {
10717
+ logger.error(
10718
+ `❌ Failed to embed even after splitting. Original length: ${text.length}`
10719
+ );
10720
+ throw retryError;
10721
+ }
10722
+ }
10723
+ }
10724
+ throw error;
10725
+ }
10726
+ }
10359
10727
  /**
10360
10728
  * Stores documents with library and version metadata, generating embeddings
10361
10729
  * for vector similarity search. Uses the new pages table to normalize page-level
10362
10730
  * metadata and avoid duplication across document chunks.
10363
10731
  */
10364
- async addDocuments(library, version2, documents) {
10732
+ async addDocuments(library, version2, depth, result) {
10365
10733
  try {
10366
- if (documents.length === 0) {
10734
+ const { title, url, chunks } = result;
10735
+ if (chunks.length === 0) {
10367
10736
  return;
10368
10737
  }
10369
- const documentsByUrl = /* @__PURE__ */ new Map();
10370
- for (const doc of documents) {
10371
- const url = doc.metadata.url;
10372
- if (!url || typeof url !== "string" || !url.trim()) {
10373
- throw new StoreError("Document metadata must include a valid URL");
10374
- }
10375
- if (!documentsByUrl.has(url)) {
10376
- documentsByUrl.set(url, []);
10377
- }
10378
- documentsByUrl.get(url)?.push(doc);
10379
- }
10380
10738
  let paddedEmbeddings = [];
10381
10739
  if (this.isVectorSearchEnabled) {
10382
- const texts = documents.map((doc) => {
10383
- const header = `<title>${doc.metadata.title}</title>
10384
- <url>${doc.metadata.url}</url>
10385
- <path>${(doc.metadata.path || []).join(" / ")}</path>
10740
+ const texts = chunks.map((chunk) => {
10741
+ const header = `<title>${title}</title>
10742
+ <url>${url}</url>
10743
+ <path>${(chunk.section.path || []).join(" / ")}</path>
10386
10744
  `;
10387
- return `${header}${doc.pageContent}`;
10745
+ return `${header}${chunk.content}`;
10388
10746
  });
10747
+ for (let i = 0; i < texts.length; i++) {
10748
+ const textSize = texts[i].length;
10749
+ if (textSize > SPLITTER_MAX_CHUNK_SIZE) {
10750
+ logger.warn(
10751
+ `⚠️ Chunk ${i + 1}/${texts.length} exceeds max size: ${textSize} > ${SPLITTER_MAX_CHUNK_SIZE} chars (URL: ${url})`
10752
+ );
10753
+ }
10754
+ }
10389
10755
  const maxBatchChars = EMBEDDING_BATCH_CHARS;
10390
10756
  const rawEmbeddings = [];
10391
10757
  let currentBatch = [];
@@ -10398,7 +10764,7 @@ class DocumentStore {
10398
10764
  logger.debug(
10399
10765
  `Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
10400
10766
  );
10401
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
10767
+ const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
10402
10768
  rawEmbeddings.push(...batchEmbeddings);
10403
10769
  currentBatch = [];
10404
10770
  currentBatchSize = 0;
@@ -10410,7 +10776,7 @@ class DocumentStore {
10410
10776
  logger.debug(
10411
10777
  `Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
10412
10778
  );
10413
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
10779
+ const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
10414
10780
  rawEmbeddings.push(...batchEmbeddings);
10415
10781
  currentBatch = [];
10416
10782
  currentBatchSize = 0;
@@ -10421,110 +10787,115 @@ class DocumentStore {
10421
10787
  logger.debug(
10422
10788
  `Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
10423
10789
  );
10424
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
10790
+ const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
10425
10791
  rawEmbeddings.push(...batchEmbeddings);
10426
10792
  }
10427
10793
  paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
10428
10794
  }
10429
10795
  const versionId = await this.resolveVersionId(library, version2);
10430
- for (const url of documentsByUrl.keys()) {
10431
- const deletedCount = await this.deleteDocumentsByUrl(library, version2, url);
10432
- if (deletedCount > 0) {
10433
- logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`);
10434
- }
10435
- }
10436
- const transaction = this.db.transaction((docsByUrl) => {
10437
- const pageIds = /* @__PURE__ */ new Map();
10438
- for (const [url, urlDocs] of docsByUrl) {
10439
- const firstDoc = urlDocs[0];
10440
- const title = firstDoc.metadata.title || "";
10441
- const contentType = firstDoc.metadata.contentType || null;
10442
- this.statements.insertPage.run(
10443
- versionId,
10444
- url,
10445
- title,
10446
- null,
10447
- // etag - will be populated during scraping
10448
- null,
10449
- // last_modified - will be populated during scraping
10450
- contentType
10451
- );
10452
- const existingPage = this.statements.getPageId.get(versionId, url);
10453
- if (!existingPage) {
10454
- throw new StoreError(`Failed to get page ID for URL: ${url}`);
10455
- }
10456
- const pageId = existingPage.id;
10457
- pageIds.set(url, pageId);
10796
+ const existingPage = this.statements.getPageId.get(versionId, url);
10797
+ if (existingPage) {
10798
+ const result2 = this.statements.deleteDocumentsByPageId.run(existingPage.id);
10799
+ if (result2.changes > 0) {
10800
+ logger.debug(`Deleted ${result2.changes} existing documents for URL: ${url}`);
10801
+ }
10802
+ }
10803
+ const transaction = this.db.transaction(() => {
10804
+ const contentType = result.contentType || null;
10805
+ const etag = result.etag || null;
10806
+ const lastModified = result.lastModified || null;
10807
+ this.statements.insertPage.run(
10808
+ versionId,
10809
+ url,
10810
+ title || "",
10811
+ etag,
10812
+ lastModified,
10813
+ contentType,
10814
+ depth
10815
+ );
10816
+ const existingPage2 = this.statements.getPageId.get(versionId, url);
10817
+ if (!existingPage2) {
10818
+ throw new StoreError(`Failed to get page ID for URL: ${url}`);
10458
10819
  }
10820
+ const pageId = existingPage2.id;
10459
10821
  let docIndex = 0;
10460
- for (const [url, urlDocs] of docsByUrl) {
10461
- const pageId = pageIds.get(url);
10462
- if (!pageId) {
10463
- throw new StoreError(`Failed to get page ID for URL: ${url}`);
10464
- }
10465
- for (let i = 0; i < urlDocs.length; i++) {
10466
- const doc = urlDocs[i];
10467
- const {
10468
- url: _,
10469
- title: __,
10470
- library: ___,
10471
- version: ____,
10472
- ...chunkMetadata
10473
- } = doc.metadata;
10474
- const result = this.statements.insertDocument.run(
10475
- pageId,
10476
- doc.pageContent,
10477
- JSON.stringify(chunkMetadata),
10478
- i
10479
- // sort_order within this page
10822
+ for (let i = 0; i < chunks.length; i++) {
10823
+ const chunk = chunks[i];
10824
+ const result2 = this.statements.insertDocument.run(
10825
+ pageId,
10826
+ chunk.content,
10827
+ JSON.stringify({
10828
+ types: chunk.types,
10829
+ level: chunk.section.level,
10830
+ path: chunk.section.path
10831
+ }),
10832
+ i
10833
+ // sort_order within this page
10834
+ );
10835
+ const rowId = result2.lastInsertRowid;
10836
+ if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
10837
+ this.statements.insertEmbedding.run(
10838
+ BigInt(rowId),
10839
+ JSON.stringify(paddedEmbeddings[docIndex])
10480
10840
  );
10481
- const rowId = result.lastInsertRowid;
10482
- if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
10483
- this.statements.insertEmbedding.run(
10484
- BigInt(rowId),
10485
- JSON.stringify(paddedEmbeddings[docIndex])
10486
- );
10487
- }
10488
- docIndex++;
10489
10841
  }
10842
+ docIndex++;
10490
10843
  }
10491
10844
  });
10492
- transaction(documentsByUrl);
10845
+ transaction();
10493
10846
  } catch (error) {
10494
10847
  throw new ConnectionError("Failed to add documents to store", error);
10495
10848
  }
10496
10849
  }
10497
10850
  /**
10498
- * Removes documents matching specified library and version
10851
+ * Removes documents and pages matching specified library and version.
10852
+ * This consolidated method deletes both documents and their associated pages.
10499
10853
  * @returns Number of documents deleted
10500
10854
  */
10501
- async deleteDocuments(library, version2) {
10855
+ async deletePages(library, version2) {
10502
10856
  try {
10503
10857
  const normalizedVersion = version2.toLowerCase();
10504
10858
  const result = this.statements.deleteDocuments.run(
10505
10859
  library.toLowerCase(),
10506
10860
  normalizedVersion
10507
10861
  );
10862
+ this.statements.deletePages.run(library.toLowerCase(), normalizedVersion);
10508
10863
  return result.changes;
10509
10864
  } catch (error) {
10510
10865
  throw new ConnectionError("Failed to delete documents", error);
10511
10866
  }
10512
10867
  }
10513
10868
  /**
10514
- * Removes documents for a specific URL within a library and version
10515
- * @returns Number of documents deleted
10869
+ * Deletes a page and all its associated document chunks.
10870
+ * Performs manual deletion in the correct order to satisfy foreign key constraints:
10871
+ * 1. Delete document chunks (page_id references pages.id)
10872
+ * 2. Delete page record
10873
+ *
10874
+ * This method is used during refresh operations when a page returns 404 Not Found.
10516
10875
  */
10517
- async deleteDocumentsByUrl(library, version2, url) {
10876
+ async deletePage(pageId) {
10518
10877
  try {
10519
- const normalizedVersion = version2.toLowerCase();
10520
- const result = this.statements.deleteDocumentsByUrl.run(
10521
- url,
10522
- library.toLowerCase(),
10523
- normalizedVersion
10524
- );
10525
- return result.changes;
10878
+ const docResult = this.statements.deleteDocumentsByPageId.run(pageId);
10879
+ logger.debug(`Deleted ${docResult.changes} document(s) for page ID ${pageId}`);
10880
+ const pageResult = this.statements.deletePage.run(pageId);
10881
+ if (pageResult.changes > 0) {
10882
+ logger.debug(`Deleted page record for page ID ${pageId}`);
10883
+ }
10884
+ } catch (error) {
10885
+ throw new ConnectionError(`Failed to delete page ${pageId}`, error);
10886
+ }
10887
+ }
10888
+ /**
10889
+ * Retrieves all pages for a specific version ID with their metadata.
10890
+ * Used for refresh operations to get existing pages with their ETags and depths.
10891
+ * @returns Array of page records
10892
+ */
10893
+ async getPagesByVersionId(versionId) {
10894
+ try {
10895
+ const result = this.statements.getPagesByVersionId.all(versionId);
10896
+ return result;
10526
10897
  } catch (error) {
10527
- throw new ConnectionError("Failed to delete documents by URL", error);
10898
+ throw new ConnectionError("Failed to get pages by version ID", error);
10528
10899
  }
10529
10900
  }
10530
10901
  /**
@@ -10547,7 +10918,7 @@ class DocumentStore {
10547
10918
  return { documentsDeleted: 0, versionDeleted: false, libraryDeleted: false };
10548
10919
  }
10549
10920
  const { id: versionId, library_id: libraryId } = versionResult;
10550
- const documentsDeleted = await this.deleteDocuments(library, version2);
10921
+ const documentsDeleted = await this.deletePages(library, version2);
10551
10922
  this.statements.deletePages.run(normalizedLibrary, normalizedVersion);
10552
10923
  const versionDeleteResult = this.statements.deleteVersionById.run(versionId);
10553
10924
  const versionDeleted = versionDeleteResult.changes > 0;
@@ -10565,6 +10936,27 @@ class DocumentStore {
10565
10936
  throw new ConnectionError("Failed to remove version", error);
10566
10937
  }
10567
10938
  }
10939
+ /**
10940
+ * Parses the metadata field from a JSON string to an object.
10941
+ * This is necessary because better-sqlite3's json() function returns a string, not an object.
10942
+ */
10943
+ parseMetadata(row) {
10944
+ if (row.metadata && typeof row.metadata === "string") {
10945
+ try {
10946
+ row.metadata = JSON.parse(row.metadata);
10947
+ } catch (error) {
10948
+ logger.warn(`Failed to parse metadata JSON: ${error}`);
10949
+ row.metadata = {};
10950
+ }
10951
+ }
10952
+ return row;
10953
+ }
10954
+ /**
10955
+ * Parses metadata for an array of rows.
10956
+ */
10957
+ parseMetadataArray(rows) {
10958
+ return rows.map((row) => this.parseMetadata(row));
10959
+ }
10568
10960
  /**
10569
10961
  * Retrieves a document by its ID.
10570
10962
  * @param id The ID of the document.
@@ -10572,13 +10964,11 @@ class DocumentStore {
10572
10964
  */
10573
10965
  async getById(id) {
10574
10966
  try {
10575
- const row = this.statements.getById.get(
10576
- BigInt(id)
10577
- );
10967
+ const row = this.statements.getById.get(BigInt(id));
10578
10968
  if (!row) {
10579
10969
  return null;
10580
10970
  }
10581
- return mapDbDocumentToDocument(row);
10971
+ return this.parseMetadata(row);
10582
10972
  } catch (error) {
10583
10973
  throw new ConnectionError(`Failed to get document by ID ${id}`, error);
10584
10974
  }
@@ -10662,26 +11052,20 @@ class DocumentStore {
10662
11052
  );
10663
11053
  const rankedResults = this.assignRanks(rawResults);
10664
11054
  const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
10665
- return topResults.map((row) => ({
10666
- ...mapDbDocumentToDocument({
11055
+ return topResults.map((row) => {
11056
+ const result = {
10667
11057
  ...row,
10668
11058
  url: row.url || "",
10669
11059
  // Ensure url is never undefined
10670
- title: row.title,
10671
- content_type: row.content_type
10672
- }),
10673
- metadata: {
10674
- ...JSON.parse(row.metadata),
10675
- id: row.id,
11060
+ title: row.title || null,
11061
+ content_type: row.content_type || null
11062
+ };
11063
+ return Object.assign(result, {
10676
11064
  score: row.rrf_score,
10677
11065
  vec_rank: row.vec_rank,
10678
- fts_rank: row.fts_rank,
10679
- // Explicitly add page fields if they exist
10680
- url: row.url || "",
10681
- title: row.title || "",
10682
- ...row.content_type && { contentType: row.content_type }
10683
- }
10684
- }));
11066
+ fts_rank: row.fts_rank
11067
+ });
11068
+ });
10685
11069
  } else {
10686
11070
  const stmt = this.db.prepare(`
10687
11071
  SELECT
@@ -10713,28 +11097,21 @@ class DocumentStore {
10713
11097
  ftsQuery,
10714
11098
  limit
10715
11099
  );
10716
- return rawResults.map((row, index) => ({
10717
- ...mapDbDocumentToDocument({
11100
+ return rawResults.map((row, index) => {
11101
+ const result = {
10718
11102
  ...row,
10719
11103
  url: row.url || "",
10720
11104
  // Ensure url is never undefined
10721
- title: row.title,
10722
- content_type: row.content_type
10723
- }),
10724
- metadata: {
10725
- ...JSON.parse(row.metadata),
10726
- id: row.id,
11105
+ title: row.title || null,
11106
+ content_type: row.content_type || null
11107
+ };
11108
+ return Object.assign(result, {
10727
11109
  score: -row.fts_score,
10728
11110
  // Convert BM25 score to positive value for consistency
10729
- fts_rank: index + 1,
11111
+ fts_rank: index + 1
10730
11112
  // Assign rank based on order (1-based)
10731
- // Explicitly ensure vec_rank is not included in FTS-only mode
10732
- // Explicitly add page fields
10733
- url: row.url || "",
10734
- title: row.title || "",
10735
- ...row.content_type && { contentType: row.content_type }
10736
- }
10737
- }));
11113
+ });
11114
+ });
10738
11115
  }
10739
11116
  } catch (error) {
10740
11117
  throw new ConnectionError(
@@ -10753,18 +11130,17 @@ class DocumentStore {
10753
11130
  return [];
10754
11131
  }
10755
11132
  const parentPath = parent.metadata.path ?? [];
10756
- const parentUrl = parent.metadata.url;
10757
11133
  const normalizedVersion = version2.toLowerCase();
10758
11134
  const result = this.statements.getChildChunks.all(
10759
11135
  library.toLowerCase(),
10760
11136
  normalizedVersion,
10761
- parentUrl,
11137
+ parent.url,
10762
11138
  parentPath.length + 1,
10763
11139
  JSON.stringify(parentPath),
10764
11140
  BigInt(id),
10765
11141
  limit
10766
11142
  );
10767
- return result.map((row) => mapDbDocumentToDocument(row));
11143
+ return this.parseMetadataArray(result);
10768
11144
  } catch (error) {
10769
11145
  throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
10770
11146
  }
@@ -10778,17 +11154,16 @@ class DocumentStore {
10778
11154
  if (!reference) {
10779
11155
  return [];
10780
11156
  }
10781
- const refMetadata = reference.metadata;
10782
11157
  const normalizedVersion = version2.toLowerCase();
10783
11158
  const result = this.statements.getPrecedingSiblings.all(
10784
11159
  library.toLowerCase(),
10785
11160
  normalizedVersion,
10786
- refMetadata.url,
11161
+ reference.url,
10787
11162
  BigInt(id),
10788
- JSON.stringify(refMetadata.path),
11163
+ JSON.stringify(reference.metadata.path),
10789
11164
  limit
10790
11165
  );
10791
- return result.reverse().map((row) => mapDbDocumentToDocument(row));
11166
+ return this.parseMetadataArray(result).reverse();
10792
11167
  } catch (error) {
10793
11168
  throw new ConnectionError(
10794
11169
  `Failed to find preceding sibling chunks for ID ${id}`,
@@ -10805,17 +11180,16 @@ class DocumentStore {
10805
11180
  if (!reference) {
10806
11181
  return [];
10807
11182
  }
10808
- const refMetadata = reference.metadata;
10809
11183
  const normalizedVersion = version2.toLowerCase();
10810
11184
  const result = this.statements.getSubsequentSiblings.all(
10811
11185
  library.toLowerCase(),
10812
11186
  normalizedVersion,
10813
- refMetadata.url,
11187
+ reference.url,
10814
11188
  BigInt(id),
10815
- JSON.stringify(refMetadata.path),
11189
+ JSON.stringify(reference.metadata.path),
10816
11190
  limit
10817
11191
  );
10818
- return result.map((row) => mapDbDocumentToDocument(row));
11192
+ return this.parseMetadataArray(result);
10819
11193
  } catch (error) {
10820
11194
  throw new ConnectionError(
10821
11195
  `Failed to find subsequent sibling chunks for ID ${id}`,
@@ -10825,6 +11199,8 @@ class DocumentStore {
10825
11199
  }
10826
11200
  /**
10827
11201
  * Finds the parent chunk of a given document.
11202
+ * Returns null if no parent is found or if there's a database error.
11203
+ * Database errors are logged but not thrown to maintain consistent behavior.
10828
11204
  */
10829
11205
  async findParentChunk(library, version2, id) {
10830
11206
  try {
@@ -10832,8 +11208,7 @@ class DocumentStore {
10832
11208
  if (!child) {
10833
11209
  return null;
10834
11210
  }
10835
- const childMetadata = child.metadata;
10836
- const path2 = childMetadata.path ?? [];
11211
+ const path2 = child.metadata.path ?? [];
10837
11212
  const parentPath = path2.slice(0, -1);
10838
11213
  if (parentPath.length === 0) {
10839
11214
  return null;
@@ -10842,21 +11217,22 @@ class DocumentStore {
10842
11217
  const result = this.statements.getParentChunk.get(
10843
11218
  library.toLowerCase(),
10844
11219
  normalizedVersion,
10845
- childMetadata.url,
11220
+ child.url,
10846
11221
  JSON.stringify(parentPath),
10847
11222
  BigInt(id)
10848
11223
  );
10849
11224
  if (!result) {
10850
11225
  return null;
10851
11226
  }
10852
- return mapDbDocumentToDocument(result);
11227
+ return this.parseMetadata(result);
10853
11228
  } catch (error) {
10854
- throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
11229
+ logger.warn(`Failed to find parent chunk for ID ${id}: ${error}`);
11230
+ return null;
10855
11231
  }
10856
11232
  }
10857
11233
  /**
10858
11234
  * Fetches multiple documents by their IDs in a single call.
10859
- * Returns an array of Document objects, sorted by their sort_order.
11235
+ * Returns an array of DbPageChunk objects, sorted by their sort_order.
10860
11236
  */
10861
11237
  async findChunksByIds(library, version2, ids) {
10862
11238
  if (!ids.length) return [];
@@ -10864,7 +11240,7 @@ class DocumentStore {
10864
11240
  const normalizedVersion = version2.toLowerCase();
10865
11241
  const placeholders = ids.map(() => "?").join(",");
10866
11242
  const stmt = this.db.prepare(
10867
- `SELECT d.*, p.url, p.title, p.content_type FROM documents d
11243
+ `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
10868
11244
  JOIN pages p ON d.page_id = p.id
10869
11245
  JOIN versions v ON p.version_id = v.id
10870
11246
  JOIN libraries l ON v.library_id = l.id
@@ -10878,20 +11254,20 @@ class DocumentStore {
10878
11254
  normalizedVersion,
10879
11255
  ...ids
10880
11256
  );
10881
- return rows.map((row) => mapDbDocumentToDocument(row));
11257
+ return this.parseMetadataArray(rows);
10882
11258
  } catch (error) {
10883
11259
  throw new ConnectionError("Failed to fetch documents by IDs", error);
10884
11260
  }
10885
11261
  }
10886
11262
  /**
10887
11263
  * Fetches all document chunks for a specific URL within a library and version.
10888
- * Returns documents sorted by their sort_order for proper reassembly.
11264
+ * Returns DbPageChunk objects sorted by their sort_order for proper reassembly.
10889
11265
  */
10890
11266
  async findChunksByUrl(library, version2, url) {
10891
11267
  try {
10892
11268
  const normalizedVersion = version2.toLowerCase();
10893
11269
  const stmt = this.db.prepare(
10894
- `SELECT d.*, p.url, p.title, p.content_type FROM documents d
11270
+ `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
10895
11271
  JOIN pages p ON d.page_id = p.id
10896
11272
  JOIN versions v ON p.version_id = v.id
10897
11273
  JOIN libraries l ON v.library_id = l.id
@@ -10905,7 +11281,7 @@ class DocumentStore {
10905
11281
  normalizedVersion,
10906
11282
  url
10907
11283
  );
10908
- return rows.map((row) => mapDbDocumentToDocument(row));
11284
+ return this.parseMetadataArray(rows);
10909
11285
  } catch (error) {
10910
11286
  throw new ConnectionError(`Failed to fetch documents by URL ${url}`, error);
10911
11287
  }
@@ -10923,9 +11299,8 @@ class DocumentManagementService {
10923
11299
  return (version2 ?? "").toLowerCase();
10924
11300
  }
10925
11301
  constructor(storePath, embeddingConfig, pipelineConfig) {
10926
- const dbDir = storePath;
10927
- const dbPath = path.join(dbDir, "documents.db");
10928
- logger.debug(`Using database directory: ${dbDir}`);
11302
+ const dbPath = storePath === ":memory:" ? ":memory:" : path.join(storePath, "documents.db");
11303
+ logger.debug(`Using database path: ${dbPath}`);
10929
11304
  this.store = new DocumentStore(dbPath, embeddingConfig);
10930
11305
  this.documentRetriever = new DocumentRetrieverService(this.store);
10931
11306
  this.pipelines = PipelineFactory$1.createStandardPipelines(pipelineConfig);
@@ -11136,9 +11511,24 @@ class DocumentManagementService {
11136
11511
  logger.info(
11137
11512
  `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
11138
11513
  );
11139
- const count = await this.store.deleteDocuments(library, normalizedVersion);
11514
+ const count = await this.store.deletePages(library, normalizedVersion);
11140
11515
  logger.info(`🗑️ Deleted ${count} documents`);
11141
11516
  }
11517
+ /**
11518
+ * Deletes a page and all its associated document chunks.
11519
+ * This is used during refresh operations when a page returns 404 Not Found.
11520
+ */
11521
+ async deletePage(pageId) {
11522
+ logger.debug(`Deleting page ID: ${pageId}`);
11523
+ await this.store.deletePage(pageId);
11524
+ }
11525
+ /**
11526
+ * Retrieves all pages for a specific version ID with their metadata.
11527
+ * Used for refresh operations to get existing pages with their ETags and depths.
11528
+ */
11529
+ async getPagesByVersionId(versionId) {
11530
+ return this.store.getPagesByVersionId(versionId);
11531
+ }
11142
11532
  /**
11143
11533
  * Completely removes a library version and all associated documents.
11144
11534
  * Also removes the library if no other versions remain.
@@ -11147,15 +11537,13 @@ class DocumentManagementService {
11147
11537
  */
11148
11538
  async removeVersion(library, version2) {
11149
11539
  const normalizedVersion = this.normalizeVersion(version2);
11150
- logger.info(`🗑️ Removing version: ${library}@${normalizedVersion || "[no version]"}`);
11540
+ logger.debug(`Removing version: ${library}@${normalizedVersion || "[no version]"}`);
11151
11541
  const result = await this.store.removeVersion(library, normalizedVersion, true);
11152
- logger.info(
11153
- `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`
11154
- );
11542
+ logger.info(`🗑️ Removed ${result.documentsDeleted} documents`);
11155
11543
  if (result.versionDeleted && result.libraryDeleted) {
11156
- logger.info(`✅ Completely removed library ${library} (was last version)`);
11544
+ logger.info(`🗑️ Completely removed library ${library} (was last version)`);
11157
11545
  } else if (result.versionDeleted) {
11158
- logger.info(`✅ Removed version ${library}@${normalizedVersion || "[no version]"}`);
11546
+ logger.info(`🗑️ Removed version ${library}@${normalizedVersion || "[no version]"}`);
11159
11547
  } else {
11160
11548
  logger.warn(
11161
11549
  `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`
@@ -11163,91 +11551,68 @@ class DocumentManagementService {
11163
11551
  }
11164
11552
  }
11165
11553
  /**
11166
- * Adds a document to the store, splitting it into smaller chunks for better search results.
11167
- * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
11168
- * Preserves hierarchical structure of documents and distinguishes between text and code segments.
11169
- * If version is omitted, the document is added without a specific version.
11554
+ * Adds pre-processed content directly to the store.
11555
+ * This method is used when content has already been processed by a pipeline,
11556
+ * avoiding redundant processing. Used primarily by the scraping pipeline.
11557
+ *
11558
+ * @param library Library name
11559
+ * @param version Version string (null/undefined for unversioned)
11560
+ * @param depth Crawl depth at which the page was scraped
11561
+ * @param result Scrape result containing the pre-split chunks
11170
11562
  */
11171
- async addDocument(library, version2, document2) {
11563
+ async addScrapeResult(library, version2, depth, result) {
11172
11564
  const processingStart = performance.now();
11173
11565
  const normalizedVersion = this.normalizeVersion(version2);
11174
- const url = document2.metadata.url;
11175
- if (!url || typeof url !== "string" || !url.trim()) {
11176
- throw new StoreError("Document metadata must include a valid URL");
11566
+ const { url, title, chunks, contentType } = result;
11567
+ if (!url) {
11568
+ throw new StoreError("Processed content metadata must include a valid URL");
11177
11569
  }
11178
- logger.info(`📚 Adding document: ${document2.metadata.title}`);
11179
- if (!document2.pageContent.trim()) {
11180
- throw new Error("Document content cannot be empty");
11570
+ logger.info(`📚 Adding processed content: ${title || url}`);
11571
+ if (chunks.length === 0) {
11572
+ logger.warn(`⚠️ No chunks in processed content for ${url}. Skipping.`);
11573
+ return;
11181
11574
  }
11182
- const contentType = document2.metadata.mimeType;
11183
11575
  try {
11184
- const rawContent = {
11185
- source: url,
11186
- content: document2.pageContent,
11187
- mimeType: contentType || "text/plain"
11188
- };
11189
- const pipeline = this.pipelines.find((p) => p.canProcess(rawContent));
11190
- if (!pipeline) {
11191
- logger.warn(
11192
- `⚠️ Unsupported content type "${rawContent.mimeType}" for document ${url}. Skipping processing.`
11193
- );
11194
- return;
11195
- }
11196
- logger.debug(
11197
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`
11198
- );
11199
- const scraperOptions = {
11200
- url,
11201
- library,
11202
- version: normalizedVersion,
11203
- scrapeMode: ScrapeMode.Fetch,
11204
- ignoreErrors: false,
11205
- maxConcurrency: 1
11206
- };
11207
- const processed = await pipeline.process(rawContent, scraperOptions);
11208
- const chunks = processed.chunks;
11209
- const splitDocs = chunks.map((chunk) => ({
11210
- pageContent: chunk.content,
11211
- metadata: {
11212
- ...document2.metadata,
11213
- level: chunk.section.level,
11214
- path: chunk.section.path
11215
- }
11216
- }));
11217
- logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
11218
- await this.store.addDocuments(library, normalizedVersion, splitDocs);
11576
+ logger.info(`✂️ Storing ${chunks.length} pre-split chunks`);
11577
+ await this.store.addDocuments(library, normalizedVersion, depth, result);
11219
11578
  const processingTime = performance.now() - processingStart;
11579
+ const totalContentSize = chunks.reduce(
11580
+ (sum, chunk) => sum + chunk.content.length,
11581
+ 0
11582
+ );
11220
11583
  analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, {
11221
11584
  // Content characteristics (privacy-safe)
11222
- mimeType: contentType || "unknown",
11223
- contentSizeBytes: document2.pageContent.length,
11585
+ mimeType: contentType,
11586
+ contentSizeBytes: totalContentSize,
11224
11587
  // Processing metrics
11225
11588
  processingTimeMs: Math.round(processingTime),
11226
- chunksCreated: splitDocs.length,
11589
+ chunksCreated: chunks.length,
11227
11590
  // Document characteristics
11228
- hasTitle: !!document2.metadata.title,
11229
- hasDescription: !!document2.metadata.description,
11591
+ hasTitle: !!title,
11230
11592
  urlDomain: extractHostname(url),
11231
- depth: document2.metadata.depth,
11593
+ depth,
11232
11594
  // Library context
11233
11595
  library,
11234
11596
  libraryVersion: normalizedVersion || null,
11235
11597
  // Processing efficiency
11236
- avgChunkSizeBytes: Math.round(document2.pageContent.length / splitDocs.length),
11598
+ avgChunkSizeBytes: Math.round(totalContentSize / chunks.length),
11237
11599
  processingSpeedKbPerSec: Math.round(
11238
- document2.pageContent.length / 1024 / (processingTime / 1e3)
11600
+ totalContentSize / 1024 / (processingTime / 1e3)
11239
11601
  )
11240
11602
  });
11241
11603
  } catch (error) {
11242
11604
  const processingTime = performance.now() - processingStart;
11243
11605
  if (error instanceof Error) {
11244
11606
  analytics.captureException(error, {
11245
- mimeType: contentType || "unknown",
11246
- contentSizeBytes: document2.pageContent.length,
11607
+ mimeType: contentType,
11608
+ contentSizeBytes: chunks.reduce(
11609
+ (sum, chunk) => sum + chunk.content.length,
11610
+ 0
11611
+ ),
11247
11612
  processingTimeMs: Math.round(processingTime),
11248
11613
  library,
11249
11614
  libraryVersion: normalizedVersion || null,
11250
- context: "document_processing",
11615
+ context: "processed_content_storage",
11251
11616
  component: DocumentManagementService.constructor.name
11252
11617
  });
11253
11618
  }
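
The hunk above replaces addDocument(library, version, document) with addScrapeResult(library, version, depth, result): the store now receives content that the scraper pipeline has already split into chunks, so pipeline selection and splitting no longer happen inside this method. Below is a minimal usage sketch. The result and chunk shapes are inferred from the destructuring in the diff ({ url, title, chunks, contentType } and chunk.content / chunk.section); the names storePage and ScrapeResultLike, and the exact field types, are illustrative rather than taken from the package.

// Sketch only: shapes inferred from the diff above, not from published types.
interface SectionInfo {
  level: number;
  path: string[]; // assumed; only chunk.section.path is visible in the old code
}

interface ContentChunk {
  content: string;
  section: SectionInfo;
}

interface ScrapeResultLike {
  url: string;
  title?: string;
  contentType?: string;
  chunks: ContentChunk[];
}

// Hypothetical caller: the scraper pipeline hands pre-split chunks to the
// document management service instead of raw page content.
async function storePage(docService: {
  addScrapeResult(
    library: string,
    version: string | null,
    depth: number,
    result: ScrapeResultLike,
  ): Promise<void>;
}): Promise<void> {
  const result: ScrapeResultLike = {
    url: "https://example.com/docs/getting-started",
    title: "Getting Started",
    contentType: "text/markdown",
    chunks: [
      {
        content: "# Getting Started\nInstall the package and run the server.",
        section: { level: 1, path: ["Getting Started"] },
      },
    ],
  };
  // depth is the crawl depth of the page (0 for the root URL).
  await docService.addScrapeResult("example-lib", "1.0.0", 0, result);
}
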
@@ -11277,6 +11642,18 @@ class DocumentManagementService {
11277
11642
  );
11278
11643
  return versionId;
11279
11644
  }
11645
+ /**
11646
+ * Retrieves a version by its ID from the database.
11647
+ */
11648
+ async getVersionById(versionId) {
11649
+ return this.store.getVersionById(versionId);
11650
+ }
11651
+ /**
11652
+ * Retrieves a library by its ID from the database.
11653
+ */
11654
+ async getLibraryById(libraryId) {
11655
+ return this.store.getLibraryById(libraryId);
11656
+ }
11280
11657
  }
11281
11658
  async function createDocumentManagement(options = {}) {
11282
11659
  if (options.serverUrl) {
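
The same hunk also adds two thin accessors, getVersionById and getLibraryById, which delegate straight to the store. A rough sketch of how a caller such as a refresh job might resolve the records behind a stored page; the numeric ID and the library_id and name fields on the returned records are assumptions, since their shapes are not shown in this diff.

// Illustrative only: the record shapes (library_id, name) are assumed.
async function describeVersion(
  docService: {
    getVersionById(id: number): Promise<{ library_id: number; name: string | null } | null>;
    getLibraryById(id: number): Promise<{ name: string } | null>;
  },
  versionId: number,
): Promise<string | null> {
  const version = await docService.getVersionById(versionId);
  if (!version) return null;
  const library = await docService.getLibraryById(version.library_id);
  return `${library?.name ?? "unknown"}@${version.name ?? "unversioned"}`;
}
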
@@ -11368,6 +11745,7 @@ async function initializeTools(docService, pipeline) {
11368
11745
  listLibraries: new ListLibrariesTool(docService),
11369
11746
  findVersion: new FindVersionTool(docService),
11370
11747
  scrape: new ScrapeTool(pipeline),
11748
+ refresh: new RefreshVersionTool(pipeline),
11371
11749
  search: new SearchTool(docService),
11372
11750
  listJobs: new ListJobsTool(pipeline),
11373
11751
  getJobInfo: new GetJobInfoTool(pipeline),
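
With refresh registered alongside scrape, any caller holding the initialized tools map can trigger a refresh the same way the CLI action later in this diff does. A short sketch reusing the execute() arguments and result branches visible in the refreshAction hunk below; the triggerRefresh wrapper and its loose typing are illustrative.

// Assumes `tools` is the object returned by initializeTools(docService, pipeline);
// the argument names and result branches mirror refreshAction further below.
async function triggerRefresh(tools: {
  refresh: {
    execute(args: {
      library: string;
      version?: string;
      waitForCompletion?: boolean;
    }): Promise<{ pagesRefreshed: number } | { jobId: string }>;
  };
}): Promise<void> {
  const result = await tools.refresh.execute({
    library: "react",
    version: "18.0.0",
    waitForCompletion: false, // return a job ID instead of blocking
  });
  if ("pagesRefreshed" in result) {
    console.log(`Refreshed ${result.pagesRefreshed} pages`);
  } else {
    console.log(`Refresh job queued: ${result.jobId}`);
  }
}
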
@@ -11480,11 +11858,15 @@ const optionalTrimmed = z$1.preprocess(
11480
11858
  (v) => typeof v === "string" ? v.trim() : v,
11481
11859
  z$1.string().min(1).optional().nullable()
11482
11860
  );
11483
- const enqueueInput = z$1.object({
11861
+ const enqueueScrapeInput = z$1.object({
11484
11862
  library: nonEmptyTrimmed,
11485
11863
  version: optionalTrimmed,
11486
11864
  options: z$1.custom()
11487
11865
  });
11866
+ const enqueueRefreshInput = z$1.object({
11867
+ library: nonEmptyTrimmed,
11868
+ version: optionalTrimmed
11869
+ });
11488
11870
  const jobIdInput = z$1.object({ id: z$1.string().min(1) });
11489
11871
  const getJobsInput = z$1.object({
11490
11872
  status: z$1.nativeEnum(PipelineJobStatus).optional()
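
The new enqueueRefreshInput deliberately omits the options field that scrape jobs accept, since a refresh reuses the configuration of the original scrape. A standalone sketch of the two input schemas written against zod directly; the definition of nonEmptyTrimmed is assumed from its name, as only optionalTrimmed appears in this hunk, and the payload values are made up.

import { z } from "zod";

// Rough equivalents of the input schemas shown in the diff.
const nonEmptyTrimmed = z.preprocess(
  (v) => (typeof v === "string" ? v.trim() : v),
  z.string().min(1),
);
const optionalTrimmed = z.preprocess(
  (v) => (typeof v === "string" ? v.trim() : v),
  z.string().min(1).optional().nullable(),
);

const enqueueScrapeInput = z.object({
  library: nonEmptyTrimmed,
  version: optionalTrimmed,
  options: z.custom(), // scraper options pass through unchecked
});
const enqueueRefreshInput = z.object({
  library: nonEmptyTrimmed,
  version: optionalTrimmed,
});

// Scrape jobs carry scraper options; refresh jobs only identify the target version.
enqueueScrapeInput.parse({ library: "react", version: "18.0.0", options: { maxDepth: 2 } });
enqueueRefreshInput.parse({ library: "react", version: "18.0.0" });
enqueueRefreshInput.parse({ library: "react", version: null }); // unversioned library
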
@@ -11492,12 +11874,12 @@ const getJobsInput = z$1.object({
11492
11874
  function createPipelineRouter(trpc) {
11493
11875
  const tt = trpc;
11494
11876
  return tt.router({
11495
- enqueueJob: tt.procedure.input(enqueueInput).mutation(
11877
+ enqueueScrapeJob: tt.procedure.input(enqueueScrapeInput).mutation(
11496
11878
  async ({
11497
11879
  ctx,
11498
11880
  input
11499
11881
  }) => {
11500
- const jobId = await ctx.pipeline.enqueueJob(
11882
+ const jobId = await ctx.pipeline.enqueueScrapeJob(
11501
11883
  input.library,
11502
11884
  input.version ?? null,
11503
11885
  input.options
@@ -11517,6 +11899,18 @@ function createPipelineRouter(trpc) {
11517
11899
  return { jobId };
11518
11900
  }
11519
11901
  ),
11902
+ enqueueRefreshJob: tt.procedure.input(enqueueRefreshInput).mutation(
11903
+ async ({
11904
+ ctx,
11905
+ input
11906
+ }) => {
11907
+ const jobId = await ctx.pipeline.enqueueRefreshJob(
11908
+ input.library,
11909
+ input.version ?? null
11910
+ );
11911
+ return { jobId };
11912
+ }
11913
+ ),
11520
11914
  getJob: tt.procedure.input(jobIdInput).query(
11521
11915
  async ({
11522
11916
  ctx,
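
For RPC clients, renaming enqueueJob to enqueueScrapeJob and adding enqueueRefreshJob is a breaking change at the procedure level. A sketch of what the client-side calls might look like with a generic @trpc/client proxy, assuming the pipeline procedures are reachable at the root of the router (in the deployed server they may be nested); the AppRouter import path, URL, and scraper options are placeholders.

import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
// Placeholder: the package does not document an exported router type under this path.
import type { AppRouter } from "./worker-router";

async function enqueueJobs() {
  const client = createTRPCProxyClient<AppRouter>({
    links: [httpBatchLink({ url: "http://localhost:8080/api" })],
  });

  // Previously a single enqueueJob procedure handled this call.
  const scrape = await client.enqueueScrapeJob.mutate({
    library: "react",
    version: "18.0.0",
    options: { url: "https://react.dev", maxDepth: 2 }, // illustrative scraper options
  });

  // New procedure: refresh an already-indexed version, no scraper options.
  const refresh = await client.enqueueRefreshJob.mutate({
    library: "react",
    version: "18.0.0",
  });

  return { scrapeJobId: scrape.jobId, refreshJobId: refresh.jobId };
}
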
@@ -13456,7 +13850,7 @@ async function registerWorkerService(pipeline) {
13456
13850
  },
13457
13851
  onJobError: async (job, error, document2) => {
13458
13852
  logger.warn(
13459
- `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.metadata.url}` : ""}: ${error.message}`
13853
+ `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.url}` : ""}: ${error.message}`
13460
13854
  );
13461
13855
  analytics.captureException(error, {
13462
13856
  jobId: job.id,
@@ -13996,7 +14390,7 @@ async function findVersionAction(library, options, command) {
13996
14390
  function createFindVersionCommand(program) {
13997
14391
  return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").option(
13998
14392
  "--server-url <url>",
13999
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14393
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14000
14394
  ).action(findVersionAction);
14001
14395
  }
14002
14396
  async function listAction(options, command) {
@@ -14022,7 +14416,7 @@ async function listAction(options, command) {
14022
14416
  function createListCommand(program) {
14023
14417
  return program.command("list").description("List all available libraries and their versions").option(
14024
14418
  "--server-url <url>",
14025
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14419
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14026
14420
  ).action(listAction);
14027
14421
  }
14028
14422
  function createMcpCommand(program) {
@@ -14045,7 +14439,7 @@ function createMcpCommand(program) {
14045
14439
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14046
14440
  ).option(
14047
14441
  "--server-url <url>",
14048
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14442
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14049
14443
  ).option(
14050
14444
  "--read-only",
14051
14445
  "Run in read-only mode (only expose read tools, disable write/job tools)",
@@ -14169,6 +14563,68 @@ function createMcpCommand(program) {
14169
14563
  }
14170
14564
  );
14171
14565
  }
14566
+ async function refreshAction(library, options, command) {
14567
+ await analytics.track(TelemetryEvent.CLI_COMMAND, {
14568
+ command: "refresh",
14569
+ library,
14570
+ version: options.version,
14571
+ useServerUrl: !!options.serverUrl
14572
+ });
14573
+ const serverUrl = options.serverUrl;
14574
+ const globalOptions = getGlobalOptions(command);
14575
+ const embeddingConfig = resolveEmbeddingContext(options.embeddingModel);
14576
+ if (!serverUrl && !embeddingConfig) {
14577
+ throw new Error(
14578
+ "Embedding configuration is required for local refresh operations. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
14579
+ );
14580
+ }
14581
+ const docService = await createDocumentManagement({
14582
+ serverUrl,
14583
+ embeddingConfig,
14584
+ storePath: globalOptions.storePath
14585
+ });
14586
+ let pipeline = null;
14587
+ try {
14588
+ const pipelineOptions = {
14589
+ recoverJobs: false,
14590
+ concurrency: 1,
14591
+ serverUrl
14592
+ };
14593
+ pipeline = await createPipelineWithCallbacks(
14594
+ serverUrl ? void 0 : docService,
14595
+ pipelineOptions
14596
+ );
14597
+ await pipeline.start();
14598
+ const refreshTool = new RefreshVersionTool(pipeline);
14599
+ const result = await refreshTool.execute({
14600
+ library,
14601
+ version: options.version,
14602
+ waitForCompletion: true
14603
+ // Always wait for completion in CLI
14604
+ });
14605
+ if ("pagesRefreshed" in result) {
14606
+ console.log(`✅ Successfully refreshed ${result.pagesRefreshed} pages`);
14607
+ } else {
14608
+ console.log(`🚀 Refresh job started with ID: ${result.jobId}`);
14609
+ }
14610
+ } finally {
14611
+ if (pipeline) await pipeline.stop();
14612
+ await docService.shutdown();
14613
+ }
14614
+ }
14615
+ function createRefreshCommand(program) {
14616
+ return program.command("refresh <library>").description(
14617
+ "Re-scrape an existing library version, updating only changed pages.\n\nUses HTTP ETags to efficiently skip unchanged pages and only re-process\ncontent that has been modified or deleted since the last scrape.\n\nExamples:\n refresh react --version 18.0.0\n refresh mylib\n\nNote: The library and version must already be indexed. Use 'scrape' to index a new library/version."
14618
+ ).option("-v, --version <string>", "Version of the library (optional)").addOption(
14619
+ new Option(
14620
+ "--embedding-model <model>",
14621
+ "Embedding model configuration (e.g., 'openai:text-embedding-3-small')"
14622
+ ).env("DOCS_MCP_EMBEDDING_MODEL")
14623
+ ).option(
14624
+ "--server-url <url>",
14625
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14626
+ ).action(refreshAction);
14627
+ }
14172
14628
  async function removeAction(library, options, command) {
14173
14629
  await analytics.track(TelemetryEvent.CLI_COMMAND, {
14174
14630
  command: "remove",
@@ -14203,7 +14659,7 @@ function createRemoveCommand(program) {
14203
14659
  "Version to remove (optional, removes unversioned if omitted)"
14204
14660
  ).option(
14205
14661
  "--server-url <url>",
14206
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14662
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14207
14663
  ).action(removeAction);
14208
14664
  }
14209
14665
  async function scrapeAction(library, url, options, command) {
@@ -14343,7 +14799,7 @@ function createScrapeCommand(program) {
14343
14799
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14344
14800
  ).option(
14345
14801
  "--server-url <url>",
14346
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14802
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14347
14803
  ).action(scrapeAction);
14348
14804
  }
14349
14805
  async function searchAction(library, query, options, command) {
@@ -14396,7 +14852,7 @@ function createSearchCommand(program) {
14396
14852
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14397
14853
  ).option(
14398
14854
  "--server-url <url>",
14399
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14855
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14400
14856
  ).action(searchAction);
14401
14857
  }
14402
14858
  function createWebCommand(program) {
@@ -14417,7 +14873,7 @@ function createWebCommand(program) {
14417
14873
  ).env("DOCS_MCP_EMBEDDING_MODEL")
14418
14874
  ).option(
14419
14875
  "--server-url <url>",
14420
- "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)"
14876
+ "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
14421
14877
  ).action(
14422
14878
  async (cmdOptions, command) => {
14423
14879
  await analytics.track(TelemetryEvent.CLI_COMMAND, {
@@ -14612,6 +15068,7 @@ function createCliProgram() {
14612
15068
  createWebCommand(program);
14613
15069
  createWorkerCommand(program);
14614
15070
  createScrapeCommand(program);
15071
+ createRefreshCommand(program);
14615
15072
  createSearchCommand(program);
14616
15073
  createListCommand(program);
14617
15074
  createFindVersionCommand(program);