@arabold/docs-mcp-server 1.26.1 → 1.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/db/migrations/010-add-depth-to-pages.sql +16 -0
- package/dist/assets/main.css +1 -1
- package/dist/index.js +1785 -1319
- package/dist/index.js.map +1 -1
- package/package.json +34 -29
- package/public/assets/main.css +1 -1
package/dist/index.js
CHANGED
@@ -1,4 +1,4 @@
-#!/usr/bin/env node
+#!/usr/bin/env node --enable-source-maps
 import "dotenv/config";
 import { BedrockEmbeddings } from "@langchain/aws";
 import { GoogleGenerativeAIEmbeddings } from "@langchain/google-genai";
@@ -6,7 +6,7 @@ import { VertexAIEmbeddings } from "@langchain/google-vertexai";
 import { AzureOpenAIEmbeddings, OpenAIEmbeddings } from "@langchain/openai";
 import { Embeddings } from "@langchain/core/embeddings";
 import { PostHog } from "posthog-node";
-import { randomUUID } from "node:crypto";
+import crypto, { randomUUID } from "node:crypto";
 import fs, { existsSync, readFileSync } from "node:fs";
 import path from "node:path";
 import { fileURLToPath, URL as URL$1 } from "node:url";
@@ -27,6 +27,7 @@ import psl from "psl";
 import { HeaderGenerator } from "header-generator";
 import fs$1 from "node:fs/promises";
 import axios from "axios";
+import { minimatch } from "minimatch";
 import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
 import remarkGfm from "remark-gfm";
 import remarkHtml from "remark-html";
@@ -40,7 +41,6 @@ import * as cheerio from "cheerio";
 import "node:vm";
 import { gfm } from "@joplin/turndown-plugin-gfm";
 import iconv from "iconv-lite";
-import { minimatch } from "minimatch";
 import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
 import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
 import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
@@ -113,21 +113,6 @@ class MissingCredentialsError extends StoreError {
   }
 }
 const VECTOR_DIMENSION = 1536;
-function mapDbDocumentToDocument(doc) {
-  const chunkMetadata = JSON.parse(doc.metadata);
-  return {
-    id: doc.id,
-    pageContent: doc.content,
-    metadata: {
-      ...chunkMetadata,
-      // Page-level fields are always available from joined queries
-      url: doc.url,
-      title: doc.title || "",
-      // Convert null to empty string for consistency
-      ...doc.content_type && { contentType: doc.content_type }
-    }
-  };
-}
 var VersionStatus = /* @__PURE__ */ ((VersionStatus2) => {
   VersionStatus2["NOT_INDEXED"] = "not_indexed";
   VersionStatus2["QUEUED"] = "queued";
@@ -784,16 +769,16 @@ function extractProtocol(urlOrPath) {
   }
 }
 const name = "@arabold/docs-mcp-server";
-const version = "1.26.
+const version = "1.26.2";
 const description = "MCP server for fetching and searching documentation";
 const type = "module";
 const bin = { "docs-mcp-server": "dist/index.js" };
 const license = "MIT";
 const repository = { "type": "git", "url": "git+https://github.com/arabold/docs-mcp-server.git" };
 const files = ["dist", "public", "db", "README.md", "LICENSE", "package.json"];
-const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev": "npm-run-all --parallel dev:server dev:web", "dev:server": "vite-node --watch src/index.ts", "dev:web": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "test:
-const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.
-const devDependencies = { "@biomejs/biome": "^2.
+const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev": "npm-run-all --parallel dev:server dev:web", "dev:server": "vite-node --watch src/index.ts", "dev:web": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "test:unit": "vitest run src", "test:e2e": "vitest run test", "test:live": "vitest run --exclude= test/html-pipeline-live-e2e.test.ts", "lint": "biome check .", "lint:fix": "biome check . --fix", "typecheck": "npx tsc --noEmit", "typecheck:build": "npx tsc --noEmit --project tsconfig.build.json", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. See README.md for details.'" };
+const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.3.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", "@langchain/aws": "^0.1.13", "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", "@modelcontextprotocol/sdk": "^1.20.2", "@trpc/client": "^11.7.1", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", "axios": "^1.13.1", "axios-retry": "^4.5.0", "better-sqlite3": "^12.4.1", "cheerio": "^1.1.2", "commander": "^14.0.0", "dompurify": "^3.3.0", "dotenv": "^17.2.3", "env-paths": "^3.0.0", "fastify": "^5.6.1", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", "header-generator": "^2.1.76", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", "jose": "^6.1.0", "jsdom": "^26.1.0", "langchain": "^0.3.30", "mime": "^4.1.0", "minimatch": "^10.0.1", "playwright": "^1.52.0", "posthog-node": "^5.11.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", "semver": "^7.7.3", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", "turndown": "^7.2.2", "zod": "^4.1.12" };
+const devDependencies = { "@biomejs/biome": "^2.3.2", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^11.0.6", "@semantic-release/npm": "^12.0.2", "@tailwindcss/postcss": "^4.1.16", "@tailwindcss/vite": "^4.1.16", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", "@types/lint-staged": "~13.3.0", "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", "@types/semver": "^7.7.1", "@types/turndown": "^5.0.6", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", "lint-staged": "^16.2.6", "memfs": "^4.50.0", "msw": "^2.12.2", "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", "semantic-release": "^24.2.9", "tailwindcss": "^4.1.4", "typescript": "^5.9.3", "vite": "^6.3.5", "vite-node": "^3.2.4", "vite-plugin-dts": "^4.5.4", "vitest": "^3.2.4" };
 const engines = { "node": ">=20.0.0" };
 const packageJson = {
   name,
@@ -1288,10 +1273,10 @@ class PipelineClient {
     this.activePolling.clear();
     logger.debug("PipelineClient stopped");
   }
-  async
+  async enqueueScrapeJob(library, version2, options) {
    try {
      const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
-      const result = await this.client.
+      const result = await this.client.enqueueScrapeJob.mutate({
        library,
        version: normalizedVersion,
        options
@@ -1304,6 +1289,21 @@ class PipelineClient {
       );
     }
   }
+  async enqueueRefreshJob(library, version2) {
+    try {
+      const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
+      const result = await this.client.enqueueRefreshJob.mutate({
+        library,
+        version: normalizedVersion
+      });
+      logger.debug(`Refresh job ${result.jobId} enqueued successfully`);
+      return result.jobId;
+    } catch (error) {
+      throw new Error(
+        `Failed to enqueue refresh job: ${error instanceof Error ? error.message : String(error)}`
+      );
+    }
+  }
   async getJob(jobId) {
     try {
       const serializedJob = await this.client.getJob.query({ id: jobId });
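Note: the new `enqueueRefreshJob` mirrors `enqueueScrapeJob` but only takes a library and an optional version. A minimal TypeScript sketch of how a caller might use it; the wrapper function and the logging are illustrative, not part of this package.

```ts
// Hypothetical caller: re-index an already-scraped library without re-supplying scrape options.
async function refreshLibrary(client: PipelineClient, library: string, version?: string) {
  // Empty-string versions are normalized to null inside the client, matching enqueueScrapeJob.
  const jobId = await client.enqueueRefreshJob(library, version ?? null);
  console.log(`Refresh job ${jobId} enqueued for ${library}@${version ?? "unversioned"}`);
  return jobId;
}
```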
@@ -1753,6 +1753,12 @@ class FingerprintGenerator {
     return this.headerGenerator.getHeaders();
   }
 }
+var FetchStatus = /* @__PURE__ */ ((FetchStatus2) => {
+  FetchStatus2["SUCCESS"] = "success";
+  FetchStatus2["NOT_MODIFIED"] = "not_modified";
+  FetchStatus2["NOT_FOUND"] = "not_found";
+  return FetchStatus2;
+})(FetchStatus || {});
 class BrowserFetcher {
   browser = null;
   page = null;
@@ -1792,13 +1798,16 @@ class BrowserFetcher {
       const contentBuffer = Buffer.from(content, "utf-8");
       const contentType = response.headers()["content-type"] || "text/html";
       const { mimeType, charset } = MimeTypeUtils.parseContentType(contentType);
+      const etag = response.headers().etag;
       return {
         content: contentBuffer,
         mimeType,
         charset,
         encoding: void 0,
         // Browser handles encoding automatically
-        source: finalUrl
+        source: finalUrl,
+        etag,
+        status: FetchStatus.SUCCESS
       };
     } catch (error) {
       if (options?.signal?.aborted) {
@@ -1859,24 +1868,48 @@ class FileFetcher {
   /**
    * Fetches the content of a file given a file:// URL, decoding percent-encoded paths as needed.
    * Uses enhanced MIME type detection for better source code file recognition.
+   * Supports conditional fetching via ETag comparison for efficient refresh operations.
    */
-  async fetch(source,
+  async fetch(source, options) {
     let filePath = source.replace(/^file:\/\/\/?/, "");
     filePath = decodeURIComponent(filePath);
     if (!filePath.startsWith("/") && process.platform !== "win32") {
       filePath = `/${filePath}`;
     }
     try {
+      const stats = await fs$1.stat(filePath);
+      const currentEtag = crypto.createHash("md5").update(stats.mtime.toISOString()).digest("hex");
+      if (options?.etag && options.etag === currentEtag) {
+        return {
+          content: Buffer.from(""),
+          mimeType: "text/plain",
+          source,
+          etag: currentEtag,
+          lastModified: stats.mtime.toISOString(),
+          status: FetchStatus.NOT_MODIFIED
+        };
+      }
       const content = await fs$1.readFile(filePath);
       const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
       const mimeType = detectedMimeType || "application/octet-stream";
       return {
         content,
         mimeType,
-        source
+        source,
+        etag: currentEtag,
+        lastModified: stats.mtime.toISOString(),
+        status: FetchStatus.SUCCESS
         // Don't assume charset for text files - let the pipeline detect it
       };
     } catch (error) {
+      if (error.code === "ENOENT") {
+        return {
+          content: Buffer.from(""),
+          mimeType: "text/plain",
+          source,
+          status: FetchStatus.NOT_FOUND
+        };
+      }
       throw new ScraperError(
         `Failed to read file ${filePath}: ${error.message ?? "Unknown error"}`,
         false,
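Note: the FileFetcher now derives a pseudo-ETag from the file's modification time, so a refresh can skip local files that have not been touched since the last index. A standalone sketch of the same scheme; the helper name is illustrative.

```ts
import crypto from "node:crypto";
import { stat } from "node:fs/promises";

// Illustrative helper: the ETag is an MD5 hash of the mtime ISO string, so it changes
// whenever the file is touched, rather than being derived from the file's content.
async function fileEtag(filePath: string): Promise<string> {
  const stats = await stat(filePath);
  return crypto.createHash("md5").update(stats.mtime.toISOString()).digest("hex");
}
```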
@@ -1982,6 +2015,12 @@ class HttpFetcher {
         ...options?.headers
         // User-provided headers override generated ones
       };
+      if (options?.etag) {
+        headers["If-None-Match"] = options.etag;
+        logger.debug(
+          `Conditional request for ${source} with If-None-Match: ${options.etag}`
+        );
+      }
       const config = {
         responseType: "arraybuffer",
         headers: {
@@ -1995,9 +2034,22 @@ class HttpFetcher {
         // Pass signal to axios
         // Axios follows redirects by default, we need to explicitly disable it if needed
         maxRedirects: followRedirects ? 5 : 0,
-        decompress: true
+        decompress: true,
+        // Allow 304 responses to be handled as successful responses
+        validateStatus: (status) => {
+          return status >= 200 && status < 300 || status === 304;
+        }
       };
       const response = await axios.get(source, config);
+      if (response.status === 304) {
+        logger.debug(`HTTP 304 Not Modified for ${source}`);
+        return {
+          content: Buffer.from(""),
+          mimeType: "text/plain",
+          source,
+          status: FetchStatus.NOT_MODIFIED
+        };
+      }
       const contentTypeHeader = response.headers["content-type"];
       const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader);
       const contentEncoding = response.headers["content-encoding"];
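Note: the HttpFetcher now sends `If-None-Match` when a stored ETag is available and accepts 304 as a non-error via `validateStatus`. A minimal axios sketch of the same conditional-GET flow; the function and return shape are placeholders, not this package's API.

```ts
import axios from "axios";

// Illustrative conditional GET: a 304 response means the cached copy is still valid.
async function conditionalGet(url: string, etag?: string) {
  const response = await axios.get(url, {
    responseType: "arraybuffer",
    headers: etag ? { "If-None-Match": etag } : {},
    validateStatus: (status) => (status >= 200 && status < 300) || status === 304,
  });
  if (response.status === 304) {
    return { notModified: true as const };
  }
  return {
    notModified: false as const,
    body: Buffer.from(response.data),
    etag: response.headers.etag as string | undefined,
  };
}
```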
@@ -2017,12 +2069,21 @@ class HttpFetcher {
         response.request?.responseUrl || // Fallback to axios recorded config URL
         response.config?.url || source
       );
+      const etag = response.headers.etag || response.headers.ETag;
+      if (etag) {
+        logger.debug(`Received ETag for ${source}: ${etag}`);
+      }
+      const lastModified = response.headers["last-modified"];
+      const lastModifiedISO = lastModified ? new Date(lastModified).toISOString() : void 0;
       return {
         content,
         mimeType,
         charset,
         encoding: contentEncoding,
-        source: finalUrl
+        source: finalUrl,
+        etag,
+        lastModified: lastModifiedISO,
+        status: FetchStatus.SUCCESS
       };
     } catch (error) {
       const axiosError = error;
@@ -2031,6 +2092,15 @@ class HttpFetcher {
       if (options?.signal?.aborted || code === "ERR_CANCELED") {
         throw new CancellationError("HTTP fetch cancelled");
       }
+      if (status === 404) {
+        logger.debug(`Resource not found (404): ${source}`);
+        return {
+          content: Buffer.from(""),
+          mimeType: "text/plain",
+          source,
+          status: FetchStatus.NOT_FOUND
+        };
+      }
       if (!followRedirects && status && status >= 300 && status < 400) {
         const location = axiosError.response?.headers?.location;
         if (location) {
@@ -2125,101 +2195,522 @@ class AutoDetectFetcher {
     ]);
   }
 }
-
-
-
-
-
-
-
+const DEFAULT_FILE_EXCLUSIONS = [
+  // CHANGELOG files (case variations)
+  "**/CHANGELOG.md",
+  "**/changelog.md",
+  "**/CHANGELOG.mdx",
+  "**/changelog.mdx",
+  // LICENSE files (case variations)
+  "**/LICENSE",
+  "**/LICENSE.md",
+  "**/license.md",
+  // CODE_OF_CONDUCT files (case variations)
+  "**/CODE_OF_CONDUCT.md",
+  "**/code_of_conduct.md",
+  // Test files
+  "**/*.test.*",
+  "**/*.spec.*",
+  "**/*_test.py",
+  "**/*_test.go",
+  // Package manager lock files
+  "**/*.lock",
+  "**/package-lock.json",
+  "**/yarn.lock",
+  "**/pnpm-lock.yaml",
+  "**/go.sum",
+  // Build artifacts
+  "**/*.min.js",
+  "**/*.min.css",
+  "**/*.map",
+  "**/*.d.ts",
+  // IDE/System files
+  "**/.DS_Store",
+  "**/Thumbs.db",
+  "**/*.swp",
+  "**/*.swo",
+  // Internal config files (using regex pattern)
+  "/.*\\.(ini|cfg|conf|log|pid)$/"
+];
+const DEFAULT_FOLDER_EXCLUSIONS = [
+  // Archive and deprecated content (matches anywhere in path)
+  "**/archive/**",
+  "**/archived/**",
+  "**/deprecated/**",
+  "**/legacy/**",
+  "**/old/**",
+  "**/outdated/**",
+  "**/previous/**",
+  "**/superseded/**",
+  // Specific paths that don't follow the general pattern
+  "docs/old/**",
+  // Test directories
+  "**/test/**",
+  "**/tests/**",
+  "**/__tests__/**",
+  "**/spec/**",
+  // Build output directories
+  "**/dist/**",
+  "**/build/**",
+  "**/out/**",
+  "**/target/**",
+  "**/.next/**",
+  "**/.nuxt/**",
+  // IDE directories
+  "**/.vscode/**",
+  "**/.idea/**",
+  // Internationalization folders - non-English locales
+  "**/i18n/ar*/**",
+  "**/i18n/de*/**",
+  "**/i18n/es*/**",
+  "**/i18n/fr*/**",
+  "**/i18n/hi*/**",
+  "**/i18n/it*/**",
+  "**/i18n/ja*/**",
+  "**/i18n/ko*/**",
+  "**/i18n/nl*/**",
+  "**/i18n/pl*/**",
+  "**/i18n/pt*/**",
+  "**/i18n/ru*/**",
+  "**/i18n/sv*/**",
+  "**/i18n/th*/**",
+  "**/i18n/tr*/**",
+  "**/i18n/vi*/**",
+  "**/i18n/zh*/**",
+  // Common locale folder patterns
+  "**/zh-cn/**",
+  "**/zh-hk/**",
+  "**/zh-mo/**",
+  "**/zh-sg/**",
+  "**/zh-tw/**"
+];
+const DEFAULT_EXCLUSION_PATTERNS = [
+  ...DEFAULT_FILE_EXCLUSIONS,
+  ...DEFAULT_FOLDER_EXCLUSIONS
+];
+function getEffectiveExclusionPatterns(userPatterns) {
+  if (userPatterns !== void 0) {
+    return userPatterns;
   }
+  return DEFAULT_EXCLUSION_PATTERNS;
 }
-
+function isRegexPattern(pattern) {
+  return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
 }
-
-
-
-  preferredChunkSize;
-  /**
-   * Combines a base document splitter with size constraints to produce optimally-sized chunks.
-   * The base splitter handles the initial semantic splitting, while this class handles
-   * the concatenation strategy.
-   */
-  constructor(baseSplitter, minChunkSize, preferredChunkSize) {
-    this.baseSplitter = baseSplitter;
-    this.minChunkSize = minChunkSize;
-    this.preferredChunkSize = preferredChunkSize;
+function patternToRegExp(pattern) {
+  if (isRegexPattern(pattern)) {
+    return new RegExp(pattern.slice(1, -1));
   }
-
-
-
-
-
-
-
-
-
-
-    for (const nextChunk of initialChunks) {
-      if (currentChunk) {
-        if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
-          concatenatedChunks.push(currentChunk);
-          currentChunk = this.cloneChunk(nextChunk);
-          continue;
-        }
-        if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
-          concatenatedChunks.push(currentChunk);
-          currentChunk = this.cloneChunk(nextChunk);
-          continue;
-        }
-        currentChunk.content += `${currentChunk.content.endsWith("\n") ? "" : "\n"}${nextChunk.content}`;
-        currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
-        currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
-      } else {
-        currentChunk = this.cloneChunk(nextChunk);
-      }
-    }
-    if (currentChunk) {
-      concatenatedChunks.push(currentChunk);
+  const re = minimatch.makeRe(pattern, { dot: true });
+  if (!re) throw new Error(`Invalid glob pattern: ${pattern}`);
+  return re;
+}
+function matchesAnyPattern(path2, patterns) {
+  if (!patterns || patterns.length === 0) return false;
+  const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
+  return patterns.some((pattern) => {
+    if (isRegexPattern(pattern)) {
+      return patternToRegExp(pattern).test(normalizedPath);
     }
-
-
-
-
-
-
-
-
-
-
-
-  }
-  /**
-   * H1 and H2 headings represent major conceptual breaks in the document.
-   * Preserving these splits helps maintain the document's logical structure.
-   */
-  startsNewMajorSection(chunk) {
-    return chunk.section.level === 1 || chunk.section.level === 2;
+    const pathForMatch = normalizedPath.replace(/^\//, "");
+    const patternForMatch = pattern.startsWith("/") ? pattern.slice(1) : pattern;
+    return minimatch(pathForMatch, patternForMatch, { dot: true });
+  });
+}
+function extractPathAndQuery(url) {
+  try {
+    const u = new URL(url);
+    return u.pathname + (u.search || "");
+  } catch {
+    return url;
   }
-
-
-
-
-
-
-
+}
+function shouldIncludeUrl(url, includePatterns, excludePatterns) {
+  const path2 = extractPathAndQuery(url);
+  const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
+  let basename;
+  if (url.startsWith("file://")) {
+    try {
+      const u = new URL(url);
+      basename = u.pathname ? u.pathname.split("/").pop() : void 0;
+    } catch {
     }
-    return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
   }
-
-
-
-
-
-
+  const stripSlash = (patterns) => patterns?.map((p) => p.startsWith("/") ? p.slice(1) : p);
+  const effectiveExcludePatterns = getEffectiveExclusionPatterns(excludePatterns);
+  if (matchesAnyPattern(url, effectiveExcludePatterns) || matchesAnyPattern(normalizedPath, effectiveExcludePatterns) || basename && matchesAnyPattern(basename, stripSlash(effectiveExcludePatterns)))
+    return false;
+  if (!includePatterns || includePatterns.length === 0) return true;
+  return matchesAnyPattern(url, includePatterns) || matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
+}
+function computeBaseDirectory(pathname) {
+  if (pathname === "") return "/";
+  if (pathname.endsWith("/")) return pathname;
+  const lastSegment = pathname.split("/").at(-1) || "";
+  const looksLikeFile = lastSegment.includes(".");
+  if (looksLikeFile) {
+    return pathname.replace(/\/[^/]*$/, "/");
   }
-
-
+  return `${pathname}/`;
+}
+function isInScope(baseUrl, targetUrl, scope) {
+  if (baseUrl.protocol !== targetUrl.protocol) return false;
+  switch (scope) {
+    case "subpages": {
+      if (baseUrl.hostname !== targetUrl.hostname) return false;
+      const baseDir = computeBaseDirectory(baseUrl.pathname);
+      return targetUrl.pathname.startsWith(baseDir);
+    }
+    case "hostname":
+      return baseUrl.hostname === targetUrl.hostname;
+    case "domain": {
+      return extractPrimaryDomain(baseUrl.hostname) === extractPrimaryDomain(targetUrl.hostname);
+    }
+    default:
+      return false;
+  }
+}
+const DEFAULT_MAX_DEPTH = 3;
+const DEFAULT_CONCURRENCY = 3;
+class BaseScraperStrategy {
+  /**
+   * Set of normalized URLs that have been marked for processing.
+   *
+   * IMPORTANT: URLs are added to this set BEFORE they are actually processed, not after.
+   * This prevents the same URL from being queued multiple times when discovered from different sources.
+   *
+   * Usage flow:
+   * 1. Initial queue setup: Root URL and initialQueue items are added to visited
+   * 2. During processing: When a page returns links, each link is checked against visited
+   * 3. In processBatch deduplication: Only links NOT in visited are added to the queue AND to visited
+   *
+   * This approach ensures:
+   * - No URL is processed more than once
+   * - No URL appears in the queue multiple times
+   * - Efficient deduplication across concurrent processing
+   */
+  visited = /* @__PURE__ */ new Set();
+  pageCount = 0;
+  totalDiscovered = 0;
+  // Track total URLs discovered (unlimited)
+  effectiveTotal = 0;
+  // Track effective total (limited by maxPages)
+  canonicalBaseUrl;
+  options;
+  constructor(options = {}) {
+    this.options = options;
+  }
+  /**
+   * Determines if a URL should be processed based on scope and include/exclude patterns in ScraperOptions.
+   * Scope is checked first, then patterns.
+   */
+  shouldProcessUrl(url, options) {
+    if (options.scope) {
+      try {
+        const base = this.canonicalBaseUrl ?? new URL$1(options.url);
+        const target = new URL$1(url);
+        if (!isInScope(base, target, options.scope)) return false;
+      } catch {
+        return false;
+      }
+    }
+    return shouldIncludeUrl(url, options.includePatterns, options.excludePatterns);
+  }
+  async processBatch(batch, baseUrl, options, progressCallback, signal) {
+    const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
+    const results = await Promise.all(
+      batch.map(async (item) => {
+        if (signal?.aborted) {
+          throw new CancellationError("Scraping cancelled during batch processing");
+        }
+        const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
+        if (item.depth > maxDepth) {
+          return [];
+        }
+        try {
+          const result = await this.processItem(item, options, signal);
+          const shouldCount = item.pageId !== void 0 || result.content !== void 0;
+          let currentPageCount = this.pageCount;
+          if (shouldCount) {
+            currentPageCount = ++this.pageCount;
+            logger.info(
+              `🌐 Scraping page ${currentPageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`
+            );
+          }
+          if (result.status === FetchStatus.NOT_MODIFIED) {
+            logger.debug(`Page unchanged (304): ${item.url}`);
+            if (shouldCount) {
+              await progressCallback({
+                pagesScraped: currentPageCount,
+                totalPages: this.effectiveTotal,
+                totalDiscovered: this.totalDiscovered,
+                currentUrl: item.url,
+                depth: item.depth,
+                maxDepth,
+                result: null,
+                pageId: item.pageId
+              });
+            }
+            return [];
+          }
+          if (result.status === FetchStatus.NOT_FOUND) {
+            logger.debug(`Page deleted (404): ${item.url}`);
+            if (shouldCount) {
+              await progressCallback({
+                pagesScraped: currentPageCount,
+                totalPages: this.effectiveTotal,
+                totalDiscovered: this.totalDiscovered,
+                currentUrl: item.url,
+                depth: item.depth,
+                maxDepth,
+                result: null,
+                pageId: item.pageId,
+                deleted: true
+              });
+            }
+            return [];
+          }
+          if (result.status !== FetchStatus.SUCCESS) {
+            logger.error(`Unknown fetch status: ${result.status}`);
+            return [];
+          }
+          const finalUrl = result.url || item.url;
+          if (result.content) {
+            await progressCallback({
+              pagesScraped: currentPageCount,
+              totalPages: this.effectiveTotal,
+              totalDiscovered: this.totalDiscovered,
+              currentUrl: finalUrl,
+              depth: item.depth,
+              maxDepth,
+              result: {
+                url: finalUrl,
+                title: result.content.title?.trim() || result.title?.trim() || "",
+                contentType: result.contentType || "",
+                textContent: result.content.textContent || "",
+                links: result.content.links || [],
+                errors: result.content.errors || [],
+                chunks: result.content.chunks || [],
+                etag: result.etag || null,
+                lastModified: result.lastModified || null
+              },
+              pageId: item.pageId
+            });
+          }
+          const nextItems = result.links || [];
+          const linkBaseUrl = finalUrl ? new URL$1(finalUrl) : baseUrl;
+          return nextItems.map((value) => {
+            try {
+              const targetUrl = new URL$1(value, linkBaseUrl);
+              if (!this.shouldProcessUrl(targetUrl.href, options)) {
+                return null;
+              }
+              return {
+                url: targetUrl.href,
+                depth: item.depth + 1
+              };
+            } catch (_error) {
+              logger.warn(`❌ Invalid URL: ${value}`);
+            }
+            return null;
+          }).filter((item2) => item2 !== null);
+        } catch (error) {
+          if (options.ignoreErrors) {
+            logger.error(`❌ Failed to process ${item.url}: ${error}`);
+            return [];
+          }
+          throw error;
+        }
+      })
+    );
+    const allLinks = results.flat();
+    const uniqueLinks = [];
+    for (const item of allLinks) {
+      const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
+      if (!this.visited.has(normalizedUrl)) {
+        this.visited.add(normalizedUrl);
+        uniqueLinks.push(item);
+        this.totalDiscovered++;
+        if (this.effectiveTotal < maxPages) {
+          this.effectiveTotal++;
+        }
+      }
+    }
+    return uniqueLinks;
+  }
+  async scrape(options, progressCallback, signal) {
+    this.visited.clear();
+    this.pageCount = 0;
+    const initialQueue = options.initialQueue || [];
+    const isRefreshMode = initialQueue.length > 0;
+    this.canonicalBaseUrl = new URL$1(options.url);
+    let baseUrl = this.canonicalBaseUrl;
+    const queue = [];
+    const normalizedRootUrl = normalizeUrl(
+      options.url,
+      this.options.urlNormalizerOptions
+    );
+    if (isRefreshMode) {
+      logger.debug(
+        `Starting refresh mode with ${initialQueue.length} pre-populated pages`
+      );
+      for (const item of initialQueue) {
+        const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
+        if (!this.visited.has(normalizedUrl)) {
+          this.visited.add(normalizedUrl);
+          queue.push(item);
+        }
+      }
+    }
+    if (!this.visited.has(normalizedRootUrl)) {
+      this.visited.add(normalizedRootUrl);
+      queue.unshift({ url: options.url, depth: 0 });
+    }
+    this.totalDiscovered = queue.length;
+    this.effectiveTotal = queue.length;
+    const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
+    const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
+    while (queue.length > 0 && this.pageCount < maxPages) {
+      if (signal?.aborted) {
+        logger.debug(`${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal.`);
+        throw new CancellationError(
+          `${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal`
+        );
+      }
+      const remainingPages = maxPages - this.pageCount;
+      if (remainingPages <= 0) {
+        break;
+      }
+      const batchSize = Math.min(maxConcurrency, remainingPages, queue.length);
+      const batch = queue.splice(0, batchSize);
+      baseUrl = this.canonicalBaseUrl ?? baseUrl;
+      const newUrls = await this.processBatch(
+        batch,
+        baseUrl,
+        options,
+        progressCallback,
+        signal
+      );
+      queue.push(...newUrls);
+    }
+  }
+  /**
+   * Cleanup resources used by this strategy.
+   * Default implementation does nothing - override in derived classes as needed.
+   */
+  async cleanup() {
+  }
+}
+class SplitterError extends Error {
+}
+class MinimumChunkSizeError extends SplitterError {
+  constructor(size, maxSize) {
+    super(
+      `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
+    );
+  }
+}
+class ContentSplitterError extends SplitterError {
+}
+class GreedySplitter {
+  baseSplitter;
+  minChunkSize;
+  preferredChunkSize;
+  maxChunkSize;
+  /**
+   * Combines a base document splitter with size constraints to produce optimally-sized chunks.
+   * The base splitter handles the initial semantic splitting, while this class handles
+   * the concatenation strategy.
+   */
+  constructor(baseSplitter, minChunkSize, preferredChunkSize, maxChunkSize) {
+    this.baseSplitter = baseSplitter;
+    this.minChunkSize = minChunkSize;
+    this.preferredChunkSize = preferredChunkSize;
+    this.maxChunkSize = maxChunkSize;
+  }
+  /**
+   * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
+   * are combined until they reach the minimum size, but splits are preserved at major
+   * section boundaries to maintain document structure. This balances the need for
+   * context with semantic coherence.
+   */
+  async splitText(markdown, contentType) {
+    const initialChunks = await this.baseSplitter.splitText(markdown, contentType);
+    const concatenatedChunks = [];
+    let currentChunk = null;
+    for (const nextChunk of initialChunks) {
+      if (nextChunk.content.length > this.maxChunkSize) {
+        logger.warn(
+          `⚠ Chunk from base splitter exceeds max size: ${nextChunk.content.length} > ${this.maxChunkSize}`
+        );
+      }
+      if (currentChunk) {
+        const combinedSize = currentChunk.content.length + nextChunk.content.length;
+        if (combinedSize > this.maxChunkSize) {
+          concatenatedChunks.push(currentChunk);
+          currentChunk = this.cloneChunk(nextChunk);
+          continue;
+        }
+        if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk) && !this.isSameSection(currentChunk, nextChunk)) {
+          concatenatedChunks.push(currentChunk);
+          currentChunk = this.cloneChunk(nextChunk);
+          continue;
+        }
+        if (combinedSize > this.preferredChunkSize && currentChunk.content.length >= this.minChunkSize && nextChunk.content.length >= this.minChunkSize) {
+          concatenatedChunks.push(currentChunk);
+          currentChunk = this.cloneChunk(nextChunk);
+          continue;
+        }
+        currentChunk.content += `${currentChunk.content.endsWith("\n") ? "" : "\n"}${nextChunk.content}`;
+        currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
+        currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
+      } else {
+        currentChunk = this.cloneChunk(nextChunk);
+      }
+    }
+    if (currentChunk) {
+      concatenatedChunks.push(currentChunk);
+    }
+    return concatenatedChunks;
+  }
+  cloneChunk(chunk) {
+    return {
+      types: [...chunk.types],
+      content: chunk.content,
+      section: {
+        level: chunk.section.level,
+        path: [...chunk.section.path]
+      }
+    };
+  }
+  /**
+   * H1 and H2 headings represent major conceptual breaks in the document.
+   * Preserving these splits helps maintain the document's logical structure.
+   */
+  startsNewMajorSection(chunk) {
+    return chunk.section.level === 1 || chunk.section.level === 2;
+  }
+  /**
+   * Checks if two chunks belong to the same section by comparing their paths.
+   * Returns true if the paths are identical or if one is a parent of the other.
+   */
+  isSameSection(chunk1, chunk2) {
+    const path1 = chunk1.section.path;
+    const path2 = chunk2.section.path;
+    if (path1.length === path2.length && path1.every((part, i) => part === path2[i])) {
+      return true;
+    }
+    return this.isPathIncluded(path1, path2) || this.isPathIncluded(path2, path1);
+  }
+  /**
+   * Checks if one path is a prefix of another path, indicating a parent-child relationship
+   */
+  isPathIncluded(parentPath, childPath) {
+    if (parentPath.length >= childPath.length) return false;
+    return parentPath.every((part, i) => part === childPath[i]);
+  }
+  /**
+   * Merges section metadata when concatenating chunks, following these rules:
    * 1. Level: Always uses the lowest (most general) level between chunks
    * 2. Path selection:
    *    - For parent-child relationships (one path includes the other), uses the child's path
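Note: the new exclusion helpers accept both glob patterns and `/regex/` strings and match them against the URL path with `minimatch`. A small TypeScript sketch of that matching behavior; the function name and sample paths are illustrative only.

```ts
import { minimatch } from "minimatch";

// Illustrative check mirroring matchesAnyPattern: globs go through minimatch,
// while "/.../" strings are treated as regular expressions.
function isExcluded(path: string, patterns: string[]): boolean {
  const normalized = path.startsWith("/") ? path : `/${path}`;
  return patterns.some((pattern) => {
    if (pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/")) {
      return new RegExp(pattern.slice(1, -1)).test(normalized);
    }
    return minimatch(normalized.replace(/^\//, ""), pattern.replace(/^\//, ""), { dot: true });
  });
}

// Example: isExcluded("docs/archive/page.html", ["**/archive/**"]) would return true.
```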
@@ -4195,7 +4686,7 @@ class HtmlMetadataExtractorMiddleware {
       }
       title = title || "Untitled";
       title = title.replace(/\s+/g, " ").trim();
-      context.
+      context.title = title;
       logger.debug(`Extracted title: "${title}" from ${context.source}`);
     } catch (error) {
       logger.error(`❌ Error extracting metadata from ${context.source}: ${error}`);
@@ -4653,7 +5144,7 @@ ${frame.content}
    * @param next The next middleware function in the pipeline.
    */
   async process(context, next) {
-    const contentType = context.options?.headers?.["content-type"] || context.
+    const contentType = context.options?.headers?.["content-type"] || context.contentType;
     if (contentType && typeof contentType === "string" && !MimeTypeUtils.isHtml(contentType)) {
       logger.debug(
         `Skipping Playwright rendering for ${context.source} - content type '${contentType}' is not HTML`
@@ -5014,6 +5505,7 @@ ${text.replace(/^\n+|\n+$/g, "")}
         context.content = markdown;
         logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
       }
+      context.contentType = "text/markdown";
     } catch (error) {
       logger.error(
         `❌ Error converting HTML to Markdown for ${context.source}: ${error}`
@@ -5053,7 +5545,7 @@ class MarkdownMetadataExtractorMiddleware {
       if (match?.[1]) {
         title = match[1].trim();
       }
-      context.
+      context.title = title;
     } catch (error) {
       context.errors.push(
         new Error(
@@ -5225,10 +5717,10 @@ function convertToString(content, charset) {
 }
 class BasePipeline {
   /**
-   * Determines if this pipeline can process the given
+   * Determines if this pipeline can process content with the given MIME type.
    * Must be implemented by derived classes.
    */
-  canProcess(
+  canProcess(_mimeType, _content) {
     throw new Error("Method not implemented.");
   }
   /**
@@ -5289,11 +5781,12 @@ class HtmlPipeline extends BasePipeline {
     this.greedySplitter = new GreedySplitter(
       semanticSplitter,
       SPLITTER_MIN_CHUNK_SIZE,
-      preferredChunkSize
+      preferredChunkSize,
+      maxChunkSize
     );
   }
-  canProcess(
-    return MimeTypeUtils.isHtml(
+  canProcess(mimeType) {
+    return MimeTypeUtils.isHtml(mimeType);
   }
   async process(rawContent, options, fetcher) {
     const resolvedCharset = resolveCharset(
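Note: the pipelines now pass an explicit `maxChunkSize` to `GreedySplitter`, which adds a hard cap on top of the existing minimum and preferred sizes. A simplified, runnable sketch of the merge decision that ordering implies; it is an approximation of the new `splitText` logic, with section handling deliberately omitted.

```ts
// Illustrative, simplified version of the merge decision in the new GreedySplitter.
// "current" and "next" are chunk text lengths in characters.
function shouldFlushCurrentChunk(
  current: number,
  next: number,
  minChunkSize: number,
  preferredChunkSize: number,
  maxChunkSize: number,
): boolean {
  const combined = current + next;
  if (combined > maxChunkSize) return true; // hard cap: never merge past the maximum
  if (combined > preferredChunkSize && current >= minChunkSize && next >= minChunkSize) {
    return true; // both chunks are already large enough on their own
  }
  return false; // otherwise keep greedily merging small chunks
}
```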
@@ -5304,8 +5797,9 @@ class HtmlPipeline extends BasePipeline {
     const contentString = convertToString(rawContent.content, resolvedCharset);
     const context = {
       content: contentString,
+      contentType: rawContent.mimeType || "text/html",
       source: rawContent.source,
-      metadata: {},
+      // metadata: {},
       links: [],
       errors: [],
       options,
@@ -5320,8 +5814,9 @@ class HtmlPipeline extends BasePipeline {
       typeof context.content === "string" ? context.content : ""
     );
     return {
-
-
+      title: context.title,
+      contentType: context.contentType,
+      textContent: context.content,
       links: context.links,
       errors: context.errors,
       chunks
@@ -5345,9 +5840,9 @@ class JsonPipeline extends BasePipeline {
       preserveFormatting: true
     });
   }
-  canProcess(
-    if (!
-    return MimeTypeUtils.isJson(
+  canProcess(mimeType) {
+    if (!mimeType) return false;
+    return MimeTypeUtils.isJson(mimeType);
   }
   async process(rawContent, options, fetcher) {
     const contentString = convertToString(rawContent.content, rawContent.charset);
@@ -5362,22 +5857,25 @@ class JsonPipeline extends BasePipeline {
       const fallbackChunks = await this.splitter.splitText(contentString);
       return {
         textContent: contentString,
-        metadata: {
-
-        },
+        // metadata: {
+        //   isValidJson: false,
+        // },
         links: [],
         errors: [],
         chunks: fallbackChunks
       };
     }
+    const metadata = this.extractMetadata(parsedJson);
     const context = {
       content: contentString,
       source: rawContent.source,
-
-
-
-
-
+      title: metadata.title,
+      contentType: rawContent.mimeType || "application/json",
+      // metadata: {
+      //   ...this.extractMetadata(parsedJson),
+      //   isValidJson,
+      //   jsonStructure: this.analyzeJsonStructure(parsedJson),
+      // },
       links: [],
       // JSON files typically don't contain links
       errors: [],
@@ -5387,8 +5885,9 @@ class JsonPipeline extends BasePipeline {
     await this.executeMiddlewareStack(this.middleware, context);
     const chunks = await this.splitter.splitText(context.content);
     return {
+      title: context.title,
+      contentType: context.contentType,
       textContent: context.content,
-      metadata: context.metadata,
       links: context.links,
       errors: context.errors,
       chunks
@@ -5418,30 +5917,6 @@ class JsonPipeline extends BasePipeline {
     }
     return metadata;
   }
-  /**
-   * Analyzes the structure of valid JSON for metadata
-   */
-  analyzeJsonStructure(parsedJson) {
-    if (Array.isArray(parsedJson)) {
-      return {
-        type: "array",
-        depth: this.calculateDepth(parsedJson),
-        itemCount: parsedJson.length
-      };
-    } else if (typeof parsedJson === "object" && parsedJson !== null) {
-      const obj = parsedJson;
-      return {
-        type: "object",
-        depth: this.calculateDepth(parsedJson),
-        propertyCount: Object.keys(obj).length
-      };
-    } else {
-      return {
-        type: typeof parsedJson,
-        depth: 1
-      };
-    }
-  }
   /**
    * Calculates the maximum nesting depth of a JSON structure
    */
@@ -5482,19 +5957,20 @@ class MarkdownPipeline extends BasePipeline {
     this.greedySplitter = new GreedySplitter(
       semanticSplitter,
       SPLITTER_MIN_CHUNK_SIZE,
-      preferredChunkSize
+      preferredChunkSize,
+      maxChunkSize
     );
   }
-  canProcess(
-    if (!
-    return MimeTypeUtils.isMarkdown(
+  canProcess(mimeType) {
+    if (!mimeType) return false;
+    return MimeTypeUtils.isMarkdown(mimeType);
   }
   async process(rawContent, options, fetcher) {
     const contentString = convertToString(rawContent.content, rawContent.charset);
     const context = {
+      contentType: rawContent.mimeType || "text/markdown",
       content: contentString,
       source: rawContent.source,
-      metadata: {},
       links: [],
       errors: [],
       options,
@@ -5506,8 +5982,9 @@ class MarkdownPipeline extends BasePipeline {
       rawContent.mimeType
     );
     return {
+      title: context.title,
+      contentType: context.contentType,
       textContent: typeof context.content === "string" ? context.content : "",
-      metadata: context.metadata,
       links: context.links,
       errors: context.errors,
       chunks
@@ -5517,24 +5994,27 @@ class SourceCodePipeline extends BasePipeline {
 class SourceCodePipeline extends BasePipeline {
   middleware;
   splitter;
-  constructor(
+  constructor(_preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE) {
     super();
     this.middleware = [];
-    this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize
+    this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize });
   }
-  canProcess(
-    if (!
-    return MimeTypeUtils.isSourceCode(
+  canProcess(mimeType) {
+    if (!mimeType) return false;
+    return MimeTypeUtils.isSourceCode(mimeType);
   }
   async process(rawContent, options, fetcher) {
     const contentString = convertToString(rawContent.content, rawContent.charset);
     const context = {
+      contentType: rawContent.mimeType || "text/plain",
       content: contentString,
       source: rawContent.source,
-      metadata: {
-
-
-
+      // metadata: {
+      //   language: rawContent.mimeType
+      //     ? MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType)
+      //     : "text",
+      //   isSourceCode: true,
+      // },
       links: [],
       // Source code files typically don't contain web links
       errors: [],
@@ -5544,8 +6024,10 @@ class SourceCodePipeline extends BasePipeline {
     await this.executeMiddlewareStack(this.middleware, context);
     const chunks = await this.splitter.splitText(context.content, rawContent.mimeType);
     return {
+      title: context.title,
+      contentType: context.contentType,
       textContent: context.content,
-      metadata: context.metadata,
+      // metadata: context.metadata,
       links: context.links,
       errors: context.errors,
       chunks
@@ -5594,17 +6076,22 @@ class TextDocumentSplitter {
 class TextPipeline extends BasePipeline {
   middleware;
   splitter;
-  constructor(
+  constructor(preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE) {
     super();
     this.middleware = [];
-    const textSplitter = new TextDocumentSplitter({ maxChunkSize
-    this.splitter = new GreedySplitter(
+    const textSplitter = new TextDocumentSplitter({ maxChunkSize });
+    this.splitter = new GreedySplitter(
+      textSplitter,
+      SPLITTER_MIN_CHUNK_SIZE,
+      preferredChunkSize,
+      maxChunkSize
+    );
   }
-  canProcess(
-    if (!MimeTypeUtils.isSafeForTextProcessing(
+  canProcess(mimeType, content) {
+    if (!MimeTypeUtils.isSafeForTextProcessing(mimeType)) {
       return false;
     }
-    if (MimeTypeUtils.isBinary(
+    if (content && MimeTypeUtils.isBinary(content)) {
       return false;
     }
     return true;
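Note: every pipeline now implements `canProcess(mimeType, content?)`, with `TextPipeline` additionally rejecting binary payloads. A hedged sketch of how a caller could pick a pipeline from the factory's ordered list; the selection loop itself is an assumption (the diff is truncated before showing how pipelines are chosen), and the first-match-wins order relies on `TextPipeline` being last as the fallback.

```ts
// Hypothetical selection loop: the first pipeline whose canProcess() accepts the
// MIME type (and, for TextPipeline, the raw content) handles the document.
function selectPipeline(
  pipelines: BasePipeline[],
  mimeType: string,
  content?: Buffer,
): BasePipeline | null {
  return pipelines.find((pipeline) => pipeline.canProcess(mimeType, content)) ?? null;
}
```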
@@ -5612,12 +6099,11 @@ class TextPipeline extends BasePipeline {
   async process(rawContent, options, fetcher) {
     const contentString = convertToString(rawContent.content, rawContent.charset);
     const context = {
+      title: "",
+      // Title extraction can be added in middleware if needed
+      contentType: rawContent.mimeType || "text/plain",
       content: contentString,
       source: rawContent.source,
-      metadata: {
-        contentType: rawContent.mimeType || "text/plain",
-        isGenericText: true
-      },
       links: [],
       // Generic text content typically doesn't contain structured links
       errors: [],
@@ -5627,394 +6113,283 @@ class TextPipeline extends BasePipeline {
     await this.executeMiddlewareStack(this.middleware, context);
     const chunks = await this.splitter.splitText(context.content, rawContent.mimeType);
     return {
+      title: context.title,
+      contentType: context.contentType,
       textContent: context.content,
-      metadata: context.metadata,
       links: context.links,
       errors: context.errors,
-      chunks
-    };
-  }
-}
-let PipelineFactory$1 = class PipelineFactory {
-  /**
-   * Creates the standard set of content pipelines used by all scraper strategies.
-   * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
-   * Each pipeline now handles both preprocessing and content-specific splitting.
-   * TextPipeline is placed last as the universal fallback for unknown content types.
-   *
-   * @param config - Optional configuration for pipeline chunk sizes
-   * @returns Array of content pipelines in processing order
-   */
-  static createStandardPipelines(config) {
-    const preferredChunkSize = config?.chunkSizes?.preferred ?? SPLITTER_PREFERRED_CHUNK_SIZE;
-    const maxChunkSize = config?.chunkSizes?.max ?? 2e3;
-    return [
-      new JsonPipeline(preferredChunkSize),
-      new SourceCodePipeline(preferredChunkSize),
-      new HtmlPipeline(preferredChunkSize, maxChunkSize),
-      new MarkdownPipeline(preferredChunkSize, maxChunkSize),
-      new TextPipeline(preferredChunkSize)
-      // Universal fallback - must be last
-    ];
-  }
-};
-const DEFAULT_FILE_EXCLUSIONS = [
-  // CHANGELOG files (case variations)
-  "**/CHANGELOG.md",
-  "**/changelog.md",
-  "**/CHANGELOG.mdx",
-  "**/changelog.mdx",
-  // LICENSE files (case variations)
-  "**/LICENSE",
-  "**/LICENSE.md",
-  "**/license.md",
-  // CODE_OF_CONDUCT files (case variations)
-  "**/CODE_OF_CONDUCT.md",
-  "**/code_of_conduct.md",
-  // Test files
-  "**/*.test.*",
-  "**/*.spec.*",
-  "**/*_test.py",
-  "**/*_test.go",
-  // Package manager lock files
-  "**/*.lock",
-  "**/package-lock.json",
-  "**/yarn.lock",
-  "**/pnpm-lock.yaml",
-  "**/go.sum",
-  // Build artifacts
-  "**/*.min.js",
-  "**/*.min.css",
-  "**/*.map",
-  "**/*.d.ts",
-  // IDE/System files
-  "**/.DS_Store",
-  "**/Thumbs.db",
-  "**/*.swp",
-  "**/*.swo",
-  // Internal config files (using regex pattern)
-  "/.*\\.(ini|cfg|conf|log|pid)$/"
-];
-const DEFAULT_FOLDER_EXCLUSIONS = [
-  // Archive and deprecated content (matches anywhere in path)
-  "**/archive/**",
-  "**/archived/**",
-  "**/deprecated/**",
-  "**/legacy/**",
-  "**/old/**",
-  "**/outdated/**",
-  "**/previous/**",
-  "**/superseded/**",
-  // Specific paths that don't follow the general pattern
-  "docs/old/**",
-  // Test directories
-  "**/test/**",
-  "**/tests/**",
-  "**/__tests__/**",
-  "**/spec/**",
-  // Build output directories
-  "**/dist/**",
-  "**/build/**",
-  "**/out/**",
-  "**/target/**",
-  "**/.next/**",
-  "**/.nuxt/**",
-  // IDE directories
-  "**/.vscode/**",
-  "**/.idea/**",
-  // Internationalization folders - non-English locales
-  "**/i18n/ar*/**",
-  "**/i18n/de*/**",
-  "**/i18n/es*/**",
-  "**/i18n/fr*/**",
-  "**/i18n/hi*/**",
-  "**/i18n/it*/**",
-  "**/i18n/ja*/**",
-  "**/i18n/ko*/**",
-  "**/i18n/nl*/**",
-  "**/i18n/pl*/**",
-  "**/i18n/pt*/**",
-  "**/i18n/ru*/**",
-  "**/i18n/sv*/**",
-  "**/i18n/th*/**",
-  "**/i18n/tr*/**",
-  "**/i18n/vi*/**",
-  "**/i18n/zh*/**",
-  // Common locale folder patterns
-  "**/zh-cn/**",
-  "**/zh-hk/**",
-  "**/zh-mo/**",
-  "**/zh-sg/**",
-  "**/zh-tw/**"
-];
-const DEFAULT_EXCLUSION_PATTERNS = [
-  ...DEFAULT_FILE_EXCLUSIONS,
-  ...DEFAULT_FOLDER_EXCLUSIONS
-];
-function getEffectiveExclusionPatterns(userPatterns) {
-  if (userPatterns !== void 0) {
-    return userPatterns;
+      chunks
+    };
   }
-  return DEFAULT_EXCLUSION_PATTERNS;
-}
-function isRegexPattern(pattern) {
-  return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
 }
-
-
-
+let PipelineFactory$1 = class PipelineFactory {
+  /**
+   * Creates the standard set of content pipelines used by all scraper strategies.
+   * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
+   * Each pipeline now handles both preprocessing and content-specific splitting.
+   * TextPipeline is placed last as the universal fallback for unknown content types.
+   *
+   * @param config - Optional configuration for pipeline chunk sizes
+   * @returns Array of content pipelines in processing order
+   */
+  static createStandardPipelines(config) {
+    const preferredChunkSize = config?.chunkSizes?.preferred ?? SPLITTER_PREFERRED_CHUNK_SIZE;
|
|
6137
|
+
const maxChunkSize = config?.chunkSizes?.max ?? 2e3;
|
|
6138
|
+
return [
|
|
6139
|
+
new JsonPipeline(preferredChunkSize),
|
|
6140
|
+
new SourceCodePipeline(preferredChunkSize, maxChunkSize),
|
|
6141
|
+
new HtmlPipeline(preferredChunkSize, maxChunkSize),
|
|
6142
|
+
new MarkdownPipeline(preferredChunkSize, maxChunkSize),
|
|
6143
|
+
new TextPipeline(preferredChunkSize, maxChunkSize)
|
|
6144
|
+
// Universal fallback - must be last
|
|
6145
|
+
];
|
|
5766
6146
|
}
|
|
5767
|
-
|
|
5768
|
-
|
|
5769
|
-
|
|
5770
|
-
|
|
5771
|
-
|
|
5772
|
-
|
|
5773
|
-
const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
|
|
5774
|
-
return patterns.some((pattern) => {
|
|
5775
|
-
if (isRegexPattern(pattern)) {
|
|
5776
|
-
return patternToRegExp(pattern).test(normalizedPath);
|
|
5777
|
-
}
|
|
5778
|
-
return minimatch(normalizedPath.replace(/^\//, ""), pattern, { dot: true });
|
|
5779
|
-
});
|
|
5780
|
-
}
|
|
5781
|
-
function extractPathAndQuery(url) {
|
|
5782
|
-
try {
|
|
5783
|
-
const u = new URL(url);
|
|
5784
|
-
return u.pathname + (u.search || "");
|
|
5785
|
-
} catch {
|
|
5786
|
-
return url;
|
|
6147
|
+
};
|
|
6148
|
+
class GitHubRepoProcessor {
|
|
6149
|
+
httpFetcher = new HttpFetcher();
|
|
6150
|
+
pipelines;
|
|
6151
|
+
constructor() {
|
|
6152
|
+
this.pipelines = PipelineFactory$1.createStandardPipelines();
|
|
5787
6153
|
}
|
|
5788
|
-
|
|
5789
|
-
|
|
5790
|
-
|
|
5791
|
-
|
|
5792
|
-
|
|
5793
|
-
|
|
5794
|
-
|
|
5795
|
-
|
|
5796
|
-
|
|
5797
|
-
|
|
6154
|
+
/**
|
|
6155
|
+
* Parses an HTTPS blob URL to extract repository information.
|
|
6156
|
+
* Format: https://github.com/owner/repo/blob/branch/filepath
|
|
6157
|
+
*/
|
|
6158
|
+
parseHttpsBlobUrl(url) {
|
|
6159
|
+
const parsedUrl = new URL(url);
|
|
6160
|
+
const segments = parsedUrl.pathname.split("/").filter(Boolean);
|
|
6161
|
+
if (segments.length < 5 || segments[2] !== "blob") {
|
|
6162
|
+
throw new Error(
|
|
6163
|
+
`Invalid GitHub blob URL format. Expected: https://github.com/owner/repo/blob/branch/filepath. Got: ${url}`
|
|
6164
|
+
);
|
|
5798
6165
|
}
|
|
6166
|
+
const owner = segments[0];
|
|
6167
|
+
const repo = segments[1];
|
|
6168
|
+
const branch = segments[3];
|
|
6169
|
+
const filePath = segments.slice(4).join("/");
|
|
6170
|
+
return { owner, repo, branch, filePath };
|
|
5799
6171
|
}
|
|
5800
|
-
|
|
5801
|
-
|
|
5802
|
-
|
|
5803
|
-
|
|
5804
|
-
|
|
5805
|
-
|
|
5806
|
-
}
|
|
5807
|
-
|
|
5808
|
-
|
|
5809
|
-
|
|
5810
|
-
|
|
5811
|
-
|
|
5812
|
-
|
|
5813
|
-
|
|
6172
|
+
/**
|
|
6173
|
+
* Fetches the raw content of a file from GitHub.
|
|
6174
|
+
*/
|
|
6175
|
+
async fetchFileContent(repoInfo, filePath, etag, signal) {
|
|
6176
|
+
const { owner, repo, branch } = repoInfo;
|
|
6177
|
+
const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
|
|
6178
|
+
const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag });
|
|
6179
|
+
const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
|
|
6180
|
+
if (detectedMimeType && rawContent.mimeType === "text/plain") {
|
|
6181
|
+
return {
|
|
6182
|
+
...rawContent,
|
|
6183
|
+
mimeType: detectedMimeType
|
|
6184
|
+
};
|
|
6185
|
+
}
|
|
6186
|
+
return rawContent;
|
|
5814
6187
|
}
|
|
5815
|
-
|
|
5816
|
-
|
|
5817
|
-
|
|
5818
|
-
|
|
5819
|
-
|
|
5820
|
-
|
|
5821
|
-
|
|
5822
|
-
|
|
5823
|
-
|
|
6188
|
+
/**
|
|
6189
|
+
* Processes a single GitHub repository file from an HTTPS blob URL.
|
|
6190
|
+
*/
|
|
6191
|
+
async process(item, options, signal) {
|
|
6192
|
+
const repoInfo = this.parseHttpsBlobUrl(item.url);
|
|
6193
|
+
const { owner, repo, branch, filePath } = repoInfo;
|
|
6194
|
+
const rawContent = await this.fetchFileContent(
|
|
6195
|
+
{ owner, repo, branch },
|
|
6196
|
+
filePath,
|
|
6197
|
+
item.etag,
|
|
6198
|
+
signal
|
|
6199
|
+
);
|
|
6200
|
+
if (rawContent.status !== FetchStatus.SUCCESS) {
|
|
6201
|
+
return { url: item.url, links: [], status: rawContent.status };
|
|
5824
6202
|
}
|
|
5825
|
-
|
|
5826
|
-
|
|
5827
|
-
|
|
5828
|
-
|
|
6203
|
+
let processed;
|
|
6204
|
+
for (const pipeline of this.pipelines) {
|
|
6205
|
+
const contentBuffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
|
|
6206
|
+
if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
|
|
6207
|
+
logger.debug(
|
|
6208
|
+
`Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
|
|
6209
|
+
);
|
|
6210
|
+
const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
|
|
6211
|
+
processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
|
|
6212
|
+
break;
|
|
6213
|
+
}
|
|
5829
6214
|
}
|
|
5830
|
-
|
|
5831
|
-
|
|
6215
|
+
if (!processed) {
|
|
6216
|
+
logger.warn(
|
|
6217
|
+
`⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
|
|
6218
|
+
);
|
|
6219
|
+
return { url: item.url, links: [], status: FetchStatus.SUCCESS };
|
|
6220
|
+
}
|
|
6221
|
+
for (const err of processed.errors ?? []) {
|
|
6222
|
+
logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
|
|
6223
|
+
}
|
|
6224
|
+
const githubUrl = `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`;
|
|
6225
|
+
const filename = filePath.split("/").pop() || "Untitled";
|
|
6226
|
+
return {
|
|
6227
|
+
url: githubUrl,
|
|
6228
|
+
title: processed.title?.trim() || filename || "Untitled",
|
|
6229
|
+
etag: rawContent.etag,
|
|
6230
|
+
lastModified: rawContent.lastModified,
|
|
6231
|
+
contentType: rawContent.mimeType,
|
|
6232
|
+
content: processed,
|
|
6233
|
+
links: [],
|
|
6234
|
+
// Always return empty links array for individual files
|
|
6235
|
+
status: FetchStatus.SUCCESS
|
|
6236
|
+
};
|
|
6237
|
+
}
|
|
6238
|
+
/**
|
|
6239
|
+
* Cleanup resources used by this processor.
|
|
6240
|
+
*/
|
|
6241
|
+
async cleanup() {
|
|
6242
|
+
await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
|
|
5832
6243
|
}
|
|
5833
6244
|
}
|
|
5834
|
-
|
|
5835
|
-
|
|
5836
|
-
|
|
5837
|
-
|
|
5838
|
-
|
|
5839
|
-
totalDiscovered = 0;
|
|
5840
|
-
// Track total URLs discovered (unlimited)
|
|
5841
|
-
effectiveTotal = 0;
|
|
5842
|
-
// Track effective total (limited by maxPages)
|
|
5843
|
-
canonicalBaseUrl;
|
|
5844
|
-
options;
|
|
5845
|
-
constructor(options = {}) {
|
|
5846
|
-
this.options = options;
|
|
6245
|
+
class GitHubWikiProcessor {
|
|
6246
|
+
httpFetcher = new HttpFetcher();
|
|
6247
|
+
pipelines;
|
|
6248
|
+
constructor() {
|
|
6249
|
+
this.pipelines = PipelineFactory$1.createStandardPipelines();
|
|
5847
6250
|
}
|
|
5848
6251
|
/**
|
|
5849
|
-
*
|
|
5850
|
-
|
|
6252
|
+
* Parses a GitHub wiki URL to extract repository information.
|
|
6253
|
+
*/
|
|
6254
|
+
parseGitHubWikiUrl(url) {
|
|
6255
|
+
const parsedUrl = new URL(url);
|
|
6256
|
+
const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/);
|
|
6257
|
+
if (!match) {
|
|
6258
|
+
throw new Error(`Invalid GitHub wiki URL: ${url}`);
|
|
6259
|
+
}
|
|
6260
|
+
const [, owner, repo] = match;
|
|
6261
|
+
return { owner, repo };
|
|
6262
|
+
}
|
|
6263
|
+
/**
|
|
6264
|
+
* Determines if a URL should be processed within the wiki scope.
|
|
5851
6265
|
*/
|
|
5852
6266
|
shouldProcessUrl(url, options) {
|
|
5853
|
-
|
|
5854
|
-
|
|
5855
|
-
|
|
5856
|
-
|
|
5857
|
-
|
|
5858
|
-
} catch {
|
|
6267
|
+
try {
|
|
6268
|
+
const parsedUrl = new URL(url);
|
|
6269
|
+
const baseWikiInfo = this.parseGitHubWikiUrl(options.url);
|
|
6270
|
+
const expectedWikiPath = `/${baseWikiInfo.owner}/${baseWikiInfo.repo}/wiki`;
|
|
6271
|
+
if (!parsedUrl.pathname.startsWith(expectedWikiPath)) {
|
|
5859
6272
|
return false;
|
|
5860
6273
|
}
|
|
6274
|
+
const wikiPagePath = parsedUrl.pathname.replace(expectedWikiPath, "").replace(/^\//, "");
|
|
6275
|
+
return shouldIncludeUrl(
|
|
6276
|
+
wikiPagePath || "Home",
|
|
6277
|
+
options.includePatterns,
|
|
6278
|
+
options.excludePatterns
|
|
6279
|
+
);
|
|
6280
|
+
} catch {
|
|
6281
|
+
return false;
|
|
5861
6282
|
}
|
|
5862
|
-
return shouldIncludeUrl(url, options.includePatterns, options.excludePatterns);
|
|
5863
6283
|
}
|
|
5864
|
-
|
|
5865
|
-
|
|
5866
|
-
|
|
5867
|
-
|
|
5868
|
-
|
|
5869
|
-
|
|
5870
|
-
|
|
5871
|
-
|
|
5872
|
-
|
|
5873
|
-
|
|
5874
|
-
|
|
5875
|
-
}
|
|
5876
|
-
|
|
5877
|
-
|
|
5878
|
-
|
|
5879
|
-
|
|
5880
|
-
|
|
5881
|
-
|
|
5882
|
-
|
|
5883
|
-
|
|
5884
|
-
|
|
5885
|
-
|
|
5886
|
-
`Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`
|
|
5887
|
-
);
|
|
5888
|
-
} else {
|
|
5889
|
-
this.canonicalBaseUrl = original;
|
|
5890
|
-
}
|
|
5891
|
-
} catch {
|
|
5892
|
-
this.canonicalBaseUrl = new URL$1(options.url);
|
|
5893
|
-
}
|
|
5894
|
-
}
|
|
5895
|
-
if (result.document) {
|
|
5896
|
-
this.pageCount++;
|
|
5897
|
-
logger.info(
|
|
5898
|
-
`🌐 Scraping page ${this.pageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`
|
|
5899
|
-
);
|
|
5900
|
-
await progressCallback({
|
|
5901
|
-
pagesScraped: this.pageCount,
|
|
5902
|
-
totalPages: this.effectiveTotal,
|
|
5903
|
-
totalDiscovered: this.totalDiscovered,
|
|
5904
|
-
currentUrl: item.url,
|
|
5905
|
-
depth: item.depth,
|
|
5906
|
-
maxDepth,
|
|
5907
|
-
document: result.document
|
|
5908
|
-
});
|
|
5909
|
-
}
|
|
5910
|
-
const nextItems = result.links || [];
|
|
5911
|
-
return nextItems.map((value) => {
|
|
5912
|
-
try {
|
|
5913
|
-
const targetUrl = new URL$1(value, baseUrl);
|
|
5914
|
-
if (!this.shouldProcessUrl(targetUrl.href, options)) {
|
|
5915
|
-
return null;
|
|
5916
|
-
}
|
|
5917
|
-
return {
|
|
5918
|
-
url: targetUrl.href,
|
|
5919
|
-
depth: item.depth + 1
|
|
5920
|
-
};
|
|
5921
|
-
} catch (_error) {
|
|
5922
|
-
logger.warn(`❌ Invalid URL: ${value}`);
|
|
5923
|
-
}
|
|
5924
|
-
return null;
|
|
5925
|
-
}).filter((item2) => item2 !== null);
|
|
5926
|
-
} catch (error) {
|
|
5927
|
-
if (options.ignoreErrors) {
|
|
5928
|
-
logger.error(`❌ Failed to process ${item.url}: ${error}`);
|
|
5929
|
-
return [];
|
|
5930
|
-
}
|
|
5931
|
-
throw error;
|
|
5932
|
-
}
|
|
5933
|
-
})
|
|
5934
|
-
);
|
|
5935
|
-
const allLinks = results.flat();
|
|
5936
|
-
const uniqueLinks = [];
|
|
5937
|
-
for (const item of allLinks) {
|
|
5938
|
-
const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
|
|
5939
|
-
if (!this.visited.has(normalizedUrl)) {
|
|
5940
|
-
this.visited.add(normalizedUrl);
|
|
5941
|
-
uniqueLinks.push(item);
|
|
5942
|
-
this.totalDiscovered++;
|
|
5943
|
-
if (this.effectiveTotal < maxPages) {
|
|
5944
|
-
this.effectiveTotal++;
|
|
6284
|
+
/**
|
|
6285
|
+
* Processes a single GitHub wiki page.
|
|
6286
|
+
*/
|
|
6287
|
+
async process(item, options, signal) {
|
|
6288
|
+
const currentUrl = item.url;
|
|
6289
|
+
try {
|
|
6290
|
+
const rawContent = await this.httpFetcher.fetch(currentUrl, {
|
|
6291
|
+
signal,
|
|
6292
|
+
etag: item.etag
|
|
6293
|
+
});
|
|
6294
|
+
if (rawContent.status !== FetchStatus.SUCCESS) {
|
|
6295
|
+
return { url: currentUrl, links: [], status: rawContent.status };
|
|
6296
|
+
}
|
|
6297
|
+
let processed;
|
|
6298
|
+
for (const pipeline of this.pipelines) {
|
|
6299
|
+
if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
|
|
6300
|
+
logger.debug(
|
|
6301
|
+
`Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`
|
|
6302
|
+
);
|
|
6303
|
+
const wikiOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
|
|
6304
|
+
processed = await pipeline.process(rawContent, wikiOptions, this.httpFetcher);
|
|
6305
|
+
break;
|
|
5945
6306
|
}
|
|
5946
6307
|
}
|
|
5947
|
-
|
|
5948
|
-
|
|
5949
|
-
}
|
|
5950
|
-
|
|
5951
|
-
|
|
5952
|
-
this.pageCount = 0;
|
|
5953
|
-
this.totalDiscovered = 1;
|
|
5954
|
-
this.effectiveTotal = 1;
|
|
5955
|
-
this.canonicalBaseUrl = new URL$1(options.url);
|
|
5956
|
-
let baseUrl = this.canonicalBaseUrl;
|
|
5957
|
-
const queue = [{ url: options.url, depth: 0 }];
|
|
5958
|
-
this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
|
|
5959
|
-
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
|
|
5960
|
-
const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
|
|
5961
|
-
while (queue.length > 0 && this.pageCount < maxPages) {
|
|
5962
|
-
if (signal?.aborted) {
|
|
5963
|
-
logger.debug("Scraping cancelled by signal.");
|
|
5964
|
-
throw new CancellationError("Scraping cancelled by signal");
|
|
6308
|
+
if (!processed) {
|
|
6309
|
+
logger.warn(
|
|
6310
|
+
`⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`
|
|
6311
|
+
);
|
|
6312
|
+
return { url: currentUrl, links: [], status: FetchStatus.SUCCESS };
|
|
5965
6313
|
}
|
|
5966
|
-
const
|
|
5967
|
-
|
|
5968
|
-
break;
|
|
6314
|
+
for (const err of processed.errors ?? []) {
|
|
6315
|
+
logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`);
|
|
5969
6316
|
}
|
|
5970
|
-
const
|
|
5971
|
-
|
|
5972
|
-
|
|
5973
|
-
|
|
5974
|
-
|
|
5975
|
-
)
|
|
5976
|
-
|
|
5977
|
-
|
|
5978
|
-
|
|
5979
|
-
|
|
5980
|
-
|
|
5981
|
-
|
|
5982
|
-
|
|
5983
|
-
|
|
5984
|
-
|
|
5985
|
-
|
|
6317
|
+
const parsedUrl = new URL(currentUrl);
|
|
6318
|
+
const wikiInfo = this.parseGitHubWikiUrl(currentUrl);
|
|
6319
|
+
const wikiPagePath = parsedUrl.pathname.replace(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`, "").replace(/^\//, "");
|
|
6320
|
+
const pageTitle = wikiPagePath || "Home";
|
|
6321
|
+
const links = processed.links || [];
|
|
6322
|
+
const wikiLinks = links.filter((link) => {
|
|
6323
|
+
if (!link || link.trim() === "" || link === "invalid-url" || link === "not-a-url-at-all") {
|
|
6324
|
+
return false;
|
|
6325
|
+
}
|
|
6326
|
+
return true;
|
|
6327
|
+
}).map((link) => {
|
|
6328
|
+
try {
|
|
6329
|
+
return new URL(link, currentUrl).href;
|
|
6330
|
+
} catch {
|
|
6331
|
+
return null;
|
|
6332
|
+
}
|
|
6333
|
+
}).filter((link) => link !== null).filter((link) => {
|
|
6334
|
+
try {
|
|
6335
|
+
const linkUrl = new URL(link);
|
|
6336
|
+
return linkUrl.hostname === parsedUrl.hostname && linkUrl.pathname.startsWith(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`);
|
|
6337
|
+
} catch {
|
|
6338
|
+
return false;
|
|
6339
|
+
}
|
|
6340
|
+
});
|
|
6341
|
+
return {
|
|
6342
|
+
url: currentUrl,
|
|
6343
|
+
title: pageTitle,
|
|
6344
|
+
etag: rawContent.etag,
|
|
6345
|
+
lastModified: rawContent.lastModified,
|
|
6346
|
+
contentType: rawContent.mimeType,
|
|
6347
|
+
content: processed,
|
|
6348
|
+
links: wikiLinks,
|
|
6349
|
+
status: FetchStatus.SUCCESS
|
|
6350
|
+
};
|
|
6351
|
+
} catch (error) {
|
|
6352
|
+
logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`);
|
|
6353
|
+
return { url: currentUrl, links: [], status: FetchStatus.SUCCESS };
|
|
5986
6354
|
}
|
|
5987
6355
|
}
|
|
5988
6356
|
/**
|
|
5989
|
-
* Cleanup resources used by this
|
|
5990
|
-
* Default implementation does nothing - override in derived classes as needed.
|
|
6357
|
+
* Cleanup resources used by this processor.
|
|
5991
6358
|
*/
|
|
5992
6359
|
async cleanup() {
|
|
6360
|
+
await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
|
|
5993
6361
|
}
|
|
5994
6362
|
}
|
|
5995
|
-
class
|
|
6363
|
+
class GitHubScraperStrategy extends BaseScraperStrategy {
|
|
5996
6364
|
httpFetcher = new HttpFetcher();
|
|
5997
|
-
|
|
5998
|
-
|
|
5999
|
-
// Cache the resolved default branch
|
|
6000
|
-
constructor() {
|
|
6001
|
-
super();
|
|
6002
|
-
this.pipelines = PipelineFactory$1.createStandardPipelines();
|
|
6003
|
-
}
|
|
6365
|
+
wikiProcessor = new GitHubWikiProcessor();
|
|
6366
|
+
repoProcessor = new GitHubRepoProcessor();
|
|
6004
6367
|
canHandle(url) {
|
|
6005
|
-
const { hostname } = new URL(url);
|
|
6006
|
-
return ["github.com", "www.github.com"].includes(hostname);
|
|
6007
|
-
}
|
|
6008
|
-
/**
|
|
6009
|
-
* Override shouldProcessUrl to handle github-file:// URLs specially.
|
|
6010
|
-
* These URLs bypass scope checking since they're internal file references.
|
|
6011
|
-
*/
|
|
6012
|
-
shouldProcessUrl(url, options) {
|
|
6013
6368
|
if (url.startsWith("github-file://")) {
|
|
6014
|
-
|
|
6015
|
-
|
|
6369
|
+
return true;
|
|
6370
|
+
}
|
|
6371
|
+
try {
|
|
6372
|
+
const parsedUrl = new URL(url);
|
|
6373
|
+
const { hostname, pathname } = parsedUrl;
|
|
6374
|
+
if (!["github.com", "www.github.com"].includes(hostname)) {
|
|
6375
|
+
return false;
|
|
6376
|
+
}
|
|
6377
|
+
const baseMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
|
|
6378
|
+
if (baseMatch) {
|
|
6379
|
+
return true;
|
|
6380
|
+
}
|
|
6381
|
+
const treeMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/tree\//);
|
|
6382
|
+
if (treeMatch) {
|
|
6383
|
+
return true;
|
|
6384
|
+
}
|
|
6385
|
+
const blobMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/blob\//);
|
|
6386
|
+
if (blobMatch) {
|
|
6387
|
+
return true;
|
|
6388
|
+
}
|
|
6389
|
+
return false;
|
|
6390
|
+
} catch {
|
|
6391
|
+
return false;
|
|
6016
6392
|
}
|
|
6017
|
-
return super.shouldProcessUrl(url, options);
|
|
6018
6393
|
}
|
|
6019
6394
|
/**
|
|
6020
6395
|
* Parses a GitHub URL to extract repository information.
|
|
@@ -6028,20 +6403,19 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
     const [, owner, repo] = match;
     const segments = parsedUrl.pathname.split("/").filter(Boolean);
     if (segments.length >= 4 && segments[2] === "blob") {
-      const
+      const branch = segments[3];
       const filePath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
-      return { owner, repo, branch
+      return { owner, repo, branch, filePath, isBlob: true };
     }
-    if (segments.length
-
+    if (segments.length >= 4 && segments[2] === "tree") {
+      const branch = segments[3];
+      const subPath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
+      return { owner, repo, branch, subPath };
     }
-
-    const subPath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
-    return { owner, repo, branch, subPath };
+    return { owner, repo };
   }
   /**
    * Fetches the repository tree structure from GitHub API.
-   * Uses 'HEAD' to get the default branch if no branch is specified.
    */
   async fetchRepositoryTree(repoInfo, signal) {
     const { owner, repo, branch } = repoInfo;
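The parsing above works purely on the path segments of a GitHub URL. A standalone sketch of the same blob/tree classification, simplified and without the surrounding class:

```js
// Sketch: classify a GitHub URL by its path segments, mirroring the
// blob/tree handling above (simplified, no error handling).
function classifyGitHubUrl(url) {
  const segments = new URL(url).pathname.split("/").filter(Boolean);
  const [owner, repo, kind, branch, ...rest] = segments;
  if (kind === "blob" && rest.length > 0) {
    return { owner, repo, branch, filePath: rest.join("/"), isBlob: true };
  }
  if (kind === "tree") {
    return { owner, repo, branch, subPath: rest.length ? rest.join("/") : undefined };
  }
  return { owner, repo };
}

console.log(classifyGitHubUrl("https://github.com/owner/repo/blob/main/docs/intro.md"));
// { owner: 'owner', repo: 'repo', branch: 'main', filePath: 'docs/intro.md', isBlob: true }
```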
@@ -6060,7 +6434,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
         targetBranch = "main";
       }
     }
-    this.resolvedBranch = targetBranch;
     const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
     logger.debug(`Fetching repository tree: ${treeUrl}`);
     const rawContent = await this.httpFetcher.fetch(treeUrl, { signal });
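With `recursive=1`, the Git Trees endpoint used above returns every path of the branch in a single response. A minimal sketch of that call with the built-in `fetch` available in Node 18+ (no authentication or rate-limit handling, unlike the strategy's `httpFetcher`):

```js
// Sketch: list all file paths of a repository branch via the Git Trees API.
async function listRepoFiles(owner, repo, branch = "main") {
  const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${branch}?recursive=1`;
  const response = await fetch(treeUrl);
  if (!response.ok) throw new Error(`GitHub API returned ${response.status}`);
  const tree = await response.json();
  // "blob" entries are files; "tree" entries are directories.
  return tree.tree.filter((item) => item.type === "blob").map((item) => item.path);
}
```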
@@ -6082,14 +6455,12 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
|
|
|
6082
6455
|
}
|
|
6083
6456
|
const path2 = item.path;
|
|
6084
6457
|
const textExtensions = [
|
|
6085
|
-
// Documentation
|
|
6086
6458
|
".md",
|
|
6087
6459
|
".mdx",
|
|
6088
6460
|
".txt",
|
|
6089
6461
|
".rst",
|
|
6090
6462
|
".adoc",
|
|
6091
6463
|
".asciidoc",
|
|
6092
|
-
// Web technologies
|
|
6093
6464
|
".html",
|
|
6094
6465
|
".htm",
|
|
6095
6466
|
".xml",
|
|
@@ -6097,7 +6468,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
|
|
|
6097
6468
|
".scss",
|
|
6098
6469
|
".sass",
|
|
6099
6470
|
".less",
|
|
6100
|
-
// Programming languages
|
|
6101
6471
|
".js",
|
|
6102
6472
|
".jsx",
|
|
6103
6473
|
".ts",
|
|
@@ -6133,7 +6503,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
|
|
|
6133
6503
|
".ps1",
|
|
6134
6504
|
".bat",
|
|
6135
6505
|
".cmd",
|
|
6136
|
-
// Configuration and data
|
|
6137
6506
|
".json",
|
|
6138
6507
|
".yaml",
|
|
6139
6508
|
".yml",
|
|
@@ -6147,7 +6516,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
|
|
|
6147
6516
|
".dockerignore",
|
|
6148
6517
|
".gitattributes",
|
|
6149
6518
|
".editorconfig",
|
|
6150
|
-
// Build and package management
|
|
6151
6519
|
".gradle",
|
|
6152
6520
|
".pom",
|
|
6153
6521
|
".sbt",
|
|
@@ -6156,10 +6524,7 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
|
|
|
6156
6524
|
".make",
|
|
6157
6525
|
".dockerfile",
|
|
6158
6526
|
".mod",
|
|
6159
|
-
// Go modules (go.mod)
|
|
6160
6527
|
".sum",
|
|
6161
|
-
// Go checksums (go.sum)
|
|
6162
|
-
// Other text formats
|
|
6163
6528
|
".sql",
|
|
6164
6529
|
".graphql",
|
|
6165
6530
|
".gql",
|
|
@@ -6172,20 +6537,16 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
|
|
|
6172
6537
|
];
|
|
6173
6538
|
const pathLower = path2.toLowerCase();
|
|
6174
6539
|
const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext));
|
|
6175
|
-
const hasCompoundExtension = pathLower.includes(".env.") ||
|
|
6176
|
-
pathLower.endsWith(".env") || pathLower.includes(".config.") || // webpack.config.js, etc.
|
|
6177
|
-
pathLower.includes(".lock");
|
|
6540
|
+
const hasCompoundExtension = pathLower.includes(".env.") || pathLower.endsWith(".env") || pathLower.includes(".config.") || pathLower.includes(".lock");
|
|
6178
6541
|
const fileName = path2.split("/").pop() || "";
|
|
6179
6542
|
const fileNameLower = fileName.toLowerCase();
|
|
6180
6543
|
const commonTextFiles = [
|
|
6181
|
-
// Documentation files without extensions
|
|
6182
6544
|
"readme",
|
|
6183
6545
|
"license",
|
|
6184
6546
|
"changelog",
|
|
6185
6547
|
"contributing",
|
|
6186
6548
|
"authors",
|
|
6187
6549
|
"maintainers",
|
|
6188
|
-
// Build files without extensions
|
|
6189
6550
|
"dockerfile",
|
|
6190
6551
|
"makefile",
|
|
6191
6552
|
"rakefile",
|
|
@@ -6193,374 +6554,125 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
|
|
|
6193
6554
|
"podfile",
|
|
6194
6555
|
"cartfile",
|
|
6195
6556
|
"brewfile",
|
|
6196
|
-
"procfile",
|
|
6197
|
-
"vagrantfile",
|
|
6198
|
-
"gulpfile",
|
|
6199
|
-
"gruntfile",
|
|
6200
|
-
|
|
6201
|
-
".
|
|
6202
|
-
".
|
|
6203
|
-
".
|
|
6204
|
-
".
|
|
6205
|
-
|
|
6206
|
-
|
|
6207
|
-
|
|
6208
|
-
|
|
6209
|
-
return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
|
|
6210
|
-
}
|
|
6211
|
-
return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
|
|
6212
|
-
});
|
|
6213
|
-
if (!hasTextExtension && !hasCompoundExtension && !isCommonTextFile) {
|
|
6214
|
-
return false;
|
|
6215
|
-
}
|
|
6216
|
-
return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
|
|
6217
|
-
}
|
|
6218
|
-
/**
|
|
6219
|
-
* Fetches the raw content of a file from GitHub.
|
|
6220
|
-
*/
|
|
6221
|
-
async fetchFileContent(repoInfo, filePath, signal) {
|
|
6222
|
-
const { owner, repo } = repoInfo;
|
|
6223
|
-
const branch = this.resolvedBranch || repoInfo.branch || "main";
|
|
6224
|
-
const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
|
|
6225
|
-
const rawContent = await this.httpFetcher.fetch(rawUrl, { signal });
|
|
6226
|
-
const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
|
|
6227
|
-
if (detectedMimeType && rawContent.mimeType === "text/plain") {
|
|
6228
|
-
return {
|
|
6229
|
-
...rawContent,
|
|
6230
|
-
mimeType: detectedMimeType
|
|
6231
|
-
};
|
|
6232
|
-
}
|
|
6233
|
-
return rawContent;
|
|
6234
|
-
}
|
|
6235
|
-
async processItem(item, options, _progressCallback, signal) {
|
|
6236
|
-
const repoInfo = this.parseGitHubUrl(options.url);
|
|
6237
|
-
if (item.depth === 0) {
|
|
6238
|
-
if ("isBlob" in repoInfo && repoInfo.isBlob) {
|
|
6239
|
-
if (repoInfo.filePath) {
|
|
6240
|
-
logger.info(
|
|
6241
|
-
`📄 Processing single file: ${repoInfo.owner}/${repoInfo.repo}/${repoInfo.filePath}`
|
|
6242
|
-
);
|
|
6243
|
-
return { links: [`github-file://${repoInfo.filePath}`] };
|
|
6244
|
-
} else {
|
|
6245
|
-
logger.warn(
|
|
6246
|
-
`⚠️ Blob URL without file path: ${options.url}. No files to process.`
|
|
6247
|
-
);
|
|
6248
|
-
return { links: [] };
|
|
6249
|
-
}
|
|
6250
|
-
}
|
|
6251
|
-
logger.info(
|
|
6252
|
-
`🗂️ Discovering repository structure for ${repoInfo.owner}/${repoInfo.repo}`
|
|
6253
|
-
);
|
|
6254
|
-
const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
|
|
6255
|
-
const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
|
|
6256
|
-
logger.info(
|
|
6257
|
-
`📁 Found ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
|
|
6258
|
-
);
|
|
6259
|
-
const links = fileItems.map((treeItem) => `github-file://${treeItem.path}`);
|
|
6260
|
-
return { links };
|
|
6261
|
-
}
|
|
6262
|
-
if (item.url.startsWith("github-file://")) {
|
|
6263
|
-
const filePath = item.url.replace("github-file://", "");
|
|
6264
|
-
logger.info(
|
|
6265
|
-
`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`
|
|
6266
|
-
);
|
|
6267
|
-
const rawContent = await this.fetchFileContent(repoInfo, filePath, signal);
|
|
6268
|
-
let processed;
|
|
6269
|
-
for (const pipeline of this.pipelines) {
|
|
6270
|
-
if (pipeline.canProcess(rawContent)) {
|
|
6271
|
-
logger.debug(
|
|
6272
|
-
`Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
|
|
6273
|
-
);
|
|
6274
|
-
const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
|
|
6275
|
-
processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
|
|
6276
|
-
break;
|
|
6277
|
-
}
|
|
6278
|
-
}
|
|
6279
|
-
if (!processed) {
|
|
6280
|
-
logger.warn(
|
|
6281
|
-
`⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
|
|
6282
|
-
);
|
|
6283
|
-
return { document: void 0, links: [] };
|
|
6284
|
-
}
|
|
6285
|
-
for (const err of processed.errors) {
|
|
6286
|
-
logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
|
|
6287
|
-
}
|
|
6288
|
-
const githubUrl = `https://github.com/${repoInfo.owner}/${repoInfo.repo}/blob/${this.resolvedBranch || repoInfo.branch || "main"}/${filePath}`;
|
|
6289
|
-
const processedTitle = processed.metadata.title;
|
|
6290
|
-
const hasValidTitle = typeof processedTitle === "string" && processedTitle.trim() !== "";
|
|
6291
|
-
const fallbackTitle = filePath.split("/").pop() || "Untitled";
|
|
6292
|
-
return {
|
|
6293
|
-
document: {
|
|
6294
|
-
content: typeof processed.textContent === "string" ? processed.textContent : "",
|
|
6295
|
-
metadata: {
|
|
6296
|
-
url: githubUrl,
|
|
6297
|
-
title: hasValidTitle ? processedTitle : fallbackTitle,
|
|
6298
|
-
library: options.library,
|
|
6299
|
-
version: options.version
|
|
6300
|
-
},
|
|
6301
|
-
contentType: rawContent.mimeType
|
|
6302
|
-
// Preserve the detected MIME type
|
|
6303
|
-
},
|
|
6304
|
-
links: []
|
|
6305
|
-
// Always return empty links array for individual files
|
|
6306
|
-
};
|
|
6307
|
-
}
|
|
6308
|
-
return { document: void 0, links: [] };
|
|
6309
|
-
}
|
|
6310
|
-
/**
|
|
6311
|
-
* Normalize a path by removing leading and trailing slashes.
|
|
6312
|
-
*/
|
|
6313
|
-
normalizePath(path2) {
|
|
6314
|
-
return path2.replace(/^\/+/, "").replace(/\/+$/, "");
|
|
6315
|
-
}
|
|
6316
|
-
isWithinSubPath(path2, subPath) {
|
|
6317
|
-
if (!subPath) {
|
|
6318
|
-
return true;
|
|
6319
|
-
}
|
|
6320
|
-
const trimmedSubPath = this.normalizePath(subPath);
|
|
6321
|
-
if (trimmedSubPath.length === 0) {
|
|
6322
|
-
return true;
|
|
6323
|
-
}
|
|
6324
|
-
const normalizedPath = this.normalizePath(path2);
|
|
6325
|
-
if (normalizedPath === trimmedSubPath) {
|
|
6326
|
-
return true;
|
|
6327
|
-
}
|
|
6328
|
-
return normalizedPath.startsWith(`${trimmedSubPath}/`);
|
|
6329
|
-
}
|
|
6330
|
-
async scrape(options, progressCallback, signal) {
|
|
6331
|
-
const url = new URL(options.url);
|
|
6332
|
-
if (!url.hostname.includes("github.com")) {
|
|
6333
|
-
throw new Error("URL must be a GitHub URL");
|
|
6334
|
-
}
|
|
6335
|
-
return super.scrape(options, progressCallback, signal);
|
|
6336
|
-
}
|
|
6337
|
-
/**
|
|
6338
|
-
* Cleanup resources used by this strategy, specifically the pipeline browser instances.
|
|
6339
|
-
*/
|
|
6340
|
-
async cleanup() {
|
|
6341
|
-
await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
|
|
6342
|
-
}
|
|
6343
|
-
}
|
|
6344
|
-
class GitHubWikiScraperStrategy extends BaseScraperStrategy {
|
|
6345
|
-
httpFetcher = new HttpFetcher();
|
|
6346
|
-
pipelines;
|
|
6347
|
-
constructor() {
|
|
6348
|
-
super();
|
|
6349
|
-
this.pipelines = PipelineFactory$1.createStandardPipelines();
|
|
6350
|
-
}
|
|
6351
|
-
canHandle(url) {
|
|
6352
|
-
try {
|
|
6353
|
-
const parsedUrl = new URL(url);
|
|
6354
|
-
const { hostname, pathname } = parsedUrl;
|
|
6355
|
-
return ["github.com", "www.github.com"].includes(hostname) && pathname.includes("/wiki") && pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/) !== null;
|
|
6356
|
-
} catch {
|
|
6357
|
-
return false;
|
|
6358
|
-
}
|
|
6359
|
-
}
|
|
6360
|
-
/**
|
|
6361
|
-
* Parses a GitHub wiki URL to extract repository information.
|
|
6362
|
-
*/
|
|
6363
|
-
parseGitHubWikiUrl(url) {
|
|
6364
|
-
const parsedUrl = new URL(url);
|
|
6365
|
-
const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/);
|
|
6366
|
-
if (!match) {
|
|
6367
|
-
throw new Error(`Invalid GitHub wiki URL: ${url}`);
|
|
6368
|
-
}
|
|
6369
|
-
const [, owner, repo] = match;
|
|
6370
|
-
return { owner, repo };
|
|
6371
|
-
}
|
|
6372
|
-
/**
|
|
6373
|
-
* Override shouldProcessUrl to only process URLs within the wiki scope.
|
|
6374
|
-
*/
|
|
6375
|
-
shouldProcessUrl(url, options) {
|
|
6376
|
-
try {
|
|
6377
|
-
const parsedUrl = new URL(url);
|
|
6378
|
-
const wikiInfo = this.parseGitHubWikiUrl(options.url);
|
|
6379
|
-
const expectedWikiPath = `/${wikiInfo.owner}/${wikiInfo.repo}/wiki`;
|
|
6380
|
-
if (!parsedUrl.pathname.startsWith(expectedWikiPath)) {
|
|
6381
|
-
return false;
|
|
6382
|
-
}
|
|
6383
|
-
const wikiPagePath = parsedUrl.pathname.replace(expectedWikiPath, "").replace(/^\//, "");
|
|
6384
|
-
return shouldIncludeUrl(
|
|
6385
|
-
wikiPagePath || "Home",
|
|
6386
|
-
options.includePatterns,
|
|
6387
|
-
options.excludePatterns
|
|
6388
|
-
);
|
|
6389
|
-
} catch {
|
|
6390
|
-
return false;
|
|
6391
|
-
}
|
|
6392
|
-
}
|
|
6393
|
-
async processItem(item, options, _progressCallback, signal) {
|
|
6394
|
-
const currentUrl = item.url;
|
|
6395
|
-
logger.info(
|
|
6396
|
-
`📖 Processing wiki page ${this.pageCount}/${options.maxPages}: ${currentUrl}`
|
|
6397
|
-
);
|
|
6398
|
-
try {
|
|
6399
|
-
const rawContent = await this.httpFetcher.fetch(currentUrl, { signal });
|
|
6400
|
-
let processed;
|
|
6401
|
-
for (const pipeline of this.pipelines) {
|
|
6402
|
-
if (pipeline.canProcess(rawContent)) {
|
|
6403
|
-
logger.debug(
|
|
6404
|
-
`Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`
|
|
6405
|
-
);
|
|
6406
|
-
const wikiOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
|
|
6407
|
-
processed = await pipeline.process(rawContent, wikiOptions, this.httpFetcher);
|
|
6408
|
-
break;
|
|
6409
|
-
}
|
|
6410
|
-
}
|
|
6411
|
-
if (!processed) {
|
|
6412
|
-
logger.warn(
|
|
6413
|
-
`⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`
|
|
6414
|
-
);
|
|
6415
|
-
return { document: void 0, links: [] };
|
|
6416
|
-
}
|
|
6417
|
-
for (const err of processed.errors) {
|
|
6418
|
-
logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`);
|
|
6557
|
+
"procfile",
|
|
6558
|
+
"vagrantfile",
|
|
6559
|
+
"gulpfile",
|
|
6560
|
+
"gruntfile",
|
|
6561
|
+
".prettierrc",
|
|
6562
|
+
".eslintrc",
|
|
6563
|
+
".babelrc",
|
|
6564
|
+
".nvmrc",
|
|
6565
|
+
".npmrc"
|
|
6566
|
+
];
|
|
6567
|
+
const isCommonTextFile = commonTextFiles.some((name2) => {
|
|
6568
|
+
if (name2.startsWith(".")) {
|
|
6569
|
+
return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
|
|
6419
6570
|
}
|
|
6420
|
-
|
|
6421
|
-
|
|
6422
|
-
|
|
6423
|
-
|
|
6424
|
-
const document2 = {
|
|
6425
|
-
content: typeof processed.textContent === "string" ? processed.textContent : "",
|
|
6426
|
-
metadata: {
|
|
6427
|
-
url: currentUrl,
|
|
6428
|
-
title: typeof processed.metadata.title === "string" && processed.metadata.title.trim() !== "" ? processed.metadata.title : pageTitle,
|
|
6429
|
-
library: options.library,
|
|
6430
|
-
version: options.version
|
|
6431
|
-
},
|
|
6432
|
-
contentType: rawContent.mimeType
|
|
6433
|
-
};
|
|
6434
|
-
const links = processed.links || [];
|
|
6435
|
-
const wikiLinks = links.filter((link) => {
|
|
6436
|
-
if (!link || link.trim() === "" || link === "invalid-url" || link === "not-a-url-at-all") {
|
|
6437
|
-
return false;
|
|
6438
|
-
}
|
|
6439
|
-
return true;
|
|
6440
|
-
}).map((link) => {
|
|
6441
|
-
try {
|
|
6442
|
-
return new URL(link, currentUrl).href;
|
|
6443
|
-
} catch {
|
|
6444
|
-
return null;
|
|
6445
|
-
}
|
|
6446
|
-
}).filter((link) => link !== null).filter((link) => {
|
|
6447
|
-
try {
|
|
6448
|
-
const linkUrl = new URL(link);
|
|
6449
|
-
return linkUrl.hostname === parsedUrl.hostname && linkUrl.pathname.startsWith(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`);
|
|
6450
|
-
} catch {
|
|
6451
|
-
return false;
|
|
6452
|
-
}
|
|
6453
|
-
});
|
|
6454
|
-
return { document: document2, links: wikiLinks };
|
|
6455
|
-
} catch (error) {
|
|
6456
|
-
logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`);
|
|
6457
|
-
return { document: void 0, links: [] };
|
|
6458
|
-
}
|
|
6459
|
-
}
|
|
6460
|
-
async scrape(options, progressCallback, signal) {
|
|
6461
|
-
const url = new URL(options.url);
|
|
6462
|
-
if (!url.hostname.includes("github.com") || !url.pathname.includes("/wiki")) {
|
|
6463
|
-
throw new Error("URL must be a GitHub wiki URL");
|
|
6571
|
+
return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
|
|
6572
|
+
});
|
|
6573
|
+
if (hasTextExtension || hasCompoundExtension || isCommonTextFile) {
|
|
6574
|
+
return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
|
|
6464
6575
|
}
|
|
6465
|
-
|
|
6466
|
-
if (
|
|
6467
|
-
|
|
6576
|
+
const mimeType = mime.getType(path2);
|
|
6577
|
+
if (mimeType?.startsWith("text/")) {
|
|
6578
|
+
logger.debug(`Including file with text MIME type: ${path2} (${mimeType})`);
|
|
6579
|
+
return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
|
|
6468
6580
|
}
|
|
6469
|
-
|
|
6470
|
-
return super.scrape(wikiOptions, progressCallback, signal);
|
|
6581
|
+
return false;
|
|
6471
6582
|
}
|
|
6472
6583
|
/**
|
|
6473
|
-
*
|
|
6584
|
+
* Checks if a path is within the specified subpath.
|
|
6474
6585
|
*/
|
|
6475
|
-
|
|
6476
|
-
|
|
6586
|
+
isWithinSubPath(path2, subPath) {
|
|
6587
|
+
if (!subPath) {
|
|
6588
|
+
return true;
|
|
6589
|
+
}
|
|
6590
|
+
const trimmedSubPath = subPath.replace(/^\/+/, "").replace(/\/+$/, "");
|
|
6591
|
+
if (trimmedSubPath.length === 0) {
|
|
6592
|
+
return true;
|
|
6593
|
+
}
|
|
6594
|
+
const normalizedPath = path2.replace(/^\/+/, "").replace(/\/+$/, "");
|
|
6595
|
+
if (normalizedPath === trimmedSubPath) {
|
|
6596
|
+
return true;
|
|
6597
|
+
}
|
|
6598
|
+
return normalizedPath.startsWith(`${trimmedSubPath}/`);
|
|
6477
6599
|
}
|
|
6478
|
-
|
|
6479
|
-
|
|
6480
|
-
|
|
6481
|
-
|
|
6482
|
-
|
|
6600
|
+
async processItem(item, options, signal) {
|
|
6601
|
+
if (item.url.startsWith("github-file://")) {
|
|
6602
|
+
logger.info(
|
|
6603
|
+
`🗑️ Legacy github-file:// URL detected, marking as deleted: ${item.url}`
|
|
6604
|
+
);
|
|
6605
|
+
return {
|
|
6606
|
+
url: item.url,
|
|
6607
|
+
links: [],
|
|
6608
|
+
status: FetchStatus.NOT_FOUND
|
|
6609
|
+
};
|
|
6610
|
+
}
|
|
6483
6611
|
try {
|
|
6484
|
-
const parsedUrl = new URL(url);
|
|
6485
|
-
|
|
6486
|
-
|
|
6487
|
-
return false;
|
|
6612
|
+
const parsedUrl = new URL(item.url);
|
|
6613
|
+
if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) {
|
|
6614
|
+
return await this.wikiProcessor.process(item, options, signal);
|
|
6488
6615
|
}
|
|
6489
|
-
const pathMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
|
|
6490
|
-
return pathMatch !== null;
|
|
6491
6616
|
} catch {
|
|
6492
|
-
return false;
|
|
6493
|
-
}
|
|
6494
|
-
}
|
|
6495
|
-
async scrape(options, progressCallback, signal) {
|
|
6496
|
-
const url = new URL(options.url);
|
|
6497
|
-
if (!url.hostname.includes("github.com")) {
|
|
6498
|
-
throw new Error("URL must be a GitHub URL");
|
|
6499
6617
|
}
|
|
6500
|
-
|
|
6501
|
-
|
|
6502
|
-
|
|
6503
|
-
|
|
6504
|
-
|
|
6505
|
-
|
|
6506
|
-
|
|
6507
|
-
|
|
6508
|
-
|
|
6509
|
-
|
|
6510
|
-
|
|
6511
|
-
|
|
6512
|
-
|
|
6513
|
-
|
|
6514
|
-
|
|
6515
|
-
|
|
6516
|
-
|
|
6517
|
-
pagesScraped: wikiPagesScraped + progress.pagesScraped,
|
|
6518
|
-
totalPages: wikiPagesScraped + progress.totalPages,
|
|
6519
|
-
totalDiscovered: totalPagesDiscovered + progress.totalDiscovered
|
|
6618
|
+
if (item.depth === 0) {
|
|
6619
|
+
const repoInfo = this.parseGitHubUrl(options.url);
|
|
6620
|
+
const { owner, repo } = repoInfo;
|
|
6621
|
+
logger.debug(`Discovering GitHub repository ${owner}/${repo}`);
|
|
6622
|
+
const discoveredLinks = [];
|
|
6623
|
+
if ("isBlob" in repoInfo && repoInfo.isBlob && repoInfo.filePath) {
|
|
6624
|
+
const { branch = "main", filePath } = repoInfo;
|
|
6625
|
+
logger.debug(
|
|
6626
|
+
`Single file URL detected: ${owner}/${repo}/${filePath} - indexing file only`
|
|
6627
|
+
);
|
|
6628
|
+
discoveredLinks.push(
|
|
6629
|
+
`https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`
|
|
6630
|
+
);
|
|
6631
|
+
return {
|
|
6632
|
+
url: item.url,
|
|
6633
|
+
links: discoveredLinks,
|
|
6634
|
+
status: FetchStatus.SUCCESS
|
|
6520
6635
|
};
|
|
6521
6636
|
}
|
|
6522
|
-
await progressCallback(progress);
|
|
6523
|
-
};
|
|
6524
|
-
try {
|
|
6525
6637
|
const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
|
|
6526
|
-
|
|
6527
|
-
logger.
|
|
6528
|
-
|
|
6529
|
-
|
|
6530
|
-
|
|
6531
|
-
|
|
6532
|
-
|
|
6533
|
-
|
|
6534
|
-
|
|
6535
|
-
|
|
6536
|
-
|
|
6537
|
-
|
|
6538
|
-
|
|
6539
|
-
|
|
6540
|
-
|
|
6541
|
-
|
|
6542
|
-
|
|
6543
|
-
|
|
6544
|
-
|
|
6545
|
-
|
|
6546
|
-
|
|
6547
|
-
logger.info(`✅ Completed repository code scraping for ${owner}/${repo}`);
|
|
6548
|
-
} else {
|
|
6549
|
-
logger.info(
|
|
6550
|
-
`ℹ️ Skipping repository code scraping - page limit reached with wiki content`
|
|
6551
|
-
);
|
|
6638
|
+
discoveredLinks.push(wikiUrl);
|
|
6639
|
+
logger.debug(`Discovered wiki URL: ${wikiUrl}`);
|
|
6640
|
+
const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
|
|
6641
|
+
const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
|
|
6642
|
+
logger.debug(
|
|
6643
|
+
`Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
|
|
6644
|
+
);
|
|
6645
|
+
const fileUrls = fileItems.map(
|
|
6646
|
+
(treeItem) => `https://github.com/${owner}/${repo}/blob/${resolvedBranch}/${treeItem.path}`
|
|
6647
|
+
);
|
|
6648
|
+
discoveredLinks.push(...fileUrls);
|
|
6649
|
+
logger.debug(
|
|
6650
|
+
`Discovery complete: ${fileUrls.length} repo file(s) + 1 wiki URL = ${discoveredLinks.length} total URLs`
|
|
6651
|
+
);
|
|
6652
|
+
return { url: item.url, links: discoveredLinks, status: FetchStatus.SUCCESS };
|
|
6653
|
+
}
|
|
6654
|
+
try {
|
|
6655
|
+
const parsedUrl = new URL(item.url);
|
|
6656
|
+
if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) {
|
|
6657
|
+
logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
|
|
6658
|
+
return await this.repoProcessor.process(item, options, signal);
|
|
6552
6659
|
}
|
|
6553
|
-
logger.info(`🎉 Comprehensive GitHub scraping completed for ${owner}/${repo}`);
|
|
6554
6660
|
} catch (error) {
|
|
6555
|
-
logger.
|
|
6556
|
-
|
|
6661
|
+
logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
|
|
6662
|
+
return { url: item.url, links: [], status: FetchStatus.SUCCESS };
|
|
6557
6663
|
}
|
|
6664
|
+
logger.debug(`No further processing for URL at depth ${item.depth}: ${item.url}`);
|
|
6665
|
+
return { url: item.url, links: [], status: FetchStatus.SUCCESS };
|
|
6666
|
+
}
|
|
6667
|
+
async scrape(options, progressCallback, signal) {
|
|
6668
|
+
const url = new URL(options.url);
|
|
6669
|
+
if (!url.hostname.includes("github.com")) {
|
|
6670
|
+
throw new Error("URL must be a GitHub URL");
|
|
6671
|
+
}
|
|
6672
|
+
await super.scrape(options, progressCallback, signal);
|
|
6558
6673
|
}
|
|
6559
|
-
/**
|
|
6560
|
-
* Cleanup resources used by both underlying strategies.
|
|
6561
|
-
*/
|
|
6562
6674
|
async cleanup() {
|
|
6563
|
-
await Promise.
|
|
6675
|
+
await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
|
|
6564
6676
|
}
|
|
6565
6677
|
}
|
|
6566
6678
|
class LocalFileStrategy extends BaseScraperStrategy {
|
|
@@ -6573,23 +6685,41 @@ class LocalFileStrategy extends BaseScraperStrategy {
   canHandle(url) {
     return url.startsWith("file://");
   }
-  async processItem(item, options,
+  async processItem(item, options, _signal) {
     let filePath = item.url.replace(/^file:\/\/\/?/, "");
     filePath = decodeURIComponent(filePath);
     if (!filePath.startsWith("/") && process.platform !== "win32") {
       filePath = `/${filePath}`;
     }
-
+    let stats;
+    try {
+      stats = await fs$1.stat(filePath);
+    } catch (error) {
+      if (error.code === "ENOENT") {
+        logger.info(`✓ File deleted or not available: ${filePath}`);
+        return {
+          url: item.url,
+          links: [],
+          status: FetchStatus.NOT_FOUND
+        };
+      }
+      throw error;
+    }
     if (stats.isDirectory()) {
       const contents = await fs$1.readdir(filePath);
       const links = contents.map((name2) => `file://${path.join(filePath, name2)}`).filter((url) => this.shouldProcessUrl(url, options));
-      return { links };
+      return { url: item.url, links, status: FetchStatus.SUCCESS };
+    }
+    const rawContent = await this.fileFetcher.fetch(item.url, {
+      etag: item.etag
+    });
+    if (rawContent.status === FetchStatus.NOT_MODIFIED) {
+      logger.debug(`✓ File unchanged: ${filePath}`);
+      return { url: rawContent.source, links: [], status: FetchStatus.NOT_MODIFIED };
     }
-    logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
-    const rawContent = await this.fileFetcher.fetch(item.url);
     let processed;
     for (const pipeline of this.pipelines) {
-      if (pipeline.canProcess(rawContent)) {
+      if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
         logger.debug(
           `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
         );
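How `fileFetcher` derives the `etag` it compares against `item.etag` is not shown in this hunk. One plausible scheme (an assumption for illustration, not the package's actual `FileFetcher` code) derives a weak ETag from the file's size and mtime, so an unchanged file can return `NOT_MODIFIED` without being re-read:

```js
import { createHash } from "node:crypto";
import { readFile, stat } from "node:fs/promises";

// Sketch (assumption): size + mtime is enough to detect local file changes
// between refresh runs without hashing the whole file contents.
async function fetchLocalFile(filePath, previousEtag) {
  const stats = await stat(filePath);
  const etag = createHash("sha1").update(`${stats.size}:${stats.mtimeMs}`).digest("hex");
  if (previousEtag && etag === previousEtag) {
    return { status: "NOT_MODIFIED", etag };
  }
  return { status: "SUCCESS", etag, content: await readFile(filePath) };
}
```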
@@ -6601,22 +6731,22 @@ class LocalFileStrategy extends BaseScraperStrategy {
       logger.warn(
         `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
       );
-      return {
+      return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
     }
-    for (const err of processed.errors) {
+    for (const err of processed.errors ?? []) {
       logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
     }
+    const filename = path.basename(filePath);
+    const title = processed.title?.trim() || filename || null;
     return {
-
-
-
-
-
-
-
-
-      }
-    }
+      url: rawContent.source,
+      title,
+      etag: rawContent.etag,
+      lastModified: rawContent.lastModified,
+      contentType: rawContent.mimeType,
+      content: processed,
+      links: [],
+      status: FetchStatus.SUCCESS
     };
   }
   /**
@@ -6652,19 +6782,32 @@ class WebScraperStrategy extends BaseScraperStrategy {
    * @param signal - Optional abort signal for request cancellation.
    * @returns An object containing the processed document and extracted links.
    */
-  async processItem(item, options,
+  async processItem(item, options, signal) {
     const { url } = item;
     try {
+      if (item.etag) {
+        logger.debug(`Processing ${url} with stored ETag: ${item.etag}`);
+      }
       const fetchOptions = {
         signal,
         followRedirects: options.followRedirects,
-        headers: options.headers
+        headers: options.headers,
         // Forward custom headers
+        etag: item.etag
+        // Pass ETag for conditional requests
       };
       const rawContent = await this.fetcher.fetch(url, fetchOptions);
+      logger.debug(
+        `Fetch result for ${url}: status=${rawContent.status}, etag=${rawContent.etag || "none"}`
+      );
+      if (rawContent.status !== FetchStatus.SUCCESS) {
+        logger.debug(`Skipping pipeline for ${url} due to status: ${rawContent.status}`);
+        return { url: rawContent.source, links: [], status: rawContent.status };
+      }
       let processed;
       for (const pipeline of this.pipelines) {
-
+        const contentBuffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
+        if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
           logger.debug(
             `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`
           );
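Passing `etag` through `fetchOptions` lets the fetcher issue a conditional request and report `NOT_MODIFIED` instead of a fresh body. A minimal sketch of the underlying HTTP exchange with axios (already imported by this bundle); the real `HttpFetcher` layers custom headers, redirects, and retries on top of this:

```js
import axios from "axios";

// Sketch: conditional GET using If-None-Match; a 304 response means the
// stored copy is still current and the body can be skipped.
async function conditionalFetch(url, previousEtag) {
  const response = await axios.get(url, {
    headers: previousEtag ? { "If-None-Match": previousEtag } : {},
    // Treat 304 as a normal outcome rather than an error.
    validateStatus: (status) => (status >= 200 && status < 300) || status === 304,
  });
  if (response.status === 304) {
    return { status: "NOT_MODIFIED", etag: previousEtag };
  }
  return { status: "SUCCESS", etag: response.headers.etag, content: response.data };
}
```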
@@ -6676,40 +6819,47 @@ class WebScraperStrategy extends BaseScraperStrategy {
           logger.warn(
             `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
           );
-          return {
+          return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
       }
-      for (const err of processed.errors) {
+      for (const err of processed.errors ?? []) {
         logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
       }
       if (!processed.textContent || !processed.textContent.trim()) {
         logger.warn(
           `⚠️ No processable content found for ${url} after pipeline execution.`
         );
-        return {
+        return {
+          url: rawContent.source,
+          links: processed.links,
+          status: FetchStatus.SUCCESS
+        };
       }
-
-
+      if (item.depth === 0) {
+        this.canonicalBaseUrl = new URL(rawContent.source);
+      }
+      const filteredLinks = processed.links?.filter((link) => {
         try {
           const targetUrl = new URL(link);
-
-
+          if (!this.shouldProcessUrl(targetUrl.href, options)) {
+            return false;
+          }
+          if (this.shouldFollowLinkFn) {
+            const baseUrl = this.canonicalBaseUrl ?? new URL(options.url);
+            return this.shouldFollowLinkFn(baseUrl, targetUrl);
+          }
+          return true;
         } catch {
           return false;
         }
-      });
+      }) ?? [];
       return {
-
-
-
-
-
-        library: options.library,
-        version: options.version,
-        ...processed.metadata
-      }
-    },
+        url: rawContent.source,
+        etag: rawContent.etag,
+        lastModified: rawContent.lastModified,
+        contentType: processed.contentType || rawContent.mimeType,
+        content: processed,
         links: filteredLinks,
-
+        status: FetchStatus.SUCCESS
       };
     } catch (error) {
       logger.error(`❌ Failed processing page ${url}: ${error}`);
@@ -6786,7 +6936,6 @@ class ScraperRegistry {
     this.strategies = [
       new NpmScraperStrategy(),
       new PyPiScraperStrategy(),
-      new GitHubWikiScraperStrategy(),
       new GitHubScraperStrategy(),
       new WebScraperStrategy(),
       new LocalFileStrategy()
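With `GitHubWikiScraperStrategy` removed from the list, wiki URLs now fall through to `GitHubScraperStrategy`, which routes them to its internal `GitHubWikiProcessor`. Registry dispatch itself stays first-match-wins over `canHandle`; a sketch of that selection (the helper name is illustrative, the registry's exact method is not shown in this hunk):

```js
// Sketch: pick the first strategy whose canHandle() accepts the URL.
// Order matters: more specific strategies come before WebScraperStrategy.
function selectStrategy(strategies, url) {
  const strategy = strategies.find((candidate) => candidate.canHandle(url));
  if (!strategy) {
    throw new Error(`No scraper strategy found for ${url}`);
  }
  return strategy;
}
```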
@@ -6848,55 +6997,64 @@ class PipelineWorker {
|
|
|
6848
6997
|
* @param callbacks - Callbacks provided by the manager for reporting.
|
|
6849
6998
|
*/
|
|
6850
6999
|
async executeJob(job, callbacks) {
|
|
6851
|
-
const {
|
|
6852
|
-
id: jobId,
|
|
6853
|
-
library,
|
|
6854
|
-
version: version2,
|
|
6855
|
-
sourceUrl,
|
|
6856
|
-
scraperOptions,
|
|
6857
|
-
abortController
|
|
6858
|
-
} = job;
|
|
7000
|
+
const { id: jobId, library, version: version2, scraperOptions, abortController } = job;
|
|
6859
7001
|
const signal = abortController.signal;
|
|
6860
7002
|
logger.debug(`[${jobId}] Worker starting job for ${library}@${version2}`);
|
|
6861
7003
|
try {
|
|
6862
|
-
|
|
6863
|
-
|
|
6864
|
-
|
|
6865
|
-
|
|
6866
|
-
|
|
6867
|
-
|
|
6868
|
-
|
|
6869
|
-
|
|
6870
|
-
|
|
6871
|
-
}
|
|
7004
|
+
if (!scraperOptions.isRefresh) {
|
|
7005
|
+
await this.store.removeAllDocuments(library, version2);
|
|
7006
|
+
logger.info(
|
|
7007
|
+
`💾 Cleared store for ${library}@${version2 || "[no version]"} before scraping.`
|
|
7008
|
+
);
|
|
7009
|
+
} else {
|
|
7010
|
+
logger.info(
|
|
7011
|
+
`🔄 Refresh operation - preserving existing data for ${library}@${version2 || "[no version]"}.`
|
|
7012
|
+
);
|
|
7013
|
+
}
|
|
6872
7014
|
await this.scraperService.scrape(
|
|
6873
|
-
|
|
7015
|
+
scraperOptions,
|
|
6874
7016
|
async (progress) => {
|
|
6875
7017
|
if (signal.aborted) {
|
|
6876
7018
|
throw new CancellationError("Job cancelled during scraping progress");
|
|
6877
7019
|
}
|
|
6878
7020
|
await callbacks.onJobProgress?.(job, progress);
|
|
6879
|
-
if (progress.
|
|
7021
|
+
if (progress.deleted && progress.pageId) {
|
|
6880
7022
|
try {
|
|
6881
|
-
await this.store.
|
|
6882
|
-
pageContent: progress.document.content,
|
|
6883
|
-
metadata: {
|
|
6884
|
-
...progress.document.metadata,
|
|
6885
|
-
mimeType: progress.document.contentType
|
|
6886
|
-
// Pass contentType as mimeType in metadata
|
|
6887
|
-
}
|
|
6888
|
-
});
|
|
7023
|
+
await this.store.deletePage(progress.pageId);
|
|
6889
7024
|
logger.debug(
|
|
6890
|
-
`[${jobId}]
|
|
7025
|
+
`[${jobId}] Deleted page ${progress.pageId}: ${progress.currentUrl}`
|
|
7026
|
+
);
|
|
7027
|
+
} catch (docError) {
|
|
7028
|
+
logger.error(
|
|
7029
|
+
`❌ [${jobId}] Failed to delete page ${progress.pageId}: ${docError}`
|
|
7030
|
+
);
|
|
7031
|
+
const error = docError instanceof Error ? docError : new Error(String(docError));
|
|
7032
|
+
await callbacks.onJobError?.(job, error);
|
|
7033
|
+
throw error;
|
|
7034
|
+
}
|
|
7035
|
+
} else if (progress.result) {
|
|
7036
|
+
try {
|
|
7037
|
+
if (progress.pageId) {
|
|
7038
|
+
await this.store.deletePage(progress.pageId);
|
|
7039
|
+
logger.debug(
|
|
7040
|
+
`[${jobId}] Refreshing page ${progress.pageId}: ${progress.currentUrl}`
|
|
7041
|
+
);
|
|
7042
|
+
}
|
|
7043
|
+
await this.store.addScrapeResult(
|
|
7044
|
+
library,
|
|
7045
|
+
version2,
|
|
7046
|
+
progress.depth,
|
|
7047
|
+
progress.result
|
|
6891
7048
|
);
|
|
7049
|
+
logger.debug(`[${jobId}] Stored processed content: ${progress.currentUrl}`);
|
|
6892
7050
|
} catch (docError) {
|
|
6893
7051
|
logger.error(
|
|
6894
|
-
`❌ [${jobId}] Failed to
|
|
7052
|
+
`❌ [${jobId}] Failed to process content ${progress.currentUrl}: ${docError}`
|
|
6895
7053
|
);
|
|
6896
7054
|
await callbacks.onJobError?.(
|
|
6897
7055
|
job,
|
|
6898
7056
|
docError instanceof Error ? docError : new Error(String(docError)),
|
|
6899
|
-
progress.
|
|
7057
|
+
progress.result
|
|
6900
7058
|
);
|
|
6901
7059
|
}
|
|
6902
7060
|
}
|
|
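
The hunk above reworks PipelineWorker.executeJob so that refresh runs keep existing data (the store is only cleared for non-refresh jobs) and per-page progress events are dispatched three ways: pages reported as deleted are dropped via deletePage, existing pages being re-scraped have their stale chunks removed first, and the processed result is then stored with addScrapeResult. A minimal standalone sketch of that dispatch, with simplified types that are assumptions for illustration rather than the package's real interfaces:

// Simplified progress handling mirroring the dispatch in executeJob above.
// The types are illustrative assumptions, not the package's actual interfaces.
interface ScrapeProgress {
  pageId?: number;      // set when the page already exists in the store
  deleted?: boolean;    // set when the refresh found the page gone (e.g. 404)
  depth: number;
  currentUrl: string;
  result?: { url: string; title: string; chunks: unknown[] };
}

interface Store {
  deletePage(pageId: number): Promise<void>;
  addScrapeResult(
    library: string,
    version: string,
    depth: number,
    result: NonNullable<ScrapeProgress["result"]>,
  ): Promise<void>;
}

async function handleProgress(
  store: Store,
  library: string,
  version: string,
  p: ScrapeProgress,
): Promise<void> {
  if (p.deleted && p.pageId) {
    // Page no longer exists upstream: drop its chunks and the page row.
    await store.deletePage(p.pageId);
  } else if (p.result) {
    if (p.pageId) {
      // Refresh of an existing page: remove the stale chunks first.
      await store.deletePage(p.pageId);
    }
    await store.addScrapeResult(library, version, p.depth, p.result);
  }
}
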
@@ -7108,15 +7266,8 @@ class PipelineManager {
|
|
|
7108
7266
|
/**
|
|
7109
7267
|
* Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned).
|
|
7110
7268
|
*/
|
|
7111
|
-
async
|
|
7269
|
+
async enqueueScrapeJob(library, version2, options) {
|
|
7112
7270
|
const normalizedVersion = version2 ?? "";
|
|
7113
|
-
const {
|
|
7114
|
-
url,
|
|
7115
|
-
library: _library,
|
|
7116
|
-
version: _version,
|
|
7117
|
-
signal: _signal,
|
|
7118
|
-
...versionOptions
|
|
7119
|
-
} = options;
|
|
7120
7271
|
const allJobs = await this.getJobs();
|
|
7121
7272
|
const duplicateJobs = allJobs.filter(
|
|
7122
7273
|
(job2) => job2.library === library && (job2.version ?? "") === normalizedVersion && // Normalize null to empty string for comparison
|
|
@@ -7158,8 +7309,8 @@ class PipelineManager {
|
|
|
7158
7309
|
progressMaxPages: 0,
|
|
7159
7310
|
errorMessage: null,
|
|
7160
7311
|
updatedAt: /* @__PURE__ */ new Date(),
|
|
7161
|
-
sourceUrl: url,
|
|
7162
|
-
scraperOptions:
|
|
7312
|
+
sourceUrl: options.url,
|
|
7313
|
+
scraperOptions: options
|
|
7163
7314
|
};
|
|
7164
7315
|
this.jobMap.set(jobId, job);
|
|
7165
7316
|
this.jobQueue.push(jobId);
|
|
@@ -7174,6 +7325,78 @@ class PipelineManager {
|
|
|
7174
7325
|
}
|
|
7175
7326
|
return jobId;
|
|
7176
7327
|
}
|
|
7328
|
+
/**
|
|
7329
|
+
* Enqueues a refresh job for an existing library version by re-scraping all pages
|
|
7330
|
+
* and using ETag comparison to skip unchanged content.
|
|
7331
|
+
*
|
|
7332
|
+
* If the version was never completed (interrupted or failed scrape), performs a
|
|
7333
|
+
* full re-scrape from scratch instead of a refresh to ensure completeness.
|
|
7334
|
+
*/
|
|
7335
|
+
async enqueueRefreshJob(library, version2) {
|
|
7336
|
+
const normalizedVersion = version2 ?? "";
|
|
7337
|
+
try {
|
|
7338
|
+
const versionId = await this.store.ensureVersion({
|
|
7339
|
+
library,
|
|
7340
|
+
version: normalizedVersion
|
|
7341
|
+
});
|
|
7342
|
+
const versionInfo = await this.store.getVersionById(versionId);
|
|
7343
|
+
if (!versionInfo) {
|
|
7344
|
+
throw new Error(`Version ID ${versionId} not found`);
|
|
7345
|
+
}
|
|
7346
|
+
const libraryInfo = await this.store.getLibraryById(versionInfo.library_id);
|
|
7347
|
+
if (!libraryInfo) {
|
|
7348
|
+
throw new Error(`Library ID ${versionInfo.library_id} not found`);
|
|
7349
|
+
}
|
|
7350
|
+
if (versionInfo && versionInfo.status !== VersionStatus.COMPLETED) {
|
|
7351
|
+
logger.info(
|
|
7352
|
+
`⚠️ Version ${library}@${normalizedVersion || "unversioned"} has status "${versionInfo.status}". Performing full re-scrape instead of refresh.`
|
|
7353
|
+
);
|
|
7354
|
+
return this.enqueueJobWithStoredOptions(library, normalizedVersion);
|
|
7355
|
+
}
|
|
7356
|
+
const pages = await this.store.getPagesByVersionId(versionId);
|
|
7357
|
+
if (pages.length > 0) {
|
|
7358
|
+
logger.debug(
|
|
7359
|
+
`Sample page data: url=${pages[0].url}, etag=${pages[0].etag}, depth=${pages[0].depth}`
|
|
7360
|
+
);
|
|
7361
|
+
}
|
|
7362
|
+
if (pages.length === 0) {
|
|
7363
|
+
throw new Error(
|
|
7364
|
+
`No pages found for ${library}@${normalizedVersion || "unversioned"}. Use scrape_docs to index it first.`
|
|
7365
|
+
);
|
|
7366
|
+
}
|
|
7367
|
+
logger.info(
|
|
7368
|
+
`🔄 Preparing refresh job for ${library}@${normalizedVersion || "unversioned"} with ${pages.length} page(s)`
|
|
7369
|
+
);
|
|
7370
|
+
const initialQueue = pages.map((page) => ({
|
|
7371
|
+
url: page.url,
|
|
7372
|
+
depth: page.depth ?? 0,
|
|
7373
|
+
// Use original depth, fallback to 0 for old data
|
|
7374
|
+
pageId: page.id,
|
|
7375
|
+
etag: page.etag
|
|
7376
|
+
}));
|
|
7377
|
+
const storedOptions = await this.store.getScraperOptions(versionId);
|
|
7378
|
+
const scraperOptions = {
|
|
7379
|
+
url: storedOptions?.sourceUrl || pages[0].url,
|
|
7380
|
+
// Required but not used when initialQueue is set
|
|
7381
|
+
library,
|
|
7382
|
+
version: normalizedVersion,
|
|
7383
|
+
...storedOptions?.options || {},
|
|
7384
|
+
// Include stored options if available (spread first)
|
|
7385
|
+
// Override with refresh-specific options (these must come after the spread)
|
|
7386
|
+
initialQueue,
|
|
7387
|
+
// Pre-populated queue with existing pages
|
|
7388
|
+
isRefresh: true
|
|
7389
|
+
// Mark this as a refresh operation
|
|
7390
|
+
};
|
|
7391
|
+
logger.info(
|
|
7392
|
+
`📝 Enqueueing refresh job for ${library}@${normalizedVersion || "unversioned"}`
|
|
7393
|
+
);
|
|
7394
|
+
return this.enqueueScrapeJob(library, normalizedVersion, scraperOptions);
|
|
7395
|
+
} catch (error) {
|
|
7396
|
+
logger.error(`❌ Failed to enqueue refresh job: ${error}`);
|
|
7397
|
+
throw error;
|
|
7398
|
+
}
|
|
7399
|
+
}
|
|
7177
7400
|
/**
|
|
7178
7401
|
* Enqueues a job using stored scraper options from a previous indexing run.
|
|
7179
7402
|
* If no stored options are found, throws an error.
|
|
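
enqueueRefreshJob above pre-populates the scraper queue from the pages already stored for the version, carrying each page's original depth and ETag so unchanged pages can be skipped, and falls back to a full re-scrape when the version never completed. A rough standalone sketch of how such a queue could be built (field names follow the diff; the record types are assumed for illustration):

interface PageRecord {
  id: number;
  url: string;
  etag: string | null;
  depth: number | null;
}

interface RefreshQueueItem {
  url: string;
  depth: number;
  pageId: number;
  etag: string | null;
}

// Build the initialQueue used by a refresh job from previously indexed pages.
function buildRefreshQueue(pages: PageRecord[]): RefreshQueueItem[] {
  return pages.map((page) => ({
    url: page.url,
    depth: page.depth ?? 0, // older rows may predate the depth column
    pageId: page.id,
    etag: page.etag,        // lets the fetcher send a conditional request and skip 304s
  }));
}
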
@@ -7201,7 +7424,7 @@ class PipelineManager {
|
|
|
7201
7424
|
logger.info(
|
|
7202
7425
|
`🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${stored.sourceUrl}`
|
|
7203
7426
|
);
|
|
7204
|
-
return this.
|
|
7427
|
+
return this.enqueueScrapeJob(library, normalizedVersion, completeOptions);
|
|
7205
7428
|
} catch (error) {
|
|
7206
7429
|
logger.error(`❌ Failed to enqueue job with stored options: ${error}`);
|
|
7207
7430
|
throw error;
|
|
@@ -7418,13 +7641,7 @@ class PipelineManager {
|
|
|
7418
7641
|
await this.store.updateVersionStatus(versionId, dbStatus, errorMessage);
|
|
7419
7642
|
if (newStatus === PipelineJobStatus.QUEUED && job.scraperOptions) {
|
|
7420
7643
|
try {
|
|
7421
|
-
|
|
7422
|
-
url: job.sourceUrl ?? "",
|
|
7423
|
-
library: job.library,
|
|
7424
|
-
version: job.version,
|
|
7425
|
-
...job.scraperOptions
|
|
7426
|
-
};
|
|
7427
|
-
await this.store.storeScraperOptions(versionId, fullOptions);
|
|
7644
|
+
await this.store.storeScraperOptions(versionId, job.scraperOptions);
|
|
7428
7645
|
logger.debug(
|
|
7429
7646
|
`Stored scraper options for ${job.library}@${job.version}: ${job.sourceUrl}`
|
|
7430
7647
|
);
|
|
@@ -7882,7 +8099,7 @@ async function createPipelineWithCallbacks(docService, options = {}) {
|
|
|
7882
8099
|
},
|
|
7883
8100
|
onJobError: async (job, error, document2) => {
|
|
7884
8101
|
logger.warn(
|
|
7885
|
-
`⚠️ Job ${job.id} error ${document2 ? `on document ${document2.
|
|
8102
|
+
`⚠️ Job ${job.id} error ${document2 ? `on document ${document2.url}` : ""}: ${error.message}`
|
|
7886
8103
|
);
|
|
7887
8104
|
}
|
|
7888
8105
|
});
|
|
@@ -8113,6 +8330,45 @@ function createMcpServerInstance(tools, readOnly = false) {
|
|
|
8113
8330
|
}
|
|
8114
8331
|
}
|
|
8115
8332
|
);
|
|
8333
|
+
server.tool(
|
|
8334
|
+
"refresh_version",
|
|
8335
|
+
"Re-scrape a previously indexed library version, updating only changed pages.",
|
|
8336
|
+
{
|
|
8337
|
+
library: z.string().trim().describe("Library name."),
|
|
8338
|
+
version: z.string().trim().optional().describe("Library version (optional, refreshes unversioned if omitted).")
|
|
8339
|
+
},
|
|
8340
|
+
{
|
|
8341
|
+
title: "Refresh Library Version",
|
|
8342
|
+
destructiveHint: false,
|
|
8343
|
+
// Only updates changed content
|
|
8344
|
+
openWorldHint: true
|
|
8345
|
+
// requires internet access
|
|
8346
|
+
},
|
|
8347
|
+
async ({ library, version: version2 }) => {
|
|
8348
|
+
analytics.track(TelemetryEvent.TOOL_USED, {
|
|
8349
|
+
tool: "refresh_version",
|
|
8350
|
+
context: "mcp_server",
|
|
8351
|
+
library,
|
|
8352
|
+
version: version2
|
|
8353
|
+
});
|
|
8354
|
+
try {
|
|
8355
|
+
const result = await tools.refresh.execute({
|
|
8356
|
+
library,
|
|
8357
|
+
version: version2,
|
|
8358
|
+
waitForCompletion: false
|
|
8359
|
+
// Don't wait for completion
|
|
8360
|
+
});
|
|
8361
|
+
if ("jobId" in result) {
|
|
8362
|
+
return createResponse(`🔄 Refresh job started with ID: ${result.jobId}.`);
|
|
8363
|
+
}
|
|
8364
|
+
return createResponse(
|
|
8365
|
+
`Refresh finished immediately (unexpectedly) with ${result.pagesRefreshed} pages.`
|
|
8366
|
+
);
|
|
8367
|
+
} catch (error) {
|
|
8368
|
+
return createError(error);
|
|
8369
|
+
}
|
|
8370
|
+
}
|
|
8371
|
+
);
|
|
8116
8372
|
}
|
|
8117
8373
|
server.tool(
|
|
8118
8374
|
"search_docs",
|
|
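
The new refresh_version MCP tool accepts a library name and an optional version and enqueues the refresh without waiting for completion. At the protocol level, an MCP client would invoke it with a tools/call request roughly like the following (the library and version values are hypothetical):

// Shape of the MCP tools/call request an agent would send to invoke the new tool.
// Protocol-level illustration only, not code from the package.
const refreshRequest = {
  jsonrpc: "2.0",
  id: 1,
  method: "tools/call",
  params: {
    name: "refresh_version",
    arguments: { library: "react", version: "18.2.0" }, // version may be omitted
  },
};

Because the handler above forces waitForCompletion to false, the response normally just reports the created refresh job ID.
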
@@ -8638,7 +8894,7 @@ class FetchUrlTool {
|
|
|
8638
8894
|
logger.info("🔄 Processing content...");
|
|
8639
8895
|
let processed;
|
|
8640
8896
|
for (const pipeline of this.pipelines) {
|
|
8641
|
-
if (pipeline.canProcess(rawContent)) {
|
|
8897
|
+
if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
|
|
8642
8898
|
processed = await pipeline.process(
|
|
8643
8899
|
rawContent,
|
|
8644
8900
|
{
|
|
@@ -8673,7 +8929,7 @@ class FetchUrlTool {
|
|
|
8673
8929
|
const contentString = convertToString(rawContent.content, resolvedCharset);
|
|
8674
8930
|
return contentString;
|
|
8675
8931
|
}
|
|
8676
|
-
for (const err of processed.errors) {
|
|
8932
|
+
for (const err of processed.errors ?? []) {
|
|
8677
8933
|
logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
|
|
8678
8934
|
}
|
|
8679
8935
|
if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
|
|
@@ -8851,6 +9107,61 @@ class ListLibrariesTool {
|
|
|
8851
9107
|
return { libraries };
|
|
8852
9108
|
}
|
|
8853
9109
|
}
|
|
9110
|
+
class RefreshVersionTool {
|
|
9111
|
+
pipeline;
|
|
9112
|
+
constructor(pipeline) {
|
|
9113
|
+
this.pipeline = pipeline;
|
|
9114
|
+
}
|
|
9115
|
+
async execute(options) {
|
|
9116
|
+
const { library, version: version2, waitForCompletion = true } = options;
|
|
9117
|
+
let internalVersion;
|
|
9118
|
+
const partialVersionRegex = /^\d+(\.\d+)?$/;
|
|
9119
|
+
if (version2 === null || version2 === void 0) {
|
|
9120
|
+
internalVersion = "";
|
|
9121
|
+
} else {
|
|
9122
|
+
const validFullVersion = semver.valid(version2);
|
|
9123
|
+
if (validFullVersion) {
|
|
9124
|
+
internalVersion = validFullVersion;
|
|
9125
|
+
} else if (partialVersionRegex.test(version2)) {
|
|
9126
|
+
const coercedVersion = semver.coerce(version2);
|
|
9127
|
+
if (coercedVersion) {
|
|
9128
|
+
internalVersion = coercedVersion.version;
|
|
9129
|
+
} else {
|
|
9130
|
+
throw new ValidationError(
|
|
9131
|
+
`Invalid version format for refreshing: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
|
|
9132
|
+
"RefreshVersionTool"
|
|
9133
|
+
);
|
|
9134
|
+
}
|
|
9135
|
+
} else {
|
|
9136
|
+
throw new ValidationError(
|
|
9137
|
+
`Invalid version format for refreshing: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
|
|
9138
|
+
"RefreshVersionTool"
|
|
9139
|
+
);
|
|
9140
|
+
}
|
|
9141
|
+
}
|
|
9142
|
+
internalVersion = internalVersion.toLowerCase();
|
|
9143
|
+
const pipeline = this.pipeline;
|
|
9144
|
+
const refreshVersion = internalVersion === "" ? null : internalVersion;
|
|
9145
|
+
const jobId = await pipeline.enqueueRefreshJob(library, refreshVersion);
|
|
9146
|
+
if (waitForCompletion) {
|
|
9147
|
+
try {
|
|
9148
|
+
await pipeline.waitForJobCompletion(jobId);
|
|
9149
|
+
const finalJob = await pipeline.getJob(jobId);
|
|
9150
|
+
const finalPagesRefreshed = finalJob?.progress?.pagesScraped ?? 0;
|
|
9151
|
+
logger.debug(
|
|
9152
|
+
`Refresh job ${jobId} finished with status ${finalJob?.status}. Pages refreshed: ${finalPagesRefreshed}`
|
|
9153
|
+
);
|
|
9154
|
+
return {
|
|
9155
|
+
pagesRefreshed: finalPagesRefreshed
|
|
9156
|
+
};
|
|
9157
|
+
} catch (error) {
|
|
9158
|
+
logger.error(`❌ Refresh job ${jobId} failed or was cancelled: ${error}`);
|
|
9159
|
+
throw error;
|
|
9160
|
+
}
|
|
9161
|
+
}
|
|
9162
|
+
return { jobId };
|
|
9163
|
+
}
|
|
9164
|
+
}
|
|
8854
9165
|
class RemoveTool {
|
|
8855
9166
|
constructor(documentManagementService, pipeline) {
|
|
8856
9167
|
this.documentManagementService = documentManagementService;
|
|
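
RefreshVersionTool normalizes the requested version before enqueueing: a full semver string is used as-is, partial versions such as "18" or "18.2" are coerced, and anything else is rejected with a ValidationError. A small self-contained sketch of that normalization using the semver package (simplified error handling, not the tool's exact code):

import semver from "semver";

// Normalize a user-supplied version the way the tool above does.
// Returns "" for an omitted version (the unversioned index).
function normalizeVersion(version?: string | null): string {
  if (version === null || version === undefined) return "";
  const full = semver.valid(version);
  if (full) return full.toLowerCase();
  if (/^\d+(\.\d+)?$/.test(version)) {
    const coerced = semver.coerce(version); // "18" -> "18.0.0", "18.2" -> "18.2.0"
    if (coerced) return coerced.version.toLowerCase();
  }
  throw new Error(
    `Invalid version format: '${version}'. Use 'X.Y.Z', 'X.Y', 'X', or omit.`,
  );
}
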
@@ -8871,19 +9182,7 @@ class RemoveTool {
|
|
|
8871
9182
|
}
|
|
8872
9183
|
logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
|
|
8873
9184
|
try {
|
|
8874
|
-
|
|
8875
|
-
library,
|
|
8876
|
-
version2
|
|
8877
|
-
);
|
|
8878
|
-
const normalizedVersion = version2 && version2.trim() !== "" ? version2 : null;
|
|
8879
|
-
const versionExists = result.bestMatch === normalizedVersion || result.hasUnversioned && normalizedVersion === null;
|
|
8880
|
-
if (!versionExists) {
|
|
8881
|
-
const versionText = normalizedVersion ? `Version ${normalizedVersion}` : "Version";
|
|
8882
|
-
throw new ToolError(
|
|
8883
|
-
`${versionText} not found for library ${library}. Cannot remove non-existent version.`,
|
|
8884
|
-
this.constructor.name
|
|
8885
|
-
);
|
|
8886
|
-
}
|
|
9185
|
+
await this.documentManagementService.validateLibraryExists(library);
|
|
8887
9186
|
const allJobs = await this.pipeline.getJobs();
|
|
8888
9187
|
const jobs = allJobs.filter(
|
|
8889
9188
|
(job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
|
|
@@ -8950,7 +9249,7 @@ class ScrapeTool {
|
|
|
8950
9249
|
internalVersion = internalVersion.toLowerCase();
|
|
8951
9250
|
const pipeline = this.pipeline;
|
|
8952
9251
|
const enqueueVersion = internalVersion === "" ? null : internalVersion;
|
|
8953
|
-
const jobId = await pipeline.
|
|
9252
|
+
const jobId = await pipeline.enqueueScrapeJob(library, enqueueVersion, {
|
|
8954
9253
|
url,
|
|
8955
9254
|
library,
|
|
8956
9255
|
version: internalVersion,
|
|
@@ -8997,7 +9296,18 @@ class DocumentManagementClient {
|
|
|
8997
9296
|
logger.debug(`DocumentManagementClient (tRPC) created for: ${this.baseUrl}`);
|
|
8998
9297
|
}
|
|
8999
9298
|
async initialize() {
|
|
9000
|
-
|
|
9299
|
+
try {
|
|
9300
|
+
await this.client.ping.query();
|
|
9301
|
+
} catch (error) {
|
|
9302
|
+
logger.debug(
|
|
9303
|
+
`Failed to connect to DocumentManagement server at ${this.baseUrl}: ${error}`
|
|
9304
|
+
);
|
|
9305
|
+
throw new Error(
|
|
9306
|
+
`Failed to connect to server at ${this.baseUrl}.
|
|
9307
|
+
|
|
9308
|
+
Please verify the server URL includes the correct port (default 8080) and ends with '/api' (e.g., 'http://localhost:8080/api').`
|
|
9309
|
+
);
|
|
9310
|
+
}
|
|
9001
9311
|
}
|
|
9002
9312
|
async shutdown() {
|
|
9003
9313
|
}
|
|
@@ -9069,7 +9379,7 @@ class HierarchicalAssemblyStrategy {
|
|
|
9069
9379
|
try {
|
|
9070
9380
|
const chunksByDocument = /* @__PURE__ */ new Map();
|
|
9071
9381
|
for (const chunk of initialChunks) {
|
|
9072
|
-
const url = chunk.
|
|
9382
|
+
const url = chunk.url;
|
|
9073
9383
|
if (!chunksByDocument.has(url)) {
|
|
9074
9384
|
chunksByDocument.set(url, []);
|
|
9075
9385
|
}
|
|
@@ -9163,10 +9473,10 @@ class HierarchicalAssemblyStrategy {
|
|
|
9163
9473
|
if (debug) {
|
|
9164
9474
|
return chunks.map(
|
|
9165
9475
|
(chunk) => `=== #${chunk.id} ${chunk.metadata.path?.join("/")} [${chunk.metadata.level}] ===
|
|
9166
|
-
` + chunk.
|
|
9476
|
+
` + chunk.content
|
|
9167
9477
|
).join("");
|
|
9168
9478
|
}
|
|
9169
|
-
return chunks.map((chunk) => chunk.
|
|
9479
|
+
return chunks.map((chunk) => chunk.content).join("");
|
|
9170
9480
|
}
|
|
9171
9481
|
/**
|
|
9172
9482
|
* Walks up the parent hierarchy from a chunk to collect the complete parent chain.
|
|
@@ -9192,42 +9502,17 @@ class HierarchicalAssemblyStrategy {
|
|
|
9192
9502
|
visited.add(currentId);
|
|
9193
9503
|
chainIds.push(currentId);
|
|
9194
9504
|
depth++;
|
|
9195
|
-
|
|
9196
|
-
|
|
9505
|
+
let parentChunk = await documentStore.findParentChunk(library, version2, currentId);
|
|
9506
|
+
if (!parentChunk) {
|
|
9507
|
+
parentChunk = await this.findAncestorWithGaps(
|
|
9197
9508
|
library,
|
|
9198
9509
|
version2,
|
|
9199
|
-
|
|
9510
|
+
currentChunk.url,
|
|
9511
|
+
currentChunk.metadata.path ?? [],
|
|
9512
|
+
documentStore
|
|
9200
9513
|
);
|
|
9201
|
-
if (parentChunk) {
|
|
9202
|
-
currentChunk = parentChunk;
|
|
9203
|
-
} else {
|
|
9204
|
-
currentChunk = await this.findAncestorWithGaps(
|
|
9205
|
-
library,
|
|
9206
|
-
version2,
|
|
9207
|
-
currentChunk.metadata,
|
|
9208
|
-
documentStore
|
|
9209
|
-
);
|
|
9210
|
-
}
|
|
9211
|
-
} catch (error) {
|
|
9212
|
-
try {
|
|
9213
|
-
const currentMetadata = currentChunk?.metadata;
|
|
9214
|
-
if (currentMetadata) {
|
|
9215
|
-
currentChunk = await this.findAncestorWithGaps(
|
|
9216
|
-
library,
|
|
9217
|
-
version2,
|
|
9218
|
-
currentMetadata,
|
|
9219
|
-
documentStore
|
|
9220
|
-
);
|
|
9221
|
-
} else {
|
|
9222
|
-
currentChunk = null;
|
|
9223
|
-
}
|
|
9224
|
-
} catch (gapError) {
|
|
9225
|
-
logger.warn(
|
|
9226
|
-
`Parent lookup failed for chunk ${currentId}: ${error}. Gap search also failed: ${gapError}`
|
|
9227
|
-
);
|
|
9228
|
-
break;
|
|
9229
|
-
}
|
|
9230
9514
|
}
|
|
9515
|
+
currentChunk = parentChunk;
|
|
9231
9516
|
}
|
|
9232
9517
|
if (depth >= maxDepth) {
|
|
9233
9518
|
logger.warn(
|
|
@@ -9240,9 +9525,7 @@ class HierarchicalAssemblyStrategy {
|
|
|
9240
9525
|
* Attempts to find ancestors when there are gaps in the hierarchy.
|
|
9241
9526
|
* Tries progressively shorter path prefixes to find existing ancestor chunks.
|
|
9242
9527
|
*/
|
|
9243
|
-
async findAncestorWithGaps(library, version2,
|
|
9244
|
-
const path2 = metadata.path || [];
|
|
9245
|
-
const url = metadata.url;
|
|
9528
|
+
async findAncestorWithGaps(library, version2, url, path2, documentStore) {
|
|
9246
9529
|
if (path2.length <= 1) {
|
|
9247
9530
|
return null;
|
|
9248
9531
|
}
|
|
@@ -9279,7 +9562,7 @@ class HierarchicalAssemblyStrategy {
|
|
|
9279
9562
|
}
|
|
9280
9563
|
const matchingChunks = allChunks.filter((chunk) => {
|
|
9281
9564
|
const chunkPath = chunk.metadata.path || [];
|
|
9282
|
-
const chunkUrl = chunk.
|
|
9565
|
+
const chunkUrl = chunk.url;
|
|
9283
9566
|
if (chunkUrl !== url) return false;
|
|
9284
9567
|
if (chunkPath.length !== targetPath.length) return false;
|
|
9285
9568
|
return chunkPath.every((part, index) => part === targetPath[index]);
|
|
@@ -9301,11 +9584,7 @@ class HierarchicalAssemblyStrategy {
|
|
|
9301
9584
|
return current;
|
|
9302
9585
|
}
|
|
9303
9586
|
while (true) {
|
|
9304
|
-
const parent = await documentStore.findParentChunk(
|
|
9305
|
-
library,
|
|
9306
|
-
version2,
|
|
9307
|
-
current.id
|
|
9308
|
-
);
|
|
9587
|
+
const parent = await documentStore.findParentChunk(library, version2, current.id);
|
|
9309
9588
|
if (!parent) {
|
|
9310
9589
|
return null;
|
|
9311
9590
|
}
|
|
@@ -9387,7 +9666,7 @@ class HierarchicalAssemblyStrategy {
|
|
|
9387
9666
|
const ancestorChunks = await this.findChunksByExactPath(
|
|
9388
9667
|
library,
|
|
9389
9668
|
version2,
|
|
9390
|
-
referenceChunk.
|
|
9669
|
+
referenceChunk.url,
|
|
9391
9670
|
ancestorPath,
|
|
9392
9671
|
documentStore
|
|
9393
9672
|
);
|
|
@@ -9465,13 +9744,9 @@ class HierarchicalAssemblyStrategy {
|
|
|
9465
9744
|
for (const chunk of initialChunks) {
|
|
9466
9745
|
const id = chunk.id;
|
|
9467
9746
|
chunkIds.add(id);
|
|
9468
|
-
|
|
9469
|
-
|
|
9470
|
-
|
|
9471
|
-
chunkIds.add(parent.id);
|
|
9472
|
-
}
|
|
9473
|
-
} catch (error) {
|
|
9474
|
-
logger.warn(`Failed to find parent for chunk ${id}: ${error}`);
|
|
9747
|
+
const parent = await documentStore.findParentChunk(library, version2, id);
|
|
9748
|
+
if (parent) {
|
|
9749
|
+
chunkIds.add(parent.id);
|
|
9475
9750
|
}
|
|
9476
9751
|
try {
|
|
9477
9752
|
const children = await documentStore.findChildChunks(library, version2, id, 3);
|
|
@@ -9539,7 +9814,7 @@ class MarkdownAssemblyStrategy {
|
|
|
9539
9814
|
* Assembles chunks using simple "\n\n" joining (current behavior).
|
|
9540
9815
|
*/
|
|
9541
9816
|
assembleContent(chunks) {
|
|
9542
|
-
return chunks.map((chunk) => chunk.
|
|
9817
|
+
return chunks.map((chunk) => chunk.content).join("\n\n");
|
|
9543
9818
|
}
|
|
9544
9819
|
/**
|
|
9545
9820
|
* Collects related chunk IDs for a single chunk using current context expansion logic.
|
|
@@ -9638,7 +9913,7 @@ class DocumentRetrieverService {
|
|
|
9638
9913
|
groupResultsByUrl(results) {
|
|
9639
9914
|
const resultsByUrl = /* @__PURE__ */ new Map();
|
|
9640
9915
|
for (const result of results) {
|
|
9641
|
-
const url = result.
|
|
9916
|
+
const url = result.url;
|
|
9642
9917
|
if (!resultsByUrl.has(url)) {
|
|
9643
9918
|
resultsByUrl.set(url, []);
|
|
9644
9919
|
}
|
|
@@ -9653,10 +9928,8 @@ class DocumentRetrieverService {
|
|
|
9653
9928
|
* Processes a group of search results from the same URL using appropriate strategy.
|
|
9654
9929
|
*/
|
|
9655
9930
|
async processUrlGroup(library, version2, url, initialChunks) {
|
|
9656
|
-
const mimeType = initialChunks.length > 0 ? initialChunks[0].
|
|
9657
|
-
const maxScore = Math.max(
|
|
9658
|
-
...initialChunks.map((chunk) => chunk.metadata.score)
|
|
9659
|
-
);
|
|
9931
|
+
const mimeType = initialChunks.length > 0 ? initialChunks[0].content_type : void 0;
|
|
9932
|
+
const maxScore = Math.max(...initialChunks.map((chunk) => chunk.score));
|
|
9660
9933
|
const strategy = createContentAssemblyStrategy(mimeType);
|
|
9661
9934
|
const selectedChunks = await strategy.selectChunks(
|
|
9662
9935
|
library,
|
|
@@ -9845,7 +10118,7 @@ class DocumentStore {
|
|
|
9845
10118
|
prepareStatements() {
|
|
9846
10119
|
const statements = {
|
|
9847
10120
|
getById: this.db.prepare(
|
|
9848
|
-
`SELECT d
|
|
10121
|
+
`SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type
|
|
9849
10122
|
FROM documents d
|
|
9850
10123
|
JOIN pages p ON d.page_id = p.id
|
|
9851
10124
|
WHERE d.id = ?`
|
|
@@ -9858,7 +10131,7 @@ class DocumentStore {
|
|
|
9858
10131
|
"UPDATE documents SET embedding = ? WHERE id = ?"
|
|
9859
10132
|
),
|
|
9860
10133
|
insertPage: this.db.prepare(
|
|
9861
|
-
"INSERT INTO pages (version_id, url, title, etag, last_modified, content_type) VALUES (?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type"
|
|
10134
|
+
"INSERT INTO pages (version_id, url, title, etag, last_modified, content_type, depth) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type, etag = excluded.etag, last_modified = excluded.last_modified, depth = excluded.depth"
|
|
9862
10135
|
),
|
|
9863
10136
|
getPageId: this.db.prepare(
|
|
9864
10137
|
"SELECT id FROM pages WHERE version_id = ? AND url = ?"
|
|
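
The insertPage statement now also writes the depth column introduced by migration 010-add-depth-to-pages.sql, and on conflict refreshes etag, last_modified and depth alongside title and content_type, so re-scraped pages keep accurate cache metadata. A standalone better-sqlite3 sketch of the same upsert; the table layout is inferred from the statements in this diff and is only illustrative:

import Database from "better-sqlite3";

const db = new Database(":memory:");
db.exec(`CREATE TABLE pages (
  id INTEGER PRIMARY KEY,
  version_id INTEGER NOT NULL,
  url TEXT NOT NULL,
  title TEXT,
  etag TEXT,
  last_modified TEXT,
  content_type TEXT,
  depth INTEGER,
  UNIQUE(version_id, url)
)`);

// Same upsert shape as the prepared statement above: re-scraping an existing URL
// refreshes its title, content type, cache validators and crawl depth.
const insertPage = db.prepare(
  `INSERT INTO pages (version_id, url, title, etag, last_modified, content_type, depth)
   VALUES (?, ?, ?, ?, ?, ?, ?)
   ON CONFLICT(version_id, url) DO UPDATE SET
     title = excluded.title,
     content_type = excluded.content_type,
     etag = excluded.etag,
     last_modified = excluded.last_modified,
     depth = excluded.depth`,
);
insertPage.run(1, "https://example.com/docs/intro", "Intro", 'W/"abc"', null, "text/html", 0);
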
@@ -9869,12 +10142,13 @@ class DocumentStore {
|
|
|
9869
10142
|
getLibraryIdByName: this.db.prepare(
|
|
9870
10143
|
"SELECT id FROM libraries WHERE name = ?"
|
|
9871
10144
|
),
|
|
10145
|
+
getLibraryById: this.db.prepare("SELECT * FROM libraries WHERE id = ?"),
|
|
9872
10146
|
// New version-related statements
|
|
9873
10147
|
insertVersion: this.db.prepare(
|
|
9874
10148
|
"INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
|
|
9875
10149
|
),
|
|
9876
10150
|
resolveVersionId: this.db.prepare(
|
|
9877
|
-
"SELECT id FROM versions WHERE library_id = ? AND name
|
|
10151
|
+
"SELECT id FROM versions WHERE library_id = ? AND name = ?"
|
|
9878
10152
|
),
|
|
9879
10153
|
getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
|
|
9880
10154
|
queryVersionsByLibraryId: this.db.prepare(
|
|
@@ -9889,13 +10163,16 @@ class DocumentStore {
|
|
|
9889
10163
|
WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
9890
10164
|
)`
|
|
9891
10165
|
),
|
|
9892
|
-
|
|
9893
|
-
|
|
9894
|
-
|
|
9895
|
-
|
|
9896
|
-
|
|
10166
|
+
deleteDocumentsByPageId: this.db.prepare(
|
|
10167
|
+
"DELETE FROM documents WHERE page_id = ?"
|
|
10168
|
+
),
|
|
10169
|
+
deletePage: this.db.prepare("DELETE FROM pages WHERE id = ?"),
|
|
10170
|
+
deletePages: this.db.prepare(
|
|
10171
|
+
`DELETE FROM pages
|
|
10172
|
+
WHERE version_id IN (
|
|
10173
|
+
SELECT v.id FROM versions v
|
|
9897
10174
|
JOIN libraries l ON v.library_id = l.id
|
|
9898
|
-
WHERE
|
|
10175
|
+
WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
|
|
9899
10176
|
)`
|
|
9900
10177
|
),
|
|
9901
10178
|
getDocumentBySort: this.db.prepare(
|
|
@@ -9945,7 +10222,7 @@ class DocumentStore {
|
|
|
9945
10222
|
ORDER BY l.name, version`
|
|
9946
10223
|
),
|
|
9947
10224
|
getChildChunks: this.db.prepare(`
|
|
9948
|
-
SELECT d
|
|
10225
|
+
SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
|
|
9949
10226
|
JOIN pages p ON d.page_id = p.id
|
|
9950
10227
|
JOIN versions v ON p.version_id = v.id
|
|
9951
10228
|
JOIN libraries l ON v.library_id = l.id
|
|
@@ -9959,7 +10236,7 @@ class DocumentStore {
|
|
|
9959
10236
|
LIMIT ?
|
|
9960
10237
|
`),
|
|
9961
10238
|
getPrecedingSiblings: this.db.prepare(`
|
|
9962
|
-
SELECT d
|
|
10239
|
+
SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
|
|
9963
10240
|
JOIN pages p ON d.page_id = p.id
|
|
9964
10241
|
JOIN versions v ON p.version_id = v.id
|
|
9965
10242
|
JOIN libraries l ON v.library_id = l.id
|
|
@@ -9972,7 +10249,7 @@ class DocumentStore {
|
|
|
9972
10249
|
LIMIT ?
|
|
9973
10250
|
`),
|
|
9974
10251
|
getSubsequentSiblings: this.db.prepare(`
|
|
9975
|
-
SELECT d
|
|
10252
|
+
SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
|
|
9976
10253
|
JOIN pages p ON d.page_id = p.id
|
|
9977
10254
|
JOIN versions v ON p.version_id = v.id
|
|
9978
10255
|
JOIN libraries l ON v.library_id = l.id
|
|
@@ -9985,7 +10262,7 @@ class DocumentStore {
|
|
|
9985
10262
|
LIMIT ?
|
|
9986
10263
|
`),
|
|
9987
10264
|
getParentChunk: this.db.prepare(`
|
|
9988
|
-
SELECT d
|
|
10265
|
+
SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
|
|
9989
10266
|
JOIN pages p ON d.page_id = p.id
|
|
9990
10267
|
JOIN versions v ON p.version_id = v.id
|
|
9991
10268
|
JOIN libraries l ON v.library_id = l.id
|
|
@@ -10027,6 +10304,9 @@ class DocumentStore {
|
|
|
10027
10304
|
`SELECT v.id, v.library_id FROM versions v
|
|
10028
10305
|
JOIN libraries l ON v.library_id = l.id
|
|
10029
10306
|
WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`
|
|
10307
|
+
),
|
|
10308
|
+
getPagesByVersionId: this.db.prepare(
|
|
10309
|
+
"SELECT * FROM pages WHERE version_id = ?"
|
|
10030
10310
|
)
|
|
10031
10311
|
};
|
|
10032
10312
|
this.statements = statements;
|
|
@@ -10168,7 +10448,7 @@ class DocumentStore {
|
|
|
10168
10448
|
this.statements.insertVersion.run(libraryId, normalizedVersion);
|
|
10169
10449
|
const versionIdRow = this.statements.resolveVersionId.get(
|
|
10170
10450
|
libraryId,
|
|
10171
|
-
normalizedVersion
|
|
10451
|
+
normalizedVersion
|
|
10172
10452
|
);
|
|
10173
10453
|
if (!versionIdRow || typeof versionIdRow.id !== "number") {
|
|
10174
10454
|
throw new StoreError(
|
|
@@ -10230,6 +10510,32 @@ class DocumentStore {
|
|
|
10230
10510
|
throw new StoreError(`Failed to get versions by status: ${error}`);
|
|
10231
10511
|
}
|
|
10232
10512
|
}
|
|
10513
|
+
/**
|
|
10514
|
+
* Retrieves a version by its ID.
|
|
10515
|
+
* @param versionId The version ID to retrieve
|
|
10516
|
+
* @returns The version record, or null if not found
|
|
10517
|
+
*/
|
|
10518
|
+
async getVersionById(versionId) {
|
|
10519
|
+
try {
|
|
10520
|
+
const row = this.statements.getVersionById.get(versionId);
|
|
10521
|
+
return row || null;
|
|
10522
|
+
} catch (error) {
|
|
10523
|
+
throw new StoreError(`Failed to get version by ID: ${error}`);
|
|
10524
|
+
}
|
|
10525
|
+
}
|
|
10526
|
+
/**
|
|
10527
|
+
* Retrieves a library by its ID.
|
|
10528
|
+
* @param libraryId The library ID to retrieve
|
|
10529
|
+
* @returns The library record, or null if not found
|
|
10530
|
+
*/
|
|
10531
|
+
async getLibraryById(libraryId) {
|
|
10532
|
+
try {
|
|
10533
|
+
const row = this.statements.getLibraryById.get(libraryId);
|
|
10534
|
+
return row || null;
|
|
10535
|
+
} catch (error) {
|
|
10536
|
+
throw new StoreError(`Failed to get library by ID: ${error}`);
|
|
10537
|
+
}
|
|
10538
|
+
}
|
|
10233
10539
|
/**
|
|
10234
10540
|
* Stores scraper options for a version to enable reproducible indexing.
|
|
10235
10541
|
* @param versionId The version ID to update
|
|
@@ -10237,7 +10543,15 @@ class DocumentStore {
|
|
|
10237
10543
|
*/
|
|
10238
10544
|
async storeScraperOptions(versionId, options) {
|
|
10239
10545
|
try {
|
|
10240
|
-
const {
|
|
10546
|
+
const {
|
|
10547
|
+
url: source_url,
|
|
10548
|
+
library: _library,
|
|
10549
|
+
version: _version,
|
|
10550
|
+
signal: _signal,
|
|
10551
|
+
initialQueue: _initialQueue,
|
|
10552
|
+
isRefresh: _isRefresh,
|
|
10553
|
+
...scraper_options
|
|
10554
|
+
} = options;
|
|
10241
10555
|
const optionsJson = JSON.stringify(scraper_options);
|
|
10242
10556
|
this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
|
|
10243
10557
|
} catch (error) {
|
|
@@ -10348,36 +10662,96 @@ class DocumentStore {
|
|
|
10348
10662
|
throw new ConnectionError("Failed to query library versions", error);
|
|
10349
10663
|
}
|
|
10350
10664
|
}
|
|
10665
|
+
/**
|
|
10666
|
+
* Helper method to detect if an error is related to input size limits.
|
|
10667
|
+
* Checks for common error messages from various embedding providers.
|
|
10668
|
+
*/
|
|
10669
|
+
isInputSizeError(error) {
|
|
10670
|
+
if (!(error instanceof Error)) return false;
|
|
10671
|
+
const message = error.message.toLowerCase();
|
|
10672
|
+
return message.includes("maximum context length") || message.includes("too long") || message.includes("token limit") || message.includes("input is too large") || message.includes("exceeds") || message.includes("max") && message.includes("token");
|
|
10673
|
+
}
|
|
10674
|
+
/**
|
|
10675
|
+
* Creates embeddings for an array of texts with automatic retry logic for size-related errors.
|
|
10676
|
+
* If a batch fails due to size limits:
|
|
10677
|
+
* - Batches with multiple texts are split in half and retried recursively
|
|
10678
|
+
* - Single texts that are too large are truncated and retried once
|
|
10679
|
+
*
|
|
10680
|
+
* @param texts Array of texts to embed
|
|
10681
|
+
* @returns Array of embedding vectors
|
|
10682
|
+
*/
|
|
10683
|
+
async embedDocumentsWithRetry(texts) {
|
|
10684
|
+
if (texts.length === 0) {
|
|
10685
|
+
return [];
|
|
10686
|
+
}
|
|
10687
|
+
try {
|
|
10688
|
+
return await this.embeddings.embedDocuments(texts);
|
|
10689
|
+
} catch (error) {
|
|
10690
|
+
if (this.isInputSizeError(error)) {
|
|
10691
|
+
if (texts.length > 1) {
|
|
10692
|
+
const midpoint = Math.floor(texts.length / 2);
|
|
10693
|
+
const firstHalf = texts.slice(0, midpoint);
|
|
10694
|
+
const secondHalf = texts.slice(midpoint);
|
|
10695
|
+
logger.warn(
|
|
10696
|
+
`⚠️ Batch of ${texts.length} texts exceeded size limit, splitting into ${firstHalf.length} + ${secondHalf.length}`
|
|
10697
|
+
);
|
|
10698
|
+
const [firstEmbeddings, secondEmbeddings] = await Promise.all([
|
|
10699
|
+
this.embedDocumentsWithRetry(firstHalf),
|
|
10700
|
+
this.embedDocumentsWithRetry(secondHalf)
|
|
10701
|
+
]);
|
|
10702
|
+
return [...firstEmbeddings, ...secondEmbeddings];
|
|
10703
|
+
} else {
|
|
10704
|
+
const text = texts[0];
|
|
10705
|
+
const midpoint = Math.floor(text.length / 2);
|
|
10706
|
+
const firstHalf = text.substring(0, midpoint);
|
|
10707
|
+
logger.warn(
|
|
10708
|
+
`⚠️ Single text exceeded embedding size limit (${text.length} chars). Truncating at ${firstHalf.length} chars.`
|
|
10709
|
+
);
|
|
10710
|
+
try {
|
|
10711
|
+
const embedding = await this.embedDocumentsWithRetry([firstHalf]);
|
|
10712
|
+
logger.info(
|
|
10713
|
+
`✓ Using embedding from first half of split text (${firstHalf.length} chars)`
|
|
10714
|
+
);
|
|
10715
|
+
return embedding;
|
|
10716
|
+
} catch (retryError) {
|
|
10717
|
+
logger.error(
|
|
10718
|
+
`❌ Failed to embed even after splitting. Original length: ${text.length}`
|
|
10719
|
+
);
|
|
10720
|
+
throw retryError;
|
|
10721
|
+
}
|
|
10722
|
+
}
|
|
10723
|
+
}
|
|
10724
|
+
throw error;
|
|
10725
|
+
}
|
|
10726
|
+
}
|
|
10351
10727
|
/**
|
|
10352
10728
|
* Stores documents with library and version metadata, generating embeddings
|
|
10353
10729
|
* for vector similarity search. Uses the new pages table to normalize page-level
|
|
10354
10730
|
* metadata and avoid duplication across document chunks.
|
|
10355
10731
|
*/
|
|
10356
|
-
async addDocuments(library, version2,
|
|
10732
|
+
async addDocuments(library, version2, depth, result) {
|
|
10357
10733
|
try {
|
|
10358
|
-
|
|
10734
|
+
const { title, url, chunks } = result;
|
|
10735
|
+
if (chunks.length === 0) {
|
|
10359
10736
|
return;
|
|
10360
10737
|
}
|
|
10361
|
-
const documentsByUrl = /* @__PURE__ */ new Map();
|
|
10362
|
-
for (const doc of documents) {
|
|
10363
|
-
const url = doc.metadata.url;
|
|
10364
|
-
if (!url || typeof url !== "string" || !url.trim()) {
|
|
10365
|
-
throw new StoreError("Document metadata must include a valid URL");
|
|
10366
|
-
}
|
|
10367
|
-
if (!documentsByUrl.has(url)) {
|
|
10368
|
-
documentsByUrl.set(url, []);
|
|
10369
|
-
}
|
|
10370
|
-
documentsByUrl.get(url)?.push(doc);
|
|
10371
|
-
}
|
|
10372
10738
|
let paddedEmbeddings = [];
|
|
10373
10739
|
if (this.isVectorSearchEnabled) {
|
|
10374
|
-
const texts =
|
|
10375
|
-
const header = `<title>${
|
|
10376
|
-
<url>${
|
|
10377
|
-
<path>${(
|
|
10740
|
+
const texts = chunks.map((chunk) => {
|
|
10741
|
+
const header = `<title>${title}</title>
|
|
10742
|
+
<url>${url}</url>
|
|
10743
|
+
<path>${(chunk.section.path || []).join(" / ")}</path>
|
|
10378
10744
|
`;
|
|
10379
|
-
return `${header}${
|
|
10745
|
+
return `${header}${chunk.content}`;
|
|
10380
10746
|
});
|
|
10747
|
+
for (let i = 0; i < texts.length; i++) {
|
|
10748
|
+
const textSize = texts[i].length;
|
|
10749
|
+
if (textSize > SPLITTER_MAX_CHUNK_SIZE) {
|
|
10750
|
+
logger.warn(
|
|
10751
|
+
`⚠️ Chunk ${i + 1}/${texts.length} exceeds max size: ${textSize} > ${SPLITTER_MAX_CHUNK_SIZE} chars (URL: ${url})`
|
|
10752
|
+
);
|
|
10753
|
+
}
|
|
10754
|
+
}
|
|
10381
10755
|
const maxBatchChars = EMBEDDING_BATCH_CHARS;
|
|
10382
10756
|
const rawEmbeddings = [];
|
|
10383
10757
|
let currentBatch = [];
|
|
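
embedDocumentsWithRetry, added above, works around provider input-size limits: a failing batch with several texts is split in half and each half is retried recursively, while a single oversized text is truncated to its first half and retried. A self-contained sketch of the same strategy with a pluggable embed function; the error check is a simplified stand-in for isInputSizeError:

type EmbedFn = (texts: string[]) => Promise<number[][]>;

// Heuristic stand-in for isInputSizeError in the diff above.
function looksLikeSizeError(error: unknown): boolean {
  return (
    error instanceof Error &&
    /maximum context length|too long|token limit|input is too large/i.test(error.message)
  );
}

async function embedWithRetry(embed: EmbedFn, texts: string[]): Promise<number[][]> {
  if (texts.length === 0) return [];
  try {
    return await embed(texts);
  } catch (error) {
    if (!looksLikeSizeError(error)) throw error;
    if (texts.length > 1) {
      // Split the batch in half and embed each half independently.
      const mid = Math.floor(texts.length / 2);
      const [a, b] = await Promise.all([
        embedWithRetry(embed, texts.slice(0, mid)),
        embedWithRetry(embed, texts.slice(mid)),
      ]);
      return [...a, ...b];
    }
    // A single text is still too large: keep only its first half and retry.
    return embedWithRetry(embed, [texts[0].substring(0, Math.floor(texts[0].length / 2))]);
  }
}
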
@@ -10390,7 +10764,7 @@ class DocumentStore {
|
|
|
10390
10764
|
logger.debug(
|
|
10391
10765
|
`Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
|
|
10392
10766
|
);
|
|
10393
|
-
const batchEmbeddings = await this.
|
|
10767
|
+
const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
|
|
10394
10768
|
rawEmbeddings.push(...batchEmbeddings);
|
|
10395
10769
|
currentBatch = [];
|
|
10396
10770
|
currentBatchSize = 0;
|
|
@@ -10402,7 +10776,7 @@ class DocumentStore {
|
|
|
10402
10776
|
logger.debug(
|
|
10403
10777
|
`Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
|
|
10404
10778
|
);
|
|
10405
|
-
const batchEmbeddings = await this.
|
|
10779
|
+
const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
|
|
10406
10780
|
rawEmbeddings.push(...batchEmbeddings);
|
|
10407
10781
|
currentBatch = [];
|
|
10408
10782
|
currentBatchSize = 0;
|
|
@@ -10413,110 +10787,115 @@ class DocumentStore {
|
|
|
10413
10787
|
logger.debug(
|
|
10414
10788
|
`Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
|
|
10415
10789
|
);
|
|
10416
|
-
const batchEmbeddings = await this.
|
|
10790
|
+
const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
|
|
10417
10791
|
rawEmbeddings.push(...batchEmbeddings);
|
|
10418
10792
|
}
|
|
10419
10793
|
paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
|
|
10420
10794
|
}
|
|
10421
10795
|
const versionId = await this.resolveVersionId(library, version2);
|
|
10422
|
-
|
|
10423
|
-
|
|
10424
|
-
|
|
10425
|
-
|
|
10426
|
-
|
|
10427
|
-
|
|
10428
|
-
|
|
10429
|
-
|
|
10430
|
-
|
|
10431
|
-
|
|
10432
|
-
|
|
10433
|
-
|
|
10434
|
-
|
|
10435
|
-
|
|
10436
|
-
|
|
10437
|
-
|
|
10438
|
-
|
|
10439
|
-
|
|
10440
|
-
|
|
10441
|
-
|
|
10442
|
-
|
|
10443
|
-
|
|
10444
|
-
|
|
10445
|
-
if (!existingPage) {
|
|
10446
|
-
throw new StoreError(`Failed to get page ID for URL: ${url}`);
|
|
10447
|
-
}
|
|
10448
|
-
const pageId = existingPage.id;
|
|
10449
|
-
pageIds.set(url, pageId);
|
|
10796
|
+
const existingPage = this.statements.getPageId.get(versionId, url);
|
|
10797
|
+
if (existingPage) {
|
|
10798
|
+
const result2 = this.statements.deleteDocumentsByPageId.run(existingPage.id);
|
|
10799
|
+
if (result2.changes > 0) {
|
|
10800
|
+
logger.debug(`Deleted ${result2.changes} existing documents for URL: ${url}`);
|
|
10801
|
+
}
|
|
10802
|
+
}
|
|
10803
|
+
const transaction = this.db.transaction(() => {
|
|
10804
|
+
const contentType = result.contentType || null;
|
|
10805
|
+
const etag = result.etag || null;
|
|
10806
|
+
const lastModified = result.lastModified || null;
|
|
10807
|
+
this.statements.insertPage.run(
|
|
10808
|
+
versionId,
|
|
10809
|
+
url,
|
|
10810
|
+
title || "",
|
|
10811
|
+
etag,
|
|
10812
|
+
lastModified,
|
|
10813
|
+
contentType,
|
|
10814
|
+
depth
|
|
10815
|
+
);
|
|
10816
|
+
const existingPage2 = this.statements.getPageId.get(versionId, url);
|
|
10817
|
+
if (!existingPage2) {
|
|
10818
|
+
throw new StoreError(`Failed to get page ID for URL: ${url}`);
|
|
10450
10819
|
}
|
|
10820
|
+
const pageId = existingPage2.id;
|
|
10451
10821
|
let docIndex = 0;
|
|
10452
|
-
for (
|
|
10453
|
-
const
|
|
10454
|
-
|
|
10455
|
-
|
|
10456
|
-
|
|
10457
|
-
|
|
10458
|
-
|
|
10459
|
-
|
|
10460
|
-
|
|
10461
|
-
|
|
10462
|
-
|
|
10463
|
-
|
|
10464
|
-
|
|
10465
|
-
|
|
10466
|
-
|
|
10467
|
-
|
|
10468
|
-
|
|
10469
|
-
JSON.stringify(
|
|
10470
|
-
i
|
|
10471
|
-
// sort_order within this page
|
|
10822
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
10823
|
+
const chunk = chunks[i];
|
|
10824
|
+
const result2 = this.statements.insertDocument.run(
|
|
10825
|
+
pageId,
|
|
10826
|
+
chunk.content,
|
|
10827
|
+
JSON.stringify({
|
|
10828
|
+
types: chunk.types,
|
|
10829
|
+
level: chunk.section.level,
|
|
10830
|
+
path: chunk.section.path
|
|
10831
|
+
}),
|
|
10832
|
+
i
|
|
10833
|
+
// sort_order within this page
|
|
10834
|
+
);
|
|
10835
|
+
const rowId = result2.lastInsertRowid;
|
|
10836
|
+
if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
|
|
10837
|
+
this.statements.insertEmbedding.run(
|
|
10838
|
+
BigInt(rowId),
|
|
10839
|
+
JSON.stringify(paddedEmbeddings[docIndex])
|
|
10472
10840
|
);
|
|
10473
|
-
const rowId = result.lastInsertRowid;
|
|
10474
|
-
if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
|
|
10475
|
-
this.statements.insertEmbedding.run(
|
|
10476
|
-
BigInt(rowId),
|
|
10477
|
-
JSON.stringify(paddedEmbeddings[docIndex])
|
|
10478
|
-
);
|
|
10479
|
-
}
|
|
10480
|
-
docIndex++;
|
|
10481
10841
|
}
|
|
10842
|
+
docIndex++;
|
|
10482
10843
|
}
|
|
10483
10844
|
});
|
|
10484
|
-
transaction(
|
|
10845
|
+
transaction();
|
|
10485
10846
|
} catch (error) {
|
|
10486
10847
|
throw new ConnectionError("Failed to add documents to store", error);
|
|
10487
10848
|
}
|
|
10488
10849
|
}
|
|
10489
10850
|
/**
|
|
10490
|
-
* Removes documents matching specified library and version
|
|
10851
|
+
* Removes documents and pages matching specified library and version.
|
|
10852
|
+
* This consolidated method deletes both documents and their associated pages.
|
|
10491
10853
|
* @returns Number of documents deleted
|
|
10492
10854
|
*/
|
|
10493
|
-
async
|
|
10855
|
+
async deletePages(library, version2) {
|
|
10494
10856
|
try {
|
|
10495
10857
|
const normalizedVersion = version2.toLowerCase();
|
|
10496
10858
|
const result = this.statements.deleteDocuments.run(
|
|
10497
10859
|
library.toLowerCase(),
|
|
10498
10860
|
normalizedVersion
|
|
10499
10861
|
);
|
|
10862
|
+
this.statements.deletePages.run(library.toLowerCase(), normalizedVersion);
|
|
10500
10863
|
return result.changes;
|
|
10501
10864
|
} catch (error) {
|
|
10502
10865
|
throw new ConnectionError("Failed to delete documents", error);
|
|
10503
10866
|
}
|
|
10504
10867
|
}
|
|
10505
10868
|
/**
|
|
10506
|
-
*
|
|
10507
|
-
*
|
|
10869
|
+
* Deletes a page and all its associated document chunks.
|
|
10870
|
+
* Performs manual deletion in the correct order to satisfy foreign key constraints:
|
|
10871
|
+
* 1. Delete document chunks (page_id references pages.id)
|
|
10872
|
+
* 2. Delete page record
|
|
10873
|
+
*
|
|
10874
|
+
* This method is used during refresh operations when a page returns 404 Not Found.
|
|
10508
10875
|
*/
|
|
10509
|
-
async
|
|
10876
|
+
async deletePage(pageId) {
|
|
10510
10877
|
try {
|
|
10511
|
-
const
|
|
10512
|
-
|
|
10513
|
-
|
|
10514
|
-
|
|
10515
|
-
|
|
10516
|
-
|
|
10517
|
-
|
|
10878
|
+
const docResult = this.statements.deleteDocumentsByPageId.run(pageId);
|
|
10879
|
+
logger.debug(`Deleted ${docResult.changes} document(s) for page ID ${pageId}`);
|
|
10880
|
+
const pageResult = this.statements.deletePage.run(pageId);
|
|
10881
|
+
if (pageResult.changes > 0) {
|
|
10882
|
+
logger.debug(`Deleted page record for page ID ${pageId}`);
|
|
10883
|
+
}
|
|
10884
|
+
} catch (error) {
|
|
10885
|
+
throw new ConnectionError(`Failed to delete page ${pageId}`, error);
|
|
10886
|
+
}
|
|
10887
|
+
}
|
|
10888
|
+
/**
|
|
10889
|
+
* Retrieves all pages for a specific version ID with their metadata.
|
|
10890
|
+
* Used for refresh operations to get existing pages with their ETags and depths.
|
|
10891
|
+
* @returns Array of page records
|
|
10892
|
+
*/
|
|
10893
|
+
async getPagesByVersionId(versionId) {
|
|
10894
|
+
try {
|
|
10895
|
+
const result = this.statements.getPagesByVersionId.all(versionId);
|
|
10896
|
+
return result;
|
|
10518
10897
|
} catch (error) {
|
|
10519
|
-
throw new ConnectionError("Failed to
|
|
10898
|
+
throw new ConnectionError("Failed to get pages by version ID", error);
|
|
10520
10899
|
}
|
|
10521
10900
|
}
|
|
10522
10901
|
/**
|
|
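
deletePage above removes a page's document chunks before the page row itself so the documents.page_id foreign key is never violated; deletePages does the equivalent per library and version. A minimal better-sqlite3 sketch of that ordering (the schema names come from the statements in this diff, and the tables are assumed to already exist):

import Database from "better-sqlite3";

const db = new Database("documents.db");

const deleteDocs = db.prepare("DELETE FROM documents WHERE page_id = ?");
const deletePageRow = db.prepare("DELETE FROM pages WHERE id = ?");

// Delete in dependency order: chunks first (documents.page_id -> pages.id), then the page.
const deletePageCascade = db.transaction((pageId: number) => {
  const docs = deleteDocs.run(pageId);
  const page = deletePageRow.run(pageId);
  return { documentsDeleted: docs.changes, pageDeleted: page.changes > 0 };
});

// Example: deletePageCascade(42);

The diff itself runs the two deletes sequentially without an explicit transaction; wrapping them as above is one way to keep the pair atomic, not a description of what the package does.
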
@@ -10539,7 +10918,8 @@ class DocumentStore {
|
|
|
10539
10918
|
return { documentsDeleted: 0, versionDeleted: false, libraryDeleted: false };
|
|
10540
10919
|
}
|
|
10541
10920
|
const { id: versionId, library_id: libraryId } = versionResult;
|
|
10542
|
-
const documentsDeleted = await this.
|
|
10921
|
+
const documentsDeleted = await this.deletePages(library, version2);
|
|
10922
|
+
this.statements.deletePages.run(normalizedLibrary, normalizedVersion);
|
|
10543
10923
|
const versionDeleteResult = this.statements.deleteVersionById.run(versionId);
|
|
10544
10924
|
const versionDeleted = versionDeleteResult.changes > 0;
|
|
10545
10925
|
let libraryDeleted = false;
|
|
@@ -10556,6 +10936,27 @@ class DocumentStore {
|
|
|
10556
10936
|
throw new ConnectionError("Failed to remove version", error);
|
|
10557
10937
|
}
|
|
10558
10938
|
}
|
|
10939
|
+
/**
|
|
10940
|
+
* Parses the metadata field from a JSON string to an object.
|
|
10941
|
+
* This is necessary because better-sqlite3's json() function returns a string, not an object.
|
|
10942
|
+
*/
|
|
10943
|
+
parseMetadata(row) {
|
|
10944
|
+
if (row.metadata && typeof row.metadata === "string") {
|
|
10945
|
+
try {
|
|
10946
|
+
row.metadata = JSON.parse(row.metadata);
|
|
10947
|
+
} catch (error) {
|
|
10948
|
+
logger.warn(`Failed to parse metadata JSON: ${error}`);
|
|
10949
|
+
row.metadata = {};
|
|
10950
|
+
}
|
|
10951
|
+
}
|
|
10952
|
+
return row;
|
|
10953
|
+
}
|
|
10954
|
+
/**
|
|
10955
|
+
* Parses metadata for an array of rows.
|
|
10956
|
+
*/
|
|
10957
|
+
parseMetadataArray(rows) {
|
|
10958
|
+
return rows.map((row) => this.parseMetadata(row));
|
|
10959
|
+
}
|
|
10559
10960
|
/**
|
|
10560
10961
|
* Retrieves a document by its ID.
|
|
10561
10962
|
* @param id The ID of the document.
|
|
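
Because better-sqlite3 returns the json(d.metadata) column as a string, the new parseMetadata helper deserializes it in place and falls back to an empty object on malformed JSON; every joined documents/pages query now funnels its rows through it, directly or via parseMetadataArray. A compact sketch with a row shape inferred from the SELECT lists above (the exact row type is an assumption):

interface DbPageChunkRow {
  id: number;
  content: string;
  metadata: string | { level?: number; path?: string[]; types?: string[] };
  url: string;
  title: string | null;
  content_type: string | null;
}

// Parse the JSON metadata column in place, falling back to {} on malformed data.
function parseMetadata(row: DbPageChunkRow): DbPageChunkRow {
  if (typeof row.metadata === "string") {
    try {
      row.metadata = JSON.parse(row.metadata);
    } catch {
      row.metadata = {};
    }
  }
  return row;
}
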
@@ -10563,13 +10964,11 @@ class DocumentStore {
|
|
|
10563
10964
|
*/
|
|
10564
10965
|
async getById(id) {
|
|
10565
10966
|
try {
|
|
10566
|
-
const row = this.statements.getById.get(
|
|
10567
|
-
BigInt(id)
|
|
10568
|
-
);
|
|
10967
|
+
const row = this.statements.getById.get(BigInt(id));
|
|
10569
10968
|
if (!row) {
|
|
10570
10969
|
return null;
|
|
10571
10970
|
}
|
|
10572
|
-
return
|
|
10971
|
+
return this.parseMetadata(row);
|
|
10573
10972
|
} catch (error) {
|
|
10574
10973
|
throw new ConnectionError(`Failed to get document by ID ${id}`, error);
|
|
10575
10974
|
}
|
|
@@ -10653,26 +11052,20 @@ class DocumentStore {
|
|
|
10653
11052
|
);
|
|
10654
11053
|
const rankedResults = this.assignRanks(rawResults);
|
|
10655
11054
|
const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
|
|
10656
|
-
return topResults.map((row) =>
|
|
10657
|
-
|
|
11055
|
+
return topResults.map((row) => {
|
|
11056
|
+
const result = {
|
|
10658
11057
|
...row,
|
|
10659
11058
|
url: row.url || "",
|
|
10660
11059
|
// Ensure url is never undefined
|
|
10661
|
-
title: row.title,
|
|
10662
|
-
content_type: row.content_type
|
|
10663
|
-
}
|
|
10664
|
-
|
|
10665
|
-
...JSON.parse(row.metadata),
|
|
10666
|
-
id: row.id,
|
|
11060
|
+
title: row.title || null,
|
|
11061
|
+
content_type: row.content_type || null
|
|
11062
|
+
};
|
|
11063
|
+
return Object.assign(result, {
|
|
10667
11064
|
score: row.rrf_score,
|
|
10668
11065
|
vec_rank: row.vec_rank,
|
|
10669
|
-
fts_rank: row.fts_rank
|
|
10670
|
-
|
|
10671
|
-
|
|
10672
|
-
title: row.title || "",
|
|
10673
|
-
...row.content_type && { contentType: row.content_type }
|
|
10674
|
-
}
|
|
10675
|
-
}));
|
|
11066
|
+
fts_rank: row.fts_rank
|
|
11067
|
+
});
|
|
11068
|
+
});
|
|
10676
11069
|
} else {
|
|
10677
11070
|
const stmt = this.db.prepare(`
|
|
10678
11071
|
SELECT
|
|
@@ -10704,28 +11097,21 @@ class DocumentStore {
|
|
|
10704
11097
|
ftsQuery,
|
|
10705
11098
|
limit
|
|
10706
11099
|
);
|
|
10707
|
-
return rawResults.map((row, index) =>
|
|
10708
|
-
|
|
11100
|
+
return rawResults.map((row, index) => {
|
|
11101
|
+
const result = {
|
|
10709
11102
|
...row,
|
|
10710
11103
|
url: row.url || "",
|
|
10711
11104
|
// Ensure url is never undefined
|
|
10712
|
-
title: row.title,
|
|
10713
|
-
content_type: row.content_type
|
|
10714
|
-
}
|
|
10715
|
-
|
|
10716
|
-
...JSON.parse(row.metadata),
|
|
10717
|
-
id: row.id,
|
|
11105
|
+
title: row.title || null,
|
|
11106
|
+
content_type: row.content_type || null
|
|
11107
|
+
};
|
|
11108
|
+
return Object.assign(result, {
|
|
10718
11109
|
score: -row.fts_score,
|
|
10719
11110
|
// Convert BM25 score to positive value for consistency
|
|
10720
|
-
fts_rank: index + 1
|
|
11111
|
+
fts_rank: index + 1
|
|
10721
11112
|
// Assign rank based on order (1-based)
|
|
10722
|
-
|
|
10723
|
-
|
|
10724
|
-
url: row.url || "",
|
|
10725
|
-
title: row.title || "",
|
|
10726
|
-
...row.content_type && { contentType: row.content_type }
|
|
10727
|
-
}
|
|
10728
|
-
}));
|
|
11113
|
+
});
|
|
11114
|
+
});
|
|
10729
11115
|
}
|
|
10730
11116
|
} catch (error) {
|
|
10731
11117
|
throw new ConnectionError(
|
|
@@ -10744,18 +11130,17 @@ class DocumentStore {
|
|
|
10744
11130
|
return [];
|
|
10745
11131
|
}
|
|
10746
11132
|
const parentPath = parent.metadata.path ?? [];
|
|
10747
|
-
const parentUrl = parent.metadata.url;
|
|
10748
11133
|
const normalizedVersion = version2.toLowerCase();
|
|
10749
11134
|
const result = this.statements.getChildChunks.all(
|
|
10750
11135
|
library.toLowerCase(),
|
|
10751
11136
|
normalizedVersion,
|
|
10752
|
-
|
|
11137
|
+
parent.url,
|
|
10753
11138
|
parentPath.length + 1,
|
|
10754
11139
|
JSON.stringify(parentPath),
|
|
10755
11140
|
BigInt(id),
|
|
10756
11141
|
limit
|
|
10757
11142
|
);
|
|
10758
|
-
return
|
|
11143
|
+
return this.parseMetadataArray(result);
|
|
10759
11144
|
} catch (error) {
|
|
10760
11145
|
throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
|
|
10761
11146
|
}
|
|
@@ -10769,17 +11154,16 @@ class DocumentStore {
|
|
|
10769
11154
|
if (!reference) {
|
|
10770
11155
|
return [];
|
|
10771
11156
|
}
|
|
10772
|
-
const refMetadata = reference.metadata;
|
|
10773
11157
|
const normalizedVersion = version2.toLowerCase();
|
|
10774
11158
|
const result = this.statements.getPrecedingSiblings.all(
|
|
10775
11159
|
library.toLowerCase(),
|
|
10776
11160
|
normalizedVersion,
|
|
10777
|
-
|
|
11161
|
+
reference.url,
|
|
10778
11162
|
BigInt(id),
|
|
10779
|
-
JSON.stringify(
|
|
11163
|
+
JSON.stringify(reference.metadata.path),
|
|
10780
11164
|
limit
|
|
10781
11165
|
);
|
|
10782
|
-
return
|
|
11166
|
+
return this.parseMetadataArray(result).reverse();
|
|
10783
11167
|
} catch (error) {
|
|
10784
11168
|
throw new ConnectionError(
|
|
10785
11169
|
`Failed to find preceding sibling chunks for ID ${id}`,
|
|
@@ -10796,17 +11180,16 @@ class DocumentStore {
|
|
|
10796
11180
|
if (!reference) {
|
|
10797
11181
|
return [];
|
|
10798
11182
|
}
|
|
10799
|
-
const refMetadata = reference.metadata;
|
|
10800
11183
|
const normalizedVersion = version2.toLowerCase();
|
|
10801
11184
|
const result = this.statements.getSubsequentSiblings.all(
|
|
10802
11185
|
library.toLowerCase(),
|
|
10803
11186
|
normalizedVersion,
|
|
10804
|
-
|
|
11187
|
+
reference.url,
|
|
10805
11188
|
BigInt(id),
|
|
10806
|
-
JSON.stringify(
|
|
11189
|
+
JSON.stringify(reference.metadata.path),
|
|
10807
11190
|
limit
|
|
10808
11191
|
);
|
|
10809
|
-
return
|
|
11192
|
+
return this.parseMetadataArray(result);
|
|
10810
11193
|
} catch (error) {
|
|
10811
11194
|
throw new ConnectionError(
|
|
10812
11195
|
`Failed to find subsequent sibling chunks for ID ${id}`,
|
|
@@ -10816,6 +11199,8 @@ class DocumentStore {
|
|
|
10816
11199
|
}
|
|
10817
11200
|
/**
|
|
10818
11201
|
* Finds the parent chunk of a given document.
|
|
11202
|
+
* Returns null if no parent is found or if there's a database error.
|
|
11203
|
+
* Database errors are logged but not thrown to maintain consistent behavior.
|
|
10819
11204
|
*/
|
|
10820
11205
|
async findParentChunk(library, version2, id) {
|
|
10821
11206
|
try {
|
|
@@ -10823,8 +11208,7 @@ class DocumentStore {
|
|
|
10823
11208
|
if (!child) {
|
|
10824
11209
|
return null;
|
|
10825
11210
|
}
|
|
10826
|
-
const
|
|
10827
|
-
const path2 = childMetadata.path ?? [];
|
|
11211
|
+
const path2 = child.metadata.path ?? [];
|
|
10828
11212
|
const parentPath = path2.slice(0, -1);
|
|
10829
11213
|
if (parentPath.length === 0) {
|
|
10830
11214
|
return null;
|
|
@@ -10833,21 +11217,22 @@ class DocumentStore {
|
|
|
10833
11217
|
const result = this.statements.getParentChunk.get(
|
|
10834
11218
|
library.toLowerCase(),
|
|
10835
11219
|
normalizedVersion,
|
|
10836
|
-
|
|
11220
|
+
child.url,
|
|
10837
11221
|
JSON.stringify(parentPath),
|
|
10838
11222
|
BigInt(id)
|
|
10839
11223
|
);
|
|
10840
11224
|
if (!result) {
|
|
10841
11225
|
return null;
|
|
10842
11226
|
}
|
|
10843
|
-
return
|
|
11227
|
+
return this.parseMetadata(result);
|
|
10844
11228
|
} catch (error) {
|
|
10845
|
-
|
|
11229
|
+
logger.warn(`Failed to find parent chunk for ID ${id}: ${error}`);
|
|
11230
|
+
return null;
|
|
10846
11231
|
}
|
|
10847
11232
|
}
|
|
10848
11233
|
/**
|
|
10849
11234
|
* Fetches multiple documents by their IDs in a single call.
|
|
10850
|
-
* Returns an array of
|
|
11235
|
+
* Returns an array of DbPageChunk objects, sorted by their sort_order.
|
|
10851
11236
|
*/
|
|
10852
11237
|
async findChunksByIds(library, version2, ids) {
|
|
10853
11238
|
if (!ids.length) return [];
|
|
@@ -10855,7 +11240,7 @@ class DocumentStore {
|
|
|
10855
11240
|
const normalizedVersion = version2.toLowerCase();
|
|
10856
11241
|
const placeholders = ids.map(() => "?").join(",");
|
|
10857
11242
|
const stmt = this.db.prepare(
|
|
10858
|
-
`SELECT d
|
|
11243
|
+
`SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
|
|
10859
11244
|
JOIN pages p ON d.page_id = p.id
|
|
10860
11245
|
JOIN versions v ON p.version_id = v.id
|
|
10861
11246
|
JOIN libraries l ON v.library_id = l.id
|
|
@@ -10869,20 +11254,20 @@ class DocumentStore {
|
|
|
10869
11254
|
normalizedVersion,
|
|
10870
11255
|
...ids
|
|
10871
11256
|
);
|
|
10872
|
-
return
|
|
11257
|
+
return this.parseMetadataArray(rows);
|
|
10873
11258
|
} catch (error) {
|
|
10874
11259
|
throw new ConnectionError("Failed to fetch documents by IDs", error);
|
|
10875
11260
|
}
|
|
10876
11261
|
}
|
|
10877
11262
|
/**
|
|
10878
11263
|
* Fetches all document chunks for a specific URL within a library and version.
|
|
10879
|
-
* Returns
|
|
11264
|
+
* Returns DbPageChunk objects sorted by their sort_order for proper reassembly.
|
|
10880
11265
|
*/
|
|
10881
11266
|
async findChunksByUrl(library, version2, url) {
|
|
10882
11267
|
try {
|
|
10883
11268
|
const normalizedVersion = version2.toLowerCase();
|
|
10884
11269
|
const stmt = this.db.prepare(
|
|
10885
|
-
`SELECT d
|
|
11270
|
+
`SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
|
|
10886
11271
|
JOIN pages p ON d.page_id = p.id
|
|
10887
11272
|
JOIN versions v ON p.version_id = v.id
|
|
10888
11273
|
JOIN libraries l ON v.library_id = l.id
|
|
@@ -10896,7 +11281,7 @@ class DocumentStore {
|
|
|
10896
11281
|
normalizedVersion,
|
|
10897
11282
|
url
|
|
10898
11283
|
);
|
|
10899
|
-
return
|
|
11284
|
+
return this.parseMetadataArray(rows);
|
|
10900
11285
|
} catch (error) {
|
|
10901
11286
|
throw new ConnectionError(`Failed to fetch documents by URL ${url}`, error);
|
|
10902
11287
|
}
|
|
@@ -10914,9 +11299,8 @@ class DocumentManagementService {
|
|
|
10914
11299
|
return (version2 ?? "").toLowerCase();
|
|
10915
11300
|
}
|
|
10916
11301
|
constructor(storePath, embeddingConfig, pipelineConfig) {
|
|
10917
|
-
const
|
|
10918
|
-
|
|
10919
|
-
logger.debug(`Using database directory: ${dbDir}`);
|
|
11302
|
+
const dbPath = storePath === ":memory:" ? ":memory:" : path.join(storePath, "documents.db");
|
|
11303
|
+
logger.debug(`Using database path: ${dbPath}`);
|
|
10920
11304
|
this.store = new DocumentStore(dbPath, embeddingConfig);
|
|
10921
11305
|
this.documentRetriever = new DocumentRetrieverService(this.store);
|
|
10922
11306
|
this.pipelines = PipelineFactory$1.createStandardPipelines(pipelineConfig);
|
|
@@ -11127,9 +11511,24 @@ class DocumentManagementService {
|
|
|
11127
11511
|
logger.info(
|
|
11128
11512
|
`🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
|
|
11129
11513
|
);
|
|
11130
|
-
const count = await this.store.
|
|
11514
|
+
const count = await this.store.deletePages(library, normalizedVersion);
|
|
11131
11515
|
logger.info(`🗑️ Deleted ${count} documents`);
|
|
11132
11516
|
}
|
|
11517
|
+
/**
|
|
11518
|
+
* Deletes a page and all its associated document chunks.
|
|
11519
|
+
* This is used during refresh operations when a page returns 404 Not Found.
|
|
11520
|
+
*/
|
|
11521
|
+
async deletePage(pageId) {
|
|
11522
|
+
logger.debug(`Deleting page ID: ${pageId}`);
|
|
11523
|
+
await this.store.deletePage(pageId);
|
|
11524
|
+
}
|
|
11525
|
+
/**
|
|
11526
|
+
* Retrieves all pages for a specific version ID with their metadata.
|
|
11527
|
+
* Used for refresh operations to get existing pages with their ETags and depths.
|
|
11528
|
+
*/
|
|
11529
|
+
async getPagesByVersionId(versionId) {
|
|
11530
|
+
return this.store.getPagesByVersionId(versionId);
|
|
11531
|
+
}
|
|
11133
11532
|
/**
|
|
11134
11533
|
* Completely removes a library version and all associated documents.
|
|
11135
11534
|
* Also removes the library if no other versions remain.
|
|
@@ -11138,15 +11537,13 @@ class DocumentManagementService {
    */
   async removeVersion(library, version2) {
     const normalizedVersion = this.normalizeVersion(version2);
-    logger.
+    logger.debug(`Removing version: ${library}@${normalizedVersion || "[no version]"}`);
     const result = await this.store.removeVersion(library, normalizedVersion, true);
-    logger.info(
-      `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`
-    );
+    logger.info(`🗑️ Removed ${result.documentsDeleted} documents`);
     if (result.versionDeleted && result.libraryDeleted) {
-      logger.info(
+      logger.info(`🗑️ Completely removed library ${library} (was last version)`);
     } else if (result.versionDeleted) {
-      logger.info(
+      logger.info(`🗑️ Removed version ${library}@${normalizedVersion || "[no version]"}`);
     } else {
       logger.warn(
         `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`
@@ -11154,91 +11551,68 @@ class DocumentManagementService {
     }
   }
   /**
-   * Adds
-   *
-   *
-   *
+   * Adds pre-processed content directly to the store.
+   * This method is used when content has already been processed by a pipeline,
+   * avoiding redundant processing. Used primarily by the scraping pipeline.
+   *
+   * @param library Library name
+   * @param version Version string (null/undefined for unversioned)
+   * @param processed Pre-processed content with chunks already created
+   * @param pageId Optional page ID for refresh operations
    */
-  async
+  async addScrapeResult(library, version2, depth, result) {
     const processingStart = performance.now();
     const normalizedVersion = this.normalizeVersion(version2);
-    const url =
-    if (!url
-      throw new StoreError("
+    const { url, title, chunks, contentType } = result;
+    if (!url) {
+      throw new StoreError("Processed content metadata must include a valid URL");
     }
-    logger.info(`📚 Adding
-    if (
-
+    logger.info(`📚 Adding processed content: ${title || url}`);
+    if (chunks.length === 0) {
+      logger.warn(`⚠️ No chunks in processed content for ${url}. Skipping.`);
+      return;
     }
-    const contentType = document2.metadata.mimeType;
     try {
-
-
-        content: document2.pageContent,
-        mimeType: contentType || "text/plain"
-      };
-      const pipeline = this.pipelines.find((p) => p.canProcess(rawContent));
-      if (!pipeline) {
-        logger.warn(
-          `⚠️ Unsupported content type "${rawContent.mimeType}" for document ${url}. Skipping processing.`
-        );
-        return;
-      }
-      logger.debug(
-        `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`
-      );
-      const scraperOptions = {
-        url,
-        library,
-        version: normalizedVersion,
-        scrapeMode: ScrapeMode.Fetch,
-        ignoreErrors: false,
-        maxConcurrency: 1
-      };
-      const processed = await pipeline.process(rawContent, scraperOptions);
-      const chunks = processed.chunks;
-      const splitDocs = chunks.map((chunk) => ({
-        pageContent: chunk.content,
-        metadata: {
-          ...document2.metadata,
-          level: chunk.section.level,
-          path: chunk.section.path
-        }
-      }));
-      logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
-      await this.store.addDocuments(library, normalizedVersion, splitDocs);
+      logger.info(`✂️ Storing ${chunks.length} pre-split chunks`);
+      await this.store.addDocuments(library, normalizedVersion, depth, result);
       const processingTime = performance.now() - processingStart;
+      const totalContentSize = chunks.reduce(
+        (sum, chunk) => sum + chunk.content.length,
+        0
+      );
       analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, {
         // Content characteristics (privacy-safe)
-        mimeType: contentType
-        contentSizeBytes:
+        mimeType: contentType,
+        contentSizeBytes: totalContentSize,
         // Processing metrics
         processingTimeMs: Math.round(processingTime),
-        chunksCreated:
+        chunksCreated: chunks.length,
        // Document characteristics
-        hasTitle: !!
-        hasDescription: !!document2.metadata.description,
+        hasTitle: !!title,
        urlDomain: extractHostname(url),
-        depth
+        depth,
        // Library context
        library,
        libraryVersion: normalizedVersion || null,
        // Processing efficiency
-        avgChunkSizeBytes: Math.round(
+        avgChunkSizeBytes: Math.round(totalContentSize / chunks.length),
        processingSpeedKbPerSec: Math.round(
-
+          totalContentSize / 1024 / (processingTime / 1e3)
        )
      });
    } catch (error) {
      const processingTime = performance.now() - processingStart;
      if (error instanceof Error) {
        analytics.captureException(error, {
-          mimeType: contentType
-          contentSizeBytes:
+          mimeType: contentType,
+          contentSizeBytes: chunks.reduce(
+            (sum, chunk) => sum + chunk.content.length,
+            0
+          ),
          processingTimeMs: Math.round(processingTime),
          library,
          libraryVersion: normalizedVersion || null,
-          context: "
+          context: "processed_content_storage",
          component: DocumentManagementService.constructor.name
        });
      }
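addScrapeResult replaces the old path that re-ran a content pipeline per document: callers now pass already-chunked content, and the method validates it, stores it via store.addDocuments, and reports telemetry. A hedged sketch of a call, assuming a result object with the fields destructured above (url, title, contentType, and chunks whose items carry at least a content string); any further chunk fields are not visible in this hunk.

    // docService: a DocumentManagementService instance created elsewhere.
    const result = {
      url: "https://example.com/docs/getting-started",
      title: "Getting Started",
      contentType: "text/html",
      chunks: [
        { content: "# Getting Started\nInstall the package..." },
        { content: "## Configuration\nProvide an embedding model..." },
      ],
    };

    // depth describes how far from the root URL this page was discovered.
    await docService.addScrapeResult("example-lib", "1.0.0", /* depth */ 1, result);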
@@ -11268,6 +11642,18 @@ class DocumentManagementService {
     );
     return versionId;
   }
+  /**
+   * Retrieves a version by its ID from the database.
+   */
+  async getVersionById(versionId) {
+    return this.store.getVersionById(versionId);
+  }
+  /**
+   * Retrieves a library by its ID from the database.
+   */
+  async getLibraryById(libraryId) {
+    return this.store.getLibraryById(libraryId);
+  }
 }
 async function createDocumentManagement(options = {}) {
   if (options.serverUrl) {
@@ -11359,6 +11745,7 @@ async function initializeTools(docService, pipeline) {
     listLibraries: new ListLibrariesTool(docService),
     findVersion: new FindVersionTool(docService),
     scrape: new ScrapeTool(pipeline),
+    refresh: new RefreshVersionTool(pipeline),
     search: new SearchTool(docService),
     listJobs: new ListJobsTool(pipeline),
     getJobInfo: new GetJobInfoTool(pipeline),
@@ -11471,11 +11858,15 @@ const optionalTrimmed = z$1.preprocess(
   (v) => typeof v === "string" ? v.trim() : v,
   z$1.string().min(1).optional().nullable()
 );
-const
+const enqueueScrapeInput = z$1.object({
   library: nonEmptyTrimmed,
   version: optionalTrimmed,
   options: z$1.custom()
 });
+const enqueueRefreshInput = z$1.object({
+  library: nonEmptyTrimmed,
+  version: optionalTrimmed
+});
 const jobIdInput = z$1.object({ id: z$1.string().min(1) });
 const getJobsInput = z$1.object({
   status: z$1.nativeEnum(PipelineJobStatus).optional()
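enqueueRefreshInput mirrors enqueueScrapeInput without the options field, so a refresh request needs only a library name and an optional version. A small sketch of validating such a payload with an equivalent zod schema (plain zod names here; the bundle aliases the import as z$1):

    import { z } from "zod";

    // Equivalent shape to enqueueRefreshInput: trimmed non-empty library,
    // optional/nullable trimmed version.
    const refreshInput = z.object({
      library: z.preprocess((v) => (typeof v === "string" ? v.trim() : v), z.string().min(1)),
      version: z.preprocess(
        (v) => (typeof v === "string" ? v.trim() : v),
        z.string().min(1).optional().nullable()
      ),
    });

    refreshInput.parse({ library: " react ", version: "18.2.0" }); // -> { library: "react", version: "18.2.0" }
    refreshInput.parse({ library: "react" });                      // version may be omitted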
@@ -11483,12 +11874,12 @@ const getJobsInput = z$1.object({
 function createPipelineRouter(trpc) {
   const tt = trpc;
   return tt.router({
-
+    enqueueScrapeJob: tt.procedure.input(enqueueScrapeInput).mutation(
       async ({
         ctx,
         input
       }) => {
-        const jobId = await ctx.pipeline.
+        const jobId = await ctx.pipeline.enqueueScrapeJob(
           input.library,
           input.version ?? null,
           input.options
@@ -11508,6 +11899,18 @@ function createPipelineRouter(trpc) {
         return { jobId };
       }
     ),
+    enqueueRefreshJob: tt.procedure.input(enqueueRefreshInput).mutation(
+      async ({
+        ctx,
+        input
+      }) => {
+        const jobId = await ctx.pipeline.enqueueRefreshJob(
+          input.library,
+          input.version ?? null
+        );
+        return { jobId };
+      }
+    ),
     getJob: tt.procedure.input(jobIdInput).query(
       async ({
         ctx,
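The new enqueueRefreshJob mutation sits next to enqueueScrapeJob on the pipeline router, which is what the --server-url option elsewhere in this diff points at. A hedged client-side sketch; the mount path and the exported router type are assumptions, not something this bundle exposes directly.

    import { createTRPCProxyClient, httpBatchLink } from "@trpc/client";
    // `PipelineRouter` is a stand-in for the router type returned by
    // createPipelineRouter; the package does not necessarily export it.
    import type { PipelineRouter } from "./pipelineRouter";

    const client = createTRPCProxyClient<PipelineRouter>({
      links: [httpBatchLink({ url: "http://localhost:8080/api" })],
    });

    // Enqueue a refresh for an already-indexed library version.
    const { jobId } = await client.enqueueRefreshJob.mutate({
      library: "react",
      version: "18.2.0",
    });
    console.log(`refresh job queued: ${jobId}`);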
@@ -13447,7 +13850,7 @@ async function registerWorkerService(pipeline) {
     },
     onJobError: async (job, error, document2) => {
       logger.warn(
-        `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.
+        `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.url}` : ""}: ${error.message}`
       );
       analytics.captureException(error, {
         jobId: job.id,
@@ -13987,7 +14390,7 @@ async function findVersionAction(library, options, command) {
 function createFindVersionCommand(program) {
   return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(findVersionAction);
 }
 async function listAction(options, command) {
@@ -14013,7 +14416,7 @@ async function listAction(options, command) {
 function createListCommand(program) {
   return program.command("list").description("List all available libraries and their versions").option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(listAction);
 }
 function createMcpCommand(program) {
@@ -14036,7 +14439,7 @@ function createMcpCommand(program) {
     ).env("DOCS_MCP_EMBEDDING_MODEL")
   ).option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).option(
     "--read-only",
     "Run in read-only mode (only expose read tools, disable write/job tools)",
@@ -14160,6 +14563,68 @@ function createMcpCommand(program) {
     }
   );
 }
+async function refreshAction(library, options, command) {
+  await analytics.track(TelemetryEvent.CLI_COMMAND, {
+    command: "refresh",
+    library,
+    version: options.version,
+    useServerUrl: !!options.serverUrl
+  });
+  const serverUrl = options.serverUrl;
+  const globalOptions = getGlobalOptions(command);
+  const embeddingConfig = resolveEmbeddingContext(options.embeddingModel);
+  if (!serverUrl && !embeddingConfig) {
+    throw new Error(
+      "Embedding configuration is required for local refresh operations. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
+    );
+  }
+  const docService = await createDocumentManagement({
+    serverUrl,
+    embeddingConfig,
+    storePath: globalOptions.storePath
+  });
+  let pipeline = null;
+  try {
+    const pipelineOptions = {
+      recoverJobs: false,
+      concurrency: 1,
+      serverUrl
+    };
+    pipeline = await createPipelineWithCallbacks(
+      serverUrl ? void 0 : docService,
+      pipelineOptions
+    );
+    await pipeline.start();
+    const refreshTool = new RefreshVersionTool(pipeline);
+    const result = await refreshTool.execute({
+      library,
+      version: options.version,
+      waitForCompletion: true
+      // Always wait for completion in CLI
+    });
+    if ("pagesRefreshed" in result) {
+      console.log(`✅ Successfully refreshed ${result.pagesRefreshed} pages`);
+    } else {
+      console.log(`🚀 Refresh job started with ID: ${result.jobId}`);
+    }
+  } finally {
+    if (pipeline) await pipeline.stop();
+    await docService.shutdown();
+  }
+}
+function createRefreshCommand(program) {
+  return program.command("refresh <library>").description(
+    "Re-scrape an existing library version, updating only changed pages.\n\nUses HTTP ETags to efficiently skip unchanged pages and only re-process\ncontent that has been modified or deleted since the last scrape.\n\nExamples:\n refresh react --version 18.0.0\n refresh mylib\n\nNote: The library and version must already be indexed. Use 'scrape' to index a new library/version."
+  ).option("-v, --version <string>", "Version of the library (optional)").addOption(
+    new Option(
+      "--embedding-model <model>",
+      "Embedding model configuration (e.g., 'openai:text-embedding-3-small')"
+    ).env("DOCS_MCP_EMBEDDING_MODEL")
+  ).option(
+    "--server-url <url>",
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
+  ).action(refreshAction);
+}
 async function removeAction(library, options, command) {
   await analytics.track(TelemetryEvent.CLI_COMMAND, {
     command: "remove",
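refreshAction above either talks to a remote worker (--server-url) or builds a local pipeline and drives RefreshVersionTool directly. For the local case, the same sequence can be reused programmatically; the sketch below mirrors the calls visible in the hunk and assumes docService was created via createDocumentManagement as shown.

    // Sketch of driving a refresh programmatically in local mode.
    const serverUrl = undefined;
    const pipeline = await createPipelineWithCallbacks(serverUrl ? void 0 : docService, {
      recoverJobs: false,
      concurrency: 1,
      serverUrl,
    });
    await pipeline.start();
    try {
      const result = await new RefreshVersionTool(pipeline).execute({
        library: "react",
        version: "18.0.0",
        waitForCompletion: true,
      });
      if ("pagesRefreshed" in result) {
        console.log(`✅ Refreshed ${result.pagesRefreshed} pages`);
      } else {
        console.log(`🚀 Refresh job started with ID: ${result.jobId}`);
      }
    } finally {
      await pipeline.stop();
      await docService.shutdown();
    }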
@@ -14194,7 +14659,7 @@ function createRemoveCommand(program) {
     "Version to remove (optional, removes unversioned if omitted)"
   ).option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(removeAction);
 }
 async function scrapeAction(library, url, options, command) {
@@ -14334,7 +14799,7 @@ function createScrapeCommand(program) {
     ).env("DOCS_MCP_EMBEDDING_MODEL")
   ).option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(scrapeAction);
 }
 async function searchAction(library, query, options, command) {
@@ -14387,7 +14852,7 @@ function createSearchCommand(program) {
     ).env("DOCS_MCP_EMBEDDING_MODEL")
   ).option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(searchAction);
 }
 function createWebCommand(program) {
@@ -14408,7 +14873,7 @@ function createWebCommand(program) {
     ).env("DOCS_MCP_EMBEDDING_MODEL")
   ).option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(
     async (cmdOptions, command) => {
       await analytics.track(TelemetryEvent.CLI_COMMAND, {
@@ -14603,6 +15068,7 @@ function createCliProgram() {
   createWebCommand(program);
   createWorkerCommand(program);
   createScrapeCommand(program);
+  createRefreshCommand(program);
   createSearchCommand(program);
   createListCommand(program);
   createFindVersionCommand(program);