@arabold/docs-mcp-server 1.26.2 → 1.27.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/db/migrations/010-add-depth-to-pages.sql +16 -0
- package/dist/assets/main.css +1 -1
- package/dist/index.js +1777 -1320
- package/dist/index.js.map +1 -1
- package/package.json +34 -29
- package/public/assets/main.css +1 -1
package/dist/index.js
CHANGED
@@ -6,7 +6,7 @@ import { VertexAIEmbeddings } from "@langchain/google-vertexai";
 import { AzureOpenAIEmbeddings, OpenAIEmbeddings } from "@langchain/openai";
 import { Embeddings } from "@langchain/core/embeddings";
 import { PostHog } from "posthog-node";
-import { randomUUID } from "node:crypto";
+import crypto, { randomUUID } from "node:crypto";
 import fs, { existsSync, readFileSync } from "node:fs";
 import path from "node:path";
 import { fileURLToPath, URL as URL$1 } from "node:url";
@@ -27,6 +27,7 @@ import psl from "psl";
 import { HeaderGenerator } from "header-generator";
 import fs$1 from "node:fs/promises";
 import axios from "axios";
+import { minimatch } from "minimatch";
 import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
 import remarkGfm from "remark-gfm";
 import remarkHtml from "remark-html";
@@ -40,7 +41,6 @@ import * as cheerio from "cheerio";
 import "node:vm";
 import { gfm } from "@joplin/turndown-plugin-gfm";
 import iconv from "iconv-lite";
-import { minimatch } from "minimatch";
 import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
 import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
 import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
@@ -113,21 +113,6 @@ class MissingCredentialsError extends StoreError {
   }
 }
 const VECTOR_DIMENSION = 1536;
-function mapDbDocumentToDocument(doc) {
-  const chunkMetadata = JSON.parse(doc.metadata);
-  return {
-    id: doc.id,
-    pageContent: doc.content,
-    metadata: {
-      ...chunkMetadata,
-      // Page-level fields are always available from joined queries
-      url: doc.url,
-      title: doc.title || "",
-      // Convert null to empty string for consistency
-      ...doc.content_type && { contentType: doc.content_type }
-    }
-  };
-}
 var VersionStatus = /* @__PURE__ */ ((VersionStatus2) => {
   VersionStatus2["NOT_INDEXED"] = "not_indexed";
   VersionStatus2["QUEUED"] = "queued";
@@ -784,16 +769,16 @@ function extractProtocol(urlOrPath) {
   }
 }
 const name = "@arabold/docs-mcp-server";
-const version = "1.
+const version = "1.27.0";
 const description = "MCP server for fetching and searching documentation";
 const type = "module";
 const bin = { "docs-mcp-server": "dist/index.js" };
 const license = "MIT";
 const repository = { "type": "git", "url": "git+https://github.com/arabold/docs-mcp-server.git" };
 const files = ["dist", "public", "db", "README.md", "LICENSE", "package.json"];
-const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev": "npm-run-all --parallel dev:server dev:web", "dev:server": "vite-node --watch src/index.ts", "dev:web": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "test:
-const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.
-const devDependencies = { "@biomejs/biome": "^2.
+const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev": "npm-run-all --parallel dev:server dev:web", "dev:server": "vite-node --watch src/index.ts", "dev:web": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "test:unit": "vitest run src", "test:e2e": "vitest run test", "test:live": "vitest run --exclude= test/html-pipeline-live-e2e.test.ts", "lint": "biome check .", "lint:fix": "biome check . --fix", "typecheck": "npx tsc --noEmit", "typecheck:build": "npx tsc --noEmit --project tsconfig.build.json", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. See README.md for details.'" };
+const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.3.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", "@langchain/aws": "^0.1.13", "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", "@modelcontextprotocol/sdk": "^1.20.2", "@trpc/client": "^11.7.1", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", "axios": "^1.13.1", "axios-retry": "^4.5.0", "better-sqlite3": "^12.4.1", "cheerio": "^1.1.2", "commander": "^14.0.0", "dompurify": "^3.3.0", "dotenv": "^17.2.3", "env-paths": "^3.0.0", "fastify": "^5.6.1", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", "header-generator": "^2.1.76", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", "jose": "^6.1.0", "jsdom": "^26.1.0", "langchain": "^0.3.30", "mime": "^4.1.0", "minimatch": "^10.0.1", "playwright": "^1.52.0", "posthog-node": "^5.11.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", "semver": "^7.7.3", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", "turndown": "^7.2.2", "zod": "^4.1.12" };
+const devDependencies = { "@biomejs/biome": "^2.3.2", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^11.0.6", "@semantic-release/npm": "^12.0.2", "@tailwindcss/postcss": "^4.1.16", "@tailwindcss/vite": "^4.1.16", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", "@types/lint-staged": "~13.3.0", "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", "@types/semver": "^7.7.1", "@types/turndown": "^5.0.6", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", "lint-staged": "^16.2.6", "memfs": "^4.50.0", "msw": "^2.12.2", "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", "semantic-release": "^24.2.9", "tailwindcss": "^4.1.4", "typescript": "^5.9.3", "vite": "^6.3.5", "vite-node": "^3.2.4", "vite-plugin-dts": "^4.5.4", "vitest": "^3.2.4" };
 const engines = { "node": ">=20.0.0" };
 const packageJson = {
   name,
@@ -1288,10 +1273,10 @@ class PipelineClient {
     this.activePolling.clear();
     logger.debug("PipelineClient stopped");
   }
-  async
+  async enqueueScrapeJob(library, version2, options) {
     try {
       const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
-      const result = await this.client.
+      const result = await this.client.enqueueScrapeJob.mutate({
         library,
         version: normalizedVersion,
         options
@@ -1304,6 +1289,21 @@ class PipelineClient {
       );
     }
   }
+  async enqueueRefreshJob(library, version2) {
+    try {
+      const normalizedVersion = typeof version2 === "string" && version2.trim().length === 0 ? null : version2 ?? null;
+      const result = await this.client.enqueueRefreshJob.mutate({
+        library,
+        version: normalizedVersion
+      });
+      logger.debug(`Refresh job ${result.jobId} enqueued successfully`);
+      return result.jobId;
+    } catch (error) {
+      throw new Error(
+        `Failed to enqueue refresh job: ${error instanceof Error ? error.message : String(error)}`
+      );
+    }
+  }
   async getJob(jobId) {
     try {
       const serializedJob = await this.client.getJob.query({ id: jobId });
@@ -1753,6 +1753,12 @@ class FingerprintGenerator {
     return this.headerGenerator.getHeaders();
   }
 }
+var FetchStatus = /* @__PURE__ */ ((FetchStatus2) => {
+  FetchStatus2["SUCCESS"] = "success";
+  FetchStatus2["NOT_MODIFIED"] = "not_modified";
+  FetchStatus2["NOT_FOUND"] = "not_found";
+  return FetchStatus2;
+})(FetchStatus || {});
 class BrowserFetcher {
   browser = null;
   page = null;
@@ -1792,13 +1798,16 @@ class BrowserFetcher {
       const contentBuffer = Buffer.from(content, "utf-8");
       const contentType = response.headers()["content-type"] || "text/html";
       const { mimeType, charset } = MimeTypeUtils.parseContentType(contentType);
+      const etag = response.headers().etag;
       return {
         content: contentBuffer,
         mimeType,
         charset,
        encoding: void 0,
         // Browser handles encoding automatically
-        source: finalUrl
+        source: finalUrl,
+        etag,
+        status: FetchStatus.SUCCESS
       };
     } catch (error) {
       if (options?.signal?.aborted) {
@@ -1859,24 +1868,48 @@ class FileFetcher {
   /**
    * Fetches the content of a file given a file:// URL, decoding percent-encoded paths as needed.
    * Uses enhanced MIME type detection for better source code file recognition.
+   * Supports conditional fetching via ETag comparison for efficient refresh operations.
    */
-  async fetch(source,
+  async fetch(source, options) {
     let filePath = source.replace(/^file:\/\/\/?/, "");
     filePath = decodeURIComponent(filePath);
     if (!filePath.startsWith("/") && process.platform !== "win32") {
       filePath = `/${filePath}`;
     }
     try {
+      const stats = await fs$1.stat(filePath);
+      const currentEtag = crypto.createHash("md5").update(stats.mtime.toISOString()).digest("hex");
+      if (options?.etag && options.etag === currentEtag) {
+        return {
+          content: Buffer.from(""),
+          mimeType: "text/plain",
+          source,
+          etag: currentEtag,
+          lastModified: stats.mtime.toISOString(),
+          status: FetchStatus.NOT_MODIFIED
+        };
+      }
       const content = await fs$1.readFile(filePath);
       const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
       const mimeType = detectedMimeType || "application/octet-stream";
       return {
         content,
         mimeType,
-        source
+        source,
+        etag: currentEtag,
+        lastModified: stats.mtime.toISOString(),
+        status: FetchStatus.SUCCESS
         // Don't assume charset for text files - let the pipeline detect it
       };
     } catch (error) {
+      if (error.code === "ENOENT") {
+        return {
+          content: Buffer.from(""),
+          mimeType: "text/plain",
+          source,
+          status: FetchStatus.NOT_FOUND
+        };
+      }
       throw new ScraperError(
         `Failed to read file ${filePath}: ${error.message ?? "Unknown error"}`,
         false,
@@ -1982,6 +2015,12 @@ class HttpFetcher {
       ...options?.headers
       // User-provided headers override generated ones
     };
+    if (options?.etag) {
+      headers["If-None-Match"] = options.etag;
+      logger.debug(
+        `Conditional request for ${source} with If-None-Match: ${options.etag}`
+      );
+    }
     const config = {
       responseType: "arraybuffer",
       headers: {
@@ -1995,9 +2034,22 @@ class HttpFetcher {
       // Pass signal to axios
       // Axios follows redirects by default, we need to explicitly disable it if needed
       maxRedirects: followRedirects ? 5 : 0,
-      decompress: true
+      decompress: true,
+      // Allow 304 responses to be handled as successful responses
+      validateStatus: (status) => {
+        return status >= 200 && status < 300 || status === 304;
+      }
     };
     const response = await axios.get(source, config);
+    if (response.status === 304) {
+      logger.debug(`HTTP 304 Not Modified for ${source}`);
+      return {
+        content: Buffer.from(""),
+        mimeType: "text/plain",
+        source,
+        status: FetchStatus.NOT_MODIFIED
+      };
+    }
     const contentTypeHeader = response.headers["content-type"];
     const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader);
     const contentEncoding = response.headers["content-encoding"];
@@ -2017,12 +2069,21 @@ class HttpFetcher {
       response.request?.responseUrl || // Fallback to axios recorded config URL
       response.config?.url || source
     );
+    const etag = response.headers.etag || response.headers.ETag;
+    if (etag) {
+      logger.debug(`Received ETag for ${source}: ${etag}`);
+    }
+    const lastModified = response.headers["last-modified"];
+    const lastModifiedISO = lastModified ? new Date(lastModified).toISOString() : void 0;
     return {
       content,
       mimeType,
       charset,
       encoding: contentEncoding,
-      source: finalUrl
+      source: finalUrl,
+      etag,
+      lastModified: lastModifiedISO,
+      status: FetchStatus.SUCCESS
     };
   } catch (error) {
     const axiosError = error;
@@ -2031,6 +2092,15 @@ class HttpFetcher {
     if (options?.signal?.aborted || code === "ERR_CANCELED") {
       throw new CancellationError("HTTP fetch cancelled");
     }
+    if (status === 404) {
+      logger.debug(`Resource not found (404): ${source}`);
+      return {
+        content: Buffer.from(""),
+        mimeType: "text/plain",
+        source,
+        status: FetchStatus.NOT_FOUND
+      };
+    }
     if (!followRedirects && status && status >= 300 && status < 400) {
       const location = axiosError.response?.headers?.location;
       if (location) {
@@ -2125,101 +2195,522 @@ class AutoDetectFetcher {
     ]);
   }
 }
-
-
-
-
-
-
-
+const DEFAULT_FILE_EXCLUSIONS = [
+  // CHANGELOG files (case variations)
+  "**/CHANGELOG.md",
+  "**/changelog.md",
+  "**/CHANGELOG.mdx",
+  "**/changelog.mdx",
+  // LICENSE files (case variations)
+  "**/LICENSE",
+  "**/LICENSE.md",
+  "**/license.md",
+  // CODE_OF_CONDUCT files (case variations)
+  "**/CODE_OF_CONDUCT.md",
+  "**/code_of_conduct.md",
+  // Test files
+  "**/*.test.*",
+  "**/*.spec.*",
+  "**/*_test.py",
+  "**/*_test.go",
+  // Package manager lock files
+  "**/*.lock",
+  "**/package-lock.json",
+  "**/yarn.lock",
+  "**/pnpm-lock.yaml",
+  "**/go.sum",
+  // Build artifacts
+  "**/*.min.js",
+  "**/*.min.css",
+  "**/*.map",
+  "**/*.d.ts",
+  // IDE/System files
+  "**/.DS_Store",
+  "**/Thumbs.db",
+  "**/*.swp",
+  "**/*.swo",
+  // Internal config files (using regex pattern)
+  "/.*\\.(ini|cfg|conf|log|pid)$/"
+];
+const DEFAULT_FOLDER_EXCLUSIONS = [
+  // Archive and deprecated content (matches anywhere in path)
+  "**/archive/**",
+  "**/archived/**",
+  "**/deprecated/**",
+  "**/legacy/**",
+  "**/old/**",
+  "**/outdated/**",
+  "**/previous/**",
+  "**/superseded/**",
+  // Specific paths that don't follow the general pattern
+  "docs/old/**",
+  // Test directories
+  "**/test/**",
+  "**/tests/**",
+  "**/__tests__/**",
+  "**/spec/**",
+  // Build output directories
+  "**/dist/**",
+  "**/build/**",
+  "**/out/**",
+  "**/target/**",
+  "**/.next/**",
+  "**/.nuxt/**",
+  // IDE directories
+  "**/.vscode/**",
+  "**/.idea/**",
+  // Internationalization folders - non-English locales
+  "**/i18n/ar*/**",
+  "**/i18n/de*/**",
+  "**/i18n/es*/**",
+  "**/i18n/fr*/**",
+  "**/i18n/hi*/**",
+  "**/i18n/it*/**",
+  "**/i18n/ja*/**",
+  "**/i18n/ko*/**",
+  "**/i18n/nl*/**",
+  "**/i18n/pl*/**",
+  "**/i18n/pt*/**",
+  "**/i18n/ru*/**",
+  "**/i18n/sv*/**",
+  "**/i18n/th*/**",
+  "**/i18n/tr*/**",
+  "**/i18n/vi*/**",
+  "**/i18n/zh*/**",
+  // Common locale folder patterns
+  "**/zh-cn/**",
+  "**/zh-hk/**",
+  "**/zh-mo/**",
+  "**/zh-sg/**",
+  "**/zh-tw/**"
+];
+const DEFAULT_EXCLUSION_PATTERNS = [
+  ...DEFAULT_FILE_EXCLUSIONS,
+  ...DEFAULT_FOLDER_EXCLUSIONS
+];
+function getEffectiveExclusionPatterns(userPatterns) {
+  if (userPatterns !== void 0) {
+    return userPatterns;
   }
+  return DEFAULT_EXCLUSION_PATTERNS;
 }
-
+function isRegexPattern(pattern) {
+  return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
 }
-
-
-
-  preferredChunkSize;
-  /**
-   * Combines a base document splitter with size constraints to produce optimally-sized chunks.
-   * The base splitter handles the initial semantic splitting, while this class handles
-   * the concatenation strategy.
-   */
-  constructor(baseSplitter, minChunkSize, preferredChunkSize) {
-    this.baseSplitter = baseSplitter;
-    this.minChunkSize = minChunkSize;
-    this.preferredChunkSize = preferredChunkSize;
+function patternToRegExp(pattern) {
+  if (isRegexPattern(pattern)) {
+    return new RegExp(pattern.slice(1, -1));
   }
-
-
-
-
-
-
-
-
-
-
-    for (const nextChunk of initialChunks) {
-      if (currentChunk) {
-        if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
-          concatenatedChunks.push(currentChunk);
-          currentChunk = this.cloneChunk(nextChunk);
-          continue;
-        }
-        if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
-          concatenatedChunks.push(currentChunk);
-          currentChunk = this.cloneChunk(nextChunk);
-          continue;
-        }
-        currentChunk.content += `${currentChunk.content.endsWith("\n") ? "" : "\n"}${nextChunk.content}`;
-        currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
-        currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
-      } else {
-        currentChunk = this.cloneChunk(nextChunk);
-      }
-    }
-    if (currentChunk) {
-      concatenatedChunks.push(currentChunk);
+  const re = minimatch.makeRe(pattern, { dot: true });
+  if (!re) throw new Error(`Invalid glob pattern: ${pattern}`);
+  return re;
+}
+function matchesAnyPattern(path2, patterns) {
+  if (!patterns || patterns.length === 0) return false;
+  const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
+  return patterns.some((pattern) => {
+    if (isRegexPattern(pattern)) {
+      return patternToRegExp(pattern).test(normalizedPath);
     }
-
-
-
-
-
-
-
-
-
-
-
-  }
-  /**
-   * H1 and H2 headings represent major conceptual breaks in the document.
-   * Preserving these splits helps maintain the document's logical structure.
-   */
-  startsNewMajorSection(chunk) {
-    return chunk.section.level === 1 || chunk.section.level === 2;
+    const pathForMatch = normalizedPath.replace(/^\//, "");
+    const patternForMatch = pattern.startsWith("/") ? pattern.slice(1) : pattern;
+    return minimatch(pathForMatch, patternForMatch, { dot: true });
+  });
+}
+function extractPathAndQuery(url) {
+  try {
+    const u = new URL(url);
+    return u.pathname + (u.search || "");
+  } catch {
+    return url;
   }
-
-
-
-
-
-
-
+}
+function shouldIncludeUrl(url, includePatterns, excludePatterns) {
+  const path2 = extractPathAndQuery(url);
+  const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
+  let basename;
+  if (url.startsWith("file://")) {
+    try {
+      const u = new URL(url);
+      basename = u.pathname ? u.pathname.split("/").pop() : void 0;
+    } catch {
     }
-    return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
   }
-
-
-
-
-
-
+  const stripSlash = (patterns) => patterns?.map((p) => p.startsWith("/") ? p.slice(1) : p);
+  const effectiveExcludePatterns = getEffectiveExclusionPatterns(excludePatterns);
+  if (matchesAnyPattern(url, effectiveExcludePatterns) || matchesAnyPattern(normalizedPath, effectiveExcludePatterns) || basename && matchesAnyPattern(basename, stripSlash(effectiveExcludePatterns)))
+    return false;
+  if (!includePatterns || includePatterns.length === 0) return true;
+  return matchesAnyPattern(url, includePatterns) || matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
+}
+function computeBaseDirectory(pathname) {
+  if (pathname === "") return "/";
+  if (pathname.endsWith("/")) return pathname;
+  const lastSegment = pathname.split("/").at(-1) || "";
+  const looksLikeFile = lastSegment.includes(".");
+  if (looksLikeFile) {
+    return pathname.replace(/\/[^/]*$/, "/");
   }
-
-
+  return `${pathname}/`;
+}
+function isInScope(baseUrl, targetUrl, scope) {
+  if (baseUrl.protocol !== targetUrl.protocol) return false;
+  switch (scope) {
+    case "subpages": {
+      if (baseUrl.hostname !== targetUrl.hostname) return false;
+      const baseDir = computeBaseDirectory(baseUrl.pathname);
+      return targetUrl.pathname.startsWith(baseDir);
+    }
+    case "hostname":
+      return baseUrl.hostname === targetUrl.hostname;
+    case "domain": {
+      return extractPrimaryDomain(baseUrl.hostname) === extractPrimaryDomain(targetUrl.hostname);
+    }
+    default:
+      return false;
+  }
+}
+const DEFAULT_MAX_DEPTH = 3;
+const DEFAULT_CONCURRENCY = 3;
+class BaseScraperStrategy {
+  /**
+   * Set of normalized URLs that have been marked for processing.
+   *
+   * IMPORTANT: URLs are added to this set BEFORE they are actually processed, not after.
+   * This prevents the same URL from being queued multiple times when discovered from different sources.
+   *
+   * Usage flow:
+   * 1. Initial queue setup: Root URL and initialQueue items are added to visited
+   * 2. During processing: When a page returns links, each link is checked against visited
+   * 3. In processBatch deduplication: Only links NOT in visited are added to the queue AND to visited
+   *
+   * This approach ensures:
+   * - No URL is processed more than once
+   * - No URL appears in the queue multiple times
+   * - Efficient deduplication across concurrent processing
+   */
+  visited = /* @__PURE__ */ new Set();
+  pageCount = 0;
+  totalDiscovered = 0;
+  // Track total URLs discovered (unlimited)
+  effectiveTotal = 0;
+  // Track effective total (limited by maxPages)
+  canonicalBaseUrl;
+  options;
+  constructor(options = {}) {
+    this.options = options;
+  }
+  /**
+   * Determines if a URL should be processed based on scope and include/exclude patterns in ScraperOptions.
+   * Scope is checked first, then patterns.
+   */
+  shouldProcessUrl(url, options) {
+    if (options.scope) {
+      try {
+        const base = this.canonicalBaseUrl ?? new URL$1(options.url);
+        const target = new URL$1(url);
+        if (!isInScope(base, target, options.scope)) return false;
+      } catch {
+        return false;
+      }
+    }
+    return shouldIncludeUrl(url, options.includePatterns, options.excludePatterns);
+  }
+  async processBatch(batch, baseUrl, options, progressCallback, signal) {
+    const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
+    const results = await Promise.all(
+      batch.map(async (item) => {
+        if (signal?.aborted) {
+          throw new CancellationError("Scraping cancelled during batch processing");
+        }
+        const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
+        if (item.depth > maxDepth) {
+          return [];
+        }
+        try {
+          const result = await this.processItem(item, options, signal);
+          const shouldCount = item.pageId !== void 0 || result.content !== void 0;
+          let currentPageCount = this.pageCount;
+          if (shouldCount) {
+            currentPageCount = ++this.pageCount;
+            logger.info(
+              `🌐 Scraping page ${currentPageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`
+            );
+          }
+          if (result.status === FetchStatus.NOT_MODIFIED) {
+            logger.debug(`Page unchanged (304): ${item.url}`);
+            if (shouldCount) {
+              await progressCallback({
+                pagesScraped: currentPageCount,
+                totalPages: this.effectiveTotal,
+                totalDiscovered: this.totalDiscovered,
+                currentUrl: item.url,
+                depth: item.depth,
+                maxDepth,
+                result: null,
+                pageId: item.pageId
+              });
+            }
+            return [];
+          }
+          if (result.status === FetchStatus.NOT_FOUND) {
+            logger.debug(`Page deleted (404): ${item.url}`);
+            if (shouldCount) {
+              await progressCallback({
+                pagesScraped: currentPageCount,
+                totalPages: this.effectiveTotal,
+                totalDiscovered: this.totalDiscovered,
+                currentUrl: item.url,
+                depth: item.depth,
+                maxDepth,
+                result: null,
+                pageId: item.pageId,
+                deleted: true
+              });
+            }
+            return [];
+          }
+          if (result.status !== FetchStatus.SUCCESS) {
+            logger.error(`Unknown fetch status: ${result.status}`);
+            return [];
+          }
+          const finalUrl = result.url || item.url;
+          if (result.content) {
+            await progressCallback({
+              pagesScraped: currentPageCount,
+              totalPages: this.effectiveTotal,
+              totalDiscovered: this.totalDiscovered,
+              currentUrl: finalUrl,
+              depth: item.depth,
+              maxDepth,
+              result: {
+                url: finalUrl,
+                title: result.content.title?.trim() || result.title?.trim() || "",
+                contentType: result.contentType || "",
+                textContent: result.content.textContent || "",
+                links: result.content.links || [],
+                errors: result.content.errors || [],
+                chunks: result.content.chunks || [],
+                etag: result.etag || null,
+                lastModified: result.lastModified || null
+              },
+              pageId: item.pageId
+            });
+          }
+          const nextItems = result.links || [];
+          const linkBaseUrl = finalUrl ? new URL$1(finalUrl) : baseUrl;
+          return nextItems.map((value) => {
+            try {
+              const targetUrl = new URL$1(value, linkBaseUrl);
+              if (!this.shouldProcessUrl(targetUrl.href, options)) {
+                return null;
+              }
+              return {
+                url: targetUrl.href,
+                depth: item.depth + 1
+              };
+            } catch (_error) {
+              logger.warn(`❌ Invalid URL: ${value}`);
+            }
+            return null;
+          }).filter((item2) => item2 !== null);
+        } catch (error) {
+          if (options.ignoreErrors) {
+            logger.error(`❌ Failed to process ${item.url}: ${error}`);
+            return [];
+          }
+          throw error;
+        }
+      })
+    );
+    const allLinks = results.flat();
+    const uniqueLinks = [];
+    for (const item of allLinks) {
+      const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
+      if (!this.visited.has(normalizedUrl)) {
+        this.visited.add(normalizedUrl);
+        uniqueLinks.push(item);
+        this.totalDiscovered++;
+        if (this.effectiveTotal < maxPages) {
+          this.effectiveTotal++;
+        }
+      }
+    }
+    return uniqueLinks;
+  }
+  async scrape(options, progressCallback, signal) {
+    this.visited.clear();
+    this.pageCount = 0;
+    const initialQueue = options.initialQueue || [];
+    const isRefreshMode = initialQueue.length > 0;
+    this.canonicalBaseUrl = new URL$1(options.url);
+    let baseUrl = this.canonicalBaseUrl;
+    const queue = [];
+    const normalizedRootUrl = normalizeUrl(
+      options.url,
+      this.options.urlNormalizerOptions
+    );
+    if (isRefreshMode) {
+      logger.debug(
+        `Starting refresh mode with ${initialQueue.length} pre-populated pages`
+      );
+      for (const item of initialQueue) {
+        const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
+        if (!this.visited.has(normalizedUrl)) {
+          this.visited.add(normalizedUrl);
+          queue.push(item);
+        }
+      }
+    }
+    if (!this.visited.has(normalizedRootUrl)) {
+      this.visited.add(normalizedRootUrl);
+      queue.unshift({ url: options.url, depth: 0 });
+    }
+    this.totalDiscovered = queue.length;
+    this.effectiveTotal = queue.length;
+    const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
+    const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
+    while (queue.length > 0 && this.pageCount < maxPages) {
+      if (signal?.aborted) {
+        logger.debug(`${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal.`);
+        throw new CancellationError(
+          `${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal`
+        );
+      }
+      const remainingPages = maxPages - this.pageCount;
+      if (remainingPages <= 0) {
+        break;
+      }
+      const batchSize = Math.min(maxConcurrency, remainingPages, queue.length);
+      const batch = queue.splice(0, batchSize);
+      baseUrl = this.canonicalBaseUrl ?? baseUrl;
+      const newUrls = await this.processBatch(
+        batch,
+        baseUrl,
+        options,
+        progressCallback,
+        signal
+      );
+      queue.push(...newUrls);
+    }
+  }
+  /**
+   * Cleanup resources used by this strategy.
+   * Default implementation does nothing - override in derived classes as needed.
+   */
+  async cleanup() {
+  }
+}
+class SplitterError extends Error {
+}
+class MinimumChunkSizeError extends SplitterError {
+  constructor(size, maxSize) {
+    super(
+      `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
+    );
+  }
+}
+class ContentSplitterError extends SplitterError {
+}
+class GreedySplitter {
+  baseSplitter;
+  minChunkSize;
+  preferredChunkSize;
+  maxChunkSize;
+  /**
+   * Combines a base document splitter with size constraints to produce optimally-sized chunks.
+   * The base splitter handles the initial semantic splitting, while this class handles
+   * the concatenation strategy.
+   */
+  constructor(baseSplitter, minChunkSize, preferredChunkSize, maxChunkSize) {
+    this.baseSplitter = baseSplitter;
+    this.minChunkSize = minChunkSize;
+    this.preferredChunkSize = preferredChunkSize;
+    this.maxChunkSize = maxChunkSize;
+  }
+  /**
+   * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
+   * are combined until they reach the minimum size, but splits are preserved at major
+   * section boundaries to maintain document structure. This balances the need for
+   * context with semantic coherence.
+   */
+  async splitText(markdown, contentType) {
+    const initialChunks = await this.baseSplitter.splitText(markdown, contentType);
+    const concatenatedChunks = [];
+    let currentChunk = null;
+    for (const nextChunk of initialChunks) {
+      if (nextChunk.content.length > this.maxChunkSize) {
+        logger.warn(
+          `⚠ Chunk from base splitter exceeds max size: ${nextChunk.content.length} > ${this.maxChunkSize}`
+        );
+      }
+      if (currentChunk) {
+        const combinedSize = currentChunk.content.length + nextChunk.content.length;
+        if (combinedSize > this.maxChunkSize) {
+          concatenatedChunks.push(currentChunk);
+          currentChunk = this.cloneChunk(nextChunk);
+          continue;
+        }
+        if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk) && !this.isSameSection(currentChunk, nextChunk)) {
+          concatenatedChunks.push(currentChunk);
+          currentChunk = this.cloneChunk(nextChunk);
+          continue;
+        }
+        if (combinedSize > this.preferredChunkSize && currentChunk.content.length >= this.minChunkSize && nextChunk.content.length >= this.minChunkSize) {
+          concatenatedChunks.push(currentChunk);
+          currentChunk = this.cloneChunk(nextChunk);
+          continue;
+        }
+        currentChunk.content += `${currentChunk.content.endsWith("\n") ? "" : "\n"}${nextChunk.content}`;
+        currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
+        currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
+      } else {
+        currentChunk = this.cloneChunk(nextChunk);
+      }
+    }
+    if (currentChunk) {
+      concatenatedChunks.push(currentChunk);
+    }
+    return concatenatedChunks;
+  }
+  cloneChunk(chunk) {
+    return {
+      types: [...chunk.types],
+      content: chunk.content,
+      section: {
+        level: chunk.section.level,
+        path: [...chunk.section.path]
+      }
+    };
+  }
+  /**
+   * H1 and H2 headings represent major conceptual breaks in the document.
+   * Preserving these splits helps maintain the document's logical structure.
+   */
+  startsNewMajorSection(chunk) {
+    return chunk.section.level === 1 || chunk.section.level === 2;
+  }
+  /**
+   * Checks if two chunks belong to the same section by comparing their paths.
+   * Returns true if the paths are identical or if one is a parent of the other.
+   */
+  isSameSection(chunk1, chunk2) {
+    const path1 = chunk1.section.path;
+    const path2 = chunk2.section.path;
+    if (path1.length === path2.length && path1.every((part, i) => part === path2[i])) {
+      return true;
+    }
+    return this.isPathIncluded(path1, path2) || this.isPathIncluded(path2, path1);
+  }
+  /**
+   * Checks if one path is a prefix of another path, indicating a parent-child relationship
+   */
+  isPathIncluded(parentPath, childPath) {
+    if (parentPath.length >= childPath.length) return false;
+    return parentPath.every((part, i) => part === childPath[i]);
+  }
+  /**
+   * Merges section metadata when concatenating chunks, following these rules:
    * 1. Level: Always uses the lowest (most general) level between chunks
    * 2. Path selection:
    *    - For parent-child relationships (one path includes the other), uses the child's path
@@ -4195,7 +4686,7 @@ class HtmlMetadataExtractorMiddleware {
       }
       title = title || "Untitled";
       title = title.replace(/\s+/g, " ").trim();
-      context.
+      context.title = title;
       logger.debug(`Extracted title: "${title}" from ${context.source}`);
     } catch (error) {
       logger.error(`❌ Error extracting metadata from ${context.source}: ${error}`);
@@ -4653,7 +5144,7 @@ ${frame.content}
    * @param next The next middleware function in the pipeline.
    */
   async process(context, next) {
-    const contentType = context.options?.headers?.["content-type"] || context.
+    const contentType = context.options?.headers?.["content-type"] || context.contentType;
     if (contentType && typeof contentType === "string" && !MimeTypeUtils.isHtml(contentType)) {
       logger.debug(
         `Skipping Playwright rendering for ${context.source} - content type '${contentType}' is not HTML`
@@ -5014,6 +5505,7 @@ ${text.replace(/^\n+|\n+$/g, "")}
         context.content = markdown;
         logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
       }
+      context.contentType = "text/markdown";
     } catch (error) {
       logger.error(
         `❌ Error converting HTML to Markdown for ${context.source}: ${error}`
@@ -5053,7 +5545,7 @@ class MarkdownMetadataExtractorMiddleware {
       if (match?.[1]) {
         title = match[1].trim();
       }
-      context.
+      context.title = title;
     } catch (error) {
       context.errors.push(
         new Error(
@@ -5225,10 +5717,10 @@ function convertToString(content, charset) {
 }
 class BasePipeline {
   /**
-   * Determines if this pipeline can process the given
+   * Determines if this pipeline can process content with the given MIME type.
    * Must be implemented by derived classes.
    */
-  canProcess(
+  canProcess(_mimeType, _content) {
     throw new Error("Method not implemented.");
   }
   /**
@@ -5289,11 +5781,12 @@ class HtmlPipeline extends BasePipeline {
     this.greedySplitter = new GreedySplitter(
       semanticSplitter,
       SPLITTER_MIN_CHUNK_SIZE,
-      preferredChunkSize
+      preferredChunkSize,
+      maxChunkSize
     );
   }
-  canProcess(
-    return MimeTypeUtils.isHtml(
+  canProcess(mimeType) {
+    return MimeTypeUtils.isHtml(mimeType);
   }
   async process(rawContent, options, fetcher) {
     const resolvedCharset = resolveCharset(
@@ -5304,8 +5797,9 @@ class HtmlPipeline extends BasePipeline {
     const contentString = convertToString(rawContent.content, resolvedCharset);
     const context = {
       content: contentString,
+      contentType: rawContent.mimeType || "text/html",
       source: rawContent.source,
-      metadata: {},
+      // metadata: {},
       links: [],
       errors: [],
       options,
@@ -5320,8 +5814,9 @@ class HtmlPipeline extends BasePipeline {
       typeof context.content === "string" ? context.content : ""
     );
     return {
-
-
+      title: context.title,
+      contentType: context.contentType,
+      textContent: context.content,
       links: context.links,
       errors: context.errors,
       chunks
@@ -5345,9 +5840,9 @@ class JsonPipeline extends BasePipeline {
       preserveFormatting: true
     });
   }
-  canProcess(
-    if (!
-    return MimeTypeUtils.isJson(
+  canProcess(mimeType) {
+    if (!mimeType) return false;
+    return MimeTypeUtils.isJson(mimeType);
   }
   async process(rawContent, options, fetcher) {
     const contentString = convertToString(rawContent.content, rawContent.charset);
@@ -5362,22 +5857,25 @@ class JsonPipeline extends BasePipeline {
       const fallbackChunks = await this.splitter.splitText(contentString);
       return {
         textContent: contentString,
-        metadata: {
-
-        },
+        // metadata: {
+        //   isValidJson: false,
+        // },
         links: [],
         errors: [],
         chunks: fallbackChunks
       };
     }
+    const metadata = this.extractMetadata(parsedJson);
     const context = {
       content: contentString,
       source: rawContent.source,
-
-
-
-
-
+      title: metadata.title,
+      contentType: rawContent.mimeType || "application/json",
+      // metadata: {
+      //   ...this.extractMetadata(parsedJson),
+      //   isValidJson,
+      //   jsonStructure: this.analyzeJsonStructure(parsedJson),
+      // },
       links: [],
       // JSON files typically don't contain links
       errors: [],
@@ -5387,8 +5885,9 @@ class JsonPipeline extends BasePipeline {
     await this.executeMiddlewareStack(this.middleware, context);
     const chunks = await this.splitter.splitText(context.content);
     return {
+      title: context.title,
+      contentType: context.contentType,
       textContent: context.content,
-      metadata: context.metadata,
       links: context.links,
       errors: context.errors,
       chunks
@@ -5418,30 +5917,6 @@ class JsonPipeline extends BasePipeline {
     }
     return metadata;
   }
-  /**
-   * Analyzes the structure of valid JSON for metadata
-   */
-  analyzeJsonStructure(parsedJson) {
-    if (Array.isArray(parsedJson)) {
-      return {
-        type: "array",
-        depth: this.calculateDepth(parsedJson),
-        itemCount: parsedJson.length
-      };
-    } else if (typeof parsedJson === "object" && parsedJson !== null) {
-      const obj = parsedJson;
-      return {
-        type: "object",
-        depth: this.calculateDepth(parsedJson),
-        propertyCount: Object.keys(obj).length
-      };
-    } else {
-      return {
-        type: typeof parsedJson,
-        depth: 1
-      };
-    }
-  }
   /**
    * Calculates the maximum nesting depth of a JSON structure
    */
@@ -5482,19 +5957,20 @@ class MarkdownPipeline extends BasePipeline {
     this.greedySplitter = new GreedySplitter(
       semanticSplitter,
      SPLITTER_MIN_CHUNK_SIZE,
-      preferredChunkSize
+      preferredChunkSize,
+      maxChunkSize
     );
   }
-  canProcess(
-    if (!
-    return MimeTypeUtils.isMarkdown(
+  canProcess(mimeType) {
+    if (!mimeType) return false;
+    return MimeTypeUtils.isMarkdown(mimeType);
   }
   async process(rawContent, options, fetcher) {
     const contentString = convertToString(rawContent.content, rawContent.charset);
     const context = {
+      contentType: rawContent.mimeType || "text/markdown",
       content: contentString,
       source: rawContent.source,
-      metadata: {},
       links: [],
       errors: [],
       options,
@@ -5506,8 +5982,9 @@ class MarkdownPipeline extends BasePipeline {
       rawContent.mimeType
     );
     return {
+      title: context.title,
+      contentType: context.contentType,
       textContent: typeof context.content === "string" ? context.content : "",
-      metadata: context.metadata,
       links: context.links,
       errors: context.errors,
       chunks
@@ -5517,24 +5994,27 @@
 class SourceCodePipeline extends BasePipeline {
   middleware;
   splitter;
-  constructor(
+  constructor(_preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE) {
     super();
     this.middleware = [];
-    this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize
+    this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize });
   }
-  canProcess(
-    if (!
-    return MimeTypeUtils.isSourceCode(
+  canProcess(mimeType) {
+    if (!mimeType) return false;
+    return MimeTypeUtils.isSourceCode(mimeType);
   }
   async process(rawContent, options, fetcher) {
     const contentString = convertToString(rawContent.content, rawContent.charset);
     const context = {
+      contentType: rawContent.mimeType || "text/plain",
       content: contentString,
       source: rawContent.source,
-      metadata: {
-
-
-
+      // metadata: {
+      //   language: rawContent.mimeType
+      //     ? MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType)
+      //     : "text",
+      //   isSourceCode: true,
+      // },
       links: [],
       // Source code files typically don't contain web links
       errors: [],
@@ -5544,8 +6024,10 @@ class SourceCodePipeline extends BasePipeline {
     await this.executeMiddlewareStack(this.middleware, context);
     const chunks = await this.splitter.splitText(context.content, rawContent.mimeType);
     return {
+      title: context.title,
+      contentType: context.contentType,
       textContent: context.content,
-      metadata: context.metadata,
+      // metadata: context.metadata,
      links: context.links,
       errors: context.errors,
       chunks
@@ -5594,17 +6076,22 @@ class TextDocumentSplitter {
 class TextPipeline extends BasePipeline {
   middleware;
   splitter;
-  constructor(
+  constructor(preferredChunkSize = SPLITTER_PREFERRED_CHUNK_SIZE, maxChunkSize = SPLITTER_MAX_CHUNK_SIZE) {
     super();
     this.middleware = [];
-    const textSplitter = new TextDocumentSplitter({ maxChunkSize
-    this.splitter = new GreedySplitter(
+    const textSplitter = new TextDocumentSplitter({ maxChunkSize });
+    this.splitter = new GreedySplitter(
+      textSplitter,
+      SPLITTER_MIN_CHUNK_SIZE,
+      preferredChunkSize,
+      maxChunkSize
+    );
   }
-  canProcess(
-    if (!MimeTypeUtils.isSafeForTextProcessing(
+  canProcess(mimeType, content) {
+    if (!MimeTypeUtils.isSafeForTextProcessing(mimeType)) {
       return false;
     }
-    if (MimeTypeUtils.isBinary(
+    if (content && MimeTypeUtils.isBinary(content)) {
       return false;
     }
     return true;
@@ -5612,12 +6099,11 @@ class TextPipeline extends BasePipeline {
   async process(rawContent, options, fetcher) {
     const contentString = convertToString(rawContent.content, rawContent.charset);
     const context = {
+      title: "",
+      // Title extraction can be added in middleware if needed
+      contentType: rawContent.mimeType || "text/plain",
       content: contentString,
       source: rawContent.source,
-      metadata: {
-        contentType: rawContent.mimeType || "text/plain",
-        isGenericText: true
-      },
       links: [],
       // Generic text content typically doesn't contain structured links
      errors: [],
@@ -5627,394 +6113,283 @@ class TextPipeline extends BasePipeline {
  await this.executeMiddlewareStack(this.middleware, context);
  const chunks = await this.splitter.splitText(context.content, rawContent.mimeType);
  return {
+ title: context.title,
+ contentType: context.contentType,
  textContent: context.content,
- metadata: context.metadata,
  links: context.links,
  errors: context.errors,
- chunks
- };
- }
- }
- let PipelineFactory$1 = class PipelineFactory {
- /**
- * Creates the standard set of content pipelines used by all scraper strategies.
- * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
- * Each pipeline now handles both preprocessing and content-specific splitting.
- * TextPipeline is placed last as the universal fallback for unknown content types.
- *
- * @param config - Optional configuration for pipeline chunk sizes
- * @returns Array of content pipelines in processing order
- */
- static createStandardPipelines(config) {
- const preferredChunkSize = config?.chunkSizes?.preferred ?? SPLITTER_PREFERRED_CHUNK_SIZE;
- const maxChunkSize = config?.chunkSizes?.max ?? 2e3;
- return [
- new JsonPipeline(preferredChunkSize),
- new SourceCodePipeline(preferredChunkSize),
- new HtmlPipeline(preferredChunkSize, maxChunkSize),
- new MarkdownPipeline(preferredChunkSize, maxChunkSize),
- new TextPipeline(preferredChunkSize)
- // Universal fallback - must be last
- ];
- }
- };
- const DEFAULT_FILE_EXCLUSIONS = [
- // CHANGELOG files (case variations)
- "**/CHANGELOG.md",
- "**/changelog.md",
- "**/CHANGELOG.mdx",
- "**/changelog.mdx",
- // LICENSE files (case variations)
- "**/LICENSE",
- "**/LICENSE.md",
- "**/license.md",
- // CODE_OF_CONDUCT files (case variations)
- "**/CODE_OF_CONDUCT.md",
- "**/code_of_conduct.md",
- // Test files
- "**/*.test.*",
- "**/*.spec.*",
- "**/*_test.py",
- "**/*_test.go",
- // Package manager lock files
- "**/*.lock",
- "**/package-lock.json",
- "**/yarn.lock",
- "**/pnpm-lock.yaml",
- "**/go.sum",
- // Build artifacts
- "**/*.min.js",
- "**/*.min.css",
- "**/*.map",
- "**/*.d.ts",
- // IDE/System files
- "**/.DS_Store",
- "**/Thumbs.db",
- "**/*.swp",
- "**/*.swo",
- // Internal config files (using regex pattern)
- "/.*\\.(ini|cfg|conf|log|pid)$/"
- ];
- const DEFAULT_FOLDER_EXCLUSIONS = [
- // Archive and deprecated content (matches anywhere in path)
- "**/archive/**",
- "**/archived/**",
- "**/deprecated/**",
- "**/legacy/**",
- "**/old/**",
- "**/outdated/**",
- "**/previous/**",
- "**/superseded/**",
- // Specific paths that don't follow the general pattern
- "docs/old/**",
- // Test directories
- "**/test/**",
- "**/tests/**",
- "**/__tests__/**",
- "**/spec/**",
- // Build output directories
- "**/dist/**",
- "**/build/**",
- "**/out/**",
- "**/target/**",
- "**/.next/**",
- "**/.nuxt/**",
- // IDE directories
- "**/.vscode/**",
- "**/.idea/**",
- // Internationalization folders - non-English locales
- "**/i18n/ar*/**",
- "**/i18n/de*/**",
- "**/i18n/es*/**",
- "**/i18n/fr*/**",
- "**/i18n/hi*/**",
- "**/i18n/it*/**",
- "**/i18n/ja*/**",
- "**/i18n/ko*/**",
- "**/i18n/nl*/**",
- "**/i18n/pl*/**",
- "**/i18n/pt*/**",
- "**/i18n/ru*/**",
- "**/i18n/sv*/**",
- "**/i18n/th*/**",
- "**/i18n/tr*/**",
- "**/i18n/vi*/**",
- "**/i18n/zh*/**",
- // Common locale folder patterns
- "**/zh-cn/**",
- "**/zh-hk/**",
- "**/zh-mo/**",
- "**/zh-sg/**",
- "**/zh-tw/**"
- ];
- const DEFAULT_EXCLUSION_PATTERNS = [
- ...DEFAULT_FILE_EXCLUSIONS,
- ...DEFAULT_FOLDER_EXCLUSIONS
- ];
- function getEffectiveExclusionPatterns(userPatterns) {
- if (userPatterns !== void 0) {
- return userPatterns;
+ chunks
+ };
  }
- return DEFAULT_EXCLUSION_PATTERNS;
- }
- function isRegexPattern(pattern) {
- return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
  }
-
-
-
+ let PipelineFactory$1 = class PipelineFactory {
+ /**
+ * Creates the standard set of content pipelines used by all scraper strategies.
+ * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
+ * Each pipeline now handles both preprocessing and content-specific splitting.
+ * TextPipeline is placed last as the universal fallback for unknown content types.
+ *
+ * @param config - Optional configuration for pipeline chunk sizes
+ * @returns Array of content pipelines in processing order
+ */
+ static createStandardPipelines(config) {
+ const preferredChunkSize = config?.chunkSizes?.preferred ?? SPLITTER_PREFERRED_CHUNK_SIZE;
+ const maxChunkSize = config?.chunkSizes?.max ?? 2e3;
+ return [
+ new JsonPipeline(preferredChunkSize),
+ new SourceCodePipeline(preferredChunkSize, maxChunkSize),
+ new HtmlPipeline(preferredChunkSize, maxChunkSize),
+ new MarkdownPipeline(preferredChunkSize, maxChunkSize),
+ new TextPipeline(preferredChunkSize, maxChunkSize)
+ // Universal fallback - must be last
+ ];
  }
-
-
-
-
-
-
- const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
- return patterns.some((pattern) => {
- if (isRegexPattern(pattern)) {
- return patternToRegExp(pattern).test(normalizedPath);
- }
- return minimatch(normalizedPath.replace(/^\//, ""), pattern, { dot: true });
- });
- }
- function extractPathAndQuery(url) {
- try {
- const u = new URL(url);
- return u.pathname + (u.search || "");
- } catch {
- return url;
+ };
+ class GitHubRepoProcessor {
+ httpFetcher = new HttpFetcher();
+ pipelines;
+ constructor() {
+ this.pipelines = PipelineFactory$1.createStandardPipelines();
  }
-
-
-
-
-
-
-
-
-
-
+ /**
+ * Parses an HTTPS blob URL to extract repository information.
+ * Format: https://github.com/owner/repo/blob/branch/filepath
+ */
+ parseHttpsBlobUrl(url) {
+ const parsedUrl = new URL(url);
+ const segments = parsedUrl.pathname.split("/").filter(Boolean);
+ if (segments.length < 5 || segments[2] !== "blob") {
+ throw new Error(
+ `Invalid GitHub blob URL format. Expected: https://github.com/owner/repo/blob/branch/filepath. Got: ${url}`
+ );
  }
+ const owner = segments[0];
+ const repo = segments[1];
+ const branch = segments[3];
+ const filePath = segments.slice(4).join("/");
+ return { owner, repo, branch, filePath };
  }
-
-
-
-
-
-
- }
-
-
-
-
-
-
-
+ /**
+ * Fetches the raw content of a file from GitHub.
+ */
+ async fetchFileContent(repoInfo, filePath, etag, signal) {
+ const { owner, repo, branch } = repoInfo;
+ const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
+ const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag });
+ const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
+ if (detectedMimeType && rawContent.mimeType === "text/plain") {
+ return {
+ ...rawContent,
+ mimeType: detectedMimeType
+ };
+ }
+ return rawContent;
  }
-
-
-
-
-
-
-
-
-
+ /**
+ * Processes a single GitHub repository file from an HTTPS blob URL.
+ */
+ async process(item, options, signal) {
+ const repoInfo = this.parseHttpsBlobUrl(item.url);
+ const { owner, repo, branch, filePath } = repoInfo;
+ const rawContent = await this.fetchFileContent(
+ { owner, repo, branch },
+ filePath,
+ item.etag,
+ signal
+ );
+ if (rawContent.status !== FetchStatus.SUCCESS) {
+ return { url: item.url, links: [], status: rawContent.status };
  }
-
-
-
-
+ let processed;
+ for (const pipeline of this.pipelines) {
+ const contentBuffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
+ if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
+ logger.debug(
+ `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
+ );
+ const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
+ processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
+ break;
+ }
  }
-
-
+ if (!processed) {
+ logger.warn(
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
+ );
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
+ }
+ for (const err of processed.errors ?? []) {
+ logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
+ }
+ const githubUrl = `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`;
+ const filename = filePath.split("/").pop() || "Untitled";
+ return {
+ url: githubUrl,
+ title: processed.title?.trim() || filename || "Untitled",
+ etag: rawContent.etag,
+ lastModified: rawContent.lastModified,
+ contentType: rawContent.mimeType,
+ content: processed,
+ links: [],
+ // Always return empty links array for individual files
+ status: FetchStatus.SUCCESS
+ };
+ }
+ /**
+ * Cleanup resources used by this processor.
+ */
+ async cleanup() {
+ await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
  }
  }
-
-
-
-
-
- totalDiscovered = 0;
- // Track total URLs discovered (unlimited)
- effectiveTotal = 0;
- // Track effective total (limited by maxPages)
- canonicalBaseUrl;
- options;
- constructor(options = {}) {
- this.options = options;
+ class GitHubWikiProcessor {
+ httpFetcher = new HttpFetcher();
+ pipelines;
+ constructor() {
+ this.pipelines = PipelineFactory$1.createStandardPipelines();
  }
  /**
- *
-
+ * Parses a GitHub wiki URL to extract repository information.
+ */
+ parseGitHubWikiUrl(url) {
+ const parsedUrl = new URL(url);
+ const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/);
+ if (!match) {
+ throw new Error(`Invalid GitHub wiki URL: ${url}`);
+ }
+ const [, owner, repo] = match;
+ return { owner, repo };
+ }
+ /**
+ * Determines if a URL should be processed within the wiki scope.
  */
  shouldProcessUrl(url, options) {
-
-
-
-
-
- } catch {
+ try {
+ const parsedUrl = new URL(url);
+ const baseWikiInfo = this.parseGitHubWikiUrl(options.url);
+ const expectedWikiPath = `/${baseWikiInfo.owner}/${baseWikiInfo.repo}/wiki`;
+ if (!parsedUrl.pathname.startsWith(expectedWikiPath)) {
  return false;
  }
+ const wikiPagePath = parsedUrl.pathname.replace(expectedWikiPath, "").replace(/^\//, "");
+ return shouldIncludeUrl(
+ wikiPagePath || "Home",
+ options.includePatterns,
+ options.excludePatterns
+ );
+ } catch {
+ return false;
  }
- return shouldIncludeUrl(url, options.includePatterns, options.excludePatterns);
  }
-
-
-
-
-
-
-
-
-
-
-
- }
-
-
-
-
-
-
-
-
-
-
- `Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`
- );
- } else {
- this.canonicalBaseUrl = original;
- }
- } catch {
- this.canonicalBaseUrl = new URL$1(options.url);
- }
- }
- if (result.document) {
- this.pageCount++;
- logger.info(
- `🌐 Scraping page ${this.pageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`
- );
- await progressCallback({
- pagesScraped: this.pageCount,
- totalPages: this.effectiveTotal,
- totalDiscovered: this.totalDiscovered,
- currentUrl: item.url,
- depth: item.depth,
- maxDepth,
- document: result.document
- });
- }
- const nextItems = result.links || [];
- return nextItems.map((value) => {
- try {
- const targetUrl = new URL$1(value, baseUrl);
- if (!this.shouldProcessUrl(targetUrl.href, options)) {
- return null;
- }
- return {
- url: targetUrl.href,
- depth: item.depth + 1
- };
- } catch (_error) {
- logger.warn(`❌ Invalid URL: ${value}`);
- }
- return null;
- }).filter((item2) => item2 !== null);
- } catch (error) {
- if (options.ignoreErrors) {
- logger.error(`❌ Failed to process ${item.url}: ${error}`);
- return [];
- }
- throw error;
- }
- })
- );
- const allLinks = results.flat();
- const uniqueLinks = [];
- for (const item of allLinks) {
- const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
- if (!this.visited.has(normalizedUrl)) {
- this.visited.add(normalizedUrl);
- uniqueLinks.push(item);
- this.totalDiscovered++;
- if (this.effectiveTotal < maxPages) {
- this.effectiveTotal++;
+ /**
+ * Processes a single GitHub wiki page.
+ */
+ async process(item, options, signal) {
+ const currentUrl = item.url;
+ try {
+ const rawContent = await this.httpFetcher.fetch(currentUrl, {
+ signal,
+ etag: item.etag
+ });
+ if (rawContent.status !== FetchStatus.SUCCESS) {
+ return { url: currentUrl, links: [], status: rawContent.status };
+ }
+ let processed;
+ for (const pipeline of this.pipelines) {
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
+ logger.debug(
+ `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`
+ );
+ const wikiOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
+ processed = await pipeline.process(rawContent, wikiOptions, this.httpFetcher);
+ break;
  }
  }
-
-
- }
-
-
- this.pageCount = 0;
- this.totalDiscovered = 1;
- this.effectiveTotal = 1;
- this.canonicalBaseUrl = new URL$1(options.url);
- let baseUrl = this.canonicalBaseUrl;
- const queue = [{ url: options.url, depth: 0 }];
- this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
- const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
- while (queue.length > 0 && this.pageCount < maxPages) {
- if (signal?.aborted) {
- logger.debug("Scraping cancelled by signal.");
- throw new CancellationError("Scraping cancelled by signal");
+ if (!processed) {
+ logger.warn(
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`
+ );
+ return { url: currentUrl, links: [], status: FetchStatus.SUCCESS };
  }
- const
-
- break;
+ for (const err of processed.errors ?? []) {
+ logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`);
  }
- const
-
-
-
-
- )
-
-
-
-
-
-
-
-
-
-
+ const parsedUrl = new URL(currentUrl);
+ const wikiInfo = this.parseGitHubWikiUrl(currentUrl);
+ const wikiPagePath = parsedUrl.pathname.replace(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`, "").replace(/^\//, "");
+ const pageTitle = wikiPagePath || "Home";
+ const links = processed.links || [];
+ const wikiLinks = links.filter((link) => {
+ if (!link || link.trim() === "" || link === "invalid-url" || link === "not-a-url-at-all") {
+ return false;
+ }
+ return true;
+ }).map((link) => {
+ try {
+ return new URL(link, currentUrl).href;
+ } catch {
+ return null;
+ }
+ }).filter((link) => link !== null).filter((link) => {
+ try {
+ const linkUrl = new URL(link);
+ return linkUrl.hostname === parsedUrl.hostname && linkUrl.pathname.startsWith(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`);
+ } catch {
+ return false;
+ }
+ });
+ return {
+ url: currentUrl,
+ title: pageTitle,
+ etag: rawContent.etag,
+ lastModified: rawContent.lastModified,
+ contentType: rawContent.mimeType,
+ content: processed,
+ links: wikiLinks,
+ status: FetchStatus.SUCCESS
+ };
+ } catch (error) {
+ logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`);
+ return { url: currentUrl, links: [], status: FetchStatus.SUCCESS };
  }
  }
  /**
- * Cleanup resources used by this
- * Default implementation does nothing - override in derived classes as needed.
+ * Cleanup resources used by this processor.
  */
  async cleanup() {
+ await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
  }
  }
- class
+ class GitHubScraperStrategy extends BaseScraperStrategy {
  httpFetcher = new HttpFetcher();
-
-
- // Cache the resolved default branch
- constructor() {
- super();
- this.pipelines = PipelineFactory$1.createStandardPipelines();
- }
+ wikiProcessor = new GitHubWikiProcessor();
+ repoProcessor = new GitHubRepoProcessor();
  canHandle(url) {
- const { hostname } = new URL(url);
- return ["github.com", "www.github.com"].includes(hostname);
- }
- /**
- * Override shouldProcessUrl to handle github-file:// URLs specially.
- * These URLs bypass scope checking since they're internal file references.
- */
- shouldProcessUrl(url, options) {
  if (url.startsWith("github-file://")) {
-
-
+ return true;
+ }
+ try {
+ const parsedUrl = new URL(url);
+ const { hostname, pathname } = parsedUrl;
+ if (!["github.com", "www.github.com"].includes(hostname)) {
+ return false;
+ }
+ const baseMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
+ if (baseMatch) {
+ return true;
+ }
+ const treeMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/tree\//);
+ if (treeMatch) {
+ return true;
+ }
+ const blobMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/blob\//);
+ if (blobMatch) {
+ return true;
+ }
+ return false;
+ } catch {
+ return false;
  }
- return super.shouldProcessUrl(url, options);
  }
  /**
  * Parses a GitHub URL to extract repository information.
@@ -6028,20 +6403,19 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
  const [, owner, repo] = match;
  const segments = parsedUrl.pathname.split("/").filter(Boolean);
  if (segments.length >= 4 && segments[2] === "blob") {
- const
+ const branch = segments[3];
  const filePath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
- return { owner, repo, branch
+ return { owner, repo, branch, filePath, isBlob: true };
  }
- if (segments.length
-
+ if (segments.length >= 4 && segments[2] === "tree") {
+ const branch = segments[3];
+ const subPath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
+ return { owner, repo, branch, subPath };
  }
-
- const subPath = segments.length > 4 ? segments.slice(4).join("/") : void 0;
- return { owner, repo, branch, subPath };
+ return { owner, repo };
  }
  /**
  * Fetches the repository tree structure from GitHub API.
- * Uses 'HEAD' to get the default branch if no branch is specified.
  */
  async fetchRepositoryTree(repoInfo, signal) {
  const { owner, repo, branch } = repoInfo;
@@ -6060,7 +6434,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
  targetBranch = "main";
  }
  }
- this.resolvedBranch = targetBranch;
  const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`;
  logger.debug(`Fetching repository tree: ${treeUrl}`);
  const rawContent = await this.httpFetcher.fetch(treeUrl, { signal });
@@ -6082,14 +6455,12 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
  }
  const path2 = item.path;
  const textExtensions = [
- // Documentation
  ".md",
  ".mdx",
  ".txt",
  ".rst",
  ".adoc",
  ".asciidoc",
- // Web technologies
  ".html",
  ".htm",
  ".xml",
@@ -6097,7 +6468,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
  ".scss",
  ".sass",
  ".less",
- // Programming languages
  ".js",
  ".jsx",
  ".ts",
@@ -6133,7 +6503,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
  ".ps1",
  ".bat",
  ".cmd",
- // Configuration and data
  ".json",
  ".yaml",
  ".yml",
@@ -6147,7 +6516,6 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
  ".dockerignore",
  ".gitattributes",
  ".editorconfig",
- // Build and package management
  ".gradle",
  ".pom",
  ".sbt",
@@ -6156,10 +6524,7 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
  ".make",
  ".dockerfile",
  ".mod",
- // Go modules (go.mod)
  ".sum",
- // Go checksums (go.sum)
- // Other text formats
  ".sql",
  ".graphql",
  ".gql",
@@ -6172,20 +6537,16 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
  ];
  const pathLower = path2.toLowerCase();
  const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext));
- const hasCompoundExtension = pathLower.includes(".env.") ||
- pathLower.endsWith(".env") || pathLower.includes(".config.") || // webpack.config.js, etc.
- pathLower.includes(".lock");
+ const hasCompoundExtension = pathLower.includes(".env.") || pathLower.endsWith(".env") || pathLower.includes(".config.") || pathLower.includes(".lock");
  const fileName = path2.split("/").pop() || "";
  const fileNameLower = fileName.toLowerCase();
  const commonTextFiles = [
- // Documentation files without extensions
  "readme",
  "license",
  "changelog",
  "contributing",
  "authors",
  "maintainers",
- // Build files without extensions
  "dockerfile",
  "makefile",
  "rakefile",
@@ -6193,374 +6554,125 @@ class GitHubRepoScraperStrategy extends BaseScraperStrategy {
  "podfile",
  "cartfile",
  "brewfile",
- "procfile",
- "vagrantfile",
- "gulpfile",
- "gruntfile",
-
- ".
- ".
- ".
- ".
-
-
-
-
- return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
- }
- return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
- });
- if (!hasTextExtension && !hasCompoundExtension && !isCommonTextFile) {
- return false;
- }
- return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
- }
- /**
- * Fetches the raw content of a file from GitHub.
- */
- async fetchFileContent(repoInfo, filePath, signal) {
- const { owner, repo } = repoInfo;
- const branch = this.resolvedBranch || repoInfo.branch || "main";
- const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;
- const rawContent = await this.httpFetcher.fetch(rawUrl, { signal });
- const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath);
- if (detectedMimeType && rawContent.mimeType === "text/plain") {
- return {
- ...rawContent,
- mimeType: detectedMimeType
- };
- }
- return rawContent;
- }
- async processItem(item, options, _progressCallback, signal) {
- const repoInfo = this.parseGitHubUrl(options.url);
- if (item.depth === 0) {
- if ("isBlob" in repoInfo && repoInfo.isBlob) {
- if (repoInfo.filePath) {
- logger.info(
- `📄 Processing single file: ${repoInfo.owner}/${repoInfo.repo}/${repoInfo.filePath}`
- );
- return { links: [`github-file://${repoInfo.filePath}`] };
- } else {
- logger.warn(
- `⚠️ Blob URL without file path: ${options.url}. No files to process.`
- );
- return { links: [] };
- }
- }
- logger.info(
- `🗂️ Discovering repository structure for ${repoInfo.owner}/${repoInfo.repo}`
- );
- const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
- const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
- logger.info(
- `📁 Found ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
- );
- const links = fileItems.map((treeItem) => `github-file://${treeItem.path}`);
- return { links };
- }
- if (item.url.startsWith("github-file://")) {
- const filePath = item.url.replace("github-file://", "");
- logger.info(
- `🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`
- );
- const rawContent = await this.fetchFileContent(repoInfo, filePath, signal);
- let processed;
- for (const pipeline of this.pipelines) {
- if (pipeline.canProcess(rawContent)) {
- logger.debug(
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
- );
- const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
- processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher);
- break;
- }
- }
- if (!processed) {
- logger.warn(
- `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
- );
- return { document: void 0, links: [] };
- }
- for (const err of processed.errors) {
- logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
- }
- const githubUrl = `https://github.com/${repoInfo.owner}/${repoInfo.repo}/blob/${this.resolvedBranch || repoInfo.branch || "main"}/${filePath}`;
- const processedTitle = processed.metadata.title;
- const hasValidTitle = typeof processedTitle === "string" && processedTitle.trim() !== "";
- const fallbackTitle = filePath.split("/").pop() || "Untitled";
- return {
- document: {
- content: typeof processed.textContent === "string" ? processed.textContent : "",
- metadata: {
- url: githubUrl,
- title: hasValidTitle ? processedTitle : fallbackTitle,
- library: options.library,
- version: options.version
- },
- contentType: rawContent.mimeType
- // Preserve the detected MIME type
- },
- links: []
- // Always return empty links array for individual files
- };
- }
- return { document: void 0, links: [] };
- }
- /**
- * Normalize a path by removing leading and trailing slashes.
- */
- normalizePath(path2) {
- return path2.replace(/^\/+/, "").replace(/\/+$/, "");
- }
- isWithinSubPath(path2, subPath) {
- if (!subPath) {
- return true;
- }
- const trimmedSubPath = this.normalizePath(subPath);
- if (trimmedSubPath.length === 0) {
- return true;
- }
- const normalizedPath = this.normalizePath(path2);
- if (normalizedPath === trimmedSubPath) {
- return true;
- }
- return normalizedPath.startsWith(`${trimmedSubPath}/`);
- }
- async scrape(options, progressCallback, signal) {
- const url = new URL(options.url);
- if (!url.hostname.includes("github.com")) {
- throw new Error("URL must be a GitHub URL");
- }
- return super.scrape(options, progressCallback, signal);
- }
- /**
- * Cleanup resources used by this strategy, specifically the pipeline browser instances.
- */
- async cleanup() {
- await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close()));
- }
- }
- class GitHubWikiScraperStrategy extends BaseScraperStrategy {
- httpFetcher = new HttpFetcher();
- pipelines;
- constructor() {
- super();
- this.pipelines = PipelineFactory$1.createStandardPipelines();
- }
- canHandle(url) {
- try {
- const parsedUrl = new URL(url);
- const { hostname, pathname } = parsedUrl;
- return ["github.com", "www.github.com"].includes(hostname) && pathname.includes("/wiki") && pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/) !== null;
- } catch {
- return false;
- }
- }
- /**
- * Parses a GitHub wiki URL to extract repository information.
- */
- parseGitHubWikiUrl(url) {
- const parsedUrl = new URL(url);
- const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/);
- if (!match) {
- throw new Error(`Invalid GitHub wiki URL: ${url}`);
- }
- const [, owner, repo] = match;
- return { owner, repo };
- }
- /**
- * Override shouldProcessUrl to only process URLs within the wiki scope.
- */
- shouldProcessUrl(url, options) {
- try {
- const parsedUrl = new URL(url);
- const wikiInfo = this.parseGitHubWikiUrl(options.url);
- const expectedWikiPath = `/${wikiInfo.owner}/${wikiInfo.repo}/wiki`;
- if (!parsedUrl.pathname.startsWith(expectedWikiPath)) {
- return false;
- }
- const wikiPagePath = parsedUrl.pathname.replace(expectedWikiPath, "").replace(/^\//, "");
- return shouldIncludeUrl(
- wikiPagePath || "Home",
- options.includePatterns,
- options.excludePatterns
- );
- } catch {
- return false;
- }
- }
- async processItem(item, options, _progressCallback, signal) {
- const currentUrl = item.url;
- logger.info(
- `📖 Processing wiki page ${this.pageCount}/${options.maxPages}: ${currentUrl}`
- );
- try {
- const rawContent = await this.httpFetcher.fetch(currentUrl, { signal });
- let processed;
- for (const pipeline of this.pipelines) {
- if (pipeline.canProcess(rawContent)) {
- logger.debug(
- `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`
- );
- const wikiOptions = { ...options, scrapeMode: ScrapeMode.Fetch };
- processed = await pipeline.process(rawContent, wikiOptions, this.httpFetcher);
- break;
- }
- }
- if (!processed) {
- logger.warn(
- `⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`
- );
- return { document: void 0, links: [] };
- }
- for (const err of processed.errors) {
- logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`);
+ "procfile",
+ "vagrantfile",
+ "gulpfile",
+ "gruntfile",
+ ".prettierrc",
+ ".eslintrc",
+ ".babelrc",
+ ".nvmrc",
+ ".npmrc"
+ ];
+ const isCommonTextFile = commonTextFiles.some((name2) => {
+ if (name2.startsWith(".")) {
+ return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
  }
-
-
-
-
- const document2 = {
- content: typeof processed.textContent === "string" ? processed.textContent : "",
- metadata: {
- url: currentUrl,
- title: typeof processed.metadata.title === "string" && processed.metadata.title.trim() !== "" ? processed.metadata.title : pageTitle,
- library: options.library,
- version: options.version
- },
- contentType: rawContent.mimeType
- };
- const links = processed.links || [];
- const wikiLinks = links.filter((link) => {
- if (!link || link.trim() === "" || link === "invalid-url" || link === "not-a-url-at-all") {
- return false;
- }
- return true;
- }).map((link) => {
- try {
- return new URL(link, currentUrl).href;
- } catch {
- return null;
- }
- }).filter((link) => link !== null).filter((link) => {
- try {
- const linkUrl = new URL(link);
- return linkUrl.hostname === parsedUrl.hostname && linkUrl.pathname.startsWith(`/${wikiInfo.owner}/${wikiInfo.repo}/wiki`);
- } catch {
- return false;
- }
- });
- return { document: document2, links: wikiLinks };
- } catch (error) {
- logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`);
- return { document: void 0, links: [] };
- }
- }
- async scrape(options, progressCallback, signal) {
- const url = new URL(options.url);
- if (!url.hostname.includes("github.com") || !url.pathname.includes("/wiki")) {
- throw new Error("URL must be a GitHub wiki URL");
+ return fileNameLower === name2 || fileNameLower.startsWith(`${name2}.`);
+ });
+ if (hasTextExtension || hasCompoundExtension || isCommonTextFile) {
+ return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
  }
-
- if (
-
+ const mimeType = mime.getType(path2);
+ if (mimeType?.startsWith("text/")) {
+ logger.debug(`Including file with text MIME type: ${path2} (${mimeType})`);
+ return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
  }
-
- return super.scrape(wikiOptions, progressCallback, signal);
+ return false;
  }
  /**
- *
+ * Checks if a path is within the specified subpath.
  */
-
-
+ isWithinSubPath(path2, subPath) {
+ if (!subPath) {
+ return true;
+ }
+ const trimmedSubPath = subPath.replace(/^\/+/, "").replace(/\/+$/, "");
+ if (trimmedSubPath.length === 0) {
+ return true;
+ }
+ const normalizedPath = path2.replace(/^\/+/, "").replace(/\/+$/, "");
+ if (normalizedPath === trimmedSubPath) {
+ return true;
+ }
+ return normalizedPath.startsWith(`${trimmedSubPath}/`);
  }
-
-
-
-
-
+ async processItem(item, options, signal) {
+ if (item.url.startsWith("github-file://")) {
+ logger.info(
+ `🗑️ Legacy github-file:// URL detected, marking as deleted: ${item.url}`
+ );
+ return {
+ url: item.url,
+ links: [],
+ status: FetchStatus.NOT_FOUND
+ };
+ }
  try {
- const parsedUrl = new URL(url);
-
-
- return false;
+ const parsedUrl = new URL(item.url);
+ if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) {
+ return await this.wikiProcessor.process(item, options, signal);
  }
- const pathMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/);
- return pathMatch !== null;
  } catch {
- return false;
- }
- }
- async scrape(options, progressCallback, signal) {
- const url = new URL(options.url);
- if (!url.hostname.includes("github.com")) {
- throw new Error("URL must be a GitHub URL");
  }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- pagesScraped: wikiPagesScraped + progress.pagesScraped,
- totalPages: wikiPagesScraped + progress.totalPages,
- totalDiscovered: totalPagesDiscovered + progress.totalDiscovered
+ if (item.depth === 0) {
+ const repoInfo = this.parseGitHubUrl(options.url);
+ const { owner, repo } = repoInfo;
+ logger.debug(`Discovering GitHub repository ${owner}/${repo}`);
+ const discoveredLinks = [];
+ if ("isBlob" in repoInfo && repoInfo.isBlob && repoInfo.filePath) {
+ const { branch = "main", filePath } = repoInfo;
+ logger.debug(
+ `Single file URL detected: ${owner}/${repo}/${filePath} - indexing file only`
+ );
+ discoveredLinks.push(
+ `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`
+ );
+ return {
+ url: item.url,
+ links: discoveredLinks,
+ status: FetchStatus.SUCCESS
  };
  }
- await progressCallback(progress);
- };
- try {
  const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`;
-
- logger.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- logger.info(`✅ Completed repository code scraping for ${owner}/${repo}`);
- } else {
- logger.info(
- `ℹ️ Skipping repository code scraping - page limit reached with wiki content`
- );
+ discoveredLinks.push(wikiUrl);
+ logger.debug(`Discovered wiki URL: ${wikiUrl}`);
+ const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal);
+ const fileItems = tree.tree.filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)).filter((treeItem) => this.shouldProcessFile(treeItem, options));
+ logger.debug(
+ `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`
+ );
+ const fileUrls = fileItems.map(
+ (treeItem) => `https://github.com/${owner}/${repo}/blob/${resolvedBranch}/${treeItem.path}`
+ );
+ discoveredLinks.push(...fileUrls);
+ logger.debug(
+ `Discovery complete: ${fileUrls.length} repo file(s) + 1 wiki URL = ${discoveredLinks.length} total URLs`
+ );
+ return { url: item.url, links: discoveredLinks, status: FetchStatus.SUCCESS };
+ }
+ try {
+ const parsedUrl = new URL(item.url);
+ if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) {
+ logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`);
+ return await this.repoProcessor.process(item, options, signal);
  }
- logger.info(`🎉 Comprehensive GitHub scraping completed for ${owner}/${repo}`);
  } catch (error) {
- logger.
-
+ logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`);
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
  }
+ logger.debug(`No further processing for URL at depth ${item.depth}: ${item.url}`);
+ return { url: item.url, links: [], status: FetchStatus.SUCCESS };
+ }
+ async scrape(options, progressCallback, signal) {
+ const url = new URL(options.url);
+ if (!url.hostname.includes("github.com")) {
+ throw new Error("URL must be a GitHub URL");
+ }
+ await super.scrape(options, progressCallback, signal);
  }
- /**
- * Cleanup resources used by both underlying strategies.
- */
  async cleanup() {
- await Promise.
+ await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]);
  }
  }
  class LocalFileStrategy extends BaseScraperStrategy {
@@ -6573,23 +6685,41 @@ class LocalFileStrategy extends BaseScraperStrategy {
  canHandle(url) {
  return url.startsWith("file://");
  }
- async processItem(item, options,
+ async processItem(item, options, _signal) {
  let filePath = item.url.replace(/^file:\/\/\/?/, "");
  filePath = decodeURIComponent(filePath);
  if (!filePath.startsWith("/") && process.platform !== "win32") {
  filePath = `/${filePath}`;
  }
-
+ let stats;
+ try {
+ stats = await fs$1.stat(filePath);
+ } catch (error) {
+ if (error.code === "ENOENT") {
+ logger.info(`✓ File deleted or not available: ${filePath}`);
+ return {
+ url: item.url,
+ links: [],
+ status: FetchStatus.NOT_FOUND
+ };
+ }
+ throw error;
+ }
  if (stats.isDirectory()) {
  const contents = await fs$1.readdir(filePath);
  const links = contents.map((name2) => `file://${path.join(filePath, name2)}`).filter((url) => this.shouldProcessUrl(url, options));
- return { links };
+ return { url: item.url, links, status: FetchStatus.SUCCESS };
+ }
+ const rawContent = await this.fileFetcher.fetch(item.url, {
+ etag: item.etag
+ });
+ if (rawContent.status === FetchStatus.NOT_MODIFIED) {
+ logger.debug(`✓ File unchanged: ${filePath}`);
+ return { url: rawContent.source, links: [], status: FetchStatus.NOT_MODIFIED };
  }
- logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
- const rawContent = await this.fileFetcher.fetch(item.url);
  let processed;
  for (const pipeline of this.pipelines) {
- if (pipeline.canProcess(rawContent)) {
+ if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
  logger.debug(
  `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`
  );
@@ -6601,22 +6731,22 @@ class LocalFileStrategy extends BaseScraperStrategy {
  logger.warn(
  `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
  );
- return {
+ return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
  }
- for (const err of processed.errors) {
+ for (const err of processed.errors ?? []) {
  logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
  }
+ const filename = path.basename(filePath);
+ const title = processed.title?.trim() || filename || null;
  return {
-
-
-
-
-
-
-
-
- }
- }
+ url: rawContent.source,
+ title,
+ etag: rawContent.etag,
+ lastModified: rawContent.lastModified,
+ contentType: rawContent.mimeType,
+ content: processed,
+ links: [],
+ status: FetchStatus.SUCCESS
  };
  }
  /**
@@ -6652,19 +6782,32 @@ class WebScraperStrategy extends BaseScraperStrategy {
  * @param signal - Optional abort signal for request cancellation.
  * @returns An object containing the processed document and extracted links.
  */
- async processItem(item, options,
+ async processItem(item, options, signal) {
  const { url } = item;
  try {
+ if (item.etag) {
+ logger.debug(`Processing ${url} with stored ETag: ${item.etag}`);
+ }
  const fetchOptions = {
  signal,
  followRedirects: options.followRedirects,
- headers: options.headers
+ headers: options.headers,
  // Forward custom headers
+ etag: item.etag
+ // Pass ETag for conditional requests
  };
  const rawContent = await this.fetcher.fetch(url, fetchOptions);
+ logger.debug(
+ `Fetch result for ${url}: status=${rawContent.status}, etag=${rawContent.etag || "none"}`
+ );
+ if (rawContent.status !== FetchStatus.SUCCESS) {
+ logger.debug(`Skipping pipeline for ${url} due to status: ${rawContent.status}`);
+ return { url: rawContent.source, links: [], status: rawContent.status };
+ }
  let processed;
  for (const pipeline of this.pipelines) {
-
+ const contentBuffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
+ if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) {
  logger.debug(
  `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`
  );
@@ -6676,40 +6819,47 @@ class WebScraperStrategy extends BaseScraperStrategy {
  logger.warn(
  `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
  );
- return {
+ return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS };
  }
- for (const err of processed.errors) {
+ for (const err of processed.errors ?? []) {
  logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
  }
  if (!processed.textContent || !processed.textContent.trim()) {
  logger.warn(
  `⚠️ No processable content found for ${url} after pipeline execution.`
  );
- return {
+ return {
+ url: rawContent.source,
+ links: processed.links,
+ status: FetchStatus.SUCCESS
+ };
  }
-
-
+ if (item.depth === 0) {
+ this.canonicalBaseUrl = new URL(rawContent.source);
+ }
+ const filteredLinks = processed.links?.filter((link) => {
  try {
  const targetUrl = new URL(link);
-
-
+ if (!this.shouldProcessUrl(targetUrl.href, options)) {
+ return false;
+ }
+ if (this.shouldFollowLinkFn) {
+ const baseUrl = this.canonicalBaseUrl ?? new URL(options.url);
+ return this.shouldFollowLinkFn(baseUrl, targetUrl);
+ }
+ return true;
  } catch {
  return false;
  }
- });
+ }) ?? [];
  return {
-
-
-
-
-
- library: options.library,
- version: options.version,
- ...processed.metadata
- }
- },
+ url: rawContent.source,
+ etag: rawContent.etag,
+ lastModified: rawContent.lastModified,
+ contentType: processed.contentType || rawContent.mimeType,
+ content: processed,
  links: filteredLinks,
-
+ status: FetchStatus.SUCCESS
  };
  } catch (error) {
  logger.error(`❌ Failed processing page ${url}: ${error}`);
@@ -6786,7 +6936,6 @@ class ScraperRegistry {
  this.strategies = [
  new NpmScraperStrategy(),
  new PyPiScraperStrategy(),
- new GitHubWikiScraperStrategy(),
  new GitHubScraperStrategy(),
  new WebScraperStrategy(),
  new LocalFileStrategy()
@@ -6848,55 +6997,64 @@ class PipelineWorker {
  * @param callbacks - Callbacks provided by the manager for reporting.
  */
  async executeJob(job, callbacks) {
- const {
- id: jobId,
- library,
- version: version2,
- sourceUrl,
- scraperOptions,
- abortController
- } = job;
+ const { id: jobId, library, version: version2, scraperOptions, abortController } = job;
  const signal = abortController.signal;
  logger.debug(`[${jobId}] Worker starting job for ${library}@${version2}`);
  try {
-
-
-
-
-
-
-
-
-
- }
+ if (!scraperOptions.isRefresh) {
+ await this.store.removeAllDocuments(library, version2);
+ logger.info(
+ `💾 Cleared store for ${library}@${version2 || "[no version]"} before scraping.`
+ );
+ } else {
+ logger.info(
+ `🔄 Refresh operation - preserving existing data for ${library}@${version2 || "[no version]"}.`
+ );
+ }
  await this.scraperService.scrape(
-
+ scraperOptions,
  async (progress) => {
  if (signal.aborted) {
  throw new CancellationError("Job cancelled during scraping progress");
  }
  await callbacks.onJobProgress?.(job, progress);
- if (progress.
+ if (progress.deleted && progress.pageId) {
  try {
- await this.store.
- pageContent: progress.document.content,
- metadata: {
- ...progress.document.metadata,
- mimeType: progress.document.contentType
- // Pass contentType as mimeType in metadata
- }
- });
+ await this.store.deletePage(progress.pageId);
  logger.debug(
- `[${jobId}]
+ `[${jobId}] Deleted page ${progress.pageId}: ${progress.currentUrl}`
+ );
+ } catch (docError) {
+ logger.error(
+ `❌ [${jobId}] Failed to delete page ${progress.pageId}: ${docError}`
+ );
+ const error = docError instanceof Error ? docError : new Error(String(docError));
+ await callbacks.onJobError?.(job, error);
+ throw error;
+ }
+ } else if (progress.result) {
+ try {
+ if (progress.pageId) {
+ await this.store.deletePage(progress.pageId);
+ logger.debug(
+ `[${jobId}] Refreshing page ${progress.pageId}: ${progress.currentUrl}`
+ );
+ }
+ await this.store.addScrapeResult(
+ library,
+ version2,
+ progress.depth,
+ progress.result
  );
+ logger.debug(`[${jobId}] Stored processed content: ${progress.currentUrl}`);
  } catch (docError) {
  logger.error(
- `❌ [${jobId}] Failed to
+ `❌ [${jobId}] Failed to process content ${progress.currentUrl}: ${docError}`
  );
  await callbacks.onJobError?.(
  job,
  docError instanceof Error ? docError : new Error(String(docError)),
- progress.
+ progress.result
  );
  }
  }
@@ -7108,15 +7266,8 @@ class PipelineManager {
  /**
  * Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned).
  */
- async
+ async enqueueScrapeJob(library, version2, options) {
  const normalizedVersion = version2 ?? "";
- const {
- url,
- library: _library,
- version: _version,
- signal: _signal,
- ...versionOptions
- } = options;
  const allJobs = await this.getJobs();
  const duplicateJobs = allJobs.filter(
  (job2) => job2.library === library && (job2.version ?? "") === normalizedVersion && // Normalize null to empty string for comparison
@@ -7158,8 +7309,8 @@ class PipelineManager {
|
|
|
7158
7309
|
progressMaxPages: 0,
|
|
7159
7310
|
errorMessage: null,
|
|
7160
7311
|
updatedAt: /* @__PURE__ */ new Date(),
|
|
7161
|
-
sourceUrl: url,
|
|
7162
|
-
scraperOptions:
|
|
7312
|
+
sourceUrl: options.url,
|
|
7313
|
+
scraperOptions: options
|
|
7163
7314
|
};
|
|
7164
7315
|
this.jobMap.set(jobId, job);
|
|
7165
7316
|
this.jobQueue.push(jobId);
|
|
@@ -7174,6 +7325,78 @@ class PipelineManager {
|
|
|
7174
7325
|
}
|
|
7175
7326
|
return jobId;
|
|
7176
7327
|
}
|
|
7328
|
+
/**
|
|
7329
|
+
* Enqueues a refresh job for an existing library version by re-scraping all pages
|
|
7330
|
+
* and using ETag comparison to skip unchanged content.
|
|
7331
|
+
*
|
|
7332
|
+
* If the version was never completed (interrupted or failed scrape), performs a
|
|
7333
|
+
* full re-scrape from scratch instead of a refresh to ensure completeness.
|
|
7334
|
+
*/
|
|
7335
|
+
async enqueueRefreshJob(library, version2) {
|
|
7336
|
+
const normalizedVersion = version2 ?? "";
|
|
7337
|
+
try {
|
|
7338
|
+
const versionId = await this.store.ensureVersion({
|
|
7339
|
+
library,
|
|
7340
|
+
version: normalizedVersion
|
|
7341
|
+
});
|
|
7342
|
+
const versionInfo = await this.store.getVersionById(versionId);
|
|
7343
|
+
if (!versionInfo) {
|
|
7344
|
+
throw new Error(`Version ID ${versionId} not found`);
|
|
7345
|
+
}
|
|
7346
|
+
const libraryInfo = await this.store.getLibraryById(versionInfo.library_id);
|
|
7347
|
+
if (!libraryInfo) {
|
|
7348
|
+
throw new Error(`Library ID ${versionInfo.library_id} not found`);
|
|
7349
|
+
}
|
|
7350
|
+
if (versionInfo && versionInfo.status !== VersionStatus.COMPLETED) {
|
|
7351
|
+
logger.info(
|
|
7352
|
+
`⚠️ Version ${library}@${normalizedVersion || "unversioned"} has status "${versionInfo.status}". Performing full re-scrape instead of refresh.`
|
|
7353
|
+
);
|
|
7354
|
+
return this.enqueueJobWithStoredOptions(library, normalizedVersion);
|
|
7355
|
+
}
|
|
7356
|
+
const pages = await this.store.getPagesByVersionId(versionId);
|
|
7357
|
+
if (pages.length > 0) {
|
|
7358
|
+
logger.debug(
|
|
7359
|
+
`Sample page data: url=${pages[0].url}, etag=${pages[0].etag}, depth=${pages[0].depth}`
|
|
7360
|
+
);
|
|
7361
|
+
}
|
|
7362
|
+
if (pages.length === 0) {
|
|
7363
|
+
throw new Error(
|
|
7364
|
+
`No pages found for ${library}@${normalizedVersion || "unversioned"}. Use scrape_docs to index it first.`
|
|
7365
|
+
);
|
|
7366
|
+
}
|
|
7367
|
+
logger.info(
|
|
7368
|
+
`🔄 Preparing refresh job for ${library}@${normalizedVersion || "unversioned"} with ${pages.length} page(s)`
|
|
7369
|
+
);
|
|
7370
|
+
const initialQueue = pages.map((page) => ({
|
|
7371
|
+
url: page.url,
|
|
7372
|
+
depth: page.depth ?? 0,
|
|
7373
|
+
// Use original depth, fallback to 0 for old data
|
|
7374
|
+
pageId: page.id,
|
|
7375
|
+
etag: page.etag
|
|
7376
|
+
}));
|
|
7377
|
+
const storedOptions = await this.store.getScraperOptions(versionId);
|
|
7378
|
+
const scraperOptions = {
|
|
7379
|
+
url: storedOptions?.sourceUrl || pages[0].url,
|
|
7380
|
+
// Required but not used when initialQueue is set
|
|
7381
|
+
library,
|
|
7382
|
+
version: normalizedVersion,
|
|
7383
|
+
...storedOptions?.options || {},
|
|
7384
|
+
// Include stored options if available (spread first)
|
|
7385
|
+
// Override with refresh-specific options (these must come after the spread)
|
|
7386
|
+
initialQueue,
|
|
7387
|
+
// Pre-populated queue with existing pages
|
|
7388
|
+
isRefresh: true
|
|
7389
|
+
// Mark this as a refresh operation
|
|
7390
|
+
};
|
|
7391
|
+
logger.info(
|
|
7392
|
+
`📝 Enqueueing refresh job for ${library}@${normalizedVersion || "unversioned"}`
|
|
7393
|
+
);
|
|
7394
|
+
return this.enqueueScrapeJob(library, normalizedVersion, scraperOptions);
|
|
7395
|
+
} catch (error) {
|
|
7396
|
+
logger.error(`❌ Failed to enqueue refresh job: ${error}`);
|
|
7397
|
+
throw error;
|
|
7398
|
+
}
|
|
7399
|
+
}
|
|
7177
7400
|
/**
|
|
7178
7401
|
* Enqueues a job using stored scraper options from a previous indexing run.
|
|
7179
7402
|
* If no stored options are found, throws an error.
|
|
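Each entry in the `initialQueue` built above carries just enough state for conditional re-fetching. A sketch of one entry, with field names read directly off the `map()` over `getPagesByVersionId` and invented sample values:

    // Hypothetical sample entry (values invented).
    const entry = {
      url: "https://example.com/docs/intro", // page.url
      depth: 1,                              // page.depth ?? 0 for pre-migration rows
      pageId: 42,                            // page.id, reused to delete on 404
      etag: '"abc123"'                       // page.etag, enables ETag comparison
    };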
@@ -7201,7 +7424,7 @@ class PipelineManager {
      logger.info(
        `🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${stored.sourceUrl}`
      );
-     return this.
+     return this.enqueueScrapeJob(library, normalizedVersion, completeOptions);
    } catch (error) {
      logger.error(`❌ Failed to enqueue job with stored options: ${error}`);
      throw error;
@@ -7418,13 +7641,7 @@ class PipelineManager {
      await this.store.updateVersionStatus(versionId, dbStatus, errorMessage);
      if (newStatus === PipelineJobStatus.QUEUED && job.scraperOptions) {
        try {
-
-           url: job.sourceUrl ?? "",
-           library: job.library,
-           version: job.version,
-           ...job.scraperOptions
-         };
-         await this.store.storeScraperOptions(versionId, fullOptions);
+         await this.store.storeScraperOptions(versionId, job.scraperOptions);
          logger.debug(
            `Stored scraper options for ${job.library}@${job.version}: ${job.sourceUrl}`
          );
@@ -7882,7 +8099,7 @@ async function createPipelineWithCallbacks(docService, options = {}) {
    },
    onJobError: async (job, error, document2) => {
      logger.warn(
-       `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.
+       `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.url}` : ""}: ${error.message}`
      );
    }
  });
@@ -8113,6 +8330,45 @@ function createMcpServerInstance(tools, readOnly = false) {
        }
      }
    );
+   server.tool(
+     "refresh_version",
+     "Re-scrape a previously indexed library version, updating only changed pages.",
+     {
+       library: z.string().trim().describe("Library name."),
+       version: z.string().trim().optional().describe("Library version (optional, refreshes unversioned if omitted).")
+     },
+     {
+       title: "Refresh Library Version",
+       destructiveHint: false,
+       // Only updates changed content
+       openWorldHint: true
+       // requires internet access
+     },
+     async ({ library, version: version2 }) => {
+       analytics.track(TelemetryEvent.TOOL_USED, {
+         tool: "refresh_version",
+         context: "mcp_server",
+         library,
+         version: version2
+       });
+       try {
+         const result = await tools.refresh.execute({
+           library,
+           version: version2,
+           waitForCompletion: false
+           // Don't wait for completion
+         });
+         if ("jobId" in result) {
+           return createResponse(`🔄 Refresh job started with ID: ${result.jobId}.`);
+         }
+         return createResponse(
+           `Refresh finished immediately (unexpectedly) with ${result.pagesRefreshed} pages.`
+         );
+       } catch (error) {
+         return createError(error);
+       }
+     }
+   );
  }
  server.tool(
    "search_docs",
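As a usage illustration, an MCP client would invoke the `refresh_version` tool registered above with arguments matching its Zod schema; the surrounding envelope below is the generic MCP `tools/call` shape, not anything specific to this server:

    const request = {
      method: "tools/call",
      params: {
        name: "refresh_version",
        arguments: { library: "react", version: "18.2.0" } // version is optional
      }
    };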
@@ -8638,7 +8894,7 @@ class FetchUrlTool {
    logger.info("🔄 Processing content...");
    let processed;
    for (const pipeline of this.pipelines) {
-     if (pipeline.canProcess(rawContent)) {
+     if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) {
        processed = await pipeline.process(
          rawContent,
          {
@@ -8673,7 +8929,7 @@ class FetchUrlTool {
      const contentString = convertToString(rawContent.content, resolvedCharset);
      return contentString;
    }
-   for (const err of processed.errors) {
+   for (const err of processed.errors ?? []) {
      logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
    }
    if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
@@ -8851,6 +9107,61 @@ class ListLibrariesTool {
    return { libraries };
  }
  }
+ class RefreshVersionTool {
+   pipeline;
+   constructor(pipeline) {
+     this.pipeline = pipeline;
+   }
+   async execute(options) {
+     const { library, version: version2, waitForCompletion = true } = options;
+     let internalVersion;
+     const partialVersionRegex = /^\d+(\.\d+)?$/;
+     if (version2 === null || version2 === void 0) {
+       internalVersion = "";
+     } else {
+       const validFullVersion = semver.valid(version2);
+       if (validFullVersion) {
+         internalVersion = validFullVersion;
+       } else if (partialVersionRegex.test(version2)) {
+         const coercedVersion = semver.coerce(version2);
+         if (coercedVersion) {
+           internalVersion = coercedVersion.version;
+         } else {
+           throw new ValidationError(
+             `Invalid version format for refreshing: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
+             "RefreshVersionTool"
+           );
+         }
+       } else {
+         throw new ValidationError(
+           `Invalid version format for refreshing: '${version2}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`,
+           "RefreshVersionTool"
+         );
+       }
+     }
+     internalVersion = internalVersion.toLowerCase();
+     const pipeline = this.pipeline;
+     const refreshVersion = internalVersion === "" ? null : internalVersion;
+     const jobId = await pipeline.enqueueRefreshJob(library, refreshVersion);
+     if (waitForCompletion) {
+       try {
+         await pipeline.waitForJobCompletion(jobId);
+         const finalJob = await pipeline.getJob(jobId);
+         const finalPagesRefreshed = finalJob?.progress?.pagesScraped ?? 0;
+         logger.debug(
+           `Refresh job ${jobId} finished with status ${finalJob?.status}. Pages refreshed: ${finalPagesRefreshed}`
+         );
+         return {
+           pagesRefreshed: finalPagesRefreshed
+         };
+       } catch (error) {
+         logger.error(`❌ Refresh job ${jobId} failed or was cancelled: ${error}`);
+         throw error;
+       }
+     }
+     return { jobId };
+   }
+ }
  class RemoveTool {
    constructor(documentManagementService, pipeline) {
      this.documentManagementService = documentManagementService;
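RefreshVersionTool.execute normalizes versions the same way the scrape path does: a full semver string passes through semver.valid, bare 'X' or 'X.Y' forms are expanded by semver.coerce, and anything else is rejected. The rule in isolation, as a runnable sketch against the real semver package (the function name is ours, not the package's):

    import semver from "semver";

    function normalizeVersion(version) {
      if (version == null) return "";
      const full = semver.valid(version);       // "18.2.0" -> "18.2.0"
      if (full) return full.toLowerCase();
      if (/^\d+(\.\d+)?$/.test(version)) {
        const coerced = semver.coerce(version); // "18" -> "18.0.0"
        if (coerced) return coerced.version.toLowerCase();
      }
      throw new Error(`Invalid version format: '${version}'`);
    }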
@@ -8871,19 +9182,7 @@ class RemoveTool {
    }
    logger.info(`🗑️ Removing library: ${library}${version2 ? `@${version2}` : ""}`);
    try {
-
-       library,
-       version2
-     );
-     const normalizedVersion = version2 && version2.trim() !== "" ? version2 : null;
-     const versionExists = result.bestMatch === normalizedVersion || result.hasUnversioned && normalizedVersion === null;
-     if (!versionExists) {
-       const versionText = normalizedVersion ? `Version ${normalizedVersion}` : "Version";
-       throw new ToolError(
-         `${versionText} not found for library ${library}. Cannot remove non-existent version.`,
-         this.constructor.name
-       );
-     }
+     await this.documentManagementService.validateLibraryExists(library);
      const allJobs = await this.pipeline.getJobs();
      const jobs = allJobs.filter(
        (job) => job.library === library && job.version === (version2 ?? "") && (job.status === PipelineJobStatus.QUEUED || job.status === PipelineJobStatus.RUNNING)
@@ -8950,7 +9249,7 @@ class ScrapeTool {
    internalVersion = internalVersion.toLowerCase();
    const pipeline = this.pipeline;
    const enqueueVersion = internalVersion === "" ? null : internalVersion;
-   const jobId = await pipeline.
+   const jobId = await pipeline.enqueueScrapeJob(library, enqueueVersion, {
      url,
      library,
      version: internalVersion,
@@ -8997,7 +9296,18 @@ class DocumentManagementClient {
    logger.debug(`DocumentManagementClient (tRPC) created for: ${this.baseUrl}`);
  }
  async initialize() {
-
+   try {
+     await this.client.ping.query();
+   } catch (error) {
+     logger.debug(
+       `Failed to connect to DocumentManagement server at ${this.baseUrl}: ${error}`
+     );
+     throw new Error(
+       `Failed to connect to server at ${this.baseUrl}.
+
+ Please verify the server URL includes the correct port (default 8080) and ends with '/api' (e.g., 'http://localhost:8080/api').`
+     );
+   }
  }
  async shutdown() {
  }
@@ -9069,7 +9379,7 @@ class HierarchicalAssemblyStrategy {
    try {
      const chunksByDocument = /* @__PURE__ */ new Map();
      for (const chunk of initialChunks) {
-       const url = chunk.
+       const url = chunk.url;
        if (!chunksByDocument.has(url)) {
          chunksByDocument.set(url, []);
        }
@@ -9163,10 +9473,10 @@ class HierarchicalAssemblyStrategy {
    if (debug) {
      return chunks.map(
        (chunk) => `=== #${chunk.id} ${chunk.metadata.path?.join("/")} [${chunk.metadata.level}] ===
-       ` + chunk.
+       ` + chunk.content
      ).join("");
    }
-   return chunks.map((chunk) => chunk.
+   return chunks.map((chunk) => chunk.content).join("");
  }
  /**
   * Walks up the parent hierarchy from a chunk to collect the complete parent chain.
@@ -9192,42 +9502,17 @@ class HierarchicalAssemblyStrategy {
      visited.add(currentId);
      chainIds.push(currentId);
      depth++;
-
-
+     let parentChunk = await documentStore.findParentChunk(library, version2, currentId);
+     if (!parentChunk) {
+       parentChunk = await this.findAncestorWithGaps(
          library,
          version2,
-
+         currentChunk.url,
+         currentChunk.metadata.path ?? [],
+         documentStore
        );
-     if (parentChunk) {
-       currentChunk = parentChunk;
-     } else {
-       currentChunk = await this.findAncestorWithGaps(
-         library,
-         version2,
-         currentChunk.metadata,
-         documentStore
-       );
-     }
-   } catch (error) {
-     try {
-       const currentMetadata = currentChunk?.metadata;
-       if (currentMetadata) {
-         currentChunk = await this.findAncestorWithGaps(
-           library,
-           version2,
-           currentMetadata,
-           documentStore
-         );
-       } else {
-         currentChunk = null;
-       }
-     } catch (gapError) {
-       logger.warn(
-         `Parent lookup failed for chunk ${currentId}: ${error}. Gap search also failed: ${gapError}`
-       );
-       break;
-     }
      }
+     currentChunk = parentChunk;
    }
    if (depth >= maxDepth) {
      logger.warn(
@@ -9240,9 +9525,7 @@ class HierarchicalAssemblyStrategy {
   * Attempts to find ancestors when there are gaps in the hierarchy.
   * Tries progressively shorter path prefixes to find existing ancestor chunks.
   */
- async findAncestorWithGaps(library, version2,
-   const path2 = metadata.path || [];
-   const url = metadata.url;
+ async findAncestorWithGaps(library, version2, url, path2, documentStore) {
    if (path2.length <= 1) {
      return null;
    }
@@ -9279,7 +9562,7 @@ class HierarchicalAssemblyStrategy {
    }
    const matchingChunks = allChunks.filter((chunk) => {
      const chunkPath = chunk.metadata.path || [];
-     const chunkUrl = chunk.
+     const chunkUrl = chunk.url;
      if (chunkUrl !== url) return false;
      if (chunkPath.length !== targetPath.length) return false;
      return chunkPath.every((part, index) => part === targetPath[index]);
@@ -9301,11 +9584,7 @@ class HierarchicalAssemblyStrategy {
      return current;
    }
    while (true) {
-     const parent = await documentStore.findParentChunk(
-       library,
-       version2,
-       current.id
-     );
+     const parent = await documentStore.findParentChunk(library, version2, current.id);
      if (!parent) {
        return null;
      }
@@ -9387,7 +9666,7 @@ class HierarchicalAssemblyStrategy {
    const ancestorChunks = await this.findChunksByExactPath(
      library,
      version2,
-     referenceChunk.
+     referenceChunk.url,
      ancestorPath,
      documentStore
    );
@@ -9465,13 +9744,9 @@ class HierarchicalAssemblyStrategy {
    for (const chunk of initialChunks) {
      const id = chunk.id;
      chunkIds.add(id);
-
-
-
-         chunkIds.add(parent.id);
-       }
-     } catch (error) {
-       logger.warn(`Failed to find parent for chunk ${id}: ${error}`);
+     const parent = await documentStore.findParentChunk(library, version2, id);
+     if (parent) {
+       chunkIds.add(parent.id);
      }
      try {
        const children = await documentStore.findChildChunks(library, version2, id, 3);
@@ -9539,7 +9814,7 @@ class MarkdownAssemblyStrategy {
   * Assembles chunks using simple "\n\n" joining (current behavior).
   */
  assembleContent(chunks) {
-   return chunks.map((chunk) => chunk.
+   return chunks.map((chunk) => chunk.content).join("\n\n");
  }
  /**
   * Collects related chunk IDs for a single chunk using current context expansion logic.
@@ -9638,7 +9913,7 @@ class DocumentRetrieverService {
  groupResultsByUrl(results) {
    const resultsByUrl = /* @__PURE__ */ new Map();
    for (const result of results) {
-     const url = result.
+     const url = result.url;
      if (!resultsByUrl.has(url)) {
        resultsByUrl.set(url, []);
      }
@@ -9653,10 +9928,8 @@ class DocumentRetrieverService {
   * Processes a group of search results from the same URL using appropriate strategy.
   */
  async processUrlGroup(library, version2, url, initialChunks) {
-   const mimeType = initialChunks.length > 0 ? initialChunks[0].
-   const maxScore = Math.max(
-     ...initialChunks.map((chunk) => chunk.metadata.score)
-   );
+   const mimeType = initialChunks.length > 0 ? initialChunks[0].content_type : void 0;
+   const maxScore = Math.max(...initialChunks.map((chunk) => chunk.score));
    const strategy = createContentAssemblyStrategy(mimeType);
    const selectedChunks = await strategy.selectChunks(
      library,
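The hunks above flatten the parent-chain walk: findParentChunk now absorbs database errors and returns null (see the later DocumentStore hunk), so the caller needs a single fallback instead of nested try/catch. A condensed sketch of the resulting loop shape; documentStore and strategy are stand-ins for the package's objects, and the maxDepth default here is an arbitrary guard value, not the constant the package uses:

    async function collectParentChain(documentStore, strategy, library, version, start, maxDepth = 20) {
      const chainIds = [];
      const visited = new Set();
      let current = start;
      let depth = 0;
      while (current && depth < maxDepth && !visited.has(current.id)) {
        visited.add(current.id);
        chainIds.push(current.id);
        depth++;
        // Direct parent lookup first; on a miss, try skipping hierarchy gaps.
        let parent = await documentStore.findParentChunk(library, version, current.id);
        if (!parent) {
          parent = await strategy.findAncestorWithGaps(
            library, version, current.url, current.metadata.path ?? [], documentStore
          );
        }
        current = parent; // null terminates the walk
      }
      return chainIds;
    }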
@@ -9845,7 +10118,7 @@ class DocumentStore {
  prepareStatements() {
    const statements = {
      getById: this.db.prepare(
-       `SELECT d
+       `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type
        FROM documents d
        JOIN pages p ON d.page_id = p.id
        WHERE d.id = ?`
@@ -9858,7 +10131,7 @@ class DocumentStore {
        "UPDATE documents SET embedding = ? WHERE id = ?"
      ),
      insertPage: this.db.prepare(
-       "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type) VALUES (?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type"
+       "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type, depth) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type, etag = excluded.etag, last_modified = excluded.last_modified, depth = excluded.depth"
      ),
      getPageId: this.db.prepare(
        "SELECT id FROM pages WHERE version_id = ? AND url = ?"
@@ -9869,12 +10142,13 @@ class DocumentStore {
      getLibraryIdByName: this.db.prepare(
        "SELECT id FROM libraries WHERE name = ?"
      ),
+     getLibraryById: this.db.prepare("SELECT * FROM libraries WHERE id = ?"),
      // New version-related statements
      insertVersion: this.db.prepare(
        "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
      ),
      resolveVersionId: this.db.prepare(
-       "SELECT id FROM versions WHERE library_id = ? AND name
+       "SELECT id FROM versions WHERE library_id = ? AND name = ?"
      ),
      getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
      queryVersionsByLibraryId: this.db.prepare(
@@ -9889,15 +10163,10 @@ class DocumentStore {
        WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
        )`
      ),
-
-
-       WHERE page_id IN (
-         SELECT p.id FROM pages p
-         JOIN versions v ON p.version_id = v.id
-         JOIN libraries l ON v.library_id = l.id
-         WHERE p.url = ? AND l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')
-       )`
+     deleteDocumentsByPageId: this.db.prepare(
+       "DELETE FROM documents WHERE page_id = ?"
      ),
+     deletePage: this.db.prepare("DELETE FROM pages WHERE id = ?"),
      deletePages: this.db.prepare(
        `DELETE FROM pages
        WHERE version_id IN (
@@ -9953,7 +10222,7 @@ class DocumentStore {
        ORDER BY l.name, version`
      ),
      getChildChunks: this.db.prepare(`
-       SELECT d
+       SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
        JOIN pages p ON d.page_id = p.id
        JOIN versions v ON p.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
@@ -9967,7 +10236,7 @@ class DocumentStore {
        LIMIT ?
      `),
      getPrecedingSiblings: this.db.prepare(`
-       SELECT d
+       SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
        JOIN pages p ON d.page_id = p.id
        JOIN versions v ON p.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
@@ -9980,7 +10249,7 @@ class DocumentStore {
        LIMIT ?
      `),
      getSubsequentSiblings: this.db.prepare(`
-       SELECT d
+       SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
        JOIN pages p ON d.page_id = p.id
        JOIN versions v ON p.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
@@ -9993,7 +10262,7 @@ class DocumentStore {
        LIMIT ?
      `),
      getParentChunk: this.db.prepare(`
-       SELECT d
+       SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
        JOIN pages p ON d.page_id = p.id
        JOIN versions v ON p.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
@@ -10035,6 +10304,9 @@ class DocumentStore {
        `SELECT v.id, v.library_id FROM versions v
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`
+     ),
+     getPagesByVersionId: this.db.prepare(
+       "SELECT * FROM pages WHERE version_id = ?"
      )
    };
    this.statements = statements;
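The widened insertPage statement above turns the pages row into a full upsert: on a (version_id, url) conflict it now also refreshes etag, last_modified, and depth, which is what lets a refresh run record new validators for an existing page. The statement in isolation with better-sqlite3, assuming the schema from migration 010-add-depth-to-pages.sql is in place (the file name and values below are examples):

    import Database from "better-sqlite3";

    const db = new Database("documents.db");
    const insertPage = db.prepare(
      "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type, depth) " +
      "VALUES (?, ?, ?, ?, ?, ?, ?) " +
      "ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, " +
      "content_type = excluded.content_type, etag = excluded.etag, " +
      "last_modified = excluded.last_modified, depth = excluded.depth"
    );
    insertPage.run(1, "https://example.com/docs", "Docs", '"abc123"', null, "text/html", 0);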
@@ -10176,7 +10448,7 @@ class DocumentStore {
    this.statements.insertVersion.run(libraryId, normalizedVersion);
    const versionIdRow = this.statements.resolveVersionId.get(
      libraryId,
-     normalizedVersion
+     normalizedVersion
    );
    if (!versionIdRow || typeof versionIdRow.id !== "number") {
      throw new StoreError(
@@ -10238,6 +10510,32 @@ class DocumentStore {
      throw new StoreError(`Failed to get versions by status: ${error}`);
    }
  }
+ /**
+  * Retrieves a version by its ID.
+  * @param versionId The version ID to retrieve
+  * @returns The version record, or null if not found
+  */
+ async getVersionById(versionId) {
+   try {
+     const row = this.statements.getVersionById.get(versionId);
+     return row || null;
+   } catch (error) {
+     throw new StoreError(`Failed to get version by ID: ${error}`);
+   }
+ }
+ /**
+  * Retrieves a library by its ID.
+  * @param libraryId The library ID to retrieve
+  * @returns The library record, or null if not found
+  */
+ async getLibraryById(libraryId) {
+   try {
+     const row = this.statements.getLibraryById.get(libraryId);
+     return row || null;
+   } catch (error) {
+     throw new StoreError(`Failed to get library by ID: ${error}`);
+   }
+ }
  /**
   * Stores scraper options for a version to enable reproducible indexing.
   * @param versionId The version ID to update
@@ -10245,7 +10543,15 @@ class DocumentStore {
   */
  async storeScraperOptions(versionId, options) {
    try {
-     const {
+     const {
+       url: source_url,
+       library: _library,
+       version: _version,
+       signal: _signal,
+       initialQueue: _initialQueue,
+       isRefresh: _isRefresh,
+       ...scraper_options
+     } = options;
      const optionsJson = JSON.stringify(scraper_options);
      this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
    } catch (error) {
@@ -10356,36 +10662,96 @@ class DocumentStore {
      throw new ConnectionError("Failed to query library versions", error);
    }
  }
+ /**
+  * Helper method to detect if an error is related to input size limits.
+  * Checks for common error messages from various embedding providers.
+  */
+ isInputSizeError(error) {
+   if (!(error instanceof Error)) return false;
+   const message = error.message.toLowerCase();
+   return message.includes("maximum context length") || message.includes("too long") || message.includes("token limit") || message.includes("input is too large") || message.includes("exceeds") || message.includes("max") && message.includes("token");
+ }
+ /**
+  * Creates embeddings for an array of texts with automatic retry logic for size-related errors.
+  * If a batch fails due to size limits:
+  * - Batches with multiple texts are split in half and retried recursively
+  * - Single texts that are too large are truncated and retried once
+  *
+  * @param texts Array of texts to embed
+  * @returns Array of embedding vectors
+  */
+ async embedDocumentsWithRetry(texts) {
+   if (texts.length === 0) {
+     return [];
+   }
+   try {
+     return await this.embeddings.embedDocuments(texts);
+   } catch (error) {
+     if (this.isInputSizeError(error)) {
+       if (texts.length > 1) {
+         const midpoint = Math.floor(texts.length / 2);
+         const firstHalf = texts.slice(0, midpoint);
+         const secondHalf = texts.slice(midpoint);
+         logger.warn(
+           `⚠️ Batch of ${texts.length} texts exceeded size limit, splitting into ${firstHalf.length} + ${secondHalf.length}`
+         );
+         const [firstEmbeddings, secondEmbeddings] = await Promise.all([
+           this.embedDocumentsWithRetry(firstHalf),
+           this.embedDocumentsWithRetry(secondHalf)
+         ]);
+         return [...firstEmbeddings, ...secondEmbeddings];
+       } else {
+         const text = texts[0];
+         const midpoint = Math.floor(text.length / 2);
+         const firstHalf = text.substring(0, midpoint);
+         logger.warn(
+           `⚠️ Single text exceeded embedding size limit (${text.length} chars). Truncating at ${firstHalf.length} chars.`
+         );
+         try {
+           const embedding = await this.embedDocumentsWithRetry([firstHalf]);
+           logger.info(
+             `✓ Using embedding from first half of split text (${firstHalf.length} chars)`
+           );
+           return embedding;
+         } catch (retryError) {
+           logger.error(
+             `❌ Failed to embed even after splitting. Original length: ${text.length}`
+           );
+           throw retryError;
+         }
+       }
+     }
+     throw error;
+   }
+ }
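embedDocumentsWithRetry above is a divide-and-conquer retry: on a size error, a multi-text batch is split in half and both halves are retried recursively, while a single oversized text is halved once and re-embedded. The same idea as a self-contained sketch with a pluggable embed function, so the recursion is easy to exercise outside the class (isSizeError stands in for isInputSizeError):

    async function embedWithRetry(embed, texts, isSizeError) {
      if (texts.length === 0) return [];
      try {
        return await embed(texts);
      } catch (error) {
        if (!isSizeError(error)) throw error;
        if (texts.length > 1) {
          const mid = Math.floor(texts.length / 2);
          const [a, b] = await Promise.all([
            embedWithRetry(embed, texts.slice(0, mid), isSizeError),
            embedWithRetry(embed, texts.slice(mid), isSizeError)
          ]);
          return [...a, ...b];
        }
        // Lone oversized text: keep only its first half, as the diff does.
        const half = texts[0].slice(0, Math.floor(texts[0].length / 2));
        return embedWithRetry(embed, [half], isSizeError);
      }
    }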
  /**
   * Stores documents with library and version metadata, generating embeddings
   * for vector similarity search. Uses the new pages table to normalize page-level
   * metadata and avoid duplication across document chunks.
   */
- async addDocuments(library, version2,
+ async addDocuments(library, version2, depth, result) {
    try {
-
+     const { title, url, chunks } = result;
+     if (chunks.length === 0) {
        return;
      }
-     const documentsByUrl = /* @__PURE__ */ new Map();
-     for (const doc of documents) {
-       const url = doc.metadata.url;
-       if (!url || typeof url !== "string" || !url.trim()) {
-         throw new StoreError("Document metadata must include a valid URL");
-       }
-       if (!documentsByUrl.has(url)) {
-         documentsByUrl.set(url, []);
-       }
-       documentsByUrl.get(url)?.push(doc);
-     }
      let paddedEmbeddings = [];
      if (this.isVectorSearchEnabled) {
-       const texts =
-         const header = `<title>${
- <url>${
- <path>${(
+       const texts = chunks.map((chunk) => {
+         const header = `<title>${title}</title>
+ <url>${url}</url>
+ <path>${(chunk.section.path || []).join(" / ")}</path>
  `;
-         return `${header}${
+         return `${header}${chunk.content}`;
        });
+       for (let i = 0; i < texts.length; i++) {
+         const textSize = texts[i].length;
+         if (textSize > SPLITTER_MAX_CHUNK_SIZE) {
+           logger.warn(
+             `⚠️ Chunk ${i + 1}/${texts.length} exceeds max size: ${textSize} > ${SPLITTER_MAX_CHUNK_SIZE} chars (URL: ${url})`
+           );
+         }
+       }
        const maxBatchChars = EMBEDDING_BATCH_CHARS;
        const rawEmbeddings = [];
        let currentBatch = [];
@@ -10398,7 +10764,7 @@ class DocumentStore {
        logger.debug(
          `Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
        );
-       const batchEmbeddings = await this.
+       const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
        rawEmbeddings.push(...batchEmbeddings);
        currentBatch = [];
        currentBatchSize = 0;
@@ -10410,7 +10776,7 @@ class DocumentStore {
        logger.debug(
          `Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
        );
-       const batchEmbeddings = await this.
+       const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
        rawEmbeddings.push(...batchEmbeddings);
        currentBatch = [];
        currentBatchSize = 0;
@@ -10421,110 +10787,115 @@ class DocumentStore {
        logger.debug(
          `Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
        );
-       const batchEmbeddings = await this.
+       const batchEmbeddings = await this.embedDocumentsWithRetry(currentBatch);
        rawEmbeddings.push(...batchEmbeddings);
      }
      paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
    }
    const versionId = await this.resolveVersionId(library, version2);
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-   if (!existingPage) {
-     throw new StoreError(`Failed to get page ID for URL: ${url}`);
-   }
-   const pageId = existingPage.id;
-   pageIds.set(url, pageId);
+   const existingPage = this.statements.getPageId.get(versionId, url);
+   if (existingPage) {
+     const result2 = this.statements.deleteDocumentsByPageId.run(existingPage.id);
+     if (result2.changes > 0) {
+       logger.debug(`Deleted ${result2.changes} existing documents for URL: ${url}`);
+     }
+   }
+   const transaction = this.db.transaction(() => {
+     const contentType = result.contentType || null;
+     const etag = result.etag || null;
+     const lastModified = result.lastModified || null;
+     this.statements.insertPage.run(
+       versionId,
+       url,
+       title || "",
+       etag,
+       lastModified,
+       contentType,
+       depth
+     );
+     const existingPage2 = this.statements.getPageId.get(versionId, url);
+     if (!existingPage2) {
+       throw new StoreError(`Failed to get page ID for URL: ${url}`);
      }
+     const pageId = existingPage2.id;
      let docIndex = 0;
-     for (
-       const
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-         JSON.stringify(
-         i
-         // sort_order within this page
+     for (let i = 0; i < chunks.length; i++) {
+       const chunk = chunks[i];
+       const result2 = this.statements.insertDocument.run(
+         pageId,
+         chunk.content,
+         JSON.stringify({
+           types: chunk.types,
+           level: chunk.section.level,
+           path: chunk.section.path
+         }),
+         i
+         // sort_order within this page
+       );
+       const rowId = result2.lastInsertRowid;
+       if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
+         this.statements.insertEmbedding.run(
+           BigInt(rowId),
+           JSON.stringify(paddedEmbeddings[docIndex])
        );
-       const rowId = result.lastInsertRowid;
-       if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
-         this.statements.insertEmbedding.run(
-           BigInt(rowId),
-           JSON.stringify(paddedEmbeddings[docIndex])
-         );
-       }
-       docIndex++;
      }
+       docIndex++;
      }
    });
-   transaction(
+   transaction();
  } catch (error) {
    throw new ConnectionError("Failed to add documents to store", error);
  }
  }
  /**
-  * Removes documents matching specified library and version
+  * Removes documents and pages matching specified library and version.
+  * This consolidated method deletes both documents and their associated pages.
   * @returns Number of documents deleted
   */
- async
+ async deletePages(library, version2) {
    try {
      const normalizedVersion = version2.toLowerCase();
      const result = this.statements.deleteDocuments.run(
        library.toLowerCase(),
        normalizedVersion
      );
+     this.statements.deletePages.run(library.toLowerCase(), normalizedVersion);
      return result.changes;
    } catch (error) {
      throw new ConnectionError("Failed to delete documents", error);
    }
  }
  /**
-  *
-  *
+  * Deletes a page and all its associated document chunks.
+  * Performs manual deletion in the correct order to satisfy foreign key constraints:
+  * 1. Delete document chunks (page_id references pages.id)
+  * 2. Delete page record
+  *
+  * This method is used during refresh operations when a page returns 404 Not Found.
   */
- async
+ async deletePage(pageId) {
    try {
-     const
-
-
-
-
-
-
+     const docResult = this.statements.deleteDocumentsByPageId.run(pageId);
+     logger.debug(`Deleted ${docResult.changes} document(s) for page ID ${pageId}`);
+     const pageResult = this.statements.deletePage.run(pageId);
+     if (pageResult.changes > 0) {
+       logger.debug(`Deleted page record for page ID ${pageId}`);
+     }
+   } catch (error) {
+     throw new ConnectionError(`Failed to delete page ${pageId}`, error);
+   }
+ }
+ /**
+  * Retrieves all pages for a specific version ID with their metadata.
+  * Used for refresh operations to get existing pages with their ETags and depths.
+  * @returns Array of page records
+  */
+ async getPagesByVersionId(versionId) {
+   try {
+     const result = this.statements.getPagesByVersionId.all(versionId);
+     return result;
    } catch (error) {
-     throw new ConnectionError("Failed to
+     throw new ConnectionError("Failed to get pages by version ID", error);
    }
  }
  /**
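deletePage above removes document chunks before the page row because documents.page_id references pages.id. The same two-statement order as a minimal better-sqlite3 sketch; the transaction wrapper is our addition for atomicity, whereas the diff issues the statements sequentially:

    // db is an open better-sqlite3 Database with the pages/documents schema.
    const deletePageTx = db.transaction((pageId) => {
      db.prepare("DELETE FROM documents WHERE page_id = ?").run(pageId); // children first
      db.prepare("DELETE FROM pages WHERE id = ?").run(pageId);          // then the page
    });
    deletePageTx(42);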
@@ -10547,7 +10918,7 @@ class DocumentStore {
    return { documentsDeleted: 0, versionDeleted: false, libraryDeleted: false };
  }
  const { id: versionId, library_id: libraryId } = versionResult;
- const documentsDeleted = await this.
+ const documentsDeleted = await this.deletePages(library, version2);
  this.statements.deletePages.run(normalizedLibrary, normalizedVersion);
  const versionDeleteResult = this.statements.deleteVersionById.run(versionId);
  const versionDeleted = versionDeleteResult.changes > 0;
@@ -10565,6 +10936,27 @@ class DocumentStore {
    throw new ConnectionError("Failed to remove version", error);
  }
  }
+ /**
+  * Parses the metadata field from a JSON string to an object.
+  * This is necessary because better-sqlite3's json() function returns a string, not an object.
+  */
+ parseMetadata(row) {
+   if (row.metadata && typeof row.metadata === "string") {
+     try {
+       row.metadata = JSON.parse(row.metadata);
+     } catch (error) {
+       logger.warn(`Failed to parse metadata JSON: ${error}`);
+       row.metadata = {};
+     }
+   }
+   return row;
+ }
+ /**
+  * Parses metadata for an array of rows.
+  */
+ parseMetadataArray(rows) {
+   return rows.map((row) => this.parseMetadata(row));
+ }
  /**
   * Retrieves a document by its ID.
   * @param id The ID of the document.
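parseMetadata above exists because selecting json(d.metadata) through better-sqlite3 returns a JSON string, not an object; each row therefore needs one explicit parse before its metadata fields are usable. A minimal round-trip showing the effect (row contents invented):

    const row = { id: 1, metadata: '{"level":2,"path":["Guide","Install"]}' };
    if (row.metadata && typeof row.metadata === "string") {
      row.metadata = JSON.parse(row.metadata);
    }
    console.log(row.metadata.path.join(" / ")); // "Guide / Install"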
@@ -10572,13 +10964,11 @@ class DocumentStore {
   */
  async getById(id) {
    try {
-     const row = this.statements.getById.get(
-       BigInt(id)
-     );
+     const row = this.statements.getById.get(BigInt(id));
      if (!row) {
        return null;
      }
-     return
+     return this.parseMetadata(row);
    } catch (error) {
      throw new ConnectionError(`Failed to get document by ID ${id}`, error);
    }
@@ -10662,26 +11052,20 @@ class DocumentStore {
    );
    const rankedResults = this.assignRanks(rawResults);
    const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
-   return topResults.map((row) =>
-
+   return topResults.map((row) => {
+     const result = {
        ...row,
        url: row.url || "",
        // Ensure url is never undefined
-       title: row.title,
-       content_type: row.content_type
-     }
-
-       ...JSON.parse(row.metadata),
-       id: row.id,
+       title: row.title || null,
+       content_type: row.content_type || null
+     };
+     return Object.assign(result, {
        score: row.rrf_score,
        vec_rank: row.vec_rank,
-       fts_rank: row.fts_rank
-
-
-       title: row.title || "",
-       ...row.content_type && { contentType: row.content_type }
-     }
-   }));
+       fts_rank: row.fts_rank
+     });
+   });
  } else {
    const stmt = this.db.prepare(`
      SELECT
@@ -10713,28 +11097,21 @@ class DocumentStore {
      ftsQuery,
      limit
    );
-   return rawResults.map((row, index) =>
-
+   return rawResults.map((row, index) => {
+     const result = {
        ...row,
        url: row.url || "",
        // Ensure url is never undefined
-       title: row.title,
-       content_type: row.content_type
-     }
-
-       ...JSON.parse(row.metadata),
-       id: row.id,
+       title: row.title || null,
+       content_type: row.content_type || null
+     };
+     return Object.assign(result, {
        score: -row.fts_score,
        // Convert BM25 score to positive value for consistency
-       fts_rank: index + 1
+       fts_rank: index + 1
        // Assign rank based on order (1-based)
-
-
-       url: row.url || "",
-       title: row.title || "",
-       ...row.content_type && { contentType: row.content_type }
-     }
-   }));
+     });
+   });
  }
  } catch (error) {
    throw new ConnectionError(
@@ -10753,18 +11130,17 @@ class DocumentStore {
      return [];
    }
    const parentPath = parent.metadata.path ?? [];
-   const parentUrl = parent.metadata.url;
    const normalizedVersion = version2.toLowerCase();
    const result = this.statements.getChildChunks.all(
      library.toLowerCase(),
      normalizedVersion,
-
+     parent.url,
      parentPath.length + 1,
      JSON.stringify(parentPath),
      BigInt(id),
      limit
    );
-   return
+   return this.parseMetadataArray(result);
  } catch (error) {
    throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
  }
@@ -10778,17 +11154,16 @@ class DocumentStore {
    if (!reference) {
      return [];
    }
-   const refMetadata = reference.metadata;
    const normalizedVersion = version2.toLowerCase();
    const result = this.statements.getPrecedingSiblings.all(
      library.toLowerCase(),
      normalizedVersion,
-
+     reference.url,
      BigInt(id),
-     JSON.stringify(
+     JSON.stringify(reference.metadata.path),
      limit
    );
-   return
+   return this.parseMetadataArray(result).reverse();
  } catch (error) {
    throw new ConnectionError(
      `Failed to find preceding sibling chunks for ID ${id}`,
@@ -10805,17 +11180,16 @@ class DocumentStore {
    if (!reference) {
      return [];
    }
-   const refMetadata = reference.metadata;
    const normalizedVersion = version2.toLowerCase();
    const result = this.statements.getSubsequentSiblings.all(
      library.toLowerCase(),
      normalizedVersion,
-
+     reference.url,
      BigInt(id),
-     JSON.stringify(
+     JSON.stringify(reference.metadata.path),
      limit
    );
-   return
+   return this.parseMetadataArray(result);
  } catch (error) {
    throw new ConnectionError(
      `Failed to find subsequent sibling chunks for ID ${id}`,
@@ -10825,6 +11199,8 @@ class DocumentStore {
  }
  /**
   * Finds the parent chunk of a given document.
+  * Returns null if no parent is found or if there's a database error.
+  * Database errors are logged but not thrown to maintain consistent behavior.
   */
  async findParentChunk(library, version2, id) {
    try {
@@ -10832,8 +11208,7 @@ class DocumentStore {
      if (!child) {
        return null;
      }
-     const
-     const path2 = childMetadata.path ?? [];
+     const path2 = child.metadata.path ?? [];
      const parentPath = path2.slice(0, -1);
      if (parentPath.length === 0) {
        return null;
@@ -10842,21 +11217,22 @@ class DocumentStore {
      const result = this.statements.getParentChunk.get(
        library.toLowerCase(),
        normalizedVersion,
-
+       child.url,
        JSON.stringify(parentPath),
        BigInt(id)
      );
      if (!result) {
        return null;
      }
-     return
+     return this.parseMetadata(result);
    } catch (error) {
-
+     logger.warn(`Failed to find parent chunk for ID ${id}: ${error}`);
+     return null;
    }
  }
  /**
   * Fetches multiple documents by their IDs in a single call.
-  * Returns an array of
+  * Returns an array of DbPageChunk objects, sorted by their sort_order.
   */
  async findChunksByIds(library, version2, ids) {
    if (!ids.length) return [];
@@ -10864,7 +11240,7 @@ class DocumentStore {
    const normalizedVersion = version2.toLowerCase();
    const placeholders = ids.map(() => "?").join(",");
    const stmt = this.db.prepare(
-     `SELECT d
+     `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
      JOIN pages p ON d.page_id = p.id
      JOIN versions v ON p.version_id = v.id
      JOIN libraries l ON v.library_id = l.id
@@ -10878,20 +11254,20 @@ class DocumentStore {
      normalizedVersion,
      ...ids
    );
-   return
+   return this.parseMetadataArray(rows);
  } catch (error) {
    throw new ConnectionError("Failed to fetch documents by IDs", error);
  }
  }
  /**
   * Fetches all document chunks for a specific URL within a library and version.
-  * Returns
+  * Returns DbPageChunk objects sorted by their sort_order for proper reassembly.
   */
  async findChunksByUrl(library, version2, url) {
    try {
      const normalizedVersion = version2.toLowerCase();
      const stmt = this.db.prepare(
-       `SELECT d
+       `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d
        JOIN pages p ON d.page_id = p.id
        JOIN versions v ON p.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
@@ -10905,7 +11281,7 @@ class DocumentStore {
      normalizedVersion,
      url
    );
-   return
+   return this.parseMetadataArray(rows);
  } catch (error) {
    throw new ConnectionError(`Failed to fetch documents by URL ${url}`, error);
  }
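After the two mapping rewrites above, a search hit is the joined row itself with page-level columns and flattened ranking fields, instead of the old nested document shape. A hypothetical example of the resulting object (every value invented):

    const hit = {
      id: 7,
      page_id: 3,
      content: "Install the package with npm ...",
      metadata: { level: 2, path: ["Guide", "Install"] },
      url: "https://example.com/docs/install",
      title: "Install",
      content_type: "text/html",
      score: 0.0328,  // rrf_score, or -fts_score in FTS-only mode
      vec_rank: 1,
      fts_rank: 2
    };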
@@ -10923,9 +11299,8 @@ class DocumentManagementService {
|
|
|
10923
11299
|
return (version2 ?? "").toLowerCase();
|
|
10924
11300
|
}
|
|
10925
11301
|
constructor(storePath, embeddingConfig, pipelineConfig) {
|
|
10926
|
-
const
|
|
10927
|
-
|
|
10928
|
-
logger.debug(`Using database directory: ${dbDir}`);
|
|
11302
|
+
const dbPath = storePath === ":memory:" ? ":memory:" : path.join(storePath, "documents.db");
|
|
11303
|
+
logger.debug(`Using database path: ${dbPath}`);
|
|
10929
11304
|
this.store = new DocumentStore(dbPath, embeddingConfig);
|
|
10930
11305
|
this.documentRetriever = new DocumentRetrieverService(this.store);
|
|
10931
11306
|
this.pipelines = PipelineFactory$1.createStandardPipelines(pipelineConfig);
|
|
@@ -11136,9 +11511,24 @@ class DocumentManagementService {
|
|
|
11136
11511
|
logger.info(
|
|
11137
11512
|
`🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
|
|
11138
11513
|
);
|
|
11139
|
-
const count = await this.store.
|
|
11514
|
+
const count = await this.store.deletePages(library, normalizedVersion);
|
|
11140
11515
|
logger.info(`🗑️ Deleted ${count} documents`);
|
|
11141
11516
|
}
|
|
11517
|
+
/**
|
|
11518
|
+
* Deletes a page and all its associated document chunks.
|
|
11519
|
+
* This is used during refresh operations when a page returns 404 Not Found.
|
|
11520
|
+
*/
|
|
11521
|
+
async deletePage(pageId) {
|
|
11522
|
+
logger.debug(`Deleting page ID: ${pageId}`);
|
|
11523
|
+
await this.store.deletePage(pageId);
|
|
11524
|
+
}
|
|
11525
|
+
/**
|
|
11526
|
+
* Retrieves all pages for a specific version ID with their metadata.
|
|
11527
|
+
* Used for refresh operations to get existing pages with their ETags and depths.
|
|
11528
|
+
*/
|
|
11529
|
+
async getPagesByVersionId(versionId) {
|
|
11530
|
+
return this.store.getPagesByVersionId(versionId);
|
|
11531
|
+
}
|
|
11142
11532
|
/**
|
|
11143
11533
|
* Completely removes a library version and all associated documents.
|
|
11144
11534
|
* Also removes the library if no other versions remain.
|
|
@@ -11147,15 +11537,13 @@ class DocumentManagementService {
    */
   async removeVersion(library, version2) {
     const normalizedVersion = this.normalizeVersion(version2);
-    logger.
+    logger.debug(`Removing version: ${library}@${normalizedVersion || "[no version]"}`);
     const result = await this.store.removeVersion(library, normalizedVersion, true);
-    logger.info(
-      `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`
-    );
+    logger.info(`🗑️ Removed ${result.documentsDeleted} documents`);
     if (result.versionDeleted && result.libraryDeleted) {
-      logger.info(
+      logger.info(`🗑️ Completely removed library ${library} (was last version)`);
     } else if (result.versionDeleted) {
-      logger.info(
+      logger.info(`🗑️ Removed version ${library}@${normalizedVersion || "[no version]"}`);
     } else {
       logger.warn(
         `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`
@@ -11163,91 +11551,68 @@ class DocumentManagementService {
     }
   }
   /**
-   * Adds
-   *
-   *
-   *
+   * Adds pre-processed content directly to the store.
+   * This method is used when content has already been processed by a pipeline,
+   * avoiding redundant processing. Used primarily by the scraping pipeline.
+   *
+   * @param library Library name
+   * @param version Version string (null/undefined for unversioned)
+   * @param processed Pre-processed content with chunks already created
+   * @param pageId Optional page ID for refresh operations
    */
-  async
+  async addScrapeResult(library, version2, depth, result) {
     const processingStart = performance.now();
     const normalizedVersion = this.normalizeVersion(version2);
-    const url =
-    if (!url
-      throw new StoreError("
+    const { url, title, chunks, contentType } = result;
+    if (!url) {
+      throw new StoreError("Processed content metadata must include a valid URL");
     }
-    logger.info(`📚 Adding
-    if (
-
+    logger.info(`📚 Adding processed content: ${title || url}`);
+    if (chunks.length === 0) {
+      logger.warn(`⚠️ No chunks in processed content for ${url}. Skipping.`);
+      return;
     }
-    const contentType = document2.metadata.mimeType;
     try {
-
-
-        content: document2.pageContent,
-        mimeType: contentType || "text/plain"
-      };
-      const pipeline = this.pipelines.find((p) => p.canProcess(rawContent));
-      if (!pipeline) {
-        logger.warn(
-          `⚠️ Unsupported content type "${rawContent.mimeType}" for document ${url}. Skipping processing.`
-        );
-        return;
-      }
-      logger.debug(
-        `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`
-      );
-      const scraperOptions = {
-        url,
-        library,
-        version: normalizedVersion,
-        scrapeMode: ScrapeMode.Fetch,
-        ignoreErrors: false,
-        maxConcurrency: 1
-      };
-      const processed = await pipeline.process(rawContent, scraperOptions);
-      const chunks = processed.chunks;
-      const splitDocs = chunks.map((chunk) => ({
-        pageContent: chunk.content,
-        metadata: {
-          ...document2.metadata,
-          level: chunk.section.level,
-          path: chunk.section.path
-        }
-      }));
-      logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
-      await this.store.addDocuments(library, normalizedVersion, splitDocs);
+      logger.info(`✂️ Storing ${chunks.length} pre-split chunks`);
+      await this.store.addDocuments(library, normalizedVersion, depth, result);
       const processingTime = performance.now() - processingStart;
+      const totalContentSize = chunks.reduce(
+        (sum, chunk) => sum + chunk.content.length,
+        0
+      );
       analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, {
         // Content characteristics (privacy-safe)
-        mimeType: contentType
-        contentSizeBytes:
+        mimeType: contentType,
+        contentSizeBytes: totalContentSize,
         // Processing metrics
         processingTimeMs: Math.round(processingTime),
-        chunksCreated:
+        chunksCreated: chunks.length,
         // Document characteristics
-        hasTitle: !!
-        hasDescription: !!document2.metadata.description,
+        hasTitle: !!title,
         urlDomain: extractHostname(url),
-        depth
+        depth,
         // Library context
         library,
         libraryVersion: normalizedVersion || null,
         // Processing efficiency
-        avgChunkSizeBytes: Math.round(
+        avgChunkSizeBytes: Math.round(totalContentSize / chunks.length),
         processingSpeedKbPerSec: Math.round(
-
+          totalContentSize / 1024 / (processingTime / 1e3)
         )
       });
     } catch (error) {
       const processingTime = performance.now() - processingStart;
       if (error instanceof Error) {
         analytics.captureException(error, {
-          mimeType: contentType
-          contentSizeBytes:
+          mimeType: contentType,
+          contentSizeBytes: chunks.reduce(
+            (sum, chunk) => sum + chunk.content.length,
+            0
+          ),
           processingTimeMs: Math.round(processingTime),
           library,
           libraryVersion: normalizedVersion || null,
-          context: "
+          context: "processed_content_storage",
           component: DocumentManagementService.constructor.name
         });
       }
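
After this rewrite, addScrapeResult no longer selects and runs a processing pipeline itself; it stores chunks that were already produced upstream. An illustrative call shape, assuming a result object matching the destructuring above (field values are invented; the `section` shape is carried over from the removed code, and the third argument presumably carries the page's crawl depth, matching the new depth column migration):

    // Hypothetical invocation, not taken from the package.
    await docService.addScrapeResult("react", "18.0.0", 1, {
      url: "https://react.dev/learn",
      title: "Quick Start",
      contentType: "text/html",
      chunks: [
        { content: "# Quick Start\n...", section: { level: 1, path: ["Quick Start"] } },
      ],
    });
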
@@ -11277,6 +11642,18 @@ class DocumentManagementService {
     );
     return versionId;
   }
+  /**
+   * Retrieves a version by its ID from the database.
+   */
+  async getVersionById(versionId) {
+    return this.store.getVersionById(versionId);
+  }
+  /**
+   * Retrieves a library by its ID from the database.
+   */
+  async getLibraryById(libraryId) {
+    return this.store.getLibraryById(libraryId);
+  }
 }
 async function createDocumentManagement(options = {}) {
   if (options.serverUrl) {
@@ -11368,6 +11745,7 @@ async function initializeTools(docService, pipeline) {
     listLibraries: new ListLibrariesTool(docService),
     findVersion: new FindVersionTool(docService),
     scrape: new ScrapeTool(pipeline),
+    refresh: new RefreshVersionTool(pipeline),
     search: new SearchTool(docService),
     listJobs: new ListJobsTool(pipeline),
     getJobInfo: new GetJobInfoTool(pipeline),
@@ -11480,11 +11858,15 @@ const optionalTrimmed = z$1.preprocess(
   (v) => typeof v === "string" ? v.trim() : v,
   z$1.string().min(1).optional().nullable()
 );
-const
+const enqueueScrapeInput = z$1.object({
   library: nonEmptyTrimmed,
   version: optionalTrimmed,
   options: z$1.custom()
 });
+const enqueueRefreshInput = z$1.object({
+  library: nonEmptyTrimmed,
+  version: optionalTrimmed
+});
 const jobIdInput = z$1.object({ id: z$1.string().min(1) });
 const getJobsInput = z$1.object({
   status: z$1.nativeEnum(PipelineJobStatus).optional()
@@ -11492,12 +11874,12 @@ const getJobsInput = z$1.object({
 function createPipelineRouter(trpc) {
   const tt = trpc;
   return tt.router({
-
+    enqueueScrapeJob: tt.procedure.input(enqueueScrapeInput).mutation(
       async ({
         ctx,
         input
       }) => {
-        const jobId = await ctx.pipeline.
+        const jobId = await ctx.pipeline.enqueueScrapeJob(
          input.library,
          input.version ?? null,
          input.options
@@ -11517,6 +11899,18 @@ function createPipelineRouter(trpc) {
         return { jobId };
       }
     ),
+    enqueueRefreshJob: tt.procedure.input(enqueueRefreshInput).mutation(
+      async ({
+        ctx,
+        input
+      }) => {
+        const jobId = await ctx.pipeline.enqueueRefreshJob(
+          input.library,
+          input.version ?? null
+        );
+        return { jobId };
+      }
+    ),
     getJob: tt.procedure.input(jobIdInput).query(
       async ({
         ctx,
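
The new enqueueRefreshJob procedure mirrors enqueueScrapeJob but takes no scraper options, consistent with refresh operating only on an already indexed library version. A hypothetical client-side call, assuming a standard @trpc/client proxy pointed at the worker's RPC endpoint:

    // `client` is an assumed tRPC client for the pipeline router above.
    const { jobId } = await client.enqueueRefreshJob.mutate({
      library: "react",
      version: "18.0.0", // optional; omit (or pass null) to refresh unversioned docs
    });
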
@@ -13456,7 +13850,7 @@ async function registerWorkerService(pipeline) {
     },
     onJobError: async (job, error, document2) => {
       logger.warn(
-        `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.
+        `⚠️ Job ${job.id} error ${document2 ? `on document ${document2.url}` : ""}: ${error.message}`
       );
       analytics.captureException(error, {
         jobId: job.id,
@@ -13996,7 +14390,7 @@ async function findVersionAction(library, options, command) {
 function createFindVersionCommand(program) {
   return program.command("find-version <library>").description("Find the best matching version for a library").option("-v, --version <string>", "Pattern to match (optional, supports ranges)").option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(findVersionAction);
 }
 async function listAction(options, command) {
@@ -14022,7 +14416,7 @@ async function listAction(options, command) {
 function createListCommand(program) {
   return program.command("list").description("List all available libraries and their versions").option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(listAction);
 }
 function createMcpCommand(program) {
@@ -14045,7 +14439,7 @@ function createMcpCommand(program) {
   ).env("DOCS_MCP_EMBEDDING_MODEL")
   ).option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).option(
     "--read-only",
     "Run in read-only mode (only expose read tools, disable write/job tools)",
@@ -14169,6 +14563,68 @@ function createMcpCommand(program) {
     }
   );
 }
+async function refreshAction(library, options, command) {
+  await analytics.track(TelemetryEvent.CLI_COMMAND, {
+    command: "refresh",
+    library,
+    version: options.version,
+    useServerUrl: !!options.serverUrl
+  });
+  const serverUrl = options.serverUrl;
+  const globalOptions = getGlobalOptions(command);
+  const embeddingConfig = resolveEmbeddingContext(options.embeddingModel);
+  if (!serverUrl && !embeddingConfig) {
+    throw new Error(
+      "Embedding configuration is required for local refresh operations. Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution."
+    );
+  }
+  const docService = await createDocumentManagement({
+    serverUrl,
+    embeddingConfig,
+    storePath: globalOptions.storePath
+  });
+  let pipeline = null;
+  try {
+    const pipelineOptions = {
+      recoverJobs: false,
+      concurrency: 1,
+      serverUrl
+    };
+    pipeline = await createPipelineWithCallbacks(
+      serverUrl ? void 0 : docService,
+      pipelineOptions
+    );
+    await pipeline.start();
+    const refreshTool = new RefreshVersionTool(pipeline);
+    const result = await refreshTool.execute({
+      library,
+      version: options.version,
+      waitForCompletion: true
+      // Always wait for completion in CLI
+    });
+    if ("pagesRefreshed" in result) {
+      console.log(`✅ Successfully refreshed ${result.pagesRefreshed} pages`);
+    } else {
+      console.log(`🚀 Refresh job started with ID: ${result.jobId}`);
+    }
+  } finally {
+    if (pipeline) await pipeline.stop();
+    await docService.shutdown();
+  }
+}
+function createRefreshCommand(program) {
+  return program.command("refresh <library>").description(
+    "Re-scrape an existing library version, updating only changed pages.\n\nUses HTTP ETags to efficiently skip unchanged pages and only re-process\ncontent that has been modified or deleted since the last scrape.\n\nExamples:\n  refresh react --version 18.0.0\n  refresh mylib\n\nNote: The library and version must already be indexed. Use 'scrape' to index a new library/version."
+  ).option("-v, --version <string>", "Version of the library (optional)").addOption(
+    new Option(
+      "--embedding-model <model>",
+      "Embedding model configuration (e.g., 'openai:text-embedding-3-small')"
+    ).env("DOCS_MCP_EMBEDDING_MODEL")
+  ).option(
+    "--server-url <url>",
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
+  ).action(refreshAction);
+}
 async function removeAction(library, options, command) {
   await analytics.track(TelemetryEvent.CLI_COMMAND, {
     command: "remove",
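
The `"pagesRefreshed" in result` check in refreshAction above implies that RefreshVersionTool.execute has two result shapes: a completed-refresh summary when waitForCompletion is true, and a job handle otherwise. A hypothetical sketch of the non-blocking mode (not exercised by the CLI, which always waits):

    // Assumed behavior based on the branch above: without waiting, the tool
    // returns a job ID that can be polled via the job tools.
    const { jobId } = await refreshTool.execute({
      library: "react",
      version: "18.0.0",
      waitForCompletion: false,
    });
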
@@ -14203,7 +14659,7 @@ function createRemoveCommand(program) {
     "Version to remove (optional, removes unversioned if omitted)"
   ).option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(removeAction);
 }
 async function scrapeAction(library, url, options, command) {
@@ -14343,7 +14799,7 @@ function createScrapeCommand(program) {
   ).env("DOCS_MCP_EMBEDDING_MODEL")
   ).option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(scrapeAction);
 }
 async function searchAction(library, query, options, command) {
@@ -14396,7 +14852,7 @@ function createSearchCommand(program) {
   ).env("DOCS_MCP_EMBEDDING_MODEL")
   ).option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(searchAction);
 }
 function createWebCommand(program) {
@@ -14417,7 +14873,7 @@ function createWebCommand(program) {
   ).env("DOCS_MCP_EMBEDDING_MODEL")
   ).option(
     "--server-url <url>",
-    "URL of external pipeline worker RPC (e.g., http://localhost:
+    "URL of external pipeline worker RPC (e.g., http://localhost:8080/api)"
   ).action(
     async (cmdOptions, command) => {
       await analytics.track(TelemetryEvent.CLI_COMMAND, {
@@ -14612,6 +15068,7 @@ function createCliProgram() {
   createWebCommand(program);
   createWorkerCommand(program);
   createScrapeCommand(program);
+  createRefreshCommand(program);
   createSearchCommand(program);
   createListCommand(program);
   createFindVersionCommand(program);