@arabold/docs-mcp-server 1.32.0 → 1.33.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/db/migrations/011-add-vector-triggers.sql +45 -0
- package/dist/index.js +320 -230
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -1472,6 +1472,8 @@ const FETCHER_MAX_CACHE_ITEM_SIZE_BYTES = 500 * 1024;
|
|
|
1472
1472
|
const SPLITTER_MIN_CHUNK_SIZE = 500;
|
|
1473
1473
|
const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
|
|
1474
1474
|
const SPLITTER_MAX_CHUNK_SIZE = 5e3;
|
|
1475
|
+
const JSON_MAX_NESTING_DEPTH = 5;
|
|
1476
|
+
const JSON_MAX_CHUNKS = 1e3;
|
|
1475
1477
|
const EMBEDDING_BATCH_SIZE = 100;
|
|
1476
1478
|
const EMBEDDING_BATCH_CHARS = 5e4;
|
|
1477
1479
|
const MIGRATION_MAX_RETRIES = 5;
|
|
@@ -2355,19 +2357,17 @@ class BrowserFetcher {
|
|
|
2355
2357
|
);
|
|
2356
2358
|
}
|
|
2357
2359
|
}
|
|
2360
|
+
static async launchBrowser() {
|
|
2361
|
+
return chromium.launch({
|
|
2362
|
+
headless: true,
|
|
2363
|
+
executablePath: process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH || void 0,
|
|
2364
|
+
args: ["--no-sandbox"]
|
|
2365
|
+
});
|
|
2366
|
+
}
|
|
2358
2367
|
async ensureBrowserReady() {
|
|
2359
2368
|
if (!this.browser) {
|
|
2360
2369
|
logger.debug("Launching browser...");
|
|
2361
|
-
this.browser = await
|
|
2362
|
-
headless: true,
|
|
2363
|
-
args: [
|
|
2364
|
-
"--no-sandbox",
|
|
2365
|
-
"--disable-setuid-sandbox",
|
|
2366
|
-
"--disable-dev-shm-usage",
|
|
2367
|
-
"--disable-web-security",
|
|
2368
|
-
"--disable-features=site-per-process"
|
|
2369
|
-
]
|
|
2370
|
-
});
|
|
2370
|
+
this.browser = await BrowserFetcher.launchBrowser();
|
|
2371
2371
|
}
|
|
2372
2372
|
if (!this.page) {
|
|
2373
2373
|
this.page = await this.browser.newPage();
|
|
@@ -3044,16 +3044,203 @@ class GreedySplitter {
|
|
|
3044
3044
|
return common;
|
|
3045
3045
|
}
|
|
3046
3046
|
}
|
|
3047
|
+
class TextContentSplitter {
|
|
3048
|
+
constructor(options) {
|
|
3049
|
+
this.options = options;
|
|
3050
|
+
}
|
|
3051
|
+
/**
|
|
3052
|
+
* Splits text content into chunks while trying to preserve semantic boundaries.
|
|
3053
|
+
* Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
|
|
3054
|
+
* Always preserves formatting - trimming should be done by higher-level splitters if needed.
|
|
3055
|
+
*/
|
|
3056
|
+
async split(content) {
|
|
3057
|
+
if (content.length <= this.options.chunkSize) {
|
|
3058
|
+
return [content];
|
|
3059
|
+
}
|
|
3060
|
+
const words = content.split(/\s+/);
|
|
3061
|
+
const longestWord = words.reduce(
|
|
3062
|
+
(max, word) => word.length > max.length ? word : max
|
|
3063
|
+
);
|
|
3064
|
+
if (longestWord.length > this.options.chunkSize) {
|
|
3065
|
+
throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
|
|
3066
|
+
}
|
|
3067
|
+
const paragraphChunks = this.splitByParagraphs(content);
|
|
3068
|
+
if (this.areChunksValid(paragraphChunks)) {
|
|
3069
|
+
return paragraphChunks;
|
|
3070
|
+
}
|
|
3071
|
+
const lineChunks = this.splitByLines(content);
|
|
3072
|
+
if (this.areChunksValid(lineChunks)) {
|
|
3073
|
+
return this.mergeChunks(lineChunks, "");
|
|
3074
|
+
}
|
|
3075
|
+
const wordChunks = await this.splitByWords(content);
|
|
3076
|
+
return this.mergeChunks(wordChunks, " ");
|
|
3077
|
+
}
|
|
3078
|
+
/**
|
|
3079
|
+
* Checks if all chunks are within the maximum size limit
|
|
3080
|
+
*/
|
|
3081
|
+
areChunksValid(chunks) {
|
|
3082
|
+
return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
|
|
3083
|
+
}
|
|
3084
|
+
/**
|
|
3085
|
+
* Splits text into chunks by paragraph boundaries (double newlines)
|
|
3086
|
+
* Preserves all formatting and whitespace including the paragraph separators
|
|
3087
|
+
*/
|
|
3088
|
+
splitByParagraphs(text) {
|
|
3089
|
+
const chunks = [];
|
|
3090
|
+
let startPos = 0;
|
|
3091
|
+
const paragraphRegex = /\n\s*\n/g;
|
|
3092
|
+
let match = paragraphRegex.exec(text);
|
|
3093
|
+
while (match !== null) {
|
|
3094
|
+
const endPos = match.index + match[0].length;
|
|
3095
|
+
const chunk = text.slice(startPos, endPos);
|
|
3096
|
+
if (chunk.length > 2) {
|
|
3097
|
+
chunks.push(chunk);
|
|
3098
|
+
}
|
|
3099
|
+
startPos = endPos;
|
|
3100
|
+
match = paragraphRegex.exec(text);
|
|
3101
|
+
}
|
|
3102
|
+
if (startPos < text.length) {
|
|
3103
|
+
const remainingChunk = text.slice(startPos);
|
|
3104
|
+
if (remainingChunk.length > 2) {
|
|
3105
|
+
chunks.push(remainingChunk);
|
|
3106
|
+
}
|
|
3107
|
+
}
|
|
3108
|
+
return chunks.filter(Boolean);
|
|
3109
|
+
}
|
|
3110
|
+
/**
|
|
3111
|
+
* Splits text into chunks by line boundaries
|
|
3112
|
+
* Preserves all formatting and whitespace, including newlines at the end of each line
|
|
3113
|
+
*/
|
|
3114
|
+
splitByLines(text) {
|
|
3115
|
+
const chunks = [];
|
|
3116
|
+
let startPos = 0;
|
|
3117
|
+
for (let i = 0; i < text.length; i++) {
|
|
3118
|
+
if (text[i] === "\n") {
|
|
3119
|
+
const chunk = text.slice(startPos, i + 1);
|
|
3120
|
+
chunks.push(chunk);
|
|
3121
|
+
startPos = i + 1;
|
|
3122
|
+
}
|
|
3123
|
+
}
|
|
3124
|
+
if (startPos < text.length) {
|
|
3125
|
+
chunks.push(text.slice(startPos));
|
|
3126
|
+
}
|
|
3127
|
+
return chunks;
|
|
3128
|
+
}
|
|
3129
|
+
/**
|
|
3130
|
+
* Uses LangChain's recursive splitter for word-based splitting as a last resort
|
|
3131
|
+
*/
|
|
3132
|
+
async splitByWords(text) {
|
|
3133
|
+
const splitter = new RecursiveCharacterTextSplitter({
|
|
3134
|
+
chunkSize: this.options.chunkSize,
|
|
3135
|
+
chunkOverlap: 0
|
|
3136
|
+
});
|
|
3137
|
+
const chunks = await splitter.splitText(text);
|
|
3138
|
+
return chunks;
|
|
3139
|
+
}
|
|
3140
|
+
/**
|
|
3141
|
+
* Attempts to merge small chunks with previous chunks to minimize fragmentation.
|
|
3142
|
+
* Only merges if combined size is within maxChunkSize.
|
|
3143
|
+
*/
|
|
3144
|
+
mergeChunks(chunks, separator) {
|
|
3145
|
+
const mergedChunks = [];
|
|
3146
|
+
let currentChunk = null;
|
|
3147
|
+
for (const chunk of chunks) {
|
|
3148
|
+
if (currentChunk === null) {
|
|
3149
|
+
currentChunk = chunk;
|
|
3150
|
+
continue;
|
|
3151
|
+
}
|
|
3152
|
+
const currentChunkSize = this.getChunkSize(currentChunk);
|
|
3153
|
+
const nextChunkSize = this.getChunkSize(chunk);
|
|
3154
|
+
if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
|
|
3155
|
+
currentChunk = `${currentChunk}${separator}${chunk}`;
|
|
3156
|
+
} else {
|
|
3157
|
+
mergedChunks.push(currentChunk);
|
|
3158
|
+
currentChunk = chunk;
|
|
3159
|
+
}
|
|
3160
|
+
}
|
|
3161
|
+
if (currentChunk) {
|
|
3162
|
+
mergedChunks.push(currentChunk);
|
|
3163
|
+
}
|
|
3164
|
+
return mergedChunks;
|
|
3165
|
+
}
|
|
3166
|
+
getChunkSize(chunk) {
|
|
3167
|
+
return chunk.length;
|
|
3168
|
+
}
|
|
3169
|
+
wrap(content) {
|
|
3170
|
+
return content;
|
|
3171
|
+
}
|
|
3172
|
+
}
|
|
3173
|
+
class TextDocumentSplitter {
|
|
3174
|
+
options;
|
|
3175
|
+
textSplitter;
|
|
3176
|
+
constructor(options = {}) {
|
|
3177
|
+
this.options = {
|
|
3178
|
+
maxChunkSize: options.maxChunkSize ?? SPLITTER_MAX_CHUNK_SIZE
|
|
3179
|
+
};
|
|
3180
|
+
this.textSplitter = new TextContentSplitter({
|
|
3181
|
+
chunkSize: this.options.maxChunkSize
|
|
3182
|
+
});
|
|
3183
|
+
}
|
|
3184
|
+
async splitText(content) {
|
|
3185
|
+
if (!content.trim()) {
|
|
3186
|
+
return [];
|
|
3187
|
+
}
|
|
3188
|
+
try {
|
|
3189
|
+
const chunks = await this.textSplitter.split(content);
|
|
3190
|
+
return chunks.map((chunk) => ({
|
|
3191
|
+
types: ["text"],
|
|
3192
|
+
content: chunk,
|
|
3193
|
+
section: {
|
|
3194
|
+
level: 0,
|
|
3195
|
+
path: []
|
|
3196
|
+
}
|
|
3197
|
+
}));
|
|
3198
|
+
} catch (error) {
|
|
3199
|
+
if (!(error instanceof MinimumChunkSizeError) && error instanceof Error) {
|
|
3200
|
+
console.warn(
|
|
3201
|
+
`Unexpected text splitting error: ${error.message}. Forcing character-based split.`
|
|
3202
|
+
);
|
|
3203
|
+
}
|
|
3204
|
+
const chunks = [];
|
|
3205
|
+
let offset = 0;
|
|
3206
|
+
while (offset < content.length) {
|
|
3207
|
+
const chunkContent = content.substring(
|
|
3208
|
+
offset,
|
|
3209
|
+
offset + this.options.maxChunkSize
|
|
3210
|
+
);
|
|
3211
|
+
chunks.push({
|
|
3212
|
+
types: ["text"],
|
|
3213
|
+
content: chunkContent,
|
|
3214
|
+
section: {
|
|
3215
|
+
level: 0,
|
|
3216
|
+
path: []
|
|
3217
|
+
}
|
|
3218
|
+
});
|
|
3219
|
+
offset += this.options.maxChunkSize;
|
|
3220
|
+
}
|
|
3221
|
+
return chunks;
|
|
3222
|
+
}
|
|
3223
|
+
}
|
|
3224
|
+
}
|
|
3047
3225
|
class JsonDocumentSplitter {
|
|
3048
3226
|
preserveFormatting;
|
|
3227
|
+
maxDepth;
|
|
3228
|
+
maxChunks;
|
|
3229
|
+
textFallbackSplitter;
|
|
3049
3230
|
constructor(options = {}) {
|
|
3050
3231
|
this.preserveFormatting = options.preserveFormatting ?? true;
|
|
3232
|
+
this.maxDepth = options.maxDepth ?? JSON_MAX_NESTING_DEPTH;
|
|
3233
|
+
this.maxChunks = options.maxChunks ?? JSON_MAX_CHUNKS;
|
|
3234
|
+
this.textFallbackSplitter = new TextDocumentSplitter();
|
|
3051
3235
|
}
|
|
3052
3236
|
async splitText(content, _contentType) {
|
|
3053
3237
|
try {
|
|
3054
3238
|
const parsed = JSON.parse(content);
|
|
3055
3239
|
const chunks = [];
|
|
3056
|
-
this.processValue(parsed, ["root"], 1, 0, chunks, true);
|
|
3240
|
+
await this.processValue(parsed, ["root"], 1, 0, chunks, true);
|
|
3241
|
+
if (chunks.length > this.maxChunks) {
|
|
3242
|
+
return this.textFallbackSplitter.splitText(content);
|
|
3243
|
+
}
|
|
3057
3244
|
return chunks;
|
|
3058
3245
|
} catch {
|
|
3059
3246
|
return [
|
|
@@ -3068,16 +3255,20 @@ class JsonDocumentSplitter {
|
|
|
3068
3255
|
];
|
|
3069
3256
|
}
|
|
3070
3257
|
}
|
|
3071
|
-
processValue(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3258
|
+
async processValue(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3259
|
+
if (level > this.maxDepth) {
|
|
3260
|
+
await this.processValueAsText(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3261
|
+
return;
|
|
3262
|
+
}
|
|
3072
3263
|
if (Array.isArray(value)) {
|
|
3073
|
-
this.processArray(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3264
|
+
await this.processArray(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3074
3265
|
} else if (value !== null && typeof value === "object") {
|
|
3075
|
-
this.processObject(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3266
|
+
await this.processObject(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3076
3267
|
} else {
|
|
3077
|
-
this.processPrimitive(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3268
|
+
await this.processPrimitive(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3078
3269
|
}
|
|
3079
3270
|
}
|
|
3080
|
-
processArray(array, path2, level, indentLevel, chunks, isLastItem) {
|
|
3271
|
+
async processArray(array, path2, level, indentLevel, chunks, isLastItem) {
|
|
3081
3272
|
const indent = this.getIndent(indentLevel);
|
|
3082
3273
|
const comma = isLastItem ? "" : ",";
|
|
3083
3274
|
chunks.push({
|
|
@@ -3085,18 +3276,19 @@ class JsonDocumentSplitter {
|
|
|
3085
3276
|
content: `${indent}[`,
|
|
3086
3277
|
section: { level, path: [...path2] }
|
|
3087
3278
|
});
|
|
3088
|
-
array.
|
|
3279
|
+
for (let index = 0; index < array.length; index++) {
|
|
3280
|
+
const item = array[index];
|
|
3089
3281
|
const isLast = index === array.length - 1;
|
|
3090
3282
|
const itemPath = [...path2, `[${index}]`];
|
|
3091
|
-
this.processValue(item, itemPath, level + 1, indentLevel + 1, chunks, isLast);
|
|
3092
|
-
}
|
|
3283
|
+
await this.processValue(item, itemPath, level + 1, indentLevel + 1, chunks, isLast);
|
|
3284
|
+
}
|
|
3093
3285
|
chunks.push({
|
|
3094
3286
|
types: ["code"],
|
|
3095
3287
|
content: `${indent}]${comma}`,
|
|
3096
3288
|
section: { level, path: [...path2] }
|
|
3097
3289
|
});
|
|
3098
3290
|
}
|
|
3099
|
-
processObject(obj, path2, level, indentLevel, chunks, isLastItem) {
|
|
3291
|
+
async processObject(obj, path2, level, indentLevel, chunks, isLastItem) {
|
|
3100
3292
|
const indent = this.getIndent(indentLevel);
|
|
3101
3293
|
const comma = isLastItem ? "" : ",";
|
|
3102
3294
|
const entries = Object.entries(obj);
|
|
@@ -3105,10 +3297,11 @@ class JsonDocumentSplitter {
|
|
|
3105
3297
|
content: `${indent}{`,
|
|
3106
3298
|
section: { level, path: [...path2] }
|
|
3107
3299
|
});
|
|
3108
|
-
|
|
3300
|
+
for (let index = 0; index < entries.length; index++) {
|
|
3301
|
+
const [key, value] = entries[index];
|
|
3109
3302
|
const isLast = index === entries.length - 1;
|
|
3110
3303
|
const propertyPath = [...path2, key];
|
|
3111
|
-
this.processProperty(
|
|
3304
|
+
await this.processProperty(
|
|
3112
3305
|
key,
|
|
3113
3306
|
value,
|
|
3114
3307
|
propertyPath,
|
|
@@ -3117,14 +3310,14 @@ class JsonDocumentSplitter {
|
|
|
3117
3310
|
chunks,
|
|
3118
3311
|
isLast
|
|
3119
3312
|
);
|
|
3120
|
-
}
|
|
3313
|
+
}
|
|
3121
3314
|
chunks.push({
|
|
3122
3315
|
types: ["code"],
|
|
3123
3316
|
content: `${indent}}${comma}`,
|
|
3124
3317
|
section: { level, path: [...path2] }
|
|
3125
3318
|
});
|
|
3126
3319
|
}
|
|
3127
|
-
processProperty(key, value, path2, level, indentLevel, chunks, isLastProperty) {
|
|
3320
|
+
async processProperty(key, value, path2, level, indentLevel, chunks, isLastProperty) {
|
|
3128
3321
|
const indent = this.getIndent(indentLevel);
|
|
3129
3322
|
if (typeof value === "object" && value !== null) {
|
|
3130
3323
|
chunks.push({
|
|
@@ -3132,30 +3325,98 @@ class JsonDocumentSplitter {
|
|
|
3132
3325
|
content: `${indent}"${key}": `,
|
|
3133
3326
|
section: { level, path: path2 }
|
|
3134
3327
|
});
|
|
3135
|
-
this.processValue(value, path2, level, indentLevel, chunks, isLastProperty);
|
|
3328
|
+
await this.processValue(value, path2, level, indentLevel, chunks, isLastProperty);
|
|
3136
3329
|
} else {
|
|
3137
3330
|
const comma = isLastProperty ? "" : ",";
|
|
3138
3331
|
const formattedValue = JSON.stringify(value);
|
|
3139
|
-
|
|
3140
|
-
|
|
3141
|
-
|
|
3142
|
-
|
|
3143
|
-
|
|
3332
|
+
const fullContent = `${indent}"${key}": ${formattedValue}${comma}`;
|
|
3333
|
+
if (fullContent.length > SPLITTER_MAX_CHUNK_SIZE) {
|
|
3334
|
+
const textChunks = await this.textFallbackSplitter.splitText(formattedValue);
|
|
3335
|
+
chunks.push({
|
|
3336
|
+
types: ["code"],
|
|
3337
|
+
content: `${indent}"${key}": `,
|
|
3338
|
+
section: { level, path: path2 }
|
|
3339
|
+
});
|
|
3340
|
+
textChunks.forEach((textChunk, index) => {
|
|
3341
|
+
const isLastChunk = index === textChunks.length - 1;
|
|
3342
|
+
const content = `${textChunk.content}${isLastChunk ? comma : ""}`;
|
|
3343
|
+
chunks.push({
|
|
3344
|
+
types: ["code"],
|
|
3345
|
+
content,
|
|
3346
|
+
section: { level, path: path2 }
|
|
3347
|
+
});
|
|
3348
|
+
});
|
|
3349
|
+
} else {
|
|
3350
|
+
chunks.push({
|
|
3351
|
+
types: ["code"],
|
|
3352
|
+
content: fullContent,
|
|
3353
|
+
section: { level, path: path2 }
|
|
3354
|
+
});
|
|
3355
|
+
}
|
|
3144
3356
|
}
|
|
3145
3357
|
}
|
|
3146
|
-
processPrimitive(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3358
|
+
async processPrimitive(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3147
3359
|
const indent = this.getIndent(indentLevel);
|
|
3148
3360
|
const comma = isLastItem ? "" : ",";
|
|
3149
3361
|
const formattedValue = JSON.stringify(value);
|
|
3150
|
-
|
|
3151
|
-
|
|
3152
|
-
|
|
3153
|
-
|
|
3154
|
-
|
|
3362
|
+
const fullContent = `${indent}${formattedValue}${comma}`;
|
|
3363
|
+
if (fullContent.length > SPLITTER_MAX_CHUNK_SIZE) {
|
|
3364
|
+
const textChunks = await this.textFallbackSplitter.splitText(formattedValue);
|
|
3365
|
+
textChunks.forEach((textChunk, index) => {
|
|
3366
|
+
const isFirstChunk = index === 0;
|
|
3367
|
+
const isLastChunk = index === textChunks.length - 1;
|
|
3368
|
+
const valueContent = isFirstChunk ? `${indent}${textChunk.content}` : textChunk.content;
|
|
3369
|
+
const content = `${valueContent}${isLastChunk ? comma : ""}`;
|
|
3370
|
+
chunks.push({
|
|
3371
|
+
types: ["code"],
|
|
3372
|
+
content,
|
|
3373
|
+
section: { level, path: [...path2] }
|
|
3374
|
+
});
|
|
3375
|
+
});
|
|
3376
|
+
} else {
|
|
3377
|
+
chunks.push({
|
|
3378
|
+
types: ["code"],
|
|
3379
|
+
content: fullContent,
|
|
3380
|
+
section: { level, path: path2 }
|
|
3381
|
+
});
|
|
3382
|
+
}
|
|
3155
3383
|
}
|
|
3156
3384
|
getIndent(level) {
|
|
3157
3385
|
return this.preserveFormatting ? " ".repeat(level) : "";
|
|
3158
3386
|
}
|
|
3387
|
+
/**
|
|
3388
|
+
* Process a value that has exceeded the maximum depth limit by serializing it as text.
|
|
3389
|
+
* This prevents excessive chunking of deeply nested structures.
|
|
3390
|
+
* If the serialized value is too large, splits it using the text fallback splitter.
|
|
3391
|
+
*/
|
|
3392
|
+
async processValueAsText(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3393
|
+
const indent = this.getIndent(indentLevel);
|
|
3394
|
+
const comma = isLastItem ? "" : ",";
|
|
3395
|
+
let serialized;
|
|
3396
|
+
if (this.preserveFormatting) {
|
|
3397
|
+
const lines = JSON.stringify(value, null, 2).split("\n");
|
|
3398
|
+
serialized = lines.map((line, idx) => idx === 0 ? line : `${indent}${line}`).join("\n");
|
|
3399
|
+
} else {
|
|
3400
|
+
serialized = JSON.stringify(value);
|
|
3401
|
+
}
|
|
3402
|
+
const fullContent = `${indent}${serialized}${comma}`;
|
|
3403
|
+
if (fullContent.length > SPLITTER_MAX_CHUNK_SIZE) {
|
|
3404
|
+
const textChunks = await this.textFallbackSplitter.splitText(serialized);
|
|
3405
|
+
for (const textChunk of textChunks) {
|
|
3406
|
+
chunks.push({
|
|
3407
|
+
types: ["code"],
|
|
3408
|
+
content: textChunk.content,
|
|
3409
|
+
section: { level, path: [...path2] }
|
|
3410
|
+
});
|
|
3411
|
+
}
|
|
3412
|
+
} else {
|
|
3413
|
+
chunks.push({
|
|
3414
|
+
types: ["code"],
|
|
3415
|
+
content: fullContent,
|
|
3416
|
+
section: { level, path: [...path2] }
|
|
3417
|
+
});
|
|
3418
|
+
}
|
|
3419
|
+
}
|
|
3159
3420
|
}
|
|
3160
3421
|
class CodeContentSplitter {
|
|
3161
3422
|
constructor(options) {
|
|
@@ -3255,132 +3516,6 @@ class TableContentSplitter {
|
|
|
3255
3516
|
return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
|
|
3256
3517
|
}
|
|
3257
3518
|
}
|
|
3258
|
-
class TextContentSplitter {
|
|
3259
|
-
constructor(options) {
|
|
3260
|
-
this.options = options;
|
|
3261
|
-
}
|
|
3262
|
-
/**
|
|
3263
|
-
* Splits text content into chunks while trying to preserve semantic boundaries.
|
|
3264
|
-
* Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
|
|
3265
|
-
* Always preserves formatting - trimming should be done by higher-level splitters if needed.
|
|
3266
|
-
*/
|
|
3267
|
-
async split(content) {
|
|
3268
|
-
if (content.length <= this.options.chunkSize) {
|
|
3269
|
-
return [content];
|
|
3270
|
-
}
|
|
3271
|
-
const words = content.split(/\s+/);
|
|
3272
|
-
const longestWord = words.reduce(
|
|
3273
|
-
(max, word) => word.length > max.length ? word : max
|
|
3274
|
-
);
|
|
3275
|
-
if (longestWord.length > this.options.chunkSize) {
|
|
3276
|
-
throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
|
|
3277
|
-
}
|
|
3278
|
-
const paragraphChunks = this.splitByParagraphs(content);
|
|
3279
|
-
if (this.areChunksValid(paragraphChunks)) {
|
|
3280
|
-
return paragraphChunks;
|
|
3281
|
-
}
|
|
3282
|
-
const lineChunks = this.splitByLines(content);
|
|
3283
|
-
if (this.areChunksValid(lineChunks)) {
|
|
3284
|
-
return this.mergeChunks(lineChunks, "");
|
|
3285
|
-
}
|
|
3286
|
-
const wordChunks = await this.splitByWords(content);
|
|
3287
|
-
return this.mergeChunks(wordChunks, " ");
|
|
3288
|
-
}
|
|
3289
|
-
/**
|
|
3290
|
-
* Checks if all chunks are within the maximum size limit
|
|
3291
|
-
*/
|
|
3292
|
-
areChunksValid(chunks) {
|
|
3293
|
-
return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
|
|
3294
|
-
}
|
|
3295
|
-
/**
|
|
3296
|
-
* Splits text into chunks by paragraph boundaries (double newlines)
|
|
3297
|
-
* Preserves all formatting and whitespace including the paragraph separators
|
|
3298
|
-
*/
|
|
3299
|
-
splitByParagraphs(text) {
|
|
3300
|
-
const chunks = [];
|
|
3301
|
-
let startPos = 0;
|
|
3302
|
-
const paragraphRegex = /\n\s*\n/g;
|
|
3303
|
-
let match = paragraphRegex.exec(text);
|
|
3304
|
-
while (match !== null) {
|
|
3305
|
-
const endPos = match.index + match[0].length;
|
|
3306
|
-
const chunk = text.slice(startPos, endPos);
|
|
3307
|
-
if (chunk.length > 2) {
|
|
3308
|
-
chunks.push(chunk);
|
|
3309
|
-
}
|
|
3310
|
-
startPos = endPos;
|
|
3311
|
-
match = paragraphRegex.exec(text);
|
|
3312
|
-
}
|
|
3313
|
-
if (startPos < text.length) {
|
|
3314
|
-
const remainingChunk = text.slice(startPos);
|
|
3315
|
-
if (remainingChunk.length > 2) {
|
|
3316
|
-
chunks.push(remainingChunk);
|
|
3317
|
-
}
|
|
3318
|
-
}
|
|
3319
|
-
return chunks.filter(Boolean);
|
|
3320
|
-
}
|
|
3321
|
-
/**
|
|
3322
|
-
* Splits text into chunks by line boundaries
|
|
3323
|
-
* Preserves all formatting and whitespace, including newlines at the end of each line
|
|
3324
|
-
*/
|
|
3325
|
-
splitByLines(text) {
|
|
3326
|
-
const chunks = [];
|
|
3327
|
-
let startPos = 0;
|
|
3328
|
-
for (let i = 0; i < text.length; i++) {
|
|
3329
|
-
if (text[i] === "\n") {
|
|
3330
|
-
const chunk = text.slice(startPos, i + 1);
|
|
3331
|
-
chunks.push(chunk);
|
|
3332
|
-
startPos = i + 1;
|
|
3333
|
-
}
|
|
3334
|
-
}
|
|
3335
|
-
if (startPos < text.length) {
|
|
3336
|
-
chunks.push(text.slice(startPos));
|
|
3337
|
-
}
|
|
3338
|
-
return chunks;
|
|
3339
|
-
}
|
|
3340
|
-
/**
|
|
3341
|
-
* Uses LangChain's recursive splitter for word-based splitting as a last resort
|
|
3342
|
-
*/
|
|
3343
|
-
async splitByWords(text) {
|
|
3344
|
-
const splitter = new RecursiveCharacterTextSplitter({
|
|
3345
|
-
chunkSize: this.options.chunkSize,
|
|
3346
|
-
chunkOverlap: 0
|
|
3347
|
-
});
|
|
3348
|
-
const chunks = await splitter.splitText(text);
|
|
3349
|
-
return chunks;
|
|
3350
|
-
}
|
|
3351
|
-
/**
|
|
3352
|
-
* Attempts to merge small chunks with previous chunks to minimize fragmentation.
|
|
3353
|
-
* Only merges if combined size is within maxChunkSize.
|
|
3354
|
-
*/
|
|
3355
|
-
mergeChunks(chunks, separator) {
|
|
3356
|
-
const mergedChunks = [];
|
|
3357
|
-
let currentChunk = null;
|
|
3358
|
-
for (const chunk of chunks) {
|
|
3359
|
-
if (currentChunk === null) {
|
|
3360
|
-
currentChunk = chunk;
|
|
3361
|
-
continue;
|
|
3362
|
-
}
|
|
3363
|
-
const currentChunkSize = this.getChunkSize(currentChunk);
|
|
3364
|
-
const nextChunkSize = this.getChunkSize(chunk);
|
|
3365
|
-
if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
|
|
3366
|
-
currentChunk = `${currentChunk}${separator}${chunk}`;
|
|
3367
|
-
} else {
|
|
3368
|
-
mergedChunks.push(currentChunk);
|
|
3369
|
-
currentChunk = chunk;
|
|
3370
|
-
}
|
|
3371
|
-
}
|
|
3372
|
-
if (currentChunk) {
|
|
3373
|
-
mergedChunks.push(currentChunk);
|
|
3374
|
-
}
|
|
3375
|
-
return mergedChunks;
|
|
3376
|
-
}
|
|
3377
|
-
getChunkSize(chunk) {
|
|
3378
|
-
return chunk.length;
|
|
3379
|
-
}
|
|
3380
|
-
wrap(content) {
|
|
3381
|
-
return content;
|
|
3382
|
-
}
|
|
3383
|
-
}
|
|
3384
3519
|
class SemanticMarkdownSplitter {
|
|
3385
3520
|
constructor(preferredChunkSize, maxChunkSize) {
|
|
3386
3521
|
this.preferredChunkSize = preferredChunkSize;
|
|
@@ -5066,16 +5201,8 @@ class HtmlPlaywrightMiddleware {
|
|
|
5066
5201
|
*/
|
|
5067
5202
|
async ensureBrowser() {
|
|
5068
5203
|
if (!this.browser || !this.browser.isConnected()) {
|
|
5069
|
-
|
|
5070
|
-
|
|
5071
|
-
logger.debug(
|
|
5072
|
-
`Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
|
|
5073
|
-
);
|
|
5074
|
-
this.browser = await chromium.launch({
|
|
5075
|
-
channel: "chromium",
|
|
5076
|
-
args: launchArgs,
|
|
5077
|
-
executablePath
|
|
5078
|
-
});
|
|
5204
|
+
logger.debug("Launching new Playwright browser instance (Chromium)");
|
|
5205
|
+
this.browser = await BrowserFetcher.launchBrowser();
|
|
5079
5206
|
this.browser.on("disconnected", () => {
|
|
5080
5207
|
logger.debug("Playwright browser instance disconnected.");
|
|
5081
5208
|
this.browser = null;
|
|
@@ -6452,45 +6579,6 @@ class MarkdownPipeline extends BasePipeline {
|
|
|
6452
6579
|
};
|
|
6453
6580
|
}
|
|
6454
6581
|
}
|
|
6455
|
-
class TextDocumentSplitter {
|
|
6456
|
-
options;
|
|
6457
|
-
textSplitter;
|
|
6458
|
-
constructor(options = {}) {
|
|
6459
|
-
this.options = {
|
|
6460
|
-
maxChunkSize: options.maxChunkSize ?? SPLITTER_MAX_CHUNK_SIZE
|
|
6461
|
-
};
|
|
6462
|
-
this.textSplitter = new TextContentSplitter({
|
|
6463
|
-
chunkSize: this.options.maxChunkSize
|
|
6464
|
-
});
|
|
6465
|
-
}
|
|
6466
|
-
async splitText(content) {
|
|
6467
|
-
if (!content.trim()) {
|
|
6468
|
-
return [];
|
|
6469
|
-
}
|
|
6470
|
-
try {
|
|
6471
|
-
const chunks = await this.textSplitter.split(content);
|
|
6472
|
-
return chunks.map((chunk) => ({
|
|
6473
|
-
types: ["text"],
|
|
6474
|
-
content: chunk,
|
|
6475
|
-
section: {
|
|
6476
|
-
level: 0,
|
|
6477
|
-
path: []
|
|
6478
|
-
}
|
|
6479
|
-
}));
|
|
6480
|
-
} catch {
|
|
6481
|
-
return [
|
|
6482
|
-
{
|
|
6483
|
-
types: ["text"],
|
|
6484
|
-
content,
|
|
6485
|
-
section: {
|
|
6486
|
-
level: 0,
|
|
6487
|
-
path: []
|
|
6488
|
-
}
|
|
6489
|
-
}
|
|
6490
|
-
];
|
|
6491
|
-
}
|
|
6492
|
-
}
|
|
6493
|
-
}
|
|
6494
6582
|
class TextPipeline extends BasePipeline {
|
|
6495
6583
|
middleware;
|
|
6496
6584
|
splitter;
|
|
@@ -8973,9 +9061,10 @@ class DocumentStore {
|
|
|
8973
9061
|
* - Single texts that are too large are truncated and retried once
|
|
8974
9062
|
*
|
|
8975
9063
|
* @param texts Array of texts to embed
|
|
9064
|
+
* @param isRetry Internal flag to prevent duplicate warning logs
|
|
8976
9065
|
* @returns Array of embedding vectors
|
|
8977
9066
|
*/
|
|
8978
|
-
async embedDocumentsWithRetry(texts) {
|
|
9067
|
+
async embedDocumentsWithRetry(texts, isRetry = false) {
|
|
8979
9068
|
if (texts.length === 0) {
|
|
8980
9069
|
return [];
|
|
8981
9070
|
}
|
|
@@ -8987,26 +9076,27 @@ class DocumentStore {
|
|
|
8987
9076
|
const midpoint = Math.floor(texts.length / 2);
|
|
8988
9077
|
const firstHalf = texts.slice(0, midpoint);
|
|
8989
9078
|
const secondHalf = texts.slice(midpoint);
|
|
8990
|
-
|
|
8991
|
-
|
|
8992
|
-
|
|
9079
|
+
if (!isRetry) {
|
|
9080
|
+
logger.warn(
|
|
9081
|
+
`⚠️ Batch of ${texts.length} texts exceeded size limit, splitting into ${firstHalf.length} + ${secondHalf.length}`
|
|
9082
|
+
);
|
|
9083
|
+
}
|
|
8993
9084
|
const [firstEmbeddings, secondEmbeddings] = await Promise.all([
|
|
8994
|
-
this.embedDocumentsWithRetry(firstHalf),
|
|
8995
|
-
this.embedDocumentsWithRetry(secondHalf)
|
|
9085
|
+
this.embedDocumentsWithRetry(firstHalf, true),
|
|
9086
|
+
this.embedDocumentsWithRetry(secondHalf, true)
|
|
8996
9087
|
]);
|
|
8997
9088
|
return [...firstEmbeddings, ...secondEmbeddings];
|
|
8998
9089
|
} else {
|
|
8999
9090
|
const text = texts[0];
|
|
9000
9091
|
const midpoint = Math.floor(text.length / 2);
|
|
9001
9092
|
const firstHalf = text.substring(0, midpoint);
|
|
9002
|
-
|
|
9003
|
-
|
|
9004
|
-
|
|
9005
|
-
try {
|
|
9006
|
-
const embedding = await this.embedDocumentsWithRetry([firstHalf]);
|
|
9007
|
-
logger.info(
|
|
9008
|
-
`✓ Using embedding from first half of split text (${firstHalf.length} chars)`
|
|
9093
|
+
if (!isRetry) {
|
|
9094
|
+
logger.warn(
|
|
9095
|
+
`⚠️ Single text exceeded embedding size limit (${text.length} chars).`
|
|
9009
9096
|
);
|
|
9097
|
+
}
|
|
9098
|
+
try {
|
|
9099
|
+
const embedding = await this.embedDocumentsWithRetry([firstHalf], true);
|
|
9010
9100
|
return embedding;
|
|
9011
9101
|
} catch (retryError) {
|
|
9012
9102
|
logger.error(
|
|
@@ -9130,8 +9220,8 @@ class DocumentStore {
|
|
|
9130
9220
|
const rowId = result2.lastInsertRowid;
|
|
9131
9221
|
if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
|
|
9132
9222
|
this.statements.insertEmbedding.run(
|
|
9133
|
-
|
|
9134
|
-
|
|
9223
|
+
JSON.stringify(paddedEmbeddings[docIndex]),
|
|
9224
|
+
BigInt(rowId)
|
|
9135
9225
|
);
|
|
9136
9226
|
}
|
|
9137
9227
|
docIndex++;
|
|
@@ -10770,7 +10860,7 @@ const Layout = ({
|
|
|
10770
10860
|
children,
|
|
10771
10861
|
eventClientConfig
|
|
10772
10862
|
}) => {
|
|
10773
|
-
const versionString = version || "1.
|
|
10863
|
+
const versionString = version || "1.33.1";
|
|
10774
10864
|
const versionInitializer = `versionUpdate({ currentVersion: ${`'${versionString}'`} })`;
|
|
10775
10865
|
return /* @__PURE__ */ jsxs("html", { lang: "en", children: [
|
|
10776
10866
|
/* @__PURE__ */ jsxs("head", { children: [
|
|
@@ -13082,7 +13172,7 @@ class AppServer {
|
|
|
13082
13172
|
try {
|
|
13083
13173
|
if (telemetry.isEnabled()) {
|
|
13084
13174
|
telemetry.setGlobalContext({
|
|
13085
|
-
appVersion: "1.
|
|
13175
|
+
appVersion: "1.33.1",
|
|
13086
13176
|
appPlatform: process.platform,
|
|
13087
13177
|
appNodeVersion: process.version,
|
|
13088
13178
|
appServicesEnabled: this.getActiveServicesList(),
|
|
@@ -16569,7 +16659,7 @@ function createCliProgram() {
|
|
|
16569
16659
|
const commandStartTimes = /* @__PURE__ */ new Map();
|
|
16570
16660
|
let globalEventBus = null;
|
|
16571
16661
|
let globalTelemetryService = null;
|
|
16572
|
-
program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version("1.
|
|
16662
|
+
program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version("1.33.1").addOption(
|
|
16573
16663
|
new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent")
|
|
16574
16664
|
).addOption(new Option("--silent", "Disable all logging except errors")).addOption(
|
|
16575
16665
|
new Option("--telemetry", "Enable telemetry collection").env("DOCS_MCP_TELEMETRY").argParser((value) => {
|
|
@@ -16603,7 +16693,7 @@ function createCliProgram() {
|
|
|
16603
16693
|
if (shouldEnableTelemetry()) {
|
|
16604
16694
|
if (telemetry.isEnabled()) {
|
|
16605
16695
|
telemetry.setGlobalContext({
|
|
16606
|
-
appVersion: "1.
|
|
16696
|
+
appVersion: "1.33.1",
|
|
16607
16697
|
appPlatform: process.platform,
|
|
16608
16698
|
appNodeVersion: process.version,
|
|
16609
16699
|
appInterface: "cli",
|