@arabold/docs-mcp-server 1.31.1 → 1.33.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/db/migrations/011-add-vector-triggers.sql +45 -0
- package/dist/index.js +345 -212
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1472,6 +1472,8 @@ const FETCHER_MAX_CACHE_ITEM_SIZE_BYTES = 500 * 1024;
|
|
|
1472
1472
|
const SPLITTER_MIN_CHUNK_SIZE = 500;
|
|
1473
1473
|
const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
|
|
1474
1474
|
const SPLITTER_MAX_CHUNK_SIZE = 5e3;
|
|
1475
|
+
const JSON_MAX_NESTING_DEPTH = 5;
|
|
1476
|
+
const JSON_MAX_CHUNKS = 1e3;
|
|
1475
1477
|
const EMBEDDING_BATCH_SIZE = 100;
|
|
1476
1478
|
const EMBEDDING_BATCH_CHARS = 5e4;
|
|
1477
1479
|
const MIGRATION_MAX_RETRIES = 5;
|
|
@@ -3044,16 +3046,203 @@ class GreedySplitter {
|
|
|
3044
3046
|
return common;
|
|
3045
3047
|
}
|
|
3046
3048
|
}
|
|
3049
|
+
class TextContentSplitter {
|
|
3050
|
+
constructor(options) {
|
|
3051
|
+
this.options = options;
|
|
3052
|
+
}
|
|
3053
|
+
/**
|
|
3054
|
+
* Splits text content into chunks while trying to preserve semantic boundaries.
|
|
3055
|
+
* Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
|
|
3056
|
+
* Always preserves formatting - trimming should be done by higher-level splitters if needed.
|
|
3057
|
+
*/
|
|
3058
|
+
async split(content) {
|
|
3059
|
+
if (content.length <= this.options.chunkSize) {
|
|
3060
|
+
return [content];
|
|
3061
|
+
}
|
|
3062
|
+
const words = content.split(/\s+/);
|
|
3063
|
+
const longestWord = words.reduce(
|
|
3064
|
+
(max, word) => word.length > max.length ? word : max
|
|
3065
|
+
);
|
|
3066
|
+
if (longestWord.length > this.options.chunkSize) {
|
|
3067
|
+
throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
|
|
3068
|
+
}
|
|
3069
|
+
const paragraphChunks = this.splitByParagraphs(content);
|
|
3070
|
+
if (this.areChunksValid(paragraphChunks)) {
|
|
3071
|
+
return paragraphChunks;
|
|
3072
|
+
}
|
|
3073
|
+
const lineChunks = this.splitByLines(content);
|
|
3074
|
+
if (this.areChunksValid(lineChunks)) {
|
|
3075
|
+
return this.mergeChunks(lineChunks, "");
|
|
3076
|
+
}
|
|
3077
|
+
const wordChunks = await this.splitByWords(content);
|
|
3078
|
+
return this.mergeChunks(wordChunks, " ");
|
|
3079
|
+
}
|
|
3080
|
+
/**
|
|
3081
|
+
* Checks if all chunks are within the maximum size limit
|
|
3082
|
+
*/
|
|
3083
|
+
areChunksValid(chunks) {
|
|
3084
|
+
return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
|
|
3085
|
+
}
|
|
3086
|
+
/**
|
|
3087
|
+
* Splits text into chunks by paragraph boundaries (double newlines)
|
|
3088
|
+
* Preserves all formatting and whitespace including the paragraph separators
|
|
3089
|
+
*/
|
|
3090
|
+
splitByParagraphs(text) {
|
|
3091
|
+
const chunks = [];
|
|
3092
|
+
let startPos = 0;
|
|
3093
|
+
const paragraphRegex = /\n\s*\n/g;
|
|
3094
|
+
let match = paragraphRegex.exec(text);
|
|
3095
|
+
while (match !== null) {
|
|
3096
|
+
const endPos = match.index + match[0].length;
|
|
3097
|
+
const chunk = text.slice(startPos, endPos);
|
|
3098
|
+
if (chunk.length > 2) {
|
|
3099
|
+
chunks.push(chunk);
|
|
3100
|
+
}
|
|
3101
|
+
startPos = endPos;
|
|
3102
|
+
match = paragraphRegex.exec(text);
|
|
3103
|
+
}
|
|
3104
|
+
if (startPos < text.length) {
|
|
3105
|
+
const remainingChunk = text.slice(startPos);
|
|
3106
|
+
if (remainingChunk.length > 2) {
|
|
3107
|
+
chunks.push(remainingChunk);
|
|
3108
|
+
}
|
|
3109
|
+
}
|
|
3110
|
+
return chunks.filter(Boolean);
|
|
3111
|
+
}
|
|
3112
|
+
/**
|
|
3113
|
+
* Splits text into chunks by line boundaries
|
|
3114
|
+
* Preserves all formatting and whitespace, including newlines at the end of each line
|
|
3115
|
+
*/
|
|
3116
|
+
splitByLines(text) {
|
|
3117
|
+
const chunks = [];
|
|
3118
|
+
let startPos = 0;
|
|
3119
|
+
for (let i = 0; i < text.length; i++) {
|
|
3120
|
+
if (text[i] === "\n") {
|
|
3121
|
+
const chunk = text.slice(startPos, i + 1);
|
|
3122
|
+
chunks.push(chunk);
|
|
3123
|
+
startPos = i + 1;
|
|
3124
|
+
}
|
|
3125
|
+
}
|
|
3126
|
+
if (startPos < text.length) {
|
|
3127
|
+
chunks.push(text.slice(startPos));
|
|
3128
|
+
}
|
|
3129
|
+
return chunks;
|
|
3130
|
+
}
|
|
3131
|
+
/**
|
|
3132
|
+
* Uses LangChain's recursive splitter for word-based splitting as a last resort
|
|
3133
|
+
*/
|
|
3134
|
+
async splitByWords(text) {
|
|
3135
|
+
const splitter = new RecursiveCharacterTextSplitter({
|
|
3136
|
+
chunkSize: this.options.chunkSize,
|
|
3137
|
+
chunkOverlap: 0
|
|
3138
|
+
});
|
|
3139
|
+
const chunks = await splitter.splitText(text);
|
|
3140
|
+
return chunks;
|
|
3141
|
+
}
|
|
3142
|
+
/**
|
|
3143
|
+
* Attempts to merge small chunks with previous chunks to minimize fragmentation.
|
|
3144
|
+
* Only merges if combined size is within maxChunkSize.
|
|
3145
|
+
*/
|
|
3146
|
+
mergeChunks(chunks, separator) {
|
|
3147
|
+
const mergedChunks = [];
|
|
3148
|
+
let currentChunk = null;
|
|
3149
|
+
for (const chunk of chunks) {
|
|
3150
|
+
if (currentChunk === null) {
|
|
3151
|
+
currentChunk = chunk;
|
|
3152
|
+
continue;
|
|
3153
|
+
}
|
|
3154
|
+
const currentChunkSize = this.getChunkSize(currentChunk);
|
|
3155
|
+
const nextChunkSize = this.getChunkSize(chunk);
|
|
3156
|
+
if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
|
|
3157
|
+
currentChunk = `${currentChunk}${separator}${chunk}`;
|
|
3158
|
+
} else {
|
|
3159
|
+
mergedChunks.push(currentChunk);
|
|
3160
|
+
currentChunk = chunk;
|
|
3161
|
+
}
|
|
3162
|
+
}
|
|
3163
|
+
if (currentChunk) {
|
|
3164
|
+
mergedChunks.push(currentChunk);
|
|
3165
|
+
}
|
|
3166
|
+
return mergedChunks;
|
|
3167
|
+
}
|
|
3168
|
+
getChunkSize(chunk) {
|
|
3169
|
+
return chunk.length;
|
|
3170
|
+
}
|
|
3171
|
+
wrap(content) {
|
|
3172
|
+
return content;
|
|
3173
|
+
}
|
|
3174
|
+
}
|
|
3175
|
+
class TextDocumentSplitter {
|
|
3176
|
+
options;
|
|
3177
|
+
textSplitter;
|
|
3178
|
+
constructor(options = {}) {
|
|
3179
|
+
this.options = {
|
|
3180
|
+
maxChunkSize: options.maxChunkSize ?? SPLITTER_MAX_CHUNK_SIZE
|
|
3181
|
+
};
|
|
3182
|
+
this.textSplitter = new TextContentSplitter({
|
|
3183
|
+
chunkSize: this.options.maxChunkSize
|
|
3184
|
+
});
|
|
3185
|
+
}
|
|
3186
|
+
async splitText(content) {
|
|
3187
|
+
if (!content.trim()) {
|
|
3188
|
+
return [];
|
|
3189
|
+
}
|
|
3190
|
+
try {
|
|
3191
|
+
const chunks = await this.textSplitter.split(content);
|
|
3192
|
+
return chunks.map((chunk) => ({
|
|
3193
|
+
types: ["text"],
|
|
3194
|
+
content: chunk,
|
|
3195
|
+
section: {
|
|
3196
|
+
level: 0,
|
|
3197
|
+
path: []
|
|
3198
|
+
}
|
|
3199
|
+
}));
|
|
3200
|
+
} catch (error) {
|
|
3201
|
+
if (!(error instanceof MinimumChunkSizeError) && error instanceof Error) {
|
|
3202
|
+
console.warn(
|
|
3203
|
+
`Unexpected text splitting error: ${error.message}. Forcing character-based split.`
|
|
3204
|
+
);
|
|
3205
|
+
}
|
|
3206
|
+
const chunks = [];
|
|
3207
|
+
let offset = 0;
|
|
3208
|
+
while (offset < content.length) {
|
|
3209
|
+
const chunkContent = content.substring(
|
|
3210
|
+
offset,
|
|
3211
|
+
offset + this.options.maxChunkSize
|
|
3212
|
+
);
|
|
3213
|
+
chunks.push({
|
|
3214
|
+
types: ["text"],
|
|
3215
|
+
content: chunkContent,
|
|
3216
|
+
section: {
|
|
3217
|
+
level: 0,
|
|
3218
|
+
path: []
|
|
3219
|
+
}
|
|
3220
|
+
});
|
|
3221
|
+
offset += this.options.maxChunkSize;
|
|
3222
|
+
}
|
|
3223
|
+
return chunks;
|
|
3224
|
+
}
|
|
3225
|
+
}
|
|
3226
|
+
}
|
|
3047
3227
|
class JsonDocumentSplitter {
|
|
3048
3228
|
preserveFormatting;
|
|
3229
|
+
maxDepth;
|
|
3230
|
+
maxChunks;
|
|
3231
|
+
textFallbackSplitter;
|
|
3049
3232
|
constructor(options = {}) {
|
|
3050
3233
|
this.preserveFormatting = options.preserveFormatting ?? true;
|
|
3234
|
+
this.maxDepth = options.maxDepth ?? JSON_MAX_NESTING_DEPTH;
|
|
3235
|
+
this.maxChunks = options.maxChunks ?? JSON_MAX_CHUNKS;
|
|
3236
|
+
this.textFallbackSplitter = new TextDocumentSplitter();
|
|
3051
3237
|
}
|
|
3052
3238
|
async splitText(content, _contentType) {
|
|
3053
3239
|
try {
|
|
3054
3240
|
const parsed = JSON.parse(content);
|
|
3055
3241
|
const chunks = [];
|
|
3056
|
-
this.processValue(parsed, ["root"], 1, 0, chunks, true);
|
|
3242
|
+
await this.processValue(parsed, ["root"], 1, 0, chunks, true);
|
|
3243
|
+
if (chunks.length > this.maxChunks) {
|
|
3244
|
+
return this.textFallbackSplitter.splitText(content);
|
|
3245
|
+
}
|
|
3057
3246
|
return chunks;
|
|
3058
3247
|
} catch {
|
|
3059
3248
|
return [
|
|
@@ -3068,16 +3257,20 @@ class JsonDocumentSplitter {
|
|
|
3068
3257
|
];
|
|
3069
3258
|
}
|
|
3070
3259
|
}
|
|
3071
|
-
processValue(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3260
|
+
async processValue(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3261
|
+
if (level > this.maxDepth) {
|
|
3262
|
+
await this.processValueAsText(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3263
|
+
return;
|
|
3264
|
+
}
|
|
3072
3265
|
if (Array.isArray(value)) {
|
|
3073
|
-
this.processArray(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3266
|
+
await this.processArray(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3074
3267
|
} else if (value !== null && typeof value === "object") {
|
|
3075
|
-
this.processObject(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3268
|
+
await this.processObject(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3076
3269
|
} else {
|
|
3077
|
-
this.processPrimitive(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3270
|
+
await this.processPrimitive(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3078
3271
|
}
|
|
3079
3272
|
}
|
|
3080
|
-
processArray(array, path2, level, indentLevel, chunks, isLastItem) {
|
|
3273
|
+
async processArray(array, path2, level, indentLevel, chunks, isLastItem) {
|
|
3081
3274
|
const indent = this.getIndent(indentLevel);
|
|
3082
3275
|
const comma = isLastItem ? "" : ",";
|
|
3083
3276
|
chunks.push({
|
|
@@ -3085,18 +3278,19 @@ class JsonDocumentSplitter {
|
|
|
3085
3278
|
content: `${indent}[`,
|
|
3086
3279
|
section: { level, path: [...path2] }
|
|
3087
3280
|
});
|
|
3088
|
-
array.forEach((item, index) => {
|
|
3281
|
+
for (let index = 0; index < array.length; index++) {
|
|
3282
|
+
const item = array[index];
|
|
3089
3283
|
const isLast = index === array.length - 1;
|
|
3090
3284
|
const itemPath = [...path2, `[${index}]`];
|
|
3091
|
-
this.processValue(item, itemPath, level + 1, indentLevel + 1, chunks, isLast);
|
|
3092
|
-
});
|
|
3285
|
+
await this.processValue(item, itemPath, level + 1, indentLevel + 1, chunks, isLast);
|
|
3286
|
+
}
|
|
3093
3287
|
chunks.push({
|
|
3094
3288
|
types: ["code"],
|
|
3095
3289
|
content: `${indent}]${comma}`,
|
|
3096
3290
|
section: { level, path: [...path2] }
|
|
3097
3291
|
});
|
|
3098
3292
|
}
|
|
3099
|
-
processObject(obj, path2, level, indentLevel, chunks, isLastItem) {
|
|
3293
|
+
async processObject(obj, path2, level, indentLevel, chunks, isLastItem) {
|
|
3100
3294
|
const indent = this.getIndent(indentLevel);
|
|
3101
3295
|
const comma = isLastItem ? "" : ",";
|
|
3102
3296
|
const entries = Object.entries(obj);
|
|
@@ -3105,10 +3299,11 @@ class JsonDocumentSplitter {
|
|
|
3105
3299
|
content: `${indent}{`,
|
|
3106
3300
|
section: { level, path: [...path2] }
|
|
3107
3301
|
});
|
|
3108
|
-
entries.forEach(([key, value], index) => {
|
|
3302
|
+
for (let index = 0; index < entries.length; index++) {
|
|
3303
|
+
const [key, value] = entries[index];
|
|
3109
3304
|
const isLast = index === entries.length - 1;
|
|
3110
3305
|
const propertyPath = [...path2, key];
|
|
3111
|
-
this.processProperty(
|
|
3306
|
+
await this.processProperty(
|
|
3112
3307
|
key,
|
|
3113
3308
|
value,
|
|
3114
3309
|
propertyPath,
|
|
@@ -3117,14 +3312,14 @@ class JsonDocumentSplitter {
|
|
|
3117
3312
|
chunks,
|
|
3118
3313
|
isLast
|
|
3119
3314
|
);
|
|
3120
|
-
});
|
|
3315
|
+
}
|
|
3121
3316
|
chunks.push({
|
|
3122
3317
|
types: ["code"],
|
|
3123
3318
|
content: `${indent}}${comma}`,
|
|
3124
3319
|
section: { level, path: [...path2] }
|
|
3125
3320
|
});
|
|
3126
3321
|
}
|
|
3127
|
-
processProperty(key, value, path2, level, indentLevel, chunks, isLastProperty) {
|
|
3322
|
+
async processProperty(key, value, path2, level, indentLevel, chunks, isLastProperty) {
|
|
3128
3323
|
const indent = this.getIndent(indentLevel);
|
|
3129
3324
|
if (typeof value === "object" && value !== null) {
|
|
3130
3325
|
chunks.push({
|
|
@@ -3132,30 +3327,98 @@ class JsonDocumentSplitter {
|
|
|
3132
3327
|
content: `${indent}"${key}": `,
|
|
3133
3328
|
section: { level, path: path2 }
|
|
3134
3329
|
});
|
|
3135
|
-
this.processValue(value, path2, level, indentLevel, chunks, isLastProperty);
|
|
3330
|
+
await this.processValue(value, path2, level, indentLevel, chunks, isLastProperty);
|
|
3136
3331
|
} else {
|
|
3137
3332
|
const comma = isLastProperty ? "" : ",";
|
|
3138
3333
|
const formattedValue = JSON.stringify(value);
|
|
3139
|
-
|
|
3140
|
-
|
|
3141
|
-
|
|
3142
|
-
|
|
3143
|
-
|
|
3334
|
+
const fullContent = `${indent}"${key}": ${formattedValue}${comma}`;
|
|
3335
|
+
if (fullContent.length > SPLITTER_MAX_CHUNK_SIZE) {
|
|
3336
|
+
const textChunks = await this.textFallbackSplitter.splitText(formattedValue);
|
|
3337
|
+
chunks.push({
|
|
3338
|
+
types: ["code"],
|
|
3339
|
+
content: `${indent}"${key}": `,
|
|
3340
|
+
section: { level, path: path2 }
|
|
3341
|
+
});
|
|
3342
|
+
textChunks.forEach((textChunk, index) => {
|
|
3343
|
+
const isLastChunk = index === textChunks.length - 1;
|
|
3344
|
+
const content = `${textChunk.content}${isLastChunk ? comma : ""}`;
|
|
3345
|
+
chunks.push({
|
|
3346
|
+
types: ["code"],
|
|
3347
|
+
content,
|
|
3348
|
+
section: { level, path: path2 }
|
|
3349
|
+
});
|
|
3350
|
+
});
|
|
3351
|
+
} else {
|
|
3352
|
+
chunks.push({
|
|
3353
|
+
types: ["code"],
|
|
3354
|
+
content: fullContent,
|
|
3355
|
+
section: { level, path: path2 }
|
|
3356
|
+
});
|
|
3357
|
+
}
|
|
3144
3358
|
}
|
|
3145
3359
|
}
|
|
3146
|
-
processPrimitive(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3360
|
+
async processPrimitive(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3147
3361
|
const indent = this.getIndent(indentLevel);
|
|
3148
3362
|
const comma = isLastItem ? "" : ",";
|
|
3149
3363
|
const formattedValue = JSON.stringify(value);
|
|
3150
|
-
|
|
3151
|
-
|
|
3152
|
-
|
|
3153
|
-
|
|
3154
|
-
|
|
3364
|
+
const fullContent = `${indent}${formattedValue}${comma}`;
|
|
3365
|
+
if (fullContent.length > SPLITTER_MAX_CHUNK_SIZE) {
|
|
3366
|
+
const textChunks = await this.textFallbackSplitter.splitText(formattedValue);
|
|
3367
|
+
textChunks.forEach((textChunk, index) => {
|
|
3368
|
+
const isFirstChunk = index === 0;
|
|
3369
|
+
const isLastChunk = index === textChunks.length - 1;
|
|
3370
|
+
const valueContent = isFirstChunk ? `${indent}${textChunk.content}` : textChunk.content;
|
|
3371
|
+
const content = `${valueContent}${isLastChunk ? comma : ""}`;
|
|
3372
|
+
chunks.push({
|
|
3373
|
+
types: ["code"],
|
|
3374
|
+
content,
|
|
3375
|
+
section: { level, path: [...path2] }
|
|
3376
|
+
});
|
|
3377
|
+
});
|
|
3378
|
+
} else {
|
|
3379
|
+
chunks.push({
|
|
3380
|
+
types: ["code"],
|
|
3381
|
+
content: fullContent,
|
|
3382
|
+
section: { level, path: path2 }
|
|
3383
|
+
});
|
|
3384
|
+
}
|
|
3155
3385
|
}
|
|
3156
3386
|
getIndent(level) {
|
|
3157
3387
|
return this.preserveFormatting ? " ".repeat(level) : "";
|
|
3158
3388
|
}
|
|
3389
|
+
/**
|
|
3390
|
+
* Process a value that has exceeded the maximum depth limit by serializing it as text.
|
|
3391
|
+
* This prevents excessive chunking of deeply nested structures.
|
|
3392
|
+
* If the serialized value is too large, splits it using the text fallback splitter.
|
|
3393
|
+
*/
|
|
3394
|
+
async processValueAsText(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3395
|
+
const indent = this.getIndent(indentLevel);
|
|
3396
|
+
const comma = isLastItem ? "" : ",";
|
|
3397
|
+
let serialized;
|
|
3398
|
+
if (this.preserveFormatting) {
|
|
3399
|
+
const lines = JSON.stringify(value, null, 2).split("\n");
|
|
3400
|
+
serialized = lines.map((line, idx) => idx === 0 ? line : `${indent}${line}`).join("\n");
|
|
3401
|
+
} else {
|
|
3402
|
+
serialized = JSON.stringify(value);
|
|
3403
|
+
}
|
|
3404
|
+
const fullContent = `${indent}${serialized}${comma}`;
|
|
3405
|
+
if (fullContent.length > SPLITTER_MAX_CHUNK_SIZE) {
|
|
3406
|
+
const textChunks = await this.textFallbackSplitter.splitText(serialized);
|
|
3407
|
+
for (const textChunk of textChunks) {
|
|
3408
|
+
chunks.push({
|
|
3409
|
+
types: ["code"],
|
|
3410
|
+
content: textChunk.content,
|
|
3411
|
+
section: { level, path: [...path2] }
|
|
3412
|
+
});
|
|
3413
|
+
}
|
|
3414
|
+
} else {
|
|
3415
|
+
chunks.push({
|
|
3416
|
+
types: ["code"],
|
|
3417
|
+
content: fullContent,
|
|
3418
|
+
section: { level, path: [...path2] }
|
|
3419
|
+
});
|
|
3420
|
+
}
|
|
3421
|
+
}
|
|
3159
3422
|
}
|
|
3160
3423
|
class CodeContentSplitter {
|
|
3161
3424
|
constructor(options) {
|
|
@@ -3255,132 +3518,6 @@ class TableContentSplitter {
|
|
|
3255
3518
|
return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
|
|
3256
3519
|
}
|
|
3257
3520
|
}
|
|
3258
|
-
class TextContentSplitter {
|
|
3259
|
-
constructor(options) {
|
|
3260
|
-
this.options = options;
|
|
3261
|
-
}
|
|
3262
|
-
/**
|
|
3263
|
-
* Splits text content into chunks while trying to preserve semantic boundaries.
|
|
3264
|
-
* Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
|
|
3265
|
-
* Always preserves formatting - trimming should be done by higher-level splitters if needed.
|
|
3266
|
-
*/
|
|
3267
|
-
async split(content) {
|
|
3268
|
-
if (content.length <= this.options.chunkSize) {
|
|
3269
|
-
return [content];
|
|
3270
|
-
}
|
|
3271
|
-
const words = content.split(/\s+/);
|
|
3272
|
-
const longestWord = words.reduce(
|
|
3273
|
-
(max, word) => word.length > max.length ? word : max
|
|
3274
|
-
);
|
|
3275
|
-
if (longestWord.length > this.options.chunkSize) {
|
|
3276
|
-
throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
|
|
3277
|
-
}
|
|
3278
|
-
const paragraphChunks = this.splitByParagraphs(content);
|
|
3279
|
-
if (this.areChunksValid(paragraphChunks)) {
|
|
3280
|
-
return paragraphChunks;
|
|
3281
|
-
}
|
|
3282
|
-
const lineChunks = this.splitByLines(content);
|
|
3283
|
-
if (this.areChunksValid(lineChunks)) {
|
|
3284
|
-
return this.mergeChunks(lineChunks, "");
|
|
3285
|
-
}
|
|
3286
|
-
const wordChunks = await this.splitByWords(content);
|
|
3287
|
-
return this.mergeChunks(wordChunks, " ");
|
|
3288
|
-
}
|
|
3289
|
-
/**
|
|
3290
|
-
* Checks if all chunks are within the maximum size limit
|
|
3291
|
-
*/
|
|
3292
|
-
areChunksValid(chunks) {
|
|
3293
|
-
return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
|
|
3294
|
-
}
|
|
3295
|
-
/**
|
|
3296
|
-
* Splits text into chunks by paragraph boundaries (double newlines)
|
|
3297
|
-
* Preserves all formatting and whitespace including the paragraph separators
|
|
3298
|
-
*/
|
|
3299
|
-
splitByParagraphs(text) {
|
|
3300
|
-
const chunks = [];
|
|
3301
|
-
let startPos = 0;
|
|
3302
|
-
const paragraphRegex = /\n\s*\n/g;
|
|
3303
|
-
let match = paragraphRegex.exec(text);
|
|
3304
|
-
while (match !== null) {
|
|
3305
|
-
const endPos = match.index + match[0].length;
|
|
3306
|
-
const chunk = text.slice(startPos, endPos);
|
|
3307
|
-
if (chunk.length > 2) {
|
|
3308
|
-
chunks.push(chunk);
|
|
3309
|
-
}
|
|
3310
|
-
startPos = endPos;
|
|
3311
|
-
match = paragraphRegex.exec(text);
|
|
3312
|
-
}
|
|
3313
|
-
if (startPos < text.length) {
|
|
3314
|
-
const remainingChunk = text.slice(startPos);
|
|
3315
|
-
if (remainingChunk.length > 2) {
|
|
3316
|
-
chunks.push(remainingChunk);
|
|
3317
|
-
}
|
|
3318
|
-
}
|
|
3319
|
-
return chunks.filter(Boolean);
|
|
3320
|
-
}
|
|
3321
|
-
/**
|
|
3322
|
-
* Splits text into chunks by line boundaries
|
|
3323
|
-
* Preserves all formatting and whitespace, including newlines at the end of each line
|
|
3324
|
-
*/
|
|
3325
|
-
splitByLines(text) {
|
|
3326
|
-
const chunks = [];
|
|
3327
|
-
let startPos = 0;
|
|
3328
|
-
for (let i = 0; i < text.length; i++) {
|
|
3329
|
-
if (text[i] === "\n") {
|
|
3330
|
-
const chunk = text.slice(startPos, i + 1);
|
|
3331
|
-
chunks.push(chunk);
|
|
3332
|
-
startPos = i + 1;
|
|
3333
|
-
}
|
|
3334
|
-
}
|
|
3335
|
-
if (startPos < text.length) {
|
|
3336
|
-
chunks.push(text.slice(startPos));
|
|
3337
|
-
}
|
|
3338
|
-
return chunks;
|
|
3339
|
-
}
|
|
3340
|
-
/**
|
|
3341
|
-
* Uses LangChain's recursive splitter for word-based splitting as a last resort
|
|
3342
|
-
*/
|
|
3343
|
-
async splitByWords(text) {
|
|
3344
|
-
const splitter = new RecursiveCharacterTextSplitter({
|
|
3345
|
-
chunkSize: this.options.chunkSize,
|
|
3346
|
-
chunkOverlap: 0
|
|
3347
|
-
});
|
|
3348
|
-
const chunks = await splitter.splitText(text);
|
|
3349
|
-
return chunks;
|
|
3350
|
-
}
|
|
3351
|
-
/**
|
|
3352
|
-
* Attempts to merge small chunks with previous chunks to minimize fragmentation.
|
|
3353
|
-
* Only merges if combined size is within maxChunkSize.
|
|
3354
|
-
*/
|
|
3355
|
-
mergeChunks(chunks, separator) {
|
|
3356
|
-
const mergedChunks = [];
|
|
3357
|
-
let currentChunk = null;
|
|
3358
|
-
for (const chunk of chunks) {
|
|
3359
|
-
if (currentChunk === null) {
|
|
3360
|
-
currentChunk = chunk;
|
|
3361
|
-
continue;
|
|
3362
|
-
}
|
|
3363
|
-
const currentChunkSize = this.getChunkSize(currentChunk);
|
|
3364
|
-
const nextChunkSize = this.getChunkSize(chunk);
|
|
3365
|
-
if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
|
|
3366
|
-
currentChunk = `${currentChunk}${separator}${chunk}`;
|
|
3367
|
-
} else {
|
|
3368
|
-
mergedChunks.push(currentChunk);
|
|
3369
|
-
currentChunk = chunk;
|
|
3370
|
-
}
|
|
3371
|
-
}
|
|
3372
|
-
if (currentChunk) {
|
|
3373
|
-
mergedChunks.push(currentChunk);
|
|
3374
|
-
}
|
|
3375
|
-
return mergedChunks;
|
|
3376
|
-
}
|
|
3377
|
-
getChunkSize(chunk) {
|
|
3378
|
-
return chunk.length;
|
|
3379
|
-
}
|
|
3380
|
-
wrap(content) {
|
|
3381
|
-
return content;
|
|
3382
|
-
}
|
|
3383
|
-
}
|
|
3384
3521
|
class SemanticMarkdownSplitter {
|
|
3385
3522
|
constructor(preferredChunkSize, maxChunkSize) {
|
|
3386
3523
|
this.preferredChunkSize = preferredChunkSize;
|
|
@@ -6452,45 +6589,6 @@ class MarkdownPipeline extends BasePipeline {
|
|
|
6452
6589
|
};
|
|
6453
6590
|
}
|
|
6454
6591
|
}
|
|
6455
|
-
class TextDocumentSplitter {
|
|
6456
|
-
options;
|
|
6457
|
-
textSplitter;
|
|
6458
|
-
constructor(options = {}) {
|
|
6459
|
-
this.options = {
|
|
6460
|
-
maxChunkSize: options.maxChunkSize ?? SPLITTER_MAX_CHUNK_SIZE
|
|
6461
|
-
};
|
|
6462
|
-
this.textSplitter = new TextContentSplitter({
|
|
6463
|
-
chunkSize: this.options.maxChunkSize
|
|
6464
|
-
});
|
|
6465
|
-
}
|
|
6466
|
-
async splitText(content) {
|
|
6467
|
-
if (!content.trim()) {
|
|
6468
|
-
return [];
|
|
6469
|
-
}
|
|
6470
|
-
try {
|
|
6471
|
-
const chunks = await this.textSplitter.split(content);
|
|
6472
|
-
return chunks.map((chunk) => ({
|
|
6473
|
-
types: ["text"],
|
|
6474
|
-
content: chunk,
|
|
6475
|
-
section: {
|
|
6476
|
-
level: 0,
|
|
6477
|
-
path: []
|
|
6478
|
-
}
|
|
6479
|
-
}));
|
|
6480
|
-
} catch {
|
|
6481
|
-
return [
|
|
6482
|
-
{
|
|
6483
|
-
types: ["text"],
|
|
6484
|
-
content,
|
|
6485
|
-
section: {
|
|
6486
|
-
level: 0,
|
|
6487
|
-
path: []
|
|
6488
|
-
}
|
|
6489
|
-
}
|
|
6490
|
-
];
|
|
6491
|
-
}
|
|
6492
|
-
}
|
|
6493
|
-
}
|
|
6494
6592
|
class TextPipeline extends BasePipeline {
|
|
6495
6593
|
middleware;
|
|
6496
6594
|
splitter;
|
|
@@ -8973,9 +9071,10 @@ class DocumentStore {
|
|
|
8973
9071
|
* - Single texts that are too large are truncated and retried once
|
|
8974
9072
|
*
|
|
8975
9073
|
* @param texts Array of texts to embed
|
|
9074
|
+
* @param isRetry Internal flag to prevent duplicate warning logs
|
|
8976
9075
|
* @returns Array of embedding vectors
|
|
8977
9076
|
*/
|
|
8978
|
-
async embedDocumentsWithRetry(texts) {
|
|
9077
|
+
async embedDocumentsWithRetry(texts, isRetry = false) {
|
|
8979
9078
|
if (texts.length === 0) {
|
|
8980
9079
|
return [];
|
|
8981
9080
|
}
|
|
@@ -8987,26 +9086,27 @@ class DocumentStore {
|
|
|
8987
9086
|
const midpoint = Math.floor(texts.length / 2);
|
|
8988
9087
|
const firstHalf = texts.slice(0, midpoint);
|
|
8989
9088
|
const secondHalf = texts.slice(midpoint);
|
|
8990
|
-
|
|
8991
|
-
|
|
8992
|
-
|
|
9089
|
+
if (!isRetry) {
|
|
9090
|
+
logger.warn(
|
|
9091
|
+
`⚠️ Batch of ${texts.length} texts exceeded size limit, splitting into ${firstHalf.length} + ${secondHalf.length}`
|
|
9092
|
+
);
|
|
9093
|
+
}
|
|
8993
9094
|
const [firstEmbeddings, secondEmbeddings] = await Promise.all([
|
|
8994
|
-
this.embedDocumentsWithRetry(firstHalf),
|
|
8995
|
-
this.embedDocumentsWithRetry(secondHalf)
|
|
9095
|
+
this.embedDocumentsWithRetry(firstHalf, true),
|
|
9096
|
+
this.embedDocumentsWithRetry(secondHalf, true)
|
|
8996
9097
|
]);
|
|
8997
9098
|
return [...firstEmbeddings, ...secondEmbeddings];
|
|
8998
9099
|
} else {
|
|
8999
9100
|
const text = texts[0];
|
|
9000
9101
|
const midpoint = Math.floor(text.length / 2);
|
|
9001
9102
|
const firstHalf = text.substring(0, midpoint);
|
|
9002
|
-
|
|
9003
|
-
|
|
9004
|
-
|
|
9005
|
-
try {
|
|
9006
|
-
const embedding = await this.embedDocumentsWithRetry([firstHalf]);
|
|
9007
|
-
logger.info(
|
|
9008
|
-
`✓ Using embedding from first half of split text (${firstHalf.length} chars)`
|
|
9103
|
+
if (!isRetry) {
|
|
9104
|
+
logger.warn(
|
|
9105
|
+
`⚠️ Single text exceeded embedding size limit (${text.length} chars).`
|
|
9009
9106
|
);
|
|
9107
|
+
}
|
|
9108
|
+
try {
|
|
9109
|
+
const embedding = await this.embedDocumentsWithRetry([firstHalf], true);
|
|
9010
9110
|
return embedding;
|
|
9011
9111
|
} catch (retryError) {
|
|
9012
9112
|
logger.error(
|
|
@@ -9130,8 +9230,8 @@ class DocumentStore {
|
|
|
9130
9230
|
const rowId = result2.lastInsertRowid;
|
|
9131
9231
|
if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
|
|
9132
9232
|
this.statements.insertEmbedding.run(
|
|
9133
|
-
|
|
9134
|
-
|
|
9233
|
+
JSON.stringify(paddedEmbeddings[docIndex]),
|
|
9234
|
+
BigInt(rowId)
|
|
9135
9235
|
);
|
|
9136
9236
|
}
|
|
9137
9237
|
docIndex++;
|
|
@@ -10056,6 +10156,8 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
|
|
|
10056
10156
|
const mcpServer = createMcpServerInstance(mcpTools, readOnly);
|
|
10057
10157
|
const authMiddleware = authManager ? createAuthMiddleware(authManager) : null;
|
|
10058
10158
|
const sseTransports = {};
|
|
10159
|
+
const heartbeatIntervals = {};
|
|
10160
|
+
const HEARTBEAT_INTERVAL_MS = 3e4;
|
|
10059
10161
|
server.route({
|
|
10060
10162
|
method: "GET",
|
|
10061
10163
|
url: "/sse",
|
|
@@ -10067,12 +10169,31 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
|
|
|
10067
10169
|
if (telemetry.isEnabled()) {
|
|
10068
10170
|
logger.info(`🔗 MCP client connected: ${transport.sessionId}`);
|
|
10069
10171
|
}
|
|
10070
|
-
|
|
10172
|
+
const heartbeatInterval = setInterval(() => {
|
|
10173
|
+
try {
|
|
10174
|
+
reply.raw.write(": heartbeat\n\n");
|
|
10175
|
+
} catch {
|
|
10176
|
+
clearInterval(heartbeatInterval);
|
|
10177
|
+
delete heartbeatIntervals[transport.sessionId];
|
|
10178
|
+
}
|
|
10179
|
+
}, HEARTBEAT_INTERVAL_MS);
|
|
10180
|
+
heartbeatIntervals[transport.sessionId] = heartbeatInterval;
|
|
10181
|
+
const cleanupConnection = () => {
|
|
10182
|
+
const interval = heartbeatIntervals[transport.sessionId];
|
|
10183
|
+
if (interval) {
|
|
10184
|
+
clearInterval(interval);
|
|
10185
|
+
delete heartbeatIntervals[transport.sessionId];
|
|
10186
|
+
}
|
|
10071
10187
|
delete sseTransports[transport.sessionId];
|
|
10072
10188
|
transport.close();
|
|
10073
10189
|
if (telemetry.isEnabled()) {
|
|
10074
10190
|
logger.info(`🔗 MCP client disconnected: ${transport.sessionId}`);
|
|
10075
10191
|
}
|
|
10192
|
+
};
|
|
10193
|
+
reply.raw.on("close", cleanupConnection);
|
|
10194
|
+
reply.raw.on("error", (error) => {
|
|
10195
|
+
logger.debug(`SSE connection error: ${error}`);
|
|
10196
|
+
cleanupConnection();
|
|
10076
10197
|
});
|
|
10077
10198
|
await mcpServer.connect(transport);
|
|
10078
10199
|
} catch (error) {
|
|
@@ -10114,10 +10235,15 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
|
|
|
10114
10235
|
const requestTransport = new StreamableHTTPServerTransport({
|
|
10115
10236
|
sessionIdGenerator: void 0
|
|
10116
10237
|
});
|
|
10117
|
-
|
|
10238
|
+
const cleanupRequest = () => {
|
|
10118
10239
|
logger.debug("Streamable HTTP request closed");
|
|
10119
10240
|
requestTransport.close();
|
|
10120
10241
|
requestServer.close();
|
|
10242
|
+
};
|
|
10243
|
+
reply.raw.on("close", cleanupRequest);
|
|
10244
|
+
reply.raw.on("error", (error) => {
|
|
10245
|
+
logger.debug(`Streamable HTTP connection error: ${error}`);
|
|
10246
|
+
cleanupRequest();
|
|
10121
10247
|
});
|
|
10122
10248
|
await requestServer.connect(requestTransport);
|
|
10123
10249
|
await requestTransport.handleRequest(request.raw, reply.raw, request.body);
|
|
@@ -10130,10 +10256,17 @@ async function registerMcpService(server, docService, pipeline, readOnly = false
|
|
|
10130
10256
|
}
|
|
10131
10257
|
});
|
|
10132
10258
|
mcpServer._sseTransports = sseTransports;
|
|
10259
|
+
mcpServer._heartbeatIntervals = heartbeatIntervals;
|
|
10133
10260
|
return mcpServer;
|
|
10134
10261
|
}
|
|
10135
10262
|
async function cleanupMcpService(mcpServer) {
|
|
10136
10263
|
try {
|
|
10264
|
+
const heartbeatIntervals = mcpServer._heartbeatIntervals;
|
|
10265
|
+
if (heartbeatIntervals) {
|
|
10266
|
+
for (const interval of Object.values(heartbeatIntervals)) {
|
|
10267
|
+
clearInterval(interval);
|
|
10268
|
+
}
|
|
10269
|
+
}
|
|
10137
10270
|
const sseTransports = mcpServer._sseTransports;
|
|
10138
10271
|
if (sseTransports) {
|
|
10139
10272
|
for (const transport of Object.values(sseTransports)) {
|
|
@@ -10737,7 +10870,7 @@ const Layout = ({
|
|
|
10737
10870
|
children,
|
|
10738
10871
|
eventClientConfig
|
|
10739
10872
|
}) => {
|
|
10740
|
-
const versionString = version || "1.31.1";
|
|
10873
|
+
const versionString = version || "1.33.0";
|
|
10741
10874
|
const versionInitializer = `versionUpdate({ currentVersion: ${`'${versionString}'`} })`;
|
|
10742
10875
|
return /* @__PURE__ */ jsxs("html", { lang: "en", children: [
|
|
10743
10876
|
/* @__PURE__ */ jsxs("head", { children: [
|
|
@@ -13049,7 +13182,7 @@ class AppServer {
|
|
|
13049
13182
|
try {
|
|
13050
13183
|
if (telemetry.isEnabled()) {
|
|
13051
13184
|
telemetry.setGlobalContext({
|
|
13052
|
-
appVersion: "1.31.1",
|
|
13185
|
+
appVersion: "1.33.0",
|
|
13053
13186
|
appPlatform: process.platform,
|
|
13054
13187
|
appNodeVersion: process.version,
|
|
13055
13188
|
appServicesEnabled: this.getActiveServicesList(),
|
|
@@ -16536,7 +16669,7 @@ function createCliProgram() {
|
|
|
16536
16669
|
const commandStartTimes = /* @__PURE__ */ new Map();
|
|
16537
16670
|
let globalEventBus = null;
|
|
16538
16671
|
let globalTelemetryService = null;
|
|
16539
|
-
program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version("1.31.1").addOption(
|
|
16672
|
+
program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version("1.33.0").addOption(
|
|
16540
16673
|
new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent")
|
|
16541
16674
|
).addOption(new Option("--silent", "Disable all logging except errors")).addOption(
|
|
16542
16675
|
new Option("--telemetry", "Enable telemetry collection").env("DOCS_MCP_TELEMETRY").argParser((value) => {
|
|
@@ -16570,7 +16703,7 @@ function createCliProgram() {
|
|
|
16570
16703
|
if (shouldEnableTelemetry()) {
|
|
16571
16704
|
if (telemetry.isEnabled()) {
|
|
16572
16705
|
telemetry.setGlobalContext({
|
|
16573
|
-
appVersion: "1.31.1",
|
|
16706
|
+
appVersion: "1.33.0",
|
|
16574
16707
|
appPlatform: process.platform,
|
|
16575
16708
|
appNodeVersion: process.version,
|
|
16576
16709
|
appInterface: "cli",
|