@arabold/docs-mcp-server 1.32.0 → 1.33.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/db/migrations/011-add-vector-triggers.sql +45 -0
- package/dist/index.js +310 -210
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1472,6 +1472,8 @@ const FETCHER_MAX_CACHE_ITEM_SIZE_BYTES = 500 * 1024;
|
|
|
1472
1472
|
const SPLITTER_MIN_CHUNK_SIZE = 500;
|
|
1473
1473
|
const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
|
|
1474
1474
|
const SPLITTER_MAX_CHUNK_SIZE = 5e3;
|
|
1475
|
+
const JSON_MAX_NESTING_DEPTH = 5;
|
|
1476
|
+
const JSON_MAX_CHUNKS = 1e3;
|
|
1475
1477
|
const EMBEDDING_BATCH_SIZE = 100;
|
|
1476
1478
|
const EMBEDDING_BATCH_CHARS = 5e4;
|
|
1477
1479
|
const MIGRATION_MAX_RETRIES = 5;
|
|
@@ -3044,16 +3046,203 @@ class GreedySplitter {
|
|
|
3044
3046
|
return common;
|
|
3045
3047
|
}
|
|
3046
3048
|
}
|
|
3049
|
+
/**
 * Splits plain text into chunks of at most `options.chunkSize` characters.
 *
 * Strategies are tried in order of decreasing semantic quality: paragraph
 * boundaries, then line boundaries, then word boundaries. Whitespace inside the
 * chunks is left untouched; trimming is left to higher-level splitters.
 */
class TextContentSplitter {
  /**
   * @param {{ chunkSize: number }} options - `chunkSize` is the maximum chunk
   *   length, measured with `String.prototype.length`.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Splits `content` into chunks that each fit within `options.chunkSize`.
   *
   * @param {string} content - Text to split.
   * @returns {Promise<string[]>} The chunks; `[content]` when it already fits.
   * @throws {MinimumChunkSizeError} When a single whitespace-delimited word is
   *   longer than `options.chunkSize`.
   */
  async split(content) {
    const { chunkSize } = this.options;
    if (content.length <= chunkSize) {
      return [content];
    }

    // A word that cannot fit into one chunk makes every strategy below fail.
    const longestWordLength = content
      .split(/\s+/)
      .reduce((max, word) => Math.max(max, word.length), 0);
    if (longestWordLength > chunkSize) {
      throw new MinimumChunkSizeError(longestWordLength, chunkSize);
    }

    // Paragraph chunks are returned as-is (not merged).
    const paragraphChunks = this.splitByParagraphs(content);
    if (this.areChunksValid(paragraphChunks)) {
      return paragraphChunks;
    }

    // Line chunks already carry their trailing newline, so merge without a separator.
    const lineChunks = this.splitByLines(content);
    if (this.areChunksValid(lineChunks)) {
      return this.mergeChunks(lineChunks, "");
    }

    const wordChunks = await this.splitByWords(content);
    return this.mergeChunks(wordChunks, " ");
  }

  /**
   * Checks whether every chunk is within the maximum size limit.
   *
   * @param {string[]} chunks
   * @returns {boolean}
   */
  areChunksValid(chunks) {
    return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
  }

  /**
   * Splits text at paragraph separators (a newline, optional whitespace, a newline).
   * Each chunk keeps the separator that terminates it.
   *
   * Chunks of one or two characters are dropped only when they are pure
   * whitespace; a short final paragraph with real content (for example "OK")
   * is kept instead of being silently discarded.
   *
   * @param {string} text
   * @returns {string[]}
   */
  splitByParagraphs(text) {
    const isWorthKeeping = (chunk) => chunk.length > 2 || chunk.trim().length > 0;
    const chunks = [];
    let startPos = 0;

    for (const match of text.matchAll(/\n\s*\n/g)) {
      const endPos = match.index + match[0].length;
      const chunk = text.slice(startPos, endPos);
      if (isWorthKeeping(chunk)) {
        chunks.push(chunk);
      }
      startPos = endPos;
    }

    if (startPos < text.length) {
      const remainingChunk = text.slice(startPos);
      if (isWorthKeeping(remainingChunk)) {
        chunks.push(remainingChunk);
      }
    }
    return chunks;
  }

  /**
   * Splits text after every "\n". Each chunk keeps its trailing newline; text
   * after the last newline becomes a final chunk.
   *
   * @param {string} text
   * @returns {string[]}
   */
  splitByLines(text) {
    const chunks = [];
    let startPos = 0;
    let newlinePos = text.indexOf("\n");

    while (newlinePos !== -1) {
      chunks.push(text.slice(startPos, newlinePos + 1));
      startPos = newlinePos + 1;
      newlinePos = text.indexOf("\n", startPos);
    }

    if (startPos < text.length) {
      chunks.push(text.slice(startPos));
    }
    return chunks;
  }

  /**
   * Last-resort split that delegates to `RecursiveCharacterTextSplitter`
   * configured with this splitter's chunk size and no overlap.
   *
   * @param {string} text
   * @returns {Promise<string[]>}
   */
  async splitByWords(text) {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: this.options.chunkSize,
      chunkOverlap: 0
    });
    return splitter.splitText(text);
  }

  /**
   * Greedily joins consecutive chunks with `separator` as long as the combined
   * length stays within `options.chunkSize`, to reduce fragmentation.
   *
   * @param {string[]} chunks
   * @param {string} separator - Inserted between merged chunks.
   * @returns {string[]}
   */
  mergeChunks(chunks, separator) {
    const mergedChunks = [];
    let currentChunk = null;

    for (const chunk of chunks) {
      if (currentChunk === null) {
        currentChunk = chunk;
        continue;
      }
      const combinedSize =
        this.getChunkSize(currentChunk) + this.getChunkSize(chunk) + separator.length;
      if (combinedSize <= this.options.chunkSize) {
        currentChunk = `${currentChunk}${separator}${chunk}`;
      } else {
        mergedChunks.push(currentChunk);
        currentChunk = chunk;
      }
    }

    // A null or empty pending chunk is not emitted.
    if (currentChunk) {
      mergedChunks.push(currentChunk);
    }
    return mergedChunks;
  }

  /**
   * @param {string} chunk
   * @returns {number} The chunk's length.
   */
  getChunkSize(chunk) {
    return chunk.length;
  }

  /**
   * Returns `content` unchanged.
   *
   * @param {string} content
   * @returns {string}
   */
  wrap(content) {
    return content;
  }
}
|
|
3175
|
+
/**
 * Document-level splitter for plain text. Delegates to `TextContentSplitter`
 * and wraps each resulting string in a chunk object of type "text" with an
 * empty section path at level 0.
 */
class TextDocumentSplitter {
  options;
  textSplitter;

  /**
   * @param {{ maxChunkSize?: number }} [options] - `maxChunkSize` defaults to
   *   `SPLITTER_MAX_CHUNK_SIZE`.
   */
  constructor(options = {}) {
    this.options = {
      maxChunkSize: options.maxChunkSize ?? SPLITTER_MAX_CHUNK_SIZE
    };
    this.textSplitter = new TextContentSplitter({
      chunkSize: this.options.maxChunkSize
    });
  }

  /**
   * Splits `content` into text chunks.
   *
   * If the semantic splitter throws, the content is cut into fixed-size pieces
   * instead. Errors other than `MinimumChunkSizeError` are reported with
   * `console.warn` before falling back.
   *
   * @param {string} content - Text to split.
   * @returns {Promise<Array<{ types: string[], content: string, section: { level: number, path: string[] } }>>}
   *   An empty array when `content` is blank.
   */
  async splitText(content) {
    if (!content.trim()) {
      return [];
    }
    try {
      const pieces = await this.textSplitter.split(content);
      return pieces.map((piece) => this.#toChunk(piece));
    } catch (error) {
      if (!(error instanceof MinimumChunkSizeError) && error instanceof Error) {
        console.warn(
          `Unexpected text splitting error: ${error.message}. Forcing character-based split.`
        );
      }
      return this.#splitByCharacters(content).map((piece) => this.#toChunk(piece));
    }
  }

  /** Wraps a string in the chunk shape returned by `splitText`. */
  #toChunk(content) {
    return {
      types: ["text"],
      content,
      section: {
        level: 0,
        path: []
      }
    };
  }

  /**
   * Cuts `content` into consecutive pieces of `maxChunkSize` UTF-16 code units.
   *
   * The step is clamped to at least 1 so a zero, negative or NaN `maxChunkSize`
   * cannot stall the loop, and a cut is never placed between the two halves of
   * a surrogate pair.
   */
  #splitByCharacters(content) {
    const requestedSize = Math.floor(this.options.maxChunkSize);
    const step = requestedSize >= 1 ? requestedSize : 1; // NaN fails the comparison too
    const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
    const isLowSurrogate = (code) => code >= 0xdc00 && code <= 0xdfff;

    const pieces = [];
    let offset = 0;
    while (offset < content.length) {
      let end = Math.min(offset + step, content.length);
      const cutsSurrogatePair =
        end < content.length &&
        isHighSurrogate(content.charCodeAt(end - 1)) &&
        isLowSurrogate(content.charCodeAt(end));
      if (cutsSurrogatePair) {
        // Prefer a shorter piece; with a one-unit piece the pair must be kept whole.
        end += end - offset > 1 ? -1 : 1;
      }
      pieces.push(content.slice(offset, end));
      offset = end;
    }
    return pieces;
  }
}
|
|
3047
3227
|
class JsonDocumentSplitter {
|
|
3048
3228
|
preserveFormatting;
|
|
3229
|
+
maxDepth;
|
|
3230
|
+
maxChunks;
|
|
3231
|
+
textFallbackSplitter;
|
|
3049
3232
|
constructor(options = {}) {
|
|
3050
3233
|
this.preserveFormatting = options.preserveFormatting ?? true;
|
|
3234
|
+
this.maxDepth = options.maxDepth ?? JSON_MAX_NESTING_DEPTH;
|
|
3235
|
+
this.maxChunks = options.maxChunks ?? JSON_MAX_CHUNKS;
|
|
3236
|
+
this.textFallbackSplitter = new TextDocumentSplitter();
|
|
3051
3237
|
}
|
|
3052
3238
|
async splitText(content, _contentType) {
|
|
3053
3239
|
try {
|
|
3054
3240
|
const parsed = JSON.parse(content);
|
|
3055
3241
|
const chunks = [];
|
|
3056
|
-
this.processValue(parsed, ["root"], 1, 0, chunks, true);
|
|
3242
|
+
await this.processValue(parsed, ["root"], 1, 0, chunks, true);
|
|
3243
|
+
if (chunks.length > this.maxChunks) {
|
|
3244
|
+
return this.textFallbackSplitter.splitText(content);
|
|
3245
|
+
}
|
|
3057
3246
|
return chunks;
|
|
3058
3247
|
} catch {
|
|
3059
3248
|
return [
|
|
@@ -3068,16 +3257,20 @@ class JsonDocumentSplitter {
|
|
|
3068
3257
|
];
|
|
3069
3258
|
}
|
|
3070
3259
|
}
|
|
3071
|
-
processValue(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3260
|
+
async processValue(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3261
|
+
if (level > this.maxDepth) {
|
|
3262
|
+
await this.processValueAsText(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3263
|
+
return;
|
|
3264
|
+
}
|
|
3072
3265
|
if (Array.isArray(value)) {
|
|
3073
|
-
this.processArray(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3266
|
+
await this.processArray(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3074
3267
|
} else if (value !== null && typeof value === "object") {
|
|
3075
|
-
this.processObject(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3268
|
+
await this.processObject(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3076
3269
|
} else {
|
|
3077
|
-
this.processPrimitive(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3270
|
+
await this.processPrimitive(value, path2, level, indentLevel, chunks, isLastItem);
|
|
3078
3271
|
}
|
|
3079
3272
|
}
|
|
3080
|
-
processArray(array, path2, level, indentLevel, chunks, isLastItem) {
|
|
3273
|
+
async processArray(array, path2, level, indentLevel, chunks, isLastItem) {
|
|
3081
3274
|
const indent = this.getIndent(indentLevel);
|
|
3082
3275
|
const comma = isLastItem ? "" : ",";
|
|
3083
3276
|
chunks.push({
|
|
@@ -3085,18 +3278,19 @@ class JsonDocumentSplitter {
|
|
|
3085
3278
|
content: `${indent}[`,
|
|
3086
3279
|
section: { level, path: [...path2] }
|
|
3087
3280
|
});
|
|
3088
|
-
array.
|
|
3281
|
+
for (let index = 0; index < array.length; index++) {
|
|
3282
|
+
const item = array[index];
|
|
3089
3283
|
const isLast = index === array.length - 1;
|
|
3090
3284
|
const itemPath = [...path2, `[${index}]`];
|
|
3091
|
-
this.processValue(item, itemPath, level + 1, indentLevel + 1, chunks, isLast);
|
|
3092
|
-
}
|
|
3285
|
+
await this.processValue(item, itemPath, level + 1, indentLevel + 1, chunks, isLast);
|
|
3286
|
+
}
|
|
3093
3287
|
chunks.push({
|
|
3094
3288
|
types: ["code"],
|
|
3095
3289
|
content: `${indent}]${comma}`,
|
|
3096
3290
|
section: { level, path: [...path2] }
|
|
3097
3291
|
});
|
|
3098
3292
|
}
|
|
3099
|
-
processObject(obj, path2, level, indentLevel, chunks, isLastItem) {
|
|
3293
|
+
async processObject(obj, path2, level, indentLevel, chunks, isLastItem) {
|
|
3100
3294
|
const indent = this.getIndent(indentLevel);
|
|
3101
3295
|
const comma = isLastItem ? "" : ",";
|
|
3102
3296
|
const entries = Object.entries(obj);
|
|
@@ -3105,10 +3299,11 @@ class JsonDocumentSplitter {
|
|
|
3105
3299
|
content: `${indent}{`,
|
|
3106
3300
|
section: { level, path: [...path2] }
|
|
3107
3301
|
});
|
|
3108
|
-
|
|
3302
|
+
for (let index = 0; index < entries.length; index++) {
|
|
3303
|
+
const [key, value] = entries[index];
|
|
3109
3304
|
const isLast = index === entries.length - 1;
|
|
3110
3305
|
const propertyPath = [...path2, key];
|
|
3111
|
-
this.processProperty(
|
|
3306
|
+
await this.processProperty(
|
|
3112
3307
|
key,
|
|
3113
3308
|
value,
|
|
3114
3309
|
propertyPath,
|
|
@@ -3117,14 +3312,14 @@ class JsonDocumentSplitter {
|
|
|
3117
3312
|
chunks,
|
|
3118
3313
|
isLast
|
|
3119
3314
|
);
|
|
3120
|
-
}
|
|
3315
|
+
}
|
|
3121
3316
|
chunks.push({
|
|
3122
3317
|
types: ["code"],
|
|
3123
3318
|
content: `${indent}}${comma}`,
|
|
3124
3319
|
section: { level, path: [...path2] }
|
|
3125
3320
|
});
|
|
3126
3321
|
}
|
|
3127
|
-
processProperty(key, value, path2, level, indentLevel, chunks, isLastProperty) {
|
|
3322
|
+
async processProperty(key, value, path2, level, indentLevel, chunks, isLastProperty) {
|
|
3128
3323
|
const indent = this.getIndent(indentLevel);
|
|
3129
3324
|
if (typeof value === "object" && value !== null) {
|
|
3130
3325
|
chunks.push({
|
|
@@ -3132,30 +3327,98 @@ class JsonDocumentSplitter {
|
|
|
3132
3327
|
content: `${indent}"${key}": `,
|
|
3133
3328
|
section: { level, path: path2 }
|
|
3134
3329
|
});
|
|
3135
|
-
this.processValue(value, path2, level, indentLevel, chunks, isLastProperty);
|
|
3330
|
+
await this.processValue(value, path2, level, indentLevel, chunks, isLastProperty);
|
|
3136
3331
|
} else {
|
|
3137
3332
|
const comma = isLastProperty ? "" : ",";
|
|
3138
3333
|
const formattedValue = JSON.stringify(value);
|
|
3139
|
-
|
|
3140
|
-
|
|
3141
|
-
|
|
3142
|
-
|
|
3143
|
-
|
|
3334
|
+
const fullContent = `${indent}"${key}": ${formattedValue}${comma}`;
|
|
3335
|
+
if (fullContent.length > SPLITTER_MAX_CHUNK_SIZE) {
|
|
3336
|
+
const textChunks = await this.textFallbackSplitter.splitText(formattedValue);
|
|
3337
|
+
chunks.push({
|
|
3338
|
+
types: ["code"],
|
|
3339
|
+
content: `${indent}"${key}": `,
|
|
3340
|
+
section: { level, path: path2 }
|
|
3341
|
+
});
|
|
3342
|
+
textChunks.forEach((textChunk, index) => {
|
|
3343
|
+
const isLastChunk = index === textChunks.length - 1;
|
|
3344
|
+
const content = `${textChunk.content}${isLastChunk ? comma : ""}`;
|
|
3345
|
+
chunks.push({
|
|
3346
|
+
types: ["code"],
|
|
3347
|
+
content,
|
|
3348
|
+
section: { level, path: path2 }
|
|
3349
|
+
});
|
|
3350
|
+
});
|
|
3351
|
+
} else {
|
|
3352
|
+
chunks.push({
|
|
3353
|
+
types: ["code"],
|
|
3354
|
+
content: fullContent,
|
|
3355
|
+
section: { level, path: path2 }
|
|
3356
|
+
});
|
|
3357
|
+
}
|
|
3144
3358
|
}
|
|
3145
3359
|
}
|
|
3146
|
-
processPrimitive(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3360
|
+
async processPrimitive(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3147
3361
|
const indent = this.getIndent(indentLevel);
|
|
3148
3362
|
const comma = isLastItem ? "" : ",";
|
|
3149
3363
|
const formattedValue = JSON.stringify(value);
|
|
3150
|
-
|
|
3151
|
-
|
|
3152
|
-
|
|
3153
|
-
|
|
3154
|
-
|
|
3364
|
+
const fullContent = `${indent}${formattedValue}${comma}`;
|
|
3365
|
+
if (fullContent.length > SPLITTER_MAX_CHUNK_SIZE) {
|
|
3366
|
+
const textChunks = await this.textFallbackSplitter.splitText(formattedValue);
|
|
3367
|
+
textChunks.forEach((textChunk, index) => {
|
|
3368
|
+
const isFirstChunk = index === 0;
|
|
3369
|
+
const isLastChunk = index === textChunks.length - 1;
|
|
3370
|
+
const valueContent = isFirstChunk ? `${indent}${textChunk.content}` : textChunk.content;
|
|
3371
|
+
const content = `${valueContent}${isLastChunk ? comma : ""}`;
|
|
3372
|
+
chunks.push({
|
|
3373
|
+
types: ["code"],
|
|
3374
|
+
content,
|
|
3375
|
+
section: { level, path: [...path2] }
|
|
3376
|
+
});
|
|
3377
|
+
});
|
|
3378
|
+
} else {
|
|
3379
|
+
chunks.push({
|
|
3380
|
+
types: ["code"],
|
|
3381
|
+
content: fullContent,
|
|
3382
|
+
section: { level, path: path2 }
|
|
3383
|
+
});
|
|
3384
|
+
}
|
|
3155
3385
|
}
|
|
3156
3386
|
getIndent(level) {
|
|
3157
3387
|
return this.preserveFormatting ? " ".repeat(level) : "";
|
|
3158
3388
|
}
|
|
3389
|
+
/**
|
|
3390
|
+
* Process a value that has exceeded the maximum depth limit by serializing it as text.
|
|
3391
|
+
* This prevents excessive chunking of deeply nested structures.
|
|
3392
|
+
* If the serialized value is too large, splits it using the text fallback splitter.
|
|
3393
|
+
*/
|
|
3394
|
+
async processValueAsText(value, path2, level, indentLevel, chunks, isLastItem) {
|
|
3395
|
+
const indent = this.getIndent(indentLevel);
|
|
3396
|
+
const comma = isLastItem ? "" : ",";
|
|
3397
|
+
let serialized;
|
|
3398
|
+
if (this.preserveFormatting) {
|
|
3399
|
+
const lines = JSON.stringify(value, null, 2).split("\n");
|
|
3400
|
+
serialized = lines.map((line, idx) => idx === 0 ? line : `${indent}${line}`).join("\n");
|
|
3401
|
+
} else {
|
|
3402
|
+
serialized = JSON.stringify(value);
|
|
3403
|
+
}
|
|
3404
|
+
const fullContent = `${indent}${serialized}${comma}`;
|
|
3405
|
+
if (fullContent.length > SPLITTER_MAX_CHUNK_SIZE) {
|
|
3406
|
+
const textChunks = await this.textFallbackSplitter.splitText(serialized);
|
|
3407
|
+
for (const textChunk of textChunks) {
|
|
3408
|
+
chunks.push({
|
|
3409
|
+
types: ["code"],
|
|
3410
|
+
content: textChunk.content,
|
|
3411
|
+
section: { level, path: [...path2] }
|
|
3412
|
+
});
|
|
3413
|
+
}
|
|
3414
|
+
} else {
|
|
3415
|
+
chunks.push({
|
|
3416
|
+
types: ["code"],
|
|
3417
|
+
content: fullContent,
|
|
3418
|
+
section: { level, path: [...path2] }
|
|
3419
|
+
});
|
|
3420
|
+
}
|
|
3421
|
+
}
|
|
3159
3422
|
}
|
|
3160
3423
|
class CodeContentSplitter {
|
|
3161
3424
|
constructor(options) {
|
|
@@ -3255,132 +3518,6 @@ class TableContentSplitter {
|
|
|
3255
3518
|
return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
|
|
3256
3519
|
}
|
|
3257
3520
|
}
|
|
3258
|
-
class TextContentSplitter {
|
|
3259
|
-
constructor(options) {
|
|
3260
|
-
this.options = options;
|
|
3261
|
-
}
|
|
3262
|
-
/**
|
|
3263
|
-
* Splits text content into chunks while trying to preserve semantic boundaries.
|
|
3264
|
-
* Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
|
|
3265
|
-
* Always preserves formatting - trimming should be done by higher-level splitters if needed.
|
|
3266
|
-
*/
|
|
3267
|
-
async split(content) {
|
|
3268
|
-
if (content.length <= this.options.chunkSize) {
|
|
3269
|
-
return [content];
|
|
3270
|
-
}
|
|
3271
|
-
const words = content.split(/\s+/);
|
|
3272
|
-
const longestWord = words.reduce(
|
|
3273
|
-
(max, word) => word.length > max.length ? word : max
|
|
3274
|
-
);
|
|
3275
|
-
if (longestWord.length > this.options.chunkSize) {
|
|
3276
|
-
throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
|
|
3277
|
-
}
|
|
3278
|
-
const paragraphChunks = this.splitByParagraphs(content);
|
|
3279
|
-
if (this.areChunksValid(paragraphChunks)) {
|
|
3280
|
-
return paragraphChunks;
|
|
3281
|
-
}
|
|
3282
|
-
const lineChunks = this.splitByLines(content);
|
|
3283
|
-
if (this.areChunksValid(lineChunks)) {
|
|
3284
|
-
return this.mergeChunks(lineChunks, "");
|
|
3285
|
-
}
|
|
3286
|
-
const wordChunks = await this.splitByWords(content);
|
|
3287
|
-
return this.mergeChunks(wordChunks, " ");
|
|
3288
|
-
}
|
|
3289
|
-
/**
|
|
3290
|
-
* Checks if all chunks are within the maximum size limit
|
|
3291
|
-
*/
|
|
3292
|
-
areChunksValid(chunks) {
|
|
3293
|
-
return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
|
|
3294
|
-
}
|
|
3295
|
-
/**
|
|
3296
|
-
* Splits text into chunks by paragraph boundaries (double newlines)
|
|
3297
|
-
* Preserves all formatting and whitespace including the paragraph separators
|
|
3298
|
-
*/
|
|
3299
|
-
splitByParagraphs(text) {
|
|
3300
|
-
const chunks = [];
|
|
3301
|
-
let startPos = 0;
|
|
3302
|
-
const paragraphRegex = /\n\s*\n/g;
|
|
3303
|
-
let match = paragraphRegex.exec(text);
|
|
3304
|
-
while (match !== null) {
|
|
3305
|
-
const endPos = match.index + match[0].length;
|
|
3306
|
-
const chunk = text.slice(startPos, endPos);
|
|
3307
|
-
if (chunk.length > 2) {
|
|
3308
|
-
chunks.push(chunk);
|
|
3309
|
-
}
|
|
3310
|
-
startPos = endPos;
|
|
3311
|
-
match = paragraphRegex.exec(text);
|
|
3312
|
-
}
|
|
3313
|
-
if (startPos < text.length) {
|
|
3314
|
-
const remainingChunk = text.slice(startPos);
|
|
3315
|
-
if (remainingChunk.length > 2) {
|
|
3316
|
-
chunks.push(remainingChunk);
|
|
3317
|
-
}
|
|
3318
|
-
}
|
|
3319
|
-
return chunks.filter(Boolean);
|
|
3320
|
-
}
|
|
3321
|
-
/**
|
|
3322
|
-
* Splits text into chunks by line boundaries
|
|
3323
|
-
* Preserves all formatting and whitespace, including newlines at the end of each line
|
|
3324
|
-
*/
|
|
3325
|
-
splitByLines(text) {
|
|
3326
|
-
const chunks = [];
|
|
3327
|
-
let startPos = 0;
|
|
3328
|
-
for (let i = 0; i < text.length; i++) {
|
|
3329
|
-
if (text[i] === "\n") {
|
|
3330
|
-
const chunk = text.slice(startPos, i + 1);
|
|
3331
|
-
chunks.push(chunk);
|
|
3332
|
-
startPos = i + 1;
|
|
3333
|
-
}
|
|
3334
|
-
}
|
|
3335
|
-
if (startPos < text.length) {
|
|
3336
|
-
chunks.push(text.slice(startPos));
|
|
3337
|
-
}
|
|
3338
|
-
return chunks;
|
|
3339
|
-
}
|
|
3340
|
-
/**
|
|
3341
|
-
* Uses LangChain's recursive splitter for word-based splitting as a last resort
|
|
3342
|
-
*/
|
|
3343
|
-
async splitByWords(text) {
|
|
3344
|
-
const splitter = new RecursiveCharacterTextSplitter({
|
|
3345
|
-
chunkSize: this.options.chunkSize,
|
|
3346
|
-
chunkOverlap: 0
|
|
3347
|
-
});
|
|
3348
|
-
const chunks = await splitter.splitText(text);
|
|
3349
|
-
return chunks;
|
|
3350
|
-
}
|
|
3351
|
-
/**
|
|
3352
|
-
* Attempts to merge small chunks with previous chunks to minimize fragmentation.
|
|
3353
|
-
* Only merges if combined size is within maxChunkSize.
|
|
3354
|
-
*/
|
|
3355
|
-
mergeChunks(chunks, separator) {
|
|
3356
|
-
const mergedChunks = [];
|
|
3357
|
-
let currentChunk = null;
|
|
3358
|
-
for (const chunk of chunks) {
|
|
3359
|
-
if (currentChunk === null) {
|
|
3360
|
-
currentChunk = chunk;
|
|
3361
|
-
continue;
|
|
3362
|
-
}
|
|
3363
|
-
const currentChunkSize = this.getChunkSize(currentChunk);
|
|
3364
|
-
const nextChunkSize = this.getChunkSize(chunk);
|
|
3365
|
-
if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
|
|
3366
|
-
currentChunk = `${currentChunk}${separator}${chunk}`;
|
|
3367
|
-
} else {
|
|
3368
|
-
mergedChunks.push(currentChunk);
|
|
3369
|
-
currentChunk = chunk;
|
|
3370
|
-
}
|
|
3371
|
-
}
|
|
3372
|
-
if (currentChunk) {
|
|
3373
|
-
mergedChunks.push(currentChunk);
|
|
3374
|
-
}
|
|
3375
|
-
return mergedChunks;
|
|
3376
|
-
}
|
|
3377
|
-
getChunkSize(chunk) {
|
|
3378
|
-
return chunk.length;
|
|
3379
|
-
}
|
|
3380
|
-
wrap(content) {
|
|
3381
|
-
return content;
|
|
3382
|
-
}
|
|
3383
|
-
}
|
|
3384
3521
|
class SemanticMarkdownSplitter {
|
|
3385
3522
|
constructor(preferredChunkSize, maxChunkSize) {
|
|
3386
3523
|
this.preferredChunkSize = preferredChunkSize;
|
|
@@ -6452,45 +6589,6 @@ class MarkdownPipeline extends BasePipeline {
|
|
|
6452
6589
|
};
|
|
6453
6590
|
}
|
|
6454
6591
|
}
|
|
6455
|
-
class TextDocumentSplitter {
|
|
6456
|
-
options;
|
|
6457
|
-
textSplitter;
|
|
6458
|
-
constructor(options = {}) {
|
|
6459
|
-
this.options = {
|
|
6460
|
-
maxChunkSize: options.maxChunkSize ?? SPLITTER_MAX_CHUNK_SIZE
|
|
6461
|
-
};
|
|
6462
|
-
this.textSplitter = new TextContentSplitter({
|
|
6463
|
-
chunkSize: this.options.maxChunkSize
|
|
6464
|
-
});
|
|
6465
|
-
}
|
|
6466
|
-
async splitText(content) {
|
|
6467
|
-
if (!content.trim()) {
|
|
6468
|
-
return [];
|
|
6469
|
-
}
|
|
6470
|
-
try {
|
|
6471
|
-
const chunks = await this.textSplitter.split(content);
|
|
6472
|
-
return chunks.map((chunk) => ({
|
|
6473
|
-
types: ["text"],
|
|
6474
|
-
content: chunk,
|
|
6475
|
-
section: {
|
|
6476
|
-
level: 0,
|
|
6477
|
-
path: []
|
|
6478
|
-
}
|
|
6479
|
-
}));
|
|
6480
|
-
} catch {
|
|
6481
|
-
return [
|
|
6482
|
-
{
|
|
6483
|
-
types: ["text"],
|
|
6484
|
-
content,
|
|
6485
|
-
section: {
|
|
6486
|
-
level: 0,
|
|
6487
|
-
path: []
|
|
6488
|
-
}
|
|
6489
|
-
}
|
|
6490
|
-
];
|
|
6491
|
-
}
|
|
6492
|
-
}
|
|
6493
|
-
}
|
|
6494
6592
|
class TextPipeline extends BasePipeline {
|
|
6495
6593
|
middleware;
|
|
6496
6594
|
splitter;
|
|
@@ -8973,9 +9071,10 @@ class DocumentStore {
|
|
|
8973
9071
|
* - Single texts that are too large are truncated and retried once
|
|
8974
9072
|
*
|
|
8975
9073
|
* @param texts Array of texts to embed
|
|
9074
|
+
* @param isRetry Internal flag to prevent duplicate warning logs
|
|
8976
9075
|
* @returns Array of embedding vectors
|
|
8977
9076
|
*/
|
|
8978
|
-
async embedDocumentsWithRetry(texts) {
|
|
9077
|
+
async embedDocumentsWithRetry(texts, isRetry = false) {
|
|
8979
9078
|
if (texts.length === 0) {
|
|
8980
9079
|
return [];
|
|
8981
9080
|
}
|
|
@@ -8987,26 +9086,27 @@ class DocumentStore {
|
|
|
8987
9086
|
const midpoint = Math.floor(texts.length / 2);
|
|
8988
9087
|
const firstHalf = texts.slice(0, midpoint);
|
|
8989
9088
|
const secondHalf = texts.slice(midpoint);
|
|
8990
|
-
|
|
8991
|
-
|
|
8992
|
-
|
|
9089
|
+
if (!isRetry) {
|
|
9090
|
+
logger.warn(
|
|
9091
|
+
`⚠️ Batch of ${texts.length} texts exceeded size limit, splitting into ${firstHalf.length} + ${secondHalf.length}`
|
|
9092
|
+
);
|
|
9093
|
+
}
|
|
8993
9094
|
const [firstEmbeddings, secondEmbeddings] = await Promise.all([
|
|
8994
|
-
this.embedDocumentsWithRetry(firstHalf),
|
|
8995
|
-
this.embedDocumentsWithRetry(secondHalf)
|
|
9095
|
+
this.embedDocumentsWithRetry(firstHalf, true),
|
|
9096
|
+
this.embedDocumentsWithRetry(secondHalf, true)
|
|
8996
9097
|
]);
|
|
8997
9098
|
return [...firstEmbeddings, ...secondEmbeddings];
|
|
8998
9099
|
} else {
|
|
8999
9100
|
const text = texts[0];
|
|
9000
9101
|
const midpoint = Math.floor(text.length / 2);
|
|
9001
9102
|
const firstHalf = text.substring(0, midpoint);
|
|
9002
|
-
|
|
9003
|
-
|
|
9004
|
-
|
|
9005
|
-
try {
|
|
9006
|
-
const embedding = await this.embedDocumentsWithRetry([firstHalf]);
|
|
9007
|
-
logger.info(
|
|
9008
|
-
`✓ Using embedding from first half of split text (${firstHalf.length} chars)`
|
|
9103
|
+
if (!isRetry) {
|
|
9104
|
+
logger.warn(
|
|
9105
|
+
`⚠️ Single text exceeded embedding size limit (${text.length} chars).`
|
|
9009
9106
|
);
|
|
9107
|
+
}
|
|
9108
|
+
try {
|
|
9109
|
+
const embedding = await this.embedDocumentsWithRetry([firstHalf], true);
|
|
9010
9110
|
return embedding;
|
|
9011
9111
|
} catch (retryError) {
|
|
9012
9112
|
logger.error(
|
|
@@ -9130,8 +9230,8 @@ class DocumentStore {
|
|
|
9130
9230
|
const rowId = result2.lastInsertRowid;
|
|
9131
9231
|
if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) {
|
|
9132
9232
|
this.statements.insertEmbedding.run(
|
|
9133
|
-
|
|
9134
|
-
|
|
9233
|
+
JSON.stringify(paddedEmbeddings[docIndex]),
|
|
9234
|
+
BigInt(rowId)
|
|
9135
9235
|
);
|
|
9136
9236
|
}
|
|
9137
9237
|
docIndex++;
|
|
@@ -10770,7 +10870,7 @@ const Layout = ({
|
|
|
10770
10870
|
children,
|
|
10771
10871
|
eventClientConfig
|
|
10772
10872
|
}) => {
|
|
10773
|
-
const versionString = version || "1.
|
|
10873
|
+
const versionString = version || "1.33.0";
|
|
10774
10874
|
const versionInitializer = `versionUpdate({ currentVersion: ${`'${versionString}'`} })`;
|
|
10775
10875
|
return /* @__PURE__ */ jsxs("html", { lang: "en", children: [
|
|
10776
10876
|
/* @__PURE__ */ jsxs("head", { children: [
|
|
@@ -13082,7 +13182,7 @@ class AppServer {
|
|
|
13082
13182
|
try {
|
|
13083
13183
|
if (telemetry.isEnabled()) {
|
|
13084
13184
|
telemetry.setGlobalContext({
|
|
13085
|
-
appVersion: "1.
|
|
13185
|
+
appVersion: "1.33.0",
|
|
13086
13186
|
appPlatform: process.platform,
|
|
13087
13187
|
appNodeVersion: process.version,
|
|
13088
13188
|
appServicesEnabled: this.getActiveServicesList(),
|
|
@@ -16569,7 +16669,7 @@ function createCliProgram() {
|
|
|
16569
16669
|
const commandStartTimes = /* @__PURE__ */ new Map();
|
|
16570
16670
|
let globalEventBus = null;
|
|
16571
16671
|
let globalTelemetryService = null;
|
|
16572
|
-
program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version("1.
|
|
16672
|
+
program.name("docs-mcp-server").description("Unified CLI, MCP Server, and Web Interface for Docs MCP Server.").version("1.33.0").addOption(
|
|
16573
16673
|
new Option("--verbose", "Enable verbose (debug) logging").conflicts("silent")
|
|
16574
16674
|
).addOption(new Option("--silent", "Disable all logging except errors")).addOption(
|
|
16575
16675
|
new Option("--telemetry", "Enable telemetry collection").env("DOCS_MCP_TELEMETRY").argParser((value) => {
|
|
@@ -16603,7 +16703,7 @@ function createCliProgram() {
|
|
|
16603
16703
|
if (shouldEnableTelemetry()) {
|
|
16604
16704
|
if (telemetry.isEnabled()) {
|
|
16605
16705
|
telemetry.setGlobalContext({
|
|
16606
|
-
appVersion: "1.
|
|
16706
|
+
appVersion: "1.33.0",
|
|
16607
16707
|
appPlatform: process.platform,
|
|
16608
16708
|
appNodeVersion: process.version,
|
|
16609
16709
|
appInterface: "cli",
|