@arabold/docs-mcp-server 1.12.4 → 1.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{DocumentManagementService-BupnR1eC.js → DocumentManagementService-BGW9iWNn.js} +281 -209
- package/dist/DocumentManagementService-BGW9iWNn.js.map +1 -0
- package/dist/{EmbeddingFactory-DZKXkqOe.js → EmbeddingFactory-0Z5e_g1J.js} +4 -3
- package/dist/EmbeddingFactory-0Z5e_g1J.js.map +1 -0
- package/dist/{FindVersionTool-BcnLvjlo.js → FindVersionTool-DhhmoGU7.js} +34 -64
- package/dist/FindVersionTool-DhhmoGU7.js.map +1 -0
- package/dist/{RemoveTool-Bqpr8F9m.js → RemoveTool-BZPTXvhj.js} +2 -2
- package/dist/{RemoveTool-Bqpr8F9m.js.map → RemoveTool-BZPTXvhj.js.map} +1 -1
- package/dist/cli.js +3 -3
- package/dist/server.js +4 -3
- package/dist/server.js.map +1 -1
- package/dist/web.js +3 -2
- package/dist/web.js.map +1 -1
- package/package.json +1 -1
- package/dist/DocumentManagementService-BupnR1eC.js.map +0 -1
- package/dist/EmbeddingFactory-DZKXkqOe.js.map +0 -1
- package/dist/FindVersionTool-BcnLvjlo.js.map +0 -1
package/dist/{DocumentManagementService-BupnR1eC.js → DocumentManagementService-BGW9iWNn.js}
RENAMED
|
@@ -10,6 +10,7 @@ import { VirtualConsole, JSDOM } from "jsdom";
|
|
|
10
10
|
import { chromium } from "playwright";
|
|
11
11
|
import { gfm } from "@joplin/turndown-plugin-gfm";
|
|
12
12
|
import TurndownService from "turndown";
|
|
13
|
+
import { TextDecoder } from "node:util";
|
|
13
14
|
import { URL as URL$1, fileURLToPath } from "node:url";
|
|
14
15
|
import * as semver from "semver";
|
|
15
16
|
import semver__default from "semver";
|
|
@@ -168,6 +169,49 @@ const FETCHER_BASE_DELAY = 1e3;
|
|
|
168
169
|
const SPLITTER_MIN_CHUNK_SIZE = 500;
|
|
169
170
|
const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
|
|
170
171
|
const SPLITTER_MAX_CHUNK_SIZE = 5e3;
|
|
172
|
+
const EMBEDDING_BATCH_SIZE = 300;
|
|
173
|
+
class MimeTypeUtils {
|
|
174
|
+
/**
|
|
175
|
+
* Parses a Content-Type header string into its MIME type and charset.
|
|
176
|
+
* @param contentTypeHeader The Content-Type header string (e.g., "text/html; charset=utf-8").
|
|
177
|
+
* @returns A ParsedContentType object, or a default if parsing fails.
|
|
178
|
+
*/
|
|
179
|
+
static parseContentType(contentTypeHeader) {
|
|
180
|
+
if (!contentTypeHeader) {
|
|
181
|
+
return { mimeType: "application/octet-stream" };
|
|
182
|
+
}
|
|
183
|
+
const parts = contentTypeHeader.split(";").map((part) => part.trim());
|
|
184
|
+
const mimeType = parts[0].toLowerCase();
|
|
185
|
+
let charset;
|
|
186
|
+
for (let i = 1; i < parts.length; i++) {
|
|
187
|
+
const param = parts[i];
|
|
188
|
+
if (param.toLowerCase().startsWith("charset=")) {
|
|
189
|
+
charset = param.substring("charset=".length).toLowerCase();
|
|
190
|
+
break;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
return { mimeType, charset };
|
|
194
|
+
}
|
|
195
|
+
/**
|
|
196
|
+
* Checks if a MIME type represents HTML content.
|
|
197
|
+
*/
|
|
198
|
+
static isHtml(mimeType) {
|
|
199
|
+
return mimeType === "text/html" || mimeType === "application/xhtml+xml";
|
|
200
|
+
}
|
|
201
|
+
/**
|
|
202
|
+
* Checks if a MIME type represents Markdown content.
|
|
203
|
+
*/
|
|
204
|
+
static isMarkdown(mimeType) {
|
|
205
|
+
return mimeType === "text/markdown" || mimeType === "text/x-markdown";
|
|
206
|
+
}
|
|
207
|
+
/**
|
|
208
|
+
* Checks if a MIME type represents plain text content.
|
|
209
|
+
*/
|
|
210
|
+
static isText(mimeType) {
|
|
211
|
+
return mimeType.startsWith("text/");
|
|
212
|
+
}
|
|
213
|
+
// Extend with more helpers as needed (isJson, isXml, isPdf, etc.)
|
|
214
|
+
}
|
|
171
215
|
class FingerprintGenerator {
|
|
172
216
|
headerGenerator;
|
|
173
217
|
/**
|
|
@@ -245,11 +289,15 @@ class HttpFetcher {
|
|
|
245
289
|
maxRedirects: followRedirects ? 5 : 0
|
|
246
290
|
};
|
|
247
291
|
const response = await axios.get(source, config);
|
|
292
|
+
const contentTypeHeader = response.headers["content-type"];
|
|
293
|
+
const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader);
|
|
294
|
+
const contentEncoding = response.headers["content-encoding"];
|
|
248
295
|
return {
|
|
249
296
|
content: response.data,
|
|
250
|
-
mimeType
|
|
251
|
-
|
|
252
|
-
encoding:
|
|
297
|
+
mimeType,
|
|
298
|
+
charset,
|
|
299
|
+
encoding: contentEncoding,
|
|
300
|
+
source
|
|
253
301
|
};
|
|
254
302
|
} catch (error) {
|
|
255
303
|
const axiosError = error;
|
|
@@ -322,55 +370,11 @@ class FileFetcher {
|
|
|
322
370
|
}
|
|
323
371
|
}
|
|
324
372
|
}
|
|
325
|
-
class ContentProcessingPipeline {
|
|
326
|
-
middleware;
|
|
327
|
-
/**
|
|
328
|
-
* Creates an instance of ContentProcessingPipeline.
|
|
329
|
-
* @param middleware An array of middleware instances to execute in order.
|
|
330
|
-
*/
|
|
331
|
-
constructor(middleware) {
|
|
332
|
-
this.middleware = middleware;
|
|
333
|
-
}
|
|
334
|
-
/**
|
|
335
|
-
* Executes the middleware pipeline with the given initial context.
|
|
336
|
-
* @param initialContext The starting context for the pipeline.
|
|
337
|
-
* @returns A promise that resolves with the final context after all middleware have executed.
|
|
338
|
-
*/
|
|
339
|
-
async run(initialContext) {
|
|
340
|
-
let index = -1;
|
|
341
|
-
const dispatch = async (i) => {
|
|
342
|
-
if (i <= index) {
|
|
343
|
-
throw new Error("next() called multiple times");
|
|
344
|
-
}
|
|
345
|
-
index = i;
|
|
346
|
-
const mw = this.middleware[i];
|
|
347
|
-
if (!mw) {
|
|
348
|
-
return;
|
|
349
|
-
}
|
|
350
|
-
const next = dispatch.bind(null, i + 1);
|
|
351
|
-
try {
|
|
352
|
-
await mw.process(initialContext, next);
|
|
353
|
-
} catch (error) {
|
|
354
|
-
initialContext.errors.push(
|
|
355
|
-
error instanceof Error ? error : new Error(String(error))
|
|
356
|
-
);
|
|
357
|
-
logger.warn(`Error in middleware pipeline: ${error}`);
|
|
358
|
-
}
|
|
359
|
-
};
|
|
360
|
-
await dispatch(0);
|
|
361
|
-
return initialContext;
|
|
362
|
-
}
|
|
363
|
-
}
|
|
364
373
|
class HtmlCheerioParserMiddleware {
|
|
365
374
|
async process(context, next) {
|
|
366
|
-
if (!context.contentType.startsWith("text/html")) {
|
|
367
|
-
await next();
|
|
368
|
-
return;
|
|
369
|
-
}
|
|
370
|
-
const htmlString = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
|
|
371
375
|
try {
|
|
372
376
|
logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
|
|
373
|
-
const $ = cheerio.load(
|
|
377
|
+
const $ = cheerio.load(context.content);
|
|
374
378
|
context.dom = $;
|
|
375
379
|
await next();
|
|
376
380
|
} catch (error) {
|
|
@@ -403,17 +407,15 @@ function createJSDOM(html, options) {
|
|
|
403
407
|
class HtmlLinkExtractorMiddleware {
|
|
404
408
|
/**
|
|
405
409
|
* Processes the context to extract links from the sanitized HTML body.
|
|
406
|
-
* @param context The current
|
|
410
|
+
* @param context The current middleware context.
|
|
407
411
|
* @param next Function to call the next middleware.
|
|
408
412
|
*/
|
|
409
413
|
async process(context, next) {
|
|
410
414
|
const $ = context.dom;
|
|
411
415
|
if (!$) {
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
);
|
|
416
|
-
}
|
|
416
|
+
logger.warn(
|
|
417
|
+
`Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
|
|
418
|
+
);
|
|
417
419
|
await next();
|
|
418
420
|
return;
|
|
419
421
|
}
|
|
@@ -460,11 +462,9 @@ class HtmlMetadataExtractorMiddleware {
|
|
|
460
462
|
async process(context, next) {
|
|
461
463
|
const $ = context.dom;
|
|
462
464
|
if (!$) {
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
);
|
|
467
|
-
}
|
|
465
|
+
logger.warn(
|
|
466
|
+
`Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
|
|
467
|
+
);
|
|
468
468
|
await next();
|
|
469
469
|
return;
|
|
470
470
|
}
|
|
@@ -526,10 +526,6 @@ class HtmlPlaywrightMiddleware {
|
|
|
526
526
|
}
|
|
527
527
|
}
|
|
528
528
|
async process(context, next) {
|
|
529
|
-
if (!context.contentType.startsWith("text/html")) {
|
|
530
|
-
await next();
|
|
531
|
-
return;
|
|
532
|
-
}
|
|
533
529
|
const scrapeMode = context.options?.scrapeMode ?? ScrapeMode.Auto;
|
|
534
530
|
const shouldRunPlaywright = scrapeMode === ScrapeMode.Playwright || scrapeMode === ScrapeMode.Auto;
|
|
535
531
|
if (!shouldRunPlaywright) {
|
|
@@ -552,7 +548,7 @@ class HtmlPlaywrightMiddleware {
|
|
|
552
548
|
if (route.request().url() === context.source) {
|
|
553
549
|
return route.fulfill({
|
|
554
550
|
status: 200,
|
|
555
|
-
contentType:
|
|
551
|
+
contentType: "text/html",
|
|
556
552
|
body: context.content
|
|
557
553
|
});
|
|
558
554
|
}
|
|
@@ -655,11 +651,9 @@ class HtmlSanitizerMiddleware {
|
|
|
655
651
|
async process(context, next) {
|
|
656
652
|
const $ = context.dom;
|
|
657
653
|
if (!$) {
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
);
|
|
662
|
-
}
|
|
654
|
+
logger.warn(
|
|
655
|
+
`Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
|
|
656
|
+
);
|
|
663
657
|
await next();
|
|
664
658
|
return;
|
|
665
659
|
}
|
|
@@ -769,11 +763,9 @@ ${text.replace(/^\n+|\n+$/g, "")}
|
|
|
769
763
|
async process(context, next) {
|
|
770
764
|
const $ = context.dom;
|
|
771
765
|
if (!$) {
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
);
|
|
776
|
-
}
|
|
766
|
+
logger.warn(
|
|
767
|
+
`Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware ran correctly.`
|
|
768
|
+
);
|
|
777
769
|
await next();
|
|
778
770
|
return;
|
|
779
771
|
}
|
|
@@ -785,10 +777,8 @@ ${text.replace(/^\n+|\n+$/g, "")}
|
|
|
785
777
|
const warnMsg = `HTML to Markdown conversion resulted in empty content for ${context.source}.`;
|
|
786
778
|
logger.warn(warnMsg);
|
|
787
779
|
context.content = "";
|
|
788
|
-
context.contentType = "text/markdown";
|
|
789
780
|
} else {
|
|
790
781
|
context.content = markdown;
|
|
791
|
-
context.contentType = "text/markdown";
|
|
792
782
|
logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
|
|
793
783
|
}
|
|
794
784
|
} catch (error) {
|
|
@@ -809,10 +799,8 @@ class MarkdownLinkExtractorMiddleware {
|
|
|
809
799
|
* @param next Function to call the next middleware.
|
|
810
800
|
*/
|
|
811
801
|
async process(context, next) {
|
|
812
|
-
if (context.
|
|
813
|
-
|
|
814
|
-
context.links = [];
|
|
815
|
-
}
|
|
802
|
+
if (!Array.isArray(context.links)) {
|
|
803
|
+
context.links = [];
|
|
816
804
|
}
|
|
817
805
|
await next();
|
|
818
806
|
}
|
|
@@ -824,31 +812,153 @@ class MarkdownMetadataExtractorMiddleware {
|
|
|
824
812
|
* @param next Function to call the next middleware.
|
|
825
813
|
*/
|
|
826
814
|
async process(context, next) {
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
}
|
|
833
|
-
let title = "Untitled";
|
|
834
|
-
if (context.contentType === "text/markdown") {
|
|
835
|
-
const match = textContent.match(/^#\s+(.*)$/m);
|
|
836
|
-
if (match?.[1]) {
|
|
837
|
-
title = match[1].trim();
|
|
838
|
-
}
|
|
839
|
-
}
|
|
840
|
-
context.metadata.title = title;
|
|
841
|
-
} catch (error) {
|
|
842
|
-
context.errors.push(
|
|
843
|
-
new Error(
|
|
844
|
-
`Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
|
|
845
|
-
)
|
|
846
|
-
);
|
|
815
|
+
try {
|
|
816
|
+
let title = "Untitled";
|
|
817
|
+
const match = context.content.match(/^#\s+(.*)$/m);
|
|
818
|
+
if (match?.[1]) {
|
|
819
|
+
title = match[1].trim();
|
|
847
820
|
}
|
|
821
|
+
context.metadata.title = title;
|
|
822
|
+
} catch (error) {
|
|
823
|
+
context.errors.push(
|
|
824
|
+
new Error(
|
|
825
|
+
`Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
|
|
826
|
+
)
|
|
827
|
+
);
|
|
848
828
|
}
|
|
849
829
|
await next();
|
|
850
830
|
}
|
|
851
831
|
}
|
|
832
|
+
function convertToString(content, charset) {
|
|
833
|
+
if (Buffer.isBuffer(content)) {
|
|
834
|
+
const decoder = new TextDecoder(charset || "utf-8");
|
|
835
|
+
return decoder.decode(content);
|
|
836
|
+
}
|
|
837
|
+
return content;
|
|
838
|
+
}
|
|
839
|
+
class BasePipeline {
|
|
840
|
+
/**
|
|
841
|
+
* Determines if this pipeline can process the given content.
|
|
842
|
+
* Must be implemented by derived classes.
|
|
843
|
+
*/
|
|
844
|
+
canProcess(_rawContent) {
|
|
845
|
+
throw new Error("Method not implemented.");
|
|
846
|
+
}
|
|
847
|
+
/**
|
|
848
|
+
* Processes the raw content through the pipeline.
|
|
849
|
+
* Must be implemented by derived classes.
|
|
850
|
+
*/
|
|
851
|
+
async process(_rawContent, _options, _fetcher) {
|
|
852
|
+
throw new Error("Method not implemented.");
|
|
853
|
+
}
|
|
854
|
+
/**
|
|
855
|
+
* Executes a middleware stack on the given context.
|
|
856
|
+
* This is a utility method used by derived pipeline classes.
|
|
857
|
+
*
|
|
858
|
+
* @param middleware - The middleware stack to execute
|
|
859
|
+
* @param context - The context to process
|
|
860
|
+
*/
|
|
861
|
+
async executeMiddlewareStack(middleware, context) {
|
|
862
|
+
let index = -1;
|
|
863
|
+
const dispatch = async (i) => {
|
|
864
|
+
if (i <= index) throw new Error("next() called multiple times");
|
|
865
|
+
index = i;
|
|
866
|
+
const mw = middleware[i];
|
|
867
|
+
if (!mw) return;
|
|
868
|
+
await mw.process(context, dispatch.bind(null, i + 1));
|
|
869
|
+
};
|
|
870
|
+
try {
|
|
871
|
+
await dispatch(0);
|
|
872
|
+
} catch (error) {
|
|
873
|
+
context.errors.push(error instanceof Error ? error : new Error(String(error)));
|
|
874
|
+
}
|
|
875
|
+
}
|
|
876
|
+
/**
|
|
877
|
+
* Cleans up resources when the pipeline is no longer needed.
|
|
878
|
+
* Default implementation does nothing.
|
|
879
|
+
*/
|
|
880
|
+
async close() {
|
|
881
|
+
}
|
|
882
|
+
}
|
|
883
|
+
class HtmlPipeline extends BasePipeline {
|
|
884
|
+
playwrightMiddleware;
|
|
885
|
+
standardMiddleware;
|
|
886
|
+
constructor() {
|
|
887
|
+
super();
|
|
888
|
+
this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
|
|
889
|
+
this.standardMiddleware = [
|
|
890
|
+
new HtmlCheerioParserMiddleware(),
|
|
891
|
+
new HtmlMetadataExtractorMiddleware(),
|
|
892
|
+
new HtmlLinkExtractorMiddleware(),
|
|
893
|
+
new HtmlSanitizerMiddleware(),
|
|
894
|
+
new HtmlToMarkdownMiddleware()
|
|
895
|
+
];
|
|
896
|
+
}
|
|
897
|
+
canProcess(rawContent) {
|
|
898
|
+
return MimeTypeUtils.isHtml(rawContent.mimeType);
|
|
899
|
+
}
|
|
900
|
+
async process(rawContent, options, fetcher) {
|
|
901
|
+
const contentString = convertToString(rawContent.content, rawContent.charset);
|
|
902
|
+
const context = {
|
|
903
|
+
content: contentString,
|
|
904
|
+
source: rawContent.source,
|
|
905
|
+
metadata: {},
|
|
906
|
+
links: [],
|
|
907
|
+
errors: [],
|
|
908
|
+
options,
|
|
909
|
+
fetcher
|
|
910
|
+
};
|
|
911
|
+
let middleware = [...this.standardMiddleware];
|
|
912
|
+
if (options.scrapeMode === "playwright" || options.scrapeMode === "auto") {
|
|
913
|
+
middleware = [this.playwrightMiddleware, ...middleware];
|
|
914
|
+
}
|
|
915
|
+
await this.executeMiddlewareStack(middleware, context);
|
|
916
|
+
return {
|
|
917
|
+
textContent: typeof context.content === "string" ? context.content : "",
|
|
918
|
+
metadata: context.metadata,
|
|
919
|
+
links: context.links,
|
|
920
|
+
errors: context.errors
|
|
921
|
+
};
|
|
922
|
+
}
|
|
923
|
+
async close() {
|
|
924
|
+
await this.playwrightMiddleware.closeBrowser();
|
|
925
|
+
}
|
|
926
|
+
}
|
|
927
|
+
class MarkdownPipeline extends BasePipeline {
|
|
928
|
+
middleware;
|
|
929
|
+
constructor() {
|
|
930
|
+
super();
|
|
931
|
+
this.middleware = [
|
|
932
|
+
new MarkdownMetadataExtractorMiddleware(),
|
|
933
|
+
new MarkdownLinkExtractorMiddleware()
|
|
934
|
+
];
|
|
935
|
+
}
|
|
936
|
+
canProcess(rawContent) {
|
|
937
|
+
if (!rawContent.mimeType) return false;
|
|
938
|
+
return MimeTypeUtils.isMarkdown(rawContent.mimeType) || MimeTypeUtils.isText(rawContent.mimeType);
|
|
939
|
+
}
|
|
940
|
+
async process(rawContent, options, fetcher) {
|
|
941
|
+
const contentString = convertToString(rawContent.content, rawContent.charset);
|
|
942
|
+
const context = {
|
|
943
|
+
content: contentString,
|
|
944
|
+
source: rawContent.source,
|
|
945
|
+
metadata: {},
|
|
946
|
+
links: [],
|
|
947
|
+
errors: [],
|
|
948
|
+
options,
|
|
949
|
+
fetcher
|
|
950
|
+
};
|
|
951
|
+
await this.executeMiddlewareStack(this.middleware, context);
|
|
952
|
+
return {
|
|
953
|
+
textContent: typeof context.content === "string" ? context.content : "",
|
|
954
|
+
metadata: context.metadata,
|
|
955
|
+
links: context.links,
|
|
956
|
+
errors: context.errors
|
|
957
|
+
};
|
|
958
|
+
}
|
|
959
|
+
async close() {
|
|
960
|
+
}
|
|
961
|
+
}
|
|
852
962
|
class PipelineError extends Error {
|
|
853
963
|
constructor(message, cause) {
|
|
854
964
|
super(message);
|
|
@@ -976,12 +1086,15 @@ class BaseScraperStrategy {
|
|
|
976
1086
|
class WebScraperStrategy extends BaseScraperStrategy {
|
|
977
1087
|
httpFetcher = new HttpFetcher();
|
|
978
1088
|
shouldFollowLinkFn;
|
|
979
|
-
|
|
980
|
-
|
|
1089
|
+
htmlPipeline;
|
|
1090
|
+
markdownPipeline;
|
|
1091
|
+
pipelines;
|
|
981
1092
|
constructor(options = {}) {
|
|
982
1093
|
super({ urlNormalizerOptions: options.urlNormalizerOptions });
|
|
983
1094
|
this.shouldFollowLinkFn = options.shouldFollowLink;
|
|
984
|
-
this.
|
|
1095
|
+
this.htmlPipeline = new HtmlPipeline();
|
|
1096
|
+
this.markdownPipeline = new MarkdownPipeline();
|
|
1097
|
+
this.pipelines = [this.htmlPipeline, this.markdownPipeline];
|
|
985
1098
|
}
|
|
986
1099
|
canHandle(url) {
|
|
987
1100
|
try {
|
|
@@ -1015,54 +1128,28 @@ class WebScraperStrategy extends BaseScraperStrategy {
|
|
|
1015
1128
|
followRedirects: options.followRedirects
|
|
1016
1129
|
};
|
|
1017
1130
|
const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
options,
|
|
1027
|
-
fetcher: this.httpFetcher
|
|
1028
|
-
};
|
|
1029
|
-
let pipeline;
|
|
1030
|
-
if (initialContext.contentType.startsWith("text/html")) {
|
|
1031
|
-
const htmlPipelineSteps = [
|
|
1032
|
-
this.playwrightMiddleware,
|
|
1033
|
-
// Use the instance member
|
|
1034
|
-
// TODO: Add HtmlJsExecutorMiddleware here if needed based on options
|
|
1035
|
-
new HtmlCheerioParserMiddleware(),
|
|
1036
|
-
// Always runs after content is finalized
|
|
1037
|
-
new HtmlMetadataExtractorMiddleware(),
|
|
1038
|
-
new HtmlLinkExtractorMiddleware(),
|
|
1039
|
-
new HtmlSanitizerMiddleware(),
|
|
1040
|
-
// Element remover
|
|
1041
|
-
new HtmlToMarkdownMiddleware()
|
|
1042
|
-
];
|
|
1043
|
-
pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
|
|
1044
|
-
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
|
|
1045
|
-
pipeline = new ContentProcessingPipeline([
|
|
1046
|
-
new MarkdownMetadataExtractorMiddleware(),
|
|
1047
|
-
new MarkdownLinkExtractorMiddleware()
|
|
1048
|
-
// Placeholder for now
|
|
1049
|
-
]);
|
|
1050
|
-
} else {
|
|
1131
|
+
let processed;
|
|
1132
|
+
for (const pipeline of this.pipelines) {
|
|
1133
|
+
if (pipeline.canProcess(rawContent)) {
|
|
1134
|
+
processed = await pipeline.process(rawContent, options, this.httpFetcher);
|
|
1135
|
+
break;
|
|
1136
|
+
}
|
|
1137
|
+
}
|
|
1138
|
+
if (!processed) {
|
|
1051
1139
|
logger.warn(
|
|
1052
|
-
`Unsupported content type "${
|
|
1140
|
+
`Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
|
|
1053
1141
|
);
|
|
1054
1142
|
return { document: void 0, links: [] };
|
|
1055
1143
|
}
|
|
1056
|
-
const
|
|
1057
|
-
for (const err of finalContext.errors) {
|
|
1144
|
+
for (const err of processed.errors) {
|
|
1058
1145
|
logger.warn(`Processing error for ${url}: ${err.message}`);
|
|
1059
1146
|
}
|
|
1060
|
-
if (
|
|
1147
|
+
if (!processed.textContent || !processed.textContent.trim()) {
|
|
1061
1148
|
logger.warn(`No processable content found for ${url} after pipeline execution.`);
|
|
1062
|
-
return { document: void 0, links:
|
|
1149
|
+
return { document: void 0, links: processed.links };
|
|
1063
1150
|
}
|
|
1064
1151
|
const baseUrl = new URL(options.url);
|
|
1065
|
-
const filteredLinks =
|
|
1152
|
+
const filteredLinks = processed.links.filter((link) => {
|
|
1066
1153
|
try {
|
|
1067
1154
|
const targetUrl = new URL(link);
|
|
1068
1155
|
const scope = options.scope || "subpages";
|
|
@@ -1073,20 +1160,16 @@ class WebScraperStrategy extends BaseScraperStrategy {
|
|
|
1073
1160
|
});
|
|
1074
1161
|
return {
|
|
1075
1162
|
document: {
|
|
1076
|
-
content:
|
|
1077
|
-
// Final processed content (Markdown)
|
|
1163
|
+
content: processed.textContent,
|
|
1078
1164
|
metadata: {
|
|
1079
|
-
url
|
|
1080
|
-
|
|
1081
|
-
// Ensure title is a string, default to "Untitled"
|
|
1082
|
-
title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
|
|
1165
|
+
url,
|
|
1166
|
+
title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
|
|
1083
1167
|
library: options.library,
|
|
1084
|
-
version: options.version
|
|
1085
|
-
|
|
1168
|
+
version: options.version,
|
|
1169
|
+
...processed.metadata
|
|
1086
1170
|
}
|
|
1087
1171
|
},
|
|
1088
1172
|
links: filteredLinks
|
|
1089
|
-
// Use the filtered links
|
|
1090
1173
|
};
|
|
1091
1174
|
} catch (error) {
|
|
1092
1175
|
logger.error(`Failed processing page ${url}: ${error}`);
|
|
@@ -1101,7 +1184,8 @@ class WebScraperStrategy extends BaseScraperStrategy {
|
|
|
1101
1184
|
try {
|
|
1102
1185
|
await super.scrape(options, progressCallback, signal);
|
|
1103
1186
|
} finally {
|
|
1104
|
-
await this.
|
|
1187
|
+
await this.htmlPipeline.close();
|
|
1188
|
+
await this.markdownPipeline.close();
|
|
1105
1189
|
}
|
|
1106
1190
|
}
|
|
1107
1191
|
}
|
|
@@ -1153,6 +1237,15 @@ class GitHubScraperStrategy {
|
|
|
1153
1237
|
}
|
|
1154
1238
|
class LocalFileStrategy extends BaseScraperStrategy {
|
|
1155
1239
|
fileFetcher = new FileFetcher();
|
|
1240
|
+
htmlPipeline;
|
|
1241
|
+
markdownPipeline;
|
|
1242
|
+
pipelines;
|
|
1243
|
+
constructor() {
|
|
1244
|
+
super();
|
|
1245
|
+
this.htmlPipeline = new HtmlPipeline();
|
|
1246
|
+
this.markdownPipeline = new MarkdownPipeline();
|
|
1247
|
+
this.pipelines = [this.htmlPipeline, this.markdownPipeline];
|
|
1248
|
+
}
|
|
1156
1249
|
canHandle(url) {
|
|
1157
1250
|
return url.startsWith("file://");
|
|
1158
1251
|
}
|
|
@@ -1167,62 +1260,41 @@ class LocalFileStrategy extends BaseScraperStrategy {
|
|
|
1167
1260
|
}
|
|
1168
1261
|
logger.info(`📄 Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
|
|
1169
1262
|
const rawContent = await this.fileFetcher.fetch(item.url);
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
errors: [],
|
|
1179
|
-
options
|
|
1180
|
-
// Pass the full options object
|
|
1181
|
-
};
|
|
1182
|
-
let pipeline;
|
|
1183
|
-
if (initialContext.contentType.startsWith("text/html")) {
|
|
1184
|
-
pipeline = new ContentProcessingPipeline([
|
|
1185
|
-
new HtmlCheerioParserMiddleware(),
|
|
1186
|
-
new HtmlMetadataExtractorMiddleware(),
|
|
1187
|
-
// No HtmlLinkExtractorMiddleware needed for local files
|
|
1188
|
-
new HtmlSanitizerMiddleware(),
|
|
1189
|
-
new HtmlToMarkdownMiddleware()
|
|
1190
|
-
]);
|
|
1191
|
-
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain" || // Treat plain text as markdown
|
|
1192
|
-
initialContext.contentType.startsWith("text/")) {
|
|
1193
|
-
pipeline = new ContentProcessingPipeline([
|
|
1194
|
-
new MarkdownMetadataExtractorMiddleware()
|
|
1195
|
-
// No MarkdownLinkExtractorMiddleware needed for local files
|
|
1196
|
-
]);
|
|
1197
|
-
} else {
|
|
1263
|
+
let processed;
|
|
1264
|
+
for (const pipeline of this.pipelines) {
|
|
1265
|
+
if (pipeline.canProcess(rawContent)) {
|
|
1266
|
+
processed = await pipeline.process(rawContent, options, this.fileFetcher);
|
|
1267
|
+
break;
|
|
1268
|
+
}
|
|
1269
|
+
}
|
|
1270
|
+
if (!processed) {
|
|
1198
1271
|
logger.warn(
|
|
1199
|
-
`Unsupported content type "${
|
|
1272
|
+
`Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
|
|
1200
1273
|
);
|
|
1201
1274
|
return { document: void 0, links: [] };
|
|
1202
1275
|
}
|
|
1203
|
-
const
|
|
1204
|
-
for (const err of finalContext.errors) {
|
|
1276
|
+
for (const err of processed.errors) {
|
|
1205
1277
|
logger.warn(`Processing error for ${filePath}: ${err.message}`);
|
|
1206
1278
|
}
|
|
1207
|
-
const finalContentString = typeof finalContext.content === "string" ? finalContext.content : Buffer.from(finalContext.content).toString("utf-8");
|
|
1208
1279
|
return {
|
|
1209
1280
|
document: {
|
|
1210
|
-
|
|
1211
|
-
content: finalContentString,
|
|
1281
|
+
content: typeof processed.textContent === "string" ? processed.textContent : "",
|
|
1212
1282
|
metadata: {
|
|
1213
|
-
url:
|
|
1214
|
-
|
|
1215
|
-
// Ensure title is a string, default to "Untitled"
|
|
1216
|
-
title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
|
|
1283
|
+
url: rawContent.source,
|
|
1284
|
+
title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
|
|
1217
1285
|
library: options.library,
|
|
1218
1286
|
version: options.version
|
|
1219
1287
|
}
|
|
1220
1288
|
}
|
|
1221
|
-
// No links returned from file content processing
|
|
1222
1289
|
};
|
|
1223
1290
|
}
|
|
1224
1291
|
async scrape(options, progressCallback, signal) {
|
|
1225
|
-
|
|
1292
|
+
try {
|
|
1293
|
+
await super.scrape(options, progressCallback, signal);
|
|
1294
|
+
} finally {
|
|
1295
|
+
await this.htmlPipeline.close();
|
|
1296
|
+
await this.markdownPipeline.close();
|
|
1297
|
+
}
|
|
1226
1298
|
}
|
|
1227
1299
|
}
|
|
1228
1300
|
class NpmScraperStrategy {
|
|
@@ -2758,7 +2830,7 @@ class DocumentStore {
|
|
|
2758
2830
|
*/
|
|
2759
2831
|
async initializeEmbeddings() {
|
|
2760
2832
|
const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
|
|
2761
|
-
const { createEmbeddingModel } = await import("./EmbeddingFactory-
|
|
2833
|
+
const { createEmbeddingModel } = await import("./EmbeddingFactory-0Z5e_g1J.js");
|
|
2762
2834
|
this.embeddings = createEmbeddingModel(modelSpec);
|
|
2763
2835
|
const testVector = await this.embeddings.embedQuery("test");
|
|
2764
2836
|
this.modelDimension = testVector.length;
|
|
@@ -2873,7 +2945,12 @@ class DocumentStore {
|
|
|
2873
2945
|
`;
|
|
2874
2946
|
return `${header}${doc.pageContent}`;
|
|
2875
2947
|
});
|
|
2876
|
-
const rawEmbeddings =
|
|
2948
|
+
const rawEmbeddings = [];
|
|
2949
|
+
for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
|
|
2950
|
+
const batchTexts = texts.slice(i, i + EMBEDDING_BATCH_SIZE);
|
|
2951
|
+
const batchEmbeddings = await this.embeddings.embedDocuments(batchTexts);
|
|
2952
|
+
rawEmbeddings.push(...batchEmbeddings);
|
|
2953
|
+
}
|
|
2877
2954
|
const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
|
|
2878
2955
|
const transaction = this.db.transaction((docs) => {
|
|
2879
2956
|
for (let i = 0; i < docs.length; i++) {
|
|
@@ -3372,12 +3449,11 @@ class DocumentManagementService {
|
|
|
3372
3449
|
}
|
|
3373
3450
|
}
|
|
3374
3451
|
export {
|
|
3375
|
-
ContentProcessingPipeline as C,
|
|
3376
3452
|
DocumentManagementService as D,
|
|
3377
3453
|
FileFetcher as F,
|
|
3378
3454
|
HttpFetcher as H,
|
|
3379
3455
|
LibraryNotFoundError as L,
|
|
3380
|
-
|
|
3456
|
+
MarkdownPipeline as M,
|
|
3381
3457
|
PipelineJobStatus as P,
|
|
3382
3458
|
SearchTool as S,
|
|
3383
3459
|
ToolError as T,
|
|
@@ -3392,18 +3468,14 @@ export {
|
|
|
3392
3468
|
DEFAULT_HTTP_PORT as h,
|
|
3393
3469
|
DEFAULT_MAX_CONCURRENCY as i,
|
|
3394
3470
|
ScrapeMode as j,
|
|
3395
|
-
|
|
3471
|
+
HtmlPipeline as k,
|
|
3396
3472
|
logger as l,
|
|
3397
|
-
|
|
3398
|
-
|
|
3399
|
-
|
|
3400
|
-
|
|
3401
|
-
|
|
3402
|
-
|
|
3403
|
-
setLogLevel as s
|
|
3404
|
-
getProjectRoot as t,
|
|
3405
|
-
DEFAULT_WEB_PORT as u,
|
|
3406
|
-
DimensionError as v,
|
|
3407
|
-
VECTOR_DIMENSION as w
|
|
3473
|
+
ScraperError as m,
|
|
3474
|
+
createJSDOM as n,
|
|
3475
|
+
getProjectRoot as o,
|
|
3476
|
+
DEFAULT_WEB_PORT as p,
|
|
3477
|
+
DimensionError as q,
|
|
3478
|
+
VECTOR_DIMENSION as r,
|
|
3479
|
+
setLogLevel as s
|
|
3408
3480
|
};
|
|
3409
|
-
//# sourceMappingURL=DocumentManagementService-
|
|
3481
|
+
//# sourceMappingURL=DocumentManagementService-BGW9iWNn.js.map
|