@arabold/docs-mcp-server 1.12.4 → 1.14.0

@@ -10,6 +10,7 @@ import { VirtualConsole, JSDOM } from "jsdom";
  import { chromium } from "playwright";
  import { gfm } from "@joplin/turndown-plugin-gfm";
  import TurndownService from "turndown";
+ import { TextDecoder } from "node:util";
  import { URL as URL$1, fileURLToPath } from "node:url";
  import * as semver from "semver";
  import semver__default from "semver";
@@ -168,6 +169,49 @@ const FETCHER_BASE_DELAY = 1e3;
  const SPLITTER_MIN_CHUNK_SIZE = 500;
  const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
  const SPLITTER_MAX_CHUNK_SIZE = 5e3;
+ const EMBEDDING_BATCH_SIZE = 100;
+ class MimeTypeUtils {
+   /**
+    * Parses a Content-Type header string into its MIME type and charset.
+    * @param contentTypeHeader The Content-Type header string (e.g., "text/html; charset=utf-8").
+    * @returns A ParsedContentType object, or a default if parsing fails.
+    */
+   static parseContentType(contentTypeHeader) {
+     if (!contentTypeHeader) {
+       return { mimeType: "application/octet-stream" };
+     }
+     const parts = contentTypeHeader.split(";").map((part) => part.trim());
+     const mimeType = parts[0].toLowerCase();
+     let charset;
+     for (let i = 1; i < parts.length; i++) {
+       const param = parts[i];
+       if (param.toLowerCase().startsWith("charset=")) {
+         charset = param.substring("charset=".length).toLowerCase();
+         break;
+       }
+     }
+     return { mimeType, charset };
+   }
+   /**
+    * Checks if a MIME type represents HTML content.
+    */
+   static isHtml(mimeType) {
+     return mimeType === "text/html" || mimeType === "application/xhtml+xml";
+   }
+   /**
+    * Checks if a MIME type represents Markdown content.
+    */
+   static isMarkdown(mimeType) {
+     return mimeType === "text/markdown" || mimeType === "text/x-markdown";
+   }
+   /**
+    * Checks if a MIME type represents plain text content.
+    */
+   static isText(mimeType) {
+     return mimeType.startsWith("text/");
+   }
+   // Extend with more helpers as needed (isJson, isXml, isPdf, etc.)
+ }
  class FingerprintGenerator {
    headerGenerator;
    /**
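
For orientation, the new MimeTypeUtils helper normalizes a raw Content-Type header into a lower-cased MIME type plus an optional charset. A minimal usage sketch (illustrative only, not part of the package):

    // Sketch: how the added helper behaves for a typical header.
    const parsed = MimeTypeUtils.parseContentType("text/html; charset=ISO-8859-1");
    // parsed -> { mimeType: "text/html", charset: "iso-8859-1" }
    MimeTypeUtils.isHtml(parsed.mimeType);        // true
    MimeTypeUtils.parseContentType(undefined);    // { mimeType: "application/octet-stream" }
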
@@ -245,11 +289,15 @@ class HttpFetcher {
        maxRedirects: followRedirects ? 5 : 0
      };
      const response = await axios.get(source, config);
+     const contentTypeHeader = response.headers["content-type"];
+     const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader);
+     const contentEncoding = response.headers["content-encoding"];
      return {
        content: response.data,
-       mimeType: response.headers["content-type"] || "application/octet-stream",
-       source,
-       encoding: response.headers["content-encoding"]
+       mimeType,
+       charset,
+       encoding: contentEncoding,
+       source
      };
    } catch (error) {
      const axiosError = error;
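
With this change the fetch result carries the parsed MIME type and charset as separate fields instead of the raw header string. A hypothetical caller (sketch only; the URL is made up, the option name is taken from the diff above):

    const result = await new HttpFetcher().fetch("https://example.com/docs", { followRedirects: true });
    // result.mimeType -> e.g. "text/html" (lower-cased, parameters stripped)
    // result.charset  -> e.g. "utf-8", or undefined when the header has no charset
    // result.encoding -> the Content-Encoding header, e.g. "gzip", if any
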
@@ -322,55 +370,11 @@ class FileFetcher {
      }
    }
  }
- class ContentProcessingPipeline {
-   middleware;
-   /**
-    * Creates an instance of ContentProcessingPipeline.
-    * @param middleware An array of middleware instances to execute in order.
-    */
-   constructor(middleware) {
-     this.middleware = middleware;
-   }
-   /**
-    * Executes the middleware pipeline with the given initial context.
-    * @param initialContext The starting context for the pipeline.
-    * @returns A promise that resolves with the final context after all middleware have executed.
-    */
-   async run(initialContext) {
-     let index = -1;
-     const dispatch = async (i) => {
-       if (i <= index) {
-         throw new Error("next() called multiple times");
-       }
-       index = i;
-       const mw = this.middleware[i];
-       if (!mw) {
-         return;
-       }
-       const next = dispatch.bind(null, i + 1);
-       try {
-         await mw.process(initialContext, next);
-       } catch (error) {
-         initialContext.errors.push(
-           error instanceof Error ? error : new Error(String(error))
-         );
-         logger.warn(`Error in middleware pipeline: ${error}`);
-       }
-     };
-     await dispatch(0);
-     return initialContext;
-   }
- }
  class HtmlCheerioParserMiddleware {
    async process(context, next) {
-     if (!context.contentType.startsWith("text/html")) {
-       await next();
-       return;
-     }
-     const htmlString = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
      try {
        logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
-       const $ = cheerio.load(htmlString);
+       const $ = cheerio.load(context.content);
        context.dom = $;
        await next();
      } catch (error) {
@@ -403,17 +407,15 @@ function createJSDOM(html, options) {
  class HtmlLinkExtractorMiddleware {
    /**
     * Processes the context to extract links from the sanitized HTML body.
-    * @param context The current processing context.
+    * @param context The current middleware context.
     * @param next Function to call the next middleware.
     */
    async process(context, next) {
      const $ = context.dom;
      if (!$) {
-       if (context.contentType.startsWith("text/html")) {
-         logger.warn(
-           `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
-         );
-       }
+       logger.warn(
+         `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
+       );
        await next();
        return;
      }
@@ -460,11 +462,9 @@ class HtmlMetadataExtractorMiddleware {
    async process(context, next) {
      const $ = context.dom;
      if (!$) {
-       if (context.contentType.startsWith("text/html")) {
-         logger.warn(
-           `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
-         );
-       }
+       logger.warn(
+         `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
+       );
        await next();
        return;
      }
@@ -526,10 +526,6 @@ class HtmlPlaywrightMiddleware {
      }
    }
    async process(context, next) {
-     if (!context.contentType.startsWith("text/html")) {
-       await next();
-       return;
-     }
      const scrapeMode = context.options?.scrapeMode ?? ScrapeMode.Auto;
      const shouldRunPlaywright = scrapeMode === ScrapeMode.Playwright || scrapeMode === ScrapeMode.Auto;
      if (!shouldRunPlaywright) {
@@ -552,7 +548,7 @@ class HtmlPlaywrightMiddleware {
        if (route.request().url() === context.source) {
          return route.fulfill({
            status: 200,
-           contentType: context.contentType,
+           contentType: "text/html",
            body: context.content
          });
        }
655
651
  async process(context, next) {
656
652
  const $ = context.dom;
657
653
  if (!$) {
658
- if (context.contentType.startsWith("text/html")) {
659
- logger.warn(
660
- `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
661
- );
662
- }
654
+ logger.warn(
655
+ `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
656
+ );
663
657
  await next();
664
658
  return;
665
659
  }
@@ -769,11 +763,9 @@ ${text.replace(/^\n+|\n+$/g, "")}
    async process(context, next) {
      const $ = context.dom;
      if (!$) {
-       if (context.contentType.startsWith("text/html")) {
-         logger.warn(
-           `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware ran correctly.`
-         );
-       }
+       logger.warn(
+         `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware ran correctly.`
+       );
        await next();
        return;
      }
@@ -785,10 +777,8 @@ ${text.replace(/^\n+|\n+$/g, "")}
        const warnMsg = `HTML to Markdown conversion resulted in empty content for ${context.source}.`;
        logger.warn(warnMsg);
        context.content = "";
-       context.contentType = "text/markdown";
      } else {
        context.content = markdown;
-       context.contentType = "text/markdown";
        logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
      }
    } catch (error) {
@@ -809,10 +799,8 @@ class MarkdownLinkExtractorMiddleware {
     * @param next Function to call the next middleware.
     */
    async process(context, next) {
-     if (context.contentType === "text/markdown") {
-       if (!Array.isArray(context.links)) {
-         context.links = [];
-       }
+     if (!Array.isArray(context.links)) {
+       context.links = [];
      }
      await next();
    }
@@ -824,31 +812,153 @@ class MarkdownMetadataExtractorMiddleware {
     * @param next Function to call the next middleware.
     */
    async process(context, next) {
-     if (context.contentType === "text/markdown" || context.contentType === "text/plain") {
-       try {
-         const textContent = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
-         if (typeof context.content !== "string") {
-           context.content = textContent;
-         }
-         let title = "Untitled";
-         if (context.contentType === "text/markdown") {
-           const match = textContent.match(/^#\s+(.*)$/m);
-           if (match?.[1]) {
-             title = match[1].trim();
-           }
-         }
-         context.metadata.title = title;
-       } catch (error) {
-         context.errors.push(
-           new Error(
-             `Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
-           )
-         );
+     try {
+       let title = "Untitled";
+       const match = context.content.match(/^#\s+(.*)$/m);
+       if (match?.[1]) {
+         title = match[1].trim();
        }
+       context.metadata.title = title;
+     } catch (error) {
+       context.errors.push(
+         new Error(
+           `Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
+         )
+       );
      }
      await next();
    }
  }
+ function convertToString(content, charset) {
+   if (Buffer.isBuffer(content)) {
+     const decoder = new TextDecoder(charset || "utf-8");
+     return decoder.decode(content);
+   }
+   return content;
+ }
+ class BasePipeline {
+   /**
+    * Determines if this pipeline can process the given content.
+    * Must be implemented by derived classes.
+    */
+   canProcess(_rawContent) {
+     throw new Error("Method not implemented.");
+   }
+   /**
+    * Processes the raw content through the pipeline.
+    * Must be implemented by derived classes.
+    */
+   async process(_rawContent, _options, _fetcher) {
+     throw new Error("Method not implemented.");
+   }
+   /**
+    * Executes a middleware stack on the given context.
+    * This is a utility method used by derived pipeline classes.
+    *
+    * @param middleware - The middleware stack to execute
+    * @param context - The context to process
+    */
+   async executeMiddlewareStack(middleware, context) {
+     let index = -1;
+     const dispatch = async (i) => {
+       if (i <= index) throw new Error("next() called multiple times");
+       index = i;
+       const mw = middleware[i];
+       if (!mw) return;
+       await mw.process(context, dispatch.bind(null, i + 1));
+     };
+     try {
+       await dispatch(0);
+     } catch (error) {
+       context.errors.push(error instanceof Error ? error : new Error(String(error)));
+     }
+   }
+   /**
+    * Cleans up resources when the pipeline is no longer needed.
+    * Default implementation does nothing.
+    */
+   async close() {
+   }
+ }
+ class HtmlPipeline extends BasePipeline {
+   playwrightMiddleware;
+   standardMiddleware;
+   constructor() {
+     super();
+     this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
+     this.standardMiddleware = [
+       new HtmlCheerioParserMiddleware(),
+       new HtmlMetadataExtractorMiddleware(),
+       new HtmlLinkExtractorMiddleware(),
+       new HtmlSanitizerMiddleware(),
+       new HtmlToMarkdownMiddleware()
+     ];
+   }
+   canProcess(rawContent) {
+     return MimeTypeUtils.isHtml(rawContent.mimeType);
+   }
+   async process(rawContent, options, fetcher) {
+     const contentString = convertToString(rawContent.content, rawContent.charset);
+     const context = {
+       content: contentString,
+       source: rawContent.source,
+       metadata: {},
+       links: [],
+       errors: [],
+       options,
+       fetcher
+     };
+     let middleware = [...this.standardMiddleware];
+     if (options.scrapeMode === "playwright" || options.scrapeMode === "auto") {
+       middleware = [this.playwrightMiddleware, ...middleware];
+     }
+     await this.executeMiddlewareStack(middleware, context);
+     return {
+       textContent: typeof context.content === "string" ? context.content : "",
+       metadata: context.metadata,
+       links: context.links,
+       errors: context.errors
+     };
+   }
+   async close() {
+     await this.playwrightMiddleware.closeBrowser();
+   }
+ }
+ class MarkdownPipeline extends BasePipeline {
+   middleware;
+   constructor() {
+     super();
+     this.middleware = [
+       new MarkdownMetadataExtractorMiddleware(),
+       new MarkdownLinkExtractorMiddleware()
+     ];
+   }
+   canProcess(rawContent) {
+     if (!rawContent.mimeType) return false;
+     return MimeTypeUtils.isMarkdown(rawContent.mimeType) || MimeTypeUtils.isText(rawContent.mimeType);
+   }
+   async process(rawContent, options, fetcher) {
+     const contentString = convertToString(rawContent.content, rawContent.charset);
+     const context = {
+       content: contentString,
+       source: rawContent.source,
+       metadata: {},
+       links: [],
+       errors: [],
+       options,
+       fetcher
+     };
+     await this.executeMiddlewareStack(this.middleware, context);
+     return {
+       textContent: typeof context.content === "string" ? context.content : "",
+       metadata: context.metadata,
+       links: context.links,
+       errors: context.errors
+     };
+   }
+   async close() {
+   }
+ }
  class PipelineError extends Error {
    constructor(message, cause) {
      super(message);
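
Taken together, BasePipeline now hosts the middleware dispatcher that ContentProcessingPipeline used to provide, while HtmlPipeline and MarkdownPipeline pair a canProcess() MIME check with their own middleware stacks. A minimal consumer might look like this (hypothetical sketch; rawContent, options, and fetcher are assumed to come from one of the fetchers and the scraper options):

    const pipelines = [new HtmlPipeline(), new MarkdownPipeline()];
    const pipeline = pipelines.find((p) => p.canProcess(rawContent));
    if (pipeline) {
      // process() decodes the buffer via convertToString() and runs the middleware stack.
      const processed = await pipeline.process(rawContent, options, fetcher);
      console.log(processed.metadata.title, processed.links.length, processed.errors.length);
      await pipeline.close(); // releases the Playwright browser for HtmlPipeline
    }
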
@@ -976,12 +1086,15 @@ class BaseScraperStrategy {
  class WebScraperStrategy extends BaseScraperStrategy {
    httpFetcher = new HttpFetcher();
    shouldFollowLinkFn;
-   playwrightMiddleware;
-   // Add member
+   htmlPipeline;
+   markdownPipeline;
+   pipelines;
    constructor(options = {}) {
      super({ urlNormalizerOptions: options.urlNormalizerOptions });
      this.shouldFollowLinkFn = options.shouldFollowLink;
-     this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
+     this.htmlPipeline = new HtmlPipeline();
+     this.markdownPipeline = new MarkdownPipeline();
+     this.pipelines = [this.htmlPipeline, this.markdownPipeline];
    }
    canHandle(url) {
      try {
@@ -1015,54 +1128,28 @@ class WebScraperStrategy extends BaseScraperStrategy {
        followRedirects: options.followRedirects
      };
      const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
-     const initialContext = {
-       content: rawContent.content,
-       contentType: rawContent.mimeType,
-       source: rawContent.source,
-       // Use the final source URL after redirects
-       metadata: {},
-       links: [],
-       errors: [],
-       options,
-       fetcher: this.httpFetcher
-     };
-     let pipeline;
-     if (initialContext.contentType.startsWith("text/html")) {
-       const htmlPipelineSteps = [
-         this.playwrightMiddleware,
-         // Use the instance member
-         // TODO: Add HtmlJsExecutorMiddleware here if needed based on options
-         new HtmlCheerioParserMiddleware(),
-         // Always runs after content is finalized
-         new HtmlMetadataExtractorMiddleware(),
-         new HtmlLinkExtractorMiddleware(),
-         new HtmlSanitizerMiddleware(),
-         // Element remover
-         new HtmlToMarkdownMiddleware()
-       ];
-       pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
-     } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
-       pipeline = new ContentProcessingPipeline([
-         new MarkdownMetadataExtractorMiddleware(),
-         new MarkdownLinkExtractorMiddleware()
-         // Placeholder for now
-       ]);
-     } else {
+     let processed;
+     for (const pipeline of this.pipelines) {
+       if (pipeline.canProcess(rawContent)) {
+         processed = await pipeline.process(rawContent, options, this.httpFetcher);
+         break;
+       }
+     }
+     if (!processed) {
        logger.warn(
-         `Unsupported content type "${initialContext.contentType}" for URL ${url}. Skipping processing.`
+         `Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
        );
        return { document: void 0, links: [] };
      }
-     const finalContext = await pipeline.run(initialContext);
-     for (const err of finalContext.errors) {
+     for (const err of processed.errors) {
        logger.warn(`Processing error for ${url}: ${err.message}`);
      }
-     if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
+     if (!processed.textContent || !processed.textContent.trim()) {
        logger.warn(`No processable content found for ${url} after pipeline execution.`);
-       return { document: void 0, links: finalContext.links };
+       return { document: void 0, links: processed.links };
      }
      const baseUrl = new URL(options.url);
-     const filteredLinks = finalContext.links.filter((link) => {
+     const filteredLinks = processed.links.filter((link) => {
        try {
          const targetUrl = new URL(link);
          const scope = options.scope || "subpages";
@@ -1073,20 +1160,16 @@ class WebScraperStrategy extends BaseScraperStrategy {
      });
      return {
        document: {
-         content: finalContext.content,
-         // Final processed content (Markdown)
+         content: processed.textContent,
          metadata: {
-           url: finalContext.source,
-           // URL after redirects
-           // Ensure title is a string, default to "Untitled"
-           title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
+           url,
+           title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
            library: options.library,
-           version: options.version
-           // Add other metadata from context if needed
+           version: options.version,
+           ...processed.metadata
          }
        },
        links: filteredLinks
-       // Use the filtered links
      };
    } catch (error) {
      logger.error(`Failed processing page ${url}: ${error}`);
@@ -1101,7 +1184,8 @@ class WebScraperStrategy extends BaseScraperStrategy {
      try {
        await super.scrape(options, progressCallback, signal);
      } finally {
-       await this.playwrightMiddleware.closeBrowser();
+       await this.htmlPipeline.close();
+       await this.markdownPipeline.close();
      }
    }
  }
@@ -1153,6 +1237,15 @@ class GitHubScraperStrategy {
  }
  class LocalFileStrategy extends BaseScraperStrategy {
    fileFetcher = new FileFetcher();
+   htmlPipeline;
+   markdownPipeline;
+   pipelines;
+   constructor() {
+     super();
+     this.htmlPipeline = new HtmlPipeline();
+     this.markdownPipeline = new MarkdownPipeline();
+     this.pipelines = [this.htmlPipeline, this.markdownPipeline];
+   }
    canHandle(url) {
      return url.startsWith("file://");
    }
@@ -1167,62 +1260,41 @@ class LocalFileStrategy extends BaseScraperStrategy {
      }
      logger.info(`📄 Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
      const rawContent = await this.fileFetcher.fetch(item.url);
-     const initialContext = {
-       content: rawContent.content,
-       contentType: rawContent.mimeType,
-       source: rawContent.source,
-       // file:// URL
-       metadata: {},
-       links: [],
-       // LocalFileStrategy doesn't extract links from file content itself
-       errors: [],
-       options
-       // Pass the full options object
-     };
-     let pipeline;
-     if (initialContext.contentType.startsWith("text/html")) {
-       pipeline = new ContentProcessingPipeline([
-         new HtmlCheerioParserMiddleware(),
-         new HtmlMetadataExtractorMiddleware(),
-         // No HtmlLinkExtractorMiddleware needed for local files
-         new HtmlSanitizerMiddleware(),
-         new HtmlToMarkdownMiddleware()
-       ]);
-     } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain" || // Treat plain text as markdown
-     initialContext.contentType.startsWith("text/")) {
-       pipeline = new ContentProcessingPipeline([
-         new MarkdownMetadataExtractorMiddleware()
-         // No MarkdownLinkExtractorMiddleware needed for local files
-       ]);
-     } else {
+     let processed;
+     for (const pipeline of this.pipelines) {
+       if (pipeline.canProcess(rawContent)) {
+         processed = await pipeline.process(rawContent, options, this.fileFetcher);
+         break;
+       }
+     }
+     if (!processed) {
        logger.warn(
-         `Unsupported content type "${initialContext.contentType}" for file ${filePath}. Skipping processing.`
+         `Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
        );
        return { document: void 0, links: [] };
      }
-     const finalContext = await pipeline.run(initialContext);
-     for (const err of finalContext.errors) {
+     for (const err of processed.errors) {
        logger.warn(`Processing error for ${filePath}: ${err.message}`);
      }
-     const finalContentString = typeof finalContext.content === "string" ? finalContext.content : Buffer.from(finalContext.content).toString("utf-8");
      return {
        document: {
-         // Use the potentially empty string content
-         content: finalContentString,
+         content: typeof processed.textContent === "string" ? processed.textContent : "",
          metadata: {
-           url: finalContext.source,
-           // Use context source (file:// URL)
-           // Ensure title is a string, default to "Untitled"
-           title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
+           url: rawContent.source,
+           title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
            library: options.library,
            version: options.version
          }
        }
-       // No links returned from file content processing
      };
    }
    async scrape(options, progressCallback, signal) {
-     await super.scrape(options, progressCallback, signal);
+     try {
+       await super.scrape(options, progressCallback, signal);
+     } finally {
+       await this.htmlPipeline.close();
+       await this.markdownPipeline.close();
+     }
    }
  }
  class NpmScraperStrategy {
@@ -1456,7 +1528,9 @@ class PipelineManager {
      };
      this.jobMap.set(jobId, job);
      this.jobQueue.push(jobId);
-     logger.info(`📝 Job enqueued: ${jobId} for ${library}@${version}`);
+     logger.info(
+       `📝 Job enqueued: ${jobId} for ${library}${version ? `@${version}` : ""}`
+     );
      await this.callbacks.onJobStatusChange?.(job);
      if (this.isRunning) {
        this._processQueue();
@@ -2758,7 +2832,7 @@ class DocumentStore {
     */
    async initializeEmbeddings() {
      const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
-     const { createEmbeddingModel } = await import("./EmbeddingFactory-DZKXkqOe.js");
+     const { createEmbeddingModel } = await import("./EmbeddingFactory-Dz1hdJJe.js");
      this.embeddings = createEmbeddingModel(modelSpec);
      const testVector = await this.embeddings.embedQuery("test");
      this.modelDimension = testVector.length;
@@ -2873,7 +2947,12 @@ class DocumentStore {
  `;
        return `${header}${doc.pageContent}`;
      });
-     const rawEmbeddings = await this.embeddings.embedDocuments(texts);
+     const rawEmbeddings = [];
+     for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
+       const batchTexts = texts.slice(i, i + EMBEDDING_BATCH_SIZE);
+       const batchEmbeddings = await this.embeddings.embedDocuments(batchTexts);
+       rawEmbeddings.push(...batchEmbeddings);
+     }
      const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
      const transaction = this.db.transaction((docs) => {
        for (let i = 0; i < docs.length; i++) {
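
The embedding call is now chunked into batches of EMBEDDING_BATCH_SIZE (100) texts instead of one request for the whole document set, keeping each request to the embedding provider bounded. A roughly equivalent standalone sketch (illustrative; embedDocuments stands in for this.embeddings.embedDocuments):

    const EMBEDDING_BATCH_SIZE = 100;
    async function embedInBatches(texts, embedDocuments) {
      const vectors = [];
      for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
        // At most 100 texts per request; results are concatenated in input order.
        const batch = texts.slice(i, i + EMBEDDING_BATCH_SIZE);
        vectors.push(...(await embedDocuments(batch)));
      }
      return vectors;
    }
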
@@ -3372,12 +3451,11 @@ class DocumentManagementService {
    }
  }
  export {
-   ContentProcessingPipeline as C,
    DocumentManagementService as D,
    FileFetcher as F,
    HttpFetcher as H,
    LibraryNotFoundError as L,
-   MarkdownMetadataExtractorMiddleware as M,
+   MarkdownPipeline as M,
    PipelineJobStatus as P,
    SearchTool as S,
    ToolError as T,
@@ -3392,18 +3470,14 @@ export {
    DEFAULT_HTTP_PORT as h,
    DEFAULT_MAX_CONCURRENCY as i,
    ScrapeMode as j,
-   HtmlPlaywrightMiddleware as k,
+   HtmlPipeline as k,
    logger as l,
-   HtmlCheerioParserMiddleware as m,
-   HtmlMetadataExtractorMiddleware as n,
-   HtmlSanitizerMiddleware as o,
-   HtmlToMarkdownMiddleware as p,
-   ScraperError as q,
-   createJSDOM as r,
-   setLogLevel as s,
-   getProjectRoot as t,
-   DEFAULT_WEB_PORT as u,
-   DimensionError as v,
-   VECTOR_DIMENSION as w
+   ScraperError as m,
+   createJSDOM as n,
+   getProjectRoot as o,
+   DEFAULT_WEB_PORT as p,
+   DimensionError as q,
+   VECTOR_DIMENSION as r,
+   setLogLevel as s
  };
- //# sourceMappingURL=DocumentManagementService-BupnR1eC.js.map
+ //# sourceMappingURL=DocumentManagementService-BZ_ZZgPI.js.map