@arabold/docs-mcp-server 1.34.0 → 1.35.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -33,6 +33,7 @@ import mime from "mime";
33
33
  import { HeaderGenerator } from "header-generator";
34
34
  import fs$1 from "node:fs/promises";
35
35
  import axios from "axios";
36
+ import { MarkItDown } from "markitdown-ts";
36
37
  import { VirtualConsole, JSDOM } from "jsdom";
37
38
  import psl from "psl";
38
39
  import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
@@ -945,6 +946,10 @@ const DEFAULT_CONFIG = {
945
946
  childLimit: 3,
946
947
  precedingSiblingsLimit: 1,
947
948
  subsequentSiblingsLimit: 2
949
+ },
950
+ document: {
951
+ maxSize: 10 * 1024 * 1024
952
+ // 10MB max size for PDF/Office documents
948
953
  }
949
954
  };
950
955
  const AppConfigSchema = z.object({
@@ -1018,7 +1023,10 @@ const AppConfigSchema = z.object({
1018
1023
  childLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.childLimit),
1019
1024
  precedingSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.precedingSiblingsLimit),
1020
1025
  subsequentSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.subsequentSiblingsLimit)
1021
- }).default(DEFAULT_CONFIG.assembly)
1026
+ }).default(DEFAULT_CONFIG.assembly),
1027
+ document: z.object({
1028
+ maxSize: z.coerce.number().int().default(DEFAULT_CONFIG.document.maxSize)
1029
+ }).default(DEFAULT_CONFIG.document)
1022
1030
  });
1023
1031
  const defaults = AppConfigSchema.parse({});
1024
1032
  const configMappings = [
@@ -2370,6 +2378,31 @@ class MimeTypeUtils {
2370
2378
  static isJson(mimeType) {
2371
2379
  return mimeType === "application/json" || mimeType === "text/json" || mimeType === "text/x-json";
2372
2380
  }
2381
+ /**
2382
+ * Checks if a MIME type represents PDF content.
2383
+ */
2384
+ static isPdf(mimeType) {
2385
+ return mimeType === "application/pdf";
2386
+ }
2387
+ /**
2388
+ * Checks if a MIME type represents an Office document (DOCX, XLSX, PPTX).
2389
+ */
2390
+ static isOfficeDocument(mimeType) {
2391
+ return mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation";
2392
+ }
2393
+ /**
2394
+ * Checks if a MIME type represents a Jupyter Notebook.
2395
+ */
2396
+ static isJupyterNotebook(mimeType) {
2397
+ return mimeType === "application/x-ipynb+json";
2398
+ }
2399
+ /**
2400
+ * Checks if a MIME type represents a document that can be processed
2401
+ * by the DocumentPipeline (PDF, Office docs, Jupyter notebooks).
2402
+ */
2403
+ static isSupportedDocument(mimeType) {
2404
+ return MimeTypeUtils.isPdf(mimeType) || MimeTypeUtils.isOfficeDocument(mimeType) || MimeTypeUtils.isJupyterNotebook(mimeType);
2405
+ }
2373
2406
  /**
2374
2407
  * Checks if a MIME type represents source code that should be wrapped in code blocks.
2375
2408
  */
@@ -3098,7 +3131,9 @@ function normalizeUrl(url, options = defaultNormalizerOptions) {
3098
3131
  try {
3099
3132
  const parsedUrl = new URL(url);
3100
3133
  const finalOptions = { ...defaultNormalizerOptions, ...options };
3101
- const normalized = new URL(parsedUrl.origin + parsedUrl.pathname);
3134
+ const normalized = new URL(url);
3135
+ normalized.search = "";
3136
+ normalized.hash = "";
3102
3137
  if (finalOptions.removeIndex) {
3103
3138
  normalized.pathname = normalized.pathname.replace(
3104
3139
  /\/index\.(html|htm|asp|php|jsp)$/i,
@@ -3110,13 +3145,13 @@ function normalizeUrl(url, options = defaultNormalizerOptions) {
3110
3145
  }
3111
3146
  const preservedHash = !finalOptions.removeHash ? parsedUrl.hash : "";
3112
3147
  const preservedSearch = !finalOptions.removeQuery ? parsedUrl.search : "";
3113
- let result = normalized.origin + normalized.pathname;
3114
- if (preservedSearch) {
3115
- result += preservedSearch;
3148
+ if (!finalOptions.removeQuery) {
3149
+ normalized.search = preservedSearch;
3116
3150
  }
3117
- if (preservedHash) {
3118
- result += preservedHash;
3151
+ if (!finalOptions.removeHash) {
3152
+ normalized.hash = preservedHash;
3119
3153
  }
3154
+ let result = normalized.href;
3120
3155
  if (finalOptions.ignoreCase) {
3121
3156
  result = result.toLowerCase();
3122
3157
  }
@@ -3790,6 +3825,181 @@ ${"```"}`;
3790
3825
  return window2.document;
3791
3826
  }
3792
3827
  }
/**
 * Shared base class for content pipelines. It declares the methods a
 * pipeline is expected to override and provides a helper for running a
 * middleware chain against a context object.
 */
class BasePipeline {
  /**
   * Reports whether this pipeline can handle the given MIME type.
   * The base implementation always throws; derived classes must override it.
   */
  canProcess(_mimeType, _content) {
    throw new Error("Method not implemented.");
  }

  /**
   * Runs the raw content through the pipeline.
   * The base implementation always rejects; derived classes must override it.
   */
  async process(_rawContent, _options, _fetcher) {
    throw new Error("Method not implemented.");
  }

  /**
   * Releases resources held by the pipeline.
   * Intentionally a no-op here; derived classes override it when needed.
   */
  async close() {}

  /**
   * Runs each middleware in order, handing every one a `next` callback that
   * advances to the following entry. Anything thrown along the way is caught
   * and appended to `context.errors` (wrapped in an Error when it is not one).
   *
   * @param middleware - The middleware stack to execute
   * @param context - The context to process
   */
  async executeMiddlewareStack(middleware, context) {
    // Highest position already entered; used to detect a repeated next() call.
    let lastEntered = -1;

    const runFrom = async (position) => {
      if (position <= lastEntered) {
        throw new Error("next() called multiple times");
      }
      lastEntered = position;

      const current = middleware[position];
      if (!current) {
        // Past the end of the stack (or a hole in it): nothing left to run.
        return;
      }
      await current.process(context, () => runFrom(position + 1));
    };

    try {
      await runFrom(0);
    } catch (failure) {
      const asError = failure instanceof Error ? failure : new Error(String(failure));
      context.errors.push(asError);
    }
  }
}
/**
 * Pipeline for binary document formats (PDF, DOCX, XLSX, PPTX, Jupyter
 * notebooks). The raw bytes are handed to MarkItDown for conversion to
 * Markdown, and the Markdown is then split into chunks.
 *
 * Failures never throw out of `process`; they are reported through the
 * `errors` array of the returned result.
 */
class DocumentPipeline extends BasePipeline {
  markitdown;
  splitter;
  maxSize;

  /**
   * @param config - Application config; reads `document.maxSize` and the
   *   `splitter.{minChunkSize,preferredChunkSize,maxChunkSize}` settings
   */
  constructor(config) {
    super();
    this.markitdown = new MarkItDown();
    this.maxSize = config.document.maxSize;
    const semanticSplitter = new SemanticMarkdownSplitter(
      config.splitter.preferredChunkSize,
      config.splitter.maxChunkSize
    );
    this.splitter = new GreedySplitter(
      semanticSplitter,
      config.splitter.minChunkSize,
      config.splitter.preferredChunkSize,
      config.splitter.maxChunkSize
    );
  }

  /**
   * @param mimeType - MIME type of the content
   * @returns true when the MIME type is a supported document type
   */
  canProcess(mimeType) {
    return MimeTypeUtils.isSupportedDocument(mimeType);
  }

  /**
   * Converts a document to Markdown and splits it into chunks.
   *
   * @param rawContent - Raw content; reads `content`, `source` and `mimeType`
   * @param _options - Unused
   * @returns A result object with `title`, `contentType`, `textContent`,
   *   `links`, `errors` and `chunks`. On failure, `textContent` is null,
   *   `chunks` is empty and `errors` describes the problem.
   */
  async process(rawContent, _options) {
    const buffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
    if (buffer.length > this.maxSize) {
      logger.warn(
        `Document exceeds size limit (${buffer.length} > ${this.maxSize}): ${rawContent.source}`
      );
      return this.emptyResult(rawContent, [
        new Error(`Document exceeds maximum size of ${this.maxSize} bytes`)
      ]);
    }
    const extension = this.resolveExtension(rawContent);
    if (!extension) {
      logger.warn(`Could not determine file extension: ${rawContent.source}`);
      return this.emptyResult(rawContent, [
        new Error("Could not determine file extension for document")
      ]);
    }
    try {
      const result = await this.markitdown.convertBuffer(buffer, {
        file_extension: `.${extension}`
      });
      if (!result?.markdown) {
        logger.warn(`No content extracted from document: ${rawContent.source}`);
        return this.emptyResult(rawContent, []);
      }
      const title = result.title || this.extractFilename(rawContent.source);
      let markdown = result.markdown;
      if (extension === "xlsx") {
        markdown = this.promoteTableHeaders(markdown);
      }
      const chunks = await this.splitter.splitText(markdown, "text/markdown");
      return {
        title,
        contentType: "text/markdown",
        // Output is always markdown
        textContent: markdown,
        links: [],
        // Documents don't have extractable links
        errors: [],
        chunks
      };
    } catch (error) {
      // Only the error name is surfaced; the message is deliberately left out
      // of both the log line and the returned error.
      const errorName = error instanceof Error ? error.name : "UnknownError";
      const safeMessage = `Failed to convert document: ${errorName}`;
      logger.warn(`${safeMessage} for ${rawContent.source}`);
      return this.emptyResult(rawContent, [new Error(safeMessage)]);
    }
  }

  /**
   * Builds the result returned when no Markdown was produced.
   *
   * @param rawContent - Raw content whose `mimeType` becomes `contentType`
   * @param errors - Errors to report (may be empty)
   */
  emptyResult(rawContent, errors) {
    return {
      title: null,
      contentType: rawContent.mimeType,
      textContent: null,
      links: [],
      errors,
      chunks: []
    };
  }

  /**
   * Chooses the file extension handed to the converter.
   *
   * A recognized document extension in the source URL/path wins. Otherwise
   * the extension is derived from the MIME type, so that documents served
   * from extension-less or misleading URLs (e.g. `/download?id=5`,
   * `/export.php`) can still be converted. As a last resort, whatever
   * extension the source carries is returned unchanged.
   *
   * @param rawContent - Raw content; reads `source` and `mimeType`
   * @returns Lower-case extension without the dot, or null if none is found
   */
  resolveExtension(rawContent) {
    const knownExtensions = ["pdf", "docx", "xlsx", "pptx", "ipynb"];
    const fromSource = this.extractExtension(rawContent.source);
    if (fromSource && knownExtensions.includes(fromSource)) {
      return fromSource;
    }
    return this.getExtensionFromMimeType(rawContent.mimeType) ?? fromSource;
  }

  /**
   * Maps a supported document MIME type to its file extension.
   * Parameters (e.g. `; charset=...`) and letter case are ignored.
   *
   * @param mimeType - MIME type string
   * @returns Extension without the dot, or null for unknown/non-string input
   */
  getExtensionFromMimeType(mimeType) {
    if (typeof mimeType !== "string") {
      return null;
    }
    const essence = mimeType.split(";")[0].trim().toLowerCase();
    switch (essence) {
      case "application/pdf":
        return "pdf";
      case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return "docx";
      case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        return "xlsx";
      case "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        return "pptx";
      case "application/x-ipynb+json":
        return "ipynb";
      default:
        return null;
    }
  }

  /**
   * Extracts the file extension from a URL or plain path.
   *
   * @param source - URL or path string
   * @returns Lower-case extension without the dot, or null if there is none
   */
  extractExtension(source) {
    if (typeof source !== "string") {
      return null;
    }
    try {
      const url = new URL(source);
      return this.getExtensionFromPath(url.pathname);
    } catch {
      // Not a parseable URL: treat the source as a plain path.
      return this.getExtensionFromPath(source);
    }
  }

  /**
   * Returns the text after the last dot of the final path segment.
   * A leading dot (as in `.gitignore`) does not count as an extension.
   *
   * @param pathStr - Path whose segments are separated by "/"
   * @returns Lower-case extension without the dot, or null if there is none
   */
  getExtensionFromPath(pathStr) {
    const lastSlash = pathStr.lastIndexOf("/");
    const filename = lastSlash >= 0 ? pathStr.substring(lastSlash + 1) : pathStr;
    const lastDot = filename.lastIndexOf(".");
    if (lastDot > 0) {
      return filename.substring(lastDot + 1).toLowerCase();
    }
    return null;
  }

  /**
   * Post-processes Markdown to fix empty table headers generated by sheet-to-html conversions.
   * Detects tables where the header row is empty and promotes the first data row to be the header.
   *
   * @param markdown - Markdown text to post-process
   * @returns The Markdown with empty table headers replaced
   */
  promoteTableHeaders(markdown) {
    // Matches: an all-empty header row, the separator row ($1), the first data row ($2).
    const emptyHeaderPattern = /^\|(?:\s*\|)+\s*$\r?\n^(\|(?:\s*:?-+:?\s*\|)+)\s*$\r?\n^(\|.*\|)\s*$/gm;
    return markdown.replace(emptyHeaderPattern, "$2\n$1");
  }

  /**
   * Extracts the final path segment of a URL or plain path for use as a
   * fallback title. For URL sources the segment is percent-decoded so the
   * title reads `My Report.pdf` rather than `My%20Report.pdf`.
   *
   * @param source - URL or path string
   * @returns The file name, or null when it is empty or the source is not a string
   */
  extractFilename(source) {
    if (typeof source !== "string") {
      return null;
    }
    let pathname;
    let isUrl = true;
    try {
      pathname = new URL(source).pathname;
    } catch {
      // Not a parseable URL: the source is used verbatim as a path.
      pathname = source;
      isUrl = false;
    }
    const filename = pathname.substring(pathname.lastIndexOf("/") + 1);
    if (!filename) {
      return null;
    }
    if (!isUrl) {
      return filename;
    }
    try {
      return decodeURIComponent(filename);
    } catch {
      // Malformed percent-escape: keep the encoded name rather than fail.
      return filename;
    }
  }
}
3793
4003
  class HtmlCheerioParserMiddleware {
3794
4004
  async process(context, next) {
3795
4005
  try {
@@ -5194,50 +5404,6 @@ function convertToString(content, charset) {
5194
5404
  }
5195
5405
  }
5196
5406
  }
5197
- class BasePipeline {
5198
- /**
5199
- * Determines if this pipeline can process content with the given MIME type.
5200
- * Must be implemented by derived classes.
5201
- */
5202
- canProcess(_mimeType, _content) {
5203
- throw new Error("Method not implemented.");
5204
- }
5205
- /**
5206
- * Processes the raw content through the pipeline.
5207
- * Must be implemented by derived classes.
5208
- */
5209
- async process(_rawContent, _options, _fetcher) {
5210
- throw new Error("Method not implemented.");
5211
- }
5212
- /**
5213
- * Cleanup resources used by this pipeline.
5214
- * Default implementation does nothing - override in derived classes as needed.
5215
- */
5216
- async close() {
5217
- }
5218
- /**
5219
- * Executes a middleware stack on the given context.
5220
- * This is a utility method used by derived pipeline classes.
5221
- *
5222
- * @param middleware - The middleware stack to execute
5223
- * @param context - The context to process
5224
- */
5225
- async executeMiddlewareStack(middleware, context) {
5226
- let index = -1;
5227
- const dispatch = async (i) => {
5228
- if (i <= index) throw new Error("next() called multiple times");
5229
- index = i;
5230
- const mw = middleware[i];
5231
- if (!mw) return;
5232
- await mw.process(context, dispatch.bind(null, i + 1));
5233
- };
5234
- try {
5235
- await dispatch(0);
5236
- } catch (error) {
5237
- context.errors.push(error instanceof Error ? error : new Error(String(error)));
5238
- }
5239
- }
5240
- }
5241
5407
  class HtmlPipeline extends BasePipeline {
5242
5408
  playwrightMiddleware;
5243
5409
  standardMiddleware;
@@ -7067,7 +7233,7 @@ class TextPipeline extends BasePipeline {
7067
7233
  let PipelineFactory$1 = class PipelineFactory {
7068
7234
  /**
7069
7235
  * Creates the standard set of content pipelines used by all scraper strategies.
7070
- * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
7236
+ * Includes HTML, Markdown, JSON, source code, document, and text processing capabilities.
7071
7237
  * Each pipeline now handles both preprocessing and content-specific splitting.
7072
7238
  * TextPipeline is placed last as the universal fallback for unknown content types.
7073
7239
  *
@@ -7077,6 +7243,8 @@ let PipelineFactory$1 = class PipelineFactory {
7077
7243
  return [
7078
7244
  new JsonPipeline(appConfig),
7079
7245
  new SourceCodePipeline(appConfig),
7246
+ new DocumentPipeline(appConfig),
7247
+ // PDF, Office docs, Jupyter notebooks
7080
7248
  new HtmlPipeline(appConfig),
7081
7249
  new MarkdownPipeline(appConfig),
7082
7250
  new TextPipeline(appConfig)
@@ -11246,7 +11414,7 @@ const Layout = ({
11246
11414
  children,
11247
11415
  eventClientConfig
11248
11416
  }) => {
11249
- const versionString = version || "1.34.0";
11417
+ const versionString = version || "1.35.0";
11250
11418
  const versionInitializer = `versionUpdate({ currentVersion: ${`'${versionString}'`} })`;
11251
11419
  return /* @__PURE__ */ jsxs("html", { lang: "en", children: [
11252
11420
  /* @__PURE__ */ jsxs("head", { children: [
@@ -12692,7 +12860,8 @@ function registerNewJobRoutes(server, scrapeTool, scraperConfig) {
12692
12860
  reply.type("text/html");
12693
12861
  try {
12694
12862
  let parsePatterns = function(input) {
12695
- if (!input) return void 0;
12863
+ if (input === void 0) return void 0;
12864
+ if (input.trim() === "") return [];
12696
12865
  return input.split(/\n|,/).map((s) => s.trim()).filter((s) => s.length > 0);
12697
12866
  }, parseHeaders2 = function(input) {
12698
12867
  if (!input) return void 0;
@@ -13044,7 +13213,7 @@ const LibrarySearchCard = ({ library }) => {
13044
13213
  ] });
13045
13214
  };
13046
13215
  const SearchResultItem = async ({ result }) => {
13047
- const isMarkdown = result.mimeType ? MimeTypeUtils.isMarkdown(result.mimeType) : true;
13216
+ const isMarkdown = result.mimeType ? MimeTypeUtils.isMarkdown(result.mimeType) || MimeTypeUtils.isSupportedDocument(result.mimeType) : true;
13048
13217
  const jsdom = createJSDOM("");
13049
13218
  const purifier = DOMPurify(jsdom.window);
13050
13219
  let contentElement;
@@ -13586,7 +13755,7 @@ class AppServer {
13586
13755
  try {
13587
13756
  if (telemetry.isEnabled()) {
13588
13757
  telemetry.setGlobalContext({
13589
- appVersion: "1.34.0",
13758
+ appVersion: "1.35.0",
13590
13759
  appPlatform: process.platform,
13591
13760
  appNodeVersion: process.version,
13592
13761
  appServicesEnabled: this.getActiveServicesList(),
@@ -14810,8 +14979,12 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
14810
14979
  ".tsv",
14811
14980
  ".log"
14812
14981
  ];
14982
+ const documentExtensions = [".pdf", ".docx", ".xlsx", ".pptx", ".ipynb"];
14813
14983
  const pathLower = path2.toLowerCase();
14814
14984
  const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext));
14985
+ const hasDocumentExtension = documentExtensions.some(
14986
+ (ext) => pathLower.endsWith(ext)
14987
+ );
14815
14988
  const hasCompoundExtension = pathLower.includes(".env.") || pathLower.endsWith(".env") || pathLower.includes(".config.") || pathLower.includes(".lock");
14816
14989
  const fileName = path2.split("/").pop() || "";
14817
14990
  const fileNameLower = fileName.toLowerCase();
@@ -14845,7 +15018,7 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
14845
15018
  }
14846
15019
  return fileNameLower === name || fileNameLower.startsWith(`${name}.`);
14847
15020
  });
14848
- if (hasTextExtension || hasCompoundExtension || isCommonTextFile) {
15021
+ if (hasTextExtension || hasDocumentExtension || hasCompoundExtension || isCommonTextFile) {
14849
15022
  return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
14850
15023
  }
14851
15024
  const mimeType = mime.getType(path2);
@@ -14982,7 +15155,23 @@ class LocalFileStrategy extends BaseScraperStrategy {
14982
15155
  }
14983
15156
  if (stats.isDirectory()) {
14984
15157
  const contents = await fs$1.readdir(filePath);
14985
- const links = contents.map((name) => `file://${path.join(filePath, name)}`).filter((url) => this.shouldProcessUrl(url, options));
15158
+ const links = contents.map((name) => {
15159
+ const url = new URL(`file://${path.join(filePath, name)}`);
15160
+ if (url.hostname !== "") {
15161
+ url.pathname = `/${url.hostname}${url.pathname}`;
15162
+ url.hostname = "";
15163
+ }
15164
+ return url.href;
15165
+ }).filter((url) => {
15166
+ const allowed = this.shouldProcessUrl(url, options);
15167
+ if (!allowed) {
15168
+ logger.debug(`Skipping out-of-scope link: ${url}`);
15169
+ }
15170
+ return allowed;
15171
+ });
15172
+ logger.debug(
15173
+ `Found ${links.length} files in ${filePath} (from ${contents.length} entries)`
15174
+ );
14986
15175
  return { url: item.url, links, status: FetchStatus.SUCCESS };
14987
15176
  }
14988
15177
  const rawContent = await this.fileFetcher.fetch(item.url, {
@@ -17216,7 +17405,7 @@ function createCli(argv) {
17216
17405
  let globalEventBus = null;
17217
17406
  let globalTelemetryService = null;
17218
17407
  const commandStartTimes = /* @__PURE__ */ new Map();
17219
- const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("1.34.0").option("verbose", {
17408
+ const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("1.35.0").option("verbose", {
17220
17409
  type: "boolean",
17221
17410
  description: "Enable verbose (debug) logging",
17222
17411
  default: false
@@ -17272,7 +17461,7 @@ function createCli(argv) {
17272
17461
  if (shouldEnableTelemetry() && telemetry.isEnabled()) {
17273
17462
  const commandName = argv2._[0]?.toString() || "default";
17274
17463
  telemetry.setGlobalContext({
17275
- appVersion: "1.34.0",
17464
+ appVersion: "1.35.0",
17276
17465
  appPlatform: process.platform,
17277
17466
  appNodeVersion: process.version,
17278
17467
  appInterface: "cli",