@just-every/mcp-read-website-fast 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,99 @@
1
#!/usr/bin/env node
/**
 * CLI entry point: fetch web pages and convert them to Markdown.
 *
 * Commands:
 *   fetch <url>   — crawl a URL (optionally recursively) and print Markdown/JSON
 *   clear-cache   — delete the on-disk cache directory
 *   serve         — start the MCP (Model Context Protocol) server
 */
import { Command } from 'commander';
import { CrawlQueue } from './crawler/queue.js';
import { readFileSync } from 'fs';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';

// Resolve package.json relative to this compiled file (dist/) so the
// reported version always matches the installed package.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const packageJson = JSON.parse(readFileSync(join(__dirname, '../package.json'), 'utf-8'));

const program = new Command();
program
    .name('mcp')
    .description('Markdown Content Preprocessor - Extract and convert web content to clean Markdown')
    .version(packageJson.version);

program
    .command('fetch <url>')
    .description('Fetch a URL and convert to Markdown')
    .option('-d, --depth <number>', 'Crawl depth (0 = single page)', '0')
    .option('-c, --concurrency <number>', 'Max concurrent requests', '3')
    .option('--no-robots', 'Ignore robots.txt')
    .option('--all-origins', 'Allow cross-origin crawling')
    .option('-u, --user-agent <string>', 'Custom user agent')
    .option('--cache-dir <path>', 'Cache directory', '.cache')
    .option('-t, --timeout <ms>', 'Request timeout in milliseconds', '30000')
    .option('-o, --output <format>', 'Output format: json, markdown, or both', 'markdown')
    .action(async (url, options) => {
        try {
            const crawlOptions = {
                depth: parseInt(options.depth, 10),
                maxConcurrency: parseInt(options.concurrency, 10),
                // commander maps --no-robots to options.robots === false
                respectRobots: options.robots,
                sameOriginOnly: !options.allOrigins,
                userAgent: options.userAgent,
                cacheDir: options.cacheDir,
                timeout: parseInt(options.timeout, 10)
            };
            const queue = new CrawlQueue(crawlOptions);
            await queue.init();
            // Status messages go to stderr so stdout stays clean for piping.
            console.error(`Fetching ${url}...`);
            const results = await queue.crawl(url);
            if (options.output === 'json') {
                console.log(JSON.stringify(results, null, 2));
            }
            else if (options.output === 'markdown') {
                results.forEach((result, index) => {
                    if (result.error) {
                        console.error(`Error for ${result.url}: ${result.error}`);
                    }
                    else if (result.markdown) {
                        console.log(result.markdown);
                        // Print the separator only BETWEEN documents; previously
                        // a dangling separator was also emitted after the last one.
                        if (results.length > 1 && index < results.length - 1) {
                            console.log('\n---\n');
                        }
                    }
                });
            }
            else if (options.output === 'both') {
                results.forEach(result => {
                    console.log(`\n## URL: ${result.url}\n`);
                    if (result.error) {
                        console.error(`Error: ${result.error}`);
                    }
                    else {
                        console.log(result.markdown);
                    }
                });
            }
            // Exit non-zero if any page failed so scripts can detect partial failures.
            const hasErrors = results.some(r => r.error);
            if (hasErrors) {
                process.exit(1);
            }
        }
        catch (error) {
            console.error('Error:', error instanceof Error ? error.message : error);
            process.exit(1);
        }
    });

program
    .command('clear-cache')
    .description('Clear the cache directory')
    .option('--cache-dir <path>', 'Cache directory', '.cache')
    .action(async (options) => {
        try {
            const { rm } = await import('fs/promises');
            // force: true — succeed even if the directory does not exist.
            await rm(options.cacheDir, { recursive: true, force: true });
            console.log(`Cache cleared: ${options.cacheDir}`);
        }
        catch (error) {
            console.error('Error clearing cache:', error);
            process.exit(1);
        }
    });

program
    .command('serve')
    .description('Run as an MCP server')
    .action(async () => {
        // serve.js starts the MCP server as a side effect of being imported.
        await import('./serve.js');
    });

program.parse();
@@ -0,0 +1,16 @@
1
/** Options controlling how a page (and optionally linked pages) is crawled. */
export interface FetchMarkdownOptions {
    /** Link-follow depth; 0 fetches only the given page. */
    depth?: number;
    /** Maximum number of requests in flight at once. */
    maxConcurrency?: number;
    /** Honor the target site's robots.txt. */
    respectRobots?: boolean;
    /** Restrict crawling to the starting page's origin. */
    sameOriginOnly?: boolean;
    /** Custom User-Agent header for outgoing requests. */
    userAgent?: string;
    /** Directory used for the on-disk response cache. */
    cacheDir?: string;
    /** Per-request timeout in milliseconds. */
    timeout?: number;
}
/** Result of fetching a page and converting it to Markdown. */
export interface FetchMarkdownResult {
    /** The page content rendered as Markdown (empty string on failure). */
    markdown: string;
    /** Extracted page title, when one was found. */
    title?: string;
    /** Absolute URLs of links discovered on the page. */
    links?: string[];
    /** Error message when fetching or conversion failed. */
    error?: string;
}
/** Fetch a URL and convert its main content to Markdown. */
export declare function fetchMarkdown(url: string, options?: FetchMarkdownOptions): Promise<FetchMarkdownResult>;
@@ -0,0 +1,36 @@
1
import { CrawlQueue } from '../crawler/queue.js';

/**
 * Fetch a single URL (plus linked pages when depth > 0) and return the
 * first crawl result as Markdown.
 *
 * Never throws: any failure is reported through the `error` field of the
 * returned object, with `markdown` set to the empty string.
 */
export async function fetchMarkdown(url, options = {}) {
    try {
        // Fill in crawler defaults for anything the caller left unset.
        const crawlerConfig = {
            depth: options.depth ?? 0,
            maxConcurrency: options.maxConcurrency ?? 3,
            respectRobots: options.respectRobots ?? true,
            sameOriginOnly: options.sameOriginOnly ?? true,
            userAgent: options.userAgent,
            cacheDir: options.cacheDir ?? '.cache',
            timeout: options.timeout ?? 30000
        };
        const crawler = new CrawlQueue(crawlerConfig);
        await crawler.init();
        // Only the first (root) result is surfaced through this API.
        const [first] = await crawler.crawl(url);
        if (!first) {
            return { markdown: '', error: 'No results returned' };
        }
        return {
            markdown: first.markdown,
            title: first.title,
            links: first.links,
            error: first.error
        };
    }
    catch (err) {
        return {
            markdown: '',
            error: err instanceof Error ? err.message : 'Unknown error'
        };
    }
}
@@ -0,0 +1,4 @@
1
import { JSDOM } from 'jsdom';
import { Article } from '../types.js';
/**
 * Extract the main article content from a parsed page, falling back to a
 * whole-body extraction when no article structure is detected.
 * NOTE(review): the declared `| null` may be unreachable — the fallback
 * path in readability.js always returns an Article; confirm before relying on it.
 */
export declare function extractArticle(dom: JSDOM): Article | null;
/** Heuristic check that the HTML carries substantive visible text. */
export declare function hasContent(html: string): boolean;
@@ -0,0 +1,115 @@
1
import { Readability } from '@mozilla/readability';

/**
 * Extract the main article from a parsed page.
 *
 * Mozilla Readability is attempted only when the page shows strong
 * "article" signals; when it is skipped, or produces too little content,
 * the whole-body manual extraction is used instead.
 */
export function extractArticle(dom) {
    const document = dom.window.document;
    const baseUrl = dom.window.location.href;

    // Strong signals that this is an article rather than an app/index page:
    // a long first paragraph inside <article>, schema.org article markup,
    // or an article:published_time meta tag.
    const firstArticleParagraph = document.querySelector('article p');
    const longArticleText = document.querySelector('article') !== null &&
        firstArticleParagraph?.textContent &&
        firstArticleParagraph.textContent.length > 200;
    const looksLikeArticle = longArticleText ||
        document.querySelector('[itemtype*="BlogPosting"]') !== null ||
        document.querySelector('[itemtype*="NewsArticle"]') !== null ||
        document.querySelector('meta[property="article:published_time"]') !== null;

    if (looksLikeArticle) {
        // Work on a clone so the original document is left untouched.
        const documentClone = document.cloneNode(true);
        const article = new Readability(documentClone).parse();
        // Require a substantial body before trusting Readability's output.
        if (article && article.content && article.content.trim().length > 500) {
            return {
                title: article.title || 'Untitled',
                content: article.content || '',
                textContent: article.textContent || '',
                length: article.length || 0,
                excerpt: article.excerpt || '',
                byline: article.byline || null,
                dir: article.dir || null,
                lang: article.lang || null,
                siteName: article.siteName || null,
                publishedTime: article.publishedTime || null,
                baseUrl
            };
        }
    }
    return extractContentManually(dom);
}
33
+ function extractContentManually(dom) {
34
+ try {
35
+ const document = dom.window.document;
36
+ const baseUrl = dom.window.location.href;
37
+ const title = document.querySelector('title')?.textContent ||
38
+ document.querySelector('h1')?.textContent ||
39
+ document.querySelector('meta[property="og:title"]')?.getAttribute('content') ||
40
+ document.querySelector('meta[name="title"]')?.getAttribute('content') ||
41
+ 'Untitled Page';
42
+ const byline = document.querySelector('meta[name="author"]')?.getAttribute('content') ||
43
+ document.querySelector('[rel="author"]')?.textContent ||
44
+ document.querySelector('.author')?.textContent ||
45
+ null;
46
+ if (!document.body) {
47
+ const html = document.documentElement?.innerHTML || '';
48
+ return {
49
+ title: title.trim(),
50
+ content: html,
51
+ byline,
52
+ excerpt: '',
53
+ dir: null,
54
+ lang: document.documentElement?.lang || null,
55
+ length: html.length,
56
+ siteName: null,
57
+ textContent: document.documentElement?.textContent || '',
58
+ publishedTime: null,
59
+ baseUrl
60
+ };
61
+ }
62
+ const contentClone = document.body.cloneNode(true);
63
+ const selectorsToRemove = [
64
+ 'script', 'style', 'noscript', 'template'
65
+ ];
66
+ selectorsToRemove.forEach(selector => {
67
+ try {
68
+ contentClone.querySelectorAll(selector).forEach(el => el.remove());
69
+ }
70
+ catch (e) {
71
+ }
72
+ });
73
+ const mainContent = contentClone;
74
+ const content = mainContent.innerHTML || mainContent.textContent || '';
75
+ return {
76
+ title: title.trim(),
77
+ content,
78
+ byline,
79
+ excerpt: '',
80
+ dir: null,
81
+ lang: document.documentElement?.lang || null,
82
+ length: content.length,
83
+ siteName: null,
84
+ textContent: mainContent.textContent || '',
85
+ publishedTime: null,
86
+ baseUrl
87
+ };
88
+ }
89
+ catch (error) {
90
+ console.error('Error in manual extraction:', error);
91
+ return {
92
+ title: 'Error extracting content',
93
+ content: dom.window.document.body?.innerHTML || dom.window.document.documentElement?.innerHTML || '',
94
+ byline: null,
95
+ excerpt: '',
96
+ dir: null,
97
+ lang: null,
98
+ length: 0,
99
+ siteName: null,
100
+ textContent: dom.window.document.body?.textContent || '',
101
+ publishedTime: null,
102
+ baseUrl: dom.window.location.href
103
+ };
104
+ }
105
+ }
106
/**
 * Quick heuristic: does this HTML carry real, readable content?
 *
 * Pages that ship a <noscript> tag without any <article>/<main> landmark
 * are treated as JavaScript-only shells and rejected outright; otherwise
 * the page passes when its tag-stripped text exceeds 100 characters.
 */
export function hasContent(html) {
    const normalized = html.toLowerCase();
    const isScriptOnlyShell = normalized.includes('<noscript>') &&
        !normalized.includes('<article') &&
        !normalized.includes('<main');
    if (isScriptOnlyShell) {
        return false;
    }
    // Naive tag strip is good enough for a length heuristic.
    const visibleText = html.replace(/<[^>]*>/g, '').trim();
    return visibleText.length > 100;
}
@@ -0,0 +1,3 @@
1
import { JSDOM } from 'jsdom';
/** Parse an HTML string into a JSDOM document, with fallback parse modes. */
export declare function htmlToDom(html: string, url: string): JSDOM;
/** Unique absolute URLs of all <a href> links in the document. */
export declare function extractLinks(dom: JSDOM): string[];
@@ -0,0 +1,53 @@
1
import { JSDOM } from 'jsdom';

/**
 * Parse an HTML string into a JSDOM instance.
 *
 * Tries progressively simpler configurations: the full option set first,
 * then a bare parse, and finally wraps the input in a skeleton document —
 * so a DOM is always returned (the last attempt may still throw only if
 * JSDOM itself cannot construct a document at all).
 */
export function htmlToDom(html, url) {
    try {
        return new JSDOM(html, {
            url,
            contentType: 'text/html',
            includeNodeLocations: false,
            runScripts: 'outside-only',
            resources: 'usable',
            pretendToBeVisual: true
        });
    }
    catch (error) {
        console.error('Error parsing HTML with JSDOM, trying with minimal options:', error);
    }
    try {
        return new JSDOM(html, { url, contentType: 'text/html' });
    }
    catch (fallbackError) {
        console.error('Fallback parsing also failed:', fallbackError);
        // Wrap the fragment in a minimal document skeleton as a last resort.
        return new JSDOM(`<!DOCTYPE html><html><body>${html}</body></html>`, {
            url,
            contentType: 'text/html'
        });
    }
}
30
+ export function extractLinks(dom) {
31
+ const document = dom.window.document;
32
+ const links = [];
33
+ const baseUrl = dom.window.location.href;
34
+ const anchorElements = document.querySelectorAll('a[href]');
35
+ anchorElements.forEach((element) => {
36
+ try {
37
+ const href = element.getAttribute('href');
38
+ if (!href)
39
+ return;
40
+ if (href.startsWith('mailto:') ||
41
+ href.startsWith('tel:') ||
42
+ href.startsWith('javascript:') ||
43
+ href.startsWith('#')) {
44
+ return;
45
+ }
46
+ const absoluteUrl = new URL(href, baseUrl).href;
47
+ links.push(absoluteUrl);
48
+ }
49
+ catch {
50
+ }
51
+ });
52
+ return [...new Set(links)];
53
+ }
@@ -0,0 +1,9 @@
1
import TurndownService from 'turndown';
/** Turndown converter preconfigured for GFM output with media/figure rules. */
export declare function createTurndownService(): TurndownService;
/** Convert an HTML string to cleaned-up Markdown. */
export declare function htmlToMarkdown(html: string): string;
/**
 * Render an extracted article (title, optional byline, HTML content) as a
 * complete Markdown document; relative URLs in the content are resolved
 * against baseUrl when it is provided.
 */
export declare function formatArticleMarkdown(article: {
    title: string;
    content: string;
    byline?: string | null;
    baseUrl?: string;
}): string;
@@ -0,0 +1,134 @@
1
import TurndownService from 'turndown';
import { gfm } from 'turndown-plugin-gfm';
import { JSDOM } from 'jsdom';

/**
 * Rewrite relative link/image URLs in an HTML fragment to absolute ones,
 * resolved against baseUrl. Returns the input unchanged if parsing fails.
 */
function convertRelativeUrls(html, baseUrl) {
    try {
        const dom = new JSDOM(html, { url: baseUrl });
        const document = dom.window.document;
        // Values that already carry a scheme (or a special scheme) are left alone;
        // unparseable values are also left as-is.
        const rewrite = (element, attribute, skipPrefixes) => {
            const value = element.getAttribute(attribute);
            if (!value || skipPrefixes.some(prefix => value.startsWith(prefix))) {
                return;
            }
            try {
                element.setAttribute(attribute, new URL(value, baseUrl).href);
            }
            catch (e) {
                // Unresolvable value — keep the original attribute.
            }
        };
        document.querySelectorAll('a[href]').forEach(link => {
            rewrite(link, 'href', ['http://', 'https://', '//', 'mailto:', 'tel:', 'javascript:', '#']);
        });
        document.querySelectorAll('img[src]').forEach(img => {
            rewrite(img, 'src', ['http://', 'https://', '//', 'data:']);
        });
        const bodyElement = document.body || document.documentElement;
        return bodyElement ? bodyElement.innerHTML : html;
    }
    catch (e) {
        return html;
    }
}
41
/**
 * Build a Turndown converter preconfigured for clean, GFM-flavored
 * Markdown output, with extra rules for embedded media and figures.
 */
export function createTurndownService() {
    // Block-level nodes keep paragraph spacing; inline nodes pass through.
    const spacedByBlock = (content, node) => {
        return node.isBlock ? '\n\n' + content + '\n\n' : content;
    };
    const service = new TurndownService({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        linkStyle: 'inlined',
        emDelimiter: '_',
        bulletListMarker: '-',
        strongDelimiter: '**',
        hr: '---',
        blankReplacement: (_content, node) => (node.isBlock ? '\n\n' : ''),
        keepReplacement: spacedByBlock,
        defaultReplacement: spacedByBlock
    });
    service.use(gfm);

    // Embedded players become plain links so their source URL is not lost.
    service.addRule('media', {
        filter: ['iframe', 'video', 'audio', 'embed'],
        replacement: (_content, node) => {
            const element = node;
            const src = element.getAttribute('src') || element.getAttribute('data-src');
            const title = element.getAttribute('title') || element.getAttribute('alt') || 'media';
            return src ? `\n\n[${title}](${src})\n\n` : '';
        }
    });

    // Figures keep their caption as an italic line under the content.
    service.addRule('figure', {
        filter: 'figure',
        replacement: (content, node) => {
            const figure = node;
            const caption = figure.querySelector('figcaption');
            if (!caption) {
                return `\n\n${content.trim()}\n\n`;
            }
            return `\n\n${content.trim()}\n*${caption.textContent || ''}*\n\n`;
        }
    });
    return service;
}
87
/**
 * One-shot HTML → Markdown conversion with whitespace cleanup:
 * collapses runs of 3+ newlines, strips trailing whitespace per line,
 * and trims the result.
 */
export function htmlToMarkdown(html) {
    const raw = createTurndownService().turndown(html);
    return raw
        .replace(/\n{3,}/g, '\n\n')
        .replace(/\s+$/gm, '')
        .trim();
}
96
/**
 * Render an extracted article as a Markdown document:
 * "# title", an optional "*By author*" byline, then the converted body.
 *
 * Conversion failures degrade to tag-stripped plain text; a fatal error
 * yields a "[Content extraction failed]" placeholder instead of throwing.
 */
export function formatArticleMarkdown(article) {
    try {
        const converter = createTurndownService();
        let output = '';
        if (article.title && article.title.trim()) {
            output = `# ${article.title}\n\n`;
        }
        if (article.byline) {
            output += `*By ${article.byline}*\n\n---\n\n`;
        }
        try {
            // Make links/images absolute before conversion when a base URL is known.
            const sourceHtml = article.baseUrl
                ? convertRelativeUrls(article.content, article.baseUrl)
                : article.content;
            output += converter.turndown(sourceHtml);
        }
        catch (conversionError) {
            console.error('Error converting HTML to markdown:', conversionError);
            // Degrade to plain text. A global `document` exists only in
            // browser-like environments; otherwise strip tags with a regex.
            const scratch = typeof document !== 'undefined'
                ? document.createElement('div')
                : null;
            if (scratch) {
                scratch.innerHTML = article.content;
                output += scratch.textContent || article.content;
            }
            else {
                output += article.content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ');
            }
        }
        return output
            .replace(/\n{3,}/g, '\n\n')
            .replace(/\s+$/gm, '')
            .trim();
    }
    catch (error) {
        console.error('Fatal error in formatArticleMarkdown:', error);
        return article.title ? `# ${article.title}\n\n[Content extraction failed]` : '[Content extraction failed]';
    }
}
@@ -0,0 +1,2 @@
1
#!/usr/bin/env node
// serve.ts has no exports of its own (the MCP server starts as an import
// side effect — see dist/serve.js); the empty export marks this file as
// an ES module.
export {};
package/dist/serve.js ADDED
@@ -0,0 +1,171 @@
1
#!/usr/bin/env node
/**
 * MCP (Model Context Protocol) server over stdio.
 *
 * Exposes one tool, read_website_fast (fetch a URL and return Markdown),
 * and two resources: a cache-status report and a cache-clear action.
 */
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
import { readFileSync } from "fs";
import { fileURLToPath } from "url";
import { dirname, join } from "path";
// Modules loaded lazily on first use so server startup stays fast.
let fetchMarkdownModule;
let fsPromises;
let pathModule;
// Derive the advertised version from package.json (same technique as
// dist/index.js) instead of hard-coding it, so it cannot drift from the
// published package version; fall back if package.json is unreadable.
let packageVersion = "0.1.0";
try {
    const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "../package.json");
    packageVersion = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
}
catch {
    // Keep the fallback version string.
}
const server = new Server({
    name: "read-website-fast",
    version: packageVersion,
}, {
    capabilities: {
        tools: {},
        resources: {},
    },
});
const READ_WEBSITE_TOOL = {
    name: "read_website_fast",
    description: "Quickly reads webpages and converts to markdown for fast, token efficient web scraping",
    inputSchema: {
        type: "object",
        properties: {
            url: {
                type: "string",
                description: "HTTP/HTTPS URL to fetch and convert to markdown",
            },
            depth: {
                type: "number",
                description: "Crawl depth (0 = single page)",
                default: 0,
            },
            respectRobots: {
                type: "boolean",
                description: "Whether to respect robots.txt",
                default: true,
            },
        },
        required: ["url"],
    },
};
const RESOURCES = [
    {
        uri: "read-website-fast://status",
        name: "Cache Status",
        mimeType: "application/json",
        description: "Get cache status information",
    },
    {
        uri: "read-website-fast://clear-cache",
        name: "Clear Cache",
        mimeType: "application/json",
        description: "Clear the cache directory",
    },
];
server.setRequestHandler(ListToolsRequestSchema, async () => ({
    tools: [READ_WEBSITE_TOOL],
}));
// Tool dispatch: only read_website_fast is supported.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
    if (request.params.name !== "read_website_fast") {
        throw new Error(`Unknown tool: ${request.params.name}`);
    }
    if (!fetchMarkdownModule) {
        fetchMarkdownModule = await import("./internal/fetchMarkdown.js");
    }
    const args = request.params.arguments;
    const result = await fetchMarkdownModule.fetchMarkdown(args.url, {
        depth: args.depth ?? 0,
        respectRobots: args.respectRobots ?? true,
    });
    // fetchMarkdown reports failures via result.error rather than throwing;
    // surface them to the MCP client as a tool error.
    if (result.error) {
        throw new Error(result.error);
    }
    return {
        content: [{ type: "text", text: result.markdown }],
    };
});
server.setRequestHandler(ListResourcesRequestSchema, async () => ({
    resources: RESOURCES,
}));
server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
    const uri = request.params.uri;
    if (!fsPromises) {
        fsPromises = await import("fs/promises");
    }
    if (!pathModule) {
        pathModule = await import("path");
    }
    if (uri === "read-website-fast://status") {
        try {
            // Sum the sizes of all top-level cache files; a missing directory
            // or file is treated as an empty cache rather than an error.
            const cacheDir = ".cache";
            const files = await fsPromises.readdir(cacheDir).catch(() => []);
            let totalSize = 0;
            for (const file of files) {
                const stats = await fsPromises
                    .stat(pathModule.join(cacheDir, file))
                    .catch(() => null);
                if (stats) {
                    totalSize += stats.size;
                }
            }
            return {
                contents: [
                    {
                        uri,
                        mimeType: "application/json",
                        text: JSON.stringify({
                            cacheSize: totalSize,
                            cacheFiles: files.length,
                            cacheSizeFormatted: `${(totalSize / 1024 / 1024).toFixed(2)} MB`,
                        }, null, 2),
                    },
                ],
            };
        }
        catch (error) {
            return {
                contents: [
                    {
                        uri,
                        mimeType: "application/json",
                        text: JSON.stringify({
                            error: "Failed to get cache status",
                            message: error instanceof Error ? error.message : "Unknown error",
                        }, null, 2),
                    },
                ],
            };
        }
    }
    if (uri === "read-website-fast://clear-cache") {
        try {
            await fsPromises.rm(".cache", { recursive: true, force: true });
            return {
                contents: [
                    {
                        uri,
                        mimeType: "application/json",
                        text: JSON.stringify({
                            status: "success",
                            message: "Cache cleared successfully",
                        }, null, 2),
                    },
                ],
            };
        }
        catch (error) {
            return {
                contents: [
                    {
                        uri,
                        mimeType: "application/json",
                        text: JSON.stringify({
                            status: "error",
                            message: error instanceof Error ? error.message : "Failed to clear cache",
                        }, null, 2),
                    },
                ],
            };
        }
    }
    throw new Error(`Unknown resource: ${uri}`);
});
async function runServer() {
    const transport = new StdioServerTransport();
    await server.connect(transport);
    // Log to stderr — stdout carries the MCP stdio protocol.
    console.error("read-website-fast MCP server running");
}
runServer().catch((error) => {
    console.error("Server error:", error);
    process.exit(1);
});
@@ -0,0 +1,26 @@
1
/** Options controlling how a Markdown document is split into chunks. */
export interface ChunkOptions {
    /** Soft cap on estimated tokens per chunk. */
    maxTokens?: number;
    /** Soft cap on characters per chunk. */
    maxChars?: number;
    /** Boundary granularity used when splitting. */
    splitOn?: 'heading' | 'paragraph' | 'sentence';
    /** Overlap carried between adjacent chunks — presumably in lines (cf. getOverlapLines); confirm against the implementation. */
    overlap?: number;
}
/** One piece of a split Markdown document. */
export interface Chunk {
    /** The chunk's Markdown text. */
    content: string;
    /** Position of the chunk within the document. */
    index: number;
    /** Estimated token count, when computed. */
    tokens?: number;
    /** Optional provenance information for the chunk. */
    metadata?: {
        /** Headings associated with this chunk. */
        headings?: string[];
        /** First source line covered by the chunk. */
        startLine?: number;
        /** Last source line covered by the chunk. */
        endLine?: number;
    };
}
/** Splits Markdown into size-bounded chunks at heading/paragraph/sentence boundaries. */
export declare class MarkdownChunker {
    private options;
    constructor(options?: ChunkOptions);
    /** Split the given Markdown into chunks per the configured options. */
    chunk(markdown: string): Chunk[];
    private chunkByHeading;
    private chunkByParagraph;
    private chunkBySentence;
    private getOverlapLines;
    /** Rough token-count estimate used for sizing chunks. */
    estimateTokens(text: string): number;
}