npm - @just-every/mcp-read-website-fast - Versions diffs - 0.1.11 → 0.1.13 - Mend

@just-every/mcp-read-website-fast 0.1.11 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/README.md +2 -0
package/dist/index.js +2 -4
package/dist/internal/fetchMarkdown.js +2 -4
package/package.json +3 -9
package/dist/cache/disk.d.ts +0 -12
package/dist/cache/disk.js +0 -54
package/dist/cache/normalize.d.ts +0 -2
package/dist/cache/normalize.js +0 -31
package/dist/crawler/fetch.d.ts +0 -8
package/dist/crawler/fetch.js +0 -43
package/dist/crawler/queue.d.ts +0 -14
package/dist/crawler/queue.js +0 -148
package/dist/crawler/robots.d.ts +0 -8
package/dist/crawler/robots.js +0 -47
package/dist/parser/article.d.ts +0 -4
package/dist/parser/article.js +0 -125
package/dist/parser/dom.d.ts +0 -3
package/dist/parser/dom.js +0 -60
package/dist/parser/markdown.d.ts +0 -9
package/dist/parser/markdown.js +0 -147

package/README.md CHANGED Viewed

@@ -11,6 +11,8 @@ Existing MCP web crawlers are slow and consume large quantities of tokens. This
 This MCP package fetches web pages locally, strips noise, and converts content to clean Markdown while preserving links. Designed for Claude Code, IDEs and LLM pipelines with minimal token footprint. Crawl sites locally with minimal dependencies.
+**Note:** This package now uses [@just-every/crawl](https://www.npmjs.com/package/@just-every/crawl) for its core crawling and markdown conversion functionality.
 ## Features
 - **Fast startup** using official MCP SDK with lazy loading for optimal performance

package/dist/index.js CHANGED Viewed

@@ -1,6 +1,6 @@
 #!/usr/bin/env node
 import { Command } from 'commander';
-import { CrawlQueue } from './crawler/queue.js';
+import { fetch } from '@just-every/crawl';
 import { readFileSync } from 'fs';
 import { fileURLToPath } from 'url';
 import { dirname, join } from 'path';
@@ -34,10 +34,8 @@ program
             cacheDir: options.cacheDir,
             timeout: parseInt(options.timeout, 10),
         };
-        const queue = new CrawlQueue(crawlOptions);
-        await queue.init();
         console.error(`Fetching ${url}...`);
-        const results = await queue.crawl(url);
+        const results = await fetch(url, crawlOptions);
         if (options.output === 'json') {
             console.log(JSON.stringify(results, null, 2));
         }

package/dist/internal/fetchMarkdown.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { CrawlQueue } from '../crawler/queue.js';
+import { fetch } from '@just-every/crawl';
 export async function fetchMarkdown(url, options = {}) {
     try {
         const crawlOptions = {
@@ -10,9 +10,7 @@ export async function fetchMarkdown(url, options = {}) {
             cacheDir: options.cacheDir ?? '.cache',
             timeout: options.timeout ?? 30000,
         };
-        const queue = new CrawlQueue(crawlOptions);
-        await queue.init();
-        const results = await queue.crawl(url);
+        const results = await fetch(url, crawlOptions);
         const mainResult = results[0];
         if (!mainResult) {
             return {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@just-every/mcp-read-website-fast",
-  "version": "0.1.11",
+  "version": "0.1.13",
   "description": "Markdown Content Preprocessor - Fetch web pages, extract content, convert to clean Markdown",
   "main": "dist/index.js",
   "bin": {
@@ -50,15 +50,9 @@
   "homepage": "https://github.com/just-every/mcp-read-website-fast#readme",
   "license": "MIT",
   "dependencies": {
+    "@just-every/crawl": "^1.0.2",
     "@modelcontextprotocol/sdk": "^1.12.3",
-    "@mozilla/readability": "^0.6.0",
-    "commander": "^14.0.0",
-    "jsdom": "^26.1.0",
-    "p-limit": "^6.2.0",
-    "robots-parser": "^3.0.1",
-    "turndown": "^7.1.3",
-    "turndown-plugin-gfm": "^1.0.2",
-    "undici": "^7.10.0"
+    "commander": "^14.0.0"
   },
   "devDependencies": {
     "@types/jsdom": "^21.1.6",

package/dist/cache/disk.d.ts DELETED Viewed

@@ -1,12 +0,0 @@
-import { CacheEntry } from '../types.js';
-export declare class DiskCache {
-    private cacheDir;
-    constructor(cacheDir?: string);
-    init(): Promise<void>;
-    private getCacheKey;
-    private getCachePath;
-    has(url: string): Promise<boolean>;
-    get(url: string): Promise<CacheEntry | null>;
-    put(url: string, markdown: string, title?: string): Promise<void>;
-    getAge(url: string): Promise<number | null>;
-}

package/dist/cache/disk.js DELETED Viewed

@@ -1,54 +0,0 @@
-import { createHash } from 'crypto';
-import { mkdir, readFile, writeFile, access } from 'fs/promises';
-import { join } from 'path';
-export class DiskCache {
-    cacheDir;
-    constructor(cacheDir = '.cache') {
-        this.cacheDir = cacheDir;
-    }
-    async init() {
-        await mkdir(this.cacheDir, { recursive: true });
-    }
-    getCacheKey(url) {
-        return createHash('sha256').update(url).digest('hex');
-    }
-    getCachePath(url) {
-        const key = this.getCacheKey(url);
-        return join(this.cacheDir, `${key}.json`);
-    }
-    async has(url) {
-        try {
-            await access(this.getCachePath(url));
-            return true;
-        }
-        catch {
-            return false;
-        }
-    }
-    async get(url) {
-        try {
-            const path = this.getCachePath(url);
-            const data = await readFile(path, 'utf-8');
-            return JSON.parse(data);
-        }
-        catch {
-            return null;
-        }
-    }
-    async put(url, markdown, title) {
-        const entry = {
-            url,
-            markdown,
-            timestamp: Date.now(),
-            title,
-        };
-        const path = this.getCachePath(url);
-        await writeFile(path, JSON.stringify(entry, null, 2));
-    }
-    async getAge(url) {
-        const entry = await this.get(url);
-        if (!entry)
-            return null;
-        return Date.now() - entry.timestamp;
-    }
-}

package/dist/cache/normalize.d.ts DELETED Viewed

@@ -1,2 +0,0 @@
-export declare function normalizeUrl(url: string): string;
-export declare function isSameOrigin(url1: string, url2: string): boolean;

package/dist/cache/normalize.js DELETED Viewed

@@ -1,31 +0,0 @@
-export function normalizeUrl(url) {
-    try {
-        const parsed = new URL(url);
-        if (parsed.pathname !== '/' && parsed.pathname.endsWith('/')) {
-            parsed.pathname = parsed.pathname.slice(0, -1);
-        }
-        const params = Array.from(parsed.searchParams.entries());
-        params.sort(([a], [b]) => a.localeCompare(b));
-        parsed.search = '';
-        params.forEach(([key, value]) => parsed.searchParams.append(key, value));
-        if ((parsed.protocol === 'http:' && parsed.port === '80') ||
-            (parsed.protocol === 'https:' && parsed.port === '443')) {
-            parsed.port = '';
-        }
-        parsed.hash = '';
-        return parsed.href;
-    }
-    catch {
-        return url;
-    }
-}
-export function isSameOrigin(url1, url2) {
-    try {
-        const u1 = new URL(url1);
-        const u2 = new URL(url2);
-        return u1.origin === u2.origin;
-    }
-    catch {
-        return false;
-    }
-}

package/dist/crawler/fetch.d.ts DELETED Viewed

@@ -1,8 +0,0 @@
-interface FetchOptions {
-    userAgent?: string;
-    timeout?: number;
-    maxRedirections?: number;
-}
-export declare function fetchStream(url: string, options?: FetchOptions): Promise<string>;
-export declare function isValidUrl(url: string): boolean;
-export {};

package/dist/crawler/fetch.js DELETED Viewed

@@ -1,43 +0,0 @@
-import { fetch } from 'undici';
-export async function fetchStream(url, options = {}) {
-    const { userAgent = 'MCP/0.1 (+https://github.com/just-every/mcp-read-website-fast)', timeout = 30000, maxRedirections = 5, } = options;
-    try {
-        const response = await fetch(url, {
-            headers: {
-                'User-Agent': userAgent,
-                Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-                'Accept-Language': 'en-US,en;q=0.5',
-                DNT: '1',
-                Connection: 'keep-alive',
-                'Upgrade-Insecure-Requests': '1',
-            },
-            redirect: maxRedirections > 0 ? 'follow' : 'manual',
-            signal: AbortSignal.timeout(timeout),
-        });
-        if (!response.ok) {
-            throw new Error(`HTTP ${response.status} for ${url}`);
-        }
-        const contentType = response.headers.get('content-type');
-        if (contentType &&
-            !contentType.includes('text/html') &&
-            !contentType.includes('application/xhtml+xml')) {
-            throw new Error(`Non-HTML content type: ${contentType} for ${url}`);
-        }
-        return await response.text();
-    }
-    catch (error) {
-        if (error instanceof Error) {
-            throw new Error(`Failed to fetch ${url}: ${error.message}`);
-        }
-        throw error;
-    }
-}
-export function isValidUrl(url) {
-    try {
-        const parsed = new URL(url);
-        return parsed.protocol === 'http:' || parsed.protocol === 'https:';
-    }
-    catch {
-        return false;
-    }
-}

package/dist/crawler/queue.d.ts DELETED Viewed

@@ -1,14 +0,0 @@
-import { CrawlOptions, CrawlResult } from '../types.js';
-export declare class CrawlQueue {
-    private visited;
-    private queue;
-    private limit;
-    private cache;
-    private options;
-    private results;
-    constructor(options?: CrawlOptions);
-    init(): Promise<void>;
-    crawl(startUrl: string): Promise<CrawlResult[]>;
-    private processQueue;
-    private processUrl;
-}

package/dist/crawler/queue.js DELETED Viewed

@@ -1,148 +0,0 @@
-import pLimit from 'p-limit';
-import { normalizeUrl, isSameOrigin } from '../cache/normalize.js';
-import { DiskCache } from '../cache/disk.js';
-import { fetchStream, isValidUrl } from './fetch.js';
-import { isAllowedByRobots, getCrawlDelay } from './robots.js';
-import { htmlToDom, extractLinks } from '../parser/dom.js';
-import { extractArticle } from '../parser/article.js';
-import { formatArticleMarkdown } from '../parser/markdown.js';
-export class CrawlQueue {
-    visited = new Set();
-    queue = [];
-    limit;
-    cache;
-    options;
-    results = [];
-    constructor(options = {}) {
-        this.options = {
-            depth: options.depth ?? 0,
-            maxConcurrency: options.maxConcurrency ?? 3,
-            respectRobots: options.respectRobots ?? true,
-            sameOriginOnly: options.sameOriginOnly ?? true,
-            userAgent: options.userAgent ?? 'MCP/0.1',
-            cacheDir: options.cacheDir ?? '.cache',
-            timeout: options.timeout ?? 30000,
-        };
-        this.limit = pLimit(this.options.maxConcurrency);
-        this.cache = new DiskCache(this.options.cacheDir);
-    }
-    async init() {
-        await this.cache.init();
-    }
-    async crawl(startUrl) {
-        const normalizedUrl = normalizeUrl(startUrl);
-        if (!isValidUrl(normalizedUrl)) {
-            throw new Error(`Invalid URL: ${startUrl}`);
-        }
-        this.queue.push(normalizedUrl);
-        await this.processQueue(0);
-        return this.results;
-    }
-    async processQueue(currentDepth) {
-        if (currentDepth > this.options.depth)
-            return;
-        const urls = [...this.queue];
-        this.queue = [];
-        const tasks = urls.map(url => this.limit(() => this.processUrl(url, currentDepth)));
-        await Promise.all(tasks);
-        if (this.queue.length > 0) {
-            await this.processQueue(currentDepth + 1);
-        }
-    }
-    async processUrl(url, depth) {
-        const normalizedUrl = normalizeUrl(url);
-        if (this.visited.has(normalizedUrl))
-            return;
-        this.visited.add(normalizedUrl);
-        try {
-            const cached = await this.cache.get(normalizedUrl);
-            if (cached) {
-                this.results.push({
-                    url: normalizedUrl,
-                    markdown: cached.markdown,
-                    title: cached.title,
-                });
-                return;
-            }
-            if (this.options.respectRobots) {
-                const allowed = await isAllowedByRobots(normalizedUrl, this.options.userAgent);
-                if (!allowed) {
-                    this.results.push({
-                        url: normalizedUrl,
-                        markdown: '',
-                        error: 'Blocked by robots.txt',
-                    });
-                    return;
-                }
-                const delay = await getCrawlDelay(normalizedUrl, this.options.userAgent);
-                if (delay > 0) {
-                    await new Promise(resolve => setTimeout(resolve, delay * 1000));
-                }
-            }
-            const html = await fetchStream(normalizedUrl, {
-                userAgent: this.options.userAgent,
-                timeout: this.options.timeout,
-            });
-            if (!html || html.trim().length === 0) {
-                this.results.push({
-                    url: normalizedUrl,
-                    markdown: '',
-                    error: 'Empty response from server',
-                });
-                return;
-            }
-            const dom = htmlToDom(html, normalizedUrl);
-            const article = extractArticle(dom);
-            if (!article) {
-                this.results.push({
-                    url: normalizedUrl,
-                    markdown: '',
-                    error: 'Failed to extract article content',
-                });
-                return;
-            }
-            if (!article.content || article.content.trim().length < 50) {
-                const fallbackMarkdown = `# ${article.title || 'Page Content'}\n\n` +
-                    `*Note: This page appears to be JavaScript-rendered. Limited content extracted.*\n\n` +
-                    (article.textContent
-                        ? article.textContent.substring(0, 1000) + '...'
-                        : 'No text content available');
-                this.results.push({
-                    url: normalizedUrl,
-                    markdown: fallbackMarkdown,
-                    title: article.title || normalizedUrl,
-                    error: 'Limited content extracted (JavaScript-rendered page)',
-                });
-                return;
-            }
-            const markdown = formatArticleMarkdown(article);
-            await this.cache.put(normalizedUrl, markdown, article.title);
-            let links = [];
-            if (depth < this.options.depth) {
-                links = extractLinks(dom);
-                if (this.options.sameOriginOnly) {
-                    links = links.filter(link => isSameOrigin(normalizedUrl, link));
-                }
-                links.forEach(link => {
-                    const normalized = normalizeUrl(link);
-                    if (!this.visited.has(normalized)) {
-                        this.queue.push(normalized);
-                    }
-                });
-            }
-            this.results.push({
-                url: normalizedUrl,
-                markdown,
-                title: article.title,
-                links: links.length > 0 ? links : undefined,
-            });
-        }
-        catch (error) {
-            this.results.push({
-                url: normalizedUrl,
-                markdown: '',
-                error: error instanceof Error ? error.message : 'Unknown error',
-            });
-        }
-    }
-}

package/dist/crawler/robots.d.ts DELETED Viewed

@@ -1,8 +0,0 @@
-interface RobotsChecker {
-    isAllowed(url: string, userAgent?: string): boolean;
-    getCrawlDelay(userAgent?: string): number | undefined;
-}
-export declare function getRobotsChecker(origin: string, userAgent?: string): Promise<RobotsChecker>;
-export declare function isAllowedByRobots(url: string, userAgent?: string): Promise<boolean>;
-export declare function getCrawlDelay(url: string, userAgent?: string): Promise<number>;
-export {};

package/dist/crawler/robots.js DELETED Viewed

@@ -1,47 +0,0 @@
-import { fetchStream } from './fetch.js';
-const robotsCache = new Map();
-export async function getRobotsChecker(origin, userAgent = '*') {
-    const cached = robotsCache.get(origin);
-    if (cached)
-        return cached;
-    try {
-        const robotsUrl = new URL('/robots.txt', origin).href;
-        const robotsTxt = await fetchStream(robotsUrl, {
-            timeout: 5000,
-            userAgent,
-        });
-        const robotsParserModule = (await import('robots-parser'));
-        const robotsParser = robotsParserModule.default || robotsParserModule;
-        const robots = robotsParser(robotsUrl, robotsTxt);
-        robotsCache.set(origin, robots);
-        return robots;
-    }
-    catch {
-        const permissive = {
-            isAllowed: () => true,
-            getCrawlDelay: () => undefined,
-        };
-        robotsCache.set(origin, permissive);
-        return permissive;
-    }
-}
-export async function isAllowedByRobots(url, userAgent = '*') {
-    try {
-        const { origin } = new URL(url);
-        const checker = await getRobotsChecker(origin, userAgent);
-        return checker.isAllowed(url, userAgent);
-    }
-    catch {
-        return true;
-    }
-}
-export async function getCrawlDelay(url, userAgent = '*') {
-    try {
-        const { origin } = new URL(url);
-        const checker = await getRobotsChecker(origin, userAgent);
-        return checker.getCrawlDelay(userAgent) || 0;
-    }
-    catch {
-        return 0;
-    }
-}

package/dist/parser/article.d.ts DELETED Viewed

@@ -1,4 +0,0 @@
-import { JSDOM } from 'jsdom';
-import { Article } from '../types.js';
-export declare function extractArticle(dom: JSDOM): Article | null;
-export declare function hasContent(html: string): boolean;

package/dist/parser/article.js DELETED Viewed

@@ -1,125 +0,0 @@
-import { Readability } from '@mozilla/readability';
-export function extractArticle(dom) {
-    const document = dom.window.document;
-    const baseUrl = dom.window.location.href;
-    const articleParagraph = document.querySelector('article p');
-    const hasStrongArticleIndicators = (document.querySelector('article') !== null &&
-        articleParagraph?.textContent &&
-        articleParagraph.textContent.length > 200) ||
-        document.querySelector('[itemtype*="BlogPosting"]') !== null ||
-        document.querySelector('[itemtype*="NewsArticle"]') !== null ||
-        document.querySelector('meta[property="article:published_time"]') !==
-            null;
-    if (hasStrongArticleIndicators) {
-        const documentClone = document.cloneNode(true);
-        const reader = new Readability(documentClone);
-        const article = reader.parse();
-        if (article && article.content && article.content.trim().length > 500) {
-            return {
-                title: article.title || 'Untitled',
-                content: article.content || '',
-                textContent: article.textContent || '',
-                length: article.length || 0,
-                excerpt: article.excerpt || '',
-                byline: article.byline || null,
-                dir: article.dir || null,
-                lang: article.lang || null,
-                siteName: article.siteName || null,
-                publishedTime: article.publishedTime || null,
-                baseUrl,
-            };
-        }
-    }
-    return extractContentManually(dom);
-}
-function extractContentManually(dom) {
-    try {
-        const document = dom.window.document;
-        const baseUrl = dom.window.location.href;
-        const title = document.querySelector('title')?.textContent ||
-            document.querySelector('h1')?.textContent ||
-            document
-                .querySelector('meta[property="og:title"]')
-                ?.getAttribute('content') ||
-            document
-                .querySelector('meta[name="title"]')
-                ?.getAttribute('content') ||
-            'Untitled Page';
-        const byline = document
-            .querySelector('meta[name="author"]')
-            ?.getAttribute('content') ||
-            document.querySelector('[rel="author"]')?.textContent ||
-            document.querySelector('.author')?.textContent ||
-            null;
-        if (!document.body) {
-            const html = document.documentElement?.innerHTML || '';
-            return {
-                title: title.trim(),
-                content: html,
-                byline,
-                excerpt: '',
-                dir: null,
-                lang: document.documentElement?.lang || null,
-                length: html.length,
-                siteName: null,
-                textContent: document.documentElement?.textContent || '',
-                publishedTime: null,
-                baseUrl,
-            };
-        }
-        const contentClone = document.body.cloneNode(true);
-        const selectorsToRemove = ['script', 'style', 'noscript', 'template'];
-        selectorsToRemove.forEach(selector => {
-            try {
-                contentClone
-                    .querySelectorAll(selector)
-                    .forEach(el => el.remove());
-            }
-            catch {
-            }
-        });
-        const mainContent = contentClone;
-        const content = mainContent.innerHTML || mainContent.textContent || '';
-        return {
-            title: title.trim(),
-            content,
-            byline,
-            excerpt: '',
-            dir: null,
-            lang: document.documentElement?.lang || null,
-            length: content.length,
-            siteName: null,
-            textContent: mainContent.textContent || '',
-            publishedTime: null,
-            baseUrl,
-        };
-    }
-    catch (error) {
-        console.error('Error in manual extraction:', error);
-        return {
-            title: 'Error extracting content',
-            content: dom.window.document.body?.innerHTML ||
-                dom.window.document.documentElement?.innerHTML ||
-                '',
-            byline: null,
-            excerpt: '',
-            dir: null,
-            lang: null,
-            length: 0,
-            siteName: null,
-            textContent: dom.window.document.body?.textContent || '',
-            publishedTime: null,
-            baseUrl: dom.window.location.href,
-        };
-    }
-}
-export function hasContent(html) {
-    const lowerHtml = html.toLowerCase();
-    if (lowerHtml.includes('<noscript>') &&
-        !lowerHtml.includes('<article') &&
-        !lowerHtml.includes('<main')) {
-        return false;
-    }
-    const textContent = html.replace(/<[^>]*>/g, '').trim();
-    return textContent.length > 100;
-}

package/dist/parser/dom.d.ts DELETED Viewed

@@ -1,3 +0,0 @@
-import { JSDOM } from 'jsdom';
-export declare function htmlToDom(html: string, url: string): JSDOM;
-export declare function extractLinks(dom: JSDOM): string[];

package/dist/parser/dom.js DELETED Viewed

@@ -1,60 +0,0 @@
-import { JSDOM, VirtualConsole } from 'jsdom';
-export function htmlToDom(html, url) {
-    try {
-        return new JSDOM(html, {
-            url,
-            contentType: 'text/html',
-            includeNodeLocations: false,
-            runScripts: undefined,
-            resources: undefined,
-            pretendToBeVisual: true,
-            virtualConsole: new VirtualConsole().sendTo(console, {
-                omitJSDOMErrors: true,
-            }),
-        });
-    }
-    catch {
-        try {
-            return new JSDOM(html, {
-                url,
-                contentType: 'text/html',
-                virtualConsole: new VirtualConsole().sendTo(console, {
-                    omitJSDOMErrors: true,
-                }),
-            });
-        }
-        catch {
-            return new JSDOM(`<!DOCTYPE html><html><body>${html}</body></html>`, {
-                url,
-                contentType: 'text/html',
-                virtualConsole: new VirtualConsole().sendTo(console, {
-                    omitJSDOMErrors: true,
-                }),
-            });
-        }
-    }
-}
-export function extractLinks(dom) {
-    const document = dom.window.document;
-    const links = [];
-    const baseUrl = dom.window.location.href;
-    const anchorElements = document.querySelectorAll('a[href]');
-    anchorElements.forEach(element => {
-        try {
-            const href = element.getAttribute('href');
-            if (!href)
-                return;
-            if (href.startsWith('mailto:') ||
-                href.startsWith('tel:') ||
-                href.startsWith('javascript:') ||
-                href.startsWith('#')) {
-                return;
-            }
-            const absoluteUrl = new URL(href, baseUrl).href;
-            links.push(absoluteUrl);
-        }
-        catch {
-        }
-    });
-    return [...new Set(links)];
-}

package/dist/parser/markdown.d.ts DELETED Viewed

@@ -1,9 +0,0 @@
-import TurndownService from 'turndown';
-export declare function createTurndownService(): TurndownService;
-export declare function htmlToMarkdown(html: string): string;
-export declare function formatArticleMarkdown(article: {
-    title: string;
-    content: string;
-    byline?: string | null;
-    baseUrl?: string;
-}): string;

package/dist/parser/markdown.js DELETED Viewed

@@ -1,147 +0,0 @@
-import TurndownService from 'turndown';
-import { gfm } from 'turndown-plugin-gfm';
-import { JSDOM } from 'jsdom';
-function convertRelativeUrls(html, baseUrl) {
-    try {
-        const dom = new JSDOM(html, { url: baseUrl });
-        const document = dom.window.document;
-        document.querySelectorAll('a[href]').forEach(link => {
-            const href = link.getAttribute('href');
-            if (href &&
-                !href.startsWith('http://') &&
-                !href.startsWith('https://') &&
-                !href.startsWith('//') &&
-                !href.startsWith('mailto:') &&
-                !href.startsWith('tel:') &&
-                !href.startsWith('javascript:') &&
-                !href.startsWith('#')) {
-                try {
-                    const absoluteUrl = new URL(href, baseUrl).href;
-                    link.setAttribute('href', absoluteUrl);
-                }
-                catch {
-                }
-            }
-        });
-        document.querySelectorAll('img[src]').forEach(img => {
-            const src = img.getAttribute('src');
-            if (src &&
-                !src.startsWith('http://') &&
-                !src.startsWith('https://') &&
-                !src.startsWith('//') &&
-                !src.startsWith('data:')) {
-                try {
-                    const absoluteUrl = new URL(src, baseUrl).href;
-                    img.setAttribute('src', absoluteUrl);
-                }
-                catch {
-                }
-            }
-        });
-        const bodyElement = document.body || document.documentElement;
-        return bodyElement ? bodyElement.innerHTML : html;
-    }
-    catch {
-        return html;
-    }
-}
-export function createTurndownService() {
-    const turndown = new TurndownService({
-        headingStyle: 'atx',
-        codeBlockStyle: 'fenced',
-        linkStyle: 'inlined',
-        emDelimiter: '_',
-        bulletListMarker: '-',
-        strongDelimiter: '**',
-        hr: '---',
-        blankReplacement: (_content, node) => {
-            return node.isBlock ? '\n\n' : '';
-        },
-        keepReplacement: (content, node) => {
-            return node.isBlock ? '\n\n' + content + '\n\n' : content;
-        },
-        defaultReplacement: (content, node) => {
-            return node.isBlock ? '\n\n' + content + '\n\n' : content;
-        },
-    });
-    turndown.use(gfm);
-    turndown.addRule('media', {
-        filter: ['iframe', 'video', 'audio', 'embed'],
-        replacement: (_content, node) => {
-            const element = node;
-            const src = element.getAttribute('src') || element.getAttribute('data-src');
-            const title = element.getAttribute('title') ||
-                element.getAttribute('alt') ||
-                'media';
-            if (src) {
-                return `\n\n[${title}](${src})\n\n`;
-            }
-            return '';
-        },
-    });
-    turndown.addRule('figure', {
-        filter: 'figure',
-        replacement: (content, node) => {
-            const figure = node;
-            const caption = figure.querySelector('figcaption');
-            if (caption) {
-                const captionText = caption.textContent || '';
-                return `\n\n${content.trim()}\n*${captionText}*\n\n`;
-            }
-            return `\n\n${content.trim()}\n\n`;
-        },
-    });
-    return turndown;
-}
-export function htmlToMarkdown(html) {
-    const turndown = createTurndownService();
-    let markdown = turndown.turndown(html);
-    markdown = markdown
-        .replace(/\n{3,}/g, '\n\n')
-        .replace(/\s+$/gm, '')
-        .trim();
-    return markdown;
-}
-export function formatArticleMarkdown(article) {
-    try {
-        const turndown = createTurndownService();
-        let markdown = '';
-        if (article.title && article.title.trim()) {
-            markdown = `# ${article.title}\n\n`;
-        }
-        if (article.byline) {
-            markdown += `*By ${article.byline}*\n\n---\n\n`;
-        }
-        try {
-            const processedContent = article.baseUrl
-                ? convertRelativeUrls(article.content, article.baseUrl)
-                : article.content;
-            markdown += turndown.turndown(processedContent);
-        }
-        catch (conversionError) {
-            console.error('Error converting HTML to markdown:', conversionError);
-            const tempDiv = typeof document !== 'undefined'
-                ? document.createElement('div')
-                : null;
-            if (tempDiv) {
-                tempDiv.innerHTML = article.content;
-                markdown += tempDiv.textContent || article.content;
-            }
-            else {
-                markdown += article.content
-                    .replace(/<[^>]*>/g, ' ')
-                    .replace(/\s+/g, ' ');
-            }
-        }
-        return markdown
-            .replace(/\n{3,}/g, '\n\n')
-            .replace(/\s+$/gm, '')
-            .trim();
-    }
-    catch (error) {
-        console.error('Fatal error in formatArticleMarkdown:', error);
-        return article.title
-            ? `# ${article.title}\n\n[Content extraction failed]`
-            : '[Content extraction failed]';
-    }
-}