npm - @steipete/summarize - Versions diffs - 0.3.0 → 0.4.0 - Mend

@steipete/summarize 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

package/CHANGELOG.md +10 -3
package/README.md +7 -3
package/dist/cli.cjs +451 -133
package/dist/cli.cjs.map +4 -4
package/dist/esm/flags.js +18 -1
package/dist/esm/flags.js.map +1 -1
package/dist/esm/markitdown.js +54 -0
package/dist/esm/markitdown.js.map +1 -0
package/dist/esm/prompts/file.js +19 -0
package/dist/esm/prompts/file.js.map +1 -1
package/dist/esm/prompts/index.js +1 -1
package/dist/esm/prompts/index.js.map +1 -1
package/dist/esm/run.js +262 -35
package/dist/esm/run.js.map +1 -1
package/dist/esm/version.js +1 -1
package/dist/types/flags.d.ts +4 -0
package/dist/types/markitdown.d.ts +10 -0
package/dist/types/prompts/file.d.ts +7 -0
package/dist/types/prompts/index.d.ts +1 -1
package/dist/types/run.d.ts +3 -1
package/dist/types/version.d.ts +1 -1
package/docs/README.md +1 -1
package/docs/extract-only.md +10 -7
package/docs/firecrawl.md +2 -2
package/docs/site/docs/config.html +3 -3
package/docs/site/docs/extract-only.html +7 -5
package/docs/site/docs/firecrawl.html +6 -6
package/docs/site/docs/index.html +2 -2
package/docs/site/docs/llm.html +2 -2
package/docs/site/docs/openai.html +2 -2
package/docs/site/docs/website.html +7 -4
package/docs/site/docs/youtube.html +2 -2
package/docs/site/index.html +1 -1
package/docs/website.md +10 -7
package/docs/youtube.md +1 -1
package/package.json +1 -1

package/dist/esm/run.js (changed)

@@ -11,17 +11,19 @@ import { buildAssetPromptMessages, classifyUrl, loadLocalAsset, loadRemoteAsset,
 import { createLinkPreviewClient } from './content/index.js';
 import { buildRunMetricsReport } from './costs.js';
 import { createFirecrawlScraper } from './firecrawl.js';
-import { parseDurationMs, parseFirecrawlMode, parseLengthArg, parseMarkdownMode, parseMaxOutputTokensArg, parseMetricsMode, parseRenderMode, parseStreamMode, parseYoutubeMode, } from './flags.js';
+import { parseDurationMs, parseExtractFormat, parseFirecrawlMode, parseLengthArg, parseMarkdownMode, parseMaxOutputTokensArg, parseMetricsMode, parsePreprocessMode, parseRenderMode, parseStreamMode, parseYoutubeMode, } from './flags.js';
 import { generateTextWithModelId, streamTextWithModelId } from './llm/generate-text.js';
 import { resolveGoogleModelForUsage } from './llm/google-models.js';
 import { createHtmlToMarkdownConverter } from './llm/html-to-markdown.js';
 import { normalizeGatewayStyleModelId, parseGatewayStyleModelId } from './llm/model-id.js';
+import { convertToMarkdownWithMarkitdown } from './markitdown.js';
 import { loadLiteLlmCatalog, resolveLiteLlmMaxInputTokensForModelId, resolveLiteLlmMaxOutputTokensForModelId, resolveLiteLlmPricingForModelId, } from './pricing/litellm.js';
-import { buildFileSummaryPrompt, buildLinkSummaryPrompt } from './prompts/index.js';
+import { buildFileSummaryPrompt, buildFileTextSummaryPrompt, buildLinkSummaryPrompt, } from './prompts/index.js';
 import { startOscProgress } from './tty/osc-progress.js';
 import { startSpinner } from './tty/spinner.js';
 import { resolvePackageVersion } from './version.js';
 const BIRD_TIP = 'Tip: Install bird🐦 for better Twitter support: https://github.com/steipete/bird';
+const UVX_TIP = 'Tip: Install uv (uvx) for local Markdown conversion: brew install uv (or set UVX_PATH to your uvx binary).';
 const TWITTER_HOSTS = new Set(['x.com', 'twitter.com', 'mobile.twitter.com']);
 const SUMMARY_LENGTH_MAX_CHARACTERS = {
     short: 1200,
@@ -58,7 +60,7 @@ function isExecutable(filePath) {
 }
 function hasBirdCli(env) {
     const candidates = [];
-    const pathEnv = env.PATH ?? process.env.PATH ?? '';
+    const pathEnv = env.PATH ?? '';
     for (const entry of pathEnv.split(path.delimiter)) {
         if (!entry)
             continue;
@@ -66,6 +68,19 @@ function hasBirdCli(env) {
     }
     return candidates.some((candidate) => isExecutable(candidate));
 }
+function hasUvxCli(env) {
+    if (typeof env.UVX_PATH === 'string' && env.UVX_PATH.trim().length > 0) {
+        return true;
+    }
+    const candidates = [];
+    const pathEnv = env.PATH ?? '';
+    for (const entry of pathEnv.split(path.delimiter)) {
+        if (!entry)
+            continue;
+        candidates.push(path.join(entry, 'uvx'));
+    }
+    return candidates.some((candidate) => isExecutable(candidate));
+}
 async function readTweetWithBird(args) {
     return await new Promise((resolve, reject) => {
         execFile('bird', ['read', args.url, '--json'], {
@@ -108,6 +123,14 @@ function withBirdTip(error, url, env) {
     const combined = `${message}\n${BIRD_TIP}`;
     return error instanceof Error ? new Error(combined, { cause: error }) : new Error(combined);
 }
+function withUvxTip(error, env) {
+    if (hasUvxCli(env)) {
+        return error instanceof Error ? error : new Error(String(error));
+    }
+    const message = error instanceof Error ? error.message : String(error);
+    const combined = `${message}\n${UVX_TIP}`;
+    return error instanceof Error ? new Error(combined, { cause: error }) : new Error(combined);
+}
 const MAX_TEXT_BYTES_DEFAULT = 10 * 1024 * 1024;
 function buildProgram() {
     return new Command()
@@ -115,13 +138,19 @@ function buildProgram() {
         .description('Summarize web pages and YouTube links (uses direct provider API keys).')
         .argument('[input]', 'URL or local file path to summarize')
         .option('--youtube <mode>', 'YouTube transcript source: auto, web (youtubei/captionTracks), yt-dlp (audio+whisper), apify', 'auto')
-        .option('--firecrawl <mode>', 'Firecrawl usage: off, auto (fallback), always (try Firecrawl first).', 'auto')
-        .option('--markdown <mode>', 'Website Markdown output: off, auto (use LLM when configured), llm (force LLM). Only affects --extract-only for non-YouTube URLs.', 'auto')
+        .option('--firecrawl <mode>', 'Firecrawl usage: off, auto (fallback), always (try Firecrawl first). Note: in --format md website mode, defaults to always when FIRECRAWL_API_KEY is set (unless --firecrawl is set explicitly).', 'auto')
+        .option('--format <format>', 'Website/file content format: md|text. For websites: controls the extraction format. For files: controls whether we try to preprocess to Markdown for model compatibility. (default: text)', 'text')
+        .addOption(new Option('--preprocess <mode>', 'Preprocess inputs for model compatibility: off, auto (fallback), always.')
+        .choices(['off', 'auto', 'always'])
+        .default('auto'))
+        .addOption(new Option('--markdown-mode <mode>', 'HTML→Markdown conversion: off, auto (prefer Firecrawl when configured, then LLM when configured, then markitdown when available), llm (force LLM). Only affects --format md for non-YouTube URLs.').default('auto'))
+        .addOption(new Option('--markdown <mode>', 'Deprecated alias for --markdown-mode (use --extract --format md --markdown-mode ...)').hideHelp())
         .option('--length <length>', 'Summary length: short|medium|long|xl|xxl or a character limit like 20000, 20k', 'medium')
         .option('--max-output-tokens <count>', 'Hard cap for LLM output tokens (e.g. 2000, 2k). Overrides provider defaults.', undefined)
         .option('--timeout <duration>', 'Timeout for content fetching and LLM request: 30 (seconds), 30s, 2m, 5000ms', '2m')
         .option('--model <model>', 'LLM model id (gateway-style): xai/..., openai/..., google/... (default: google/gemini-3-flash-preview)', undefined)
-        .option('--extract-only', 'Print extracted content and exit (no LLM summary)', false)
+        .option('--extract', 'Print extracted content and exit (no LLM summary)', false)
+        .addOption(new Option('--extract-only', 'Deprecated alias for --extract').hideHelp())
         .option('--json', 'Output structured JSON (includes prompt + metrics)', false)
         .option('--stream <mode>', 'Stream LLM output: auto (TTY only), on, off. Note: streaming is disabled in --json mode.', 'auto')
         .option('--render <mode>', 'Render Markdown output: auto (TTY only), md-live, md, plain. Note: auto selects md-live when streaming to a TTY.', 'auto')
@@ -250,6 +279,30 @@ function getTextContentFromAttachment(attachment) {
     }
     return { content: '', bytes: 0 };
 }
+function getFileBytesFromAttachment(attachment) {
+    if (attachment.part.type !== 'file')
+        return null;
+    const data = attachment.part.data;
+    return data instanceof Uint8Array ? data : null;
+}
+function shouldMarkitdownConvertMediaType(mediaType) {
+    const mt = mediaType.toLowerCase();
+    if (mt === 'application/pdf')
+        return true;
+    if (mt === 'application/rtf')
+        return true;
+    if (mt === 'text/html' || mt === 'application/xhtml+xml')
+        return true;
+    if (mt === 'application/msword')
+        return true;
+    if (mt.startsWith('application/vnd.openxmlformats-officedocument.'))
+        return true;
+    if (mt === 'application/vnd.ms-excel')
+        return true;
+    if (mt === 'application/vnd.ms-powerpoint')
+        return true;
+    return false;
+}
 function assertProviderSupportsAttachment({ provider, modelId, attachment, }) {
     // xAI via AI SDK currently supports image parts, but not generic file parts (e.g. PDFs).
     if (provider === 'xai' &&
@@ -314,9 +367,10 @@ function attachRichHelp(program, env, stdout) {
     program.addHelpText('after', () => `
 ${heading('Examples')}
   ${cmd('summarize "https://example.com"')}
-  ${cmd('summarize "https://example.com" --extract-only')} ${dim('# website markdown (LLM if configured)')}
-  ${cmd('summarize "https://example.com" --extract-only --markdown llm')} ${dim('# website markdown via LLM')}
-  ${cmd('summarize "https://www.youtube.com/watch?v=I845O57ZSy4&t=11s" --extract-only --youtube web')}
+  ${cmd('summarize "https://example.com" --extract')} ${dim('# extracted plain text')}
+  ${cmd('summarize "https://example.com" --extract --format md')} ${dim('# extracted markdown (prefers Firecrawl when configured)')}
+  ${cmd('summarize "https://example.com" --extract --format md --markdown-mode llm')} ${dim('# extracted markdown via LLM')}
+  ${cmd('summarize "https://www.youtube.com/watch?v=I845O57ZSy4&t=11s" --extract --youtube web')}
   ${cmd('summarize "https://example.com" --length 20k --max-output-tokens 2k --timeout 2m --model openai/gpt-5.2')}
   ${cmd('OPENROUTER_API_KEY=... summarize "https://example.com" --model openai/openai/gpt-oss-20b')}
   ${cmd('summarize "https://example.com" --json --verbose')}
@@ -448,10 +502,11 @@ function writeFinishLine({ stderr, elapsedMs, model, report, costUsd, color, })
     stderr.write('\n');
     stderr.write(`${ansi('1;32', line, color)}\n`);
 }
-export async function runCli(argv, { env, fetch, stdout, stderr }) {
+export async function runCli(argv, { env, fetch, execFile: execFileOverride, stdout, stderr }) {
     ;
     globalThis.AI_SDK_LOG_WARNINGS = false;
     const normalizedArgv = argv.filter((arg) => arg !== '--');
+    const execFileImpl = execFileOverride ?? execFile;
     const version = resolvePackageVersion();
     const program = buildProgram();
     program.configureOutput({
@@ -488,7 +543,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
     const lengthArg = parseLengthArg(program.opts().length);
     const maxOutputTokensArg = parseMaxOutputTokensArg(program.opts().maxOutputTokens);
     const timeoutMs = parseDurationMs(program.opts().timeout);
-    const extractOnly = Boolean(program.opts().extractOnly);
+    const extractMode = Boolean(program.opts().extract) || Boolean(program.opts().extractOnly);
     const json = Boolean(program.opts().json);
     const streamMode = parseStreamMode(program.opts().stream);
     const renderMode = parseRenderMode(program.opts().render);
@@ -496,9 +551,20 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
     const metricsMode = parseMetricsMode(program.opts().metrics);
     const metricsEnabled = metricsMode !== 'off';
     const metricsDetailed = metricsMode === 'detailed';
-    const markdownMode = parseMarkdownMode(program.opts().markdown);
+    const preprocessMode = parsePreprocessMode(program.opts().preprocess);
+    const format = parseExtractFormat(program.opts().format);
     const shouldComputeReport = metricsEnabled;
     const isYoutubeUrl = typeof url === 'string' ? /youtube\.com|youtu\.be/i.test(url) : false;
+    const firecrawlExplicitlySet = normalizedArgv.some((arg) => arg === '--firecrawl' || arg.startsWith('--firecrawl='));
+    const markdownModeExplicitlySet = normalizedArgv.some((arg) => arg === '--markdown-mode' ||
+        arg.startsWith('--markdown-mode=') ||
+        arg === '--markdown' ||
+        arg.startsWith('--markdown='));
+    const markdownMode = format === 'markdown'
+        ? parseMarkdownMode(program.opts().markdownMode ??
+            program.opts().markdown ??
+            'auto')
+        : 'off';
     const requestedFirecrawlMode = parseFirecrawlMode(program.opts().firecrawl);
     const modelArg = typeof program.opts().model === 'string' ? program.opts().model : null;
     const { config, path: configPath } = loadSummarizeConfig({ env });
@@ -540,6 +606,12 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
     const anthropicConfigured = typeof anthropicApiKey === 'string' && anthropicApiKey.length > 0;
     const openrouterConfigured = typeof openrouterApiKey === 'string' && openrouterApiKey.length > 0;
     const openrouterOptions = openRouterProviders ? { providers: openRouterProviders } : undefined;
+    if (markdownModeExplicitlySet && format !== 'markdown') {
+        throw new Error('--markdown-mode is only supported with --format md');
+    }
+    if (markdownModeExplicitlySet && inputTarget.kind !== 'url') {
+        throw new Error('--markdown-mode is only supported for website URLs');
+    }
     const llmCalls = [];
     let firecrawlRequests = 0;
     let apifyRequests = 0;
@@ -642,7 +714,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
             return streamMode;
         return isRichTty(stdout) ? 'on' : 'off';
     })();
-    const streamingEnabled = effectiveStreamMode === 'on' && !json && !extractOnly;
+    const streamingEnabled = effectiveStreamMode === 'on' && !json && !extractMode;
     const effectiveRenderMode = (() => {
         if (renderMode !== 'auto')
             return renderMode;
@@ -661,8 +733,8 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
         stderr.write(`metrics apify requests=${report.services.apify.requests}\n`);
         stderr.write(`metrics total tok(i/o/t)=${promptTokens ?? 'unknown'}/${completionTokens ?? 'unknown'}/${totalTokens ?? 'unknown'}\n`);
     };
-    if (extractOnly && inputTarget.kind !== 'url') {
-        throw new Error('--extract-only is only supported for website/YouTube URLs');
+    if (extractMode && inputTarget.kind !== 'url') {
+        throw new Error('--extract is only supported for website/YouTube URLs');
     }
     const progressEnabled = isRichTty(stderr) && !verbose && !json;
     let clearProgressBeforeStdout = null;
@@ -697,11 +769,6 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
         if (!hasRequiredKey) {
             throw new Error(`Missing ${requiredKeyEnv} for model ${parsedModel.canonical}. Set the env var or choose a different --model.`);
         }
-        assertProviderSupportsAttachment({
-            provider: parsedModel.provider,
-            modelId: parsedModel.canonical,
-            attachment: { part: attachment.part, mediaType: attachment.mediaType },
-        });
         const modelResolution = await resolveModelIdForLlmCall({
             parsedModel,
             apiKeys: { googleApiKey: apiKeysForLlm.googleApiKey },
@@ -719,14 +786,114 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
         if (textContent && textContent.bytes > MAX_TEXT_BYTES_DEFAULT) {
             throw new Error(`Text file too large (${formatBytes(textContent.bytes)}). Limit is ${formatBytes(MAX_TEXT_BYTES_DEFAULT)}.`);
         }
+        const fileBytes = getFileBytesFromAttachment(attachment);
+        const canPreprocessWithMarkitdown = format === 'markdown' &&
+            preprocessMode !== 'off' &&
+            hasUvxCli(env) &&
+            attachment.part.type === 'file' &&
+            fileBytes !== null &&
+            shouldMarkitdownConvertMediaType(attachment.mediaType);
         const summaryLengthTarget = lengthArg.kind === 'preset' ? lengthArg.preset : { maxCharacters: lengthArg.maxCharacters };
-        const promptText = buildFileSummaryPrompt({
-            filename: attachment.filename,
-            mediaType: attachment.mediaType,
-            summaryLength: summaryLengthTarget,
-            contentLength: textContent?.content.length ?? null,
-        });
-        const promptPayload = buildAssetPromptPayload({ promptText, attachment, textContent });
+        let promptText = '';
+        const buildAttachmentPromptPayload = () => {
+            promptText = buildFileSummaryPrompt({
+                filename: attachment.filename,
+                mediaType: attachment.mediaType,
+                summaryLength: summaryLengthTarget,
+                contentLength: textContent?.content.length ?? null,
+            });
+            return buildAssetPromptPayload({ promptText, attachment, textContent });
+        };
+        const buildMarkitdownPromptPayload = (markdown) => {
+            promptText = buildFileTextSummaryPrompt({
+                filename: attachment.filename,
+                originalMediaType: attachment.mediaType,
+                contentMediaType: 'text/markdown',
+                summaryLength: summaryLengthTarget,
+                contentLength: markdown.length,
+            });
+            return `${promptText}\n\n---\n\n${markdown}`.trim();
+        };
+        let preprocessedMarkdown = null;
+        let usingPreprocessedMarkdown = false;
+        if (preprocessMode === 'always' && canPreprocessWithMarkitdown) {
+            if (!fileBytes) {
+                throw new Error('Internal error: missing file bytes for markitdown preprocessing');
+            }
+            try {
+                preprocessedMarkdown = await convertToMarkdownWithMarkitdown({
+                    bytes: fileBytes,
+                    filenameHint: attachment.filename,
+                    mediaTypeHint: attachment.mediaType,
+                    uvxCommand: env.UVX_PATH,
+                    timeoutMs,
+                    env,
+                    execFileImpl,
+                });
+            }
+            catch (error) {
+                const message = error instanceof Error ? error.message : String(error);
+                throw new Error(`Failed to preprocess ${attachment.mediaType} with markitdown: ${message} (disable with --preprocess off).`);
+            }
+            if (Buffer.byteLength(preprocessedMarkdown, 'utf8') > MAX_TEXT_BYTES_DEFAULT) {
+                throw new Error(`Preprocessed Markdown too large (${formatBytes(Buffer.byteLength(preprocessedMarkdown, 'utf8'))}). Limit is ${formatBytes(MAX_TEXT_BYTES_DEFAULT)}.`);
+            }
+            usingPreprocessedMarkdown = true;
+        }
+        let promptPayload = buildAttachmentPromptPayload();
+        if (usingPreprocessedMarkdown) {
+            if (!preprocessedMarkdown) {
+                throw new Error('Internal error: missing markitdown content for preprocessing');
+            }
+            promptPayload = buildMarkitdownPromptPayload(preprocessedMarkdown);
+        }
+        if (!usingPreprocessedMarkdown) {
+            try {
+                assertProviderSupportsAttachment({
+                    provider: parsedModel.provider,
+                    modelId: parsedModel.canonical,
+                    attachment: { part: attachment.part, mediaType: attachment.mediaType },
+                });
+            }
+            catch (error) {
+                if (!canPreprocessWithMarkitdown) {
+                    if (format === 'markdown' &&
+                        preprocessMode !== 'off' &&
+                        attachment.part.type === 'file' &&
+                        shouldMarkitdownConvertMediaType(attachment.mediaType) &&
+                        !hasUvxCli(env)) {
+                        throw withUvxTip(error, env);
+                    }
+                    throw error;
+                }
+                if (!fileBytes) {
+                    throw new Error('Internal error: missing file bytes for markitdown preprocessing');
+                }
+                try {
+                    preprocessedMarkdown = await convertToMarkdownWithMarkitdown({
+                        bytes: fileBytes,
+                        filenameHint: attachment.filename,
+                        mediaTypeHint: attachment.mediaType,
+                        uvxCommand: env.UVX_PATH,
+                        timeoutMs,
+                        env,
+                        execFileImpl,
+                    });
+                }
+                catch (markitdownError) {
+                    if (preprocessMode === 'auto') {
+                        throw error;
+                    }
+                    const message = markitdownError instanceof Error ? markitdownError.message : String(markitdownError);
+                    throw new Error(`Failed to preprocess ${attachment.mediaType} with markitdown: ${message} (disable with --preprocess off).`);
+                }
+                if (Buffer.byteLength(preprocessedMarkdown, 'utf8') > MAX_TEXT_BYTES_DEFAULT) {
+                    throw new Error(`Preprocessed Markdown too large (${formatBytes(Buffer.byteLength(preprocessedMarkdown, 'utf8'))}). Limit is ${formatBytes(MAX_TEXT_BYTES_DEFAULT)}.`);
+                }
+                usingPreprocessedMarkdown = true;
+                promptPayload = buildMarkitdownPromptPayload(preprocessedMarkdown);
+            }
+        }
         const maxInputTokensForCall = await resolveMaxInputTokensForCall(parsedModelEffective.canonical);
         if (typeof maxInputTokensForCall === 'number' &&
             Number.isFinite(maxInputTokensForCall) &&
@@ -1128,12 +1295,21 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
     if (!url) {
         throw new Error('Only HTTP and HTTPS URLs can be summarized');
     }
-    const firecrawlMode = requestedFirecrawlMode;
+    const wantsMarkdown = format === 'markdown' && !isYoutubeUrl;
+    if (wantsMarkdown && markdownMode === 'off') {
+        throw new Error('--format md conflicts with --markdown-mode off (use --format text)');
+    }
+    const firecrawlMode = (() => {
+        if (wantsMarkdown && !isYoutubeUrl && !firecrawlExplicitlySet && firecrawlConfigured) {
+            return 'always';
+        }
+        return requestedFirecrawlMode;
+    })();
     if (firecrawlMode === 'always' && !firecrawlConfigured) {
         throw new Error('--firecrawl always requires FIRECRAWL_API_KEY');
     }
-    const effectiveMarkdownMode = markdownMode;
-    const markdownRequested = extractOnly && !isYoutubeUrl && effectiveMarkdownMode !== 'off';
+    const markdownRequested = wantsMarkdown;
+    const effectiveMarkdownMode = markdownRequested ? markdownMode : 'off';
     const hasKeyForModel = parsedModelForLlm.provider === 'xai'
         ? xaiConfigured
         : parsedModelForLlm.provider === 'google'
@@ -1150,16 +1326,16 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
                 : parsedModelForLlm.provider === 'anthropic'
                     ? 'ANTHROPIC_API_KEY'
                     : 'OPENAI_API_KEY';
-        throw new Error(`--markdown llm requires ${required} for model ${parsedModelForLlm.canonical}`);
+        throw new Error(`--markdown-mode llm requires ${required} for model ${parsedModelForLlm.canonical}`);
     }
-    writeVerbose(stderr, verbose, `config url=${url} timeoutMs=${timeoutMs} youtube=${youtubeMode} firecrawl=${firecrawlMode} length=${lengthArg.kind === 'preset' ? lengthArg.preset : `${lengthArg.maxCharacters} chars`} maxOutputTokens=${formatOptionalNumber(maxOutputTokensArg)} json=${json} extractOnly=${extractOnly} markdown=${effectiveMarkdownMode} model=${model} stream=${effectiveStreamMode} render=${effectiveRenderMode}`, verboseColor);
+    writeVerbose(stderr, verbose, `config url=${url} timeoutMs=${timeoutMs} youtube=${youtubeMode} firecrawl=${firecrawlMode} length=${lengthArg.kind === 'preset' ? lengthArg.preset : `${lengthArg.maxCharacters} chars`} maxOutputTokens=${formatOptionalNumber(maxOutputTokensArg)} json=${json} extract=${extractMode} format=${format} preprocess=${preprocessMode} markdownMode=${markdownMode} model=${model} stream=${effectiveStreamMode} render=${effectiveRenderMode}`, verboseColor);
     writeVerbose(stderr, verbose, `configFile path=${formatOptionalString(configPath)} model=${formatOptionalString(config?.model ?? null)}`, verboseColor);
     writeVerbose(stderr, verbose, `env xaiKey=${xaiConfigured} openaiKey=${Boolean(apiKey)} googleKey=${googleConfigured} anthropicKey=${anthropicConfigured} openrouterKey=${openrouterConfigured} apifyToken=${Boolean(apifyToken)} firecrawlKey=${firecrawlConfigured}`, verboseColor);
     writeVerbose(stderr, verbose, `markdown requested=${markdownRequested} provider=${markdownProvider}`, verboseColor);
     const scrapeWithFirecrawl = firecrawlConfigured && firecrawlMode !== 'off'
         ? createFirecrawlScraper({ apiKey: firecrawlApiKey, fetchImpl: trackedFetch })
         : null;
-    const convertHtmlToMarkdown = markdownRequested && (effectiveMarkdownMode === 'llm' || markdownProvider !== 'none')
+    const llmHtmlToMarkdown = markdownRequested && (effectiveMarkdownMode === 'llm' || markdownProvider !== 'none')
         ? createHtmlToMarkdownConverter({
             modelId: model,
             xaiApiKey: xaiConfigured ? xaiApiKey : null,
@@ -1174,6 +1350,46 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
             },
         })
         : null;
+    const markitdownHtmlToMarkdown = markdownRequested && preprocessMode !== 'off' && hasUvxCli(env)
+        ? async (args) => {
+            void args.url;
+            void args.title;
+            void args.siteName;
+            return convertToMarkdownWithMarkitdown({
+                bytes: new TextEncoder().encode(args.html),
+                filenameHint: 'page.html',
+                mediaTypeHint: 'text/html',
+                uvxCommand: env.UVX_PATH,
+                timeoutMs: args.timeoutMs,
+                env,
+                execFileImpl,
+            });
+        }
+        : null;
+    const convertHtmlToMarkdown = markdownRequested
+        ? async (args) => {
+            if (effectiveMarkdownMode === 'llm') {
+                if (!llmHtmlToMarkdown) {
+                    throw new Error('No HTML→Markdown converter configured');
+                }
+                return llmHtmlToMarkdown(args);
+            }
+            if (llmHtmlToMarkdown) {
+                try {
+                    return await llmHtmlToMarkdown(args);
+                }
+                catch (error) {
+                    if (!markitdownHtmlToMarkdown)
+                        throw error;
+                    return await markitdownHtmlToMarkdown(args);
+                }
+            }
+            if (markitdownHtmlToMarkdown) {
+                return await markitdownHtmlToMarkdown(args);
+            }
+            throw new Error('No HTML→Markdown converter configured');
+        }
+        : null;
     const readTweetWithBirdClient = hasBirdCli(env)
         ? ({ url, timeoutMs }) => readTweetWithBird({ url, timeoutMs, env })
         : null;
@@ -1367,7 +1583,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
         const viaSourceLabel = viaSources.length > 0 ? `, ${viaSources.join('+')}` : '';
         if (progressEnabled) {
             websiteProgress?.stop?.();
-            spinner.setText(extractOnly
+            spinner.setText(extractMode
                 ? `Extracted (${extractedContentSize}${viaSourceLabel})`
                 : `Summarizing (sent ${extractedContentSize}${viaSourceLabel})…`);
         }
@@ -1378,6 +1594,14 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
         writeVerbose(stderr, verbose, `extract transcript textProvided=${extracted.diagnostics.transcript.textProvided} provider=${formatOptionalString(extracted.diagnostics.transcript.provider ?? null)} attemptedProviders=${extracted.diagnostics.transcript.attemptedProviders.length > 0
             ? extracted.diagnostics.transcript.attemptedProviders.join(',')
             : 'none'} notes=${formatOptionalString(extracted.diagnostics.transcript.notes ?? null)}`, verboseColor);
+        if (extractMode &&
+            markdownRequested &&
+            preprocessMode !== 'off' &&
+            effectiveMarkdownMode === 'auto' &&
+            !extracted.diagnostics.markdown.used &&
+            !hasUvxCli(env)) {
+            stderr.write(`${UVX_TIP}\n`);
+        }
         const isYouTube = extracted.siteName === 'YouTube';
         const prompt = buildLinkSummaryPrompt({
             url: extracted.url,
@@ -1391,7 +1615,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
             summaryLength: lengthArg.kind === 'preset' ? lengthArg.preset : { maxCharacters: lengthArg.maxCharacters },
             shares: [],
         });
-        if (extractOnly) {
+        if (extractMode) {
             clearProgressForStdout();
             if (json) {
                 const finishReport = shouldComputeReport ? await buildReport() : null;
@@ -1402,6 +1626,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
                         timeoutMs,
                         youtube: youtubeMode,
                         firecrawl: firecrawlMode,
+                        format,
                         markdown: effectiveMarkdownMode,
                         length: lengthArg.kind === 'preset'
                             ? { kind: 'preset', preset: lengthArg.preset }
@@ -1472,6 +1697,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
                         timeoutMs,
                         youtube: youtubeMode,
                         firecrawl: firecrawlMode,
+                        format,
                         markdown: effectiveMarkdownMode,
                         length: lengthArg.kind === 'preset'
                             ? { kind: 'preset', preset: lengthArg.preset }
@@ -1744,6 +1970,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
                     timeoutMs,
                     youtube: youtubeMode,
                     firecrawl: firecrawlMode,
+                    format,
                     markdown: effectiveMarkdownMode,
                     length: lengthArg.kind === 'preset'
                         ? { kind: 'preset', preset: lengthArg.preset }