@steipete/summarize 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -3
- package/README.md +7 -3
- package/dist/cli.cjs +451 -133
- package/dist/cli.cjs.map +4 -4
- package/dist/esm/flags.js +18 -1
- package/dist/esm/flags.js.map +1 -1
- package/dist/esm/markitdown.js +54 -0
- package/dist/esm/markitdown.js.map +1 -0
- package/dist/esm/prompts/file.js +19 -0
- package/dist/esm/prompts/file.js.map +1 -1
- package/dist/esm/prompts/index.js +1 -1
- package/dist/esm/prompts/index.js.map +1 -1
- package/dist/esm/run.js +262 -35
- package/dist/esm/run.js.map +1 -1
- package/dist/esm/version.js +1 -1
- package/dist/types/flags.d.ts +4 -0
- package/dist/types/markitdown.d.ts +10 -0
- package/dist/types/prompts/file.d.ts +7 -0
- package/dist/types/prompts/index.d.ts +1 -1
- package/dist/types/run.d.ts +3 -1
- package/dist/types/version.d.ts +1 -1
- package/docs/README.md +1 -1
- package/docs/extract-only.md +10 -7
- package/docs/firecrawl.md +2 -2
- package/docs/site/docs/config.html +3 -3
- package/docs/site/docs/extract-only.html +7 -5
- package/docs/site/docs/firecrawl.html +6 -6
- package/docs/site/docs/index.html +2 -2
- package/docs/site/docs/llm.html +2 -2
- package/docs/site/docs/openai.html +2 -2
- package/docs/site/docs/website.html +7 -4
- package/docs/site/docs/youtube.html +2 -2
- package/docs/site/index.html +1 -1
- package/docs/website.md +10 -7
- package/docs/youtube.md +1 -1
- package/package.json +1 -1
package/dist/esm/run.js
CHANGED
|
@@ -11,17 +11,19 @@ import { buildAssetPromptMessages, classifyUrl, loadLocalAsset, loadRemoteAsset,
|
|
|
11
11
|
import { createLinkPreviewClient } from './content/index.js';
|
|
12
12
|
import { buildRunMetricsReport } from './costs.js';
|
|
13
13
|
import { createFirecrawlScraper } from './firecrawl.js';
|
|
14
|
-
import { parseDurationMs, parseFirecrawlMode, parseLengthArg, parseMarkdownMode, parseMaxOutputTokensArg, parseMetricsMode, parseRenderMode, parseStreamMode, parseYoutubeMode, } from './flags.js';
|
|
14
|
+
import { parseDurationMs, parseExtractFormat, parseFirecrawlMode, parseLengthArg, parseMarkdownMode, parseMaxOutputTokensArg, parseMetricsMode, parsePreprocessMode, parseRenderMode, parseStreamMode, parseYoutubeMode, } from './flags.js';
|
|
15
15
|
import { generateTextWithModelId, streamTextWithModelId } from './llm/generate-text.js';
|
|
16
16
|
import { resolveGoogleModelForUsage } from './llm/google-models.js';
|
|
17
17
|
import { createHtmlToMarkdownConverter } from './llm/html-to-markdown.js';
|
|
18
18
|
import { normalizeGatewayStyleModelId, parseGatewayStyleModelId } from './llm/model-id.js';
|
|
19
|
+
import { convertToMarkdownWithMarkitdown } from './markitdown.js';
|
|
19
20
|
import { loadLiteLlmCatalog, resolveLiteLlmMaxInputTokensForModelId, resolveLiteLlmMaxOutputTokensForModelId, resolveLiteLlmPricingForModelId, } from './pricing/litellm.js';
|
|
20
|
-
import { buildFileSummaryPrompt, buildLinkSummaryPrompt } from './prompts/index.js';
|
|
21
|
+
import { buildFileSummaryPrompt, buildFileTextSummaryPrompt, buildLinkSummaryPrompt, } from './prompts/index.js';
|
|
21
22
|
import { startOscProgress } from './tty/osc-progress.js';
|
|
22
23
|
import { startSpinner } from './tty/spinner.js';
|
|
23
24
|
import { resolvePackageVersion } from './version.js';
|
|
24
25
|
const BIRD_TIP = 'Tip: Install bird🐦 for better Twitter support: https://github.com/steipete/bird';
|
|
26
|
+
const UVX_TIP = 'Tip: Install uv (uvx) for local Markdown conversion: brew install uv (or set UVX_PATH to your uvx binary).';
|
|
25
27
|
const TWITTER_HOSTS = new Set(['x.com', 'twitter.com', 'mobile.twitter.com']);
|
|
26
28
|
const SUMMARY_LENGTH_MAX_CHARACTERS = {
|
|
27
29
|
short: 1200,
|
|
@@ -58,7 +60,7 @@ function isExecutable(filePath) {
|
|
|
58
60
|
}
|
|
59
61
|
function hasBirdCli(env) {
|
|
60
62
|
const candidates = [];
|
|
61
|
-
const pathEnv = env.PATH ??
|
|
63
|
+
const pathEnv = env.PATH ?? '';
|
|
62
64
|
for (const entry of pathEnv.split(path.delimiter)) {
|
|
63
65
|
if (!entry)
|
|
64
66
|
continue;
|
|
@@ -66,6 +68,19 @@ function hasBirdCli(env) {
|
|
|
66
68
|
}
|
|
67
69
|
return candidates.some((candidate) => isExecutable(candidate));
|
|
68
70
|
}
|
|
71
|
+
/**
 * Reports whether a `uvx` launcher looks usable in the given environment.
 *
 * A non-blank string in `env.UVX_PATH` is accepted as-is (it is not checked
 * for existence or executability). Otherwise every non-empty entry of
 * `env.PATH` is probed for an executable file named `uvx`.
 *
 * @param {object} env - Environment variables map (reads `UVX_PATH` and `PATH`).
 * @returns {boolean} `true` when an override is set or a `uvx` binary is found on PATH.
 */
function hasUvxCli(env) {
    const override = env.UVX_PATH;
    if (typeof override === 'string' && override.trim().length > 0) {
        // An explicit override short-circuits the PATH scan.
        return true;
    }
    const searchPath = env.PATH ?? '';
    return searchPath
        .split(path.delimiter)
        .filter((dir) => dir) // drop empty PATH segments
        .map((dir) => path.join(dir, 'uvx'))
        .some((binary) => isExecutable(binary));
}
|
|
69
84
|
async function readTweetWithBird(args) {
|
|
70
85
|
return await new Promise((resolve, reject) => {
|
|
71
86
|
execFile('bird', ['read', args.url, '--json'], {
|
|
@@ -108,6 +123,14 @@ function withBirdTip(error, url, env) {
|
|
|
108
123
|
const combined = `${message}\n${BIRD_TIP}`;
|
|
109
124
|
return error instanceof Error ? new Error(combined, { cause: error }) : new Error(combined);
|
|
110
125
|
}
|
|
126
|
+
/**
 * Normalizes a thrown value into an `Error`, appending the uvx install tip
 * when no `uvx` launcher is detected for `env`.
 *
 * - If `hasUvxCli(env)` is true, the original `Error` is returned unchanged
 *   (non-Error values are wrapped via `String(error)`).
 * - Otherwise a new `Error` is built whose message is the original message
 *   followed by a newline and `UVX_TIP`; when the input was an `Error` it is
 *   attached as `cause`.
 *
 * @param {unknown} error - The caught value.
 * @param {object} env - Environment variables map passed to `hasUvxCli`.
 * @returns {Error}
 */
function withUvxTip(error, env) {
    const uvxAvailable = hasUvxCli(env);
    const isError = error instanceof Error;
    if (uvxAvailable) {
        return isError ? error : new Error(String(error));
    }
    const detail = isError ? error.message : String(error);
    const combined = `${detail}\n${UVX_TIP}`;
    if (isError) {
        // Keep the original error reachable for debugging.
        return new Error(combined, { cause: error });
    }
    return new Error(combined);
}
|
|
111
134
|
const MAX_TEXT_BYTES_DEFAULT = 10 * 1024 * 1024;
|
|
112
135
|
function buildProgram() {
|
|
113
136
|
return new Command()
|
|
@@ -115,13 +138,19 @@ function buildProgram() {
|
|
|
115
138
|
.description('Summarize web pages and YouTube links (uses direct provider API keys).')
|
|
116
139
|
.argument('[input]', 'URL or local file path to summarize')
|
|
117
140
|
.option('--youtube <mode>', 'YouTube transcript source: auto, web (youtubei/captionTracks), yt-dlp (audio+whisper), apify', 'auto')
|
|
118
|
-
.option('--firecrawl <mode>', 'Firecrawl usage: off, auto (fallback), always (try Firecrawl first).', 'auto')
|
|
119
|
-
.option('--
|
|
141
|
+
.option('--firecrawl <mode>', 'Firecrawl usage: off, auto (fallback), always (try Firecrawl first). Note: in --format md website mode, defaults to always when FIRECRAWL_API_KEY is set (unless --firecrawl is set explicitly).', 'auto')
|
|
142
|
+
.option('--format <format>', 'Website/file content format: md|text. For websites: controls the extraction format. For files: controls whether we try to preprocess to Markdown for model compatibility. (default: text)', 'text')
|
|
143
|
+
.addOption(new Option('--preprocess <mode>', 'Preprocess inputs for model compatibility: off, auto (fallback), always.')
|
|
144
|
+
.choices(['off', 'auto', 'always'])
|
|
145
|
+
.default('auto'))
|
|
146
|
+
.addOption(new Option('--markdown-mode <mode>', 'HTML→Markdown conversion: off, auto (prefer Firecrawl when configured, then LLM when configured, then markitdown when available), llm (force LLM). Only affects --format md for non-YouTube URLs.').default('auto'))
|
|
147
|
+
.addOption(new Option('--markdown <mode>', 'Deprecated alias for --markdown-mode (use --extract --format md --markdown-mode ...)').hideHelp())
|
|
120
148
|
.option('--length <length>', 'Summary length: short|medium|long|xl|xxl or a character limit like 20000, 20k', 'medium')
|
|
121
149
|
.option('--max-output-tokens <count>', 'Hard cap for LLM output tokens (e.g. 2000, 2k). Overrides provider defaults.', undefined)
|
|
122
150
|
.option('--timeout <duration>', 'Timeout for content fetching and LLM request: 30 (seconds), 30s, 2m, 5000ms', '2m')
|
|
123
151
|
.option('--model <model>', 'LLM model id (gateway-style): xai/..., openai/..., google/... (default: google/gemini-3-flash-preview)', undefined)
|
|
124
|
-
.option('--extract
|
|
152
|
+
.option('--extract', 'Print extracted content and exit (no LLM summary)', false)
|
|
153
|
+
.addOption(new Option('--extract-only', 'Deprecated alias for --extract').hideHelp())
|
|
125
154
|
.option('--json', 'Output structured JSON (includes prompt + metrics)', false)
|
|
126
155
|
.option('--stream <mode>', 'Stream LLM output: auto (TTY only), on, off. Note: streaming is disabled in --json mode.', 'auto')
|
|
127
156
|
.option('--render <mode>', 'Render Markdown output: auto (TTY only), md-live, md, plain. Note: auto selects md-live when streaming to a TTY.', 'auto')
|
|
@@ -250,6 +279,30 @@ function getTextContentFromAttachment(attachment) {
|
|
|
250
279
|
}
|
|
251
280
|
return { content: '', bytes: 0 };
|
|
252
281
|
}
|
|
282
|
+
/**
 * Extracts the raw bytes of a file attachment.
 *
 * @param {{ part: { type: string, data?: unknown } }} attachment
 * @returns {Uint8Array | null} The part's `data` when the part type is
 *   `'file'` and the data is a `Uint8Array` (or subclass); otherwise `null`.
 */
function getFileBytesFromAttachment(attachment) {
    const { part } = attachment;
    if (part.type === 'file') {
        const payload = part.data;
        if (payload instanceof Uint8Array) {
            return payload;
        }
    }
    return null;
}
|
|
288
|
+
/**
 * Decides whether a media type is one that gets handed to markitdown for
 * conversion to Markdown (PDF, RTF, HTML/XHTML, legacy MS Office and
 * OpenXML office documents).
 *
 * Matching is case-insensitive and is done on the bare `type/subtype`:
 * any parameters after `;` (e.g. `; charset=utf-8`) and surrounding
 * whitespace are ignored, so a raw Content-Type style value still matches.
 *
 * @param {string} mediaType - Media type, optionally with parameters.
 * @returns {boolean} `true` when the type is convertible; `false` otherwise,
 *   including when `mediaType` is not a string.
 */
function shouldMarkitdownConvertMediaType(mediaType) {
    if (typeof mediaType !== 'string') {
        return false;
    }
    // Strip parameters such as "; charset=utf-8" before comparing.
    const [essence] = mediaType.split(';');
    const mt = essence.trim().toLowerCase();
    const exactMatches = new Set([
        'application/pdf',
        'application/rtf',
        'text/html',
        'application/xhtml+xml',
        'application/msword',
        'application/vnd.ms-excel',
        'application/vnd.ms-powerpoint',
    ]);
    if (exactMatches.has(mt)) {
        return true;
    }
    // Covers docx/xlsx/pptx and the other OpenXML office formats.
    return mt.startsWith('application/vnd.openxmlformats-officedocument.');
}
|
|
253
306
|
function assertProviderSupportsAttachment({ provider, modelId, attachment, }) {
|
|
254
307
|
// xAI via AI SDK currently supports image parts, but not generic file parts (e.g. PDFs).
|
|
255
308
|
if (provider === 'xai' &&
|
|
@@ -314,9 +367,10 @@ function attachRichHelp(program, env, stdout) {
|
|
|
314
367
|
program.addHelpText('after', () => `
|
|
315
368
|
${heading('Examples')}
|
|
316
369
|
${cmd('summarize "https://example.com"')}
|
|
317
|
-
${cmd('summarize "https://example.com" --extract
|
|
318
|
-
${cmd('summarize "https://example.com" --extract
|
|
319
|
-
${cmd('summarize "https://
|
|
370
|
+
${cmd('summarize "https://example.com" --extract')} ${dim('# extracted plain text')}
|
|
371
|
+
${cmd('summarize "https://example.com" --extract --format md')} ${dim('# extracted markdown (prefers Firecrawl when configured)')}
|
|
372
|
+
${cmd('summarize "https://example.com" --extract --format md --markdown-mode llm')} ${dim('# extracted markdown via LLM')}
|
|
373
|
+
${cmd('summarize "https://www.youtube.com/watch?v=I845O57ZSy4&t=11s" --extract --youtube web')}
|
|
320
374
|
${cmd('summarize "https://example.com" --length 20k --max-output-tokens 2k --timeout 2m --model openai/gpt-5.2')}
|
|
321
375
|
${cmd('OPENROUTER_API_KEY=... summarize "https://example.com" --model openai/openai/gpt-oss-20b')}
|
|
322
376
|
${cmd('summarize "https://example.com" --json --verbose')}
|
|
@@ -448,10 +502,11 @@ function writeFinishLine({ stderr, elapsedMs, model, report, costUsd, color, })
|
|
|
448
502
|
stderr.write('\n');
|
|
449
503
|
stderr.write(`${ansi('1;32', line, color)}\n`);
|
|
450
504
|
}
|
|
451
|
-
export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
505
|
+
export async function runCli(argv, { env, fetch, execFile: execFileOverride, stdout, stderr }) {
|
|
452
506
|
;
|
|
453
507
|
globalThis.AI_SDK_LOG_WARNINGS = false;
|
|
454
508
|
const normalizedArgv = argv.filter((arg) => arg !== '--');
|
|
509
|
+
const execFileImpl = execFileOverride ?? execFile;
|
|
455
510
|
const version = resolvePackageVersion();
|
|
456
511
|
const program = buildProgram();
|
|
457
512
|
program.configureOutput({
|
|
@@ -488,7 +543,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
488
543
|
const lengthArg = parseLengthArg(program.opts().length);
|
|
489
544
|
const maxOutputTokensArg = parseMaxOutputTokensArg(program.opts().maxOutputTokens);
|
|
490
545
|
const timeoutMs = parseDurationMs(program.opts().timeout);
|
|
491
|
-
const
|
|
546
|
+
const extractMode = Boolean(program.opts().extract) || Boolean(program.opts().extractOnly);
|
|
492
547
|
const json = Boolean(program.opts().json);
|
|
493
548
|
const streamMode = parseStreamMode(program.opts().stream);
|
|
494
549
|
const renderMode = parseRenderMode(program.opts().render);
|
|
@@ -496,9 +551,20 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
496
551
|
const metricsMode = parseMetricsMode(program.opts().metrics);
|
|
497
552
|
const metricsEnabled = metricsMode !== 'off';
|
|
498
553
|
const metricsDetailed = metricsMode === 'detailed';
|
|
499
|
-
const
|
|
554
|
+
const preprocessMode = parsePreprocessMode(program.opts().preprocess);
|
|
555
|
+
const format = parseExtractFormat(program.opts().format);
|
|
500
556
|
const shouldComputeReport = metricsEnabled;
|
|
501
557
|
const isYoutubeUrl = typeof url === 'string' ? /youtube\.com|youtu\.be/i.test(url) : false;
|
|
558
|
+
const firecrawlExplicitlySet = normalizedArgv.some((arg) => arg === '--firecrawl' || arg.startsWith('--firecrawl='));
|
|
559
|
+
const markdownModeExplicitlySet = normalizedArgv.some((arg) => arg === '--markdown-mode' ||
|
|
560
|
+
arg.startsWith('--markdown-mode=') ||
|
|
561
|
+
arg === '--markdown' ||
|
|
562
|
+
arg.startsWith('--markdown='));
|
|
563
|
+
const markdownMode = format === 'markdown'
|
|
564
|
+
? parseMarkdownMode(program.opts().markdownMode ??
|
|
565
|
+
program.opts().markdown ??
|
|
566
|
+
'auto')
|
|
567
|
+
: 'off';
|
|
502
568
|
const requestedFirecrawlMode = parseFirecrawlMode(program.opts().firecrawl);
|
|
503
569
|
const modelArg = typeof program.opts().model === 'string' ? program.opts().model : null;
|
|
504
570
|
const { config, path: configPath } = loadSummarizeConfig({ env });
|
|
@@ -540,6 +606,12 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
540
606
|
const anthropicConfigured = typeof anthropicApiKey === 'string' && anthropicApiKey.length > 0;
|
|
541
607
|
const openrouterConfigured = typeof openrouterApiKey === 'string' && openrouterApiKey.length > 0;
|
|
542
608
|
const openrouterOptions = openRouterProviders ? { providers: openRouterProviders } : undefined;
|
|
609
|
+
if (markdownModeExplicitlySet && format !== 'markdown') {
|
|
610
|
+
throw new Error('--markdown-mode is only supported with --format md');
|
|
611
|
+
}
|
|
612
|
+
if (markdownModeExplicitlySet && inputTarget.kind !== 'url') {
|
|
613
|
+
throw new Error('--markdown-mode is only supported for website URLs');
|
|
614
|
+
}
|
|
543
615
|
const llmCalls = [];
|
|
544
616
|
let firecrawlRequests = 0;
|
|
545
617
|
let apifyRequests = 0;
|
|
@@ -642,7 +714,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
642
714
|
return streamMode;
|
|
643
715
|
return isRichTty(stdout) ? 'on' : 'off';
|
|
644
716
|
})();
|
|
645
|
-
const streamingEnabled = effectiveStreamMode === 'on' && !json && !
|
|
717
|
+
const streamingEnabled = effectiveStreamMode === 'on' && !json && !extractMode;
|
|
646
718
|
const effectiveRenderMode = (() => {
|
|
647
719
|
if (renderMode !== 'auto')
|
|
648
720
|
return renderMode;
|
|
@@ -661,8 +733,8 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
661
733
|
stderr.write(`metrics apify requests=${report.services.apify.requests}\n`);
|
|
662
734
|
stderr.write(`metrics total tok(i/o/t)=${promptTokens ?? 'unknown'}/${completionTokens ?? 'unknown'}/${totalTokens ?? 'unknown'}\n`);
|
|
663
735
|
};
|
|
664
|
-
if (
|
|
665
|
-
throw new Error('--extract
|
|
736
|
+
if (extractMode && inputTarget.kind !== 'url') {
|
|
737
|
+
throw new Error('--extract is only supported for website/YouTube URLs');
|
|
666
738
|
}
|
|
667
739
|
const progressEnabled = isRichTty(stderr) && !verbose && !json;
|
|
668
740
|
let clearProgressBeforeStdout = null;
|
|
@@ -697,11 +769,6 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
697
769
|
if (!hasRequiredKey) {
|
|
698
770
|
throw new Error(`Missing ${requiredKeyEnv} for model ${parsedModel.canonical}. Set the env var or choose a different --model.`);
|
|
699
771
|
}
|
|
700
|
-
assertProviderSupportsAttachment({
|
|
701
|
-
provider: parsedModel.provider,
|
|
702
|
-
modelId: parsedModel.canonical,
|
|
703
|
-
attachment: { part: attachment.part, mediaType: attachment.mediaType },
|
|
704
|
-
});
|
|
705
772
|
const modelResolution = await resolveModelIdForLlmCall({
|
|
706
773
|
parsedModel,
|
|
707
774
|
apiKeys: { googleApiKey: apiKeysForLlm.googleApiKey },
|
|
@@ -719,14 +786,114 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
719
786
|
if (textContent && textContent.bytes > MAX_TEXT_BYTES_DEFAULT) {
|
|
720
787
|
throw new Error(`Text file too large (${formatBytes(textContent.bytes)}). Limit is ${formatBytes(MAX_TEXT_BYTES_DEFAULT)}.`);
|
|
721
788
|
}
|
|
789
|
+
const fileBytes = getFileBytesFromAttachment(attachment);
|
|
790
|
+
const canPreprocessWithMarkitdown = format === 'markdown' &&
|
|
791
|
+
preprocessMode !== 'off' &&
|
|
792
|
+
hasUvxCli(env) &&
|
|
793
|
+
attachment.part.type === 'file' &&
|
|
794
|
+
fileBytes !== null &&
|
|
795
|
+
shouldMarkitdownConvertMediaType(attachment.mediaType);
|
|
722
796
|
const summaryLengthTarget = lengthArg.kind === 'preset' ? lengthArg.preset : { maxCharacters: lengthArg.maxCharacters };
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
797
|
+
let promptText = '';
|
|
798
|
+
const buildAttachmentPromptPayload = () => {
|
|
799
|
+
promptText = buildFileSummaryPrompt({
|
|
800
|
+
filename: attachment.filename,
|
|
801
|
+
mediaType: attachment.mediaType,
|
|
802
|
+
summaryLength: summaryLengthTarget,
|
|
803
|
+
contentLength: textContent?.content.length ?? null,
|
|
804
|
+
});
|
|
805
|
+
return buildAssetPromptPayload({ promptText, attachment, textContent });
|
|
806
|
+
};
|
|
807
|
+
const buildMarkitdownPromptPayload = (markdown) => {
|
|
808
|
+
promptText = buildFileTextSummaryPrompt({
|
|
809
|
+
filename: attachment.filename,
|
|
810
|
+
originalMediaType: attachment.mediaType,
|
|
811
|
+
contentMediaType: 'text/markdown',
|
|
812
|
+
summaryLength: summaryLengthTarget,
|
|
813
|
+
contentLength: markdown.length,
|
|
814
|
+
});
|
|
815
|
+
return `${promptText}\n\n---\n\n${markdown}`.trim();
|
|
816
|
+
};
|
|
817
|
+
let preprocessedMarkdown = null;
|
|
818
|
+
let usingPreprocessedMarkdown = false;
|
|
819
|
+
if (preprocessMode === 'always' && canPreprocessWithMarkitdown) {
|
|
820
|
+
if (!fileBytes) {
|
|
821
|
+
throw new Error('Internal error: missing file bytes for markitdown preprocessing');
|
|
822
|
+
}
|
|
823
|
+
try {
|
|
824
|
+
preprocessedMarkdown = await convertToMarkdownWithMarkitdown({
|
|
825
|
+
bytes: fileBytes,
|
|
826
|
+
filenameHint: attachment.filename,
|
|
827
|
+
mediaTypeHint: attachment.mediaType,
|
|
828
|
+
uvxCommand: env.UVX_PATH,
|
|
829
|
+
timeoutMs,
|
|
830
|
+
env,
|
|
831
|
+
execFileImpl,
|
|
832
|
+
});
|
|
833
|
+
}
|
|
834
|
+
catch (error) {
|
|
835
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
836
|
+
throw new Error(`Failed to preprocess ${attachment.mediaType} with markitdown: ${message} (disable with --preprocess off).`);
|
|
837
|
+
}
|
|
838
|
+
if (Buffer.byteLength(preprocessedMarkdown, 'utf8') > MAX_TEXT_BYTES_DEFAULT) {
|
|
839
|
+
throw new Error(`Preprocessed Markdown too large (${formatBytes(Buffer.byteLength(preprocessedMarkdown, 'utf8'))}). Limit is ${formatBytes(MAX_TEXT_BYTES_DEFAULT)}.`);
|
|
840
|
+
}
|
|
841
|
+
usingPreprocessedMarkdown = true;
|
|
842
|
+
}
|
|
843
|
+
let promptPayload = buildAttachmentPromptPayload();
|
|
844
|
+
if (usingPreprocessedMarkdown) {
|
|
845
|
+
if (!preprocessedMarkdown) {
|
|
846
|
+
throw new Error('Internal error: missing markitdown content for preprocessing');
|
|
847
|
+
}
|
|
848
|
+
promptPayload = buildMarkitdownPromptPayload(preprocessedMarkdown);
|
|
849
|
+
}
|
|
850
|
+
if (!usingPreprocessedMarkdown) {
|
|
851
|
+
try {
|
|
852
|
+
assertProviderSupportsAttachment({
|
|
853
|
+
provider: parsedModel.provider,
|
|
854
|
+
modelId: parsedModel.canonical,
|
|
855
|
+
attachment: { part: attachment.part, mediaType: attachment.mediaType },
|
|
856
|
+
});
|
|
857
|
+
}
|
|
858
|
+
catch (error) {
|
|
859
|
+
if (!canPreprocessWithMarkitdown) {
|
|
860
|
+
if (format === 'markdown' &&
|
|
861
|
+
preprocessMode !== 'off' &&
|
|
862
|
+
attachment.part.type === 'file' &&
|
|
863
|
+
shouldMarkitdownConvertMediaType(attachment.mediaType) &&
|
|
864
|
+
!hasUvxCli(env)) {
|
|
865
|
+
throw withUvxTip(error, env);
|
|
866
|
+
}
|
|
867
|
+
throw error;
|
|
868
|
+
}
|
|
869
|
+
if (!fileBytes) {
|
|
870
|
+
throw new Error('Internal error: missing file bytes for markitdown preprocessing');
|
|
871
|
+
}
|
|
872
|
+
try {
|
|
873
|
+
preprocessedMarkdown = await convertToMarkdownWithMarkitdown({
|
|
874
|
+
bytes: fileBytes,
|
|
875
|
+
filenameHint: attachment.filename,
|
|
876
|
+
mediaTypeHint: attachment.mediaType,
|
|
877
|
+
uvxCommand: env.UVX_PATH,
|
|
878
|
+
timeoutMs,
|
|
879
|
+
env,
|
|
880
|
+
execFileImpl,
|
|
881
|
+
});
|
|
882
|
+
}
|
|
883
|
+
catch (markitdownError) {
|
|
884
|
+
if (preprocessMode === 'auto') {
|
|
885
|
+
throw error;
|
|
886
|
+
}
|
|
887
|
+
const message = markitdownError instanceof Error ? markitdownError.message : String(markitdownError);
|
|
888
|
+
throw new Error(`Failed to preprocess ${attachment.mediaType} with markitdown: ${message} (disable with --preprocess off).`);
|
|
889
|
+
}
|
|
890
|
+
if (Buffer.byteLength(preprocessedMarkdown, 'utf8') > MAX_TEXT_BYTES_DEFAULT) {
|
|
891
|
+
throw new Error(`Preprocessed Markdown too large (${formatBytes(Buffer.byteLength(preprocessedMarkdown, 'utf8'))}). Limit is ${formatBytes(MAX_TEXT_BYTES_DEFAULT)}.`);
|
|
892
|
+
}
|
|
893
|
+
usingPreprocessedMarkdown = true;
|
|
894
|
+
promptPayload = buildMarkitdownPromptPayload(preprocessedMarkdown);
|
|
895
|
+
}
|
|
896
|
+
}
|
|
730
897
|
const maxInputTokensForCall = await resolveMaxInputTokensForCall(parsedModelEffective.canonical);
|
|
731
898
|
if (typeof maxInputTokensForCall === 'number' &&
|
|
732
899
|
Number.isFinite(maxInputTokensForCall) &&
|
|
@@ -1128,12 +1295,21 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
1128
1295
|
if (!url) {
|
|
1129
1296
|
throw new Error('Only HTTP and HTTPS URLs can be summarized');
|
|
1130
1297
|
}
|
|
1131
|
-
const
|
|
1298
|
+
const wantsMarkdown = format === 'markdown' && !isYoutubeUrl;
|
|
1299
|
+
if (wantsMarkdown && markdownMode === 'off') {
|
|
1300
|
+
throw new Error('--format md conflicts with --markdown-mode off (use --format text)');
|
|
1301
|
+
}
|
|
1302
|
+
const firecrawlMode = (() => {
|
|
1303
|
+
if (wantsMarkdown && !isYoutubeUrl && !firecrawlExplicitlySet && firecrawlConfigured) {
|
|
1304
|
+
return 'always';
|
|
1305
|
+
}
|
|
1306
|
+
return requestedFirecrawlMode;
|
|
1307
|
+
})();
|
|
1132
1308
|
if (firecrawlMode === 'always' && !firecrawlConfigured) {
|
|
1133
1309
|
throw new Error('--firecrawl always requires FIRECRAWL_API_KEY');
|
|
1134
1310
|
}
|
|
1135
|
-
const
|
|
1136
|
-
const
|
|
1311
|
+
const markdownRequested = wantsMarkdown;
|
|
1312
|
+
const effectiveMarkdownMode = markdownRequested ? markdownMode : 'off';
|
|
1137
1313
|
const hasKeyForModel = parsedModelForLlm.provider === 'xai'
|
|
1138
1314
|
? xaiConfigured
|
|
1139
1315
|
: parsedModelForLlm.provider === 'google'
|
|
@@ -1150,16 +1326,16 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
1150
1326
|
: parsedModelForLlm.provider === 'anthropic'
|
|
1151
1327
|
? 'ANTHROPIC_API_KEY'
|
|
1152
1328
|
: 'OPENAI_API_KEY';
|
|
1153
|
-
throw new Error(`--markdown llm requires ${required} for model ${parsedModelForLlm.canonical}`);
|
|
1329
|
+
throw new Error(`--markdown-mode llm requires ${required} for model ${parsedModelForLlm.canonical}`);
|
|
1154
1330
|
}
|
|
1155
|
-
writeVerbose(stderr, verbose, `config url=${url} timeoutMs=${timeoutMs} youtube=${youtubeMode} firecrawl=${firecrawlMode} length=${lengthArg.kind === 'preset' ? lengthArg.preset : `${lengthArg.maxCharacters} chars`} maxOutputTokens=${formatOptionalNumber(maxOutputTokensArg)} json=${json}
|
|
1331
|
+
writeVerbose(stderr, verbose, `config url=${url} timeoutMs=${timeoutMs} youtube=${youtubeMode} firecrawl=${firecrawlMode} length=${lengthArg.kind === 'preset' ? lengthArg.preset : `${lengthArg.maxCharacters} chars`} maxOutputTokens=${formatOptionalNumber(maxOutputTokensArg)} json=${json} extract=${extractMode} format=${format} preprocess=${preprocessMode} markdownMode=${markdownMode} model=${model} stream=${effectiveStreamMode} render=${effectiveRenderMode}`, verboseColor);
|
|
1156
1332
|
writeVerbose(stderr, verbose, `configFile path=${formatOptionalString(configPath)} model=${formatOptionalString(config?.model ?? null)}`, verboseColor);
|
|
1157
1333
|
writeVerbose(stderr, verbose, `env xaiKey=${xaiConfigured} openaiKey=${Boolean(apiKey)} googleKey=${googleConfigured} anthropicKey=${anthropicConfigured} openrouterKey=${openrouterConfigured} apifyToken=${Boolean(apifyToken)} firecrawlKey=${firecrawlConfigured}`, verboseColor);
|
|
1158
1334
|
writeVerbose(stderr, verbose, `markdown requested=${markdownRequested} provider=${markdownProvider}`, verboseColor);
|
|
1159
1335
|
const scrapeWithFirecrawl = firecrawlConfigured && firecrawlMode !== 'off'
|
|
1160
1336
|
? createFirecrawlScraper({ apiKey: firecrawlApiKey, fetchImpl: trackedFetch })
|
|
1161
1337
|
: null;
|
|
1162
|
-
const
|
|
1338
|
+
const llmHtmlToMarkdown = markdownRequested && (effectiveMarkdownMode === 'llm' || markdownProvider !== 'none')
|
|
1163
1339
|
? createHtmlToMarkdownConverter({
|
|
1164
1340
|
modelId: model,
|
|
1165
1341
|
xaiApiKey: xaiConfigured ? xaiApiKey : null,
|
|
@@ -1174,6 +1350,46 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
1174
1350
|
},
|
|
1175
1351
|
})
|
|
1176
1352
|
: null;
|
|
1353
|
+
const markitdownHtmlToMarkdown = markdownRequested && preprocessMode !== 'off' && hasUvxCli(env)
|
|
1354
|
+
? async (args) => {
|
|
1355
|
+
void args.url;
|
|
1356
|
+
void args.title;
|
|
1357
|
+
void args.siteName;
|
|
1358
|
+
return convertToMarkdownWithMarkitdown({
|
|
1359
|
+
bytes: new TextEncoder().encode(args.html),
|
|
1360
|
+
filenameHint: 'page.html',
|
|
1361
|
+
mediaTypeHint: 'text/html',
|
|
1362
|
+
uvxCommand: env.UVX_PATH,
|
|
1363
|
+
timeoutMs: args.timeoutMs,
|
|
1364
|
+
env,
|
|
1365
|
+
execFileImpl,
|
|
1366
|
+
});
|
|
1367
|
+
}
|
|
1368
|
+
: null;
|
|
1369
|
+
const convertHtmlToMarkdown = markdownRequested
|
|
1370
|
+
? async (args) => {
|
|
1371
|
+
if (effectiveMarkdownMode === 'llm') {
|
|
1372
|
+
if (!llmHtmlToMarkdown) {
|
|
1373
|
+
throw new Error('No HTML→Markdown converter configured');
|
|
1374
|
+
}
|
|
1375
|
+
return llmHtmlToMarkdown(args);
|
|
1376
|
+
}
|
|
1377
|
+
if (llmHtmlToMarkdown) {
|
|
1378
|
+
try {
|
|
1379
|
+
return await llmHtmlToMarkdown(args);
|
|
1380
|
+
}
|
|
1381
|
+
catch (error) {
|
|
1382
|
+
if (!markitdownHtmlToMarkdown)
|
|
1383
|
+
throw error;
|
|
1384
|
+
return await markitdownHtmlToMarkdown(args);
|
|
1385
|
+
}
|
|
1386
|
+
}
|
|
1387
|
+
if (markitdownHtmlToMarkdown) {
|
|
1388
|
+
return await markitdownHtmlToMarkdown(args);
|
|
1389
|
+
}
|
|
1390
|
+
throw new Error('No HTML→Markdown converter configured');
|
|
1391
|
+
}
|
|
1392
|
+
: null;
|
|
1177
1393
|
const readTweetWithBirdClient = hasBirdCli(env)
|
|
1178
1394
|
? ({ url, timeoutMs }) => readTweetWithBird({ url, timeoutMs, env })
|
|
1179
1395
|
: null;
|
|
@@ -1367,7 +1583,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
1367
1583
|
const viaSourceLabel = viaSources.length > 0 ? `, ${viaSources.join('+')}` : '';
|
|
1368
1584
|
if (progressEnabled) {
|
|
1369
1585
|
websiteProgress?.stop?.();
|
|
1370
|
-
spinner.setText(
|
|
1586
|
+
spinner.setText(extractMode
|
|
1371
1587
|
? `Extracted (${extractedContentSize}${viaSourceLabel})`
|
|
1372
1588
|
: `Summarizing (sent ${extractedContentSize}${viaSourceLabel})…`);
|
|
1373
1589
|
}
|
|
@@ -1378,6 +1594,14 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
1378
1594
|
writeVerbose(stderr, verbose, `extract transcript textProvided=${extracted.diagnostics.transcript.textProvided} provider=${formatOptionalString(extracted.diagnostics.transcript.provider ?? null)} attemptedProviders=${extracted.diagnostics.transcript.attemptedProviders.length > 0
|
|
1379
1595
|
? extracted.diagnostics.transcript.attemptedProviders.join(',')
|
|
1380
1596
|
: 'none'} notes=${formatOptionalString(extracted.diagnostics.transcript.notes ?? null)}`, verboseColor);
|
|
1597
|
+
if (extractMode &&
|
|
1598
|
+
markdownRequested &&
|
|
1599
|
+
preprocessMode !== 'off' &&
|
|
1600
|
+
effectiveMarkdownMode === 'auto' &&
|
|
1601
|
+
!extracted.diagnostics.markdown.used &&
|
|
1602
|
+
!hasUvxCli(env)) {
|
|
1603
|
+
stderr.write(`${UVX_TIP}\n`);
|
|
1604
|
+
}
|
|
1381
1605
|
const isYouTube = extracted.siteName === 'YouTube';
|
|
1382
1606
|
const prompt = buildLinkSummaryPrompt({
|
|
1383
1607
|
url: extracted.url,
|
|
@@ -1391,7 +1615,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
1391
1615
|
summaryLength: lengthArg.kind === 'preset' ? lengthArg.preset : { maxCharacters: lengthArg.maxCharacters },
|
|
1392
1616
|
shares: [],
|
|
1393
1617
|
});
|
|
1394
|
-
if (
|
|
1618
|
+
if (extractMode) {
|
|
1395
1619
|
clearProgressForStdout();
|
|
1396
1620
|
if (json) {
|
|
1397
1621
|
const finishReport = shouldComputeReport ? await buildReport() : null;
|
|
@@ -1402,6 +1626,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
1402
1626
|
timeoutMs,
|
|
1403
1627
|
youtube: youtubeMode,
|
|
1404
1628
|
firecrawl: firecrawlMode,
|
|
1629
|
+
format,
|
|
1405
1630
|
markdown: effectiveMarkdownMode,
|
|
1406
1631
|
length: lengthArg.kind === 'preset'
|
|
1407
1632
|
? { kind: 'preset', preset: lengthArg.preset }
|
|
@@ -1472,6 +1697,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
1472
1697
|
timeoutMs,
|
|
1473
1698
|
youtube: youtubeMode,
|
|
1474
1699
|
firecrawl: firecrawlMode,
|
|
1700
|
+
format,
|
|
1475
1701
|
markdown: effectiveMarkdownMode,
|
|
1476
1702
|
length: lengthArg.kind === 'preset'
|
|
1477
1703
|
? { kind: 'preset', preset: lengthArg.preset }
|
|
@@ -1744,6 +1970,7 @@ export async function runCli(argv, { env, fetch, stdout, stderr }) {
|
|
|
1744
1970
|
timeoutMs,
|
|
1745
1971
|
youtube: youtubeMode,
|
|
1746
1972
|
firecrawl: firecrawlMode,
|
|
1973
|
+
format,
|
|
1747
1974
|
markdown: effectiveMarkdownMode,
|
|
1748
1975
|
length: lengthArg.kind === 'preset'
|
|
1749
1976
|
? { kind: 'preset', preset: lengthArg.preset }
|