@steipete/summarize 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +80 -5
- package/README.md +122 -20
- package/dist/cli.cjs +8446 -4360
- package/dist/cli.cjs.map +4 -4
- package/dist/esm/cli-main.js +47 -2
- package/dist/esm/cli-main.js.map +1 -1
- package/dist/esm/config.js +368 -3
- package/dist/esm/config.js.map +1 -1
- package/dist/esm/content/link-preview/content/index.js +13 -0
- package/dist/esm/content/link-preview/content/index.js.map +1 -1
- package/dist/esm/content/link-preview/content/utils.js +3 -1
- package/dist/esm/content/link-preview/content/utils.js.map +1 -1
- package/dist/esm/content/link-preview/content/video.js +96 -0
- package/dist/esm/content/link-preview/content/video.js.map +1 -0
- package/dist/esm/content/link-preview/transcript/providers/youtube/captions.js +21 -21
- package/dist/esm/content/link-preview/transcript/providers/youtube/captions.js.map +1 -1
- package/dist/esm/costs.js.map +1 -1
- package/dist/esm/flags.js +41 -1
- package/dist/esm/flags.js.map +1 -1
- package/dist/esm/generate-free.js +616 -0
- package/dist/esm/generate-free.js.map +1 -0
- package/dist/esm/llm/cli.js +290 -0
- package/dist/esm/llm/cli.js.map +1 -0
- package/dist/esm/llm/generate-text.js +159 -105
- package/dist/esm/llm/generate-text.js.map +1 -1
- package/dist/esm/llm/html-to-markdown.js +4 -2
- package/dist/esm/llm/html-to-markdown.js.map +1 -1
- package/dist/esm/markitdown.js +54 -0
- package/dist/esm/markitdown.js.map +1 -0
- package/dist/esm/model-auto.js +353 -0
- package/dist/esm/model-auto.js.map +1 -0
- package/dist/esm/model-spec.js +82 -0
- package/dist/esm/model-spec.js.map +1 -0
- package/dist/esm/prompts/cli.js +18 -0
- package/dist/esm/prompts/cli.js.map +1 -0
- package/dist/esm/prompts/file.js +21 -2
- package/dist/esm/prompts/file.js.map +1 -1
- package/dist/esm/prompts/index.js +2 -1
- package/dist/esm/prompts/index.js.map +1 -1
- package/dist/esm/prompts/link-summary.js +3 -8
- package/dist/esm/prompts/link-summary.js.map +1 -1
- package/dist/esm/refresh-free.js +667 -0
- package/dist/esm/refresh-free.js.map +1 -0
- package/dist/esm/run.js +1612 -533
- package/dist/esm/run.js.map +1 -1
- package/dist/esm/version.js +1 -1
- package/dist/types/config.d.ts +58 -5
- package/dist/types/content/link-preview/content/types.d.ts +10 -0
- package/dist/types/content/link-preview/content/utils.d.ts +1 -1
- package/dist/types/content/link-preview/content/video.d.ts +5 -0
- package/dist/types/costs.d.ts +2 -1
- package/dist/types/flags.d.ts +7 -0
- package/dist/types/generate-free.d.ts +17 -0
- package/dist/types/llm/cli.d.ts +24 -0
- package/dist/types/llm/generate-text.d.ts +13 -4
- package/dist/types/llm/html-to-markdown.d.ts +9 -3
- package/dist/types/markitdown.d.ts +10 -0
- package/dist/types/model-auto.d.ts +23 -0
- package/dist/types/model-spec.d.ts +33 -0
- package/dist/types/prompts/cli.d.ts +8 -0
- package/dist/types/prompts/file.d.ts +7 -0
- package/dist/types/prompts/index.d.ts +2 -1
- package/dist/types/refresh-free.d.ts +19 -0
- package/dist/types/run.d.ts +3 -1
- package/dist/types/version.d.ts +1 -1
- package/docs/README.md +4 -1
- package/docs/cli.md +95 -0
- package/docs/config.md +123 -1
- package/docs/extract-only.md +10 -7
- package/docs/firecrawl.md +2 -2
- package/docs/llm.md +24 -4
- package/docs/manual-tests.md +40 -0
- package/docs/model-auto.md +92 -0
- package/docs/site/assets/site.js +20 -17
- package/docs/site/docs/config.html +3 -3
- package/docs/site/docs/extract-only.html +7 -5
- package/docs/site/docs/firecrawl.html +6 -6
- package/docs/site/docs/index.html +2 -2
- package/docs/site/docs/llm.html +2 -2
- package/docs/site/docs/openai.html +2 -2
- package/docs/site/docs/website.html +7 -4
- package/docs/site/docs/youtube.html +2 -2
- package/docs/site/index.html +1 -1
- package/docs/smoketest.md +58 -0
- package/docs/website.md +13 -8
- package/docs/youtube.md +1 -1
- package/package.json +8 -4
- package/dist/esm/content/link-preview/transcript/providers/twitter.js +0 -12
- package/dist/esm/content/link-preview/transcript/providers/twitter.js.map +0 -1
- package/dist/esm/content/link-preview/transcript/providers/youtube/ytdlp.js +0 -114
- package/dist/esm/content/link-preview/transcript/providers/youtube/ytdlp.js.map +0 -1
- package/dist/esm/summarizeHome.js +0 -20
- package/dist/esm/summarizeHome.js.map +0 -1
- package/dist/esm/tty/live-markdown.js +0 -52
- package/dist/esm/tty/live-markdown.js.map +0 -1
- package/dist/types/content/link-preview/transcript/providers/twitter.d.ts +0 -3
- package/dist/types/content/link-preview/transcript/providers/youtube/ytdlp.d.ts +0 -3
- package/dist/types/summarizeHome.d.ts +0 -6
- package/dist/types/tty/live-markdown.d.ts +0 -10
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Auto model selection (`--model auto`)
|
|
2
|
+
|
|
3
|
+
`--model auto` picks a model based on input kind + token size, and retries with fallbacks when something fails.
|
|
4
|
+
|
|
5
|
+
This is also the built-in default when you don’t specify a model.
|
|
6
|
+
|
|
7
|
+
## What it does
|
|
8
|
+
|
|
9
|
+
- Builds an ordered list of model “attempts” from `candidates[]` (native first, optional OpenRouter fallback).
|
|
10
|
+
- Skips attempts that don’t have the required API key configured.
|
|
11
|
+
- On any request error, tries the next attempt.
|
|
12
|
+
- If no model is usable, prints the extracted text (no LLM summary). Use `--extract` if you want the raw extracted content even when models are available.
|
|
13
|
+
- Auto prepends CLI attempts only when `cli.enabled` is set (see `docs/cli.md`).
|
|
14
|
+
- CLI attempts are tried in the order given by `cli.enabled`.
|
|
15
|
+
|
|
16
|
+
## OpenRouter vs native
|
|
17
|
+
|
|
18
|
+
Model ids:
|
|
19
|
+
|
|
20
|
+
- Native: `<provider>/<model>` (e.g. `openai/gpt-5-mini`, `google/gemini-3-flash-preview`)
|
|
21
|
+
- Forced OpenRouter: `openrouter/<author>/<slug>` (e.g. `openrouter/meta-llama/llama-3.3-70b-instruct:free`)
|
|
22
|
+
|
|
23
|
+
Behavior:
|
|
24
|
+
|
|
25
|
+
- If you pass an `openrouter/...` model id, the request uses OpenRouter (and requires `OPENROUTER_API_KEY`).
|
|
26
|
+
- If you pass a native model id, the CLI prefers the native provider SDK when its key is available, and can fall back to OpenRouter when no native key exists (and `OPENROUTER_API_KEY` is set).
|
|
27
|
+
|
|
28
|
+
## How selection works
|
|
29
|
+
|
|
30
|
+
- Uses the order you provide in `model.rules[].candidates[]` (or `bands[].candidates[]`).
|
|
31
|
+
- Filters out candidates that can’t fit the prompt (checked against each model’s max input tokens from the LiteLLM catalog).
|
|
32
|
+
- For a native candidate, auto mode may add an OpenRouter fallback attempt right after it (when `OPENROUTER_API_KEY` is set and video understanding isn’t required).
|
|
33
|
+
|
|
34
|
+
Notes:
|
|
35
|
+
|
|
36
|
+
- Auto mode is non-streaming (so a failed attempt won’t partially print output).
|
|
37
|
+
- Video understanding is only attempted when `--video-mode` is `auto` or `understand`, and a video-capable model is selected.
|
|
38
|
+
|
|
39
|
+
## Config
|
|
40
|
+
|
|
41
|
+
Default config file: `~/.summarize/config.json`
|
|
42
|
+
|
|
43
|
+
This file is parsed leniently (JSON5), but **comments are not allowed**.
|
|
44
|
+
|
|
45
|
+
`model.rules` is optional; when omitted, built-in defaults apply.
|
|
46
|
+
|
|
47
|
+
`model.rules[].when` is optional, and when present must be an array (e.g. `["video","youtube"]`).
|
|
48
|
+
|
|
49
|
+
Rules can be either:
|
|
50
|
+
|
|
51
|
+
- `candidates: string[]`
|
|
52
|
+
- `bands: [{ token?: { min?: number; max?: number }, candidates: string[] }]`
|
|
53
|
+
|
|
54
|
+
Example:
|
|
55
|
+
|
|
56
|
+
```json
|
|
57
|
+
{
|
|
58
|
+
"model": {
|
|
59
|
+
"mode": "auto",
|
|
60
|
+
"rules": [
|
|
61
|
+
{
|
|
62
|
+
"when": ["video"],
|
|
63
|
+
"candidates": ["google/gemini-3-flash-preview"]
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"when": ["website", "youtube"],
|
|
67
|
+
"bands": [
|
|
68
|
+
{
|
|
69
|
+
"token": { "max": 8000 },
|
|
70
|
+
"candidates": ["openai/gpt-5-mini"]
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
"candidates": ["xai/grok-4-fast-non-reasoning"]
|
|
74
|
+
}
|
|
75
|
+
]
|
|
76
|
+
},
|
|
77
|
+
{
|
|
78
|
+
"candidates": ["openai/gpt-5-mini", "openrouter/openai/gpt-5-mini"]
|
|
79
|
+
}
|
|
80
|
+
]
|
|
81
|
+
},
|
|
82
|
+
"media": { "videoMode": "auto" }
|
|
83
|
+
}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Minimal shorthand:
|
|
87
|
+
|
|
88
|
+
```json
|
|
89
|
+
{
|
|
90
|
+
"model": "auto"
|
|
91
|
+
}
|
|
92
|
+
```
|
package/docs/site/assets/site.js
CHANGED
|
@@ -31,24 +31,27 @@ const highlightNav = () => {
|
|
|
31
31
|
|
|
32
32
|
const wireCopyButtons = () => {
|
|
33
33
|
const buttons = document.querySelectorAll('[data-copy]')
|
|
34
|
+
const handleCopyClick = async (button) => {
|
|
35
|
+
const selector = button.getAttribute('data-copy')
|
|
36
|
+
const target = selector ? document.querySelector(selector) : null
|
|
37
|
+
const text = target?.textContent?.trim() ?? ''
|
|
38
|
+
if (!text) return
|
|
39
|
+
try {
|
|
40
|
+
await navigator.clipboard.writeText(text)
|
|
41
|
+
const prev = button.textContent ?? ''
|
|
42
|
+
button.textContent = 'Copied'
|
|
43
|
+
button.setAttribute('data-copied', '1')
|
|
44
|
+
window.setTimeout(() => {
|
|
45
|
+
button.textContent = prev
|
|
46
|
+
button.removeAttribute('data-copied')
|
|
47
|
+
}, 900)
|
|
48
|
+
} catch {
|
|
49
|
+
// ignore
|
|
50
|
+
}
|
|
51
|
+
}
|
|
34
52
|
for (const button of buttons) {
|
|
35
|
-
button.addEventListener('click',
|
|
36
|
-
|
|
37
|
-
const target = selector ? document.querySelector(selector) : null
|
|
38
|
-
const text = target?.textContent?.trim() ?? ''
|
|
39
|
-
if (!text) return
|
|
40
|
-
try {
|
|
41
|
-
await navigator.clipboard.writeText(text)
|
|
42
|
-
const prev = button.textContent ?? ''
|
|
43
|
-
button.textContent = 'Copied'
|
|
44
|
-
button.setAttribute('data-copied', '1')
|
|
45
|
-
window.setTimeout(() => {
|
|
46
|
-
button.textContent = prev
|
|
47
|
-
button.removeAttribute('data-copied')
|
|
48
|
-
}, 900)
|
|
49
|
-
} catch {
|
|
50
|
-
// ignore
|
|
51
|
-
}
|
|
53
|
+
button.addEventListener('click', () => {
|
|
54
|
+
void handleCopyClick(button)
|
|
52
55
|
})
|
|
53
56
|
}
|
|
54
57
|
}
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
<a href="./index.html">Overview</a>
|
|
32
32
|
<a href="./website.html">Website mode</a>
|
|
33
33
|
<a href="./youtube.html">YouTube mode</a>
|
|
34
|
-
<a href="./extract-only.html">Extract
|
|
34
|
+
<a href="./extract-only.html">Extract</a>
|
|
35
35
|
<a href="./llm.html">LLM</a>
|
|
36
36
|
<a href="./openai.html">OpenAI</a>
|
|
37
37
|
<a href="./firecrawl.html">Firecrawl</a>
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
<h2>Environment variables</h2>
|
|
47
47
|
<ul>
|
|
48
48
|
<li><code>OPENAI_API_KEY</code>, <code>XAI_API_KEY</code>, <code>GEMINI_API_KEY</code> — provider keys (only needed when a mode actually calls an LLM).</li>
|
|
49
|
-
<li><code>FIRECRAWL_API_KEY</code> — optional extraction fallback
|
|
49
|
+
<li><code>FIRECRAWL_API_KEY</code> — optional extraction fallback / preferred Markdown output in extract mode.</li>
|
|
50
50
|
</ul>
|
|
51
51
|
|
|
52
52
|
<h2>Tips</h2>
|
|
@@ -57,7 +57,7 @@
|
|
|
57
57
|
</ul>
|
|
58
58
|
|
|
59
59
|
<div class="note">
|
|
60
|
-
Want the raw content? Use <code>--extract
|
|
60
|
+
Want the raw content? Use <code>--extract</code> (then decide where the summary should happen).
|
|
61
61
|
</div>
|
|
62
62
|
</article>
|
|
63
63
|
</section>
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
<meta charset="utf-8" />
|
|
5
5
|
<meta name="viewport" content="width=device-width,initial-scale=1" />
|
|
6
6
|
<meta name="color-scheme" content="dark light" />
|
|
7
|
-
<title>Extract
|
|
7
|
+
<title>Extract — summarize</title>
|
|
8
8
|
<link rel="canonical" href="https://summarize.sh/docs/extract-only" />
|
|
9
9
|
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
|
10
10
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
<a href="./index.html">Overview</a>
|
|
32
32
|
<a href="./website.html">Website mode</a>
|
|
33
33
|
<a href="./youtube.html">YouTube mode</a>
|
|
34
|
-
<a href="./extract-only.html">Extract
|
|
34
|
+
<a href="./extract-only.html">Extract</a>
|
|
35
35
|
<a href="./llm.html">LLM</a>
|
|
36
36
|
<a href="./openai.html">OpenAI</a>
|
|
37
37
|
<a href="./firecrawl.html">Firecrawl</a>
|
|
@@ -40,7 +40,7 @@
|
|
|
40
40
|
|
|
41
41
|
<article class="doc reveal">
|
|
42
42
|
<p class="kicker">mode</p>
|
|
43
|
-
<h1>Extract
|
|
43
|
+
<h1>Extract</h1>
|
|
44
44
|
<p>Print the extracted content and stop. No summary call.</p>
|
|
45
45
|
|
|
46
46
|
<h2>Usage</h2>
|
|
@@ -50,7 +50,7 @@
|
|
|
50
50
|
<span class="terminal__dot terminal__dot--b"></span>
|
|
51
51
|
<span class="terminal__dot terminal__dot--c"></span>
|
|
52
52
|
</div>
|
|
53
|
-
<pre><code id="ex-only">summarize --extract
|
|
53
|
+
<pre><code id="ex-only">summarize --extract "https://example.com/article"</code></pre>
|
|
54
54
|
</div>
|
|
55
55
|
<div class="copyRow">
|
|
56
56
|
<span class="hint">Good for piping into your own tooling.</span>
|
|
@@ -63,7 +63,9 @@
|
|
|
63
63
|
<li><code>--verbose</code> — show which extractor ran and why.</li>
|
|
64
64
|
<li><code>--timeout</code> — tune crawling budget (<code>2m</code> default).</li>
|
|
65
65
|
<li><code>--firecrawl off|auto|always</code> — choose the fallback strategy.</li>
|
|
66
|
-
<li><code>--
|
|
66
|
+
<li><code>--format md|text</code> — choose extracted output format.</li>
|
|
67
|
+
<li><code>--markdown-mode off|auto|llm</code> — for non-YouTube URLs with <code>--format md</code>, control HTML→Markdown conversion (LLM) + fallbacks.</li>
|
|
68
|
+
<li><code>--preprocess off|auto|always</code> — controls markitdown usage (default <code>auto</code>).</li>
|
|
67
69
|
</ul>
|
|
68
70
|
</article>
|
|
69
71
|
</section>
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
<a href="./index.html">Overview</a>
|
|
32
32
|
<a href="./website.html">Website mode</a>
|
|
33
33
|
<a href="./youtube.html">YouTube mode</a>
|
|
34
|
-
<a href="./extract-only.html">Extract
|
|
34
|
+
<a href="./extract-only.html">Extract</a>
|
|
35
35
|
<a href="./llm.html">LLM</a>
|
|
36
36
|
<a href="./openai.html">OpenAI</a>
|
|
37
37
|
<a href="./firecrawl.html">Firecrawl</a>
|
|
@@ -41,7 +41,7 @@
|
|
|
41
41
|
<article class="doc reveal">
|
|
42
42
|
<p class="kicker">extractor</p>
|
|
43
43
|
<h1>Firecrawl</h1>
|
|
44
|
-
<p>Used as a fallback when HTML extraction looks blocked or too thin
|
|
44
|
+
<p>Used as a fallback when HTML extraction looks blocked or too thin — and as a preferred Markdown source in extract mode (when configured).</p>
|
|
45
45
|
|
|
46
46
|
<h2>Key</h2>
|
|
47
47
|
<ul>
|
|
@@ -49,14 +49,14 @@
|
|
|
49
49
|
<li>Control behavior with <code>--firecrawl off|auto|always</code>.</li>
|
|
50
50
|
</ul>
|
|
51
51
|
|
|
52
|
-
<h2>Extract
|
|
52
|
+
<h2>Extract + Markdown</h2>
|
|
53
53
|
<ul>
|
|
54
|
-
<li><code>--extract
|
|
55
|
-
<li><code>--
|
|
54
|
+
<li><code>--extract</code> prints the extracted content.</li>
|
|
55
|
+
<li><code>--extract --format md</code> outputs Markdown for non-YouTube URLs.</li>
|
|
56
56
|
</ul>
|
|
57
57
|
|
|
58
58
|
<div class="note">
|
|
59
|
-
If you only want plain text: use <code>--
|
|
59
|
+
If you only want plain text: use <code>--extract --format text</code>.
|
|
60
60
|
</div>
|
|
61
61
|
</article>
|
|
62
62
|
</section>
|
|
@@ -34,7 +34,7 @@
|
|
|
34
34
|
<div class="pillRow">
|
|
35
35
|
<span class="pill"><span class="pill__dot" aria-hidden="true"></span> Website</span>
|
|
36
36
|
<span class="pill"><span class="pill__dot" aria-hidden="true" style="background: var(--accent2)"></span> YouTube</span>
|
|
37
|
-
<span class="pill"><span class="pill__dot" aria-hidden="true" style="background: var(--accent3)"></span> Extract
|
|
37
|
+
<span class="pill"><span class="pill__dot" aria-hidden="true" style="background: var(--accent3)"></span> Extract</span>
|
|
38
38
|
</div>
|
|
39
39
|
</div>
|
|
40
40
|
</div>
|
|
@@ -52,7 +52,7 @@
|
|
|
52
52
|
<div class="small">docs/youtube.md</div>
|
|
53
53
|
</a>
|
|
54
54
|
<a class="card reveal" href="./extract-only.html">
|
|
55
|
-
<h2>Extract
|
|
55
|
+
<h2>Extract</h2>
|
|
56
56
|
<p>Get the cleaned content and stop; perfect for piping.</p>
|
|
57
57
|
<div class="small">docs/extract-only.md</div>
|
|
58
58
|
</a>
|
package/docs/site/docs/llm.html
CHANGED
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
<a href="./index.html">Overview</a>
|
|
32
32
|
<a href="./website.html">Website mode</a>
|
|
33
33
|
<a href="./youtube.html">YouTube mode</a>
|
|
34
|
-
<a href="./extract-only.html">Extract
|
|
34
|
+
<a href="./extract-only.html">Extract</a>
|
|
35
35
|
<a href="./llm.html">LLM</a>
|
|
36
36
|
<a href="./openai.html">OpenAI</a>
|
|
37
37
|
<a href="./firecrawl.html">Firecrawl</a>
|
|
@@ -53,7 +53,7 @@
|
|
|
53
53
|
<h2>Practical advice</h2>
|
|
54
54
|
<ul>
|
|
55
55
|
<li>Pin <code>--model</code> for stable output.</li>
|
|
56
|
-
<li>When using <code>--markdown llm</code>, provider fallback is disabled by design.</li>
|
|
56
|
+
<li>When using <code>--markdown-mode llm</code>, provider fallback is disabled by design.</li>
|
|
57
57
|
<li>For audits / tooling, prefer <code>--json</code> + fixed model.</li>
|
|
58
58
|
</ul>
|
|
59
59
|
</article>
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
<a href="./index.html">Overview</a>
|
|
32
32
|
<a href="./website.html">Website mode</a>
|
|
33
33
|
<a href="./youtube.html">YouTube mode</a>
|
|
34
|
-
<a href="./extract-only.html">Extract
|
|
34
|
+
<a href="./extract-only.html">Extract</a>
|
|
35
35
|
<a href="./llm.html">LLM</a>
|
|
36
36
|
<a href="./openai.html">OpenAI</a>
|
|
37
37
|
<a href="./firecrawl.html">Firecrawl</a>
|
|
@@ -45,7 +45,7 @@
|
|
|
45
45
|
|
|
46
46
|
<h2>Notes</h2>
|
|
47
47
|
<ul>
|
|
48
|
-
<li>Some modes (like <code>--extract
|
|
48
|
+
<li>Some modes (like <code>--extract</code>) don’t need an LLM at all.</li>
|
|
49
49
|
<li>When output is used downstream, prefer <code>--json</code> and pin <code>--model</code>.</li>
|
|
50
50
|
</ul>
|
|
51
51
|
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
<a href="./index.html">Overview</a>
|
|
32
32
|
<a href="./website.html">Website mode</a>
|
|
33
33
|
<a href="./youtube.html">YouTube mode</a>
|
|
34
|
-
<a href="./extract-only.html">Extract
|
|
34
|
+
<a href="./extract-only.html">Extract</a>
|
|
35
35
|
<a href="./llm.html">LLM</a>
|
|
36
36
|
<a href="./openai.html">OpenAI</a>
|
|
37
37
|
<a href="./firecrawl.html">Firecrawl</a>
|
|
@@ -41,20 +41,23 @@
|
|
|
41
41
|
<article class="doc reveal">
|
|
42
42
|
<p class="kicker">mode</p>
|
|
43
43
|
<h1>Website mode</h1>
|
|
44
|
-
<p>Fetch HTML → extract “article-ish” content → normalize to clean text. If extraction looks blocked or too thin, retry via Firecrawl Markdown (optional)
|
|
44
|
+
<p>Fetch HTML → extract “article-ish” content → normalize to clean text. If extraction looks blocked or too thin, retry via Firecrawl Markdown (optional). With <code>--format md</code>, the CLI prefers Firecrawl Markdown when configured and can also convert HTML → Markdown via <code>--markdown-mode</code> (LLM) or <code>uvx markitdown</code>.</p>
|
|
45
45
|
|
|
46
46
|
<h2>Flags</h2>
|
|
47
47
|
<ul>
|
|
48
48
|
<li><code>--firecrawl off|auto|always</code></li>
|
|
49
49
|
<li><code>--timeout 30s|2m|5000ms</code> (default <code>2m</code>)</li>
|
|
50
|
-
<li><code>--extract
|
|
50
|
+
<li><code>--extract</code> (print extracted content; no summary call)</li>
|
|
51
|
+
<li><code>--format md|text</code> (default <code>text</code>)</li>
|
|
52
|
+
<li><code>--markdown-mode off|auto|llm</code> (only with <code>--format md</code>)</li>
|
|
53
|
+
<li><code>--preprocess off|auto|always</code> (default <code>auto</code>; controls markitdown usage)</li>
|
|
51
54
|
<li><code>--json</code> (emit a single JSON object)</li>
|
|
52
55
|
<li><code>--verbose</code> (progress + which extractor was used)</li>
|
|
53
56
|
<li><code>--metrics off|on|detailed</code></li>
|
|
54
57
|
</ul>
|
|
55
58
|
|
|
56
59
|
<div class="note">
|
|
57
|
-
Plain-text mode: <code>--
|
|
60
|
+
Plain-text mode: <code>--extract --format text</code>.
|
|
58
61
|
</div>
|
|
59
62
|
</article>
|
|
60
63
|
</section>
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
<a href="./index.html">Overview</a>
|
|
32
32
|
<a href="./website.html">Website mode</a>
|
|
33
33
|
<a href="./youtube.html">YouTube mode</a>
|
|
34
|
-
<a href="./extract-only.html">Extract
|
|
34
|
+
<a href="./extract-only.html">Extract</a>
|
|
35
35
|
<a href="./llm.html">LLM</a>
|
|
36
36
|
<a href="./openai.html">OpenAI</a>
|
|
37
37
|
<a href="./firecrawl.html">Firecrawl</a>
|
|
@@ -45,7 +45,7 @@
|
|
|
45
45
|
|
|
46
46
|
<h2>Tip</h2>
|
|
47
47
|
<ul>
|
|
48
|
-
<li>If you only want the transcript: use <code>--extract
|
|
48
|
+
<li>If you only want the transcript: use <code>--extract</code>.</li>
|
|
49
49
|
<li>For pipelines: add <code>--json</code>.</li>
|
|
50
50
|
</ul>
|
|
51
51
|
</article>
|
package/docs/site/index.html
CHANGED
|
@@ -104,7 +104,7 @@
|
|
|
104
104
|
</div>
|
|
105
105
|
<div class="card reveal">
|
|
106
106
|
<h2>Built for pipelines</h2>
|
|
107
|
-
<p><code>--extract
|
|
107
|
+
<p><code>--extract</code>, <code>--json</code>, and <code>--metrics</code> make it scriptable.</p>
|
|
108
108
|
<div class="small">Compose it with your own tools</div>
|
|
109
109
|
</div>
|
|
110
110
|
<div class="card reveal">
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Smoke Test Plan (20 combos)
|
|
2
|
+
|
|
3
|
+
Goal: exercise URL + file inputs, extraction + LLM summary paths, multiple models.
|
|
4
|
+
|
|
5
|
+
## Preconditions
|
|
6
|
+
- API keys set for at least: `OPENAI_API_KEY`, `GEMINI_API_KEY`.
|
|
7
|
+
- Optional: `FIRECRAWL_API_KEY` to test fallback (if available).
|
|
8
|
+
|
|
9
|
+
## Models (cheap/fast)
|
|
10
|
+
- `openai/gpt-5-mini`
|
|
11
|
+
- `google/gemini-3-flash-preview`
|
|
12
|
+
|
|
13
|
+
## Matrix (20 cases)
|
|
14
|
+
|
|
15
|
+
### Websites (LLM summary, 10)
|
|
16
|
+
1) Static HTML: `https://example.com` (model: gemini-3-flash)
|
|
17
|
+
2) Wikipedia article: `https://en.wikipedia.org/wiki/Swift_(programming_language)` (model: gpt-5-mini)
|
|
18
|
+
3) MDN doc: `https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/200` (model: gemini-3-flash)
|
|
19
|
+
4) Reuters article: `https://www.reuters.com/world/` (model: gpt-5-mini)
|
|
20
|
+
5) BBC article: `https://www.bbc.com/news` (model: gemini-3-flash)
|
|
21
|
+
6) GitHub README: `https://github.com/vitejs/vite` (model: gpt-5-mini)
|
|
22
|
+
7) Substack post: pick any public post (model: gemini-3-flash)
|
|
23
|
+
8) Medium post: pick any public post (model: gpt-5-mini)
|
|
24
|
+
9) JS-heavy page: `https://vercel.com` (model: gemini-3-flash)
|
|
25
|
+
10) 404 page: `https://example.com/does-not-exist` (model: gpt-5-mini)
|
|
26
|
+
|
|
27
|
+
### YouTube (LLM summary, 2)
|
|
28
|
+
11) Captions available: pick a popular talk/interview (model: gemini-3-flash, `--youtube auto`)
|
|
29
|
+
12) No captions: pick a random channel upload without captions (model: gpt-5-mini, `--youtube auto`)
|
|
30
|
+
|
|
31
|
+
### Remote files (LLM summary, 4)
|
|
32
|
+
13) PDF URL: any public PDF report (model: gemini-3-flash)
|
|
33
|
+
14) PNG URL: `https://upload.wikimedia.org/wikipedia/commons/7/70/Example.png` (model: gpt-5-mini)
|
|
34
|
+
15) MP3 URL: any public MP3 sample (model: gemini-3-flash)
|
|
35
|
+
16) CSV URL: any public CSV sample (model: gpt-5-mini)
|
|
36
|
+
|
|
37
|
+
### Local files (LLM summary, 4)
|
|
38
|
+
17) `tests/fixtures/sample.txt` (create if missing) (model: gemini-3-flash)
|
|
39
|
+
18) `tests/fixtures/sample.md` (create if missing) (model: gpt-5-mini)
|
|
40
|
+
19) `tests/fixtures/sample.json` (create if missing) (model: gemini-3-flash)
|
|
41
|
+
20) `tests/fixtures/sample.png` (create if missing; use a real PNG, not 1x1) (model: gpt-5-mini)
|
|
42
|
+
|
|
43
|
+
## Commands (template)
|
|
44
|
+
- Website: `pnpm summarize -- "<url>" --model <model> --length short`
|
|
45
|
+
- YouTube: `pnpm summarize -- "<url>" --model <model> --youtube auto`
|
|
46
|
+
- File URL: `pnpm summarize -- "<url>" --model <model>`
|
|
47
|
+
- Local file: `pnpm summarize -- "<path>" --model <model>`
|
|
48
|
+
|
|
49
|
+
## Capture
|
|
50
|
+
- Log: stdout + stderr, exit code, and timing line.
|
|
51
|
+
- Note extraction path (HTML vs Firecrawl vs YouTube transcript).
|
|
52
|
+
- File errors: media type rejection, size limits, token preflight.
|
|
53
|
+
|
|
54
|
+
## Bug bar
|
|
55
|
+
- Crash, hang, or non-zero exit.
|
|
56
|
+
- Empty summary with non-empty input.
|
|
57
|
+
- Incorrect mode selection (e.g., YouTube treated as normal URL).
|
|
58
|
+
- Wrong fallback behavior or misleading error text.
|
package/docs/website.md
CHANGED
|
@@ -7,21 +7,26 @@ Use this for non-YouTube URLs.
|
|
|
7
7
|
- Fetches the page HTML.
|
|
8
8
|
- Extracts “article-ish” content and normalizes it into clean text.
|
|
9
9
|
- If extraction looks blocked or too thin, it can retry via Firecrawl (Markdown).
|
|
10
|
-
-
|
|
11
|
-
-
|
|
10
|
+
- If a page is effectively “video-only”, it may treat it as a video input (see `--video-mode`).
|
|
11
|
+
- With `--format md`, the CLI prefers Firecrawl Markdown by default when `FIRECRAWL_API_KEY` is configured.
|
|
12
|
+
- With `--format md`, `--markdown-mode auto|llm` can also convert HTML → Markdown via an LLM using the configured `--model` (no provider fallback).
|
|
13
|
+
- With `--format md`, `--markdown-mode auto` may fall back to `uvx markitdown` when available (disable with `--preprocess off`).
|
|
12
14
|
|
|
13
15
|
## Flags
|
|
14
16
|
|
|
15
17
|
- `--firecrawl off|auto|always`
|
|
16
|
-
- `--
|
|
17
|
-
-
|
|
18
|
+
- `--format md|text` (default: `text`)
|
|
19
|
+
- `--markdown-mode off|auto|llm` (default: `auto`; only affects `--format md` for non-YouTube URLs)
|
|
20
|
+
- `--preprocess off|auto|always` (default: `auto`; controls markitdown usage; `always` only affects file inputs)
|
|
21
|
+
- `--video-mode auto|transcript|understand` (only affects video inputs / video-only pages)
|
|
22
|
+
- Plain-text mode: use `--format text`.
|
|
18
23
|
- `--timeout 30s|30|2m|5000ms` (default: `2m`)
|
|
19
|
-
- `--extract
|
|
24
|
+
- `--extract` (print extracted content; no summary LLM call)
|
|
20
25
|
- `--json` (emit a single JSON object)
|
|
21
26
|
- `--verbose` (progress + which extractor was used)
|
|
22
|
-
- `--metrics off|on|detailed` (default: `on`; `detailed`
|
|
27
|
+
- `--metrics off|on|detailed` (default: `on`; `detailed` adds a compact 2nd-line breakdown on stderr)
|
|
23
28
|
|
|
24
29
|
## API keys
|
|
25
30
|
|
|
26
|
-
- Optional: `FIRECRAWL_API_KEY` (for the Firecrawl fallback)
|
|
27
|
-
- Optional: `XAI_API_KEY` / `OPENAI_API_KEY` / `GEMINI_API_KEY` (also accepts `GOOGLE_GENERATIVE_AI_API_KEY` / `GOOGLE_API_KEY`) (required only when `--markdown llm` is used, or when `--markdown auto` falls back to LLM conversion)
|
|
31
|
+
- Optional: `FIRECRAWL_API_KEY` (for the Firecrawl fallback / preferred Markdown output)
|
|
32
|
+
- Optional: `XAI_API_KEY` / `OPENAI_API_KEY` / `GEMINI_API_KEY` (also accepts `GOOGLE_GENERATIVE_AI_API_KEY` / `GOOGLE_API_KEY`); required only when `--markdown-mode llm` is used, or when `--markdown-mode auto` falls back to LLM conversion.
|
package/docs/youtube.md
CHANGED
package/package.json
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@steipete/summarize",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.5.0",
|
|
4
4
|
"description": "Link → clean text → summary.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
7
|
-
"summarize": "./dist/cli.cjs"
|
|
7
|
+
"summarize": "./dist/cli.cjs",
|
|
8
|
+
"summarizer": "./dist/cli.cjs"
|
|
8
9
|
},
|
|
9
10
|
"main": "./dist/esm/index.js",
|
|
10
11
|
"module": "./dist/esm/index.js",
|
|
@@ -38,6 +39,7 @@
|
|
|
38
39
|
"cheerio": "^1.1.2",
|
|
39
40
|
"es-toolkit": "^1.43.0",
|
|
40
41
|
"gpt-tokenizer": "^3.4.0",
|
|
42
|
+
"json5": "^2.2.3",
|
|
41
43
|
"sanitize-html": "^2.17.0"
|
|
42
44
|
},
|
|
43
45
|
"devDependencies": {
|
|
@@ -57,6 +59,8 @@
|
|
|
57
59
|
"mime": "^4.1.0",
|
|
58
60
|
"ora": "^9.0.0",
|
|
59
61
|
"osc-progress": "^0.1.0",
|
|
62
|
+
"oxlint": "^1.35.0",
|
|
63
|
+
"oxlint-tsgolint": "^0.10.0",
|
|
60
64
|
"tokentally": "github:steipete/tokentally#v0.1.0",
|
|
61
65
|
"tsx": "^4.21.0",
|
|
62
66
|
"typescript": "^5.9.3",
|
|
@@ -71,8 +75,8 @@
|
|
|
71
75
|
"typecheck": "tsc -p tsconfig.build.json --noEmit",
|
|
72
76
|
"summarize": "tsx src/cli.ts",
|
|
73
77
|
"format": "biome format --write .",
|
|
74
|
-
"lint": "biome check .",
|
|
75
|
-
"lint:fix": "biome check --write .",
|
|
78
|
+
"lint": "biome check . && oxlint --type-aware --tsconfig tsconfig.build.json --config .oxlintrc.json .",
|
|
79
|
+
"lint:fix": "biome check --write . && oxlint --type-aware --tsconfig tsconfig.build.json --config .oxlintrc.json --fix .",
|
|
76
80
|
"check": "pnpm lint && pnpm test:coverage",
|
|
77
81
|
"test": "pnpm build && vitest run",
|
|
78
82
|
"test:coverage": "pnpm build && vitest run --coverage",
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
const TWITTER_URL_PATTERN = /twitter\.com|x\.com/i;
|
|
2
|
-
export const canHandle = ({ url }) => TWITTER_URL_PATTERN.test(url);
|
|
3
|
-
export const fetchTranscript = async (_context, _options) => {
|
|
4
|
-
await Promise.resolve();
|
|
5
|
-
return {
|
|
6
|
-
text: null,
|
|
7
|
-
source: null,
|
|
8
|
-
attemptedProviders: [],
|
|
9
|
-
metadata: { provider: 'twitter', reason: 'not_implemented' },
|
|
10
|
-
};
|
|
11
|
-
};
|
|
12
|
-
//# sourceMappingURL=twitter.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"twitter.js","sourceRoot":"","sources":["../../../../../../src/content/link-preview/transcript/providers/twitter.ts"],"names":[],"mappings":"AAEA,MAAM,mBAAmB,GAAG,sBAAsB,CAAA;AAElD,MAAM,CAAC,MAAM,SAAS,GAAG,CAAC,EAAE,GAAG,EAAmB,EAAW,EAAE,CAAC,mBAAmB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;AAE7F,MAAM,CAAC,MAAM,eAAe,GAAG,KAAK,EAClC,QAAyB,EACzB,QAA8B,EACL,EAAE;IAC3B,MAAM,OAAO,CAAC,OAAO,EAAE,CAAA;IACvB,OAAO;QACL,IAAI,EAAE,IAAI;QACV,MAAM,EAAE,IAAI;QACZ,kBAAkB,EAAE,EAAE;QACtB,QAAQ,EAAE,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,iBAAiB,EAAE;KAC7D,CAAA;AACH,CAAC,CAAA"}
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
import { execFile } from 'node:child_process';
|
|
2
|
-
import { promisify } from 'node:util';
|
|
3
|
-
import { fetchWithTimeout } from '../../../fetch-with-timeout.js';
|
|
4
|
-
import { sanitizeYoutubeJsonResponse } from '../../utils.js';
|
|
5
|
-
const execFileAsync = promisify(execFile);
|
|
6
|
-
const isRecord = (value) => typeof value === 'object' && value !== null && !Array.isArray(value);
|
|
7
|
-
const parseJson3Transcript = (raw) => {
|
|
8
|
-
try {
|
|
9
|
-
const parsed = JSON.parse(raw);
|
|
10
|
-
if (!isRecord(parsed)) {
|
|
11
|
-
return null;
|
|
12
|
-
}
|
|
13
|
-
const eventsUnknown = parsed.events;
|
|
14
|
-
if (!Array.isArray(eventsUnknown)) {
|
|
15
|
-
return null;
|
|
16
|
-
}
|
|
17
|
-
const lines = [];
|
|
18
|
-
for (const event of eventsUnknown) {
|
|
19
|
-
if (!isRecord(event)) {
|
|
20
|
-
continue;
|
|
21
|
-
}
|
|
22
|
-
const eventRecord = event;
|
|
23
|
-
const segs = Array.isArray(eventRecord.segs) ? eventRecord.segs : null;
|
|
24
|
-
if (!segs) {
|
|
25
|
-
continue;
|
|
26
|
-
}
|
|
27
|
-
const text = segs
|
|
28
|
-
.map((seg) => {
|
|
29
|
-
if (!isRecord(seg)) {
|
|
30
|
-
return '';
|
|
31
|
-
}
|
|
32
|
-
const segRecord = seg;
|
|
33
|
-
return typeof segRecord.utf8 === 'string' ? segRecord.utf8 : '';
|
|
34
|
-
})
|
|
35
|
-
.join('')
|
|
36
|
-
.trim();
|
|
37
|
-
if (text.length > 0) {
|
|
38
|
-
lines.push(text);
|
|
39
|
-
}
|
|
40
|
-
}
|
|
41
|
-
const transcript = lines.join('\n').trim();
|
|
42
|
-
return transcript.length > 0 ? transcript : null;
|
|
43
|
-
}
|
|
44
|
-
catch {
|
|
45
|
-
return null;
|
|
46
|
-
}
|
|
47
|
-
};
|
|
48
|
-
function pickCaptionUrl(info) {
|
|
49
|
-
const sources = [info.subtitles, info.automatic_captions];
|
|
50
|
-
const candidates = [];
|
|
51
|
-
for (const source of sources) {
|
|
52
|
-
if (!isRecord(source))
|
|
53
|
-
continue;
|
|
54
|
-
for (const [lang, entries] of Object.entries(source)) {
|
|
55
|
-
candidates.push([lang, entries]);
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
const languagePreference = (lang) => {
|
|
59
|
-
const lower = lang.toLowerCase();
|
|
60
|
-
if (lower === 'en')
|
|
61
|
-
return 0;
|
|
62
|
-
if (lower.startsWith('en-'))
|
|
63
|
-
return 1;
|
|
64
|
-
if (lower.startsWith('en'))
|
|
65
|
-
return 2;
|
|
66
|
-
return 10;
|
|
67
|
-
};
|
|
68
|
-
const sorted = candidates.toSorted(([a], [b]) => languagePreference(a) - languagePreference(b));
|
|
69
|
-
for (const [, entries] of sorted) {
|
|
70
|
-
if (!Array.isArray(entries))
|
|
71
|
-
continue;
|
|
72
|
-
const normalized = entries.filter((entry) => isRecord(entry));
|
|
73
|
-
const json3 = normalized.find((entry) => entry.ext === 'json3' && typeof entry.url === 'string');
|
|
74
|
-
if (json3?.url && typeof json3.url === 'string') {
|
|
75
|
-
return json3.url;
|
|
76
|
-
}
|
|
77
|
-
const vtt = normalized.find((entry) => entry.ext === 'vtt' && typeof entry.url === 'string');
|
|
78
|
-
if (vtt?.url && typeof vtt.url === 'string') {
|
|
79
|
-
return vtt.url;
|
|
80
|
-
}
|
|
81
|
-
}
|
|
82
|
-
return null;
|
|
83
|
-
}
|
|
84
|
-
export async function fetchTranscriptWithYtDlp(fetchImpl, url, { timeoutMs } = {}) {
|
|
85
|
-
try {
|
|
86
|
-
const { stdout } = await execFileAsync('yt-dlp', ['--dump-single-json', '--no-playlist', '--no-warnings', url], { timeout: typeof timeoutMs === 'number' && Number.isFinite(timeoutMs) ? timeoutMs : 60_000 });
|
|
87
|
-
const parsed = JSON.parse(stdout);
|
|
88
|
-
if (!isRecord(parsed)) {
|
|
89
|
-
return null;
|
|
90
|
-
}
|
|
91
|
-
const info = parsed;
|
|
92
|
-
const captionUrl = pickCaptionUrl(info);
|
|
93
|
-
if (!captionUrl) {
|
|
94
|
-
return null;
|
|
95
|
-
}
|
|
96
|
-
const response = await fetchWithTimeout(fetchImpl, captionUrl, undefined, 60_000);
|
|
97
|
-
if (!response.ok) {
|
|
98
|
-
return null;
|
|
99
|
-
}
|
|
100
|
-
const raw = await response.text();
|
|
101
|
-
const sanitized = sanitizeYoutubeJsonResponse(raw);
|
|
102
|
-
return parseJson3Transcript(sanitized);
|
|
103
|
-
}
|
|
104
|
-
catch (error) {
|
|
105
|
-
const code = error && typeof error === 'object' && 'code' in error
|
|
106
|
-
? error.code
|
|
107
|
-
: null;
|
|
108
|
-
if (code === 'ENOENT') {
|
|
109
|
-
return null;
|
|
110
|
-
}
|
|
111
|
-
return null;
|
|
112
|
-
}
|
|
113
|
-
}
|
|
114
|
-
//# sourceMappingURL=ytdlp.js.map
|