summd 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.js +3 -7
- package/dist/index.js +12 -2
- package/dist/url-to-md.js +209 -0
- package/package.json +7 -2
package/dist/api.js
CHANGED
|
@@ -26,16 +26,12 @@ function text(result) {
|
|
|
26
26
|
return result.content.map(c => c.text).join('');
|
|
27
27
|
}
|
|
28
28
|
// ── Public API ────────────────────────────────────────────────────────────────
|
|
29
|
-
export async function add(content, tags, opts, note) {
|
|
29
|
+
export async function add(content, tags, opts, note, sourceUrl) {
|
|
30
30
|
const args = { content, tags, source_type: 'cli' };
|
|
31
31
|
if (note)
|
|
32
32
|
args.note = note;
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
export async function addUrl(url, tags, opts, note) {
|
|
36
|
-
const args = { url, tags, source_type: 'cli' };
|
|
37
|
-
if (note)
|
|
38
|
-
args.note = note;
|
|
33
|
+
if (sourceUrl)
|
|
34
|
+
args.source_url = sourceUrl;
|
|
39
35
|
return text(await tool('add_entry', args, opts));
|
|
40
36
|
}
|
|
41
37
|
export async function search(query, limit, opts) {
|
package/dist/index.js
CHANGED
|
@@ -5,6 +5,7 @@ import { createInterface } from 'readline';
|
|
|
5
5
|
import { program } from 'commander';
|
|
6
6
|
import { saveKey } from './config.js';
|
|
7
7
|
import * as api from './api.js';
|
|
8
|
+
import { urlToMarkdown } from './url-to-md.js';
|
|
8
9
|
const CODE_EXTS = new Set([
|
|
9
10
|
'js', 'ts', 'jsx', 'tsx', 'mjs', 'cjs',
|
|
10
11
|
'py', 'rb', 'go', 'rs', 'java', 'kt', 'swift', 'c', 'cpp', 'cs',
|
|
@@ -67,9 +68,18 @@ program
|
|
|
67
68
|
.option('-n, --note <text>', 'Personal annotation (not included in AI summarization)')
|
|
68
69
|
.action(async (content, cmd) => {
|
|
69
70
|
const tags = cmd.tag ? cmd.tag.split(',').map(t => t.trim()) : [];
|
|
70
|
-
// URL mode:
|
|
71
|
+
// URL mode: fetch + convert locally (local IP), then send content to server
|
|
71
72
|
if (cmd.url) {
|
|
72
|
-
|
|
73
|
+
let result;
|
|
74
|
+
try {
|
|
75
|
+
process.stderr.write('Fetching…\n');
|
|
76
|
+
result = await urlToMarkdown(cmd.url);
|
|
77
|
+
}
|
|
78
|
+
catch (err) {
|
|
79
|
+
console.error(`URL fetch failed: ${err.message}`);
|
|
80
|
+
process.exit(1);
|
|
81
|
+
}
|
|
82
|
+
console.log(await api.add(result.markdown, tags, globalOpts(), cmd.note, cmd.url));
|
|
73
83
|
return;
|
|
74
84
|
}
|
|
75
85
|
let body = content;
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL → Markdown conversion.
|
|
3
|
+
*
|
|
4
|
+
* YouTube: yt-dlp (install: pip install yt-dlp | brew install yt-dlp)
|
|
5
|
+
* Twitter: fxtwitter API
|
|
6
|
+
* Article: fetch → jsdom + Readability → turndown
|
|
7
|
+
*/
|
|
8
|
+
import { execFile } from 'child_process';
|
|
9
|
+
import { readFileSync, readdirSync, unlinkSync } from 'fs';
|
|
10
|
+
import { tmpdir } from 'os';
|
|
11
|
+
import { join } from 'path';
|
|
12
|
+
import { promisify } from 'util';
|
|
13
|
+
import { JSDOM } from 'jsdom';
|
|
14
|
+
import { Readability } from '@mozilla/readability';
|
|
15
|
+
import TurndownService from 'turndown';
|
|
16
|
+
const execFileAsync = promisify(execFile);
|
|
17
|
+
// ── Public entry point ────────────────────────────────────────────────────────
|
|
18
|
+
export async function urlToMarkdown(url) {
|
|
19
|
+
const videoId = extractYouTubeId(url);
|
|
20
|
+
if (videoId)
|
|
21
|
+
return fetchYouTubeTranscript(url, videoId);
|
|
22
|
+
const twitter = extractTwitterStatusId(url);
|
|
23
|
+
if (twitter)
|
|
24
|
+
return fetchTwitterPost(url, twitter.user, twitter.id);
|
|
25
|
+
// Non-tweet Twitter URLs (profile, home, etc.) fall through to fetchArticle.
|
|
26
|
+
// fetch() returns a JS-SPA shell so Readability will likely throw,
|
|
27
|
+
// but that error is more informative than a pre-emptive rejection.
|
|
28
|
+
return fetchArticle(url);
|
|
29
|
+
}
|
|
30
|
+
// ── URL detection ─────────────────────────────────────────────────────────────
|
|
31
|
+
function extractYouTubeId(url) {
|
|
32
|
+
try {
|
|
33
|
+
const u = new URL(url);
|
|
34
|
+
if (/youtube\.com/.test(u.hostname)) {
|
|
35
|
+
const v = u.searchParams.get('v');
|
|
36
|
+
if (v)
|
|
37
|
+
return v;
|
|
38
|
+
const m = u.pathname.match(/^\/(shorts|live|embed)\/([^/?#]+)/);
|
|
39
|
+
if (m)
|
|
40
|
+
return m[2];
|
|
41
|
+
}
|
|
42
|
+
if (/youtu\.be/.test(u.hostname))
|
|
43
|
+
return u.pathname.slice(1).split('?')[0] || null;
|
|
44
|
+
}
|
|
45
|
+
catch { }
|
|
46
|
+
return null;
|
|
47
|
+
}
|
|
48
|
+
function extractTwitterStatusId(url) {
|
|
49
|
+
try {
|
|
50
|
+
const u = new URL(url);
|
|
51
|
+
if (!/(?:twitter|x)\.com/.test(u.hostname))
|
|
52
|
+
return null;
|
|
53
|
+
const m = u.pathname.match(/^\/([^/]+)\/status\/(\d+)/);
|
|
54
|
+
if (m)
|
|
55
|
+
return { user: m[1], id: m[2] };
|
|
56
|
+
}
|
|
57
|
+
catch { }
|
|
58
|
+
return null;
|
|
59
|
+
}
|
|
60
|
+
// ── YouTube ───────────────────────────────────────────────────────────────────
|
|
61
|
+
async function fetchYouTubeTranscript(url, videoId) {
|
|
62
|
+
// Fetch title and transcript in parallel — they're independent
|
|
63
|
+
const [title, ytDlp] = await Promise.all([
|
|
64
|
+
fetchYouTubeTitle(url, videoId),
|
|
65
|
+
ytDlpTranscript(url, videoId),
|
|
66
|
+
]);
|
|
67
|
+
const body = ytDlp.transcript ?? (ytDlp.reason === 'not-installed'
|
|
68
|
+
? '> Transcript unavailable.\n> Install yt-dlp to enable: `pip install yt-dlp` or `brew install yt-dlp`'
|
|
69
|
+
: '> Transcript unavailable for this video.');
|
|
70
|
+
return { title, markdown: `# ${title}\n\n${body}\n\nsource: ${url}` };
|
|
71
|
+
}
|
|
72
|
+
async function fetchYouTubeTitle(url, videoId) {
|
|
73
|
+
try {
|
|
74
|
+
const res = await fetch(`https://www.youtube.com/oembed?url=${encodeURIComponent(url)}&format=json`, { signal: AbortSignal.timeout(5_000) });
|
|
75
|
+
if (res.ok)
|
|
76
|
+
return (await res.json()).title ?? `YouTube: ${videoId}`;
|
|
77
|
+
}
|
|
78
|
+
catch { }
|
|
79
|
+
return `YouTube: ${videoId}`;
|
|
80
|
+
}
|
|
81
|
+
// yt-dlp — works from any IP, supports auth/geo-block via --cookies-from-browser
|
|
82
|
+
async function ytDlpTranscript(url, videoId) {
|
|
83
|
+
const dir = tmpdir();
|
|
84
|
+
const outTemplate = join(dir, videoId); // output: {videoId}.en.json3
|
|
85
|
+
try {
|
|
86
|
+
await execFileAsync('yt-dlp', [
|
|
87
|
+
'--write-subs',
|
|
88
|
+
'--write-auto-subs',
|
|
89
|
+
'--sub-langs', 'en',
|
|
90
|
+
'--sub-format', 'json3',
|
|
91
|
+
'--skip-download',
|
|
92
|
+
'--quiet',
|
|
93
|
+
'--no-progress',
|
|
94
|
+
'-o', outTemplate,
|
|
95
|
+
url,
|
|
96
|
+
], { timeout: 30_000 });
|
|
97
|
+
}
|
|
98
|
+
catch (e) {
|
|
99
|
+
// Distinguish "yt-dlp not in PATH" from "ran but video has no captions"
|
|
100
|
+
const isNotFound = e.code === 'ENOENT';
|
|
101
|
+
return { transcript: null, reason: isNotFound ? 'not-installed' : 'no-transcript' };
|
|
102
|
+
}
|
|
103
|
+
// Find output file: {videoId}.en.json3, {videoId}.en-US.json3, etc.
|
|
104
|
+
try {
|
|
105
|
+
const files = readdirSync(dir).filter(f => f.startsWith(videoId) && f.endsWith('.json3'));
|
|
106
|
+
for (const file of files) {
|
|
107
|
+
const filePath = join(dir, file);
|
|
108
|
+
try {
|
|
109
|
+
const text = parseTimedText(JSON.parse(readFileSync(filePath, 'utf8')));
|
|
110
|
+
if (text)
|
|
111
|
+
return { transcript: text };
|
|
112
|
+
}
|
|
113
|
+
finally {
|
|
114
|
+
try {
|
|
115
|
+
unlinkSync(filePath);
|
|
116
|
+
}
|
|
117
|
+
catch { }
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
catch { }
|
|
122
|
+
return { transcript: null, reason: 'no-transcript' };
|
|
123
|
+
}
|
|
124
|
+
// json3 parser — shared algorithm with browser extension
|
|
125
|
+
function parseTimedText(data) {
|
|
126
|
+
const paragraphs = [];
|
|
127
|
+
let current = [];
|
|
128
|
+
for (const event of data.events ?? []) {
|
|
129
|
+
if (!event.segs)
|
|
130
|
+
continue;
|
|
131
|
+
const text = event.segs.map(s => s.utf8 ?? '').join('');
|
|
132
|
+
if (text === '\n') {
|
|
133
|
+
const p = current.join('').replace(/\s+/g, ' ').trim();
|
|
134
|
+
if (p)
|
|
135
|
+
paragraphs.push(p);
|
|
136
|
+
current = [];
|
|
137
|
+
}
|
|
138
|
+
else {
|
|
139
|
+
current.push(text.replace(/\n/g, ' '));
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
const last = current.join('').replace(/\s+/g, ' ').trim();
|
|
143
|
+
if (last)
|
|
144
|
+
paragraphs.push(last);
|
|
145
|
+
return paragraphs.join('\n\n');
|
|
146
|
+
}
|
|
147
|
+
// ── Twitter / X ───────────────────────────────────────────────────────────────
|
|
148
|
+
async function fetchTwitterPost(url, user, id) {
|
|
149
|
+
const res = await fetch(`https://api.fxtwitter.com/${user}/status/${id}`, {
|
|
150
|
+
signal: AbortSignal.timeout(8_000),
|
|
151
|
+
});
|
|
152
|
+
if (!res.ok)
|
|
153
|
+
throw new Error(`fxtwitter HTTP ${res.status}`);
|
|
154
|
+
const data = await res.json();
|
|
155
|
+
const tweet = data.tweet;
|
|
156
|
+
if (!tweet)
|
|
157
|
+
throw new Error('No tweet data from fxtwitter');
|
|
158
|
+
const author = tweet.author?.name ?? tweet.author?.screen_name ?? 'Unknown';
|
|
159
|
+
const title = `${author} on X`;
|
|
160
|
+
const photos = (tweet.media?.photos ?? []).map(p => `![](${p.url})`).join('\n');
|
|
161
|
+
const markdown = [
|
|
162
|
+
`# ${title}`,
|
|
163
|
+
tweet.text || null,
|
|
164
|
+
photos || null,
|
|
165
|
+
tweet.created_at?.trim() ?? null,
|
|
166
|
+
`source: ${url}`,
|
|
167
|
+
].filter(Boolean).join('\n\n');
|
|
168
|
+
return { title, markdown };
|
|
169
|
+
}
|
|
170
|
+
// ── Article ───────────────────────────────────────────────────────────────────
|
|
171
|
+
async function fetchArticle(url) {
|
|
172
|
+
const res = await fetch(url, {
|
|
173
|
+
headers: {
|
|
174
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
|
|
175
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
176
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
177
|
+
},
|
|
178
|
+
signal: AbortSignal.timeout(10_000),
|
|
179
|
+
});
|
|
180
|
+
if (!res.ok)
|
|
181
|
+
throw new Error(`HTTP ${res.status}`);
|
|
182
|
+
const ct = res.headers.get('content-type') ?? '';
|
|
183
|
+
if (!ct.includes('html'))
|
|
184
|
+
throw new Error(`Unsupported content type: ${ct}`);
|
|
185
|
+
const html = await res.text();
|
|
186
|
+
const dom = new JSDOM(html, { url });
|
|
187
|
+
const article = new Readability(dom.window.document).parse();
|
|
188
|
+
if (!article?.content)
|
|
189
|
+
throw new Error('Could not extract article content');
|
|
190
|
+
const title = article.title || new URL(url).hostname;
|
|
191
|
+
const body = buildTurndown().turndown(article.content)
|
|
192
|
+
.replace(/\[]\([^)]*\)/g, '') // remove empty links (e.g. <a><img data:…></a>)
|
|
193
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
194
|
+
.trim();
|
|
195
|
+
return { title, markdown: `# ${title}\n\n${body}\n\nsource: ${url}` };
|
|
196
|
+
}
|
|
197
|
+
function buildTurndown() {
|
|
198
|
+
const td = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
|
199
|
+
td.addRule('images', {
|
|
200
|
+
filter: 'img',
|
|
201
|
+
replacement: (_content, node) => {
|
|
202
|
+
const n = node;
|
|
203
|
+
if (!n.src || n.src.startsWith('data:'))
|
|
204
|
+
return '';
|
|
205
|
+
return `![${n.alt}](${n.src})`;
|
|
206
|
+
},
|
|
207
|
+
});
|
|
208
|
+
return td;
|
|
209
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "summd",
|
|
3
|
-
"version": "0.1.4",
|
|
3
|
+
"version": "0.1.5",
|
|
4
4
|
"description": "CLI for sum.md — Sum to anything.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"bin": {
|
|
@@ -17,10 +17,15 @@
|
|
|
17
17
|
"prepublishOnly": "npm run build"
|
|
18
18
|
},
|
|
19
19
|
"dependencies": {
|
|
20
|
-
"commander": "^12.0.0"
|
|
20
|
+
"@mozilla/readability": "^0.6.0",
|
|
21
|
+
"commander": "^12.0.0",
|
|
22
|
+
"jsdom": "^29.0.2",
|
|
23
|
+
"turndown": "^7.2.4"
|
|
21
24
|
},
|
|
22
25
|
"devDependencies": {
|
|
26
|
+
"@types/jsdom": "^28.0.1",
|
|
23
27
|
"@types/node": "^20.0.0",
|
|
28
|
+
"@types/turndown": "^5.0.6",
|
|
24
29
|
"typescript": "^5.0.0"
|
|
25
30
|
},
|
|
26
31
|
"engines": {
|