websnap-reader 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +187 -0
- package/dist/fetcher.d.ts +20 -0
- package/dist/fetcher.d.ts.map +1 -0
- package/dist/fetcher.js +350 -0
- package/dist/fetcher.js.map +1 -0
- package/dist/formatter.d.ts +17 -0
- package/dist/formatter.d.ts.map +1 -0
- package/dist/formatter.js +116 -0
- package/dist/formatter.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +248 -0
- package/dist/index.js.map +1 -0
- package/dist/parser.d.ts +25 -0
- package/dist/parser.d.ts.map +1 -0
- package/dist/parser.js +340 -0
- package/dist/parser.js.map +1 -0
- package/dist/summarizer.d.ts +15 -0
- package/dist/summarizer.d.ts.map +1 -0
- package/dist/summarizer.js +197 -0
- package/dist/summarizer.js.map +1 -0
- package/package.json +44 -0
package/dist/parser.js
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* parser.ts - Extract clean article content from raw HTML
|
|
4
|
+
*
|
|
5
|
+
* Implements a simplified readability algorithm:
|
|
6
|
+
* 1. Strip non-content elements (nav, ads, scripts, styles, etc.)
|
|
7
|
+
* 2. Identify the main content container
|
|
8
|
+
* 3. Extract metadata (title, author, date)
|
|
9
|
+
* 4. Calculate reading statistics
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.parseContent = parseContent;
|
|
13
|
+
// Elements whose markup AND children are dropped before content extraction.
const REMOVE_TAGS = new Set([
    "script", "style", "noscript",
    "iframe", "object", "embed", "applet",
    "link", "meta",
    "svg", "canvas",
    "video", "audio", "source", "track",
    "map", "area",
]);

// Structural elements treated as page chrome (navigation etc.) rather than article content.
const NAV_TAGS = new Set(["nav", "header", "footer", "aside", "menu", "menuitem"]);

// Regexes tested against class/id attribute values; a hit marks the element as noise.
// The `.source` of each entry is spliced into larger expressions, so the
// pattern text must stay exactly as written.
const NOISE_PATTERNS = [
    /\bad[s]?\b/i, /\bbanner\b/i, /\bbreadcrumb/i, /\bcomment/i,
    /\bcommunity/i, /\bcover-wrap/i, /\bfooter/i, /\bheader\b/i,
    /\blegend/i, /\bmenu/i, /\bmodal/i, /\bnav\b/i,
    /\bnavigation/i, /\bpopup/i, /\brelated/i, /\bremark/i,
    /\bsearch/i, /\bshare/i, /\bsidebar/i, /\bsocial/i,
    /\bsponsor/i, /\btags?\b/i, /\btoolbar/i, /\bwidget/i,
    /\bcookie/i, /\bgdpr/i, /\bnewsletter/i, /\bsubscri/i,
    /\bpromo/i, /\brecommend/i, /\btoast/i, /\boverlay/i,
];

// Regexes tested against class/id attribute values; a hit marks a likely content container.
// Order matters: candidates are tried from first to last.
const CONTENT_PATTERNS = [
    /\barticle/i, /\bbody/i, /\bcontent/i, /\bentry/i,
    /\bhentry/i, /\bmain/i, /\bpage/i, /\bpost\b/i,
    /\btext/i, /\bblog/i, /\bstory/i,
];
|
|
91
|
+
/**
 * Turn a raw HTML document into a structured article record.
 *
 * Metadata is read from the untouched document; the body is then stripped of
 * removable tags and noise elements before the main content container is
 * located and converted to plain text.
 *
 * @param {string} html - Full HTML source of the page.
 * @param {string} url - Page URL (accepted for interface compatibility; not read here).
 * @returns {{title: string, author: (string|null), date: (string|null),
 *   siteName: (string|null), description: (string|null), content: string,
 *   textContent: string, wordCount: number, readingTime: string}}
 */
function parseContent(html, url) {
    // Look up the first matching meta tag and post-process it, or yield null.
    const metaOrNull = (selectors, transform) => {
        const raw = extractMeta(html, selectors);
        return raw ? transform(raw) : null;
    };

    const title = extractTitle(html);
    const author = metaOrNull([
        'meta[name="author"]',
        'meta[property="article:author"]',
        'meta[name="sailthru.author"]',
        'meta[name="parsely-author"]',
    ], cleanText);
    const date = metaOrNull([
        'meta[property="article:published_time"]',
        'meta[name="date"]',
        'meta[name="publishdate"]',
        'meta[name="sailthru.date"]',
        'meta[property="og:updated_time"]',
        'meta[name="parsely-pub-date"]',
    ], formatDate);
    const siteName = metaOrNull([
        'meta[property="og:site_name"]',
        'meta[name="application-name"]',
    ], cleanText);
    const description = metaOrNull([
        'meta[property="og:description"]',
        'meta[name="description"]',
        'meta[name="twitter:description"]',
    ], cleanText);

    // Strip, de-noise, then isolate the main content container.
    const withoutTags = removeTagsCompletely(html, REMOVE_TAGS);
    const withoutNoise = removeNoiseElements(withoutTags);
    const contentHtml = extractMainContent(withoutNoise);

    // Reading statistics are computed from the plain-text rendering.
    const textContent = htmlToPlainText(contentHtml);
    const wordCount = (textContent.match(/\S+/g) || []).length;

    return {
        title,
        author,
        date,
        siteName,
        description,
        content: contentHtml,
        textContent,
        wordCount,
        readingTime: formatReadingTime(wordCount),
    };
}
|
|
141
|
+
/**
 * Determine the page title, trying sources in this order:
 * `og:title` meta, `twitter:title` meta, the `<title>` element, the first
 * `<h1>`, and finally the literal "Untitled".
 *
 * For the `<title>` element a trailing site-name segment is removed
 * ("Story | Site", "Story - Site"). A hyphen or dash only counts as a
 * separator when it has whitespace on both sides, so hyphenated words such as
 * "Self-driving" are left intact; a pipe counts with or without whitespace.
 *
 * @param {string} html - Full HTML source of the page.
 * @returns {string} The cleaned title, never empty.
 */
function extractTitle(html) {
    // Try og:title first
    const ogTitle = extractMetaContent(html, 'property="og:title"');
    if (ogTitle) {
        return cleanText(ogTitle);
    }
    // Try twitter:title
    const twitterTitle = extractMetaContent(html, 'name="twitter:title"');
    if (twitterTitle) {
        return cleanText(twitterTitle);
    }
    // Try <title> tag
    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    if (titleMatch) {
        const fullTitle = cleanText(titleMatch[1]);
        // One separator: a pipe, or a hyphen / en dash / em dash surrounded by whitespace.
        const separator = String.raw`(?:\s*\|\s*|\s+[-\u2013\u2014]\s+)`;
        // Remove the LAST separator and everything after it (the text that
        // follows must not itself contain another separator).
        const siteSuffix = new RegExp(`${separator}(?:(?!${separator})[\\s\\S])*$`);
        const stripped = fullTitle.replace(siteSuffix, "").trim();
        if (stripped) {
            return stripped;
        }
        // Stripping consumed everything (e.g. "| Site"): keep the full text instead.
        if (fullTitle) {
            return fullTitle;
        }
        // An empty <title> falls through to the <h1> lookup below.
    }
    // Try first <h1>
    const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
    if (h1Match) {
        const heading = cleanText(h1Match[1]);
        if (heading) {
            return heading;
        }
    }
    return "Untitled";
}
|
|
167
|
+
/**
 * Return the content of the first meta tag matched by a list of selectors.
 *
 * Each selector must look like `meta[attr="value"]`; entries in any other
 * shape are skipped. Selectors are tried in order and the first non-empty
 * content wins.
 *
 * @param {string} html - Full HTML source of the page.
 * @param {string[]} selectors - Candidate selectors, highest priority first.
 * @returns {(string|null)} Raw content value, or null when nothing matched.
 */
function extractMeta(html, selectors) {
    const selectorShape = /meta\[(\w+)="([^"]+)"\]/;
    for (const selector of selectors) {
        const parsed = selector.match(selectorShape);
        if (parsed === null) {
            continue;
        }
        const found = extractMetaContent(html, `${parsed[1]}="${parsed[2]}"`);
        if (found) {
            return found;
        }
    }
    return null;
}
|
|
183
|
+
/**
 * Read the `content` attribute of the first `<meta>` tag that carries a given
 * attribute/value pair and has non-empty content.
 *
 * Every `<meta>` tag is parsed into its attributes, so the lookup works
 * regardless of attribute order or quoting style (double, single or unquoted)
 * and is not fooled by look-alike attributes such as `data-content`.
 * Attribute names and the wanted value are compared case-insensitively.
 *
 * @param {string} html - Full HTML source of the page.
 * @param {string} attrString - Attribute to look for, written as `name="value"`
 *   (for example `property="og:title"`).
 * @returns {(string|null)} The content value, or null when no tag qualifies.
 */
function extractMetaContent(html, attrString) {
    const wanted = /^\s*([^\s=]+)\s*=\s*(["'])([\s\S]*)\2\s*$/.exec(attrString);
    const wantedName = wanted ? wanted[1].toLowerCase() : null;
    const wantedValue = wanted ? wanted[3].toLowerCase() : null;
    // Fallback for an attrString that is not in name="value" form:
    // plain case-insensitive substring test against the tag text.
    const rawNeedle = String(attrString).toLowerCase();

    // A whole <meta ...> tag; quoted values may themselves contain ">".
    const metaTag = /<meta(?=[\s/>])(?:[^>"']|"[^"]*"|'[^']*')*>/gi;
    // One attribute: name = "value" | 'value' | bareValue.
    const attribute = /([^\s"'=<>/]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;

    for (const [tag] of html.matchAll(metaTag)) {
        const attrs = new Map();
        for (const [, name, doubleQuoted, singleQuoted, bare] of tag.matchAll(attribute)) {
            const key = name.toLowerCase();
            // The first occurrence of a duplicated attribute wins.
            if (!attrs.has(key)) {
                attrs.set(key, doubleQuoted ?? singleQuoted ?? bare ?? "");
            }
        }
        const isTarget = wantedName !== null
            ? attrs.has(wantedName) && attrs.get(wantedName).toLowerCase() === wantedValue
            : tag.toLowerCase().includes(rawNeedle);
        if (!isTarget) {
            continue;
        }
        const content = attrs.get("content");
        // Empty content is treated as "not found"; keep scanning later tags.
        if (content) {
            return content;
        }
    }
    return null;
}
|
|
202
|
+
/**
 * Remove every occurrence of the given elements, including their children.
 *
 * Three forms are handled for each tag name:
 *   1. paired tags with content (`<script>…</script>`),
 *   2. self-closing tags (`<meta … />`),
 *   3. for HTML void elements only, bare opening tags with no slash
 *      (`<meta …>`, `<link …>`), which have no closing tag to pair with.
 *
 * Tag names are matched as whole names, so removing `meta` leaves
 * `<metadata>` untouched.
 *
 * @param {string} html - Markup to clean.
 * @param {Iterable<string>} tags - Tag names to remove.
 * @returns {string} The markup with those elements removed.
 */
function removeTagsCompletely(html, tags) {
    // Elements that never have a closing tag in HTML.
    const VOID_ELEMENTS = new Set([
        "area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "source", "track", "wbr",
    ]);
    let result = html;
    for (const tag of tags) {
        const name = String(tag).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        const withContent = new RegExp(`<${name}[\\s>][\\s\\S]*?<\\/${name}>`, "gi");
        // The lookahead keeps "<meta" from matching the start of "<metadata".
        const selfClosing = new RegExp(`<${name}(?=[\\s/>])[^>]*/\\s*>`, "gi");
        result = result.replace(withContent, "").replace(selfClosing, "");
        if (VOID_ELEMENTS.has(String(tag).toLowerCase())) {
            // Whatever is left of a void element is a lone opening tag.
            const bareVoid = new RegExp(`<${name}(?=[\\s/>])[^>]*>`, "gi");
            result = result.replace(bareVoid, "");
        }
    }
    return result;
}
|
|
216
|
+
/**
 * Strip page chrome and other non-content elements from the markup.
 *
 * Runs three regex passes in order:
 *   1. every element named in NAV_TAGS,
 *   2. container elements (div, section, aside, ul, ol, form, figure) whose
 *      class or id matches one of NOISE_PATTERNS,
 *   3. elements hidden via `display:none`, `visibility:hidden` or
 *      `aria-hidden="true"`.
 *
 * This is a heuristic: matching is lazy, so an element that contains a nested
 * element of the same name is only removed up to the first inner closing tag.
 *
 * @param {string} html - Markup to clean.
 * @returns {string} The markup with noise elements removed.
 */
function removeNoiseElements(html) {
    let result = html;
    // Remove nav-type tags
    for (const tag of NAV_TAGS) {
        const regex = new RegExp(`<${tag}[\\s>][\\s\\S]*?<\\/${tag}>`, "gi");
        result = result.replace(regex, "");
    }
    // Remove containers whose class/id value matches a noise pattern.
    for (const pattern of NOISE_PATTERNS) {
        const tagRegex = new RegExp(`<(div|section|aside|ul|ol|form|figure)[^>]*(?:class|id)="[^"]*${pattern.source}[^"]*"[\\s\\S]*?<\\/\\1>`, "gi");
        result = result.replace(tagRegex, "");
    }
    // Remove hidden elements. The closing tag must carry the SAME name as the
    // opening tag (\1); otherwise a hidden void element such as
    // <img aria-hidden="true"> would swallow the visible content after it.
    result = result.replace(/<([a-z][a-z0-9-]*)\s[^>]*(?:display\s*:\s*none|visibility\s*:\s*hidden|aria-hidden="true")[^>]*>[\s\S]*?<\/\1\s*>/gi, "");
    return result;
}
|
|
238
|
+
/**
 * Locate the markup most likely to hold the article body.
 *
 * Candidates are tried in order and the first whose inner HTML is longer than
 * 200 characters wins:
 *   1. the first `<article>` element,
 *   2. the first `<main>` element,
 *   3. the first element with `role="main"`,
 *   4. for each CONTENT_PATTERNS entry in turn, the longest `<div>`/`<section>`
 *      whose class or id matches it.
 * If none qualifies, the `<body>` content is returned, or the input itself
 * when there is no `<body>`.
 *
 * Matching is regex based and lazy: a candidate that contains a nested
 * element of the same name is cut at the first inner closing tag.
 *
 * @param {string} html - Cleaned markup.
 * @returns {string} Inner HTML of the chosen container.
 */
function extractMainContent(html) {
    const MIN_LENGTH = 200;
    // Try to find <article> tag first
    const articleMatch = html.match(/<article[^>]*>([\s\S]*?)<\/article>/i);
    if (articleMatch && articleMatch[1].length > MIN_LENGTH) {
        return articleMatch[1];
    }
    // Try <main> tag
    const mainMatch = html.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
    if (mainMatch && mainMatch[1].length > MIN_LENGTH) {
        return mainMatch[1];
    }
    // Try [role="main"]. The closing tag must have the same name as the
    // opening tag (\1); stopping at the first closing tag of ANY element
    // would truncate the content at its first child.
    const roleMainMatch = html.match(/<([a-z][a-z0-9-]*)\s[^>]*role=(["'])main\2[^>]*>([\s\S]*?)<\/\1\s*>/i)
        || html.match(/<([a-z][a-z0-9-]*)\s+role=(["'])main\2[^>]*>([\s\S]*?)<\/\1\s*>/i);
    if (roleMainMatch && roleMainMatch[3].length > MIN_LENGTH) {
        return roleMainMatch[3];
    }
    // Try content-indicative class/id
    for (const pattern of CONTENT_PATTERNS) {
        const regex = new RegExp(`<(div|section)[^>]*(?:class|id)="[^"]*${pattern.source}[^"]*"[^>]*>([\\s\\S]*?)<\\/\\1>`, "gi");
        let best = "";
        for (const match of html.matchAll(regex)) {
            if (match[2].length > best.length) {
                best = match[2];
            }
        }
        if (best.length > MIN_LENGTH) {
            return best;
        }
    }
    // Fallback: find <body> content
    const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    if (bodyMatch) {
        return bodyMatch[1];
    }
    return html;
}
|
|
276
|
+
/**
 * Convert an HTML fragment to plain text.
 *
 * Line breaks and block-level boundaries become newlines, all remaining tags
 * are dropped, character references are decoded, and whitespace is tidied
 * (runs of spaces/tabs collapse to one space, three or more newlines collapse
 * to a blank line, the result is trimmed).
 *
 * Character references are decoded in a single pass so that escaped text is
 * decoded exactly once (`&amp;lt;` becomes `&lt;`, not `<`). Supported:
 * `&amp; &lt; &gt; &quot; &apos; &nbsp;`, decimal (`&#233;`) and hexadecimal
 * (`&#xE9;`) references. Numeric references that are not valid Unicode scalar
 * values are left as written.
 *
 * @param {string} html - HTML fragment.
 * @returns {string} Plain-text rendering.
 */
function htmlToPlainText(html) {
    // A non-breaking space is rendered as a regular space.
    const NAMED_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", nbsp: " " };
    const decodeEntity = (entity, decimal, hex, name) => {
        if (name !== undefined) {
            return NAMED_ENTITIES[name];
        }
        const codePoint = decimal !== undefined
            ? Number.parseInt(decimal, 10)
            : Number.parseInt(hex, 16);
        const isScalarValue = codePoint > 0
            && codePoint <= 0x10ffff
            && !(codePoint >= 0xd800 && codePoint <= 0xdfff);
        // fromCodePoint (unlike fromCharCode) handles code points above U+FFFF.
        return isScalarValue ? String.fromCodePoint(codePoint) : entity;
    };
    return html
        // Replace <br> and block elements with newlines
        .replace(/<br\s*\/?>/gi, "\n")
        .replace(/<\/(p|div|li|h[1-6]|blockquote|tr)>/gi, "\n")
        .replace(/<(p|div|li|h[1-6]|blockquote|tr)[^>]*>/gi, "\n")
        // Remove all remaining HTML tags
        .replace(/<[^>]+>/g, "")
        // Decode character references (after tag removal, so decoded "<" is kept as text)
        .replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos|nbsp));/g, decodeEntity)
        // Clean up whitespace
        .replace(/[ \t]+/g, " ")
        .replace(/\n{3,}/g, "\n\n")
        .trim();
}
|
|
300
|
+
/**
 * Normalise a short piece of extracted text (title, author, description).
 *
 * Tags are removed, character references are decoded, and every run of
 * whitespace collapses to a single space with the ends trimmed.
 *
 * Character references are decoded in a single pass so that escaped text is
 * decoded exactly once (`&amp;quot;` becomes `&quot;`, not `"`). Supported:
 * `&amp; &lt; &gt; &quot; &apos; &nbsp;`, decimal and hexadecimal references.
 * Numeric references that are not valid Unicode scalar values are left as
 * written.
 *
 * @param {string} text - Raw text, possibly containing markup.
 * @returns {string} Single-line cleaned text.
 */
function cleanText(text) {
    // A non-breaking space is rendered as a regular space.
    const NAMED_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", nbsp: " " };
    const decodeEntity = (entity, decimal, hex, name) => {
        if (name !== undefined) {
            return NAMED_ENTITIES[name];
        }
        const codePoint = decimal !== undefined
            ? Number.parseInt(decimal, 10)
            : Number.parseInt(hex, 16);
        const isScalarValue = codePoint > 0
            && codePoint <= 0x10ffff
            && !(codePoint >= 0xd800 && codePoint <= 0xdfff);
        return isScalarValue ? String.fromCodePoint(codePoint) : entity;
    };
    return text
        .replace(/<[^>]+>/g, "")
        .replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos|nbsp));/g, decodeEntity)
        .replace(/\s+/g, " ")
        .trim();
}
|
|
315
|
+
/**
 * Format a date string as a long US-English date ("January 15, 2024").
 *
 * Strings that `Date` cannot parse are returned unchanged. When the string
 * starts with an ISO calendar date (`YYYY-MM-DD`), that calendar date is
 * formatted as written, so the result does not depend on the time zone of the
 * machine running the code. Other parseable strings are formatted in the
 * local time zone.
 *
 * @param {string} dateStr - Date as found in page metadata.
 * @returns {string} Formatted date, or the input when it cannot be parsed.
 */
function formatDate(dateStr) {
    const DISPLAY = { year: "numeric", month: "long", day: "numeric" };
    try {
        const parsed = new Date(dateStr);
        if (Number.isNaN(parsed.getTime())) {
            return dateStr;
        }
        const isoDay = /^\s*(\d{4})-(\d{2})-(\d{2})(?!\d)/.exec(String(dateStr));
        if (isoDay) {
            // Build the date at UTC midnight and format it in UTC so the
            // day written in the string is the day that is displayed.
            const year = Number(isoDay[1]);
            const month = Number(isoDay[2]);
            const day = Number(isoDay[3]);
            const calendarDay = new Date(Date.UTC(year, month - 1, day));
            return calendarDay.toLocaleDateString("en-US", { ...DISPLAY, timeZone: "UTC" });
        }
        return parsed.toLocaleDateString("en-US", DISPLAY);
    }
    catch {
        return dateStr;
    }
}
|
|
333
|
+
/**
 * Estimate reading time from a word count.
 *
 * The count is divided by 238 and rounded up to whole minutes; the result is
 * never less than one minute.
 *
 * @param {number} wordCount - Number of words in the article.
 * @returns {string} Label of the form "N min read".
 */
function formatReadingTime(wordCount) {
    const wordsPerMinute = 238;
    const estimate = Math.ceil(wordCount / wordsPerMinute);
    const minutes = Math.max(1, estimate);
    return minutes + " min read";
}
|
|
340
|
+
//# sourceMappingURL=parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parser.js","sourceRoot":"","sources":["../src/parser.ts"],"names":[],"mappings":";AAAA;;;;;;;;GAQG;;AAmGH,oCAiDC;AAtID,iDAAiD;AACjD,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC;IAC1B,QAAQ;IACR,OAAO;IACP,UAAU;IACV,QAAQ;IACR,QAAQ;IACR,OAAO;IACP,QAAQ;IACR,MAAM;IACN,MAAM;IACN,KAAK;IACL,QAAQ;IACR,OAAO;IACP,OAAO;IACP,QAAQ;IACR,OAAO;IACP,KAAK;IACL,MAAM;CACP,CAAC,CAAC;AAEH,sDAAsD;AACtD,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC;IACvB,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,OAAO;IACP,MAAM;IACN,UAAU;CACX,CAAC,CAAC;AAEH,8CAA8C;AAC9C,MAAM,cAAc,GAAG;IACrB,aAAa;IACb,aAAa;IACb,eAAe;IACf,YAAY;IACZ,cAAc;IACd,eAAe;IACf,WAAW;IACX,aAAa;IACb,WAAW;IACX,SAAS;IACT,UAAU;IACV,UAAU;IACV,eAAe;IACf,UAAU;IACV,YAAY;IACZ,WAAW;IACX,WAAW;IACX,UAAU;IACV,YAAY;IACZ,WAAW;IACX,YAAY;IACZ,YAAY;IACZ,YAAY;IACZ,WAAW;IACX,WAAW;IACX,SAAS;IACT,eAAe;IACf,YAAY;IACZ,UAAU;IACV,cAAc;IACd,UAAU;IACV,YAAY;CACb,CAAC;AAEF,0CAA0C;AAC1C,MAAM,gBAAgB,GAAG;IACvB,YAAY;IACZ,SAAS;IACT,YAAY;IACZ,UAAU;IACV,WAAW;IACX,SAAS;IACT,SAAS;IACT,WAAW;IACX,SAAS;IACT,SAAS;IACT,UAAU;CACX,CAAC;AAEF;;GAEG;AACH,SAAgB,YAAY,CAAC,IAAY,EAAE,GAAW;IACpD,qCAAqC;IACrC,MAAM,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IACjC,MAAM,MAAM,GAAG,WAAW,CAAC,IAAI,EAAE;QAC/B,qBAAqB;QACrB,iCAAiC;QACjC,8BAA8B;QAC9B,6BAA6B;KAC9B,CAAC,CAAC;IACH,MAAM,IAAI,GAAG,WAAW,CAAC,IAAI,EAAE;QAC7B,yCAAyC;QACzC,mBAAmB;QACnB,0BAA0B;QAC1B,4BAA4B;QAC5B,kCAAkC;QAClC,+BAA+B;KAChC,CAAC,CAAC;IACH,MAAM,QAAQ,GAAG,WAAW,CAAC,IAAI,EAAE;QACjC,+BAA+B;QAC/B,+BAA+B;KAChC,CAAC,CAAC;IACH,MAAM,WAAW,GAAG,WAAW,CAAC,IAAI,EAAE;QACpC,iCAAiC;QACjC,0BAA0B;QAC1B,kCAAkC;KACnC,CAAC,CAAC;IAEH,4BAA4B;IAC5B,IAAI,WAAW,GAAG,oBAAoB,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;IAC1D,WAAW,GAAG,mBAAmB,CAAC,WAAW,CAAC,CAAC;IAC/C,MAAM,WAAW,GAAG,kBAAkB,CAAC,WAAW,CAAC,CAAC;IAEpD,iBAAiB;IACjB,MAAM,WAAW,GAAG,eAAe,CAAC,WAAW,CAAC,CAAC;IACjD,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACnE,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC;IAC/B,MAAM,WAAW,GAAG,iBAAiB,CAAC,SAAS,CAAC,CAAC;IAEjD,OAAO;QACL,KAAK;QACL,
MAAM,EAAE,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI;QACzC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI;QACpC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI;QAC/C,WAAW,EAAE,WAAW,CAAC,CAAC,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI;QACxD,OAAO,EAAE,WAAW;QACpB,WAAW;QACX,SAAS;QACT,WAAW;KACZ,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,CAAC,IAAY;IAChC,qBAAqB;IACrB,MAAM,OAAO,GAAG,kBAAkB,CAAC,IAAI,EAAE,qBAAqB,CAAC,CAAC;IAChE,IAAI,OAAO;QAAE,OAAO,SAAS,CAAC,OAAO,CAAC,CAAC;IAEvC,oBAAoB;IACpB,MAAM,YAAY,GAAG,kBAAkB,CAAC,IAAI,EAAE,sBAAsB,CAAC,CAAC;IACtE,IAAI,YAAY;QAAE,OAAO,SAAS,CAAC,YAAY,CAAC,CAAC;IAEjD,kBAAkB;IAClB,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IAClE,IAAI,UAAU,EAAE,CAAC;QACf,IAAI,KAAK,GAAG,SAAS,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;QACrC,yEAAyE;QACzE,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,8CAA8C,EAAE,EAAE,CAAC,CAAC;QAC1E,OAAO,KAAK,CAAC;IACf,CAAC;IAED,iBAAiB;IACjB,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;IACzD,IAAI,OAAO;QAAE,OAAO,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;IAE1C,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,IAAY,EAAE,SAAmB;IACpD,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,qDAAqD;QACrD,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAC9B,yBAAyB,CAC1B,CAAC;QACF,IAAI,CAAC,SAAS;YAAE,SAAS;QAEzB,MAAM,CAAC,EAAE,QAAQ,EAAE,SAAS,CAAC,GAAG,SAAS,CAAC;QAC1C,MAAM,KAAK,GAAG,kBAAkB,CAAC,IAAI,EAAE,GAAG,QAAQ,KAAK,SAAS,GAAG,CAAC,CAAC;QACrE,IAAI,KAAK;YAAE,OAAO,KAAK,CAAC;IAC1B,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CAAC,IAAY,EAAE,UAAkB;IAC1D,8DAA8D;IAC9D,MAAM,WAAW,GAAG,UAAU,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC;IACtE,MAAM,QAAQ,GAAG;QACf,IAAI,MAAM,CACR,aAAa,WAAW,wBAAwB,EAChD,GAAG,CACJ;QACD,IAAI,MAAM,CACR,mCAAmC,WAAW,EAAE,EAChD,GAAG,CACJ;QACD,IAAI,MAAM,CACR,aAAa,WAAW,wBAAwB,EAChD,GAAG,CACJ;QACD,IAAI,MAAM,CACR,mCAAmC,WAAW,EAAE,EAChD,GAAG,CACJ;KACF,CAAC;IAEF,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAClC,IAAI,KAAK,IAAI,KAAK,CAAC,CAAC,CAAC;YA
AE,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC;IACzC,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,oBAAoB,CAAC,IAAY,EAAE,IAAiB;IAC3D,IAAI,MAAM,GAAG,IAAI,CAAC;IAClB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,4CAA4C;QAC5C,MAAM,WAAW,GAAG,IAAI,MAAM,CAAC,IAAI,GAAG,aAAa,EAAE,IAAI,CAAC,CAAC;QAC3D,MAAM,WAAW,GAAG,IAAI,MAAM,CAC5B,IAAI,GAAG,uBAAuB,GAAG,GAAG,EACpC,IAAI,CACL,CAAC;QACF,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;QACzC,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IAC3C,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,IAAY;IACvC,IAAI,MAAM,GAAG,IAAI,CAAC;IAElB,uBAAuB;IACvB,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,IAAI,MAAM,CACtB,IAAI,GAAG,uBAAuB,GAAG,GAAG,EACpC,IAAI,CACL,CAAC;QACF,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACrC,CAAC;IAED,iDAAiD;IACjD,+EAA+E;IAC/E,gCAAgC;IAChC,KAAK,MAAM,OAAO,IAAI,cAAc,EAAE,CAAC;QACrC,oDAAoD;QACpD,MAAM,QAAQ,GAAG,IAAI,MAAM,CACzB,iEAAiE,OAAO,CAAC,MAAM,0BAA0B,EACzG,IAAI,CACL,CAAC;QACF,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IACxC,CAAC;IAED,yBAAyB;IACzB,MAAM,GAAG,MAAM,CAAC,OAAO,CACrB,kGAAkG,EAClG,EAAE,CACH,CAAC;IAEF,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CAAC,IAAY;IACtC,kCAAkC;IAClC,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAC7B,sCAAsC,CACvC,CAAC;IACF,IAAI,YAAY,IAAI,YAAY,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QACjD,OAAO,YAAY,CAAC,CAAC,CAAC,CAAC;IACzB,CAAC;IAED,iBAAiB;IACjB,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;IAC/D,IAAI,SAAS,IAAI,SAAS,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QAC3C,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC;IACtB,CAAC;IAED,oBAAoB;IACpB,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAC9B,6CAA6C,CAC9C,CAAC;IACF,IAAI,aAAa,IAAI,aAAa,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QACnD,OAAO,aAAa,CAAC,CAAC,CAAC,CAAC;IAC1B,CAAC;IAED,kCAAkC;IAClC,KAAK,MAAM,OAAO,IAAI,gBAAgB,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,IAAI,MAAM,CACtB,yCAAyC,OAAO,CAAC,MAAM,kCAAkC,EACzF,IAAI,CACL,CAAC;QACF,IAAI,KAAK,CAAC;QACV,IAAI,IAAI,GAAG,EAAE,CAAC;QACd,OAAO,CAAC,KAAK,GAAG,KAAK,CAAC,IAAI
,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC3C,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;gBAClC,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;QACD,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG;YAAE,OAAO,IAAI,CAAC;IACrC,CAAC;IAED,gCAAgC;IAChC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;IAC/D,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC;IAEnC,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,IAAY;IACnC,OAAO,IAAI;QACT,gDAAgD;SAC/C,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC;SAC7B,OAAO,CAAC,uCAAuC,EAAE,IAAI,CAAC;SACtD,OAAO,CAAC,0CAA0C,EAAE,IAAI,CAAC;QAC1D,iCAAiC;SAChC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;QACxB,8BAA8B;SAC7B,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC,MAAM,CAAC,YAAY,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;QACvE,sBAAsB;SACrB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;AACZ,CAAC;AAED;;GAEG;AACH,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE,CAAC;AACZ,CAAC;AAED;;GAEG;AACH,SAAS,UAAU,CAAC,OAAe;IACjC,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,CAAC;QAC/B,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAAE,OAAO,OAAO,CAAC;QAC1C,OAAO,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE;YACtC,IAAI,EAAE,SAAS;YACf,KAAK,EAAE,MAAM;YACb,GAAG,EAAE,SAAS;SACf,CAAC,CAAC;IACL,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,OAAO,CAAC;IACjB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,SAAiB;IAC1C,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,CAAC;IACxD,OAAO,GAAG,OAAO,WAAW,CAAC;AAC/B,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* summarizer.ts - AI-powered article summarization
|
|
3
|
+
*
|
|
4
|
+
* Supports multiple backends:
|
|
5
|
+
* 1. OpenAI API (set OPENAI_API_KEY env var)
|
|
6
|
+
* 2. Anthropic API (set ANTHROPIC_API_KEY env var)
|
|
7
|
+
* 3. Local LLM via Ollama (default, no API key needed)
|
|
8
|
+
* 4. Fallback: extractive summary (no AI needed)
|
|
9
|
+
*/
|
|
10
|
+
import { ParsedArticle } from "./parser";
|
|
11
|
+
/**
|
|
12
|
+
* Generate a 3-sentence summary of the article
|
|
13
|
+
*/
|
|
14
|
+
export declare function summarize(article: ParsedArticle): Promise<string>;
|
|
15
|
+
//# sourceMappingURL=summarizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"summarizer.d.ts","sourceRoot":"","sources":["../src/summarizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAYzC;;GAEG;AACH,wBAAsB,SAAS,CAAC,OAAO,EAAE,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,CAmDvE"}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* summarizer.ts - AI-powered article summarization
|
|
4
|
+
*
|
|
5
|
+
* Supports multiple backends:
|
|
6
|
+
* 1. OpenAI API (set OPENAI_API_KEY env var)
|
|
7
|
+
* 2. Anthropic API (set ANTHROPIC_API_KEY env var)
|
|
8
|
+
* 3. Local LLM via Ollama (default, no API key needed)
|
|
9
|
+
* 4. Fallback: extractive summary (no AI needed)
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.summarize = summarize;
|
|
13
|
+
const formatter_1 = require("./formatter");
|
|
14
|
+
/**
 * Produce a short summary of an article.
 *
 * Backends are tried in a fixed order: OpenAI (only when OPENAI_API_KEY is
 * set), Anthropic (only when ANTHROPIC_API_KEY is set), then Ollama at
 * OLLAMA_URL. A backend that throws is reported on stderr and the next one is
 * tried. When all of them fail, an extractive summary built from the article
 * text is returned instead.
 *
 * @param {object} article - Parsed article passed to the prompt formatter and,
 *   on fallback, to the extractive summarizer.
 * @returns {Promise<string>} The summary text.
 */
async function summarize(article) {
    const env = process.env;
    const config = {
        openaiApiKey: env.OPENAI_API_KEY,
        openaiModel: env.OPENAI_MODEL || "gpt-4o-mini",
        anthropicApiKey: env.ANTHROPIC_API_KEY,
        anthropicModel: env.ANTHROPIC_MODEL || "claude-sonnet-4-20250514",
        ollamaUrl: env.OLLAMA_URL || "http://127.0.0.1:11434",
        ollamaModel: env.OLLAMA_MODEL || "llama3.2",
    };
    const prompt = (0, formatter_1.formatSummaryPrompt)(article);

    // Priority-ordered backends; the hosted ones need an API key, Ollama is always attempted.
    const backends = [
        { label: "OpenAI", enabled: Boolean(config.openaiApiKey), run: summarizeOpenAI },
        { label: "Anthropic", enabled: Boolean(config.anthropicApiKey), run: summarizeAnthropic },
        { label: "Ollama", enabled: true, run: summarizeOllama },
    ];
    for (const { label, enabled, run } of backends) {
        if (!enabled) {
            continue;
        }
        try {
            return await run(prompt, config);
        }
        catch (err) {
            process.stderr.write(`\x1b[33m${label} failed:\x1b[0m ${err.message}\n`);
        }
    }

    // Nothing answered: fall back to the extractive summary.
    process.stderr.write(`\x1b[33mNo AI backend available. Using extractive summary.\x1b[0m\n`);
    process.stderr.write(`\x1b[90mSet OPENAI_API_KEY, ANTHROPIC_API_KEY, or run Ollama for AI summaries.\x1b[0m\n`);
    return extractiveSummary(article);
}
|
|
57
|
+
/**
 * Request a summary from the OpenAI Chat Completions API.
 *
 * The request is aborted after 60 seconds so that a stalled connection
 * rejects instead of hanging; the caller can then move on to another backend.
 *
 * @param {string} prompt - User prompt containing the article.
 * @param {{openaiApiKey: string, openaiModel: string}} config - Backend settings.
 * @returns {Promise<string>} Trimmed summary text, or "Summary unavailable."
 *   when the response carries no message content.
 * @throws {Error} On a non-2xx response (message includes status and the
 *   first 200 characters of the body), on network failure, or on timeout.
 */
async function summarizeOpenAI(prompt, config) {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), 60000);
    try {
        const response = await fetch("https://api.openai.com/v1/chat/completions", {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${config.openaiApiKey}`,
            },
            body: JSON.stringify({
                model: config.openaiModel,
                messages: [
                    {
                        role: "system",
                        content: "You are a concise article summarizer. Always respond with exactly 3 sentences.",
                    },
                    { role: "user", content: prompt },
                ],
                temperature: 0.3,
                max_tokens: 300,
            }),
            signal: controller.signal,
        });
        if (!response.ok) {
            const body = await response.text();
            throw new Error(`OpenAI API ${response.status}: ${body.substring(0, 200)}`);
        }
        const data = (await response.json());
        return data.choices?.[0]?.message?.content?.trim() || "Summary unavailable.";
    }
    finally {
        // Always release the timer so it cannot keep the process alive.
        clearTimeout(timer);
    }
}
|
|
87
|
+
/**
 * Request a summary from the Anthropic Messages API.
 *
 * The request is aborted after 60 seconds so that a stalled connection
 * rejects instead of hanging; the caller can then move on to another backend.
 *
 * @param {string} prompt - User prompt containing the article.
 * @param {{anthropicApiKey: string, anthropicModel: string}} config - Backend settings.
 * @returns {Promise<string>} Trimmed text of the first content block, or
 *   "Summary unavailable." when the response carries none.
 * @throws {Error} On a non-2xx response (message includes status and the
 *   first 200 characters of the body), on network failure, or on timeout.
 */
async function summarizeAnthropic(prompt, config) {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), 60000);
    try {
        const response = await fetch("https://api.anthropic.com/v1/messages", {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                "x-api-key": config.anthropicApiKey,
                "anthropic-version": "2023-06-01",
            },
            body: JSON.stringify({
                model: config.anthropicModel,
                max_tokens: 300,
                messages: [{ role: "user", content: prompt }],
                system: "You are a concise article summarizer. Always respond with exactly 3 sentences.",
            }),
            signal: controller.signal,
        });
        if (!response.ok) {
            const body = await response.text();
            throw new Error(`Anthropic API ${response.status}: ${body.substring(0, 200)}`);
        }
        const data = (await response.json());
        return data.content?.[0]?.text?.trim() || "Summary unavailable.";
    }
    finally {
        // Always release the timer so it cannot keep the process alive.
        clearTimeout(timer);
    }
}
|
|
112
|
+
/**
 * Request a summary from an Ollama server via its `/api/generate` endpoint.
 *
 * The request is aborted after 60 seconds. Trailing slashes on the configured
 * base URL are ignored, so "http://host:11434" and "http://host:11434/" reach
 * the same endpoint.
 *
 * @param {string} prompt - User prompt containing the article.
 * @param {{ollamaUrl: string, ollamaModel: string}} config - Backend settings.
 * @returns {Promise<string>} Trimmed response text, or "Summary unavailable."
 *   when the response carries none.
 * @throws {Error} On a non-2xx response, on network failure, or on timeout.
 */
async function summarizeOllama(prompt, config) {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), 60000);
    // Without this, a base URL ending in "/" would yield ".../​/api/generate".
    const baseUrl = String(config.ollamaUrl).replace(/\/+$/, "");
    try {
        const response = await fetch(`${baseUrl}/api/generate`, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({
                model: config.ollamaModel,
                prompt,
                system: "You are a concise article summarizer. Always respond with exactly 3 sentences.",
                stream: false,
                options: {
                    temperature: 0.3,
                    num_predict: 300,
                },
            }),
            signal: controller.signal,
        });
        if (!response.ok) {
            throw new Error(`Ollama ${response.status}`);
        }
        const data = (await response.json());
        return data.response?.trim() || "Summary unavailable.";
    }
    finally {
        // Always release the timer so it cannot keep the process alive.
        clearTimeout(timer);
    }
}
|
|
144
|
+
/**
 * Build a summary without any AI backend by picking up to three sentences
 * from the article text.
 *
 * Sentences of 5 to 50 words are candidates. With no candidates the article
 * description (or "No summary available.") is returned; with three or fewer
 * they are all returned. Otherwise each candidate is scored:
 *   +5 / +3 / +2 for being the first / second / third candidate,
 *   +2 for every word that also appears in the title (title words longer than
 *      three characters; compared case-insensitively with surrounding
 *      punctuation ignored),
 *   +1 for a length of 10 to 30 words,
 *   -3 when it contains more than five symbol characters (anything other
 *      than letters, digits, whitespace and common punctuation).
 * The three best candidates are returned in their original order.
 *
 * @param {{textContent: string, title: string, description: (string|null)}} article
 * @returns {string} The selected sentences joined by single spaces.
 */
function extractiveSummary(article) {
    const text = article.textContent ?? "";
    // Lower-case a token and drop punctuation stuck to its ends ("Cars," -> "cars").
    const normalize = (token) => token
        .toLowerCase()
        .replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, "");
    // Split into sentences
    const sentences = text
        .split(/(?<=[.!?])\s+/)
        .map((s) => s.trim())
        .filter((s) => {
            // Filter out very short or very long sentences
            const words = s.split(/\s+/).length;
            return words >= 5 && words <= 50;
        });
    if (sentences.length === 0) {
        return article.description || "No summary available.";
    }
    if (sentences.length <= 3) {
        return sentences.join(" ");
    }
    const titleWords = new Set((article.title ?? "")
        .split(/\s+/)
        .map(normalize)
        .filter((w) => w.length > 3));
    const positionBonus = [5, 3, 2];
    const scored = sentences.map((sentence, index) => {
        // Position bonus: first sentences are usually important
        let score = positionBonus[index] ?? 0;
        // Title word overlap
        const words = sentence.split(/\s+/);
        for (const w of words) {
            if (titleWords.has(normalize(w))) {
                score += 2;
            }
        }
        // Sentence length: prefer medium-length sentences
        if (words.length >= 10 && words.length <= 30) {
            score += 1;
        }
        // Penalize sentences with many symbols (likely not prose). Letters and
        // digits of any script and typographic quotes do not count as symbols.
        const specialChars = (sentence.match(/[^\p{L}\p{N}\s.,!?'"()\-\u2018\u2019\u201C\u201D]/gu) || []).length;
        if (specialChars > 5) {
            score -= 3;
        }
        return { sentence, score, index };
    });
    // Sort by score, take top 3, then re-sort by position
    scored.sort((a, b) => b.score - a.score);
    const top3 = scored.slice(0, 3);
    top3.sort((a, b) => a.index - b.index);
    return top3.map((s) => s.sentence).join(" ");
}
|
|
197
|
+
//# sourceMappingURL=summarizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"summarizer.js","sourceRoot":"","sources":["../src/summarizer.ts"],"names":[],"mappings":";AAAA;;;;;;;;GAQG;;AAiBH,8BAmDC;AAjED,2CAAkD;AAWlD;;GAEG;AACI,KAAK,UAAU,SAAS,CAAC,OAAsB;IACpD,MAAM,MAAM,GAAqB;QAC/B,YAAY,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;QACxC,WAAW,EAAE,OAAO,CAAC,GAAG,CAAC,YAAY,IAAI,aAAa;QACtD,eAAe,EAAE,OAAO,CAAC,GAAG,CAAC,iBAAiB;QAC9C,cAAc,EAAE,OAAO,CAAC,GAAG,CAAC,eAAe,IAAI,0BAA0B;QACzE,SAAS,EAAE,OAAO,CAAC,GAAG,CAAC,UAAU,IAAI,wBAAwB;QAC7D,WAAW,EAAE,OAAO,CAAC,GAAG,CAAC,YAAY,IAAI,UAAU;KACpD,CAAC;IAEF,MAAM,MAAM,GAAG,IAAA,+BAAmB,EAAC,OAAO,CAAC,CAAC;IAE5C,mBAAmB;IACnB,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;QACxB,IAAI,CAAC;YACH,OAAO,MAAM,eAAe,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC/C,CAAC;QAAC,OAAO,GAAQ,EAAE,CAAC;YAClB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,iCAAiC,GAAG,CAAC,OAAO,IAAI,CACjD,CAAC;QACJ,CAAC;IACH,CAAC;IAED,gBAAgB;IAChB,IAAI,MAAM,CAAC,eAAe,EAAE,CAAC;QAC3B,IAAI,CAAC;YACH,OAAO,MAAM,kBAAkB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAClD,CAAC;QAAC,OAAO,GAAQ,EAAE,CAAC;YAClB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,oCAAoC,GAAG,CAAC,OAAO,IAAI,CACpD,CAAC;QACJ,CAAC;IACH,CAAC;IAED,qBAAqB;IACrB,IAAI,CAAC;QACH,OAAO,MAAM,eAAe,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC/C,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,iCAAiC,GAAG,CAAC,OAAO,IAAI,CACjD,CAAC;IACJ,CAAC;IAED,+BAA+B;IAC/B,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,qEAAqE,CACtE,CAAC;IACF,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,yFAAyF,CAC1F,CAAC;IACF,OAAO,iBAAiB,CAAC,OAAO,CAAC,CAAC;AACpC,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,eAAe,CAC5B,MAAc,EACd,MAAwB;IAExB,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,4CAA4C,EAAE;QACzE,MAAM,EAAE,MAAM;QACd,OAAO,EAAE;YACP,cAAc,EAAE,kBAAkB;YAClC,aAAa,EAAE,UAAU,MAAM,CAAC,YAAY,EAAE;SAC/C;QACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,KAAK,EAAE,MAAM,CAAC,WAAW;YACzB,QAAQ,EAAE;gBACR;oBACE,IAAI,EAAE,QAAQ;oBACd,OAAO,EACL,gFAAgF;iBACnF;gBACD,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE;aAClC;YACD,WAAW,EAAE,GAAG;YAChB,UAAU,EAAE,GAAG;SAChB,CAAC;KACH,CAAC,CAAC;IAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACn
C,MAAM,IAAI,KAAK,CAAC,cAAc,QAAQ,CAAC,MAAM,KAAK,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;IAC9E,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAwB,CAAC;IAC5D,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,sBAAsB,CAAC;AAC/E,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,kBAAkB,CAC/B,MAAc,EACd,MAAwB;IAExB,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,uCAAuC,EAAE;QACpE,MAAM,EAAE,MAAM;QACd,OAAO,EAAE;YACP,cAAc,EAAE,kBAAkB;YAClC,WAAW,EAAE,MAAM,CAAC,eAAgB;YACpC,mBAAmB,EAAE,YAAY;SAClC;QACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,KAAK,EAAE,MAAM,CAAC,cAAc;YAC5B,UAAU,EAAE,GAAG;YACf,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;YAC7C,MAAM,EACJ,gFAAgF;SACnF,CAAC;KACH,CAAC,CAAC;IAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,MAAM,IAAI,KAAK,CACb,iBAAiB,QAAQ,CAAC,MAAM,KAAK,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAC9D,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAwB,CAAC;IAC5D,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,sBAAsB,CAAC;AACnE,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,eAAe,CAC5B,MAAc,EACd,MAAwB;IAExB,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,KAAK,CAAC,CAAC;IAE1D,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,MAAM,CAAC,SAAS,eAAe,EAAE;YAC/D,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;YAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACnB,KAAK,EAAE,MAAM,CAAC,WAAW;gBACzB,MAAM;gBACN,MAAM,EACJ,gFAAgF;gBAClF,MAAM,EAAE,KAAK;gBACb,OAAO,EAAE;oBACP,WAAW,EAAE,GAAG;oBAChB,WAAW,EAAE,GAAG;iBACjB;aACF,CAAC;YACF,MAAM,EAAE,UAAU,CAAC,MAAM;SAC1B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CAAC,UAAU,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;QAC/C,CAAC;QAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAwB,CAAC;QAC5D,OAAO,IAAI,CAAC,QAAQ,EAAE,IAAI,EAAE,IAAI,sBAAsB,CAAC;IACzD,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,KAAK,CAAC,CAAC;IACtB,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,SAAS,iBAAiB,CAAC,OAAsB;IAC/C
,MAAM,IAAI,GAAG,OAAO,CAAC,WAAW,CAAC;IAEjC,uBAAuB;IACvB,MAAM,SAAS,GAAG,IAAI;SACnB,KAAK,CAAC,eAAe,CAAC;SACtB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;SACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;QACZ,+CAA+C;QAC/C,MAAM,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;QACpC,OAAO,KAAK,IAAI,CAAC,IAAI,KAAK,IAAI,EAAE,CAAC;IACnC,CAAC,CAAC,CAAC;IAEL,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,OAAO,CAAC,WAAW,IAAI,uBAAuB,CAAC;IACxD,CAAC;IAED,IAAI,SAAS,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QAC1B,OAAO,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;IAED,2EAA2E;IAC3E,MAAM,UAAU,GAAG,IAAI,GAAG,CACxB,OAAO,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CACrE,CAAC;IAEF,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,EAAE;QAC/C,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,wDAAwD;QACxD,IAAI,KAAK,KAAK,CAAC;YAAE,KAAK,IAAI,CAAC,CAAC;aACvB,IAAI,KAAK,KAAK,CAAC;YAAE,KAAK,IAAI,CAAC,CAAC;aAC5B,IAAI,KAAK,KAAK,CAAC;YAAE,KAAK,IAAI,CAAC,CAAC;QAEjC,qBAAqB;QACrB,MAAM,KAAK,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAClD,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;YACtB,IAAI,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;QACpC,CAAC;QAED,kDAAkD;QAClD,IAAI,KAAK,CAAC,MAAM,IAAI,EAAE,IAAI,KAAK,CAAC,MAAM,IAAI,EAAE;YAAE,KAAK,IAAI,CAAC,CAAC;QAEzD,yEAAyE;QACzE,MAAM,YAAY,GAAG,CAAC,QAAQ,CAAC,KAAK,CAAC,0BAA0B,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAC/E,IAAI,YAAY,GAAG,CAAC;YAAE,KAAK,IAAI,CAAC,CAAC;QAEjC,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;IACpC,CAAC,CAAC,CAAC;IAEH,sDAAsD;IACtD,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IACzC,MAAM,IAAI,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAChC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IAEvC,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC/C,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "websnap-reader",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Turn any URL into clean markdown. A better reader mode for your terminal.",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"websnap": "dist/index.js"
|
|
8
|
+
},
|
|
9
|
+
"scripts": {
|
|
10
|
+
"build": "tsc",
|
|
11
|
+
"dev": "tsc --watch",
|
|
12
|
+
"start": "node dist/index.js",
|
|
13
|
+
"prepublishOnly": "npm run build"
|
|
14
|
+
},
|
|
15
|
+
"keywords": [
|
|
16
|
+
"cli",
|
|
17
|
+
"markdown",
|
|
18
|
+
"reader-mode",
|
|
19
|
+
"web-scraper",
|
|
20
|
+
"chrome-cdp",
|
|
21
|
+
"readability",
|
|
22
|
+
"article-extractor"
|
|
23
|
+
],
|
|
24
|
+
"author": "Wilson Xu",
|
|
25
|
+
"license": "MIT",
|
|
26
|
+
"publishConfig": {
|
|
27
|
+
"access": "public"
|
|
28
|
+
},
|
|
29
|
+
"dependencies": {
|
|
30
|
+
"commander": "^12.1.0",
|
|
31
|
+
"node-html-markdown": "^1.3.0"
|
|
32
|
+
},
|
|
33
|
+
"devDependencies": {
|
|
34
|
+
"@types/node": "^20.11.0",
|
|
35
|
+
"typescript": "^5.3.3"
|
|
36
|
+
},
|
|
37
|
+
"engines": {
|
|
38
|
+
"node": ">=18.0.0"
|
|
39
|
+
},
|
|
40
|
+
"files": [
|
|
41
|
+
"dist",
|
|
42
|
+
"README.md"
|
|
43
|
+
]
|
|
44
|
+
}
|