websnap-reader 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ "use strict";
2
+ /**
3
+ * formatter.ts - Convert parsed articles to markdown or JSON output
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.formatMarkdown = formatMarkdown;
7
+ exports.formatJSON = formatJSON;
8
+ exports.formatSummaryPrompt = formatSummaryPrompt;
9
+ const node_html_markdown_1 = require("node-html-markdown");
10
// Shared HTML -> markdown converter. It is constructed once at module load and
// reused by every formatter in this file.
const converterOptions = {
    preferNativeParser: false,
    codeBlockStyle: "fenced",
    bulletMarker: "-",
    strongDelimiter: "**",
    emDelimiter: "*",
    maxConsecutiveNewlines: 2,
};
// No custom translators are supplied, so the library defaults apply.
const nhm = new node_html_markdown_1.NodeHtmlMarkdown(converterOptions);
21
/**
 * Render a parsed article as a standalone markdown document.
 *
 * The document is a list of sections separated by blank lines: title,
 * optional byline (author | date | site), reading stats, optional description,
 * a rule, the converted article body, another rule and a source link.
 *
 * @param article Parsed article; reads title, author, date, siteName,
 *                wordCount, readingTime, description and content.
 * @param url     Address used as the target of the trailing source link.
 * @returns The assembled markdown string.
 */
function formatMarkdown(article, url) {
    // Only the byline fields that are actually present take part in the join.
    const byline = [
        article.author ? `By ${article.author}` : "",
        article.date,
        article.siteName,
    ].filter(Boolean);
    const sections = [
        `# ${article.title}`,
        ...(byline.length > 0 ? [`*${byline.join(" | ")}*`] : []),
        `> ${article.wordCount.toLocaleString()} words | ${article.readingTime}`,
        ...(article.description ? [`**${article.description}**`] : []),
        "---",
        // Article HTML converted to markdown, then tidied.
        cleanMarkdown(nhm.translate(article.content)),
        "---",
        `*Source: [${article.title}](${url})*`,
    ];
    return sections.join("\n\n");
}
55
/**
 * Serialize a parsed article as pretty-printed JSON (2-space indent).
 *
 * The `content` field holds the article body converted to cleaned markdown,
 * and `extractedAt` is the ISO timestamp taken at the moment of the call.
 *
 * @param article Parsed article to serialize.
 * @param url     Address the article was fetched from; emitted as `url`.
 * @returns The JSON text.
 */
function formatJSON(article, url) {
    const { title, author, date, siteName, description, wordCount, readingTime } = article;
    const content = cleanMarkdown(nhm.translate(article.content));
    const extractedAt = new Date().toISOString();
    // Property order below is the order the keys appear in the output.
    return JSON.stringify({
        url,
        title,
        author,
        date,
        siteName,
        description,
        content,
        wordCount,
        readingTime,
        extractedAt,
    }, null, 2);
}
74
/**
 * Build the plain-text prompt used to request a 3-sentence AI summary.
 *
 * The article text is capped at roughly 3000 whitespace-separated words; when
 * the cap is hit the text is re-joined with single spaces and suffixed with
 * "...". Shorter texts are passed through untouched.
 *
 * @param article Parsed article; reads title, author and textContent.
 * @returns The prompt, one instruction or field per line.
 */
function formatSummaryPrompt(article) {
    const maxWords = 3000;
    // Trim first so leading whitespace does not produce an empty first token
    // that would count against the word budget.
    const words = article.textContent.trim().split(/\s+/);
    const truncated = words.length > maxWords
        ? words.slice(0, maxWords).join(" ") + "..."
        : article.textContent;
    return [
        "Summarize the following article in exactly 3 concise sentences.",
        "Focus on the key points, findings, or arguments.",
        "Write in a neutral, informative tone.",
        "",
        `Title: ${article.title}`,
        // `undefined` (not "") so the filter below really drops the line when
        // there is no author, instead of leaving a stray blank line behind.
        article.author ? `Author: ${article.author}` : undefined,
        "",
        "Article text:",
        truncated,
        "",
        "Your 3-sentence summary:",
    ]
        .filter((line) => line !== undefined)
        .join("\n");
}
97
/**
 * Tidy markdown produced by the HTML converter.
 *
 * Passes run in this order so that later passes clean up after earlier ones:
 *   1. links with an empty target become plain text
 *   2. images whose alt text is empty or whitespace are removed
 *   3. headings with no text are removed
 *   4. `* ` bullets are normalized to `- `
 *   5. trailing spaces and tabs are stripped from every line
 *   6. runs of 4+ newlines collapse to 3 (at most two blank lines)
 *   7. the whole string is trimmed
 *
 * @param md Raw markdown text.
 * @returns The cleaned markdown.
 */
function cleanMarkdown(md) {
    return (md
        // Clean up link references that have no href
        .replace(/\[([^\]]+)\]\(\s*\)/g, "$1")
        // Remove images whose alt text is empty or just whitespace
        .replace(/!\[\s*\]\([^)]+\)/g, "")
        // Remove empty headings. Only spaces/tabs may follow the hashes so the
        // match never swallows the line break after it.
        .replace(/^#{1,6}[ \t]*$/gm, "")
        // Normalize bullet lists. Require a space/tab after the star: `\s`
        // would also match a newline and glue a lone "*" line to the next one.
        .replace(/^\*[ \t]/gm, "- ")
        // Remove trailing whitespace on lines (including any left by removals)
        .replace(/[ \t]+$/gm, "")
        // Collapse excessive blank lines last, so gaps opened by the removals
        // above are collapsed too.
        .replace(/\n{4,}/g, "\n\n\n")
        .trim());
}
116
+ //# sourceMappingURL=formatter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"formatter.js","sourceRoot":"","sources":["../src/formatter.ts"],"names":[],"mappings":";AAAA;;GAEG;;AAuBH,wCAqCC;AAKD,gCAiBC;AAKD,kDAqBC;AA1GD,2DAAsD;AAGtD,6CAA6C;AAC7C,MAAM,GAAG,GAAG,IAAI,qCAAgB,CAC9B;IACE,kBAAkB,EAAE,KAAK;IACzB,cAAc,EAAE,QAAQ;IACxB,YAAY,EAAE,GAAG;IACjB,eAAe,EAAE,IAAI;IACrB,WAAW,EAAE,GAAG;IAChB,sBAAsB,EAAE,CAAC;CAC1B;AACD,uCAAuC;AACvC,SAAS,EACT,SAAS,CACV,CAAC;AAEF;;GAEG;AACH,SAAgB,cAAc,CAAC,OAAsB,EAAE,GAAW;IAChE,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,QAAQ;IACR,QAAQ,CAAC,IAAI,CAAC,KAAK,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;IAEpC,gBAAgB;IAChB,MAAM,SAAS,GAAa,EAAE,CAAC;IAC/B,IAAI,OAAO,CAAC,MAAM;QAAE,SAAS,CAAC,IAAI,CAAC,MAAM,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IAC3D,IAAI,OAAO,CAAC,IAAI;QAAE,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAC/C,IAAI,OAAO,CAAC,QAAQ;QAAE,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IACvD,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,QAAQ,CAAC,IAAI,CAAC,IAAI,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAC9C,CAAC;IAED,gBAAgB;IAChB,QAAQ,CAAC,IAAI,CACX,KAAK,OAAO,CAAC,SAAS,CAAC,cAAc,EAAE,YAAY,OAAO,CAAC,WAAW,EAAE,CACzE,CAAC;IAEF,qBAAqB;IACrB,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;QACxB,QAAQ,CAAC,IAAI,CAAC,KAAK,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;IAC9C,CAAC;IAED,YAAY;IACZ,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAErB,qCAAqC;IACrC,MAAM,QAAQ,GAAG,GAAG,CAAC,SAAS,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IAChD,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC,CAAC;IAEvC,SAAS;IACT,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACrB,QAAQ,CAAC,IAAI,CAAC,aAAa,OAAO,CAAC,KAAK,KAAK,GAAG,IAAI,CAAC,CAAC;IAEtD,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC;AAED;;GAEG;AACH,SAAgB,UAAU,CAAC,OAAsB,EAAE,GAAW;IAC5D,MAAM,QAAQ,GAAG,GAAG,CAAC,SAAS,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IAEhD,MAAM,MAAM,GAAG;QACb,GAAG;QACH,KAAK,EAAE,OAAO,CAAC,KAAK;QACpB,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,IAAI,EAAE,OAAO,CAAC,IAAI;QAClB,QAAQ,EAAE,OAAO,CAAC,QAAQ;QAC1B,WAAW,EAAE,OAAO,CAAC,WAAW;QAChC,OAAO,EAAE,aAAa,CAAC,QAAQ,CAAC;QAChC,SAAS,EAAE,OAAO,CAAC,SAAS;QAC5B,WAAW,EAAE,OAAO,CAAC,WAAW;QAChC,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,E
AAE;KACtC,CAAC;IAEF,OAAO,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;AACzC,CAAC;AAED;;GAEG;AACH,SAAgB,mBAAmB,CAAC,OAAsB;IACxD,wDAAwD;IACxD,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IAC/C,MAAM,SAAS,GACb,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,WAAW,CAAC;IAErF,OAAO;QACL,iEAAiE;QACjE,kDAAkD;QAClD,uCAAuC;QACvC,EAAE;QACF,UAAU,OAAO,CAAC,KAAK,EAAE;QACzB,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,WAAW,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE;QACjD,EAAE;QACF,eAAe;QACf,SAAS;QACT,EAAE;QACF,0BAA0B;KAC3B;SACE,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,SAAS,CAAC;SACpC,IAAI,CAAC,IAAI,CAAC,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CAAC,EAAU;IAC/B,OAAO,CACL,EAAE;QACA,+BAA+B;SAC9B,OAAO,CAAC,SAAS,EAAE,QAAQ,CAAC;QAC7B,sCAAsC;SACrC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC;QACzB,6CAA6C;SAC5C,OAAO,CAAC,sBAAsB,EAAE,IAAI,CAAC;QACtC,wBAAwB;SACvB,OAAO,CAAC,eAAe,EAAE,EAAE,CAAC;QAC7B,+CAA+C;SAC9C,OAAO,CAAC,oBAAoB,EAAE,EAAE,CAAC;QAClC,yBAAyB;SACxB,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC;SACxB,IAAI,EAAE,CACV,CAAC;AACJ,CAAC"}
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env node
2
+ export {};
3
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":""}
package/dist/index.js ADDED
@@ -0,0 +1,248 @@
1
+ #!/usr/bin/env node
2
+ "use strict";
3
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
4
+ if (k2 === undefined) k2 = k;
5
+ var desc = Object.getOwnPropertyDescriptor(m, k);
6
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
7
+ desc = { enumerable: true, get: function() { return m[k]; } };
8
+ }
9
+ Object.defineProperty(o, k2, desc);
10
+ }) : (function(o, m, k, k2) {
11
+ if (k2 === undefined) k2 = k;
12
+ o[k2] = m[k];
13
+ }));
14
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
15
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
16
+ }) : function(o, v) {
17
+ o["default"] = v;
18
+ });
19
+ var __importStar = (this && this.__importStar) || (function () {
20
+ var ownKeys = function(o) {
21
+ ownKeys = Object.getOwnPropertyNames || function (o) {
22
+ var ar = [];
23
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
24
+ return ar;
25
+ };
26
+ return ownKeys(o);
27
+ };
28
+ return function (mod) {
29
+ if (mod && mod.__esModule) return mod;
30
+ var result = {};
31
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
32
+ __setModuleDefault(result, mod);
33
+ return result;
34
+ };
35
+ })();
36
+ Object.defineProperty(exports, "__esModule", { value: true });
37
+ const commander_1 = require("commander");
38
+ const fetcher_1 = require("./fetcher");
39
+ const parser_1 = require("./parser");
40
+ const formatter_1 = require("./formatter");
41
+ const summarizer_1 = require("./summarizer");
42
+ const fs = __importStar(require("fs"));
43
+ const path = __importStar(require("path"));
44
+ const VERSION = "1.0.0";
45
+ const program = new commander_1.Command();
46
// CLI identity: program name, help description and --version output.
program.name("websnap");
program.description("Turn any URL into clean markdown. A better reader mode for your terminal.");
program.version(VERSION);
// Default command: snap a single URL. Each registration call returns the same
// Command instance, so registering step by step is equivalent to one chain.
program.argument("[url]", "URL to snap");
program.option("--json", "Output structured JSON instead of markdown");
program.option("--summary", "Generate an AI-powered 3-sentence summary");
program.option("--raw", "Output raw extracted HTML before markdown conversion");
program.option("-o, --output <file>", "Write output to a file instead of stdout");
program.option("--cdp <endpoint>", "Chrome DevTools Protocol endpoint", "http://127.0.0.1:9222");
program.option("--timeout <ms>", "Page load timeout in milliseconds", "15000");
program.option("--no-headless", "Use visible browser (requires CDP connection)");
program.option("--user-agent <string>", "Custom User-Agent header");
program.action(async (url, options) => {
    if (url) {
        await snapURL(url, options);
        return;
    }
    // No URL given: show usage instead.
    program.help();
});
68
// Batch command: process every URL listed in a file.
// `--user-agent` is declared here because batchProcess forwards
// `options.userAgent` to the fetcher; without the declaration it could never
// be set from the command line.
// NOTE(review): `--summary` is accepted, but batchProcess as written never
// reads `options.summary` - confirm whether batch summaries are intended.
program
    .command("batch <file>")
    .description("Batch process URLs from a file (one URL per line)")
    .option("--json", "Output structured JSON for each URL")
    .option("--summary", "Generate AI summaries for each URL")
    .option("--outdir <dir>", "Write each result to a separate file in this directory")
    .option("--cdp <endpoint>", "Chrome DevTools Protocol endpoint", "http://127.0.0.1:9222")
    .option("--timeout <ms>", "Page load timeout in milliseconds", "15000")
    .option("--delay <ms>", "Delay between requests in milliseconds", "1000")
    .option("--user-agent <string>", "Custom User-Agent header")
    .action(async (file, options) => {
    await batchProcess(file, options);
});
81
/**
 * Fetch one URL and emit it as markdown, JSON, raw HTML or an AI summary.
 *
 * Exits the process with code 1 when the URL is invalid or not http(s), when
 * the page cannot be fetched, or when summary generation fails.
 *
 * @param url     Address to fetch.
 * @param options Parsed CLI options; reads timeout, cdp, userAgent, raw,
 *                summary, json and output.
 * @returns The parsed article, or null when `--raw` was requested.
 */
async function snapURL(url, options) {
    const timeout = parseInt(options.timeout, 10) || 15000;
    // Validate URL
    let parsedUrl;
    try {
        parsedUrl = new URL(url);
    }
    catch {
        console.error(`\x1b[31mError:\x1b[0m Invalid URL "${url}"`);
        process.exit(1);
    }
    if (!["http:", "https:"].includes(parsedUrl.protocol)) {
        console.error(`\x1b[31mError:\x1b[0m Invalid protocol "${parsedUrl.protocol}". Use http: or https:`);
        process.exit(1);
    }
    let html;
    try {
        // Progress goes to stderr so stdout carries only the result.
        process.stderr.write(`\x1b[90mFetching ${url}...\x1b[0m\n`);
        html = await (0, fetcher_1.fetchPage)(url, {
            cdpEndpoint: options.cdp,
            timeout,
            userAgent: options.userAgent,
        });
    }
    catch (err) {
        console.error(`\x1b[31mError fetching page:\x1b[0m ${err.message || err}`);
        process.exit(1);
    }
    if (options.raw) {
        output(html, options.output);
        return null;
    }
    const article = (0, parser_1.parseContent)(html, url);
    if (options.summary) {
        process.stderr.write(`\x1b[90mGenerating summary...\x1b[0m\n`);
        let summaryText;
        try {
            summaryText = await (0, summarizer_1.summarize)(article);
        }
        catch (err) {
            // Report summarizer failures the same way as fetch failures
            // instead of letting the rejection escape the action handler.
            console.error(`\x1b[31mError generating summary:\x1b[0m ${err.message || err}`);
            process.exit(1);
        }
        if (options.json) {
            const jsonOut = {
                url,
                title: article.title,
                author: article.author,
                date: article.date,
                summary: summaryText,
                wordCount: article.wordCount,
                readingTime: article.readingTime,
            };
            output(JSON.stringify(jsonOut, null, 2), options.output);
        }
        else {
            // Drop only the byline entries that are absent. The blank spacer
            // lines must survive: without the one before it, "---" directly
            // under the summary would render as a setext heading, not a rule.
            const byline = [
                article.author ? `*By ${article.author}*` : "",
                article.date ? `*${article.date}*` : "",
            ].filter(Boolean);
            const lines = [
                `# ${article.title}`,
                "",
                ...byline,
                ...(byline.length > 0 ? [""] : []),
                "## Summary",
                "",
                summaryText,
                "",
                `---`,
                `*Source: ${url}*`,
            ].join("\n");
            output(lines, options.output);
        }
        return article;
    }
    if (options.json) {
        output((0, formatter_1.formatJSON)(article, url), options.output);
    }
    else {
        output((0, formatter_1.formatMarkdown)(article, url), options.output);
    }
    return article;
}
156
/**
 * Process every URL listed in a file, one after another.
 *
 * The file holds one URL per line; blank lines and lines starting with "#"
 * are ignored. With `--outdir` each article is written to its own file named
 * after a slug of its title. With `--json` and no `--outdir`, a JSON array of
 * per-URL metadata and status is printed to stdout at the end. A failing URL
 * is reported on stderr and does not stop the batch.
 *
 * Exits the process with code 1 when the file is missing or has no URLs.
 *
 * @param file    Path to the URL list.
 * @param options Parsed CLI options; reads outdir, json, cdp, timeout,
 *                userAgent and delay.
 */
async function batchProcess(file, options) {
    const filePath = path.resolve(file);
    if (!fs.existsSync(filePath)) {
        console.error(`\x1b[31mError:\x1b[0m File not found: ${filePath}`);
        process.exit(1);
    }
    const urls = fs
        .readFileSync(filePath, "utf-8")
        .split("\n")
        .map((line) => line.trim())
        .filter((line) => line && !line.startsWith("#"));
    if (urls.length === 0) {
        console.error(`\x1b[31mError:\x1b[0m No URLs found in ${filePath}`);
        process.exit(1);
    }
    process.stderr.write(`\x1b[90mProcessing ${urls.length} URL(s)...\x1b[0m\n`);
    if (options.outdir) {
        fs.mkdirSync(options.outdir, { recursive: true });
    }
    // Parsed once; both values are constant for the whole batch.
    const timeout = parseInt(options.timeout, 10) || 15000;
    // An explicit 0 must disable the delay, so fall back to the default only
    // when the value is not a usable number (`|| 1000` would also replace 0).
    const requestedDelay = parseInt(options.delay, 10);
    const delay = Number.isNaN(requestedDelay) || requestedDelay < 0 ? 1000 : requestedDelay;
    const results = [];
    // Slugs already used in this run, so equal titles do not overwrite files.
    const usedSlugs = new Set();
    let successCount = 0;
    let failCount = 0;
    for (let i = 0; i < urls.length; i++) {
        const url = urls[i];
        process.stderr.write(`\x1b[90m[${i + 1}/${urls.length}] ${url}\x1b[0m\n`);
        try {
            const html = await (0, fetcher_1.fetchPage)(url, {
                cdpEndpoint: options.cdp,
                timeout,
                userAgent: options.userAgent,
            });
            const article = (0, parser_1.parseContent)(html, url);
            if (options.outdir) {
                // A title without ASCII letters or digits slugifies to "", so
                // the positional fallback is applied after slugifying as well.
                const baseSlug = slugify(article.title || "") || `page-${i + 1}`;
                let slug = baseSlug;
                for (let suffix = 2; usedSlugs.has(slug); suffix++) {
                    slug = `${baseSlug}-${suffix}`;
                }
                usedSlugs.add(slug);
                const ext = options.json ? ".json" : ".md";
                const outPath = path.join(options.outdir, slug + ext);
                const rendered = options.json
                    ? (0, formatter_1.formatJSON)(article, url)
                    : (0, formatter_1.formatMarkdown)(article, url);
                fs.writeFileSync(outPath, rendered, "utf-8");
                process.stderr.write(`  \x1b[32m->\x1b[0m ${outPath}\n`);
            }
            if (options.json) {
                results.push({
                    url,
                    title: article.title,
                    author: article.author,
                    date: article.date,
                    wordCount: article.wordCount,
                    readingTime: article.readingTime,
                    status: "ok",
                });
            }
            successCount++;
        }
        catch (err) {
            process.stderr.write(`  \x1b[31mFailed:\x1b[0m ${err.message || err}\n`);
            if (options.json) {
                results.push({ url, status: "error", error: err.message || String(err) });
            }
            failCount++;
        }
        // Delay between requests (skip after last)
        if (i < urls.length - 1 && delay > 0) {
            await new Promise((r) => setTimeout(r, delay));
        }
    }
    process.stderr.write(`\n\x1b[90mDone: ${successCount} succeeded, ${failCount} failed\x1b[0m\n`);
    if (options.json && !options.outdir) {
        console.log(JSON.stringify(results, null, 2));
    }
}
229
/**
 * Deliver a result either to a file or to stdout.
 *
 * @param content  Text to emit.
 * @param filePath Optional destination. When given, missing parent
 *                 directories are created, the file is written as UTF-8 and a
 *                 confirmation is printed to stderr. When omitted, the text
 *                 goes to stdout.
 */
function output(content, filePath) {
    if (!filePath) {
        console.log(content);
        return;
    }
    const target = path.resolve(filePath);
    const parentDir = path.dirname(target);
    fs.mkdirSync(parentDir, { recursive: true });
    fs.writeFileSync(target, content, "utf-8");
    process.stderr.write(`\x1b[32mWritten to ${target}\x1b[0m\n`);
}
240
/**
 * Turn arbitrary text into a lowercase, hyphen-separated file-name slug.
 *
 * Every run of characters outside a-z / 0-9 becomes a single hyphen, hyphens
 * at either end are removed, and the result is capped at 80 characters.
 * Returns "" when the text contains no ASCII letters or digits.
 *
 * @param text Source text, e.g. an article title.
 * @returns The slug.
 */
function slugify(text) {
    const slug = text
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, "-")
        .replace(/^-+|-+$/g, "");
    // Truncation can cut right after a separator, so strip the trailing
    // hyphen again once the length cap has been applied.
    return slug.substring(0, 80).replace(/-+$/, "");
}
247
// The action handlers are async, so parse with parseAsync and handle a
// rejected handler here instead of leaving an unhandled promise rejection.
program.parseAsync(process.argv).catch((err) => {
    console.error(`\x1b[31mError:\x1b[0m ${(err && err.message) || err}`);
    process.exit(1);
});
248
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEA,yCAAoC;AACpC,uCAAsC;AACtC,qCAAuD;AACvD,2CAA8E;AAC9E,6CAAyC;AACzC,uCAAyB;AACzB,2CAA6B;AAG7B,MAAM,OAAO,GAAG,OAAO,CAAC;AAExB,MAAM,OAAO,GAAG,IAAI,mBAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,SAAS,CAAC;KACf,WAAW,CACV,2EAA2E,CAC5E;KACA,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,qCAAqC;AACrC,OAAO;KACJ,QAAQ,CAAC,OAAO,EAAE,aAAa,CAAC;KAChC,MAAM,CAAC,QAAQ,EAAE,4CAA4C,CAAC;KAC9D,MAAM,CAAC,WAAW,EAAE,2CAA2C,CAAC;KAChE,MAAM,CAAC,OAAO,EAAE,sDAAsD,CAAC;KACvE,MAAM,CAAC,qBAAqB,EAAE,0CAA0C,CAAC;KACzE,MAAM,CACL,kBAAkB,EAClB,mCAAmC,EACnC,uBAAuB,CACxB;KACA,MAAM,CAAC,gBAAgB,EAAE,mCAAmC,EAAE,OAAO,CAAC;KACtE,MAAM,CAAC,eAAe,EAAE,+CAA+C,CAAC;KACxE,MAAM,CACL,uBAAuB,EACvB,0BAA0B,CAC3B;KACA,MAAM,CAAC,KAAK,EAAE,GAAuB,EAAE,OAA4B,EAAE,EAAE;IACtE,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,OAAO,CAAC,IAAI,EAAE,CAAC;QACf,OAAO;IACT,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;AAC9B,CAAC,CAAC,CAAC;AAEL,gBAAgB;AAChB,OAAO;KACJ,OAAO,CAAC,cAAc,CAAC;KACvB,WAAW,CAAC,mDAAmD,CAAC;KAChE,MAAM,CAAC,QAAQ,EAAE,qCAAqC,CAAC;KACvD,MAAM,CAAC,WAAW,EAAE,oCAAoC,CAAC;KACzD,MAAM,CACL,gBAAgB,EAChB,wDAAwD,CACzD;KACA,MAAM,CACL,kBAAkB,EAClB,mCAAmC,EACnC,uBAAuB,CACxB;KACA,MAAM,CAAC,gBAAgB,EAAE,mCAAmC,EAAE,OAAO,CAAC;KACtE,MAAM,CAAC,cAAc,EAAE,wCAAwC,EAAE,MAAM,CAAC;KACxE,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,OAA4B,EAAE,EAAE;IAC3D,MAAM,YAAY,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;AACpC,CAAC,CAAC,CAAC;AAEL,KAAK,UAAU,OAAO,CACpB,GAAW,EACX,OAA4B;IAE5B,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,IAAI,KAAK,CAAC;IAEvD,IAAI,CAAC;QACH,eAAe;QACf,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC/B,IAAI,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,QAAQ,CAAC,EAAE,CAAC;YACtD,OAAO,CAAC,KAAK,CAAC,2CAA2C,SAAS,CAAC,QAAQ,wBAAwB,CAAC,CAAC;YACrG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,KAAK,CAAC,sCAAsC,GAAG,GAAG,CAAC,CAAC;QAC5D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,IAAI,IAAY,CAAC
;IAEjB,IAAI,CAAC;QACH,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,oBAAoB,GAAG,cAAc,CAAC,CAAC;QAC5D,IAAI,GAAG,MAAM,IAAA,mBAAS,EAAC,GAAG,EAAE;YAC1B,WAAW,EAAE,OAAO,CAAC,GAAG;YACxB,OAAO;YACP,SAAS,EAAE,OAAO,CAAC,SAAS;SAC7B,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,OAAO,CAAC,KAAK,CACX,uCAAuC,GAAG,CAAC,OAAO,IAAI,GAAG,EAAE,CAC5D,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;QAChB,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QAC7B,OAAO,IAAI,CAAC;IACd,CAAC;IAED,MAAM,OAAO,GAAG,IAAA,qBAAY,EAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAExC,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACpB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,wCAAwC,CAAC,CAAC;QAC/D,MAAM,WAAW,GAAG,MAAM,IAAA,sBAAS,EAAC,OAAO,CAAC,CAAC;QAC7C,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG;gBACd,GAAG;gBACH,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,IAAI,EAAE,OAAO,CAAC,IAAI;gBAClB,OAAO,EAAE,WAAW;gBACpB,SAAS,EAAE,OAAO,CAAC,SAAS;gBAC5B,WAAW,EAAE,OAAO,CAAC,WAAW;aACjC,CAAC;YACF,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QAC3D,CAAC;aAAM,CAAC;YACN,MAAM,KAAK,GAAG;gBACZ,KAAK,OAAO,CAAC,KAAK,EAAE;gBACpB,EAAE;gBACF,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,EAAE;gBAC9C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,OAAO,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,EAAE;gBACvC,EAAE;gBACF,YAAY;gBACZ,EAAE;gBACF,WAAW;gBACX,EAAE;gBACF,KAAK;gBACL,YAAY,GAAG,GAAG;aACnB;iBACE,MAAM,CAAC,OAAO,CAAC;iBACf,IAAI,CAAC,IAAI,CAAC,CAAC;YACd,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QAChC,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;QACjB,MAAM,CAAC,IAAA,sBAAU,EAAC,OAAO,EAAE,GAAG,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;IACnD,CAAC;SAAM,CAAC;QACN,MAAM,CAAC,IAAA,0BAAc,EAAC,OAAO,EAAE,GAAG,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;IACvD,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,KAAK,UAAU,YAAY,CACzB,IAAY,EACZ,OAA4B;IAE5B,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAEpC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC7B,OAAO,CAAC,KAAK,CAAC,yCAAyC,QAAQ,EAAE
,CAAC,CAAC;QACnE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACnD,MAAM,IAAI,GAAG,OAAO;SACjB,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SAC1B,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IAEnD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,OAAO,CAAC,KAAK,CAAC,0CAA0C,QAAQ,EAAE,CAAC,CAAC;QACpE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,sBAAsB,IAAI,CAAC,MAAM,qBAAqB,CACvD,CAAC;IAEF,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;QACnB,EAAE,CAAC,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACpD,CAAC;IAED,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC;IAClD,MAAM,OAAO,GAAU,EAAE,CAAC;IAC1B,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,YAAY,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,MAAM,KAAK,GAAG,WAAW,CACpD,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,IAAA,mBAAS,EAAC,GAAG,EAAE;gBAChC,WAAW,EAAE,OAAO,CAAC,GAAG;gBACxB,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,IAAI,KAAK;gBAC/C,SAAS,EAAE,OAAO,CAAC,SAAS;aAC7B,CAAC,CAAC;YAEH,MAAM,OAAO,GAAG,IAAA,qBAAY,EAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAExC,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;gBACnB,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,KAAK,IAAI,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBACvD,MAAM,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC;gBAC3C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,IAAI,GAAG,GAAG,CAAC,CAAC;gBACtD,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI;oBAC1B,CAAC,CAAC,IAAA,sBAAU,EAAC,OAAO,EAAE,GAAG,CAAC;oBAC1B,CAAC,CAAC,IAAA,0BAAc,EAAC,OAAO,EAAE,GAAG,CAAC,CAAC;gBACjC,EAAE,CAAC,aAAa,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;gBAC5C,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,uBAAuB,OAAO,IAAI,CAAC,CAAC;YAC3D,CAAC;YAED,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;gBACjB,OAAO,CAAC
,IAAI,CAAC;oBACX,GAAG;oBACH,KAAK,EAAE,OAAO,CAAC,KAAK;oBACpB,MAAM,EAAE,OAAO,CAAC,MAAM;oBACtB,IAAI,EAAE,OAAO,CAAC,IAAI;oBAClB,SAAS,EAAE,OAAO,CAAC,SAAS;oBAC5B,WAAW,EAAE,OAAO,CAAC,WAAW;oBAChC,MAAM,EAAE,IAAI;iBACb,CAAC,CAAC;YACL,CAAC;YAED,YAAY,EAAE,CAAC;QACjB,CAAC;QAAC,OAAO,GAAQ,EAAE,CAAC;YAClB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,4BAA4B,GAAG,CAAC,OAAO,IAAI,GAAG,IAAI,CACnD,CAAC;YACF,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;gBACjB,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,GAAG,CAAC,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC5E,CAAC;YACD,SAAS,EAAE,CAAC;QACd,CAAC;QAED,2CAA2C;QAC3C,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACrC,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IAED,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,mBAAmB,YAAY,eAAe,SAAS,kBAAkB,CAC1E,CAAC;IAEF,IAAI,OAAO,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;QACpC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAChD,CAAC;AACH,CAAC;AAED,SAAS,MAAM,CAAC,OAAe,EAAE,QAAiB;IAChD,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACxC,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAC1D,EAAE,CAAC,aAAa,CAAC,QAAQ,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;QAC7C,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,sBAAsB,QAAQ,WAAW,CAAC,CAAC;IAClE,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IACvB,CAAC;AACH,CAAC;AAED,SAAS,OAAO,CAAC,IAAY;IAC3B,OAAO,IAAI;SACR,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;SACvB,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AACtB,CAAC;AAED,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
@@ -0,0 +1,25 @@
1
+ /**
2
+ * parser.ts - Extract clean article content from raw HTML
3
+ *
4
+ * Implements a simplified readability algorithm:
5
+ * 1. Strip non-content elements (nav, ads, scripts, styles, etc.)
6
+ * 2. Identify the main content container
7
+ * 3. Extract metadata (title, author, date)
8
+ * 4. Calculate reading statistics
9
+ */
10
+ export interface ParsedArticle {
11
+ title: string;
12
+ author: string | null;
13
+ date: string | null;
14
+ siteName: string | null;
15
+ description: string | null;
16
+ content: string;
17
+ textContent: string;
18
+ wordCount: number;
19
+ readingTime: string;
20
+ }
21
+ /**
22
+ * Parse HTML and extract clean article content
23
+ */
24
+ export declare function parseContent(html: string, url: string): ParsedArticle;
25
+ //# sourceMappingURL=parser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parser.d.ts","sourceRoot":"","sources":["../src/parser.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,MAAM,WAAW,aAAa;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,IAAI,EAAE,MAAM,GAAG,IAAI,CAAC;IACpB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;CACrB;AAoFD;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,aAAa,CAiDrE"}