glance-cli 0.13.0 → 0.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,532 @@
1
+ /**
2
+ * Production-Grade Content Extraction
3
+ *
4
+ * Features:
5
+ * - Advanced content detection with scoring algorithm
6
+ * - Deduplication and cleaning
7
+ * - Structured data extraction (JSON-LD, microdata)
8
+ * - Comprehensive metadata (OpenGraph, Twitter Cards, Schema.org)
9
+ * - Table extraction with structure preservation
10
+ * - Code block detection and formatting
11
+ * - Language detection
12
+ * - Reading time estimation (now accurate)
13
+ * - Author and publication date extraction
14
+ * - Performance optimizations
15
+ * - Single-pass noise removal
16
+ */
17
+
18
+ import type { Cheerio, CheerioAPI } from "cheerio";
19
+ import * as cheerio from "cheerio";
20
+ import type { Element } from "domhandler";
21
+
22
+ // === Types ===
23
/** A hyperlink collected from an `<a href>` element by `extractLinks`. */
export interface Link {
  /** Link target; `extractLinks` only emits values that start with "http". */
  href: string;
  /** Trimmed anchor text, or the raw `href` attribute when the anchor has no text. */
  text: string;
  /** Trimmed `title` attribute, when present. */
  title?: string;
  /** Trimmed `rel` attribute, when present. */
  rel?: string;
  /** Classification of the target relative to the base URL supplied by the caller. */
  type?: "internal" | "external" | "anchor";
}
30
+
31
/** Page-level metadata assembled by `extractMetadata`. */
export interface ExtendedMetadata {
  /** Page title; empty string when none is found. */
  title: string;
  /** Page description; empty string when none is found. */
  description: string;
  /** Entries of the comma-separated `keywords` meta tag, trimmed, blanks removed. */
  keywords: string[];
  /** Value of `<html lang>` or the `language` meta tag. */
  language?: string;
  author?: string;
  publishDate?: string;
  modifiedDate?: string;
  publisher?: string;
  /** OpenGraph properties keyed by name without the `og:` prefix. */
  og: Record<string, string>;
  /** Twitter Card values keyed by name without the `twitter:` prefix. */
  twitter: Record<string, string>;
  /** Parsed JSON-LD payloads; omitted when the page has none. */
  structuredData?: any[];
  siteName?: string;
  type?: string;
  url?: string;
  image?: string;
  canonical?: string;
  robots?: string;
  viewport?: string;
  /** Whitespace-separated word count of the extracted clean text. */
  wordCount?: number;
  /** Estimated reading time in minutes. */
  readingTime?: number;
}
53
+
54
/** Main-content extraction result returned by `extractContent`. */
export interface ExtractedContent {
  /** Cleaned, formatted plain text of the selected content element. */
  text: string;
  /** Inner HTML of the selected content element. */
  html?: string;
  /** Number of whitespace-separated tokens in `text`. */
  wordCount: number;
  /** Length of `text` in UTF-16 code units. */
  charCount: number;
  /** Number of blank-line-separated chunks of `text` above the minimum paragraph length. */
  paragraphCount: number;
  /** True when the selected element contains `<pre>` or `<code>`. */
  hasCode?: boolean;
  /** True when the selected element contains a `<table>`. */
  hasTables?: boolean;
}
63
+
64
/** One HTML table flattened to text by `extractTables`. */
export interface TableData {
  /** Trimmed text of the header cells. */
  headers: string[];
  /** Trimmed `<td>` text for each data row, in document order. */
  rows: string[][];
}
68
+
69
/** A code snippet collected by `extractCodeBlocks`. */
export interface CodeBlock {
  /** Language taken from a `language-*` / `lang-*` class, when one is present. */
  language?: string;
  /** Trimmed source text of the snippet. */
  code: string;
}
73
+
74
+ // === Configuration ===
75
/** Tunable thresholds shared by the extraction functions. */
const EXTRACTOR_CONFIG = {
  /** Minimum text length (characters) for an element to receive a content score. */
  MIN_CONTENT_LENGTH: 200,
  /** Chunks must be longer than this (characters) to count as a paragraph. */
  MIN_PARAGRAPH_LENGTH: 50,
  /** Share of link text above which an element's score is penalised. */
  MAX_LINK_TEXT_RATIO: 0.5,
  /** Words per minute used for the reading-time estimate. */
  READING_WORDS_PER_MINUTE: 200,
} as const;
81
+
82
/**
 * Candidate main-content selectors with the bonus added to an element's
 * content score when it matches. Entries are tried in this order, and an
 * earlier candidate wins a tie because replacement requires a strictly
 * higher total.
 */
const CONTENT_SELECTORS = [
  // Semantic landmarks
  { selector: "article", score: 100 },
  { selector: "[role='main']", score: 95 },
  { selector: "main", score: 90 },
  // Common CMS / blog class names
  { selector: ".post-content", score: 85 },
  { selector: ".entry-content", score: 85 },
  { selector: ".article-content", score: 85 },
  { selector: ".content-body", score: 80 },
  // Generic containers
  { selector: "#content", score: 75 },
  { selector: ".content", score: 70 },
  { selector: ".post", score: 65 },
  // Rendered markdown and microdata
  { selector: ".markdown-body", score: 90 },
  { selector: "[itemprop='articleBody']", score: 95 },
] as const;
96
+
97
+ // Combined for single-pass removal
98
/**
 * Comma-joined selector list covering everything stripped from the document
 * before content detection, so removal needs a single query.
 */
const NOISE_SELECTOR_STRING = [
  // Non-content elements
  "script",
  "style",
  "noscript",
  "iframe",
  // Page chrome (headers/footers inside an article are kept)
  "nav",
  "header:not(article header)",
  "footer:not(article footer)",
  "aside",
  // Ads, sharing widgets, comments, sidebars
  ".advertisement",
  ".ad",
  ".ads",
  ".social-share",
  ".comments",
  ".related-posts",
  ".sidebar",
  // ARIA landmarks
  "[role='navigation']",
  "[role='banner']",
  "[role='complementary']",
  // Overlays and prompts
  "[class*='cookie']",
  "[id*='cookie']",
  "[class*='popup']",
  "[class*='modal']",
  "[class*='newsletter']",
  // Hidden content
  ".hidden",
  "[hidden]",
  "[aria-hidden='true']",
].join(", ");
126
+
127
+ // === Validation ===
128
/**
 * Rejects input that cannot be markup.
 *
 * @param html - Raw document text supplied by the caller.
 * @throws Error with message "Invalid or empty HTML" when `html` is not a
 *   string, is blank after trimming, or contains no "<" character.
 */
function validateHTML(html: string): void {
  const looksLikeMarkup =
    typeof html === "string" && html.trim().length > 0 && html.includes("<");

  if (!looksLikeMarkup) {
    throw new Error("Invalid or empty HTML");
  }
}
138
+
139
+ // === Scoring ===
140
/**
 * Heuristic content score for a candidate element.
 *
 * Elements whose trimmed text is shorter than
 * `EXTRACTOR_CONFIG.MIN_CONTENT_LENGTH` score 0. Otherwise the score starts
 * from text length (capped at 100 points) plus 5 points per `<p>`, then
 * gains or loses points for link density, headings, paragraph count, quotes,
 * lists, comment sections and forms.
 *
 * @param _$ - Unused; kept for signature compatibility.
 * @param element - Candidate element to evaluate.
 * @returns The accumulated score (may be negative).
 */
function scoreElement(_$: CheerioAPI, element: Cheerio<Element>): number {
  const length = element.text().trim().length;
  if (length < EXTRACTOR_CONFIG.MIN_CONTENT_LENGTH) {
    return 0;
  }

  const count = (selector: string): number => element.find(selector).length;
  const paragraphCount = count("p");
  const linkShare = length > 0 ? element.find("a").text().length / length : 0;

  // [condition, points] pairs, applied in the order listed.
  const adjustments: Array<[boolean, number]> = [
    [linkShare > EXTRACTOR_CONFIG.MAX_LINK_TEXT_RATIO, -50],
    [count("h1,h2,h3") > 0, 10],
    [paragraphCount > 3, 10],
    [count("blockquote") > 0, 5],
    [count("ul,ol") > 0, 5],
    [count(".comments,.comment") > 0, -20],
    [count("form") > 2, -15],
  ];

  let total = Math.min(length / 10, 100) + paragraphCount * 5;
  for (const [applies, points] of adjustments) {
    if (applies) {
      total += points;
    }
  }
  return total;
}
164
+
165
/**
 * Picks the element most likely to hold the page's main content.
 *
 * Every match of every `CONTENT_SELECTORS` entry is scored with
 * `scoreElement` plus the entry's bonus. If nothing was selected, or the
 * best total is below 100, every `<div>` is also considered using its
 * content score alone.
 *
 * @param $ - Loaded document.
 * @returns The highest-scoring element, or `null` when no candidate scored
 *   above zero.
 */
function findBestContent($: CheerioAPI): Cheerio<Element> | null {
  const best: { element: Cheerio<Element> | null; score: number } = {
    element: null,
    score: 0,
  };

  // Replaces the current best only on a strictly higher total.
  const consider = (candidate: Cheerio<Element>, bonus: number): void => {
    const total = scoreElement($, candidate) + bonus;
    if (total > best.score) {
      best.score = total;
      best.element = candidate;
    }
  };

  for (const entry of CONTENT_SELECTORS) {
    $(entry.selector).each((_, node) => {
      consider($(node), entry.score);
    });
  }

  const needsFallback = best.element === null || best.score < 100;
  if (needsFallback) {
    $("div").each((_, node) => {
      consider($(node), 0);
    });
  }

  return best.element;
}
193
+
194
+ // === Text Cleaning ===
195
/**
 * Ordered repairs for mis-decoded UTF-8 ("mojibake") and typographic quotes.
 *
 * These run BEFORE control characters are stripped: several sequences end in
 * a C1 control character (\x87, \x93, \x9D) and would be unmatchable once
 * that range has been removed.
 *
 * NOTE(review): the accented-letter rules were identity replacements in the
 * previous revision (e.g. "é" -> "é"). They are written here as the
 * UTF-8-read-as-Latin-1 sequences implied by the neighbouring "Ã\x87" -> "Ç"
 * rule; confirm against the intended inputs.
 */
const MOJIBAKE_REPAIRS: ReadonlyArray<readonly [RegExp, string]> = [
  [/\u00E2\u20AC\u2122/g, "'"], // â€™  (mis-decoded ’)
  [/\u00E2\u20AC\u0153/g, '"'], // â€œ  (mis-decoded “)
  [/\u2019/g, "'"], // ’
  [/\u201C/g, '"'], // “
  [/\u00E2\u20AC\x9D/g, '"'], // â€ + U+009D (mis-decoded ”)
  [/\u00E2\u20AC"/g, "\u2014"], // â€"  -> —
  [/\u00E2\u20AC\x93/g, "\u2013"], // â€ + U+0093 -> –
  [/\u00C2[ \u00A0]/g, " "], // Â + (non-breaking) space
  [/\u00E2(?:\u20AC|\x80)?\u00A2/g, "\u2022"], // â(€)¢ -> •
  [/\u00C3\u00A9/g, "\u00E9"], // Ã© -> é
  [/\u00C3\u00A1/g, "\u00E1"], // Ã¡ -> á
  [/\u00C3\u00AD/g, "\u00ED"], // Ã + soft hyphen -> í
  [/\u00C3\u00B3/g, "\u00F3"], // Ã³ -> ó
  [/\u00C3\u00BA/g, "\u00FA"], // Ãº -> ú
  [/\u00C3\u00B1/g, "\u00F1"], // Ã± -> ñ
  [/\u00C3\x87/g, "\u00C7"], // Ã + U+0087 -> Ç
];

/**
 * Normalises extracted text into readable paragraphs.
 *
 * Steps, in order:
 * 1. Repair mojibake / typographic quotes (`MOJIBAKE_REPAIRS`).
 * 2. Remove control and zero-width characters, keeping tab, newline, form
 *    feed and carriage return so the whitespace steps below can see them.
 *    (Previously the whole \x00-\x1F range was removed, which deleted every
 *    line break and glued adjacent blocks together.)
 * 3. Collapse runs of spaces/tabs/CR/FF to one space and runs of three or
 *    more newlines to a single blank line; trim every line.
 * 4. Drop empty lines unless they sit between two non-empty lines.
 * 5. Insert a blank line after ".", "!" or "?" when followed by whitespace
 *    and an uppercase ASCII letter.
 *
 * @param text - Raw text, typically from an element's text content.
 * @returns The cleaned text with no leading or trailing whitespace.
 */
function cleanText(text: string): string {
  let repaired = text;
  for (const [pattern, replacement] of MOJIBAKE_REPAIRS) {
    repaired = repaired.replace(pattern, replacement);
  }

  return (
    repaired
      // \x09 (tab), \x0A (LF), \x0C (FF) and \x0D (CR) are deliberately spared.
      .replace(
        /[\x00-\x08\x0B\x0E-\x1F\x7F-\x9F\uFFFD\uFEFF\u200B-\u200D\u2060]/g,
        "",
      )
      .replace(/[ \t\r\f]+/g, " ")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/ +\n/g, "\n")
      .replace(/\n +/g, "\n")
      .split("\n")
      .map((line) => line.trim())
      .filter((line, index, lines) => {
        if (line !== "") return true;
        // Keep a blank line only as a separator between two non-empty lines.
        const previous = index > 0 ? lines[index - 1] : "";
        const next = index < lines.length - 1 ? lines[index + 1] : "";
        return Boolean(previous) && Boolean(next);
      })
      .join("\n")
      .replace(/(\.|!|\?)\s+(?=[A-Z])/g, "$1\n\n")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/^\n+|\n+$/g, "")
      .trim()
  );
}
229
+
230
/**
 * Converts an element to plain text while keeping block structure visible.
 *
 * Works on a clone, so the loaded document is not modified. Non-empty
 * headings and paragraphs are surrounded by blank lines, other non-empty
 * block containers by single newlines, and non-empty list items get a
 * "• " prefix on their own line. The resulting text is passed through
 * `cleanText`.
 *
 * @param $ - Loaded document, used to wrap nodes.
 * @param element - Element whose text should be extracted.
 * @returns The cleaned text.
 */
function extractFormattedText(
  $: CheerioAPI,
  element: Cheerio<Element>,
): string {
  const copy = element.clone();

  // Applied in this order: selector -> text inserted before and after.
  const spacingRules: Array<{ selector: string; pad: string }> = [
    { selector: "h1,h2,h3,h4,h5,h6", pad: "\n\n" },
    { selector: "p", pad: "\n\n" },
    { selector: "div,blockquote,pre,ul,ol,dl", pad: "\n" },
  ];

  for (const { selector, pad } of spacingRules) {
    copy.find(selector).each((_, node) => {
      const wrapped = $(node);
      if (!wrapped.text().trim()) return;
      wrapped.before(pad).after(pad);
    });
  }

  copy.find("li").each((_, node) => {
    const item = $(node);
    if (!item.text().trim()) return;
    item.prepend("• ").before("\n");
  });

  return cleanText(copy.text());
}
266
+
267
+ // === Extraction Functions ===
268
+
269
/**
 * Extracts the readable text of a page.
 *
 * Noise elements are removed, the best content element is located and
 * formatted, and when the result is shorter than
 * `EXTRACTOR_CONFIG.MIN_CONTENT_LENGTH` the cleaned text of the whole
 * `<body>` is returned instead.
 *
 * @param html - Raw HTML document.
 * @returns Cleaned plain text.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractCleanText(html: string): string {
  validateHTML(html);

  const $ = cheerio.load(html);
  $(NOISE_SELECTOR_STRING).remove();

  const candidate = findBestContent($);
  const source = candidate?.length ? candidate : $("body");
  const formatted = extractFormattedText($, source);

  if (formatted.length >= EXTRACTOR_CONFIG.MIN_CONTENT_LENGTH) {
    return formatted;
  }
  // Too little text from the chosen element: use the entire body.
  return cleanText($("body").text());
}
285
+
286
/**
 * Extracts the main content of a page together with simple statistics.
 *
 * @param html - Raw HTML document.
 * @returns Text and inner HTML of the selected element (the best content
 *   candidate, or `<body>` when there is none), plus word, character and
 *   paragraph counts and code/table presence flags.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractContent(html: string): ExtractedContent {
  validateHTML(html);

  const $ = cheerio.load(html);
  $(NOISE_SELECTOR_STRING).remove();

  const element = findBestContent($) ?? $("body");
  const text = extractFormattedText($, element);

  // A paragraph is a blank-line-separated chunk longer than the minimum.
  let paragraphCount = 0;
  for (const chunk of text.split("\n\n")) {
    if (chunk.length > EXTRACTOR_CONFIG.MIN_PARAGRAPH_LENGTH) {
      paragraphCount += 1;
    }
  }

  const words = text.match(/\S+/g) ?? [];

  const result: ExtractedContent = {
    text,
    html: element.html() || "",
    wordCount: words.length,
    charCount: text.length,
    paragraphCount,
    hasCode: element.find("pre,code").length > 0,
    hasTables: element.find("table").length > 0,
  };
  return result;
}
309
+
310
/**
 * Collects the unique http(s) links of a document.
 *
 * Changes from the previous revision:
 * - De-duplication is keyed on the resolved URL, so a relative href that
 *   appears several times is emitted once (it used to be checked raw but
 *   recorded resolved, which let duplicates through).
 * - "Absolute" means an explicit `http://` or `https://` scheme, so a
 *   relative path such as `http-guide.html` is resolved against `baseUrl`
 *   instead of being emitted unresolved.
 * - `javascript:` hrefs are skipped regardless of case.
 * - The base URL is parsed once rather than once per link.
 *
 * @param html - Raw HTML document.
 * @param baseUrl - Optional page URL, used to resolve relative hrefs and to
 *   classify links as internal (same hostname) or external.
 * @returns Links in document order. Without `baseUrl`, relative hrefs are
 *   omitted. Any link whose URL contains "#" is typed "anchor", taking
 *   precedence over internal/external.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractLinks(html: string, baseUrl?: string): Link[] {
  validateHTML(html);
  const $ = cheerio.load(html);

  const absoluteHttp = /^https?:\/\//i;
  const scriptScheme = /^javascript:/i;

  // Parsed once; stays undefined when baseUrl is missing or malformed.
  let base: URL | undefined;
  if (baseUrl) {
    try {
      base = new URL(baseUrl);
    } catch {
      base = undefined;
    }
  }

  const links: Link[] = [];
  const seen = new Set<string>();

  $("a[href]").each((_, el) => {
    const anchor = $(el);
    const rawHref = anchor.attr("href")?.trim();
    if (!rawHref || scriptScheme.test(rawHref)) return;

    let href = rawHref;
    if (baseUrl && !absoluteHttp.test(href)) {
      try {
        href = new URL(href, baseUrl).href;
      } catch {
        // Unresolvable href (or malformed baseUrl): skip this link.
        return;
      }
    }

    // Drops mailto:, tel:, unresolved relative paths, and repeats.
    if (!absoluteHttp.test(href) || seen.has(href)) return;
    seen.add(href);

    let type: Link["type"] = "external";
    if (base) {
      try {
        if (new URL(href).hostname === base.hostname) type = "internal";
      } catch {
        // href could not be parsed; leave it classified as external.
      }
    }
    if (href.includes("#")) type = "anchor";

    links.push({
      href,
      text: anchor.text().trim() || rawHref,
      title: anchor.attr("title")?.trim(),
      rel: anchor.attr("rel")?.trim(),
      type,
    });
  });

  return links;
}
354
+
355
/**
 * Extracts page metadata: title/description/keywords, OpenGraph and Twitter
 * Card tags, JSON-LD payloads, authorship and dates, plus a word count and
 * reading-time estimate based on `extractCleanText`.
 *
 * Changes from the previous revision:
 * - `author` is `undefined` (not "") when no author source exists, and only
 *   the first `[rel='author']` element is used instead of concatenating the
 *   text of all of them.
 * - Empty JSON-LD `<script>` elements are skipped rather than contributing
 *   an empty `{}` entry.
 * - The local holding the extracted text no longer shadows `cleanText`.
 *
 * @param html - Raw HTML document.
 * @returns The collected metadata. `readingTime` is at least 1 minute.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractMetadata(html: string): ExtendedMetadata {
  validateHTML(html);
  const $ = cheerio.load(html);

  const getMeta = (selector: string): string | undefined =>
    $(selector).attr("content")?.trim();

  // Gathers <meta> tags whose `attribute` starts with `prefix`, keyed by the
  // attribute value with the prefix removed. Later duplicates overwrite.
  const collectPrefixed = (
    attribute: "property" | "name",
    prefix: string,
  ): Record<string, string> => {
    const found: Record<string, string> = {};
    $(`meta[${attribute}^='${prefix}']`).each((_, el) => {
      const key = $(el).attr(attribute)?.replace(prefix, "");
      const content = $(el).attr("content");
      if (key && content) found[key] = content.trim();
    });
    return found;
  };

  const og = collectPrefixed("property", "og:");
  const twitter = collectPrefixed("name", "twitter:");

  const structuredData: unknown[] = [];
  $("script[type='application/ld+json']").each((_, el) => {
    const raw = $(el).html()?.trim();
    if (!raw) return;
    try {
      structuredData.push(JSON.parse(raw));
    } catch {
      // Malformed JSON-LD is ignored; the remaining metadata is still useful.
    }
  });

  const keywords = (getMeta("meta[name='keywords']") || "")
    .split(",")
    .map((keyword) => keyword.trim())
    .filter(Boolean);

  const title =
    $("title").first().text().trim() ||
    og.title ||
    getMeta("meta[property='og:title']") ||
    $("h1").first().text().trim() ||
    "";
  const description =
    getMeta("meta[name='description']") ||
    og.description ||
    getMeta("meta[property='og:description']") ||
    "";

  const author =
    getMeta("meta[name='author']") ||
    getMeta("meta[property='article:author']") ||
    $("[rel='author']").first().text().trim() ||
    undefined;
  const publishDate =
    getMeta("meta[property='article:published_time']") ||
    $("time[datetime]").first().attr("datetime");
  const modifiedDate = getMeta("meta[property='article:modified_time']");

  const canonical = $("link[rel='canonical']").attr("href");

  const bodyText = extractCleanText(html);
  const wordCount = bodyText.split(/\s+/).filter(Boolean).length;
  const readingTime = Math.max(
    1,
    Math.ceil(wordCount / EXTRACTOR_CONFIG.READING_WORDS_PER_MINUTE),
  );

  return {
    title,
    description,
    keywords,
    language: $("html").attr("lang") || getMeta("meta[name='language']"),
    author,
    publishDate,
    modifiedDate,
    publisher: og.site_name,
    og,
    twitter,
    structuredData: structuredData.length > 0 ? structuredData : undefined,
    siteName: og.site_name || getMeta("meta[property='og:site_name']"),
    type: og.type || "website",
    url: og.url || canonical,
    image: og.image || getMeta("meta[name='twitter:image']"),
    canonical,
    robots: getMeta("meta[name='robots']"),
    viewport: getMeta("meta[name='viewport']"),
    wordCount,
    readingTime,
  };
}
439
+
440
/**
 * Extracts every `<table>` as header and row text.
 *
 * Header selection, first match wins:
 * 1. `<th>` cells inside `<thead>`;
 * 2. `<th>` cells of the table's first `<tr>`;
 * 3. `<td>` cells of the table's first `<tr>`.
 *
 * The previous revision used `tr:first-child`, which matches the first row
 * of EVERY section (thead, tbody, tfoot). Row-header `<th>` cells from the
 * first body row, or first-body-row `<td>` cells when the head used `<td>`,
 * leaked into `headers`, and a body row could appear both as headers and as
 * data. Headers now come from exactly one place.
 *
 * Data rows are the `<td>` texts of each `<tr>`; the first `<tr>` is skipped
 * when headers were found, and rows without `<td>` cells are dropped.
 *
 * @param html - Raw HTML document.
 * @returns One entry per table that produced headers or rows.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractTables(html: string): TableData[] {
  validateHTML(html);

  const $ = cheerio.load(html);
  const tables: TableData[] = [];

  // Trimmed text of each cell in a selection, in document order.
  const textsOf = (cells: Cheerio<Element>): string[] => {
    const texts: string[] = [];
    cells.each((_, cell) => {
      texts.push($(cell).text().trim());
    });
    return texts;
  };

  $("table").each((_, table) => {
    const $table = $(table);
    const allRows = $table.find("tr");
    const firstRow = allRows.first();

    let headers = textsOf($table.find("thead th"));
    if (headers.length === 0) headers = textsOf(firstRow.find("th"));
    if (headers.length === 0) headers = textsOf(firstRow.find("td"));

    const rows: string[][] = [];
    allRows.slice(headers.length > 0 ? 1 : 0).each((_, tr) => {
      const row = textsOf($(tr).find("td"));
      if (row.length > 0) {
        rows.push(row);
      }
    });

    if (headers.length > 0 || rows.length > 0) {
      tables.push({ headers, rows });
    }
  });

  return tables;
}
487
+
488
/**
 * Reads a language name from a class attribute containing a
 * `language-<name>` or `lang-<name>` token.
 *
 * The token must start the attribute or follow whitespace, and the name runs
 * to the next whitespace, so `language-c++` yields "c++" and a class such as
 * `slang-x` is not mistaken for a language marker.
 */
function detectCodeLanguage(classAttr: string | undefined): string | undefined {
  return classAttr?.match(/(?:^|\s)(?:language|lang)-(\S+)/)?.[1];
}

/**
 * Extracts code snippets from a document.
 *
 * Each `<pre>` yields one block, using the text of its first nested `<code>`
 * when present. If no `<pre>` produced a block, standalone `<code>` elements
 * longer than 10 characters are returned instead, without a language.
 *
 * Changes from the previous revision:
 * - Language names are no longer truncated at the first non-word character
 *   ("c++" used to become "c", "objective-c" became "objective").
 * - The language marker is looked up on `<code>` first and then on `<pre>`;
 *   previously any class on `<code>` hid a marker placed on `<pre>`.
 *
 * @param html - Raw HTML document.
 * @returns Code blocks in document order, each with trimmed text.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractCodeBlocks(html: string): CodeBlock[] {
  validateHTML(html);

  const $ = cheerio.load(html);
  const codeBlocks: CodeBlock[] = [];

  $("pre").each((_, pre) => {
    const $pre = $(pre);
    const code = $pre.find("code").first();
    const text = (code.length > 0 ? code.text() : $pre.text()).trim();
    if (!text) return;

    const language =
      detectCodeLanguage(code.attr("class")) ??
      detectCodeLanguage($pre.attr("class"));

    codeBlocks.push({ language, code: text });
  });

  if (codeBlocks.length === 0) {
    // No <pre> blocks: fall back to inline <code>, ignoring very short
    // fragments.
    $("code").each((_, code) => {
      const text = $(code).text().trim();
      if (text.length > 10) {
        codeBlocks.push({ code: text });
      }
    });
  }

  return codeBlocks;
}
523
+
524
/**
 * Runs every extractor over the same document.
 *
 * @param html - Raw HTML document.
 * @param baseUrl - Optional page URL, forwarded to `extractLinks`.
 * @returns Content, metadata, links, tables and code blocks.
 */
export function extractAll(html: string, baseUrl?: string) {
  const content = extractContent(html);
  const metadata = extractMetadata(html);
  const links = extractLinks(html, baseUrl);
  const tables = extractTables(html);
  const codeBlocks = extractCodeBlocks(html);

  return { content, metadata, links, tables, codeBlocks };
}