glance-cli 0.13.0 → 0.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/dist/cli.js +136 -1065
- package/package.json +3 -2
- package/src/cli/commands.ts +832 -0
- package/src/cli/config.ts +24 -0
- package/src/cli/display.ts +269 -0
- package/src/cli/errors.ts +31 -0
- package/src/cli/index.ts +237 -0
- package/src/cli/logger.ts +43 -0
- package/src/cli/types.ts +114 -0
- package/src/cli/utils.ts +239 -0
- package/src/cli/validators.ts +176 -0
- package/src/cli.ts +17 -0
- package/src/core/compat.ts +96 -0
- package/src/core/extractor.ts +532 -0
- package/src/core/fetcher.ts +592 -0
- package/src/core/formatter.ts +742 -0
- package/src/core/language-detector.ts +382 -0
- package/src/core/screenshot.ts +444 -0
- package/src/core/service-detector.ts +411 -0
- package/src/core/summarizer.ts +656 -0
- package/src/core/text-cleaner.ts +150 -0
- package/src/core/voice.ts +708 -0
|
@@ -0,0 +1,532 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Production-Grade Content Extraction
|
|
3
|
+
*
|
|
4
|
+
* Features:
|
|
5
|
+
* - Advanced content detection with scoring algorithm
|
|
6
|
+
* - Deduplication and cleaning
|
|
7
|
+
* - Structured data extraction (JSON-LD, microdata)
|
|
8
|
+
* - Comprehensive metadata (OpenGraph, Twitter Cards, Schema.org)
|
|
9
|
+
* - Table extraction with structure preservation
|
|
10
|
+
* - Code block detection and formatting
|
|
11
|
+
* - Language detection
|
|
12
|
+
* - Reading time estimation (now accurate)
|
|
13
|
+
* - Author and publication date extraction
|
|
14
|
+
* - Performance optimizations
|
|
15
|
+
* - Single-pass noise removal
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import type { Cheerio, CheerioAPI } from "cheerio";
|
|
19
|
+
import * as cheerio from "cheerio";
|
|
20
|
+
import type { Element } from "domhandler";
|
|
21
|
+
|
|
22
|
+
// === Types ===
|
|
23
|
+
/** A hyperlink collected from a document by `extractLinks`. */
export interface Link {
  /** Link target; resolved against the base URL when one is supplied. */
  href: string;
  /** Trimmed anchor text, or the raw href when the anchor has no text. */
  text: string;
  /** Trimmed `title` attribute, when present. */
  title?: string;
  /** Trimmed `rel` attribute, when present. */
  rel?: string;
  /** Classification of the target relative to the base URL. */
  type?: "internal" | "external" | "anchor";
}
|
|
30
|
+
|
|
31
|
+
/** Page-level metadata produced by `extractMetadata`. */
export interface ExtendedMetadata {
  /** Page title (empty string when none is found). */
  title: string;
  /** Page description (empty string when none is found). */
  description: string;
  /** Entries of the comma-separated `keywords` meta tag. */
  keywords: string[];
  /** Declared document language. */
  language?: string;
  /** Author name. */
  author?: string;
  /** Publication timestamp as found in the markup. */
  publishDate?: string;
  /** Last-modified timestamp as found in the markup. */
  modifiedDate?: string;
  /** Publisher name. */
  publisher?: string;
  /** OpenGraph properties keyed without the `og:` prefix. */
  og: Record<string, string>;
  /** Twitter Card properties keyed without the `twitter:` prefix. */
  twitter: Record<string, string>;
  /** Parsed JSON-LD payloads, when any are present. */
  structuredData?: any[];
  /** Site name. */
  siteName?: string;
  /** OpenGraph object type. */
  type?: string;
  /** Page URL. */
  url?: string;
  /** Representative image URL. */
  image?: string;
  /** Canonical link target. */
  canonical?: string;
  /** Content of the `robots` meta tag. */
  robots?: string;
  /** Content of the `viewport` meta tag. */
  viewport?: string;
  /** Number of words in the extracted text. */
  wordCount?: number;
  /** Estimated reading time in minutes. */
  readingTime?: number;
}
|
|
53
|
+
|
|
54
|
+
/** Main-content extraction result produced by `extractContent`. */
export interface ExtractedContent {
  /** Cleaned plain text of the selected content element. */
  text: string;
  /** Inner HTML of the selected content element. */
  html?: string;
  /** Number of whitespace-separated words in `text`. */
  wordCount: number;
  /** Length of `text` in UTF-16 code units. */
  charCount: number;
  /** Number of paragraphs in `text` that exceed the minimum paragraph length. */
  paragraphCount: number;
  /** Whether the selected element contains `pre` or `code` elements. */
  hasCode?: boolean;
  /** Whether the selected element contains `table` elements. */
  hasTables?: boolean;
}
|
|
63
|
+
|
|
64
|
+
/** A table flattened into header labels and rows of cell text. */
export interface TableData {
  /** Trimmed text of the header cells. */
  headers: string[];
  /** Trimmed text of each data row's `td` cells. */
  rows: string[][];
}
|
|
68
|
+
|
|
69
|
+
/** A code snippet found in the document. */
export interface CodeBlock {
  /** Language taken from a `language-*` / `lang-*` class, when present. */
  language?: string;
  /** Trimmed source text of the snippet. */
  code: string;
}
|
|
73
|
+
|
|
74
|
+
// === Configuration ===
|
|
75
|
+
/** Thresholds shared by the extraction heuristics in this module. */
const EXTRACTOR_CONFIG = {
  /** Minimum trimmed text length (characters) for an element to be scored as content. */
  MIN_CONTENT_LENGTH: 200,
  /** A text chunk must be longer than this to count as a paragraph. */
  MIN_PARAGRAPH_LENGTH: 50,
  /** Link-text share of an element's text above which its score is penalised. */
  MAX_LINK_TEXT_RATIO: 0.5,
  /** Words per minute used for the reading-time estimate. */
  READING_WORDS_PER_MINUTE: 200,
} as const;
|
|
81
|
+
|
|
82
|
+
/**
 * Candidate containers for the main content, each with the bonus that is
 * added to an element's content score when it matches. Entries are tried in
 * this order and ties keep the earlier match, so the order is significant.
 */
const CONTENT_SELECTORS = [
  // Semantic containers
  { selector: "article", score: 100 },
  { selector: "[role='main']", score: 95 },
  { selector: "main", score: 90 },

  // Common CMS / blog class names
  { selector: ".post-content", score: 85 },
  { selector: ".entry-content", score: 85 },
  { selector: ".article-content", score: 85 },
  { selector: ".content-body", score: 80 },

  // Generic containers
  { selector: "#content", score: 75 },
  { selector: ".content", score: 70 },
  { selector: ".post", score: 65 },

  // Rendered markdown and microdata
  { selector: ".markdown-body", score: 90 },
  { selector: "[itemprop='articleBody']", score: 95 },
] as const;
|
|
96
|
+
|
|
97
|
+
// Combined for single-pass removal
|
|
98
|
+
/**
 * One comma-joined selector matching every element treated as noise, so a
 * single query-and-remove call can strip them all.
 */
const NOISE_SELECTOR_STRING = [
  // Non-visible or embedded resources
  ...["script", "style", "noscript", "iframe"],
  // Page chrome (headers/footers inside an article are kept)
  ...["nav", "header:not(article header)", "footer:not(article footer)", "aside"],
  // Ads, sharing widgets and secondary content
  ...[
    ".advertisement",
    ".ad",
    ".ads",
    ".social-share",
    ".comments",
    ".related-posts",
    ".sidebar",
  ],
  // ARIA landmarks that are not main content
  ...["[role='navigation']", "[role='banner']", "[role='complementary']"],
  // Overlays and prompts
  ...[
    "[class*='cookie']",
    "[id*='cookie']",
    "[class*='popup']",
    "[class*='modal']",
    "[class*='newsletter']",
  ],
  // Explicitly hidden elements
  ...[".hidden", "[hidden]", "[aria-hidden='true']"],
].join(", ");
|
|
126
|
+
|
|
127
|
+
// === Validation ===
|
|
128
|
+
/**
 * Rejects input that cannot be HTML.
 *
 * @param html - Candidate markup.
 * @throws Error("Invalid or empty HTML") when `html` is not a string, is
 *   empty or whitespace-only, or contains no `<` character.
 */
function validateHTML(html: string): void {
  const looksLikeMarkup =
    typeof html === "string" && html.trim().length > 0 && html.includes("<");

  if (!looksLikeMarkup) {
    throw new Error("Invalid or empty HTML");
  }
}
|
|
138
|
+
|
|
139
|
+
// === Scoring ===
|
|
140
|
+
/**
 * Heuristic content score for an element: higher means more article-like.
 *
 * @param _$ - Unused; kept so the signature stays stable for callers.
 * @param element - Element to score.
 * @returns 0 when the trimmed text is shorter than
 *   `EXTRACTOR_CONFIG.MIN_CONTENT_LENGTH`; otherwise the sum of the bonuses
 *   and penalties below (may be negative).
 */
function scoreElement(_$: CheerioAPI, element: Cheerio<Element>): number {
  const textLength = element.text().trim().length;
  if (textLength < EXTRACTOR_CONFIG.MIN_CONTENT_LENGTH) {
    return 0;
  }

  const count = (selector: string): number => element.find(selector).length;
  const paragraphCount = count("p");

  // Text volume (capped at 100) plus 5 points per paragraph.
  let total = Math.min(textLength / 10, 100) + paragraphCount * 5;

  // Penalise blocks whose text is mostly link text.
  const linkShare =
    textLength > 0 ? element.find("a").text().length / textLength : 0;
  if (linkShare > EXTRACTOR_CONFIG.MAX_LINK_TEXT_RATIO) {
    total -= 50;
  }

  // Structural hints of article content.
  if (count("h1,h2,h3") > 0) total += 10;
  if (paragraphCount > 3) total += 10;
  if (count("blockquote") > 0) total += 5;
  if (count("ul,ol") > 0) total += 5;

  // Hints of non-article content.
  if (count(".comments,.comment") > 0) total -= 20;
  if (count("form") > 2) total -= 15;

  return total;
}
|
|
164
|
+
|
|
165
|
+
/**
 * Picks the element most likely to hold the main content.
 *
 * Every match of every `CONTENT_SELECTORS` entry is scored as
 * `scoreElement(...) + selector bonus`. If nothing matched, or the best total
 * is below 100, every `div` is also considered using its content score alone.
 * Ties keep the earlier candidate.
 *
 * @param $ - Loaded document.
 * @returns The highest-scoring element, or `null` when no candidate scored
 *   above 0.
 */
function findBestContent($: CheerioAPI): Cheerio<Element> | null {
  let winner: Cheerio<Element> | null = null;
  let winningScore = 0;

  const consider = (candidate: Cheerio<Element>, score: number): void => {
    if (score > winningScore) {
      winningScore = score;
      winner = candidate;
    }
  };

  for (const entry of CONTENT_SELECTORS) {
    for (const node of $(entry.selector).toArray()) {
      const candidate = $(node);
      consider(candidate, scoreElement($, candidate) + entry.score);
    }
  }

  if (!winner || winningScore < 100) {
    for (const node of $("div").toArray()) {
      const candidate = $(node);
      consider(candidate, scoreElement($, candidate));
    }
  }

  return winner;
}
|
|
193
|
+
|
|
194
|
+
// === Text Cleaning ===
|
|
195
|
+
/**
 * Ordered repairs for quote normalisation and for UTF-8 text that was decoded
 * as Latin-1 / Windows-1252 ("mojibake"). Patterns use explicit escapes so
 * invisible characters (C1 controls, soft hyphen, NBSP) cannot be lost when
 * the file is edited or re-encoded.
 */
const MOJIBAKE_REPAIRS: ReadonlyArray<readonly [RegExp, string]> = [
  // Right single quote (plain or mis-decoded) -> ASCII apostrophe.
  [/\u2019|\u00E2\u20AC\u2122/g, "'"],
  // Left double quote (plain or mis-decoded) -> ASCII double quote.
  [/\u201C|\u00E2\u20AC\u0153/g, '"'],
  // Mis-decoded right double quote.
  [/\u00E2\u20AC\u009D/g, '"'],
  // Mis-decoded em dash whose last byte was turned into an ASCII quote.
  [/\u00E2\u20AC"/g, "\u2014"],
  // Mis-decoded en dash.
  [/\u00E2\u20AC\u0093/g, "\u2013"],
  // Stray "Â" left in front of a (non-breaking) space.
  [/\u00C2[ \u00A0]/g, " "],
  // Mis-decoded bullet, with or without its middle byte.
  [/\u00E2[\u20AC\u0080]?\u00A2/g, "\u2022"],
  // Mis-decoded accented letters: é á í ó ú ñ Ç.
  [/\u00C3\u00A9/g, "\u00E9"],
  [/\u00C3\u00A1/g, "\u00E1"],
  [/\u00C3\u00AD/g, "\u00ED"],
  [/\u00C3\u00B3/g, "\u00F3"],
  [/\u00C3\u00BA/g, "\u00FA"],
  [/\u00C3\u00B1/g, "\u00F1"],
  [/\u00C3\u0087/g, "\u00C7"],
];

/**
 * Normalises extracted text: repairs common mis-decoded sequences, strips
 * control and zero-width characters, collapses whitespace, and limits blank
 * lines to one between blocks.
 *
 * Steps, in order:
 *  1. Apply `MOJIBAKE_REPAIRS`. This runs before control characters are
 *     removed because several repairs match on C1 control characters.
 *  2. Remove control characters (C0 except tab/LF/FF/CR, DEL, C1), U+FFFD,
 *     the BOM, and zero-width characters. Tab/LF/FF/CR are kept so the line
 *     structure survives to the later steps.
 *  3. Collapse runs of spaces, tabs, CR and FF into one space, trim every
 *     line, and collapse runs of blank lines to a single blank line.
 *  4. Start a new paragraph after `.`, `!` or `?` when it is followed by
 *     whitespace and an uppercase ASCII letter.
 *
 * @param text - Raw text.
 * @returns The cleaned text with no leading or trailing whitespace.
 */
function cleanText(text: string): string {
  let repaired = text;
  for (const [pattern, replacement] of MOJIBAKE_REPAIRS) {
    repaired = repaired.replace(pattern, replacement);
  }

  return (
    repaired
      // \x09 (tab), \x0A (LF), \x0C (FF) and \x0D (CR) are deliberately kept.
      .replace(
        /[\x00-\x08\x0B\x0E-\x1F\x7F-\x9F\uFFFD\uFEFF\u200B-\u200D\u2060]/g,
        "",
      )
      .replace(/[ \t\r\f]+/g, " ")
      .split("\n")
      .map((line) => line.trim())
      .join("\n")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/(\.|!|\?)\s+(?=[A-Z])/g, "$1\n\n")
      .replace(/\n{3,}/g, "\n\n")
      .trim()
  );
}
|
|
229
|
+
|
|
230
|
+
/**
 * Renders an element as plain text with line breaks around block-level
 * children and bullets in front of list items. Works on a clone, so the
 * loaded document is not modified.
 *
 * @param $ - Loaded document, used to wrap the cloned nodes.
 * @param element - Element to render.
 * @returns The clone's text passed through `cleanText`.
 */
function extractFormattedText(
  $: CheerioAPI,
  element: Cheerio<Element>,
): string {
  const working = element.clone();

  // Runs `apply` on every descendant matching `selector` that has non-blank text.
  const forEachWithText = (
    selector: string,
    apply: (node: Cheerio<Element>) => void,
  ): void => {
    for (const raw of working.find(selector).toArray()) {
      const node = $(raw);
      if (node.text().trim()) {
        apply(node);
      }
    }
  };

  // Inserts `pad` as a text node immediately before and after the node.
  const surroundWith =
    (pad: string) =>
    (node: Cheerio<Element>): void => {
      node.before(pad).after(pad);
    };

  forEachWithText("h1,h2,h3,h4,h5,h6", surroundWith("\n\n"));
  forEachWithText("p", surroundWith("\n\n"));
  forEachWithText("div,blockquote,pre,ul,ol,dl", surroundWith("\n"));
  forEachWithText("li", (node) => {
    node.prepend("• ").before("\n");
  });

  return cleanText(working.text());
}
|
|
266
|
+
|
|
267
|
+
// === Extraction Functions ===
|
|
268
|
+
|
|
269
|
+
/**
 * Extracts the main content of an HTML document as cleaned plain text.
 *
 * Noise elements are removed first. The best content element (or `body` when
 * none is found) is then rendered with `extractFormattedText`. If that yields
 * fewer than `EXTRACTOR_CONFIG.MIN_CONTENT_LENGTH` characters, the cleaned
 * text of the whole `body` is returned instead.
 *
 * @param html - Document markup.
 * @returns Cleaned text.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractCleanText(html: string): string {
  validateHTML(html);

  const $ = cheerio.load(html);
  $(NOISE_SELECTOR_STRING).remove();

  const candidate = findBestContent($);
  const root = candidate?.length ? candidate : $("body");
  const formatted = extractFormattedText($, root);

  if (formatted.length >= EXTRACTOR_CONFIG.MIN_CONTENT_LENGTH) {
    return formatted;
  }
  return cleanText($("body").text());
}
|
|
285
|
+
|
|
286
|
+
/**
 * Extracts the main content of an HTML document together with basic
 * statistics about it.
 *
 * @param html - Document markup.
 * @returns Text and inner HTML of the best content element (or `body` when
 *   none is found), with word/character/paragraph counts and flags for code
 *   and tables.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractContent(html: string): ExtractedContent {
  validateHTML(html);

  const $ = cheerio.load(html);
  $(NOISE_SELECTOR_STRING).remove();

  const root = findBestContent($) ?? $("body");
  const text = extractFormattedText($, root);

  const words = text.split(/\s+/).filter(Boolean);
  // Only chunks longer than the minimum paragraph length count as paragraphs.
  const paragraphCount = text
    .split("\n\n")
    .filter(
      (chunk) => chunk.length > EXTRACTOR_CONFIG.MIN_PARAGRAPH_LENGTH,
    ).length;

  return {
    text,
    html: root.html() || "",
    wordCount: words.length,
    charCount: text.length,
    paragraphCount,
    hasCode: root.find("pre,code").length > 0,
    hasTables: root.find("table").length > 0,
  };
}
|
|
309
|
+
|
|
310
|
+
/**
 * Collects the unique http(s) links of a document.
 *
 * - `javascript:` hrefs and anything that is not http(s) after resolution are
 *   skipped.
 * - When `baseUrl` is given, hrefs that are not already absolute http(s) URLs
 *   are resolved against it; hrefs that fail to resolve are skipped.
 * - De-duplication happens on the final (resolved) href, so the same relative
 *   link repeated in the page is reported once.
 * - `type` is "anchor" when the final href contains `#`, otherwise "internal"
 *   when its hostname equals the base URL's hostname, otherwise "external".
 *
 * @param html - Document markup.
 * @param baseUrl - Optional URL of the page, used to resolve relative hrefs
 *   and to classify links.
 * @returns Links in document order.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractLinks(html: string, baseUrl?: string): Link[] {
  validateHTML(html);
  const $ = cheerio.load(html);

  // A real scheme check: a bare "http" prefix would also match relative paths
  // such as "httpd/docs.html".
  const isHttpUrl = (value: string): boolean => /^https?:\/\//i.test(value);

  // Parse the base once. An unparsable base simply disables "internal"
  // classification, as before.
  let baseHost: string | undefined;
  if (baseUrl) {
    try {
      baseHost = new URL(baseUrl).hostname;
    } catch {
      baseHost = undefined;
    }
  }

  const links: Link[] = [];
  const seen = new Set<string>();

  $("a[href]").each((_, el) => {
    const anchor = $(el);
    const rawHref = anchor.attr("href")?.trim();
    if (!rawHref || rawHref.startsWith("javascript:")) return;

    let href = rawHref;
    if (baseUrl && !isHttpUrl(href)) {
      try {
        href = new URL(href, baseUrl).href;
      } catch {
        return; // Unresolvable href: skip the link.
      }
    }

    // Check duplicates on the resolved form, which is what `seen` stores.
    if (!isHttpUrl(href) || seen.has(href)) return;
    seen.add(href);

    let type: Link["type"] = "external";
    if (baseHost !== undefined) {
      try {
        if (new URL(href).hostname === baseHost) type = "internal";
      } catch {
        // Unparsable absolute URL: leave it classified as external.
      }
    }
    if (href.includes("#")) type = "anchor";

    links.push({
      href,
      text: anchor.text().trim() || rawHref,
      title: anchor.attr("title")?.trim(),
      rel: anchor.attr("rel")?.trim(),
      type,
    });
  });

  return links;
}
|
|
354
|
+
|
|
355
|
+
/**
 * Gathers page metadata: standard meta tags, OpenGraph and Twitter Card
 * properties, JSON-LD payloads, and word-count / reading-time figures derived
 * from `extractCleanText`.
 *
 * @param html - Document markup.
 * @returns The collected metadata; `title` and `description` fall back to an
 *   empty string, `type` falls back to "website".
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractMetadata(html: string): ExtendedMetadata {
  validateHTML(html);
  const $ = cheerio.load(html);

  // Trimmed `content` attribute of the first element matching `selector`.
  const metaContent = (selector: string): string | undefined =>
    $(selector).attr("content")?.trim();

  // Collects <meta> tags whose `attribute` starts with `prefix`, keyed
  // without the prefix.
  const collectPrefixed = (
    attribute: "property" | "name",
    prefix: string,
  ): Record<string, string> => {
    const bag: Record<string, string> = {};
    for (const node of $(`meta[${attribute}^='${prefix}']`).toArray()) {
      const key = $(node).attr(attribute)?.replace(prefix, "");
      const value = $(node).attr("content");
      if (key && value) {
        bag[key] = value.trim();
      }
    }
    return bag;
  };

  const og = collectPrefixed("property", "og:");
  const twitter = collectPrefixed("name", "twitter:");

  const structuredData: any[] = [];
  for (const node of $("script[type='application/ld+json']").toArray()) {
    try {
      structuredData.push(JSON.parse($(node).html() || "{}"));
    } catch {
      // Malformed JSON-LD is skipped.
    }
  }

  const keywords = (metaContent("meta[name='keywords']") || "")
    .split(",")
    .map((keyword) => keyword.trim())
    .filter(Boolean);

  const title =
    $("title").first().text().trim() ||
    og.title ||
    metaContent("meta[property='og:title']") ||
    $("h1").first().text().trim() ||
    "";

  const description =
    metaContent("meta[name='description']") ||
    og.description ||
    metaContent("meta[property='og:description']") ||
    "";

  const author =
    metaContent("meta[name='author']") ||
    metaContent("meta[property='article:author']") ||
    $("[rel='author']").text().trim();

  const publishDate =
    metaContent("meta[property='article:published_time']") ||
    $("time[datetime]").first().attr("datetime");

  const modifiedDate = metaContent("meta[property='article:modified_time']");

  const canonical = $("link[rel='canonical']").attr("href");

  // Word count comes from the cleaned main-content text.
  const mainText = extractCleanText(html);
  const wordCount = mainText.split(/\s+/).filter(Boolean).length;
  const minutes = Math.ceil(
    wordCount / EXTRACTOR_CONFIG.READING_WORDS_PER_MINUTE,
  );
  const readingTime = Math.max(1, minutes);

  return {
    title,
    description,
    keywords,
    language: $("html").attr("lang") || metaContent("meta[name='language']"),
    author,
    publishDate,
    modifiedDate,
    publisher: og.site_name,
    og,
    twitter,
    structuredData: structuredData.length > 0 ? structuredData : undefined,
    siteName: og.site_name || metaContent("meta[property='og:site_name']"),
    type: og.type || "website",
    url: og.url || canonical,
    image: og.image || metaContent("meta[name='twitter:image']"),
    canonical,
    robots: metaContent("meta[name='robots']"),
    viewport: metaContent("meta[name='viewport']"),
    wordCount,
    readingTime,
  };
}
|
|
439
|
+
|
|
440
|
+
/**
 * Extracts every `<table>` as header labels plus rows of cell text.
 *
 * Headers come from `thead th` / `tr:first-child th`; when there are none,
 * the `td` cells of `tr:first-child` are used. If any headers were found, the
 * first `tr` is skipped when collecting rows. Rows without `td` cells are
 * dropped, and tables with neither headers nor rows are omitted.
 *
 * @param html - Document markup.
 * @returns Tables in document order.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractTables(html: string): TableData[] {
  validateHTML(html);

  const $ = cheerio.load(html);

  // Trimmed text of every descendant of `scope` matching `selector`.
  const cellTexts = (scope: Cheerio<Element>, selector: string): string[] =>
    scope
      .find(selector)
      .toArray()
      .map((cell) => $(cell).text().trim());

  const tables: TableData[] = [];

  for (const node of $("table").toArray()) {
    const table = $(node);

    let headers = cellTexts(table, "thead th, tr:first-child th");
    if (headers.length === 0) {
      headers = cellTexts(table, "tr:first-child td");
    }

    const firstDataRow = headers.length > 0 ? 1 : 0;
    const rows = table
      .find("tr")
      .slice(firstDataRow)
      .toArray()
      .map((tr) => cellTexts($(tr), "td"))
      .filter((cells) => cells.length > 0);

    if (headers.length > 0 || rows.length > 0) {
      tables.push({ headers, rows });
    }
  }

  return tables;
}
|
|
487
|
+
|
|
488
|
+
/**
 * Extracts code snippets from a document.
 *
 * Each `<pre>` contributes the text of its first nested `<code>` (or its own
 * text when it has none). The language is read from a `language-*` or
 * `lang-*` class on that `<code>`, falling back to the `<pre>`. Only when no
 * `<pre>` produced a snippet are `<code>` elements with more than 10
 * characters of text collected instead.
 *
 * @param html - Document markup.
 * @returns Snippets in document order.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractCodeBlocks(html: string): CodeBlock[] {
  validateHTML(html);

  const $ = cheerio.load(html);
  const found: CodeBlock[] = [];

  for (const node of $("pre").toArray()) {
    const pre = $(node);
    const inner = pre.find("code").first();

    const source = (inner.length > 0 ? inner.text() : pre.text()).trim();
    if (!source) continue;

    const classes = inner.attr("class") || pre.attr("class") || "";
    const match = classes.match(/language-(\w+)|lang-(\w+)/);

    found.push({
      language: match ? match[1] || match[2] : undefined,
      code: source,
    });
  }

  if (found.length === 0) {
    for (const node of $("code").toArray()) {
      const snippet = $(node).text().trim();
      if (snippet && snippet.length > 10) {
        found.push({ code: snippet });
      }
    }
  }

  return found;
}
|
|
523
|
+
|
|
524
|
+
/**
 * Runs every extractor on the same markup and bundles the results.
 *
 * @param html - Document markup.
 * @param baseUrl - Optional page URL, forwarded to `extractLinks`.
 * @returns Content, metadata, links, tables and code blocks.
 * @throws Error when `html` fails `validateHTML`.
 */
export function extractAll(html: string, baseUrl?: string) {
  const content = extractContent(html);
  const metadata = extractMetadata(html);
  const links = extractLinks(html, baseUrl);
  const tables = extractTables(html);
  const codeBlocks = extractCodeBlocks(html);

  return { content, metadata, links, tables, codeBlocks };
}
|