llm-search-tools 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +244 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/dist/integration.test.d.ts +1 -0
- package/dist/integration.test.js +237 -0
- package/dist/modules/answerbox.test.d.ts +1 -0
- package/dist/modules/answerbox.test.js +105 -0
- package/dist/modules/autocomplete.d.ts +11 -0
- package/dist/modules/autocomplete.js +159 -0
- package/dist/modules/autocomplete.test.d.ts +1 -0
- package/dist/modules/autocomplete.test.js +188 -0
- package/dist/modules/common.d.ts +26 -0
- package/dist/modules/common.js +263 -0
- package/dist/modules/common.test.d.ts +1 -0
- package/dist/modules/common.test.js +87 -0
- package/dist/modules/crawl.d.ts +9 -0
- package/dist/modules/crawl.js +117 -0
- package/dist/modules/crawl.test.d.ts +1 -0
- package/dist/modules/crawl.test.js +48 -0
- package/dist/modules/events.d.ts +8 -0
- package/dist/modules/events.js +129 -0
- package/dist/modules/events.test.d.ts +1 -0
- package/dist/modules/events.test.js +104 -0
- package/dist/modules/finance.d.ts +10 -0
- package/dist/modules/finance.js +20 -0
- package/dist/modules/finance.test.d.ts +1 -0
- package/dist/modules/finance.test.js +77 -0
- package/dist/modules/flights.d.ts +8 -0
- package/dist/modules/flights.js +135 -0
- package/dist/modules/flights.test.d.ts +1 -0
- package/dist/modules/flights.test.js +128 -0
- package/dist/modules/hackernews.d.ts +8 -0
- package/dist/modules/hackernews.js +87 -0
- package/dist/modules/hackernews.js.map +1 -0
- package/dist/modules/images.test.d.ts +1 -0
- package/dist/modules/images.test.js +145 -0
- package/dist/modules/integrations.test.d.ts +1 -0
- package/dist/modules/integrations.test.js +93 -0
- package/dist/modules/media.d.ts +11 -0
- package/dist/modules/media.js +132 -0
- package/dist/modules/media.test.d.ts +1 -0
- package/dist/modules/media.test.js +186 -0
- package/dist/modules/news.d.ts +3 -0
- package/dist/modules/news.js +39 -0
- package/dist/modules/news.test.d.ts +1 -0
- package/dist/modules/news.test.js +88 -0
- package/dist/modules/parser.d.ts +19 -0
- package/dist/modules/parser.js +361 -0
- package/dist/modules/parser.test.d.ts +1 -0
- package/dist/modules/parser.test.js +151 -0
- package/dist/modules/reddit.d.ts +21 -0
- package/dist/modules/reddit.js +107 -0
- package/dist/modules/scrape.d.ts +16 -0
- package/dist/modules/scrape.js +272 -0
- package/dist/modules/scrape.test.d.ts +1 -0
- package/dist/modules/scrape.test.js +232 -0
- package/dist/modules/scraper.d.ts +12 -0
- package/dist/modules/scraper.js +640 -0
- package/dist/modules/scrapers/anidb.d.ts +8 -0
- package/dist/modules/scrapers/anidb.js +156 -0
- package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
- package/dist/modules/scrapers/duckduckgo.js +284 -0
- package/dist/modules/scrapers/google-news.d.ts +2 -0
- package/dist/modules/scrapers/google-news.js +60 -0
- package/dist/modules/scrapers/google.d.ts +6 -0
- package/dist/modules/scrapers/google.js +211 -0
- package/dist/modules/scrapers/searxng.d.ts +2 -0
- package/dist/modules/scrapers/searxng.js +93 -0
- package/dist/modules/scrapers/thetvdb.d.ts +3 -0
- package/dist/modules/scrapers/thetvdb.js +147 -0
- package/dist/modules/scrapers/tmdb.d.ts +3 -0
- package/dist/modules/scrapers/tmdb.js +172 -0
- package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
- package/dist/modules/scrapers/yahoo-finance.js +33 -0
- package/dist/modules/search.d.ts +5 -0
- package/dist/modules/search.js +45 -0
- package/dist/modules/search.js.map +1 -0
- package/dist/modules/search.test.d.ts +1 -0
- package/dist/modules/search.test.js +219 -0
- package/dist/modules/urbandictionary.d.ts +12 -0
- package/dist/modules/urbandictionary.js +26 -0
- package/dist/modules/webpage.d.ts +4 -0
- package/dist/modules/webpage.js +150 -0
- package/dist/modules/webpage.js.map +1 -0
- package/dist/modules/wikipedia.d.ts +5 -0
- package/dist/modules/wikipedia.js +85 -0
- package/dist/modules/wikipedia.js.map +1 -0
- package/dist/scripts/interactive-search.d.ts +1 -0
- package/dist/scripts/interactive-search.js +98 -0
- package/dist/test.d.ts +1 -0
- package/dist/test.js +179 -0
- package/dist/test.js.map +1 -0
- package/dist/testBraveSearch.d.ts +1 -0
- package/dist/testBraveSearch.js +34 -0
- package/dist/testDuckDuckGo.d.ts +1 -0
- package/dist/testDuckDuckGo.js +52 -0
- package/dist/testEcosia.d.ts +1 -0
- package/dist/testEcosia.js +57 -0
- package/dist/testSearchModule.d.ts +1 -0
- package/dist/testSearchModule.js +95 -0
- package/dist/testwebpage.d.ts +1 -0
- package/dist/testwebpage.js +81 -0
- package/dist/types.d.ts +174 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/createTestDocx.d.ts +1 -0
- package/dist/utils/createTestDocx.js +58 -0
- package/dist/utils/htmlcleaner.d.ts +20 -0
- package/dist/utils/htmlcleaner.js +172 -0
- package/docs/README.md +275 -0
- package/docs/autocomplete.md +73 -0
- package/docs/crawling.md +88 -0
- package/docs/events.md +58 -0
- package/docs/examples.md +158 -0
- package/docs/finance.md +60 -0
- package/docs/flights.md +71 -0
- package/docs/hackernews.md +121 -0
- package/docs/media.md +87 -0
- package/docs/news.md +75 -0
- package/docs/parser.md +197 -0
- package/docs/scraper.md +347 -0
- package/docs/search.md +106 -0
- package/docs/wikipedia.md +91 -0
- package/package.json +97 -0
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
/**
 * Object form of a proxy setting.
 *
 * Options elsewhere in this file accept `ProxyConfig | string`; this is the
 * structured alternative to the plain string.
 */
export interface ProxyConfig {
    /** Proxy protocol; restricted to the four literals below. */
    type:
        | "http"
        | "https"
        | "socks4"
        | "socks5";
    host: string;
    port: number;
    /** Optional username/password pair. */
    auth?: { username: string; password: string };
    /**
     * NOTE(review): how this relates to `type`/`host`/`port` is not visible in
     * this declaration — presumably a complete proxy URL; confirm against the
     * implementation.
     */
    url?: string;
}
|
|
11
|
+
/**
 * Option bag for the scraper-style entry points. Every field is optional.
 *
 * `CrawlOptions`, `FlightSearchOptions`, `EventSearchOptions` and
 * `MediaSearchOptions` in this file extend this interface.
 */
export interface ScraperOptions {
    limit?: number;
    safeSearch?: boolean;
    /** NOTE(review): unit is not stated here — presumably milliseconds; confirm. */
    timeout?: number;
    forcePuppeteer?: boolean;
    /** Either a structured `ProxyConfig` or a plain string. */
    proxy?: ProxyConfig | string;
    /** Retry-related settings; all three members are optional. */
    antiBot?: { enabled?: boolean; maxRetries?: number; retryDelay?: number };
    searxngInstance?: string;
    category?:
        | "web"
        | "news"
        | "images"
        | "videos";
}
|
|
25
|
+
/**
 * Option bag for search calls. Every field is optional.
 *
 * Declares the same members as `ScraperOptions` except that it has no
 * `category` field.
 */
export interface SearchOptions {
    limit?: number;
    safeSearch?: boolean;
    /** NOTE(review): unit is not stated here — presumably milliseconds; confirm. */
    timeout?: number;
    forcePuppeteer?: boolean;
    /** Either a structured `ProxyConfig` or a plain string. */
    proxy?: ProxyConfig | string;
    /** Retry-related settings; all three members are optional. */
    antiBot?: { enabled?: boolean; maxRetries?: number; retryDelay?: number };
    searxngInstance?: string;
}
|
|
38
|
+
/**
 * A single search hit. `source` is a closed set of string literals naming
 * where the hit came from.
 */
export interface SearchResult {
    title: string;
    url: string;
    snippet?: string;
    source:
        | "google"
        | "duckduckgo"
        | "wikipedia"
        | "hackernews"
        | "searxng"
        | "google-news"
        | "duckduckgo-news"
        | "google-images"
        | "duckduckgo-images"
        | "searxng-images";
}
|
|
44
|
+
/**
 * A `SearchResult` for an image hit. Adds image fields and narrows `source`
 * to the three `*-images` literals.
 */
export interface ImageResult extends SearchResult {
    imageUrl: string;
    thumbnailUrl?: string;
    /** NOTE(review): unit not stated — presumably pixels; confirm. */
    width?: number;
    /** NOTE(review): unit not stated — presumably pixels; confirm. */
    height?: number;
    source:
        | "google-images"
        | "duckduckgo-images"
        | "searxng-images";
}
|
|
51
|
+
/** A `SearchResult` for a news hit, with optional publication metadata. */
export interface NewsResult extends SearchResult {
    sourceName?: string;
    /** May be either a string or a `Date`; callers must handle both. */
    publishedAt?: string | Date;
    imageUrl?: string;
}
|
|
56
|
+
/** A `SearchResult` with two optional Wikipedia-specific string fields. */
export interface WikipediaResult extends SearchResult {
    extract?: string;
    /** NOTE(review): presumably an image URL; confirm against the producer. */
    thumbnail?: string;
}
|
|
60
|
+
/** A `SearchResult` with optional Hacker News item metadata. */
export interface HackerNewsResult extends SearchResult {
    id?: number;
    points?: number;
    author?: string;
    /** A number, not a list — presumably the comment count; confirm. */
    comments?: number;
    time?: Date;
}
|
|
67
|
+
/**
 * A quote record. Only `symbol` and `source` are required; `source` is fixed
 * to the literal `"yahoo-finance"`.
 */
export interface FinanceResult {
    symbol: string;
    shortName?: string;
    longName?: string;
    regularMarketPrice?: number;
    regularMarketChange?: number;
    regularMarketChangePercent?: number;
    regularMarketTime?: Date;
    currency?: string;
    exchange?: string;
    marketState?: string;
    source: "yahoo-finance";
}
|
|
80
|
+
/**
 * Extracted content of one web page. `content`, `textContent` and `length`
 * are required; everything else is optional.
 */
export interface WebpageContent {
    title?: string;
    content: string;
    textContent: string;
    /** NOTE(review): what is measured is not stated — presumably the text length; confirm. */
    length: number;
    excerpt?: string;
    siteName?: string;
    favicon?: string;
    markdown?: string;
    imageUrls?: string[];
    rawHtml?: string;
}
|
|
92
|
+
/**
 * Error shape with a message and a string code. `originalError` is `unknown`,
 * so it must be narrowed before use.
 */
export interface SearchError {
    message: string;
    code: string;
    originalError?: unknown;
}
|
|
97
|
+
/** `ScraperOptions` plus optional crawl-specific settings. */
export interface CrawlOptions extends ScraperOptions {
    maxPages?: number;
    maxDepth?: number;
    crawlType?:
        | "cheerio"
        | "puppeteer";
    stayOnDomain?: boolean;
    ignoreRobotsTxt?: boolean;
}
|
|
104
|
+
/** A `WebpageContent` tagged with the page `url` and a numeric `depth`. */
export interface CrawledPage extends WebpageContent {
    url: string;
    /** NOTE(review): presumably link distance from the start page; confirm. */
    depth: number;
}
|
|
108
|
+
/** Result of a crawl: an array of `CrawledPage` entries. */
export type CrawlResult = Array<CrawledPage>;
|
|
109
|
+
/**
 * Suggestions returned for one query. Unlike the other result types in this
 * file, `source` is an open `string` rather than a literal union.
 */
export interface AutocompleteResult {
    query: string;
    suggestions: string[];
    source: string;
}
|
|
114
|
+
/** Optional settings for an autocomplete call. */
export interface AutocompleteOptions {
    limit?: number;
    /** Either a structured `ProxyConfig` or a plain string. */
    proxy?: ProxyConfig | string;
    /** NOTE(review): unit is not stated here — presumably milliseconds; confirm. */
    timeout?: number;
}
|
|
119
|
+
/**
 * One flight entry. Every field, including times, duration, price and stops,
 * is typed as a plain `string`.
 */
export interface Flight {
    airline: string;
    departureTime: string;
    arrivalTime: string;
    duration: string;
    price: string;
    stops: string;
    origin?: string;
    destination?: string;
}
|
|
129
|
+
/**
 * A list of `Flight` entries with a `url`; `source` is fixed to the literal
 * `"google-flights"`.
 */
export interface FlightResult {
    flights: Array<Flight>;
    url: string;
    source: "google-flights";
}
|
|
134
|
+
/**
 * `ScraperOptions` plus optional flight-search fields, all typed as strings.
 *
 * NOTE(review): the expected date format and whether `from`/`to` are airport
 * codes or free text are not visible in this declaration; confirm.
 */
export interface FlightSearchOptions extends ScraperOptions {
    departureDate?: string;
    returnDate?: string;
    from?: string;
    to?: string;
}
|
|
140
|
+
/**
 * One event entry. `title`, `date` and `location` are required strings.
 *
 * The name matches the DOM global `Event`; importing it by name into a file
 * shadows that global within the importing module.
 */
export interface Event {
    title: string;
    date: string;
    location: string;
    link?: string;
    description?: string;
    image?: string;
}
|
|
148
|
+
/**
 * A list of `Event` entries with a `url`; `source` is fixed to the literal
 * `"google-events"`.
 */
export interface EventResult {
    events: Array<Event>;
    url: string;
    source: "google-events";
}
|
|
153
|
+
/** `ScraperOptions` plus an optional `date` restricted to six named periods. */
export interface EventSearchOptions extends ScraperOptions {
    date?:
        | "today"
        | "tomorrow"
        | "week"
        | "weekend"
        | "month"
        | "next_month";
}
|
|
156
|
+
/**
 * One media entry. `title`, `url`, `source` and `mediaType` are required;
 * `source` and `mediaType` are closed literal unions.
 */
export interface MediaResult {
    title: string;
    description?: string;
    /** Typed as a string, not a number. */
    rating?: string;
    releaseDate?: string;
    cast?: string[];
    genres?: string[];
    posterUrl?: string;
    /** Each provider has a name and one of three availability kinds. */
    watchProviders?: Array<{
        name: string;
        type: "stream" | "rent" | "buy";
    }>;
    url: string;
    source:
        | "tmdb"
        | "thetvdb"
        | "anidb";
    mediaType:
        | "movie"
        | "tv"
        | "anime";
}
|
|
172
|
+
/**
 * `ScraperOptions` plus an optional `type`, which takes the same three
 * literals as `MediaResult.mediaType`.
 */
export interface MediaSearchOptions extends ScraperOptions {
    type?:
        | "movie"
        | "tv"
        | "anime";
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":";AAAA,0CAA0C"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
const jszip_1 = __importDefault(require("jszip"));
|
|
7
|
+
const fs_1 = require("fs");
|
|
8
|
+
/**
 * Build a minimal DOCX fixture in memory and write it to
 * `test/files/test.docx`.
 *
 * The archive holds three parts: the main document (`word/document.xml`, four
 * single-run paragraphs), the content-type map (`[Content_Types].xml`) and
 * the package relationships (`_rels/.rels`) pointing at the main document.
 *
 * The output path is relative, so it resolves against the current working
 * directory. The target directory is created if missing.
 *
 * @returns A promise that resolves once the file has been written. It rejects
 * if archive generation or the filesystem calls fail.
 */
async function createTestDocx() {
    const zip = new jszip_1.default();
    // Main document part
    zip.file('word/document.xml', `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r>
<w:t>Sample Test Document</w:t>
</w:r>
</w:p>
<w:p>
<w:r>
<w:t>This is a test DOCX document for llm-kit parser testing.</w:t>
</w:r>
</w:p>
<w:p>
<w:r>
<w:t>It contains some sample paragraphs to verify DOCX parsing.</w:t>
</w:r>
</w:p>
<w:p>
<w:r>
<w:t>Each paragraph should be properly extracted and formatted.</w:t>
</w:r>
</w:p>
</w:body>
</w:document>`);
    // Content-type map for the parts above
    zip.file('[Content_Types].xml', `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>`);
    // Package-level relationship that names the main document part
    zip.file('_rels/.rels', `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>`);
    // Generate the DOCX file as a Node Buffer, DEFLATE-compressed at level 9
    const content = await zip.generateAsync({
        type: 'nodebuffer',
        compression: 'DEFLATE',
        compressionOptions: {
            level: 9
        }
    });
    // writeFileSync does not create parent directories; without this the
    // script fails with ENOENT on a checkout that has no test/files yet.
    (0, fs_1.mkdirSync)('test/files', { recursive: true });
    // Write to file
    (0, fs_1.writeFileSync)('test/files/test.docx', content);
}
// Create the test DOCX file when this module is loaded; failures are logged
// to stderr rather than rethrown.
createTestDocx().catch(console.error);
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
 * Rewrite relative `href` and `src` attribute values so they resolve against
 * `baseUrl`.
 *
 * `data:` URIs and already-absolute HTTP(S) URLs are left untouched, as is any
 * value that cannot be resolved.
 *
 * @param html Markup to process
 * @param baseUrl URL that relative references are resolved against
 * @returns The markup re-serialized after the attribute rewrite
 */
export declare function makeUrlsAbsolute(
    html: string,
    baseUrl: string,
): string;
|
|
8
|
+
/**
 * Strip non-content and hidden elements from the markup, drop attributes that
 * are not on the implementation's keep-list, and collapse whitespace.
 *
 * @param html Markup to clean
 * @returns The cleaned, minified markup
 */
export declare function cleanAndMinifyHtml(
    html: string,
): string;
|
|
14
|
+
/**
 * Run both helpers in sequence: clean and minify first, then make `href`/`src`
 * URLs absolute.
 *
 * @param html Markup to process
 * @param baseUrl URL that relative references are resolved against
 * @returns The cleaned markup with absolute URLs
 */
export declare function processHtml(
    html: string,
    baseUrl: string,
): string;
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// utils/htmlcleaner.ts - utilities for processing and cleaning html
|
|
3
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
4
|
+
if (k2 === undefined) k2 = k;
|
|
5
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
6
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
7
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
8
|
+
}
|
|
9
|
+
Object.defineProperty(o, k2, desc);
|
|
10
|
+
}) : (function(o, m, k, k2) {
|
|
11
|
+
if (k2 === undefined) k2 = k;
|
|
12
|
+
o[k2] = m[k];
|
|
13
|
+
}));
|
|
14
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
15
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
16
|
+
}) : function(o, v) {
|
|
17
|
+
o["default"] = v;
|
|
18
|
+
});
|
|
19
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
20
|
+
var ownKeys = function(o) {
|
|
21
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
22
|
+
var ar = [];
|
|
23
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
24
|
+
return ar;
|
|
25
|
+
};
|
|
26
|
+
return ownKeys(o);
|
|
27
|
+
};
|
|
28
|
+
return function (mod) {
|
|
29
|
+
if (mod && mod.__esModule) return mod;
|
|
30
|
+
var result = {};
|
|
31
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
32
|
+
__setModuleDefault(result, mod);
|
|
33
|
+
return result;
|
|
34
|
+
};
|
|
35
|
+
})();
|
|
36
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
37
|
+
exports.makeUrlsAbsolute = makeUrlsAbsolute;
|
|
38
|
+
exports.cleanAndMinifyHtml = cleanAndMinifyHtml;
|
|
39
|
+
exports.processHtml = processHtml;
|
|
40
|
+
const cheerio = __importStar(require("cheerio"));
|
|
41
|
+
/**
 * Resolve relative `href` and `src` attribute values against a base URL.
 *
 * A value is left untouched when it is empty, starts with `data:`, already
 * has an explicit `http://` or `https://` scheme, or cannot be resolved
 * (for example because `baseUrl` is not a valid URL).
 *
 * @param html The HTML string
 * @param baseUrl The base URL for resolving relative URLs
 * @returns The document serialized by cheerio, with resolved URLs
 */
function makeUrlsAbsolute(html, baseUrl) {
    const $ = cheerio.load(html);
    // Require the full "http://" / "https://" prefix. A bare startsWith("http")
    // also matches relative paths such as "http-guide.html" or "https.png",
    // which would then be left unresolved.
    const absoluteHttpUrl = /^https?:\/\//i;
    for (const attr of ["href", "src"]) {
        $(`[${attr}]`).each((_, element) => {
            const el = $(element);
            const value = el.attr(attr);
            if (!value || value.startsWith("data:") || absoluteHttpUrl.test(value)) {
                return;
            }
            try {
                el.attr(attr, new URL(value, baseUrl).toString());
            }
            catch {
                // keep original if URL parsing fails
            }
        });
    }
    return $.html();
}
|
|
67
|
+
/**
 * Clean an HTML string for content extraction and minify the result.
 *
 * Steps, in order:
 *  1. Remove non-content elements (scripts, styles, embeds, ...) and elements
 *     marked hidden via `hidden`, `aria-hidden="true"`, or an inline style
 *     containing `display: none` / `visibility: hidden` (any spacing or case).
 *  2. Remove every attribute that is not on the keep-list. `id` is kept only
 *     on `h1`-`h6` and `header`. Inline event handlers (`on*`) are never on
 *     the keep-list, so they are dropped as well.
 *  3. Serialize and apply regex-based minification.
 *
 * The minification regexes run over the whole serialized string. Whitespace
 * is therefore also collapsed inside elements such as `<pre>`, and whitespace
 * between adjacent tags is removed entirely.
 *
 * @param html The HTML string to clean
 * @returns Cleaned and minified HTML
 */
function cleanAndMinifyHtml(html) {
    const $ = cheerio.load(html);
    // Elements that never carry visible content, plus attribute-based hiding
    const elementsToRemove = [
        "script",
        "style",
        "meta",
        'link[rel="stylesheet"]',
        'link[rel="preload"]',
        'link[rel="prefetch"]',
        "iframe",
        "noscript",
        "svg",
        "video",
        "object",
        "embed",
        "canvas",
        "template",
        "[hidden]",
        '[aria-hidden="true"]',
    ];
    elementsToRemove.forEach((selector) => {
        $(selector).remove();
    });
    // Inline-style hiding. A regex is used instead of exact-substring selectors
    // so that spellings such as "display : none" or "DISPLAY:NONE" match too.
    // Every style matched by the earlier substring selectors still matches.
    const hiddenInlineStyle = /display\s*:\s*none|visibility\s*:\s*hidden/i;
    $("[style]")
        .filter((_, element) => hiddenInlineStyle.test($(element).attr("style") ?? ""))
        .remove();
    // Attribute keep-list; anything else (including on* handlers) is removed
    const keptEverywhere = new Set([
        // structural
        "href",
        "src",
        "alt",
        "title",
        // basic table formatting
        "colspan",
        "rowspan",
        // semantic
        "role",
        "aria-label",
    ]);
    // Tags on which `id` is preserved (header identification)
    const idKeptOn = new Set(["h1", "h2", "h3", "h4", "h5", "h6", "header"]);
    $("*").each((_, element) => {
        const el = $(element);
        const tagName = el.prop('tagName')?.toLowerCase();
        // Object.keys returns a snapshot, so removing while iterating is safe
        for (const attr of Object.keys(element.attribs ?? {})) {
            const keepAttribute = keptEverywhere.has(attr) ||
                (attr === "id" && idKeptOn.has(tagName));
            if (!keepAttribute) {
                el.removeAttr(attr);
            }
        }
    });
    // Basic HTML minification on the serialized document
    let result = $.html();
    result = result
        // Remove comments
        .replace(/<!--[\s\S]*?-->/g, "")
        // Collapse runs of whitespace to a single space
        .replace(/\s{2,}/g, " ")
        // Remove whitespace between tags
        .replace(/>\s+</g, "><")
        // Remove whitespace at start/end of each line
        .replace(/^\s+|\s+$/gm, "")
        // Collapse consecutive line breaks into one
        .replace(/\n+/g, "\n");
    return result;
}
|
|
161
|
+
/**
 * Clean and minify the markup, then make its `href`/`src` URLs absolute.
 *
 * @param html The HTML string to process
 * @param baseUrl The base URL for resolving relative URLs
 * @returns The result of `makeUrlsAbsolute` applied to the cleaned markup
 */
function processHtml(html, baseUrl) {
    // cleaning runs first; URL resolution is applied to its output
    return makeUrlsAbsolute(cleanAndMinifyHtml(html), baseUrl);
}
|