skillshield 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +400 -0
- package/dist/channels/discord.d.ts +18 -0
- package/dist/channels/discord.d.ts.map +1 -0
- package/dist/channels/discord.js +275 -0
- package/dist/channels/discord.js.map +1 -0
- package/dist/channels/index.d.ts +67 -0
- package/dist/channels/index.d.ts.map +1 -0
- package/dist/channels/index.js +127 -0
- package/dist/channels/index.js.map +1 -0
- package/dist/channels/slack.d.ts +20 -0
- package/dist/channels/slack.d.ts.map +1 -0
- package/dist/channels/slack.js +296 -0
- package/dist/channels/slack.js.map +1 -0
- package/dist/channels/telegram.d.ts +20 -0
- package/dist/channels/telegram.d.ts.map +1 -0
- package/dist/channels/telegram.js +223 -0
- package/dist/channels/telegram.js.map +1 -0
- package/dist/channels/whatsapp.d.ts +25 -0
- package/dist/channels/whatsapp.d.ts.map +1 -0
- package/dist/channels/whatsapp.js +187 -0
- package/dist/channels/whatsapp.js.map +1 -0
- package/dist/cli/commands/badge.d.ts +11 -0
- package/dist/cli/commands/badge.d.ts.map +1 -0
- package/dist/cli/commands/badge.js +98 -0
- package/dist/cli/commands/badge.js.map +1 -0
- package/dist/cli/commands/config.d.ts +3 -0
- package/dist/cli/commands/config.d.ts.map +1 -0
- package/dist/cli/commands/config.js +140 -0
- package/dist/cli/commands/config.js.map +1 -0
- package/dist/cli/commands/deploy.d.ts +3 -0
- package/dist/cli/commands/deploy.d.ts.map +1 -0
- package/dist/cli/commands/deploy.js +56 -0
- package/dist/cli/commands/deploy.js.map +1 -0
- package/dist/cli/commands/init.d.ts +3 -0
- package/dist/cli/commands/init.d.ts.map +1 -0
- package/dist/cli/commands/init.js +99 -0
- package/dist/cli/commands/init.js.map +1 -0
- package/dist/cli/commands/install.d.ts +3 -0
- package/dist/cli/commands/install.d.ts.map +1 -0
- package/dist/cli/commands/install.js +90 -0
- package/dist/cli/commands/install.js.map +1 -0
- package/dist/cli/commands/list.d.ts +3 -0
- package/dist/cli/commands/list.d.ts.map +1 -0
- package/dist/cli/commands/list.js +76 -0
- package/dist/cli/commands/list.js.map +1 -0
- package/dist/cli/commands/run.d.ts +3 -0
- package/dist/cli/commands/run.d.ts.map +1 -0
- package/dist/cli/commands/run.js +160 -0
- package/dist/cli/commands/run.js.map +1 -0
- package/dist/cli/commands/scan.d.ts +3 -0
- package/dist/cli/commands/scan.d.ts.map +1 -0
- package/dist/cli/commands/scan.js +133 -0
- package/dist/cli/commands/scan.js.map +1 -0
- package/dist/cli/commands/search.d.ts +3 -0
- package/dist/cli/commands/search.d.ts.map +1 -0
- package/dist/cli/commands/search.js +56 -0
- package/dist/cli/commands/search.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +70 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core/config.d.ts +167 -0
- package/dist/core/config.d.ts.map +1 -0
- package/dist/core/config.js +398 -0
- package/dist/core/config.js.map +1 -0
- package/dist/core/parser.d.ts +34 -0
- package/dist/core/parser.d.ts.map +1 -0
- package/dist/core/parser.js +462 -0
- package/dist/core/parser.js.map +1 -0
- package/dist/core/runtime.d.ts +68 -0
- package/dist/core/runtime.d.ts.map +1 -0
- package/dist/core/runtime.js +560 -0
- package/dist/core/runtime.js.map +1 -0
- package/dist/core/types.d.ts +525 -0
- package/dist/core/types.d.ts.map +1 -0
- package/dist/core/types.js +44 -0
- package/dist/core/types.js.map +1 -0
- package/dist/guard/index.d.ts +57 -0
- package/dist/guard/index.d.ts.map +1 -0
- package/dist/guard/index.js +238 -0
- package/dist/guard/index.js.map +1 -0
- package/dist/guard/patterns.d.ts +21 -0
- package/dist/guard/patterns.d.ts.map +1 -0
- package/dist/guard/patterns.js +797 -0
- package/dist/guard/patterns.js.map +1 -0
- package/dist/hub/index.d.ts +44 -0
- package/dist/hub/index.d.ts.map +1 -0
- package/dist/hub/index.js +144 -0
- package/dist/hub/index.js.map +1 -0
- package/dist/hub/registry.d.ts +52 -0
- package/dist/hub/registry.d.ts.map +1 -0
- package/dist/hub/registry.js +192 -0
- package/dist/hub/registry.js.map +1 -0
- package/dist/i18n/index.d.ts +19 -0
- package/dist/i18n/index.d.ts.map +1 -0
- package/dist/i18n/index.js +92 -0
- package/dist/i18n/index.js.map +1 -0
- package/dist/i18n/locales/en.d.ts +110 -0
- package/dist/i18n/locales/en.d.ts.map +1 -0
- package/dist/i18n/locales/en.js +123 -0
- package/dist/i18n/locales/en.js.map +1 -0
- package/dist/i18n/locales/es.d.ts +110 -0
- package/dist/i18n/locales/es.d.ts.map +1 -0
- package/dist/i18n/locales/es.js +123 -0
- package/dist/i18n/locales/es.js.map +1 -0
- package/dist/i18n/locales/pt.d.ts +110 -0
- package/dist/i18n/locales/pt.d.ts.map +1 -0
- package/dist/i18n/locales/pt.js +123 -0
- package/dist/i18n/locales/pt.js.map +1 -0
- package/dist/i18n/locales/zh.d.ts +110 -0
- package/dist/i18n/locales/zh.d.ts.map +1 -0
- package/dist/i18n/locales/zh.js +123 -0
- package/dist/i18n/locales/zh.js.map +1 -0
- package/dist/index.d.ts +168 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +275 -0
- package/dist/index.js.map +1 -0
- package/dist/router/index.d.ts +89 -0
- package/dist/router/index.d.ts.map +1 -0
- package/dist/router/index.js +292 -0
- package/dist/router/index.js.map +1 -0
- package/dist/router/providers.d.ts +48 -0
- package/dist/router/providers.d.ts.map +1 -0
- package/dist/router/providers.js +733 -0
- package/dist/router/providers.js.map +1 -0
- package/dist/runtime/executor.d.ts +96 -0
- package/dist/runtime/executor.d.ts.map +1 -0
- package/dist/runtime/executor.js +389 -0
- package/dist/runtime/executor.js.map +1 -0
- package/dist/sandbox/index.d.ts +52 -0
- package/dist/sandbox/index.d.ts.map +1 -0
- package/dist/sandbox/index.js +248 -0
- package/dist/sandbox/index.js.map +1 -0
- package/dist/security/skillguard.d.ts +25 -0
- package/dist/security/skillguard.d.ts.map +1 -0
- package/dist/security/skillguard.js +137 -0
- package/dist/security/skillguard.js.map +1 -0
- package/dist/tools/index.d.ts +55 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +276 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/tools/web-engine.d.ts +158 -0
- package/dist/tools/web-engine.d.ts.map +1 -0
- package/dist/tools/web-engine.js +802 -0
- package/dist/tools/web-engine.js.map +1 -0
- package/dist/tools/web-tools.d.ts +173 -0
- package/dist/tools/web-tools.d.ts.map +1 -0
- package/dist/tools/web-tools.js +251 -0
- package/dist/tools/web-tools.js.map +1 -0
- package/dist/utils/errors.d.ts +44 -0
- package/dist/utils/errors.d.ts.map +1 -0
- package/dist/utils/errors.js +130 -0
- package/dist/utils/errors.js.map +1 -0
- package/dist/utils/logger.d.ts +28 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +121 -0
- package/dist/utils/logger.js.map +1 -0
- package/examples/basic-usage.ts +276 -0
- package/examples/code-reviewer.skill.md +83 -0
- package/examples/creative-writer.skill.md +80 -0
- package/examples/data-analyzer.skill.md +61 -0
- package/examples/hello-world.skill.md +36 -0
- package/examples/sample-skill.md +156 -0
- package/examples/summarizer.skill.md +62 -0
- package/examples/translator.skill.md +45 -0
- package/package.json +110 -0
|
@@ -0,0 +1,802 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SkillKit Native Web Intelligence Engine
|
|
3
|
+
* =========================================
|
|
4
|
+
* Zero external API dependencies. No Tavily, no SerpAPI, no paid keys.
|
|
5
|
+
* SkillKit builds its own web intelligence from scratch.
|
|
6
|
+
*
|
|
7
|
+
* 5 capabilities (inspired by Tavily's architecture, built natively):
|
|
8
|
+
* 1. SEARCH — Multi-source web search (DuckDuckGo HTML + Google fallback + SearXNG)
|
|
9
|
+
* 2. EXTRACT — Content extraction from any URL (Readability-style + metadata)
|
|
10
|
+
* 3. CRAWL — Deep website crawling with link following and depth control
|
|
11
|
+
* 4. RESEARCH — Multi-step deep research with AI-powered synthesis
|
|
12
|
+
* 5. MAP — Site URL structure discovery (sitemap.xml + link harvesting)
|
|
13
|
+
*
|
|
14
|
+
* How it works WITHOUT paid APIs:
|
|
15
|
+
* - Search: Scrapes DuckDuckGo HTML (no API key needed), parses results
|
|
16
|
+
* - Extract: Fetches raw HTML, strips boilerplate via content-density algorithm
|
|
17
|
+
* - Crawl: BFS link-follower with domain scoping, depth limits, robots.txt respect
|
|
18
|
+
* - Research: Orchestrates search → extract → synthesize in multiple rounds
|
|
19
|
+
* - Map: Parses sitemap.xml + discovers links via crawl (URL-only mode)
|
|
20
|
+
*
|
|
21
|
+
* @module web-engine
|
|
22
|
+
*/
|
|
23
|
+
// ─── Configuration ──────────────────────────────────────────────────────────
// Identifies SkillKit to remote servers; sent as User-Agent on every request.
const USER_AGENT = 'SkillKit/1.0 (https://github.com/artefactforge/skillkit)';
// Default per-request timeout (ms) before the fetch is aborted.
const FETCH_TIMEOUT = 15000;
const MAX_CONTENT_LENGTH = 500000; // 500KB max per page
// Maximum HTTP redirects followed per request (node-fetch `follow` option).
const MAX_REDIRECTS = 5;
|
|
28
|
+
// ─── HTTP Helpers ───────────────────────────────────────────────────────────
|
|
29
|
+
/**
 * Robust fetch with timeout, redirect following, and User-Agent.
 *
 * Fix: the abort timer was previously cleared as soon as headers arrived,
 * leaving the body download (`response.text()`) unguarded — a stalled body
 * could hang forever. The timer now lives until the body is fully read, and
 * is cleared in `finally` on every path.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {{timeout?: number, maxRedirects?: number, headers?: object}} [options]
 * @returns {Promise<{status: number, html: string, finalUrl: string, headers: object}>}
 * @throws {Error} On timeout (AbortError) or any network failure; the
 *   original error is attached as `cause`.
 */
async function safeFetch(url, options = {}) {
    // node-fetch is loaded lazily so the module can be imported without it.
    const fetch = (await import('node-fetch')).default;
    const controller = new AbortController();
    const timeout = options.timeout || FETCH_TIMEOUT;
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    try {
        const response = await fetch(url, {
            headers: {
                'User-Agent': USER_AGENT,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.9,es;q=0.8,zh;q=0.7,pt;q=0.6',
                'Accept-Encoding': 'identity',
                ...(options.headers || {}),
            },
            redirect: 'follow',
            follow: options.maxRedirects || MAX_REDIRECTS,
            signal: controller.signal,
            size: MAX_CONTENT_LENGTH, // node-fetch v2: cap response body size
        });
        // Body download is still covered by the abort timer here.
        const html = await response.text();
        const respHeaders = {};
        response.headers.forEach((v, k) => { respHeaders[k] = v; });
        return {
            status: response.status,
            html,
            finalUrl: response.url || url,
            headers: respHeaders,
        };
    }
    catch (error) {
        const err = error;
        if (err.name === 'AbortError')
            throw new Error(`Fetch timed out after ${timeout}ms: ${url}`);
        // Preserve the underlying failure for debugging via the cause chain.
        throw new Error(`Fetch failed for ${url}: ${err.message}`, { cause: err });
    }
    finally {
        // Guaranteed cleanup on success, timeout, and error paths alike.
        clearTimeout(timeoutId);
    }
}
|
|
70
|
+
// ─── HTML Parsing Utilities ─────────────────────────────────────────────────
|
|
71
|
+
/**
 * Extract text content from HTML, stripping tags.
 * Lightweight — no external DOM parser dependency.
 *
 * Fix: the entity-decoding chain was broken — patterns like
 * `.replace(/&/g, '&')` and `.replace(/</g, '<')` were no-op replacements
 * (the entity names had been lost), so `&amp;`, `&lt;`, `&quot;` etc. were
 * never decoded. Restored proper entity-name patterns; `&amp;` is decoded
 * last so `&amp;lt;` is not double-decoded.
 *
 * @param {string} html - Raw HTML fragment or document.
 * @returns {string} Plain text with block boundaries preserved as newlines.
 */
function stripHtml(html) {
    return html
        // Remove script and style blocks entirely
        .replace(/<script[\s\S]*?<\/script>/gi, '')
        .replace(/<style[\s\S]*?<\/style>/gi, '')
        .replace(/<noscript[\s\S]*?<\/noscript>/gi, '')
        // Remove HTML comments
        .replace(/<!--[\s\S]*?-->/g, '')
        // Convert block elements to newlines
        .replace(/<\/(p|div|h[1-6]|li|tr|br|blockquote|article|section|header|footer|nav|aside)>/gi, '\n')
        .replace(/<(br|hr)\s*\/?>/gi, '\n')
        // Remove remaining tags
        .replace(/<[^>]+>/g, '')
        // Decode common HTML entities
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&#0?39;|&apos;/g, "'")
        .replace(/&nbsp;/g, ' ')
        .replace(/&#(\d+);/g, (_, n) => String.fromCharCode(parseInt(n, 10)))
        // &amp; last, so already-decoded text is not re-expanded
        .replace(/&amp;/g, '&')
        // Clean up whitespace
        .replace(/[ \t]+/g, ' ')
        .replace(/\n\s*\n/g, '\n\n')
        .trim();
}
|
|
101
|
+
/**
 * Extract the main content from HTML using a content-density heuristic.
 * Inspired by Mozilla's Readability algorithm but simplified.
 * Scores text blocks by density (text-to-tag ratio) and picks the richest zone.
 *
 * @param {string} html - Full HTML document.
 * @returns {string} Plain text of the densest content region.
 */
function extractMainContent(html) {
    // Strip chrome (nav/header/footer/sidebar/ads), scripts, and comments first.
    const cleaned = html
        .replace(/<(nav|header|footer|aside|menu|sidebar)[\s\S]*?<\/\1>/gi, '')
        .replace(/<div[^>]*(sidebar|menu|nav|footer|header|banner|ad|promo|cookie|popup|modal|overlay)[^>]*>[\s\S]*?<\/div>/gi, '')
        .replace(/<(script|style|noscript|iframe|svg)[\s\S]*?<\/\1>/gi, '')
        .replace(/<!--[\s\S]*?-->/g, '');
    // Semantic containers win outright when present.
    const semantic = cleaned.match(/<(article|main)[^>]*>([\s\S]*?)<\/\1>/i);
    if (semantic)
        return stripHtml(semantic[2]);
    // Otherwise score each candidate fragment: word count minus a tag penalty.
    let winner = '';
    let winnerScore = 0;
    for (const fragment of cleaned.split(/<(?:div|section|article|p)[^>]*>/i)) {
        const text = stripHtml(fragment);
        const wordCount = text.split(/\s+/).filter(w => w.length > 2).length;
        const tags = (fragment.match(/<[^>]+>/g) || []).length;
        const score = wordCount - tags * 2;
        // Require a minimum of substance (>20 words) to be a candidate.
        if (score > winnerScore && wordCount > 20) {
            winnerScore = score;
            winner = text;
        }
    }
    // Fallback: nothing convincing found — strip the whole cleaned document.
    if (!winner || winner.length < 100)
        return stripHtml(cleaned);
    return winner;
}
|
|
139
|
+
/**
 * Extract metadata from HTML <head>: the <title> text plus every <meta>
 * name/property → content pair, keyed by lowercased name.
 *
 * @param {string} html - Full HTML document.
 * @returns {Record<string, string>} Metadata key/value map.
 */
function extractMetadata(html) {
    const meta = {};
    // <title> text (tags and entities stripped).
    const title = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    if (title)
        meta.title = stripHtml(title[1]).trim();
    // <meta> tags — the alternation handles both attribute orders
    // (name/property before content, or content first).
    const metaRegex = /<meta\s+(?:[^>]*?(?:name|property)\s*=\s*["']([^"']+)["'][^>]*?content\s*=\s*["']([^"']+)["']|[^>]*?content\s*=\s*["']([^"']+)["'][^>]*?(?:name|property)\s*=\s*["']([^"']+)["'])[^>]*\/?>/gi;
    for (const m of html.matchAll(metaRegex)) {
        const key = (m[1] || m[4] || '').toLowerCase();
        const value = m[2] || m[3] || '';
        if (key && value)
            meta[key] = value;
    }
    return meta;
}
|
|
159
|
+
/**
 * Extract all href links from HTML, resolved against a base URL.
 * Skips javascript:, mailto:, tel:, fragment-containing, and unparsable hrefs.
 *
 * @param {string} html - HTML to scan.
 * @param {string} baseUrl - Base used to resolve relative hrefs.
 * @returns {string[]} Deduplicated absolute URLs, in discovery order.
 */
function extractLinks(html, baseUrl) {
    const found = new Set();
    // `#` excluded from the char class so fragment links never match.
    const hrefRegex = /href\s*=\s*["']([^"'#]+)["']/gi;
    for (const m of html.matchAll(hrefRegex)) {
        const href = m[1].trim();
        if (!href || href.startsWith('javascript:') || href.startsWith('mailto:') || href.startsWith('tel:'))
            continue;
        try {
            found.add(new URL(href, baseUrl).href);
        }
        catch {
            // Unparsable URL — ignore it.
        }
    }
    return [...found];
}
|
|
180
|
+
/**
 * Extract all <img> src URLs from HTML, resolved against a base URL.
 *
 * @param {string} html - HTML to scan.
 * @param {string} baseUrl - Base used to resolve relative srcs.
 * @returns {string[]} Deduplicated absolute image URLs, in discovery order.
 */
function extractImages(html, baseUrl) {
    const found = new Set();
    for (const m of html.matchAll(/<img[^>]+src\s*=\s*["']([^"']+)["']/gi)) {
        try {
            found.add(new URL(m[1].trim(), baseUrl).href);
        }
        catch {
            // Unparsable src — skip it.
        }
    }
    return [...found];
}
|
|
198
|
+
/**
 * Check whether two URLs share the same hostname.
 *
 * @param {string} url - Candidate URL.
 * @param {string} baseUrl - Reference URL.
 * @returns {boolean} True when hostnames match; false when either fails to parse.
 */
function isSameDomain(url, baseUrl) {
    try {
        return new URL(url).hostname === new URL(baseUrl).hostname;
    }
    catch {
        return false;
    }
}
|
|
211
|
+
// ─── 1. SEARCH ENGINE ──────────────────────────────────────────────────────
|
|
212
|
+
/**
 * Native web search using DuckDuckGo HTML scraping.
 * No API key needed. Free. Unlimited.
 *
 * Engine order: DuckDuckGo HTML → SearXNG (when `searxng_url` is configured)
 * → Google HTML. An empty result set or thrown error moves on to the next
 * engine; when all fail, an empty response with engine 'none' is returned.
 *
 * @param {string} query - Search query.
 * @param {{max_results?: number, region?: string, time_range?: string, searxng_url?: string}} [options]
 * @returns {Promise<object>} { query, results, total_results, search_engine, response_time_ms }
 */
export async function nativeSearch(query, options = {}) {
    const startTime = Date.now();
    const maxResults = options.max_results || 10;
    // Package results from whichever engine produced them.
    const respond = (results, engine) => ({
        query,
        results,
        total_results: results.length,
        search_engine: engine,
        response_time_ms: Date.now() - startTime,
    });
    // Engine 1: DuckDuckGo HTML (primary).
    try {
        const ddg = await searchDuckDuckGo(query, maxResults, options.region, options.time_range);
        if (ddg.length > 0)
            return respond(ddg, 'duckduckgo');
    }
    catch {
        // DDG failed or blocked — fall through.
    }
    // Engine 2: a self-hosted SearXNG instance, when configured.
    if (options.searxng_url) {
        try {
            const sx = await searchSearXNG(query, maxResults, options.searxng_url);
            if (sx.length > 0)
                return respond(sx, 'searxng');
        }
        catch {
            // SearXNG failed — fall through.
        }
    }
    // Engine 3: Google HTML scraping (last resort; returned even when empty).
    try {
        return respond(await searchGoogleHTML(query, maxResults, options.region), 'google');
    }
    catch {
        // Every engine failed.
    }
    return respond([], 'none');
}
|
|
277
|
+
/**
 * Query DuckDuckGo's HTML endpoint and parse organic results.
 *
 * @param {string} query - Search query.
 * @param {number} maxResults - Maximum results to return.
 * @param {string} [region] - DDG region code (`kl` parameter).
 * @param {string} [timeRange] - DDG date filter (`df` parameter).
 * @returns {Promise<Array<object>>} Results: { title, url, snippet, source, position }.
 */
async function searchDuckDuckGo(query, maxResults, region, timeRange) {
    const params = new URLSearchParams({
        q: query,
        t: 'h_', // Text-only mode
        ia: 'web',
    });
    if (region)
        params.set('kl', region);
    if (timeRange)
        params.set('df', timeRange);
    const { html } = await safeFetch(`https://html.duckduckgo.com/html/?${params.toString()}`, {
        headers: { 'Accept': 'text/html' },
    });
    // Each organic result lives in a div whose class starts with "result".
    const chunks = html.split(/class="result\s/);
    const results = [];
    for (let i = 1; i < chunks.length && results.length < maxResults; i++) {
        const chunk = chunks[i];
        const hrefMatch = chunk.match(/class="result__a"[^>]*href="([^"]+)"/);
        if (!hrefMatch)
            continue;
        let resultUrl = hrefMatch[1];
        // DDG often wraps the target in a redirect carrying `uddg=<encoded-url>`.
        if (resultUrl.includes('uddg=')) {
            const decoded = decodeURIComponent(resultUrl.split('uddg=')[1]?.split('&')[0] || '');
            if (decoded)
                resultUrl = decoded;
        }
        const titleMatch = chunk.match(/class="result__a"[^>]*>([\s\S]*?)<\/a>/);
        const snippetMatch = chunk.match(/class="result__snippet"[^>]*>([\s\S]*?)<\/(?:a|span|div)/);
        const title = titleMatch ? stripHtml(titleMatch[1]).trim() : '';
        const snippet = snippetMatch ? stripHtml(snippetMatch[1]).trim() : '';
        if (resultUrl && title) {
            results.push({
                title,
                url: resultUrl,
                snippet,
                source: 'duckduckgo',
                position: results.length + 1,
            });
        }
    }
    return results;
}
|
|
326
|
+
/**
 * Scrape Google's HTML results page (fallback engine).
 * Sends a desktop Chrome User-Agent instead of the SkillKit one.
 *
 * @param {string} query - Search query.
 * @param {number} maxResults - Maximum results to return (request capped at 20).
 * @param {string} [region] - Locale hint; its language part feeds `hl`.
 * @returns {Promise<Array<object>>} Results: { title, url, snippet, source, position }.
 */
async function searchGoogleHTML(query, maxResults, region) {
    const params = new URLSearchParams({
        q: query,
        num: String(Math.min(maxResults, 20)),
    });
    if (region)
        params.set('hl', region.split('-')[0] || 'en');
    const { html } = await safeFetch(`https://www.google.com/search?${params.toString()}`, {
        headers: {
            'Accept': 'text/html',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        },
    });
    // Organic results are wrapped in <div class="g">.
    const chunks = html.split(/<div class="g"/);
    const results = [];
    for (let i = 1; i < chunks.length && results.length < maxResults; i++) {
        const chunk = chunks[i];
        const hrefMatch = chunk.match(/<a[^>]+href="(https?:\/\/[^"]+)"/);
        if (!hrefMatch)
            continue;
        const resultUrl = hrefMatch[1];
        const titleMatch = chunk.match(/<h3[^>]*>([\s\S]*?)<\/h3>/);
        const snippetMatch = chunk.match(/data-sncf="[^"]*"[^>]*>([\s\S]*?)<\/(?:span|div)>/);
        const title = titleMatch ? stripHtml(titleMatch[1]).trim() : '';
        const snippet = snippetMatch ? stripHtml(snippetMatch[1]).trim() : '';
        // Drop self-referential google.com/search links.
        if (resultUrl && title && !resultUrl.includes('google.com/search')) {
            results.push({
                title,
                url: resultUrl,
                snippet,
                source: 'google',
                position: results.length + 1,
            });
        }
    }
    return results;
}
|
|
368
|
+
/**
 * Query a self-hosted SearXNG instance via its JSON API.
 *
 * @param {string} query - Search query.
 * @param {number} maxResults - Maximum results to return.
 * @param {string} instanceUrl - Base URL of the SearXNG instance.
 * @returns {Promise<Array<object>>} Normalized results.
 * @throws {Error} When the instance responds with a non-2xx status.
 */
async function searchSearXNG(query, maxResults, instanceUrl) {
    const params = new URLSearchParams({
        q: query,
        format: 'json',
        categories: 'general',
    });
    const endpoint = `${instanceUrl.replace(/\/$/, '')}/search?${params.toString()}`;
    const fetch = (await import('node-fetch')).default;
    const response = await fetch(endpoint, {
        headers: { 'Accept': 'application/json', 'User-Agent': USER_AGENT },
        timeout: FETCH_TIMEOUT,
    });
    if (!response.ok)
        throw new Error(`SearXNG returned ${response.status}`);
    const data = await response.json();
    // Normalize the instance's result shape into SkillKit's.
    return (data.results || []).slice(0, maxResults).map((item, index) => ({
        title: item.title || '',
        url: item.url || '',
        snippet: item.content || '',
        source: 'searxng',
        position: index + 1,
    }));
}
|
|
395
|
+
// ─── 2. CONTENT EXTRACTOR ──────────────────────────────────────────────────
|
|
396
|
+
/**
 * Extract clean, structured content from one or more URLs.
 * Native implementation — no Tavily, no Readability.js dependency.
 * Uses content-density algorithm + metadata extraction.
 *
 * @param {string|string[]} urls - One URL or a list of URLs.
 * @param {object} [options] - timeout, include_links, include_images, include_raw_html.
 * @returns {Promise<{results: object[], failed: object[], response_time_ms: number}>}
 */
export async function nativeExtract(urls, options = {}) {
    const startTime = Date.now();
    const urlArray = Array.isArray(urls) ? urls : [urls];
    const results = [];
    const failed = [];
    // Fetch in batches of 5 to bound concurrency.
    const batchSize = 5;
    for (let offset = 0; offset < urlArray.length; offset += batchSize) {
        const batch = urlArray.slice(offset, offset + batchSize);
        const settled = await Promise.allSettled(batch.map(url => extractSingleUrl(url, options)));
        settled.forEach((outcome, idx) => {
            if (outcome.status === 'fulfilled') {
                results.push(outcome.value);
            }
            else {
                failed.push({ url: batch[idx], error: outcome.reason?.message || 'Unknown error' });
            }
        });
    }
    return {
        results,
        failed,
        response_time_ms: Date.now() - startTime,
    };
}
|
|
427
|
+
/**
 * Fetch a single URL and build a structured extraction record:
 * metadata, main content, word count, and optional links/images/raw HTML.
 *
 * @param {string} url - URL to fetch.
 * @param {object} options - timeout, include_links, include_images, include_raw_html.
 * @returns {Promise<object>} Extraction record for the page.
 */
async function extractSingleUrl(url, options) {
    const { html, finalUrl } = await safeFetch(url, { timeout: options.timeout });
    const metadata = extractMetadata(html);
    const content = extractMainContent(html);
    const wordCount = content.split(/\s+/).filter(w => w.length > 0).length;
    const record = {
        url: finalUrl,
        title: metadata.title || metadata['og:title'] || '',
        description: metadata.description || metadata['og:description'] || '',
        content,
        // Prefer declared metadata; fall back to heuristic detection.
        language: metadata['og:locale'] || metadata.language || detectLanguage(content),
        published_date: metadata['article:published_time'] || metadata['date'] || undefined,
        author: metadata.author || metadata['article:author'] || undefined,
        word_count: wordCount,
        // Links are included by default; images only on request.
        links: (options.include_links !== false) ? extractLinks(html, finalUrl) : [],
        images: options.include_images ? extractImages(html, finalUrl) : [],
        metadata,
    };
    if (options.include_raw_html)
        record.raw_html = html;
    return record;
}
|
|
450
|
+
/**
 * Heuristic language detection from text content.
 * Checks Spanish stopword density, then CJK characters, then Portuguese
 * marker words; defaults to English.
 *
 * @param {string} text - Text to classify (only the first 1000 chars are used).
 * @returns {string} ISO 639-1 code: 'es', 'zh', 'pt', or 'en'.
 */
function detectLanguage(text) {
    const sample = text.slice(0, 1000).toLowerCase();
    // Spanish: require more than five stopword hits to avoid false positives.
    const spanishHits = (sample.match(/\b(el|la|los|las|que|por|para|como|pero|con|del|una)\b/g) || []).length;
    if (spanishHits > 5)
        return 'es';
    // Any CJK unified ideograph → Chinese.
    if (/[\u4e00-\u9fff]/.test(sample))
        return 'zh';
    // Portuguese marker words (accented forms distinguish from Spanish).
    if (/\b(não|são|também|está|isso|você|pelo|pela)\b/g.test(sample))
        return 'pt';
    // Default to English
    return 'en';
}
|
|
470
|
+
// ─── 3. WEB CRAWLER ────────────────────────────────────────────────────────
|
|
471
|
+
/**
 * Native BFS web crawler.
 * Follows links from a starting URL, collecting content from discovered pages.
 * Respects: domain scope, depth limits, URL patterns, page limits.
 *
 * @param {string} startUrl - Seed URL where the crawl begins.
 * @param {object} [options] - max_depth (default 2, hard cap 5),
 *   max_pages (default 10, hard cap 50), same_domain_only (default true),
 *   include_patterns / exclude_patterns (regex source strings, matched
 *   case-insensitively against candidate URLs).
 * @returns {Promise<object>} Crawl report: base_url, pages, total_pages,
 *   total_links_discovered, max_depth_reached, response_time_ms.
 */
export async function nativeCrawl(startUrl, options = {}) {
    const startTime = Date.now();
    // Hard safety caps applied regardless of caller-supplied limits.
    const maxDepth = Math.min(options.max_depth || 2, 5);
    const maxPages = Math.min(options.max_pages || 10, 50);
    const sameDomain = options.same_domain_only !== false;
    const visited = new Set();
    const pages = [];
    let totalLinksDiscovered = 0;
    let maxDepthReached = 0;
    // Compile include/exclude patterns
    const includeRegex = options.include_patterns?.map(p => new RegExp(p, 'i')) || [];
    const excludeRegex = options.exclude_patterns?.map(p => new RegExp(p, 'i')) || [];
    // BFS queue: [url, depth]
    const queue = [[startUrl, 0]];
    visited.add(normalizeUrl(startUrl));
    while (queue.length > 0 && pages.length < maxPages) {
        const [url, depth] = queue.shift();
        if (depth > maxDepth)
            continue;
        if (depth > maxDepthReached)
            maxDepthReached = depth;
        try {
            // Per-page timeout is fixed at 10s, tighter than safeFetch's default.
            const { html, finalUrl } = await safeFetch(url, { timeout: 10000 });
            const metadata = extractMetadata(html);
            const content = extractMainContent(html);
            const links = extractLinks(html, finalUrl);
            totalLinksDiscovered += links.length;
            pages.push({
                url: finalUrl,
                title: metadata.title || '',
                content,
                depth,
                links_found: links.length,
                word_count: content.split(/\s+/).filter(w => w.length > 0).length,
            });
            // Enqueue child links
            if (depth < maxDepth) {
                for (const link of links) {
                    // Dedupe on the normalized form (fragment/tracking-param free).
                    const normalized = normalizeUrl(link);
                    if (visited.has(normalized))
                        continue;
                    // Domain scoping is checked against the original seed URL.
                    if (sameDomain && !isSameDomain(link, startUrl))
                        continue;
                    // include_patterns: when present, at least one must match.
                    if (includeRegex.length > 0 && !includeRegex.some(r => r.test(link)))
                        continue;
                    // exclude_patterns: any match rejects the link.
                    if (excludeRegex.some(r => r.test(link)))
                        continue;
                    // Skip non-http(s) schemes and static-asset extensions.
                    if (!isValidCrawlUrl(link))
                        continue;
                    // Mark visited at enqueue time so each URL is queued at most once.
                    visited.add(normalized);
                    queue.push([link, depth + 1]);
                }
            }
        }
        catch {
            // Skip pages that fail to fetch
        }
    }
    return {
        base_url: startUrl,
        pages,
        total_pages: pages.length,
        total_links_discovered: totalLinksDiscovered,
        max_depth_reached: maxDepthReached,
        response_time_ms: Date.now() - startTime,
    };
}
|
|
543
|
+
/**
 * Canonicalize a URL for visited-set deduplication: drops the fragment,
 * strips common tracking parameters (utm_source, utm_medium, utm_campaign,
 * ref), and removes trailing slashes. Unparsable input is returned as-is.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} Normalized URL (or the original string when parsing fails).
 */
function normalizeUrl(url) {
    try {
        const parsed = new URL(url);
        parsed.hash = '';
        for (const param of ['utm_source', 'utm_medium', 'utm_campaign', 'ref'])
            parsed.searchParams.delete(param);
        return parsed.href.replace(/\/+$/, '');
    }
    catch {
        return url;
    }
}
|
|
558
|
+
/**
 * Decide whether a URL is worth crawling: it must be http(s) and must not
 * point at a binary/static asset (documents, images, media, archives,
 * executables, styles, scripts, fonts, favicons).
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} True when the URL should be fetched.
 */
function isValidCrawlUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return false;
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:')
        return false;
    // Skip common non-content extensions.
    const skipExts = new Set(['pdf', 'jpg', 'jpeg', 'png', 'gif', 'svg', 'mp4', 'mp3', 'zip', 'tar', 'gz', 'exe', 'dmg', 'css', 'js', 'woff', 'woff2', 'ttf', 'ico']);
    const ext = parsed.pathname.split('.').pop()?.toLowerCase() || '';
    return !skipExts.has(ext);
}
|
|
574
|
+
// ─── 4. DEEP RESEARCH ENGINE ───────────────────────────────────────────────
/**
 * Multi-step deep research engine.
 * Performs multiple rounds of: search → extract top results → identify gaps → refine search.
 *
 * If a `synthesize` callback is provided (AI model function), it generates a cohesive summary.
 * Without it, returns structured findings from multiple search rounds.
 *
 * @param {string} query - Research question / topic.
 * @param {object} [options]
 * @param {'basic'|string} [options.depth] - 'basic' forces a single round.
 * @param {number} [options.max_rounds=3] - Upper bound on search rounds (ignored when depth is 'basic').
 * @param {number} [options.max_sources=15] - Cap on sources returned; also stops further rounds once reached.
 * @param {string} [options.searxng_url] - Forwarded to nativeSearch.
 * @param {(findings: string[], query: string) => Promise<string>} [options.synthesize] - Optional AI summarizer.
 * @returns {Promise<object>} Structured result: summary, key_findings, sources, round/page counters, timing.
 */
export async function nativeResearch(query, options = {}) {
    const startTime = Date.now();
    const maxRounds = options.depth === 'basic' ? 1 : (options.max_rounds || 3);
    const maxSources = options.max_sources || 15;
    const allSources = [];
    const allFindings = [];
    const seenUrls = new Set();
    let totalPagesAnalyzed = 0;
    // Tokenize the query once — it is loop-invariant (previously recomputed for
    // every extracted page). Drop empty tokens so leading/trailing whitespace
    // cannot skew relevance scoring or produce NaN via division below.
    const queryWords = query.toLowerCase().split(/\s+/).filter(Boolean);
    // Count rounds actually executed: the loop exits early once maxSources is
    // reached, so the configured maxRounds may overstate the work done.
    let roundsRun = 0;
    // Round 1: Initial search
    let currentQuery = query;
    for (let round = 0; round < maxRounds && allSources.length < maxSources; round++) {
        roundsRun++;
        // Search
        const searchResults = await nativeSearch(currentQuery, {
            max_results: 8,
            searxng_url: options.searxng_url,
        });
        // Extract content from top results, skipping URLs seen in earlier rounds
        const urlsToExtract = searchResults.results
            .filter(r => !seenUrls.has(r.url))
            .slice(0, 5)
            .map(r => r.url);
        for (const url of urlsToExtract) {
            seenUrls.add(url);
        }
        if (urlsToExtract.length > 0) {
            const extracted = await nativeExtract(urlsToExtract, {
                include_links: false,
                include_images: false,
                timeout: 10000,
            });
            for (const page of extracted.results) {
                totalPagesAnalyzed++;
                // Score relevance by keyword overlap (guard against an all-whitespace query)
                const contentLower = page.content.toLowerCase();
                const matchCount = queryWords.filter(w => contentLower.includes(w)).length;
                const relevance = queryWords.length > 0 ? matchCount / queryWords.length : 0;
                allSources.push({
                    title: page.title,
                    url: page.url,
                    snippet: page.content.slice(0, 300),
                    relevance_score: Math.round(relevance * 100) / 100,
                });
                // Extract key sentences as findings
                const sentences = page.content
                    .split(/[.!?]+/)
                    .map(s => s.trim())
                    .filter(s => s.length > 30 && s.length < 500);
                const relevantSentences = sentences.filter(s => {
                    const sLower = s.toLowerCase();
                    return queryWords.some(w => sLower.includes(w));
                });
                allFindings.push(...relevantSentences.slice(0, 3));
            }
        }
        // Refine query for next round based on what we found
        if (round < maxRounds - 1) {
            // Extract frequent terms from findings to refine the search
            const findingsText = allFindings.join(' ').toLowerCase();
            const wordFreq = new Map();
            const stopWords = new Set(['the', 'is', 'at', 'which', 'on', 'a', 'an', 'and', 'or', 'but', 'in', 'with', 'to', 'for', 'of', 'that', 'this', 'it', 'be', 'as', 'by', 'from', 'are', 'was', 'were', 'been', 'has', 'have', 'had', 'not', 'can', 'will']);
            for (const word of findingsText.split(/\s+/)) {
                if (word.length > 4 && !stopWords.has(word)) {
                    wordFreq.set(word, (wordFreq.get(word) || 0) + 1);
                }
            }
            // Add top emerging terms to refine the query
            const topTerms = Array.from(wordFreq.entries())
                .sort((a, b) => b[1] - a[1])
                .slice(0, 3)
                .map(([word]) => word);
            currentQuery = `${query} ${topTerms.join(' ')}`;
        }
    }
    // Sort sources by relevance
    allSources.sort((a, b) => b.relevance_score - a.relevance_score);
    // Deduplicate findings
    const uniqueFindings = [...new Set(allFindings)].slice(0, 20);
    // Synthesize if AI callback provided
    let summary = '';
    if (options.synthesize && uniqueFindings.length > 0) {
        try {
            summary = await options.synthesize(uniqueFindings, query);
        }
        catch {
            // Synthesis failure is non-fatal: fall back to a mechanical summary.
            summary = `Research on "${query}" found ${allSources.length} sources with ${uniqueFindings.length} key findings across ${roundsRun} search rounds.`;
        }
    }
    else {
        summary = uniqueFindings.slice(0, 5).join('. ');
    }
    return {
        query,
        summary,
        key_findings: uniqueFindings,
        sources: allSources.slice(0, maxSources),
        // Rounds actually performed (was: the configured maximum, which was
        // wrong whenever the loop stopped early after hitting maxSources).
        search_rounds: roundsRun,
        total_pages_analyzed: totalPagesAnalyzed,
        response_time_ms: Date.now() - startTime,
    };
}
|
|
683
|
+
// ─── 5. SITE MAPPER ────────────────────────────────────────────────────────
/**
 * Map a website's complete URL structure.
 * First tries sitemap.xml, then falls back to link-discovery crawl (URL-only mode).
 * Fast — doesn't extract content, just discovers URLs.
 *
 * @param {string} url - Site root (or any page on the site).
 * @param {object} [options]
 * @param {number} [options.max_urls=100] - Cap on URLs returned (hard ceiling 500).
 * @param {number} [options.max_depth=3] - Crawl depth for the fallback discovery crawl.
 * @param {boolean} [options.include_subdomains] - When true, the fallback crawl may leave the exact domain.
 * @returns {Promise<object>} { base_url, urls, total_urls, sitemap_found, response_time_ms }
 */
export async function nativeMap(url, options = {}) {
    const startTime = Date.now();
    const maxUrls = Math.min(options.max_urls || 100, 500);
    const discoveredUrls = new Set();
    let sitemapFound = false;
    // Step 1: the sitemap is the cheapest way to enumerate a site — try it first.
    try {
        const sitemapUrls = await parseSitemap(url, maxUrls);
        if (sitemapUrls.length > 0) {
            sitemapFound = true;
            for (const entry of sitemapUrls) {
                if (discoveredUrls.size >= maxUrls) {
                    break;
                }
                discoveredUrls.add(entry);
            }
        }
    }
    catch {
        // No usable sitemap — fall through to crawling.
    }
    // Step 2: top up with a shallow discovery crawl if the sitemap fell short.
    if (discoveredUrls.size < maxUrls) {
        const remaining = maxUrls - discoveredUrls.size;
        const crawlResult = await nativeCrawl(url, {
            max_depth: options.max_depth || 3,
            max_pages: Math.min(remaining, 30), // Crawl max 30 pages for URL discovery
            same_domain_only: !options.include_subdomains,
        });
        crawlResult.pages.forEach(page => discoveredUrls.add(page.url));
    }
    const urlArray = Array.from(discoveredUrls).slice(0, maxUrls);
    return {
        base_url: url,
        urls: urlArray,
        total_urls: urlArray.length,
        sitemap_found: sitemapFound,
        response_time_ms: Date.now() - startTime,
    };
}
|
|
730
|
+
/**
 * Collect page URLs from a site's XML sitemap(s).
 *
 * Probes the conventional sitemap locations first, then falls back to
 * sitemap URLs referenced in robots.txt. Handles both regular <urlset>
 * sitemaps and <sitemapindex> documents (one level deep, first 5 children).
 *
 * Fix vs. previous version: sitemaps referenced from robots.txt are now
 * fetched and parsed directly. Previously this function recursed into
 * itself with the referenced URL, which (a) never actually fetched that
 * sitemap — it only re-probed the conventional root paths, which had
 * already failed — and (b) re-read the same robots.txt on each failure,
 * causing unbounded recursion.
 *
 * @param {string} baseUrl - Site root (or any absolute URL on the site).
 * @param {number} maxUrls - Hard cap on the number of URLs collected.
 * @returns {Promise<string[]>} Discovered page URLs (possibly empty).
 */
async function parseSitemap(baseUrl, maxUrls) {
    const urls = [];
    // Try common sitemap locations
    const sitemapPaths = ['/sitemap.xml', '/sitemap_index.xml', '/sitemap/sitemap.xml'];
    for (const path of sitemapPaths) {
        try {
            const sitemapUrl = new URL(path, baseUrl).href;
            await collectSitemapUrls(sitemapUrl, urls, maxUrls);
            if (urls.length >= maxUrls)
                return urls;
            if (urls.length > 0)
                break; // Found valid sitemap, stop trying others
        }
        catch {
            continue;
        }
    }
    // Also try robots.txt for sitemap references
    if (urls.length === 0) {
        try {
            const robotsUrl = new URL('/robots.txt', baseUrl).href;
            const { html: robotsTxt } = await safeFetch(robotsUrl, { timeout: 5000 });
            const sitemapMatches = robotsTxt.match(/Sitemap:\s*(.+)/gi) || [];
            for (const match of sitemapMatches) {
                const sitemapUrl = match.replace(/Sitemap:\s*/i, '').trim();
                try {
                    // Fetch the referenced sitemap directly — no recursion.
                    await collectSitemapUrls(sitemapUrl, urls, maxUrls);
                    if (urls.length >= maxUrls)
                        return urls;
                }
                catch {
                    continue;
                }
            }
        }
        catch {
            // No robots.txt
        }
    }
    return urls;
}
/**
 * Fetch one sitemap document and append its page URLs to `urls` (up to maxUrls).
 * Understands both <urlset> sitemaps and <sitemapindex> documents; for the
 * latter, the first 5 child sitemaps are fetched and parsed.
 * @param {string} sitemapUrl - Absolute URL of the sitemap document.
 * @param {string[]} urls - Accumulator, mutated in place.
 * @param {number} maxUrls - Stop once this many URLs have been collected.
 */
async function collectSitemapUrls(sitemapUrl, urls, maxUrls) {
    const { html: xml } = await safeFetch(sitemapUrl, { timeout: 10000 });
    if (!xml.includes('<urlset') && !xml.includes('<sitemapindex'))
        return;
    if (xml.includes('<sitemapindex')) {
        // Sitemap index: each <loc> points at a child sitemap.
        for (const childUrl of extractSitemapLocs(xml).slice(0, 5)) {
            try {
                const { html: childXml } = await safeFetch(childUrl, { timeout: 10000 });
                for (const pageUrl of extractSitemapLocs(childXml)) {
                    urls.push(pageUrl);
                    if (urls.length >= maxUrls)
                        return;
                }
            }
            catch {
                continue;
            }
        }
    }
    else {
        // Regular sitemap with <url> entries
        for (const pageUrl of extractSitemapLocs(xml)) {
            urls.push(pageUrl);
            if (urls.length >= maxUrls)
                return;
        }
    }
}
/**
 * Extract the text content of every <loc>…</loc> element in a sitemap XML string.
 * @param {string} xml - Raw sitemap XML.
 * @returns {string[]} Trimmed URL strings (empty array when none found).
 */
function extractSitemapLocs(xml) {
    const locs = xml.match(/<loc>(.*?)<\/loc>/gi) || [];
    return locs.map(loc => loc.replace(/<\/?loc>/gi, '').trim());
}
|
|
802
|
+
//# sourceMappingURL=web-engine.js.map
|