recker 1.0.97 → 1.0.98
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/spider.d.ts +1 -0
- package/dist/browser/scrape/spider.js +35 -10
- package/dist/scrape/spider.d.ts +1 -0
- package/dist/scrape/spider.js +35 -10
- package/dist/version.js +1 -1
- package/package.json +1 -1
|
@@ -115,23 +115,34 @@ function getRootDomain(hostname) {
|
|
|
115
115
|
}
|
|
116
116
|
return parts.slice(-2).join('.');
|
|
117
117
|
}
|
|
118
|
-
function shouldCrawl(url, baseHost, options, baseRootDomain) {
|
|
118
|
+
function shouldCrawl(url, baseHost, options, baseRootDomain, allowedDomains) {
|
|
119
119
|
try {
|
|
120
120
|
const parsed = new URL(url);
|
|
121
121
|
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
122
122
|
return false;
|
|
123
123
|
}
|
|
124
124
|
const hostname = parsed.hostname.replace(/^www\./, '');
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
125
|
+
if (allowedDomains) {
|
|
126
|
+
if (allowedDomains.exact.has(hostname)) {
|
|
127
|
+
}
|
|
128
|
+
else if (allowedDomains.wildcards.some(w => hostname === w || hostname.endsWith('.' + w))) {
|
|
129
|
+
}
|
|
130
|
+
else {
|
|
130
131
|
return false;
|
|
132
|
+
}
|
|
131
133
|
}
|
|
132
|
-
else
|
|
133
|
-
|
|
134
|
-
|
|
134
|
+
else {
|
|
135
|
+
const sameDomain = options.sameDomain;
|
|
136
|
+
if (sameDomain === 'subdomain') {
|
|
137
|
+
const pageRoot = getRootDomain(hostname);
|
|
138
|
+
const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
|
|
139
|
+
if (pageRoot !== rootDomain)
|
|
140
|
+
return false;
|
|
141
|
+
}
|
|
142
|
+
else if (sameDomain !== false) {
|
|
143
|
+
if (hostname !== baseHost)
|
|
144
|
+
return false;
|
|
145
|
+
}
|
|
135
146
|
}
|
|
136
147
|
const skipExtensions = [
|
|
137
148
|
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
|
|
@@ -290,6 +301,20 @@ export class Spider {
|
|
|
290
301
|
useSitemap: options.useSitemap ?? false,
|
|
291
302
|
transport: options.transport ?? 'auto',
|
|
292
303
|
sitemapUrl: options.sitemapUrl,
|
|
304
|
+
allowedDomains: options.allowedDomains ? (() => {
|
|
305
|
+
const exact = new Set();
|
|
306
|
+
const wildcards = [];
|
|
307
|
+
for (const d of options.allowedDomains) {
|
|
308
|
+
const lower = d.toLowerCase().replace(/^www\./, '');
|
|
309
|
+
if (lower.startsWith('*.')) {
|
|
310
|
+
wildcards.push(lower.slice(2));
|
|
311
|
+
}
|
|
312
|
+
else {
|
|
313
|
+
exact.add(lower);
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
return { exact, wildcards };
|
|
317
|
+
})() : undefined,
|
|
293
318
|
exclude: options.exclude,
|
|
294
319
|
include: options.include,
|
|
295
320
|
onPage: options.onPage,
|
|
@@ -1113,7 +1138,7 @@ export class Spider {
|
|
|
1113
1138
|
if (!link.href)
|
|
1114
1139
|
continue;
|
|
1115
1140
|
const normalized = normalizeUrl(link.href);
|
|
1116
|
-
if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain))
|
|
1141
|
+
if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain, this.options.allowedDomains))
|
|
1117
1142
|
continue;
|
|
1118
1143
|
candidateUrls.push(normalized);
|
|
1119
1144
|
candidates.push({ url: normalized, depth: item.depth + 1 });
|
package/dist/scrape/spider.d.ts
CHANGED
package/dist/scrape/spider.js
CHANGED
|
@@ -115,23 +115,34 @@ function getRootDomain(hostname) {
|
|
|
115
115
|
}
|
|
116
116
|
return parts.slice(-2).join('.');
|
|
117
117
|
}
|
|
118
|
-
function shouldCrawl(url, baseHost, options, baseRootDomain) {
|
|
118
|
+
function shouldCrawl(url, baseHost, options, baseRootDomain, allowedDomains) {
|
|
119
119
|
try {
|
|
120
120
|
const parsed = new URL(url);
|
|
121
121
|
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
122
122
|
return false;
|
|
123
123
|
}
|
|
124
124
|
const hostname = parsed.hostname.replace(/^www\./, '');
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
125
|
+
if (allowedDomains) {
|
|
126
|
+
if (allowedDomains.exact.has(hostname)) {
|
|
127
|
+
}
|
|
128
|
+
else if (allowedDomains.wildcards.some(w => hostname === w || hostname.endsWith('.' + w))) {
|
|
129
|
+
}
|
|
130
|
+
else {
|
|
130
131
|
return false;
|
|
132
|
+
}
|
|
131
133
|
}
|
|
132
|
-
else
|
|
133
|
-
|
|
134
|
-
|
|
134
|
+
else {
|
|
135
|
+
const sameDomain = options.sameDomain;
|
|
136
|
+
if (sameDomain === 'subdomain') {
|
|
137
|
+
const pageRoot = getRootDomain(hostname);
|
|
138
|
+
const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
|
|
139
|
+
if (pageRoot !== rootDomain)
|
|
140
|
+
return false;
|
|
141
|
+
}
|
|
142
|
+
else if (sameDomain !== false) {
|
|
143
|
+
if (hostname !== baseHost)
|
|
144
|
+
return false;
|
|
145
|
+
}
|
|
135
146
|
}
|
|
136
147
|
const skipExtensions = [
|
|
137
148
|
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
|
|
@@ -290,6 +301,20 @@ export class Spider {
|
|
|
290
301
|
useSitemap: options.useSitemap ?? false,
|
|
291
302
|
transport: options.transport ?? 'auto',
|
|
292
303
|
sitemapUrl: options.sitemapUrl,
|
|
304
|
+
allowedDomains: options.allowedDomains ? (() => {
|
|
305
|
+
const exact = new Set();
|
|
306
|
+
const wildcards = [];
|
|
307
|
+
for (const d of options.allowedDomains) {
|
|
308
|
+
const lower = d.toLowerCase().replace(/^www\./, '');
|
|
309
|
+
if (lower.startsWith('*.')) {
|
|
310
|
+
wildcards.push(lower.slice(2));
|
|
311
|
+
}
|
|
312
|
+
else {
|
|
313
|
+
exact.add(lower);
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
return { exact, wildcards };
|
|
317
|
+
})() : undefined,
|
|
293
318
|
exclude: options.exclude,
|
|
294
319
|
include: options.include,
|
|
295
320
|
onPage: options.onPage,
|
|
@@ -1113,7 +1138,7 @@ export class Spider {
|
|
|
1113
1138
|
if (!link.href)
|
|
1114
1139
|
continue;
|
|
1115
1140
|
const normalized = normalizeUrl(link.href);
|
|
1116
|
-
if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain))
|
|
1141
|
+
if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain, this.options.allowedDomains))
|
|
1117
1142
|
continue;
|
|
1118
1143
|
candidateUrls.push(normalized);
|
|
1119
1144
|
candidates.push({ url: normalized, depth: item.depth + 1 });
|
package/dist/version.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recker",
|
|
3
|
-
"version": "1.0.97",
|
|
3
|
+
"version": "1.0.98",
|
|
4
4
|
"description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"types": "./dist/index.d.ts",
|