recker 1.0.97 → 1.0.98

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,7 @@ export interface SpiderOptions {
15
15
  concurrency?: number;
16
16
  timeout?: number;
17
17
  delay?: number;
18
+ allowedDomains?: string[];
18
19
  exclude?: RegExp[];
19
20
  include?: RegExp[];
20
21
  userAgent?: string;
@@ -115,23 +115,34 @@ function getRootDomain(hostname) {
115
115
  }
116
116
  return parts.slice(-2).join('.');
117
117
  }
118
- function shouldCrawl(url, baseHost, options, baseRootDomain) {
118
+ function shouldCrawl(url, baseHost, options, baseRootDomain, allowedDomains) {
119
119
  try {
120
120
  const parsed = new URL(url);
121
121
  if (!['http:', 'https:'].includes(parsed.protocol)) {
122
122
  return false;
123
123
  }
124
124
  const hostname = parsed.hostname.replace(/^www\./, '');
125
- const sameDomain = options.sameDomain;
126
- if (sameDomain === 'subdomain') {
127
- const pageRoot = getRootDomain(hostname);
128
- const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
129
- if (pageRoot !== rootDomain)
125
+ if (allowedDomains) {
126
+ if (allowedDomains.exact.has(hostname)) {
127
+ }
128
+ else if (allowedDomains.wildcards.some(w => hostname === w || hostname.endsWith('.' + w))) {
129
+ }
130
+ else {
130
131
  return false;
132
+ }
131
133
  }
132
- else if (sameDomain !== false) {
133
- if (hostname !== baseHost)
134
- return false;
134
+ else {
135
+ const sameDomain = options.sameDomain;
136
+ if (sameDomain === 'subdomain') {
137
+ const pageRoot = getRootDomain(hostname);
138
+ const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
139
+ if (pageRoot !== rootDomain)
140
+ return false;
141
+ }
142
+ else if (sameDomain !== false) {
143
+ if (hostname !== baseHost)
144
+ return false;
145
+ }
135
146
  }
136
147
  const skipExtensions = [
137
148
  '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
@@ -290,6 +301,20 @@ export class Spider {
290
301
  useSitemap: options.useSitemap ?? false,
291
302
  transport: options.transport ?? 'auto',
292
303
  sitemapUrl: options.sitemapUrl,
304
+ allowedDomains: options.allowedDomains ? (() => {
305
+ const exact = new Set();
306
+ const wildcards = [];
307
+ for (const d of options.allowedDomains) {
308
+ const lower = d.toLowerCase().replace(/^www\./, '');
309
+ if (lower.startsWith('*.')) {
310
+ wildcards.push(lower.slice(2));
311
+ }
312
+ else {
313
+ exact.add(lower);
314
+ }
315
+ }
316
+ return { exact, wildcards };
317
+ })() : undefined,
293
318
  exclude: options.exclude,
294
319
  include: options.include,
295
320
  onPage: options.onPage,
@@ -1113,7 +1138,7 @@ export class Spider {
1113
1138
  if (!link.href)
1114
1139
  continue;
1115
1140
  const normalized = normalizeUrl(link.href);
1116
- if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain))
1141
+ if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain, this.options.allowedDomains))
1117
1142
  continue;
1118
1143
  candidateUrls.push(normalized);
1119
1144
  candidates.push({ url: normalized, depth: item.depth + 1 });
@@ -15,6 +15,7 @@ export interface SpiderOptions {
15
15
  concurrency?: number;
16
16
  timeout?: number;
17
17
  delay?: number;
18
+ allowedDomains?: string[];
18
19
  exclude?: RegExp[];
19
20
  include?: RegExp[];
20
21
  userAgent?: string;
@@ -115,23 +115,34 @@ function getRootDomain(hostname) {
115
115
  }
116
116
  return parts.slice(-2).join('.');
117
117
  }
118
- function shouldCrawl(url, baseHost, options, baseRootDomain) {
118
+ function shouldCrawl(url, baseHost, options, baseRootDomain, allowedDomains) {
119
119
  try {
120
120
  const parsed = new URL(url);
121
121
  if (!['http:', 'https:'].includes(parsed.protocol)) {
122
122
  return false;
123
123
  }
124
124
  const hostname = parsed.hostname.replace(/^www\./, '');
125
- const sameDomain = options.sameDomain;
126
- if (sameDomain === 'subdomain') {
127
- const pageRoot = getRootDomain(hostname);
128
- const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
129
- if (pageRoot !== rootDomain)
125
+ if (allowedDomains) {
126
+ if (allowedDomains.exact.has(hostname)) {
127
+ }
128
+ else if (allowedDomains.wildcards.some(w => hostname === w || hostname.endsWith('.' + w))) {
129
+ }
130
+ else {
130
131
  return false;
132
+ }
131
133
  }
132
- else if (sameDomain !== false) {
133
- if (hostname !== baseHost)
134
- return false;
134
+ else {
135
+ const sameDomain = options.sameDomain;
136
+ if (sameDomain === 'subdomain') {
137
+ const pageRoot = getRootDomain(hostname);
138
+ const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
139
+ if (pageRoot !== rootDomain)
140
+ return false;
141
+ }
142
+ else if (sameDomain !== false) {
143
+ if (hostname !== baseHost)
144
+ return false;
145
+ }
135
146
  }
136
147
  const skipExtensions = [
137
148
  '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
@@ -290,6 +301,20 @@ export class Spider {
290
301
  useSitemap: options.useSitemap ?? false,
291
302
  transport: options.transport ?? 'auto',
292
303
  sitemapUrl: options.sitemapUrl,
304
+ allowedDomains: options.allowedDomains ? (() => {
305
+ const exact = new Set();
306
+ const wildcards = [];
307
+ for (const d of options.allowedDomains) {
308
+ const lower = d.toLowerCase().replace(/^www\./, '');
309
+ if (lower.startsWith('*.')) {
310
+ wildcards.push(lower.slice(2));
311
+ }
312
+ else {
313
+ exact.add(lower);
314
+ }
315
+ }
316
+ return { exact, wildcards };
317
+ })() : undefined,
293
318
  exclude: options.exclude,
294
319
  include: options.include,
295
320
  onPage: options.onPage,
@@ -1113,7 +1138,7 @@ export class Spider {
1113
1138
  if (!link.href)
1114
1139
  continue;
1115
1140
  const normalized = normalizeUrl(link.href);
1116
- if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain))
1141
+ if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain, this.options.allowedDomains))
1117
1142
  continue;
1118
1143
  candidateUrls.push(normalized);
1119
1144
  candidates.push({ url: normalized, depth: item.depth + 1 });
package/dist/version.js CHANGED
@@ -1,4 +1,4 @@
1
- const VERSION = '1.0.97';
1
+ const VERSION = '1.0.98';
2
2
  let _version = null;
3
3
  export async function getVersion() {
4
4
  if (_version)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "recker",
3
- "version": "1.0.97",
3
+ "version": "1.0.98",
4
4
  "description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",