recker 1.0.73 → 1.0.75-next.2e5a94f
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -18
- package/dist/browser/core/client.d.ts +14 -8
- package/dist/browser/core/client.js +199 -17
- package/dist/browser/core/errors.d.ts +15 -1
- package/dist/browser/core/errors.js +140 -9
- package/dist/browser/core/request.d.ts +5 -0
- package/dist/browser/core/request.js +33 -2
- package/dist/browser/core-runtime/plugin-manifest.d.ts +24 -0
- package/dist/browser/core-runtime/plugin-manifest.js +159 -0
- package/dist/browser/core-runtime/request-context.d.ts +13 -0
- package/dist/browser/core-runtime/request-context.js +24 -0
- package/dist/browser/core-runtime/typed-events.d.ts +89 -0
- package/dist/browser/core-runtime/typed-events.js +34 -0
- package/dist/browser/index.iife.min.js +79 -79
- package/dist/browser/index.min.js +79 -79
- package/dist/browser/index.mini.iife.js +913 -97
- package/dist/browser/index.mini.iife.min.js +46 -46
- package/dist/browser/index.mini.min.js +46 -46
- package/dist/browser/index.mini.umd.js +913 -97
- package/dist/browser/index.mini.umd.min.js +46 -46
- package/dist/browser/index.umd.min.js +79 -79
- package/dist/browser/plugins/auth/aws-sigv4.d.ts +1 -0
- package/dist/browser/plugins/auth/aws-sigv4.js +19 -2
- package/dist/browser/plugins/retry.js +29 -1
- package/dist/browser/presets/aws.d.ts +1 -0
- package/dist/browser/presets/aws.js +62 -1
- package/dist/browser/runner/request-runner.d.ts +15 -5
- package/dist/browser/runner/request-runner.js +164 -30
- package/dist/browser/scrape/parser/nodes/html.d.ts +6 -0
- package/dist/browser/scrape/parser/nodes/html.js +70 -18
- package/dist/browser/scrape/parser/nodes/node.d.ts +1 -0
- package/dist/browser/scrape/parser/nodes/node.js +5 -0
- package/dist/browser/scrape/spider.d.ts +1 -0
- package/dist/browser/scrape/spider.js +39 -26
- package/dist/browser/seo/analyzer.d.ts +1 -1
- package/dist/browser/seo/analyzer.js +73 -42
- package/dist/browser/seo/index.d.ts +1 -1
- package/dist/browser/seo/rules/types.d.ts +2 -0
- package/dist/browser/seo/seo-spider.d.ts +2 -3
- package/dist/browser/seo/seo-spider.js +26 -202
- package/dist/browser/seo/types.d.ts +4 -0
- package/dist/browser/seo/validators/sitemap.js +9 -2
- package/dist/browser/transport/fetch.js +38 -5
- package/dist/browser/transport/undici.js +73 -11
- package/dist/browser/transport/worker.d.ts +0 -1
- package/dist/browser/transport/worker.js +1 -3
- package/dist/browser/types/index.d.ts +24 -0
- package/dist/cli/commands/mcp.js +5 -3
- package/dist/core/client.d.ts +14 -8
- package/dist/core/client.js +199 -17
- package/dist/core/errors.d.ts +15 -1
- package/dist/core/errors.js +140 -9
- package/dist/core/request.d.ts +5 -0
- package/dist/core/request.js +33 -2
- package/dist/core-runtime/plugin-manifest.d.ts +24 -0
- package/dist/core-runtime/plugin-manifest.js +159 -0
- package/dist/core-runtime/request-context.d.ts +13 -0
- package/dist/core-runtime/request-context.js +24 -0
- package/dist/core-runtime/typed-events.d.ts +89 -0
- package/dist/core-runtime/typed-events.js +34 -0
- package/dist/index.d.ts +2 -1
- package/dist/index.js +2 -1
- package/dist/mcp/cli.js +10 -8
- package/dist/mcp/profiles.d.ts +1 -1
- package/dist/mcp/profiles.js +31 -6
- package/dist/mcp/tools/categories.js +0 -1
- package/dist/plugins/auth/aws-sigv4.d.ts +1 -0
- package/dist/plugins/auth/aws-sigv4.js +19 -2
- package/dist/plugins/retry.js +29 -1
- package/dist/presets/aws.d.ts +1 -0
- package/dist/presets/aws.js +62 -1
- package/dist/recker.d.ts +3 -0
- package/dist/recker.js +5 -0
- package/dist/runner/request-runner.d.ts +15 -5
- package/dist/runner/request-runner.js +164 -30
- package/dist/scrape/parser/nodes/html.d.ts +6 -0
- package/dist/scrape/parser/nodes/html.js +70 -18
- package/dist/scrape/parser/nodes/node.d.ts +1 -0
- package/dist/scrape/parser/nodes/node.js +5 -0
- package/dist/scrape/spider.d.ts +1 -0
- package/dist/scrape/spider.js +39 -26
- package/dist/search/google.d.ts +67 -0
- package/dist/search/google.js +480 -0
- package/dist/search/index.d.ts +3 -0
- package/dist/search/index.js +1 -0
- package/dist/seo/analyzer.d.ts +1 -1
- package/dist/seo/analyzer.js +73 -42
- package/dist/seo/index.d.ts +1 -1
- package/dist/seo/rules/types.d.ts +2 -0
- package/dist/seo/seo-spider.d.ts +2 -3
- package/dist/seo/seo-spider.js +26 -202
- package/dist/seo/types.d.ts +4 -0
- package/dist/seo/validators/sitemap.js +9 -2
- package/dist/transport/fetch.js +38 -5
- package/dist/transport/undici.js +73 -11
- package/dist/transport/worker.d.ts +0 -1
- package/dist/transport/worker.js +1 -3
- package/dist/types/index.d.ts +24 -0
- package/dist/version.js +1 -1
- package/package.json +9 -1
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { QueueCancelledError } from '../core/errors.js';
|
|
1
2
|
class SimpleEmitter {
|
|
2
3
|
listeners = new Map();
|
|
3
4
|
on(event, listener) {
|
|
@@ -43,10 +44,14 @@ export class RequestRunner extends SimpleEmitter {
|
|
|
43
44
|
queue = [];
|
|
44
45
|
activeCount = 0;
|
|
45
46
|
paused = false;
|
|
46
|
-
results = new Map();
|
|
47
47
|
stats = { total: 0, successful: 0, failed: 0 };
|
|
48
48
|
startTime = 0;
|
|
49
49
|
pendingRetries = 0;
|
|
50
|
+
isCancelled = false;
|
|
51
|
+
cancelReason = new QueueCancelledError('Request runner cancelled');
|
|
52
|
+
retryTimers = new Map();
|
|
53
|
+
timeoutId;
|
|
54
|
+
abortUnsubscribe;
|
|
50
55
|
constructor(options = {}) {
|
|
51
56
|
super();
|
|
52
57
|
this.concurrency = options.concurrency || 5;
|
|
@@ -67,48 +72,146 @@ export class RequestRunner extends SimpleEmitter {
|
|
|
67
72
|
this.processNext();
|
|
68
73
|
}
|
|
69
74
|
async run(items, processor, options = {}) {
|
|
75
|
+
this.queue = [];
|
|
76
|
+
this.activeCount = 0;
|
|
77
|
+
this.pendingRetries = 0;
|
|
78
|
+
this.stats = { total: 0, successful: 0, failed: 0 };
|
|
79
|
+
this.isCancelled = false;
|
|
80
|
+
if (this.retryTimers.size > 0) {
|
|
81
|
+
for (const [, timer] of this.retryTimers) {
|
|
82
|
+
clearTimeout(timer);
|
|
83
|
+
}
|
|
84
|
+
this.retryTimers.clear();
|
|
85
|
+
}
|
|
86
|
+
if (this.timeoutId) {
|
|
87
|
+
clearTimeout(this.timeoutId);
|
|
88
|
+
this.timeoutId = undefined;
|
|
89
|
+
}
|
|
90
|
+
if (this.abortUnsubscribe) {
|
|
91
|
+
this.abortUnsubscribe();
|
|
92
|
+
this.abortUnsubscribe = undefined;
|
|
93
|
+
}
|
|
70
94
|
this.startTime = Date.now();
|
|
71
95
|
this.stats = { total: items.length, successful: 0, failed: 0 };
|
|
72
|
-
this.
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
96
|
+
this.isCancelled = false;
|
|
97
|
+
this.cancelReason = new QueueCancelledError('Request runner cancelled', {
|
|
98
|
+
queueName: 'request-runner',
|
|
99
|
+
request: undefined
|
|
100
|
+
});
|
|
101
|
+
if (options.signal) {
|
|
102
|
+
const signal = options.signal;
|
|
103
|
+
if (signal.aborted) {
|
|
104
|
+
const reason = signal.reason instanceof Error
|
|
105
|
+
? signal.reason
|
|
106
|
+
: new QueueCancelledError('Request runner signal was aborted', {
|
|
107
|
+
queueName: 'request-runner',
|
|
108
|
+
request: undefined
|
|
109
|
+
});
|
|
110
|
+
this.cancelAll(reason);
|
|
111
|
+
}
|
|
112
|
+
else {
|
|
113
|
+
const handleAbort = () => {
|
|
114
|
+
const reason = signal.reason instanceof Error
|
|
115
|
+
? signal.reason
|
|
116
|
+
: new QueueCancelledError('Request runner signal was aborted', {
|
|
117
|
+
queueName: 'request-runner',
|
|
118
|
+
request: undefined
|
|
119
|
+
});
|
|
120
|
+
this.cancelAll(reason);
|
|
121
|
+
};
|
|
122
|
+
signal.addEventListener('abort', handleAbort, { once: true });
|
|
123
|
+
this.abortUnsubscribe = () => signal.removeEventListener('abort', handleAbort);
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
if (options.deadlineMs !== undefined) {
|
|
127
|
+
const deadline = options.deadlineMs;
|
|
128
|
+
if (deadline <= 0) {
|
|
129
|
+
this.cancelAll(new QueueCancelledError('Request runner deadline elapsed', {
|
|
130
|
+
queueName: 'request-runner',
|
|
131
|
+
request: undefined
|
|
132
|
+
}));
|
|
133
|
+
}
|
|
134
|
+
else {
|
|
135
|
+
this.timeoutId = setTimeout(() => {
|
|
136
|
+
this.cancelAll(new QueueCancelledError('Request runner deadline exceeded', {
|
|
137
|
+
queueName: 'request-runner',
|
|
138
|
+
request: undefined
|
|
139
|
+
}));
|
|
140
|
+
}, deadline);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
try {
|
|
144
|
+
const promises = items.map((item, index) => {
|
|
145
|
+
return new Promise((resolve) => {
|
|
146
|
+
this.add(() => processor(item, index), {
|
|
147
|
+
priority: options.priority,
|
|
148
|
+
id: String(index),
|
|
149
|
+
retries: options.retries,
|
|
150
|
+
resolve,
|
|
151
|
+
trackTotal: false
|
|
152
|
+
});
|
|
81
153
|
});
|
|
82
154
|
});
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
155
|
+
const results = await Promise.all(promises);
|
|
156
|
+
return {
|
|
157
|
+
results,
|
|
158
|
+
stats: {
|
|
159
|
+
...this.stats,
|
|
160
|
+
duration: Date.now() - this.startTime
|
|
161
|
+
}
|
|
162
|
+
};
|
|
163
|
+
}
|
|
164
|
+
finally {
|
|
165
|
+
if (this.timeoutId) {
|
|
166
|
+
clearTimeout(this.timeoutId);
|
|
167
|
+
this.timeoutId = undefined;
|
|
90
168
|
}
|
|
91
|
-
|
|
169
|
+
if (this.abortUnsubscribe) {
|
|
170
|
+
this.abortUnsubscribe();
|
|
171
|
+
this.abortUnsubscribe = undefined;
|
|
172
|
+
}
|
|
173
|
+
}
|
|
92
174
|
}
|
|
93
175
|
queueTask(task) {
|
|
94
176
|
this.queue.push(task);
|
|
95
177
|
this.queue.sort((a, b) => b.priority - a.priority);
|
|
96
178
|
}
|
|
97
179
|
scheduleRetry(task, delay) {
|
|
98
|
-
this.
|
|
99
|
-
|
|
100
|
-
setTimeout(() => {
|
|
101
|
-
this.pendingRetries--;
|
|
102
|
-
this.queueTask(task);
|
|
103
|
-
this.processNext();
|
|
104
|
-
}, delay);
|
|
180
|
+
if (this.isCancelled) {
|
|
181
|
+
this.resolveTask(task, this.cancelReason);
|
|
105
182
|
return;
|
|
106
183
|
}
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
184
|
+
if (delay <= 0) {
|
|
185
|
+
this.queueTask(task);
|
|
186
|
+
this.processNext();
|
|
187
|
+
return;
|
|
188
|
+
}
|
|
189
|
+
const timerKey = `${task.id}-${Date.now()}-${Math.random().toString(16).slice(2, 8)}`;
|
|
190
|
+
const enqueueTask = () => {
|
|
191
|
+
this.retryTimers.delete(timerKey);
|
|
192
|
+
this.pendingRetries--;
|
|
193
|
+
if (this.isCancelled) {
|
|
194
|
+
return;
|
|
195
|
+
}
|
|
196
|
+
this.queueTask(task);
|
|
197
|
+
this.processNext();
|
|
198
|
+
};
|
|
199
|
+
this.retryTimers.set(timerKey, setTimeout(() => {
|
|
200
|
+
if (this.isCancelled) {
|
|
201
|
+
this.pendingRetries--;
|
|
202
|
+
this.retryTimers.delete(timerKey);
|
|
203
|
+
this.resolveTask(task, this.cancelReason);
|
|
204
|
+
return;
|
|
205
|
+
}
|
|
206
|
+
enqueueTask();
|
|
207
|
+
}, delay));
|
|
208
|
+
this.pendingRetries++;
|
|
110
209
|
}
|
|
111
210
|
async processNext() {
|
|
211
|
+
if (this.isCancelled) {
|
|
212
|
+
this.resolveQueue();
|
|
213
|
+
return;
|
|
214
|
+
}
|
|
112
215
|
if (this.paused || this.activeCount >= this.concurrency || this.queue.length === 0) {
|
|
113
216
|
return;
|
|
114
217
|
}
|
|
@@ -124,6 +227,10 @@ export class RequestRunner extends SimpleEmitter {
|
|
|
124
227
|
this.emit('taskComplete', { task, result });
|
|
125
228
|
}
|
|
126
229
|
catch (error) {
|
|
230
|
+
if (this.isCancelled) {
|
|
231
|
+
this.resolveTask(task, this.cancelReason);
|
|
232
|
+
return;
|
|
233
|
+
}
|
|
127
234
|
const remaining = task.retries ?? 0;
|
|
128
235
|
if (remaining > 0) {
|
|
129
236
|
task.retries = remaining - 1;
|
|
@@ -131,8 +238,7 @@ export class RequestRunner extends SimpleEmitter {
|
|
|
131
238
|
this.scheduleRetry(task, this.retryDelay);
|
|
132
239
|
}
|
|
133
240
|
else {
|
|
134
|
-
this.
|
|
135
|
-
task.resolve?.(error);
|
|
241
|
+
this.resolveTask(task, error);
|
|
136
242
|
this.emit('taskError', { task, error });
|
|
137
243
|
}
|
|
138
244
|
}
|
|
@@ -145,6 +251,34 @@ export class RequestRunner extends SimpleEmitter {
|
|
|
145
251
|
this.processNext();
|
|
146
252
|
}
|
|
147
253
|
}
|
|
254
|
+
cancelAll(reason) {
|
|
255
|
+
if (this.isCancelled) {
|
|
256
|
+
return;
|
|
257
|
+
}
|
|
258
|
+
this.isCancelled = true;
|
|
259
|
+
this.cancelReason = reason;
|
|
260
|
+
for (const [, timer] of this.retryTimers) {
|
|
261
|
+
clearTimeout(timer);
|
|
262
|
+
}
|
|
263
|
+
this.retryTimers.clear();
|
|
264
|
+
this.pendingRetries = 0;
|
|
265
|
+
this.resolveQueue();
|
|
266
|
+
}
|
|
267
|
+
resolveQueue() {
|
|
268
|
+
while (this.queue.length > 0) {
|
|
269
|
+
const task = this.queue.shift();
|
|
270
|
+
if (task) {
|
|
271
|
+
this.resolveTask(task, this.cancelReason);
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
resolveTask(task, error) {
|
|
276
|
+
if (!task.resolve) {
|
|
277
|
+
return;
|
|
278
|
+
}
|
|
279
|
+
this.stats.failed++;
|
|
280
|
+
task.resolve?.(error);
|
|
281
|
+
}
|
|
148
282
|
getProgress() {
|
|
149
283
|
const completed = this.stats.successful + this.stats.failed;
|
|
150
284
|
return {
|
|
@@ -33,11 +33,16 @@ export default class HTMLElement extends Node {
|
|
|
33
33
|
private voidTag;
|
|
34
34
|
private _attrs?;
|
|
35
35
|
private _rawAttrs?;
|
|
36
|
+
private _queryCache?;
|
|
36
37
|
private _parseOptions;
|
|
37
38
|
rawTagName: string;
|
|
38
39
|
id: string;
|
|
39
40
|
classList: DOMTokenList;
|
|
40
41
|
nodeType: NodeType;
|
|
42
|
+
private get isSelectorCacheEnabled();
|
|
43
|
+
private getQueryCache;
|
|
44
|
+
private clearQueryCache;
|
|
45
|
+
invalidateSelectorCacheRecursively(): void;
|
|
41
46
|
private quoteAttribute;
|
|
42
47
|
constructor(tagName: string, keyAttrs: KeyAttributes, rawAttrs?: string, parentNode?: HTMLElement | null, range?: [number, number], voidTag?: VoidTag, _parseOptions?: Partial<Options>);
|
|
43
48
|
removeChild(node: Node): this;
|
|
@@ -104,6 +109,7 @@ export interface Options {
|
|
|
104
109
|
tags?: string[];
|
|
105
110
|
closingSlash?: boolean;
|
|
106
111
|
};
|
|
112
|
+
selectorCache?: boolean;
|
|
107
113
|
}
|
|
108
114
|
export declare function base_parse(data: string, options?: Partial<Options>): HTMLElement[];
|
|
109
115
|
export declare function parse(data: string, options?: Partial<Options>): HTMLElement;
|
|
@@ -98,11 +98,31 @@ export default class HTMLElement extends Node {
|
|
|
98
98
|
voidTag;
|
|
99
99
|
_attrs;
|
|
100
100
|
_rawAttrs;
|
|
101
|
+
_queryCache;
|
|
101
102
|
_parseOptions;
|
|
102
103
|
rawTagName;
|
|
103
104
|
id;
|
|
104
105
|
classList;
|
|
105
106
|
nodeType = NodeType.ELEMENT_NODE;
|
|
107
|
+
get isSelectorCacheEnabled() {
|
|
108
|
+
return this._parseOptions?.selectorCache !== false;
|
|
109
|
+
}
|
|
110
|
+
getQueryCache() {
|
|
111
|
+
if (!this._queryCache) {
|
|
112
|
+
this._queryCache = new Map();
|
|
113
|
+
}
|
|
114
|
+
return this._queryCache;
|
|
115
|
+
}
|
|
116
|
+
clearQueryCache() {
|
|
117
|
+
this._queryCache = undefined;
|
|
118
|
+
}
|
|
119
|
+
invalidateSelectorCacheRecursively() {
|
|
120
|
+
let current = this;
|
|
121
|
+
while (current) {
|
|
122
|
+
current.clearQueryCache();
|
|
123
|
+
current = current.parentNode;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
106
126
|
quoteAttribute(attr) {
|
|
107
127
|
if (attr == null) {
|
|
108
128
|
return 'null';
|
|
@@ -144,6 +164,7 @@ export default class HTMLElement extends Node {
|
|
|
144
164
|
this.childNodes = this.childNodes.filter((child) => {
|
|
145
165
|
return child !== node;
|
|
146
166
|
});
|
|
167
|
+
this.invalidateSelectorCacheRecursively();
|
|
147
168
|
return this;
|
|
148
169
|
}
|
|
149
170
|
exchangeChild(oldNode, newNode) {
|
|
@@ -154,6 +175,7 @@ export default class HTMLElement extends Node {
|
|
|
154
175
|
}
|
|
155
176
|
return child;
|
|
156
177
|
});
|
|
178
|
+
this.invalidateSelectorCacheRecursively();
|
|
157
179
|
return this;
|
|
158
180
|
}
|
|
159
181
|
get tagName() {
|
|
@@ -182,6 +204,7 @@ export default class HTMLElement extends Node {
|
|
|
182
204
|
set textContent(val) {
|
|
183
205
|
const content = [new TextNode(val, this)];
|
|
184
206
|
this.childNodes = content;
|
|
207
|
+
this.invalidateSelectorCacheRecursively();
|
|
185
208
|
}
|
|
186
209
|
get text() {
|
|
187
210
|
return decode(this.rawText);
|
|
@@ -249,6 +272,7 @@ export default class HTMLElement extends Node {
|
|
|
249
272
|
resetParent(nodes, this);
|
|
250
273
|
resetParent(this.childNodes, null);
|
|
251
274
|
this.childNodes = nodes;
|
|
275
|
+
this.invalidateSelectorCacheRecursively();
|
|
252
276
|
}
|
|
253
277
|
set_content(content, options = {}) {
|
|
254
278
|
if (content instanceof Node) {
|
|
@@ -264,6 +288,7 @@ export default class HTMLElement extends Node {
|
|
|
264
288
|
resetParent(this.childNodes, null);
|
|
265
289
|
resetParent(content, this);
|
|
266
290
|
this.childNodes = content;
|
|
291
|
+
this.invalidateSelectorCacheRecursively();
|
|
267
292
|
return this;
|
|
268
293
|
}
|
|
269
294
|
replaceWith(...nodes) {
|
|
@@ -293,6 +318,7 @@ export default class HTMLElement extends Node {
|
|
|
293
318
|
...resetParent(content, parent),
|
|
294
319
|
...parent.childNodes.slice(idx + 1),
|
|
295
320
|
];
|
|
321
|
+
parent.invalidateSelectorCacheRecursively();
|
|
296
322
|
return this;
|
|
297
323
|
}
|
|
298
324
|
get outerHTML() {
|
|
@@ -312,6 +338,7 @@ export default class HTMLElement extends Node {
|
|
|
312
338
|
}
|
|
313
339
|
}
|
|
314
340
|
}
|
|
341
|
+
this.invalidateSelectorCacheRecursively();
|
|
315
342
|
return this;
|
|
316
343
|
}
|
|
317
344
|
get structure() {
|
|
@@ -357,6 +384,7 @@ export default class HTMLElement extends Node {
|
|
|
357
384
|
this.childNodes[o++] = node;
|
|
358
385
|
});
|
|
359
386
|
this.childNodes.length = o;
|
|
387
|
+
this.invalidateSelectorCacheRecursively();
|
|
360
388
|
const attrs = Object.keys(this.rawAttributes)
|
|
361
389
|
.map((key) => {
|
|
362
390
|
const val = this.rawAttributes[key];
|
|
@@ -368,16 +396,49 @@ export default class HTMLElement extends Node {
|
|
|
368
396
|
return this;
|
|
369
397
|
}
|
|
370
398
|
querySelectorAll(selector) {
|
|
371
|
-
|
|
399
|
+
if (this.isSelectorCacheEnabled) {
|
|
400
|
+
const cached = this.getQueryCache().get(selector);
|
|
401
|
+
if (cached?.all) {
|
|
402
|
+
return cached.all.slice();
|
|
403
|
+
}
|
|
404
|
+
}
|
|
405
|
+
const nodes = selectAll(selector, this, {
|
|
372
406
|
xmlMode: false,
|
|
373
407
|
adapter: Matcher,
|
|
374
408
|
});
|
|
409
|
+
if (this.isSelectorCacheEnabled) {
|
|
410
|
+
const cacheEntry = this.getQueryCache().get(selector) || {};
|
|
411
|
+
cacheEntry.all = nodes;
|
|
412
|
+
if (cacheEntry.first === undefined) {
|
|
413
|
+
cacheEntry.first = nodes[0] || null;
|
|
414
|
+
}
|
|
415
|
+
this.getQueryCache().set(selector, cacheEntry);
|
|
416
|
+
}
|
|
417
|
+
return nodes;
|
|
375
418
|
}
|
|
376
419
|
querySelector(selector) {
|
|
377
|
-
|
|
420
|
+
if (this.isSelectorCacheEnabled) {
|
|
421
|
+
const cached = this.getQueryCache().get(selector);
|
|
422
|
+
if (cached?.first !== undefined) {
|
|
423
|
+
return cached.first || null;
|
|
424
|
+
}
|
|
425
|
+
if (cached?.all) {
|
|
426
|
+
const first = cached.all[0] || null;
|
|
427
|
+
cached.first = first;
|
|
428
|
+
this.getQueryCache().set(selector, cached);
|
|
429
|
+
return first;
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
const result = selectOne(selector, this, {
|
|
378
433
|
xmlMode: false,
|
|
379
434
|
adapter: Matcher,
|
|
380
435
|
});
|
|
436
|
+
if (this.isSelectorCacheEnabled) {
|
|
437
|
+
const cacheEntry = this.getQueryCache().get(selector) || {};
|
|
438
|
+
cacheEntry.first = result;
|
|
439
|
+
this.getQueryCache().set(selector, cacheEntry);
|
|
440
|
+
}
|
|
441
|
+
return result;
|
|
381
442
|
}
|
|
382
443
|
getElementsByTagName(tagName) {
|
|
383
444
|
const upperCasedTagName = tagName.toUpperCase();
|
|
@@ -440,22 +501,6 @@ export default class HTMLElement extends Node {
|
|
|
440
501
|
const mapChild = new Map();
|
|
441
502
|
let el = this;
|
|
442
503
|
let old = null;
|
|
443
|
-
function findOne(test, elems) {
|
|
444
|
-
let elem = null;
|
|
445
|
-
for (let i = 0, l = elems.length; i < l && !elem; i++) {
|
|
446
|
-
const el = elems[i];
|
|
447
|
-
if (test(el)) {
|
|
448
|
-
elem = el;
|
|
449
|
-
}
|
|
450
|
-
else {
|
|
451
|
-
const child = mapChild.get(el);
|
|
452
|
-
if (child) {
|
|
453
|
-
elem = findOne(test, [child]);
|
|
454
|
-
}
|
|
455
|
-
}
|
|
456
|
-
}
|
|
457
|
-
return elem;
|
|
458
|
-
}
|
|
459
504
|
while (el) {
|
|
460
505
|
if (old)
|
|
461
506
|
mapChild.set(el, old);
|
|
@@ -545,6 +590,7 @@ export default class HTMLElement extends Node {
|
|
|
545
590
|
if (key === 'id') {
|
|
546
591
|
this.id = '';
|
|
547
592
|
}
|
|
593
|
+
this.invalidateSelectorCacheRecursively();
|
|
548
594
|
return this;
|
|
549
595
|
}
|
|
550
596
|
hasAttribute(key) {
|
|
@@ -580,6 +626,7 @@ export default class HTMLElement extends Node {
|
|
|
580
626
|
if (key === 'id') {
|
|
581
627
|
this.id = value;
|
|
582
628
|
}
|
|
629
|
+
this.invalidateSelectorCacheRecursively();
|
|
583
630
|
return this;
|
|
584
631
|
}
|
|
585
632
|
setAttributes(attributes) {
|
|
@@ -597,6 +644,7 @@ export default class HTMLElement extends Node {
|
|
|
597
644
|
return `${name}=${this.quoteAttribute(String(val))}`;
|
|
598
645
|
})
|
|
599
646
|
.join(' ');
|
|
647
|
+
this.invalidateSelectorCacheRecursively();
|
|
600
648
|
return this;
|
|
601
649
|
}
|
|
602
650
|
insertAdjacentHTML(where, html) {
|
|
@@ -625,11 +673,13 @@ export default class HTMLElement extends Node {
|
|
|
625
673
|
const nodes = resolveInsertable(insertable, this._parseOptions);
|
|
626
674
|
resetParent(nodes, this);
|
|
627
675
|
this.childNodes.unshift(...nodes);
|
|
676
|
+
this.invalidateSelectorCacheRecursively();
|
|
628
677
|
}
|
|
629
678
|
append(...insertable) {
|
|
630
679
|
const nodes = resolveInsertable(insertable, this._parseOptions);
|
|
631
680
|
resetParent(nodes, this);
|
|
632
681
|
this.childNodes.push(...nodes);
|
|
682
|
+
this.invalidateSelectorCacheRecursively();
|
|
633
683
|
}
|
|
634
684
|
before(...insertable) {
|
|
635
685
|
if (!this.parentNode)
|
|
@@ -638,6 +688,7 @@ export default class HTMLElement extends Node {
|
|
|
638
688
|
const siblings = this.parentNode.childNodes;
|
|
639
689
|
resetParent(nodes, this.parentNode);
|
|
640
690
|
siblings.splice(siblings.indexOf(this), 0, ...nodes);
|
|
691
|
+
this.parentNode.invalidateSelectorCacheRecursively();
|
|
641
692
|
}
|
|
642
693
|
after(...insertable) {
|
|
643
694
|
if (!this.parentNode)
|
|
@@ -646,6 +697,7 @@ export default class HTMLElement extends Node {
|
|
|
646
697
|
const siblings = this.parentNode.childNodes;
|
|
647
698
|
resetParent(nodes, this.parentNode);
|
|
648
699
|
siblings.splice(siblings.indexOf(this) + 1, 0, ...nodes);
|
|
700
|
+
this.parentNode.invalidateSelectorCacheRecursively();
|
|
649
701
|
}
|
|
650
702
|
get nextSibling() {
|
|
651
703
|
if (this.parentNode) {
|
|
@@ -12,6 +12,7 @@ export default abstract class Node {
|
|
|
12
12
|
abstract clone(): Node;
|
|
13
13
|
constructor(parentNode?: HTMLElement | null, range?: [number, number]);
|
|
14
14
|
remove(): this;
|
|
15
|
+
invalidateSelectorCacheRecursively(): void;
|
|
15
16
|
get innerText(): string;
|
|
16
17
|
get textContent(): string;
|
|
17
18
|
set textContent(val: string);
|
|
@@ -11,6 +11,9 @@ export default class Node {
|
|
|
11
11
|
}
|
|
12
12
|
remove() {
|
|
13
13
|
if (this.parentNode) {
|
|
14
|
+
if (typeof this.parentNode.invalidateSelectorCacheRecursively === 'function') {
|
|
15
|
+
this.parentNode.invalidateSelectorCacheRecursively();
|
|
16
|
+
}
|
|
14
17
|
const children = this.parentNode.childNodes;
|
|
15
18
|
this.parentNode.childNodes = children.filter((child) => {
|
|
16
19
|
return this !== child;
|
|
@@ -19,6 +22,8 @@ export default class Node {
|
|
|
19
22
|
}
|
|
20
23
|
return this;
|
|
21
24
|
}
|
|
25
|
+
invalidateSelectorCacheRecursively() {
|
|
26
|
+
}
|
|
22
27
|
get innerText() {
|
|
23
28
|
return this.rawText;
|
|
24
29
|
}
|
package/dist/scrape/spider.d.ts
CHANGED
|
@@ -123,6 +123,7 @@ export declare class Spider {
|
|
|
123
123
|
private robotsData;
|
|
124
124
|
private sitemapValidation;
|
|
125
125
|
private robotsValidation;
|
|
126
|
+
private toHeaderRecord;
|
|
126
127
|
constructor(options?: SpiderOptions);
|
|
127
128
|
crawl(startUrl: string): Promise<SpiderResult>;
|
|
128
129
|
private fetchRobotsTxt;
|
package/dist/scrape/spider.js
CHANGED
|
@@ -76,9 +76,6 @@ function shouldCrawl(url, baseHost, options) {
|
|
|
76
76
|
return false;
|
|
77
77
|
}
|
|
78
78
|
}
|
|
79
|
-
function sleep(ms) {
|
|
80
|
-
return new Promise(resolve => setTimeout(resolve, ms));
|
|
81
|
-
}
|
|
82
79
|
function parseExtractSelectors(selectors) {
|
|
83
80
|
const schema = {};
|
|
84
81
|
for (const sel of selectors) {
|
|
@@ -115,6 +112,13 @@ export class Spider {
|
|
|
115
112
|
robotsData = null;
|
|
116
113
|
sitemapValidation = null;
|
|
117
114
|
robotsValidation = null;
|
|
115
|
+
toHeaderRecord(headers) {
|
|
116
|
+
const headerRecord = {};
|
|
117
|
+
headers.forEach((value, key) => {
|
|
118
|
+
headerRecord[key] = value;
|
|
119
|
+
});
|
|
120
|
+
return headerRecord;
|
|
121
|
+
}
|
|
118
122
|
constructor(options = {}) {
|
|
119
123
|
let extractSchema;
|
|
120
124
|
if (options.extract) {
|
|
@@ -194,7 +198,7 @@ export class Spider {
|
|
|
194
198
|
await this.fetchSitemaps(baseUrl);
|
|
195
199
|
}
|
|
196
200
|
const pending = new Map();
|
|
197
|
-
const scheduleUrl = (item
|
|
201
|
+
const scheduleUrl = (item) => {
|
|
198
202
|
const normalized = normalizeUrl(item.url);
|
|
199
203
|
if (this.visited.has(normalized))
|
|
200
204
|
return;
|
|
@@ -230,7 +234,7 @@ export class Spider {
|
|
|
230
234
|
try {
|
|
231
235
|
const urlHost = new URL(sitemapUrl.loc).hostname;
|
|
232
236
|
if (urlHost === this.baseHost) {
|
|
233
|
-
scheduleUrl({ url: sitemapUrl.loc, depth: 1 }
|
|
237
|
+
scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
|
|
234
238
|
}
|
|
235
239
|
}
|
|
236
240
|
catch {
|
|
@@ -303,7 +307,7 @@ export class Spider {
|
|
|
303
307
|
return {
|
|
304
308
|
status: response.status,
|
|
305
309
|
text: await response.text(),
|
|
306
|
-
headers:
|
|
310
|
+
headers: this.toHeaderRecord(response.headers),
|
|
307
311
|
};
|
|
308
312
|
};
|
|
309
313
|
try {
|
|
@@ -351,40 +355,49 @@ export class Spider {
|
|
|
351
355
|
}
|
|
352
356
|
buildSitemapAnalysis() {
|
|
353
357
|
const crawledUrls = new Set(this.results.map(r => normalizeUrl(r.url)));
|
|
354
|
-
const
|
|
358
|
+
const sitemapUrlSet = this.sitemapUrlSet.size > 0
|
|
359
|
+
? this.sitemapUrlSet
|
|
360
|
+
: new Set(this.sitemapUrls.map((u) => normalizeUrl(u.loc)));
|
|
361
|
+
const crawledFromSitemap = Array.from(sitemapUrlSet)
|
|
362
|
+
.filter(url => crawledUrls.has(url))
|
|
363
|
+
.length;
|
|
355
364
|
const linkedUrls = new Set();
|
|
356
|
-
|
|
357
|
-
for (const link of page.links) {
|
|
358
|
-
if (link.href) {
|
|
359
|
-
linkedUrls.add(normalizeUrl(link.href));
|
|
360
|
-
}
|
|
361
|
-
}
|
|
362
|
-
}
|
|
363
|
-
const orphanUrls = this.sitemapUrls
|
|
364
|
-
.filter(u => {
|
|
365
|
-
const normalized = normalizeUrl(u.loc);
|
|
366
|
-
return !linkedUrls.has(normalized) && crawledUrls.has(normalized);
|
|
367
|
-
})
|
|
368
|
-
.map(u => u.loc);
|
|
369
|
-
const missingFromSitemap = Array.from(crawledUrls)
|
|
370
|
-
.filter(url => !this.sitemapUrlSet.has(url));
|
|
371
|
-
const blockedBySitemapRobots = [];
|
|
365
|
+
const blockedBySitemapRobotsSet = new Set();
|
|
372
366
|
if (this.robotsData) {
|
|
373
367
|
for (const sitemapUrl of this.sitemapUrls) {
|
|
374
368
|
try {
|
|
369
|
+
const normalized = normalizeUrl(sitemapUrl.loc);
|
|
375
370
|
const urlPath = new URL(sitemapUrl.loc).pathname;
|
|
376
371
|
if (!isPathAllowed(this.robotsData, urlPath, this.options.userAgent)) {
|
|
377
|
-
|
|
372
|
+
blockedBySitemapRobotsSet.add(normalized);
|
|
378
373
|
}
|
|
379
374
|
}
|
|
380
375
|
catch {
|
|
381
376
|
}
|
|
382
377
|
}
|
|
383
378
|
}
|
|
379
|
+
for (const page of this.results) {
|
|
380
|
+
for (const link of page.links) {
|
|
381
|
+
if (link.href) {
|
|
382
|
+
linkedUrls.add(normalizeUrl(link.href));
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
const orphanUrlSet = new Set();
|
|
387
|
+
for (const u of this.sitemapUrls) {
|
|
388
|
+
const normalized = normalizeUrl(u.loc);
|
|
389
|
+
if (!linkedUrls.has(normalized) && !blockedBySitemapRobotsSet.has(normalized)) {
|
|
390
|
+
orphanUrlSet.add(normalized);
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
const orphanUrls = Array.from(orphanUrlSet);
|
|
394
|
+
const missingFromSitemap = Array.from(crawledUrls)
|
|
395
|
+
.filter(url => !sitemapUrlSet.has(url));
|
|
396
|
+
const blockedBySitemapRobots = Array.from(blockedBySitemapRobotsSet);
|
|
384
397
|
return {
|
|
385
398
|
found: this.sitemapUrls.length > 0,
|
|
386
|
-
url: this.
|
|
387
|
-
totalUrls:
|
|
399
|
+
url: this.sitemapUrls[0]?.loc,
|
|
400
|
+
totalUrls: sitemapUrlSet.size,
|
|
388
401
|
crawledFromSitemap,
|
|
389
402
|
orphanUrls,
|
|
390
403
|
missingFromSitemap,
|