scraply 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/core/queue.js CHANGED
@@ -3,19 +3,21 @@ import { loadJSON, saveJSON, deletePath } from '../storage/files.js';
3
3
  /**
4
4
  * @typedef {Object} QueueEntry
5
5
  * @property {string} url
6
- * @property {string|null} file - path to the saved crawled file, or null
6
+ * @property {string|null} file - filename of the saved crawled record (relative to crawledDir), or null
7
7
  * @property {number|null} status - last HTTP status
8
8
  * @property {string|null} error - error message, or null
9
+ * @property {string|null} skipped - reason the page was skipped (e.g. content-type), or null
9
10
  * @property {string|null} referrer - URL this entry was discovered on
10
11
  * @property {number} depth
11
12
  */
12
13
 
13
- const isProcessed = (entry) => entry.file !== null || entry.error !== null;
14
+ const isProcessed = (entry) => entry.file !== null || entry.error !== null || entry.skipped !== null;
14
15
 
15
16
  /**
16
17
  * Owns the crawl queue: dedup, depth limiting, status tracking and durable
17
- * checkpointing. Persistence is debounced so a high-concurrency crawl does not
18
- * rewrite the queue file on every single URL.
18
+ * checkpointing. Status totals are tracked incrementally (O(1) reads) and
19
+ * persistence is debounced so a high-concurrency crawl does not rewrite the
20
+ * queue file on every single URL.
19
21
  */
20
22
  export class QueueManager {
21
23
  /** @param {{ config: import('../index.js').ResolvedConfig, logger: any }} deps */
@@ -32,16 +34,30 @@ export class QueueManager {
32
34
  /** @type {QueueEntry[]} */
33
35
  this._pending = [];
34
36
  this._cursor = 0;
37
+ this._crawled = 0;
38
+ this._errors = 0;
39
+ this._skipped = 0;
35
40
  this._dirty = false;
36
41
  this._timer = null;
37
42
  this._persistInterval = 1000;
38
43
  }
39
44
 
40
- /** Loads any previously persisted queue and rebuilds the in-memory indexes. */
45
+ /** Loads any previously persisted queue and rebuilds the in-memory indexes and totals. */
41
46
  load() {
42
47
  this.entries = loadJSON(this.path, []) ?? [];
43
48
  this.index = new Set(this.entries.map((entry) => entry.url));
44
- this._pending = this.entries.filter((entry) => !isProcessed(entry));
49
+ this._pending = [];
50
+ this._crawled = 0;
51
+ this._errors = 0;
52
+ this._skipped = 0;
53
+
54
+ for (const entry of this.entries) {
55
+ if (entry.file !== null) this._crawled += 1;
56
+ else if (entry.error !== null) this._errors += 1;
57
+ else if (entry.skipped !== null) this._skipped += 1;
58
+ else this._pending.push(entry);
59
+ }
60
+
45
61
  this._cursor = 0;
46
62
  return this.entries;
47
63
  }
@@ -59,7 +75,7 @@ export class QueueManager {
59
75
  add(url, { depth = 0, referrer = null } = {}) {
60
76
  if (this.index.has(url) || depth > this.maxDepth) return false;
61
77
 
62
- const entry = { url, file: null, status: null, error: null, referrer, depth };
78
+ const entry = { url, file: null, status: null, error: null, skipped: null, referrer, depth };
63
79
  this.index.add(url);
64
80
  this.entries.push(entry);
65
81
  this._pending.push(entry);
@@ -76,29 +92,82 @@ export class QueueManager {
76
92
  entry.file = file;
77
93
  entry.status = status;
78
94
  entry.error = null;
95
+ entry.skipped = null;
96
+ this._crawled += 1;
79
97
  this._markDirty();
80
98
  }
81
99
 
82
100
  markError(entry, { error, status }) {
83
101
  entry.error = error;
84
102
  entry.status = status ?? null;
103
+ this._errors += 1;
104
+ this._markDirty();
105
+ }
106
+
107
+ markSkipped(entry, { reason, status }) {
108
+ entry.skipped = reason;
109
+ entry.status = status ?? null;
110
+ this._skipped += 1;
85
111
  this._markDirty();
86
112
  }
87
113
 
114
+ /**
115
+ * Returns matching terminal entries to the pending set so the next crawl
116
+ * retries them. Persists immediately so a fresh `load()` (e.g. at the start of
117
+ * `crawl()`) sees the requeued entries.
118
+ * @param {(entry: QueueEntry) => boolean} match
119
+ * @returns {number} how many entries were requeued
120
+ */
121
+ _requeue(match) {
122
+ let count = 0;
123
+ for (const entry of this.entries) {
124
+ if (!match(entry)) continue;
125
+
126
+ if (entry.error !== null) this._errors -= 1;
127
+ if (entry.skipped !== null) this._skipped -= 1;
128
+
129
+ entry.error = null;
130
+ entry.skipped = null;
131
+ entry.status = null;
132
+ this._pending.push(entry);
133
+ count += 1;
134
+ }
135
+ if (count > 0) this.flush();
136
+ return count;
137
+ }
138
+
139
+ /** Re-queues every errored entry for retry. @returns {number} */
140
+ requeueErrors() {
141
+ return this._requeue((entry) => entry.error !== null);
142
+ }
143
+
144
+ /**
145
+ * Re-queues every skipped entry for another attempt. Useful after widening
146
+ * `allowedContentTypes` (or changing `sites`) so previously skipped URLs are
147
+ * reconsidered. @returns {number}
148
+ */
149
+ requeueSkipped() {
150
+ return this._requeue((entry) => entry.skipped !== null);
151
+ }
152
+
88
153
  isAllProcessed() {
89
- return this.entries.length > 0 && this.entries.every(isProcessed);
154
+ return this.entries.length > 0 && this.pendingCount() === 0;
90
155
  }
91
156
 
92
157
  pendingCount() {
93
- return this.entries.filter((entry) => !isProcessed(entry)).length;
158
+ return this.entries.length - this._crawled - this._errors - this._skipped;
94
159
  }
95
160
 
96
161
  crawledCount() {
97
- return this.entries.filter((entry) => entry.file !== null).length;
162
+ return this._crawled;
98
163
  }
99
164
 
100
165
  errorCount() {
101
- return this.entries.filter((entry) => entry.error !== null).length;
166
+ return this._errors;
167
+ }
168
+
169
+ skippedCount() {
170
+ return this._skipped;
102
171
  }
103
172
 
104
173
  /** Clears in-memory state and removes the persisted queue file. */
@@ -107,6 +176,9 @@ export class QueueManager {
107
176
  this.index = new Set();
108
177
  this._pending = [];
109
178
  this._cursor = 0;
179
+ this._crawled = 0;
180
+ this._errors = 0;
181
+ this._skipped = 0;
110
182
  this._dirty = false;
111
183
  deletePath(this.path);
112
184
  }
package/src/core/retry.js CHANGED
@@ -1,4 +1,5 @@
1
1
  import { delay } from '../util/delay.js';
2
+ import { RateLimitError } from './errors.js';
2
3
 
3
4
  /** Derives how long to wait (ms) from rate-limit headers, falling back to a default. */
4
5
  const computeWait = (headers = {}, fallback) => {
@@ -22,46 +23,53 @@ const computeWait = (headers = {}, fallback) => {
22
23
  * Wraps a fetch operation with retry and rate-limit handling shared by every
23
24
  * fetcher backend.
24
25
  *
25
- * @param {{ config: import('../index.js').ResolvedConfig, logger: any, onRateLimitExit: (code: number) => void }} deps
26
+ * Rate limiting (HTTP 429) is handled independently of the normal retry budget:
27
+ * when `rateLimit.exitOnLimit` is false the runner waits (honoring `retry-after`
28
+ * / `x-ratelimit-reset`) and retries until the host relents; otherwise it throws
29
+ * a `RateLimitError` so the crawl aborts cleanly and can be resumed later.
30
+ *
31
+ * @param {{ config: import('../index.js').ResolvedConfig, logger: any }} deps
26
32
  */
27
- export const createRetryRunner = ({ config, logger, onRateLimitExit }) => {
33
+ export const createRetryRunner = ({ config, logger }) => {
28
34
  const { retry, rateLimit } = config;
29
35
 
30
- const shouldRetry = async (error) => {
31
- const status = error?.response?.status;
32
- if (status === undefined) return true; // network/transport error
33
-
34
- if (status === 429) {
35
- if (rateLimit.exitOnLimit) return false; // run() handles the exit
36
- const wait = computeWait(error.response.headers, rateLimit.fallbackDelay);
37
- logger.warn(`Rate limited. Waiting ${Math.round(wait / 1000)}s before retrying...`);
38
- await delay(wait);
39
- return true;
40
- }
41
-
42
- return retry.statusCodes.includes(status);
43
- };
44
-
45
36
  const run = async (fn) => {
46
- for (let attempt = 0; ; attempt++) {
37
+ let attempt = 0;
38
+
39
+ for (;;) {
47
40
  try {
48
41
  return await fn();
49
42
  } catch (error) {
50
- const canRetry = attempt < retry.max && (await shouldRetry(error));
51
- if (canRetry) {
52
- logger.info(`Retry ${attempt + 1}/${retry.max} -> ${error.message}`);
43
+ const status = error?.response?.status;
44
+
45
+ if (status === 429) {
46
+ if (rateLimit.exitOnLimit) {
47
+ logger.warn(`Rate limited. Aborting crawl (exitOnLimit) with code ${rateLimit.exitCode}.`);
48
+ throw new RateLimitError('Rate limited', {
49
+ code: rateLimit.exitCode,
50
+ headers: error.response.headers,
51
+ cause: error
52
+ });
53
+ }
54
+
55
+ const wait = computeWait(error.response.headers, rateLimit.fallbackDelay);
56
+ logger.warn(`Rate limited. Waiting ${Math.round(wait / 1000)}s before retrying...`);
57
+ await delay(wait);
58
+ continue; // rate-limit waits never consume the retry budget
59
+ }
60
+
61
+ const retriable = status === undefined || retry.statusCodes.includes(status);
62
+ if (retriable && attempt < retry.max) {
63
+ attempt += 1;
64
+ logger.info(`Retry ${attempt}/${retry.max} -> ${error.message}`);
53
65
  if (retry.delay > 0) await delay(retry.delay);
54
66
  continue;
55
67
  }
56
68
 
57
- if (error?.response?.status === 429) {
58
- logger.warn(`Force exiting with code ${rateLimit.exitCode} (rate limited).`);
59
- onRateLimitExit(rateLimit.exitCode);
60
- }
61
69
  throw error;
62
70
  }
63
71
  }
64
72
  };
65
73
 
66
- return { run, shouldRetry };
74
+ return { run };
67
75
  };