@nitpicker/crawler 0.4.2 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/archive/archive-accessor.d.ts +6 -1
- package/lib/archive/archive-accessor.js +7 -0
- package/lib/archive/database.js +2 -1
- package/package.json +5 -2
- package/CHANGELOG.md +0 -16
- package/src/archive/__mock__/.gitignore +0 -3
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +0 -337
- package/src/archive/archive.ts +0 -408
- package/src/archive/database.spec.ts +0 -469
- package/src/archive/database.ts +0 -1059
- package/src/archive/debug.ts +0 -10
- package/src/archive/filesystem/append-text.spec.ts +0 -26
- package/src/archive/filesystem/append-text.ts +0 -16
- package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
- package/src/archive/filesystem/copy-dir-sync.ts +0 -10
- package/src/archive/filesystem/copy-dir.spec.ts +0 -33
- package/src/archive/filesystem/copy-dir.ts +0 -14
- package/src/archive/filesystem/exists.spec.ts +0 -33
- package/src/archive/filesystem/exists.ts +0 -10
- package/src/archive/filesystem/get-file-list.spec.ts +0 -37
- package/src/archive/filesystem/get-file-list.ts +0 -13
- package/src/archive/filesystem/index.ts +0 -17
- package/src/archive/filesystem/is-dir.spec.ts +0 -29
- package/src/archive/filesystem/is-dir.ts +0 -11
- package/src/archive/filesystem/mkdir.spec.ts +0 -37
- package/src/archive/filesystem/mkdir.ts +0 -16
- package/src/archive/filesystem/output-json.spec.ts +0 -34
- package/src/archive/filesystem/output-json.ts +0 -16
- package/src/archive/filesystem/output-text.spec.ts +0 -31
- package/src/archive/filesystem/output-text.ts +0 -35
- package/src/archive/filesystem/read-json.spec.ts +0 -26
- package/src/archive/filesystem/read-json.ts +0 -12
- package/src/archive/filesystem/read-text.spec.ts +0 -25
- package/src/archive/filesystem/read-text.ts +0 -11
- package/src/archive/filesystem/readline.spec.ts +0 -29
- package/src/archive/filesystem/readline.ts +0 -30
- package/src/archive/filesystem/remove.spec.ts +0 -34
- package/src/archive/filesystem/remove.ts +0 -11
- package/src/archive/filesystem/rename.spec.ts +0 -46
- package/src/archive/filesystem/rename.ts +0 -21
- package/src/archive/filesystem/tar.spec.ts +0 -33
- package/src/archive/filesystem/tar.ts +0 -27
- package/src/archive/filesystem/untar.spec.ts +0 -34
- package/src/archive/filesystem/untar.ts +0 -36
- package/src/archive/index.ts +0 -13
- package/src/archive/page.spec.ts +0 -368
- package/src/archive/page.ts +0 -420
- package/src/archive/resource.spec.ts +0 -101
- package/src/archive/resource.ts +0 -73
- package/src/archive/safe-path.spec.ts +0 -44
- package/src/archive/safe-path.ts +0 -18
- package/src/archive/types.ts +0 -227
- package/src/crawler/clear-destination-cache.spec.ts +0 -20
- package/src/crawler/clear-destination-cache.ts +0 -9
- package/src/crawler/crawler.ts +0 -873
- package/src/crawler/decompose-url.spec.ts +0 -48
- package/src/crawler/decompose-url.ts +0 -90
- package/src/crawler/destination-cache.spec.ts +0 -23
- package/src/crawler/destination-cache.ts +0 -8
- package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
- package/src/crawler/detect-pagination-pattern.ts +0 -66
- package/src/crawler/fetch-destination.ts +0 -257
- package/src/crawler/fetch-robots-txt.spec.ts +0 -83
- package/src/crawler/fetch-robots-txt.ts +0 -91
- package/src/crawler/find-best-matching-scope.spec.ts +0 -39
- package/src/crawler/find-best-matching-scope.ts +0 -57
- package/src/crawler/generate-predicted-urls.spec.ts +0 -42
- package/src/crawler/generate-predicted-urls.ts +0 -34
- package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
- package/src/crawler/handle-ignore-and-skip.ts +0 -30
- package/src/crawler/handle-resource-response.spec.ts +0 -45
- package/src/crawler/handle-resource-response.ts +0 -21
- package/src/crawler/handle-scrape-end.spec.ts +0 -109
- package/src/crawler/handle-scrape-end.ts +0 -115
- package/src/crawler/handle-scrape-error.spec.ts +0 -105
- package/src/crawler/handle-scrape-error.ts +0 -58
- package/src/crawler/index.ts +0 -2
- package/src/crawler/inject-scope-auth.spec.ts +0 -36
- package/src/crawler/inject-scope-auth.ts +0 -27
- package/src/crawler/is-external-url.spec.ts +0 -31
- package/src/crawler/is-external-url.ts +0 -17
- package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
- package/src/crawler/is-in-any-lower-layer.ts +0 -22
- package/src/crawler/link-list.spec.ts +0 -355
- package/src/crawler/link-list.ts +0 -275
- package/src/crawler/link-to-page-data.spec.ts +0 -133
- package/src/crawler/link-to-page-data.ts +0 -34
- package/src/crawler/net-timeout-error.spec.ts +0 -25
- package/src/crawler/net-timeout-error.ts +0 -11
- package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
- package/src/crawler/protocol-agnostic-key.ts +0 -11
- package/src/crawler/reconstruct-url.spec.ts +0 -37
- package/src/crawler/reconstruct-url.ts +0 -37
- package/src/crawler/robots-checker.spec.ts +0 -104
- package/src/crawler/robots-checker.ts +0 -73
- package/src/crawler/should-discard-predicted.spec.ts +0 -125
- package/src/crawler/should-discard-predicted.ts +0 -33
- package/src/crawler/should-skip-url.spec.ts +0 -77
- package/src/crawler/should-skip-url.ts +0 -37
- package/src/crawler/types.ts +0 -146
- package/src/crawler-orchestrator.ts +0 -401
- package/src/debug.ts +0 -10
- package/src/index.ts +0 -25
- package/src/types.ts +0 -30
- package/src/utils/array/each-splitted.spec.ts +0 -38
- package/src/utils/array/each-splitted.ts +0 -19
- package/src/utils/array/index.ts +0 -1
- package/src/utils/debug.ts +0 -6
- package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
- package/src/utils/error/dom-evaluation-error.ts +0 -6
- package/src/utils/error/error-emitter.spec.ts +0 -78
- package/src/utils/error/error-emitter.ts +0 -44
- package/src/utils/error/index.ts +0 -3
- package/src/utils/index.ts +0 -5
- package/src/utils/object/clean-object.spec.ts +0 -24
- package/src/utils/object/clean-object.ts +0 -13
- package/src/utils/object/index.ts +0 -1
- package/src/utils/types/index.ts +0 -1
- package/src/utils/types/types.ts +0 -65
- package/tsconfig.json +0 -11
- package/tsconfig.tsbuildinfo +0 -1
package/src/crawler/crawler.ts
DELETED
|
@@ -1,873 +0,0 @@
|
|
|
1
|
-
import type { CrawlerEventTypes, CrawlerOptions } from './types.js';
|
|
2
|
-
import type {
|
|
3
|
-
ChangePhaseEvent,
|
|
4
|
-
PageData,
|
|
5
|
-
ResourceEntry,
|
|
6
|
-
ScrapeResult,
|
|
7
|
-
} from '@d-zero/beholder';
|
|
8
|
-
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
9
|
-
|
|
10
|
-
import { existsSync } from 'node:fs';
|
|
11
|
-
import path from 'node:path';
|
|
12
|
-
|
|
13
|
-
import Scraper from '@d-zero/beholder';
|
|
14
|
-
import { deal } from '@d-zero/dealer';
|
|
15
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
16
|
-
import { retryCall } from '@d-zero/shared/retry';
|
|
17
|
-
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
18
|
-
import c from 'ansi-colors';
|
|
19
|
-
|
|
20
|
-
import pkg from '../../package.json' with { type: 'json' };
|
|
21
|
-
import { crawlerLog } from '../debug.js';
|
|
22
|
-
|
|
23
|
-
import { detectPaginationPattern } from './detect-pagination-pattern.js';
|
|
24
|
-
import { fetchDestination } from './fetch-destination.js';
|
|
25
|
-
import { generatePredictedUrls } from './generate-predicted-urls.js';
|
|
26
|
-
import { handleIgnoreAndSkip } from './handle-ignore-and-skip.js';
|
|
27
|
-
import { handleResourceResponse } from './handle-resource-response.js';
|
|
28
|
-
import { handleScrapeEnd } from './handle-scrape-end.js';
|
|
29
|
-
import { handleScrapeError } from './handle-scrape-error.js';
|
|
30
|
-
import { injectScopeAuth } from './inject-scope-auth.js';
|
|
31
|
-
import { isExternalUrl } from './is-external-url.js';
|
|
32
|
-
import LinkList from './link-list.js';
|
|
33
|
-
import { linkToPageData } from './link-to-page-data.js';
|
|
34
|
-
import { protocolAgnosticKey } from './protocol-agnostic-key.js';
|
|
35
|
-
import { RobotsChecker } from './robots-checker.js';
|
|
36
|
-
import { shouldDiscardPredicted } from './should-discard-predicted.js';
|
|
37
|
-
import { shouldSkipUrl } from './should-skip-url.js';
|
|
38
|
-
|
|
39
|
-
export type { CrawlerOptions } from './types.js';
|
|
40
|
-
|
|
41
|
-
/**
|
|
42
|
-
* The core crawler engine that discovers and scrapes web pages.
|
|
43
|
-
*
|
|
44
|
-
* The Crawler manages the crawl queue, uses the dealer pattern for concurrent
|
|
45
|
-
* page scraping via `@d-zero/beholder`, handles scrape results, and emits
|
|
46
|
-
* events defined by {@link CrawlerEventTypes}. It supports recursive crawling
|
|
47
|
-
* within a defined scope, external page fetching, URL exclusion, and resumable crawls.
|
|
48
|
-
*
|
|
49
|
-
* Crawling is performed concurrently using the dealer pattern, with
|
|
50
|
-
* configurable parallelism up to {@link Crawler.MAX_PROCESS_LENGTH}.
|
|
51
|
-
*/
|
|
52
|
-
export default class Crawler extends EventEmitter<CrawlerEventTypes> {
|
|
53
|
-
/** Flag set by `abort()` to signal in-progress tasks to exit early. */
|
|
54
|
-
#aborted = false;
|
|
55
|
-
/** Tracks discovered URLs, their scrape status, and deduplication. */
|
|
56
|
-
readonly #linkList = new LinkList();
|
|
57
|
-
/** Merged crawler configuration (user overrides + defaults). */
|
|
58
|
-
readonly #options: CrawlerOptions;
|
|
59
|
-
/** Set of resource URLs (without hash) already captured, for deduplication. */
|
|
60
|
-
readonly #resources = new Set<string>();
|
|
61
|
-
/** URLs restored from a previous session that still need to be scraped. */
|
|
62
|
-
#resumedPending: ExURL[] = [];
|
|
63
|
-
/** URLs already scraped in a previous session, used to populate the `seen` set in {@link #runDeal}. */
|
|
64
|
-
#resumedScraped: string[] = [];
|
|
65
|
-
/** Checker for robots.txt compliance. */
|
|
66
|
-
readonly #robotsChecker: RobotsChecker;
|
|
67
|
-
|
|
68
|
-
/** Maps hostnames to their scope URLs. Defines the crawl boundary for internal/external classification. */
|
|
69
|
-
readonly #scope = new Map<string /* hostname */, ExURL[]>();
|
|
70
|
-
|
|
71
|
-
/**
|
|
72
|
-
* Create a new Crawler instance.
|
|
73
|
-
* @param options - Configuration options for crawling behavior. All fields have
|
|
74
|
-
* sensible defaults if omitted.
|
|
75
|
-
*/
|
|
76
|
-
constructor(options?: Partial<CrawlerOptions>) {
|
|
77
|
-
super();
|
|
78
|
-
this.#options = {
|
|
79
|
-
interval: options?.interval || 0,
|
|
80
|
-
parallels: options?.parallels || 0,
|
|
81
|
-
recursive: options?.recursive ?? true,
|
|
82
|
-
fromList: false,
|
|
83
|
-
captureImages: options?.captureImages ?? true,
|
|
84
|
-
executablePath: options?.executablePath ?? null,
|
|
85
|
-
fetchExternal: options?.fetchExternal ?? true,
|
|
86
|
-
scope: options?.scope ?? [],
|
|
87
|
-
excludes: options?.excludes || [],
|
|
88
|
-
excludeKeywords: options?.excludeKeywords || [],
|
|
89
|
-
excludeUrls: options?.excludeUrls || [],
|
|
90
|
-
maxExcludedDepth: options?.maxExcludedDepth || 10,
|
|
91
|
-
retry: options?.retry ?? 3,
|
|
92
|
-
disableQueries: options?.disableQueries ?? false,
|
|
93
|
-
verbose: options?.verbose ?? false,
|
|
94
|
-
userAgent: options?.userAgent || `Nitpicker/${pkg.version}`,
|
|
95
|
-
ignoreRobots: options?.ignoreRobots ?? false,
|
|
96
|
-
};
|
|
97
|
-
|
|
98
|
-
this.#robotsChecker = new RobotsChecker(
|
|
99
|
-
this.#options.userAgent,
|
|
100
|
-
!this.#options.ignoreRobots,
|
|
101
|
-
);
|
|
102
|
-
|
|
103
|
-
for (const urlStr of this.#options.scope) {
|
|
104
|
-
const url = parseUrl(urlStr, this.#options);
|
|
105
|
-
if (url) {
|
|
106
|
-
const existing = this.#scope.get(url.hostname) || [];
|
|
107
|
-
this.#scope.set(url.hostname, [...existing, url]);
|
|
108
|
-
}
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
/**
|
|
113
|
-
* Abort the current crawl operation.
|
|
114
|
-
*
|
|
115
|
-
* Sets the aborted flag and immediately emits a `crawlEnd` event.
|
|
116
|
-
* In-progress scrape tasks will check the flag and exit early.
|
|
117
|
-
*/
|
|
118
|
-
abort() {
|
|
119
|
-
this.#aborted = true;
|
|
120
|
-
void this.emit('crawlEnd', {});
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
/**
|
|
124
|
-
* Retrieve the list of Chromium process IDs that are still running.
|
|
125
|
-
*
|
|
126
|
-
* In the current architecture, process cleanup is handled by the dealer,
|
|
127
|
-
* so this always returns an empty array.
|
|
128
|
-
* @returns An empty array (reserved for future use).
|
|
129
|
-
*/
|
|
130
|
-
getUndeadPid() {
|
|
131
|
-
return [];
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
/**
|
|
135
|
-
* Restore crawl state from a previous session for resumable crawling.
|
|
136
|
-
*
|
|
137
|
-
* Repopulates the link list with pending and already-scraped URLs,
|
|
138
|
-
* and restores the set of known resource URLs to avoid duplicates.
|
|
139
|
-
* @param pending - URLs that were pending (not yet scraped) in the previous session.
|
|
140
|
-
* @param scraped - URLs that were already scraped in the previous session.
|
|
141
|
-
* @param resources - Resource URLs that were already captured in the previous session.
|
|
142
|
-
*/
|
|
143
|
-
resume(pending: string[], scraped: string[], resources: string[]) {
|
|
144
|
-
this.#resumedPending = this.#linkList.resume(pending, scraped, this.#options);
|
|
145
|
-
this.#resumedScraped = scraped;
|
|
146
|
-
for (const resource of resources) {
|
|
147
|
-
this.#resources.add(resource);
|
|
148
|
-
}
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
/**
|
|
152
|
-
* Start crawling from a single root URL.
|
|
153
|
-
*
|
|
154
|
-
* Adds the root URL to the scope (if not already present) and the link list,
|
|
155
|
-
* then begins the deal-based concurrent crawl. Discovered child pages are
|
|
156
|
-
* automatically added to the queue when recursive mode is enabled.
|
|
157
|
-
* @param url - The root URL to begin crawling from.
|
|
158
|
-
*/
|
|
159
|
-
start(url: ExURL) {
|
|
160
|
-
const existing = this.#scope.get(url.hostname) || [];
|
|
161
|
-
if (!existing.some((u) => u.href === url.href)) {
|
|
162
|
-
this.#scope.set(url.hostname, [...existing, url]);
|
|
163
|
-
}
|
|
164
|
-
this.#linkList.add(url);
|
|
165
|
-
|
|
166
|
-
const isResuming = this.#resumedScraped.length > 0;
|
|
167
|
-
const initialUrls = isResuming ? this.#resumedPending : [url];
|
|
168
|
-
const resumeOffset = this.#resumedScraped.length;
|
|
169
|
-
|
|
170
|
-
if (initialUrls.length === 0) {
|
|
171
|
-
crawlerLog('Crawl End (nothing to resume)');
|
|
172
|
-
void this.emit('crawlEnd', {});
|
|
173
|
-
return;
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
void this.#runDeal(initialUrls, resumeOffset).catch((error) => {
|
|
177
|
-
crawlerLog('runDeal error: %O', error);
|
|
178
|
-
void this.emit('error', {
|
|
179
|
-
pid: process.pid,
|
|
180
|
-
isMainProcess: true,
|
|
181
|
-
url: url.href,
|
|
182
|
-
error: error instanceof Error ? error : new Error(String(error)),
|
|
183
|
-
});
|
|
184
|
-
void this.emit('crawlEnd', {});
|
|
185
|
-
});
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
/**
|
|
189
|
-
* Start crawling a pre-defined list of URLs in non-recursive mode.
|
|
190
|
-
*
|
|
191
|
-
* Each URL in the list is added to the scope and the link list. Recursive
|
|
192
|
-
* crawling is disabled; only the provided URLs will be scraped.
|
|
193
|
-
* @param pageList - The list of URLs to crawl. Must contain at least one URL.
|
|
194
|
-
* @throws {Error} If the page list is empty.
|
|
195
|
-
*/
|
|
196
|
-
startMultiple(pageList: ExURL[]) {
|
|
197
|
-
if (!pageList[0]) {
|
|
198
|
-
throw new Error('pageList is empty');
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
const scopeMap = new Map<string, Set<string>>();
|
|
202
|
-
for (const pageUrl of pageList) {
|
|
203
|
-
const existing = this.#scope.get(pageUrl.hostname) || [];
|
|
204
|
-
const existingHrefs =
|
|
205
|
-
scopeMap.get(pageUrl.hostname) || new Set(existing.map((u) => u.href));
|
|
206
|
-
|
|
207
|
-
if (!existingHrefs.has(pageUrl.href)) {
|
|
208
|
-
this.#scope.set(pageUrl.hostname, [...existing, pageUrl]);
|
|
209
|
-
existingHrefs.add(pageUrl.href);
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
scopeMap.set(pageUrl.hostname, existingHrefs);
|
|
213
|
-
this.#linkList.add(pageUrl);
|
|
214
|
-
}
|
|
215
|
-
this.#options.recursive = false;
|
|
216
|
-
this.#options.fromList = true;
|
|
217
|
-
void this.#runDeal(pageList).catch((error) => {
|
|
218
|
-
crawlerLog('runDeal error: %O', error);
|
|
219
|
-
void this.emit('error', {
|
|
220
|
-
pid: process.pid,
|
|
221
|
-
isMainProcess: true,
|
|
222
|
-
url: pageList[0]!.href,
|
|
223
|
-
error: error instanceof Error ? error : new Error(String(error)),
|
|
224
|
-
});
|
|
225
|
-
void this.emit('crawlEnd', {});
|
|
226
|
-
});
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
/**
|
|
230
|
-
* Processes captured sub-resources from a page scrape, deduplicates them,
|
|
231
|
-
* and emits `response` / `responseReferrers` events for new resources.
|
|
232
|
-
* @param resources - Sub-resource entries captured during the page load
|
|
233
|
-
*/
|
|
234
|
-
#handleResources(resources: ResourceEntry[]) {
|
|
235
|
-
for (const { resource, pageUrl } of resources) {
|
|
236
|
-
const { isNew } = handleResourceResponse(
|
|
237
|
-
resource as CrawlerEventTypes['response']['resource'],
|
|
238
|
-
this.#resources,
|
|
239
|
-
);
|
|
240
|
-
if (isNew) {
|
|
241
|
-
void this.emit('response', {
|
|
242
|
-
resource: resource as CrawlerEventTypes['response']['resource'],
|
|
243
|
-
});
|
|
244
|
-
}
|
|
245
|
-
void this.emit('responseReferrers', {
|
|
246
|
-
url: pageUrl,
|
|
247
|
-
src: resource.url.withoutHash,
|
|
248
|
-
});
|
|
249
|
-
}
|
|
250
|
-
}
|
|
251
|
-
/**
|
|
252
|
-
* Dispatches a scrape result to the appropriate handler based on its type.
|
|
253
|
-
*
|
|
254
|
-
* - `success`: Processes anchors, enqueues new URLs, triggers predicted
|
|
255
|
-
* pagination detection, and emits `page` / `externalPage` events.
|
|
256
|
-
* - `skipped`: Marks the URL as done and emits `skip`.
|
|
257
|
-
* - `error`: Creates a fallback PageData, marks as done, and emits `error`.
|
|
258
|
-
* @param result - The scrape result from beholder
|
|
259
|
-
* @param url - The URL that was scraped
|
|
260
|
-
* @param push - Dealer's push callback to enqueue newly discovered URLs
|
|
261
|
-
* @param paginationState - Mutable state for predicted pagination cascade prevention
|
|
262
|
-
* @param paginationState.lastPushedUrl
|
|
263
|
-
* @param paginationState.lastPushedWasPredicted
|
|
264
|
-
* @param concurrency - Current concurrency level, used to determine predicted URL count
|
|
265
|
-
*/
|
|
266
|
-
#handleResult(
|
|
267
|
-
result: ScrapeResult,
|
|
268
|
-
url: ExURL,
|
|
269
|
-
push: (...items: ExURL[]) => Promise<void>,
|
|
270
|
-
paginationState?: { lastPushedUrl: string | null; lastPushedWasPredicted: boolean },
|
|
271
|
-
concurrency?: number,
|
|
272
|
-
) {
|
|
273
|
-
switch (result.type) {
|
|
274
|
-
case 'success': {
|
|
275
|
-
if (!result.pageData) break;
|
|
276
|
-
handleScrapeEnd(
|
|
277
|
-
result.pageData,
|
|
278
|
-
this.#linkList,
|
|
279
|
-
this.#scope,
|
|
280
|
-
this.#options,
|
|
281
|
-
(newUrl, opts) => {
|
|
282
|
-
this.#linkList.add(newUrl, opts);
|
|
283
|
-
void push(newUrl);
|
|
284
|
-
|
|
285
|
-
// Predicted pagination detection
|
|
286
|
-
if (!paginationState || !concurrency) return;
|
|
287
|
-
|
|
288
|
-
// metadataOnly / external: update tracking but skip pattern detection
|
|
289
|
-
if (opts?.metadataOnly || isExternalUrl(newUrl, this.#scope)) {
|
|
290
|
-
paginationState.lastPushedUrl = newUrl.withoutHashAndAuth;
|
|
291
|
-
paginationState.lastPushedWasPredicted = false;
|
|
292
|
-
return;
|
|
293
|
-
}
|
|
294
|
-
|
|
295
|
-
// Skip comparison when last push was predicted (cascade prevention)
|
|
296
|
-
if (
|
|
297
|
-
paginationState.lastPushedUrl &&
|
|
298
|
-
!paginationState.lastPushedWasPredicted
|
|
299
|
-
) {
|
|
300
|
-
const pattern = detectPaginationPattern(
|
|
301
|
-
paginationState.lastPushedUrl,
|
|
302
|
-
newUrl.withoutHashAndAuth,
|
|
303
|
-
);
|
|
304
|
-
if (pattern) {
|
|
305
|
-
const urls = generatePredictedUrls(
|
|
306
|
-
pattern,
|
|
307
|
-
newUrl.withoutHashAndAuth,
|
|
308
|
-
concurrency,
|
|
309
|
-
);
|
|
310
|
-
for (const specUrlStr of urls) {
|
|
311
|
-
const specUrl = parseUrl(specUrlStr, this.#options);
|
|
312
|
-
if (specUrl) {
|
|
313
|
-
this.#linkList.add(specUrl, { predicted: true });
|
|
314
|
-
void push(specUrl);
|
|
315
|
-
}
|
|
316
|
-
}
|
|
317
|
-
paginationState.lastPushedUrl = newUrl.withoutHashAndAuth;
|
|
318
|
-
paginationState.lastPushedWasPredicted = true;
|
|
319
|
-
return;
|
|
320
|
-
}
|
|
321
|
-
}
|
|
322
|
-
|
|
323
|
-
paginationState.lastPushedUrl = newUrl.withoutHashAndAuth;
|
|
324
|
-
paginationState.lastPushedWasPredicted = false;
|
|
325
|
-
},
|
|
326
|
-
);
|
|
327
|
-
if (result.pageData.isExternal) {
|
|
328
|
-
void this.emit('externalPage', { result: result.pageData });
|
|
329
|
-
} else {
|
|
330
|
-
void this.emit('page', { result: result.pageData });
|
|
331
|
-
}
|
|
332
|
-
break;
|
|
333
|
-
}
|
|
334
|
-
case 'skipped': {
|
|
335
|
-
if (!result.ignored) break;
|
|
336
|
-
handleIgnoreAndSkip(
|
|
337
|
-
result.ignored.url,
|
|
338
|
-
this.#linkList,
|
|
339
|
-
this.#scope,
|
|
340
|
-
this.#options,
|
|
341
|
-
);
|
|
342
|
-
void this.emit('skip', {
|
|
343
|
-
url: result.ignored.url.href,
|
|
344
|
-
reason: JSON.stringify(result.ignored),
|
|
345
|
-
isExternal: isExternalUrl(result.ignored.url, this.#scope),
|
|
346
|
-
});
|
|
347
|
-
break;
|
|
348
|
-
}
|
|
349
|
-
case 'error': {
|
|
350
|
-
if (!result.error) break;
|
|
351
|
-
const error = new Error(result.error.message);
|
|
352
|
-
error.name = result.error.name;
|
|
353
|
-
error.stack = result.error.stack;
|
|
354
|
-
const { result: pageResult } = handleScrapeError(
|
|
355
|
-
{
|
|
356
|
-
url,
|
|
357
|
-
error,
|
|
358
|
-
shutdown: result.error.shutdown,
|
|
359
|
-
pid: undefined,
|
|
360
|
-
},
|
|
361
|
-
this.#linkList,
|
|
362
|
-
this.#scope,
|
|
363
|
-
this.#options,
|
|
364
|
-
);
|
|
365
|
-
if (pageResult) {
|
|
366
|
-
if (pageResult.isExternal) {
|
|
367
|
-
void this.emit('externalPage', { result: pageResult });
|
|
368
|
-
} else {
|
|
369
|
-
void this.emit('page', { result: pageResult });
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
void this.emit('error', {
|
|
373
|
-
pid: process.pid,
|
|
374
|
-
isMainProcess: true,
|
|
375
|
-
url: url.href,
|
|
376
|
-
error,
|
|
377
|
-
});
|
|
378
|
-
break;
|
|
379
|
-
}
|
|
380
|
-
}
|
|
381
|
-
}
|
|
382
|
-
/**
|
|
383
|
-
* Launches a fresh Puppeteer browser, runs the beholder scraper, and cleans up.
|
|
384
|
-
*
|
|
385
|
-
* WHY per-URL browser: Each URL gets its own browser instance to ensure
|
|
386
|
-
* complete isolation (cookies, cache, service workers). The browser is always
|
|
387
|
-
* closed in the `finally` block, even on error.
|
|
388
|
-
* @param url - Target URL to scrape
|
|
389
|
-
* @param update - Callback for progress messages
|
|
390
|
-
* @param isExternal - Whether the URL is external to the crawl scope
|
|
391
|
-
* @param metadataOnly - When true, only extract title metadata
|
|
392
|
-
* @param headCheckResult - Optional HEAD result to pass to the scraper, avoiding a redundant request
|
|
393
|
-
* @returns The scrape result from beholder
|
|
394
|
-
*/
|
|
395
|
-
async #launchBrowserAndScrape(
|
|
396
|
-
url: ExURL,
|
|
397
|
-
update: (log: string) => void,
|
|
398
|
-
isExternal: boolean,
|
|
399
|
-
metadataOnly: boolean,
|
|
400
|
-
headCheckResult?: PageData,
|
|
401
|
-
): Promise<ScrapeResult> {
|
|
402
|
-
update('Launching browser%dots%');
|
|
403
|
-
if (this.#options.executablePath) {
|
|
404
|
-
const execPath = path.resolve(this.#options.executablePath);
|
|
405
|
-
if (!existsSync(execPath)) {
|
|
406
|
-
throw new Error(`Executable path does not exist: ${execPath}`);
|
|
407
|
-
}
|
|
408
|
-
}
|
|
409
|
-
const puppeteer = await import('puppeteer');
|
|
410
|
-
const browser = await puppeteer.launch({
|
|
411
|
-
headless: true,
|
|
412
|
-
...(this.#options.executablePath
|
|
413
|
-
? { executablePath: this.#options.executablePath }
|
|
414
|
-
: {}),
|
|
415
|
-
});
|
|
416
|
-
|
|
417
|
-
try {
|
|
418
|
-
update('Creating page%dots%');
|
|
419
|
-
const page = await browser.newPage();
|
|
420
|
-
await page.setUserAgent(this.#options.userAgent);
|
|
421
|
-
const scraper = new Scraper();
|
|
422
|
-
|
|
423
|
-
scraper.on('changePhase', (e) => {
|
|
424
|
-
const msg = formatPhaseLog(e);
|
|
425
|
-
if (msg) {
|
|
426
|
-
update(msg);
|
|
427
|
-
}
|
|
428
|
-
void this.emit('changePhase', e);
|
|
429
|
-
});
|
|
430
|
-
|
|
431
|
-
const result = await scraper.scrapeStart(page, url, {
|
|
432
|
-
isExternal,
|
|
433
|
-
captureImages: !isExternal && this.#options.captureImages,
|
|
434
|
-
excludeKeywords: this.#options.excludeKeywords,
|
|
435
|
-
disableQueries: this.#options.disableQueries,
|
|
436
|
-
metadataOnly,
|
|
437
|
-
retries: this.#options.retry,
|
|
438
|
-
headCheckResult,
|
|
439
|
-
});
|
|
440
|
-
|
|
441
|
-
update('Closing browser%dots%');
|
|
442
|
-
return result;
|
|
443
|
-
} catch (error) {
|
|
444
|
-
return {
|
|
445
|
-
type: 'error',
|
|
446
|
-
resources: [],
|
|
447
|
-
error: {
|
|
448
|
-
name: error instanceof Error ? error.name : 'Error',
|
|
449
|
-
message: error instanceof Error ? error.message : String(error),
|
|
450
|
-
stack: error instanceof Error ? error.stack : undefined,
|
|
451
|
-
shutdown: true,
|
|
452
|
-
},
|
|
453
|
-
};
|
|
454
|
-
} finally {
|
|
455
|
-
await browser.close().catch(() => {});
|
|
456
|
-
}
|
|
457
|
-
}
|
|
458
|
-
/**
|
|
459
|
-
* Runs the deal-based concurrent crawl loop.
|
|
460
|
-
*
|
|
461
|
-
* WHY deal(): The `@d-zero/dealer` pattern provides concurrent item processing
|
|
462
|
-
* with a dynamic queue — new URLs discovered during scraping are pushed via the
|
|
463
|
-
* `push` callback and automatically scheduled. The `onPush` deduplication ensures
|
|
464
|
-
* each URL is processed at most once (protocol-agnostic comparison).
|
|
465
|
-
* @param initialUrls - Starting URLs to seed the deal queue
|
|
466
|
-
* @param resumeOffset - Number of URLs already scraped in a previous session,
|
|
467
|
-
* added to the progress counter for accurate display
|
|
468
|
-
*/
|
|
469
|
-
async #runDeal(initialUrls: ExURL[], resumeOffset = 0) {
|
|
470
|
-
const seen = new Set<string>(
|
|
471
|
-
initialUrls.map((u) => protocolAgnosticKey(u.withoutHashAndAuth)),
|
|
472
|
-
);
|
|
473
|
-
|
|
474
|
-
// Add scraped URLs to seen to prevent re-processing during resume
|
|
475
|
-
for (const url of this.#resumedScraped) {
|
|
476
|
-
seen.add(protocolAgnosticKey(url));
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
// external URL の追跡(target は deal の total/done から導出)
|
|
480
|
-
const externalUrls = new Set<string>();
|
|
481
|
-
const externalDoneUrls = new Set<string>();
|
|
482
|
-
|
|
483
|
-
// 初期 URL を分類(onPush を通らないため)
|
|
484
|
-
for (const url of initialUrls) {
|
|
485
|
-
if (isExternalUrl(url, this.#scope)) {
|
|
486
|
-
externalUrls.add(protocolAgnosticKey(url.withoutHashAndAuth));
|
|
487
|
-
}
|
|
488
|
-
}
|
|
489
|
-
|
|
490
|
-
const concurrency = this.#options.parallels
|
|
491
|
-
? Math.max(this.#options.parallels, 1)
|
|
492
|
-
: Crawler.MAX_PROCESS_LENGTH;
|
|
493
|
-
|
|
494
|
-
// Predicted pagination state
|
|
495
|
-
const paginationState = {
|
|
496
|
-
lastPushedUrl: null as string | null,
|
|
497
|
-
lastPushedWasPredicted: false,
|
|
498
|
-
};
|
|
499
|
-
|
|
500
|
-
await deal(
|
|
501
|
-
initialUrls,
|
|
502
|
-
(url, update, _index, setLineHeader, push) => {
|
|
503
|
-
const isExternal = isExternalUrl(url, this.#scope);
|
|
504
|
-
const urlText = isExternal ? c.dim(url.href) : c.cyan(url.href);
|
|
505
|
-
setLineHeader(`%braille% ${urlText}: `);
|
|
506
|
-
injectScopeAuth(url, this.#scope);
|
|
507
|
-
this.#linkList.add(url);
|
|
508
|
-
this.#linkList.progress(url);
|
|
509
|
-
|
|
510
|
-
return async () => {
|
|
511
|
-
if (this.#aborted) return;
|
|
512
|
-
const log = createTimedUpdate(update, this.#options.verbose);
|
|
513
|
-
|
|
514
|
-
try {
|
|
515
|
-
const robotsAllowed = await this.#robotsChecker.isAllowed(url);
|
|
516
|
-
if (!robotsAllowed) {
|
|
517
|
-
handleIgnoreAndSkip(url, this.#linkList, this.#scope, this.#options);
|
|
518
|
-
void this.emit('skip', {
|
|
519
|
-
url: url.href,
|
|
520
|
-
reason: 'blocked by robots.txt',
|
|
521
|
-
isExternal,
|
|
522
|
-
});
|
|
523
|
-
log(c.gray('Blocked by robots.txt'));
|
|
524
|
-
return;
|
|
525
|
-
}
|
|
526
|
-
|
|
527
|
-
const isSkip = shouldSkipUrl({
|
|
528
|
-
url,
|
|
529
|
-
excludes: this.#options.excludes,
|
|
530
|
-
excludeUrls: this.#options.excludeUrls,
|
|
531
|
-
options: this.#options,
|
|
532
|
-
});
|
|
533
|
-
|
|
534
|
-
if (isSkip) {
|
|
535
|
-
handleIgnoreAndSkip(url, this.#linkList, this.#scope, this.#options);
|
|
536
|
-
void this.emit('skip', { url: url.href, reason: 'excluded', isExternal });
|
|
537
|
-
log(c.gray('Skipped'));
|
|
538
|
-
return;
|
|
539
|
-
}
|
|
540
|
-
|
|
541
|
-
if (!this.#options.fetchExternal && isExternal) {
|
|
542
|
-
const pageData = linkToPageData({
|
|
543
|
-
url,
|
|
544
|
-
isExternal,
|
|
545
|
-
isLowerLayer: false,
|
|
546
|
-
});
|
|
547
|
-
this.#linkList.done(url, this.#scope, { page: pageData }, this.#options);
|
|
548
|
-
void this.emit('externalPage', { result: pageData });
|
|
549
|
-
log(c.dim('External (skip fetch)'));
|
|
550
|
-
return;
|
|
551
|
-
}
|
|
552
|
-
|
|
553
|
-
const metadataOnly = this.#linkList.isMetadataOnly(url.withoutHash);
|
|
554
|
-
const isPredicted = this.#linkList.isPredicted(url.withoutHashAndAuth);
|
|
555
|
-
|
|
556
|
-
log('Scraping%dots%');
|
|
557
|
-
const result = await this.#scrapePage(url, log, metadataOnly);
|
|
558
|
-
|
|
559
|
-
// Discard predicted URLs that failed (404, error, etc.)
|
|
560
|
-
if (isPredicted && shouldDiscardPredicted(result)) {
|
|
561
|
-
handleIgnoreAndSkip(url, this.#linkList, this.#scope, this.#options);
|
|
562
|
-
log(c.dim('Predicted (discarded)'));
|
|
563
|
-
return;
|
|
564
|
-
}
|
|
565
|
-
|
|
566
|
-
log('Saving results%dots%');
|
|
567
|
-
this.#handleResult(result, url, push, paginationState, concurrency);
|
|
568
|
-
this.#handleResources(result.resources);
|
|
569
|
-
log(formatResultSummary(result));
|
|
570
|
-
} finally {
|
|
571
|
-
if (isExternal) {
|
|
572
|
-
externalDoneUrls.add(protocolAgnosticKey(url.withoutHashAndAuth));
|
|
573
|
-
}
|
|
574
|
-
}
|
|
575
|
-
};
|
|
576
|
-
},
|
|
577
|
-
{
|
|
578
|
-
limit: concurrency,
|
|
579
|
-
interval: this.#options.interval,
|
|
580
|
-
verbose: this.#options.verbose || !process.stdout.isTTY,
|
|
581
|
-
header: (_progress, done, total, limit) => {
|
|
582
|
-
const allDone = done + resumeOffset;
|
|
583
|
-
const allTotal = total + resumeOffset;
|
|
584
|
-
const extTotal = externalUrls.size;
|
|
585
|
-
const extDone = externalDoneUrls.size;
|
|
586
|
-
const pct = allTotal > 0 ? Math.round((allDone / allTotal) * 100) : 0;
|
|
587
|
-
return (
|
|
588
|
-
c.bold(`Crawling: ${allDone - extDone}/${allTotal - extTotal}`) +
|
|
589
|
-
c.dim(`(${extDone}/${extTotal})`) +
|
|
590
|
-
c.bold(` (${pct}%) [${limit} parallel]`)
|
|
591
|
-
);
|
|
592
|
-
},
|
|
593
|
-
onPush: (url) => {
|
|
594
|
-
const key = protocolAgnosticKey(url.withoutHashAndAuth);
|
|
595
|
-
if (seen.has(key)) return false;
|
|
596
|
-
seen.add(key);
|
|
597
|
-
if (isExternalUrl(url, this.#scope)) {
|
|
598
|
-
externalUrls.add(key);
|
|
599
|
-
}
|
|
600
|
-
return true;
|
|
601
|
-
},
|
|
602
|
-
},
|
|
603
|
-
);
|
|
604
|
-
|
|
605
|
-
crawlerLog('Crawl End');
|
|
606
|
-
void this.emit('crawlEnd', {});
|
|
607
|
-
}
|
|
608
|
-
/**
|
|
609
|
-
* Orchestrates the full scrape pipeline for a single URL.
|
|
610
|
-
*
|
|
611
|
-
* Flow:
|
|
612
|
-
* 1. Non-HTTP protocols → delegate directly to browser scraper
|
|
613
|
-
* 2. HEAD pre-flight → check availability and content type
|
|
614
|
-
* 3. Title-only mode → extract `<title>` via partial GET, skip browser
|
|
615
|
-
* 4. Non-HTML content → return HEAD result, skip browser
|
|
616
|
-
* 5. HTML content → launch browser with preflight result
|
|
617
|
-
* @param url - Target URL to scrape
|
|
618
|
-
* @param update - Callback for progress messages
|
|
619
|
-
* @param metadataOnly - When true, only extract title metadata without full browser scraping
|
|
620
|
-
* @returns The scrape result
|
|
621
|
-
*/
|
|
622
|
-
async #scrapePage(
|
|
623
|
-
url: ExURL,
|
|
624
|
-
update: (log: string) => void,
|
|
625
|
-
metadataOnly: boolean,
|
|
626
|
-
): Promise<ScrapeResult> {
|
|
627
|
-
const isExternal = isExternalUrl(url, this.#scope);
|
|
628
|
-
|
|
629
|
-
// Non-HTTP protocols (mailto:, tel:, etc.) — let the scraper handle early return
|
|
630
|
-
if (!url.isHTTP) {
|
|
631
|
-
return this.#launchBrowserAndScrape(url, update, isExternal, metadataOnly);
|
|
632
|
-
}
|
|
633
|
-
|
|
634
|
-
// Pre-flight: lightweight HEAD request to check server availability
|
|
635
|
-
update('HEAD request%dots%');
|
|
636
|
-
let headCheckResult: PageData;
|
|
637
|
-
try {
|
|
638
|
-
headCheckResult = await this.#sendHeadRequest(url, isExternal, update);
|
|
639
|
-
} catch (error) {
|
|
640
|
-
// Server unreachable — skip browser launch entirely
|
|
641
|
-
update(c.red('Unreachable'));
|
|
642
|
-
return {
|
|
643
|
-
type: 'error',
|
|
644
|
-
resources: [],
|
|
645
|
-
error: {
|
|
646
|
-
name: error instanceof Error ? error.name : 'Error',
|
|
647
|
-
message: error instanceof Error ? error.message : String(error),
|
|
648
|
-
stack: error instanceof Error ? error.stack : undefined,
|
|
649
|
-
shutdown: false,
|
|
650
|
-
},
|
|
651
|
-
};
|
|
652
|
-
}
|
|
653
|
-
|
|
654
|
-
// Title-only mode — extract <title> via partial GET for HTML, skip browser
|
|
655
|
-
if (metadataOnly) {
|
|
656
|
-
if (
|
|
657
|
-
headCheckResult.contentType === null ||
|
|
658
|
-
headCheckResult.contentType === 'text/html'
|
|
659
|
-
) {
|
|
660
|
-
update('Fetching title%dots%');
|
|
661
|
-
try {
|
|
662
|
-
const titleResult = await fetchDestination({
|
|
663
|
-
url,
|
|
664
|
-
isExternal,
|
|
665
|
-
method: 'GET',
|
|
666
|
-
options: { titleBytesLimit: 16_384 },
|
|
667
|
-
userAgent: this.#options.userAgent,
|
|
668
|
-
});
|
|
669
|
-
return {
|
|
670
|
-
type: 'success',
|
|
671
|
-
pageData: { ...titleResult, isTarget: false },
|
|
672
|
-
resources: [],
|
|
673
|
-
};
|
|
674
|
-
} catch (error) {
|
|
675
|
-
crawlerLog('Title GET failed for %s: %O', url.href, error);
|
|
676
|
-
}
|
|
677
|
-
}
|
|
678
|
-
return {
|
|
679
|
-
type: 'success',
|
|
680
|
-
pageData: { ...headCheckResult, isTarget: false },
|
|
681
|
-
resources: [],
|
|
682
|
-
};
|
|
683
|
-
}
|
|
684
|
-
|
|
685
|
-
// Non-HTML content — skip browser
|
|
686
|
-
if (
|
|
687
|
-
headCheckResult.contentType !== null &&
|
|
688
|
-
headCheckResult.contentType !== 'text/html'
|
|
689
|
-
) {
|
|
690
|
-
return {
|
|
691
|
-
type: 'success',
|
|
692
|
-
pageData: headCheckResult,
|
|
693
|
-
resources: [],
|
|
694
|
-
};
|
|
695
|
-
}
|
|
696
|
-
|
|
697
|
-
// HTML or unknown content type — launch browser with preflight result
|
|
698
|
-
return this.#launchBrowserAndScrape(
|
|
699
|
-
url,
|
|
700
|
-
update,
|
|
701
|
-
isExternal,
|
|
702
|
-
metadataOnly,
|
|
703
|
-
headCheckResult,
|
|
704
|
-
);
|
|
705
|
-
}
|
|
706
|
-
/**
|
|
707
|
-
* Performs a pre-flight HTTP HEAD request with retry logic.
|
|
708
|
-
*
|
|
709
|
-
* WHY pre-flight: Avoids launching a browser for URLs that are unreachable,
|
|
710
|
-
* non-HTML, or return error status codes. This saves significant time and
|
|
711
|
-
* resources compared to launching Puppeteer for every URL.
|
|
712
|
-
* @param url - Target URL to check
|
|
713
|
-
* @param isExternal - Whether the URL is external to the crawl scope
|
|
714
|
-
* @param update - Callback for progress messages shown in the dealer display
|
|
715
|
-
* @returns Lightweight page data from the HEAD response
|
|
716
|
-
*/
|
|
717
|
-
async #sendHeadRequest(
|
|
718
|
-
url: ExURL,
|
|
719
|
-
isExternal: boolean,
|
|
720
|
-
update: (msg: string) => void,
|
|
721
|
-
): Promise<PageData> {
|
|
722
|
-
return retryCall(
|
|
723
|
-
() => fetchDestination({ url, isExternal, userAgent: this.#options.userAgent }),
|
|
724
|
-
{
|
|
725
|
-
retries: this.#options.retry,
|
|
726
|
-
label: 'HEAD request',
|
|
727
|
-
onWait: (determinedInterval, retryCount, label, error) => {
|
|
728
|
-
update(
|
|
729
|
-
`${label}: ${error.message} — %countdown(${determinedInterval},fetchHead_${retryCount},s)%s (retry #${retryCount + 1})`,
|
|
730
|
-
);
|
|
731
|
-
},
|
|
732
|
-
onGiveUp: (retryCount, error, label) => {
|
|
733
|
-
update(
|
|
734
|
-
c.red(`${label}: gave up after ${retryCount} retries — ${error.message}`),
|
|
735
|
-
);
|
|
736
|
-
},
|
|
737
|
-
},
|
|
738
|
-
);
|
|
739
|
-
}
|
|
740
|
-
|
|
741
|
-
  /**
   * The default maximum number of concurrent scraping processes.
   *
   * Used when `parallels` is not specified or is set to 0.
   */
  static readonly MAX_PROCESS_LENGTH = 10;
|
|
747
|
-
}
|
|
748
|
-
|
|
749
|
-
/**
|
|
750
|
-
* Colorize an HTTP status code string for terminal display.
|
|
751
|
-
*
|
|
752
|
-
* - 2xx: green
|
|
753
|
-
* - 3xx: yellow
|
|
754
|
-
* - 4xx/5xx: red
|
|
755
|
-
* - Unknown: no color
|
|
756
|
-
* @param status - The HTTP status code, or `undefined` if unknown.
|
|
757
|
-
* @returns A colorized "Done (status)" string.
|
|
758
|
-
*/
|
|
759
|
-
function colorStatus(status: number | undefined) {
|
|
760
|
-
const text = `Done (${status ?? '?'})`;
|
|
761
|
-
if (!status) return text;
|
|
762
|
-
if (status < 300) return c.green(text);
|
|
763
|
-
if (status < 400) return c.yellow(text);
|
|
764
|
-
return c.red(text);
|
|
765
|
-
}
|
|
766
|
-
|
|
767
|
-
/**
|
|
768
|
-
* Maps a beholder phase event to a human-readable log message for the dealer display.
|
|
769
|
-
* Returns `null` for phases that should not produce visible output (e.g. scrapeStart/End).
|
|
770
|
-
* @param e - The phase change event from beholder
|
|
771
|
-
* @returns A formatted message string, or `null` to suppress output
|
|
772
|
-
*/
|
|
773
|
-
function formatPhaseLog(e: ChangePhaseEvent): string | null {
|
|
774
|
-
switch (e.name) {
|
|
775
|
-
case 'scrapeStart':
|
|
776
|
-
case 'scrapeEnd': {
|
|
777
|
-
return null;
|
|
778
|
-
}
|
|
779
|
-
case 'headRequest': {
|
|
780
|
-
return 'HEAD request%dots%';
|
|
781
|
-
}
|
|
782
|
-
case 'openPage': {
|
|
783
|
-
return e.message;
|
|
784
|
-
}
|
|
785
|
-
case 'loadDOMContent': {
|
|
786
|
-
return c.dim('DOM loaded');
|
|
787
|
-
}
|
|
788
|
-
case 'getHTML': {
|
|
789
|
-
return 'Getting HTML%dots%';
|
|
790
|
-
}
|
|
791
|
-
case 'waitNetworkIdle': {
|
|
792
|
-
return 'Waiting for network idle%dots%';
|
|
793
|
-
}
|
|
794
|
-
case 'getAnchors': {
|
|
795
|
-
return 'Extracting anchors%dots%';
|
|
796
|
-
}
|
|
797
|
-
case 'getMeta': {
|
|
798
|
-
return 'Extracting meta%dots%';
|
|
799
|
-
}
|
|
800
|
-
case 'extractImages': {
|
|
801
|
-
return 'Fetching images%dots%';
|
|
802
|
-
}
|
|
803
|
-
case 'setViewport':
|
|
804
|
-
case 'scrollToBottom':
|
|
805
|
-
case 'waitImageLoad':
|
|
806
|
-
case 'retryWait': {
|
|
807
|
-
return e.message;
|
|
808
|
-
}
|
|
809
|
-
case 'retryExhausted': {
|
|
810
|
-
return c.red(e.message);
|
|
811
|
-
}
|
|
812
|
-
case 'getImages': {
|
|
813
|
-
return e.message;
|
|
814
|
-
}
|
|
815
|
-
case 'pageSkipped': {
|
|
816
|
-
return c.yellow(`Skipped: ${e.message}`);
|
|
817
|
-
}
|
|
818
|
-
default: {
|
|
819
|
-
return e.name;
|
|
820
|
-
}
|
|
821
|
-
}
|
|
822
|
-
}
|
|
823
|
-
|
|
824
|
-
/**
|
|
825
|
-
* Wraps an update callback to append elapsed time between calls (e.g. `+42ms`).
|
|
826
|
-
* Only active when verbose mode is enabled; otherwise returns the original callback.
|
|
827
|
-
* @param update - The original dealer update callback
|
|
828
|
-
* @param verbose - Whether verbose mode is enabled
|
|
829
|
-
* @returns A wrapped update callback that appends timing information
|
|
830
|
-
*/
|
|
831
|
-
function createTimedUpdate(
|
|
832
|
-
update: (msg: string) => void,
|
|
833
|
-
verbose?: boolean,
|
|
834
|
-
): (msg: string) => void {
|
|
835
|
-
if (!verbose) return update;
|
|
836
|
-
let prev = Date.now();
|
|
837
|
-
return (msg: string) => {
|
|
838
|
-
const now = Date.now();
|
|
839
|
-
const delta = now - prev;
|
|
840
|
-
prev = now;
|
|
841
|
-
update(`${msg} ${c.dim(`+${delta}ms`)}`);
|
|
842
|
-
};
|
|
843
|
-
}
|
|
844
|
-
|
|
845
|
-
/**
|
|
846
|
-
* Formats a one-line summary of a scrape result for the dealer display.
|
|
847
|
-
* Shows HTTP status (colorized), anchor/image/resource counts for target pages.
|
|
848
|
-
* @param result - The scrape result to summarize
|
|
849
|
-
* @returns A colorized summary string
|
|
850
|
-
*/
|
|
851
|
-
function formatResultSummary(result: ScrapeResult): string {
|
|
852
|
-
switch (result.type) {
|
|
853
|
-
case 'success': {
|
|
854
|
-
const status = colorStatus(result.pageData?.status);
|
|
855
|
-
if (result.pageData?.isTarget) {
|
|
856
|
-
const anchors = result.pageData.anchorList.length;
|
|
857
|
-
const images = result.pageData.imageList.length;
|
|
858
|
-
const resources = result.resources.length;
|
|
859
|
-
return `${status} ${c.cyan(`\u{1F517} ${anchors}`)} ${c.magenta(`\u{1F5BC}\u{FE0F} ${images}`)} ${c.dim(`\u{1F4E6} ${resources}`)}`;
|
|
860
|
-
}
|
|
861
|
-
return status;
|
|
862
|
-
}
|
|
863
|
-
case 'skipped': {
|
|
864
|
-
return c.gray('Skipped');
|
|
865
|
-
}
|
|
866
|
-
case 'error': {
|
|
867
|
-
return c.red('Error');
|
|
868
|
-
}
|
|
869
|
-
default: {
|
|
870
|
-
return result.type;
|
|
871
|
-
}
|
|
872
|
-
}
|
|
873
|
-
}
|