@arcblock/crawler 1.5.1 → 1.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,6 +54,7 @@ function createCrawlQueue(queue, handler) {
54
54
  options: {
55
55
  concurrency: config_1.config.concurrency,
56
56
  enableScheduledJob: true,
57
+ maxRetries: 3,
57
58
  },
58
59
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
59
60
  const startTime = Date.now();
@@ -205,6 +206,22 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
205
206
  let html = null;
206
207
  let screenshot = null;
207
208
  const meta = {};
209
+ const closePageSafely = () => __awaiter(void 0, void 0, void 0, function* () {
210
+ try {
211
+ yield page.close();
212
+ }
213
+ catch (error) {
214
+ if ((0, puppeteer_1.isBrowserConnectionError)(error)) {
215
+ try {
216
+ yield (0, puppeteer_1.closeBrowser)({ trimCache: false });
217
+ }
218
+ catch (closeError) {
219
+ config_1.logger.warn('Failed to close browser after page close error', { error: closeError });
220
+ }
221
+ }
222
+ config_1.logger.warn('Failed to close page:', { error });
223
+ }
224
+ });
208
225
  try {
209
226
  const response = yield page.goto(url, { timeout });
210
227
  if (!response) {
@@ -291,14 +308,23 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
291
308
  config_1.logger.error('Failed to get html:', err);
292
309
  throw err;
293
310
  }
311
+ yield closePageSafely();
294
312
  }
295
313
  catch (error) {
314
+ if ((0, puppeteer_1.isBrowserConnectionError)(error)) {
315
+ try {
316
+ yield (0, puppeteer_1.closeBrowser)({ trimCache: false });
317
+ }
318
+ catch (closeError) {
319
+ config_1.logger.warn('Failed to close browser after page error', { error: closeError });
320
+ }
321
+ }
322
+ else {
323
+ yield closePageSafely();
324
+ }
296
325
  config_1.logger.error('Failed to get page content:', error);
297
326
  throw error;
298
327
  }
299
- finally {
300
- yield page.close();
301
- }
302
328
  return {
303
329
  html,
304
330
  screenshot,
@@ -62,6 +62,8 @@ function collectMetrics() {
62
62
  try {
63
63
  // 收集队列大小
64
64
  const jobStats = yield store_1.Job.stats();
65
+ // Reset first to clear queues that no longer have jobs
66
+ exports.queueSize.reset();
65
67
  jobStats.queues.forEach((q) => {
66
68
  exports.queueSize.set({ queue: q.queue }, q.count);
67
69
  });
@@ -1,16 +1,16 @@
1
- import puppeteer, { Browser, Page, ResourceType } from '@blocklet/puppeteer';
1
+ import puppeteer, { Browser, ResourceType } from '@blocklet/puppeteer';
2
2
  export { puppeteer };
3
3
  export declare function ensurePuppeteerrc(): Promise<{
4
4
  cacheDirectory: string;
5
5
  temporaryDirectory: string;
6
6
  }>;
7
7
  export declare function ensureBrowser(): Promise<void>;
8
- export declare function connectBrowser(): Promise<Browser | null>;
9
8
  export declare function launchBrowser(): Promise<Browser>;
9
+ export declare function isBrowserConnectionError(error: unknown): boolean;
10
10
  export declare const getBrowser: () => Promise<Browser>;
11
11
  export declare const closeBrowser: ({ trimCache }?: {
12
12
  trimCache?: boolean;
13
- }) => Promise<void>;
13
+ }) => Promise<void> | undefined;
14
14
  export declare function initPage({ abortResourceTypes }?: {
15
15
  abortResourceTypes?: ResourceType[];
16
- }): Promise<Page>;
16
+ }): Promise<any>;
@@ -15,27 +15,25 @@ Object.defineProperty(exports, "__esModule", { value: true });
15
15
  exports.closeBrowser = exports.getBrowser = exports.puppeteer = void 0;
16
16
  exports.ensurePuppeteerrc = ensurePuppeteerrc;
17
17
  exports.ensureBrowser = ensureBrowser;
18
- exports.connectBrowser = connectBrowser;
19
18
  exports.launchBrowser = launchBrowser;
19
+ exports.isBrowserConnectionError = isBrowserConnectionError;
20
20
  exports.initPage = initPage;
21
21
  const puppeteer_1 = __importDefault(require("@blocklet/puppeteer"));
22
22
  exports.puppeteer = puppeteer_1.default;
23
23
  const fs_extra_1 = __importDefault(require("fs-extra"));
24
24
  const path_1 = __importDefault(require("path"));
25
- const timers_1 = require("timers");
26
25
  const config_1 = require("./config");
27
- const store_1 = require("./store");
28
26
  const utils_1 = require("./utils");
29
- const BrowserStatus = {
30
- None: 'None',
31
- Launching: 'Launching',
32
- Ready: 'Ready',
33
- };
34
- let browserStatus = BrowserStatus.None;
35
- /** Chromium WebSocket endpoint that allows puppeteer browser instance to connect to the browser */
36
- let browserEndpoint = '';
37
27
  let browser;
38
- let browserActivatedTimer;
28
+ let browserInitInFlight;
29
+ let closingBrowser;
30
+ const BROWSER_CONNECTION_ERROR_PATTERNS = [
31
+ /protocol error/i,
32
+ /target closed/i,
33
+ /browser disconnected/i,
34
+ /session closed/i,
35
+ /target crashed/i,
36
+ ];
39
37
  function ensurePuppeteerrc() {
40
38
  return __awaiter(this, void 0, void 0, function* () {
41
39
  const cacheDirectory = path_1.default.join(config_1.config.cacheDir, 'puppeteer', 'cache');
@@ -78,7 +76,7 @@ function ensureBrowser() {
78
76
  }
79
77
  // try to launch browser
80
78
  if (config_1.config.isProd) {
81
- const browser = yield launchBrowser();
79
+ const browser = yield (0, exports.getBrowser)();
82
80
  if (!browser) {
83
81
  throw new Error('Failed to launch browser');
84
82
  }
@@ -87,34 +85,8 @@ function ensureBrowser() {
87
85
  config_1.logger.info('Puppeteer is ready');
88
86
  });
89
87
  }
90
- function connectBrowser() {
91
- return __awaiter(this, void 0, void 0, function* () {
92
- if (!browserEndpoint) {
93
- return null;
94
- }
95
- // retry if browser is launching
96
- if (browserStatus === BrowserStatus.Launching) {
97
- yield (0, utils_1.sleep)(Math.floor(Math.random() * 1000));
98
- return connectBrowser();
99
- }
100
- try {
101
- browser = yield puppeteer_1.default.connect({
102
- browserWSEndpoint: browserEndpoint,
103
- });
104
- config_1.logger.info('Connect browser success');
105
- }
106
- catch (err) {
107
- config_1.logger.warn('Connect browser failed, clear endpoint', err);
108
- browserEndpoint = '';
109
- return null;
110
- }
111
- return browser;
112
- });
113
- }
114
88
  function launchBrowser() {
115
89
  return __awaiter(this, void 0, void 0, function* () {
116
- browserEndpoint = '';
117
- browserStatus = BrowserStatus.Launching;
118
90
  try {
119
91
  browser = yield puppeteer_1.default.launch({
120
92
  headless: true,
@@ -146,125 +118,139 @@ function launchBrowser() {
146
118
  '--disable-gpu-sandbox',
147
119
  ],
148
120
  });
121
+ attachBrowserListeners(browser);
149
122
  config_1.logger.info('Launch browser');
150
123
  }
151
124
  catch (error) {
152
125
  config_1.logger.error('launch browser failed: ', error);
153
- browserStatus = BrowserStatus.None;
154
- browserEndpoint = '';
155
126
  throw error;
156
127
  }
157
- // save browserWSEndpoint to cache
158
- browserEndpoint = yield browser.wsEndpoint();
159
- browserStatus = BrowserStatus.Ready;
160
128
  return browser;
161
129
  });
162
130
  }
163
- function checkBrowserActivated() {
164
- clearBrowserActivatedTimer();
165
- let count = 0;
166
- browserActivatedTimer = (0, timers_1.setInterval)(() => __awaiter(this, void 0, void 0, function* () {
167
- var _a;
168
- if (browser) {
169
- const pages = yield browser.pages().catch(() => []);
170
- const jobCount = yield store_1.Job.count().catch(() => 0);
171
- // Check if browser is inactive: only blank page AND no pending jobs
172
- const isInactive = pages.length === 1 && ((_a = pages[0]) === null || _a === void 0 ? void 0 : _a.url()) === 'about:blank' && jobCount === 0;
173
- if (isInactive) {
174
- count++;
175
- config_1.logger.debug(`Browser inactive count: ${count}/3`);
176
- }
177
- else {
178
- count = 0;
179
- if (jobCount > 0) {
180
- config_1.logger.debug(`Browser has ${jobCount} pending jobs, keeping active`);
181
- }
182
- }
183
- if (count >= 3) {
184
- config_1.logger.info('Browser inactive for 3 minutes, closing...');
185
- yield (0, exports.closeBrowser)({
186
- trimCache: true,
187
- });
188
- }
131
+ function resetBrowserState(reason) {
132
+ if (reason) {
133
+ config_1.logger.warn('Reset browser state', { reason });
134
+ }
135
+ browser = null;
136
+ browserInitInFlight = null;
137
+ }
138
+ function isBrowserConnectionError(error) {
139
+ const message = error instanceof Error ? error.message : String(error || '');
140
+ return BROWSER_CONNECTION_ERROR_PATTERNS.some((pattern) => pattern.test(message));
141
+ }
142
+ function attachBrowserListeners(target) {
143
+ target.on('disconnected', () => {
144
+ if (browser !== target) {
145
+ return;
189
146
  }
190
- }), 1000 * 60);
147
+ config_1.logger.warn('Browser disconnected');
148
+ resetBrowserState('disconnected');
149
+ });
191
150
  }
192
- function clearBrowserActivatedTimer() {
193
- if (browserActivatedTimer) {
194
- (0, timers_1.clearInterval)(browserActivatedTimer);
195
- browserActivatedTimer = null;
196
- }
151
+ function initBrowser() {
152
+ return __awaiter(this, void 0, void 0, function* () {
153
+ // sleep random time (0 ~ 5s),to avoid concurrent blocklet
154
+ yield (0, utils_1.sleep)(Math.floor(Math.random() * 1000 * 5));
155
+ const launchedBrowser = yield launchBrowser();
156
+ if (launchedBrowser) {
157
+ config_1.logger.debug('getBrowser.launchedBrowser');
158
+ browser = launchedBrowser;
159
+ return browser;
160
+ }
161
+ throw new Error('No browser to use, should install redis or browser');
162
+ });
197
163
  }
198
164
  const getBrowser = () => __awaiter(void 0, void 0, void 0, function* () {
199
- if (browser)
200
- return browser;
201
- // sleep random time (0 ~ 5s),to avoid concurrent blocklet
202
- yield (0, utils_1.sleep)(Math.floor(Math.random() * 1000 * 5));
203
- // try to connect browser
204
- const connectedBrowser = yield connectBrowser();
205
- if (connectedBrowser) {
206
- config_1.logger.debug('getBrowser.connectedBrowser');
207
- browser = connectedBrowser;
208
- checkBrowserActivated();
209
- return browser;
165
+ // Wait for any ongoing browser close operation to complete
166
+ if (closingBrowser) {
167
+ yield closingBrowser;
210
168
  }
211
- // try to launch browser
212
- const launchedBrowser = yield launchBrowser();
213
- if (launchedBrowser) {
214
- config_1.logger.debug('getBrowser.launchedBrowser');
215
- browser = launchedBrowser;
216
- checkBrowserActivated();
217
- return browser;
169
+ if (browser) {
170
+ if (browser.isConnected()) {
171
+ return browser;
172
+ }
173
+ config_1.logger.warn('Browser instance is disconnected, resetting');
174
+ resetBrowserState('disconnected');
175
+ }
176
+ if (browserInitInFlight) {
177
+ return browserInitInFlight;
218
178
  }
219
- throw new Error('No browser to use, should install redis or browser');
179
+ const initPromise = initBrowser();
180
+ browserInitInFlight = initPromise;
181
+ return initPromise.finally(() => {
182
+ if (browserInitInFlight === initPromise) {
183
+ browserInitInFlight = null;
184
+ }
185
+ });
220
186
  });
221
187
  exports.getBrowser = getBrowser;
222
- const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, function* ({ trimCache = true } = {}) {
188
+ const closeBrowser = ({ trimCache = true } = {}) => {
189
+ // Return existing close operation if already in progress
190
+ if (closingBrowser) {
191
+ return closingBrowser;
192
+ }
223
193
  if (!browser)
224
194
  return;
225
- // close all pages
226
- try {
227
- const pages = yield browser.pages();
228
- yield Promise.all(pages.map((page) => page.close()));
229
- }
230
- catch (err) {
231
- config_1.logger.warn('Failed to close all pages:', err);
232
- }
233
- // close browser
234
- try {
235
- yield browser.close();
236
- }
237
- catch (err) {
238
- config_1.logger.warn('Failed to close browser:', err);
239
- }
240
- // clear cache
241
- try {
242
- if (trimCache) {
243
- yield puppeteer_1.default.trimCache();
244
- config_1.logger.debug('Trim cache success');
195
+ const target = browser;
196
+ browser = null;
197
+ browserInitInFlight = null;
198
+ const doClose = () => __awaiter(void 0, void 0, void 0, function* () {
199
+ // close all pages
200
+ try {
201
+ const pages = yield target.pages();
202
+ yield Promise.all(pages.map((page) => page.close().catch(() => { })));
203
+ }
204
+ catch (err) {
205
+ config_1.logger.warn('Failed to close all pages:', err);
245
206
  }
246
- // try to clear temporary directory
247
- // if (puppeteerConfig) {
248
- // await fs.emptyDir(puppeteerConfig.temporaryDirectory);
249
- // }
250
- if (global.gc) {
251
- global.gc();
207
+ // close browser
208
+ try {
209
+ yield target.close();
252
210
  }
253
- }
254
- catch (err) {
255
- config_1.logger.warn('Failed to clear browser cache:', err);
256
- }
257
- browser = null;
258
- clearBrowserActivatedTimer();
259
- browserEndpoint = '';
260
- browserStatus = BrowserStatus.None;
261
- config_1.logger.info('Close browser success');
262
- });
211
+ catch (err) {
212
+ config_1.logger.warn('Failed to close browser:', err);
213
+ }
214
+ // clear cache
215
+ try {
216
+ if (trimCache) {
217
+ yield puppeteer_1.default.trimCache();
218
+ config_1.logger.debug('Trim cache success');
219
+ }
220
+ if (global.gc) {
221
+ global.gc();
222
+ }
223
+ }
224
+ catch (err) {
225
+ config_1.logger.warn('Failed to clear browser cache:', err);
226
+ }
227
+ config_1.logger.info('Close browser success');
228
+ });
229
+ closingBrowser = doClose().finally(() => {
230
+ closingBrowser = null;
231
+ });
232
+ return closingBrowser;
233
+ };
263
234
  exports.closeBrowser = closeBrowser;
264
235
  function initPage() {
265
236
  return __awaiter(this, arguments, void 0, function* ({ abortResourceTypes = [] } = {}) {
266
- const browser = yield (0, exports.getBrowser)();
267
- const page = yield browser.newPage();
237
+ const currentBrowser = yield (0, exports.getBrowser)();
238
+ let page;
239
+ try {
240
+ page = yield currentBrowser.newPage();
241
+ }
242
+ catch (error) {
243
+ // If newPage fails due to connection error, close browser and retry once
244
+ if (isBrowserConnectionError(error)) {
245
+ config_1.logger.warn('Failed to create new page due to connection error, restarting browser');
246
+ yield (0, exports.closeBrowser)({ trimCache: false });
247
+ const newBrowser = yield (0, exports.getBrowser)();
248
+ page = yield newBrowser.newPage();
249
+ }
250
+ else {
251
+ throw error;
252
+ }
253
+ }
268
254
  yield page.setViewport({ width: 1440, height: 900 });
269
255
  // page setting
270
256
  // add custom headers
@@ -4,7 +4,7 @@ export declare function convertJobToSnapshot({ job, snapshot }: {
4
4
  job: JobState;
5
5
  snapshot?: Partial<SnapshotModel>;
6
6
  }): SnapshotModel;
7
- export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
7
+ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel | null>;
8
8
  /**
9
9
  * get snapshot from db or crawl queue
10
10
  */
@@ -54,6 +54,12 @@ function formatSnapshot(snapshot, columns) {
54
54
  dataDir: config_1.config.dataDir,
55
55
  snapshot,
56
56
  });
57
+ // If the file is missing, delete the invalid snapshot record
58
+ if ((err === null || err === void 0 ? void 0 : err.code) === 'ENOENT') {
59
+ config_1.logger.warn('HTML file missing, deleting invalid snapshot record', { jobId: snapshot.jobId });
60
+ yield store_1.Snapshot.destroy({ where: { jobId: snapshot.jobId } });
61
+ return null;
62
+ }
57
63
  data.html = '';
58
64
  }
59
65
  }
@@ -114,14 +120,26 @@ function deleteSnapshots(where_1) {
114
120
  });
115
121
  const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
116
122
  try {
123
+ // Check reference count before deleting files
124
+ // Only delete file if no other snapshots reference it
125
+ const deleteFilePromises = [];
126
+ if (snapshot.html) {
127
+ const htmlRefCount = yield store_1.Snapshot.count({ where: { html: snapshot.html } });
128
+ if (htmlRefCount <= 1) {
129
+ deleteFilePromises.push(promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.html)).catch(() => { }));
130
+ }
131
+ }
132
+ if (snapshot.screenshot) {
133
+ const screenshotRefCount = yield store_1.Snapshot.count({ where: { screenshot: snapshot.screenshot } });
134
+ if (screenshotRefCount <= 1) {
135
+ deleteFilePromises.push(promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.screenshot)).catch(() => { }));
136
+ }
137
+ }
117
138
  try {
118
- yield Promise.all([
119
- snapshot.html && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.html)),
120
- snapshot.screenshot && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.screenshot)),
121
- ]);
139
+ yield Promise.all(deleteFilePromises);
122
140
  }
123
141
  catch (err) {
124
- config_1.logger.error('Failed to delete snapshot', { err, snapshot, dataDir: config_1.config.dataDir });
142
+ config_1.logger.error('Failed to delete snapshot files', { err, snapshot, dataDir: config_1.config.dataDir });
125
143
  }
126
144
  yield snapshot.destroy({ transaction: txn });
127
145
  return snapshot.jobId;
@@ -15,7 +15,7 @@ import fs from 'fs-extra';
15
15
  import path from 'path';
16
16
  import { config, logger } from './config';
17
17
  import { jobDurationSeconds, jobTotalLatencySeconds, jobsEnqueuedTotal, jobsTotal } from './metrics';
18
- import { initPage } from './puppeteer';
18
+ import { closeBrowser, initPage, isBrowserConnectionError } from './puppeteer';
19
19
  import { createCarbonImage } from './services/carbon';
20
20
  import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
21
21
  import { Job, Snapshot, sequelize } from './store';
@@ -42,6 +42,7 @@ export function createCrawlQueue(queue, handler) {
42
42
  options: {
43
43
  concurrency: config.concurrency,
44
44
  enableScheduledJob: true,
45
+ maxRetries: 3,
45
46
  },
46
47
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
47
48
  const startTime = Date.now();
@@ -193,6 +194,22 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
193
194
  let html = null;
194
195
  let screenshot = null;
195
196
  const meta = {};
197
+ const closePageSafely = () => __awaiter(void 0, void 0, void 0, function* () {
198
+ try {
199
+ yield page.close();
200
+ }
201
+ catch (error) {
202
+ if (isBrowserConnectionError(error)) {
203
+ try {
204
+ yield closeBrowser({ trimCache: false });
205
+ }
206
+ catch (closeError) {
207
+ logger.warn('Failed to close browser after page close error', { error: closeError });
208
+ }
209
+ }
210
+ logger.warn('Failed to close page:', { error });
211
+ }
212
+ });
196
213
  try {
197
214
  const response = yield page.goto(url, { timeout });
198
215
  if (!response) {
@@ -279,14 +296,23 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
279
296
  logger.error('Failed to get html:', err);
280
297
  throw err;
281
298
  }
299
+ yield closePageSafely();
282
300
  }
283
301
  catch (error) {
302
+ if (isBrowserConnectionError(error)) {
303
+ try {
304
+ yield closeBrowser({ trimCache: false });
305
+ }
306
+ catch (closeError) {
307
+ logger.warn('Failed to close browser after page error', { error: closeError });
308
+ }
309
+ }
310
+ else {
311
+ yield closePageSafely();
312
+ }
284
313
  logger.error('Failed to get page content:', error);
285
314
  throw error;
286
315
  }
287
- finally {
288
- yield page.close();
289
- }
290
316
  return {
291
317
  html,
292
318
  screenshot,
@@ -56,6 +56,8 @@ export function collectMetrics() {
56
56
  try {
57
57
  // 收集队列大小
58
58
  const jobStats = yield Job.stats();
59
+ // Reset first to clear queues that no longer have jobs
60
+ queueSize.reset();
59
61
  jobStats.queues.forEach((q) => {
60
62
  queueSize.set({ queue: q.queue }, q.count);
61
63
  });
@@ -1,16 +1,16 @@
1
- import puppeteer, { Browser, Page, ResourceType } from '@blocklet/puppeteer';
1
+ import puppeteer, { Browser, ResourceType } from '@blocklet/puppeteer';
2
2
  export { puppeteer };
3
3
  export declare function ensurePuppeteerrc(): Promise<{
4
4
  cacheDirectory: string;
5
5
  temporaryDirectory: string;
6
6
  }>;
7
7
  export declare function ensureBrowser(): Promise<void>;
8
- export declare function connectBrowser(): Promise<Browser | null>;
9
8
  export declare function launchBrowser(): Promise<Browser>;
9
+ export declare function isBrowserConnectionError(error: unknown): boolean;
10
10
  export declare const getBrowser: () => Promise<Browser>;
11
11
  export declare const closeBrowser: ({ trimCache }?: {
12
12
  trimCache?: boolean;
13
- }) => Promise<void>;
13
+ }) => Promise<void> | undefined;
14
14
  export declare function initPage({ abortResourceTypes }?: {
15
15
  abortResourceTypes?: ResourceType[];
16
- }): Promise<Page>;
16
+ }): Promise<any>;
@@ -10,20 +10,18 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
10
10
  import puppeteer from '@blocklet/puppeteer';
11
11
  import fs from 'fs-extra';
12
12
  import path from 'path';
13
- import { clearInterval, setInterval } from 'timers';
14
13
  import { config, logger } from './config';
15
- import { Job } from './store';
16
14
  import { CRAWLER_FLAG, sleep } from './utils';
17
- const BrowserStatus = {
18
- None: 'None',
19
- Launching: 'Launching',
20
- Ready: 'Ready',
21
- };
22
- let browserStatus = BrowserStatus.None;
23
- /** Chromium WebSocket endpoint that allows puppeteer browser instance to connect to the browser */
24
- let browserEndpoint = '';
25
15
  let browser;
26
- let browserActivatedTimer;
16
+ let browserInitInFlight;
17
+ let closingBrowser;
18
+ const BROWSER_CONNECTION_ERROR_PATTERNS = [
19
+ /protocol error/i,
20
+ /target closed/i,
21
+ /browser disconnected/i,
22
+ /session closed/i,
23
+ /target crashed/i,
24
+ ];
27
25
  export { puppeteer };
28
26
  export function ensurePuppeteerrc() {
29
27
  return __awaiter(this, void 0, void 0, function* () {
@@ -67,7 +65,7 @@ export function ensureBrowser() {
67
65
  }
68
66
  // try to launch browser
69
67
  if (config.isProd) {
70
- const browser = yield launchBrowser();
68
+ const browser = yield getBrowser();
71
69
  if (!browser) {
72
70
  throw new Error('Failed to launch browser');
73
71
  }
@@ -76,34 +74,8 @@ export function ensureBrowser() {
76
74
  logger.info('Puppeteer is ready');
77
75
  });
78
76
  }
79
- export function connectBrowser() {
80
- return __awaiter(this, void 0, void 0, function* () {
81
- if (!browserEndpoint) {
82
- return null;
83
- }
84
- // retry if browser is launching
85
- if (browserStatus === BrowserStatus.Launching) {
86
- yield sleep(Math.floor(Math.random() * 1000));
87
- return connectBrowser();
88
- }
89
- try {
90
- browser = yield puppeteer.connect({
91
- browserWSEndpoint: browserEndpoint,
92
- });
93
- logger.info('Connect browser success');
94
- }
95
- catch (err) {
96
- logger.warn('Connect browser failed, clear endpoint', err);
97
- browserEndpoint = '';
98
- return null;
99
- }
100
- return browser;
101
- });
102
- }
103
77
  export function launchBrowser() {
104
78
  return __awaiter(this, void 0, void 0, function* () {
105
- browserEndpoint = '';
106
- browserStatus = BrowserStatus.Launching;
107
79
  try {
108
80
  browser = yield puppeteer.launch({
109
81
  headless: true,
@@ -135,123 +107,137 @@ export function launchBrowser() {
135
107
  '--disable-gpu-sandbox',
136
108
  ],
137
109
  });
110
+ attachBrowserListeners(browser);
138
111
  logger.info('Launch browser');
139
112
  }
140
113
  catch (error) {
141
114
  logger.error('launch browser failed: ', error);
142
- browserStatus = BrowserStatus.None;
143
- browserEndpoint = '';
144
115
  throw error;
145
116
  }
146
- // save browserWSEndpoint to cache
147
- browserEndpoint = yield browser.wsEndpoint();
148
- browserStatus = BrowserStatus.Ready;
149
117
  return browser;
150
118
  });
151
119
  }
152
- function checkBrowserActivated() {
153
- clearBrowserActivatedTimer();
154
- let count = 0;
155
- browserActivatedTimer = setInterval(() => __awaiter(this, void 0, void 0, function* () {
156
- var _a;
157
- if (browser) {
158
- const pages = yield browser.pages().catch(() => []);
159
- const jobCount = yield Job.count().catch(() => 0);
160
- // Check if browser is inactive: only blank page AND no pending jobs
161
- const isInactive = pages.length === 1 && ((_a = pages[0]) === null || _a === void 0 ? void 0 : _a.url()) === 'about:blank' && jobCount === 0;
162
- if (isInactive) {
163
- count++;
164
- logger.debug(`Browser inactive count: ${count}/3`);
165
- }
166
- else {
167
- count = 0;
168
- if (jobCount > 0) {
169
- logger.debug(`Browser has ${jobCount} pending jobs, keeping active`);
170
- }
171
- }
172
- if (count >= 3) {
173
- logger.info('Browser inactive for 3 minutes, closing...');
174
- yield closeBrowser({
175
- trimCache: true,
176
- });
177
- }
120
+ function resetBrowserState(reason) {
121
+ if (reason) {
122
+ logger.warn('Reset browser state', { reason });
123
+ }
124
+ browser = null;
125
+ browserInitInFlight = null;
126
+ }
127
+ export function isBrowserConnectionError(error) {
128
+ const message = error instanceof Error ? error.message : String(error || '');
129
+ return BROWSER_CONNECTION_ERROR_PATTERNS.some((pattern) => pattern.test(message));
130
+ }
131
+ function attachBrowserListeners(target) {
132
+ target.on('disconnected', () => {
133
+ if (browser !== target) {
134
+ return;
178
135
  }
179
- }), 1000 * 60);
136
+ logger.warn('Browser disconnected');
137
+ resetBrowserState('disconnected');
138
+ });
180
139
  }
181
- function clearBrowserActivatedTimer() {
182
- if (browserActivatedTimer) {
183
- clearInterval(browserActivatedTimer);
184
- browserActivatedTimer = null;
185
- }
140
+ function initBrowser() {
141
+ return __awaiter(this, void 0, void 0, function* () {
142
+ // sleep random time (0 ~ 5s),to avoid concurrent blocklet
143
+ yield sleep(Math.floor(Math.random() * 1000 * 5));
144
+ const launchedBrowser = yield launchBrowser();
145
+ if (launchedBrowser) {
146
+ logger.debug('getBrowser.launchedBrowser');
147
+ browser = launchedBrowser;
148
+ return browser;
149
+ }
150
+ throw new Error('No browser to use, should install redis or browser');
151
+ });
186
152
  }
187
153
  export const getBrowser = () => __awaiter(void 0, void 0, void 0, function* () {
188
- if (browser)
189
- return browser;
190
- // sleep random time (0 ~ 5s),to avoid concurrent blocklet
191
- yield sleep(Math.floor(Math.random() * 1000 * 5));
192
- // try to connect browser
193
- const connectedBrowser = yield connectBrowser();
194
- if (connectedBrowser) {
195
- logger.debug('getBrowser.connectedBrowser');
196
- browser = connectedBrowser;
197
- checkBrowserActivated();
198
- return browser;
154
+ // Wait for any ongoing browser close operation to complete
155
+ if (closingBrowser) {
156
+ yield closingBrowser;
199
157
  }
200
- // try to launch browser
201
- const launchedBrowser = yield launchBrowser();
202
- if (launchedBrowser) {
203
- logger.debug('getBrowser.launchedBrowser');
204
- browser = launchedBrowser;
205
- checkBrowserActivated();
206
- return browser;
158
+ if (browser) {
159
+ if (browser.isConnected()) {
160
+ return browser;
161
+ }
162
+ logger.warn('Browser instance is disconnected, resetting');
163
+ resetBrowserState('disconnected');
164
+ }
165
+ if (browserInitInFlight) {
166
+ return browserInitInFlight;
207
167
  }
208
- throw new Error('No browser to use, should install redis or browser');
168
+ const initPromise = initBrowser();
169
+ browserInitInFlight = initPromise;
170
+ return initPromise.finally(() => {
171
+ if (browserInitInFlight === initPromise) {
172
+ browserInitInFlight = null;
173
+ }
174
+ });
209
175
  });
210
- export const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, function* ({ trimCache = true } = {}) {
176
+ export const closeBrowser = ({ trimCache = true } = {}) => {
177
+ // Return existing close operation if already in progress
178
+ if (closingBrowser) {
179
+ return closingBrowser;
180
+ }
211
181
  if (!browser)
212
182
  return;
213
- // close all pages
214
- try {
215
- const pages = yield browser.pages();
216
- yield Promise.all(pages.map((page) => page.close()));
217
- }
218
- catch (err) {
219
- logger.warn('Failed to close all pages:', err);
220
- }
221
- // close browser
222
- try {
223
- yield browser.close();
224
- }
225
- catch (err) {
226
- logger.warn('Failed to close browser:', err);
227
- }
228
- // clear cache
229
- try {
230
- if (trimCache) {
231
- yield puppeteer.trimCache();
232
- logger.debug('Trim cache success');
183
+ const target = browser;
184
+ browser = null;
185
+ browserInitInFlight = null;
186
+ const doClose = () => __awaiter(void 0, void 0, void 0, function* () {
187
+ // close all pages
188
+ try {
189
+ const pages = yield target.pages();
190
+ yield Promise.all(pages.map((page) => page.close().catch(() => { })));
191
+ }
192
+ catch (err) {
193
+ logger.warn('Failed to close all pages:', err);
233
194
  }
234
- // try to clear temporary directory
235
- // if (puppeteerConfig) {
236
- // await fs.emptyDir(puppeteerConfig.temporaryDirectory);
237
- // }
238
- if (global.gc) {
239
- global.gc();
195
+ // close browser
196
+ try {
197
+ yield target.close();
240
198
  }
241
- }
242
- catch (err) {
243
- logger.warn('Failed to clear browser cache:', err);
244
- }
245
- browser = null;
246
- clearBrowserActivatedTimer();
247
- browserEndpoint = '';
248
- browserStatus = BrowserStatus.None;
249
- logger.info('Close browser success');
250
- });
199
+ catch (err) {
200
+ logger.warn('Failed to close browser:', err);
201
+ }
202
+ // clear cache
203
+ try {
204
+ if (trimCache) {
205
+ yield puppeteer.trimCache();
206
+ logger.debug('Trim cache success');
207
+ }
208
+ if (global.gc) {
209
+ global.gc();
210
+ }
211
+ }
212
+ catch (err) {
213
+ logger.warn('Failed to clear browser cache:', err);
214
+ }
215
+ logger.info('Close browser success');
216
+ });
217
+ closingBrowser = doClose().finally(() => {
218
+ closingBrowser = null;
219
+ });
220
+ return closingBrowser;
221
+ };
251
222
  export function initPage() {
252
223
  return __awaiter(this, arguments, void 0, function* ({ abortResourceTypes = [] } = {}) {
253
- const browser = yield getBrowser();
254
- const page = yield browser.newPage();
224
+ const currentBrowser = yield getBrowser();
225
+ let page;
226
+ try {
227
+ page = yield currentBrowser.newPage();
228
+ }
229
+ catch (error) {
230
+ // If newPage fails due to connection error, close browser and retry once
231
+ if (isBrowserConnectionError(error)) {
232
+ logger.warn('Failed to create new page due to connection error, restarting browser');
233
+ yield closeBrowser({ trimCache: false });
234
+ const newBrowser = yield getBrowser();
235
+ page = yield newBrowser.newPage();
236
+ }
237
+ else {
238
+ throw error;
239
+ }
240
+ }
255
241
  yield page.setViewport({ width: 1440, height: 900 });
256
242
  // page setting
257
243
  // add custom headers
@@ -4,7 +4,7 @@ export declare function convertJobToSnapshot({ job, snapshot }: {
4
4
  job: JobState;
5
5
  snapshot?: Partial<SnapshotModel>;
6
6
  }): SnapshotModel;
7
- export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
7
+ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel | null>;
8
8
  /**
9
9
  * get snapshot from db or crawl queue
10
10
  */
@@ -44,6 +44,12 @@ export function formatSnapshot(snapshot, columns) {
44
44
  dataDir: config.dataDir,
45
45
  snapshot,
46
46
  });
47
+ // If the file is missing, delete the invalid snapshot record
48
+ if ((err === null || err === void 0 ? void 0 : err.code) === 'ENOENT') {
49
+ logger.warn('HTML file missing, deleting invalid snapshot record', { jobId: snapshot.jobId });
50
+ yield Snapshot.destroy({ where: { jobId: snapshot.jobId } });
51
+ return null;
52
+ }
47
53
  data.html = '';
48
54
  }
49
55
  }
@@ -104,14 +110,26 @@ export function deleteSnapshots(where_1) {
104
110
  });
105
111
  const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
106
112
  try {
113
+ // Check reference count before deleting files
114
+ // Only delete file if no other snapshots reference it
115
+ const deleteFilePromises = [];
116
+ if (snapshot.html) {
117
+ const htmlRefCount = yield Snapshot.count({ where: { html: snapshot.html } });
118
+ if (htmlRefCount <= 1) {
119
+ deleteFilePromises.push(fs.unlink(path.join(config.dataDir, snapshot.html)).catch(() => { }));
120
+ }
121
+ }
122
+ if (snapshot.screenshot) {
123
+ const screenshotRefCount = yield Snapshot.count({ where: { screenshot: snapshot.screenshot } });
124
+ if (screenshotRefCount <= 1) {
125
+ deleteFilePromises.push(fs.unlink(path.join(config.dataDir, snapshot.screenshot)).catch(() => { }));
126
+ }
127
+ }
107
128
  try {
108
- yield Promise.all([
109
- snapshot.html && fs.unlink(path.join(config.dataDir, snapshot.html)),
110
- snapshot.screenshot && fs.unlink(path.join(config.dataDir, snapshot.screenshot)),
111
- ]);
129
+ yield Promise.all(deleteFilePromises);
112
130
  }
113
131
  catch (err) {
114
- logger.error('Failed to delete snapshot', { err, snapshot, dataDir: config.dataDir });
132
+ logger.error('Failed to delete snapshot files', { err, snapshot, dataDir: config.dataDir });
115
133
  }
116
134
  yield snapshot.destroy({ transaction: txn });
117
135
  return snapshot.jobId;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arcblock/crawler",
3
- "version": "1.5.1",
3
+ "version": "1.5.3",
4
4
  "main": "lib/cjs/index.js",
5
5
  "module": "lib/esm/index.js",
6
6
  "types": "lib/cjs/index.d.ts",
@@ -45,12 +45,12 @@
45
45
  ]
46
46
  },
47
47
  "dependencies": {
48
- "@abtnode/cron": "^1.17.5",
49
- "@abtnode/models": "^1.17.5",
50
- "@abtnode/queue": "^1.17.5",
51
- "@blocklet/logger": "^1.17.5",
48
+ "@abtnode/cron": "^1.17.7",
49
+ "@abtnode/models": "^1.17.7",
50
+ "@abtnode/queue": "^1.17.7",
51
+ "@blocklet/logger": "^1.17.7",
52
52
  "@blocklet/puppeteer": "^22.11.3",
53
- "@blocklet/sdk": "^1.17.5",
53
+ "@blocklet/sdk": "^1.17.7",
54
54
  "@sequelize/core": "7.0.0-alpha.46",
55
55
  "@sequelize/sqlite3": "7.0.0-alpha.46",
56
56
  "axios": "^1.7.9",