@arcblock/crawler 1.4.6 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/crawler.d.ts +8 -6
- package/lib/cjs/crawler.js +91 -50
- package/lib/cjs/index.d.ts +2 -0
- package/lib/cjs/index.js +2 -0
- package/lib/cjs/metrics.d.ts +18 -0
- package/lib/cjs/metrics.js +88 -0
- package/lib/cjs/puppeteer.js +17 -6
- package/lib/cjs/site.js +17 -7
- package/lib/cjs/store/job.d.ts +30 -1
- package/lib/cjs/store/job.js +56 -0
- package/lib/cjs/store/migrate.js +6 -0
- package/lib/cjs/store/migrations/20251226-job-processing.d.ts +6 -0
- package/lib/cjs/store/migrations/20251226-job-processing.js +37 -0
- package/lib/esm/crawler.d.ts +8 -6
- package/lib/esm/crawler.js +90 -50
- package/lib/esm/index.d.ts +2 -0
- package/lib/esm/index.js +2 -0
- package/lib/esm/metrics.d.ts +18 -0
- package/lib/esm/metrics.js +82 -0
- package/lib/esm/puppeteer.js +17 -6
- package/lib/esm/site.js +19 -9
- package/lib/esm/store/job.d.ts +30 -1
- package/lib/esm/store/job.js +56 -0
- package/lib/esm/store/migrate.js +6 -0
- package/lib/esm/store/migrations/20251226-job-processing.d.ts +6 -0
- package/lib/esm/store/migrations/20251226-job-processing.js +33 -0
- package/package.json +7 -6

package/lib/cjs/store/migrations/20251226-job-processing.js
ADDED
@@ -0,0 +1,37 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.up = up;
+exports.down = down;
+/* eslint-disable no-console */
+const core_1 = require("@sequelize/core");
+function up(_a) {
+    return __awaiter(this, arguments, void 0, function* ({ context }) {
+        console.log('[20251226-job-processing:up] Migrating...');
+        yield context.addColumn('jobs', 'processingBy', {
+            type: core_1.DataTypes.STRING(32),
+            allowNull: true,
+        });
+        yield context.addColumn('jobs', 'processingAt', {
+            type: core_1.DataTypes.INTEGER,
+            allowNull: true,
+        });
+        console.log('[20251226-job-processing:up] Migrated successfully!');
+    });
+}
+function down(_a) {
+    return __awaiter(this, arguments, void 0, function* ({ context }) {
+        console.log('[20251226-job-processing:down] Migrating...');
+        yield context.removeColumn('jobs', 'processingBy');
+        yield context.removeColumn('jobs', 'processingAt');
+        console.log('[20251226-job-processing:down] Migrated successfully!');
+    });
+}
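
The compiled `__awaiter` boilerplate obscures a small migration. It corresponds roughly to the TypeScript source below (a reconstruction for readability, not the published source file; the `QueryInterface` typing is an assumption based on the `@sequelize/core` import above):

import { DataTypes, type QueryInterface } from '@sequelize/core';

export async function up({ context }: { context: QueryInterface }) {
    // Track which worker claimed a job, and when it started processing.
    await context.addColumn('jobs', 'processingBy', { type: DataTypes.STRING(32), allowNull: true });
    await context.addColumn('jobs', 'processingAt', { type: DataTypes.INTEGER, allowNull: true });
}

export async function down({ context }: { context: QueryInterface }) {
    await context.removeColumn('jobs', 'processingBy');
    await context.removeColumn('jobs', 'processingAt');
}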

package/lib/esm/crawler.d.ts
CHANGED
@@ -1,10 +1,11 @@
 import { Page } from '@blocklet/puppeteer';
 import { JobState, SnapshotModel } from './store';
-declare …
-…
-…
-…
-…
+export declare const queueMap: {
+    urlCrawler: any;
+    syncCrawler: any;
+    codeCrawler: any;
+    cronJobs: any;
+};
 export declare function initQueue(): void;
 type PageHandler = {
     handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
@@ -28,6 +29,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid…
  * @param params
  * @param callback callback when job finished
  */
-export declare function enqueue(…
+export declare function enqueue(queueName: keyof typeof queueMap, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export {};
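
With this release, `enqueue` takes the target queue name as its first argument, validated against `queueMap` at runtime. A minimal usage sketch (the URL and job options are illustrative, not from the package docs):

import { initQueue, enqueue } from '@arcblock/crawler';

initQueue(); // must run first so the queueMap entries are created
// Valid queue names: 'urlCrawler' | 'syncCrawler' | 'codeCrawler' | 'cronJobs'; unknown names now throw.
const jobId = await enqueue(
    'urlCrawler',
    { url: 'https://example.com', includeHtml: true, includeScreenshot: false }, // illustrative params
    (snapshot) => console.log('job finished:', snapshot),
);
console.log('enqueued job', jobId);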

package/lib/esm/crawler.js
CHANGED
@@ -14,50 +14,57 @@ import { randomUUID } from 'crypto';
 import fs from 'fs-extra';
 import path from 'path';
 import { config, logger } from './config';
+import { jobDurationSeconds, jobTotalLatencySeconds, jobsEnqueuedTotal, jobsTotal } from './metrics';
 import { initPage } from './puppeteer';
 import { createCarbonImage } from './services/carbon';
 import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
 import { Job, Snapshot, sequelize } from './store';
 import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
 const { BaseState } = require('@abtnode/models');
-…
-…
-…
-…
-…
+export const queueMap = {
+    urlCrawler: null,
+    syncCrawler: null,
+    codeCrawler: null,
+    cronJobs: null,
+};
 export function initQueue() {
-…
-…
-…
+    queueMap.urlCrawler = createCrawlQueue('urlCrawler');
+    queueMap.syncCrawler = createCrawlQueue('syncCrawler');
+    queueMap.codeCrawler = createCrawlQueue('codeCrawler', {
         handleScreenshot: createCarbonImage,
     });
-…
+    queueMap.cronJobs = createCrawlQueue('cronJobs');
 }
 export function createCrawlQueue(queue, handler) {
     const db = new BaseState(Job);
     return createQueue({
         store: new SequelizeStore(db, queue),
-…
+        options: {
+            concurrency: config.concurrency,
+            enableScheduledJob: true,
+        },
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
-…
-…
-            if (!job.ignoreRobots) {
-                const canCrawl = yield isAcceptCrawler(job.url);
-                if (!canCrawl) {
-                    logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
-                    const snapshot = convertJobToSnapshot({
-                        job,
-                        snapshot: {
-                            status: 'failed',
-                            error: 'Denied by robots.txt',
-                        },
-                    });
-                    yield Snapshot.upsert(snapshot);
-                    return snapshot;
-                }
-            }
+            const startTime = Date.now();
+            let status = 'failed';
             const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
             try {
+                logger.info(`Starting to execute ${queue} job`, Object.assign(Object.assign({}, job), { queueSize: yield Job.count() }));
+                // check robots.txt
+                if (!job.ignoreRobots) {
+                    const canCrawl = yield isAcceptCrawler(job.url);
+                    if (!canCrawl) {
+                        logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                        const snapshot = convertJobToSnapshot({
+                            job,
+                            snapshot: {
+                                status: 'failed',
+                                error: 'Denied by robots.txt',
+                            },
+                        });
+                        yield Snapshot.upsert(snapshot);
+                        return snapshot;
+                    }
+                }
                 // get page content later
                 const result = yield getPageContent(formattedJob, handler);
                 if (!result || (!result.html && !result.screenshot)) {
@@ -75,18 +82,13 @@ export function createCrawlQueue(queue, handler) {
                 const snapshot = yield sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
                     // delete old snapshot
                     if (formattedJob.replace) {
-…
-…
-…
-…
-                        }, { txn });
-                        if (deletedJobIds) {
-                            logger.info('Deleted old snapshot', { deletedJobIds });
-                        }
-                        }
-                        catch (error) {
+                        const deletedJobIds = yield deleteSnapshots({
+                            url: formattedJob.url,
+                            replace: true,
+                        }, { txn }).catch((error) => {
                             logger.error('Failed to delete old snapshot', { error, formattedJob });
-                        }
+                        });
+                        logger.info('Deleted old snapshot', { deletedJobIds });
                     }
                     // save html and screenshot to data dir
                     const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
@@ -106,10 +108,12 @@ export function createCrawlQueue(queue, handler) {
                     yield Snapshot.upsert(snapshot, { transaction: txn });
                     return snapshot;
                 }));
+                status = 'success';
                 return snapshot;
             }
             catch (error) {
                 logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
+                status = 'failed';
                 const snapshot = convertJobToSnapshot({
                     job: formattedJob,
                     snapshot: {
@@ -120,6 +124,14 @@ export function createCrawlQueue(queue, handler) {
                 yield Snapshot.upsert(snapshot);
                 return snapshot;
             }
+            finally {
+                const now = Date.now();
+                jobsTotal.inc({ queue, status });
+                jobDurationSeconds.observe({ queue, status }, (now - startTime) / 1000);
+                if (job.enqueuedAt) {
+                    jobTotalLatencySeconds.observe({ queue, status }, (now - job.enqueuedAt) / 1000);
+                }
+            }
         }),
     });
 }
@@ -155,7 +167,7 @@ function saveSnapshotToLocal(_a) {
        };
    });
 }
-export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = …
+export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 60 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
     const page = yield initPage();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -287,31 +299,59 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_…
  * @param callback callback when job finished
  */
 // eslint-disable-next-line require-await
-export function enqueue(…
+export function enqueue(queueName, params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
+        const queue = queueMap[queueName];
+        if (!queue) {
+            throw new Error(`Queue ${queueName} not found`);
+        }
         // skip duplicate job
         const existsJob = yield Job.isExists(params);
         if (existsJob && !params.sync) {
            logger.info(`Crawl job already exists for ${params.url}, skip`);
            return existsJob.id;
         }
-        logger.info('enqueue crawl job', params);
         const jobId = randomUUID();
-        const …
+        const enqueuedAt = Date.now();
+        const job = queue.push({ job: Object.assign(Object.assign({}, params), { id: jobId, enqueuedAt }), jobId });
+        jobsEnqueuedTotal.inc({ queue: queueName });
+        // Get current queue size for logging
+        const queueSize = yield Job.count();
+        logger.info('enqueue crawl job', Object.assign(Object.assign({}, params), { jobId, queueSize }));
         job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
-…
-…
+            try {
+                const isSuccess = (result === null || result === void 0 ? void 0 : result.status) === 'success';
+                const queueSize = yield Job.count();
+                logger.info(`Crawl completed ${params.url}, status: ${isSuccess ? 'success' : 'failed'}`, {
+                    job: params,
+                    result,
+                    queueSize,
+                });
+                callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
+            }
+            catch (error) {
+                logger.error(`Error in finished event handler for ${params.url}`, { error });
+                callback === null || callback === void 0 ? void 0 : callback(null);
+            }
+        }));
+        job.on('failed', (_a) => __awaiter(this, [_a], void 0, function* ({ error }) {
+            try {
+                const queueSize = yield Job.count();
+                logger.error(`Failed to execute job for ${params.url}`, { error, job: params, queueSize });
+            }
+            catch (err) {
+                logger.error(`Error in failed event handler for ${params.url}`, { error: err });
+            }
+            finally {
+                callback === null || callback === void 0 ? void 0 : callback(null);
+            }
         }));
-        job.on('failed', ({ error }) => {
-            logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
-            callback === null || callback === void 0 ? void 0 : callback(null);
-        });
         return jobId;
     });
 }
 export function crawlUrl(params, callback) {
-    return enqueue(params.sync ? …
+    return enqueue(params.sync ? 'syncCrawler' : 'urlCrawler', params, callback);
 }
 export function crawlCode(params, callback) {
-    return enqueue(…
+    return enqueue('codeCrawler', Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
 }
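
The new `finally` block gives each job two timing series: `crawler_job_duration_seconds` measures execution only (from `startTime` inside `onJob`), while `crawler_job_total_latency_seconds` also includes queue wait, since `enqueue` stamps `enqueuedAt` onto the job. A self-contained sketch of the same pattern (metric and function names here are placeholders, not package exports):

import { Histogram } from 'prom-client';

const duration = new Histogram({ name: 'demo_job_duration_seconds', help: 'execution time only', labelNames: ['queue', 'status'] });
const latency = new Histogram({ name: 'demo_job_total_latency_seconds', help: 'queue wait plus execution', labelNames: ['queue', 'status'] });

async function runJob(job: { enqueuedAt?: number }, work: () => Promise<void>) {
    const startTime = Date.now(); // the moment a worker picks the job up
    let status = 'failed';
    try {
        await work();
        status = 'success';
    } finally {
        const now = Date.now();
        duration.observe({ queue: 'demo', status }, (now - startTime) / 1000);
        if (job.enqueuedAt) {
            latency.observe({ queue: 'demo', status }, (now - job.enqueuedAt) / 1000);
        }
    }
}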

package/lib/esm/index.d.ts
CHANGED
@@ -1,5 +1,7 @@
 import { Config } from './config';
 export * from './crawler';
 export * from './services/snapshot';
+export * from './store/job';
 export * as utils from './utils';
+export * from './metrics';
 export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;

package/lib/esm/index.js
CHANGED
@@ -16,7 +16,9 @@ import { ensureBrowser } from './puppeteer';
 import { migrate } from './store/migrate';
 export * from './crawler';
 export * from './services/snapshot';
+export * from './store/job';
 export * as utils from './utils';
+export * from './metrics';
 export function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
         var _a;

package/lib/esm/metrics.d.ts
ADDED
@@ -0,0 +1,18 @@
+import { Counter, Gauge, Histogram } from 'prom-client';
+export declare const jobsTotal: Counter<"queue" | "status">;
+export declare const jobsEnqueuedTotal: Counter<"queue">;
+export declare const jobDurationSeconds: Histogram<"queue" | "status">;
+export declare const jobTotalLatencySeconds: Histogram<"queue" | "status">;
+export declare const queueSize: Gauge<"queue">;
+/**
+ * Collect all metrics by querying database
+ */
+export declare function collectMetrics(): Promise<void>;
+/**
+ * Get metrics in Prometheus format
+ */
+export declare function getMetrics(): Promise<string>;
+/**
+ * Get content type for metrics endpoint
+ */
+export declare function getContentType(): "text/plain; version=0.0.4; charset=utf-8";

package/lib/esm/metrics.js
ADDED
@@ -0,0 +1,82 @@
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+import { Counter, Gauge, Histogram, Registry } from 'prom-client';
+import { Job } from './store';
+// Create a new registry
+const register = new Registry();
+// ========== Counter - crawl job count ==========
+export const jobsTotal = new Counter({
+    name: 'crawler_jobs_total',
+    help: 'Total number of crawl jobs processed',
+    labelNames: ['queue', 'status'],
+    registers: [register],
+});
+// ========== Counter - enqueued job count ==========
+export const jobsEnqueuedTotal = new Counter({
+    name: 'crawler_jobs_enqueued_total',
+    help: 'Total number of crawl jobs enqueued',
+    labelNames: ['queue'],
+    registers: [register],
+});
+// ========== Histogram - job execution duration ==========
+export const jobDurationSeconds = new Histogram({
+    name: 'crawler_job_duration_seconds',
+    help: 'Duration of crawl job execution in seconds',
+    labelNames: ['queue', 'status'],
+    buckets: [10, 30, 60, 120, 300, 600, 900, 1800, 3600],
+    registers: [register],
+});
+// ========== Histogram - total time from enqueue to completion ==========
+export const jobTotalLatencySeconds = new Histogram({
+    name: 'crawler_job_total_latency_seconds',
+    help: 'Total latency from enqueue to completion in seconds',
+    labelNames: ['queue', 'status'],
+    buckets: [10, 30, 60, 120, 300, 600, 900, 1800, 3600],
+    registers: [register],
+});
+// ========== Gauge - queue size ==========
+export const queueSize = new Gauge({
+    name: 'crawler_queue_size',
+    help: 'Current number of jobs in queue',
+    labelNames: ['queue'],
+    registers: [register],
+});
+/**
+ * Collect all metrics by querying database
+ */
+export function collectMetrics() {
+    return __awaiter(this, void 0, void 0, function* () {
+        try {
+            // collect queue sizes
+            const jobStats = yield Job.stats();
+            jobStats.queues.forEach((q) => {
+                queueSize.set({ queue: q.queue }, q.count);
+            });
+        }
+        catch (error) {
+            console.error('Error collecting metrics:', error);
+        }
+    });
+}
+/**
+ * Get metrics in Prometheus format
+ */
+export function getMetrics() {
+    return __awaiter(this, void 0, void 0, function* () {
+        yield collectMetrics();
+        return register.metrics();
+    });
+}
+/**
+ * Get content type for metrics endpoint
+ */
+export function getContentType() {
+    return register.contentType;
+}
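
Everything is registered on the module's own `Registry` rather than prom-client's default one, so a scrape endpoint must go through the exported helpers. A minimal sketch assuming an Express app (Express is not a dependency of this package, and the port is illustrative):

import express from 'express';
import { getMetrics, getContentType } from '@arcblock/crawler';

const app = express();

// getMetrics() calls collectMetrics() first, so the queueSize gauge is refreshed on every scrape.
app.get('/metrics', async (_req, res) => {
    res.set('Content-Type', getContentType());
    res.send(await getMetrics());
});

app.listen(3000);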

package/lib/esm/puppeteer.js
CHANGED
@@ -12,6 +12,7 @@ import fs from 'fs-extra';
 import path from 'path';
 import { clearInterval, setInterval } from 'timers';
 import { config, logger } from './config';
+import { Job } from './store';
 import { CRAWLER_FLAG, sleep } from './utils';
 const BrowserStatus = {
     None: 'None',
@@ -113,21 +114,25 @@ export function launchBrowser() {
             '--no-sandbox',
             '--no-zygote',
             '--disable-setuid-sandbox',
-            '--disable-gpu',
             '--disable-dev-shm-usage',
             '--disable-site-isolation-trials',
-            '--disable-accelerated-2d-canvas',
             '--disable-extensions',
-            '--js-flags=--max_old_space_size=…
+            '--js-flags=--max_old_space_size=768', // limit V8 memory
             '--disable-background-networking',
             '--disable-default-apps',
             // '--disable-web-security', // allow cross-origin requests
-            '--disable-software-rasterizer',
             '--disable-crash-reporter',
             '--disable-service-workers',
             '--disable-notifications',
             '--disable-infobars',
             '--font-render-hinting=none',
+            // WebGL: use software GL fallback for servers without GPU
+            '--enable-webgl',
+            '--ignore-gpu-blocklist',
+            '--use-gl=swiftshader',
+            '--use-angle=swiftshader',
+            '--enable-unsafe-swiftshader',
+            '--disable-gpu-sandbox',
         ],
     });
     logger.info('Launch browser');
@@ -151,12 +156,18 @@ function checkBrowserActivated() {
     var _a;
     if (browser) {
         const pages = yield browser.pages().catch(() => []);
-…
+        const jobCount = yield Job.count().catch(() => 0);
+        // Check if browser is inactive: only blank page AND no pending jobs
+        const isInactive = pages.length === 1 && ((_a = pages[0]) === null || _a === void 0 ? void 0 : _a.url()) === 'about:blank' && jobCount === 0;
+        if (isInactive) {
            count++;
            logger.debug(`Browser inactive count: ${count}/3`);
        }
        else {
-            count = 0;
+            count = 0;
+            if (jobCount > 0) {
+                logger.debug(`Browser has ${jobCount} pending jobs, keeping active`);
+            }
        }
        if (count >= 3) {
            logger.info('Browser inactive for 3 minutes, closing...');
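
The launch flags drop `--disable-gpu` and `--disable-software-rasterizer` in favor of a SwiftShader software fallback, so pages that need WebGL can still render on GPU-less servers. A quick verification sketch (assumes plain Puppeteer; the package itself wraps `@blocklet/puppeteer`, and this helper is illustrative, not part of the package):

import puppeteer from 'puppeteer';

// Returns true when a page can create a WebGL context under software rendering.
async function hasWebGL(): Promise<boolean> {
    const browser = await puppeteer.launch({ args: ['--use-gl=swiftshader', '--enable-unsafe-swiftshader'] });
    try {
        const page = await browser.newPage();
        return await page.evaluate(() => {
            const canvas = document.createElement('canvas');
            return Boolean(canvas.getContext('webgl') || canvas.getContext('experimental-webgl'));
        });
    } finally {
        await browser.close();
    }
}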

package/lib/esm/site.js
CHANGED
@@ -10,8 +10,9 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge…
 import uniq from 'lodash/uniq';
 import { randomUUID } from 'node:crypto';
 import { config, logger } from './config';
-import {…
-import {…
+import { queueMap } from './crawler';
+import { jobsEnqueuedTotal } from './metrics';
+import { Job, Snapshot } from './store';
 import { formatUrl, getSitemapList } from './utils';
 const crawlBlockletRunningMap = new Map();
 function parseSitemapUrl(sitemapItem) {
@@ -63,21 +64,30 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur…
         });
         crawlCount++;
         const jobId = randomUUID();
-…
-…
-…
-…
-…
-…
-…
+        queueMap.cronJobs.push({
+            job: {
+                id: jobId,
+                url,
+                lastModified: sitemapItem.lastmod,
+                includeScreenshot: false,
+                includeHtml: true,
+                replace: true,
+                enqueuedAt: Date.now(),
+            },
+            jobId,
+            delay: 5,
         });
+        jobsEnqueuedTotal.inc({ queue: 'cronJobs' });
         return jobId;
     }), { concurrency: ((_b = config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
+    // Get current queue size for logging
+    const queueSize = yield Job.count();
     logger.info('Enqueued jobs from sitemap finished', {
         url,
         pathname,
         processCount,
         crawlCount,
+        queueSize,
     });
     return jobIds;
 }

package/lib/esm/store/job.d.ts
CHANGED
@@ -17,6 +17,7 @@ export interface JobState {
     replace?: boolean;
     sync?: boolean;
     ignoreRobots?: boolean;
+    enqueuedAt?: number;
     headers?: Record<string, string>;
     cookies?: CookieParam[];
     localStorage?: {
@@ -32,18 +33,46 @@ export interface JobModel {
     willRunAt: number;
     delay: number;
     cancelled: boolean;
+    processingBy: string | null;
+    processingAt: number | null;
 }
 export declare class Job extends Model<JobModel> implements JobModel {
     id: JobModel['id'];
     queue: JobModel['queue'];
     job: JobModel['job'];
     retryCount: JobModel['retryCount'];
-    willRunAt: JobModel['willRunAt'];
     delay: JobModel['delay'];
+    willRunAt: JobModel['willRunAt'];
     cancelled: JobModel['cancelled'];
+    processingBy: JobModel['processingBy'];
+    processingAt: JobModel['processingAt'];
     static initModel(sequelize: Sequelize): typeof Job;
     static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
     static isExists(condition: Partial<JobState> & {
         url: string;
     }): Promise<JobModel | null | undefined>;
+    static paginate({ page, pageSize, queue, }?: {
+        page?: number;
+        pageSize?: number;
+        queue?: string;
+    }): Promise<{
+        total: number;
+        page: number;
+        pageSize: number;
+        totalPages: number;
+        data: JobModel[];
+    }>;
+    static stats(): Promise<{
+        total: number;
+        queues: {
+            queue: string;
+            count: number;
+        }[];
+    }>;
+    static deleteByQueue(queue: string): Promise<{
+        deleted: number;
+    }>;
+    static deleteByIds(ids: string[]): Promise<{
+        deleted: number;
+    }>;
 }
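
The new `Job` statics back the queue-size gauge (`Job.stats` feeds `collectMetrics`) and add basic queue administration. A usage sketch (queue names and values are illustrative):

import { Job } from '@arcblock/crawler';

// Page through pending jobs for one queue; defaults are page 1, pageSize 20.
const { data, total, totalPages } = await Job.paginate({ page: 1, pageSize: 20, queue: 'urlCrawler' });
console.log({ total, totalPages });

// Per-queue counts, as consumed by the crawler_queue_size gauge.
const stats = await Job.stats();
for (const { queue, count } of stats.queues) {
    console.log(`${queue}: ${count} pending`);
}

// Cleanup helpers both resolve to { deleted: number }.
await Job.deleteByQueue('cronJobs');
await Job.deleteByIds(data.map((job) => job.id));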

package/lib/esm/store/job.js
CHANGED
@@ -37,6 +37,14 @@ export class Job extends Model {
                 type: DataTypes.BOOLEAN,
                 defaultValue: false,
             },
+            processingBy: {
+                type: DataTypes.STRING(32),
+                allowNull: true,
+            },
+            processingAt: {
+                type: DataTypes.INTEGER,
+                allowNull: true,
+            },
             createdAt: {
                 type: DataTypes.DATE,
                 defaultValue: DataTypes.NOW,
@@ -88,4 +96,52 @@ export class Job extends Model {
             return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
         });
     }
+    static paginate() {
+        return __awaiter(this, arguments, void 0, function* ({ page = 1, pageSize = 20, queue, } = {}) {
+            const where = queue ? { queue } : {};
+            const offset = (page - 1) * pageSize;
+            const { count, rows } = yield Job.findAndCountAll({
+                where,
+                order: [['createdAt', 'DESC']],
+                limit: pageSize,
+                offset,
+            });
+            return {
+                total: count,
+                page,
+                pageSize,
+                totalPages: Math.ceil(count / pageSize),
+                data: rows.map((row) => row.toJSON()),
+            };
+        });
+    }
+    static stats() {
+        return __awaiter(this, void 0, void 0, function* () {
+            const results = (yield Job.findAll({
+                attributes: ['queue', [sequelize.fn('COUNT', sequelize.col('id')), 'count']],
+                group: ['queue'],
+                raw: true,
+            }));
+            const total = results.reduce((sum, item) => sum + Number(item.count), 0);
+            return {
+                total,
+                queues: results.map((item) => ({
+                    queue: item.queue,
+                    count: Number(item.count),
+                })),
+            };
+        });
+    }
+    static deleteByQueue(queue) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const count = yield Job.destroy({ where: { queue } });
+            return { deleted: count };
+        });
+    }
+    static deleteByIds(ids) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const count = yield Job.destroy({ where: { id: ids } });
+            return { deleted: count };
+        });
+    }
 }

package/lib/esm/store/migrate.js
CHANGED
@@ -3,6 +3,7 @@ import { SequelizeStorage, Umzug } from 'umzug';
 import { sequelize } from './index';
 import * as migration20250615 from './migrations/20250615-genesis';
 import * as migration20250616Replace from './migrations/20250616-replace';
+import * as migration20251226JobProcessing from './migrations/20251226-job-processing';
 const umzug = new Umzug({
     migrations: [
         {
@@ -15,6 +16,11 @@ const umzug = new Umzug({
             up: ({ context }) => migration20250616Replace.up({ context }),
             down: ({ context }) => migration20250616Replace.down({ context }),
         },
+        {
+            name: '20251226-job-processing',
+            up: ({ context }) => migration20251226JobProcessing.up({ context }),
+            down: ({ context }) => migration20251226JobProcessing.down({ context }),
+        },
     ],
     context: sequelize.getQueryInterface(),
     storage: new SequelizeStorage({ sequelize }),