@arcblock/crawler 1.4.7 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/crawler.d.ts +8 -6
- package/lib/cjs/crawler.js +91 -50
- package/lib/cjs/index.d.ts +2 -0
- package/lib/cjs/index.js +2 -0
- package/lib/cjs/metrics.d.ts +18 -0
- package/lib/cjs/metrics.js +88 -0
- package/lib/cjs/puppeteer.js +17 -6
- package/lib/cjs/site.js +17 -7
- package/lib/cjs/store/job.d.ts +30 -1
- package/lib/cjs/store/job.js +56 -0
- package/lib/cjs/store/migrate.js +6 -0
- package/lib/cjs/store/migrations/20251226-job-processing.d.ts +6 -0
- package/lib/cjs/store/migrations/20251226-job-processing.js +37 -0
- package/lib/esm/crawler.d.ts +8 -6
- package/lib/esm/crawler.js +90 -50
- package/lib/esm/index.d.ts +2 -0
- package/lib/esm/index.js +2 -0
- package/lib/esm/metrics.d.ts +18 -0
- package/lib/esm/metrics.js +82 -0
- package/lib/esm/puppeteer.js +17 -6
- package/lib/esm/site.js +19 -9
- package/lib/esm/store/job.d.ts +30 -1
- package/lib/esm/store/job.js +56 -0
- package/lib/esm/store/migrate.js +6 -0
- package/lib/esm/store/migrations/20251226-job-processing.d.ts +6 -0
- package/lib/esm/store/migrations/20251226-job-processing.js +33 -0
- package/package.json +2 -1
package/lib/cjs/crawler.d.ts
CHANGED

@@ -1,10 +1,11 @@
 import { Page } from '@blocklet/puppeteer';
 import { JobState, SnapshotModel } from './store';
-declare
-
-
-
-
+export declare const queueMap: {
+    urlCrawler: any;
+    syncCrawler: any;
+    codeCrawler: any;
+    cronJobs: any;
+};
 export declare function initQueue(): void;
 type PageHandler = {
     handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
@@ -28,6 +29,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  * @param params
  * @param callback callback when job finished
  */
-export declare function enqueue(
+export declare function enqueue(queueName: keyof typeof queueMap, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
 export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+export {};
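The `enqueue` declaration now takes an explicit queue name typed against `queueMap`. A minimal consumer-side sketch, assuming an ESM/TypeScript context where the package is imported as `@arcblock/crawler` and `initQueue()` has already been called at startup; the URL and callback body are illustrative, and the snapshot `status` field is taken from the failure path shown later in this diff:

```ts
import { initQueue, enqueue, crawlUrl } from '@arcblock/crawler';

// queues must exist before jobs can be pushed onto them
initQueue();

// enqueue now requires a queue name from queueMap:
// 'urlCrawler' | 'syncCrawler' | 'codeCrawler' | 'cronJobs'
const jobId = await enqueue('urlCrawler', { url: 'https://example.com' }, (snapshot) => {
  console.log('crawl finished', snapshot?.status);
});

// crawlUrl keeps the old convenience shape and picks the queue itself
await crawlUrl({ url: 'https://example.com', sync: false });
```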
package/lib/cjs/crawler.js
CHANGED

@@ -12,7 +12,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.getPageContent = exports.
+exports.getPageContent = exports.queueMap = void 0;
 exports.initQueue = initQueue;
 exports.createCrawlQueue = createCrawlQueue;
 exports.getDataDir = getDataDir;
@@ -26,49 +26,57 @@ const crypto_1 = require("crypto");
 const fs_extra_1 = __importDefault(require("fs-extra"));
 const path_1 = __importDefault(require("path"));
 const config_1 = require("./config");
+const metrics_1 = require("./metrics");
 const puppeteer_1 = require("./puppeteer");
 const carbon_1 = require("./services/carbon");
 const snapshot_1 = require("./services/snapshot");
 const store_1 = require("./store");
 const utils_1 = require("./utils");
 const { BaseState } = require('@abtnode/models');
-
-
-
-
+exports.queueMap = {
+    urlCrawler: null,
+    syncCrawler: null,
+    codeCrawler: null,
+    cronJobs: null,
+};
 function initQueue() {
-    exports.
-    exports.
-    exports.
+    exports.queueMap.urlCrawler = createCrawlQueue('urlCrawler');
+    exports.queueMap.syncCrawler = createCrawlQueue('syncCrawler');
+    exports.queueMap.codeCrawler = createCrawlQueue('codeCrawler', {
        handleScreenshot: carbon_1.createCarbonImage,
     });
-    exports.
+    exports.queueMap.cronJobs = createCrawlQueue('cronJobs');
 }
 function createCrawlQueue(queue, handler) {
     const db = new BaseState(store_1.Job);
     return (0, queue_1.default)({
         store: new sequelize_1.default(db, queue),
-
+        options: {
+            concurrency: config_1.config.concurrency,
+            enableScheduledJob: true,
+        },
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
-
-
-            if (!job.ignoreRobots) {
-                const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
-                if (!canCrawl) {
-                    config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
-                    const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                        job,
-                        snapshot: {
-                            status: 'failed',
-                            error: 'Denied by robots.txt',
-                        },
-                    });
-                    yield store_1.Snapshot.upsert(snapshot);
-                    return snapshot;
-                }
-            }
+            const startTime = Date.now();
+            let status = 'failed';
             const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
             try {
+                config_1.logger.info(`Starting to execute ${queue} job`, Object.assign(Object.assign({}, job), { queueSize: yield store_1.Job.count() }));
+                // check robots.txt
+                if (!job.ignoreRobots) {
+                    const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
+                    if (!canCrawl) {
+                        config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                        const snapshot = (0, snapshot_1.convertJobToSnapshot)({
+                            job,
+                            snapshot: {
+                                status: 'failed',
+                                error: 'Denied by robots.txt',
+                            },
+                        });
+                        yield store_1.Snapshot.upsert(snapshot);
+                        return snapshot;
+                    }
+                }
                 // get page content later
                 const result = yield (0, exports.getPageContent)(formattedJob, handler);
                 if (!result || (!result.html && !result.screenshot)) {
@@ -86,18 +94,13 @@ function createCrawlQueue(queue, handler) {
                 const snapshot = yield store_1.sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
                     // delete old snapshot
                     if (formattedJob.replace) {
-
-
-
-
-                        }, { txn });
-                        if (deletedJobIds) {
-                            config_1.logger.info('Deleted old snapshot', { deletedJobIds });
-                        }
-                    }
-                    catch (error) {
+                        const deletedJobIds = yield (0, snapshot_1.deleteSnapshots)({
+                            url: formattedJob.url,
+                            replace: true,
+                        }, { txn }).catch((error) => {
                             config_1.logger.error('Failed to delete old snapshot', { error, formattedJob });
-                    }
+                        });
+                        config_1.logger.info('Deleted old snapshot', { deletedJobIds });
                     }
                     // save html and screenshot to data dir
                     const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
@@ -117,10 +120,12 @@ function createCrawlQueue(queue, handler) {
                     yield store_1.Snapshot.upsert(snapshot, { transaction: txn });
                     return snapshot;
                 }));
+                status = 'success';
                 return snapshot;
             }
             catch (error) {
                 config_1.logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
+                status = 'failed';
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
                     job: formattedJob,
                     snapshot: {
@@ -131,6 +136,14 @@ function createCrawlQueue(queue, handler) {
                 yield store_1.Snapshot.upsert(snapshot);
                 return snapshot;
             }
+            finally {
+                const now = Date.now();
+                metrics_1.jobsTotal.inc({ queue, status });
+                metrics_1.jobDurationSeconds.observe({ queue, status }, (now - startTime) / 1000);
+                if (job.enqueuedAt) {
+                    metrics_1.jobTotalLatencySeconds.observe({ queue, status }, (now - job.enqueuedAt) / 1000);
+                }
+            }
         }),
     });
 }
@@ -166,7 +179,7 @@ function saveSnapshotToLocal(_a) {
     };
     });
 }
-const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout =
+const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 60 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
     const page = yield (0, puppeteer_1.initPage)();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -299,31 +312,59 @@ exports.getPageContent = getPageContent;
  * @param callback callback when job finished
  */
 // eslint-disable-next-line require-await
-function enqueue(
+function enqueue(queueName, params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
+        const queue = exports.queueMap[queueName];
+        if (!queue) {
+            throw new Error(`Queue ${queueName} not found`);
+        }
         // skip duplicate job
         const existsJob = yield store_1.Job.isExists(params);
         if (existsJob && !params.sync) {
             config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
             return existsJob.id;
         }
-        config_1.logger.info('enqueue crawl job', params);
         const jobId = (0, crypto_1.randomUUID)();
-        const
+        const enqueuedAt = Date.now();
+        const job = queue.push({ job: Object.assign(Object.assign({}, params), { id: jobId, enqueuedAt }), jobId });
+        metrics_1.jobsEnqueuedTotal.inc({ queue: queueName });
+        // Get current queue size for logging
+        const queueSize = yield store_1.Job.count();
+        config_1.logger.info('enqueue crawl job', Object.assign(Object.assign({}, params), { jobId, queueSize }));
         job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
-
-
+            try {
+                const isSuccess = (result === null || result === void 0 ? void 0 : result.status) === 'success';
+                const queueSize = yield store_1.Job.count();
+                config_1.logger.info(`Crawl completed ${params.url}, status: ${isSuccess ? 'success' : 'failed'}`, {
+                    job: params,
+                    result,
+                    queueSize,
+                });
+                callback === null || callback === void 0 ? void 0 : callback(result ? yield (0, snapshot_1.formatSnapshot)(result) : null);
+            }
+            catch (error) {
+                config_1.logger.error(`Error in finished event handler for ${params.url}`, { error });
+                callback === null || callback === void 0 ? void 0 : callback(null);
+            }
+        }));
+        job.on('failed', (_a) => __awaiter(this, [_a], void 0, function* ({ error }) {
+            try {
+                const queueSize = yield store_1.Job.count();
+                config_1.logger.error(`Failed to execute job for ${params.url}`, { error, job: params, queueSize });
+            }
+            catch (err) {
+                config_1.logger.error(`Error in failed event handler for ${params.url}`, { error: err });
+            }
+            finally {
+                callback === null || callback === void 0 ? void 0 : callback(null);
+            }
         }));
-        job.on('failed', ({ error }) => {
-            config_1.logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
-            callback === null || callback === void 0 ? void 0 : callback(null);
-        });
         return jobId;
     });
 }
 function crawlUrl(params, callback) {
-    return enqueue(params.sync ?
+    return enqueue(params.sync ? 'syncCrawler' : 'urlCrawler', params, callback);
 }
 function crawlCode(params, callback) {
-    return enqueue(
+    return enqueue('codeCrawler', Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
 }
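The new `finally` block separates execution time from end-to-end latency: `jobDurationSeconds` starts its clock when `onJob` begins, while `jobTotalLatencySeconds` starts at `enqueuedAt`, so queue wait time shows up only in the latter. A standalone sketch of the same pattern using `prom-client` (metric and label names mirror the diff; `runJob` and its arguments are illustrative):

```ts
import { Histogram } from 'prom-client';

const duration = new Histogram({ name: 'job_duration_seconds', help: 'execution time', labelNames: ['queue', 'status'] });
const latency = new Histogram({ name: 'job_total_latency_seconds', help: 'enqueue-to-completion time', labelNames: ['queue', 'status'] });

async function runJob(queue: string, job: { enqueuedAt?: number }, work: () => Promise<void>) {
  const startTime = Date.now();
  let status = 'failed';
  try {
    await work();
    status = 'success';
  } finally {
    const now = Date.now();
    // time spent executing the job itself
    duration.observe({ queue, status }, (now - startTime) / 1000);
    if (job.enqueuedAt) {
      // includes time spent waiting in the queue before execution started
      latency.observe({ queue, status }, (now - job.enqueuedAt) / 1000);
    }
  }
}
```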
package/lib/cjs/index.d.ts
CHANGED

@@ -1,5 +1,7 @@
 import { Config } from './config';
 export * from './crawler';
 export * from './services/snapshot';
+export * from './store/job';
 export * as utils from './utils';
+export * from './metrics';
 export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/cjs/index.js
CHANGED

@@ -59,7 +59,9 @@ const puppeteer_1 = require("./puppeteer");
 const migrate_1 = require("./store/migrate");
 __exportStar(require("./crawler"), exports);
 __exportStar(require("./services/snapshot"), exports);
+__exportStar(require("./store/job"), exports);
 exports.utils = __importStar(require("./utils"));
+__exportStar(require("./metrics"), exports);
 function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
         var _a;
package/lib/cjs/metrics.d.ts
ADDED

@@ -0,0 +1,18 @@
+import { Counter, Gauge, Histogram } from 'prom-client';
+export declare const jobsTotal: Counter<"queue" | "status">;
+export declare const jobsEnqueuedTotal: Counter<"queue">;
+export declare const jobDurationSeconds: Histogram<"queue" | "status">;
+export declare const jobTotalLatencySeconds: Histogram<"queue" | "status">;
+export declare const queueSize: Gauge<"queue">;
+/**
+ * Collect all metrics by querying database
+ */
+export declare function collectMetrics(): Promise<void>;
+/**
+ * Get metrics in Prometheus format
+ */
+export declare function getMetrics(): Promise<string>;
+/**
+ * Get content type for metrics endpoint
+ */
+export declare function getContentType(): "text/plain; version=0.0.4; charset=utf-8";
package/lib/cjs/metrics.js
ADDED

@@ -0,0 +1,88 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.queueSize = exports.jobTotalLatencySeconds = exports.jobDurationSeconds = exports.jobsEnqueuedTotal = exports.jobsTotal = void 0;
+exports.collectMetrics = collectMetrics;
+exports.getMetrics = getMetrics;
+exports.getContentType = getContentType;
+const prom_client_1 = require("prom-client");
+const store_1 = require("./store");
+// Create a new registry
+const register = new prom_client_1.Registry();
+// ========== Counter - crawl jobs processed ==========
+exports.jobsTotal = new prom_client_1.Counter({
+    name: 'crawler_jobs_total',
+    help: 'Total number of crawl jobs processed',
+    labelNames: ['queue', 'status'],
+    registers: [register],
+});
+// ========== Counter - jobs enqueued ==========
+exports.jobsEnqueuedTotal = new prom_client_1.Counter({
+    name: 'crawler_jobs_enqueued_total',
+    help: 'Total number of crawl jobs enqueued',
+    labelNames: ['queue'],
+    registers: [register],
+});
+// ========== Histogram - job execution duration ==========
+exports.jobDurationSeconds = new prom_client_1.Histogram({
+    name: 'crawler_job_duration_seconds',
+    help: 'Duration of crawl job execution in seconds',
+    labelNames: ['queue', 'status'],
+    buckets: [10, 30, 60, 120, 300, 600, 900, 1800, 3600],
+    registers: [register],
+});
+// ========== Histogram - total time from enqueue to completion ==========
+exports.jobTotalLatencySeconds = new prom_client_1.Histogram({
+    name: 'crawler_job_total_latency_seconds',
+    help: 'Total latency from enqueue to completion in seconds',
+    labelNames: ['queue', 'status'],
+    buckets: [10, 30, 60, 120, 300, 600, 900, 1800, 3600],
+    registers: [register],
+});
+// ========== Gauge - queue size ==========
+exports.queueSize = new prom_client_1.Gauge({
+    name: 'crawler_queue_size',
+    help: 'Current number of jobs in queue',
+    labelNames: ['queue'],
+    registers: [register],
+});
+/**
+ * Collect all metrics by querying database
+ */
+function collectMetrics() {
+    return __awaiter(this, void 0, void 0, function* () {
+        try {
+            // collect queue sizes
+            const jobStats = yield store_1.Job.stats();
+            jobStats.queues.forEach((q) => {
+                exports.queueSize.set({ queue: q.queue }, q.count);
+            });
+        }
+        catch (error) {
+            console.error('Error collecting metrics:', error);
+        }
+    });
+}
+/**
+ * Get metrics in Prometheus format
+ */
+function getMetrics() {
+    return __awaiter(this, void 0, void 0, function* () {
+        yield collectMetrics();
+        return register.metrics();
+    });
+}
+/**
+ * Get content type for metrics endpoint
+ */
+function getContentType() {
+    return register.contentType;
+}
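`getMetrics()` and `getContentType()` are re-exported from the package root (see the index.js change above), so a host app can wire up a scrape endpoint directly. A minimal sketch, assuming an Express host app (Express itself is not part of this package) and the `@arcblock/crawler` import path:

```ts
import express from 'express';
import { getMetrics, getContentType } from '@arcblock/crawler';

const app = express();

// expose the crawler's registry in Prometheus text format
app.get('/metrics', async (_req, res) => {
  try {
    res.setHeader('Content-Type', getContentType());
    // getMetrics() runs collectMetrics() first, so the queue-size gauge is fresh
    res.send(await getMetrics());
  } catch (err) {
    res.status(500).send(String(err));
  }
});

app.listen(3000);
```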
package/lib/cjs/puppeteer.js
CHANGED

@@ -24,6 +24,7 @@ const fs_extra_1 = __importDefault(require("fs-extra"));
 const path_1 = __importDefault(require("path"));
 const timers_1 = require("timers");
 const config_1 = require("./config");
+const store_1 = require("./store");
 const utils_1 = require("./utils");
 const BrowserStatus = {
     None: 'None',
@@ -124,21 +125,25 @@ function launchBrowser() {
             '--no-sandbox',
             '--no-zygote',
             '--disable-setuid-sandbox',
-            '--disable-gpu',
             '--disable-dev-shm-usage',
             '--disable-site-isolation-trials',
-            '--disable-accelerated-2d-canvas',
             '--disable-extensions',
-            '--js-flags=--max_old_space_size=
+            '--js-flags=--max_old_space_size=768', // limit V8 heap size
             '--disable-background-networking',
             '--disable-default-apps',
             // '--disable-web-security', // allow cross-origin requests
-            '--disable-software-rasterizer',
             '--disable-crash-reporter',
             '--disable-service-workers',
             '--disable-notifications',
             '--disable-infobars',
             '--font-render-hinting=none',
+            // WebGL: use software GL fallback for servers without GPU
+            '--enable-webgl',
+            '--ignore-gpu-blocklist',
+            '--use-gl=swiftshader',
+            '--use-angle=swiftshader',
+            '--enable-unsafe-swiftshader',
+            '--disable-gpu-sandbox',
         ],
     });
     config_1.logger.info('Launch browser');
@@ -162,12 +167,18 @@ function checkBrowserActivated() {
     var _a;
     if (browser) {
         const pages = yield browser.pages().catch(() => []);
-
+        const jobCount = yield store_1.Job.count().catch(() => 0);
+        // Check if browser is inactive: only blank page AND no pending jobs
+        const isInactive = pages.length === 1 && ((_a = pages[0]) === null || _a === void 0 ? void 0 : _a.url()) === 'about:blank' && jobCount === 0;
+        if (isInactive) {
             count++;
             config_1.logger.debug(`Browser inactive count: ${count}/3`);
         }
         else {
-            count = 0;
+            count = 0;
+            if (jobCount > 0) {
+                config_1.logger.debug(`Browser has ${jobCount} pending jobs, keeping active`);
+            }
         }
         if (count >= 3) {
             config_1.logger.info('Browser inactive for 3 minutes, closing...');
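The launch-flag change drops `--disable-gpu` in favor of SwiftShader software rendering, so pages that need a WebGL context should still get one on GPU-less servers. A quick sanity check one might run against a page opened by this module; the check is illustrative and not part of the package, and the `Page` type is imported here from `puppeteer` even though the package itself uses `@blocklet/puppeteer`:

```ts
import type { Page } from 'puppeteer';

// Returns true when the browser can hand out a WebGL context,
// which should now succeed under SwiftShader even without a GPU.
async function hasWebGL(page: Page): Promise<boolean> {
  return page.evaluate(() => {
    const canvas = document.createElement('canvas');
    return Boolean(canvas.getContext('webgl') || canvas.getContext('experimental-webgl'));
  });
}
```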
package/lib/cjs/site.js
CHANGED

@@ -17,6 +17,7 @@ const uniq_1 = __importDefault(require("lodash/uniq"));
 const node_crypto_1 = require("node:crypto");
 const config_1 = require("./config");
 const crawler_1 = require("./crawler");
+const metrics_1 = require("./metrics");
 const store_1 = require("./store");
 const utils_1 = require("./utils");
 const crawlBlockletRunningMap = new Map();
@@ -69,21 +70,30 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
         });
         crawlCount++;
         const jobId = (0, node_crypto_1.randomUUID)();
-        crawler_1.
-
-
-
-
-
+        crawler_1.queueMap.cronJobs.push({
+            job: {
+                id: jobId,
+                url,
+                lastModified: sitemapItem.lastmod,
+                includeScreenshot: false,
+                includeHtml: true,
+                replace: true,
+                enqueuedAt: Date.now(),
+            },
+            jobId,
+            delay: 5,
         });
+        metrics_1.jobsEnqueuedTotal.inc({ queue: 'cronJobs' });
         return jobId;
     }), { concurrency: ((_b = config_1.config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
+    // Get current queue size for logging
+    const queueSize = yield store_1.Job.count();
     config_1.logger.info('Enqueued jobs from sitemap finished', {
         url,
         pathname,
         processCount,
         crawlCount,
+        queueSize,
     });
     return jobIds;
 }
package/lib/cjs/store/job.d.ts
CHANGED

@@ -17,6 +17,7 @@ export interface JobState {
     replace?: boolean;
     sync?: boolean;
     ignoreRobots?: boolean;
+    enqueuedAt?: number;
     headers?: Record<string, string>;
     cookies?: CookieParam[];
     localStorage?: {
@@ -32,18 +33,46 @@ export interface JobModel {
     willRunAt: number;
     delay: number;
     cancelled: boolean;
+    processingBy: string | null;
+    processingAt: number | null;
 }
 export declare class Job extends Model<JobModel> implements JobModel {
     id: JobModel['id'];
     queue: JobModel['queue'];
     job: JobModel['job'];
     retryCount: JobModel['retryCount'];
-    willRunAt: JobModel['willRunAt'];
     delay: JobModel['delay'];
+    willRunAt: JobModel['willRunAt'];
     cancelled: JobModel['cancelled'];
+    processingBy: JobModel['processingBy'];
+    processingAt: JobModel['processingAt'];
     static initModel(sequelize: Sequelize): typeof Job;
     static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
     static isExists(condition: Partial<JobState> & {
         url: string;
     }): Promise<JobModel | null | undefined>;
+    static paginate({ page, pageSize, queue, }?: {
+        page?: number;
+        pageSize?: number;
+        queue?: string;
+    }): Promise<{
+        total: number;
+        page: number;
+        pageSize: number;
+        totalPages: number;
+        data: JobModel[];
+    }>;
+    static stats(): Promise<{
+        total: number;
+        queues: {
+            queue: string;
+            count: number;
+        }[];
+    }>;
+    static deleteByQueue(queue: string): Promise<{
+        deleted: number;
+    }>;
+    static deleteByIds(ids: string[]): Promise<{
+        deleted: number;
+    }>;
 }
package/lib/cjs/store/job.js
CHANGED

@@ -76,6 +76,14 @@ class Job extends core_1.Model {
             type: core_1.DataTypes.BOOLEAN,
             defaultValue: false,
         },
+        processingBy: {
+            type: core_1.DataTypes.STRING(32),
+            allowNull: true,
+        },
+        processingAt: {
+            type: core_1.DataTypes.INTEGER,
+            allowNull: true,
+        },
         createdAt: {
             type: core_1.DataTypes.DATE,
             defaultValue: core_1.DataTypes.NOW,
@@ -127,5 +135,53 @@ class Job extends core_1.Model {
             return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
         });
     }
+    static paginate() {
+        return __awaiter(this, arguments, void 0, function* ({ page = 1, pageSize = 20, queue, } = {}) {
+            const where = queue ? { queue } : {};
+            const offset = (page - 1) * pageSize;
+            const { count, rows } = yield Job.findAndCountAll({
+                where,
+                order: [['createdAt', 'DESC']],
+                limit: pageSize,
+                offset,
+            });
+            return {
+                total: count,
+                page,
+                pageSize,
+                totalPages: Math.ceil(count / pageSize),
+                data: rows.map((row) => row.toJSON()),
+            };
+        });
+    }
+    static stats() {
+        return __awaiter(this, void 0, void 0, function* () {
+            const results = (yield Job.findAll({
+                attributes: ['queue', [core_1.default.fn('COUNT', core_1.default.col('id')), 'count']],
+                group: ['queue'],
+                raw: true,
+            }));
+            const total = results.reduce((sum, item) => sum + Number(item.count), 0);
+            return {
+                total,
+                queues: results.map((item) => ({
+                    queue: item.queue,
+                    count: Number(item.count),
+                })),
+            };
+        });
+    }
+    static deleteByQueue(queue) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const count = yield Job.destroy({ where: { queue } });
+            return { deleted: count };
+        });
+    }
+    static deleteByIds(ids) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const count = yield Job.destroy({ where: { id: ids } });
+            return { deleted: count };
+        });
+    }
 }
 exports.Job = Job;
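The new static helpers give callers a small inspection and cleanup API over the job table; `Job.stats()` is also what `collectMetrics()` uses to feed the `crawler_queue_size` gauge. A minimal sketch, assuming `Job` is imported from the package root (it is re-exported via `./store/job` per the index.js diff above); queue names follow `queueMap`:

```ts
import { Job } from '@arcblock/crawler';

async function inspectQueues() {
  // per-queue pending-job counts plus an overall total
  const { total, queues } = await Job.stats();
  console.log(total, queues); // e.g. [{ queue: 'urlCrawler', count: 3 }, ...]

  // newest-first page of pending jobs for one queue
  const page = await Job.paginate({ page: 1, pageSize: 20, queue: 'urlCrawler' });
  console.log(page.totalPages, page.data.length);

  // bulk cleanup, either by queue name or by job id
  await Job.deleteByQueue('cronJobs');
  await Job.deleteByIds(page.data.map((j) => j.id));
}
```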
package/lib/cjs/store/migrate.js
CHANGED

@@ -40,6 +40,7 @@ const umzug_1 = require("umzug");
 const index_1 = require("./index");
 const migration20250615 = __importStar(require("./migrations/20250615-genesis"));
 const migration20250616Replace = __importStar(require("./migrations/20250616-replace"));
+const migration20251226JobProcessing = __importStar(require("./migrations/20251226-job-processing"));
 const umzug = new umzug_1.Umzug({
     migrations: [
         {
@@ -52,6 +53,11 @@ const umzug = new umzug_1.Umzug({
             up: ({ context }) => migration20250616Replace.up({ context }),
             down: ({ context }) => migration20250616Replace.down({ context }),
         },
+        {
+            name: '20251226-job-processing',
+            up: ({ context }) => migration20251226JobProcessing.up({ context }),
+            down: ({ context }) => migration20251226JobProcessing.down({ context }),
+        },
     ],
     context: index_1.sequelize.getQueryInterface(),
     storage: new umzug_1.SequelizeStorage({ sequelize: index_1.sequelize }),