@arcblock/crawler 1.4.6 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,37 @@
1
+ "use strict";
2
// Async/generator bridge (this appears to be the helper the TypeScript compiler emits for
// down-levelled `async` functions). Reuses `this.__awaiter` when one already exists; otherwise
// defines a driver that runs `generator` to completion and returns a promise for its result.
//   thisArg / _arguments - receiver and argument list the generator function is applied with
//   P                    - promise constructor to use; falls back to the global Promise
//   generator            - generator function whose `yield`s stand in for `await`s
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
// Wrap a yielded value in a `P` unless it already is one, so it can always be chained.
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
// Resume the generator with the settled value; anything it throws rejects the outer promise.
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
// Re-throw a rejection inside the generator so its own try/catch can observe it.
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
// Finished: resolve with the return value. Otherwise wait for the yielded value and continue.
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
// Instantiate the generator (replacing the `generator` binding with the iterator) and start it.
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.up = up;
13
+ exports.down = down;
14
+ /* eslint-disable no-console */
15
+ const core_1 = require("@sequelize/core");
16
/**
 * Forward migration `20251226-job-processing`: adds the nullable `processingBy`
 * (STRING(32)) and `processingAt` (INTEGER) columns to the `jobs` table.
 *
 * @param _a Migration parameters; `context` must provide `addColumn`
 *           (the migration runner passes the Sequelize query interface).
 * @returns Promise that resolves once both columns have been added, in that order.
 */
function up(_a) {
    return __awaiter(this, arguments, void 0, function* ({ context }) {
        // Both new columns are optional, so existing rows need no backfill.
        const nullable = (type) => ({ type, allowNull: true });
        console.log('[20251226-job-processing:up] Migrating...');
        yield context.addColumn('jobs', 'processingBy', nullable(core_1.DataTypes.STRING(32)));
        yield context.addColumn('jobs', 'processingAt', nullable(core_1.DataTypes.INTEGER));
        console.log('[20251226-job-processing:up] Migrated successfully!');
    });
}
30
/**
 * Reverse migration `20251226-job-processing`: drops the `processingBy` and
 * `processingAt` columns from the `jobs` table, in that order.
 *
 * @param _a Migration parameters; `context` must provide `removeColumn`.
 * @returns Promise that resolves once both columns have been removed.
 */
function down(_a) {
    return __awaiter(this, arguments, void 0, function* ({ context }) {
        console.log('[20251226-job-processing:down] Migrating...');
        for (const column of ['processingBy', 'processingAt']) {
            // Sequential on purpose: one schema change at a time.
            yield context.removeColumn('jobs', column);
        }
        console.log('[20251226-job-processing:down] Migrated successfully!');
    });
}
@@ -1,10 +1,11 @@
1
1
  import { Page } from '@blocklet/puppeteer';
2
2
  import { JobState, SnapshotModel } from './store';
3
- declare let crawlQueue: any;
4
- declare let syncQueue: any;
5
- declare let codeQueue: any;
6
- declare let cronQueue: any;
7
- export { crawlQueue, syncQueue, codeQueue, cronQueue };
3
+ export declare const queueMap: {
4
+ urlCrawler: any;
5
+ syncCrawler: any;
6
+ codeCrawler: any;
7
+ cronJobs: any;
8
+ };
8
9
  export declare function initQueue(): void;
9
10
  type PageHandler = {
10
11
  handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
@@ -28,6 +29,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
28
29
  * @param params
29
30
  * @param callback callback when job finished
30
31
  */
31
- export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
32
+ export declare function enqueue(queueName: keyof typeof queueMap, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
32
33
  export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
33
34
  export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
35
+ export {};
@@ -14,50 +14,57 @@ import { randomUUID } from 'crypto';
14
14
  import fs from 'fs-extra';
15
15
  import path from 'path';
16
16
  import { config, logger } from './config';
17
+ import { jobDurationSeconds, jobTotalLatencySeconds, jobsEnqueuedTotal, jobsTotal } from './metrics';
17
18
  import { initPage } from './puppeteer';
18
19
  import { createCarbonImage } from './services/carbon';
19
20
  import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
20
21
  import { Job, Snapshot, sequelize } from './store';
21
22
  import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
22
23
  const { BaseState } = require('@abtnode/models');
23
- let crawlQueue;
24
- let syncQueue;
25
- let codeQueue;
26
- let cronQueue;
27
- export { crawlQueue, syncQueue, codeQueue, cronQueue };
24
// Registry of crawl queues keyed by queue name. Every slot starts as `null`;
// `initQueue()` fills each one with the queue returned by `createCrawlQueue()`,
// and `enqueue()` looks queues up here by name.
+ export const queueMap = {
25
+ urlCrawler: null,
26
+ syncCrawler: null,
27
+ codeCrawler: null,
28
+ cronJobs: null,
29
+ };
28
30
  export function initQueue() {
29
- crawlQueue = createCrawlQueue('urlCrawler');
30
- syncQueue = createCrawlQueue('syncCrawler');
31
- codeQueue = createCrawlQueue('codeCrawler', {
31
+ queueMap.urlCrawler = createCrawlQueue('urlCrawler');
32
+ queueMap.syncCrawler = createCrawlQueue('syncCrawler');
33
+ queueMap.codeCrawler = createCrawlQueue('codeCrawler', {
32
34
  handleScreenshot: createCarbonImage,
33
35
  });
34
- cronQueue = createCrawlQueue('cronJobs');
36
+ queueMap.cronJobs = createCrawlQueue('cronJobs');
35
37
  }
36
38
  export function createCrawlQueue(queue, handler) {
37
39
  const db = new BaseState(Job);
38
40
  return createQueue({
39
41
  store: new SequelizeStore(db, queue),
40
- concurrency: config.concurrency,
42
+ options: {
43
+ concurrency: config.concurrency,
44
+ enableScheduledJob: true,
45
+ },
41
46
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
42
- logger.info('Starting to execute crawl job', job);
43
- // check robots.txt
44
- if (!job.ignoreRobots) {
45
- const canCrawl = yield isAcceptCrawler(job.url);
46
- if (!canCrawl) {
47
- logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
48
- const snapshot = convertJobToSnapshot({
49
- job,
50
- snapshot: {
51
- status: 'failed',
52
- error: 'Denied by robots.txt',
53
- },
54
- });
55
- yield Snapshot.upsert(snapshot);
56
- return snapshot;
57
- }
58
- }
47
+ const startTime = Date.now();
48
+ let status = 'failed';
59
49
  const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
60
50
  try {
51
+ logger.info(`Starting to execute ${queue} job`, Object.assign(Object.assign({}, job), { queueSize: yield Job.count() }));
52
+ // check robots.txt
53
+ if (!job.ignoreRobots) {
54
+ const canCrawl = yield isAcceptCrawler(job.url);
55
+ if (!canCrawl) {
56
+ logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
57
+ const snapshot = convertJobToSnapshot({
58
+ job,
59
+ snapshot: {
60
+ status: 'failed',
61
+ error: 'Denied by robots.txt',
62
+ },
63
+ });
64
+ yield Snapshot.upsert(snapshot);
65
+ return snapshot;
66
+ }
67
+ }
61
68
  // get page content later
62
69
  const result = yield getPageContent(formattedJob, handler);
63
70
  if (!result || (!result.html && !result.screenshot)) {
@@ -75,18 +82,13 @@ export function createCrawlQueue(queue, handler) {
75
82
  const snapshot = yield sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
76
83
  // delete old snapshot
77
84
  if (formattedJob.replace) {
78
- try {
79
- const deletedJobIds = yield deleteSnapshots({
80
- url: formattedJob.url,
81
- replace: true,
82
- }, { txn });
83
- if (deletedJobIds) {
84
- logger.info('Deleted old snapshot', { deletedJobIds });
85
- }
86
- }
87
- catch (error) {
85
+ const deletedJobIds = yield deleteSnapshots({
86
+ url: formattedJob.url,
87
+ replace: true,
88
+ }, { txn }).catch((error) => {
88
89
  logger.error('Failed to delete old snapshot', { error, formattedJob });
89
- }
90
+ });
91
+ logger.info('Deleted old snapshot', { deletedJobIds });
90
92
  }
91
93
  // save html and screenshot to data dir
92
94
  const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
@@ -106,10 +108,12 @@ export function createCrawlQueue(queue, handler) {
106
108
  yield Snapshot.upsert(snapshot, { transaction: txn });
107
109
  return snapshot;
108
110
  }));
111
+ status = 'success';
109
112
  return snapshot;
110
113
  }
111
114
  catch (error) {
112
115
  logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
116
+ status = 'failed';
113
117
  const snapshot = convertJobToSnapshot({
114
118
  job: formattedJob,
115
119
  snapshot: {
@@ -120,6 +124,14 @@ export function createCrawlQueue(queue, handler) {
120
124
  yield Snapshot.upsert(snapshot);
121
125
  return snapshot;
122
126
  }
127
+ finally {
128
+ const now = Date.now();
129
+ jobsTotal.inc({ queue, status });
130
+ jobDurationSeconds.observe({ queue, status }, (now - startTime) / 1000);
131
+ if (job.enqueuedAt) {
132
+ jobTotalLatencySeconds.observe({ queue, status }, (now - job.enqueuedAt) / 1000);
133
+ }
134
+ }
123
135
  }),
124
136
  });
125
137
  }
@@ -155,7 +167,7 @@ function saveSnapshotToLocal(_a) {
155
167
  };
156
168
  });
157
169
  }
158
- export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
170
+ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 60 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
159
171
  const page = yield initPage();
160
172
  if (width && height) {
161
173
  yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -287,31 +299,59 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
287
299
  * @param callback callback when job finished
288
300
  */
289
301
  // eslint-disable-next-line require-await
290
export function enqueue(queueName, params, callback) {
    return __awaiter(this, void 0, void 0, function* () {
        // Resolve the target queue by name; initQueue() must have run first.
        const queue = queueMap[queueName];
        if (!queue) {
            throw new Error(`Queue ${queueName} not found`);
        }
        // skip duplicate job
        const existsJob = yield Job.isExists(params);
        if (existsJob && !params.sync) {
            logger.info(`Crawl job already exists for ${params.url}, skip`);
            return existsJob.id;
        }
        // The queue size only enriches log lines, so a failing COUNT must never
        // fail the enqueue or change what the caller's callback receives.
        const countJobs = () => Job.count().catch(() => undefined);
        // Run the caller's callback once per event and keep its exceptions out of
        // the queue's event handlers (previously a throwing callback was invoked a
        // second time from the catch branch).
        const notify = (snapshot) => {
            if (!callback) {
                return;
            }
            try {
                callback(snapshot);
            }
            catch (error) {
                logger.error(`Error in enqueue callback for ${params.url}`, { error });
            }
        };
        const jobId = randomUUID();
        const enqueuedAt = Date.now();
        const job = queue.push({ job: Object.assign(Object.assign({}, params), { id: jobId, enqueuedAt }), jobId });
        // Register listeners before the next `yield`: an event emitted while this
        // function is suspended would otherwise be missed and the callback never run.
        job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
            let snapshot = null;
            try {
                const isSuccess = result != null && result.status === 'success';
                const queueSize = yield countJobs();
                logger.info(`Crawl completed ${params.url}, status: ${isSuccess ? 'success' : 'failed'}`, {
                    job: params,
                    result,
                    queueSize,
                });
                snapshot = result ? yield formatSnapshot(result) : null;
            }
            catch (error) {
                logger.error(`Error in finished event handler for ${params.url}`, { error });
                snapshot = null;
            }
            notify(snapshot);
        }));
        job.on('failed', (_a) => __awaiter(this, [_a], void 0, function* ({ error }) {
            try {
                const queueSize = yield countJobs();
                logger.error(`Failed to execute job for ${params.url}`, { error, job: params, queueSize });
            }
            catch (err) {
                logger.error(`Error in failed event handler for ${params.url}`, { error: err });
            }
            finally {
                // Always report the failure to the caller, even if logging failed.
                notify(null);
            }
        }));
        jobsEnqueuedTotal.inc({ queue: queueName });
        // Get current queue size for logging
        const queueSize = yield countJobs();
        logger.info('enqueue crawl job', Object.assign(Object.assign({}, params), { jobId, queueSize }));
        return jobId;
    });
}
312
352
  export function crawlUrl(params, callback) {
313
- return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
353
+ return enqueue(params.sync ? 'syncCrawler' : 'urlCrawler', params, callback);
314
354
  }
315
355
  export function crawlCode(params, callback) {
316
- return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
356
+ return enqueue('codeCrawler', Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
317
357
  }
@@ -1,5 +1,7 @@
1
1
  import { Config } from './config';
2
2
  export * from './crawler';
3
3
  export * from './services/snapshot';
4
+ export * from './store/job';
4
5
  export * as utils from './utils';
6
+ export * from './metrics';
5
7
  export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/esm/index.js CHANGED
@@ -16,7 +16,9 @@ import { ensureBrowser } from './puppeteer';
16
16
  import { migrate } from './store/migrate';
17
17
  export * from './crawler';
18
18
  export * from './services/snapshot';
19
+ export * from './store/job';
19
20
  export * as utils from './utils';
21
+ export * from './metrics';
20
22
  export function initCrawler(params) {
21
23
  return __awaiter(this, void 0, void 0, function* () {
22
24
  var _a;
@@ -0,0 +1,18 @@
1
+ import { Counter, Gauge, Histogram } from 'prom-client';
2
+ export declare const jobsTotal: Counter<"queue" | "status">;
3
+ export declare const jobsEnqueuedTotal: Counter<"queue">;
4
+ export declare const jobDurationSeconds: Histogram<"queue" | "status">;
5
+ export declare const jobTotalLatencySeconds: Histogram<"queue" | "status">;
6
+ export declare const queueSize: Gauge<"queue">;
7
+ /**
8
+ * Collect all metrics by querying database
9
+ */
10
+ export declare function collectMetrics(): Promise<void>;
11
+ /**
12
+ * Get metrics in Prometheus format
13
+ */
14
+ export declare function getMetrics(): Promise<string>;
15
+ /**
16
+ * Get content type for metrics endpoint
17
+ */
18
+ export declare function getContentType(): "text/plain; version=0.0.4; charset=utf-8";
@@ -0,0 +1,82 @@
1
// Async/generator bridge (this appears to be the helper the TypeScript compiler emits for
// down-levelled `async` functions). Reuses `this.__awaiter` when one already exists; otherwise
// defines a driver that runs `generator` to completion and returns a promise for its result.
// `P` is the promise constructor to use and falls back to the global Promise.
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
// Wrap a yielded value in a `P` unless it already is one, so it can always be chained.
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
// Resume the generator with the settled value; anything it throws rejects the outer promise.
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
// Re-throw a rejection inside the generator so its own try/catch can observe it.
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
// Finished: resolve with the return value. Otherwise wait for the yielded value and continue.
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
// Instantiate the generator (replacing the `generator` binding with the iterator) and start it.
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { Counter, Gauge, Histogram, Registry } from 'prom-client';
11
+ import { Job } from './store';
12
+ // Create a new registry
13
+ const register = new Registry();
14
+ // ========== Counter - crawl jobs processed ==========
15
+ export const jobsTotal = new Counter({
16
+ name: 'crawler_jobs_total',
17
+ help: 'Total number of crawl jobs processed',
18
+ labelNames: ['queue', 'status'],
19
+ registers: [register],
20
+ });
21
+ // ========== Counter - jobs enqueued ==========
22
+ export const jobsEnqueuedTotal = new Counter({
23
+ name: 'crawler_jobs_enqueued_total',
24
+ help: 'Total number of crawl jobs enqueued',
25
+ labelNames: ['queue'],
26
+ registers: [register],
27
+ });
28
+ // ========== Histogram - job execution duration ==========
29
+ export const jobDurationSeconds = new Histogram({
30
+ name: 'crawler_job_duration_seconds',
31
+ help: 'Duration of crawl job execution in seconds',
32
+ labelNames: ['queue', 'status'],
33
+ buckets: [10, 30, 60, 120, 300, 600, 900, 1800, 3600],
34
+ registers: [register],
35
+ });
36
+ // ========== Histogram - total time from enqueue to completion ==========
37
+ export const jobTotalLatencySeconds = new Histogram({
38
+ name: 'crawler_job_total_latency_seconds',
39
+ help: 'Total latency from enqueue to completion in seconds',
40
+ labelNames: ['queue', 'status'],
41
+ buckets: [10, 30, 60, 120, 300, 600, 900, 1800, 3600],
42
+ registers: [register],
43
+ });
44
+ // ========== Gauge - queue size ==========
45
+ export const queueSize = new Gauge({
46
+ name: 'crawler_queue_size',
47
+ help: 'Current number of jobs in queue',
48
+ labelNames: ['queue'],
49
+ registers: [register],
50
+ });
51
/**
 * Refresh the `crawler_queue_size` gauge from the database.
 *
 * `Job.stats()` only reports queues that currently have rows, so the gauge is
 * reset before it is repopulated; otherwise a queue that has drained would keep
 * exporting its last non-zero size. The reset happens only after the query has
 * succeeded, so a database error leaves the previous values in place.
 *
 * @returns Promise that always resolves; failures are logged, not thrown.
 */
export function collectMetrics() {
    return __awaiter(this, void 0, void 0, function* () {
        try {
            // Collect per-queue sizes
            const jobStats = yield Job.stats();
            // Drop stale label sets for queues that no longer have any jobs.
            queueSize.reset();
            for (const { queue, count } of jobStats.queues) {
                queueSize.set({ queue }, count);
            }
        }
        catch (error) {
            console.error('Error collecting metrics:', error);
        }
    });
}
68
/**
 * Refresh the database-backed metrics, then render everything registered on the
 * module's registry.
 *
 * @returns Promise of the registry's serialized metrics output.
 */
export function getMetrics() {
    return __awaiter(this, void 0, void 0, function* () {
        // Make sure gauges reflect the current database state before rendering.
        yield collectMetrics();
        const rendered = yield register.metrics();
        return rendered;
    });
}
77
/**
 * Content type to send with the metrics response, as reported by the registry.
 *
 * @returns The registry's `contentType` value.
 */
export function getContentType() {
    const { contentType } = register;
    return contentType;
}
@@ -12,6 +12,7 @@ import fs from 'fs-extra';
12
12
  import path from 'path';
13
13
  import { clearInterval, setInterval } from 'timers';
14
14
  import { config, logger } from './config';
15
+ import { Job } from './store';
15
16
  import { CRAWLER_FLAG, sleep } from './utils';
16
17
  const BrowserStatus = {
17
18
  None: 'None',
@@ -113,21 +114,25 @@ export function launchBrowser() {
113
114
  '--no-sandbox',
114
115
  '--no-zygote',
115
116
  '--disable-setuid-sandbox',
116
- '--disable-gpu',
117
117
  '--disable-dev-shm-usage',
118
118
  '--disable-site-isolation-trials',
119
- '--disable-accelerated-2d-canvas',
120
119
  '--disable-extensions',
121
- '--js-flags=--max_old_space_size=512', // 限制V8内存
120
+ '--js-flags=--max_old_space_size=768', // 限制V8内存
122
121
  '--disable-background-networking',
123
122
  '--disable-default-apps',
124
123
  // '--disable-web-security', // 允许跨域请求
125
- '--disable-software-rasterizer',
126
124
  '--disable-crash-reporter',
127
125
  '--disable-service-workers',
128
126
  '--disable-notifications',
129
127
  '--disable-infobars',
130
128
  '--font-render-hinting=none',
129
+ // WebGL: use software GL fallback for servers without GPU
130
+ '--enable-webgl',
131
+ '--ignore-gpu-blocklist',
132
+ '--use-gl=swiftshader',
133
+ '--use-angle=swiftshader',
134
+ '--enable-unsafe-swiftshader',
135
+ '--disable-gpu-sandbox',
131
136
  ],
132
137
  });
133
138
  logger.info('Launch browser');
@@ -151,12 +156,18 @@ function checkBrowserActivated() {
151
156
  var _a;
152
157
  if (browser) {
153
158
  const pages = yield browser.pages().catch(() => []);
154
- if (pages.length === 1 && ((_a = pages[0]) === null || _a === void 0 ? void 0 : _a.url()) === 'about:blank') {
159
+ const jobCount = yield Job.count().catch(() => 0);
160
+ // Check if browser is inactive: only blank page AND no pending jobs
161
+ const isInactive = pages.length === 1 && ((_a = pages[0]) === null || _a === void 0 ? void 0 : _a.url()) === 'about:blank' && jobCount === 0;
162
+ if (isInactive) {
155
163
  count++;
156
164
  logger.debug(`Browser inactive count: ${count}/3`);
157
165
  }
158
166
  else {
159
- count = 0; // 重置计数器!
167
+ count = 0;
168
+ if (jobCount > 0) {
169
+ logger.debug(`Browser has ${jobCount} pending jobs, keeping active`);
170
+ }
160
171
  }
161
172
  if (count >= 3) {
162
173
  logger.info('Browser inactive for 3 minutes, closing...');
package/lib/esm/site.js CHANGED
@@ -10,8 +10,9 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
10
10
  import uniq from 'lodash/uniq';
11
11
  import { randomUUID } from 'node:crypto';
12
12
  import { config, logger } from './config';
13
- import { cronQueue } from './crawler';
14
- import { Snapshot } from './store';
13
+ import { queueMap } from './crawler';
14
+ import { jobsEnqueuedTotal } from './metrics';
15
+ import { Job, Snapshot } from './store';
15
16
  import { formatUrl, getSitemapList } from './utils';
16
17
  const crawlBlockletRunningMap = new Map();
17
18
  function parseSitemapUrl(sitemapItem) {
@@ -63,21 +64,30 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
63
64
  });
64
65
  crawlCount++;
65
66
  const jobId = randomUUID();
66
- cronQueue.push({
67
- id: jobId,
68
- url,
69
- lastModified: sitemapItem.lastmod,
70
- includeScreenshot: false,
71
- includeHtml: true,
72
- replace: true,
67
+ queueMap.cronJobs.push({
68
+ job: {
69
+ id: jobId,
70
+ url,
71
+ lastModified: sitemapItem.lastmod,
72
+ includeScreenshot: false,
73
+ includeHtml: true,
74
+ replace: true,
75
+ enqueuedAt: Date.now(),
76
+ },
77
+ jobId,
78
+ delay: 5,
73
79
  });
80
+ jobsEnqueuedTotal.inc({ queue: 'cronJobs' });
74
81
  return jobId;
75
82
  }), { concurrency: ((_b = config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
83
+ // Get current queue size for logging
84
+ const queueSize = yield Job.count();
76
85
  logger.info('Enqueued jobs from sitemap finished', {
77
86
  url,
78
87
  pathname,
79
88
  processCount,
80
89
  crawlCount,
90
+ queueSize,
81
91
  });
82
92
  return jobIds;
83
93
  }
@@ -17,6 +17,7 @@ export interface JobState {
17
17
  replace?: boolean;
18
18
  sync?: boolean;
19
19
  ignoreRobots?: boolean;
20
+ enqueuedAt?: number;
20
21
  headers?: Record<string, string>;
21
22
  cookies?: CookieParam[];
22
23
  localStorage?: {
@@ -32,18 +33,46 @@ export interface JobModel {
32
33
  willRunAt: number;
33
34
  delay: number;
34
35
  cancelled: boolean;
36
+ processingBy: string | null;
37
+ processingAt: number | null;
35
38
  }
36
39
  export declare class Job extends Model<JobModel> implements JobModel {
37
40
  id: JobModel['id'];
38
41
  queue: JobModel['queue'];
39
42
  job: JobModel['job'];
40
43
  retryCount: JobModel['retryCount'];
41
- willRunAt: JobModel['willRunAt'];
42
44
  delay: JobModel['delay'];
45
+ willRunAt: JobModel['willRunAt'];
43
46
  cancelled: JobModel['cancelled'];
47
+ processingBy: JobModel['processingBy'];
48
+ processingAt: JobModel['processingAt'];
44
49
  static initModel(sequelize: Sequelize): typeof Job;
45
50
  static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
46
51
  static isExists(condition: Partial<JobState> & {
47
52
  url: string;
48
53
  }): Promise<JobModel | null | undefined>;
54
+ static paginate({ page, pageSize, queue, }?: {
55
+ page?: number;
56
+ pageSize?: number;
57
+ queue?: string;
58
+ }): Promise<{
59
+ total: number;
60
+ page: number;
61
+ pageSize: number;
62
+ totalPages: number;
63
+ data: JobModel[];
64
+ }>;
65
+ static stats(): Promise<{
66
+ total: number;
67
+ queues: {
68
+ queue: string;
69
+ count: number;
70
+ }[];
71
+ }>;
72
+ static deleteByQueue(queue: string): Promise<{
73
+ deleted: number;
74
+ }>;
75
+ static deleteByIds(ids: string[]): Promise<{
76
+ deleted: number;
77
+ }>;
49
78
  }
@@ -37,6 +37,14 @@ export class Job extends Model {
37
37
  type: DataTypes.BOOLEAN,
38
38
  defaultValue: false,
39
39
  },
40
+ processingBy: {
41
+ type: DataTypes.STRING(32),
42
+ allowNull: true,
43
+ },
44
+ processingAt: {
45
+ type: DataTypes.INTEGER,
46
+ allowNull: true,
47
+ },
40
48
  createdAt: {
41
49
  type: DataTypes.DATE,
42
50
  defaultValue: DataTypes.NOW,
@@ -88,4 +96,52 @@ export class Job extends Model {
88
96
  return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
89
97
  });
90
98
  }
99
+ static paginate() {
100
+ return __awaiter(this, arguments, void 0, function* ({ page = 1, pageSize = 20, queue, } = {}) {
101
+ const where = queue ? { queue } : {};
102
+ const offset = (page - 1) * pageSize;
103
+ const { count, rows } = yield Job.findAndCountAll({
104
+ where,
105
+ order: [['createdAt', 'DESC']],
106
+ limit: pageSize,
107
+ offset,
108
+ });
109
+ return {
110
+ total: count,
111
+ page,
112
+ pageSize,
113
+ totalPages: Math.ceil(count / pageSize),
114
+ data: rows.map((row) => row.toJSON()),
115
+ };
116
+ });
117
+ }
118
+ static stats() {
119
+ return __awaiter(this, void 0, void 0, function* () {
120
+ const results = (yield Job.findAll({
121
+ attributes: ['queue', [sequelize.fn('COUNT', sequelize.col('id')), 'count']],
122
+ group: ['queue'],
123
+ raw: true,
124
+ }));
125
+ const total = results.reduce((sum, item) => sum + Number(item.count), 0);
126
+ return {
127
+ total,
128
+ queues: results.map((item) => ({
129
+ queue: item.queue,
130
+ count: Number(item.count),
131
+ })),
132
+ };
133
+ });
134
+ }
135
+ static deleteByQueue(queue) {
136
+ return __awaiter(this, void 0, void 0, function* () {
137
+ const count = yield Job.destroy({ where: { queue } });
138
+ return { deleted: count };
139
+ });
140
+ }
141
+ static deleteByIds(ids) {
142
+ return __awaiter(this, void 0, void 0, function* () {
143
+ const count = yield Job.destroy({ where: { id: ids } });
144
+ return { deleted: count };
145
+ });
146
+ }
91
147
  }
@@ -3,6 +3,7 @@ import { SequelizeStorage, Umzug } from 'umzug';
3
3
  import { sequelize } from './index';
4
4
  import * as migration20250615 from './migrations/20250615-genesis';
5
5
  import * as migration20250616Replace from './migrations/20250616-replace';
6
+ import * as migration20251226JobProcessing from './migrations/20251226-job-processing';
6
7
  const umzug = new Umzug({
7
8
  migrations: [
8
9
  {
@@ -15,6 +16,11 @@ const umzug = new Umzug({
15
16
  up: ({ context }) => migration20250616Replace.up({ context }),
16
17
  down: ({ context }) => migration20250616Replace.down({ context }),
17
18
  },
19
+ {
20
+ name: '20251226-job-processing',
21
+ up: ({ context }) => migration20251226JobProcessing.up({ context }),
22
+ down: ({ context }) => migration20251226JobProcessing.down({ context }),
23
+ },
18
24
  ],
19
25
  context: sequelize.getQueryInterface(),
20
26
  storage: new SequelizeStorage({ sequelize }),
@@ -0,0 +1,6 @@
1
+ export declare function up({ context }: {
2
+ context: any;
3
+ }): Promise<void>;
4
+ export declare function down({ context }: {
5
+ context: any;
6
+ }): Promise<void>;