@arcblock/crawler 1.1.6 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/lib/cjs/crawler.d.ts +11 -4
  2. package/lib/cjs/crawler.js +96 -59
  3. package/lib/cjs/index.d.ts +1 -0
  4. package/lib/cjs/index.js +3 -5
  5. package/lib/cjs/services/carbon.d.ts +3 -0
  6. package/lib/cjs/services/carbon.js +87 -0
  7. package/lib/cjs/services/snapshot.d.ts +5 -2
  8. package/lib/cjs/services/snapshot.js +36 -6
  9. package/lib/cjs/site.d.ts +1 -1
  10. package/lib/cjs/site.js +9 -3
  11. package/lib/cjs/store/index.d.ts +4 -1
  12. package/lib/cjs/store/index.js +37 -45
  13. package/lib/cjs/store/job.d.ts +5 -0
  14. package/lib/cjs/store/migrate.d.ts +4 -0
  15. package/lib/cjs/store/migrate.js +63 -0
  16. package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
  17. package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
  18. package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
  19. package/lib/cjs/store/migrations/20250616-replace.js +40 -0
  20. package/lib/cjs/store/snapshot.d.ts +2 -0
  21. package/lib/cjs/store/snapshot.js +7 -0
  22. package/lib/esm/crawler.d.ts +11 -4
  23. package/lib/esm/crawler.js +92 -57
  24. package/lib/esm/index.d.ts +1 -0
  25. package/lib/esm/index.js +1 -4
  26. package/lib/esm/services/carbon.d.ts +3 -0
  27. package/lib/esm/services/carbon.js +84 -0
  28. package/lib/esm/services/snapshot.d.ts +5 -2
  29. package/lib/esm/services/snapshot.js +33 -4
  30. package/lib/esm/site.d.ts +1 -1
  31. package/lib/esm/site.js +9 -3
  32. package/lib/esm/store/index.d.ts +4 -1
  33. package/lib/esm/store/index.js +23 -45
  34. package/lib/esm/store/job.d.ts +5 -0
  35. package/lib/esm/store/migrate.d.ts +4 -0
  36. package/lib/esm/store/migrate.js +26 -0
  37. package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
  38. package/lib/esm/store/migrations/20250615-genesis.js +110 -0
  39. package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
  40. package/lib/esm/store/migrations/20250616-replace.js +36 -0
  41. package/lib/esm/store/snapshot.d.ts +2 -0
  42. package/lib/esm/store/snapshot.js +7 -0
  43. package/package.json +3 -2
package/lib/cjs/crawler.d.ts CHANGED
@@ -1,11 +1,15 @@
- import { JobState } from './store/job';
- import { SnapshotModel } from './store/snapshot';
- export declare function createCrawlQueue(): void;
+ import { Page } from '@blocklet/puppeteer';
+ import { JobState, SnapshotModel } from './store';
+ type PageHandler = {
+     handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
+     handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
+ };
+ export declare function createCrawlQueue(queue: string, handler?: PageHandler): any;
  export declare function getDataDir(): Promise<{
      htmlDir: string;
      screenshotDir: string;
  }>;
- export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, format, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState, handler?: PageHandler) => Promise<{
      html: string | null;
      screenshot: Uint8Array<ArrayBufferLike> | null;
      meta: {
@@ -18,4 +22,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
   * @param params
   * @param callback callback when job finished
   */
+ export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
  export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+ export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
+ export {};
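
The declaration changes above sum up the new extension surface: crawl queues are now named, and an optional page handler can override how a job's screenshot or HTML is produced. A minimal consumer sketch against this surface (the queue name and selector are illustrative, and since `PageHandler` itself is not exported, the handler is passed as a plain object):

```ts
import { createCrawlQueue, enqueue } from '@arcblock/crawler';

// Hypothetical handler: screenshot only the <main> element instead of the
// whole page. The shape mirrors the (unexported) PageHandler type above.
const queue = createCrawlQueue('articleCrawler', {
  handleScreenshot: async (page) => {
    const el = await page.$('main');
    return el ? Buffer.from(await el.screenshot()) : null;
  },
});

// enqueue() resolves with the generated jobId; the optional callback fires
// with the finished snapshot, or null when the job failed.
const jobId = await enqueue(queue, { url: 'https://example.com' }, (snapshot) => {
  console.log('crawl finished:', snapshot?.status);
});
```
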
package/lib/cjs/crawler.js CHANGED
@@ -15,7 +15,9 @@ Object.defineProperty(exports, "__esModule", { value: true });
  exports.getPageContent = void 0;
  exports.createCrawlQueue = createCrawlQueue;
  exports.getDataDir = getDataDir;
+ exports.enqueue = enqueue;
  exports.crawlUrl = crawlUrl;
+ exports.crawlCode = crawlCode;
  const queue_1 = __importDefault(require("@abtnode/queue"));
  const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
  const crypto_1 = require("crypto");
@@ -23,44 +25,44 @@ const fs_extra_1 = __importDefault(require("fs-extra"));
  const path_1 = __importDefault(require("path"));
  const config_1 = require("./config");
  const puppeteer_1 = require("./puppeteer");
+ const carbon_1 = require("./services/carbon");
  const snapshot_1 = require("./services/snapshot");
- const job_1 = require("./store/job");
- const snapshot_2 = require("./store/snapshot");
+ const store_1 = require("./store");
  const utils_1 = require("./utils");
  const { BaseState } = require('@abtnode/models');
- let crawlQueue;
- function createCrawlQueue() {
-     const db = new BaseState(job_1.Job);
-     crawlQueue = (0, queue_1.default)({
-         store: new sequelize_1.default(db, 'crawler'),
+ // eslint-disable-next-line import/no-mutable-exports
+ const crawlQueue = createCrawlQueue('urlCrawler');
+ const syncQueue = createCrawlQueue('syncCrawler');
+ const codeQueue = createCrawlQueue('codeCrawler', {
+     handleScreenshot: carbon_1.createCarbonImage,
+ });
+ function createCrawlQueue(queue, handler) {
+     const db = new BaseState(store_1.Job);
+     return (0, queue_1.default)({
+         store: new sequelize_1.default(db, queue),
          concurrency: config_1.config.concurrency,
          onJob: (job) => __awaiter(this, void 0, void 0, function* () {
              config_1.logger.info('Starting to execute crawl job', job);
-             const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
-             if (!canCrawl) {
-                 config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
-                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                     job,
-                     snapshot: {
-                         status: 'failed',
-                         error: 'Denied by robots.txt',
-                     },
-                 });
-                 yield snapshot_2.Snapshot.upsert(snapshot);
-                 return snapshot;
+             // check robots.txt
+             if (!job.ignoreRobots) {
+                 const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
+                 if (!canCrawl) {
+                     config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                     const snapshot = (0, snapshot_1.convertJobToSnapshot)({
+                         job,
+                         snapshot: {
+                             status: 'failed',
+                             error: 'Denied by robots.txt',
+                         },
+                     });
+                     yield store_1.Snapshot.upsert(snapshot);
+                     return snapshot;
+                 }
              }
-             // if index reach autoCloseBrowserCount, close browser
-             // try {
-             // if (index >= autoCloseBrowserCount) {
-             // await closeBrowser({ trimCache: false });
-             // }
-             // } catch (error) {
-             // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
-             // }
              const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
              try {
                  // get page content later
-                 const result = yield (0, exports.getPageContent)(formattedJob);
+                 const result = yield (0, exports.getPageContent)(formattedJob, handler);
                  if (!result || (!result.html && !result.screenshot)) {
                      config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                      const snapshot = (0, snapshot_1.convertJobToSnapshot)({
@@ -70,25 +72,43 @@ function createCrawlQueue() {
                          error: 'Failed to crawl content',
                      },
                  });
-                 yield snapshot_2.Snapshot.upsert(snapshot);
+                 yield store_1.Snapshot.upsert(snapshot);
                  return snapshot;
              }
-             // save html and screenshot to data dir
-             const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
-                 screenshot: result.screenshot,
-                 html: result.html,
-             });
-             // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
-             const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                 job: formattedJob,
-                 snapshot: {
-                     status: 'success',
-                     screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
-                     html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
-                     meta: result.meta,
-                 },
-             });
-             yield snapshot_2.Snapshot.upsert(snapshot);
+             const snapshot = yield store_1.sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
+                 // delete old snapshot
+                 if (formattedJob.replace) {
+                     try {
+                         const deletedJobIds = yield (0, snapshot_1.deleteSnapshots)({
+                             url: formattedJob.url,
+                             replace: true,
+                         }, { txn });
+                         if (deletedJobIds) {
+                             config_1.logger.info('Deleted old snapshot', { deletedJobIds });
+                         }
+                     }
+                     catch (error) {
+                         config_1.logger.error('Failed to delete old snapshot', { error, formattedJob });
+                     }
+                 }
+                 // save html and screenshot to data dir
+                 const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
+                     screenshot: result.screenshot,
+                     html: result.html,
+                     format: formattedJob.format,
+                 });
+                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
+                     job: formattedJob,
+                     snapshot: {
+                         status: 'success',
+                         screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
+                         html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
+                         meta: result.meta,
+                     },
+                 });
+                 yield store_1.Snapshot.upsert(snapshot, { transaction: txn });
+                 return snapshot;
+             }));
              return snapshot;
          }
          catch (error) {
@@ -100,7 +120,7 @@
                      error: 'Internal error',
                  },
              });
-             yield snapshot_2.Snapshot.upsert(snapshot);
+             yield store_1.Snapshot.upsert(snapshot);
              return snapshot;
          }
      }),
@@ -116,13 +136,13 @@ function getDataDir() {
      });
  }
  function saveSnapshotToLocal(_a) {
-     return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+     return __awaiter(this, arguments, void 0, function* ({ screenshot, html, format = 'webp', }) {
          const { htmlDir, screenshotDir } = yield getDataDir();
          let screenshotPath = null;
          let htmlPath = null;
          if (screenshot) {
              const hash = (0, utils_1.md5)(screenshot);
-             screenshotPath = path_1.default.join(screenshotDir, `${hash}.webp`);
+             screenshotPath = path_1.default.join(screenshotDir, `${hash}.${format}`);
              config_1.logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
              yield fs_extra_1.default.writeFile(screenshotPath, screenshot);
          }
@@ -138,7 +158,7 @@ function saveSnapshotToLocal(_a) {
      };
  });
  }
- const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
+ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
      const page = yield (0, puppeteer_1.initPage)();
      if (width && height) {
          yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -175,9 +195,18 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
      }
      // await for networkidle0
      // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
-     yield page.waitForNetworkIdle({
-         idleTime: 1.5 * 1000,
-     });
+     try {
+         yield Promise.all([
+             page.waitForNetworkIdle({
+                 idleTime: 1.5 * 1000,
+                 timeout,
+             }),
+             (0, utils_1.sleep)(waitTime),
+         ]);
+     }
+     catch (err) {
+         config_1.logger.warn(`Failed to wait for network idle in ${url}:`, err);
+     }
      // get screenshot
      if (includeScreenshot) {
          // Try to find the tallest element and set the browser to the same height
@@ -193,7 +222,9 @@
          }
      }
      try {
-         screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
+         screenshot = (handler === null || handler === void 0 ? void 0 : handler.handleScreenshot)
+             ? yield handler.handleScreenshot(page)
+             : yield page.screenshot({ fullPage, quality, type: format });
      }
      catch (err) {
          config_1.logger.error('Failed to get screenshot:', err);
@@ -221,12 +252,12 @@
          // check if the page is an error page
          const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
          if (isErrorPage) {
-             throw new Error('Page is an error page');
+             throw new Error(`${url} is an error page`);
          }
          meta.title = data.title;
          meta.description = data.description;
          if (includeHtml) {
-             html = data.html;
+             html = (handler === null || handler === void 0 ? void 0 : handler.handleHtml) ? yield handler.handleHtml(page) : data.html;
          }
      }
      catch (err) {
@@ -254,17 +285,17 @@ exports.getPageContent = getPageContent;
   * @param callback callback when job finished
   */
  // eslint-disable-next-line require-await
- function crawlUrl(params, callback) {
+ function enqueue(queue, params, callback) {
      return __awaiter(this, void 0, void 0, function* () {
          // skip duplicate job
-         const existsJob = yield job_1.Job.isExists(params);
-         if (existsJob) {
+         const existsJob = yield store_1.Job.isExists(params);
+         if (existsJob && !params.sync) {
              config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
              return existsJob.id;
          }
          config_1.logger.info('enqueue crawl job', params);
          const jobId = (0, crypto_1.randomUUID)();
-         const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+         const job = queue.push(Object.assign(Object.assign({}, params), { id: jobId }));
          job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
              config_1.logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
              callback === null || callback === void 0 ? void 0 : callback(result ? yield (0, snapshot_1.formatSnapshot)(result) : null);
@@ -276,3 +307,9 @@ function crawlUrl(params, callback) {
          return jobId;
      });
  }
+ function crawlUrl(params, callback) {
+     return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
+ }
+ function crawlCode(params, callback) {
+     return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
+ }
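
The routing above is the behavioral core of this release: `crawlUrl` keeps its old signature but now skips the duplicate-job check and routes to the `syncCrawler` queue when `params.sync` is set, while the new `crawlCode` always runs on `codeCrawler` with robots.txt checks disabled and screenshot-only output. Roughly, from the caller's side (URLs are illustrative):

```ts
import { crawlUrl, crawlCode } from '@arcblock/crawler';

// Default path: deduplicated, robots.txt enforced, 'urlCrawler' queue.
await crawlUrl({ url: 'https://example.com' });

// sync: true bypasses the duplicate check and uses the 'syncCrawler' queue.
await crawlUrl({ url: 'https://example.com', sync: true });

// Always 'codeCrawler'; ignoreRobots/includeScreenshot are forced defaults
// that the caller's params can still override (params are spread last).
await crawlCode({ url: 'https://carbon.example.com/?es=2x' });
```
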
package/lib/cjs/index.d.ts CHANGED
@@ -3,4 +3,5 @@ export * from './crawler';
  export * from './site';
  export * from './services/snapshot';
  export * as utils from './utils';
+ export { migrate } from './store/migrate';
  export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/cjs/index.js CHANGED
@@ -48,28 +48,26 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
      return (mod && mod.__esModule) ? mod : { "default": mod };
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.utils = void 0;
+ exports.migrate = exports.utils = void 0;
  exports.initCrawler = initCrawler;
  /* eslint-disable @typescript-eslint/indent */
  const merge_1 = __importDefault(require("lodash/merge"));
  const config_1 = require("./config");
- const crawler_1 = require("./crawler");
  const cron_1 = require("./cron");
  const puppeteer_1 = require("./puppeteer");
- const store_1 = require("./store");
  __exportStar(require("./crawler"), exports);
  __exportStar(require("./site"), exports);
  __exportStar(require("./services/snapshot"), exports);
  exports.utils = __importStar(require("./utils"));
+ var migrate_1 = require("./store/migrate");
+ Object.defineProperty(exports, "migrate", { enumerable: true, get: function () { return migrate_1.migrate; } });
  function initCrawler(params) {
      return __awaiter(this, void 0, void 0, function* () {
          var _a;
          (0, merge_1.default)(config_1.config, params);
          config_1.logger.info('Init crawler', { params, config: config_1.config });
          try {
-             yield (0, store_1.initDatabase)();
              yield (0, puppeteer_1.ensureBrowser)();
-             yield (0, crawler_1.createCrawlQueue)();
              if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                  yield (0, cron_1.initCron)();
              }
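
With `initDatabase()` and the explicit `createCrawlQueue()` call removed from `initCrawler`, queues are created at module load and schema setup moves to the newly exported `migrate`, backed by the two migrations added under `store/migrations`. A host app would presumably run it before `initCrawler`; a sketch, assuming `migrate()` takes no arguments (its actual signature is in `store/migrate.d.ts`, which this diff lists but does not show):

```ts
import { initCrawler, migrate } from '@arcblock/crawler';

async function start() {
  // Bring the Job/Snapshot tables up to date (20250615-genesis,
  // 20250616-replace) before any queue touches them.
  await migrate();
  await initCrawler({ concurrency: 2 });
}

start().catch(console.error);
```
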
package/lib/cjs/services/carbon.d.ts ADDED
@@ -0,0 +1,3 @@
+ import { Page } from '@blocklet/puppeteer';
+ import { JobState } from '../store';
+ export declare function createCarbonImage(page: Page, params?: JobState): Promise<Buffer<ArrayBuffer>>;
package/lib/cjs/services/carbon.js ADDED
@@ -0,0 +1,87 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+     return new (P || (P = Promise))(function (resolve, reject) {
+         function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+         function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+         function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+         step((generator = generator.apply(thisArg, _arguments || [])).next());
+     });
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.createCarbonImage = createCarbonImage;
+ const config_1 = require("../config");
+ // TODO expose local version of dom-to-image
+ const DOM_TO_IMAGE_URL = 'https://unpkg.com/dom-to-image@2.6.0/dist/dom-to-image.min.js';
+ function createCarbonImage(page, params) {
+     return __awaiter(this, void 0, void 0, function* () {
+         try {
+             yield page.addScriptTag({ url: DOM_TO_IMAGE_URL });
+             yield page.waitForSelector('.export-container', { visible: true, timeout: (params === null || params === void 0 ? void 0 : params.timeout) || 120 });
+             const targetElement = yield page.$('.export-container');
+             const format = (params === null || params === void 0 ? void 0 : params.format) || 'png';
+             const dataUrl = yield page.evaluate((target = document, imageFormat = 'png') => {
+                 const query = new URLSearchParams(document.location.search);
+                 const EXPORT_SIZES_HASH = {
+                     '1x': '1',
+                     '2x': '2',
+                     '4x': '4',
+                 };
+                 const exportSize = EXPORT_SIZES_HASH[query.get('es')] || '2';
+                 if (!target) {
+                     throw new Error('Target element not found');
+                 }
+                 target.querySelectorAll('span[role="presentation"]').forEach((node) => {
+                     var _a;
+                     const el = node;
+                     if (el && el.innerText && el.innerText.match(/%[A-Fa-f0-9]{2}/)) {
+                         (_a = el.innerText.match(/%[A-Fa-f0-9]{2}/g)) === null || _a === void 0 ? void 0 : _a.forEach((t) => {
+                             el.innerHTML = el.innerHTML.replace(t, encodeURIComponent(t));
+                         });
+                     }
+                 });
+                 const width = target.offsetWidth * exportSize;
+                 const height = query.get('si') === 'true'
+                     ? target.offsetWidth * exportSize
+                     : target.offsetHeight * exportSize;
+                 const config = {
+                     style: {
+                         transform: `scale(${exportSize})`,
+                         'transform-origin': 'center',
+                         background: query.get('si') ? query.get('bg') : 'none',
+                     },
+                     filter: (n) => {
+                         if (n.className) {
+                             return String(n.className).indexOf('eliminateOnRender') < 0;
+                         }
+                         return true;
+                     },
+                     width,
+                     height,
+                 };
+                 switch (imageFormat) {
+                     case 'jpeg':
+                         // @ts-ignore: domtoimage is injected by addScriptTag
+                         return domtoimage.toJpeg(target, config);
+                     case 'webp':
+                         // dom-to-image doesn't support webp directly, fall back to png
+                         // @ts-ignore: domtoimage is injected by addScriptTag
+                         return domtoimage.toPng(target, config);
+                     case 'png':
+                     default:
+                         // @ts-ignore: domtoimage is injected by addScriptTag
+                         return domtoimage.toPng(target, config);
+                 }
+             }, targetElement, format);
+             const base64Data = dataUrl.split(',')[1];
+             if (!base64Data) {
+                 throw new Error('Failed to extract base64 data from image');
+             }
+             return Buffer.from(base64Data, 'base64');
+         }
+         catch (e) {
+             config_1.logger.error('failed to crawl from carbon', { error: e });
+             throw e;
+         }
+     });
+ }
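
`createCarbonImage` is what the `codeCrawler` queue registers as its `handleScreenshot`: instead of `page.screenshot`, it injects dom-to-image into the page, waits for an `.export-container` element, and reads export options from the crawled page's own query string (`es` for export size, `si` for square image, `bg` for background). A hedged usage sketch through `crawlCode` (the host URL is illustrative; the target page must render an `.export-container`):

```ts
import { crawlCode } from '@arcblock/crawler';

// Export options travel in the page URL itself, not in the job params;
// format supports 'jpeg' and 'png', while 'webp' falls back to png here.
await crawlCode(
  { url: 'https://carbon.example.com/?es=2x&si=true&bg=%23FFFFFF', format: 'png' },
  (snapshot) => console.log('screenshot path:', snapshot?.screenshot),
);
```
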
package/lib/cjs/services/snapshot.d.ts CHANGED
@@ -1,5 +1,5 @@
- import { JobState } from '../store/job';
- import { SnapshotModel } from '../store/snapshot';
+ import { Transaction, WhereOptions } from '@sequelize/core';
+ import { JobState, SnapshotModel } from '../store';
  export declare function convertJobToSnapshot({ job, snapshot }: {
      job: JobState;
      snapshot?: Partial<SnapshotModel>;
@@ -10,3 +10,6 @@ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<
   */
  export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
  export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
+ export declare function deleteSnapshots(where: WhereOptions<SnapshotModel>, { txn }?: {
+     txn?: Transaction;
+ }): Promise<string[]>;
package/lib/cjs/services/snapshot.js CHANGED
@@ -16,17 +16,17 @@ exports.convertJobToSnapshot = convertJobToSnapshot;
  exports.formatSnapshot = formatSnapshot;
  exports.getSnapshot = getSnapshot;
  exports.getLatestSnapshot = getLatestSnapshot;
+ exports.deleteSnapshots = deleteSnapshots;
  const cloneDeep_1 = __importDefault(require("lodash/cloneDeep"));
  const pick_1 = __importDefault(require("lodash/pick"));
  const promises_1 = __importDefault(require("node:fs/promises"));
  const node_path_1 = __importDefault(require("node:path"));
  const ufo_1 = require("ufo");
  const config_1 = require("../config");
- const job_1 = require("../store/job");
- const snapshot_1 = require("../store/snapshot");
+ const store_1 = require("../store");
  const utils_1 = require("../utils");
  function convertJobToSnapshot({ job, snapshot }) {
-     return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), options: {
+     return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), replace: job.replace, options: {
          width: job.width,
          height: job.height,
          includeScreenshot: job.includeScreenshot,
@@ -64,11 +64,11 @@ function formatSnapshot(snapshot, columns) {
   */
  function getSnapshot(jobId) {
      return __awaiter(this, void 0, void 0, function* () {
-         const snapshot = yield snapshot_1.Snapshot.findSnapshot({ where: { jobId } });
+         const snapshot = yield store_1.Snapshot.findSnapshot({ where: { jobId } });
          if (snapshot) {
              return formatSnapshot(snapshot);
          }
-         const job = yield job_1.Job.findJob({ id: jobId });
+         const job = yield store_1.Job.findJob({ id: jobId });
          if (job) {
              return {
                  jobId,
@@ -80,12 +80,42 @@ function getSnapshot(jobId) {
  }
  function getLatestSnapshot(url) {
      return __awaiter(this, void 0, void 0, function* () {
-         const snapshot = yield snapshot_1.Snapshot.findSnapshot({
+         const snapshot = yield store_1.Snapshot.findSnapshot({
              where: {
                  url: (0, utils_1.formatUrl)(url),
                  status: 'success',
              },
+             order: [
+                 ['lastModified', 'DESC'],
+                 ['updatedAt', 'DESC'],
+             ],
          });
          return snapshot ? formatSnapshot(snapshot) : null;
      });
  }
+ function deleteSnapshots(where_1) {
+     return __awaiter(this, arguments, void 0, function* (where, { txn } = {}) {
+         const snapshots = yield store_1.Snapshot.findAll({
+             where,
+             order: [
+                 ['lastModified', 'DESC'],
+                 ['updatedAt', 'DESC'],
+             ],
+         });
+         const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
+             try {
+                 yield Promise.all([
+                     snapshot.html && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.html)),
+                     snapshot.screenshot && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.screenshot)),
+                 ]);
+                 yield snapshot.destroy({ transaction: txn });
+                 return snapshot.jobId;
+             }
+             catch (error) {
+                 config_1.logger.error('Failed to delete snapshot', { error, snapshot });
+                 throw error;
+             }
+         })));
+         return jobIds.filter(Boolean);
+     });
+ }
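
`deleteSnapshots` unlinks each matching snapshot's stored HTML and screenshot files before destroying the row, optionally inside a caller-supplied transaction, and resolves with the deleted `jobId`s. Since `services/snapshot` is re-exported from the package root, a minimal call might look like this (the where clause is illustrative; omitting `{ txn }` runs outside any transaction):

```ts
import { deleteSnapshots } from '@arcblock/crawler';

// Remove every stored snapshot of one URL, files and rows alike.
const jobIds = await deleteSnapshots({ url: 'https://example.com/page' });
console.log('removed snapshots:', jobIds);
```
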
package/lib/cjs/site.d.ts CHANGED
@@ -1,2 +1,2 @@
  import { Site } from './config';
- export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(`${string}-${string}-${string}-${string}-${string}` | null)[]>;
package/lib/cjs/site.js CHANGED
@@ -14,12 +14,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.crawlSite = void 0;
  const uniq_1 = __importDefault(require("lodash/uniq"));
+ const node_crypto_1 = require("node:crypto");
  const p_map_1 = __importDefault(require("p-map"));
  const config_1 = require("./config");
  const crawler_1 = require("./crawler");
- const snapshot_1 = require("./store/snapshot");
+ const store_1 = require("./store");
  const utils_1 = require("./utils");
  const crawlBlockletRunningMap = new Map();
+ const crawlQueue = (0, crawler_1.createCrawlQueue)('cronJobs');
  function parseSitemapUrl(sitemapItem) {
      var _a;
      const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
@@ -48,7 +50,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
      try {
          const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
              processCount++;
-             const snapshot = yield snapshot_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
+             const snapshot = yield store_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
              if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
                  const lastModified = new Date(snapshot.lastModified);
                  // skip if snapshot lastModified is greater than sitemap lastmod
@@ -67,12 +69,16 @@
                  url,
              });
              crawlCount++;
-             return (0, crawler_1.crawlUrl)({
+             const jobId = (0, node_crypto_1.randomUUID)();
+             crawlQueue.push({
+                 id: jobId,
                  url,
                  lastModified: sitemapItem.lastmod,
                  includeScreenshot: false,
                  includeHtml: true,
+                 replace: true,
              });
+             return jobId;
          }), { concurrency: ((_b = config_1.config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
          config_1.logger.info('Enqueued jobs from sitemap finished', {
              url,
package/lib/cjs/store/index.d.ts CHANGED
@@ -1,3 +1,6 @@
  import { Sequelize } from '@sequelize/core';
  import { SqliteDialect } from '@sequelize/sqlite3';
- export declare function initDatabase(): Promise<Sequelize<SqliteDialect>>;
+ declare const sequelize: Sequelize<SqliteDialect>;
+ export { sequelize };
+ export * from './job';
+ export * from './snapshot';