@arcblock/crawler 1.1.5 → 1.2.0

This diff shows the changes between these two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (46)
  1. package/README.md +1 -2
  2. package/lib/cjs/config.d.ts +9 -3
  3. package/lib/cjs/config.js +2 -10
  4. package/lib/cjs/crawler.d.ts +3 -4
  5. package/lib/cjs/crawler.js +74 -48
  6. package/lib/cjs/cron.js +5 -0
  7. package/lib/cjs/index.d.ts +2 -4
  8. package/lib/cjs/index.js +6 -6
  9. package/lib/cjs/services/snapshot.d.ts +5 -2
  10. package/lib/cjs/services/snapshot.js +44 -7
  11. package/lib/cjs/site.d.ts +1 -1
  12. package/lib/cjs/site.js +11 -4
  13. package/lib/cjs/store/index.d.ts +4 -1
  14. package/lib/cjs/store/index.js +37 -45
  15. package/lib/cjs/store/job.d.ts +6 -1
  16. package/lib/cjs/store/migrate.d.ts +4 -0
  17. package/lib/cjs/store/migrate.js +63 -0
  18. package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
  19. package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
  20. package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
  21. package/lib/cjs/store/migrations/20250616-replace.js +40 -0
  22. package/lib/cjs/store/snapshot.d.ts +8 -0
  23. package/lib/cjs/store/snapshot.js +7 -0
  24. package/lib/esm/config.d.ts +9 -3
  25. package/lib/esm/config.js +2 -10
  26. package/lib/esm/crawler.d.ts +3 -4
  27. package/lib/esm/crawler.js +71 -45
  28. package/lib/esm/cron.js +5 -0
  29. package/lib/esm/index.d.ts +2 -4
  30. package/lib/esm/index.js +4 -5
  31. package/lib/esm/services/snapshot.d.ts +5 -2
  32. package/lib/esm/services/snapshot.js +41 -5
  33. package/lib/esm/site.d.ts +1 -1
  34. package/lib/esm/site.js +11 -4
  35. package/lib/esm/store/index.d.ts +4 -1
  36. package/lib/esm/store/index.js +23 -45
  37. package/lib/esm/store/job.d.ts +6 -1
  38. package/lib/esm/store/migrate.d.ts +4 -0
  39. package/lib/esm/store/migrate.js +26 -0
  40. package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
  41. package/lib/esm/store/migrations/20250615-genesis.js +110 -0
  42. package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
  43. package/lib/esm/store/migrations/20250616-replace.js +36 -0
  44. package/lib/esm/store/snapshot.d.ts +8 -0
  45. package/lib/esm/store/snapshot.js +7 -0
  46. package/package.json +3 -2
package/README.md CHANGED
@@ -43,8 +43,7 @@ await initCrawler({
  immediate: !!env.preferences.cronImmediate,
  sites: env.preferences.cronSites,
  time: env.preferences.cronTime,
- crawlConcurrency: env.preferences.crawlConcurrency,
- sitemapConcurrency: env.preferences.sitemapConcurrency,
+ concurrency: env.preferences.concurrency,
  },
  });
  ```
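For context, a minimal sketch of the updated call under the new option name. The `env.preferences` fields come from the README snippet above; the `enabled` flag and the surrounding `siteCron` key are taken from the `Config` type changed below, and the `env` declaration is a placeholder:

```ts
import { initCrawler } from '@arcblock/crawler';

// Placeholder for the blocklet environment object the README snippet assumes.
declare const env: { preferences: Record<string, any> };

await initCrawler({
  siteCron: {
    enabled: true,
    immediate: !!env.preferences.cronImmediate,
    sites: env.preferences.cronSites,
    time: env.preferences.cronTime,
    // one option replaces the old crawlConcurrency / sitemapConcurrency pair
    concurrency: env.preferences.concurrency,
  },
});
```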
package/lib/cjs/config.d.ts CHANGED
@@ -1,3 +1,4 @@
+ import { CookieParam } from '@blocklet/puppeteer';
  export type Site = {
  url: string;
  pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
  appUrl: string;
  cacheDir: string;
  puppeteerPath?: string;
- siteCron: {
+ concurrency: number;
+ siteCron?: {
  sites: Site[];
  time: string;
  enabled: boolean;
  immediate: boolean;
- crawlConcurrency: number;
- sitemapConcurrency: number;
+ concurrency: number;
  };
+ cookies?: CookieParam[];
+ localStorage?: {
+ key: string;
+ value: string;
+ }[];
  };
  export declare const logger: any;
  export declare const config: Config;
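The new optional `cookies` and `localStorage` fields let callers inject page state for every crawl. A hedged sketch with hypothetical values — the two localStorage keys shown were previously hardcoded in crawler.js, and per the crawler.js diff below a value of `'now()'` is swapped for the current ISO timestamp at page load:

```ts
import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  concurrency: 2,
  // cookie domain/path default to the target URL's hostname and '/' (see crawler.js below)
  cookies: [{ name: 'session', value: 'abc123' }],
  localStorage: [
    { key: 'blocklet_theme_prefer', value: 'light' },
    // 'now()' is replaced with new Date().toISOString() when the page loads
    { key: 'domain-warning-skip', value: 'now()' },
  ],
});
```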
package/lib/cjs/config.js CHANGED
@@ -9,17 +9,9 @@ exports.logger = (0, logger_1.default)('@arcblock/crawler', { level: process.env
  exports.config = {
  isProd: process.env.NODE_ENV === 'production',
  dataDir: process.env.BLOCKLET_DATA_DIR,
- appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
  cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+ appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
  appUrl: process.env.BLOCKLET_APP_URL || '/',
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
- // cron
- siteCron: {
- sites: [],
- enabled: true,
- time: '0 0 0 * * *',
- immediate: false,
- crawlConcurrency: 2,
- sitemapConcurrency: 30,
- },
+ concurrency: 2,
  };
package/lib/cjs/crawler.d.ts CHANGED
@@ -1,11 +1,10 @@
- import { JobState } from './store/job';
- import { SnapshotModel } from './store/snapshot';
- export declare function createCrawlQueue(): void;
+ import { JobState, SnapshotModel } from './store';
+ export declare function createCrawlQueue(queue: string): any;
  export declare function getDataDir(): Promise<{
  htmlDir: string;
  screenshotDir: string;
  }>;
- export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
  html: string | null;
  screenshot: Uint8Array<ArrayBufferLike> | null;
  meta: {
package/lib/cjs/crawler.js CHANGED
@@ -24,16 +24,16 @@ const path_1 = __importDefault(require("path"));
  const config_1 = require("./config");
  const puppeteer_1 = require("./puppeteer");
  const snapshot_1 = require("./services/snapshot");
- const job_1 = require("./store/job");
- const snapshot_2 = require("./store/snapshot");
+ const store_1 = require("./store");
  const utils_1 = require("./utils");
  const { BaseState } = require('@abtnode/models');
- let crawlQueue;
- function createCrawlQueue() {
- const db = new BaseState(job_1.Job);
- crawlQueue = (0, queue_1.default)({
- store: new sequelize_1.default(db, 'crawler'),
- concurrency: config_1.config.siteCron.crawlConcurrency,
+ // eslint-disable-next-line import/no-mutable-exports
+ const crawlQueue = createCrawlQueue('urlCrawler');
+ function createCrawlQueue(queue) {
+ const db = new BaseState(store_1.Job);
+ return (0, queue_1.default)({
+ store: new sequelize_1.default(db, queue),
+ concurrency: config_1.config.concurrency,
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
  config_1.logger.info('Starting to execute crawl job', job);
  const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
@@ -46,7 +46,7 @@ function createCrawlQueue() {
  error: 'Denied by robots.txt',
  },
  });
- yield snapshot_2.Snapshot.upsert(snapshot);
+ yield store_1.Snapshot.upsert(snapshot);
  return snapshot;
  }
  // if index reach autoCloseBrowserCount, close browser
@@ -57,54 +57,67 @@ function createCrawlQueue() {
  // } catch (error) {
  // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
  // }
+ const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
  try {
  // get page content later
- const result = yield (0, exports.getPageContent)(Object.assign({ localStorage: {
- // for blocklet theme
- blocklet_theme_prefer: 'light',
- // for blocklet domain warning
- 'domain-warning-skip': Date.now().toString(),
- } }, job));
+ const result = yield (0, exports.getPageContent)(formattedJob);
  if (!result || (!result.html && !result.screenshot)) {
- config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
+ config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
  const snapshot = (0, snapshot_1.convertJobToSnapshot)({
- job,
+ job: formattedJob,
  snapshot: {
  status: 'failed',
  error: 'Failed to crawl content',
  },
  });
- yield snapshot_2.Snapshot.upsert(snapshot);
+ yield store_1.Snapshot.upsert(snapshot);
  return snapshot;
  }
- // save html and screenshot to data dir
- const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
- screenshot: result.screenshot,
- html: result.html,
- });
- // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
- const snapshot = (0, snapshot_1.convertJobToSnapshot)({
- job,
- snapshot: {
- status: 'success',
- screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
- html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
- meta: result.meta,
- },
- });
- yield snapshot_2.Snapshot.upsert(snapshot);
+ const snapshot = yield store_1.sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
+ // delete old snapshot
+ if (formattedJob.replace) {
+ try {
+ const deletedJobIds = yield (0, snapshot_1.deleteSnapshots)({
+ url: formattedJob.url,
+ replace: true,
+ }, { txn });
+ if (deletedJobIds) {
+ config_1.logger.info('Deleted old snapshot', { deletedJobIds });
+ }
+ }
+ catch (error) {
+ config_1.logger.error('Failed to delete old snapshot', { error, formattedJob });
+ }
+ }
+ // save html and screenshot to data dir
+ const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
+ screenshot: result.screenshot,
+ html: result.html,
+ });
+ const snapshot = (0, snapshot_1.convertJobToSnapshot)({
+ job: formattedJob,
+ snapshot: {
+ status: 'success',
+ screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
+ html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
+ meta: result.meta,
+ },
+ });
+ yield store_1.Snapshot.upsert(snapshot, { transaction: txn });
+ return snapshot;
+ }));
  return snapshot;
  }
  catch (error) {
- config_1.logger.error(`Failed to crawl ${job.url}`, { error, job });
+ config_1.logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
  const snapshot = (0, snapshot_1.convertJobToSnapshot)({
- job,
+ job: formattedJob,
  snapshot: {
  status: 'failed',
  error: 'Internal error',
  },
  });
- yield snapshot_2.Snapshot.upsert(snapshot);
+ yield store_1.Snapshot.upsert(snapshot);
  return snapshot;
  }
  }),
@@ -142,7 +155,7 @@ function saveSnapshotToLocal(_a) {
  };
  });
  }
- const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
+ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }) {
  const page = yield (0, puppeteer_1.initPage)();
  if (width && height) {
  yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -150,13 +163,18 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  if (headers) {
  yield page.setExtraHTTPHeaders(headers);
  }
- if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
- yield page.setCookie(...cookies);
+ // handle cookies
+ if (cookies) {
+ const { hostname } = new URL(url);
+ const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+ yield page.setCookie(...cookieParams);
  }
+ // handle localStorage
  if (localStorage) {
  yield page.evaluateOnNewDocument((items) => {
- Object.entries(items).forEach(([key, value]) => {
- window.localStorage.setItem(key, value);
+ items.forEach((item) => {
+ const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+ window.localStorage.setItem(item.key, value);
  });
  }, localStorage);
  }
@@ -174,9 +192,18 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  }
  // await for networkidle0
  // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
- yield page.waitForNetworkIdle({
- idleTime: 1.5 * 1000,
- });
+ try {
+ yield Promise.all([
+ page.waitForNetworkIdle({
+ idleTime: 1.5 * 1000,
+ timeout,
+ }),
+ (0, utils_1.sleep)(waitTime),
+ ]);
+ }
+ catch (err) {
+ config_1.logger.warn(`Failed to wait for network idle in ${url}:`, err);
+ }
  // get screenshot
  if (includeScreenshot) {
  // Try to find the tallest element and set the browser to the same height
@@ -220,7 +247,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  // check if the page is an error page
  const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
  if (isErrorPage) {
- throw new Error('Page is an error page');
+ throw new Error(`${url} is an error page`);
  }
  meta.title = data.title;
  meta.description = data.description;
@@ -255,9 +282,8 @@ exports.getPageContent = getPageContent;
  // eslint-disable-next-line require-await
  function crawlUrl(params, callback) {
  return __awaiter(this, void 0, void 0, function* () {
- params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
  // skip duplicate job
- const existsJob = yield job_1.Job.isExists(params);
+ const existsJob = yield store_1.Job.isExists(params);
  if (existsJob) {
  config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
  return existsJob.id;
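The cookie handling above fills in missing fields from the target URL before calling page.setCookie. A standalone sketch of that normalization; the `puppeteer` import path is an assumption (the package itself imports CookieParam from '@blocklet/puppeteer'):

```ts
import type { CookieParam } from 'puppeteer';

// Mirrors getPageContent above: default domain to the page's hostname,
// default path to '/', and coerce expires to a numeric timestamp.
function normalizeCookies(url: string, cookies: CookieParam[]): CookieParam[] {
  const { hostname } = new URL(url);
  return cookies.map((item) => ({
    ...item,
    expires: item.expires ? new Date(item.expires).getTime() : undefined,
    domain: item.domain || hostname,
    path: item.path || '/',
  }));
}
```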
package/lib/cjs/cron.js CHANGED
@@ -20,6 +20,8 @@ let cron = null;
  function initCron() {
  if (cron)
  return;
+ if (!config_1.config.siteCron)
+ return;
  config_1.logger.info('Init cron', { config: config_1.config.siteCron });
  cron = cron_1.default.init({
  context: {},
@@ -29,6 +31,9 @@ function initCron() {
  time: config_1.config.siteCron.time,
  options: { runOnInit: config_1.config.siteCron.immediate },
  fn: () => __awaiter(this, void 0, void 0, function* () {
+ var _a;
+ if (!((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+ return;
  config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
  for (const site of config_1.config.siteCron.sites) {
  try {
package/lib/cjs/index.d.ts CHANGED
@@ -3,7 +3,5 @@ export * from './crawler';
  export * from './site';
  export * from './services/snapshot';
  export * as utils from './utils';
- type DeepPartial<T> = T extends object ? {
- [P in keyof T]?: DeepPartial<T[P]>;
- } : T;
- export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+ export { migrate } from './store/migrate';
+ export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
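Since `initCrawler` no longer initializes the database itself (see index.js below) and the package now exports `migrate`, a plausible startup sequence is to run migrations first. A sketch under that assumption:

```ts
import { initCrawler, migrate } from '@arcblock/crawler';

await migrate(); // apply pending schema migrations (umzug)
await initCrawler({ concurrency: 2 });
```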
package/lib/cjs/index.js CHANGED
@@ -48,27 +48,27 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
  return (mod && mod.__esModule) ? mod : { "default": mod };
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.utils = void 0;
+ exports.migrate = exports.utils = void 0;
  exports.initCrawler = initCrawler;
+ /* eslint-disable @typescript-eslint/indent */
  const merge_1 = __importDefault(require("lodash/merge"));
  const config_1 = require("./config");
- const crawler_1 = require("./crawler");
  const cron_1 = require("./cron");
  const puppeteer_1 = require("./puppeteer");
- const store_1 = require("./store");
  __exportStar(require("./crawler"), exports);
  __exportStar(require("./site"), exports);
  __exportStar(require("./services/snapshot"), exports);
  exports.utils = __importStar(require("./utils"));
+ var migrate_1 = require("./store/migrate");
+ Object.defineProperty(exports, "migrate", { enumerable: true, get: function () { return migrate_1.migrate; } });
  function initCrawler(params) {
  return __awaiter(this, void 0, void 0, function* () {
+ var _a;
  (0, merge_1.default)(config_1.config, params);
  config_1.logger.info('Init crawler', { params, config: config_1.config });
  try {
- yield (0, store_1.initDatabase)();
  yield (0, puppeteer_1.ensureBrowser)();
- yield (0, crawler_1.createCrawlQueue)();
- if (config_1.config.siteCron.enabled) {
+ if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
  yield (0, cron_1.initCron)();
  }
  }
package/lib/cjs/services/snapshot.d.ts CHANGED
@@ -1,5 +1,5 @@
- import { JobState } from '../store/job';
- import { SnapshotModel } from '../store/snapshot';
+ import { Transaction, WhereOptions } from '@sequelize/core';
+ import { JobState, SnapshotModel } from '../store';
  export declare function convertJobToSnapshot({ job, snapshot }: {
  job: JobState;
  snapshot?: Partial<SnapshotModel>;
@@ -10,3 +10,6 @@ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<
  */
  export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
  export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
+ export declare function deleteSnapshots(where: WhereOptions<SnapshotModel>, { txn }?: {
+ txn?: Transaction;
+ }): Promise<string[]>;
package/lib/cjs/services/snapshot.js CHANGED
@@ -16,16 +16,17 @@ exports.convertJobToSnapshot = convertJobToSnapshot;
  exports.formatSnapshot = formatSnapshot;
  exports.getSnapshot = getSnapshot;
  exports.getLatestSnapshot = getLatestSnapshot;
+ exports.deleteSnapshots = deleteSnapshots;
+ const cloneDeep_1 = __importDefault(require("lodash/cloneDeep"));
  const pick_1 = __importDefault(require("lodash/pick"));
  const promises_1 = __importDefault(require("node:fs/promises"));
  const node_path_1 = __importDefault(require("node:path"));
  const ufo_1 = require("ufo");
  const config_1 = require("../config");
- const job_1 = require("../store/job");
- const snapshot_1 = require("../store/snapshot");
+ const store_1 = require("../store");
  const utils_1 = require("../utils");
  function convertJobToSnapshot({ job, snapshot }) {
- return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), options: {
+ return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), replace: job.replace, options: {
  width: job.width,
  height: job.height,
  includeScreenshot: job.includeScreenshot,
@@ -36,7 +37,7 @@ function convertJobToSnapshot({ job, snapshot }) {
  }
  function formatSnapshot(snapshot, columns) {
  return __awaiter(this, void 0, void 0, function* () {
- let data = Object.assign({}, snapshot);
+ let data = (0, cloneDeep_1.default)(snapshot);
  // format screenshot path to full url
  if (data.screenshot) {
  data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
@@ -46,6 +47,12 @@ function formatSnapshot(snapshot, columns) {
  const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
  data.html = html.toString();
  }
+ // remove sensitive options that should not be returned
+ if (data.options) {
+ delete data.options.cookies;
+ delete data.options.localStorage;
+ delete data.options.headers;
+ }
  if (columns === null || columns === void 0 ? void 0 : columns.length) {
  data = (0, pick_1.default)(data, columns);
  }
@@ -57,11 +64,11 @@ function formatSnapshot(snapshot, columns) {
  */
  function getSnapshot(jobId) {
  return __awaiter(this, void 0, void 0, function* () {
- const snapshot = yield snapshot_1.Snapshot.findSnapshot({ where: { jobId } });
+ const snapshot = yield store_1.Snapshot.findSnapshot({ where: { jobId } });
  if (snapshot) {
  return formatSnapshot(snapshot);
  }
- const job = yield job_1.Job.findJob({ id: jobId });
+ const job = yield store_1.Job.findJob({ id: jobId });
  if (job) {
  return {
  jobId,
@@ -73,12 +80,42 @@ function getSnapshot(jobId) {
  }
  function getLatestSnapshot(url) {
  return __awaiter(this, void 0, void 0, function* () {
- const snapshot = yield snapshot_1.Snapshot.findSnapshot({
+ const snapshot = yield store_1.Snapshot.findSnapshot({
  where: {
  url: (0, utils_1.formatUrl)(url),
  status: 'success',
  },
+ order: [
+ ['lastModified', 'DESC'],
+ ['updatedAt', 'DESC'],
+ ],
  });
  return snapshot ? formatSnapshot(snapshot) : null;
  });
  }
+ function deleteSnapshots(where_1) {
+ return __awaiter(this, arguments, void 0, function* (where, { txn } = {}) {
+ const snapshots = yield store_1.Snapshot.findAll({
+ where,
+ order: [
+ ['lastModified', 'DESC'],
+ ['updatedAt', 'DESC'],
+ ],
+ });
+ const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
+ try {
+ yield Promise.all([
+ snapshot.html && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.html)),
+ snapshot.screenshot && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.screenshot)),
+ ]);
+ yield snapshot.destroy({ transaction: txn });
+ return snapshot.jobId;
+ }
+ catch (error) {
+ config_1.logger.error('Failed to delete snapshot', { error, snapshot });
+ throw error;
+ }
+ })));
+ return jobIds.filter(Boolean);
+ });
+ }
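A hypothetical call to the new `deleteSnapshots` helper, which removes matching rows, unlinks their html/screenshot files, and returns the affected job ids; pass `{ txn }` to join an outer transaction, as crawler.js does:

```ts
import { deleteSnapshots } from '@arcblock/crawler';

// where-clause fields follow the SnapshotModel columns seen above
const jobIds = await deleteSnapshots({ url: 'https://example.com/' });
console.log('deleted snapshots for jobs:', jobIds);
```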
package/lib/cjs/site.d.ts CHANGED
@@ -1,2 +1,2 @@
  import { Site } from './config';
- export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(`${string}-${string}-${string}-${string}-${string}` | null)[]>;
package/lib/cjs/site.js CHANGED
@@ -14,12 +14,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.crawlSite = void 0;
  const uniq_1 = __importDefault(require("lodash/uniq"));
+ const node_crypto_1 = require("node:crypto");
  const p_map_1 = __importDefault(require("p-map"));
  const config_1 = require("./config");
  const crawler_1 = require("./crawler");
- const snapshot_1 = require("./store/snapshot");
+ const store_1 = require("./store");
  const utils_1 = require("./utils");
  const crawlBlockletRunningMap = new Map();
+ const crawlQueue = (0, crawler_1.createCrawlQueue)('cronJobs');
  function parseSitemapUrl(sitemapItem) {
  var _a;
  const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
@@ -27,6 +29,7 @@ function parseSitemapUrl(sitemapItem) {
  return urls.map((url) => ({ url, sitemapItem }));
  }
  const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+ var _b;
  config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
  const key = `${url}-${pathname}`;
  if (crawlBlockletRunningMap.has(key)) {
@@ -47,7 +50,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
  try {
  const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
  processCount++;
- const snapshot = yield snapshot_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
+ const snapshot = yield store_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
  if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
  const lastModified = new Date(snapshot.lastModified);
  // skip if snapshot lastModified is greater than sitemap lastmod
@@ -66,13 +69,17 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
  url,
  });
  crawlCount++;
- return (0, crawler_1.crawlUrl)({
+ const jobId = (0, node_crypto_1.randomUUID)();
+ crawlQueue.push({
+ id: jobId,
  url,
  lastModified: sitemapItem.lastmod,
  includeScreenshot: false,
  includeHtml: true,
+ replace: true,
  });
- }), { concurrency: config_1.config.siteCron.sitemapConcurrency });
+ return jobId;
+ }), { concurrency: ((_b = config_1.config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
  config_1.logger.info('Enqueued jobs from sitemap finished', {
  url,
  pathname,
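With this change, `crawlSite` enqueues jobs on its own 'cronJobs' queue and resolves with the generated UUID job ids (null for skipped URLs). A hypothetical call; the sitemap pathname value is an assumption:

```ts
import { crawlSite } from '@arcblock/crawler';

const jobIds = await crawlSite({ url: 'https://example.com', pathname: '/sitemap.xml' });
console.log('enqueued cron jobs:', jobIds.filter(Boolean));
```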
package/lib/cjs/store/index.d.ts CHANGED
@@ -1,3 +1,6 @@
  import { Sequelize } from '@sequelize/core';
  import { SqliteDialect } from '@sequelize/sqlite3';
- export declare function initDatabase(): Promise<Sequelize<SqliteDialect>>;
+ declare const sequelize: Sequelize<SqliteDialect>;
+ export { sequelize };
+ export * from './job';
+ export * from './snapshot';
package/lib/cjs/store/index.js CHANGED
@@ -1,57 +1,49 @@
  "use strict";
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
- return new (P || (P = Promise))(function (resolve, reject) {
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
- step((generator = generator.apply(thisArg, _arguments || [])).next());
- });
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ var desc = Object.getOwnPropertyDescriptor(m, k);
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+ desc = { enumerable: true, get: function() { return m[k]; } };
+ }
+ Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ o[k2] = m[k];
+ }));
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
  };
  var __importDefault = (this && this.__importDefault) || function (mod) {
  return (mod && mod.__esModule) ? mod : { "default": mod };
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.initDatabase = initDatabase;
+ exports.sequelize = void 0;
  const core_1 = require("@sequelize/core");
  const sqlite3_1 = require("@sequelize/sqlite3");
  const path_1 = __importDefault(require("path"));
  const config_1 = require("../config");
  const job_1 = require("./job");
  const snapshot_1 = require("./snapshot");
- function initDatabase() {
- return __awaiter(this, void 0, void 0, function* () {
- const sequelize = new core_1.Sequelize({
- dialect: sqlite3_1.SqliteDialect,
- storage: path_1.default.join(config_1.config.dataDir, 'snap-kit.db'),
- logging: (msg) => process.env.SQLITE_LOG && config_1.logger.debug(msg),
- pool: {
- min: 0,
- max: 10,
- idle: 10000,
- },
- retry: {
- match: [/SQLITE_BUSY/],
- name: 'query',
- max: 10,
- },
- });
- job_1.Job.initModel(sequelize);
- snapshot_1.Snapshot.initModel(sequelize);
- try {
- yield Promise.all([
- sequelize.query('pragma journal_mode = WAL;'),
- sequelize.query('pragma synchronous = normal;'),
- sequelize.query('pragma journal_size_limit = 67108864;'),
- ]);
- yield sequelize.authenticate();
- yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
- config_1.logger.info('Successfully connected to database');
- }
- catch (error) {
- config_1.logger.error('Failed to connect to database:', error);
- throw error;
- }
- return sequelize;
- });
- }
+ const sequelize = new core_1.Sequelize({
+ dialect: sqlite3_1.SqliteDialect,
+ storage: path_1.default.join(config_1.config.dataDir, 'snap-kit.db'),
+ logging: (msg) => process.env.SQLITE_LOG && config_1.logger.debug(msg),
+ pool: {
+ min: 0,
+ max: 10,
+ idle: 10000,
+ },
+ retry: {
+ match: [/SQLITE_BUSY/],
+ name: 'query',
+ max: 10,
+ },
+ });
+ exports.sequelize = sequelize;
+ sequelize.query('pragma journal_mode = WAL;');
+ sequelize.query('pragma synchronous = normal;');
+ sequelize.query('pragma journal_size_limit = 67108864;');
+ job_1.Job.initModel(sequelize);
+ snapshot_1.Snapshot.initModel(sequelize);
+ __exportStar(require("./job"), exports);
+ __exportStar(require("./snapshot"), exports);
package/lib/cjs/store/job.d.ts CHANGED
@@ -12,9 +12,14 @@ export interface JobState {
  timeout?: number;
  fullPage?: boolean;
  lastModified?: string;
+ waitTime?: number;
+ replace?: boolean;
  headers?: Record<string, string>;
  cookies?: CookieParam[];
- localStorage?: Record<string, string>;
+ localStorage?: {
+ key: string;
+ value: string;
+ }[];
  }
  export interface JobModel {
  id: string;
package/lib/cjs/store/migrate.d.ts ADDED
@@ -0,0 +1,4 @@
+ import { Umzug } from 'umzug';
+ declare const umzug: Umzug<import("@sequelize/sqlite3").SqliteQueryInterface<import("@sequelize/sqlite3").SqliteDialect>>;
+ export declare function migrate(): Promise<import("umzug").MigrationMeta[]>;
+ export { umzug };
+ export { umzug };