@arcblock/crawler 1.0.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +1 -0
  2. package/lib/cjs/config.d.ts +22 -0
  3. package/{dist → lib/cjs}/config.js +9 -3
  4. package/lib/cjs/crawler.d.ts +26 -0
  5. package/{dist → lib/cjs}/crawler.js +44 -112
  6. package/lib/cjs/cron.d.ts +1 -0
  7. package/lib/cjs/cron.js +49 -0
  8. package/lib/cjs/index.d.ts +9 -0
  9. package/lib/cjs/index.js +78 -0
  10. package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
  11. package/{dist → lib/cjs}/puppeteer.js +43 -54
  12. package/lib/cjs/services/snapshot.d.ts +12 -0
  13. package/lib/cjs/services/snapshot.js +84 -0
  14. package/lib/cjs/site.d.ts +2 -0
  15. package/lib/cjs/site.js +76 -0
  16. package/lib/cjs/store/index.d.ts +3 -0
  17. package/{dist/db → lib/cjs/store}/index.js +21 -5
  18. package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
  19. package/lib/cjs/store/job.js +110 -0
  20. package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
  21. package/lib/cjs/store/snapshot.js +68 -0
  22. package/lib/cjs/utils.d.ts +32 -0
  23. package/{dist → lib/cjs}/utils.js +67 -78
  24. package/lib/esm/config.d.ts +22 -0
  25. package/{esm → lib/esm}/config.js +9 -3
  26. package/lib/esm/crawler.d.ts +26 -0
  27. package/{esm → lib/esm}/crawler.js +35 -100
  28. package/lib/esm/cron.d.ts +1 -0
  29. package/lib/esm/cron.js +43 -0
  30. package/lib/esm/index.d.ts +9 -0
  31. package/{esm → lib/esm}/index.js +19 -10
  32. package/{dist → lib/esm}/puppeteer.d.ts +2 -2
  33. package/{esm → lib/esm}/puppeteer.js +21 -32
  34. package/lib/esm/services/snapshot.d.ts +12 -0
  35. package/lib/esm/services/snapshot.js +75 -0
  36. package/lib/esm/site.d.ts +2 -0
  37. package/lib/esm/site.js +69 -0
  38. package/lib/esm/store/index.d.ts +3 -0
  39. package/{esm/db → lib/esm/store}/index.js +22 -6
  40. package/{esm/db → lib/esm/store}/job.d.ts +4 -3
  41. package/lib/esm/store/job.js +73 -0
  42. package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
  43. package/lib/esm/store/snapshot.js +64 -0
  44. package/lib/esm/utils.d.ts +32 -0
  45. package/{esm → lib/esm}/utils.js +64 -71
  46. package/package.json +20 -32
  47. package/third.d.ts +0 -0
  48. package/dist/blocklet.d.ts +0 -6
  49. package/dist/blocklet.js +0 -199
  50. package/dist/cache.d.ts +0 -10
  51. package/dist/cache.js +0 -119
  52. package/dist/config.d.ts +0 -10
  53. package/dist/crawler.d.ts +0 -28
  54. package/dist/db/index.d.ts +0 -1
  55. package/dist/db/job.js +0 -54
  56. package/dist/db/snapshot.js +0 -52
  57. package/dist/index.d.ts +0 -6
  58. package/dist/index.js +0 -45
  59. package/dist/middleware.d.ts +0 -4
  60. package/dist/middleware.js +0 -44
  61. package/dist/utils.d.ts +0 -17
  62. package/esm/blocklet.d.ts +0 -6
  63. package/esm/blocklet.js +0 -190
  64. package/esm/cache.d.ts +0 -10
  65. package/esm/cache.js +0 -114
  66. package/esm/config.d.ts +0 -10
  67. package/esm/crawler.d.ts +0 -28
  68. package/esm/db/index.d.ts +0 -1
  69. package/esm/db/job.js +0 -50
  70. package/esm/db/snapshot.js +0 -48
  71. package/esm/index.d.ts +0 -6
  72. package/esm/middleware.d.ts +0 -4
  73. package/esm/middleware.js +0 -41
  74. package/esm/utils.d.ts +0 -17
@@ -51,33 +51,28 @@ exports.ensureBrowser = ensureBrowser;
51
51
  exports.connectBrowser = connectBrowser;
52
52
  exports.launchBrowser = launchBrowser;
53
53
  exports.initPage = initPage;
54
- // import fs from 'fs-extra';
55
- // import path from 'path';
56
54
  const puppeteer_1 = __importDefault(require("@blocklet/puppeteer"));
57
55
  exports.puppeteer = puppeteer_1.default;
58
- const config_1 = require("@blocklet/sdk/lib/config");
59
56
  const fs_extra_1 = __importDefault(require("fs-extra"));
60
57
  const path_1 = __importDefault(require("path"));
61
58
  const timers_1 = require("timers");
62
- const cache_1 = require("./cache");
63
- const config_2 = require("./config");
59
+ const config_1 = require("./config");
64
60
  const utils_1 = require("./utils");
65
- // let puppeteerConfig: {
66
- // cacheDirectory: string;
67
- // temporaryDirectory: string;
68
- // };
69
- const BROWSER_WS_ENDPOINT_KEY = `browserWSEndpoint-${config_1.env.appId || 'unknown'}`;
70
61
  const BrowserStatus = {
62
+ None: 'None',
71
63
  Launching: 'Launching',
72
64
  Ready: 'Ready',
73
65
  };
66
+ let browserStatus = BrowserStatus.None;
67
+ /** Chromium WebSocket endpoint that allows puppeteer browser instance to connect to the browser */
68
+ let browserEndpoint = '';
74
69
  let browser;
75
70
  let browserActivatedTimer;
76
71
  function ensurePuppeteerrc() {
77
72
  return __awaiter(this, void 0, void 0, function* () {
78
- const cacheDirectory = path_1.default.join(config_2.config.cacheDir, 'puppeteer', 'cache');
79
- const temporaryDirectory = path_1.default.join(config_2.config.cacheDir, 'puppeteer', 'tmp');
80
- const puppeteerrcPath = path_1.default.join(config_2.config.appDir, '.puppeteerrc.js');
73
+ const cacheDirectory = path_1.default.join(config_1.config.cacheDir, 'puppeteer', 'cache');
74
+ const temporaryDirectory = path_1.default.join(config_1.config.cacheDir, 'puppeteer', 'tmp');
75
+ const puppeteerrcPath = path_1.default.join(config_1.config.appDir, '.puppeteerrc.js');
81
76
  // ensure directory exists
82
77
  yield Promise.all([fs_extra_1.default.ensureDir(cacheDirectory), fs_extra_1.default.ensureDir(temporaryDirectory), fs_extra_1.default.ensureFile(puppeteerrcPath)]);
83
78
  const puppeteerConfig = {
@@ -86,17 +81,17 @@ function ensurePuppeteerrc() {
86
81
  };
87
82
  const fileContent = `module.exports = ${JSON.stringify(puppeteerConfig, null, 2)}`;
88
83
  yield fs_extra_1.default.writeFile(puppeteerrcPath, fileContent);
89
- config_2.logger.debug(`Puppeteerrc file created at ${puppeteerrcPath}`, puppeteerConfig);
84
+ config_1.logger.debug(`Puppeteerrc file created at ${puppeteerrcPath}`, puppeteerConfig);
90
85
  return puppeteerConfig;
91
86
  });
92
87
  }
93
88
  function ensureBrowser() {
94
89
  return __awaiter(this, void 0, void 0, function* () {
95
90
  const puppeteerConfig = yield ensurePuppeteerrc();
96
- const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';
97
- config_2.logger.debug('Chromium executablePath', executablePath);
98
- if (!fs_extra_1.default.existsSync(executablePath)) {
99
- config_2.logger.info('start download browser', puppeteerConfig);
91
+ const executablePath = config_1.config.puppeteerPath;
92
+ config_1.logger.debug('executablePath', executablePath);
93
+ if (!executablePath || !fs_extra_1.default.existsSync(executablePath)) {
94
+ config_1.logger.info('start download browser', puppeteerConfig);
100
95
  const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
101
96
  try {
102
97
  // @ts-ignore
@@ -104,45 +99,44 @@ function ensureBrowser() {
104
99
  return yield Promise.resolve().then(() => __importStar(require('@blocklet/puppeteer/internal/node/install.js')));
105
100
  }
106
101
  catch (err) {
107
- config_2.logger.warn('Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.');
102
+ config_1.logger.warn('Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.');
108
103
  }
109
104
  }))();
110
105
  if (downloadBrowser) {
111
106
  yield downloadBrowser();
112
- config_2.logger.info('Browser download completed successfully');
107
+ config_1.logger.info('Browser download completed successfully');
113
108
  }
114
109
  }
115
110
  // try to launch browser
116
- if (config_2.config.testOnInitialize) {
111
+ if (config_1.config.isProd) {
117
112
  const browser = yield launchBrowser();
118
113
  if (!browser) {
119
114
  throw new Error('Failed to launch browser');
120
115
  }
121
116
  yield (0, exports.closeBrowser)();
122
117
  }
123
- config_2.logger.info('Puppeteer is ready');
118
+ config_1.logger.info('Puppeteer is ready');
124
119
  });
125
120
  }
126
121
  function connectBrowser() {
127
122
  return __awaiter(this, void 0, void 0, function* () {
128
- const browserWSEndpoint = yield cache_1.useCache.get(BROWSER_WS_ENDPOINT_KEY);
129
- if (!browserWSEndpoint) {
123
+ if (!browserEndpoint) {
130
124
  return null;
131
125
  }
132
126
  // retry if browser is launching
133
- if (browserWSEndpoint.status === BrowserStatus.Launching) {
127
+ if (browserStatus === BrowserStatus.Launching) {
134
128
  yield (0, utils_1.sleep)(Math.floor(Math.random() * 1000));
135
129
  return connectBrowser();
136
130
  }
137
131
  try {
138
132
  browser = yield puppeteer_1.default.connect({
139
- browserWSEndpoint: browserWSEndpoint.endpoint,
133
+ browserWSEndpoint: browserEndpoint,
140
134
  });
141
- config_2.logger.info('Connect browser success');
135
+ config_1.logger.info('Connect browser success');
142
136
  }
143
137
  catch (err) {
144
- config_2.logger.warn('Connect browser failed, clear endpoint', err);
145
- yield cache_1.useCache.remove(BROWSER_WS_ENDPOINT_KEY);
138
+ config_1.logger.warn('Connect browser failed, clear endpoint', err);
139
+ browserEndpoint = '';
146
140
  return null;
147
141
  }
148
142
  return browser;
@@ -150,12 +144,9 @@ function connectBrowser() {
150
144
  }
151
145
  function launchBrowser() {
152
146
  return __awaiter(this, void 0, void 0, function* () {
153
- yield cache_1.useCache.set(BROWSER_WS_ENDPOINT_KEY, {
154
- endpoint: null,
155
- status: BrowserStatus.Launching,
156
- });
147
+ browserEndpoint = '';
148
+ browserStatus = BrowserStatus.Launching;
157
149
  try {
158
- // @ts-ignore
159
150
  browser = yield puppeteer_1.default.launch({
160
151
  headless: true,
161
152
  args: [
@@ -182,20 +173,17 @@ function launchBrowser() {
182
173
  '--font-render-hinting=none',
183
174
  ],
184
175
  });
185
- config_2.logger.info('Launch browser');
176
+ config_1.logger.info('Launch browser');
186
177
  }
187
178
  catch (error) {
188
- config_2.logger.error('launch browser failed: ', error);
189
- // cleanup browser endpoint
190
- yield cache_1.useCache.remove(BROWSER_WS_ENDPOINT_KEY);
179
+ config_1.logger.error('launch browser failed: ', error);
180
+ browserStatus = BrowserStatus.None;
181
+ browserEndpoint = '';
191
182
  throw error;
192
183
  }
193
184
  // save browserWSEndpoint to cache
194
- const endpoint = yield browser.wsEndpoint();
195
- yield cache_1.useCache.set(BROWSER_WS_ENDPOINT_KEY, {
196
- endpoint,
197
- status: BrowserStatus.Ready,
198
- });
185
+ browserEndpoint = yield browser.wsEndpoint();
186
+ browserStatus = BrowserStatus.Ready;
199
187
  return browser;
200
188
  });
201
189
  }
@@ -208,13 +196,13 @@ function checkBrowserActivated() {
208
196
  const pages = yield browser.pages().catch(() => []);
209
197
  if (pages.length === 1 && ((_a = pages[0]) === null || _a === void 0 ? void 0 : _a.url()) === 'about:blank') {
210
198
  count++;
211
- config_2.logger.debug(`Browser inactive count: ${count}/3`);
199
+ config_1.logger.debug(`Browser inactive count: ${count}/3`);
212
200
  }
213
201
  else {
214
202
  count = 0; // 重置计数器!
215
203
  }
216
204
  if (count >= 3) {
217
- config_2.logger.info('Browser inactive for 3 minutes, closing...');
205
+ config_1.logger.info('Browser inactive for 3 minutes, closing...');
218
206
  yield (0, exports.closeBrowser)({
219
207
  trimCache: true,
220
208
  });
@@ -236,14 +224,15 @@ const getBrowser = () => __awaiter(void 0, void 0, void 0, function* () {
236
224
  // try to connect browser
237
225
  const connectedBrowser = yield connectBrowser();
238
226
  if (connectedBrowser) {
239
- config_2.logger.debug('getBrowser.connectedBrowser');
227
+ config_1.logger.debug('getBrowser.connectedBrowser');
240
228
  browser = connectedBrowser;
229
+ checkBrowserActivated();
241
230
  return browser;
242
231
  }
243
232
  // try to launch browser
244
233
  const launchedBrowser = yield launchBrowser();
245
234
  if (launchedBrowser) {
246
- config_2.logger.debug('getBrowser.launchedBrowser');
235
+ config_1.logger.debug('getBrowser.launchedBrowser');
247
236
  browser = launchedBrowser;
248
237
  checkBrowserActivated();
249
238
  return browser;
@@ -260,20 +249,20 @@ const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, funct
260
249
  yield Promise.all(pages.map((page) => page.close()));
261
250
  }
262
251
  catch (err) {
263
- config_2.logger.warn('Failed to close all pages:', err);
252
+ config_1.logger.warn('Failed to close all pages:', err);
264
253
  }
265
254
  // close browser
266
255
  try {
267
256
  yield browser.close();
268
257
  }
269
258
  catch (err) {
270
- config_2.logger.warn('Failed to close browser:', err);
259
+ config_1.logger.warn('Failed to close browser:', err);
271
260
  }
272
261
  // clear cache
273
262
  try {
274
263
  if (trimCache) {
275
264
  yield puppeteer_1.default.trimCache();
276
- config_2.logger.debug('Trim cache success');
265
+ config_1.logger.debug('Trim cache success');
277
266
  }
278
267
  // try to clear temporary directory
279
268
  // if (puppeteerConfig) {
@@ -284,12 +273,13 @@ const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, funct
284
273
  }
285
274
  }
286
275
  catch (err) {
287
- config_2.logger.warn('Failed to clear browser cache:', err);
276
+ config_1.logger.warn('Failed to clear browser cache:', err);
288
277
  }
289
278
  browser = null;
290
279
  clearBrowserActivatedTimer();
291
- yield cache_1.useCache.remove(BROWSER_WS_ENDPOINT_KEY);
292
- config_2.logger.info('Close browser success');
280
+ browserEndpoint = '';
281
+ browserStatus = BrowserStatus.None;
282
+ config_1.logger.info('Close browser success');
293
283
  });
294
284
  exports.closeBrowser = closeBrowser;
295
285
  function initPage() {
@@ -306,7 +296,6 @@ function initPage() {
306
296
  if (abortResourceTypes.length > 0) {
307
297
  yield page.setRequestInterception(true);
308
298
  page.on('request', (req) => {
309
- // @ts-ignore
310
299
  if (abortResourceTypes.includes(req.resourceType())) {
311
300
  return req.abort();
312
301
  }
@@ -0,0 +1,12 @@
1
+ import { JobState } from '../store/job';
2
+ import { SnapshotModel } from '../store/snapshot';
3
+ export declare function convertJobToSnapshot({ job, snapshot }: {
4
+ job: JobState;
5
+ snapshot?: Partial<SnapshotModel>;
6
+ }): SnapshotModel;
7
+ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
8
+ /**
9
+ * get snapshot from db or crawl queue
10
+ */
11
+ export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
12
+ export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
@@ -0,0 +1,84 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ var __importDefault = (this && this.__importDefault) || function (mod) {
12
+ return (mod && mod.__esModule) ? mod : { "default": mod };
13
+ };
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ exports.convertJobToSnapshot = convertJobToSnapshot;
16
+ exports.formatSnapshot = formatSnapshot;
17
+ exports.getSnapshot = getSnapshot;
18
+ exports.getLatestSnapshot = getLatestSnapshot;
19
+ const pick_1 = __importDefault(require("lodash/pick"));
20
+ const promises_1 = __importDefault(require("node:fs/promises"));
21
+ const node_path_1 = __importDefault(require("node:path"));
22
+ const ufo_1 = require("ufo");
23
+ const config_1 = require("../config");
24
+ const job_1 = require("../store/job");
25
+ const snapshot_1 = require("../store/snapshot");
26
+ const utils_1 = require("../utils");
27
+ function convertJobToSnapshot({ job, snapshot }) {
28
+ return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), options: {
29
+ width: job.width,
30
+ height: job.height,
31
+ includeScreenshot: job.includeScreenshot,
32
+ includeHtml: job.includeHtml,
33
+ quality: job.quality,
34
+ fullPage: job.fullPage,
35
+ } }, snapshot);
36
+ }
37
+ function formatSnapshot(snapshot, columns) {
38
+ return __awaiter(this, void 0, void 0, function* () {
39
+ let data = Object.assign({}, snapshot);
40
+ // format screenshot path to full url
41
+ if (data.screenshot) {
42
+ data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
43
+ }
44
+ // format html path to string
45
+ if (data.html) {
46
+ const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
47
+ data.html = html.toString();
48
+ }
49
+ if (columns === null || columns === void 0 ? void 0 : columns.length) {
50
+ data = (0, pick_1.default)(data, columns);
51
+ }
52
+ return data;
53
+ });
54
+ }
55
+ /**
56
+ * get snapshot from db or crawl queue
57
+ */
58
+ function getSnapshot(jobId) {
59
+ return __awaiter(this, void 0, void 0, function* () {
60
+ const snapshot = yield snapshot_1.Snapshot.findSnapshot({ where: { jobId } });
61
+ if (snapshot) {
62
+ return formatSnapshot(snapshot);
63
+ }
64
+ const job = yield job_1.Job.findJob({ id: jobId });
65
+ if (job) {
66
+ return {
67
+ jobId,
68
+ status: 'pending',
69
+ };
70
+ }
71
+ return null;
72
+ });
73
+ }
74
+ function getLatestSnapshot(url) {
75
+ return __awaiter(this, void 0, void 0, function* () {
76
+ const snapshot = yield snapshot_1.Snapshot.findSnapshot({
77
+ where: {
78
+ url: (0, utils_1.formatUrl)(url),
79
+ status: 'success',
80
+ },
81
+ });
82
+ return snapshot ? formatSnapshot(snapshot) : null;
83
+ });
84
+ }
@@ -0,0 +1,2 @@
1
+ import { Site } from './config';
2
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | undefined)[]>;
@@ -0,0 +1,76 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ var __importDefault = (this && this.__importDefault) || function (mod) {
12
+ return (mod && mod.__esModule) ? mod : { "default": mod };
13
+ };
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ exports.crawlSite = void 0;
16
+ const uniq_1 = __importDefault(require("lodash/uniq"));
17
+ const p_map_1 = __importDefault(require("p-map"));
18
+ const config_1 = require("./config");
19
+ const crawler_1 = require("./crawler");
20
+ const snapshot_1 = require("./store/snapshot");
21
+ const utils_1 = require("./utils");
22
+ const crawlBlockletRunningMap = new Map();
23
+ function parseSitemapUrl(sitemapItem) {
24
+ var _a;
25
+ const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
26
+ const urls = (0, uniq_1.default)([...links, sitemapItem.url]).filter(Boolean);
27
+ return urls.map((url) => ({ url, sitemapItem }));
28
+ }
29
+ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
30
+ config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
31
+ const sitemapList = yield (0, utils_1.getSitemapList)(url);
32
+ const pathnameRegex = new RegExp(pathname);
33
+ const sitemapItems = sitemapList
34
+ .filter((item) => new URL(item.url).pathname.match(pathnameRegex))
35
+ .flatMap((sitemapItem) => {
36
+ return parseSitemapUrl(sitemapItem);
37
+ });
38
+ config_1.logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
39
+ const crawlableItems = (yield Promise.all(sitemapItems.map((_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
40
+ const snapshot = yield snapshot_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
41
+ if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
42
+ const lastModified = new Date(snapshot.lastModified);
43
+ // skip if snapshot lastModified is greater than sitemap lastmod
44
+ if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
45
+ return null;
46
+ }
47
+ // skip if interval time has not been reached
48
+ if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
49
+ return null;
50
+ }
51
+ }
52
+ return { url, sitemapItem };
53
+ })))).filter(Boolean);
54
+ config_1.logger.info(`Found ${crawlableItems.length} pages to crawl from sitemap ${url}`, { pathname });
55
+ const key = `${url}-${pathname}`;
56
+ crawlBlockletRunningMap.set(key, crawlableItems);
57
+ try {
58
+ const jobIds = yield (0, p_map_1.default)(crawlableItems, ({ url, sitemapItem }) => {
59
+ return (0, crawler_1.crawlUrl)({
60
+ url,
61
+ lastModified: sitemapItem.lastmod,
62
+ includeScreenshot: false,
63
+ includeHtml: true,
64
+ });
65
+ }, { concurrency: config_1.config.siteCron.concurrency });
66
+ return jobIds;
67
+ }
68
+ catch (error) {
69
+ config_1.logger.error(`Failed to crawl from sitemap ${url} ${pathname}`, error);
70
+ throw new Error(error);
71
+ }
72
+ finally {
73
+ crawlBlockletRunningMap.delete(key);
74
+ }
75
+ });
76
+ exports.crawlSite = crawlSite;
@@ -0,0 +1,3 @@
1
+ import { Sequelize } from '@sequelize/core';
2
+ import { SqliteDialect } from '@sequelize/sqlite3';
3
+ export declare function initDatabase(): Promise<Sequelize<SqliteDialect>>;
@@ -12,23 +12,38 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
12
12
  return (mod && mod.__esModule) ? mod : { "default": mod };
13
13
  };
14
14
  Object.defineProperty(exports, "__esModule", { value: true });
15
- exports.ensureDatabase = ensureDatabase;
15
+ exports.initDatabase = initDatabase;
16
16
  const core_1 = require("@sequelize/core");
17
17
  const sqlite3_1 = require("@sequelize/sqlite3");
18
18
  const path_1 = __importDefault(require("path"));
19
19
  const config_1 = require("../config");
20
20
  const job_1 = require("./job");
21
21
  const snapshot_1 = require("./snapshot");
22
- function ensureDatabase() {
22
+ function initDatabase() {
23
23
  return __awaiter(this, void 0, void 0, function* () {
24
24
  const sequelize = new core_1.Sequelize({
25
25
  dialect: sqlite3_1.SqliteDialect,
26
26
  storage: path_1.default.join(config_1.config.dataDir, 'snap-kit.db'),
27
- logging: (msg) => config_1.logger.debug(msg),
27
+ logging: (msg) => process.env.SQLITE_LOG && config_1.logger.debug(msg),
28
+ pool: {
29
+ min: 0,
30
+ max: 10,
31
+ idle: 10000,
32
+ },
33
+ retry: {
34
+ match: [/SQLITE_BUSY/],
35
+ name: 'query',
36
+ max: 10,
37
+ },
28
38
  });
29
- yield (0, snapshot_1.initSnapshotModel)(sequelize);
30
- yield (0, job_1.initJobModel)(sequelize);
39
+ job_1.Job.initModel(sequelize);
40
+ snapshot_1.Snapshot.initModel(sequelize);
31
41
  try {
42
+ yield Promise.all([
43
+ sequelize.query('pragma journal_mode = WAL;'),
44
+ sequelize.query('pragma synchronous = normal;'),
45
+ sequelize.query('pragma journal_size_limit = 67108864;'),
46
+ ]);
32
47
  yield sequelize.authenticate();
33
48
  yield sequelize.sync();
34
49
  config_1.logger.info('Successfully connected to database');
@@ -37,5 +52,6 @@ function ensureDatabase() {
37
52
  config_1.logger.error('Failed to connect to database:', error);
38
53
  throw error;
39
54
  }
55
+ return sequelize;
40
56
  });
41
57
  }
@@ -10,6 +10,7 @@ export interface JobState {
10
10
  quality?: number;
11
11
  timeout?: number;
12
12
  fullPage?: boolean;
13
+ lastModified?: string;
13
14
  }
14
15
  export interface JobModel {
15
16
  id: string;
@@ -20,7 +21,7 @@ export interface JobModel {
20
21
  delay: number;
21
22
  cancelled: boolean;
22
23
  }
23
- declare class Job extends Model<JobModel> implements JobModel {
24
+ export declare class Job extends Model<JobModel> implements JobModel {
24
25
  id: JobModel['id'];
25
26
  queue: JobModel['queue'];
26
27
  job: JobModel['job'];
@@ -28,6 +29,6 @@ declare class Job extends Model<JobModel> implements JobModel {
28
29
  willRunAt: JobModel['willRunAt'];
29
30
  delay: JobModel['delay'];
30
31
  cancelled: JobModel['cancelled'];
32
+ static initModel(sequelize: Sequelize): typeof Job;
33
+ static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
31
34
  }
32
- export { Job };
33
- export declare function initJobModel(sequelize: Sequelize): typeof Job;
@@ -0,0 +1,110 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
36
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
37
+ return new (P || (P = Promise))(function (resolve, reject) {
38
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
39
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
40
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
41
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
42
+ });
43
+ };
44
+ Object.defineProperty(exports, "__esModule", { value: true });
45
+ exports.Job = void 0;
46
+ const core_1 = __importStar(require("@sequelize/core"));
47
+ class Job extends core_1.Model {
48
+ static initModel(sequelize) {
49
+ return Job.init({
50
+ id: {
51
+ type: core_1.DataTypes.STRING(40),
52
+ primaryKey: true,
53
+ },
54
+ queue: {
55
+ type: core_1.DataTypes.STRING(32),
56
+ allowNull: false,
57
+ },
58
+ job: {
59
+ type: core_1.DataTypes.JSON,
60
+ allowNull: false,
61
+ },
62
+ retryCount: {
63
+ type: core_1.DataTypes.INTEGER,
64
+ },
65
+ delay: {
66
+ type: core_1.DataTypes.INTEGER,
67
+ },
68
+ willRunAt: {
69
+ type: core_1.DataTypes.INTEGER,
70
+ },
71
+ cancelled: {
72
+ type: core_1.DataTypes.BOOLEAN,
73
+ defaultValue: false,
74
+ },
75
+ createdAt: {
76
+ type: core_1.DataTypes.DATE,
77
+ defaultValue: core_1.DataTypes.NOW,
78
+ index: true,
79
+ },
80
+ updatedAt: {
81
+ type: core_1.DataTypes.DATE,
82
+ defaultValue: core_1.DataTypes.NOW,
83
+ index: true,
84
+ },
85
+ }, {
86
+ sequelize,
87
+ indexes: [{ fields: ['queue'] }],
88
+ modelName: 'job',
89
+ tableName: 'jobs',
90
+ timestamps: true,
91
+ });
92
+ }
93
+ static findJob(condition) {
94
+ return __awaiter(this, void 0, void 0, function* () {
95
+ const where = Object.keys(condition)
96
+ .filter((key) => condition[key] !== undefined)
97
+ .map((key) => {
98
+ return core_1.default.where(core_1.default.fn('json_extract', core_1.default.col('job'), `$.${key}`), condition[key]);
99
+ });
100
+ const job = yield Job.findOne({
101
+ where: {
102
+ [core_1.default.Op.and]: where,
103
+ },
104
+ order: [['createdAt', 'DESC']],
105
+ });
106
+ return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
107
+ });
108
+ }
109
+ }
110
+ exports.Job = Job;
@@ -1,5 +1,5 @@
1
- import { Model, Sequelize } from '@sequelize/core';
2
- interface SnapshotModel {
1
+ import { FindOptions, Model, Sequelize } from '@sequelize/core';
2
+ export interface SnapshotModel {
3
3
  jobId: string;
4
4
  url: string;
5
5
  status: 'success' | 'failed' | 'pending';
@@ -16,7 +16,7 @@ interface SnapshotModel {
16
16
  fullPage?: boolean;
17
17
  };
18
18
  }
19
- declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
19
+ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
20
20
  jobId: SnapshotModel['jobId'];
21
21
  url: SnapshotModel['url'];
22
22
  status: SnapshotModel['status'];
@@ -25,7 +25,6 @@ declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
25
25
  error?: SnapshotModel['error'];
26
26
  lastModified?: SnapshotModel['lastModified'];
27
27
  options: SnapshotModel['options'];
28
+ static initModel(sequelize: Sequelize): typeof Snapshot;
29
+ static findSnapshot(condition: FindOptions<SnapshotModel>): Promise<SnapshotModel | null>;
28
30
  }
29
- export { Snapshot };
30
- export type { SnapshotModel };
31
- export declare function initSnapshotModel(sequelize: Sequelize): typeof Snapshot;