@arcblock/crawler 1.0.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +1 -0
  2. package/lib/cjs/config.d.ts +22 -0
  3. package/{dist → lib/cjs}/config.js +9 -3
  4. package/lib/cjs/crawler.d.ts +26 -0
  5. package/{dist → lib/cjs}/crawler.js +44 -112
  6. package/lib/cjs/cron.d.ts +1 -0
  7. package/lib/cjs/cron.js +49 -0
  8. package/lib/cjs/index.d.ts +9 -0
  9. package/lib/cjs/index.js +78 -0
  10. package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
  11. package/{dist → lib/cjs}/puppeteer.js +43 -54
  12. package/lib/cjs/services/snapshot.d.ts +12 -0
  13. package/lib/cjs/services/snapshot.js +84 -0
  14. package/lib/cjs/site.d.ts +2 -0
  15. package/lib/cjs/site.js +76 -0
  16. package/lib/cjs/store/index.d.ts +3 -0
  17. package/{dist/db → lib/cjs/store}/index.js +21 -5
  18. package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
  19. package/lib/cjs/store/job.js +110 -0
  20. package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
  21. package/lib/cjs/store/snapshot.js +68 -0
  22. package/lib/cjs/utils.d.ts +32 -0
  23. package/{dist → lib/cjs}/utils.js +67 -78
  24. package/lib/esm/config.d.ts +22 -0
  25. package/{esm → lib/esm}/config.js +9 -3
  26. package/lib/esm/crawler.d.ts +26 -0
  27. package/{esm → lib/esm}/crawler.js +35 -100
  28. package/lib/esm/cron.d.ts +1 -0
  29. package/lib/esm/cron.js +43 -0
  30. package/lib/esm/index.d.ts +9 -0
  31. package/{esm → lib/esm}/index.js +19 -10
  32. package/{dist → lib/esm}/puppeteer.d.ts +2 -2
  33. package/{esm → lib/esm}/puppeteer.js +21 -32
  34. package/lib/esm/services/snapshot.d.ts +12 -0
  35. package/lib/esm/services/snapshot.js +75 -0
  36. package/lib/esm/site.d.ts +2 -0
  37. package/lib/esm/site.js +69 -0
  38. package/lib/esm/store/index.d.ts +3 -0
  39. package/{esm/db → lib/esm/store}/index.js +22 -6
  40. package/{esm/db → lib/esm/store}/job.d.ts +4 -3
  41. package/lib/esm/store/job.js +73 -0
  42. package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
  43. package/lib/esm/store/snapshot.js +64 -0
  44. package/lib/esm/utils.d.ts +32 -0
  45. package/{esm → lib/esm}/utils.js +64 -71
  46. package/package.json +20 -32
  47. package/third.d.ts +0 -0
  48. package/dist/blocklet.d.ts +0 -6
  49. package/dist/blocklet.js +0 -199
  50. package/dist/cache.d.ts +0 -10
  51. package/dist/cache.js +0 -119
  52. package/dist/config.d.ts +0 -10
  53. package/dist/crawler.d.ts +0 -28
  54. package/dist/db/index.d.ts +0 -1
  55. package/dist/db/job.js +0 -54
  56. package/dist/db/snapshot.js +0 -52
  57. package/dist/index.d.ts +0 -6
  58. package/dist/index.js +0 -45
  59. package/dist/middleware.d.ts +0 -4
  60. package/dist/middleware.js +0 -44
  61. package/dist/utils.d.ts +0 -17
  62. package/esm/blocklet.d.ts +0 -6
  63. package/esm/blocklet.js +0 -190
  64. package/esm/cache.d.ts +0 -10
  65. package/esm/cache.js +0 -114
  66. package/esm/config.d.ts +0 -10
  67. package/esm/crawler.d.ts +0 -28
  68. package/esm/db/index.d.ts +0 -1
  69. package/esm/db/job.js +0 -50
  70. package/esm/db/snapshot.js +0 -48
  71. package/esm/index.d.ts +0 -6
  72. package/esm/middleware.d.ts +0 -4
  73. package/esm/middleware.js +0 -41
  74. package/esm/utils.d.ts +0 -17
package/{esm → lib/esm}/index.js
@@ -7,20 +7,29 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
  step((generator = generator.apply(thisArg, _arguments || [])).next());
  });
  };
+ import merge from 'lodash/merge';
  import { config, logger } from './config';
  import { createCrawlQueue } from './crawler';
- import { ensureDatabase } from './db';
+ import { initCron } from './cron';
  import { ensureBrowser } from './puppeteer';
- export * from './blocklet';
+ import { initDatabase } from './store';
  export * from './crawler';
- export * from './middleware';
- export { Snapshot } from './db/snapshot';
- export function initCrawler(_config) {
+ export * from './site';
+ export * from './services/snapshot';
+ export * as utils from './utils';
+ export function initCrawler(params) {
  return __awaiter(this, void 0, void 0, function* () {
- Object.assign(config, _config);
- logger.debug('init crawler', config);
- yield ensureDatabase();
- yield createCrawlQueue();
- yield ensureBrowser();
+ logger.info('Init crawler', { params });
+ merge(config, params);
+ try {
+ yield initDatabase();
+ yield ensureBrowser();
+ yield createCrawlQueue();
+ yield initCron();
+ }
+ catch (err) {
+ logger.error('Init crawler error', { err });
+ throw err;
+ }
  });
  }
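Note: initCrawler now deep-merges its params into the shared config (lodash merge) and then initializes the store, browser, crawl queue, and cron in order, logging and rethrowing on failure. A minimal TypeScript consumer sketch follows; the full config shape lives in config.d.ts (not shown in this diff), so the fields below are assumptions inferred from where config is read elsewhere in the diff (dataDir, appUrl, puppeteerPath):

import { initCrawler } from '@arcblock/crawler';

async function main() {
  // Field names beyond those visible in this diff are assumptions.
  await initCrawler({
    dataDir: '/var/lib/snap-kit',       // SQLite file (snap-kit.db) and snapshot assets live here
    appUrl: 'https://example.com',      // used to turn stored screenshot paths into absolute URLs
    puppeteerPath: '/usr/bin/chromium', // replaces the old PUPPETEER_EXECUTABLE_PATH fallback
  });
}

main().catch(() => process.exit(1)); // initCrawler logs 'Init crawler error' and rethrows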
package/{dist → lib/esm}/puppeteer.d.ts
@@ -1,4 +1,4 @@
- import puppeteer, { Browser, Page } from '@blocklet/puppeteer';
+ import puppeteer, { Browser, Page, ResourceType } from '@blocklet/puppeteer';
  export { puppeteer };
  export declare function ensurePuppeteerrc(): Promise<{
  cacheDirectory: string;
@@ -12,5 +12,5 @@ export declare const closeBrowser: ({ trimCache }?: {
  trimCache?: boolean;
  }) => Promise<void>;
  export declare function initPage({ abortResourceTypes }?: {
- abortResourceTypes?: never[] | undefined;
+ abortResourceTypes?: ResourceType[];
  }): Promise<Page>;
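Note: the abortResourceTypes option of initPage is now typed as ResourceType[] instead of never[], so callers can pass puppeteer resource types without casts. A sketch, assuming initPage is reachable from your import path (the diff only shows the declaration, not the package-root exports):

import type { ResourceType } from '@blocklet/puppeteer';

// Typical types to drop when only the HTML matters.
const abortResourceTypes: ResourceType[] = ['image', 'font', 'media'];

// const page = await initPage({ abortResourceTypes });
// initPage enables request interception and aborts any request whose
// req.resourceType() is in the list (see the puppeteer.js hunk below).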
package/{esm → lib/esm}/puppeteer.js
@@ -7,25 +7,20 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
  step((generator = generator.apply(thisArg, _arguments || [])).next());
  });
  };
- // import fs from 'fs-extra';
- // import path from 'path';
  import puppeteer from '@blocklet/puppeteer';
- import { env } from '@blocklet/sdk/lib/config';
  import fs from 'fs-extra';
  import path from 'path';
  import { clearInterval, setInterval } from 'timers';
- import { useCache } from './cache';
  import { config, logger } from './config';
  import { CRAWLER_FLAG, sleep } from './utils';
- // let puppeteerConfig: {
- // cacheDirectory: string;
- // temporaryDirectory: string;
- // };
- const BROWSER_WS_ENDPOINT_KEY = `browserWSEndpoint-${env.appId || 'unknown'}`;
  const BrowserStatus = {
+ None: 'None',
  Launching: 'Launching',
  Ready: 'Ready',
  };
+ let browserStatus = BrowserStatus.None;
+ /** Chromium WebSocket endpoint that allows puppeteer browser instance to connect to the browser */
+ let browserEndpoint = '';
  let browser;
  let browserActivatedTimer;
  export { puppeteer };
@@ -49,9 +44,9 @@ export function ensurePuppeteerrc() {
  export function ensureBrowser() {
  return __awaiter(this, void 0, void 0, function* () {
  const puppeteerConfig = yield ensurePuppeteerrc();
- const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';
- logger.debug('Chromium executablePath', executablePath);
- if (!fs.existsSync(executablePath)) {
+ const executablePath = config.puppeteerPath;
+ logger.debug('executablePath', executablePath);
+ if (!executablePath || !fs.existsSync(executablePath)) {
  logger.info('start download browser', puppeteerConfig);
  const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
  try {
@@ -69,7 +64,7 @@ export function ensureBrowser() {
  }
  }
  // try to launch browser
- if (config.testOnInitialize) {
+ if (config.isProd) {
  const browser = yield launchBrowser();
  if (!browser) {
  throw new Error('Failed to launch browser');
@@ -81,24 +76,23 @@ export function ensureBrowser() {
  }
  export function connectBrowser() {
  return __awaiter(this, void 0, void 0, function* () {
- const browserWSEndpoint = yield useCache.get(BROWSER_WS_ENDPOINT_KEY);
- if (!browserWSEndpoint) {
+ if (!browserEndpoint) {
  return null;
  }
  // retry if browser is launching
- if (browserWSEndpoint.status === BrowserStatus.Launching) {
+ if (browserStatus === BrowserStatus.Launching) {
  yield sleep(Math.floor(Math.random() * 1000));
  return connectBrowser();
  }
  try {
  browser = yield puppeteer.connect({
- browserWSEndpoint: browserWSEndpoint.endpoint,
+ browserWSEndpoint: browserEndpoint,
  });
  logger.info('Connect browser success');
  }
  catch (err) {
  logger.warn('Connect browser failed, clear endpoint', err);
- yield useCache.remove(BROWSER_WS_ENDPOINT_KEY);
+ browserEndpoint = '';
  return null;
  }
  return browser;
@@ -106,12 +100,9 @@ export function connectBrowser() {
  }
  export function launchBrowser() {
  return __awaiter(this, void 0, void 0, function* () {
- yield useCache.set(BROWSER_WS_ENDPOINT_KEY, {
- endpoint: null,
- status: BrowserStatus.Launching,
- });
+ browserEndpoint = '';
+ browserStatus = BrowserStatus.Launching;
  try {
- // @ts-ignore
  browser = yield puppeteer.launch({
  headless: true,
  args: [
@@ -142,16 +133,13 @@ export function launchBrowser() {
  }
  catch (error) {
  logger.error('launch browser failed: ', error);
- // cleanup browser endpoint
- yield useCache.remove(BROWSER_WS_ENDPOINT_KEY);
+ browserStatus = BrowserStatus.None;
+ browserEndpoint = '';
  throw error;
  }
  // save browserWSEndpoint to cache
- const endpoint = yield browser.wsEndpoint();
- yield useCache.set(BROWSER_WS_ENDPOINT_KEY, {
- endpoint,
- status: BrowserStatus.Ready,
- });
+ browserEndpoint = yield browser.wsEndpoint();
+ browserStatus = BrowserStatus.Ready;
  return browser;
  });
  }
@@ -194,6 +182,7 @@ export const getBrowser = () => __awaiter(void 0, void 0, void 0, function* () {
  if (connectedBrowser) {
  logger.debug('getBrowser.connectedBrowser');
  browser = connectedBrowser;
+ checkBrowserActivated();
  return browser;
  }
  // try to launch browser
@@ -243,7 +232,8 @@ export const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0
  }
  browser = null;
  clearBrowserActivatedTimer();
- yield useCache.remove(BROWSER_WS_ENDPOINT_KEY);
+ browserEndpoint = '';
+ browserStatus = BrowserStatus.None;
  logger.info('Close browser success');
  });
  export function initPage() {
@@ -260,7 +250,6 @@ export function initPage() {
  if (abortResourceTypes.length > 0) {
  yield page.setRequestInterception(true);
  page.on('request', (req) => {
- // @ts-ignore
  if (abortResourceTypes.includes(req.resourceType())) {
  return req.abort();
  }
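Note: the shared-cache endpoint (useCache + BROWSER_WS_ENDPOINT_KEY keyed by env.appId) is gone; the Chromium WebSocket endpoint and a None/Launching/Ready status now live in module state. A distilled TypeScript sketch of the connect-or-retry behavior shown above, for illustration only (not the shipped source):

import puppeteer, { Browser } from '@blocklet/puppeteer';

type Status = 'None' | 'Launching' | 'Ready';
let browserStatus: Status = 'None';
let browserEndpoint = '';

const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

async function connectBrowser(): Promise<Browser | null> {
  if (!browserEndpoint) return null; // nothing launched yet in this process
  if (browserStatus === 'Launching') {
    // another caller is launching; back off for a random sub-second delay and retry
    await sleep(Math.floor(Math.random() * 1000));
    return connectBrowser();
  }
  try {
    return await puppeteer.connect({ browserWSEndpoint: browserEndpoint });
  } catch {
    browserEndpoint = ''; // stale endpoint; the next getBrowser() call launches a fresh browser
    return null;
  }
}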
package/lib/esm/services/snapshot.d.ts
@@ -0,0 +1,12 @@
+ import { JobState } from '../store/job';
+ import { SnapshotModel } from '../store/snapshot';
+ export declare function convertJobToSnapshot({ job, snapshot }: {
+ job: JobState;
+ snapshot?: Partial<SnapshotModel>;
+ }): SnapshotModel;
+ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
+ /**
+ * get snapshot from db or crawl queue
+ */
+ export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
+ export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
package/lib/esm/services/snapshot.js
@@ -0,0 +1,75 @@
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ import pick from 'lodash/pick';
+ import fs from 'node:fs/promises';
+ import path from 'node:path';
+ import { joinURL } from 'ufo';
+ import { config } from '../config';
+ import { Job } from '../store/job';
+ import { Snapshot } from '../store/snapshot';
+ import { formatUrl } from '../utils';
+ export function convertJobToSnapshot({ job, snapshot }) {
+ return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), options: {
+ width: job.width,
+ height: job.height,
+ includeScreenshot: job.includeScreenshot,
+ includeHtml: job.includeHtml,
+ quality: job.quality,
+ fullPage: job.fullPage,
+ } }, snapshot);
+ }
+ export function formatSnapshot(snapshot, columns) {
+ return __awaiter(this, void 0, void 0, function* () {
+ let data = Object.assign({}, snapshot);
+ // format screenshot path to full url
+ if (data.screenshot) {
+ data.screenshot = joinURL(config.appUrl, data.screenshot);
+ }
+ // format html path to string
+ if (data.html) {
+ const html = yield fs.readFile(path.join(config.dataDir, data.html));
+ data.html = html.toString();
+ }
+ if (columns === null || columns === void 0 ? void 0 : columns.length) {
+ data = pick(data, columns);
+ }
+ return data;
+ });
+ }
+ /**
+ * get snapshot from db or crawl queue
+ */
+ export function getSnapshot(jobId) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const snapshot = yield Snapshot.findSnapshot({ where: { jobId } });
+ if (snapshot) {
+ return formatSnapshot(snapshot);
+ }
+ const job = yield Job.findJob({ id: jobId });
+ if (job) {
+ return {
+ jobId,
+ status: 'pending',
+ };
+ }
+ return null;
+ });
+ }
+ export function getLatestSnapshot(url) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const snapshot = yield Snapshot.findSnapshot({
+ where: {
+ url: formatUrl(url),
+ status: 'success',
+ },
+ });
+ return snapshot ? formatSnapshot(snapshot) : null;
+ });
+ }
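Note: getSnapshot and getLatestSnapshot are the new read API. getSnapshot resolves a job id to a stored snapshot, or to a pending placeholder while the job is still queued; getLatestSnapshot returns the most recent successful snapshot for a URL, with the screenshot path expanded to an absolute URL and the HTML read from dataDir. A short usage sketch, assuming these functions and crawlUrl are re-exported from the package root as index.js suggests:

import { crawlUrl, getSnapshot, getLatestSnapshot } from '@arcblock/crawler';

async function example() {
  const jobId = await crawlUrl({ url: 'https://example.com/', includeHtml: true, includeScreenshot: false });
  const byJob = jobId ? await getSnapshot(jobId) : null;
  // While the job is still queued, getSnapshot returns { jobId, status: 'pending' }.

  const latest = await getLatestSnapshot('https://example.com/');
  // latest?.html is the page HTML read from disk; latest?.screenshot is an absolute URL.
}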
package/lib/esm/site.d.ts
@@ -0,0 +1,2 @@
+ import { Site } from './config';
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | undefined)[]>;
package/lib/esm/site.js
@@ -0,0 +1,69 @@
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ import uniq from 'lodash/uniq';
+ import pMap from 'p-map';
+ import { config, logger } from './config';
+ import { crawlUrl } from './crawler';
+ import { Snapshot } from './store/snapshot';
+ import { formatUrl, getSitemapList } from './utils';
+ const crawlBlockletRunningMap = new Map();
+ function parseSitemapUrl(sitemapItem) {
+ var _a;
+ const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
+ const urls = uniq([...links, sitemapItem.url]).filter(Boolean);
+ return urls.map((url) => ({ url, sitemapItem }));
+ }
+ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+ logger.info(`Start crawl from sitemap ${url}`, { pathname });
+ const sitemapList = yield getSitemapList(url);
+ const pathnameRegex = new RegExp(pathname);
+ const sitemapItems = sitemapList
+ .filter((item) => new URL(item.url).pathname.match(pathnameRegex))
+ .flatMap((sitemapItem) => {
+ return parseSitemapUrl(sitemapItem);
+ });
+ logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
+ const crawlableItems = (yield Promise.all(sitemapItems.map((_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
+ const snapshot = yield Snapshot.findOne({ where: { url: formatUrl(url) } });
+ if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
+ const lastModified = new Date(snapshot.lastModified);
+ // skip if snapshot lastModified is greater than sitemap lastmod
+ if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
+ return null;
+ }
+ // skip if interval time has not been reached
+ if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
+ return null;
+ }
+ }
+ return { url, sitemapItem };
+ })))).filter(Boolean);
+ logger.info(`Found ${crawlableItems.length} pages to crawl from sitemap ${url}`, { pathname });
+ const key = `${url}-${pathname}`;
+ crawlBlockletRunningMap.set(key, crawlableItems);
+ try {
+ const jobIds = yield pMap(crawlableItems, ({ url, sitemapItem }) => {
+ return crawlUrl({
+ url,
+ lastModified: sitemapItem.lastmod,
+ includeScreenshot: false,
+ includeHtml: true,
+ });
+ }, { concurrency: config.siteCron.concurrency });
+ return jobIds;
+ }
+ catch (error) {
+ logger.error(`Failed to crawl from sitemap ${url} ${pathname}`, error);
+ throw new Error(error);
+ }
+ finally {
+ crawlBlockletRunningMap.delete(key);
+ }
+ });
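Note: crawlSite reads the sitemap at url, keeps entries whose pathname matches the pathname regex, skips pages whose stored snapshot is newer than the sitemap lastmod or was crawled within the last interval days, and queues the rest through crawlUrl with concurrency config.siteCron.concurrency. A usage sketch; the Site type comes from config.d.ts (not shown here), so the fields follow the destructuring above:

import { crawlSite } from '@arcblock/crawler';

// Queue HTML-only snapshots for every /blog/* page listed in the sitemap,
// re-crawling each page at most once every 7 days.
const jobIds = await crawlSite({
  url: 'https://example.com/sitemap.xml',
  pathname: '^/blog/',
  interval: 7, // days; defaults to 0, in which case only the lastmod comparison applies
});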
package/lib/esm/store/index.d.ts
@@ -0,0 +1,3 @@
+ import { Sequelize } from '@sequelize/core';
+ import { SqliteDialect } from '@sequelize/sqlite3';
+ export declare function initDatabase(): Promise<Sequelize<SqliteDialect>>;
package/{esm/db → lib/esm/store}/index.js
@@ -11,18 +11,33 @@ import { Sequelize } from '@sequelize/core';
  import { SqliteDialect } from '@sequelize/sqlite3';
  import path from 'path';
  import { config, logger } from '../config';
- import { initJobModel } from './job';
- import { initSnapshotModel } from './snapshot';
- export function ensureDatabase() {
+ import { Job } from './job';
+ import { Snapshot } from './snapshot';
+ export function initDatabase() {
  return __awaiter(this, void 0, void 0, function* () {
  const sequelize = new Sequelize({
  dialect: SqliteDialect,
  storage: path.join(config.dataDir, 'snap-kit.db'),
- logging: (msg) => logger.debug(msg),
+ logging: (msg) => process.env.SQLITE_LOG && logger.debug(msg),
+ pool: {
+ min: 0,
+ max: 10,
+ idle: 10000,
+ },
+ retry: {
+ match: [/SQLITE_BUSY/],
+ name: 'query',
+ max: 10,
+ },
  });
- yield initSnapshotModel(sequelize);
- yield initJobModel(sequelize);
+ Job.initModel(sequelize);
+ Snapshot.initModel(sequelize);
  try {
+ yield Promise.all([
+ sequelize.query('pragma journal_mode = WAL;'),
+ sequelize.query('pragma synchronous = normal;'),
+ sequelize.query('pragma journal_size_limit = 67108864;'),
+ ]);
  yield sequelize.authenticate();
  yield sequelize.sync();
  logger.info('Successfully connected to database');
@@ -31,5 +46,6 @@ export function ensureDatabase() {
  logger.error('Failed to connect to database:', error);
  throw error;
  }
+ return sequelize;
  });
  }
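Note: the store now opens SQLite with a small connection pool, retries queries that fail with SQLITE_BUSY up to 10 times, switches the database to WAL mode with synchronous = normal, and caps the journal size; per-query logging is off unless the SQLITE_LOG environment variable is set. A short illustration of the values used above (not shipped code):

// 67108864 bytes in `pragma journal_size_limit` is a 64 MiB cap on the WAL file.
const WAL_JOURNAL_LIMIT = 64 * 1024 * 1024; // === 67108864

// Opt in to SQL statement logging before initCrawler()/initDatabase() runs:
//   SQLITE_LOG=1 node server.js
// `logging: (msg) => process.env.SQLITE_LOG && logger.debug(msg)` short-circuits
// to false while the variable is unset, so nothing is logged by default.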
package/{esm/db → lib/esm/store}/job.d.ts
@@ -10,6 +10,7 @@ export interface JobState {
  quality?: number;
  timeout?: number;
  fullPage?: boolean;
+ lastModified?: string;
  }
  export interface JobModel {
  id: string;
@@ -20,7 +21,7 @@ export interface JobModel {
  delay: number;
  cancelled: boolean;
  }
- declare class Job extends Model<JobModel> implements JobModel {
+ export declare class Job extends Model<JobModel> implements JobModel {
  id: JobModel['id'];
  queue: JobModel['queue'];
  job: JobModel['job'];
@@ -28,6 +29,6 @@ declare class Job extends Model<JobModel> implements JobModel {
  willRunAt: JobModel['willRunAt'];
  delay: JobModel['delay'];
  cancelled: JobModel['cancelled'];
+ static initModel(sequelize: Sequelize): typeof Job;
+ static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
  }
- export { Job };
- export declare function initJobModel(sequelize: Sequelize): typeof Job;
package/lib/esm/store/job.js
@@ -0,0 +1,73 @@
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ import sequelize, { DataTypes, Model } from '@sequelize/core';
+ export class Job extends Model {
+ static initModel(sequelize) {
+ return Job.init({
+ id: {
+ type: DataTypes.STRING(40),
+ primaryKey: true,
+ },
+ queue: {
+ type: DataTypes.STRING(32),
+ allowNull: false,
+ },
+ job: {
+ type: DataTypes.JSON,
+ allowNull: false,
+ },
+ retryCount: {
+ type: DataTypes.INTEGER,
+ },
+ delay: {
+ type: DataTypes.INTEGER,
+ },
+ willRunAt: {
+ type: DataTypes.INTEGER,
+ },
+ cancelled: {
+ type: DataTypes.BOOLEAN,
+ defaultValue: false,
+ },
+ createdAt: {
+ type: DataTypes.DATE,
+ defaultValue: DataTypes.NOW,
+ index: true,
+ },
+ updatedAt: {
+ type: DataTypes.DATE,
+ defaultValue: DataTypes.NOW,
+ index: true,
+ },
+ }, {
+ sequelize,
+ indexes: [{ fields: ['queue'] }],
+ modelName: 'job',
+ tableName: 'jobs',
+ timestamps: true,
+ });
+ }
+ static findJob(condition) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const where = Object.keys(condition)
+ .filter((key) => condition[key] !== undefined)
+ .map((key) => {
+ return sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`), condition[key]);
+ });
+ const job = yield Job.findOne({
+ where: {
+ [sequelize.Op.and]: where,
+ },
+ order: [['createdAt', 'DESC']],
+ });
+ return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
+ });
+ }
+ }
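Note: Job.findJob filters on fields inside the JSON job column via SQLite's json_extract, so any JobState field (id, url, lastModified, ...) can be used as a condition, and the newest matching row wins. A usage sketch; the deep import path is an assumption, since the diff does not show whether the store is re-exported from the package root:

import { Job } from '@arcblock/crawler/lib/esm/store/job'; // path is an assumption

// Roughly equivalent to:
//   SELECT * FROM jobs WHERE json_extract(job, '$.url') = ? ORDER BY createdAt DESC LIMIT 1
const job = await Job.findJob({ url: 'https://example.com/page' });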
package/{esm/db → lib/esm/store}/snapshot.d.ts
@@ -1,5 +1,5 @@
- import { Model, Sequelize } from '@sequelize/core';
- interface SnapshotModel {
+ import { FindOptions, Model, Sequelize } from '@sequelize/core';
+ export interface SnapshotModel {
  jobId: string;
  url: string;
  status: 'success' | 'failed' | 'pending';
@@ -16,7 +16,7 @@ interface SnapshotModel {
  fullPage?: boolean;
  };
  }
- declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
+ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
  jobId: SnapshotModel['jobId'];
  url: SnapshotModel['url'];
  status: SnapshotModel['status'];
@@ -25,7 +25,6 @@ declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
  error?: SnapshotModel['error'];
  lastModified?: SnapshotModel['lastModified'];
  options: SnapshotModel['options'];
+ static initModel(sequelize: Sequelize): typeof Snapshot;
+ static findSnapshot(condition: FindOptions<SnapshotModel>): Promise<SnapshotModel | null>;
  }
- export { Snapshot };
- export type { SnapshotModel };
- export declare function initSnapshotModel(sequelize: Sequelize): typeof Snapshot;
package/lib/esm/store/snapshot.js
@@ -0,0 +1,64 @@
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ import { DataTypes, Model } from '@sequelize/core';
+ export class Snapshot extends Model {
+ static initModel(sequelize) {
+ return Snapshot.init({
+ jobId: {
+ type: DataTypes.STRING,
+ primaryKey: true,
+ allowNull: false,
+ },
+ url: {
+ type: DataTypes.STRING,
+ allowNull: false,
+ index: true,
+ },
+ status: {
+ type: DataTypes.ENUM('success', 'failed', 'pending'),
+ allowNull: false,
+ },
+ html: {
+ type: DataTypes.TEXT,
+ allowNull: true,
+ },
+ screenshot: {
+ type: DataTypes.STRING,
+ allowNull: true,
+ },
+ error: {
+ type: DataTypes.STRING,
+ allowNull: true,
+ },
+ lastModified: {
+ type: DataTypes.STRING,
+ allowNull: true,
+ },
+ options: {
+ type: DataTypes.JSON,
+ allowNull: true,
+ },
+ }, {
+ sequelize,
+ modelName: 'snapshot',
+ tableName: 'snap',
+ timestamps: true,
+ });
+ }
+ static findSnapshot(condition) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const snapshot = yield Snapshot.findOne(Object.assign({ order: [
+ ['lastModified', 'DESC'],
+ ['updatedAt', 'DESC'],
+ ] }, condition));
+ return (snapshot === null || snapshot === void 0 ? void 0 : snapshot.toJSON()) || null;
+ });
+ }
+ }
package/lib/esm/utils.d.ts
@@ -0,0 +1,32 @@
+ import { Page } from '@blocklet/puppeteer';
+ import { Request } from 'express';
+ export declare const axios: import("axios").AxiosInstance;
+ export declare const CRAWLER_FLAG = "x-arcblock-crawler";
+ export declare const sleep: (ms: number) => Promise<unknown>;
+ /**
+ * Check if the request is a arcblock crawler
+ */
+ export declare const isSelfCrawler: (req: Request) => boolean;
+ /**
+ * Check if the request is a static file
+ */
+ export declare function isStaticFile(req: Request): boolean;
+ /**
+ * Check if the request is a spider
+ */
+ export declare function isSpider(req: Request): boolean;
+ /**
+ * Get and parse the robots.txt by `robots-parser`
+ */
+ export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
+ /**
+ * Check if the url is allowed to crawl from robots.txt
+ */
+ export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
+ /**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
+ export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
+ export declare const formatUrl: (url: string) => string;
+ export declare function md5(content: string | Uint8Array): string;
+ export declare function findMaxScrollHeight(page: Page): Promise<number>;
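Note: the Express middleware that shipped in 1.0.x (esm/middleware.js, removed above) is gone, but its building blocks remain exported: isSelfCrawler, isStaticFile, and isSpider from this module (re-exported as the utils namespace), plus getLatestSnapshot from the snapshot service. A hedged sketch of a replacement prerender middleware built only from those exports; the host used to rebuild the absolute URL is an assumption:

import express from 'express';
import { getLatestSnapshot, utils } from '@arcblock/crawler';

const app = express();

// Serve a stored snapshot to search-engine spiders, and skip requests made by the
// crawler itself (it marks its own requests with CRAWLER_FLAG, 'x-arcblock-crawler').
app.use(async (req, res, next) => {
  if (utils.isSelfCrawler(req) || utils.isStaticFile(req) || !utils.isSpider(req)) {
    return next();
  }
  const snapshot = await getLatestSnapshot(`https://example.com${req.originalUrl}`); // host is an assumption
  if (snapshot?.html) {
    return res.send(snapshot.html);
  }
  return next();
});

app.listen(3000);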