@arcblock/crawler 1.0.6 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +66 -0
  2. package/lib/cjs/config.d.ts +24 -0
  3. package/{dist → lib/cjs}/config.js +13 -5
  4. package/lib/cjs/crawler.d.ts +30 -0
  5. package/{dist → lib/cjs}/crawler.js +63 -117
  6. package/lib/cjs/cron.d.ts +1 -0
  7. package/lib/cjs/cron.js +49 -0
  8. package/lib/cjs/index.d.ts +9 -0
  9. package/lib/cjs/index.js +80 -0
  10. package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
  11. package/{dist → lib/cjs}/puppeteer.js +43 -54
  12. package/lib/cjs/services/snapshot.d.ts +12 -0
  13. package/lib/cjs/services/snapshot.js +84 -0
  14. package/lib/cjs/site.d.ts +2 -0
  15. package/lib/cjs/site.js +79 -0
  16. package/lib/cjs/store/index.d.ts +3 -0
  17. package/{dist/db → lib/cjs/store}/index.js +22 -6
  18. package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
  19. package/lib/cjs/store/job.js +110 -0
  20. package/{dist/db → lib/cjs/store}/snapshot.d.ts +10 -6
  21. package/lib/cjs/store/snapshot.js +72 -0
  22. package/lib/cjs/utils.d.ts +32 -0
  23. package/{dist → lib/cjs}/utils.js +67 -78
  24. package/lib/esm/config.d.ts +24 -0
  25. package/lib/esm/config.js +19 -0
  26. package/lib/esm/crawler.d.ts +30 -0
  27. package/{esm → lib/esm}/crawler.js +54 -105
  28. package/lib/esm/cron.d.ts +1 -0
  29. package/lib/esm/cron.js +43 -0
  30. package/lib/esm/index.d.ts +9 -0
  31. package/{esm → lib/esm}/index.js +21 -10
  32. package/{dist → lib/esm}/puppeteer.d.ts +2 -2
  33. package/{esm → lib/esm}/puppeteer.js +21 -32
  34. package/lib/esm/services/snapshot.d.ts +12 -0
  35. package/lib/esm/services/snapshot.js +75 -0
  36. package/lib/esm/site.d.ts +2 -0
  37. package/lib/esm/site.js +72 -0
  38. package/lib/esm/store/index.d.ts +3 -0
  39. package/{esm/db → lib/esm/store}/index.js +23 -7
  40. package/{esm/db → lib/esm/store}/job.d.ts +4 -3
  41. package/lib/esm/store/job.js +73 -0
  42. package/{esm/db → lib/esm/store}/snapshot.d.ts +10 -6
  43. package/lib/esm/store/snapshot.js +68 -0
  44. package/lib/esm/utils.d.ts +32 -0
  45. package/{esm → lib/esm}/utils.js +64 -71
  46. package/package.json +20 -32
  47. package/third.d.ts +0 -0
  48. package/dist/blocklet.d.ts +0 -6
  49. package/dist/blocklet.js +0 -199
  50. package/dist/cache.d.ts +0 -10
  51. package/dist/cache.js +0 -119
  52. package/dist/config.d.ts +0 -10
  53. package/dist/crawler.d.ts +0 -28
  54. package/dist/db/index.d.ts +0 -1
  55. package/dist/db/job.js +0 -54
  56. package/dist/db/snapshot.js +0 -52
  57. package/dist/index.d.ts +0 -6
  58. package/dist/index.js +0 -45
  59. package/dist/middleware.d.ts +0 -4
  60. package/dist/middleware.js +0 -44
  61. package/dist/utils.d.ts +0 -17
  62. package/esm/blocklet.d.ts +0 -6
  63. package/esm/blocklet.js +0 -190
  64. package/esm/cache.d.ts +0 -10
  65. package/esm/cache.js +0 -114
  66. package/esm/config.d.ts +0 -10
  67. package/esm/config.js +0 -11
  68. package/esm/crawler.d.ts +0 -28
  69. package/esm/db/index.d.ts +0 -1
  70. package/esm/db/job.js +0 -50
  71. package/esm/db/snapshot.js +0 -48
  72. package/esm/index.d.ts +0 -6
  73. package/esm/middleware.d.ts +0 -4
  74. package/esm/middleware.js +0 -41
  75. package/esm/utils.d.ts +0 -17
package/esm/cache.js DELETED
@@ -1,114 +0,0 @@
1
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
- return new (P || (P = Promise))(function (resolve, reject) {
4
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
- step((generator = generator.apply(thisArg, _arguments || [])).next());
8
- });
9
- };
10
- var _a;
11
- import { createPool } from 'generic-pool';
12
- import { createClient } from 'redis';
13
- import { config, logger } from './config';
14
- const cacheKeyPrefix = ((_a = process.env) === null || _a === void 0 ? void 0 : _a.BLOCKLET_REAL_DID) ? `${process.env.BLOCKLET_REAL_DID}:` : '';
15
- const MAX_REDIS_RETRY = 3;
16
- const ttl = 1000 * 60 * 60 * 24 * 7;
17
- export const cachePool = createPool({
18
- create: () => __awaiter(void 0, void 0, void 0, function* () {
19
- try {
20
- const { redisUrl } = config;
21
- const redisClient = createClient({
22
- url: redisUrl,
23
- socket: {
24
- // @ts-ignore
25
- reconnectStrategy: (retries) => {
26
- if (retries >= MAX_REDIS_RETRY) {
27
- return new Error('Retry Time Exhausted');
28
- }
29
- return Math.min(retries * 500, 1000 * 3);
30
- },
31
- },
32
- });
33
- redisClient.on('error', (err) => logger.warn('Redis Client Error:', err));
34
- yield redisClient.connect();
35
- logger.info(`Successfully connected to Redis: ${redisUrl}`);
36
- return redisClient;
37
- }
38
- catch (error) {
39
- logger.warn('Redis connection failed', error);
40
- return null;
41
- }
42
- }),
43
- destroy: (client) => __awaiter(void 0, void 0, void 0, function* () {
44
- // if is redis client
45
- if (client.isReady) {
46
- yield client.quit();
47
- }
48
- }),
49
- }, {
50
- max: 2, // 2 clients
51
- min: 0,
52
- // evictionRunIntervalMillis: 0,
53
- });
54
- export const memoryPool = createPool({
55
- create: () => {
56
- const map = new Map();
57
- // @ts-ignore
58
- map.del = map.delete;
59
- return Promise.resolve(map);
60
- },
61
- destroy: (client) => {
62
- client.clear();
63
- return Promise.resolve();
64
- },
65
- }, {
66
- max: 10,
67
- min: 0,
68
- });
69
- export const withCache = (cb) => __awaiter(void 0, void 0, void 0, function* () {
70
- const pool = config.redisUrl ? cachePool : memoryPool;
71
- const client = yield pool.acquire();
72
- if (client) {
73
- try {
74
- return cb(client);
75
- }
76
- finally {
77
- // release client to pool, let other use
78
- yield pool.release(client);
79
- }
80
- }
81
- });
82
- export const formatKey = (key) => {
83
- return `${cacheKeyPrefix}${key}`;
84
- };
85
- export const useCache = {
86
- get: (key) => {
87
- return withCache((client) => __awaiter(void 0, void 0, void 0, function* () {
88
- const value = yield client.get(formatKey(key));
89
- try {
90
- return JSON.parse(value);
91
- }
92
- catch (error) {
93
- // ignore error
94
- }
95
- return value;
96
- }));
97
- },
98
- set: (key, value, options) => {
99
- return withCache((client) => {
100
- const formatValue = typeof value === 'string' ? value : JSON.stringify(value);
101
- return client.set(formatKey(key), formatValue, Object.assign({ PX: ttl }, options));
102
- });
103
- },
104
- remove: (key) => {
105
- return withCache((client) => {
106
- return client.del(formatKey(key));
107
- });
108
- },
109
- list: (key = '*') => {
110
- return withCache((client) => {
111
- return client.keys(formatKey(key));
112
- });
113
- },
114
- };
package/esm/config.d.ts DELETED
@@ -1,10 +0,0 @@
1
- export declare const logger: any;
2
- export declare const config: {
3
- redisUrl: string;
4
- dataDir: string;
5
- appDir: string;
6
- appUrl: string;
7
- puppeteerPath: string;
8
- cacheDir: string;
9
- testOnInitialize: boolean;
10
- };
package/esm/config.js DELETED
@@ -1,11 +0,0 @@
1
- import createLogger from '@blocklet/logger';
2
- export const logger = createLogger('crawler', { level: process.env.LOG_LEVEL || 'info' });
3
- export const config = {
4
- redisUrl: process.env.REDIS_URL,
5
- dataDir: process.env.BLOCKLET_DATA_DIR,
6
- appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
7
- appUrl: process.env.BLOCKLET_APP_URL,
8
- puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
9
- cacheDir: process.env.BLOCKLET_CACHE_DIR,
10
- testOnInitialize: process.env.NODE_ENV === 'production',
11
- };
package/esm/crawler.d.ts DELETED
@@ -1,28 +0,0 @@
1
- import { JobState } from './db/job';
2
- import { SnapshotModel } from './db/snapshot';
3
- export declare function createCrawlQueue(): void;
4
- export declare function getDataDir(): Promise<{
5
- htmlDir: string;
6
- screenshotDir: string;
7
- }>;
8
- export declare const getPageContent: ({ url, formatPageContent, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
9
- url: string;
10
- formatPageContent?: Function;
11
- includeScreenshot?: boolean;
12
- includeHtml?: boolean;
13
- width?: number;
14
- height?: number;
15
- quality?: number;
16
- timeout?: number;
17
- fullPage?: boolean;
18
- }) => Promise<{
19
- html: string;
20
- screenshot: Uint8Array<ArrayBufferLike> | null;
21
- }>;
22
- export declare function createCrawlJob(params: JobState, callback?: (snapshot: SnapshotModel | null) => void): Promise<any>;
23
- export declare function getJob(condition: Partial<JobState>): Promise<any>;
24
- export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
25
- /**
26
- * get snapshot from db or crawl queue
27
- */
28
- export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
package/esm/db/index.d.ts DELETED
@@ -1 +0,0 @@
1
- export declare function ensureDatabase(): Promise<void>;
package/esm/db/job.js DELETED
@@ -1,50 +0,0 @@
1
- import { DataTypes, Model } from '@sequelize/core';
2
- class Job extends Model {
3
- }
4
- export { Job };
5
- export function initJobModel(sequelize) {
6
- Job.init({
7
- id: {
8
- type: DataTypes.STRING(40),
9
- primaryKey: true,
10
- },
11
- queue: {
12
- type: DataTypes.STRING(32),
13
- allowNull: false,
14
- },
15
- job: {
16
- type: DataTypes.JSON,
17
- allowNull: false,
18
- },
19
- retryCount: {
20
- type: DataTypes.INTEGER,
21
- },
22
- delay: {
23
- type: DataTypes.INTEGER,
24
- },
25
- willRunAt: {
26
- type: DataTypes.INTEGER,
27
- },
28
- cancelled: {
29
- type: DataTypes.BOOLEAN,
30
- defaultValue: false,
31
- },
32
- createdAt: {
33
- type: DataTypes.DATE,
34
- defaultValue: DataTypes.NOW,
35
- index: true,
36
- },
37
- updatedAt: {
38
- type: DataTypes.DATE,
39
- defaultValue: DataTypes.NOW,
40
- index: true,
41
- },
42
- }, {
43
- sequelize,
44
- indexes: [{ fields: ['queue'] }],
45
- modelName: 'job',
46
- tableName: 'jobs',
47
- timestamps: true,
48
- });
49
- return Job;
50
- }
package/esm/db/snapshot.js DELETED
@@ -1,48 +0,0 @@
1
- import { DataTypes, Model } from '@sequelize/core';
2
- class Snapshot extends Model {
3
- }
4
- export { Snapshot };
5
- export function initSnapshotModel(sequelize) {
6
- Snapshot.init({
7
- jobId: {
8
- type: DataTypes.STRING,
9
- primaryKey: true,
10
- allowNull: false,
11
- },
12
- url: {
13
- type: DataTypes.STRING,
14
- allowNull: false,
15
- index: true,
16
- },
17
- status: {
18
- type: DataTypes.ENUM('success', 'failed'),
19
- allowNull: false,
20
- },
21
- html: {
22
- type: DataTypes.TEXT,
23
- allowNull: true,
24
- },
25
- screenshot: {
26
- type: DataTypes.STRING,
27
- allowNull: true,
28
- },
29
- error: {
30
- type: DataTypes.STRING,
31
- allowNull: true,
32
- },
33
- lastModified: {
34
- type: DataTypes.STRING,
35
- allowNull: true,
36
- },
37
- options: {
38
- type: DataTypes.JSON,
39
- allowNull: true,
40
- },
41
- }, {
42
- sequelize,
43
- modelName: 'snapshot',
44
- tableName: 'snap',
45
- timestamps: true,
46
- });
47
- return Snapshot;
48
- }
package/esm/index.d.ts DELETED
@@ -1,6 +0,0 @@
1
- import { config } from './config';
2
- export * from './blocklet';
3
- export * from './crawler';
4
- export * from './middleware';
5
- export { Snapshot } from './db/snapshot';
6
- export declare function initCrawler(_config: Partial<typeof config>): Promise<void>;
package/esm/middleware.d.ts DELETED
@@ -1,4 +0,0 @@
1
- export declare function initSEOMiddleware({ autoReturnHtml, allowCrawler, }: {
2
- autoReturnHtml?: Boolean;
3
- allowCrawler?: Boolean | Function;
4
- }): (req: any, res: any, next: Function) => Promise<any>;
package/esm/middleware.js DELETED
@@ -1,41 +0,0 @@
1
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
- return new (P || (P = Promise))(function (resolve, reject) {
4
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
- step((generator = generator.apply(thisArg, _arguments || [])).next());
8
- });
9
- };
10
- import { useCache } from './cache';
11
- import { getFullUrl, isAcceptCrawler, isBotUserAgent, isSelfCrawler } from './utils';
12
- export function initSEOMiddleware({ autoReturnHtml = true, allowCrawler = true, }) {
13
- return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
14
- const isBot = isBotUserAgent(req);
15
- const isSelf = isSelfCrawler(req);
16
- if (!isBot || isSelf) {
17
- return next();
18
- }
19
- const fullUrl = getFullUrl(req);
20
- const canCrawl = yield isAcceptCrawler(fullUrl);
21
- const allowCrawlerResult = typeof allowCrawler === 'function' ? allowCrawler(req) : allowCrawler;
22
- // can not crawl, skip
23
- if (!canCrawl || !allowCrawlerResult) {
24
- return next();
25
- }
26
- const cacheData = yield useCache.get(fullUrl);
27
- // add cached html to req
28
- req.cachedHtml = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.content) || cacheData || null;
29
- // add cached lastModified to req, ISO string to GMT string
30
- req.cachedLastmod = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified) ? new Date(cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified).toUTCString() : null;
31
- if (req.cachedLastmod) {
32
- res.setHeader('Last-Modified', req.cachedLastmod);
33
- }
34
- if (autoReturnHtml && req.cachedHtml) {
35
- res.send(req.cachedHtml);
36
- return;
37
- }
38
- // missing cache
39
- next();
40
- });
41
- }
package/esm/utils.d.ts DELETED
@@ -1,17 +0,0 @@
1
- import { Page } from '@blocklet/puppeteer';
2
- export declare const api: import("axios").AxiosInstance;
3
- export declare const sleep: (ms: number) => Promise<unknown>;
4
- export declare const CRAWLER_FLAG = "x-crawler";
5
- export declare const isSelfCrawler: (req: any) => boolean;
6
- export declare const getDefaultRobotsUrl: (url: string) => string;
7
- export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
8
- export declare const getDefaultSitemapUrl: (url: string) => string;
9
- export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
10
- export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
11
- export declare const isBotUserAgent: (req: any) => boolean;
12
- export declare const getComponentInfo: () => {};
13
- export declare const getFullUrl: (req: any) => string;
14
- export declare const getRelativePath: (url: string) => string;
15
- export declare const formatUrl: (url: string) => string;
16
- export declare function md5(content: string | Uint8Array): string;
17
- export declare function findMaxScrollHeight(page: Page): Promise<number>;