@arcblock/crawler 1.0.6 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +66 -0
- package/lib/cjs/config.d.ts +24 -0
- package/{dist → lib/cjs}/config.js +13 -5
- package/lib/cjs/crawler.d.ts +30 -0
- package/{dist → lib/cjs}/crawler.js +63 -117
- package/lib/cjs/cron.d.ts +1 -0
- package/lib/cjs/cron.js +49 -0
- package/lib/cjs/index.d.ts +9 -0
- package/lib/cjs/index.js +80 -0
- package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
- package/{dist → lib/cjs}/puppeteer.js +43 -54
- package/lib/cjs/services/snapshot.d.ts +12 -0
- package/lib/cjs/services/snapshot.js +84 -0
- package/lib/cjs/site.d.ts +2 -0
- package/lib/cjs/site.js +79 -0
- package/lib/cjs/store/index.d.ts +3 -0
- package/{dist/db → lib/cjs/store}/index.js +22 -6
- package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
- package/lib/cjs/store/job.js +110 -0
- package/{dist/db → lib/cjs/store}/snapshot.d.ts +10 -6
- package/lib/cjs/store/snapshot.js +72 -0
- package/lib/cjs/utils.d.ts +32 -0
- package/{dist → lib/cjs}/utils.js +67 -78
- package/lib/esm/config.d.ts +24 -0
- package/lib/esm/config.js +19 -0
- package/lib/esm/crawler.d.ts +30 -0
- package/{esm → lib/esm}/crawler.js +54 -105
- package/lib/esm/cron.d.ts +1 -0
- package/lib/esm/cron.js +43 -0
- package/lib/esm/index.d.ts +9 -0
- package/{esm → lib/esm}/index.js +21 -10
- package/{dist → lib/esm}/puppeteer.d.ts +2 -2
- package/{esm → lib/esm}/puppeteer.js +21 -32
- package/lib/esm/services/snapshot.d.ts +12 -0
- package/lib/esm/services/snapshot.js +75 -0
- package/lib/esm/site.d.ts +2 -0
- package/lib/esm/site.js +72 -0
- package/lib/esm/store/index.d.ts +3 -0
- package/{esm/db → lib/esm/store}/index.js +23 -7
- package/{esm/db → lib/esm/store}/job.d.ts +4 -3
- package/lib/esm/store/job.js +73 -0
- package/{esm/db → lib/esm/store}/snapshot.d.ts +10 -6
- package/lib/esm/store/snapshot.js +68 -0
- package/lib/esm/utils.d.ts +32 -0
- package/{esm → lib/esm}/utils.js +64 -71
- package/package.json +20 -32
- package/third.d.ts +0 -0
- package/dist/blocklet.d.ts +0 -6
- package/dist/blocklet.js +0 -199
- package/dist/cache.d.ts +0 -10
- package/dist/cache.js +0 -119
- package/dist/config.d.ts +0 -10
- package/dist/crawler.d.ts +0 -28
- package/dist/db/index.d.ts +0 -1
- package/dist/db/job.js +0 -54
- package/dist/db/snapshot.js +0 -52
- package/dist/index.d.ts +0 -6
- package/dist/index.js +0 -45
- package/dist/middleware.d.ts +0 -4
- package/dist/middleware.js +0 -44
- package/dist/utils.d.ts +0 -17
- package/esm/blocklet.d.ts +0 -6
- package/esm/blocklet.js +0 -190
- package/esm/cache.d.ts +0 -10
- package/esm/cache.js +0 -114
- package/esm/config.d.ts +0 -10
- package/esm/config.js +0 -11
- package/esm/crawler.d.ts +0 -28
- package/esm/db/index.d.ts +0 -1
- package/esm/db/job.js +0 -50
- package/esm/db/snapshot.js +0 -48
- package/esm/index.d.ts +0 -6
- package/esm/middleware.d.ts +0 -4
- package/esm/middleware.js +0 -41
- package/esm/utils.d.ts +0 -17

package/esm/cache.js
DELETED
@@ -1,114 +0,0 @@
-var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
-    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
-    return new (P || (P = Promise))(function (resolve, reject) {
-        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
-        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
-        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
-        step((generator = generator.apply(thisArg, _arguments || [])).next());
-    });
-};
-var _a;
-import { createPool } from 'generic-pool';
-import { createClient } from 'redis';
-import { config, logger } from './config';
-const cacheKeyPrefix = ((_a = process.env) === null || _a === void 0 ? void 0 : _a.BLOCKLET_REAL_DID) ? `${process.env.BLOCKLET_REAL_DID}:` : '';
-const MAX_REDIS_RETRY = 3;
-const ttl = 1000 * 60 * 60 * 24 * 7;
-export const cachePool = createPool({
-    create: () => __awaiter(void 0, void 0, void 0, function* () {
-        try {
-            const { redisUrl } = config;
-            const redisClient = createClient({
-                url: redisUrl,
-                socket: {
-                    // @ts-ignore
-                    reconnectStrategy: (retries) => {
-                        if (retries >= MAX_REDIS_RETRY) {
-                            return new Error('Retry Time Exhausted');
-                        }
-                        return Math.min(retries * 500, 1000 * 3);
-                    },
-                },
-            });
-            redisClient.on('error', (err) => logger.warn('Redis Client Error:', err));
-            yield redisClient.connect();
-            logger.info(`Successfully connected to Redis: ${redisUrl}`);
-            return redisClient;
-        }
-        catch (error) {
-            logger.warn('Redis connection failed', error);
-            return null;
-        }
-    }),
-    destroy: (client) => __awaiter(void 0, void 0, void 0, function* () {
-        // if is redis client
-        if (client.isReady) {
-            yield client.quit();
-        }
-    }),
-}, {
-    max: 2, // 2 clients
-    min: 0,
-    // evictionRunIntervalMillis: 0,
-});
-export const memoryPool = createPool({
-    create: () => {
-        const map = new Map();
-        // @ts-ignore
-        map.del = map.delete;
-        return Promise.resolve(map);
-    },
-    destroy: (client) => {
-        client.clear();
-        return Promise.resolve();
-    },
-}, {
-    max: 10,
-    min: 0,
-});
-export const withCache = (cb) => __awaiter(void 0, void 0, void 0, function* () {
-    const pool = config.redisUrl ? cachePool : memoryPool;
-    const client = yield pool.acquire();
-    if (client) {
-        try {
-            return cb(client);
-        }
-        finally {
-            // release client to pool, let other use
-            yield pool.release(client);
-        }
-    }
-});
-export const formatKey = (key) => {
-    return `${cacheKeyPrefix}${key}`;
-};
-export const useCache = {
-    get: (key) => {
-        return withCache((client) => __awaiter(void 0, void 0, void 0, function* () {
-            const value = yield client.get(formatKey(key));
-            try {
-                return JSON.parse(value);
-            }
-            catch (error) {
-                // ignore error
-            }
-            return value;
-        }));
-    },
-    set: (key, value, options) => {
-        return withCache((client) => {
-            const formatValue = typeof value === 'string' ? value : JSON.stringify(value);
-            return client.set(formatKey(key), formatValue, Object.assign({ PX: ttl }, options));
-        });
-    },
-    remove: (key) => {
-        return withCache((client) => {
-            return client.del(formatKey(key));
-        });
-    },
-    list: (key = '*') => {
-        return withCache((client) => {
-            return client.keys(formatKey(key));
-        });
-    },
-};
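
For reference, a minimal sketch of how the removed `useCache` helper was typically consumed. The key and value are placeholders, and the relative import assumes code sitting next to the removed esm/ files (the package's public entry-point exports are not shown in this diff):

```ts
// Hypothetical usage of the removed 1.0.6 cache helper.
// With REDIS_URL set, withCache() acquires a Redis client from cachePool;
// otherwise it falls back to the in-memory Map pool. Entries default to a
// 7-day TTL (the PX option above).
import { useCache } from './cache';

async function demo() {
  // Non-string values are JSON-stringified on set and JSON.parse'd on get.
  await useCache.set('https://example.com/page', { content: '<html>...</html>' });
  const cached = await useCache.get('https://example.com/page');
  console.log(cached?.content);

  // Keys are prefixed with BLOCKLET_REAL_DID (when present), so list('*')
  // only returns keys belonging to this blocklet.
  console.log(await useCache.list('*'));
  await useCache.remove('https://example.com/page');
}

demo();
```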
package/esm/config.d.ts
DELETED

package/esm/config.js
DELETED
@@ -1,11 +0,0 @@
-import createLogger from '@blocklet/logger';
-export const logger = createLogger('crawler', { level: process.env.LOG_LEVEL || 'info' });
-export const config = {
-    redisUrl: process.env.REDIS_URL,
-    dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
-    appUrl: process.env.BLOCKLET_APP_URL,
-    puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-    cacheDir: process.env.BLOCKLET_CACHE_DIR,
-    testOnInitialize: process.env.NODE_ENV === 'production',
-};
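
The removed config module reads its settings straight from the environment at load time, so configuration amounted to exporting variables before the module is imported. An illustrative sketch (all values are placeholders):

```ts
// Illustrative only: REDIS_URL decides whether withCache() uses the Redis
// pool or the in-memory Map pool; the other variables are blocklet paths.
process.env.REDIS_URL = 'redis://127.0.0.1:6379';            // placeholder
process.env.BLOCKLET_APP_URL = 'https://example.com';         // placeholder
process.env.PUPPETEER_EXECUTABLE_PATH = '/usr/bin/chromium';  // placeholder

// config captures process.env once at module load, so set variables first.
const { config, logger } = await import('./config');
logger.info('crawler config', config);
```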

package/esm/crawler.d.ts
DELETED
@@ -1,28 +0,0 @@
-import { JobState } from './db/job';
-import { SnapshotModel } from './db/snapshot';
-export declare function createCrawlQueue(): void;
-export declare function getDataDir(): Promise<{
-    htmlDir: string;
-    screenshotDir: string;
-}>;
-export declare const getPageContent: ({ url, formatPageContent, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
-    url: string;
-    formatPageContent?: Function;
-    includeScreenshot?: boolean;
-    includeHtml?: boolean;
-    width?: number;
-    height?: number;
-    quality?: number;
-    timeout?: number;
-    fullPage?: boolean;
-}) => Promise<{
-    html: string;
-    screenshot: Uint8Array<ArrayBufferLike> | null;
-}>;
-export declare function createCrawlJob(params: JobState, callback?: (snapshot: SnapshotModel | null) => void): Promise<any>;
-export declare function getJob(condition: Partial<JobState>): Promise<any>;
-export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
-/**
- * get snapshot from db or crawl queue
- */
-export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
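
Based on the declarations above, a hedged sketch of calling the removed 1.0.6 crawler API. The URL and options are placeholders, and the relative import assumes code next to the removed esm/ files:

```ts
// Sketch derived from the removed esm/crawler.d.ts.
import { getPageContent, createCrawlJob, getSnapshot } from './crawler';

async function crawlOnce() {
  // Render the page headlessly and capture both the HTML and a screenshot.
  const { html, screenshot } = await getPageContent({
    url: 'https://example.com',
    includeHtml: true,
    includeScreenshot: true,
    width: 1280,
    height: 800,
    timeout: 30_000,
  });
  console.log(html.length, screenshot?.byteLength ?? 0);

  // Queued variant: JobState's exact shape is not shown in this diff, so the
  // params object and using the resolved value as a job id are assumptions.
  const jobId = await createCrawlJob({ url: 'https://example.com' } as any);
  const snapshot = await getSnapshot(jobId);
  console.log(snapshot?.status, snapshot?.lastModified);
}

crawlOnce();
```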

package/esm/db/index.d.ts
DELETED
@@ -1 +0,0 @@
-export declare function ensureDatabase(): Promise<void>;

package/esm/db/job.js
DELETED
@@ -1,50 +0,0 @@
-import { DataTypes, Model } from '@sequelize/core';
-class Job extends Model {
-}
-export { Job };
-export function initJobModel(sequelize) {
-    Job.init({
-        id: {
-            type: DataTypes.STRING(40),
-            primaryKey: true,
-        },
-        queue: {
-            type: DataTypes.STRING(32),
-            allowNull: false,
-        },
-        job: {
-            type: DataTypes.JSON,
-            allowNull: false,
-        },
-        retryCount: {
-            type: DataTypes.INTEGER,
-        },
-        delay: {
-            type: DataTypes.INTEGER,
-        },
-        willRunAt: {
-            type: DataTypes.INTEGER,
-        },
-        cancelled: {
-            type: DataTypes.BOOLEAN,
-            defaultValue: false,
-        },
-        createdAt: {
-            type: DataTypes.DATE,
-            defaultValue: DataTypes.NOW,
-            index: true,
-        },
-        updatedAt: {
-            type: DataTypes.DATE,
-            defaultValue: DataTypes.NOW,
-            index: true,
-        },
-    }, {
-        sequelize,
-        indexes: [{ fields: ['queue'] }],
-        modelName: 'job',
-        tableName: 'jobs',
-        timestamps: true,
-    });
-    return Job;
-}

package/esm/db/snapshot.js
DELETED
@@ -1,48 +0,0 @@
-import { DataTypes, Model } from '@sequelize/core';
-class Snapshot extends Model {
-}
-export { Snapshot };
-export function initSnapshotModel(sequelize) {
-    Snapshot.init({
-        jobId: {
-            type: DataTypes.STRING,
-            primaryKey: true,
-            allowNull: false,
-        },
-        url: {
-            type: DataTypes.STRING,
-            allowNull: false,
-            index: true,
-        },
-        status: {
-            type: DataTypes.ENUM('success', 'failed'),
-            allowNull: false,
-        },
-        html: {
-            type: DataTypes.TEXT,
-            allowNull: true,
-        },
-        screenshot: {
-            type: DataTypes.STRING,
-            allowNull: true,
-        },
-        error: {
-            type: DataTypes.STRING,
-            allowNull: true,
-        },
-        lastModified: {
-            type: DataTypes.STRING,
-            allowNull: true,
-        },
-        options: {
-            type: DataTypes.JSON,
-            allowNull: true,
-        },
-    }, {
-        sequelize,
-        modelName: 'snapshot',
-        tableName: 'snap',
-        timestamps: true,
-    });
-    return Snapshot;
-}
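
The removed Job and Snapshot models above are plain Sequelize models registered against a shared connection. A sketch of how they might have been wired together; the real ensureDatabase() (esm/db/index.js, collapsed in this diff) takes no arguments and builds its own connection, so passing one in here is an assumption made to keep the example self-contained:

```ts
// Sketch only: assumes a Sequelize instance is constructed elsewhere.
import { Sequelize } from '@sequelize/core';
import { initJobModel } from './job';
import { initSnapshotModel } from './snapshot';

export async function ensureDatabase(sequelize: Sequelize): Promise<void> {
  // Register both models on the shared connection, then create the
  // "jobs" and "snap" tables if they do not exist yet.
  initJobModel(sequelize);
  initSnapshotModel(sequelize);
  await sequelize.sync();
}
```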
package/esm/index.d.ts
DELETED
package/esm/middleware.d.ts
DELETED

package/esm/middleware.js
DELETED
@@ -1,41 +0,0 @@
-var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
-    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
-    return new (P || (P = Promise))(function (resolve, reject) {
-        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
-        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
-        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
-        step((generator = generator.apply(thisArg, _arguments || [])).next());
-    });
-};
-import { useCache } from './cache';
-import { getFullUrl, isAcceptCrawler, isBotUserAgent, isSelfCrawler } from './utils';
-export function initSEOMiddleware({ autoReturnHtml = true, allowCrawler = true, }) {
-    return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
-        const isBot = isBotUserAgent(req);
-        const isSelf = isSelfCrawler(req);
-        if (!isBot || isSelf) {
-            return next();
-        }
-        const fullUrl = getFullUrl(req);
-        const canCrawl = yield isAcceptCrawler(fullUrl);
-        const allowCrawlerResult = typeof allowCrawler === 'function' ? allowCrawler(req) : allowCrawler;
-        // can not crawl, skip
-        if (!canCrawl || !allowCrawlerResult) {
-            return next();
-        }
-        const cacheData = yield useCache.get(fullUrl);
-        // add cached html to req
-        req.cachedHtml = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.content) || cacheData || null;
-        // add cached lastModified to req, ISO string to GMT string
-        req.cachedLastmod = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified) ? new Date(cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified).toUTCString() : null;
-        if (req.cachedLastmod) {
-            res.setHeader('Last-Modified', req.cachedLastmod);
-        }
-        if (autoReturnHtml && req.cachedHtml) {
-            res.send(req.cachedHtml);
-            return;
-        }
-        // missing cache
-        next();
-    });
-}
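
A minimal sketch of mounting the removed initSEOMiddleware in an Express app (the middleware files have no counterpart under the new lib/ layout in 1.1.2). Port and fallback handler are illustrative, and the relative import again assumes code next to the removed esm/ files:

```ts
import express from 'express';
import { initSEOMiddleware } from './middleware';

const app = express();

// Bot requests that pass the robots.txt check receive the cached, pre-rendered
// HTML (with a Last-Modified header); everything else falls through to the SPA.
app.use(initSEOMiddleware({ autoReturnHtml: true, allowCrawler: true }));

app.get('*', (_req, res) => {
  res.send('<!doctype html><html><body>client-rendered app</body></html>');
});

app.listen(3000);
```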

package/esm/utils.d.ts
DELETED
@@ -1,17 +0,0 @@
-import { Page } from '@blocklet/puppeteer';
-export declare const api: import("axios").AxiosInstance;
-export declare const sleep: (ms: number) => Promise<unknown>;
-export declare const CRAWLER_FLAG = "x-crawler";
-export declare const isSelfCrawler: (req: any) => boolean;
-export declare const getDefaultRobotsUrl: (url: string) => string;
-export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
-export declare const getDefaultSitemapUrl: (url: string) => string;
-export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
-export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
-export declare const isBotUserAgent: (req: any) => boolean;
-export declare const getComponentInfo: () => {};
-export declare const getFullUrl: (req: any) => string;
-export declare const getRelativePath: (url: string) => string;
-export declare const formatUrl: (url: string) => string;
-export declare function md5(content: string | Uint8Array): string;
-export declare function findMaxScrollHeight(page: Page): Promise<number>;