@arcblock/crawler-middleware 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,59 @@
1
+ # @arcblock/crawler-middleware
2
+
3
+ This Express middleware serves pre-rendered HTML generated by SnapKit for Blocklets, enabling them to return complete HTML content to web crawlers. This is essential for SEO, because it ensures that search engines can properly index dynamically generated content.
4
+
5
+ ## Usage
6
+
7
+ ```typescript
8
+ import { createSnapshotMiddleware } from '@arcblock/crawler-middleware';
9
+
10
+ const app = express();
11
+
12
+ app.use(
13
+ createSnapshotMiddleware({
14
+ endpoint: process.env.SNAP_KIT_ENDPOINT,
15
+ accessKey: process.env.SNAP_KIT_ACCESS_KEY,
16
+ allowCrawler: (req) => {
17
+ return req.path === '/';
18
+ },
19
+ }),
20
+ );
21
+ ```
22
+
23
+ ## How it Works
24
+
25
+ 1. The middleware intercepts incoming requests.
26
+ 2. It checks if the request is from a web crawler.
27
+ 3. For crawler requests, it tries to read the pre-rendered HTML from the local cache and returns it on a hit.
28
+ 4. If no cached snapshot is found (or the cached one is older than `cacheUpdateInterval`), an asynchronous request is made to SnapKit and the local cache is updated in the background.
29
+ 5. The request that triggers the refresh does not wait for it; the next crawler visit will hit step 3 and be served the refreshed cache directly.
30
+
31
+ ## Options
32
+
33
+ The options for createSnapshotMiddleware:
34
+
35
+ ```typescript
36
+ {
37
+ /** SnapKit endpoint */
38
+ endpoint: string;
39
+ /** SnapKit access key */
40
+ accessKey: string;
41
+ /** Max cache size for LRU cache */
42
+ cacheMax?: number;
43
+ /**
44
+ * Cache update interval
45
+ * Once a cached snapshot is older than this interval (in milliseconds; defaults to 24 hours), the middleware tries to fetch a fresh copy from SnapKit
46
+ */
47
+ cacheUpdateInterval?: number;
48
+ };
49
+ ```
50
+
51
+ ## Environment Variables
52
+
53
+ When using this middleware outside of a Blocklet environment, you need to configure the following environment variables:
54
+
55
+ - `BLOCKLET_APP_DATA_DIR`: (Required) Directory path for storing the SQLite database file
56
+ - `BLOCKLET_LOG_DIR`: (Required) Directory path for storing @blocklet/logger logs
57
+ - `BLOCKLET_APP_URL`: (Optional) Deployed domain
58
+
59
+ You can set these variables in your `.env` file.
@@ -0,0 +1,26 @@
1
+ import { SnapshotModel } from './store/index';
2
+ export type CacheManagerOptions = { // Options accepted by the CacheManager constructor
3
+ /** Base URL of the SnapKit service; the implementation requests `api/crawl` under it */
4
+ endpoint: string;
5
+ /** SnapKit access key, sent as a Bearer token in the Authorization header */
6
+ accessKey: string;
7
+ /** Maximum number of entries kept in the in-memory LRU cache (the implementation defaults to 500) */
8
+ cacheMax?: number;
9
+ /**
10
+ * Cache update interval in milliseconds (the implementation defaults to 24 hours)
11
+ * Once a cached snapshot is older than this, the manager reports it as expired so it can be refreshed from SnapKit
12
+ */
13
+ cacheUpdateInterval?: number;
14
+ };
15
+ export declare class CacheManager { // Snapshot cache: in-memory LRU in front of the Snapshot store, refreshed from SnapKit
16
+ private options;
17
+ private cache;
18
+ private initializedPromise;
19
+ constructor(options: CacheManagerOptions); // starts database initialisation immediately
20
+ waitReady(): Promise<void>; // settles once database initialisation has finished
21
+ getSnapshot(url: string): Promise<SnapshotModel | null>; // memory first, then the store; null when neither has the URL
22
+ setSnapshot(url: string, snapshot: SnapshotModel): Promise<void>; // creates the store row, then caches the snapshot
23
+ fetchSnapKit(url: string): Promise<any>; // SnapKit payload when its status is 'success', otherwise null; request errors are logged, not thrown
24
+ isCacheExpired(url: string): Promise<boolean>; // true when the URL has no snapshot or it is older than cacheUpdateInterval
25
+ updateSnapshot(url: string): Promise<void>; // fetches from SnapKit and upserts store and cache; failures are logged, not thrown
26
+ }
@@ -0,0 +1,111 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // Async helper: runs a generator as a promise-returning function; reuses this.__awaiter when one is already defined
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // wrap non-P values so .then is always available
4
+ return new (P || (P = Promise))(function (resolve, reject) { // P falls back to the global Promise
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } // resume the generator with the settled value
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } // re-raise the rejection inside the generator
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } // resolve on completion, otherwise wait for the yielded value
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next()); // start the generator with the supplied this/arguments
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.CacheManager = void 0;
13
+ const crawler_1 = require("@arcblock/crawler");
14
+ const lru_cache_1 = require("lru-cache");
15
+ const ufo_1 = require("ufo");
16
+ const env_1 = require("./env");
17
+ const index_1 = require("./store/index");
18
/**
 * Two-level snapshot cache: an in-memory LRU in front of the `Snapshot` store,
 * refreshed on demand from the SnapKit crawl API.
 */
class CacheManager {
    /**
     * Builds the in-memory cache and starts database initialisation; callers
     * await `waitReady()` before touching the store.
     *
     * @param options `endpoint` and `accessKey` of the SnapKit service, plus the
     *   optional `cacheMax` (LRU capacity, default 500) and `cacheUpdateInterval`
     *   (milliseconds, default 24 hours).
     */
    constructor(options) {
        const defaults = { cacheMax: 500, cacheUpdateInterval: 1000 * 60 * 60 * 24 };
        this.options = Object.assign({}, defaults, options);
        // Object.assign copies an explicitly passed `undefined` over the default; restore it so
        // the expiry comparison in isCacheExpired never runs against `undefined`.
        if (this.options.cacheUpdateInterval == null) {
            this.options.cacheUpdateInterval = defaults.cacheUpdateInterval;
        }
        this.cache = new lru_cache_1.LRUCache({ max: this.options.cacheMax || defaults.cacheMax });
        // Refreshes currently in flight, keyed by URL, so concurrent requests share one SnapKit call.
        this.pendingUpdates = new Map();
        this.initializedPromise = Promise.all([(0, index_1.initDatabase)()]);
    }
    /** Settles once database initialisation has finished; rejects if it failed. */
    waitReady() {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.initializedPromise;
        });
    }
    /**
     * Looks a snapshot up in memory first and in the store second.
     *
     * @param url Absolute page URL used as the cache key.
     * @returns The snapshot, or null when neither level has the URL.
     */
    getSnapshot(url) {
        return __awaiter(this, void 0, void 0, function* () {
            const cachedSnapshot = this.cache.get(url);
            if (cachedSnapshot) {
                return cachedSnapshot;
            }
            const snapshot = yield index_1.Snapshot.findOne({ where: { url } });
            if (snapshot) {
                // Promote the stored row so the next lookup skips the database.
                this.cache.set(url, snapshot);
                return snapshot;
            }
            return null;
        });
    }
    /**
     * Persists a new snapshot row and caches it under `url`.
     * Rejects when the row cannot be created (for example when the URL already exists).
     */
    setSnapshot(url, snapshot) {
        return __awaiter(this, void 0, void 0, function* () {
            const created = yield index_1.Snapshot.create(snapshot);
            // Cache the persisted instance rather than the caller's object so the entry carries
            // the timestamps that isCacheExpired relies on.
            this.cache.set(url, created);
        });
    }
    /**
     * Requests the pre-rendered snapshot of `url` from SnapKit's `api/crawl` endpoint.
     *
     * @returns The `data.data` payload when its `status` is 'success', otherwise null.
     *   Request errors are logged and also produce null; this method does not throw.
     */
    fetchSnapKit(url) {
        return __awaiter(this, void 0, void 0, function* () {
            const { endpoint, accessKey } = this.options;
            const api = (0, ufo_1.joinURL)(endpoint, 'api/crawl');
            env_1.logger.debug('Fetching snapshot from SnapKit', { url, api });
            try {
                const { data } = yield crawler_1.utils.axios.get(api, {
                    params: {
                        url,
                    },
                    headers: {
                        Authorization: `Bearer ${accessKey}`,
                    },
                });
                const { data: snapshotData } = data || {};
                if ((snapshotData === null || snapshotData === void 0 ? void 0 : snapshotData.status) !== 'success') {
                    env_1.logger.info(`No valid HTML found for ${url} from SnapKit`, { snapshotData, data });
                    return null;
                }
                env_1.logger.info('Success to fetch content by SnapKit and cache it', {
                    url,
                    jobId: snapshotData.jobId,
                    lastModified: snapshotData.lastModified,
                });
                return snapshotData;
            }
            catch (error) {
                env_1.logger.error('Failed to fetch content by SnapKit', { url, error });
                return null;
            }
        });
    }
    /**
     * Reports whether the snapshot for `url` is missing or due for a refresh.
     *
     * @returns true when there is no snapshot, its age cannot be determined, or it was last
     *   written more than `cacheUpdateInterval` milliseconds ago.
     */
    isCacheExpired(url) {
        return __awaiter(this, void 0, void 0, function* () {
            const snapshot = yield this.getSnapshot(url);
            if (!snapshot) {
                return true;
            }
            // Age is measured from the last write. `createdAt` alone is not enough: an upsert of an
            // existing row leaves it untouched, so a refreshed snapshot would still look expired.
            const lastWrittenAt = new Date(snapshot.updatedAt || snapshot.createdAt).getTime();
            if (Number.isNaN(lastWrittenAt)) {
                // No usable timestamp: refresh rather than keep an entry that could never expire.
                return true;
            }
            return Date.now() - lastWrittenAt > this.options.cacheUpdateInterval;
        });
    }
    /**
     * Fetches a fresh snapshot from SnapKit and writes it to the store and the in-memory cache.
     * Concurrent calls for the same URL share a single in-flight refresh. Failures are logged;
     * the returned promise never rejects.
     */
    updateSnapshot(url) {
        const inFlight = this.pendingUpdates.get(url);
        if (inFlight) {
            return inFlight;
        }
        const refresh = __awaiter(this, void 0, void 0, function* () {
            try {
                const snapshot = yield this.fetchSnapKit(url);
                if (snapshot) {
                    // update db
                    const [upserted] = yield index_1.Snapshot.upsert({
                        url,
                        html: snapshot.html,
                        lastModified: snapshot.lastModified,
                    });
                    // update cache: re-read the row so the cached entry carries the timestamps
                    // actually stored; the instance returned by upsert may not include them.
                    const stored = (yield index_1.Snapshot.findOne({ where: { url } })) || upserted;
                    this.cache.set(url, stored);
                }
            }
            catch (error) {
                env_1.logger.error('Failed to update snapshot', { url, error });
            }
        });
        this.pendingUpdates.set(url, refresh);
        const release = () => {
            this.pendingUpdates.delete(url);
        };
        refresh.then(release, release);
        return refresh;
    }
}
111
+ exports.CacheManager = CacheManager;
@@ -0,0 +1,5 @@
1
+ export declare const env: { // Settings read from the Blocklet SDK config when the module loads
2
+ databasePath: string; // path of the snapshot database file under the SDK data directory
3
+ appUrl: string; // application URL as reported by the SDK config
4
+ };
5
+ export declare const logger: any; // logger created with @blocklet/logger for this package
package/lib/cjs/env.js ADDED
@@ -0,0 +1,14 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) { // Interop helper: makes sure a required module exposes a `default` property
3
+ return (mod && mod.__esModule) ? mod : { "default": mod }; // modules flagged __esModule pass through; anything else is wrapped
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.logger = exports.env = void 0;
7
+ const logger_1 = __importDefault(require("@blocklet/logger"));
8
+ const config_1 = __importDefault(require("@blocklet/sdk/lib/config"));
9
+ const node_path_1 = __importDefault(require("node:path"));
10
+ exports.env = {
11
+ databasePath: node_path_1.default.join(config_1.default.env.dataDir, 'crawler-middleware/snapshot.db'),
12
+ appUrl: config_1.default.env.appUrl,
13
+ };
14
+ exports.logger = (0, logger_1.default)('@arcblock/crawler-middleware', { level: process.env.LOG_LEVEL || 'info' });
@@ -0,0 +1,18 @@
1
+ import { NextFunction, Request, Response } from 'express';
2
+ export declare function createSnapshotMiddleware({ endpoint, accessKey, cacheMax, cacheUpdateInterval, autoReturnHtml, allowCrawler, }: {
3
+ /** SnapKit endpoint (required; the implementation throws when it is missing) */
4
+ endpoint: string;
5
+ /** SnapKit access key (required; the implementation throws when it is missing) */
6
+ accessKey: string;
7
+ /** Maximum number of entries kept in the in-memory LRU cache (default 500) */
8
+ cacheMax?: number;
9
+ /**
10
+ * Cache update interval in milliseconds (default 24 hours)
11
+ * Once a cached snapshot is older than this, the middleware tries to fetch and update it from SnapKit
12
+ */
13
+ cacheUpdateInterval?: number;
14
+ /** When true (default), answer a cache hit with res.send(html); when false, only attach the HTML to the request and call next() */
15
+ autoReturnHtml?: boolean;
16
+ /** Predicate deciding whether a request is handled at all; when it returns false the middleware calls next() immediately (default: always true) */
17
+ allowCrawler?: (req: Request) => boolean;
18
+ }): (req: Request, res: Response, next: NextFunction) => Promise<void>; // returns the Express request handler
@@ -0,0 +1,72 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // Async helper: runs a generator as a promise-returning function; reuses this.__awaiter when one is already defined
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // wrap non-P values so .then is always available
4
+ return new (P || (P = Promise))(function (resolve, reject) { // P falls back to the global Promise
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } // resume the generator with the settled value
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } // re-raise the rejection inside the generator
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } // resolve on completion, otherwise wait for the yielded value
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next()); // start the generator with the supplied this/arguments
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.createSnapshotMiddleware = createSnapshotMiddleware;
13
+ const crawler_1 = require("@arcblock/crawler");
14
+ const ufo_1 = require("ufo");
15
+ const cache_1 = require("./cache");
16
+ const env_1 = require("./env");
17
+ const { isSelfCrawler, isSpider, isStaticFile } = crawler_1.utils;
18
/**
 * Builds the absolute URL that identifies the requested page, used both as the cache key and
 * as the URL sent to SnapKit.
 *
 * The path is `req.originalUrl`, prefixed with the `x-path-prefix` request header when present.
 * The origin is the configured app URL, or the request's own protocol and host when no app URL
 * is configured.
 */
function getFullUrl(req) {
    const pathPrefix = req.headers['x-path-prefix'];
    const blockletPathname = pathPrefix ? (0, ufo_1.joinURL)(pathPrefix, req.originalUrl) : req.originalUrl;
    // The fallback must include the scheme: a bare host ("example.com/path") is not an
    // absolute URL and cannot be crawled or matched against a fully-qualified snapshot URL.
    const origin = env_1.env.appUrl || `${req.protocol}://${req.get('host')}`;
    return (0, ufo_1.joinURL)(origin, blockletPathname);
}
24
/**
 * Creates an Express middleware that serves SnapKit pre-rendered HTML to web crawlers.
 *
 * For every eligible page request the middleware checks whether the cached snapshot is missing
 * or expired and, if so, starts a background refresh. Crawler requests that have a cached
 * snapshot get `req.cachedHtml` (and `req.cachedLastmod` / the `Last-Modified` header when the
 * snapshot has a valid `lastModified`), and the HTML is sent directly when `autoReturnHtml` is true.
 * Every other request, and any request whose lookup fails, is passed to `next()`.
 *
 * @param options.endpoint SnapKit endpoint (required).
 * @param options.accessKey SnapKit access key (required).
 * @param options.cacheMax Capacity of the in-memory LRU cache (default 500).
 * @param options.cacheUpdateInterval Snapshot lifetime in milliseconds (default 24 hours).
 * @param options.autoReturnHtml Send the cached HTML on a hit instead of only attaching it (default true).
 * @param options.allowCrawler Predicate deciding whether a request is handled at all (default: always).
 * @returns The Express request handler.
 * @throws {Error} When `accessKey` or `endpoint` is missing.
 */
function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, cacheUpdateInterval = 1000 * 60 * 60 * 24, autoReturnHtml = true, allowCrawler = () => true, }) {
    if (!accessKey || !endpoint) {
        throw new Error('accessKey and endpoint are required');
    }
    const cacheManager = new cache_1.CacheManager({
        endpoint,
        accessKey,
        cacheMax,
        cacheUpdateInterval,
    });
    // Resolves to the snapshot to serve for this request, or null when the request should
    // simply fall through to the next handler.
    const findSnapshot = (req) => __awaiter(this, void 0, void 0, function* () {
        yield cacheManager.waitReady();
        if (!allowCrawler(req)) {
            return null;
        }
        // These requests are never served a snapshot, so they must not trigger a SnapKit refresh
        // either (presumably: asset files and the snapshot service's own crawler - confirm
        // against @arcblock/crawler).
        if (isSelfCrawler(req) || isStaticFile(req)) {
            return null;
        }
        const fullUrl = getFullUrl(req);
        // Always fetch content from SnapKit and cache it, even for non-crawler requests
        if (yield cacheManager.isCacheExpired(fullUrl)) {
            env_1.logger.info(`Cache expired for ${fullUrl}, fetching from SnapKit`);
            // Don't await here, the cache will be effective after the next request
            cacheManager.updateSnapshot(fullUrl);
        }
        if (!isSpider(req)) {
            return null;
        }
        const cachedSnapshot = yield cacheManager.getSnapshot(fullUrl);
        if (!cachedSnapshot) {
            env_1.logger.debug(`Cache not hit: ${fullUrl}`);
            return null;
        }
        return { fullUrl, cachedSnapshot };
    });
    return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
        let hit = null;
        try {
            hit = yield findSnapshot(req);
        }
        catch (error) {
            // Express 4 does not observe the promise returned by a middleware: a rejection here
            // would leave the request hanging. Snapshots are an optimisation, so fall through.
            env_1.logger.error('Failed to look up snapshot, falling through to next handler', {
                url: req.originalUrl,
                error,
            });
        }
        if (!hit) {
            return next();
        }
        const { fullUrl, cachedSnapshot } = hit;
        // @ts-ignore
        req.cachedHtml = cachedSnapshot.html;
        if (cachedSnapshot.lastModified) {
            const lastModified = new Date(cachedSnapshot.lastModified);
            // Skip values that do not parse instead of emitting "Invalid Date".
            if (!Number.isNaN(lastModified.getTime())) {
                // Last-Modified must be an HTTP-date, so the header uses the same UTC string
                // that is exposed on the request.
                const httpDate = lastModified.toUTCString();
                // @ts-ignore
                req.cachedLastmod = httpDate;
                res.setHeader('Last-Modified', httpDate);
            }
        }
        if (autoReturnHtml) {
            env_1.logger.debug(`Cache hit: ${fullUrl} `, {
                lastModified: cachedSnapshot.lastModified,
                createdAt: cachedSnapshot.createdAt,
            });
            res.send(cachedSnapshot.html);
            return;
        }
        return next();
    });
}
@@ -0,0 +1,4 @@
1
+ import { Sequelize } from '@sequelize/core';
2
+ import { SqliteDialect } from '@sequelize/sqlite3';
3
+ export * from './model-snapshot';
4
+ export declare function initDatabase(): Promise<Sequelize<SqliteDialect>>; // opens the snapshot database, registers the Snapshot model and syncs the schema; rejects when the connection fails
@@ -0,0 +1,66 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { // Re-export helper: exposes m[k] on o under the name k2 (defaults to k)
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { // no reusable descriptor: fall back to a live getter
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) { // without Object.create: plain copy of the current value
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __exportStar = (this && this.__exportStar) || function(m, exports) { // Re-exports every enumerable key of m except "default" and keys exports already owns
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
+ };
16
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // Async helper: runs a generator as a promise-returning function; reuses this.__awaiter when one is already defined
17
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // wrap non-P values so .then is always available
18
+ return new (P || (P = Promise))(function (resolve, reject) { // P falls back to the global Promise
19
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } // resume the generator with the settled value
20
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } // re-raise the rejection inside the generator
21
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } // resolve on completion, otherwise wait for the yielded value
22
+ step((generator = generator.apply(thisArg, _arguments || [])).next()); // start the generator with the supplied this/arguments
23
+ });
24
+ };
25
+ Object.defineProperty(exports, "__esModule", { value: true });
26
+ exports.initDatabase = initDatabase;
27
+ const core_1 = require("@sequelize/core");
28
+ const sqlite3_1 = require("@sequelize/sqlite3");
29
+ const env_1 = require("../env");
30
+ const model_snapshot_1 = require("./model-snapshot");
31
+ __exportStar(require("./model-snapshot"), exports);
32
/**
 * Opens the snapshot database at `env.databasePath`, registers the `Snapshot` model on it,
 * applies the journal/synchronous pragmas, then authenticates and syncs the schema.
 *
 * @returns The connected Sequelize instance.
 * @throws Re-throws any error raised while applying pragmas, authenticating or syncing,
 *   after logging it.
 */
function initDatabase() {
    return __awaiter(this, void 0, void 0, function* () {
        // SQL statements are only logged when SQLITE_LOG is set.
        const logSql = (msg) => process.env.SQLITE_LOG && env_1.logger.debug(msg);
        const connectionPool = { min: 0, max: 10, idle: 10000 };
        // Queries failing with SQLITE_BUSY are retried, up to 10 attempts.
        const busyRetry = { match: [/SQLITE_BUSY/], name: 'query', max: 10 };
        const sequelize = new core_1.Sequelize({
            dialect: sqlite3_1.SqliteDialect,
            storage: env_1.env.databasePath,
            logging: logSql,
            pool: connectionPool,
            retry: busyRetry,
        });
        model_snapshot_1.Snapshot.initModel(sequelize);
        const pragmas = [
            'pragma journal_mode = WAL;',
            'pragma synchronous = normal;',
            // 67108864 bytes = 64 MiB
            'pragma journal_size_limit = 67108864;',
        ];
        try {
            yield Promise.all(pragmas.map((statement) => sequelize.query(statement)));
            yield sequelize.authenticate();
            yield sequelize.sync();
            env_1.logger.info('Successfully connected to database');
            return sequelize;
        }
        catch (error) {
            env_1.logger.error('Failed to connect to database:', error);
            throw error;
        }
    });
}
@@ -0,0 +1,16 @@
1
+ import { Model, Sequelize } from '@sequelize/core';
2
+ export interface SnapshotModel { // Shape of one stored page snapshot
3
+ url: string; // page URL; primary key of the model
4
+ html: string; // pre-rendered HTML of the page
5
+ lastModified?: string; // optional last-modified value stored as a string
6
+ createdAt?: string; // timestamp column maintained by the model (timestamps are enabled)
7
+ updatedAt?: string; // timestamp column maintained by the model (timestamps are enabled)
8
+ }
9
+ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel { // Sequelize model for stored snapshots
10
+ url: SnapshotModel['url'];
11
+ html: SnapshotModel['html'];
12
+ lastModified?: SnapshotModel['lastModified'];
13
+ createdAt: SnapshotModel['createdAt'];
14
+ updatedAt: SnapshotModel['updatedAt'];
15
+ static initModel(sequelize: Sequelize): void; // registers the columns and table options on the given Sequelize instance
16
+ }
@@ -0,0 +1,29 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.Snapshot = void 0;
4
+ const core_1 = require("@sequelize/core");
5
/**
 * Sequelize model for one cached page snapshot, stored in the `snap` table and keyed by URL.
 */
class Snapshot extends core_1.Model {
    /**
     * Registers the model's columns and table options on the given Sequelize instance.
     *
     * @param sequelize Connection the model is attached to.
     */
    static initModel(sequelize) {
        const { STRING, TEXT } = core_1.DataTypes;
        const columns = {
            // The page URL doubles as the primary key.
            url: { type: STRING, allowNull: false, primaryKey: true },
            html: { type: TEXT, allowNull: false },
            lastModified: { type: STRING, allowNull: true },
        };
        const tableOptions = {
            sequelize,
            modelName: 'snapshot',
            tableName: 'snap',
            // Adds the createdAt / updatedAt columns.
            timestamps: true,
        };
        Snapshot.init(columns, tableOptions);
    }
}
29
+ exports.Snapshot = Snapshot;
@@ -0,0 +1,26 @@
1
+ import { SnapshotModel } from './store/index';
2
+ export type CacheManagerOptions = { // Options accepted by the CacheManager constructor
3
+ /** Base URL of the SnapKit service; the implementation requests `api/crawl` under it */
4
+ endpoint: string;
5
+ /** SnapKit access key, sent as a Bearer token in the Authorization header */
6
+ accessKey: string;
7
+ /** Maximum number of entries kept in the in-memory LRU cache (the implementation defaults to 500) */
8
+ cacheMax?: number;
9
+ /**
10
+ * Cache update interval in milliseconds (the implementation defaults to 24 hours)
11
+ * Once a cached snapshot is older than this, the manager reports it as expired so it can be refreshed from SnapKit
12
+ */
13
+ cacheUpdateInterval?: number;
14
+ };
15
+ export declare class CacheManager { // Snapshot cache: in-memory LRU in front of the Snapshot store, refreshed from SnapKit
16
+ private options;
17
+ private cache;
18
+ private initializedPromise;
19
+ constructor(options: CacheManagerOptions); // starts database initialisation immediately
20
+ waitReady(): Promise<void>; // settles once database initialisation has finished
21
+ getSnapshot(url: string): Promise<SnapshotModel | null>; // memory first, then the store; null when neither has the URL
22
+ setSnapshot(url: string, snapshot: SnapshotModel): Promise<void>; // creates the store row, then caches the snapshot
23
+ fetchSnapKit(url: string): Promise<any>; // SnapKit payload when its status is 'success', otherwise null; request errors are logged, not thrown
24
+ isCacheExpired(url: string): Promise<boolean>; // true when the URL has no snapshot or it is older than cacheUpdateInterval
25
+ updateSnapshot(url: string): Promise<void>; // fetches from SnapKit and upserts store and cache; failures are logged, not thrown
26
+ }
@@ -0,0 +1,107 @@
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // Async helper: runs a generator as a promise-returning function; reuses this.__awaiter when one is already defined
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // wrap non-P values so .then is always available
3
+ return new (P || (P = Promise))(function (resolve, reject) { // P falls back to the global Promise
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } // resume the generator with the settled value
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } // re-raise the rejection inside the generator
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } // resolve on completion, otherwise wait for the yielded value
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next()); // start the generator with the supplied this/arguments
8
+ });
9
+ };
10
+ import { utils } from '@arcblock/crawler';
11
+ import { LRUCache } from 'lru-cache';
12
+ import { joinURL } from 'ufo';
13
+ import { logger } from './env';
14
+ import { Snapshot, initDatabase } from './store/index';
15
/**
 * Two-level snapshot cache: an in-memory LRU in front of the `Snapshot` store,
 * refreshed on demand from the SnapKit crawl API.
 */
export class CacheManager {
    /**
     * Builds the in-memory cache and starts database initialisation; callers
     * await `waitReady()` before touching the store.
     *
     * @param options `endpoint` and `accessKey` of the SnapKit service, plus the
     *   optional `cacheMax` (LRU capacity, default 500) and `cacheUpdateInterval`
     *   (milliseconds, default 24 hours).
     */
    constructor(options) {
        const defaults = { cacheMax: 500, cacheUpdateInterval: 1000 * 60 * 60 * 24 };
        this.options = Object.assign({}, defaults, options);
        // Object.assign copies an explicitly passed `undefined` over the default; restore it so
        // the expiry comparison in isCacheExpired never runs against `undefined`.
        if (this.options.cacheUpdateInterval == null) {
            this.options.cacheUpdateInterval = defaults.cacheUpdateInterval;
        }
        this.cache = new LRUCache({ max: this.options.cacheMax || defaults.cacheMax });
        // Refreshes currently in flight, keyed by URL, so concurrent requests share one SnapKit call.
        this.pendingUpdates = new Map();
        this.initializedPromise = Promise.all([initDatabase()]);
    }
    /** Settles once database initialisation has finished; rejects if it failed. */
    waitReady() {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.initializedPromise;
        });
    }
    /**
     * Looks a snapshot up in memory first and in the store second.
     *
     * @param url Absolute page URL used as the cache key.
     * @returns The snapshot, or null when neither level has the URL.
     */
    getSnapshot(url) {
        return __awaiter(this, void 0, void 0, function* () {
            const cachedSnapshot = this.cache.get(url);
            if (cachedSnapshot) {
                return cachedSnapshot;
            }
            const snapshot = yield Snapshot.findOne({ where: { url } });
            if (snapshot) {
                // Promote the stored row so the next lookup skips the database.
                this.cache.set(url, snapshot);
                return snapshot;
            }
            return null;
        });
    }
    /**
     * Persists a new snapshot row and caches it under `url`.
     * Rejects when the row cannot be created (for example when the URL already exists).
     */
    setSnapshot(url, snapshot) {
        return __awaiter(this, void 0, void 0, function* () {
            const created = yield Snapshot.create(snapshot);
            // Cache the persisted instance rather than the caller's object so the entry carries
            // the timestamps that isCacheExpired relies on.
            this.cache.set(url, created);
        });
    }
    /**
     * Requests the pre-rendered snapshot of `url` from SnapKit's `api/crawl` endpoint.
     *
     * @returns The `data.data` payload when its `status` is 'success', otherwise null.
     *   Request errors are logged and also produce null; this method does not throw.
     */
    fetchSnapKit(url) {
        return __awaiter(this, void 0, void 0, function* () {
            const { endpoint, accessKey } = this.options;
            const api = joinURL(endpoint, 'api/crawl');
            logger.debug('Fetching snapshot from SnapKit', { url, api });
            try {
                const { data } = yield utils.axios.get(api, {
                    params: {
                        url,
                    },
                    headers: {
                        Authorization: `Bearer ${accessKey}`,
                    },
                });
                const { data: snapshotData } = data || {};
                if ((snapshotData === null || snapshotData === void 0 ? void 0 : snapshotData.status) !== 'success') {
                    logger.info(`No valid HTML found for ${url} from SnapKit`, { snapshotData, data });
                    return null;
                }
                logger.info('Success to fetch content by SnapKit and cache it', {
                    url,
                    jobId: snapshotData.jobId,
                    lastModified: snapshotData.lastModified,
                });
                return snapshotData;
            }
            catch (error) {
                logger.error('Failed to fetch content by SnapKit', { url, error });
                return null;
            }
        });
    }
    /**
     * Reports whether the snapshot for `url` is missing or due for a refresh.
     *
     * @returns true when there is no snapshot, its age cannot be determined, or it was last
     *   written more than `cacheUpdateInterval` milliseconds ago.
     */
    isCacheExpired(url) {
        return __awaiter(this, void 0, void 0, function* () {
            const snapshot = yield this.getSnapshot(url);
            if (!snapshot) {
                return true;
            }
            // Age is measured from the last write. `createdAt` alone is not enough: an upsert of an
            // existing row leaves it untouched, so a refreshed snapshot would still look expired.
            const lastWrittenAt = new Date(snapshot.updatedAt || snapshot.createdAt).getTime();
            if (Number.isNaN(lastWrittenAt)) {
                // No usable timestamp: refresh rather than keep an entry that could never expire.
                return true;
            }
            return Date.now() - lastWrittenAt > this.options.cacheUpdateInterval;
        });
    }
    /**
     * Fetches a fresh snapshot from SnapKit and writes it to the store and the in-memory cache.
     * Concurrent calls for the same URL share a single in-flight refresh. Failures are logged;
     * the returned promise never rejects.
     */
    updateSnapshot(url) {
        const inFlight = this.pendingUpdates.get(url);
        if (inFlight) {
            return inFlight;
        }
        const refresh = __awaiter(this, void 0, void 0, function* () {
            try {
                const snapshot = yield this.fetchSnapKit(url);
                if (snapshot) {
                    // update db
                    const [upserted] = yield Snapshot.upsert({
                        url,
                        html: snapshot.html,
                        lastModified: snapshot.lastModified,
                    });
                    // update cache: re-read the row so the cached entry carries the timestamps
                    // actually stored; the instance returned by upsert may not include them.
                    const stored = (yield Snapshot.findOne({ where: { url } })) || upserted;
                    this.cache.set(url, stored);
                }
            }
            catch (error) {
                logger.error('Failed to update snapshot', { url, error });
            }
        });
        this.pendingUpdates.set(url, refresh);
        const release = () => {
            this.pendingUpdates.delete(url);
        };
        refresh.then(release, release);
        return refresh;
    }
}
@@ -0,0 +1,5 @@
1
+ export declare const env: { // Settings read from the Blocklet SDK config when the module loads
2
+ databasePath: string; // path of the snapshot database file under the SDK data directory
3
+ appUrl: string; // application URL as reported by the SDK config
4
+ };
5
+ export declare const logger: any; // logger created with @blocklet/logger for this package
package/lib/esm/env.js ADDED
@@ -0,0 +1,8 @@
1
+ import createLogger from '@blocklet/logger';
2
+ import config from '@blocklet/sdk/lib/config';
3
+ import path from 'node:path';
4
// Settings resolved once, at module load, from the Blocklet SDK configuration.
const sdkEnv = config.env;
export const env = {
    // Snapshot database file, kept in a package-specific folder under the SDK data directory.
    databasePath: path.join(sdkEnv.dataDir, 'crawler-middleware/snapshot.db'),
    appUrl: sdkEnv.appUrl,
};
// Package-scoped logger; verbosity follows LOG_LEVEL and falls back to 'info'.
const logLevel = process.env.LOG_LEVEL || 'info';
export const logger = createLogger('@arcblock/crawler-middleware', { level: logLevel });
@@ -0,0 +1,18 @@
1
+ import { NextFunction, Request, Response } from 'express';
2
+ export declare function createSnapshotMiddleware({ endpoint, accessKey, cacheMax, cacheUpdateInterval, autoReturnHtml, allowCrawler, }: {
3
+ /** SnapKit endpoint (required; the implementation throws when it is missing) */
4
+ endpoint: string;
5
+ /** SnapKit access key (required; the implementation throws when it is missing) */
6
+ accessKey: string;
7
+ /** Maximum number of entries kept in the in-memory LRU cache (default 500) */
8
+ cacheMax?: number;
9
+ /**
10
+ * Cache update interval in milliseconds (default 24 hours)
11
+ * Once a cached snapshot is older than this, the middleware tries to fetch and update it from SnapKit
12
+ */
13
+ cacheUpdateInterval?: number;
14
+ /** When true (default), answer a cache hit with res.send(html); when false, only attach the HTML to the request and call next() */
15
+ autoReturnHtml?: boolean;
16
+ /** Predicate deciding whether a request is handled at all; when it returns false the middleware calls next() immediately (default: always true) */
17
+ allowCrawler?: (req: Request) => boolean;
18
+ }): (req: Request, res: Response, next: NextFunction) => Promise<void>; // returns the Express request handler
@@ -0,0 +1,69 @@
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // Async helper: runs a generator as a promise-returning function; reuses this.__awaiter when one is already defined
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // wrap non-P values so .then is always available
3
+ return new (P || (P = Promise))(function (resolve, reject) { // P falls back to the global Promise
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } // resume the generator with the settled value
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } // re-raise the rejection inside the generator
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } // resolve on completion, otherwise wait for the yielded value
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next()); // start the generator with the supplied this/arguments
8
+ });
9
+ };
10
+ import { utils } from '@arcblock/crawler';
11
+ import { joinURL } from 'ufo';
12
+ import { CacheManager } from './cache';
13
+ import { env, logger } from './env';
14
+ const { isSelfCrawler, isSpider, isStaticFile } = utils;
15
/**
 * Builds the absolute URL that identifies the requested page, used both as the cache key and
 * as the URL sent to SnapKit.
 *
 * The path is `req.originalUrl`, prefixed with the `x-path-prefix` request header when present.
 * The origin is the configured app URL, or the request's own protocol and host when no app URL
 * is configured.
 */
function getFullUrl(req) {
    const pathPrefix = req.headers['x-path-prefix'];
    const blockletPathname = pathPrefix ? joinURL(pathPrefix, req.originalUrl) : req.originalUrl;
    // The fallback must include the scheme: a bare host ("example.com/path") is not an
    // absolute URL and cannot be crawled or matched against a fully-qualified snapshot URL.
    const origin = env.appUrl || `${req.protocol}://${req.get('host')}`;
    return joinURL(origin, blockletPathname);
}
21
/**
 * Creates an Express middleware that serves SnapKit pre-rendered HTML to web crawlers.
 *
 * For every eligible page request the middleware checks whether the cached snapshot is missing
 * or expired and, if so, starts a background refresh. Crawler requests that have a cached
 * snapshot get `req.cachedHtml` (and `req.cachedLastmod` / the `Last-Modified` header when the
 * snapshot has a valid `lastModified`), and the HTML is sent directly when `autoReturnHtml` is true.
 * Every other request, and any request whose lookup fails, is passed to `next()`.
 *
 * @param options.endpoint SnapKit endpoint (required).
 * @param options.accessKey SnapKit access key (required).
 * @param options.cacheMax Capacity of the in-memory LRU cache (default 500).
 * @param options.cacheUpdateInterval Snapshot lifetime in milliseconds (default 24 hours).
 * @param options.autoReturnHtml Send the cached HTML on a hit instead of only attaching it (default true).
 * @param options.allowCrawler Predicate deciding whether a request is handled at all (default: always).
 * @returns The Express request handler.
 * @throws {Error} When `accessKey` or `endpoint` is missing.
 */
export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, cacheUpdateInterval = 1000 * 60 * 60 * 24, autoReturnHtml = true, allowCrawler = () => true, }) {
    if (!accessKey || !endpoint) {
        throw new Error('accessKey and endpoint are required');
    }
    const cacheManager = new CacheManager({
        endpoint,
        accessKey,
        cacheMax,
        cacheUpdateInterval,
    });
    // Resolves to the snapshot to serve for this request, or null when the request should
    // simply fall through to the next handler.
    const findSnapshot = (req) => __awaiter(this, void 0, void 0, function* () {
        yield cacheManager.waitReady();
        if (!allowCrawler(req)) {
            return null;
        }
        // These requests are never served a snapshot, so they must not trigger a SnapKit refresh
        // either (presumably: asset files and the snapshot service's own crawler - confirm
        // against @arcblock/crawler).
        if (isSelfCrawler(req) || isStaticFile(req)) {
            return null;
        }
        const fullUrl = getFullUrl(req);
        // Always fetch content from SnapKit and cache it, even for non-crawler requests
        if (yield cacheManager.isCacheExpired(fullUrl)) {
            logger.info(`Cache expired for ${fullUrl}, fetching from SnapKit`);
            // Don't await here, the cache will be effective after the next request
            cacheManager.updateSnapshot(fullUrl);
        }
        if (!isSpider(req)) {
            return null;
        }
        const cachedSnapshot = yield cacheManager.getSnapshot(fullUrl);
        if (!cachedSnapshot) {
            logger.debug(`Cache not hit: ${fullUrl}`);
            return null;
        }
        return { fullUrl, cachedSnapshot };
    });
    return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
        let hit = null;
        try {
            hit = yield findSnapshot(req);
        }
        catch (error) {
            // Express 4 does not observe the promise returned by a middleware: a rejection here
            // would leave the request hanging. Snapshots are an optimisation, so fall through.
            logger.error('Failed to look up snapshot, falling through to next handler', {
                url: req.originalUrl,
                error,
            });
        }
        if (!hit) {
            return next();
        }
        const { fullUrl, cachedSnapshot } = hit;
        // @ts-ignore
        req.cachedHtml = cachedSnapshot.html;
        if (cachedSnapshot.lastModified) {
            const lastModified = new Date(cachedSnapshot.lastModified);
            // Skip values that do not parse instead of emitting "Invalid Date".
            if (!Number.isNaN(lastModified.getTime())) {
                // Last-Modified must be an HTTP-date, so the header uses the same UTC string
                // that is exposed on the request.
                const httpDate = lastModified.toUTCString();
                // @ts-ignore
                req.cachedLastmod = httpDate;
                res.setHeader('Last-Modified', httpDate);
            }
        }
        if (autoReturnHtml) {
            logger.debug(`Cache hit: ${fullUrl} `, {
                lastModified: cachedSnapshot.lastModified,
                createdAt: cachedSnapshot.createdAt,
            });
            res.send(cachedSnapshot.html);
            return;
        }
        return next();
    });
}
@@ -0,0 +1,4 @@
1
+ import { Sequelize } from '@sequelize/core';
2
+ import { SqliteDialect } from '@sequelize/sqlite3';
3
+ export * from './model-snapshot';
4
+ export declare function initDatabase(): Promise<Sequelize<SqliteDialect>>; // opens the snapshot database, registers the Snapshot model and syncs the schema; rejects when the connection fails
@@ -0,0 +1,49 @@
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // Async helper: runs a generator as a promise-returning function; reuses this.__awaiter when one is already defined
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // wrap non-P values so .then is always available
3
+ return new (P || (P = Promise))(function (resolve, reject) { // P falls back to the global Promise
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } // resume the generator with the settled value
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } // re-raise the rejection inside the generator
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } // resolve on completion, otherwise wait for the yielded value
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next()); // start the generator with the supplied this/arguments
8
+ });
9
+ };
10
+ import { Sequelize } from '@sequelize/core';
11
+ import { SqliteDialect } from '@sequelize/sqlite3';
12
+ import { env, logger } from '../env';
13
+ import { Snapshot } from './model-snapshot';
14
+ export * from './model-snapshot';
15
/**
 * Opens the snapshot database at `env.databasePath`, registers the `Snapshot` model on it,
 * applies the journal/synchronous pragmas, then authenticates and syncs the schema.
 *
 * @returns The connected Sequelize instance.
 * @throws Re-throws any error raised while applying pragmas, authenticating or syncing,
 *   after logging it.
 */
export function initDatabase() {
    return __awaiter(this, void 0, void 0, function* () {
        // SQL statements are only logged when SQLITE_LOG is set.
        const logSql = (msg) => process.env.SQLITE_LOG && logger.debug(msg);
        const connectionPool = { min: 0, max: 10, idle: 10000 };
        // Queries failing with SQLITE_BUSY are retried, up to 10 attempts.
        const busyRetry = { match: [/SQLITE_BUSY/], name: 'query', max: 10 };
        const sequelize = new Sequelize({
            dialect: SqliteDialect,
            storage: env.databasePath,
            logging: logSql,
            pool: connectionPool,
            retry: busyRetry,
        });
        Snapshot.initModel(sequelize);
        const pragmas = [
            'pragma journal_mode = WAL;',
            'pragma synchronous = normal;',
            // 67108864 bytes = 64 MiB
            'pragma journal_size_limit = 67108864;',
        ];
        try {
            yield Promise.all(pragmas.map((statement) => sequelize.query(statement)));
            yield sequelize.authenticate();
            yield sequelize.sync();
            logger.info('Successfully connected to database');
            return sequelize;
        }
        catch (error) {
            logger.error('Failed to connect to database:', error);
            throw error;
        }
    });
}
@@ -0,0 +1,16 @@
1
+ import { Model, Sequelize } from '@sequelize/core';
2
+ export interface SnapshotModel { // Shape of one stored page snapshot
3
+ url: string; // page URL; primary key of the model
4
+ html: string; // pre-rendered HTML of the page
5
+ lastModified?: string; // optional last-modified value stored as a string
6
+ createdAt?: string; // timestamp column maintained by the model (timestamps are enabled)
7
+ updatedAt?: string; // timestamp column maintained by the model (timestamps are enabled)
8
+ }
9
+ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel { // Sequelize model for stored snapshots
10
+ url: SnapshotModel['url'];
11
+ html: SnapshotModel['html'];
12
+ lastModified?: SnapshotModel['lastModified'];
13
+ createdAt: SnapshotModel['createdAt'];
14
+ updatedAt: SnapshotModel['updatedAt'];
15
+ static initModel(sequelize: Sequelize): void; // registers the columns and table options on the given Sequelize instance
16
+ }
@@ -0,0 +1,25 @@
1
+ import { DataTypes, Model } from '@sequelize/core';
2
/**
 * Sequelize model for one cached page snapshot, stored in the `snap` table and keyed by URL.
 */
export class Snapshot extends Model {
    /**
     * Registers the model's columns and table options on the given Sequelize instance.
     *
     * @param sequelize Connection the model is attached to.
     */
    static initModel(sequelize) {
        const { STRING, TEXT } = DataTypes;
        const columns = {
            // The page URL doubles as the primary key.
            url: { type: STRING, allowNull: false, primaryKey: true },
            html: { type: TEXT, allowNull: false },
            lastModified: { type: STRING, allowNull: true },
        };
        const tableOptions = {
            sequelize,
            modelName: 'snapshot',
            tableName: 'snap',
            // Adds the createdAt / updatedAt columns.
            timestamps: true,
        };
        Snapshot.init(columns, tableOptions);
    }
}
package/package.json ADDED
@@ -0,0 +1,101 @@
1
+ {
2
+ "name": "@arcblock/crawler-middleware",
3
+ "version": "1.1.1",
4
+ "main": "lib/cjs/index.js",
5
+ "module": "lib/esm/index.js",
6
+ "types": "lib/cjs/index.d.ts",
7
+ "publishConfig": {
8
+ "access": "public"
9
+ },
10
+ "files": [
11
+ "lib",
12
+ "*.d.ts"
13
+ ],
14
+ "exports": {
15
+ ".": {
16
+ "require": "./lib/cjs/index.js",
17
+ "import": "./lib/esm/index.js",
18
+ "types": "./lib/cjs/index.d.ts"
19
+ }
20
+ },
21
+ "lint-staged": {
22
+ "*.{mjs,js,jsx,ts,tsx}": [
23
+ "prettier --write",
24
+ "eslint"
25
+ ],
26
+ "*.{css,less,scss,json,graphql}": [
27
+ "prettier --write"
28
+ ]
29
+ },
30
+ "browserslist": {
31
+ "production": [
32
+ ">0.2%",
33
+ "not dead",
34
+ "not op_mini all"
35
+ ],
36
+ "development": [
37
+ "last 1 chrome version",
38
+ "last 1 firefox version",
39
+ "last 1 safari version"
40
+ ]
41
+ },
42
+ "dependencies": {
43
+ "@abtnode/cron": "^1.16.43",
44
+ "@abtnode/models": "^1.16.43",
45
+ "@abtnode/queue": "^1.16.43",
46
+ "@blocklet/logger": "^1.16.43",
47
+ "@blocklet/puppeteer": "^22.11.3",
48
+ "@blocklet/sdk": "^1.16.43",
49
+ "@sequelize/core": "7.0.0-alpha.46",
50
+ "@sequelize/sqlite3": "7.0.0-alpha.46",
51
+ "axios": "^1.7.9",
52
+ "fs-extra": "^11.2.0",
53
+ "generic-pool": "^3.9.0",
54
+ "lodash": "^4.17.21",
55
+ "lru-cache": "^10.4.3",
56
+ "redis": "^4.7.0",
57
+ "robots-parser": "^3.0.1",
58
+ "sequelize": "^6.37.7",
59
+ "sitemap": "^7.1.2",
60
+ "sqlite3": "^5.1.7",
61
+ "ufo": "^1.5.4",
62
+ "@arcblock/crawler": "1.1.1"
63
+ },
64
+ "devDependencies": {
65
+ "@blocklet/js-sdk": "^1.16.39",
66
+ "@types/dotenv-flow": "^3.3.3",
67
+ "@types/express": "^4.17.21",
68
+ "@types/fs-extra": "^11.0.4",
69
+ "@types/lodash": "^4.17.16",
70
+ "@types/node": "^20.17.19",
71
+ "express": "^4.21.2",
72
+ "bumpp": "^9.11.1",
73
+ "nodemon": "^3.1.9",
74
+ "npm-run-all": "^4.1.5",
75
+ "puppeteer": "^24.8.2",
76
+ "tsx": "^4.19.3",
77
+ "zx": "^8.3.2"
78
+ },
79
+ "importSort": {
80
+ ".js, .jsx, .mjs": {
81
+ "parser": "babylon",
82
+ "style": "module"
83
+ },
84
+ ".ts, .tsx": {
85
+ "style": "module",
86
+ "parser": "typescript"
87
+ }
88
+ },
89
+ "simple-git-hooks": {
90
+ "pre-commit": "npx lint-staged"
91
+ },
92
+ "scripts": {
93
+ "dev": "tsc -p tsconfig.cjs.json --watch",
94
+ "lint": "tsc --noEmit && eslint src --ext .mjs,.js,.jsx,.ts,.tsx",
95
+ "lint:fix": "npm run lint -- --fix",
96
+ "bundle": "npm run build",
97
+ "build:cjs": "tsc -p tsconfig.cjs.json",
98
+ "build:esm": "tsc -p tsconfig.esm.json",
99
+ "build": "npm run build:cjs && npm run build:esm"
100
+ }
101
+ }