@arcblock/crawler-middleware 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -24
- package/lib/cjs/cache.d.ts +8 -5
- package/lib/cjs/cache.js +24 -13
- package/lib/cjs/index.d.ts +7 -6
- package/lib/cjs/index.js +12 -10
- package/lib/cjs/store/index.js +1 -0
- package/lib/esm/cache.d.ts +8 -5
- package/lib/esm/cache.js +21 -13
- package/lib/esm/index.d.ts +7 -6
- package/lib/esm/index.js +12 -10
- package/lib/esm/store/index.js +1 -0
- package/package.json +4 -4
package/README.md
CHANGED
|
@@ -2,31 +2,36 @@
|
|
|
2
2
|
|
|
3
3
|
This Express middleware serves pre-rendered HTML generated by SnapKit for Blocklets, enabling them to return complete HTML content to web spiders. This is essential for SEO, as it ensures that search engines can properly index dynamically generated content.
|
|
4
4
|
|
|
5
|
+
## How it Works
|
|
6
|
+
|
|
7
|
+
1. The middleware intercepts incoming requests.
|
|
8
|
+
2. It checks if the request is from a web spider.
|
|
9
|
+
3. It tries to read and return HTML from the local cache (in-memory LRU cache + SQLite).
|
|
10
|
+
4. If no cached entry is found (or the cached entry has expired), an asynchronous request is made to SnapKit, and the local cache is updated.
|
|
11
|
+
5. The current request does not return the cached content; the next spider visit will hit step 3 and return the cache directly.
|
|
12
|
+
|
|
5
13
|
## Usage
|
|
6
14
|
|
|
7
15
|
```typescript
|
|
8
16
|
import { createSnapshotMiddleware } from '@arcblock/crawler-middleware';
|
|
9
17
|
|
|
10
18
|
const app = express();
|
|
19
|
+
const snapshotMiddleware = createSnapshotMiddleware({
|
|
20
|
+
endpoint: process.env.SNAP_KIT_ENDPOINT,
|
|
21
|
+
accessKey: process.env.SNAP_KIT_ACCESS_KEY,
|
|
22
|
+
allowCrawler: (req) => {
|
|
23
|
+
return req.path === '/';
|
|
24
|
+
},
|
|
25
|
+
});
|
|
11
26
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
endpoint: process.env.SNAP_KIT_ENDPOINT,
|
|
15
|
-
accessKey: process.env.SNAP_KIT_ACCESS_KEY,
|
|
16
|
-
allowCrawler: (req) => {
|
|
17
|
-
return req.path === '/';
|
|
18
|
-
},
|
|
19
|
-
}),
|
|
20
|
-
);
|
|
21
|
-
```
|
|
22
|
-
|
|
23
|
-
## How it Works
|
|
27
|
+
// for all routes
|
|
28
|
+
app.use(snapshotMiddleware);
|
|
24
29
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
+
// for one route
|
|
31
|
+
app.use('/doc', snapshotMiddleware, (req) => {
|
|
32
|
+
/* ... */
|
|
33
|
+
});
|
|
34
|
+
```
|
|
30
35
|
|
|
31
36
|
## Options
|
|
32
37
|
|
|
@@ -40,11 +45,16 @@ The options for createSnapshotMiddleware:
|
|
|
40
45
|
accessKey: string;
|
|
41
46
|
/** Max cache size for LRU cache */
|
|
42
47
|
cacheMax?: number;
|
|
43
|
-
/**
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
+
/** Cache lifetime in milliseconds (default: 24 hours). When a cached entry is older than this, the middleware will try to fetch it again from SnapKit and update the cache */
|
|
49
|
+
updateInterval?: number;
|
|
50
|
+
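/** Retry interval in milliseconds (default: 24 hours) for failed entries, i.e. entries cached with empty HTML. When such an entry is older than this, the middleware will try to fetch it again from SnapKit and update the cache */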
|
|
51
|
+
failedUpdateInterval?: number;
|
|
52
|
+
/** Update queue concurrency */
|
|
53
|
+
updatedConcurrency?: number;
|
|
54
|
+
/** Call res.send(html) when cache hit */
|
|
55
|
+
autoReturnHtml?: boolean;
|
|
56
|
+
/** Custom function to determine whether to return cached content */
|
|
57
|
+
allowCrawler?: (req: Request) => boolean;
|
|
48
58
|
};
|
|
49
59
|
```
|
|
50
60
|
|
|
@@ -52,8 +62,10 @@ The options for createSnapshotMiddleware:
|
|
|
52
62
|
|
|
53
63
|
When using this middleware outside of a Blocklet environment, you need to configure the following environment variables:
|
|
54
64
|
|
|
55
|
-
- `
|
|
65
|
+
- `BLOCKLET_DATA_DIR`: (Required) Directory path for storing the SQLite database file
|
|
56
66
|
- `BLOCKLET_LOG_DIR`: (Required) Directory path for storing @blocklet/logger logs
|
|
57
67
|
- `BLOCKLET_APP_URL`: (Optional) Deployed domain
|
|
58
68
|
|
|
59
|
-
|
|
69
|
+
## SQLite
|
|
70
|
+
|
|
71
|
+
When createSnapshotMiddleware is called, it attempts to create an SQLite database at `BLOCKLET_DATA_DIR`. This database is used to cache HTML content retrieved from SnapKit. Please ensure that the deployment environment supports SQLite.
|
package/lib/cjs/cache.d.ts
CHANGED
|
@@ -6,16 +6,18 @@ export type CacheManagerOptions = {
|
|
|
6
6
|
accessKey: string;
|
|
7
7
|
/** Max cache size for LRU cache */
|
|
8
8
|
cacheMax?: number;
|
|
9
|
-
/**
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
9
|
+
/** When cache exceeds this time, it will try to fetch and update cache from SnapKit */
|
|
10
|
+
updateInterval?: number;
|
|
11
|
+
/** When failed cache exceeds this time, it will try to fetch and update cache from SnapKit */
|
|
12
|
+
failedUpdateInterval?: number;
|
|
13
|
+
/** Update queue concurrency */
|
|
14
|
+
updatedConcurrency?: number;
|
|
14
15
|
};
|
|
15
16
|
export declare class CacheManager {
|
|
16
17
|
private options;
|
|
17
18
|
private cache;
|
|
18
19
|
private initializedPromise;
|
|
20
|
+
private updateQueue;
|
|
19
21
|
constructor(options: CacheManagerOptions);
|
|
20
22
|
waitReady(): Promise<void>;
|
|
21
23
|
getSnapshot(url: string): Promise<SnapshotModel | null>;
|
|
@@ -23,4 +25,5 @@ export declare class CacheManager {
|
|
|
23
25
|
fetchSnapKit(url: string): Promise<any>;
|
|
24
26
|
isCacheExpired(url: string): Promise<boolean>;
|
|
25
27
|
updateSnapshot(url: string): Promise<void>;
|
|
28
|
+
enqueueUpdateSnapshot(url: string): any;
|
|
26
29
|
}
|
package/lib/cjs/cache.js
CHANGED
|
@@ -8,17 +8,25 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
8
8
|
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
9
|
});
|
|
10
10
|
};
|
|
11
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
+
};
|
|
11
14
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
15
|
exports.CacheManager = void 0;
|
|
13
16
|
const crawler_1 = require("@arcblock/crawler");
|
|
14
17
|
const lru_cache_1 = require("lru-cache");
|
|
18
|
+
const queue_1 = __importDefault(require("queue"));
|
|
15
19
|
const ufo_1 = require("ufo");
|
|
16
20
|
const env_1 = require("./env");
|
|
17
21
|
const index_1 = require("./store/index");
|
|
18
22
|
class CacheManager {
|
|
19
23
|
constructor(options) {
|
|
20
|
-
this.options = Object.assign({ cacheMax: 500,
|
|
24
|
+
this.options = Object.assign({ cacheMax: 500, updateInterval: 1000 * 60 * 60 * 24, failedUpdateInterval: 1000 * 60 * 60 * 24, updatedConcurrency: 10 }, options);
|
|
21
25
|
this.cache = new lru_cache_1.LRUCache({ max: this.options.cacheMax || 500 });
|
|
26
|
+
this.updateQueue = new queue_1.default({
|
|
27
|
+
autostart: true,
|
|
28
|
+
concurrency: this.options.updatedConcurrency,
|
|
29
|
+
});
|
|
22
30
|
this.initializedPromise = Promise.all([(0, index_1.initDatabase)()]);
|
|
23
31
|
}
|
|
24
32
|
waitReady() {
|
|
@@ -48,6 +56,7 @@ class CacheManager {
|
|
|
48
56
|
}
|
|
49
57
|
fetchSnapKit(url) {
|
|
50
58
|
return __awaiter(this, void 0, void 0, function* () {
|
|
59
|
+
var _a;
|
|
51
60
|
const { endpoint, accessKey } = this.options;
|
|
52
61
|
const api = (0, ufo_1.joinURL)(endpoint, 'api/crawl');
|
|
53
62
|
env_1.logger.debug('Fetching snapshot from SnapKit', { url, api });
|
|
@@ -73,7 +82,7 @@ class CacheManager {
|
|
|
73
82
|
return snapshotData;
|
|
74
83
|
}
|
|
75
84
|
catch (error) {
|
|
76
|
-
env_1.logger.error('Failed to fetch content by SnapKit', { url, error });
|
|
85
|
+
env_1.logger.error('Failed to fetch content by SnapKit', { url, error, data: (_a = error === null || error === void 0 ? void 0 : error.response) === null || _a === void 0 ? void 0 : _a.data, accessKey });
|
|
77
86
|
return null;
|
|
78
87
|
}
|
|
79
88
|
});
|
|
@@ -84,28 +93,30 @@ class CacheManager {
|
|
|
84
93
|
if (!snapshot) {
|
|
85
94
|
return true;
|
|
86
95
|
}
|
|
87
|
-
|
|
96
|
+
const interval = snapshot.html ? this.options.updateInterval : this.options.failedUpdateInterval;
|
|
97
|
+
return Date.now() - new Date(snapshot.updatedAt).getTime() > interval;
|
|
88
98
|
});
|
|
89
99
|
}
|
|
90
100
|
updateSnapshot(url) {
|
|
91
101
|
return __awaiter(this, void 0, void 0, function* () {
|
|
92
102
|
try {
|
|
93
103
|
const snapshot = yield this.fetchSnapKit(url);
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
this.cache.set(url, updatedSnapshot);
|
|
103
|
-
}
|
|
104
|
+
// update db
|
|
105
|
+
const [updatedSnapshot] = yield index_1.Snapshot.upsert({
|
|
106
|
+
url,
|
|
107
|
+
html: (snapshot === null || snapshot === void 0 ? void 0 : snapshot.html) || '',
|
|
108
|
+
lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
|
|
109
|
+
});
|
|
110
|
+
// update cache
|
|
111
|
+
this.cache.set(url, updatedSnapshot);
|
|
104
112
|
}
|
|
105
113
|
catch (error) {
|
|
106
114
|
env_1.logger.error('Failed to update snapshot', { url, error });
|
|
107
115
|
}
|
|
108
116
|
});
|
|
109
117
|
}
|
|
118
|
+
enqueueUpdateSnapshot(url) {
|
|
119
|
+
return this.updateQueue.push(() => this.updateSnapshot(url));
|
|
120
|
+
}
|
|
110
121
|
}
|
|
111
122
|
exports.CacheManager = CacheManager;
|
package/lib/cjs/index.d.ts
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
import { NextFunction, Request, Response } from 'express';
|
|
2
|
-
export declare function createSnapshotMiddleware({ endpoint, accessKey, cacheMax,
|
|
2
|
+
export declare function createSnapshotMiddleware({ endpoint, accessKey, cacheMax, updateInterval, failedUpdateInterval, updatedConcurrency, autoReturnHtml, allowCrawler, }: {
|
|
3
3
|
/** SnapKit endpoint */
|
|
4
4
|
endpoint: string;
|
|
5
5
|
/** SnapKit access key */
|
|
6
6
|
accessKey: string;
|
|
7
7
|
/** Max cache size for LRU cache */
|
|
8
8
|
cacheMax?: number;
|
|
9
|
-
/**
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
9
|
+
/** When cache exceeds this time, it will try to fetch and update cache from SnapKit */
|
|
10
|
+
updateInterval?: number;
|
|
11
|
+
/** When failed cache exceeds this time, it will try to fetch and update cache from SnapKit */
|
|
12
|
+
failedUpdateInterval?: number;
|
|
13
|
+
/** Update queue concurrency */
|
|
14
|
+
updatedConcurrency?: number;
|
|
14
15
|
/** Call res.send(html) when cache hit */
|
|
15
16
|
autoReturnHtml?: boolean;
|
|
16
17
|
/** Custom function to determine whether to return cached content */
|
package/lib/cjs/index.js
CHANGED
|
@@ -21,7 +21,7 @@ function getFullUrl(req) {
|
|
|
21
21
|
: req.originalUrl;
|
|
22
22
|
return (0, ufo_1.joinURL)(env_1.env.appUrl || req.get('host'), blockletPathname);
|
|
23
23
|
}
|
|
24
|
-
function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500,
|
|
24
|
+
function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, updateInterval = 1000 * 60 * 60 * 24, failedUpdateInterval = 1000 * 60 * 60 * 24, updatedConcurrency = 10, autoReturnHtml = true, allowCrawler = () => true, }) {
|
|
25
25
|
if (!accessKey || !endpoint) {
|
|
26
26
|
throw new Error('accessKey and endpoint are required');
|
|
27
27
|
}
|
|
@@ -29,7 +29,9 @@ function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, cacheUp
|
|
|
29
29
|
endpoint,
|
|
30
30
|
accessKey,
|
|
31
31
|
cacheMax,
|
|
32
|
-
|
|
32
|
+
updateInterval,
|
|
33
|
+
failedUpdateInterval,
|
|
34
|
+
updatedConcurrency,
|
|
33
35
|
});
|
|
34
36
|
return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
|
|
35
37
|
yield cacheManager.waitReady();
|
|
@@ -37,18 +39,18 @@ function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, cacheUp
|
|
|
37
39
|
return next();
|
|
38
40
|
}
|
|
39
41
|
const fullUrl = getFullUrl(req);
|
|
40
|
-
// Always fetch content from SnapKit and cache it, even for non-crawler requests
|
|
41
|
-
if (yield cacheManager.isCacheExpired(fullUrl)) {
|
|
42
|
-
env_1.logger.info(`Cache expired for ${fullUrl}, fetching from SnapKit`);
|
|
43
|
-
// Don't await here, the cache will be effective after the next request
|
|
44
|
-
cacheManager.updateSnapshot(fullUrl);
|
|
45
|
-
}
|
|
46
42
|
if (!isSpider(req) || isSelfCrawler(req) || isStaticFile(req)) {
|
|
47
43
|
return next();
|
|
48
44
|
}
|
|
45
|
+
// fetch content from SnapKit and cache it
|
|
46
|
+
// Don't await here, the cache will be effective after the next request
|
|
47
|
+
if (yield cacheManager.isCacheExpired(fullUrl)) {
|
|
48
|
+
cacheManager.enqueueUpdateSnapshot(fullUrl);
|
|
49
|
+
}
|
|
49
50
|
// cache hit
|
|
50
51
|
const cachedSnapshot = yield cacheManager.getSnapshot(fullUrl);
|
|
51
|
-
if (cachedSnapshot) {
|
|
52
|
+
if (cachedSnapshot === null || cachedSnapshot === void 0 ? void 0 : cachedSnapshot.html) {
|
|
53
|
+
env_1.logger.info(`Cache hit: ${fullUrl}`);
|
|
52
54
|
// @ts-ignore
|
|
53
55
|
req.cachedHtml = cachedSnapshot.html;
|
|
54
56
|
if (cachedSnapshot.lastModified) {
|
|
@@ -66,7 +68,7 @@ function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, cacheUp
|
|
|
66
68
|
}
|
|
67
69
|
return next();
|
|
68
70
|
}
|
|
69
|
-
env_1.logger.
|
|
71
|
+
env_1.logger.info(`Cache miss: ${fullUrl}`);
|
|
70
72
|
return next();
|
|
71
73
|
});
|
|
72
74
|
}
|
package/lib/cjs/store/index.js
CHANGED
|
@@ -31,6 +31,7 @@ const model_snapshot_1 = require("./model-snapshot");
|
|
|
31
31
|
__exportStar(require("./model-snapshot"), exports);
|
|
32
32
|
function initDatabase() {
|
|
33
33
|
return __awaiter(this, void 0, void 0, function* () {
|
|
34
|
+
env_1.logger.debug(`Init database at ${env_1.env.databasePath}`);
|
|
34
35
|
const sequelize = new core_1.Sequelize({
|
|
35
36
|
dialect: sqlite3_1.SqliteDialect,
|
|
36
37
|
storage: env_1.env.databasePath,
|
package/lib/esm/cache.d.ts
CHANGED
|
@@ -6,16 +6,18 @@ export type CacheManagerOptions = {
|
|
|
6
6
|
accessKey: string;
|
|
7
7
|
/** Max cache size for LRU cache */
|
|
8
8
|
cacheMax?: number;
|
|
9
|
-
/**
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
9
|
+
/** When cache exceeds this time, it will try to fetch and update cache from SnapKit */
|
|
10
|
+
updateInterval?: number;
|
|
11
|
+
/** When failed cache exceeds this time, it will try to fetch and update cache from SnapKit */
|
|
12
|
+
failedUpdateInterval?: number;
|
|
13
|
+
/** Update queue concurrency */
|
|
14
|
+
updatedConcurrency?: number;
|
|
14
15
|
};
|
|
15
16
|
export declare class CacheManager {
|
|
16
17
|
private options;
|
|
17
18
|
private cache;
|
|
18
19
|
private initializedPromise;
|
|
20
|
+
private updateQueue;
|
|
19
21
|
constructor(options: CacheManagerOptions);
|
|
20
22
|
waitReady(): Promise<void>;
|
|
21
23
|
getSnapshot(url: string): Promise<SnapshotModel | null>;
|
|
@@ -23,4 +25,5 @@ export declare class CacheManager {
|
|
|
23
25
|
fetchSnapKit(url: string): Promise<any>;
|
|
24
26
|
isCacheExpired(url: string): Promise<boolean>;
|
|
25
27
|
updateSnapshot(url: string): Promise<void>;
|
|
28
|
+
enqueueUpdateSnapshot(url: string): any;
|
|
26
29
|
}
|
package/lib/esm/cache.js
CHANGED
|
@@ -9,13 +9,18 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
9
9
|
};
|
|
10
10
|
import { utils } from '@arcblock/crawler';
|
|
11
11
|
import { LRUCache } from 'lru-cache';
|
|
12
|
+
import Queue from 'queue';
|
|
12
13
|
import { joinURL } from 'ufo';
|
|
13
14
|
import { logger } from './env';
|
|
14
15
|
import { Snapshot, initDatabase } from './store/index';
|
|
15
16
|
export class CacheManager {
|
|
16
17
|
constructor(options) {
|
|
17
|
-
this.options = Object.assign({ cacheMax: 500,
|
|
18
|
+
this.options = Object.assign({ cacheMax: 500, updateInterval: 1000 * 60 * 60 * 24, failedUpdateInterval: 1000 * 60 * 60 * 24, updatedConcurrency: 10 }, options);
|
|
18
19
|
this.cache = new LRUCache({ max: this.options.cacheMax || 500 });
|
|
20
|
+
this.updateQueue = new Queue({
|
|
21
|
+
autostart: true,
|
|
22
|
+
concurrency: this.options.updatedConcurrency,
|
|
23
|
+
});
|
|
19
24
|
this.initializedPromise = Promise.all([initDatabase()]);
|
|
20
25
|
}
|
|
21
26
|
waitReady() {
|
|
@@ -45,6 +50,7 @@ export class CacheManager {
|
|
|
45
50
|
}
|
|
46
51
|
fetchSnapKit(url) {
|
|
47
52
|
return __awaiter(this, void 0, void 0, function* () {
|
|
53
|
+
var _a;
|
|
48
54
|
const { endpoint, accessKey } = this.options;
|
|
49
55
|
const api = joinURL(endpoint, 'api/crawl');
|
|
50
56
|
logger.debug('Fetching snapshot from SnapKit', { url, api });
|
|
@@ -70,7 +76,7 @@ export class CacheManager {
|
|
|
70
76
|
return snapshotData;
|
|
71
77
|
}
|
|
72
78
|
catch (error) {
|
|
73
|
-
logger.error('Failed to fetch content by SnapKit', { url, error });
|
|
79
|
+
logger.error('Failed to fetch content by SnapKit', { url, error, data: (_a = error === null || error === void 0 ? void 0 : error.response) === null || _a === void 0 ? void 0 : _a.data, accessKey });
|
|
74
80
|
return null;
|
|
75
81
|
}
|
|
76
82
|
});
|
|
@@ -81,27 +87,29 @@ export class CacheManager {
|
|
|
81
87
|
if (!snapshot) {
|
|
82
88
|
return true;
|
|
83
89
|
}
|
|
84
|
-
|
|
90
|
+
const interval = snapshot.html ? this.options.updateInterval : this.options.failedUpdateInterval;
|
|
91
|
+
return Date.now() - new Date(snapshot.updatedAt).getTime() > interval;
|
|
85
92
|
});
|
|
86
93
|
}
|
|
87
94
|
updateSnapshot(url) {
|
|
88
95
|
return __awaiter(this, void 0, void 0, function* () {
|
|
89
96
|
try {
|
|
90
97
|
const snapshot = yield this.fetchSnapKit(url);
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
this.cache.set(url, updatedSnapshot);
|
|
100
|
-
}
|
|
98
|
+
// update db
|
|
99
|
+
const [updatedSnapshot] = yield Snapshot.upsert({
|
|
100
|
+
url,
|
|
101
|
+
html: (snapshot === null || snapshot === void 0 ? void 0 : snapshot.html) || '',
|
|
102
|
+
lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
|
|
103
|
+
});
|
|
104
|
+
// update cache
|
|
105
|
+
this.cache.set(url, updatedSnapshot);
|
|
101
106
|
}
|
|
102
107
|
catch (error) {
|
|
103
108
|
logger.error('Failed to update snapshot', { url, error });
|
|
104
109
|
}
|
|
105
110
|
});
|
|
106
111
|
}
|
|
112
|
+
enqueueUpdateSnapshot(url) {
|
|
113
|
+
return this.updateQueue.push(() => this.updateSnapshot(url));
|
|
114
|
+
}
|
|
107
115
|
}
|
package/lib/esm/index.d.ts
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
import { NextFunction, Request, Response } from 'express';
|
|
2
|
-
export declare function createSnapshotMiddleware({ endpoint, accessKey, cacheMax,
|
|
2
|
+
export declare function createSnapshotMiddleware({ endpoint, accessKey, cacheMax, updateInterval, failedUpdateInterval, updatedConcurrency, autoReturnHtml, allowCrawler, }: {
|
|
3
3
|
/** SnapKit endpoint */
|
|
4
4
|
endpoint: string;
|
|
5
5
|
/** SnapKit access key */
|
|
6
6
|
accessKey: string;
|
|
7
7
|
/** Max cache size for LRU cache */
|
|
8
8
|
cacheMax?: number;
|
|
9
|
-
/**
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
9
|
+
/** When cache exceeds this time, it will try to fetch and update cache from SnapKit */
|
|
10
|
+
updateInterval?: number;
|
|
11
|
+
/** When failed cache exceeds this time, it will try to fetch and update cache from SnapKit */
|
|
12
|
+
failedUpdateInterval?: number;
|
|
13
|
+
/** Update queue concurrency */
|
|
14
|
+
updatedConcurrency?: number;
|
|
14
15
|
/** Call res.send(html) when cache hit */
|
|
15
16
|
autoReturnHtml?: boolean;
|
|
16
17
|
/** Custom function to determine whether to return cached content */
|
package/lib/esm/index.js
CHANGED
|
@@ -18,7 +18,7 @@ function getFullUrl(req) {
|
|
|
18
18
|
: req.originalUrl;
|
|
19
19
|
return joinURL(env.appUrl || req.get('host'), blockletPathname);
|
|
20
20
|
}
|
|
21
|
-
export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500,
|
|
21
|
+
export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, updateInterval = 1000 * 60 * 60 * 24, failedUpdateInterval = 1000 * 60 * 60 * 24, updatedConcurrency = 10, autoReturnHtml = true, allowCrawler = () => true, }) {
|
|
22
22
|
if (!accessKey || !endpoint) {
|
|
23
23
|
throw new Error('accessKey and endpoint are required');
|
|
24
24
|
}
|
|
@@ -26,7 +26,9 @@ export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500,
|
|
|
26
26
|
endpoint,
|
|
27
27
|
accessKey,
|
|
28
28
|
cacheMax,
|
|
29
|
-
|
|
29
|
+
updateInterval,
|
|
30
|
+
failedUpdateInterval,
|
|
31
|
+
updatedConcurrency,
|
|
30
32
|
});
|
|
31
33
|
return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
|
|
32
34
|
yield cacheManager.waitReady();
|
|
@@ -34,18 +36,18 @@ export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500,
|
|
|
34
36
|
return next();
|
|
35
37
|
}
|
|
36
38
|
const fullUrl = getFullUrl(req);
|
|
37
|
-
// Always fetch content from SnapKit and cache it, even for non-crawler requests
|
|
38
|
-
if (yield cacheManager.isCacheExpired(fullUrl)) {
|
|
39
|
-
logger.info(`Cache expired for ${fullUrl}, fetching from SnapKit`);
|
|
40
|
-
// Don't await here, the cache will be effective after the next request
|
|
41
|
-
cacheManager.updateSnapshot(fullUrl);
|
|
42
|
-
}
|
|
43
39
|
if (!isSpider(req) || isSelfCrawler(req) || isStaticFile(req)) {
|
|
44
40
|
return next();
|
|
45
41
|
}
|
|
42
|
+
// fetch content from SnapKit and cache it
|
|
43
|
+
// Don't await here, the cache will be effective after the next request
|
|
44
|
+
if (yield cacheManager.isCacheExpired(fullUrl)) {
|
|
45
|
+
cacheManager.enqueueUpdateSnapshot(fullUrl);
|
|
46
|
+
}
|
|
46
47
|
// cache hit
|
|
47
48
|
const cachedSnapshot = yield cacheManager.getSnapshot(fullUrl);
|
|
48
|
-
if (cachedSnapshot) {
|
|
49
|
+
if (cachedSnapshot === null || cachedSnapshot === void 0 ? void 0 : cachedSnapshot.html) {
|
|
50
|
+
logger.info(`Cache hit: ${fullUrl}`);
|
|
49
51
|
// @ts-ignore
|
|
50
52
|
req.cachedHtml = cachedSnapshot.html;
|
|
51
53
|
if (cachedSnapshot.lastModified) {
|
|
@@ -63,7 +65,7 @@ export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500,
|
|
|
63
65
|
}
|
|
64
66
|
return next();
|
|
65
67
|
}
|
|
66
|
-
logger.
|
|
68
|
+
logger.info(`Cache miss: ${fullUrl}`);
|
|
67
69
|
return next();
|
|
68
70
|
});
|
|
69
71
|
}
|
package/lib/esm/store/index.js
CHANGED
|
@@ -14,6 +14,7 @@ import { Snapshot } from './model-snapshot';
|
|
|
14
14
|
export * from './model-snapshot';
|
|
15
15
|
export function initDatabase() {
|
|
16
16
|
return __awaiter(this, void 0, void 0, function* () {
|
|
17
|
+
logger.debug(`Init database at ${env.databasePath}`);
|
|
17
18
|
const sequelize = new Sequelize({
|
|
18
19
|
dialect: SqliteDialect,
|
|
19
20
|
storage: env.databasePath,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@arcblock/crawler-middleware",
|
|
3
|
-
"version": "1.1.
|
|
3
|
+
"version": "1.1.2",
|
|
4
4
|
"main": "lib/cjs/index.js",
|
|
5
5
|
"module": "lib/esm/index.js",
|
|
6
6
|
"types": "lib/cjs/index.d.ts",
|
|
@@ -42,7 +42,6 @@
|
|
|
42
42
|
"dependencies": {
|
|
43
43
|
"@abtnode/cron": "^1.16.43",
|
|
44
44
|
"@abtnode/models": "^1.16.43",
|
|
45
|
-
"@abtnode/queue": "^1.16.43",
|
|
46
45
|
"@blocklet/logger": "^1.16.43",
|
|
47
46
|
"@blocklet/puppeteer": "^22.11.3",
|
|
48
47
|
"@blocklet/sdk": "^1.16.43",
|
|
@@ -53,13 +52,14 @@
|
|
|
53
52
|
"generic-pool": "^3.9.0",
|
|
54
53
|
"lodash": "^4.17.21",
|
|
55
54
|
"lru-cache": "^10.4.3",
|
|
55
|
+
"queue": "^7.0.0",
|
|
56
56
|
"redis": "^4.7.0",
|
|
57
57
|
"robots-parser": "^3.0.1",
|
|
58
58
|
"sequelize": "^6.37.7",
|
|
59
59
|
"sitemap": "^7.1.2",
|
|
60
60
|
"sqlite3": "^5.1.7",
|
|
61
61
|
"ufo": "^1.5.4",
|
|
62
|
-
"@arcblock/crawler": "1.1.
|
|
62
|
+
"@arcblock/crawler": "1.1.2"
|
|
63
63
|
},
|
|
64
64
|
"devDependencies": {
|
|
65
65
|
"@blocklet/js-sdk": "^1.16.39",
|
|
@@ -68,8 +68,8 @@
|
|
|
68
68
|
"@types/fs-extra": "^11.0.4",
|
|
69
69
|
"@types/lodash": "^4.17.16",
|
|
70
70
|
"@types/node": "^20.17.19",
|
|
71
|
-
"express": "^4.21.2",
|
|
72
71
|
"bumpp": "^9.11.1",
|
|
72
|
+
"express": "^4.21.2",
|
|
73
73
|
"nodemon": "^3.1.9",
|
|
74
74
|
"npm-run-all": "^4.1.5",
|
|
75
75
|
"puppeteer": "^24.8.2",
|