@arcblock/crawler-middleware 1.1.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,31 +2,43 @@
2
2
 
3
3
  This express middleware provides pre-rendered HTML generated by SnapKit for Blocklets, enabling them to return complete HTML content to web spiders. This is essential for SEO and ensuring that search engines can properly index dynamically generated content.
4
4
 
5
+ ## How it Works
6
+
7
+ 1. The middleware intercepts incoming requests.
8
+ 2. It checks if the request is from a web spider.
9
+ 3. It tries to read and return HTML from the local cache (Memory LRU Cache + SQLite).
10
+ 4. If the cache is not found, an asynchronous request is made to SnapKit, and the local cache is updated.
11
+ 5. The current request does not return the cached content; the next spider visit will hit step 3 and return the cache directly.
12
+
13
+ ## How to Verify
14
+
15
+ 1. Update your browser's User Agent string to include "spider"
16
+ 2. Visit a page that has already been crawled by SnapKit.
17
+ 3. First Visit (Cache Miss): On your first visit, the cache should miss. Check the server logs for a "Cache miss" message; a request should also have been sent to SnapKit to cache the page.
18
+ 4. Second Visit (Cache Hit): Wait a moment and then revisit the same page. The cache should be hit. The server logs should show a "Cache hit" message, and the returned HTML should include the meta tag: `<meta name="arcblock-crawler" content="true">`.
19
+
5
20
  ## Usage
6
21
 
7
22
  ```typescript
8
23
  import { createSnapshotMiddleware } from '@arcblock/crawler-middleware';
9
24
 
10
25
  const app = express();
26
+ const snapshotMiddleware = createSnapshotMiddleware({
27
+ endpoint: process.env.SNAP_KIT_ENDPOINT,
28
+ accessKey: process.env.SNAP_KIT_ACCESS_KEY,
29
+ allowCrawler: (req) => {
30
+ return req.path === '/';
31
+ },
32
+ });
11
33
 
12
- app.use(
13
- createSnapshotMiddleware({
14
- endpoint: process.env.SNAP_KIT_ENDPOINT,
15
- accessKey: process.env.SNAP_KIT_ACCESS_KEY,
16
- allowCrawler: (req) => {
17
- return req.path === '/';
18
- },
19
- }),
20
- );
21
- ```
22
-
23
- ## How it Works
34
+ // for all routes
35
+ app.use(snapshotMiddleware);
24
36
 
25
- 1. The middleware intercepts incoming requests.
26
- 2. It checks if the request is from a web crawler.
27
- 3. Try to read and return HTML from the local cache.
28
- 4. If the cache is not found, an asynchronous request is made to SnapKit, and the local cache is updated.
29
- 5. The current request does not return the cached content; the next crawler visit will hit step 3 and return the cache directly.
37
+ // for one route
38
+ app.use('/doc', snapshotMiddleware, (req) => {
39
+ /* ... */
40
+ });
41
+ ```
30
42
 
31
43
  ## Options
32
44
 
@@ -40,11 +52,16 @@ The options for createSnapshotMiddleware:
40
52
  accessKey: string;
41
53
  /** Max cache size for LRU cache */
42
54
  cacheMax?: number;
43
- /**
44
- * Cache update interval
45
- * When cache exceeds this time, it will try to fetch and update cache from SnapKit
46
- */
47
- cacheUpdateInterval?: number;
55
+ /** When cache exceeds this time, it will try to fetch and update cache from SnapKit */
56
+ updateInterval?: number;
57
+ /** When failed cache exceeds this time, it will try to fetch and update cache from SnapKit */
58
+ failedUpdateInterval?: number;
59
+ /** Update queue concurrency */
60
+ updatedConcurrency?: number;
61
+ /** Call res.send(html) when cache hit */
62
+ autoReturnHtml?: boolean;
63
+ /** Custom function to determine whether to return cached content */
64
+ allowCrawler?: (req: Request) => boolean;
48
65
  };
49
66
  ```
50
67
 
@@ -52,8 +69,10 @@ The options for createSnapshotMiddleware:
52
69
 
53
70
  When using this middleware outside of a Blocklet environment, you need to configure the following environment variables:
54
71
 
55
- - `BLOCKLET_APP_DATA_DIR`: (Required) Directory path for storing the sqlite file
72
+ - `BLOCKLET_DATA_DIR`: (Required) Directory path for storing the sqlite file
56
73
  - `BLOCKLET_LOG_DIR`: (Required) Directory path for storing @blocklet/logger logs
57
74
  - `BLOCKLET_APP_URL`: (Optional) Deployed domain
58
75
 
59
- You can set these variables in your `.env` file.
76
+ ## SQLite
77
+
78
+ When createSnapshotMiddleware is called, it attempts to create an SQLite database at `BLOCKLET_DATA_DIR`. This database is used to cache HTML content retrieved from SnapKit. Please ensure that the deployment environment supports SQLite.
@@ -6,16 +6,18 @@ export type CacheManagerOptions = {
6
6
  accessKey: string;
7
7
  /** Max cache size for LRU cache */
8
8
  cacheMax?: number;
9
- /**
10
- * Cache update interval
11
- * When cache exceeds this time, it will try to fetch and update cache from SnapKit
12
- */
13
- cacheUpdateInterval?: number;
9
+ /** When cache exceeds this time, it will try to fetch and update cache from SnapKit */
10
+ updateInterval?: number;
11
+ /** When failed cache exceeds this time, it will try to fetch and update cache from SnapKit */
12
+ failedUpdateInterval?: number;
13
+ /** Update queue concurrency */
14
+ updatedConcurrency?: number;
14
15
  };
15
16
  export declare class CacheManager {
16
17
  private options;
17
18
  private cache;
18
19
  private initializedPromise;
20
+ private updateQueue;
19
21
  constructor(options: CacheManagerOptions);
20
22
  waitReady(): Promise<void>;
21
23
  getSnapshot(url: string): Promise<SnapshotModel | null>;
@@ -23,4 +25,5 @@ export declare class CacheManager {
23
25
  fetchSnapKit(url: string): Promise<any>;
24
26
  isCacheExpired(url: string): Promise<boolean>;
25
27
  updateSnapshot(url: string): Promise<void>;
28
+ enqueueUpdateSnapshot(url: string): any;
26
29
  }
package/lib/cjs/cache.js CHANGED
@@ -8,17 +8,25 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
8
8
  step((generator = generator.apply(thisArg, _arguments || [])).next());
9
9
  });
10
10
  };
11
+ var __importDefault = (this && this.__importDefault) || function (mod) {
12
+ return (mod && mod.__esModule) ? mod : { "default": mod };
13
+ };
11
14
  Object.defineProperty(exports, "__esModule", { value: true });
12
15
  exports.CacheManager = void 0;
13
16
  const crawler_1 = require("@arcblock/crawler");
14
17
  const lru_cache_1 = require("lru-cache");
18
+ const queue_1 = __importDefault(require("queue"));
15
19
  const ufo_1 = require("ufo");
16
20
  const env_1 = require("./env");
17
21
  const index_1 = require("./store/index");
18
22
  class CacheManager {
19
23
  constructor(options) {
20
- this.options = Object.assign({ cacheMax: 500, cacheUpdateInterval: 1000 * 60 * 60 * 24 }, options);
24
+ this.options = Object.assign({ cacheMax: 500, updateInterval: 1000 * 60 * 60 * 24, failedUpdateInterval: 1000 * 60 * 60 * 24, updatedConcurrency: 10 }, options);
21
25
  this.cache = new lru_cache_1.LRUCache({ max: this.options.cacheMax || 500 });
26
+ this.updateQueue = new queue_1.default({
27
+ autostart: true,
28
+ concurrency: this.options.updatedConcurrency,
29
+ });
22
30
  this.initializedPromise = Promise.all([(0, index_1.initDatabase)()]);
23
31
  }
24
32
  waitReady() {
@@ -48,6 +56,7 @@ class CacheManager {
48
56
  }
49
57
  fetchSnapKit(url) {
50
58
  return __awaiter(this, void 0, void 0, function* () {
59
+ var _a;
51
60
  const { endpoint, accessKey } = this.options;
52
61
  const api = (0, ufo_1.joinURL)(endpoint, 'api/crawl');
53
62
  env_1.logger.debug('Fetching snapshot from SnapKit', { url, api });
@@ -73,7 +82,7 @@ class CacheManager {
73
82
  return snapshotData;
74
83
  }
75
84
  catch (error) {
76
- env_1.logger.error('Failed to fetch content by SnapKit', { url, error });
85
+ env_1.logger.error('Failed to fetch content by SnapKit', { url, error, data: (_a = error === null || error === void 0 ? void 0 : error.response) === null || _a === void 0 ? void 0 : _a.data });
77
86
  return null;
78
87
  }
79
88
  });
@@ -84,28 +93,30 @@ class CacheManager {
84
93
  if (!snapshot) {
85
94
  return true;
86
95
  }
87
- return Date.now() - new Date(snapshot.createdAt).getTime() > this.options.cacheUpdateInterval;
96
+ const interval = snapshot.html ? this.options.updateInterval : this.options.failedUpdateInterval;
97
+ return Date.now() - new Date(snapshot.updatedAt).getTime() > interval;
88
98
  });
89
99
  }
90
100
  updateSnapshot(url) {
91
101
  return __awaiter(this, void 0, void 0, function* () {
92
102
  try {
93
103
  const snapshot = yield this.fetchSnapKit(url);
94
- if (snapshot) {
95
- // update db
96
- const [updatedSnapshot] = yield index_1.Snapshot.upsert({
97
- url,
98
- html: snapshot.html,
99
- lastModified: snapshot.lastModified,
100
- });
101
- // update cache
102
- this.cache.set(url, updatedSnapshot);
103
- }
104
+ // update db
105
+ const [updatedSnapshot] = yield index_1.Snapshot.upsert({
106
+ url,
107
+ html: (snapshot === null || snapshot === void 0 ? void 0 : snapshot.html) || '',
108
+ lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
109
+ });
110
+ // update cache
111
+ this.cache.set(url, updatedSnapshot);
104
112
  }
105
113
  catch (error) {
106
114
  env_1.logger.error('Failed to update snapshot', { url, error });
107
115
  }
108
116
  });
109
117
  }
118
+ enqueueUpdateSnapshot(url) {
119
+ return this.updateQueue.push(() => this.updateSnapshot(url));
120
+ }
110
121
  }
111
122
  exports.CacheManager = CacheManager;
@@ -1,16 +1,17 @@
1
1
  import { NextFunction, Request, Response } from 'express';
2
- export declare function createSnapshotMiddleware({ endpoint, accessKey, cacheMax, cacheUpdateInterval, autoReturnHtml, allowCrawler, }: {
2
+ export declare function createSnapshotMiddleware({ endpoint, accessKey, cacheMax, updateInterval, failedUpdateInterval, updatedConcurrency, autoReturnHtml, allowCrawler, }: {
3
3
  /** SnapKit endpoint */
4
4
  endpoint: string;
5
5
  /** SnapKit access key */
6
6
  accessKey: string;
7
7
  /** Max cache size for LRU cache */
8
8
  cacheMax?: number;
9
- /**
10
- * Cache update interval
11
- * When cache exceeds this time, it will try to fetch and update cache from SnapKit
12
- */
13
- cacheUpdateInterval?: number;
9
+ /** When cache exceeds this time, it will try to fetch and update cache from SnapKit */
10
+ updateInterval?: number;
11
+ /** When failed cache exceeds this time, it will try to fetch and update cache from SnapKit */
12
+ failedUpdateInterval?: number;
13
+ /** Update queue concurrency */
14
+ updatedConcurrency?: number;
14
15
  /** Call res.send(html) when cache hit */
15
16
  autoReturnHtml?: boolean;
16
17
  /** Custom function to determine whether to return cached content */
package/lib/cjs/index.js CHANGED
@@ -21,7 +21,7 @@ function getFullUrl(req) {
21
21
  : req.originalUrl;
22
22
  return (0, ufo_1.joinURL)(env_1.env.appUrl || req.get('host'), blockletPathname);
23
23
  }
24
- function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, cacheUpdateInterval = 1000 * 60 * 60 * 24, autoReturnHtml = true, allowCrawler = () => true, }) {
24
+ function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, updateInterval = 1000 * 60 * 60 * 24, failedUpdateInterval = 1000 * 60 * 60 * 24, updatedConcurrency = 10, autoReturnHtml = true, allowCrawler = () => true, }) {
25
25
  if (!accessKey || !endpoint) {
26
26
  throw new Error('accessKey and endpoint are required');
27
27
  }
@@ -29,7 +29,9 @@ function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, cacheUp
29
29
  endpoint,
30
30
  accessKey,
31
31
  cacheMax,
32
- cacheUpdateInterval,
32
+ updateInterval,
33
+ failedUpdateInterval,
34
+ updatedConcurrency,
33
35
  });
34
36
  return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
35
37
  yield cacheManager.waitReady();
@@ -37,18 +39,18 @@ function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, cacheUp
37
39
  return next();
38
40
  }
39
41
  const fullUrl = getFullUrl(req);
40
- // Always fetch content from SnapKit and cache it, even for non-crawler requests
41
- if (yield cacheManager.isCacheExpired(fullUrl)) {
42
- env_1.logger.info(`Cache expired for ${fullUrl}, fetching from SnapKit`);
43
- // Don't await here, the cache will be effective after the next request
44
- cacheManager.updateSnapshot(fullUrl);
45
- }
46
42
  if (!isSpider(req) || isSelfCrawler(req) || isStaticFile(req)) {
47
43
  return next();
48
44
  }
45
+ // fetch content from SnapKit and cache it
46
+ // Don't await here, the cache will be effective after the next request
47
+ if (yield cacheManager.isCacheExpired(fullUrl)) {
48
+ cacheManager.enqueueUpdateSnapshot(fullUrl);
49
+ }
49
50
  // cache hit
50
51
  const cachedSnapshot = yield cacheManager.getSnapshot(fullUrl);
51
- if (cachedSnapshot) {
52
+ if (cachedSnapshot === null || cachedSnapshot === void 0 ? void 0 : cachedSnapshot.html) {
53
+ env_1.logger.info(`Cache hit: ${fullUrl}`);
52
54
  // @ts-ignore
53
55
  req.cachedHtml = cachedSnapshot.html;
54
56
  if (cachedSnapshot.lastModified) {
@@ -66,7 +68,7 @@ function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, cacheUp
66
68
  }
67
69
  return next();
68
70
  }
69
- env_1.logger.debug(`Cache not hit: ${fullUrl}`);
71
+ env_1.logger.info(`Cache miss: ${fullUrl}`);
70
72
  return next();
71
73
  });
72
74
  }
@@ -31,6 +31,7 @@ const model_snapshot_1 = require("./model-snapshot");
31
31
  __exportStar(require("./model-snapshot"), exports);
32
32
  function initDatabase() {
33
33
  return __awaiter(this, void 0, void 0, function* () {
34
+ env_1.logger.debug(`Init database at ${env_1.env.databasePath}`);
34
35
  const sequelize = new core_1.Sequelize({
35
36
  dialect: sqlite3_1.SqliteDialect,
36
37
  storage: env_1.env.databasePath,
@@ -6,16 +6,18 @@ export type CacheManagerOptions = {
6
6
  accessKey: string;
7
7
  /** Max cache size for LRU cache */
8
8
  cacheMax?: number;
9
- /**
10
- * Cache update interval
11
- * When cache exceeds this time, it will try to fetch and update cache from SnapKit
12
- */
13
- cacheUpdateInterval?: number;
9
+ /** When cache exceeds this time, it will try to fetch and update cache from SnapKit */
10
+ updateInterval?: number;
11
+ /** When failed cache exceeds this time, it will try to fetch and update cache from SnapKit */
12
+ failedUpdateInterval?: number;
13
+ /** Update queue concurrency */
14
+ updatedConcurrency?: number;
14
15
  };
15
16
  export declare class CacheManager {
16
17
  private options;
17
18
  private cache;
18
19
  private initializedPromise;
20
+ private updateQueue;
19
21
  constructor(options: CacheManagerOptions);
20
22
  waitReady(): Promise<void>;
21
23
  getSnapshot(url: string): Promise<SnapshotModel | null>;
@@ -23,4 +25,5 @@ export declare class CacheManager {
23
25
  fetchSnapKit(url: string): Promise<any>;
24
26
  isCacheExpired(url: string): Promise<boolean>;
25
27
  updateSnapshot(url: string): Promise<void>;
28
+ enqueueUpdateSnapshot(url: string): any;
26
29
  }
package/lib/esm/cache.js CHANGED
@@ -9,13 +9,18 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
9
9
  };
10
10
  import { utils } from '@arcblock/crawler';
11
11
  import { LRUCache } from 'lru-cache';
12
+ import Queue from 'queue';
12
13
  import { joinURL } from 'ufo';
13
14
  import { logger } from './env';
14
15
  import { Snapshot, initDatabase } from './store/index';
15
16
  export class CacheManager {
16
17
  constructor(options) {
17
- this.options = Object.assign({ cacheMax: 500, cacheUpdateInterval: 1000 * 60 * 60 * 24 }, options);
18
+ this.options = Object.assign({ cacheMax: 500, updateInterval: 1000 * 60 * 60 * 24, failedUpdateInterval: 1000 * 60 * 60 * 24, updatedConcurrency: 10 }, options);
18
19
  this.cache = new LRUCache({ max: this.options.cacheMax || 500 });
20
+ this.updateQueue = new Queue({
21
+ autostart: true,
22
+ concurrency: this.options.updatedConcurrency,
23
+ });
19
24
  this.initializedPromise = Promise.all([initDatabase()]);
20
25
  }
21
26
  waitReady() {
@@ -45,6 +50,7 @@ export class CacheManager {
45
50
  }
46
51
  fetchSnapKit(url) {
47
52
  return __awaiter(this, void 0, void 0, function* () {
53
+ var _a;
48
54
  const { endpoint, accessKey } = this.options;
49
55
  const api = joinURL(endpoint, 'api/crawl');
50
56
  logger.debug('Fetching snapshot from SnapKit', { url, api });
@@ -70,7 +76,7 @@ export class CacheManager {
70
76
  return snapshotData;
71
77
  }
72
78
  catch (error) {
73
- logger.error('Failed to fetch content by SnapKit', { url, error });
79
+ logger.error('Failed to fetch content by SnapKit', { url, error, data: (_a = error === null || error === void 0 ? void 0 : error.response) === null || _a === void 0 ? void 0 : _a.data });
74
80
  return null;
75
81
  }
76
82
  });
@@ -81,27 +87,29 @@ export class CacheManager {
81
87
  if (!snapshot) {
82
88
  return true;
83
89
  }
84
- return Date.now() - new Date(snapshot.createdAt).getTime() > this.options.cacheUpdateInterval;
90
+ const interval = snapshot.html ? this.options.updateInterval : this.options.failedUpdateInterval;
91
+ return Date.now() - new Date(snapshot.updatedAt).getTime() > interval;
85
92
  });
86
93
  }
87
94
  updateSnapshot(url) {
88
95
  return __awaiter(this, void 0, void 0, function* () {
89
96
  try {
90
97
  const snapshot = yield this.fetchSnapKit(url);
91
- if (snapshot) {
92
- // update db
93
- const [updatedSnapshot] = yield Snapshot.upsert({
94
- url,
95
- html: snapshot.html,
96
- lastModified: snapshot.lastModified,
97
- });
98
- // update cache
99
- this.cache.set(url, updatedSnapshot);
100
- }
98
+ // update db
99
+ const [updatedSnapshot] = yield Snapshot.upsert({
100
+ url,
101
+ html: (snapshot === null || snapshot === void 0 ? void 0 : snapshot.html) || '',
102
+ lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
103
+ });
104
+ // update cache
105
+ this.cache.set(url, updatedSnapshot);
101
106
  }
102
107
  catch (error) {
103
108
  logger.error('Failed to update snapshot', { url, error });
104
109
  }
105
110
  });
106
111
  }
112
+ enqueueUpdateSnapshot(url) {
113
+ return this.updateQueue.push(() => this.updateSnapshot(url));
114
+ }
107
115
  }
@@ -1,16 +1,17 @@
1
1
  import { NextFunction, Request, Response } from 'express';
2
- export declare function createSnapshotMiddleware({ endpoint, accessKey, cacheMax, cacheUpdateInterval, autoReturnHtml, allowCrawler, }: {
2
+ export declare function createSnapshotMiddleware({ endpoint, accessKey, cacheMax, updateInterval, failedUpdateInterval, updatedConcurrency, autoReturnHtml, allowCrawler, }: {
3
3
  /** SnapKit endpoint */
4
4
  endpoint: string;
5
5
  /** SnapKit access key */
6
6
  accessKey: string;
7
7
  /** Max cache size for LRU cache */
8
8
  cacheMax?: number;
9
- /**
10
- * Cache update interval
11
- * When cache exceeds this time, it will try to fetch and update cache from SnapKit
12
- */
13
- cacheUpdateInterval?: number;
9
+ /** When cache exceeds this time, it will try to fetch and update cache from SnapKit */
10
+ updateInterval?: number;
11
+ /** When failed cache exceeds this time, it will try to fetch and update cache from SnapKit */
12
+ failedUpdateInterval?: number;
13
+ /** Update queue concurrency */
14
+ updatedConcurrency?: number;
14
15
  /** Call res.send(html) when cache hit */
15
16
  autoReturnHtml?: boolean;
16
17
  /** Custom function to determine whether to return cached content */
package/lib/esm/index.js CHANGED
@@ -18,7 +18,7 @@ function getFullUrl(req) {
18
18
  : req.originalUrl;
19
19
  return joinURL(env.appUrl || req.get('host'), blockletPathname);
20
20
  }
21
- export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, cacheUpdateInterval = 1000 * 60 * 60 * 24, autoReturnHtml = true, allowCrawler = () => true, }) {
21
+ export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500, updateInterval = 1000 * 60 * 60 * 24, failedUpdateInterval = 1000 * 60 * 60 * 24, updatedConcurrency = 10, autoReturnHtml = true, allowCrawler = () => true, }) {
22
22
  if (!accessKey || !endpoint) {
23
23
  throw new Error('accessKey and endpoint are required');
24
24
  }
@@ -26,7 +26,9 @@ export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500,
26
26
  endpoint,
27
27
  accessKey,
28
28
  cacheMax,
29
- cacheUpdateInterval,
29
+ updateInterval,
30
+ failedUpdateInterval,
31
+ updatedConcurrency,
30
32
  });
31
33
  return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
32
34
  yield cacheManager.waitReady();
@@ -34,18 +36,18 @@ export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500,
34
36
  return next();
35
37
  }
36
38
  const fullUrl = getFullUrl(req);
37
- // Always fetch content from SnapKit and cache it, even for non-crawler requests
38
- if (yield cacheManager.isCacheExpired(fullUrl)) {
39
- logger.info(`Cache expired for ${fullUrl}, fetching from SnapKit`);
40
- // Don't await here, the cache will be effective after the next request
41
- cacheManager.updateSnapshot(fullUrl);
42
- }
43
39
  if (!isSpider(req) || isSelfCrawler(req) || isStaticFile(req)) {
44
40
  return next();
45
41
  }
42
+ // fetch content from SnapKit and cache it
43
+ // Don't await here, the cache will be effective after the next request
44
+ if (yield cacheManager.isCacheExpired(fullUrl)) {
45
+ cacheManager.enqueueUpdateSnapshot(fullUrl);
46
+ }
46
47
  // cache hit
47
48
  const cachedSnapshot = yield cacheManager.getSnapshot(fullUrl);
48
- if (cachedSnapshot) {
49
+ if (cachedSnapshot === null || cachedSnapshot === void 0 ? void 0 : cachedSnapshot.html) {
50
+ logger.info(`Cache hit: ${fullUrl}`);
49
51
  // @ts-ignore
50
52
  req.cachedHtml = cachedSnapshot.html;
51
53
  if (cachedSnapshot.lastModified) {
@@ -63,7 +65,7 @@ export function createSnapshotMiddleware({ endpoint, accessKey, cacheMax = 500,
63
65
  }
64
66
  return next();
65
67
  }
66
- logger.debug(`Cache not hit: ${fullUrl}`);
68
+ logger.info(`Cache miss: ${fullUrl}`);
67
69
  return next();
68
70
  });
69
71
  }
@@ -14,6 +14,7 @@ import { Snapshot } from './model-snapshot';
14
14
  export * from './model-snapshot';
15
15
  export function initDatabase() {
16
16
  return __awaiter(this, void 0, void 0, function* () {
17
+ logger.debug(`Init database at ${env.databasePath}`);
17
18
  const sequelize = new Sequelize({
18
19
  dialect: SqliteDialect,
19
20
  storage: env.databasePath,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arcblock/crawler-middleware",
3
- "version": "1.1.1",
3
+ "version": "1.1.3",
4
4
  "main": "lib/cjs/index.js",
5
5
  "module": "lib/esm/index.js",
6
6
  "types": "lib/cjs/index.d.ts",
@@ -41,38 +41,24 @@
41
41
  },
42
42
  "dependencies": {
43
43
  "@abtnode/cron": "^1.16.43",
44
- "@abtnode/models": "^1.16.43",
45
- "@abtnode/queue": "^1.16.43",
46
44
  "@blocklet/logger": "^1.16.43",
47
- "@blocklet/puppeteer": "^22.11.3",
48
45
  "@blocklet/sdk": "^1.16.43",
49
46
  "@sequelize/core": "7.0.0-alpha.46",
50
47
  "@sequelize/sqlite3": "7.0.0-alpha.46",
51
48
  "axios": "^1.7.9",
52
- "fs-extra": "^11.2.0",
53
- "generic-pool": "^3.9.0",
54
- "lodash": "^4.17.21",
55
49
  "lru-cache": "^10.4.3",
56
- "redis": "^4.7.0",
57
- "robots-parser": "^3.0.1",
58
- "sequelize": "^6.37.7",
59
- "sitemap": "^7.1.2",
50
+ "queue": "^7.0.0",
60
51
  "sqlite3": "^5.1.7",
61
52
  "ufo": "^1.5.4",
62
- "@arcblock/crawler": "1.1.1"
53
+ "@arcblock/crawler": "1.1.3"
63
54
  },
64
55
  "devDependencies": {
65
- "@blocklet/js-sdk": "^1.16.39",
66
- "@types/dotenv-flow": "^3.3.3",
67
56
  "@types/express": "^4.17.21",
68
- "@types/fs-extra": "^11.0.4",
69
- "@types/lodash": "^4.17.16",
70
57
  "@types/node": "^20.17.19",
71
- "express": "^4.21.2",
72
58
  "bumpp": "^9.11.1",
59
+ "express": "^4.21.2",
73
60
  "nodemon": "^3.1.9",
74
61
  "npm-run-all": "^4.1.5",
75
- "puppeteer": "^24.8.2",
76
62
  "tsx": "^4.19.3",
77
63
  "zx": "^8.3.2"
78
64
  },