@arcblock/crawler 1.0.6 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/lib/cjs/config.d.ts +22 -0
- package/{dist → lib/cjs}/config.js +9 -3
- package/lib/cjs/crawler.d.ts +26 -0
- package/{dist → lib/cjs}/crawler.js +44 -112
- package/lib/cjs/cron.d.ts +1 -0
- package/lib/cjs/cron.js +49 -0
- package/lib/cjs/index.d.ts +9 -0
- package/lib/cjs/index.js +78 -0
- package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
- package/{dist → lib/cjs}/puppeteer.js +43 -54
- package/lib/cjs/services/snapshot.d.ts +12 -0
- package/lib/cjs/services/snapshot.js +84 -0
- package/lib/cjs/site.d.ts +2 -0
- package/lib/cjs/site.js +76 -0
- package/lib/cjs/store/index.d.ts +3 -0
- package/{dist/db → lib/cjs/store}/index.js +21 -5
- package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
- package/lib/cjs/store/job.js +110 -0
- package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
- package/lib/cjs/store/snapshot.js +68 -0
- package/lib/cjs/utils.d.ts +32 -0
- package/{dist → lib/cjs}/utils.js +67 -78
- package/lib/esm/config.d.ts +22 -0
- package/{esm → lib/esm}/config.js +9 -3
- package/lib/esm/crawler.d.ts +26 -0
- package/{esm → lib/esm}/crawler.js +35 -100
- package/lib/esm/cron.d.ts +1 -0
- package/lib/esm/cron.js +43 -0
- package/lib/esm/index.d.ts +9 -0
- package/{esm → lib/esm}/index.js +19 -10
- package/{dist → lib/esm}/puppeteer.d.ts +2 -2
- package/{esm → lib/esm}/puppeteer.js +21 -32
- package/lib/esm/services/snapshot.d.ts +12 -0
- package/lib/esm/services/snapshot.js +75 -0
- package/lib/esm/site.d.ts +2 -0
- package/lib/esm/site.js +69 -0
- package/lib/esm/store/index.d.ts +3 -0
- package/{esm/db → lib/esm/store}/index.js +22 -6
- package/{esm/db → lib/esm/store}/job.d.ts +4 -3
- package/lib/esm/store/job.js +73 -0
- package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
- package/lib/esm/store/snapshot.js +64 -0
- package/lib/esm/utils.d.ts +32 -0
- package/{esm → lib/esm}/utils.js +64 -71
- package/package.json +20 -32
- package/third.d.ts +0 -0
- package/dist/blocklet.d.ts +0 -6
- package/dist/blocklet.js +0 -199
- package/dist/cache.d.ts +0 -10
- package/dist/cache.js +0 -119
- package/dist/config.d.ts +0 -10
- package/dist/crawler.d.ts +0 -28
- package/dist/db/index.d.ts +0 -1
- package/dist/db/job.js +0 -54
- package/dist/db/snapshot.js +0 -52
- package/dist/index.d.ts +0 -6
- package/dist/index.js +0 -45
- package/dist/middleware.d.ts +0 -4
- package/dist/middleware.js +0 -44
- package/dist/utils.d.ts +0 -17
- package/esm/blocklet.d.ts +0 -6
- package/esm/blocklet.js +0 -190
- package/esm/cache.d.ts +0 -10
- package/esm/cache.js +0 -114
- package/esm/config.d.ts +0 -10
- package/esm/crawler.d.ts +0 -28
- package/esm/db/index.d.ts +0 -1
- package/esm/db/job.js +0 -50
- package/esm/db/snapshot.js +0 -48
- package/esm/index.d.ts +0 -6
- package/esm/middleware.d.ts +0 -4
- package/esm/middleware.js +0 -41
- package/esm/utils.d.ts +0 -17
package/esm/db/snapshot.js
DELETED
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
import { DataTypes, Model } from '@sequelize/core';
|
|
2
|
-
class Snapshot extends Model {
|
|
3
|
-
}
|
|
4
|
-
export { Snapshot };
|
|
5
|
-
export function initSnapshotModel(sequelize) {
|
|
6
|
-
Snapshot.init({
|
|
7
|
-
jobId: {
|
|
8
|
-
type: DataTypes.STRING,
|
|
9
|
-
primaryKey: true,
|
|
10
|
-
allowNull: false,
|
|
11
|
-
},
|
|
12
|
-
url: {
|
|
13
|
-
type: DataTypes.STRING,
|
|
14
|
-
allowNull: false,
|
|
15
|
-
index: true,
|
|
16
|
-
},
|
|
17
|
-
status: {
|
|
18
|
-
type: DataTypes.ENUM('success', 'failed'),
|
|
19
|
-
allowNull: false,
|
|
20
|
-
},
|
|
21
|
-
html: {
|
|
22
|
-
type: DataTypes.TEXT,
|
|
23
|
-
allowNull: true,
|
|
24
|
-
},
|
|
25
|
-
screenshot: {
|
|
26
|
-
type: DataTypes.STRING,
|
|
27
|
-
allowNull: true,
|
|
28
|
-
},
|
|
29
|
-
error: {
|
|
30
|
-
type: DataTypes.STRING,
|
|
31
|
-
allowNull: true,
|
|
32
|
-
},
|
|
33
|
-
lastModified: {
|
|
34
|
-
type: DataTypes.STRING,
|
|
35
|
-
allowNull: true,
|
|
36
|
-
},
|
|
37
|
-
options: {
|
|
38
|
-
type: DataTypes.JSON,
|
|
39
|
-
allowNull: true,
|
|
40
|
-
},
|
|
41
|
-
}, {
|
|
42
|
-
sequelize,
|
|
43
|
-
modelName: 'snapshot',
|
|
44
|
-
tableName: 'snap',
|
|
45
|
-
timestamps: true,
|
|
46
|
-
});
|
|
47
|
-
return Snapshot;
|
|
48
|
-
}
|
package/esm/index.d.ts
DELETED
package/esm/middleware.d.ts
DELETED
package/esm/middleware.js
DELETED
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
-
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
-
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
-
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
-
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
-
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
-
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
-
});
|
|
9
|
-
};
|
|
10
|
-
import { useCache } from './cache';
|
|
11
|
-
import { getFullUrl, isAcceptCrawler, isBotUserAgent, isSelfCrawler } from './utils';
|
|
12
|
-
export function initSEOMiddleware({ autoReturnHtml = true, allowCrawler = true, }) {
|
|
13
|
-
return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
|
|
14
|
-
const isBot = isBotUserAgent(req);
|
|
15
|
-
const isSelf = isSelfCrawler(req);
|
|
16
|
-
if (!isBot || isSelf) {
|
|
17
|
-
return next();
|
|
18
|
-
}
|
|
19
|
-
const fullUrl = getFullUrl(req);
|
|
20
|
-
const canCrawl = yield isAcceptCrawler(fullUrl);
|
|
21
|
-
const allowCrawlerResult = typeof allowCrawler === 'function' ? allowCrawler(req) : allowCrawler;
|
|
22
|
-
// can not crawl, skip
|
|
23
|
-
if (!canCrawl || !allowCrawlerResult) {
|
|
24
|
-
return next();
|
|
25
|
-
}
|
|
26
|
-
const cacheData = yield useCache.get(fullUrl);
|
|
27
|
-
// add cached html to req
|
|
28
|
-
req.cachedHtml = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.content) || cacheData || null;
|
|
29
|
-
// add cached lastModified to req, ISO string to GMT string
|
|
30
|
-
req.cachedLastmod = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified) ? new Date(cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified).toUTCString() : null;
|
|
31
|
-
if (req.cachedLastmod) {
|
|
32
|
-
res.setHeader('Last-Modified', req.cachedLastmod);
|
|
33
|
-
}
|
|
34
|
-
if (autoReturnHtml && req.cachedHtml) {
|
|
35
|
-
res.send(req.cachedHtml);
|
|
36
|
-
return;
|
|
37
|
-
}
|
|
38
|
-
// missing cache
|
|
39
|
-
next();
|
|
40
|
-
});
|
|
41
|
-
}
|
package/esm/utils.d.ts
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
import { Page } from '@blocklet/puppeteer';
|
|
2
|
-
export declare const api: import("axios").AxiosInstance;
|
|
3
|
-
export declare const sleep: (ms: number) => Promise<unknown>;
|
|
4
|
-
export declare const CRAWLER_FLAG = "x-crawler";
|
|
5
|
-
export declare const isSelfCrawler: (req: any) => boolean;
|
|
6
|
-
export declare const getDefaultRobotsUrl: (url: string) => string;
|
|
7
|
-
export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
|
|
8
|
-
export declare const getDefaultSitemapUrl: (url: string) => string;
|
|
9
|
-
export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
|
|
10
|
-
export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
|
|
11
|
-
export declare const isBotUserAgent: (req: any) => boolean;
|
|
12
|
-
export declare const getComponentInfo: () => {};
|
|
13
|
-
export declare const getFullUrl: (req: any) => string;
|
|
14
|
-
export declare const getRelativePath: (url: string) => string;
|
|
15
|
-
export declare const formatUrl: (url: string) => string;
|
|
16
|
-
export declare function md5(content: string | Uint8Array): string;
|
|
17
|
-
export declare function findMaxScrollHeight(page: Page): Promise<number>;
|