@arcblock/crawler 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/crawler.d.ts +1 -1
- package/lib/cjs/crawler.js +17 -2
- package/lib/cjs/store/job.d.ts +3 -0
- package/lib/cjs/utils.js +5 -0
- package/lib/esm/crawler.d.ts +1 -1
- package/lib/esm/crawler.js +17 -2
- package/lib/esm/store/job.d.ts +3 -0
- package/lib/esm/utils.js +5 -0
- package/package.json +9 -10
package/lib/cjs/crawler.d.ts
CHANGED
|
@@ -5,7 +5,7 @@ export declare function getDataDir(): Promise<{
|
|
|
5
5
|
htmlDir: string;
|
|
6
6
|
screenshotDir: string;
|
|
7
7
|
}>;
|
|
8
|
-
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
|
|
8
|
+
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
|
|
9
9
|
html: string | null;
|
|
10
10
|
screenshot: Uint8Array<ArrayBufferLike> | null;
|
|
11
11
|
meta: {
|
package/lib/cjs/crawler.js
CHANGED
|
@@ -59,7 +59,12 @@ function createCrawlQueue() {
|
|
|
59
59
|
// }
|
|
60
60
|
try {
|
|
61
61
|
// get page content later
|
|
62
|
-
const result = yield (0, exports.getPageContent)(
|
|
62
|
+
const result = yield (0, exports.getPageContent)(Object.assign({ localStorage: {
|
|
63
|
+
// for blocklet theme
|
|
64
|
+
blocklet_theme_prefer: 'light',
|
|
65
|
+
// for blocklet domain warning
|
|
66
|
+
'domain-warning-skip': Date.now().toString(),
|
|
67
|
+
} }, job));
|
|
63
68
|
if (!result || (!result.html && !result.screenshot)) {
|
|
64
69
|
config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
|
|
65
70
|
const snapshot = (0, snapshot_1.convertJobToSnapshot)({
|
|
@@ -137,7 +142,7 @@ function saveSnapshotToLocal(_a) {
|
|
|
137
142
|
};
|
|
138
143
|
});
|
|
139
144
|
}
|
|
140
|
-
const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
|
|
145
|
+
const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
|
|
141
146
|
const page = yield (0, puppeteer_1.initPage)();
|
|
142
147
|
if (width && height) {
|
|
143
148
|
yield page.setViewport({ width, height, deviceScaleFactor: 2 });
|
|
@@ -145,6 +150,16 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
|
|
|
145
150
|
if (headers) {
|
|
146
151
|
yield page.setExtraHTTPHeaders(headers);
|
|
147
152
|
}
|
|
153
|
+
if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
|
|
154
|
+
yield page.setCookie(...cookies);
|
|
155
|
+
}
|
|
156
|
+
if (localStorage) {
|
|
157
|
+
yield page.evaluateOnNewDocument((items) => {
|
|
158
|
+
Object.entries(items).forEach(([key, value]) => {
|
|
159
|
+
window.localStorage.setItem(key, value);
|
|
160
|
+
});
|
|
161
|
+
}, localStorage);
|
|
162
|
+
}
|
|
148
163
|
let html = null;
|
|
149
164
|
let screenshot = null;
|
|
150
165
|
const meta = {};
|
package/lib/cjs/store/job.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { CookieParam } from '@blocklet/puppeteer';
|
|
1
2
|
import { Model, Sequelize } from '@sequelize/core';
|
|
2
3
|
export interface JobState {
|
|
3
4
|
id?: string;
|
|
@@ -12,6 +13,8 @@ export interface JobState {
|
|
|
12
13
|
fullPage?: boolean;
|
|
13
14
|
lastModified?: string;
|
|
14
15
|
headers?: Record<string, string>;
|
|
16
|
+
cookies?: CookieParam[];
|
|
17
|
+
localStorage?: Record<string, string>;
|
|
15
18
|
}
|
|
16
19
|
export interface JobModel {
|
|
17
20
|
id: string;
|
package/lib/cjs/utils.js
CHANGED
package/lib/esm/crawler.d.ts
CHANGED
|
@@ -5,7 +5,7 @@ export declare function getDataDir(): Promise<{
|
|
|
5
5
|
htmlDir: string;
|
|
6
6
|
screenshotDir: string;
|
|
7
7
|
}>;
|
|
8
|
-
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
|
|
8
|
+
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
|
|
9
9
|
html: string | null;
|
|
10
10
|
screenshot: Uint8Array<ArrayBufferLike> | null;
|
|
11
11
|
meta: {
|
package/lib/esm/crawler.js
CHANGED
|
@@ -50,7 +50,12 @@ export function createCrawlQueue() {
|
|
|
50
50
|
// }
|
|
51
51
|
try {
|
|
52
52
|
// get page content later
|
|
53
|
-
const result = yield getPageContent(
|
|
53
|
+
const result = yield getPageContent(Object.assign({ localStorage: {
|
|
54
|
+
// for blocklet theme
|
|
55
|
+
blocklet_theme_prefer: 'light',
|
|
56
|
+
// for blocklet domain warning
|
|
57
|
+
'domain-warning-skip': Date.now().toString(),
|
|
58
|
+
} }, job));
|
|
54
59
|
if (!result || (!result.html && !result.screenshot)) {
|
|
55
60
|
logger.error(`failed to crawl ${job.url}, empty content`, job);
|
|
56
61
|
const snapshot = convertJobToSnapshot({
|
|
@@ -128,7 +133,7 @@ function saveSnapshotToLocal(_a) {
|
|
|
128
133
|
};
|
|
129
134
|
});
|
|
130
135
|
}
|
|
131
|
-
export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
|
|
136
|
+
export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
|
|
132
137
|
const page = yield initPage();
|
|
133
138
|
if (width && height) {
|
|
134
139
|
yield page.setViewport({ width, height, deviceScaleFactor: 2 });
|
|
@@ -136,6 +141,16 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
136
141
|
if (headers) {
|
|
137
142
|
yield page.setExtraHTTPHeaders(headers);
|
|
138
143
|
}
|
|
144
|
+
if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
|
|
145
|
+
yield page.setCookie(...cookies);
|
|
146
|
+
}
|
|
147
|
+
if (localStorage) {
|
|
148
|
+
yield page.evaluateOnNewDocument((items) => {
|
|
149
|
+
Object.entries(items).forEach(([key, value]) => {
|
|
150
|
+
window.localStorage.setItem(key, value);
|
|
151
|
+
});
|
|
152
|
+
}, localStorage);
|
|
153
|
+
}
|
|
139
154
|
let html = null;
|
|
140
155
|
let screenshot = null;
|
|
141
156
|
const meta = {};
|
package/lib/esm/store/job.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { CookieParam } from '@blocklet/puppeteer';
|
|
1
2
|
import { Model, Sequelize } from '@sequelize/core';
|
|
2
3
|
export interface JobState {
|
|
3
4
|
id?: string;
|
|
@@ -12,6 +13,8 @@ export interface JobState {
|
|
|
12
13
|
fullPage?: boolean;
|
|
13
14
|
lastModified?: string;
|
|
14
15
|
headers?: Record<string, string>;
|
|
16
|
+
cookies?: CookieParam[];
|
|
17
|
+
localStorage?: Record<string, string>;
|
|
15
18
|
}
|
|
16
19
|
export interface JobModel {
|
|
17
20
|
id: string;
|
package/lib/esm/utils.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@arcblock/crawler",
|
|
3
|
-
"version": "1.1.4",
|
|
3
|
+
"version": "1.1.5",
|
|
4
4
|
"main": "lib/cjs/index.js",
|
|
5
5
|
"module": "lib/esm/index.js",
|
|
6
6
|
"types": "lib/cjs/index.d.ts",
|
|
@@ -45,33 +45,32 @@
|
|
|
45
45
|
]
|
|
46
46
|
},
|
|
47
47
|
"dependencies": {
|
|
48
|
-
"@abtnode/cron": "^1.16.
|
|
49
|
-
"@abtnode/models": "^1.16.
|
|
50
|
-
"@abtnode/queue": "^1.16.
|
|
51
|
-
"@blocklet/logger": "^1.16.
|
|
48
|
+
"@abtnode/cron": "^1.16.44",
|
|
49
|
+
"@abtnode/models": "^1.16.44",
|
|
50
|
+
"@abtnode/queue": "^1.16.44",
|
|
51
|
+
"@blocklet/logger": "^1.16.44",
|
|
52
52
|
"@blocklet/puppeteer": "^22.11.3",
|
|
53
|
-
"@blocklet/sdk": "^1.16.
|
|
53
|
+
"@blocklet/sdk": "^1.16.44",
|
|
54
54
|
"@sequelize/core": "7.0.0-alpha.46",
|
|
55
55
|
"@sequelize/sqlite3": "7.0.0-alpha.46",
|
|
56
56
|
"axios": "^1.7.9",
|
|
57
57
|
"fs-extra": "^11.2.0",
|
|
58
58
|
"lodash": "^4.17.21",
|
|
59
59
|
"lru-cache": "^10.4.3",
|
|
60
|
+
"p-map": "^7.0.3",
|
|
60
61
|
"robots-parser": "^3.0.1",
|
|
61
62
|
"sitemap": "^7.1.2",
|
|
62
63
|
"sqlite3": "^5.1.7",
|
|
63
|
-
"ufo": "^1.5.4"
|
|
64
|
-
"p-map": "^7.0.3"
|
|
64
|
+
"ufo": "^1.5.4"
|
|
65
65
|
},
|
|
66
66
|
"devDependencies": {
|
|
67
|
-
"@blocklet/js-sdk": "^1.16.39",
|
|
68
67
|
"@types/dotenv-flow": "^3.3.3",
|
|
69
68
|
"@types/express": "^4.17.21",
|
|
70
69
|
"@types/fs-extra": "^11.0.4",
|
|
71
70
|
"@types/lodash": "^4.17.16",
|
|
72
71
|
"@types/node": "^20.17.19",
|
|
73
|
-
"express": "^4.21.2",
|
|
74
72
|
"bumpp": "^9.11.1",
|
|
73
|
+
"express": "^4.21.2",
|
|
75
74
|
"nodemon": "^3.1.9",
|
|
76
75
|
"npm-run-all": "^4.1.5",
|
|
77
76
|
"puppeteer": "^24.8.2",
|