@arcblock/crawler 1.1.4 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@ export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
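
The new `cookies` and `localStorage` options are additional fields of the `JobState` argument shown above. A minimal usage sketch, assuming the function is reachable from the package entry point (the import path, cookie values, and localStorage keys are illustrative, not taken from this diff):

```ts
import { getPageContent } from '@arcblock/crawler'; // assumed entry point; may differ

const { html, screenshot, meta } = await getPageContent({
  url: 'https://example.com',
  // New in 1.1.5: cookies are installed with page.setCookie() before navigation.
  cookies: [{ name: 'session', value: 'abc123', domain: 'example.com' }],
  // New in 1.1.5: localStorage entries are seeded via page.evaluateOnNewDocument().
  localStorage: { blocklet_theme_prefer: 'light' },
});
```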
@@ -59,7 +59,12 @@ function createCrawlQueue() {
             // }
             try {
                 // get page content later
-                const result = yield (0, exports.getPageContent)(job);
+                const result = yield (0, exports.getPageContent)(Object.assign({ localStorage: {
+                        // for blocklet theme
+                        blocklet_theme_prefer: 'light',
+                        // for blocklet domain warning
+                        'domain-warning-skip': Date.now().toString(),
+                    } }, job));
                 if (!result || (!result.html && !result.screenshot)) {
                     config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
                     const snapshot = (0, snapshot_1.convertJobToSnapshot)({
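
One consequence of the hunk above: the default localStorage entries are the first argument to Object.assign, so a job that supplies its own localStorage replaces the defaults wholesale rather than merging key by key. A small illustration (the job values are hypothetical):

```ts
// Hypothetical job; values are for illustration only.
const job = {
  url: 'https://example.com/docs',
  localStorage: { blocklet_theme_prefer: 'dark' },
};

// Equivalent of what createCrawlQueue now passes to getPageContent:
const merged = Object.assign(
  {
    localStorage: {
      blocklet_theme_prefer: 'light',
      'domain-warning-skip': Date.now().toString(),
    },
  },
  job,
);
// merged.localStorage === job.localStorage: the defaults are dropped, not merged per key.
```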
@@ -137,7 +142,7 @@ function saveSnapshotToLocal(_a) {
         };
     });
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
+const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
     const page = yield (0, puppeteer_1.initPage)();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -145,6 +150,16 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
+    if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
+        yield page.setCookie(...cookies);
+    }
+    if (localStorage) {
+        yield page.evaluateOnNewDocument((items) => {
+            Object.entries(items).forEach(([key, value]) => {
+                window.localStorage.setItem(key, value);
+            });
+        }, localStorage);
+    }
     let html = null;
     let screenshot = null;
     const meta = {};
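
The compiled blocks above boil down to two pre-navigation steps on the Puppeteer page. A minimal sketch of the same setup in source form, assuming @blocklet/puppeteer re-exports the standard Puppeteer `Page` type (only `CookieParam` is confirmed by this diff):

```ts
import type { CookieParam, Page } from '@blocklet/puppeteer';

async function preparePage(page: Page, cookies: CookieParam[], localStorage?: Record<string, string>) {
  if (cookies.length) {
    // Cookies must be set before page.goto() so the very first request carries them.
    await page.setCookie(...cookies);
  }
  if (localStorage) {
    // evaluateOnNewDocument() runs in every new document before any page script,
    // so the entries are already present when the crawled app boots.
    await page.evaluateOnNewDocument((items: Record<string, string>) => {
      Object.entries(items).forEach(([key, value]) => {
        window.localStorage.setItem(key, value);
      });
    }, localStorage);
  }
}
```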
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { Model, Sequelize } from '@sequelize/core';
 export interface JobState {
     id?: string;
@@ -12,6 +13,8 @@ export interface JobState {
     fullPage?: boolean;
     lastModified?: string;
     headers?: Record<string, string>;
+    cookies?: CookieParam[];
+    localStorage?: Record<string, string>;
 }
 export interface JobModel {
     id: string;
package/lib/cjs/utils.js CHANGED
@@ -134,6 +134,11 @@ const staticFileExtensions = [
     'xls',
     'xml',
     'zip',
+    'ts',
+    'json',
+    'md',
+    'yml',
+    'yaml',
 ];
 const sleep = (ms) => {
     return new Promise((resolve) => {
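
The diff does not show where staticFileExtensions is consumed, but a list like this is typically used to skip URLs that cannot be rendered as pages. A hypothetical filter in that spirit (the helper name and logic are illustrative, not from the package):

```ts
const staticFileExtensions = ['xls', 'xml', 'zip', 'ts', 'json', 'md', 'yml', 'yaml'];

// Hypothetical helper: true when the URL's pathname ends in a known static extension.
function isStaticFileUrl(url: string): boolean {
  const pathname = new URL(url).pathname.toLowerCase();
  const dot = pathname.lastIndexOf('.');
  if (dot === -1) return false;
  return staticFileExtensions.includes(pathname.slice(dot + 1));
}

isStaticFileUrl('https://example.com/openapi.yaml'); // true: skipped
isStaticFileUrl('https://example.com/docs/getting-started'); // false: crawled
```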
@@ -5,7 +5,7 @@ export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
@@ -50,7 +50,12 @@ export function createCrawlQueue() {
             // }
             try {
                 // get page content later
-                const result = yield getPageContent(job);
+                const result = yield getPageContent(Object.assign({ localStorage: {
+                        // for blocklet theme
+                        blocklet_theme_prefer: 'light',
+                        // for blocklet domain warning
+                        'domain-warning-skip': Date.now().toString(),
+                    } }, job));
                 if (!result || (!result.html && !result.screenshot)) {
                     logger.error(`failed to crawl ${job.url}, empty content`, job);
                     const snapshot = convertJobToSnapshot({
@@ -128,7 +133,7 @@ function saveSnapshotToLocal(_a) {
         };
     });
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
     const page = yield initPage();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -136,6 +141,16 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
+    if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
+        yield page.setCookie(...cookies);
+    }
+    if (localStorage) {
+        yield page.evaluateOnNewDocument((items) => {
+            Object.entries(items).forEach(([key, value]) => {
+                window.localStorage.setItem(key, value);
+            });
+        }, localStorage);
+    }
     let html = null;
     let screenshot = null;
     const meta = {};
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { Model, Sequelize } from '@sequelize/core';
 export interface JobState {
     id?: string;
@@ -12,6 +13,8 @@ export interface JobState {
     fullPage?: boolean;
     lastModified?: string;
     headers?: Record<string, string>;
+    cookies?: CookieParam[];
+    localStorage?: Record<string, string>;
 }
 export interface JobModel {
     id: string;
package/lib/esm/utils.js CHANGED
@@ -123,6 +123,11 @@ const staticFileExtensions = [
     'xls',
     'xml',
     'zip',
+    'ts',
+    'json',
+    'md',
+    'yml',
+    'yaml',
 ];
 export const sleep = (ms) => {
     return new Promise((resolve) => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.1.4",
+  "version": "1.1.5",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",
@@ -45,33 +45,32 @@
     ]
   },
   "dependencies": {
-    "@abtnode/cron": "^1.16.43",
-    "@abtnode/models": "^1.16.43",
-    "@abtnode/queue": "^1.16.43",
-    "@blocklet/logger": "^1.16.43",
+    "@abtnode/cron": "^1.16.44",
+    "@abtnode/models": "^1.16.44",
+    "@abtnode/queue": "^1.16.44",
+    "@blocklet/logger": "^1.16.44",
     "@blocklet/puppeteer": "^22.11.3",
-    "@blocklet/sdk": "^1.16.43",
+    "@blocklet/sdk": "^1.16.44",
     "@sequelize/core": "7.0.0-alpha.46",
     "@sequelize/sqlite3": "7.0.0-alpha.46",
     "axios": "^1.7.9",
     "fs-extra": "^11.2.0",
     "lodash": "^4.17.21",
     "lru-cache": "^10.4.3",
+    "p-map": "^7.0.3",
     "robots-parser": "^3.0.1",
     "sitemap": "^7.1.2",
     "sqlite3": "^5.1.7",
-    "ufo": "^1.5.4",
-    "p-map": "^7.0.3"
+    "ufo": "^1.5.4"
   },
   "devDependencies": {
-    "@blocklet/js-sdk": "^1.16.39",
     "@types/dotenv-flow": "^3.3.3",
     "@types/express": "^4.17.21",
     "@types/fs-extra": "^11.0.4",
     "@types/lodash": "^4.17.16",
     "@types/node": "^20.17.19",
-    "express": "^4.21.2",
     "bumpp": "^9.11.1",
+    "express": "^4.21.2",
     "nodemon": "^3.1.9",
     "npm-run-all": "^4.1.5",
     "puppeteer": "^24.8.2",