ns-rss-spider 0.0.29 → 0.0.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/parsers/36kr.js +37 -1
- package/dist/cjs/parsers/cnbeta.js +5 -1
- package/dist/cjs/parsers/ithome.js +5 -1
- package/dist/cjs/utils/browser.d.ts +29 -0
- package/dist/cjs/utils/browser.js +80 -0
- package/dist/cjs/utils/constants.d.ts +1 -0
- package/dist/cjs/utils/constants.js +29 -0
- package/package.json +2 -1
package/dist/cjs/parsers/36kr.js
CHANGED
|
@@ -22,8 +22,44 @@ __export(kr_exports, {
|
|
|
22
22
|
_36kr: () => _36kr
|
|
23
23
|
});
|
|
24
24
|
module.exports = __toCommonJS(kr_exports);
|
|
25
|
+
var import_zx = require("zx");
|
|
26
|
+
var import_constants = require("../utils/constants");
|
|
27
|
+
var import_browser = require("../utils/browser");
|
|
25
28
|
var _36kr = {
|
|
26
|
-
parse: true
|
|
29
|
+
parse: true,
|
|
30
|
+
getThumbs: async (items) => {
|
|
31
|
+
var _a, _b, _c;
|
|
32
|
+
const url = "https://m.36kr.com/";
|
|
33
|
+
const initialState = await (0, import_browser.runInPage)(url, async (page) => {
|
|
34
|
+
const data = await page.evaluate(async () => window.initialState);
|
|
35
|
+
return data;
|
|
36
|
+
}, {
|
|
37
|
+
userAgent: import_constants.iosUA
|
|
38
|
+
}).catch((e) => {
|
|
39
|
+
console.error(import_zx.chalk.red(`获取 ${url} 错误`), e);
|
|
40
|
+
});
|
|
41
|
+
console.log("initialState", initialState);
|
|
42
|
+
if ((_c = (_b = (_a = initialState == null ? void 0 : initialState.home) == null ? void 0 : _a.flow) == null ? void 0 : _b.itemList) == null ? void 0 : _c.length) {
|
|
43
|
+
const idThumbKv = initialState.home.flow.itemList.reduce((prev, item) => {
|
|
44
|
+
var _a2, _b2;
|
|
45
|
+
if ((item == null ? void 0 : item.itemId) && ((_a2 = item.templateMaterial) == null ? void 0 : _a2.widgetImage)) {
|
|
46
|
+
prev[item.itemId] = (_b2 = item.templateMaterial) == null ? void 0 : _b2.widgetImage;
|
|
47
|
+
}
|
|
48
|
+
return prev;
|
|
49
|
+
}, {});
|
|
50
|
+
const kv = {};
|
|
51
|
+
Object.keys(idThumbKv).forEach((articleId) => {
|
|
52
|
+
var _a2;
|
|
53
|
+
const thumb = idThumbKv[articleId];
|
|
54
|
+
const guid = (_a2 = items.find((a) => (a.guid || a.link || "").includes(`/p/${articleId}`))) == null ? void 0 : _a2.guid;
|
|
55
|
+
if (guid && thumb) {
|
|
56
|
+
kv[guid] = thumb;
|
|
57
|
+
}
|
|
58
|
+
});
|
|
59
|
+
return kv;
|
|
60
|
+
}
|
|
61
|
+
return void 0;
|
|
62
|
+
}
|
|
27
63
|
};
|
|
28
64
|
// Annotate the CommonJS export names for ESM import in node:
|
|
29
65
|
0 && (module.exports = {
|
|
@@ -35,6 +35,7 @@ module.exports = __toCommonJS(cnbeta_exports);
|
|
|
35
35
|
var import_axios = __toESM(require("axios"));
|
|
36
36
|
var import_zx = require("zx");
|
|
37
37
|
var import_cheerio = require("cheerio");
|
|
38
|
+
var import_constants = require("../utils/constants");
|
|
38
39
|
var cnbeta = {
|
|
39
40
|
parse: true,
|
|
40
41
|
fetcher: "http",
|
|
@@ -59,7 +60,10 @@ var cnbeta = {
|
|
|
59
60
|
ignoreProbeImage: true,
|
|
60
61
|
getThumbs: async (items) => {
|
|
61
62
|
const res = await import_axios.default.get("https://m.cnbeta.com.tw/", {
|
|
62
|
-
responseType: "text"
|
|
63
|
+
responseType: "text",
|
|
64
|
+
headers: {
|
|
65
|
+
"User-Agent": import_constants.iosUA
|
|
66
|
+
}
|
|
63
67
|
}).catch((e) => {
|
|
64
68
|
console.error(import_zx.chalk.red("获取 m.cnbeta.com.tw 错误"), e);
|
|
65
69
|
});
|
|
@@ -36,6 +36,7 @@ var import_utils = require("../parseContent/utils");
|
|
|
36
36
|
var import_axios = __toESM(require("axios"));
|
|
37
37
|
var import_zx = require("zx");
|
|
38
38
|
var import_cheerio = require("cheerio");
|
|
39
|
+
var import_constants = require("../utils/constants");
|
|
39
40
|
var ithome = {
|
|
40
41
|
parse: true,
|
|
41
42
|
fetcher: "http",
|
|
@@ -64,7 +65,10 @@ var ithome = {
|
|
|
64
65
|
},
|
|
65
66
|
getThumbs: async (articles) => {
|
|
66
67
|
const res = await import_axios.default.get("https://m.ithome.com/", {
|
|
67
|
-
responseType: "text"
|
|
68
|
+
responseType: "text",
|
|
69
|
+
headers: {
|
|
70
|
+
"User-Agent": import_constants.iosUA
|
|
71
|
+
}
|
|
68
72
|
}).catch((e) => {
|
|
69
73
|
console.error(import_zx.chalk.red("获取 https://m.ithome.com/ 错误"), e);
|
|
70
74
|
});
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { Page } from "playwright";
|
|
2
|
+
type RunOptions = {
|
|
3
|
+
userAgent?: string;
|
|
4
|
+
};
|
|
5
|
+
/**
|
|
6
|
+
* Run fn against the page loaded from url and return fn's result.
|
|
7
|
+
* @param url
|
|
8
|
+
* @param fn
|
|
9
|
+
* @param options
|
|
10
|
+
* @returns
|
|
11
|
+
*/
|
|
12
|
+
export declare function runInPage<T>(url: string, fn: (page: Page) => Promise<T>, options?: RunOptions): Promise<T>;
|
|
13
|
+
/**
|
|
14
|
+
* Fetch the HTML content of the page at url.
|
|
15
|
+
*
|
|
16
|
+
* 1. Supports scrolling the page before the content is read.
|
|
17
|
+
*
|
|
18
|
+
* @param url
|
|
19
|
+
* @param options
|
|
20
|
+
* @returns
|
|
21
|
+
*/
|
|
22
|
+
export declare function getPageHtml(url: string, options?: {
|
|
23
|
+
scroll?: {
|
|
24
|
+
offset?: number;
|
|
25
|
+
interval?: number;
|
|
26
|
+
};
|
|
27
|
+
} & RunOptions): Promise<string>;
|
|
28
|
+
export declare function scrollToBottom(page: Page, offset?: number, interval?: number): Promise<void>;
|
|
29
|
+
export {};
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
var __defProp = Object.defineProperty;
|
|
2
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
3
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
4
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
5
|
+
// Expose every key of `all` on `target` as an enumerable getter property.
var __export = (target, all) => {
  for (const exportName in all) {
    const getter = all[exportName];
    __defProp(target, exportName, { enumerable: true, get: getter });
  }
};
|
|
9
|
+
// Re-export the own properties of `from` on `to` as live getters. Keys that
// `to` already owns, and the key equal to `except`, are left alone. A copied
// property is enumerable unless the source descriptor says otherwise.
var __copyProps = (to, from, except, desc) => {
  const copyable = from && typeof from === "object" || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};
|
|
17
|
+
// Wrap `mod` in a fresh object flagged with `__esModule: true`.
var __toCommonJS = (mod) => {
  const cjsExports = __defProp({}, "__esModule", { value: true });
  return __copyProps(cjsExports, mod);
};
|
|
18
|
+
|
|
19
|
+
// src/utils/browser.ts
|
|
20
|
+
var browser_exports = {};
|
|
21
|
+
__export(browser_exports, {
|
|
22
|
+
getPageHtml: () => getPageHtml,
|
|
23
|
+
runInPage: () => runInPage,
|
|
24
|
+
scrollToBottom: () => scrollToBottom
|
|
25
|
+
});
|
|
26
|
+
module.exports = __toCommonJS(browser_exports);
|
|
27
|
+
var import_playwright = require("playwright");
|
|
28
|
+
async function runInPage(url, fn, options) {
|
|
29
|
+
const browser = await import_playwright.chromium.launch();
|
|
30
|
+
const page = await browser.newPage({
|
|
31
|
+
userAgent: options == null ? void 0 : options.userAgent
|
|
32
|
+
});
|
|
33
|
+
let resolve;
|
|
34
|
+
const waitForLoadPromise = new Promise((r) => {
|
|
35
|
+
resolve = r;
|
|
36
|
+
});
|
|
37
|
+
page.on("load", async (page2) => {
|
|
38
|
+
const r = await fn(page2);
|
|
39
|
+
resolve == null ? void 0 : resolve(r);
|
|
40
|
+
});
|
|
41
|
+
await page.goto(url);
|
|
42
|
+
const content = await waitForLoadPromise;
|
|
43
|
+
await browser.close();
|
|
44
|
+
return content;
|
|
45
|
+
}
|
|
46
|
+
/**
 * Load `url` through `runInPage` and return the inner HTML of its `<html>`
 * element. When `options.scroll` is set, the page is first scrolled with
 * `scrollToBottom` using the given `offset` / `interval`.
 *
 * @param url address of the page to read
 * @param options optional `scroll` settings plus the options accepted by `runInPage`
 * @returns inner HTML of the document's root element
 */
async function getPageHtml(url, options) {
  const readHtml = async (page) => {
    const scroll = options == null ? void 0 : options.scroll;
    if (scroll) {
      await scrollToBottom(page, scroll.offset, scroll.interval);
    }
    return page.locator("html").innerHTML();
  };
  return runInPage(url, readHtml, options);
}
|
|
64
|
+
async function scrollToBottom(page, offset = 300, interval = 10) {
|
|
65
|
+
const height = await page.evaluate(async () => {
|
|
66
|
+
const height2 = document.body.scrollHeight;
|
|
67
|
+
return height2;
|
|
68
|
+
});
|
|
69
|
+
console.log("height", height);
|
|
70
|
+
for (let i = 0; i < height; i += offset) {
|
|
71
|
+
page.mouse.wheel(0, i * offset);
|
|
72
|
+
await page.waitForTimeout(interval);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
76
|
+
0 && (module.exports = {
|
|
77
|
+
getPageHtml,
|
|
78
|
+
runInPage,
|
|
79
|
+
scrollToBottom
|
|
80
|
+
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/** User-Agent string identifying Mobile Safari on an iPhone running iOS 16.6. */
export declare const iosUA = "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1";
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
var __defProp = Object.defineProperty;
|
|
2
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
3
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
4
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
5
|
+
// Expose every key of `all` on `target` as an enumerable getter property.
var __export = (target, all) => {
  for (const exportName in all) {
    const getter = all[exportName];
    __defProp(target, exportName, { enumerable: true, get: getter });
  }
};
|
|
9
|
+
// Re-export the own properties of `from` on `to` as live getters. Keys that
// `to` already owns, and the key equal to `except`, are left alone. A copied
// property is enumerable unless the source descriptor says otherwise.
var __copyProps = (to, from, except, desc) => {
  const copyable = from && typeof from === "object" || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};
|
|
17
|
+
// Wrap `mod` in a fresh object flagged with `__esModule: true`.
var __toCommonJS = (mod) => {
  const cjsExports = __defProp({}, "__esModule", { value: true });
  return __copyProps(cjsExports, mod);
};
|
|
18
|
+
|
|
19
|
+
// src/utils/constants.ts
|
|
20
|
+
var constants_exports = {};
|
|
21
|
+
__export(constants_exports, {
|
|
22
|
+
iosUA: () => iosUA
|
|
23
|
+
});
|
|
24
|
+
module.exports = __toCommonJS(constants_exports);
|
|
25
|
+
// User-Agent string identifying Mobile Safari on an iPhone running iOS 16.6.
var iosUA = "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1";
|
|
26
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
27
|
+
0 && (module.exports = {
|
|
28
|
+
iosUA
|
|
29
|
+
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ns-rss-spider",
|
|
3
|
-
"version": "0.0.29",
|
|
3
|
+
"version": "0.0.31",
|
|
4
4
|
"description": "",
|
|
5
5
|
"main": "dist/cjs/index.js",
|
|
6
6
|
"types": "dist/cjs/index.d.ts",
|
|
@@ -39,6 +39,7 @@
|
|
|
39
39
|
"dayjs": "^1.11.10",
|
|
40
40
|
"html-entities": "^2.4.0",
|
|
41
41
|
"lodash": "^4.17.21",
|
|
42
|
+
"playwright": "^1.41.2",
|
|
42
43
|
"probe-image-size": "^7.2.3",
|
|
43
44
|
"rss-parser": "^3.13.0",
|
|
44
45
|
"yargs-parser": "^21.1.1",
|