ns-rss-spider 0.0.30 → 0.0.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,3 @@
1
1
  export { parseRss } from "./parse";
2
2
  export { strategies } from "./strategy";
3
3
  export { parseContent } from "./parseContent";
4
- export { getWebContent } from './utils/browser';
package/dist/cjs/index.js CHANGED
@@ -19,7 +19,6 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
19
19
  // src/index.ts
20
20
  var src_exports = {};
21
21
  __export(src_exports, {
22
- getWebContent: () => import_browser.getWebContent,
23
22
  parseContent: () => import_parseContent.parseContent,
24
23
  parseRss: () => import_parse.parseRss,
25
24
  strategies: () => import_strategy.strategies
@@ -28,10 +27,8 @@ module.exports = __toCommonJS(src_exports);
28
27
  var import_parse = require("./parse");
29
28
  var import_strategy = require("./strategy");
30
29
  var import_parseContent = require("./parseContent");
31
- var import_browser = require("./utils/browser");
32
30
  // Annotate the CommonJS export names for ESM import in node:
33
31
  0 && (module.exports = {
34
- getWebContent,
35
32
  parseContent,
36
33
  parseRss,
37
34
  strategies
@@ -23,34 +23,42 @@ __export(kr_exports, {
23
23
  });
24
24
  module.exports = __toCommonJS(kr_exports);
25
25
  var import_zx = require("zx");
26
- var import_cheerio = require("cheerio");
27
26
  var import_constants = require("../utils/constants");
28
27
  var import_browser = require("../utils/browser");
29
28
  var _36kr = {
30
29
  parse: true,
31
30
  getThumbs: async (items) => {
31
+ var _a, _b, _c;
32
32
  const url = "https://m.36kr.com/";
33
- const html = await (0, import_browser.getWebContent)(url, {
33
+ const initialState = await (0, import_browser.runInPage)(url, async (page) => {
34
+ const data = await page.evaluate(async () => window.initialState);
35
+ return data;
36
+ }, {
34
37
  userAgent: import_constants.iosUA
35
38
  }).catch((e) => {
36
39
  console.error(import_zx.chalk.red(`获取 ${url} 错误`), e);
37
40
  });
38
- if (!html)
39
- return void 0;
40
- const $ = (0, import_cheerio.load)(html, null, false);
41
- const kv = {};
42
- $(".home-flow > .flow-list .article-item a").each((_, $a) => {
43
- var _a, _b;
44
- const thumb = $($a).find("img").attr("src");
45
- const href = $($a).attr("href") || "";
46
- const reg = /\/p\/([\d]+)/;
47
- const articleId = (_a = reg.exec(href)) == null ? void 0 : _a[1];
48
- const guid = (_b = items.find((a) => (a.guid || a.link || "").includes(`/p/${articleId}`))) == null ? void 0 : _b.guid;
49
- if (guid && thumb) {
50
- kv[guid] = thumb;
51
- }
52
- });
53
- return kv;
41
+ console.log("initialState", initialState);
42
+ if ((_c = (_b = (_a = initialState == null ? void 0 : initialState.home) == null ? void 0 : _a.flow) == null ? void 0 : _b.itemList) == null ? void 0 : _c.length) {
43
+ const idThumbKv = initialState.home.flow.itemList.reduce((prev, item) => {
44
+ var _a2, _b2;
45
+ if ((item == null ? void 0 : item.itemId) && ((_a2 = item.templateMaterial) == null ? void 0 : _a2.widgetImage)) {
46
+ prev[item.itemId] = (_b2 = item.templateMaterial) == null ? void 0 : _b2.widgetImage;
47
+ }
48
+ return prev;
49
+ }, {});
50
+ const kv = {};
51
+ Object.keys(idThumbKv).forEach((articleId) => {
52
+ var _a2;
53
+ const thumb = idThumbKv[articleId];
54
+ const guid = (_a2 = items.find((a) => (a.guid || a.link || "").includes(`/p/${articleId}`))) == null ? void 0 : _a2.guid;
55
+ if (guid && thumb) {
56
+ kv[guid] = thumb;
57
+ }
58
+ });
59
+ return kv;
60
+ }
61
+ return void 0;
54
62
  }
55
63
  };
56
64
  // Annotate the CommonJS export names for ESM import in node:
@@ -1,6 +1,8 @@
1
+ var __create = Object.create;
1
2
  var __defProp = Object.defineProperty;
2
3
  var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
4
  var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __getProtoOf = Object.getPrototypeOf;
4
6
  var __hasOwnProp = Object.prototype.hasOwnProperty;
5
7
  var __export = (target, all) => {
6
8
  for (var name in all)
@@ -14,6 +16,14 @@ var __copyProps = (to, from, except, desc) => {
14
16
  }
15
17
  return to;
16
18
  };
19
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
20
+ // If the importer is in node compatibility mode or this is not an ESM
21
+ // file that has been converted to a CommonJS file using a Babel-
22
+ // compatible transform (i.e. "__esModule" has not been set), then set
23
+ // "default" to the CommonJS "module.exports" for node compatibility.
24
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
25
+ mod
26
+ ));
17
27
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
28
 
19
29
  // src/parsers/ifanr.ts
@@ -22,8 +32,41 @@ __export(ifanr_exports, {
22
32
  ifanr: () => ifanr
23
33
  });
24
34
  module.exports = __toCommonJS(ifanr_exports);
35
+ var import_axios = __toESM(require("axios"));
36
+ var import_zx = require("zx");
37
+ var import_cheerio = require("cheerio");
38
+ var import_constants = require("../utils/constants");
25
39
  var ifanr = {
26
- parse: true
40
+ parse: false,
41
+ getThumbs: async (items) => {
42
+ const url = "https://www.ifanr.com/";
43
+ const res = await import_axios.default.get(url, {
44
+ responseType: "text",
45
+ headers: {
46
+ "User-Agent": import_constants.iosUA
47
+ }
48
+ }).catch((e) => {
49
+ console.error(import_zx.chalk.red(`获取 ${url} 错误`), e);
50
+ });
51
+ if (!res)
52
+ return void 0;
53
+ const html = res.data;
54
+ const $ = (0, import_cheerio.load)(html, null, false);
55
+ const kv = {};
56
+ $("#articles-list .article-item > a").each((_, $a) => {
57
+ var _a, _b, _c;
58
+ const bg = $($a).find(".left-box").css("background-image") || "";
59
+ const thumb = (_a = /url\(([\s\S]+)\)/.exec(bg)) == null ? void 0 : _a[1];
60
+ const href = $($a).attr("href") || "";
61
+ const reg = /ifanr.com\/([\d]+)/;
62
+ const articleId = (_b = reg.exec(href)) == null ? void 0 : _b[1];
63
+ const guid = (_c = items.find((a) => (a.guid || "").includes(`p=${articleId}`))) == null ? void 0 : _c.guid;
64
+ if (guid && thumb) {
65
+ kv[guid] = thumb;
66
+ }
67
+ });
68
+ return kv;
69
+ }
27
70
  };
28
71
  // Annotate the CommonJS export names for ESM import in node:
29
72
  0 && (module.exports = {
@@ -1,3 +1,29 @@
1
- export declare function getWebContent(url: string, options?: {
1
+ import { Page } from "playwright";
2
+ type RunOptions = {
2
3
  userAgent?: string;
3
- }): Promise<string>;
4
+ };
5
+ /**
6
+ * 在 url 中执行 fn,并获取 fn 的结果
7
+ * @param url
8
+ * @param fn
9
+ * @param options
10
+ * @returns
11
+ */
12
+ export declare function runInPage<T>(url: string, fn: (page: Page) => Promise<T>, options?: RunOptions): Promise<T>;
13
+ /**
14
+ * 根据 url 获取 content.
15
+ *
16
+ * 1. 支持获取前滚动页面
17
+ *
18
+ * @param url
19
+ * @param options
20
+ * @returns
21
+ */
22
+ export declare function getPageHtml(url: string, options?: {
23
+ scroll?: {
24
+ offset?: number;
25
+ interval?: number;
26
+ };
27
+ } & RunOptions): Promise<string>;
28
+ export declare function scrollToBottom(page: Page, offset?: number, interval?: number): Promise<void>;
29
+ export {};
@@ -19,11 +19,13 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
19
19
  // src/utils/browser.ts
20
20
  var browser_exports = {};
21
21
  __export(browser_exports, {
22
- getWebContent: () => getWebContent
22
+ getPageHtml: () => getPageHtml,
23
+ runInPage: () => runInPage,
24
+ scrollToBottom: () => scrollToBottom
23
25
  });
24
26
  module.exports = __toCommonJS(browser_exports);
25
27
  var import_playwright = require("playwright");
26
- async function getWebContent(url, options) {
28
+ async function runInPage(url, fn, options) {
27
29
  const browser = await import_playwright.chromium.launch();
28
30
  const page = await browser.newPage({
29
31
  userAgent: options == null ? void 0 : options.userAgent
@@ -32,15 +34,47 @@ async function getWebContent(url, options) {
32
34
  const waitForLoadPromise = new Promise((r) => {
33
35
  resolve = r;
34
36
  });
35
- page.on("load", (page2) => {
36
- page2.locator("html").innerHTML().then((h) => resolve == null ? void 0 : resolve(h));
37
+ page.on("load", async (page2) => {
38
+ const r = await fn(page2);
39
+ resolve == null ? void 0 : resolve(r);
37
40
  });
38
41
  await page.goto(url);
39
42
  const content = await waitForLoadPromise;
40
43
  await browser.close();
41
44
  return content;
42
45
  }
46
/**
 * Load `url` in a browser page and return the inner HTML of its <html> element.
 *
 * When `options.scroll` is set, the page is first scrolled towards the bottom
 * (via scrollToBottom) before the markup is read.
 *
 * @param {string} url - Address of the page to open.
 * @param {{ userAgent?: string, scroll?: { offset?: number, interval?: number } }} [options]
 *   Passed through unchanged to runInPage; `scroll` additionally controls the
 *   pre-read scrolling step.
 * @returns {Promise<string>} Inner HTML of the page's <html> element.
 */
async function getPageHtml(url, options) {
  // Runs once the page is handed over by runInPage.
  const readMarkup = async (page) => {
    // Read the scroll settings lazily, at the moment the page is ready.
    const scrollSettings = options ? options.scroll : undefined;
    if (scrollSettings) {
      await scrollToBottom(page, scrollSettings.offset, scrollSettings.interval);
    }
    const markup = await page.locator("html").innerHTML();
    return markup;
  };

  const result = await runInPage(url, readMarkup, options);
  return result;
}
64
/**
 * Scroll a page towards its bottom in fixed-size steps using mouse-wheel events.
 *
 * The page height (document.body.scrollHeight) is read once before scrolling
 * starts, so content that grows while scrolling is not chased past that
 * initial height.
 *
 * @param {import("playwright").Page} page - Page to scroll.
 * @param {number} [offset=300] - Pixels scrolled per wheel step; must be a
 *   positive finite number.
 * @param {number} [interval=10] - Milliseconds to wait after each step.
 * @returns {Promise<void>} Resolves after the last step and its wait.
 * @throws {RangeError} If `offset` is not a positive finite number (a step of
 *   zero or less could never reach the bottom).
 */
async function scrollToBottom(page, offset = 300, interval = 10) {
  if (!Number.isFinite(offset) || offset <= 0) {
    throw new RangeError(`offset must be a positive finite number, got ${offset}`);
  }

  const height = await page.evaluate(() => document.body.scrollHeight);
  console.log("height", height);

  for (let scrolled = 0; scrolled < height; scrolled += offset) {
    // The wheel delta is relative to the current position, so each step moves
    // by `offset`; awaiting keeps the steps ordered and surfaces failures.
    await page.mouse.wheel(0, offset);
    await page.waitForTimeout(interval);
  }
}
43
75
  // Annotate the CommonJS export names for ESM import in node:
44
76
  0 && (module.exports = {
45
- getWebContent
77
+ getPageHtml,
78
+ runInPage,
79
+ scrollToBottom
46
80
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ns-rss-spider",
3
- "version": "0.0.30",
3
+ "version": "0.0.32",
4
4
  "description": "",
5
5
  "main": "dist/cjs/index.js",
6
6
  "types": "dist/cjs/index.d.ts",