ns-rss-spider 0.0.32 → 0.0.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cjs/cli.js CHANGED
@@ -40,7 +40,9 @@ async function main() {
40
40
  let rst;
41
41
  switch (cmd) {
42
42
  case "parse":
43
- rst = await (0, import_index.parseRss)(rest.name, rest.feed, rest.server);
43
+ rst = await (0, import_index.parseRss)(rest.name, rest.feed, {
44
+ server: rest.server
45
+ });
44
46
  break;
45
47
  default:
46
48
  console.warn("未知命令", cmd);
@@ -1,5 +1,8 @@
1
1
  import { ServerInfo } from "./upload";
2
- export declare function parseRss(name: string, feed: string, server?: ServerInfo): Promise<{
2
+ export declare function parseRss(name: string, feed: string, options?: {
3
+ server?: ServerInfo;
4
+ take?: number;
5
+ }): Promise<{
3
6
  contents: any[];
4
7
  thumbs?: Record<string, string>;
5
8
  }>;
package/dist/cjs/parse.js CHANGED
@@ -32,20 +32,17 @@ __export(parse_exports, {
32
32
  parseRss: () => parseRss
33
33
  });
34
34
  module.exports = __toCommonJS(parse_exports);
35
- var import_rss_parser = __toESM(require("rss-parser"));
36
35
  var import_zx = require("zx");
37
36
  var import_strategy = require("./strategy");
38
37
  var import_parseContent = require("./parseContent");
39
38
  var import_upload = require("./upload");
40
39
  var import_axios = __toESM(require("axios"));
41
- async function parseRss(name, feed, server) {
42
- const parser = new import_rss_parser.default({
43
- customFields: {
44
- item: ["source"]
45
- }
46
- });
40
+ var import_parseRssList = require("./parseContent/parseRssList");
41
+ async function parseRss(name, feed, options) {
42
+ const { server } = options || {};
43
+ console.log(import_zx.chalk.green("start", name, feed));
47
44
  console.log(import_zx.chalk.green("正在拉取 rss 列表"));
48
- const result = await parser.parseURL(feed);
45
+ const result = await (0, import_parseRssList.parseRssList)(feed);
49
46
  if (!result.items.length) {
50
47
  throw Error("rss no conent");
51
48
  }
@@ -66,7 +63,8 @@ async function parseRss(name, feed, server) {
66
63
  item.guid = item.link;
67
64
  }
68
65
  });
69
- for (let item of result.items) {
66
+ const items = (options == null ? void 0 : options.take) ? result.items.slice(0, options.take) : result.items;
67
+ for (let item of items) {
70
68
  console.log(import_zx.chalk.green(`正在解析文章 【${item.title}】`));
71
69
  if (!item.guid) {
72
70
  console.log(JSON.stringify(item));
@@ -0,0 +1,6 @@
1
+ import Parser from "rss-parser";
2
+ export declare function parseRssList(feed: string): Promise<{
3
+ [key: string]: any;
4
+ } & Parser.Output<{
5
+ source: any;
6
+ }>>;
@@ -0,0 +1,50 @@
1
+ var __create = Object.create;
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __getProtoOf = Object.getPrototypeOf;
6
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
7
+ var __export = (target, all) => {
8
+ for (var name in all)
9
+ __defProp(target, name, { get: all[name], enumerable: true });
10
+ };
11
+ var __copyProps = (to, from, except, desc) => {
12
+ if (from && typeof from === "object" || typeof from === "function") {
13
+ for (let key of __getOwnPropNames(from))
14
+ if (!__hasOwnProp.call(to, key) && key !== except)
15
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
16
+ }
17
+ return to;
18
+ };
19
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
20
+ // If the importer is in node compatibility mode or this is not an ESM
21
+ // file that has been converted to a CommonJS file using a Babel-
22
+ // compatible transform (i.e. "__esModule" has not been set), then set
23
+ // "default" to the CommonJS "module.exports" for node compatibility.
24
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
25
+ mod
26
+ ));
27
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
28
+
29
+ // src/parseContent/parseRssList.ts
30
+ var parseRssList_exports = {};
31
+ __export(parseRssList_exports, {
32
+ parseRssList: () => parseRssList
33
+ });
34
+ module.exports = __toCommonJS(parseRssList_exports);
35
+ var import_rss_parser = __toESM(require("rss-parser"));
36
+ var import_request = require("../utils/request");
37
+ async function parseRssList(feed) {
38
+ const xml = await (0, import_request.getPageHtmlByHttp)(feed);
39
+ const parser = new import_rss_parser.default({
40
+ customFields: {
41
+ item: ["source"]
42
+ }
43
+ });
44
+ const result = await parser.parseString(xml);
45
+ return result;
46
+ }
47
+ // Annotate the CommonJS export names for ESM import in node:
48
+ 0 && (module.exports = {
49
+ parseRssList
50
+ });
@@ -38,7 +38,6 @@ var _36kr = {
38
38
  }).catch((e) => {
39
39
  console.error(import_zx.chalk.red(`获取 ${url} 错误`), e);
40
40
  });
41
- console.log("initialState", initialState);
42
41
  if ((_c = (_b = (_a = initialState == null ? void 0 : initialState.home) == null ? void 0 : _a.flow) == null ? void 0 : _b.itemList) == null ? void 0 : _c.length) {
43
42
  const idThumbKv = initialState.home.flow.itemList.reduce((prev, item) => {
44
43
  var _a2, _b2;
@@ -74,7 +74,7 @@ var cnbeta = {
74
74
  const kv = {};
75
75
  $("#main > section li .txt_thumb > a").each((_, $a) => {
76
76
  var _a, _b;
77
- const thumb = $($a).find("img").attr("src");
77
+ const thumb = $($a).find("img").attr("data-cfsrc");
78
78
  const href = $($a).attr("href") || "";
79
79
  const reg = /([\d]+).htm/;
80
80
  const articleId = (_a = reg.exec(href)) == null ? void 0 : _a[1];
@@ -0,0 +1,2 @@
1
+ import { ParseStrategy } from "../types";
2
+ export declare const techrunch: ParseStrategy;
@@ -0,0 +1,63 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parsers/techrunch.ts
20
+ var techrunch_exports = {};
21
+ __export(techrunch_exports, {
22
+ techrunch: () => techrunch
23
+ });
24
+ module.exports = __toCommonJS(techrunch_exports);
25
+ var import_zx = require("zx");
26
+ var import_constants = require("../utils/constants");
27
+ var import_browser = require("../utils/browser");
28
+ var techrunch = {
29
+ parse: true,
30
+ fetcher: "http",
31
+ getContentElementFromArticle: ($) => {
32
+ const el = $(".article-content");
33
+ return el;
34
+ },
35
+ ignoreProbeImage: true,
36
+ getThumbs: async (items) => {
37
+ var _a, _b;
38
+ const url = "https://techcrunch.com/";
39
+ const initialState = await (0, import_browser.runInPage)(url, async (page) => {
40
+ const data = await page.evaluate(async () => window.tc_app_data);
41
+ return data;
42
+ }, {
43
+ userAgent: import_constants.iosUA
44
+ }).catch((e) => {
45
+ console.error(import_zx.chalk.red(`获取 ${url} 错误`), e);
46
+ });
47
+ if ((_b = (_a = initialState == null ? void 0 : initialState.entities) == null ? void 0 : _a.posts) == null ? void 0 : _b.length) {
48
+ const idThumbKv = initialState.entities.posts.reduce((prev, item) => {
49
+ var _a2;
50
+ if (((_a2 = item == null ? void 0 : item.guid) == null ? void 0 : _a2.rendered) && item.jetpack_featured_media_url) {
51
+ prev[item.guid.rendered] = item.jetpack_featured_media_url;
52
+ }
53
+ return prev;
54
+ }, {});
55
+ return idThumbKv;
56
+ }
57
+ return void 0;
58
+ }
59
+ };
60
+ // Annotate the CommonJS export names for ESM import in node:
61
+ 0 && (module.exports = {
62
+ techrunch
63
+ });
@@ -27,12 +27,14 @@ var import_ifanr = require("./parsers/ifanr");
27
27
  var import_theverge = require("./parsers/theverge");
28
28
  var import_ithome = require("./parsers/ithome");
29
29
  var import_kr = require("./parsers/36kr");
30
+ var import_techrunch = require("./parsers/techrunch");
30
31
  var strategies = {
31
32
  cnbeta: import_cnbeta.cnbeta,
32
33
  ifanr: import_ifanr.ifanr,
33
34
  theverge: import_theverge.theverge,
34
35
  ithome: import_ithome.ithome,
35
- "36kr": import_kr._36kr
36
+ "36kr": import_kr._36kr,
37
+ techrunch: import_techrunch.techrunch
36
38
  };
37
39
  // Annotate the CommonJS export names for ESM import in node:
38
40
  0 && (module.exports = {
@@ -19,11 +19,15 @@ export declare function runInPage<T>(url: string, fn: (page: Page) => Promise<T>
19
19
  * @param options
20
20
  * @returns
21
21
  */
22
- export declare function getPageHtml(url: string, options?: {
22
+ export declare function getPageHtmlByBrowser(url: string, options?: {
23
23
  scroll?: {
24
24
  offset?: number;
25
25
  interval?: number;
26
26
  };
27
+ waitForElement?: {
28
+ selector: string;
29
+ timeout?: number;
30
+ };
27
31
  } & RunOptions): Promise<string>;
28
32
  export declare function scrollToBottom(page: Page, offset?: number, interval?: number): Promise<void>;
29
33
  export {};
@@ -19,7 +19,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
19
19
  // src/utils/browser.ts
20
20
  var browser_exports = {};
21
21
  __export(browser_exports, {
22
- getPageHtml: () => getPageHtml,
22
+ getPageHtmlByBrowser: () => getPageHtmlByBrowser,
23
23
  runInPage: () => runInPage,
24
24
  scrollToBottom: () => scrollToBottom
25
25
  });
@@ -43,11 +43,16 @@ async function runInPage(url, fn, options) {
43
43
  await browser.close();
44
44
  return content;
45
45
  }
46
- async function getPageHtml(url, options) {
46
+ async function getPageHtmlByBrowser(url, options) {
47
47
  return await runInPage(
48
48
  url,
49
49
  async (page) => {
50
50
  var _a, _b;
51
+ if (options == null ? void 0 : options.waitForElement) {
52
+ await page.waitForSelector(options.waitForElement.selector, {
53
+ timeout: options.waitForElement.timeout || 1e3 * 3
54
+ });
55
+ }
51
56
  if (options == null ? void 0 : options.scroll) {
52
57
  await scrollToBottom(
53
58
  page,
@@ -74,7 +79,7 @@ async function scrollToBottom(page, offset = 300, interval = 10) {
74
79
  }
75
80
  // Annotate the CommonJS export names for ESM import in node:
76
81
  0 && (module.exports = {
77
- getPageHtml,
82
+ getPageHtmlByBrowser,
78
83
  runInPage,
79
84
  scrollToBottom
80
85
  });
@@ -0,0 +1 @@
1
+ export declare function getPageHtmlByHttp(feed: string): Promise<string>;
@@ -0,0 +1,49 @@
1
+ var __create = Object.create;
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __getProtoOf = Object.getPrototypeOf;
6
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
7
+ var __export = (target, all) => {
8
+ for (var name in all)
9
+ __defProp(target, name, { get: all[name], enumerable: true });
10
+ };
11
+ var __copyProps = (to, from, except, desc) => {
12
+ if (from && typeof from === "object" || typeof from === "function") {
13
+ for (let key of __getOwnPropNames(from))
14
+ if (!__hasOwnProp.call(to, key) && key !== except)
15
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
16
+ }
17
+ return to;
18
+ };
19
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
20
+ // If the importer is in node compatibility mode or this is not an ESM
21
+ // file that has been converted to a CommonJS file using a Babel-
22
+ // compatible transform (i.e. "__esModule" has not been set), then set
23
+ // "default" to the CommonJS "module.exports" for node compatibility.
24
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
25
+ mod
26
+ ));
27
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
28
+
29
+ // src/utils/request.ts
30
+ var request_exports = {};
31
+ __export(request_exports, {
32
+ getPageHtmlByHttp: () => getPageHtmlByHttp
33
+ });
34
+ module.exports = __toCommonJS(request_exports);
35
+ var import_axios = __toESM(require("axios"));
36
+ var import_constants = require("./constants");
37
+ async function getPageHtmlByHttp(feed) {
38
+ const res = await import_axios.default.get(feed, {
39
+ responseType: "text",
40
+ headers: {
41
+ "User-Agent": import_constants.iosUA
42
+ }
43
+ });
44
+ return res.data;
45
+ }
46
+ // Annotate the CommonJS export names for ESM import in node:
47
+ 0 && (module.exports = {
48
+ getPageHtmlByHttp
49
+ });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ns-rss-spider",
3
- "version": "0.0.32",
3
+ "version": "0.0.35",
4
4
  "description": "",
5
5
  "main": "dist/cjs/index.js",
6
6
  "types": "dist/cjs/index.d.ts",