ns-rss-spider 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1,10 @@
1
- export declare function parseRss(name: string, feed: string): Promise<(import("./types").SimpleContent | import("./types").RichContent)[]>;
1
+ import { ServerInfo } from "./upload";
2
+ export declare function parseRss(name: string, feed: string, server?: ServerInfo): Promise<import("./types").Article[] | {
3
+ storePath: any;
4
+ type: "simple" | "rich";
5
+ guid: string;
6
+ title: string;
7
+ link: string;
8
+ description: string;
9
+ pubDate: Date;
10
+ }[]>;
package/dist/cjs/parse.js CHANGED
@@ -36,22 +36,35 @@ var import_rss_parser = __toESM(require("rss-parser"));
36
36
  var import_zx = require("zx");
37
37
  var import_strategy = require("./strategy");
38
38
  var import_parseContent = require("./parseContent");
39
- async function parseRss(name, feed) {
39
+ var import_upload = require("./upload");
40
+ async function parseRss(name, feed, server) {
40
41
  const parser = new import_rss_parser.default();
41
- console.log(import_zx.chalk.blue("正在拉取 rss 列表"));
42
+ console.log(import_zx.chalk.green("正在拉取 rss 列表"));
42
43
  const result = await parser.parseURL(feed);
43
44
  if (!result.items.length) {
44
45
  throw Error("rss no conent");
45
46
  }
46
47
  const contents = [];
47
48
  for (let item of result.items) {
48
- console.log(import_zx.chalk.blue(`正在解析文章 【${item.title}】`));
49
+ console.log(import_zx.chalk.green(`正在解析文章 【${item.title}】`));
50
+ if (!item.guid) {
51
+ console.log(JSON.stringify(item));
52
+ throw Error(`item has no guid`);
53
+ }
49
54
  const content = await (0, import_parseContent.parseContent)(item, import_strategy.strategies[name]).catch((e) => console.error(import_zx.chalk.red("文章解析失败"), e.message));
50
55
  if (content) {
51
56
  contents.push(content);
52
57
  }
53
58
  }
54
- return contents;
59
+ if (server) {
60
+ return await (0, import_upload.uploadContent)({
61
+ app: name,
62
+ items: contents,
63
+ ...server
64
+ });
65
+ } else {
66
+ return contents;
67
+ }
55
68
  }
56
69
  // Annotate the CommonJS export names for ESM import in node:
57
70
  0 && (module.exports = {
@@ -1,5 +1,5 @@
1
1
  import { Item as RssItem } from "rss-parser";
2
- import { Content, ParseStrategy } from "../types";
2
+ import { Article, ParseStrategy } from "../types";
3
3
  /**
4
4
  * 解析文章内容
5
5
  * 1. 获取 html 片段。(rss、http、playwright)
@@ -13,4 +13,4 @@ import { Content, ParseStrategy } from "../types";
13
13
  * @param strategy
14
14
  * @returns
15
15
  */
16
- export declare function parseContent(item: RssItem, strategy?: ParseStrategy): Promise<Content>;
16
+ export declare function parseContent(item: RssItem, strategy?: ParseStrategy): Promise<Article>;
@@ -45,26 +45,28 @@ async function parseContent(item, strategy) {
45
45
  $ = cheerio.load(item.content || "", {}, false);
46
46
  element = $.root();
47
47
  } else {
48
- console.log(import_zx.chalk.blue("正在拉取文章内容"), item.link);
48
+ console.log(import_zx.chalk.green("正在拉取文章内容"), item.link);
49
49
  const article = await (0, import_getArticleHtml.getArticleHtml)(item.link, strategy.fetcher);
50
50
  $ = cheerio.load(article);
51
51
  element = ((_a = strategy.getContentHtmlFromArticle) == null ? void 0 : _a.call(strategy, $)) || $.root();
52
52
  }
53
- console.log(import_zx.chalk.blue("正在预处理 html"));
53
+ console.log(import_zx.chalk.green("正在预处理 html"));
54
54
  (0, import_stripeHtml.stripeHtml)($, element);
55
55
  if (!(strategy == null ? void 0 : strategy.parse)) {
56
56
  return {
57
57
  type: "simple",
58
+ guid: item.guid,
58
59
  title: item.title,
60
+ link: item.link,
59
61
  description: ((_b = element.html()) == null ? void 0 : _b.trim()) || "",
60
62
  pubDate: new Date(item.pubDate)
61
63
  };
62
64
  }
63
- console.log(import_zx.chalk.blue("正在解析文章内容"));
65
+ console.log(import_zx.chalk.green("正在解析文章内容"));
64
66
  const rich = await (0, import_parseContent.parseContent)($, element, item, strategy);
65
67
  let extra = {};
66
68
  if (strategy.getExtraItems) {
67
- console.log(import_zx.chalk.blue("正在执行 getExtraItems"));
69
+ console.log(import_zx.chalk.green("正在执行 getExtraItems"));
68
70
  extra = await strategy.getExtraItems($, rich, item).catch((e) => {
69
71
  console.error(import_zx.chalk.red("getExtraItems 识别"), e);
70
72
  }) || {};
@@ -1,4 +1,4 @@
1
1
  import { Item as RssItem } from "rss-parser";
2
- import type { ParseStrategy, RichContent } from "../types";
2
+ import type { ParseStrategy, RichArticle } from "../types";
3
3
  import { AnyNode, Cheerio, CheerioAPI } from "cheerio";
4
- export declare function parseContent($: CheerioAPI, $element: Cheerio<AnyNode>, item: RssItem, strategy?: ParseStrategy): Promise<RichContent>;
4
+ export declare function parseContent($: CheerioAPI, $element: Cheerio<AnyNode>, item: RssItem, strategy?: ParseStrategy): Promise<RichArticle>;
@@ -53,7 +53,7 @@ async function parseContent($, $element, item, strategy) {
53
53
  }
54
54
  });
55
55
  for (let item2 of srcs) {
56
- console.log(import_zx.chalk.blue("正在解析图片", item2.src));
56
+ console.log(import_zx.chalk.green("正在解析图片", item2.src));
57
57
  const result = await (0, import_probe_image_size.default)(item2.src, {
58
58
  rejectUnauthorized: false,
59
59
  open_timeout: 1e3 * 5
@@ -95,7 +95,9 @@ async function parseContent($, $element, item, strategy) {
95
95
  });
96
96
  return {
97
97
  type: "rich",
98
+ guid: item.guid,
98
99
  title: item.title,
100
+ link: item.link,
99
101
  description: ((_a = item.contentSnippet) == null ? void 0 : _a.trim()) || "",
100
102
  content: (0, import_html_entities.decode)($element.html().trim()),
101
103
  pubDate: new Date(item.pubDate),
@@ -4,15 +4,17 @@ export interface ParseStrategy {
4
4
  parse: boolean;
5
5
  fetcher?: 'http' | 'playwright';
6
6
  getContentHtmlFromArticle?: (aritcle: CheerioAPI) => Cheerio<AnyNode>;
7
- getExtraItems?: (html: CheerioAPI, current: RichContent, item: Item) => Promise<Record<string, any>>;
7
+ getExtraItems?: (html: CheerioAPI, current: RichArticle, item: Item) => Promise<Record<string, any>>;
8
8
  }
9
- export interface SimpleContent {
9
+ export interface SimpleArticle {
10
10
  type: 'simple';
11
+ guid: string;
11
12
  title: string;
13
+ link: string;
12
14
  description: string;
13
15
  pubDate: Date;
14
16
  }
15
- export interface RichContentImage {
17
+ export interface RichArticleImage {
16
18
  url: string;
17
19
  type?: string;
18
20
  width?: number;
@@ -22,9 +24,9 @@ export interface RichContentImage {
22
24
  heightUnit?: string;
23
25
  title?: string;
24
26
  }
25
- export interface RichContent extends Omit<SimpleContent, 'type'> {
27
+ export interface RichArticle extends Omit<SimpleArticle, 'type'> {
26
28
  type: 'rich';
27
29
  content: string;
28
- images: RichContentImage[];
30
+ images: RichArticleImage[];
29
31
  }
30
- export type Content = SimpleContent | RichContent;
32
+ export type Article = SimpleArticle | RichArticle;
@@ -0,0 +1,20 @@
1
+ import { Article } from "../types";
2
+ export interface ServerInfo {
3
+ getUploadApi?: string;
4
+ latest?: {
5
+ guid: string;
6
+ pubDate: Date;
7
+ };
8
+ }
9
+ export declare function uploadContent({ app, getUploadApi, latest, items, }: {
10
+ app: string;
11
+ items: Article[];
12
+ } & ServerInfo): Promise<Article[] | {
13
+ storePath: any;
14
+ type: "simple" | "rich";
15
+ guid: string;
16
+ title: string;
17
+ link: string;
18
+ description: string;
19
+ pubDate: Date;
20
+ }[]>;
@@ -0,0 +1,87 @@
1
+ var __create = Object.create;
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __getProtoOf = Object.getPrototypeOf;
6
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
7
+ var __export = (target, all) => {
8
+ for (var name in all)
9
+ __defProp(target, name, { get: all[name], enumerable: true });
10
+ };
11
+ var __copyProps = (to, from, except, desc) => {
12
+ if (from && typeof from === "object" || typeof from === "function") {
13
+ for (let key of __getOwnPropNames(from))
14
+ if (!__hasOwnProp.call(to, key) && key !== except)
15
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
16
+ }
17
+ return to;
18
+ };
19
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
20
+ // If the importer is in node compatibility mode or this is not an ESM
21
+ // file that has been converted to a CommonJS file using a Babel-
22
+ // compatible transform (i.e. "__esModule" has not been set), then set
23
+ // "default" to the CommonJS "module.exports" for node compatibility.
24
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
25
+ mod
26
+ ));
27
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
28
+
29
+ // src/upload/index.ts
30
+ var upload_exports = {};
31
+ __export(upload_exports, {
32
+ uploadContent: () => uploadContent
33
+ });
34
+ module.exports = __toCommonJS(upload_exports);
35
+ var import_lodash = require("lodash");
36
+ var import_axios = __toESM(require("axios"));
37
+ var import_dayjs = __toESM(require("dayjs"));
38
+ var import_zx = require("zx");
39
+ async function uploadContent({
40
+ app,
41
+ getUploadApi,
42
+ latest,
43
+ items
44
+ }) {
45
+ var _a, _b;
46
+ const filterItems = latest ? items.filter((item) => {
47
+ return item.pubDate >= latest.pubDate && item.guid !== latest.guid;
48
+ }) : items;
49
+ if (!getUploadApi) {
50
+ return filterItems;
51
+ }
52
+ let result = [];
53
+ for (let item of filterItems) {
54
+ console.log(import_zx.chalk.green("正在获取上传地址", item.title));
55
+ const uploadRst = await import_axios.default.post(getUploadApi, {
56
+ app,
57
+ name: (0, import_dayjs.default)(item.pubDate).format("MM-DDTHH-mm-ss") + Math.random().toString(32).slice(2)
58
+ }, {
59
+ responseType: "json"
60
+ }).catch((e) => {
61
+ console.error(import_zx.chalk.red("get upload url error"), e);
62
+ });
63
+ if (!((_b = (_a = uploadRst == null ? void 0 : uploadRst.data) == null ? void 0 : _a.data) == null ? void 0 : _b.url)) {
64
+ console.error(import_zx.chalk.red(`upload error ${JSON.stringify(uploadRst)}`));
65
+ continue;
66
+ }
67
+ const url = uploadRst.data.data.url;
68
+ const storePath = uploadRst.data.data.storePath;
69
+ console.log(import_zx.chalk.green("正在上传", item.title, url));
70
+ const res = await import_axios.default.put(url, item).catch((e) => {
71
+ console.error(import_zx.chalk.red("upload oss error"), e);
72
+ });
73
+ if (!res || res.status !== 200) {
74
+ console.error(import_zx.chalk.red("upload status not 200"), res);
75
+ continue;
76
+ }
77
+ result.push({
78
+ ...(0, import_lodash.omit)(item, "content", "images"),
79
+ storePath
80
+ });
81
+ }
82
+ return result;
83
+ }
84
+ // Annotate the CommonJS export names for ESM import in node:
85
+ 0 && (module.exports = {
86
+ uploadContent
87
+ });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ns-rss-spider",
3
- "version": "0.0.2",
3
+ "version": "0.0.3",
4
4
  "description": "",
5
5
  "main": "dist/cjs/index.js",
6
6
  "types": "dist/cjs/index.d.ts",
@@ -11,7 +11,7 @@
11
11
  "build:deps": "father prebundle",
12
12
  "check": "father doctor",
13
13
  "prepublishOnly": "father doctor && npm run build",
14
- "pub": "npm run check && npm version patch && npm run build && npm publish --registry=https://registry.npmjs.org && git push origin master --tags && npm run sync"
14
+ "pub": "npm run check && npm version patch && npm run build && npm publish --registry=https://registry.npmjs.org && git push origin master --tags"
15
15
  },
16
16
  "bin": {
17
17
  "ns-rss-spider": "./dist/cjs/cli.js"
@@ -27,6 +27,7 @@
27
27
  "access": "public"
28
28
  },
29
29
  "devDependencies": {
30
+ "@types/lodash": "^4.14.202",
30
31
  "@types/probe-image-size": "^7.2.4",
31
32
  "father": "^4.4.0",
32
33
  "vitest": "^1.2.2"
@@ -34,7 +35,9 @@
34
35
  "dependencies": {
35
36
  "axios": "^1.6.7",
36
37
  "cheerio": "^1.0.0-rc.12",
38
+ "dayjs": "^1.11.10",
37
39
  "html-entities": "^2.4.0",
40
+ "lodash": "^4.17.21",
38
41
  "probe-image-size": "^7.2.3",
39
42
  "rss-parser": "^3.13.0",
40
43
  "yargs-parser": "^21.1.1",