ns-rss-spider 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # ns-rss-spider
2
+
3
+ [![NPM version](https://img.shields.io/npm/v/ns-rss-spider.svg?style=flat)](https://npmjs.org/package/ns-rss-spider)
4
+ [![NPM downloads](https://img.shields.io/npm/dm/ns-rss-spider.svg?style=flat)](https://npmjs.org/package/ns-rss-spider)
5
+
6
+ ## Install
7
+
8
+ ```bash
9
+ $ yarn install
10
+ ```
11
+
12
+ ```bash
13
+ $ npm run dev
14
+ $ npm run build
15
+ ```
16
+
17
+ ## Options
18
+
19
+ TODO
20
+
21
+ ## LICENSE
22
+
23
+ MIT
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ export {};
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/env node
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __copyProps = (to, from, except, desc) => {
9
+ if (from && typeof from === "object" || typeof from === "function") {
10
+ for (let key of __getOwnPropNames(from))
11
+ if (!__hasOwnProp.call(to, key) && key !== except)
12
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
13
+ }
14
+ return to;
15
+ };
16
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
17
+ // If the importer is in node compatibility mode or this is not an ESM
18
+ // file that has been converted to a CommonJS file using a Babel-
19
+ // compatible transform (i.e. "__esModule" has not been set), then set
20
+ // "default" to the CommonJS "module.exports" for node compatibility.
21
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
22
+ mod
23
+ ));
24
+
25
+ // src/cli.ts
26
+ var import_promises = require("fs/promises");
27
+ var import_yargs_parser = __toESM(require("yargs-parser"));
28
+ var import_index = require("./index");
29
+ var import_zx = require("zx");
30
+ var import_getData = require("./utils/getData");
31
// Parse the raw process arguments; only the version flags are consulted here.
var argv = (0, import_yargs_parser.default)(process.argv);
if (argv.v || argv.version) {
  // Print the package version and stop before any command handling happens.
  const { version } = require("../../package.json");
  console.log(version);
  process.exit(0);
}

/**
 * CLI entry point.
 *
 * Reads the decoded request payload from getData(), runs the requested
 * command, then either writes the result to ./result.json (when the
 * DISP_ARGV environment variable is set) or prints it to stdout.
 *
 * @returns a promise; it resolves with the writeFile result when a file is
 *          written and with undefined otherwise.
 */
async function main() {
  const payload = (0, import_getData.getData)() || {};
  const { cmd, ...params } = payload;

  // "parse" is the only supported command; anything else is reported and ignored.
  if (cmd !== "parse") {
    console.warn("未知命令", cmd);
    return;
  }

  const output = await (0, import_index.parseRss)(params.name, params.feed);

  if (!process.env.DISP_ARGV) {
    console.info("输出结果");
    console.log(output);
    return;
  }

  console.info("写入结果文件");
  return (0, import_promises.writeFile)("./result.json", JSON.stringify(output || {}));
}

// Any failure is printed (red message first, then the full error) and turns
// into exit code 1.
function reportFailure(err) {
  console.error(import_zx.chalk.red(err.message));
  console.error(err);
  process.exit(1);
}

main().catch(reportFailure);
@@ -0,0 +1,3 @@
1
+ export { parseRss } from "./parse";
2
+ export { strategies } from "./strategy";
3
+ export { parseContent } from "./parseContent";
@@ -0,0 +1,35 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/index.ts
20
+ var src_exports = {};
21
+ __export(src_exports, {
22
+ parseContent: () => import_parseContent.parseContent,
23
+ parseRss: () => import_parse.parseRss,
24
+ strategies: () => import_strategy.strategies
25
+ });
26
+ module.exports = __toCommonJS(src_exports);
27
+ var import_parse = require("./parse");
28
+ var import_strategy = require("./strategy");
29
+ var import_parseContent = require("./parseContent");
30
+ // Annotate the CommonJS export names for ESM import in node:
31
+ 0 && (module.exports = {
32
+ parseContent,
33
+ parseRss,
34
+ strategies
35
+ });
@@ -0,0 +1 @@
1
+ export declare function parseRss(name: string, feed: string): Promise<(import("./types").SimpleContent | import("./types").RichContent)[]>;
@@ -0,0 +1,59 @@
1
+ var __create = Object.create;
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __getProtoOf = Object.getPrototypeOf;
6
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
7
+ var __export = (target, all) => {
8
+ for (var name in all)
9
+ __defProp(target, name, { get: all[name], enumerable: true });
10
+ };
11
+ var __copyProps = (to, from, except, desc) => {
12
+ if (from && typeof from === "object" || typeof from === "function") {
13
+ for (let key of __getOwnPropNames(from))
14
+ if (!__hasOwnProp.call(to, key) && key !== except)
15
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
16
+ }
17
+ return to;
18
+ };
19
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
20
+ // If the importer is in node compatibility mode or this is not an ESM
21
+ // file that has been converted to a CommonJS file using a Babel-
22
+ // compatible transform (i.e. "__esModule" has not been set), then set
23
+ // "default" to the CommonJS "module.exports" for node compatibility.
24
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
25
+ mod
26
+ ));
27
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
28
+
29
+ // src/parse.ts
30
+ var parse_exports = {};
31
+ __export(parse_exports, {
32
+ parseRss: () => parseRss
33
+ });
34
+ module.exports = __toCommonJS(parse_exports);
35
+ var import_rss_parser = __toESM(require("rss-parser"));
36
+ var import_zx = require("zx");
37
+ var import_strategy = require("./strategy");
38
+ var import_parseContent = require("./parseContent");
39
/**
 * Fetch an RSS feed and parse each of its entries into content objects.
 *
 * Entries are processed one after another. An entry whose parsing fails is
 * logged and skipped, so a single bad article does not abort the whole feed.
 *
 * @param name key of the parsing strategy in `strategies`; an unknown key
 *             means the entries are parsed without a strategy.
 * @param feed URL of the RSS feed, passed to rss-parser's parseURL.
 * @returns the successfully parsed entries, in feed order.
 * @throws Error("rss no content") when the feed has no items; errors raised
 *         while fetching or parsing the feed itself propagate unchanged.
 */
async function parseRss(name, feed) {
  const parser = new import_rss_parser.default();
  console.log(import_zx.chalk.blue("正在拉取 rss 列表"));
  const result = await parser.parseURL(feed);

  // Treat a missing/non-array item list the same as an empty feed.
  const items = result && Array.isArray(result.items) ? result.items : [];
  if (!items.length) {
    throw Error("rss no content");
  }

  // Only own keys count, so names such as "constructor" or "toString" do not
  // resolve to members inherited from Object.prototype.
  const strategy = Object.prototype.hasOwnProperty.call(import_strategy.strategies, name)
    ? import_strategy.strategies[name]
    : void 0;

  const contents = [];
  for (const item of items) {
    console.log(import_zx.chalk.blue(`正在解析文章 【${item.title}】`));
    const content = await (0, import_parseContent.parseContent)(item, strategy).catch((e) => {
      // The rejection value is not guaranteed to be an Error instance.
      console.error(import_zx.chalk.red("文章解析失败"), e && e.message ? e.message : e);
    });
    if (content) {
      contents.push(content);
    }
  }
  return contents;
}
56
+ // Annotate the CommonJS export names for ESM import in node:
57
+ 0 && (module.exports = {
58
+ parseRss
59
+ });
@@ -0,0 +1,2 @@
1
+ import { ParseStrategy } from "../types";
2
+ export declare const getArticleHtml: (url: string, fetcher: ParseStrategy["fetcher"]) => Promise<any>;
@@ -0,0 +1,49 @@
1
+ var __create = Object.create;
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __getProtoOf = Object.getPrototypeOf;
6
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
7
+ var __export = (target, all) => {
8
+ for (var name in all)
9
+ __defProp(target, name, { get: all[name], enumerable: true });
10
+ };
11
+ var __copyProps = (to, from, except, desc) => {
12
+ if (from && typeof from === "object" || typeof from === "function") {
13
+ for (let key of __getOwnPropNames(from))
14
+ if (!__hasOwnProp.call(to, key) && key !== except)
15
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
16
+ }
17
+ return to;
18
+ };
19
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
20
+ // If the importer is in node compatibility mode or this is not an ESM
21
+ // file that has been converted to a CommonJS file using a Babel-
22
+ // compatible transform (i.e. "__esModule" has not been set), then set
23
+ // "default" to the CommonJS "module.exports" for node compatibility.
24
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
25
+ mod
26
+ ));
27
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
28
+
29
+ // src/parseContent/getArticleHtml.ts
30
+ var getArticleHtml_exports = {};
31
+ __export(getArticleHtml_exports, {
32
+ getArticleHtml: () => getArticleHtml
33
+ });
34
+ module.exports = __toCommonJS(getArticleHtml_exports);
35
+ var import_axios = __toESM(require("axios"));
36
/**
 * Download the HTML of an article page.
 *
 * @param url address of the article.
 * @param fetcher fetch mechanism requested by the strategy. "playwright" is
 *                not implemented and always throws; every other value uses a
 *                plain axios GET.
 * @returns the response body requested as text.
 * @throws Error("todo") when fetcher is "playwright".
 */
var getArticleHtml = async (url, fetcher) => {
  if (fetcher === "playwright") {
    throw Error("todo");
  }
  const { data } = await import_axios.default.get(url, { responseType: "text" });
  return data;
};
46
+ // Annotate the CommonJS export names for ESM import in node:
47
+ 0 && (module.exports = {
48
+ getArticleHtml
49
+ });
@@ -0,0 +1,16 @@
1
+ import { Item as RssItem } from "rss-parser";
2
+ import { Content, ParseStrategy } from "../types";
3
+ /**
4
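+ * Parse the content of an article.
5
+ * 1. Obtain the HTML fragment (from the RSS item, over HTTP, or via playwright).
6
+ * 2. Process the HTML:
7
+ * remove unneeded styling.
8
+ * 3. Derive images, videos and parsed data from the HTML (todo).
9
+ * 4. Collect the strategy's custom extra items (optional).
10
+ * 5. Return the result.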
11
+ *
12
+ * @param item
13
+ * @param strategy
14
+ * @returns
15
+ */
16
+ export declare function parseContent(item: RssItem, strategy?: ParseStrategy): Promise<Content>;
@@ -0,0 +1,80 @@
1
+ var __create = Object.create;
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __getProtoOf = Object.getPrototypeOf;
6
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
7
+ var __export = (target, all) => {
8
+ for (var name in all)
9
+ __defProp(target, name, { get: all[name], enumerable: true });
10
+ };
11
+ var __copyProps = (to, from, except, desc) => {
12
+ if (from && typeof from === "object" || typeof from === "function") {
13
+ for (let key of __getOwnPropNames(from))
14
+ if (!__hasOwnProp.call(to, key) && key !== except)
15
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
16
+ }
17
+ return to;
18
+ };
19
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
20
+ // If the importer is in node compatibility mode or this is not an ESM
21
+ // file that has been converted to a CommonJS file using a Babel-
22
+ // compatible transform (i.e. "__esModule" has not been set), then set
23
+ // "default" to the CommonJS "module.exports" for node compatibility.
24
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
25
+ mod
26
+ ));
27
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
28
+
29
+ // src/parseContent/index.ts
30
+ var parseContent_exports = {};
31
+ __export(parseContent_exports, {
32
+ parseContent: () => parseContent
33
+ });
34
+ module.exports = __toCommonJS(parseContent_exports);
35
+ var import_getArticleHtml = require("./getArticleHtml");
36
+ var cheerio = __toESM(require("cheerio"));
37
+ var import_stripeHtml = require("./stripeHtml");
38
+ var import_parseContent = require("./parseContent");
39
+ var import_zx = require("zx");
40
/**
 * Parse one RSS entry into a simple or rich content object.
 *
 * Without a strategy fetcher the HTML comes from the entry's own `content`
 * field; otherwise the article page is downloaded and the strategy may pick
 * the element holding the article body. The chosen element is cleaned with
 * stripeHtml. When the strategy does not request parsing, a "simple" result
 * is returned; otherwise the rich parser runs and the strategy's optional
 * extra items are merged over its output.
 *
 * @param item RSS entry produced by rss-parser.
 * @param strategy optional per-site parsing strategy.
 * @returns the parsed content.
 */
async function parseContent(item, strategy) {
  const useFeedContent = !strategy || !strategy.fetcher;
  let $;
  let element;

  if (useFeedContent) {
    // Load the feed's embedded HTML as a fragment (third argument false).
    $ = cheerio.load(item.content || "", {}, false);
    element = $.root();
  } else {
    console.log(import_zx.chalk.blue("正在拉取文章内容"), item.link);
    const article = await (0, import_getArticleHtml.getArticleHtml)(item.link, strategy.fetcher);
    $ = cheerio.load(article);
    const extract = strategy.getContentHtmlFromArticle;
    const picked = extract == null ? void 0 : extract.call(strategy, $);
    // Fall back to the whole document when no element was returned.
    element = picked || $.root();
  }

  console.log(import_zx.chalk.blue("正在预处理 html"));
  (0, import_stripeHtml.stripeHtml)($, element);

  const wantsRich = strategy == null ? void 0 : strategy.parse;
  if (!wantsRich) {
    const title = item.title;
    const html = element.html();
    const description = (html == null ? void 0 : html.trim()) || "";
    return {
      type: "simple",
      title,
      description,
      pubDate: new Date(item.pubDate)
    };
  }

  console.log(import_zx.chalk.blue("正在解析文章内容"));
  const rich = await (0, import_parseContent.parseContent)($, element, item, strategy);

  let extra = {};
  if (strategy.getExtraItems) {
    console.log(import_zx.chalk.blue("正在执行 getExtraItems"));
    // A failing getExtraItems is logged and treated as "no extra items".
    const resolved = await strategy.getExtraItems($, rich, item).catch((e) => {
      console.error(import_zx.chalk.red("getExtraItems 识别"), e);
    });
    extra = resolved || {};
  }

  return {
    ...rich,
    ...extra
  };
}
77
+ // Annotate the CommonJS export names for ESM import in node:
78
+ 0 && (module.exports = {
79
+ parseContent
80
+ });
@@ -0,0 +1,4 @@
1
+ import { Item as RssItem } from "rss-parser";
2
+ import type { ParseStrategy, RichContent } from "../types";
3
+ import { AnyNode, Cheerio, CheerioAPI } from "cheerio";
4
+ export declare function parseContent($: CheerioAPI, $element: Cheerio<AnyNode>, item: RssItem, strategy?: ParseStrategy): Promise<RichContent>;
@@ -0,0 +1,108 @@
1
+ var __create = Object.create;
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __getProtoOf = Object.getPrototypeOf;
6
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
7
+ var __export = (target, all) => {
8
+ for (var name in all)
9
+ __defProp(target, name, { get: all[name], enumerable: true });
10
+ };
11
+ var __copyProps = (to, from, except, desc) => {
12
+ if (from && typeof from === "object" || typeof from === "function") {
13
+ for (let key of __getOwnPropNames(from))
14
+ if (!__hasOwnProp.call(to, key) && key !== except)
15
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
16
+ }
17
+ return to;
18
+ };
19
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
20
+ // If the importer is in node compatibility mode or this is not an ESM
21
+ // file that has been converted to a CommonJS file using a Babel-
22
+ // compatible transform (i.e. "__esModule" has not been set), then set
23
+ // "default" to the CommonJS "module.exports" for node compatibility.
24
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
25
+ mod
26
+ ));
27
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
28
+
29
+ // src/parseContent/parseContent.ts
30
+ var parseContent_exports = {};
31
+ __export(parseContent_exports, {
32
+ parseContent: () => parseContent
33
+ });
34
+ module.exports = __toCommonJS(parseContent_exports);
35
+ var import_utils = require("./utils");
36
+ var import_probe_image_size = __toESM(require("probe-image-size"));
37
+ var import_zx = require("zx");
38
+ var import_html_entities = require("html-entities");
39
/**
 * Build the "rich" representation of an article from its content element.
 *
 * 1. Collect every <img> that has a src while walking the element tree.
 * 2. Probe each image, one at a time, for its dimensions and MIME type; a
 *    failed probe is logged and the image is kept with url/title only.
 * 3. Rewrite the <img> tags below the element: `src` is moved to `data-src`
 *    and known dimensions are written to `data-w` / `data-h`.
 * 4. Return the entity-decoded HTML together with the image metadata.
 *
 * @param $ cheerio API bound to the article document.
 * @param $element selection holding the article body.
 * @param item RSS entry the article came from.
 * @param strategy accepted for signature compatibility; not read here.
 * @returns the rich content object.
 */
async function parseContent($, $element, item, strategy) {
  const sources = [];
  (0, import_utils.walk_the_DOM)($, $element, (node) => {
    if (!node.is("img")) {
      return;
    }
    const src = node.attr("src");
    if (src) {
      sources.push({ src, title: node.attr("title") || void 0 });
    }
  });

  const images = [];
  for (const source of sources) {
    console.log(import_zx.chalk.blue("正在解析图片", source.src));
    const image = { url: source.src, title: source.title };
    try {
      // NOTE(review): rejectUnauthorized: false presumably disables TLS
      // certificate verification for the probe request — confirm this is intended.
      const probed = await (0, import_probe_image_size.default)(source.src, {
        rejectUnauthorized: false,
        open_timeout: 1e3 * 5
      });
      Object.assign(image, {
        width: probed.width,
        height: probed.height,
        type: probed.mime,
        // Units and the final URL are only recorded when they differ from
        // the defaults ("px" / the original src).
        widthUnit: probed.wUnits !== "px" ? probed.wUnits : void 0,
        heightUnit: probed.hUnits !== "px" ? probed.hUnits : void 0,
        realUrl: probed.url !== source.src ? probed.url : void 0
      });
    } catch (e) {
      // The thrown value is not guaranteed to be an Error instance.
      console.error("解析图片失败", e && e.message ? e.message : e);
    }
    images.push(image);
  }

  // Index metadata by URL, keeping the first entry for a repeated URL.
  const metaByUrl = new Map();
  for (const image of images) {
    if (!metaByUrl.has(image.url)) {
      metaByUrl.set(image.url, image);
    }
  }

  $($element).find("img").each((_, img) => {
    const $img = $(img);
    const src = $img.attr("src");
    if (!src) {
      return;
    }
    $img.removeAttr("src");
    $img.attr("data-src", src);
    const meta = metaByUrl.get(src);
    if (meta && meta.width) {
      $img.attr("data-w", `${meta.width}`);
    }
    if (meta && meta.height) {
      $img.attr("data-h", `${meta.height}`);
    }
  });

  // html() yields null for an empty selection (e.g. the strategy's selector
  // matched nothing); fall back to an empty string instead of throwing.
  const html = $element.html();
  const snippet = item.contentSnippet;
  return {
    type: "rich",
    title: item.title,
    description: (snippet == null ? void 0 : snippet.trim()) || "",
    content: (0, import_html_entities.decode)(html == null ? "" : html.trim()),
    pubDate: new Date(item.pubDate),
    images
  };
}
105
+ // Annotate the CommonJS export names for ESM import in node:
106
+ 0 && (module.exports = {
107
+ parseContent
108
+ });
@@ -0,0 +1,2 @@
1
+ import type { Cheerio, AnyNode, CheerioAPI } from "cheerio";
2
+ export declare function stripeHtml($: CheerioAPI, element: Cheerio<AnyNode>): void;
@@ -0,0 +1,46 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parseContent/stripeHtml.ts
20
+ var stripeHtml_exports = {};
21
+ __export(stripeHtml_exports, {
22
+ stripeHtml: () => stripeHtml
23
+ });
24
+ module.exports = __toCommonJS(stripeHtml_exports);
25
+ var import_utils = require("./utils");
26
/**
 * Clean an article element in place.
 *
 * Removes every <script> below the element, then walks the element tree and
 * drops the id/style/class/width/height attributes from each visited node,
 * plus the target attribute of links.
 *
 * @param $ cheerio API bound to the document.
 * @param element selection to clean.
 */
function stripeHtml($, element) {
  $(element).find("script").remove();

  (0, import_utils.walk_the_DOM)($, element, (el) => {
    ["id", "style", "class", "width", "height"].forEach((name) => {
      $(el).removeAttr(name);
    });
    const isLink = $(el).is("a");
    if (isLink) {
      $(el).removeAttr("target");
    }
  });
}
43
+ // Annotate the CommonJS export names for ESM import in node:
44
+ 0 && (module.exports = {
45
+ stripeHtml
46
+ });
@@ -0,0 +1,2 @@
1
+ import { AnyNode, Cheerio, CheerioAPI } from "cheerio";
2
+ export declare function walk_the_DOM($: CheerioAPI, n: Cheerio<AnyNode>, func: (el: Cheerio<AnyNode>) => void): void;
@@ -0,0 +1,39 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parseContent/utils.ts
20
+ var utils_exports = {};
21
+ __export(utils_exports, {
22
+ walk_the_DOM: () => walk_the_DOM
23
+ });
24
+ module.exports = __toCommonJS(utils_exports);
25
+ function walk_the_DOM($, n, func) {
26
+ let node = n;
27
+ if (node.length === 0) {
28
+ return;
29
+ }
30
+ func(node);
31
+ const children = node.children();
32
+ $(children).each((_, c) => {
33
+ walk_the_DOM($, $(c), func);
34
+ });
35
+ }
36
+ // Annotate the CommonJS export names for ESM import in node:
37
+ 0 && (module.exports = {
38
+ walk_the_DOM
39
+ });
@@ -0,0 +1,2 @@
1
+ import { ParseStrategy } from "../types";
2
+ export declare const cnbeta: ParseStrategy;
@@ -0,0 +1,44 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parsers/cnbeta.ts
20
+ var cnbeta_exports = {};
21
+ __export(cnbeta_exports, {
22
+ cnbeta: () => cnbeta
23
+ });
24
+ module.exports = __toCommonJS(cnbeta_exports);
25
+ var cnbeta = {
26
+ parse: true,
27
+ fetcher: "http",
28
+ getContentHtmlFromArticle: ($) => {
29
+ const el = $(".article-content");
30
+ el.remove(".tac");
31
+ return el;
32
+ },
33
+ getExtraItems: async ($, rich) => {
34
+ return {
35
+ description: rich.description.replace(`
36
+ 阅读全文`, ""),
37
+ pageFrom: $(".cnbeta-article .title .source a").text().toLowerCase()
38
+ };
39
+ }
40
+ };
41
+ // Annotate the CommonJS export names for ESM import in node:
42
+ 0 && (module.exports = {
43
+ cnbeta
44
+ });
@@ -0,0 +1,2 @@
1
+ import { ParseStrategy } from "../types";
2
+ export declare const ifanr: ParseStrategy;
@@ -0,0 +1,36 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parsers/ifanr.ts
20
+ var ifanr_exports = {};
21
+ __export(ifanr_exports, {
22
+ ifanr: () => ifanr
23
+ });
24
+ module.exports = __toCommonJS(ifanr_exports);
25
+ var ifanr = {
26
+ parse: true,
27
+ fetcher: "http",
28
+ getContentHtmlFromArticle: ($) => {
29
+ const el = $("#article-content-wrapper article");
30
+ return el;
31
+ }
32
+ };
33
+ // Annotate the CommonJS export names for ESM import in node:
34
+ 0 && (module.exports = {
35
+ ifanr
36
+ });
@@ -0,0 +1,2 @@
1
+ import { ParseStrategy } from "../types";
2
+ export declare const ithome: ParseStrategy;
@@ -0,0 +1,53 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parsers/ithome.ts
20
+ var ithome_exports = {};
21
+ __export(ithome_exports, {
22
+ ithome: () => ithome
23
+ });
24
+ module.exports = __toCommonJS(ithome_exports);
25
+ var import_utils = require("../parseContent/utils");
26
/**
 * Parsing strategy for ithome: articles are downloaded over HTTP and parsed
 * into rich content.
 */
var ithome = {
  parse: true,
  fetcher: "http",
  /**
   * Select #paragraph and normalise it in place:
   * - images carrying a `data-original` attribute get that value as `src`;
   * - the listed non-article blocks are removed;
   * - `data-vmark` attributes are dropped from every visited node.
   *
   * @param $ cheerio API bound to the downloaded article page.
   * @returns the #paragraph selection.
   */
  getContentHtmlFromArticle: ($) => {
    const el = $("#paragraph");

    $(el).find("img").each((_index, img) => {
      const image = $(img);
      const original = image.attr("data-original");
      if (original) {
        image.attr("src", original);
        image.removeAttr("data-original");
      }
    });

    for (const selector of [".dy-live-bar", ".tougao-user", ".ad-tips", "dir"]) {
      $(el).find(selector).remove();
    }

    (0, import_utils.walk_the_DOM)($, $(el), (node) => {
      $(node).removeAttr("data-vmark");
    });

    return $(el);
  },
  // No extra fields are produced for this site.
  getExtraItems: async ($, rich, item) => ({})
};
50
+ // Annotate the CommonJS export names for ESM import in node:
51
+ 0 && (module.exports = {
52
+ ithome
53
+ });
@@ -0,0 +1,2 @@
1
+ import { ParseStrategy } from "../types";
2
+ export declare const theverge: ParseStrategy;
@@ -0,0 +1,31 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parsers/theverge.ts
20
+ var theverge_exports = {};
21
+ __export(theverge_exports, {
22
+ theverge: () => theverge
23
+ });
24
+ module.exports = __toCommonJS(theverge_exports);
25
/**
 * Parsing strategy for The Verge: rich parsing is enabled and no fetcher is
 * configured, so the strategy carries nothing but the parse flag.
 */
var theverge = { parse: true };
28
+ // Annotate the CommonJS export names for ESM import in node:
29
+ 0 && (module.exports = {
30
+ theverge
31
+ });
@@ -0,0 +1,2 @@
1
+ import { ParseStrategy } from "./types";
2
+ export declare const strategies: Record<string, ParseStrategy>;
@@ -0,0 +1,38 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/strategy.ts
20
+ var strategy_exports = {};
21
+ __export(strategy_exports, {
22
+ strategies: () => strategies
23
+ });
24
+ module.exports = __toCommonJS(strategy_exports);
25
+ var import_cnbeta = require("./parsers/cnbeta");
26
+ var import_ifanr = require("./parsers/ifanr");
27
+ var import_theverge = require("./parsers/theverge");
28
+ var import_ithome = require("./parsers/ithome");
29
/**
 * Registry of the built-in parsing strategies, keyed by the site name that
 * callers pass as the strategy name.
 */
var strategies = {
  "cnbeta": import_cnbeta.cnbeta,
  "ifanr": import_ifanr.ifanr,
  "theverge": import_theverge.theverge,
  "ithome": import_ithome.ithome
};
35
+ // Annotate the CommonJS export names for ESM import in node:
36
+ 0 && (module.exports = {
37
+ strategies
38
+ });
@@ -0,0 +1,30 @@
1
+ import type { AnyNode, Cheerio, CheerioAPI } from 'cheerio';
2
+ import { Item } from 'rss-parser';
3
/**
 * Per-site instructions for turning an RSS entry into content.
 */
export interface ParseStrategy {
  /**
   * Whether to build rich content. When falsy, the entry is returned as
   * simple content made from the cleaned HTML.
   */
  parse: boolean;
  /**
   * How to obtain the article HTML. When omitted, the HTML embedded in the
   * RSS entry is used. `'playwright'` is declared but currently throws
   * `Error("todo")` when used.
   */
  fetcher?: 'http' | 'playwright';
  /**
   * Picks the element holding the article body out of the downloaded page.
   * Only consulted when a fetcher is configured; when it is absent or
   * returns a falsy value the document root is used instead.
   */
  getContentHtmlFromArticle?: (article: CheerioAPI) => Cheerio<AnyNode>;
  /**
   * Produces additional fields that are merged over the rich content, so a
   * returned key replaces the rich field with the same name. A rejection is
   * logged and treated as "no extra fields".
   */
  getExtraItems?: (html: CheerioAPI, current: RichContent, item: Item) => Promise<Record<string, any>>;
}
9
/** Content produced when a strategy does not request rich parsing. */
export interface SimpleContent {
  /** Discriminator for the `Content` union. */
  type: 'simple';
  /** Title of the RSS entry. */
  title: string;
  /** Trimmed HTML of the cleaned content element. */
  description: string;
  /** Publication date built from the entry's `pubDate`. */
  pubDate: Date;
}

/** Metadata for one image found in an article. */
export interface RichContentImage {
  /** The image's original `src` value. */
  url: string;
  /** The image's `title` attribute, when present. */
  title?: string;
  /** MIME type reported by the image probe. */
  type?: string;
  /** Probed width. */
  width?: number;
  /** Probed height. */
  height?: number;
  /** Unit of `width`; only set when it is not "px". */
  widthUnit?: string;
  /** Unit of `height`; only set when it is not "px". */
  heightUnit?: string;
  /** URL reported by the probe; only set when it differs from `url`. */
  realUrl?: string;
}

/** Content produced by rich parsing: simple fields plus HTML and images. */
export interface RichContent extends Omit<SimpleContent, 'type'> {
  /** Discriminator for the `Content` union. */
  type: 'rich';
  /** Entity-decoded HTML of the article body. */
  content: string;
  /** Images found in the article body. */
  images: RichContentImage[];
}

/** Any parsed entry; discriminate on `type`. */
export type Content = SimpleContent | RichContent;
@@ -0,0 +1,17 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __copyProps = (to, from, except, desc) => {
6
+ if (from && typeof from === "object" || typeof from === "function") {
7
+ for (let key of __getOwnPropNames(from))
8
+ if (!__hasOwnProp.call(to, key) && key !== except)
9
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
10
+ }
11
+ return to;
12
+ };
13
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
14
+
15
+ // src/types.ts
16
+ var types_exports = {};
17
+ module.exports = __toCommonJS(types_exports);
@@ -0,0 +1 @@
1
+ export declare const getData: (str?: string) => any;
@@ -0,0 +1,63 @@
1
+ "use strict";
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
12
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+
30
+ // src/utils/getData.ts
31
+ var getData_exports = {};
32
+ __export(getData_exports, {
33
+ getData: () => getData
34
+ });
35
+ module.exports = __toCommonJS(getData_exports);
36
+ var import_yargs_parser = __toESM(require("yargs-parser"));
37
/**
 * Decode the request payload handed to the CLI.
 *
 * The arguments to parse come from, in order of precedence: the `str`
 * argument, the DISP_ARGV environment variable (wrapped as `--data=<value>`),
 * or process.argv. The `--data` option is expected to be base64 text holding
 * a URI-encoded JSON document.
 *
 * @param str optional argument string to parse instead of the environment.
 * @returns the decoded JSON value, or null when `--data` is missing, is not
 *          a string, or cannot be decoded.
 */
var getData = function(str) {
  const fromEnv = process.env.DISP_ARGV;

  let rawArgs = "";
  if (str) {
    rawArgs = str;
  } else if (fromEnv) {
    rawArgs = `--data=${fromEnv}`;
  } else if (process.argv) {
    rawArgs = process.argv;
  }

  const { data: encoded } = (0, import_yargs_parser.default)(rawArgs);
  if (!encoded || typeof encoded != "string") {
    return null;
  }

  try {
    const json = decodeURIComponent(Buffer.from(encoded, "base64").toString());
    return JSON.parse(json);
  } catch (_err) {
    // Malformed base64 / URI encoding / JSON all count as "no payload".
    return null;
  }
};
60
+ // Annotate the CommonJS export names for ESM import in node:
61
+ 0 && (module.exports = {
62
+ getData
63
+ });
package/package.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "name": "ns-rss-spider",
3
+ "version": "0.0.2",
4
+ "description": "",
5
+ "main": "dist/cjs/index.js",
6
+ "types": "dist/cjs/index.d.ts",
7
+ "scripts": {
8
+ "dev": "father dev",
9
+ "test": "vitest run",
10
+ "build": "father build",
11
+ "build:deps": "father prebundle",
12
+ "check": "father doctor",
13
+ "prepublishOnly": "father doctor && npm run build",
14
+ "pub": "npm run check && npm version patch && npm run build && npm publish --registry=https://registry.npmjs.org && git push origin master --tags && npm run sync"
15
+ },
16
+ "bin": {
17
+ "ns-rss-spider": "./dist/cjs/cli.js"
18
+ },
19
+ "keywords": [],
20
+ "authors": [],
21
+ "license": "MIT",
22
+ "files": [
23
+ "dist",
24
+ "compiled"
25
+ ],
26
+ "publishConfig": {
27
+ "access": "public"
28
+ },
29
+ "devDependencies": {
30
+ "@types/probe-image-size": "^7.2.4",
31
+ "father": "^4.4.0",
32
+ "vitest": "^1.2.2"
33
+ },
34
+ "dependencies": {
35
+ "axios": "^1.6.7",
36
+ "cheerio": "^1.0.0-rc.12",
37
+ "html-entities": "^2.4.0",
38
+ "probe-image-size": "^7.2.3",
39
+ "rss-parser": "^3.13.0",
40
+ "yargs-parser": "^21.1.1",
41
+ "zx": "4.x"
42
+ }
43
+ }