ns-rss-spider 0.0.18 → 0.0.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/parse.d.ts +1 -1
- package/dist/cjs/parseContent/index.js +1 -1
- package/dist/cjs/parseContent/parseContent.js +10 -1
- package/dist/cjs/parsers/cnbeta.js +12 -5
- package/dist/cjs/parsers/ifanr.js +1 -6
- package/dist/cjs/parsers/ithome.js +1 -1
- package/dist/cjs/types.d.ts +4 -2
- package/package.json +1 -1
package/dist/cjs/parse.d.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import { ServerInfo } from "./upload";
|
|
2
2
|
export declare function parseRss(name: string, feed: string, server?: ServerInfo): Promise<import("./types").Article[] | {
|
|
3
3
|
storePath: any;
|
|
4
|
-
source?: string | undefined;
|
|
5
4
|
type: "simple" | "rich";
|
|
6
5
|
guid: string;
|
|
7
6
|
title: string;
|
|
@@ -9,5 +8,6 @@ export declare function parseRss(name: string, feed: string, server?: ServerInfo
|
|
|
9
8
|
description: string;
|
|
10
9
|
pubDate: Date;
|
|
11
10
|
author?: string | undefined;
|
|
11
|
+
source?: string | undefined;
|
|
12
12
|
categories?: string | undefined;
|
|
13
13
|
}[]>;
|
|
package/dist/cjs/parseContent/index.js
CHANGED
|
@@ -49,7 +49,7 @@ async function parseContent(item, strategy) {
|
|
|
49
49
|
console.log(import_zx.chalk.green("正在拉取文章内容"), item.link);
|
|
50
50
|
const article = await (0, import_getArticleHtml.getArticleHtml)(item.link, strategy.fetcher);
|
|
51
51
|
$ = cheerio.load(article);
|
|
52
|
-
element = ((_a = strategy.
|
|
52
|
+
element = ((_a = strategy.getContentElementFromArticle) == null ? void 0 : _a.call(strategy, $)) || $.root();
|
|
53
53
|
}
|
|
54
54
|
console.log(import_zx.chalk.green("正在预处理 html"));
|
|
55
55
|
(0, import_stripeHtml.stripeHtml)($, element);
|
|
package/dist/cjs/parseContent/parseContent.js
CHANGED
|
@@ -39,6 +39,7 @@ var import_html_entities = require("html-entities");
|
|
|
39
39
|
var import_generateSsrContent = require("./generateSsrContent");
|
|
40
40
|
var import_getBasicFromItem = require("./getBasicFromItem");
|
|
41
41
|
async function parseContent($, $element, item, strategy) {
|
|
42
|
+
var _a;
|
|
42
43
|
const srcs = [];
|
|
43
44
|
const images = [];
|
|
44
45
|
(0, import_utils.walk_the_DOM)($, $element, (node) => {
|
|
@@ -54,6 +55,13 @@ async function parseContent($, $element, item, strategy) {
|
|
|
54
55
|
}
|
|
55
56
|
});
|
|
56
57
|
for (let item2 of srcs) {
|
|
58
|
+
if (strategy == null ? void 0 : strategy.ignoreProbeImage) {
|
|
59
|
+
images.push({
|
|
60
|
+
url: item2.src,
|
|
61
|
+
title: item2.title
|
|
62
|
+
});
|
|
63
|
+
continue;
|
|
64
|
+
}
|
|
57
65
|
console.log(import_zx.chalk.green("正在解析图片", item2.src));
|
|
58
66
|
const result = await (0, import_probe_image_size.default)(item2.src, {
|
|
59
67
|
rejectUnauthorized: false,
|
|
@@ -93,7 +101,8 @@ async function parseContent($, $element, item, strategy) {
|
|
|
93
101
|
}
|
|
94
102
|
}
|
|
95
103
|
});
|
|
96
|
-
const
|
|
104
|
+
const elCont = ((_a = strategy == null ? void 0 : strategy.getContentFromHtml) == null ? void 0 : _a.call(strategy, $, $element)) || $element.html() || "";
|
|
105
|
+
const content = (0, import_html_entities.decode)(elCont.trim());
|
|
97
106
|
return {
|
|
98
107
|
type: "rich",
|
|
99
108
|
...(0, import_getBasicFromItem.getBasicFromItem)(item),
|
|
package/dist/cjs/parsers/cnbeta.js
CHANGED
|
@@ -25,18 +25,25 @@ module.exports = __toCommonJS(cnbeta_exports);
|
|
|
25
25
|
var cnbeta = {
|
|
26
26
|
parse: true,
|
|
27
27
|
fetcher: "http",
|
|
28
|
-
|
|
28
|
+
getContentElementFromArticle: ($) => {
|
|
29
29
|
const el = $(".article-content");
|
|
30
|
-
el.remove(".tac");
|
|
31
30
|
return el;
|
|
32
31
|
},
|
|
32
|
+
getContentFromHtml: ($, $el) => {
|
|
33
|
+
const desc = $(".cnbeta-article-body .article-summary");
|
|
34
|
+
if (desc.length) {
|
|
35
|
+
$(desc).find(".topic").remove();
|
|
36
|
+
$(desc).find("p").removeAttr("class").addClass("introduction");
|
|
37
|
+
}
|
|
38
|
+
return (desc.html() || "") + ($el.html() || "");
|
|
39
|
+
},
|
|
33
40
|
getExtraItems: async ($, rich) => {
|
|
34
41
|
return {
|
|
35
42
|
description: rich.description.replace(`
|
|
36
|
-
阅读全文`, "")
|
|
37
|
-
pageFrom: $(".cnbeta-article .title .source a").text().toLowerCase()
|
|
43
|
+
阅读全文`, "")
|
|
38
44
|
};
|
|
39
|
-
}
|
|
45
|
+
},
|
|
46
|
+
ignoreProbeImage: true
|
|
40
47
|
};
|
|
41
48
|
// Annotate the CommonJS export names for ESM import in node:
|
|
42
49
|
0 && (module.exports = {
|
|
package/dist/cjs/parsers/ifanr.js
CHANGED
|
@@ -23,12 +23,7 @@ __export(ifanr_exports, {
|
|
|
23
23
|
});
|
|
24
24
|
module.exports = __toCommonJS(ifanr_exports);
|
|
25
25
|
var ifanr = {
|
|
26
|
-
parse: true
|
|
27
|
-
fetcher: "http",
|
|
28
|
-
getContentHtmlFromArticle: ($) => {
|
|
29
|
-
const el = $("#article-content-wrapper article");
|
|
30
|
-
return el;
|
|
31
|
-
}
|
|
26
|
+
parse: true
|
|
32
27
|
};
|
|
33
28
|
// Annotate the CommonJS export names for ESM import in node:
|
|
34
29
|
0 && (module.exports = {
|
|
package/dist/cjs/parsers/ithome.js
CHANGED
|
@@ -26,7 +26,7 @@ var import_utils = require("../parseContent/utils");
|
|
|
26
26
|
var ithome = {
|
|
27
27
|
parse: true,
|
|
28
28
|
fetcher: "http",
|
|
29
|
-
|
|
29
|
+
getContentElementFromArticle: ($) => {
|
|
30
30
|
const el = $("#paragraph");
|
|
31
31
|
$(el).find("img").each((_, img) => {
|
|
32
32
|
const original = $(img).attr("data-original");
|
package/dist/cjs/types.d.ts
CHANGED
|
@@ -7,8 +7,10 @@ export type RssItem = Item & {
|
|
|
7
7
|
export interface ParseStrategy {
|
|
8
8
|
parse: boolean;
|
|
9
9
|
fetcher?: 'http' | 'playwright';
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
getContentElementFromArticle?: (aritcle: CheerioAPI) => Cheerio<AnyNode>;
|
|
11
|
+
getContentFromHtml?: ($: CheerioAPI, node: Cheerio<AnyNode>) => string;
|
|
12
|
+
getExtraItems?: ($: CheerioAPI, current: RichArticle, item: Item) => Promise<Record<string, any>>;
|
|
13
|
+
ignoreProbeImage?: boolean;
|
|
12
14
|
}
|
|
13
15
|
export interface SimpleArticle {
|
|
14
16
|
type: 'simple';
|