ns-rss-spider 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/parse.d.ts +10 -1
- package/dist/cjs/parse.js +17 -4
- package/dist/cjs/parseContent/index.d.ts +2 -2
- package/dist/cjs/parseContent/index.js +6 -4
- package/dist/cjs/parseContent/parseContent.d.ts +2 -2
- package/dist/cjs/parseContent/parseContent.js +3 -1
- package/dist/cjs/types.d.ts +8 -6
- package/dist/cjs/upload/index.d.ts +20 -0
- package/dist/cjs/upload/index.js +87 -0
- package/package.json +5 -2
package/dist/cjs/parse.d.ts
CHANGED
|
@@ -1 +1,10 @@
|
|
|
1
|
-
|
|
1
|
+
import { ServerInfo } from "./upload";
|
|
2
|
+
export declare function parseRss(name: string, feed: string, server?: ServerInfo): Promise<import("./types").Article[] | {
|
|
3
|
+
storePath: any;
|
|
4
|
+
type: "simple" | "rich";
|
|
5
|
+
guid: string;
|
|
6
|
+
title: string;
|
|
7
|
+
link: string;
|
|
8
|
+
description: string;
|
|
9
|
+
pubDate: Date;
|
|
10
|
+
}[]>;
|
package/dist/cjs/parse.js
CHANGED
|
@@ -36,22 +36,35 @@ var import_rss_parser = __toESM(require("rss-parser"));
|
|
|
36
36
|
var import_zx = require("zx");
|
|
37
37
|
var import_strategy = require("./strategy");
|
|
38
38
|
var import_parseContent = require("./parseContent");
|
|
39
|
-
|
|
39
|
+
var import_upload = require("./upload");
|
|
40
|
+
async function parseRss(name, feed, server) {
|
|
40
41
|
const parser = new import_rss_parser.default();
|
|
41
|
-
console.log(import_zx.chalk.
|
|
42
|
+
console.log(import_zx.chalk.green("正在拉取 rss 列表"));
|
|
42
43
|
const result = await parser.parseURL(feed);
|
|
43
44
|
if (!result.items.length) {
|
|
44
45
|
throw Error("rss no conent");
|
|
45
46
|
}
|
|
46
47
|
const contents = [];
|
|
47
48
|
for (let item of result.items) {
|
|
48
|
-
console.log(import_zx.chalk.
|
|
49
|
+
console.log(import_zx.chalk.green(`正在解析文章 【${item.title}】`));
|
|
50
|
+
if (!item.guid) {
|
|
51
|
+
console.log(JSON.stringify(item));
|
|
52
|
+
throw Error(`item has no guid`);
|
|
53
|
+
}
|
|
49
54
|
const content = await (0, import_parseContent.parseContent)(item, import_strategy.strategies[name]).catch((e) => console.error(import_zx.chalk.red("文章解析失败"), e.message));
|
|
50
55
|
if (content) {
|
|
51
56
|
contents.push(content);
|
|
52
57
|
}
|
|
53
58
|
}
|
|
54
|
-
|
|
59
|
+
if (server) {
|
|
60
|
+
return await (0, import_upload.uploadContent)({
|
|
61
|
+
app: name,
|
|
62
|
+
items: contents,
|
|
63
|
+
...server
|
|
64
|
+
});
|
|
65
|
+
} else {
|
|
66
|
+
return contents;
|
|
67
|
+
}
|
|
55
68
|
}
|
|
56
69
|
// Annotate the CommonJS export names for ESM import in node:
|
|
57
70
|
0 && (module.exports = {
|
|
package/dist/cjs/parseContent/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { Item as RssItem } from "rss-parser";
|
|
2
|
-
import { Content, ParseStrategy } from "../types";
|
|
2
|
+
import { Article, ParseStrategy } from "../types";
|
|
3
3
|
/**
|
|
4
4
|
* 解析文章内容
|
|
5
5
|
* 1. 获取 html 片段。(rss、http、playwright)
|
|
@@ -13,4 +13,4 @@ import { Content, ParseStrategy } from "../types";
|
|
|
13
13
|
* @param strategy
|
|
14
14
|
* @returns
|
|
15
15
|
*/
|
|
16
|
-
export declare function parseContent(item: RssItem, strategy?: ParseStrategy): Promise<Content>;
|
|
16
|
+
export declare function parseContent(item: RssItem, strategy?: ParseStrategy): Promise<Article>;
|
|
package/dist/cjs/parseContent/index.js
CHANGED
|
@@ -45,26 +45,28 @@ async function parseContent(item, strategy) {
|
|
|
45
45
|
$ = cheerio.load(item.content || "", {}, false);
|
|
46
46
|
element = $.root();
|
|
47
47
|
} else {
|
|
48
|
-
console.log(import_zx.chalk.
|
|
48
|
+
console.log(import_zx.chalk.green("正在拉取文章内容"), item.link);
|
|
49
49
|
const article = await (0, import_getArticleHtml.getArticleHtml)(item.link, strategy.fetcher);
|
|
50
50
|
$ = cheerio.load(article);
|
|
51
51
|
element = ((_a = strategy.getContentHtmlFromArticle) == null ? void 0 : _a.call(strategy, $)) || $.root();
|
|
52
52
|
}
|
|
53
|
-
console.log(import_zx.chalk.
|
|
53
|
+
console.log(import_zx.chalk.green("正在预处理 html"));
|
|
54
54
|
(0, import_stripeHtml.stripeHtml)($, element);
|
|
55
55
|
if (!(strategy == null ? void 0 : strategy.parse)) {
|
|
56
56
|
return {
|
|
57
57
|
type: "simple",
|
|
58
|
+
guid: item.guid,
|
|
58
59
|
title: item.title,
|
|
60
|
+
link: item.link,
|
|
59
61
|
description: ((_b = element.html()) == null ? void 0 : _b.trim()) || "",
|
|
60
62
|
pubDate: new Date(item.pubDate)
|
|
61
63
|
};
|
|
62
64
|
}
|
|
63
|
-
console.log(import_zx.chalk.
|
|
65
|
+
console.log(import_zx.chalk.green("正在解析文章内容"));
|
|
64
66
|
const rich = await (0, import_parseContent.parseContent)($, element, item, strategy);
|
|
65
67
|
let extra = {};
|
|
66
68
|
if (strategy.getExtraItems) {
|
|
67
|
-
console.log(import_zx.chalk.
|
|
69
|
+
console.log(import_zx.chalk.green("正在执行 getExtraItems"));
|
|
68
70
|
extra = await strategy.getExtraItems($, rich, item).catch((e) => {
|
|
69
71
|
console.error(import_zx.chalk.red("getExtraItems 识别"), e);
|
|
70
72
|
}) || {};
|
|
package/dist/cjs/parseContent/parseContent.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
import { Item as RssItem } from "rss-parser";
|
|
2
|
-
import type { ParseStrategy,
|
|
2
|
+
import type { ParseStrategy, RichArticle } from "../types";
|
|
3
3
|
import { AnyNode, Cheerio, CheerioAPI } from "cheerio";
|
|
4
|
-
export declare function parseContent($: CheerioAPI, $element: Cheerio<AnyNode>, item: RssItem, strategy?: ParseStrategy): Promise<
|
|
4
|
+
export declare function parseContent($: CheerioAPI, $element: Cheerio<AnyNode>, item: RssItem, strategy?: ParseStrategy): Promise<RichArticle>;
|
|
package/dist/cjs/parseContent/parseContent.js
CHANGED
|
@@ -53,7 +53,7 @@ async function parseContent($, $element, item, strategy) {
|
|
|
53
53
|
}
|
|
54
54
|
});
|
|
55
55
|
for (let item2 of srcs) {
|
|
56
|
-
console.log(import_zx.chalk.
|
|
56
|
+
console.log(import_zx.chalk.green("正在解析图片", item2.src));
|
|
57
57
|
const result = await (0, import_probe_image_size.default)(item2.src, {
|
|
58
58
|
rejectUnauthorized: false,
|
|
59
59
|
open_timeout: 1e3 * 5
|
|
@@ -95,7 +95,9 @@ async function parseContent($, $element, item, strategy) {
|
|
|
95
95
|
});
|
|
96
96
|
return {
|
|
97
97
|
type: "rich",
|
|
98
|
+
guid: item.guid,
|
|
98
99
|
title: item.title,
|
|
100
|
+
link: item.link,
|
|
99
101
|
description: ((_a = item.contentSnippet) == null ? void 0 : _a.trim()) || "",
|
|
100
102
|
content: (0, import_html_entities.decode)($element.html().trim()),
|
|
101
103
|
pubDate: new Date(item.pubDate),
|
package/dist/cjs/types.d.ts
CHANGED
|
@@ -4,15 +4,17 @@ export interface ParseStrategy {
|
|
|
4
4
|
parse: boolean;
|
|
5
5
|
fetcher?: 'http' | 'playwright';
|
|
6
6
|
getContentHtmlFromArticle?: (aritcle: CheerioAPI) => Cheerio<AnyNode>;
|
|
7
|
-
getExtraItems?: (html: CheerioAPI, current:
|
|
7
|
+
getExtraItems?: (html: CheerioAPI, current: RichArticle, item: Item) => Promise<Record<string, any>>;
|
|
8
8
|
}
|
|
9
|
-
export interface
|
|
9
|
+
export interface SimpleArticle {
|
|
10
10
|
type: 'simple';
|
|
11
|
+
guid: string;
|
|
11
12
|
title: string;
|
|
13
|
+
link: string;
|
|
12
14
|
description: string;
|
|
13
15
|
pubDate: Date;
|
|
14
16
|
}
|
|
15
|
-
export interface RichContentImage {
|
|
17
|
+
export interface RichArticleImage {
|
|
16
18
|
url: string;
|
|
17
19
|
type?: string;
|
|
18
20
|
width?: number;
|
|
@@ -22,9 +24,9 @@ export interface RichContentImage {
|
|
|
22
24
|
heightUnit?: string;
|
|
23
25
|
title?: string;
|
|
24
26
|
}
|
|
25
|
-
export interface
|
|
27
|
+
export interface RichArticle extends Omit<SimpleArticle, 'type'> {
|
|
26
28
|
type: 'rich';
|
|
27
29
|
content: string;
|
|
28
|
-
images:
|
|
30
|
+
images: RichArticleImage[];
|
|
29
31
|
}
|
|
30
|
-
export type
|
|
32
|
+
export type Article = SimpleArticle | RichArticle;
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { Article } from "../types";
|
|
2
|
+
export interface ServerInfo {
|
|
3
|
+
getUploadApi?: string;
|
|
4
|
+
latest?: {
|
|
5
|
+
guid: string;
|
|
6
|
+
pubDate: Date;
|
|
7
|
+
};
|
|
8
|
+
}
|
|
9
|
+
export declare function uploadContent({ app, getUploadApi, latest, items, }: {
|
|
10
|
+
app: string;
|
|
11
|
+
items: Article[];
|
|
12
|
+
} & ServerInfo): Promise<Article[] | {
|
|
13
|
+
storePath: any;
|
|
14
|
+
type: "simple" | "rich";
|
|
15
|
+
guid: string;
|
|
16
|
+
title: string;
|
|
17
|
+
link: string;
|
|
18
|
+
description: string;
|
|
19
|
+
pubDate: Date;
|
|
20
|
+
}[]>;
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
var __create = Object.create;
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
6
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
7
|
+
var __export = (target, all) => {
|
|
8
|
+
for (var name in all)
|
|
9
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
10
|
+
};
|
|
11
|
+
var __copyProps = (to, from, except, desc) => {
|
|
12
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
13
|
+
for (let key of __getOwnPropNames(from))
|
|
14
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
15
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
16
|
+
}
|
|
17
|
+
return to;
|
|
18
|
+
};
|
|
19
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
20
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
21
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
22
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
23
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
24
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
25
|
+
mod
|
|
26
|
+
));
|
|
27
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
28
|
+
|
|
29
|
+
// src/upload/index.ts
|
|
30
|
+
var upload_exports = {};
|
|
31
|
+
__export(upload_exports, {
|
|
32
|
+
uploadContent: () => uploadContent
|
|
33
|
+
});
|
|
34
|
+
module.exports = __toCommonJS(upload_exports);
|
|
35
|
+
var import_lodash = require("lodash");
|
|
36
|
+
var import_axios = __toESM(require("axios"));
|
|
37
|
+
var import_dayjs = __toESM(require("dayjs"));
|
|
38
|
+
var import_zx = require("zx");
|
|
39
|
+
async function uploadContent({
|
|
40
|
+
app,
|
|
41
|
+
getUploadApi,
|
|
42
|
+
latest,
|
|
43
|
+
items
|
|
44
|
+
}) {
|
|
45
|
+
var _a, _b;
|
|
46
|
+
const filterItems = latest ? items.filter((item) => {
|
|
47
|
+
return item.pubDate >= latest.pubDate && item.guid !== latest.guid;
|
|
48
|
+
}) : items;
|
|
49
|
+
if (!getUploadApi) {
|
|
50
|
+
return filterItems;
|
|
51
|
+
}
|
|
52
|
+
let result = [];
|
|
53
|
+
for (let item of filterItems) {
|
|
54
|
+
console.log(import_zx.chalk.green("正在获取上传地址", item.title));
|
|
55
|
+
const uploadRst = await import_axios.default.post(getUploadApi, {
|
|
56
|
+
app,
|
|
57
|
+
name: (0, import_dayjs.default)(item.pubDate).format("MM-DDTHH-mm-ss") + Math.random().toString(32).slice(2)
|
|
58
|
+
}, {
|
|
59
|
+
responseType: "json"
|
|
60
|
+
}).catch((e) => {
|
|
61
|
+
console.error(import_zx.chalk.red("get upload url error"), e);
|
|
62
|
+
});
|
|
63
|
+
if (!((_b = (_a = uploadRst == null ? void 0 : uploadRst.data) == null ? void 0 : _a.data) == null ? void 0 : _b.url)) {
|
|
64
|
+
console.error(import_zx.chalk.red(`upload error ${JSON.stringify(uploadRst)}`));
|
|
65
|
+
continue;
|
|
66
|
+
}
|
|
67
|
+
const url = uploadRst.data.data.url;
|
|
68
|
+
const storePath = uploadRst.data.data.storePath;
|
|
69
|
+
console.log(import_zx.chalk.green("正在上传", item.title, url));
|
|
70
|
+
const res = await import_axios.default.put(url, item).catch((e) => {
|
|
71
|
+
console.error(import_zx.chalk.red("upload oss error"), e);
|
|
72
|
+
});
|
|
73
|
+
if (!res || res.status !== 200) {
|
|
74
|
+
console.error(import_zx.chalk.red("upload status not 200"), res);
|
|
75
|
+
continue;
|
|
76
|
+
}
|
|
77
|
+
result.push({
|
|
78
|
+
...(0, import_lodash.omit)(item, "content", "images"),
|
|
79
|
+
storePath
|
|
80
|
+
});
|
|
81
|
+
}
|
|
82
|
+
return result;
|
|
83
|
+
}
|
|
84
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
85
|
+
0 && (module.exports = {
|
|
86
|
+
uploadContent
|
|
87
|
+
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ns-rss-spider",
|
|
3
|
-
"version": "0.0.2",
|
|
3
|
+
"version": "0.0.3",
|
|
4
4
|
"description": "",
|
|
5
5
|
"main": "dist/cjs/index.js",
|
|
6
6
|
"types": "dist/cjs/index.d.ts",
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
"build:deps": "father prebundle",
|
|
12
12
|
"check": "father doctor",
|
|
13
13
|
"prepublishOnly": "father doctor && npm run build",
|
|
14
|
-
"pub": "npm run check && npm version patch && npm run build && npm publish --registry=https://registry.npmjs.org && git push origin master --tags
|
|
14
|
+
"pub": "npm run check && npm version patch && npm run build && npm publish --registry=https://registry.npmjs.org && git push origin master --tags"
|
|
15
15
|
},
|
|
16
16
|
"bin": {
|
|
17
17
|
"ns-rss-spider": "./dist/cjs/cli.js"
|
|
@@ -27,6 +27,7 @@
|
|
|
27
27
|
"access": "public"
|
|
28
28
|
},
|
|
29
29
|
"devDependencies": {
|
|
30
|
+
"@types/lodash": "^4.14.202",
|
|
30
31
|
"@types/probe-image-size": "^7.2.4",
|
|
31
32
|
"father": "^4.4.0",
|
|
32
33
|
"vitest": "^1.2.2"
|
|
@@ -34,7 +35,9 @@
|
|
|
34
35
|
"dependencies": {
|
|
35
36
|
"axios": "^1.6.7",
|
|
36
37
|
"cheerio": "^1.0.0-rc.12",
|
|
38
|
+
"dayjs": "^1.11.10",
|
|
37
39
|
"html-entities": "^2.4.0",
|
|
40
|
+
"lodash": "^4.17.21",
|
|
38
41
|
"probe-image-size": "^7.2.3",
|
|
39
42
|
"rss-parser": "^3.13.0",
|
|
40
43
|
"yargs-parser": "^21.1.1",
|