ns-rss-spider 0.0.33 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/cli.js +3 -1
- package/dist/cjs/parse.d.ts +4 -1
- package/dist/cjs/parse.js +7 -9
- package/dist/cjs/parseContent/parseRssList.d.ts +6 -0
- package/dist/cjs/parseContent/parseRssList.js +50 -0
- package/dist/cjs/parsers/36kr.js +0 -1
- package/dist/cjs/parsers/techrunch.d.ts +2 -0
- package/dist/cjs/parsers/techrunch.js +63 -0
- package/dist/cjs/strategy.js +3 -1
- package/dist/cjs/utils/browser.d.ts +5 -1
- package/dist/cjs/utils/browser.js +8 -3
- package/dist/cjs/utils/request.d.ts +1 -0
- package/dist/cjs/utils/request.js +49 -0
- package/package.json +1 -1
package/dist/cjs/cli.js
CHANGED
|
@@ -40,7 +40,9 @@ async function main() {
|
|
|
40
40
|
let rst;
|
|
41
41
|
switch (cmd) {
|
|
42
42
|
case "parse":
|
|
43
|
-
rst = await (0, import_index.parseRss)(rest.name, rest.feed,
|
|
43
|
+
rst = await (0, import_index.parseRss)(rest.name, rest.feed, {
|
|
44
|
+
server: rest.server
|
|
45
|
+
});
|
|
44
46
|
break;
|
|
45
47
|
default:
|
|
46
48
|
console.warn("未知命令", cmd);
|
package/dist/cjs/parse.d.ts
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
import { ServerInfo } from "./upload";
|
|
2
|
-
export declare function parseRss(name: string, feed: string,
|
|
2
|
+
export declare function parseRss(name: string, feed: string, options?: {
|
|
3
|
+
server?: ServerInfo;
|
|
4
|
+
take?: number;
|
|
5
|
+
}): Promise<{
|
|
3
6
|
contents: any[];
|
|
4
7
|
thumbs?: Record<string, string>;
|
|
5
8
|
}>;
|
package/dist/cjs/parse.js
CHANGED
|
@@ -32,20 +32,17 @@ __export(parse_exports, {
|
|
|
32
32
|
parseRss: () => parseRss
|
|
33
33
|
});
|
|
34
34
|
module.exports = __toCommonJS(parse_exports);
|
|
35
|
-
var import_rss_parser = __toESM(require("rss-parser"));
|
|
36
35
|
var import_zx = require("zx");
|
|
37
36
|
var import_strategy = require("./strategy");
|
|
38
37
|
var import_parseContent = require("./parseContent");
|
|
39
38
|
var import_upload = require("./upload");
|
|
40
39
|
var import_axios = __toESM(require("axios"));
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
}
|
|
46
|
-
});
|
|
40
|
+
var import_parseRssList = require("./parseContent/parseRssList");
|
|
41
|
+
async function parseRss(name, feed, options) {
|
|
42
|
+
const { server } = options || {};
|
|
43
|
+
console.log(import_zx.chalk.green("start", name, feed));
|
|
47
44
|
console.log(import_zx.chalk.green("正在拉取 rss 列表"));
|
|
48
|
-
const result = await
|
|
45
|
+
const result = await (0, import_parseRssList.parseRssList)(feed);
|
|
49
46
|
if (!result.items.length) {
|
|
50
47
|
throw Error("rss no conent");
|
|
51
48
|
}
|
|
@@ -66,7 +63,8 @@ async function parseRss(name, feed, server) {
|
|
|
66
63
|
item.guid = item.link;
|
|
67
64
|
}
|
|
68
65
|
});
|
|
69
|
-
|
|
66
|
+
const items = (options == null ? void 0 : options.take) ? result.items.slice(0, options.take) : result.items;
|
|
67
|
+
for (let item of items) {
|
|
70
68
|
console.log(import_zx.chalk.green(`正在解析文章 【${item.title}】`));
|
|
71
69
|
if (!item.guid) {
|
|
72
70
|
console.log(JSON.stringify(item));
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
var __create = Object.create;
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
6
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
7
|
+
var __export = (target, all) => {
|
|
8
|
+
for (var name in all)
|
|
9
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
10
|
+
};
|
|
11
|
+
var __copyProps = (to, from, except, desc) => {
|
|
12
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
13
|
+
for (let key of __getOwnPropNames(from))
|
|
14
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
15
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
16
|
+
}
|
|
17
|
+
return to;
|
|
18
|
+
};
|
|
19
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
20
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
21
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
22
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
23
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
24
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
25
|
+
mod
|
|
26
|
+
));
|
|
27
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
28
|
+
|
|
29
|
+
// src/parseContent/parseRssList.ts
|
|
30
|
+
var parseRssList_exports = {};
|
|
31
|
+
__export(parseRssList_exports, {
|
|
32
|
+
parseRssList: () => parseRssList
|
|
33
|
+
});
|
|
34
|
+
module.exports = __toCommonJS(parseRssList_exports);
|
|
35
|
+
var import_rss_parser = __toESM(require("rss-parser"));
|
|
36
|
+
var import_request = require("../utils/request");
|
|
37
|
+
async function parseRssList(feed) {
|
|
38
|
+
const xml = await (0, import_request.getPageHtmlByHttp)(feed);
|
|
39
|
+
const parser = new import_rss_parser.default({
|
|
40
|
+
customFields: {
|
|
41
|
+
item: ["source"]
|
|
42
|
+
}
|
|
43
|
+
});
|
|
44
|
+
const result = await parser.parseString(xml);
|
|
45
|
+
return result;
|
|
46
|
+
}
|
|
47
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
48
|
+
0 && (module.exports = {
|
|
49
|
+
parseRssList
|
|
50
|
+
});
|
package/dist/cjs/parsers/36kr.js
CHANGED
|
@@ -38,7 +38,6 @@ var _36kr = {
|
|
|
38
38
|
}).catch((e) => {
|
|
39
39
|
console.error(import_zx.chalk.red(`获取 ${url} 错误`), e);
|
|
40
40
|
});
|
|
41
|
-
console.log("initialState", initialState);
|
|
42
41
|
if ((_c = (_b = (_a = initialState == null ? void 0 : initialState.home) == null ? void 0 : _a.flow) == null ? void 0 : _b.itemList) == null ? void 0 : _c.length) {
|
|
43
42
|
const idThumbKv = initialState.home.flow.itemList.reduce((prev, item) => {
|
|
44
43
|
var _a2, _b2;
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
var __defProp = Object.defineProperty;
|
|
2
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
3
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
4
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
5
|
+
var __export = (target, all) => {
|
|
6
|
+
for (var name in all)
|
|
7
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
8
|
+
};
|
|
9
|
+
var __copyProps = (to, from, except, desc) => {
|
|
10
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
11
|
+
for (let key of __getOwnPropNames(from))
|
|
12
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
13
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
14
|
+
}
|
|
15
|
+
return to;
|
|
16
|
+
};
|
|
17
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
18
|
+
|
|
19
|
+
// src/parsers/techrunch.ts
|
|
20
|
+
var techrunch_exports = {};
|
|
21
|
+
__export(techrunch_exports, {
|
|
22
|
+
techrunch: () => techrunch
|
|
23
|
+
});
|
|
24
|
+
module.exports = __toCommonJS(techrunch_exports);
|
|
25
|
+
var import_zx = require("zx");
|
|
26
|
+
var import_constants = require("../utils/constants");
|
|
27
|
+
var import_browser = require("../utils/browser");
|
|
28
|
+
var techrunch = {
|
|
29
|
+
parse: true,
|
|
30
|
+
fetcher: "http",
|
|
31
|
+
getContentElementFromArticle: ($) => {
|
|
32
|
+
const el = $(".article-content");
|
|
33
|
+
return el;
|
|
34
|
+
},
|
|
35
|
+
ignoreProbeImage: true,
|
|
36
|
+
getThumbs: async (items) => {
|
|
37
|
+
var _a, _b;
|
|
38
|
+
const url = "https://techcrunch.com/";
|
|
39
|
+
const initialState = await (0, import_browser.runInPage)(url, async (page) => {
|
|
40
|
+
const data = await page.evaluate(async () => window.tc_app_data);
|
|
41
|
+
return data;
|
|
42
|
+
}, {
|
|
43
|
+
userAgent: import_constants.iosUA
|
|
44
|
+
}).catch((e) => {
|
|
45
|
+
console.error(import_zx.chalk.red(`获取 ${url} 错误`), e);
|
|
46
|
+
});
|
|
47
|
+
if ((_b = (_a = initialState == null ? void 0 : initialState.entities) == null ? void 0 : _a.posts) == null ? void 0 : _b.length) {
|
|
48
|
+
const idThumbKv = initialState.entities.posts.reduce((prev, item) => {
|
|
49
|
+
var _a2;
|
|
50
|
+
if (((_a2 = item == null ? void 0 : item.guid) == null ? void 0 : _a2.rendered) && item.jetpack_featured_media_url) {
|
|
51
|
+
prev[item.guid.rendered] = item.jetpack_featured_media_url;
|
|
52
|
+
}
|
|
53
|
+
return prev;
|
|
54
|
+
}, {});
|
|
55
|
+
return idThumbKv;
|
|
56
|
+
}
|
|
57
|
+
return void 0;
|
|
58
|
+
}
|
|
59
|
+
};
|
|
60
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
61
|
+
0 && (module.exports = {
|
|
62
|
+
techrunch
|
|
63
|
+
});
|
package/dist/cjs/strategy.js
CHANGED
|
@@ -27,12 +27,14 @@ var import_ifanr = require("./parsers/ifanr");
|
|
|
27
27
|
var import_theverge = require("./parsers/theverge");
|
|
28
28
|
var import_ithome = require("./parsers/ithome");
|
|
29
29
|
var import_kr = require("./parsers/36kr");
|
|
30
|
+
var import_techrunch = require("./parsers/techrunch");
|
|
30
31
|
var strategies = {
|
|
31
32
|
cnbeta: import_cnbeta.cnbeta,
|
|
32
33
|
ifanr: import_ifanr.ifanr,
|
|
33
34
|
theverge: import_theverge.theverge,
|
|
34
35
|
ithome: import_ithome.ithome,
|
|
35
|
-
"36kr": import_kr._36kr
|
|
36
|
+
"36kr": import_kr._36kr,
|
|
37
|
+
techrunch: import_techrunch.techrunch
|
|
36
38
|
};
|
|
37
39
|
// Annotate the CommonJS export names for ESM import in node:
|
|
38
40
|
0 && (module.exports = {
|
|
@@ -19,11 +19,15 @@ export declare function runInPage<T>(url: string, fn: (page: Page) => Promise<T>
|
|
|
19
19
|
* @param options
|
|
20
20
|
* @returns
|
|
21
21
|
*/
|
|
22
|
-
export declare function
|
|
22
|
+
export declare function getPageHtmlByBrowser(url: string, options?: {
|
|
23
23
|
scroll?: {
|
|
24
24
|
offset?: number;
|
|
25
25
|
interval?: number;
|
|
26
26
|
};
|
|
27
|
+
waitForElement?: {
|
|
28
|
+
selector: string;
|
|
29
|
+
timeout?: number;
|
|
30
|
+
};
|
|
27
31
|
} & RunOptions): Promise<string>;
|
|
28
32
|
export declare function scrollToBottom(page: Page, offset?: number, interval?: number): Promise<void>;
|
|
29
33
|
export {};
|
|
@@ -19,7 +19,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
19
19
|
// src/utils/browser.ts
|
|
20
20
|
var browser_exports = {};
|
|
21
21
|
__export(browser_exports, {
|
|
22
|
-
|
|
22
|
+
getPageHtmlByBrowser: () => getPageHtmlByBrowser,
|
|
23
23
|
runInPage: () => runInPage,
|
|
24
24
|
scrollToBottom: () => scrollToBottom
|
|
25
25
|
});
|
|
@@ -43,11 +43,16 @@ async function runInPage(url, fn, options) {
|
|
|
43
43
|
await browser.close();
|
|
44
44
|
return content;
|
|
45
45
|
}
|
|
46
|
-
async function
|
|
46
|
+
async function getPageHtmlByBrowser(url, options) {
|
|
47
47
|
return await runInPage(
|
|
48
48
|
url,
|
|
49
49
|
async (page) => {
|
|
50
50
|
var _a, _b;
|
|
51
|
+
if (options == null ? void 0 : options.waitForElement) {
|
|
52
|
+
await page.waitForSelector(options.waitForElement.selector, {
|
|
53
|
+
timeout: options.waitForElement.timeout || 1e3 * 3
|
|
54
|
+
});
|
|
55
|
+
}
|
|
51
56
|
if (options == null ? void 0 : options.scroll) {
|
|
52
57
|
await scrollToBottom(
|
|
53
58
|
page,
|
|
@@ -74,7 +79,7 @@ async function scrollToBottom(page, offset = 300, interval = 10) {
|
|
|
74
79
|
}
|
|
75
80
|
// Annotate the CommonJS export names for ESM import in node:
|
|
76
81
|
0 && (module.exports = {
|
|
77
|
-
|
|
82
|
+
getPageHtmlByBrowser,
|
|
78
83
|
runInPage,
|
|
79
84
|
scrollToBottom
|
|
80
85
|
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function getPageHtmlByHttp(feed: string): Promise<string>;
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
var __create = Object.create;
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
6
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
7
|
+
var __export = (target, all) => {
|
|
8
|
+
for (var name in all)
|
|
9
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
10
|
+
};
|
|
11
|
+
var __copyProps = (to, from, except, desc) => {
|
|
12
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
13
|
+
for (let key of __getOwnPropNames(from))
|
|
14
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
15
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
16
|
+
}
|
|
17
|
+
return to;
|
|
18
|
+
};
|
|
19
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
20
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
21
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
22
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
23
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
24
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
25
|
+
mod
|
|
26
|
+
));
|
|
27
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
28
|
+
|
|
29
|
+
// src/utils/request.ts
|
|
30
|
+
var request_exports = {};
|
|
31
|
+
__export(request_exports, {
|
|
32
|
+
getPageHtmlByHttp: () => getPageHtmlByHttp
|
|
33
|
+
});
|
|
34
|
+
module.exports = __toCommonJS(request_exports);
|
|
35
|
+
var import_axios = __toESM(require("axios"));
|
|
36
|
+
var import_constants = require("./constants");
|
|
37
|
+
async function getPageHtmlByHttp(feed) {
|
|
38
|
+
const res = await import_axios.default.get(feed, {
|
|
39
|
+
responseType: "text",
|
|
40
|
+
headers: {
|
|
41
|
+
"User-Agent": import_constants.iosUA
|
|
42
|
+
}
|
|
43
|
+
});
|
|
44
|
+
return res.data;
|
|
45
|
+
}
|
|
46
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
47
|
+
0 && (module.exports = {
|
|
48
|
+
getPageHtmlByHttp
|
|
49
|
+
});
|