ns-rss-spider 1.1.5 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/dist/cjs/parseContent/index.js +36 -5
  2. package/dist/cjs/parseContent/newsletter/parseHtml.d.ts +4 -0
  3. package/dist/cjs/parseContent/newsletter/parseHtml.js +54 -0
  4. package/dist/cjs/parseContent/newsletter/parseNewsLetter.d.ts +4 -0
  5. package/dist/cjs/parseContent/newsletter/parseNewsLetter.js +50 -0
  6. package/dist/cjs/parseContent/newsletter/parseNoteItem.d.ts +4 -0
  7. package/dist/cjs/parseContent/newsletter/parseNoteItem.js +59 -0
  8. package/dist/cjs/parseContent/newsletter/plainToGroup.d.ts +8 -0
  9. package/dist/cjs/parseContent/newsletter/plainToGroup.js +73 -0
  10. package/dist/cjs/parseContent/newsletter/tpl.d.ts +11 -0
  11. package/dist/cjs/parseContent/newsletter/tpl.js +86 -0
  12. package/dist/cjs/parseContent/newsletter/types.d.ts +46 -0
  13. package/dist/cjs/parseContent/newsletter/types.js +17 -0
  14. package/dist/cjs/parseContent/parseContent.d.ts +2 -2
  15. package/dist/cjs/parseContent/parseContent.js +3 -6
  16. package/dist/cjs/parseContent/stripeHtml.d.ts +2 -1
  17. package/dist/cjs/parseContent/stripeHtml.js +6 -1
  18. package/dist/cjs/parsers/avanderlee.js +10 -1
  19. package/dist/cjs/parsers/cnbeta.js +1 -1
  20. package/dist/cjs/parsers/cssweekly.d.ts +4 -0
  21. package/dist/cjs/parsers/cssweekly.js +51 -0
  22. package/dist/cjs/parsers/iosdevweekly.d.ts +4 -0
  23. package/dist/cjs/parsers/iosdevweekly.js +56 -0
  24. package/dist/cjs/parsers/javascriptweekly.d.ts +8 -0
  25. package/dist/cjs/parsers/javascriptweekly.js +124 -0
  26. package/dist/cjs/strategy.js +10 -1
  27. package/dist/cjs/types.d.ts +18 -3
  28. package/dist/cjs/upload/index.d.ts +1 -1
  29. package/dist/cjs/utils/arraySplit.d.ts +1 -0
  30. package/dist/cjs/utils/arraySplit.js +42 -0
  31. package/package.json +1 -1
@@ -49,19 +49,50 @@ async function parseContent(item, strategy) {
49
49
  console.log(import_zx.chalk.green("正在拉取文章内容"), item.link);
50
50
  const article = await (0, import_getArticleHtml.getArticleHtml)(item.link, strategy.fetcher);
51
51
  $ = cheerio.load(article);
52
- element = ((_a = strategy.getContentElementFromArticle) == null ? void 0 : _a.call(strategy, $)) || $.root();
52
+ element = ((_a = strategy.getContentElementFromArticle) == null ? void 0 : _a.call(strategy, $)) || $.root().find("body");
53
+ }
54
+ let newsletter;
55
+ if ((strategy == null ? void 0 : strategy.articleType) === "newsletter") {
56
+ console.log("解析 newsletter");
57
+ const ns = await strategy.run($, element).catch((e) => {
58
+ console.error(import_zx.chalk.red("parse topic error"));
59
+ console.error(e);
60
+ return void 0;
61
+ });
62
+ if (!ns || !ns.topics.length) {
63
+ console.error(import_zx.chalk.red("未解析到 topics"));
64
+ console.log(ns);
65
+ } else {
66
+ newsletter = ns;
67
+ }
53
68
  }
54
69
  console.log(import_zx.chalk.green("正在预处理 html"));
55
- (0, import_stripeHtml.stripeHtml)($, element);
56
- if (!(strategy == null ? void 0 : strategy.parse)) {
70
+ (0, import_stripeHtml.stripeHtml)($, element, strategy);
71
+ const base = {
72
+ ...(0, import_getBasicFromItem.getBasicFromItem)(item),
73
+ content: ((_b = element.html()) == null ? void 0 : _b.trim()) || ""
74
+ };
75
+ if (!(strategy == null ? void 0 : strategy.parse) || strategy.articleType === "newsletter" && !newsletter) {
57
76
  return {
58
77
  type: "simple",
59
- ...(0, import_getBasicFromItem.getBasicFromItem)(item),
60
- content: ((_b = element.html()) == null ? void 0 : _b.trim()) || ""
78
+ ...base
79
+ };
80
+ }
81
+ if (strategy.articleType === "newsletter") {
82
+ return {
83
+ type: "newsletter",
84
+ ...base,
85
+ newsletter
61
86
  };
62
87
  }
63
88
  console.log(import_zx.chalk.green("正在解析文章内容"));
64
89
  const rich = await (0, import_parseContent.parseContent)($, element, item, strategy);
90
+ if (!rich) {
91
+ return {
92
+ type: "simple",
93
+ ...base
94
+ };
95
+ }
65
96
  let extra = {};
66
97
  if (rich.type === "rich") {
67
98
  if (strategy.getExtraItems) {
@@ -0,0 +1,4 @@
1
+ import { AnyNode, Cheerio, CheerioAPI } from "cheerio";
2
+ export declare const parseHtml: ($: CheerioAPI, $el: Cheerio<AnyNode>, options?: {
3
+ removeLink?: boolean | undefined;
4
+ }) => string;
@@ -0,0 +1,54 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parseContent/newsletter/parseHtml.ts
20
+ var parseHtml_exports = {};
21
+ __export(parseHtml_exports, {
22
+ parseHtml: () => parseHtml
23
+ });
24
+ module.exports = __toCommonJS(parseHtml_exports);
25
+ var import_html_entities = require("html-entities");
26
/**
 * Reduces the markup inside `$el` to a minimal form and returns it as a string.
 *
 * Every descendant element has its attributes stripped, except that anchors
 * keep `href` and images keep `src`. When `options.removeLink` is truthy each
 * anchor is replaced by `<span class="ns-desc-link" data-href="…">` holding the
 * anchor's text. Empty container elements are removed. The resulting inner HTML
 * is trimmed and passed through `decode` from `html-entities`.
 *
 * Note: `$el` is mutated in place.
 *
 * @param $ Cheerio API bound to the document that owns `$el`.
 * @param $el Cheerio selection whose descendants are cleaned.
 * @param options Optional flags; `removeLink` swaps anchors for spans.
 * @returns The cleaned, entity-decoded inner HTML ("" when there is none).
 */
var parseHtml = ($, $el, options = {}) => {
  const { removeLink } = options;
  // Tags that are dropped once they have neither child elements nor text.
  const removableWhenEmpty = ["div", "span", "p", "ol", "ul", "blockquote", "a", "hr"];
  // Replace a node's attribute map with just `name`, omitting it when absent
  // so that a missing attribute is never emitted as an empty/"undefined" value.
  const keepOnly = (node, name) => {
    const value = node.attribs[name];
    node.attribs = value === undefined ? {} : { [name]: value };
  };
  $el.find("*").each(function() {
    const tag = this.tagName.toLowerCase();
    const $node = $(this);
    if (tag === "a") {
      if (removeLink) {
        // Build the replacement through the DOM API rather than string
        // interpolation so the href and text cannot break the markup.
        const $span = $("<span></span>").addClass("ns-desc-link").text($node.text());
        const href = this.attribs.href;
        if (href !== undefined) {
          $span.attr("data-href", href);
        }
        $node.replaceWith($span);
      } else {
        keepOnly(this, "href");
      }
    } else if (tag === "img") {
      // The element name is "img"; comparing against "image" never matched,
      // which sent images to the catch-all branch and stripped their src.
      keepOnly(this, "src");
    } else {
      this.attribs = {};
    }
    if (removableWhenEmpty.includes(tag) && $node.children().length === 0 && $node.text().trim() === "") {
      $node.remove();
    }
  });
  // html() yields null for an empty selection; treat that as empty markup.
  const html = $el.html();
  return (0, import_html_entities.decode)(html == null ? "" : html.trim());
};
51
+ // Annotate the CommonJS export names for ESM import in node:
52
+ 0 && (module.exports = {
53
+ parseHtml
54
+ });
@@ -0,0 +1,4 @@
1
+ import { AnyNode, Cheerio, CheerioAPI } from "cheerio";
2
+ import { ParseTpl } from "./types";
3
+ import { SnapshotResult } from "./types";
4
+ export declare function parseNewsLetter(tpl: ParseTpl, $: CheerioAPI, $root: Cheerio<AnyNode>): SnapshotResult;
@@ -0,0 +1,50 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parseContent/newsletter/parseNewsLetter.ts
20
+ var parseNewsLetter_exports = {};
21
+ __export(parseNewsLetter_exports, {
22
+ parseNewsLetter: () => parseNewsLetter
23
+ });
24
+ module.exports = __toCommonJS(parseNewsLetter_exports);
25
+ var import_parseHtml = require("./parseHtml");
26
/**
 * Builds the newsletter-level part of a snapshot: an empty `name`, an empty
 * `topics` list, and — when the template provides selectors for them — the
 * cleaned description HTML and the first matching image's `src`.
 *
 * @param tpl Parse template; reads `description`, `descriptionRemoveLink`, `image`.
 * @param $ Cheerio API for the document.
 * @param $root Cheerio selection the selectors are resolved against.
 * @returns The snapshot skeleton; topics are filled in by the caller.
 */
function parseNewsLetter(tpl, $, $root) {
  const { description: descSelector, image: imageSelector } = tpl;
  let descriptionHtml = "";
  // Only clean the description when the selector matches something with markup.
  if (descSelector && $root.find(descSelector).html()) {
    descriptionHtml = (0, import_parseHtml.parseHtml)($, $root.find(descSelector), {
      removeLink: tpl.descriptionRemoveLink
    });
  }
  return {
    name: "",
    topics: [],
    ...(descriptionHtml ? { description: descriptionHtml } : {}),
    ...(imageSelector ? { image: $root.find(imageSelector).first().attr("src") } : {})
  };
}
47
+ // Annotate the CommonJS export names for ESM import in node:
48
+ 0 && (module.exports = {
49
+ parseNewsLetter
50
+ });
@@ -0,0 +1,4 @@
1
+ import type { AnyNode, Cheerio, CheerioAPI } from "cheerio";
2
+ import { ParseTpl, ParseTplTopicCommon } from "./types";
3
+ import { SnapshotNote } from './types';
4
+ export declare function parseNoteItem($: CheerioAPI, $el: Cheerio<AnyNode>, tpl: ParseTplTopicCommon, transform?: ParseTpl["transform"]): SnapshotNote | undefined;
@@ -0,0 +1,59 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parseContent/newsletter/parseNoteItem.ts
20
+ var parseNoteItem_exports = {};
21
+ __export(parseNoteItem_exports, {
22
+ parseNoteItem: () => parseNoteItem
23
+ });
24
+ module.exports = __toCommonJS(parseNoteItem_exports);
25
+ var import_parseHtml = require("./parseHtml");
26
+ function parseNoteItem($, $el, tpl, transform) {
27
+ const $link = $el.find(tpl.noteLink).first();
28
+ const url = $link.attr("href");
29
+ if (!url) {
30
+ return void 0;
31
+ }
32
+ const $tag = $el.find(tpl.noteTag).first();
33
+ const tag = $tag.text().trim();
34
+ const $desc = $el.find(tpl.noteBody).first();
35
+ let title = tpl.noteNoTitle ? "" : $link.text().trim();
36
+ title = (transform == null ? void 0 : transform("noteTitle", title)) || title;
37
+ let descriptionHtml;
38
+ if (tpl.noteRichDesc) {
39
+ descriptionHtml = (0, import_parseHtml.parseHtml)($, $desc, {
40
+ removeLink: tpl.noteRichDescRemoveLink
41
+ });
42
+ }
43
+ let description = $desc.text().trim() || "";
44
+ description = (transform == null ? void 0 : transform("noteDesc", description)) || description;
45
+ const rst = {
46
+ title: tpl.noteNoTitle ? "" : $link.text().trim(),
47
+ url,
48
+ description,
49
+ tag
50
+ };
51
+ if (descriptionHtml) {
52
+ rst.descriptionHtml = descriptionHtml;
53
+ }
54
+ return rst;
55
+ }
56
+ // Annotate the CommonJS export names for ESM import in node:
57
+ 0 && (module.exports = {
58
+ parseNoteItem
59
+ });
@@ -0,0 +1,8 @@
1
+ import { AnyNode, Cheerio, CheerioAPI } from "cheerio";
2
+ export declare const emptyGroup: string;
3
+ export declare const emptyNote: string;
4
+ export declare function plainToGroup({ root, isTopicTitle, isTitle, }: {
5
+ root: Cheerio<AnyNode>;
6
+ isTopicTitle: (node: Cheerio<AnyNode>) => boolean;
7
+ isTitle: (node: Cheerio<AnyNode>) => boolean;
8
+ }): CheerioAPI;
@@ -0,0 +1,73 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parseContent/newsletter/plainToGroup.ts
20
+ var plainToGroup_exports = {};
21
+ __export(plainToGroup_exports, {
22
+ emptyGroup: () => emptyGroup,
23
+ emptyNote: () => emptyNote,
24
+ plainToGroup: () => plainToGroup
25
+ });
26
+ module.exports = __toCommonJS(plainToGroup_exports);
27
+ var import_cheerio = require("cheerio");
28
+ var emptyGroup = `
29
+ <div class='ns-topic'>
30
+ <div class='ns-topic-title'></div>
31
+ <div class='ns-topic-content'></div>
32
+ </div>
33
+ `.trim();
34
+ var emptyNote = `
35
+ <div class='ns-note'>
36
+ <div class='ns-note-title'></div>
37
+ <div class='ns-note-content'></div>
38
+ </div>
39
+ `.trim();
40
/**
 * Regroups the flat children of `root` into nested topic/note markup inside a
 * fresh cheerio document.
 *
 * Each child is classified by the supplied predicates:
 * - `isTopicTitle` starts a new `.ns-topic` and stores a clone of the child as its title;
 * - `isTitle` starts a new `.ns-note` inside the current topic;
 * - anything else is cloned into the current note's `.ns-note-content`.
 *
 * @param root Cheerio selection whose direct children are regrouped.
 * @param isTopicTitle Predicate identifying topic headings.
 * @param isTitle Predicate identifying note headings.
 * @returns The cheerio API of the new document holding the grouped markup.
 */
function plainToGroup({
  root,
  isTopicTitle,
  isTitle
}) {
  const $ = (0, import_cheerio.load)("<body></body>");
  let currentTopic = $(emptyGroup);
  // The initial untitled topic is attached only once it receives a note.
  // Previously it was never attached, so notes that appeared before the
  // first topic title were silently dropped from the result.
  let topicAttached = false;
  let currentNote = null;
  root.children().each((_i, node) => {
    const $node = $(node);
    if (isTopicTitle($node)) {
      currentTopic = $(emptyGroup);
      currentTopic.find(".ns-topic-title").append($node.clone());
      $("body").append(currentTopic);
      topicAttached = true;
    } else if (isTitle($node)) {
      if (!topicAttached) {
        $("body").append(currentTopic);
        topicAttached = true;
      }
      currentNote = $(emptyNote);
      currentNote.find(".ns-note-title").append($node.clone());
      currentTopic.find(".ns-topic-content").append(currentNote);
    } else if (currentNote) {
      currentNote.find(".ns-note-content").append($node.clone());
    } else {
      // Content seen before any note heading has nowhere to go.
      console.error("invalid currentNote");
    }
  });
  return $;
}
68
+ // Annotate the CommonJS export names for ESM import in node:
69
+ 0 && (module.exports = {
70
+ emptyGroup,
71
+ emptyNote,
72
+ plainToGroup
73
+ });
@@ -0,0 +1,11 @@
1
+ import type { AnyNode, Cheerio, CheerioAPI } from "cheerio";
2
+ import type { SnapshotResult } from "./types";
3
+ import { ParseTpl } from "./types";
4
+ /**
5
+ * linear 和 group 并存场景下的解析思路
6
+ *
7
+ * 1. 通过 selector 选出 linearTitle、linearItem、group、linearTitle、linearItem、group... 序列
8
+ * 2. 遇到一个 linearTitle 或者一个 group,就组成一个新的 group
9
+ *
10
+ */
11
+ export declare const parse: (tpl: ParseTpl, $: CheerioAPI, el: Cheerio<AnyNode>) => SnapshotResult;
@@ -0,0 +1,86 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parseContent/newsletter/tpl.ts
20
+ var tpl_exports = {};
21
+ __export(tpl_exports, {
22
+ parse: () => parse
23
+ });
24
+ module.exports = __toCommonJS(tpl_exports);
25
+ var import_parseNewsLetter = require("./parseNewsLetter");
26
+ var import_parseNoteItem = require("./parseNoteItem");
27
/**
 * Parses a newsletter document into a snapshot using a selector template.
 *
 * Linear and grouped layouts may coexist. All elements matching the linear
 * title selector, the linear item selector or the group selector are visited
 * in the order cheerio returns them:
 * - a linear title opens a new topic;
 * - a linear item is added to the most recent topic (an untitled topic is
 *   created when none exists yet);
 * - a group element becomes a topic of its own, with its title and items
 *   resolved inside the group.
 * Topics that end up without notes are discarded.
 *
 * @param tpl Parse template (`linearTopic`, `groupTopic`, `transform`, …).
 * @param $ Cheerio API for the document.
 * @param el Element or selection to search within.
 * @returns The snapshot from `parseNewsLetter` with `topics` filled in.
 * @throws Error when a matched node is not an element (nodeType !== 1).
 */
var parse = (tpl, $, el) => {
  const rst = (0, import_parseNewsLetter.parseNewsLetter)(tpl, $, $(el));
  const linear = tpl.linearTopic;
  const grouped = tpl.groupTopic;
  // Topic titles go through the "noteTitle" transform; a falsy result keeps the raw text.
  const transformTitle = (raw) => (tpl.transform ? tpl.transform("noteTitle", raw) : void 0) || raw;
  const selectors = [];
  if (linear) {
    selectors.push(linear.title, linear.item);
  }
  if (grouped) {
    selectors.push(`${grouped.group}`.trim());
  }
  if (!selectors.length) {
    // Nothing to look for: the template defines neither layout.
    rst.topics = [];
    return rst;
  }
  const topics = [];
  $(el).find(selectors.join(",")).each((_, node) => {
    if (node.nodeType !== 1) {
      throw Error("invalid title or item: nodeType !== 1");
    }
    const $node = $(node);
    if (linear && linear.matchTitle($node)) {
      topics.push({
        title: transformTitle($node.text().trim()),
        notes: []
      });
    } else if (linear && linear.matchItem($node)) {
      if (!topics.length) {
        // Items before the first heading are kept under an untitled topic.
        topics.push({
          title: "",
          notes: []
        });
      }
      const note = (0, import_parseNoteItem.parseNoteItem)($, $node, linear, tpl.transform);
      if (note) {
        topics[topics.length - 1].notes.push(note);
      }
    } else if (grouped) {
      const title = transformTitle($node.find(grouped.title).first().text().trim());
      const notes = Array.from($node.find(grouped.item)).map((item) => (0, import_parseNoteItem.parseNoteItem)($, $(item), grouped, tpl.transform)).filter((n) => !!n);
      topics.push({
        title,
        notes
      });
    }
    // Otherwise the element matched a linear selector but neither predicate
    // accepted it and no group template exists. It is skipped; previously
    // this path dereferenced the missing group template and threw.
  });
  rst.topics = topics.filter((topic) => topic.notes.length > 0);
  return rst;
};
83
+ // Annotate the CommonJS export names for ESM import in node:
84
+ 0 && (module.exports = {
85
+ parse
86
+ });
@@ -0,0 +1,46 @@
1
+ import { AnyNode, Cheerio } from "cheerio";
2
+ export type Selector = string;
3
+ export interface ParseTplTopicCommon {
4
+ root?: Selector;
5
+ title: Selector;
6
+ item: Selector;
7
+ noteLink: Selector;
8
+ noteBody?: Selector;
9
+ noteFrom?: Selector;
10
+ noteNoTitle?: boolean;
11
+ noteTag?: Selector;
12
+ noteRichDesc?: boolean;
13
+ noteRichDescRemoveLink?: boolean;
14
+ }
15
+ export interface ParseTpl {
16
+ description?: Selector;
17
+ descriptionRemoveLink?: boolean;
18
+ image?: Selector;
19
+ transform?: (type: "date" | "noteTitle" | "noteDesc", content: string) => string | undefined;
20
+ linearTopic?: ParseTplTopicCommon & {
21
+ matchTitle: (el: Cheerio<AnyNode>) => boolean;
22
+ matchItem: (el: Cheerio<AnyNode>) => boolean;
23
+ };
24
+ groupTopic?: {
25
+ group: Selector;
26
+ } & ParseTplTopicCommon;
27
+ }
28
+ export interface SnapshotResult {
29
+ name: string;
30
+ description?: string;
31
+ image?: string;
32
+ topics: SnapshotTopic[];
33
+ }
34
+ export interface SnapshotTopic {
35
+ title: string;
36
+ description?: string;
37
+ image?: string;
38
+ notes: SnapshotNote[];
39
+ }
40
+ export interface SnapshotNote {
41
+ url: string;
42
+ title: string;
43
+ description: string;
44
+ descriptionHtml?: string;
45
+ tag?: string;
46
+ }
@@ -0,0 +1,17 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __copyProps = (to, from, except, desc) => {
6
+ if (from && typeof from === "object" || typeof from === "function") {
7
+ for (let key of __getOwnPropNames(from))
8
+ if (!__hasOwnProp.call(to, key) && key !== except)
9
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
10
+ }
11
+ return to;
12
+ };
13
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
14
+
15
+ // src/parseContent/newsletter/types.ts
16
+ var types_exports = {};
17
+ module.exports = __toCommonJS(types_exports);
@@ -1,3 +1,3 @@
1
- import type { Article, ParseStrategy, RssItem } from "../types";
1
+ import type { Article, ParseStrategyArticle, RssItem } from "../types";
2
2
  import { AnyNode, Cheerio, CheerioAPI } from "cheerio";
3
- export declare function parseContent($: CheerioAPI, $element: Cheerio<AnyNode>, item: RssItem, strategy?: ParseStrategy): Promise<Article>;
3
+ export declare function parseContent($: CheerioAPI, $element: Cheerio<AnyNode>, item: RssItem, strategy?: ParseStrategyArticle): Promise<Article | undefined>;
@@ -47,7 +47,7 @@ async function parseContent($, $element, item, strategy) {
47
47
  if (node.is("img")) {
48
48
  const src = node.attr("src");
49
49
  const title = node.attr("title") || void 0;
50
- if (src) {
50
+ if (src && !src.startsWith("data:")) {
51
51
  srcs.push({
52
52
  src,
53
53
  title
@@ -55,6 +55,7 @@ async function parseContent($, $element, item, strategy) {
55
55
  }
56
56
  }
57
57
  });
58
+ console.log("开始解析图片,数目:", srcs.length);
58
59
  for (let item2 of srcs) {
59
60
  if (strategy == null ? void 0 : strategy.ignoreProbeImage) {
60
61
  images.push({
@@ -109,11 +110,7 @@ async function parseContent($, $element, item, strategy) {
109
110
  if (!content) {
110
111
  console.error(import_zx.chalk.red("解析 html 内容出错, html 如下"));
111
112
  console.log($.html());
112
- return {
113
- type: "simple",
114
- content: "",
115
- ...(0, import_getBasicFromItem.getBasicFromItem)(item)
116
- };
113
+ return void 0;
117
114
  }
118
115
  return {
119
116
  type: "rich",
@@ -1,2 +1,3 @@
1
1
  import type { Cheerio, AnyNode, CheerioAPI } from "cheerio";
2
- export declare function stripeHtml($: CheerioAPI, element: Cheerio<AnyNode>): void;
2
+ import type { ParseStrategy } from "../types";
3
+ export declare function stripeHtml($: CheerioAPI, element: Cheerio<AnyNode>, strategy?: ParseStrategy): void;
@@ -23,7 +23,7 @@ __export(stripeHtml_exports, {
23
23
  });
24
24
  module.exports = __toCommonJS(stripeHtml_exports);
25
25
  var import_utils = require("./utils");
26
- function stripeHtml($, element) {
26
+ function stripeHtml($, element, strategy) {
27
27
  $(element).find("script").remove();
28
28
  (0, import_utils.walk_the_DOM)($, element, (el) => {
29
29
  for (let attr of [
@@ -39,6 +39,11 @@ function stripeHtml($, element) {
39
39
  $(el).removeAttr("target");
40
40
  }
41
41
  if ($(el).is("img")) {
42
+ if (strategy == null ? void 0 : strategy.imageSrcAttr) {
43
+ const src2 = $(el).attr(strategy.imageSrcAttr);
44
+ $(el).attr("src", src2);
45
+ ;
46
+ }
42
47
  const src = $(el).attr("src");
43
48
  if (src == null ? void 0 : src.startsWith("//")) {
44
49
  console.log("img `//` 开头域名添加 https", src);
@@ -23,7 +23,16 @@ __export(avanderlee_exports, {
23
23
  });
24
24
  module.exports = __toCommonJS(avanderlee_exports);
25
25
  var avanderlee = {
26
- parse: false
26
+ parse: true,
27
+ fetcher: "http",
28
+ imageSrcAttr: "data-lazy-src",
29
+ getContentElementFromArticle: ($) => {
30
+ const el = $("#content > article > section.post-content");
31
+ for (let sel of [".gotngnp", ".article-inline-newsletter-container", "noscript", 'a > svg[aria-hidden="true"]']) {
32
+ $(el).find(sel).remove();
33
+ }
34
+ return el;
35
+ }
27
36
  };
28
37
  // Annotate the CommonJS export names for ESM import in node:
29
38
  0 && (module.exports = {
@@ -57,7 +57,7 @@ var cnbeta = {
57
57
  阅读全文`, "")
58
58
  };
59
59
  },
60
- //ignoreProbeImage: true,
60
+ ignoreProbeImage: true,
61
61
  getThumbs: async (items) => {
62
62
  const res = await import_axios.default.get("http://m.cnbeta.com.tw/", {
63
63
  responseType: "text",
@@ -0,0 +1,4 @@
1
+ import { ParseTpl } from "../parseContent/newsletter/types";
2
+ import { ParseStrategy } from "../types";
3
+ export declare const tpl: ParseTpl;
4
+ export declare const cssweekly: ParseStrategy;
@@ -0,0 +1,51 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parsers/cssweekly.ts
20
+ var cssweekly_exports = {};
21
+ __export(cssweekly_exports, {
22
+ cssweekly: () => cssweekly,
23
+ tpl: () => tpl
24
+ });
25
+ module.exports = __toCommonJS(cssweekly_exports);
26
+ var import_tpl = require("../parseContent/newsletter/tpl");
27
// Selector template for the cssweekly parser: a description block plus
// grouped topics (sections containing articles).
var tpl = {
  description: ".newsletter-intro",
  groupTopic: {
    group: "section.newsletter-section",
    title: "h2.section-title",
    item: "article.newsletter-article",
    noteLink: ".newsletter-header h2.article-title a",
    noteBody: "p",
    noteRichDesc: true
  }
};

// Strategy entry: fetched over plain HTTP and parsed as a newsletter by
// handing the document to the template parser.
var cssweekly = {
  parse: true,
  fetcher: "http",
  articleType: "newsletter",
  run: async ($, el) => (0, import_tpl.parse)(tpl, $, el)
};
47
+ // Annotate the CommonJS export names for ESM import in node:
48
+ 0 && (module.exports = {
49
+ cssweekly,
50
+ tpl
51
+ });
@@ -0,0 +1,4 @@
1
+ import { ParseTpl } from "../parseContent/newsletter/types";
2
+ import { ParseStrategy } from "../types";
3
+ export declare const tpl: ParseTpl;
4
+ export declare const iosdevweekly: ParseStrategy;
@@ -0,0 +1,56 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parsers/iosdevweekly.ts
20
+ var iosdevweekly_exports = {};
21
+ __export(iosdevweekly_exports, {
22
+ iosdevweekly: () => iosdevweekly,
23
+ tpl: () => tpl
24
+ });
25
+ module.exports = __toCommonJS(iosdevweekly_exports);
26
+ var import_tpl = require("../parseContent/newsletter/tpl");
27
// Selector template for the iosdevweekly parser. Links are replaced by spans
// in both the newsletter description and the rich note descriptions.
var tpl = {
  description: ".category.cc-comment #comment > .item--issue",
  descriptionRemoveLink: true,
  groupTopic: {
    group: ".issue__body > .category",
    title: "h2.category__title",
    item: ".item.item--issue",
    noteLink: "h3.item__title > a",
    noteBody: "p",
    noteRichDesc: true,
    noteRichDescRemoveLink: true
  }
};

// Strategy entry: content is taken from `article.issue` and parsed as a
// newsletter through the template parser.
var iosdevweekly = {
  parse: true,
  fetcher: "http",
  articleType: "newsletter",
  getContentElementFromArticle: ($) => $("article.issue"),
  run: async ($, el) => (0, import_tpl.parse)(tpl, $, el)
};
52
+ // Annotate the CommonJS export names for ESM import in node:
53
+ 0 && (module.exports = {
54
+ iosdevweekly,
55
+ tpl
56
+ });
@@ -0,0 +1,8 @@
1
+ import { ParseTpl } from "../parseContent/newsletter/types";
2
+ import { ParseStrategy } from "../types";
3
+ import { CheerioAPI } from "cheerio";
4
+ export declare const tpl: ParseTpl;
5
+ export declare function transform($: CheerioAPI): CheerioAPI;
6
+ export declare function transformFromInDesc($: CheerioAPI): CheerioAPI;
7
+ export declare function transformExceptionGroup($: CheerioAPI): CheerioAPI;
8
+ export declare const javascriptweekly: ParseStrategy;
@@ -0,0 +1,124 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
+ var __getOwnPropNames = Object.getOwnPropertyNames;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __export = (target, all) => {
6
+ for (var name in all)
7
+ __defProp(target, name, { get: all[name], enumerable: true });
8
+ };
9
+ var __copyProps = (to, from, except, desc) => {
10
+ if (from && typeof from === "object" || typeof from === "function") {
11
+ for (let key of __getOwnPropNames(from))
12
+ if (!__hasOwnProp.call(to, key) && key !== except)
13
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
14
+ }
15
+ return to;
16
+ };
17
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+
19
+ // src/parsers/javascriptweekly.ts
20
+ var javascriptweekly_exports = {};
21
+ __export(javascriptweekly_exports, {
22
+ javascriptweekly: () => javascriptweekly,
23
+ tpl: () => tpl,
24
+ transform: () => transform,
25
+ transformExceptionGroup: () => transformExceptionGroup,
26
+ transformFromInDesc: () => transformFromInDesc
27
+ });
28
+ module.exports = __toCommonJS(javascriptweekly_exports);
29
+ var import_tpl = require("../parseContent/newsletter/tpl");
30
+ var import_arraySplit = require("../utils/arraySplit");
31
+ var r = "#content";
32
+ var tpl = {
33
+ image: `${r} .el-fullwidthimage img`,
34
+ transform: (type, desc) => {
35
+ var _a;
36
+ if (type === "noteDesc") {
37
+ let d = desc.trim();
38
+ if (d.startsWith("—")) {
39
+ d = d.substring(1).trim();
40
+ }
41
+ return d;
42
+ } else if (type === "noteTitle" && desc.endsWith(":")) {
43
+ return desc.substring(0, desc.length - 1);
44
+ } else if (type === "date") {
45
+ return ((_a = desc.split("—")[1]) == null ? void 0 : _a.trim()) || "";
46
+ }
47
+ },
48
+ linearTopic: {
49
+ title: `${r} > .el-heading`,
50
+ matchTitle: (el) => el.hasClass("el-heading"),
51
+ item: `${r} > .el-item, ${r} > .miniitem.item`,
52
+ matchItem: (el) => ["el-item", "miniitem"].some((cls) => el.hasClass(cls)),
53
+ noteLink: ".desc a",
54
+ noteBody: ".desc",
55
+ noteFrom: ".name",
56
+ noteTag: ".tag-sponsor",
57
+ noteRichDesc: true,
58
+ noteRichDescRemoveLink: true
59
+ },
60
+ groupTopic: {
61
+ group: `${r} .content.el-md, ${r} > .el-subtable table.content.el-content.briefs`,
62
+ title: "tbody > tr > td > p",
63
+ item: "tbody > tr > td > ul > li",
64
+ noteLink: "a",
65
+ noteBody: "*",
66
+ noteTag: ".tag-sponsor",
67
+ noteRichDesc: true,
68
+ noteRichDescRemoveLink: true
69
+ //noteDescIncludeTitle: true,
70
+ }
71
+ };
72
/**
 * Runs the document pre-processing steps in order: first the exception-group
 * rewrite, then the miniitem `.name` cleanup.
 *
 * @param $ Cheerio API for the document; mutated in place.
 * @returns The same `$`.
 */
function transform($) {
  for (const step of [transformExceptionGroup, transformFromInDesc]) {
    step($);
  }
  return $;
}
77
/**
 * For every linear item carrying the `miniitem` class, removes the last
 * `.name` element found directly inside its `p.desc`.
 *
 * @param $ Cheerio API for the document; mutated in place.
 * @returns The same `$`.
 */
function transformFromInDesc($) {
  $(tpl.linearTopic.item).each((_index, item) => {
    const $item = $(item);
    if (!$item.hasClass("miniitem")) {
      return;
    }
    $item.find("tbody > tr > td > p.desc > .name").last().remove();
  });
  return $;
}
87
/**
 * Rewrites groups whose cell holds exactly two paragraphs (`<p><p>`) into the
 * `<p>` + `<ul><li><p>…</p></li></ul>` shape the group template expects.
 *
 * The second paragraph's child nodes are split at `<br>` elements with
 * `arraySplit`; the paragraph is replaced by an empty `<ul>`, and each
 * segment is moved into its own `<li><p>`.
 *
 * @param $ Cheerio API for the document; mutated in place.
 * @returns The same `$`.
 */
function transformExceptionGroup($) {
  $(tpl.groupTopic.group).each((_index, group) => {
    const $cells = $(group).find("tbody > tr > td");
    const childTags = Array.from($cells.children(), (child) => child.tagName.toLowerCase());
    if (childTags.join("") !== "pp") {
      return;
    }
    const $listParagraph = $cells.children().last();
    // Capture the nodes before the paragraph is emptied below.
    const segments = (0, import_arraySplit.arraySplit)(
      Array.from($listParagraph.contents()),
      (node) => node.nodeType === 1 && node.tagName === "br"
    );
    $listParagraph.replaceWith($("<ul></ul>")).html("");
    for (const segment of segments) {
      const $li = $("<li></li>");
      const $p = $("<p></p>");
      for (const node of segment) {
        $p.append(node);
      }
      $li.append($p);
      // After the replacement the last child of the cell is the new <ul>.
      $cells.children().last().append($li);
    }
  });
  return $;
}
108
// Strategy entry: fetched over plain HTTP and parsed as a newsletter. The
// document is pre-processed by `transform` before the template parser runs.
var javascriptweekly = {
  parse: true,
  fetcher: "http",
  articleType: "newsletter",
  run: async ($, el) => {
    const prepared = transform($);
    return (0, import_tpl.parse)(tpl, prepared, el);
  }
};
117
+ // Annotate the CommonJS export names for ESM import in node:
118
+ 0 && (module.exports = {
119
+ javascriptweekly,
120
+ tpl,
121
+ transform,
122
+ transformExceptionGroup,
123
+ transformFromInDesc
124
+ });
@@ -33,6 +33,9 @@ var import_tmtpost = require("./parsers/tmtpost");
33
33
  var import_leiphone = require("./parsers/leiphone");
34
34
  var import_oschina = require("./parsers/oschina");
35
35
  var import_avanderlee = require("./parsers/avanderlee");
36
+ var import_iosdevweekly = require("./parsers/iosdevweekly");
37
+ var import_cssweekly = require("./parsers/cssweekly");
38
+ var import_javascriptweekly = require("./parsers/javascriptweekly");
36
39
  var strategies = {
37
40
  cnbeta: import_cnbeta.cnbeta,
38
41
  ifanr: import_ifanr.ifanr,
@@ -44,7 +47,13 @@ var strategies = {
44
47
  tmtpost: import_tmtpost.tmtpost,
45
48
  leiphone: import_leiphone.leiphone,
46
49
  oschina: import_oschina.oschina,
47
- avanderlee: import_avanderlee.avanderlee
50
+ avanderlee: import_avanderlee.avanderlee,
51
+ iosdevweekly: import_iosdevweekly.iosdevweekly,
52
+ cssweekly: import_cssweekly.cssweekly,
53
+ javascriptweekly: import_javascriptweekly.javascriptweekly,
54
+ nodeweekly: import_javascriptweekly.javascriptweekly,
55
+ reactstatus: import_javascriptweekly.javascriptweekly,
56
+ frontendfocus: import_javascriptweekly.javascriptweekly
48
57
  };
49
58
  // Annotate the CommonJS export names for ESM import in node:
50
59
  0 && (module.exports = {
@@ -1,18 +1,28 @@
1
1
  import type { AnyNode, Cheerio, CheerioAPI } from 'cheerio';
2
2
  import { Item } from 'rss-parser';
3
+ import { SnapshotResult } from './parseContent/newsletter/types';
3
4
  export type RssItem = Item & {
4
5
  source?: string;
5
6
  author?: string;
6
7
  };
7
- export interface ParseStrategy {
8
+ interface ParseStrategyBase {
8
9
  parse: boolean;
9
10
  fetcher?: 'http' | 'playwright';
10
11
  getContentElementFromArticle?: (aritcle: CheerioAPI) => Cheerio<AnyNode>;
12
+ imageSrcAttr?: string;
13
+ getThumbs?: (items: Item[]) => Promise<Record<string, string> | undefined>;
14
+ }
15
+ export interface ParseStrategyArticle extends ParseStrategyBase {
16
+ articleType?: 'article';
11
17
  getContentFromHtml?: ($: CheerioAPI, node: Cheerio<AnyNode>) => string;
12
18
  getExtraItems?: ($: CheerioAPI, current: RichArticle, item: Item) => Promise<Record<string, any>>;
13
19
  ignoreProbeImage?: boolean;
14
- getThumbs?: (items: Item[]) => Promise<Record<string, string> | undefined>;
15
20
  }
21
+ export interface ParseStrategyMewsLetter extends ParseStrategyBase {
22
+ articleType: 'newsletter';
23
+ run: ($: CheerioAPI, node: Cheerio<AnyNode>) => Promise<SnapshotResult>;
24
+ }
25
+ export type ParseStrategy = ParseStrategyArticle | ParseStrategyMewsLetter;
16
26
  export interface SimpleArticle {
17
27
  type: 'simple';
18
28
  guid: string;
@@ -42,4 +52,9 @@ export interface RichArticle extends Omit<SimpleArticle, 'type'> {
42
52
  ssrContent: string;
43
53
  images: RichArticleImage[];
44
54
  }
45
- export type Article = SimpleArticle | RichArticle;
55
+ export interface NewsletterArticle extends Omit<SimpleArticle, 'type'> {
56
+ type: 'newsletter';
57
+ newsletter: SnapshotResult;
58
+ }
59
+ export type Article = SimpleArticle | RichArticle | NewsletterArticle;
60
+ export {};
@@ -12,7 +12,7 @@ export declare function uploadContent({ app, getUploadApi, items, }: {
12
12
  items: Article[];
13
13
  } & ServerInfo): Promise<Article[] | {
14
14
  storePath: any;
15
- type: "simple" | "rich";
15
+ type: "newsletter" | "simple" | "rich";
16
16
  guid: string;
17
17
  title: string;
18
18
  link: string;
@@ -0,0 +1 @@
1
+ export declare const arraySplit: <T>(arr: T[], fn: (item: T) => boolean) => T[][];
@@ -0,0 +1,42 @@
1
// Bundler-emitted CommonJS interop helpers.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Defines one enumerable getter on `target` per key of `all`; the value
// stored under each key is used as the getter function itself.
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

// Mirrors the own properties of `from` onto `to` as live getters. Keys that
// `to` already owns and the key named by `except` are skipped. A mirrored
// property is enumerable unless the source descriptor says it is not.
// `desc` is a scratch parameter and is not meant to be supplied by callers.
var __copyProps = (to, from, except, desc) => {
  const copyable = (from && typeof from === "object") || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (let key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
  }
  return to;
};

// Builds a fresh object flagged with a non-enumerable `__esModule: true`
// and mirrors every own property of `mod` onto it.
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
18
+
19
+ // src/utils/arraySplit.ts
20
+ var arraySplit_exports = {};
21
+ __export(arraySplit_exports, {
22
+ arraySplit: () => arraySplit
23
+ });
24
+ module.exports = __toCommonJS(arraySplit_exports);
25
/**
 * Splits an array into consecutive groups, cutting at every element for
 * which `fn` returns a truthy value. The separator elements themselves are
 * dropped.
 *
 * The result always contains at least one group: an empty input yields
 * `[[]]`, and a leading, trailing or doubled separator yields an empty group
 * at that position.
 *
 * @param arr Elements to partition, in order.
 * @param fn  Predicate called with each element; truthy marks a separator.
 * @returns   Array of groups, in original order.
 */
var arraySplit = (arr, fn) => {
  // Start with one open group; every separator closes it by opening another.
  const groups = [[]];
  arr.forEach((item) => {
    if (fn(item)) {
      groups.push([]);
      return;
    }
    groups[groups.length - 1].push(item);
  });
  return groups;
};
39
+ // Annotate the CommonJS export names for ESM import in node:
40
+ 0 && (module.exports = {
41
+ arraySplit
42
+ });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ns-rss-spider",
3
- "version": "1.1.5",
3
+ "version": "1.1.8",
4
4
  "description": "",
5
5
  "main": "dist/cjs/index.js",
6
6
  "types": "dist/cjs/index.d.ts",