rssany 0.1.2 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -50
- package/app/plugins/builtin/agi-eval-evaluation.rssany.js +188 -0
- package/app/plugins/builtin/amii-research-talent.rssany.js +73 -0
- package/app/plugins/builtin/anthropic-research.rssany.js +155 -0
- package/app/plugins/builtin/appen-resources.rssany.js +155 -0
- package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +185 -0
- package/app/plugins/builtin/baaidata-csdn.rssany.js +242 -0
- package/app/plugins/builtin/baidu-research.rssany.js +222 -0
- package/app/plugins/builtin/brightdata-blog.rssany.js +301 -0
- package/app/plugins/builtin/bytedance-seed-research.rssany.js +231 -0
- package/app/plugins/builtin/five-radar.rssany.js +490 -0
- package/app/plugins/builtin/flageval-news.rssany.js +118 -0
- package/app/plugins/builtin/google-deepmind-research.rssany.js +223 -0
- package/app/plugins/builtin/google-research-datasets.rssany.js +171 -0
- package/app/plugins/builtin/google-research.rssany.js +220 -0
- package/app/plugins/builtin/google.rssany.js +187 -0
- package/app/plugins/builtin/hacker-news-newest.rssany.js +130 -0
- package/app/plugins/builtin/harvard-dataverse.rssany.js +166 -0
- package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +185 -0
- package/app/plugins/builtin/lingowhale.rssany.js +119 -0
- package/app/plugins/builtin/meituan-tech.rssany.js +130 -0
- package/app/plugins/builtin/meta-ai-publications.rssany.js +221 -0
- package/app/plugins/builtin/mila-quebec.rssany.js +199 -0
- package/app/plugins/builtin/mit-csail-research.rssany.js +208 -0
- package/app/plugins/builtin/moonshot.rssany.js +127 -0
- package/app/plugins/builtin/opendatalab-news.rssany.js +174 -0
- package/app/plugins/builtin/opendatalab.rssany.js +109 -0
- package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-publications.rssany.js +130 -0
- package/app/plugins/builtin/opendrivelab.rssany.js +333 -0
- package/app/plugins/builtin/paperswithcode.rssany.js +227 -0
- package/app/plugins/builtin/pjlab-adg-publications.rssany.js +202 -0
- package/app/plugins/builtin/rss.rssany.js +11 -1
- package/app/plugins/builtin/selectdataset.rssany.js +206 -0
- package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +154 -0
- package/app/plugins/builtin/supervisely-blog.rssany.js +159 -0
- package/app/plugins/builtin/uci-ml-repository.rssany.js +111 -0
- package/app/plugins/builtin/venturebeat.rssany.js +97 -0
- package/app/plugins/builtin/worldlabs.rssany.js +129 -0
- package/app/plugins/builtin/x.rssany.js +159 -0
- package/app/plugins/builtin/xiaohongshu.rssany.js +283 -0
- package/app/plugins/builtin/zhipu-research.rssany.js +334 -0
- package/dist/index.js +79 -9
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/webui/build/200.html +6 -6
- package/webui/build/_app/immutable/assets/0.BB88QFoe.css +1 -0
- package/webui/build/_app/immutable/assets/{homeFeedPanelStore.BopJZtHu.css → homeFeedPanelStore.iOmfP2qL.css} +1 -1
- package/webui/build/_app/immutable/chunks/CZD-YNDw.js +31 -0
- package/webui/build/_app/immutable/chunks/{DcAshVxe.js → D6VIKef0.js} +1 -1
- package/webui/build/_app/immutable/chunks/{EIZIMsXK.js → Dbqx2mXq.js} +1 -1
- package/webui/build/_app/immutable/chunks/DeX-oq5W.js +41 -0
- package/webui/build/_app/immutable/chunks/{BXCWEhUd.js → dhB8G5Is.js} +1 -1
- package/webui/build/_app/immutable/entry/{app.DdgnooOk.js → app.XPso7q7g.js} +2 -2
- package/webui/build/_app/immutable/entry/start.Db4snNCd.js +1 -0
- package/webui/build/_app/immutable/nodes/0.BKTQePmA.js +11 -0
- package/webui/build/_app/immutable/nodes/{1.5DFDaT4c.js → 1.BS3_Rfxm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{10.OVK4i9XE.js → 10.CyyxDCIS.js} +1 -1
- package/webui/build/_app/immutable/nodes/{11.Dhn_rO4A.js → 11.CtYgIaGj.js} +1 -1
- package/webui/build/_app/immutable/nodes/{14.B_KpJLxn.js → 14.D5OEGPR2.js} +1 -1
- package/webui/build/_app/immutable/nodes/{15.RaWaA-0I.js → 15.B4dFN1Gk.js} +1 -1
- package/webui/build/_app/immutable/nodes/{16.DSUgqolV.js → 16.M7ZII7tl.js} +1 -1
- package/webui/build/_app/immutable/nodes/{3.wQvGs9w-.js → 3.7r8v7qkm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{5.CCtn90c0.js → 5.CHIzoGrb.js} +1 -1
- package/webui/build/_app/immutable/nodes/{6.C2_mjW1u.js → 6.BDBqx-GY.js} +1 -1
- package/webui/build/_app/immutable/nodes/{7.Dwz6W7A1.js → 7.D5czsDmz.js} +1 -1
- package/webui/build/_app/immutable/nodes/{8.DzkEw6rx.js → 8.pjVNsCdV.js} +1 -1
- package/webui/build/_app/immutable/nodes/{9.DtlXEwe1.js → 9.CsARv1BH.js} +1 -1
- package/webui/build/_app/version.json +1 -1
- package/webui/build/_app/immutable/assets/0.C6Q_nuW9.css +0 -1
- package/webui/build/_app/immutable/chunks/CkUAV0m0.js +0 -41
- package/webui/build/_app/immutable/chunks/CtijX1u3.js +0 -31
- package/webui/build/_app/immutable/entry/start.DhJaJZhR.js +0 -1
- package/webui/build/_app/immutable/nodes/0.BE05Cuc4.js +0 -11
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
let _deps;
|
|
2
|
+
|
|
3
|
+
// Baidu Research 插件:抓取 Blog 列表条目(不做正文 enrich)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
// Pathname of a Baidu Research blog detail page (optional trailing slash, case-insensitive).
const BLOG_ITEM_PATH_RE = /^\/Blog\/index-view(?:\/)?$/i;

// Lowercase three-letter English month abbreviation -> zero-based month index
// (the index form expected by Date.UTC).
const MONTH_TO_INDEX = {
  jan: 0,
  feb: 1,
  mar: 2,
  apr: 3,
  may: 4,
  jun: 5,
  jul: 6,
  aug: 7,
  sep: 8,
  oct: 9,
  nov: 10,
  dec: 11,
};

// Elements whose text is inspected when collecting leaf texts inside an anchor.
const TEXT_NODE_SELECTORS = "div, p, span, h1, h2, h3, h4, h5, h6, strong, em";

// "Month day" text with an optional ordinal suffix and an optional ", year" tail.
// The separator accepts both the ASCII comma and the full-width comma (U+FF0C);
// the full-width form is written as an escape so it survives re-encoding.
const MONTH_DAY_TEXT_RE = /^([A-Za-z]{3,9})\s+\d{1,2}(?:st|nd|rd|th)?(?:\s*[,\uFF0C]\s*\d{0,4})?$/i;
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
/**
 * Collapse every run of whitespace into a single space and trim both ends.
 *
 * @param {string | null | undefined} text - Raw text; null/undefined count as empty.
 * @returns {string} The normalized text (possibly empty).
 */
function normalizeText(text) {
  const raw = text ?? "";
  const collapsed = raw.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
/**
 * Derive a stable item GUID: the hex SHA-256 digest of the input.
 * Uses the `createHash` provided through the module-level `_deps`.
 *
 * @param {string} input - Value to hash (callers pass the item link).
 * @returns {string} Hex-encoded digest.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
/**
 * Resolve an href against a base URL and keep it only if it is http(s).
 *
 * @param {string | null | undefined} rawHref - The href attribute value.
 * @param {string | URL} baseUrl - Base used to resolve relative hrefs.
 * @returns {string | null} Absolute URL string, or null when the href is empty,
 *   a fragment, a `javascript:` link, unparsable, or not http/https.
 */
function toAbsoluteHttpUrl(rawHref, baseUrl) {
  const href = rawHref ? rawHref.trim() : "";
  const unusable = href === "" || href.startsWith("#") || href.startsWith("javascript:");
  if (unusable) return null;

  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    // Unparsable href/base combination.
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
/**
 * Find the first plausible four-digit year mentioned in a list of texts.
 *
 * Every year-like token (19xx / 20xx) of each text is examined in order, so an
 * out-of-range token earlier in a text does not hide a valid year later in the
 * same text. A year is plausible when it lies between 1990 and next calendar
 * year (UTC), inclusive.
 *
 * @param {Iterable<string> | null | undefined} texts - Candidate texts; a
 *   missing list is treated as empty.
 * @returns {number | undefined} The inferred year, or undefined when none is found.
 */
function inferYearFromTexts(texts) {
  const maxYear = new Date().getUTCFullYear() + 1;
  for (const text of texts ?? []) {
    const candidates = normalizeText(text).match(/\b(?:19|20)\d{2}\b/g) ?? [];
    for (const candidate of candidates) {
      const year = Number(candidate);
      if (year >= 1990 && year <= maxYear) return year;
    }
  }
  return undefined;
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
function parsePubDate(rawText, fallbackTexts) {
|
|
64
|
+
const text = normalizeText(rawText).replace(",", ",");
|
|
65
|
+
if (!text) return undefined;
|
|
66
|
+
|
|
67
|
+
const monthMatch = text.match(
|
|
68
|
+
/^([A-Za-z]{3,9})\s+(\d{1,2})(?:st|nd|rd|th)?(?:\s*,\s*)?(?:(\d{4}))?$/
|
|
69
|
+
);
|
|
70
|
+
if (monthMatch) {
|
|
71
|
+
const month = MONTH_TO_INDEX[monthMatch[1].slice(0, 3).toLowerCase()];
|
|
72
|
+
if (month != null) {
|
|
73
|
+
const day = Number(monthMatch[2]);
|
|
74
|
+
const year = monthMatch[3] ? Number(monthMatch[3]) : inferYearFromTexts(fallbackTexts);
|
|
75
|
+
if (!year) return undefined;
|
|
76
|
+
const parsed = new Date(Date.UTC(year, month, day, 12, 0, 0));
|
|
77
|
+
if (!Number.isNaN(parsed.getTime())) return parsed;
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
const numericMatch = text.match(/^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$/);
|
|
82
|
+
if (numericMatch) {
|
|
83
|
+
const year = Number(numericMatch[1]);
|
|
84
|
+
const month = Number(numericMatch[2]) - 1;
|
|
85
|
+
const day = Number(numericMatch[3]);
|
|
86
|
+
const parsed = new Date(Date.UTC(year, month, day, 12, 0, 0));
|
|
87
|
+
if (!Number.isNaN(parsed.getTime())) return parsed;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
return undefined;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
/**
 * Tell whether a text looks like a "Month day[, year]" date label.
 *
 * @param {string | null | undefined} text - Text to inspect.
 * @returns {boolean} True when the normalized text matches MONTH_DAY_TEXT_RE.
 */
function isDateLikeText(text) {
  const candidate = normalizeText(text);
  return MONTH_DAY_TEXT_RE.test(candidate);
}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
/**
 * Collect the distinct texts of the innermost text-bearing elements in an anchor.
 *
 * An element counts as a leaf when it contains no further element matching
 * TEXT_NODE_SELECTORS. Empty texts and the literal "MORE" label are skipped;
 * duplicates keep their first position.
 *
 * @param {{ querySelectorAll: Function }} anchor - Parsed anchor element.
 * @returns {string[]} Normalized leaf texts in document order.
 */
function getLeafTexts(anchor) {
  const collected = new Set();
  for (const el of anchor.querySelectorAll(TEXT_NODE_SELECTORS)) {
    const isLeaf = el.querySelector(TEXT_NODE_SELECTORS) == null;
    if (!isLeaf) continue;

    const text = normalizeText(el.textContent);
    // A Set keeps first-insertion order, so repeated texts are dropped in place.
    if (text && text !== "MORE") collected.add(text);
  }
  return [...collected];
}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
/**
 * Turn one anchor of the Baidu Research list page into a feed item.
 *
 * The anchor must resolve to `research.baidu.com/Blog/index-view?id=<digits>`.
 * Title, date and summary are picked heuristically from the anchor's leaf texts:
 *   - date: the first text that parsePubDate understands (falls back to "now");
 *   - title: the shortest non-date text of at least 6 characters, preferring
 *     texts that do not look like a summary;
 *   - summary: the first other non-date text that ends in "..." or has 40+ characters.
 *
 * @param {{ getAttribute: Function }} anchor - Parsed anchor element.
 * @param {string} finalUrl - Page URL used to resolve relative hrefs.
 * @returns {object | null} The item, or null when the anchor is not a blog entry
 *   or no title can be determined.
 */
function parseAnchorItem(anchor, finalUrl) {
  const link = toAbsoluteHttpUrl(anchor.getAttribute("href"), finalUrl);
  if (!link) return null;

  const { hostname, pathname, searchParams } = new URL(link);
  const isBlogDetail =
    hostname === "research.baidu.com" &&
    BLOG_ITEM_PATH_RE.test(pathname) &&
    /^\d+$/.test(searchParams.get("id") ?? "");
  if (!isBlogDetail) return null;

  const texts = getLeafTexts(anchor);
  if (texts.length === 0) return null;

  // Pair every text with its parsed date (if any); the first dated text wins.
  const entries = texts.map((text) => ({ text, date: parsePubDate(text, texts) }));
  const firstDated = entries.findIndex((entry) => entry.date != null);
  const pubDate = firstDated >= 0 ? entries[firstDated].date : undefined;

  const plainTexts = [];
  for (const entry of entries) {
    if (entry.date == null && !isDateLikeText(entry.text)) plainTexts.push(entry.text);
  }

  const isSummaryLike = (text) => text.endsWith("...") || text.split(/\s+/).length >= 22;
  // Non-summary texts first, then shorter before longer.
  const byTitleLikeness = (a, b) =>
    Number(isSummaryLike(a)) - Number(isSummaryLike(b)) || a.length - b.length;

  const candidates = plainTexts.filter((text) => text.length >= 6);
  let title = candidates.length > 0 ? [...candidates].sort(byTitleLikeness)[0] : "";

  if (!title && firstDated > 0) {
    // Last resort: any sufficiently long text that precedes the date.
    for (const { text } of entries.slice(0, firstDated)) {
      if (text.length >= 6) {
        title = text;
        break;
      }
    }
  }
  if (!title) return null;

  const summary = plainTexts.find((text) => {
    if (text === title) return false;
    return text.endsWith("...") || text.length >= 40;
  });

  return {
    guid: hashGuid(link),
    title,
    link,
    pubDate: pubDate ?? new Date(),
    author: "Baidu Research",
    summary: summary || undefined,
    sourceId: "baidu-research",
  };
}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
/**
 * Parse a list page and return one item per distinct blog link.
 *
 * @param {string} html - Page HTML, parsed with `_deps.parseHtml`.
 * @param {string} finalUrl - Page URL used to resolve relative hrefs.
 * @returns {object[]} Items in document order; the first anchor for a link wins.
 */
function parseBlogItems(html, finalUrl) {
  const byLink = new Map();
  for (const anchor of _deps.parseHtml(html).querySelectorAll("a[href]")) {
    const item = parseAnchorItem(anchor, finalUrl);
    if (item && !byLink.has(item.link)) byLink.set(item.link, item);
  }
  return [...byLink.values()];
}
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
/**
 * Concatenate two item lists, keeping only the first item seen for each link.
 *
 * @param {Iterable<{link: string}>} itemsA - Items that take precedence.
 * @param {Iterable<{link: string}>} itemsB - Items appended when their link is new.
 * @returns {Array<{link: string}>} Merged items in first-seen order.
 */
function mergeByLink(itemsA, itemsB) {
  const merged = [];
  const seenLinks = new Set();
  for (const list of [itemsA, itemsB]) {
    for (const item of list) {
      if (seenLinks.has(item.link)) continue;
      seenLinks.add(item.link);
      merged.push(item);
    }
  }
  return merged;
}
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
/**
 * Plugin entry point: fetch the Baidu Research list page and extract blog items.
 *
 * When the primary page yields fewer than 5 items and lives on
 * research.baidu.com, the `/Blog` page is fetched as well and merged in.
 * That second fetch is best-effort: if it fails while the primary page already
 * produced items, those items are returned instead of discarding them.
 *
 * @param {string} sourceId - URL of the list page to fetch.
 * @param {{ deps: object, fetchHtml: Function }} ctx - Host context; `deps`
 *   supplies `parseHtml`/`createHash`, `fetchHtml(url, opts)` resolves to
 *   `{ html, finalUrl }`.
 * @returns {Promise<object[]>} Parsed items (never empty).
 * @throws {Error} When no item can be parsed, or when a fetch fails and there
 *   is nothing to fall back on.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const primary = await ctx.fetchHtml(sourceId, { waitMs: 4500 });
  const primaryHref = primary.finalUrl || sourceId;
  let items = parseBlogItems(primary.html, primaryHref);

  const primaryUrl = new URL(primaryHref);
  if (items.length < 5 && primaryUrl.hostname === "research.baidu.com") {
    const blogUrl = new URL("/Blog", primaryUrl).href;
    if (blogUrl !== primaryUrl.href) {
      try {
        const blogPage = await ctx.fetchHtml(blogUrl, { waitMs: 4500 });
        items = mergeByLink(items, parseBlogItems(blogPage.html, blogPage.finalUrl || blogUrl));
      } catch (err) {
        // The /Blog page only supplements the primary page; keep what we have.
        if (items.length === 0) throw err;
      }
    }
  }

  if (items.length === 0) {
    throw new Error("[baidu-research] 未解析到 Blog 条目,页面结构可能已变化");
  }

  return items;
}
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
// List pages handled by this plugin: the site root, /Index or /Blog
// (optional trailing slash), each with an optional query string.
const BAIDU_RESEARCH_LIST_URL_RE = /^https?:\/\/research\.baidu\.com\/(?:(?:Index|Blog)\/?)?(?:\?.*)?$/i;

// Plugin descriptor consumed by the host application.
export default {
  id: "baidu-research",
  listUrlPattern: BAIDU_RESEARCH_LIST_URL_RE,
  fetchItems,
};
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
let _deps;
|
|
2
|
+
|
|
3
|
+
// Bright Data 博客插件:优先解析站点 RSS feed,失败时回退解析列表页(不做正文 enrich)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
// Origin used as the base when a source URL is relative or missing.
const BRIGHTDATA_ORIGIN = "https://brightdata.com";

// Blog list pages: /blog, /blog/page/N, /blog/<category> and /blog/<category>/page/N,
// with an optional trailing slash and query string.
const LIST_URL_RE =
  /^https?:\/\/(?:www\.)?brightdata\.com\/blog(?:\/(?:page\/\d+|[a-z0-9-]+(?:\/page\/\d+)?)?)?\/?(?:\?.*)?$/i;

// Article pathname: /blog/<category>/<slug>[/]. Capture 1 is the category, 2 the slug.
// Each segment class lists its excluded characters once; "/" does not need repeating.
const ARTICLE_PATH_RE = /^\/blog\/([^/?#]+)\/([^/?#]+)\/?$/i;

// Reading-time label such as "5 min read".
const MIN_READ_RE = /^\d+\s*min\s*read$/i;
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
/**
 * Normalize whitespace: split on whitespace runs, drop empty pieces and
 * re-join with single spaces (equivalent to collapsing runs and trimming).
 *
 * @param {string | null | undefined} text - Raw text; null/undefined count as empty.
 * @returns {string} The normalized text (possibly empty).
 */
function normalizeText(text) {
  if (text == null) return "";
  const words = text.split(/\s+/).filter((word) => word !== "");
  return words.join(" ");
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
/**
 * Compute an item GUID as the hex SHA-256 digest of the input, using the
 * `createHash` supplied through the module-level `_deps`.
 *
 * @param {string} input - Value to hash (callers pass the article link).
 * @returns {string} Hex-encoded digest.
 */
function hashGuid(input) {
  const digestBuilder = _deps.createHash("sha256").update(input);
  const hex = digestBuilder.digest("hex");
  return hex;
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
/**
 * Resolve an href against a base and return it only when it is an http(s) URL.
 *
 * @param {string | null | undefined} rawHref - The href to resolve.
 * @param {string | URL} baseUrl - Base for relative hrefs.
 * @returns {string | null} Absolute URL, or null for empty, fragment-only,
 *   `javascript:`, unparsable or non-http(s) hrefs.
 */
function toAbsoluteHttpUrl(rawHref, baseUrl) {
  if (!rawHref) return null;

  const trimmed = rawHref.trim();
  if (trimmed.length === 0) return null;
  if (["#", "javascript:"].some((prefix) => trimmed.startsWith(prefix))) return null;

  try {
    const { protocol, href } = new URL(trimmed, baseUrl);
    return /^https?:$/i.test(protocol) ? href : null;
  } catch {
    return null;
  }
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
/**
 * Parse a date string with the built-in Date parser.
 *
 * @param {string | null | undefined} raw - Date text (e.g. an RSS pubDate).
 * @returns {Date | undefined} The parsed date, or undefined when `raw` is
 *   empty or not understood by `Date`.
 */
function parsePubDate(raw) {
  if (raw) {
    const parsed = new Date(raw);
    if (!Number.isNaN(parsed.getTime())) return parsed;
  }
  return undefined;
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
/**
 * Check whether a URL points at a Bright Data blog article.
 *
 * The host must be brightdata.com or a subdomain and the path must match
 * ARTICLE_PATH_RE, with a first segment other than "page" or "feed"
 * (those are pagination and feed paths, not articles).
 *
 * @param {string} urlText - Absolute URL to test.
 * @returns {boolean} True for article URLs; false otherwise, including unparsable input.
 */
function isBrightDataArticleUrl(urlText) {
  let parsed;
  try {
    parsed = new URL(urlText);
  } catch {
    return false;
  }

  const onBrightData = /(^|\.)brightdata\.com$/i.test(parsed.hostname);
  if (!onBrightData) return false;

  const match = parsed.pathname.match(ARTICLE_PATH_RE);
  if (!match) return false;

  const firstSegment = match[1].toLowerCase();
  return !["page", "feed"].includes(firstSegment);
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
/**
 * Turn a hyphenated slug into a display label: each non-empty part gets its
 * first character upper-cased and the parts are joined with spaces.
 *
 * @param {string} raw - Slug such as "web-data".
 * @returns {string} Label such as "Web Data" (empty for an empty slug).
 */
function formatCategory(raw) {
  const words = [];
  for (const part of raw.split("-")) {
    if (!part) continue;
    words.push(`${part[0].toUpperCase()}${part.slice(1)}`);
  }
  return words.join(" ");
}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
/**
 * Derive a display category from an article link's first path segment after /blog/.
 *
 * @param {string} link - Absolute article URL.
 * @returns {string | undefined} Formatted category, or undefined when the link
 *   is unparsable, is not an article path, or yields an empty label.
 */
function extractCategoryFromLink(link) {
  try {
    const match = new URL(link).pathname.match(ARTICLE_PATH_RE);
    return (match && formatCategory(match[1])) || undefined;
  } catch {
    return undefined;
  }
}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
/**
 * Remove falsy entries and duplicates, preserving first-seen order.
 *
 * @param {Iterable<*>} values - Values to filter (typically strings).
 * @returns {Array<*>} Distinct truthy values.
 */
function uniqueTexts(values) {
  // A Set iterates in insertion order, so the first occurrence fixes the position.
  const distinct = new Set(values);
  return [...distinct].filter(Boolean);
}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
/**
 * Return the normalized text of the first selector that yields non-empty text.
 *
 * Selectors the parser rejects (querySelector throws) are skipped.
 *
 * @param {{ querySelector: Function }} node - Element to search within.
 * @param {string[]} selectors - Selectors tried in order.
 * @returns {string} The first non-empty text, or "" when none matches.
 */
function queryText(node, selectors) {
  for (const selector of selectors) {
    let text = "";
    try {
      text = normalizeText(node.querySelector(selector)?.textContent);
    } catch {
      // Unsupported selector: move on to the next one.
      continue;
    }
    if (text) return text;
  }
  return "";
}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
/**
 * Guess the RSS feed URL that corresponds to a blog list URL.
 *
 *   /blog, /blog/page/N, non-blog paths  -> <origin>/blog/feed/
 *   /blog/<category>[/...]               -> <origin>/blog/<category>/feed/
 *
 * The "blog" segment is compared case-insensitively (list URLs are matched
 * case-insensitively too), and a second segment of "feed" is not treated as a
 * category, so a feed URL maps to itself rather than to /blog/feed/feed/.
 *
 * @param {string} sourceId - List page URL; relative values resolve against BRIGHTDATA_ORIGIN.
 * @returns {string | null} Feed URL, or null when `sourceId` cannot be parsed.
 */
function deriveFeedUrlFromListUrl(sourceId) {
  try {
    const url = new URL(sourceId, BRIGHTDATA_ORIGIN);
    const [first, second] = url.pathname.split("/").filter(Boolean);

    const reserved = ["page", "feed"];
    const isCategoryListing =
      first?.toLowerCase() === "blog" &&
      second != null &&
      !reserved.includes(second.toLowerCase());

    // Keep the category segment exactly as it appears in the URL.
    const feedPath = isCategoryListing ? `/blog/${second}/feed/` : "/blog/feed/";
    return new URL(feedPath, url.origin).href;
  } catch {
    return null;
  }
}
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
/**
 * Discover the RSS feed advertised by a page through its <link> elements.
 *
 * @param {{ querySelector: Function }} root - Parsed document root.
 * @param {string} pageUrl - Page URL used to resolve a relative feed href.
 * @returns {string | null} Absolute feed URL, or null when none is advertised.
 */
function resolveFeedUrl(root, pageUrl) {
  const feedLinkSelector =
    'link[rel="alternate"][type="application/rss+xml"][href], link[href*="/feed/"][type="application/rss+xml"]';
  const linkEl = root.querySelector(feedLinkSelector);
  const feedHref = linkEl?.getAttribute("href");
  return toAbsoluteHttpUrl(feedHref, pageUrl);
}
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
/**
 * Download an RSS feed and convert its <item> entries into feed items.
 *
 * Entries without a title, without a Bright Data article link, or repeating an
 * already-seen link are skipped. A missing or unparsable date falls back to "now".
 *
 * NOTE(review): the feed XML is parsed with `_deps.parseHtml`; this presumably
 * relies on that parser exposing <link>/<title> text for XML input — confirm.
 * NOTE(review): per-entry categories used to be computed here but were never
 * attached to the item; that dead computation is gone. If items are meant to
 * carry categories, add the field deliberately.
 *
 * @param {string} feedUrl - Absolute URL of the RSS feed.
 * @returns {Promise<object[]>} Items with guid, title, link, pubDate, author, summary.
 * @throws {Error} When the HTTP response is not OK (or the request itself fails).
 */
async function fetchFeedItems(feedUrl) {
  const res = await fetch(feedUrl, {
    redirect: "follow",
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
      "Accept-Language": "en-US,en;q=0.9",
    },
  });
  if (!res.ok) {
    throw new Error(`[brightdata-blog] 获取 RSS feed 失败: ${res.status}`);
  }

  const xml = await res.text();
  const root = _deps.parseHtml(xml);
  const items = [];
  const seen = new Set();

  for (const entry of root.querySelectorAll("item")) {
    const title = queryText(entry, ["title"]);
    const link = toAbsoluteHttpUrl(queryText(entry, ["link"]), feedUrl);
    if (!title || !link || !isBrightDataArticleUrl(link) || seen.has(link)) continue;
    seen.add(link);

    const summary = queryText(entry, ["description"]);
    const author = queryText(entry, ["dc\\:creator", "creator", "author"]);
    const pubDateRaw = queryText(entry, ["pubDate", "published", "updated", "dc\\:date"]);
    const pubDate = parsePubDate(pubDateRaw) ?? new Date();

    items.push({
      guid: hashGuid(link),
      title,
      link,
      pubDate,
      author: author || undefined,
      summary: summary || undefined,
    });
  }
  return items;
}
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
/**
 * Gather the distinct texts of the innermost heading/paragraph/span/div
 * elements inside an anchor.
 *
 * Elements that contain another such element are skipped, as are texts that
 * still contain something shaped like an HTML tag.
 *
 * @param {{ querySelectorAll: Function }} anchor - Parsed anchor element.
 * @returns {string[]} Distinct, normalized, non-empty texts in document order.
 */
function collectLeafTexts(anchor) {
  const textSelector = "h1, h2, h3, h4, h5, h6, p, span, div";
  const looksLikeMarkup = (text) => /<[^>]+>/.test(text);

  const collected = [];
  for (const node of anchor.querySelectorAll(textSelector)) {
    const hasNestedTextElement = Boolean(node.querySelector(textSelector));
    if (hasNestedTextElement) continue;

    const text = normalizeText(node.textContent);
    if (text && !looksLikeMarkup(text)) collected.push(text);
  }
  return uniqueTexts(collected);
}
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
/**
 * Decide whether a text is page chrome rather than article content.
 *
 * Empty text, reading-time labels ("5 min read" or anything containing
 * "min read") and the section labels "editor's pick", "latest articles" and
 * "all categories" count as meta text. Comparison is case-insensitive.
 *
 * @param {string | null | undefined} text - Text to classify.
 * @returns {boolean} True when the text should be ignored.
 */
function isMetaText(text) {
  const normalized = normalizeText(text).toLowerCase();
  if (normalized === "") return true;

  const sectionLabels = ["editor's pick", "latest articles", "all categories"];
  return (
    MIN_READ_RE.test(normalized) ||
    sectionLabels.includes(normalized) ||
    normalized.includes("min read")
  );
}
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
/**
 * Pick the article title for an anchor.
 *
 * The anchor's first heading wins when it is neither meta text nor just the
 * category name. Otherwise the first leaf text is used that is not meta text,
 * differs from the category and has at least 12 characters.
 *
 * @param {{ querySelector: Function }} anchor - Parsed anchor element.
 * @param {string[]} texts - Leaf texts collected from the anchor.
 * @param {string | undefined} category - Category label derived from the link.
 * @returns {string} The title, or "" when none qualifies.
 */
function extractTitle(anchor, texts, category) {
  const categoryText = normalizeText(category).toLowerCase();

  const headingEl = anchor.querySelector("h1, h2, h3, h4, h5, h6");
  const heading = normalizeText(headingEl?.textContent);
  const headingUsable =
    Boolean(heading) && !isMetaText(heading) && heading.toLowerCase() !== categoryText;
  if (headingUsable) return heading;

  const fallback = texts.find((text) => {
    if (isMetaText(text)) return false;
    if (categoryText && text.toLowerCase() === categoryText) return false;
    return text.length >= 12;
  });
  return fallback ?? "";
}
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
/**
 * Pick a summary: the first text that is not the title, is not meta text and
 * has at least 20 characters.
 *
 * @param {string} title - The title already chosen for the item.
 * @param {string[]} texts - Leaf texts collected from the anchor.
 * @returns {string} The summary, or "" when no text qualifies.
 */
function extractSummary(title, texts) {
  const summary = texts.find(
    (text) => Boolean(text) && text !== title && !isMetaText(text) && text.length >= 20
  );
  return summary ?? "";
}
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
/**
 * Guess the author from image alt texts inside the anchor.
 *
 * An alt text qualifies when it looks like a personal name: two or three
 * capitalized words made of letters, dots, apostrophes or hyphens.
 *
 * @param {{ querySelectorAll: Function }} anchor - Parsed anchor element.
 * @returns {string} The first name-like alt text, or "" when there is none.
 */
function extractAuthor(anchor) {
  const personNameRe = /^[A-Z][A-Za-z.'-]+(?:\s+[A-Z][A-Za-z.'-]+){1,2}$/;
  for (const image of anchor.querySelectorAll("img[alt]")) {
    const alt = normalizeText(image.getAttribute("alt"));
    if (alt && personNameRe.test(alt)) return alt;
  }
  return "";
}
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
/**
 * Fallback extraction: build items from article anchors on the list page.
 *
 * Only anchors resolving to Bright Data article URLs are considered; the first
 * anchor per link that yields a title wins. The list page exposes no date
 * here, so `pubDate` is the time of extraction.
 *
 * @param {{ querySelectorAll: Function }} root - Parsed document root.
 * @param {string} baseUrl - Page URL used to resolve relative hrefs.
 * @returns {object[]} Items with guid, title, link, pubDate, author, summary.
 */
function parseHtmlItems(root, baseUrl) {
  const byLink = new Map();

  for (const anchor of root.querySelectorAll("a[href]")) {
    const link = toAbsoluteHttpUrl(anchor.getAttribute("href"), baseUrl);
    const isNewArticle = Boolean(link) && isBrightDataArticleUrl(link) && !byLink.has(link);
    if (!isNewArticle) continue;

    const category = extractCategoryFromLink(link);
    const texts = collectLeafTexts(anchor);
    const title = extractTitle(anchor, texts, category);
    // An anchor without a usable title does not claim the link.
    if (!title) continue;

    const summary = extractSummary(title, texts);
    const author = extractAuthor(anchor);
    byLink.set(link, {
      guid: hashGuid(link),
      title,
      link,
      pubDate: new Date(),
      author: author || undefined,
      summary: summary || undefined,
    });
  }

  return [...byLink.values()];
}
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
/**
 * Plugin entry point: return Bright Data blog items for a list URL.
 *
 * The list page is fetched first. The feed advertised by the page and the feed
 * derived from the URL are then tried in that order; the first feed that
 * yields items wins. If every feed fails or is empty, the list page HTML is
 * parsed instead.
 *
 * @param {string} sourceId - URL of the blog list page.
 * @param {{ deps: object, fetchHtml: Function }} ctx - Host context; `deps`
 *   supplies `parseHtml`/`createHash`, `fetchHtml(url, opts)` resolves to
 *   `{ html, finalUrl }`.
 * @returns {Promise<object[]>} Parsed items (never empty).
 * @throws {Error} When neither a feed nor the HTML page yields any item.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;

  const page = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
  const root = _deps.parseHtml(page.html);
  const pageUrl = page.finalUrl || sourceId || BRIGHTDATA_ORIGIN;

  const feedCandidates = uniqueTexts([
    resolveFeedUrl(root, pageUrl),
    deriveFeedUrlFromListUrl(pageUrl),
  ]);
  for (const feedUrl of feedCandidates) {
    try {
      const feedItems = await fetchFeedItems(feedUrl);
      if (feedItems.length > 0) return feedItems;
    } catch {
      // Feed unavailable: try the next candidate, then fall back to the HTML page.
    }
  }

  const htmlItems = parseHtmlItems(root, pageUrl);
  if (htmlItems.length === 0) {
    throw new Error("[brightdata-blog] 未解析到文章条目,页面结构可能已变化");
  }
  return htmlItems;
}
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
// Plugin descriptor consumed by the host application: `listUrlPattern` selects
// the list URLs this plugin handles and `fetchItems` produces their items.
const brightDataBlogPlugin = {
  id: "brightdata-blog",
  listUrlPattern: LIST_URL_RE,
  fetchItems,
};

export default brightDataBlogPlugin;
|