rssany 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/app/plugins/builtin/agi-eval-evaluation.rssany.js +188 -0
- package/app/plugins/builtin/amii-research-talent.rssany.js +73 -0
- package/app/plugins/builtin/anthropic-research.rssany.js +155 -0
- package/app/plugins/builtin/appen-resources.rssany.js +155 -0
- package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +185 -0
- package/app/plugins/builtin/baaidata-csdn.rssany.js +242 -0
- package/app/plugins/builtin/baidu-research.rssany.js +222 -0
- package/app/plugins/builtin/brightdata-blog.rssany.js +301 -0
- package/app/plugins/builtin/bytedance-seed-research.rssany.js +231 -0
- package/app/plugins/builtin/five-radar.rssany.js +490 -0
- package/app/plugins/builtin/flageval-news.rssany.js +118 -0
- package/app/plugins/builtin/google-deepmind-research.rssany.js +223 -0
- package/app/plugins/builtin/google-research-datasets.rssany.js +171 -0
- package/app/plugins/builtin/google-research.rssany.js +220 -0
- package/app/plugins/builtin/google.rssany.js +187 -0
- package/app/plugins/builtin/hacker-news-newest.rssany.js +130 -0
- package/app/plugins/builtin/harvard-dataverse.rssany.js +166 -0
- package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +185 -0
- package/app/plugins/builtin/lingowhale.rssany.js +119 -0
- package/app/plugins/builtin/meituan-tech.rssany.js +130 -0
- package/app/plugins/builtin/meta-ai-publications.rssany.js +221 -0
- package/app/plugins/builtin/mila-quebec.rssany.js +199 -0
- package/app/plugins/builtin/mit-csail-research.rssany.js +208 -0
- package/app/plugins/builtin/moonshot.rssany.js +127 -0
- package/app/plugins/builtin/opendatalab-news.rssany.js +174 -0
- package/app/plugins/builtin/opendatalab.rssany.js +109 -0
- package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-publications.rssany.js +130 -0
- package/app/plugins/builtin/opendrivelab.rssany.js +333 -0
- package/app/plugins/builtin/paperswithcode.rssany.js +227 -0
- package/app/plugins/builtin/pjlab-adg-publications.rssany.js +202 -0
- package/app/plugins/builtin/rss.rssany.js +11 -1
- package/app/plugins/builtin/selectdataset.rssany.js +206 -0
- package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +154 -0
- package/app/plugins/builtin/supervisely-blog.rssany.js +159 -0
- package/app/plugins/builtin/uci-ml-repository.rssany.js +111 -0
- package/app/plugins/builtin/venturebeat.rssany.js +97 -0
- package/app/plugins/builtin/worldlabs.rssany.js +129 -0
- package/app/plugins/builtin/x.rssany.js +159 -0
- package/app/plugins/builtin/xiaohongshu.rssany.js +283 -0
- package/app/plugins/builtin/zhipu-research.rssany.js +334 -0
- package/dist/index.js +62 -4
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/webui/build/200.html +6 -6
- package/webui/build/_app/immutable/assets/{0.DjU2hdCQ.css → 0.BB88QFoe.css} +1 -1
- package/webui/build/_app/immutable/assets/{homeFeedPanelStore.BopJZtHu.css → homeFeedPanelStore.iOmfP2qL.css} +1 -1
- package/webui/build/_app/immutable/chunks/CZD-YNDw.js +31 -0
- package/webui/build/_app/immutable/chunks/{C85CNwD2.js → D6VIKef0.js} +1 -1
- package/webui/build/_app/immutable/chunks/{CllQAdvt.js → Dbqx2mXq.js} +1 -1
- package/webui/build/_app/immutable/chunks/DeX-oq5W.js +41 -0
- package/webui/build/_app/immutable/chunks/{CdMsRjxJ.js → dhB8G5Is.js} +1 -1
- package/webui/build/_app/immutable/entry/{app.BcD2eSsQ.js → app.XPso7q7g.js} +2 -2
- package/webui/build/_app/immutable/entry/start.Db4snNCd.js +1 -0
- package/webui/build/_app/immutable/nodes/0.BKTQePmA.js +11 -0
- package/webui/build/_app/immutable/nodes/{1.DU9aYGAb.js → 1.BS3_Rfxm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{10.Db6vw7Ih.js → 10.CyyxDCIS.js} +1 -1
- package/webui/build/_app/immutable/nodes/{11.BaAcorz3.js → 11.CtYgIaGj.js} +1 -1
- package/webui/build/_app/immutable/nodes/{14.DqT4pcrQ.js → 14.D5OEGPR2.js} +1 -1
- package/webui/build/_app/immutable/nodes/{15.CCLbjxnH.js → 15.B4dFN1Gk.js} +1 -1
- package/webui/build/_app/immutable/nodes/{16.DiigpVdP.js → 16.M7ZII7tl.js} +1 -1
- package/webui/build/_app/immutable/nodes/{3.DEcYOQc-.js → 3.7r8v7qkm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{5.CvM1TkLG.js → 5.CHIzoGrb.js} +1 -1
- package/webui/build/_app/immutable/nodes/{6.Dscr6LkS.js → 6.BDBqx-GY.js} +1 -1
- package/webui/build/_app/immutable/nodes/{7.Bp60MobD.js → 7.D5czsDmz.js} +1 -1
- package/webui/build/_app/immutable/nodes/{8.DwSg0MHh.js → 8.pjVNsCdV.js} +1 -1
- package/webui/build/_app/immutable/nodes/{9.BeYOUjxR.js → 9.CsARv1BH.js} +1 -1
- package/webui/build/_app/version.json +1 -1
- package/webui/build/_app/immutable/chunks/CtijX1u3.js +0 -31
- package/webui/build/_app/immutable/chunks/Dv1VCsiB.js +0 -41
- package/webui/build/_app/immutable/entry/start.CbkdJdz1.js +0 -1
- package/webui/build/_app/immutable/nodes/0.DSUDmOx2.js +0 -11
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
// Dependency bag taken from ctx.deps; assigned at the start of fetchItems and
// read by the parsing helpers below (createHash, parseHtml).
let _deps;
|
|
2
|
+
|
|
3
|
+
// Google Search plugin: converts a search results page into a list of FeedItems (no enrichment).
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
/**
 * Collapse every run of whitespace into one space and strip both ends.
 *
 * @param {string | null | undefined} text - Raw text; nullish is treated as "".
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  const raw = text ?? "";
  const collapsed = raw.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
9
|
+
|
|
10
|
+
/**
 * Report whether a hostname is `google.<suffix>` or any subdomain of it.
 *
 * @param {string} hostname - Hostname to check (matching is case-insensitive).
 * @returns {boolean} True when the hostname matches the Google host pattern.
 */
function isGoogleHost(hostname) {
  const googleHostPattern = /^([a-z0-9-]+\.)*google\.[a-z.]+$/i;
  return googleHostPattern.test(hostname);
}
|
|
13
|
+
|
|
14
|
+
/**
 * Turn the href of a search-result anchor into the external http(s) URL it points at.
 *
 * Google redirect links (`/url?q=...` or `/url?url=...` on a Google host) are
 * unwrapped to their target. Links that stay on a Google host, non-http(s)
 * links, fragments, `javascript:` links and unparsable values yield null.
 *
 * @param {string | null | undefined} rawHref - The anchor's href attribute.
 * @param {string | URL} pageUrl - Base URL used to resolve relative hrefs.
 * @returns {string | null} Absolute target URL, or null when the link is unusable.
 */
function resolveResultLink(rawHref, pageUrl) {
  if (!rawHref) return null;

  const href = rawHref.trim();
  const isUnusable = href === "" || href.startsWith("#") || href.startsWith("javascript:");
  if (isUnusable) return null;

  const isHttp = (candidate) => /^https?:$/i.test(candidate.protocol);

  let parsed;
  try {
    parsed = new URL(href, pageUrl);
  } catch {
    return null;
  }

  const onGoogle = isGoogleHost(parsed.hostname);
  if (onGoogle && parsed.pathname === "/url") {
    // Redirect wrapper: the real destination is carried in `q` (or `url`).
    const wrapped = parsed.searchParams.get("q") ?? parsed.searchParams.get("url");
    if (!wrapped) return null;
    let target;
    try {
      target = new URL(wrapped);
    } catch {
      return null;
    }
    return isHttp(target) ? target.href : null;
  }

  if (!isHttp(parsed) || onGoogle) return null;
  return parsed.href;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Walk from a node up through its parents and return the first `<a>` element.
 *
 * @param {object | null | undefined} node - Starting node (checked first).
 * @returns {object | null} The nearest anchor, or null if there is none.
 */
function closestAnchor(node) {
  for (let current = node; current; current = current.parentNode ?? null) {
    if (current.tagName?.toLowerCase() === "a") return current;
  }
  return null;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Find snippet text near a node.
 *
 * Starting at `startNode` and climbing at most six levels (the start node
 * included), each node is queried with the known snippet class selectors in
 * order; the first non-empty normalized text wins.
 *
 * @param {object | null | undefined} startNode - Node to start searching from.
 * @returns {string} The snippet text, or "" when none is found.
 */
function extractSnippet(startNode) {
  const snippetSelectors = [".VwiC3b", ".IsZvec", ".MUxGbd", ".lyLwlc"];
  let node = startNode;
  let depth = 0;
  while (depth < 6 && node) {
    for (const selector of snippetSelectors) {
      const snippet = normalizeText(node.querySelector?.(selector)?.textContent);
      if (snippet) return snippet;
    }
    node = node.parentNode ?? null;
    depth += 1;
  }
  return "";
}
|
|
64
|
+
|
|
65
|
+
/**
 * Heuristically decide whether a fetched page is a Google verification /
 * anti-bot page rather than real search results.
 *
 * Checks, in order: a `/sorry/` URL, a captcha element in the DOM, known
 * markers in the raw HTML, and finally known phrases in the visible text.
 *
 * @param {object} root - Parsed document root (needs `querySelector` and `textContent`).
 * @param {string | null | undefined} html - Raw page HTML; nullish is treated as "".
 * @param {string | URL | null | undefined} finalUrl - Final URL of the page;
 *   nullish is treated as "" instead of throwing.
 * @returns {boolean} True when the page looks like a block/verification page.
 */
function looksLikeBlockedPage(root, html, finalUrl) {
  // Guard the URL the same way the HTML is guarded: a missing final URL must
  // not turn "is this page blocked?" into a TypeError.
  const url = String(finalUrl ?? "");
  if (url.includes("/sorry/")) return true;

  if (root.querySelector("#captcha-form, #recaptcha, .g-recaptcha")) return true;

  const body = (html ?? "").toLowerCase();
  const htmlMarkers = [
    "/httpservice/retry/enablejs",
    "id=\"yvlrue\"",
    "if you're having trouble accessing google search",
    "sg_rel",
  ];
  if (htmlMarkers.some((marker) => body.includes(marker))) return true;

  const text = normalizeText(root.textContent).toLowerCase();
  const textMarkers = ["about this page", "unusual traffic", "captcha"];
  return textMarkers.some((marker) => text.includes(marker));
}
|
|
75
|
+
|
|
76
|
+
/**
 * Check whether the current browser page shows a usable Google session.
 *
 * Returns false when the URL, the DOM or the visible text shows a
 * verification / anti-bot page, and also when anything throws; otherwise
 * returns whether a `q` search input is present.
 *
 * @param {object} page - Browser page handle exposing `url()`, `$()` and `evaluate()`.
 * @param {string} _url - Unused.
 * @returns {Promise<boolean>} True when the session looks usable.
 */
async function checkAuth(page, _url) {
  const blockedUrlParts = ["/sorry/", "/httpservice/retry/enablejs"];
  const blockedPhrases = [
    "unusual traffic",
    "about this page",
    "captcha",
    "if you're having trouble accessing google search",
  ];

  try {
    const currentUrl = page.url() || "";
    if (blockedUrlParts.some((part) => currentUrl.includes(part))) return false;

    if (await page.$("#captcha-form, #recaptcha, .g-recaptcha, #yvlrue")) return false;

    const bodyText = await page.evaluate(() => (document.body?.innerText ?? "").toLowerCase());
    if (blockedPhrases.some((phrase) => bodyText.includes(phrase))) return false;

    // A `q` input on the home or search page means the session is usable; a
    // blocked session has already returned false in one of the branches above.
    const searchBox = await page.$('textarea[name="q"], input[name="q"]');
    return Boolean(searchBox);
  } catch {
    return false;
  }
}
|
|
101
|
+
|
|
102
|
+
/**
 * Primary parser: build feed items from result blocks under `#rso`.
 *
 * For each block the result anchor is looked up with progressively looser
 * selectors, its href is resolved to an external link, and the title comes
 * from the block's `<h3>`. Blocks with no usable link, a duplicate link or an
 * empty title are skipped. The summary falls back to the title when no
 * snippet is found.
 *
 * @param {object} root - Parsed document root.
 * @param {URL | string} pageUrl - Page URL used to resolve relative hrefs.
 * @returns {Array<object>} Feed items (guid, title, link, pubDate, author, summary).
 */
function parseFromResultBlocks(root, pageUrl) {
  const anchorSelectors = ['a[jsname="UWckNb"][href]', ".yuRUbf a[href]", "a[href]"];
  const collected = [];
  const seenLinks = new Set();

  for (const block of root.querySelectorAll("#rso .MjjYud .A6K0A, #rso .MjjYud .tF2Cxc")) {
    // First selector that yields an element wins.
    let anchor;
    for (const selector of anchorSelectors) {
      anchor = block.querySelector(selector);
      if (anchor != null) break;
    }

    const link = resolveResultLink(anchor?.getAttribute("href"), pageUrl);
    if (!link || seenLinks.has(link)) continue;

    const heading =
      block.querySelector("h3.LC20lb, h3.DKV0Md, h3.MBeuO, h3") ?? anchor?.querySelector?.("h3");
    const title = normalizeText(heading?.textContent);
    if (!title) continue;

    seenLinks.add(link);
    const summary = extractSnippet(block) || extractSnippet(heading ?? block) || title;
    const guid = _deps.createHash("sha256").update(link).digest("hex");
    collected.push({
      guid,
      title,
      link,
      pubDate: new Date(),
      author: "Google Search",
      summary,
    });
  }

  return collected;
}
|
|
135
|
+
|
|
136
|
+
/**
 * Fallback parser: treat every `<h3>` that sits inside an anchor as a result.
 *
 * Headings with empty text, with no enclosing anchor that resolves to an
 * external link, or whose link was already emitted are skipped. The summary
 * falls back to the title when no snippet is found near the heading.
 *
 * @param {object} root - Parsed document root.
 * @param {URL | string} pageUrl - Page URL used to resolve relative hrefs.
 * @returns {Array<object>} Feed items (guid, title, link, pubDate, author, summary).
 */
function parseFromHeadingFallback(root, pageUrl) {
  const collected = [];
  const seenLinks = new Set();

  for (const heading of root.querySelectorAll("h3")) {
    const title = normalizeText(heading.textContent);
    if (!title) continue;

    const href = closestAnchor(heading)?.getAttribute("href");
    const link = resolveResultLink(href, pageUrl);
    if (!link || seenLinks.has(link)) continue;
    seenLinks.add(link);

    const guid = _deps.createHash("sha256").update(link).digest("hex");
    collected.push({
      guid,
      title,
      link,
      pubDate: new Date(),
      author: "Google Search",
      summary: extractSnippet(heading) || title,
    });
  }

  return collected;
}
|
|
159
|
+
|
|
160
|
+
/**
 * Fetch a Google search results page and convert it into feed items.
 *
 * The structured result-block parser runs first; the heading-based fallback
 * is used only when it yields nothing.
 *
 * @param {string} sourceId - URL of the search results page to fetch.
 * @param {object} ctx - Plugin context providing `deps` (parseHtml, createHash)
 *   and `fetchHtml(sourceId, options)`.
 * @returns {Promise<Array<object>>} The parsed feed items (never empty).
 * @throws {Error} When the page is a verification / anti-bot page, or when no
 *   results could be parsed.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 2500 });
  const root = _deps.parseHtml(html);

  // Fall back to the requested URL when the fetcher reports no final URL, so a
  // missing value does not surface as an "Invalid URL" TypeError.
  const effectiveUrl = finalUrl || sourceId;
  const pageUrl = new URL(effectiveUrl);

  const fromBlocks = parseFromResultBlocks(root, pageUrl);
  const items = fromBlocks.length > 0 ? fromBlocks : parseFromHeadingFallback(root, pageUrl);

  if (items.length === 0) {
    // Tell "we were blocked" apart from "the markup changed" for the caller.
    if (looksLikeBlockedPage(root, html, effectiveUrl)) {
      throw new Error("[google] 命中 Google 验证页(reCAPTCHA/风控),当前会话无法稳定抓取搜索结果");
    }
    throw new Error("[google] 未解析到搜索结果,页面结构可能已变化");
  }
  return items;
}
|
|
177
|
+
|
|
178
|
+
// Plugin descriptor for the Google Search source.
export default {
  id: "google-search",
  // Matches http(s) Google `/search` URLs (optional `www.`, any suffix after
  // `google.`, optional query string).
  listUrlPattern: /^https?:\/\/(www\.)?google\.[^/]+\/search(\?.*)?$/i,
  fetchItems,
  checkAuth,
  loginUrl: "https://www.google.com/",
  domain: "google.com",
  // 5 minutes, in milliseconds.
  loginTimeoutMs: 5 * 60 * 1000,
  // 2 seconds, in milliseconds.
  pollIntervalMs: 2000,
};
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
// Dependency bag taken from ctx.deps; assigned at the start of fetchItems
// (provides parseHtml and createHash).
let _deps;
|
|
2
|
+
|
|
3
|
+
// Hacker News "newest" plugin: parses the newest list page into FeedItems (list only, no body enrichment).
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
// Base origin passed to `new URL(...)` when fetchItems builds the page URL.
const HN_ORIGIN = "https://news.ycombinator.com";
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
/**
 * Squash whitespace runs to single spaces and trim the result.
 *
 * @param {string | null | undefined} text - Raw text; nullish is treated as "".
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  const source = text ?? "";
  return source.replace(/\s+/g, " ").trim();
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
/**
 * Resolve an href against a base URL, accepting only http(s) results.
 *
 * @param {string | null | undefined} rawHref - Href attribute value.
 * @param {string | URL} baseUrl - Base used for relative hrefs.
 * @returns {string | null} Absolute URL, or null for empty, fragment,
 *   `javascript:`, unparsable or non-http(s) hrefs.
 */
function toAbsoluteUrl(rawHref, baseUrl) {
  const href = normalizeText(rawHref);
  const isUnusable = href === "" || href.startsWith("#") || href.startsWith("javascript:");
  if (isUnusable) return null;

  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
/**
 * Parse the `title` attribute of a Hacker News age span into a Date.
 *
 * The attribute is expected to look like `"<ISO timestamp> <unix seconds>"`.
 * The 10-digit epoch part is preferred when present. Otherwise the first part
 * is parsed as a date string; a timestamp with a time component but no zone
 * designator is read as UTC, so the result does not depend on the host's
 * local timezone.
 *
 * @param {string | null | undefined} rawTitle - Raw attribute value.
 * @returns {Date} The parsed date, or the current time when the value is
 *   empty or unparsable.
 */
function parsePubDate(rawTitle) {
  const text = normalizeText(rawTitle);
  if (!text) return new Date();

  const [stamp, epochPart] = text.split(/\s+/);
  if (epochPart && /^\d{10}$/.test(epochPart)) {
    return new Date(Number(epochPart) * 1000);
  }

  // An offset-less date-time string is interpreted as local time by `Date`.
  // Pin it to UTC so the ISO-only fallback matches the epoch branch.
  const hasTime = /T\d{2}:\d{2}/i.test(stamp);
  const hasZone = /(?:Z|[+-]\d{2}:?\d{2})$/i.test(stamp);
  const date = new Date(hasTime && !hasZone ? `${stamp}Z` : stamp);
  return Number.isNaN(date.getTime()) ? new Date() : date;
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
/**
 * Pick the title and link of a story row.
 *
 * The first anchor in the row that has text, is neither a `vote?` nor a
 * `from?site=` link, and resolves to an http(s) URL is used. When no anchor
 * qualifies, a placeholder title and the item's own `/item?id=` URL are returned.
 *
 * @param {object} row - The story's table row element.
 * @param {URL | string} pageUrl - Page URL used to resolve relative hrefs.
 * @param {string} itemId - Numeric item id of the row.
 * @returns {{ title: string, link: string }} Title and absolute link.
 */
function parseTitleLink(row, pageUrl, itemId) {
  const skippedHrefs = [/^vote\?/i, /^from\?site=/i];

  for (const anchor of row.querySelectorAll("a[href]")) {
    const title = normalizeText(anchor.textContent);
    if (!title) continue;

    const href = anchor.getAttribute("href") ?? "";
    if (skippedHrefs.some((pattern) => pattern.test(href))) continue;

    const link = toAbsoluteUrl(href, pageUrl);
    if (link) return { title, link };
  }

  const fallbackLink = new URL(`/item?id=${itemId}`, pageUrl).href;
  return { title: `HN Item ${itemId}`, link: fallbackLink };
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
/**
 * Collect publish date, author and a short summary for one story.
 *
 * The age element is the first `<span>` with a `title` attribute that
 * directly wraps a link to `item?id=<itemId>`. Its parent is treated as the
 * metadata container for the author, the score and the last item link (used
 * as the comment text). The summary joins site, score and comment text with
 * " | ".
 *
 * @param {object} root - Parsed document root (searched for the age link).
 * @param {object} row - The story's table row element (searched for the site link).
 * @param {string} itemId - Numeric item id.
 * @returns {{ pubDate: Date, author: string | undefined, summary: string | undefined }}
 */
function parseMeta(root, row, itemId) {
  const itemHref = `item?id=${itemId}`;
  const textOf = (node) => normalizeText(node?.textContent);

  let ageSpan = null;
  for (const { parentNode } of root.querySelectorAll(`a[href="${itemHref}"]`)) {
    const isSpan = Boolean(parentNode) && parentNode.tagName?.toLowerCase() === "span";
    if (isSpan && parentNode.getAttribute?.("title")) {
      ageSpan = parentNode;
      break;
    }
  }

  const metaContainer = ageSpan?.parentNode ?? null;
  const commentLinks = metaContainer?.querySelectorAll?.(`a[href="${itemHref}"]`) ?? [];

  const siteText = textOf(row.querySelector?.('a[href^="from?site="]'));
  const scoreText = textOf(metaContainer?.querySelector?.(`span[id="score_${itemId}"]`));
  const commentText = textOf(commentLinks[commentLinks.length - 1]);
  const summaryParts = [siteText, scoreText, commentText].filter(Boolean);

  return {
    pubDate: parsePubDate(ageSpan?.getAttribute?.("title")),
    author: textOf(metaContainer?.querySelector?.('a[href^="user?id="]')) || undefined,
    summary: summaryParts.length > 0 ? summaryParts.join(" | ") : undefined,
  };
}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
/**
 * Fetch the Hacker News "newest" page and turn each story row into a feed item.
 *
 * Story rows are `<tr>` elements whose id is purely numeric; each id is
 * emitted at most once, in page order.
 *
 * @param {string} sourceId - URL of the list page to fetch.
 * @param {object} ctx - Plugin context providing `deps` (parseHtml, createHash)
 *   and `fetchHtml(sourceId, options)`.
 * @returns {Promise<Array<object>>} The parsed feed items (never empty).
 * @throws {Error} When no story rows could be parsed.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const fetched = await ctx.fetchHtml(sourceId, { waitMs: 3000 });
  const root = _deps.parseHtml(fetched.html);
  const pageUrl = new URL(fetched.finalUrl || sourceId, HN_ORIGIN);

  // Keyed by item id; Map iteration preserves insertion (page) order.
  const itemsById = new Map();
  for (const row of root.querySelectorAll("tr[id]")) {
    const itemId = normalizeText(row.getAttribute("id"));
    const isStoryRow = /^\d+$/.test(itemId);
    if (!isStoryRow || itemsById.has(itemId)) continue;

    const titleLink = parseTitleLink(row, pageUrl, itemId);
    const meta = parseMeta(root, row, itemId);
    itemsById.set(itemId, {
      guid: _deps.createHash("sha256").update(`hn:${itemId}`).digest("hex"),
      title: titleLink.title || "(无标题)",
      link: titleLink.link,
      pubDate: meta.pubDate,
      author: meta.author,
      summary: meta.summary,
    });
  }

  if (itemsById.size === 0) {
    throw new Error("[hacker-news-newest] 未解析到条目,页面结构可能已变化");
  }

  return [...itemsById.values()];
}
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
// Plugin descriptor for the Hacker News "newest" source.
export default {
  id: "hacker-news-newest",
  // Matches http(s) news.ycombinator.com/newest with an optional trailing
  // slash and optional query string.
  listUrlPattern: /^https?:\/\/news\.ycombinator\.com\/newest\/?(\?.*)?$/i,
  refreshInterval: "10min",
  fetchItems,
};
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
// Dependency bag taken from ctx.deps; assigned at the start of fetchItems and
// read by hashGuid (createHash).
let _deps;
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
// Origin of the Harvard Dataverse site; also the default base for toHttpUrl.
const DATAVERSE_ORIGIN = "https://dataverse.harvard.edu";
// Search endpoint queried by fetchItems.
const DATAVERSE_SEARCH_API = `${DATAVERSE_ORIGIN}/api/search`;
// Query used when the source URL carries no `q` parameter.
const DEFAULT_QUERY = "*";
// Page size used when `per_page` is missing, not an integer, or out of range.
const DEFAULT_PER_PAGE = 30;
// Largest `per_page` value accepted from the source URL.
const MAX_PER_PAGE = 100;
|
|
9
|
+
|
|
10
|
+
/**
 * Replace each whitespace run with one space and trim the ends.
 *
 * @param {string | null | undefined} text - Raw text; nullish is treated as "".
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  const value = text ?? "";
  const singleSpaced = value.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
13
|
+
|
|
14
|
+
/**
 * Derive a stable item GUID as the hex SHA-256 digest of the input.
 *
 * @param {string} input - Seed value to hash.
 * @returns {string} Hex-encoded digest.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256").update(input);
  return hasher.digest("hex");
}
|
|
17
|
+
|
|
18
|
+
/**
 * Parse a date string into a Date.
 *
 * @param {string | null | undefined} value - Date text.
 * @returns {Date | undefined} The parsed date, or undefined when the value is
 *   empty or not a valid date.
 */
function parseDate(value) {
  const text = normalizeText(value);
  if (text === "") return undefined;

  const parsed = new Date(text);
  if (Number.isNaN(parsed.getTime())) return undefined;
  return parsed;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Coerce a value to an integer within optional bounds.
 *
 * Strings are converted with `Number()`. Anything that is not an integer, or
 * that falls outside `[min, max]`, yields the fallback (values are not clamped).
 *
 * @param {unknown} value - Candidate value (number or numeric string).
 * @param {number} fallback - Value returned when the candidate is rejected.
 * @param {{ min?: number, max?: number }} [bounds] - Inclusive bounds.
 * @returns {number} The accepted integer or the fallback.
 */
function toSafeInteger(value, fallback, { min = Number.NEGATIVE_INFINITY, max = Number.POSITIVE_INFINITY } = {}) {
  const candidate = typeof value === "string" ? Number(value) : value;
  const isAcceptable = Number.isInteger(candidate) && !(candidate < min || candidate > max);
  return isAcceptable ? candidate : fallback;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Resolve a URL-like string to an absolute http(s) URL.
 *
 * @param {string | null | undefined} rawUrl - URL text, absolute or relative.
 * @param {string | URL} [baseUrl] - Base for relative values; defaults to the
 *   Dataverse origin.
 * @returns {string | null} Absolute URL, or null when the value is empty,
 *   unparsable or not http(s).
 */
function toHttpUrl(rawUrl, baseUrl = DATAVERSE_ORIGIN) {
  const text = normalizeText(rawUrl);
  if (text === "") return null;

  let resolved;
  try {
    resolved = new URL(text, baseUrl);
  } catch {
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Choose an author label for a search record.
 *
 * Preference order: the non-empty entries of `authors` joined with ", ",
 * then `publisher`, then `name_of_dataverse`.
 *
 * @param {object | null | undefined} record - Search result record.
 * @returns {string | undefined} Author label, or undefined when none is available.
 */
function extractAuthor(record) {
  const authorList = Array.isArray(record?.authors) ? record.authors : [];
  const names = authorList.map((entry) => normalizeText(entry)).filter(Boolean);
  if (names.length > 0) return names.join(", ");

  return normalizeText(record?.publisher) || normalizeText(record?.name_of_dataverse) || undefined;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Work out which dataverse collection a source URL refers to.
 *
 * Preference order: an explicit `subtree` query parameter, the `alias`
 * parameter of `/dataverse.xhtml`, then the alias segment of a
 * `/dataverse/<alias>` path (URL-decoded).
 *
 * @param {string} sourceId - Source URL.
 * @returns {string | undefined} The collection alias, or undefined when the
 *   URL is invalid or names no collection.
 */
function extractSubtree(sourceId) {
  let url;
  try {
    url = new URL(sourceId);
  } catch {
    return undefined;
  }

  const readParam = (key) => normalizeText(url.searchParams.get(key));

  const explicitSubtree = readParam("subtree");
  if (explicitSubtree) return explicitSubtree;

  const alias = readParam("alias");
  if (alias && url.pathname === "/dataverse.xhtml") return alias;

  const pathMatch = /^\/dataverse\/([^/?#]+)\/?$/i.exec(url.pathname);
  if (pathMatch === null) return undefined;

  try {
    return decodeURIComponent(pathMatch[1]);
  } catch {
    // Malformed percent-encoding in the alias segment.
    return undefined;
  }
}
|
|
73
|
+
|
|
74
|
+
/**
 * Translate a source URL into query parameters for the search endpoint.
 *
 * Always requests datasets sorted by date, newest first, unless the source
 * URL supplies a `sort` made only of letters/underscores or an `order` of
 * asc/desc. `q`, `per_page` (or `perPage`) and `start` are read from the
 * source URL with defaults applied; an unparsable source URL gets all
 * defaults. A collection alias, when one can be derived, is added as `subtree`.
 *
 * @param {string} sourceId - Source URL.
 * @returns {URLSearchParams} Parameters for the search request.
 */
function buildQuery(sourceId) {
  const params = new URLSearchParams({ type: "dataset", sort: "date", order: "desc" });

  try {
    const { searchParams } = new URL(sourceId);
    const read = (key) => searchParams.get(key);

    const q = normalizeText(read("q")) || DEFAULT_QUERY;
    const perPage = toSafeInteger(read("per_page") ?? read("perPage"), DEFAULT_PER_PAGE, {
      min: 1,
      max: MAX_PER_PAGE,
    });
    const start = toSafeInteger(read("start"), 0, { min: 0 });
    const sort = normalizeText(read("sort"));
    const order = normalizeText(read("order"));

    params.set("q", q);
    params.set("per_page", String(perPage));
    params.set("start", String(start));
    if (/^[A-Za-z_]+$/.test(sort)) params.set("sort", sort);
    if (/^(asc|desc)$/i.test(order)) params.set("order", order.toLowerCase());
  } catch {
    params.set("q", DEFAULT_QUERY);
    params.set("per_page", String(DEFAULT_PER_PAGE));
    params.set("start", "0");
  }

  const subtree = extractSubtree(sourceId);
  if (subtree) params.set("subtree", subtree);
  return params;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Convert one search result record into a feed item.
 *
 * Records without a name or an http(s) URL are rejected. The GUID is hashed
 * from `global_id` when present, otherwise from the link. The publish date is
 * the first parsable value among `published_at`, `updatedAt` and `createdAt`;
 * failing that, "now" minus `index` seconds, so undated records get distinct,
 * decreasing timestamps in list order.
 *
 * @param {object | null | undefined} record - Search result record.
 * @param {number} index - Position of the record in the result list.
 * @returns {object | null} The feed item, or null when the record is unusable.
 */
function toFeedItem(record, index) {
  const isObject = Boolean(record) && typeof record === "object";
  if (!isObject) return null;

  const title = normalizeText(record.name);
  const link = toHttpUrl(record.url);
  if (!title || !link) return null;

  let pubDate;
  for (const candidate of [record.published_at, record.updatedAt, record.createdAt]) {
    pubDate = parseDate(candidate);
    if (pubDate !== undefined && pubDate !== null) break;
  }
  if (pubDate === undefined || pubDate === null) {
    pubDate = new Date(Date.now() - index * 1000);
  }

  const summary = normalizeText(record.description) || undefined;
  const guidSeed = normalizeText(record.global_id) || link;

  return {
    guid: hashGuid(guidSeed),
    title,
    link,
    pubDate,
    author: extractAuthor(record),
    summary,
    sourceId: "harvard-dataverse",
  };
}
|
|
133
|
+
|
|
134
|
+
/**
 * Query the Dataverse search endpoint for the given source URL and map the
 * results to feed items.
 *
 * @param {string} sourceId - Source URL describing the query / collection.
 * @param {object} ctx - Plugin context; `ctx.deps` supplies `createHash`.
 * @returns {Promise<Array<object>>} The parsed feed items (never empty).
 * @throws {Error} On a non-OK HTTP status, an unexpected response shape, or
 *   when no record could be converted.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;

  const apiUrl = `${DATAVERSE_SEARCH_API}?${buildQuery(sourceId).toString()}`;
  const requestHeaders = { Accept: "application/json" };
  const response = await fetch(apiUrl, { headers: requestHeaders });
  if (!response.ok) {
    throw new Error(`[harvard-dataverse] 请求搜索接口失败: HTTP ${response.status}`);
  }

  // A body that is not valid JSON is treated like a malformed payload below.
  const payload = await response.json().catch(() => null);
  const records = payload?.data?.items;
  if (!Array.isArray(records)) {
    throw new Error("[harvard-dataverse] 搜索接口响应结构异常");
  }

  const items = [];
  records.forEach((record, index) => {
    const item = toFeedItem(record, index);
    if (item) items.push(item);
  });
  if (items.length === 0) {
    throw new Error("[harvard-dataverse] 未解析到条目,接口结构可能已变化");
  }
  return items;
}
|
|
160
|
+
|
|
161
|
+
// Plugin descriptor for the Harvard Dataverse source.
export default {
  id: "harvard-dataverse",
  // Matches the site root (optionally with a query string), a
  // `/dataverse/<alias>` collection page, or `/dataverse.xhtml`.
  listUrlPattern:
    /^https?:\/\/dataverse\.harvard\.edu(?:\/?$|\/\?.*|\/dataverse\/[^/?#]+\/?(?:\?.*)?|\/dataverse\.xhtml(?:\?.*)?)$/i,
  fetchItems,
};
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
// Dependency bag taken from ctx.deps; assigned at the start of fetchItems and
// read by hashGuid (createHash).
let _deps;
|
|
2
|
+
|
|
3
|
+
// Huawei Cloud community blog plugin: scrapes list entries from https://bbs.huaweicloud.com/blogs (list only by default, no enrichment).
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
// Base origin passed to `new URL(...)` when fetchItems builds the page URL.
const HUAWEICLOUD_ORIGIN = "https://bbs.huaweicloud.com";
// Path of a single blog post: `/blogs/<numeric id>`.
const BLOG_PATH_RE = /^\/blogs\/\d+$/;
// First `YYYY-M-D` or `YYYY/M/D` date in a piece of text (year, month, day groups).
const DATE_RE = /(\d{4})[/-](\d{1,2})[/-](\d{1,2})/;
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
/**
 * Reduce whitespace runs to single spaces and trim both ends.
 *
 * @param {string | null | undefined} text - Raw text; nullish is treated as "".
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  const input = text ?? "";
  return input.replace(/\s+/g, " ").trim();
}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
/**
 * Normalize a title and drop a trailing "HOT" badge (any case, preceded by
 * whitespace).
 *
 * @param {string | null | undefined} text - Raw title text.
 * @returns {string} The cleaned title.
 */
function cleanTitle(text) {
  const normalized = normalizeText(text);
  return normalized.replace(/\s+HOT$/i, "");
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
/**
 * Derive a stable item GUID as the hex SHA-256 digest of the input.
 *
 * @param {string} input - Seed value to hash.
 * @returns {string} Hex-encoded digest.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256").update(input);
  return hasher.digest("hex");
}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
/**
 * Resolve an href against a base URL, accepting only http(s) results.
 *
 * @param {string | null | undefined} href - Href value, absolute or relative.
 * @param {string | URL} baseUrl - Base used for relative hrefs.
 * @returns {string | null} Absolute URL, or null when the href is empty,
 *   unparsable or not http(s).
 */
function toAbsoluteUrl(href, baseUrl) {
  if (!href) return null;

  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
/**
 * Extract the blog-post path (`/blogs/<id>`) from an href.
 *
 * The href is resolved to an absolute URL first; trailing slashes on the
 * path are ignored. Only the path is checked, not the host.
 *
 * @param {string | null | undefined} href - Href value.
 * @param {string | URL} baseUrl - Base used for relative hrefs.
 * @returns {string | null} The normalized post path, or null when the href
 *   does not point at a blog post.
 */
function getBlogPath(href, baseUrl) {
  const absolute = toAbsoluteUrl(href, baseUrl);
  if (!absolute) return null;

  let pathname;
  try {
    pathname = new URL(absolute).pathname;
  } catch {
    return null;
  }

  const trimmedPath = pathname.replace(/\/+$/, "");
  return BLOG_PATH_RE.test(trimmedPath) ? trimmedPath : null;
}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
/**
 * Find the first `YYYY-M-D` / `YYYY/M/D` date in a text and return it as
 * midnight at UTC+08:00.
 *
 * @param {string | null | undefined} text - Text that may contain a date.
 * @returns {Date | undefined} The date, or undefined when none is found or
 *   the match does not form a valid date.
 */
function parseDate(text) {
  const match = DATE_RE.exec(normalizeText(text));
  if (match === null) return undefined;

  const year = match[1];
  const month = match[2].padStart(2, "0");
  const day = match[3].padStart(2, "0");
  const parsed = new Date(`${year}-${month}-${day}T00:00:00+08:00`);
  if (Number.isNaN(parsed.getTime())) return undefined;
  return parsed;
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
/**
 * Pull the publish date out of a card by scanning all of its text.
 *
 * @param {object | null | undefined} card - Card element.
 * @returns {Date | undefined} The first date found in the card's text, if any.
 */
function extractDate(card) {
  return parseDate(normalizeText(card?.textContent));
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
/**
 * Read the author name from a card.
 *
 * Prefers the anchor whose id starts with `ydcomm_blog_author_`, then any
 * anchor linking to `/community/usersnew/`.
 *
 * @param {object | null | undefined} card - Card element.
 * @returns {string | undefined} Author name, or undefined when not found.
 */
function extractAuthor(card) {
  if (!card) return undefined;

  const byId = card.querySelector('a[id^="ydcomm_blog_author_"]');
  const authorAnchor = byId ?? card.querySelector('a[href^="/community/usersnew/"]');
  const name = normalizeText(authorAnchor?.textContent);
  return name || undefined;
}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
/**
 * Pick a summary for a card: the longest text among anchors that link to the
 * same post as the title but whose text differs from the title.
 *
 * @param {object | null | undefined} card - Card element.
 * @param {string} title - Cleaned post title (excluded from candidates).
 * @param {string} linkPath - Normalized post path the anchors must match.
 * @param {URL | string} pageUrl - Page URL used to resolve relative hrefs.
 * @returns {string | undefined} The summary text, or undefined when none qualifies.
 */
function extractSummary(card, title, linkPath, pageUrl) {
  if (!card) return undefined;

  let longest = "";
  for (const anchor of card.querySelectorAll("a[href]")) {
    const anchorPath = getBlogPath(anchor.getAttribute("href") || "", pageUrl);
    const pointsAtPost = Boolean(anchorPath) && anchorPath === linkPath;
    if (!pointsAtPost) continue;

    const text = normalizeText(anchor.textContent);
    const isCandidate = text !== "" && text !== title;
    if (isCandidate && text.length > longest.length) longest = text;
  }
  return longest || undefined;
}
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
/**
 * Convert one blog card element into a feed item.
 *
 * The title anchor is found by id prefix first, then by a titled anchor
 * linking to `/blogs/`. The link is rebuilt from the normalized post path on
 * the page URL's origin. The publish date falls back to "now" when the card
 * text contains no date.
 *
 * @param {object | null | undefined} card - Card element (or a container around the title anchor).
 * @param {URL | string} pageUrl - Page URL used to resolve relative hrefs.
 * @returns {object | null} The feed item, or null when the link or title is missing.
 */
function mapCardToFeedItem(card, pageUrl) {
  const titleAnchor =
    card?.querySelector('a[id^="ydcomm_blog_title_"][href]') ??
    card?.querySelector('a[title][href^="/blogs/"], a[title][href*="/blogs/"]');

  const linkPath = getBlogPath(titleAnchor?.getAttribute("href") ?? "", pageUrl);
  if (!linkPath) return null;

  const link = toAbsoluteUrl(linkPath, pageUrl);
  if (!link) return null;

  // Prefer the `title` attribute; fall back to the anchor's visible text.
  const title = cleanTitle(titleAnchor?.getAttribute("title")) || cleanTitle(titleAnchor?.textContent);
  if (!title) return null;

  const pubDate = extractDate(card) ?? new Date();
  const summary = extractSummary(card, title, linkPath, pageUrl) || undefined;
  const author = extractAuthor(card);

  return { guid: hashGuid(link), title, link, pubDate, summary, author };
}
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
/**
 * Climb from a node to the enclosing blog card: the nearest element whose id
 * starts with `ydcomm_blog_content_`. At most eight nodes are inspected,
 * starting with the node itself.
 *
 * @param {object | null | undefined} node - Starting node.
 * @returns {object | null} The card element, or null when none is found in range.
 */
function findCardRoot(node) {
  let candidate = node ?? null;
  let inspected = 0;
  while (inspected < 8 && candidate) {
    if (typeof candidate.getAttribute === "function") {
      const id = candidate.getAttribute("id") || "";
      if (id.startsWith("ydcomm_blog_content_")) return candidate;
    }
    candidate = candidate.parentNode ?? null;
    inspected += 1;
  }
  return null;
}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
/**
 * Primary parser: map every card container (`div` whose id starts with
 * `ydcomm_blog_content_`) to a feed item, keeping the first item per link.
 *
 * @param {object} root - Parsed document root.
 * @param {URL | string} pageUrl - Page URL used to resolve relative hrefs.
 * @returns {Array<object>} Feed items in page order.
 */
function parseFromCardBlocks(root, pageUrl) {
  // Keyed by link; Map iteration preserves insertion (page) order.
  const itemsByLink = new Map();
  for (const card of root.querySelectorAll('div[id^="ydcomm_blog_content_"]')) {
    const item = mapCardToFeedItem(card, pageUrl);
    if (item && !itemsByLink.has(item.link)) itemsByLink.set(item.link, item);
  }
  return [...itemsByLink.values()];
}
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
/**
 * Fallback parser: start from title anchors (id prefix `ydcomm_blog_title_`)
 * and map each one via its enclosing card, or via its direct parent when no
 * card container is found. Keeps the first item per link.
 *
 * @param {object} root - Parsed document root.
 * @param {URL | string} pageUrl - Page URL used to resolve relative hrefs.
 * @returns {Array<object>} Feed items in page order.
 */
function parseFromTitleAnchors(root, pageUrl) {
  // Keyed by link; Map iteration preserves insertion (page) order.
  const itemsByLink = new Map();
  for (const anchor of root.querySelectorAll('a[id^="ydcomm_blog_title_"][href]')) {
    const container = findCardRoot(anchor) ?? anchor.parentNode;
    const item = mapCardToFeedItem(container, pageUrl);
    if (item && !itemsByLink.has(item.link)) itemsByLink.set(item.link, item);
  }
  return [...itemsByLink.values()];
}
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
/**
 * Fetch the Huawei Cloud blog list page and convert it into feed items.
 *
 * Card-based parsing runs first; the title-anchor fallback is used only when
 * it yields nothing.
 *
 * @param {string} sourceId - URL of the list page to fetch.
 * @param {object} ctx - Plugin context providing `deps` (parseHtml, createHash)
 *   and `fetchHtml(sourceId, options)`.
 * @returns {Promise<Array<object>>} The parsed feed items (never empty).
 * @throws {Error} When no blog entries could be parsed.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const fetched = await ctx.fetchHtml(sourceId, { waitMs: 4500 });
  const root = _deps.parseHtml(fetched.html);
  const pageUrl = new URL(fetched.finalUrl || sourceId, HUAWEICLOUD_ORIGIN);

  let items = parseFromCardBlocks(root, pageUrl);
  if (items.length === 0) {
    items = parseFromTitleAnchors(root, pageUrl);
  }
  if (items.length === 0) {
    throw new Error("[huaweicloud-bbs-blogs] 未解析到博客条目,页面结构可能已变化");
  }

  return items;
}
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
// Plugin descriptor for the Huawei Cloud community blog source.
export default {
  id: "huaweicloud-bbs-blogs",
  // Matches http(s) bbs.huaweicloud.com/blogs with an optional trailing slash
  // and optional query string.
  listUrlPattern: /^https?:\/\/bbs\.huaweicloud\.com\/blogs\/?(\?.*)?$/i,
  refreshInterval: "1h",
  fetchItems,
};
|