rssany 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/app/plugins/builtin/agi-eval-evaluation.rssany.js +188 -0
- package/app/plugins/builtin/amii-research-talent.rssany.js +73 -0
- package/app/plugins/builtin/anthropic-research.rssany.js +155 -0
- package/app/plugins/builtin/appen-resources.rssany.js +155 -0
- package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +185 -0
- package/app/plugins/builtin/baaidata-csdn.rssany.js +242 -0
- package/app/plugins/builtin/baidu-research.rssany.js +222 -0
- package/app/plugins/builtin/brightdata-blog.rssany.js +301 -0
- package/app/plugins/builtin/bytedance-seed-research.rssany.js +231 -0
- package/app/plugins/builtin/five-radar.rssany.js +490 -0
- package/app/plugins/builtin/flageval-news.rssany.js +118 -0
- package/app/plugins/builtin/google-deepmind-research.rssany.js +223 -0
- package/app/plugins/builtin/google-research-datasets.rssany.js +171 -0
- package/app/plugins/builtin/google-research.rssany.js +220 -0
- package/app/plugins/builtin/google.rssany.js +187 -0
- package/app/plugins/builtin/hacker-news-newest.rssany.js +130 -0
- package/app/plugins/builtin/harvard-dataverse.rssany.js +166 -0
- package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +185 -0
- package/app/plugins/builtin/lingowhale.rssany.js +119 -0
- package/app/plugins/builtin/meituan-tech.rssany.js +130 -0
- package/app/plugins/builtin/meta-ai-publications.rssany.js +221 -0
- package/app/plugins/builtin/mila-quebec.rssany.js +199 -0
- package/app/plugins/builtin/mit-csail-research.rssany.js +208 -0
- package/app/plugins/builtin/moonshot.rssany.js +127 -0
- package/app/plugins/builtin/opendatalab-news.rssany.js +174 -0
- package/app/plugins/builtin/opendatalab.rssany.js +109 -0
- package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-publications.rssany.js +130 -0
- package/app/plugins/builtin/opendrivelab.rssany.js +333 -0
- package/app/plugins/builtin/paperswithcode.rssany.js +227 -0
- package/app/plugins/builtin/pjlab-adg-publications.rssany.js +202 -0
- package/app/plugins/builtin/rss.rssany.js +11 -1
- package/app/plugins/builtin/selectdataset.rssany.js +206 -0
- package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +154 -0
- package/app/plugins/builtin/supervisely-blog.rssany.js +159 -0
- package/app/plugins/builtin/uci-ml-repository.rssany.js +111 -0
- package/app/plugins/builtin/venturebeat.rssany.js +97 -0
- package/app/plugins/builtin/worldlabs.rssany.js +129 -0
- package/app/plugins/builtin/x.rssany.js +159 -0
- package/app/plugins/builtin/xiaohongshu.rssany.js +283 -0
- package/app/plugins/builtin/zhipu-research.rssany.js +334 -0
- package/dist/index.js +62 -4
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/webui/build/200.html +6 -6
- package/webui/build/_app/immutable/assets/{0.DjU2hdCQ.css → 0.BB88QFoe.css} +1 -1
- package/webui/build/_app/immutable/assets/{homeFeedPanelStore.BopJZtHu.css → homeFeedPanelStore.iOmfP2qL.css} +1 -1
- package/webui/build/_app/immutable/chunks/CZD-YNDw.js +31 -0
- package/webui/build/_app/immutable/chunks/{C85CNwD2.js → D6VIKef0.js} +1 -1
- package/webui/build/_app/immutable/chunks/{CllQAdvt.js → Dbqx2mXq.js} +1 -1
- package/webui/build/_app/immutable/chunks/DeX-oq5W.js +41 -0
- package/webui/build/_app/immutable/chunks/{CdMsRjxJ.js → dhB8G5Is.js} +1 -1
- package/webui/build/_app/immutable/entry/{app.BcD2eSsQ.js → app.XPso7q7g.js} +2 -2
- package/webui/build/_app/immutable/entry/start.Db4snNCd.js +1 -0
- package/webui/build/_app/immutable/nodes/0.BKTQePmA.js +11 -0
- package/webui/build/_app/immutable/nodes/{1.DU9aYGAb.js → 1.BS3_Rfxm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{10.Db6vw7Ih.js → 10.CyyxDCIS.js} +1 -1
- package/webui/build/_app/immutable/nodes/{11.BaAcorz3.js → 11.CtYgIaGj.js} +1 -1
- package/webui/build/_app/immutable/nodes/{14.DqT4pcrQ.js → 14.D5OEGPR2.js} +1 -1
- package/webui/build/_app/immutable/nodes/{15.CCLbjxnH.js → 15.B4dFN1Gk.js} +1 -1
- package/webui/build/_app/immutable/nodes/{16.DiigpVdP.js → 16.M7ZII7tl.js} +1 -1
- package/webui/build/_app/immutable/nodes/{3.DEcYOQc-.js → 3.7r8v7qkm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{5.CvM1TkLG.js → 5.CHIzoGrb.js} +1 -1
- package/webui/build/_app/immutable/nodes/{6.Dscr6LkS.js → 6.BDBqx-GY.js} +1 -1
- package/webui/build/_app/immutable/nodes/{7.Bp60MobD.js → 7.D5czsDmz.js} +1 -1
- package/webui/build/_app/immutable/nodes/{8.DwSg0MHh.js → 8.pjVNsCdV.js} +1 -1
- package/webui/build/_app/immutable/nodes/{9.BeYOUjxR.js → 9.CsARv1BH.js} +1 -1
- package/webui/build/_app/version.json +1 -1
- package/webui/build/_app/immutable/chunks/CtijX1u3.js +0 -31
- package/webui/build/_app/immutable/chunks/Dv1VCsiB.js +0 -41
- package/webui/build/_app/immutable/entry/start.CbkdJdz1.js +0 -1
- package/webui/build/_app/immutable/nodes/0.DSUDmOx2.js +0 -11
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
// Dependency bundle; assigned from the context argument at the start of fetchItems.
let _deps;

// Identifier used as the plugin id and stamped on every produced item.
const SITE_ID = "baai-wudao-paper-article";

// GitHub coordinates of the repository whose README is parsed.
const OWNER = "BAAI-WuDao";
const REPO = "Paper-Article";
const README_PATH = "README.md";

// Raw README content, read from the `main` branch.
const README_RAW_URL = `https://raw.githubusercontent.com/${OWNER}/${REPO}/main/${README_PATH}`;

// Commits API query restricted to the single newest commit touching the README.
const README_COMMITS_API_URL =
  `https://api.github.com/repos/${OWNER}/${REPO}/commits` +
  `?path=${encodeURIComponent(README_PATH)}&per_page=1`;
|
|
12
|
+
|
|
13
|
+
/**
 * Collapses every run of whitespace into one space and trims both ends.
 * `null` and `undefined` are treated as the empty string.
 *
 * @param {string|null|undefined} text - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  const source = text ?? "";
  const collapsed = source.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
16
|
+
|
|
17
|
+
/**
 * Derives an item guid as the hex SHA-256 digest of `input`.
 *
 * @param {string} input - Value to hash (callers pass the item link).
 * @returns {string} Hex-encoded digest.
 */
function hashGuid(input) {
  // NOTE(review): _deps.createHash is presumably Node's crypto.createHash — confirm against the host.
  const hasher = _deps.createHash("sha256");
  return hasher.update(input).digest("hex");
}
|
|
20
|
+
|
|
21
|
+
/**
 * Normalizes a URL candidate captured from README prose.
 *
 * Trailing sentence punctuation (ASCII and full-width) is removed. A trailing
 * `)` is removed only while the text has more `)` than `(`, so URLs that
 * legitimately end in a balanced parenthesis (e.g. `.../A_(b)`) are kept.
 *
 * @param {string|null|undefined} raw - Raw URL text.
 * @returns {string|null} The absolute http(s) URL, or null when the text is
 *   not a parseable http(s) URL.
 */
function cleanUrl(raw) {
  // ASCII punctuation, then full-width: 。 , 、 ; : ! ? ) 】 》
  const trailing = ">.,;!?\u3002\uFF0C\u3001\uFF1B\uFF1A\uFF01\uFF1F\uFF09\u3011\u300B";
  let text = normalizeText(raw);

  while (text.length > 0) {
    const last = text[text.length - 1];
    if (trailing.includes(last)) {
      text = text.slice(0, -1);
      continue;
    }
    if (last === ")") {
      const opens = text.split("(").length - 1;
      const closes = text.split(")").length - 1;
      // Only an unmatched closing parenthesis belongs to the surrounding prose.
      if (closes > opens) {
        text = text.slice(0, -1);
        continue;
      }
    }
    break;
  }

  try {
    const url = new URL(text);
    if (!/^https?:$/i.test(url.protocol)) return null;
    return url.href;
  } catch {
    // Not an absolute URL.
    return null;
  }
}
|
|
31
|
+
|
|
32
|
+
/**
 * Fetches `url` with GitHub JSON headers and returns the decoded body.
 *
 * @param {string} url - Endpoint to request.
 * @returns {Promise<*>} The parsed JSON payload.
 * @throws {Error} `HTTP <status>` when the response is not ok, or a
 *   "non-JSON" error when the body cannot be decoded.
 */
async function fetchJson(url) {
  const requestHeaders = {
    Accept: "application/vnd.github+json",
    "User-Agent": "RssAny/1.0 (+https://github.com/rssany/rssany)",
  };
  const response = await fetch(url, { headers: requestHeaders });

  if (response.ok) {
    try {
      return await response.json();
    } catch {
      // Body was not valid JSON.
      throw new Error("接口返回非 JSON 数据");
    }
  }
  throw new Error(`HTTP ${response.status}`);
}
|
|
48
|
+
|
|
49
|
+
/**
 * Downloads the README markdown from README_RAW_URL.
 *
 * @returns {Promise<string>} The README text exactly as received.
 * @throws {Error} `HTTP <status>` when the response is not ok, or an
 *   "empty README" error when the body is blank after normalization.
 */
async function fetchReadmeMarkdown() {
  const requestHeaders = {
    Accept: "text/plain",
    "User-Agent": "RssAny/1.0 (+https://github.com/rssany/rssany)",
  };
  const response = await fetch(README_RAW_URL, { headers: requestHeaders });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status}`);
  }

  const markdown = await response.text();
  // A whitespace-only body counts as empty.
  if (normalizeText(markdown)) return markdown;
  throw new Error("README 内容为空");
}
|
|
65
|
+
|
|
66
|
+
/**
 * Resolves the date of the newest commit returned by README_COMMITS_API_URL.
 *
 * Best-effort: any request or parsing failure falls back to the current time
 * so the plugin stays usable.
 *
 * @returns {Promise<Date>} The commit date, or `new Date()` as a fallback.
 */
async function fetchReadmeUpdatedAt() {
  try {
    const payload = await fetchJson(README_COMMITS_API_URL);
    const newest = Array.isArray(payload) ? payload[0] : undefined;
    // First non-nullish date wins, in this order of preference.
    const rawDate = [
      newest?.commit?.committer?.date,
      newest?.commit?.author?.date,
      newest?.committer?.date,
      newest?.author?.date,
    ].find((value) => value != null);

    if (rawDate) {
      const parsed = new Date(rawDate);
      if (!Number.isNaN(parsed.getTime())) return parsed;
    }
  } catch {
    // Deliberately ignored: the date endpoint is optional, fall through to "now".
  }
  return new Date();
}
|
|
82
|
+
|
|
83
|
+
/**
 * Classifies a level-3 markdown heading (`### ...`) as a known section.
 *
 * @param {string} line - One README line.
 * @returns {string} "Paper", "Article", or "" when the line is not a
 *   level-3 heading or names neither section.
 */
function parseSectionName(line) {
  const heading = /^###\s+(.+)$/.exec(normalizeText(line));
  if (heading === null) return "";

  const label = normalizeText(heading[1]).toLowerCase();
  // Order matters: "paper" is tested before "article".
  const sections = [
    ["paper", "Paper"],
    ["article", "Article"],
  ];
  for (const [needle, section] of sections) {
    if (label.includes(needle)) return section;
  }
  return "";
}
|
|
91
|
+
|
|
92
|
+
/**
 * Extracts the title from a bullet line of the exact form `* **Title**`.
 *
 * @param {string} line - One README line.
 * @returns {string} The normalized title, or "" when the line has another shape.
 */
function parseTitleLine(line) {
  const bullet = /^\*\s+\*\*(.+?)\*\*\s*$/.exec(normalizeText(line));
  return bullet ? normalizeText(bullet[1]) : "";
}
|
|
98
|
+
|
|
99
|
+
/**
 * Finds the item URL on a README line.
 *
 * A URL introduced by the label `链接` followed by an ASCII or full-width
 * colon is preferred; otherwise the first http(s) URL on the line is used.
 *
 * @param {string} line - One README line.
 * @returns {string|null} The cleaned URL, or null when the line has no
 *   usable URL.
 */
function parseLinkLine(line) {
  const text = normalizeText(line);
  if (!text) return null;

  // \uFF1A is the full-width colon; written as an escape so the pattern
  // survives re-encoding of the source file.
  const labelled = text.match(/链接[:\uFF1A]\s*(https?:\/\/\S+)/i);
  if (labelled) return cleanUrl(labelled[1]);

  const anyUrl = text.match(/(https?:\/\/\S+)/i);
  return anyUrl ? cleanUrl(anyUrl[1]) : null;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Turns README markdown into feed items.
 *
 * Each `* **Title**` bullet becomes an item when a URL is found on one of
 * the next seven lines, before the next title or section heading. Items
 * whose link was already seen are dropped.
 *
 * @param {string} markdown - Full README text.
 * @param {Date} baseDate - Timestamp for the first item; every following
 *   item is dated one second earlier so document order is preserved.
 * @returns {Array<object>} Items with guid, title, link, pubDate, summary
 *   and sourceId.
 */
function parseItemsFromReadme(markdown, baseDate) {
  const summaryBySection = new Map([
    ["Paper", "BAAI-WuDao Paper collection"],
    ["Article", "BAAI-WuDao related article"],
  ]);
  const rows = markdown.split(/\r?\n/);
  const collected = [];
  const usedLinks = new Set();
  let section = "";

  rows.forEach((row, index) => {
    const heading = parseSectionName(row);
    if (heading) {
      section = heading;
      return;
    }

    const title = parseTitleLine(row);
    if (!title) return;

    // Look ahead for the link; stop at the next title or section heading.
    const windowEnd = Math.min(rows.length, index + 8);
    let link = null;
    let cursor = index + 1;
    while (cursor < windowEnd) {
      const candidate = rows[cursor];
      link = parseLinkLine(candidate);
      if (link || parseTitleLine(candidate) || parseSectionName(candidate)) break;
      cursor += 1;
    }
    if (!link || usedLinks.has(link)) return;
    usedLinks.add(link);

    collected.push({
      guid: hashGuid(link),
      title,
      link,
      pubDate: new Date(baseDate.getTime() - collected.length * 1000),
      summary: summaryBySection.get(section) ?? "BAAI-WuDao Paper-Article repository",
      sourceId: SITE_ID,
    });
  });

  return collected;
}
|
|
156
|
+
|
|
157
|
+
/**
 * Plugin entry point: validates the source URL, then builds feed items from
 * the repository README.
 *
 * The URL check accepts the same URLs as the plugin's `listUrlPattern`: an
 * optional `www.` host prefix and any letter case in the repository path.
 *
 * @param {string} sourceId - Repository URL, e.g.
 *   https://github.com/BAAI-WuDao/Paper-Article.
 * @param {object} _ctx - Host context; `_ctx.deps` is stored in the
 *   module-level `_deps`.
 * @returns {Promise<Array<object>>} Parsed items (never empty).
 * @throws {Error} When the URL is invalid or not the supported repository,
 *   when the README cannot be fetched, or when no item could be parsed.
 */
async function fetchItems(sourceId, _ctx) {
  _deps = _ctx.deps;
  let sourceUrl;
  try {
    sourceUrl = new URL(sourceId);
  } catch {
    throw new Error(`[${SITE_ID}] 无效 URL: ${sourceId}`);
  }

  // URL#hostname is already lower-cased; only the optional "www." needs removing.
  const host = sourceUrl.hostname.replace(/^www\./, "");
  const repoPath = sourceUrl.pathname.replace(/\/+$/, "").toLowerCase();
  const expectedPath = `/${OWNER}/${REPO}`.toLowerCase();
  if (host !== "github.com" || repoPath !== expectedPath) {
    throw new Error(`[${SITE_ID}] 仅支持仓库 URL: https://github.com/${OWNER}/${REPO}`);
  }

  // Both requests are independent, so they run in parallel.
  const [markdown, baseDate] = await Promise.all([fetchReadmeMarkdown(), fetchReadmeUpdatedAt()]);
  const items = parseItemsFromReadme(markdown, baseDate);
  if (items.length === 0) {
    throw new Error(`[${SITE_ID}] 未解析到条目,README 结构可能已变化`);
  }
  return items;
}
|
|
180
|
+
|
|
181
|
+
// Plugin descriptor: its id, the pattern of list URLs it declares, and the
// fetchItems entry point.
const paperArticlePlugin = {
  id: SITE_ID,
  listUrlPattern: /^https?:\/\/(www\.)?github\.com\/BAAI-WuDao\/Paper-Article\/?(?:\?.*)?$/i,
  fetchItems,
};

export default paperArticlePlugin;
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
// Dependency bundle; assigned from the context argument at the start of fetchItems.
let _deps;

// Article pages: a 24-hex-digit identifier followed by ".html", directly under the site root.
const ARTICLE_PATH_RE = /^\/([0-9a-f]{24})\.html$/i;

// "YYYY-M-D", optionally followed by "H:MM" and ":SS"; not anchored, so it
// can be found inside longer text.
const DATE_RE = /(\d{4})-(\d{1,2})-(\d{1,2})(?:\s+(\d{1,2}):(\d{2})(?::(\d{2}))?)?/;
|
|
6
|
+
|
|
7
|
+
/**
 * Squeezes whitespace runs to single spaces and strips leading/trailing
 * whitespace; nullish input yields "".
 *
 * @param {string|null|undefined} text - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  const value = text == null ? "" : text;
  return value.replace(/\s+/g, " ").trim();
}
|
|
10
|
+
|
|
11
|
+
/**
 * Computes the hex SHA-256 digest used as an item guid.
 *
 * @param {string} input - Value to hash (callers pass the canonical link).
 * @returns {string} Hex-encoded digest.
 */
function hashGuid(input) {
  // NOTE(review): _deps.createHash is presumably Node's crypto.createHash — confirm against the host.
  const hasher = _deps.createHash("sha256");
  return hasher.update(input).digest("hex");
}
|
|
14
|
+
|
|
15
|
+
/**
 * Resolves an href against the page URL and keeps it only if it is http(s).
 *
 * @param {string|null|undefined} rawHref - Href as found in the page or state.
 * @param {string} pageUrl - Base URL for relative hrefs.
 * @returns {string|null} The absolute URL, or null for empty, unparseable
 *   or non-http(s) hrefs.
 */
function toAbsoluteUrl(rawHref, pageUrl) {
  if (!rawHref) return null;

  let resolved;
  try {
    resolved = new URL(rawHref, pageUrl);
  } catch {
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Parses the first `YYYY-M-D[ H:MM[:SS]]` date found in the text, reading
 * it with a fixed +08:00 offset.
 *
 * @param {string|null|undefined} rawText - Text that may contain a date.
 * @returns {Date} The parsed date, or the current time when no valid date
 *   is present.
 */
function parsePubDate(rawText) {
  const match = DATE_RE.exec(normalizeText(rawText));
  if (match === null) return new Date();

  // Missing time parts default to "00"; single digits are zero-padded.
  const two = (part) => (part ?? "00").padStart(2, "0");
  const [, year, month, day, hour, minute, second] = match;
  const stamp = new Date(
    `${year}-${two(month)}-${two(day)}T${two(hour)}:${two(minute)}:${two(second)}+08:00`
  );
  return Number.isNaN(stamp.getTime()) ? new Date() : stamp;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Walks from a node up through at most six nodes (the node itself and up to
 * five ancestors) and returns the normalized text of the first one whose
 * text contains a date.
 *
 * @param {object|null} anchor - Node exposing `textContent` and `parentNode`.
 * @returns {string} The matching node's normalized text, or "" if none matches.
 */
function findNearbyDateText(anchor) {
  let node = anchor;
  let hops = 0;
  while (hops < 6 && node) {
    const text = normalizeText(node.textContent);
    if (DATE_RE.test(text)) return text;
    node = node.parentNode ?? null;
    hops += 1;
  }
  return "";
}
|
|
45
|
+
|
|
46
|
+
/**
 * Tells whether an anchor text is UI chrome rather than an article title.
 *
 * @param {string|null|undefined} title - Candidate title text.
 * @returns {boolean} True for empty text and for the known placeholder/menu labels.
 */
function isNoiseTitle(title) {
  const text = normalizeText(title);
  const noiseLabels = ["暂无图片", "登录", "加入社区"];
  return text === "" || noiseLabels.includes(text);
}
|
|
52
|
+
|
|
53
|
+
/**
 * Chooses a title among the texts of all anchors pointing at one article.
 *
 * Candidates are normalized and de-duplicated, and noise labels or texts
 * shorter than 6 characters are discarded. The shortest candidate of at most
 * 80 characters wins; failing that, the shortest remaining candidate. Ties
 * go to the candidate that appeared first.
 *
 * @param {Array<string>} candidates - Raw anchor texts.
 * @returns {string} The chosen title, or "" when nothing qualifies.
 */
function pickBestTitle(candidates) {
  const distinct = [...new Set(candidates.map((raw) => normalizeText(raw)).filter(Boolean))];
  const usable = distinct.filter((text) => !isNoiseTitle(text) && text.length >= 6);

  // Strict "<" keeps the earliest candidate among equal lengths.
  const shortestOf = (list) =>
    list.reduce(
      (best, text) => (best === undefined || text.length < best.length ? text : best),
      undefined
    );

  const preferred = shortestOf(usable.filter((text) => text.length <= 80));
  return preferred ?? shortestOf(usable) ?? "";
}
|
|
69
|
+
|
|
70
|
+
/**
 * Builds feed items from the article links found in the rendered HTML.
 *
 * Anchors are grouped by canonical link (origin + path, without query or
 * hash). Only links on baaidata.csdn.net whose path matches ARTICLE_PATH_RE
 * are kept. Each group yields one item; its date is taken from the first
 * anchor of the group that has date text nearby.
 *
 * @param {string} html - Page HTML.
 * @param {string} finalUrl - Base URL for resolving relative hrefs.
 * @returns {Array<object>} Items sorted by pubDate, newest first.
 */
function parseDomItems(html, finalUrl) {
  const root = _deps.parseHtml(html);
  const byLink = new Map();

  for (const anchor of root.querySelectorAll("a[href]")) {
    const href = toAbsoluteUrl(anchor.getAttribute("href"), finalUrl);
    if (!href) continue;

    const { hostname, pathname, origin } = new URL(href);
    if (hostname !== "baaidata.csdn.net" || !ARTICLE_PATH_RE.test(pathname)) continue;

    const canonicalLink = `${origin}${pathname}`;
    if (!byLink.has(canonicalLink)) {
      byLink.set(canonicalLink, { anchors: [], titles: [] });
    }
    const bucket = byLink.get(canonicalLink);
    bucket.anchors.push(anchor);
    bucket.titles.push(anchor.textContent ?? "");
  }

  const items = [];
  for (const [canonicalLink, { anchors, titles }] of byLink) {
    const title = pickBestTitle(titles);
    if (!title) continue;

    // Stop at the first anchor that yields date text.
    let dateText = "";
    for (let k = 0; k < anchors.length && !dateText; k += 1) {
      dateText = findNearbyDateText(anchors[k]);
    }

    items.push({
      guid: hashGuid(canonicalLink),
      title,
      link: canonicalLink,
      pubDate: parsePubDate(dateText),
      author: "智源数据社区",
      sourceId: "baaidata-csdn",
    });
  }

  return items.sort((a, b) => b.pubDate.getTime() - a.pubDate.getTime());
}
|
|
116
|
+
|
|
117
|
+
/**
 * Finds the index of the `}` that closes the object opened at or after
 * `startIndex`, skipping braces inside double-quoted strings and honouring
 * backslash escapes.
 *
 * @param {string} raw - Text containing the object literal.
 * @param {number} startIndex - Index at which scanning starts.
 * @returns {number} Index of the closing brace, or -1 if the object never closes.
 */
function findJsonObjectEnd(raw, startIndex) {
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let pos = startIndex; pos < raw.length; pos += 1) {
    const ch = raw[pos];

    if (inString) {
      if (escaped) {
        // The character after a backslash is consumed verbatim.
        escaped = false;
      } else if (ch === "\\") {
        escaped = true;
      } else if (ch === "\"") {
        inString = false;
      }
      continue;
    }

    switch (ch) {
      case "\"":
        inString = true;
        break;
      case "{":
        depth += 1;
        break;
      case "}":
        depth -= 1;
        if (depth === 0) return pos;
        break;
      default:
        break;
    }
  }

  return -1;
}
|
|
149
|
+
|
|
150
|
+
/**
 * Extracts the object assigned to `window.__INITIAL_STATE__` in the page
 * source and parses it as JSON.
 *
 * @param {string} html - Raw page HTML.
 * @returns {object|null} The parsed state, or null when the marker is
 *   missing, the object is unterminated, or the text is not valid JSON.
 */
function extractInitialState(html) {
  const marker = "window.__INITIAL_STATE__=";
  const markerAt = html.indexOf(marker);
  if (markerAt === -1) return null;

  // First "{" after the marker, then its matching "}".
  const openAt = html.indexOf("{", markerAt + marker.length);
  if (openAt === -1) return null;
  const closeAt = findJsonObjectEnd(html, openAt);
  if (closeAt < 0) return null;

  let state;
  try {
    state = JSON.parse(html.slice(openAt, closeAt + 1));
  } catch {
    return null;
  }
  return state && typeof state === "object" ? state : null;
}
|
|
169
|
+
|
|
170
|
+
/**
 * Converts one record of the page's initial state into a feed item.
 *
 * Fields are read from `raw.content` when that is an object, otherwise from
 * `raw` itself. State values come from parsed JSON and are not guaranteed to
 * be strings, so numbers are converted to text and any other non-string
 * value is treated as missing, instead of throwing inside the text helpers.
 *
 * @param {*} raw - One entry of the state lists.
 * @param {string} finalUrl - Base URL for resolving relative links.
 * @returns {object|null} The item, or null when the record has no title or
 *   no resolvable http(s) link.
 */
function toItemFromState(raw, finalUrl) {
  if (!raw || typeof raw !== "object") return null;
  const entry = raw.content && typeof raw.content === "object" ? raw.content : raw;

  // Strings pass through, numbers/bigints are stringified, everything else is dropped.
  const asText = (value) => {
    if (typeof value === "string") return normalizeText(value);
    if (typeof value === "number" || typeof value === "bigint") return String(value);
    return "";
  };

  const id = asText(entry.id);
  const title = asText(entry.name ?? entry.title);
  if (!title) return null;

  const rawLink = asText(raw.contentUrl ?? entry.contentUrl ?? entry.pagePath);
  // Without an explicit URL, fall back to the "/<id>.html" article path.
  const fallbackLink = id ? `/${id}.html` : "";
  const link = toAbsoluteUrl(rawLink || fallbackLink, finalUrl);
  if (!link) return null;

  // Canonical form drops query string and hash.
  const canonical = new URL(link);
  const canonicalLink = `${canonical.origin}${canonical.pathname}`;

  const summary = asText(entry.desc ?? raw.desc);
  const author = asText(raw.nickname ?? raw.username ?? raw.user?.nickname);
  const pubDate = parsePubDate(asText(entry.createdTime ?? raw.createdTime));

  return {
    guid: hashGuid(canonicalLink),
    title,
    link: canonicalLink,
    pubDate,
    author: author || "智源数据社区",
    summary: summary || undefined,
    sourceId: "baaidata-csdn",
  };
}
|
|
200
|
+
|
|
201
|
+
/**
 * Builds feed items from the page's embedded initial state.
 *
 * Records are read from `articleContent.latest`, `articleContent.hot` and
 * `headlines`, in that order. The first record seen for each link wins.
 *
 * @param {string} html - Raw page HTML.
 * @param {string} finalUrl - Base URL for resolving relative links.
 * @returns {Array<object>} Items sorted by pubDate, newest first; empty when
 *   no state could be extracted.
 */
function parseStateItems(html, finalUrl) {
  const state = extractInitialState(html);
  if (!state) return [];

  const listOf = (value) => (Array.isArray(value) ? value : []);
  const pool = [
    ...listOf(state.articleContent?.latest),
    ...listOf(state.articleContent?.hot),
    ...listOf(state.headlines),
  ];

  const byLink = new Map();
  for (const raw of pool) {
    const item = toItemFromState(raw, finalUrl);
    if (item && !byLink.has(item.link)) {
      byLink.set(item.link, item);
    }
  }

  return [...byLink.values()].sort((a, b) => b.pubDate.getTime() - a.pubDate.getTime());
}
|
|
224
|
+
|
|
225
|
+
/**
 * Plugin entry point: loads the list page and extracts its articles.
 *
 * The DOM of the first fetch is tried first. When it yields nothing, the
 * page is fetched again with `purify: false` and the embedded initial state
 * is parsed instead.
 *
 * Relative links are resolved against the fetch result's `finalUrl`, falling
 * back to `sourceId` when the result carries none.
 *
 * @param {string} sourceId - List page URL.
 * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
 * @returns {Promise<Array<object>>} Parsed items (never empty).
 * @throws {Error} When neither strategy yields an item.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const rendered = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
  const fromDom = parseDomItems(rendered.html, rendered.finalUrl || sourceId);
  if (fromDom.length > 0) return fromDom;

  // NOTE(review): `purify: false` presumably keeps inline scripts in the HTML — confirm against the host.
  const raw = await ctx.fetchHtml(sourceId, { waitMs: 3500, purify: false });
  const fromState = parseStateItems(raw.html, raw.finalUrl || sourceId);
  if (fromState.length > 0) return fromState;

  throw new Error("[baaidata-csdn] 未解析到条目,页面结构可能已变化");
}
|
|
237
|
+
|
|
238
|
+
// Plugin descriptor: its id, the pattern of list URLs it declares, and the
// fetchItems entry point.
const baaidataCsdnPlugin = {
  id: "baaidata-csdn",
  listUrlPattern: /^https?:\/\/baaidata\.csdn\.net\/?(?:\?.*)?$/i,
  fetchItems,
};

export default baaidataCsdnPlugin;
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
// Dependency bundle; assigned from the context argument at the start of fetchItems.
let _deps;

// Baidu Research plugin: collects Blog list entries (no article-body enrichment).

// Path of a single blog post page; the post id travels in the query string.
const BLOG_ITEM_PATH_RE = /^\/Blog\/index-view(?:\/)?$/i;

// Zero-based month index keyed by the lower-cased three-letter English abbreviation.
const MONTH_TO_INDEX = {
  jan: 0,
  feb: 1,
  mar: 2,
  apr: 3,
  may: 4,
  jun: 5,
  jul: 6,
  aug: 7,
  sep: 8,
  oct: 9,
  nov: 10,
  dec: 11,
};

// Elements whose text is inspected inside a blog anchor.
const TEXT_NODE_SELECTORS = "div, p, span, h1, h2, h3, h4, h5, h6, strong, em";

// "Month D", optionally followed by a comma and up to four year digits.
// \uFF0C is the full-width comma; it is written as an escape so the pattern
// survives re-encoding of the source file.
const MONTH_DAY_TEXT_RE = /^([A-Za-z]{3,9})\s+\d{1,2}(?:st|nd|rd|th)?(?:\s*[,\uFF0C]\s*\d{0,4})?$/i;
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
/**
 * Returns the text with whitespace runs reduced to one space and no leading
 * or trailing whitespace; nullish input becomes "".
 *
 * @param {string|null|undefined} text - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  const source = text ?? "";
  const singleSpaced = source.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
/**
 * Produces the hex SHA-256 digest used as an item guid.
 *
 * @param {string} input - Value to hash (callers pass the item link).
 * @returns {string} Hex-encoded digest.
 */
function hashGuid(input) {
  // NOTE(review): _deps.createHash is presumably Node's crypto.createHash — confirm against the host.
  const hasher = _deps.createHash("sha256");
  return hasher.update(input).digest("hex");
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
/**
 * Resolves an href against a base URL, keeping only http(s) results.
 *
 * @param {string|null|undefined} rawHref - Href attribute value.
 * @param {string} baseUrl - Base URL for relative hrefs.
 * @returns {string|null} The absolute URL, or null for empty hrefs,
 *   fragment-only or `javascript:` hrefs, unparseable values and other schemes.
 */
function toAbsoluteHttpUrl(rawHref, baseUrl) {
  const href = rawHref ? rawHref.trim() : "";
  const isUnusable = href === "" || href.startsWith("#") || href.startsWith("javascript:");
  if (isUnusable) return null;

  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
/**
 * Looks for a plausible four-digit year in a list of texts.
 *
 * Only the first `19xx`/`20xx` number of each text is considered. It is
 * accepted when it lies between 1990 and next year (UTC), inclusive.
 *
 * @param {Iterable<string>} texts - Texts to scan, in priority order.
 * @returns {number|undefined} The first acceptable year, or undefined.
 */
function inferYearFromTexts(texts) {
  for (const text of texts) {
    const found = /\b(19|20)\d{2}\b/.exec(normalizeText(text));
    if (found === null) continue;

    const year = Number(found[0]);
    const latestAllowed = new Date().getUTCFullYear() + 1;
    const plausible = year >= 1990 && year <= latestAllowed;
    if (plausible) return year;
  }
  return undefined;
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
function parsePubDate(rawText, fallbackTexts) {
|
|
64
|
+
const text = normalizeText(rawText).replace(",", ",");
|
|
65
|
+
if (!text) return undefined;
|
|
66
|
+
|
|
67
|
+
const monthMatch = text.match(
|
|
68
|
+
/^([A-Za-z]{3,9})\s+(\d{1,2})(?:st|nd|rd|th)?(?:\s*,\s*)?(?:(\d{4}))?$/
|
|
69
|
+
);
|
|
70
|
+
if (monthMatch) {
|
|
71
|
+
const month = MONTH_TO_INDEX[monthMatch[1].slice(0, 3).toLowerCase()];
|
|
72
|
+
if (month != null) {
|
|
73
|
+
const day = Number(monthMatch[2]);
|
|
74
|
+
const year = monthMatch[3] ? Number(monthMatch[3]) : inferYearFromTexts(fallbackTexts);
|
|
75
|
+
if (!year) return undefined;
|
|
76
|
+
const parsed = new Date(Date.UTC(year, month, day, 12, 0, 0));
|
|
77
|
+
if (!Number.isNaN(parsed.getTime())) return parsed;
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
const numericMatch = text.match(/^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$/);
|
|
82
|
+
if (numericMatch) {
|
|
83
|
+
const year = Number(numericMatch[1]);
|
|
84
|
+
const month = Number(numericMatch[2]) - 1;
|
|
85
|
+
const day = Number(numericMatch[3]);
|
|
86
|
+
const parsed = new Date(Date.UTC(year, month, day, 12, 0, 0));
|
|
87
|
+
if (!Number.isNaN(parsed.getTime())) return parsed;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
return undefined;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
/**
 * Tells whether a text has the shape of a "Month D[, YYYY]" date, whether or
 * not it denotes a real date.
 *
 * @param {string|null|undefined} text - Candidate text.
 * @returns {boolean} True when the normalized text matches MONTH_DAY_TEXT_RE.
 */
function isDateLikeText(text) {
  const candidate = normalizeText(text);
  return MONTH_DAY_TEXT_RE.test(candidate);
}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
/**
 * Collects the distinct texts of the innermost text elements inside an anchor.
 *
 * An element counts as innermost when it contains no further element matching
 * TEXT_NODE_SELECTORS. Empty texts and the literal "MORE" label are skipped.
 *
 * @param {object} anchor - Element exposing `querySelectorAll`.
 * @returns {Array<string>} Normalized texts in document order, without duplicates.
 */
function getLeafTexts(anchor) {
  // A Set keeps first-insertion order, which gives document order here.
  const unique = new Set();
  for (const el of anchor.querySelectorAll(TEXT_NODE_SELECTORS)) {
    const isLeaf = el.querySelector(TEXT_NODE_SELECTORS) == null;
    if (!isLeaf) continue;

    const text = normalizeText(el.textContent);
    if (text && text !== "MORE") {
      unique.add(text);
    }
  }
  return [...unique];
}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
/**
 * Turns one anchor into a blog feed item, if it links to a blog post.
 *
 * The anchor qualifies when it resolves to
 * `research.baidu.com/Blog/index-view` with an all-digit `id` query
 * parameter. Title, date and summary are derived from its leaf texts:
 *  - date: the first text that parses as a date;
 *  - title: among non-date texts of at least 6 characters, the shortest one
 *    that does not look like a summary, falling back to summary-like ones;
 *  - summary: the first other non-date text that ends in "..." or has at
 *    least 40 characters.
 *
 * @param {object} anchor - Element exposing `getAttribute` and `querySelectorAll`.
 * @param {string} finalUrl - Base URL for resolving the href.
 * @returns {object|null} The item, or null when the anchor is not a blog
 *   post link or no title could be determined.
 */
function parseAnchorItem(anchor, finalUrl) {
  const link = toAbsoluteHttpUrl(anchor.getAttribute("href"), finalUrl);
  if (!link) return null;

  const { hostname, pathname, searchParams } = new URL(link);
  const isBlogPost =
    hostname === "research.baidu.com" &&
    BLOG_ITEM_PATH_RE.test(pathname) &&
    /^\d+$/.test(searchParams.get("id") ?? "");
  if (!isBlogPost) return null;

  const texts = getLeafTexts(anchor);
  if (texts.length === 0) return null;

  // dates[i] is the parsed date of texts[i], or undefined.
  const dates = texts.map((text) => parsePubDate(text, texts));
  const dateIndex = dates.findIndex((date) => date != null);
  const pubDate = dateIndex === -1 ? undefined : dates[dateIndex];

  const nonDateTexts = texts.filter((text, i) => dates[i] == null && !isDateLikeText(text));
  const looksLikeSummary = (text) => text.endsWith("...") || text.split(/\s+/).length >= 22;

  // Best candidate = lowest (summary penalty, length); earlier text wins ties.
  let best;
  for (const candidate of nonDateTexts) {
    if (candidate.length < 6) continue;
    if (best === undefined) {
      best = candidate;
      continue;
    }
    const candidatePenalty = looksLikeSummary(candidate) ? 1 : 0;
    const bestPenalty = looksLikeSummary(best) ? 1 : 0;
    const improves =
      candidatePenalty < bestPenalty ||
      (candidatePenalty === bestPenalty && candidate.length < best.length);
    if (improves) best = candidate;
  }

  let title = best ?? "";
  if (!title && dateIndex > 0) {
    // Fall back to the first long-enough text that precedes the date.
    title = texts.slice(0, dateIndex).find((text) => text.length >= 6) ?? "";
  }
  if (!title) return null;

  const summary = nonDateTexts.find(
    (text) => text !== title && (text.endsWith("...") || text.length >= 40)
  );

  return {
    guid: hashGuid(link),
    title,
    link,
    pubDate: pubDate ?? new Date(),
    author: "Baidu Research",
    summary: summary || undefined,
    sourceId: "baidu-research",
  };
}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
/**
 * Extracts all blog items from a page, one per distinct link.
 *
 * @param {string} html - Page HTML.
 * @param {string} finalUrl - Base URL for resolving relative hrefs.
 * @returns {Array<object>} Items in document order; the first anchor seen
 *   for a link wins.
 */
function parseBlogItems(html, finalUrl) {
  const root = _deps.parseHtml(html);
  const byLink = new Map();

  for (const anchor of root.querySelectorAll("a[href]")) {
    const item = parseAnchorItem(anchor, finalUrl);
    if (item && !byLink.has(item.link)) {
      byLink.set(item.link, item);
    }
  }

  return [...byLink.values()];
}
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
/**
 * Concatenates two item lists, keeping only the first item for each link.
 *
 * @param {Array<object>} itemsA - Items that take precedence.
 * @param {Array<object>} itemsB - Items appended when their link is new.
 * @returns {Array<object>} A new array in first-seen order.
 */
function mergeByLink(itemsA, itemsB) {
  const seenLinks = new Set();
  return [...itemsA, ...itemsB].filter((item) => {
    if (seenLinks.has(item.link)) return false;
    seenLinks.add(item.link);
    return true;
  });
}
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
/**
 * Plugin entry point: collects blog items from the requested page.
 *
 * When that page is on research.baidu.com and yields fewer than five items,
 * the site's `/Blog` page is fetched as well (unless it is the page already
 * fetched) and its items are merged in.
 *
 * @param {string} sourceId - List page URL.
 * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
 * @returns {Promise<Array<object>>} Parsed items (never empty).
 * @throws {Error} When no blog item could be parsed.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;

  const primary = await ctx.fetchHtml(sourceId, { waitMs: 4500 });
  const primaryHref = primary.finalUrl || sourceId;
  let items = parseBlogItems(primary.html, primaryHref);

  const primaryUrl = new URL(primaryHref);
  const isSparse = items.length < 5 && primaryUrl.hostname === "research.baidu.com";
  // When no extra page is wanted, blogUrl equals the primary URL and the fetch is skipped.
  const blogUrl = isSparse ? new URL("/Blog", primaryUrl).href : primaryUrl.href;
  if (blogUrl !== primaryUrl.href) {
    const blogPage = await ctx.fetchHtml(blogUrl, { waitMs: 4500 });
    const blogItems = parseBlogItems(blogPage.html, blogPage.finalUrl || blogUrl);
    items = mergeByLink(items, blogItems);
  }

  if (items.length === 0) {
    throw new Error("[baidu-research] 未解析到 Blog 条目,页面结构可能已变化");
  }
  return items;
}
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
// Plugin descriptor: its id, the pattern of list URLs it declares, and the
// fetchItems entry point.
const baiduResearchPlugin = {
  id: "baidu-research",
  listUrlPattern: /^https?:\/\/research\.baidu\.com\/(?:(?:Index|Blog)\/?)?(?:\?.*)?$/i,
  fetchItems,
};

export default baiduResearchPlugin;
|