rssany 0.1.2 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -50
- package/app/plugins/builtin/agi-eval-evaluation.rssany.js +188 -0
- package/app/plugins/builtin/amii-research-talent.rssany.js +73 -0
- package/app/plugins/builtin/anthropic-research.rssany.js +155 -0
- package/app/plugins/builtin/appen-resources.rssany.js +155 -0
- package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +185 -0
- package/app/plugins/builtin/baaidata-csdn.rssany.js +242 -0
- package/app/plugins/builtin/baidu-research.rssany.js +222 -0
- package/app/plugins/builtin/brightdata-blog.rssany.js +301 -0
- package/app/plugins/builtin/bytedance-seed-research.rssany.js +231 -0
- package/app/plugins/builtin/five-radar.rssany.js +490 -0
- package/app/plugins/builtin/flageval-news.rssany.js +118 -0
- package/app/plugins/builtin/google-deepmind-research.rssany.js +223 -0
- package/app/plugins/builtin/google-research-datasets.rssany.js +171 -0
- package/app/plugins/builtin/google-research.rssany.js +220 -0
- package/app/plugins/builtin/google.rssany.js +187 -0
- package/app/plugins/builtin/hacker-news-newest.rssany.js +130 -0
- package/app/plugins/builtin/harvard-dataverse.rssany.js +166 -0
- package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +185 -0
- package/app/plugins/builtin/lingowhale.rssany.js +119 -0
- package/app/plugins/builtin/meituan-tech.rssany.js +130 -0
- package/app/plugins/builtin/meta-ai-publications.rssany.js +221 -0
- package/app/plugins/builtin/mila-quebec.rssany.js +199 -0
- package/app/plugins/builtin/mit-csail-research.rssany.js +208 -0
- package/app/plugins/builtin/moonshot.rssany.js +127 -0
- package/app/plugins/builtin/opendatalab-news.rssany.js +174 -0
- package/app/plugins/builtin/opendatalab.rssany.js +109 -0
- package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-publications.rssany.js +130 -0
- package/app/plugins/builtin/opendrivelab.rssany.js +333 -0
- package/app/plugins/builtin/paperswithcode.rssany.js +227 -0
- package/app/plugins/builtin/pjlab-adg-publications.rssany.js +202 -0
- package/app/plugins/builtin/rss.rssany.js +11 -1
- package/app/plugins/builtin/selectdataset.rssany.js +206 -0
- package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +154 -0
- package/app/plugins/builtin/supervisely-blog.rssany.js +159 -0
- package/app/plugins/builtin/uci-ml-repository.rssany.js +111 -0
- package/app/plugins/builtin/venturebeat.rssany.js +97 -0
- package/app/plugins/builtin/worldlabs.rssany.js +129 -0
- package/app/plugins/builtin/x.rssany.js +159 -0
- package/app/plugins/builtin/xiaohongshu.rssany.js +283 -0
- package/app/plugins/builtin/zhipu-research.rssany.js +334 -0
- package/dist/index.js +79 -9
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/webui/build/200.html +6 -6
- package/webui/build/_app/immutable/assets/0.BB88QFoe.css +1 -0
- package/webui/build/_app/immutable/assets/{homeFeedPanelStore.BopJZtHu.css → homeFeedPanelStore.iOmfP2qL.css} +1 -1
- package/webui/build/_app/immutable/chunks/CZD-YNDw.js +31 -0
- package/webui/build/_app/immutable/chunks/{DcAshVxe.js → D6VIKef0.js} +1 -1
- package/webui/build/_app/immutable/chunks/{EIZIMsXK.js → Dbqx2mXq.js} +1 -1
- package/webui/build/_app/immutable/chunks/DeX-oq5W.js +41 -0
- package/webui/build/_app/immutable/chunks/{BXCWEhUd.js → dhB8G5Is.js} +1 -1
- package/webui/build/_app/immutable/entry/{app.DdgnooOk.js → app.XPso7q7g.js} +2 -2
- package/webui/build/_app/immutable/entry/start.Db4snNCd.js +1 -0
- package/webui/build/_app/immutable/nodes/0.BKTQePmA.js +11 -0
- package/webui/build/_app/immutable/nodes/{1.5DFDaT4c.js → 1.BS3_Rfxm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{10.OVK4i9XE.js → 10.CyyxDCIS.js} +1 -1
- package/webui/build/_app/immutable/nodes/{11.Dhn_rO4A.js → 11.CtYgIaGj.js} +1 -1
- package/webui/build/_app/immutable/nodes/{14.B_KpJLxn.js → 14.D5OEGPR2.js} +1 -1
- package/webui/build/_app/immutable/nodes/{15.RaWaA-0I.js → 15.B4dFN1Gk.js} +1 -1
- package/webui/build/_app/immutable/nodes/{16.DSUgqolV.js → 16.M7ZII7tl.js} +1 -1
- package/webui/build/_app/immutable/nodes/{3.wQvGs9w-.js → 3.7r8v7qkm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{5.CCtn90c0.js → 5.CHIzoGrb.js} +1 -1
- package/webui/build/_app/immutable/nodes/{6.C2_mjW1u.js → 6.BDBqx-GY.js} +1 -1
- package/webui/build/_app/immutable/nodes/{7.Dwz6W7A1.js → 7.D5czsDmz.js} +1 -1
- package/webui/build/_app/immutable/nodes/{8.DzkEw6rx.js → 8.pjVNsCdV.js} +1 -1
- package/webui/build/_app/immutable/nodes/{9.DtlXEwe1.js → 9.CsARv1BH.js} +1 -1
- package/webui/build/_app/version.json +1 -1
- package/webui/build/_app/immutable/assets/0.C6Q_nuW9.css +0 -1
- package/webui/build/_app/immutable/chunks/CkUAV0m0.js +0 -41
- package/webui/build/_app/immutable/chunks/CtijX1u3.js +0 -31
- package/webui/build/_app/immutable/entry/start.DhJaJZhR.js +0 -1
- package/webui/build/_app/immutable/nodes/0.BE05Cuc4.js +0 -11
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
let _deps;
|
|
2
|
+
|
|
3
|
+
// Xiaohongshu site plugin: scrapes a user's profile feed, extracts note details, and handles the login check.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
const XHS_ORIGIN = "https://www.xiaohongshu.com";
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
/**
 * Return the origin (scheme + host + port) of `url`.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {string} The URL's origin, or XHS_ORIGIN when `url` cannot be parsed.
 */
function getOrigin(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    // Unparseable input: fall back to the site's canonical origin.
    return XHS_ORIGIN;
  }
  return parsed.origin;
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
/**
 * Convert a profile-page note href into an `/explore/{noteId}` URL that keeps
 * the xsec access parameters.
 *
 * The href is read from raw HTML, so its query string may still contain the
 * `&amp;` entity; it is decoded before parsing so that every parameter
 * (notably `xsec_source`) is visible to URLSearchParams.
 *
 * @param {string} profileHref - Href of the note link (absolute or relative).
 * @param {string} origin - Origin used to resolve relative hrefs and to build the result.
 * @returns {string|null} The explore URL, or null when the note id is not a hex
 *   string, the `xsec_token` is missing, or the href cannot be parsed.
 */
function buildExploreLinkWithXsec(profileHref, origin) {
  try {
    const fullUrl = new URL(profileHref.replace(/&amp;/g, "&"), origin);
    // The note id is the last non-empty path segment.
    const noteId = fullUrl.pathname.split("/").filter(Boolean).at(-1);
    if (!noteId || !/^[0-9a-f]+$/i.test(noteId)) return null;
    const token = fullUrl.searchParams.get("xsec_token");
    if (!token) return null;
    const source = fullUrl.searchParams.get("xsec_source") ?? "pc_user";
    const explore = new URL(`/explore/${noteId}`, origin);
    explore.searchParams.set("xsec_token", token);
    explore.searchParams.set("xsec_source", source);
    return explore.href;
  } catch {
    // Malformed href/origin (or a non-string href): treated as "no link available".
    return null;
  }
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
/**
 * Parse a user-profile page into feed items.
 *
 * Each note card inside `#userPostedFeeds` yields one item. The link prefers an
 * `/explore/...` URL carrying the xsec token; if that cannot be built, the
 * (entity-decoded) profile href or a plain `/explore/` href is used instead.
 * Cards whose href cannot be resolved to a URL are skipped rather than
 * aborting the whole list.
 *
 * @param {string} html - Profile page HTML.
 * @param {string} url - Final URL of the page; its origin resolves relative links.
 * @returns {Array<object>} Items with guid, title, link, pubDate, author and summary.
 */
function parseListHtml(html, url) {
  const root = _deps.parseHtml(html);
  const origin = getOrigin(url);
  const feed = root.querySelector("#userPostedFeeds");
  if (!feed) return [];

  // Resolve an href against the page origin; malformed hrefs give null instead of throwing.
  const resolve = (href) => {
    try {
      return new URL(href, origin).href;
    } catch {
      return null;
    }
  };

  const items = [];
  // NOTE(review): the data-v-* attributes look like build-generated scoped-style
  // markers and may change between site releases — confirm they are stable.
  for (const section of feed.querySelectorAll("section[data-v-79abd645][data-index]")) {
    const profileHref = section.querySelector('a[href*="xsec_token="]')?.getAttribute("href")?.trim();
    let link;
    if (profileHref && profileHref.includes("/user/profile/")) {
      link = buildExploreLinkWithXsec(profileHref, origin) ?? resolve(profileHref.replace(/&amp;/g, "&"));
    } else {
      const href = section.querySelector('a[href^="/explore/"]')?.getAttribute("href")?.trim();
      if (!href) continue;
      link = resolve(href);
    }
    if (!link) continue;

    const titleEl = section.querySelector("span[data-v-51ec0135]");
    const title = (titleEl?.textContent ?? "").trim() || "笔记";
    const authorEl = section.querySelector('a[aria-current="page"] span');
    const author = (authorEl?.textContent ?? "").trim() || undefined;
    items.push({
      guid: _deps.createHash("sha256").update(link).digest("hex"),
      title,
      link,
      // The list page is not parsed for a date here, so the fetch time is used.
      pubDate: new Date(),
      author,
      summary: title,
    });
  }
  return items;
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
/**
 * Flatten a note description element into a single line of text.
 *
 * Direct children of `.note-text` are visited in order: text nodes and
 * elements contribute their trimmed text, `<img>` elements contribute their
 * `alt` attribute. Pieces are joined with single spaces. When nothing is
 * collected, the element's own text, then its parent's text, is used.
 *
 * @param {object|null|undefined} descEl - Description element.
 * @returns {string} Flattened text ("" when `descEl` is missing).
 */
function descToMarkdown(descEl) {
  if (!descEl) return "";
  const TEXT_NODE = 3;
  const ELEMENT_NODE = 1;
  const trimmedText = (n) => (n.textContent ?? "").trim();

  const body = descEl.querySelector(".note-text");
  if (!body) return trimmedText(descEl);

  const pieces = [];
  for (const child of body.childNodes) {
    let piece = "";
    if (child.nodeType === TEXT_NODE) {
      piece = trimmedText(child);
    } else if (child.nodeType === ELEMENT_NODE) {
      const isImage = child.tagName?.toLowerCase() === "img";
      // Images carry their text in `alt`; every other element uses its text content.
      piece = isImage ? child.getAttribute("alt") || "" : trimmedText(child);
    }
    if (piece) pieces.push(piece);
  }

  const joined = pieces.join(" ").replace(/\s+/g, " ").trim();
  if (joined) return joined;
  const own = trimmedText(descEl);
  if (own || !descEl.parentNode) return own;
  return trimmedText(descEl.parentNode);
}
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
/**
 * Extract the URL from a CSS `url(...)` expression inside a style string.
 *
 * The style value is read from raw HTML attributes, so `&quot;` and `&amp;`
 * entities are decoded before matching; otherwise the entity text would end up
 * inside the returned URL.
 *
 * @param {string|null|undefined} val - Style attribute value.
 * @returns {string|null} The URL inside the first `url(...)`, or null when none is found.
 */
function extractUrl(val) {
  if (typeof val !== "string" || !val) return null;
  const decoded = val.replace(/&quot;/g, '"').replace(/&amp;/g, "&");
  const m = decoded.match(/url\s*\(\s*["']?([^"')]+)["']?\s*\)/);
  if (!m) return null;
  // The capture group already excludes quotes and ")", so only whitespace needs trimming.
  return m[1].trim() || null;
}
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
/**
 * Collect image URLs of a note: `<img>` sources, video poster backgrounds, and
 * any inline background image hosted on a known Xiaohongshu CDN.
 *
 * URLs are normalized (protocol-relative `//host/...` becomes `https://host/...`)
 * before de-duplication, so the same image referenced both ways appears once.
 *
 * @param {object} root - Parsed document root exposing `querySelectorAll`.
 * @returns {string[]} Unique absolute image URLs in discovery order.
 */
function collectNoteImages(root) {
  const urls = [];
  const seen = new Set();
  const add = (url) => {
    const trimmed = (url || "").trim();
    if (!trimmed) return;
    let normalized;
    if (trimmed.startsWith("//")) normalized = `https:${trimmed}`;
    else if (trimmed.startsWith("http")) normalized = trimmed;
    else return; // data: URIs, relative paths, etc. are ignored
    if (seen.has(normalized)) return;
    seen.add(normalized);
    urls.push(normalized);
  };

  // 1) Regular <img> elements inside the known note/media containers.
  const imgs = root.querySelectorAll(".img-container img, .note-slider-img img, .note-slider img, .xhs-webplayer img, .note-content img, [class*='note-detail'] img, .media-container img, .video-player-media img");
  for (const el of imgs) {
    const src = el.getAttribute("src") || el.getAttribute("data-src") || el.getAttribute("data-lazy-src");
    if (src) add(src);
  }

  // 2) Video posters, which are rendered as CSS background images.
  const posterSelectors = ["xg-poster", "[class*='xgplayer-poster']", ".player-container [style*='background-image']", ".render-ssr-image [style*='background-image']", "[class*='player-container'] [style*='background-image']", ".video-player-media [style*='background-image']", ".media-container [style*='background-image']"];
  for (const sel of posterSelectors) {
    for (const el of root.querySelectorAll(sel)) {
      const url = extractUrl(el.getAttribute("style") ?? "");
      if (url) add(url);
    }
  }

  // 3) Any other inline background image, restricted to known CDN host fragments.
  for (const el of root.querySelectorAll("[style*='background-image']")) {
    const url = extractUrl(el.getAttribute("style") ?? "");
    if (url && (url.includes("xhscdn") || url.includes("sns-webpic") || url.includes("sns-avatar"))) add(url);
  }
  return urls;
}
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
/**
 * Parse the "编辑于 …" / "发布于 …" label of a note into a Date.
 *
 * Supported forms, for either prefix:
 *  - full date `YYYY-M-D`  -> that day at 12:00 UTC
 *  - short date `M-D`      -> that day at 12:00 UTC in the current year, or the
 *                             previous year when the date would lie in the future
 *  - relative `N 分钟|小时|天|周|个月 前` -> now minus the offset (a month counts as 30 days)
 *
 * @param {object|null|undefined} dateEl - Element whose text holds the label.
 * @returns {Date|undefined} Parsed date; undefined when the text is empty,
 *   unrecognized, or names an impossible calendar date.
 */
function parseNoteDate(dateEl) {
  const text = (dateEl?.textContent ?? "").trim();
  if (!text) return undefined;
  const now = new Date();

  // Noon UTC keeps the calendar day stable across time zones; impossible dates
  // (e.g. month 13, Feb 30) are rejected instead of rolling over or yielding Invalid Date.
  const atNoonUtc = (year, month, day) => {
    const date = new Date(Date.UTC(year, month - 1, day, 12));
    const valid = date.getUTCFullYear() === year && date.getUTCMonth() === month - 1 && date.getUTCDate() === day;
    return valid ? date : undefined;
  };

  const full = text.match(/(?:编辑于|发布于)\s*(\d{4})-(\d{1,2})-(\d{1,2})/);
  if (full) {
    return atNoonUtc(Number.parseInt(full[1], 10), Number.parseInt(full[2], 10), Number.parseInt(full[3], 10));
  }

  const short = text.match(/(?:编辑于|发布于)\s*(\d{1,2})-(\d{1,2})(?!\d)/);
  if (short) {
    const month = Number.parseInt(short[1], 10);
    const day = Number.parseInt(short[2], 10);
    let year = now.getFullYear();
    // A month-day later than today must belong to last year.
    if (new Date(year, month - 1, day) > now) year -= 1;
    return atNoonUtc(year, month, day);
  }

  const relative = text.match(/(编辑于|发布于)\s*(\d+)\s*(分钟|小时|天|周|个月)前/);
  if (relative) {
    const [, , amount, unit] = relative;
    const msMap = { 分钟: 60_000, 小时: 3_600_000, 天: 86_400_000, 周: 604_800_000, 个月: 2_592_000_000 };
    return new Date(now.getTime() - Number.parseInt(amount, 10) * (msMap[unit] ?? 0));
  }
  return undefined;
}
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
/**
 * Extract author, title, body and publish date from a note detail page.
 *
 * The body is the flattened description text followed by one markdown image
 * line (`![](url)`) per collected image, so images are kept in the content.
 *
 * @param {string} html - Note detail page HTML.
 * @returns {{author: (string|undefined), title: (string|undefined), content: (string|undefined), pubDate: (Date|undefined)}}
 *   Every field is undefined when it cannot be found.
 */
function extractDetailHtml(html) {
  const root = _deps.parseHtml(html);

  // Author: try specific selectors first, then progressively looser fallbacks.
  let authorEl = null;
  const authorSelectors = [
    ".author .info a.name .username",
    ".info a.name .username",
    ".info .username",
    "a.name .username",
    ".author-container .username",
    ".author .username",
  ];
  for (const sel of authorSelectors) {
    authorEl = root.querySelector(sel);
    if (authorEl) break;
  }
  if (!authorEl) {
    const containers = root.querySelectorAll(".author, .author-container, .interaction-container, .info");
    for (const c of containers) {
      const u = c.querySelector("a.name .username") ?? c.querySelector(".username");
      if (u) {
        authorEl = u;
        break;
      }
    }
  }
  if (!authorEl) {
    // Last resort: any .username with an ancestor (up to 5 levels) whose class
    // mentions "name", "info" or "author".
    for (const u of root.querySelectorAll(".username")) {
      let p = u.parentNode ?? null;
      for (let i = 0; i < 5 && p; i++) {
        const cls = p.getAttribute?.("class") || "";
        if (typeof cls === "string" && (cls.includes("name") || cls.includes("info") || cls.includes("author"))) {
          authorEl = u;
          break;
        }
        p = p.parentNode ?? null;
      }
      if (authorEl) break;
    }
  }
  const author = (authorEl?.textContent ?? "").trim() || undefined;

  // Title
  const titleEl = root.querySelector("#detail-title") ?? root.querySelector(".note-content .title") ?? root.querySelector("h1.title");
  const title = (titleEl?.textContent ?? "").trim() || undefined;

  // Body text plus images
  const descEl = root.querySelector("#detail-desc") ?? root.querySelector(".note-content .desc") ?? root.querySelector(".desc");
  const descText = descToMarkdown(descEl);
  const imgUrls = collectNoteImages(root);
  const imgMd = imgUrls.map((u) => `\n![](${u})\n`).join("");
  const content = (descText + imgMd).trim() || title || undefined;

  // Publish time: dedicated element first, otherwise any span mentioning edit/publish.
  let dateEl = root.querySelector(".bottom-container span.date") ?? root.querySelector(".bottom-container .date");
  if (!dateEl) {
    for (const span of root.querySelectorAll("span")) {
      if (/(编辑于|发布于)/.test(span.textContent ?? "")) {
        dateEl = span;
        break;
      }
    }
  }
  const pubDate = parseNoteDate(dateEl);
  return { author, title, content, pubDate };
}
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
/**
 * Fetch a profile page and turn it into feed items.
 *
 * Stores `ctx.deps` in the module-level `_deps` so the parsing helpers can use it.
 *
 * @param {string} sourceId - Source identifier handed to `ctx.fetchHtml`.
 * @param {object} ctx - Context providing `deps` and `fetchHtml`.
 * @returns {Promise<Array<object>>} Items produced by `parseListHtml`.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const page = await ctx.fetchHtml(sourceId);
  return parseListHtml(page.html, page.finalUrl);
}
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
/**
 * Enrich a list item with data from its note detail page.
 *
 * `_deps` is refreshed from `ctx.deps` when the context provides it, so this
 * function also works when it runs before any `fetchItems` call in the process
 * (the detail parser reads `_deps`).
 *
 * @param {object} item - Item produced by the list parser; `item.link` is fetched.
 * @param {object} ctx - Context providing `fetchHtml` (and optionally `deps`).
 * @returns {Promise<object>} Copy of `item` with author, title, content and
 *   pubDate taken from the detail page where available.
 */
async function enrichItem(item, ctx) {
  _deps = ctx.deps ?? _deps;
  const { html } = await ctx.fetchHtml(item.link);
  const detail = extractDetailHtml(html);
  return {
    ...item,
    author: detail.author ?? item.author,
    title: detail.title ?? item.title,
    // Blank lines become paragraph breaks.
    // NOTE(review): the text is inserted into HTML without escaping — confirm
    // that downstream consumers sanitize `content`.
    content: detail.content ? `<p>${detail.content.replace(/\n\n/g, "</p><p>")}</p>` : undefined,
    pubDate: detail.pubDate ?? item.pubDate,
  };
}
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
/**
 * Report whether the browser page appears to be logged in.
 *
 * The page counts as authenticated when the large primary login button is absent.
 *
 * @param {object} page - Page handle exposing an async `$(selector)` lookup.
 * @param {string} _url - Unused.
 * @returns {Promise<boolean>} true when no login button is found; false when it
 *   is present or the lookup throws.
 */
async function checkAuth(page, _url) {
  const LOGIN_BUTTON_SELECTOR = ".reds-button-new.login-btn.large.primary";
  let loginButton;
  try {
    loginButton = await page.$(LOGIN_BUTTON_SELECTOR);
  } catch {
    // Lookup failed: report "not authenticated".
    return false;
  }
  return loginButton === null || loginButton === undefined;
}
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
// Plugin descriptor for the Xiaohongshu source.
export default {
  id: "xiaohongshu",
  // URL template of a user's profile page; `{userId}` is a placeholder.
  listUrlPattern: "https://xiaohongshu.com/user/profile/{userId}",
  fetchItems,
  enrichItem,
  checkAuth,
  // Login settings — presumably consumed by the host's authentication flow; confirm against the plugin loader.
  loginUrl: "https://www.xiaohongshu.com/",
  domain: "xiaohongshu.com",
  loginTimeoutMs: 30 * 1000, // 30 seconds
  pollIntervalMs: 2000,
};
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
let _deps;
|
|
2
|
+
|
|
3
|
+
// Zhipu research page plugin: scrapes the list only and does no article-body enrichment (also works on purified HTML).
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
const ZHIPU_RESEARCH_URL = "https://www.zhipuai.cn/zh/research";
|
|
8
|
+
const ZHIPU_ORIGIN = "https://www.zhipuai.cn";
|
|
9
|
+
const DATE_RE = /^\d{4}[/-]\d{1,2}[/-]\d{1,2}$/;
|
|
10
|
+
const RESEARCH_TAGS = new Set(["多模态", "语言模型", "基座模型", "推理模型", "Agent", "代码模型"]);
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
/**
 * Collapse every whitespace run to one space and trim both ends.
 *
 * @param {string|null|undefined} text - Input text; null/undefined count as "".
 * @returns {string} Normalized text.
 */
function normalizeText(text) {
  const raw = text ?? "";
  const collapsed = raw.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
/**
 * Derive a stable item guid: the hex SHA-256 digest of `input`.
 *
 * @param {string} input - Value to hash.
 * @returns {string} Hex digest.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
/**
 * Parse a `YYYY-M-D` or `YYYY/M/D` date found in `dateText`.
 *
 * Impossible calendar dates (month 13, Feb 30, ...) yield undefined rather than
 * an Invalid Date or a rolled-over date, so callers can safely fall back with
 * `??` and call `toISOString()` on the result.
 *
 * @param {string|null|undefined} dateText - Text containing the date.
 * @returns {Date|undefined} Midnight UTC of that day, or undefined when no valid date is present.
 */
function parseDate(dateText) {
  if (!dateText) return undefined;
  const m = normalizeText(dateText).match(/(\d{4})[/-](\d{1,2})[/-](\d{1,2})/);
  if (!m) return undefined;
  const year = Number.parseInt(m[1], 10);
  const month = Number.parseInt(m[2], 10);
  const day = Number.parseInt(m[3], 10);
  const date = new Date(Date.UTC(year, month - 1, day));
  // Round-trip check: Date.UTC silently rolls over out-of-range components.
  const valid = date.getUTCFullYear() === year && date.getUTCMonth() === month - 1 && date.getUTCDate() === day;
  return valid ? date : undefined;
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
/**
 * Find the index of the `]` that closes the array opened at or after `startIndex`.
 *
 * Brackets inside double-quoted strings are ignored and backslash escapes
 * inside strings are honoured.
 *
 * @param {string} raw - Text to scan.
 * @param {number} startIndex - Index where scanning starts.
 * @returns {number} Index of the matching `]`, or -1 when the array never closes.
 */
function findArrayEnd(raw, startIndex) {
  let insideString = false;
  let skipNext = false; // previous character was a backslash inside a string
  let nesting = 0;
  for (let pos = startIndex; pos < raw.length; pos += 1) {
    const char = raw[pos];
    if (insideString) {
      if (skipNext) skipNext = false;
      else if (char === "\\") skipNext = true;
      else if (char === "\"") insideString = false;
      continue;
    }
    switch (char) {
      case "\"":
        insideString = true;
        break;
      case "[":
        nesting += 1;
        break;
      case "]":
        nesting -= 1;
        if (nesting === 0) return pos;
        break;
      default:
        break;
    }
  }
  return -1;
}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
/**
 * Extract the `blogsItems` array embedded in the page's script data.
 *
 * The payload can appear either as plain JSON (`"blogsItems":[...]`) or inside
 * an escaped string (`\"blogsItems\":[...]`). The plain form is tried on the
 * untouched HTML first, so legitimately escaped quotes inside string values
 * survive; the backslash-unescaped copy is used as the fallback.
 *
 * @param {string} html - Page HTML.
 * @returns {Array} The parsed array, or [] when it is missing or not valid JSON.
 */
function extractBlogsItems(html) {
  if (typeof html !== "string" || html.length === 0) return [];
  const marker = "\"blogsItems\":";

  // Locate the marker in `text` and JSON-parse the array that follows; null on any failure.
  const parseFrom = (text) => {
    const markerIndex = text.indexOf(marker);
    if (markerIndex < 0) return null;
    const arrayStart = text.indexOf("[", markerIndex + marker.length);
    if (arrayStart < 0) return null;
    const arrayEnd = findArrayEnd(text, arrayStart);
    if (arrayEnd < 0) return null;
    try {
      const parsed = JSON.parse(text.slice(arrayStart, arrayEnd + 1));
      return Array.isArray(parsed) ? parsed : null;
    } catch {
      return null;
    }
  };

  const direct = parseFrom(html);
  if (direct && direct.length > 0) return direct;
  const unescaped = html.replace(/\\"/g, "\"").replace(/\\\\/g, "\\");
  return parseFrom(unescaped) ?? direct ?? [];
}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
/**
 * Best-effort download of a page's raw HTML with a browser-like User-Agent.
 *
 * The request is aborted after `timeoutMs` so an unresponsive server cannot
 * stall the caller indefinitely.
 *
 * @param {string} url - URL to fetch.
 * @param {number} [timeoutMs=15000] - Abort the request after this many milliseconds.
 * @returns {Promise<string|undefined>} Response body, or undefined on a non-2xx
 *   status, network error, invalid URL or timeout.
 */
async function fetchRawHtml(url, timeoutMs = 15_000) {
  try {
    const res = await fetch(url, {
      redirect: "follow",
      signal: AbortSignal.timeout(timeoutMs),
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
      },
    });
    if (!res.ok) return undefined;
    return await res.text();
  } catch {
    // Deliberately best-effort: the caller treats undefined as "no raw HTML available".
    return undefined;
  }
}
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
/**
 * Build feed items from the `blogsItems` records embedded in the page.
 *
 * Records without an id or title are skipped. The record's tag is emitted as
 * `categories`, matching the shape of the items built from the rendered cards.
 *
 * @param {Array} blogsItems - Raw records (non-object entries are ignored).
 * @returns {Array<object>} Items with guid, title, link, pubDate, summary and categories.
 */
function buildItemsFromBlogsItems(blogsItems) {
  const items = [];
  for (const blog of blogsItems) {
    if (typeof blog !== "object" || blog == null) continue;
    const id = String(blog.id ?? "").trim();
    if (!id) continue;
    const title = normalizeText(blog.title_zh ?? blog.title_en ?? "");
    if (!title) continue;
    const link = `${ZHIPU_ORIGIN}/zh/research/${id}`;
    const summary = normalizeText(blog.resume_zh ?? blog.resume_en ?? "");
    const category = normalizeText(blog.tag_zh ?? blog.tag_en ?? "");
    const createdAt = String(blog.createAt ?? "").trim();
    const parsedDate = createdAt ? new Date(createdAt) : new Date();
    items.push({
      guid: hashGuid(link),
      title,
      link,
      // An unparseable timestamp falls back to the fetch time.
      pubDate: Number.isNaN(parsedDate.getTime()) ? new Date() : parsedDate,
      summary: summary || undefined,
      categories: category ? [category] : undefined,
    });
  }
  return items;
}
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
/**
 * Index the `blogsItems` records by normalized title.
 *
 * @param {Array} blogsItems - Raw records (non-object entries are ignored).
 * @returns {Map<string, string>} Title -> id; a later record with the same title overwrites an earlier one.
 */
function buildTitleIdMap(blogsItems) {
  const idsByTitle = new Map();
  for (const entry of blogsItems) {
    if (entry === null || entry === undefined || typeof entry !== "object") continue;
    const id = String(entry.id ?? "").trim();
    const title = normalizeText(entry.title_zh ?? entry.title_en ?? "");
    if (id && title) idsByTitle.set(title, id);
  }
  return idsByTitle;
}
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
/**
 * Tell whether `text`, once whitespace-normalized, is exactly a date matching DATE_RE.
 *
 * @param {string|null|undefined} text - Candidate text.
 * @returns {boolean} true when the whole text is a date.
 */
function isDateText(text) {
  const candidate = normalizeText(text);
  return DATE_RE.test(candidate);
}
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
/**
 * Drop falsy entries and duplicates, keeping the first occurrence of each value in order.
 *
 * @param {Iterable<string>} texts - Input texts.
 * @returns {string[]} De-duplicated texts.
 */
function uniqueTexts(texts) {
  const truthy = [...texts].filter((t) => Boolean(t));
  // A Set iterates in insertion order, so first occurrences keep their position.
  return [...new Set(truthy)];
}
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
/**
 * Collect the unique, normalized texts of the "leaf" block elements under `node`.
 *
 * A leaf is a div/span/p/h1/h2/h3 with no such element nested inside it. Empty
 * texts and the "no more" / "load more" pager labels are dropped.
 *
 * @param {object} node - Element to search within.
 * @returns {string[]} Leaf texts in document order, without duplicates.
 */
function getLeafTexts(node) {
  const BLOCK_SELECTOR = "div, span, p, h1, h2, h3";
  const collected = [];
  for (const el of node.querySelectorAll(BLOCK_SELECTOR)) {
    if (el.querySelector(BLOCK_SELECTOR) != null) continue; // not a leaf
    const text = normalizeText(el.textContent);
    if (!text) continue;
    if (text.includes("没有更多") || text.includes("加载更多")) continue;
    collected.push(text);
  }
  return uniqueTexts(collected);
}
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
/**
 * Count the `<p>` elements under `node` whose text is exactly a date.
 *
 * @param {object} node - Element to search within.
 * @returns {number} Number of date paragraphs.
 */
function countDateNodes(node) {
  let total = 0;
  for (const paragraph of node.querySelectorAll("p")) {
    if (isDateText(normalizeText(paragraph.textContent))) total += 1;
  }
  return total;
}
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
/**
 * Walk up from a date paragraph to find the element that wraps its whole card.
 *
 * Up to 8 ancestors are examined. The outermost ancestor that still contains
 * exactly one date and has at least one title-like leaf text (not a date, not
 * a research tag, 6+ characters) wins; the walk stops at the first ancestor
 * containing more than one date.
 *
 * @param {object|null|undefined} dateNode - The date `<p>` element.
 * @returns {object|null} The card root, or null when none qualifies.
 */
function findCardRootFromDateNode(dateNode) {
  const MAX_LEVELS = 8;
  const looksLikeTitle = (t) => !isDateText(t) && !RESEARCH_TAGS.has(t) && t.length >= 6;

  let best = null;
  let ancestor = dateNode?.parentNode ?? null;
  let level = 0;
  while (level < MAX_LEVELS && ancestor) {
    level += 1; // non-element ancestors also use up a level
    if (ancestor.nodeType === _deps.NodeType.ELEMENT_NODE) {
      const dates = countDateNodes(ancestor);
      if (dates > 1) break; // this ancestor spans several cards
      if (dates === 1 && getLeafTexts(ancestor).some((t) => looksLikeTitle(t))) best = ancestor;
    }
    ancestor = ancestor.parentNode ?? null;
  }
  return best;
}
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
/**
 * Turn one rendered research card into a feed item.
 *
 * Among the card's leaf texts, the first research tag becomes the category;
 * of the remaining texts (dates, tags and the two navigation labels removed),
 * the first is the title and the second the summary. When the title has a known
 * id the detail URL is used as link and guid seed; otherwise the link is the
 * list URL with the title as fragment and the guid is seeded with title + date.
 *
 * @param {object} card - Card root element.
 * @param {string} dateText - Date text of the card.
 * @param {Map<string, string>} titleIdMap - Title -> detail id.
 * @returns {object|null} The item, or null when the card has no title candidate.
 */
function parseCardItem(card, dateText, titleIdMap) {
  let category;
  const candidates = [];
  for (const text of getLeafTexts(card)) {
    if (RESEARCH_TAGS.has(text)) {
      if (category === undefined) category = text;
      continue;
    }
    if (isDateText(text)) continue;
    if (text === "时间排序" || text === "研究") continue;
    candidates.push(text);
  }
  if (candidates.length === 0) return null;

  const [title, summary] = candidates;
  const id = titleIdMap.get(title);
  let link;
  let guidSeed;
  if (id) {
    link = `${ZHIPU_ORIGIN}/zh/research/${id}`;
    guidSeed = link;
  } else {
    link = `${ZHIPU_RESEARCH_URL}#${encodeURIComponent(title)}`;
    guidSeed = `${title}|${normalizeText(dateText)}`;
  }
  return {
    guid: hashGuid(guidSeed),
    title,
    link,
    pubDate: parseDate(dateText) ?? new Date(),
    summary: summary || undefined,
    categories: category ? [category] : undefined,
  };
}
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
/**
 * Build feed items from the rendered page by locating each date paragraph,
 * finding its enclosing card and parsing that card.
 *
 * Items are de-duplicated on title + publication timestamp.
 *
 * @param {string} html - Page HTML.
 * @param {Map<string, string>} titleIdMap - Title -> detail id.
 * @returns {Array<object>} Parsed items in document order.
 */
function buildItemsFromDom(html, titleIdMap) {
  const root = _deps.parseHtml(html);
  const emitted = new Set();
  const items = [];
  for (const paragraph of root.querySelectorAll("p")) {
    const dateText = normalizeText(paragraph.textContent);
    if (!isDateText(dateText)) continue;

    const card = findCardRootFromDateNode(paragraph);
    const item = card ? parseCardItem(card, dateText, titleIdMap) : null;
    if (!item) continue;

    const key = `${item.title}|${item.pubDate.toISOString()}`;
    if (emitted.has(key)) continue;
    emitted.add(key);
    items.push(item);
  }
  return items;
}
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
/**
 * Last-resort parser: read the page as a flat sequence of leaf texts and treat
 * each date as the start of an entry (`[tag] date title [summary]`).
 *
 * The sequence is deliberately NOT de-duplicated: posts often share a date or
 * tag, and collapsing repeated strings would merge or drop those posts.
 * Duplicate entries are removed afterwards on title + publication timestamp.
 *
 * @param {string} html - Page HTML.
 * @param {Map<string, string>} titleIdMap - Title -> detail id.
 * @returns {Array<object>} Items with guid, title, link, pubDate, summary and categories.
 */
function buildItemsFromLeafSequence(html, titleIdMap) {
  const root = _deps.parseHtml(html);
  const BLOCK_SELECTOR = "div, span, p, h1, h2, h3";
  const leafTexts = root
    .querySelectorAll(BLOCK_SELECTOR)
    .filter((el) => el.querySelector(BLOCK_SELECTOR) == null)
    .map((el) => normalizeText(el.textContent))
    .filter(Boolean)
    .filter((t) => !t.includes("没有更多") && !t.includes("加载更多"))
    .filter((t) => t !== "研究" && t !== "时间排序");

  const items = [];
  const seen = new Set();
  for (let i = 0; i < leafTexts.length; i += 1) {
    const dateText = leafTexts[i];
    if (!isDateText(dateText)) continue;
    // The tag, when present, is the text right before the date.
    const category = i > 0 && RESEARCH_TAGS.has(leafTexts[i - 1]) ? leafTexts[i - 1] : undefined;

    let title = "";
    let summary;
    for (let j = i + 1; j < leafTexts.length; j += 1) {
      const t = leafTexts[j];
      if (isDateText(t)) break; // next entry starts here
      if (RESEARCH_TAGS.has(t)) continue;
      if (!title) {
        title = t;
        continue;
      }
      if (t === title) continue; // title rendered twice; not a summary
      summary = t;
      break;
    }
    if (!title || title.length < 4) continue;

    const id = titleIdMap.get(title);
    const link = id
      ? `${ZHIPU_ORIGIN}/zh/research/${id}`
      : `${ZHIPU_RESEARCH_URL}#${encodeURIComponent(title)}`;
    const pubDate = parseDate(dateText) ?? new Date();
    // getTime() instead of toISOString(): the latter throws on an invalid date.
    const key = `${title}|${pubDate.getTime()}`;
    if (seen.has(key)) continue;
    seen.add(key);
    items.push({
      guid: hashGuid(id ? link : `${title}|${normalizeText(dateText)}`),
      title,
      link,
      pubDate,
      summary: summary || undefined,
      categories: category ? [category] : undefined,
    });
  }
  return items;
}
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
/**
 * Fetch the research list page and parse it into feed items.
 *
 * Parsing strategies, in order: the `blogsItems` data embedded in the page
 * scripts, the rendered cards, and finally the flat leaf-text sequence.
 *
 * @param {string} sourceId - Source identifier handed to `ctx.fetchHtml`.
 * @param {object} ctx - Context providing `deps` and `fetchHtml`.
 * @returns {Promise<Array<object>>} Parsed items (never empty).
 * @throws {Error} When no strategy yields any item.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  // Purification is disabled because blogsItems (which carries the detail ids)
  // has to be read from the page's scripts.
  const page = await ctx.fetchHtml(sourceId, { waitMs: 5000, purify: false });
  const { html } = page;

  let blogsItems = extractBlogsItems(html);
  if (blogsItems.length === 0) {
    // Nothing in the rendered HTML: retry against the raw server response.
    const rawHtml = await fetchRawHtml(page.finalUrl || sourceId);
    if (rawHtml) blogsItems = extractBlogsItems(rawHtml);
  }

  const fromScript = buildItemsFromBlogsItems(blogsItems);
  if (fromScript.length > 0) return fromScript;

  const titleIdMap = buildTitleIdMap(blogsItems);
  for (const build of [buildItemsFromDom, buildItemsFromLeafSequence]) {
    const items = build(html, titleIdMap);
    if (items.length > 0) return items;
  }

  throw new Error("[zhipu-research] 未解析到研究条目,页面结构可能已变化");
}
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
// Plugin descriptor for the Zhipu research list. Only `fetchItems` is
// provided: this plugin does not enrich items with article bodies.
export default {
  id: "zhipu-research",
  // The list lives at one fixed URL (no placeholders).
  listUrlPattern: ZHIPU_RESEARCH_URL,
  fetchItems,
};
|