rssany 0.1.2 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/README.md +28 -50
  2. package/app/plugins/builtin/agi-eval-evaluation.rssany.js +188 -0
  3. package/app/plugins/builtin/amii-research-talent.rssany.js +73 -0
  4. package/app/plugins/builtin/anthropic-research.rssany.js +155 -0
  5. package/app/plugins/builtin/appen-resources.rssany.js +155 -0
  6. package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +185 -0
  7. package/app/plugins/builtin/baaidata-csdn.rssany.js +242 -0
  8. package/app/plugins/builtin/baidu-research.rssany.js +222 -0
  9. package/app/plugins/builtin/brightdata-blog.rssany.js +301 -0
  10. package/app/plugins/builtin/bytedance-seed-research.rssany.js +231 -0
  11. package/app/plugins/builtin/five-radar.rssany.js +490 -0
  12. package/app/plugins/builtin/flageval-news.rssany.js +118 -0
  13. package/app/plugins/builtin/google-deepmind-research.rssany.js +223 -0
  14. package/app/plugins/builtin/google-research-datasets.rssany.js +171 -0
  15. package/app/plugins/builtin/google-research.rssany.js +220 -0
  16. package/app/plugins/builtin/google.rssany.js +187 -0
  17. package/app/plugins/builtin/hacker-news-newest.rssany.js +130 -0
  18. package/app/plugins/builtin/harvard-dataverse.rssany.js +166 -0
  19. package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +185 -0
  20. package/app/plugins/builtin/lingowhale.rssany.js +119 -0
  21. package/app/plugins/builtin/meituan-tech.rssany.js +130 -0
  22. package/app/plugins/builtin/meta-ai-publications.rssany.js +221 -0
  23. package/app/plugins/builtin/mila-quebec.rssany.js +199 -0
  24. package/app/plugins/builtin/mit-csail-research.rssany.js +208 -0
  25. package/app/plugins/builtin/moonshot.rssany.js +127 -0
  26. package/app/plugins/builtin/opendatalab-news.rssany.js +174 -0
  27. package/app/plugins/builtin/opendatalab.rssany.js +109 -0
  28. package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +114 -0
  29. package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +114 -0
  30. package/app/plugins/builtin/opendrivelab-publications.rssany.js +130 -0
  31. package/app/plugins/builtin/opendrivelab.rssany.js +333 -0
  32. package/app/plugins/builtin/paperswithcode.rssany.js +227 -0
  33. package/app/plugins/builtin/pjlab-adg-publications.rssany.js +202 -0
  34. package/app/plugins/builtin/rss.rssany.js +11 -1
  35. package/app/plugins/builtin/selectdataset.rssany.js +206 -0
  36. package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +154 -0
  37. package/app/plugins/builtin/supervisely-blog.rssany.js +159 -0
  38. package/app/plugins/builtin/uci-ml-repository.rssany.js +111 -0
  39. package/app/plugins/builtin/venturebeat.rssany.js +97 -0
  40. package/app/plugins/builtin/worldlabs.rssany.js +129 -0
  41. package/app/plugins/builtin/x.rssany.js +159 -0
  42. package/app/plugins/builtin/xiaohongshu.rssany.js +283 -0
  43. package/app/plugins/builtin/zhipu-research.rssany.js +334 -0
  44. package/dist/index.js +79 -9
  45. package/dist/index.js.map +1 -1
  46. package/package.json +1 -1
  47. package/webui/build/200.html +6 -6
  48. package/webui/build/_app/immutable/assets/0.BB88QFoe.css +1 -0
  49. package/webui/build/_app/immutable/assets/{homeFeedPanelStore.BopJZtHu.css → homeFeedPanelStore.iOmfP2qL.css} +1 -1
  50. package/webui/build/_app/immutable/chunks/CZD-YNDw.js +31 -0
  51. package/webui/build/_app/immutable/chunks/{DcAshVxe.js → D6VIKef0.js} +1 -1
  52. package/webui/build/_app/immutable/chunks/{EIZIMsXK.js → Dbqx2mXq.js} +1 -1
  53. package/webui/build/_app/immutable/chunks/DeX-oq5W.js +41 -0
  54. package/webui/build/_app/immutable/chunks/{BXCWEhUd.js → dhB8G5Is.js} +1 -1
  55. package/webui/build/_app/immutable/entry/{app.DdgnooOk.js → app.XPso7q7g.js} +2 -2
  56. package/webui/build/_app/immutable/entry/start.Db4snNCd.js +1 -0
  57. package/webui/build/_app/immutable/nodes/0.BKTQePmA.js +11 -0
  58. package/webui/build/_app/immutable/nodes/{1.5DFDaT4c.js → 1.BS3_Rfxm.js} +1 -1
  59. package/webui/build/_app/immutable/nodes/{10.OVK4i9XE.js → 10.CyyxDCIS.js} +1 -1
  60. package/webui/build/_app/immutable/nodes/{11.Dhn_rO4A.js → 11.CtYgIaGj.js} +1 -1
  61. package/webui/build/_app/immutable/nodes/{14.B_KpJLxn.js → 14.D5OEGPR2.js} +1 -1
  62. package/webui/build/_app/immutable/nodes/{15.RaWaA-0I.js → 15.B4dFN1Gk.js} +1 -1
  63. package/webui/build/_app/immutable/nodes/{16.DSUgqolV.js → 16.M7ZII7tl.js} +1 -1
  64. package/webui/build/_app/immutable/nodes/{3.wQvGs9w-.js → 3.7r8v7qkm.js} +1 -1
  65. package/webui/build/_app/immutable/nodes/{5.CCtn90c0.js → 5.CHIzoGrb.js} +1 -1
  66. package/webui/build/_app/immutable/nodes/{6.C2_mjW1u.js → 6.BDBqx-GY.js} +1 -1
  67. package/webui/build/_app/immutable/nodes/{7.Dwz6W7A1.js → 7.D5czsDmz.js} +1 -1
  68. package/webui/build/_app/immutable/nodes/{8.DzkEw6rx.js → 8.pjVNsCdV.js} +1 -1
  69. package/webui/build/_app/immutable/nodes/{9.DtlXEwe1.js → 9.CsARv1BH.js} +1 -1
  70. package/webui/build/_app/version.json +1 -1
  71. package/webui/build/_app/immutable/assets/0.C6Q_nuW9.css +0 -1
  72. package/webui/build/_app/immutable/chunks/CkUAV0m0.js +0 -41
  73. package/webui/build/_app/immutable/chunks/CtijX1u3.js +0 -31
  74. package/webui/build/_app/immutable/entry/start.DhJaJZhR.js +0 -1
  75. package/webui/build/_app/immutable/nodes/0.BE05Cuc4.js +0 -11
@@ -0,0 +1,118 @@
1
// Dependency bag supplied by the host; assigned from `_ctx.deps` at the start of fetchItems.
let _deps;



// Site origin; also the prefix of the fallback detail-page links built in resolveLink.
const FLAGEVAL_ORIGIN = "https://flageval.baai.ac.cn";
// URL of the first page of the news listing API (page 1, 10 entries per page).
const FLAGEVAL_NEWS_API = `${FLAGEVAL_ORIGIN}/api/news/?page=1&pageSize=10`;
// Upper bound on the number of listing pages requested by fetchNewsList in one run.
const MAX_PAGES = 5;
8
+
9
+
10
/**
 * Collapse every run of whitespace into a single space and trim both ends.
 *
 * The values passed in come straight from a JSON payload, so they are not
 * guaranteed to be strings. Numbers are stringified; anything else that is
 * not a string (null, undefined, objects, booleans) is treated as "no text"
 * instead of throwing on a missing `.replace` method.
 *
 * @param {unknown} text - Raw value to normalize.
 * @returns {string} The normalized text, or "" when there is no usable text.
 */
function normalizeText(text) {
  if (typeof text === "number") return String(text);
  if (typeof text !== "string") return "";
  return text.replace(/\s+/g, " ").trim();
}
13
+
14
+
15
/**
 * Derive a stable item identifier from an arbitrary string.
 *
 * @param {string} input - Text to hash.
 * @returns {string} Hex-encoded SHA-256 digest of `input`.
 */
function hashGuid(input) {
  const algorithm = "sha256";
  const encoding = "hex";
  const digest = _deps.createHash(algorithm).update(input).digest(encoding);
  return digest;
}
18
+
19
+
20
/**
 * Turn a raw date value into a Date, falling back to the current time.
 *
 * @param {string|null|undefined} rawValue - Date text from the API payload.
 * @returns {Date} The parsed date, or "now" when the text is empty or unparsable.
 */
function parseDate(rawValue) {
  const text = normalizeText(rawValue);
  if (text) {
    const parsed = new Date(text);
    if (!Number.isNaN(parsed.getTime())) return parsed;
  }
  return new Date();
}
26
+
27
+
28
/**
 * Choose the link for a news entry.
 *
 * An absolute http(s) `linkTo` wins. Otherwise a detail-page URL is built
 * from the entry id, using "unknown" when the id is missing or blank.
 *
 * @param {{ linkTo?: string, id?: unknown }} news - News entry from the API.
 * @returns {string} Absolute URL for the entry.
 */
function resolveLink(news) {
  const candidate = normalizeText(news.linkTo);
  if (candidate) {
    let parsed = null;
    try {
      parsed = new URL(candidate);
    } catch {
      // An unparsable linkTo is ignored on purpose; the fallback below is used.
    }
    if (parsed !== null && ["http:", "https:"].includes(parsed.protocol)) {
      return parsed.href;
    }
  }

  const rawId = String(news.id ?? "").trim();
  const slug = encodeURIComponent(rawId || "unknown");
  return `${FLAGEVAL_ORIGIN}/#/news/detail/${slug}`;
}
41
+
42
+
43
/**
 * Download one page of the news listing and decode it as JSON.
 *
 * @param {string} url - Listing page URL.
 * @returns {Promise<any>} The decoded JSON payload.
 * @throws {Error} When the response status is not OK.
 */
async function fetchNewsPage(url) {
  const requestHeaders = {
    "User-Agent":
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Accept": "application/json,text/plain,*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  };

  const response = await fetch(url, { redirect: "follow", headers: requestHeaders });
  if (response.ok) {
    return await response.json();
  }
  throw new Error(`[flageval-news] 拉取新闻列表失败: HTTP ${response.status}`);
}
58
+
59
+
60
/**
 * Resolve the `next` pagination link of a listing payload.
 *
 * @param {unknown} rawNext - The payload's `next` field.
 * @param {string} currentUrl - URL of the page the payload came from; used as the base for relative links.
 * @returns {string} Absolute http(s) URL of the next page, or "" when there is none.
 */
function resolveNextPageUrl(rawNext, currentUrl) {
  if (typeof rawNext !== "string" || !rawNext) return "";
  try {
    const url = new URL(rawNext, currentUrl);
    return url.protocol === "http:" || url.protocol === "https:" ? url.href : "";
  } catch {
    // An unparsable link is treated as "no further pages".
    return "";
  }
}

/**
 * Collect news entries by following the listing API's `next` links.
 *
 * Starts at FLAGEVAL_NEWS_API and requests at most MAX_PAGES pages. A relative
 * `next` link is resolved against the page it came from, and a link that points
 * at a page already requested ends the walk instead of re-downloading it.
 *
 * @returns {Promise<any[]>} All `results` entries, in page order.
 * @throws {Error} Whatever fetchNewsPage throws for a failed request.
 */
async function fetchNewsList() {
  const all = [];
  const visited = new Set();
  let next = FLAGEVAL_NEWS_API;

  for (let page = 0; page < MAX_PAGES && next && !visited.has(next); page += 1) {
    visited.add(next);
    const payload = await fetchNewsPage(next);
    if (Array.isArray(payload?.results)) {
      all.push(...payload.results);
    }
    next = resolveNextPageUrl(payload?.next, next);
  }

  return all;
}
73
+
74
+
75
/**
 * Plugin entry point: build feed items from the FlagEval news listing.
 *
 * Entries that are not objects or have no title are skipped, and entries
 * whose guid was already emitted are dropped.
 *
 * @param {string} _sourceId - Source identifier (not used).
 * @param {{ deps: object }} _ctx - Host context; `deps` is stored in the module-level `_deps`.
 * @returns {Promise<object[]>} Feed items in listing order.
 * @throws {Error} When no item could be built from the listing.
 */
async function fetchItems(_sourceId, _ctx) {
  _deps = _ctx.deps;

  const entries = await fetchNewsList();
  const emittedGuids = new Set();
  const items = [];

  for (const entry of entries) {
    const isRecord = typeof entry === "object" && entry != null;
    if (!isRecord) continue;

    const title = normalizeText(entry.title);
    if (!title) continue;

    const link = resolveLink(entry);
    const rawId = String(entry.id ?? "").trim();
    // The id is part of the hash input only when the entry actually has one.
    const guid = hashGuid(rawId ? `${rawId}|${link}` : link);
    if (emittedGuids.has(guid)) continue;
    emittedGuids.add(guid);

    const summary = normalizeText(entry.description) || undefined;
    const pubDate = parseDate(entry.publishedAt ?? entry.updatedAt ?? entry.createdAt);

    items.push({ guid, title, link, pubDate, summary, sourceId: "flageval-news" });
  }

  if (items.length > 0) return items;
  throw new Error("[flageval-news] 未解析到新闻条目,接口结构可能已变化");
}
112
+
113
+
114
// Plugin descriptor: identifier, the listing-URL pattern it is registered under,
// and the item fetcher.
const flagevalNewsPlugin = {
  id: "flageval-news",
  listUrlPattern: /^https?:\/\/flageval\.baai\.ac\.cn\/#\/news(?:[/?].*)?$/i,
  fetchItems,
};

export default flagevalNewsPlugin;
@@ -0,0 +1,223 @@
1
// Dependency bag supplied by the host; assigned from `ctx.deps` at the start of fetchItems.
let _deps;

// Google DeepMind Research plugin: scrapes the latest research entries from the research page (no enrichment).



// Address of the research listing page. NOTE(review): not referenced by the code visible in this file.
const DEEPMIND_RESEARCH_URL = "https://deepmind.google/research/";
// Base used to resolve relative links when the fetch reports no final URL.
const DEEPMIND_ORIGIN = "https://deepmind.google";
// Lower-case English month name -> zero-based month index, as expected by Date.UTC.
const MONTH_TO_INDEX = {
  january: 0,
  february: 1,
  march: 2,
  april: 3,
  may: 4,
  june: 5,
  july: 6,
  august: 7,
  september: 8,
  october: 9,
  november: 10,
  december: 11,
};
23
+
24
+
25
/**
 * Squash whitespace runs into single spaces and strip leading/trailing space.
 *
 * @param {string|null|undefined} text - Raw text; null and undefined count as empty.
 * @returns {string} The tidied text.
 */
function normalizeText(text) {
  const raw = text ?? "";
  const collapsed = raw.replace(/\s+/g, " ");
  return collapsed.trim();
}
28
+
29
+
30
/**
 * Produce the guid for an item by hashing its identifying string.
 *
 * @param {string} input - Text to hash (callers pass the item link).
 * @returns {string} SHA-256 digest of `input`, hex-encoded.
 */
function hashGuid(input) {
  const hexDigest = _deps.createHash("sha256").update(input).digest("hex");
  return hexDigest;
}
33
+
34
+
35
/**
 * Resolve an href against a base and keep it only if it is an http(s) URL.
 *
 * @param {string|null|undefined} rawHref - Value of an `href` attribute.
 * @param {string} baseUrl - Base for relative hrefs.
 * @returns {string|null} Absolute URL, or null for empty, fragment-only,
 *   `javascript:`, non-http(s) or unparsable hrefs.
 */
function toAbsoluteHttpUrl(rawHref, baseUrl) {
  const href = rawHref ? rawHref.trim() : "";
  const isUnusable = href === "" || href.startsWith("#") || href.startsWith("javascript:");
  if (isUnusable) return null;

  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
47
+
48
+
49
/**
 * Parse a publication date string.
 *
 * "Month YYYY" with a full English month name maps to noon UTC on the first
 * day of that month. Any other text is handed to the Date constructor.
 *
 * @param {string|null|undefined} rawDate - Date text or `datetime` attribute value.
 * @returns {Date|undefined} A valid Date, or undefined when the text is empty
 *   or cannot be parsed. An Invalid Date is never returned.
 */
function parsePubDate(rawDate) {
  const normalized = normalizeText(rawDate);
  if (!normalized) return undefined;

  const monthYear = normalized.match(/^([A-Za-z]+)\s+(\d{4})$/);
  if (monthYear) {
    const month = MONTH_TO_INDEX[monthYear[1].toLowerCase()];
    const year = Number(monthYear[2]);
    // The typeof guard rejects members inherited from Object.prototype
    // ("constructor", "tostring", ...), which would otherwise pass a
    // null check and end up as NaN inside Date.UTC.
    if (typeof month === "number") {
      return new Date(Date.UTC(year, month, 1, 12, 0, 0));
    }
  }

  const direct = new Date(normalized);
  return Number.isNaN(direct.getTime()) ? undefined : direct;
}
67
+
68
+
69
/**
 * Decide whether a URL points at a DeepMind research entry.
 *
 * Accepted: numeric publication pages and blog posts on deepmind.google,
 * and Google DeepMind posts on blog.google.
 *
 * @param {string} link - Absolute URL to classify.
 * @returns {boolean} True for an accepted link; false otherwise, including unparsable input.
 */
function isResearchLink(link) {
  let hostname;
  let pathname;
  try {
    ({ hostname, pathname } = new URL(link));
  } catch {
    return false;
  }

  switch (hostname) {
    case "deepmind.google":
      return (
        /^\/research\/publications\/\d+\/?$/i.test(pathname) ||
        /^\/blog\/[^?#]+/i.test(pathname)
      );
    case "blog.google":
      return /^\/technology\/google-deepmind\/[^?#]+/i.test(pathname);
    default:
      return false;
  }
}
89
+
90
+
91
/**
 * Read a title from the first heading found in a container.
 *
 * Heading levels are tried in order h1, h2, h3, h4; the first level that
 * has a match supplies the text.
 *
 * @param {{ querySelector: Function }} container - Element to search.
 * @returns {string} Normalized heading text, or "" when there is none.
 */
function extractTitle(container) {
  let heading;
  for (const tag of ["h1", "h2", "h3", "h4"]) {
    heading = container.querySelector(tag);
    if (heading != null) break;
  }
  return normalizeText(heading?.textContent);
}
101
+
102
+
103
/**
 * Read a summary from the first paragraph of a container.
 *
 * @param {{ querySelector: Function }} container - Element to search.
 * @param {string} title - Title already chosen for the item.
 * @returns {string|undefined} Normalized paragraph text, or undefined when it
 *   is empty or merely repeats the title.
 */
function extractSummary(container, title) {
  const text = normalizeText(container.querySelector("p")?.textContent);
  return text && text !== title ? text : undefined;
}
109
+
110
+
111
/**
 * Detect titles that are really button labels ("Learn more", "Read the paper", ...).
 *
 * @param {string} title - Candidate title.
 * @returns {boolean} True when the title starts with learn/view/see/read/watch as a whole word, case-insensitively.
 */
function isCallToActionTitle(title) {
  const leadingVerb = /^(learn|view|see|read|watch)\b/i;
  return leadingVerb.test(title);
}
114
+
115
+
116
/**
 * Build feed items from the `<article>` elements of the page.
 *
 * An article contributes an item when it has a non-call-to-action title and
 * its first anchor resolves to a research link not seen before. The date
 * comes from the article's `<time>` element (the `datetime` attribute first,
 * then its text) and falls back to the current time.
 *
 * @param {{ querySelectorAll: Function }} root - Parsed document.
 * @param {string} baseUrl - Base for relative links.
 * @returns {object[]} Items in document order.
 */
function parseItemsFromArticles(root, baseUrl) {
  const emittedLinks = new Set();
  const items = [];

  for (const article of root.querySelectorAll("article")) {
    const title = extractTitle(article);
    if (!title || isCallToActionTitle(title)) continue;

    const href = article.querySelector("a[href]")?.getAttribute("href");
    const link = toAbsoluteHttpUrl(href, baseUrl);
    if (!link || !isResearchLink(link) || emittedLinks.has(link)) continue;
    emittedLinks.add(link);

    const timeEl = article.querySelector("time");
    const dateRaw = timeEl?.getAttribute("datetime") ?? timeEl?.textContent ?? "";
    const pubDate = parsePubDate(dateRaw) ?? new Date();
    const summary = extractSummary(article, title);

    items.push({
      guid: hashGuid(link),
      title,
      link,
      pubDate,
      author: "Google DeepMind",
      summary,
      sourceId: "google-deepmind-research",
    });
  }

  return items;
}
151
+
152
+
153
/**
 * Find a title for an anchor.
 *
 * A heading (h1-h4) inside the anchor is preferred, then one inside the
 * anchor's parent. Without a heading, the anchor's own text is used when it
 * is at least 8 characters long and is not a call-to-action label.
 *
 * @param {{ querySelector: Function, parentNode?: object, textContent?: string }} anchor - Anchor element.
 * @returns {string} The chosen title, or "" when nothing suitable exists.
 */
function findTitleAroundAnchor(anchor) {
  const headingSelector = "h1, h2, h3, h4";

  let heading = anchor.querySelector(headingSelector);
  if (heading == null) {
    heading = anchor.parentNode?.querySelector?.(headingSelector);
  }
  const headingText = normalizeText(heading?.textContent);
  if (headingText) return headingText;

  const anchorText = normalizeText(anchor.textContent);
  const isUsable = anchorText.length >= 8 && !isCallToActionTitle(anchorText);
  return isUsable ? anchorText : "";
}
164
+
165
+
166
/**
 * Build feed items by scanning every anchor on the page.
 *
 * An anchor contributes an item when it resolves to a research link not yet
 * emitted and a title can be found around it. A link is only marked as
 * emitted once an item is produced for it, so a later anchor to the same
 * link may still succeed after an untitled one. Date and summary are read
 * from the anchor's parent (or the anchor itself when it has none).
 *
 * @param {{ querySelectorAll: Function }} root - Parsed document.
 * @param {string} baseUrl - Base for relative links.
 * @returns {object[]} Items in document order.
 */
function parseItemsFromAnchors(root, baseUrl) {
  const emittedLinks = new Set();
  const items = [];

  for (const anchor of root.querySelectorAll("a[href]")) {
    const link = toAbsoluteHttpUrl(anchor.getAttribute("href"), baseUrl);
    if (!link || !isResearchLink(link) || emittedLinks.has(link)) continue;

    const title = findTitleAroundAnchor(anchor);
    if (!title) continue;

    const container = anchor.parentNode ?? anchor;
    const timeEl = container.querySelector?.("time");
    const dateRaw = timeEl?.getAttribute?.("datetime") ?? timeEl?.textContent ?? "";
    const pubDate = parsePubDate(dateRaw) ?? new Date();
    const summary = extractSummary(container, title);

    emittedLinks.add(link);
    items.push({
      guid: hashGuid(link),
      title,
      link,
      pubDate,
      author: "Google DeepMind",
      summary,
      sourceId: "google-deepmind-research",
    });
  }

  return items;
}
201
+
202
+
203
/**
 * Plugin entry point: fetch the research page and extract items.
 *
 * The `<article>`-based parser runs first; the anchor scan is used only
 * when it finds nothing.
 *
 * @param {string} sourceId - Source identifier, passed through to `ctx.fetchHtml`.
 * @param {{ deps: object, fetchHtml: Function }} ctx - Host context; `deps` is stored in the module-level `_deps`.
 * @returns {Promise<object[]>} Items from the first parser that yields any.
 * @throws {Error} When neither parser finds an item.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;

  const page = await ctx.fetchHtml(sourceId, { waitMs: 4500 });
  const root = _deps.parseHtml(page.html);
  const baseUrl = page.finalUrl || DEEPMIND_ORIGIN;

  for (const parse of [parseItemsFromArticles, parseItemsFromAnchors]) {
    const items = parse(root, baseUrl);
    if (items.length > 0) return items;
  }

  throw new Error("[google-deepmind-research] 未解析到研究条目,页面结构可能已变化");
}
217
+
218
+
219
// Plugin descriptor: identifier, the listing-URL pattern it is registered under,
// and the item fetcher.
const googleDeepmindResearchPlugin = {
  id: "google-deepmind-research",
  listUrlPattern: /^https?:\/\/deepmind\.google\/research\/?(?:\?.*)?$/i,
  fetchItems,
};

export default googleDeepmindResearchPlugin;
@@ -0,0 +1,171 @@
1
// Dependency bag supplied by the host; assigned from `ctx.deps` at the start of fetchItems.
let _deps;



// Author label attached to every item created by buildItem.
const GOOGLE_RESEARCH_AUTHOR = "Google Research Datasets";
// Last-resort base URL for link resolution, used when neither the fetch's final URL nor the source id is set.
const DATASETS_URL = "https://research.google/resources/datasets/";
// Minimum length a text needs to be accepted as a summary by parseFromPurifiedHtml.
const MIN_SUMMARY_LENGTH = 24;
8
+
9
+
10
/**
 * Normalize whitespace: every run becomes one space, and the ends are trimmed.
 *
 * @param {string|null|undefined} text - Raw text; null and undefined yield "".
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  if (text == null) return "";
  return text.replace(/\s+/g, " ").trim();
}
13
+
14
+
15
/**
 * Hash a string into the guid used for a feed item.
 *
 * @param {string} input - Text to hash (callers pass the item link).
 * @returns {string} Hex string of the SHA-256 digest.
 */
function hashGuid(input) {
  const guid = _deps.createHash("sha256").update(input).digest("hex");
  return guid;
}
18
+
19
+
20
/**
 * Check whether a hostname is `google.<tld>` or a subdomain of it.
 *
 * The suffix after `google.` is restricted to a single 2-3 letter TLD
 * (`com`, `de`, `cat`, ...) or a `co.xx` / `com.xx` country form. A looser
 * "any letters and dots" suffix would also accept hosts that merely start
 * with a `google` label, e.g. `google.evil.com`, which anyone can create.
 * A trailing root dot is tolerated.
 *
 * @param {string} hostname - Hostname to test, without port.
 * @returns {boolean} True when the host looks like a Google search-style domain.
 */
function isGoogleHost(hostname) {
  return /^(?:[a-z0-9-]+\.)*google\.(?:com?\.[a-z]{2}|[a-z]{2,3})\.?$/i.test(hostname);
}
23
+
24
+
25
/**
 * Resolve an href against a base and return it as a URL object if it is http(s).
 *
 * @param {string|null|undefined} rawHref - Value of an `href` attribute or query parameter.
 * @param {string} baseUrl - Base for relative hrefs.
 * @returns {URL|null} The resolved URL, or null for empty, fragment-only,
 *   `javascript:`, non-http(s) or unparsable input.
 */
function resolveHttpUrl(rawHref, baseUrl) {
  const href = rawHref ? rawHref.trim() : "";
  if (href === "" || href.startsWith("#") || href.startsWith("javascript:")) {
    return null;
  }

  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved : null;
}
38
+
39
+
40
/**
 * Resolve an href to the link an item should carry.
 *
 * A Google redirect wrapper (`/url` on a Google host) is replaced by its
 * target, taken from the `q` parameter or, failing that, `url`. Every other
 * http(s) link is returned as resolved.
 *
 * @param {string|null|undefined} rawHref - Value of an `href` attribute.
 * @param {string} baseUrl - Base for relative hrefs.
 * @returns {string|null} Absolute URL, or null when the href (or the wrapped target) is unusable.
 */
function resolveResultLink(rawHref, baseUrl) {
  const url = resolveHttpUrl(rawHref, baseUrl);
  if (!url) return null;

  const isRedirectWrapper = isGoogleHost(url.hostname) && url.pathname === "/url";
  if (!isRedirectWrapper) return url.href;

  const wrapped = url.searchParams.get("q") ?? url.searchParams.get("url");
  const target = resolveHttpUrl(wrapped, url.href);
  return target?.href ?? null;
}
50
+
51
+
52
/**
 * Normalize a list of texts and drop blanks and case-insensitive repeats.
 *
 * The first spelling of each text is the one kept, and first-seen order is
 * preserved.
 *
 * @param {Iterable<string|null|undefined>} texts - Raw texts.
 * @returns {string[]} Unique normalized texts.
 */
function dedupeTexts(texts) {
  const firstByKey = new Map();
  for (const raw of texts) {
    const cleaned = normalizeText(raw);
    if (!cleaned) continue;
    const key = cleaned.toLowerCase();
    if (!firstByKey.has(key)) firstByKey.set(key, cleaned);
  }
  return [...firstByKey.values()];
}
65
+
66
+
67
/**
 * Collect the texts of the innermost text-bearing elements inside an anchor.
 *
 * Only headings, paragraphs, spans and divs that contain no further element
 * of those kinds are considered. Blank texts are dropped and the rest is
 * de-duplicated via dedupeTexts.
 *
 * @param {{ querySelectorAll: Function }} anchor - Anchor element to inspect.
 * @returns {string[]} Unique leaf texts in document order.
 */
function extractLeafTexts(anchor) {
  const textTags = "h1,h2,h3,h4,h5,h6,p,span,div";
  const leafTexts = anchor.querySelectorAll(textTags).flatMap((node) => {
    // A node that wraps another candidate is a container, not a leaf.
    if (node.querySelector(textTags) != null) return [];
    const text = normalizeText(node.textContent);
    return text ? [text] : [];
  });
  return dedupeTexts(leafTexts);
}
75
+
76
+
77
/**
 * Find the latest plausible year mentioned in a text.
 *
 * Only standalone four-digit tokens starting with "20" are considered, and
 * only years up to next year (UTC) are accepted.
 *
 * @param {string} text - Text to scan.
 * @returns {number|undefined} The greatest accepted year, or undefined when there is none.
 */
function parseYearFromText(text) {
  if (!text) return undefined;

  const latestAllowed = new Date().getUTCFullYear() + 1;
  let latest;
  for (const token of text.match(/\b20\d{2}\b/g) ?? []) {
    const year = Number(token);
    if (!Number.isFinite(year) || year < 2000 || year > latestAllowed) continue;
    if (latest === undefined || year > latest) latest = year;
  }
  return latest;
}
89
+
90
+
91
/**
 * Assemble a feed item for one dataset.
 *
 * When a year is found in the title or summary, the publication date is
 * noon UTC on 1 January of that year. Otherwise it is the current time
 * minus `index` seconds, so undated items get distinct, descending
 * timestamps in listing order.
 *
 * @param {string} title - Item title.
 * @param {string} link - Absolute item URL; also the input of the guid hash.
 * @param {string|undefined} summary - Item summary; empty values are stored as undefined.
 * @param {number} index - Position of the item in the output list.
 * @returns {object} The feed item.
 */
function buildItem(title, link, summary, index) {
  const year = parseYearFromText(`${title} ${summary ?? ""}`);

  let pubDate;
  if (year == null) {
    pubDate = new Date(Date.now() - index * 1000);
  } else {
    pubDate = new Date(Date.UTC(year, 0, 1, 12, 0, 0));
  }

  return {
    guid: hashGuid(link),
    title,
    link,
    pubDate,
    author: GOOGLE_RESEARCH_AUTHOR,
    summary: summary || undefined,
  };
}
105
+
106
+
107
/**
 * Extract dataset items from purified HTML, where class names cannot be relied on.
 *
 * Every anchor is inspected. Its first leaf text is the title, and the first
 * other leaf text of at least MIN_SUMMARY_LENGTH characters is the summary.
 * Anchors with fewer than two leaf texts, without a summary, or pointing at
 * a link already emitted are skipped.
 *
 * @param {string} html - Page markup.
 * @param {string} finalUrl - Base for relative links.
 * @returns {object[]} Items in document order.
 */
function parseFromPurifiedHtml(html, finalUrl) {
  const root = _deps.parseHtml(html);
  const emittedLinks = new Set();
  const items = [];

  for (const anchor of root.querySelectorAll("a[href]")) {
    const link = resolveResultLink(anchor.getAttribute("href"), finalUrl);
    if (!link || emittedLinks.has(link)) continue;

    const texts = extractLeafTexts(anchor);
    if (texts.length < 2) continue;

    const [title, ...others] = texts;
    const summary = others.find(
      (text) => text !== title && text.length >= MIN_SUMMARY_LENGTH,
    );
    if (!title || !summary) continue;

    emittedLinks.add(link);
    items.push(buildItem(title, link, summary, items.length));
  }

  return items;
}
129
+
130
+
131
/**
 * Extract dataset items from the raw page markup using its `row-card` classes.
 *
 * Each `a.row-card` contributes an item when it has a usable link not yet
 * emitted, a heading text and a sub-heading text.
 *
 * @param {string} html - Page markup.
 * @param {string} finalUrl - Base for relative links.
 * @returns {object[]} Items in document order.
 */
function parseFromRawHtml(html, finalUrl) {
  const root = _deps.parseHtml(html);
  const emittedLinks = new Set();
  const items = [];

  const textOf = (card, selector) => normalizeText(card.querySelector(selector)?.textContent);

  for (const card of root.querySelectorAll("a.row-card[href]")) {
    const link = resolveResultLink(card.getAttribute("href"), finalUrl);
    if (!link || emittedLinks.has(link)) continue;

    const title = textOf(card, ".row-card__heading");
    const summary = textOf(card, ".row-card__subheading__item");
    if (!title || !summary) continue;

    emittedLinks.add(link);
    items.push(buildItem(title, link, summary, items.length));
  }

  return items;
}
151
+
152
+
153
/**
 * Plugin entry point: fetch the datasets page and extract items.
 *
 * The page is first requested with default options and parsed with
 * parseFromPurifiedHtml. If that finds nothing, it is requested again with
 * `purify: false` and parsed with parseFromRawHtml.
 *
 * @param {string} sourceId - Source identifier; passed to `ctx.fetchHtml` and used as a base-URL fallback.
 * @param {{ deps: object, fetchHtml: Function }} ctx - Host context; `deps` is stored in the module-level `_deps`.
 * @returns {Promise<object[]>} Items from the first attempt that yields any.
 * @throws {Error} When both attempts find nothing.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;

  const attempts = [
    { options: { waitMs: 3500 }, parse: parseFromPurifiedHtml },
    { options: { waitMs: 3500, purify: false }, parse: parseFromRawHtml },
  ];

  for (const { options, parse } of attempts) {
    const page = await ctx.fetchHtml(sourceId, options);
    const items = parse(page.html, page.finalUrl || sourceId || DATASETS_URL);
    if (items.length > 0) return items;
  }

  throw new Error("[google-research-datasets] 未解析到数据集条目,页面结构可能已变化");
}
165
+
166
+
167
// Plugin descriptor: identifier, the listing-URL pattern it is registered under,
// and the item fetcher.
const googleResearchDatasetsPlugin = {
  id: "google-research-datasets",
  listUrlPattern: /^https?:\/\/research\.google\/resources\/datasets\/?(\?.*)?$/i,
  fetchItems,
};

export default googleResearchDatasetsPlugin;