rssany 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/app/plugins/builtin/agi-eval-evaluation.rssany.js +188 -0
  2. package/app/plugins/builtin/amii-research-talent.rssany.js +73 -0
  3. package/app/plugins/builtin/anthropic-research.rssany.js +155 -0
  4. package/app/plugins/builtin/appen-resources.rssany.js +155 -0
  5. package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +185 -0
  6. package/app/plugins/builtin/baaidata-csdn.rssany.js +242 -0
  7. package/app/plugins/builtin/baidu-research.rssany.js +222 -0
  8. package/app/plugins/builtin/brightdata-blog.rssany.js +301 -0
  9. package/app/plugins/builtin/bytedance-seed-research.rssany.js +231 -0
  10. package/app/plugins/builtin/five-radar.rssany.js +490 -0
  11. package/app/plugins/builtin/flageval-news.rssany.js +118 -0
  12. package/app/plugins/builtin/google-deepmind-research.rssany.js +223 -0
  13. package/app/plugins/builtin/google-research-datasets.rssany.js +171 -0
  14. package/app/plugins/builtin/google-research.rssany.js +220 -0
  15. package/app/plugins/builtin/google.rssany.js +187 -0
  16. package/app/plugins/builtin/hacker-news-newest.rssany.js +130 -0
  17. package/app/plugins/builtin/harvard-dataverse.rssany.js +166 -0
  18. package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +185 -0
  19. package/app/plugins/builtin/lingowhale.rssany.js +119 -0
  20. package/app/plugins/builtin/meituan-tech.rssany.js +130 -0
  21. package/app/plugins/builtin/meta-ai-publications.rssany.js +221 -0
  22. package/app/plugins/builtin/mila-quebec.rssany.js +199 -0
  23. package/app/plugins/builtin/mit-csail-research.rssany.js +208 -0
  24. package/app/plugins/builtin/moonshot.rssany.js +127 -0
  25. package/app/plugins/builtin/opendatalab-news.rssany.js +174 -0
  26. package/app/plugins/builtin/opendatalab.rssany.js +109 -0
  27. package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +114 -0
  28. package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +114 -0
  29. package/app/plugins/builtin/opendrivelab-publications.rssany.js +130 -0
  30. package/app/plugins/builtin/opendrivelab.rssany.js +333 -0
  31. package/app/plugins/builtin/paperswithcode.rssany.js +227 -0
  32. package/app/plugins/builtin/pjlab-adg-publications.rssany.js +202 -0
  33. package/app/plugins/builtin/rss.rssany.js +11 -1
  34. package/app/plugins/builtin/selectdataset.rssany.js +206 -0
  35. package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +154 -0
  36. package/app/plugins/builtin/supervisely-blog.rssany.js +159 -0
  37. package/app/plugins/builtin/theinformation-briefings.rssany.js +136 -0
  38. package/app/plugins/builtin/uci-ml-repository.rssany.js +111 -0
  39. package/app/plugins/builtin/venturebeat.rssany.js +97 -0
  40. package/app/plugins/builtin/worldlabs.rssany.js +129 -0
  41. package/app/plugins/builtin/x.rssany.js +328 -0
  42. package/app/plugins/builtin/xiaohongshu.rssany.js +283 -0
  43. package/app/plugins/builtin/zhipu-research.rssany.js +334 -0
  44. package/dist/index.js +62 -4
  45. package/dist/index.js.map +1 -1
  46. package/package.json +1 -1
  47. package/webui/build/200.html +6 -6
  48. package/webui/build/_app/immutable/assets/{0.DjU2hdCQ.css → 0.BB88QFoe.css} +1 -1
  49. package/webui/build/_app/immutable/assets/homeFeedPanelStore.CSvlNcpm.css +1 -0
  50. package/webui/build/_app/immutable/chunks/BwlaCkNX.js +36 -0
  51. package/webui/build/_app/immutable/chunks/C0J2-L94.js +1 -0
  52. package/webui/build/_app/immutable/chunks/CLOXMsDk.js +36 -0
  53. package/webui/build/_app/immutable/chunks/{C85CNwD2.js → DgceFEv5.js} +1 -1
  54. package/webui/build/_app/immutable/chunks/{CllQAdvt.js → SqCUd34O.js} +1 -1
  55. package/webui/build/_app/immutable/entry/{app.BcD2eSsQ.js → app.B8zBPipq.js} +2 -2
  56. package/webui/build/_app/immutable/entry/start.CxRCKeCl.js +1 -0
  57. package/webui/build/_app/immutable/nodes/0.ChLNE3xy.js +11 -0
  58. package/webui/build/_app/immutable/nodes/{1.DU9aYGAb.js → 1.1N74-4Io.js} +1 -1
  59. package/webui/build/_app/immutable/nodes/{10.Db6vw7Ih.js → 10.DY30t9Ib.js} +1 -1
  60. package/webui/build/_app/immutable/nodes/{11.BaAcorz3.js → 11.ITuxnukH.js} +1 -1
  61. package/webui/build/_app/immutable/nodes/12.qLzWqB1c.js +1 -0
  62. package/webui/build/_app/immutable/nodes/{14.DqT4pcrQ.js → 14.BHnIxbVM.js} +1 -1
  63. package/webui/build/_app/immutable/nodes/{15.CCLbjxnH.js → 15.CLjT9il3.js} +1 -1
  64. package/webui/build/_app/immutable/nodes/{16.DiigpVdP.js → 16.BD-mKCLN.js} +1 -1
  65. package/webui/build/_app/immutable/nodes/{3.DEcYOQc-.js → 3.Dt5o2Fmz.js} +1 -1
  66. package/webui/build/_app/immutable/nodes/{5.CvM1TkLG.js → 5.Dy3vSsIP.js} +1 -1
  67. package/webui/build/_app/immutable/nodes/{6.Dscr6LkS.js → 6.DvclsL6H.js} +1 -1
  68. package/webui/build/_app/immutable/nodes/{7.Bp60MobD.js → 7.D2nJy-Uz.js} +1 -1
  69. package/webui/build/_app/immutable/nodes/{8.DwSg0MHh.js → 8.C75mhrqs.js} +1 -1
  70. package/webui/build/_app/immutable/nodes/{9.BeYOUjxR.js → 9.Bp_QXw3w.js} +1 -1
  71. package/webui/build/_app/version.json +1 -1
  72. package/webui/build/_app/immutable/assets/homeFeedPanelStore.BopJZtHu.css +0 -1
  73. package/webui/build/_app/immutable/chunks/CdMsRjxJ.js +0 -1
  74. package/webui/build/_app/immutable/chunks/CtijX1u3.js +0 -31
  75. package/webui/build/_app/immutable/chunks/Dv1VCsiB.js +0 -41
  76. package/webui/build/_app/immutable/entry/start.CbkdJdz1.js +0 -1
  77. package/webui/build/_app/immutable/nodes/0.DSUDmOx2.js +0 -11
  78. package/webui/build/_app/immutable/nodes/12.Cg8AeCSH.js +0 -1
@@ -0,0 +1,154 @@
1
// Host-provided dependencies; assigned at the start of fetchItems and read by the helpers.
let _deps;

// Plugin identifier; also used as the prefix of this plugin's error message.
const SITE_ID = "sensetime-tech-achievements";

// Matches a 20YY-M-D date (one- or two-digit month/day); groups: year, month, day.
const DATE_RE = /\b(20\d{2})-(\d{1,2})-(\d{1,2})\b/;
6
+
7
/**
 * Collapses every run of whitespace into a single space and trims both ends.
 *
 * Values other than strings are tolerated instead of throwing on a missing
 * `.replace`: null/undefined and objects yield "", other primitives (for
 * example a numeric field coming from the JSON API) are stringified.
 *
 * @param {unknown} text - Raw value to normalize.
 * @returns {string} Single-line, trimmed text.
 */
function normalizeText(text) {
  if (text == null || typeof text === "object") return "";
  return String(text).replace(/\s+/g, " ").trim();
}
10
+
11
/**
 * Builds an item GUID: the hex-encoded SHA-256 digest of `input`, computed with
 * the `createHash` function supplied through `_deps`.
 *
 * @param {string} input - Value to hash (callers pass the item link).
 * @returns {string} Hex digest.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256").update(input);
  return hasher.digest("hex");
}
14
+
15
/**
 * Resolves `rawHref` against `baseUrl` and returns the absolute URL string.
 *
 * @param {string|null|undefined} rawHref - The href attribute value.
 * @param {string} baseUrl - Base used for relative hrefs.
 * @returns {string|null} Absolute http(s) URL, or null when the href is empty,
 *   a fragment or `javascript:` pseudo-link, unparsable, or not http(s).
 */
function toAbsoluteUrl(rawHref, baseUrl) {
  const candidate = rawHref ? rawHref.trim() : "";
  const isPseudoLink = candidate.startsWith("#") || candidate.startsWith("javascript:");
  if (candidate === "" || isPseudoLink) return null;

  let resolved;
  try {
    resolved = new URL(candidate, baseUrl);
  } catch {
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
27
+
28
/**
 * Extracts the first `20YY-M-D` date found in `dateText` and returns it as
 * midnight UTC of that day.
 *
 * @param {string|null|undefined} dateText - Text that may contain a date.
 * @returns {Date|undefined} The parsed date, or undefined when no date is
 *   present or the matched numbers are not a real calendar date.
 */
function parseDate(dateText) {
  const m = normalizeText(dateText).match(DATE_RE);
  if (!m) return undefined;

  const year = Number(m[1]);
  const month = Number(m[2]);
  const day = Number(m[3]);
  const parsed = new Date(Date.UTC(year, month - 1, day));

  // Date.UTC rolls impossible dates over (month 13, Feb 31). Reject them so the
  // callers' `?? new Date()` fallback applies instead of a wrong or Invalid Date.
  const isRealDate =
    parsed.getUTCFullYear() === year &&
    parsed.getUTCMonth() === month - 1 &&
    parsed.getUTCDate() === day;
  return isRealDate ? parsed : undefined;
}
35
+
36
/**
 * Searches for a date string in the anchor's own text and then in its ancestors,
 * inspecting at most eight nodes in total.
 *
 * @param {object|null} anchor - Node exposing `textContent` and `parentNode`.
 * @returns {string} The first matched date substring, or "" when none is found.
 */
function extractDateText(anchor) {
  let node = anchor;
  let depth = 0;
  while (depth < 8 && node) {
    const found = DATE_RE.exec(normalizeText(node.textContent));
    if (found) return found[0];
    node = node.parentNode ?? null;
    depth += 1;
  }
  return "";
}
46
+
47
/**
 * Collects tag labels from `<span>` elements near the anchor.
 *
 * Starting at the anchor and climbing at most six levels, returns the
 * de-duplicated span texts of the first level that has any usable ones.
 * Empty texts, the item title, the date text, and anything that looks like a
 * date are not treated as tags.
 *
 * @param {object|null} anchor - Node exposing `querySelectorAll` and `parentNode`.
 * @param {string} title - Item title, excluded from the tags.
 * @param {string} dateText - Item date text, excluded from the tags.
 * @returns {string[]} Unique tags in document order, or [] when none are found.
 */
function extractTags(anchor, title, dateText) {
  let current = anchor;
  for (let i = 0; i < 6 && current; i += 1) {
    // Array.from: querySelectorAll may return an array-like collection (e.g. a
    // NodeList) that has no .map/.filter of its own.
    const spans = Array.from(current.querySelectorAll?.("span") ?? []);
    const tags = spans
      .map((span) => normalizeText(span.textContent))
      .filter((tag) => tag && tag !== title && tag !== dateText && !DATE_RE.test(tag));
    if (tags.length > 0) return [...new Set(tags)];
    current = current.parentNode ?? null;
  }
  return [];
}
61
+
62
/**
 * Builds feed items from the list page HTML.
 *
 * Every anchor whose href contains "/technology-new-detail/" becomes one item,
 * keyed by its absolute link (first occurrence wins). Anchors without a usable
 * link or without text are skipped. The summary is "<date> | <tag> / <tag>"
 * built from whatever date text and tags are found near the anchor.
 *
 * @param {string} html - Page HTML, parsed with `_deps.parseHtml`.
 * @param {string} finalUrl - Base URL for resolving relative links.
 * @returns {Array<{guid: string, title: string, link: string, pubDate: Date, summary: (string|undefined)}>}
 */
function parseItemsFromHtml(html, finalUrl) {
  const doc = _deps.parseHtml(html);
  const knownLinks = new Set();
  const feedItems = [];

  for (const anchor of doc.querySelectorAll('a[href*="/technology-new-detail/"]')) {
    const link = toAbsoluteUrl(anchor.getAttribute("href"), finalUrl);
    const isNewLink = Boolean(link) && !knownLinks.has(link);
    // The title is only needed (and the link only recorded) for new links.
    const title = isNewLink ? normalizeText(anchor.textContent) : "";
    if (!title) continue;

    knownLinks.add(link);
    const dateText = extractDateText(anchor);
    const tagLine = extractTags(anchor, title, dateText).join(" / ");
    const summary = [dateText, tagLine].filter(Boolean).join(" | ");

    feedItems.push({
      guid: hashGuid(link),
      title,
      link,
      // No parsable date near the anchor: stamp the item with the fetch time.
      pubDate: parseDate(dateText) ?? new Date(),
      summary: summary || undefined,
    });
  }

  return feedItems;
}
92
+
93
/**
 * Fallback source: requests the site's JSON listing endpoint (same origin as
 * `finalUrl`) and maps its rows to feed items.
 *
 * Rows without a content id or title are skipped and duplicate links are
 * dropped. Row fields are stringified before use, so a numeric or otherwise
 * non-string `title`, `createTime` or tag cannot abort the whole fallback.
 *
 * @param {string} finalUrl - Absolute URL of the list page; only its origin is used.
 * @returns {Promise<Array<{guid: string, title: string, link: string, pubDate: Date, summary: (string|undefined)}>>}
 *   Parsed items; [] when the response is not OK or carries no list.
 * @throws {TypeError} If `finalUrl` is not a valid absolute URL.
 */
async function fetchItemsFromApi(finalUrl) {
  const origin = new URL(finalUrl).origin;
  const apiUrl = new URL("/rest/v1/contents/1/getlistbyparam/48/1/20/0/0?scene=1", origin);

  const res = await fetch(apiUrl, {
    headers: {
      Accept: "application/json,text/plain,*/*",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    },
  });
  if (!res.ok) return [];

  const data = await res.json();
  const list = Array.isArray(data?.data?.lists) ? data.data.lists : [];

  // JSON fields are not guaranteed to be strings; stringify before normalizing.
  const toText = (value) => normalizeText(value == null ? "" : String(value));

  const items = [];
  const seen = new Set();

  for (const row of list) {
    const contentId = toText(row?.contentId);
    const title = toText(row?.title);
    if (!contentId || !title) continue;

    const link = new URL(`/cn/technology-new-detail/${contentId}?categoryId=48`, origin).href;
    if (seen.has(link)) continue;
    seen.add(link);

    const dateText = toText(row?.createTime);
    const tags = Array.isArray(row?.tagnames) ? row.tagnames.map(toText).filter(Boolean) : [];
    const summary = [dateText, tags.join(" / ")].filter(Boolean).join(" | ");

    // Fall back to "now" for a missing date and also for an Invalid Date,
    // which `??` alone would let through.
    const parsed = parseDate(dateText);
    const pubDate = parsed && !Number.isNaN(parsed.getTime()) ? parsed : new Date();

    items.push({
      guid: hashGuid(link),
      title,
      link,
      pubDate,
      summary: summary || undefined,
    });
  }

  return items;
}
137
+
138
/**
 * Plugin entry point: fetches the list page, parses items from its HTML and,
 * when the HTML yields nothing, falls back to the JSON listing endpoint.
 *
 * @param {string} sourceId - Identifier passed to `ctx.fetchHtml`; also used as
 *   the base URL when the fetch reports no `finalUrl` (assumes it is the list
 *   page URL — confirm against the host).
 * @param {{deps: object, fetchHtml: Function}} ctx - Host context.
 * @returns {Promise<object[]>} Parsed feed items (never empty).
 * @throws {Error} When neither the HTML nor the API yields any item.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });

  // Without a base URL relative links cannot be resolved and the API fallback
  // would throw on `new URL(undefined)`.
  const baseUrl = finalUrl || sourceId;

  const items = parseItemsFromHtml(html, baseUrl);
  if (items.length > 0) return items;

  const fallbackItems = await fetchItemsFromApi(baseUrl);
  if (fallbackItems.length > 0) return fallbackItems;

  throw new Error(`[${SITE_ID}] 未解析到学术成果条目,页面结构或接口可能已变化`);
}
149
+
150
// Plugin descriptor: the plugin id, the pattern of list-page URLs this plugin
// targets (presumably matched by the host — confirm), and the entry point.
const sensetimeTechAchievementsPlugin = {
  id: SITE_ID,
  listUrlPattern: /^https?:\/\/(www\.)?sensetime\.com\/cn\/technology-achievements(\?.*)?$/i,
  fetchItems,
};

export default sensetimeTechAchievementsPlugin;
@@ -0,0 +1,159 @@
1
// Host-provided dependencies; assigned at the start of fetchItems.
let _deps;

// Supervisely Blog plugin: fetches the blog list page and parses it into feed
// items (article bodies are not fetched).

const SUPERVISELY_ORIGIN = "https://supervisely.com";

// Zero-based month numbers keyed by lowercase three-letter English abbreviation.
const MONTH_INDEX = {
  jan: 0, feb: 1, mar: 2, apr: 3,
  may: 4, jun: 5, jul: 6, aug: 7,
  sep: 8, oct: 9, nov: 10, dec: 11,
};
22
+
23
+
24
/**
 * Collapses whitespace runs into single spaces and trims the result.
 *
 * @param {string|null|undefined} text - Raw text; null/undefined count as "".
 * @returns {string} Normalized text.
 */
function normalizeText(text) {
  const raw = text ?? "";
  const collapsed = raw.replace(/\s+/g, " ");
  return collapsed.trim();
}
27
+
28
+
29
/**
 * Returns the hex SHA-256 digest of `input`, used as the item GUID.
 *
 * @param {string} input - Value to hash (callers pass the item link).
 * @returns {string} Hex digest.
 */
function hashGuid(input) {
  const algorithm = "sha256";
  const encoding = "hex";
  return _deps.createHash(algorithm).update(input).digest(encoding);
}
32
+
33
+
34
/**
 * Resolves `href` against `baseUrl`.
 *
 * @param {string|null|undefined} href - The href attribute value.
 * @param {string} baseUrl - Base used for relative hrefs.
 * @returns {string|null} Absolute URL, or null when the href is empty,
 *   unparsable, or does not use http(s).
 */
function toAbsoluteUrl(href, baseUrl) {
  if (href) {
    try {
      const { protocol, href: absolute } = new URL(href, baseUrl);
      if (/^https?:$/i.test(protocol)) return absolute;
    } catch {
      // Unparsable href/base: fall through to the null result.
    }
  }
  return null;
}
44
+
45
+
46
/**
 * Parses a date written as "<Month> <D>, <YYYY>" (month name or abbreviation,
 * e.g. "Apr 14, 2026" or "September 5, 2024") into midnight UTC of that day.
 *
 * @param {string|null|undefined} rawText - Text of the date element.
 * @returns {Date|undefined} The parsed date, or undefined when the text does
 *   not have that shape, the month is unknown, or the day does not exist in
 *   that month.
 */
function parsePubDate(rawText) {
  const m = normalizeText(rawText).match(/^([A-Za-z]{3,9})\s+(\d{1,2}),\s*(\d{4})$/);
  if (!m) return undefined;

  // Only the first three letters identify the month ("September" -> "sep").
  const month = MONTH_INDEX[m[1].slice(0, 3).toLowerCase()];
  if (month == null) return undefined;

  const day = Number(m[2]);
  const year = Number(m[3]);
  const date = new Date(Date.UTC(year, month, day, 0, 0, 0));

  // Date.UTC rolls out-of-range values over ("Feb 31" -> Mar 2, day 0 -> the
  // previous month); treat those as unparsable rather than returning a wrong day.
  const isRealDate =
    date.getUTCFullYear() === year && date.getUTCMonth() === month && date.getUTCDate() === day;
  return isRealDate ? date : undefined;
}
57
+
58
+
59
/**
 * Tells whether `link` points at a single blog post, i.e. its path is exactly
 * "/blog/<slug>" with an optional trailing slash.
 *
 * @param {string} link - Absolute URL.
 * @returns {boolean} False for non-post paths and for unparsable URLs.
 */
function looksLikeBlogLink(link) {
  let pathname;
  try {
    ({ pathname } = new URL(link));
  } catch {
    return false;
  }
  return /^\/blog\/[^/]+\/?$/i.test(pathname);
}
67
+
68
+
69
/**
 * Finds the nearest ancestor of `node` that contains a `<time>` element,
 * checking at most `maxDepth` ancestors.
 *
 * @param {object|null|undefined} node - Starting node.
 * @param {number} maxDepth - Maximum number of ancestors to inspect.
 * @returns {object|null} The matching ancestor; otherwise the direct parent
 *   (or null when there is none).
 */
function findAncestor(node, maxDepth) {
  const parent = node?.parentNode ?? null;
  let candidate = parent;
  let steps = 0;
  while (steps < maxDepth && candidate) {
    if (candidate.querySelector?.("time")) return candidate;
    candidate = candidate.parentNode ?? null;
    steps += 1;
  }
  return parent;
}
77
+
78
+
79
/**
 * Assembles a feed item from parsed fields.
 *
 * The GUID is the hash of the link, a missing publication date defaults to the
 * current time, and empty author/summary values are stored as undefined.
 *
 * @param {{title: string, link: string, summary?: string, author?: string, pubDate?: Date}} fields
 * @returns {{guid: string, title: string, link: string, pubDate: Date, author: (string|undefined), summary: (string|undefined)}}
 */
function buildFeedItem({ title, link, summary, author, pubDate }) {
  const item = { guid: hashGuid(link), title, link };
  item.pubDate = pubDate ?? new Date();
  item.author = author || undefined;
  item.summary = summary || undefined;
  return item;
}
89
+
90
+
91
/**
 * Primary parser: turns each `div.blog-card` into a feed item.
 *
 * A card is used only if its `h4 a[href]` has text and resolves to a blog-post
 * link not seen before. Summary, author and date come from the card's first
 * `p`, author element and `time` element respectively.
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @param {string} baseUrl - Base URL for relative links.
 * @returns {object[]} Feed items in document order.
 */
function parseFromCards(root, baseUrl) {
  const visited = new Set();
  const feedItems = [];

  for (const card of root.querySelectorAll("div.blog-card")) {
    const heading = card.querySelector("h4 a[href]");
    const title = normalizeText(heading?.textContent);
    const link = toAbsoluteUrl(heading?.getAttribute("href"), baseUrl);
    const usable = title && link && looksLikeBlogLink(link) && !visited.has(link);
    if (!usable) continue;

    // Normalized text of the first element in the card matching `selector`.
    const textOf = (selector) => normalizeText(card.querySelector(selector)?.textContent);

    visited.add(link);
    feedItems.push(
      buildFeedItem({
        title,
        link,
        summary: textOf("p"),
        author: textOf('b[rel="author"], address b'),
        pubDate: parsePubDate(textOf("time")),
      })
    );
  }

  return feedItems;
}
113
+
114
+
115
/**
 * Fallback parser for pages without `div.blog-card`: starts from every
 * `h4 a[href*="/blog/"]` anchor and reads summary, author and date from the
 * container chosen by `findAncestor(anchor, 7)`.
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @param {string} baseUrl - Base URL for relative links.
 * @returns {object[]} Feed items in document order, one per distinct post link.
 */
function parseFromHeadingFallback(root, baseUrl) {
  const visited = new Set();
  const feedItems = [];

  for (const anchor of root.querySelectorAll('h4 a[href*="/blog/"]')) {
    const title = normalizeText(anchor.textContent);
    const link = toAbsoluteUrl(anchor.getAttribute("href"), baseUrl);
    if (!(title && link && looksLikeBlogLink(link)) || visited.has(link)) continue;

    const container = findAncestor(anchor, 7);
    // Normalized text of the first element in the container matching `selector`.
    const textOf = (selector) => normalizeText(container?.querySelector(selector)?.textContent);

    visited.add(link);
    feedItems.push(
      buildFeedItem({
        title,
        link,
        summary: textOf("p"),
        author: textOf('b[rel="author"], address b'),
        pubDate: parsePubDate(textOf("time")),
      })
    );
  }

  return feedItems;
}
137
+
138
+
139
/**
 * Plugin entry point: fetches the blog list page and parses it, preferring the
 * card-based parser and falling back to the heading-based one.
 *
 * @param {string} sourceId - Identifier passed to `ctx.fetchHtml`.
 * @param {{deps: object, fetchHtml: Function}} ctx - Host context.
 * @returns {Promise<object[]>} Parsed feed items (never empty).
 * @throws {Error} When neither parser finds an article.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const page = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
  const root = _deps.parseHtml(page.html);
  // Relative links resolve against the fetched URL, or the site origin if unknown.
  const baseUrl = page.finalUrl || SUPERVISELY_ORIGIN;

  let items = parseFromCards(root, baseUrl);
  if (items.length === 0) {
    items = parseFromHeadingFallback(root, baseUrl);
  }
  if (items.length === 0) {
    throw new Error("[supervisely-blog] 未解析到文章条目,页面结构可能已变化");
  }
  return items;
}
153
+
154
+
155
// Plugin descriptor: the plugin id, the pattern of blog list-page URLs this
// plugin targets (presumably matched by the host — confirm), and the entry point.
const superviselyBlogPlugin = {
  id: "supervisely-blog",
  listUrlPattern: /^https?:\/\/(www\.)?supervisely\.com\/blog\/?(?:\?.*)?$/i,
  fetchItems,
};

export default superviselyBlogPlugin;
@@ -0,0 +1,136 @@
1
// Host-provided dependencies; assigned at the start of fetchItems.
let _deps;

// The Information — Briefings list page: https://www.theinformation.com/briefings
// Markup: .content-feed .article.briefing.feed-item; title in `h3.title a`,
// summary in `.briefing-dek`, timestamp in `.authors`.

const ORIGIN = "https://www.theinformation.com";

// List-page URLs handled by this plugin (optional "www.", trailing slash and query).
const LIST_URL_RE = /^https?:\/\/(www\.)?theinformation\.com\/briefings\/?(\?.*)?$/i;
9
+
10
+
11
/**
 * Collapses whitespace runs into single spaces and strips leading/trailing
 * whitespace.
 *
 * @param {string|null|undefined} text - Raw text; null/undefined count as "".
 * @returns {string} Normalized text.
 */
function normalizeText(text) {
  const value = text ?? "";
  // Splitting on whitespace runs and re-joining the non-empty pieces is the
  // same as collapsing the runs and trimming.
  return value
    .split(/\s+/)
    .filter((piece) => piece !== "")
    .join(" ");
}
14
+
15
+
16
/**
 * Returns the hex SHA-256 digest of `input`, used as the item GUID.
 *
 * @param {string} input - Value to hash (callers pass the item link).
 * @returns {string} Hex digest.
 */
function hashGuid(input) {
  const digest = _deps.createHash("sha256").update(input).digest("hex");
  return digest;
}
19
+
20
+
21
/**
 * Resolves `rawHref` against `baseUrl` and keeps it only if it is an http(s) URL.
 *
 * @param {string|null|undefined} rawHref - The href attribute value.
 * @param {string} baseUrl - Base used for relative hrefs.
 * @returns {string|null} Absolute URL, or null for empty hrefs, fragment or
 *   `javascript:` pseudo-links, unparsable values and other protocols.
 */
function toAbsoluteHttpUrl(rawHref, baseUrl) {
  if (!rawHref) return null;

  const href = rawHref.trim();
  const isPseudoLink = ["#", "javascript:"].some((prefix) => href.startsWith(prefix));
  if (!href || isPseudoLink) return null;

  try {
    const url = new URL(href, baseUrl);
    return /^https?:$/i.test(url.protocol) ? url.href : null;
  } catch {
    return null;
  }
}
33
+
34
+
35
/**
 * Stringifies `n` and left-pads it with zeros to at least two characters.
 *
 * @param {number|string} n - Value to format.
 * @returns {string} e.g. 5 -> "05", 12 -> "12".
 */
function pad2(n) {
  const digits = String(n);
  return digits.padStart(2, "0");
}
38
+
39
+
40
/**
 * Returns the UTC offset to apply for a Pacific-time abbreviation.
 *
 * "PDT" and "PST" map to fixed offsets. The generic "PT" is resolved by asking
 * Intl which abbreviation America/Los_Angeles uses around that wall-clock time
 * (the probe assumes -08:00, so it can be off only within an hour of a
 * daylight-saving switch).
 *
 * @param {string} tz - Upper-case abbreviation: "PDT", "PST" or "PT".
 * @param {string} localIso - Wall-clock time as "YYYY-MM-DDTHH:mm:ss".
 * @returns {string} "-07:00" or "-08:00".
 */
function resolvePacificOffset(tz, localIso) {
  if (tz === "PDT") return "-07:00";
  if (tz === "PST") return "-08:00";

  const probe = new Date(`${localIso}-08:00`);
  if (Number.isNaN(probe.getTime())) return "-08:00";
  try {
    const parts = new Intl.DateTimeFormat("en-US", {
      timeZone: "America/Los_Angeles",
      timeZoneName: "short",
    }).formatToParts(probe);
    const name = parts.find((part) => part.type === "timeZoneName")?.value;
    return name === "PDT" ? "-07:00" : "-08:00";
  } catch {
    // Time-zone data unavailable: keep the historical standard-time assumption.
    return "-08:00";
  }
}

/**
 * Parses the `.authors` line of a briefing, e.g. "Apr 14, 2026 · 5:41am PDT"
 * (optionally followed by "· 1 comment"), into a Date.
 *
 * Timezone abbreviations such as "PDT" are not parsed reliably by `Date`, so
 * the UTC offset is applied by hand.
 *
 * @param {string|null|undefined} raw - Text of the `.authors` element.
 * @returns {Date} The parsed instant; if the time part cannot be read, the date
 *   part alone as parsed by `Date`; if that fails too, the current time.
 */
function parseBriefingAuthorsDate(raw) {
  // Drop a trailing "· N comment(s)" segment before matching.
  const text = normalizeText(raw).replace(/\s*·\s*\d+\s+comments?\s*$/i, "").trim();

  const m = text.match(
    /^(.+?\d{4})\s*·\s*(\d{1,2}):(\d{2})\s*(am|pm)\s*(PDT|PST|PT)\s*$/i
  );
  if (m) {
    const day = new Date(m[1].trim());
    if (!Number.isNaN(day.getTime())) {
      let hour = Number(m[2]);
      const minute = Number(m[3]);
      const meridiem = m[4].toLowerCase();
      if (meridiem === "pm" && hour < 12) hour += 12;
      if (meridiem === "am" && hour === 12) hour = 0;

      // `day` was parsed as local time, so the local getters recover the
      // calendar date exactly as written.
      const datePart = `${day.getFullYear()}-${pad2(day.getMonth() + 1)}-${pad2(day.getDate())}`;
      const localIso = `${datePart}T${pad2(hour)}:${pad2(minute)}:00`;
      const offset = resolvePacificOffset(m[5].toUpperCase(), localIso);

      const out = new Date(`${localIso}${offset}`);
      if (!Number.isNaN(out.getTime())) return out;
    }
  }

  const fallback = new Date(text.split("·")[0].trim());
  return Number.isNaN(fallback.getTime()) ? new Date() : fallback;
}
75
+
76
+
77
/**
 * Parses the briefings list page into feed items.
 *
 * Each `.content-feed .article.briefing.feed-item` node contributes one item:
 * title and link from `h3.title a[href]`, publication date from `.authors`,
 * summary from `.briefing-dek`. Nodes without a title link, title or usable
 * link are skipped, as are repeated links.
 *
 * @param {string} html - Page HTML, parsed with `_deps.parseHtml`.
 * @param {string} pageUrl - Base URL for relative links.
 * @returns {Array<{guid: string, title: string, link: string, pubDate: Date, summary: (string|undefined)}>}
 */
function parseBriefingItems(html, pageUrl) {
  const root = _deps.parseHtml(html);
  const visited = new Set();
  const briefings = [];

  for (const entry of root.querySelectorAll(".content-feed .article.briefing.feed-item")) {
    const anchor = entry.querySelector("h3.title a[href]");
    if (!anchor) continue;

    const title = normalizeText(anchor.textContent);
    const link = toAbsoluteHttpUrl(anchor.getAttribute("href"), pageUrl);
    if (!title || !link || visited.has(link)) continue;
    visited.add(link);

    // Normalized text of the first element in this entry matching `selector`.
    const textOf = (selector) => normalizeText(entry.querySelector(selector)?.textContent ?? "");

    briefings.push({
      guid: hashGuid(link),
      title,
      link,
      pubDate: parseBriefingAuthorsDate(textOf(".authors")),
      summary: textOf(".briefing-dek") || undefined,
    });
  }

  return briefings;
}
106
+
107
+
108
/**
 * Plugin entry point: fetches the briefings page (waiting for the feed
 * selector), parses it and returns the items newest first.
 *
 * @param {string} sourceId - Identifier passed to `ctx.fetchHtml`; also the
 *   base-URL fallback when no final URL is reported.
 * @param {{deps: object, fetchHtml: Function}} ctx - Host context.
 * @returns {Promise<object[]>} Feed items sorted by `pubDate`, descending.
 * @throws {Error} When no item is parsed; the message carries the HTTP status
 *   when it is 400 or above.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const fetchOptions = {
    waitMs: 5000,
    waitForSelector: ".content-feed .article.briefing",
    waitForSelectorTimeoutMs: 25_000,
  };
  const { html, finalUrl, status } = await ctx.fetchHtml(sourceId, fetchOptions);

  const items = parseBriefingItems(html, finalUrl || sourceId || ORIGIN);
  if (items.length > 0) {
    items.sort((a, b) => b.pubDate.getTime() - a.pubDate.getTime());
    return items;
  }

  const hint = status && status >= 400 ? ` HTTP ${status}` : "";
  throw new Error(
    `[theinformation-briefings] 未解析到条目,页面结构可能已变化或需登录后抓取。${hint}`
  );
}
129
+
130
+
131
// Plugin descriptor: the plugin id, the list-page URL pattern, a refresh
// interval of "1h" (interpreted by the host — format not shown here), and the
// entry point.
const theInformationBriefingsPlugin = {
  id: "theinformation-briefings",
  listUrlPattern: LIST_URL_RE,
  refreshInterval: "1h",
  fetchItems,
};

export default theInformationBriefingsPlugin;
@@ -0,0 +1,111 @@
1
// Host-provided dependencies; assigned at the start of fetchItems.
let _deps;

// Site origin, used as the last-resort base URL for resolving dataset links.
const UCI_ORIGIN = "https://archive.ics.uci.edu";
5
+
6
/**
 * Collapses whitespace runs into single spaces and trims the result.
 *
 * @param {string|null|undefined} text - Raw text.
 * @returns {string} Normalized text; "" for null/undefined.
 */
function normalizeText(text) {
  if (text == null) return "";
  return text.replace(/\s+/g, " ").trim();
}
9
+
10
/**
 * Returns the hex SHA-256 digest of `input`, used as the item GUID.
 *
 * @param {string} input - Value to hash (callers pass the dataset link).
 * @returns {string} Hex digest.
 */
function hashGuid(input) {
  const sha256 = _deps.createHash("sha256");
  return sha256.update(input).digest("hex");
}
13
+
14
/**
 * Resolves an href to a canonical dataset page URL.
 *
 * Only http(s) URLs on archive.ics.uci.edu whose path is exactly
 * "/dataset/<digits>/<slug>" are accepted; query string and fragment are
 * removed from the result.
 *
 * @param {string|null|undefined} rawHref - The href attribute value.
 * @param {string} baseUrl - Base used for relative hrefs.
 * @returns {string|null} Canonical dataset URL, or null when the href is not a
 *   dataset link.
 */
function resolveDatasetLink(rawHref, baseUrl) {
  const href = normalizeText(rawHref);
  if (href === "" || href.startsWith("#") || href.startsWith("javascript:")) return null;

  let url;
  try {
    url = new URL(href, baseUrl);
  } catch {
    return null;
  }

  const isHttp = /^https?:$/i.test(url.protocol);
  const onUciHost = url.hostname === "archive.ics.uci.edu";
  const isDatasetPath = /^\/dataset\/\d+\/[^/?#]+$/i.test(url.pathname);
  if (!isHttp || !onUciHost || !isDatasetPath) return null;

  url.search = "";
  url.hash = "";
  return url.href;
}
29
+
30
/**
 * Looks for summary text near `node`: the first non-empty `<p>` found in the
 * node itself or in one of its ancestors, inspecting at most six nodes.
 *
 * @param {object|null} node - Node exposing `querySelector` and `parentNode`.
 * @returns {string} Normalized paragraph text, or "" when none is found.
 */
function pickSummaryNearNode(node) {
  let candidate = node;
  let hops = 0;
  while (hops < 6 && candidate) {
    const text = normalizeText(candidate.querySelector?.("p")?.textContent);
    if (text) return text;
    candidate = candidate.parentNode ?? null;
    hops += 1;
  }
  return "";
}
40
+
41
/**
 * Assembles a feed item for one dataset.
 *
 * No date is taken from the page: `pubDate` is the current time minus
 * `index` seconds, so items earlier in the list get later timestamps.
 *
 * @param {{title: string, link: string, summary?: string, index: number}} fields
 * @returns {{guid: string, title: string, link: string, pubDate: Date, summary: (string|undefined), sourceId: string}}
 */
function buildItem({ title, link, summary, index }) {
  const guid = hashGuid(link);
  const pubDate = new Date(Date.now() - index * 1000);
  return { guid, title, link, pubDate, summary: summary || undefined, sourceId: "uci-ml-repository" };
}
51
+
52
/**
 * Primary parser: builds one item per distinct dataset link found in
 * `h2 a[href^="/dataset/"]` anchors, using the anchor text as the title.
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @param {string} baseUrl - Base URL for relative links.
 * @returns {object[]} Items in document order; anchors without a valid dataset
 *   link or without text are skipped.
 */
function parseFromHeadingAnchors(root, baseUrl) {
  const visited = new Set();
  const datasets = [];

  for (const anchor of root.querySelectorAll('h2 a[href^="/dataset/"]')) {
    const link = resolveDatasetLink(anchor.getAttribute("href"), baseUrl);
    if (!link || visited.has(link)) continue;

    const title = normalizeText(anchor.textContent);
    if (title) {
      const summary = pickSummaryNearNode(anchor.parentNode ?? anchor);
      visited.add(link);
      // The item's position in the output list is passed on as `index`.
      datasets.push(buildItem({ title, link, summary, index: datasets.length }));
    }
  }
  return datasets;
}
70
+
71
/**
 * Fallback parser: builds one item per distinct dataset link found in any
 * `a[href^="/dataset/"]` anchor. The title is the anchor text or, when that is
 * empty, the `alt` text of an image inside the anchor.
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @param {string} baseUrl - Base URL for relative links.
 * @returns {object[]} Items in document order; anchors without a valid dataset
 *   link or without any title are skipped.
 */
function parseFromGenericAnchors(root, baseUrl) {
  const visited = new Set();
  const datasets = [];

  for (const anchor of root.querySelectorAll('a[href^="/dataset/"]')) {
    const link = resolveDatasetLink(anchor.getAttribute("href"), baseUrl);
    if (!link || visited.has(link)) continue;

    const textTitle = normalizeText(anchor.textContent);
    const altTitle = normalizeText(anchor.querySelector("img")?.getAttribute("alt"));
    const title = textTitle || altTitle;
    if (title) {
      const summary = pickSummaryNearNode(anchor.parentNode ?? anchor);
      visited.add(link);
      // The item's position in the output list is passed on as `index`.
      datasets.push(buildItem({ title, link, summary, index: datasets.length }));
    }
  }
  return datasets;
}
91
+
92
/**
 * Plugin entry point: fetches the page and extracts dataset items, first from
 * heading anchors and, if that yields nothing, from any dataset anchor.
 *
 * @param {string} sourceId - Identifier passed to `ctx.fetchHtml`; also the
 *   base-URL fallback when no final URL is reported.
 * @param {{deps: object, fetchHtml: Function}} ctx - Host context.
 * @returns {Promise<object[]>} Parsed items (never empty).
 * @throws {Error} When neither parser finds a dataset.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const page = await ctx.fetchHtml(sourceId, { waitMs: 4000 });
  const baseUrl = page.finalUrl || sourceId || UCI_ORIGIN;
  const root = _deps.parseHtml(page.html);

  for (const parse of [parseFromHeadingAnchors, parseFromGenericAnchors]) {
    const found = parse(root, baseUrl);
    if (found.length > 0) return found;
  }

  throw new Error("[uci-ml-repository] 未解析到数据集条目,页面结构可能已变化");
}
106
+
107
// Plugin descriptor: the plugin id, the pattern of site root / "datasets" URLs
// this plugin targets (presumably matched by the host — confirm), and the entry point.
const uciMlRepositoryPlugin = {
  id: "uci-ml-repository",
  listUrlPattern: /^https?:\/\/archive\.ics\.uci\.edu(?:\/(?:datasets\/?)?)?(?:\?.*)?$/i,
  fetchItems,
};

export default uciMlRepositoryPlugin;
@@ -0,0 +1,97 @@
1
// Host-provided dependencies; assigned at the start of fetchItems.
let _deps;

// VentureBeat plugin: reads the list from the official RSS feed, which avoids
// the security-check page served for the home page.
4
+
5
+
6
+
7
+
8
/**
 * Collapses every run of whitespace into a single space and trims both ends.
 *
 * Feed fields are not guaranteed to be strings. Instead of throwing on a
 * missing `.replace` (which would abort the whole fetch), null/undefined and
 * objects yield "" and other primitives are stringified.
 *
 * @param {unknown} text - Raw value to normalize.
 * @returns {string} Single-line, trimmed text.
 */
function normalizeText(text) {
  if (text == null || typeof text === "object") return "";
  return String(text).replace(/\s+/g, " ").trim();
}
11
+
12
+
13
/**
 * Removes HTML tags (each replaced by a space) and normalizes the remaining
 * whitespace. Entities are left as they are.
 *
 * Non-string input does not throw: null/undefined and objects yield "", other
 * primitives are stringified first.
 *
 * @param {unknown} text - HTML fragment.
 * @returns {string} Plain text.
 */
function stripHtml(text) {
  const raw = text == null || typeof text === "object" ? "" : String(text);
  return normalizeText(raw.replace(/<[^>]*>/g, " "));
}
16
+
17
+
18
/**
 * Converts a raw date value into a Date, substituting the current time when
 * the value is missing or cannot be parsed.
 *
 * @param {string|number|Date|null|undefined} raw - Value accepted by `new Date`.
 * @returns {Date} A valid Date.
 */
function toValidDate(raw) {
  const parsed = raw ? new Date(raw) : null;
  if (parsed === null || Number.isNaN(parsed.getTime())) return new Date();
  return parsed;
}
23
+
24
+
25
/**
 * Derives the RSS feed URL for a site URL by appending "/feed/" to its path
 * and dropping query and fragment. A URL whose path already ends in "/feed"
 * (ignoring trailing slashes) is returned unchanged.
 *
 * @param {string} sourceId - Absolute site or feed URL.
 * @returns {string} Feed URL.
 * @throws {TypeError} If `sourceId` is not a valid absolute URL.
 */
function toFeedUrl(sourceId) {
  const feedUrl = new URL(sourceId);
  const trimmedPath = feedUrl.pathname.replace(/\/+$/, "");
  if (!trimmedPath.endsWith("/feed")) {
    feedUrl.pathname = trimmedPath ? `${trimmedPath}/feed/` : "/feed/";
    feedUrl.search = "";
    feedUrl.hash = "";
  }
  return feedUrl.href;
}
34
+
35
+
36
/**
 * Maps one parsed RSS entry to a feed item.
 *
 * The summary prefers `contentSnippet` and otherwise strips tags from
 * `summary`/`content`; the author comes from `creator` or `author`; the date
 * from `isoDate` or `pubDate`. An empty title becomes "(无标题)".
 *
 * @param {object} item - Entry produced by the RSS parser.
 * @returns {{guid: string, title: string, link: string, pubDate: Date, author: (string|undefined), summary: (string|undefined)}|null}
 *   The mapped item, or null when the entry has no http(s) link.
 */
function mapFeedItem(item) {
  const link = normalizeText(item.link ?? "");
  if (!/^https?:\/\//i.test(link)) return null;

  const snippet = normalizeText(item.contentSnippet ?? "");
  const summary = snippet || stripHtml(item.summary ?? item.content ?? "");
  const byline = normalizeText(item.creator ?? item.author ?? "");

  return {
    guid: _deps.createHash("sha256").update(link).digest("hex"),
    title: normalizeText(item.title ?? "") || "(无标题)",
    link,
    pubDate: toValidDate(item.isoDate ?? item.pubDate),
    author: byline || undefined,
    summary: summary || undefined,
  };
}
54
+
55
+
56
/**
 * Plugin entry point: downloads the site's RSS feed and maps its entries to
 * feed items, dropping entries without an http(s) link and repeated links.
 *
 * @param {string} sourceId - Site URL; the feed URL is derived from it.
 * @param {{deps: object}} _ctx - Host context; `deps.RssParser` is used to
 *   fetch and parse the feed.
 * @returns {Promise<object[]>} Feed items in feed order (never empty).
 * @throws {Error} When the feed cannot be fetched or parsed (the original error
 *   is attached as `cause`), or when it yields no usable entry.
 */
async function fetchItems(sourceId, _ctx) {
  _deps = _ctx.deps;
  const parser = new _deps.RssParser({
    timeout: 15_000,
    headers: {
      "User-Agent": "RssAny/1.0 (+https://github.com/rssany/rssany)",
      Accept: "application/rss+xml,application/atom+xml,application/xml,text/xml,*/*",
    },
  });
  const feedUrl = toFeedUrl(sourceId);

  let feed;
  try {
    feed = await parser.parseURL(feedUrl);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    // Keep the underlying error (and its stack) reachable through `cause`.
    throw new Error(`[venturebeat] 抓取 feed 失败: ${feedUrl} (${msg})`, { cause: err });
  }

  const seen = new Set();
  const items = [];
  for (const entry of feed.items ?? []) {
    const mapped = mapFeedItem(entry);
    if (!mapped || seen.has(mapped.link)) continue;
    seen.add(mapped.link);
    items.push(mapped);
  }

  if (items.length === 0) {
    throw new Error(`[venturebeat] 未解析到条目: ${feedUrl}`);
  }

  return items;
}
90
+
91
+
92
// Plugin descriptor: the plugin id, the pattern of site-root URLs this plugin
// targets, a refresh interval of "1h" (interpreted by the host — format not
// shown here), and the entry point.
const ventureBeatPlugin = {
  id: "venturebeat",
  listUrlPattern: /^https?:\/\/(www\.)?venturebeat\.com\/?(\?.*)?$/i,
  refreshInterval: "1h",
  fetchItems,
};

export default ventureBeatPlugin;