rssany 0.1.6 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. package/README.md +23 -27
  2. package/app/plugins/builtin/agi-eval-evaluation.rssany.js +7 -8
  3. package/app/plugins/builtin/amii-research-talent.rssany.js +6 -7
  4. package/app/plugins/builtin/anthropic-research.rssany.js +6 -8
  5. package/app/plugins/builtin/appen-resources.rssany.js +6 -7
  6. package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +9 -10
  7. package/app/plugins/builtin/baaidata-csdn.rssany.js +6 -7
  8. package/app/plugins/builtin/baidu-research.rssany.js +5 -8
  9. package/app/plugins/builtin/brightdata-blog.rssany.js +7 -12
  10. package/app/plugins/builtin/bytedance-seed-research.rssany.js +5 -7
  11. package/app/plugins/builtin/email.rssany.js +9 -9
  12. package/app/plugins/builtin/five-radar.rssany.js +10 -12
  13. package/app/plugins/builtin/flageval-news.rssany.js +5 -7
  14. package/app/plugins/builtin/google-deepmind-research.rssany.js +7 -9
  15. package/app/plugins/builtin/google-research-datasets.rssany.js +6 -8
  16. package/app/plugins/builtin/google-research.rssany.js +6 -8
  17. package/app/plugins/builtin/hacker-news-newest.rssany.js +7 -9
  18. package/app/plugins/builtin/harvard-dataverse.rssany.js +6 -8
  19. package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +7 -9
  20. package/app/plugins/builtin/lingowhale.rssany.js +7 -9
  21. package/app/plugins/builtin/meituan-tech.rssany.js +7 -10
  22. package/app/plugins/builtin/meta-ai-publications.rssany.js +6 -11
  23. package/app/plugins/builtin/mila-quebec.rssany.js +6 -8
  24. package/app/plugins/builtin/mit-csail-research.rssany.js +7 -9
  25. package/app/plugins/builtin/moonshot.rssany.js +6 -8
  26. package/app/plugins/builtin/opendatalab-news.rssany.js +6 -7
  27. package/app/plugins/builtin/opendatalab.rssany.js +5 -6
  28. package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +6 -7
  29. package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +7 -8
  30. package/app/plugins/builtin/opendrivelab-publications.rssany.js +7 -9
  31. package/app/plugins/builtin/opendrivelab.rssany.js +7 -8
  32. package/app/plugins/builtin/paperswithcode.rssany.js +6 -8
  33. package/app/plugins/builtin/pjlab-adg-publications.rssany.js +8 -10
  34. package/app/plugins/builtin/rss.rssany.js +11 -12
  35. package/app/plugins/builtin/selectdataset.rssany.js +6 -8
  36. package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +7 -8
  37. package/app/plugins/builtin/supervisely-blog.rssany.js +6 -8
  38. package/app/plugins/builtin/theinformation-briefings.rssany.js +144 -136
  39. package/app/plugins/builtin/uci-ml-repository.rssany.js +6 -7
  40. package/app/plugins/builtin/venturebeat.rssany.js +7 -9
  41. package/app/plugins/builtin/worldlabs.rssany.js +6 -8
  42. package/app/plugins/builtin/x.rssany.js +7 -9
  43. package/app/plugins/builtin/xiaohongshu.rssany.js +119 -56
  44. package/app/plugins/builtin/zhipu-research.rssany.js +7 -10
  45. package/app/plugins/site.rssany.js +25 -25
  46. package/{statics → app/statics}/README.md +7 -7
  47. package/bin/rssany.js +226 -6
  48. package/dist/index.js +545 -396
  49. package/dist/index.js.map +1 -1
  50. package/package.json +20 -13
  51. package/scripts/dev.mjs +114 -0
  52. package/scripts/reset.mjs +1 -1
  53. package/app/plugins/builtin/google.rssany.js +0 -187
  54. package/init/config.json +0 -17
  55. package/init/sources.json +0 -353
  56. package/statics/401.html +0 -56
  57. package/statics/404.html +0 -12
  58. package/statics/image.png +0 -0
  59. package/webui/build/200.html +0 -49
  60. package/webui/build/_app/env.js +0 -1
  61. package/webui/build/_app/immutable/assets/0.BB88QFoe.css +0 -1
  62. package/webui/build/_app/immutable/assets/10.Dj8_pmut.css +0 -1
  63. package/webui/build/_app/immutable/assets/11.qYZMiTb0.css +0 -1
  64. package/webui/build/_app/immutable/assets/12.Ct59LCqW.css +0 -1
  65. package/webui/build/_app/immutable/assets/13.BhO9zvFi.css +0 -1
  66. package/webui/build/_app/immutable/assets/14.CujIhjQK.css +0 -1
  67. package/webui/build/_app/immutable/assets/15.nNGjXhCQ.css +0 -1
  68. package/webui/build/_app/immutable/assets/16.PP9XLDf7.css +0 -1
  69. package/webui/build/_app/immutable/assets/4.9wPHhVwv.css +0 -1
  70. package/webui/build/_app/immutable/assets/5.ClehBQ0g.css +0 -1
  71. package/webui/build/_app/immutable/assets/6.DSJfjJwx.css +0 -1
  72. package/webui/build/_app/immutable/assets/7.CrNxmd8B.css +0 -1
  73. package/webui/build/_app/immutable/assets/8.Ba5_jYIY.css +0 -1
  74. package/webui/build/_app/immutable/assets/9.m-LCx_kl.css +0 -1
  75. package/webui/build/_app/immutable/assets/BackToParentRoute.DGk-X5ow.css +0 -1
  76. package/webui/build/_app/immutable/assets/SourcesList.yTBBi3_m.css +0 -1
  77. package/webui/build/_app/immutable/assets/homeFeedPanelStore.CSvlNcpm.css +0 -1
  78. package/webui/build/_app/immutable/chunks/B-OsL1Ct.js +0 -1
  79. package/webui/build/_app/immutable/chunks/B2Q1a1-H.js +0 -2
  80. package/webui/build/_app/immutable/chunks/BK3WtZwv.js +0 -1
  81. package/webui/build/_app/immutable/chunks/BQqoDzLx.js +0 -1
  82. package/webui/build/_app/immutable/chunks/BUApaBEI.js +0 -1
  83. package/webui/build/_app/immutable/chunks/BbWUOQ_m.js +0 -1
  84. package/webui/build/_app/immutable/chunks/Bfc47y5P.js +0 -1
  85. package/webui/build/_app/immutable/chunks/Bp63qm3L.js +0 -1
  86. package/webui/build/_app/immutable/chunks/BwlaCkNX.js +0 -36
  87. package/webui/build/_app/immutable/chunks/C0J2-L94.js +0 -1
  88. package/webui/build/_app/immutable/chunks/CBY2biv-.js +0 -1
  89. package/webui/build/_app/immutable/chunks/CLOXMsDk.js +0 -36
  90. package/webui/build/_app/immutable/chunks/CVzlFH44.js +0 -1
  91. package/webui/build/_app/immutable/chunks/CWNeClHp.js +0 -6
  92. package/webui/build/_app/immutable/chunks/Cihqbfi5.js +0 -1
  93. package/webui/build/_app/immutable/chunks/D5GvRCv7.js +0 -1
  94. package/webui/build/_app/immutable/chunks/DEDI7Ecm.js +0 -1
  95. package/webui/build/_app/immutable/chunks/DFuhmi31.js +0 -1
  96. package/webui/build/_app/immutable/chunks/DMWEh-Ek.js +0 -2
  97. package/webui/build/_app/immutable/chunks/DgceFEv5.js +0 -1
  98. package/webui/build/_app/immutable/chunks/DjNLq3TF.js +0 -1
  99. package/webui/build/_app/immutable/chunks/Dt2CddFe.js +0 -1
  100. package/webui/build/_app/immutable/chunks/Dw782Tjs.js +0 -1
  101. package/webui/build/_app/immutable/chunks/SqCUd34O.js +0 -1
  102. package/webui/build/_app/immutable/chunks/Xy_fhzQq.js +0 -1
  103. package/webui/build/_app/immutable/chunks/hp4PFHFv.js +0 -1
  104. package/webui/build/_app/immutable/chunks/lk5LaiqA.js +0 -1
  105. package/webui/build/_app/immutable/chunks/mW5RwvnK.js +0 -13
  106. package/webui/build/_app/immutable/chunks/tB7QMF3U.js +0 -1
  107. package/webui/build/_app/immutable/chunks/xtNWTdbD.js +0 -1
  108. package/webui/build/_app/immutable/entry/app.B8zBPipq.js +0 -2
  109. package/webui/build/_app/immutable/entry/start.CxRCKeCl.js +0 -1
  110. package/webui/build/_app/immutable/nodes/0.ChLNE3xy.js +0 -11
  111. package/webui/build/_app/immutable/nodes/1.1N74-4Io.js +0 -1
  112. package/webui/build/_app/immutable/nodes/10.DY30t9Ib.js +0 -1
  113. package/webui/build/_app/immutable/nodes/11.ITuxnukH.js +0 -1
  114. package/webui/build/_app/immutable/nodes/12.qLzWqB1c.js +0 -1
  115. package/webui/build/_app/immutable/nodes/13.nT3SOzEB.js +0 -1
  116. package/webui/build/_app/immutable/nodes/14.BHnIxbVM.js +0 -1
  117. package/webui/build/_app/immutable/nodes/15.CLjT9il3.js +0 -1
  118. package/webui/build/_app/immutable/nodes/16.BD-mKCLN.js +0 -24
  119. package/webui/build/_app/immutable/nodes/17.BtYZF6FM.js +0 -1
  120. package/webui/build/_app/immutable/nodes/18.Ba_qJjp6.js +0 -1
  121. package/webui/build/_app/immutable/nodes/2.BYWOpaxy.js +0 -1
  122. package/webui/build/_app/immutable/nodes/3.Dt5o2Fmz.js +0 -1
  123. package/webui/build/_app/immutable/nodes/4.DTSxpKm7.js +0 -2
  124. package/webui/build/_app/immutable/nodes/5.Dy3vSsIP.js +0 -1
  125. package/webui/build/_app/immutable/nodes/6.DvclsL6H.js +0 -1
  126. package/webui/build/_app/immutable/nodes/7.D2nJy-Uz.js +0 -1
  127. package/webui/build/_app/immutable/nodes/8.C75mhrqs.js +0 -1
  128. package/webui/build/_app/immutable/nodes/9.Bp_QXw3w.js +0 -1
  129. package/webui/build/_app/version.json +0 -1
@@ -1,6 +1,12 @@
1
1
  // 内置 RSS/Atom/JSON Feed:通过浏览器(Puppeteer)拉取 Feed URL,再用 rss-parser 解析;
2
2
  // 与站点插件一致走 Chrome,便于应对需浏览器环境或代理的场景;XML 使用 HTTP 响应原文(useHttpResponseBody)。
3
-
3
+ export const id = "__rss__";
4
+ export const name = "RSS Feed";
5
+ export const pattern = /^https:\/\//;
6
+ export const match = looksLikeFeed;
7
+ export const priority = 20;
8
+ export const refreshInterval = "1h";
9
+
4
10
  const UA = "RssAny/1.0 (+https://github.com/joohw/rssany)";
5
11
 
6
12
  const IMAGE_TYPE_RE = /^image\//i;
@@ -76,7 +82,7 @@ function extractItemImageUrl(item) {
76
82
  firstImgSrcFromHtml(item.summary) ||
77
83
  firstImgSrcFromHtml(item["content:encoded"]) ||
78
84
  firstImgSrcFromHtml(item.contentSnippet);
79
- if (fromHtml && /^https?:\/\//i.test(fromHtml)) {
85
+ if (fromHtml && /^https:\/\//i.test(fromHtml)) {
80
86
  return fromHtml;
81
87
  }
82
88
 
@@ -102,13 +108,7 @@ async function fetchFeedXml(url, ctx) {
102
108
  return html;
103
109
  }
104
110
 
105
- export default {
106
- id: "__rss__",
107
- pattern: /^https?:\/\//,
108
- match: looksLikeFeed,
109
- priority: 20,
110
- refreshInterval: "1h",
111
- async fetchItems(sourceId, ctx) {
111
+ export async function fetchItems(sourceId, ctx) {
112
112
  const { deps } = ctx;
113
113
  const xml = await fetchFeedXml(sourceId, ctx);
114
114
  const parser = new deps.RssParser({
@@ -155,9 +155,8 @@ export default {
155
155
  if (!imageUrl) return base;
156
156
  return { ...base, imageUrl, cover_img: imageUrl };
157
157
  });
158
- },
159
- };
160
-
158
+ }
159
+
161
160
  function looksLikeFeed(url) {
162
161
  const lower = url.toLowerCase();
163
162
  return (
@@ -1,3 +1,7 @@
1
+ export const id = "selectdataset";
2
+ export const name = "Selectdataset";
3
+ export const listUrlPattern = /^https:\/\/(www\.)?selectdataset\.com\/(?:$|\?.*|search(?:\?.*)?|subject(?:\?.*)?)$/i;
4
+
1
5
  let _deps;
2
6
 
3
7
  // SelectDataset 插件:解析首页/搜索页 Nuxt payload,输出数据集条目(不含 enrich)
@@ -66,7 +70,7 @@ function parseFromAnchorDom(html, finalUrl) {
66
70
  let link = null;
67
71
  try {
68
72
  const url = new URL(href, baseUrl);
69
- if (!/^https?:$/i.test(url.protocol)) continue;
73
+ if (!/^https:$/i.test(url.protocol)) continue;
70
74
  if (!/\/dataset\/[A-Za-z0-9]{16,}/.test(url.pathname)) continue;
71
75
  link = url.href;
72
76
  } catch {
@@ -184,7 +188,7 @@ function parseFromNuxtPayload(html) {
184
188
  }
185
189
 
186
190
 
187
- async function fetchItems(sourceId, ctx) {
191
+ export async function fetchItems(sourceId, ctx) {
188
192
  _deps = ctx.deps;
189
193
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
190
194
  const fromAnchorDom = parseFromAnchorDom(html, finalUrl);
@@ -198,9 +202,3 @@ async function fetchItems(sourceId, ctx) {
198
202
  throw new Error("[selectdataset] 未解析到数据集条目,页面结构可能已变化");
199
203
  }
200
204
 
201
-
202
- export default {
203
- id: "selectdataset",
204
- listUrlPattern: /^https?:\/\/(www\.)?selectdataset\.com\/(?:$|\?.*|search(?:\?.*)?|subject(?:\?.*)?)$/i,
205
- fetchItems,
206
- };
@@ -1,7 +1,11 @@
1
+ export const id = "sensetime-tech-achievements";
2
+ export const name = "Sensetime Tech Achievements";
3
+ export const listUrlPattern = /^https:\/\/(www\.)?sensetime\.com\/cn\/technology-achievements(\?.*)?$/i;
4
+
1
5
  let _deps;
2
6
 
3
7
 
4
- const SITE_ID = "sensetime-tech-achievements";
8
+ const SITE_ID = id;
5
9
  const DATE_RE = /\b(20\d{2})-(\d{1,2})-(\d{1,2})\b/;
6
10
 
7
11
  function normalizeText(text) {
@@ -18,7 +22,7 @@ function toAbsoluteUrl(rawHref, baseUrl) {
18
22
  if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
19
23
  try {
20
24
  const url = new URL(href, baseUrl);
21
- if (!/^https?:$/i.test(url.protocol)) return null;
25
+ if (!/^https:$/i.test(url.protocol)) return null;
22
26
  return url.href;
23
27
  } catch {
24
28
  return null;
@@ -135,7 +139,7 @@ async function fetchItemsFromApi(finalUrl) {
135
139
  return items;
136
140
  }
137
141
 
138
- async function fetchItems(sourceId, ctx) {
142
+ export async function fetchItems(sourceId, ctx) {
139
143
  _deps = ctx.deps;
140
144
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
141
145
  const items = parseItemsFromHtml(html, finalUrl);
@@ -147,8 +151,3 @@ async function fetchItems(sourceId, ctx) {
147
151
  throw new Error(`[${SITE_ID}] 未解析到学术成果条目,页面结构或接口可能已变化`);
148
152
  }
149
153
 
150
- export default {
151
- id: SITE_ID,
152
- listUrlPattern: /^https?:\/\/(www\.)?sensetime\.com\/cn\/technology-achievements(\?.*)?$/i,
153
- fetchItems,
154
- };
@@ -1,3 +1,7 @@
1
+ export const id = "supervisely-blog";
2
+ export const name = "Supervisely Blog";
3
+ export const listUrlPattern = /^https:\/\/(www\.)?supervisely\.com\/blog\/?(?:\?.*)?$/i;
4
+
1
5
  let _deps;
2
6
 
3
7
  // Supervisely Blog 插件:抓取列表页并解析为 FeedItem(不做正文 enrich)
@@ -35,7 +39,7 @@ function toAbsoluteUrl(href, baseUrl) {
35
39
  if (!href) return null;
36
40
  try {
37
41
  const url = new URL(href, baseUrl);
38
- if (!/^https?:$/i.test(url.protocol)) return null;
42
+ if (!/^https:$/i.test(url.protocol)) return null;
39
43
  return url.href;
40
44
  } catch {
41
45
  return null;
@@ -136,7 +140,7 @@ function parseFromHeadingFallback(root, baseUrl) {
136
140
  }
137
141
 
138
142
 
139
- async function fetchItems(sourceId, ctx) {
143
+ export async function fetchItems(sourceId, ctx) {
140
144
  _deps = ctx.deps;
141
145
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
142
146
  const root = _deps.parseHtml(html);
@@ -151,9 +155,3 @@ async function fetchItems(sourceId, ctx) {
151
155
  return items;
152
156
  }
153
157
 
154
-
155
- export default {
156
- id: "supervisely-blog",
157
- listUrlPattern: /^https?:\/\/(www\.)?supervisely\.com\/blog\/?(?:\?.*)?$/i,
158
- fetchItems,
159
- };
@@ -1,136 +1,144 @@
1
- let _deps;
2
-
3
- // The Information Briefings 列表页:https://www.theinformation.com/briefings
4
- // 结构:.content-feed .article.briefing.feed-item,标题 h3.title a,摘要 .briefing-dek,时间 .authors
5
-
6
- const ORIGIN = "https://www.theinformation.com";
7
- const LIST_URL_RE =
8
- /^https?:\/\/(www\.)?theinformation\.com\/briefings\/?(\?.*)?$/i;
9
-
10
-
11
- function normalizeText(text) {
12
- return (text ?? "").replace(/\s+/g, " ").trim();
13
- }
14
-
15
-
16
- function hashGuid(input) {
17
- return _deps.createHash("sha256").update(input).digest("hex");
18
- }
19
-
20
-
21
- function toAbsoluteHttpUrl(rawHref, baseUrl) {
22
- if (!rawHref) return null;
23
- const href = rawHref.trim();
24
- if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
25
- try {
26
- const url = new URL(href, baseUrl);
27
- if (!/^https?:$/i.test(url.protocol)) return null;
28
- return url.href;
29
- } catch {
30
- return null;
31
- }
32
- }
33
-
34
-
35
- function pad2(n) {
36
- return String(n).padStart(2, "0");
37
- }
38
-
39
-
40
- /** .authors 文本:Apr 14, 2026 · 5:41am PDT(可含 · 1 comment);Node 不能可靠解析 PDT 缩写,手动换算 offset */
41
- function parseBriefingAuthorsDate(raw) {
42
- let t = normalizeText(raw);
43
- t = t.replace(/\s*·\s*\d+\s+comments?\s*$/i, "").trim();
44
-
45
- const m = t.match(
46
- /^(.+?\d{4})\s*·\s*(\d{1,2}:\d{2}\s*(?:am|pm))\s*(PDT|PST|PT)\s*$/i
47
- );
48
- if (m) {
49
- const datePart = m[1].trim();
50
- const timePart = m[2].trim();
51
- const tz = m[3].toUpperCase();
52
- const offset = tz === "PDT" ? "-07:00" : "-08:00";
53
-
54
- const hm = timePart.match(/(\d{1,2}):(\d{2})\s*(am|pm)/i);
55
- const d0 = new Date(datePart);
56
- if (hm && !Number.isNaN(d0.getTime())) {
57
- let h = Number(hm[1]);
58
- const min = Number(hm[2]);
59
- const ap = hm[3].toLowerCase();
60
- if (ap === "pm" && h < 12) h += 12;
61
- if (ap === "am" && h === 12) h = 0;
62
- const y = d0.getFullYear();
63
- const mo = d0.getMonth() + 1;
64
- const da = d0.getDate();
65
- const iso = `${y}-${pad2(mo)}-${pad2(da)}T${pad2(h)}:${pad2(min)}:00${offset}`;
66
- const out = new Date(iso);
67
- if (!Number.isNaN(out.getTime())) return out;
68
- }
69
- }
70
-
71
- const first = t.split("·")[0].trim();
72
- const fallback = new Date(first);
73
- return Number.isNaN(fallback.getTime()) ? new Date() : fallback;
74
- }
75
-
76
-
77
- function parseBriefingItems(html, pageUrl) {
78
- const root = _deps.parseHtml(html);
79
- const items = [];
80
- const seen = new Set();
81
-
82
- for (const node of root.querySelectorAll(".content-feed .article.briefing.feed-item")) {
83
- const linkEl = node.querySelector("h3.title a[href]");
84
- if (!linkEl) continue;
85
-
86
- const title = normalizeText(linkEl.textContent);
87
- const link = toAbsoluteHttpUrl(linkEl.getAttribute("href"), pageUrl);
88
- if (!title || !link || seen.has(link)) continue;
89
- seen.add(link);
90
-
91
- const authorsText = normalizeText(node.querySelector(".authors")?.textContent ?? "");
92
- const pubDate = parseBriefingAuthorsDate(authorsText);
93
- const summary = normalizeText(node.querySelector(".briefing-dek")?.textContent ?? "") || undefined;
94
-
95
- items.push({
96
- guid: hashGuid(link),
97
- title,
98
- link,
99
- pubDate,
100
- summary,
101
- });
102
- }
103
-
104
- return items;
105
- }
106
-
107
-
108
- async function fetchItems(sourceId, ctx) {
109
- _deps = ctx.deps;
110
- const { html, finalUrl, status } = await ctx.fetchHtml(sourceId, {
111
- waitMs: 5000,
112
- waitForSelector: ".content-feed .article.briefing",
113
- waitForSelectorTimeoutMs: 25_000,
114
- });
115
-
116
- const pageUrl = finalUrl || sourceId || ORIGIN;
117
- const items = parseBriefingItems(html, pageUrl);
118
-
119
- if (items.length === 0) {
120
- const hint = status && status >= 400 ? ` HTTP ${status}` : "";
121
- throw new Error(
122
- `[theinformation-briefings] 未解析到条目,页面结构可能已变化或需登录后抓取。${hint}`
123
- );
124
- }
125
-
126
- items.sort((a, b) => b.pubDate.getTime() - a.pubDate.getTime());
127
- return items;
128
- }
129
-
130
-
131
- export default {
132
- id: "theinformation-briefings",
133
- listUrlPattern: LIST_URL_RE,
134
- refreshInterval: "1h",
135
- fetchItems,
136
- };
1
+ export const id = "theinformation";
2
+ export const name = "Theinformation";
3
+ export const listUrlPattern = /^https:\/\/(www\.)?theinformation\.com\/(briefings|features\/[^/]+)\/?(\?.*)?$/i;
4
+ export const refreshInterval = "1h";
5
+
6
+ let _deps;
7
+
8
+ // The Information — AI Agenda 和 Briefings 列表页
9
+ // 当前结构:.article.feed-item,标题 h3.title a,分类 .category-content a,作者 .authors,摘要 .recent-excerpt .long-excerpt
10
+
11
+ const ORIGIN = "https://www.theinformation.com";
12
+
13
+ function normalizeText(text) {
14
+ return (text ?? "").replace(/\s+/g, " ").trim();
15
+ }
16
+
17
+
18
+ function hashGuid(input) {
19
+ return _deps.createHash("sha256").update(input).digest("hex");
20
+ }
21
+
22
+
23
+ function toAbsoluteHttpUrl(rawHref, baseUrl) {
24
+ if (!rawHref) return null;
25
+ const href = rawHref.trim();
26
+ if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
27
+ try {
28
+ const url = new URL(href, baseUrl);
29
+ if (!/^https:$/i.test(url.protocol)) return null;
30
+ return url.href;
31
+ } catch {
32
+ return null;
33
+ }
34
+ }
35
+
36
+
37
+ function pad2(n) {
38
+ return String(n).padStart(2, "0");
39
+ }
40
+
41
+
42
+ /** .authors 文本:By Author · Apr 14, 2026 · 7:52am PDT */
43
+ function parseAuthorsDate(raw) {
44
+ let t = normalizeText(raw);
45
+ t = t.replace(/\s*·\s*\d+\s+comments?\s*$/i, "").trim();
46
+
47
+ const m = t.match(
48
+ /^By\s+(.+?)\s*·\s*(.+?\d{4})\s*·\s*(\d{1,2}:\d{2}\s*(?:am|pm))\s*(PDT|PST|PT)\s*$/i
49
+ );
50
+ if (m) {
51
+ const author = m[1].trim();
52
+ const datePart = m[2].trim();
53
+ const timePart = m[3].trim();
54
+ const tz = m[4].toUpperCase();
55
+ const offset = tz === "PDT" ? "-07:00" : "-08:00";
56
+
57
+ const hm = timePart.match(/(\d{1,2}):(\d{2})\s*(am|pm)/i);
58
+ const d0 = new Date(datePart);
59
+ if (hm && !Number.isNaN(d0.getTime())) {
60
+ let h = Number(hm[1]);
61
+ const min = Number(hm[2]);
62
+ const ap = hm[3].toLowerCase();
63
+ if (ap === "pm" && h < 12) h += 12;
64
+ if (ap === "am" && h === 12) h = 0;
65
+ const y = d0.getFullYear();
66
+ const mo = d0.getMonth() + 1;
67
+ const da = d0.getDate();
68
+ const iso = `${y}-${pad2(mo)}-${pad2(da)}T${pad2(h)}:${pad2(min)}:00${offset}`;
69
+ const pubDate = new Date(iso);
70
+ if (!Number.isNaN(pubDate.getTime())) return { author, pubDate };
71
+ }
72
+ }
73
+
74
+ const authorMatch = t.match(/^By\s+(.+?)\s*·/i);
75
+ const author = authorMatch ? authorMatch[1].trim() : undefined;
76
+ const dateStr = t.replace(/^By\s+.*?\s*·\s*/, "").trim();
77
+ const pubDate = new Date(dateStr);
78
+ return { author, pubDate: Number.isNaN(pubDate.getTime()) ? new Date() : pubDate };
79
+ }
80
+
81
+
82
+ function parseFeedItems(html, pageUrl) {
83
+ const root = _deps.parseHtml(html);
84
+ const items = [];
85
+ const seen = new Set();
86
+
87
+ for (const node of root.querySelectorAll(".article.feed-item")) {
88
+ const linkEl = node.querySelector("h3.title a[href]");
89
+ if (!linkEl) continue;
90
+
91
+ const title = normalizeText(linkEl.textContent);
92
+ const link = toAbsoluteHttpUrl(linkEl.getAttribute("href"), pageUrl);
93
+ if (!title || !link || seen.has(link)) continue;
94
+ seen.add(link);
95
+
96
+ const authorsText = normalizeText(node.querySelector(".authors")?.textContent ?? "");
97
+ const { author, pubDate } = parseAuthorsDate(authorsText);
98
+
99
+ const summary = normalizeText(
100
+ node.querySelector(".recent-excerpt .long-excerpt")?.textContent ??
101
+ node.querySelector(".recent-excerpt")?.textContent ??
102
+ node.querySelector(".short-excerpt")?.textContent ??
103
+ ""
104
+ ) || undefined;
105
+
106
+ const categoryEl = node.querySelector(".category-content a");
107
+ const category = categoryEl ? normalizeText(categoryEl.textContent) : undefined;
108
+
109
+ items.push({
110
+ guid: hashGuid(link),
111
+ title,
112
+ link,
113
+ pubDate,
114
+ author,
115
+ summary,
116
+ categories: category ? [category] : undefined,
117
+ });
118
+ }
119
+
120
+ return items;
121
+ }
122
+
123
+
124
+ export async function fetchItems(sourceId, ctx) {
125
+ _deps = ctx.deps;
126
+ const { html, finalUrl, status } = await ctx.fetchHtml(sourceId, {
127
+ waitMs: 5000,
128
+ waitForSelector: ".article.feed-item",
129
+ waitForSelectorTimeoutMs: 25_000,
130
+ });
131
+
132
+ const pageUrl = finalUrl || sourceId || ORIGIN;
133
+ const items = parseFeedItems(html, pageUrl);
134
+
135
+ if (items.length === 0) {
136
+ const hint = status && status >= 400 ? ` HTTP ${status}` : "";
137
+ throw new Error(
138
+ `[theinformation] 未解析到条目,页面结构可能已变化或需登录后抓取。${hint}`
139
+ );
140
+ }
141
+
142
+ items.sort((a, b) => b.pubDate.getTime() - a.pubDate.getTime());
143
+ return items;
144
+ }
@@ -1,3 +1,7 @@
1
+ export const id = "uci-ml-repository";
2
+ export const name = "UCI Ml Repository";
3
+ export const listUrlPattern = /^https:\/\/archive\.ics\.uci\.edu(?:\/(?:datasets\/?)?)?(?:\?.*)?$/i;
4
+
1
5
  let _deps;
2
6
 
3
7
 
@@ -16,7 +20,7 @@ function resolveDatasetLink(rawHref, baseUrl) {
16
20
  if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
17
21
  try {
18
22
  const url = new URL(href, baseUrl);
19
- if (!/^https?:$/i.test(url.protocol)) return null;
23
+ if (!/^https:$/i.test(url.protocol)) return null;
20
24
  if (url.hostname !== "archive.ics.uci.edu") return null;
21
25
  if (!/^\/dataset\/\d+\/[^/?#]+$/i.test(url.pathname)) return null;
22
26
  url.search = "";
@@ -89,7 +93,7 @@ function parseFromGenericAnchors(root, baseUrl) {
89
93
  return items;
90
94
  }
91
95
 
92
- async function fetchItems(sourceId, ctx) {
96
+ export async function fetchItems(sourceId, ctx) {
93
97
  _deps = ctx.deps;
94
98
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 4000 });
95
99
  const baseUrl = finalUrl || sourceId || UCI_ORIGIN;
@@ -104,8 +108,3 @@ async function fetchItems(sourceId, ctx) {
104
108
  throw new Error("[uci-ml-repository] 未解析到数据集条目,页面结构可能已变化");
105
109
  }
106
110
 
107
- export default {
108
- id: "uci-ml-repository",
109
- listUrlPattern: /^https?:\/\/archive\.ics\.uci\.edu(?:\/(?:datasets\/?)?)?(?:\?.*)?$/i,
110
- fetchItems,
111
- };
@@ -1,3 +1,8 @@
1
+ export const id = "venturebeat";
2
+ export const name = "Venturebeat";
3
+ export const listUrlPattern = /^https:\/\/(www\.)?venturebeat\.com\/?(\?.*)?$/i;
4
+ export const refreshInterval = "1h";
5
+
1
6
  let _deps;
2
7
 
3
8
  // VentureBeat 插件:通过官方 RSS Feed 拉取列表,规避首页安全检查页
@@ -35,7 +40,7 @@ function toFeedUrl(sourceId) {
35
40
 
36
41
  function mapFeedItem(item) {
37
42
  const link = normalizeText(item.link ?? "");
38
- if (!/^https?:\/\//i.test(link)) return null;
43
+ if (!/^https:\/\//i.test(link)) return null;
39
44
 
40
45
  const title = normalizeText(item.title ?? "");
41
46
  const pubDate = toValidDate(item.isoDate ?? item.pubDate);
@@ -53,7 +58,7 @@ function mapFeedItem(item) {
53
58
  }
54
59
 
55
60
 
56
- async function fetchItems(sourceId, _ctx) {
61
+ export async function fetchItems(sourceId, _ctx) {
57
62
  _deps = _ctx.deps;
58
63
  const parser = new _deps.RssParser({
59
64
  timeout: 15_000,
@@ -88,10 +93,3 @@ async function fetchItems(sourceId, _ctx) {
88
93
  return items;
89
94
  }
90
95
 
91
-
92
- export default {
93
- id: "venturebeat",
94
- listUrlPattern: /^https?:\/\/(www\.)?venturebeat\.com\/?(\?.*)?$/i,
95
- refreshInterval: "1h",
96
- fetchItems,
97
- };
@@ -1,3 +1,7 @@
1
+ export const id = "worldlabs";
2
+ export const name = "Worldlabs";
3
+ export const listUrlPattern = /^https:\/\/(www\.)?worldlabs\.ai\/blog(\?.*)?$/i;
4
+
1
5
  let _deps;
2
6
 
3
7
  // World Labs 博客插件:抓取 Research & Insights 列表页,输出 FeedItem(不含 enrich)
@@ -39,7 +43,7 @@ function toAbsoluteHttpUrl(rawHref, baseUrl) {
39
43
  if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
40
44
  try {
41
45
  const url = new URL(href, baseUrl);
42
- if (!/^https?:$/i.test(url.protocol)) return null;
46
+ if (!/^https:$/i.test(url.protocol)) return null;
43
47
  return url.href;
44
48
  } catch {
45
49
  return null;
@@ -99,7 +103,7 @@ function parseCard(anchor, finalUrl) {
99
103
  }
100
104
 
101
105
 
102
- async function fetchItems(sourceId, ctx) {
106
+ export async function fetchItems(sourceId, ctx) {
103
107
  _deps = ctx.deps;
104
108
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
105
109
  const root = _deps.parseHtml(html);
@@ -121,9 +125,3 @@ async function fetchItems(sourceId, ctx) {
121
125
  return items;
122
126
  }
123
127
 
124
-
125
- export default {
126
- id: "worldlabs",
127
- listUrlPattern: /^https?:\/\/(www\.)?worldlabs\.ai\/blog(\?.*)?$/i,
128
- fetchItems,
129
- };
@@ -1,3 +1,7 @@
1
+ export const id = "x";
2
+ export const name = "X";
3
+ export const listUrlPattern = /^https:\/\/(www\.)?x\.com\/[^/?#]+\/?(?:[?#].*)?$/i;
4
+
1
5
  let _deps;
2
6
 
3
7
  // X (Twitter) 站点插件:用户主页列表抓取与解析
@@ -161,13 +165,13 @@ function extractMediaUrl(article) {
161
165
  const video = article.querySelector("video[poster]");
162
166
  if (video) {
163
167
  const poster = video.getAttribute("poster");
164
- if (poster && /^https?:\/\//i.test(poster)) return poster;
168
+ if (poster && /^https:\/\//i.test(poster)) return poster;
165
169
  }
166
170
  for (const img of article.querySelectorAll(
167
171
  '[data-testid="card.wrapper"] img[src*="twimg.com/card_img"], [data-testid="card.wrapper"] img[src*="pbs.twimg.com/card_img"]',
168
172
  )) {
169
173
  const src = img.getAttribute("src");
170
- if (src && /^https?:\/\//i.test(src) && !/profile_images/i.test(src)) {
174
+ if (src && /^https:\/\//i.test(src) && !/profile_images/i.test(src)) {
171
175
  return normalizeCardImgUrl(src);
172
176
  }
173
177
  }
@@ -288,7 +292,7 @@ function entriesToFeedItems(entries) {
288
292
  }
289
293
 
290
294
 
291
- async function fetchItems(sourceId, ctx) {
295
+ export async function fetchItems(sourceId, ctx) {
292
296
  _deps = ctx.deps;
293
297
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 6000 });
294
298
  const root = _deps.parseHtml(html);
@@ -320,9 +324,3 @@ async function fetchItems(sourceId, ctx) {
320
324
  throw new Error(`[X] ${message}`);
321
325
  }
322
326
 
323
-
324
- export default {
325
- id: "x",
326
- listUrlPattern: "https://x.com/{username}",
327
- fetchItems,
328
- };