rssany 0.1.6 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -27
- package/app/plugins/builtin/agi-eval-evaluation.rssany.js +7 -8
- package/app/plugins/builtin/amii-research-talent.rssany.js +6 -7
- package/app/plugins/builtin/anthropic-research.rssany.js +6 -8
- package/app/plugins/builtin/appen-resources.rssany.js +6 -7
- package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +9 -10
- package/app/plugins/builtin/baaidata-csdn.rssany.js +6 -7
- package/app/plugins/builtin/baidu-research.rssany.js +5 -8
- package/app/plugins/builtin/brightdata-blog.rssany.js +7 -12
- package/app/plugins/builtin/bytedance-seed-research.rssany.js +5 -7
- package/app/plugins/builtin/email.rssany.js +9 -9
- package/app/plugins/builtin/five-radar.rssany.js +10 -12
- package/app/plugins/builtin/flageval-news.rssany.js +5 -7
- package/app/plugins/builtin/google-deepmind-research.rssany.js +7 -9
- package/app/plugins/builtin/google-research-datasets.rssany.js +6 -8
- package/app/plugins/builtin/google-research.rssany.js +6 -8
- package/app/plugins/builtin/hacker-news-newest.rssany.js +7 -9
- package/app/plugins/builtin/harvard-dataverse.rssany.js +6 -8
- package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +7 -9
- package/app/plugins/builtin/lingowhale.rssany.js +7 -9
- package/app/plugins/builtin/meituan-tech.rssany.js +7 -10
- package/app/plugins/builtin/meta-ai-publications.rssany.js +6 -11
- package/app/plugins/builtin/mila-quebec.rssany.js +6 -8
- package/app/plugins/builtin/mit-csail-research.rssany.js +7 -9
- package/app/plugins/builtin/moonshot.rssany.js +6 -8
- package/app/plugins/builtin/opendatalab-news.rssany.js +6 -7
- package/app/plugins/builtin/opendatalab.rssany.js +5 -6
- package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +6 -7
- package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +7 -8
- package/app/plugins/builtin/opendrivelab-publications.rssany.js +7 -9
- package/app/plugins/builtin/opendrivelab.rssany.js +7 -8
- package/app/plugins/builtin/paperswithcode.rssany.js +6 -8
- package/app/plugins/builtin/pjlab-adg-publications.rssany.js +8 -10
- package/app/plugins/builtin/rss.rssany.js +11 -12
- package/app/plugins/builtin/selectdataset.rssany.js +6 -8
- package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +7 -8
- package/app/plugins/builtin/supervisely-blog.rssany.js +6 -8
- package/app/plugins/builtin/theinformation-briefings.rssany.js +144 -136
- package/app/plugins/builtin/uci-ml-repository.rssany.js +6 -7
- package/app/plugins/builtin/venturebeat.rssany.js +7 -9
- package/app/plugins/builtin/worldlabs.rssany.js +6 -8
- package/app/plugins/builtin/x.rssany.js +7 -9
- package/app/plugins/builtin/xiaohongshu.rssany.js +119 -56
- package/app/plugins/builtin/zhipu-research.rssany.js +7 -10
- package/app/plugins/site.rssany.js +25 -25
- package/{statics → app/statics}/README.md +7 -7
- package/bin/rssany.js +226 -6
- package/dist/index.js +545 -396
- package/dist/index.js.map +1 -1
- package/package.json +20 -13
- package/scripts/dev.mjs +114 -0
- package/scripts/reset.mjs +1 -1
- package/app/plugins/builtin/google.rssany.js +0 -187
- package/init/config.json +0 -17
- package/init/sources.json +0 -353
- package/statics/401.html +0 -56
- package/statics/404.html +0 -12
- package/statics/image.png +0 -0
- package/webui/build/200.html +0 -49
- package/webui/build/_app/env.js +0 -1
- package/webui/build/_app/immutable/assets/0.BB88QFoe.css +0 -1
- package/webui/build/_app/immutable/assets/10.Dj8_pmut.css +0 -1
- package/webui/build/_app/immutable/assets/11.qYZMiTb0.css +0 -1
- package/webui/build/_app/immutable/assets/12.Ct59LCqW.css +0 -1
- package/webui/build/_app/immutable/assets/13.BhO9zvFi.css +0 -1
- package/webui/build/_app/immutable/assets/14.CujIhjQK.css +0 -1
- package/webui/build/_app/immutable/assets/15.nNGjXhCQ.css +0 -1
- package/webui/build/_app/immutable/assets/16.PP9XLDf7.css +0 -1
- package/webui/build/_app/immutable/assets/4.9wPHhVwv.css +0 -1
- package/webui/build/_app/immutable/assets/5.ClehBQ0g.css +0 -1
- package/webui/build/_app/immutable/assets/6.DSJfjJwx.css +0 -1
- package/webui/build/_app/immutable/assets/7.CrNxmd8B.css +0 -1
- package/webui/build/_app/immutable/assets/8.Ba5_jYIY.css +0 -1
- package/webui/build/_app/immutable/assets/9.m-LCx_kl.css +0 -1
- package/webui/build/_app/immutable/assets/BackToParentRoute.DGk-X5ow.css +0 -1
- package/webui/build/_app/immutable/assets/SourcesList.yTBBi3_m.css +0 -1
- package/webui/build/_app/immutable/assets/homeFeedPanelStore.CSvlNcpm.css +0 -1
- package/webui/build/_app/immutable/chunks/B-OsL1Ct.js +0 -1
- package/webui/build/_app/immutable/chunks/B2Q1a1-H.js +0 -2
- package/webui/build/_app/immutable/chunks/BK3WtZwv.js +0 -1
- package/webui/build/_app/immutable/chunks/BQqoDzLx.js +0 -1
- package/webui/build/_app/immutable/chunks/BUApaBEI.js +0 -1
- package/webui/build/_app/immutable/chunks/BbWUOQ_m.js +0 -1
- package/webui/build/_app/immutable/chunks/Bfc47y5P.js +0 -1
- package/webui/build/_app/immutable/chunks/Bp63qm3L.js +0 -1
- package/webui/build/_app/immutable/chunks/BwlaCkNX.js +0 -36
- package/webui/build/_app/immutable/chunks/C0J2-L94.js +0 -1
- package/webui/build/_app/immutable/chunks/CBY2biv-.js +0 -1
- package/webui/build/_app/immutable/chunks/CLOXMsDk.js +0 -36
- package/webui/build/_app/immutable/chunks/CVzlFH44.js +0 -1
- package/webui/build/_app/immutable/chunks/CWNeClHp.js +0 -6
- package/webui/build/_app/immutable/chunks/Cihqbfi5.js +0 -1
- package/webui/build/_app/immutable/chunks/D5GvRCv7.js +0 -1
- package/webui/build/_app/immutable/chunks/DEDI7Ecm.js +0 -1
- package/webui/build/_app/immutable/chunks/DFuhmi31.js +0 -1
- package/webui/build/_app/immutable/chunks/DMWEh-Ek.js +0 -2
- package/webui/build/_app/immutable/chunks/DgceFEv5.js +0 -1
- package/webui/build/_app/immutable/chunks/DjNLq3TF.js +0 -1
- package/webui/build/_app/immutable/chunks/Dt2CddFe.js +0 -1
- package/webui/build/_app/immutable/chunks/Dw782Tjs.js +0 -1
- package/webui/build/_app/immutable/chunks/SqCUd34O.js +0 -1
- package/webui/build/_app/immutable/chunks/Xy_fhzQq.js +0 -1
- package/webui/build/_app/immutable/chunks/hp4PFHFv.js +0 -1
- package/webui/build/_app/immutable/chunks/lk5LaiqA.js +0 -1
- package/webui/build/_app/immutable/chunks/mW5RwvnK.js +0 -13
- package/webui/build/_app/immutable/chunks/tB7QMF3U.js +0 -1
- package/webui/build/_app/immutable/chunks/xtNWTdbD.js +0 -1
- package/webui/build/_app/immutable/entry/app.B8zBPipq.js +0 -2
- package/webui/build/_app/immutable/entry/start.CxRCKeCl.js +0 -1
- package/webui/build/_app/immutable/nodes/0.ChLNE3xy.js +0 -11
- package/webui/build/_app/immutable/nodes/1.1N74-4Io.js +0 -1
- package/webui/build/_app/immutable/nodes/10.DY30t9Ib.js +0 -1
- package/webui/build/_app/immutable/nodes/11.ITuxnukH.js +0 -1
- package/webui/build/_app/immutable/nodes/12.qLzWqB1c.js +0 -1
- package/webui/build/_app/immutable/nodes/13.nT3SOzEB.js +0 -1
- package/webui/build/_app/immutable/nodes/14.BHnIxbVM.js +0 -1
- package/webui/build/_app/immutable/nodes/15.CLjT9il3.js +0 -1
- package/webui/build/_app/immutable/nodes/16.BD-mKCLN.js +0 -24
- package/webui/build/_app/immutable/nodes/17.BtYZF6FM.js +0 -1
- package/webui/build/_app/immutable/nodes/18.Ba_qJjp6.js +0 -1
- package/webui/build/_app/immutable/nodes/2.BYWOpaxy.js +0 -1
- package/webui/build/_app/immutable/nodes/3.Dt5o2Fmz.js +0 -1
- package/webui/build/_app/immutable/nodes/4.DTSxpKm7.js +0 -2
- package/webui/build/_app/immutable/nodes/5.Dy3vSsIP.js +0 -1
- package/webui/build/_app/immutable/nodes/6.DvclsL6H.js +0 -1
- package/webui/build/_app/immutable/nodes/7.D2nJy-Uz.js +0 -1
- package/webui/build/_app/immutable/nodes/8.C75mhrqs.js +0 -1
- package/webui/build/_app/immutable/nodes/9.Bp_QXw3w.js +0 -1
- package/webui/build/_app/version.json +0 -1
|
@@ -1,6 +1,12 @@
|
|
|
1
1
|
// 内置 RSS/Atom/JSON Feed:通过浏览器(Puppeteer)拉取 Feed URL,再用 rss-parser 解析;
|
|
2
2
|
// 与站点插件一致走 Chrome,便于应对需浏览器环境或代理的场景;XML 使用 HTTP 响应原文(useHttpResponseBody)。
|
|
3
|
-
|
|
3
|
+
export const id = "__rss__";
|
|
4
|
+
export const name = "RSS Feed";
|
|
5
|
+
export const pattern = /^https:\/\//;
|
|
6
|
+
export const match = looksLikeFeed;
|
|
7
|
+
export const priority = 20;
|
|
8
|
+
export const refreshInterval = "1h";
|
|
9
|
+
|
|
4
10
|
const UA = "RssAny/1.0 (+https://github.com/joohw/rssany)";
|
|
5
11
|
|
|
6
12
|
const IMAGE_TYPE_RE = /^image\//i;
|
|
@@ -76,7 +82,7 @@ function extractItemImageUrl(item) {
|
|
|
76
82
|
firstImgSrcFromHtml(item.summary) ||
|
|
77
83
|
firstImgSrcFromHtml(item["content:encoded"]) ||
|
|
78
84
|
firstImgSrcFromHtml(item.contentSnippet);
|
|
79
|
-
if (fromHtml && /^https
|
|
85
|
+
if (fromHtml && /^https:\/\//i.test(fromHtml)) {
|
|
80
86
|
return fromHtml;
|
|
81
87
|
}
|
|
82
88
|
|
|
@@ -102,13 +108,7 @@ async function fetchFeedXml(url, ctx) {
|
|
|
102
108
|
return html;
|
|
103
109
|
}
|
|
104
110
|
|
|
105
|
-
export
|
|
106
|
-
id: "__rss__",
|
|
107
|
-
pattern: /^https?:\/\//,
|
|
108
|
-
match: looksLikeFeed,
|
|
109
|
-
priority: 20,
|
|
110
|
-
refreshInterval: "1h",
|
|
111
|
-
async fetchItems(sourceId, ctx) {
|
|
111
|
+
export async function fetchItems(sourceId, ctx) {
|
|
112
112
|
const { deps } = ctx;
|
|
113
113
|
const xml = await fetchFeedXml(sourceId, ctx);
|
|
114
114
|
const parser = new deps.RssParser({
|
|
@@ -155,9 +155,8 @@ export default {
|
|
|
155
155
|
if (!imageUrl) return base;
|
|
156
156
|
return { ...base, imageUrl, cover_img: imageUrl };
|
|
157
157
|
});
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
|
|
158
|
+
}
|
|
159
|
+
|
|
161
160
|
function looksLikeFeed(url) {
|
|
162
161
|
const lower = url.toLowerCase();
|
|
163
162
|
return (
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "selectdataset";
|
|
2
|
+
export const name = "Selectdataset";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?selectdataset\.com\/(?:$|\?.*|search(?:\?.*)?|subject(?:\?.*)?)$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
// SelectDataset 插件:解析首页/搜索页 Nuxt payload,输出数据集条目(不含 enrich)
|
|
@@ -66,7 +70,7 @@ function parseFromAnchorDom(html, finalUrl) {
|
|
|
66
70
|
let link = null;
|
|
67
71
|
try {
|
|
68
72
|
const url = new URL(href, baseUrl);
|
|
69
|
-
if (!/^https
|
|
73
|
+
if (!/^https:$/i.test(url.protocol)) continue;
|
|
70
74
|
if (!/\/dataset\/[A-Za-z0-9]{16,}/.test(url.pathname)) continue;
|
|
71
75
|
link = url.href;
|
|
72
76
|
} catch {
|
|
@@ -184,7 +188,7 @@ function parseFromNuxtPayload(html) {
|
|
|
184
188
|
}
|
|
185
189
|
|
|
186
190
|
|
|
187
|
-
async function fetchItems(sourceId, ctx) {
|
|
191
|
+
export async function fetchItems(sourceId, ctx) {
|
|
188
192
|
_deps = ctx.deps;
|
|
189
193
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
|
|
190
194
|
const fromAnchorDom = parseFromAnchorDom(html, finalUrl);
|
|
@@ -198,9 +202,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
198
202
|
throw new Error("[selectdataset] 未解析到数据集条目,页面结构可能已变化");
|
|
199
203
|
}
|
|
200
204
|
|
|
201
|
-
|
|
202
|
-
export default {
|
|
203
|
-
id: "selectdataset",
|
|
204
|
-
listUrlPattern: /^https?:\/\/(www\.)?selectdataset\.com\/(?:$|\?.*|search(?:\?.*)?|subject(?:\?.*)?)$/i,
|
|
205
|
-
fetchItems,
|
|
206
|
-
};
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
export const id = "sensetime-tech-achievements";
|
|
2
|
+
export const name = "Sensetime Tech Achievements";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?sensetime\.com\/cn\/technology-achievements(\?.*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
|
|
4
|
-
const SITE_ID =
|
|
8
|
+
const SITE_ID = id;
|
|
5
9
|
const DATE_RE = /\b(20\d{2})-(\d{1,2})-(\d{1,2})\b/;
|
|
6
10
|
|
|
7
11
|
function normalizeText(text) {
|
|
@@ -18,7 +22,7 @@ function toAbsoluteUrl(rawHref, baseUrl) {
|
|
|
18
22
|
if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
|
|
19
23
|
try {
|
|
20
24
|
const url = new URL(href, baseUrl);
|
|
21
|
-
if (!/^https
|
|
25
|
+
if (!/^https:$/i.test(url.protocol)) return null;
|
|
22
26
|
return url.href;
|
|
23
27
|
} catch {
|
|
24
28
|
return null;
|
|
@@ -135,7 +139,7 @@ async function fetchItemsFromApi(finalUrl) {
|
|
|
135
139
|
return items;
|
|
136
140
|
}
|
|
137
141
|
|
|
138
|
-
async function fetchItems(sourceId, ctx) {
|
|
142
|
+
export async function fetchItems(sourceId, ctx) {
|
|
139
143
|
_deps = ctx.deps;
|
|
140
144
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
|
|
141
145
|
const items = parseItemsFromHtml(html, finalUrl);
|
|
@@ -147,8 +151,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
147
151
|
throw new Error(`[${SITE_ID}] 未解析到学术成果条目,页面结构或接口可能已变化`);
|
|
148
152
|
}
|
|
149
153
|
|
|
150
|
-
export default {
|
|
151
|
-
id: SITE_ID,
|
|
152
|
-
listUrlPattern: /^https?:\/\/(www\.)?sensetime\.com\/cn\/technology-achievements(\?.*)?$/i,
|
|
153
|
-
fetchItems,
|
|
154
|
-
};
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "supervisely-blog";
|
|
2
|
+
export const name = "Supervisely Blog";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?supervisely\.com\/blog\/?(?:\?.*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
// Supervisely Blog 插件:抓取列表页并解析为 FeedItem(不做正文 enrich)
|
|
@@ -35,7 +39,7 @@ function toAbsoluteUrl(href, baseUrl) {
|
|
|
35
39
|
if (!href) return null;
|
|
36
40
|
try {
|
|
37
41
|
const url = new URL(href, baseUrl);
|
|
38
|
-
if (!/^https
|
|
42
|
+
if (!/^https:$/i.test(url.protocol)) return null;
|
|
39
43
|
return url.href;
|
|
40
44
|
} catch {
|
|
41
45
|
return null;
|
|
@@ -136,7 +140,7 @@ function parseFromHeadingFallback(root, baseUrl) {
|
|
|
136
140
|
}
|
|
137
141
|
|
|
138
142
|
|
|
139
|
-
async function fetchItems(sourceId, ctx) {
|
|
143
|
+
export async function fetchItems(sourceId, ctx) {
|
|
140
144
|
_deps = ctx.deps;
|
|
141
145
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
|
|
142
146
|
const root = _deps.parseHtml(html);
|
|
@@ -151,9 +155,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
151
155
|
return items;
|
|
152
156
|
}
|
|
153
157
|
|
|
154
|
-
|
|
155
|
-
export default {
|
|
156
|
-
id: "supervisely-blog",
|
|
157
|
-
listUrlPattern: /^https?:\/\/(www\.)?supervisely\.com\/blog\/?(?:\?.*)?$/i,
|
|
158
|
-
fetchItems,
|
|
159
|
-
};
|
|
@@ -1,136 +1,144 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
if (!
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
return
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
const
|
|
52
|
-
const
|
|
53
|
-
|
|
54
|
-
const
|
|
55
|
-
const
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
const
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
const
|
|
66
|
-
const
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
const
|
|
92
|
-
const
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
};
|
|
1
|
+
export const id = "theinformation";
|
|
2
|
+
export const name = "Theinformation";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?theinformation\.com\/(briefings|features\/[^/]+)\/?(\?.*)?$/i;
|
|
4
|
+
export const refreshInterval = "1h";
|
|
5
|
+
|
|
6
|
+
let _deps;
|
|
7
|
+
|
|
8
|
+
// The Information — AI Agenda 和 Briefings 列表页
|
|
9
|
+
// 当前结构:.article.feed-item,标题 h3.title a,分类 .category-content a,作者 .authors,摘要 .recent-excerpt .long-excerpt
|
|
10
|
+
|
|
11
|
+
const ORIGIN = "https://www.theinformation.com";
|
|
12
|
+
|
|
13
|
+
/**
 * Collapse every run of whitespace into a single space and trim both ends.
 *
 * @param {string | null | undefined} text - Text to clean; nullish values are treated as "".
 * @returns {string} The cleaned, single-spaced text.
 */
function normalizeText(text) {
  const source = text ?? "";
  const singleSpaced = source.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
/**
 * Derive a stable GUID for a feed item as the hex-encoded SHA-256 of `input`.
 * Uses the `createHash` factory exposed on the module-level `_deps` object,
 * which `fetchItems` assigns from `ctx.deps` before any parsing happens.
 *
 * @param {string} input - Value to hash (callers in this file pass the item link).
 * @returns {string} Hex digest of the SHA-256 hash.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
/**
 * Resolve an anchor's `href` against `baseUrl` and return the absolute URL,
 * accepting only `https:` results.
 *
 * @param {string | null | undefined} rawHref - The href attribute value as found in the page.
 * @param {string} baseUrl - URL used to resolve relative hrefs.
 * @returns {string | null} The absolute https URL, or null when the href is empty,
 *   a fragment, a `javascript:` link, unparsable, or not https.
 */
function toAbsoluteHttpUrl(rawHref, baseUrl) {
  const trimmed = rawHref ? rawHref.trim() : "";
  const isUnusable =
    trimmed === "" || trimmed.startsWith("#") || trimmed.startsWith("javascript:");
  if (isUnusable) return null;

  let resolved;
  try {
    resolved = new URL(trimmed, baseUrl);
  } catch {
    // Unparsable href (or unusable base): treat as "no link".
    return null;
  }
  return /^https:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
/**
 * Render a value as a string left-padded with "0" to at least two characters.
 *
 * @param {number | string} n - Value to format (e.g. a month, day, hour or minute).
 * @returns {string} The zero-padded string; longer inputs are returned unchanged.
 */
function pad2(n) {
  const digits = String(n);
  return digits.padStart(2, "0");
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
/**
 * Parse the text of an `.authors` node, e.g.
 * `By Author · Apr 14, 2026 · 7:52am PDT` (optionally followed by `· N comments`).
 *
 * Primary path: when the text has the full "By <author> · <date ending in a year> ·
 * <h:mm am|pm> <PDT|PST|PT>" shape, the date and time are recombined into an ISO
 * string with an explicit UTC offset so the result does not depend on the host
 * time zone.
 *
 * Fallback path: extract the author (if any), strip the "By <author> ·" prefix and
 * hand the remainder to `new Date(...)`.
 *
 * @param {string} raw - Raw text content of the `.authors` element.
 * @returns {{ author: (string | undefined), pubDate: Date }} The author (undefined
 *   when no "By … ·" prefix is present) and the publication date; the current time
 *   is used when no date can be parsed.
 */
function parseAuthorsDate(raw) {
  let t = normalizeText(raw);
  // Drop a trailing "· 3 comments" suffix so it cannot interfere with the date match.
  t = t.replace(/\s*·\s*\d+\s+comments?\s*$/i, "").trim();

  const m = t.match(
    /^By\s+(.+?)\s*·\s*(.+?\d{4})\s*·\s*(\d{1,2}:\d{2}\s*(?:am|pm))\s*(PDT|PST|PT)\s*$/i
  );
  if (m) {
    const author = m[1].trim();
    const datePart = m[2].trim();
    const timePart = m[3].trim();
    const tz = m[4].toUpperCase();
    // PDT is UTC-7; PST and the generic "PT" label are both mapped to UTC-8.
    // NOTE(review): a bare "PT" during daylight-saving time would be off by one hour — confirm
    // whether the site ever emits "PT".
    const offset = tz === "PDT" ? "-07:00" : "-08:00";

    const hm = timePart.match(/(\d{1,2}):(\d{2})\s*(am|pm)/i);
    // The date part (e.g. "Apr 14, 2026") is parsed in local time and read back with
    // local getters, so the calendar day survives regardless of the host time zone.
    const d0 = new Date(datePart);
    if (hm && !Number.isNaN(d0.getTime())) {
      let h = Number(hm[1]);
      const min = Number(hm[2]);
      const ap = hm[3].toLowerCase();
      // Convert the 12-hour clock to 24-hour.
      if (ap === "pm" && h < 12) h += 12;
      if (ap === "am" && h === 12) h = 0;
      const y = d0.getFullYear();
      const mo = d0.getMonth() + 1;
      const da = d0.getDate();
      const iso = `${y}-${pad2(mo)}-${pad2(da)}T${pad2(h)}:${pad2(min)}:00${offset}`;
      const pubDate = new Date(iso);
      // An out-of-range time (e.g. "25:99am") yields an invalid date; fall through instead.
      if (!Number.isNaN(pubDate.getTime())) return { author, pubDate };
    }
  }

  const authorMatch = t.match(/^By\s+(.+?)\s*·/i);
  const author = authorMatch ? authorMatch[1].trim() : undefined;
  // Strip the byline case-insensitively, matching the author regex above; otherwise a
  // lower-case "by …" prefix would be left in the string handed to the Date parser.
  const dateStr = t.replace(/^By\s+.*?\s*·\s*/i, "").trim();
  const pubDate = new Date(dateStr);
  return { author, pubDate: Number.isNaN(pubDate.getTime()) ? new Date() : pubDate };
}
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
/**
 * Extract feed items from a list page's HTML.
 *
 * Each `.article.feed-item` node contributes one item, built from its
 * `h3.title a[href]` link, `.authors` byline, excerpt and `.category-content a`
 * label. Nodes without a title link, without a usable title or https link, or
 * whose link was already emitted are skipped.
 *
 * @param {string} html - Page HTML to parse (via `_deps.parseHtml`).
 * @param {string} pageUrl - URL used to resolve relative links.
 * @returns {Array<object>} Items with guid, title, link, pubDate, author, summary
 *   and categories, in document order.
 */
function parseFeedItems(html, pageUrl) {
  const document = _deps.parseHtml(html);
  const emittedLinks = new Set();
  const feedItems = [];

  for (const card of document.querySelectorAll(".article.feed-item")) {
    const anchor = card.querySelector("h3.title a[href]");
    if (!anchor) continue;

    const title = normalizeText(anchor.textContent);
    const link = toAbsoluteHttpUrl(anchor.getAttribute("href"), pageUrl);
    if (!title) continue;
    if (!link) continue;
    if (emittedLinks.has(link)) continue;
    emittedLinks.add(link);

    const byline = normalizeText(card.querySelector(".authors")?.textContent ?? "");
    const { author, pubDate } = parseAuthorsDate(byline);

    // Prefer the long excerpt, then the whole recent-excerpt block, then the short one.
    const excerptText =
      card.querySelector(".recent-excerpt .long-excerpt")?.textContent ??
      card.querySelector(".recent-excerpt")?.textContent ??
      card.querySelector(".short-excerpt")?.textContent ??
      "";
    const cleanedExcerpt = normalizeText(excerptText);
    const summary = cleanedExcerpt === "" ? undefined : cleanedExcerpt;

    const categoryAnchor = card.querySelector(".category-content a");
    let category;
    if (categoryAnchor) {
      category = normalizeText(categoryAnchor.textContent);
    }

    feedItems.push({
      guid: hashGuid(link),
      title,
      link,
      pubDate,
      author,
      summary,
      categories: category ? [category] : undefined,
    });
  }

  return feedItems;
}
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
/**
 * Plugin entry point: fetch the list page for `sourceId` and return its items,
 * newest first.
 *
 * Stores `ctx.deps` in the module-level `_deps` so the parsing helpers can use it,
 * loads the page through `ctx.fetchHtml` (waiting for `.article.feed-item`), and
 * parses the returned HTML.
 *
 * @param {string} sourceId - URL of the list page to fetch.
 * @param {object} ctx - Plugin context providing `deps` and `fetchHtml`.
 * @returns {Promise<Array<object>>} Parsed items sorted by `pubDate`, descending.
 * @throws {Error} When no items could be parsed; the message includes the HTTP
 *   status when it is 400 or above.
 */
export async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;

  const fetchOptions = {
    waitMs: 5000,
    waitForSelector: ".article.feed-item",
    waitForSelectorTimeoutMs: 25_000,
  };
  const { html, finalUrl, status } = await ctx.fetchHtml(sourceId, fetchOptions);

  // Resolve relative links against the post-redirect URL when it is known.
  const feedItems = parseFeedItems(html, finalUrl || sourceId || ORIGIN);

  if (feedItems.length > 0) {
    feedItems.sort((left, right) => right.pubDate.getTime() - left.pubDate.getTime());
    return feedItems;
  }

  const hint = status && status >= 400 ? ` HTTP ${status}` : "";
  throw new Error(
    `[theinformation] 未解析到条目,页面结构可能已变化或需登录后抓取。${hint}`
  );
}
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "uci-ml-repository";
|
|
2
|
+
export const name = "UCI Ml Repository";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/archive\.ics\.uci\.edu(?:\/(?:datasets\/?)?)?(?:\?.*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
|
|
@@ -16,7 +20,7 @@ function resolveDatasetLink(rawHref, baseUrl) {
|
|
|
16
20
|
if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
|
|
17
21
|
try {
|
|
18
22
|
const url = new URL(href, baseUrl);
|
|
19
|
-
if (!/^https
|
|
23
|
+
if (!/^https:$/i.test(url.protocol)) return null;
|
|
20
24
|
if (url.hostname !== "archive.ics.uci.edu") return null;
|
|
21
25
|
if (!/^\/dataset\/\d+\/[^/?#]+$/i.test(url.pathname)) return null;
|
|
22
26
|
url.search = "";
|
|
@@ -89,7 +93,7 @@ function parseFromGenericAnchors(root, baseUrl) {
|
|
|
89
93
|
return items;
|
|
90
94
|
}
|
|
91
95
|
|
|
92
|
-
async function fetchItems(sourceId, ctx) {
|
|
96
|
+
export async function fetchItems(sourceId, ctx) {
|
|
93
97
|
_deps = ctx.deps;
|
|
94
98
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 4000 });
|
|
95
99
|
const baseUrl = finalUrl || sourceId || UCI_ORIGIN;
|
|
@@ -104,8 +108,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
104
108
|
throw new Error("[uci-ml-repository] 未解析到数据集条目,页面结构可能已变化");
|
|
105
109
|
}
|
|
106
110
|
|
|
107
|
-
export default {
|
|
108
|
-
id: "uci-ml-repository",
|
|
109
|
-
listUrlPattern: /^https?:\/\/archive\.ics\.uci\.edu(?:\/(?:datasets\/?)?)?(?:\?.*)?$/i,
|
|
110
|
-
fetchItems,
|
|
111
|
-
};
|
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
export const id = "venturebeat";
|
|
2
|
+
export const name = "Venturebeat";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?venturebeat\.com\/?(\?.*)?$/i;
|
|
4
|
+
export const refreshInterval = "1h";
|
|
5
|
+
|
|
1
6
|
let _deps;
|
|
2
7
|
|
|
3
8
|
// VentureBeat 插件:通过官方 RSS Feed 拉取列表,规避首页安全检查页
|
|
@@ -35,7 +40,7 @@ function toFeedUrl(sourceId) {
|
|
|
35
40
|
|
|
36
41
|
function mapFeedItem(item) {
|
|
37
42
|
const link = normalizeText(item.link ?? "");
|
|
38
|
-
if (!/^https
|
|
43
|
+
if (!/^https:\/\//i.test(link)) return null;
|
|
39
44
|
|
|
40
45
|
const title = normalizeText(item.title ?? "");
|
|
41
46
|
const pubDate = toValidDate(item.isoDate ?? item.pubDate);
|
|
@@ -53,7 +58,7 @@ function mapFeedItem(item) {
|
|
|
53
58
|
}
|
|
54
59
|
|
|
55
60
|
|
|
56
|
-
async function fetchItems(sourceId, _ctx) {
|
|
61
|
+
export async function fetchItems(sourceId, _ctx) {
|
|
57
62
|
_deps = _ctx.deps;
|
|
58
63
|
const parser = new _deps.RssParser({
|
|
59
64
|
timeout: 15_000,
|
|
@@ -88,10 +93,3 @@ async function fetchItems(sourceId, _ctx) {
|
|
|
88
93
|
return items;
|
|
89
94
|
}
|
|
90
95
|
|
|
91
|
-
|
|
92
|
-
export default {
|
|
93
|
-
id: "venturebeat",
|
|
94
|
-
listUrlPattern: /^https?:\/\/(www\.)?venturebeat\.com\/?(\?.*)?$/i,
|
|
95
|
-
refreshInterval: "1h",
|
|
96
|
-
fetchItems,
|
|
97
|
-
};
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "worldlabs";
|
|
2
|
+
export const name = "Worldlabs";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?worldlabs\.ai\/blog(\?.*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
// World Labs 博客插件:抓取 Research & Insights 列表页,输出 FeedItem(不含 enrich)
|
|
@@ -39,7 +43,7 @@ function toAbsoluteHttpUrl(rawHref, baseUrl) {
|
|
|
39
43
|
if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
|
|
40
44
|
try {
|
|
41
45
|
const url = new URL(href, baseUrl);
|
|
42
|
-
if (!/^https
|
|
46
|
+
if (!/^https:$/i.test(url.protocol)) return null;
|
|
43
47
|
return url.href;
|
|
44
48
|
} catch {
|
|
45
49
|
return null;
|
|
@@ -99,7 +103,7 @@ function parseCard(anchor, finalUrl) {
|
|
|
99
103
|
}
|
|
100
104
|
|
|
101
105
|
|
|
102
|
-
async function fetchItems(sourceId, ctx) {
|
|
106
|
+
export async function fetchItems(sourceId, ctx) {
|
|
103
107
|
_deps = ctx.deps;
|
|
104
108
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
|
|
105
109
|
const root = _deps.parseHtml(html);
|
|
@@ -121,9 +125,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
121
125
|
return items;
|
|
122
126
|
}
|
|
123
127
|
|
|
124
|
-
|
|
125
|
-
export default {
|
|
126
|
-
id: "worldlabs",
|
|
127
|
-
listUrlPattern: /^https?:\/\/(www\.)?worldlabs\.ai\/blog(\?.*)?$/i,
|
|
128
|
-
fetchItems,
|
|
129
|
-
};
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "x";
|
|
2
|
+
export const name = "X";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?x\.com\/[^/?#]+\/?(?:[?#].*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
// X (Twitter) 站点插件:用户主页列表抓取与解析
|
|
@@ -161,13 +165,13 @@ function extractMediaUrl(article) {
|
|
|
161
165
|
const video = article.querySelector("video[poster]");
|
|
162
166
|
if (video) {
|
|
163
167
|
const poster = video.getAttribute("poster");
|
|
164
|
-
if (poster && /^https
|
|
168
|
+
if (poster && /^https:\/\//i.test(poster)) return poster;
|
|
165
169
|
}
|
|
166
170
|
for (const img of article.querySelectorAll(
|
|
167
171
|
'[data-testid="card.wrapper"] img[src*="twimg.com/card_img"], [data-testid="card.wrapper"] img[src*="pbs.twimg.com/card_img"]',
|
|
168
172
|
)) {
|
|
169
173
|
const src = img.getAttribute("src");
|
|
170
|
-
if (src && /^https
|
|
174
|
+
if (src && /^https:\/\//i.test(src) && !/profile_images/i.test(src)) {
|
|
171
175
|
return normalizeCardImgUrl(src);
|
|
172
176
|
}
|
|
173
177
|
}
|
|
@@ -288,7 +292,7 @@ function entriesToFeedItems(entries) {
|
|
|
288
292
|
}
|
|
289
293
|
|
|
290
294
|
|
|
291
|
-
async function fetchItems(sourceId, ctx) {
|
|
295
|
+
export async function fetchItems(sourceId, ctx) {
|
|
292
296
|
_deps = ctx.deps;
|
|
293
297
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 6000 });
|
|
294
298
|
const root = _deps.parseHtml(html);
|
|
@@ -320,9 +324,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
320
324
|
throw new Error(`[X] ${message}`);
|
|
321
325
|
}
|
|
322
326
|
|
|
323
|
-
|
|
324
|
-
export default {
|
|
325
|
-
id: "x",
|
|
326
|
-
listUrlPattern: "https://x.com/{username}",
|
|
327
|
-
fetchItems,
|
|
328
|
-
};
|