rssany 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/app/plugins/builtin/agi-eval-evaluation.rssany.js +188 -0
- package/app/plugins/builtin/amii-research-talent.rssany.js +73 -0
- package/app/plugins/builtin/anthropic-research.rssany.js +155 -0
- package/app/plugins/builtin/appen-resources.rssany.js +155 -0
- package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +185 -0
- package/app/plugins/builtin/baaidata-csdn.rssany.js +242 -0
- package/app/plugins/builtin/baidu-research.rssany.js +222 -0
- package/app/plugins/builtin/brightdata-blog.rssany.js +301 -0
- package/app/plugins/builtin/bytedance-seed-research.rssany.js +231 -0
- package/app/plugins/builtin/five-radar.rssany.js +490 -0
- package/app/plugins/builtin/flageval-news.rssany.js +118 -0
- package/app/plugins/builtin/google-deepmind-research.rssany.js +223 -0
- package/app/plugins/builtin/google-research-datasets.rssany.js +171 -0
- package/app/plugins/builtin/google-research.rssany.js +220 -0
- package/app/plugins/builtin/google.rssany.js +187 -0
- package/app/plugins/builtin/hacker-news-newest.rssany.js +130 -0
- package/app/plugins/builtin/harvard-dataverse.rssany.js +166 -0
- package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +185 -0
- package/app/plugins/builtin/lingowhale.rssany.js +119 -0
- package/app/plugins/builtin/meituan-tech.rssany.js +130 -0
- package/app/plugins/builtin/meta-ai-publications.rssany.js +221 -0
- package/app/plugins/builtin/mila-quebec.rssany.js +199 -0
- package/app/plugins/builtin/mit-csail-research.rssany.js +208 -0
- package/app/plugins/builtin/moonshot.rssany.js +127 -0
- package/app/plugins/builtin/opendatalab-news.rssany.js +174 -0
- package/app/plugins/builtin/opendatalab.rssany.js +109 -0
- package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-publications.rssany.js +130 -0
- package/app/plugins/builtin/opendrivelab.rssany.js +333 -0
- package/app/plugins/builtin/paperswithcode.rssany.js +227 -0
- package/app/plugins/builtin/pjlab-adg-publications.rssany.js +202 -0
- package/app/plugins/builtin/rss.rssany.js +11 -1
- package/app/plugins/builtin/selectdataset.rssany.js +206 -0
- package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +154 -0
- package/app/plugins/builtin/supervisely-blog.rssany.js +159 -0
- package/app/plugins/builtin/uci-ml-repository.rssany.js +111 -0
- package/app/plugins/builtin/venturebeat.rssany.js +97 -0
- package/app/plugins/builtin/worldlabs.rssany.js +129 -0
- package/app/plugins/builtin/x.rssany.js +159 -0
- package/app/plugins/builtin/xiaohongshu.rssany.js +283 -0
- package/app/plugins/builtin/zhipu-research.rssany.js +334 -0
- package/dist/index.js +62 -4
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/webui/build/200.html +6 -6
- package/webui/build/_app/immutable/assets/{0.DjU2hdCQ.css → 0.BB88QFoe.css} +1 -1
- package/webui/build/_app/immutable/assets/{homeFeedPanelStore.BopJZtHu.css → homeFeedPanelStore.iOmfP2qL.css} +1 -1
- package/webui/build/_app/immutable/chunks/CZD-YNDw.js +31 -0
- package/webui/build/_app/immutable/chunks/{C85CNwD2.js → D6VIKef0.js} +1 -1
- package/webui/build/_app/immutable/chunks/{CllQAdvt.js → Dbqx2mXq.js} +1 -1
- package/webui/build/_app/immutable/chunks/DeX-oq5W.js +41 -0
- package/webui/build/_app/immutable/chunks/{CdMsRjxJ.js → dhB8G5Is.js} +1 -1
- package/webui/build/_app/immutable/entry/{app.BcD2eSsQ.js → app.XPso7q7g.js} +2 -2
- package/webui/build/_app/immutable/entry/start.Db4snNCd.js +1 -0
- package/webui/build/_app/immutable/nodes/0.BKTQePmA.js +11 -0
- package/webui/build/_app/immutable/nodes/{1.DU9aYGAb.js → 1.BS3_Rfxm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{10.Db6vw7Ih.js → 10.CyyxDCIS.js} +1 -1
- package/webui/build/_app/immutable/nodes/{11.BaAcorz3.js → 11.CtYgIaGj.js} +1 -1
- package/webui/build/_app/immutable/nodes/{14.DqT4pcrQ.js → 14.D5OEGPR2.js} +1 -1
- package/webui/build/_app/immutable/nodes/{15.CCLbjxnH.js → 15.B4dFN1Gk.js} +1 -1
- package/webui/build/_app/immutable/nodes/{16.DiigpVdP.js → 16.M7ZII7tl.js} +1 -1
- package/webui/build/_app/immutable/nodes/{3.DEcYOQc-.js → 3.7r8v7qkm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{5.CvM1TkLG.js → 5.CHIzoGrb.js} +1 -1
- package/webui/build/_app/immutable/nodes/{6.Dscr6LkS.js → 6.BDBqx-GY.js} +1 -1
- package/webui/build/_app/immutable/nodes/{7.Bp60MobD.js → 7.D5czsDmz.js} +1 -1
- package/webui/build/_app/immutable/nodes/{8.DwSg0MHh.js → 8.pjVNsCdV.js} +1 -1
- package/webui/build/_app/immutable/nodes/{9.BeYOUjxR.js → 9.CsARv1BH.js} +1 -1
- package/webui/build/_app/version.json +1 -1
- package/webui/build/_app/immutable/chunks/CtijX1u3.js +0 -31
- package/webui/build/_app/immutable/chunks/Dv1VCsiB.js +0 -41
- package/webui/build/_app/immutable/entry/start.CbkdJdz1.js +0 -1
- package/webui/build/_app/immutable/nodes/0.DSUDmOx2.js +0 -11
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
let _deps;
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
const SITE_ID = "sensetime-tech-achievements";
|
|
5
|
+
const DATE_RE = /\b(20\d{2})-(\d{1,2})-(\d{1,2})\b/;
|
|
6
|
+
|
|
7
|
+
/**
 * Collapse every run of whitespace into a single space and trim both ends.
 *
 * @param {unknown} text - Value to normalize; `null`/`undefined` become "".
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  // Callers pass raw JSON fields as well as DOM text, so the value is not
  // guaranteed to be a string; coerce first so `.replace` cannot throw.
  return String(text ?? "").replace(/\s+/g, " ").trim();
}
|
|
10
|
+
|
|
11
|
+
/**
 * Hex-encoded SHA-256 digest of `input`, computed with the `createHash`
 * function found on the module-level `_deps` object.
 *
 * @param {string} input - Text to hash.
 * @returns {string} Hex digest.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
|
|
14
|
+
|
|
15
|
+
/**
 * Resolve an href against `baseUrl` and return it only if it is an http(s) URL.
 *
 * @param {string|null|undefined} rawHref - The href attribute value.
 * @param {string} baseUrl - Base used to resolve relative hrefs.
 * @returns {string|null} Absolute URL, or null for empty, fragment-only,
 *   `javascript:`, unparsable, or non-http(s) hrefs.
 */
function toAbsoluteUrl(rawHref, baseUrl) {
  const href = rawHref ? rawHref.trim() : "";
  const isUnusable = href === "" || href.startsWith("#") || href.startsWith("javascript:");
  if (isUnusable) return null;

  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    // An unparsable href/base pair is treated like a missing link.
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Extract the first `YYYY-M-D` date matched by `DATE_RE` and return it as a
 * Date at 00:00:00 UTC.
 *
 * @param {string|null|undefined} dateText - Text that may contain a date.
 * @returns {Date|undefined} The parsed date, or undefined when no date is
 *   found or the matched numbers are not a real calendar date.
 */
function parseDate(dateText) {
  const m = normalizeText(dateText).match(DATE_RE);
  if (!m) return undefined;

  const year = Number(m[1]);
  const month = Number(m[2]);
  const day = Number(m[3]);
  const date = new Date(Date.UTC(year, month - 1, day));

  // DATE_RE accepts any 1-2 digit month/day. Reject values that are out of
  // range or that roll over into another month, so callers' `?? new Date()`
  // fallback applies instead of an Invalid Date or a shifted date.
  const isRealDate =
    date.getUTCFullYear() === year &&
    date.getUTCMonth() === month - 1 &&
    date.getUTCDate() === day;
  return isRealDate ? date : undefined;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Find the first `DATE_RE` match in the text of `anchor` or one of its
 * ancestors, checking at most 8 nodes starting from `anchor` itself.
 *
 * @param {object|null} anchor - Node exposing `textContent` and `parentNode`.
 * @returns {string} The matched date text, or "" when none is found.
 */
function extractDateText(anchor) {
  let node = anchor;
  let depth = 0;
  while (depth < 8 && node) {
    const found = DATE_RE.exec(normalizeText(node.textContent));
    if (found) return found[0];
    node = node.parentNode ?? null;
    depth += 1;
  }
  return "";
}
|
|
46
|
+
|
|
47
|
+
/**
 * Collect tag labels from `<span>` elements near `anchor`.
 *
 * Starting at `anchor` and climbing at most 6 nodes, returns the span texts of
 * the first node that yields any. Empty texts, the title, the date text, and
 * anything matching `DATE_RE` are excluded; duplicates are removed.
 *
 * @param {object|null} anchor - Node exposing `querySelectorAll`/`parentNode`.
 * @param {string} title - Item title to exclude.
 * @param {string} dateText - Item date text to exclude.
 * @returns {string[]} Unique tag labels in document order (possibly empty).
 */
function extractTags(anchor, title, dateText) {
  const isTag = (value) =>
    Boolean(value) && value !== title && value !== dateText && !DATE_RE.test(value);

  let node = anchor;
  let depth = 0;
  while (depth < 6 && node) {
    const spans = node.querySelectorAll?.("span") ?? [];
    const candidates = spans
      .map((span) => normalizeText(span.textContent))
      .filter((value) => isTag(value));
    if (candidates.length > 0) return [...new Set(candidates)];
    node = node.parentNode ?? null;
    depth += 1;
  }
  return [];
}
|
|
61
|
+
|
|
62
|
+
/**
 * Parse the list page HTML into feed items.
 *
 * Every anchor whose href contains "/technology-new-detail/" becomes one item,
 * keyed by its absolute link; later anchors with the same link, and anchors
 * without text, are skipped.
 *
 * @param {string} html - Page markup.
 * @param {string} finalUrl - URL used to resolve relative hrefs.
 * @returns {object[]} Items with guid, title, link, pubDate and summary.
 */
function parseItemsFromHtml(html, finalUrl) {
  const document = _deps.parseHtml(html);
  const byLink = new Map();

  for (const anchor of document.querySelectorAll('a[href*="/technology-new-detail/"]')) {
    const link = toAbsoluteUrl(anchor.getAttribute("href"), finalUrl);
    if (!link || byLink.has(link)) continue;

    const title = normalizeText(anchor.textContent);
    if (!title) continue;

    const dateText = extractDateText(anchor);
    const tags = extractTags(anchor, title, dateText);
    const summaryParts = [dateText, tags.join(" / ")].filter(Boolean);
    const summary = summaryParts.join(" | ");
    // Fall back to "now" when the page shows no parsable date.
    const pubDate = parseDate(dateText) ?? new Date();

    byLink.set(link, {
      guid: hashGuid(link),
      title,
      link,
      pubDate,
      summary: summary || undefined,
    });
  }

  return [...byLink.values()];
}
|
|
92
|
+
|
|
93
|
+
/**
 * Fallback fetcher: read the first 20 list entries from the site's REST
 * endpoint (on the same origin as `finalUrl`) and map them to feed items.
 *
 * @param {string} finalUrl - URL of the list page; only its origin is used.
 * @returns {Promise<object[]>} Items with guid, title, link, pubDate and
 *   summary. Resolves to [] when the response is not OK, is not valid JSON,
 *   or carries no `data.lists` array.
 */
async function fetchItemsFromApi(finalUrl) {
  const origin = new URL(finalUrl).origin;
  const apiUrl = new URL("/rest/v1/contents/1/getlistbyparam/48/1/20/0/0?scene=1", origin);

  const res = await fetch(apiUrl, {
    headers: {
      Accept: "application/json,text/plain,*/*",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    },
  });
  if (!res.ok) return [];

  let data;
  try {
    data = await res.json();
  } catch {
    // A 200 response with a non-JSON body (e.g. an HTML challenge page) is
    // handled like a failed response, so the caller reports "nothing parsed"
    // instead of surfacing a raw SyntaxError.
    return [];
  }

  // JSON fields may be numbers or null; stringify before normalizing.
  const toText = (value) => normalizeText(String(value ?? ""));

  const list = Array.isArray(data?.data?.lists) ? data.data.lists : [];
  const items = [];
  const seen = new Set();

  for (const row of list) {
    const contentId = String(row?.contentId ?? "").trim();
    const title = toText(row?.title);
    if (!contentId || !title) continue;

    const link = new URL(`/cn/technology-new-detail/${contentId}?categoryId=48`, origin).href;
    if (seen.has(link)) continue;
    seen.add(link);

    const dateText = toText(row?.createTime);
    const tags = Array.isArray(row?.tagnames)
      ? row.tagnames.map((tag) => toText(tag)).filter(Boolean)
      : [];
    const summary = [dateText, tags.join(" / ")].filter(Boolean).join(" | ");
    // Fall back to "now" when createTime holds no parsable date.
    const pubDate = parseDate(dateText) ?? new Date();

    items.push({
      guid: hashGuid(link),
      title,
      link,
      pubDate,
      summary: summary || undefined,
    });
  }

  return items;
}
|
|
137
|
+
|
|
138
|
+
/**
 * Plugin entry point: load the list page and return its items, falling back
 * to the REST endpoint when the rendered HTML yields nothing.
 *
 * @param {string} sourceId - Passed through to `ctx.fetchHtml`.
 * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
 * @returns {Promise<object[]>} A non-empty list of feed items.
 * @throws {Error} When neither the HTML nor the API produced any item.
 */
async function fetchItems(sourceId, ctx) {
  // Helpers in this module read their dependencies from `_deps`.
  _deps = ctx.deps;

  const page = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
  const parsed = parseItemsFromHtml(page.html, page.finalUrl);
  if (parsed.length > 0) return parsed;

  const viaApi = await fetchItemsFromApi(page.finalUrl);
  if (viaApi.length > 0) return viaApi;

  throw new Error(`[${SITE_ID}] 未解析到学术成果条目,页面结构或接口可能已变化`);
}
|
|
149
|
+
|
|
150
|
+
// Plugin descriptor: its id, the regular expression for the list URLs it
// declares, and the fetcher defined in this module.
const sensetimeTechAchievementsPlugin = {
  id: SITE_ID,
  listUrlPattern: /^https?:\/\/(www\.)?sensetime\.com\/cn\/technology-achievements(\?.*)?$/i,
  fetchItems,
};

export default sensetimeTechAchievementsPlugin;
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
let _deps;
|
|
2
|
+
|
|
3
|
+
// Supervisely Blog 插件:抓取列表页并解析为 FeedItem(不做正文 enrich)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
const SUPERVISELY_ORIGIN = "https://supervisely.com";
|
|
8
|
+
const MONTH_INDEX = {
|
|
9
|
+
jan: 0,
|
|
10
|
+
feb: 1,
|
|
11
|
+
mar: 2,
|
|
12
|
+
apr: 3,
|
|
13
|
+
may: 4,
|
|
14
|
+
jun: 5,
|
|
15
|
+
jul: 6,
|
|
16
|
+
aug: 7,
|
|
17
|
+
sep: 8,
|
|
18
|
+
oct: 9,
|
|
19
|
+
nov: 10,
|
|
20
|
+
dec: 11,
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
/**
 * Collapse whitespace runs to single spaces and trim the result.
 *
 * @param {string|null|undefined} text - Input text; nullish becomes "".
 * @returns {string} Normalized text.
 */
function normalizeText(text) {
  const raw = text ?? "";
  const collapsed = raw.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
/**
 * SHA-256 of `input` as a hex string, via `_deps.createHash`.
 *
 * @param {string} input - Text to hash.
 * @returns {string} Hex digest.
 */
function hashGuid(input) {
  const sha256 = _deps.createHash("sha256");
  sha256.update(input);
  return sha256.digest("hex");
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
/**
 * Resolve `href` against `baseUrl`, keeping only http(s) results.
 *
 * @param {string|null|undefined} href - Possibly relative link.
 * @param {string} baseUrl - Base for resolution.
 * @returns {string|null} Absolute URL, or null when `href` is empty,
 *   unparsable, or not http(s).
 */
function toAbsoluteUrl(href, baseUrl) {
  if (!href) return null;

  let parsed;
  try {
    parsed = new URL(href, baseUrl);
  } catch {
    return null;
  }

  const isHttp = /^https?:$/i.test(parsed.protocol);
  return isHttp ? parsed.href : null;
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
/**
 * Parse a date written as "<Month> <day>, <year>" (e.g. "Mar 5, 2024" or
 * "September 12, 2023") into a Date at 00:00:00 UTC.
 *
 * Only the first three letters of the month word are used for the lookup in
 * `MONTH_INDEX`.
 *
 * @param {string|null|undefined} rawText - Text of the date element.
 * @returns {Date|undefined} The parsed date, or undefined when the text does
 *   not match, the month is unknown, or the day does not exist in that month.
 */
function parsePubDate(rawText) {
  const m = normalizeText(rawText).match(/^([A-Za-z]{3,9})\s+(\d{1,2}),\s*(\d{4})$/);
  if (!m) return undefined;

  const month = MONTH_INDEX[m[1].slice(0, 3).toLowerCase()];
  if (month == null) return undefined;

  const day = Number(m[2]);
  const year = Number(m[3]);
  const date = new Date(Date.UTC(year, month, day, 0, 0, 0));

  // Date.UTC silently rolls impossible days into a neighbouring month
  // ("Feb 31" -> Mar 2, day 0 -> previous month) and remaps years 0-99 to
  // 19xx; reject anything that does not round-trip.
  const roundTrips =
    date.getUTCFullYear() === year &&
    date.getUTCMonth() === month &&
    date.getUTCDate() === day;
  return roundTrips ? date : undefined;
}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
/**
 * Tell whether `link` points at a single blog post, i.e. its path is exactly
 * `/blog/<one-segment>` with an optional trailing slash (case-insensitive).
 *
 * @param {string} link - Absolute URL.
 * @returns {boolean} True for post URLs; false otherwise or if unparsable.
 */
function looksLikeBlogLink(link) {
  let pathname;
  try {
    ({ pathname } = new URL(link));
  } catch {
    return false;
  }
  return /^\/blog\/[^/]+\/?$/i.test(pathname);
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
/**
 * Find the nearest ancestor of `node` that contains a `<time>` element,
 * looking at most `maxDepth` levels up.
 *
 * @param {object|null|undefined} node - Starting node.
 * @param {number} maxDepth - Maximum number of ancestors to inspect.
 * @returns {object|null} The matching ancestor; otherwise the direct parent
 *   (or null when there is none).
 */
function findAncestor(node, maxDepth) {
  const parent = node?.parentNode ?? null;

  let candidate = parent;
  let climbed = 0;
  while (climbed < maxDepth && candidate) {
    if (candidate.querySelector?.("time")) return candidate;
    candidate = candidate.parentNode ?? null;
    climbed += 1;
  }

  // No ancestor within range holds a <time>: fall back to the direct parent.
  return parent;
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
/**
 * Assemble a feed item from parsed card fields.
 *
 * @param {object} fields
 * @param {string} fields.title - Post title.
 * @param {string} fields.link - Absolute post URL; also hashed into the guid.
 * @param {string} [fields.summary] - Summary; empty becomes undefined.
 * @param {string} [fields.author] - Author; empty becomes undefined.
 * @param {Date} [fields.pubDate] - Publication date; defaults to now.
 * @returns {object} Item with guid, title, link, pubDate, author, summary.
 */
function buildFeedItem(fields) {
  const { title, link, summary, author, pubDate } = fields;

  const item = { guid: hashGuid(link), title, link };
  item.pubDate = pubDate ?? new Date();
  item.author = author || undefined;
  item.summary = summary || undefined;
  return item;
}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
/**
 * Primary parser: turn each `div.blog-card` into a feed item.
 *
 * A card is used only when its `h4 a[href]` has text, resolves to an http(s)
 * URL that looks like a blog post, and that URL has not been seen yet.
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @param {string} baseUrl - Base for resolving relative hrefs.
 * @returns {object[]} Feed items in document order.
 */
function parseFromCards(root, baseUrl) {
  const textOf = (scope, selector) => normalizeText(scope.querySelector(selector)?.textContent);
  const visited = new Set();
  const results = [];

  for (const card of root.querySelectorAll("div.blog-card")) {
    const heading = card.querySelector("h4 a[href]");
    const title = normalizeText(heading?.textContent);
    const link = toAbsoluteUrl(heading?.getAttribute("href"), baseUrl);

    const usable =
      Boolean(title) && Boolean(link) && looksLikeBlogLink(link) && !visited.has(link);
    if (!usable) continue;

    visited.add(link);
    results.push(
      buildFeedItem({
        title,
        link,
        summary: textOf(card, "p"),
        author: textOf(card, 'b[rel="author"], address b'),
        pubDate: parsePubDate(textOf(card, "time")),
      }),
    );
  }

  return results;
}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
/**
 * Fallback parser: start from `h4 a[href*="/blog/"]` anchors and read the
 * summary, author and date from the container `findAncestor` returns
 * (searching up to 7 levels).
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @param {string} baseUrl - Base for resolving relative hrefs.
 * @returns {object[]} Feed items in document order, one per unique post URL.
 */
function parseFromHeadingFallback(root, baseUrl) {
  const visited = new Set();
  const results = [];

  for (const anchor of root.querySelectorAll('h4 a[href*="/blog/"]')) {
    const title = normalizeText(anchor.textContent);
    const link = toAbsoluteUrl(anchor.getAttribute("href"), baseUrl);

    const usable =
      Boolean(title) && Boolean(link) && looksLikeBlogLink(link) && !visited.has(link);
    if (!usable) continue;

    const container = findAncestor(anchor, 7);
    const textOf = (selector) => normalizeText(container?.querySelector(selector)?.textContent);

    visited.add(link);
    results.push(
      buildFeedItem({
        title,
        link,
        summary: textOf("p"),
        author: textOf('b[rel="author"], address b'),
        pubDate: parsePubDate(textOf("time")),
      }),
    );
  }

  return results;
}
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
/**
 * Plugin entry point: fetch the blog list page and parse it, trying the card
 * parser first and the heading-based parser only when cards yield nothing.
 *
 * @param {string} sourceId - Passed through to `ctx.fetchHtml`.
 * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
 * @returns {Promise<object[]>} A non-empty list of feed items.
 * @throws {Error} When neither parser finds an article.
 */
async function fetchItems(sourceId, ctx) {
  // Helpers in this module read their dependencies from `_deps`.
  _deps = ctx.deps;

  const page = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
  const root = _deps.parseHtml(page.html);
  const baseUrl = page.finalUrl || SUPERVISELY_ORIGIN;

  let items = parseFromCards(root, baseUrl);
  if (items.length === 0) {
    items = parseFromHeadingFallback(root, baseUrl);
  }
  if (items.length > 0) return items;

  throw new Error("[supervisely-blog] 未解析到文章条目,页面结构可能已变化");
}
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
// Plugin descriptor: its id, the regular expression for the list URLs it
// declares, and the fetcher defined in this module.
const superviselyBlogPlugin = {
  id: "supervisely-blog",
  listUrlPattern: /^https?:\/\/(www\.)?supervisely\.com\/blog\/?(?:\?.*)?$/i,
  fetchItems,
};

export default superviselyBlogPlugin;
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
let _deps;
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
const UCI_ORIGIN = "https://archive.ics.uci.edu";
|
|
5
|
+
|
|
6
|
+
/**
 * Squeeze consecutive whitespace into one space and strip both ends.
 *
 * @param {string|null|undefined} text - Input text; nullish becomes "".
 * @returns {string} Normalized text.
 */
function normalizeText(text) {
  const source = text ?? "";
  return source.replace(/\s+/g, " ").trim();
}
|
|
9
|
+
|
|
10
|
+
/**
 * Hex SHA-256 digest of `input`, produced with `_deps.createHash`.
 *
 * @param {string} input - Text to hash.
 * @returns {string} Hex digest.
 */
function hashGuid(input) {
  const digestBuilder = _deps.createHash("sha256");
  digestBuilder.update(input);
  return digestBuilder.digest("hex");
}
|
|
13
|
+
|
|
14
|
+
/**
 * Resolve an href to a canonical dataset URL.
 *
 * Accepts only http(s) URLs on host `archive.ics.uci.edu` whose path is
 * `/dataset/<digits>/<slug>`; the query string and fragment are removed.
 *
 * @param {string|null|undefined} rawHref - The href attribute value.
 * @param {string} baseUrl - Base for resolving relative hrefs.
 * @returns {string|null} Canonical dataset URL, or null when rejected.
 */
function resolveDatasetLink(rawHref, baseUrl) {
  const href = normalizeText(rawHref);
  if (href === "" || href.startsWith("#") || href.startsWith("javascript:")) return null;

  let target;
  try {
    target = new URL(href, baseUrl);
  } catch {
    return null;
  }

  const acceptable =
    /^https?:$/i.test(target.protocol) &&
    target.hostname === "archive.ics.uci.edu" &&
    /^\/dataset\/\d+\/[^/?#]+$/i.test(target.pathname);
  if (!acceptable) return null;

  // Drop query and fragment so the same dataset always maps to one link.
  target.search = "";
  target.hash = "";
  return target.href;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Return the text of the first `<p>` found in `node` or one of its ancestors,
 * inspecting at most 6 nodes starting from `node` itself.
 *
 * @param {object|null} node - Node exposing `querySelector`/`parentNode`.
 * @returns {string} Normalized paragraph text, or "" when none has text.
 */
function pickSummaryNearNode(node) {
  let scope = node;
  let hops = 0;
  while (hops < 6 && scope) {
    const text = normalizeText(scope.querySelector?.("p")?.textContent);
    if (text) return text;
    scope = scope.parentNode ?? null;
    hops += 1;
  }
  return "";
}
|
|
40
|
+
|
|
41
|
+
/**
 * Assemble a feed item for one dataset.
 *
 * The page data used here carries no date, so `pubDate` is synthetic: the
 * current time minus `index` seconds (presumably so that items keep their
 * page order when sorted by date — confirm with the host).
 *
 * @param {object} fields
 * @param {string} fields.title - Dataset title.
 * @param {string} fields.link - Canonical dataset URL; hashed into the guid.
 * @param {string} [fields.summary] - Summary; empty becomes undefined.
 * @param {number} fields.index - Zero-based position in the result list.
 * @returns {object} Item with guid, title, link, pubDate, summary, sourceId.
 */
function buildItem({ title, link, summary, index }) {
  const item = { guid: hashGuid(link), title, link };
  const offsetMs = index * 1000;
  item.pubDate = new Date(Date.now() - offsetMs);
  item.summary = summary || undefined;
  item.sourceId = "uci-ml-repository";
  return item;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Primary parser: build one item per unique dataset link found in
 * `h2 a[href^="/dataset/"]` anchors.
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @param {string} baseUrl - Base for resolving relative hrefs.
 * @returns {object[]} Items in document order; each item's `index` is its
 *   position in the returned list.
 */
function parseFromHeadingAnchors(root, baseUrl) {
  const collected = new Map();

  for (const anchor of root.querySelectorAll('h2 a[href^="/dataset/"]')) {
    const link = resolveDatasetLink(anchor.getAttribute("href"), baseUrl);
    if (!link || collected.has(link)) continue;

    const title = normalizeText(anchor.textContent);
    if (!title) continue;

    const summary = pickSummaryNearNode(anchor.parentNode ?? anchor);
    // `collected.size` is read before insertion, so it is the new item's index.
    collected.set(link, buildItem({ title, link, summary, index: collected.size }));
  }

  return [...collected.values()];
}
|
|
70
|
+
|
|
71
|
+
/**
 * Fallback parser: build one item per unique dataset link found in any
 * `a[href^="/dataset/"]` anchor. The title is the anchor text, or the `alt`
 * of an image inside the anchor when the text is empty.
 *
 * @param {object} root - Parsed document exposing `querySelectorAll`.
 * @param {string} baseUrl - Base for resolving relative hrefs.
 * @returns {object[]} Items in document order; each item's `index` is its
 *   position in the returned list.
 */
function parseFromGenericAnchors(root, baseUrl) {
  const collected = new Map();

  for (const anchor of root.querySelectorAll('a[href^="/dataset/"]')) {
    const link = resolveDatasetLink(anchor.getAttribute("href"), baseUrl);
    if (!link || collected.has(link)) continue;

    const titleCandidates = [
      anchor.textContent,
      anchor.querySelector("img")?.getAttribute("alt"),
    ].map((raw) => normalizeText(raw));
    const title = titleCandidates.find(Boolean);
    if (!title) continue;

    const summary = pickSummaryNearNode(anchor.parentNode ?? anchor);
    // `collected.size` is read before insertion, so it is the new item's index.
    collected.set(link, buildItem({ title, link, summary, index: collected.size }));
  }

  return [...collected.values()];
}
|
|
91
|
+
|
|
92
|
+
/**
 * Plugin entry point: fetch the repository page and parse its dataset links,
 * trying heading anchors first and any dataset anchor second.
 *
 * @param {string} sourceId - Passed to `ctx.fetchHtml`; also used as the base
 *   URL when no final URL is reported.
 * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
 * @returns {Promise<object[]>} A non-empty list of feed items.
 * @throws {Error} When neither parser finds a dataset.
 */
async function fetchItems(sourceId, ctx) {
  // Helpers in this module read their dependencies from `_deps`.
  _deps = ctx.deps;

  const page = await ctx.fetchHtml(sourceId, { waitMs: 4000 });
  const baseUrl = page.finalUrl || sourceId || UCI_ORIGIN;
  const root = _deps.parseHtml(page.html);

  // Parsers run lazily in priority order; the first non-empty result wins.
  for (const parse of [parseFromHeadingAnchors, parseFromGenericAnchors]) {
    const items = parse(root, baseUrl);
    if (items.length > 0) return items;
  }

  throw new Error("[uci-ml-repository] 未解析到数据集条目,页面结构可能已变化");
}
|
|
106
|
+
|
|
107
|
+
// Plugin descriptor: its id, the regular expression for the list URLs it
// declares, and the fetcher defined in this module.
const uciMlRepositoryPlugin = {
  id: "uci-ml-repository",
  listUrlPattern: /^https?:\/\/archive\.ics\.uci\.edu(?:\/(?:datasets\/?)?)?(?:\?.*)?$/i,
  fetchItems,
};

export default uciMlRepositoryPlugin;
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
let _deps;
|
|
2
|
+
|
|
3
|
+
// VentureBeat 插件:通过官方 RSS Feed 拉取列表,规避首页安全检查页
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
/**
 * Replace each whitespace run with one space and trim the ends.
 *
 * @param {string|null|undefined} text - Input text; nullish becomes "".
 * @returns {string} Normalized text.
 */
function normalizeText(text) {
  const value = text ?? "";
  const singleSpaced = value.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
/**
 * Remove anything that looks like an HTML tag (`<...>`) and normalize the
 * remaining whitespace. Each tag is replaced by a space so adjacent words do
 * not run together.
 *
 * @param {string|null|undefined} text - Markup; nullish becomes "".
 * @returns {string} Plain text.
 */
function stripHtml(text) {
  const withoutTags = (text ?? "").replace(/<[^>]*>/g, " ");
  return normalizeText(withoutTags);
}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
/**
 * Convert a raw date value to a Date, substituting the current time when the
 * value is missing or cannot be parsed.
 *
 * @param {string|number|Date|null|undefined} raw - Value given to `new Date`.
 * @returns {Date} Always a valid Date.
 */
function toValidDate(raw) {
  const parsed = raw ? new Date(raw) : null;
  if (parsed === null || Number.isNaN(parsed.getTime())) return new Date();
  return parsed;
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
/**
 * Derive the RSS feed URL for a source URL.
 *
 * If the path (ignoring trailing slashes) already ends in `/feed`, the URL is
 * returned as parsed. Otherwise `/feed/` is appended to the path and the
 * query string and fragment are removed.
 *
 * @param {string} sourceId - Absolute source URL.
 * @returns {string} Feed URL.
 * @throws {TypeError} When `sourceId` is not a valid absolute URL.
 */
function toFeedUrl(sourceId) {
  const feedUrl = new URL(sourceId);
  const trimmedPath = feedUrl.pathname.replace(/\/+$/, "");

  if (!trimmedPath.endsWith("/feed")) {
    // An empty trimmed path (site root) yields plain "/feed/".
    feedUrl.pathname = `${trimmedPath}/feed/`;
    feedUrl.search = "";
    feedUrl.hash = "";
  }
  return feedUrl.href;
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
/**
 * Convert one parsed RSS entry into a feed item.
 *
 * @param {object} item - Entry from the RSS parser. Reads `link`, `title`,
 *   `isoDate`/`pubDate`, `contentSnippet`, `summary`/`content`, and
 *   `creator`/`author`.
 * @returns {object|null} Item with guid, title, link, pubDate, author and
 *   summary, or null when the entry has no http(s) link.
 */
function mapFeedItem(item) {
  const link = normalizeText(item.link ?? "");
  const isHttpLink = /^https?:\/\//i.test(link);
  if (!isHttpLink) return null;

  const title = normalizeText(item.title ?? "");
  const pubDate = toValidDate(item.isoDate ?? item.pubDate);

  // Prefer the plain-text snippet; only strip markup when it is empty.
  let summary = normalizeText(item.contentSnippet ?? "");
  if (!summary) {
    summary = stripHtml(item.summary ?? item.content ?? "");
  }

  const byline = normalizeText(item.creator ?? item.author ?? "");

  return {
    guid: _deps.createHash("sha256").update(link).digest("hex"),
    title: title || "(无标题)",
    link,
    pubDate,
    author: byline || undefined,
    summary: summary || undefined,
  };
}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
/**
 * Plugin entry point: download the RSS feed derived from `sourceId` and map
 * its entries to feed items, keeping the first entry for each link.
 *
 * @param {string} sourceId - Source URL; converted with `toFeedUrl`.
 * @param {object} _ctx - Host context; `_ctx.deps.RssParser` is used to fetch
 *   and parse the feed.
 * @returns {Promise<object[]>} A non-empty list of feed items.
 * @throws {Error} When the feed cannot be fetched/parsed (the underlying
 *   error is attached as `cause`) or when it yields no usable entry.
 */
async function fetchItems(sourceId, _ctx) {
  // Helpers in this module read their dependencies from `_deps`.
  _deps = _ctx.deps;
  const parser = new _deps.RssParser({
    timeout: 15_000,
    headers: {
      "User-Agent": "RssAny/1.0 (+https://github.com/rssany/rssany)",
      Accept: "application/rss+xml,application/atom+xml,application/xml,text/xml,*/*",
    },
  });
  const feedUrl = toFeedUrl(sourceId);

  let feed;
  try {
    feed = await parser.parseURL(feedUrl);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    // Keep the original error (and its stack) reachable for debugging.
    throw new Error(`[venturebeat] 抓取 feed 失败: ${feedUrl} (${msg})`, { cause: err });
  }

  const seen = new Set();
  const items = [];
  for (const item of feed.items ?? []) {
    const mapped = mapFeedItem(item);
    if (!mapped) continue;
    if (seen.has(mapped.link)) continue;
    seen.add(mapped.link);
    items.push(mapped);
  }

  if (items.length === 0) {
    throw new Error(`[venturebeat] 未解析到条目: ${feedUrl}`);
  }

  return items;
}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
// Plugin descriptor: its id, the regular expression for the list URLs it
// declares, a refresh interval setting, and the fetcher defined in this module.
const venturebeatPlugin = {
  id: "venturebeat",
  listUrlPattern: /^https?:\/\/(www\.)?venturebeat\.com\/?(\?.*)?$/i,
  refreshInterval: "1h",
  fetchItems,
};

export default venturebeatPlugin;
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
let _deps;
|
|
2
|
+
|
|
3
|
+
// World Labs 博客插件:抓取 Research & Insights 列表页,输出 FeedItem(不含 enrich)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
const MONTH_NAME =
|
|
8
|
+
"January|February|March|April|May|June|July|August|September|October|November|December";
|
|
9
|
+
const DATE_RE = new RegExp(`\\b(${MONTH_NAME})\\s+\\d{1,2},\\s+\\d{4}\\b`, "i");
|
|
10
|
+
const MONTH_INDEX = {
|
|
11
|
+
january: 0,
|
|
12
|
+
february: 1,
|
|
13
|
+
march: 2,
|
|
14
|
+
april: 3,
|
|
15
|
+
may: 4,
|
|
16
|
+
june: 5,
|
|
17
|
+
july: 6,
|
|
18
|
+
august: 7,
|
|
19
|
+
september: 8,
|
|
20
|
+
october: 9,
|
|
21
|
+
november: 10,
|
|
22
|
+
december: 11,
|
|
23
|
+
};
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
/**
 * Normalize text: whitespace runs become single spaces, ends are trimmed.
 *
 * @param {string|null|undefined} text - Input text; nullish becomes "".
 * @returns {string} Normalized text.
 */
function normalizeText(text) {
  const input = text ?? "";
  return input.replace(/\s+/g, " ").trim();
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
/**
 * Hex-encoded SHA-256 of `input`, using `_deps.createHash`.
 *
 * @param {string} input - Text to hash.
 * @returns {string} Hex digest.
 */
function hashGuid(input) {
  const hash = _deps.createHash("sha256");
  hash.update(input);
  return hash.digest("hex");
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
/**
 * Resolve an href against `baseUrl`, returning it only when the result is an
 * http(s) URL.
 *
 * @param {string|null|undefined} rawHref - The href attribute value.
 * @param {string} baseUrl - Base for resolving relative hrefs.
 * @returns {string|null} Absolute URL, or null for empty, fragment-only,
 *   `javascript:`, unparsable, or non-http(s) hrefs.
 */
function toAbsoluteHttpUrl(rawHref, baseUrl) {
  if (!rawHref) return null;

  const href = rawHref.trim();
  const blockedPrefixes = ["#", "javascript:"];
  if (!href || blockedPrefixes.some((prefix) => href.startsWith(prefix))) return null;

  try {
    const { protocol, href: absolute } = new URL(href, baseUrl);
    return /^https?:$/i.test(protocol) ? absolute : null;
  } catch {
    return null;
  }
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
/**
 * Split a card's metadata line into publication date and author.
 *
 * The first "<Month> <day>, <year>" match (per `DATE_RE`) is the date;
 * whatever follows it, minus leading separator characters, is the author.
 *
 * @param {string|null|undefined} metaText - Metadata text from the card.
 * @returns {{pubDate: Date, author: (string|undefined)}} `pubDate` falls back
 *   to the current time when no valid date is found; `author` is undefined
 *   when no text follows the date or no date is present.
 */
function parseDateAndAuthor(metaText) {
  const text = normalizeText(metaText);
  const dateMatch = DATE_RE.exec(text);
  if (dateMatch === null) return { pubDate: new Date(), author: undefined };

  const [dateText] = dateMatch;
  const parts = dateText.match(/^(?<month>[A-Za-z]+)\s+(?<day>\d{1,2}),\s*(?<year>\d{4})$/);

  let parsedDate = null;
  if (parts?.groups) {
    const monthIndex = MONTH_INDEX[parts.groups.month.toLowerCase()];
    const day = Number(parts.groups.day);
    const year = Number(parts.groups.year);
    if (monthIndex != null && Number.isFinite(day) && Number.isFinite(year)) {
      // Use noon UTC so a date-only value does not drift by a day when shown
      // in other time zones.
      const candidate = new Date(Date.UTC(year, monthIndex, day, 12, 0, 0));
      if (!Number.isNaN(candidate.getTime())) parsedDate = candidate;
    }
  }

  const trailing = normalizeText(text.slice(dateMatch.index + dateText.length));
  const author = trailing.replace(/^[|/\-•·,:]+/, "").trim();

  return {
    pubDate: parsedDate ?? new Date(),
    author: author || undefined,
  };
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
/**
 * Turn one anchor into a feed item when it looks like a blog card, i.e. it
 * contains an `h2`/`h3` title and has a usable http(s) href.
 *
 * Among the anchor's paragraphs, the first one containing a date (per
 * `DATE_RE`) is the metadata line (date + author); the first remaining
 * paragraph without a date is the summary.
 *
 * @param {object} anchor - Anchor node from the parsed document.
 * @param {string} finalUrl - Base for resolving relative hrefs.
 * @returns {object|null} Item with guid, title, link, pubDate, author and
 *   summary, or null when the anchor is not a card.
 */
function parseCard(anchor, finalUrl) {
  const title = normalizeText(anchor.querySelector("h2, h3")?.textContent);
  if (!title) return null;

  const link = toAbsoluteHttpUrl(anchor.getAttribute("href"), finalUrl);
  if (!link) return null;

  const paragraphTexts = anchor
    .querySelectorAll("p")
    .map((p) => normalizeText(p.textContent))
    .filter(Boolean);

  // Only a paragraph that actually contains a date counts as metadata.
  // Treating the first paragraph as metadata when no date exists added
  // nothing (no date, no author) and kept it out of the summary.
  const metaText = paragraphTexts.find((t) => DATE_RE.test(t)) ?? "";
  const { pubDate, author } = parseDateAndAuthor(metaText);
  const summary = paragraphTexts.find((t) => t !== metaText && !DATE_RE.test(t));

  return {
    guid: hashGuid(link),
    title,
    link,
    pubDate,
    author,
    summary: summary || undefined,
  };
}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
/**
 * Plugin entry point: fetch the blog list page and turn every anchor that
 * `parseCard` accepts into a feed item, keeping the first item per link.
 *
 * @param {string} sourceId - Passed through to `ctx.fetchHtml`.
 * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
 * @returns {Promise<object[]>} A non-empty list of feed items.
 * @throws {Error} When no anchor on the page parses as a card.
 */
async function fetchItems(sourceId, ctx) {
  // Helpers in this module read their dependencies from `_deps`.
  _deps = ctx.deps;

  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
  const root = _deps.parseHtml(html);

  const byLink = new Map();
  for (const anchor of root.querySelectorAll("a[href]")) {
    const item = parseCard(anchor, finalUrl);
    if (item && !byLink.has(item.link)) {
      byLink.set(item.link, item);
    }
  }

  if (byLink.size === 0) {
    throw new Error("[worldlabs] 未解析到条目,页面结构可能已变化");
  }
  return [...byLink.values()];
}
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
// Plugin descriptor: its id, the regular expression for the list URLs it
// declares, and the fetcher defined in this module.
const worldlabsPlugin = {
  id: "worldlabs",
  listUrlPattern: /^https?:\/\/(www\.)?worldlabs\.ai\/blog(\?.*)?$/i,
  fetchItems,
};

export default worldlabsPlugin;
|