rssany 0.1.2 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -50
- package/app/plugins/builtin/agi-eval-evaluation.rssany.js +188 -0
- package/app/plugins/builtin/amii-research-talent.rssany.js +73 -0
- package/app/plugins/builtin/anthropic-research.rssany.js +155 -0
- package/app/plugins/builtin/appen-resources.rssany.js +155 -0
- package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +185 -0
- package/app/plugins/builtin/baaidata-csdn.rssany.js +242 -0
- package/app/plugins/builtin/baidu-research.rssany.js +222 -0
- package/app/plugins/builtin/brightdata-blog.rssany.js +301 -0
- package/app/plugins/builtin/bytedance-seed-research.rssany.js +231 -0
- package/app/plugins/builtin/five-radar.rssany.js +490 -0
- package/app/plugins/builtin/flageval-news.rssany.js +118 -0
- package/app/plugins/builtin/google-deepmind-research.rssany.js +223 -0
- package/app/plugins/builtin/google-research-datasets.rssany.js +171 -0
- package/app/plugins/builtin/google-research.rssany.js +220 -0
- package/app/plugins/builtin/google.rssany.js +187 -0
- package/app/plugins/builtin/hacker-news-newest.rssany.js +130 -0
- package/app/plugins/builtin/harvard-dataverse.rssany.js +166 -0
- package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +185 -0
- package/app/plugins/builtin/lingowhale.rssany.js +119 -0
- package/app/plugins/builtin/meituan-tech.rssany.js +130 -0
- package/app/plugins/builtin/meta-ai-publications.rssany.js +221 -0
- package/app/plugins/builtin/mila-quebec.rssany.js +199 -0
- package/app/plugins/builtin/mit-csail-research.rssany.js +208 -0
- package/app/plugins/builtin/moonshot.rssany.js +127 -0
- package/app/plugins/builtin/opendatalab-news.rssany.js +174 -0
- package/app/plugins/builtin/opendatalab.rssany.js +109 -0
- package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-publications.rssany.js +130 -0
- package/app/plugins/builtin/opendrivelab.rssany.js +333 -0
- package/app/plugins/builtin/paperswithcode.rssany.js +227 -0
- package/app/plugins/builtin/pjlab-adg-publications.rssany.js +202 -0
- package/app/plugins/builtin/rss.rssany.js +11 -1
- package/app/plugins/builtin/selectdataset.rssany.js +206 -0
- package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +154 -0
- package/app/plugins/builtin/supervisely-blog.rssany.js +159 -0
- package/app/plugins/builtin/uci-ml-repository.rssany.js +111 -0
- package/app/plugins/builtin/venturebeat.rssany.js +97 -0
- package/app/plugins/builtin/worldlabs.rssany.js +129 -0
- package/app/plugins/builtin/x.rssany.js +159 -0
- package/app/plugins/builtin/xiaohongshu.rssany.js +283 -0
- package/app/plugins/builtin/zhipu-research.rssany.js +334 -0
- package/dist/index.js +79 -9
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/webui/build/200.html +6 -6
- package/webui/build/_app/immutable/assets/0.BB88QFoe.css +1 -0
- package/webui/build/_app/immutable/assets/{homeFeedPanelStore.BopJZtHu.css → homeFeedPanelStore.iOmfP2qL.css} +1 -1
- package/webui/build/_app/immutable/chunks/CZD-YNDw.js +31 -0
- package/webui/build/_app/immutable/chunks/{DcAshVxe.js → D6VIKef0.js} +1 -1
- package/webui/build/_app/immutable/chunks/{EIZIMsXK.js → Dbqx2mXq.js} +1 -1
- package/webui/build/_app/immutable/chunks/DeX-oq5W.js +41 -0
- package/webui/build/_app/immutable/chunks/{BXCWEhUd.js → dhB8G5Is.js} +1 -1
- package/webui/build/_app/immutable/entry/{app.DdgnooOk.js → app.XPso7q7g.js} +2 -2
- package/webui/build/_app/immutable/entry/start.Db4snNCd.js +1 -0
- package/webui/build/_app/immutable/nodes/0.BKTQePmA.js +11 -0
- package/webui/build/_app/immutable/nodes/{1.5DFDaT4c.js → 1.BS3_Rfxm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{10.OVK4i9XE.js → 10.CyyxDCIS.js} +1 -1
- package/webui/build/_app/immutable/nodes/{11.Dhn_rO4A.js → 11.CtYgIaGj.js} +1 -1
- package/webui/build/_app/immutable/nodes/{14.B_KpJLxn.js → 14.D5OEGPR2.js} +1 -1
- package/webui/build/_app/immutable/nodes/{15.RaWaA-0I.js → 15.B4dFN1Gk.js} +1 -1
- package/webui/build/_app/immutable/nodes/{16.DSUgqolV.js → 16.M7ZII7tl.js} +1 -1
- package/webui/build/_app/immutable/nodes/{3.wQvGs9w-.js → 3.7r8v7qkm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{5.CCtn90c0.js → 5.CHIzoGrb.js} +1 -1
- package/webui/build/_app/immutable/nodes/{6.C2_mjW1u.js → 6.BDBqx-GY.js} +1 -1
- package/webui/build/_app/immutable/nodes/{7.Dwz6W7A1.js → 7.D5czsDmz.js} +1 -1
- package/webui/build/_app/immutable/nodes/{8.DzkEw6rx.js → 8.pjVNsCdV.js} +1 -1
- package/webui/build/_app/immutable/nodes/{9.DtlXEwe1.js → 9.CsARv1BH.js} +1 -1
- package/webui/build/_app/version.json +1 -1
- package/webui/build/_app/immutable/assets/0.C6Q_nuW9.css +0 -1
- package/webui/build/_app/immutable/chunks/CkUAV0m0.js +0 -41
- package/webui/build/_app/immutable/chunks/CtijX1u3.js +0 -31
- package/webui/build/_app/immutable/entry/start.DhJaJZhR.js +0 -1
- package/webui/build/_app/immutable/nodes/0.BE05Cuc4.js +0 -11
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
// Host-provided dependency bag; assigned from ctx.deps at the start of fetchItems
// and read by the helpers below (createHash, parseHtml).
let _deps;
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
// Plugin identifier; exported as the plugin `id` and stamped on every item as `sourceId`.
const SITE_ID = "pjlab-adg-publications";
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
/**
 * Flatten a text value into one trimmed line.
 *
 * Non-breaking spaces become plain spaces, every whitespace run collapses to a
 * single space, and leading/trailing whitespace is removed.
 *
 * @param {string | null | undefined} text - Raw text; nullish is treated as "".
 * @returns {string} The normalized text (possibly empty).
 */
function normalizeText(text) {
  const source = text ?? "";
  const withoutNbsp = source.replace(/\u00a0/g, " ");
  const collapsed = withoutNbsp.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
/**
 * Derive a stable item GUID from a seed string.
 *
 * @param {string} input - Seed value (entry id, link, or a composite key).
 * @returns {string} Hex-encoded SHA-256 digest produced by the injected createHash.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256");
  const fed = hasher.update(input);
  return fed.digest("hex");
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
/**
 * Pull the first standalone four-digit year (19xx or 20xx) out of a text.
 *
 * @param {string | null | undefined} raw - Text that may contain a year.
 * @returns {number | undefined} The year, or undefined when none is present.
 */
function parseYear(raw) {
  const found = /\b(19|20)\d{2}\b/.exec(normalizeText(raw));
  if (found === null) return undefined;
  const value = Number(found[0]);
  if (!Number.isFinite(value)) return undefined;
  return value;
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
/**
 * Resolve an href against a base URL, keeping only http(s) targets.
 *
 * @param {string | null | undefined} rawHref - The href attribute value.
 * @param {string | undefined} baseUrl - Base used to resolve relative hrefs.
 * @returns {string | null} The absolute URL, or null for empty hrefs, fragment-only
 *   links, `javascript:` links, unparsable values and non-http(s) schemes.
 */
function toAbsoluteLink(rawHref, baseUrl) {
  const trimmed = rawHref ? rawHref.trim() : "";
  const isUnusable = trimmed === "" || trimmed.startsWith("#") || trimmed.startsWith("javascript:");
  if (isUnusable) return null;

  let resolved;
  try {
    resolved = new URL(trimmed, baseUrl);
  } catch {
    // Not resolvable (e.g. relative href without a usable base).
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
/**
 * Choose the most useful outbound link for a publication entry.
 *
 * Preference order by anchor label (case-insensitive, exact match):
 * "html" / "arxiv" / "doi", then "pdf", then the first valid link found.
 * With no valid link, falls back to the page URL plus an `#entryId` fragment,
 * or the bare page URL when there is no entry id.
 *
 * @param {object} detailNode - Parsed element holding the entry's anchors.
 * @param {string} pageUrl - URL of the listing page; also the base for relative hrefs.
 * @param {string} entryId - The entry's id attribute (may be empty).
 * @returns {string} The selected absolute link.
 */
function pickBestLink(detailNode, pageUrl, entryId) {
  const candidates = [];
  for (const anchor of detailNode.querySelectorAll(".links a[href], a[href]")) {
    const link = toAbsoluteLink(anchor.getAttribute("href"), pageUrl);
    if (link) {
      candidates.push({ link, label: normalizeText(anchor.textContent).toLowerCase() });
    }
  }

  const firstLabelled = (labels) => candidates.find((c) => labels.includes(c.label));
  const chosen = firstLabelled(["html", "arxiv", "doi"]) ?? firstLabelled(["pdf"]) ?? candidates[0];
  if (chosen) return chosen.link;

  return entryId ? `${pageUrl}#${encodeURIComponent(entryId)}` : pageUrl;
}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
/**
 * Compose the item summary from author and periodical text.
 *
 * @param {string | undefined} author - Author line, if any.
 * @param {string | undefined} periodical - Venue/periodical line, if any.
 * @returns {string | undefined} Non-empty parts joined with " | ", or undefined when both are empty.
 */
function buildSummary(author, periodical) {
  const parts = [];
  for (const raw of [author, periodical]) {
    const cleaned = normalizeText(raw);
    if (cleaned) parts.push(cleaned);
  }
  return parts.length > 0 ? parts.join(" | ") : undefined;
}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
/**
 * List the direct children of a node that carry a tag name.
 *
 * @param {{ childNodes: Array<object> }} node - Parsed node exposing `childNodes` as an array.
 * @returns {Array<object>} Children whose `tagName` is neither null nor undefined.
 */
function elementChildren(node) {
  const hasTagName = (child) => child.tagName != null;
  return node.childNodes.filter(hasTagName);
}
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
/**
 * List the direct `<div>` children of a node (tag name compared case-insensitively).
 *
 * @param {object} node - Parsed parent node.
 * @returns {Array<object>} The direct div children, in document order.
 */
function directDivChildren(node) {
  const divs = [];
  for (const child of elementChildren(node)) {
    if (child.tagName?.toLowerCase() === "div") divs.push(child);
  }
  return divs;
}
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
/**
 * Extract the publication title from an entry's detail node.
 *
 * Uses the `.title` element when it has text. Otherwise takes the first direct
 * `<div>` child that contains neither an `<em>` nor a link and whose normalized
 * text is at least 8 characters long.
 *
 * @param {object} detailNode - Parsed element for one entry.
 * @returns {string} The title, or "" when none can be determined.
 */
function extractTitle(detailNode) {
  const classTitle = normalizeText(detailNode.querySelector(".title")?.textContent);
  if (classTitle) return classTitle;

  const fallback = directDivChildren(detailNode)
    .filter((div) => !div.querySelector("em") && !div.querySelector("a[href]"))
    .map((div) => normalizeText(div.textContent))
    .find((text) => text.length >= 8);
  return fallback ?? "";
}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
/**
 * Extract the author line from an entry's detail node.
 *
 * Uses the `.author` element when it has text. Otherwise returns the text of the
 * first direct `<div>` child that has no `<em>`, no link, is non-empty and
 * differs from the already extracted title.
 *
 * @param {object} detailNode - Parsed element for one entry.
 * @param {string} title - The entry title, excluded from the fallback search.
 * @returns {string | undefined} The author text, or undefined when not found.
 */
function extractAuthor(detailNode, title) {
  const classAuthor = normalizeText(detailNode.querySelector(".author")?.textContent);
  if (classAuthor) return classAuthor;

  for (const div of directDivChildren(detailNode)) {
    if (div.querySelector("em") || div.querySelector("a[href]")) continue;
    const text = normalizeText(div.textContent);
    if (text && text !== title) return text;
  }
  return undefined;
}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
/**
 * Extract the venue/periodical text from an entry's detail node.
 *
 * Prefers `.periodical em`, then `.periodical`. If neither yields text, falls
 * back to the first non-empty `<em>` found inside a direct `<div>` child.
 *
 * @param {object} detailNode - Parsed element for one entry.
 * @returns {string} The periodical text, or "" when not found.
 */
function extractPeriodical(detailNode) {
  const periodicalNode =
    detailNode.querySelector(".periodical em") ?? detailNode.querySelector(".periodical");
  const classText = normalizeText(periodicalNode?.textContent);
  if (classText) return classText;

  const emTexts = directDivChildren(detailNode).map((div) =>
    normalizeText(div.querySelector("em")?.textContent)
  );
  return emTexts.find(Boolean) ?? "";
}
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
/**
 * Convert one `<li>` publication entry into a feed item.
 *
 * @param {object} liNode - Parsed `<li>` element.
 * @param {number | undefined} currentYear - Year taken from the most recent `<h2>` heading, if any.
 * @param {string} pageUrl - Listing page URL; base for relative links and link fallback.
 * @returns {object | null} The feed item, or null when the entry has no detail `<div>` or no title.
 */
function parseOneEntry(liNode, currentYear, pageUrl) {
  // Prefer the div that carries an id (used as the GUID seed); otherwise any div.
  const detailNode = liNode.querySelector("div[id]") ?? liNode.querySelector("div");
  if (!detailNode) return null;

  const entryId = normalizeText(detailNode.getAttribute("id"));
  const title = extractTitle(detailNode);
  if (!title) return null;

  const author = extractAuthor(detailNode, title);
  const periodical = extractPeriodical(detailNode);
  // The section heading year wins; otherwise look for a year in the entry text itself.
  const finalYear = currentYear ?? parseYear(`${periodical} ${detailNode.textContent}`);
  // Only the year is known, so the date is pinned to Jan 1 UTC; "now" when no year was found.
  const pubDate = finalYear != null ? new Date(Date.UTC(finalYear, 0, 1, 0, 0, 0)) : new Date();
  const link = pickBestLink(detailNode, pageUrl, entryId);
  const guidSeed = entryId || link || `${title}|${author ?? ""}|${finalYear ?? ""}`;

  return {
    guid: hashGuid(guidSeed),
    title,
    link,
    pubDate,
    author,
    summary: buildSummary(author, periodical),
    sourceId: SITE_ID,
  };
}
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
/**
 * Parse the publications page into feed items.
 *
 * Walks `<h2>` and `<li>` elements in document order inside `<article>` (or the
 * whole document when there is none). An `<h2>` containing a year sets the year
 * for the entries that follow; each `<li>` is parsed as one entry. Items with a
 * GUID that was already seen are dropped (the first occurrence is kept).
 *
 * @param {string} html - Page HTML.
 * @param {string} finalUrl - Page URL used to resolve links.
 * @returns {Array<object>} Parsed feed items in page order.
 */
function parseItems(html, finalUrl) {
  const root = _deps.parseHtml(html);
  const scope = root.querySelector("article") ?? root;
  const itemsByGuid = new Map(); // insertion-ordered; first item per guid wins
  let currentYear;

  for (const node of scope.querySelectorAll("h2, li")) {
    switch (node.tagName?.toLowerCase()) {
      case "h2": {
        const year = parseYear(node.textContent);
        if (year != null) currentYear = year;
        break;
      }
      case "li": {
        const item = parseOneEntry(node, currentYear, finalUrl);
        if (item && !itemsByGuid.has(item.guid)) itemsByGuid.set(item.guid, item);
        break;
      }
      default:
        break;
    }
  }
  return [...itemsByGuid.values()];
}
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
/**
 * Plugin entry point: fetch the publications page and return its entries.
 *
 * @param {string} sourceId - The listing URL to fetch.
 * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
 * @returns {Promise<Array<object>>} Parsed feed items (never empty).
 * @throws {Error} When no entry could be parsed from the page.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const page = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
  // Fall back to the requested URL when the fetcher reports no final URL.
  const items = parseItems(page.html, page.finalUrl || sourceId);
  if (items.length > 0) return items;
  throw new Error("[pjlab-adg-publications] 未解析到论文条目,页面结构可能已变化");
}
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
// Plugin descriptor: id, the listing URLs this plugin handles, and the fetch entry point.
const pjlabAdgPublicationsPlugin = {
  id: SITE_ID,
  listUrlPattern: /^https?:\/\/pjlab-adg\.github\.io\/publications\/?(\?.*)?$/i,
  fetchItems,
};

export default pjlabAdgPublicationsPlugin;
|
|
@@ -12,6 +12,16 @@ function trimUrl(s) {
|
|
|
12
12
|
return t || undefined;
|
|
13
13
|
}
|
|
14
14
|
|
|
15
|
+
/**
 * rss-parser often flattens several authors into one string separated by commas
 * (ASCII "," or the fullwidth Chinese comma U+FF0C); split it into an array for storage.
 *
 * @param {unknown} authorRaw - Raw author value from the feed item.
 * @returns {string[] | undefined} Trimmed, non-empty author names, or undefined
 *   when the input is not a string or contains no names.
 */
function authorsFromCommaText(authorRaw) {
  if (typeof authorRaw !== "string") return undefined;
  const parts = authorRaw
    // \uFF0C is written as an escape so the fullwidth comma survives re-encoding of this file.
    .split(/[,\uFF0C]/)
    .map((s) => s.trim())
    .filter(Boolean);
  return parts.length > 0 ? parts : undefined;
}
|
|
24
|
+
|
|
15
25
|
/** 从 rss-parser 条目上尽量取出配图 URL(入库用 imageUrl,与 Gateway 的 cover_img 对齐)。 */
|
|
16
26
|
function extractItemImageUrl(item) {
|
|
17
27
|
const enc = item.enclosure;
|
|
@@ -127,7 +137,7 @@ export default {
|
|
|
127
137
|
: new Date();
|
|
128
138
|
const authorRaw =
|
|
129
139
|
typeof item.creator === "string" ? item.creator : typeof item.author === "string" ? item.author : undefined;
|
|
130
|
-
const author = authorRaw
|
|
140
|
+
const author = authorsFromCommaText(authorRaw);
|
|
131
141
|
const summary =
|
|
132
142
|
typeof item.summary === "string" ? item.summary : typeof item.contentSnippet === "string" ? item.contentSnippet : undefined;
|
|
133
143
|
const content =
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
// Host-provided dependency bag; assigned from ctx.deps at the start of fetchItems
// and read by the helpers below (createHash, parseHtml).
let _deps;
|
|
2
|
+
|
|
3
|
+
// SelectDataset 插件:解析首页/搜索页 Nuxt payload,输出数据集条目(不含 enrich)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
// Site origin: prefix for dataset detail URLs and the default base for relative hrefs.
const SELECT_DATASET_ORIGIN = "https://www.selectdataset.com";
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
/**
 * Flatten a value into one trimmed line of text.
 *
 * Callers pass fields decoded from a remote JSON payload, so the input is not
 * guaranteed to be a string: numbers are stringified, and anything else that is
 * not a string (null, undefined, objects, booleans) is treated as empty instead
 * of throwing.
 *
 * @param {unknown} text - Raw value.
 * @returns {string} Text with whitespace runs collapsed to one space and trimmed.
 */
function normalizeText(text) {
  let source = "";
  if (typeof text === "string") {
    source = text;
  } else if (typeof text === "number" || typeof text === "bigint") {
    source = String(text);
  }
  return source.replace(/\s+/g, " ").trim();
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
/**
 * Derive a stable item GUID from a seed string.
 *
 * @param {string} input - Seed value (here: the dataset link).
 * @returns {string} Hex-encoded SHA-256 digest produced by the injected createHash.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256");
  const fed = hasher.update(input);
  return fed.digest("hex");
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
/**
 * Parse a dataset timestamp into a Date.
 *
 * Accepts `YYYY-M-D` optionally followed by ` ` or `T` and `H:M[:S]`. Such values
 * are interpreted in UTC+08:00. Any other text is handed to the Date constructor.
 * Empty input, the literal "0", and anything unparsable yield the current time.
 *
 * @param {string | null | undefined} dateText - Raw timestamp text.
 * @returns {Date} A valid Date (never Invalid Date).
 */
function parseDate(dateText) {
  const text = normalizeText(dateText);
  if (text === "" || text === "0") return new Date();

  const pattern = /^(\d{4})-(\d{1,2})-(\d{1,2})(?:[ T](\d{1,2}):(\d{1,2})(?::(\d{1,2}))?)?$/;
  const parts = pattern.exec(text);

  let candidate;
  if (parts === null) {
    candidate = new Date(text);
  } else {
    // Missing time components default to "0"; every component is zero-padded to two digits.
    const pad = (value) => (value ?? "0").padStart(2, "0");
    const [, y, mm, dd, hh, mi, ss] = parts;
    // Site times are primarily China time; append +08:00 explicitly so the
    // runtime's local timezone cannot affect ordering.
    candidate = new Date(`${y}-${pad(mm)}-${pad(dd)}T${pad(hh)}:${pad(mi)}:${pad(ss)}+08:00`);
  }
  return Number.isNaN(candidate.getTime()) ? new Date() : candidate;
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
/**
 * Build the dataset detail URL for a dataset id.
 *
 * @param {string | null | undefined} datasetId - Dataset identifier.
 * @returns {string | null} `<origin>/dataset/<id>`, or null when the id is empty.
 */
function toAbsoluteDatasetUrl(datasetId) {
  const cleanedId = normalizeText(datasetId);
  return cleanedId ? `${SELECT_DATASET_ORIGIN}/dataset/${cleanedId}` : null;
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
/**
 * Drop items without a link and keep only the first item per link.
 *
 * @param {Iterable<object | null | undefined>} items - Candidate feed items.
 * @returns {Array<object>} Unique items, in first-seen order.
 */
function dedupeItems(items) {
  const firstByLink = new Map();
  for (const item of items) {
    const link = item?.link;
    if (link && !firstByLink.has(link)) firstByLink.set(link, item);
  }
  return Array.from(firstByLink.values());
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
/**
 * Build feed items from dataset links found in the page DOM.
 *
 * Considers every anchor whose href contains "/dataset/", keeps those that
 * resolve to an http(s) URL whose path has "/dataset/" followed by at least 16
 * alphanumeric characters, and uses the anchor text as the title. No date is
 * available here, so `pubDate` is the time of parsing.
 *
 * @param {string} html - Page HTML.
 * @param {string | undefined} finalUrl - Page URL; the site origin is used when empty.
 * @returns {Array<object>} Items de-duplicated by link.
 */
function parseFromAnchorDom(html, finalUrl) {
  const root = _deps.parseHtml(html);
  const baseUrl = finalUrl || SELECT_DATASET_ORIGIN;

  const resolveDatasetLink = (href) => {
    if (!href) return null;
    let url;
    try {
      url = new URL(href, baseUrl);
    } catch {
      return null;
    }
    const isHttp = /^https?:$/i.test(url.protocol);
    const isDatasetPath = /\/dataset\/[A-Za-z0-9]{16,}/.test(url.pathname);
    return isHttp && isDatasetPath ? url.href : null;
  };

  const items = [];
  for (const anchor of root.querySelectorAll('a[href*="/dataset/"]')) {
    const link = resolveDatasetLink(anchor.getAttribute("href"));
    if (!link) continue;
    const title = normalizeText(anchor.textContent);
    if (!title) continue;
    items.push({ guid: hashGuid(link), title, link, pubDate: new Date(), summary: undefined });
  }
  return dedupeItems(items);
}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
/**
 * Create a resolver for a flattened Nuxt payload table (devalue format).
 *
 * In this format every slot `table[i]` holds either a literal primitive or a
 * container (object/array) whose members are *indices* of other slots. A number
 * is therefore only a reference when it appears inside a container; a number
 * stored directly in a slot is the literal value itself and must not be followed.
 *
 * @param {Array<unknown>} table - The parsed payload array.
 * @returns {{ decodeRef: (index: number) => unknown }} `decodeRef(i)` returns the
 *   fully resolved value of slot `i`. Invalid indices resolve to undefined, and a
 *   back-reference to a slot that is still being decoded (a cycle) resolves to undefined.
 */
function createNuxtResolver(table) {
  const cache = new Map();
  const inProgress = new Set();
  // devalue encodes a few non-JSON values as negative pseudo-indices.
  // -1 (undefined) and -2 (array hole) are absent here and so resolve to undefined.
  const SENTINELS = new Map([
    [-3, NaN],
    [-4, Infinity],
    [-5, -Infinity],
    [-6, -0],
  ]);
  // Nuxt wrapper tags whose single payload index is the wrapped value.
  const UNWRAP_TAGS = new Set(["Reactive", "ShallowReactive"]);

  // Resolve a reference (an index into the table).
  function decodeRef(index) {
    if (typeof index !== "number" || !Number.isInteger(index)) return undefined;
    if (index < 0) return SENTINELS.get(index);
    if (index >= table.length) return undefined;
    if (cache.has(index)) return cache.get(index);
    if (inProgress.has(index)) return undefined; // cycle guard

    inProgress.add(index);
    let decoded;
    try {
      decoded = decodeSlot(table[index]);
    } finally {
      inProgress.delete(index);
    }
    cache.set(index, decoded);
    return decoded;
  }

  // Decode the raw content of one slot: primitives are literals, containers hold references.
  function decodeSlot(raw) {
    if (raw === null || typeof raw !== "object") return raw;

    if (Array.isArray(raw)) {
      // A leading string marks a typed value such as ["Reactive", idx] or ["Set", ...idx].
      if (typeof raw[0] === "string") return decodeTagged(raw);
      return raw.map((ref) => decodeRef(ref));
    }

    const out = {};
    for (const [key, ref] of Object.entries(raw)) {
      // Assigning "__proto__" would swap the prototype of `out` with remote data.
      if (key === "__proto__") continue;
      out[key] = decodeRef(ref);
    }
    return out;
  }

  // Decode a typed array of the form [tag, ...payload].
  function decodeTagged(raw) {
    const [tag, ...payload] = raw;
    if (UNWRAP_TAGS.has(tag)) return decodeRef(payload[0]);
    if (tag === "Set") return payload.map((ref) => decodeRef(ref)); // surfaced as a plain array
    // Unknown tag: keep the tag, resolve numeric members as references and leave
    // literal members (e.g. the ISO string of a "Date") untouched.
    return [tag, ...payload.map((member) => (typeof member === "number" ? decodeRef(member) : member))];
  }

  return { decodeRef };
}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
/**
 * Convert a decoded dataset record into a feed item.
 *
 * @param {object} record - Decoded payload record (reads dataset_name, id,
 *   dataset_desc, ext_host_name, date_index_update, date_dataset_update).
 * @returns {object | null} The feed item, or null when the name or id is missing.
 */
function toFeedItem(record) {
  const title = normalizeText(record.dataset_name);
  const link = toAbsoluteDatasetUrl(record.id);
  if (!(title && link)) return null;

  const description = normalizeText(record.dataset_desc);
  const host = normalizeText(record.ext_host_name);
  // Index update time takes precedence over the dataset's own update time.
  const updatedAt = record.date_index_update || record.date_dataset_update;

  return {
    guid: hashGuid(link),
    title,
    link,
    pubDate: parseDate(updatedAt),
    author: host || undefined,
    summary: description || undefined,
  };
}
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
/**
 * Build feed items from the page's `#__NUXT_DATA__` JSON payload.
 *
 * Every table slot that is a plain object with both `dataset_name` and `id` keys
 * is resolved and converted to a feed item. The result is de-duplicated by link
 * and sorted newest first.
 *
 * @param {string} html - Page HTML containing the payload element.
 * @returns {Array<object>} Feed items; empty when the payload is missing, is not
 *   valid JSON, or is not an array.
 */
function parseFromNuxtPayload(html) {
  const payload = _deps.parseHtml(html).querySelector("#__NUXT_DATA__")?.textContent;
  if (!payload) return [];

  let table;
  try {
    table = JSON.parse(payload);
  } catch {
    return [];
  }
  if (!Array.isArray(table)) return [];

  const { decodeRef } = createNuxtResolver(table);
  const looksLikeDataset = (entry) =>
    entry != null &&
    typeof entry === "object" &&
    !Array.isArray(entry) &&
    "dataset_name" in entry &&
    "id" in entry;

  const items = [];
  table.forEach((entry, index) => {
    if (!looksLikeDataset(entry)) return;
    const decoded = decodeRef(index);
    if (!decoded || typeof decoded !== "object") return;
    const item = toFeedItem(decoded);
    if (item) items.push(item);
  });

  return dedupeItems(items).sort((a, b) => b.pubDate.getTime() - a.pubDate.getTime());
}
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
/**
 * Plugin entry point: fetch a SelectDataset listing page and return its datasets.
 *
 * @param {string} sourceId - The listing URL to fetch.
 * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
 * @returns {Promise<Array<object>>} Parsed feed items (never empty).
 * @throws {Error} When neither the DOM links nor the payload yield any dataset.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;

  const purified = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
  const anchorItems = parseFromAnchorDom(purified.html, purified.finalUrl);
  if (anchorItems.length > 0) return anchorItems;

  // When the purified HTML has no stable dataset links, fall back to the
  // unpurified payload to obtain dataset ids and update times.
  const raw = await ctx.fetchHtml(sourceId, { waitMs: 3500, purify: false });
  const payloadItems = parseFromNuxtPayload(raw.html);
  if (payloadItems.length === 0) {
    throw new Error("[selectdataset] 未解析到数据集条目,页面结构可能已变化");
  }
  return payloadItems;
}
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
// Plugin descriptor: id, the listing URLs this plugin handles (home, search, subject),
// and the fetch entry point.
const selectDatasetPlugin = {
  id: "selectdataset",
  listUrlPattern: /^https?:\/\/(www\.)?selectdataset\.com\/(?:$|\?.*|search(?:\?.*)?|subject(?:\?.*)?)$/i,
  fetchItems,
};

export default selectDatasetPlugin;
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
// Host-provided dependency bag; assigned from ctx.deps at the start of fetchItems
// and read by the helpers below (createHash, parseHtml).
let _deps;
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
// Plugin identifier; exported as the plugin `id` and used as the error-message prefix.
const SITE_ID = "sensetime-tech-achievements";
|
|
5
|
+
// Matches a standalone 20xx-M-D date; captures year, month and day. Deliberately
// non-global so that .test()/.match() calls carry no lastIndex state between uses.
const DATE_RE = /\b(20\d{2})-(\d{1,2})-(\d{1,2})\b/;
|
|
6
|
+
|
|
7
|
+
/**
 * Flatten a value into one trimmed line of text.
 *
 * Besides DOM text, callers pass fields from a JSON API response, so the input
 * is not guaranteed to be a string: numbers are stringified, and anything else
 * that is not a string (null, undefined, objects, booleans) is treated as empty
 * instead of throwing.
 *
 * @param {unknown} text - Raw value.
 * @returns {string} Text with whitespace runs collapsed to one space and trimmed.
 */
function normalizeText(text) {
  let source = "";
  if (typeof text === "string") {
    source = text;
  } else if (typeof text === "number" || typeof text === "bigint") {
    source = String(text);
  }
  return source.replace(/\s+/g, " ").trim();
}
|
|
10
|
+
|
|
11
|
+
/**
 * Derive a stable item GUID from a seed string.
 *
 * @param {string} input - Seed value (here: the article link).
 * @returns {string} Hex-encoded SHA-256 digest produced by the injected createHash.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256");
  const fed = hasher.update(input);
  return fed.digest("hex");
}
|
|
14
|
+
|
|
15
|
+
/**
 * Resolve an href against a base URL, keeping only http(s) targets.
 *
 * @param {string | null | undefined} rawHref - The href attribute value.
 * @param {string | undefined} baseUrl - Base used to resolve relative hrefs.
 * @returns {string | null} The absolute URL, or null for empty hrefs, fragment-only
 *   links, `javascript:` links, unparsable values and non-http(s) schemes.
 */
function toAbsoluteUrl(rawHref, baseUrl) {
  if (!rawHref) return null;

  const candidate = rawHref.trim();
  const rejectedPrefixes = ["#", "javascript:"];
  if (!candidate || rejectedPrefixes.some((prefix) => candidate.startsWith(prefix))) {
    return null;
  }

  try {
    const { protocol, href } = new URL(candidate, baseUrl);
    if (/^https?:$/i.test(protocol)) return href;
  } catch {
    // Unresolvable href: fall through to the null result below.
  }
  return null;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Parse the first `20xx-M-D` date found in a text as midnight UTC.
 *
 * @param {string | null | undefined} dateText - Text that may contain a date.
 * @returns {Date | undefined} The parsed date, or undefined when the text has no
 *   date or the matched digits do not form a parsable date (e.g. month 13).
 *   Returning undefined rather than an Invalid Date lets callers'
 *   `?? new Date()` fallback take effect.
 */
function parseDate(dateText) {
  const m = normalizeText(dateText).match(DATE_RE);
  if (!m) return undefined;
  const [, y, mm, dd] = m;
  const parsed = new Date(`${y}-${mm.padStart(2, "0")}-${dd.padStart(2, "0")}T00:00:00.000Z`);
  return Number.isNaN(parsed.getTime()) ? undefined : parsed;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Find the date string associated with an article link.
 *
 * Checks the anchor's own text first, then each ancestor's text, visiting at
 * most 8 nodes in total, and returns the first `20xx-M-D` match.
 *
 * @param {object} anchor - Parsed anchor element.
 * @returns {string} The matched date text, or "" when none is found.
 */
function extractDateText(anchor) {
  let node = anchor;
  let visited = 0;
  while (node && visited < 8) {
    const hit = DATE_RE.exec(normalizeText(node.textContent));
    if (hit) return hit[0];
    node = node.parentNode ?? null;
    visited += 1;
  }
  return "";
}
|
|
46
|
+
|
|
47
|
+
/**
 * Collect tag labels shown near an article link.
 *
 * Starting at the anchor and moving up through at most 6 nodes, gathers the text
 * of all `<span>` descendants, discarding empty text, the title, the date text
 * and anything containing a date. The first level that yields any tag wins.
 *
 * @param {object} anchor - Parsed anchor element.
 * @param {string} title - Article title to exclude.
 * @param {string} dateText - Date text to exclude.
 * @returns {string[]} Unique tag labels in first-seen order (possibly empty).
 */
function extractTags(anchor, title, dateText) {
  const isTag = (text) =>
    Boolean(text) && text !== title && text !== dateText && !DATE_RE.test(text);

  let node = anchor;
  for (let depth = 0; depth < 6 && node; depth += 1) {
    const spans = node.querySelectorAll?.("span") ?? [];
    const tags = spans.map((span) => normalizeText(span.textContent)).filter(isTag);
    if (tags.length > 0) return [...new Set(tags)];
    node = node.parentNode ?? null;
  }
  return [];
}
|
|
61
|
+
|
|
62
|
+
/**
 * Build feed items from article links found in the page DOM.
 *
 * Considers anchors whose href contains "/technology-new-detail/". The anchor
 * text is the title; the date and tags are looked up around the anchor. Each
 * link is emitted once: the first anchor for that link with a non-empty title.
 *
 * @param {string} html - Page HTML.
 * @param {string} finalUrl - Page URL used to resolve relative hrefs.
 * @returns {Array<object>} Feed items in page order.
 */
function parseItemsFromHtml(html, finalUrl) {
  const root = _deps.parseHtml(html);
  const itemsByLink = new Map(); // insertion-ordered; also serves as the "seen" set

  for (const anchor of root.querySelectorAll('a[href*="/technology-new-detail/"]')) {
    const link = toAbsoluteUrl(anchor.getAttribute("href"), finalUrl);
    if (!link || itemsByLink.has(link)) continue;

    const title = normalizeText(anchor.textContent);
    // An untitled anchor does not claim the link; a later titled one may still be used.
    if (!title) continue;

    const dateText = extractDateText(anchor);
    const tagText = extractTags(anchor, title, dateText).join(" / ");
    const summary = [dateText, tagText].filter(Boolean).join(" | ");

    itemsByLink.set(link, {
      guid: hashGuid(link),
      title,
      link,
      pubDate: parseDate(dateText) ?? new Date(),
      summary: summary || undefined,
    });
  }

  return [...itemsByLink.values()];
}
|
|
92
|
+
|
|
93
|
+
/**
 * Fallback: load the article list from the site's REST endpoint.
 *
 * NOTE(review): the numeric path segments (…/48/1/20/0/0) presumably select
 * category 48, page 1, 20 rows — confirm against the site's API.
 *
 * @param {string} finalUrl - Page URL; only its origin is used.
 * @returns {Promise<Array<object>>} Feed items de-duplicated by link. Empty when
 *   the response status is not OK, the body is not valid JSON, or
 *   `data.lists` is not an array.
 * @throws {TypeError} When `finalUrl` is not a valid URL; network failures from
 *   `fetch` also propagate.
 */
async function fetchItemsFromApi(finalUrl) {
  const origin = new URL(finalUrl).origin;
  const apiUrl = new URL("/rest/v1/contents/1/getlistbyparam/48/1/20/0/0?scene=1", origin);

  const res = await fetch(apiUrl, {
    headers: {
      Accept: "application/json,text/plain,*/*",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    },
  });
  if (!res.ok) return [];

  let data;
  try {
    data = await res.json();
  } catch {
    // A non-JSON body (e.g. an HTML error page served with status 200) is treated
    // like a failed response so the caller can report "nothing parsed".
    return [];
  }

  const list = Array.isArray(data?.data?.lists) ? data.data.lists : [];
  const items = [];
  const seen = new Set();

  for (const row of list) {
    const contentId = String(row?.contentId ?? "").trim();
    const title = normalizeText(row?.title);
    if (!contentId || !title) continue;
    // Encode the id so an unexpected value cannot alter the path or query of the link.
    const link = new URL(
      `/cn/technology-new-detail/${encodeURIComponent(contentId)}?categoryId=48`,
      origin
    ).href;
    if (seen.has(link)) continue;
    seen.add(link);

    const dateText = normalizeText(row?.createTime);
    const tags = Array.isArray(row?.tagnames)
      ? row.tagnames.map((x) => normalizeText(x)).filter(Boolean)
      : [];
    const summary = [dateText, tags.join(" / ")].filter(Boolean).join(" | ");
    const pubDate = parseDate(dateText) ?? new Date();

    items.push({
      guid: hashGuid(link),
      title,
      link,
      pubDate,
      summary: summary || undefined,
    });
  }

  return items;
}
|
|
137
|
+
|
|
138
|
+
/**
 * Plugin entry point: fetch the achievements page and return its articles.
 *
 * Parses the rendered HTML first and falls back to the REST endpoint when the
 * page yields nothing.
 *
 * @param {string} sourceId - The listing URL to fetch.
 * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
 * @returns {Promise<Array<object>>} Parsed feed items (never empty).
 * @throws {Error} When neither the page nor the API yields any entry.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
  // The fetcher may report no final URL; fall back to the requested URL so that
  // relative links still resolve and the API origin can be derived.
  const pageUrl = finalUrl || sourceId;

  const items = parseItemsFromHtml(html, pageUrl);
  if (items.length > 0) return items;

  const fallbackItems = await fetchItemsFromApi(pageUrl);
  if (fallbackItems.length > 0) return fallbackItems;

  throw new Error(`[${SITE_ID}] 未解析到学术成果条目,页面结构或接口可能已变化`);
}
|
|
149
|
+
|
|
150
|
+
// Plugin descriptor: id, the listing URL this plugin handles, and the fetch entry point.
const sensetimeTechAchievementsPlugin = {
  id: SITE_ID,
  listUrlPattern: /^https?:\/\/(www\.)?sensetime\.com\/cn\/technology-achievements(\?.*)?$/i,
  fetchItems,
};

export default sensetimeTechAchievementsPlugin;
|