rssany 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/app/plugins/builtin/agi-eval-evaluation.rssany.js +188 -0
- package/app/plugins/builtin/amii-research-talent.rssany.js +73 -0
- package/app/plugins/builtin/anthropic-research.rssany.js +155 -0
- package/app/plugins/builtin/appen-resources.rssany.js +155 -0
- package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +185 -0
- package/app/plugins/builtin/baaidata-csdn.rssany.js +242 -0
- package/app/plugins/builtin/baidu-research.rssany.js +222 -0
- package/app/plugins/builtin/brightdata-blog.rssany.js +301 -0
- package/app/plugins/builtin/bytedance-seed-research.rssany.js +231 -0
- package/app/plugins/builtin/five-radar.rssany.js +490 -0
- package/app/plugins/builtin/flageval-news.rssany.js +118 -0
- package/app/plugins/builtin/google-deepmind-research.rssany.js +223 -0
- package/app/plugins/builtin/google-research-datasets.rssany.js +171 -0
- package/app/plugins/builtin/google-research.rssany.js +220 -0
- package/app/plugins/builtin/google.rssany.js +187 -0
- package/app/plugins/builtin/hacker-news-newest.rssany.js +130 -0
- package/app/plugins/builtin/harvard-dataverse.rssany.js +166 -0
- package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +185 -0
- package/app/plugins/builtin/lingowhale.rssany.js +119 -0
- package/app/plugins/builtin/meituan-tech.rssany.js +130 -0
- package/app/plugins/builtin/meta-ai-publications.rssany.js +221 -0
- package/app/plugins/builtin/mila-quebec.rssany.js +199 -0
- package/app/plugins/builtin/mit-csail-research.rssany.js +208 -0
- package/app/plugins/builtin/moonshot.rssany.js +127 -0
- package/app/plugins/builtin/opendatalab-news.rssany.js +174 -0
- package/app/plugins/builtin/opendatalab.rssany.js +109 -0
- package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +114 -0
- package/app/plugins/builtin/opendrivelab-publications.rssany.js +130 -0
- package/app/plugins/builtin/opendrivelab.rssany.js +333 -0
- package/app/plugins/builtin/paperswithcode.rssany.js +227 -0
- package/app/plugins/builtin/pjlab-adg-publications.rssany.js +202 -0
- package/app/plugins/builtin/rss.rssany.js +11 -1
- package/app/plugins/builtin/selectdataset.rssany.js +206 -0
- package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +154 -0
- package/app/plugins/builtin/supervisely-blog.rssany.js +159 -0
- package/app/plugins/builtin/uci-ml-repository.rssany.js +111 -0
- package/app/plugins/builtin/venturebeat.rssany.js +97 -0
- package/app/plugins/builtin/worldlabs.rssany.js +129 -0
- package/app/plugins/builtin/x.rssany.js +159 -0
- package/app/plugins/builtin/xiaohongshu.rssany.js +283 -0
- package/app/plugins/builtin/zhipu-research.rssany.js +334 -0
- package/dist/index.js +62 -4
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/webui/build/200.html +6 -6
- package/webui/build/_app/immutable/assets/{0.DjU2hdCQ.css → 0.BB88QFoe.css} +1 -1
- package/webui/build/_app/immutable/assets/{homeFeedPanelStore.BopJZtHu.css → homeFeedPanelStore.iOmfP2qL.css} +1 -1
- package/webui/build/_app/immutable/chunks/CZD-YNDw.js +31 -0
- package/webui/build/_app/immutable/chunks/{C85CNwD2.js → D6VIKef0.js} +1 -1
- package/webui/build/_app/immutable/chunks/{CllQAdvt.js → Dbqx2mXq.js} +1 -1
- package/webui/build/_app/immutable/chunks/DeX-oq5W.js +41 -0
- package/webui/build/_app/immutable/chunks/{CdMsRjxJ.js → dhB8G5Is.js} +1 -1
- package/webui/build/_app/immutable/entry/{app.BcD2eSsQ.js → app.XPso7q7g.js} +2 -2
- package/webui/build/_app/immutable/entry/start.Db4snNCd.js +1 -0
- package/webui/build/_app/immutable/nodes/0.BKTQePmA.js +11 -0
- package/webui/build/_app/immutable/nodes/{1.DU9aYGAb.js → 1.BS3_Rfxm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{10.Db6vw7Ih.js → 10.CyyxDCIS.js} +1 -1
- package/webui/build/_app/immutable/nodes/{11.BaAcorz3.js → 11.CtYgIaGj.js} +1 -1
- package/webui/build/_app/immutable/nodes/{14.DqT4pcrQ.js → 14.D5OEGPR2.js} +1 -1
- package/webui/build/_app/immutable/nodes/{15.CCLbjxnH.js → 15.B4dFN1Gk.js} +1 -1
- package/webui/build/_app/immutable/nodes/{16.DiigpVdP.js → 16.M7ZII7tl.js} +1 -1
- package/webui/build/_app/immutable/nodes/{3.DEcYOQc-.js → 3.7r8v7qkm.js} +1 -1
- package/webui/build/_app/immutable/nodes/{5.CvM1TkLG.js → 5.CHIzoGrb.js} +1 -1
- package/webui/build/_app/immutable/nodes/{6.Dscr6LkS.js → 6.BDBqx-GY.js} +1 -1
- package/webui/build/_app/immutable/nodes/{7.Bp60MobD.js → 7.D5czsDmz.js} +1 -1
- package/webui/build/_app/immutable/nodes/{8.DwSg0MHh.js → 8.pjVNsCdV.js} +1 -1
- package/webui/build/_app/immutable/nodes/{9.BeYOUjxR.js → 9.CsARv1BH.js} +1 -1
- package/webui/build/_app/version.json +1 -1
- package/webui/build/_app/immutable/chunks/CtijX1u3.js +0 -31
- package/webui/build/_app/immutable/chunks/Dv1VCsiB.js +0 -41
- package/webui/build/_app/immutable/entry/start.CbkdJdz1.js +0 -1
- package/webui/build/_app/immutable/nodes/0.DSUDmOx2.js +0 -11
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
// 语鲸 Open API 插件:将语鲸推荐文章 / 今日文章转换为条目列表
|
|
2
|
+
//
|
|
3
|
+
// sourceId 格式:
|
|
4
|
+
// lingowhale://articles — 推荐文章(默认拉取 2 页,可用 ?pages=5 调整)
|
|
5
|
+
// lingowhale://today — 今日文章(自动翻页拉完)
|
|
6
|
+
//
|
|
7
|
+
// 认证参数可内联于 URL query:
|
|
8
|
+
// lingowhale://articles?app_id=xxx&app_secret=yyy
|
|
9
|
+
// 也可通过环境变量提供(优先级低于 URL 参数):
|
|
10
|
+
// LINGOWHALE_APP_ID / LINGOWHALE_APP_SECRET
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
// Base URL of the Lingowhale Open API (v1); the fetch helpers in this file append endpoint paths to it.
const BASE_URL = "https://open.lingowhale.com/open-api/v1";
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
/**
 * Resolve the Lingowhale API credentials for a source.
 *
 * Values supplied in the sourceId query string (`?app_id=...&app_secret=...`)
 * take priority; the LINGOWHALE_APP_ID / LINGOWHALE_APP_SECRET environment
 * variables are the fallback.
 *
 * @param {string} sourceId - Source identifier, e.g. `lingowhale://articles?app_id=x&app_secret=y`.
 * @returns {{ appId: string, appSecret: string }} The resolved credential pair.
 * @throws {Error} When either credential is still empty after both lookups.
 */
function resolveCredentials(sourceId) {
  let appId = process.env.LINGOWHALE_APP_ID ?? "";
  let appSecret = process.env.LINGOWHALE_APP_SECRET ?? "";
  try {
    // Swap the custom scheme for a dummy http origin so the URL parser exposes the query string.
    const { searchParams } = new URL(sourceId.replace(/^lingowhale:/, "http://x"));
    // `||` rather than `??`: an empty `?app_id=` must not wipe out the environment fallback.
    appId = searchParams.get("app_id") || appId;
    appSecret = searchParams.get("app_secret") || appSecret;
  } catch {
    /* fallback to env vars */
  }
  if (!appId || !appSecret) {
    throw new Error(
      "[LingowhalePlugin] 缺少认证信息:请在 sourceId 中提供 ?app_id=&app_secret= 参数," +
        "或设置环境变量 LINGOWHALE_APP_ID / LINGOWHALE_APP_SECRET"
    );
  }
  return { appId, appSecret };
}
|
|
32
|
+
|
|
33
|
+
/**
 * Decide which Lingowhale endpoint a sourceId addresses.
 *
 * Only the part directly after the `lingowhale://` scheme is inspected, so a
 * query value that happens to contain "://today" no longer selects the
 * "today" endpoint by accident.
 *
 * @param {string} sourceId - Source identifier such as `lingowhale://today` or `lingowhale://articles?pages=3`.
 * @returns {"today" | "articles"} "today" for `lingowhale://today...`, otherwise "articles".
 */
function resolveEndpoint(sourceId) {
  return /^lingowhale:\/\/today(?:[/?#]|$)/.test(sourceId) ? "today" : "articles";
}
|
|
36
|
+
|
|
37
|
+
/**
 * Read the page limit from the `pages` query parameter of a sourceId.
 *
 * @param {string} sourceId - Source identifier, optionally carrying `?pages=N`.
 * @returns {number} The parsed positive integer, or 2 when the parameter is
 *   missing, not numeric, not positive, or the sourceId cannot be parsed.
 */
function resolveMaxPages(sourceId) {
  const DEFAULT_PAGES = 2;
  let requested;
  try {
    const query = new URL(sourceId.replace(/^lingowhale:/, "http://x")).searchParams;
    requested = parseInt(query.get("pages") ?? "", 10);
  } catch {
    return DEFAULT_PAGES;
  }
  if (Number.isNaN(requested) || requested <= 0) return DEFAULT_PAGES;
  return requested;
}
|
|
45
|
+
|
|
46
|
+
/**
 * Build the HTTP headers sent with every Lingowhale API request.
 *
 * @param {string} appId - Value for the `X-App-ID` header.
 * @param {string} appSecret - Value for the `X-App-Secret` header.
 * @returns {Record<string, string>} Credential headers plus JSON content negotiation headers.
 */
function buildHeaders(appId, appSecret) {
  const jsonMime = "application/json";
  const headers = {};
  headers["X-App-ID"] = appId;
  headers["X-App-Secret"] = appSecret;
  headers["Content-Type"] = jsonMime;
  headers["Accept"] = jsonMime;
  return headers;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Remove `<hl>` / `</hl>` markers from a string, keeping the text between them.
 *
 * @param {string} text - Text that may contain `<hl>` tags.
 * @returns {string} The text with every opening and closing `<hl>` tag removed.
 */
function stripHl(text) {
  const highlightTag = /<\/?hl>/g;
  return text.replace(highlightTag, "");
}
|
|
58
|
+
|
|
59
|
+
/**
 * Convert one Lingowhale API article record into a feed item.
 *
 * @param {object} article - Raw article; the fields read are `entry_id`,
 *   `orig_url`, `title`, `pub_time`, `abstract`, `description` and `html`.
 * @returns {{ guid: *, title: string, link: string, pubDate: Date,
 *   summary: (string|undefined), content: (string|undefined) }} The mapped item.
 */
function mapArticle(article) {
  // `||` (not `??`) so an empty-string orig_url also falls back to the Lingowhale article page.
  const link = article.orig_url || `https://lingowhale.com/article/${article.entry_id}`;
  const rawSummary = article.abstract || article.description;
  return {
    guid: article.entry_id,
    title: article.title ?? "(无标题)",
    link,
    // pub_time is multiplied by 1000, so it is presumably Unix seconds (confirm against
    // the API docs); a missing or zero value falls back to the current time.
    pubDate: article.pub_time ? new Date(article.pub_time * 1000) : new Date(),
    summary: rawSummary ? stripHl(rawSummary) : undefined,
    content: article.html || undefined,
  };
}
|
|
71
|
+
|
|
72
|
+
/**
 * Fetch recommended articles page by page from `${BASE_URL}/articles`.
 *
 * Paging stops at `maxPages`, on the first empty page, or once the number of
 * collected items reaches the `total` reported by the API. When the response
 * carries no numeric `total`, paging continues up to `maxPages` instead of
 * stopping after the first page.
 *
 * @param {string} appId - API application id.
 * @param {string} appSecret - API application secret.
 * @param {number} maxPages - Maximum number of pages to request (100 items per page).
 * @returns {Promise<object[]>} Items produced by mapArticle, in API order.
 * @throws {Error} On a non-OK HTTP status or when the response `code` is not 0.
 */
async function fetchArticles(appId, appSecret, maxPages) {
  const headers = buildHeaders(appId, appSecret);
  const items = [];
  for (let page = 1; page <= maxPages; page++) {
    const res = await fetch(`${BASE_URL}/articles?page=${page}&page_size=100`, { headers });
    if (!res.ok) throw new Error(`[LingowhalePlugin] HTTP ${res.status} 拉取文章列表失败`);
    const json = await res.json();
    if (json.code !== 0) throw new Error(`[LingowhalePlugin] API 错误:${json.message}`);
    // Tolerate a response without a `data` object instead of failing with a TypeError.
    const pageItems = json.data?.items ?? [];
    items.push(...pageItems.map(mapArticle));
    if (pageItems.length === 0) break;
    // Only trust `total` when the API actually sent one; defaulting it to 0
    // would always satisfy the comparison and end paging after page 1.
    const total = json.data?.total;
    if (typeof total === "number" && items.length >= total) break;
  }
  return items;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Fetch today's articles from `${BASE_URL}/articles/today`, following the
 * `nextCursor` value returned by each response.
 *
 * At most 10 requests of 20 items each are made. Paging stops early when the
 * response has no next cursor, the page is empty, or the cursor does not
 * advance (which would otherwise refetch the same page repeatedly).
 *
 * @param {string} appId - API application id.
 * @param {string} appSecret - API application secret.
 * @returns {Promise<object[]>} Items produced by mapArticle, in API order.
 * @throws {Error} On a non-OK HTTP status or when the response `code` is not 0.
 */
async function fetchTodayArticles(appId, appSecret) {
  const headers = buildHeaders(appId, appSecret);
  const items = [];
  let cursor;
  for (let round = 0; round < 10; round++) {
    const cursorParam = cursor ? `&cursor=${encodeURIComponent(cursor)}` : "";
    const res = await fetch(`${BASE_URL}/articles/today?page_size=20${cursorParam}`, { headers });
    if (!res.ok) throw new Error(`[LingowhalePlugin] HTTP ${res.status} 拉取今日文章失败`);
    const json = await res.json();
    if (json.code !== 0) throw new Error(`[LingowhalePlugin] API 错误:${json.message}`);
    // Tolerate a response without a `data` object instead of failing with a TypeError.
    const pageItems = json.data?.items ?? [];
    items.push(...pageItems.map(mapArticle));
    const nextCursor = json.data?.nextCursor;
    if (!nextCursor || pageItems.length === 0 || nextCursor === cursor) break;
    cursor = nextCursor;
  }
  return items;
}
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
/**
 * Lingowhale plugin definition: handles `lingowhale://` source ids and
 * declares a refresh interval of "1h".
 */
const lingowhalePlugin = {
  id: "lingowhale",
  listUrlPattern: /^lingowhale:\/\//,
  refreshInterval: "1h",

  /**
   * Resolve credentials, endpoint and page limit from the sourceId, then
   * fetch from either the "today" endpoint or the paged article list.
   */
  async fetchItems(sourceId, _ctx) {
    const credentials = resolveCredentials(sourceId);
    const endpoint = resolveEndpoint(sourceId);
    const maxPages = resolveMaxPages(sourceId);
    return endpoint === "today"
      ? fetchTodayArticles(credentials.appId, credentials.appSecret)
      : fetchArticles(credentials.appId, credentials.appSecret, maxPages);
  },
};

export default lingowhalePlugin;
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
// Host-provided helpers taken from ctx.deps; assigned at the start of fetchItems and
// used for parseHtml there and for createHash in hashGuid.
let _deps;
|
|
2
|
+
|
|
3
|
+
// 美团技术团队博客:https://tech.meituan.com/
|
|
4
|
+
// 列表页解析 .post-container,可选 enrichItem 拉取正文
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
/**
 * Collapse every run of whitespace to a single space and trim both ends.
 *
 * @param {string|null|undefined} text - Raw text; null/undefined are treated as "".
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  const source = text ?? "";
  const collapsed = source.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
10
|
+
|
|
11
|
+
/**
 * Resolve an href against a base URL, accepting only http(s) results.
 *
 * @param {string|null|undefined} href - Possibly relative link.
 * @param {string} baseUrl - URL used to resolve relative links.
 * @returns {string|null} The absolute URL, or null when href is empty,
 *   cannot be parsed, or does not resolve to an http/https URL.
 */
function toAbsoluteUrl(href, baseUrl) {
  if (!href) return null;
  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Parse a date written like "2026年03月13日" or "2026-03-13" (slashes are
 * accepted as separators too). The result is midnight at UTC+08:00.
 *
 * @param {string|null|undefined} text - Text containing the date.
 * @returns {Date|undefined} The parsed date, or undefined when no date is
 *   found or the components do not form a parsable date.
 */
function parseDate(text) {
  const match = normalizeText(text).match(/(\d{4})[年/-](\d{1,2})[月/-](\d{1,2})/);
  if (!match) return undefined;
  const year = match[1];
  const month = match[2].padStart(2, "0");
  const day = match[3].padStart(2, "0");
  const parsed = new Date(`${year}-${month}-${day}T00:00:00+08:00`);
  if (Number.isNaN(parsed.getTime())) return undefined;
  return parsed;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Derive an item guid from its link: the "sha256" digest in "hex" encoding,
 * computed with the host-provided `_deps.createHash`.
 *
 * Requires `_deps` to have been assigned (fetchItems does this) before use.
 *
 * @param {string} link - Item link to hash.
 * @returns {string} Hex digest string.
 */
function hashGuid(link) {
  const hasher = _deps.createHash("sha256").update(link);
  return hasher.digest("hex");
}
|
|
36
|
+
|
|
37
|
+
/**
 * Scrape the Meituan tech blog list page into feed items.
 *
 * Each `.post-container` element yields one item: the link comes from the
 * title anchor (or the "more" link), and date, author, summary and tags are
 * read from the container. Links that do not point at tech.meituan.com (or a
 * subdomain of it) and duplicate links are skipped.
 *
 * @param {string} sourceId - List page URL.
 * @param {object} ctx - Plugin context; `ctx.fetchHtml` and `ctx.deps`
 *   (`parseHtml`, `createHash`) are used.
 * @returns {Promise<object[]>} Parsed items in page order.
 * @throws {Error} When no item could be parsed from the page.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, {
    waitMs: 5000,
    waitForSelector: ".post-container",
    waitForSelectorTimeoutMs: 25000,
  });
  const root = _deps.parseHtml(html);
  const baseUrl = finalUrl || sourceId;

  const items = [];
  const seen = new Set();

  // One selector is enough: ".post-container-wrapper .post-container" can only
  // match a subset of ".post-container", so retrying with it never finds more.
  for (const el of root.querySelectorAll(".post-container")) {
    const titleEl = el.querySelector(".post-title a[href]");
    // Prefer the title link; fall back to the "more" link of the excerpt.
    const href =
      titleEl?.getAttribute("href") || el.querySelector("a.more-link[href]")?.getAttribute("href");
    if (!href) continue;

    const link = toAbsoluteUrl(href, baseUrl);
    if (!link) continue;

    let hostname;
    try {
      ({ hostname } = new URL(link));
    } catch {
      continue;
    }
    // Accept the exact host or a real subdomain only; a bare endsWith() check
    // would also accept look-alike hosts such as "eviltech.meituan.com".
    if (hostname !== "tech.meituan.com" && !hostname.endsWith(".tech.meituan.com")) continue;

    if (seen.has(link)) continue;
    seen.add(link);

    const title =
      normalizeText(titleEl?.textContent) ||
      normalizeText(el.querySelector(".post-title")?.textContent);
    if (!title) continue;

    // Fall back to the fetch time when the list entry shows no parsable date.
    const pubDate = parseDate(el.querySelector(".m-post-date")?.textContent) ?? new Date();

    const authorRaw = normalizeText(el.querySelector(".m-post-nick")?.textContent);
    const author = authorRaw ? [authorRaw] : undefined;

    const summaryEl = el.querySelector(".post-content.post-expect");
    let summary = "";
    if (summaryEl) {
      // Work on a copy so removing the "more" link does not alter the parsed tree.
      const clone = summaryEl.clone();
      const moreLink = clone.querySelector("a.more-link");
      if (moreLink) moreLink.remove();
      summary = normalizeText(clone.textContent);
    }

    const tagLinks = el.querySelectorAll(".tag-links a[rel='tag']");
    const categories = tagLinks.length
      ? Array.from(tagLinks).map((a) => normalizeText(a.textContent)).filter(Boolean)
      : undefined;

    items.push({
      guid: hashGuid(link),
      title,
      link,
      pubDate,
      author,
      summary: summary || undefined,
      categories,
      sourceRef: sourceId,
    });
  }

  if (items.length === 0) {
    throw new Error("[meituan-tech] 未解析到文章条目,请检查列表页结构是否变化");
  }

  return items;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Optional enrichment hook: delegates to `ctx.extractItem` to fetch the
 * detail page body (per the original note, this is the host's default
 * Readability-based extraction).
 *
 * @param {object} item - Item produced by fetchItems.
 * @param {object} ctx - Plugin context providing `extractItem`.
 * @returns {Promise<*>} Whatever `ctx.extractItem(item)` resolves to.
 */
async function enrichItem(item, ctx) {
  const enriched = await ctx.extractItem(item);
  return enriched;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Meituan tech blog plugin definition: URL patterns for the list page and
 * for dated article pages (/YYYY/MM/DD/<slug>.html), a refresh interval of
 * "1day", and the fetch/enrich hooks defined in this file.
 */
const meituanTechPlugin = {
  id: "meituan-tech",
  listUrlPattern: /^https?:\/\/(www\.)?tech\.meituan\.com(\/.*)?$/i,
  detailUrlPattern: /^https?:\/\/(www\.)?tech\.meituan\.com\/\d{4}\/\d{2}\/\d{2}\/[^/]+\.html(?:\?.*)?$/i,
  refreshInterval: "1day",
  fetchItems,
  enrichItem,
};

export default meituanTechPlugin;
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
// Host-provided helpers taken from ctx.deps; assigned at the start of fetchItems and
// used for parseHtml there and for createHash in hashGuid.
let _deps;
|
|
2
|
+
|
|
3
|
+
// Meta AI Publications 插件:抓取结果页中的 publication 条目(不做正文 enrich)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
// Pathname of a single publication page: /research/publications/<anything without ? or #>.
const PUBLICATION_PATH_RE = /^\/research\/publications\/[^?#]+\/?$/i;

// Results URL on ai.meta.com whose query contains content_types[0]=publication
// (brackets either literal or percent-encoded).
const PUBLICATION_RESULTS_URL_RE =
  /^https?:\/\/ai\.meta\.com\/results\/?\?.*content_types(?:%5B0%5D|\[0\])=publication(?:&.*)?$/i;

// Lower-case English month name -> zero-based month index.
const MONTH_TO_INDEX = Object.fromEntries(
  [
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
  ].map((monthName, index) => [monthName, index])
);
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
/**
 * Remove zero-width characters (U+200B..U+200D, U+FEFF), collapse runs of
 * whitespace to one space, and trim both ends.
 *
 * @param {string|null|undefined} text - Raw text; null/undefined are treated as "".
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  const withoutZeroWidth = (text ?? "").replace(/[\u200B-\u200D\uFEFF]/g, "");
  const collapsed = withoutZeroWidth.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
/**
 * Derive an item guid from a string: the "sha256" digest in "hex" encoding,
 * computed with the host-provided `_deps.createHash`.
 *
 * Requires `_deps` to have been assigned (fetchItems does this) before use.
 *
 * @param {string} input - Value to hash (the item link in this plugin).
 * @returns {string} Hex digest string.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256").update(input);
  return hasher.digest("hex");
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
/**
 * Resolve an anchor href to an absolute publication URL on ai.meta.com.
 *
 * @param {string|null|undefined} rawHref - The href attribute value.
 * @param {string} pageUrl - URL of the page the anchor was found on.
 * @returns {string|null} The absolute URL, or null when the href is empty, a
 *   fragment or `javascript:` link, unparsable, not http(s), not on
 *   ai.meta.com, or its path does not match PUBLICATION_PATH_RE.
 */
function toAbsolutePublicationUrl(rawHref, pageUrl) {
  const href = rawHref ? rawHref.trim() : "";
  if (href === "" || href.startsWith("#") || href.startsWith("javascript:")) return null;

  let url;
  try {
    url = new URL(href, pageUrl);
  } catch {
    return null;
  }

  const isHttp = /^https?:$/i.test(url.protocol);
  const isMetaAiHost = url.hostname === "ai.meta.com";
  if (!isHttp || !isMetaAiHost || !PUBLICATION_PATH_RE.test(url.pathname)) return null;
  return url.href;
}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
/**
 * Parse a date written as "<Month name> <day>, <year>" (e.g. "March 5, 2024").
 * The whole normalized text must be the date; the result is midnight UTC.
 *
 * Impossible calendar dates such as "February 31, 2024" are rejected instead
 * of being rolled over into the following month by Date.UTC.
 *
 * @param {string|null|undefined} rawDate - Text to parse.
 * @returns {Date|undefined} The parsed date, or undefined when the text is
 *   empty, is not in the expected format, or is not a real calendar date.
 */
function parsePubDate(rawDate) {
  const normalized = normalizeText(rawDate);
  if (!normalized) return undefined;
  const m = normalized.match(
    /^(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),\s*(\d{4})$/i
  );
  if (!m) return undefined;
  const month = MONTH_TO_INDEX[m[1].toLowerCase()];
  const day = Number(m[2]);
  const year = Number(m[3]);
  if (month == null) return undefined;
  const date = new Date(Date.UTC(year, month, day, 0, 0, 0));
  // Round-trip check: Date.UTC normalizes out-of-range days (and maps years
  // 0-99 to 1900-1999), so compare the components against what was written.
  if (
    date.getUTCFullYear() !== year ||
    date.getUTCMonth() !== month ||
    date.getUTCDate() !== day
  ) {
    return undefined;
  }
  return date;
}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
/**
 * Build a fallback title from the last path segment of a link: the slug is
 * URL-decoded, dashes/underscores become spaces, and a lower-case letter at
 * each word boundary is upper-cased.
 *
 * @param {string} link - Absolute URL.
 * @returns {string} The derived title, or "" when the link has no path
 *   segment or cannot be parsed/decoded.
 */
function decodeTitleFromLink(link) {
  try {
    const segments = new URL(link).pathname.split("/").filter(Boolean);
    const slug = segments.at(-1) ?? "";
    if (slug === "") return "";
    const words = decodeURIComponent(slug).replace(/[-_]+/g, " ");
    const capitalized = words.replace(/\b([a-z])/g, (letter) => letter.toUpperCase());
    return normalizeText(capitalized);
  } catch {
    return "";
  }
}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
/**
 * Heuristic: does this text look like a category label rather than a title
 * or sentence?
 *
 * Rejected outright: empty text, more than 80 characters, text containing
 * `.`, `!` or `?`, text containing four consecutive digits, and text without
 * any ASCII letter. Otherwise accepted when at least 70% of its ASCII letters
 * are upper-case or when it contains a `|`.
 *
 * @param {string} text - Normalized text.
 * @returns {boolean} True when the text looks like a category label.
 */
function looksLikeCategory(text) {
  if (!text) return false;
  if (text.length > 80) return false;
  if (/[.!?]/.test(text) || /\d{4}/.test(text)) return false;

  const letters = text.match(/[A-Za-z]/g) ?? [];
  if (letters.length === 0) return false;

  const upperCount = letters.filter((ch) => ch >= "A" && ch <= "Z").length;
  if (upperCount / letters.length >= 0.7) return true;
  return text.includes("|");
}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
/**
 * Heuristic: does this text look like a comma-separated author list?
 *
 * True only when the text is 8 to 700 characters long, contains a comma,
 * contains none of `.`, `!`, `?`, and contains at least one ASCII letter.
 *
 * @param {string} text - Normalized text.
 * @returns {boolean} True when the text looks like an author line.
 */
function looksLikeAuthorLine(text) {
  if (!text) return false;
  const { length } = text;
  const lengthOk = length >= 8 && length <= 700;
  return lengthOk && text.includes(",") && !/[.!?]/.test(text) && /[A-Za-z]/.test(text);
}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
/**
 * Collect the normalized text of each node, dropping empty strings and
 * duplicates while keeping first-seen order.
 *
 * @param {Iterable<{textContent: string}>} nodes - Nodes to read text from.
 * @returns {string[]} Distinct, non-empty normalized texts.
 */
function collectUniqueTexts(nodes) {
  const unique = new Set();
  for (const node of nodes) {
    const text = normalizeText(node.textContent);
    if (text) unique.add(text);
  }
  // A Set iterates in insertion order, so first-seen order is preserved.
  return [...unique];
}
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
/**
 * Walk up from a publication anchor (at most 8 nodes, the anchor included)
 * to find the element that represents its card.
 *
 * The first node containing at least two distinct headings and a paragraph
 * that parses as a date is returned immediately. Failing that, the result is
 * the last visited node that has at least one heading and three distinct
 * paragraphs, then the anchor's parent, then the anchor itself.
 *
 * NOTE(review): the fallback keeps the outermost matching ancestor, not the
 * closest one; confirm that this is intended.
 *
 * @param {object} anchor - Parsed anchor element.
 * @returns {object} The element to treat as the card root.
 */
function findCardRoot(anchor) {
  const MAX_LEVELS = 8;
  let fallback = null;
  let level = 0;
  let node = anchor;
  while (node && level < MAX_LEVELS) {
    const headings = collectUniqueTexts(node.querySelectorAll?.("h1, h2, h3, h4, h5, h6") ?? []);
    const paragraphs = collectUniqueTexts(node.querySelectorAll?.("p") ?? []);
    const hasDate = paragraphs.some((paragraph) => parsePubDate(paragraph) != null);
    if (hasDate && headings.length >= 2) return node;
    if (headings.length >= 1 && paragraphs.length >= 3) {
      fallback = node;
    }
    node = node.parentNode ?? null;
    level += 1;
  }
  return fallback ?? anchor.parentNode ?? anchor;
}
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
/**
 * Pick the publication title from a card's headings.
 *
 * Headings that are exactly "publication" / "read the paper" or that look
 * like a category label are ignored. The first remaining heading of at least
 * 8 characters wins, otherwise the last remaining heading; when none remain,
 * the title is derived from the link slug.
 *
 * @param {object} card - Card root element.
 * @param {string} link - Absolute publication URL (used for the fallback).
 * @returns {string} The chosen title; may be "" if even the slug fallback is empty.
 */
function extractTitle(card, link) {
  const boilerplate = /^(publication|read the paper)$/i;
  const candidates = collectUniqueTexts(card.querySelectorAll("h1, h2, h3, h4, h5, h6")).filter(
    (text) => !boilerplate.test(text) && !looksLikeCategory(text)
  );
  const preferred = candidates.find((text) => text.length >= 8);
  const chosen = preferred ?? candidates.at(-1) ?? "";
  return chosen || decodeTitleFromLink(link);
}
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
/**
 * Find the publication date of a card.
 *
 * A parsable `datetime` attribute on a `<time>` element is used first;
 * otherwise the first `p`, `span` or `div` whose whole text parses via
 * parsePubDate supplies the date.
 *
 * @param {object} card - Card root element.
 * @returns {Date|undefined} The date, or undefined when none is found.
 */
function extractPubDateFromCard(card) {
  const stamp = card.querySelector("time[datetime]")?.getAttribute("datetime");
  if (stamp) {
    const fromAttribute = new Date(stamp);
    if (!Number.isNaN(fromAttribute.getTime())) return fromAttribute;
  }

  for (const text of collectUniqueTexts(card.querySelectorAll("p, span, div"))) {
    const fromText = parsePubDate(text);
    if (fromText) return fromText;
  }
  return undefined;
}
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
/**
 * Pick a summary paragraph from a card: the first distinct `<p>` text of at
 * least 40 characters that is not boilerplate ("publication" / "read the
 * paper"), not a date, not an author line, not a category label, and not
 * identical to the title.
 *
 * @param {object} card - Card root element.
 * @param {string} title - Title already chosen for the card.
 * @returns {string|undefined} The summary, or undefined when nothing qualifies.
 */
function extractSummary(card, title) {
  const boilerplate = /^(publication|read the paper)$/i;
  const isBodyText = (text) =>
    !boilerplate.test(text) &&
    parsePubDate(text) == null &&
    !looksLikeAuthorLine(text) &&
    !looksLikeCategory(text) &&
    text !== title;

  const summary = collectUniqueTexts(card.querySelectorAll("p")).find(
    (text) => isBodyText(text) && text.length >= 40
  );
  return summary || undefined;
}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
/**
 * Scrape publication entries from a Meta AI results page.
 *
 * Every anchor whose href contains "/research/publications/" and resolves to
 * a publication URL yields at most one item (links are de-duplicated). Title,
 * date and summary are read from the surrounding card; the date falls back to
 * the fetch time.
 *
 * @param {string} sourceId - Results page URL.
 * @param {object} ctx - Plugin context; `ctx.fetchHtml` and `ctx.deps`
 *   (`parseHtml`, `createHash`) are used.
 * @returns {Promise<object[]>} Parsed items in page order.
 * @throws {Error} When the fetch reports status >= 400 or no item is parsed.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const response = await ctx.fetchHtml(sourceId, { waitMs: 3500, purify: false });
  const { html, finalUrl, status } = response;
  if (status >= 400) {
    throw new Error(`[meta-ai-publications] 抓取失败: HTTP ${status}`);
  }

  const pageUrl = finalUrl || sourceId;
  const document = _deps.parseHtml(html);
  const visited = new Set();
  const items = [];

  for (const anchor of document.querySelectorAll('a[href*="/research/publications/"]')) {
    const link = toAbsolutePublicationUrl(anchor.getAttribute("href"), pageUrl);
    if (!link) continue;
    if (visited.has(link)) continue;
    visited.add(link);

    const card = findCardRoot(anchor);
    const title = extractTitle(card, link);
    if (!title) continue;

    const item = {
      guid: hashGuid(link),
      title,
      link,
      pubDate: extractPubDateFromCard(card) ?? new Date(),
      author: "Meta AI",
      summary: extractSummary(card, title),
      sourceId: "meta-ai-publications",
    };
    items.push(item);
  }

  if (items.length === 0) {
    throw new Error(
      "[meta-ai-publications] 未解析到 publication 条目,可能是页面结构变化,或需要更长等待时间/可用浏览器会话"
    );
  }
  return items;
}
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
/**
 * Meta AI publications plugin definition: matches results URLs filtered to
 * publications and exposes the list fetcher (no enrich hook is declared).
 */
const metaAiPublicationsPlugin = {
  id: "meta-ai-publications",
  listUrlPattern: PUBLICATION_RESULTS_URL_RE,
  fetchItems,
};

export default metaAiPublicationsPlugin;
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
// Host-provided helpers taken from ctx.deps; assigned at the start of fetchItems and
// used for parseHtml there and for createHash in hashGuid.
let _deps;
|
|
2
|
+
|
|
3
|
+
// Mila (Quebec AI Institute) 新闻列表插件:支持首页 /en 与新闻页 /en/news
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
// Site origin; used as the last-resort base URL when resolving links.
const MILA_ORIGIN = "https://mila.quebec";

// Pathname of a single news article: /en/news/<slug> with an optional trailing slash.
const NEWS_PATH_RE = /^\/en\/news\/[^/?#]+\/?$/i;

// Link texts that are calls to action rather than article titles.
const NOISE_TITLE_RE = /^(read the (story|news)|see more news)$/i;

// Three-letter lower-case month abbreviation -> zero-based month index.
const MONTH_TO_INDEX = Object.fromEntries(
  ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"].map(
    (abbreviation, index) => [abbreviation, index]
  )
);
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
/**
 * Collapse every run of whitespace to a single space and trim both ends.
 *
 * @param {string|null|undefined} text - Raw text; null/undefined are treated as "".
 * @returns {string} The normalized text.
 */
function normalizeText(text) {
  const source = text ?? "";
  const singleSpaced = source.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
/**
 * Derive an item guid from a string: the "sha256" digest in "hex" encoding,
 * computed with the host-provided `_deps.createHash`.
 *
 * Requires `_deps` to have been assigned (fetchItems does this) before use.
 *
 * @param {string} input - Value to hash (the item link in this plugin).
 * @returns {string} Hex digest string.
 */
function hashGuid(input) {
  const hasher = _deps.createHash("sha256").update(input);
  return hasher.digest("hex");
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
/**
 * Resolve an anchor href against a base URL, accepting only http(s) results.
 *
 * @param {string|null|undefined} rawHref - The href attribute value.
 * @param {string} baseUrl - URL used to resolve relative links.
 * @returns {string|null} The absolute URL, or null when the href is empty, a
 *   fragment or `javascript:` link, unparsable, or not http/https.
 */
function toAbsoluteHttpUrl(rawHref, baseUrl) {
  const href = rawHref ? rawHref.trim() : "";
  if (href === "" || href.startsWith("#") || href.startsWith("javascript:")) return null;

  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
/**
 * Tell whether an absolute URL points at a single Mila news article.
 *
 * The host must be the MILA_ORIGIN host (with or without a leading "www.")
 * and the path must match NEWS_PATH_RE. Checking the host keeps off-site
 * links that merely share the /en/news/<slug> path shape out of the feed.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} True for a Mila news article URL; false otherwise,
 *   including when the URL cannot be parsed.
 */
function isNewsArticleUrl(url) {
  try {
    const { hostname, pathname } = new URL(url);
    const milaHost = new URL(MILA_ORIGIN).hostname;
    if (hostname !== milaHost && hostname !== `www.${milaHost}`) return false;
    return NEWS_PATH_RE.test(pathname);
  } catch {
    return false;
  }
}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
/**
 * Find the first "<day> <month> <year>" date in a text (e.g. "12 Jan 2024"
 * or "3 September, 2023"). Commas are treated as spaces and month names may
 * be abbreviated. The result is noon UTC on that day.
 *
 * @param {string|null|undefined} text - Text to search.
 * @returns {Date|undefined} The date, or undefined when none is found.
 */
function extractDateFromText(text) {
  const normalized = normalizeText(text).replace(/,/g, " ");
  if (!normalized) return undefined;

  const match = normalized.match(
    /(?:^|\b)(\d{1,2})\s+(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{4})(?:\b|$)/i
  );
  if (!match) return undefined;

  const [, dayText, monthText, yearText] = match;
  // Only the first three letters identify the month ("Sept" -> "sep").
  const monthIdx = MONTH_TO_INDEX[monthText.slice(0, 3).toLowerCase()];
  const day = Number(dayText);
  const year = Number(yearText);
  if (monthIdx == null || !Number.isFinite(day) || !Number.isFinite(year)) return undefined;

  const timestamp = Date.UTC(year, monthIdx, day, 12, 0, 0);
  return Number.isNaN(timestamp) ? undefined : new Date(timestamp);
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
/**
 * Look for a date associated with a node.
 *
 * A parsable `datetime` attribute on a `<time>` element inside the node is
 * used first. Otherwise the text of the node and then of its ancestors (at
 * most 6 nodes in total) is scanned with extractDateFromText.
 *
 * @param {object} node - Parsed element (the article anchor in this plugin).
 * @returns {Date|undefined} The first date found, or undefined.
 */
function extractDateNearNode(node) {
  const stamp = node.querySelector?.("time[datetime]")?.getAttribute("datetime");
  if (stamp) {
    const fromAttribute = new Date(stamp);
    if (!Number.isNaN(fromAttribute.getTime())) return fromAttribute;
  }

  let hops = 0;
  let cursor = node;
  while (cursor && hops < 6) {
    const fromText = extractDateFromText(cursor.textContent);
    if (fromText) return fromText;
    cursor = cursor.parentNode ?? null;
    hops += 1;
  }
  return undefined;
}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
/**
 * Score a candidate link text as an article title; higher is better.
 *
 * Scores: 0 for empty text, 1 for call-to-action noise, 2 for text shorter
 * than 5 characters, otherwise 10 plus the length capped at 120.
 *
 * @param {string|null|undefined} text - Candidate title.
 * @returns {number} The score.
 */
function scoreTitle(text) {
  const normalized = normalizeText(text);
  if (!normalized) return 0;
  if (NOISE_TITLE_RE.test(normalized.toLowerCase())) return 1;
  return normalized.length < 5 ? 2 : 10 + Math.min(normalized.length, 120);
}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
/**
 * Build a fallback title from the last path segment of a link: the slug is
 * URL-decoded and dashes/underscores become spaces.
 *
 * @param {string} link - Absolute article URL.
 * @returns {string} The derived title, or "Mila News" when the link cannot
 *   be parsed/decoded or yields an empty title (no path segment, or a slug
 *   made only of separators).
 */
function titleFromUrl(link) {
  const fallbackTitle = "Mila News";
  try {
    const slug = decodeURIComponent(new URL(link).pathname.split("/").filter(Boolean).pop() ?? "");
    // Use the same fallback as the error path so an item never gets an empty title.
    return normalizeText(slug.replace(/[-_]+/g, " ")) || fallbackTitle;
  } catch {
    return fallbackTitle;
  }
}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
/**
 * Pick a summary near a node: the first non-empty `<p>` text, searching the
 * node and then its ancestors (at most 4 nodes in total), that is neither
 * the title (case-insensitive) nor call-to-action noise.
 *
 * @param {object} node - Parsed element (the article anchor in this plugin).
 * @param {string} title - Title text to exclude.
 * @returns {string|undefined} The summary, or undefined when none is found.
 */
function chooseSummary(node, title) {
  let level = 0;
  let current = node;
  while (current && level < 4) {
    const paragraphs = current.querySelectorAll?.("p") ?? [];
    const texts = paragraphs.map((p) => normalizeText(p.textContent));
    const hit = texts.find((text) => {
      if (!text) return false;
      const lower = text.toLowerCase();
      return lower !== title.toLowerCase() && !NOISE_TITLE_RE.test(lower);
    });
    if (hit !== undefined) return hit;
    current = current.parentNode ?? null;
    level += 1;
  }
  return undefined;
}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
/**
 * Scrape Mila news links from a page into feed items.
 *
 * Every anchor that resolves to a news article URL contributes to one entry
 * per link: the best-scoring link text becomes the title, and the first date
 * and summary found near any of the link's anchors are kept. Items are sorted
 * newest first; a missing date falls back to the fetch time.
 *
 * @param {string} sourceId - Page URL.
 * @param {object} ctx - Plugin context; `ctx.fetchHtml` and `ctx.deps`
 *   (`parseHtml`, `createHash`) are used.
 * @returns {Promise<object[]>} Parsed items, newest first.
 * @throws {Error} When no news link is found.
 */
async function fetchItems(sourceId, ctx) {
  _deps = ctx.deps;
  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 4000 });
  const document = _deps.parseHtml(html);
  const pageUrl = finalUrl || sourceId || MILA_ORIGIN;
  const entries = new Map();

  for (const anchor of document.querySelectorAll("a[href]")) {
    const link = toAbsoluteHttpUrl(anchor.getAttribute("href"), pageUrl);
    if (!link || !isNewsArticleUrl(link)) continue;

    const rawTitle = normalizeText(anchor.textContent);
    const titleScore = scoreTitle(rawTitle);
    const pubDate = extractDateNearNode(anchor);
    const summary = chooseSummary(anchor, rawTitle || "");

    let entry = entries.get(link);
    if (!entry) {
      entry = { link, title: "", titleScore: 0, pubDate: undefined, summary: undefined };
      entries.set(link, entry);
    }

    // The same article is often linked several times; keep the best title
    // and the first date/summary seen.
    if (titleScore > entry.titleScore) {
      entry.title = rawTitle;
      entry.titleScore = titleScore;
    }
    entry.pubDate = entry.pubDate || pubDate;
    entry.summary = entry.summary || summary;
  }

  const items = [];
  for (const entry of entries.values()) {
    const hasRealTitle = entry.title && !NOISE_TITLE_RE.test(entry.title.toLowerCase());
    const title = hasRealTitle ? entry.title : titleFromUrl(entry.link);
    const keepSummary = entry.summary && normalizeText(entry.summary) !== normalizeText(title);
    items.push({
      guid: hashGuid(entry.link),
      title,
      link: entry.link,
      pubDate: entry.pubDate ?? new Date(),
      author: "Mila",
      summary: keepSummary ? entry.summary : undefined,
    });
  }

  items.sort((a, b) => b.pubDate.getTime() - a.pubDate.getTime());

  if (items.length === 0) {
    throw new Error("[mila-quebec] 未解析到新闻条目,页面结构可能已变化");
  }

  return items;
}
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
/**
 * Mila (Quebec AI Institute) plugin definition: matches the /en home page and
 * the /en/news list page (optional "www.", trailing slash and query string)
 * and exposes the list fetcher.
 */
const milaQuebecPlugin = {
  id: "mila-quebec",
  listUrlPattern: /^https?:\/\/(www\.)?mila\.quebec\/en(?:\/news)?(?:\/)?(?:\?.*)?$/i,
  fetchItems,
};

export default milaQuebecPlugin;
|