rssany 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -22
- package/app/plugins/builtin/agi-eval-evaluation.rssany.js +6 -7
- package/app/plugins/builtin/amii-research-talent.rssany.js +6 -7
- package/app/plugins/builtin/anthropic-research.rssany.js +6 -8
- package/app/plugins/builtin/appen-resources.rssany.js +6 -7
- package/app/plugins/builtin/baai-wudao-paper-article.rssany.js +9 -10
- package/app/plugins/builtin/baaidata-csdn.rssany.js +6 -7
- package/app/plugins/builtin/baidu-research.rssany.js +5 -8
- package/app/plugins/builtin/brightdata-blog.rssany.js +6 -11
- package/app/plugins/builtin/bytedance-seed-research.rssany.js +5 -7
- package/app/plugins/builtin/email.rssany.js +9 -9
- package/app/plugins/builtin/five-radar.rssany.js +9 -11
- package/app/plugins/builtin/flageval-news.rssany.js +5 -7
- package/app/plugins/builtin/google-deepmind-research.rssany.js +6 -8
- package/app/plugins/builtin/google-research-datasets.rssany.js +6 -8
- package/app/plugins/builtin/google-research.rssany.js +6 -8
- package/app/plugins/builtin/hacker-news-newest.rssany.js +7 -9
- package/app/plugins/builtin/harvard-dataverse.rssany.js +6 -8
- package/app/plugins/builtin/huaweicloud-bbs-blogs.rssany.js +7 -9
- package/app/plugins/builtin/lingowhale.rssany.js +7 -9
- package/app/plugins/builtin/meituan-tech.rssany.js +7 -10
- package/app/plugins/builtin/meta-ai-publications.rssany.js +6 -11
- package/app/plugins/builtin/mila-quebec.rssany.js +6 -8
- package/app/plugins/builtin/mit-csail-research.rssany.js +7 -9
- package/app/plugins/builtin/moonshot.rssany.js +6 -8
- package/app/plugins/builtin/opendatalab-news.rssany.js +6 -7
- package/app/plugins/builtin/opendatalab.rssany.js +5 -6
- package/app/plugins/builtin/opendrivelab-autonomous-driving.rssany.js +6 -7
- package/app/plugins/builtin/opendrivelab-embodiedai.rssany.js +7 -8
- package/app/plugins/builtin/opendrivelab-publications.rssany.js +6 -8
- package/app/plugins/builtin/opendrivelab.rssany.js +7 -8
- package/app/plugins/builtin/paperswithcode.rssany.js +6 -8
- package/app/plugins/builtin/pjlab-adg-publications.rssany.js +7 -9
- package/app/plugins/builtin/rss.rssany.js +11 -12
- package/app/plugins/builtin/selectdataset.rssany.js +6 -8
- package/app/plugins/builtin/sensetime-tech-achievements.rssany.js +7 -8
- package/app/plugins/builtin/supervisely-blog.rssany.js +6 -8
- package/app/plugins/builtin/theinformation-briefings.rssany.js +7 -13
- package/app/plugins/builtin/uci-ml-repository.rssany.js +6 -7
- package/app/plugins/builtin/venturebeat.rssany.js +7 -9
- package/app/plugins/builtin/worldlabs.rssany.js +6 -8
- package/app/plugins/builtin/x.rssany.js +7 -9
- package/app/plugins/builtin/xiaohongshu.rssany.js +119 -56
- package/app/plugins/builtin/zhipu-research.rssany.js +5 -8
- package/app/plugins/site.rssany.js +25 -26
- package/{statics → app/statics}/README.md +7 -7
- package/bin/rssany.js +226 -6
- package/dist/index.js +209 -152
- package/dist/index.js.map +1 -1
- package/package.json +16 -9
- package/scripts/dev.mjs +114 -0
- package/scripts/reset.mjs +1 -1
- package/init/config.json +0 -17
- package/init/sources.json +0 -353
- package/statics/401.html +0 -56
- package/statics/404.html +0 -12
- package/statics/image.png +0 -0
- package/webui/build/200.html +0 -49
- package/webui/build/_app/env.js +0 -1
- package/webui/build/_app/immutable/assets/0.BB88QFoe.css +0 -1
- package/webui/build/_app/immutable/assets/10.Dj8_pmut.css +0 -1
- package/webui/build/_app/immutable/assets/11.qYZMiTb0.css +0 -1
- package/webui/build/_app/immutable/assets/12.DfJcfUWl.css +0 -1
- package/webui/build/_app/immutable/assets/13.BhO9zvFi.css +0 -1
- package/webui/build/_app/immutable/assets/14.CujIhjQK.css +0 -1
- package/webui/build/_app/immutable/assets/15.nNGjXhCQ.css +0 -1
- package/webui/build/_app/immutable/assets/16.PP9XLDf7.css +0 -1
- package/webui/build/_app/immutable/assets/4.9wPHhVwv.css +0 -1
- package/webui/build/_app/immutable/assets/5.B-dPiwB7.css +0 -1
- package/webui/build/_app/immutable/assets/6.B27N7pdA.css +0 -1
- package/webui/build/_app/immutable/assets/7.CrNxmd8B.css +0 -1
- package/webui/build/_app/immutable/assets/8.Cgji2b15.css +0 -1
- package/webui/build/_app/immutable/assets/9.BsCIAvn3.css +0 -1
- package/webui/build/_app/immutable/assets/BackToParentRoute.DGk-X5ow.css +0 -1
- package/webui/build/_app/immutable/assets/SourcesList.yTBBi3_m.css +0 -1
- package/webui/build/_app/immutable/assets/homeFeedPanelStore.CSvlNcpm.css +0 -1
- package/webui/build/_app/immutable/chunks/5LVkDJzw.js +0 -1
- package/webui/build/_app/immutable/chunks/B-OsL1Ct.js +0 -1
- package/webui/build/_app/immutable/chunks/B2Q1a1-H.js +0 -2
- package/webui/build/_app/immutable/chunks/BK3WtZwv.js +0 -1
- package/webui/build/_app/immutable/chunks/BQqoDzLx.js +0 -1
- package/webui/build/_app/immutable/chunks/BUApaBEI.js +0 -1
- package/webui/build/_app/immutable/chunks/BbWUOQ_m.js +0 -1
- package/webui/build/_app/immutable/chunks/Bfc47y5P.js +0 -1
- package/webui/build/_app/immutable/chunks/Bns1MuyM.js +0 -36
- package/webui/build/_app/immutable/chunks/Bp63qm3L.js +0 -1
- package/webui/build/_app/immutable/chunks/Bu9HsS-V.js +0 -1
- package/webui/build/_app/immutable/chunks/CBY2biv-.js +0 -1
- package/webui/build/_app/immutable/chunks/CVzlFH44.js +0 -1
- package/webui/build/_app/immutable/chunks/CWNeClHp.js +0 -6
- package/webui/build/_app/immutable/chunks/Cihqbfi5.js +0 -1
- package/webui/build/_app/immutable/chunks/CmjOpds-.js +0 -1
- package/webui/build/_app/immutable/chunks/D5GvRCv7.js +0 -1
- package/webui/build/_app/immutable/chunks/DEDI7Ecm.js +0 -1
- package/webui/build/_app/immutable/chunks/DFuhmi31.js +0 -1
- package/webui/build/_app/immutable/chunks/DMWEh-Ek.js +0 -2
- package/webui/build/_app/immutable/chunks/DjNLq3TF.js +0 -1
- package/webui/build/_app/immutable/chunks/Dt2CddFe.js +0 -1
- package/webui/build/_app/immutable/chunks/Dw782Tjs.js +0 -1
- package/webui/build/_app/immutable/chunks/Xy_fhzQq.js +0 -1
- package/webui/build/_app/immutable/chunks/bvuf_jZd.js +0 -36
- package/webui/build/_app/immutable/chunks/hp4PFHFv.js +0 -1
- package/webui/build/_app/immutable/chunks/lk5LaiqA.js +0 -1
- package/webui/build/_app/immutable/chunks/mW5RwvnK.js +0 -13
- package/webui/build/_app/immutable/chunks/tB7QMF3U.js +0 -1
- package/webui/build/_app/immutable/chunks/xtNWTdbD.js +0 -1
- package/webui/build/_app/immutable/entry/app.BVkrDt5l.js +0 -2
- package/webui/build/_app/immutable/entry/start.D3Q-BMMd.js +0 -1
- package/webui/build/_app/immutable/nodes/0.I1lQdWMl.js +0 -11
- package/webui/build/_app/immutable/nodes/1.BiQQfx2j.js +0 -1
- package/webui/build/_app/immutable/nodes/10.CvfUsqsw.js +0 -1
- package/webui/build/_app/immutable/nodes/11.B4LHPNL6.js +0 -1
- package/webui/build/_app/immutable/nodes/12.DVFJuIWI.js +0 -1
- package/webui/build/_app/immutable/nodes/13.nT3SOzEB.js +0 -1
- package/webui/build/_app/immutable/nodes/14.DfaAf0f8.js +0 -1
- package/webui/build/_app/immutable/nodes/15.CMzkX9OK.js +0 -1
- package/webui/build/_app/immutable/nodes/16.zPgTQNze.js +0 -24
- package/webui/build/_app/immutable/nodes/17.BtYZF6FM.js +0 -1
- package/webui/build/_app/immutable/nodes/18.BIzqhTqv.js +0 -1
- package/webui/build/_app/immutable/nodes/2.BYWOpaxy.js +0 -1
- package/webui/build/_app/immutable/nodes/3.B8Viux9S.js +0 -1
- package/webui/build/_app/immutable/nodes/4.DTSxpKm7.js +0 -2
- package/webui/build/_app/immutable/nodes/5.B6fR3n6J.js +0 -2
- package/webui/build/_app/immutable/nodes/6.j2O5Mwjv.js +0 -1
- package/webui/build/_app/immutable/nodes/7.Bd2USIrl.js +0 -1
- package/webui/build/_app/immutable/nodes/8.Bw_d63B_.js +0 -1
- package/webui/build/_app/immutable/nodes/9.pMMi5PP6.js +0 -1
- package/webui/build/_app/version.json +0 -1
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
export const id = "sensetime-tech-achievements";
|
|
2
|
+
export const name = "Sensetime Tech Achievements";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?sensetime\.com\/cn\/technology-achievements(\?.*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
|
|
4
|
-
const SITE_ID =
|
|
8
|
+
const SITE_ID = id;
|
|
5
9
|
const DATE_RE = /\b(20\d{2})-(\d{1,2})-(\d{1,2})\b/;
|
|
6
10
|
|
|
7
11
|
function normalizeText(text) {
|
|
@@ -18,7 +22,7 @@ function toAbsoluteUrl(rawHref, baseUrl) {
|
|
|
18
22
|
if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
|
|
19
23
|
try {
|
|
20
24
|
const url = new URL(href, baseUrl);
|
|
21
|
-
if (!/^https
|
|
25
|
+
if (!/^https:$/i.test(url.protocol)) return null;
|
|
22
26
|
return url.href;
|
|
23
27
|
} catch {
|
|
24
28
|
return null;
|
|
@@ -135,7 +139,7 @@ async function fetchItemsFromApi(finalUrl) {
|
|
|
135
139
|
return items;
|
|
136
140
|
}
|
|
137
141
|
|
|
138
|
-
async function fetchItems(sourceId, ctx) {
|
|
142
|
+
export async function fetchItems(sourceId, ctx) {
|
|
139
143
|
_deps = ctx.deps;
|
|
140
144
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
|
|
141
145
|
const items = parseItemsFromHtml(html, finalUrl);
|
|
@@ -147,8 +151,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
147
151
|
throw new Error(`[${SITE_ID}] 未解析到学术成果条目,页面结构或接口可能已变化`);
|
|
148
152
|
}
|
|
149
153
|
|
|
150
|
-
export default {
|
|
151
|
-
id: SITE_ID,
|
|
152
|
-
listUrlPattern: /^https?:\/\/(www\.)?sensetime\.com\/cn\/technology-achievements(\?.*)?$/i,
|
|
153
|
-
fetchItems,
|
|
154
|
-
};
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "supervisely-blog";
|
|
2
|
+
export const name = "Supervisely Blog";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?supervisely\.com\/blog\/?(?:\?.*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
// Supervisely Blog 插件:抓取列表页并解析为 FeedItem(不做正文 enrich)
|
|
@@ -35,7 +39,7 @@ function toAbsoluteUrl(href, baseUrl) {
|
|
|
35
39
|
if (!href) return null;
|
|
36
40
|
try {
|
|
37
41
|
const url = new URL(href, baseUrl);
|
|
38
|
-
if (!/^https
|
|
42
|
+
if (!/^https:$/i.test(url.protocol)) return null;
|
|
39
43
|
return url.href;
|
|
40
44
|
} catch {
|
|
41
45
|
return null;
|
|
@@ -136,7 +140,7 @@ function parseFromHeadingFallback(root, baseUrl) {
|
|
|
136
140
|
}
|
|
137
141
|
|
|
138
142
|
|
|
139
|
-
async function fetchItems(sourceId, ctx) {
|
|
143
|
+
export async function fetchItems(sourceId, ctx) {
|
|
140
144
|
_deps = ctx.deps;
|
|
141
145
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
|
|
142
146
|
const root = _deps.parseHtml(html);
|
|
@@ -151,9 +155,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
151
155
|
return items;
|
|
152
156
|
}
|
|
153
157
|
|
|
154
|
-
|
|
155
|
-
export default {
|
|
156
|
-
id: "supervisely-blog",
|
|
157
|
-
listUrlPattern: /^https?:\/\/(www\.)?supervisely\.com\/blog\/?(?:\?.*)?$/i,
|
|
158
|
-
fetchItems,
|
|
159
|
-
};
|
|
@@ -1,12 +1,14 @@
|
|
|
1
|
+
export const id = "theinformation";
|
|
2
|
+
export const name = "Theinformation";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?theinformation\.com\/(briefings|features\/[^/]+)\/?(\?.*)?$/i;
|
|
4
|
+
export const refreshInterval = "1h";
|
|
5
|
+
|
|
1
6
|
let _deps;
|
|
2
7
|
|
|
3
8
|
// The Information — AI Agenda 和 Briefings 列表页
|
|
4
9
|
// 当前结构:.article.feed-item,标题 h3.title a,分类 .category-content a,作者 .authors,摘要 .recent-excerpt .long-excerpt
|
|
5
10
|
|
|
6
11
|
const ORIGIN = "https://www.theinformation.com";
|
|
7
|
-
const LIST_URL_RE =
|
|
8
|
-
/^https?:\/\/(www\.)?theinformation\.com\/(briefings|features\/[^/]+)\/?(\?.*)?$/i;
|
|
9
|
-
|
|
10
12
|
|
|
11
13
|
function normalizeText(text) {
|
|
12
14
|
return (text ?? "").replace(/\s+/g, " ").trim();
|
|
@@ -24,7 +26,7 @@ function toAbsoluteHttpUrl(rawHref, baseUrl) {
|
|
|
24
26
|
if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
|
|
25
27
|
try {
|
|
26
28
|
const url = new URL(href, baseUrl);
|
|
27
|
-
if (!/^https
|
|
29
|
+
if (!/^https:$/i.test(url.protocol)) return null;
|
|
28
30
|
return url.href;
|
|
29
31
|
} catch {
|
|
30
32
|
return null;
|
|
@@ -119,7 +121,7 @@ function parseFeedItems(html, pageUrl) {
|
|
|
119
121
|
}
|
|
120
122
|
|
|
121
123
|
|
|
122
|
-
async function fetchItems(sourceId, ctx) {
|
|
124
|
+
export async function fetchItems(sourceId, ctx) {
|
|
123
125
|
_deps = ctx.deps;
|
|
124
126
|
const { html, finalUrl, status } = await ctx.fetchHtml(sourceId, {
|
|
125
127
|
waitMs: 5000,
|
|
@@ -140,11 +142,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
140
142
|
items.sort((a, b) => b.pubDate.getTime() - a.pubDate.getTime());
|
|
141
143
|
return items;
|
|
142
144
|
}
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
export default {
|
|
146
|
-
id: "theinformation",
|
|
147
|
-
listUrlPattern: LIST_URL_RE,
|
|
148
|
-
refreshInterval: "1h",
|
|
149
|
-
fetchItems,
|
|
150
|
-
};
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "uci-ml-repository";
|
|
2
|
+
export const name = "UCI Ml Repository";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/archive\.ics\.uci\.edu(?:\/(?:datasets\/?)?)?(?:\?.*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
|
|
@@ -16,7 +20,7 @@ function resolveDatasetLink(rawHref, baseUrl) {
|
|
|
16
20
|
if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
|
|
17
21
|
try {
|
|
18
22
|
const url = new URL(href, baseUrl);
|
|
19
|
-
if (!/^https
|
|
23
|
+
if (!/^https:$/i.test(url.protocol)) return null;
|
|
20
24
|
if (url.hostname !== "archive.ics.uci.edu") return null;
|
|
21
25
|
if (!/^\/dataset\/\d+\/[^/?#]+$/i.test(url.pathname)) return null;
|
|
22
26
|
url.search = "";
|
|
@@ -89,7 +93,7 @@ function parseFromGenericAnchors(root, baseUrl) {
|
|
|
89
93
|
return items;
|
|
90
94
|
}
|
|
91
95
|
|
|
92
|
-
async function fetchItems(sourceId, ctx) {
|
|
96
|
+
export async function fetchItems(sourceId, ctx) {
|
|
93
97
|
_deps = ctx.deps;
|
|
94
98
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 4000 });
|
|
95
99
|
const baseUrl = finalUrl || sourceId || UCI_ORIGIN;
|
|
@@ -104,8 +108,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
104
108
|
throw new Error("[uci-ml-repository] 未解析到数据集条目,页面结构可能已变化");
|
|
105
109
|
}
|
|
106
110
|
|
|
107
|
-
export default {
|
|
108
|
-
id: "uci-ml-repository",
|
|
109
|
-
listUrlPattern: /^https?:\/\/archive\.ics\.uci\.edu(?:\/(?:datasets\/?)?)?(?:\?.*)?$/i,
|
|
110
|
-
fetchItems,
|
|
111
|
-
};
|
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
export const id = "venturebeat";
|
|
2
|
+
export const name = "Venturebeat";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?venturebeat\.com\/?(\?.*)?$/i;
|
|
4
|
+
export const refreshInterval = "1h";
|
|
5
|
+
|
|
1
6
|
let _deps;
|
|
2
7
|
|
|
3
8
|
// VentureBeat 插件:通过官方 RSS Feed 拉取列表,规避首页安全检查页
|
|
@@ -35,7 +40,7 @@ function toFeedUrl(sourceId) {
|
|
|
35
40
|
|
|
36
41
|
function mapFeedItem(item) {
|
|
37
42
|
const link = normalizeText(item.link ?? "");
|
|
38
|
-
if (!/^https
|
|
43
|
+
if (!/^https:\/\//i.test(link)) return null;
|
|
39
44
|
|
|
40
45
|
const title = normalizeText(item.title ?? "");
|
|
41
46
|
const pubDate = toValidDate(item.isoDate ?? item.pubDate);
|
|
@@ -53,7 +58,7 @@ function mapFeedItem(item) {
|
|
|
53
58
|
}
|
|
54
59
|
|
|
55
60
|
|
|
56
|
-
async function fetchItems(sourceId, _ctx) {
|
|
61
|
+
export async function fetchItems(sourceId, _ctx) {
|
|
57
62
|
_deps = _ctx.deps;
|
|
58
63
|
const parser = new _deps.RssParser({
|
|
59
64
|
timeout: 15_000,
|
|
@@ -88,10 +93,3 @@ async function fetchItems(sourceId, _ctx) {
|
|
|
88
93
|
return items;
|
|
89
94
|
}
|
|
90
95
|
|
|
91
|
-
|
|
92
|
-
export default {
|
|
93
|
-
id: "venturebeat",
|
|
94
|
-
listUrlPattern: /^https?:\/\/(www\.)?venturebeat\.com\/?(\?.*)?$/i,
|
|
95
|
-
refreshInterval: "1h",
|
|
96
|
-
fetchItems,
|
|
97
|
-
};
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "worldlabs";
|
|
2
|
+
export const name = "Worldlabs";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?worldlabs\.ai\/blog(\?.*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
// World Labs 博客插件:抓取 Research & Insights 列表页,输出 FeedItem(不含 enrich)
|
|
@@ -39,7 +43,7 @@ function toAbsoluteHttpUrl(rawHref, baseUrl) {
|
|
|
39
43
|
if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
|
|
40
44
|
try {
|
|
41
45
|
const url = new URL(href, baseUrl);
|
|
42
|
-
if (!/^https
|
|
46
|
+
if (!/^https:$/i.test(url.protocol)) return null;
|
|
43
47
|
return url.href;
|
|
44
48
|
} catch {
|
|
45
49
|
return null;
|
|
@@ -99,7 +103,7 @@ function parseCard(anchor, finalUrl) {
|
|
|
99
103
|
}
|
|
100
104
|
|
|
101
105
|
|
|
102
|
-
async function fetchItems(sourceId, ctx) {
|
|
106
|
+
export async function fetchItems(sourceId, ctx) {
|
|
103
107
|
_deps = ctx.deps;
|
|
104
108
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
|
|
105
109
|
const root = _deps.parseHtml(html);
|
|
@@ -121,9 +125,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
121
125
|
return items;
|
|
122
126
|
}
|
|
123
127
|
|
|
124
|
-
|
|
125
|
-
export default {
|
|
126
|
-
id: "worldlabs",
|
|
127
|
-
listUrlPattern: /^https?:\/\/(www\.)?worldlabs\.ai\/blog(\?.*)?$/i,
|
|
128
|
-
fetchItems,
|
|
129
|
-
};
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "x";
|
|
2
|
+
export const name = "X";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?x\.com\/[^/?#]+\/?(?:[?#].*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
// X (Twitter) 站点插件:用户主页列表抓取与解析
|
|
@@ -161,13 +165,13 @@ function extractMediaUrl(article) {
|
|
|
161
165
|
const video = article.querySelector("video[poster]");
|
|
162
166
|
if (video) {
|
|
163
167
|
const poster = video.getAttribute("poster");
|
|
164
|
-
if (poster && /^https
|
|
168
|
+
if (poster && /^https:\/\//i.test(poster)) return poster;
|
|
165
169
|
}
|
|
166
170
|
for (const img of article.querySelectorAll(
|
|
167
171
|
'[data-testid="card.wrapper"] img[src*="twimg.com/card_img"], [data-testid="card.wrapper"] img[src*="pbs.twimg.com/card_img"]',
|
|
168
172
|
)) {
|
|
169
173
|
const src = img.getAttribute("src");
|
|
170
|
-
if (src && /^https
|
|
174
|
+
if (src && /^https:\/\//i.test(src) && !/profile_images/i.test(src)) {
|
|
171
175
|
return normalizeCardImgUrl(src);
|
|
172
176
|
}
|
|
173
177
|
}
|
|
@@ -288,7 +292,7 @@ function entriesToFeedItems(entries) {
|
|
|
288
292
|
}
|
|
289
293
|
|
|
290
294
|
|
|
291
|
-
async function fetchItems(sourceId, ctx) {
|
|
295
|
+
export async function fetchItems(sourceId, ctx) {
|
|
292
296
|
_deps = ctx.deps;
|
|
293
297
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 6000 });
|
|
294
298
|
const root = _deps.parseHtml(html);
|
|
@@ -320,9 +324,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
320
324
|
throw new Error(`[X] ${message}`);
|
|
321
325
|
}
|
|
322
326
|
|
|
323
|
-
|
|
324
|
-
export default {
|
|
325
|
-
id: "x",
|
|
326
|
-
listUrlPattern: "https://x.com/{username}",
|
|
327
|
-
fetchItems,
|
|
328
|
-
};
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "xiaohongshu";
|
|
2
|
+
export const name = "Xiaohongshu";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?xiaohongshu\.com\/user\/profile\/[^/?#]+\/?(?:[?#].*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
// 小红书站点插件:用户主页列表抓取、笔记详情提取、认证流程
|
|
@@ -5,6 +9,41 @@ let _deps;
|
|
|
5
9
|
|
|
6
10
|
|
|
7
11
|
const XHS_ORIGIN = "https://www.xiaohongshu.com";
|
|
12
|
+
const XHS_NOTE_PATH_RE = /^\/(?:explore|discovery\/item)\/([0-9a-f]{24})\/?$/i;
|
|
13
|
+
const XHS_NOTE_ID_RE = /^[0-9a-f]{24}$/i;
|
|
14
|
+
const XHS_NOTE_ID_IN_IMG_RE = /xhscdn\.com\/\d+\/([0-9a-f]{24})/i;
|
|
15
|
+
const XHS_PROFILE_USER_RE = /\/user\/profile\/([0-9a-f]{24})/i;
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
function hashNoteGuid(noteId) {
|
|
19
|
+
return _deps.createHash("sha256").update(`xhs:note:${noteId}`).digest("hex");
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
function extractProfileUserId(url) {
|
|
24
|
+
const m = String(url).match(XHS_PROFILE_USER_RE);
|
|
25
|
+
return m?.[1]?.toLowerCase() ?? null;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
function buildExploreLink(noteId, origin) {
|
|
30
|
+
return `${origin.replace(/\/$/, "")}/explore/${noteId}`;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
function extractNoteIdFromSection(section, profileUserId) {
|
|
35
|
+
for (const img of section.querySelectorAll('img[src*="xhscdn"]')) {
|
|
36
|
+
const src = img.getAttribute("src")?.trim() ?? "";
|
|
37
|
+
const fromImg = src.match(XHS_NOTE_ID_IN_IMG_RE);
|
|
38
|
+
if (fromImg?.[1] && fromImg[1] !== profileUserId) return fromImg[1].toLowerCase();
|
|
39
|
+
}
|
|
40
|
+
const html = section.outerHTML ?? "";
|
|
41
|
+
for (const match of html.match(/[0-9a-f]{24}/gi) ?? []) {
|
|
42
|
+
const id = match.toLowerCase();
|
|
43
|
+
if (id !== profileUserId && XHS_NOTE_ID_RE.test(id)) return id;
|
|
44
|
+
}
|
|
45
|
+
return null;
|
|
46
|
+
}
|
|
8
47
|
|
|
9
48
|
|
|
10
49
|
function getOrigin(url) {
|
|
@@ -16,57 +55,99 @@ function getOrigin(url) {
|
|
|
16
55
|
}
|
|
17
56
|
|
|
18
57
|
|
|
19
|
-
function
|
|
58
|
+
function normalizeXhsUrl(href, origin) {
|
|
59
|
+
try {
|
|
60
|
+
const url = new URL(href.replace(/&amp;/g, "&"), origin);
|
|
61
|
+
url.hash = "";
|
|
62
|
+
return url;
|
|
63
|
+
} catch {
|
|
64
|
+
return null;
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
function normalizeXhsItemLink(href, origin) {
|
|
70
|
+
const url = normalizeXhsUrl(href, origin);
|
|
71
|
+
if (!url) return null;
|
|
72
|
+
|
|
20
73
|
try {
|
|
21
|
-
|
|
22
|
-
const
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
const token = fullUrl.searchParams.get("xsec_token");
|
|
26
|
-
const source = fullUrl.searchParams.get("xsec_source") ?? "pc_user";
|
|
27
|
-
if (!token) return null;
|
|
28
|
-
const explore = new URL(`/explore/${noteId}`, origin);
|
|
29
|
-
explore.searchParams.set("xsec_token", token);
|
|
30
|
-
explore.searchParams.set("xsec_source", source);
|
|
31
|
-
return explore.href;
|
|
74
|
+
if (!/(^|\.)xiaohongshu\.com$/i.test(url.hostname)) return null;
|
|
75
|
+
const m = url.pathname.match(XHS_NOTE_PATH_RE);
|
|
76
|
+
if (!m?.[1]) return null;
|
|
77
|
+
return buildExploreLink(m[1].toLowerCase(), url.origin);
|
|
32
78
|
} catch {
|
|
33
79
|
return null;
|
|
34
80
|
}
|
|
35
81
|
}
|
|
36
82
|
|
|
37
83
|
|
|
84
|
+
function extractRedirectItemLink(href, origin) {
|
|
85
|
+
const wrapper = normalizeXhsUrl(href, origin);
|
|
86
|
+
if (!wrapper) return null;
|
|
87
|
+
if (!/\/website-login\/error\/?$/i.test(wrapper.pathname)) return null;
|
|
88
|
+
|
|
89
|
+
const redirectPath = wrapper.searchParams.get("redirectPath");
|
|
90
|
+
if (!redirectPath) return null;
|
|
91
|
+
return normalizeXhsItemLink(redirectPath, origin);
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
function extractListItemLink(section, origin, profileUserId) {
|
|
96
|
+
const noteId = extractNoteIdFromSection(section, profileUserId);
|
|
97
|
+
if (noteId) return buildExploreLink(noteId, origin);
|
|
98
|
+
|
|
99
|
+
const anchors = section.querySelectorAll("a[href]");
|
|
100
|
+
const candidates = [];
|
|
101
|
+
for (const anchor of anchors) {
|
|
102
|
+
const href = anchor.getAttribute("href")?.trim();
|
|
103
|
+
if (!href) continue;
|
|
104
|
+
|
|
105
|
+
const direct = normalizeXhsItemLink(href, origin);
|
|
106
|
+
if (direct) candidates.push(direct);
|
|
107
|
+
|
|
108
|
+
const redirected = extractRedirectItemLink(href, origin);
|
|
109
|
+
if (redirected) candidates.push(redirected);
|
|
110
|
+
}
|
|
111
|
+
return candidates[0] ?? null;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
|
|
38
115
|
function parseListHtml(html, url) {
|
|
39
116
|
const root = _deps.parseHtml(html);
|
|
40
117
|
const origin = getOrigin(url);
|
|
118
|
+
const profileUserId = extractProfileUserId(url);
|
|
41
119
|
const feed = root.querySelector("#userPostedFeeds");
|
|
42
120
|
if (!feed) return [];
|
|
43
|
-
const sections = feed.querySelectorAll("section[data-
|
|
121
|
+
const sections = feed.querySelectorAll("section[data-index]");
|
|
44
122
|
const items = [];
|
|
123
|
+
const seenNoteIds = new Set();
|
|
45
124
|
for (const section of sections) {
|
|
46
|
-
const
|
|
47
|
-
const
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
link = new URL(href, origin).href;
|
|
58
|
-
}
|
|
59
|
-
const titleEl = section.querySelector("span[data-v-51ec0135]");
|
|
60
|
-
const title = (titleEl?.textContent ?? "").trim() || "笔记";
|
|
61
|
-
const authorEl = section.querySelector('a[aria-current="page"] span');
|
|
125
|
+
const noteId = extractNoteIdFromSection(section, profileUserId);
|
|
126
|
+
const link = noteId
|
|
127
|
+
? buildExploreLink(noteId, origin)
|
|
128
|
+
: extractListItemLink(section, origin, profileUserId);
|
|
129
|
+
if (!link) continue;
|
|
130
|
+
const dedupeKey = noteId ?? link;
|
|
131
|
+
if (seenNoteIds.has(dedupeKey)) continue;
|
|
132
|
+
seenNoteIds.add(dedupeKey);
|
|
133
|
+
const titleEl = section.querySelector("span[data-v-51ec0135]") ?? section.querySelector(".title span") ?? section.querySelector("span");
|
|
134
|
+
const title = (titleEl?.textContent ?? "").trim() || "Note";
|
|
135
|
+
const authorEl = section.querySelector('a[aria-current="page"] .name') ?? section.querySelector('a[aria-current="page"] span');
|
|
62
136
|
const author = (authorEl?.textContent ?? "").trim() || undefined;
|
|
137
|
+
const imageEl = section.querySelector("img[data-xhs-img], img");
|
|
138
|
+
const image = imageEl?.getAttribute("src")?.trim() || undefined;
|
|
139
|
+
const summary = image ? undefined : title;
|
|
140
|
+
const guid = noteId ? hashNoteGuid(noteId) : _deps.createHash("sha256").update(link).digest("hex");
|
|
63
141
|
items.push({
|
|
64
|
-
guid
|
|
142
|
+
guid,
|
|
65
143
|
title,
|
|
66
144
|
link,
|
|
67
145
|
pubDate: new Date(),
|
|
68
146
|
author,
|
|
69
|
-
summary
|
|
147
|
+
summary,
|
|
148
|
+
imageUrl: image,
|
|
149
|
+
coverImg: image,
|
|
150
|
+
cover_img: image,
|
|
70
151
|
});
|
|
71
152
|
}
|
|
72
153
|
return items;
|
|
@@ -240,9 +321,14 @@ function extractDetailHtml(html) {
|
|
|
240
321
|
}
|
|
241
322
|
|
|
242
323
|
|
|
243
|
-
async function fetchItems(sourceId, ctx) {
|
|
324
|
+
export async function fetchItems(sourceId, ctx) {
|
|
244
325
|
_deps = ctx.deps;
|
|
245
|
-
const { html, finalUrl } = await ctx.fetchHtml(sourceId
|
|
326
|
+
const { html, finalUrl } = await ctx.fetchHtml(sourceId, {
|
|
327
|
+
waitMs: 3000,
|
|
328
|
+
waitForSelector: "#userPostedFeeds",
|
|
329
|
+
waitForSelectorTimeoutMs: 15000,
|
|
330
|
+
scrollBeforeSnapshot: { selector: "#userPostedFeeds", rounds: 8, pauseMs: 900 },
|
|
331
|
+
});
|
|
246
332
|
return parseListHtml(html, finalUrl);
|
|
247
333
|
}
|
|
248
334
|
|
|
@@ -258,26 +344,3 @@ async function enrichItem(item, ctx) {
|
|
|
258
344
|
pubDate: detail.pubDate ?? item.pubDate,
|
|
259
345
|
};
|
|
260
346
|
}
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
async function checkAuth(page, _url) {
|
|
264
|
-
try {
|
|
265
|
-
const loginButton = await page.$(".reds-button-new.login-btn.large.primary");
|
|
266
|
-
return loginButton == null;
|
|
267
|
-
} catch {
|
|
268
|
-
return false;
|
|
269
|
-
}
|
|
270
|
-
}
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
export default {
|
|
274
|
-
id: "xiaohongshu",
|
|
275
|
-
listUrlPattern: "https://xiaohongshu.com/user/profile/{userId}",
|
|
276
|
-
fetchItems,
|
|
277
|
-
enrichItem,
|
|
278
|
-
checkAuth,
|
|
279
|
-
loginUrl: "https://www.xiaohongshu.com/",
|
|
280
|
-
domain: "xiaohongshu.com",
|
|
281
|
-
loginTimeoutMs: 30 * 1000,
|
|
282
|
-
pollIntervalMs: 2000,
|
|
283
|
-
};
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
export const id = "zhipu-research";
|
|
2
|
+
export const name = "Zhipu Research";
|
|
3
|
+
export const listUrlPattern = /^https:\/\/(www\.)?zhipuai\.cn\/zh\/research\/?(?:[?#].*)?$/i;
|
|
4
|
+
|
|
1
5
|
let _deps;
|
|
2
6
|
|
|
3
7
|
// 智谱研究页插件:仅抓取列表,不做正文 enrich(兼容净化后的 HTML)
|
|
@@ -300,7 +304,7 @@ function buildItemsFromLeafSequence(html, titleIdMap) {
|
|
|
300
304
|
}
|
|
301
305
|
|
|
302
306
|
|
|
303
|
-
async function fetchItems(sourceId, ctx) {
|
|
307
|
+
export async function fetchItems(sourceId, ctx) {
|
|
304
308
|
_deps = ctx.deps;
|
|
305
309
|
// 需要读取页面脚本里的 blogsItems(包含详情 id),因此这里禁用净化。
|
|
306
310
|
const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 5000, purify: false });
|
|
@@ -325,10 +329,3 @@ async function fetchItems(sourceId, ctx) {
|
|
|
325
329
|
|
|
326
330
|
throw new Error("[zhipu-research] 未解析到研究条目,页面结构可能已变化");
|
|
327
331
|
}
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
export default {
|
|
331
|
-
id: "zhipu-research",
|
|
332
|
-
listUrlPattern: ZHIPU_RESEARCH_URL,
|
|
333
|
-
fetchItems,
|
|
334
|
-
};
|
|
@@ -1,26 +1,25 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Site
|
|
3
|
-
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
};
|
|
1
|
+
/**
|
|
2
|
+
* Site plugin template. The admin UI copies this file to .rssany/plugins/{id}.rssany.js.
|
|
3
|
+
* Plugin protocol: named exports. No export default is required.
|
|
4
|
+
*
|
|
5
|
+
* Interface: app/scraper/sources/web/site.ts
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Predefined fields stay together at the top.
|
|
9
|
+
export const id = "__PLUGIN_ID__";
|
|
10
|
+
export const name = "__PLUGIN_ID__";
|
|
11
|
+
// eslint-disable-next-line no-undef
|
|
12
|
+
export const listUrlPattern = __LIST_URL_PATTERN__;
|
|
13
|
+
export const refreshInterval = "1day";
|
|
14
|
+
|
|
15
|
+
export async function fetchItems(sourceId, ctx) {
|
|
16
|
+
const { html, finalUrl } = await ctx.fetchHtml(sourceId, {
|
|
17
|
+
waitMs: 2000,
|
|
18
|
+
purify: true,
|
|
19
|
+
});
|
|
20
|
+
const root = ctx.deps.parseHtml(html);
|
|
21
|
+
void root;
|
|
22
|
+
void finalUrl;
|
|
23
|
+
// TODO: Parse the list page and return FeedItem objects.
|
|
24
|
+
return [];
|
|
25
|
+
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
# statics
|
|
2
|
-
|
|
3
|
-
静态 HTML 页面:home(首页)、401、404。
|
|
4
|
-
|
|
5
|
-
- **home.html**:首页,含 Try This 示例链接;下方「需登录的站点」从 `/plugins` 拉取,每个站点可点击「打开登录页」调用 `POST /auth/ensure?siteId=...` 批量做登录。
|
|
6
|
-
- **401.html**:需登录时返回;占位符 `{{listUrl}}` 由 router 注入为失败请求的订阅地址;页内「打开有头登录页」按钮调用 `POST /auth/ensure?url=...` 弹出有头浏览器完成登录。
|
|
7
|
-
- **404.html**:无匹配站点时返回。
|
|
1
|
+
# statics
|
|
2
|
+
|
|
3
|
+
静态 HTML 页面:home(首页)、401、404。
|
|
4
|
+
|
|
5
|
+
- **home.html**:首页,含 Try This 示例链接;下方「需登录的站点」从 `/plugins` 拉取,每个站点可点击「打开登录页」调用 `POST /auth/ensure?siteId=...` 批量做登录。
|
|
6
|
+
- **401.html**:需登录时返回;占位符 `{{listUrl}}` 由 router 注入为失败请求的订阅地址;页内「打开有头登录页」按钮调用 `POST /auth/ensure?url=...` 弹出有头浏览器完成登录。
|
|
7
|
+
- **404.html**:无匹配站点时返回。
|