rsshub 1.0.0-master.f71451d → 1.0.0-master.f75997f
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/config.ts +14 -0
- package/lib/errors/index.test.ts +2 -2
- package/lib/middleware/template.tsx +12 -3
- package/lib/routes/0x80/index.ts +87 -0
- package/lib/routes/0x80/namespace.ts +7 -0
- package/lib/routes/aljazeera/index.ts +17 -14
- package/lib/routes/apple/podcast.ts +64 -0
- package/lib/routes/bilibili/cache.ts +1 -1
- package/lib/routes/bing/daily-wallpaper.ts +9 -8
- package/lib/routes/byau/namespace.ts +6 -0
- package/lib/routes/byau/xinwen/index.ts +72 -0
- package/lib/routes/cpcaauto/index.ts +255 -0
- package/lib/routes/cpcaauto/namespace.ts +8 -0
- package/lib/routes/dehenglaw/index.ts +128 -0
- package/lib/routes/dehenglaw/namespace.ts +8 -0
- package/lib/routes/dehenglaw/templates/description.art +7 -0
- package/lib/routes/gov/stats/index.ts +25 -22
- package/lib/routes/gxmzu/ai.ts +1 -1
- package/lib/routes/gxmzu/lib.ts +9 -26
- package/lib/routes/gxmzu/utils/index.ts +31 -13
- package/lib/routes/gxmzu/yjs.ts +1 -1
- package/lib/routes/jou/utils/index.ts +35 -25
- package/lib/routes/lofter/tag.ts +3 -3
- package/lib/routes/lofter/user.ts +3 -3
- package/lib/routes/njxzc/utils/index.ts +31 -13
- package/lib/routes/qingting/podcast.ts +61 -39
- package/lib/routes/reuters/common.ts +2 -2
- package/lib/routes/sara/index.ts +66 -0
- package/lib/routes/sara/namespace.ts +6 -0
- package/lib/routes/tencent/news/author.ts +13 -11
- package/lib/routes/test/index.ts +11 -1
- package/lib/routes/twitter/api/mobile-api/login.ts +29 -28
- package/lib/routes/twitter/namespace.ts +2 -2
- package/lib/routes/twitter/user.ts +5 -0
- package/lib/routes/u3c3/index.ts +1 -1
- package/lib/routes/u3c3/namespace.ts +1 -1
- package/lib/routes/u9a9/index.ts +2 -2
- package/lib/routes/u9a9/namespace.ts +1 -1
- package/lib/routes/zsxq/group.ts +63 -0
- package/lib/routes/zsxq/namespace.ts +6 -0
- package/lib/routes/zsxq/types.ts +149 -0
- package/lib/routes/zsxq/user.ts +58 -0
- package/lib/routes/zsxq/utils.ts +70 -0
- package/lib/setup.test.ts +183 -12
- package/lib/utils/render.ts +1 -1
- package/lib/utils/request-rewriter/get.ts +8 -1
- package/lib/utils/wechat-mp.test.ts +411 -32
- package/lib/utils/wechat-mp.ts +447 -76
- package/lib/views/{rss3-ums.ts → rss3.ts} +2 -2
- package/package.json +14 -14
package/lib/utils/wechat-mp.ts
CHANGED
|
@@ -26,9 +26,240 @@
|
|
|
26
26
|
*/
|
|
27
27
|
|
|
28
28
|
import ofetch from '@/utils/ofetch';
|
|
29
|
-
import {
|
|
29
|
+
import { type Cheerio, type CheerioAPI, type Element, load } from 'cheerio';
|
|
30
30
|
import { parseDate } from '@/utils/parse-date';
|
|
31
31
|
import cache from '@/utils/cache';
|
|
32
|
+
import logger from '@/utils/logger';
|
|
33
|
+
|
|
34
|
+
class WeChatMpError extends Error {
|
|
35
|
+
constructor(message: string) {
|
|
36
|
+
super(message);
|
|
37
|
+
this.name = 'WeChatMpError';
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
const MAINTAINERS = ['@Rongronggg9'];
|
|
42
|
+
|
|
43
|
+
/**
 * Build a log message for this module without the "raise an issue" hint.
 *
 * @param params - Message fragments; they are joined with `: ` in the order given.
 * @returns The fragments joined together and prefixed with `wechat-mp: `.
 */
const formatLogNoMention = (...params: string[]): string => {
    const detail = params.join(': ');
    return 'wechat-mp: ' + detail;
};
|
|
44
|
+
const formatLog = (...params: string[]): string => `${formatLogNoMention(...params)}
|
|
45
|
+
Consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`;
|
|
46
|
+
let warn = (...params: string[]) => logger.warn(formatLog(...params));
|
|
47
|
+
const error = (...params: string[]): never => {
|
|
48
|
+
const msg = formatLog(...params);
|
|
49
|
+
logger.error(msg);
|
|
50
|
+
throw new WeChatMpError(msg);
|
|
51
|
+
};
|
|
52
|
+
const errorNoMention = (...params: string[]): never => {
|
|
53
|
+
const msg = formatLogNoMention(...params);
|
|
54
|
+
logger.error(msg);
|
|
55
|
+
throw new WeChatMpError(msg);
|
|
56
|
+
};
|
|
57
|
+
const toggleWerror = (() => {
|
|
58
|
+
const onFunc = (...params: string[]) => error('WarningAsError', ...params);
|
|
59
|
+
const offFunc = warn;
|
|
60
|
+
return (on: boolean) => {
|
|
61
|
+
warn = on ? onFunc : offFunc;
|
|
62
|
+
};
|
|
63
|
+
})();
|
|
64
|
+
|
|
65
|
+
const replaceReturnNewline = (() => {
|
|
66
|
+
const returnRegExp = /\r|\\(r|x0d)/g;
|
|
67
|
+
const newlineRegExp = /\n|\\(n|x0a)/g;
|
|
68
|
+
return (text: string, replaceReturnWith = '', replaceNewlineWith = '<br>') => text.replaceAll(returnRegExp, replaceReturnWith).replaceAll(newlineRegExp, replaceNewlineWith);
|
|
69
|
+
})();
|
|
70
|
+
const fixUrl = (() => {
|
|
71
|
+
const ampRegExp = /(&|\\x26)amp;/g;
|
|
72
|
+
return (text: string) => text.replaceAll(ampRegExp, '&');
|
|
73
|
+
})();
|
|
74
|
+
|
|
75
|
+
class LoopContinue extends Error {
|
|
76
|
+
constructor() {
|
|
77
|
+
super('');
|
|
78
|
+
this.name = 'LoopContinue';
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
class LoopReturn extends Error {
|
|
83
|
+
to_return: any;
|
|
84
|
+
|
|
85
|
+
constructor(to_return: any) {
|
|
86
|
+
super('');
|
|
87
|
+
this.name = 'LoopReturn';
|
|
88
|
+
this.to_return = to_return;
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
const forEachScript = ($: CheerioAPI | string, callback: (script) => void, defaultReturn: any = null, selector = 'script[nonce][type="text/javascript"]') => {
|
|
93
|
+
const scripts = typeof $ === 'string' ? [$] : $(selector).toArray();
|
|
94
|
+
for (const script of scripts) {
|
|
95
|
+
try {
|
|
96
|
+
callback(script);
|
|
97
|
+
} catch (error) {
|
|
98
|
+
if (error instanceof LoopReturn) {
|
|
99
|
+
return error.to_return;
|
|
100
|
+
} else if (error instanceof LoopContinue) {
|
|
101
|
+
continue;
|
|
102
|
+
}
|
|
103
|
+
throw error;
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
return defaultReturn;
|
|
107
|
+
};
|
|
108
|
+
|
|
109
|
+
// view-source a *_SHARE_PAGE type article and search for `ITEM_SHOW_TYPE_MAP`
|
|
110
|
+
// Please update the comments below if you find new types or new examples
|
|
111
|
+
const showTypeMap = {
|
|
112
|
+
// "Article".
|
|
113
|
+
// May be combined with media, but type won't change
|
|
114
|
+
// Combined with audio and iframe: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
|
|
115
|
+
APP_MSG_PAGE: '0',
|
|
116
|
+
// https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532942&idx=1&sn=a84e4adbe49fdb39e4d4c1b5c12a4c3f
|
|
117
|
+
VIDEO_SHARE_PAGE: '5',
|
|
118
|
+
MUSIC_SHARE_PAGE: '6',
|
|
119
|
+
// https://mp.weixin.qq.com/s/FY6yQC_e4NMAxK0FBr6jwQ
|
|
120
|
+
AUDIO_SHARE_PAGE: '7',
|
|
121
|
+
// https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
|
|
122
|
+
// https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532936&idx=4&sn=624054c20ded6ee85c6632f419c6f758
|
|
123
|
+
IMG_SHARE_PAGE: '8',
|
|
124
|
+
TEXT_SHARE_PAGE: '10',
|
|
125
|
+
SHORT_CONTENT_PAGE: '17',
|
|
126
|
+
};
|
|
127
|
+
const showTypeMapReverse = Object.fromEntries(Object.entries(showTypeMap).map(([k, v]) => [v, k]));
|
|
128
|
+
|
|
129
|
+
class ExtractMetadata {
|
|
130
|
+
private static genAssignmentRegExp = (varName: string, valuePattern: string, assignPattern: string) => RegExp(`\\b${varName}\\s*${assignPattern}\\s*(?<quote>["'])(?<value>${valuePattern})\\k<quote>`, 'mg');
|
|
131
|
+
|
|
132
|
+
private static genExtractFunc = (
|
|
133
|
+
varName: string,
|
|
134
|
+
{
|
|
135
|
+
valuePattern = '\\w+',
|
|
136
|
+
assignPattern = '=',
|
|
137
|
+
allowNotFound = false,
|
|
138
|
+
multiple = false,
|
|
139
|
+
}: {
|
|
140
|
+
valuePattern?: string;
|
|
141
|
+
assignPattern?: string;
|
|
142
|
+
allowNotFound?: boolean;
|
|
143
|
+
multiple?: boolean;
|
|
144
|
+
}
|
|
145
|
+
) => {
|
|
146
|
+
const regExp = this.genAssignmentRegExp(varName, valuePattern, assignPattern);
|
|
147
|
+
return (str: string) => {
|
|
148
|
+
const values: string[] = [];
|
|
149
|
+
for (const match of str.matchAll(regExp)) {
|
|
150
|
+
const value = <string>match.groups?.value;
|
|
151
|
+
if (!multiple) {
|
|
152
|
+
return value;
|
|
153
|
+
}
|
|
154
|
+
values.push(value);
|
|
155
|
+
}
|
|
156
|
+
if (!allowNotFound && values.length === 0) {
|
|
157
|
+
throw new LoopContinue();
|
|
158
|
+
}
|
|
159
|
+
return multiple ? values : null;
|
|
160
|
+
};
|
|
161
|
+
};
|
|
162
|
+
|
|
163
|
+
private static doExtract = (metadataToBeExtracted: Record<string, (str: string) => string | string[] | null | undefined>, scriptText: string) => {
|
|
164
|
+
const metadataExtracted: Record<string, string | string[]> = {};
|
|
165
|
+
for (const [key, extractFunc] of Object.entries(metadataToBeExtracted)) {
|
|
166
|
+
metadataExtracted[key] = <string>extractFunc(scriptText);
|
|
167
|
+
}
|
|
168
|
+
metadataExtracted._extractedFrom = scriptText;
|
|
169
|
+
return metadataExtracted;
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
private static commonMetadataToBeExtracted = {
|
|
173
|
+
showType: this.genExtractFunc('item_show_type', { valuePattern: '\\d+' }),
|
|
174
|
+
realShowType: this.genExtractFunc('real_item_show_type', { valuePattern: '\\d+' }),
|
|
175
|
+
createTime: this.genExtractFunc('ct', { valuePattern: '\\d+', allowNotFound: true }),
|
|
176
|
+
sourceUrl: this.genExtractFunc('msg_source_url', { valuePattern: `https?://[^'"]*`, allowNotFound: true }),
|
|
177
|
+
};
|
|
178
|
+
|
|
179
|
+
static common = ($: CheerioAPI) =>
|
|
180
|
+
forEachScript(
|
|
181
|
+
$,
|
|
182
|
+
(script) => {
|
|
183
|
+
const scriptText = $(script).text();
|
|
184
|
+
const metadataExtracted = <Record<string, string>> this.doExtract(this.commonMetadataToBeExtracted, scriptText);
|
|
185
|
+
const showType = showTypeMapReverse[metadataExtracted.showType];
|
|
186
|
+
const realShowType = showTypeMapReverse[metadataExtracted.realShowType];
|
|
187
|
+
metadataExtracted.sourceUrl = metadataExtracted.sourceUrl && fixUrl(metadataExtracted.sourceUrl);
|
|
188
|
+
if (showType) {
|
|
189
|
+
metadataExtracted.showType = showType;
|
|
190
|
+
} else {
|
|
191
|
+
warn('showType not found', `item_show_type=${metadataExtracted.showType}`);
|
|
192
|
+
}
|
|
193
|
+
if (realShowType) {
|
|
194
|
+
metadataExtracted.realShowType = realShowType;
|
|
195
|
+
} else {
|
|
196
|
+
warn('realShowType not found', `real_item_show_type=${metadataExtracted.realShowType}`);
|
|
197
|
+
}
|
|
198
|
+
if (metadataExtracted.showType !== metadataExtracted.realShowType) {
|
|
199
|
+
// never seen this happen, waiting for examples
|
|
200
|
+
warn('showType mismatch', `item_show_type=${metadataExtracted.showType}, real_item_show_type=${metadataExtracted.realShowType}`);
|
|
201
|
+
}
|
|
202
|
+
throw new LoopReturn(metadataExtracted);
|
|
203
|
+
},
|
|
204
|
+
{},
|
|
205
|
+
'script[nonce][type="text/javascript"]:contains("real_item_show_type")'
|
|
206
|
+
);
|
|
207
|
+
|
|
208
|
+
private static audioMetadataToBeExtracted = {
|
|
209
|
+
voiceId: this.genExtractFunc('voiceid', { assignPattern: ':' }),
|
|
210
|
+
duration: this.genExtractFunc('duration', { valuePattern: '\\d*', assignPattern: ':', allowNotFound: true }),
|
|
211
|
+
};
|
|
212
|
+
|
|
213
|
+
// never seen an audio article containing multiple audios, waiting for examples
|
|
214
|
+
static audio = ($: CheerioAPI) =>
|
|
215
|
+
forEachScript(
|
|
216
|
+
$,
|
|
217
|
+
(script) => {
|
|
218
|
+
const scriptText = $(script).text();
|
|
219
|
+
const metadataExtracted = <Record<string, string>> this.doExtract(this.audioMetadataToBeExtracted, scriptText);
|
|
220
|
+
throw new LoopReturn(metadataExtracted);
|
|
221
|
+
},
|
|
222
|
+
{},
|
|
223
|
+
'script[nonce][type="text/javascript"]:contains("voiceid")'
|
|
224
|
+
);
|
|
225
|
+
|
|
226
|
+
private static imgMetadataToBeExtracted = {
|
|
227
|
+
imgUrls: this.genExtractFunc('cdn_url', { valuePattern: `https?://[^'"]*`, assignPattern: ':', multiple: true }),
|
|
228
|
+
};
|
|
229
|
+
|
|
230
|
+
static img = ($: CheerioAPI) =>
|
|
231
|
+
forEachScript(
|
|
232
|
+
$,
|
|
233
|
+
(script) => {
|
|
234
|
+
const scriptText = $(script).text();
|
|
235
|
+
const metadataExtracted = <Record<string, string[]>> this.doExtract(this.imgMetadataToBeExtracted, scriptText);
|
|
236
|
+
if (Array.isArray(metadataExtracted.imgUrls)) {
|
|
237
|
+
metadataExtracted.imgUrls = metadataExtracted.imgUrls.map((url) => fixUrl(url));
|
|
238
|
+
}
|
|
239
|
+
throw new LoopReturn(metadataExtracted);
|
|
240
|
+
},
|
|
241
|
+
{},
|
|
242
|
+
'script[nonce][type="text/javascript"]:contains("picture_page_info_list")'
|
|
243
|
+
);
|
|
244
|
+
|
|
245
|
+
private static locationMetadataToBeExtracted = {
|
|
246
|
+
countryName: this.genExtractFunc('countryName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
|
|
247
|
+
provinceName: this.genExtractFunc('provinceName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
|
|
248
|
+
cityName: this.genExtractFunc('cityName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
|
|
249
|
+
};
|
|
250
|
+
|
|
251
|
+
static location = ($: CheerioAPI) =>
|
|
252
|
+
forEachScript(
|
|
253
|
+
$,
|
|
254
|
+
(script) => {
|
|
255
|
+
const scriptText = $(script).text();
|
|
256
|
+
const metadataExtracted = this.doExtract(this.locationMetadataToBeExtracted, scriptText);
|
|
257
|
+
throw new LoopReturn(metadataExtracted);
|
|
258
|
+
},
|
|
259
|
+
{},
|
|
260
|
+
'script[nonce][type="text/javascript"]:contains("countryName")'
|
|
261
|
+
);
|
|
262
|
+
}
|
|
32
263
|
|
|
33
264
|
const replaceTag = ($, oldTag, newTagName) => {
|
|
34
265
|
oldTag = $(oldTag);
|
|
@@ -55,15 +286,23 @@ const detectOriginalArticleUrl = ($) => {
|
|
|
55
286
|
return null;
|
|
56
287
|
};
|
|
57
288
|
|
|
58
|
-
const
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
289
|
+
/**
 * Build the download URL of a WeChat MP voice clip.
 *
 * The ID is percent-encoded before being placed in the query string, so an ID
 * containing reserved characters (`&`, `#`, `+`, ...) cannot truncate or alter
 * the query. IDs made only of word characters are emitted unchanged.
 *
 * @param voiceId - The voice/media ID of the clip.
 * @returns The `getvoice` URL with `mediaid` set to the encoded ID.
 */
const genAudioSrc = (voiceId: string) => `https://res.wx.qq.com/voice/getvoice?mediaid=${encodeURIComponent(voiceId)}`;
|
|
290
|
+
/**
 * Build an HTML `<audio>` player tag.
 *
 * Both values are escaped before being placed inside double-quoted attributes,
 * because the title comes from scraped page content and may contain `"`, `<`,
 * `>` or `&`. The element is closed with an explicit `</audio>`: `audio` is not
 * a void element, so a trailing `/>` is ignored by HTML parsers and whatever
 * follows the tag would be parsed as its (hidden) fallback content.
 *
 * @param src - URL of the audio file.
 * @param title - Human-readable title shown for the player.
 * @returns The `<audio>` element as an HTML string.
 */
const genAudioTag = (src: string, title: string) => {
    // `&` must be replaced first so the entities produced below are not re-escaped
    const escapeAttr = (value: string) => value.replace(/&/g, '&amp;').replace(/"/g, '&quot;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
    return `<audio controls src="${escapeAttr(src)}" title="${escapeAttr(title)}" style="width:100%"></audio>`;
};
|
|
291
|
+
const genVideoSrc = (videoId: string) => {
|
|
292
|
+
const newSearchParams = new URLSearchParams({
|
|
293
|
+
origin: 'https://mp.weixin.qq.com',
|
|
294
|
+
containerId: 'js_tx_video_container_0.3863487104715233',
|
|
295
|
+
vid: videoId,
|
|
296
|
+
width: '677',
|
|
297
|
+
height: '380.8125',
|
|
298
|
+
autoplay: 'false',
|
|
299
|
+
allowFullScreen: 'true',
|
|
300
|
+
chid: '17',
|
|
301
|
+
full: 'true',
|
|
302
|
+
show1080p: 'false',
|
|
303
|
+
isDebugIframe: 'false',
|
|
304
|
+
});
|
|
305
|
+
return `https://v.qq.com/txp/iframe/player.html?${newSearchParams.toString()}`;
|
|
67
306
|
};
|
|
68
307
|
|
|
69
308
|
/**
|
|
@@ -99,6 +338,33 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
|
|
|
99
338
|
}
|
|
100
339
|
});
|
|
101
340
|
}
|
|
341
|
+
// fix audio: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
|
|
342
|
+
$('mpvoice[voice_encode_fileid]').each((_, voice) => {
|
|
343
|
+
const $voice = $(voice);
|
|
344
|
+
const voiceId = $voice.attr('voice_encode_fileid');
|
|
345
|
+
if (voiceId) {
|
|
346
|
+
const title = $voice.attr('name') || 'Audio';
|
|
347
|
+
$voice.replaceWith(genAudioTag(genAudioSrc(voiceId), title));
|
|
348
|
+
}
|
|
349
|
+
});
|
|
350
|
+
// fix iframe: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
|
|
351
|
+
$('iframe.video_iframe[data-src]').each((_, iframe) => {
|
|
352
|
+
const $iframe = $(iframe);
|
|
353
|
+
const dataSrc = <string>$iframe.attr('data-src');
|
|
354
|
+
const srcUrlObj = new URL(dataSrc);
|
|
355
|
+
if (srcUrlObj.host === 'v.qq.com' && srcUrlObj.searchParams.has('vid')) {
|
|
356
|
+
const newSrc = genVideoSrc(<string>srcUrlObj.searchParams.get('vid'));
|
|
357
|
+
$iframe.attr('src', newSrc);
|
|
358
|
+
$iframe.removeAttr('data-src');
|
|
359
|
+
const width = $iframe.attr('data-w');
|
|
360
|
+
const ratio = $iframe.attr('data-ratio');
|
|
361
|
+
if (width && ratio) {
|
|
362
|
+
const width_ = Math.min(Number.parseInt(width), 677);
|
|
363
|
+
$iframe.attr('width', width_.toString());
|
|
364
|
+
$iframe.attr('height', (width_ / Number.parseFloat(ratio)).toString());
|
|
365
|
+
}
|
|
366
|
+
} // else {} FIXME: https://mp.weixin.qq.com/s?__biz=Mzg5Mjk3MzE4OQ==&mid=2247549515&idx=2&sn=a608fca597f0589c1aebd6d0b82ff6e9
|
|
367
|
+
});
|
|
102
368
|
// fix section
|
|
103
369
|
$('section').each((_, section) => {
|
|
104
370
|
const $section = $(section);
|
|
@@ -122,17 +388,6 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
|
|
|
122
388
|
// clear line index tags in code section
|
|
123
389
|
$('.code-snippet__line-index').remove();
|
|
124
390
|
|
|
125
|
-
// fix single picture article
|
|
126
|
-
// example: https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
|
|
127
|
-
$('script').each((_, script) => {
|
|
128
|
-
const $script = $(script);
|
|
129
|
-
const matchs = $script.html()?.match(/document\.getElementById\('js_image_desc'\)\.innerHTML = "(.*)"\.replace/);
|
|
130
|
-
|
|
131
|
-
if (matchs) {
|
|
132
|
-
$script.replaceWith(matchs[1].replaceAll('\r', '').replaceAll('\n', '<br>').replaceAll('\\x0d', '').replaceAll('\\x0a', '<br>'));
|
|
133
|
-
}
|
|
134
|
-
});
|
|
135
|
-
|
|
136
391
|
// clean scripts
|
|
137
392
|
$('script').remove();
|
|
138
393
|
return $.html();
|
|
@@ -146,18 +401,21 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
|
|
|
146
401
|
// abtest_cookie, wx_header
|
|
147
402
|
// Known params (temporary link):
|
|
148
403
|
// src, timestamp, ver, signature, new (unessential)
|
|
149
|
-
const normalizeUrl = (url, bypassHostCheck = false) => {
|
|
404
|
+
const normalizeUrl = (url: string, bypassHostCheck = false) => {
|
|
150
405
|
const oriUrl = url;
|
|
406
|
+
// already seen some weird urls with `&` escaped as `&amp;`, so fix it
|
|
407
|
+
// calling fixUrl should always be safe since having `&amp;` or `\x26amp;` in a URL is meaningless
|
|
408
|
+
url = fixUrl(url);
|
|
151
409
|
const urlObj = new URL(url);
|
|
152
410
|
if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') {
|
|
153
|
-
|
|
411
|
+
error('URL host must be "mp.weixin.qq.com"', url);
|
|
154
412
|
}
|
|
155
413
|
urlObj.protocol = 'https:';
|
|
156
414
|
urlObj.hash = ''; // remove hash
|
|
157
|
-
if (
|
|
415
|
+
if (urlObj.pathname.startsWith('/s/')) {
|
|
158
416
|
// a short link, just remove all the params
|
|
159
417
|
urlObj.search = '';
|
|
160
|
-
} else if (
|
|
418
|
+
} else if (urlObj.pathname === '/s') {
|
|
161
419
|
const biz = urlObj.searchParams.get('__biz');
|
|
162
420
|
const mid = urlObj.searchParams.get('mid') || urlObj.searchParams.get('appmsgid');
|
|
163
421
|
const idx = urlObj.searchParams.get('idx') || urlObj.searchParams.get('itemidx');
|
|
@@ -175,60 +433,165 @@ const normalizeUrl = (url, bypassHostCheck = false) => {
|
|
|
175
433
|
// a temporary link, remove all unessential params
|
|
176
434
|
urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`;
|
|
177
435
|
} else {
|
|
178
|
-
|
|
436
|
+
warn('unknown URL search parameters', oriUrl);
|
|
179
437
|
}
|
|
180
438
|
}
|
|
181
439
|
} else {
|
|
182
|
-
|
|
440
|
+
warn('unknown URL path', oriUrl);
|
|
183
441
|
}
|
|
184
442
|
return urlObj.href;
|
|
185
443
|
};
|
|
186
444
|
|
|
445
|
+
class PageParsers {
|
|
446
|
+
private static common = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
|
|
447
|
+
const title = replaceReturnNewline($('meta[property="og:title"]').attr('content') || '', '', ' ');
|
|
448
|
+
const author = replaceReturnNewline($('meta[name=author]').attr('content') || '', '', ' ');
|
|
449
|
+
const pubDate = commonMetadata.createTime ? parseDate(Number.parseInt(commonMetadata.createTime) * 1000) : undefined;
|
|
450
|
+
const mpName = $('.wx_follow_nickname').first().text()?.trim();
|
|
451
|
+
|
|
452
|
+
let summary = replaceReturnNewline($('meta[name=description]').attr('content') || '');
|
|
453
|
+
const description = summary;
|
|
454
|
+
summary = summary.replaceAll('<br>', ' ') === title ? '' : summary;
|
|
455
|
+
|
|
456
|
+
return { title, author, description, summary, pubDate, mpName } as {
|
|
457
|
+
title: string;
|
|
458
|
+
author: string;
|
|
459
|
+
description: string;
|
|
460
|
+
summary: string;
|
|
461
|
+
pubDate?: Date;
|
|
462
|
+
mpName?: string;
|
|
463
|
+
enclosure_url?: string;
|
|
464
|
+
itunes_duration?: string | number;
|
|
465
|
+
enclosure_type?: string;
|
|
466
|
+
};
|
|
467
|
+
};
|
|
468
|
+
private static appMsg = async ($: CheerioAPI, commonMetadata: Record<string, string>) => {
|
|
469
|
+
const page = PageParsers.common($, commonMetadata);
|
|
470
|
+
page.description = fixArticleContent($('#js_content'));
|
|
471
|
+
const originalArticleUrl = detectOriginalArticleUrl($);
|
|
472
|
+
if (originalArticleUrl) {
|
|
473
|
+
// No article or article is too short, try to fetch the description from the original article
|
|
474
|
+
const data = await ofetch(normalizeUrl(originalArticleUrl));
|
|
475
|
+
const original$ = load(data);
|
|
476
|
+
page.description += fixArticleContent(original$('#js_content'));
|
|
477
|
+
}
|
|
478
|
+
return page;
|
|
479
|
+
};
|
|
480
|
+
private static img = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
|
|
481
|
+
const page = PageParsers.common($, commonMetadata);
|
|
482
|
+
const imgUrls = ExtractMetadata.img($)?.imgUrls;
|
|
483
|
+
let imgHtml = '';
|
|
484
|
+
if (Array.isArray(imgUrls) && imgUrls.length > 0) {
|
|
485
|
+
for (const imgUrl of imgUrls) {
|
|
486
|
+
imgHtml += `<br><br><img src="${imgUrl}" />`;
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
page.description += imgHtml;
|
|
490
|
+
return page;
|
|
491
|
+
};
|
|
492
|
+
private static audio = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
|
|
493
|
+
const page = PageParsers.common($, commonMetadata);
|
|
494
|
+
const audioMetadata = ExtractMetadata.audio($);
|
|
495
|
+
const audioUrl = genAudioSrc(audioMetadata.voiceId);
|
|
496
|
+
page.enclosure_url = audioUrl;
|
|
497
|
+
page.itunes_duration = audioMetadata.duration;
|
|
498
|
+
page.enclosure_type = 'audio/mp3'; // FIXME: may it be other types?
|
|
499
|
+
page.description += '<br><br>' + genAudioTag(audioUrl, page.title);
|
|
500
|
+
return page;
|
|
501
|
+
};
|
|
502
|
+
private static fallback = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
|
|
503
|
+
const page = PageParsers.common($, commonMetadata);
|
|
504
|
+
const image = $('meta[property="og:image"]').attr('content');
|
|
505
|
+
if (image) {
|
|
506
|
+
page.description += `<br><br><img src="${image}" />`;
|
|
507
|
+
}
|
|
508
|
+
return page;
|
|
509
|
+
};
|
|
510
|
+
static dispatch = async (html: string, url: string) => {
|
|
511
|
+
const $ = load(html);
|
|
512
|
+
const commonMetadata = ExtractMetadata.common($);
|
|
513
|
+
let page: Record<string, any>;
|
|
514
|
+
let pageText: string, pageTextShort: string;
|
|
515
|
+
switch (commonMetadata.showType) {
|
|
516
|
+
case 'APP_MSG_PAGE':
|
|
517
|
+
page = await PageParsers.appMsg($, commonMetadata);
|
|
518
|
+
break;
|
|
519
|
+
case 'AUDIO_SHARE_PAGE':
|
|
520
|
+
page = PageParsers.audio($, commonMetadata);
|
|
521
|
+
break;
|
|
522
|
+
case 'IMG_SHARE_PAGE':
|
|
523
|
+
page = PageParsers.img($, commonMetadata);
|
|
524
|
+
break;
|
|
525
|
+
case 'VIDEO_SHARE_PAGE':
|
|
526
|
+
page = PageParsers.fallback($, commonMetadata);
|
|
527
|
+
break;
|
|
528
|
+
case undefined:
|
|
529
|
+
$('script, style').remove();
|
|
530
|
+
pageText = $('title, body').text().replaceAll(/\s+/g, ' ').trim();
|
|
531
|
+
pageTextShort = pageText.slice(0, 25);
|
|
532
|
+
if (pageText.length >= 25 + '...'.length) {
|
|
533
|
+
pageTextShort = pageText.slice(0, 25);
|
|
534
|
+
pageTextShort += '...';
|
|
535
|
+
}
|
|
536
|
+
if (pageText.includes('已被发布者删除')) {
|
|
537
|
+
errorNoMention('deleted by author', pageTextShort, url);
|
|
538
|
+
} else if (new URL(url).pathname.includes('captcha') || pageText.includes('环境异常')) {
|
|
539
|
+
errorNoMention('request blocked by WAF', pageTextShort, url);
|
|
540
|
+
} else {
|
|
541
|
+
error('unknown page, probably due to WAF', pageTextShort, url);
|
|
542
|
+
}
|
|
543
|
+
return {}; // just to make TypeScript happy, actually UNREACHABLE
|
|
544
|
+
default:
|
|
545
|
+
warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`, url);
|
|
546
|
+
page = PageParsers.fallback($, commonMetadata);
|
|
547
|
+
}
|
|
548
|
+
const locationMetadata = ExtractMetadata.location($);
|
|
549
|
+
let location = '';
|
|
550
|
+
for (const loc of [locationMetadata.countryName, locationMetadata.provinceName, locationMetadata.cityName]) {
|
|
551
|
+
if (loc) {
|
|
552
|
+
location += loc + ' ';
|
|
553
|
+
}
|
|
554
|
+
}
|
|
555
|
+
location = location.trim();
|
|
556
|
+
if (location) {
|
|
557
|
+
page.description += `<p>📍发表于：${location}</p>`;
|
|
558
|
+
}
|
|
559
|
+
if (commonMetadata.sourceUrl) {
|
|
560
|
+
page.description += `<p><a href="${commonMetadata.sourceUrl}">🔗️ 阅读原文</a></p>`;
|
|
561
|
+
}
|
|
562
|
+
return page;
|
|
563
|
+
};
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
const redirectHelper = async (url: string, maxRedirects: number = 5) => {
|
|
567
|
+
maxRedirects--;
|
|
568
|
+
const raw = await ofetch.raw(url);
|
|
569
|
+
if ([301, 302, 303, 307, 308].includes(raw.status)) {
|
|
570
|
+
if (!raw.headers.has('location')) {
|
|
571
|
+
error('redirect without location', url);
|
|
572
|
+
} else if (maxRedirects <= 0) {
|
|
573
|
+
error('too many redirects', url);
|
|
574
|
+
}
|
|
575
|
+
return await redirectHelper(<string>raw.headers.get('location'), maxRedirects);
|
|
576
|
+
}
|
|
577
|
+
return raw;
|
|
578
|
+
};
|
|
579
|
+
|
|
187
580
|
/**
|
|
188
581
|
* Fetch article and its metadata from WeChat MP (mp.weixin.qq.com).
|
|
189
582
|
*
|
|
190
583
|
* If you use this function, no need to call `fixArticleContent`
|
|
191
|
-
* @param
|
|
192
|
-
* @param
|
|
193
|
-
* @
|
|
194
|
-
* @return {Promise<object>} - An object containing the article and its metadata.
|
|
584
|
+
* @param url - The url of the article.
|
|
585
|
+
* @param bypassHostCheck - Whether to bypass host check.
|
|
586
|
+
* @return - An object containing the article and its metadata.
|
|
195
587
|
*/
|
|
196
|
-
const fetchArticle = (url, bypassHostCheck = false) => {
|
|
588
|
+
const fetchArticle = (url: string, bypassHostCheck: boolean = false) => {
|
|
197
589
|
url = normalizeUrl(url, bypassHostCheck);
|
|
198
590
|
return cache.tryGet(url, async () => {
|
|
199
|
-
const
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
const author = $('meta[name=author]').attr('content');
|
|
204
|
-
let summary = $('meta[name=description]').attr('content');
|
|
205
|
-
summary = summary === title ? '' : summary;
|
|
206
|
-
let description = fixArticleContent($('#js_content'));
|
|
207
|
-
// No article get or article is too short, try the original url
|
|
208
|
-
const originalUrl = detectOriginalArticleUrl($);
|
|
209
|
-
if (originalUrl) {
|
|
210
|
-
// try to fetch the description from the original article
|
|
211
|
-
const data = await ofetch(normalizeUrl(originalUrl, bypassHostCheck));
|
|
212
|
-
const original$ = load(data);
|
|
213
|
-
description += fixArticleContent(original$('#js_content'));
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
const sourceUrl = detectSourceUrl($);
|
|
217
|
-
if (sourceUrl) {
|
|
218
|
-
description += `<a href="${sourceUrl}">阅读原文</a>`;
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
let pubDate;
|
|
222
|
-
const publish_time_script = $('script[nonce][type="text/javascript"]:contains("var ct")').text();
|
|
223
|
-
const publish_time_match = publish_time_script && publish_time_script.match(/var ct *= *"?(\d{10})"?/);
|
|
224
|
-
const publish_timestamp = publish_time_match && publish_time_match[1];
|
|
225
|
-
if (publish_timestamp) {
|
|
226
|
-
pubDate = parseDate(Number.parseInt(publish_timestamp) * 1000);
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
let mpName = $('.profile_nickname').first().text();
|
|
230
|
-
mpName = mpName && mpName.trim();
|
|
231
|
-
return { title, author, description, summary, pubDate, mpName, link: url };
|
|
591
|
+
const raw = await redirectHelper(url);
|
|
592
|
+
// pass the redirected URL to dispatcher for better error logging
|
|
593
|
+
const page = await PageParsers.dispatch(raw._data, raw.url);
|
|
594
|
+
return { ...page, link: url };
|
|
232
595
|
}) as Promise<{
|
|
233
596
|
title: string;
|
|
234
597
|
author: string;
|
|
@@ -237,6 +600,9 @@ const fetchArticle = (url, bypassHostCheck = false) => {
|
|
|
237
600
|
pubDate?: Date;
|
|
238
601
|
mpName?: string;
|
|
239
602
|
link: string;
|
|
603
|
+
enclosure_type?: string;
|
|
604
|
+
enclosure_url?: string;
|
|
605
|
+
itunes_duration?: string | number;
|
|
240
606
|
}>;
|
|
241
607
|
};
|
|
242
608
|
|
|
@@ -257,18 +623,23 @@ const fetchArticle = (url, bypassHostCheck = false) => {
|
|
|
257
623
|
* @return {Promise<object>} - The incoming `item` object, with the article and its metadata filled in.
|
|
258
624
|
*/
|
|
259
625
|
const finishArticleItem = async (item, setMpNameAsAuthor = false, skipLink = false) => {
|
|
260
|
-
const
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
626
|
+
const fetchedItem = await fetchArticle(item.link);
|
|
627
|
+
for (const key in fetchedItem) {
|
|
628
|
+
switch (key) {
|
|
629
|
+
case 'author':
|
|
630
|
+
item.author = setMpNameAsAuthor
|
|
631
|
+
? fetchedItem.mpName || item.author // the Official Account itself. if your route return articles from different accounts, you may want to use this
|
|
632
|
+
: fetchedItem.author || item.author; // the real author of the article. if your route return articles from a certain account, use this
|
|
633
|
+
break;
|
|
634
|
+
case 'link':
|
|
635
|
+
item.link = skipLink ? item.link : fetchedItem.link || item.link;
|
|
636
|
+
break;
|
|
637
|
+
default:
|
|
638
|
+
item[key] = item[key] || fetchedItem[key];
|
|
639
|
+
}
|
|
270
640
|
}
|
|
271
641
|
return item;
|
|
272
642
|
};
|
|
273
643
|
|
|
274
|
-
|
|
644
|
+
const exportedForTestingOnly = { toggleWerror, ExtractMetadata, showTypeMapReverse };
|
|
645
|
+
export { exportedForTestingOnly, WeChatMpError, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };
|
|
@@ -10,7 +10,7 @@ const NETWORK = 'RSS';
|
|
|
10
10
|
const TAG = 'RSS';
|
|
11
11
|
const TYPE = 'feed';
|
|
12
12
|
|
|
13
|
-
const
|
|
13
|
+
const rss3 = (data) => {
|
|
14
14
|
const currentUnixTsp = dayjs().unix();
|
|
15
15
|
const umsResult = {
|
|
16
16
|
data: data.item.map((item) => {
|
|
@@ -59,4 +59,4 @@ function getOwnershipFieldFromURL(item) {
|
|
|
59
59
|
}
|
|
60
60
|
}
|
|
61
61
|
|
|
62
|
-
export default
|
|
62
|
+
export default rss3;
|