rsshub 1.0.0-master.f71451d → 1.0.0-master.f75997f

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/lib/config.ts +14 -0
  2. package/lib/errors/index.test.ts +2 -2
  3. package/lib/middleware/template.tsx +12 -3
  4. package/lib/routes/0x80/index.ts +87 -0
  5. package/lib/routes/0x80/namespace.ts +7 -0
  6. package/lib/routes/aljazeera/index.ts +17 -14
  7. package/lib/routes/apple/podcast.ts +64 -0
  8. package/lib/routes/bilibili/cache.ts +1 -1
  9. package/lib/routes/bing/daily-wallpaper.ts +9 -8
  10. package/lib/routes/byau/namespace.ts +6 -0
  11. package/lib/routes/byau/xinwen/index.ts +72 -0
  12. package/lib/routes/cpcaauto/index.ts +255 -0
  13. package/lib/routes/cpcaauto/namespace.ts +8 -0
  14. package/lib/routes/dehenglaw/index.ts +128 -0
  15. package/lib/routes/dehenglaw/namespace.ts +8 -0
  16. package/lib/routes/dehenglaw/templates/description.art +7 -0
  17. package/lib/routes/gov/stats/index.ts +25 -22
  18. package/lib/routes/gxmzu/ai.ts +1 -1
  19. package/lib/routes/gxmzu/lib.ts +9 -26
  20. package/lib/routes/gxmzu/utils/index.ts +31 -13
  21. package/lib/routes/gxmzu/yjs.ts +1 -1
  22. package/lib/routes/jou/utils/index.ts +35 -25
  23. package/lib/routes/lofter/tag.ts +3 -3
  24. package/lib/routes/lofter/user.ts +3 -3
  25. package/lib/routes/njxzc/utils/index.ts +31 -13
  26. package/lib/routes/qingting/podcast.ts +61 -39
  27. package/lib/routes/reuters/common.ts +2 -2
  28. package/lib/routes/sara/index.ts +66 -0
  29. package/lib/routes/sara/namespace.ts +6 -0
  30. package/lib/routes/tencent/news/author.ts +13 -11
  31. package/lib/routes/test/index.ts +11 -1
  32. package/lib/routes/twitter/api/mobile-api/login.ts +29 -28
  33. package/lib/routes/twitter/namespace.ts +2 -2
  34. package/lib/routes/twitter/user.ts +5 -0
  35. package/lib/routes/u3c3/index.ts +1 -1
  36. package/lib/routes/u3c3/namespace.ts +1 -1
  37. package/lib/routes/u9a9/index.ts +2 -2
  38. package/lib/routes/u9a9/namespace.ts +1 -1
  39. package/lib/routes/zsxq/group.ts +63 -0
  40. package/lib/routes/zsxq/namespace.ts +6 -0
  41. package/lib/routes/zsxq/types.ts +149 -0
  42. package/lib/routes/zsxq/user.ts +58 -0
  43. package/lib/routes/zsxq/utils.ts +70 -0
  44. package/lib/setup.test.ts +183 -12
  45. package/lib/utils/render.ts +1 -1
  46. package/lib/utils/request-rewriter/get.ts +8 -1
  47. package/lib/utils/wechat-mp.test.ts +411 -32
  48. package/lib/utils/wechat-mp.ts +447 -76
  49. package/lib/views/{rss3-ums.ts → rss3.ts} +2 -2
  50. package/package.json +14 -14
@@ -26,9 +26,240 @@
26
26
  */
27
27
 
28
28
  import ofetch from '@/utils/ofetch';
29
- import { load, type Cheerio, type Element } from 'cheerio';
29
+ import { type Cheerio, type CheerioAPI, type Element, load } from 'cheerio';
30
30
  import { parseDate } from '@/utils/parse-date';
31
31
  import cache from '@/utils/cache';
32
+ import logger from '@/utils/logger';
33
+
34
+ class WeChatMpError extends Error {
35
+ constructor(message: string) {
36
+ super(message);
37
+ this.name = 'WeChatMpError';
38
+ }
39
+ }
40
+
41
+ const MAINTAINERS = ['@Rongronggg9'];
42
+
43
+ const formatLogNoMention = (...params: string[]): string => `wechat-mp: ${params.join(': ')}`;
44
+ const formatLog = (...params: string[]): string => `${formatLogNoMention(...params)}
45
+ Consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`;
46
+ let warn = (...params: string[]) => logger.warn(formatLog(...params));
47
+ const error = (...params: string[]): never => {
48
+ const msg = formatLog(...params);
49
+ logger.error(msg);
50
+ throw new WeChatMpError(msg);
51
+ };
52
+ const errorNoMention = (...params: string[]): never => {
53
+ const msg = formatLogNoMention(...params);
54
+ logger.error(msg);
55
+ throw new WeChatMpError(msg);
56
+ };
57
+ const toggleWerror = (() => {
58
+ const onFunc = (...params: string[]) => error('WarningAsError', ...params);
59
+ const offFunc = warn;
60
+ return (on: boolean) => {
61
+ warn = on ? onFunc : offFunc;
62
+ };
63
+ })();
64
+
65
+ const replaceReturnNewline = (() => {
66
+ const returnRegExp = /\r|\\(r|x0d)/g;
67
+ const newlineRegExp = /\n|\\(n|x0a)/g;
68
+ return (text: string, replaceReturnWith = '', replaceNewlineWith = '<br>') => text.replaceAll(returnRegExp, replaceReturnWith).replaceAll(newlineRegExp, replaceNewlineWith);
69
+ })();
70
+ const fixUrl = (() => {
71
+ const ampRegExp = /(&|\\x26)amp;/g;
72
+ return (text: string) => text.replaceAll(ampRegExp, '&');
73
+ })();
74
+
75
+ class LoopContinue extends Error {
76
+ constructor() {
77
+ super('');
78
+ this.name = 'LoopContinue';
79
+ }
80
+ }
81
+
82
+ class LoopReturn extends Error {
83
+ to_return: any;
84
+
85
+ constructor(to_return: any) {
86
+ super('');
87
+ this.name = 'LoopReturn';
88
+ this.to_return = to_return;
89
+ }
90
+ }
91
+
92
+ const forEachScript = ($: CheerioAPI | string, callback: (script) => void, defaultReturn: any = null, selector = 'script[nonce][type="text/javascript"]') => {
93
+ const scripts = typeof $ === 'string' ? [$] : $(selector).toArray();
94
+ for (const script of scripts) {
95
+ try {
96
+ callback(script);
97
+ } catch (error) {
98
+ if (error instanceof LoopReturn) {
99
+ return error.to_return;
100
+ } else if (error instanceof LoopContinue) {
101
+ continue;
102
+ }
103
+ throw error;
104
+ }
105
+ }
106
+ return defaultReturn;
107
+ };
108
+
109
+ // view-source a *_SHARE_PAGE type article and search for `ITEM_SHOW_TYPE_MAP`
110
+ // Please update the comments below if you find new types or new examples
111
+ const showTypeMap = {
112
+ // "Article".
113
+ // May be combined with media, but type won't change
114
+ // Combined with audio and iframe: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
115
+ APP_MSG_PAGE: '0',
116
+ // https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532942&idx=1&sn=a84e4adbe49fdb39e4d4c1b5c12a4c3f
117
+ VIDEO_SHARE_PAGE: '5',
118
+ MUSIC_SHARE_PAGE: '6',
119
+ // https://mp.weixin.qq.com/s/FY6yQC_e4NMAxK0FBr6jwQ
120
+ AUDIO_SHARE_PAGE: '7',
121
+ // https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
122
+ // https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532936&idx=4&sn=624054c20ded6ee85c6632f419c6f758
123
+ IMG_SHARE_PAGE: '8',
124
+ TEXT_SHARE_PAGE: '10',
125
+ SHORT_CONTENT_PAGE: '17',
126
+ };
127
+ const showTypeMapReverse = Object.fromEntries(Object.entries(showTypeMap).map(([k, v]) => [v, k]));
128
+
129
+ class ExtractMetadata {
130
+ private static genAssignmentRegExp = (varName: string, valuePattern: string, assignPattern: string) => RegExp(`\\b${varName}\\s*${assignPattern}\\s*(?<quote>["'])(?<value>${valuePattern})\\k<quote>`, 'mg');
131
+
132
+ private static genExtractFunc = (
133
+ varName: string,
134
+ {
135
+ valuePattern = '\\w+',
136
+ assignPattern = '=',
137
+ allowNotFound = false,
138
+ multiple = false,
139
+ }: {
140
+ valuePattern?: string;
141
+ assignPattern?: string;
142
+ allowNotFound?: boolean;
143
+ multiple?: boolean;
144
+ }
145
+ ) => {
146
+ const regExp = this.genAssignmentRegExp(varName, valuePattern, assignPattern);
147
+ return (str: string) => {
148
+ const values: string[] = [];
149
+ for (const match of str.matchAll(regExp)) {
150
+ const value = <string>match.groups?.value;
151
+ if (!multiple) {
152
+ return value;
153
+ }
154
+ values.push(value);
155
+ }
156
+ if (!allowNotFound && values.length === 0) {
157
+ throw new LoopContinue();
158
+ }
159
+ return multiple ? values : null;
160
+ };
161
+ };
162
+
163
+ private static doExtract = (metadataToBeExtracted: Record<string, (str: string) => string | string[] | null | undefined>, scriptText: string) => {
164
+ const metadataExtracted: Record<string, string | string[]> = {};
165
+ for (const [key, extractFunc] of Object.entries(metadataToBeExtracted)) {
166
+ metadataExtracted[key] = <string>extractFunc(scriptText);
167
+ }
168
+ metadataExtracted._extractedFrom = scriptText;
169
+ return metadataExtracted;
170
+ };
171
+
172
+ private static commonMetadataToBeExtracted = {
173
+ showType: this.genExtractFunc('item_show_type', { valuePattern: '\\d+' }),
174
+ realShowType: this.genExtractFunc('real_item_show_type', { valuePattern: '\\d+' }),
175
+ createTime: this.genExtractFunc('ct', { valuePattern: '\\d+', allowNotFound: true }),
176
+ sourceUrl: this.genExtractFunc('msg_source_url', { valuePattern: `https?://[^'"]*`, allowNotFound: true }),
177
+ };
178
+
179
+ static common = ($: CheerioAPI) =>
180
+ forEachScript(
181
+ $,
182
+ (script) => {
183
+ const scriptText = $(script).text();
184
+ const metadataExtracted = <Record<string, string>> this.doExtract(this.commonMetadataToBeExtracted, scriptText);
185
+ const showType = showTypeMapReverse[metadataExtracted.showType];
186
+ const realShowType = showTypeMapReverse[metadataExtracted.realShowType];
187
+ metadataExtracted.sourceUrl = metadataExtracted.sourceUrl && fixUrl(metadataExtracted.sourceUrl);
188
+ if (showType) {
189
+ metadataExtracted.showType = showType;
190
+ } else {
191
+ warn('showType not found', `item_show_type=${metadataExtracted.showType}`);
192
+ }
193
+ if (realShowType) {
194
+ metadataExtracted.realShowType = realShowType;
195
+ } else {
196
+ warn('realShowType not found', `real_item_show_type=${metadataExtracted.realShowType}`);
197
+ }
198
+ if (metadataExtracted.showType !== metadataExtracted.realShowType) {
199
+ // never seen this happen, waiting for examples
200
+ warn('showType mismatch', `item_show_type=${metadataExtracted.showType}, real_item_show_type=${metadataExtracted.realShowType}`);
201
+ }
202
+ throw new LoopReturn(metadataExtracted);
203
+ },
204
+ {},
205
+ 'script[nonce][type="text/javascript"]:contains("real_item_show_type")'
206
+ );
207
+
208
+ private static audioMetadataToBeExtracted = {
209
+ voiceId: this.genExtractFunc('voiceid', { assignPattern: ':' }),
210
+ duration: this.genExtractFunc('duration', { valuePattern: '\\d*', assignPattern: ':', allowNotFound: true }),
211
+ };
212
+
213
+ // never seen a audio article containing multiple audio, waiting for examples
214
+ static audio = ($: CheerioAPI) =>
215
+ forEachScript(
216
+ $,
217
+ (script) => {
218
+ const scriptText = $(script).text();
219
+ const metadataExtracted = <Record<string, string>> this.doExtract(this.audioMetadataToBeExtracted, scriptText);
220
+ throw new LoopReturn(metadataExtracted);
221
+ },
222
+ {},
223
+ 'script[nonce][type="text/javascript"]:contains("voiceid")'
224
+ );
225
+
226
+ private static imgMetadataToBeExtracted = {
227
+ imgUrls: this.genExtractFunc('cdn_url', { valuePattern: `https?://[^'"]*`, assignPattern: ':', multiple: true }),
228
+ };
229
+
230
+ static img = ($: CheerioAPI) =>
231
+ forEachScript(
232
+ $,
233
+ (script) => {
234
+ const scriptText = $(script).text();
235
+ const metadataExtracted = <Record<string, string[]>> this.doExtract(this.imgMetadataToBeExtracted, scriptText);
236
+ if (Array.isArray(metadataExtracted.imgUrls)) {
237
+ metadataExtracted.imgUrls = metadataExtracted.imgUrls.map((url) => fixUrl(url));
238
+ }
239
+ throw new LoopReturn(metadataExtracted);
240
+ },
241
+ {},
242
+ 'script[nonce][type="text/javascript"]:contains("picture_page_info_list")'
243
+ );
244
+
245
+ private static locationMetadataToBeExtracted = {
246
+ countryName: this.genExtractFunc('countryName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
247
+ provinceName: this.genExtractFunc('provinceName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
248
+ cityName: this.genExtractFunc('cityName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
249
+ };
250
+
251
+ static location = ($: CheerioAPI) =>
252
+ forEachScript(
253
+ $,
254
+ (script) => {
255
+ const scriptText = $(script).text();
256
+ const metadataExtracted = this.doExtract(this.locationMetadataToBeExtracted, scriptText);
257
+ throw new LoopReturn(metadataExtracted);
258
+ },
259
+ {},
260
+ 'script[nonce][type="text/javascript"]:contains("countryName")'
261
+ );
262
+ }
32
263
 
33
264
  const replaceTag = ($, oldTag, newTagName) => {
34
265
  oldTag = $(oldTag);
@@ -55,15 +286,23 @@ const detectOriginalArticleUrl = ($) => {
55
286
  return null;
56
287
  };
57
288
 
58
- const detectSourceUrl = ($) => {
59
- const matchs = $.root()
60
- .html()
61
- .match(/msg_source_url = '(.+)';/);
62
-
63
- if (matchs) {
64
- return matchs[1];
65
- }
66
- return null;
289
+ const genAudioSrc = (voiceId: string) => `https://res.wx.qq.com/voice/getvoice?mediaid=${voiceId}`;
290
+ const genAudioTag = (src: string, title: string) => `<audio controls src="${src}" title="${title}" style="width:100%"/>`;
291
+ const genVideoSrc = (videoId: string) => {
292
+ const newSearchParams = new URLSearchParams({
293
+ origin: 'https://mp.weixin.qq.com',
294
+ containerId: 'js_tx_video_container_0.3863487104715233',
295
+ vid: videoId,
296
+ width: '677',
297
+ height: '380.8125',
298
+ autoplay: 'false',
299
+ allowFullScreen: 'true',
300
+ chid: '17',
301
+ full: 'true',
302
+ show1080p: 'false',
303
+ isDebugIframe: 'false',
304
+ });
305
+ return `https://v.qq.com/txp/iframe/player.html?${newSearchParams.toString()}`;
67
306
  };
68
307
 
69
308
  /**
@@ -99,6 +338,33 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
99
338
  }
100
339
  });
101
340
  }
341
+ // fix audio: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
342
+ $('mpvoice[voice_encode_fileid]').each((_, voice) => {
343
+ const $voice = $(voice);
344
+ const voiceId = $voice.attr('voice_encode_fileid');
345
+ if (voiceId) {
346
+ const title = $voice.attr('name') || 'Audio';
347
+ $voice.replaceWith(genAudioTag(genAudioSrc(voiceId), title));
348
+ }
349
+ });
350
+ // fix iframe: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
351
+ $('iframe.video_iframe[data-src]').each((_, iframe) => {
352
+ const $iframe = $(iframe);
353
+ const dataSrc = <string>$iframe.attr('data-src');
354
+ const srcUrlObj = new URL(dataSrc);
355
+ if (srcUrlObj.host === 'v.qq.com' && srcUrlObj.searchParams.has('vid')) {
356
+ const newSrc = genVideoSrc(<string>srcUrlObj.searchParams.get('vid'));
357
+ $iframe.attr('src', newSrc);
358
+ $iframe.removeAttr('data-src');
359
+ const width = $iframe.attr('data-w');
360
+ const ratio = $iframe.attr('data-ratio');
361
+ if (width && ratio) {
362
+ const width_ = Math.min(Number.parseInt(width), 677);
363
+ $iframe.attr('width', width_.toString());
364
+ $iframe.attr('height', (width_ / Number.parseFloat(ratio)).toString());
365
+ }
366
+ } // else {} FIXME: https://mp.weixin.qq.com/s?__biz=Mzg5Mjk3MzE4OQ==&mid=2247549515&idx=2&sn=a608fca597f0589c1aebd6d0b82ff6e9
367
+ });
102
368
  // fix section
103
369
  $('section').each((_, section) => {
104
370
  const $section = $(section);
@@ -122,17 +388,6 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
122
388
  // clear line index tags in code section
123
389
  $('.code-snippet__line-index').remove();
124
390
 
125
- // fix single picture article
126
- // example: https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
127
- $('script').each((_, script) => {
128
- const $script = $(script);
129
- const matchs = $script.html()?.match(/document\.getElementById\('js_image_desc'\)\.innerHTML = "(.*)"\.replace/);
130
-
131
- if (matchs) {
132
- $script.replaceWith(matchs[1].replaceAll('\r', '').replaceAll('\n', '<br>').replaceAll('\\x0d', '').replaceAll('\\x0a', '<br>'));
133
- }
134
- });
135
-
136
391
  // clean scripts
137
392
  $('script').remove();
138
393
  return $.html();
@@ -146,18 +401,21 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
146
401
  // abtest_cookie, wx_header
147
402
  // Known params (temporary link):
148
403
  // src, timestamp, ver, signature, new (unessential)
149
- const normalizeUrl = (url, bypassHostCheck = false) => {
404
+ const normalizeUrl = (url: string, bypassHostCheck = false) => {
150
405
  const oriUrl = url;
406
+ // already seen some weird urls with `&` escaped as `&amp;`, so fix it
407
+ // calling fixUrl should always be safe since having `&amp;` or `\x26` in a URL is meaningless
408
+ url = fixUrl(url);
151
409
  const urlObj = new URL(url);
152
410
  if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') {
153
- throw new Error('wechat-mp: URL host must be "mp.weixin.qq.com", but got ' + oriUrl);
411
+ error('URL host must be "mp.weixin.qq.com"', url);
154
412
  }
155
413
  urlObj.protocol = 'https:';
156
414
  urlObj.hash = ''; // remove hash
157
- if (/^\/s\/.+/.test(urlObj.pathname)) {
415
+ if (urlObj.pathname.startsWith('/s/')) {
158
416
  // a short link, just remove all the params
159
417
  urlObj.search = '';
160
- } else if (/^\/s$/.test(urlObj.pathname)) {
418
+ } else if (urlObj.pathname === '/s') {
161
419
  const biz = urlObj.searchParams.get('__biz');
162
420
  const mid = urlObj.searchParams.get('mid') || urlObj.searchParams.get('appmsgid');
163
421
  const idx = urlObj.searchParams.get('idx') || urlObj.searchParams.get('itemidx');
@@ -175,60 +433,165 @@ const normalizeUrl = (url, bypassHostCheck = false) => {
175
433
  // a temporary link, remove all unessential params
176
434
  urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`;
177
435
  } else {
178
- // unknown link, just let it go
436
+ warn('unknown URL search parameters', oriUrl);
179
437
  }
180
438
  }
181
439
  } else {
182
- // IDK what it is, just let it go
440
+ warn('unknown URL path', oriUrl);
183
441
  }
184
442
  return urlObj.href;
185
443
  };
186
444
 
445
+ class PageParsers {
446
+ private static common = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
447
+ const title = replaceReturnNewline($('meta[property="og:title"]').attr('content') || '', '', ' ');
448
+ const author = replaceReturnNewline($('meta[name=author]').attr('content') || '', '', ' ');
449
+ const pubDate = commonMetadata.createTime ? parseDate(Number.parseInt(commonMetadata.createTime) * 1000) : undefined;
450
+ const mpName = $('.wx_follow_nickname').first().text()?.trim();
451
+
452
+ let summary = replaceReturnNewline($('meta[name=description]').attr('content') || '');
453
+ const description = summary;
454
+ summary = summary.replaceAll('<br>', ' ') === title ? '' : summary;
455
+
456
+ return { title, author, description, summary, pubDate, mpName } as {
457
+ title: string;
458
+ author: string;
459
+ description: string;
460
+ summary: string;
461
+ pubDate?: Date;
462
+ mpName?: string;
463
+ enclosure_url?: string;
464
+ itunes_duration?: string | number;
465
+ enclosure_type?: string;
466
+ };
467
+ };
468
+ private static appMsg = async ($: CheerioAPI, commonMetadata: Record<string, string>) => {
469
+ const page = PageParsers.common($, commonMetadata);
470
+ page.description = fixArticleContent($('#js_content'));
471
+ const originalArticleUrl = detectOriginalArticleUrl($);
472
+ if (originalArticleUrl) {
473
+ // No article or article is too short, try to fetch the description from the original article
474
+ const data = await ofetch(normalizeUrl(originalArticleUrl));
475
+ const original$ = load(data);
476
+ page.description += fixArticleContent(original$('#js_content'));
477
+ }
478
+ return page;
479
+ };
480
+ private static img = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
481
+ const page = PageParsers.common($, commonMetadata);
482
+ const imgUrls = ExtractMetadata.img($)?.imgUrls;
483
+ let imgHtml = '';
484
+ if (Array.isArray(imgUrls) && imgUrls.length > 0) {
485
+ for (const imgUrl of imgUrls) {
486
+ imgHtml += `<br><br><img src="${imgUrl}" />`;
487
+ }
488
+ }
489
+ page.description += imgHtml;
490
+ return page;
491
+ };
492
+ private static audio = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
493
+ const page = PageParsers.common($, commonMetadata);
494
+ const audioMetadata = ExtractMetadata.audio($);
495
+ const audioUrl = genAudioSrc(audioMetadata.voiceId);
496
+ page.enclosure_url = audioUrl;
497
+ page.itunes_duration = audioMetadata.duration;
498
+ page.enclosure_type = 'audio/mp3'; // FIXME: may it be other types?
499
+ page.description += '<br><br>' + genAudioTag(audioUrl, page.title);
500
+ return page;
501
+ };
502
+ private static fallback = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
503
+ const page = PageParsers.common($, commonMetadata);
504
+ const image = $('meta[property="og:image"]').attr('content');
505
+ if (image) {
506
+ page.description += `<br><br><img src="${image}" />`;
507
+ }
508
+ return page;
509
+ };
510
+ static dispatch = async (html: string, url: string) => {
511
+ const $ = load(html);
512
+ const commonMetadata = ExtractMetadata.common($);
513
+ let page: Record<string, any>;
514
+ let pageText: string, pageTextShort: string;
515
+ switch (commonMetadata.showType) {
516
+ case 'APP_MSG_PAGE':
517
+ page = await PageParsers.appMsg($, commonMetadata);
518
+ break;
519
+ case 'AUDIO_SHARE_PAGE':
520
+ page = PageParsers.audio($, commonMetadata);
521
+ break;
522
+ case 'IMG_SHARE_PAGE':
523
+ page = PageParsers.img($, commonMetadata);
524
+ break;
525
+ case 'VIDEO_SHARE_PAGE':
526
+ page = PageParsers.fallback($, commonMetadata);
527
+ break;
528
+ case undefined:
529
+ $('script, style').remove();
530
+ pageText = $('title, body').text().replaceAll(/\s+/g, ' ').trim();
531
+ pageTextShort = pageText.slice(0, 25);
532
+ if (pageText.length >= 25 + '...'.length) {
533
+ pageTextShort = pageText.slice(0, 25);
534
+ pageTextShort += '...';
535
+ }
536
+ if (pageText.includes('已被发布者删除')) {
537
+ errorNoMention('deleted by author', pageTextShort, url);
538
+ } else if (new URL(url).pathname.includes('captcha') || pageText.includes('环境异常')) {
539
+ errorNoMention('request blocked by WAF', pageTextShort, url);
540
+ } else {
541
+ error('unknown page, probably due to WAF', pageTextShort, url);
542
+ }
543
+ return {}; // just to make TypeScript happy, actually UNREACHABLE
544
+ default:
545
+ warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`, url);
546
+ page = PageParsers.fallback($, commonMetadata);
547
+ }
548
+ const locationMetadata = ExtractMetadata.location($);
549
+ let location = '';
550
+ for (const loc of [locationMetadata.countryName, locationMetadata.provinceName, locationMetadata.cityName]) {
551
+ if (loc) {
552
+ location += loc + ' ';
553
+ }
554
+ }
555
+ location = location.trim();
556
+ if (location) {
557
+ page.description += `<p>📍发表于：${location}</p>`;
558
+ }
559
+ if (commonMetadata.sourceUrl) {
560
+ page.description += `<p><a href="${commonMetadata.sourceUrl}">🔗️ 阅读原文</a></p>`;
561
+ }
562
+ return page;
563
+ };
564
+ }
565
+
566
+ const redirectHelper = async (url: string, maxRedirects: number = 5) => {
567
+ maxRedirects--;
568
+ const raw = await ofetch.raw(url);
569
+ if ([301, 302, 303, 307, 308].includes(raw.status)) {
570
+ if (!raw.headers.has('location')) {
571
+ error('redirect without location', url);
572
+ } else if (maxRedirects <= 0) {
573
+ error('too many redirects', url);
574
+ }
575
+ return await redirectHelper(<string>raw.headers.get('location'), maxRedirects);
576
+ }
577
+ return raw;
578
+ };
579
+
187
580
  /**
188
581
  * Fetch article and its metadata from WeChat MP (mp.weixin.qq.com).
189
582
  *
190
583
  * If you use this function, no need to call `fixArticleContent`
191
- * @param {object} ctx - The context object.
192
- * @param {string} url - The url of the article.
193
- * @param {boolean} bypassHostCheck - Whether to bypass host check.
194
- * @return {Promise<object>} - An object containing the article and its metadata.
584
+ * @param url - The url of the article.
585
+ * @param bypassHostCheck - Whether to bypass host check.
586
+ * @return - An object containing the article and its metadata.
195
587
  */
196
- const fetchArticle = (url, bypassHostCheck = false) => {
588
+ const fetchArticle = (url: string, bypassHostCheck: boolean = false) => {
197
589
  url = normalizeUrl(url, bypassHostCheck);
198
590
  return cache.tryGet(url, async () => {
199
- const data = await ofetch(url);
200
- const $ = load(data);
201
-
202
- const title = ($('meta[property="og:title"]').attr('content') || '').replaceAll('\\r', '').replaceAll('\\n', ' ');
203
- const author = $('meta[name=author]').attr('content');
204
- let summary = $('meta[name=description]').attr('content');
205
- summary = summary === title ? '' : summary;
206
- let description = fixArticleContent($('#js_content'));
207
- // No article get or article is too short, try the original url
208
- const originalUrl = detectOriginalArticleUrl($);
209
- if (originalUrl) {
210
- // try to fetch the description from the original article
211
- const data = await ofetch(normalizeUrl(originalUrl, bypassHostCheck));
212
- const original$ = load(data);
213
- description += fixArticleContent(original$('#js_content'));
214
- }
215
-
216
- const sourceUrl = detectSourceUrl($);
217
- if (sourceUrl) {
218
- description += `<a href="${sourceUrl}">阅读原文</a>`;
219
- }
220
-
221
- let pubDate;
222
- const publish_time_script = $('script[nonce][type="text/javascript"]:contains("var ct")').text();
223
- const publish_time_match = publish_time_script && publish_time_script.match(/var ct *= *"?(\d{10})"?/);
224
- const publish_timestamp = publish_time_match && publish_time_match[1];
225
- if (publish_timestamp) {
226
- pubDate = parseDate(Number.parseInt(publish_timestamp) * 1000);
227
- }
228
-
229
- let mpName = $('.profile_nickname').first().text();
230
- mpName = mpName && mpName.trim();
231
- return { title, author, description, summary, pubDate, mpName, link: url };
591
+ const raw = await redirectHelper(url);
592
+ // pass the redirected URL to dispatcher for better error logging
593
+ const page = await PageParsers.dispatch(raw._data, raw.url);
594
+ return { ...page, link: url };
232
595
  }) as Promise<{
233
596
  title: string;
234
597
  author: string;
@@ -237,6 +600,9 @@ const fetchArticle = (url, bypassHostCheck = false) => {
237
600
  pubDate?: Date;
238
601
  mpName?: string;
239
602
  link: string;
603
+ enclosure_type?: string;
604
+ enclosure_url?: string;
605
+ itunes_duration?: string | number;
240
606
  }>;
241
607
  };
242
608
 
@@ -257,18 +623,23 @@ const fetchArticle = (url, bypassHostCheck = false) => {
257
623
  * @return {Promise<object>} - The incoming `item` object, with the article and its metadata filled in.
258
624
  */
259
625
  const finishArticleItem = async (item, setMpNameAsAuthor = false, skipLink = false) => {
260
- const { title, author, description, summary, pubDate, mpName, link } = await fetchArticle(item.link);
261
- item.title = title || item.title;
262
- item.description = description || item.description;
263
- item.summary = summary || item.summary;
264
- item.pubDate = pubDate || item.pubDate;
265
- item.author = setMpNameAsAuthor
266
- ? mpName || item.author // the Official Account itself. if your route return articles from different accounts, you may want to use this
267
- : author || item.author; // the real author of the article. if your route return articles from a certain account, use this
268
- if (!skipLink) {
269
- item.link = link || item.link;
626
+ const fetchedItem = await fetchArticle(item.link);
627
+ for (const key in fetchedItem) {
628
+ switch (key) {
629
+ case 'author':
630
+ item.author = setMpNameAsAuthor
631
+ ? fetchedItem.mpName || item.author // the Official Account itself. if your route return articles from different accounts, you may want to use this
632
+ : fetchedItem.author || item.author; // the real author of the article. if your route return articles from a certain account, use this
633
+ break;
634
+ case 'link':
635
+ item.link = skipLink ? item.link : fetchedItem.link || item.link;
636
+ break;
637
+ default:
638
+ item[key] = item[key] || fetchedItem[key];
639
+ }
270
640
  }
271
641
  return item;
272
642
  };
273
643
 
274
- export { fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };
644
+ const exportedForTestingOnly = { toggleWerror, ExtractMetadata, showTypeMapReverse };
645
+ export { exportedForTestingOnly, WeChatMpError, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };
@@ -10,7 +10,7 @@ const NETWORK = 'RSS';
10
10
  const TAG = 'RSS';
11
11
  const TYPE = 'feed';
12
12
 
13
- const rss3Ums = (data) => {
13
+ const rss3 = (data) => {
14
14
  const currentUnixTsp = dayjs().unix();
15
15
  const umsResult = {
16
16
  data: data.item.map((item) => {
@@ -59,4 +59,4 @@ function getOwnershipFieldFromURL(item) {
59
59
  }
60
60
  }
61
61
 
62
- export default rss3Ums;
62
+ export default rss3;