npm - rsshub - Versions diffs - 1.0.0-master.f71451d → 1.0.0-master.f75997f - Mend

rsshub 1.0.0-master.f71451d → 1.0.0-master.f75997f

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

package/lib/config.ts +14 -0
package/lib/errors/index.test.ts +2 -2
package/lib/middleware/template.tsx +12 -3
package/lib/routes/0x80/index.ts +87 -0
package/lib/routes/0x80/namespace.ts +7 -0
package/lib/routes/aljazeera/index.ts +17 -14
package/lib/routes/apple/podcast.ts +64 -0
package/lib/routes/bilibili/cache.ts +1 -1
package/lib/routes/bing/daily-wallpaper.ts +9 -8
package/lib/routes/byau/namespace.ts +6 -0
package/lib/routes/byau/xinwen/index.ts +72 -0
package/lib/routes/cpcaauto/index.ts +255 -0
package/lib/routes/cpcaauto/namespace.ts +8 -0
package/lib/routes/dehenglaw/index.ts +128 -0
package/lib/routes/dehenglaw/namespace.ts +8 -0
package/lib/routes/dehenglaw/templates/description.art +7 -0
package/lib/routes/gov/stats/index.ts +25 -22
package/lib/routes/gxmzu/ai.ts +1 -1
package/lib/routes/gxmzu/lib.ts +9 -26
package/lib/routes/gxmzu/utils/index.ts +31 -13
package/lib/routes/gxmzu/yjs.ts +1 -1
package/lib/routes/jou/utils/index.ts +35 -25
package/lib/routes/lofter/tag.ts +3 -3
package/lib/routes/lofter/user.ts +3 -3
package/lib/routes/njxzc/utils/index.ts +31 -13
package/lib/routes/qingting/podcast.ts +61 -39
package/lib/routes/reuters/common.ts +2 -2
package/lib/routes/sara/index.ts +66 -0
package/lib/routes/sara/namespace.ts +6 -0
package/lib/routes/tencent/news/author.ts +13 -11
package/lib/routes/test/index.ts +11 -1
package/lib/routes/twitter/api/mobile-api/login.ts +29 -28
package/lib/routes/twitter/namespace.ts +2 -2
package/lib/routes/twitter/user.ts +5 -0
package/lib/routes/u3c3/index.ts +1 -1
package/lib/routes/u3c3/namespace.ts +1 -1
package/lib/routes/u9a9/index.ts +2 -2
package/lib/routes/u9a9/namespace.ts +1 -1
package/lib/routes/zsxq/group.ts +63 -0
package/lib/routes/zsxq/namespace.ts +6 -0
package/lib/routes/zsxq/types.ts +149 -0
package/lib/routes/zsxq/user.ts +58 -0
package/lib/routes/zsxq/utils.ts +70 -0
package/lib/setup.test.ts +183 -12
package/lib/utils/render.ts +1 -1
package/lib/utils/request-rewriter/get.ts +8 -1
package/lib/utils/wechat-mp.test.ts +411 -32
package/lib/utils/wechat-mp.ts +447 -76
package/lib/views/{rss3-ums.ts → rss3.ts} +2 -2
package/package.json +14 -14

package/lib/utils/wechat-mp.ts CHANGED Viewed

@@ -26,9 +26,240 @@
  */
 import ofetch from '@/utils/ofetch';
-import { load, type Cheerio, type Element } from 'cheerio';
+import { type Cheerio, type CheerioAPI, type Element, load } from 'cheerio';
 import { parseDate } from '@/utils/parse-date';
 import cache from '@/utils/cache';
+import logger from '@/utils/logger';
+/**
+ * Error type thrown by the wechat-mp helpers, so callers can distinguish
+ * failures raised here from generic errors.
+ */
+class WeChatMpError extends Error {
+    override name = 'WeChatMpError';
+    constructor(message: string) {
+        super(message);
+    }
+}
+// Handles mentioned in the "raise an issue" hint appended to log messages.
+const MAINTAINERS = ['@Rongronggg9'];
+/** Join the given parts with ': ' behind the `wechat-mp: ` prefix. */
+const formatLogNoMention = (...params: string[]): string => 'wechat-mp: ' + params.join(': ');
+/** Same as `formatLogNoMention`, followed by a second line asking for an issue that mentions the maintainers. */
+const formatLog = (...params: string[]): string => {
+    const mentions = MAINTAINERS.join(', ');
+    return `${formatLogNoMention(...params)}\nConsider raise an issue (mentioning ${mentions}) with the article URL for further investigation`;
+};
+// Declared with `let` on purpose: `toggleWerror` reassigns it.
+let warn = (...params: string[]) => logger.warn(formatLog(...params));
+/**
+ * Log the formatted message (including the maintainer-mention hint) at error
+ * level, then throw it as a `WeChatMpError`. Never returns.
+ *
+ * @param params - Message parts, joined by `formatLog`.
+ * @throws WeChatMpError always.
+ */
+const error = (...params: string[]): never => {
+    const formatted = formatLog(...params);
+    logger.error(formatted);
+    throw new WeChatMpError(formatted);
+};
+/**
+ * Like `error`, but the message is built with `formatLogNoMention`, i.e.
+ * without the hint asking to raise an issue. Never returns.
+ *
+ * @param params - Message parts, joined by `formatLogNoMention`.
+ * @throws WeChatMpError always.
+ */
+const errorNoMention = (...params: string[]): never => {
+    const formatted = formatLogNoMention(...params);
+    logger.error(formatted);
+    throw new WeChatMpError(formatted);
+};
+/**
+ * Switch "warnings as errors" mode on or off by reassigning the module-level
+ * `warn`.
+ *
+ * @param on - `true` makes `warn` throw through `error('WarningAsError', ...)`;
+ *             `false` restores the logging implementation captured when this
+ *             module was initialised.
+ */
+const toggleWerror = (() => {
+    // Snapshot the plain logging `warn` before it can be swapped out.
+    const loggingWarn = warn;
+    const throwingWarn = (...params: string[]) => error('WarningAsError', ...params);
+    return (on: boolean) => {
+        if (on) {
+            warn = throwingWarn;
+        } else {
+            warn = loggingWarn;
+        }
+    };
+})();
+/**
+ * Replace carriage returns and newlines in `text`, whether they are real
+ * control characters or the escaped spellings `\r` / `\x0d` and `\n` / `\x0a`.
+ *
+ * @param text - Input text.
+ * @param replaceReturnWith - Replacement for carriage returns (default: removed).
+ * @param replaceNewlineWith - Replacement for newlines (default: `<br>`).
+ * @returns The text with carriage returns replaced first, then newlines.
+ */
+const replaceReturnNewline = (() => {
+    // Patterns are built once and shared by every call.
+    const carriageReturnPattern = /\r|\\(r|x0d)/g;
+    const lineFeedPattern = /\n|\\(n|x0a)/g;
+    return (text: string, replaceReturnWith = '', replaceNewlineWith = '<br>') => {
+        const withoutReturns = text.replaceAll(carriageReturnPattern, replaceReturnWith);
+        return withoutReturns.replaceAll(lineFeedPattern, replaceNewlineWith);
+    };
+})();
+/**
+ * Turn every `&amp;` (or its escaped spelling `\x26amp;`) in `text` back
+ * into a plain `&`.
+ */
+const fixUrl = (() => {
+    const escapedAmpersandPattern = /(&|\\x26)amp;/g;
+    return (text: string) => {
+        const unescaped = text.replaceAll(escapedAmpersandPattern, '&');
+        return unescaped;
+    };
+})();
+/**
+ * Control-flow signal: thrown inside a `forEachScript` callback to skip the
+ * current script and move on to the next one.
+ */
+class LoopContinue extends Error {
+    override name = 'LoopContinue';
+    constructor() {
+        super('');
+    }
+}
+/**
+ * Control-flow signal: thrown inside a `forEachScript` callback to stop the
+ * iteration; `forEachScript` returns the carried `to_return` value.
+ */
+class LoopReturn extends Error {
+    override name = 'LoopReturn';
+    /** Value handed back to the caller of `forEachScript`. */
+    to_return: any;
+    constructor(to_return: any) {
+        super('');
+        this.to_return = to_return;
+    }
+}
+/**
+ * Run `callback` on each script until one of them asks to stop.
+ *
+ * @param $ - A loaded Cheerio document whose elements matching `selector` are
+ *            iterated, or a plain string that is treated as the only "script".
+ * @param callback - Invoked per script. Throw `LoopReturn` to stop and return a
+ *                   value, or `LoopContinue` to skip to the next script.
+ * @param defaultReturn - Returned when no callback threw `LoopReturn`.
+ * @param selector - Selector used to collect scripts when `$` is a document.
+ * @returns The value carried by `LoopReturn`, otherwise `defaultReturn`.
+ * @throws Anything else thrown by `callback` is propagated unchanged.
+ */
+const forEachScript = ($: CheerioAPI | string, callback: (script) => void, defaultReturn: any = null, selector = 'script[nonce][type="text/javascript"]') => {
+    const candidates = typeof $ === 'string' ? [$] : $(selector).toArray();
+    for (const candidate of candidates) {
+        try {
+            callback(candidate);
+        } catch (thrown) {
+            if (thrown instanceof LoopReturn) {
+                return thrown.to_return;
+            }
+            if (!(thrown instanceof LoopContinue)) {
+                throw thrown;
+            }
+            // LoopContinue: nothing to do, proceed with the next script
+        }
+    }
+    return defaultReturn;
+};
+// Page type name -> `item_show_type` code.
+// To look for more types, view the source of a *_SHARE_PAGE article and search for `ITEM_SHOW_TYPE_MAP`.
+// Please keep the notes below up to date when new types or new examples turn up.
+const showTypeMap = {
+    // Regular "article". It may embed media, but the type stays the same.
+    // Example with audio and iframe: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
+    APP_MSG_PAGE: '0',
+    // Example: https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532942&idx=1&sn=a84e4adbe49fdb39e4d4c1b5c12a4c3f
+    VIDEO_SHARE_PAGE: '5',
+    MUSIC_SHARE_PAGE: '6',
+    // Example: https://mp.weixin.qq.com/s/FY6yQC_e4NMAxK0FBr6jwQ
+    AUDIO_SHARE_PAGE: '7',
+    // Examples:
+    // https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
+    // https://mp.weixin.qq.com/s?__biz=Mzg4NTA1MTkwNA==&mid=2247532936&idx=4&sn=624054c20ded6ee85c6632f419c6f758
+    IMG_SHARE_PAGE: '8',
+    TEXT_SHARE_PAGE: '10',
+    SHORT_CONTENT_PAGE: '17',
+};
+// Inverse lookup: `item_show_type` code -> page type name.
+const showTypeMapReverse: { [k: string]: string } = {};
+for (const [pageType, code] of Object.entries(showTypeMap)) {
+    showTypeMapReverse[code] = pageType;
+}
+/**
+ * Extractors that pull metadata out of the inline scripts of an article page
+ * by matching quoted assignments (`name = "value"` / `name: 'value'`).
+ *
+ * NOTE: the static initialisers below reference `this` and each other, so the
+ * declaration order of the members matters.
+ */
+class ExtractMetadata {
+    // Build a global, multiline RegExp matching `<varName> <assignPattern> "<value>"`;
+    // the closing quote must be the same character as the opening one.
+    private static genAssignmentRegExp = (varName: string, valuePattern: string, assignPattern: string) => RegExp(`\\b${varName}\\s*${assignPattern}\\s*(?<quote>["'])(?<value>${valuePattern})\\k<quote>`, 'mg');
+    // Create an extractor for one variable.
+    // - multiple = false: returns the first matched value.
+    // - multiple = true: returns every matched value as an array.
+    // - nothing matched: throws LoopContinue unless `allowNotFound` is set, in
+    //   which case it returns `[]` (multiple) or `null` (single).
+    private static genExtractFunc = (
+        varName: string,
+        {
+            valuePattern = '\\w+',
+            assignPattern = '=',
+            allowNotFound = false,
+            multiple = false,
+        }: {
+            valuePattern?: string;
+            assignPattern?: string;
+            allowNotFound?: boolean;
+            multiple?: boolean;
+        }
+    ) => {
+        const regExp = this.genAssignmentRegExp(varName, valuePattern, assignPattern);
+        return (str: string) => {
+            const values: string[] = [];
+            for (const match of str.matchAll(regExp)) {
+                const value = <string>match.groups?.value;
+                if (!multiple) {
+                    return value;
+                }
+                values.push(value);
+            }
+            if (!allowNotFound && values.length === 0) {
+                throw new LoopContinue();
+            }
+            return multiple ? values : null;
+        };
+    };
+    // Run every extractor against `scriptText` and collect the results by key.
+    // The script text itself is stored under `_extractedFrom`.
+    private static doExtract = (metadataToBeExtracted: Record<string, (str: string) => string | string[] | null | undefined>, scriptText: string) => {
+        const metadataExtracted: Record<string, string | string[]> = {};
+        for (const [key, extractFunc] of Object.entries(metadataToBeExtracted)) {
+            metadataExtracted[key] = <string>extractFunc(scriptText);
+        }
+        metadataExtracted._extractedFrom = scriptText;
+        return metadataExtracted;
+    };
+    // `item_show_type` and `real_item_show_type` are required; `ct` and
+    // `msg_source_url` are optional.
+    private static commonMetadataToBeExtracted = {
+        showType: this.genExtractFunc('item_show_type', { valuePattern: '\\d+' }),
+        realShowType: this.genExtractFunc('real_item_show_type', { valuePattern: '\\d+' }),
+        createTime: this.genExtractFunc('ct', { valuePattern: '\\d+', allowNotFound: true }),
+        sourceUrl: this.genExtractFunc('msg_source_url', { valuePattern: `https?://[^'"]*`, allowNotFound: true }),
+    };
+    /**
+     * Extract the metadata shared by all page types from the first script
+     * containing "real_item_show_type" in which the required fields are found.
+     * Numeric show types are translated to their names via `showTypeMapReverse`;
+     * unknown or mismatching types only trigger a warning.
+     * Returns `{}` when no script yields the metadata.
+     */
+    static common = ($: CheerioAPI) =>
+        forEachScript(
+            $,
+            (script) => {
+                const scriptText = $(script).text();
+                const metadataExtracted = <Record<string, string>> this.doExtract(this.commonMetadataToBeExtracted, scriptText);
+                const showType = showTypeMapReverse[metadataExtracted.showType];
+                const realShowType = showTypeMapReverse[metadataExtracted.realShowType];
+                metadataExtracted.sourceUrl = metadataExtracted.sourceUrl && fixUrl(metadataExtracted.sourceUrl);
+                if (showType) {
+                    metadataExtracted.showType = showType;
+                } else {
+                    warn('showType not found', `item_show_type=${metadataExtracted.showType}`);
+                }
+                if (realShowType) {
+                    metadataExtracted.realShowType = realShowType;
+                } else {
+                    warn('realShowType not found', `real_item_show_type=${metadataExtracted.realShowType}`);
+                }
+                if (metadataExtracted.showType !== metadataExtracted.realShowType) {
+                    // never seen this happen, waiting for examples
+                    warn('showType mismatch', `item_show_type=${metadataExtracted.showType}, real_item_show_type=${metadataExtracted.realShowType}`);
+                }
+                throw new LoopReturn(metadataExtracted);
+            },
+            {},
+            'script[nonce][type="text/javascript"]:contains("real_item_show_type")'
+        );
+    // `voiceid` is required; `duration` is optional and may match an empty string.
+    private static audioMetadataToBeExtracted = {
+        voiceId: this.genExtractFunc('voiceid', { assignPattern: ':' }),
+        duration: this.genExtractFunc('duration', { valuePattern: '\\d*', assignPattern: ':', allowNotFound: true }),
+    };
+    // never seen an audio article containing multiple audio clips, waiting for examples
+    /**
+     * Extract `voiceId` / `duration` from the first script containing "voiceid"
+     * in which a voice id is found. Returns `{}` when there is none.
+     */
+    static audio = ($: CheerioAPI) =>
+        forEachScript(
+            $,
+            (script) => {
+                const scriptText = $(script).text();
+                const metadataExtracted = <Record<string, string>> this.doExtract(this.audioMetadataToBeExtracted, scriptText);
+                throw new LoopReturn(metadataExtracted);
+            },
+            {},
+            'script[nonce][type="text/javascript"]:contains("voiceid")'
+        );
+    // Collects every `cdn_url: "<http(s) url>"` occurrence.
+    private static imgMetadataToBeExtracted = {
+        imgUrls: this.genExtractFunc('cdn_url', { valuePattern: `https?://[^'"]*`, assignPattern: ':', multiple: true }),
+    };
+    /**
+     * Extract all image URLs (`imgUrls`, passed through `fixUrl`) from the first
+     * script containing "picture_page_info_list" that has at least one `cdn_url`.
+     * Returns `{}` when there is none.
+     */
+    static img = ($: CheerioAPI) =>
+        forEachScript(
+            $,
+            (script) => {
+                const scriptText = $(script).text();
+                const metadataExtracted = <Record<string, string[]>> this.doExtract(this.imgMetadataToBeExtracted, scriptText);
+                if (Array.isArray(metadataExtracted.imgUrls)) {
+                    metadataExtracted.imgUrls = metadataExtracted.imgUrls.map((url) => fixUrl(url));
+                }
+                throw new LoopReturn(metadataExtracted);
+            },
+            {},
+            'script[nonce][type="text/javascript"]:contains("picture_page_info_list")'
+        );
+    // All three fields are required (no `allowNotFound`); their values may be empty strings.
+    private static locationMetadataToBeExtracted = {
+        countryName: this.genExtractFunc('countryName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
+        provinceName: this.genExtractFunc('provinceName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
+        cityName: this.genExtractFunc('cityName', { valuePattern: `[^'"]*`, assignPattern: ':' }),
+    };
+    /**
+     * Extract `countryName` / `provinceName` / `cityName` from the first script
+     * containing "countryName" in which all three are found. Returns `{}`
+     * when there is none.
+     */
+    static location = ($: CheerioAPI) =>
+        forEachScript(
+            $,
+            (script) => {
+                const scriptText = $(script).text();
+                const metadataExtracted = this.doExtract(this.locationMetadataToBeExtracted, scriptText);
+                throw new LoopReturn(metadataExtracted);
+            },
+            {},
+            'script[nonce][type="text/javascript"]:contains("countryName")'
+        );
+}
 const replaceTag = ($, oldTag, newTagName) => {
     oldTag = $(oldTag);
@@ -55,15 +286,23 @@ const detectOriginalArticleUrl = ($) => {
     return null;
 };
-const detectSourceUrl = ($) => {
-    const matchs = $.root()
-        .html()
-        .match(/msg_source_url = '(.+)';/);
-    if (matchs) {
-        return matchs[1];
-    }
-    return null;
+/**
+ * Build the URL an audio clip can be fetched from.
+ *
+ * The id is percent-encoded so that an unexpected character (e.g. `&` or `#`
+ * coming from an HTML attribute) cannot alter the query string. Ids made of
+ * word characters are left untouched.
+ *
+ * @param voiceId - The voice/media id of the clip.
+ * @returns The `getvoice` URL for that id.
+ */
+const genAudioSrc = (voiceId: string) => `https://res.wx.qq.com/voice/getvoice?mediaid=${encodeURIComponent(voiceId)}`;
+/**
+ * Render an `<audio>` player as an HTML string.
+ *
+ * `<audio>` is not a void element, so it is emitted with an explicit end tag:
+ * with the self-closing spelling (`<audio ... />`) an HTML parser keeps the
+ * element open and treats whatever markup is concatenated after it as hidden
+ * fallback content. Attribute values are escaped so that a title containing
+ * `"`, `&`, `<` or `>` cannot break the markup.
+ *
+ * @param src - URL of the audio file.
+ * @param title - Human-readable title of the clip.
+ * @returns The `<audio>` element markup.
+ */
+const genAudioTag = (src: string, title: string) => {
+    const escapeAttr = (value: string) => value.replace(/&/g, '&amp;').replace(/"/g, '&quot;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+    return `<audio controls src="${escapeAttr(src)}" title="${escapeAttr(title)}" style="width:100%"></audio>`;
+};
+/**
+ * Build the URL of the embeddable v.qq.com iframe player for a video.
+ *
+ * @param videoId - The `vid` of the video.
+ * @returns The player URL; all query parameters other than `vid` are fixed.
+ */
+const genVideoSrc = (videoId: string) => {
+    const playerParams: Array<[string, string]> = [
+        ['origin', 'https://mp.weixin.qq.com'],
+        ['containerId', 'js_tx_video_container_0.3863487104715233'],
+        ['vid', videoId],
+        ['width', '677'],
+        ['height', '380.8125'],
+        ['autoplay', 'false'],
+        ['allowFullScreen', 'true'],
+        ['chid', '17'],
+        ['full', 'true'],
+        ['show1080p', 'false'],
+        ['isDebugIframe', 'false'],
+    ];
+    const query = new URLSearchParams(playerParams).toString();
+    return 'https://v.qq.com/txp/iframe/player.html?' + query;
 };
 /**
@@ -99,6 +338,33 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
             }
         });
     }
+    // fix audio: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
+    $('mpvoice[voice_encode_fileid]').each((_, voice) => {
+        const $voice = $(voice);
+        const voiceId = $voice.attr('voice_encode_fileid');
+        if (voiceId) {
+            const title = $voice.attr('name') || 'Audio';
+            $voice.replaceWith(genAudioTag(genAudioSrc(voiceId), title));
+        }
+    });
+    // fix iframe: https://mp.weixin.qq.com/s/FnjcMXZ1xdS-d6n-pUUyyw
+    $('iframe.video_iframe[data-src]').each((_, iframe) => {
+        const $iframe = $(iframe);
+        const dataSrc = <string>$iframe.attr('data-src');
+        const srcUrlObj = new URL(dataSrc);
+        if (srcUrlObj.host === 'v.qq.com' && srcUrlObj.searchParams.has('vid')) {
+            const newSrc = genVideoSrc(<string>srcUrlObj.searchParams.get('vid'));
+            $iframe.attr('src', newSrc);
+            $iframe.removeAttr('data-src');
+            const width = $iframe.attr('data-w');
+            const ratio = $iframe.attr('data-ratio');
+            if (width && ratio) {
+                const width_ = Math.min(Number.parseInt(width), 677);
+                $iframe.attr('width', width_.toString());
+                $iframe.attr('height', (width_ / Number.parseFloat(ratio)).toString());
+            }
+        } // else {} FIXME: https://mp.weixin.qq.com/s?__biz=Mzg5Mjk3MzE4OQ==&mid=2247549515&idx=2&sn=a608fca597f0589c1aebd6d0b82ff6e9
+    });
     // fix section
     $('section').each((_, section) => {
         const $section = $(section);
@@ -122,17 +388,6 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
     // clear line index tags in code section
     $('.code-snippet__line-index').remove();
-    // fix single picture article
-    // example: https://mp.weixin.qq.com/s/4p5YmYuASiQSYFiy7KqydQ
-    $('script').each((_, script) => {
-        const $script = $(script);
-        const matchs = $script.html()?.match(/document\.getElementById\('js_image_desc'\)\.innerHTML = "(.*)"\.replace/);
-        if (matchs) {
-            $script.replaceWith(matchs[1].replaceAll('\r', '').replaceAll('\n', '<br>').replaceAll('\\x0d', '').replaceAll('\\x0a', '<br>'));
-        }
-    });
     // clean scripts
     $('script').remove();
     return $.html();
@@ -146,18 +401,21 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
 // abtest_cookie, wx_header
 // Known params (temporary link):
 // src, timestamp, ver, signature, new (unessential)
-const normalizeUrl = (url, bypassHostCheck = false) => {
+const normalizeUrl = (url: string, bypassHostCheck = false) => {
     const oriUrl = url;
+    // already seen some weird urls with `&` escaped as `&amp;`, so fix it
+    // calling fixUrl should always be safe since having `&amp;` or `\x26` in a URL is meaningless
+    url = fixUrl(url);
     const urlObj = new URL(url);
     if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') {
-        throw new Error('wechat-mp: URL host must be "mp.weixin.qq.com", but got ' + oriUrl);
+        error('URL host must be "mp.weixin.qq.com"', url);
     }
     urlObj.protocol = 'https:';
     urlObj.hash = ''; // remove hash
-    if (/^\/s\/.+/.test(urlObj.pathname)) {
+    if (urlObj.pathname.startsWith('/s/')) {
         // a short link, just remove all the params
         urlObj.search = '';
-    } else if (/^\/s$/.test(urlObj.pathname)) {
+    } else if (urlObj.pathname === '/s') {
         const biz = urlObj.searchParams.get('__biz');
         const mid = urlObj.searchParams.get('mid') || urlObj.searchParams.get('appmsgid');
         const idx = urlObj.searchParams.get('idx') || urlObj.searchParams.get('itemidx');
@@ -175,60 +433,165 @@ const normalizeUrl = (url, bypassHostCheck = false) => {
                 // a temporary link, remove all unessential params
                 urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`;
             } else {
-                // unknown link, just let it go
+                warn('unknown URL search parameters', oriUrl);
             }
         }
     } else {
-        // IDK what it is, just let it go
+        warn('unknown URL path', oriUrl);
     }
     return urlObj.href;
 };
+/**
+ * Parsers turning a fetched article page into a feed item, one per page type.
+ * `dispatch` is the only public entry point; it selects the parser from the
+ * page's `showType`.
+ */
+class PageParsers {
+    /**
+     * Fields every page type shares, read from `<meta>` tags, the account
+     * nickname element and the common metadata.
+     * `description` starts as the meta description; `summary` is blanked when
+     * it merely repeats the title.
+     */
+    private static common = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
+        const title = replaceReturnNewline($('meta[property="og:title"]').attr('content') || '', '', ' ');
+        const author = replaceReturnNewline($('meta[name=author]').attr('content') || '', '', ' ');
+        const pubDate = commonMetadata.createTime ? parseDate(Number.parseInt(commonMetadata.createTime) * 1000) : undefined;
+        const mpName = $('.wx_follow_nickname').first().text()?.trim();
+        let summary = replaceReturnNewline($('meta[name=description]').attr('content') || '');
+        const description = summary;
+        summary = summary.replaceAll('<br>', ' ') === title ? '' : summary;
+        return { title, author, description, summary, pubDate, mpName } as {
+            title: string;
+            author: string;
+            description: string;
+            summary: string;
+            pubDate?: Date;
+            mpName?: string;
+            enclosure_url?: string;
+            itunes_duration?: string | number;
+            enclosure_type?: string;
+        };
+    };
+    /**
+     * Regular article: the description is the cleaned `#js_content`. When an
+     * original-article URL is detected, that article is fetched and its content
+     * is appended.
+     */
+    private static appMsg = async ($: CheerioAPI, commonMetadata: Record<string, string>) => {
+        const page = PageParsers.common($, commonMetadata);
+        page.description = fixArticleContent($('#js_content'));
+        const originalArticleUrl = detectOriginalArticleUrl($);
+        if (originalArticleUrl) {
+            // No article or article is too short, try to fetch the description from the original article
+            // NOTE(review): the host check is always enforced here, even if the caller of `fetchArticle`
+            // asked to bypass it — confirm this is intended.
+            const data = await ofetch(normalizeUrl(originalArticleUrl));
+            const original$ = load(data);
+            page.description += fixArticleContent(original$('#js_content'));
+        }
+        return page;
+    };
+    /** Image page: appends one `<img>` per extracted image URL to the description. */
+    private static img = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
+        const page = PageParsers.common($, commonMetadata);
+        const imgUrls = ExtractMetadata.img($)?.imgUrls;
+        let imgHtml = '';
+        if (Array.isArray(imgUrls) && imgUrls.length > 0) {
+            for (const imgUrl of imgUrls) {
+                imgHtml += `<br><br><img src="${imgUrl}" />`;
+            }
+        }
+        page.description += imgHtml;
+        return page;
+    };
+    /**
+     * Audio page: sets the enclosure fields and appends an audio player to the
+     * description. When no voice id could be extracted, a warning is raised and
+     * the page is returned without enclosure instead of pointing at a bogus
+     * `mediaid=undefined` URL.
+     */
+    private static audio = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
+        const page = PageParsers.common($, commonMetadata);
+        const audioMetadata = ExtractMetadata.audio($);
+        if (!audioMetadata.voiceId) {
+            warn('voiceId not found in AUDIO_SHARE_PAGE');
+            return page;
+        }
+        const audioUrl = genAudioSrc(audioMetadata.voiceId);
+        page.enclosure_url = audioUrl;
+        page.itunes_duration = audioMetadata.duration;
+        page.enclosure_type = 'audio/mp3'; // FIXME: may it be other types?
+        page.description += '<br><br>' + genAudioTag(audioUrl, page.title);
+        return page;
+    };
+    /** Fallback for page types without a dedicated parser: appends the `og:image`, if any. */
+    private static fallback = ($: CheerioAPI, commonMetadata: Record<string, string>) => {
+        const page = PageParsers.common($, commonMetadata);
+        const image = $('meta[property="og:image"]').attr('content');
+        if (image) {
+            page.description += `<br><br><img src="${image}" />`;
+        }
+        return page;
+    };
+    /**
+     * Parse a fetched page.
+     *
+     * @param html - The page HTML.
+     * @param url - The URL the page was fetched from; used in log/error messages
+     *              and to detect captcha pages.
+     * @returns The parsed item, with location and source link appended to the
+     *          description when available.
+     * @throws WeChatMpError when the page carries no recognisable metadata
+     *         (deleted article, WAF block, or unknown page).
+     */
+    static dispatch = async (html: string, url: string) => {
+        const $ = load(html);
+        const commonMetadata = ExtractMetadata.common($);
+        let page: Record<string, any>;
+        let pageText: string, pageTextShort: string;
+        switch (commonMetadata.showType) {
+            case 'APP_MSG_PAGE':
+                page = await PageParsers.appMsg($, commonMetadata);
+                break;
+            case 'AUDIO_SHARE_PAGE':
+                page = PageParsers.audio($, commonMetadata);
+                break;
+            case 'IMG_SHARE_PAGE':
+                page = PageParsers.img($, commonMetadata);
+                break;
+            case 'VIDEO_SHARE_PAGE':
+                page = PageParsers.fallback($, commonMetadata);
+                break;
+            case undefined:
+                $('script, style').remove();
+                pageText = $('title, body').text().replaceAll(/\s+/g, ' ').trim();
+                // Truncate only when the ellipsis actually saves space; shorter texts are kept whole
+                // (previously texts of 26-27 characters were silently cut to 25 without an ellipsis).
+                pageTextShort = pageText.length >= 25 + '...'.length ? `${pageText.slice(0, 25)}...` : pageText;
+                if (pageText.includes('已被发布者删除')) {
+                    errorNoMention('deleted by author', pageTextShort, url);
+                } else if (new URL(url).pathname.includes('captcha') || pageText.includes('环境异常')) {
+                    errorNoMention('request blocked by WAF', pageTextShort, url);
+                } else {
+                    error('unknown page, probably due to WAF', pageTextShort, url);
+                }
+                return {}; // just to make TypeScript happy, actually UNREACHABLE
+            default:
+                warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`, url);
+                page = PageParsers.fallback($, commonMetadata);
+        }
+        const locationMetadata = ExtractMetadata.location($);
+        // join the non-empty parts with single spaces
+        const location = [locationMetadata.countryName, locationMetadata.provinceName, locationMetadata.cityName]
+            .filter(Boolean)
+            .join(' ')
+            .trim();
+        if (location) {
+            page.description += `<p>📍发表于：${location}</p>`;
+        }
+        if (commonMetadata.sourceUrl) {
+            page.description += `<p><a href="${commonMetadata.sourceUrl}">🔗️ 阅读原文</a></p>`;
+        }
+        return page;
+    };
+}
+/**
+ * Fetch `url`, following HTTP redirects manually.
+ *
+ * A `Location` header may be a relative reference, so it is resolved against
+ * the URL of the response before the next request is made.
+ *
+ * @param url - The URL to fetch.
+ * @param maxRedirects - Redirect budget; it is decremented on every request and
+ *                       a redirect seen once it has reached zero is an error.
+ * @returns The raw response of the first non-redirect answer.
+ * @throws WeChatMpError when a redirect has no `Location` header or the budget
+ *         is exhausted.
+ */
+const redirectHelper = async (url: string, maxRedirects: number = 5) => {
+    maxRedirects--;
+    const raw = await ofetch.raw(url);
+    if ([301, 302, 303, 307, 308].includes(raw.status)) {
+        const location = raw.headers.get('location');
+        if (!location) {
+            return error('redirect without location', url);
+        }
+        if (maxRedirects <= 0) {
+            return error('too many redirects', url);
+        }
+        // resolve relative redirects against the URL that produced them
+        const nextUrl = new URL(location, raw.url || url).href;
+        return await redirectHelper(nextUrl, maxRedirects);
+    }
+    return raw;
+};
 /**
  * Fetch article and its metadata from WeChat MP (mp.weixin.qq.com).
  *
  * If you use this function, no need to call `fixArticleContent`
- * @param {object} ctx - The context object.
- * @param {string} url - The url of the article.
- * @param {boolean} bypassHostCheck - Whether to bypass host check.
- * @return {Promise<object>} - An object containing the article and its metadata.
+ * @param url - The url of the article.
+ * @param bypassHostCheck - Whether to bypass host check.
+ * @return - An object containing the article and its metadata.
  */
-const fetchArticle = (url, bypassHostCheck = false) => {
+const fetchArticle = (url: string, bypassHostCheck: boolean = false) => {
     url = normalizeUrl(url, bypassHostCheck);
     return cache.tryGet(url, async () => {
-        const data = await ofetch(url);
-        const $ = load(data);
-        const title = ($('meta[property="og:title"]').attr('content') || '').replaceAll('\\r', '').replaceAll('\\n', ' ');
-        const author = $('meta[name=author]').attr('content');
-        let summary = $('meta[name=description]').attr('content');
-        summary = summary === title ? '' : summary;
-        let description = fixArticleContent($('#js_content'));
-        // No article get or article is too short, try the original url
-        const originalUrl = detectOriginalArticleUrl($);
-        if (originalUrl) {
-            // try to fetch the description from the original article
-            const data = await ofetch(normalizeUrl(originalUrl, bypassHostCheck));
-            const original$ = load(data);
-            description += fixArticleContent(original$('#js_content'));
-        }
-        const sourceUrl = detectSourceUrl($);
-        if (sourceUrl) {
-            description += `<a href="${sourceUrl}">阅读原文</a>`;
-        }
-        let pubDate;
-        const publish_time_script = $('script[nonce][type="text/javascript"]:contains("var ct")').text();
-        const publish_time_match = publish_time_script && publish_time_script.match(/var ct *= *"?(\d{10})"?/);
-        const publish_timestamp = publish_time_match && publish_time_match[1];
-        if (publish_timestamp) {
-            pubDate = parseDate(Number.parseInt(publish_timestamp) * 1000);
-        }
-        let mpName = $('.profile_nickname').first().text();
-        mpName = mpName && mpName.trim();
-        return { title, author, description, summary, pubDate, mpName, link: url };
+        const raw = await redirectHelper(url);
+        // pass the redirected URL to dispatcher for better error logging
+        const page = await PageParsers.dispatch(raw._data, raw.url);
+        return { ...page, link: url };
     }) as Promise<{
         title: string;
         author: string;
@@ -237,6 +600,9 @@ const fetchArticle = (url, bypassHostCheck = false) => {
         pubDate?: Date;
         mpName?: string;
         link: string;
+        enclosure_type?: string;
+        enclosure_url?: string;
+        itunes_duration?: string | number;
     }>;
 };
@@ -257,18 +623,23 @@ const fetchArticle = (url, bypassHostCheck = false) => {
  * @return {Promise<object>} - The incoming `item` object, with the article and its metadata filled in.
  */
 const finishArticleItem = async (item, setMpNameAsAuthor = false, skipLink = false) => {
-    const { title, author, description, summary, pubDate, mpName, link } = await fetchArticle(item.link);
-    item.title = title || item.title;
-    item.description = description || item.description;
-    item.summary = summary || item.summary;
-    item.pubDate = pubDate || item.pubDate;
-    item.author = setMpNameAsAuthor
-        ? mpName || item.author // the Official Account itself. if your route return articles from different accounts, you may want to use this
-        : author || item.author; // the real author of the article. if your route return articles from a certain account, use this
-    if (!skipLink) {
-        item.link = link || item.link;
+    const fetchedItem = await fetchArticle(item.link);
+    for (const key in fetchedItem) {
+        switch (key) {
+            case 'author':
+                item.author = setMpNameAsAuthor
+                    ? fetchedItem.mpName || item.author // the Official Account itself. if your route return articles from different accounts, you may want to use this
+                    : fetchedItem.author || item.author; // the real author of the article. if your route return articles from a certain account, use this
+                break;
+            case 'link':
+                item.link = skipLink ? item.link : fetchedItem.link || item.link;
+                break;
+            default:
+                item[key] = item[key] || fetchedItem[key];
+        }
     }
     return item;
 };
-export { fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };
+const exportedForTestingOnly = { toggleWerror, ExtractMetadata, showTypeMapReverse };
+export { exportedForTestingOnly, WeChatMpError, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };

package/lib/views/{rss3-ums.ts → rss3.ts} RENAMED Viewed

@@ -10,7 +10,7 @@ const NETWORK = 'RSS';
 const TAG = 'RSS';
 const TYPE = 'feed';
-const rss3Ums = (data) => {
+const rss3 = (data) => {
     const currentUnixTsp = dayjs().unix();
     const umsResult = {
         data: data.item.map((item) => {
@@ -59,4 +59,4 @@ function getOwnershipFieldFromURL(item) {
     }
 }
-export default rss3Ums;
+export default rss3;