npm - rssany - Versions diffs - 0.1.2 → 0.1.5 - Mend

rssany 0.1.2 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

package/app/plugins/builtin/supervisely-blog.rssany.js ADDED Viewed

@@ -0,0 +1,159 @@
+let _deps; // assigned from ctx.deps at the start of every fetchItems call
+// Supervisely Blog plugin: fetches the listing page and parses it into FeedItems (no full-text enrichment)
+const SUPERVISELY_ORIGIN = "https://supervisely.com"; // base URL used when fetchHtml reports no finalUrl
+const MONTH_INDEX = { // three-letter lowercase month abbreviation -> zero-based month index
+  jan: 0,
+  feb: 1,
+  mar: 2,
+  apr: 3,
+  may: 4,
+  jun: 5,
+  jul: 6,
+  aug: 7,
+  sep: 8,
+  oct: 9,
+  nov: 10,
+  dec: 11,
+};
+/**
+ * Collapse every run of whitespace into a single space and trim both ends.
+ *
+ * @param {string | null | undefined} text - Raw text, e.g. a node's textContent.
+ * @returns {string} The normalized text; "" for null or undefined input.
+ */
+function normalizeText(text) {
+  const raw = text ?? "";
+  const collapsed = raw.replace(/\s+/g, " ");
+  return collapsed.trim();
+}
+/**
+ * Derive an item GUID: the hex digest produced by the injected
+ * `_deps.createHash("sha256")` after feeding it `input`.
+ *
+ * @param {string} input - Value to hash (callers pass the item link).
+ * @returns {string} Hex-encoded digest.
+ */
+function hashGuid(input) {
+  const sha256 = _deps.createHash("sha256");
+  return sha256.update(input).digest("hex");
+}
+/**
+ * Resolve `href` against `baseUrl` and return it only if it is an http(s) URL.
+ *
+ * @param {string | null | undefined} href - Possibly relative link.
+ * @param {string} baseUrl - Base for relative resolution.
+ * @returns {string | null} Absolute URL, or null when `href` is empty,
+ *   cannot be parsed, or uses a non-http(s) scheme.
+ */
+function toAbsoluteUrl(href, baseUrl) {
+  if (!href) {
+    return null;
+  }
+  let resolved;
+  try {
+    resolved = new URL(href, baseUrl);
+  } catch {
+    return null;
+  }
+  const isHttp = /^https?:$/i.test(resolved.protocol);
+  return isHttp ? resolved.href : null;
+}
+/**
+ * Parse a date written like "Jan 5, 2024" or "January 5, 2024" into a Date
+ * at 00:00:00 UTC of that day.
+ *
+ * Only the first three letters of the month word are used for the lookup in
+ * MONTH_INDEX. Dates that do not exist on the calendar (e.g. "Feb 31, 2024",
+ * day 0) are rejected instead of being rolled over into a neighbouring month.
+ *
+ * @param {string | null | undefined} rawText - Text of the date element.
+ * @returns {Date | undefined} The parsed date, or undefined when the text
+ *   does not match the expected shape or names an impossible date.
+ */
+function parsePubDate(rawText) {
+  const text = normalizeText(rawText);
+  const m = text.match(/^([A-Za-z]{3,9})\s+(\d{1,2}),\s*(\d{4})$/);
+  if (!m) return undefined;
+  const month = MONTH_INDEX[m[1].slice(0, 3).toLowerCase()];
+  if (month == null) return undefined;
+  const day = Number(m[2]);
+  const year = Number(m[3]);
+  if (!Number.isInteger(day) || !Number.isInteger(year)) return undefined;
+  const date = new Date(Date.UTC(year, month, day, 0, 0, 0));
+  // Date.UTC normalizes out-of-range components silently (day 31 in a 30-day
+  // month, day 0, years 0-99 mapped to 19xx), so accept the result only when
+  // it round-trips to the exact components that were parsed.
+  const roundTrips =
+    date.getUTCFullYear() === year && date.getUTCMonth() === month && date.getUTCDate() === day;
+  return roundTrips ? date : undefined;
+}
+/**
+ * Tell whether an absolute URL points at a single blog post, i.e. its path is
+ * exactly `/blog/<slug>` with an optional trailing slash (case-insensitive).
+ *
+ * @param {string} link - Absolute URL to inspect.
+ * @returns {boolean} False for non-matching paths and for unparsable URLs.
+ */
+function looksLikeBlogLink(link) {
+  let pathname;
+  try {
+    ({ pathname } = new URL(link));
+  } catch {
+    return false;
+  }
+  return /^\/blog\/[^/]+\/?$/i.test(pathname);
+}
+/**
+ * Walk up from `node` and return the nearest ancestor (within `maxDepth`
+ * levels) that contains a `<time>` element.
+ *
+ * @param {object | null | undefined} node - Starting node; the search begins at its parent.
+ * @param {number} maxDepth - Maximum number of ancestors to examine.
+ * @returns {object | null} The matching ancestor; otherwise the direct parent
+ *   of `node`, or null when there is no parent.
+ */
+function findAncestor(node, maxDepth) {
+  const directParent = node?.parentNode ?? null;
+  let candidate = directParent;
+  let depth = 0;
+  while (depth < maxDepth && candidate) {
+    if (candidate.querySelector?.("time")) {
+      return candidate;
+    }
+    candidate = candidate.parentNode ?? null;
+    depth += 1;
+  }
+  return directParent;
+}
+/**
+ * Assemble a feed item from already-extracted fields.
+ *
+ * The GUID is the hash of the link. A missing pubDate defaults to the current
+ * time, and empty author/summary strings are emitted as undefined.
+ *
+ * @param {{ title: string, link: string, summary?: string, author?: string, pubDate?: Date }} fields
+ * @returns {{ guid: string, title: string, link: string, pubDate: Date, author: (string|undefined), summary: (string|undefined) }}
+ */
+function buildFeedItem({ title, link, summary, author, pubDate }) {
+  const item = { guid: hashGuid(link), title, link };
+  item.pubDate = pubDate ?? new Date();
+  item.author = author || undefined;
+  item.summary = summary || undefined;
+  return item;
+}
+/**
+ * Primary parser: build one feed item from every `div.blog-card` element.
+ *
+ * A card is skipped when its `h4 a[href]` anchor has no text, when the href
+ * does not resolve to an http(s) URL shaped like `/blog/<slug>`, or when the
+ * same link has already been emitted.
+ *
+ * @param {object} root - Parsed document exposing `querySelectorAll`.
+ * @param {string} baseUrl - Base used to resolve relative hrefs.
+ * @returns {object[]} Feed items in document order, de-duplicated by link.
+ */
+function parseFromCards(root, baseUrl) {
+  const collected = [];
+  const knownLinks = new Set();
+  const textOf = (scope, selector) => normalizeText(scope.querySelector(selector)?.textContent);
+  for (const card of root.querySelectorAll("div.blog-card")) {
+    const titleAnchor = card.querySelector("h4 a[href]");
+    const title = normalizeText(titleAnchor?.textContent);
+    const link = toAbsoluteUrl(titleAnchor?.getAttribute("href"), baseUrl);
+    const usable = Boolean(title) && Boolean(link) && looksLikeBlogLink(link) && !knownLinks.has(link);
+    if (!usable) {
+      continue;
+    }
+    knownLinks.add(link);
+    collected.push(
+      buildFeedItem({
+        title,
+        link,
+        summary: textOf(card, "p"),
+        author: textOf(card, 'b[rel="author"], address b'),
+        pubDate: parsePubDate(textOf(card, "time")),
+      }),
+    );
+  }
+  return collected;
+}
+/**
+ * Fallback parser: start from `h4 a[href*="/blog/"]` anchors and look for
+ * summary, author and date in a nearby ancestor.
+ *
+ * The ancestor is chosen by `findAncestor(anchor, 7)`: the closest one within
+ * seven levels that contains a `<time>` element, else the anchor's parent.
+ *
+ * @param {object} root - Parsed document exposing `querySelectorAll`.
+ * @param {string} baseUrl - Base used to resolve relative hrefs.
+ * @returns {object[]} Feed items in document order, de-duplicated by link.
+ */
+function parseFromHeadingFallback(root, baseUrl) {
+  const collected = [];
+  const knownLinks = new Set();
+  const textOf = (scope, selector) => normalizeText(scope?.querySelector(selector)?.textContent);
+  for (const anchor of root.querySelectorAll('h4 a[href*="/blog/"]')) {
+    const title = normalizeText(anchor.textContent);
+    const link = toAbsoluteUrl(anchor.getAttribute("href"), baseUrl);
+    const usable = Boolean(title) && Boolean(link) && looksLikeBlogLink(link) && !knownLinks.has(link);
+    if (!usable) {
+      continue;
+    }
+    const container = findAncestor(anchor, 7);
+    knownLinks.add(link);
+    collected.push(
+      buildFeedItem({
+        title,
+        link,
+        summary: textOf(container, "p"),
+        author: textOf(container, 'b[rel="author"], address b'),
+        pubDate: parsePubDate(textOf(container, "time")),
+      }),
+    );
+  }
+  return collected;
+}
+/**
+ * Plugin entry point: fetch the blog listing and turn it into feed items.
+ *
+ * Stores `ctx.deps` in the module-level `_deps`, fetches the page through
+ * `ctx.fetchHtml` (waitMs 3500), then tries the card parser and, only if it
+ * finds nothing, the heading-based fallback.
+ *
+ * @param {string} sourceId - Passed through to `ctx.fetchHtml`.
+ * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
+ * @returns {Promise<object[]>} A non-empty list of feed items.
+ * @throws {Error} When neither parser yields any item (the page structure may have changed).
+ */
+async function fetchItems(sourceId, ctx) {
+  _deps = ctx.deps;
+  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
+  const root = _deps.parseHtml(html);
+  const baseUrl = finalUrl || SUPERVISELY_ORIGIN;
+  let items = parseFromCards(root, baseUrl);
+  if (items.length === 0) {
+    items = parseFromHeadingFallback(root, baseUrl);
+  }
+  if (items.length > 0) {
+    return items;
+  }
+  throw new Error("[supervisely-blog] 未解析到文章条目，页面结构可能已变化");
+}
+export default { // plugin descriptor for the Supervisely blog
+  id: "supervisely-blog",
+  listUrlPattern: /^https?:\/\/(www\.)?supervisely\.com\/blog\/?(?:\?.*)?$/i, // blog index only: optional "www.", optional trailing slash, optional query string
+  fetchItems,
+};

package/app/plugins/builtin/uci-ml-repository.rssany.js ADDED Viewed

@@ -0,0 +1,111 @@
+let _deps; // assigned from ctx.deps at the start of every fetchItems call
+const UCI_ORIGIN = "https://archive.ics.uci.edu"; // last-resort base URL when neither finalUrl nor sourceId is available
+/**
+ * Collapse whitespace runs to single spaces and trim the result.
+ *
+ * @param {string | null | undefined} text - Raw text.
+ * @returns {string} Normalized text; "" for null or undefined.
+ */
+function normalizeText(text) {
+  if (text == null) {
+    return "";
+  }
+  return text.replace(/\s+/g, " ").trim();
+}
+/**
+ * Hash `input` with the injected `_deps.createHash("sha256")` and return the
+ * hex digest; used as the item GUID.
+ *
+ * @param {string} input - Value to hash (callers pass the item link).
+ * @returns {string} Hex-encoded digest.
+ */
+function hashGuid(input) {
+  const sha256 = _deps.createHash("sha256");
+  return sha256.update(input).digest("hex");
+}
+/**
+ * Resolve an href to a canonical dataset URL on archive.ics.uci.edu.
+ *
+ * Accepts only http(s) URLs on that exact host whose path is
+ * `/dataset/<digits>/<slug>` (no trailing slash). Query string and fragment
+ * are removed from the returned URL.
+ *
+ * @param {string | null | undefined} rawHref - Attribute value from an anchor.
+ * @param {string} baseUrl - Base used to resolve relative hrefs.
+ * @returns {string | null} Canonical URL, or null when the href is empty,
+ *   a fragment or `javascript:` link, unparsable, or not a dataset page.
+ */
+function resolveDatasetLink(rawHref, baseUrl) {
+  const href = normalizeText(rawHref);
+  const isUnusable = !href || href.startsWith("#") || href.startsWith("javascript:");
+  if (isUnusable) {
+    return null;
+  }
+  let url;
+  try {
+    url = new URL(href, baseUrl);
+  } catch {
+    return null;
+  }
+  const isDatasetPage =
+    /^https?:$/i.test(url.protocol) &&
+    url.hostname === "archive.ics.uci.edu" &&
+    /^\/dataset\/\d+\/[^/?#]+$/i.test(url.pathname);
+  if (!isDatasetPage) {
+    return null;
+  }
+  url.search = "";
+  url.hash = "";
+  return url.href;
+}
+/**
+ * Find summary text near `node`: starting at `node` itself and climbing up to
+ * six levels, return the normalized text of the first `<p>` found by
+ * `querySelector` whose text is non-empty.
+ *
+ * @param {object | null | undefined} node - Node to start from.
+ * @returns {string} The summary, or "" when none is found.
+ */
+function pickSummaryNearNode(node) {
+  let scope = node;
+  let hops = 0;
+  while (hops < 6 && scope) {
+    const text = normalizeText(scope.querySelector?.("p")?.textContent);
+    if (text) {
+      return text;
+    }
+    scope = scope.parentNode ?? null;
+    hops += 1;
+  }
+  return "";
+}
+/**
+ * Assemble a feed item for one dataset.
+ *
+ * The listing carries no dates here, so pubDate is synthesized as "now minus
+ * `index` seconds", which gives each item a distinct timestamp that decreases
+ * with its position in the list.
+ *
+ * @param {{ title: string, link: string, summary?: string, index: number }} fields
+ * @returns {{ guid: string, title: string, link: string, pubDate: Date, summary: (string|undefined), sourceId: string }}
+ */
+function buildItem({ title, link, summary, index }) {
+  const guid = hashGuid(link);
+  const offsetMs = index * 1000;
+  const pubDate = new Date(Date.now() - offsetMs);
+  return {
+    guid,
+    title,
+    link,
+    pubDate,
+    summary: summary || undefined,
+    sourceId: "uci-ml-repository",
+  };
+}
+/**
+ * Primary parser: collect datasets from `h2 a[href^="/dataset/"]` anchors.
+ *
+ * Anchors whose href is not a valid dataset link, whose link was already
+ * emitted, or whose text is empty are skipped.
+ *
+ * @param {object} root - Parsed document exposing `querySelectorAll`.
+ * @param {string} baseUrl - Base used to resolve relative hrefs.
+ * @returns {object[]} Items in document order, de-duplicated by link.
+ */
+function parseFromHeadingAnchors(root, baseUrl) {
+  const results = [];
+  const emitted = new Set();
+  for (const anchor of root.querySelectorAll('h2 a[href^="/dataset/"]')) {
+    const link = resolveDatasetLink(anchor.getAttribute("href"), baseUrl);
+    if (link && !emitted.has(link)) {
+      const title = normalizeText(anchor.textContent);
+      if (title) {
+        const summary = pickSummaryNearNode(anchor.parentNode ?? anchor);
+        emitted.add(link);
+        results.push(buildItem({ title, link, summary, index: results.length }));
+      }
+    }
+  }
+  return results;
+}
+/**
+ * Fallback parser: collect datasets from any `a[href^="/dataset/"]` anchor.
+ *
+ * The title is the anchor text or, when that is empty, the `alt` text of an
+ * `<img>` inside the anchor. Anchors without a valid, unseen dataset link or
+ * without any title are skipped.
+ *
+ * @param {object} root - Parsed document exposing `querySelectorAll`.
+ * @param {string} baseUrl - Base used to resolve relative hrefs.
+ * @returns {object[]} Items in document order, de-duplicated by link.
+ */
+function parseFromGenericAnchors(root, baseUrl) {
+  const results = [];
+  const emitted = new Set();
+  for (const anchor of root.querySelectorAll('a[href^="/dataset/"]')) {
+    const link = resolveDatasetLink(anchor.getAttribute("href"), baseUrl);
+    if (link && !emitted.has(link)) {
+      const title =
+        normalizeText(anchor.textContent) ||
+        normalizeText(anchor.querySelector("img")?.getAttribute("alt"));
+      if (title) {
+        const summary = pickSummaryNearNode(anchor.parentNode ?? anchor);
+        emitted.add(link);
+        results.push(buildItem({ title, link, summary, index: results.length }));
+      }
+    }
+  }
+  return results;
+}
+/**
+ * Plugin entry point: fetch the dataset listing and turn it into feed items.
+ *
+ * Stores `ctx.deps` in the module-level `_deps`, fetches the page through
+ * `ctx.fetchHtml` (waitMs 4000), then runs the heading-based parser and, only
+ * if it finds nothing, the generic anchor parser.
+ *
+ * @param {string} sourceId - Passed to `ctx.fetchHtml`; also the second-choice base URL.
+ * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
+ * @returns {Promise<object[]>} A non-empty list of feed items.
+ * @throws {Error} When no dataset entry can be parsed (the page structure may have changed).
+ */
+async function fetchItems(sourceId, ctx) {
+  _deps = ctx.deps;
+  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 4000 });
+  const baseUrl = finalUrl || sourceId || UCI_ORIGIN;
+  const root = _deps.parseHtml(html);
+  for (const parse of [parseFromHeadingAnchors, parseFromGenericAnchors]) {
+    const items = parse(root, baseUrl);
+    if (items.length > 0) {
+      return items;
+    }
+  }
+  throw new Error("[uci-ml-repository] 未解析到数据集条目，页面结构可能已变化");
+}
+export default { // plugin descriptor for the UCI Machine Learning Repository
+  id: "uci-ml-repository",
+  listUrlPattern: /^https?:\/\/archive\.ics\.uci\.edu(?:\/(?:datasets\/?)?)?(?:\?.*)?$/i, // site root or "/datasets", each with optional trailing slash and query string
+  fetchItems,
+};

package/app/plugins/builtin/venturebeat.rssany.js ADDED Viewed

@@ -0,0 +1,97 @@
+let _deps; // assigned from the context's deps at the start of every fetchItems call
+// VentureBeat plugin: pulls the listing from the official RSS feed, which avoids the security-check page served on the home page
+/**
+ * Collapse whitespace runs to single spaces and trim the result.
+ *
+ * @param {string | null | undefined} text - Raw text.
+ * @returns {string} Normalized text; "" for null or undefined.
+ */
+function normalizeText(text) {
+  const source = text ?? "";
+  const singleSpaced = source.replace(/\s+/g, " ");
+  return singleSpaced.trim();
+}
+/**
+ * Remove anything that looks like an HTML tag (`<...>`), replacing each with
+ * a space, then normalize whitespace.
+ *
+ * @param {string | null | undefined} text - Text that may contain markup.
+ * @returns {string} Plain text; "" for null or undefined.
+ */
+function stripHtml(text) {
+  const withoutTags = (text ?? "").replace(/<[^>]*>/g, " ");
+  return normalizeText(withoutTags);
+}
+/**
+ * Convert a raw date value into a Date, falling back to the current time.
+ *
+ * @param {string | number | Date | null | undefined} raw - Value accepted by `new Date(...)`.
+ * @returns {Date} The parsed date, or `new Date()` when `raw` is falsy or unparsable.
+ */
+function toValidDate(raw) {
+  const parsed = raw ? new Date(raw) : null;
+  if (parsed === null || Number.isNaN(parsed.getTime())) {
+    return new Date();
+  }
+  return parsed;
+}
+/**
+ * Derive the RSS feed URL for a listing URL.
+ *
+ * If the path (ignoring trailing slashes) already ends in `/feed`, the URL is
+ * returned as parsed. Otherwise `/feed/` is appended to the path and the query
+ * string and fragment are dropped.
+ *
+ * @param {string} sourceId - Absolute listing URL.
+ * @returns {string} The feed URL.
+ * @throws {TypeError} When `sourceId` is not a valid absolute URL.
+ */
+function toFeedUrl(sourceId) {
+  const url = new URL(sourceId);
+  const trimmedPath = url.pathname.replace(/\/+$/, "");
+  if (!trimmedPath.endsWith("/feed")) {
+    // An empty trimmed path (site root) naturally yields "/feed/".
+    url.pathname = `${trimmedPath}/feed/`;
+    url.search = "";
+    url.hash = "";
+  }
+  return url.href;
+}
+/**
+ * Convert one parsed RSS entry into a feed item.
+ *
+ * Field sources: date from `isoDate` then `pubDate`; summary from
+ * `contentSnippet`, else tag-stripped `summary` or `content`; author from
+ * `creator` then `author`. An empty title becomes the placeholder "(无标题)"
+ * ("untitled").
+ *
+ * @param {object} item - Entry as produced by the RSS parser.
+ * @returns {object | null} The feed item, or null when the entry has no http(s) link.
+ */
+function mapFeedItem(item) {
+  const link = normalizeText(item.link ?? "");
+  const hasHttpLink = /^https?:\/\//i.test(link);
+  if (!hasHttpLink) {
+    return null;
+  }
+  const title = normalizeText(item.title ?? "");
+  const pubDate = toValidDate(item.isoDate ?? item.pubDate);
+  const snippet = normalizeText(item.contentSnippet ?? "");
+  const summary = snippet || stripHtml(item.summary ?? item.content ?? "");
+  const authorName = normalizeText(item.creator ?? item.author ?? "");
+  return {
+    guid: _deps.createHash("sha256").update(link).digest("hex"),
+    title: title || "(无标题)",
+    link,
+    pubDate,
+    author: authorName || undefined,
+    summary: summary || undefined,
+  };
+}
+/**
+ * Plugin entry point: download the RSS feed for `sourceId` and map its entries
+ * to feed items.
+ *
+ * Stores `_ctx.deps` in the module-level `_deps`, builds an RSS parser from
+ * `_deps.RssParser` (15 s timeout, custom User-Agent and Accept headers),
+ * fetches the feed URL derived by `toFeedUrl`, then keeps the first entry for
+ * each distinct link.
+ *
+ * @param {string} sourceId - Absolute listing URL; converted to its feed URL.
+ * @param {object} _ctx - Host context providing `deps`.
+ * @returns {Promise<object[]>} A non-empty list of feed items.
+ * @throws {Error} When the feed cannot be fetched or parsed (the underlying
+ *   error is attached as `cause`), or when it contains no usable entry.
+ */
+async function fetchItems(sourceId, _ctx) {
+  _deps = _ctx.deps;
+  const parser = new _deps.RssParser({
+    timeout: 15_000,
+    headers: {
+      "User-Agent": "RssAny/1.0 (+https://github.com/rssany/rssany)",
+      Accept: "application/rss+xml,application/atom+xml,application/xml,text/xml,*/*",
+    },
+  });
+  const feedUrl = toFeedUrl(sourceId);
+  let feed;
+  try {
+    feed = await parser.parseURL(feedUrl);
+  } catch (err) {
+    const msg = err instanceof Error ? err.message : String(err);
+    // Keep the original error (and its stack) reachable for debugging.
+    throw new Error(`[venturebeat] 抓取 feed 失败: ${feedUrl} (${msg})`, { cause: err });
+  }
+  const seen = new Set();
+  const items = [];
+  for (const item of feed.items ?? []) {
+    const mapped = mapFeedItem(item);
+    if (!mapped) continue;
+    if (seen.has(mapped.link)) continue;
+    seen.add(mapped.link);
+    items.push(mapped);
+  }
+  if (items.length === 0) {
+    throw new Error(`[venturebeat] 未解析到条目: ${feedUrl}`);
+  }
+  return items;
+}
+export default { // plugin descriptor for VentureBeat
+  id: "venturebeat",
+  listUrlPattern: /^https?:\/\/(www\.)?venturebeat\.com\/?(\?.*)?$/i, // home page only: optional "www.", optional trailing slash, optional query string
+  refreshInterval: "1h", // NOTE(review): presumably read by the plugin host as the polling interval — confirm
+  fetchItems,
+};

package/app/plugins/builtin/worldlabs.rssany.js ADDED Viewed

@@ -0,0 +1,129 @@
+let _deps; // assigned from ctx.deps at the start of every fetchItems call
+// World Labs blog plugin: scrapes the Research & Insights listing page and outputs FeedItems (no enrichment)
+const MONTH_NAME =
+  "January|February|March|April|May|June|July|August|September|October|November|December"; // regex alternation of full English month names
+const DATE_RE = new RegExp(`\\b(${MONTH_NAME})\\s+\\d{1,2},\\s+\\d{4}\\b`, "i"); // finds "<Month> <d>, <yyyy>" anywhere in a string, case-insensitively
+const MONTH_INDEX = { // lowercase full month name -> zero-based month index
+  january: 0,
+  february: 1,
+  march: 2,
+  april: 3,
+  may: 4,
+  june: 5,
+  july: 6,
+  august: 7,
+  september: 8,
+  october: 9,
+  november: 10,
+  december: 11,
+};
+/**
+ * Collapse whitespace runs to single spaces and trim the result.
+ *
+ * @param {string | null | undefined} text - Raw text.
+ * @returns {string} Normalized text; "" for null or undefined.
+ */
+function normalizeText(text) {
+  if (text == null) {
+    return "";
+  }
+  const singleSpaced = text.replace(/\s+/g, " ");
+  return singleSpaced.trim();
+}
+/**
+ * Hash `input` with the injected `_deps.createHash("sha256")` and return the
+ * hex digest; used as the item GUID.
+ *
+ * @param {string} input - Value to hash (callers pass the item link).
+ * @returns {string} Hex-encoded digest.
+ */
+function hashGuid(input) {
+  const sha256 = _deps.createHash("sha256");
+  return sha256.update(input).digest("hex");
+}
+/**
+ * Resolve an href against `baseUrl`, accepting only http(s) results.
+ *
+ * @param {string | null | undefined} rawHref - Attribute value from an anchor.
+ * @param {string} baseUrl - Base used to resolve relative hrefs.
+ * @returns {string | null} Absolute URL, or null when the href is empty or
+ *   blank, a fragment or `javascript:` link, unparsable, or not http(s).
+ */
+function toAbsoluteHttpUrl(rawHref, baseUrl) {
+  if (!rawHref) {
+    return null;
+  }
+  const href = rawHref.trim();
+  const isUnusable = href === "" || href.startsWith("#") || href.startsWith("javascript:");
+  if (isUnusable) {
+    return null;
+  }
+  let resolved;
+  try {
+    resolved = new URL(href, baseUrl);
+  } catch {
+    return null;
+  }
+  return /^https?:$/i.test(resolved.protocol) ? resolved.href : null;
+}
+/**
+ * Split a card's meta line such as "March 3, 2025 | Jane Doe" into a
+ * publication date and an author.
+ *
+ * The date is the first "<Month> <d>, <yyyy>" match of DATE_RE; whatever
+ * follows it, minus leading separators and spaces, is the author. Dates that
+ * do not exist on the calendar (e.g. "February 31, 2025") are not rolled over
+ * into the next month; the current time is used instead.
+ *
+ * @param {string | null | undefined} metaText - Text of the meta paragraph.
+ * @returns {{ pubDate: Date, author: (string|undefined) }} `pubDate` is the
+ *   current time when no valid date is found; `author` is undefined when no
+ *   date matches or nothing follows it.
+ */
+function parseDateAndAuthor(metaText) {
+  const text = normalizeText(metaText);
+  const m = text.match(DATE_RE);
+  if (!m) return { pubDate: new Date(), author: undefined };
+  const dateText = m[0];
+  const parts = dateText.match(/^(?<month>[A-Za-z]+)\s+(?<day>\d{1,2}),\s*(?<year>\d{4})$/);
+  let date = new Date();
+  if (parts?.groups) {
+    const month = MONTH_INDEX[parts.groups.month.toLowerCase()];
+    const day = Number(parts.groups.day);
+    const year = Number(parts.groups.year);
+    if (month != null && Number.isFinite(day) && Number.isFinite(year)) {
+      // Use noon UTC so a date-only value does not shift by a day across time zones.
+      const candidate = new Date(Date.UTC(year, month, day, 12, 0, 0));
+      // Date.UTC normalizes out-of-range components silently; accept the
+      // result only when it round-trips to the parsed components.
+      const roundTrips =
+        candidate.getUTCFullYear() === year &&
+        candidate.getUTCMonth() === month &&
+        candidate.getUTCDate() === day;
+      if (roundTrips) date = candidate;
+    }
+  }
+  // Strip separators together with the spaces between them, so "| • Jane"
+  // yields "Jane" rather than "• Jane".
+  const authorText = normalizeText(text.slice(m.index + dateText.length))
+    .replace(/^[\s|/\-•·,:]+/, "")
+    .trim();
+  return {
+    pubDate: date,
+    author: authorText || undefined,
+  };
+}
+/**
+ * Try to read one blog card from an anchor element.
+ *
+ * The anchor counts as a card when it contains an `h2`/`h3` with text and its
+ * href resolves to an http(s) URL. Among its `<p>` texts, the first one
+ * containing a date (or else the first paragraph) is the meta line; the first
+ * other paragraph without a date is the summary.
+ *
+ * @param {object} anchor - Anchor node exposing `querySelector`,
+ *   `querySelectorAll` (must return an array) and `getAttribute`.
+ * @param {string} finalUrl - Base used to resolve the anchor's href.
+ * @returns {object | null} The feed item, or null when the anchor is not a card.
+ */
+function parseCard(anchor, finalUrl) {
+  const heading = anchor.querySelector("h2, h3");
+  const title = normalizeText(heading?.textContent);
+  if (!title) {
+    return null;
+  }
+  const link = toAbsoluteHttpUrl(anchor.getAttribute("href"), finalUrl);
+  if (!link) {
+    return null;
+  }
+  const paragraphTexts = anchor
+    .querySelectorAll("p")
+    .map((p) => normalizeText(p.textContent))
+    .filter(Boolean);
+  const datedText = paragraphTexts.find((t) => DATE_RE.test(t));
+  const metaText = datedText ?? paragraphTexts[0] ?? "";
+  const meta = parseDateAndAuthor(metaText);
+  const summary = paragraphTexts.find((t) => t !== metaText && !DATE_RE.test(t));
+  return {
+    guid: hashGuid(link),
+    title,
+    link,
+    pubDate: meta.pubDate,
+    author: meta.author,
+    summary: summary || undefined,
+  };
+}
+/**
+ * Plugin entry point: fetch the blog listing and turn its cards into feed items.
+ *
+ * Stores `ctx.deps` in the module-level `_deps`, fetches the page through
+ * `ctx.fetchHtml` (waitMs 3500), then runs `parseCard` on every `a[href]` and
+ * keeps the first item for each distinct link.
+ *
+ * @param {string} sourceId - Passed to `ctx.fetchHtml`; also the fallback base URL.
+ * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
+ * @returns {Promise<object[]>} A non-empty list of feed items.
+ * @throws {Error} When no card can be parsed (the page structure may have changed).
+ */
+async function fetchItems(sourceId, ctx) {
+  _deps = ctx.deps;
+  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
+  const root = _deps.parseHtml(html);
+  // Without a base, every relative href fails to resolve and all cards would
+  // be dropped, so fall back to the requested URL when finalUrl is missing.
+  const baseUrl = finalUrl || sourceId;
+  const seen = new Set();
+  const items = [];
+  const anchors = root.querySelectorAll("a[href]");
+  for (const anchor of anchors) {
+    const item = parseCard(anchor, baseUrl);
+    if (!item) continue;
+    if (seen.has(item.link)) continue;
+    seen.add(item.link);
+    items.push(item);
+  }
+  if (items.length === 0) {
+    throw new Error("[worldlabs] 未解析到条目，页面结构可能已变化");
+  }
+  return items;
+}
+export default { // plugin descriptor for the World Labs blog
+  id: "worldlabs",
+  listUrlPattern: /^https?:\/\/(www\.)?worldlabs\.ai\/blog(\?.*)?$/i, // "/blog" with optional query string; NOTE(review): a trailing slash ("/blog/") does not match — confirm intended
+  fetchItems,
+};

package/app/plugins/builtin/x.rssany.js ADDED Viewed

@@ -0,0 +1,159 @@
+let _deps; // assigned from ctx.deps at the start of every fetchItems call
+// X (Twitter) site plugin: fetches and parses the post list on a user's profile page
+const X_ORIGIN = "https://x.com"; // origin used when the fetched page's final URL cannot be parsed
+/**
+ * Return the origin (scheme + host + port) of `url`.
+ *
+ * @param {string | null | undefined} url - Absolute URL.
+ * @returns {string} The URL's origin, or X_ORIGIN when `url` cannot be parsed.
+ */
+function getOrigin(url) {
+  let parsed;
+  try {
+    parsed = new URL(url);
+  } catch {
+    return X_ORIGIN;
+  }
+  return parsed.origin;
+}
+/**
+ * Collapse whitespace runs to single spaces and trim the result.
+ *
+ * @param {string | null | undefined} text - Raw text.
+ * @returns {string} Normalized text; "" for null or undefined.
+ */
+function normalizeText(text) {
+  const source = text ?? "";
+  return source.replace(/\s+/g, " ").trim();
+}
+/**
+ * Extract the canonical `/<user>/status/<id>` path from a link.
+ *
+ * Hrefs starting with "http" are parsed as URLs; anything else is treated as
+ * a path with its query string cut off. Extra path segments after the id
+ * (e.g. `/photo/1`) are ignored.
+ *
+ * @param {string | null | undefined} href - Absolute or relative link.
+ * @returns {string | null} The canonical status path, or null when the href
+ *   is empty, unparsable, or not a status link.
+ */
+function statusPathFromHref(href) {
+  if (!href) {
+    return null;
+  }
+  let pathname;
+  try {
+    if (href.startsWith("http")) {
+      pathname = new URL(href).pathname;
+    } else {
+      [pathname] = href.split("?");
+    }
+  } catch {
+    return null;
+  }
+  const found = /^\/([A-Za-z0-9_]{1,32})\/status\/(\d+)/.exec(pathname);
+  if (found === null) {
+    return null;
+  }
+  const [, user, id] = found;
+  return `/${user}/status/${id}`;
+}
+/**
+ * Determine the handle of a post's author.
+ *
+ * Sources, in order: a profile link (`/<handle>`) inside the
+ * `[data-testid="User-Name"]` block, an `@handle` mention in that block's
+ * text, and finally the user segment of `statusPath`.
+ *
+ * @param {object} article - Post element exposing `querySelector`.
+ * @param {string | null | undefined} statusPath - Canonical `/<user>/status/<id>` path.
+ * @returns {string | undefined} The handle without "@", or undefined when none is found.
+ */
+function extractAuthor(article, statusPath) {
+  const nameBlock = article.querySelector('[data-testid="User-Name"]');
+  if (nameBlock) {
+    const profileHref = [...nameBlock.querySelectorAll('a[href^="/"]')]
+      .map((a) => a.getAttribute("href") || "")
+      .find((href) => /^\/[A-Za-z0-9_]{1,32}$/.test(href));
+    if (profileHref !== undefined) {
+      return profileHref.slice(1);
+    }
+    const mention = normalizeText(nameBlock.textContent).match(/@([A-Za-z0-9_]{1,32})/);
+    if (mention) {
+      return mention[1];
+    }
+  }
+  const fromPath = statusPath ? statusPath.match(/^\/([A-Za-z0-9_]{1,32})\/status\/\d+$/) : null;
+  return fromPath ? fromPath[1] : undefined;
+}
+/**
+ * Read the visible text of a post.
+ *
+ * Text comes from `[data-testid="tweetText"]`, else the first `[lang]`
+ * element. When a "show more" link is present, " ..." is appended to the
+ * text; if there is no text at all, a fixed notice is returned instead
+ * ("the post is long, open the original to read it").
+ *
+ * @param {object} article - Post element exposing `querySelector`.
+ * @returns {string} The text, possibly marked as truncated; "" when there is
+ *   neither text nor a "show more" link.
+ */
+function extractTweetText(article) {
+  const textNode = article.querySelector('[data-testid="tweetText"]') ?? article.querySelector('[lang]');
+  const text = normalizeText(textNode?.textContent);
+  const isTruncated = Boolean(article.querySelector('[data-testid="tweet-text-show-more-link"]'));
+  if (isTruncated) {
+    return text ? `${text} ...` : "推文内容较长，请打开原文查看";
+  }
+  return text;
+}
+/**
+ * Extract posts from the rendered timeline markup.
+ *
+ * Each `article[data-testid="tweet"]` / `article[role="article"]` is
+ * identified by the first link inside it that yields a status path. Articles
+ * without such a link, or repeating an already seen status path, are skipped.
+ *
+ * @param {object} root - Parsed document exposing `querySelectorAll`.
+ * @param {string} origin - Origin used to build absolute post links.
+ * @returns {Array<{ link: string, text: string, author: (string|undefined), pubDate: (string|undefined) }>}
+ *   Entries in document order; `pubDate` is the raw `datetime` attribute value.
+ */
+function parseArticles(root, origin) {
+  const entries = [];
+  const seenPaths = new Set();
+  for (const article of root.querySelectorAll('article[data-testid="tweet"], article[role="article"]')) {
+    let statusPath = null;
+    for (const a of article.querySelectorAll('a[href*="/status/"]')) {
+      statusPath = statusPathFromHref(a.getAttribute("href"));
+      if (statusPath) {
+        break;
+      }
+    }
+    if (statusPath && !seenPaths.has(statusPath)) {
+      seenPaths.add(statusPath);
+      entries.push({
+        link: new URL(statusPath, origin).href,
+        text: extractTweetText(article),
+        author: extractAuthor(article, statusPath),
+        pubDate: article.querySelector("time[datetime]")?.getAttribute("datetime") || undefined,
+      });
+    }
+  }
+  return entries;
+}
+/**
+ * Fallback extraction: scan a JSON payload for `/<user>/status/<id>` paths.
+ *
+ * The data is serialized back to a string and searched with a regex, so any
+ * occurrence of such a path anywhere in the payload counts. Entries carry no
+ * text or date; the author is the user segment of the path.
+ *
+ * @param {unknown} data - Parsed JSON value.
+ * @param {string} origin - Origin used to build absolute post links.
+ * @returns {Array<{ link: string, text: string, author: string, pubDate: undefined }>}
+ *   One entry per distinct status path, in order of first appearance; empty
+ *   when `data` is not a non-null object.
+ */
+function extractEntriesFromJson(data, origin) {
+  if (data == null || typeof data !== "object") {
+    return [];
+  }
+  const serialized = JSON.stringify(data);
+  const seenPaths = new Set();
+  const entries = [];
+  for (const [, user, id] of serialized.matchAll(/\/([A-Za-z0-9_]{1,32})\/status\/(\d+)/g)) {
+    const statusPath = `/${user}/status/${id}`;
+    if (!seenPaths.has(statusPath)) {
+      seenPaths.add(statusPath);
+      entries.push({ link: new URL(statusPath, origin).href, text: "", author: user, pubDate: undefined });
+    }
+  }
+  return entries;
+}
+/**
+ * Convert extracted post entries into feed items.
+ *
+ * The GUID is the SHA-256 hex digest of the link; the post text serves as
+ * both title and summary (undefined when empty). A missing or unparsable
+ * `pubDate` falls back to the current time, so an item never carries an
+ * Invalid Date.
+ *
+ * @param {Array<{ link: string, text: string, author: (string|undefined), pubDate: (string|undefined) }>} entries
+ * @returns {object[]} Feed items in the same order as `entries`.
+ */
+function entriesToFeedItems(entries) {
+  return entries.map(({ link, text, author, pubDate }) => {
+    const parsed = pubDate ? new Date(pubDate) : null;
+    // A malformed `datetime` attribute would otherwise produce an Invalid Date.
+    const hasValidDate = parsed !== null && !Number.isNaN(parsed.getTime());
+    return {
+      guid: _deps.createHash("sha256").update(link).digest("hex"),
+      title: text || undefined,
+      link,
+      pubDate: hasValidDate ? parsed : new Date(),
+      author,
+      summary: text || undefined,
+    };
+  });
+}
+/**
+ * Plugin entry point: fetch a profile page and turn its posts into feed items.
+ *
+ * Stores `ctx.deps` in the module-level `_deps` and fetches the page through
+ * `ctx.fetchHtml` (waitMs 6000). Posts are read from the rendered `<article>`
+ * elements; if there are none, each `script[type="application/json"]` block
+ * is scanned in turn and the first one yielding entries is used. Blocks with
+ * invalid JSON are skipped.
+ *
+ * @param {string} sourceId - Passed through to `ctx.fetchHtml`.
+ * @param {object} ctx - Host context providing `deps` and `fetchHtml`.
+ * @returns {Promise<object[]>} A non-empty list of feed items.
+ * @throws {Error} When no post is found; the message distinguishes an error
+ *   page ("something went wrong" / "try again" in the page text) from a page
+ *   that simply has no parsable posts.
+ */
+async function fetchItems(sourceId, ctx) {
+  _deps = ctx.deps;
+  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 6000 });
+  const root = _deps.parseHtml(html);
+  const origin = getOrigin(finalUrl);
+  const fromDom = parseArticles(root, origin);
+  if (fromDom.length > 0) {
+    return entriesToFeedItems(fromDom);
+  }
+  for (const script of root.querySelectorAll('script[type="application/json"]')) {
+    let fromJson = [];
+    try {
+      fromJson = extractEntriesFromJson(JSON.parse(script.textContent || ""), origin);
+    } catch {
+      // Skip blocks that cannot be parsed or scanned.
+    }
+    if (fromJson.length > 0) {
+      return entriesToFeedItems(fromJson);
+    }
+  }
+  const bodyText = normalizeText(root.textContent).toLowerCase();
+  const looksLikeErrorPage = ["something went wrong", "try again"].some((marker) => bodyText.includes(marker));
+  let message = "未解析到推文条目，可能被风控或需登录";
+  if (looksLikeErrorPage) {
+    message = "X 页面暂不可用（可能被风控或需登录），请稍后重试或切换为有头模式并确认登录态";
+  }
+  throw new Error(`[X] ${message}`);
+}
+export default { // plugin descriptor for X (Twitter)
+  id: "x",
+  listUrlPattern: "https://x.com/{username}", // string template rather than a RegExp; NOTE(review): how the "{username}" placeholder is matched is up to the plugin host — confirm
+  fetchItems,
+};