npm - rssany - Versions diffs - 0.1.4 → 0.1.6 - Mend

rssany 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78) hide show

package/app/plugins/builtin/paperswithcode.rssany.js ADDED Viewed

@@ -0,0 +1,227 @@
+let _deps; // Dependency bag; assigned from _ctx.deps at the start of every fetchItems call.
+const SITE_ID = "paperswithcode"; // Plugin id; also stamped on each item as sourceId and used as the error prefix.
+const API_ORIGIN = "https://paperswithcode.co"; // Base origin for API requests and for generated paper links.
+const DEFAULT_TRENDING_LIMIT = 30; // Fallback for the `limit` query parameter (trending mode).
+const DEFAULT_MAX_AGE_DAYS = 180; // Fallback for the `max_age_days` query parameter (trending mode).
+const DEFAULT_LATEST_PAGE_SIZE = 30; // Fallback for the `page_size` query parameter (latest mode).
+function normalizeText(text) {
+  return (text ?? "").replace(/\s+/g, " ").trim();
+}
+function hashGuid(input) {
+  return _deps.createHash("sha256").update(input).digest("hex");
+}
+function toValidDate(raw) {
+  const text = normalizeText(raw);
+  if (!text) return new Date();
+  const parsed = new Date(text);
+  return Number.isNaN(parsed.getTime()) ? new Date() : parsed;
+}
+function parsePositiveInt(raw, fallback, { min = 1, max = 200 } = {}) {
+  const n = Number.parseInt(normalizeText(raw), 10);
+  if (Number.isNaN(n)) return fallback;
+  return Math.min(max, Math.max(min, n));
+}
+function resolveMode(sourceUrl) {
+  const sort = normalizeText(sourceUrl.searchParams.get("sort")).toLowerCase();
+  if (["latest", "new", "date", "date_published"].includes(sort)) return "latest";
+  if (["trending", "hot", "popular"].includes(sort)) return "trending";
+  return "trending";
+}
+function toPaperLink(arxivId) {
+  const id = normalizeText(arxivId);
+  if (!id) return null;
+  return `${API_ORIGIN}/paper/${encodeURIComponent(id)}`;
+}
+/**
+ * Extract normalized task names from a list whose entries are either plain
+ * strings or objects carrying a `name` property.
+ *
+ * @param {unknown} tasks - Value of the row's `tasks` field.
+ * @returns {string[]|undefined} Non-empty list of names, or undefined when
+ *   `tasks` is not an array or contains no usable name.
+ */
+function joinCategories(tasks) {
+  if (!Array.isArray(tasks)) return undefined;
+  const names = [];
+  for (const entry of tasks) {
+    let name = "";
+    if (typeof entry === "string") {
+      name = normalizeText(entry);
+    } else if (entry && typeof entry === "object") {
+      name = normalizeText(entry.name);
+    }
+    if (name) names.push(name);
+  }
+  return names.length > 0 ? names : undefined;
+}
+function buildSummaryFromTrending(item) {
+  const repoName = normalizeText(item?.repository?.name);
+  const repoOwner = normalizeText(item?.repository?.owner);
+  const stars = Number(item?.repository?.num_stars ?? 0);
+  const tasks = joinCategories(item?.tasks);
+  const parts = [];
+  if (repoOwner && repoName) {
+    parts.push(`Repo: ${repoOwner}/${repoName}`);
+  }
+  if (stars > 0) {
+    parts.push(`Stars: ${stars}`);
+  }
+  if (tasks?.length) {
+    parts.push(`Tasks: ${tasks.join(", ")}`);
+  }
+  return parts.length ? parts.join(" | ") : undefined;
+}
+function mapTrendingItem(item) {
+  const title = normalizeText(item?.title);
+  const link = toPaperLink(item?.arxiv_id);
+  if (!title || !link) return null;
+  return {
+    guid: hashGuid(link),
+    title,
+    link,
+    pubDate: toValidDate(item?.date_published),
+    summary: buildSummaryFromTrending(item),
+    sourceId: SITE_ID,
+  };
+}
+function mapLatestItem(item) {
+  const title = normalizeText(item?.title);
+  const link = toPaperLink(item?.arxiv_id) || normalizeText(item?.url_abs);
+  if (!title || !link) return null;
+  const summary = normalizeText(item?.abstract);
+  const firstAuthor = Array.isArray(item?.authors) ? normalizeText(item.authors[0]) : "";
+  return {
+    guid: hashGuid(link),
+    title,
+    link,
+    pubDate: toValidDate(item?.published),
+    author: firstAuthor || undefined,
+    summary: summary || undefined,
+    sourceId: SITE_ID,
+  };
+}
+async function fetchJson(url) {
+  const res = await fetch(url, {
+    headers: {
+      Accept: "application/json",
+      "User-Agent":
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+    },
+  });
+  if (!res.ok) {
+    throw new Error(`HTTP ${res.status}`);
+  }
+  try {
+    return await res.json();
+  } catch {
+    throw new Error("接口返回非 JSON 数据");
+  }
+}
+function dedupeByLink(items) {
+  const seen = new Set();
+  const output = [];
+  for (const item of items) {
+    if (!item || !item.link) continue;
+    if (seen.has(item.link)) continue;
+    seen.add(item.link);
+    output.push(item);
+  }
+  return output;
+}
+async function fetchTrendingItems(sourceUrl) {
+  const limit = parsePositiveInt(sourceUrl.searchParams.get("limit"), DEFAULT_TRENDING_LIMIT, { max: 100 });
+  const maxAgeDays = parsePositiveInt(
+    sourceUrl.searchParams.get("max_age_days"),
+    DEFAULT_MAX_AGE_DAYS,
+    { max: 3650 }
+  );
+  const apiUrl = new URL("/api/v1/papers/trending", API_ORIGIN);
+  apiUrl.searchParams.set("limit", String(limit));
+  apiUrl.searchParams.set("max_age_days", String(maxAgeDays));
+  const payload = await fetchJson(apiUrl);
+  const rows = Array.isArray(payload) ? payload : [];
+  return dedupeByLink(rows.map(mapTrendingItem).filter(Boolean));
+}
+async function fetchLatestItems(sourceUrl) {
+  const pageSize = parsePositiveInt(sourceUrl.searchParams.get("page_size"), DEFAULT_LATEST_PAGE_SIZE, {
+    max: 100,
+  });
+  const page = parsePositiveInt(sourceUrl.searchParams.get("page"), 1, { max: 1000 });
+  const apiUrl = new URL("/api/v1/papers/", API_ORIGIN);
+  apiUrl.searchParams.set("page", String(page));
+  apiUrl.searchParams.set("page_size", String(pageSize));
+  apiUrl.searchParams.set("order_by", "date_published");
+  apiUrl.searchParams.set("order_dir", "desc");
+  apiUrl.searchParams.set("include_resources", "true");
+  const payload = await fetchJson(apiUrl);
+  const rows = Array.isArray(payload?.results) ? payload.results : [];
+  return dedupeByLink(rows.map(mapLatestItem).filter(Boolean));
+}
+async function fetchItems(sourceId, _ctx) {
+  _deps = _ctx.deps;
+  let sourceUrl;
+  try {
+    sourceUrl = new URL(sourceId);
+  } catch {
+    throw new Error(`[${SITE_ID}] 无效 URL: ${sourceId}`);
+  }
+  const mode = resolveMode(sourceUrl);
+  const errors = [];
+  const tryMode = async (m) => {
+    try {
+      return m === "latest" ? await fetchLatestItems(sourceUrl) : await fetchTrendingItems(sourceUrl);
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      errors.push(`${m}: ${msg}`);
+      return [];
+    }
+  };
+  const primary = await tryMode(mode);
+  if (primary.length > 0) return primary;
+  const fallbackMode = mode === "trending" ? "latest" : "trending";
+  const fallback = await tryMode(fallbackMode);
+  if (fallback.length > 0) return fallback;
+  throw new Error(`[${SITE_ID}] 未解析到条目（${errors.join(" | ")}）`);
+}
+export default { // Default export: plugin descriptor (id, URL matcher, fetch entry point).
+  id: SITE_ID,
+  listUrlPattern: /^https?:\/\/(www\.)?paperswithcode\.(co|com)(?:\/(?:papers)?\/?)?(?:\?.*)?$/i, // Site root or /papers on paperswithcode.co/.com, optional "www." and query string.
+  fetchItems,
+};

package/app/plugins/builtin/pjlab-adg-publications.rssany.js ADDED Viewed

@@ -0,0 +1,202 @@
+let _deps; // Dependency bag; assigned from ctx.deps at the start of every fetchItems call.
+const SITE_ID = "pjlab-adg-publications"; // Plugin id; also stamped on each item as sourceId.
+function normalizeText(text) {
+  return (text ?? "")
+    .replace(/\u00a0/g, " ")
+    .replace(/\s+/g, " ")
+    .trim();
+}
+function hashGuid(input) {
+  return _deps.createHash("sha256").update(input).digest("hex");
+}
+function parseYear(raw) {
+  const text = normalizeText(raw);
+  const match = text.match(/\b(19|20)\d{2}\b/);
+  if (!match) return undefined;
+  const year = Number(match[0]);
+  return Number.isFinite(year) ? year : undefined;
+}
+function toAbsoluteLink(rawHref, baseUrl) {
+  if (!rawHref) return null;
+  const href = rawHref.trim();
+  if (!href || href.startsWith("#") || href.startsWith("javascript:")) return null;
+  try {
+    const url = new URL(href, baseUrl);
+    if (!/^https?:$/i.test(url.protocol)) return null;
+    return url.href;
+  } catch {
+    return null;
+  }
+}
+function pickBestLink(detailNode, pageUrl, entryId) {
+  const linkNodes = detailNode.querySelectorAll(".links a[href], a[href]");
+  const candidates = [];
+  for (const node of linkNodes) {
+    const link = toAbsoluteLink(node.getAttribute("href"), pageUrl);
+    if (!link) continue;
+    const label = normalizeText(node.textContent).toLowerCase();
+    candidates.push({ link, label });
+  }
+  const preferHtml = candidates.find((x) => x.label === "html" || x.label === "arxiv" || x.label === "doi");
+  if (preferHtml) return preferHtml.link;
+  const preferPdf = candidates.find((x) => x.label === "pdf");
+  if (preferPdf) return preferPdf.link;
+  if (candidates.length > 0) return candidates[0].link;
+  if (entryId) return `${pageUrl}#${encodeURIComponent(entryId)}`;
+  return pageUrl;
+}
+function buildSummary(author, periodical) {
+  const chunks = [author, periodical].map((x) => normalizeText(x)).filter(Boolean);
+  if (chunks.length === 0) return undefined;
+  return chunks.join(" | ");
+}
+function elementChildren(node) {
+  return node.childNodes.filter((child) => child.tagName != null);
+}
+function directDivChildren(node) {
+  return elementChildren(node).filter((child) => child.tagName?.toLowerCase() === "div");
+}
+function extractTitle(detailNode) {
+  const fromClass = normalizeText(detailNode.querySelector(".title")?.textContent);
+  if (fromClass) return fromClass;
+  const divs = directDivChildren(detailNode);
+  for (const div of divs) {
+    if (div.querySelector("em")) continue;
+    if (div.querySelector("a[href]")) continue;
+    const text = normalizeText(div.textContent);
+    if (!text) continue;
+    if (text.length < 8) continue;
+    return text;
+  }
+  return "";
+}
+function extractAuthor(detailNode, title) {
+  const fromClass = normalizeText(detailNode.querySelector(".author")?.textContent);
+  if (fromClass) return fromClass;
+  const divs = directDivChildren(detailNode);
+  const textDivs = divs
+    .filter((div) => !div.querySelector("em"))
+    .filter((div) => !div.querySelector("a[href]"))
+    .map((div) => normalizeText(div.textContent))
+    .filter(Boolean);
+  const candidate = textDivs.find((text) => text !== title);
+  return candidate || undefined;
+}
+function extractPeriodical(detailNode) {
+  const fromClass = normalizeText(
+    (detailNode.querySelector(".periodical em") ?? detailNode.querySelector(".periodical"))?.textContent
+  );
+  if (fromClass) return fromClass;
+  const divs = directDivChildren(detailNode);
+  for (const div of divs) {
+    const emText = normalizeText(div.querySelector("em")?.textContent);
+    if (emText) return emText;
+  }
+  return "";
+}
+function parseOneEntry(liNode, currentYear, pageUrl) {
+  const detailNode = liNode.querySelector("div[id]") ?? liNode.querySelector("div");
+  if (!detailNode) return null;
+  const entryId = normalizeText(detailNode.getAttribute("id"));
+  const title = extractTitle(detailNode);
+  if (!title) return null;
+  const author = extractAuthor(detailNode, title);
+  const periodical = extractPeriodical(detailNode);
+  const fallbackYear = parseYear(`${periodical} ${detailNode.textContent}`);
+  const finalYear = currentYear ?? fallbackYear;
+  const pubDate = finalYear != null ? new Date(Date.UTC(finalYear, 0, 1, 0, 0, 0)) : new Date();
+  const badge = normalizeText((liNode.querySelector(".abbr .badge") ?? liNode.querySelector("abbr"))?.textContent) || undefined;
+  const link = pickBestLink(detailNode, pageUrl, entryId);
+  const guidSeed = entryId || link || `${title}|${author ?? ""}|${finalYear ?? ""}`;
+  return {
+    guid: hashGuid(guidSeed),
+    title,
+    link,
+    pubDate,
+    author,
+    summary: buildSummary(author, periodical),
+    sourceId: SITE_ID,
+  };
+}
+function parseItems(html, finalUrl) {
+  const root = _deps.parseHtml(html);
+  const container = root.querySelector("article") ?? root;
+  const items = [];
+  const seenGuid = new Set();
+  let currentYear;
+  const stream = container.querySelectorAll("h2, li");
+  for (const node of stream) {
+    const tag = node.tagName?.toLowerCase();
+    if (tag === "h2") {
+      const year = parseYear(node.textContent);
+      if (year != null) currentYear = year;
+      continue;
+    }
+    if (tag !== "li") continue;
+    const item = parseOneEntry(node, currentYear, finalUrl);
+    if (!item) continue;
+    if (seenGuid.has(item.guid)) continue;
+    seenGuid.add(item.guid);
+    items.push(item);
+  }
+  return items;
+}
+async function fetchItems(sourceId, ctx) {
+  _deps = ctx.deps;
+  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
+  const pageUrl = finalUrl || sourceId;
+  const items = parseItems(html, pageUrl);
+  if (items.length === 0) {
+    throw new Error("[pjlab-adg-publications] 未解析到论文条目，页面结构可能已变化");
+  }
+  return items;
+}
+export default { // Default export: plugin descriptor (id, URL matcher, fetch entry point).
+  id: SITE_ID,
+  listUrlPattern: /^https?:\/\/pjlab-adg\.github\.io\/publications\/?(\?.*)?$/i, // The /publications page on pjlab-adg.github.io, optional trailing slash and query string.
+  fetchItems,
+};

package/app/plugins/builtin/rss.rssany.js CHANGED Viewed

@@ -12,6 +12,16 @@ function trimUrl(s) {
   return t || undefined;
 }
+/** rss-parser often flattens several authors into one string separated by commas (ASCII or full-width); split it into an array for storage. */
+function authorsFromCommaText(authorRaw) {
+  if (typeof authorRaw !== "string") return undefined;
+  const names = [];
+  for (const chunk of authorRaw.split(/[,，]/)) {
+    const name = chunk.trim();
+    if (name) names.push(name);
+  }
+  return names.length > 0 ? names : undefined;
+}
 /** 从 rss-parser 条目上尽量取出配图 URL（入库用 imageUrl，与 Gateway 的 cover_img 对齐）。 */
 function extractItemImageUrl(item) {
   const enc = item.enclosure;
@@ -127,7 +137,7 @@ export default {
             : new Date();
       const authorRaw =
         typeof item.creator === "string" ? item.creator : typeof item.author === "string" ? item.author : undefined;
-      const author = authorRaw ? [authorRaw] : undefined;
+      const author = authorsFromCommaText(authorRaw);
       const summary =
         typeof item.summary === "string" ? item.summary : typeof item.contentSnippet === "string" ? item.contentSnippet : undefined;
       const content =

package/app/plugins/builtin/selectdataset.rssany.js ADDED Viewed

@@ -0,0 +1,206 @@
+let _deps; // Dependency bag; assigned from ctx.deps at the start of every fetchItems call.
+// SelectDataset plugin: parses the home/search page Nuxt payload and emits dataset items (no enrich step).
+const SELECT_DATASET_ORIGIN = "https://www.selectdataset.com"; // Base origin for dataset links and the default base for relative hrefs.
+function normalizeText(text) {
+  return (text ?? "").replace(/\s+/g, " ").trim();
+}
+function hashGuid(input) {
+  return _deps.createHash("sha256").update(input).digest("hex");
+}
+function parseDate(dateText) {
+  const text = normalizeText(dateText);
+  if (!text || text === "0") return new Date();
+  const m = text.match(
+    /^(\d{4})-(\d{1,2})-(\d{1,2})(?:[ T](\d{1,2}):(\d{1,2})(?::(\d{1,2}))?)?$/
+  );
+  if (!m) {
+    const fallback = new Date(text);
+    return Number.isNaN(fallback.getTime()) ? new Date() : fallback;
+  }
+  const [, y, mm, dd, hh = "0", mi = "0", ss = "0"] = m;
+  // 站点时间以中国时区为主，显式补 +08:00 避免环境时区影响排序。
+  const withTimezone = `${y}-${mm.padStart(2, "0")}-${dd.padStart(2, "0")}T` +
+    `${hh.padStart(2, "0")}:${mi.padStart(2, "0")}:${ss.padStart(2, "0")}+08:00`;
+  const parsed = new Date(withTimezone);
+  return Number.isNaN(parsed.getTime()) ? new Date() : parsed;
+}
+function toAbsoluteDatasetUrl(datasetId) {
+  const id = normalizeText(datasetId);
+  if (!id) return null;
+  return `${SELECT_DATASET_ORIGIN}/dataset/${id}`;
+}
+function dedupeItems(items) {
+  const seen = new Set();
+  const out = [];
+  for (const item of items) {
+    if (!item?.link || seen.has(item.link)) continue;
+    seen.add(item.link);
+    out.push(item);
+  }
+  return out;
+}
+function parseFromAnchorDom(html, finalUrl) {
+  const root = _deps.parseHtml(html);
+  const baseUrl = finalUrl || SELECT_DATASET_ORIGIN;
+  const items = [];
+  for (const anchor of root.querySelectorAll('a[href*="/dataset/"]')) {
+    const href = anchor.getAttribute("href");
+    if (!href) continue;
+    let link = null;
+    try {
+      const url = new URL(href, baseUrl);
+      if (!/^https?:$/i.test(url.protocol)) continue;
+      if (!/\/dataset\/[A-Za-z0-9]{16,}/.test(url.pathname)) continue;
+      link = url.href;
+    } catch {
+      continue;
+    }
+    const title = normalizeText(anchor.textContent);
+    if (!title) continue;
+    items.push({
+      guid: hashGuid(link),
+      title,
+      link,
+      pubDate: new Date(),
+      summary: undefined,
+    });
+  }
+  return dedupeItems(items);
+}
+function createNuxtResolver(table) {
+  const cache = new Map();
+  const inProgress = new Set();
+  function decodeRef(index) {
+    if (cache.has(index)) return cache.get(index);
+    if (inProgress.has(index)) return undefined;
+    inProgress.add(index);
+    const decoded = decodeValue(table[index]);
+    inProgress.delete(index);
+    cache.set(index, decoded);
+    return decoded;
+  }
+  function decodeValue(value) {
+    if (typeof value === "number") {
+      if (Number.isInteger(value) && value >= 0 && value < table.length) {
+        return decodeRef(value);
+      }
+      return value;
+    }
+    if (value == null || typeof value !== "object") return value;
+    if (Array.isArray(value)) {
+      if (value.length === 2 && (value[0] === "Reactive" || value[0] === "ShallowReactive")) {
+        return decodeValue(value[1]);
+      }
+      if (value.length === 2 && value[0] === "Set") {
+        const raw = decodeValue(value[1]);
+        return Array.isArray(raw) ? raw : [];
+      }
+      return value.map((x) => decodeValue(x));
+    }
+    const out = {};
+    for (const [key, v] of Object.entries(value)) {
+      out[key] = decodeValue(v);
+    }
+    return out;
+  }
+  return { decodeRef };
+}
+function toFeedItem(record) {
+  const title = normalizeText(record.dataset_name);
+  const link = toAbsoluteDatasetUrl(record.id);
+  if (!title || !link) return null;
+  const summary = normalizeText(record.dataset_desc);
+  const author = normalizeText(record.ext_host_name);
+  return {
+    guid: hashGuid(link),
+    title,
+    link,
+    pubDate: parseDate(record.date_index_update || record.date_dataset_update),
+    author: author || undefined,
+    summary: summary || undefined,
+  };
+}
+function parseFromNuxtPayload(html) {
+  const root = _deps.parseHtml(html);
+  const payload = root.querySelector("#__NUXT_DATA__")?.textContent;
+  if (!payload) return [];
+  let table;
+  try {
+    table = JSON.parse(payload);
+  } catch {
+    return [];
+  }
+  if (!Array.isArray(table)) return [];
+  const { decodeRef } = createNuxtResolver(table);
+  const items = [];
+  for (let i = 0; i < table.length; i += 1) {
+    const entry = table[i];
+    if (entry == null || typeof entry !== "object" || Array.isArray(entry)) continue;
+    if (!("dataset_name" in entry) || !("id" in entry)) continue;
+    const decoded = decodeRef(i);
+    if (!decoded || typeof decoded !== "object") continue;
+    const item = toFeedItem(decoded);
+    if (item) items.push(item);
+  }
+  const deduped = dedupeItems(items);
+  deduped.sort((a, b) => b.pubDate.getTime() - a.pubDate.getTime());
+  return deduped;
+}
+async function fetchItems(sourceId, ctx) {
+  _deps = ctx.deps;
+  const { html, finalUrl } = await ctx.fetchHtml(sourceId, { waitMs: 3500 });
+  const fromAnchorDom = parseFromAnchorDom(html, finalUrl);
+  if (fromAnchorDom.length > 0) return fromAnchorDom;
+  // 净化 HTML 下无稳定数据集链接时，回退到未净化 payload 取 dataset id 与更新时间。
+  const raw = await ctx.fetchHtml(sourceId, { waitMs: 3500, purify: false });
+  const fromPayload = parseFromNuxtPayload(raw.html);
+  if (fromPayload.length > 0) return fromPayload;
+  throw new Error("[selectdataset] 未解析到数据集条目，页面结构可能已变化");
+}
+export default { // Default export: plugin descriptor (id, URL matcher, fetch entry point).
+  id: "selectdataset",
+  listUrlPattern: /^https?:\/\/(www\.)?selectdataset\.com\/(?:$|\?.*|search(?:\?.*)?|subject(?:\?.*)?)$/i, // Site root, /search or /subject on selectdataset.com, optional "www." and query string.
+  fetchItems,
+};