npm - @ariesfish/feedloom - Versions diffs - 0.1.1 → 0.1.3 - Mend

@ariesfish/feedloom 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/README.md +2 -1
package/dist/cli.js +398 -56
package/dist/site-rules/wechat.toml +5 -0
package/dist/site-rules/xiaohongshu.toml +55 -0
package/dist/site-rules/youtube.toml +10 -0
package/dist/site-rules/zhihu.toml +22 -0
package/package.json +2 -2
package/skills/feedloom/SKILL.md +11 -5
package/skills/feedloom/references/site-rules.md +104 -0

package/README.md CHANGED Viewed

@@ -281,7 +281,8 @@ npm test
 - Respect robots.txt, website terms of service, copyright, and rate limits.
 - For dynamic pages, try `--fetch-mode browser` first.
 - For static blogs and news sites, `--fetch-mode static` is usually faster.
-- If article extraction is poor for a specific site, keep private TOML site rules outside the package and pass them with `--site-rules-dir <dir>`.
+- Feedloom ships bundled TOML site rules for common dynamic/structured sites such as WeChat official account articles and Zhihu. Site rules can define extraction, cleanup, and fetch preferences. For example, the bundled Zhihu rule uses browser fetch with copied Chrome state when `--chrome-user-data-dir`/`--chrome-profile` are configured.
+- If article extraction is poor for a specific site, keep private TOML site rules outside the package and pass them with `--site-rules-dir <dir>`. Private rules are loaded after bundled rules.
 - For large batches, test with `--limit` before running the full job.
 ## Acknowledgements

package/dist/cli.js CHANGED Viewed

@@ -2,7 +2,8 @@
 // src/cli.ts
 import { readdir as readdir2 } from "fs/promises";
-import { join as join7, resolve as resolve2 } from "path";
+import { dirname, join as join7, resolve as resolve2 } from "path";
+import { fileURLToPath } from "url";
 import { Command } from "commander";
 // src/cleaning/profiles.ts
@@ -38,7 +39,30 @@ function profileFromTomlRule(name, rule) {
     },
     metadata: {
       fixedAuthor: rule.metadata?.fixed_author,
-      titleSuffixPatterns: rule.metadata?.strip_title_regexes
+      titleSuffixPatterns: rule.metadata?.strip_title_regexes,
+      authorSuffixPatterns: rule.metadata?.strip_author_regexes,
+      authorSelectors: rule.metadata?.author_selectors,
+      authorMetaNames: rule.metadata?.author_meta_names,
+      authorMetaItemprops: rule.metadata?.author_meta_itemprops,
+      authorMetaProperties: rule.metadata?.author_meta_properties
+    },
+    fetch: {
+      mode: rule.fetch?.mode,
+      preferBrowserState: rule.fetch?.prefer_browser_state,
+      waitMs: rule.fetch?.wait_ms,
+      networkIdle: rule.fetch?.network_idle,
+      waitSelector: rule.fetch?.wait_selector,
+      waitSelectorState: rule.fetch?.wait_selector_state,
+      clickSelectors: rule.fetch?.click_selectors,
+      scrollToBottom: rule.fetch?.scroll_to_bottom,
+      useProxyEnv: rule.fetch?.use_proxy_env
+    },
+    media: {
+      includeMetaImages: rule.media?.include_meta_images,
+      imageMetaProperties: rule.media?.image_meta_properties
+    },
+    extraction: {
+      requireText: rule.extract?.require_text
     }
   };
 }
@@ -118,13 +142,13 @@ async function runPageActions(page, options) {
     await page.locator(selector).first().click({ timeout: 5e3 }).catch(() => void 0);
   }
   if (options.scrollToBottom) {
-    await page.evaluate(async () => {
-      const delay = (ms) => new Promise((resolve3) => setTimeout(resolve3, ms));
+    await page.evaluate(`(async () => {
+      const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
       for (let i = 0; i < 8; i += 1) {
         window.scrollTo(0, document.body.scrollHeight);
         await delay(250);
       }
-    });
+    })()`);
   }
   if (options.waitSelector) {
     await page.locator(options.waitSelector).first().waitFor({
@@ -405,13 +429,13 @@ async function fetchWithStealthContext(context, url, options) {
       await page.locator(selector).first().click({ timeout: 5e3 }).catch(() => void 0);
     }
     if (options.scrollToBottom) {
-      await page.evaluate(async () => {
-        const delay = (ms) => new Promise((resolve3) => setTimeout(resolve3, ms));
+      await page.evaluate(`(async () => {
+        const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
         for (let i = 0; i < 8; i += 1) {
           window.scrollTo(0, document.body.scrollHeight);
           await delay(250);
         }
-      });
+      })()`);
     }
     if (options.waitSelector) {
       await page.locator(options.waitSelector).first().waitFor({ state: options.waitSelectorState ?? "attached", timeout: timeoutMs }).catch(() => void 0);
@@ -820,8 +844,8 @@ function imageSource(img) {
   return first || null;
 }
 async function localizeImages(html, options) {
-  const { document: document2 } = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
-  const images = Array.from(document2.querySelectorAll("img"));
+  const { document } = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
+  const images = Array.from(document.querySelectorAll("img"));
   if (images.length === 0) return html;
   const fetchImage = options.fetchImage ?? fetch;
   const seen = /* @__PURE__ */ new Map();
@@ -860,7 +884,7 @@ async function localizeImages(html, options) {
     img.removeAttribute("data-original");
     img.removeAttribute("data-src");
   }
-  return document2.body.innerHTML;
+  return document.body.innerHTML;
 }
 // src/cleaning/clean-html.ts
@@ -921,6 +945,22 @@ function removeTrailingSiblings(element, removals, reason) {
     sibling = next;
   }
 }
+function truncationCutPoint(root, element) {
+  let current = element;
+  let best = element;
+  while (current.parentElement && current.parentElement !== root) {
+    if (current.previousElementSibling) {
+      best = current;
+    }
+    current = current.parentElement;
+  }
+  return current.previousElementSibling ? current : best;
+}
+function truncateFromElement(root, element, removals, reason) {
+  const cutPoint = truncationCutPoint(root, element);
+  removeTrailingSiblings(cutPoint, removals, reason);
+  removeElement(removals, "site-profile:content-pattern", reason, cutPoint);
+}
 function compileProfileRegexes(profiles, key) {
   return profiles.flatMap(
     (profile) => (profile.removals?.[key] ?? []).map((pattern) => ({ profile: profile.name, regex: new RegExp(pattern, "i") }))
@@ -952,8 +992,7 @@ function removeByTextPatterns(root, profiles, removals) {
     }
     const cut = text.length <= 240 ? cutContains.find((entry) => text.includes(entry.marker)) ?? cutRegexes.find((entry) => entry.regex.test(text)) : void 0;
     if (cut) {
-      removeTrailingSiblings(element, removals, cut.profile);
-      removeElement(removals, "site-profile:content-pattern", cut.profile, element);
+      truncateFromElement(root, element, removals, cut.profile);
       return;
     }
     const exactProfile = dropExact.get(text);
@@ -985,6 +1024,18 @@ function cleanupTitle(metadata, profiles) {
   }
   metadata.title = title;
 }
+function cleanupAuthor(metadata, profiles) {
+  if (!metadata.author) {
+    return;
+  }
+  let author = metadata.author;
+  for (const profile of profiles) {
+    for (const pattern of profile.metadata?.authorSuffixPatterns ?? []) {
+      author = author.replace(new RegExp(pattern, "i"), "").trim();
+    }
+  }
+  metadata.author = author;
+}
 function applySiteProfiles(root, profiles, removals) {
   removeByExactSelectors(root, profiles, removals);
   removeByPartialAttributePatterns(root, profiles, removals);
@@ -993,6 +1044,7 @@ function applySiteProfiles(root, profiles, removals) {
 function applyMetadataProfiles(metadata, profiles) {
   applyFixedAuthor(metadata, profiles);
   cleanupTitle(metadata, profiles);
+  cleanupAuthor(metadata, profiles);
 }
 // src/cleaning/clean-html.ts
@@ -1014,17 +1066,17 @@ var DEFAULT_FEEDLOOM_PROFILE = {
   }
 };
 var DefuddleClass = DefuddleModule.default ?? DefuddleModule.Defuddle;
-function firstMetaContent(document2, names) {
+function firstMetaContent(document, names) {
   for (const name of names) {
     const escaped = name.replace(/"/g, '\\"');
-    const element = document2.querySelector(`meta[property="${escaped}"], meta[name="${escaped}"], meta[itemprop="${escaped}"]`);
+    const element = document.querySelector(`meta[property="${escaped}"], meta[name="${escaped}"], meta[itemprop="${escaped}"]`);
     const content = element?.getAttribute("content")?.trim();
     if (content) return content;
   }
   return void 0;
 }
-function jsonLdValue(document2, keys) {
-  for (const script of Array.from(document2.querySelectorAll('script[type="application/ld+json"]'))) {
+function jsonLdValue(document, keys) {
+  for (const script of Array.from(document.querySelectorAll('script[type="application/ld+json"]'))) {
     const text = script.textContent?.trim();
     if (!text) continue;
     try {
@@ -1045,27 +1097,69 @@ function jsonLdValue(document2, keys) {
   }
   return void 0;
 }
-function toMetadata(result, document2) {
+function profileAuthorFromDocument(document, profiles) {
+  for (const profile of profiles) {
+    const metadata = profile.metadata;
+    if (!metadata) continue;
+    for (const selector of metadata.authorSelectors ?? []) {
+      const author = document.querySelector(selector)?.textContent?.replace(/\s+/g, " ").trim();
+      if (author) return author;
+    }
+    const metaNames = [
+      ...(metadata.authorMetaNames ?? []).map((value) => ({ attr: "name", value })),
+      ...(metadata.authorMetaItemprops ?? []).map((value) => ({ attr: "itemprop", value })),
+      ...(metadata.authorMetaProperties ?? []).map((value) => ({ attr: "property", value }))
+    ];
+    for (const entry of metaNames) {
+      const escaped = entry.value.replace(/"/g, '\\"');
+      const author = document.querySelector(`meta[${entry.attr}="${escaped}"]`)?.getAttribute("content")?.trim();
+      if (author) return author;
+    }
+  }
+  return void 0;
+}
+function toMetadata(result, document, profiles) {
   return {
-    title: result.title || firstMetaContent(document2, ["og:title", "twitter:title"]) || document2.querySelector("title")?.textContent?.trim() || void 0,
-    description: result.description || firstMetaContent(document2, ["description", "og:description", "twitter:description"]),
+    title: result.title || firstMetaContent(document, ["og:title", "twitter:title"]) || document.querySelector("title")?.textContent?.trim() || void 0,
+    description: result.description || firstMetaContent(document, ["description", "og:description", "twitter:description"]),
     domain: result.domain || void 0,
     favicon: result.favicon || void 0,
-    image: result.image || firstMetaContent(document2, ["og:image", "twitter:image"]),
-    language: result.language || document2.documentElement.getAttribute("lang") || void 0,
-    published: result.published || firstMetaContent(document2, ["article:published_time", "date", "datePublished", "pubdate", "publishdate"]) || jsonLdValue(document2, ["datePublished", "dateCreated"]),
-    author: result.author || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
-    site: result.site || firstMetaContent(document2, ["og:site_name", "application-name"]),
+    image: result.image || firstMetaContent(document, ["og:image", "twitter:image"]),
+    language: result.language || document.documentElement.getAttribute("lang") || void 0,
+    published: result.published || firstMetaContent(document, ["article:published_time", "date", "datePublished", "pubdate", "publishdate"]) || jsonLdValue(document, ["datePublished", "dateCreated"]),
+    author: result.author || profileAuthorFromDocument(document, profiles) || firstMetaContent(document, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document, ["author", "creator"]),
+    site: result.site || firstMetaContent(document, ["og:site_name", "application-name"]),
     schemaOrgData: result.schemaOrgData,
     wordCount: result.wordCount,
     parseTime: result.parseTime
   };
 }
-function serializeProfiledContent(content, profiles, removals) {
-  const { document: document2 } = parseHTML2(`<!doctype html><html><body><main data-feedloom-profile-root="true">${content}</main></body></html>`);
-  const root = document2.querySelector('[data-feedloom-profile-root="true"]') ?? document2.body;
+function appendMetaImages(document, root, profiles) {
+  const properties = profiles.flatMap((profile) => profile.media?.includeMetaImages ? profile.media.imageMetaProperties ?? ["og:image"] : []);
+  if (properties.length === 0) {
+    return;
+  }
+  const seen = new Set(Array.from(root.querySelectorAll("img")).map((img) => img.getAttribute("src") ?? ""));
+  for (const property of properties) {
+    const escaped = property.replace(/"/g, '\\"');
+    for (const meta of Array.from(document.querySelectorAll(`meta[property="${escaped}"], meta[name="${escaped}"], meta[itemprop="${escaped}"]`))) {
+      const src = meta.getAttribute("content")?.trim();
+      if (!src || seen.has(src)) continue;
+      const img = document.createElement("img");
+      img.setAttribute("src", src);
+      img.setAttribute("alt", "");
+      root.appendChild(document.createElement("p"));
+      root.lastElementChild?.appendChild(img);
+      seen.add(src);
+    }
+  }
+}
+function serializeProfiledContent(document, content, profiles, removals) {
+  const { document: contentDocument } = parseHTML2(`<!doctype html><html><body><main data-feedloom-profile-root="true">${content}</main></body></html>`);
+  const root = contentDocument.querySelector('[data-feedloom-profile-root="true"]') ?? contentDocument.body;
+  appendMetaImages(document, root, profiles);
   applySiteProfiles(root, profiles, removals);
-  const serialized = root.innerHTML || root.outerHTML || document2.body.innerHTML;
+  const serialized = root.innerHTML || root.outerHTML || contentDocument.body.innerHTML;
   return serialized.trim() ? `${serialized.trim()}
 ` : "";
 }
@@ -1080,9 +1174,9 @@ var HtmlCleaner = class {
     const preferredContentSelector = this.options.contentSelector ?? firstContentSelector(activeProfiles);
     const removals = [];
     const html = /<html[\s>]/i.test(rawHtml) ? rawHtml : `<!doctype html><html><body>${rawHtml}</body></html>`;
-    const { document: document2 } = parseHTML2(html);
-    const contentSelector = preferredContentSelector && document2.querySelector(preferredContentSelector) ? preferredContentSelector : void 0;
-    const doc = document2;
+    const { document } = parseHTML2(html);
+    const contentSelector = preferredContentSelector && document.querySelector(preferredContentSelector) ? preferredContentSelector : void 0;
+    const doc = document;
     if (this.options.baseUrl) {
       doc.URL = this.options.baseUrl;
     }
@@ -1099,12 +1193,14 @@ var HtmlCleaner = class {
       removeExactSelectors: this.options.removeExactSelectors,
       removePartialSelectors: this.options.removePartialSelectors,
       removeContentPatterns: this.options.removeContentPatterns,
-      standardize: this.options.standardize
+      standardize: this.options.standardize,
+      fetch: this.options.defuddleFetch,
+      language: this.options.language
     });
     const result = parser2.parseAsync ? await parser2.parseAsync() : parser2.parse();
-    const metadata = toMetadata(result, document2);
+    const metadata = toMetadata(result, document, activeProfiles);
     applyMetadataProfiles(metadata, activeProfiles);
-    const content = serializeProfiledContent(result.content, postProfiles, removals);
+    const content = serializeProfiledContent(document, result.content, postProfiles, removals);
     return {
       content,
       contentMarkdown: result.contentMarkdown,
@@ -1121,6 +1217,212 @@ async function cleanHtml(rawHtml, options = {}) {
   return new HtmlCleaner(options).parse(rawHtml);
 }
+// src/fetch/proxy-fetch.ts
+import { request as httpRequest } from "http";
+import { connect as tlsConnect } from "tls";
+var REDIRECT_STATUSES = /* @__PURE__ */ new Set([301, 302, 303, 307, 308]);
+var DEFAULT_REDIRECT_LIMIT = 10;
+function envProxyForUrl(targetUrl) {
+  const raw = targetUrl.protocol === "https:" ? process.env.HTTPS_PROXY || process.env.https_proxy || process.env.ALL_PROXY || process.env.all_proxy : process.env.HTTP_PROXY || process.env.http_proxy || process.env.ALL_PROXY || process.env.all_proxy;
+  if (!raw || noProxyMatches(targetUrl.hostname)) {
+    return null;
+  }
+  try {
+    return new URL(raw);
+  } catch {
+    return null;
+  }
+}
+function noProxyMatches(hostname) {
+  const raw = process.env.NO_PROXY ?? process.env.no_proxy ?? "";
+  if (!raw) return false;
+  const host = hostname.toLowerCase();
+  return raw.split(",").map((entry) => entry.trim().toLowerCase()).some((entry) => {
+    if (!entry) return false;
+    if (entry === "*") return true;
+    if (entry.startsWith(".")) return host === entry.slice(1) || host.endsWith(entry);
+    return host === entry || host.endsWith(`.${entry}`);
+  });
+}
+function headersToRecord(headers) {
+  const record = {};
+  if (!headers) return record;
+  new Headers(headers).forEach((value, key) => {
+    record[key] = value;
+  });
+  return record;
+}
+function responseHeaders(headers) {
+  const result = new Headers();
+  for (const [key, value] of Object.entries(headers)) {
+    if (Array.isArray(value)) {
+      for (const item of value) result.append(key, item);
+    } else if (value !== void 0) {
+      result.set(key, String(value));
+    }
+  }
+  return result;
+}
+async function bodyToBuffer(body) {
+  if (body === void 0 || body === null) return void 0;
+  if (typeof ReadableStream !== "undefined" && body instanceof ReadableStream) {
+    throw new Error("proxy-aware fetch does not support streaming request bodies");
+  }
+  if (typeof body === "string") return Buffer.from(body);
+  if (body instanceof URLSearchParams) return Buffer.from(body.toString());
+  if (body instanceof ArrayBuffer) return Buffer.from(body);
+  if (ArrayBuffer.isView(body)) return Buffer.from(body.buffer, body.byteOffset, body.byteLength);
+  if (typeof Blob !== "undefined" && body instanceof Blob) return Buffer.from(await body.arrayBuffer());
+  throw new Error("proxy-aware fetch only supports buffered request bodies");
+}
+function collectResponse(res, done) {
+  const chunks = [];
+  res.on("data", (chunk) => {
+    chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+  });
+  res.on("end", () => {
+    done(null, {
+      status: res.statusCode ?? 0,
+      statusText: res.statusMessage ?? "",
+      headers: Object.fromEntries(responseHeaders(res.headers)),
+      body: Buffer.concat(chunks)
+    });
+  });
+  res.on("error", (error) => done(error));
+}
+function proxyAuthorization(proxy) {
+  if (!proxy.username) return void 0;
+  return `Basic ${Buffer.from(`${decodeURIComponent(proxy.username)}:${decodeURIComponent(proxy.password)}`).toString("base64")}`;
+}
+function requestViaHttpProxy(targetUrl, proxy, method, headers, body, signal) {
+  if (proxy.protocol !== "http:") {
+    throw new Error(`Unsupported proxy protocol: ${proxy.protocol}`);
+  }
+  return new Promise((resolve3, reject) => {
+    let settled = false;
+    let active = null;
+    const done = (error, response) => {
+      if (settled) return;
+      settled = true;
+      signal?.removeEventListener("abort", abort);
+      if (error) reject(error);
+      else if (response) resolve3(response);
+      else reject(new Error("Proxy request ended without a response"));
+    };
+    const abort = () => active?.destroy(new Error("The operation was aborted"));
+    if (signal?.aborted) {
+      done(new Error("The operation was aborted"));
+      return;
+    }
+    signal?.addEventListener("abort", abort, { once: true });
+    const targetPort = targetUrl.port ? Number(targetUrl.port) : targetUrl.protocol === "https:" ? 443 : 80;
+    const proxyPort = proxy.port ? Number(proxy.port) : 8080;
+    const auth = proxyAuthorization(proxy);
+    const requestHeaders2 = {
+      ...headers,
+      host: targetUrl.host
+    };
+    if (body && !Object.keys(requestHeaders2).some((key) => key.toLowerCase() === "content-length")) {
+      requestHeaders2["content-length"] = String(body.byteLength);
+    }
+    if (targetUrl.protocol === "https:") {
+      const connectHeaders = { host: `${targetUrl.hostname}:${targetPort}` };
+      if (auth) connectHeaders["proxy-authorization"] = auth;
+      const connectReq = httpRequest({
+        host: proxy.hostname,
+        port: proxyPort,
+        method: "CONNECT",
+        path: `${targetUrl.hostname}:${targetPort}`,
+        headers: connectHeaders
+      });
+      active = connectReq;
+      connectReq.on("connect", (connectRes, socket) => {
+        if (connectRes.statusCode !== 200) {
+          socket.destroy();
+          done(new Error(`Proxy CONNECT failed: ${connectRes.statusCode ?? 0}`));
+          return;
+        }
+        const tlsSocket = tlsConnect({ socket, host: targetUrl.hostname, servername: targetUrl.hostname });
+        active = tlsSocket;
+        tlsSocket.on("error", (error) => done(error));
+        tlsSocket.on("secureConnect", () => {
+          const req2 = httpRequest({
+            method,
+            path: `${targetUrl.pathname}${targetUrl.search}`,
+            headers: requestHeaders2,
+            createConnection: () => tlsSocket
+          }, (res) => collectResponse(res, done));
+          active = req2;
+          req2.on("error", (error) => done(error));
+          if (body) req2.write(body);
+          req2.end();
+        });
+      });
+      connectReq.on("error", (error) => done(error));
+      connectReq.end();
+      return;
+    }
+    if (auth) requestHeaders2["proxy-authorization"] = auth;
+    const req = httpRequest({
+      host: proxy.hostname,
+      port: proxyPort,
+      method,
+      path: targetUrl.href,
+      headers: requestHeaders2
+    }, (res) => collectResponse(res, done));
+    active = req;
+    req.on("error", (error) => done(error));
+    if (body) req.write(body);
+    req.end();
+  });
+}
+function requestUrl(input) {
+  if (input instanceof URL) return input;
+  if (typeof input === "string") return new URL(input);
+  return new URL(input.url);
+}
+function requestMethod(input, init) {
+  if (init.method) return init.method.toUpperCase();
+  if (input instanceof Request) return input.method.toUpperCase();
+  return "GET";
+}
+function requestHeaders(input, init) {
+  return {
+    ...input instanceof Request ? headersToRecord(input.headers) : {},
+    ...headersToRecord(init.headers)
+  };
+}
+async function proxyAwareFetchInternal(input, init, redirectsLeft) {
+  const url = requestUrl(input);
+  const proxy = envProxyForUrl(url);
+  if (!proxy) {
+    return fetch(input, init);
+  }
+  const method = requestMethod(input, init);
+  const headers = requestHeaders(input, init);
+  const body = await bodyToBuffer(init.body ?? (input instanceof Request ? input.body : void 0));
+  const proxied = await requestViaHttpProxy(url, proxy, method, headers, body, init.signal ?? void 0);
+  const location = proxied.headers.location;
+  const redirectMode = init.redirect ?? "follow";
+  if (redirectMode !== "manual" && REDIRECT_STATUSES.has(proxied.status) && location && redirectsLeft > 0) {
+    const nextUrl = new URL(location, url);
+    const nextInit = { ...init };
+    if (proxied.status === 303) {
+      nextInit.method = "GET";
+      nextInit.body = void 0;
+    }
+    return proxyAwareFetchInternal(nextUrl, nextInit, redirectsLeft - 1);
+  }
+  return new Response(new Uint8Array(proxied.body), {
+    status: proxied.status,
+    statusText: proxied.statusText,
+    headers: proxied.headers
+  });
+}
+async function proxyAwareFetch(input, init = {}) {
+  return proxyAwareFetchInternal(input, init, DEFAULT_REDIRECT_LIMIT);
+}
 // src/fetch/strategy.ts
 import { writeFile as writeFile3 } from "fs/promises";
@@ -1135,8 +1437,8 @@ function extractPreloadedMarkdownUrl(html, baseUrl) {
   }
   return new URL(rawUrl, baseUrl).toString();
 }
-function removeNoise(document2) {
-  document2.querySelectorAll("script, style, noscript, svg, iframe").forEach((element) => element.remove());
+function removeNoise(document) {
+  document.querySelectorAll("script, style, noscript, svg, iframe").forEach((element) => element.remove());
 }
 function normalizedTextLength(element) {
   return (element?.textContent ?? "").replace(/\s+/g, " ").trim().length;
@@ -1145,12 +1447,12 @@ function htmlHasMeaningfulContent(url, html) {
   if (extractPreloadedMarkdownUrl(html, url) !== null) {
     return true;
   }
-  const { document: document2 } = parseHTML3(html);
-  removeNoise(document2);
+  const { document } = parseHTML3(html);
+  removeNoise(document);
   const selectors = ["#js_content", "article", "main", "section", "div", "body"];
   let bestLength = 0;
   for (const selector of selectors) {
-    document2.querySelectorAll(selector).forEach((element) => {
+    document.querySelectorAll(selector).forEach((element) => {
       bestLength = Math.max(bestLength, normalizedTextLength(element));
     });
     if (bestLength >= 600 && selector !== "div") {
@@ -1245,11 +1547,11 @@ async function fetchBrowserHtmlWithBrowserState(url, config) {
 }
 // src/fetch/static.ts
-async function fetchStaticHtml(url, timeoutMs = 6e4) {
+async function fetchStaticHtml(url, timeoutMs = 6e4, fetchImpl = fetch) {
   const controller = new AbortController();
   const timeout = setTimeout(() => controller.abort(), timeoutMs);
   try {
-    const response = await fetch(url, {
+    const response = await fetchImpl(url, {
       redirect: "follow",
       signal: controller.signal,
       headers: {
@@ -1278,7 +1580,7 @@ async function writeOutputIfRequested(outputPath, html) {
 }
 async function fetchHtmlResult(url, options = {}) {
   const isMeaningful = options.isMeaningful ?? htmlHasMeaningfulContent;
-  const staticFetch = options.staticFetch ?? (async (targetUrl) => (await fetchStaticHtml(targetUrl)).html);
+  const staticFetch = options.staticFetch ?? (async (targetUrl) => (await fetchStaticHtml(targetUrl, void 0, options.useProxyEnv ? proxyAwareFetch : void 0)).html);
   const browserFetch = options.browserFetch ?? ((targetUrl) => fetchBrowserHtml(targetUrl, {
     waitMs: options.waitMs,
     networkIdle: options.networkIdle,
@@ -1446,9 +1748,9 @@ ${code}
 \`\`\``).replace(/\[\s*\]\((?:#|javascript:void\(0\)|javascript:;)\)/gi, "").replace(/(^|[^\\])\$(?=\d)/g, "$1\\$").replace(/\n\s*\n\s*([-*+]\s)/g, "\n$1").replace(/[ \t]+\n/g, "\n").replace(/\n{3,}/g, "\n\n").trim();
 }
 function htmlFragmentText(fragment) {
-  const { document: document2 } = parseHTML4(`<!doctype html><html><body>${fragment}</body></html>`);
-  document2.querySelectorAll("br").forEach((br) => br.replaceWith(document2.createTextNode("\n")));
-  return document2.body.textContent ?? "";
+  const { document } = parseHTML4(`<!doctype html><html><body>${fragment}</body></html>`);
+  document.querySelectorAll("br").forEach((br) => br.replaceWith(document.createTextNode("\n")));
+  return document.body.textContent ?? "";
 }
 function fencedCodeHtml(text) {
   const escaped = text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
@@ -1544,10 +1846,45 @@ function resolveCreatedValue(item, published) {
   if (item.publishedAt) return createdFromItemDate(item.publishedAt);
   return (/* @__PURE__ */ new Date()).toISOString().replace(/\.\d{3}Z$/, "Z");
 }
+function mergeProfileFetchOptions(options, profiles) {
+  const merged = { ...options };
+  for (const profile of profiles) {
+    if (profile.fetch?.mode) merged.fetchMode = profile.fetch.mode;
+    if (profile.fetch?.waitMs !== void 0) merged.waitMs = profile.fetch.waitMs;
+    if (profile.fetch?.networkIdle !== void 0) merged.networkIdle = profile.fetch.networkIdle;
+    if (profile.fetch?.waitSelector) merged.waitSelector = profile.fetch.waitSelector;
+    if (profile.fetch?.waitSelectorState) merged.waitSelectorState = profile.fetch.waitSelectorState;
+    if (profile.fetch?.clickSelectors) merged.clickSelectors = profile.fetch.clickSelectors;
+    if (profile.fetch?.scrollToBottom !== void 0) merged.scrollToBottom = profile.fetch.scrollToBottom;
+    if (profile.fetch?.useProxyEnv !== void 0) merged.useProxyEnv = profile.fetch.useProxyEnv;
+    if (profile.fetch?.preferBrowserState && options.browserStateDefaults) {
+      merged.browserState = {
+        ...options.browserStateDefaults,
+        waitMs: merged.waitMs,
+        networkIdle: merged.networkIdle,
+        proxy: merged.proxy,
+        dnsOverHttps: merged.dnsOverHttps,
+        waitSelector: merged.waitSelector,
+        waitSelectorState: merged.waitSelectorState,
+        clickSelectors: merged.clickSelectors,
+        scrollToBottom: merged.scrollToBottom,
+        headless: merged.headless,
+        realChromeDefaults: merged.realChromeDefaults
+      };
+    }
+  }
+  return merged;
+}
 async function processItem(item, options) {
-  const html = await fetchHtml(item.url, options);
+  const urlProfiles = selectActiveProfiles(options.profiles, item.url, "");
+  const fetchOptions = mergeProfileFetchOptions(options, urlProfiles);
+  const html = await fetchHtml(item.url, fetchOptions);
   const activeProfiles = selectActiveProfiles(options.profiles, item.url, html);
-  const cleaned = await cleanHtml(html, { baseUrl: item.url, profiles: options.profiles, activeProfiles });
+  const defuddleFetch = activeProfiles.some((profile) => profile.fetch?.useProxyEnv) ? proxyAwareFetch : void 0;
+  const cleaned = await cleanHtml(html, { baseUrl: item.url, profiles: options.profiles, activeProfiles, defuddleFetch });
+  if (activeProfiles.some((profile) => profile.extraction?.requireText) && !cleaned.content.replace(/<[^>]*>/g, "").trim()) {
+    throw new Error("matched site rule requires extracted text, but no text content was extracted");
+  }
   const title = cleaned.metadata.title || item.sourceTitle || titleFromUrl(item.url);
   await cleanupExistingNote(options.outputDir, item.url);
   const contentHtml = options.localizeAssets === false ? cleaned.content : await localizeImages(cleaned.content, {
@@ -1619,7 +1956,10 @@ var ProgressTracker = class {
 var program = new Command();
 async function siteRulePathsFromDir(dir) {
   const names = await readdir2(dir);
-  return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
+  return names.filter((name) => name.endsWith(".toml")).sort().map((name) => join7(dir, name));
+}
+function builtinSiteRulesDir() {
+  return join7(dirname(fileURLToPath(import.meta.url)), "site-rules");
 }
 function positiveIntOption(value, fallback) {
   const parsed = Number(value ?? fallback);
@@ -1656,7 +1996,9 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
       positiveIntOption(options.limit, 0)
     );
     const siteRulesDir = String(options.siteRulesDir || "");
-    const profiles = siteRulesDir ? await loadSiteProfiles(await siteRulePathsFromDir(resolve2(siteRulesDir))) : [];
+    const builtinRulePaths = await siteRulePathsFromDir(builtinSiteRulesDir());
+    const customRulePaths = siteRulesDir ? await siteRulePathsFromDir(resolve2(siteRulesDir)) : [];
+    const profiles = await loadSiteProfiles([...builtinRulePaths, ...customRulePaths]);
     const outputDir = String(options.outputDir ?? "clippings");
     let failures = 0;
     const tracker = new ProgressTracker(selected, outputDir);
@@ -1675,6 +2017,10 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
       headless: !Boolean(options.headful),
       realChromeDefaults: options.realChromeDefaults !== false
     };
+    const browserStateDefaults = {
+      userDataDir: String(options.chromeUserDataDir || ""),
+      profile: String(options.chromeProfile || "Default")
+    };
     const sessions = options.reuseBrowser === false ? null : new BatchFetchSessions({
       browser: browserOptions,
       stealth: {
@@ -1687,15 +2033,11 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
       for (const item of selected) {
         tracker.start(item.url);
         try {
-          const browserState = options.preferBrowserState ? {
-            userDataDir: String(options.chromeUserDataDir || ""),
-            profile: String(options.chromeProfile || "Default"),
-            ...browserOptions
-          } : null;
           const result = await processItem(item, {
             outputDir,
             profiles,
-            browserState,
+            browserState: options.preferBrowserState ? { ...browserStateDefaults, ...browserOptions } : null,
+            browserStateDefaults,
             fetchMode,
             ...browserOptions,
             solveCloudflare: Boolean(options.solveCloudflare),

package/dist/site-rules/wechat.toml ADDED Viewed

@@ -0,0 +1,5 @@
+[match]
+host_suffixes = ["mp.weixin.qq.com"]
+[extract]
+selectors = ["#js_content"]

package/dist/site-rules/xiaohongshu.toml ADDED Viewed

@@ -0,0 +1,55 @@
+[match]
+host_suffixes = ["xiaohongshu.com"]
+[fetch]
+mode = "auto"
+scroll_to_bottom = true
+wait_ms = 5000
+[extract]
+selectors = [".note-content", "#noteContainer", ".note-container", ".note-detail"]
+[metadata]
+strip_title_regexes = ["\\s*-\\s*小红书\\s*$"]
+strip_author_regexes = ["关注$"]
+author_selectors = [".author-container .username", ".author-wrapper .name", ".user-name"]
+author_meta_names = ["author"]
+author_meta_properties = ["article:author"]
+[media]
+include_meta_images = true
+image_meta_properties = ["og:image"]
+[clean.remove]
+selectors = [
+  ".side-bar",
+  ".left-container",
+  ".comments-el",
+  ".interactions",
+  ".engage-bar",
+  ".note-detail-dropdown",
+  ".close-circle",
+  ".close-box",
+  ".login-container",
+  ".bottom-container .notedetail-menu",
+  ".author",
+  ".author-container",
+  ".media-container",
+  ".fraction",
+  ".arrow-controller",
+  ".pagination-media-container"
+]
+text_contains = [
+  "创作中心",
+  "业务合作",
+  "沪ICP备",
+  "营业执照",
+  "违法不良信息举报电话",
+  "行吟信息科技",
+  "小红书网页版",
+  "登录后推荐更懂你的笔记"
+]
+exact_text = ["关注", "加载中", "更多", "发现", "直播", "发布", "通知"]
+[clean.truncate]
+after_regexes = ["^共 \\d+ 条评论$", "^相关推荐$", "^登录后推荐更懂你的笔记$"]

package/dist/site-rules/youtube.toml ADDED Viewed

@@ -0,0 +1,10 @@
+[match]
+host_suffixes = ["youtube.com", "youtu.be"]
+url_regexes = ["https?://(www\\.)?youtube\\.com/watch\\?", "https?://youtu\\.be/"]
+[fetch]
+mode = "auto"
+use_proxy_env = true
+[extract]
+require_text = true

package/dist/site-rules/zhihu.toml ADDED Viewed

@@ -0,0 +1,22 @@
+[match]
+host_suffixes = ["zhihu.com"]
+[extract]
+selectors = ["[class*=\"Post-RichTextContainer\"]", "[class*=\"RichText ztext\"]", "[class*=\"RichContent-inner\"]"]
+[metadata]
+strip_title_regexes = ["\\s*-\\s*知乎\\s*$"]
+[fetch]
+mode = "browser"
+prefer_browser_state = true
+scroll_to_bottom = true
+wait_ms = 8000
+[clean.remove]
+class_contains = ["RichText-LinkCardContainer"]
+text_regexes = ["^目录收起$", "^目录收起.*References$", "^.+\\d+ 赞同 · \\d+ 评论 文章$", "^.+\\d+ 赞同 · \\d+ 评论 文章$", "^\\d+ 赞同 · \\d+ 评论 文章$"]
+exact_text = ["目录", "收起"]
+[clean.truncate]
+after_regexes = ["^发布于 ", "^赞同 ", "^\\d+ 条评论$", "^分享$", "^申请转载$", ".*的广告$"]

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ariesfish/feedloom",
-  "version": "0.1.1",
+  "version": "0.1.3",
   "type": "module",
   "author": "ariesfish",
   "license": "MIT",
@@ -24,7 +24,7 @@
   ],
   "scripts": {
     "dev": "tsx src/cli.ts",
-    "build": "tsup src/cli.ts --format esm --dts --clean",
+    "build": "tsup src/cli.ts --format esm --dts --clean && rm -rf dist/site-rules && cp -R src/site-rules dist/site-rules",
     "typecheck": "tsc --noEmit",
     "test": "vitest run",
     "prepublishOnly": "npm run typecheck && npm test && npm run build"

package/skills/feedloom/SKILL.md CHANGED Viewed

@@ -22,6 +22,8 @@ npx -y @ariesfish/feedloom <inputs...> [options]
 ## Common usage
+Before running Feedloom, check whether this skill directory has a `site-rules/` directory. If it exists, always pass it with `--site-rules-dir $HOME/.agents/skills/feedloom/site-rules`; do not omit available site rules.
 ```bash
 npx -y @ariesfish/feedloom "https://example.com/article"
 npx -y @ariesfish/feedloom urls.txt
@@ -56,20 +58,24 @@ Use the least expensive mode that works:
 - `--wait-ms <ms>`, `--wait-selector <selector>`, `--scroll-to-bottom`: give dynamic pages time or actions to reveal article content.
 - `--click-selector <selector...>`: click dismiss/expand selectors before extracting HTML.
 - `--headful`: show the browser window for debugging login, popups, or dynamic loading.
-- `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example `skills/feedloom/site-rules/` reference folder.
+- `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example the `$HOME/.agents/skills/feedloom/site-rules/` reference folder.
 - `--solve-cloudflare`, `--proxy <server>`, `--dns-over-https`: use only when stealth fetching needs them.
 Run `npx -y @ariesfish/feedloom --help` for the complete option list. Do not invent unsupported options.
-## Private site rules
+## Site rules
+Feedloom ships built-in TOML site rules in the package for common sites such as WeChat and Zhihu. These are loaded automatically; do not pass a special option for built-in rules.
-Site-specific TOML rules are intentionally optional and should not be assumed to be bundled with the package. If the user keeps private rules next to this skill, pass that directory explicitly:
+Private skill rules are also supported and must be used when present next to this skill. Always check for `$HOME/.agents/skills/feedloom/site-rules/` before clipping. If that directory exists, pass it explicitly on every Feedloom command using the `$HOME`-prefixed path:
 ```bash
-npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir skills/feedloom/site-rules
+npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir $HOME/.agents/skills/feedloom/site-rules
 ```
-Treat rule files in `skills/feedloom/site-rules/` as local reference material: use them only when present and relevant.
+Treat rule files in `$HOME/.agents/skills/feedloom/site-rules/` as local reference material and use them whenever available; never skip an existing site-rules directory unless the user explicitly asks not to use it.
+For adding or editing private rules, read `references/site-rules.md`. It contains the TOML schema, examples, `[fetch]` behavior, and validation workflow.
 ## Output

package/skills/feedloom/references/site-rules.md ADDED Viewed

@@ -0,0 +1,104 @@
+# Feedloom site rules
+Use TOML site rules when Feedloom needs a narrow site-specific selector, cleanup overlay, metadata normalization, or conservative fetch preference. Do not write ad-hoc scrapers.
+## Locations
+Private skill rules live in:
+```text
+$HOME/.agents/skills/feedloom/site-rules/<site>.toml
+```
+When the private rules directory exists, pass it on every command:
+```bash
+npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir $HOME/.agents/skills/feedloom/site-rules
+```
+## Add a private rule
+Create or edit one TOML file per site:
+```bash
+mkdir -p $HOME/.agents/skills/feedloom/site-rules
+$EDITOR $HOME/.agents/skills/feedloom/site-rules/example.toml
+```
+Minimal rule:
+```toml
+[match]
+host_suffixes = ["example.com"]
+[extract]
+selectors = ["article", "main"]
+```
+Rule with fetch preferences:
+```toml
+[match]
+host_suffixes = ["zhihu.com"]
+[fetch]
+mode = "browser"
+prefer_browser_state = true
+scroll_to_bottom = true
+wait_ms = 8000
+[extract]
+selectors = ["[class*=\"Post-RichTextContainer\"]", "[class*=\"RichText ztext\"]"]
+```
+## Schema
+Supported sections:
+- `[match]`: `host_suffixes`, `host_regexes`, `url_regexes`, `html_markers`.
+- `[fetch]`: `mode`, `prefer_browser_state`, `wait_ms`, `network_idle`, `wait_selector`, `wait_selector_state`, `click_selectors`, `scroll_to_bottom`, `use_proxy_env`.
+- `[extract]`: `selectors`, `require_text`.
+- `[metadata]`: `fixed_author`, `strip_title_regexes`, `strip_author_regexes`, `author_selectors`, `author_meta_names`, `author_meta_itemprops`, `author_meta_properties`.
+- `[clean.remove]`: `selectors`, `class_contains`, `id_contains`, `attr_contains`, `text_contains`, `text_regexes`, `exact_text`.
+- `[clean.truncate]`: `after_contains`, `after_regexes`.
+## Fetch rules
+Use `[fetch]` only when a site consistently needs browser rendering, local Chrome state, scrolling, waiting, clicking, or proxy-aware requests.
+`use_proxy_env = true` tells Feedloom to use `HTTP_PROXY`, `HTTPS_PROXY`, `ALL_PROXY`, and `NO_PROXY` for static fetches and Defuddle async extractor fetches. Use this for YouTube transcript capture and similar extractor-backed pages that need the user's proxy settings.
+`prefer_browser_state = true` only tells Feedloom to use copied Chrome state for matching URLs. It does not store the local Chrome path. The command still needs Chrome state parameters when login state is required:
+```bash
+npx -y @ariesfish/feedloom \
+  --chrome-user-data-dir "$HOME/Library/Application Support/Google/Chrome" \
+  --chrome-profile Default \
+  --site-rules-dir $HOME/.agents/skills/feedloom/site-rules \
+  "https://zhuanlan.zhihu.com/p/..."
+```
+## Rules for writing rules
+- Prefer narrow domain-specific selectors over broad selectors.
+- Prefer content containers over page shells. Avoid `body` unless the HTML is already minimal.
+- Use `require_text = true` when a matched extractor-backed page should fail instead of writing an empty note.
+- Use cleanup only for repeated, stable noise inside otherwise correct content.
+- Use truncation only for stable tail markers where everything after the marker is non-article content.
+- Do not add aggressive crawling, high concurrency, repeated challenge solving, or broad stealth defaults.
+- Keep private rules outside project repos unless the user is working on Feedloom itself.
+## Validation
+After adding or editing a private rule, test one known URL and inspect the Markdown:
+```bash
+outdir=$(mktemp -d /tmp/feedloom-rule-test-XXXXXX)
+npx -y @ariesfish/feedloom \
+  --output-dir "$outdir" \
+  --site-rules-dir $HOME/.agents/skills/feedloom/site-rules \
+  "https://example.com/article"
+find "$outdir" -maxdepth 2 -type f | sort
+```
+For sites that require Chrome state, add the Chrome state options shown above.