npm - @ariesfish/feedloom - Versions diffs - 0.1.0 → 0.1.2 - Mend

@ariesfish/feedloom 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -68,6 +68,22 @@ npm link
 feedloom --help
 ```
+## Agent Skill
+Feedloom ships an Agent Skill in `skills/feedloom`, so agents that support the `skills` CLI can install the clipping workflow directly from the package or repository:
+```bash
+npx skills add @ariesfish/feedloom --skill feedloom
+```
+For a global install across supported agents:
+```bash
+npx skills add @ariesfish/feedloom --skill feedloom --global
+```
+After installing the skill, ask your agent to save article URLs, URL lists, or RSS feeds as Markdown. The skill runs the CLI through `npx -y @ariesfish/feedloom` by default.
 ## Quick Start
 Archive a single article to the default `clippings/` directory:
@@ -242,6 +258,7 @@ Only use this on your own device and accounts. Always respect the target site's
 --prefer-browser-state          Try local Chrome user state first
 --chrome-user-data-dir <path>   Chrome User Data directory
 --chrome-profile <name>         Chrome profile name. Default: Default
+--site-rules-dir <dir>          Optional directory of private TOML site rules
 ```
 For the full option list, run:
@@ -264,7 +281,7 @@ npm test
 - Respect robots.txt, website terms of service, copyright, and rate limits.
 - For dynamic pages, try `--fetch-mode browser` first.
 - For static blogs and news sites, `--fetch-mode static` is usually faster.
-- If article extraction is poor for a specific site, add or adjust a site rule in `src/site-rules/`.
+- If article extraction is poor for a specific site, keep private TOML site rules outside the package and pass them with `--site-rules-dir <dir>`.
 - For large batches, test with `--limit` before running the full job.
 ## Acknowledgements

package/dist/cli.js CHANGED Viewed

@@ -2,8 +2,7 @@
 // src/cli.ts
 import { readdir as readdir2 } from "fs/promises";
-import { dirname, join as join7, resolve as resolve2 } from "path";
-import { fileURLToPath } from "url";
+import { join as join7, resolve as resolve2 } from "path";
 import { Command } from "commander";
 // src/cleaning/profiles.ts
@@ -39,7 +38,11 @@ function profileFromTomlRule(name, rule) {
     },
     metadata: {
       fixedAuthor: rule.metadata?.fixed_author,
-      titleSuffixPatterns: rule.metadata?.strip_title_regexes
+      titleSuffixPatterns: rule.metadata?.strip_title_regexes,
+      authorSelectors: rule.metadata?.author_selectors,
+      authorMetaNames: rule.metadata?.author_meta_names,
+      authorMetaItemprops: rule.metadata?.author_meta_itemprops,
+      authorMetaProperties: rule.metadata?.author_meta_properties
     }
   };
 }
@@ -922,6 +925,22 @@ function removeTrailingSiblings(element, removals, reason) {
     sibling = next;
   }
 }
+function truncationCutPoint(root, element) {
+  let current = element;
+  let best = element;
+  while (current.parentElement && current.parentElement !== root) {
+    if (current.previousElementSibling) {
+      best = current;
+    }
+    current = current.parentElement;
+  }
+  return current.previousElementSibling ? current : best;
+}
+function truncateFromElement(root, element, removals, reason) {
+  const cutPoint = truncationCutPoint(root, element);
+  removeTrailingSiblings(cutPoint, removals, reason);
+  removeElement(removals, "site-profile:content-pattern", reason, cutPoint);
+}
 function compileProfileRegexes(profiles, key) {
   return profiles.flatMap(
     (profile) => (profile.removals?.[key] ?? []).map((pattern) => ({ profile: profile.name, regex: new RegExp(pattern, "i") }))
@@ -953,8 +972,7 @@ function removeByTextPatterns(root, profiles, removals) {
     }
     const cut = text.length <= 240 ? cutContains.find((entry) => text.includes(entry.marker)) ?? cutRegexes.find((entry) => entry.regex.test(text)) : void 0;
     if (cut) {
-      removeTrailingSiblings(element, removals, cut.profile);
-      removeElement(removals, "site-profile:content-pattern", cut.profile, element);
+      truncateFromElement(root, element, removals, cut.profile);
       return;
     }
     const exactProfile = dropExact.get(text);
@@ -1046,7 +1064,28 @@ function jsonLdValue(document2, keys) {
   }
   return void 0;
 }
-function toMetadata(result, document2) {
+function profileAuthorFromDocument(document2, profiles) {
+  for (const profile of profiles) {
+    const metadata = profile.metadata;
+    if (!metadata) continue;
+    for (const selector of metadata.authorSelectors ?? []) {
+      const author = document2.querySelector(selector)?.textContent?.replace(/\s+/g, " ").trim();
+      if (author) return author;
+    }
+    const metaNames = [
+      ...(metadata.authorMetaNames ?? []).map((value) => ({ attr: "name", value })),
+      ...(metadata.authorMetaItemprops ?? []).map((value) => ({ attr: "itemprop", value })),
+      ...(metadata.authorMetaProperties ?? []).map((value) => ({ attr: "property", value }))
+    ];
+    for (const entry of metaNames) {
+      const escaped = entry.value.replace(/"/g, '\\"');
+      const author = document2.querySelector(`meta[${entry.attr}="${escaped}"]`)?.getAttribute("content")?.trim();
+      if (author) return author;
+    }
+  }
+  return void 0;
+}
+function toMetadata(result, document2, profiles) {
   return {
     title: result.title || firstMetaContent(document2, ["og:title", "twitter:title"]) || document2.querySelector("title")?.textContent?.trim() || void 0,
     description: result.description || firstMetaContent(document2, ["description", "og:description", "twitter:description"]),
@@ -1055,7 +1094,7 @@ function toMetadata(result, document2) {
     image: result.image || firstMetaContent(document2, ["og:image", "twitter:image"]),
     language: result.language || document2.documentElement.getAttribute("lang") || void 0,
     published: result.published || firstMetaContent(document2, ["article:published_time", "date", "datePublished", "pubdate", "publishdate"]) || jsonLdValue(document2, ["datePublished", "dateCreated"]),
-    author: result.author || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
+    author: result.author || profileAuthorFromDocument(document2, profiles) || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
     site: result.site || firstMetaContent(document2, ["og:site_name", "application-name"]),
     schemaOrgData: result.schemaOrgData,
     wordCount: result.wordCount,
@@ -1103,7 +1142,7 @@ var HtmlCleaner = class {
       standardize: this.options.standardize
     });
     const result = parser2.parseAsync ? await parser2.parseAsync() : parser2.parse();
-    const metadata = toMetadata(result, document2);
+    const metadata = toMetadata(result, document2, activeProfiles);
     applyMetadataProfiles(metadata, activeProfiles);
     const content = serializeProfiledContent(result.content, postProfiles, removals);
     return {
@@ -1618,27 +1657,9 @@ var ProgressTracker = class {
 // src/cli.ts
 var program = new Command();
-async function standardSiteRulePaths() {
-  const here = dirname(fileURLToPath(import.meta.url));
-  const candidates = [
-    resolve2(here, "../src/site-rules"),
-    resolve2(here, "../../src/site-rules"),
-    resolve2(process.cwd(), "src/site-rules"),
-    resolve2(here, "../../src/feedloom/site_rules"),
-    resolve2(process.cwd(), "../src/feedloom/site_rules"),
-    resolve2(process.cwd(), "src/feedloom/site_rules")
-  ];
-  for (const dir of candidates) {
-    try {
-      const names = await readdir2(dir);
-      return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
-    } catch (error) {
-      if (error.code !== "ENOENT") {
-        throw error;
-      }
-    }
-  }
-  return [];
+async function siteRulePathsFromDir(dir) {
+  const names = await readdir2(dir);
+  return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
 }
 function positiveIntOption(value, fallback) {
   const parsed = Number(value ?? fallback);
@@ -1647,7 +1668,7 @@ function positiveIntOption(value, fallback) {
   }
   return parsed;
 }
-program.name("feedloom").description("Archive long-form web content as clean Markdown with local assets").version("0.1.0").option("--output-dir <dir>", "Output directory for markdown notes", "clippings").option("--source-kind <kind>", "auto, html-page, or rss-feed", "auto").option("--since <date>", "Only keep feed entries on or after YYYY-MM-DD", "").option("--limit <n>", "Process only first N deduplicated URLs", "0").option("--start <n>", "Start from 1-based index after deduplication", "1").option("--end <n>", "End at 1-based index after deduplication", "0").option("--prefer-browser-state", "Try copied local Chrome profile before regular browser fallback", false).option("--chrome-user-data-dir <path>", "Chrome user data directory used with --prefer-browser-state", "").option("--chrome-profile <name>", "Chrome profile directory name", "Default").option("--fetch-mode <mode>", "auto, static, browser, or stealth", "auto").option("--no-network-idle", "Do not wait for browser networkidle before reading HTML").option("--wait-ms <ms>", "Extra browser wait after load", "2500").option("--solve-cloudflare", "In stealth mode, attempt Cloudflare Turnstile/interstitial challenge handling", false).option("--disable-resources", "In stealth mode, block images/media/fonts/stylesheets for speed", false).option("--proxy <server>", "Proxy server for browser/stealth fetch, e.g. 
http://127.0.0.1:8080", "").option("--dns-over-https", "Use Chromium Cloudflare DNS-over-HTTPS flag for browser/stealth fetch", false).option("--wait-selector <selector>", "Wait for a CSS selector after page load", "").option("--wait-selector-state <state>", "attached, detached, visible, or hidden", "attached").option("--click-selector <selector...>", "Click one or more selectors after page load", []).option("--scroll-to-bottom", "Scroll to the bottom before reading HTML", false).option("--headful", "Run browser/browser-state fetches with a visible Chrome window", false).option("--no-real-chrome-defaults", "Disable Scrapling-inspired real Chrome context defaults").option("--no-reuse-browser", "Disable batch browser/stealth context reuse").argument("[inputs...]", "URLs or files containing URLs").action(async (inputs, options) => {
+program.name("feedloom").description("Archive long-form web content as clean Markdown with local assets").version("0.1.0").option("--output-dir <dir>", "Output directory for markdown notes", "clippings").option("--source-kind <kind>", "auto, html-page, or rss-feed", "auto").option("--since <date>", "Only keep feed entries on or after YYYY-MM-DD", "").option("--limit <n>", "Process only first N deduplicated URLs", "0").option("--start <n>", "Start from 1-based index after deduplication", "1").option("--end <n>", "End at 1-based index after deduplication", "0").option("--prefer-browser-state", "Try copied local Chrome profile before regular browser fallback", false).option("--chrome-user-data-dir <path>", "Chrome user data directory used with --prefer-browser-state", "").option("--chrome-profile <name>", "Chrome profile directory name", "Default").option("--fetch-mode <mode>", "auto, static, browser, or stealth", "auto").option("--no-network-idle", "Do not wait for browser networkidle before reading HTML").option("--wait-ms <ms>", "Extra browser wait after load", "2500").option("--solve-cloudflare", "In stealth mode, attempt Cloudflare Turnstile/interstitial challenge handling", false).option("--disable-resources", "In stealth mode, block images/media/fonts/stylesheets for speed", false).option("--proxy <server>", "Proxy server for browser/stealth fetch, e.g. 
http://127.0.0.1:8080", "").option("--dns-over-https", "Use Chromium Cloudflare DNS-over-HTTPS flag for browser/stealth fetch", false).option("--wait-selector <selector>", "Wait for a CSS selector after page load", "").option("--wait-selector-state <state>", "attached, detached, visible, or hidden", "attached").option("--click-selector <selector...>", "Click one or more selectors after page load", []).option("--scroll-to-bottom", "Scroll to the bottom before reading HTML", false).option("--headful", "Run browser/browser-state fetches with a visible Chrome window", false).option("--site-rules-dir <dir>", "Optional directory of private TOML site extraction/cleaning rules", "").option("--no-real-chrome-defaults", "Disable Scrapling-inspired real Chrome context defaults").option("--no-reuse-browser", "Disable batch browser/stealth context reuse").argument("[inputs...]", "URLs or files containing URLs").action(async (inputs, options) => {
   if (inputs.length === 0) {
     program.help({ error: true });
   }
@@ -1674,7 +1695,8 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
       positiveIntOption(options.end, 0),
       positiveIntOption(options.limit, 0)
     );
-    const profiles = await loadSiteProfiles(await standardSiteRulePaths());
+    const siteRulesDir = String(options.siteRulesDir || "");
+    const profiles = siteRulesDir ? await loadSiteProfiles(await siteRulePathsFromDir(resolve2(siteRulesDir))) : [];
     const outputDir = String(options.outputDir ?? "clippings");
     let failures = 0;
     const tracker = new ProgressTracker(selected, outputDir);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ariesfish/feedloom",
-  "version": "0.1.0",
+  "version": "0.1.2",
   "type": "module",
   "author": "ariesfish",
   "license": "MIT",
@@ -18,6 +18,7 @@
   },
   "files": [
     "dist",
+    "skills",
     "README.md",
     "LICENSE"
   ],

package/skills/feedloom/SKILL.md ADDED Viewed

@@ -0,0 +1,80 @@
+---
+name: feedloom
+description: Capture long-form web content, article URLs, URL list files, or RSS/Atom feeds into clean Markdown with local assets using the Feedloom CLI. Use for web clipping, saving articles as Markdown, archiving URL batches, clipping Zhihu/WeChat/Kaggle/blog posts, 抓取网页文章, 保存为 Markdown, URL 列表转归档, RSS 归档, and 网页长文归档.
+---
+# Feedloom
+Use Feedloom for article clipping instead of writing ad-hoc scrapers.
+## Command
+```bash
+npx -y @ariesfish/feedloom <inputs...> [options]
+```
+## Inputs
+- Direct article URLs.
+- Files containing URLs, one per line.
+- Markdown checklist files with lines like `- [ ] <url>` or `- [x] <url>`.
+- RSS/Atom feeds with `--source-kind rss-feed`.
+## Common usage
+Before running Feedloom, check whether this skill directory has a `site-rules/` directory. If it exists, always pass it with `--site-rules-dir $HOME/.agents/skills/feedloom/site-rules`; do not omit available site rules.
+```bash
+npx -y @ariesfish/feedloom "https://example.com/article"
+npx -y @ariesfish/feedloom urls.txt
+npx -y @ariesfish/feedloom urls.txt --limit 10
+npx -y @ariesfish/feedloom urls.txt --start 11 --end 20
+npx -y @ariesfish/feedloom urls.txt --output-dir clippings
+npx -y @ariesfish/feedloom "https://example.com/feed.xml" --source-kind rss-feed
+npx -y @ariesfish/feedloom "https://example.com/feed.xml" --source-kind rss-feed --since 2026-01-01
+npx -y @ariesfish/feedloom "https://example.com/article" --fetch-mode browser --wait-ms 4000 --scroll-to-bottom
+npx -y @ariesfish/feedloom "https://example.com/article" --prefer-browser-state
+```
+## Fetch workflow
+Use the least expensive mode that works:
+1. Start with default `auto`. It tries each fetch layer in order until one returns meaningful content: `static` → `browser-state` (only when `--prefer-browser-state` is set) → `browser` → `stealth`.
+2. Use `--fetch-mode static` only for simple pages when speed matters and JavaScript/login state is unnecessary.
+3. Use `--fetch-mode browser` for JavaScript-rendered pages; add `--wait-ms`, `--wait-selector`, `--click-selector`, or `--scroll-to-bottom` only when needed.
+4. Use `--prefer-browser-state` with `--chrome-user-data-dir` / `--chrome-profile` for pages that need local login state.
+5. Use `--fetch-mode stealth` only after static/browser fails or for anti-bot pages; add `--solve-cloudflare`, `--proxy`, or `--dns-over-https` only when required.
+6. For batches, test one URL first, then run the list with the working options plus `--limit`, `--start`, or `--end` as needed.
+## Useful options
+- `--output-dir <dir>`: write notes and assets somewhere other than `clippings/`.
+- `--source-kind rss-feed`: treat input as an RSS/Atom feed and archive feed entries.
+- `--since <YYYY-MM-DD>`: limit RSS/Atom entries by date.
+- `--limit <n>`, `--start <n>`, `--end <n>`: process URL lists in small batches or resume partway through a list.
+- `--fetch-mode <static|browser|stealth>`: force a specific fetch layer when `auto` is too broad or too slow.
+- `--prefer-browser-state`: try a copied local Chrome profile before regular browser fallback.
+- `--wait-ms <ms>`, `--wait-selector <selector>`, `--scroll-to-bottom`: give dynamic pages time or actions to reveal article content.
+- `--click-selector <selector...>`: click dismiss/expand selectors before extracting HTML.
+- `--headful`: show the browser window for debugging login, popups, or dynamic loading.
+- `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example the `$HOME/.agents/skills/feedloom/site-rules/` reference folder.
+- `--solve-cloudflare`, `--proxy <server>`, `--dns-over-https`: use only when stealth fetching needs them.
+Run `npx -y @ariesfish/feedloom --help` for the complete option list. Do not invent unsupported options.
+## Private site rules
+Site-specific TOML rules are optional for the package itself, but you must use them when they are present next to this skill. Always check for `$HOME/.agents/skills/feedloom/site-rules/` before clipping. If that directory exists, pass it explicitly on every Feedloom command using the `$HOME`-prefixed path:
+```bash
+npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir $HOME/.agents/skills/feedloom/site-rules
+```
+Treat rule files in `$HOME/.agents/skills/feedloom/site-rules/` as local reference material and use them whenever available; never skip an existing site-rules directory unless the user explicitly asks not to use it.
+## Output
+- Markdown files are written to `clippings/` by default, or to `--output-dir`.
+- Assets are written under an `assets/` subdirectory.
+- Successful Markdown checklist items are marked done.