@ariesfish/feedloom 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -68,6 +68,22 @@ npm link
68
68
  feedloom --help
69
69
  ```
70
70
 
71
+ ## Agent Skill
72
+
73
+ Feedloom ships an Agent Skill in `skills/feedloom`, so agents that support the `skills` CLI can install the clipping workflow directly from the package or repository:
74
+
75
+ ```bash
76
+ npx skills add @ariesfish/feedloom --skill feedloom
77
+ ```
78
+
79
+ For a global install across supported agents:
80
+
81
+ ```bash
82
+ npx skills add @ariesfish/feedloom --skill feedloom --global
83
+ ```
84
+
85
+ After installing the skill, ask your agent to save article URLs, URL lists, or RSS feeds as Markdown. The skill runs the CLI through `npx -y @ariesfish/feedloom` by default.
86
+
71
87
  ## Quick Start
72
88
 
73
89
  Archive a single article to the default `clippings/` directory:
@@ -242,6 +258,7 @@ Only use this on your own device and accounts. Always respect the target site's
242
258
  --prefer-browser-state Try local Chrome user state first
243
259
  --chrome-user-data-dir <path> Chrome User Data directory
244
260
  --chrome-profile <name> Chrome profile name. Default: Default
261
+ --site-rules-dir <dir> Optional directory of private TOML site rules
245
262
  ```
246
263
 
247
264
  For the full option list, run:
@@ -264,7 +281,7 @@ npm test
264
281
  - Respect robots.txt, website terms of service, copyright, and rate limits.
265
282
  - For dynamic pages, try `--fetch-mode browser` first.
266
283
  - For static blogs and news sites, `--fetch-mode static` is usually faster.
267
- - If article extraction is poor for a specific site, add or adjust a site rule in `src/site-rules/`.
284
+ - If article extraction is poor for a specific site, keep private TOML site rules outside the package and pass them with `--site-rules-dir <dir>`.
268
285
  - For large batches, test with `--limit` before running the full job.
269
286
 
270
287
  ## Acknowledgements
package/dist/cli.js CHANGED
@@ -2,8 +2,7 @@
2
2
 
3
3
  // src/cli.ts
4
4
  import { readdir as readdir2 } from "fs/promises";
5
- import { dirname, join as join7, resolve as resolve2 } from "path";
6
- import { fileURLToPath } from "url";
5
+ import { join as join7, resolve as resolve2 } from "path";
7
6
  import { Command } from "commander";
8
7
 
9
8
  // src/cleaning/profiles.ts
@@ -1618,27 +1617,9 @@ var ProgressTracker = class {
1618
1617
 
1619
1618
  // src/cli.ts
1620
1619
  var program = new Command();
1621
- async function standardSiteRulePaths() {
1622
- const here = dirname(fileURLToPath(import.meta.url));
1623
- const candidates = [
1624
- resolve2(here, "../src/site-rules"),
1625
- resolve2(here, "../../src/site-rules"),
1626
- resolve2(process.cwd(), "src/site-rules"),
1627
- resolve2(here, "../../src/feedloom/site_rules"),
1628
- resolve2(process.cwd(), "../src/feedloom/site_rules"),
1629
- resolve2(process.cwd(), "src/feedloom/site_rules")
1630
- ];
1631
- for (const dir of candidates) {
1632
- try {
1633
- const names = await readdir2(dir);
1634
- return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
1635
- } catch (error) {
1636
- if (error.code !== "ENOENT") {
1637
- throw error;
1638
- }
1639
- }
1640
- }
1641
- return [];
1620
+ async function siteRulePathsFromDir(dir) {
1621
+ const names = await readdir2(dir);
1622
+ return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
1642
1623
  }
1643
1624
  function positiveIntOption(value, fallback) {
1644
1625
  const parsed = Number(value ?? fallback);
@@ -1647,7 +1628,7 @@ function positiveIntOption(value, fallback) {
1647
1628
  }
1648
1629
  return parsed;
1649
1630
  }
1650
- program.name("feedloom").description("Archive long-form web content as clean Markdown with local assets").version("0.1.0").option("--output-dir <dir>", "Output directory for markdown notes", "clippings").option("--source-kind <kind>", "auto, html-page, or rss-feed", "auto").option("--since <date>", "Only keep feed entries on or after YYYY-MM-DD", "").option("--limit <n>", "Process only first N deduplicated URLs", "0").option("--start <n>", "Start from 1-based index after deduplication", "1").option("--end <n>", "End at 1-based index after deduplication", "0").option("--prefer-browser-state", "Try copied local Chrome profile before regular browser fallback", false).option("--chrome-user-data-dir <path>", "Chrome user data directory used with --prefer-browser-state", "").option("--chrome-profile <name>", "Chrome profile directory name", "Default").option("--fetch-mode <mode>", "auto, static, browser, or stealth", "auto").option("--no-network-idle", "Do not wait for browser networkidle before reading HTML").option("--wait-ms <ms>", "Extra browser wait after load", "2500").option("--solve-cloudflare", "In stealth mode, attempt Cloudflare Turnstile/interstitial challenge handling", false).option("--disable-resources", "In stealth mode, block images/media/fonts/stylesheets for speed", false).option("--proxy <server>", "Proxy server for browser/stealth fetch, e.g. 
http://127.0.0.1:8080", "").option("--dns-over-https", "Use Chromium Cloudflare DNS-over-HTTPS flag for browser/stealth fetch", false).option("--wait-selector <selector>", "Wait for a CSS selector after page load", "").option("--wait-selector-state <state>", "attached, detached, visible, or hidden", "attached").option("--click-selector <selector...>", "Click one or more selectors after page load", []).option("--scroll-to-bottom", "Scroll to the bottom before reading HTML", false).option("--headful", "Run browser/browser-state fetches with a visible Chrome window", false).option("--no-real-chrome-defaults", "Disable Scrapling-inspired real Chrome context defaults").option("--no-reuse-browser", "Disable batch browser/stealth context reuse").argument("[inputs...]", "URLs or files containing URLs").action(async (inputs, options) => {
1631
+ program.name("feedloom").description("Archive long-form web content as clean Markdown with local assets").version("0.1.0").option("--output-dir <dir>", "Output directory for markdown notes", "clippings").option("--source-kind <kind>", "auto, html-page, or rss-feed", "auto").option("--since <date>", "Only keep feed entries on or after YYYY-MM-DD", "").option("--limit <n>", "Process only first N deduplicated URLs", "0").option("--start <n>", "Start from 1-based index after deduplication", "1").option("--end <n>", "End at 1-based index after deduplication", "0").option("--prefer-browser-state", "Try copied local Chrome profile before regular browser fallback", false).option("--chrome-user-data-dir <path>", "Chrome user data directory used with --prefer-browser-state", "").option("--chrome-profile <name>", "Chrome profile directory name", "Default").option("--fetch-mode <mode>", "auto, static, browser, or stealth", "auto").option("--no-network-idle", "Do not wait for browser networkidle before reading HTML").option("--wait-ms <ms>", "Extra browser wait after load", "2500").option("--solve-cloudflare", "In stealth mode, attempt Cloudflare Turnstile/interstitial challenge handling", false).option("--disable-resources", "In stealth mode, block images/media/fonts/stylesheets for speed", false).option("--proxy <server>", "Proxy server for browser/stealth fetch, e.g. 
http://127.0.0.1:8080", "").option("--dns-over-https", "Use Chromium Cloudflare DNS-over-HTTPS flag for browser/stealth fetch", false).option("--wait-selector <selector>", "Wait for a CSS selector after page load", "").option("--wait-selector-state <state>", "attached, detached, visible, or hidden", "attached").option("--click-selector <selector...>", "Click one or more selectors after page load", []).option("--scroll-to-bottom", "Scroll to the bottom before reading HTML", false).option("--headful", "Run browser/browser-state fetches with a visible Chrome window", false).option("--site-rules-dir <dir>", "Optional directory of private TOML site extraction/cleaning rules", "").option("--no-real-chrome-defaults", "Disable Scrapling-inspired real Chrome context defaults").option("--no-reuse-browser", "Disable batch browser/stealth context reuse").argument("[inputs...]", "URLs or files containing URLs").action(async (inputs, options) => {
1651
1632
  if (inputs.length === 0) {
1652
1633
  program.help({ error: true });
1653
1634
  }
@@ -1674,7 +1655,8 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
1674
1655
  positiveIntOption(options.end, 0),
1675
1656
  positiveIntOption(options.limit, 0)
1676
1657
  );
1677
- const profiles = await loadSiteProfiles(await standardSiteRulePaths());
1658
+ const siteRulesDir = String(options.siteRulesDir || "");
1659
+ const profiles = siteRulesDir ? await loadSiteProfiles(await siteRulePathsFromDir(resolve2(siteRulesDir))) : [];
1678
1660
  const outputDir = String(options.outputDir ?? "clippings");
1679
1661
  let failures = 0;
1680
1662
  const tracker = new ProgressTracker(selected, outputDir);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ariesfish/feedloom",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "type": "module",
5
5
  "author": "ariesfish",
6
6
  "license": "MIT",
@@ -18,6 +18,7 @@
18
18
  },
19
19
  "files": [
20
20
  "dist",
21
+ "skills",
21
22
  "README.md",
22
23
  "LICENSE"
23
24
  ],
@@ -0,0 +1,78 @@
1
+ ---
2
+ name: feedloom
3
+ description: Capture long-form web content, article URLs, URL list files, or RSS/Atom feeds into clean Markdown with local assets using the Feedloom CLI. Use for web clipping, saving articles as Markdown, archiving URL batches, clipping Zhihu/WeChat/Kaggle/blog posts, 抓取网页文章, 保存为 Markdown, URL 列表转归档, RSS 归档, and 网页长文归档.
4
+ ---
5
+
6
+ # Feedloom
7
+
8
+ Use Feedloom for article clipping instead of writing ad-hoc scrapers.
9
+
10
+ ## Command
11
+
12
+ ```bash
13
+ npx -y @ariesfish/feedloom <inputs...> [options]
14
+ ```
15
+
16
+ ## Inputs
17
+
18
+ - Direct article URLs.
19
+ - Files containing URLs, one per line.
20
+ - Markdown checklist files with lines like `- [ ] <url>` or `- [x] <url>`.
21
+ - RSS/Atom feeds with `--source-kind rss-feed`.
22
+
23
+ ## Common usage
24
+
25
+ ```bash
26
+ npx -y @ariesfish/feedloom "https://example.com/article"
27
+ npx -y @ariesfish/feedloom urls.txt
28
+ npx -y @ariesfish/feedloom urls.txt --limit 10
29
+ npx -y @ariesfish/feedloom urls.txt --start 11 --end 20
30
+ npx -y @ariesfish/feedloom urls.txt --output-dir clippings
31
+ npx -y @ariesfish/feedloom "https://example.com/feed.xml" --source-kind rss-feed
32
+ npx -y @ariesfish/feedloom "https://example.com/feed.xml" --source-kind rss-feed --since 2026-01-01
33
+ npx -y @ariesfish/feedloom "https://example.com/article" --fetch-mode browser --wait-ms 4000 --scroll-to-bottom
34
+ npx -y @ariesfish/feedloom "https://example.com/article" --prefer-browser-state
35
+ ```
36
+
37
+ ## Fetch workflow
38
+
39
+ Use the least expensive mode that works:
40
+
41
+ 1. Start with the default `auto` mode. It tries fetch layers in order until meaningful content is extracted: `static` → `browser-state` when `--prefer-browser-state` is set → `browser` → `stealth`.
42
+ 2. Use `--fetch-mode static` only for simple pages when speed matters and JavaScript/login state is unnecessary.
43
+ 3. Use `--fetch-mode browser` for JavaScript-rendered pages; add `--wait-ms`, `--wait-selector`, `--click-selector`, or `--scroll-to-bottom` only when needed.
44
+ 4. Use `--prefer-browser-state` with `--chrome-user-data-dir` / `--chrome-profile` for pages that need local login state.
45
+ 5. Use `--fetch-mode stealth` only after static/browser fails or for anti-bot pages; add `--solve-cloudflare`, `--proxy`, or `--dns-over-https` only when required.
46
+ 6. For batches, test one URL first, then run the list with the working options plus `--limit`, `--start`, or `--end` as needed.
47
+
48
+ ## Useful options
49
+
50
+ - `--output-dir <dir>`: write notes and assets somewhere other than `clippings/`.
51
+ - `--source-kind rss-feed`: treat input as an RSS/Atom feed and archive feed entries.
52
+ - `--since <YYYY-MM-DD>`: limit RSS/Atom entries by date.
53
+ - `--limit <n>`, `--start <n>`, `--end <n>`: process URL lists in small batches or resume partway through a list.
54
+ - `--fetch-mode <static|browser|stealth>`: force a specific fetch layer when `auto` is too broad or too slow.
55
+ - `--prefer-browser-state`: try a copied local Chrome profile before regular browser fallback.
56
+ - `--wait-ms <ms>`, `--wait-selector <selector>`, `--scroll-to-bottom`: give dynamic pages time or actions to reveal article content.
57
+ - `--click-selector <selector...>`: click dismiss/expand selectors before extracting HTML.
58
+ - `--headful`: show the browser window for debugging login, popups, or dynamic loading.
59
+ - `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example the `skills/feedloom/site-rules/` reference folder.
60
+ - `--solve-cloudflare`, `--proxy <server>`, `--dns-over-https`: use only when stealth fetching needs them.
61
+
62
+ Run `npx -y @ariesfish/feedloom --help` for the complete option list. Do not invent unsupported options.
63
+
64
+ ## Private site rules
65
+
66
+ Site-specific TOML rules are intentionally optional and should not be assumed to be bundled with the package. If the user keeps private rules next to this skill, pass that directory explicitly:
67
+
68
+ ```bash
69
+ npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir skills/feedloom/site-rules
70
+ ```
71
+
72
+ Treat rule files in `skills/feedloom/site-rules/` as local reference material: use them only when present and relevant.
73
+
74
+ ## Output
75
+
76
+ - Markdown files are written to `clippings/` by default, or to `--output-dir`.
77
+ - Assets are written under an `assets/` subdirectory.
78
+ - Successful Markdown checklist items are marked done.