@ariesfish/feedloom 0.1.0 → 0.1.2

package/README.md CHANGED
@@ -68,6 +68,22 @@ npm link
  feedloom --help
  ```

+ ## Agent Skill
+
+ Feedloom ships an Agent Skill in `skills/feedloom`, so agents that support the `skills` CLI can install the clipping workflow directly from the package or repository:
+
+ ```bash
+ npx skills add @ariesfish/feedloom --skill feedloom
+ ```
+
+ For a global install across supported agents:
+
+ ```bash
+ npx skills add @ariesfish/feedloom --skill feedloom --global
+ ```
+
+ After installing the skill, ask your agent to save article URLs, URL lists, or RSS feeds as Markdown. The skill runs the CLI through `npx -y @ariesfish/feedloom` by default.
+
  ## Quick Start

  Archive a single article to the default `clippings/` directory:
@@ -242,6 +258,7 @@ Only use this on your own device and accounts. Always respect the target site's
  --prefer-browser-state        Try local Chrome user state first
  --chrome-user-data-dir <path> Chrome User Data directory
  --chrome-profile <name>       Chrome profile name. Default: Default
+ --site-rules-dir <dir>        Optional directory of private TOML site rules
  ```

  For the full option list, run:
@@ -264,7 +281,7 @@ npm test
  - Respect robots.txt, website terms of service, copyright, and rate limits.
  - For dynamic pages, try `--fetch-mode browser` first.
  - For static blogs and news sites, `--fetch-mode static` is usually faster.
- - If article extraction is poor for a specific site, add or adjust a site rule in `src/site-rules/`.
+ - If article extraction is poor for a specific site, keep private TOML site rules outside the package and pass them with `--site-rules-dir <dir>`.
  - For large batches, test with `--limit` before running the full job.

  ## Acknowledgements
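
For illustration of the `--site-rules-dir` guidance added above, a minimal invocation with an external rules directory might look like this; the `~/feedloom-rules` path is a hypothetical example, not a directory the package creates:

```bash
# hypothetical private rules directory; any local path works
feedloom "https://example.com/article" --site-rules-dir ~/feedloom-rules
```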
package/dist/cli.js CHANGED
@@ -2,8 +2,7 @@

  // src/cli.ts
  import { readdir as readdir2 } from "fs/promises";
- import { dirname, join as join7, resolve as resolve2 } from "path";
- import { fileURLToPath } from "url";
+ import { join as join7, resolve as resolve2 } from "path";
  import { Command } from "commander";

  // src/cleaning/profiles.ts
@@ -39,7 +38,11 @@ function profileFromTomlRule(name, rule) {
      },
      metadata: {
        fixedAuthor: rule.metadata?.fixed_author,
-       titleSuffixPatterns: rule.metadata?.strip_title_regexes
+       titleSuffixPatterns: rule.metadata?.strip_title_regexes,
+       authorSelectors: rule.metadata?.author_selectors,
+       authorMetaNames: rule.metadata?.author_meta_names,
+       authorMetaItemprops: rule.metadata?.author_meta_itemprops,
+       authorMetaProperties: rule.metadata?.author_meta_properties
      }
    };
  }
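
For reference, a private rule file exercising these new metadata keys could look roughly like the following. The key names under `metadata` come straight from `profileFromTomlRule` above; the overall file layout and the `[metadata]` table placement are assumptions about the TOML schema, not confirmed by this diff:

```toml
# hypothetical rule file, e.g. ~/feedloom-rules/example-site.toml
[metadata]
fixed_author = "Example Staff"
strip_title_regexes = ["\\s*\\|\\s*Example Site$"]
author_selectors = [".article-header .byline a"]
author_meta_names = ["author", "byl"]
author_meta_itemprops = ["author"]
author_meta_properties = ["article:author"]
```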
@@ -922,6 +925,22 @@ function removeTrailingSiblings(element, removals, reason) {
      sibling = next;
    }
  }
+ function truncationCutPoint(root, element) {
+   let current = element;
+   let best = element;
+   while (current.parentElement && current.parentElement !== root) {
+     if (current.previousElementSibling) {
+       best = current;
+     }
+     current = current.parentElement;
+   }
+   return current.previousElementSibling ? current : best;
+ }
+ function truncateFromElement(root, element, removals, reason) {
+   const cutPoint = truncationCutPoint(root, element);
+   removeTrailingSiblings(cutPoint, removals, reason);
+   removeElement(removals, "site-profile:content-pattern", reason, cutPoint);
+ }
  function compileProfileRegexes(profiles, key) {
    return profiles.flatMap(
      (profile) => (profile.removals?.[key] ?? []).map((pattern) => ({ profile: profile.name, regex: new RegExp(pattern, "i") }))
@@ -953,8 +972,7 @@ function removeByTextPatterns(root, profiles, removals) {
    }
    const cut = text.length <= 240 ? cutContains.find((entry) => text.includes(entry.marker)) ?? cutRegexes.find((entry) => entry.regex.test(text)) : void 0;
    if (cut) {
-     removeTrailingSiblings(element, removals, cut.profile);
-     removeElement(removals, "site-profile:content-pattern", cut.profile, element);
+     truncateFromElement(root, element, removals, cut.profile);
      return;
    }
    const exactProfile = dropExact.get(text);
@@ -1046,7 +1064,28 @@ function jsonLdValue(document2, keys) {
    }
    return void 0;
  }
- function toMetadata(result, document2) {
+ function profileAuthorFromDocument(document2, profiles) {
+   for (const profile of profiles) {
+     const metadata = profile.metadata;
+     if (!metadata) continue;
+     for (const selector of metadata.authorSelectors ?? []) {
+       const author = document2.querySelector(selector)?.textContent?.replace(/\s+/g, " ").trim();
+       if (author) return author;
+     }
+     const metaNames = [
+       ...(metadata.authorMetaNames ?? []).map((value) => ({ attr: "name", value })),
+       ...(metadata.authorMetaItemprops ?? []).map((value) => ({ attr: "itemprop", value })),
+       ...(metadata.authorMetaProperties ?? []).map((value) => ({ attr: "property", value }))
+     ];
+     for (const entry of metaNames) {
+       const escaped = entry.value.replace(/"/g, '\\"');
+       const author = document2.querySelector(`meta[${entry.attr}="${escaped}"]`)?.getAttribute("content")?.trim();
+       if (author) return author;
+     }
+   }
+   return void 0;
+ }
+ function toMetadata(result, document2, profiles) {
    return {
      title: result.title || firstMetaContent(document2, ["og:title", "twitter:title"]) || document2.querySelector("title")?.textContent?.trim() || void 0,
      description: result.description || firstMetaContent(document2, ["description", "og:description", "twitter:description"]),
@@ -1055,7 +1094,7 @@ function toMetadata(result, document2) {
      image: result.image || firstMetaContent(document2, ["og:image", "twitter:image"]),
      language: result.language || document2.documentElement.getAttribute("lang") || void 0,
      published: result.published || firstMetaContent(document2, ["article:published_time", "date", "datePublished", "pubdate", "publishdate"]) || jsonLdValue(document2, ["datePublished", "dateCreated"]),
-     author: result.author || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
+     author: result.author || profileAuthorFromDocument(document2, profiles) || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
      site: result.site || firstMetaContent(document2, ["og:site_name", "application-name"]),
      schemaOrgData: result.schemaOrgData,
      wordCount: result.wordCount,
@@ -1103,7 +1142,7 @@ var HtmlCleaner = class {
        standardize: this.options.standardize
      });
      const result = parser2.parseAsync ? await parser2.parseAsync() : parser2.parse();
-     const metadata = toMetadata(result, document2);
+     const metadata = toMetadata(result, document2, activeProfiles);
      applyMetadataProfiles(metadata, activeProfiles);
      const content = serializeProfiledContent(result.content, postProfiles, removals);
      return {
@@ -1618,27 +1657,9 @@ var ProgressTracker = class {

  // src/cli.ts
  var program = new Command();
- async function standardSiteRulePaths() {
-   const here = dirname(fileURLToPath(import.meta.url));
-   const candidates = [
-     resolve2(here, "../src/site-rules"),
-     resolve2(here, "../../src/site-rules"),
-     resolve2(process.cwd(), "src/site-rules"),
-     resolve2(here, "../../src/feedloom/site_rules"),
-     resolve2(process.cwd(), "../src/feedloom/site_rules"),
-     resolve2(process.cwd(), "src/feedloom/site_rules")
-   ];
-   for (const dir of candidates) {
-     try {
-       const names = await readdir2(dir);
-       return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
-     } catch (error) {
-       if (error.code !== "ENOENT") {
-         throw error;
-       }
-     }
-   }
-   return [];
+ async function siteRulePathsFromDir(dir) {
+   const names = await readdir2(dir);
+   return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
  }
  function positiveIntOption(value, fallback) {
    const parsed = Number(value ?? fallback);
@@ -1647,7 +1668,7 @@ function positiveIntOption(value, fallback) {
    }
    return parsed;
  }
- program.name("feedloom").description("Archive long-form web content as clean Markdown with local assets").version("0.1.0").option("--output-dir <dir>", "Output directory for markdown notes", "clippings").option("--source-kind <kind>", "auto, html-page, or rss-feed", "auto").option("--since <date>", "Only keep feed entries on or after YYYY-MM-DD", "").option("--limit <n>", "Process only first N deduplicated URLs", "0").option("--start <n>", "Start from 1-based index after deduplication", "1").option("--end <n>", "End at 1-based index after deduplication", "0").option("--prefer-browser-state", "Try copied local Chrome profile before regular browser fallback", false).option("--chrome-user-data-dir <path>", "Chrome user data directory used with --prefer-browser-state", "").option("--chrome-profile <name>", "Chrome profile directory name", "Default").option("--fetch-mode <mode>", "auto, static, browser, or stealth", "auto").option("--no-network-idle", "Do not wait for browser networkidle before reading HTML").option("--wait-ms <ms>", "Extra browser wait after load", "2500").option("--solve-cloudflare", "In stealth mode, attempt Cloudflare Turnstile/interstitial challenge handling", false).option("--disable-resources", "In stealth mode, block images/media/fonts/stylesheets for speed", false).option("--proxy <server>", "Proxy server for browser/stealth fetch, e.g. http://127.0.0.1:8080", "").option("--dns-over-https", "Use Chromium Cloudflare DNS-over-HTTPS flag for browser/stealth fetch", false).option("--wait-selector <selector>", "Wait for a CSS selector after page load", "").option("--wait-selector-state <state>", "attached, detached, visible, or hidden", "attached").option("--click-selector <selector...>", "Click one or more selectors after page load", []).option("--scroll-to-bottom", "Scroll to the bottom before reading HTML", false).option("--headful", "Run browser/browser-state fetches with a visible Chrome window", false).option("--no-real-chrome-defaults", "Disable Scrapling-inspired real Chrome context defaults").option("--no-reuse-browser", "Disable batch browser/stealth context reuse").argument("[inputs...]", "URLs or files containing URLs").action(async (inputs, options) => {
+ program.name("feedloom").description("Archive long-form web content as clean Markdown with local assets").version("0.1.0").option("--output-dir <dir>", "Output directory for markdown notes", "clippings").option("--source-kind <kind>", "auto, html-page, or rss-feed", "auto").option("--since <date>", "Only keep feed entries on or after YYYY-MM-DD", "").option("--limit <n>", "Process only first N deduplicated URLs", "0").option("--start <n>", "Start from 1-based index after deduplication", "1").option("--end <n>", "End at 1-based index after deduplication", "0").option("--prefer-browser-state", "Try copied local Chrome profile before regular browser fallback", false).option("--chrome-user-data-dir <path>", "Chrome user data directory used with --prefer-browser-state", "").option("--chrome-profile <name>", "Chrome profile directory name", "Default").option("--fetch-mode <mode>", "auto, static, browser, or stealth", "auto").option("--no-network-idle", "Do not wait for browser networkidle before reading HTML").option("--wait-ms <ms>", "Extra browser wait after load", "2500").option("--solve-cloudflare", "In stealth mode, attempt Cloudflare Turnstile/interstitial challenge handling", false).option("--disable-resources", "In stealth mode, block images/media/fonts/stylesheets for speed", false).option("--proxy <server>", "Proxy server for browser/stealth fetch, e.g. http://127.0.0.1:8080", "").option("--dns-over-https", "Use Chromium Cloudflare DNS-over-HTTPS flag for browser/stealth fetch", false).option("--wait-selector <selector>", "Wait for a CSS selector after page load", "").option("--wait-selector-state <state>", "attached, detached, visible, or hidden", "attached").option("--click-selector <selector...>", "Click one or more selectors after page load", []).option("--scroll-to-bottom", "Scroll to the bottom before reading HTML", false).option("--headful", "Run browser/browser-state fetches with a visible Chrome window", false).option("--site-rules-dir <dir>", "Optional directory of private TOML site extraction/cleaning rules", "").option("--no-real-chrome-defaults", "Disable Scrapling-inspired real Chrome context defaults").option("--no-reuse-browser", "Disable batch browser/stealth context reuse").argument("[inputs...]", "URLs or files containing URLs").action(async (inputs, options) => {
    if (inputs.length === 0) {
      program.help({ error: true });
    }
@@ -1674,7 +1695,8 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
      positiveIntOption(options.end, 0),
      positiveIntOption(options.limit, 0)
    );
-   const profiles = await loadSiteProfiles(await standardSiteRulePaths());
+   const siteRulesDir = String(options.siteRulesDir || "");
+   const profiles = siteRulesDir ? await loadSiteProfiles(await siteRulePathsFromDir(resolve2(siteRulesDir))) : [];
    const outputDir = String(options.outputDir ?? "clippings");
    let failures = 0;
    const tracker = new ProgressTracker(selected, outputDir);
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@ariesfish/feedloom",
-   "version": "0.1.0",
+   "version": "0.1.2",
    "type": "module",
    "author": "ariesfish",
    "license": "MIT",
@@ -18,6 +18,7 @@
    },
    "files": [
      "dist",
+     "skills",
      "README.md",
      "LICENSE"
    ],
package/skills/feedloom/SKILL.md ADDED
@@ -0,0 +1,80 @@
+ ---
+ name: feedloom
+ description: Capture long-form web content, article URLs, URL list files, or RSS/Atom feeds into clean Markdown with local assets using the Feedloom CLI. Use for web clipping, saving articles as Markdown, archiving URL batches, clipping Zhihu/WeChat/Kaggle/blog posts, 抓取网页文章, 保存为 Markdown, URL 列表转归档, RSS 归档, and 网页长文归档.
+ ---
+
+ # Feedloom
+
+ Use Feedloom for article clipping instead of writing ad-hoc scrapers.
+
+ ## Command
+
+ ```bash
+ npx -y @ariesfish/feedloom <inputs...> [options]
+ ```
+
+ ## Inputs
+
+ - Direct article URLs.
+ - Files containing URLs, one per line.
+ - Markdown checklist files with lines like `- [ ] <url>` or `- [x] <url>` (see the sketch after this list).
+ - RSS/Atom feeds with `--source-kind rss-feed`.
+
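For instance, a Markdown checklist input file might contain lines like the following (illustrative URLs only):

```markdown
- [ ] https://example.com/posts/first-article
- [ ] https://example.com/posts/second-article
- [x] https://example.com/posts/already-clipped
```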
23
+ ## Common usage
24
+
25
+ Before running Feedloom, check whether this skill directory has a `site-rules/` directory. If it exists, always pass it with `--site-rules-dir $HOME/.agents/skills/feedloom/site-rules`; do not omit available site rules.
26
+
27
+ ```bash
28
+ npx -y @ariesfish/feedloom "https://example.com/article"
29
+ npx -y @ariesfish/feedloom urls.txt
30
+ npx -y @ariesfish/feedloom urls.txt --limit 10
31
+ npx -y @ariesfish/feedloom urls.txt --start 11 --end 20
32
+ npx -y @ariesfish/feedloom urls.txt --output-dir clippings
33
+ npx -y @ariesfish/feedloom "https://example.com/feed.xml" --source-kind rss-feed
34
+ npx -y @ariesfish/feedloom "https://example.com/feed.xml" --source-kind rss-feed --since 2026-01-01
35
+ npx -y @ariesfish/feedloom "https://example.com/article" --fetch-mode browser --wait-ms 4000 --scroll-to-bottom
36
+ npx -y @ariesfish/feedloom "https://example.com/article" --prefer-browser-state
37
+ ```
38
+
39
+ ## Fetch workflow
40
+
41
+ Use the least expensive mode that works:
42
+
43
+ 1. Start with default `auto`. It tries meaningful content in order: `static` → `browser-state` when `--prefer-browser-state` is set → `browser` → `stealth`.
44
+ 2. Use `--fetch-mode static` only for simple pages when speed matters and JavaScript/login state is unnecessary.
45
+ 3. Use `--fetch-mode browser` for JavaScript-rendered pages; add `--wait-ms`, `--wait-selector`, `--click-selector`, or `--scroll-to-bottom` only when needed.
46
+ 4. Use `--prefer-browser-state` with `--chrome-user-data-dir` / `--chrome-profile` for pages that need local login state.
47
+ 5. Use `--fetch-mode stealth` only after static/browser fails or for anti-bot pages; add `--solve-cloudflare`, `--proxy`, or `--dns-over-https` only when required.
48
+ 6. For batches, test one URL first, then run the list with the working options plus `--limit`, `--start`, or `--end` as needed.
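As an illustration of that escalation on a single stubborn page (hypothetical URL; every flag shown is documented above):

```bash
# 1. cheap static fetch first
npx -y @ariesfish/feedloom "https://example.com/article" --fetch-mode static
# 2. if extraction is poor, render with a browser and wait for the article node
npx -y @ariesfish/feedloom "https://example.com/article" --fetch-mode browser --wait-selector "article"
# 3. last resort for anti-bot pages
npx -y @ariesfish/feedloom "https://example.com/article" --fetch-mode stealth --solve-cloudflare
```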
+
+ ## Useful options
+
+ - `--output-dir <dir>`: write notes and assets somewhere other than `clippings/`.
+ - `--source-kind rss-feed`: treat input as an RSS/Atom feed and archive feed entries.
+ - `--since <YYYY-MM-DD>`: limit RSS/Atom entries by date.
+ - `--limit <n>`, `--start <n>`, `--end <n>`: process URL lists in small batches or resume partway through a list.
+ - `--fetch-mode <static|browser|stealth>`: force a specific fetch layer when `auto` is too broad or too slow.
+ - `--prefer-browser-state`: try a copied local Chrome profile before regular browser fallback.
+ - `--wait-ms <ms>`, `--wait-selector <selector>`, `--scroll-to-bottom`: give dynamic pages time or actions to reveal article content.
+ - `--click-selector <selector...>`: click dismiss/expand selectors before extracting HTML.
+ - `--headful`: show the browser window for debugging login, popups, or dynamic loading.
+ - `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example the `$HOME/.agents/skills/feedloom/site-rules/` reference folder.
+ - `--solve-cloudflare`, `--proxy <server>`, `--dns-over-https`: use only when stealth fetching needs them.
+
+ Run `npx -y @ariesfish/feedloom --help` for the complete option list. Do not invent unsupported options.
+
+ ## Private site rules
+
+ Site-specific TOML rules are optional for the package itself, but they must be used whenever they are present next to this skill. Always check for `$HOME/.agents/skills/feedloom/site-rules/` before clipping. If that directory exists, pass it explicitly on every Feedloom command using the `$HOME`-prefixed path:
+
+ ```bash
+ npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir $HOME/.agents/skills/feedloom/site-rules
+ ```
+
+ Treat rule files in `$HOME/.agents/skills/feedloom/site-rules/` as local reference material and use them whenever available; never skip an existing site-rules directory unless the user explicitly asks not to use it.
+
+ ## Output
+
+ - Markdown files are written to `clippings/` by default, or to `--output-dir`.
+ - Assets are written under an `assets/` subdirectory.
+ - Successful Markdown checklist items are marked done.