@ariesfish/feedloom 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -1
- package/dist/cli.js +53 -31
- package/package.json +2 -1
- package/skills/feedloom/SKILL.md +80 -0
package/README.md
CHANGED
|
@@ -68,6 +68,22 @@ npm link
|
|
|
68
68
|
feedloom --help
|
|
69
69
|
```
|
|
70
70
|
|
|
71
|
+
## Agent Skill
|
|
72
|
+
|
|
73
|
+
Feedloom ships an Agent Skill in `skills/feedloom`, so agents that support the `skills` CLI can install the clipping workflow directly from the package or repository:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
npx skills add @ariesfish/feedloom --skill feedloom
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
For a global install across supported agents:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
npx skills add @ariesfish/feedloom --skill feedloom --global
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
After installing the skill, ask your agent to save article URLs, URL lists, or RSS feeds as Markdown. The skill runs the CLI through `npx -y @ariesfish/feedloom` by default.
|
|
86
|
+
|
|
71
87
|
## Quick Start
|
|
72
88
|
|
|
73
89
|
Archive a single article to the default `clippings/` directory:
|
|
@@ -242,6 +258,7 @@ Only use this on your own device and accounts. Always respect the target site's
|
|
|
242
258
|
--prefer-browser-state Try local Chrome user state first
|
|
243
259
|
--chrome-user-data-dir <path> Chrome User Data directory
|
|
244
260
|
--chrome-profile <name> Chrome profile name. Default: Default
|
|
261
|
+
--site-rules-dir <dir> Optional directory of private TOML site rules
|
|
245
262
|
```
|
|
246
263
|
|
|
247
264
|
For the full option list, run:
|
|
@@ -264,7 +281,7 @@ npm test
|
|
|
264
281
|
- Respect robots.txt, website terms of service, copyright, and rate limits.
|
|
265
282
|
- For dynamic pages, try `--fetch-mode browser` first.
|
|
266
283
|
- For static blogs and news sites, `--fetch-mode static` is usually faster.
|
|
267
|
-
- If article extraction is poor for a specific site,
|
|
284
|
+
- If article extraction is poor for a specific site, keep private TOML site rules outside the package and pass them with `--site-rules-dir <dir>`.
|
|
268
285
|
- For large batches, test with `--limit` before running the full job.
|
|
269
286
|
|
|
270
287
|
## Acknowledgements
|
package/dist/cli.js
CHANGED
|
@@ -2,8 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
// src/cli.ts
|
|
4
4
|
import { readdir as readdir2 } from "fs/promises";
|
|
5
|
-
import {
|
|
6
|
-
import { fileURLToPath } from "url";
|
|
5
|
+
import { join as join7, resolve as resolve2 } from "path";
|
|
7
6
|
import { Command } from "commander";
|
|
8
7
|
|
|
9
8
|
// src/cleaning/profiles.ts
|
|
@@ -39,7 +38,11 @@ function profileFromTomlRule(name, rule) {
|
|
|
39
38
|
},
|
|
40
39
|
metadata: {
|
|
41
40
|
fixedAuthor: rule.metadata?.fixed_author,
|
|
42
|
-
titleSuffixPatterns: rule.metadata?.strip_title_regexes
|
|
41
|
+
titleSuffixPatterns: rule.metadata?.strip_title_regexes,
|
|
42
|
+
authorSelectors: rule.metadata?.author_selectors,
|
|
43
|
+
authorMetaNames: rule.metadata?.author_meta_names,
|
|
44
|
+
authorMetaItemprops: rule.metadata?.author_meta_itemprops,
|
|
45
|
+
authorMetaProperties: rule.metadata?.author_meta_properties
|
|
43
46
|
}
|
|
44
47
|
};
|
|
45
48
|
}
|
|
@@ -922,6 +925,22 @@ function removeTrailingSiblings(element, removals, reason) {
|
|
|
922
925
|
sibling = next;
|
|
923
926
|
}
|
|
924
927
|
}
|
|
928
|
+
/**
 * Pick the node at which a "cut everything from here on" truncation should start.
 *
 * Walks from `element` up through its ancestors, stopping at the ancestor whose
 * parent is `root` (or that has no parent at all). The result is the highest
 * node on that path that has a previous element sibling; when no node on the
 * path has one, `element` itself is returned.
 *
 * @param {object} root - Container whose direct children bound the upward walk.
 * @param {object} element - Node where the cut marker was found.
 * @returns {object} The node from which truncation should begin.
 */
function truncationCutPoint(root, element) {
  let node = element;
  let cutPoint = element;
  for (;;) {
    // Later (higher) matches overwrite earlier ones, so the topmost candidate wins.
    if (node.previousElementSibling) {
      cutPoint = node;
    }
    const parent = node.parentElement;
    if (!parent || parent === root) {
      return cutPoint;
    }
    node = parent;
  }
}
|
|
939
|
+
/**
 * Truncate content starting at the cut point derived from `element`.
 *
 * Resolves the cut point with `truncationCutPoint`, hands it to
 * `removeTrailingSiblings`, and then records the cut point itself through
 * `removeElement` under the "site-profile:content-pattern" category.
 *
 * @param {object} root - Container passed through to `truncationCutPoint`.
 * @param {object} element - Node whose text matched a cut pattern.
 * @param {object} removals - Removal log forwarded to both helpers.
 * @param {string} reason - Reason forwarded to both helpers (callers pass the profile name).
 */
function truncateFromElement(root, element, removals, reason) {
  const target = truncationCutPoint(root, element);
  // Siblings are handled before the target itself.
  // NOTE(review): presumably because removing the target first would lose its sibling links — confirm.
  removeTrailingSiblings(target, removals, reason);
  removeElement(removals, "site-profile:content-pattern", reason, target);
}
|
|
925
944
|
function compileProfileRegexes(profiles, key) {
|
|
926
945
|
return profiles.flatMap(
|
|
927
946
|
(profile) => (profile.removals?.[key] ?? []).map((pattern) => ({ profile: profile.name, regex: new RegExp(pattern, "i") }))
|
|
@@ -953,8 +972,7 @@ function removeByTextPatterns(root, profiles, removals) {
|
|
|
953
972
|
}
|
|
954
973
|
const cut = text.length <= 240 ? cutContains.find((entry) => text.includes(entry.marker)) ?? cutRegexes.find((entry) => entry.regex.test(text)) : void 0;
|
|
955
974
|
if (cut) {
|
|
956
|
-
|
|
957
|
-
removeElement(removals, "site-profile:content-pattern", cut.profile, element);
|
|
975
|
+
truncateFromElement(root, element, removals, cut.profile);
|
|
958
976
|
return;
|
|
959
977
|
}
|
|
960
978
|
const exactProfile = dropExact.get(text);
|
|
@@ -1046,7 +1064,28 @@ function jsonLdValue(document2, keys) {
|
|
|
1046
1064
|
}
|
|
1047
1065
|
return void 0;
|
|
1048
1066
|
}
|
|
1049
|
-
function
|
|
1067
|
+
/**
 * Resolve an author name using profile-supplied lookup rules.
 *
 * Profiles are consulted in order. Within a profile, `authorSelectors` are tried
 * first (whitespace-collapsed text content of the first match), then `<meta>`
 * tags matched by `name`, `itemprop` and `property` — in that order — using the
 * values from `authorMetaNames`, `authorMetaItemprops` and `authorMetaProperties`.
 * The first non-empty result wins.
 *
 * @param {object} document2 - Document-like object exposing `querySelector`.
 * @param {Array<object>} profiles - Profiles; those without `metadata` are skipped.
 * @returns {string|undefined} The author, or `undefined` when no rule matches.
 */
function profileAuthorFromDocument(document2, profiles) {
  for (const profile of profiles) {
    const metadata = profile.metadata;
    if (!metadata) continue;
    for (const selector of metadata.authorSelectors ?? []) {
      const author = document2.querySelector(selector)?.textContent?.replace(/\s+/g, " ").trim();
      if (author) return author;
    }
    const metaLookups = [
      ...(metadata.authorMetaNames ?? []).map((value) => ({ attr: "name", value })),
      ...(metadata.authorMetaItemprops ?? []).map((value) => ({ attr: "itemprop", value })),
      ...(metadata.authorMetaProperties ?? []).map((value) => ({ attr: "property", value }))
    ];
    for (const { attr, value } of metaLookups) {
      // The value is embedded in a double-quoted CSS string, so both the quote
      // and the backslash (the CSS escape character) must be escaped. String()
      // keeps a non-string rule value from throwing on `.replace`.
      const escaped = String(value).replace(/["\\]/g, "\\$&");
      const author = document2.querySelector(`meta[${attr}="${escaped}"]`)?.getAttribute("content")?.trim();
      if (author) return author;
    }
  }
  return void 0;
}
|
|
1088
|
+
function toMetadata(result, document2, profiles) {
|
|
1050
1089
|
return {
|
|
1051
1090
|
title: result.title || firstMetaContent(document2, ["og:title", "twitter:title"]) || document2.querySelector("title")?.textContent?.trim() || void 0,
|
|
1052
1091
|
description: result.description || firstMetaContent(document2, ["description", "og:description", "twitter:description"]),
|
|
@@ -1055,7 +1094,7 @@ function toMetadata(result, document2) {
|
|
|
1055
1094
|
image: result.image || firstMetaContent(document2, ["og:image", "twitter:image"]),
|
|
1056
1095
|
language: result.language || document2.documentElement.getAttribute("lang") || void 0,
|
|
1057
1096
|
published: result.published || firstMetaContent(document2, ["article:published_time", "date", "datePublished", "pubdate", "publishdate"]) || jsonLdValue(document2, ["datePublished", "dateCreated"]),
|
|
1058
|
-
author: result.author || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
|
|
1097
|
+
author: result.author || profileAuthorFromDocument(document2, profiles) || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
|
|
1059
1098
|
site: result.site || firstMetaContent(document2, ["og:site_name", "application-name"]),
|
|
1060
1099
|
schemaOrgData: result.schemaOrgData,
|
|
1061
1100
|
wordCount: result.wordCount,
|
|
@@ -1103,7 +1142,7 @@ var HtmlCleaner = class {
|
|
|
1103
1142
|
standardize: this.options.standardize
|
|
1104
1143
|
});
|
|
1105
1144
|
const result = parser2.parseAsync ? await parser2.parseAsync() : parser2.parse();
|
|
1106
|
-
const metadata = toMetadata(result, document2);
|
|
1145
|
+
const metadata = toMetadata(result, document2, activeProfiles);
|
|
1107
1146
|
applyMetadataProfiles(metadata, activeProfiles);
|
|
1108
1147
|
const content = serializeProfiledContent(result.content, postProfiles, removals);
|
|
1109
1148
|
return {
|
|
@@ -1618,27 +1657,9 @@ var ProgressTracker = class {
|
|
|
1618
1657
|
|
|
1619
1658
|
// src/cli.ts
|
|
1620
1659
|
var program = new Command();
|
|
1621
|
-
async function
|
|
1622
|
-
const
|
|
1623
|
-
|
|
1624
|
-
resolve2(here, "../src/site-rules"),
|
|
1625
|
-
resolve2(here, "../../src/site-rules"),
|
|
1626
|
-
resolve2(process.cwd(), "src/site-rules"),
|
|
1627
|
-
resolve2(here, "../../src/feedloom/site_rules"),
|
|
1628
|
-
resolve2(process.cwd(), "../src/feedloom/site_rules"),
|
|
1629
|
-
resolve2(process.cwd(), "src/feedloom/site_rules")
|
|
1630
|
-
];
|
|
1631
|
-
for (const dir of candidates) {
|
|
1632
|
-
try {
|
|
1633
|
-
const names = await readdir2(dir);
|
|
1634
|
-
return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
|
|
1635
|
-
} catch (error) {
|
|
1636
|
-
if (error.code !== "ENOENT") {
|
|
1637
|
-
throw error;
|
|
1638
|
-
}
|
|
1639
|
-
}
|
|
1640
|
-
}
|
|
1641
|
-
return [];
|
|
1660
|
+
/**
 * List the TOML site-rule files directly inside a directory.
 *
 * Entries whose name ends with ".toml" are kept and returned joined onto `dir`.
 * Names are sorted first because the order of a directory listing depends on
 * the filesystem; sorting makes the rule load order the same on every platform.
 *
 * @param {string} dir - Directory to scan (not recursive).
 * @returns {Promise<string[]>} Rule file paths in ascending name order.
 * @throws Rejects with the underlying `readdir` error, e.g. when `dir` does not exist.
 */
async function siteRulePathsFromDir(dir) {
  const names = await readdir2(dir);
  return names
    .filter((name) => name.endsWith(".toml"))
    .sort()
    .map((name) => join7(dir, name));
}
|
|
1643
1664
|
function positiveIntOption(value, fallback) {
|
|
1644
1665
|
const parsed = Number(value ?? fallback);
|
|
@@ -1647,7 +1668,7 @@ function positiveIntOption(value, fallback) {
|
|
|
1647
1668
|
}
|
|
1648
1669
|
return parsed;
|
|
1649
1670
|
}
|
|
1650
|
-
program.name("feedloom").description("Archive long-form web content as clean Markdown with local assets").version("0.1.0").option("--output-dir <dir>", "Output directory for markdown notes", "clippings").option("--source-kind <kind>", "auto, html-page, or rss-feed", "auto").option("--since <date>", "Only keep feed entries on or after YYYY-MM-DD", "").option("--limit <n>", "Process only first N deduplicated URLs", "0").option("--start <n>", "Start from 1-based index after deduplication", "1").option("--end <n>", "End at 1-based index after deduplication", "0").option("--prefer-browser-state", "Try copied local Chrome profile before regular browser fallback", false).option("--chrome-user-data-dir <path>", "Chrome user data directory used with --prefer-browser-state", "").option("--chrome-profile <name>", "Chrome profile directory name", "Default").option("--fetch-mode <mode>", "auto, static, browser, or stealth", "auto").option("--no-network-idle", "Do not wait for browser networkidle before reading HTML").option("--wait-ms <ms>", "Extra browser wait after load", "2500").option("--solve-cloudflare", "In stealth mode, attempt Cloudflare Turnstile/interstitial challenge handling", false).option("--disable-resources", "In stealth mode, block images/media/fonts/stylesheets for speed", false).option("--proxy <server>", "Proxy server for browser/stealth fetch, e.g. 
http://127.0.0.1:8080", "").option("--dns-over-https", "Use Chromium Cloudflare DNS-over-HTTPS flag for browser/stealth fetch", false).option("--wait-selector <selector>", "Wait for a CSS selector after page load", "").option("--wait-selector-state <state>", "attached, detached, visible, or hidden", "attached").option("--click-selector <selector...>", "Click one or more selectors after page load", []).option("--scroll-to-bottom", "Scroll to the bottom before reading HTML", false).option("--headful", "Run browser/browser-state fetches with a visible Chrome window", false).option("--no-real-chrome-defaults", "Disable Scrapling-inspired real Chrome context defaults").option("--no-reuse-browser", "Disable batch browser/stealth context reuse").argument("[inputs...]", "URLs or files containing URLs").action(async (inputs, options) => {
|
|
1671
|
+
program.name("feedloom").description("Archive long-form web content as clean Markdown with local assets").version("0.1.0").option("--output-dir <dir>", "Output directory for markdown notes", "clippings").option("--source-kind <kind>", "auto, html-page, or rss-feed", "auto").option("--since <date>", "Only keep feed entries on or after YYYY-MM-DD", "").option("--limit <n>", "Process only first N deduplicated URLs", "0").option("--start <n>", "Start from 1-based index after deduplication", "1").option("--end <n>", "End at 1-based index after deduplication", "0").option("--prefer-browser-state", "Try copied local Chrome profile before regular browser fallback", false).option("--chrome-user-data-dir <path>", "Chrome user data directory used with --prefer-browser-state", "").option("--chrome-profile <name>", "Chrome profile directory name", "Default").option("--fetch-mode <mode>", "auto, static, browser, or stealth", "auto").option("--no-network-idle", "Do not wait for browser networkidle before reading HTML").option("--wait-ms <ms>", "Extra browser wait after load", "2500").option("--solve-cloudflare", "In stealth mode, attempt Cloudflare Turnstile/interstitial challenge handling", false).option("--disable-resources", "In stealth mode, block images/media/fonts/stylesheets for speed", false).option("--proxy <server>", "Proxy server for browser/stealth fetch, e.g. 
http://127.0.0.1:8080", "").option("--dns-over-https", "Use Chromium Cloudflare DNS-over-HTTPS flag for browser/stealth fetch", false).option("--wait-selector <selector>", "Wait for a CSS selector after page load", "").option("--wait-selector-state <state>", "attached, detached, visible, or hidden", "attached").option("--click-selector <selector...>", "Click one or more selectors after page load", []).option("--scroll-to-bottom", "Scroll to the bottom before reading HTML", false).option("--headful", "Run browser/browser-state fetches with a visible Chrome window", false).option("--site-rules-dir <dir>", "Optional directory of private TOML site extraction/cleaning rules", "").option("--no-real-chrome-defaults", "Disable Scrapling-inspired real Chrome context defaults").option("--no-reuse-browser", "Disable batch browser/stealth context reuse").argument("[inputs...]", "URLs or files containing URLs").action(async (inputs, options) => {
|
|
1651
1672
|
if (inputs.length === 0) {
|
|
1652
1673
|
program.help({ error: true });
|
|
1653
1674
|
}
|
|
@@ -1674,7 +1695,8 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
|
|
|
1674
1695
|
positiveIntOption(options.end, 0),
|
|
1675
1696
|
positiveIntOption(options.limit, 0)
|
|
1676
1697
|
);
|
|
1677
|
-
const
|
|
1698
|
+
const siteRulesDir = String(options.siteRulesDir || "");
|
|
1699
|
+
const profiles = siteRulesDir ? await loadSiteProfiles(await siteRulePathsFromDir(resolve2(siteRulesDir))) : [];
|
|
1678
1700
|
const outputDir = String(options.outputDir ?? "clippings");
|
|
1679
1701
|
let failures = 0;
|
|
1680
1702
|
const tracker = new ProgressTracker(selected, outputDir);
|
package/package.json
CHANGED
(diff body for package.json was not captured)
package/skills/feedloom/SKILL.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: feedloom
|
|
3
|
+
description: Capture long-form web content, article URLs, URL list files, or RSS/Atom feeds into clean Markdown with local assets using the Feedloom CLI. Use for web clipping, saving articles as Markdown, archiving URL batches, clipping Zhihu/WeChat/Kaggle/blog posts, 抓取网页文章, 保存为 Markdown, URL 列表转归档, RSS 归档, and 网页长文归档.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Feedloom
|
|
7
|
+
|
|
8
|
+
Use Feedloom for article clipping instead of writing ad-hoc scrapers.
|
|
9
|
+
|
|
10
|
+
## Command
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
npx -y @ariesfish/feedloom <inputs...> [options]
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Inputs
|
|
17
|
+
|
|
18
|
+
- Direct article URLs.
|
|
19
|
+
- Files containing URLs, one per line.
|
|
20
|
+
- Markdown checklist files with lines like `- [ ] <url>` or `- [x] <url>`.
|
|
21
|
+
- RSS/Atom feeds with `--source-kind rss-feed`.
|
|
22
|
+
|
|
23
|
+
## Common usage
|
|
24
|
+
|
|
25
|
+
Before running Feedloom, check whether this skill directory has a `site-rules/` directory. If it exists, always pass it with `--site-rules-dir "$HOME/.agents/skills/feedloom/site-rules"` (quoted, so a home path containing spaces stays a single argument); do not omit available site rules.
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
npx -y @ariesfish/feedloom "https://example.com/article"
|
|
29
|
+
npx -y @ariesfish/feedloom urls.txt
|
|
30
|
+
npx -y @ariesfish/feedloom urls.txt --limit 10
|
|
31
|
+
npx -y @ariesfish/feedloom urls.txt --start 11 --end 20
|
|
32
|
+
npx -y @ariesfish/feedloom urls.txt --output-dir clippings
|
|
33
|
+
npx -y @ariesfish/feedloom "https://example.com/feed.xml" --source-kind rss-feed
|
|
34
|
+
npx -y @ariesfish/feedloom "https://example.com/feed.xml" --source-kind rss-feed --since 2026-01-01
|
|
35
|
+
npx -y @ariesfish/feedloom "https://example.com/article" --fetch-mode browser --wait-ms 4000 --scroll-to-bottom
|
|
36
|
+
npx -y @ariesfish/feedloom "https://example.com/article" --prefer-browser-state
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Fetch workflow
|
|
40
|
+
|
|
41
|
+
Use the least expensive mode that works:
|
|
42
|
+
|
|
43
|
+
1. Start with the default `auto`. It tries each fetch layer in order until one returns meaningful content: `static` → `browser-state` (only when `--prefer-browser-state` is set) → `browser` → `stealth`.
|
|
44
|
+
2. Use `--fetch-mode static` only for simple pages when speed matters and JavaScript/login state is unnecessary.
|
|
45
|
+
3. Use `--fetch-mode browser` for JavaScript-rendered pages; add `--wait-ms`, `--wait-selector`, `--click-selector`, or `--scroll-to-bottom` only when needed.
|
|
46
|
+
4. Use `--prefer-browser-state` with `--chrome-user-data-dir` / `--chrome-profile` for pages that need local login state.
|
|
47
|
+
5. Use `--fetch-mode stealth` only after static/browser fails or for anti-bot pages; add `--solve-cloudflare`, `--proxy`, or `--dns-over-https` only when required.
|
|
48
|
+
6. For batches, test one URL first, then run the list with the working options plus `--limit`, `--start`, or `--end` as needed.
|
|
49
|
+
|
|
50
|
+
## Useful options
|
|
51
|
+
|
|
52
|
+
- `--output-dir <dir>`: write notes and assets somewhere other than `clippings/`.
|
|
53
|
+
- `--source-kind rss-feed`: treat input as an RSS/Atom feed and archive feed entries.
|
|
54
|
+
- `--since <YYYY-MM-DD>`: limit RSS/Atom entries by date.
|
|
55
|
+
- `--limit <n>`, `--start <n>`, `--end <n>`: process URL lists in small batches or resume partway through a list.
|
|
56
|
+
- `--fetch-mode <static|browser|stealth>`: force a specific fetch layer when `auto` is too broad or too slow.
|
|
57
|
+
- `--prefer-browser-state`: try a copied local Chrome profile before regular browser fallback.
|
|
58
|
+
- `--wait-ms <ms>`, `--wait-selector <selector>`, `--scroll-to-bottom`: give dynamic pages time or actions to reveal article content.
|
|
59
|
+
- `--click-selector <selector...>`: click dismiss/expand selectors before extracting HTML.
|
|
60
|
+
- `--headful`: show the browser window for debugging login, popups, or dynamic loading.
|
|
61
|
+
- `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example the `$HOME/.agents/skills/feedloom/site-rules/` folder.
|
|
62
|
+
- `--solve-cloudflare`, `--proxy <server>`, `--dns-over-https`: use only when stealth fetching needs them.
|
|
63
|
+
|
|
64
|
+
Run `npx -y @ariesfish/feedloom --help` for the complete option list. Do not invent unsupported options.
|
|
65
|
+
|
|
66
|
+
## Private site rules
|
|
67
|
+
|
|
68
|
+
Site-specific TOML rules are optional in the package, but mandatory to use when present next to this skill. Always check for `$HOME/.agents/skills/feedloom/site-rules/` before clipping. If that directory exists, pass it explicitly on every Feedloom command using the `$HOME`-prefixed path:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir "$HOME/.agents/skills/feedloom/site-rules"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Treat rule files in `$HOME/.agents/skills/feedloom/site-rules/` as local reference material and use them whenever available; never skip an existing site-rules directory unless the user explicitly asks not to use it.
|
|
75
|
+
|
|
76
|
+
## Output
|
|
77
|
+
|
|
78
|
+
- Markdown files are written to `clippings/` by default, or to `--output-dir`.
|
|
79
|
+
- Assets are written under an `assets/` subdirectory.
|
|
80
|
+
- Successful Markdown checklist items are marked done.
|