@ariesfish/feedloom 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/cli.js +398 -56
- package/dist/site-rules/wechat.toml +5 -0
- package/dist/site-rules/xiaohongshu.toml +55 -0
- package/dist/site-rules/youtube.toml +10 -0
- package/dist/site-rules/zhihu.toml +22 -0
- package/package.json +2 -2
- package/skills/feedloom/SKILL.md +11 -5
- package/skills/feedloom/references/site-rules.md +104 -0
package/README.md
CHANGED
|
@@ -281,7 +281,8 @@ npm test
|
|
|
281
281
|
- Respect robots.txt, website terms of service, copyright, and rate limits.
|
|
282
282
|
- For dynamic pages, try `--fetch-mode browser` first.
|
|
283
283
|
- For static blogs and news sites, `--fetch-mode static` is usually faster.
|
|
284
|
-
-
|
|
284
|
+
- Feedloom ships bundled TOML site rules for common dynamic/structured sites such as WeChat official account articles and Zhihu. Site rules can define extraction, cleanup, and fetch preferences. For example, the bundled Zhihu rule uses browser fetch with copied Chrome state when `--chrome-user-data-dir`/`--chrome-profile` are configured.
|
|
285
|
+
- If article extraction is poor for a specific site, keep private TOML site rules outside the package and pass them with `--site-rules-dir <dir>`. Private rules are loaded after bundled rules.
|
|
285
286
|
- For large batches, test with `--limit` before running the full job.
|
|
286
287
|
|
|
287
288
|
## Acknowledgements
|
package/dist/cli.js
CHANGED
|
@@ -2,7 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
// src/cli.ts
|
|
4
4
|
import { readdir as readdir2 } from "fs/promises";
|
|
5
|
-
import { join as join7, resolve as resolve2 } from "path";
|
|
5
|
+
import { dirname, join as join7, resolve as resolve2 } from "path";
|
|
6
|
+
import { fileURLToPath } from "url";
|
|
6
7
|
import { Command } from "commander";
|
|
7
8
|
|
|
8
9
|
// src/cleaning/profiles.ts
|
|
@@ -38,7 +39,30 @@ function profileFromTomlRule(name, rule) {
|
|
|
38
39
|
},
|
|
39
40
|
metadata: {
|
|
40
41
|
fixedAuthor: rule.metadata?.fixed_author,
|
|
41
|
-
titleSuffixPatterns: rule.metadata?.strip_title_regexes
|
|
42
|
+
titleSuffixPatterns: rule.metadata?.strip_title_regexes,
|
|
43
|
+
authorSuffixPatterns: rule.metadata?.strip_author_regexes,
|
|
44
|
+
authorSelectors: rule.metadata?.author_selectors,
|
|
45
|
+
authorMetaNames: rule.metadata?.author_meta_names,
|
|
46
|
+
authorMetaItemprops: rule.metadata?.author_meta_itemprops,
|
|
47
|
+
authorMetaProperties: rule.metadata?.author_meta_properties
|
|
48
|
+
},
|
|
49
|
+
fetch: {
|
|
50
|
+
mode: rule.fetch?.mode,
|
|
51
|
+
preferBrowserState: rule.fetch?.prefer_browser_state,
|
|
52
|
+
waitMs: rule.fetch?.wait_ms,
|
|
53
|
+
networkIdle: rule.fetch?.network_idle,
|
|
54
|
+
waitSelector: rule.fetch?.wait_selector,
|
|
55
|
+
waitSelectorState: rule.fetch?.wait_selector_state,
|
|
56
|
+
clickSelectors: rule.fetch?.click_selectors,
|
|
57
|
+
scrollToBottom: rule.fetch?.scroll_to_bottom,
|
|
58
|
+
useProxyEnv: rule.fetch?.use_proxy_env
|
|
59
|
+
},
|
|
60
|
+
media: {
|
|
61
|
+
includeMetaImages: rule.media?.include_meta_images,
|
|
62
|
+
imageMetaProperties: rule.media?.image_meta_properties
|
|
63
|
+
},
|
|
64
|
+
extraction: {
|
|
65
|
+
requireText: rule.extract?.require_text
|
|
42
66
|
}
|
|
43
67
|
};
|
|
44
68
|
}
|
|
@@ -118,13 +142,13 @@ async function runPageActions(page, options) {
|
|
|
118
142
|
await page.locator(selector).first().click({ timeout: 5e3 }).catch(() => void 0);
|
|
119
143
|
}
|
|
120
144
|
if (options.scrollToBottom) {
|
|
121
|
-
await page.evaluate(async () => {
|
|
122
|
-
const delay = (ms) => new Promise((
|
|
145
|
+
await page.evaluate(`(async () => {
|
|
146
|
+
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
|
|
123
147
|
for (let i = 0; i < 8; i += 1) {
|
|
124
148
|
window.scrollTo(0, document.body.scrollHeight);
|
|
125
149
|
await delay(250);
|
|
126
150
|
}
|
|
127
|
-
});
|
|
151
|
+
})()`);
|
|
128
152
|
}
|
|
129
153
|
if (options.waitSelector) {
|
|
130
154
|
await page.locator(options.waitSelector).first().waitFor({
|
|
@@ -405,13 +429,13 @@ async function fetchWithStealthContext(context, url, options) {
|
|
|
405
429
|
await page.locator(selector).first().click({ timeout: 5e3 }).catch(() => void 0);
|
|
406
430
|
}
|
|
407
431
|
if (options.scrollToBottom) {
|
|
408
|
-
await page.evaluate(async () => {
|
|
409
|
-
const delay = (ms) => new Promise((
|
|
432
|
+
await page.evaluate(`(async () => {
|
|
433
|
+
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
|
|
410
434
|
for (let i = 0; i < 8; i += 1) {
|
|
411
435
|
window.scrollTo(0, document.body.scrollHeight);
|
|
412
436
|
await delay(250);
|
|
413
437
|
}
|
|
414
|
-
});
|
|
438
|
+
})()`);
|
|
415
439
|
}
|
|
416
440
|
if (options.waitSelector) {
|
|
417
441
|
await page.locator(options.waitSelector).first().waitFor({ state: options.waitSelectorState ?? "attached", timeout: timeoutMs }).catch(() => void 0);
|
|
@@ -820,8 +844,8 @@ function imageSource(img) {
|
|
|
820
844
|
return first || null;
|
|
821
845
|
}
|
|
822
846
|
async function localizeImages(html, options) {
|
|
823
|
-
const { document
|
|
824
|
-
const images = Array.from(
|
|
847
|
+
const { document } = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
|
|
848
|
+
const images = Array.from(document.querySelectorAll("img"));
|
|
825
849
|
if (images.length === 0) return html;
|
|
826
850
|
const fetchImage = options.fetchImage ?? fetch;
|
|
827
851
|
const seen = /* @__PURE__ */ new Map();
|
|
@@ -860,7 +884,7 @@ async function localizeImages(html, options) {
|
|
|
860
884
|
img.removeAttribute("data-original");
|
|
861
885
|
img.removeAttribute("data-src");
|
|
862
886
|
}
|
|
863
|
-
return
|
|
887
|
+
return document.body.innerHTML;
|
|
864
888
|
}
|
|
865
889
|
|
|
866
890
|
// src/cleaning/clean-html.ts
|
|
@@ -921,6 +945,22 @@ function removeTrailingSiblings(element, removals, reason) {
|
|
|
921
945
|
sibling = next;
|
|
922
946
|
}
|
|
923
947
|
}
|
|
948
|
+
/**
 * Picks the node at which content truncation should start.
 *
 * Walks from `element` up through its ancestors, stopping at the direct child
 * of `root` (or at the topmost ancestor when `root` is never reached). The
 * result is the highest node on that path that has a previous element
 * sibling; when no node on the path has one, `element` itself is returned.
 *
 * @param {Element} root - Container that bounds the upward walk.
 * @param {Element} element - Element that matched a cut pattern.
 * @returns {Element} The node to cut from.
 */
function truncationCutPoint(root, element) {
  let cutPoint = element;
  let node = element;
  for (;;) {
    // A later (higher) match always replaces an earlier (lower) one.
    if (node.previousElementSibling) {
      cutPoint = node;
    }
    const parent = node.parentElement;
    if (!parent || parent === root) {
      break;
    }
    node = parent;
  }
  return cutPoint;
}
|
|
959
|
+
/**
 * Truncates content starting at the cut point derived from `element`.
 *
 * The cut point is resolved with `truncationCutPoint`; its trailing siblings
 * are removed first, then the cut point itself is removed under the
 * "site-profile:content-pattern" step.
 *
 * @param {Element} root - Container passed to `truncationCutPoint`.
 * @param {Element} element - Element that matched a cut pattern.
 * @param {Array} removals - Removal log passed through to the helpers.
 * @param {string} reason - Reason recorded for each removal.
 */
function truncateFromElement(root, element, removals, reason) {
  const target = truncationCutPoint(root, element);
  // Siblings go first so the target is still attached while they are walked.
  removeTrailingSiblings(target, removals, reason);
  removeElement(removals, "site-profile:content-pattern", reason, target);
}
|
|
924
964
|
function compileProfileRegexes(profiles, key) {
|
|
925
965
|
return profiles.flatMap(
|
|
926
966
|
(profile) => (profile.removals?.[key] ?? []).map((pattern) => ({ profile: profile.name, regex: new RegExp(pattern, "i") }))
|
|
@@ -952,8 +992,7 @@ function removeByTextPatterns(root, profiles, removals) {
|
|
|
952
992
|
}
|
|
953
993
|
const cut = text.length <= 240 ? cutContains.find((entry) => text.includes(entry.marker)) ?? cutRegexes.find((entry) => entry.regex.test(text)) : void 0;
|
|
954
994
|
if (cut) {
|
|
955
|
-
|
|
956
|
-
removeElement(removals, "site-profile:content-pattern", cut.profile, element);
|
|
995
|
+
truncateFromElement(root, element, removals, cut.profile);
|
|
957
996
|
return;
|
|
958
997
|
}
|
|
959
998
|
const exactProfile = dropExact.get(text);
|
|
@@ -985,6 +1024,18 @@ function cleanupTitle(metadata, profiles) {
|
|
|
985
1024
|
}
|
|
986
1025
|
metadata.title = title;
|
|
987
1026
|
}
|
|
1027
|
+
/**
 * Strips profile-configured patterns from `metadata.author` in place.
 *
 * Every `authorSuffixPatterns` entry of every profile is compiled as a
 * case-insensitive regular expression and its first match is removed; the
 * value is trimmed after each removal. Nothing happens when the author is
 * missing or empty.
 *
 * @param {{ author?: string }} metadata - Metadata object to update.
 * @param {Array<object>} profiles - Profiles, applied in order.
 */
function cleanupAuthor(metadata, profiles) {
  const original = metadata.author;
  if (!original) {
    return;
  }
  const patterns = profiles.flatMap((profile) => [...(profile.metadata?.authorSuffixPatterns ?? [])]);
  metadata.author = patterns.reduce(
    (name, pattern) => name.replace(new RegExp(pattern, "i"), "").trim(),
    original
  );
}
|
|
988
1039
|
function applySiteProfiles(root, profiles, removals) {
|
|
989
1040
|
removeByExactSelectors(root, profiles, removals);
|
|
990
1041
|
removeByPartialAttributePatterns(root, profiles, removals);
|
|
@@ -993,6 +1044,7 @@ function applySiteProfiles(root, profiles, removals) {
|
|
|
993
1044
|
function applyMetadataProfiles(metadata, profiles) {
|
|
994
1045
|
applyFixedAuthor(metadata, profiles);
|
|
995
1046
|
cleanupTitle(metadata, profiles);
|
|
1047
|
+
cleanupAuthor(metadata, profiles);
|
|
996
1048
|
}
|
|
997
1049
|
|
|
998
1050
|
// src/cleaning/clean-html.ts
|
|
@@ -1014,17 +1066,17 @@ var DEFAULT_FEEDLOOM_PROFILE = {
|
|
|
1014
1066
|
}
|
|
1015
1067
|
};
|
|
1016
1068
|
var DefuddleClass = DefuddleModule.default ?? DefuddleModule.Defuddle;
|
|
1017
|
-
function firstMetaContent(
|
|
1069
|
+
function firstMetaContent(document, names) {
|
|
1018
1070
|
for (const name of names) {
|
|
1019
1071
|
const escaped = name.replace(/"/g, '\\"');
|
|
1020
|
-
const element =
|
|
1072
|
+
const element = document.querySelector(`meta[property="${escaped}"], meta[name="${escaped}"], meta[itemprop="${escaped}"]`);
|
|
1021
1073
|
const content = element?.getAttribute("content")?.trim();
|
|
1022
1074
|
if (content) return content;
|
|
1023
1075
|
}
|
|
1024
1076
|
return void 0;
|
|
1025
1077
|
}
|
|
1026
|
-
function jsonLdValue(
|
|
1027
|
-
for (const script of Array.from(
|
|
1078
|
+
function jsonLdValue(document, keys) {
|
|
1079
|
+
for (const script of Array.from(document.querySelectorAll('script[type="application/ld+json"]'))) {
|
|
1028
1080
|
const text = script.textContent?.trim();
|
|
1029
1081
|
if (!text) continue;
|
|
1030
1082
|
try {
|
|
@@ -1045,27 +1097,69 @@ function jsonLdValue(document2, keys) {
|
|
|
1045
1097
|
}
|
|
1046
1098
|
return void 0;
|
|
1047
1099
|
}
|
|
1048
|
-
function
|
|
1100
|
+
/**
 * Looks up an author name using the author hints of the given profiles.
 *
 * For each profile, in order:
 *   1. every `authorSelectors` entry is queried and the first non-empty,
 *      whitespace-collapsed text content wins;
 *   2. otherwise `<meta>` tags are queried by `name` (authorMetaNames), then
 *      `itemprop` (authorMetaItemprops), then `property`
 *      (authorMetaProperties), and the first non-empty `content` wins.
 *
 * @param {Document} document - Document to query.
 * @param {Array<object>} profiles - Profiles; those without `metadata` are skipped.
 * @returns {string|undefined} The author, or undefined when nothing matched.
 */
function profileAuthorFromDocument(document, profiles) {
  // Escape backslashes as well as double quotes: inside a quoted CSS string a
  // bare backslash starts an escape sequence and would corrupt the selector.
  const cssString = (value) => value.replace(/["\\]/g, "\\$&");

  for (const profile of profiles) {
    const metadata = profile.metadata;
    if (!metadata) continue;

    for (const selector of metadata.authorSelectors ?? []) {
      const author = document.querySelector(selector)?.textContent?.replace(/\s+/g, " ").trim();
      if (author) return author;
    }

    const metaLookups = [
      ["name", metadata.authorMetaNames],
      ["itemprop", metadata.authorMetaItemprops],
      ["property", metadata.authorMetaProperties],
    ];
    for (const [attr, values] of metaLookups) {
      for (const value of values ?? []) {
        const author = document
          .querySelector(`meta[${attr}="${cssString(value)}"]`)
          ?.getAttribute("content")
          ?.trim();
        if (author) return author;
      }
    }
  }
  return undefined;
}
|
|
1121
|
+
function toMetadata(result, document, profiles) {
|
|
1049
1122
|
return {
|
|
1050
|
-
title: result.title || firstMetaContent(
|
|
1051
|
-
description: result.description || firstMetaContent(
|
|
1123
|
+
title: result.title || firstMetaContent(document, ["og:title", "twitter:title"]) || document.querySelector("title")?.textContent?.trim() || void 0,
|
|
1124
|
+
description: result.description || firstMetaContent(document, ["description", "og:description", "twitter:description"]),
|
|
1052
1125
|
domain: result.domain || void 0,
|
|
1053
1126
|
favicon: result.favicon || void 0,
|
|
1054
|
-
image: result.image || firstMetaContent(
|
|
1055
|
-
language: result.language ||
|
|
1056
|
-
published: result.published || firstMetaContent(
|
|
1057
|
-
author: result.author || firstMetaContent(
|
|
1058
|
-
site: result.site || firstMetaContent(
|
|
1127
|
+
image: result.image || firstMetaContent(document, ["og:image", "twitter:image"]),
|
|
1128
|
+
language: result.language || document.documentElement.getAttribute("lang") || void 0,
|
|
1129
|
+
published: result.published || firstMetaContent(document, ["article:published_time", "date", "datePublished", "pubdate", "publishdate"]) || jsonLdValue(document, ["datePublished", "dateCreated"]),
|
|
1130
|
+
author: result.author || profileAuthorFromDocument(document, profiles) || firstMetaContent(document, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document, ["author", "creator"]),
|
|
1131
|
+
site: result.site || firstMetaContent(document, ["og:site_name", "application-name"]),
|
|
1059
1132
|
schemaOrgData: result.schemaOrgData,
|
|
1060
1133
|
wordCount: result.wordCount,
|
|
1061
1134
|
parseTime: result.parseTime
|
|
1062
1135
|
};
|
|
1063
1136
|
}
|
|
1064
|
-
function
|
|
1065
|
-
const
|
|
1066
|
-
|
|
1137
|
+
/**
 * Appends images referenced by `<meta>` tags to the extracted content root.
 *
 * Only profiles with `media.includeMetaImages` set contribute; each one
 * supplies `media.imageMetaProperties`, or `["og:image"]` when that is
 * missing. For every property, matching `<meta property|name|itemprop>` tags
 * in `document` are read, and each new `content` URL is appended to `root` as
 * `<p><img src="…" alt=""></p>`. URLs already used by an `<img>` in `root`,
 * or already appended by this call, are skipped.
 *
 * @param {Document} document - Source document; queried for meta tags and used to create nodes.
 * @param {Element} root - Content root that receives the new paragraphs.
 * @param {Array<object>} profiles - Active profiles.
 */
function appendMetaImages(document, root, profiles) {
  const properties = profiles.flatMap((profile) =>
    profile.media?.includeMetaImages ? profile.media.imageMetaProperties ?? ["og:image"] : []
  );
  if (properties.length === 0) {
    return;
  }

  const seen = new Set(Array.from(root.querySelectorAll("img"), (img) => img.getAttribute("src") ?? ""));
  for (const property of properties) {
    // Escape backslashes too, not only quotes, so the value is a valid CSS string.
    const escaped = property.replace(/["\\]/g, "\\$&");
    const metas = document.querySelectorAll(
      `meta[property="${escaped}"], meta[name="${escaped}"], meta[itemprop="${escaped}"]`
    );
    for (const meta of Array.from(metas)) {
      const src = meta.getAttribute("content")?.trim();
      if (!src || seen.has(src)) continue;

      const img = document.createElement("img");
      img.setAttribute("src", src);
      img.setAttribute("alt", "");

      // Build the wrapper first and attach it once, rather than looking it up
      // again through root.lastElementChild.
      const paragraph = document.createElement("p");
      paragraph.appendChild(img);
      root.appendChild(paragraph);
      seen.add(src);
    }
  }
}
|
|
1157
|
+
function serializeProfiledContent(document, content, profiles, removals) {
|
|
1158
|
+
const { document: contentDocument } = parseHTML2(`<!doctype html><html><body><main data-feedloom-profile-root="true">${content}</main></body></html>`);
|
|
1159
|
+
const root = contentDocument.querySelector('[data-feedloom-profile-root="true"]') ?? contentDocument.body;
|
|
1160
|
+
appendMetaImages(document, root, profiles);
|
|
1067
1161
|
applySiteProfiles(root, profiles, removals);
|
|
1068
|
-
const serialized = root.innerHTML || root.outerHTML ||
|
|
1162
|
+
const serialized = root.innerHTML || root.outerHTML || contentDocument.body.innerHTML;
|
|
1069
1163
|
return serialized.trim() ? `${serialized.trim()}
|
|
1070
1164
|
` : "";
|
|
1071
1165
|
}
|
|
@@ -1080,9 +1174,9 @@ var HtmlCleaner = class {
|
|
|
1080
1174
|
const preferredContentSelector = this.options.contentSelector ?? firstContentSelector(activeProfiles);
|
|
1081
1175
|
const removals = [];
|
|
1082
1176
|
const html = /<html[\s>]/i.test(rawHtml) ? rawHtml : `<!doctype html><html><body>${rawHtml}</body></html>`;
|
|
1083
|
-
const { document
|
|
1084
|
-
const contentSelector = preferredContentSelector &&
|
|
1085
|
-
const doc =
|
|
1177
|
+
const { document } = parseHTML2(html);
|
|
1178
|
+
const contentSelector = preferredContentSelector && document.querySelector(preferredContentSelector) ? preferredContentSelector : void 0;
|
|
1179
|
+
const doc = document;
|
|
1086
1180
|
if (this.options.baseUrl) {
|
|
1087
1181
|
doc.URL = this.options.baseUrl;
|
|
1088
1182
|
}
|
|
@@ -1099,12 +1193,14 @@ var HtmlCleaner = class {
|
|
|
1099
1193
|
removeExactSelectors: this.options.removeExactSelectors,
|
|
1100
1194
|
removePartialSelectors: this.options.removePartialSelectors,
|
|
1101
1195
|
removeContentPatterns: this.options.removeContentPatterns,
|
|
1102
|
-
standardize: this.options.standardize
|
|
1196
|
+
standardize: this.options.standardize,
|
|
1197
|
+
fetch: this.options.defuddleFetch,
|
|
1198
|
+
language: this.options.language
|
|
1103
1199
|
});
|
|
1104
1200
|
const result = parser2.parseAsync ? await parser2.parseAsync() : parser2.parse();
|
|
1105
|
-
const metadata = toMetadata(result,
|
|
1201
|
+
const metadata = toMetadata(result, document, activeProfiles);
|
|
1106
1202
|
applyMetadataProfiles(metadata, activeProfiles);
|
|
1107
|
-
const content = serializeProfiledContent(result.content, postProfiles, removals);
|
|
1203
|
+
const content = serializeProfiledContent(document, result.content, postProfiles, removals);
|
|
1108
1204
|
return {
|
|
1109
1205
|
content,
|
|
1110
1206
|
contentMarkdown: result.contentMarkdown,
|
|
@@ -1121,6 +1217,212 @@ async function cleanHtml(rawHtml, options = {}) {
|
|
|
1121
1217
|
return new HtmlCleaner(options).parse(rawHtml);
|
|
1122
1218
|
}
|
|
1123
1219
|
|
|
1220
|
+
// src/fetch/proxy-fetch.ts
|
|
1221
|
+
import { request as httpRequest } from "http";
|
|
1222
|
+
import { connect as tlsConnect } from "tls";
|
|
1223
|
+
var REDIRECT_STATUSES = /* @__PURE__ */ new Set([301, 302, 303, 307, 308]);
|
|
1224
|
+
var DEFAULT_REDIRECT_LIMIT = 10;
|
|
1225
|
+
/**
 * Resolves the proxy configured through environment variables for a URL.
 *
 * For `https:` targets HTTPS_PROXY / https_proxy is consulted, otherwise
 * HTTP_PROXY / http_proxy; both fall back to ALL_PROXY / all_proxy. Returns
 * null when no proxy is configured, when `noProxyMatches` reports the host as
 * excluded, or when the configured value is not a parseable URL.
 *
 * @param {URL} targetUrl - URL about to be requested.
 * @returns {URL|null} The proxy URL, or null for a direct connection.
 */
function envProxyForUrl(targetUrl) {
  const env = process.env;
  const fallback = env.ALL_PROXY || env.all_proxy;
  const configured = targetUrl.protocol === "https:"
    ? env.HTTPS_PROXY || env.https_proxy || fallback
    : env.HTTP_PROXY || env.http_proxy || fallback;
  if (!configured) {
    return null;
  }
  if (noProxyMatches(targetUrl.hostname)) {
    return null;
  }
  try {
    return new URL(configured);
  } catch {
    return null;
  }
}
|
|
1236
|
+
/**
 * Reports whether a hostname is excluded from proxying by NO_PROXY / no_proxy.
 *
 * The variable is a comma-separated list. `*` excludes every host. Any other
 * entry names a domain, optionally prefixed with `.` or `*.`, and matches
 * that domain and all of its subdomains. Matching is case-insensitive.
 *
 * @param {string} hostname - Hostname of the request target.
 * @returns {boolean} True when the host must bypass the proxy.
 */
function noProxyMatches(hostname) {
  // Use `||` (not `??`) so an empty NO_PROXY falls through to no_proxy, the
  // same way the *_PROXY lookups treat empty values as unset.
  const raw = process.env.NO_PROXY || process.env.no_proxy || "";
  if (!raw) return false;

  const host = hostname.toLowerCase();
  return raw.split(",").some((item) => {
    const entry = item.trim().toLowerCase();
    if (!entry) return false;
    if (entry === "*") return true;
    // ".example.com" and "*.example.com" both mean the domain and its subdomains.
    const domain = entry.replace(/^\*?\./, "");
    if (!domain) return false;
    return host === domain || host.endsWith(`.${domain}`);
  });
}
|
|
1247
|
+
/**
 * Flattens any `HeadersInit` value into a plain object.
 *
 * The input is normalised through the `Headers` constructor, so the keys of
 * the result are the names as `Headers` reports them.
 *
 * @param {HeadersInit|undefined|null} headers - Headers in any accepted form.
 * @returns {Record<string, string>} Plain header map; empty when `headers` is falsy.
 */
function headersToRecord(headers) {
  const record = {};
  if (headers) {
    for (const [name, value] of new Headers(headers)) {
      record[name] = value;
    }
  }
  return record;
}
|
|
1255
|
+
/**
 * Converts a header object (string, array or undefined values) into `Headers`.
 *
 * Array values are appended item by item, undefined values are skipped, and
 * every other value is stringified and set.
 *
 * @param {Record<string, string|string[]|undefined>} headers - Raw header map.
 * @returns {Headers} The populated `Headers` instance.
 */
function responseHeaders(headers) {
  const collected = new Headers();
  Object.keys(headers).forEach((name) => {
    const value = headers[name];
    if (value === undefined) {
      return;
    }
    if (Array.isArray(value)) {
      for (const item of value) collected.append(name, item);
      return;
    }
    collected.set(name, String(value));
  });
  return collected;
}
|
|
1266
|
+
/**
 * Converts a fetch request body into a `Buffer`.
 *
 * Supported inputs are strings, `URLSearchParams`, `ArrayBuffer`, typed
 * arrays / `DataView`, and `Blob`. `undefined` and `null` yield `undefined`.
 *
 * @param {*} body - Request body as accepted by `fetch`.
 * @returns {Promise<Buffer|undefined>} The buffered body, if any.
 * @throws {Error} For `ReadableStream` bodies and any other unsupported type.
 */
async function bodyToBuffer(body) {
  if (body === undefined || body === null) {
    return undefined;
  }
  const streaming = typeof ReadableStream !== "undefined" && body instanceof ReadableStream;
  if (streaming) {
    throw new Error("proxy-aware fetch does not support streaming request bodies");
  }
  if (typeof body === "string" || body instanceof ArrayBuffer) {
    return Buffer.from(body);
  }
  if (body instanceof URLSearchParams) {
    return Buffer.from(body.toString());
  }
  if (ArrayBuffer.isView(body)) {
    // Respect the view's window into its backing buffer.
    const { buffer, byteOffset, byteLength } = body;
    return Buffer.from(buffer, byteOffset, byteLength);
  }
  if (typeof Blob !== "undefined" && body instanceof Blob) {
    return Buffer.from(await body.arrayBuffer());
  }
  throw new Error("proxy-aware fetch only supports buffered request bodies");
}
|
|
1278
|
+
/**
 * Buffers an incoming response and reports it through a callback.
 *
 * Data chunks are accumulated as Buffers. On "end" the callback receives
 * `(null, { status, statusText, headers, body })`, with headers flattened to
 * a plain object via `responseHeaders`. On "error" the callback receives the
 * error as its only argument.
 *
 * @param {import("http").IncomingMessage} res - Response stream to drain.
 * @param {(error: Error|null, response?: object) => void} done - Completion callback.
 */
function collectResponse(res, done) {
  const parts = [];
  const onData = (chunk) => {
    parts.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
  };
  const onEnd = () => {
    const payload = {
      status: res.statusCode ?? 0,
      statusText: res.statusMessage ?? "",
      headers: Object.fromEntries(responseHeaders(res.headers)),
      body: Buffer.concat(parts)
    };
    done(null, payload);
  };
  const onError = (error) => done(error);
  res.on("data", onData);
  res.on("end", onEnd);
  res.on("error", onError);
}
|
|
1293
|
+
/**
 * Builds the `Proxy-Authorization` header value from proxy URL credentials.
 *
 * The username and password are percent-decoded, joined with ":" and
 * base64-encoded behind the "Basic " scheme.
 *
 * @param {URL} proxy - Proxy URL, possibly carrying credentials.
 * @returns {string|undefined} The header value, or undefined without a username.
 */
function proxyAuthorization(proxy) {
  const { username, password } = proxy;
  if (!username) {
    return undefined;
  }
  const credentials = [username, password].map((part) => decodeURIComponent(part)).join(":");
  const encoded = Buffer.from(credentials).toString("base64");
  return `Basic ${encoded}`;
}
|
|
1297
|
+
/**
 * Performs a single HTTP(S) request through an `http:` forward proxy.
 *
 * - `https:` targets: a CONNECT tunnel is opened on the proxy, TLS is
 *   negotiated over the tunnel socket, and the request is sent through it.
 * - Other targets: the request is sent to the proxy with the absolute target
 *   URL as the request path.
 *
 * Proxy credentials embedded in the proxy URL are sent as `Proxy-Authorization`.
 * Redirects are not followed here.
 *
 * @param {URL} targetUrl - Final destination of the request.
 * @param {URL} proxy - Proxy URL; only the `http:` scheme is supported.
 * @param {string} method - HTTP method.
 * @param {Record<string, string>} headers - Request headers; `host` is always overwritten.
 * @param {Buffer|undefined} body - Buffered request body, if any.
 * @param {AbortSignal|undefined} signal - Optional cancellation signal.
 * @returns {Promise<object>} Resolves with whatever `collectResponse` reports.
 * @throws {Error} Synchronously when the proxy scheme is not `http:`.
 */
function requestViaHttpProxy(targetUrl, proxy, method, headers, body, signal) {
  if (proxy.protocol !== "http:") {
    throw new Error(`Unsupported proxy protocol: ${proxy.protocol}`);
  }
  return new Promise((resolve3, reject) => {
    let settled = false;
    // Whatever is currently in flight (CONNECT request, TLS socket, or final
    // request) so that an abort can tear down the right object.
    let active = null;
    const done = (error, response) => {
      if (settled) return;
      settled = true;
      signal?.removeEventListener("abort", abort);
      if (error) reject(error);
      else if (response) resolve3(response);
      else reject(new Error("Proxy request ended without a response"));
    };
    const abort = () => active?.destroy(new Error("The operation was aborted"));
    if (signal?.aborted) {
      done(new Error("The operation was aborted"));
      return;
    }
    signal?.addEventListener("abort", abort, { once: true });

    const targetPort = targetUrl.port ? Number(targetUrl.port) : targetUrl.protocol === "https:" ? 443 : 80;
    // URL parsing drops a scheme's default port, so `http://proxy:80` reports
    // an empty `port`. The fallback must therefore be the http default (80);
    // any other value would redirect an explicitly configured port-80 proxy.
    const proxyPort = proxy.port ? Number(proxy.port) : 80;
    const auth = proxyAuthorization(proxy);
    const requestHeaders2 = {
      ...headers,
      host: targetUrl.host
    };
    if (body && !Object.keys(requestHeaders2).some((key) => key.toLowerCase() === "content-length")) {
      requestHeaders2["content-length"] = String(body.byteLength);
    }

    if (targetUrl.protocol === "https:") {
      const authority = `${targetUrl.hostname}:${targetPort}`;
      const connectHeaders = { host: authority };
      // Credentials go to the proxy on CONNECT only, never through the tunnel.
      if (auth) connectHeaders["proxy-authorization"] = auth;
      const connectReq = httpRequest({
        host: proxy.hostname,
        port: proxyPort,
        method: "CONNECT",
        path: authority,
        headers: connectHeaders
      });
      active = connectReq;
      connectReq.on("connect", (connectRes, socket) => {
        if (connectRes.statusCode !== 200) {
          socket.destroy();
          done(new Error(`Proxy CONNECT failed: ${connectRes.statusCode ?? 0}`));
          return;
        }
        const tlsSocket = tlsConnect({ socket, host: targetUrl.hostname, servername: targetUrl.hostname });
        active = tlsSocket;
        tlsSocket.on("error", (error) => done(error));
        tlsSocket.on("secureConnect", () => {
          const req2 = httpRequest({
            method,
            path: `${targetUrl.pathname}${targetUrl.search}`,
            headers: requestHeaders2,
            createConnection: () => tlsSocket
          }, (res) => collectResponse(res, done));
          active = req2;
          req2.on("error", (error) => done(error));
          if (body) req2.write(body);
          req2.end();
        });
      });
      connectReq.on("error", (error) => done(error));
      connectReq.end();
      return;
    }

    if (auth) requestHeaders2["proxy-authorization"] = auth;
    const req = httpRequest({
      host: proxy.hostname,
      port: proxyPort,
      method,
      // Forward proxies expect the absolute URL as the request target.
      path: targetUrl.href,
      headers: requestHeaders2
    }, (res) => collectResponse(res, done));
    active = req;
    req.on("error", (error) => done(error));
    if (body) req.write(body);
    req.end();
  });
}
|
|
1379
|
+
/**
 * Extracts the target URL from a fetch `input` argument.
 *
 * @param {string|URL|Request} input - First argument passed to fetch.
 * @returns {URL} The same instance for URL inputs, otherwise a URL parsed
 *   from the string or from the request's `url`.
 */
function requestUrl(input) {
  if (typeof input === "string") {
    return new URL(input);
  }
  return input instanceof URL ? input : new URL(input.url);
}
|
|
1384
|
+
/**
 * Determines the upper-cased HTTP method for a fetch call.
 *
 * `init.method` takes precedence, then the method of a `Request` input, and
 * finally "GET".
 *
 * @param {string|URL|Request} input - First argument passed to fetch.
 * @param {RequestInit} init - Second argument passed to fetch.
 * @returns {string} Upper-cased method name.
 */
function requestMethod(input, init) {
  const method = init.method || (input instanceof Request ? input.method : "GET");
  return method.toUpperCase();
}
|
|
1389
|
+
/**
 * Merges the headers of a `Request` input with those in `init`.
 *
 * Both sources are flattened with `headersToRecord`; entries from
 * `init.headers` override same-named entries taken from the request.
 *
 * @param {string|URL|Request} input - First argument passed to fetch.
 * @param {RequestInit} init - Second argument passed to fetch.
 * @returns {Record<string, string>} Combined header map.
 */
function requestHeaders(input, init) {
  const fromInput = input instanceof Request ? headersToRecord(input.headers) : {};
  const fromInit = headersToRecord(init.headers);
  return Object.assign({}, fromInput, fromInit);
}
|
|
1395
|
+
/**
 * `fetch`-compatible request that honours proxy environment variables.
 *
 * When `envProxyForUrl` finds no proxy for the target, the call is delegated
 * to the global `fetch` unchanged. Otherwise the request is sent with
 * `requestViaHttpProxy` and redirects are handled here:
 *   - `redirect: "manual"` returns the redirect response as-is;
 *   - `redirect: "error"` rejects with a TypeError;
 *   - otherwise (default "follow") up to `redirectsLeft` hops are followed.
 *     Once the budget is exhausted the last redirect response is returned.
 *
 * @param {string|URL|Request} input - Request target.
 * @param {RequestInit} init - Request options.
 * @param {number} redirectsLeft - Remaining redirect budget.
 * @returns {Promise<Response>} The final response.
 * @throws {TypeError} When a redirect arrives and `init.redirect` is "error".
 */
async function proxyAwareFetchInternal(input, init, redirectsLeft) {
  const url = requestUrl(input);
  const proxy = envProxyForUrl(url);
  if (!proxy) {
    return fetch(input, init);
  }
  const method = requestMethod(input, init);
  const headers = requestHeaders(input, init);
  const body = await bodyToBuffer(init.body ?? (input instanceof Request ? input.body : void 0));
  const proxied = await requestViaHttpProxy(url, proxy, method, headers, body, init.signal ?? void 0);

  const location = proxied.headers.location;
  const redirectMode = init.redirect ?? "follow";
  const isRedirect = REDIRECT_STATUSES.has(proxied.status) && Boolean(location);

  if (isRedirect && redirectMode === "error") {
    throw new TypeError(`Unexpected redirect while fetching ${url.href}`);
  }

  if (isRedirect && redirectMode !== "manual" && redirectsLeft > 0) {
    const nextUrl = new URL(location, url);
    // Carry method and headers explicitly: the next hop receives a URL, so
    // anything that lived only on a Request input would otherwise be lost.
    const nextInit = { ...init, method, headers: { ...headers } };

    // Same method rewriting as a standard fetch redirect: 303 turns anything
    // but GET/HEAD into GET, and 301/302 turn POST into GET.
    const rewriteToGet = proxied.status === 303
      ? method !== "GET" && method !== "HEAD"
      : (proxied.status === 301 || proxied.status === 302) && method === "POST";
    if (rewriteToGet) {
      nextInit.method = "GET";
      nextInit.body = void 0;
      for (const name of ["content-length", "content-type", "content-encoding", "content-language", "content-location"]) {
        delete nextInit.headers[name];
      }
    }

    // Never forward credentials to a different origin chosen by the server.
    if (nextUrl.origin !== url.origin) {
      for (const name of ["authorization", "cookie", "proxy-authorization"]) {
        delete nextInit.headers[name];
      }
    }
    return proxyAwareFetchInternal(nextUrl, nextInit, redirectsLeft - 1);
  }

  // The Response constructor rejects a non-null body for these statuses.
  const nullBodyStatus = proxied.status === 204 || proxied.status === 205 || proxied.status === 304;
  return new Response(nullBodyStatus ? null : new Uint8Array(proxied.body), {
    status: proxied.status,
    statusText: proxied.statusText,
    headers: proxied.headers
  });
}
|
|
1422
|
+
/**
 * Entry point of the proxy-aware fetch: starts `proxyAwareFetchInternal`
 * with the default redirect budget (`DEFAULT_REDIRECT_LIMIT`).
 *
 * @param {string|URL|Request} input - Request target.
 * @param {RequestInit} [init={}] - Request options.
 * @returns {Promise<Response>} The final response.
 */
async function proxyAwareFetch(input, init = {}) {
  const redirectBudget = DEFAULT_REDIRECT_LIMIT;
  return proxyAwareFetchInternal(input, init, redirectBudget);
}
|
|
1425
|
+
|
|
1124
1426
|
// src/fetch/strategy.ts
|
|
1125
1427
|
import { writeFile as writeFile3 } from "fs/promises";
|
|
1126
1428
|
|
|
@@ -1135,8 +1437,8 @@ function extractPreloadedMarkdownUrl(html, baseUrl) {
|
|
|
1135
1437
|
}
|
|
1136
1438
|
return new URL(rawUrl, baseUrl).toString();
|
|
1137
1439
|
}
|
|
1138
|
-
function removeNoise(
|
|
1139
|
-
|
|
1440
|
+
/**
 * Removes every script, style, noscript, svg and iframe element from the
 * document, in place.
 *
 * @param {Document} document - Document to strip.
 */
function removeNoise(document) {
  const noisy = document.querySelectorAll("script, style, noscript, svg, iframe");
  for (const node of noisy) {
    node.remove();
  }
}
|
|
1141
1443
|
function normalizedTextLength(element) {
|
|
1142
1444
|
return (element?.textContent ?? "").replace(/\s+/g, " ").trim().length;
|
|
@@ -1145,12 +1447,12 @@ function htmlHasMeaningfulContent(url, html) {
|
|
|
1145
1447
|
if (extractPreloadedMarkdownUrl(html, url) !== null) {
|
|
1146
1448
|
return true;
|
|
1147
1449
|
}
|
|
1148
|
-
const { document
|
|
1149
|
-
removeNoise(
|
|
1450
|
+
const { document } = parseHTML3(html);
|
|
1451
|
+
removeNoise(document);
|
|
1150
1452
|
const selectors = ["#js_content", "article", "main", "section", "div", "body"];
|
|
1151
1453
|
let bestLength = 0;
|
|
1152
1454
|
for (const selector of selectors) {
|
|
1153
|
-
|
|
1455
|
+
document.querySelectorAll(selector).forEach((element) => {
|
|
1154
1456
|
bestLength = Math.max(bestLength, normalizedTextLength(element));
|
|
1155
1457
|
});
|
|
1156
1458
|
if (bestLength >= 600 && selector !== "div") {
|
|
@@ -1245,11 +1547,11 @@ async function fetchBrowserHtmlWithBrowserState(url, config) {
|
|
|
1245
1547
|
}
|
|
1246
1548
|
|
|
1247
1549
|
// src/fetch/static.ts
|
|
1248
|
-
async function fetchStaticHtml(url, timeoutMs = 6e4) {
|
|
1550
|
+
async function fetchStaticHtml(url, timeoutMs = 6e4, fetchImpl = fetch) {
|
|
1249
1551
|
const controller = new AbortController();
|
|
1250
1552
|
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
|
1251
1553
|
try {
|
|
1252
|
-
const response = await
|
|
1554
|
+
const response = await fetchImpl(url, {
|
|
1253
1555
|
redirect: "follow",
|
|
1254
1556
|
signal: controller.signal,
|
|
1255
1557
|
headers: {
|
|
@@ -1278,7 +1580,7 @@ async function writeOutputIfRequested(outputPath, html) {
|
|
|
1278
1580
|
}
|
|
1279
1581
|
async function fetchHtmlResult(url, options = {}) {
|
|
1280
1582
|
const isMeaningful = options.isMeaningful ?? htmlHasMeaningfulContent;
|
|
1281
|
-
const staticFetch = options.staticFetch ?? (async (targetUrl) => (await fetchStaticHtml(targetUrl)).html);
|
|
1583
|
+
const staticFetch = options.staticFetch ?? (async (targetUrl) => (await fetchStaticHtml(targetUrl, void 0, options.useProxyEnv ? proxyAwareFetch : void 0)).html);
|
|
1282
1584
|
const browserFetch = options.browserFetch ?? ((targetUrl) => fetchBrowserHtml(targetUrl, {
|
|
1283
1585
|
waitMs: options.waitMs,
|
|
1284
1586
|
networkIdle: options.networkIdle,
|
|
@@ -1446,9 +1748,9 @@ ${code}
|
|
|
1446
1748
|
\`\`\``).replace(/\[\s*\]\((?:#|javascript:void\(0\)|javascript:;)\)/gi, "").replace(/(^|[^\\])\$(?=\d)/g, "$1\\$").replace(/\n\s*\n\s*([-*+]\s)/g, "\n$1").replace(/[ \t]+\n/g, "\n").replace(/\n{3,}/g, "\n\n").trim();
|
|
1447
1749
|
}
|
|
1448
1750
|
function htmlFragmentText(fragment) {
|
|
1449
|
-
const { document
|
|
1450
|
-
|
|
1451
|
-
return
|
|
1751
|
+
const { document } = parseHTML4(`<!doctype html><html><body>${fragment}</body></html>`);
|
|
1752
|
+
document.querySelectorAll("br").forEach((br) => br.replaceWith(document.createTextNode("\n")));
|
|
1753
|
+
return document.body.textContent ?? "";
|
|
1452
1754
|
}
|
|
1453
1755
|
function fencedCodeHtml(text) {
|
|
1454
1756
|
const escaped = text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
|
|
@@ -1544,10 +1846,45 @@ function resolveCreatedValue(item, published) {
|
|
|
1544
1846
|
if (item.publishedAt) return createdFromItemDate(item.publishedAt);
|
|
1545
1847
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/\.\d{3}Z$/, "Z");
|
|
1546
1848
|
}
|
|
1849
|
+
/**
 * Overlays the fetch preferences of the given profiles onto the fetch options.
 *
 * Profiles are applied in order, so a later profile overrides an earlier one
 * for every setting it defines. `options` itself is never mutated.
 *
 * When any profile sets `fetch.preferBrowserState` and
 * `options.browserStateDefaults` is present, `browserState` is built from
 * those defaults plus the final merged wait / proxy / interaction settings.
 * It is assembled after all profiles have been applied, so overrides from
 * later profiles are reflected in it as well.
 *
 * @param {object} options - Base fetch options.
 * @param {Array<object>} profiles - Profiles whose `fetch` settings apply.
 * @returns {object} A new options object with profile overrides applied.
 */
function mergeProfileFetchOptions(options, profiles) {
  const merged = { ...options };
  let preferBrowserState = false;

  for (const profile of profiles) {
    const rule = profile.fetch;
    if (!rule) continue;
    if (rule.mode) merged.fetchMode = rule.mode;
    if (rule.waitMs !== void 0) merged.waitMs = rule.waitMs;
    if (rule.networkIdle !== void 0) merged.networkIdle = rule.networkIdle;
    if (rule.waitSelector) merged.waitSelector = rule.waitSelector;
    if (rule.waitSelectorState) merged.waitSelectorState = rule.waitSelectorState;
    if (rule.clickSelectors) merged.clickSelectors = rule.clickSelectors;
    if (rule.scrollToBottom !== void 0) merged.scrollToBottom = rule.scrollToBottom;
    if (rule.useProxyEnv !== void 0) merged.useProxyEnv = rule.useProxyEnv;
    if (rule.preferBrowserState) preferBrowserState = true;
  }

  // Snapshot only once every profile has had its say; building this inside
  // the loop would freeze values that a later profile still overrides.
  if (preferBrowserState && options.browserStateDefaults) {
    merged.browserState = {
      ...options.browserStateDefaults,
      waitMs: merged.waitMs,
      networkIdle: merged.networkIdle,
      proxy: merged.proxy,
      dnsOverHttps: merged.dnsOverHttps,
      waitSelector: merged.waitSelector,
      waitSelectorState: merged.waitSelectorState,
      clickSelectors: merged.clickSelectors,
      scrollToBottom: merged.scrollToBottom,
      headless: merged.headless,
      realChromeDefaults: merged.realChromeDefaults
    };
  }
  return merged;
}
|
|
1547
1878
|
async function processItem(item, options) {
|
|
1548
|
-
const
|
|
1879
|
+
const urlProfiles = selectActiveProfiles(options.profiles, item.url, "");
|
|
1880
|
+
const fetchOptions = mergeProfileFetchOptions(options, urlProfiles);
|
|
1881
|
+
const html = await fetchHtml(item.url, fetchOptions);
|
|
1549
1882
|
const activeProfiles = selectActiveProfiles(options.profiles, item.url, html);
|
|
1550
|
-
const
|
|
1883
|
+
const defuddleFetch = activeProfiles.some((profile) => profile.fetch?.useProxyEnv) ? proxyAwareFetch : void 0;
|
|
1884
|
+
const cleaned = await cleanHtml(html, { baseUrl: item.url, profiles: options.profiles, activeProfiles, defuddleFetch });
|
|
1885
|
+
if (activeProfiles.some((profile) => profile.extraction?.requireText) && !cleaned.content.replace(/<[^>]*>/g, "").trim()) {
|
|
1886
|
+
throw new Error("matched site rule requires extracted text, but no text content was extracted");
|
|
1887
|
+
}
|
|
1551
1888
|
const title = cleaned.metadata.title || item.sourceTitle || titleFromUrl(item.url);
|
|
1552
1889
|
await cleanupExistingNote(options.outputDir, item.url);
|
|
1553
1890
|
const contentHtml = options.localizeAssets === false ? cleaned.content : await localizeImages(cleaned.content, {
|
|
@@ -1619,7 +1956,10 @@ var ProgressTracker = class {
|
|
|
1619
1956
|
var program = new Command();
|
|
1620
1957
|
async function siteRulePathsFromDir(dir) {
|
|
1621
1958
|
const names = await readdir2(dir);
|
|
1622
|
-
return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
|
|
1959
|
+
return names.filter((name) => name.endsWith(".toml")).sort().map((name) => join7(dir, name));
|
|
1960
|
+
}
|
|
1961
|
+
function builtinSiteRulesDir() {
|
|
1962
|
+
return join7(dirname(fileURLToPath(import.meta.url)), "site-rules");
|
|
1623
1963
|
}
|
|
1624
1964
|
function positiveIntOption(value, fallback) {
|
|
1625
1965
|
const parsed = Number(value ?? fallback);
|
|
@@ -1656,7 +1996,9 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
|
|
|
1656
1996
|
positiveIntOption(options.limit, 0)
|
|
1657
1997
|
);
|
|
1658
1998
|
const siteRulesDir = String(options.siteRulesDir || "");
|
|
1659
|
-
const
|
|
1999
|
+
const builtinRulePaths = await siteRulePathsFromDir(builtinSiteRulesDir());
|
|
2000
|
+
const customRulePaths = siteRulesDir ? await siteRulePathsFromDir(resolve2(siteRulesDir)) : [];
|
|
2001
|
+
const profiles = await loadSiteProfiles([...builtinRulePaths, ...customRulePaths]);
|
|
1660
2002
|
const outputDir = String(options.outputDir ?? "clippings");
|
|
1661
2003
|
let failures = 0;
|
|
1662
2004
|
const tracker = new ProgressTracker(selected, outputDir);
|
|
@@ -1675,6 +2017,10 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
|
|
|
1675
2017
|
headless: !Boolean(options.headful),
|
|
1676
2018
|
realChromeDefaults: options.realChromeDefaults !== false
|
|
1677
2019
|
};
|
|
2020
|
+
const browserStateDefaults = {
|
|
2021
|
+
userDataDir: String(options.chromeUserDataDir || ""),
|
|
2022
|
+
profile: String(options.chromeProfile || "Default")
|
|
2023
|
+
};
|
|
1678
2024
|
const sessions = options.reuseBrowser === false ? null : new BatchFetchSessions({
|
|
1679
2025
|
browser: browserOptions,
|
|
1680
2026
|
stealth: {
|
|
@@ -1687,15 +2033,11 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
|
|
|
1687
2033
|
for (const item of selected) {
|
|
1688
2034
|
tracker.start(item.url);
|
|
1689
2035
|
try {
|
|
1690
|
-
const browserState = options.preferBrowserState ? {
|
|
1691
|
-
userDataDir: String(options.chromeUserDataDir || ""),
|
|
1692
|
-
profile: String(options.chromeProfile || "Default"),
|
|
1693
|
-
...browserOptions
|
|
1694
|
-
} : null;
|
|
1695
2036
|
const result = await processItem(item, {
|
|
1696
2037
|
outputDir,
|
|
1697
2038
|
profiles,
|
|
1698
|
-
browserState,
|
|
2039
|
+
browserState: options.preferBrowserState ? { ...browserStateDefaults, ...browserOptions } : null,
|
|
2040
|
+
browserStateDefaults,
|
|
1699
2041
|
fetchMode,
|
|
1700
2042
|
...browserOptions,
|
|
1701
2043
|
solveCloudflare: Boolean(options.solveCloudflare),
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[match]
|
|
2
|
+
host_suffixes = ["xiaohongshu.com"]
|
|
3
|
+
|
|
4
|
+
[fetch]
|
|
5
|
+
mode = "auto"
|
|
6
|
+
scroll_to_bottom = true
|
|
7
|
+
wait_ms = 5000
|
|
8
|
+
|
|
9
|
+
[extract]
|
|
10
|
+
selectors = [".note-content", "#noteContainer", ".note-container", ".note-detail"]
|
|
11
|
+
|
|
12
|
+
[metadata]
|
|
13
|
+
strip_title_regexes = ["\\s*-\\s*小红书\\s*$"]
|
|
14
|
+
strip_author_regexes = ["关注$"]
|
|
15
|
+
author_selectors = [".author-container .username", ".author-wrapper .name", ".user-name"]
|
|
16
|
+
author_meta_names = ["author"]
|
|
17
|
+
author_meta_properties = ["article:author"]
|
|
18
|
+
|
|
19
|
+
[media]
|
|
20
|
+
include_meta_images = true
|
|
21
|
+
image_meta_properties = ["og:image"]
|
|
22
|
+
|
|
23
|
+
[clean.remove]
|
|
24
|
+
selectors = [
|
|
25
|
+
".side-bar",
|
|
26
|
+
".left-container",
|
|
27
|
+
".comments-el",
|
|
28
|
+
".interactions",
|
|
29
|
+
".engage-bar",
|
|
30
|
+
".note-detail-dropdown",
|
|
31
|
+
".close-circle",
|
|
32
|
+
".close-box",
|
|
33
|
+
".login-container",
|
|
34
|
+
".bottom-container .notedetail-menu",
|
|
35
|
+
".author",
|
|
36
|
+
".author-container",
|
|
37
|
+
".media-container",
|
|
38
|
+
".fraction",
|
|
39
|
+
".arrow-controller",
|
|
40
|
+
".pagination-media-container"
|
|
41
|
+
]
|
|
42
|
+
text_contains = [
|
|
43
|
+
"创作中心",
|
|
44
|
+
"业务合作",
|
|
45
|
+
"沪ICP备",
|
|
46
|
+
"营业执照",
|
|
47
|
+
"违法不良信息举报电话",
|
|
48
|
+
"行吟信息科技",
|
|
49
|
+
"小红书网页版",
|
|
50
|
+
"登录后推荐更懂你的笔记"
|
|
51
|
+
]
|
|
52
|
+
exact_text = ["关注", "加载中", "更多", "发现", "直播", "发布", "通知"]
|
|
53
|
+
|
|
54
|
+
[clean.truncate]
|
|
55
|
+
after_regexes = ["^共 \\d+ 条评论$", "^相关推荐$", "^登录后推荐更懂你的笔记$"]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[match]
|
|
2
|
+
host_suffixes = ["zhihu.com"]
|
|
3
|
+
|
|
4
|
+
[extract]
|
|
5
|
+
selectors = ["[class*=\"Post-RichTextContainer\"]", "[class*=\"RichText ztext\"]", "[class*=\"RichContent-inner\"]"]
|
|
6
|
+
|
|
7
|
+
[metadata]
|
|
8
|
+
strip_title_regexes = ["\\s*-\\s*知乎\\s*$"]
|
|
9
|
+
|
|
10
|
+
[fetch]
|
|
11
|
+
mode = "browser"
|
|
12
|
+
prefer_browser_state = true
|
|
13
|
+
scroll_to_bottom = true
|
|
14
|
+
wait_ms = 8000
|
|
15
|
+
|
|
16
|
+
[clean.remove]
|
|
17
|
+
class_contains = ["RichText-LinkCardContainer"]
|
|
18
|
+
text_regexes = ["^目录收起$", "^目录收起.*References$", "^.+\\d+ 赞同 · \\d+ 评论 文章$", "^.+\\d+ 赞同 · \\d+ 评论 文章$", "^\\d+ 赞同 · \\d+ 评论 文章$"]
|
|
19
|
+
exact_text = ["目录", "收起"]
|
|
20
|
+
|
|
21
|
+
[clean.truncate]
|
|
22
|
+
after_regexes = ["^发布于 ", "^赞同 ", "^\\d+ 条评论$", "^分享$", "^申请转载$", ".*的广告$"]
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ariesfish/feedloom",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.1.3",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"author": "ariesfish",
|
|
6
6
|
"license": "MIT",
|
|
@@ -24,7 +24,7 @@
|
|
|
24
24
|
],
|
|
25
25
|
"scripts": {
|
|
26
26
|
"dev": "tsx src/cli.ts",
|
|
27
|
-
"build": "tsup src/cli.ts --format esm --dts --clean",
|
|
27
|
+
"build": "tsup src/cli.ts --format esm --dts --clean && rm -rf dist/site-rules && cp -R src/site-rules dist/site-rules",
|
|
28
28
|
"typecheck": "tsc --noEmit",
|
|
29
29
|
"test": "vitest run",
|
|
30
30
|
"prepublishOnly": "npm run typecheck && npm test && npm run build"
|
package/skills/feedloom/SKILL.md
CHANGED
|
@@ -22,6 +22,8 @@ npx -y @ariesfish/feedloom <inputs...> [options]
|
|
|
22
22
|
|
|
23
23
|
## Common usage
|
|
24
24
|
|
|
25
|
+
Before running Feedloom, check whether this skill directory has a `site-rules/` directory. If it exists, always pass it with `--site-rules-dir $HOME/.agents/skills/feedloom/site-rules`; do not omit available site rules.
|
|
26
|
+
|
|
25
27
|
```bash
|
|
26
28
|
npx -y @ariesfish/feedloom "https://example.com/article"
|
|
27
29
|
npx -y @ariesfish/feedloom urls.txt
|
|
@@ -56,20 +58,24 @@ Use the least expensive mode that works:
|
|
|
56
58
|
- `--wait-ms <ms>`, `--wait-selector <selector>`, `--scroll-to-bottom`: give dynamic pages time or actions to reveal article content.
|
|
57
59
|
- `--click-selector <selector...>`: click dismiss/expand selectors before extracting HTML.
|
|
58
60
|
- `--headful`: show the browser window for debugging login, popups, or dynamic loading.
|
|
59
|
-
- `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example
|
|
61
|
+
- `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example `$HOME/.agents/skills/feedloom/site-rules/`.
|
|
60
62
|
- `--solve-cloudflare`, `--proxy <server>`, `--dns-over-https`: use only when stealth fetching needs them.
|
|
61
63
|
|
|
62
64
|
Run `npx -y @ariesfish/feedloom --help` for the complete option list. Do not invent unsupported options.
|
|
63
65
|
|
|
64
|
-
##
|
|
66
|
+
## Site rules
|
|
67
|
+
|
|
68
|
+
Feedloom ships built-in TOML site rules in the package for common sites such as WeChat and Zhihu. These are loaded automatically; do not pass a special option for built-in rules.
|
|
65
69
|
|
|
66
|
-
|
|
70
|
+
Private skill rules are also supported and must be used whenever they are present next to this skill. Always check for `$HOME/.agents/skills/feedloom/site-rules/` before clipping. If that directory exists, pass it explicitly on every Feedloom command using the `$HOME`-prefixed path:
|
|
67
71
|
|
|
68
72
|
```bash
|
|
69
|
-
npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir skills/feedloom/site-rules
|
|
73
|
+
npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir "$HOME/.agents/skills/feedloom/site-rules"
|
|
70
74
|
```
|
|
71
75
|
|
|
72
|
-
Treat rule files in
|
|
76
|
+
Treat rule files in `$HOME/.agents/skills/feedloom/site-rules/` as local reference material and use them whenever available; never skip an existing site-rules directory unless the user explicitly asks not to use it.
|
|
77
|
+
|
|
78
|
+
For adding or editing private rules, read `references/site-rules.md`. It contains the TOML schema, examples, `[fetch]` behavior, and validation workflow.
|
|
73
79
|
|
|
74
80
|
## Output
|
|
75
81
|
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# Feedloom site rules
|
|
2
|
+
|
|
3
|
+
Use TOML site rules when Feedloom needs a narrow site-specific selector, cleanup overlay, metadata normalization, or conservative fetch preference. Do not write ad-hoc scrapers.
|
|
4
|
+
|
|
5
|
+
## Locations
|
|
6
|
+
|
|
7
|
+
Private skill rules live in:
|
|
8
|
+
|
|
9
|
+
```text
|
|
10
|
+
$HOME/.agents/skills/feedloom/site-rules/<site>.toml
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
When the private rules directory exists, pass it on every command:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir "$HOME/.agents/skills/feedloom/site-rules"
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Add a private rule
|
|
20
|
+
|
|
21
|
+
Create or edit one TOML file per site:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
mkdir -p "$HOME/.agents/skills/feedloom/site-rules"
|
|
25
|
+
$EDITOR "$HOME/.agents/skills/feedloom/site-rules/example.toml"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Minimal rule:
|
|
29
|
+
|
|
30
|
+
```toml
|
|
31
|
+
[match]
|
|
32
|
+
host_suffixes = ["example.com"]
|
|
33
|
+
|
|
34
|
+
[extract]
|
|
35
|
+
selectors = ["article", "main"]
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Rule with fetch preferences:
|
|
39
|
+
|
|
40
|
+
```toml
|
|
41
|
+
[match]
|
|
42
|
+
host_suffixes = ["zhihu.com"]
|
|
43
|
+
|
|
44
|
+
[fetch]
|
|
45
|
+
mode = "browser"
|
|
46
|
+
prefer_browser_state = true
|
|
47
|
+
scroll_to_bottom = true
|
|
48
|
+
wait_ms = 8000
|
|
49
|
+
|
|
50
|
+
[extract]
|
|
51
|
+
selectors = ["[class*=\"Post-RichTextContainer\"]", "[class*=\"RichText ztext\"]"]
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Schema
|
|
55
|
+
|
|
56
|
+
Supported sections:
|
|
57
|
+
|
|
58
|
+
- `[match]`: `host_suffixes`, `host_regexes`, `url_regexes`, `html_markers`.
|
|
59
|
+
- `[fetch]`: `mode`, `prefer_browser_state`, `wait_ms`, `network_idle`, `wait_selector`, `wait_selector_state`, `click_selectors`, `scroll_to_bottom`, `use_proxy_env`.
|
|
60
|
+
- `[extract]`: `selectors`, `require_text`.
|
|
61
|
+
- `[metadata]`: `fixed_author`, `strip_title_regexes`, `strip_author_regexes`, `author_selectors`, `author_meta_names`, `author_meta_itemprops`, `author_meta_properties`.
|
|
62
|
+
- `[clean.remove]`: `selectors`, `class_contains`, `id_contains`, `attr_contains`, `text_contains`, `text_regexes`, `exact_text`.
|
|
63
|
+
- `[clean.truncate]`: `after_contains`, `after_regexes`.
|
|
64
|
+
|
|
65
|
+
## Fetch rules
|
|
66
|
+
|
|
67
|
+
Use `[fetch]` only when a site consistently needs browser rendering, local Chrome state, scrolling, waiting, clicking, or proxy-aware requests.
|
|
68
|
+
|
|
69
|
+
`use_proxy_env = true` tells Feedloom to use `HTTP_PROXY`, `HTTPS_PROXY`, `ALL_PROXY`, and `NO_PROXY` for static fetches and Defuddle async extractor fetches. Use this for YouTube transcript capture and similar extractor-backed pages that need the user's proxy settings.
|
|
70
|
+
|
|
71
|
+
`prefer_browser_state = true` only tells Feedloom to use copied Chrome state for matching URLs. It does not store the local Chrome path. The command still needs Chrome state parameters when login state is required:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
npx -y @ariesfish/feedloom \
|
|
75
|
+
--chrome-user-data-dir "$HOME/Library/Application Support/Google/Chrome" \
|
|
76
|
+
--chrome-profile Default \
|
|
77
|
+
--site-rules-dir "$HOME/.agents/skills/feedloom/site-rules" \
|
|
78
|
+
"https://zhuanlan.zhihu.com/p/..."
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Rules for writing rules
|
|
82
|
+
|
|
83
|
+
- Prefer narrow domain-specific selectors over broad selectors.
|
|
84
|
+
- Prefer content containers over page shells. Avoid `body` unless the HTML is already minimal.
|
|
85
|
+
- Use `require_text = true` when a matched extractor-backed page should fail instead of writing an empty note.
|
|
86
|
+
- Use cleanup only for repeated, stable noise inside otherwise correct content.
|
|
87
|
+
- Use truncation only for stable tail markers where everything after the marker is non-article content.
|
|
88
|
+
- Do not add aggressive crawling, high concurrency, repeated challenge solving, or broad stealth defaults.
|
|
89
|
+
- Keep private rules outside project repos unless the user is working on Feedloom itself.
|
|
90
|
+
|
|
91
|
+
## Validation
|
|
92
|
+
|
|
93
|
+
After adding or editing a private rule, test one known URL and inspect the Markdown:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
outdir=$(mktemp -d /tmp/feedloom-rule-test-XXXXXX)
|
|
97
|
+
npx -y @ariesfish/feedloom \
|
|
98
|
+
--output-dir "$outdir" \
|
|
99
|
+
--site-rules-dir "$HOME/.agents/skills/feedloom/site-rules" \
|
|
100
|
+
"https://example.com/article"
|
|
101
|
+
find "$outdir" -maxdepth 2 -type f | sort
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
For sites that require Chrome state, add the Chrome state options shown above.
|