@ariesfish/feedloom 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -281,7 +281,8 @@ npm test
281
281
  - Respect robots.txt, website terms of service, copyright, and rate limits.
282
282
  - For dynamic pages, try `--fetch-mode browser` first.
283
283
  - For static blogs and news sites, `--fetch-mode static` is usually faster.
284
- - If article extraction is poor for a specific site, keep private TOML site rules outside the package and pass them with `--site-rules-dir <dir>`.
284
+ - Feedloom ships bundled TOML site rules for common dynamic/structured sites such as WeChat official account articles and Zhihu. Site rules can define extraction, cleanup, and fetch preferences. For example, the bundled Zhihu rule uses browser fetch with copied Chrome state when `--chrome-user-data-dir`/`--chrome-profile` are configured.
285
+ - If article extraction is poor for a specific site, keep private TOML site rules outside the package and pass them with `--site-rules-dir <dir>`. Private rules are loaded after bundled rules.
285
286
  - For large batches, test with `--limit` before running the full job.
286
287
 
287
288
  ## Acknowledgements
package/dist/cli.js CHANGED
@@ -2,7 +2,8 @@
2
2
 
3
3
  // src/cli.ts
4
4
  import { readdir as readdir2 } from "fs/promises";
5
- import { join as join7, resolve as resolve2 } from "path";
5
+ import { dirname, join as join7, resolve as resolve2 } from "path";
6
+ import { fileURLToPath } from "url";
6
7
  import { Command } from "commander";
7
8
 
8
9
  // src/cleaning/profiles.ts
@@ -39,10 +40,29 @@ function profileFromTomlRule(name, rule) {
39
40
  metadata: {
40
41
  fixedAuthor: rule.metadata?.fixed_author,
41
42
  titleSuffixPatterns: rule.metadata?.strip_title_regexes,
43
+ authorSuffixPatterns: rule.metadata?.strip_author_regexes,
42
44
  authorSelectors: rule.metadata?.author_selectors,
43
45
  authorMetaNames: rule.metadata?.author_meta_names,
44
46
  authorMetaItemprops: rule.metadata?.author_meta_itemprops,
45
47
  authorMetaProperties: rule.metadata?.author_meta_properties
48
+ },
49
+ fetch: {
50
+ mode: rule.fetch?.mode,
51
+ preferBrowserState: rule.fetch?.prefer_browser_state,
52
+ waitMs: rule.fetch?.wait_ms,
53
+ networkIdle: rule.fetch?.network_idle,
54
+ waitSelector: rule.fetch?.wait_selector,
55
+ waitSelectorState: rule.fetch?.wait_selector_state,
56
+ clickSelectors: rule.fetch?.click_selectors,
57
+ scrollToBottom: rule.fetch?.scroll_to_bottom,
58
+ useProxyEnv: rule.fetch?.use_proxy_env
59
+ },
60
+ media: {
61
+ includeMetaImages: rule.media?.include_meta_images,
62
+ imageMetaProperties: rule.media?.image_meta_properties
63
+ },
64
+ extraction: {
65
+ requireText: rule.extract?.require_text
46
66
  }
47
67
  };
48
68
  }
@@ -122,13 +142,13 @@ async function runPageActions(page, options) {
122
142
  await page.locator(selector).first().click({ timeout: 5e3 }).catch(() => void 0);
123
143
  }
124
144
  if (options.scrollToBottom) {
125
- await page.evaluate(async () => {
126
- const delay = (ms) => new Promise((resolve3) => setTimeout(resolve3, ms));
145
+ await page.evaluate(`(async () => {
146
+ const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
127
147
  for (let i = 0; i < 8; i += 1) {
128
148
  window.scrollTo(0, document.body.scrollHeight);
129
149
  await delay(250);
130
150
  }
131
- });
151
+ })()`);
132
152
  }
133
153
  if (options.waitSelector) {
134
154
  await page.locator(options.waitSelector).first().waitFor({
@@ -409,13 +429,13 @@ async function fetchWithStealthContext(context, url, options) {
409
429
  await page.locator(selector).first().click({ timeout: 5e3 }).catch(() => void 0);
410
430
  }
411
431
  if (options.scrollToBottom) {
412
- await page.evaluate(async () => {
413
- const delay = (ms) => new Promise((resolve3) => setTimeout(resolve3, ms));
432
+ await page.evaluate(`(async () => {
433
+ const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
414
434
  for (let i = 0; i < 8; i += 1) {
415
435
  window.scrollTo(0, document.body.scrollHeight);
416
436
  await delay(250);
417
437
  }
418
- });
438
+ })()`);
419
439
  }
420
440
  if (options.waitSelector) {
421
441
  await page.locator(options.waitSelector).first().waitFor({ state: options.waitSelectorState ?? "attached", timeout: timeoutMs }).catch(() => void 0);
@@ -824,8 +844,8 @@ function imageSource(img) {
824
844
  return first || null;
825
845
  }
826
846
  async function localizeImages(html, options) {
827
- const { document: document2 } = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
828
- const images = Array.from(document2.querySelectorAll("img"));
847
+ const { document } = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
848
+ const images = Array.from(document.querySelectorAll("img"));
829
849
  if (images.length === 0) return html;
830
850
  const fetchImage = options.fetchImage ?? fetch;
831
851
  const seen = /* @__PURE__ */ new Map();
@@ -864,7 +884,7 @@ async function localizeImages(html, options) {
864
884
  img.removeAttribute("data-original");
865
885
  img.removeAttribute("data-src");
866
886
  }
867
- return document2.body.innerHTML;
887
+ return document.body.innerHTML;
868
888
  }
869
889
 
870
890
  // src/cleaning/clean-html.ts
@@ -1004,6 +1024,18 @@ function cleanupTitle(metadata, profiles) {
1004
1024
  }
1005
1025
  metadata.title = title;
1006
1026
  }
1027
+ function cleanupAuthor(metadata, profiles) {
1028
+ if (!metadata.author) {
1029
+ return;
1030
+ }
1031
+ let author = metadata.author;
1032
+ for (const profile of profiles) {
1033
+ for (const pattern of profile.metadata?.authorSuffixPatterns ?? []) {
1034
+ author = author.replace(new RegExp(pattern, "i"), "").trim();
1035
+ }
1036
+ }
1037
+ metadata.author = author;
1038
+ }
1007
1039
  function applySiteProfiles(root, profiles, removals) {
1008
1040
  removeByExactSelectors(root, profiles, removals);
1009
1041
  removeByPartialAttributePatterns(root, profiles, removals);
@@ -1012,6 +1044,7 @@ function applySiteProfiles(root, profiles, removals) {
1012
1044
  function applyMetadataProfiles(metadata, profiles) {
1013
1045
  applyFixedAuthor(metadata, profiles);
1014
1046
  cleanupTitle(metadata, profiles);
1047
+ cleanupAuthor(metadata, profiles);
1015
1048
  }
1016
1049
 
1017
1050
  // src/cleaning/clean-html.ts
@@ -1033,17 +1066,17 @@ var DEFAULT_FEEDLOOM_PROFILE = {
1033
1066
  }
1034
1067
  };
1035
1068
  var DefuddleClass = DefuddleModule.default ?? DefuddleModule.Defuddle;
1036
- function firstMetaContent(document2, names) {
1069
+ function firstMetaContent(document, names) {
1037
1070
  for (const name of names) {
1038
1071
  const escaped = name.replace(/"/g, '\\"');
1039
- const element = document2.querySelector(`meta[property="${escaped}"], meta[name="${escaped}"], meta[itemprop="${escaped}"]`);
1072
+ const element = document.querySelector(`meta[property="${escaped}"], meta[name="${escaped}"], meta[itemprop="${escaped}"]`);
1040
1073
  const content = element?.getAttribute("content")?.trim();
1041
1074
  if (content) return content;
1042
1075
  }
1043
1076
  return void 0;
1044
1077
  }
1045
- function jsonLdValue(document2, keys) {
1046
- for (const script of Array.from(document2.querySelectorAll('script[type="application/ld+json"]'))) {
1078
+ function jsonLdValue(document, keys) {
1079
+ for (const script of Array.from(document.querySelectorAll('script[type="application/ld+json"]'))) {
1047
1080
  const text = script.textContent?.trim();
1048
1081
  if (!text) continue;
1049
1082
  try {
@@ -1064,12 +1097,12 @@ function jsonLdValue(document2, keys) {
1064
1097
  }
1065
1098
  return void 0;
1066
1099
  }
1067
- function profileAuthorFromDocument(document2, profiles) {
1100
+ function profileAuthorFromDocument(document, profiles) {
1068
1101
  for (const profile of profiles) {
1069
1102
  const metadata = profile.metadata;
1070
1103
  if (!metadata) continue;
1071
1104
  for (const selector of metadata.authorSelectors ?? []) {
1072
- const author = document2.querySelector(selector)?.textContent?.replace(/\s+/g, " ").trim();
1105
+ const author = document.querySelector(selector)?.textContent?.replace(/\s+/g, " ").trim();
1073
1106
  if (author) return author;
1074
1107
  }
1075
1108
  const metaNames = [
@@ -1079,33 +1112,54 @@ function profileAuthorFromDocument(document2, profiles) {
1079
1112
  ];
1080
1113
  for (const entry of metaNames) {
1081
1114
  const escaped = entry.value.replace(/"/g, '\\"');
1082
- const author = document2.querySelector(`meta[${entry.attr}="${escaped}"]`)?.getAttribute("content")?.trim();
1115
+ const author = document.querySelector(`meta[${entry.attr}="${escaped}"]`)?.getAttribute("content")?.trim();
1083
1116
  if (author) return author;
1084
1117
  }
1085
1118
  }
1086
1119
  return void 0;
1087
1120
  }
1088
- function toMetadata(result, document2, profiles) {
1121
+ function toMetadata(result, document, profiles) {
1089
1122
  return {
1090
- title: result.title || firstMetaContent(document2, ["og:title", "twitter:title"]) || document2.querySelector("title")?.textContent?.trim() || void 0,
1091
- description: result.description || firstMetaContent(document2, ["description", "og:description", "twitter:description"]),
1123
+ title: result.title || firstMetaContent(document, ["og:title", "twitter:title"]) || document.querySelector("title")?.textContent?.trim() || void 0,
1124
+ description: result.description || firstMetaContent(document, ["description", "og:description", "twitter:description"]),
1092
1125
  domain: result.domain || void 0,
1093
1126
  favicon: result.favicon || void 0,
1094
- image: result.image || firstMetaContent(document2, ["og:image", "twitter:image"]),
1095
- language: result.language || document2.documentElement.getAttribute("lang") || void 0,
1096
- published: result.published || firstMetaContent(document2, ["article:published_time", "date", "datePublished", "pubdate", "publishdate"]) || jsonLdValue(document2, ["datePublished", "dateCreated"]),
1097
- author: result.author || profileAuthorFromDocument(document2, profiles) || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
1098
- site: result.site || firstMetaContent(document2, ["og:site_name", "application-name"]),
1127
+ image: result.image || firstMetaContent(document, ["og:image", "twitter:image"]),
1128
+ language: result.language || document.documentElement.getAttribute("lang") || void 0,
1129
+ published: result.published || firstMetaContent(document, ["article:published_time", "date", "datePublished", "pubdate", "publishdate"]) || jsonLdValue(document, ["datePublished", "dateCreated"]),
1130
+ author: result.author || profileAuthorFromDocument(document, profiles) || firstMetaContent(document, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document, ["author", "creator"]),
1131
+ site: result.site || firstMetaContent(document, ["og:site_name", "application-name"]),
1099
1132
  schemaOrgData: result.schemaOrgData,
1100
1133
  wordCount: result.wordCount,
1101
1134
  parseTime: result.parseTime
1102
1135
  };
1103
1136
  }
1104
- function serializeProfiledContent(content, profiles, removals) {
1105
- const { document: document2 } = parseHTML2(`<!doctype html><html><body><main data-feedloom-profile-root="true">${content}</main></body></html>`);
1106
- const root = document2.querySelector('[data-feedloom-profile-root="true"]') ?? document2.body;
1137
+ function appendMetaImages(document, root, profiles) {
1138
+ const properties = profiles.flatMap((profile) => profile.media?.includeMetaImages ? profile.media.imageMetaProperties ?? ["og:image"] : []);
1139
+ if (properties.length === 0) {
1140
+ return;
1141
+ }
1142
+ const seen = new Set(Array.from(root.querySelectorAll("img")).map((img) => img.getAttribute("src") ?? ""));
1143
+ for (const property of properties) {
1144
+ const escaped = property.replace(/"/g, '\\"');
1145
+ for (const meta of Array.from(document.querySelectorAll(`meta[property="${escaped}"], meta[name="${escaped}"], meta[itemprop="${escaped}"]`))) {
1146
+ const src = meta.getAttribute("content")?.trim();
1147
+ if (!src || seen.has(src)) continue;
1148
+ const img = document.createElement("img");
1149
+ img.setAttribute("src", src);
1150
+ img.setAttribute("alt", "");
1151
+ root.appendChild(document.createElement("p"));
1152
+ root.lastElementChild?.appendChild(img);
1153
+ seen.add(src);
1154
+ }
1155
+ }
1156
+ }
1157
+ function serializeProfiledContent(document, content, profiles, removals) {
1158
+ const { document: contentDocument } = parseHTML2(`<!doctype html><html><body><main data-feedloom-profile-root="true">${content}</main></body></html>`);
1159
+ const root = contentDocument.querySelector('[data-feedloom-profile-root="true"]') ?? contentDocument.body;
1160
+ appendMetaImages(document, root, profiles);
1107
1161
  applySiteProfiles(root, profiles, removals);
1108
- const serialized = root.innerHTML || root.outerHTML || document2.body.innerHTML;
1162
+ const serialized = root.innerHTML || root.outerHTML || contentDocument.body.innerHTML;
1109
1163
  return serialized.trim() ? `${serialized.trim()}
1110
1164
  ` : "";
1111
1165
  }
@@ -1120,9 +1174,9 @@ var HtmlCleaner = class {
1120
1174
  const preferredContentSelector = this.options.contentSelector ?? firstContentSelector(activeProfiles);
1121
1175
  const removals = [];
1122
1176
  const html = /<html[\s>]/i.test(rawHtml) ? rawHtml : `<!doctype html><html><body>${rawHtml}</body></html>`;
1123
- const { document: document2 } = parseHTML2(html);
1124
- const contentSelector = preferredContentSelector && document2.querySelector(preferredContentSelector) ? preferredContentSelector : void 0;
1125
- const doc = document2;
1177
+ const { document } = parseHTML2(html);
1178
+ const contentSelector = preferredContentSelector && document.querySelector(preferredContentSelector) ? preferredContentSelector : void 0;
1179
+ const doc = document;
1126
1180
  if (this.options.baseUrl) {
1127
1181
  doc.URL = this.options.baseUrl;
1128
1182
  }
@@ -1139,12 +1193,14 @@ var HtmlCleaner = class {
1139
1193
  removeExactSelectors: this.options.removeExactSelectors,
1140
1194
  removePartialSelectors: this.options.removePartialSelectors,
1141
1195
  removeContentPatterns: this.options.removeContentPatterns,
1142
- standardize: this.options.standardize
1196
+ standardize: this.options.standardize,
1197
+ fetch: this.options.defuddleFetch,
1198
+ language: this.options.language
1143
1199
  });
1144
1200
  const result = parser2.parseAsync ? await parser2.parseAsync() : parser2.parse();
1145
- const metadata = toMetadata(result, document2, activeProfiles);
1201
+ const metadata = toMetadata(result, document, activeProfiles);
1146
1202
  applyMetadataProfiles(metadata, activeProfiles);
1147
- const content = serializeProfiledContent(result.content, postProfiles, removals);
1203
+ const content = serializeProfiledContent(document, result.content, postProfiles, removals);
1148
1204
  return {
1149
1205
  content,
1150
1206
  contentMarkdown: result.contentMarkdown,
@@ -1161,6 +1217,212 @@ async function cleanHtml(rawHtml, options = {}) {
1161
1217
  return new HtmlCleaner(options).parse(rawHtml);
1162
1218
  }
1163
1219
 
1220
+ // src/fetch/proxy-fetch.ts
1221
+ import { request as httpRequest } from "http";
1222
+ import { connect as tlsConnect } from "tls";
1223
+ var REDIRECT_STATUSES = /* @__PURE__ */ new Set([301, 302, 303, 307, 308]);
1224
+ var DEFAULT_REDIRECT_LIMIT = 10;
1225
+ function envProxyForUrl(targetUrl) {
1226
+ const raw = targetUrl.protocol === "https:" ? process.env.HTTPS_PROXY || process.env.https_proxy || process.env.ALL_PROXY || process.env.all_proxy : process.env.HTTP_PROXY || process.env.http_proxy || process.env.ALL_PROXY || process.env.all_proxy;
1227
+ if (!raw || noProxyMatches(targetUrl.hostname)) {
1228
+ return null;
1229
+ }
1230
+ try {
1231
+ return new URL(raw);
1232
+ } catch {
1233
+ return null;
1234
+ }
1235
+ }
1236
+ function noProxyMatches(hostname) {
1237
+ const raw = process.env.NO_PROXY ?? process.env.no_proxy ?? "";
1238
+ if (!raw) return false;
1239
+ const host = hostname.toLowerCase();
1240
+ return raw.split(",").map((entry) => entry.trim().toLowerCase()).some((entry) => {
1241
+ if (!entry) return false;
1242
+ if (entry === "*") return true;
1243
+ if (entry.startsWith(".")) return host === entry.slice(1) || host.endsWith(entry);
1244
+ return host === entry || host.endsWith(`.${entry}`);
1245
+ });
1246
+ }
1247
+ function headersToRecord(headers) {
1248
+ const record = {};
1249
+ if (!headers) return record;
1250
+ new Headers(headers).forEach((value, key) => {
1251
+ record[key] = value;
1252
+ });
1253
+ return record;
1254
+ }
1255
+ function responseHeaders(headers) {
1256
+ const result = new Headers();
1257
+ for (const [key, value] of Object.entries(headers)) {
1258
+ if (Array.isArray(value)) {
1259
+ for (const item of value) result.append(key, item);
1260
+ } else if (value !== void 0) {
1261
+ result.set(key, String(value));
1262
+ }
1263
+ }
1264
+ return result;
1265
+ }
1266
+ async function bodyToBuffer(body) {
1267
+ if (body === void 0 || body === null) return void 0;
1268
+ if (typeof ReadableStream !== "undefined" && body instanceof ReadableStream) {
1269
+ throw new Error("proxy-aware fetch does not support streaming request bodies");
1270
+ }
1271
+ if (typeof body === "string") return Buffer.from(body);
1272
+ if (body instanceof URLSearchParams) return Buffer.from(body.toString());
1273
+ if (body instanceof ArrayBuffer) return Buffer.from(body);
1274
+ if (ArrayBuffer.isView(body)) return Buffer.from(body.buffer, body.byteOffset, body.byteLength);
1275
+ if (typeof Blob !== "undefined" && body instanceof Blob) return Buffer.from(await body.arrayBuffer());
1276
+ throw new Error("proxy-aware fetch only supports buffered request bodies");
1277
+ }
1278
+ function collectResponse(res, done) {
1279
+ const chunks = [];
1280
+ res.on("data", (chunk) => {
1281
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
1282
+ });
1283
+ res.on("end", () => {
1284
+ done(null, {
1285
+ status: res.statusCode ?? 0,
1286
+ statusText: res.statusMessage ?? "",
1287
+ headers: Object.fromEntries(responseHeaders(res.headers)),
1288
+ body: Buffer.concat(chunks)
1289
+ });
1290
+ });
1291
+ res.on("error", (error) => done(error));
1292
+ }
1293
+ function proxyAuthorization(proxy) {
1294
+ if (!proxy.username) return void 0;
1295
+ return `Basic ${Buffer.from(`${decodeURIComponent(proxy.username)}:${decodeURIComponent(proxy.password)}`).toString("base64")}`;
1296
+ }
1297
+ function requestViaHttpProxy(targetUrl, proxy, method, headers, body, signal) {
1298
+ if (proxy.protocol !== "http:") {
1299
+ throw new Error(`Unsupported proxy protocol: ${proxy.protocol}`);
1300
+ }
1301
+ return new Promise((resolve3, reject) => {
1302
+ let settled = false;
1303
+ let active = null;
1304
+ const done = (error, response) => {
1305
+ if (settled) return;
1306
+ settled = true;
1307
+ signal?.removeEventListener("abort", abort);
1308
+ if (error) reject(error);
1309
+ else if (response) resolve3(response);
1310
+ else reject(new Error("Proxy request ended without a response"));
1311
+ };
1312
+ const abort = () => active?.destroy(new Error("The operation was aborted"));
1313
+ if (signal?.aborted) {
1314
+ done(new Error("The operation was aborted"));
1315
+ return;
1316
+ }
1317
+ signal?.addEventListener("abort", abort, { once: true });
1318
+ const targetPort = targetUrl.port ? Number(targetUrl.port) : targetUrl.protocol === "https:" ? 443 : 80;
1319
+ const proxyPort = proxy.port ? Number(proxy.port) : 8080;
1320
+ const auth = proxyAuthorization(proxy);
1321
+ const requestHeaders2 = {
1322
+ ...headers,
1323
+ host: targetUrl.host
1324
+ };
1325
+ if (body && !Object.keys(requestHeaders2).some((key) => key.toLowerCase() === "content-length")) {
1326
+ requestHeaders2["content-length"] = String(body.byteLength);
1327
+ }
1328
+ if (targetUrl.protocol === "https:") {
1329
+ const connectHeaders = { host: `${targetUrl.hostname}:${targetPort}` };
1330
+ if (auth) connectHeaders["proxy-authorization"] = auth;
1331
+ const connectReq = httpRequest({
1332
+ host: proxy.hostname,
1333
+ port: proxyPort,
1334
+ method: "CONNECT",
1335
+ path: `${targetUrl.hostname}:${targetPort}`,
1336
+ headers: connectHeaders
1337
+ });
1338
+ active = connectReq;
1339
+ connectReq.on("connect", (connectRes, socket) => {
1340
+ if (connectRes.statusCode !== 200) {
1341
+ socket.destroy();
1342
+ done(new Error(`Proxy CONNECT failed: ${connectRes.statusCode ?? 0}`));
1343
+ return;
1344
+ }
1345
+ const tlsSocket = tlsConnect({ socket, host: targetUrl.hostname, servername: targetUrl.hostname });
1346
+ active = tlsSocket;
1347
+ tlsSocket.on("error", (error) => done(error));
1348
+ tlsSocket.on("secureConnect", () => {
1349
+ const req2 = httpRequest({
1350
+ method,
1351
+ path: `${targetUrl.pathname}${targetUrl.search}`,
1352
+ headers: requestHeaders2,
1353
+ createConnection: () => tlsSocket
1354
+ }, (res) => collectResponse(res, done));
1355
+ active = req2;
1356
+ req2.on("error", (error) => done(error));
1357
+ if (body) req2.write(body);
1358
+ req2.end();
1359
+ });
1360
+ });
1361
+ connectReq.on("error", (error) => done(error));
1362
+ connectReq.end();
1363
+ return;
1364
+ }
1365
+ if (auth) requestHeaders2["proxy-authorization"] = auth;
1366
+ const req = httpRequest({
1367
+ host: proxy.hostname,
1368
+ port: proxyPort,
1369
+ method,
1370
+ path: targetUrl.href,
1371
+ headers: requestHeaders2
1372
+ }, (res) => collectResponse(res, done));
1373
+ active = req;
1374
+ req.on("error", (error) => done(error));
1375
+ if (body) req.write(body);
1376
+ req.end();
1377
+ });
1378
+ }
1379
+ function requestUrl(input) {
1380
+ if (input instanceof URL) return input;
1381
+ if (typeof input === "string") return new URL(input);
1382
+ return new URL(input.url);
1383
+ }
1384
+ function requestMethod(input, init) {
1385
+ if (init.method) return init.method.toUpperCase();
1386
+ if (input instanceof Request) return input.method.toUpperCase();
1387
+ return "GET";
1388
+ }
1389
+ function requestHeaders(input, init) {
1390
+ return {
1391
+ ...input instanceof Request ? headersToRecord(input.headers) : {},
1392
+ ...headersToRecord(init.headers)
1393
+ };
1394
+ }
1395
+ async function proxyAwareFetchInternal(input, init, redirectsLeft) {
1396
+ const url = requestUrl(input);
1397
+ const proxy = envProxyForUrl(url);
1398
+ if (!proxy) {
1399
+ return fetch(input, init);
1400
+ }
1401
+ const method = requestMethod(input, init);
1402
+ const headers = requestHeaders(input, init);
1403
+ const body = await bodyToBuffer(init.body ?? (input instanceof Request ? input.body : void 0));
1404
+ const proxied = await requestViaHttpProxy(url, proxy, method, headers, body, init.signal ?? void 0);
1405
+ const location = proxied.headers.location;
1406
+ const redirectMode = init.redirect ?? "follow";
1407
+ if (redirectMode !== "manual" && REDIRECT_STATUSES.has(proxied.status) && location && redirectsLeft > 0) {
1408
+ const nextUrl = new URL(location, url);
1409
+ const nextInit = { ...init };
1410
+ if (proxied.status === 303) {
1411
+ nextInit.method = "GET";
1412
+ nextInit.body = void 0;
1413
+ }
1414
+ return proxyAwareFetchInternal(nextUrl, nextInit, redirectsLeft - 1);
1415
+ }
1416
+ return new Response(new Uint8Array(proxied.body), {
1417
+ status: proxied.status,
1418
+ statusText: proxied.statusText,
1419
+ headers: proxied.headers
1420
+ });
1421
+ }
1422
+ async function proxyAwareFetch(input, init = {}) {
1423
+ return proxyAwareFetchInternal(input, init, DEFAULT_REDIRECT_LIMIT);
1424
+ }
1425
+
1164
1426
  // src/fetch/strategy.ts
1165
1427
  import { writeFile as writeFile3 } from "fs/promises";
1166
1428
 
@@ -1175,8 +1437,8 @@ function extractPreloadedMarkdownUrl(html, baseUrl) {
1175
1437
  }
1176
1438
  return new URL(rawUrl, baseUrl).toString();
1177
1439
  }
1178
- function removeNoise(document2) {
1179
- document2.querySelectorAll("script, style, noscript, svg, iframe").forEach((element) => element.remove());
1440
+ function removeNoise(document) {
1441
+ document.querySelectorAll("script, style, noscript, svg, iframe").forEach((element) => element.remove());
1180
1442
  }
1181
1443
  function normalizedTextLength(element) {
1182
1444
  return (element?.textContent ?? "").replace(/\s+/g, " ").trim().length;
@@ -1185,12 +1447,12 @@ function htmlHasMeaningfulContent(url, html) {
1185
1447
  if (extractPreloadedMarkdownUrl(html, url) !== null) {
1186
1448
  return true;
1187
1449
  }
1188
- const { document: document2 } = parseHTML3(html);
1189
- removeNoise(document2);
1450
+ const { document } = parseHTML3(html);
1451
+ removeNoise(document);
1190
1452
  const selectors = ["#js_content", "article", "main", "section", "div", "body"];
1191
1453
  let bestLength = 0;
1192
1454
  for (const selector of selectors) {
1193
- document2.querySelectorAll(selector).forEach((element) => {
1455
+ document.querySelectorAll(selector).forEach((element) => {
1194
1456
  bestLength = Math.max(bestLength, normalizedTextLength(element));
1195
1457
  });
1196
1458
  if (bestLength >= 600 && selector !== "div") {
@@ -1285,11 +1547,11 @@ async function fetchBrowserHtmlWithBrowserState(url, config) {
1285
1547
  }
1286
1548
 
1287
1549
  // src/fetch/static.ts
1288
- async function fetchStaticHtml(url, timeoutMs = 6e4) {
1550
+ async function fetchStaticHtml(url, timeoutMs = 6e4, fetchImpl = fetch) {
1289
1551
  const controller = new AbortController();
1290
1552
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
1291
1553
  try {
1292
- const response = await fetch(url, {
1554
+ const response = await fetchImpl(url, {
1293
1555
  redirect: "follow",
1294
1556
  signal: controller.signal,
1295
1557
  headers: {
@@ -1318,7 +1580,7 @@ async function writeOutputIfRequested(outputPath, html) {
1318
1580
  }
1319
1581
  async function fetchHtmlResult(url, options = {}) {
1320
1582
  const isMeaningful = options.isMeaningful ?? htmlHasMeaningfulContent;
1321
- const staticFetch = options.staticFetch ?? (async (targetUrl) => (await fetchStaticHtml(targetUrl)).html);
1583
+ const staticFetch = options.staticFetch ?? (async (targetUrl) => (await fetchStaticHtml(targetUrl, void 0, options.useProxyEnv ? proxyAwareFetch : void 0)).html);
1322
1584
  const browserFetch = options.browserFetch ?? ((targetUrl) => fetchBrowserHtml(targetUrl, {
1323
1585
  waitMs: options.waitMs,
1324
1586
  networkIdle: options.networkIdle,
@@ -1486,9 +1748,9 @@ ${code}
1486
1748
  \`\`\``).replace(/\[\s*\]\((?:#|javascript:void\(0\)|javascript:;)\)/gi, "").replace(/(^|[^\\])\$(?=\d)/g, "$1\\$").replace(/\n\s*\n\s*([-*+]\s)/g, "\n$1").replace(/[ \t]+\n/g, "\n").replace(/\n{3,}/g, "\n\n").trim();
1487
1749
  }
1488
1750
  function htmlFragmentText(fragment) {
1489
- const { document: document2 } = parseHTML4(`<!doctype html><html><body>${fragment}</body></html>`);
1490
- document2.querySelectorAll("br").forEach((br) => br.replaceWith(document2.createTextNode("\n")));
1491
- return document2.body.textContent ?? "";
1751
+ const { document } = parseHTML4(`<!doctype html><html><body>${fragment}</body></html>`);
1752
+ document.querySelectorAll("br").forEach((br) => br.replaceWith(document.createTextNode("\n")));
1753
+ return document.body.textContent ?? "";
1492
1754
  }
1493
1755
  function fencedCodeHtml(text) {
1494
1756
  const escaped = text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
@@ -1584,10 +1846,45 @@ function resolveCreatedValue(item, published) {
1584
1846
  if (item.publishedAt) return createdFromItemDate(item.publishedAt);
1585
1847
  return (/* @__PURE__ */ new Date()).toISOString().replace(/\.\d{3}Z$/, "Z");
1586
1848
  }
1849
+ function mergeProfileFetchOptions(options, profiles) {
1850
+ const merged = { ...options };
1851
+ for (const profile of profiles) {
1852
+ if (profile.fetch?.mode) merged.fetchMode = profile.fetch.mode;
1853
+ if (profile.fetch?.waitMs !== void 0) merged.waitMs = profile.fetch.waitMs;
1854
+ if (profile.fetch?.networkIdle !== void 0) merged.networkIdle = profile.fetch.networkIdle;
1855
+ if (profile.fetch?.waitSelector) merged.waitSelector = profile.fetch.waitSelector;
1856
+ if (profile.fetch?.waitSelectorState) merged.waitSelectorState = profile.fetch.waitSelectorState;
1857
+ if (profile.fetch?.clickSelectors) merged.clickSelectors = profile.fetch.clickSelectors;
1858
+ if (profile.fetch?.scrollToBottom !== void 0) merged.scrollToBottom = profile.fetch.scrollToBottom;
1859
+ if (profile.fetch?.useProxyEnv !== void 0) merged.useProxyEnv = profile.fetch.useProxyEnv;
1860
+ if (profile.fetch?.preferBrowserState && options.browserStateDefaults) {
1861
+ merged.browserState = {
1862
+ ...options.browserStateDefaults,
1863
+ waitMs: merged.waitMs,
1864
+ networkIdle: merged.networkIdle,
1865
+ proxy: merged.proxy,
1866
+ dnsOverHttps: merged.dnsOverHttps,
1867
+ waitSelector: merged.waitSelector,
1868
+ waitSelectorState: merged.waitSelectorState,
1869
+ clickSelectors: merged.clickSelectors,
1870
+ scrollToBottom: merged.scrollToBottom,
1871
+ headless: merged.headless,
1872
+ realChromeDefaults: merged.realChromeDefaults
1873
+ };
1874
+ }
1875
+ }
1876
+ return merged;
1877
+ }
1587
1878
  async function processItem(item, options) {
1588
- const html = await fetchHtml(item.url, options);
1879
+ const urlProfiles = selectActiveProfiles(options.profiles, item.url, "");
1880
+ const fetchOptions = mergeProfileFetchOptions(options, urlProfiles);
1881
+ const html = await fetchHtml(item.url, fetchOptions);
1589
1882
  const activeProfiles = selectActiveProfiles(options.profiles, item.url, html);
1590
- const cleaned = await cleanHtml(html, { baseUrl: item.url, profiles: options.profiles, activeProfiles });
1883
+ const defuddleFetch = activeProfiles.some((profile) => profile.fetch?.useProxyEnv) ? proxyAwareFetch : void 0;
1884
+ const cleaned = await cleanHtml(html, { baseUrl: item.url, profiles: options.profiles, activeProfiles, defuddleFetch });
1885
+ if (activeProfiles.some((profile) => profile.extraction?.requireText) && !cleaned.content.replace(/<[^>]*>/g, "").trim()) {
1886
+ throw new Error("matched site rule requires extracted text, but no text content was extracted");
1887
+ }
1591
1888
  const title = cleaned.metadata.title || item.sourceTitle || titleFromUrl(item.url);
1592
1889
  await cleanupExistingNote(options.outputDir, item.url);
1593
1890
  const contentHtml = options.localizeAssets === false ? cleaned.content : await localizeImages(cleaned.content, {
@@ -1659,7 +1956,10 @@ var ProgressTracker = class {
1659
1956
  var program = new Command();
1660
1957
  async function siteRulePathsFromDir(dir) {
1661
1958
  const names = await readdir2(dir);
1662
- return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
1959
+ return names.filter((name) => name.endsWith(".toml")).sort().map((name) => join7(dir, name));
1960
+ }
1961
+ function builtinSiteRulesDir() {
1962
+ return join7(dirname(fileURLToPath(import.meta.url)), "site-rules");
1663
1963
  }
1664
1964
  function positiveIntOption(value, fallback) {
1665
1965
  const parsed = Number(value ?? fallback);
@@ -1696,7 +1996,9 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
1696
1996
  positiveIntOption(options.limit, 0)
1697
1997
  );
1698
1998
  const siteRulesDir = String(options.siteRulesDir || "");
1699
- const profiles = siteRulesDir ? await loadSiteProfiles(await siteRulePathsFromDir(resolve2(siteRulesDir))) : [];
1999
+ const builtinRulePaths = await siteRulePathsFromDir(builtinSiteRulesDir());
2000
+ const customRulePaths = siteRulesDir ? await siteRulePathsFromDir(resolve2(siteRulesDir)) : [];
2001
+ const profiles = await loadSiteProfiles([...builtinRulePaths, ...customRulePaths]);
1700
2002
  const outputDir = String(options.outputDir ?? "clippings");
1701
2003
  let failures = 0;
1702
2004
  const tracker = new ProgressTracker(selected, outputDir);
@@ -1715,6 +2017,10 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
1715
2017
  headless: !Boolean(options.headful),
1716
2018
  realChromeDefaults: options.realChromeDefaults !== false
1717
2019
  };
2020
+ const browserStateDefaults = {
2021
+ userDataDir: String(options.chromeUserDataDir || ""),
2022
+ profile: String(options.chromeProfile || "Default")
2023
+ };
1718
2024
  const sessions = options.reuseBrowser === false ? null : new BatchFetchSessions({
1719
2025
  browser: browserOptions,
1720
2026
  stealth: {
@@ -1727,15 +2033,11 @@ program.name("feedloom").description("Archive long-form web content as clean Mar
1727
2033
  for (const item of selected) {
1728
2034
  tracker.start(item.url);
1729
2035
  try {
1730
- const browserState = options.preferBrowserState ? {
1731
- userDataDir: String(options.chromeUserDataDir || ""),
1732
- profile: String(options.chromeProfile || "Default"),
1733
- ...browserOptions
1734
- } : null;
1735
2036
  const result = await processItem(item, {
1736
2037
  outputDir,
1737
2038
  profiles,
1738
- browserState,
2039
+ browserState: options.preferBrowserState ? { ...browserStateDefaults, ...browserOptions } : null,
2040
+ browserStateDefaults,
1739
2041
  fetchMode,
1740
2042
  ...browserOptions,
1741
2043
  solveCloudflare: Boolean(options.solveCloudflare),
@@ -0,0 +1,5 @@
1
+ [match]
2
+ host_suffixes = ["mp.weixin.qq.com"]
3
+
4
+ [extract]
5
+ selectors = ["#js_content"]
@@ -0,0 +1,55 @@
1
+ [match]
2
+ host_suffixes = ["xiaohongshu.com"]
3
+
4
+ [fetch]
5
+ mode = "auto"
6
+ scroll_to_bottom = true
7
+ wait_ms = 5000
8
+
9
+ [extract]
10
+ selectors = [".note-content", "#noteContainer", ".note-container", ".note-detail"]
11
+
12
+ [metadata]
13
+ strip_title_regexes = ["\\s*-\\s*小红书\\s*$"]
14
+ strip_author_regexes = ["关注$"]
15
+ author_selectors = [".author-container .username", ".author-wrapper .name", ".user-name"]
16
+ author_meta_names = ["author"]
17
+ author_meta_properties = ["article:author"]
18
+
19
+ [media]
20
+ include_meta_images = true
21
+ image_meta_properties = ["og:image"]
22
+
23
+ [clean.remove]
24
+ selectors = [
25
+ ".side-bar",
26
+ ".left-container",
27
+ ".comments-el",
28
+ ".interactions",
29
+ ".engage-bar",
30
+ ".note-detail-dropdown",
31
+ ".close-circle",
32
+ ".close-box",
33
+ ".login-container",
34
+ ".bottom-container .notedetail-menu",
35
+ ".author",
36
+ ".author-container",
37
+ ".media-container",
38
+ ".fraction",
39
+ ".arrow-controller",
40
+ ".pagination-media-container"
41
+ ]
42
+ text_contains = [
43
+ "创作中心",
44
+ "业务合作",
45
+ "沪ICP备",
46
+ "营业执照",
47
+ "违法不良信息举报电话",
48
+ "行吟信息科技",
49
+ "小红书网页版",
50
+ "登录后推荐更懂你的笔记"
51
+ ]
52
+ exact_text = ["关注", "加载中", "更多", "发现", "直播", "发布", "通知"]
53
+
54
+ [clean.truncate]
55
+ after_regexes = ["^共 \\d+ 条评论$", "^相关推荐$", "^登录后推荐更懂你的笔记$"]
@@ -0,0 +1,10 @@
1
+ [match]
2
+ host_suffixes = ["youtube.com", "youtu.be"]
3
+ url_regexes = ["https?://(www\\.)?youtube\\.com/watch\\?", "https?://youtu\\.be/"]
4
+
5
+ [fetch]
6
+ mode = "auto"
7
+ use_proxy_env = true
8
+
9
+ [extract]
10
+ require_text = true
@@ -0,0 +1,22 @@
1
+ [match]
2
+ host_suffixes = ["zhihu.com"]
3
+
4
+ [extract]
5
+ selectors = ["[class*=\"Post-RichTextContainer\"]", "[class*=\"RichText ztext\"]", "[class*=\"RichContent-inner\"]"]
6
+
7
+ [metadata]
8
+ strip_title_regexes = ["\\s*-\\s*知乎\\s*$"]
9
+
10
+ [fetch]
11
+ mode = "browser"
12
+ prefer_browser_state = true
13
+ scroll_to_bottom = true
14
+ wait_ms = 8000
15
+
16
+ [clean.remove]
17
+ class_contains = ["RichText-LinkCardContainer"]
18
+ text_regexes = ["^目录收起$", "^目录收起.*References$", "^.+\\d+ 赞同 · \\d+ 评论 文章$", "^\\d+ 赞同 · \\d+ 评论 文章$"]
19
+ exact_text = ["目录", "收起"]
20
+
21
+ [clean.truncate]
22
+ after_regexes = ["^发布于 ", "^赞同 ", "^\\d+ 条评论$", "^分享$", "^申请转载$", ".*的广告$"]
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ariesfish/feedloom",
3
- "version": "0.1.2",
3
+ "version": "0.1.3",
4
4
  "type": "module",
5
5
  "author": "ariesfish",
6
6
  "license": "MIT",
@@ -24,7 +24,7 @@
24
24
  ],
25
25
  "scripts": {
26
26
  "dev": "tsx src/cli.ts",
27
- "build": "tsup src/cli.ts --format esm --dts --clean",
27
+ "build": "tsup src/cli.ts --format esm --dts --clean && rm -rf dist/site-rules && cp -R src/site-rules dist/site-rules",
28
28
  "typecheck": "tsc --noEmit",
29
29
  "test": "vitest run",
30
30
  "prepublishOnly": "npm run typecheck && npm test && npm run build"
@@ -63,9 +63,11 @@ Use the least expensive mode that works:
63
63
 
64
64
  Run `npx -y @ariesfish/feedloom --help` for the complete option list. Do not invent unsupported options.
65
65
 
66
- ## Private site rules
66
+ ## Site rules
67
67
 
68
- Site-specific TOML rules are optional in the package, but mandatory to use when present next to this skill. Always check for `$HOME/.agents/skills/feedloom/site-rules/` before clipping. If that directory exists, pass it explicitly on every Feedloom command using the `$HOME`-prefixed path:
68
+ Feedloom ships built-in TOML site rules in the package for common sites such as WeChat and Zhihu. These are loaded automatically; do not pass a special option for built-in rules.
69
+
70
+ Private skill rules are also supported and are mandatory to use when present next to this skill. Always check for `$HOME/.agents/skills/feedloom/site-rules/` before clipping. If that directory exists, pass it explicitly on every Feedloom command using the `$HOME`-prefixed path:
69
71
 
70
72
  ```bash
71
73
  npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir $HOME/.agents/skills/feedloom/site-rules
@@ -73,6 +75,8 @@ npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir $HOME/
73
75
 
74
76
  Treat rule files in `$HOME/.agents/skills/feedloom/site-rules/` as local reference material and use them whenever available; never skip an existing site-rules directory unless the user explicitly asks not to use it.
75
77
 
78
+ For adding or editing private rules, read `references/site-rules.md`. It contains the TOML schema, examples, `[fetch]` behavior, and validation workflow.
79
+
76
80
  ## Output
77
81
 
78
82
  - Markdown files are written to `clippings/` by default, or to `--output-dir`.
@@ -0,0 +1,104 @@
1
+ # Feedloom site rules
2
+
3
+ Use TOML site rules when Feedloom needs a narrow site-specific selector, cleanup rules, metadata normalization, or a conservative fetch preference. Do not write ad-hoc scrapers.
4
+
5
+ ## Locations
6
+
7
+ Private skill rules live in:
8
+
9
+ ```text
10
+ $HOME/.agents/skills/feedloom/site-rules/<site>.toml
11
+ ```
12
+
13
+ When the private rules directory exists, pass it on every command:
14
+
15
+ ```bash
16
+ npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir $HOME/.agents/skills/feedloom/site-rules
17
+ ```
18
+
19
+ ## Add a private rule
20
+
21
+ Create or edit one TOML file per site:
22
+
23
+ ```bash
24
+ mkdir -p $HOME/.agents/skills/feedloom/site-rules
25
+ $EDITOR $HOME/.agents/skills/feedloom/site-rules/example.toml
26
+ ```
27
+
28
+ Minimal rule:
29
+
30
+ ```toml
31
+ [match]
32
+ host_suffixes = ["example.com"]
33
+
34
+ [extract]
35
+ selectors = ["article", "main"]
36
+ ```
37
+
38
+ Rule with fetch preferences:
39
+
40
+ ```toml
41
+ [match]
42
+ host_suffixes = ["zhihu.com"]
43
+
44
+ [fetch]
45
+ mode = "browser"
46
+ prefer_browser_state = true
47
+ scroll_to_bottom = true
48
+ wait_ms = 8000
49
+
50
+ [extract]
51
+ selectors = ["[class*=\"Post-RichTextContainer\"]", "[class*=\"RichText ztext\"]"]
52
+ ```
53
+
54
+ ## Schema
55
+
56
+ Supported sections:
57
+
58
+ - `[match]`: `host_suffixes`, `host_regexes`, `url_regexes`, `html_markers`.
59
+ - `[fetch]`: `mode`, `prefer_browser_state`, `wait_ms`, `network_idle`, `wait_selector`, `wait_selector_state`, `click_selectors`, `scroll_to_bottom`, `use_proxy_env`.
60
+ - `[extract]`: `selectors`, `require_text`.
61
+ - `[metadata]`: `fixed_author`, `strip_title_regexes`, `strip_author_regexes`, `author_selectors`, `author_meta_names`, `author_meta_itemprops`, `author_meta_properties`.
62
+ - `[clean.remove]`: `selectors`, `class_contains`, `id_contains`, `attr_contains`, `text_contains`, `text_regexes`, `exact_text`.
63
+ - `[clean.truncate]`: `after_contains`, `after_regexes`.
64
+
65
+ ## Fetch rules
66
+
67
+ Use `[fetch]` only when a site consistently needs browser rendering, local Chrome state, scrolling, waiting, clicking, or proxy-aware requests.
68
+
69
+ `use_proxy_env = true` tells Feedloom to use `HTTP_PROXY`, `HTTPS_PROXY`, `ALL_PROXY`, and `NO_PROXY` for static fetches and Defuddle async extractor fetches. Use this for YouTube transcript capture and similar extractor-backed pages that need the user's proxy settings.
70
+
71
+ `prefer_browser_state = true` only tells Feedloom to use copied Chrome state for matching URLs. It does not store the local Chrome path. The command still needs Chrome state parameters when login state is required:
72
+
73
+ ```bash
74
+ npx -y @ariesfish/feedloom \
75
+ --chrome-user-data-dir "$HOME/Library/Application Support/Google/Chrome" \
76
+ --chrome-profile Default \
77
+ --site-rules-dir $HOME/.agents/skills/feedloom/site-rules \
78
+ "https://zhuanlan.zhihu.com/p/..."
79
+ ```
80
+
81
+ ## Rules for writing rules
82
+
83
+ - Prefer narrow domain-specific selectors over broad selectors.
84
+ - Prefer content containers over page shells. Avoid `body` unless the HTML is already minimal.
85
+ - Use `require_text = true` when a matched extractor-backed page should fail instead of writing an empty note.
86
+ - Use cleanup only for repeated, stable noise inside otherwise correct content.
87
+ - Use truncation only for stable tail markers where everything after the marker is non-article content.
88
+ - Do not add aggressive crawling, high concurrency, repeated challenge solving, or broad stealth defaults.
89
+ - Keep private rules outside project repos unless the user is working on Feedloom itself.
90
+
91
+ ## Validation
92
+
93
+ After adding or editing a private rule, test one known URL and inspect the Markdown:
94
+
95
+ ```bash
96
+ outdir=$(mktemp -d /tmp/feedloom-rule-test-XXXXXX)
97
+ npx -y @ariesfish/feedloom \
98
+ --output-dir "$outdir" \
99
+ --site-rules-dir $HOME/.agents/skills/feedloom/site-rules \
100
+ "https://example.com/article"
101
+ find "$outdir" -maxdepth 2 -type f | sort
102
+ ```
103
+
104
+ For sites that require Chrome state, add the Chrome state options shown above.