@ariesfish/feedloom 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +282 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +1745 -0
- package/package.json +52 -0
package/dist/cli.js
ADDED
|
@@ -0,0 +1,1745 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/cli.ts
|
|
4
|
+
import { readdir as readdir2 } from "fs/promises";
|
|
5
|
+
import { dirname, join as join7, resolve as resolve2 } from "path";
|
|
6
|
+
import { fileURLToPath } from "url";
|
|
7
|
+
import { Command } from "commander";
|
|
8
|
+
|
|
9
|
+
// src/cleaning/profiles.ts
|
|
10
|
+
import { readFile } from "fs/promises";
|
|
11
|
+
import { parse } from "@iarna/toml";
|
|
12
|
+
/**
 * Gathers the substring-style removal patterns of a parsed TOML rule into
 * one flat list: class patterns first, then id patterns, then attribute
 * patterns. Missing sections contribute nothing.
 *
 * @param {object} rule - Parsed TOML rule; `clean.remove` may be absent.
 * @returns {Array} Concatenated `class_contains`, `id_contains` and `attr_contains` entries.
 */
function partialAttributePatterns(rule) {
  const removal = rule.clean?.remove;
  const byClass = removal?.class_contains ?? [];
  const byId = removal?.id_contains ?? [];
  const byAttribute = removal?.attr_contains ?? [];
  return [...byClass, ...byId, ...byAttribute];
}
|
|
19
|
+
/**
 * Converts a snake_case TOML rule into the camelCase site-profile object
 * used by the rest of the pipeline. Absent TOML sections simply yield
 * `undefined` fields (except `partialAttributePatterns`, which is always an array).
 *
 * @param {string} name - Profile name to attach.
 * @param {object} rule - Parsed TOML rule with optional `match`, `extract`, `clean`, `metadata` tables.
 * @returns {object} Profile with `name`, `match`, `content`, `removals` and `metadata`.
 */
function profileFromTomlRule(name, rule) {
  const matchRule = rule.match;
  const removeRule = rule.clean?.remove;
  const truncateRule = rule.clean?.truncate;
  const metadataRule = rule.metadata;

  const match = {
    hostSuffixes: matchRule?.host_suffixes,
    hostRegexes: matchRule?.host_regexes,
    urlRegexes: matchRule?.url_regexes,
    htmlMarkers: matchRule?.html_markers
  };
  const content = {
    selectors: rule.extract?.selectors
  };
  const removals = {
    exactSelectors: removeRule?.selectors,
    partialAttributePatterns: partialAttributePatterns(rule),
    textContains: removeRule?.text_contains,
    textRegexes: removeRule?.text_regexes,
    cutAfterContains: truncateRule?.after_contains,
    cutAfterRegexes: truncateRule?.after_regexes,
    dropExactText: removeRule?.exact_text
  };
  const metadata = {
    fixedAuthor: metadataRule?.fixed_author,
    titleSuffixPatterns: metadataRule?.strip_title_regexes
  };

  return { name, match, content, removals, metadata };
}
|
|
46
|
+
/**
 * Reads each TOML file in order and turns it into a site profile.
 * The profile name is the file's base name without a trailing `.toml`
 * (case-insensitive); if that comes out empty, the full path is used.
 *
 * @param {Iterable<string>} paths - Paths of TOML profile files.
 * @returns {Promise<object[]>} Profiles in the same order as `paths`.
 */
async function loadSiteProfiles(paths) {
  const loaded = [];
  for (const path of paths) {
    const source = await readFile(path, "utf8");
    const rule = parse(source);
    const baseName = path.split(/[\\/]/).pop();
    const profileName = baseName?.replace(/\.toml$/i, "") || path;
    loaded.push(profileFromTomlRule(profileName, rule));
  }
  return loaded;
}
|
|
56
|
+
/**
 * Decides whether a profile applies to a page.
 *
 * A profile without a `match` section, or whose match lists are all empty,
 * applies everywhere. Otherwise a single hit is enough: a host suffix, a
 * host regex, a URL regex (regexes are case-insensitive) or an HTML marker.
 * URL-based rules are only consulted when `url` is truthy.
 *
 * @param {object} profile - Site profile (see `profileFromTomlRule`).
 * @param {string} [url] - Page URL; must be parseable by `URL` when provided.
 * @param {string} html - Page HTML searched for `htmlMarkers`.
 * @returns {boolean} True when the profile should be applied.
 */
function profileMatches(profile, url, html) {
  const rules = profile.match;
  if (!rules) {
    return true;
  }
  const { hostSuffixes, hostRegexes, urlRegexes, htmlMarkers } = rules;

  if (url) {
    const host = new URL(url).hostname.toLowerCase();
    const caseInsensitive = (pattern) => new RegExp(pattern, "i");
    if (hostSuffixes?.some((suffix) => host.endsWith(suffix.toLowerCase()))) {
      return true;
    }
    if (hostRegexes?.some((pattern) => caseInsensitive(pattern).test(host))) {
      return true;
    }
    if (urlRegexes?.some((pattern) => caseInsensitive(pattern).test(url))) {
      return true;
    }
  }

  if (htmlMarkers?.some((marker) => html.includes(marker))) {
    return true;
  }

  // Nothing matched: the profile still applies when it declares no rules at all.
  const declaresAnyRule = Boolean(
    hostSuffixes?.length || hostRegexes?.length || urlRegexes?.length || htmlMarkers?.length
  );
  return !declaresAnyRule;
}
|
|
79
|
+
/**
 * Keeps only the profiles that match the given page.
 *
 * @param {object[]|null|undefined} profiles - Candidate profiles; nullish yields `[]`.
 * @param {string} [url] - Page URL forwarded to `profileMatches`.
 * @param {string} html - Page HTML forwarded to `profileMatches`.
 * @returns {object[]} Matching profiles, in their original order.
 */
function selectActiveProfiles(profiles, url, html) {
  const applies = (profile) => profileMatches(profile, url, html);
  const active = profiles?.filter(applies);
  return active ?? [];
}
|
|
82
|
+
/**
 * Finds the first content selector, scanning profiles in order, that is
 * not blank after trimming.
 *
 * @param {Iterable<object>} profiles - Profiles whose `content.selectors` are inspected.
 * @returns {string|undefined} The selector as written (untrimmed), or `undefined` if none exists.
 */
function firstContentSelector(profiles) {
  for (const profile of profiles) {
    const candidates = profile.content?.selectors;
    if (candidates === undefined || candidates === null) {
      continue;
    }
    const usable = candidates.find((candidate) => candidate.trim());
    if (usable) {
      return usable;
    }
  }
  return undefined;
}
|
|
91
|
+
|
|
92
|
+
// src/fetch/browser.ts
|
|
93
|
+
import { mkdtemp, rm } from "fs/promises";
|
|
94
|
+
import { tmpdir } from "os";
|
|
95
|
+
import { join } from "path";
|
|
96
|
+
import { chromium } from "patchright";
|
|
97
|
+
/**
 * Extra command-line switches that `launchBrowserContext` appends to the
 * browser launch arguments when `realChromeDefaults` is enabled.
 */
var SCRAPLING_DEFAULT_ARGS = [
  "--no-pings",
  "--no-first-run",
  "--disable-infobars",
  "--disable-breakpad",
  "--no-service-autorun",
  "--homepage=about:blank",
  "--password-store=basic",
  "--disable-hang-monitor",
  "--no-default-browser-check",
  "--disable-session-crashed-bubble",
  "--disable-search-engine-choice-screen"
];

/**
 * Default launcher switches that `launchBrowserContext` asks the launcher
 * to drop (via `ignoreDefaultArgs`) when `realChromeDefaults` is enabled.
 */
var SCRAPLING_HARMFUL_ARGS = [
  "--enable-automation",
  "--disable-popup-blocking",
  "--disable-component-update",
  "--disable-default-apps",
  "--disable-extensions"
];
|
|
117
|
+
/**
 * Runs the optional post-navigation interactions on a page, in this order:
 * click each configured selector, scroll to the bottom, wait for a selector.
 * Click and wait failures are deliberately ignored (best effort).
 *
 * @param {object} page - Page object exposing `locator()` and `evaluate()`.
 * @param {object} options - May contain `clickSelectors`, `scrollToBottom`,
 *   `waitSelector`, `waitSelectorState` (default "attached") and `timeoutMs` (default 90000).
 * @returns {Promise<void>}
 */
async function runPageActions(page, options) {
  const ignoreFailure = () => void 0;

  const clickTargets = options.clickSelectors ?? [];
  for (const selector of clickTargets) {
    const target = page.locator(selector).first();
    await target.click({ timeout: 5e3 }).catch(ignoreFailure);
  }

  if (options.scrollToBottom) {
    // Runs inside the page: eight scroll-to-bottom steps, 250 ms apart.
    await page.evaluate(async () => {
      const pause = (ms) => new Promise((done) => setTimeout(done, ms));
      let remaining = 8;
      while (remaining > 0) {
        window.scrollTo(0, document.body.scrollHeight);
        await pause(250);
        remaining -= 1;
      }
    });
  }

  if (options.waitSelector) {
    const awaited = page.locator(options.waitSelector).first();
    const waitOptions = {
      state: options.waitSelectorState ?? "attached",
      timeout: options.timeoutMs ?? 9e4
    };
    await awaited.waitFor(waitOptions).catch(ignoreFailure);
  }
}
|
|
137
|
+
/**
 * Launches a persistent browser context, creating a temporary profile
 * directory when the caller does not supply one.
 *
 * Fixes over the previous version:
 *  - a temporary profile directory created here is removed again if the
 *    launch itself fails (it used to be leaked);
 *  - a `null` `userDataDir` is treated like `undefined`, so the temporary
 *    directory created for it is reported as owned and can be cleaned up.
 *
 * @param {object} options - Launch options: `userDataDir`, `realChromeDefaults`,
 *   `extraArgs`, `dnsOverHttps`, `channel`, `headless` (default true), `proxy`.
 * @returns {Promise<{context: object, userDataDir: string, ownsUserDataDir: boolean}>}
 *   The context, the profile directory in use, and whether this function created it.
 * @throws Rethrows whatever `launchPersistentContext` throws.
 */
async function launchBrowserContext(options) {
  const ownsUserDataDir = options.userDataDir === void 0 || options.userDataDir === null;
  const userDataDir = options.userDataDir ?? await mkdtemp(join(tmpdir(), "feedloom-browser-"));
  const realChromeDefaults = options.realChromeDefaults ?? false;
  // With real-Chrome defaults the extra switches are merged in and de-duplicated.
  const extraArgs = realChromeDefaults
    ? [...new Set([...options.extraArgs ?? [], ...SCRAPLING_DEFAULT_ARGS])]
    : [...options.extraArgs ?? []];
  if (options.dnsOverHttps) {
    extraArgs.push("--dns-over-https-templates=https://cloudflare-dns.com/dns-query");
  }
  try {
    const context = await chromium.launchPersistentContext(userDataDir, {
      channel: options.channel,
      headless: options.headless ?? true,
      args: extraArgs,
      ignoreDefaultArgs: realChromeDefaults ? SCRAPLING_HARMFUL_ARGS : void 0,
      proxy: options.proxy ? { server: options.proxy } : void 0,
      ignoreHTTPSErrors: true,
      colorScheme: realChromeDefaults ? "dark" : void 0,
      deviceScaleFactor: realChromeDefaults ? 2 : void 0,
      locale: void 0,
      timezoneId: realChromeDefaults ? "" : void 0,
      userAgent: realChromeDefaults ? "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36" : void 0,
      viewport: { width: 1365, height: 900 },
      screen: { width: 1365, height: 900 }
    });
    return { context, userDataDir, ownsUserDataDir };
  } catch (error) {
    // Do not leave an orphaned temp profile behind; cleanup is best effort
    // so the original launch error is the one that propagates.
    if (ownsUserDataDir) {
      await rm(userDataDir, { recursive: true, force: true }).catch(() => void 0);
    }
    throw error;
  }
}
|
|
162
|
+
/**
 * Opens a new page in `context`, navigates to `url`, runs the configured
 * page actions and returns the resulting HTML. The page is always closed.
 *
 * @param {object} context - Browser context exposing `newPage()`.
 * @param {string} url - Address to load.
 * @param {object} options - `timeoutMs` (default 90000), `waitMs` (default 2500),
 *   `networkIdle` (default true), `referer`, `realChromeDefaults`, plus the
 *   options consumed by `runPageActions`.
 * @returns {Promise<string>} Serialized page content.
 */
async function fetchWithContext(context, url, options) {
  const ignoreFailure = () => void 0;
  const timeoutMs = options.timeoutMs ?? 9e4;
  const settleMs = options.waitMs ?? 2500;
  const fallbackReferer = options.realChromeDefaults ? "https://www.google.com/" : void 0;
  const page = await context.newPage();
  try {
    await page.goto(url, {
      waitUntil: "load",
      timeout: timeoutMs,
      referer: options.referer ?? fallbackReferer
    });
    await page.waitForLoadState("domcontentloaded", { timeout: timeoutMs }).catch(ignoreFailure);
    const waitForIdle = options.networkIdle ?? true;
    if (waitForIdle) {
      await page.waitForLoadState("networkidle", { timeout: timeoutMs }).catch(ignoreFailure);
    }
    await runPageActions(page, options);
    if (settleMs > 0) {
      await page.waitForTimeout(settleMs);
    }
    const html = await page.content();
    return html;
  } finally {
    await page.close().catch(ignoreFailure);
  }
}
|
|
181
|
+
/**
 * Reusable fetch session backed by one lazily launched browser context.
 * The context is launched on first use and shared by later `fetch()` calls
 * until `close()` is called.
 */
var BrowserFetchSession = class {
  options;
  context = null;
  userDataDir = "";
  ownsUserDataDir = false;

  /** @param {object} [options] - Options passed to the launcher and to every fetch. */
  constructor(options = {}) {
    this.options = options;
  }

  /** Launches the browser context unless one is already active. */
  async start() {
    if (this.context) {
      return;
    }
    const { context, userDataDir, ownsUserDataDir } = await launchBrowserContext(this.options);
    this.context = context;
    this.userDataDir = userDataDir;
    this.ownsUserDataDir = ownsUserDataDir;
  }

  /**
   * Fetches the HTML of `url`, starting the session first if needed.
   * @param {string} url
   * @returns {Promise<string>}
   */
  async fetch(url) {
    await this.start();
    const active = this.context;
    if (!active) {
      throw new Error("Browser context was not initialized");
    }
    return fetchWithContext(active, url, this.options);
  }

  /**
   * Closes the context (close errors are ignored) and removes the profile
   * directory when this session created it.
   */
  async close() {
    const active = this.context;
    await active?.close().catch(() => void 0);
    this.context = null;
    const removeProfile = this.ownsUserDataDir && this.userDataDir;
    if (removeProfile) {
      await rm(this.userDataDir, { recursive: true, force: true });
    }
  }
};
|
|
209
|
+
/**
 * One-shot convenience wrapper: creates a `BrowserFetchSession`, fetches a
 * single URL and always closes the session afterwards.
 *
 * @param {string} url - Address to load.
 * @param {object} [options] - Session options.
 * @returns {Promise<string>} Page HTML.
 */
async function fetchBrowserHtml(url, options = {}) {
  const oneShot = new BrowserFetchSession(options);
  try {
    const html = await oneShot.fetch(url);
    return html;
  } finally {
    await oneShot.close();
  }
}
|
|
217
|
+
|
|
218
|
+
// src/fetch/stealth.ts
|
|
219
|
+
import { mkdtemp as mkdtemp2, rm as rm2 } from "fs/promises";
|
|
220
|
+
import { tmpdir as tmpdir2 } from "os";
|
|
221
|
+
import { join as join2 } from "path";
|
|
222
|
+
import { chromium as chromium2 } from "patchright";
|
|
223
|
+
/** Baseline command-line switches; always the first entries produced by `stealthArgs`. */
var DEFAULT_ARGS = [
  "--no-pings",
  "--no-first-run",
  "--disable-infobars",
  "--disable-breakpad",
  "--no-service-autorun",
  "--homepage=about:blank",
  "--password-store=basic",
  "--disable-hang-monitor",
  "--no-default-browser-check",
  "--disable-session-crashed-bubble",
  "--disable-search-engine-choice-screen"
];

/** Additional switches that `stealthArgs` appends after `DEFAULT_ARGS`. */
var STEALTH_ARGS = [
  "--test-type",
  "--lang=en-US",
  "--mute-audio",
  "--disable-sync",
  "--hide-scrollbars",
  "--disable-logging",
  "--start-maximized",
  "--enable-async-dns",
  "--accept-lang=en-US",
  "--use-mock-keychain",
  "--disable-translate",
  "--disable-voice-input",
  "--window-position=0,0",
  "--disable-wake-on-wifi",
  "--ignore-gpu-blocklist",
  "--enable-tcp-fast-open",
  "--enable-web-bluetooth",
  "--disable-cloud-import",
  "--disable-print-preview",
  "--disable-dev-shm-usage",
  "--metrics-recording-only",
  "--disable-crash-reporter",
  "--disable-partial-raster",
  "--disable-gesture-typing",
  "--disable-checker-imaging",
  "--disable-prompt-on-repost",
  "--force-color-profile=srgb",
  "--font-render-hinting=none",
  "--aggressive-cache-discard",
  "--disable-cookie-encryption",
  "--disable-domain-reliability",
  "--disable-threaded-animation",
  "--disable-threaded-scrolling",
  "--enable-simple-cache-backend",
  "--disable-background-networking",
  "--enable-surface-synchronization",
  "--disable-image-animation-resync",
  "--disable-renderer-backgrounding",
  "--disable-ipc-flooding-protection",
  "--prerender-from-omnibox=disabled",
  "--safebrowsing-disable-auto-update",
  "--disable-offer-upload-credit-cards",
  "--disable-background-timer-throttling",
  "--disable-new-content-rendering-timeout",
  "--run-all-compositor-stages-before-draw",
  "--disable-client-side-phishing-detection",
  "--disable-backgrounding-occluded-windows",
  "--disable-layer-tree-host-memory-pressure",
  "--autoplay-policy=user-gesture-required",
  "--disable-offer-store-unmasked-wallet-cards",
  "--disable-blink-features=AutomationControlled",
  "--disable-component-extensions-with-background-pages",
  "--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance",
  "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
  "--disable-features=AudioServiceOutOfProcess,TranslateUI,BlinkGenPropertyTrees"
];

/** Default launcher switches that `launchStealthContext` passes as `ignoreDefaultArgs`. */
var HARMFUL_ARGS = [
  "--enable-automation",
  "--disable-popup-blocking",
  "--disable-component-update",
  "--disable-default-apps",
  "--disable-extensions"
];

/** Request resource types that `shouldBlock` aborts when `disableResources` is set. */
var EXTRA_RESOURCES = /* @__PURE__ */ new Set([
  "font",
  "image",
  "media",
  "beacon",
  "object",
  "imageset",
  "texttrack",
  "websocket",
  "csp_report",
  "stylesheet"
]);
|
|
312
|
+
/**
 * Builds the de-duplicated launch-argument list for a stealth context:
 * `DEFAULT_ARGS`, then `STEALTH_ARGS`, then caller `extraArgs`, then the
 * switches enabled by individual option flags. First occurrence wins.
 *
 * @param {object} options - `extraArgs`, `blockWebrtc`, `allowWebgl`
 *   (only an explicit `false` disables WebGL), `hideCanvas`, `dnsOverHttps`.
 * @returns {string[]} Ordered, unique argument list.
 */
function stealthArgs(options) {
  const conditional = [
    [options.blockWebrtc, ["--webrtc-ip-handling-policy=disable_non_proxied_udp", "--force-webrtc-ip-handling-policy"]],
    [options.allowWebgl === false, ["--disable-webgl", "--disable-webgl-image-chromium", "--disable-webgl2"]],
    [options.hideCanvas, ["--fingerprinting-canvas-image-data-noise"]],
    [options.dnsOverHttps, ["--dns-over-https-templates=https://cloudflare-dns.com/dns-query"]]
  ];
  // A Set keeps insertion order, so adding in sequence both orders and de-duplicates.
  const unique = new Set([...DEFAULT_ARGS, ...STEALTH_ARGS, ...options.extraArgs ?? []]);
  for (const [enabled, flags] of conditional) {
    if (!enabled) {
      continue;
    }
    for (const flag of flags) {
      unique.add(flag);
    }
  }
  return [...unique];
}
|
|
328
|
+
/**
 * Decides whether an intercepted request should be aborted: either its
 * resource type is in `EXTRA_RESOURCES` (when `disableResources` is set) or
 * its host equals / is a subdomain of one of `blockedDomains`.
 *
 * @param {object} route - Intercepted route exposing `request()`.
 * @param {object} options - `disableResources` and `blockedDomains`.
 * @returns {boolean} True when the request should be blocked.
 */
function shouldBlock(route, options) {
  const request = route.request();
  if (options.disableResources && EXTRA_RESOURCES.has(request.resourceType())) {
    return true;
  }
  const { hostname } = new URL(request.url());
  const isBlockedHost = (domain) => hostname === domain || hostname.endsWith(`.${domain}`);
  const verdict = options.blockedDomains?.some(isBlockedHost);
  return verdict ?? false;
}
|
|
334
|
+
/**
 * Classifies the Cloudflare challenge present in a page, if any.
 *
 * Checks, in order: an explicit `cType: '<type>'` marker, an embedded
 * Turnstile script URL, then the "Just a moment..." title.
 *
 * @param {string} html - Page HTML.
 * @returns {"non-interactive"|"managed"|"interactive"|"embedded"|null}
 */
function cloudflareChallengeType(html) {
  const declaredTypes = ["non-interactive", "managed", "interactive"];
  const declared = declaredTypes.find((kind) => html.includes(`cType: '${kind}'`));
  if (declared !== undefined) {
    return declared;
  }
  const hasTurnstile = /challenges\.cloudflare\.com\/turnstile\/v/i.test(html);
  if (hasTurnstile) {
    return "embedded";
  }
  return html.includes("<title>Just a moment...</title>") ? "managed" : null;
}
|
|
342
|
+
/**
 * Best-effort attempt to get past a Cloudflare challenge, with at most
 * three rounds. A "non-interactive" challenge is simply waited out; for
 * other types the challenge widget is clicked when its box can be located,
 * otherwise the round just waits. After each round the page is re-checked.
 *
 * @param {object} page - Page exposing `content`, `locator`, `mouse`,
 *   `waitForTimeout` and `waitForLoadState`.
 * @returns {Promise<void>} Resolves whether or not the challenge was cleared.
 */
async function solveCloudflare(page) {
  const ignoreFailure = () => void 0;
  const widgetSelector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div, .main-content p+div>div>div";

  let challenge = cloudflareChallengeType(await page.content());
  let roundsLeft = 3;
  while (challenge && roundsLeft > 0) {
    let widgetBox = null;
    if (challenge !== "non-interactive") {
      widgetBox = await page.locator(widgetSelector).last().boundingBox().catch(() => null);
    }
    if (widgetBox) {
      // Click at a fixed offset inside the widget's bounding box.
      await page.mouse.click(widgetBox.x + 27, widgetBox.y + 26, { delay: 150, button: "left" });
    } else {
      await page.waitForTimeout(1e3);
    }
    await page.waitForLoadState("networkidle", { timeout: 1e4 }).catch(ignoreFailure);
    challenge = cloudflareChallengeType(await page.content());
    roundsLeft -= 1;
  }
}
|
|
362
|
+
/**
 * Launches a persistent "stealth" Chromium context, creating a temporary
 * profile directory when the caller does not supply one.
 *
 * Fixes over the previous version:
 *  - a temporary profile directory created here is removed again if the
 *    launch itself fails (it used to be leaked);
 *  - a `null` `userDataDir` is treated like `undefined`, so the temporary
 *    directory created for it is reported as owned and can be cleaned up.
 *
 * @param {object} options - `userDataDir`, `headless` (default true), `proxy`,
 *   `locale`, `timezoneId`, `userAgent`, `extraHeaders`, plus the flags read by `stealthArgs`.
 * @returns {Promise<{context: object, userDataDir: string, ownsUserDataDir: boolean}>}
 *   The context, the profile directory in use, and whether this function created it.
 * @throws Rethrows whatever `launchPersistentContext` throws.
 */
async function launchStealthContext(options) {
  const ownsUserDataDir = options.userDataDir === void 0 || options.userDataDir === null;
  const userDataDir = options.userDataDir ?? await mkdtemp2(join2(tmpdir2(), "feedloom-stealth-"));
  try {
    const context = await chromium2.launchPersistentContext(userDataDir, {
      channel: "chromium",
      headless: options.headless ?? true,
      args: stealthArgs(options),
      ignoreDefaultArgs: HARMFUL_ARGS,
      proxy: options.proxy ? { server: options.proxy } : void 0,
      ignoreHTTPSErrors: true,
      colorScheme: "dark",
      deviceScaleFactor: 2,
      isMobile: false,
      hasTouch: false,
      serviceWorkers: "allow",
      screen: { width: 1920, height: 1080 },
      viewport: { width: 1920, height: 1080 },
      permissions: ["geolocation", "notifications"],
      locale: options.locale,
      timezoneId: options.timezoneId,
      userAgent: options.userAgent,
      extraHTTPHeaders: options.extraHeaders
    });
    return { context, userDataDir, ownsUserDataDir };
  } catch (error) {
    // Do not leave an orphaned temp profile behind; cleanup is best effort
    // so the original launch error is the one that propagates.
    if (ownsUserDataDir) {
      await rm2(userDataDir, { recursive: true, force: true }).catch(() => void 0);
    }
    throw error;
  }
}
|
|
387
|
+
/**
 * Opens a new page in a stealth context, navigates to `url`, optionally
 * blocks requests, handles a Cloudflare challenge, performs the configured
 * interactions and returns the page HTML. The page is always closed.
 *
 * @param {object} context - Browser context exposing `newPage()`.
 * @param {string} url - Address to load.
 * @param {object} options - `timeoutMs` (default 60000 with `solveCloudflare`,
 *   else 30000), `waitMs` (default 0), `disableResources`, `blockedDomains`,
 *   `extraHeaders.referer`, `networkIdle`, `solveCloudflare`, `clickSelectors`,
 *   `scrollToBottom`, `waitSelector`, `waitSelectorState`.
 * @returns {Promise<string>} Serialized page content.
 */
async function fetchWithStealthContext(context, url, options) {
  const ignoreFailure = () => void 0;
  const fallbackTimeout = options.solveCloudflare ? 6e4 : 3e4;
  const timeoutMs = options.timeoutMs ?? fallbackTimeout;
  const settleMs = options.waitMs ?? 0;
  const page = await context.newPage();
  try {
    page.setDefaultNavigationTimeout(timeoutMs);
    page.setDefaultTimeout(timeoutMs);

    const needsInterception = options.disableResources || options.blockedDomains?.length;
    if (needsInterception) {
      await page.route("**/*", async (route) => {
        if (shouldBlock(route, options)) {
          return route.abort();
        }
        return route.continue();
      });
    }

    await page.goto(url, {
      waitUntil: "load",
      timeout: timeoutMs,
      referer: options.extraHeaders?.referer ?? "https://www.google.com/"
    });
    await page.waitForLoadState("domcontentloaded", { timeout: timeoutMs }).catch(ignoreFailure);
    if (options.networkIdle) {
      await page.waitForLoadState("networkidle", { timeout: timeoutMs }).catch(ignoreFailure);
    }
    if (options.solveCloudflare) {
      await solveCloudflare(page);
    }

    const clickTargets = options.clickSelectors ?? [];
    for (const selector of clickTargets) {
      const target = page.locator(selector).first();
      await target.click({ timeout: 5e3 }).catch(ignoreFailure);
    }

    if (options.scrollToBottom) {
      // Runs inside the page: eight scroll-to-bottom steps, 250 ms apart.
      await page.evaluate(async () => {
        const pause = (ms) => new Promise((done) => setTimeout(done, ms));
        let remaining = 8;
        while (remaining > 0) {
          window.scrollTo(0, document.body.scrollHeight);
          await pause(250);
          remaining -= 1;
        }
      });
    }

    if (options.waitSelector) {
      const awaited = page.locator(options.waitSelector).first();
      const waitOptions = { state: options.waitSelectorState ?? "attached", timeout: timeoutMs };
      await awaited.waitFor(waitOptions).catch(ignoreFailure);
    }

    if (settleMs > 0) {
      await page.waitForTimeout(settleMs);
    }
    const html = await page.content();
    return html;
  } finally {
    await page.close().catch(ignoreFailure);
  }
}
|
|
428
|
+
/**
 * Reusable fetch session backed by one lazily launched stealth context.
 * The context is launched on first use and shared by later `fetch()` calls
 * until `close()` is called.
 */
var StealthFetchSession = class {
  options;
  context = null;
  userDataDir = "";
  ownsUserDataDir = false;

  /** @param {object} [options] - Options passed to the launcher and to every fetch. */
  constructor(options = {}) {
    this.options = options;
  }

  /** Launches the stealth context unless one is already active. */
  async start() {
    if (this.context) {
      return;
    }
    const { context, userDataDir, ownsUserDataDir } = await launchStealthContext(this.options);
    this.context = context;
    this.userDataDir = userDataDir;
    this.ownsUserDataDir = ownsUserDataDir;
  }

  /**
   * Fetches the HTML of `url`, starting the session first if needed.
   * @param {string} url
   * @returns {Promise<string>}
   */
  async fetch(url) {
    await this.start();
    const active = this.context;
    if (!active) {
      throw new Error("Stealth context was not initialized");
    }
    return fetchWithStealthContext(active, url, this.options);
  }

  /**
   * Closes the context (close errors are ignored) and removes the profile
   * directory when this session created it.
   */
  async close() {
    const active = this.context;
    await active?.close().catch(() => void 0);
    this.context = null;
    const removeProfile = this.ownsUserDataDir && this.userDataDir;
    if (removeProfile) {
      await rm2(this.userDataDir, { recursive: true, force: true });
    }
  }
};
|
|
456
|
+
/**
 * One-shot convenience wrapper: creates a `StealthFetchSession`, fetches a
 * single URL and always closes the session afterwards.
 *
 * @param {string} url - Address to load.
 * @param {object} [options] - Session options.
 * @returns {Promise<string>} Page HTML.
 */
async function fetchStealthHtml(url, options = {}) {
  const oneShot = new StealthFetchSession(options);
  try {
    const html = await oneShot.fetch(url);
    return html;
  } finally {
    await oneShot.close();
  }
}
|
|
464
|
+
|
|
465
|
+
// src/fetch/batch.ts
|
|
466
|
+
/**
 * Holds at most one browser session and one stealth session so that a batch
 * of fetches can reuse them. Each session is created on first use.
 */
var BatchFetchSessions = class {
  options;
  browserSession = null;
  stealthSession = null;

  /** @param {object} [options] - `browser` and `stealth` option bags for the two session kinds. */
  constructor(options = {}) {
    this.options = options;
  }

  /**
   * Fetches `url` through the shared browser session.
   * @param {string} url
   * @returns {Promise<string>}
   */
  async browserFetch(url) {
    if (this.browserSession === null || this.browserSession === undefined) {
      this.browserSession = new BrowserFetchSession(this.options.browser);
    }
    return this.browserSession.fetch(url);
  }

  /**
   * Fetches `url` through the shared stealth session.
   * @param {string} url
   * @returns {Promise<string>}
   */
  async stealthFetch(url) {
    if (this.stealthSession === null || this.stealthSession === undefined) {
      this.stealthSession = new StealthFetchSession(this.options.stealth);
    }
    return this.stealthSession.fetch(url);
  }

  /** Closes both sessions concurrently, then forgets them. */
  async close() {
    const closing = [this.browserSession?.close(), this.stealthSession?.close()];
    await Promise.all(closing);
    this.browserSession = null;
    this.stealthSession = null;
  }
};
|
|
490
|
+
|
|
491
|
+
// src/input/inputs.ts
|
|
492
|
+
import { readFile as readFile2, stat, writeFile } from "fs/promises";
|
|
493
|
+
import { resolve } from "path";
|
|
494
|
+
|
|
495
|
+
// src/constants.ts
|
|
496
|
+
// First http(s) URL in a string; the match stops at whitespace or at any of ) ] } > " '
// No `g` flag, so `exec`/`test` keep no state between calls.
var URL_RE = /https?:\/\/[^\s)\]}>"']+/;
|
|
497
|
+
|
|
498
|
+
// src/models.ts
|
|
499
|
+
/**
 * Creates a work item for a URL. The source kind defaults to "html-page";
 * any key in `overrides` (including `url` and `sourceKind`) wins over the defaults.
 *
 * @param {string} url - Target address.
 * @param {object} [overrides] - Extra or replacement fields.
 * @returns {object} New item object.
 */
function makeUrlItem(url, overrides = {}) {
  const defaults = { url, sourceKind: "html-page" };
  return { ...defaults, ...overrides };
}
|
|
506
|
+
|
|
507
|
+
// src/input/inputs.ts
|
|
508
|
+
/**
 * In-memory view of a text file containing Markdown-style checkbox lines
 * (`- [ ] …`). Lines can be ticked off and the file written back.
 */
var CheckboxFile = class _CheckboxFile {
  path;
  lines;
  dirty = false;

  /**
   * @param {string} path - File location used by `save()`.
   * @param {string[]} lines - File content split into lines.
   */
  constructor(path, lines) {
    this.path = path;
    this.lines = lines;
  }

  /**
   * Reads `path` as UTF-8 and splits it on LF / CRLF. One trailing empty
   * line (the product of a final newline) is dropped.
   *
   * @param {string} path
   * @returns {Promise<_CheckboxFile>}
   */
  static async load(path) {
    const content = await readFile2(path, "utf8");
    const rows = content.split(/\r?\n/);
    if (rows[rows.length - 1] === "") {
      rows.pop();
    }
    return new _CheckboxFile(path, rows);
  }

  /**
   * Ticks the first unchecked box on a line, but only when that line
   * contains `url`. Out-of-range or missing line numbers are ignored.
   *
   * @param {number|undefined} lineNo - 1-based line number.
   * @param {string} url - URL that must appear on the line.
   */
  markDone(lineNo, url) {
    const outOfRange = lineNo === void 0 || lineNo < 1 || lineNo > this.lines.length;
    if (outOfRange) {
      return;
    }
    const slot = lineNo - 1;
    const current = this.lines[slot] ?? "";
    const isOpenItem = current.includes(url) && current.includes("- [ ] ");
    if (!isOpenItem) {
      return;
    }
    this.lines[slot] = current.replace("- [ ] ", "- [x] ");
    this.dirty = true;
  }

  /** Writes the lines back (LF-joined, with a final newline) if anything changed. */
  async save() {
    if (!this.dirty) {
      return;
    }
    const content = this.lines.join("\n") + "\n";
    await writeFile(this.path, content, "utf8");
  }
};
|
|
537
|
+
/**
 * Tells whether `path` refers to an existing regular file.
 *
 * Callers probe arbitrary command-line inputs (including URLs) with this
 * function, so every "this cannot be a file" outcome of `stat` is reported
 * as `false` rather than thrown:
 *  - ENOENT: nothing exists at the path;
 *  - ENOTDIR: an intermediate path component is not a directory;
 *  - ENAMETOOLONG: a component exceeds the file-system name limit, which
 *    happens for URLs with long segments.
 * Previously only ENOENT was handled, so the other two crashed the caller.
 *
 * @param {string} path - Path to test.
 * @returns {Promise<boolean>} True only for an existing regular file.
 * @throws Rethrows any other `stat` failure (for example EACCES).
 */
async function isFile(path) {
  try {
    const info = await stat(path);
    return info.isFile();
  } catch (error) {
    const code = error?.code;
    if (code === "ENOENT" || code === "ENOTDIR" || code === "ENAMETOOLONG") {
      return false;
    }
    throw error;
  }
}
|
|
547
|
+
/**
 * Turns raw command-line inputs into a de-duplicated list of URL items.
 *
 * Each input is first tried as a file path (a leading `~` is expanded from
 * `HOME`). A file is loaded as a `CheckboxFile` and the first URL on each of
 * its lines becomes an item carrying `sourcePath` and a 1-based `lineNo`.
 * Any other input must itself contain a URL.
 *
 * @param {Iterable<string>} inputs - File paths and/or URLs.
 * @returns {Promise<{items: object[], checkboxFiles: Map<string, object>}>}
 *   Items in first-seen order and the loaded files keyed by resolved path.
 * @throws {Error} When an input is neither a file nor contains a URL, or no URL was found at all.
 */
async function parseInputs(inputs) {
  const checkboxFiles = /* @__PURE__ */ new Map();
  const items = [];
  const knownUrls = /* @__PURE__ */ new Set();

  // Records a URL the first time it is seen; later duplicates are ignored.
  const addOnce = (url, overrides) => {
    if (knownUrls.has(url)) {
      return;
    }
    knownUrls.add(url);
    items.push(makeUrlItem(url, overrides));
  };

  for (const raw of inputs) {
    const path = resolve(raw.replace(/^~(?=$|\/)/, process.env.HOME ?? "~"));
    if (await isFile(path)) {
      const checkbox = await CheckboxFile.load(path);
      checkboxFiles.set(path, checkbox);
      for (const [index, line] of checkbox.lines.entries()) {
        const found = URL_RE.exec(line);
        if (found) {
          addOnce(found[0], { sourcePath: path, lineNo: index + 1 });
        }
      }
      continue;
    }
    const direct = URL_RE.exec(raw);
    if (!direct) {
      throw new Error(`Unsupported input: ${raw}`);
    }
    addOnce(direct[0]);
  }

  if (items.length === 0) {
    throw new Error("No URLs found in input");
  }
  return { items, checkboxFiles };
}
|
|
586
|
+
/**
 * Selects a window of items using 1-based, inclusive `start`/`end` indexes
 * and an optional size cap. `end === 0` means "to the end"; `limit === 0`
 * means "no cap".
 *
 * @param {Array} items - Full list.
 * @param {number} start - 1-based first index (>= 1).
 * @param {number} end - 1-based last index, or 0 for no upper bound.
 * @param {number} limit - Maximum number of items, or 0 for unlimited.
 * @returns {Array} New array with the selected items.
 * @throws {Error} When an argument is out of range or `end` precedes `start`.
 */
function sliceItems(items, start, end, limit) {
  // Checked in order; the first failing rule determines the error message.
  const rules = [
    [start < 1, "--start must be at least 1"],
    [end < 0, "--end must be 0 or a positive 1-based index"],
    [limit < 0, "--limit must be 0 or a positive integer"],
    [end !== 0 && end < start, "--end must be greater than or equal to --start"]
  ];
  for (const [violated, message] of rules) {
    if (violated) {
      throw new Error(message);
    }
  }
  const from = Math.max(start - 1, 0);
  const to = end || void 0;
  const windowed = items.slice(from, to);
  if (limit > 0) {
    return windowed.slice(0, limit);
  }
  return windowed;
}
|
|
603
|
+
|
|
604
|
+
// src/input/sources.ts
|
|
605
|
+
import { XMLParser } from "fast-xml-parser";
|
|
606
|
+
// Matches a path segment that starts with "feed", "rss" or "atom" and is
// followed by end-of-string or one of `.`, `/`, `?`.
var FEED_HINT_RE = /(?:^|\/)(?:feed|rss|atom)(?:$|[./?])/i;

/**
 * Heuristically decides whether a URL points at an RSS/Atom feed, based on
 * its path (feed-like segment or `.xml` / `.rss` / `.atom` suffix) or a
 * query string starting with `feed=`.
 *
 * @param {string} url - Absolute URL; must be parseable by `URL`.
 * @returns {boolean} True when the URL looks like a feed.
 */
function looksLikeFeedUrl(url) {
  const { pathname, search } = new URL(url);
  const path = pathname.toLowerCase();
  if (FEED_HINT_RE.test(path)) {
    return true;
  }
  const feedSuffixes = [".xml", ".rss", ".atom"];
  if (feedSuffixes.some((suffix) => path.endsWith(suffix))) {
    return true;
  }
  const query = search.slice(1).toLowerCase();
  return query.startsWith("feed=");
}
|
|
612
|
+
/**
 * Downloads a URL as text with the global `fetch`, following redirects and
 * aborting the request once `timeoutMs` has elapsed.
 *
 * @param {string} url - Address to download.
 * @param {number} [timeoutMs=60000] - Abort deadline in milliseconds.
 * @returns {Promise<string>} Response body.
 * @throws {Error} `HTTP <status> <statusText>` for non-OK responses; fetch/abort errors propagate.
 */
async function fetchSourceText(url, timeoutMs = 6e4) {
  const aborter = new AbortController();
  const deadline = setTimeout(() => aborter.abort(), timeoutMs);
  const request = {
    redirect: "follow",
    signal: aborter.signal,
    headers: { "user-agent": "Mozilla/5.0" }
  };
  try {
    const response = await fetch(url, request);
    if (response.ok) {
      return await response.text();
    }
    throw new Error(`HTTP ${response.status} ${response.statusText}`);
  } finally {
    // Always disarm the timer so it cannot keep the process alive.
    clearTimeout(deadline);
  }
}
|
|
629
|
+
/**
 * Parses a free-form date/time string with `Date.parse`.
 *
 * The former `raw.endsWith("Z") ? raw : raw` step was a no-op (both branches
 * returned the input unchanged) and has been removed; results are identical.
 *
 * @param {string|null|undefined} value - Date text; surrounding whitespace is ignored.
 * @returns {Date|undefined} Parsed date, or `undefined` for empty or unparseable input.
 */
function parseDateTime(value) {
  const raw = value?.trim();
  if (!raw) {
    return void 0;
  }
  const timestamp = Date.parse(raw);
  return Number.isNaN(timestamp) ? void 0 : new Date(timestamp);
}
|
|
641
|
+
/**
 * Parses the `--since` option (`YYYY-MM-DD`) into a UTC-midnight `Date`.
 *
 * Besides the format check, the parsed date is formatted back and compared
 * with the input. This rejects impossible calendar dates such as
 * `2024-02-31`, which lenient engine date parsing would otherwise roll over
 * silently into the following month.
 *
 * @param {string} value - Date text; surrounding whitespace is ignored.
 * @returns {Date} Midnight UTC of the given day.
 * @throws {Error} `Invalid --since date: <value>. Use YYYY-MM-DD.` for malformed or non-existent dates.
 */
function parseSinceDate(value) {
  const raw = value.trim();
  const invalid = () => new Error(`Invalid --since date: ${value}. Use YYYY-MM-DD.`);
  if (!/^\d{4}-\d{2}-\d{2}$/.test(raw)) {
    throw invalid();
  }
  const date = new Date(`${raw}T00:00:00.000Z`);
  if (Number.isNaN(date.getTime())) {
    throw invalid();
  }
  // Round-trip check: a rolled-over date no longer formats back to the input.
  if (date.toISOString().slice(0, 10) !== raw) {
    throw invalid();
  }
  return date;
}
|
|
652
|
+
/**
 * Narrows a value to a non-array object.
 *
 * @param {*} value - Anything.
 * @returns {object|undefined} `value` itself when it is a non-null, non-array object; otherwise `undefined`.
 */
function asObject(value) {
  const isRecord = typeof value === "object" && value !== null && !Array.isArray(value);
  return isRecord ? value : undefined;
}
|
|
655
|
+
/**
 * Normalizes "zero, one or many" values into an array: nullish becomes `[]`,
 * an array is returned as-is (not copied), anything else is wrapped.
 *
 * @param {*} value - Anything.
 * @returns {Array}
 */
function asArray(value) {
  if (Array.isArray(value)) {
    return value;
  }
  const isMissing = value === undefined || value === null;
  return isMissing ? [] : [value];
}
|
|
661
|
+
/**
 * Extracts trimmed text from a parsed XML value. Primitives are stringified;
 * for an object the `#text` property (or, failing that, `text`) is used,
 * recursively. Everything else, including arrays, yields "".
 *
 * @param {*} value - Parsed XML node value.
 * @returns {string} Trimmed text, possibly empty.
 */
function textValue(value) {
  switch (typeof value) {
    case "string":
    case "number":
    case "boolean":
      return String(value).trim();
    case "object": {
      // `typeof null` is "object"; asObject filters out null and arrays.
      const record = asObject(value);
      return record ? textValue(record["#text"] ?? record.text) : "";
    }
    default:
      return "";
  }
}
|
|
675
|
+
/**
 * Returns the text of the first listed child that has non-empty text.
 *
 * @param {object} node - Parsed XML element.
 * @param {...string} names - Child names to try, in priority order.
 * @returns {string} The first non-empty text, or "" when none is found.
 */
function childText(node, ...names) {
  for (let index = 0; index < names.length; index += 1) {
    const found = textValue(node[names[index]]);
    if (found) {
      return found;
    }
  }
  return "";
}
|
|
685
|
+
/**
 * Picks the article URL from an Atom entry's `link` value(s).
 *
 * A plain-string link is returned immediately (trimmed). For element-style
 * links the first one with an `href` whose `rel` is empty or "alternate"
 * (the default when `rel` is missing) wins.
 *
 * @param {object} entry - Parsed Atom entry.
 * @returns {string} Link target, or "" when no suitable link exists.
 */
function atomEntryLink(entry) {
  for (const candidate of asArray(entry.link)) {
    if (typeof candidate === "string") {
      return candidate.trim();
    }
    const attributes = asObject(candidate);
    if (attributes === undefined) {
      continue;
    }
    const href = textValue(attributes.href ?? attributes["@_href"]);
    const relation = textValue(attributes.rel ?? attributes["@_rel"] ?? "alternate").toLowerCase();
    const isPrimary = relation === "" || relation === "alternate";
    if (href && isPrimary) {
      return href;
    }
  }
  return "";
}
|
|
702
|
+
/**
 * Reads an entry's publication time from the first non-empty of its
 * `published`, `updated`, `pubDate` or `date` children.
 *
 * @param {object} node - Parsed feed entry/item.
 * @returns {Date|undefined} Parsed date, or `undefined` when missing or unparseable.
 */
function entryPublishedAt(node) {
  const stamp = childText(node, "published", "updated", "pubDate", "date");
  return parseDateTime(stamp);
}
|
|
705
|
+
// Shared XML parser for feeds. Attributes are kept and exposed under an
// "@_" prefix (e.g. "@_href"), which is what atomEntryLink reads.
var parser = new XMLParser({
  ignoreAttributes: false,
  attributeNamePrefix: "@_",
  removeNSPrefix: true,
  trimValues: true
});
|
|
711
|
+
/**
 * Parses an RSS (`<rss><channel><item>`) or Atom (`<feed><entry>`) document
 * into URL items. Entries without a link are skipped; links are resolved
 * against `sourceUrl`. An RSS document without a channel yields `[]`.
 *
 * @param {string} xmlText - Feed XML.
 * @param {string} sourceUrl - URL the feed was fetched from.
 * @returns {object[]} Items with `inputUrl`, `discoveredFrom`, `sourceTitle`
 *   (feed title, falling back to the entry title) and `publishedAt`.
 * @throws {Error} "Unsupported feed format" when neither root element is present.
 */
function parseFeedEntries(xmlText, sourceUrl) {
  const root = parser.parse(xmlText);

  // Shared conversion for both formats; only the link lookup differs.
  const collect = (rawNodes, feedTitle, linkOf) => {
    const collected = [];
    for (const rawNode of asArray(rawNodes)) {
      const node = asObject(rawNode);
      if (node === void 0) {
        continue;
      }
      const link = linkOf(node);
      if (!link) {
        continue;
      }
      collected.push(makeUrlItem(new URL(link, sourceUrl).toString(), {
        inputUrl: sourceUrl,
        sourceKind: "html-page",
        discoveredFrom: sourceUrl,
        sourceTitle: feedTitle || childText(node, "title"),
        publishedAt: entryPublishedAt(node)
      }));
    }
    return collected;
  };

  const rss = asObject(root.rss);
  if (rss) {
    const channel = asObject(rss.channel);
    if (!channel) {
      return [];
    }
    return collect(channel.item, childText(channel, "title"), (item) => childText(item, "link"));
  }

  const feed = asObject(root.feed);
  if (feed) {
    return collect(feed.entry, childText(feed, "title"), (entry) => atomEntryLink(entry));
  }

  throw new Error("Unsupported feed format");
}
|
|
753
|
+
/**
 * Expands input items into the final list of pages to process.
 *
 * With `sourceKind` "auto", a feed-looking URL is first tried as a feed and
 * falls back to being a plain page when the feed fails or is empty. With
 * "rss-feed", a fetch/parse failure is rethrown. Produced items inherit
 * `sourcePath` and `lineNo` from their input, get `inputUrl` defaulted to
 * the input URL, are dropped when published before `since`, and are
 * de-duplicated by URL.
 *
 * @param {Iterable<object>} items - Input URL items.
 * @param {string} sourceKind - "auto", "rss-feed" or "html-page".
 * @param {Date} [since] - Lower bound for `publishedAt`; undated items are kept.
 * @param {object} [options] - `fetchSource(url)` overrides the default downloader.
 * @returns {Promise<object[]>} Expanded items in discovery order.
 */
async function expandSourceItems(items, sourceKind, since, options = {}) {
  const download = options.fetchSource ?? fetchSourceText;
  const knownUrls = /* @__PURE__ */ new Set();
  const expanded = [];

  for (const item of items) {
    let kinds;
    if (sourceKind !== "auto") {
      kinds = [sourceKind];
    } else if (looksLikeFeedUrl(item.url)) {
      kinds = ["rss-feed", "html-page"];
    } else {
      kinds = ["html-page"];
    }

    let produced;
    let lastError;
    for (const kind of kinds) {
      try {
        if (kind === "html-page") {
          produced = [item];
          break;
        }
        if (kind === "rss-feed") {
          if (sourceKind === "auto" && !looksLikeFeedUrl(item.url)) {
            continue;
          }
          const xmlText = await download(item.url);
          produced = parseFeedEntries(xmlText, item.url);
          // An empty feed does not end the search; a later kind may still apply.
          if (produced.length > 0) {
            break;
          }
        }
      } catch (error) {
        lastError = error;
      }
    }

    if (produced === void 0) {
      const feedWasRequired = sourceKind === "rss-feed" && lastError !== void 0;
      if (feedWasRequired) {
        throw lastError;
      }
      produced = [item];
    }

    for (const candidate of produced) {
      candidate.sourcePath = item.sourcePath;
      candidate.lineNo = item.lineNo;
      candidate.inputUrl = candidate.inputUrl ?? item.url;
      const tooOld = since && candidate.publishedAt && candidate.publishedAt < since;
      if (tooOld || knownUrls.has(candidate.url)) {
        continue;
      }
      knownUrls.add(candidate.url);
      expanded.push(candidate);
    }
  }
  return expanded;
}
|
|
803
|
+
|
|
804
|
+
// src/assets.ts
|
|
805
|
+
import { mkdir, writeFile as writeFile2 } from "fs/promises";
|
|
806
|
+
import { extname, join as join3 } from "path";
|
|
807
|
+
import { parseHTML } from "linkedom";
|
|
808
|
+
/**
 * Choose a file extension (including the leading dot) for a downloaded image.
 *
 * The extension in the URL path wins when it is a dot followed by 1-7
 * alphanumerics (other characters are stripped first). Otherwise the
 * content type is inspected, and ".jpg" is the final fallback.
 *
 * @param {string|null|undefined} contentType - Response `content-type`, if any.
 * @param {string} url - Absolute image URL.
 * @returns {string} Extension such as ".png".
 */
function extensionFrom(contentType, url) {
  const pathExt = extname(new URL(url).pathname).replace(/[^.a-z0-9]/gi, "");
  // A path ending in "." yields a bare "." from extname; that is not a usable extension.
  if (/^\.[a-z0-9]{1,7}$/i.test(pathExt)) {
    return pathExt;
  }
  const byContentType = [
    ["png", ".png"],
    ["webp", ".webp"],
    ["gif", ".gif"],
    ["svg", ".svg"],
    ["avif", ".avif"],
    ["bmp", ".bmp"]
  ];
  for (const [marker, ext] of byContentType) {
    if (contentType?.includes(marker)) {
      return ext;
    }
  }
  return ".jpg";
}
|
|
816
|
+
/**
 * Pick the best source URL for an `<img>` element.
 *
 * Lazy-loading attributes take precedence over `src`: `data-original`, then
 * `data-src`, then `src`. If none is set, the first URL of `data-srcset` or
 * `srcset` is used.
 *
 * @param {{getAttribute: (name: string) => string|null}} img - Image element.
 * @returns {string|null} Raw (possibly relative) URL, or null when none is found.
 */
function imageSource(img) {
  for (const attribute of ["data-original", "data-src", "src"]) {
    const value = img.getAttribute(attribute);
    if (value) {
      return value;
    }
  }
  const srcset = img.getAttribute("data-srcset") || img.getAttribute("srcset");
  if (!srcset) {
    return null;
  }
  // Each candidate is "<url> <descriptor>"; keep only the URL part.
  const urls = srcset.split(",").map((candidate) => candidate.trim().split(/\s+/)[0]);
  return urls.find(Boolean) || null;
}
|
|
823
|
+
/**
 * Download every image referenced by an HTML fragment and point the `<img>`
 * tags at the local copies.
 *
 * Files are written to `<outputDir>/assets/<noteSlug>/image-NNN<ext>` and the
 * `src` is rewritten to `assets/<encoded noteSlug>/image-NNN<ext>`. `data:` and
 * `blob:` sources, unparsable URLs, failed downloads and non-image responses are
 * left untouched. Each distinct absolute URL is downloaded once.
 *
 * @param {string} html - HTML fragment.
 * @param {{outputDir: string, noteSlug: string, baseUrl?: string, fetchImage?: Function}} options
 *   `fetchImage` defaults to the global `fetch`.
 * @returns {Promise<string>} The input unchanged when it has no images, otherwise the rewritten body HTML.
 */
async function localizeImages(html, options) {
  const { document: document2 } = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
  const images = Array.from(document2.querySelectorAll("img"));
  if (images.length === 0) {
    return html;
  }

  const fetchImage = options.fetchImage ?? fetch;
  const assetDir = join3(options.outputDir, "assets", options.noteSlug);
  const localPaths = new Map();
  let nextIndex = 1;

  for (const img of images) {
    const raw = imageSource(img);
    if (!raw || raw.startsWith("data:") || raw.startsWith("blob:")) {
      continue;
    }
    let absolute;
    try {
      absolute = new URL(raw, options.baseUrl).toString();
    } catch {
      continue;
    }

    let rel = localPaths.get(absolute);
    if (!rel) {
      let contentType;
      let bytes;
      try {
        const response = await fetchImage(absolute);
        if (!response.ok) {
          continue;
        }
        contentType = response.headers.get("content-type");
        if (contentType && !contentType.toLowerCase().startsWith("image/")) {
          continue;
        }
        bytes = new Uint8Array(await response.arrayBuffer());
      } catch {
        // Network failures are treated like a non-OK response: keep the remote
        // reference rather than failing the whole note.
        continue;
      }
      const filename = `image-${String(nextIndex).padStart(3, "0")}${extensionFrom(contentType, absolute)}`;
      nextIndex += 1;
      await mkdir(assetDir, { recursive: true });
      await writeFile2(join3(assetDir, filename), bytes);
      rel = `assets/${encodeURIComponent(options.noteSlug)}/${filename}`;
      localPaths.set(absolute, rel);
    }

    img.setAttribute("src", rel);
    // Drop placeholder alt texts ("image", "图像", "图片") that carry no information.
    const alt = img.getAttribute("alt")?.trim().toLowerCase();
    if (alt === "image" || alt === "\u56FE\u50CF" || alt === "\u56FE\u7247") {
      img.setAttribute("alt", "");
    }
    for (const attribute of ["srcset", "data-srcset", "data-original", "data-src"]) {
      img.removeAttribute(attribute);
    }
  }
  return document2.body.innerHTML;
}
|
|
866
|
+
|
|
867
|
+
// src/cleaning/clean-html.ts
|
|
868
|
+
import * as DefuddleModule from "defuddle";
|
|
869
|
+
import { parseHTML as parseHTML2 } from "linkedom";
|
|
870
|
+
|
|
871
|
+
// src/cleaning/profile-dom.ts
|
|
872
|
+
/**
 * Short single-line preview of an element's text: whitespace runs are collapsed
 * to one space, the result is trimmed and capped at 160 characters.
 *
 * @param {{textContent?: string|null}} element
 * @returns {string}
 */
function textPreview(element) {
  const text = element.textContent ?? "";
  return text.replace(/\s+/g, " ").trim().slice(0, 160);
}
|
|
875
|
+
/**
 * Append one entry describing a removed element to the removals log.
 *
 * @param {Array} removals - Log array, mutated in place.
 * @param {string} step - Cleaning step that removed the element.
 * @param {string} reason - Why it was removed (callers pass the profile name).
 * @param {object} element - Element being removed; only its text preview is stored.
 * @param {string} [selector] - Selector that matched, when there was one.
 */
function recordRemoval(removals, step, reason, element, selector) {
  const text = textPreview(element);
  removals.push({ step, reason, selector, text });
}
|
|
878
|
+
/**
 * Log an element in the removals list, then detach it from the DOM.
 * The log entry is written first so the text preview is taken before removal.
 *
 * @param {Array} removals - Log array, mutated in place.
 * @param {string} step - Cleaning step performing the removal.
 * @param {string} reason - Why it is removed (callers pass the profile name).
 * @param {{remove: Function}} element - Element to detach.
 * @param {string} [selector] - Selector that matched, when there was one.
 */
function removeElement(removals, step, reason, element, selector) {
  recordRemoval(removals, step, reason, element, selector);
  element.remove();
}
|
|
882
|
+
/**
 * Remove every element under `root` that matches one of the profiles'
 * `removals.exactSelectors`, logging each removal under the
 * "site-profile:exact-selector" step with the profile name as the reason.
 *
 * @param {object} root - Element to search within.
 * @param {Array} profiles - Profiles to apply, in order.
 * @param {Array} removals - Removals log, mutated in place.
 */
function removeByExactSelectors(root, profiles, removals) {
  for (const profile of profiles) {
    const selectors = profile.removals?.exactSelectors ?? [];
    for (const selector of selectors) {
      for (const element of root.querySelectorAll(selector)) {
        removeElement(removals, "site-profile:exact-selector", profile.name, element, selector);
      }
    }
  }
}
|
|
891
|
+
/**
 * Remove elements whose identifying attributes match a profile's
 * `removals.partialAttributePatterns`.
 *
 * Each pattern is compiled as a case-insensitive regular expression and tested
 * against the element's `id`, `class`, `data-component`, `data-test`,
 * `data-testid`, `data-qa` and `data-cy` joined by spaces. Elements inside
 * `pre`, `code`, `table` or `figure` are never touched. Removals are logged
 * under "site-profile:partial-attribute" with the profile name as the reason.
 *
 * @param {object} root - Element to search within.
 * @param {Array} profiles - Profiles to apply.
 * @param {Array} removals - Removals log, mutated in place.
 */
function removeByPartialAttributePatterns(root, profiles, removals) {
  const patterns = profiles.flatMap(
    (profile) => (profile.removals?.partialAttributePatterns ?? []).map(
      (pattern) => ({ profile: profile.name, regex: new RegExp(pattern, "i") })
    )
  );
  if (patterns.length === 0) {
    return;
  }

  const attributeNames = ["class", "data-component", "data-test", "data-testid", "data-qa", "data-cy"];
  for (const element of root.querySelectorAll("*")) {
    // The node list was captured up front; skip nodes that already left the tree
    // with a removed ancestor so they are not removed and logged a second time.
    if (!root.contains(element)) {
      continue;
    }
    if (element.closest("pre, code, table, figure")) {
      continue;
    }
    const attrs = [
      element.id,
      ...attributeNames.map((name) => element.getAttribute(name) ?? "")
    ].join(" ");
    const matched = patterns.find((pattern) => pattern.regex.test(attrs));
    if (matched) {
      removeElement(removals, "site-profile:partial-attribute", matched.profile, element);
    }
  }
}
|
|
917
|
+
/**
 * Remove every element sibling that follows `element`, logging each one under
 * the "site-profile:content-pattern" step. `element` itself is left in place.
 *
 * @param {object} element - Element whose following siblings are removed.
 * @param {Array} removals - Removals log, mutated in place.
 * @param {string} reason - Reason recorded for each removal.
 */
function removeTrailingSiblings(element, removals, reason) {
  let current = element.nextElementSibling;
  while (current) {
    // Read the successor before removal: a detached node has no siblings.
    const following = current.nextElementSibling;
    removeElement(removals, "site-profile:content-pattern", reason, current);
    current = following;
  }
}
|
|
925
|
+
/**
 * Compile the pattern strings stored under `profile.removals[key]` of every
 * profile into case-insensitive regular expressions.
 *
 * @param {Array} profiles - Profiles to read.
 * @param {string} key - Property of `removals` holding an array of pattern strings.
 * @returns {Array<{profile: string, regex: RegExp}>} One entry per pattern, in profile order.
 * @throws {SyntaxError} When a pattern is not a valid regular expression.
 */
function compileProfileRegexes(profiles, key) {
  const compiled = [];
  for (const profile of profiles) {
    for (const pattern of profile.removals?.[key] ?? []) {
      compiled.push({ profile: profile.name, regex: new RegExp(pattern, "i") });
    }
  }
  return compiled;
}
|
|
930
|
+
/**
 * Remove block elements based on their (whitespace-normalized) text.
 *
 * Candidates are `p, div, section, aside, footer, header, li, h1-h6` under
 * `root`, excluding anything inside `pre`, `code`, `table` or `figure`.
 * For each candidate, in document order:
 *  - "cut" rules (`cutAfterContains`, `cutAfterRegexes`) apply only to texts of
 *    at most 240 characters; on a match the element and all its following
 *    element siblings are removed.
 *  - otherwise the element alone is removed when its text equals a
 *    `dropExactText` value, matches a `textContains` marker (exactly, or as a
 *    substring of a text no longer than max(3 x marker length, 120)), or matches
 *    a `textRegexes` / `dropTextRegexes` pattern.
 * All removals are logged under "site-profile:content-pattern".
 *
 * @param {object} root - Element to search within.
 * @param {Array} profiles - Profiles to apply.
 * @param {Array} removals - Removals log, mutated in place.
 */
function removeByTextPatterns(root, profiles, removals) {
  const markersFor = (key) => profiles.flatMap(
    (profile) => (profile.removals?.[key] ?? []).map((marker) => ({ profile: profile.name, marker }))
  );
  const textContains = markersFor("textContains");
  const cutContains = markersFor("cutAfterContains");

  const dropExact = new Map();
  for (const profile of profiles) {
    for (const value of profile.removals?.dropExactText ?? []) {
      dropExact.set(value.trim(), profile.name);
    }
  }

  const textRegexes = compileProfileRegexes(profiles, "textRegexes");
  const dropRegexes = compileProfileRegexes(profiles, "dropTextRegexes");
  const cutRegexes = compileProfileRegexes(profiles, "cutAfterRegexes");

  const candidates = root.querySelectorAll("p, div, section, aside, footer, header, li, h1, h2, h3, h4, h5, h6");
  for (const element of candidates) {
    // The node list was captured up front; nodes already detached by an earlier
    // removal have no effect on the output and must not be logged again.
    if (!root.contains(element)) {
      continue;
    }
    if (element.closest("pre, code, table, figure")) {
      continue;
    }
    const text = (element.textContent ?? "").replace(/\s+/g, " ").trim();
    if (!text) {
      continue;
    }

    // Cut markers are only honoured on short blocks so a long paragraph that
    // merely mentions a marker does not truncate the article.
    const cut = text.length <= 240
      ? cutContains.find((entry) => text.includes(entry.marker)) ?? cutRegexes.find((entry) => entry.regex.test(text))
      : void 0;
    if (cut) {
      removeTrailingSiblings(element, removals, cut.profile);
      removeElement(removals, "site-profile:content-pattern", cut.profile, element);
      continue;
    }

    const exactProfile = dropExact.get(text);
    const matched = (exactProfile ? { profile: exactProfile } : void 0)
      ?? textContains.find(
        (entry) => text === entry.marker
          || text.length <= Math.max(entry.marker.length * 3, 120) && text.includes(entry.marker)
      )
      ?? textRegexes.find((entry) => entry.regex.test(text))
      ?? dropRegexes.find((entry) => entry.regex.test(text));
    if (matched) {
      removeElement(removals, "site-profile:content-pattern", matched.profile, element);
    }
  }
}
|
|
970
|
+
/**
 * Overwrite `metadata.author` with a profile's `metadata.fixedAuthor`.
 * When several profiles define one, the last profile in the list wins.
 *
 * @param {object} metadata - Metadata object, mutated in place.
 * @param {Array} profiles - Active profiles.
 */
function applyFixedAuthor(metadata, profiles) {
  for (const profile of profiles) {
    const fixedAuthor = profile.metadata?.fixedAuthor;
    if (fixedAuthor) {
      metadata.author = fixedAuthor;
    }
  }
}
|
|
977
|
+
/**
 * Strip site-specific suffixes from `metadata.title`.
 *
 * Every `metadata.titleSuffixPatterns` entry of every profile is applied as a
 * case-insensitive regular expression (first match only) and the title is
 * trimmed after each replacement. An empty or missing title is left alone.
 *
 * @param {object} metadata - Metadata object, mutated in place.
 * @param {Array} profiles - Active profiles.
 */
function cleanupTitle(metadata, profiles) {
  if (!metadata.title) {
    return;
  }
  const patterns = profiles.flatMap((profile) => profile.metadata?.titleSuffixPatterns ?? []);
  metadata.title = patterns.reduce(
    (title, pattern) => title.replace(new RegExp(pattern, "i"), "").trim(),
    metadata.title
  );
}
|
|
989
|
+
/**
 * Run all DOM-level profile removals on `root`, in a fixed order:
 * exact selectors, then partial attribute patterns, then text patterns.
 *
 * @param {object} root - Element whose subtree is cleaned in place.
 * @param {Array} profiles - Profiles to apply.
 * @param {Array} removals - Removals log, mutated in place.
 */
function applySiteProfiles(root, profiles, removals) {
  const steps = [removeByExactSelectors, removeByPartialAttributePatterns, removeByTextPatterns];
  for (const step of steps) {
    step(root, profiles, removals);
  }
}
|
|
994
|
+
/**
 * Apply profile-driven metadata fixes in place: the fixed author first,
 * then title suffix cleanup.
 *
 * @param {object} metadata - Metadata object, mutated in place.
 * @param {Array} profiles - Active profiles.
 */
function applyMetadataProfiles(metadata, profiles) {
  for (const fix of [applyFixedAuthor, cleanupTitle]) {
    fix(metadata, profiles);
  }
}
|
|
998
|
+
|
|
999
|
+
// src/cleaning/clean-html.ts
|
|
1000
|
+
// Built-in profile holding generic removals: scripts/styles plus common
// share, newsletter, subscribe, related and comment widgets.
// Declared with `var` to match the rest of this bundled output.
var DEFAULT_FEEDLOOM_PROFILE = {
  name: "feedloom-default",
  removals: {
    exactSelectors: [
      "script",
      "style",
      "noscript",
      ".share-buttons",
      ".social-share",
      ".newsletter",
      ".subscribe",
      ".related",
      ".comments"
    ],
    partialAttributePatterns: ["share", "newsletter", "subscribe", "related", "comment"]
  }
};
|
|
1017
|
+
// Resolve the Defuddle constructor from either the module's default export or its named `Defuddle` export.
var DefuddleClass = DefuddleModule.default ?? DefuddleModule.Defuddle;
|
|
1018
|
+
/**
 * Return the trimmed `content` of the first `<meta>` tag matching any of the
 * given names, trying the names in order. A tag matches when its `property`,
 * `name` or `itemprop` attribute equals the name.
 *
 * @param {object} document2 - Document to query.
 * @param {string[]} names - Candidate meta names, most preferred first.
 * @returns {string|undefined} First non-empty content, or undefined.
 */
function firstMetaContent(document2, names) {
  for (const name of names) {
    // Escape both quotes and backslashes so the name stays inside the quoted
    // attribute value of the selector.
    const escaped = name.replace(/["\\]/g, "\\$&");
    const selector = ["property", "name", "itemprop"]
      .map((attribute) => `meta[${attribute}="${escaped}"]`)
      .join(", ");
    const content = document2.querySelector(selector)?.getAttribute("content")?.trim();
    if (content) {
      return content;
    }
  }
  return void 0;
}
|
|
1027
|
+
/**
 * Look up a text value in the page's JSON-LD (`<script type="application/ld+json">`).
 *
 * Scripts are scanned in document order; within each script the top-level node
 * (or each element of a top-level array), followed by any nodes listed in its
 * `@graph` array, is checked for the keys in order. A value counts when it is a
 * non-blank string, an object with a non-blank string `name`, or an array
 * containing such a value. Scripts that are empty or not valid JSON are skipped.
 *
 * @param {object} document2 - Document to query.
 * @param {string[]} keys - Property names to look for, most preferred first.
 * @returns {string|undefined} Trimmed value, or undefined when nothing matches.
 */
function jsonLdValue(document2, keys) {
  // Normalise a JSON-LD value (string, {name}, or array of those) to trimmed text.
  const textOf = (value) => {
    if (typeof value === "string") {
      return value.trim() || void 0;
    }
    if (Array.isArray(value)) {
      for (const entry of value) {
        const text = textOf(entry);
        if (text) {
          return text;
        }
      }
      return void 0;
    }
    if (value && typeof value === "object" && typeof value.name === "string") {
      return value.name.trim() || void 0;
    }
    return void 0;
  };

  for (const script of Array.from(document2.querySelectorAll('script[type="application/ld+json"]'))) {
    const source = script.textContent?.trim();
    if (!source) {
      continue;
    }
    let parsed;
    try {
      parsed = JSON.parse(source);
    } catch {
      // Malformed JSON-LD is common in the wild; ignore the script.
      continue;
    }
    const topLevel = Array.isArray(parsed) ? parsed : [parsed];
    const nodes = topLevel.flatMap(
      (node) => node && typeof node === "object" && Array.isArray(node["@graph"]) ? [node, ...node["@graph"]] : [node]
    );
    for (const node of nodes) {
      if (!node || typeof node !== "object") {
        continue;
      }
      for (const key of keys) {
        const text = textOf(node[key]);
        if (text) {
          return text;
        }
      }
    }
  }
  return void 0;
}
|
|
1049
|
+
/**
 * Build the metadata record for a page from the extractor result, falling back
 * to `<meta>` tags, the `<title>` element, the `<html lang>` attribute and
 * JSON-LD when the extractor left a field empty.
 *
 * @param {object} result - Extractor result (title, description, author, ...).
 * @param {object} document2 - Parsed source document used for the fallbacks.
 * @returns {object} Metadata; empty extractor values become `undefined` unless a fallback supplies one.
 */
function toMetadata(result, document2) {
  const meta = (...names) => firstMetaContent(document2, names);
  const pageTitle = () => document2.querySelector("title")?.textContent?.trim();

  return {
    title: result.title || meta("og:title", "twitter:title") || pageTitle() || void 0,
    description: result.description || meta("description", "og:description", "twitter:description"),
    domain: result.domain || void 0,
    favicon: result.favicon || void 0,
    image: result.image || meta("og:image", "twitter:image"),
    language: result.language || document2.documentElement.getAttribute("lang") || void 0,
    published: result.published
      || meta("article:published_time", "date", "datePublished", "pubdate", "publishdate")
      || jsonLdValue(document2, ["datePublished", "dateCreated"]),
    author: result.author
      || meta("author", "article:author", "twitter:creator")
      || jsonLdValue(document2, ["author", "creator"]),
    site: result.site || meta("og:site_name", "application-name"),
    schemaOrgData: result.schemaOrgData,
    wordCount: result.wordCount,
    parseTime: result.parseTime
  };
}
|
|
1065
|
+
/**
 * Apply the DOM-level profile removals to an extracted HTML fragment and
 * serialize it back to a string.
 *
 * The fragment is wrapped in a `<main data-feedloom-profile-root="true">`
 * element so the removals have a single root to work on.
 *
 * @param {string} content - Extracted article HTML.
 * @param {Array} profiles - Profiles to apply.
 * @param {Array} removals - Removals log, mutated in place.
 * @returns {string} Trimmed HTML followed by one newline, or "" when nothing is left.
 */
function serializeProfiledContent(content, profiles, removals) {
  const wrapped = `<!doctype html><html><body><main data-feedloom-profile-root="true">${content}</main></body></html>`;
  const { document: document2 } = parseHTML2(wrapped);
  const root = document2.querySelector('[data-feedloom-profile-root="true"]') ?? document2.body;
  applySiteProfiles(root, profiles, removals);
  const serialized = (root.innerHTML || root.outerHTML || document2.body.innerHTML).trim();
  return serialized ? `${serialized}\n` : "";
}
|
|
1073
|
+
/**
 * Extracts the main article content and metadata from raw HTML using Defuddle,
 * then applies feedloom's profile-driven clean-up on top.
 */
var HtmlCleaner = class {
  /** Options supplied at construction time (profiles, baseUrl, debug, extractor flags, ...). */
  options;

  /**
   * @param {object} [options] - Cleaner options; stored as-is.
   */
  constructor(options = {}) {
    this.options = options;
  }

  /**
   * Clean one HTML document or fragment.
   *
   * @param {string} rawHtml - Full document or bare fragment; a fragment is wrapped in a minimal document.
   * @returns {Promise<{content: string, contentMarkdown: *, metadata: object, debug: (object|undefined)}>}
   *   `debug` is only populated when `options.debug` is truthy.
   */
  async parse(rawHtml) {
    const { options } = this;
    const activeProfiles = options.activeProfiles ?? selectActiveProfiles(options.profiles, options.baseUrl, rawHtml);
    // The built-in default profile always runs first in the post-extraction pass.
    const postProfiles = [DEFAULT_FEEDLOOM_PROFILE, ...activeProfiles];
    const preferredContentSelector = options.contentSelector ?? firstContentSelector(activeProfiles);
    const removals = [];

    const html = /<html[\s>]/i.test(rawHtml) ? rawHtml : `<!doctype html><html><body>${rawHtml}</body></html>`;
    const { document: document2 } = parseHTML2(html);
    // Only hand the selector to the extractor when it actually matches something.
    const contentSelector = preferredContentSelector && document2.querySelector(preferredContentSelector)
      ? preferredContentSelector
      : void 0;

    const doc = document2;
    if (options.baseUrl) {
      doc.URL = options.baseUrl;
    }
    if (!doc.styleSheets) {
      // Provide an empty list when the parsed document has no `styleSheets`.
      doc.styleSheets = [];
    }

    const parser2 = new DefuddleClass(doc, {
      url: options.baseUrl,
      debug: options.debug,
      contentSelector,
      removeSmallImages: options.removeSmallImages,
      removeHiddenElements: options.removeHiddenElements,
      removeLowScoring: options.removeLowScoring,
      removeExactSelectors: options.removeExactSelectors,
      removePartialSelectors: options.removePartialSelectors,
      removeContentPatterns: options.removeContentPatterns,
      standardize: options.standardize
    });
    const result = parser2.parseAsync ? await parser2.parseAsync() : parser2.parse();

    const metadata = toMetadata(result, document2);
    applyMetadataProfiles(metadata, activeProfiles);
    const content = serializeProfiledContent(result.content, postProfiles, removals);

    let debug;
    if (options.debug) {
      debug = {
        contentSelector: result.debug?.contentSelector ?? contentSelector ?? preferredContentSelector,
        activeProfiles: activeProfiles.map((profile) => profile.name),
        removals: [...result.debug?.removals ?? [], ...removals]
      };
    }
    return {
      content,
      contentMarkdown: result.contentMarkdown,
      metadata,
      debug
    };
  }
};
|
|
1121
|
+
/**
 * Convenience wrapper: clean one HTML string with a throw-away `HtmlCleaner`.
 *
 * @param {string} rawHtml - Document or fragment to clean.
 * @param {object} [options] - Passed to the `HtmlCleaner` constructor.
 * @returns {Promise<object>} Result of `HtmlCleaner#parse`.
 */
async function cleanHtml(rawHtml, options = {}) {
  const cleaner = new HtmlCleaner(options);
  return cleaner.parse(rawHtml);
}
|
|
1124
|
+
|
|
1125
|
+
// src/fetch/strategy.ts
|
|
1126
|
+
import { writeFile as writeFile3 } from "fs/promises";
|
|
1127
|
+
|
|
1128
|
+
// src/extract/meaningful.ts
|
|
1129
|
+
import { parseHTML as parseHTML3 } from "linkedom";
|
|
1130
|
+
// Matches `window.preloadPage = f("<url>")` (single or double quotes); capture group 2 is the quoted URL.
var PRELOADED_MARKDOWN_RE = /window\.preloadPage\s*=\s*f\((['"])(.*?)\1\)/;
|
|
1131
|
+
/**
 * Find the URL passed to a `window.preloadPage = f("...")` bootstrap call in
 * the page source and resolve it against `baseUrl`.
 *
 * @param {string} html - Page source.
 * @param {string} baseUrl - Base for resolving a relative URL.
 * @returns {string|null} Absolute URL, or null when the call is absent, its
 *   argument is blank, or the URL cannot be parsed.
 */
function extractPreloadedMarkdownUrl(html, baseUrl) {
  const rawUrl = PRELOADED_MARKDOWN_RE.exec(html)?.[2]?.trim();
  if (!rawUrl) {
    return null;
  }
  try {
    return new URL(rawUrl, baseUrl).toString();
  } catch {
    // An unparsable URL means there is nothing usable to preload.
    return null;
  }
}
|
|
1139
|
+
/**
 * Detach every `script`, `style`, `noscript`, `svg` and `iframe` element from
 * the document so they do not contribute to text measurements.
 *
 * @param {object} document2 - Document, mutated in place.
 */
function removeNoise(document2) {
  for (const element of document2.querySelectorAll("script, style, noscript, svg, iframe")) {
    element.remove();
  }
}
|
|
1142
|
+
/**
 * Length of an element's text after collapsing whitespace runs to a single
 * space and trimming. A missing element or missing text counts as 0.
 *
 * @param {{textContent?: string|null}|null|undefined} element
 * @returns {number}
 */
function normalizedTextLength(element) {
  const text = element?.textContent ?? "";
  return text.replace(/\s+/g, " ").trim().length;
}
|
|
1145
|
+
/**
 * Decide whether fetched HTML already contains the article.
 *
 * The page counts as meaningful when it references a preloaded markdown URL,
 * or when, after removing scripts/styles/etc., any `#js_content`, `article`,
 * `main`, `section`, `div` or `body` element holds at least 600 characters of
 * normalized text.
 *
 * @param {string} url - Page URL, used to resolve the preloaded markdown URL.
 * @param {string} html - Page source.
 * @returns {boolean}
 */
function htmlHasMeaningfulContent(url, html) {
  if (extractPreloadedMarkdownUrl(html, url) !== null) {
    return true;
  }
  const minTextLength = 600;
  const { document: document2 } = parseHTML3(html);
  removeNoise(document2);
  // One combined query is enough: the answer is simply whether any candidate
  // container reaches the threshold.
  const candidates = document2.querySelectorAll("#js_content, article, main, section, div, body");
  return Array.from(candidates).some((element) => normalizedTextLength(element) >= minTextLength);
}
|
|
1163
|
+
|
|
1164
|
+
// src/fetch/browser-state.ts
|
|
1165
|
+
import { cp, mkdir as mkdir2, mkdtemp as mkdtemp3, stat as stat2, copyFile, rm as rm3 } from "fs/promises";
|
|
1166
|
+
import { tmpdir as tmpdir3 } from "os";
|
|
1167
|
+
import { basename, join as join4 } from "path";
|
|
1168
|
+
// Files at the root of the Chrome user-data directory that are copied along with a profile.
var ROOT_STATE_FILES = ["Local State", "First Run", "Last Version"];

// Entry names inside a profile that are skipped when copying (crash dumps and caches).
var IGNORED_NAMES = new Set([
  "Crashpad",
  "Code Cache",
  "GPUCache",
  "ShaderCache",
  "GrShaderCache",
  "GraphiteDawnCache"
]);
|
|
1177
|
+
/**
 * Tell whether a path should be skipped when copying browser state.
 *
 * Only the last path segment is considered. It is ignored when it is listed in
 * `IGNORED_NAMES`, starts with "Singleton", is exactly "lockfile", or ends in
 * `.lock`, `.tmp` or `.log` (case-insensitive).
 *
 * @param {string} path - File or directory path.
 * @returns {boolean}
 */
function isIgnoredBrowserStatePath(path) {
  const name = basename(path);
  return IGNORED_NAMES.has(name)
    || name.startsWith("Singleton")
    || name === "lockfile"
    || /\.(?:lock|tmp|log)$/i.test(name);
}
|
|
1190
|
+
/**
 * Copy `source` to `destination` when `source` exists and is a regular file.
 * A missing source is silently ignored; a source that is not a regular file is
 * skipped; any other error propagates.
 *
 * @param {string} source - File to copy.
 * @param {string} destination - Target path.
 * @returns {Promise<void>}
 */
async function copyFileIfPresent(source, destination) {
  try {
    const info = await stat2(source);
    if (!info.isFile()) {
      return;
    }
    await copyFile(source, destination);
  } catch (error) {
    if (error.code === "ENOENT") {
      return;
    }
    throw error;
  }
}
|
|
1202
|
+
/**
 * Copy one Chrome profile plus the root-level state files into a fresh
 * user-data directory.
 *
 * `ROOT_STATE_FILES` are copied from `sourceRoot` when present, then the
 * profile directory is copied recursively, skipping every path for which
 * `isIgnoredBrowserStatePath` returns true.
 *
 * @param {string} sourceRoot - Chrome user-data directory to copy from.
 * @param {string} destRoot - Destination directory (created if needed).
 * @param {string} profile - Profile directory name inside `sourceRoot`.
 * @returns {Promise<void>}
 * @throws {Error} When the profile does not exist or is not a directory.
 */
async function copyBrowserState(sourceRoot, destRoot, profile) {
  const profileDir = join4(sourceRoot, profile);

  let profileInfo;
  try {
    profileInfo = await stat2(profileDir);
  } catch (error) {
    if (error.code === "ENOENT") {
      throw new Error(`Chrome profile not found: ${profileDir}`, { cause: error });
    }
    throw error;
  }
  if (!profileInfo.isDirectory()) {
    throw new Error(`Chrome profile is not a directory: ${profileDir}`);
  }

  await mkdir2(destRoot, { recursive: true });
  for (const filename of ROOT_STATE_FILES) {
    await copyFileIfPresent(join4(sourceRoot, filename), join4(destRoot, filename));
  }
  await cp(profileDir, join4(destRoot, profile), {
    recursive: true,
    force: true,
    filter: (source) => !isIgnoredBrowserStatePath(source)
  });
}
|
|
1223
|
+
/**
 * Fetch a page through Chrome using a temporary copy of an existing profile.
 *
 * The profile is copied into a fresh temp directory, the browser fetch runs
 * against that copy, and the copy is always deleted afterwards, whether the
 * fetch succeeded or failed.
 *
 * @param {string} url - Page to fetch.
 * @param {object} config - `userDataDir` and `profile` select the profile to copy;
 *   `headless`, `waitMs`, `networkIdle` and `realChromeDefaults` default to
 *   true, 2500, true and true; the remaining fields are passed through.
 * @returns {Promise<*>} Whatever `fetchBrowserHtml` resolves to.
 */
async function fetchBrowserHtmlWithBrowserState(url, config) {
  const stateCopy = await mkdtemp3(join4(tmpdir3(), "feedloom-browser-state-"));
  try {
    await copyBrowserState(config.userDataDir, stateCopy, config.profile);
    const browserOptions = {
      userDataDir: stateCopy,
      channel: "chrome",
      headless: config.headless ?? true,
      timeoutMs: 9e4,
      waitMs: config.waitMs ?? 2500,
      networkIdle: config.networkIdle ?? true,
      extraArgs: [`--profile-directory=${config.profile}`],
      proxy: config.proxy,
      dnsOverHttps: config.dnsOverHttps,
      waitSelector: config.waitSelector,
      waitSelectorState: config.waitSelectorState,
      clickSelectors: config.clickSelectors,
      scrollToBottom: config.scrollToBottom,
      realChromeDefaults: config.realChromeDefaults ?? true
    };
    // `return await` keeps the temp copy alive until the fetch has settled.
    return await fetchBrowserHtml(url, browserOptions);
  } finally {
    await rm3(stateCopy, { recursive: true, force: true });
  }
}
|
|
1247
|
+
|
|
1248
|
+
// src/fetch/static.ts
|
|
1249
|
+
/**
 * Fetch a page with a plain HTTP GET, following redirects, using a desktop
 * Chrome user agent.
 *
 * @param {string} url - Page to fetch.
 * @param {number} [timeoutMs=60000] - Abort the request (including reading the body) after this long.
 * @returns {Promise<{url: string, html: string, contentType: string}>} `url` is the
 *   final response URL when available; `contentType` is "" when the header is missing.
 * @throws {Error} `HTTP <status> <statusText>` on a non-OK response; the abort
 *   error from `fetch` on timeout.
 */
async function fetchStaticHtml(url, timeoutMs = 6e4) {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  const headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
    accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  };
  try {
    const response = await fetch(url, { redirect: "follow", signal: controller.signal, headers });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} ${response.statusText}`);
    }
    const html = await response.text();
    return {
      url: response.url || url,
      html,
      contentType: response.headers.get("content-type") ?? ""
    };
  } finally {
    // Always clear the timer so it cannot keep the process alive.
    clearTimeout(timeout);
  }
}
|
|
1273
|
+
|
|
1274
|
+
// src/fetch/strategy.ts
|
|
1275
|
+
/**
 * Write the fetched HTML to `outputPath` as UTF-8 when a path was given;
 * otherwise do nothing.
 *
 * @param {string|undefined} outputPath - Destination file, or a falsy value to skip.
 * @param {string} html - Content to write.
 * @returns {Promise<void>}
 */
async function writeOutputIfRequested(outputPath, html) {
  if (!outputPath) {
    return;
  }
  await writeFile3(outputPath, html, "utf8");
}
|
|
1280
|
+
/**
 * Fetch a page, escalating through fetch strategies until one returns HTML
 * that contains the article.
 *
 * Strategies are tried in this order, filtered by `options.fetchMode`
 * ("auto" enables all of them): static HTTP, browser with copied profile state
 * (only when `options.browserState` is set), plain browser, stealth browser.
 * After every successful fetch the HTML is written to `options.outputPath`
 * (if given) before it is checked, so the file holds the most recent attempt.
 *
 * @param {string} url - Page to fetch.
 * @param {object} [options] - `fetchMode`, browser tuning flags, `outputPath`,
 *   and optional overrides `isMeaningful`, `staticFetch`, `browserFetch`,
 *   `stealthFetch`, `browserStateFetch`.
 * @returns {Promise<{url: string, finalUrl: string, html: string, mode: string, diagnostics: string[]}>}
 *   `diagnostics` lists why earlier attempts were rejected.
 * @throws {Error} When `fetchMode` enables no strategy, or when every attempt
 *   failed (the message joins the per-attempt diagnostics with "; ").
 */
async function fetchHtmlResult(url, options = {}) {
  const isMeaningful = options.isMeaningful ?? htmlHasMeaningfulContent;
  const staticFetch = options.staticFetch ?? (async (targetUrl) => (await fetchStaticHtml(targetUrl)).html);
  const browserFetch = options.browserFetch ?? ((targetUrl) => fetchBrowserHtml(targetUrl, {
    waitMs: options.waitMs,
    networkIdle: options.networkIdle,
    proxy: options.proxy,
    dnsOverHttps: options.dnsOverHttps,
    waitSelector: options.waitSelector,
    waitSelectorState: options.waitSelectorState,
    clickSelectors: options.clickSelectors,
    scrollToBottom: options.scrollToBottom,
    headless: options.headless,
    realChromeDefaults: options.realChromeDefaults
  }));
  const stealthFetch = options.stealthFetch ?? ((targetUrl) => fetchStealthHtml(targetUrl, {
    waitMs: options.waitMs,
    networkIdle: options.networkIdle,
    solveCloudflare: options.solveCloudflare,
    disableResources: options.disableResources,
    proxy: options.proxy,
    dnsOverHttps: options.dnsOverHttps,
    waitSelector: options.waitSelector,
    waitSelectorState: options.waitSelectorState,
    clickSelectors: options.clickSelectors,
    scrollToBottom: options.scrollToBottom
  }));
  const browserStateFetch = options.browserStateFetch ?? fetchBrowserHtmlWithBrowserState;

  const mode = options.fetchMode ?? "auto";
  const enabled = (kind) => mode === "auto" || mode === kind;

  const attempts = [];
  if (enabled("static")) {
    attempts.push({ label: "static", fetch: () => staticFetch(url) });
  }
  if (enabled("browser")) {
    if (options.browserState) {
      attempts.push({ label: "browser-state", fetch: () => browserStateFetch(url, options.browserState) });
    }
    attempts.push({ label: "browser", fetch: () => browserFetch(url) });
  }
  if (enabled("stealth")) {
    attempts.push({ label: "stealth", fetch: () => stealthFetch(url) });
  }
  if (attempts.length === 0) {
    // Without this guard an unknown mode would surface as an Error with an empty message.
    throw new Error(`Unsupported fetch mode: ${mode}`);
  }

  const errors = [];
  for (const attempt of attempts) {
    try {
      const html = await attempt.fetch();
      await writeOutputIfRequested(options.outputPath, html);
      if (isMeaningful(url, html)) {
        return { url, finalUrl: url, html, mode: attempt.label, diagnostics: errors };
      }
      errors.push(`${attempt.label} missing article content`);
    } catch (error) {
      // `error` may be any thrown value, including null/undefined.
      errors.push(`${attempt.label} failed: ${error?.message || String(error)}`);
    }
  }
  throw new Error(errors.join("; "));
}
|
|
1349
|
+
/**
 * Fetch a page and return only its HTML; see `fetchHtmlResult` for the options.
 *
 * @param {string} url - Page to fetch.
 * @param {object} [options] - Passed through to `fetchHtmlResult`.
 * @returns {Promise<string>} The HTML of the first attempt accepted by `fetchHtmlResult`.
 */
async function fetchHtml(url, options = {}) {
  const { html } = await fetchHtmlResult(url, options);
  return html;
}
|
|
1352
|
+
|
|
1353
|
+
// src/output.ts
|
|
1354
|
+
import { createHash } from "crypto";
|
|
1355
|
+
import { mkdir as mkdir3, readdir, readFile as readFile3, rm as rm4, stat as stat3, writeFile as writeFile4 } from "fs/promises";
|
|
1356
|
+
import { basename as basename2, join as join5 } from "path";
|
|
1357
|
+
// Line-break characters (LF and CR) that must not appear inside a single-line frontmatter value.
var FRONTMATTER_ESCAPE_RE = /[\n\r]/g;
|
|
1358
|
+
/**
 * Turn a note title into a safe file name (without extension).
 *
 * Characters that are invalid in file names on common platforms
 * (`\ / : * ? " < > |` and ASCII control characters) become spaces, whitespace
 * is collapsed, leading/trailing dots and spaces are removed, and the result is
 * capped at 180 characters. An empty result becomes "Untitled".
 *
 * @param {string} title - Note title.
 * @returns {string} Non-empty file name stem.
 */
function sanitizeFilename(title) {
  const cleaned = title
    .replace(/[\\/:*?"<>|\u0000-\u001f]+/g, " ")
    .replace(/\s+/g, " ")
    .replace(/^[.\s]+|[.\s]+$/g, "")
    .slice(0, 180)
    // Truncation can expose a new trailing space or dot; strip it again.
    .replace(/[.\s]+$/, "");
  return cleaned || "Untitled";
}
|
|
1361
|
+
/**
 * Encode a string as a double-quoted, single-line frontmatter scalar.
 * Line breaks are replaced with spaces, then the value is JSON-quoted.
 *
 * @param {string} value - Text to encode.
 * @returns {string} Quoted scalar, including the surrounding double quotes.
 */
function yamlString(value) {
  const singleLine = value.replace(FRONTMATTER_ESCAPE_RE, " ");
  return JSON.stringify(singleLine);
}
|
|
1364
|
+
/**
 * Render the frontmatter block of a note.
 *
 * Emits `source`, then `author` (only when `metadata.author` is set), then
 * `created`, all as quoted strings, between `---` fences and followed by one
 * blank line.
 *
 * @param {string} source - Source URL of the note.
 * @param {{author?: string}} metadata - Page metadata.
 * @param {string} created - Creation timestamp text.
 * @returns {string} Frontmatter ending in "---\n\n".
 */
function renderFrontmatter(source, metadata, created) {
  const fields = [`source: ${yamlString(source)}`];
  if (metadata.author) {
    fields.push(`author: ${yamlString(metadata.author)}`);
  }
  fields.push(`created: ${yamlString(created)}`);
  return `---\n${fields.join("\n")}\n---\n\n`;
}
|
|
1374
|
+
/**
 * Read the `source` value from a note's frontmatter.
 *
 * The note must start with a `---` line; only lines up to the closing `---`
 * are searched. The value is decoded as a JSON string when possible, otherwise
 * one surrounding quote character on each side is stripped.
 *
 * @param {string} text - Full note text.
 * @returns {string} The source value, or "" when there is no frontmatter or no `source` line.
 */
function noteSource(text) {
  const lines = text.split(/\r?\n/);
  if (lines[0]?.trim() !== "---") {
    return "";
  }
  for (const line of lines.slice(1)) {
    if (line.trim() === "---") {
      break;
    }
    const match = line.match(/^\s*source:\s*(.*)\s*$/);
    if (!match) {
      continue;
    }
    const raw = match[1].trim();
    try {
      const decoded = JSON.parse(raw);
      // Bare scalars such as `123` or `null` parse as non-strings; callers
      // compare the result with a URL string, so fall back to the raw text.
      if (typeof decoded === "string") {
        return decoded;
      }
    } catch {
      // Not JSON (e.g. single-quoted or unquoted): handled below.
    }
    return raw.replace(/^['"]|['"]$/g, "");
  }
  return "";
}
|
|
1390
|
+
/**
 * Delete a previously written note for `sourceUrl`, together with its asset
 * directory, so the URL can be written again.
 *
 * Scans the `.md` files directly inside `outputDir` (created if missing). A
 * file matches when its frontmatter `source` equals `sourceUrl` or its text
 * contains the line fragment `> Source: <sourceUrl>`. Only the first match is
 * removed; unreadable files are skipped.
 *
 * @param {string} outputDir - Directory holding the notes.
 * @param {string} sourceUrl - Source URL to look for.
 * @returns {Promise<void>}
 */
async function cleanupExistingNote(outputDir, sourceUrl) {
  await mkdir3(outputDir, { recursive: true });
  const entries = await readdir(outputDir, { withFileTypes: true });
  const legacyMarker = `> Source: ${sourceUrl}`;

  for (const entry of entries) {
    if (!entry.isFile() || !entry.name.endsWith(".md")) {
      continue;
    }
    const path = join5(outputDir, entry.name);
    let text;
    try {
      text = await readFile3(path, "utf8");
    } catch {
      // Best effort: a note that cannot be read cannot be matched.
      continue;
    }
    const matches = noteSource(text) === sourceUrl || text.includes(legacyMarker);
    if (!matches) {
      continue;
    }
    await rm4(path, { force: true });
    // Assets live in assets/<note file name without ".md">.
    await rm4(join5(outputDir, "assets", basename2(entry.name, ".md")), { recursive: true, force: true });
    return;
  }
}
|
|
1408
|
+
/**
 * Short identifier for a URL: the first 8 hex characters of its SHA-1 digest.
 * Used to disambiguate file names, not for security.
 *
 * @param {string} url - URL to hash.
 * @returns {string} 8 lowercase hex characters.
 */
function urlHash(url) {
  const digest = createHash("sha1").update(url).digest("hex");
  return digest.slice(0, 8);
}
|
|
1411
|
+
/**
 * Write a note as `<outputDir>/<sanitized title>.md`.
 *
 * When a file with that name already exists, the note is written to
 * `<sanitized title>-<url hash>.md` instead. The file consists of the
 * frontmatter, a `# <title>` heading, a blank line and the trimmed markdown
 * body followed by a newline.
 *
 * @param {string} outputDir - Target directory (created if missing).
 * @param {{title: string, sourceUrl: string, metadata: object, created: string, markdown: string}} note
 * @returns {Promise<string>} Path of the written file.
 */
async function writeMarkdownNote(outputDir, note) {
  await mkdir3(outputDir, { recursive: true });
  const base = sanitizeFilename(note.title);

  let path = join5(outputDir, `${base}.md`);
  let nameTaken = true;
  try {
    await stat3(path);
  } catch (error) {
    if (error.code !== "ENOENT") {
      throw error;
    }
    nameTaken = false;
  }
  if (nameTaken) {
    path = join5(outputDir, `${base}-${urlHash(note.sourceUrl)}.md`);
  }

  const frontmatter = renderFrontmatter(note.sourceUrl, note.metadata, note.created);
  const body = note.markdown.trim();
  await writeFile4(path, `${frontmatter}# ${note.title}\n\n${body}\n`, "utf8");
  return path;
}
|
|
1430
|
+
|
|
1431
|
+
// src/render/markdown.ts
|
|
1432
|
+
import { parseHTML as parseHTML4 } from "linkedom";
|
|
1433
|
+
import TurndownService from "turndown";
|
|
1434
|
+
import { gfm } from "turndown-plugin-gfm";
|
|
1435
|
+
/**
 * Rewrite markdown images written with an angle-bracketed destination and an
 * optional trailing attribute block, `![alt](<url>){...}`, into the plain
 * `![alt](url)` form.
 *
 * NOTE(review): the replacement template was blank in this copy of the file
 * (so matched images were deleted); it is reconstructed here from the unused
 * `alt`/`url` captures. Spaces in the URL are percent-encoded because the
 * plain form cannot contain them. Confirm against the original source.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Markdown with normalized image references.
 */
function normalizeImageReferences(markdown) {
  return markdown.replace(/!\[([^\]]*)\]\(<([^>]+)>\)(?:\{[^}]*\})?/g, (_match, alt, url) => {
    return `![${alt}](${url.replace(/ /g, "%20")})`;
  });
}
|
|
1440
|
+
/**
 * Post-processes converted Markdown with a fixed sequence of text rewrites.
 *
 * Steps, in order:
 *  1. normalize image references;
 *  2. blank out lines that contain only `content_copy`;
 *  3. fold a `●●●` / fenced block / `└` sequence into a single fenced block;
 *  4. drop links with empty text that point to `#` or a `javascript:` no-op;
 *  5. backslash-escape `$` when it directly precedes a digit and is not
 *     already escaped;
 *  6. remove the blank line(s) in front of a list item marker;
 *  7. strip trailing spaces/tabs, collapse 3+ newlines to 2, and trim.
 *
 * @param {string} markdown - Markdown produced by the HTML converter.
 * @returns {string} Cleaned Markdown without leading/trailing whitespace.
 */
function cleanupMarkdown(markdown) {
  const framedFence = /^●●●\n\n```\n([\s\S]*?)\n```\n\n└$/gm;
  const mergeFrameIntoFence = (_whole, code) => `\`\`\`\n\u25CF\u25CF\u25CF\n\n${code}\n\n\u2514\n\`\`\``;

  let text = normalizeImageReferences(markdown);
  text = text.replace(/^\s*content_copy\s*$/gim, "");
  text = text.replace(framedFence, mergeFrameIntoFence);
  text = text.replace(/\[\s*\]\((?:#|javascript:void\(0\)|javascript:;)\)/gi, "");
  text = text.replace(/(^|[^\\])\$(?=\d)/g, "$1\\$");
  text = text.replace(/\n\s*\n\s*([-*+]\s)/g, "\n$1");
  text = text.replace(/[ \t]+\n/g, "\n");
  text = text.replace(/\n{3,}/g, "\n\n");
  return text.trim();
}
|
|
1449
|
+
/**
 * Extracts the text of an HTML fragment, turning `<br>` elements into newlines.
 *
 * @param {string} fragment - HTML markup placed inside a temporary `<body>`.
 * @returns {string} Text content of the parsed body, or "" when it is nullish.
 */
function htmlFragmentText(fragment) {
  const page = `<!doctype html><html><body>${fragment}</body></html>`;
  const { document: doc } = parseHTML4(page);
  for (const lineBreak of doc.querySelectorAll("br")) {
    lineBreak.replaceWith(doc.createTextNode("\n"));
  }
  return doc.body.textContent ?? "";
}
|
|
1454
|
+
/**
 * Wraps plain text in `<pre><code>` so it is converted as a code block.
 *
 * The text is HTML-escaped first: `&`, `<` and `>` become entities so that the
 * code is carried as literal characters instead of being parsed as markup.
 * `&` is escaped first so the entities produced for `<` and `>` are not
 * escaped a second time.
 *
 * @param {string} text - Raw code text.
 * @returns {string} `<pre><code>…</code></pre>` markup containing the escaped text.
 */
function fencedCodeHtml(text) {
  const escaped = text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
  return `<pre><code>${escaped}</code></pre>`;
}
|
|
1458
|
+
/**
 * Applies three HTML rewrites to table markup before conversion.
 *
 *  1. A `<td>`/`<th>` whose only content is a `<section>` wrapper keeps its
 *     attributes but has the wrapper removed.
 *  2. One specific header pattern ("Input" followed by an inline-math span
 *     spelling "/Mtokens|Output" and then "/M tokens") is replaced with two
 *     header cells — presumably a fix for one particular source page.
 *  3. Empty `<th>` elements are removed.
 *
 * @param {string} html - HTML to rewrite.
 * @returns {string} Rewritten HTML.
 */
function normalizeTableCellHtml(html) {
  const sectionWrappedCell = /<(t[hd])\b([^>]*)>\s*<section\b[^>]*>([\s\S]*?)<\/section>\s*<\/t[hd]>/gi;
  const fusedInputOutputHeader = /Input(<span\b[^>]*class=["']math inline["'][^>]*>\s*\/\s*<em>M<\/em>\s*<em>t<\/em>\s*<em>o<\/em>\s*<em>k<\/em>\s*<em>e<\/em>\s*<em>n<\/em>\s*<em>s<\/em>\s*)\s*\|\s*<em>O<\/em>\s*<em>u<\/em>\s*<em>t<\/em>\s*<em>p<\/em>\s*<em>u<\/em>\s*<em>t<\/em>\s*(<\/span>\s*\/M\s*tokens)/gi;
  const emptyHeaderCell = /<th\b[^>]*>\s*<\/th>/gi;

  const unwrapped = html.replace(sectionWrappedCell, "<$1$2>$3</$1>");
  const headersSplit = unwrapped.replace(
    fusedInputOutputHeader,
    'Input\uFF08/M tokens\uFF09</th><th style="text-align: right;">Output\uFF08/M tokens\uFF09'
  );
  return headersSplit.replace(emptyHeaderCell, "");
}
|
|
1461
|
+
/**
 * Converts `<code>` elements that contain a `<br>` into `<pre><code>` blocks.
 *
 * The inner HTML of each matching element is reduced to text (with `<br>`
 * turned into newlines), one trailing newline is removed, and the result is
 * re-emitted as an escaped code block. `<code>` elements without a `<br>` are
 * left as they are.
 *
 * @param {string} html - HTML to rewrite.
 * @returns {string} Rewritten HTML.
 */
function normalizeBlockCodeHtml(html) {
  const codeWithLineBreak = /<code\b[^>]*>(((?:(?!<\/code>)[\s\S])*<br\b(?:(?!<\/code>)[\s\S])*))<\/code>/gi;
  const toCodeBlock = (_whole, codeInnerHtml) => {
    const plainText = htmlFragmentText(codeInnerHtml);
    return fencedCodeHtml(plainText.replace(/\n$/, ""));
  };
  return html.replace(codeWithLineBreak, toCodeBlock);
}
|
|
1466
|
+
/**
 * Converts cleaned HTML into Markdown.
 *
 * The HTML is first passed through the table-cell and block-code
 * normalizers, then converted with Turndown (ATX headings, fenced code
 * blocks, `-` bullets, GFM plugin) plus three custom rules, and finally run
 * through `cleanupMarkdown`.
 *
 * Custom rules:
 *  - `dropEmptyLinks`: anchors with no visible text produce nothing;
 *  - `blockCodeElement`: a `<code>` outside `<pre>` that contains a `<br>`
 *    becomes a fenced block;
 *  - `preserveCodeLanguage`: `<pre><code class="language-x">` becomes a fenced
 *    block tagged with `x`.
 *
 * @param {string} html - HTML fragment to convert.
 * @returns {string} Markdown terminated by a single newline.
 */
function htmlToMarkdown(html) {
  // Shared shape for both code rules: blank line, fence, text, fence, blank line.
  const fencedBlock = (info, text) => `\n\n\`\`\`${info}\n${text}\n\`\`\`\n\n`;
  const textWithoutFinalNewline = (node) => node.textContent?.replace(/\n$/, "") ?? "";

  const service = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "-"
  });
  service.use(gfm);

  service.addRule("dropEmptyLinks", {
    filter: (node) => node.nodeName === "A" && !node.textContent?.trim(),
    replacement: () => ""
  });

  service.addRule("blockCodeElement", {
    filter: (node) => {
      return node.nodeName === "CODE" && node.parentNode?.nodeName !== "PRE" && Boolean(node.querySelector?.("br"));
    },
    replacement: (_content, node) => fencedBlock("", textWithoutFinalNewline(node))
  });

  service.addRule("preserveCodeLanguage", {
    filter: (node) => node.nodeName === "PRE" && node.firstChild?.nodeName === "CODE",
    replacement: (_content, node) => {
      const codeElement = node.firstChild;
      const classAttribute = codeElement.getAttribute("class") ?? "";
      const language = classAttribute.match(/language-([\w-]+)/)?.[1] ?? "";
      return fencedBlock(language, textWithoutFinalNewline(codeElement));
    }
  });

  const preparedHtml = normalizeBlockCodeHtml(normalizeTableCellHtml(html));
  const converted = service.turndown(preparedHtml);
  return `${cleanupMarkdown(converted)}\n`;
}
|
|
1507
|
+
|
|
1508
|
+
// src/pipeline.ts
|
|
1509
|
+
/**
 * Builds a human-readable fallback title from a URL.
 *
 * Uses the last non-empty path segment, or the hostname when the path has no
 * segments. The value is percent-decoded, runs of `-`/`_` become single
 * spaces, and the result is trimmed. "Untitled" is returned when nothing
 * usable remains.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Title derived from the URL, never empty.
 * @throws {TypeError} When `url` cannot be parsed by `URL`.
 */
function titleFromUrl(url) {
  const parsed = new URL(url);
  const segment = parsed.pathname.split("/").filter(Boolean).pop();
  const raw = segment || parsed.hostname || "Untitled";

  // A stray "%" (e.g. "/100%") makes decodeURIComponent throw URIError; a
  // fallback title must not fail the whole item, so keep the raw text instead.
  let decoded;
  try {
    decoded = decodeURIComponent(raw);
  } catch (error) {
    if (!(error instanceof URIError)) {
      throw error;
    }
    decoded = raw;
  }
  return decoded.replace(/[-_]+/g, " ").trim() || "Untitled";
}
|
|
1514
|
+
/**
 * Removes a leading level-1 heading when it merely repeats the note title.
 *
 * The comparison ignores case and treats any run of whitespace as one space.
 * Only a `# ` heading at the very start of the text is considered; the
 * newline(s) that follow it are removed together with the heading.
 *
 * @param {string} markdown - Markdown body.
 * @param {string} title - Note title to compare against.
 * @returns {string} Markdown without the duplicate heading, or unchanged.
 */
function stripDuplicateLeadingHeading(markdown, title) {
  const canonical = (text) => text.replace(/\s+/g, " ").trim().toLowerCase();
  const wanted = canonical(title);
  const leading = /^#\s+(.+?)\s*\n+/.exec(markdown);
  if (leading === null) {
    return markdown;
  }
  const [wholeHeading, headingText] = leading;
  if (canonical(headingText) !== wanted) {
    return markdown;
  }
  // The pattern is anchored at the start, so the match is always a prefix.
  return markdown.slice(wholeHeading.length);
}
|
|
1520
|
+
/**
 * Removes a date line from the very beginning of the Markdown.
 *
 * Recognized forms (optionally prefixed with "Published "): a numeric date
 * such as `2024-01-05`, `2024/1/5`, `2024.01.05` or `2024年1月5日`, and a
 * month-name date such as `January 5, 2024`. The line must be followed by at
 * least one newline; those newlines are removed as well.
 *
 * @param {string} markdown - Markdown body.
 * @returns {string} Markdown without the leading date line, or unchanged.
 */
function stripLeadingDateLine(markdown) {
  const leadingDateLine = /^(?:Published\s+)?(?:\d{4}[-/.年]\d{1,2}[-/.月]\d{1,2}日?|[A-Z][a-z]+\s+\d{1,2},\s+\d{4})\s*\n+/i;
  const found = leadingDateLine.exec(markdown);
  if (found === null) {
    return markdown;
  }
  // Anchored pattern: the match starts at index 0, so drop it as a prefix.
  return markdown.slice(found[0].length);
}
|
|
1523
|
+
/**
 * Turns every level-1 heading (`# …`) into a level-2 heading (`## …`).
 *
 * Lines inside fenced code blocks are not modified. A fence is toggled by any
 * line whose trimmed text starts with three backticks.
 *
 * @param {string} markdown - Markdown body.
 * @returns {string} Markdown with top-level headings demoted by one level.
 */
function demoteTopLevelHeadings(markdown) {
  const result = [];
  let insideFence = false;
  for (const line of markdown.split("\n")) {
    if (line.trim().startsWith("```")) {
      insideFence = !insideFence;
      result.push(line);
      continue;
    }
    const isTopLevelHeading = !insideFence && /^#(?!#)\s+/.test(line);
    result.push(isTopLevelHeading ? `#${line}` : line);
  }
  return result.join("\n");
}
|
|
1537
|
+
/**
 * Formats an item date for the note's "created" value.
 *
 * A date that falls exactly on midnight UTC is rendered as `YYYY-MM-DD`;
 * any other date is rendered as an ISO timestamp without milliseconds.
 *
 * @param {Date} date - Date to format.
 * @returns {string} Date-only or second-precision ISO string.
 * @throws {RangeError} When `date` is invalid (raised by `toISOString`).
 */
function createdFromItemDate(date) {
  const utcTimeParts = [
    date.getUTCHours(),
    date.getUTCMinutes(),
    date.getUTCSeconds(),
    date.getUTCMilliseconds()
  ];
  const isMidnightUtc = utcTimeParts.every((part) => part === 0);
  const iso = date.toISOString();
  return isMidnightUtc ? iso.slice(0, 10) : iso.replace(/\.\d{3}Z$/, "Z");
}
|
|
1543
|
+
/**
 * Chooses the "created" value for a note.
 *
 * Precedence: the non-blank `published` string (trimmed), then the item's
 * `publishedAt` date, then the current time as a second-precision ISO string.
 *
 * @param {{ publishedAt?: Date }} item - Source item.
 * @param {string | null | undefined} published - Published value from page metadata.
 * @returns {string} Value to store as the note's creation date.
 */
function resolveCreatedValue(item, published) {
  const fromMetadata = published?.trim();
  if (fromMetadata) {
    return fromMetadata;
  }
  if (!item.publishedAt) {
    const now = /* @__PURE__ */ new Date();
    return now.toISOString().replace(/\.\d{3}Z$/, "Z");
  }
  return createdFromItemDate(item.publishedAt);
}
|
|
1548
|
+
/**
 * Archives a single item: fetch, clean, convert to Markdown and write the note.
 *
 * Order of operations: fetch the HTML, select the active site profiles, clean
 * the HTML, pick a title (page metadata, then the item's source title, then a
 * title derived from the URL), remove any existing note for the same URL,
 * localize images unless `options.localizeAssets === false`, convert to
 * Markdown, tidy the Markdown, and write the note.
 *
 * @param {object} item - Item to process; `url`, `sourceTitle` and
 *   `publishedAt` are read here.
 * @param {object} options - Processing options; also forwarded to `fetchHtml`.
 * @returns {Promise<{ item: object, outputPath: string, title: string }>}
 */
async function processItem(item, options) {
  const { url } = item;
  const { outputDir, profiles } = options;

  const html = await fetchHtml(url, options);
  const activeProfiles = selectActiveProfiles(profiles, url, html);
  const cleaned = await cleanHtml(html, { baseUrl: url, profiles, activeProfiles });
  const title = cleaned.metadata.title || item.sourceTitle || titleFromUrl(url);

  await cleanupExistingNote(outputDir, url);

  let contentHtml = cleaned.content;
  if (options.localizeAssets !== false) {
    contentHtml = await localizeImages(cleaned.content, {
      outputDir,
      noteSlug: sanitizeFilename(title),
      baseUrl: url,
      fetchImage: options.fetchImage
    });
  }

  const converted = htmlToMarkdown(contentHtml);
  const withoutTitle = stripDuplicateLeadingHeading(converted, title);
  const withoutDate = stripLeadingDateLine(withoutTitle);
  const markdown = demoteTopLevelHeadings(withoutDate);

  const outputPath = await writeMarkdownNote(outputDir, {
    sourceUrl: url,
    title,
    metadata: cleaned.metadata,
    markdown,
    created: resolveCreatedValue(item, cleaned.metadata.published)
  });
  return { item, outputPath, title };
}
|
|
1570
|
+
|
|
1571
|
+
// src/tracking.ts
|
|
1572
|
+
import { writeFileSync } from "fs";
|
|
1573
|
+
import { tmpdir as tmpdir4 } from "os";
|
|
1574
|
+
import { join as join6 } from "path";
|
|
1575
|
+
/**
 * Tracks per-item progress of a batch run and mirrors it to a JSON file.
 *
 * A progress file is only created when the batch has more than one item; for
 * a single item the tracker keeps its state in memory and `path` stays
 * undefined. The file is rewritten synchronously after every state change.
 */
class ProgressTracker {
  /** Output directory reported as `target_dir` in the progress file. */
  targetDir;
  /** Path of the progress file, or undefined when no file is written. */
  path;
  /** One record per item: index, url, sourcePath, lineNo, status, ... */
  entries;
  /** Creation time, captured once so `created_at` stays stable across writes. */
  #createdAt = (/* @__PURE__ */ new Date()).toISOString();

  /**
   * @param {Array<{ url: string, sourcePath?: string, lineNo?: number }>} items
   *   Items of the batch, in processing order.
   * @param {string} targetDir - Output directory of the run.
   */
  constructor(items, targetDir) {
    this.targetDir = targetDir;
    this.entries = items.map((item, index) => ({
      index: index + 1,
      url: item.url,
      sourcePath: item.sourcePath,
      lineNo: item.lineNo,
      status: "pending"
    }));
    if (items.length > 1) {
      this.path = join6(tmpdir4(), `feedloom-progress-${Date.now()}-${Math.random().toString(16).slice(2)}.json`);
      this.write();
    }
  }

  /** Marks the item with this URL as in progress and clears a previous error. */
  start(url) {
    this.update(url, { status: "in_progress", error: void 0 });
  }

  /** Marks the item as done and records the path of the written note. */
  done(url, notePath) {
    this.update(url, { status: "done", notePath, error: void 0 });
  }

  /** Marks the item as failed and records the error message. */
  fail(url, error) {
    this.update(url, { status: "failed", error });
  }

  /**
   * Applies `patch` to the first entry with a matching URL, then persists.
   * An unknown URL changes nothing but still triggers a write.
   */
  update(url, patch) {
    const entry = this.entries.find((item) => item.url === url);
    if (entry) Object.assign(entry, patch);
    this.write();
  }

  /** Rewrites the progress file; does nothing when no file is in use. */
  write() {
    if (!this.path) return;
    const payload = {
      created_at: this.#createdAt,
      updated_at: (/* @__PURE__ */ new Date()).toISOString(),
      target_dir: this.targetDir,
      items: this.entries
    };
    writeFileSync(this.path, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
  }
}
|
|
1618
|
+
|
|
1619
|
+
// src/cli.ts
|
|
1620
|
+
// Root CLI command; the binding is never reassigned, so declare it with const.
const program = new Command();
|
|
1621
|
+
/**
 * Locates the bundled site-rule files.
 *
 * Probes a fixed list of candidate directories (relative to this module and
 * to the current working directory) and returns the `.toml` files of the
 * first candidate that can be listed. Candidates that do not exist, or that
 * exist but are not directories, are skipped.
 *
 * @returns {Promise<string[]>} Paths of the `.toml` rule files, or an empty
 *   array when no candidate directory is available.
 * @throws Re-throws any listing error other than `ENOENT` / `ENOTDIR`.
 */
async function standardSiteRulePaths() {
  const here = dirname(fileURLToPath(import.meta.url));
  const candidates = [
    resolve2(here, "../src/site-rules"),
    resolve2(here, "../../src/site-rules"),
    resolve2(process.cwd(), "src/site-rules"),
    resolve2(here, "../../src/feedloom/site_rules"),
    resolve2(process.cwd(), "../src/feedloom/site_rules"),
    resolve2(process.cwd(), "src/feedloom/site_rules")
  ];
  for (const dir of candidates) {
    let names;
    try {
      names = await readdir2(dir);
    } catch (error) {
      // Missing path, or a path (component) that is a regular file: this
      // candidate is simply unavailable, so move on to the next one.
      if (error.code === "ENOENT" || error.code === "ENOTDIR") {
        continue;
      }
      throw error;
    }
    return names.filter((name) => name.endsWith(".toml")).map((name) => join7(dir, name));
  }
  return [];
}
|
|
1643
|
+
/**
 * Parses a numeric CLI option into a non-negative integer.
 *
 * Zero is accepted because several options use it as "no limit". Negative
 * values are rejected: they are meaningless for the counts, 1-based indices
 * and wait times this helper is used for.
 *
 * @param {string | number | undefined | null} value - Raw option value.
 * @param {number} fallback - Used when `value` is null or undefined.
 * @returns {number} The parsed integer (>= 0).
 * @throws {Error} When the value is not an integer or is negative.
 */
function positiveIntOption(value, fallback) {
  const parsed = Number(value ?? fallback);
  if (!Number.isInteger(parsed)) {
    throw new Error(`Expected integer option, got ${String(value)}`);
  }
  if (parsed < 0) {
    throw new Error(`Expected non-negative integer option, got ${String(value)}`);
  }
  return parsed;
}
|
|
1650
|
+
// CLI definition: options, positional inputs and the batch-processing action.
// Exit codes set by the action: 0 = all items written, 1 = at least one item
// failed, 2 = the run itself could not be set up or torn down.
program
  .name("feedloom")
  .description("Archive long-form web content as clean Markdown with local assets")
  .version("0.1.0")
  .option("--output-dir <dir>", "Output directory for markdown notes", "clippings")
  .option("--source-kind <kind>", "auto, html-page, or rss-feed", "auto")
  .option("--since <date>", "Only keep feed entries on or after YYYY-MM-DD", "")
  .option("--limit <n>", "Process only first N deduplicated URLs", "0")
  .option("--start <n>", "Start from 1-based index after deduplication", "1")
  .option("--end <n>", "End at 1-based index after deduplication", "0")
  .option("--prefer-browser-state", "Try copied local Chrome profile before regular browser fallback", false)
  .option("--chrome-user-data-dir <path>", "Chrome user data directory used with --prefer-browser-state", "")
  .option("--chrome-profile <name>", "Chrome profile directory name", "Default")
  .option("--fetch-mode <mode>", "auto, static, browser, or stealth", "auto")
  .option("--no-network-idle", "Do not wait for browser networkidle before reading HTML")
  .option("--wait-ms <ms>", "Extra browser wait after load", "2500")
  .option("--solve-cloudflare", "In stealth mode, attempt Cloudflare Turnstile/interstitial challenge handling", false)
  .option("--disable-resources", "In stealth mode, block images/media/fonts/stylesheets for speed", false)
  .option("--proxy <server>", "Proxy server for browser/stealth fetch, e.g. http://127.0.0.1:8080", "")
  .option("--dns-over-https", "Use Chromium Cloudflare DNS-over-HTTPS flag for browser/stealth fetch", false)
  .option("--wait-selector <selector>", "Wait for a CSS selector after page load", "")
  .option("--wait-selector-state <state>", "attached, detached, visible, or hidden", "attached")
  .option("--click-selector <selector...>", "Click one or more selectors after page load", [])
  .option("--scroll-to-bottom", "Scroll to the bottom before reading HTML", false)
  .option("--headful", "Run browser/browser-state fetches with a visible Chrome window", false)
  .option("--no-real-chrome-defaults", "Disable Scrapling-inspired real Chrome context defaults")
  .option("--no-reuse-browser", "Disable batch browser/stealth context reuse")
  .argument("[inputs...]", "URLs or files containing URLs")
  .action(async (inputs, options) => {
    if (inputs.length === 0) {
      program.help({ error: true });
    }
    try {
      // Validate enumerated options up front so a typo fails before any fetch.
      const sourceKind = String(options.sourceKind ?? "auto");
      if (!["auto", "html-page", "rss-feed"].includes(sourceKind)) {
        throw new Error("--source-kind must be auto, html-page, or rss-feed");
      }
      const fetchMode = String(options.fetchMode ?? "auto");
      if (!["auto", "static", "browser", "stealth"].includes(fetchMode)) {
        throw new Error("--fetch-mode must be auto, static, browser, or stealth");
      }
      const waitMs = positiveIntOption(options.waitMs, 2500);
      const waitSelectorState = String(options.waitSelectorState ?? "attached");
      if (!["attached", "detached", "visible", "hidden"].includes(waitSelectorState)) {
        throw new Error("--wait-selector-state must be attached, detached, visible, or hidden");
      }

      // Resolve the work list: parse inputs, expand sources, apply the slice.
      const { items, checkboxFiles } = await parseInputs(inputs);
      const since = options.since ? parseSinceDate(String(options.since)) : void 0;
      const expanded = await expandSourceItems(items, sourceKind, since);
      const selected = sliceItems(
        expanded,
        positiveIntOption(options.start, 1),
        positiveIntOption(options.end, 0),
        positiveIntOption(options.limit, 0)
      );
      const profiles = await loadSiteProfiles(await standardSiteRulePaths());
      const outputDir = String(options.outputDir ?? "clippings");
      let failures = 0;
      const tracker = new ProgressTracker(selected, outputDir);
      if (tracker.path) {
        console.error(`Progress: ${tracker.path}`);
      }

      const browserOptions = {
        waitMs,
        networkIdle: Boolean(options.networkIdle),
        proxy: String(options.proxy || "") || void 0,
        dnsOverHttps: Boolean(options.dnsOverHttps),
        waitSelector: String(options.waitSelector || "") || void 0,
        waitSelectorState,
        clickSelectors: Array.isArray(options.clickSelector) ? options.clickSelector.map(String) : [],
        scrollToBottom: Boolean(options.scrollToBottom),
        headless: !Boolean(options.headful),
        realChromeDefaults: options.realChromeDefaults !== false
      };
      // Depends only on CLI options, so build it once instead of per item.
      const browserState = options.preferBrowserState ? {
        userDataDir: String(options.chromeUserDataDir || ""),
        profile: String(options.chromeProfile || "Default"),
        ...browserOptions
      } : null;
      const sessions = options.reuseBrowser === false ? null : new BatchFetchSessions({
        browser: browserOptions,
        stealth: {
          ...browserOptions,
          solveCloudflare: Boolean(options.solveCloudflare),
          disableResources: Boolean(options.disableResources)
        }
      });

      try {
        for (const item of selected) {
          tracker.start(item.url);
          try {
            const result = await processItem(item, {
              outputDir,
              profiles,
              browserState,
              fetchMode,
              ...browserOptions,
              solveCloudflare: Boolean(options.solveCloudflare),
              disableResources: Boolean(options.disableResources),
              browserFetch: sessions ? (targetUrl) => sessions.browserFetch(targetUrl) : void 0,
              stealthFetch: sessions ? (targetUrl) => sessions.stealthFetch(targetUrl) : void 0
            });
            console.error(`Wrote ${result.outputPath}`);
            tracker.done(item.url, result.outputPath);
            const checkbox = item.sourcePath ? checkboxFiles.get(item.sourcePath) : void 0;
            checkbox?.markDone(item.lineNo, item.url);
          } catch (error) {
            // One failing item must not stop the batch; record it and move on.
            failures += 1;
            const message = error?.message || String(error);
            tracker.fail(item.url, message);
            console.error(`Failed ${item.url}: ${message}`);
          }
        }
      } finally {
        try {
          await sessions?.close();
        } finally {
          // Persist checkbox state even when closing the browser sessions
          // fails, so items that were already written stay marked as done.
          await Promise.all([...checkboxFiles.values()].map((checkbox) => checkbox.save()));
        }
      }
      process.exitCode = failures > 0 ? 1 : 0;
    } catch (error) {
      console.error(error?.message || String(error));
      process.exitCode = 2;
    }
  });

// Do not leave the parse promise floating: report anything that escapes the action.
program.parseAsync().catch((error) => {
  console.error(error?.message || String(error));
  process.exitCode = 2;
});
|