@apmantza/greedysearch-pi 1.9.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,451 +1,566 @@
1
- // src/search/fetch-source.mjs — HTTP and browser-based source content fetching
2
- //
3
- // Extracted from search.mjs. Tries the cheapest path first: the GitHub/Reddit
4
- // API handlers, then plain Node.js HTTP (via fetcher.mjs). Chrome is only a fallback:
5
- // a CDP network load (Chrome's own TLS/JA3 fingerprint), then full page navigation.
6
-
7
- import {
8
- fetchSourceHttp,
9
- extractContent,
10
- detectBotBlock,
11
- checkContentQuality,
12
- } from "../fetcher.mjs";
13
- import { fetchGitHubContent, parseGitHubUrl } from "../github.mjs";
14
- import { fetchRedditContent, parseRedditUrl } from "../reddit.mjs";
15
- import { trimContentHeadTail } from "../utils/content.mjs";
16
- import { cdp, closeTab, openNewTab } from "./chrome.mjs";
17
- import { SOURCE_FETCH_CONCURRENCY } from "./constants.mjs";
18
- import { trimText } from "./sources.mjs";
19
-
/**
 * Drain a CDP stream handle into a UTF-8 string and close the handle.
 *
 * IO.read returns the body one chunk at a time and marks the final chunk
 * with `eof`, so a single read truncates any body larger than one chunk.
 * Chunks flagged `base64Encoded` are decoded to bytes first; text decoding
 * happens once at the end so a multi-byte character split across two chunks
 * is not corrupted.
 *
 * @param {string} tab - Tab identifier forwarded to `cdp`.
 * @param {string} handle - Stream handle reported by Network.loadNetworkResource.
 * @param {number} [maxBytes] - Reading stops once this many bytes are buffered.
 * @returns {Promise<string>} Body text, or "" when any read fails.
 */
async function readChromeStream(tab, handle, maxBytes = 10 * 1024 * 1024) {
  const params = JSON.stringify({ handle });
  const chunks = [];
  let total = 0;

  try {
    while (total < maxBytes) {
      const reply = JSON.parse(
        await cdp(["evalraw", tab, "IO.read", params], 10000),
      );
      const bytes = Buffer.from(
        reply.data || "",
        reply.base64Encoded ? "base64" : "utf8",
      );
      chunks.push(bytes);
      total += bytes.length;
      // Stop unless the reply explicitly says more data follows; an empty
      // chunk also ends the loop so a misbehaving stream cannot spin forever.
      if (reply.eof !== false || bytes.length === 0) break;
    }
    return Buffer.concat(chunks, total).toString("utf8");
  } catch {
    // An unreadable stream counts as an empty body, which the caller reports
    // as a fallback-worthy failure.
    return "";
  } finally {
    try {
      await cdp(["evalraw", tab, "IO.close", params]);
    } catch {
      // Best effort: a failed close must not discard a body that was read.
    }
  }
}

/**
 * Fetch a URL through Chrome's Network.loadNetworkResource (Chrome 124+ per
 * the original note), i.e. over Chrome's own network stack and without a
 * page navigation. The response body comes back through a CDP IO stream.
 *
 * Used as a fallback when the Node.js HTTP fetch fails.
 *
 * @param {string} tab - Tab identifier of an already-open tab.
 * @param {string} url - URL to load.
 * @param {number} [maxChars=8000] - Budget handed to `trimContentHeadTail`.
 * @returns {Promise<object>} On success a source record with `content`,
 *   `title`, `status` and `source: "chrome"`. On failure a record with
 *   `error` and either `needsFallback: true` (load/read failed) or
 *   `needsBrowser: true` (bot block or low-quality extraction). Never rejects.
 */
async function fetchSourceViaChrome(tab, url, maxChars = 8000) {
  const start = Date.now();
  const failure = (fields) => ({
    url,
    ...fields,
    source: "chrome",
    duration: Date.now() - start,
  });

  try {
    // loadNetworkResource wants the tab's frameId; if the frame tree cannot
    // be read the id is simply omitted from the request.
    const frames = await cdp(["evalraw", tab, "Page.getFrameTree", "{}"])
      .then((r) => JSON.parse(r))
      .catch(() => null);
    const frameId = frames?.frameTree?.frame?.id || undefined;

    const raw = await cdp(
      [
        "evalraw",
        tab,
        "Network.loadNetworkResource",
        JSON.stringify({
          frameId,
          url,
          options: { disableCache: true, includeCredentials: false },
        }),
      ],
      20000,
    );

    const { resource } = JSON.parse(raw);
    if (!resource?.success || !resource.httpStatusCode) {
      return failure({
        error:
          resource?.netErrorName ||
          resource?.netError ||
          "loadNetworkResource failed",
        needsFallback: true,
      });
    }

    const body = resource.stream
      ? await readChromeStream(tab, resource.stream)
      : "";
    if (!body || body.length < 100) {
      return failure({
        error: "Empty response body from Network.loadNetworkResource",
        needsFallback: true,
      });
    }

    const status = resource.httpStatusCode;
    const botCheck = detectBotBlock(status, body, url, url);
    if (botCheck.blocked) {
      return failure({
        status,
        error: `Blocked: ${botCheck.reason}`,
        needsBrowser: true,
      });
    }

    const extracted = extractContent(body, url);
    const quality = checkContentQuality(extracted);
    if (!quality.ok) {
      return failure({
        status,
        error: `Low quality: ${quality.reason}`,
        needsBrowser: true,
      });
    }

    const content = trimContentHeadTail(extracted.markdown, maxChars);
    return {
      url,
      finalUrl: url,
      status,
      contentType: "text/markdown",
      lastModified: "",
      publishedTime: extracted.publishedTime || "",
      byline: extracted.byline || "",
      siteName: extracted.siteName || "",
      lang: extracted.lang || "",
      title: extracted.title || url,
      snippet: extracted.excerpt,
      content,
      contentChars: content.length,
      source: "chrome",
      duration: Date.now() - start,
    };
  } catch (error) {
    return failure({ error: error.message, needsFallback: true });
  }
}
156
-
/**
 * Shape a successful Reddit/HTTP fetch into the common source record.
 * Both fetchers expose the same fields, so one mapper serves both.
 */
function toMarkdownSourceResult(url, fetched, source, maxChars, start) {
  const content = trimContentHeadTail(fetched.markdown, maxChars);
  return {
    url,
    finalUrl: fetched.finalUrl,
    status: fetched.status,
    contentType: "text/markdown",
    lastModified: fetched.lastModified || "",
    publishedTime: fetched.publishedTime || "",
    byline: fetched.byline || "",
    siteName: fetched.siteName || "",
    lang: fetched.lang || "",
    title: fetched.title,
    snippet: fetched.excerpt,
    content,
    contentChars: content.length,
    source,
    duration: Date.now() - start,
  };
}

/**
 * Try the Chrome network-stack fetch in a throwaway tab.
 * Returns the Chrome record when it carries more than 100 chars of content,
 * otherwise null. Tab cleanup is guarded so a failing `closeTab` cannot throw
 * away a result that was already fetched.
 */
async function tryChromeNetworkFetch(url, maxChars) {
  let chromeTab;
  try {
    chromeTab = await openNewTab();
  } catch {
    return null; // Chrome unavailable — caller falls through to the browser path
  }
  try {
    const chromeResult = await fetchSourceViaChrome(chromeTab, url, maxChars);
    return chromeResult.content && chromeResult.content.length > 100
      ? chromeResult
      : null;
  } catch {
    return null;
  } finally {
    try {
      await closeTab(chromeTab);
    } catch {
      // Best effort cleanup.
    }
  }
}

/**
 * Fetch one source URL, walking the strategies from cheapest to heaviest:
 * GitHub API (repo roots, trees and extension-less blobs), Reddit JSON API
 * (posts), Node.js HTTP, Chrome network-stack fetch (only when the HTTP
 * result sets `needsBrowser`), and finally full browser navigation.
 *
 * @param {string} url - Source URL.
 * @param {number} [maxChars=8000] - Budget handed to `trimContentHeadTail`.
 * @returns {Promise<object>} Source record; `source` names the strategy that
 *   produced it ("github-api", "reddit-api", "http", "chrome" or "browser").
 */
export async function fetchSourceContent(url, maxChars = 8000) {
  const start = Date.now();

  // Parse once; the result decides whether the GitHub API applies.
  const gh = parseGitHubUrl(url);
  const wantsGitHubApi =
    gh &&
    (gh.type === "root" ||
      gh.type === "tree" ||
      (gh.type === "blob" && !gh.path?.includes(".")));
  if (wantsGitHubApi) {
    const ghResult = await fetchGitHubContent(url);
    if (ghResult.ok) {
      const content = trimContentHeadTail(ghResult.content, maxChars);
      return {
        url,
        finalUrl: url,
        status: 200,
        contentType: "text/markdown",
        lastModified: "",
        title: ghResult.title,
        snippet: content.slice(0, 320),
        content,
        contentChars: content.length,
        source: "github-api",
        ...(ghResult.tree && { tree: ghResult.tree }),
        duration: Date.now() - start,
      };
    }
    process.stderr.write(
      `[greedysearch] GitHub API fetch failed, trying HTTP: ${ghResult.error}\n`,
    );
  }

  const redditInfo = parseRedditUrl(url);
  if (redditInfo?.type === "post") {
    process.stderr.write(
      `[greedysearch] Using Reddit JSON API for: ${url.slice(0, 60)}...\n`,
    );
    const redditResult = await fetchRedditContent(url, maxChars);
    if (redditResult.ok) {
      return toMarkdownSourceResult(url, redditResult, "reddit-api", maxChars, start);
    }
    process.stderr.write(
      `[greedysearch] Reddit API fetch failed, falling back to HTTP: ${redditResult.error}\n`,
    );
  }

  // Plain Node.js HTTP is tried before any Chrome-based path.
  const httpResult = await fetchSourceHttp(url, { timeoutMs: 10000 });
  if (httpResult.ok) {
    return toMarkdownSourceResult(url, httpResult, "http", maxChars, start);
  }

  // Chrome's network stack is only worth a try when the HTTP fetcher says so.
  if (httpResult.needsBrowser) {
    const chromeResult = await tryChromeNetworkFetch(url, maxChars);
    if (chromeResult) return chromeResult;
  }

  // Last resort — full browser navigation (handles JS-heavy pages).
  process.stderr.write(
    `[greedysearch] HTTP failed for ${url.slice(0, 60)}, trying browser...\n`,
  );
  return await fetchSourceContentBrowser(url, maxChars);
}
277
-
/**
 * Load a URL by full page navigation in a fresh tab and return the visible
 * text of its main content element (falling back to `document.body`).
 *
 * @param {string} url - Page to open.
 * @param {number} [maxChars=8000] - Budget handed to `trimContentHeadTail`.
 * @returns {Promise<object>} Source record with `source: "browser"`. On any
 *   failure `content` is null and `error` holds the message. Never rejects:
 *   tab cleanup is guarded so a failing `closeTab` cannot replace the result.
 */
async function fetchSourceContentBrowser(url, maxChars = 8000) {
  const start = Date.now();
  const failure = (message) => ({
    url,
    title: "",
    content: null,
    snippet: "",
    contentChars: 0,
    error: message,
    source: "browser",
    duration: Date.now() - start,
  });

  let tab;
  try {
    tab = await openNewTab();
  } catch (e) {
    return failure(`openNewTab failed: ${e.message}`);
  }

  try {
    await cdp(["nav", tab, url], 30000);
    // Short fixed pause after navigation before the DOM is read.
    await new Promise((r) => setTimeout(r, 800));

    const raw = await cdp([
      "eval",
      tab,
      String.raw`
        (function(){
          var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
          var text = (el || document.body).innerText;
          return JSON.stringify({
            title: document.title,
            content: text.replace(/\s+/g, ' ').trim(),
            url: location.href
          });
        })()
      `,
    ]);

    const page = JSON.parse(raw);
    const finalContent = trimContentHeadTail(page.content, maxChars);

    return {
      url,
      finalUrl: page.url || url,
      status: 200,
      contentType: "text/plain",
      lastModified: "",
      title: page.title,
      snippet: trimText(finalContent, 320),
      content: finalContent,
      contentChars: finalContent.length,
      source: "browser",
      duration: Date.now() - start,
    };
  } catch (error) {
    return failure(error.message);
  } finally {
    try {
      await closeTab(tab);
    } catch {
      // A throw here would override the value being returned above.
    }
  }
}
348
-
/**
 * Fetch the first `maxSources` sources with a small worker pool.
 *
 * Results keep the order of `sources`; each entry is the record returned by
 * `fetchSourceContent` plus the source's `id`. A source that cannot be
 * fetched (no URL, or the fetch throws) yields an error record instead of
 * failing the whole batch. Progress is written to stderr, including
 * `PROGRESS:fetch:<done>/<total>` lines.
 *
 * @param {Array<object>} sources - Items with `id` and `canonicalUrl`/`url`.
 * @param {number} [maxSources=5] - How many leading sources to fetch.
 * @param {number} [maxChars=8000] - Per-source budget for `fetchSourceContent`.
 * @param {number|string} [concurrency] - Worker count; invalid values fall
 *   back to SOURCE_FETCH_CONCURRENCY, values below 1 are raised to 1.
 * @returns {Promise<Array<object>>} One record per fetched source.
 */
export async function fetchMultipleSources(
  sources,
  maxSources = 5,
  maxChars = 8000,
  concurrency = SOURCE_FETCH_CONCURRENCY,
) {
  const toFetch = Array.isArray(sources) ? sources.slice(0, maxSources) : [];
  if (toFetch.length === 0) return [];

  const workerCount = Math.min(
    toFetch.length,
    Math.max(
      1,
      Number.parseInt(String(concurrency), 10) || SOURCE_FETCH_CONCURRENCY,
    ),
  );

  process.stderr.write(
    `[greedysearch] Fetching content from ${toFetch.length} sources via HTTP (concurrency ${workerCount})...\n`,
  );

  const failed = (url, message) => ({
    url,
    title: "",
    content: null,
    snippet: "",
    contentChars: 0,
    error: message,
    source: "error",
    duration: 0,
  });

  // Fetch a single source; always resolves to a record.
  async function fetchOne(index) {
    const s = toFetch[index];
    const url = s?.canonicalUrl || s?.url;
    if (typeof url !== "string" || url === "") {
      return failed("", "Source has no URL");
    }
    process.stderr.write(
      `[greedysearch] [${index + 1}/${toFetch.length}] Fetching: ${url.slice(0, 60)}...\n`,
    );
    try {
      const record = await fetchSourceContent(url, maxChars);
      return record ?? failed(url, "No result returned");
    } catch (e) {
      return failed(url, e?.message || String(e));
    }
  }

  const fetched = new Array(toFetch.length);
  let nextIndex = 0;
  let completed = 0;

  // Workers pull the next unclaimed index; the claim is synchronous, so two
  // workers never take the same slot.
  async function worker() {
    while (true) {
      const index = nextIndex++;
      if (index >= toFetch.length) return;

      const result = await fetchOne(index);
      fetched[index] = {
        id: toFetch[index]?.id,
        ...result,
      };

      if (result.content && result.content.length > 100) {
        process.stderr.write(
          `[greedysearch] ✓ ${result.source}: ${result.content.length} chars\n`,
        );
      } else if (result.error) {
        process.stderr.write(
          `[greedysearch] ✗ ${String(result.error).slice(0, 80)}\n`,
        );
      }

      completed += 1;
      process.stderr.write(`PROGRESS:fetch:${completed}/${toFetch.length}\n`);
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  // Log summary
  const successful = fetched.filter((f) => f.content && f.content.length > 100);
  const httpCount = fetched.filter((f) => f.source === "http").length;
  const browserCount = fetched.filter((f) => f.source === "browser").length;

  process.stderr.write(
    `[greedysearch] Fetched ${successful.length}/${fetched.length} sources ` +
      `(HTTP: ${httpCount}, Browser: ${browserCount})\n`,
  );

  return fetched;
}
427
-
/**
 * Open `url` in a fresh tab and return the whitespace-collapsed visible text
 * of its main content element (falling back to `document.body`).
 *
 * @param {string} url - Page to open.
 * @returns {Promise<{url: string, content: string|null, error?: string}>}
 *   `content` is null and `error` is set when anything after the tab was
 *   opened fails. Rejects only if `openNewTab` itself fails.
 */
export async function fetchTopSource(url) {
  const tab = await openNewTab();
  try {
    // Inside the try so a failure here still closes the tab opened above.
    await cdp(["list"]); // refresh cache
    await cdp(["nav", tab, url], 30000);
    await new Promise((r) => setTimeout(r, 800));
    const content = await cdp([
      "eval",
      tab,
      String.raw`
        (function(){
          var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
          var text = (el || document.body).innerText;
          return text.replace(/\s+/g, ' ').trim();
        })()
      `,
    ]);
    return { url, content };
  } catch (e) {
    return { url, content: null, error: e.message };
  } finally {
    try {
      await closeTab(tab);
    } catch {
      // A throw here would override the value being returned above.
    }
  }
}
1
+ // src/search/fetch-source.mjs — HTTP and browser-based source content fetching
2
+ //
3
+ // Extracted from search.mjs. Tries the cheapest path first: the PDF/GitHub/Reddit
4
+ // handlers, then plain Node.js HTTP (via fetcher.mjs). Chrome is only a fallback:
5
+ // a CDP network load (Chrome's own TLS/JA3 fingerprint), then full page navigation.
6
+
7
+ import {
8
+ fetchSourceHttp,
9
+ extractContent,
10
+ detectBotBlock,
11
+ checkContentQuality,
12
+ defaultFetchHeaders,
13
+ isPrivateUrl,
14
+ } from "../fetcher.mjs";
15
+ import { fetchGitHubContent, parseGitHubUrl } from "../github.mjs";
16
+ import { fetchRedditContent, parseRedditUrl } from "../reddit.mjs";
17
+ import { trimContentHeadTail } from "../utils/content.mjs";
18
+ import { cdp, closeTab, openNewTab } from "./chrome.mjs";
19
+ import { SOURCE_FETCH_CONCURRENCY } from "./constants.mjs";
20
+ import { extractPdfMarkdown } from "./pdf.mjs";
21
+ import { trimText } from "./sources.mjs";
22
+
/**
 * Drain a CDP stream handle into a UTF-8 string and close the handle.
 *
 * IO.read returns the body one chunk at a time and marks the final chunk
 * with `eof`, so a single read truncates any body larger than one chunk.
 * Chunks flagged `base64Encoded` are decoded to bytes first; text decoding
 * happens once at the end so a multi-byte character split across two chunks
 * is not corrupted.
 *
 * @param {string} tab - Tab identifier forwarded to `cdp`.
 * @param {string} handle - Stream handle reported by Network.loadNetworkResource.
 * @param {number} [maxBytes] - Reading stops once this many bytes are buffered.
 * @returns {Promise<string>} Body text, or "" when any read fails.
 */
async function readChromeStream(tab, handle, maxBytes = 10 * 1024 * 1024) {
  const params = JSON.stringify({ handle });
  const chunks = [];
  let total = 0;

  try {
    while (total < maxBytes) {
      const reply = JSON.parse(
        await cdp(["evalraw", tab, "IO.read", params], 10000),
      );
      const bytes = Buffer.from(
        reply.data || "",
        reply.base64Encoded ? "base64" : "utf8",
      );
      chunks.push(bytes);
      total += bytes.length;
      // Stop unless the reply explicitly says more data follows; an empty
      // chunk also ends the loop so a misbehaving stream cannot spin forever.
      if (reply.eof !== false || bytes.length === 0) break;
    }
    return Buffer.concat(chunks, total).toString("utf8");
  } catch {
    // An unreadable stream counts as an empty body, which the caller reports
    // as a fallback-worthy failure.
    return "";
  } finally {
    try {
      await cdp(["evalraw", tab, "IO.close", params]);
    } catch {
      // Best effort: a failed close must not discard a body that was read.
    }
  }
}

/**
 * Fetch a URL through Chrome's Network.loadNetworkResource (Chrome 124+ per
 * the original note), i.e. over Chrome's own network stack and without a
 * page navigation. The response body comes back through a CDP IO stream.
 *
 * Used as a fallback when the Node.js HTTP fetch fails.
 *
 * @param {string} tab - Tab identifier of an already-open tab.
 * @param {string} url - URL to load.
 * @param {number} [maxChars=8000] - Budget handed to `trimContentHeadTail`.
 * @returns {Promise<object>} On success a source record with `content`,
 *   `title`, `status` and `source: "chrome"`. On failure a record with
 *   `error` and either `needsFallback: true` (load/read failed) or
 *   `needsBrowser: true` (bot block or low-quality extraction). Never rejects.
 */
async function fetchSourceViaChrome(tab, url, maxChars = 8000) {
  const start = Date.now();
  const failure = (fields) => ({
    url,
    ...fields,
    source: "chrome",
    duration: Date.now() - start,
  });

  try {
    // loadNetworkResource wants the tab's frameId; if the frame tree cannot
    // be read the id is simply omitted from the request.
    const frames = await cdp(["evalraw", tab, "Page.getFrameTree", "{}"])
      .then((r) => JSON.parse(r))
      .catch(() => null);
    const frameId = frames?.frameTree?.frame?.id || undefined;

    const raw = await cdp(
      [
        "evalraw",
        tab,
        "Network.loadNetworkResource",
        JSON.stringify({
          frameId,
          url,
          options: { disableCache: true, includeCredentials: false },
        }),
      ],
      20000,
    );

    const { resource } = JSON.parse(raw);
    if (!resource?.success || !resource.httpStatusCode) {
      return failure({
        error:
          resource?.netErrorName ||
          resource?.netError ||
          "loadNetworkResource failed",
        needsFallback: true,
      });
    }

    const body = resource.stream
      ? await readChromeStream(tab, resource.stream)
      : "";
    if (!body || body.length < 100) {
      return failure({
        error: "Empty response body from Network.loadNetworkResource",
        needsFallback: true,
      });
    }

    const status = resource.httpStatusCode;
    const botCheck = detectBotBlock(status, body, url, url);
    if (botCheck.blocked) {
      return failure({
        status,
        error: `Blocked: ${botCheck.reason}`,
        needsBrowser: true,
      });
    }

    const extracted = extractContent(body, url);
    const quality = checkContentQuality(extracted);
    if (!quality.ok) {
      return failure({
        status,
        error: `Low quality: ${quality.reason}`,
        needsBrowser: true,
      });
    }

    const content = trimContentHeadTail(extracted.markdown, maxChars);
    return {
      url,
      finalUrl: url,
      status,
      contentType: "text/markdown",
      lastModified: "",
      publishedTime: extracted.publishedTime || "",
      byline: extracted.byline || "",
      siteName: extracted.siteName || "",
      lang: extracted.lang || "",
      title: extracted.title || url,
      snippet: extracted.excerpt,
      content,
      contentChars: content.length,
      source: "chrome",
      duration: Date.now() - start,
    };
  } catch (error) {
    return failure({ error: error.message, needsFallback: true });
  }
}
159
+
/**
 * Tell whether a URL's path ends in ".pdf" (case-insensitive).
 * Query string and fragment are ignored; input that does not parse as a URL
 * yields false.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} True when the pathname ends with ".pdf".
 */
function isLikelyPdfUrl(url) {
  let pathname;
  try {
    ({ pathname } = new URL(url));
  } catch {
    return false;
  }
  return pathname.toLowerCase().endsWith(".pdf");
}
168
+
/** Release an unread response body so the connection is not left held open. */
async function discardPdfResponseBody(response) {
  try {
    await response.body?.cancel();
  } catch {
    // Already consumed, locked or errored — nothing left to release.
  }
}

/**
 * Read a response body into a Buffer, giving up once it exceeds `maxBytes`.
 * Enforces the cap even when the server sent no (or a wrong) Content-Length.
 *
 * @returns {Promise<Buffer|null>} The body, or null when it is too large.
 */
async function readPdfBodyCapped(response, maxBytes) {
  if (!response.body) {
    const whole = Buffer.from(await response.arrayBuffer());
    return whole.byteLength > maxBytes ? null : whole;
  }
  const parts = [];
  let total = 0;
  for await (const part of response.body) {
    total += part.byteLength;
    if (total > maxBytes) return null; // leaving the loop cancels the stream
    parts.push(part);
  }
  return Buffer.concat(parts, total);
}

/**
 * Download a PDF over HTTP and convert it to markdown via `extractPdfMarkdown`.
 *
 * @param {string} url - PDF URL.
 * @param {number} [maxChars=8000] - Budget handed to `trimContentHeadTail`.
 * @param {object} [limits]
 * @param {number} [limits.timeoutMs=20000] - Abort deadline covering the
 *   request and the body download.
 * @param {number} [limits.maxBytes=31457280] - Largest body accepted.
 * @returns {Promise<object|null>} A source record with `source: "pdf-http"`:
 *   with `content` on success, with `error` otherwise (`status: 403` when the
 *   URL, or the URL reached after redirects, is rejected by `isPrivateUrl`).
 *   Null when the response turns out not to be a PDF. Never rejects for
 *   network or extraction errors.
 */
async function fetchPdfSourceHttp(
  url,
  maxChars = 8000,
  { timeoutMs = 20000, maxBytes = 30 * 1024 * 1024 } = {},
) {
  const start = Date.now();
  const failure = (error, extra = {}) => ({
    url,
    finalUrl: url,
    ...extra,
    error,
    source: "pdf-http",
    duration: Date.now() - start,
  });

  const privateCheck = isPrivateUrl(url);
  if (privateCheck.blocked) {
    return failure(`Blocked: ${privateCheck.reason}`, { status: 403 });
  }

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(url, {
      method: "GET",
      redirect: "follow",
      signal: controller.signal,
      headers: defaultFetchHeaders({
        accept: "application/pdf,application/octet-stream;q=0.9,*/*;q=0.5",
      }),
    });

    const contentType = response.headers.get("content-type") || "";
    const finalUrl = response.url || url;
    const meta = { finalUrl, status: response.status };

    // Redirects are followed automatically, so the URL actually reached has
    // to pass the same check as the one requested before its body is read.
    if (finalUrl !== url) {
      const redirectCheck = isPrivateUrl(finalUrl);
      if (redirectCheck.blocked) {
        await discardPdfResponseBody(response);
        return failure(`Blocked: ${redirectCheck.reason}`, {
          finalUrl,
          status: 403,
        });
      }
    }

    if (response.status >= 400) {
      await discardPdfResponseBody(response);
      return failure(`HTTP ${response.status}`, meta);
    }
    if (
      !contentType.toLowerCase().includes("application/pdf") &&
      !isLikelyPdfUrl(finalUrl)
    ) {
      await discardPdfResponseBody(response);
      return null;
    }

    const contentLength = Number.parseInt(
      response.headers.get("content-length") || "0",
      10,
    );
    if (contentLength > maxBytes) {
      await discardPdfResponseBody(response);
      return failure(`PDF too large: ${contentLength} bytes`, meta);
    }

    const buffer = await readPdfBodyCapped(response, maxBytes);
    if (!buffer) {
      return failure(`PDF too large: more than ${maxBytes} bytes`, meta);
    }

    const pdf = await extractPdfMarkdown(buffer, finalUrl);
    if (!pdf || pdf.error) {
      return failure(pdf?.error || "PDF text extraction failed", meta);
    }

    const content = trimContentHeadTail(pdf.content, maxChars);
    return {
      url,
      finalUrl,
      status: response.status,
      contentType: "application/pdf",
      lastModified: response.headers.get("last-modified") || "",
      title: pdf.title,
      snippet: trimText(content, 320),
      content,
      contentChars: content.length,
      pages: pdf.pages,
      source: "pdf-http",
      duration: Date.now() - start,
    };
  } catch (error) {
    return failure(error.message || String(error));
  } finally {
    // Cleared only here so the deadline also bounds the body download.
    clearTimeout(timeoutId);
  }
}
266
+
/**
 * Shape a successful Reddit/HTTP fetch into the common source record.
 * Both fetchers expose the same fields, so one mapper serves both.
 */
function toMarkdownSourceResult(url, fetched, source, maxChars, start) {
  const content = trimContentHeadTail(fetched.markdown, maxChars);
  return {
    url,
    finalUrl: fetched.finalUrl,
    status: fetched.status,
    contentType: "text/markdown",
    lastModified: fetched.lastModified || "",
    publishedTime: fetched.publishedTime || "",
    byline: fetched.byline || "",
    siteName: fetched.siteName || "",
    lang: fetched.lang || "",
    title: fetched.title,
    snippet: fetched.excerpt,
    content,
    contentChars: content.length,
    source,
    duration: Date.now() - start,
  };
}

/**
 * Try the Chrome network-stack fetch in a throwaway tab.
 * Returns the Chrome record when it carries more than 100 chars of content,
 * otherwise null. Tab cleanup is guarded so a failing `closeTab` cannot throw
 * away a result that was already fetched.
 */
async function tryChromeNetworkFetch(url, maxChars) {
  let chromeTab;
  try {
    chromeTab = await openNewTab();
  } catch {
    return null; // Chrome unavailable — caller falls through to the browser path
  }
  try {
    const chromeResult = await fetchSourceViaChrome(chromeTab, url, maxChars);
    return chromeResult.content && chromeResult.content.length > 100
      ? chromeResult
      : null;
  } catch {
    return null;
  } finally {
    try {
      await closeTab(chromeTab);
    } catch {
      // Best effort cleanup.
    }
  }
}

/**
 * Fetch one source URL, walking the strategies from cheapest to heaviest:
 * PDF download (URLs ending in .pdf), GitHub API (repo roots, trees and
 * extension-less blobs), Reddit JSON API (posts), Node.js HTTP, Chrome
 * network-stack fetch (only when the HTTP result sets `needsBrowser`), and
 * finally full browser navigation.
 *
 * URLs rejected by `isPrivateUrl` are refused before any strategy runs. The
 * last-resort browser navigation is unconditional, so without this guard it
 * would load a URL that the PDF path refuses.
 *
 * @param {string} url - Source URL.
 * @param {number} [maxChars=8000] - Budget handed to `trimContentHeadTail`.
 * @returns {Promise<object>} Source record; `source` names the strategy that
 *   produced it ("pdf-http", "github-api", "reddit-api", "http", "chrome",
 *   "browser"), or "blocked" for a refused private URL.
 */
export async function fetchSourceContent(url, maxChars = 8000) {
  const start = Date.now();

  let privateCheck;
  try {
    privateCheck = isPrivateUrl(url);
  } catch {
    // The check could not decide (e.g. unparseable input); keep the previous
    // behaviour and let the fetch strategies report their own errors.
    privateCheck = { blocked: false };
  }
  if (privateCheck?.blocked) {
    return {
      url,
      finalUrl: url,
      status: 403,
      title: "",
      content: null,
      snippet: "",
      contentChars: 0,
      error: `Blocked: ${privateCheck.reason}`,
      source: "blocked",
      duration: Date.now() - start,
    };
  }

  if (isLikelyPdfUrl(url)) {
    const pdfResult = await fetchPdfSourceHttp(url, maxChars);
    if (pdfResult?.content || pdfResult?.status === 403) return pdfResult;
  }

  // Parse once; the result decides whether the GitHub API applies.
  const gh = parseGitHubUrl(url);
  const wantsGitHubApi =
    gh &&
    (gh.type === "root" ||
      gh.type === "tree" ||
      (gh.type === "blob" && !gh.path?.includes(".")));
  if (wantsGitHubApi) {
    const ghResult = await fetchGitHubContent(url);
    if (ghResult.ok) {
      const content = trimContentHeadTail(ghResult.content, maxChars);
      return {
        url,
        finalUrl: url,
        status: 200,
        contentType: "text/markdown",
        lastModified: "",
        title: ghResult.title,
        snippet: content.slice(0, 320),
        content,
        contentChars: content.length,
        source: "github-api",
        ...(ghResult.tree && { tree: ghResult.tree }),
        duration: Date.now() - start,
      };
    }
    process.stderr.write(
      `[greedysearch] GitHub API fetch failed, trying HTTP: ${ghResult.error}\n`,
    );
  }

  const redditInfo = parseRedditUrl(url);
  if (redditInfo?.type === "post") {
    process.stderr.write(
      `[greedysearch] Using Reddit JSON API for: ${url.slice(0, 60)}...\n`,
    );
    const redditResult = await fetchRedditContent(url, maxChars);
    if (redditResult.ok) {
      return toMarkdownSourceResult(url, redditResult, "reddit-api", maxChars, start);
    }
    process.stderr.write(
      `[greedysearch] Reddit API fetch failed, falling back to HTTP: ${redditResult.error}\n`,
    );
  }

  // Plain Node.js HTTP is tried before any Chrome-based path.
  const httpResult = await fetchSourceHttp(url, { timeoutMs: 10000 });
  if (httpResult.ok) {
    return toMarkdownSourceResult(url, httpResult, "http", maxChars, start);
  }

  // Chrome's network stack is only worth a try when the HTTP fetcher says so.
  if (httpResult.needsBrowser) {
    const chromeResult = await tryChromeNetworkFetch(url, maxChars);
    if (chromeResult) return chromeResult;
  }

  // Last resort — full browser navigation (handles JS-heavy pages).
  process.stderr.write(
    `[greedysearch] HTTP failed for ${url.slice(0, 60)}, trying browser...\n`,
  );
  return await fetchSourceContentBrowser(url, maxChars);
}
392
+
/**
 * Load a URL by full page navigation in a fresh tab and return the visible
 * text of its main content element (falling back to `document.body`).
 *
 * @param {string} url - Page to open.
 * @param {number} [maxChars=8000] - Budget handed to `trimContentHeadTail`.
 * @returns {Promise<object>} Source record with `source: "browser"`. On any
 *   failure `content` is null and `error` holds the message. Never rejects:
 *   tab cleanup is guarded so a failing `closeTab` cannot replace the result.
 */
async function fetchSourceContentBrowser(url, maxChars = 8000) {
  const start = Date.now();
  const failure = (message) => ({
    url,
    title: "",
    content: null,
    snippet: "",
    contentChars: 0,
    error: message,
    source: "browser",
    duration: Date.now() - start,
  });

  let tab;
  try {
    tab = await openNewTab();
  } catch (e) {
    return failure(`openNewTab failed: ${e.message}`);
  }

  try {
    await cdp(["nav", tab, url], 30000);
    // Short fixed pause after navigation before the DOM is read.
    await new Promise((r) => setTimeout(r, 800));

    const raw = await cdp([
      "eval",
      tab,
      String.raw`
        (function(){
          var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
          var text = (el || document.body).innerText;
          return JSON.stringify({
            title: document.title,
            content: text.replace(/\s+/g, ' ').trim(),
            url: location.href
          });
        })()
      `,
    ]);

    const page = JSON.parse(raw);
    const finalContent = trimContentHeadTail(page.content, maxChars);

    return {
      url,
      finalUrl: page.url || url,
      status: 200,
      contentType: "text/plain",
      lastModified: "",
      title: page.title,
      snippet: trimText(finalContent, 320),
      content: finalContent,
      contentChars: finalContent.length,
      source: "browser",
      duration: Date.now() - start,
    };
  } catch (error) {
    return failure(error.message);
  } finally {
    try {
      await closeTab(tab);
    } catch {
      // A throw here would override the value being returned above.
    }
  }
}
463
+
/**
 * Fetch the first `maxSources` sources with a small worker pool.
 *
 * Results keep the order of `sources`; each entry is the record returned by
 * `fetchSourceContent` plus the source's `id`. A source that cannot be
 * fetched (no URL, or the fetch throws) yields an error record instead of
 * failing the whole batch. Progress is written to stderr, including
 * `PROGRESS:fetch:<done>/<total>` lines.
 *
 * @param {Array<object>} sources - Items with `id` and `canonicalUrl`/`url`.
 * @param {number} [maxSources=5] - How many leading sources to fetch.
 * @param {number} [maxChars=8000] - Per-source budget for `fetchSourceContent`.
 * @param {number|string} [concurrency] - Worker count; invalid values fall
 *   back to SOURCE_FETCH_CONCURRENCY, values below 1 are raised to 1.
 * @returns {Promise<Array<object>>} One record per fetched source.
 */
export async function fetchMultipleSources(
  sources,
  maxSources = 5,
  maxChars = 8000,
  concurrency = SOURCE_FETCH_CONCURRENCY,
) {
  const toFetch = Array.isArray(sources) ? sources.slice(0, maxSources) : [];
  if (toFetch.length === 0) return [];

  const workerCount = Math.min(
    toFetch.length,
    Math.max(
      1,
      Number.parseInt(String(concurrency), 10) || SOURCE_FETCH_CONCURRENCY,
    ),
  );

  process.stderr.write(
    `[greedysearch] Fetching content from ${toFetch.length} sources via HTTP (concurrency ${workerCount})...\n`,
  );

  const failed = (url, message) => ({
    url,
    title: "",
    content: null,
    snippet: "",
    contentChars: 0,
    error: message,
    source: "error",
    duration: 0,
  });

  // Fetch a single source; always resolves to a record.
  async function fetchOne(index) {
    const s = toFetch[index];
    const url = s?.canonicalUrl || s?.url;
    if (typeof url !== "string" || url === "") {
      return failed("", "Source has no URL");
    }
    process.stderr.write(
      `[greedysearch] [${index + 1}/${toFetch.length}] Fetching: ${url.slice(0, 60)}...\n`,
    );
    try {
      const record = await fetchSourceContent(url, maxChars);
      return record ?? failed(url, "No result returned");
    } catch (e) {
      return failed(url, e?.message || String(e));
    }
  }

  const fetched = new Array(toFetch.length);
  let nextIndex = 0;
  let completed = 0;

  // Workers pull the next unclaimed index; the claim is synchronous, so two
  // workers never take the same slot.
  async function worker() {
    while (true) {
      const index = nextIndex++;
      if (index >= toFetch.length) return;

      const result = await fetchOne(index);
      fetched[index] = {
        id: toFetch[index]?.id,
        ...result,
      };

      if (result.content && result.content.length > 100) {
        process.stderr.write(
          `[greedysearch] ✓ ${result.source}: ${result.content.length} chars\n`,
        );
      } else if (result.error) {
        process.stderr.write(
          `[greedysearch] ✗ ${String(result.error).slice(0, 80)}\n`,
        );
      }

      completed += 1;
      process.stderr.write(`PROGRESS:fetch:${completed}/${toFetch.length}\n`);
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  // Log summary
  const successful = fetched.filter((f) => f.content && f.content.length > 100);
  const httpCount = fetched.filter((f) => f.source === "http").length;
  const browserCount = fetched.filter((f) => f.source === "browser").length;

  process.stderr.write(
    `[greedysearch] Fetched ${successful.length}/${fetched.length} sources ` +
      `(HTTP: ${httpCount}, Browser: ${browserCount})\n`,
  );

  return fetched;
}
542
+
/**
 * Open `url` in a fresh tab and return the whitespace-collapsed visible text
 * of its main content element (falling back to `document.body`).
 *
 * @param {string} url - Page to open.
 * @returns {Promise<{url: string, content: string|null, error?: string}>}
 *   `content` is null and `error` is set when anything after the tab was
 *   opened fails. Rejects only if `openNewTab` itself fails.
 */
export async function fetchTopSource(url) {
  const tab = await openNewTab();
  try {
    // Inside the try so a failure here still closes the tab opened above.
    await cdp(["list"]); // refresh cache
    await cdp(["nav", tab, url], 30000);
    await new Promise((r) => setTimeout(r, 800));
    const content = await cdp([
      "eval",
      tab,
      String.raw`
        (function(){
          var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
          var text = (el || document.body).innerText;
          return text.replace(/\s+/g, ' ').trim();
        })()
      `,
    ]);
    return { url, content };
  } catch (e) {
    return { url, content: null, error: e.message };
  } finally {
    try {
      await closeTab(tab);
    } catch {
      // A throw here would override the value being returned above.
    }
  }
}