@apmantza/greedysearch-pi 1.8.3 → 1.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,262 +1,262 @@
1
- // src/search/fetch-source.mjs — HTTP and browser-based source content fetching
2
- //
3
- // Extracted from search.mjs. Uses fetchSourceHttp from src/fetcher.mjs
4
- // with browser fallback via CDP, plus GitHub content fetching.
5
-
6
- import { spawn } from "node:child_process";
7
- import { tmpdir } from "node:os";
8
- import { join } from "node:path";
9
- import { fetchSourceHttp, shouldUseBrowser } from "../fetcher.mjs";
10
- import { fetchGitHubContent, parseGitHubUrl } from "../github.mjs";
11
- import { fetchRedditContent, parseRedditUrl } from "../reddit.mjs";
12
- import { trimContentHeadTail } from "../utils/content.mjs";
13
- import { cdp, closeTab, closeTabs, openNewTab } from "./chrome.mjs";
14
- import { SOURCE_FETCH_CONCURRENCY } from "./constants.mjs";
15
- import { trimText } from "./sources.mjs";
16
-
17
- export async function fetchSourceContent(url, maxChars = 8000) {
18
- const start = Date.now();
19
-
20
- // Check if it's a GitHub URL
21
- if (parseGitHubUrl(url)) {
22
- const parsed = parseGitHubUrl(url);
23
- if (
24
- parsed &&
25
- (parsed.type === "root" ||
26
- parsed.type === "tree" ||
27
- (parsed.type === "blob" && !parsed.path?.includes(".")))
28
- ) {
29
- const ghResult = await fetchGitHubContent(url);
30
- if (ghResult.ok) {
31
- const content = trimContentHeadTail(ghResult.content, maxChars);
32
- return {
33
- url,
34
- finalUrl: url,
35
- status: 200,
36
- contentType: "text/markdown",
37
- lastModified: "",
38
- title: ghResult.title,
39
- snippet: content.slice(0, 320),
40
- content,
41
- contentChars: content.length,
42
- source: "github-api",
43
- ...(ghResult.tree && { tree: ghResult.tree }),
44
- duration: Date.now() - start,
45
- };
46
- }
47
- process.stderr.write(
48
- `[greedysearch] GitHub API fetch failed, trying HTTP: ${ghResult.error}\n`,
49
- );
50
- }
51
- }
52
-
53
- // Check if it's a Reddit URL (posts and comments)
54
- const redditInfo = parseRedditUrl(url);
55
- if (redditInfo?.type === "post") {
56
- process.stderr.write(
57
- `[greedysearch] Using Reddit JSON API for: ${url.slice(0, 60)}...\n`,
58
- );
59
- const redditResult = await fetchRedditContent(url, maxChars);
60
- if (redditResult.ok) {
61
- const content = trimContentHeadTail(redditResult.markdown, maxChars);
62
- return {
63
- url,
64
- finalUrl: redditResult.finalUrl,
65
- status: redditResult.status,
66
- contentType: "text/markdown",
67
- lastModified: redditResult.lastModified || "",
68
- publishedTime: redditResult.publishedTime || "",
69
- byline: redditResult.byline || "",
70
- siteName: redditResult.siteName || "",
71
- lang: redditResult.lang || "",
72
- title: redditResult.title,
73
- snippet: redditResult.excerpt,
74
- content,
75
- contentChars: content.length,
76
- source: "reddit-api",
77
- duration: Date.now() - start,
78
- };
79
- }
80
- process.stderr.write(
81
- `[greedysearch] Reddit API fetch failed, falling back to HTTP: ${redditResult.error}\n`,
82
- );
83
- }
84
-
85
- // Try HTTP first
86
- const httpResult = await fetchSourceHttp(url, { timeoutMs: 15000 });
87
-
88
- if (httpResult.ok) {
89
- const content = trimContentHeadTail(httpResult.markdown, maxChars);
90
- return {
91
- url,
92
- finalUrl: httpResult.finalUrl,
93
- status: httpResult.status,
94
- contentType: "text/markdown",
95
- lastModified: httpResult.lastModified || "",
96
- publishedTime: httpResult.publishedTime || "",
97
- byline: httpResult.byline || "",
98
- siteName: httpResult.siteName || "",
99
- lang: httpResult.lang || "",
100
- title: httpResult.title,
101
- snippet: httpResult.excerpt,
102
- content,
103
- contentChars: content.length,
104
- source: "http",
105
- duration: Date.now() - start,
106
- };
107
- }
108
-
109
- // HTTP failed — fall back to browser
110
- process.stderr.write(
111
- `[greedysearch] HTTP failed for ${url.slice(0, 60)}, trying browser...\n`,
112
- );
113
- return await fetchSourceContentBrowser(url, maxChars);
114
- }
115
-
116
- async function fetchSourceContentBrowser(url, maxChars = 8000) {
117
- const start = Date.now();
118
- const tab = await openNewTab();
119
-
120
- try {
121
- await cdp(["nav", tab, url], 30000);
122
- await new Promise((r) => setTimeout(r, 1500));
123
-
124
- const content = await cdp([
125
- "eval",
126
- tab,
127
- `
128
- (function(){
129
- var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
130
- var text = (el || document.body).innerText;
131
- return JSON.stringify({
132
- title: document.title,
133
- content: text.replace(/\\s+/g, ' ').trim(),
134
- url: location.href
135
- });
136
- })()
137
- `,
138
- ]);
139
-
140
- const parsed = JSON.parse(content);
141
- const finalContent = trimContentHeadTail(parsed.content, maxChars);
142
-
143
- return {
144
- url,
145
- finalUrl: parsed.url || url,
146
- status: 200,
147
- contentType: "text/plain",
148
- lastModified: "",
149
- title: parsed.title,
150
- snippet: trimText(finalContent, 320),
151
- content: finalContent,
152
- contentChars: finalContent.length,
153
- source: "browser",
154
- duration: Date.now() - start,
155
- };
156
- } catch (error) {
157
- return {
158
- url,
159
- title: "",
160
- content: null,
161
- snippet: "",
162
- contentChars: 0,
163
- error: error.message,
164
- source: "browser",
165
- duration: Date.now() - start,
166
- };
167
- } finally {
168
- await closeTab(tab);
169
- }
170
- }
171
-
172
- export async function fetchMultipleSources(
173
- sources,
174
- maxSources = 5,
175
- maxChars = 8000,
176
- concurrency = SOURCE_FETCH_CONCURRENCY,
177
- ) {
178
- const toFetch = sources.slice(0, maxSources);
179
- if (toFetch.length === 0) return [];
180
-
181
- const workerCount = Math.min(
182
- toFetch.length,
183
- Math.max(1, parseInt(String(concurrency), 10) || SOURCE_FETCH_CONCURRENCY),
184
- );
185
-
186
- process.stderr.write(
187
- `[greedysearch] Fetching content from ${toFetch.length} sources via HTTP (concurrency ${workerCount})...\n`,
188
- );
189
-
190
- const fetched = new Array(toFetch.length);
191
- let nextIndex = 0;
192
- let completed = 0;
193
-
194
- async function worker() {
195
- while (true) {
196
- const index = nextIndex++;
197
- if (index >= toFetch.length) return;
198
-
199
- const s = toFetch[index];
200
- const url = s.canonicalUrl || s.url;
201
- process.stderr.write(
202
- `[greedysearch] [${index + 1}/${toFetch.length}] Fetching: ${url.slice(0, 60)}...\n`,
203
- );
204
-
205
- const result = await fetchSourceContent(url, maxChars);
206
- fetched[index] = {
207
- id: s.id,
208
- ...result,
209
- };
210
-
211
- if (result.content && result.content.length > 100) {
212
- process.stderr.write(
213
- `[greedysearch] ✓ ${result.source}: ${result.content.length} chars\n`,
214
- );
215
- } else if (result.error) {
216
- process.stderr.write(`[greedysearch] ✗ ${result.error.slice(0, 80)}\n`);
217
- }
218
-
219
- completed += 1;
220
- process.stderr.write(`PROGRESS:fetch:${completed}/${toFetch.length}\n`);
221
- }
222
- }
223
-
224
- await Promise.all(Array.from({ length: workerCount }, () => worker()));
225
-
226
- // Log summary
227
- const successful = fetched.filter((f) => f.content && f.content.length > 100);
228
- const httpCount = fetched.filter((f) => f.source === "http").length;
229
- const browserCount = fetched.filter((f) => f.source === "browser").length;
230
-
231
- process.stderr.write(
232
- `[greedysearch] Fetched ${successful.length}/${fetched.length} sources ` +
233
- `(HTTP: ${httpCount}, Browser: ${browserCount})\n`,
234
- );
235
-
236
- return fetched;
237
- }
238
-
239
- export async function fetchTopSource(url) {
240
- const tab = await openNewTab();
241
- await cdp(["list"]); // refresh cache
242
- try {
243
- await cdp(["nav", tab, url], 30000);
244
- await new Promise((r) => setTimeout(r, 1500));
245
- const content = await cdp([
246
- "eval",
247
- tab,
248
- `
249
- (function(){
250
- var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
251
- var text = (el || document.body).innerText;
252
- return text.replace(/\\s+/g, ' ').trim();
253
- })()
254
- `,
255
- ]);
256
- return { url, content };
257
- } catch (e) {
258
- return { url, content: null, error: e.message };
259
- } finally {
260
- await closeTab(tab);
261
- }
262
- }
1
+ // src/search/fetch-source.mjs — HTTP and browser-based source content fetching
2
+ //
3
+ // Extracted from search.mjs. Uses fetchSourceHttp from src/fetcher.mjs
4
+ // with browser fallback via CDP, plus GitHub and Reddit content fetching.
5
+
6
+ import { spawn } from "node:child_process";
7
+ import { tmpdir } from "node:os";
8
+ import { join } from "node:path";
9
+ import { fetchSourceHttp, shouldUseBrowser } from "../fetcher.mjs";
10
+ import { fetchGitHubContent, parseGitHubUrl } from "../github.mjs";
11
+ import { fetchRedditContent, parseRedditUrl } from "../reddit.mjs";
12
+ import { trimContentHeadTail } from "../utils/content.mjs";
13
+ import { cdp, closeTab, closeTabs, openNewTab } from "./chrome.mjs";
14
+ import { SOURCE_FETCH_CONCURRENCY } from "./constants.mjs";
15
+ import { trimText } from "./sources.mjs";
16
+
17
/**
 * Fetch readable content for one source URL, trying cheaper strategies
 * before falling back to a real browser tab.
 *
 * Attempt order:
 *   1. GitHub API — when parseGitHubUrl reports a "root", a "tree", or a
 *                   "blob" whose path contains no "." character.
 *   2. Reddit API — when parseRedditUrl reports type "post".
 *   3. Plain HTTP — fetchSourceHttp with a 15 s timeout.
 *   4. Browser    — fetchSourceContentBrowser.
 * A failed GitHub/Reddit/HTTP attempt is logged to stderr and the next
 * strategy is tried.
 *
 * @param {string} url - Source URL to fetch.
 * @param {number} [maxChars=8000] - Character budget forwarded to
 *   trimContentHeadTail, the Reddit fetcher, and the browser fallback.
 * @returns {Promise<object>} Result record whose `source` field is
 *   "github-api", "reddit-api" or "http"; otherwise whatever
 *   fetchSourceContentBrowser resolves to.
 */
export async function fetchSourceContent(url, maxChars = 8000) {
  const start = Date.now();

  // GitHub: parse once and reuse the parsed value for the eligibility check.
  const github = parseGitHubUrl(url);
  if (github && isGitHubApiTarget(github)) {
    const ghResult = await fetchGitHubContent(url);
    if (ghResult.ok) {
      const content = trimContentHeadTail(ghResult.content, maxChars);
      return {
        url,
        finalUrl: url,
        status: 200,
        contentType: "text/markdown",
        lastModified: "",
        title: ghResult.title,
        snippet: content.slice(0, 320),
        content,
        contentChars: content.length,
        source: "github-api",
        // Only present when the GitHub fetcher supplied a tree listing.
        ...(ghResult.tree && { tree: ghResult.tree }),
        duration: Date.now() - start,
      };
    }
    process.stderr.write(
      `[greedysearch] GitHub API fetch failed, trying HTTP: ${ghResult.error}\n`,
    );
  }

  // Reddit: only URLs classified as a "post" go through the JSON API.
  const redditInfo = parseRedditUrl(url);
  if (redditInfo?.type === "post") {
    process.stderr.write(
      `[greedysearch] Using Reddit JSON API for: ${url.slice(0, 60)}...\n`,
    );
    const redditResult = await fetchRedditContent(url, maxChars);
    if (redditResult.ok) {
      return toMarkdownResult(url, redditResult, "reddit-api", maxChars, start);
    }
    process.stderr.write(
      `[greedysearch] Reddit API fetch failed, falling back to HTTP: ${redditResult.error}\n`,
    );
  }

  // Plain HTTP is tried before paying for a browser tab.
  const httpResult = await fetchSourceHttp(url, { timeoutMs: 15000 });
  if (httpResult.ok) {
    return toMarkdownResult(url, httpResult, "http", maxChars, start);
  }

  // HTTP failed — fall back to browser
  process.stderr.write(
    `[greedysearch] HTTP failed for ${url.slice(0, 60)}, trying browser...\n`,
  );
  return await fetchSourceContentBrowser(url, maxChars);
}

/** True when a parsed GitHub URL should be fetched through the GitHub API. */
function isGitHubApiTarget(parsed) {
  if (parsed.type === "root" || parsed.type === "tree") return true;
  return parsed.type === "blob" && !parsed.path?.includes(".");
}

/**
 * Build the result record shared by the Reddit and HTTP strategies, which
 * return the same field set (`markdown`, `excerpt`, metadata fields).
 */
function toMarkdownResult(url, fetched, source, maxChars, start) {
  const content = trimContentHeadTail(fetched.markdown, maxChars);
  return {
    url,
    finalUrl: fetched.finalUrl,
    status: fetched.status,
    contentType: "text/markdown",
    lastModified: fetched.lastModified || "",
    publishedTime: fetched.publishedTime || "",
    byline: fetched.byline || "",
    siteName: fetched.siteName || "",
    lang: fetched.lang || "",
    title: fetched.title,
    snippet: fetched.excerpt,
    content,
    contentChars: content.length,
    source,
    duration: Date.now() - start,
  };
}
115
+
116
/**
 * Load a URL in a fresh browser tab over CDP and extract its main text.
 *
 * The in-page script picks the first element matching a list of common
 * "main content" selectors (falling back to document.body), collapses
 * whitespace in its innerText, and returns it with the page title and the
 * final location as a JSON string.
 *
 * Every failure — including failing to open the tab — resolves to an error
 * record (`content: null`, `error` set) instead of rejecting, so a caller
 * fetching many sources is not aborted by a single bad page.
 *
 * @param {string} url - Page to load.
 * @param {number} [maxChars=8000] - Budget passed to trimContentHeadTail.
 * @returns {Promise<object>} Result record with `source: "browser"`.
 */
async function fetchSourceContentBrowser(url, maxChars = 8000) {
  const start = Date.now();
  let tab;

  try {
    // Opened inside the try so an open failure produces an error record too.
    tab = await openNewTab();
    await cdp(["nav", tab, url], 30000);
    // Fixed 1.5 s pause after navigation before reading the DOM
    // (presumably to let client-side rendering settle — confirm).
    await new Promise((resolve) => setTimeout(resolve, 1500));

    const raw = await cdp([
      "eval",
      tab,
      `
      (function(){
        var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
        var text = (el || document.body).innerText;
        return JSON.stringify({
          title: document.title,
          content: text.replace(/\\s+/g, ' ').trim(),
          url: location.href
        });
      })()
      `,
    ]);

    const page = JSON.parse(raw);
    const finalContent = trimContentHeadTail(page.content, maxChars);

    return {
      url,
      finalUrl: page.url || url,
      status: 200,
      contentType: "text/plain",
      lastModified: "",
      title: page.title,
      snippet: trimText(finalContent, 320),
      content: finalContent,
      contentChars: finalContent.length,
      source: "browser",
      duration: Date.now() - start,
    };
  } catch (error) {
    return {
      url,
      title: "",
      content: null,
      snippet: "",
      contentChars: 0,
      error: error instanceof Error ? error.message : String(error),
      source: "browser",
      duration: Date.now() - start,
    };
  } finally {
    if (tab !== undefined) {
      try {
        await closeTab(tab);
      } catch (closeError) {
        // Best-effort cleanup: a close failure must not replace the result
        // (or error record) that was already computed above.
        process.stderr.write(
          `[greedysearch] Failed to close tab: ${closeError?.message ?? closeError}\n`,
        );
      }
    }
  }
}
171
+
172
/**
 * Fetch content for the first `maxSources` entries of `sources` using a
 * small pool of concurrent workers.
 *
 * Each worker repeatedly claims the next unprocessed index, so results are
 * stored at the same position as their source and the returned array keeps
 * input order regardless of completion order.
 *
 * A source whose fetch throws is recorded as an error entry
 * (`content: null`, `error` set, `source: "error"`) rather than rejecting
 * the whole batch and discarding the results already fetched.
 *
 * Progress is reported on stderr, including machine-readable
 * `PROGRESS:fetch:<done>/<total>` lines.
 *
 * @param {Array<{id?: *, url?: string, canonicalUrl?: string}>} sources -
 *   Candidate sources; `canonicalUrl` is preferred over `url`.
 * @param {number} [maxSources=5] - Maximum number of sources to fetch.
 * @param {number} [maxChars=8000] - Per-source budget passed to fetchSourceContent.
 * @param {number|string} [concurrency=SOURCE_FETCH_CONCURRENCY] - Requested
 *   worker count; clamped to at least 1 and at most the number of sources.
 * @returns {Promise<object[]>} One `{ id, ...result }` record per fetched source.
 */
export async function fetchMultipleSources(
  sources,
  maxSources = 5,
  maxChars = 8000,
  concurrency = SOURCE_FETCH_CONCURRENCY,
) {
  const toFetch = sources == null ? [] : sources.slice(0, maxSources);
  if (toFetch.length === 0) return [];

  // Unparseable or zero concurrency falls back to the module default.
  const requested =
    Number.parseInt(String(concurrency), 10) || SOURCE_FETCH_CONCURRENCY;
  const workerCount = Math.min(toFetch.length, Math.max(1, requested));

  process.stderr.write(
    `[greedysearch] Fetching content from ${toFetch.length} sources via HTTP (concurrency ${workerCount})...\n`,
  );

  const fetched = new Array(toFetch.length);
  let nextIndex = 0;
  let completed = 0;

  async function worker() {
    while (true) {
      // Claiming an index is synchronous, so no two workers share one.
      const index = nextIndex++;
      if (index >= toFetch.length) return;

      const s = toFetch[index];
      const began = Date.now();
      let url;
      let result;

      try {
        url = s.canonicalUrl || s.url;
        process.stderr.write(
          `[greedysearch] [${index + 1}/${toFetch.length}] Fetching: ${url.slice(0, 60)}...\n`,
        );
        result = await fetchSourceContent(url, maxChars);
      } catch (error) {
        // Isolate the failure to this source; the rest of the batch continues.
        result = {
          url,
          title: "",
          content: null,
          snippet: "",
          contentChars: 0,
          error: error instanceof Error ? error.message : String(error),
          source: "error",
          duration: Date.now() - began,
        };
      }

      fetched[index] = {
        id: s?.id,
        ...result,
      };

      if (result.content && result.content.length > 100) {
        process.stderr.write(
          `[greedysearch] ✓ ${result.source}: ${result.content.length} chars\n`,
        );
      } else if (result.error) {
        process.stderr.write(
          `[greedysearch] ✗ ${String(result.error).slice(0, 80)}\n`,
        );
      }

      completed += 1;
      process.stderr.write(`PROGRESS:fetch:${completed}/${toFetch.length}\n`);
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  // Log summary
  const successful = fetched.filter((f) => f.content && f.content.length > 100);
  const httpCount = fetched.filter((f) => f.source === "http").length;
  const browserCount = fetched.filter((f) => f.source === "browser").length;

  process.stderr.write(
    `[greedysearch] Fetched ${successful.length}/${fetched.length} sources ` +
      `(HTTP: ${httpCount}, Browser: ${browserCount})\n`,
  );

  return fetched;
}
238
+
239
/**
 * Load a URL in a fresh browser tab over CDP and return its main text.
 *
 * The in-page script picks the first element matching a list of common
 * "main content" selectors (falling back to document.body) and returns its
 * innerText with whitespace collapsed.
 *
 * All CDP work — including the initial `list` call — runs inside the
 * try/finally, so the tab is always closed and any failure resolves to
 * `{ url, content: null, error }` instead of rejecting.
 *
 * @param {string} url - Page to load.
 * @returns {Promise<{url: string, content: (string|null), error?: string}>}
 */
export async function fetchTopSource(url) {
  let tab;
  try {
    tab = await openNewTab();
    await cdp(["list"]); // refresh cache
    await cdp(["nav", tab, url], 30000);
    // Fixed 1.5 s pause after navigation before reading the DOM.
    await new Promise((resolve) => setTimeout(resolve, 1500));
    const content = await cdp([
      "eval",
      tab,
      `
      (function(){
        var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
        var text = (el || document.body).innerText;
        return text.replace(/\\s+/g, ' ').trim();
      })()
      `,
    ]);
    return { url, content };
  } catch (e) {
    return {
      url,
      content: null,
      error: e instanceof Error ? e.message : String(e),
    };
  } finally {
    // Only close a tab that was actually opened.
    if (tab !== undefined) {
      await closeTab(tab);
    }
  }
}
@@ -1,59 +1,59 @@
1
- // src/search/output.mjs — Output serialization for search results
2
- //
3
- // Extracted from search.mjs.
4
-
5
- import { existsSync, mkdirSync, writeFileSync } from "node:fs";
6
- import { join } from "node:path";
7
- import { tmpdir } from "node:os";
8
-
9
- const __dir = import.meta.dirname || new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
10
-
11
- export function slugify(query) {
12
- return query
13
- .toLowerCase()
14
- .replace(/[^a-z0-9]+/g, "-")
15
- .replace(/^-|-$/g, "")
16
- .slice(0, 60);
17
- }
18
-
19
- export function resultsDir() {
20
- const dir = join(__dir, "..", "..", "results");
21
- mkdirSync(dir, { recursive: true });
22
- return dir;
23
- }
24
-
25
- export function writeOutput(
26
- data,
27
- outFile,
28
- { inline = false, synthesize = false, query = "" } = {},
29
- ) {
30
- const json = `${JSON.stringify(data, null, 2)}\n`;
31
-
32
- if (outFile) {
33
- writeFileSync(outFile, json, "utf8");
34
- process.stderr.write(`Results written to ${outFile}\n`);
35
- return;
36
- }
37
-
38
- if (inline) {
39
- process.stdout.write(json);
40
- return;
41
- }
42
-
43
- const ts = new Date()
44
- .toISOString()
45
- .replace("T", "_")
46
- .replace(/[:.]/g, "-")
47
- .slice(0, 19);
48
- const slug = slugify(query);
49
- const base = join(resultsDir(), `${ts}_${slug}`);
50
-
51
- writeFileSync(`${base}.json`, json, "utf8");
52
-
53
- if (synthesize && data._synthesis?.answer) {
54
- writeFileSync(`${base}-synthesis.md`, data._synthesis.answer, "utf8");
55
- process.stdout.write(`${base}-synthesis.md\n`);
56
- } else {
57
- process.stdout.write(`${base}.json\n`);
58
- }
1
+ // src/search/output.mjs — Output serialization for search results
2
+ //
3
+ // Extracted from search.mjs.
4
+
5
+ import { existsSync, mkdirSync, writeFileSync } from "node:fs";
6
+ import { join } from "node:path";
7
+ import { tmpdir } from "node:os";
8
+
9
+ const __dir = import.meta.dirname || new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
10
+
11
/**
 * Turn a free-form query into a lowercase, dash-separated slug of at most
 * 60 characters, suitable for use in a file name.
 *
 * Every run of characters outside [a-z0-9] becomes a single "-". The
 * trailing dash is stripped AFTER truncation, so a slug cut at the
 * 60-character limit never ends in "-".
 *
 * @param {string} query - Text to slugify; null/undefined yield "".
 * @returns {string} The slug (may be empty).
 */
export function slugify(query) {
  return String(query ?? "")
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-/, "")
    .slice(0, 60)
    .replace(/-$/, "");
}
18
+
19
/**
 * Resolve the `results` directory located two levels above this module's
 * directory, creating it (and any missing parents) on demand.
 *
 * @returns {string} Path of the results directory.
 */
export function resultsDir() {
  const upTwoLevels = ["..", ".."];
  const target = join(__dir, ...upTwoLevels, "results");
  // `recursive: true` makes this a no-op when the directory already exists.
  mkdirSync(target, { recursive: true });
  return target;
}
24
+
25
/**
 * Serialize search results as pretty-printed JSON and deliver them to one
 * destination, chosen in this order:
 *
 *   1. `outFile` given — write there and note the path on stderr.
 *   2. `inline`        — print the JSON itself to stdout.
 *   3. otherwise       — write `<timestamp>_<slug>.json` into resultsDir()
 *                        and print the path of the written file to stdout.
 *
 * In case 3, when `synthesize` is set and `data._synthesis.answer` is
 * non-empty, the answer is also written to `<base>-synthesis.md`, and that
 * path is printed instead of the JSON path.
 *
 * @param {*} data - Result payload to serialize.
 * @param {string} [outFile] - Explicit destination file.
 * @param {object} [options]
 * @param {boolean} [options.inline=false] - Print JSON to stdout.
 * @param {boolean} [options.synthesize=false] - Also emit the synthesis file.
 * @param {string} [options.query=""] - Query used to build the file slug.
 * @returns {void}
 */
export function writeOutput(
  data,
  outFile,
  { inline = false, synthesize = false, query = "" } = {},
) {
  const serialized = JSON.stringify(data, null, 2) + "\n";

  if (outFile) {
    writeFileSync(outFile, serialized, "utf8");
    process.stderr.write(`Results written to ${outFile}\n`);
    return;
  }

  if (inline) {
    process.stdout.write(serialized);
    return;
  }

  // "2024-01-31T12:34:56.789Z" -> "2024-01-31_12-34-56"
  const isoNow = new Date().toISOString();
  const stamp = isoNow.replace("T", "_").replace(/[:.]/g, "-").slice(0, 19);
  const querySlug = slugify(query);
  const basePath = join(resultsDir(), `${stamp}_${querySlug}`);

  writeFileSync(`${basePath}.json`, serialized, "utf8");

  // `data._synthesis` is only inspected when synthesis output was requested.
  const answer = synthesize ? data._synthesis?.answer : undefined;
  if (!answer) {
    process.stdout.write(`${basePath}.json\n`);
    return;
  }

  writeFileSync(`${basePath}-synthesis.md`, answer, "utf8");
  process.stdout.write(`${basePath}-synthesis.md\n`);
}