@apmantza/greedysearch-pi 1.7.7 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/bin/coding-task.mjs +27 -1
- package/bin/search.mjs +260 -1539
- package/index.ts +134 -421
- package/package.json +1 -1
- package/src/github.mjs +6 -1
- package/src/search/chrome.mjs +223 -0
- package/src/search/constants.mjs +38 -0
- package/src/search/defaults.mjs +15 -0
- package/src/search/engines.mjs +58 -0
- package/src/search/fetch-source.mjs +230 -0
- package/src/search/output.mjs +59 -0
- package/src/search/sources.mjs +446 -0
- package/src/search/synthesis-runner.mjs +64 -0
- package/src/search/synthesis.mjs +212 -0
- package/src/tools/deep-research-handler.ts +37 -0
- package/src/tools/greedy-search-handler.ts +58 -0
- package/src/tools/shared.ts +131 -0
- package/src/types.ts +104 -0
@@ -0,0 +1,223 @@
// src/search/chrome.mjs — Chrome launch, probe, port file management, and CDP wrapper
//
// Extracted from search.mjs to reduce file complexity.
// Also used by coding-task.mjs (via import).
//
// cdp() is re-exported from extractors/common.mjs to avoid duplication.

import { spawn } from "node:child_process";
import {
  existsSync,
  readFileSync,
  renameSync,
  unlinkSync,
  writeFileSync,
} from "node:fs";
import http from "node:http";
import { join } from "node:path";

import { GREEDY_PORT, ACTIVE_PORT_FILE, PAGES_CACHE } from "./constants.mjs";
import { cdp as _cdp } from "../../extractors/common.mjs";

const __dir = import.meta.dirname || new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");

/** Re-export cdp() from the canonical location in extractors/common.mjs */
export const cdp = _cdp;

export async function getAnyTab() {
  const list = await cdp(["list"]);
  const first = list.split("\n")[0];
  if (!first) throw new Error("No Chrome tabs found");
  return first.slice(0, 8);
}

export async function openNewTab() {
  const anchor = await getAnyTab();
  const raw = await cdp([
    "evalraw",
    anchor,
    "Target.createTarget",
    '{"url":"about:blank"}',
  ]);
  const { targetId } = JSON.parse(raw);
  return targetId;
}

export async function activateTab(targetId) {
  try {
    const anchor = await getAnyTab();
    await cdp([
      "evalraw",
      anchor,
      "Target.activateTarget",
      JSON.stringify({ targetId }),
    ]);
  } catch {
    // best-effort
  }
}

export async function closeTab(targetId) {
  try {
    const anchor = await getAnyTab();
    await cdp([
      "evalraw",
      anchor,
      "Target.closeTarget",
      JSON.stringify({ targetId }),
    ]);
  } catch {
    /* best-effort */
  }
}

export async function closeTabs(targetIds = []) {
  for (const tid of targetIds) {
    if (!tid) continue;
    await closeTab(tid);
  }
  if (targetIds.length > 0) {
    await new Promise((r) => setTimeout(r, 300));
    await cdp(["list"]).catch(() => null);
  }
}

export function getFullTabFromCache(engine, engineDomains) {
  try {
    if (!existsSync(PAGES_CACHE)) return null;
    const pages = JSON.parse(readFileSync(PAGES_CACHE, "utf8"));
    const found = pages.find((p) => p.url.includes(engineDomains[engine]));
    return found ? found.targetId : null;
  } catch {
    return null;
  }
}

export function probeGreedyChrome(timeoutMs = 3000) {
  return new Promise((resolve) => {
    const req = http.get(
      `http://localhost:${GREEDY_PORT}/json/version`,
      (res) => {
        res.resume();
        resolve(res.statusCode === 200);
      },
    );
    req.on("error", () => resolve(false));
    req.setTimeout(timeoutMs, () => {
      req.destroy();
      resolve(false);
    });
  });
}

export async function refreshPortFile() {
  const LOCK_FILE = `${ACTIVE_PORT_FILE}.lock`;
  const TEMP_FILE = `${ACTIVE_PORT_FILE}.tmp`;
  const LOCK_STALE_MS = 5000;
  const LOCK_WAIT_MS = 1000;

  // File-based lock with exclusive create + stale lock recovery
  const lockAcquired = await new Promise((resolve) => {
    const start = Date.now();
    const tryLock = () => {
      try {
        const payload = JSON.stringify({ pid: process.pid, ts: Date.now() });
        writeFileSync(LOCK_FILE, payload, { encoding: "utf8", flag: "wx" });
        resolve(true);
      } catch (e) {
        if (e?.code !== "EEXIST") {
          if (Date.now() - start < LOCK_WAIT_MS) {
            setTimeout(tryLock, 50);
          } else {
            resolve(false);
          }
          return;
        }

        try {
          const lockRaw = readFileSync(LOCK_FILE, "utf8").trim();
          const parsed = lockRaw.startsWith("{")
            ? JSON.parse(lockRaw)
            : { ts: Number(lockRaw) };
          const lockTime = Number(parsed?.ts) || 0;

          if (lockTime > 0 && Date.now() - lockTime > LOCK_STALE_MS) {
            try {
              unlinkSync(LOCK_FILE);
            } catch {}
          }

          if (Date.now() - start < LOCK_WAIT_MS) {
            setTimeout(tryLock, 50);
          } else {
            resolve(false);
          }
        } catch {
          if (Date.now() - start < LOCK_WAIT_MS) {
            setTimeout(tryLock, 50);
          } else {
            resolve(false);
          }
        }
      }
    };
    tryLock();
  });

  try {
    const body = await new Promise((res, rej) => {
      const req = http.get(
        `http://localhost:${GREEDY_PORT}/json/version`,
        (r) => {
          let b = "";
          r.on("data", (d) => (b += d));
          r.on("end", () => res(b));
        },
      );
      req.on("error", rej);
      req.setTimeout(3000, () => {
        req.destroy();
        rej(new Error("timeout"));
      });
    });
    const { webSocketDebuggerUrl } = JSON.parse(body);
    const wsPath = new URL(webSocketDebuggerUrl).pathname;

    // Atomic write: write to temp file, then rename
    if (lockAcquired) {
      writeFileSync(TEMP_FILE, `${GREEDY_PORT}\n${wsPath}`, "utf8");
      try {
        unlinkSync(ACTIVE_PORT_FILE);
      } catch {}
      renameSync(TEMP_FILE, ACTIVE_PORT_FILE);
    }
  } catch {
    /* best-effort — launch.mjs already wrote the file on first start */
  } finally {
    if (lockAcquired) {
      try {
        unlinkSync(LOCK_FILE);
      } catch {}
    }
  }
}

export async function ensureChrome() {
  const ready = await probeGreedyChrome();
  if (!ready) {
    process.stderr.write(
      `GreedySearch Chrome not running on port ${GREEDY_PORT} — auto-launching...\n`,
    );
    await new Promise((resolve, reject) => {
      const proc = spawn("node", [join(__dir, "..", "..", "bin", "launch.mjs")], {
        stdio: ["ignore", process.stderr, process.stderr],
      });
      proc.on("close", (code) =>
        code === 0 ? resolve() : reject(new Error("launch.mjs failed")),
      );
    });
  } else {
    // Chrome already running — refresh the port file
    await refreshPortFile();
  }
}
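
For orientation, a minimal sketch of how these helpers compose from a caller's side. This is a hypothetical caller, not part of the diff; the import path and the "nav"/"eval" subcommands are assumptions based on how fetch-source.mjs (below) uses cdp().

// Hypothetical caller (not in the diff); assumes src/search/chrome.mjs as added above.
import { ensureChrome, openNewTab, closeTab, cdp } from "./src/search/chrome.mjs";

await ensureChrome();           // probe port 9222, auto-launch Chrome if absent
const tab = await openNewTab(); // Target.createTarget via an anchor tab
try {
  await cdp(["nav", tab, "https://example.com"], 30000);
  const title = await cdp(["eval", tab, "document.title"]);
  console.log(title);
} finally {
  await closeTab(tab);          // best-effort Target.closeTarget
}
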
@@ -0,0 +1,38 @@
// src/search/constants.mjs — Shared constants for GreedySearch search pipeline

import { tmpdir } from "node:os";

export const GREEDY_PORT = 9222;
export const GREEDY_PROFILE_DIR = `${tmpdir().replace(/\\/g, "/")}/greedysearch-chrome-profile`;
export const ACTIVE_PORT_FILE = `${GREEDY_PROFILE_DIR}/DevToolsActivePort`;
export const PAGES_CACHE = `${tmpdir().replace(/\\/g, "/")}/cdp-pages.json`;

export const ALL_ENGINES = ["perplexity", "bing", "google"];

export const ENGINE_DOMAINS = {
  perplexity: "perplexity.ai",
  bing: "copilot.microsoft.com",
  google: "google.com",
  gemini: "gemini.google.com",
};

export const ENGINES = {
  perplexity: "perplexity.mjs",
  pplx: "perplexity.mjs",
  p: "perplexity.mjs",
  bing: "bing-copilot.mjs",
  copilot: "bing-copilot.mjs",
  b: "bing-copilot.mjs",
  google: "google-ai.mjs",
  g: "google-ai.mjs",
  gemini: "gemini.mjs",
  gem: "gemini.mjs",
};

export const SOURCE_FETCH_CONCURRENCY = Math.max(
  1,
  parseInt(process.env.GREEDY_FETCH_CONCURRENCY || "2", 10) || 2,
);

// Tell cdp.mjs to prefer the GreedySearch Chrome profile's DevToolsActivePort
process.env.CDP_PROFILE_DIR = GREEDY_PROFILE_DIR;
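
A brief illustration of the concurrency clamp and the engine alias map, as a hypothetical consumer of this module (not in the diff):

// Import path assumed for illustration.
import { SOURCE_FETCH_CONCURRENCY, ENGINES } from "./src/search/constants.mjs";

// GREEDY_FETCH_CONCURRENCY="abc" -> parseInt NaN -> || 2 -> 2
// GREEDY_FETCH_CONCURRENCY="-3"  -> Math.max(1, -3) -> 1
// unset                          -> parseInt("2")   -> 2
console.log(SOURCE_FETCH_CONCURRENCY);
console.log(ENGINES.pplx); // "perplexity.mjs": aliases resolve to extractor filenames
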
@@ -0,0 +1,15 @@
// src/search/defaults.mjs — Shared default values and timeouts
//
// Centralizes magic numbers used across the codebase.
// Import from here instead of hardcoding values.

export const DEFAULTS = {
  CDP_TIMEOUT: 30000, // Default CDP command timeout (ms)
  CDP_TIMEOUT_SHORT: 15000, // Short CDP timeout for search operations (ms)
  NAV_TIMEOUT: 35000, // Navigation timeout (ms)
  STREAM_TIMEOUT: 30000, // Stream completion timeout (ms)
  COPY_TIMEOUT: 60000, // Copy button appearance timeout (ms)
  CODING_TASK_TIMEOUT: 180000, // Coding task max duration (ms)
  MAX_SOURCE_FETCH: 10, // Max concurrent source fetches
  DESCRIPTION_MAX_LENGTH: 300, // Max answer length in truncated mode
};
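
A minimal sketch of using these defaults in place of a hardcoded timeout, per the module's own guidance (hypothetical consumer, not in the diff; import path assumed):

import { DEFAULTS } from "./src/search/defaults.mjs";

const controller = new AbortController();
setTimeout(() => controller.abort(), DEFAULTS.NAV_TIMEOUT); // 35s, from the table above
const res = await fetch("https://example.com", { signal: controller.signal });
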
@@ -0,0 +1,58 @@
// src/search/engines.mjs — Extractor runner
//
// Engine map lives in constants.mjs; this module re-exports it for
// backward compatibility and provides the runExtractor() function.

import { spawn } from "node:child_process";
import { join } from "node:path";
import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";

export { ENGINES };

const __dir = import.meta.dirname || new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");

export function runExtractor(
  script,
  query,
  tabPrefix = null,
  short = false,
  timeoutMs = null,
) {
  // Gemini is slower - use longer timeout
  if (timeoutMs === null) {
    timeoutMs = script.includes("gemini") ? 180000 : 90000;
  }
  const extraArgs = [
    ...(tabPrefix ? ["--tab", tabPrefix] : []),
    ...(short ? ["--short"] : []),
  ];
  return new Promise((resolve, reject) => {
    const proc = spawn(
      "node",
      [join(__dir, "..", "..", "extractors", script), query, ...extraArgs],
      {
        stdio: ["ignore", "pipe", "pipe"],
        env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
      },
    );
    let out = "";
    let err = "";
    proc.stdout.on("data", (d) => (out += d));
    proc.stderr.on("data", (d) => (err += d));
    const t = setTimeout(() => {
      proc.kill();
      reject(new Error(`${script} timed out after ${timeoutMs / 1000}s`));
    }, timeoutMs);
    proc.on("close", (code) => {
      clearTimeout(t);
      if (code !== 0) reject(new Error(err.trim() || `extractor exit ${code}`));
      else {
        try {
          resolve(JSON.parse(out.trim()));
        } catch {
          reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
        }
      }
    });
  });
}
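
A sketch of the intended call pattern, resolving an engine alias and handling the three failure modes (timeout, non-zero exit, bad JSON). Hypothetical caller, not in the diff; the import path is an assumption:

import { ENGINES, runExtractor } from "./src/search/engines.mjs";

const script = ENGINES["pplx"]; // "perplexity.mjs"
try {
  // Default timeout applies: 90s, or 180s for gemini scripts.
  const result = await runExtractor(script, "what is CDP?", null, true);
  console.log(result); // parsed JSON from the extractor's stdout
} catch (e) {
  console.error(e.message); // timed out / extractor exit N / bad JSON from ...
}
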
@@ -0,0 +1,230 @@
// src/search/fetch-source.mjs — HTTP and browser-based source content fetching
//
// Extracted from search.mjs. Uses fetchSourceHttp from src/fetcher.mjs
// with browser fallback via CDP, plus GitHub content fetching.

import { spawn } from "node:child_process";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { fetchSourceHttp, shouldUseBrowser } from "../fetcher.mjs";
import { fetchGitHubContent, parseGitHubUrl } from "../github.mjs";
import { trimContentHeadTail } from "../utils/content.mjs";
import { cdp } from "./chrome.mjs";
import { openNewTab, closeTab, closeTabs } from "./chrome.mjs";
import { SOURCE_FETCH_CONCURRENCY } from "./constants.mjs";
import { trimText } from "./sources.mjs";

export async function fetchSourceContent(url, maxChars = 8000) {
  const start = Date.now();

  // Check if it's a GitHub URL
  if (parseGitHubUrl(url)) {
    const parsed = parseGitHubUrl(url);
    if (
      parsed &&
      (parsed.type === "root" ||
        parsed.type === "tree" ||
        (parsed.type === "blob" && !parsed.path?.includes(".")))
    ) {
      const ghResult = await fetchGitHubContent(url);
      if (ghResult.ok) {
        const content = trimContentHeadTail(ghResult.content, maxChars);
        return {
          url,
          finalUrl: url,
          status: 200,
          contentType: "text/markdown",
          lastModified: "",
          title: ghResult.title,
          snippet: content.slice(0, 320),
          content,
          contentChars: content.length,
          source: "github-api",
          ...(ghResult.tree && { tree: ghResult.tree }),
          duration: Date.now() - start,
        };
      }
      process.stderr.write(
        `[greedysearch] GitHub API fetch failed, trying HTTP: ${ghResult.error}\n`,
      );
    }
  }

  // Try HTTP first
  const httpResult = await fetchSourceHttp(url, { timeoutMs: 15000 });

  if (httpResult.ok) {
    const content = trimContentHeadTail(httpResult.markdown, maxChars);
    return {
      url,
      finalUrl: httpResult.finalUrl,
      status: httpResult.status,
      contentType: "text/markdown",
      lastModified: httpResult.lastModified || "",
      publishedTime: httpResult.publishedTime || "",
      byline: httpResult.byline || "",
      siteName: httpResult.siteName || "",
      lang: httpResult.lang || "",
      title: httpResult.title,
      snippet: httpResult.excerpt,
      content,
      contentChars: content.length,
      source: "http",
      duration: Date.now() - start,
    };
  }

  // HTTP failed — fall back to browser
  process.stderr.write(
    `[greedysearch] HTTP failed for ${url.slice(0, 60)}, trying browser...\n`,
  );
  return await fetchSourceContentBrowser(url, maxChars);
}

async function fetchSourceContentBrowser(url, maxChars = 8000) {
  const start = Date.now();
  const tab = await openNewTab();

  try {
    await cdp(["nav", tab, url], 30000);
    await new Promise((r) => setTimeout(r, 1500));

    const content = await cdp([
      "eval",
      tab,
      `
      (function(){
        var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
        var text = (el || document.body).innerText;
        return JSON.stringify({
          title: document.title,
          content: text.replace(/\\s+/g, ' ').trim(),
          url: location.href
        });
      })()
      `,
    ]);

    const parsed = JSON.parse(content);
    const finalContent = trimContentHeadTail(parsed.content, maxChars);

    return {
      url,
      finalUrl: parsed.url || url,
      status: 200,
      contentType: "text/plain",
      lastModified: "",
      title: parsed.title,
      snippet: trimText(finalContent, 320),
      content: finalContent,
      contentChars: finalContent.length,
      source: "browser",
      duration: Date.now() - start,
    };
  } catch (error) {
    return {
      url,
      title: "",
      content: null,
      snippet: "",
      contentChars: 0,
      error: error.message,
      source: "browser",
      duration: Date.now() - start,
    };
  } finally {
    await closeTab(tab);
  }
}

export async function fetchMultipleSources(
  sources,
  maxSources = 5,
  maxChars = 8000,
  concurrency = SOURCE_FETCH_CONCURRENCY,
) {
  const toFetch = sources.slice(0, maxSources);
  if (toFetch.length === 0) return [];

  const workerCount = Math.min(
    toFetch.length,
    Math.max(1, parseInt(String(concurrency), 10) || SOURCE_FETCH_CONCURRENCY),
  );

  process.stderr.write(
    `[greedysearch] Fetching content from ${toFetch.length} sources via HTTP (concurrency ${workerCount})...\n`,
  );

  const fetched = new Array(toFetch.length);
  let nextIndex = 0;
  let completed = 0;

  async function worker() {
    while (true) {
      const index = nextIndex++;
      if (index >= toFetch.length) return;

      const s = toFetch[index];
      const url = s.canonicalUrl || s.url;
      process.stderr.write(
        `[greedysearch] [${index + 1}/${toFetch.length}] Fetching: ${url.slice(0, 60)}...\n`,
      );

      const result = await fetchSourceContent(url, maxChars);
      fetched[index] = {
        id: s.id,
        ...result,
      };

      if (result.content && result.content.length > 100) {
        process.stderr.write(
          `[greedysearch] ✓ ${result.source}: ${result.content.length} chars\n`,
        );
      } else if (result.error) {
        process.stderr.write(`[greedysearch] ✗ ${result.error.slice(0, 80)}\n`);
      }

      completed += 1;
      process.stderr.write(`PROGRESS:fetch:${completed}/${toFetch.length}\n`);
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  // Log summary
  const successful = fetched.filter((f) => f.content && f.content.length > 100);
  const httpCount = fetched.filter((f) => f.source === "http").length;
  const browserCount = fetched.filter((f) => f.source === "browser").length;

  process.stderr.write(
    `[greedysearch] Fetched ${successful.length}/${fetched.length} sources ` +
      `(HTTP: ${httpCount}, Browser: ${browserCount})\n`,
  );

  return fetched;
}

export async function fetchTopSource(url) {
  const tab = await openNewTab();
  await cdp(["list"]); // refresh cache
  try {
    await cdp(["nav", tab, url], 30000);
    await new Promise((r) => setTimeout(r, 1500));
    const content = await cdp([
      "eval",
      tab,
      `
      (function(){
        var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
        var text = (el || document.body).innerText;
        return text.replace(/\\s+/g, ' ').trim();
      })()
      `,
    ]);
    return { url, content };
  } catch (e) {
    return { url, content: null, error: e.message };
  } finally {
    await closeTab(tab);
  }
}
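
A sketch of driving the worker-pool fetcher from outside. Hypothetical caller, not in the diff; the source-object shape ({ id, url, canonicalUrl? }) is inferred from how fetchMultipleSources reads it above:

import { fetchMultipleSources } from "./src/search/fetch-source.mjs";

const sources = [
  { id: 1, url: "https://example.com/a" },
  { id: 2, url: "https://example.com/b", canonicalUrl: "https://example.com/b2" },
];
// maxSources=5, maxChars=8000, concurrency=2 (the module default)
const fetched = await fetchMultipleSources(sources, 5, 8000, 2);
for (const f of fetched) {
  console.log(f.id, f.source, f.contentChars); // source: "github-api" | "http" | "browser"
}
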
@@ -0,0 +1,59 @@
// src/search/output.mjs — Output serialization for search results
//
// Extracted from search.mjs.

import { existsSync, mkdirSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";

const __dir = import.meta.dirname || new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");

export function slugify(query) {
  return query
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-|-$/g, "")
    .slice(0, 60);
}

export function resultsDir() {
  const dir = join(__dir, "..", "..", "results");
  mkdirSync(dir, { recursive: true });
  return dir;
}

export function writeOutput(
  data,
  outFile,
  { inline = false, synthesize = false, query = "" } = {},
) {
  const json = `${JSON.stringify(data, null, 2)}\n`;

  if (outFile) {
    writeFileSync(outFile, json, "utf8");
    process.stderr.write(`Results written to ${outFile}\n`);
    return;
  }

  if (inline) {
    process.stdout.write(json);
    return;
  }

  const ts = new Date()
    .toISOString()
    .replace("T", "_")
    .replace(/[:.]/g, "-")
    .slice(0, 19);
  const slug = slugify(query);
  const base = join(resultsDir(), `${ts}_${slug}`);

  writeFileSync(`${base}.json`, json, "utf8");

  if (synthesize && data._synthesis?.answer) {
    writeFileSync(`${base}-synthesis.md`, data._synthesis.answer, "utf8");
    process.stdout.write(`${base}-synthesis.md\n`);
  } else {
    process.stdout.write(`${base}.json\n`);
  }
}
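
Finally, a sketch of the output paths this module produces. Hypothetical caller, not in the diff; with no outFile and inline=false, results land under results/<timestamp>_<slug>.json as the code above shows:

import { writeOutput } from "./src/search/output.mjs";

// Writes results/2025-..._what-is-cdp.json and prints its path to stdout.
writeOutput({ query: "what is CDP?", results: [] }, null, {
  query: "what is CDP?",
});

// With synthesize=true and a data._synthesis.answer present, a sibling
// <base>-synthesis.md is written and its path is printed instead.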