@apmantza/greedysearch-pi 1.9.2 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +132 -2
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +318 -81
- package/extractors/bing-copilot.mjs +48 -18
- package/extractors/chatgpt.mjs +553 -0
- package/extractors/common.mjs +213 -22
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +182 -18
- package/extractors/gemini.mjs +350 -217
- package/extractors/google-ai.mjs +129 -128
- package/extractors/logically.mjs +629 -0
- package/extractors/perplexity.mjs +547 -217
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +8 -4
- package/skills/greedy-search/skill.md +20 -12
- package/src/fetcher.mjs +23 -1
- package/src/formatters/results.ts +185 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/challenge-detect.mjs +205 -0
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +155 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/progress.mjs +145 -0
- package/src/search/recovery.mjs +73 -45
- package/src/search/research.mjs +1419 -62
- package/src/search/scale-aware.mjs +93 -0
- package/src/search/simple-research.mjs +520 -0
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +129 -59
- package/src/tools/shared.ts +312 -186
- package/src/types.ts +110 -104
- package/test.mjs +537 -18
package/src/search/constants.mjs
CHANGED
|
@@ -1,39 +1,155 @@
|
|
|
1
|
-
// src/search/constants.mjs — Shared constants for GreedySearch search pipeline
|
|
2
|
-
|
|
3
|
-
import {
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
export const
|
|
9
|
-
export const
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
export const
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
1
|
+
// src/search/constants.mjs — Shared constants for GreedySearch search pipeline
|
|
2
|
+
|
|
3
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
4
|
+
import { homedir } from "node:os";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
import { tmpdir } from "node:os";
|
|
7
|
+
|
|
8
|
+
// OS temp directory with backslashes converted to forward slashes; every
// runtime artefact path below is built under it.
const TMP_ROOT = tmpdir().replaceAll("\\", "/");

export const GREEDY_PORT = 9222;
export const GREEDY_PROFILE_DIR = `${TMP_ROOT}/greedysearch-chrome-profile`;
export const ACTIVE_PORT_FILE = `${GREEDY_PROFILE_DIR}/DevToolsActivePort`;
export const PAGES_CACHE = `${TMP_ROOT}/cdp-pages.json`;
export const CHROME_MODE_FILE = `${TMP_ROOT}/greedysearch-chrome-mode`;
export const VISIBLE_RECOVERY_LOG = `${TMP_ROOT}/greedysearch-visible-recovery.jsonl`;
|
|
14
|
+
|
|
15
|
+
// ── User config: ~/.pi/greedyconfig ────────────────────────────────────────
|
|
16
|
+
// Users can override which engines participate in the "all" fan-out and which
|
|
17
|
+
// engine performs optional synthesis.
|
|
18
|
+
// Default engines: perplexity, google, chatgpt; synthesizer: gemini
|
|
19
|
+
|
|
20
|
+
const CONFIG_DIR = join(homedir(), ".pi");
|
|
21
|
+
const CONFIG_FILE = join(CONFIG_DIR, "greedyconfig");
|
|
22
|
+
|
|
23
|
+
// Default engines that participate in the "all" fan-out for normal
|
|
24
|
+
// (non-research) searches. Opt-in research/academic engines like
|
|
25
|
+
// `semantic-scholar` are deliberately excluded — they belong in research
|
|
26
|
+
// mode, not casual web search. Users who want them in normal `engine:all`
|
|
27
|
+
// runs can add them via ~/.pi/greedyconfig (see ensureDefaultConfig()).
|
|
28
|
+
export const DEFAULT_ENGINES = ["perplexity", "google", "chatgpt"];
|
|
29
|
+
export const DEFAULT_SYNTHESIZER = "gemini";
|
|
30
|
+
|
|
31
|
+
/**
 * Resolve the engine list used for the "all" fan-out.
 *
 * Reads CONFIG_FILE (JSON) and, when its `engines` field is a non-empty array
 * of strings, returns the entries that are own keys of ENGINES. Unknown names
 * are reported on stderr and dropped. Any other situation (no file, unusable
 * `engines` field, no valid entries, unreadable or malformed file) yields
 * DEFAULT_ENGINES.
 *
 * @returns {string[]} Engine names to fan out to.
 */
function loadUserEngines() {
  try {
    if (!existsSync(CONFIG_FILE)) return DEFAULT_ENGINES;

    const config = JSON.parse(readFileSync(CONFIG_FILE, "utf8"));
    const requested = config?.engines;
    const usable =
      Array.isArray(requested) &&
      requested.length > 0 &&
      requested.every((e) => typeof e === "string");
    if (!usable) return DEFAULT_ENGINES;

    // Own-property check: a plain truthiness lookup would accept inherited
    // keys such as "constructor" or "toString" as if they were engines.
    const valid = [];
    const invalid = [];
    for (const name of requested) {
      (Object.hasOwn(ENGINES, name) ? valid : invalid).push(name);
    }

    // Tell the user about dropped names so a typo in the config file does
    // not quietly shrink the all-search fan-out.
    if (invalid.length > 0) {
      process.stderr.write(
        `[greedysearch] Warning: ignoring unknown engine(s) in ${CONFIG_FILE}: ${invalid.join(", ")}\n` +
          `[greedysearch] Available engines: ${Object.keys(ENGINES).join(", ")}\n`,
      );
    }
    if (valid.length > 0) return valid;

    process.stderr.write(
      `[greedysearch] Warning: no valid engines in ${CONFIG_FILE}, falling back to defaults: ${DEFAULT_ENGINES.join(", ")}\n`,
    );
  } catch (cause) {
    // Still best-effort (never throw at module load), but surface the reason:
    // a malformed config would otherwise be ignored without any hint.
    process.stderr.write(
      `[greedysearch] Warning: could not read ${CONFIG_FILE} (${cause?.message ?? cause}), falling back to defaults: ${DEFAULT_ENGINES.join(", ")}\n`,
    );
  }
  return DEFAULT_ENGINES;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Create CONFIG_FILE with the default engines and synthesizer if it does not
 * exist yet. Best-effort: every filesystem error is swallowed so that a
 * read-only or missing home directory never breaks module load.
 */
function ensureDefaultConfig() {
  try {
    // `recursive: true` is a no-op when the directory already exists, so no
    // separate existence check is needed.
    mkdirSync(CONFIG_DIR, { recursive: true });

    const seed = `${JSON.stringify(
      { engines: DEFAULT_ENGINES, synthesizer: DEFAULT_SYNTHESIZER },
      null,
      2,
    )}\n`;

    // "wx" creates the file atomically and fails with EEXIST when it is
    // already there, so a concurrent process (or a user edit made between a
    // check and the write) can never be overwritten.
    writeFileSync(CONFIG_FILE, seed, { encoding: "utf8", flag: "wx" });
  } catch {
    // Best-effort — an existing file or an unwritable location is not an error
  }
}
|
|
83
|
+
|
|
84
|
+
ensureDefaultConfig();
|
|
85
|
+
|
|
86
|
+
export const SUPPORTED_SYNTHESIZERS = ["gemini", "chatgpt"];
|
|
87
|
+
|
|
88
|
+
/**
 * Resolve the synthesizer engine name.
 *
 * Returns the lower-cased `synthesizer` value from CONFIG_FILE when it is one
 * of SUPPORTED_SYNTHESIZERS. An unsupported string is reported on stderr. In
 * every other case, including read/parse failures, DEFAULT_SYNTHESIZER is
 * returned.
 *
 * @returns {string} Synthesizer engine name.
 */
function loadUserSynthesizer() {
  try {
    const requested = existsSync(CONFIG_FILE)
      ? JSON.parse(readFileSync(CONFIG_FILE, "utf8")).synthesizer
      : undefined;

    if (typeof requested === "string") {
      const key = requested.toLowerCase();
      if (SUPPORTED_SYNTHESIZERS.includes(key)) return key;

      const warning = [
        `[greedysearch] Warning: unknown synthesizer "${requested}" in ${CONFIG_FILE}\n`,
        `[greedysearch] Available synthesizers: ${SUPPORTED_SYNTHESIZERS.join(", ")}\n`,
        `[greedysearch] Falling back to default: ${DEFAULT_SYNTHESIZER}\n`,
      ];
      process.stderr.write(warning.join(""));
    }
  } catch {
    // Read/parse problems are ignored on purpose; the default applies
  }
  return DEFAULT_SYNTHESIZER;
}
|
|
108
|
+
|
|
109
|
+
// Engine name → site domain. Only the semantic-scholar aliases are listed
// here; the other short aliases in ENGINES have no entry.
// NOTE(review): confirm callers never look up a short alias such as "p".
export const ENGINE_DOMAINS = {
  perplexity: "perplexity.ai",
  bing: "copilot.microsoft.com",
  google: "google.com",
  gemini: "gemini.google.com",
  chatgpt: "chatgpt.com",

  // Semantic Scholar and its aliases
  "semantic-scholar": "semanticscholar.org",
  semanticscholar: "semanticscholar.org",
  s2: "semanticscholar.org",

  logically: "logically.app",
};
|
|
120
|
+
|
|
121
|
+
// Engine name (and short alias) → extractor script filename. Key order is
// significant: it is the order printed in the "Available engines" warning.
export const ENGINES = {
  // Perplexity
  perplexity: "perplexity.mjs",
  p: "perplexity.mjs",

  // Bing Copilot
  bing: "bing-copilot.mjs",
  b: "bing-copilot.mjs",

  // Google AI
  google: "google-ai.mjs",
  g: "google-ai.mjs",

  // Gemini
  gemini: "gemini.mjs",
  gem: "gemini.mjs",

  // ChatGPT
  chatgpt: "chatgpt.mjs",
  gpt: "chatgpt.mjs",

  // Semantic Scholar
  "semantic-scholar": "semantic-scholar.mjs",
  semanticscholar: "semantic-scholar.mjs",
  s2: "semantic-scholar.mjs",

  // Logically
  logically: "logically.mjs",
  log: "logically.mjs",
};
|
|
138
|
+
|
|
139
|
+
// ALL_ENGINES drives the "all" fan-out. Edit ~/.pi/greedyconfig to customize.
|
|
140
|
+
export const ALL_ENGINES = loadUserEngines();
|
|
141
|
+
|
|
142
|
+
// Research child searches intentionally reuse the normal configured fan-out.
|
|
143
|
+
// Gemini remains the research planner/final-report synthesizer.
|
|
144
|
+
export const RESEARCH_ENGINES = ALL_ENGINES;
|
|
145
|
+
|
|
146
|
+
// SYNTHESIZER drives optional all-search synthesis. Edit ~/.pi/greedyconfig to customize.
|
|
147
|
+
export const SYNTHESIZER = loadUserSynthesizer();
|
|
148
|
+
|
|
149
|
+
// Value of GREEDY_FETCH_CONCURRENCY parsed as a base-10 integer; an unset or
// empty variable is read as "5".
const requestedFetchConcurrency = Number.parseInt(
  process.env.GREEDY_FETCH_CONCURRENCY || "5",
  10,
);

// NaN and 0 fall back to 5; negative values are clamped up to 1.
export const SOURCE_FETCH_CONCURRENCY = Math.max(
  1,
  requestedFetchConcurrency || 5,
);
|
|
153
|
+
|
|
154
|
+
// Tell cdp.mjs to prefer the GreedySearch Chrome profile's DevToolsActivePort
|
|
155
|
+
process.env.CDP_PROFILE_DIR = GREEDY_PROFILE_DIR;
|
package/src/search/engines.mjs
CHANGED
|
@@ -1,76 +1,114 @@
|
|
|
1
|
-
// src/search/engines.mjs — Extractor runner
|
|
2
|
-
//
|
|
3
|
-
// Engine map lives in constants.mjs; this module re-exports it for
|
|
4
|
-
// backward compatibility and provides the runExtractor() function.
|
|
5
|
-
|
|
6
|
-
import { spawn } from "node:child_process";
|
|
7
|
-
import { join } from "node:path";
|
|
8
|
-
import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";
|
|
9
|
-
|
|
10
|
-
export { ENGINES };
|
|
11
|
-
|
|
12
|
-
const __dir =
|
|
13
|
-
import.meta.dirname ||
|
|
14
|
-
new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
|
|
15
|
-
|
|
16
|
-
export function runExtractor(
|
|
17
|
-
script,
|
|
18
|
-
query,
|
|
19
|
-
tabPrefix = null,
|
|
20
|
-
short = false,
|
|
21
|
-
timeoutMs = null,
|
|
22
|
-
locale = null,
|
|
23
|
-
) {
|
|
24
|
-
// Gemini synthesis: 70s budget (45s stream + ~25s nav/settle overhead)
|
|
25
|
-
//
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
1
|
+
// src/search/engines.mjs — Extractor runner
|
|
2
|
+
//
|
|
3
|
+
// Engine map lives in constants.mjs; this module re-exports it for
|
|
4
|
+
// backward compatibility and provides the runExtractor() function.
|
|
5
|
+
|
|
6
|
+
import { spawn } from "node:child_process";
|
|
7
|
+
import { join } from "node:path";
|
|
8
|
+
import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";
|
|
9
|
+
|
|
10
|
+
export { ENGINES };
|
|
11
|
+
|
|
12
|
+
// Directory containing this module. `import.meta.dirname` is used when the
// runtime provides it; otherwise the directory is derived from the module URL.
// URL.pathname is percent-encoded, so it is decoded first — without that an
// install path containing spaces or non-ASCII characters would not resolve.
// The leading "/" before a Windows drive letter (either case) is stripped.
const __dir =
  import.meta.dirname ||
  decodeURIComponent(new URL(".", import.meta.url).pathname).replace(
    /^\/([A-Za-z]:)/,
    "$1",
  );
|
|
15
|
+
|
|
16
|
+
// Return the last `n` non-empty lines of `text` (used for timeout diagnostics).
function tailLines(text, n = 20) {
  return String(text ?? "")
    .split(/\r?\n/)
    .filter(Boolean)
    .slice(-n)
    .join("\n");
}

// Extract the `_envelope` object from an extractor's JSON stdout, or null when
// stdout is not valid JSON or carries no envelope.
function parseEnvelope(stdoutText) {
  try {
    return JSON.parse(stdoutText.trim())._envelope || null;
  } catch {
    return null;
  }
}

// Default time budget in milliseconds, chosen from the script name.
function defaultTimeoutFor(script) {
  // Gemini synthesis: 70s budget (45s stream + ~25s nav/settle overhead)
  // ChatGPT can use a 30s in-page stream wait plus a 35s node-side fallback.
  // Logically research answers can run academic + web searches before streaming.
  // Other engines: 60s budget
  if (script.includes("logically")) return 120000;
  if (script.includes("chatgpt")) return 80000;
  if (script.includes("gemini")) return 70000;
  return 60000;
}

/**
 * Run an extractor script in a child Node process and resolve with the JSON
 * it prints on stdout.
 *
 * The script is resolved under `<module dir>/../../extractors/` and started
 * with `--stdin`; the query is written to the child's stdin. The child
 * inherits the parent's environment with CDP_PROFILE_DIR set to
 * GREEDY_PROFILE_DIR. Child stderr is forwarded to the parent's stderr unless
 * GREEDY_SEARCH_CHILD_STDERR is "0".
 *
 * @param {string} script - Extractor filename, e.g. "gemini.mjs".
 * @param {string} query - Query text sent to the child via stdin.
 * @param {string|null} [tabPrefix] - When set, passed as `--tab <tabPrefix>`.
 * @param {boolean} [short] - When true, passes `--short`.
 * @param {number|null} [timeoutMs] - Time budget in ms; null selects a
 *   per-engine default (logically 120s, chatgpt 80s, gemini 70s, others 60s).
 * @param {string|null} [locale] - When set, passed as `--locale <locale>`.
 * @returns {Promise<any>} Parsed JSON from the child's stdout on exit code 0.
 *   Rejects with an Error on timeout (carrying `engineScript`, `lastStage`,
 *   `partialErr`, `partialOut`), on a non-zero exit (carrying `envelope` when
 *   the child printed one), on unparsable stdout, or when the child process
 *   cannot be started.
 */
export function runExtractor(
  script,
  query,
  tabPrefix = null,
  short = false,
  timeoutMs = null,
  locale = null,
) {
  const budgetMs = timeoutMs === null ? defaultTimeoutFor(script) : timeoutMs;
  const extraArgs = [
    ...(tabPrefix ? ["--tab", tabPrefix] : []),
    ...(short ? ["--short"] : []),
    ...(locale ? ["--locale", locale] : []),
  ];

  return new Promise((resolve, reject) => {
    let timer = null;
    let out = "";
    let err = "";

    const proc = spawn(
      process.execPath,
      [join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
      {
        stdio: ["pipe", "pipe", "pipe"],
        env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
      },
    );

    // A failed spawn is reported through the 'error' event; with no listener
    // it would be thrown as an uncaught exception in the parent.
    proc.on("error", (cause) => {
      clearTimeout(timer);
      const spawnErr = new Error(`failed to run ${script}: ${cause.message}`, {
        cause,
      });
      spawnErr.engineScript = script;
      reject(spawnErr);
    });

    // If the child exits before reading its input, the write fails with
    // EPIPE on the stdin stream. Swallow it here; the 'close' handler below
    // reports the actual failure.
    proc.stdin.on("error", () => {});

    // Pipe query via stdin to avoid leaking it in process table command-line
    proc.stdin.write(query);
    proc.stdin.end();

    proc.stdout.on("data", (chunk) => {
      out += chunk;
    });

    // Forward child stderr to parent so [engine] stage: lines are visible
    // in real time. Also retain the buffer for the timeout diagnostic path.
    proc.stderr.on("data", (chunk) => {
      err += chunk;
      if (process.env.GREEDY_SEARCH_CHILD_STDERR !== "0") {
        process.stderr.write(chunk);
      }
    });

    timer = setTimeout(() => {
      proc.kill();
      // Surface whatever diagnostics the killed child produced so the caller
      // can see which stage the extractor was in when it was stopped.
      const lastStage = parseEnvelope(out)?.lastStage || null;
      const timeoutErr = new Error(
        `${script} timed out after ${budgetMs / 1000}s` +
          (lastStage ? ` (last stage: ${lastStage})` : ""),
      );
      timeoutErr.engineScript = script;
      timeoutErr.lastStage = lastStage;
      timeoutErr.partialErr = tailLines(err);
      timeoutErr.partialOut = tailLines(out);
      reject(timeoutErr);
    }, budgetMs);

    proc.on("close", (code) => {
      clearTimeout(timer);
      if (code === 0) {
        try {
          resolve(JSON.parse(out.trim()));
        } catch {
          reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
        }
        return;
      }
      // Prefer the structured error envelope from stdout when one was printed
      const envelope = parseEnvelope(out);
      const failure = new Error(err.trim() || `extractor exit ${code}`);
      if (envelope) failure.envelope = envelope;
      reject(failure);
    });
  });
}
|