@apmantza/greedysearch-pi 1.9.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,39 +1,150 @@
1
- // src/search/constants.mjs — Shared constants for GreedySearch search pipeline
2
-
3
- import { tmpdir } from "node:os";
4
-
5
- export const GREEDY_PORT = 9222;
6
- export const GREEDY_PROFILE_DIR = `${tmpdir().replaceAll("\\", "/")}/greedysearch-chrome-profile`;
7
- export const ACTIVE_PORT_FILE = `${GREEDY_PROFILE_DIR}/DevToolsActivePort`;
8
- export const PAGES_CACHE = `${tmpdir().replaceAll("\\", "/")}/cdp-pages.json`;
9
- export const CHROME_MODE_FILE = `${tmpdir().replaceAll("\\", "/")}/greedysearch-chrome-mode`;
10
-
11
- // ALL_ENGINES drives the "all" fan-out. Add engines here to include them in multi-engine searches.
12
- // Engines in ENGINES but not in ALL_ENGINES are available for explicit use only.
13
- export const ALL_ENGINES = ["perplexity", "bing", "google"];
14
-
15
- export const ENGINE_DOMAINS = {
16
- perplexity: "perplexity.ai",
17
- bing: "copilot.microsoft.com",
18
- google: "google.com",
19
- gemini: "gemini.google.com",
20
- };
21
-
22
- export const ENGINES = {
23
- perplexity: "perplexity.mjs",
24
- p: "perplexity.mjs",
25
- bing: "bing-copilot.mjs",
26
- b: "bing-copilot.mjs",
27
- google: "google-ai.mjs",
28
- g: "google-ai.mjs",
29
- gemini: "gemini.mjs",
30
- gem: "gemini.mjs",
31
- };
32
-
33
- export const SOURCE_FETCH_CONCURRENCY = Math.max(
34
- 1,
35
- Number.parseInt(process.env.GREEDY_FETCH_CONCURRENCY || "5", 10) || 5,
36
- );
37
-
38
- // Tell cdp.mjs to prefer the GreedySearch Chrome profile's DevToolsActivePort
39
- process.env.CDP_PROFILE_DIR = GREEDY_PROFILE_DIR;
1
+ // src/search/constants.mjs — Shared constants for GreedySearch search pipeline
2
+
3
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
4
+ import { homedir } from "node:os";
5
+ import { join } from "node:path";
6
+ import { tmpdir } from "node:os";
7
+
8
+ export const GREEDY_PORT = 9222;
9
+ export const GREEDY_PROFILE_DIR = `${tmpdir().replaceAll("\\", "/")}/greedysearch-chrome-profile`;
10
+ export const ACTIVE_PORT_FILE = `${GREEDY_PROFILE_DIR}/DevToolsActivePort`;
11
+ export const PAGES_CACHE = `${tmpdir().replaceAll("\\", "/")}/cdp-pages.json`;
12
+ export const CHROME_MODE_FILE = `${tmpdir().replaceAll("\\", "/")}/greedysearch-chrome-mode`;
13
+ export const VISIBLE_RECOVERY_LOG = `${tmpdir().replaceAll("\\", "/")}/greedysearch-visible-recovery.jsonl`;
14
+
15
+ // ── User config: ~/.pi/greedyconfig ────────────────────────────────────────
16
+ // Users can override which engines participate in the "all" fan-out and which
17
+ // engine performs optional synthesis.
18
+ // Default engines: perplexity, google, chatgpt; synthesizer: gemini
19
+
20
+ const CONFIG_DIR = join(homedir(), ".pi");
21
+ const CONFIG_FILE = join(CONFIG_DIR, "greedyconfig");
22
+
23
+ export const DEFAULT_ENGINES = ["perplexity", "google", "chatgpt"];
24
+ export const DEFAULT_SYNTHESIZER = "gemini";
25
+
26
+ function loadUserEngines() {
27
+ try {
28
+ if (existsSync(CONFIG_FILE)) {
29
+ const raw = readFileSync(CONFIG_FILE, "utf8");
30
+ const config = JSON.parse(raw);
31
+ if (
32
+ Array.isArray(config.engines) &&
33
+ config.engines.length > 0 &&
34
+ config.engines.every((e) => typeof e === "string")
35
+ ) {
36
+ // Validate each engine exists in ENGINES. Unknown names are
37
+ // silently dropped — but at least once we tell the user about
38
+ // it so a typo in ~/.pi/greedyconfig doesn't quietly shrink
39
+ // the all-search fan-out.
40
+ const valid = config.engines.filter((e) => ENGINES[e]);
41
+ const invalid = config.engines.filter((e) => !ENGINES[e]);
42
+ if (invalid.length > 0) {
43
+ process.stderr.write(
44
+ `[greedysearch] Warning: ignoring unknown engine(s) in ${CONFIG_FILE}: ${invalid.join(", ")}\n` +
45
+ `[greedysearch] Available engines: ${Object.keys(ENGINES).join(", ")}\n`,
46
+ );
47
+ }
48
+ if (valid.length > 0) return valid;
49
+ process.stderr.write(
50
+ `[greedysearch] Warning: no valid engines in ${CONFIG_FILE}, falling back to defaults: ${DEFAULT_ENGINES.join(", ")}\n`,
51
+ );
52
+ }
53
+ }
54
+ } catch {
55
+ // Ignore parse/read errors — fall through to default
56
+ }
57
+ return DEFAULT_ENGINES;
58
+ }
59
+
60
+ function ensureDefaultConfig() {
61
+ try {
62
+ if (!existsSync(CONFIG_DIR)) mkdirSync(CONFIG_DIR, { recursive: true });
63
+ if (!existsSync(CONFIG_FILE)) {
64
+ writeFileSync(
65
+ CONFIG_FILE,
66
+ JSON.stringify(
67
+ { engines: DEFAULT_ENGINES, synthesizer: DEFAULT_SYNTHESIZER },
68
+ null,
69
+ 2,
70
+ ) + "\n",
71
+ "utf8",
72
+ );
73
+ }
74
+ } catch {
75
+ // Best-effort — don't crash if we can't write the config file
76
+ }
77
+ }
78
+
79
+ ensureDefaultConfig();
80
+
81
+ export const SUPPORTED_SYNTHESIZERS = ["gemini", "chatgpt"];
82
+
83
+ function loadUserSynthesizer() {
84
+ try {
85
+ if (existsSync(CONFIG_FILE)) {
86
+ const raw = readFileSync(CONFIG_FILE, "utf8");
87
+ const config = JSON.parse(raw);
88
+ if (typeof config.synthesizer === "string") {
89
+ const normalized = config.synthesizer.toLowerCase();
90
+ if (SUPPORTED_SYNTHESIZERS.includes(normalized)) return normalized;
91
+ process.stderr.write(
92
+ `[greedysearch] Warning: unknown synthesizer "${config.synthesizer}" in ${CONFIG_FILE}\n` +
93
+ `[greedysearch] Available synthesizers: ${SUPPORTED_SYNTHESIZERS.join(", ")}\n` +
94
+ `[greedysearch] Falling back to default: ${DEFAULT_SYNTHESIZER}\n`,
95
+ );
96
+ }
97
+ }
98
+ } catch {
99
+ // Ignore parse/read errors — fall through to default
100
+ }
101
+ return DEFAULT_SYNTHESIZER;
102
+ }
103
+
104
+ export const ENGINE_DOMAINS = {
105
+ perplexity: "perplexity.ai",
106
+ bing: "copilot.microsoft.com",
107
+ google: "google.com",
108
+ gemini: "gemini.google.com",
109
+ chatgpt: "chatgpt.com",
110
+ "semantic-scholar": "semanticscholar.org",
111
+ semanticscholar: "semanticscholar.org",
112
+ s2: "semanticscholar.org",
113
+ logically: "logically.app",
114
+ };
115
+
116
+ export const ENGINES = {
117
+ perplexity: "perplexity.mjs",
118
+ p: "perplexity.mjs",
119
+ bing: "bing-copilot.mjs",
120
+ b: "bing-copilot.mjs",
121
+ google: "google-ai.mjs",
122
+ g: "google-ai.mjs",
123
+ gemini: "gemini.mjs",
124
+ gem: "gemini.mjs",
125
+ chatgpt: "chatgpt.mjs",
126
+ gpt: "chatgpt.mjs",
127
+ "semantic-scholar": "semantic-scholar.mjs",
128
+ semanticscholar: "semantic-scholar.mjs",
129
+ s2: "semantic-scholar.mjs",
130
+ logically: "logically.mjs",
131
+ log: "logically.mjs",
132
+ };
133
+
134
+ // ALL_ENGINES drives the "all" fan-out. Edit ~/.pi/greedyconfig to customize.
135
+ export const ALL_ENGINES = loadUserEngines();
136
+
137
+ // Research child searches intentionally reuse the normal configured fan-out.
138
+ // Gemini remains the research planner/final-report synthesizer.
139
+ export const RESEARCH_ENGINES = ALL_ENGINES;
140
+
141
+ // SYNTHESIZER drives optional all-search synthesis. Edit ~/.pi/greedyconfig to customize.
142
+ export const SYNTHESIZER = loadUserSynthesizer();
143
+
144
+ export const SOURCE_FETCH_CONCURRENCY = Math.max(
145
+ 1,
146
+ Number.parseInt(process.env.GREEDY_FETCH_CONCURRENCY || "5", 10) || 5,
147
+ );
148
+
149
+ // Tell cdp.mjs to prefer the GreedySearch Chrome profile's DevToolsActivePort
150
+ process.env.CDP_PROFILE_DIR = GREEDY_PROFILE_DIR;
@@ -1,76 +1,114 @@
1
- // src/search/engines.mjs — Extractor runner
2
- //
3
- // Engine map lives in constants.mjs; this module re-exports it for
4
- // backward compatibility and provides the runExtractor() function.
5
-
6
- import { spawn } from "node:child_process";
7
- import { join } from "node:path";
8
- import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";
9
-
10
- export { ENGINES };
11
-
12
- const __dir =
13
- import.meta.dirname ||
14
- new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
15
-
16
- export function runExtractor(
17
- script,
18
- query,
19
- tabPrefix = null,
20
- short = false,
21
- timeoutMs = null,
22
- locale = null,
23
- ) {
24
- // Gemini synthesis: 70s budget (45s stream + ~25s nav/settle overhead)
25
- // Other engines: 60s budget
26
- if (timeoutMs === null) {
27
- timeoutMs = script.includes("gemini") ? 70000 : 60000;
28
- }
29
- const extraArgs = [
30
- ...(tabPrefix ? ["--tab", tabPrefix] : []),
31
- ...(short ? ["--short"] : []),
32
- ...(locale ? ["--locale", locale] : []),
33
- ];
34
- return new Promise((resolve, reject) => {
35
- const proc = spawn(
36
- process.execPath,
37
- [join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
38
- {
39
- stdio: ["pipe", "pipe", "pipe"],
40
- env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
41
- },
42
- );
43
- // Pipe query via stdin to avoid leaking it in process table command-line
44
- proc.stdin.write(query);
45
- proc.stdin.end();
46
- let out = "";
47
- let err = "";
48
- proc.stdout.on("data", (d) => (out += d));
49
- proc.stderr.on("data", (d) => (err += d));
50
- const t = setTimeout(() => {
51
- proc.kill();
52
- reject(new Error(`${script} timed out after ${timeoutMs / 1000}s`));
53
- }, timeoutMs);
54
- proc.on("close", (code) => {
55
- clearTimeout(t);
56
- if (code === 0) {
57
- try {
58
- resolve(JSON.parse(out.trim()));
59
- } catch {
60
- reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
61
- }
62
- } else {
63
- // Try to parse structured error envelope from stdout before falling back
64
- let envelope = null;
65
- try {
66
- const parsed = JSON.parse(out.trim());
67
- if (parsed._envelope) envelope = parsed._envelope;
68
- } catch {}
69
- const msg = err.trim() || `extractor exit ${code}`;
70
- const errObj = new Error(msg);
71
- if (envelope) errObj.envelope = envelope;
72
- reject(errObj);
73
- }
74
- });
75
- });
76
- }
1
+ // src/search/engines.mjs — Extractor runner
2
+ //
3
+ // Engine map lives in constants.mjs; this module re-exports it for
4
+ // backward compatibility and provides the runExtractor() function.
5
+
6
+ import { spawn } from "node:child_process";
7
+ import { join } from "node:path";
8
+ import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";
9
+
10
+ export { ENGINES };
11
+
12
+ const __dir =
13
+ import.meta.dirname ||
14
+ new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
15
+
16
+ export function runExtractor(
17
+ script,
18
+ query,
19
+ tabPrefix = null,
20
+ short = false,
21
+ timeoutMs = null,
22
+ locale = null,
23
+ ) {
24
+ // Gemini synthesis: 70s budget (45s stream + ~25s nav/settle overhead)
25
+ // ChatGPT can use a 30s in-page stream wait plus a 35s node-side fallback.
26
+ // Logically research answers can run academic + web searches before streaming.
27
+ // Other engines: 60s budget
28
+ if (timeoutMs === null) {
29
+ timeoutMs = script.includes("logically")
30
+ ? 120000
31
+ : script.includes("chatgpt")
32
+ ? 80000
33
+ : script.includes("gemini")
34
+ ? 70000
35
+ : 60000;
36
+ }
37
+ const extraArgs = [
38
+ ...(tabPrefix ? ["--tab", tabPrefix] : []),
39
+ ...(short ? ["--short"] : []),
40
+ ...(locale ? ["--locale", locale] : []),
41
+ ];
42
+ return new Promise((resolve, reject) => {
43
+ const proc = spawn(
44
+ process.execPath,
45
+ [join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
46
+ {
47
+ stdio: ["pipe", "pipe", "pipe"],
48
+ env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
49
+ },
50
+ );
51
+ // Pipe query via stdin to avoid leaking it in process table command-line
52
+ proc.stdin.write(query);
53
+ proc.stdin.end();
54
+ let out = "";
55
+ let err = "";
56
+ proc.stdout.on("data", (d) => (out += d));
57
+ // Forward child stderr to parent so [engine] stage: lines are visible
58
+ // in real time. Also retain the buffer for the timeout diagnostic path.
59
+ proc.stderr.on("data", (d) => {
60
+ err += d;
61
+ if (process.env.GREEDY_SEARCH_CHILD_STDERR !== "0") {
62
+ process.stderr.write(d);
63
+ }
64
+ });
65
+ const t = setTimeout(() => {
66
+ proc.kill();
67
+ // Surface as much diagnostic info as the killed child produced so the
68
+ // caller can see *which stage* the extractor was in. handleError()
69
+ // emits `{ _envelope, error }` JSON to stdout on graceful failure,
70
+ // but a hard kill discards whatever was buffered.
71
+ const tailLines = (s, n = 20) =>
72
+ String(s ?? "")
73
+ .split(/\r?\n/)
74
+ .filter(Boolean)
75
+ .slice(-n)
76
+ .join("\n");
77
+ let envelope = null;
78
+ try {
79
+ const parsed = JSON.parse(out.trim());
80
+ if (parsed._envelope) envelope = parsed._envelope;
81
+ } catch {}
82
+ const errObj = new Error(
83
+ `${script} timed out after ${timeoutMs / 1000}s` +
84
+ (envelope?.lastStage ? ` (last stage: ${envelope.lastStage})` : ""),
85
+ );
86
+ errObj.engineScript = script;
87
+ errObj.lastStage = envelope?.lastStage || null;
88
+ errObj.partialErr = tailLines(err);
89
+ errObj.partialOut = tailLines(out);
90
+ reject(errObj);
91
+ }, timeoutMs);
92
+ proc.on("close", (code) => {
93
+ clearTimeout(t);
94
+ if (code === 0) {
95
+ try {
96
+ resolve(JSON.parse(out.trim()));
97
+ } catch {
98
+ reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
99
+ }
100
+ } else {
101
+ // Try to parse structured error envelope from stdout before falling back
102
+ let envelope = null;
103
+ try {
104
+ const parsed = JSON.parse(out.trim());
105
+ if (parsed._envelope) envelope = parsed._envelope;
106
+ } catch {}
107
+ const msg = err.trim() || `extractor exit ${code}`;
108
+ const errObj = new Error(msg);
109
+ if (envelope) errObj.envelope = envelope;
110
+ reject(errObj);
111
+ }
112
+ });
113
+ });
114
+ }