@apmantza/greedysearch-pi 1.9.2 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/CHANGELOG.md +132 -2
  2. package/README.md +82 -47
  3. package/bin/cdp.mjs +1153 -1108
  4. package/bin/launch.mjs +9 -0
  5. package/bin/search.mjs +318 -81
  6. package/extractors/bing-copilot.mjs +48 -18
  7. package/extractors/chatgpt.mjs +553 -0
  8. package/extractors/common.mjs +213 -22
  9. package/extractors/consensus.mjs +655 -0
  10. package/extractors/consent.mjs +182 -18
  11. package/extractors/gemini.mjs +350 -217
  12. package/extractors/google-ai.mjs +129 -128
  13. package/extractors/logically.mjs +629 -0
  14. package/extractors/perplexity.mjs +547 -217
  15. package/extractors/selectors.mjs +3 -2
  16. package/extractors/semantic-scholar.mjs +219 -0
  17. package/package.json +8 -4
  18. package/skills/greedy-search/skill.md +20 -12
  19. package/src/fetcher.mjs +23 -1
  20. package/src/formatters/results.ts +185 -128
  21. package/src/search/browser-lifecycle.mjs +27 -5
  22. package/src/search/challenge-detect.mjs +205 -0
  23. package/src/search/chrome.mjs +653 -590
  24. package/src/search/constants.mjs +155 -39
  25. package/src/search/engines.mjs +114 -76
  26. package/src/search/fetch-source.mjs +566 -451
  27. package/src/search/pdf.mjs +68 -0
  28. package/src/search/progress.mjs +145 -0
  29. package/src/search/recovery.mjs +73 -45
  30. package/src/search/research.mjs +1419 -62
  31. package/src/search/scale-aware.mjs +93 -0
  32. package/src/search/simple-research.mjs +520 -0
  33. package/src/search/sources.mjs +52 -22
  34. package/src/search/synthesis-runner.mjs +105 -26
  35. package/src/search/synthesis.mjs +286 -246
  36. package/src/tools/greedy-search-handler.ts +129 -59
  37. package/src/tools/shared.ts +312 -186
  38. package/src/types.ts +110 -104
  39. package/test.mjs +537 -18
@@ -1,39 +1,155 @@
1
- // src/search/constants.mjs — Shared constants for GreedySearch search pipeline
2
-
3
- import { tmpdir } from "node:os";
4
-
5
- export const GREEDY_PORT = 9222;
6
- export const GREEDY_PROFILE_DIR = `${tmpdir().replaceAll("\\", "/")}/greedysearch-chrome-profile`;
7
- export const ACTIVE_PORT_FILE = `${GREEDY_PROFILE_DIR}/DevToolsActivePort`;
8
- export const PAGES_CACHE = `${tmpdir().replaceAll("\\", "/")}/cdp-pages.json`;
9
- export const CHROME_MODE_FILE = `${tmpdir().replaceAll("\\", "/")}/greedysearch-chrome-mode`;
10
-
11
- // ALL_ENGINES drives the "all" fan-out. Add engines here to include them in multi-engine searches.
12
- // Engines in ENGINES but not in ALL_ENGINES are available for explicit use only.
13
- export const ALL_ENGINES = ["perplexity", "bing", "google"];
14
-
15
- export const ENGINE_DOMAINS = {
16
- perplexity: "perplexity.ai",
17
- bing: "copilot.microsoft.com",
18
- google: "google.com",
19
- gemini: "gemini.google.com",
20
- };
21
-
22
- export const ENGINES = {
23
- perplexity: "perplexity.mjs",
24
- p: "perplexity.mjs",
25
- bing: "bing-copilot.mjs",
26
- b: "bing-copilot.mjs",
27
- google: "google-ai.mjs",
28
- g: "google-ai.mjs",
29
- gemini: "gemini.mjs",
30
- gem: "gemini.mjs",
31
- };
32
-
33
- export const SOURCE_FETCH_CONCURRENCY = Math.max(
34
- 1,
35
- Number.parseInt(process.env.GREEDY_FETCH_CONCURRENCY || "5", 10) || 5,
36
- );
37
-
38
- // Tell cdp.mjs to prefer the GreedySearch Chrome profile's DevToolsActivePort
39
- process.env.CDP_PROFILE_DIR = GREEDY_PROFILE_DIR;
1
+ // src/search/constants.mjs — Shared constants for GreedySearch search pipeline
2
+
3
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
4
+ import { homedir } from "node:os";
5
+ import { join } from "node:path";
6
+ import { tmpdir } from "node:os";
7
+
8
+ export const GREEDY_PORT = 9222;
9
+ export const GREEDY_PROFILE_DIR = `${tmpdir().replaceAll("\\", "/")}/greedysearch-chrome-profile`;
10
+ export const ACTIVE_PORT_FILE = `${GREEDY_PROFILE_DIR}/DevToolsActivePort`;
11
+ export const PAGES_CACHE = `${tmpdir().replaceAll("\\", "/")}/cdp-pages.json`;
12
+ export const CHROME_MODE_FILE = `${tmpdir().replaceAll("\\", "/")}/greedysearch-chrome-mode`;
13
+ export const VISIBLE_RECOVERY_LOG = `${tmpdir().replaceAll("\\", "/")}/greedysearch-visible-recovery.jsonl`;
14
+
15
+ // ── User config: ~/.pi/greedyconfig ────────────────────────────────────────
16
+ // Users can override which engines participate in the "all" fan-out and which
17
+ // engine performs optional synthesis.
18
+ // Default engines: perplexity, google, chatgpt; synthesizer: gemini
19
+
20
+ const CONFIG_DIR = join(homedir(), ".pi");
21
+ const CONFIG_FILE = join(CONFIG_DIR, "greedyconfig");
22
+
23
+ // Default engines that participate in the "all" fan-out for normal
24
+ // (non-research) searches. Opt-in research/academic engines like
25
+ // `semantic-scholar` are deliberately excluded — they belong in research
26
+ // mode, not casual web search. Users who want them in normal `engine:all`
27
+ // runs can add them via ~/.pi/greedyconfig (see ensureDefaultConfig()).
28
+ export const DEFAULT_ENGINES = ["perplexity", "google", "chatgpt"];
29
+ export const DEFAULT_SYNTHESIZER = "gemini";
30
+
31
+ function loadUserEngines() {
32
+ try {
33
+ if (existsSync(CONFIG_FILE)) {
34
+ const raw = readFileSync(CONFIG_FILE, "utf8");
35
+ const config = JSON.parse(raw);
36
+ if (
37
+ Array.isArray(config.engines) &&
38
+ config.engines.length > 0 &&
39
+ config.engines.every((e) => typeof e === "string")
40
+ ) {
41
+ // Validate each engine exists in ENGINES. Unknown names are
42
+ // dropped, but we warn on stderr and list the valid names so
43
+ // that a typo in ~/.pi/greedyconfig doesn't quietly shrink
44
+ // the all-search fan-out.
45
+ const valid = config.engines.filter((e) => ENGINES[e]);
46
+ const invalid = config.engines.filter((e) => !ENGINES[e]);
47
+ if (invalid.length > 0) {
48
+ process.stderr.write(
49
+ `[greedysearch] Warning: ignoring unknown engine(s) in ${CONFIG_FILE}: ${invalid.join(", ")}\n` +
50
+ `[greedysearch] Available engines: ${Object.keys(ENGINES).join(", ")}\n`,
51
+ );
52
+ }
53
+ if (valid.length > 0) return valid;
54
+ process.stderr.write(
55
+ `[greedysearch] Warning: no valid engines in ${CONFIG_FILE}, falling back to defaults: ${DEFAULT_ENGINES.join(", ")}\n`,
56
+ );
57
+ }
58
+ }
59
+ } catch {
60
+ // Ignore parse/read errors — fall through to default
61
+ }
62
+ return DEFAULT_ENGINES;
63
+ }
64
+
65
+ function ensureDefaultConfig() {
66
+ try {
67
+ if (!existsSync(CONFIG_DIR)) mkdirSync(CONFIG_DIR, { recursive: true });
68
+ if (!existsSync(CONFIG_FILE)) {
69
+ writeFileSync(
70
+ CONFIG_FILE,
71
+ JSON.stringify(
72
+ { engines: DEFAULT_ENGINES, synthesizer: DEFAULT_SYNTHESIZER },
73
+ null,
74
+ 2,
75
+ ) + "\n",
76
+ "utf8",
77
+ );
78
+ }
79
+ } catch {
80
+ // Best-effort — don't crash if we can't write the config file
81
+ }
82
+ }
83
+
84
+ ensureDefaultConfig();
85
+
86
+ export const SUPPORTED_SYNTHESIZERS = ["gemini", "chatgpt"];
87
+
88
+ function loadUserSynthesizer() {
89
+ try {
90
+ if (existsSync(CONFIG_FILE)) {
91
+ const raw = readFileSync(CONFIG_FILE, "utf8");
92
+ const config = JSON.parse(raw);
93
+ if (typeof config.synthesizer === "string") {
94
+ const normalized = config.synthesizer.toLowerCase();
95
+ if (SUPPORTED_SYNTHESIZERS.includes(normalized)) return normalized;
96
+ process.stderr.write(
97
+ `[greedysearch] Warning: unknown synthesizer "${config.synthesizer}" in ${CONFIG_FILE}\n` +
98
+ `[greedysearch] Available synthesizers: ${SUPPORTED_SYNTHESIZERS.join(", ")}\n` +
99
+ `[greedysearch] Falling back to default: ${DEFAULT_SYNTHESIZER}\n`,
100
+ );
101
+ }
102
+ }
103
+ } catch {
104
+ // Ignore parse/read errors — fall through to default
105
+ }
106
+ return DEFAULT_SYNTHESIZER;
107
+ }
108
+
109
+ export const ENGINE_DOMAINS = {
110
+ perplexity: "perplexity.ai",
111
+ bing: "copilot.microsoft.com",
112
+ google: "google.com",
113
+ gemini: "gemini.google.com",
114
+ chatgpt: "chatgpt.com",
115
+ "semantic-scholar": "semanticscholar.org",
116
+ semanticscholar: "semanticscholar.org",
117
+ s2: "semanticscholar.org",
118
+ logically: "logically.app",
119
+ };
120
+
121
+ export const ENGINES = {
122
+ perplexity: "perplexity.mjs",
123
+ p: "perplexity.mjs",
124
+ bing: "bing-copilot.mjs",
125
+ b: "bing-copilot.mjs",
126
+ google: "google-ai.mjs",
127
+ g: "google-ai.mjs",
128
+ gemini: "gemini.mjs",
129
+ gem: "gemini.mjs",
130
+ chatgpt: "chatgpt.mjs",
131
+ gpt: "chatgpt.mjs",
132
+ "semantic-scholar": "semantic-scholar.mjs",
133
+ semanticscholar: "semantic-scholar.mjs",
134
+ s2: "semantic-scholar.mjs",
135
+ logically: "logically.mjs",
136
+ log: "logically.mjs",
137
+ };
138
+
139
+ // ALL_ENGINES drives the "all" fan-out. Edit ~/.pi/greedyconfig to customize.
140
+ export const ALL_ENGINES = loadUserEngines();
141
+
142
+ // Research child searches intentionally reuse the normal configured fan-out.
143
+ // Gemini remains the research planner/final-report synthesizer.
144
+ export const RESEARCH_ENGINES = ALL_ENGINES;
145
+
146
+ // SYNTHESIZER drives optional all-search synthesis. Edit ~/.pi/greedyconfig to customize.
147
+ export const SYNTHESIZER = loadUserSynthesizer();
148
+
149
+ export const SOURCE_FETCH_CONCURRENCY = Math.max(
150
+ 1,
151
+ Number.parseInt(process.env.GREEDY_FETCH_CONCURRENCY || "5", 10) || 5,
152
+ );
153
+
154
+ // Tell cdp.mjs to prefer the GreedySearch Chrome profile's DevToolsActivePort
155
+ process.env.CDP_PROFILE_DIR = GREEDY_PROFILE_DIR;
@@ -1,76 +1,114 @@
1
- // src/search/engines.mjs — Extractor runner
2
- //
3
- // Engine map lives in constants.mjs; this module re-exports it for
4
- // backward compatibility and provides the runExtractor() function.
5
-
6
- import { spawn } from "node:child_process";
7
- import { join } from "node:path";
8
- import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";
9
-
10
- export { ENGINES };
11
-
12
- const __dir =
13
- import.meta.dirname ||
14
- new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
15
-
16
- export function runExtractor(
17
- script,
18
- query,
19
- tabPrefix = null,
20
- short = false,
21
- timeoutMs = null,
22
- locale = null,
23
- ) {
24
- // Gemini synthesis: 70s budget (45s stream + ~25s nav/settle overhead)
25
- // Other engines: 60s budget
26
- if (timeoutMs === null) {
27
- timeoutMs = script.includes("gemini") ? 70000 : 60000;
28
- }
29
- const extraArgs = [
30
- ...(tabPrefix ? ["--tab", tabPrefix] : []),
31
- ...(short ? ["--short"] : []),
32
- ...(locale ? ["--locale", locale] : []),
33
- ];
34
- return new Promise((resolve, reject) => {
35
- const proc = spawn(
36
- process.execPath,
37
- [join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
38
- {
39
- stdio: ["pipe", "pipe", "pipe"],
40
- env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
41
- },
42
- );
43
- // Pipe query via stdin to avoid leaking it in process table command-line
44
- proc.stdin.write(query);
45
- proc.stdin.end();
46
- let out = "";
47
- let err = "";
48
- proc.stdout.on("data", (d) => (out += d));
49
- proc.stderr.on("data", (d) => (err += d));
50
- const t = setTimeout(() => {
51
- proc.kill();
52
- reject(new Error(`${script} timed out after ${timeoutMs / 1000}s`));
53
- }, timeoutMs);
54
- proc.on("close", (code) => {
55
- clearTimeout(t);
56
- if (code === 0) {
57
- try {
58
- resolve(JSON.parse(out.trim()));
59
- } catch {
60
- reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
61
- }
62
- } else {
63
- // Try to parse structured error envelope from stdout before falling back
64
- let envelope = null;
65
- try {
66
- const parsed = JSON.parse(out.trim());
67
- if (parsed._envelope) envelope = parsed._envelope;
68
- } catch {}
69
- const msg = err.trim() || `extractor exit ${code}`;
70
- const errObj = new Error(msg);
71
- if (envelope) errObj.envelope = envelope;
72
- reject(errObj);
73
- }
74
- });
75
- });
76
- }
1
+ // src/search/engines.mjs — Extractor runner
2
+ //
3
+ // Engine map lives in constants.mjs; this module re-exports it for
4
+ // backward compatibility and provides the runExtractor() function.
5
+
6
+ import { spawn } from "node:child_process";
7
+ import { join } from "node:path";
8
+ import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";
9
+
10
+ export { ENGINES };
11
+
12
+ const __dir =
13
+ import.meta.dirname ||
14
+ new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
15
+
16
+ export function runExtractor(
17
+ script,
18
+ query,
19
+ tabPrefix = null,
20
+ short = false,
21
+ timeoutMs = null,
22
+ locale = null,
23
+ ) {
24
+ // Gemini synthesis: 70s budget (45s stream + ~25s nav/settle overhead)
25
+ // ChatGPT: 80s budget (30s in-page stream wait plus a 35s node-side fallback).
26
+ // Logically: 120s budget (research answers can run academic + web searches before streaming).
27
+ // Other engines: 60s budget
28
+ if (timeoutMs === null) {
29
+ timeoutMs = script.includes("logically")
30
+ ? 120000
31
+ : script.includes("chatgpt")
32
+ ? 80000
33
+ : script.includes("gemini")
34
+ ? 70000
35
+ : 60000;
36
+ }
37
+ const extraArgs = [
38
+ ...(tabPrefix ? ["--tab", tabPrefix] : []),
39
+ ...(short ? ["--short"] : []),
40
+ ...(locale ? ["--locale", locale] : []),
41
+ ];
42
+ return new Promise((resolve, reject) => {
43
+ const proc = spawn(
44
+ process.execPath,
45
+ [join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
46
+ {
47
+ stdio: ["pipe", "pipe", "pipe"],
48
+ env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
49
+ },
50
+ );
51
+ // Pipe query via stdin to avoid leaking it in process table command-line
52
+ proc.stdin.write(query);
53
+ proc.stdin.end();
54
+ let out = "";
55
+ let err = "";
56
+ proc.stdout.on("data", (d) => (out += d));
57
+ // Forward child stderr to parent so [engine] stage: lines are visible
58
+ // in real time. Also retain the buffer for the timeout diagnostic path.
59
+ proc.stderr.on("data", (d) => {
60
+ err += d;
61
+ if (process.env.GREEDY_SEARCH_CHILD_STDERR !== "0") {
62
+ process.stderr.write(d);
63
+ }
64
+ });
65
+ const t = setTimeout(() => {
66
+ proc.kill();
67
+ // Surface as much diagnostic info as the killed child produced so the
68
+ // caller can see *which stage* the extractor was in. handleError()
69
+ // emits `{ _envelope, error }` JSON to stdout on graceful failure,
70
+ // but a hard kill discards whatever was buffered, so we also attach the stdout/stderr tails captured so far.
71
+ const tailLines = (s, n = 20) =>
72
+ String(s ?? "")
73
+ .split(/\r?\n/)
74
+ .filter(Boolean)
75
+ .slice(-n)
76
+ .join("\n");
77
+ let envelope = null;
78
+ try {
79
+ const parsed = JSON.parse(out.trim());
80
+ if (parsed._envelope) envelope = parsed._envelope;
81
+ } catch {}
82
+ const errObj = new Error(
83
+ `${script} timed out after ${timeoutMs / 1000}s` +
84
+ (envelope?.lastStage ? ` (last stage: ${envelope.lastStage})` : ""),
85
+ );
86
+ errObj.engineScript = script;
87
+ errObj.lastStage = envelope?.lastStage || null;
88
+ errObj.partialErr = tailLines(err);
89
+ errObj.partialOut = tailLines(out);
90
+ reject(errObj);
91
+ }, timeoutMs);
92
+ proc.on("close", (code) => {
93
+ clearTimeout(t);
94
+ if (code === 0) {
95
+ try {
96
+ resolve(JSON.parse(out.trim()));
97
+ } catch {
98
+ reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
99
+ }
100
+ } else {
101
+ // Try to parse structured error envelope from stdout before falling back
102
+ let envelope = null;
103
+ try {
104
+ const parsed = JSON.parse(out.trim());
105
+ if (parsed._envelope) envelope = parsed._envelope;
106
+ } catch {}
107
+ const msg = err.trim() || `extractor exit ${code}`;
108
+ const errObj = new Error(msg);
109
+ if (envelope) errObj.envelope = envelope;
110
+ reject(errObj);
111
+ }
112
+ });
113
+ });
114
+ }