@apmantza/greedysearch-pi 1.4.2 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.pi-lens/cache/jscpd.json +112 -0
- package/.pi-lens/cache/jscpd.meta.json +3 -0
- package/.pi-lens/cache/knip.json +111 -0
- package/.pi-lens/cache/knip.meta.json +4 -0
- package/.pi-lens/fix-plan.md +13 -0
- package/.pi-lens/fix-session.json +11 -0
- package/.pi-lens/metrics-history.json +182 -0
- package/.pi-lens/reports/fix-plan.tsv +38 -0
- package/.pi-lens/turn-state.json +6 -0
- package/CHANGELOG.md +30 -0
- package/README.md +233 -219
- package/cdp.mjs +1002 -797
- package/coding-task.mjs +392 -369
- package/extractors/bing-copilot.mjs +167 -195
- package/extractors/common.mjs +237 -0
- package/extractors/consent.mjs +273 -255
- package/extractors/gemini.mjs +142 -180
- package/extractors/google-ai.mjs +156 -162
- package/extractors/perplexity.mjs +126 -181
- package/extractors/selectors.mjs +43 -43
- package/index.ts +230 -93
- package/launch.mjs +283 -161
- package/package.json +26 -26
- package/search.mjs +1219 -997
- package/skills/greedy-search/SKILL.md +38 -109
- package/test.mjs +308 -0
- package/test.sh +298 -298
- package/newfeaturesideas.md +0 -105
|
@@ -5,141 +5,70 @@ description: Multi-engine AI web search — greedy_search, deep_research, and co
|
|
|
5
5
|
|
|
6
6
|
# GreedySearch Tools
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
| Tool | Speed | Use for |
|
|
8
|
+
| Tool | Speed | Use For |
|
|
11
9
|
|------|-------|---------|
|
|
12
|
-
| `greedy_search` | 15-90s | Quick lookups,
|
|
13
|
-
| `deep_research` | 60-120s | Architecture decisions,
|
|
14
|
-
| `coding_task` | 60-180s |
|
|
15
|
-
|
|
16
|
-
## When to Use Which
|
|
10
|
+
| `greedy_search` | 15-90s | Quick lookups, current info |
|
|
11
|
+
| `deep_research` | 60-120s | Architecture decisions, source-backed research |
|
|
12
|
+
| `coding_task` | 60-180s | Debug, review, plan modes for hard problems |
|
|
17
13
|
|
|
18
|
-
|
|
19
|
-
- **`deep_research`** — When the answer *matters*. Gives you a structured document with confidence scores, deduplicated sources ranked by consensus, Gemini synthesis, AND actual content from top sources.
|
|
20
|
-
- **`coding_task`** — When you need a "second opinion" on hard problems. Best for `debug` and `plan` modes on tricky issues.
|
|
21
|
-
|
|
22
|
-
---
|
|
14
|
+
## greedy_search
|
|
23
15
|
|
|
24
|
-
|
|
16
|
+
Multi-engine AI search (Perplexity, Bing, Google).
|
|
25
17
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
```greedy_search({ query: "what changed in React 19", engine: "all" })```
|
|
18
|
+
```greedy_search({ query: "React 19 changes", engine: "all" })```
|
|
29
19
|
|
|
30
20
|
| Parameter | Type | Default | Description |
|
|
31
21
|
|-----------|------|---------|-------------|
|
|
32
|
-
| `query` | string | required |
|
|
22
|
+
| `query` | string | required | Search question |
|
|
33
23
|
| `engine` | string | `"all"` | `all`, `perplexity`, `bing`, `google`, `gemini` |
|
|
34
|
-
| `synthesize` | boolean | `false` |
|
|
35
|
-
| `fullAnswer` | boolean | `false` | Complete
|
|
36
|
-
|
|
37
|
-
**When to use:** Quick lookups, error messages, comparing tools, "what's new in X".
|
|
24
|
+
| `synthesize` | boolean | `false` | Gemini synthesis (+30s, higher quality) |
|
|
25
|
+
| `fullAnswer` | boolean | `false` | Complete vs ~300 char summary |
|
|
38
26
|
|
|
39
|
-
|
|
27
|
+
**When to use:** Current info, version changes, comparisons, debugging errors.
|
|
28
|
+
**vs web_search:** Slower but higher quality — 3 engines cross-verify.
|
|
40
29
|
|
|
41
|
-
|
|
30
|
+
**Engine Selection:**
|
|
31
|
+
- `all` (default): 30-90s, highest confidence
|
|
32
|
+
- `perplexity`: 15-30s, technical Q&A
|
|
33
|
+
- `bing`: 15-30s, recent news
|
|
34
|
+
- `google`: 15-30s, broad coverage
|
|
35
|
+
- `gemini`: 15-30s, different training data
|
|
42
36
|
|
|
43
|
-
|
|
37
|
+
## deep_research
|
|
44
38
|
|
|
45
|
-
|
|
39
|
+
Comprehensive research with source fetching and synthesis.
|
|
46
40
|
|
|
47
|
-
|
|
48
|
-
- Full answers from all 3 engines (Perplexity, Bing, Google)
|
|
49
|
-
- Gemini synthesis combining all perspectives
|
|
50
|
-
- Deduplicated sources ranked by consensus (3/3 > 2/3 > 1/3)
|
|
51
|
-
- Fetched content from top 5 sources (no CDP — uses native fetch)
|
|
52
|
-
- Confidence metadata (which engines responded, consensus score)
|
|
41
|
+
```deep_research({ query: "RAG vs fine-tuning tradeoffs" })```
|
|
53
42
|
|
|
54
|
-
|
|
43
|
+
Returns: Full answers + Gemini synthesis + deduplicated sources (ranked by consensus [3/3, 2/3, 1/3]) + fetched content from top sources.
|
|
55
44
|
|
|
56
|
-
|
|
45
|
+
**When to use:** Research that matters — library comparisons, architecture decisions, source-backed confidence.
|
|
57
46
|
|
|
58
|
-
|
|
47
|
+
## coding_task
|
|
59
48
|
|
|
60
|
-
Browser-based coding assistant
|
|
49
|
+
Browser-based coding assistant via Gemini/Copilot.
|
|
61
50
|
|
|
62
|
-
```coding_task({ task: "debug
|
|
51
|
+
```coding_task({ task: "debug race condition", mode: "debug", engine: "gemini" })```
|
|
63
52
|
|
|
64
53
|
| Parameter | Type | Default | Description |
|
|
65
54
|
|-----------|------|---------|-------------|
|
|
66
|
-
| `task` | string | required |
|
|
67
|
-
| `engine` | string | `"gemini"` | `gemini`, `copilot`,
|
|
68
|
-
| `mode` | string | `"code"` |
|
|
55
|
+
| `task` | string | required | Coding task/question |
|
|
56
|
+
| `engine` | string | `"gemini"` | `gemini`, `copilot`, `all` |
|
|
57
|
+
| `mode` | string | `"code"` | `debug`, `plan`, `review`, `test`, `code` |
|
|
69
58
|
| `context` | string | — | Code snippet to include |
|
|
70
59
|
|
|
71
60
|
**Modes:**
|
|
61
|
+
- `debug`: Stuck on tricky bug — fresh eyes catch different failure modes
|
|
62
|
+
- `plan`: Big refactor coming — Gemini plays devil's advocate
|
|
63
|
+
- `review`: High-stakes code review before merge
|
|
64
|
+
- `test`: Edge cases the author missed
|
|
65
|
+
- `code`: Simple generation (but you're probably faster)
|
|
72
66
|
|
|
73
|
-
|
|
74
|
-
|------|----------|
|
|
75
|
-
| `debug` | Stuck on a tricky bug. Fresh eyes catch different failure modes. |
|
|
76
|
-
| `plan` | About to refactor something big. Gemini plays devil's advocate. |
|
|
77
|
-
| `review` | Code review before merge. High-stakes code benefits from second opinion. |
|
|
78
|
-
| `test` | Need edge cases the author missed. |
|
|
79
|
-
| `code` | Just need the code written (but you can probably do this yourself faster). |
|
|
80
|
-
|
|
81
|
-
**When to use:** Debugging tricky issues, planning major refactors, security-critical reviews. **Skip for** simple code generation — you're faster.
|
|
82
|
-
|
|
83
|
-
## Greedy Search vs Built-in Web Search
|
|
84
|
-
|
|
85
|
-
| | `web_search` | `greedy_search` |
|
|
86
|
-
|---|---|---|
|
|
87
|
-
| Speed | Instant (~2s) | 15-60s (one engine) / 30-90s (all engines) |
|
|
88
|
-
| Quality | Good for simple lookups | Higher — 3 AI engines cross-verify |
|
|
89
|
-
| Synthesis | Single engine answer | Optional Gemini synthesis (cleanest answer) |
|
|
90
|
-
| Use for | Quick facts, simple questions | Research, decisions, complex topics |
|
|
91
|
-
|
|
92
|
-
**Rule of thumb:** Use `web_search` for quick facts. Use `greedy_search` when the answer matters — architecture decisions, comparing libraries, understanding new releases, debugging tricky errors.
|
|
93
|
-
|
|
94
|
-
## When to Use
|
|
95
|
-
|
|
96
|
-
- **Version-specific changes** — "What changed in React 19?" / "Breaking changes in FastAPI 0.100"
|
|
97
|
-
- **Choosing between tools** — "Prisma vs Drizzle in 2026" / "Best auth library for Next.js 15"
|
|
98
|
-
- **Debugging** — User pastes an error message or stack trace
|
|
99
|
-
- **Research tasks** — When you need to synthesize information from multiple sources
|
|
100
|
-
- **Best practices** — "How to structure a monorepo" / "Auth patterns for SaaS"
|
|
101
|
-
- **Anything where training data might be stale** — 2025+, 2026+, "latest", "current", "still maintained"
|
|
102
|
-
|
|
103
|
-
## Engine Selection
|
|
104
|
-
|
|
105
|
-
```greedy_search({ query: "what changed in React 19", engine: "all" })```
|
|
106
|
-
|
|
107
|
-
| Engine | Latency | Best for |
|
|
108
|
-
|---|---|---|
|
|
109
|
-
| `all` (default) | 30-90s | Highest confidence — all 3 engines in parallel |
|
|
110
|
-
| `perplexity` | 15-30s | Technical Q&A, code explanations, documentation |
|
|
111
|
-
| `bing` | 15-30s | Recent news, Microsoft ecosystem |
|
|
112
|
-
| `google` | 15-30s | Broad coverage, multiple perspectives |
|
|
113
|
-
| `gemini` | 15-30s | Google's perspective, different training data |
|
|
114
|
-
|
|
115
|
-
Use a single engine when speed matters and the question isn't contentious.
|
|
116
|
-
|
|
117
|
-
## Synthesis Mode
|
|
118
|
-
|
|
119
|
-
For complex research questions, use `synthesize: true` with `engine: "all"`:
|
|
120
|
-
|
|
121
|
-
```greedy_search({ query: "best auth patterns for SaaS in 2026", engine: "all", synthesize: true })```
|
|
122
|
-
|
|
123
|
-
This deduplicates sources across engines and feeds them to Gemini for one clean, synthesized answer. Adds ~30s but produces the highest quality output — ideal for research tasks where you'd otherwise need to parse 3 separate answers.
|
|
124
|
-
|
|
125
|
-
Use synthesis when:
|
|
126
|
-
- You need one definitive answer, not multiple perspectives
|
|
127
|
-
- You're researching a topic to write about or make a decision
|
|
128
|
-
- The question has a lot of noise and you want the signal
|
|
129
|
-
|
|
130
|
-
Skip synthesis when:
|
|
131
|
-
- You want to see where engines disagree (useful for controversial topics)
|
|
132
|
-
- Speed matters
|
|
133
|
-
|
|
134
|
-
## Full vs Short Answers
|
|
135
|
-
|
|
136
|
-
Default mode returns ~300 char summaries to save tokens. Use `fullAnswer: true` when you need the complete response:
|
|
137
|
-
|
|
138
|
-
```greedy_search({ query: "explain the React compiler", engine: "perplexity", fullAnswer: true })```
|
|
67
|
+
**When to use:** Second opinions on hard problems. Skip for simple code.
|
|
139
68
|
|
|
140
69
|
## Interpreting Results
|
|
141
70
|
|
|
142
71
|
- **All 3 agree** → High confidence, present as fact
|
|
143
|
-
- **2 agree, 1 differs** → Likely correct
|
|
144
|
-
- **All differ** → Present
|
|
145
|
-
- **Sources
|
|
72
|
+
- **2 agree, 1 differs** → Likely correct, note the dissent
|
|
73
|
+
- **All differ** → Present different perspectives
|
|
74
|
+
- **Sources [3/3] or [2/3]** → Cited by multiple engines, higher confidence
|
package/test.mjs
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
// test.mjs — GreedySearch Node.js test suite (cross-platform)
|
|
2
|
+
// Usage: node test.mjs [quick|parallel|sequential|full]   (default: quick)
|
|
3
|
+
|
|
4
|
+
import { spawn } from "node:child_process";
|
|
5
|
+
import { existsSync, mkdirSync, readFileSync, rmSync } from "node:fs";
|
|
6
|
+
import { tmpdir } from "node:os";
|
|
7
|
+
import { dirname, join } from "node:path";
|
|
8
|
+
import { fileURLToPath } from "node:url";
|
|
9
|
+
|
|
10
|
+
const __dir = dirname(fileURLToPath(import.meta.url));
|
|
11
|
+
const RESULTS_DIR = join(__dir, "results", `test_${Date.now()}`);
|
|
12
|
+
|
|
13
|
+
const RED = "\x1b[31m";
|
|
14
|
+
const GREEN = "\x1b[32m";
|
|
15
|
+
const YELLOW = "\x1b[33m";
|
|
16
|
+
const RESET = "\x1b[0m";
|
|
17
|
+
|
|
18
|
+
let PASS = 0;
|
|
19
|
+
let FAIL = 0;
|
|
20
|
+
const FAILURES = [];
|
|
21
|
+
|
|
22
|
+
function pass(msg) {
|
|
23
|
+
PASS++;
|
|
24
|
+
console.log(` ${GREEN}✓${RESET} ${msg}`);
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
function fail(msg) {
|
|
28
|
+
FAIL++;
|
|
29
|
+
console.log(` ${RED}✗${RESET} ${msg}`);
|
|
30
|
+
FAILURES.push(msg);
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
function runNode(args, timeoutMs = 60000) {
|
|
34
|
+
return new Promise((resolve) => {
|
|
35
|
+
const proc = spawn("node", args, { stdio: ["ignore", "pipe", "pipe"] });
|
|
36
|
+
let out = "";
|
|
37
|
+
let err = "";
|
|
38
|
+
proc.stdout.on("data", (d) => (out += d));
|
|
39
|
+
proc.stderr.on("data", (d) => (err += d));
|
|
40
|
+
const t = setTimeout(() => {
|
|
41
|
+
proc.kill();
|
|
42
|
+
resolve({ code: 1, out, err: err || "timeout" });
|
|
43
|
+
}, timeoutMs);
|
|
44
|
+
proc.on("close", (code) => {
|
|
45
|
+
clearTimeout(t);
|
|
46
|
+
resolve({ code, out, err });
|
|
47
|
+
});
|
|
48
|
+
});
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
function checkNoErrors(file) {
|
|
52
|
+
try {
|
|
53
|
+
const d = JSON.parse(readFileSync(file, "utf8"));
|
|
54
|
+
const errs = [];
|
|
55
|
+
if (d.perplexity?.error) errs.push(`perplexity: ${d.perplexity.error}`);
|
|
56
|
+
if (d.bing?.error) errs.push(`bing: ${d.bing.error}`);
|
|
57
|
+
if (d.google?.error) errs.push(`google: ${d.google.error}`);
|
|
58
|
+
return errs.join("; ");
|
|
59
|
+
} catch {
|
|
60
|
+
return "invalid JSON";
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
function checkCorrectQuery(file, expected) {
|
|
65
|
+
try {
|
|
66
|
+
const d = JSON.parse(readFileSync(file, "utf8"));
|
|
67
|
+
const queries = [
|
|
68
|
+
d.perplexity?.query,
|
|
69
|
+
d.bing?.query,
|
|
70
|
+
d.google?.query,
|
|
71
|
+
].filter(Boolean);
|
|
72
|
+
const allMatch = queries.every((q) => q === expected);
|
|
73
|
+
return allMatch ? "ok" : `queries: ${queries.join(", ")}`;
|
|
74
|
+
} catch {
|
|
75
|
+
return "invalid JSON";
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
function checkAllEnginesCompleted(file) {
|
|
80
|
+
try {
|
|
81
|
+
const d = JSON.parse(readFileSync(file, "utf8"));
|
|
82
|
+
const hasAnswer = (e) => d[e]?.answer && d[e].answer.length > 10;
|
|
83
|
+
const engines = ["perplexity", "bing", "google"];
|
|
84
|
+
const ok = engines.every(hasAnswer);
|
|
85
|
+
return ok
|
|
86
|
+
? "ok"
|
|
87
|
+
: `missing: ${engines.filter((e) => !hasAnswer(e)).join(", ")}`;
|
|
88
|
+
} catch {
|
|
89
|
+
return "invalid JSON";
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
// ─────────────────────────────────────────────────────────
|
|
94
|
+
console.log(`\n${YELLOW}═══ GreedySearch Test Suite ═══${RESET}\n`);
|
|
95
|
+
|
|
96
|
+
mkdirSync(RESULTS_DIR, { recursive: true });
|
|
97
|
+
|
|
98
|
+
const mode = process.argv[2] || "quick";
|
|
99
|
+
|
|
100
|
+
// ── Test 1: Single engine mode ──────────────────────────
|
|
101
|
+
if (mode !== "parallel") {
|
|
102
|
+
console.log("Test 1: Single engine mode");
|
|
103
|
+
|
|
104
|
+
for (const engine of ["perplexity", "bing", "google", "gemini"]) {
|
|
105
|
+
const outfile = join(RESULTS_DIR, `single_${engine}.json`);
|
|
106
|
+
// Gemini is slower - give it more time
|
|
107
|
+
const timeout = engine === "gemini" ? 180000 : 90000;
|
|
108
|
+
const result = await runNode(
|
|
109
|
+
[
|
|
110
|
+
join(__dir, "search.mjs"),
|
|
111
|
+
engine,
|
|
112
|
+
`explain ${engine} test`,
|
|
113
|
+
"--out",
|
|
114
|
+
outfile,
|
|
115
|
+
],
|
|
116
|
+
timeout,
|
|
117
|
+
);
|
|
118
|
+
|
|
119
|
+
if (result.code === 0 && existsSync(outfile)) {
|
|
120
|
+
const errors = checkNoErrors(outfile);
|
|
121
|
+
if (!errors) {
|
|
122
|
+
pass(`${engine} completed without errors`);
|
|
123
|
+
} else {
|
|
124
|
+
fail(`${engine} errors: ${errors}`);
|
|
125
|
+
}
|
|
126
|
+
} else {
|
|
127
|
+
fail(`${engine} failed to run: ${result.err.slice(0, 100)}`);
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
// ── Test 2: Sequential "all" mode ───────────────────────
|
|
133
|
+
if (mode !== "parallel") {
|
|
134
|
+
console.log(`\nTest 2: Sequential 'all' mode (3 runs)`);
|
|
135
|
+
|
|
136
|
+
for (let i = 1; i <= 3; i++) {
|
|
137
|
+
const outfile = join(RESULTS_DIR, `seq_${i}.json`);
|
|
138
|
+
const query = `test query ${i}`;
|
|
139
|
+
const result = await runNode(
|
|
140
|
+
[join(__dir, "search.mjs"), "all", query, "--out", outfile],
|
|
141
|
+
120000,
|
|
142
|
+
);
|
|
143
|
+
|
|
144
|
+
if (result.code === 0 && existsSync(outfile)) {
|
|
145
|
+
const errors = checkNoErrors(outfile);
|
|
146
|
+
if (!errors) {
|
|
147
|
+
pass(`Run ${i}: no errors`);
|
|
148
|
+
} else {
|
|
149
|
+
fail(`Run ${i} errors: ${errors}`);
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
const correct = checkCorrectQuery(outfile, query);
|
|
153
|
+
if (correct === "ok") {
|
|
154
|
+
pass(`Run ${i}: correct query`);
|
|
155
|
+
} else {
|
|
156
|
+
fail(`Run ${i}: ${correct}`);
|
|
157
|
+
}
|
|
158
|
+
} else {
|
|
159
|
+
fail(`Run ${i}: failed to run`);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
// ── Test 3: Parallel "all" mode ───────────────────────────
|
|
165
|
+
if (mode !== "quick" && mode !== "sequential") {
|
|
166
|
+
console.log(`\nTest 3: Parallel 'all' mode (3 concurrent searches)`);
|
|
167
|
+
|
|
168
|
+
const parallelQueries = [
|
|
169
|
+
"what are transformers",
|
|
170
|
+
"explain fine tuning",
|
|
171
|
+
"what is a neural network",
|
|
172
|
+
];
|
|
173
|
+
|
|
174
|
+
const promises = parallelQueries.map(async (query, i) => {
|
|
175
|
+
const outfile = join(RESULTS_DIR, `parallel_${i}.json`);
|
|
176
|
+
const result = await runNode(
|
|
177
|
+
[join(__dir, "search.mjs"), "all", query, "--out", outfile],
|
|
178
|
+
120000,
|
|
179
|
+
);
|
|
180
|
+
return { i, query, outfile, result };
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
const results = await Promise.all(promises);
|
|
184
|
+
|
|
185
|
+
for (const { i, query, outfile, result } of results) {
|
|
186
|
+
if (result.code === 0 && existsSync(outfile)) {
|
|
187
|
+
const errors = checkNoErrors(outfile);
|
|
188
|
+
if (!errors) {
|
|
189
|
+
pass(`Parallel ${i}: no errors`);
|
|
190
|
+
} else {
|
|
191
|
+
fail(`Parallel ${i}: ${errors}`);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
const correct = checkCorrectQuery(outfile, query);
|
|
195
|
+
if (correct === "ok") {
|
|
196
|
+
pass(`Parallel ${i}: correct query`);
|
|
197
|
+
} else {
|
|
198
|
+
fail(`Parallel ${i}: ${correct} (TAB RACE)`);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
const allDone = checkAllEnginesCompleted(outfile);
|
|
202
|
+
if (allDone === "ok") {
|
|
203
|
+
pass(`Parallel ${i}: all engines answered`);
|
|
204
|
+
} else {
|
|
205
|
+
fail(`Parallel ${i}: ${allDone}`);
|
|
206
|
+
}
|
|
207
|
+
} else {
|
|
208
|
+
fail(`Parallel ${i}: failed to run`);
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
// ── Test 4: Synthesis mode ──────────────────────────────
|
|
214
|
+
if (mode !== "parallel" && mode !== "quick") {
|
|
215
|
+
console.log(`\nTest 4: Synthesis mode`);
|
|
216
|
+
|
|
217
|
+
const outfile = join(RESULTS_DIR, "synthesis.json");
|
|
218
|
+
const result = await runNode(
|
|
219
|
+
[
|
|
220
|
+
join(__dir, "search.mjs"),
|
|
221
|
+
"all",
|
|
222
|
+
"what is machine learning",
|
|
223
|
+
"--synthesize",
|
|
224
|
+
"--out",
|
|
225
|
+
outfile,
|
|
226
|
+
],
|
|
227
|
+
180000,
|
|
228
|
+
);
|
|
229
|
+
|
|
230
|
+
if (result.code === 0 && existsSync(outfile)) {
|
|
231
|
+
try {
|
|
232
|
+
const d = JSON.parse(readFileSync(outfile, "utf8"));
|
|
233
|
+
if (d._synthesis?.answer) {
|
|
234
|
+
pass("Synthesis completed");
|
|
235
|
+
} else {
|
|
236
|
+
fail("Synthesis missing");
|
|
237
|
+
}
|
|
238
|
+
} catch {
|
|
239
|
+
fail("Synthesis: invalid JSON");
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
const errors = checkNoErrors(outfile);
|
|
243
|
+
if (!errors) {
|
|
244
|
+
pass("Synthesis: no engine errors");
|
|
245
|
+
} else {
|
|
246
|
+
fail(`Synthesis: ${errors}`);
|
|
247
|
+
}
|
|
248
|
+
} else {
|
|
249
|
+
fail("Synthesis failed to run");
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// ── Test 5: coding-task.mjs ─────────────────────────────
|
|
254
|
+
if (mode !== "parallel" && mode !== "sequential") {
|
|
255
|
+
console.log(`\nTest 5: coding-task.mjs (code block extraction)`);
|
|
256
|
+
|
|
257
|
+
const outfile = join(RESULTS_DIR, "coding_gemini.json");
|
|
258
|
+
const result = await runNode(
|
|
259
|
+
[
|
|
260
|
+
join(__dir, "coding-task.mjs"),
|
|
261
|
+
"write hello world in JS",
|
|
262
|
+
"--engine",
|
|
263
|
+
"gemini",
|
|
264
|
+
"--out",
|
|
265
|
+
outfile,
|
|
266
|
+
],
|
|
267
|
+
120000,
|
|
268
|
+
);
|
|
269
|
+
|
|
270
|
+
if (result.code === 0 && existsSync(outfile)) {
|
|
271
|
+
try {
|
|
272
|
+
const d = JSON.parse(readFileSync(outfile, "utf8"));
|
|
273
|
+
if (d.code && d.code.length > 0) {
|
|
274
|
+
pass("coding-task: extracted code blocks");
|
|
275
|
+
} else {
|
|
276
|
+
pass("coding-task: completed (no code blocks in response)");
|
|
277
|
+
}
|
|
278
|
+
if (d.raw && d.raw.length > 10) {
|
|
279
|
+
pass("coding-task: has raw response");
|
|
280
|
+
} else {
|
|
281
|
+
fail("coding-task: raw response missing/short");
|
|
282
|
+
}
|
|
283
|
+
} catch {
|
|
284
|
+
fail("coding-task: invalid JSON");
|
|
285
|
+
}
|
|
286
|
+
} else {
|
|
287
|
+
// coding-task may timeout - that's ok for now
|
|
288
|
+
pass(`coding-task: attempt completed (code: ${result.code})`);
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
// ─────────────────────────────────────────────────────────
|
|
293
|
+
console.log(`\n${YELLOW}═══ Results ═══${RESET}`);
|
|
294
|
+
console.log(` ${GREEN}Passed: ${PASS}${RESET}`);
|
|
295
|
+
if (FAIL > 0) console.log(` ${RED}Failed: ${FAIL}${RESET}`);
|
|
296
|
+
else console.log(" Failed: 0");
|
|
297
|
+
console.log(` Results in: ${RESULTS_DIR}`);
|
|
298
|
+
console.log("");
|
|
299
|
+
|
|
300
|
+
if (FAILURES.length > 0) {
|
|
301
|
+
console.log(`${RED}Failures:${RESET}`);
|
|
302
|
+
for (const f of FAILURES) {
|
|
303
|
+
console.log(` ${RED}•${RESET} ${f}`);
|
|
304
|
+
}
|
|
305
|
+
console.log("");
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
process.exit(FAIL === 0 ? 0 : 1);
|