@apmantza/greedysearch-pi 1.9.2 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +132 -2
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +318 -81
- package/extractors/bing-copilot.mjs +48 -18
- package/extractors/chatgpt.mjs +553 -0
- package/extractors/common.mjs +213 -22
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +182 -18
- package/extractors/gemini.mjs +350 -217
- package/extractors/google-ai.mjs +129 -128
- package/extractors/logically.mjs +629 -0
- package/extractors/perplexity.mjs +547 -217
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +8 -4
- package/skills/greedy-search/skill.md +20 -12
- package/src/fetcher.mjs +23 -1
- package/src/formatters/results.ts +185 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/challenge-detect.mjs +205 -0
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +155 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/progress.mjs +145 -0
- package/src/search/recovery.mjs +73 -45
- package/src/search/research.mjs +1419 -62
- package/src/search/scale-aware.mjs +93 -0
- package/src/search/simple-research.mjs +520 -0
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +129 -59
- package/src/tools/shared.ts +312 -186
- package/src/types.ts +110 -104
- package/test.mjs +537 -18
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
// src/search/pdf.mjs — PDF text extraction helpers
|
|
2
|
+
//
|
|
3
|
+
// Adapted from pi-webaio's PDF pipeline. `pdf-parse` is loaded lazily so the
|
|
4
|
+
// package remains importable even when optional native canvas bindings are not
|
|
5
|
+
// available. PDF extraction is only attempted for actual PDF source fetches.
|
|
6
|
+
|
|
7
|
+
/**
 * Install minimal stand-ins for the browser globals `DOMMatrix`, `ImageData`
 * and `Path2D` on `globalThis` when the runtime does not define them.
 * Globals that already exist are left untouched.
 *
 * NOTE(review): presumably these exist so `pdf-parse` can be loaded without
 * native canvas bindings — confirm against the library's requirements.
 */
function ensurePdfDomPolyfills() {
  // Every transform method is a no-op that returns the receiver for chaining.
  class DOMMatrix {
    constructor(_init = undefined) {}

    multiplySelf() {
      return this;
    }

    preMultiplySelf() {
      return this;
    }

    translateSelf() {
      return this;
    }

    scaleSelf() {
      return this;
    }

    rotateSelf() {
      return this;
    }
  }

  // Plain record of the constructor arguments; no pixel handling.
  class ImageData {
    constructor(data = undefined, width = 0, height = 0) {
      Object.assign(this, { data, width, height });
    }
  }

  class Path2D {
    constructor(_path = undefined) {}
  }

  const stubs = { DOMMatrix, ImageData, Path2D };
  for (const [globalName, stub] of Object.entries(stubs)) {
    if (typeof globalThis[globalName] === "undefined") {
      globalThis[globalName] = stub;
    }
  }
}
|
|
43
|
+
|
|
44
|
+
/**
 * Lazily import `pdf-parse` and return its parser constructor.
 *
 * The DOM stubs are installed first, then the module is imported on demand.
 * The named `PDFParse` export is preferred; the default export is the fallback.
 *
 * @returns {Promise<Function>} The parser constructor.
 * @throws {Error} When the module exposes neither `PDFParse` nor a default export.
 */
async function loadPdfParseCtor() {
  ensurePdfDomPolyfills();
  const { PDFParse, default: defaultExport } = await import("pdf-parse");
  const ctor = PDFParse ?? defaultExport;
  if (ctor) return ctor;
  throw new Error("pdf-parse did not export PDFParse");
}
|
|
51
|
+
|
|
52
|
+
/**
 * Derive a document title from the last path segment of a URL.
 * Falls back to "Document.pdf" when the URL cannot be parsed or has no
 * trailing segment, so a bad URL never discards extracted text.
 */
function pdfTitleFromUrl(url) {
  try {
    return new URL(url).pathname.split("/").pop() || "Document.pdf";
  } catch {
    return "Document.pdf";
  }
}

/**
 * Extract the text of a PDF and wrap it in a small markdown document.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer - Raw PDF bytes, passed to the
 *   parser as `new Uint8Array(buffer)`.
 * @param {string} url - Source URL; its last path segment becomes the title.
 * @returns {Promise<{title: string, content: string, pages: number} | {error: string} | null>}
 *   The extracted document, `null` when the PDF contains no text, or
 *   `{ error }` when loading or parsing fails. The promise never rejects.
 */
export async function extractPdfMarkdown(buffer, url) {
  let parser = null;
  try {
    const PDFParseCtor = await loadPdfParseCtor();
    parser = new PDFParseCtor({ data: new Uint8Array(buffer) });
    await parser.load();
    const data = await parser.getText();
    const text = data.text?.trim();
    if (!text) return null;
    return {
      title: pdfTitleFromUrl(url),
      content: `## PDF Content (${data.total} pages)\n\n${text}`,
      pages: data.total,
    };
  } catch (error) {
    // `error` may be any thrown value, including null/undefined.
    return { error: error?.message || String(error) };
  } finally {
    // Release the parser's resources on every path. Cleanup is best-effort:
    // a failing destroy() must not replace the result computed above.
    try {
      await parser?.destroy?.();
    } catch {
      /* ignore cleanup failure */
    }
  }
}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
// src/search/progress.mjs — Progress bar with ETA for long-running research
|
|
2
|
+
//
|
|
3
|
+
// Tracks per-action and per-round timing, prints a progress bar to stderr
|
|
4
|
+
// after each step with an ETA based on rolling average. Inspired by pi-webaio's
|
|
5
|
+
// streaming progress output.
|
|
6
|
+
//
|
|
7
|
+
// Usage:
|
|
8
|
+
// const tracker = createProgressTracker({ totalActions: 6, totalRounds: 2 });
|
|
9
|
+
// tracker.startRound(1);
|
|
10
|
+
// tracker.startAction('search', 'what is X');
|
|
11
|
+
// ... do work ...
|
|
12
|
+
// tracker.endAction();
|
|
13
|
+
// tracker.startAction('fetch', 'https://...');
|
|
14
|
+
// ... do work ...
|
|
15
|
+
// tracker.endAction();
|
|
16
|
+
// tracker.endRound();
|
|
17
|
+
// tracker.print(); // prints bar to stderr
|
|
18
|
+
|
|
19
|
+
// Default number of cells in the rendered progress bar (renderBar's `width` default).
const BAR_WIDTH = 20;
|
|
20
|
+
|
|
21
|
+
/**
 * Format a duration given in milliseconds as a short human-readable string
 * (e.g. "1m 23s", "45s", "0s").
 *
 * Anything below one second is reported as "0s"; larger values are rounded to
 * the nearest second. Non-finite input (NaN, ±Infinity, undefined) is also
 * reported as "0s" instead of producing "NaNm NaNs".
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} The formatted duration.
 */
function formatDuration(ms) {
  const value = Number(ms);
  if (!Number.isFinite(value) || value < 1000) return "0s";
  const totalSeconds = Math.round(value / 1000);
  if (totalSeconds < 60) return `${totalSeconds}s`;
  const minutes = Math.floor(totalSeconds / 60);
  const seconds = totalSeconds % 60;
  return `${minutes}m ${seconds}s`;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Render a fixed-width progress bar.
 * Example: renderBar(0.6, 20) → "[████████████░░░░░░░░]"
 *
 * @param {number} progress - Completed fraction. Values outside [0, 1] are
 *   clamped and non-numeric values count as 0, so the call never throws on a
 *   negative repeat count.
 * @param {number} [width=BAR_WIDTH] - Total number of cells; non-finite or
 *   non-positive widths render an empty bar ("[]").
 * @returns {string} The bar, enclosed in square brackets.
 */
function renderBar(progress, width = BAR_WIDTH) {
  const cells = Number.isFinite(width) && width > 0 ? Math.floor(width) : 0;
  const fraction = Math.min(1, Math.max(0, Number(progress) || 0));
  const filled = Math.round(fraction * cells);
  return `[${"█".repeat(filled)}${"░".repeat(cells - filled)}]`;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Create a progress tracker for a multi-step run.
 *
 * The tracker counts completed actions, fetches and rounds against the
 * expected totals and writes status lines to stderr in the form
 * `[greedysearch] <bar> <done>/<total> (<phase> <label>, ETA <eta>)`.
 * Lines are throttled to one per 500 ms, except the final "done" line which
 * is always written. The ETA is the average of the last five timed
 * actions/fetches multiplied by the number of remaining units.
 *
 * @param {object} [opts]
 * @param {number} [opts.totalActions=0] - Total expected actions across all rounds
 * @param {number} [opts.totalRounds=0] - Total expected rounds
 * @param {number} [opts.totalFetches=0] - Total expected source fetches
 * @param {boolean} [opts.silent=false] - Suppress stderr output (for tests)
 * @returns {{
 *   startRound: (n: number) => void,
 *   endRound: () => void,
 *   startAction: (type: string, label?: string) => void,
 *   endAction: () => void,
 *   startFetch: (label?: string) => void,
 *   endFetch: (ok?: boolean) => void,
 *   print: () => void,
 *   finish: () => void,
 *   getElapsedMs: () => number,
 * }} The tracker API.
 */
export function createProgressTracker({
  totalActions = 0,
  totalRounds = 0,
  totalFetches = 0,
  silent = false,
} = {}) {
  const MIN_PRINT_INTERVAL_MS = 500; // throttle to avoid spamming stderr
  const ROLLING_WINDOW_SIZE = 5; // recent durations averaged for the ETA
  const LABEL_MAX_LENGTH = 40;

  const startedAt = Date.now();
  let completedActions = 0;
  let completedRounds = 0;
  let completedFetches = 0;
  const recentDurations = []; // rolling window of recent action/fetch durations
  let currentStart = null;
  let currentLabel = null;
  let lastPrintAt = 0;

  function recordDuration(durationMs) {
    recentDurations.push(durationMs);
    if (recentDurations.length > ROLLING_WINDOW_SIZE) recentDurations.shift();
  }

  function averageDurationMs() {
    if (recentDurations.length === 0) return null;
    return recentDurations.reduce((sum, ms) => sum + ms, 0) / recentDurations.length;
  }

  function buildStatus(phase) {
    const total = totalActions + totalFetches + totalRounds;
    const done = completedActions + completedFetches + completedRounds;
    const progress = total > 0 ? Math.min(1, done / total) : 0;
    const remaining = Math.max(0, total - done);
    // NOTE(review): `remaining` also counts rounds, while the average only
    // covers timed actions/fetches, so the ETA may overestimate.
    const avg = averageDurationMs();
    const etaMs = avg ? avg * remaining : null;
    const eta = etaMs ? formatDuration(etaMs) : "—";
    const label = currentLabel ? ` ${currentLabel}` : "";
    return `${renderBar(progress)} ${done}/${total} (${phase}${label}, ETA ${eta})`;
  }

  function emit(phase) {
    if (silent) return;
    const now = Date.now();
    // The final "done" line bypasses the throttle so it is never dropped.
    if (now - lastPrintAt < MIN_PRINT_INTERVAL_MS && phase !== "done") return;
    lastPrintAt = now;
    process.stderr.write(`[greedysearch] ${buildStatus(phase)}\n`);
  }

  // Start timing a unit of work and announce it; `prefix` doubles as the phase.
  function beginTimed(prefix, label) {
    currentStart = Date.now();
    // String() keeps non-string labels (e.g. URL objects) from throwing on slice.
    currentLabel = `${prefix}:${String(label || "").slice(0, LABEL_MAX_LENGTH)}`;
    emit(prefix);
  }

  // Record the duration of the unit started by beginTimed, if one is pending.
  function stopTiming() {
    if (currentStart === null) return;
    recordDuration(Date.now() - currentStart);
    currentStart = null;
  }

  return {
    startRound(n) {
      completedRounds = n - 1; // will be incremented when endRound fires
    },
    endRound() {
      completedRounds++;
      emit("round");
    },
    startAction(type, label) {
      beginTimed(type, label);
    },
    endAction() {
      stopTiming();
      completedActions++;
      emit("action");
      // Drop the label once its completion has been reported, so later
      // "round"/"done" lines do not show a finished action as current.
      currentLabel = null;
    },
    startFetch(label) {
      beginTimed("fetch", label);
    },
    endFetch(ok = true) {
      stopTiming();
      completedFetches++;
      emit(ok ? "fetch" : "fetch-failed");
      currentLabel = null;
    },
    print() {
      emit("progress");
    },
    finish() {
      emit("done");
    },
    getElapsedMs() {
      return Date.now() - startedAt;
    },
  };
}
|
package/src/search/recovery.mjs
CHANGED
|
@@ -1,45 +1,73 @@
|
|
|
1
|
-
// src/search/recovery.mjs — Headless-block detection and visible recovery policy
|
|
2
|
-
|
|
3
|
-
// Only these engines use automatic headless → visible recovery. Google is
|
|
4
|
-
// intentionally excluded for now; see issue #9 discussion / maintainer choice.
|
|
5
|
-
export const HEADLESS_RECOVERY_ENGINES = [
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
1
|
+
// src/search/recovery.mjs — Headless-block detection and visible recovery policy
|
|
2
|
+
|
|
3
|
+
// Engines eligible for automatic headless → visible recovery. Google is
// deliberately left out for now; see issue #9 discussion / maintainer choice.
export const HEADLESS_RECOVERY_ENGINES = [
  "perplexity",
  "bing",
  "chatgpt",
  "semantic-scholar",
  "logically",
];

// `blockedBy` values for which a visible-mode retry is pointless: the result
// still has the "headless blocked" shape, but the block is not tied to the
// browser session, so recovery must NOT be triggered for them.
const NON_RECOVERABLE_BLOCKED_BY = new Set(["rate-limit"]);

// Error text that suggests the headless browser was blocked (case-insensitive).
const HEADLESS_BLOCKED_PATTERN =
  /timed out|timeout|verification|captcha|cloudflare|turnstile|input not found|ask-input|copy button hidden|sign.in|login required/i;

// Error text that suggests a human has to step in (case-insensitive).
const MANUAL_VERIFICATION_PATTERN =
  /needs-human|verification required|please solve|captcha|cloudflare|turnstile|could not be completed automatically|manual intervention|sign.in|login required/i;
|
|
24
|
+
|
|
25
|
+
/**
 * Tell whether an error's text matches HEADLESS_BLOCKED_PATTERN.
 *
 * @param {unknown} error - Error object or message; falsy values count as "".
 * @returns {boolean} True when the stringified error matches the pattern.
 */
export function isHeadlessBlockedError(error) {
  const message = String(error || "");
  return HEADLESS_BLOCKED_PATTERN.test(message);
}
|
|
28
|
+
|
|
29
|
+
/**
 * Tell whether an error's text matches MANUAL_VERIFICATION_PATTERN.
 *
 * @param {unknown} error - Error object or message; falsy values count as "".
 * @returns {boolean} True when the stringified error matches the pattern.
 */
export function isManualVerificationError(error) {
  const message = String(error || "");
  return MANUAL_VERIFICATION_PATTERN.test(message);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Tell whether a `blockedBy` value is listed in NON_RECOVERABLE_BLOCKED_BY,
 * i.e. a visible-mode retry would not help.
 *
 * @param {string} blockedBy - The `blockedBy` value to look up.
 * @returns {boolean} True when the value is in the non-recoverable set.
 */
export function isNonRecoverableBlockedBy(blockedBy) {
  const nonRecoverable = NON_RECOVERABLE_BLOCKED_BY.has(blockedBy);
  return nonRecoverable;
}
|
|
39
|
+
|
|
40
|
+
/**
 * List the engines from HEADLESS_RECOVERY_ENGINES whose result looks like a
 * headless block.
 *
 * A result counts as blocked when its `_envelope.blockedBy` is set and is not
 * in NON_RECOVERABLE_BLOCKED_BY, when `_envelope.verificationResult` is
 * "needs-human", or, failing both, when its `error` matches
 * isHeadlessBlockedError.
 *
 * @param {object} [resultsByEngine] - Results keyed by engine name.
 * @returns {string[]} Blocked engines, in HEADLESS_RECOVERY_ENGINES order.
 */
export function findHeadlessBlockedEngines(resultsByEngine) {
  const looksBlocked = (result) => {
    if (!result) return false;

    // Structured envelope first: no regex needed when it is present.
    const envelope = result._envelope;
    const blockedBy = envelope?.blockedBy;
    if (blockedBy) {
      // A non-recoverable block (e.g. rate-limit) short-circuits to "no".
      return !NON_RECOVERABLE_BLOCKED_BY.has(blockedBy);
    }
    if (envelope?.verificationResult === "needs-human") return true;

    // Legacy fallback: errors passed around as plain strings.
    const { error } = result;
    return error && isHeadlessBlockedError(error);
  };

  return HEADLESS_RECOVERY_ENGINES.filter((engine) =>
    looksBlocked(resultsByEngine?.[engine]),
  );
}
|
|
57
|
+
|
|
58
|
+
/**
 * Check whether a caught extractor error indicates headless blocking.
 *
 * Used where the thrown value is caught directly rather than parsed from a
 * result record. The structured `error.envelope` is consulted first; when it
 * gives no verdict the error text is matched with isHeadlessBlockedError.
 *
 * @param {unknown} error - The caught value: normally an Error carrying an
 *   optional `envelope`, but any throwable (e.g. a plain string) is accepted.
 * @returns {boolean} True when a visible-mode retry is warranted.
 */
export function isHeadlessBlockedResult(error) {
  if (!error) return false;
  const env = error.envelope;
  if (env?.blockedBy) {
    // Non-recoverable blocks (e.g. rate-limit) never warrant a visible retry.
    return !NON_RECOVERABLE_BLOCKED_BY.has(env.blockedBy);
  }
  if (env?.verificationResult === "needs-human") return true;
  // Throwables without `.message` (such as a bare string) are matched on
  // their own string form instead of being silently treated as unblocked.
  return isHeadlessBlockedError(error.message ?? error);
}
|