@apmantza/greedysearch-pi 1.9.2 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/CHANGELOG.md +132 -2
  2. package/README.md +82 -47
  3. package/bin/cdp.mjs +1153 -1108
  4. package/bin/launch.mjs +9 -0
  5. package/bin/search.mjs +318 -81
  6. package/extractors/bing-copilot.mjs +48 -18
  7. package/extractors/chatgpt.mjs +553 -0
  8. package/extractors/common.mjs +213 -22
  9. package/extractors/consensus.mjs +655 -0
  10. package/extractors/consent.mjs +182 -18
  11. package/extractors/gemini.mjs +350 -217
  12. package/extractors/google-ai.mjs +129 -128
  13. package/extractors/logically.mjs +629 -0
  14. package/extractors/perplexity.mjs +547 -217
  15. package/extractors/selectors.mjs +3 -2
  16. package/extractors/semantic-scholar.mjs +219 -0
  17. package/package.json +8 -4
  18. package/skills/greedy-search/skill.md +20 -12
  19. package/src/fetcher.mjs +23 -1
  20. package/src/formatters/results.ts +185 -128
  21. package/src/search/browser-lifecycle.mjs +27 -5
  22. package/src/search/challenge-detect.mjs +205 -0
  23. package/src/search/chrome.mjs +653 -590
  24. package/src/search/constants.mjs +155 -39
  25. package/src/search/engines.mjs +114 -76
  26. package/src/search/fetch-source.mjs +566 -451
  27. package/src/search/pdf.mjs +68 -0
  28. package/src/search/progress.mjs +145 -0
  29. package/src/search/recovery.mjs +73 -45
  30. package/src/search/research.mjs +1419 -62
  31. package/src/search/scale-aware.mjs +93 -0
  32. package/src/search/simple-research.mjs +520 -0
  33. package/src/search/sources.mjs +52 -22
  34. package/src/search/synthesis-runner.mjs +105 -26
  35. package/src/search/synthesis.mjs +286 -246
  36. package/src/tools/greedy-search-handler.ts +129 -59
  37. package/src/tools/shared.ts +312 -186
  38. package/src/types.ts +110 -104
  39. package/test.mjs +537 -18
@@ -0,0 +1,68 @@
1
+ // src/search/pdf.mjs — PDF text extraction helpers
2
+ //
3
+ // Adapted from pi-webaio's PDF pipeline. `pdf-parse` is loaded lazily so the
4
+ // package remains importable even when optional native canvas bindings are not
5
+ // available. PDF extraction is only attempted for actual PDF source fetches.
6
+
7
/**
 * Install minimal stand-ins for the browser globals `DOMMatrix`, `ImageData`
 * and `Path2D` on `globalThis` when the runtime does not define them.
 *
 * Globals that already exist are left untouched. The stand-ins do no real
 * geometry or pixel work: every `DOMMatrix` transform method is a chainable
 * no-op, `ImageData` only stores its constructor arguments, and `Path2D` is
 * an empty class.
 */
function ensurePdfDomPolyfills() {
  const scope = globalThis;
  const isMissing = (name) => typeof scope[name] === "undefined";

  if (isMissing("DOMMatrix")) {
    class DOMMatrix {
      constructor(_init = undefined) {}

      multiplySelf() {
        return this;
      }

      preMultiplySelf() {
        return this;
      }

      translateSelf() {
        return this;
      }

      scaleSelf() {
        return this;
      }

      rotateSelf() {
        return this;
      }
    }
    scope.DOMMatrix = DOMMatrix;
  }

  if (isMissing("ImageData")) {
    class ImageData {
      constructor(data = undefined, width = 0, height = 0) {
        this.data = data;
        this.width = width;
        this.height = height;
      }
    }
    scope.ImageData = ImageData;
  }

  if (isMissing("Path2D")) {
    class Path2D {
      constructor(_path = undefined) {}
    }
    scope.Path2D = Path2D;
  }
}
43
+
44
/**
 * Lazily import `pdf-parse` and resolve its parser constructor.
 *
 * The DOM polyfills are installed before the import is attempted.
 *
 * @returns {Promise<Function>} The `PDFParse` constructor.
 * @throws {Error} If the module cannot be imported or exposes no usable
 *   constructor.
 */
async function loadPdfParseCtor() {
  ensurePdfDomPolyfills();
  const mod = await import("pdf-parse");
  // Prefer the named export. Also accept a default export, including the
  // CommonJS-interop shape where `default` is the exports object itself.
  const ctor = mod.PDFParse ?? mod.default?.PDFParse ?? mod.default;
  // Require a function so a non-constructor default fails here with a clear
  // message instead of later with "ctor is not a constructor".
  if (typeof ctor !== "function") {
    throw new Error("pdf-parse did not export PDFParse");
  }
  return ctor;
}
51
+
52
/**
 * Derive a display title from the last path segment of a URL.
 *
 * @param {string} url - Source URL of the document.
 * @returns {string} The last path segment, or "Document.pdf" when the URL
 *   has no file segment or cannot be parsed.
 */
function pdfTitleFromUrl(url) {
  try {
    return new URL(url).pathname.split("/").pop() || "Document.pdf";
  } catch {
    // A malformed URL must not discard text that was extracted successfully.
    return "Document.pdf";
  }
}

/**
 * Extract the text of a PDF and wrap it in a small markdown document.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer - Raw PDF bytes; wrapped in a
 *   `Uint8Array` before being handed to the parser.
 * @param {string} url - URL the PDF was fetched from; used only for the title.
 * @returns {Promise<{title: string, content: string, pages: number} | {error: string} | null>}
 *   The extracted document, `null` when the PDF contains no text, or an
 *   `{ error }` record when loading or parsing fails. Never rejects.
 */
export async function extractPdfMarkdown(buffer, url) {
  let parser;
  try {
    const PDFParseCtor = await loadPdfParseCtor();
    parser = new PDFParseCtor({ data: new Uint8Array(buffer) });
    await parser.load();
    const data = await parser.getText();
    const text = data.text?.trim();
    if (!text) return null;
    return {
      title: pdfTitleFromUrl(url),
      content: `## PDF Content (${data.total} pages)\n\n${text}`,
      pages: data.total,
    };
  } catch (error) {
    return { error: error?.message || String(error) };
  } finally {
    // Release the parser's resources on every path. Cleanup is best-effort
    // and must not replace the result computed above.
    try {
      await parser?.destroy?.();
    } catch {
      // ignore cleanup failures
    }
  }
}
@@ -0,0 +1,145 @@
1
+ // src/search/progress.mjs — Progress bar with ETA for long-running research
2
+ //
3
+ // Tracks per-action and per-round timing, prints a progress bar to stderr
4
+ // after each step with an ETA based on rolling average. Inspired by pi-webaio's
5
+ // streaming progress output.
6
+ //
7
+ // Usage:
8
+ // const tracker = createProgressTracker({ totalActions: 6, totalRounds: 2 });
9
+ // tracker.startRound(1);
10
+ // tracker.startAction('search', 'what is X');
11
+ // ... do work ...
12
+ // tracker.endAction();
13
+ // tracker.startAction('fetch', 'https://...');
14
+ // ... do work ...
15
+ // tracker.endAction();
16
+ // tracker.endRound();
17
+ // tracker.print(); // prints bar to stderr
18
+
19
+ const BAR_WIDTH = 20;
20
+
21
/**
 * Format a duration given in milliseconds as a short human-readable string
 * (e.g. "1m 23s", "45s", "0s").
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} "0s" for anything under one second (including negative
 *   or non-finite input), "<n>s" under a minute, otherwise "<m>m <s>s".
 */
function formatDuration(ms) {
  // Non-finite input would otherwise render as "NaNm NaNs" / "Infinitym NaNs".
  if (!Number.isFinite(ms) || ms < 1000) return "0s";
  const totalSeconds = Math.round(ms / 1000);
  if (totalSeconds < 60) return `${totalSeconds}s`;
  const minutes = Math.floor(totalSeconds / 60);
  const seconds = totalSeconds % 60;
  return `${minutes}m ${seconds}s`;
}
32
+
33
/**
 * Render the bar portion of a progress line.
 * Example: renderBar(0.6, 20) → "[████████████░░░░░░░░]"
 *
 * @param {number} progress - Completed fraction. Values outside [0, 1] are
 *   clamped; non-finite values are treated as 0.
 * @param {number} [width=BAR_WIDTH] - Number of cells between the brackets.
 * @returns {string} The bracketed bar.
 */
function renderBar(progress, width = BAR_WIDTH) {
  // Clamp so neither repeat() count can go negative (which throws RangeError).
  const ratio = Number.isFinite(progress)
    ? Math.min(1, Math.max(0, progress))
    : 0;
  const filled = Math.round(ratio * width);
  const empty = width - filled;
  return `[${"█".repeat(filled)}${"░".repeat(empty)}]`;
}
42
+
43
/**
 * Create a progress tracker that counts completed actions, fetches and
 * rounds, and writes throttled status lines to stderr.
 *
 * Each status line has the form
 * `[greedysearch] <bar> <done>/<total> (<phase>[ <label>], ETA <eta>)`,
 * where the ETA is the rolling average duration of the most recent timed
 * steps multiplied by the number of remaining units.
 *
 * @param {object} [opts]
 * @param {number} [opts.totalActions=0] - Total expected actions across all rounds
 * @param {number} [opts.totalRounds=0] - Total expected rounds
 * @param {number} [opts.totalFetches=0] - Total expected source fetches
 * @param {boolean} [opts.silent=false] - Suppress stderr output (for tests)
 * @returns {object} Tracker exposing `startRound`, `endRound`, `startAction`,
 *   `endAction`, `startFetch`, `endFetch`, `print`, `finish` and
 *   `getElapsedMs`.
 */
export function createProgressTracker({
  totalActions = 0,
  totalRounds = 0,
  totalFetches = 0,
  silent = false,
} = {}) {
  const MIN_PRINT_INTERVAL_MS = 500; // throttle to avoid spam
  const ROLLING_WINDOW = 5; // number of recent step durations averaged for the ETA

  const startedAt = Date.now();
  const stepTimings = []; // rolling window of recent action/fetch durations
  let completedActions = 0;
  let completedRounds = 0;
  let completedFetches = 0;
  let currentStepStart = null;
  let currentStepLabel = null;
  let lastPrintAt = 0;

  function recordStep(durationMs) {
    stepTimings.push(durationMs);
    if (stepTimings.length > ROLLING_WINDOW) stepTimings.shift();
  }

  function avgStepMs() {
    if (stepTimings.length === 0) return null;
    return stepTimings.reduce((sum, ms) => sum + ms, 0) / stepTimings.length;
  }

  function buildStatus(phase) {
    const total = totalActions + totalFetches + totalRounds;
    const done = completedActions + completedFetches + completedRounds;
    const progress = total > 0 ? Math.min(1, done / total) : 0;
    const remaining = Math.max(0, total - done);
    const avg = avgStepMs();
    // Compare against null explicitly: a 0 ms average or zero remaining
    // units is a real estimate, not "unknown".
    const eta = avg === null ? "—" : formatDuration(avg * remaining);
    const label = currentStepLabel ? ` ${currentStepLabel}` : "";
    return `${renderBar(progress)} ${done}/${total} (${phase}${label}, ETA ${eta})`;
  }

  function print(phase) {
    if (silent) return;
    const now = Date.now();
    // Throttle everything except the final "done" line.
    if (now - lastPrintAt < MIN_PRINT_INTERVAL_MS && phase !== "done") return;
    lastPrintAt = now;
    process.stderr.write(`[greedysearch] ${buildStatus(phase)}\n`);
  }

  // Start timing a step and remember its (truncated) label for status lines.
  function beginStep(type, label) {
    currentStepStart = Date.now();
    // String() guards against non-string labels, which have no .slice().
    currentStepLabel = `${type}:${String(label || "").slice(0, 40)}`;
    print(type);
  }

  // Record the step's duration, report it, then drop its label so later
  // "round" / "done" lines do not show a step that has already finished.
  function finishStep(phase) {
    if (currentStepStart !== null) {
      recordStep(Date.now() - currentStepStart);
      currentStepStart = null;
    }
    print(phase);
    currentStepLabel = null;
  }

  return {
    startRound(n) {
      completedRounds = n - 1; // will be incremented when endRound fires
    },
    endRound() {
      completedRounds++;
      print("round");
    },
    startAction(type, label) {
      beginStep(type, label);
    },
    endAction() {
      completedActions++;
      finishStep("action");
    },
    startFetch(label) {
      beginStep("fetch", label);
    },
    endFetch(ok = true) {
      completedFetches++;
      finishStep(ok ? "fetch" : "fetch-failed");
    },
    print() {
      print("progress");
    },
    finish() {
      print("done");
    },
    getElapsedMs() {
      return Date.now() - startedAt;
    },
  };
}
@@ -1,45 +1,73 @@
1
- // src/search/recovery.mjs — Headless-block detection and visible recovery policy
2
-
3
- // Only these engines use automatic headless → visible recovery. Google is
4
- // intentionally excluded for now; see issue #9 discussion / maintainer choice.
5
- export const HEADLESS_RECOVERY_ENGINES = ["perplexity", "bing"];
6
-
7
- const HEADLESS_BLOCKED_PATTERN =
8
- /timed out|timeout|verification|captcha|cloudflare|turnstile|input not found|ask-input|clipboard|copy button hidden/i;
9
-
10
- const MANUAL_VERIFICATION_PATTERN =
11
- /needs-human|verification required|please solve|captcha|cloudflare|turnstile|could not be completed automatically|manual intervention/i;
12
-
13
- export function isHeadlessBlockedError(error) {
14
- return HEADLESS_BLOCKED_PATTERN.test(String(error || ""));
15
- }
16
-
17
- export function isManualVerificationError(error) {
18
- return MANUAL_VERIFICATION_PATTERN.test(String(error || ""));
19
- }
20
-
21
- export function findHeadlessBlockedEngines(resultsByEngine) {
22
- return HEADLESS_RECOVERY_ENGINES.filter((engine) => {
23
- const result = resultsByEngine?.[engine];
24
- if (!result) return false;
25
- // Data-driven: check envelope first (zero regex cost)
26
- if (result._envelope?.blockedBy) return true;
27
- if (result._envelope?.verificationResult === "needs-human") return true;
28
- // Fallback: legacy string matching for errors passed as plain strings
29
- const error = result.error;
30
- return error && isHeadlessBlockedError(error);
31
- });
32
- }
33
-
34
- /**
35
- * Check if an extractor Error carries a structured envelope indicating
36
- * headless blocking. Used in single-engine recovery paths where the Error
37
- * object is caught directly rather than parsed from a result record.
38
- */
39
- export function isHeadlessBlockedResult(error) {
40
- if (!error) return false;
41
- const env = error.envelope;
42
- if (env?.blockedBy) return true;
43
- if (env?.verificationResult === "needs-human") return true;
44
- return isHeadlessBlockedError(error.message);
45
- }
1
+ // src/search/recovery.mjs — Headless-block detection and visible recovery policy
2
+
3
+ // Only these engines use automatic headless → visible recovery. Google is
4
+ // intentionally excluded for now; see issue #9 discussion / maintainer choice.
5
export const HEADLESS_RECOVERY_ENGINES = ["perplexity", "bing", "chatgpt", "semantic-scholar", "logically"];

// `blockedBy` values for which a visible-mode retry cannot help: the block is
// account-level (rate limit, ban) or structural (page redesign) rather than
// tied to the browser session. Results carrying one of these values are not
// reported as headless-blocked.
const NON_RECOVERABLE_BLOCKED_BY = new Set();
NON_RECOVERABLE_BLOCKED_BY.add("rate-limit");

// Error text treated as a sign that headless mode was blocked.
const HEADLESS_BLOCKED_PATTERN = /timed out|timeout|verification|captcha|cloudflare|turnstile|input not found|ask-input|copy button hidden|sign.in|login required/i;

// Error text treated as a sign that manual verification is needed.
const MANUAL_VERIFICATION_PATTERN = /needs-human|verification required|please solve|captcha|cloudflare|turnstile|could not be completed automatically|manual intervention|sign.in|login required/i;
24
+
25
/**
 * Test whether an error's text matches HEADLESS_BLOCKED_PATTERN.
 *
 * @param {*} error - Error value; falsy values are treated as an empty string,
 *   anything else is converted with String().
 * @returns {boolean} True when the text matches the pattern.
 */
export function isHeadlessBlockedError(error) {
  const text = String(error || "");
  return HEADLESS_BLOCKED_PATTERN.test(text);
}
28
+
29
/**
 * Test whether an error's text matches MANUAL_VERIFICATION_PATTERN.
 *
 * @param {*} error - Error value; falsy values are treated as an empty string,
 *   anything else is converted with String().
 * @returns {boolean} True when the text matches the pattern.
 */
export function isManualVerificationError(error) {
  const text = String(error || "");
  return MANUAL_VERIFICATION_PATTERN.test(text);
}
32
+
33
/**
 * Report whether a `blockedBy` value is listed in NON_RECOVERABLE_BLOCKED_BY,
 * i.e. whether a visible-mode retry would not help.
 *
 * @param {*} blockedBy - Value to look up in the set.
 * @returns {boolean} True when the value is in the non-recoverable set.
 */
export function isNonRecoverableBlockedBy(blockedBy) {
  const nonRecoverable = NON_RECOVERABLE_BLOCKED_BY.has(blockedBy);
  return nonRecoverable;
}
39
+
40
/**
 * List the engines from HEADLESS_RECOVERY_ENGINES whose result indicates a
 * headless block.
 *
 * A result counts as blocked when its `_envelope.blockedBy` is set and not in
 * NON_RECOVERABLE_BLOCKED_BY, when `_envelope.verificationResult` is
 * "needs-human", or when its `error` matches `isHeadlessBlockedError`.
 *
 * @param {object} resultsByEngine - Map of engine name to result record.
 * @returns {string[]} Blocked engine names, in HEADLESS_RECOVERY_ENGINES order.
 */
export function findHeadlessBlockedEngines(resultsByEngine) {
  const blockedEngines = [];
  for (const engine of HEADLESS_RECOVERY_ENGINES) {
    const result = resultsByEngine?.[engine];
    if (!result) continue;

    // Structured envelope takes precedence over string matching.
    const envelope = result._envelope;
    const blockedBy = envelope?.blockedBy;
    if (blockedBy) {
      // A non-recoverable block (rate-limit, ban, etc.) is not reported,
      // and the remaining checks are skipped either way.
      if (!NON_RECOVERABLE_BLOCKED_BY.has(blockedBy)) {
        blockedEngines.push(engine);
      }
      continue;
    }

    if (envelope?.verificationResult === "needs-human") {
      blockedEngines.push(engine);
      continue;
    }

    // Fallback: legacy string matching for errors passed as plain strings.
    const error = result.error;
    if (error && isHeadlessBlockedError(error)) {
      blockedEngines.push(engine);
    }
  }
  return blockedEngines;
}
57
+
58
/**
 * Check whether a caught extractor error indicates headless blocking.
 * Used in single-engine recovery paths where the Error object is caught
 * directly rather than parsed from a result record.
 *
 * A structured `error.envelope` is consulted first: a `blockedBy` value
 * counts as blocked unless it is in NON_RECOVERABLE_BLOCKED_BY, and a
 * `verificationResult` of "needs-human" counts as blocked. Otherwise the
 * error text is matched with `isHeadlessBlockedError`.
 *
 * @param {*} error - Caught error; an Error-like object or a plain string.
 * @returns {boolean} True when a visible-mode retry is warranted.
 */
export function isHeadlessBlockedResult(error) {
  if (!error) return false;
  const env = error.envelope;
  if (env?.blockedBy) {
    // Skip non-recoverable blocks (rate-limit, ban, etc.)
    return !NON_RECOVERABLE_BLOCKED_BY.has(env.blockedBy);
  }
  if (env?.verificationResult === "needs-human") return true;
  // Errors thrown or passed as plain strings have no `.message`; fall back to
  // the value itself so their text is still matched.
  return isHeadlessBlockedError(error.message ?? error);
}