@apmantza/greedysearch-pi 1.9.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +110 -14
- package/README.md +86 -41
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +11 -0
- package/bin/search.mjs +886 -674
- package/extractors/bing-copilot.mjs +528 -374
- package/extractors/chatgpt.mjs +436 -0
- package/extractors/common.mjs +837 -645
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +421 -388
- package/extractors/gemini.mjs +335 -217
- package/extractors/logically.mjs +567 -0
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/index.ts +2 -1
- package/package.json +14 -6
- package/skills/greedy-search/skill.md +9 -12
- package/src/fetcher.mjs +8 -1
- package/src/formatters/results.ts +163 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +150 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/recovery.mjs +51 -45
- package/src/search/research.mjs +2579 -0
- package/src/search/sources.mjs +77 -25
- package/src/search/synthesis-runner.mjs +142 -57
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +189 -45
- package/src/tools/shared.ts +187 -186
- package/src/types.ts +110 -104
- package/test.mjs +1342 -534
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
// src/search/pdf.mjs — PDF text extraction helpers
|
|
2
|
+
//
|
|
3
|
+
// Adapted from pi-webaio's PDF pipeline. `pdf-parse` is loaded lazily so the
|
|
4
|
+
// package remains importable even when optional native canvas bindings are not
|
|
5
|
+
// available. PDF extraction is only attempted for actual PDF source fetches.
|
|
6
|
+
|
|
7
|
+
/**
 * Install inert stand-ins for the `DOMMatrix`, `ImageData` and `Path2D`
 * globals. A global is only assigned when `typeof` reports it as undefined,
 * so an existing implementation is never replaced.
 */
function ensurePdfDomPolyfills() {
  const isMissing = (globalName) =>
    typeof globalThis[globalName] === "undefined";

  if (isMissing("DOMMatrix")) {
    // Every transform is a no-op that hands back the receiver, so chained
    // calls keep working even though no matrix math is performed.
    class DOMMatrix {
      constructor(_init = undefined) {}

      multiplySelf() {
        return this;
      }

      preMultiplySelf() {
        return this;
      }

      translateSelf() {
        return this;
      }

      scaleSelf() {
        return this;
      }

      rotateSelf() {
        return this;
      }
    }
    globalThis.DOMMatrix = DOMMatrix;
  }

  if (isMissing("ImageData")) {
    // Plain record of the constructor arguments; no pixel validation.
    class ImageData {
      constructor(data = undefined, width = 0, height = 0) {
        this.data = data;
        this.width = width;
        this.height = height;
      }
    }
    globalThis.ImageData = ImageData;
  }

  if (isMissing("Path2D")) {
    // Empty shell: the constructor argument is accepted and ignored.
    class Path2D {
      constructor(_path = undefined) {}
    }
    globalThis.Path2D = Path2D;
  }
}
|
|
43
|
+
|
|
44
|
+
/**
 * Lazily import `pdf-parse` and return its parser constructor.
 *
 * The DOM polyfills are installed before the import is started. The named
 * `PDFParse` export is preferred; the default export is the fallback.
 *
 * @returns {Promise<Function>} The parser constructor.
 * @throws {Error} When the module exposes neither export (import failures
 *   propagate unchanged).
 */
async function loadPdfParseCtor() {
  ensurePdfDomPolyfills();

  const { PDFParse, default: defaultExport } = await import("pdf-parse");
  const parserCtor = PDFParse ?? defaultExport;
  if (parserCtor) return parserCtor;

  throw new Error("pdf-parse did not export PDFParse");
}
|
|
51
|
+
|
|
52
|
+
/**
 * Extract the text of a PDF and wrap it in a small markdown document.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer - Raw PDF bytes; handed to the
 *   parser as `new Uint8Array(buffer)`.
 * @param {string} url - URL the PDF came from. Its last path segment becomes
 *   the title; "Document.pdf" is used when that segment is empty or the URL
 *   cannot be parsed.
 * @returns {Promise<{title: string, content: string, pages: *} | {error: string} | null>}
 *   The extracted document, `null` when the PDF yields no text, or an
 *   `{ error }` record when loading or parsing fails. Failures are reported
 *   through the return value rather than by rejecting.
 */
export async function extractPdfMarkdown(buffer, url) {
  let parser;
  try {
    const PDFParseCtor = await loadPdfParseCtor();
    parser = new PDFParseCtor({ data: new Uint8Array(buffer) });
    await parser.load();
    const data = await parser.getText();
    const text = data.text?.trim();
    if (!text) return null;

    // The title is cosmetic: a malformed URL must not throw away text that
    // was extracted successfully, so URL parsing gets its own guard.
    let title = "Document.pdf";
    try {
      title = new URL(url).pathname.split("/").pop() || title;
    } catch {
      // Keep the fallback title.
    }

    return {
      title,
      content: `## PDF Content (${data.total} pages)\n\n${text}`,
      pages: data.total,
    };
  } catch (error) {
    // `error` may be any thrown value, including null/undefined.
    return { error: error?.message || String(error) };
  } finally {
    // Release the loaded document on every path. `destroy` is called
    // optionally because the constructor may come from the default-export
    // fallback, and cleanup failures must not replace the result.
    try {
      await parser?.destroy?.();
    } catch {
      // Best-effort cleanup.
    }
  }
}
|
package/src/search/recovery.mjs
CHANGED
|
@@ -1,45 +1,51 @@
|
|
|
1
|
-
// src/search/recovery.mjs — Headless-block detection and visible recovery policy
|
|
2
|
-
|
|
3
|
-
// Only these engines use automatic headless → visible recovery. Google is
|
|
4
|
-
// intentionally excluded for now; see issue #9 discussion / maintainer choice.
|
|
5
|
-
export const HEADLESS_RECOVERY_ENGINES = [
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
const
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
1
|
+
// src/search/recovery.mjs — Headless-block detection and visible recovery policy
|
|
2
|
+
|
|
3
|
+
// Engines eligible for automatic headless → visible recovery. Google is
// deliberately left out for now; see the issue #9 discussion (maintainer
// choice).
export const HEADLESS_RECOVERY_ENGINES = [
  "perplexity", "bing", "chatgpt", "semantic-scholar", "logically",
];
|
|
12
|
+
|
|
13
|
+
// Case-insensitive alternation of error-text fragments tested by
// isHeadlessBlockedError. Note that the `.` in "sign.in" matches any single
// character, so both "sign in" and "sign-in" are covered.
const HEADLESS_BLOCKED_PATTERN = new RegExp(
  [
    "timed out",
    "timeout",
    "verification",
    "captcha",
    "cloudflare",
    "turnstile",
    "input not found",
    "ask-input",
    "clipboard",
    "copy button hidden",
    "sign.in",
    "login required",
  ].join("|"),
  "i",
);

// Case-insensitive alternation of error-text fragments tested by
// isManualVerificationError.
const MANUAL_VERIFICATION_PATTERN = new RegExp(
  [
    "needs-human",
    "verification required",
    "please solve",
    "captcha",
    "cloudflare",
    "turnstile",
    "could not be completed automatically",
    "manual intervention",
    "sign.in",
    "login required",
  ].join("|"),
  "i",
);
|
|
18
|
+
|
|
19
|
+
/**
 * Report whether an error's text matches HEADLESS_BLOCKED_PATTERN.
 *
 * @param {*} error - Any value; falsy values are treated as an empty string,
 *   everything else is converted with `String()`.
 * @returns {boolean} True when the text matches the pattern.
 */
export function isHeadlessBlockedError(error) {
  const message = error ? String(error) : "";
  return HEADLESS_BLOCKED_PATTERN.test(message);
}
|
|
22
|
+
|
|
23
|
+
/**
 * Report whether an error's text matches MANUAL_VERIFICATION_PATTERN.
 *
 * @param {*} error - Any value; falsy values are treated as an empty string,
 *   everything else is converted with `String()`.
 * @returns {boolean} True when the text matches the pattern.
 */
export function isManualVerificationError(error) {
  const message = error ? String(error) : "";
  return MANUAL_VERIFICATION_PATTERN.test(message);
}
|
|
26
|
+
|
|
27
|
+
/**
 * List the recovery-eligible engines whose result looks headless-blocked.
 *
 * @param {Object<string, object>|null|undefined} resultsByEngine - Result
 *   records keyed by engine name; engines without a record are skipped.
 * @returns {string[]} Matching engine names, in HEADLESS_RECOVERY_ENGINES
 *   order.
 */
export function findHeadlessBlockedEngines(resultsByEngine) {
  const blockedEngines = [];

  for (const engineName of HEADLESS_RECOVERY_ENGINES) {
    const record = resultsByEngine?.[engineName];
    if (!record) continue;

    // Structured envelope is consulted first, so no regex runs when it
    // already flags the block.
    const envelope = record._envelope;
    const flaggedByEnvelope =
      Boolean(envelope?.blockedBy) ||
      envelope?.verificationResult === "needs-human";

    // Legacy path: errors recorded as plain strings are pattern-matched.
    const flagged =
      flaggedByEnvelope ||
      Boolean(record.error && isHeadlessBlockedError(record.error));

    if (flagged) blockedEngines.push(engineName);
  }

  return blockedEngines;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Check whether a caught extractor error indicates headless blocking.
 * Used in single-engine recovery paths where the thrown value is caught
 * directly rather than parsed from a result record.
 *
 * The structured `envelope` attached to the error is consulted first; the
 * error text is pattern-matched only when the envelope does not flag a block.
 *
 * @param {*} error - The caught value: normally an Error (optionally carrying
 *   an `envelope`), but plain strings are accepted as well.
 * @returns {boolean} True when the error looks like a headless block.
 */
export function isHeadlessBlockedResult(error) {
  if (!error) return false;
  const env = error.envelope;
  if (env?.blockedBy) return true;
  if (env?.verificationResult === "needs-human") return true;
  // A thrown plain string has no `.message`; fall back to the value itself
  // so legacy string errors are still pattern-matched.
  return isHeadlessBlockedError(error.message ?? error);
}
|