@apmantza/greedysearch-pi 1.9.2 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +132 -2
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +318 -81
- package/extractors/bing-copilot.mjs +48 -18
- package/extractors/chatgpt.mjs +553 -0
- package/extractors/common.mjs +213 -22
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +182 -18
- package/extractors/gemini.mjs +350 -217
- package/extractors/google-ai.mjs +129 -128
- package/extractors/logically.mjs +629 -0
- package/extractors/perplexity.mjs +547 -217
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +8 -4
- package/skills/greedy-search/skill.md +20 -12
- package/src/fetcher.mjs +23 -1
- package/src/formatters/results.ts +185 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/challenge-detect.mjs +205 -0
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +155 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/progress.mjs +145 -0
- package/src/search/recovery.mjs +73 -45
- package/src/search/research.mjs +1419 -62
- package/src/search/scale-aware.mjs +93 -0
- package/src/search/simple-research.mjs +520 -0
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +129 -59
- package/src/tools/shared.ts +312 -186
- package/src/types.ts +110 -104
- package/test.mjs +537 -18
package/extractors/common.mjs
CHANGED
|
@@ -19,11 +19,76 @@ const CDP = join(__dir, "..", "bin", "cdp.mjs");
|
|
|
19
19
|
* @param {number} [timeoutMs=30000] - Timeout in milliseconds
|
|
20
20
|
* @returns {Promise<string>} Command output
|
|
21
21
|
*/
|
|
22
|
+
// Allowlist of first-position arguments that cdpSafeArgv() lets through to
// bin/cdp.mjs. Mirrors the commands advertised in bin/cdp.mjs help output.
// NOTE(review): "--tab" is a flag rather than a subcommand; it is presumably
// accepted by the CLI as a leading argument — confirm against bin/cdp.mjs.
const VALID_CDP_COMMANDS = new Set([
  "list",
  "snap",
  "eval",
  "shot",
  "html",
  "nav",
  "net",
  "click",
  "clickxy",
  "type",
  "loadall",
  "evalraw",
  "browse",
  "stop",
  "--tab",
]);

/**
 * Validate an argv array destined for bin/cdp.mjs before it reaches spawn().
 *
 * Checks performed, in order:
 *   1. `args` is a non-empty array;
 *   2. `args[0]` is a string without a NUL byte;
 *   3. `args[0]` is the literal "test" (passed through without subcommand
 *      validation) or a member of VALID_CDP_COMMANDS;
 *   4. every remaining element is a string without a NUL byte.
 *
 * Shell metacharacters are deliberately NOT rejected: the argv is handed to
 * spawn() as an array (no shell), and callers legitimately pass arbitrary
 * JavaScript source as an argument (e.g. `["eval", tab, code]`).
 *
 * @param {string[]} args - Arguments to pass to bin/cdp.mjs.
 * @returns {string[]} A new array holding the validated arguments.
 * @throws {Error} If `args` is malformed or names an unknown subcommand.
 */
function cdpSafeArgv(args) {
  if (!Array.isArray(args) || args.length === 0) {
    throw new Error("cdp: args must be a non-empty array");
  }
  // Type-check the subcommand before the allowlist lookup so a non-string
  // first element is reported as a type error, not as "unknown subcommand".
  const command = validateArg(args[0], 0);
  if (command !== "test" && !VALID_CDP_COMMANDS.has(command)) {
    throw new Error(`cdp: unknown subcommand '${command}'`);
  }
  return args.map((value, index) => validateArg(value, index));
}

/**
 * Validate a single argv element.
 *
 * @param {unknown} value - Candidate argument.
 * @param {number} index - Position in argv, used only in error messages.
 * @returns {string} `value`, unchanged, once it has passed validation.
 * @throws {Error} If `value` is not a string or contains a NUL byte.
 */
function validateArg(value, index) {
  if (typeof value !== "string") {
    throw new Error(
      `cdp: argv[${index}] must be a string (got ${typeof value})`,
    );
  }
  if (value.includes("\0")) {
    throw new Error(`cdp: argv[${index}] contains a null byte`);
  }
  return value;
}
|
|
77
|
+
|
|
22
78
|
/**
 * Run a bin/cdp.mjs command without feeding anything to its stdin.
 * Delegates to cdpWithInput() with a null input.
 * @param {string[]} args - Arguments to pass to bin/cdp.mjs
 * @param {number} [timeoutMs=30000] - Timeout in milliseconds
 * @returns {Promise<string>} Command output
 */
export function cdp(args, timeoutMs = 30000) {
  const noInput = null;
  return cdpWithInput(args, noInput, timeoutMs);
}
|
|
81
|
+
|
|
82
|
+
export function cdpWithInput(args, input = null, timeoutMs = 30000) {
|
|
83
|
+
const safeArgs = cdpSafeArgv(args);
|
|
23
84
|
return new Promise((resolve, reject) => {
|
|
24
|
-
const proc = spawn(process.execPath, [CDP, ...
|
|
25
|
-
stdio: ["ignore", "pipe", "pipe"],
|
|
85
|
+
const proc = spawn(process.execPath, [CDP, ...safeArgs], {
|
|
86
|
+
stdio: [input == null ? "ignore" : "pipe", "pipe", "pipe"],
|
|
26
87
|
});
|
|
88
|
+
if (input != null) {
|
|
89
|
+
proc.stdin.write(input);
|
|
90
|
+
proc.stdin.end();
|
|
91
|
+
}
|
|
27
92
|
let out = "";
|
|
28
93
|
let err = "";
|
|
29
94
|
proc.stdout.on("data", (d) => (out += d));
|
|
@@ -67,8 +132,20 @@ export async function getOrOpenTab(tabPrefix) {
|
|
|
67
132
|
const { targetId } = JSON.parse(raw);
|
|
68
133
|
await cdp(["list"]); // refresh cache
|
|
69
134
|
const tid = targetId.slice(0, 8);
|
|
70
|
-
// Inject stealth patches for anti-detection coverage (both headless + visible)
|
|
71
|
-
|
|
135
|
+
// Inject stealth patches for anti-detection coverage (both headless + visible).
|
|
136
|
+
// MUST be awaited: the daemon processes commands concurrently, so a
|
|
137
|
+
// fire-and-forget registration races the next Page.navigate and the
|
|
138
|
+
// script may not be in place when the new document is created.
|
|
139
|
+
// Sites like consensus.app use this race to detect automation — the
|
|
140
|
+
// script's Navigator/webdriver overrides are absent on first paint,
|
|
141
|
+
// fingerprinting fires, and the user is bounced to a sign-up wall.
|
|
142
|
+
try {
|
|
143
|
+
await injectHeadlessStealth(tid);
|
|
144
|
+
} catch (e) {
|
|
145
|
+
process.stderr.write(
|
|
146
|
+
`[getOrOpenTab] stealth injection failed: ${e.message}\n`,
|
|
147
|
+
);
|
|
148
|
+
}
|
|
72
149
|
return tid;
|
|
73
150
|
}
|
|
74
151
|
|
|
@@ -84,25 +161,42 @@ export async function getOrOpenTab(tabPrefix) {
|
|
|
84
161
|
*/
|
|
85
162
|
export async function injectClipboardInterceptor(tab, globalVar) {
  // globalVar is spliced verbatim into JavaScript that is evaluated in the
  // page, so insist on a plain identifier. Anything else would either be a
  // syntax error in the page or smuggle extra code into the eval.
  if (typeof globalVar !== "string" || !/^[A-Za-z_$][\w$]*$/.test(globalVar)) {
    throw new TypeError(
      "injectClipboardInterceptor: globalVar must be a valid JavaScript identifier",
    );
  }

  // The injected script wraps navigator.clipboard.writeText / write so the
  // plain-text payload is mirrored into window.<globalVar>. The original
  // methods are still invoked when they exist, but their failures are hidden.
  const code = `
    (() => {
      window.${globalVar} = null;
      const _clipboard = navigator.clipboard;
      if (!_clipboard) return;
      const _origWriteText = typeof _clipboard.writeText === 'function'
        ? _clipboard.writeText.bind(_clipboard)
        : null;
      const _origWrite = typeof _clipboard.write === 'function'
        ? _clipboard.write.bind(_clipboard)
        : null;

      _clipboard.writeText = function(text) {
        window.${globalVar} = String(text ?? '');
        if (!_origWriteText) return Promise.resolve();
        // The OS/browser clipboard write may be denied in automated Chrome or
        // when the tab is not focused. We only need the captured text; returning
        // a resolved promise prevents the page from surfacing a misleading
        // "failed to copy" toast after our interceptor already succeeded.
        try {
          return Promise.resolve(_origWriteText(text)).catch(() => undefined);
        } catch (_) {
          // A synchronous throw from the original is treated the same way as
          // a rejected promise: the capture above already succeeded.
          return Promise.resolve();
        }
      };

      _clipboard.write = async function(items) {
        try {
          for (const item of items || []) {
            if (item.types && item.types.includes('text/plain')) {
              const blob = await item.getType('text/plain');
              window.${globalVar} = await blob.text();
              break;
            }
          }
        } catch (e) {
          // Best-effort capture: an unreadable item must not break the page's
          // own clipboard call, so fall through to the original write.
        }
        if (!_origWrite) return undefined;
        try { return await _origWrite(items); }
        catch (_) { return undefined; }
      };
    })();
  `;
  await cdp(["eval", tab, code]);
}
|
|
@@ -379,6 +473,79 @@ export function parseSourcesFromMarkdown(text) {
|
|
|
379
473
|
return results;
|
|
380
474
|
}
|
|
381
475
|
|
|
476
|
+
/**
 * Linear-time "is this a non-empty digit string?" check.
 * Equivalent to /^\d+$/ without the regex — used to keep the
 * parseSourcesFromMarkdownRefStyle inline scan free of any regex
 * (SonarCloud hotspot js:S5852).
 * @param {string} s
 * @returns {boolean}
 */
function isAllDigits(s) {
  if (!s) return false;
  for (let k = 0; k < s.length; k++) {
    const c = s.charCodeAt(k);
    // 48..57 are the UTF-16 code units for "0".."9".
    if (c < 48 || c > 57) return false;
  }
  return true;
}

/**
 * Parse reference-style markdown links: [text][num] with [num]: url "title" at bottom.
 * ChatGPT uses this format for its inline citations.
 *
 * Only the two-bracket inline form `[text][num]` is recognised; a bare `[num]`
 * is not treated as a citation. Each URL is reported once, in order of first
 * inline use. The title is the inline text, falling back to the definition's
 * quoted title, then to an empty string.
 *
 * @param {string} text - Markdown text
 * @returns {Array<{title: string, url: string}>} Extracted sources
 */
export function parseSourcesFromMarkdownRefStyle(text) {
  if (!text) return [];
  const results = [];

  // Find all reference definitions: [num]: url "title"
  const refMap = new Map();
  const refRegex = /^\[(\d+)\]:\s*(https?:\/\/[^\s"]+)(?:\s+"([^"]*)")?/gm;
  let m;
  while ((m = refRegex.exec(text)) !== null) {
    refMap.set(m[1], { url: m[2], title: m[3] || "" });
  }

  // Find inline references of the form [text][num] with a linear indexOf
  // scan instead of the ReDoS-prone /\[([^\]]*)\]\[(\d+)\]/g pattern
  // (SonarCloud hotspot js:S5852). The cursor only ever moves forward past a
  // "]" so every character is visited a bounded number of times.
  const seenUrls = new Set();
  let cursor = 0;
  while (cursor < text.length) {
    const open = text.indexOf("[", cursor);
    if (open === -1) break;
    const close = text.indexOf("]", open + 1);
    if (close === -1) break;
    if (text[close + 1] !== "[") {
      // Any "[" between `open` and `close` would hit this same "]" and fail
      // the same way, so skip the whole span rather than one character.
      cursor = close + 1;
      continue;
    }
    const close2 = text.indexOf("]", close + 2);
    if (close2 === -1) break;

    const numStr = text.slice(close + 2, close2);
    if (!isAllDigits(numStr)) {
      // The second bracket is not a number, but it may be the label of the
      // next pair (e.g. "[a][b][3]"), so resume from it instead of after it.
      cursor = close + 1;
      continue;
    }

    const ref = refMap.get(numStr);
    if (ref && !seenUrls.has(ref.url)) {
      seenUrls.add(ref.url);
      results.push({
        title: text.slice(open + 1, close).trim() || ref.title || "",
        url: ref.url,
      });
    }
    cursor = close2 + 1;
  }

  return results;
}
|
|
548
|
+
|
|
382
549
|
// ============================================================================
|
|
383
550
|
// Timing constants
|
|
384
551
|
// ============================================================================
|
|
@@ -658,6 +825,26 @@ export function outputJson(data) {
|
|
|
658
825
|
process.stdout.write(`${JSON.stringify(data, null, 2)}\n`);
|
|
659
826
|
}
|
|
660
827
|
|
|
828
|
+
/**
 * Record the current extractor stage for debugging and timeout diagnostics.
 * Writes `[engine] stage: <name> (+<ms>ms)` to stderr and updates `env.lastStage`
 * / `env.stages` so the envelope carries the last known phase on any outcome
 * (success, error, timeout, kill).
 *
 * Does nothing when `env` is not an object. `env.stages` is replaced with a
 * fresh array when it is missing or not an array. The elapsed suffix is
 * omitted when `startTime` is null/undefined or does not yield a finite number.
 *
 * @param {object} env - The mutable env object the extractor is filling in.
 * @param {string} stage - Short, snake_case stage name (e.g. "nav", "type", "stream").
 * @param {number} [startTime] - Optional extractor start time for elapsed-ms logging.
 */
export function logStage(env, stage, startTime = null) {
  if (!env || typeof env !== "object") return;

  // Read the clock once so the logged elapsed time and the recorded `at`
  // timestamp always describe the same instant.
  const now = Date.now();
  // Null-check rather than truthiness: a start time of 0 is still a start time.
  const elapsedMs = startTime == null ? Number.NaN : now - startTime;
  const elapsed = Number.isFinite(elapsedMs) ? ` (+${elapsedMs}ms)` : "";

  env.lastStage = stage;
  if (!Array.isArray(env.stages)) env.stages = [];
  env.stages.push({ stage, at: now });

  const engine = env.engine || "extractor";
  console.error(`[${engine}] stage: ${stage}${elapsed}`);
}
|
|
847
|
+
|
|
661
848
|
/**
|
|
662
849
|
* Build a lightweight result envelope from data already collected during extraction.
|
|
663
850
|
* Zero additional CDP calls — everything here is already known.
|
|
@@ -673,6 +860,8 @@ export function buildEnvelope({
|
|
|
673
860
|
verificationResult = null,
|
|
674
861
|
inputReady = null,
|
|
675
862
|
durationMs = null,
|
|
863
|
+
lastStage = null,
|
|
864
|
+
stages = null,
|
|
676
865
|
} = {}) {
|
|
677
866
|
return {
|
|
678
867
|
engine,
|
|
@@ -683,6 +872,8 @@ export function buildEnvelope({
|
|
|
683
872
|
verificationResult,
|
|
684
873
|
inputReady,
|
|
685
874
|
durationMs,
|
|
875
|
+
lastStage,
|
|
876
|
+
stages,
|
|
686
877
|
};
|
|
687
878
|
}
|
|
688
879
|
|