@ogulcancelik/pi-web-browse 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +73 -0
- package/SKILL.md +101 -0
- package/lib/bot-protection.js +92 -0
- package/lib/browser-bin.js +133 -0
- package/lib/cdp.js +218 -0
- package/lib/daemon-client.js +134 -0
- package/lib/daemon.js +194 -0
- package/lib/debug-dump.js +76 -0
- package/lib/extract.js +58 -0
- package/lib/fetch.js +71 -0
- package/lib/http-fetch.js +41 -0
- package/lib/search.js +226 -0
- package/package.json +60 -0
- package/web-browse.js +651 -0
package/lib/search.js
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
import { load } from "cheerio";
|
|
2
|
+
|
|
3
|
+
/**
 * Parse result entries out of a DuckDuckGo HTML results page.
 *
 * For every `.result` element, the first `.result__a` anchor supplies the
 * title and href, and the first `.result__snippet` supplies the snippet text.
 * Hrefs that carry a `uddg=` parameter are unwrapped to the percent-decoded
 * value of that parameter. Entries without a title or link, and entries whose
 * final link still contains "duckduckgo.com", are dropped.
 *
 * @param {string} html - Raw HTML of the results page.
 * @param {number} num - Maximum number of results to collect.
 * @returns {{ title: string, link: string, snippet: string }[]} Results in document order.
 */
export function extractDuckDuckGoResults(html, num) {
  const $ = load(html);
  const results = [];

  $(".result").each((i, el) => {
    // Returning false from the callback stops the iteration early.
    if (results.length >= num) return false;

    const $el = $(el);
    const titleEl = $el.find(".result__a").first();
    const snippetEl = $el.find(".result__snippet").first();

    const title = titleEl.text().trim();
    const href = titleEl.attr("href");
    const snippet = snippetEl.text().trim();

    let link = href;
    if (href && href.includes("uddg=")) {
      const match = href.match(/uddg=([^&]+)/);
      if (match) {
        try {
          link = decodeURIComponent(match[1]);
        } catch {
          // decodeURIComponent throws URIError on malformed percent-encoding.
          // Skip just this entry instead of aborting the whole page parse.
          return;
        }
      }
    }

    if (title && link && !link.includes("duckduckgo.com")) {
      results.push({ title, link, snippet });
    }
  });

  return results;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Query the DuckDuckGo "lite" endpoint and scrape its result table.
 *
 * Each `a.result-link` anchor supplies the title and href; the snippet is read
 * from `.result-snippet` inside the table row that follows the anchor's row.
 * Hrefs carrying a `uddg=` redirect parameter are unwrapped to the decoded
 * target URL, matching what `extractDuckDuckGoResults` does for the HTML
 * endpoint, so both code paths return the same kind of link.
 *
 * @param {Function} httpFetch - Called as `httpFetch(url, { headers })`; must resolve to an
 *   object exposing `status`, `ok`, `statusText` and `text()`.
 * @param {object} headers - Request headers forwarded to `httpFetch`.
 * @param {string} query - Search terms (URL-encoded here).
 * @param {number} num - Maximum number of results to collect.
 * @returns {Promise<{ title: string, link: string, snippet: string }[]>} Results in document order.
 * @throws {Error} When the response status is 202 or the response is not `ok`.
 */
export async function searchDuckDuckGoLite(httpFetch, headers, query, num) {
  const url = `https://duckduckgo.com/lite/?q=${encodeURIComponent(query)}`;
  const response = await httpFetch(url, { headers });
  if (response.status === 202) throw new Error("DuckDuckGo returned 202 (blocked)");
  if (!response.ok) throw new Error(`Search failed: ${response.status} ${response.statusText}`);

  const html = await response.text();
  const $ = load(html);
  const results = [];

  $("a.result-link").each((i, el) => {
    // Returning false from the callback stops the iteration early.
    if (results.length >= num) return false;

    const $anchor = $(el);
    const title = $anchor.text().trim();
    const href = $anchor.attr("href");

    let link = href;
    if (href && href.includes("uddg=")) {
      const match = href.match(/uddg=([^&]+)/);
      if (match) {
        try {
          link = decodeURIComponent(match[1]);
        } catch {
          // Malformed percent-encoding: keep the raw href, as before this unwrapping existed.
          link = href;
        }
      }
    }

    const snippet = $anchor.closest("tr").next("tr").find(".result-snippet").text().trim();

    if (title && link) {
      results.push({ title, link, snippet: snippet || "" });
    }
  });

  return results;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Search DuckDuckGo through its HTML endpoint, retrying via the lite endpoint
 * when the HTML page yields no parsable results.
 *
 * @param {Function} httpFetch - Called as `httpFetch(url, { headers })`.
 * @param {object} headers - Request headers forwarded to `httpFetch`.
 * @param {string} query - Search terms (URL-encoded here).
 * @param {number} num - Maximum number of results to return.
 * @returns {Promise<{ title: string, link: string, snippet: string }[]>}
 * @throws {Error} When the HTML endpoint answers 202 or a non-ok status, or when
 *   the lite fallback throws.
 */
export async function searchDuckDuckGo(httpFetch, headers, query, num) {
  const endpoint = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`;
  const res = await httpFetch(endpoint, { headers });

  if (res.status === 202) {
    throw new Error("DuckDuckGo returned 202 (blocked)");
  }
  if (!res.ok) {
    throw new Error(`Search failed: ${res.status} ${res.statusText}`);
  }

  const body = await res.text();
  const parsed = extractDuckDuckGoResults(body, num);

  // Nothing parsed from the HTML endpoint: try the lite endpoint instead.
  return parsed.length === 0
    ? await searchDuckDuckGoLite(httpFetch, headers, query, num)
    : parsed;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Run a Google search in a fresh page of the given browser context and scrape
 * the results from the rendered DOM.
 *
 * Flow: open a page, navigate to the search URL, click through the consent
 * interstitial if the page landed on consent.google.com, wait for `h3`
 * headings, then collect `{ title, link, snippet }` from every frame. The page
 * is always closed before returning or throwing.
 *
 * @param {object} context - Object exposing `newPage()`. The page methods used
 *   here look like Playwright's BrowserContext/Page API — confirm against caller.
 * @param {string} query - Search terms (URL-encoded here).
 * @param {number} num - Desired result count. Coerced to a number, truncated and
 *   clamped to 1..20; values that are not finite numbers fall back to 10.
 * @returns {Promise<{ title: string, link: string, snippet: string }[]>} At most
 *   the clamped number of results.
 * @throws {Error} When no results were found and the page shows a captcha or
 *   block wording; errors from page creation/navigation also propagate.
 */
export async function searchGoogleFromContext(context, query, num) {
  // Without this guard an undefined/non-numeric `num` became NaN, which put
  // "&num=NaN" into the URL and made `results.slice(0, NaN)` return [] —
  // silently discarding every scraped result.
  const defaultNum = 10;
  const requestedNum = Number(num);
  const wholeNum = Number.isFinite(requestedNum) ? Math.trunc(requestedNum) : defaultNum;
  const clampedNum = Math.max(1, Math.min(wholeNum, 20));
  let page;

  try {
    page = await context.newPage();
    const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}&num=${clampedNum}&hl=en&gl=us&pws=0&safe=off`;

    await page.goto(searchUrl, { waitUntil: "domcontentloaded", timeout: 20000 });
    // Short randomized pause (200-499 ms) after navigation.
    await page.waitForTimeout(200 + Math.floor(Math.random() * 300));

    if (page.url().includes("consent.google.com")) {
      // Try each known consent-button selector; click the first one present.
      const consentButtons = [
        "button#L2AGLb",
        "button:has-text('I agree')",
        "button:has-text('Accept all')",
        "button:has-text('Accept')",
      ];

      for (const selector of consentButtons) {
        const button = page.locator(selector);
        if (await button.count()) {
          await button.first().click({ timeout: 5000, force: true });
          await page.waitForLoadState("domcontentloaded", { timeout: 15000 });
          break;
        }
      }

      // Re-issue the search after the consent step.
      await page.goto(searchUrl, { waitUntil: "domcontentloaded", timeout: 20000 });
      await page.waitForTimeout(200 + Math.floor(Math.random() * 300));
    }

    await page.waitForSelector("body", { timeout: 10000 });

    try {
      await page.waitForFunction(() => document.querySelectorAll("h3").length > 0, { timeout: 10000 });
    } catch {
      // allow fallthrough: a page without headings is diagnosed below
    }

    // Passed to frame.evaluate(), so it must be self-contained: it may not
    // reference any variable from this enclosing scope.
    const extractResultsFromDocument = () => {
      const items = [];
      const titleEls = Array.from(document.querySelectorAll("h3"));

      for (const titleEl of titleEls) {
        const title = titleEl.textContent?.trim();
        const linkEl = titleEl.closest("a[href]");
        const link = linkEl?.getAttribute("href");

        if (!title || !link) continue;

        // Unwrap "/url?q=<target>" redirect links to their target.
        let finalLink = link;
        if (link.startsWith("/url?")) {
          try {
            const url = new URL(`https://www.google.com${link}`);
            finalLink = url.searchParams.get("q") || link;
          } catch {
            finalLink = link;
          }
        }

        // Drop site-relative links and anything mentioning google.com.
        if (finalLink.startsWith("/") || finalLink.includes("google.com")) continue;

        let snippet = "";
        const container =
          linkEl.closest("div.MjjYud, div.g, div[data-snf], div[data-sncf]") || linkEl.parentElement?.parentElement;

        if (container) {
          const snippetEl = container.querySelector(".VwiC3b, .yXK7lf, .lEBKkf, span.aCOpRe");
          snippet = snippetEl?.textContent?.trim() || "";

          if (!snippet) {
            // Fallback: first span with more than 40 characters that is not the title.
            const spans = Array.from(container.querySelectorAll("span"))
              .map((el) => el.textContent?.trim() || "")
              .filter((text) => text.length > 40 && text !== title);
            snippet = spans[0] || "";
          }
        }

        items.push({ title, link: finalLink, snippet });
      }

      return items;
    };

    const results = [];
    for (const frame of page.frames()) {
      try {
        const frameResults = await frame.evaluate(extractResultsFromDocument);
        results.push(...frameResults);
      } catch {
        // ignore: best-effort per frame, a failing frame must not lose the others
      }
    }

    if (results.length === 0) {
      const diagnostics = await page.evaluate(() => ({
        title: document.title || "",
        text: document.body?.innerText?.slice(0, 500) || "",
        bodyHtmlSnippet: document.body?.innerHTML?.slice(0, 500) || "",
        hasCaptcha: Boolean(document.querySelector("#captcha-form, form[action*='sorry'], .g-recaptcha")),
        resultCount: document.querySelectorAll("h3").length,
        searchBoxCount: document.querySelectorAll("input[name='q'], textarea[name='q']").length,
      }));

      const blockedSignals = ["unusual traffic", "before you continue", "sorry", "detected", "our systems"];
      if (diagnostics.hasCaptcha || blockedSignals.some((signal) => diagnostics.text.toLowerCase().includes(signal))) {
        throw new Error(`Google blocked automated access (${diagnostics.title || page.url()})`);
      }

      console.error(
        `Google returned zero results (url=${page.url()}, title=${diagnostics.title}, results=${diagnostics.resultCount}, searchBoxes=${diagnostics.searchBoxCount})`,
      );
      console.error(`Google body snippet: ${diagnostics.bodyHtmlSnippet}`);
    }

    return results.slice(0, clampedNum);
  } finally {
    if (page && !page.isClosed()) {
      await page.close().catch(() => {});
    }
  }
}
|
|
194
|
+
|
|
195
|
+
/**
 * Top-level search entry point: Google through the browser context first,
 * DuckDuckGo over plain HTTP when Google throws or comes back empty.
 *
 * Failures of either engine are reported through `log` and never rethrown, so
 * the caller always receives an array (possibly empty).
 *
 * @param {object} [options]
 * @param {object} options.context - Browser context handed to `searchGoogleFromContext`.
 * @param {Function} options.httpFetch - Fetch function handed to `searchDuckDuckGo`.
 * @param {object} options.headers - Request headers handed to `searchDuckDuckGo`.
 * @param {string} options.query - Search terms.
 * @param {number} options.numResults - Desired number of results.
 * @param {Function} [options.log] - Message sink; defaults to `console.error`.
 * @returns {Promise<{ title: string, link: string, snippet: string }[]>}
 */
export async function searchWebFromContext({
  context,
  httpFetch,
  headers,
  query,
  numResults,
  log = (msg) => console.error(msg),
} = {}) {
  const describe = (failure) => (failure instanceof Error ? failure.message : String(failure));

  let found = [];

  try {
    found = await searchGoogleFromContext(context, query, numResults);
  } catch (googleError) {
    log(`Google search failed: ${describe(googleError)}`);
  }

  if (found.length !== 0) {
    return found;
  }

  log("Google returned no results. Falling back to DuckDuckGo...");

  try {
    return await searchDuckDuckGo(httpFetch, headers, query, numResults);
  } catch (duckError) {
    log(`DuckDuckGo search failed: ${describe(duckError)}`);
    return found;
  }
}
|
package/package.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@ogulcancelik/pi-web-browse",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Web search and content extraction skill for pi-coding-agent. Search the web and fetch pages via a real headless browser (CDP). Works on Linux, macOS, and Windows.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "web-browse.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"web-browse": "./web-browse.js"
|
|
9
|
+
},
|
|
10
|
+
"scripts": {
|
|
11
|
+
"test": "node --test"
|
|
12
|
+
},
|
|
13
|
+
"keywords": [
|
|
14
|
+
"pi-package",
|
|
15
|
+
"pi-skill",
|
|
16
|
+
"agent-skill",
|
|
17
|
+
"web-search",
|
|
18
|
+
"web-scraping",
|
|
19
|
+
"browser-automation",
|
|
20
|
+
"cdp",
|
|
21
|
+
"headless-browser",
|
|
22
|
+
"playwright"
|
|
23
|
+
],
|
|
24
|
+
"author": "Can Celik",
|
|
25
|
+
"license": "MIT",
|
|
26
|
+
"repository": {
|
|
27
|
+
"type": "git",
|
|
28
|
+
"url": "git+https://github.com/ogulcancelik/pi-web-browse.git"
|
|
29
|
+
},
|
|
30
|
+
"bugs": {
|
|
31
|
+
"url": "https://github.com/ogulcancelik/pi-web-browse/issues"
|
|
32
|
+
},
|
|
33
|
+
"homepage": "https://github.com/ogulcancelik/pi-web-browse#readme",
|
|
34
|
+
"engines": {
|
|
35
|
+
"node": ">=18.0.0"
|
|
36
|
+
},
|
|
37
|
+
"os": [
|
|
38
|
+
"linux",
|
|
39
|
+
"darwin",
|
|
40
|
+
"win32"
|
|
41
|
+
],
|
|
42
|
+
"pi": {
|
|
43
|
+
"skills": ["./SKILL.md"]
|
|
44
|
+
},
|
|
45
|
+
"files": [
|
|
46
|
+
"web-browse.js",
|
|
47
|
+
"lib/",
|
|
48
|
+
"SKILL.md",
|
|
49
|
+
"README.md",
|
|
50
|
+
"LICENSE"
|
|
51
|
+
],
|
|
52
|
+
"dependencies": {
|
|
53
|
+
"@mozilla/readability": "^0.6.0",
|
|
54
|
+
"cheerio": "^1.1.2",
|
|
55
|
+
"jsdom": "^27.4.0",
|
|
56
|
+
"playwright": "^1.53.2",
|
|
57
|
+
"turndown": "^7.2.2",
|
|
58
|
+
"undici": "^7.18.0"
|
|
59
|
+
}
|
|
60
|
+
}
|