@alexion42/pi-web-search 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ {
2
+ "nextId": 4,
3
+ "tasks": []
4
+ }
package/CHANGELOG.md ADDED
@@ -0,0 +1,18 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ ## [0.1.0] - 2026-05-24
6
+
7
+ Initial release as `@alexion42/pi-web-search`.
8
+
9
+ Lean fork of [nicobailon/pi-web-access](https://github.com/nicobailon/pi-web-access). Stripped out: Perplexity, Gemini API, Gemini Web, YouTube/video analysis, browser-cookie auth, curator UI, summary review, and librarian skill. What remains: Exa-only search, GitHub cloning, PDF extraction, and URL fetching via Readability → RSC → Jina.
10
+
11
+ ### Included
12
+
13
+ - `web_search` — Exa search with synthesized answers (direct API or zero-config MCP)
14
+ - `code_search` — Code/docs search via Exa MCP
15
+ - `fetch_content` — URL content extraction with GitHub cloning and PDF support
16
+ - `get_search_content` — Retrieve stored search/fetch results
17
+ - `/search` — Interactive command to browse stored results
18
+ - Activity monitor (`Ctrl+Shift+W`)
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Nico Bailon
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,88 @@
1
+ # Pi Web Search
2
+
3
+ Lean web search for Pi, powered by Exa — a slimmed-down fork of [nicobailon/pi-web-access](https://github.com/nicobailon/pi-web-access).
4
+
5
+ [![npm version](https://img.shields.io/npm/v/@alexion42/pi-web-search?style=for-the-badge)](https://www.npmjs.com/package/@alexion42/pi-web-search)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT)
7
+
8
+ ## Install
9
+
10
+ ```bash
11
+ pi install npm:@alexion42/pi-web-search
12
+ ```
13
+
14
+ Works immediately with no API keys — Exa MCP provides zero-config search. For direct API access, add your key to `~/.pi/web-search.json`:
15
+
16
+ ```json
17
+ {
18
+ "exaApiKey": "exa-..."
19
+ }
20
+ ```
21
+
22
+ Requires Pi v0.37.3+.
23
+
24
+ ## What's Available
25
+
26
+ | Tool | Description |
27
+ |------|-------------|
28
+ | `web_search` | Search the web via Exa with synthesized answers and source citations |
29
+ | `code_search` | Search for code examples, docs, and API references via Exa MCP |
30
+ | `fetch_content` | Extract readable content from URLs, GitHub repos (cloned locally), and PDFs |
31
+ | `get_search_content` | Retrieve stored content from previous searches or fetches |
32
+ | `/search` | Interactive command to browse stored search results |
33
+ | Activity monitor | `Ctrl+Shift+W` to view live request/response activity |
34
+
35
+ Content extraction uses a robust fallback chain: Readability → RSC parser → Jina Reader. Full parameter reference and examples are in [TOOLS.md](./TOOLS.md).
36
+
37
+ ## Configuration
38
+
39
+ All config lives in `~/.pi/web-search.json`. Every field is optional.
40
+
41
+ ```json
42
+ {
43
+ "exaApiKey": "exa-...",
44
+ "githubClone": {
45
+ "enabled": true,
46
+ "maxRepoSizeMB": 350,
47
+ "cloneTimeoutSeconds": 30,
48
+ "clonePath": "/tmp/pi-github-repos"
49
+ },
50
+ "shortcuts": {
51
+ "activity": "ctrl+shift+w"
52
+ }
53
+ }
54
+ ```
55
+
56
+ `EXA_API_KEY` env var takes precedence over config file values. Config changes require a Pi restart.
57
+
58
+ ## How It Works
59
+
60
+ ```
61
+ web_search(query)
62
+ → Exa (direct API with key, MCP without)
63
+
64
+ fetch_content(url)
65
+ → GitHub URL? Clone repo, return file contents + local path
66
+ → HTTP fetch → PDF? Extract text, save to ~/Downloads/
67
+ → HTML? Readability → RSC parser → Jina Reader
68
+ → Text/JSON/Markdown? Return directly
69
+ ```
70
+
71
+ ## Limitations
72
+
73
+ - PDFs are text-extracted only (no OCR for scanned documents).
74
+ - GitHub branch names that contain slashes may cause file paths to resolve incorrectly; the clone still succeeds, and the agent can navigate the repository manually.
75
+ - Non-code GitHub URLs (issues, PRs, wiki) fall through to normal web extraction.
76
+
77
+ ## Comparison with Other Pi Search Extensions
78
+
79
+ | Extension | Backends | Differentiator |
80
+ |-----------|----------|----------------|
81
+ | [nicobailon/pi-web-access](https://github.com/nicobailon/pi-web-access) | Perplexity, Gemini, Exa | Upstream — curator, YouTube, video |
82
+ | [ronnieops/pi-search-hub](https://github.com/ronnieops/pi-search-hub) | 12 backends | RRF combine mode, auto-fallback |
83
+ | [code-yeongyu/pi-websearch](https://github.com/code-yeongyu/pi-websearch) | 11 backends + native OpenAI/Anthropic | Provider routing, keyless DDG |
84
+ | [ayagmar/pi-codex-web-search](https://github.com/ayagmar/pi-codex-web-search) | OpenAI Codex CLI | Wraps local `codex` CLI |
85
+ | [iaptsiauri/pi-surf](https://github.com/iaptsiauri/pi-surf) | Brave + custom providers | Scout subagent, pluggable providers |
86
+ | [NicoAvanzDev/pi-web-extension](https://github.com/NicoAvanzDev/pi-web-extension) | Brave, DDG (keyless) | Prompt steering, token-aware |
87
+
88
+ This project's niche: **GitHub cloning + Exa MCP zero-config in a lean package**. No multi-provider routing, no browser UI, no video.
package/TOOLS.md ADDED
@@ -0,0 +1,103 @@
1
+ # Tools Reference
2
+
3
+ Detailed parameter reference and usage examples for all tools provided by Pi Web Search.
4
+
5
+ ## web_search
6
+
7
+ Search the web via Exa. Returns an AI-synthesized answer with source citations.
8
+
9
+ ```typescript
10
+ web_search({ query: "TypeScript best practices 2025" })
11
+ web_search({ queries: ["query 1", "query 2"] })
12
+ web_search({ query: "latest news", numResults: 10, recencyFilter: "week" })
13
+ web_search({ query: "...", domainFilter: ["github.com"] })
14
+ web_search({ query: "...", includeContent: true })
15
+ ```
16
+
17
+ | Parameter | Type | Description |
18
+ |-----------|------|-------------|
19
+ | `query` | `string` | Single search query. For research tasks, prefer `queries` with multiple varied angles instead. |
20
+ | `queries` | `string[]` | Multiple queries searched in sequence, each returning its own synthesized answer. |
21
+ | `numResults` | `number` | Results per query (default: 5, max: 20) |
22
+ | `recencyFilter` | `string` | Filter by recency: `day`, `week`, `month`, or `year` |
23
+ | `domainFilter` | `string[]` | Limit to specific domains (prefix with `-` to exclude, e.g. `["-twitter.com"]`) |
24
+ | `includeContent` | `boolean` | Fetch full page content from sources in background |
25
+
26
+ **Tips:** For comprehensive research, use 2-4 varied queries instead of one broad query. Each query gets its own synthesized answer.
27
+
28
+ ## code_search
29
+
30
+ Search for code examples, documentation, and API references via Exa MCP. No API key required.
31
+
32
+ ```typescript
33
+ code_search({ query: "React useEffect cleanup pattern" })
34
+ code_search({ query: "Express middleware error handling", maxTokens: 10000 })
35
+ ```
36
+
37
+ | Parameter | Type | Description |
38
+ |-----------|------|-------------|
39
+ | `query` | `string` | Programming question, API, library, or debugging topic to search for |
40
+ | `maxTokens` | `number` | Maximum tokens of code/documentation context to return (default: 5000, max: 50000) |
41
+
42
+ ## fetch_content
43
+
44
+ Fetch URL(s) and extract readable content as markdown. Automatically detects and handles GitHub repos, PDFs, and regular web pages.
45
+
46
+ ```typescript
47
+ fetch_content({ url: "https://example.com/article" })
48
+ fetch_content({ urls: ["url1", "url2", "url3"] })
49
+ fetch_content({ url: "https://github.com/owner/repo" })
50
+ fetch_content({ url: "https://example.com/doc.pdf" })
51
+ ```
52
+
53
+ | Parameter | Type | Description |
54
+ |-----------|------|-------------|
55
+ | `url` | `string` | Single URL to fetch |
56
+ | `urls` | `string[]` | Multiple URLs (fetched in parallel) |
57
+ | `forceClone` | `boolean` | Force cloning large GitHub repositories that exceed the size threshold |
58
+
59
+ **GitHub repos:** GitHub URLs are cloned locally instead of scraped. The agent gets real file contents and a local path to explore with `read` and `bash`. Root URLs return the repo tree + README, `/tree/` paths return directory listings, `/blob/` paths return file contents. Repos over 350MB get a lightweight API-based view (override with `forceClone: true`).
60
+
61
+ **PDFs:** PDF URLs are extracted as text and saved to `~/Downloads/` as markdown. Text-based extraction only — no OCR.
62
+
63
+ **Fallback chain:** Readability → RSC parser (Next.js) → Jina Reader (JS-rendered pages). Handles SPAs, JS-heavy pages, and anti-bot protections transparently.
64
+
65
+ ## get_search_content
66
+
67
+ Retrieve stored content from previous `web_search` or `fetch_content` calls. Content over 30,000 chars is truncated in tool responses but stored in full for retrieval here.
68
+
69
+ ```typescript
70
+ get_search_content({ responseId: "abc123", urlIndex: 0 })
71
+ get_search_content({ responseId: "abc123", url: "https://..." })
72
+ get_search_content({ responseId: "abc123", query: "original query" })
73
+ ```
74
+
75
+ | Parameter | Type | Description |
76
+ |-----------|------|-------------|
77
+ | `responseId` | `string` | The response ID from a previous `web_search` or `fetch_content` call |
78
+ | `query` | `string` | Get content for this specific query (from `web_search` results) |
79
+ | `queryIndex` | `number` | Get content for query at this index (0-based) |
80
+ | `url` | `string` | Get content for this specific URL (from `fetch_content` results) |
81
+ | `urlIndex` | `number` | Get content for URL at this index (0-based) |
82
+
83
+ ## /search
84
+
85
+ Interactive command to browse stored search results from the current session. Lists all results with their response IDs for easy retrieval. Supports viewing details and deleting results.
86
+
87
+ ```
88
+ /search
89
+ ```
90
+
91
+ ## Activity Monitor
92
+
93
+ Toggle with `Ctrl+Shift+W` (configurable via `shortcuts.activity` in config) to see live request/response activity:
94
+
95
+ ```
96
+ ─── Web Search Activity ────────────────────────────────────
97
+ API "typescript best practices" 200 2.1s ✓
98
+ GET docs.example.com/article 200 0.8s ✓
99
+ GET blog.example.com/post 404 0.3s ✗
100
+ ────────────────────────────────────────────────────────────
101
+ ```
102
+
103
+ Shows the last 10 API calls and URL fetches with status codes, timing, and rate limit usage. Auto-clears on session switch.
package/activity.ts ADDED
@@ -0,0 +1,101 @@
1
// Types

/** One tracked operation (an API call or a URL fetch) recorded by the activity monitor. */
export interface ActivityEntry {
  /** Identifier assigned by the monitor when the entry is created. */
  id: string;
  /** Whether the entry tracks an API call or a URL fetch. */
  type: "api" | "fetch";
  /** Epoch milliseconds at which the operation started. */
  startTime: number;
  /** Epoch milliseconds at which the operation finished; absent while still pending. */
  endTime?: number;

  /** For API calls: the query that was sent. */
  query?: string;

  /** For URL fetches: the URL that was requested. */
  url?: string;

  /** Result: a number (HTTP code), or null (pending/network error). */
  status: number | null;
  /** Error message recorded when the operation failed. */
  error?: string;
}
18
+
19
/** Snapshot of rate-limit usage held by the activity monitor. */
export interface RateLimitInfo {
  /** Number of requests counted as used — presumably within the current window; confirm against the producer. */
  used: number;
  /** Maximum number of requests allowed. */
  max: number;
  /** Timestamp of the oldest counted request, or null when there is none. NOTE(review): units look like epoch ms — confirm. */
  oldestTimestamp: number | null;
  /** Length of the rate-limit window, in milliseconds. */
  windowMs: number;
}
25
+
26
/**
 * In-memory log of the most recent API calls and URL fetches, together with
 * the latest rate-limit snapshot. Listeners registered through `onUpdate` are
 * invoked after every state change.
 */
export class ActivityMonitor {
  private entries: ActivityEntry[] = [];
  private readonly maxEntries = 10;
  private readonly listeners = new Set<() => void>();
  private rateLimitInfo: RateLimitInfo = ActivityMonitor.defaultRateLimitInfo();
  private nextId = 1;

  /** Builds a fresh default rate-limit snapshot so the initial state and `clear()` cannot drift apart. */
  private static defaultRateLimitInfo(): RateLimitInfo {
    return { used: 0, max: 10, oldestTimestamp: null, windowMs: 60000 };
  }

  /**
   * Records the start of an operation and notifies listeners.
   *
   * @param partial Entry fields supplied by the caller (type plus query or url).
   * @returns The generated entry id, to be passed to `logComplete` or `logError`.
   */
  logStart(partial: Omit<ActivityEntry, "id" | "startTime" | "status">): string {
    const id = `act-${this.nextId++}`;
    const entry: ActivityEntry = {
      ...partial,
      id,
      startTime: Date.now(),
      status: null,
    };
    this.entries.push(entry);
    // Keep only the newest `maxEntries` entries; the oldest one is dropped first.
    if (this.entries.length > this.maxEntries) {
      this.entries.shift();
    }
    this.notify();
    return id;
  }

  /**
   * Marks an entry as finished with the given status code.
   * Does nothing when the entry is no longer tracked (evicted or cleared).
   */
  logComplete(id: string, status: number): void {
    const entry = this.entries.find((e) => e.id === id);
    if (entry) {
      entry.endTime = Date.now();
      entry.status = status;
      this.notify();
    }
  }

  /**
   * Marks an entry as failed with an error message; its status stays `null`.
   * Does nothing when the entry is no longer tracked (evicted or cleared).
   */
  logError(id: string, error: string): void {
    const entry = this.entries.find((e) => e.id === id);
    if (entry) {
      entry.endTime = Date.now();
      entry.error = error;
      this.notify();
    }
  }

  /** Returns the tracked entries, oldest first. The array is the live internal list. */
  getEntries(): readonly ActivityEntry[] {
    return this.entries;
  }

  /** Returns the most recently stored rate-limit snapshot. */
  getRateLimitInfo(): RateLimitInfo {
    return this.rateLimitInfo;
  }

  /** Replaces the rate-limit snapshot and notifies listeners. */
  updateRateLimit(info: RateLimitInfo): void {
    this.rateLimitInfo = info;
    this.notify();
  }

  /**
   * Registers a listener that is called after every change.
   *
   * @returns A function that unregisters the listener.
   */
  onUpdate(callback: () => void): () => void {
    this.listeners.add(callback);
    return () => this.listeners.delete(callback);
  }

  /** Drops all entries, resets the rate-limit snapshot to its defaults and notifies listeners. */
  clear(): void {
    this.entries = [];
    this.rateLimitInfo = ActivityMonitor.defaultRateLimitInfo();
    this.notify();
  }

  /** Invokes every registered listener once, isolating failures between them. */
  private notify(): void {
    // Iterate over a snapshot: a listener that registers another listener while
    // being notified must not have the new one invoked in this same pass
    // (iterating the live Set would visit it, and could loop without end).
    for (const listener of [...this.listeners]) {
      // Honor unsubscriptions that happened earlier in this pass.
      if (!this.listeners.has(listener)) continue;
      try {
        listener();
      } catch {
        // Deliberate best-effort: one failing listener must not break the
        // operation being logged or starve the remaining listeners.
      }
    }
  }
}

export const activityMonitor = new ActivityMonitor();
package/banner.png ADDED
Binary file
package/code-search.ts ADDED
@@ -0,0 +1,107 @@
1
+ import { activityMonitor } from "./activity.js";
2
+ import { callExaMcp } from "./exa.js";
3
+
4
/** Name of the Exa MCP tool that code_search tries first. */
const CODE_CONTEXT_TOOL = "get_code_context_exa";
/** Name of the Exa MCP tool used when the code-context tool is unavailable. */
const WEB_SEARCH_TOOL = "web_search_exa";
/** Token budget applied when the caller does not pass `maxTokens`. */
const DEFAULT_MAX_TOKENS = 5000;

/** Set once a "tool not found" error is seen, so later searches go straight to the fallback. */
let codeContextToolMissing = false;
9
+
10
/**
 * Reports whether an error message says that the requested MCP tool does not exist.
 * The check is case-insensitive and requires both "tool" and "not found" to appear.
 */
function isMissingMcpToolError(message: string): boolean {
  const lowered = message.toLowerCase();
  const requiredFragments = ["tool", "not found"];
  return requiredFragments.every((fragment) => lowered.includes(fragment));
}
14
+
15
/**
 * Prepares a query for the web-search fallback. Queries that already mention a
 * code-related term are returned untouched; all others get code-oriented
 * keywords appended.
 */
function buildFallbackQuery(query: string): string {
  const codeTermPattern = /\b(api|code|docs?|documentation|example|github|implementation|library|source|stackoverflow|stack overflow)\b/;
  if (codeTermPattern.test(query.toLowerCase())) {
    return query;
  }
  return `${query} code examples documentation GitHub Stack Overflow official docs`;
}
20
+
21
/** Converts a token budget into a result count: one result per 1000 tokens (rounded up), kept within 5 to 20. */
function maxTokensToResultCount(maxTokens: number): number {
  const perThousandTokens = Math.ceil(maxTokens / 1000);
  const atLeastFive = Math.max(5, perThousandTokens);
  return Math.min(20, atLeastFive);
}
24
+
25
/**
 * Truncates `text` to roughly `maxTokens` tokens, estimating four characters
 * per token and never cutting below 1000 characters.
 *
 * @param text Text to limit.
 * @param maxTokens Approximate token budget.
 * @returns `text` unchanged when it fits; otherwise the truncated text followed by a truncation notice.
 */
function trimApproxTokens(text: string, maxTokens: number): string {
  const maxCharacters = Math.max(1000, maxTokens * 4);
  if (text.length <= maxCharacters) return text;

  let cutIndex = Math.floor(maxCharacters);
  // Do not split a surrogate pair: a lone high surrogate at the cut point is
  // invalid UTF-16 and turns into U+FFFD once the text is serialized.
  const lastUnit = text.charCodeAt(cutIndex - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    cutIndex -= 1;
  }

  return `${text.slice(0, cutIndex).trimEnd()}\n\n[Truncated by code_search to approximately ${maxTokens} tokens.]`;
}
30
+
31
/**
 * Runs the code search through the general Exa web-search MCP tool.
 * The query is enriched with code-related keywords when needed, the result
 * count and context size are derived from `maxTokens`, and the returned text
 * is trimmed to the approximate token budget.
 */
async function executeFallbackSearch(query: string, maxTokens: number, signal?: AbortSignal): Promise<string> {
  const fallbackQuery = buildFallbackQuery(query);
  const resultCount = maxTokensToResultCount(maxTokens);
  // Four characters per token, kept within 1000 to 50000 characters.
  const characterBudget = Math.min(50000, Math.max(1000, maxTokens * 4));

  const rawText = await callExaMcp(
    WEB_SEARCH_TOOL,
    {
      query: fallbackQuery,
      numResults: resultCount,
      livecrawl: "fallback",
      type: "auto",
      contextMaxCharacters: characterBudget,
    },
    signal,
  );

  return trimApproxTokens(rawText, maxTokens);
}
45
+
46
/**
 * Executes the `code_search` tool.
 *
 * Tries the Exa code-context MCP tool first. When that tool is reported as
 * missing, the fact is remembered for later calls and the general web-search
 * tool is used instead. Every call is recorded in the activity monitor.
 *
 * @param _toolCallId Unused.
 * @param params Search query and optional token budget. A missing, non-finite
 *   or non-positive `maxTokens` falls back to the default budget.
 * @param signal Optional abort signal forwarded to the MCP call.
 * @returns Tool output text plus details (query, token budget, and either the
 *   mode used or the error message).
 * @throws Rethrows the underlying error when the request was aborted.
 */
export async function executeCodeSearch(
  _toolCallId: string,
  params: { query: string; maxTokens?: number },
  signal?: AbortSignal,
): Promise<{
  content: Array<{ type: "text"; text: string }>;
  details: { query: string; maxTokens: number; error?: string; mode?: "code-context" | "web-search-fallback" };
}> {
  // Tool arguments originate from a model, so guard against values the static
  // types do not allow: NaN or non-positive budgets, and a non-string query.
  const requestedTokens = params.maxTokens;
  const maxTokens =
    typeof requestedTokens === "number" && Number.isFinite(requestedTokens) && requestedTokens > 0
      ? requestedTokens
      : DEFAULT_MAX_TOKENS;

  const query = typeof params.query === "string" ? params.query.trim() : "";
  if (!query) {
    return {
      content: [{ type: "text", text: "Error: No query provided." }],
      details: { query: "", maxTokens, error: "No query provided" },
    };
  }

  const activityId = activityMonitor.logStart({ type: "api", query });

  try {
    let mode: "code-context" | "web-search-fallback" = "web-search-fallback";
    let text: string;

    if (codeContextToolMissing) {
      text = await executeFallbackSearch(query, maxTokens, signal);
    } else {
      try {
        text = await callExaMcp(
          CODE_CONTEXT_TOOL,
          {
            query,
            tokensNum: maxTokens,
          },
          signal,
        );
        mode = "code-context";
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        // Only a missing tool triggers the fallback; any other failure is
        // handled by the outer catch.
        if (!isMissingMcpToolError(message)) throw err;
        codeContextToolMissing = true;
        text = await executeFallbackSearch(query, maxTokens, signal);
      }
    }

    activityMonitor.logComplete(activityId, 200);
    return {
      content: [{ type: "text", text }],
      details: { query, maxTokens, mode },
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    // Detect cancellation from the signal and the error name as well as the
    // message: an abort with a custom reason need not contain the word "abort".
    const aborted =
      signal?.aborted === true ||
      (err instanceof Error && err.name === "AbortError") ||
      message.toLowerCase().includes("abort");
    if (aborted) {
      // Status 0 marks the entry as finished without an HTTP result.
      activityMonitor.logComplete(activityId, 0);
      throw err;
    }
    activityMonitor.logError(activityId, message);
    return {
      content: [{ type: "text", text: `Error: ${message}` }],
      details: { query, maxTokens, error: message },
    };
  }
}