pi-web-access 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,96 @@
1
+ # Pi Web Access - Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ ## [Unreleased]
6
+
7
+ ## [0.4.2] - 2026-01-27
8
+
9
+ ### Fixed
10
+
11
+ - Single-URL fetches now store content for retrieval via `get_search_content` (previously only multi-URL)
12
+ - Corrected `get_search_content` usage syntax in fetch_content help messages
13
+
14
+ ### Changed
15
+
16
+ - Increased inline content limit from 10K to 30K chars (larger content truncated but fully retrievable)
17
+
18
+ ### Added
19
+
20
+ - Banner image for README
21
+
22
+ ## [0.4.1] - 2026-01-26
23
+
24
+ ### Changed
25
+ - Added `pi` manifest to package.json for pi v0.50.0 package system compliance
26
+ - Added `pi-package` keyword for npm discoverability
27
+
28
+ ## [0.4.0] - 2026-01-19
29
+
30
+ ### Added
31
+
32
+ - PDF extraction via `unpdf` - fetches PDFs from URLs and saves as markdown to `~/Downloads/`
33
+ - Extracts text, metadata (title, author), page count
34
+ - Supports PDFs up to 20MB (vs 5MB for HTML)
35
+ - Handles arxiv URLs with smart title fallback
36
+
37
+ ### Fixed
38
+
39
+ - Plain text URL detection now uses hostname check instead of substring match
40
+
41
+ ## [0.3.0] - 2026-01-19
42
+
43
+ ### Added
44
+
45
+ - RSC (React Server Components) content extraction for Next.js App Router pages
46
+ - Parses flight data from `<script>self.__next_f.push([...])</script>` tags
47
+ - Reconstructs markdown with headings, tables, code blocks, links
48
+ - Handles chunk references and nested components
49
+ - Falls back to RSC extraction when Readability fails
50
+ - Content-type validation rejects binary files (images, PDFs, audio, video, zip)
51
+ - 5MB response size limit (checked via Content-Length header) to prevent memory issues
52
+
53
+ ### Fixed
54
+
55
+ - `fetch_content` now handles plain text URLs (raw.githubusercontent.com, gist.githubusercontent.com, any text/plain response) instead of failing with "Could not extract readable content"
56
+
57
+ ## [0.2.0] - 2026-01-11
58
+
59
+ ### Added
60
+
61
+ - Activity monitor widget (`Ctrl+Shift+O`) showing live request/response activity
62
+ - Displays last 10 API calls and URL fetches with status codes and timing
63
+ - Shows rate limit usage and reset countdown
64
+ - Live updates as requests complete
65
+ - Auto-clears on session switch
66
+
67
+ ### Changed
68
+
69
+ - Refactored activity tracking into dedicated `activity.ts` module
70
+
71
+ ## [0.1.0] - 2026-01-06
72
+
73
+ Initial release. Designed for pi v0.37.3.
74
+
75
+ ### Added
76
+
77
+ - `web_search` tool - Search via Perplexity AI with synthesized answers and citations
78
+ - Single or batch queries (parallel execution)
79
+ - Recency filter (day/week/month/year)
80
+ - Domain filter (include or exclude)
81
+ - Optional async content fetching with agent notification
82
+ - `fetch_content` tool - Fetch and extract readable content from URLs
83
+ - Single URL returns content directly
84
+ - Multiple URLs store for retrieval via `get_search_content`
85
+ - Concurrent fetching (3 max) with 30s timeout
86
+ - `get_search_content` tool - Retrieve stored search results or fetched content
87
+ - Access by response ID, URL, query, or index
88
+ - `/search` command - Interactive browser for stored results
89
+ - TUI rendering with progress bars, URL lists, and expandable previews
90
+ - Session-aware storage with 1-hour TTL
91
+ - Rate limiting (10 req/min for Perplexity API)
92
+ - Config file support (`~/.pi/web-search.json`)
93
+ - Content extraction via Readability + Turndown (max 10k chars)
94
+ - Proper session isolation - pending fetches abort on session switch
95
+ - URL validation before fetch attempts
96
+ - Defensive JSON parsing for API responses
package/README.md ADDED
@@ -0,0 +1,179 @@
1
+ <p>
2
+ <img src="banner.png" alt="pi-web-access" width="1100">
3
+ </p>
4
+
5
+ # Pi Web Access
6
+
7
+ An extension for [Pi coding agent](https://github.com/badlogic/pi-mono/) that gives Pi web capabilities: search via Perplexity AI, fetch and extract content from URLs, and read PDFs.
8
+
9
+ ```typescript
10
+ web_search({ query: "TypeScript best practices 2025" })
11
+ fetch_content({ url: "https://docs.example.com/guide" })
12
+ ```
13
+
14
+ ## Install
15
+
16
+ ```bash
17
+ # Clone to extensions directory
18
+ git clone https://github.com/nicobailon/pi-web-access ~/.pi/agent/extensions/pi-web-access
19
+ cd ~/.pi/agent/extensions/pi-web-access
20
+ npm install
21
+ ```
22
+
23
+ Add your Perplexity API key:
24
+
25
+ ```bash
26
+ # Option 1: Environment variable
27
+ export PERPLEXITY_API_KEY="pplx-..."
28
+
29
+ # Option 2: Config file
30
+ echo '{"perplexityApiKey": "pplx-..."}' > ~/.pi/web-search.json
31
+ ```
32
+
33
+ Get a key at https://perplexity.ai/settings/api
34
+
35
+ **Requires:** Pi v0.37.3+
36
+
37
+ ## Tools
38
+
39
+ ### web_search
40
+
41
+ Search the web via Perplexity AI. Returns synthesized answer with source citations.
42
+
43
+ ```typescript
44
+ // Single query
45
+ web_search({ query: "rust async programming" })
46
+
47
+ // Multiple queries (parallel)
48
+ web_search({ queries: ["query 1", "query 2"] })
49
+
50
+ // With options
51
+ web_search({
52
+ query: "latest news",
53
+ numResults: 10, // Default: 5, max: 20
54
+ recencyFilter: "week", // day, week, month, year
55
+ domainFilter: ["github.com"] // Prefix with - to exclude
56
+ })
57
+
58
+ // Fetch full page content (async)
59
+ web_search({ query: "...", includeContent: true })
60
+ ```
61
+
62
+ When `includeContent: true`, sources are fetched in the background. Agent receives notification when ready.
63
+
64
+ ### fetch_content
65
+
66
+ Fetch URL(s) and extract readable content as markdown.
67
+
68
+ ```typescript
69
+ // Single URL - returns content directly (also stored for retrieval)
70
+ fetch_content({ url: "https://example.com/article" })
71
+
72
+ // Multiple URLs - returns summary (content stored for retrieval)
73
+ fetch_content({ urls: ["url1", "url2", "url3"] })
74
+
75
+ // PDFs - extracted and saved to ~/Downloads/
76
+ fetch_content({ url: "https://arxiv.org/pdf/1706.03762" })
77
+ // → "PDF extracted and saved to: ~/Downloads/arxiv-170603762.md"
78
+ ```
79
+
80
+ **PDF handling:** When fetching a PDF URL, the extension extracts text and saves it as a markdown file in `~/Downloads/`. The agent can then use `read` to access specific sections without loading 200K+ chars into context.
81
+
82
+ ### get_search_content
83
+
84
+ Retrieve stored content from previous searches or fetches.
85
+
86
+ ```typescript
87
+ // By response ID (from web_search or fetch_content)
88
+ get_search_content({ responseId: "abc123", urlIndex: 0 })
89
+
90
+ // By URL
91
+ get_search_content({ responseId: "abc123", url: "https://..." })
92
+
93
+ // By query (for search results)
94
+ get_search_content({ responseId: "abc123", query: "original query" })
95
+ ```
96
+
97
+ ## Features
98
+
99
+ ### Activity Monitor (Ctrl+Shift+O)
100
+
101
+ Toggle a live view of recent request/response activity:
102
+
103
+ ```
104
+ ─── Web Search Activity ────────────────────────────────────
105
+ API "typescript best practices" 200 2.1s ✓
106
+ GET docs.example.com/article 200 0.8s ✓
107
+ GET blog.example.com/post 404 0.3s ✗
108
+ GET news.example.com/latest ... 1.2s ⋯
109
+ ────────────────────────────────────────────────────────────
110
+ Rate: 3/10 (resets in 42s)
111
+ ```
112
+
113
+ ### RSC Content Extraction
114
+
115
+ Next.js App Router pages embed content as RSC (React Server Components) flight data in script tags. When Readability fails, the extension parses these JSON payloads directly, reconstructing markdown with headings, tables, code blocks, and links.
116
+
117
+ ### TUI Rendering
118
+
119
+ Tool calls render with real-time progress:
120
+
121
+ ```
122
+ ┌─ search "TypeScript best practices 2025" ─────────────────────────┐
123
+ │ [████████░░] searching │
124
+ └───────────────────────────────────────────────────────────────────┘
125
+ ```
126
+
127
+ ## Commands
128
+
129
+ ### /search
130
+
131
+ Browse stored search results interactively.
132
+
133
+ ## How It Works
134
+
135
+ ```
136
+ Agent Request → Perplexity API → Synthesized Answer + Citations
137
+
138
+ [if includeContent: true]
139
+
140
+ Background Fetch (3 concurrent)
141
+
142
+ ┌────────────────┼────────────────┐
143
+ ↓ ↓ ↓
144
+ PDF HTML/Text RSC
145
+ ↓ ↓ ↓
146
+ unpdf → Readability → RSC Parser →
147
+ Save to file Markdown Markdown
148
+ ↓ ↓ ↓
149
+ └────────────────┼────────────────┘
150
+
151
+ Agent Notification (triggerTurn)
152
+ ```
153
+
154
+ ## Rate Limits
155
+
156
+ - **Perplexity API**: 10 requests/minute (enforced client-side)
157
+ - **Content Fetch**: 3 concurrent requests, 30s timeout per URL
158
+ - **Cache TTL**: 1 hour
159
+
160
+ ## Files
161
+
162
+ | File | Purpose |
163
+ |------|---------|
164
+ | `index.ts` | Extension entry, tool definitions, commands, widget |
165
+ | `perplexity.ts` | Perplexity API client, rate limiting |
166
+ | `extract.ts` | URL fetching, content extraction routing |
167
+ | `pdf-extract.ts` | PDF text extraction, saves to markdown |
168
+ | `rsc-extract.ts` | RSC flight data parser for Next.js pages |
169
+ | `storage.ts` | Session-aware result storage |
170
+ | `activity.ts` | Activity tracking for observability widget |
171
+
172
+ ## Limitations
173
+
174
+ - Content extraction works best on article-style pages
175
+ - Heavy JS sites may not extract well (no browser rendering), though Next.js App Router pages with RSC flight data are supported
176
+ - PDFs are extracted as text (no OCR for scanned documents)
177
+ - Max response size: 20MB for PDFs, 5MB for HTML
178
+ - Max inline content: 30,000 chars per URL (larger content stored for retrieval via get_search_content)
179
+ - Requires Pi restart after config file changes
package/activity.ts ADDED
@@ -0,0 +1,102 @@
1
+ // Types
2
+ export interface ActivityEntry {
3
+ id: string;
4
+ type: "api" | "fetch";
5
+ startTime: number;
6
+ endTime?: number;
7
+
8
+ // For API calls
9
+ query?: string;
10
+
11
+ // For URL fetches
12
+ url?: string;
13
+
14
+ // Result - status is number (HTTP code) or null (pending/network error)
15
+ status: number | null;
16
+ error?: string;
17
+ }
18
+
19
+ export interface RateLimitInfo {
20
+ used: number;
21
+ max: number;
22
+ oldestTimestamp: number | null;
23
+ windowMs: number;
24
+ }
25
+
26
+ export class ActivityMonitor {
27
+ private entries: ActivityEntry[] = [];
28
+ private readonly maxEntries = 10;
29
+ private listeners = new Set<() => void>();
30
+ private rateLimitInfo: RateLimitInfo = { used: 0, max: 10, oldestTimestamp: null, windowMs: 60000 };
31
+ private nextId = 1;
32
+
33
+ logStart(partial: Omit<ActivityEntry, "id" | "startTime" | "status">): string {
34
+ const id = `act-${this.nextId++}`;
35
+ const entry: ActivityEntry = {
36
+ ...partial,
37
+ id,
38
+ startTime: Date.now(),
39
+ status: null,
40
+ };
41
+ this.entries.push(entry);
42
+ if (this.entries.length > this.maxEntries) {
43
+ this.entries.shift();
44
+ }
45
+ this.notify();
46
+ return id;
47
+ }
48
+
49
+ logComplete(id: string, status: number): void {
50
+ const entry = this.entries.find((e) => e.id === id);
51
+ if (entry) {
52
+ entry.endTime = Date.now();
53
+ entry.status = status;
54
+ this.notify();
55
+ }
56
+ }
57
+
58
+ logError(id: string, error: string): void {
59
+ const entry = this.entries.find((e) => e.id === id);
60
+ if (entry) {
61
+ entry.endTime = Date.now();
62
+ entry.error = error;
63
+ this.notify();
64
+ }
65
+ }
66
+
67
+ getEntries(): readonly ActivityEntry[] {
68
+ return this.entries;
69
+ }
70
+
71
+ getRateLimitInfo(): RateLimitInfo {
72
+ return this.rateLimitInfo;
73
+ }
74
+
75
+ updateRateLimit(info: RateLimitInfo): void {
76
+ this.rateLimitInfo = info;
77
+ this.notify();
78
+ }
79
+
80
+ onUpdate(callback: () => void): () => void {
81
+ this.listeners.add(callback);
82
+ return () => this.listeners.delete(callback);
83
+ }
84
+
85
+ clear(): void {
86
+ this.entries = [];
87
+ this.rateLimitInfo = { used: 0, max: 10, oldestTimestamp: null, windowMs: 60000 };
88
+ this.notify();
89
+ }
90
+
91
+ private notify(): void {
92
+ for (const cb of this.listeners) {
93
+ try {
94
+ cb();
95
+ } catch {
96
+ /* ignore */
97
+ }
98
+ }
99
+ }
100
+ }
101
+
102
+ export const activityMonitor = new ActivityMonitor();
package/banner.png ADDED
Binary file
package/extract.ts ADDED
@@ -0,0 +1,189 @@
1
+ import { Readability } from "@mozilla/readability";
2
+ import { parseHTML } from "linkedom";
3
+ import TurndownService from "turndown";
4
+ import pLimit from "p-limit";
5
+ import { activityMonitor } from "./activity.js";
6
+ import { extractRSCContent } from "./rsc-extract.js";
7
+ import { extractPDFToMarkdown, isPDF } from "./pdf-extract.js";
8
+
9
// Per-URL fetch timeout and the cap on concurrent fetches through fetchAllContent.
const DEFAULT_TIMEOUT_MS = 30000;
const CONCURRENT_LIMIT = 3;

// Shared HTML -> Markdown converter: "#"-style headings, fenced code blocks.
const turndown = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
});

// Concurrency gate applied around extractContent in fetchAllContent.
const fetchLimit = pLimit(CONCURRENT_LIMIT);

// Result of fetching one URL. On failure, `error` is set and
// `title`/`content` are empty strings; on success `error` is null.
export interface ExtractedContent {
  url: string;
  title: string;
  content: string;
  error: string | null;
}
25
+
26
/**
 * Fetch a single URL and extract its readable content.
 *
 * Routing, in order:
 *  - invalid URL / already-aborted signal -> error result, no network I/O
 *  - non-2xx response -> error result with status text
 *  - Content-Length above the size cap (20MB PDFs, 5MB otherwise) -> error
 *  - PDF (per isPDF) -> extracted to a markdown file; inline content is only
 *    a pointer message with the output path and page/char counts
 *  - known binary content types (octet-stream, image, audio, video, zip) -> error
 *  - plain text (text/plain, or raw./gist.githubusercontent.com hosts) ->
 *    body returned verbatim, filename used as title
 *  - otherwise HTML -> Readability; if that fails, RSC flight-data fallback
 *    (Next.js App Router), then a generic extraction error
 *
 * Never throws: every failure is reported via the `error` field. Each call
 * is logged to the activity monitor (start, then complete-with-status or error).
 *
 * @param url - Absolute URL to fetch; validated with `new URL` before fetching.
 * @param signal - Optional external abort (e.g. session switch); linked to the
 *   internal controller so either the caller or the timeout can cancel the fetch.
 * @param timeoutMs - Per-request timeout, default DEFAULT_TIMEOUT_MS (30s).
 * @returns ExtractedContent; `error` is null only on success.
 */
export async function extractContent(
  url: string,
  signal?: AbortSignal,
  timeoutMs: number = DEFAULT_TIMEOUT_MS,
): Promise<ExtractedContent> {
  // Bail out before logging any activity if the caller already aborted.
  if (signal?.aborted) {
    return { url, title: "", content: "", error: "Aborted" };
  }

  // Validate URL syntax up front so we never hit the network for junk input.
  try {
    new URL(url);
  } catch {
    return { url, title: "", content: "", error: "Invalid URL" };
  }

  const activityId = activityMonitor.logStart({ type: "fetch", url });

  // Internal controller merges two abort sources: the timeout below and the
  // caller's external signal (forwarded via onAbort).
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);

  const onAbort = () => controller.abort();
  signal?.addEventListener("abort", onAbort);

  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; pi-agent/1.0)",
        Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      },
    });

    if (!response.ok) {
      activityMonitor.logComplete(activityId, response.status);
      return {
        url,
        title: "",
        content: "",
        error: `HTTP ${response.status}: ${response.statusText}`,
      };
    }

    // Check content length to avoid memory issues with huge responses.
    // NOTE(review): this only protects against servers that send
    // Content-Length; chunked responses are still read in full below.
    const contentLengthHeader = response.headers.get("content-length");
    const contentType = response.headers.get("content-type") || "";
    const isPDFContent = isPDF(url, contentType);
    const maxResponseSize = isPDFContent ? 20 * 1024 * 1024 : 5 * 1024 * 1024; // 20MB for PDFs, 5MB otherwise
    if (contentLengthHeader) {
      // NOTE(review): parseInt on a malformed header yields NaN, and
      // NaN > max is false, so such responses skip the size check.
      const contentLength = parseInt(contentLengthHeader, 10);
      if (contentLength > maxResponseSize) {
        activityMonitor.logComplete(activityId, response.status);
        return {
          url,
          title: "",
          content: "",
          error: `Response too large (${Math.round(contentLength / 1024 / 1024)}MB)`,
        };
      }
    }

    // Handle PDFs - extract and save to markdown file; only a pointer
    // message goes back inline so huge documents never enter context.
    if (isPDFContent) {
      try {
        const buffer = await response.arrayBuffer();
        const result = await extractPDFToMarkdown(buffer, url);
        activityMonitor.logComplete(activityId, response.status);
        return {
          url,
          title: result.title,
          content: `PDF extracted and saved to: ${result.outputPath}\n\nPages: ${result.pages}\nCharacters: ${result.chars}`,
          error: null,
        };
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        activityMonitor.logError(activityId, message);
        return { url, title: "", content: "", error: `PDF extraction failed: ${message}` };
      }
    }

    // Reject binary/non-text content types we cannot extract from.
    if (contentType.includes("application/octet-stream") ||
        contentType.includes("image/") ||
        contentType.includes("audio/") ||
        contentType.includes("video/") ||
        contentType.includes("application/zip")) {
      activityMonitor.logComplete(activityId, response.status);
      return {
        url,
        title: "",
        content: "",
        error: `Unsupported content type: ${contentType.split(";")[0]}`,
      };
    }

    // Return plain text directly without Readability. Hostname equality (not
    // substring) prevents lookalike domains from matching.
    const urlHostname = new URL(url).hostname;
    const isPlainText = contentType.includes("text/plain") ||
      urlHostname === "raw.githubusercontent.com" ||
      urlHostname === "gist.githubusercontent.com";

    const text = await response.text();

    if (isPlainText) {
      activityMonitor.logComplete(activityId, response.status);
      const content = text;
      // Extract filename from URL as title; fall back to the full URL.
      const urlPath = new URL(url).pathname;
      const title = urlPath.split("/").pop() || url;
      return { url, title, content, error: null };
    }

    const html = text;
    const { document } = parseHTML(html);

    // linkedom's document is structurally compatible with what Readability
    // needs, hence the cast through unknown.
    const reader = new Readability(document as unknown as Document);
    const article = reader.parse();

    if (!article) {
      // Fallback: Try extracting from RSC flight data (Next.js App Router).
      const rscResult = extractRSCContent(html);
      if (rscResult) {
        activityMonitor.logComplete(activityId, response.status);
        return { url, title: rscResult.title, content: rscResult.content, error: null };
      }

      activityMonitor.logComplete(activityId, response.status);
      return {
        url,
        title: "",
        content: "",
        error: "Could not extract readable content",
      };
    }

    const markdown = turndown.turndown(article.content);

    activityMonitor.logComplete(activityId, response.status);
    return {
      url,
      title: article.title || "",
      content: markdown,
      error: null,
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    // Aborts (timeout or external) are logged as status 0, not as errors.
    // NOTE(review): matching on message text is fragile; checking
    // err.name === "AbortError" would be more robust — behavior kept as-is.
    if (message.toLowerCase().includes("abort")) {
      activityMonitor.logComplete(activityId, 0);
    } else {
      activityMonitor.logError(activityId, message);
    }
    return { url, title: "", content: "", error: message };
  } finally {
    // Always release the timer and the external-signal listener, on every path.
    clearTimeout(timeoutId);
    signal?.removeEventListener("abort", onAbort);
  }
}
182
+
183
+ export async function fetchAllContent(
184
+ urls: string[],
185
+ signal?: AbortSignal,
186
+ timeoutMs?: number,
187
+ ): Promise<ExtractedContent[]> {
188
+ return Promise.all(urls.map((url) => fetchLimit(() => extractContent(url, signal, timeoutMs))));
189
+ }