pi-web-access 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +96 -0
- package/README.md +179 -0
- package/activity.ts +102 -0
- package/banner.png +0 -0
- package/extract.ts +189 -0
- package/index.ts +761 -0
- package/package.json +16 -0
- package/pdf-extract.ts +184 -0
- package/perplexity.ts +181 -0
- package/rsc-extract.ts +338 -0
- package/storage.ts +71 -0
package/CHANGELOG.md
ADDED
@@ -0,0 +1,96 @@
# Pi Web Access - Changelog

All notable changes to this project will be documented in this file.

## [Unreleased]

## [0.4.2] - 2026-01-27

### Fixed

- Single-URL fetches now store content for retrieval via `get_search_content` (previously only multi-URL)
- Corrected `get_search_content` usage syntax in `fetch_content` help messages

### Changed

- Increased inline content limit from 10K to 30K chars (larger content is truncated but fully retrievable)

### Added

- Banner image for README

## [0.4.1] - 2026-01-26

### Changed

- Added `pi` manifest to package.json for pi v0.50.0 package system compliance
- Added `pi-package` keyword for npm discoverability

## [0.4.0] - 2026-01-19

### Added

- PDF extraction via `unpdf` - fetches PDFs from URLs and saves them as markdown to `~/Downloads/`
  - Extracts text, metadata (title, author), and page count
  - Supports PDFs up to 20MB (vs 5MB for HTML)
  - Handles arxiv URLs with smart title fallback

### Fixed

- Plain text URL detection now uses a hostname check instead of a substring match

## [0.3.0] - 2026-01-19

### Added

- RSC (React Server Components) content extraction for Next.js App Router pages
  - Parses flight data from `<script>self.__next_f.push([...])</script>` tags
  - Reconstructs markdown with headings, tables, code blocks, links
  - Handles chunk references and nested components
  - Falls back to RSC extraction when Readability fails
- Content-type validation rejects binary files (images, PDFs, audio, video, zip)
- 5MB response size limit (checked via Content-Length header) to prevent memory issues

### Fixed

- `fetch_content` now handles plain text URLs (raw.githubusercontent.com, gist.githubusercontent.com, any text/plain response) instead of failing with "Could not extract readable content"

## [0.2.0] - 2026-01-11

### Added

- Activity monitor widget (`Ctrl+Shift+O`) showing live request/response activity
  - Displays last 10 API calls and URL fetches with status codes and timing
  - Shows rate limit usage and reset countdown
  - Live updates as requests complete
  - Auto-clears on session switch

### Changed

- Refactored activity tracking into a dedicated `activity.ts` module

## [0.1.0] - 2026-01-06

Initial release. Designed for pi v0.37.3.

### Added

- `web_search` tool - search via Perplexity AI with synthesized answers and citations
  - Single or batch queries (parallel execution)
  - Recency filter (day/week/month/year)
  - Domain filter (include or exclude)
  - Optional async content fetching with agent notification
- `fetch_content` tool - fetch and extract readable content from URLs
  - A single URL returns content directly
  - Multiple URLs are stored for retrieval via `get_search_content`
  - Concurrent fetching (3 max) with 30s timeout
- `get_search_content` tool - retrieve stored search results or fetched content
  - Access by response ID, URL, query, or index
- `/search` command - interactive browser for stored results
- TUI rendering with progress bars, URL lists, and expandable previews
- Session-aware storage with 1-hour TTL
- Rate limiting (10 req/min for Perplexity API)
- Config file support (`~/.pi/web-search.json`)
- Content extraction via Readability + Turndown (max 10k chars)
- Proper session isolation - pending fetches abort on session switch
- URL validation before fetch attempts
- Defensive JSON parsing for API responses
package/README.md
ADDED
@@ -0,0 +1,179 @@
<p>
<img src="banner.png" alt="pi-web-access" width="1100">
</p>

# Pi Web Access

An extension for the [Pi coding agent](https://github.com/badlogic/pi-mono/) that gives Pi web capabilities: search via Perplexity AI, fetch and extract content from URLs, and read PDFs.

```typescript
web_search({ query: "TypeScript best practices 2025" })
fetch_content({ url: "https://docs.example.com/guide" })
```

## Install

```bash
# Clone to extensions directory
git clone https://github.com/nicobailon/pi-web-access ~/.pi/agent/extensions/pi-web-access
cd ~/.pi/agent/extensions/pi-web-access
npm install
```

Add your Perplexity API key:

```bash
# Option 1: Environment variable
export PERPLEXITY_API_KEY="pplx-..."

# Option 2: Config file
echo '{"perplexityApiKey": "pplx-..."}' > ~/.pi/web-search.json
```

Get a key at https://perplexity.ai/settings/api

**Requires:** Pi v0.37.3+

## Tools

### web_search

Search the web via Perplexity AI. Returns a synthesized answer with source citations.

```typescript
// Single query
web_search({ query: "rust async programming" })

// Multiple queries (parallel)
web_search({ queries: ["query 1", "query 2"] })

// With options
web_search({
  query: "latest news",
  numResults: 10,              // Default: 5, max: 20
  recencyFilter: "week",       // day, week, month, year
  domainFilter: ["github.com"] // Prefix with - to exclude
})

// Fetch full page content (async)
web_search({ query: "...", includeContent: true })
```

When `includeContent: true`, sources are fetched in the background and the agent receives a notification once they are ready.
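A plausible round trip looks like this (the `responseId` is invented for illustration):

```typescript
// Kick off the search; citation sources are fetched in the background
web_search({ query: "vector databases", includeContent: true })
// → synthesized answer + citations, e.g. responseId "abc123"

// After the "content ready" notification, pull a fetched source
get_search_content({ responseId: "abc123", urlIndex: 0 })
```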
### fetch_content

Fetch URL(s) and extract readable content as markdown.

```typescript
// Single URL - returns content directly (also stored for retrieval)
fetch_content({ url: "https://example.com/article" })

// Multiple URLs - returns summary (content stored for retrieval)
fetch_content({ urls: ["url1", "url2", "url3"] })

// PDFs - extracted and saved to ~/Downloads/
fetch_content({ url: "https://arxiv.org/pdf/1706.03762" })
// → "PDF extracted and saved to: ~/Downloads/arxiv-170603762.md"
```

**PDF handling:** When fetching a PDF URL, the extension extracts text and saves it as a markdown file in `~/Downloads/`. The agent can then use `read` to access specific sections without loading 200K+ chars into context.
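A sketch of that workflow (illustrative only; the exact parameters of Pi's built-in `read` tool are an assumption here):

```typescript
fetch_content({ url: "https://arxiv.org/pdf/1706.03762" })
// → "PDF extracted and saved to: ~/Downloads/arxiv-170603762.md"

// Then read just the sections needed, instead of the whole 200K+ chars
read({ path: "~/Downloads/arxiv-170603762.md" })
```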
### get_search_content

Retrieve stored content from previous searches or fetches.

```typescript
// By response ID (from web_search or fetch_content)
get_search_content({ responseId: "abc123", urlIndex: 0 })

// By URL
get_search_content({ responseId: "abc123", url: "https://..." })

// By query (for search results)
get_search_content({ responseId: "abc123", query: "original query" })
```

## Features

### Activity Monitor (Ctrl+Shift+O)

Toggle live request/response activity:

```
─── Web Search Activity ────────────────────────────────────
API  "typescript best practices"   200  2.1s  ✓
GET  docs.example.com/article      200  0.8s  ✓
GET  blog.example.com/post         404  0.3s  ✗
GET  news.example.com/latest       ...  1.2s  ⋯
────────────────────────────────────────────────────────────
Rate: 3/10 (resets in 42s)
```

### RSC Content Extraction

Next.js App Router pages embed content as RSC (React Server Components) flight data in script tags. When Readability fails, the extension parses these JSON payloads directly, reconstructing markdown with headings, tables, code blocks, and links.
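The collection step looks roughly like this (a simplified sketch, not the exact code in `rsc-extract.ts`):

```typescript
// Gather the string payloads that Next.js streams via self.__next_f.push([1, "..."])
function collectFlightPayloads(html: string): string[] {
  const payloads: string[] = [];
  const pushRe = /self\.__next_f\.push\(\[1,\s*("(?:[^"\\]|\\.)*")\]\)/g;
  for (const match of html.matchAll(pushRe)) {
    try {
      // Each payload is a JSON string literal; JSON.parse unescapes it
      payloads.push(JSON.parse(match[1]) as string);
    } catch {
      // Ignore malformed chunks
    }
  }
  return payloads;
}
```

The concatenated payloads are then walked for headings, tables, code blocks, and links, and reassembled as markdown.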
### TUI Rendering

Tool calls render with real-time progress:

```
┌─ search "TypeScript best practices 2025" ─────────────────────────┐
│ [████████░░] searching                                            │
└───────────────────────────────────────────────────────────────────┘
```

## Commands

### /search

Browse stored search results interactively.

## How It Works

```
Agent Request → Perplexity API → Synthesized Answer + Citations
                                       ↓
                           [if includeContent: true]
                                       ↓
                        Background Fetch (3 concurrent)
                                       ↓
                      ┌────────────────┼────────────────┐
                      ↓                ↓                ↓
                     PDF           HTML/Text           RSC
                      ↓                ↓                ↓
                    unpdf         Readability      RSC Parser
                      ↓                ↓                ↓
                Save to file       Markdown         Markdown
                      ↓                ↓                ↓
                      └────────────────┼────────────────┘
                                       ↓
                       Agent Notification (triggerTurn)
```

## Rate Limits

- **Perplexity API**: 10 requests/minute (enforced client-side; sketched below)
- **Content Fetch**: 3 concurrent requests, 30s timeout per URL
- **Cache TTL**: 1 hour
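A minimal sketch of a client-side sliding window, assuming the 10-per-60s policy above (`perplexity.ts` may implement it differently):

```typescript
class SlidingWindowLimiter {
  private timestamps: number[] = [];

  constructor(private readonly max = 10, private readonly windowMs = 60_000) {}

  /** True if a request may proceed now; records it if so. */
  tryAcquire(now = Date.now()): boolean {
    // Drop timestamps that have left the window
    this.timestamps = this.timestamps.filter((t) => now - t < this.windowMs);
    if (this.timestamps.length >= this.max) return false;
    this.timestamps.push(now);
    return true;
  }

  /** Milliseconds until a slot frees up (0 if one is free now). */
  resetInMs(now = Date.now()): number {
    const active = this.timestamps.filter((t) => now - t < this.windowMs);
    if (active.length < this.max) return 0;
    return this.windowMs - (now - Math.min(...active));
  }
}
```

This is the same state (`used`, `max`, `oldestTimestamp`, `windowMs`) that `RateLimitInfo` in `activity.ts` carries to the widget's `Rate: 3/10 (resets in 42s)` line.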
## Files

| File | Purpose |
|------|---------|
| `index.ts` | Extension entry, tool definitions, commands, widget |
| `perplexity.ts` | Perplexity API client, rate limiting |
| `extract.ts` | URL fetching, content extraction routing |
| `pdf-extract.ts` | PDF text extraction, saves to markdown |
| `rsc-extract.ts` | RSC flight data parser for Next.js pages |
| `storage.ts` | Session-aware result storage |
| `activity.ts` | Activity tracking for observability widget |

## Limitations

- Content extraction works best on article-style pages
- Heavy JS sites may not extract well (no browser rendering), though Next.js App Router pages with RSC flight data are supported
- PDFs are extracted as text (no OCR for scanned documents)
- Max response size: 20MB for PDFs, 5MB for HTML
- Max inline content: 30,000 chars per URL (larger content stored for retrieval via `get_search_content`)
- Requires a Pi restart after config file changes
package/activity.ts
ADDED
@@ -0,0 +1,102 @@
// Types
export interface ActivityEntry {
  id: string;
  type: "api" | "fetch";
  startTime: number;
  endTime?: number;

  // For API calls
  query?: string;

  // For URL fetches
  url?: string;

  // Result - status is number (HTTP code) or null (pending/network error)
  status: number | null;
  error?: string;
}

export interface RateLimitInfo {
  used: number;
  max: number;
  oldestTimestamp: number | null;
  windowMs: number;
}

export class ActivityMonitor {
  private entries: ActivityEntry[] = [];
  private readonly maxEntries = 10;
  private listeners = new Set<() => void>();
  private rateLimitInfo: RateLimitInfo = { used: 0, max: 10, oldestTimestamp: null, windowMs: 60000 };
  private nextId = 1;

  logStart(partial: Omit<ActivityEntry, "id" | "startTime" | "status">): string {
    const id = `act-${this.nextId++}`;
    const entry: ActivityEntry = {
      ...partial,
      id,
      startTime: Date.now(),
      status: null,
    };
    this.entries.push(entry);
    if (this.entries.length > this.maxEntries) {
      this.entries.shift();
    }
    this.notify();
    return id;
  }

  logComplete(id: string, status: number): void {
    const entry = this.entries.find((e) => e.id === id);
    if (entry) {
      entry.endTime = Date.now();
      entry.status = status;
      this.notify();
    }
  }

  logError(id: string, error: string): void {
    const entry = this.entries.find((e) => e.id === id);
    if (entry) {
      entry.endTime = Date.now();
      entry.error = error;
      this.notify();
    }
  }

  getEntries(): readonly ActivityEntry[] {
    return this.entries;
  }

  getRateLimitInfo(): RateLimitInfo {
    return this.rateLimitInfo;
  }

  updateRateLimit(info: RateLimitInfo): void {
    this.rateLimitInfo = info;
    this.notify();
  }

  onUpdate(callback: () => void): () => void {
    this.listeners.add(callback);
    return () => this.listeners.delete(callback);
  }

  clear(): void {
    this.entries = [];
    this.rateLimitInfo = { used: 0, max: 10, oldestTimestamp: null, windowMs: 60000 };
    this.notify();
  }

  private notify(): void {
    for (const cb of this.listeners) {
      try {
        cb();
      } catch {
        /* ignore */
      }
    }
  }
}

export const activityMonitor = new ActivityMonitor();
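Illustrative usage of the monitor's API (the real call sites live in `extract.ts` and the API client):

```typescript
const id = activityMonitor.logStart({ type: "fetch", url: "https://example.com" });
const unsubscribe = activityMonitor.onUpdate(() => {
  // Re-render the widget from getEntries() / getRateLimitInfo()
});
activityMonitor.logComplete(id, 200); // or logError(id, "network error")
unsubscribe();
```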
package/banner.png
ADDED
Binary file
package/extract.ts
ADDED
@@ -0,0 +1,189 @@
import { Readability } from "@mozilla/readability";
import { parseHTML } from "linkedom";
import TurndownService from "turndown";
import pLimit from "p-limit";
import { activityMonitor } from "./activity.js";
import { extractRSCContent } from "./rsc-extract.js";
import { extractPDFToMarkdown, isPDF } from "./pdf-extract.js";

const DEFAULT_TIMEOUT_MS = 30000;
const CONCURRENT_LIMIT = 3;

const turndown = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
});

const fetchLimit = pLimit(CONCURRENT_LIMIT);

export interface ExtractedContent {
  url: string;
  title: string;
  content: string;
  error: string | null;
}

export async function extractContent(
  url: string,
  signal?: AbortSignal,
  timeoutMs: number = DEFAULT_TIMEOUT_MS,
): Promise<ExtractedContent> {
  if (signal?.aborted) {
    return { url, title: "", content: "", error: "Aborted" };
  }

  try {
    new URL(url);
  } catch {
    return { url, title: "", content: "", error: "Invalid URL" };
  }

  const activityId = activityMonitor.logStart({ type: "fetch", url });

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);

  const onAbort = () => controller.abort();
  signal?.addEventListener("abort", onAbort);

  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; pi-agent/1.0)",
        Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      },
    });

    if (!response.ok) {
      activityMonitor.logComplete(activityId, response.status);
      return {
        url,
        title: "",
        content: "",
        error: `HTTP ${response.status}: ${response.statusText}`,
      };
    }

    // Check content length to avoid memory issues with huge responses
    const contentLengthHeader = response.headers.get("content-length");
    const contentType = response.headers.get("content-type") || "";
    const isPDFContent = isPDF(url, contentType);
    const maxResponseSize = isPDFContent ? 20 * 1024 * 1024 : 5 * 1024 * 1024; // 20MB for PDFs, 5MB otherwise
    if (contentLengthHeader) {
      const contentLength = parseInt(contentLengthHeader, 10);
      if (contentLength > maxResponseSize) {
        activityMonitor.logComplete(activityId, response.status);
        return {
          url,
          title: "",
          content: "",
          error: `Response too large (${Math.round(contentLength / 1024 / 1024)}MB)`,
        };
      }
    }

    // Handle PDFs - extract and save to markdown file
    if (isPDFContent) {
      try {
        const buffer = await response.arrayBuffer();
        const result = await extractPDFToMarkdown(buffer, url);
        activityMonitor.logComplete(activityId, response.status);
        return {
          url,
          title: result.title,
          content: `PDF extracted and saved to: ${result.outputPath}\n\nPages: ${result.pages}\nCharacters: ${result.chars}`,
          error: null,
        };
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        activityMonitor.logError(activityId, message);
        return { url, title: "", content: "", error: `PDF extraction failed: ${message}` };
      }
    }

    // Reject binary/non-text content types
    if (contentType.includes("application/octet-stream") ||
        contentType.includes("image/") ||
        contentType.includes("audio/") ||
        contentType.includes("video/") ||
        contentType.includes("application/zip")) {
      activityMonitor.logComplete(activityId, response.status);
      return {
        url,
        title: "",
        content: "",
        error: `Unsupported content type: ${contentType.split(";")[0]}`,
      };
    }

    // Return plain text directly without Readability
    const urlHostname = new URL(url).hostname;
    const isPlainText = contentType.includes("text/plain") ||
        urlHostname === "raw.githubusercontent.com" ||
        urlHostname === "gist.githubusercontent.com";

    const text = await response.text();

    if (isPlainText) {
      activityMonitor.logComplete(activityId, response.status);
      const content = text;
      // Extract filename from URL as title
      const urlPath = new URL(url).pathname;
      const title = urlPath.split("/").pop() || url;
      return { url, title, content, error: null };
    }

    const html = text;
    const { document } = parseHTML(html);

    const reader = new Readability(document as unknown as Document);
    const article = reader.parse();

    if (!article) {
      // Fallback: Try extracting from RSC flight data (Next.js App Router)
      const rscResult = extractRSCContent(html);
      if (rscResult) {
        activityMonitor.logComplete(activityId, response.status);
        return { url, title: rscResult.title, content: rscResult.content, error: null };
      }

      activityMonitor.logComplete(activityId, response.status);
      return {
        url,
        title: "",
        content: "",
        error: "Could not extract readable content",
      };
    }

    const markdown = turndown.turndown(article.content);

    activityMonitor.logComplete(activityId, response.status);
    return {
      url,
      title: article.title || "",
      content: markdown,
      error: null,
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    if (message.toLowerCase().includes("abort")) {
      activityMonitor.logComplete(activityId, 0);
    } else {
      activityMonitor.logError(activityId, message);
    }
    return { url, title: "", content: "", error: message };
  } finally {
    clearTimeout(timeoutId);
    signal?.removeEventListener("abort", onAbort);
  }
}

export async function fetchAllContent(
  urls: string[],
  signal?: AbortSignal,
  timeoutMs?: number,
): Promise<ExtractedContent[]> {
  return Promise.all(urls.map((url) => fetchLimit(() => extractContent(url, signal, timeoutMs))));
}
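A quick illustration of how these exports compose (not taken from the package's own call sites; assumes an ES-module context for top-level `await`):

```typescript
import { fetchAllContent } from "./extract.js";

// One controller per session: aborting it cancels any fetches still in flight,
// which is how pending fetches get dropped on session switch.
const controller = new AbortController();

// At most 3 fetches run concurrently (p-limit); each gets the 30s default timeout.
const results = await fetchAllContent(
  ["https://example.com/a", "https://example.com/b", "https://example.com/c"],
  controller.signal,
);

for (const r of results) {
  if (r.error) console.error(`${r.url}: ${r.error}`);
  else console.log(`${r.title} (${r.content.length} chars)`);
}
```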
|