pi-web-access 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +96 -0
- package/README.md +179 -0
- package/activity.ts +102 -0
- package/banner.png +0 -0
- package/extract.ts +189 -0
- package/index.ts +761 -0
- package/package.json +16 -0
- package/pdf-extract.ts +184 -0
- package/perplexity.ts +181 -0
- package/rsc-extract.ts +338 -0
- package/storage.ts +71 -0
package/package.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pi-web-access",
|
|
3
|
+
"version": "0.4.2",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"keywords": ["pi-package"],
|
|
6
|
+
"dependencies": {
|
|
7
|
+
"@mozilla/readability": "^0.5.0",
|
|
8
|
+
"linkedom": "^0.16.0",
|
|
9
|
+
"p-limit": "^6.1.0",
|
|
10
|
+
"turndown": "^7.2.0",
|
|
11
|
+
"unpdf": "^1.4.0"
|
|
12
|
+
},
|
|
13
|
+
"pi": {
|
|
14
|
+
"extensions": ["./index.ts"]
|
|
15
|
+
}
|
|
16
|
+
}
|
package/pdf-extract.ts
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF Content Extractor
|
|
3
|
+
*
|
|
4
|
+
* Extracts text from PDF files and saves to markdown.
|
|
5
|
+
* Uses unpdf (pdfjs-dist wrapper) for text extraction.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { getDocumentProxy } from "unpdf";
|
|
9
|
+
import { writeFile, mkdir } from "node:fs/promises";
|
|
10
|
+
import { join, basename } from "node:path";
|
|
11
|
+
import { homedir } from "node:os";
|
|
12
|
+
|
|
13
|
+
/** Summary returned after a PDF has been converted and written to disk. */
export interface PDFExtractResult {
  /** Title used for the markdown heading (PDF metadata title, else derived from the URL). */
  title: string;
  /** Total number of pages in the PDF (not the number actually extracted). */
  pages: number;
  /** Length, in UTF-16 code units, of the markdown that was written. */
  chars: number;
  /** Path of the markdown file that was written. */
  outputPath: string;
}
|
|
19
|
+
|
|
20
|
+
/** Optional overrides for PDF extraction. */
export interface PDFExtractOptions {
  /** Maximum number of pages to extract; defaults to 100. */
  maxPages?: number;
  /** Directory the markdown file is written to; defaults to `~/Downloads`. */
  outputDir?: string;
  /** Output file name; defaults to a sanitized form of the title plus `.md`. */
  filename?: string;
}
|
|
25
|
+
|
|
26
|
+
/** Page cap applied when the caller does not pass `maxPages`. */
const DEFAULT_MAX_PAGES = 100;

/** Output directory used when the caller does not pass `outputDir`: the user's Downloads folder. */
const DEFAULT_OUTPUT_DIR = join(homedir(), "Downloads");
|
|
28
|
+
|
|
29
|
+
/**
 * Extract text from a PDF buffer and save it as a markdown file.
 *
 * The file starts with a `# title` heading (PDF metadata title, else a title
 * derived from `url`) and a blockquote listing source, page count and author.
 * The text of every non-empty page follows, separated by `<!-- Page N -->`
 * comments. When the PDF has more pages than the limit, a truncation note is
 * appended.
 *
 * @param buffer - Raw PDF bytes.
 * @param url - Source URL; written into the header and used as the title fallback.
 * @param options - Page limit, output directory and output file name overrides.
 * @returns The title, the PDF's total page count, the markdown length and the output path.
 * @throws Propagates errors from PDF parsing, directory creation and file writing.
 */
export async function extractPDFToMarkdown(
  buffer: ArrayBuffer,
  url: string,
  options: PDFExtractOptions = {}
): Promise<PDFExtractResult> {
  const {
    maxPages = DEFAULT_MAX_PAGES,
    outputDir = DEFAULT_OUTPUT_DIR,
    filename
  } = options;

  // NaN would silently extract nothing and negative/fractional limits would
  // leak into the "extracted first N" text; Infinity still means "no limit".
  const pageLimit = Number.isNaN(maxPages)
    ? DEFAULT_MAX_PAGES
    : Math.max(0, Math.floor(maxPages));

  const pdf = await getDocumentProxy(new Uint8Array(buffer));
  try {
    const metadata = await pdf.getMetadata();
    const info = metadata.info as Record<string, unknown> | undefined;

    // Metadata values are not guaranteed to be strings, and a multi-line
    // title would break the single-line markdown heading.
    const rawTitle = info?.Title;
    const metaTitle =
      typeof rawTitle === "string" ? rawTitle.replace(/\s+/g, " ").trim() : "";
    const title = metaTitle || extractTitleFromURL(url);

    const rawAuthor = info?.Author;
    const author =
      typeof rawAuthor === "string" ? rawAuthor.replace(/\s+/g, " ").trim() : "";

    // Determine pages to extract
    const pagesToExtract = Math.min(pdf.numPages, pageLimit);
    const truncated = pdf.numPages > pageLimit;

    // Extract text page by page so page boundaries survive in the output.
    const pages: { pageNum: number; text: string }[] = [];
    for (let i = 1; i <= pagesToExtract; i++) {
      const page = await pdf.getPage(i);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item: unknown) => (item as { str?: string }).str || "")
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();

      // Pages without any text (e.g. pure images) are skipped entirely.
      if (pageText) {
        pages.push({ pageNum: i, text: pageText });
      }
    }

    // Header with metadata
    const lines: string[] = [
      `# ${title}`,
      "",
      `> Source: ${url}`,
      `> Pages: ${pdf.numPages}${truncated ? ` (extracted first ${pagesToExtract})` : ""}`,
    ];
    if (author) {
      lines.push(`> Author: ${author}`);
    }
    lines.push("", "---", "");

    // Content with page markers. The very first block only goes unmarked when
    // it really is page 1; otherwise leading empty pages would make the first
    // text look like it came from page 1.
    pages.forEach(({ pageNum, text }, index) => {
      if (index > 0) {
        lines.push("");
      }
      if (index > 0 || pageNum !== 1) {
        lines.push(`<!-- Page ${pageNum} -->`, "");
      }
      lines.push(text);
    });

    if (truncated) {
      lines.push(
        "",
        "---",
        "",
        `*[Truncated: Only first ${pagesToExtract} of ${pdf.numPages} pages extracted]*`
      );
    }

    const content = lines.join("\n");

    // NOTE(review): a caller-supplied `filename` is joined verbatim; confirm
    // callers never pass untrusted path segments such as "../x".
    const outputFilename = filename || sanitizeFilename(title) + ".md";
    const outputPath = join(outputDir, outputFilename);

    // Ensure output directory exists, then write the file.
    await mkdir(outputDir, { recursive: true });
    await writeFile(outputPath, content, "utf-8");

    return {
      title,
      pages: pdf.numPages,
      chars: content.length,
      outputPath,
    };
  } finally {
    // Release the parsed document; a cleanup failure must not mask the
    // result or the original error.
    try {
      await pdf.destroy();
    } catch {
      /* ignore cleanup errors */
    }
  }
}
|
|
125
|
+
|
|
126
|
+
/**
 * Derive a human-readable title from a PDF URL.
 *
 * Takes the last path segment, percent-decodes it, drops a trailing `.pdf`
 * (any letter case), and turns runs of `_`/`-` into single spaces.
 * arXiv `/pdf/<id>` and `/abs/<id>` URLs become `arxiv <id>`.
 *
 * @param url - URL of the PDF.
 * @returns The derived title, or `"document"` when the URL cannot be parsed
 *          or has no usable last segment.
 */
function extractTitleFromURL(url: string): string {
  try {
    const urlObj = new URL(url);
    const pathname = urlObj.pathname;

    let filename = basename(pathname);

    // URL paths are percent-encoded ("My%20Paper.pdf"); decode for display,
    // but keep the raw segment if it contains a malformed escape.
    try {
      filename = decodeURIComponent(filename);
    } catch {
      /* keep the undecoded segment */
    }

    // Strip the extension case-insensitively so "REPORT.PDF" is handled too.
    filename = filename.replace(/\.pdf$/i, "");

    // Handle arxiv URLs: /pdf/1706.03762 → "arxiv-1706.03762"
    // (the dash is turned into a space by the cleanup below).
    if (urlObj.hostname.includes("arxiv.org")) {
      const match = pathname.match(/\/(?:pdf|abs)\/(\d+\.\d+)/);
      if (match) {
        filename = `arxiv-${match[1]}`;
      }
    }

    // Clean up filename
    filename = filename
      .replace(/[_-]+/g, " ")
      .replace(/\s+/g, " ")
      .trim();

    return filename || "document";
  } catch {
    // `new URL` throws on unparsable input.
    return "document";
  }
}
|
|
156
|
+
|
|
157
|
+
/**
 * Turn an arbitrary string into a lowercase, dash-separated file name stem.
 *
 * Accented Latin letters are transliterated to their base letter (via NFKD
 * decomposition) instead of being dropped. Every remaining character outside
 * `a-z`, `0-9`, whitespace and `-` is removed, whitespace runs become a single
 * dash, and the result is capped at 100 characters.
 *
 * @param name - Text to sanitize (typically a document title).
 * @returns The sanitized stem, or `"document"` when nothing usable remains.
 */
function sanitizeFilename(name: string): string {
  const slug = name
    // Split "é" into "e" + combining accent, then drop the combining marks.
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, "")
    .replace(/\s+/g, "-")
    .replace(/-+/g, "-")
    .slice(0, 100)
    // Dashes are already collapsed, so at most one can remain at each end.
    .replace(/^-|-$/g, "");

  return slug || "document";
}
|
|
170
|
+
|
|
171
|
+
/**
 * Check whether a URL or a Content-Type header value indicates a PDF.
 *
 * @param url - URL of the resource; its path is checked for a `.pdf` suffix.
 * @param contentType - Optional Content-Type header value of the response.
 * @returns `true` when the content type contains `application/pdf` (any letter
 *          case) or the URL path ends in `.pdf` (any letter case); `false`
 *          otherwise, including when the URL cannot be parsed.
 */
export function isPDF(url: string, contentType?: string): boolean {
  // Media types are case-insensitive, so "Application/PDF" must match too.
  if (contentType?.toLowerCase().includes("application/pdf")) {
    return true;
  }

  try {
    // Only the path is inspected, so query strings and fragments are ignored.
    return new URL(url).pathname.toLowerCase().endsWith(".pdf");
  } catch {
    return false;
  }
}
|
package/perplexity.ts
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
2
|
+
import { homedir } from "node:os";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import { activityMonitor } from "./activity.js";
|
|
5
|
+
|
|
6
|
+
/** Chat-completions endpoint that search requests are POSTed to. */
const PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions";

/** Location of the optional JSON config file: `~/.pi/web-search.json`. */
const CONFIG_PATH = join(homedir(), ".pi", "web-search.json");

/** Client-side rate limit: at most `maxRequests` calls per `windowMs` milliseconds. */
const RATE_LIMIT = {
  maxRequests: 10,
  windowMs: 60 * 1000,
};

/** Start times (epoch ms) of the requests counted against the current window, oldest first. */
const requestTimestamps: number[] = [];
|
|
15
|
+
|
|
16
|
+
/** One cited source from a search response. */
export interface SearchResult {
  /** Title of the source, or a `Source N` placeholder when none is provided. */
  title: string;
  /** URL of the source. */
  url: string;
  /** Short excerpt of the source. */
  snippet: string;
}

/** Normalized outcome of a Perplexity search. */
export interface PerplexityResponse {
  /** Text of the answer; empty when the API response carried none. */
  answer: string;
  /** Sources cited by the answer. */
  results: SearchResult[];
}
|
|
26
|
+
|
|
27
|
+
/** Optional tuning for a search request. */
export interface SearchOptions {
  /** Maximum number of sources to return; defaults to 5 and is capped at 20. */
  numResults?: number;
  /** Restrict results to the given recency window. */
  recencyFilter?: "day" | "week" | "month" | "year";
  /** Domain names to filter by; an entry may carry a leading `-` (presumably to exclude it — confirm against the Perplexity API docs). */
  domainFilter?: string[];
  /** Abort signal forwarded to the underlying `fetch` call. */
  signal?: AbortSignal;
}
|
|
33
|
+
|
|
34
|
+
/** Shape of the JSON config file read from disk. */
interface WebSearchConfig {
  /** Perplexity API key; takes precedence over the `PERPLEXITY_API_KEY` environment variable. */
  perplexityApiKey?: string;
}
|
|
37
|
+
|
|
38
|
+
/** Config loaded on first use; `null` until `loadConfig` has run once. */
let cachedConfig: WebSearchConfig | null = null;

/**
 * Load the config file at `CONFIG_PATH` once and cache the result.
 *
 * A missing, unreadable or malformed file yields an empty config. So does a
 * file whose JSON is not an object (for example `null` or an array), or whose
 * `perplexityApiKey` is not a string.
 *
 * @returns The cached config; never `null`.
 */
function loadConfig(): WebSearchConfig {
  if (cachedConfig) return cachedConfig;

  let parsed: unknown = null;
  if (existsSync(CONFIG_PATH)) {
    try {
      parsed = JSON.parse(readFileSync(CONFIG_PATH, "utf-8"));
    } catch {
      // Best effort: an unreadable or invalid file is treated like no file.
      parsed = null;
    }
  }

  // JSON.parse can legally return null, arrays or primitives; trusting the
  // value blindly would hand callers a non-object and crash on property access.
  const candidate =
    parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)
      ? (parsed as Record<string, unknown>).perplexityApiKey
      : undefined;

  cachedConfig = typeof candidate === "string" ? { perplexityApiKey: candidate } : {};
  return cachedConfig;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Resolve the Perplexity API key.
 *
 * The key from the config file wins; the `PERPLEXITY_API_KEY` environment
 * variable is the fallback.
 *
 * @returns The API key.
 * @throws Error with setup instructions when neither source provides a key.
 */
function getApiKey(): string {
  const fromFile = loadConfig().perplexityApiKey;
  const resolved = fromFile || process.env.PERPLEXITY_API_KEY;

  if (resolved) {
    return resolved;
  }

  const instructions = [
    "Perplexity API key not found. Either:",
    ` 1. Create ${CONFIG_PATH} with { "perplexityApiKey": "your-key" }`,
    " 2. Set PERPLEXITY_API_KEY environment variable",
    "Get a key at https://perplexity.ai/settings/api",
  ];
  throw new Error(instructions.join("\n"));
}
|
|
70
|
+
|
|
71
|
+
/**
 * Enforce the client-side rate limit and record the current request.
 *
 * Timestamps older than the window are discarded from the front of
 * `requestTimestamps`; if the window is still full the call is rejected,
 * otherwise the current time is appended.
 *
 * @throws Error stating how many seconds to wait when the window is full.
 */
function checkRateLimit(): void {
  const currentTime = Date.now();
  const cutoff = currentTime - RATE_LIMIT.windowMs;

  // Drop the leading run of timestamps that have fallen out of the window.
  const firstLive = requestTimestamps.findIndex((stamp) => !(stamp < cutoff));
  requestTimestamps.splice(0, firstLive === -1 ? requestTimestamps.length : firstLive);

  if (requestTimestamps.length < RATE_LIMIT.maxRequests) {
    requestTimestamps.push(currentTime);
    return;
  }

  // The oldest surviving request determines when a slot frees up again.
  const retryInMs = requestTimestamps[0] + RATE_LIMIT.windowMs - currentTime;
  throw new Error(`Rate limited. Try again in ${Math.ceil(retryInMs / 1000)}s`);
}
|
|
86
|
+
|
|
87
|
+
/** Syntactic shape of an acceptable domain: alphanumeric start, then letters, digits, `-`, `_`, `.`, ending in a dot plus a 2+ letter TLD. */
const DOMAIN_FILTER_PATTERN = /^[a-zA-Z0-9][a-zA-Z0-9_.-]*\.[a-zA-Z]{2,}$/;

/**
 * Keep only the entries of a domain filter that look like domain names.
 *
 * Entries are trimmed and may carry a single leading `-`, which is preserved
 * in the output. Non-string entries, names with empty labels (`foo..com`) and
 * anything not matching the domain pattern are dropped silently.
 *
 * @param domains - Candidate filter entries, e.g. `["example.com", "-spam.org"]`.
 * @returns The trimmed entries that passed validation, in their original order.
 */
function validateDomainFilter(domains: string[]): string[] {
  const accepted: string[] = [];

  for (const entry of domains) {
    // Values may come from loosely typed callers; a non-string would crash
    // on `startsWith`.
    if (typeof entry !== "string") continue;

    const trimmed = entry.trim();
    const domain = trimmed.startsWith("-") ? trimmed.slice(1) : trimmed;

    // The pattern alone lets consecutive dots through, so reject them explicitly.
    if (DOMAIN_FILTER_PATTERN.test(domain) && !domain.includes("..")) {
      accepted.push(trimmed);
    }
  }

  return accepted;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Run a web search through the Perplexity chat-completions API.
 *
 * The query is sent as a single user message to the `sonar` model. The
 * answer text and up to `numResults` cited sources are returned. Each call
 * counts against the client-side rate limit and is reported to
 * `activityMonitor`.
 *
 * @param query - Search query, sent verbatim as the user message.
 * @param options - Result count, recency/domain filters and an abort signal.
 * @returns The answer text and the cited sources (snippets are always empty).
 * @throws Error when no API key is configured, the rate limit is exceeded,
 *         the API answers with a non-2xx status, or the body is not a JSON
 *         object. Errors from `fetch` (including aborts) are rethrown as-is.
 */
export async function searchWithPerplexity(query: string, options: SearchOptions = {}): Promise<PerplexityResponse> {
  // Resolve the key before anything else: a missing key should neither use up
  // a rate-limit slot nor leave a started activity entry without a matching
  // completion (every other exit path below pairs logStart with
  // logComplete/logError).
  const apiKey = getApiKey();

  checkRateLimit();

  const activityId = activityMonitor.logStart({ type: "api", query });

  activityMonitor.updateRateLimit({
    used: requestTimestamps.length,
    max: RATE_LIMIT.maxRequests,
    oldestTimestamp: requestTimestamps[0] ?? null,
    windowMs: RATE_LIMIT.windowMs,
  });

  // NaN would make the citation loop below return nothing; treat it as "not given".
  const requested = options.numResults ?? 5;
  const numResults = Number.isNaN(requested) ? 5 : Math.min(requested, 20);

  const requestBody: Record<string, unknown> = {
    model: "sonar",
    messages: [{ role: "user", content: query }],
    max_tokens: 1024,
    return_related_questions: false,
  };

  if (options.recencyFilter) {
    requestBody.search_recency_filter = options.recencyFilter;
  }

  if (options.domainFilter && options.domainFilter.length > 0) {
    const validated = validateDomainFilter(options.domainFilter);
    // Only send the filter when at least one entry survived validation.
    if (validated.length > 0) {
      requestBody.search_domain_filter = validated;
    }
  }

  let response: Response;
  try {
    response = await fetch(PERPLEXITY_API_URL, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(requestBody),
      signal: options.signal,
    });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    // Abort reasons can be arbitrary values, so do not rely on the message
    // text alone to recognize a cancellation.
    const aborted =
      options.signal?.aborted === true ||
      (err instanceof Error && err.name === "AbortError") ||
      message.toLowerCase().includes("abort");
    if (aborted) {
      // A cancelled request is recorded as completed with status 0, not as an error.
      activityMonitor.logComplete(activityId, 0);
    } else {
      activityMonitor.logError(activityId, message);
    }
    throw err;
  }

  if (!response.ok) {
    activityMonitor.logComplete(activityId, response.status);
    // If the error body cannot be read, still report the HTTP status.
    const errorText = await response.text().catch(() => "");
    throw new Error(`Perplexity API error ${response.status}: ${errorText}`);
  }

  let data: unknown;
  try {
    data = await response.json();
  } catch {
    activityMonitor.logComplete(activityId, response.status);
    throw new Error("Perplexity API returned invalid JSON");
  }

  // Valid JSON is not necessarily an object (e.g. `null`); reject it here
  // instead of failing with a TypeError on property access.
  if (data === null || typeof data !== "object") {
    activityMonitor.logComplete(activityId, response.status);
    throw new Error("Perplexity API returned an unexpected response");
  }
  const payload = data as Record<string, unknown>;

  const firstChoice = Array.isArray(payload.choices)
    ? (payload.choices[0] as { message?: { content?: unknown } } | null | undefined)
    : undefined;
  const content = firstChoice?.message?.content;
  const answer = typeof content === "string" ? content : "";

  const citations = Array.isArray(payload.citations) ? payload.citations : [];

  // Citations may be plain URL strings or objects with `url` (and optionally
  // `title`); anything else is skipped.
  const results: SearchResult[] = [];
  for (let i = 0; i < Math.min(citations.length, numResults); i++) {
    const citation = citations[i];
    if (typeof citation === "string") {
      results.push({ title: `Source ${i + 1}`, url: citation, snippet: "" });
    } else if (citation && typeof citation === "object" && typeof citation.url === "string") {
      results.push({
        title: typeof citation.title === "string" && citation.title ? citation.title : `Source ${i + 1}`,
        url: citation.url,
        snippet: "",
      });
    }
  }

  activityMonitor.logComplete(activityId, response.status);
  return { answer, results };
}
|