pi-web-access 0.4.5 → 0.5.0

package/CHANGELOG.md CHANGED
@@ -4,6 +4,24 @@ All notable changes to this project will be documented in this file.
  
  ## [Unreleased]
  
+ ## [0.5.0] - 2026-02-01
+ 
+ ### Added
+ - GitHub repository clone extraction for `fetch_content` -- detects GitHub code URLs, clones repos to `/tmp/pi-github-repos/`, and returns actual file contents plus local path for further exploration with `read` and `bash`
+ - Lightweight API fallback for oversized repos (>350MB) and commit SHA URLs via `gh api`
+ - Clone cache with concurrent request deduplication (second request awaits first's clone)
+ - `forceClone` parameter on `fetch_content` to override the size threshold
+ - Configurable via `~/.pi/web-search.json` under `githubClone` key (`enabled`, `maxRepoSizeMB`, `cloneTimeoutSeconds`, `clonePath`)
+ - Falls back to `git clone` when `gh` CLI is unavailable (public repos only)
+ - README: GitHub clone documentation with config, flow diagram, and limitations
+ 
+ ### Changed
+ - Refactored `extractContent`/`fetchAllContent` signatures from positional `timeoutMs` to `ExtractOptions` object
+ - Blob/tree fetch titles now include file path (e.g. `owner/repo - src/index.ts`) for better disambiguation in multi-URL results and TUI
+ 
+ ### Fixed
+ - README: Activity monitor keybinding corrected from `Ctrl+Shift+O` to `Ctrl+Shift+W`
+ 
  ## [0.4.5] - 2026-02-01
  
  ### Changed
package/README.md CHANGED
@@ -4,7 +4,7 @@
  
  # Pi Web Access
  
- An extension for [Pi coding agent](https://github.com/badlogic/pi-mono/) that gives Pi web capabilities: search via Perplexity AI, fetch and extract content from URLs, and read PDFs.
+ An extension for [Pi coding agent](https://github.com/badlogic/pi-mono/) that gives Pi web capabilities: search via Perplexity AI, fetch and extract content from URLs, clone GitHub repos for local exploration, and read PDFs.
  
  ```typescript
  web_search({ query: "TypeScript best practices 2025" })
@@ -74,6 +74,25 @@ fetch_content({ url: "https://arxiv.org/pdf/1706.03762" })
  // → "PDF extracted and saved to: ~/Downloads/arxiv-170603762.md"
  ```
  
+ **GitHub repos:** GitHub code URLs are automatically detected and cloned locally instead of scraping HTML. The agent gets actual file contents and a local path to explore with `read` and `bash`.
+ 
+ ```typescript
+ // Clone a repo - returns structure + README
+ fetch_content({ url: "https://github.com/owner/repo" })
+ // → "Repository cloned to: /tmp/pi-github-repos/owner/repo"
+ 
+ // Specific file - returns file contents
+ fetch_content({ url: "https://github.com/owner/repo/blob/main/src/index.ts" })
+ 
+ // Directory - returns listing
+ fetch_content({ url: "https://github.com/owner/repo/tree/main/src" })
+ 
+ // Force-clone a large repo that exceeds the size threshold
+ fetch_content({ url: "https://github.com/big/repo", forceClone: true })
+ ```
+ 
+ Repos over 350MB get a lightweight API-based view instead of a full clone. Commit SHA URLs are also handled via the API. Clones are cached for the session -- multiple files from the same repo share one clone, but clones are wiped on session change/shutdown and re-cloned as needed.
+ 
  **PDF handling:** When fetching a PDF URL, the extension extracts text and saves it as a markdown file in `~/Downloads/`. The agent can then use `read` to access specific sections without loading 200K+ chars into context.
  
  ### get_search_content
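An aside on the caching behavior described above: concurrent fetches into the same repository deduplicate to a single clone; with hypothetical URLs, the second call awaits the first's clone promise and reads from the shared checkout:

```typescript
// One clone, two answers: both URLs map to the cache key "owner/repo@main"
fetch_content({ url: "https://github.com/owner/repo/blob/main/package.json" })
fetch_content({ url: "https://github.com/owner/repo/blob/main/src/index.ts" })
// Both results point into /tmp/pi-github-repos/owner/repo@main
```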
@@ -93,7 +112,7 @@ get_search_content({ responseId: "abc123", query: "original query" })
  
  ## Features
  
- ### Activity Monitor (Ctrl+Shift+O)
+ ### Activity Monitor (Ctrl+Shift+W)
  
  Toggle live request/response activity:
  
@@ -129,25 +148,67 @@ Browse stored search results interactively.
  
  ## How It Works
  
+ ### fetch_content routing
+ 
+ ```
+ fetch_content(url)
+     │
+     ├── github.com code URL? ──→ Clone repo (gh/git --depth 1)
+     │                                    │
+     │                            ┌───────┼───────┐
+     │                            ↓       ↓       ↓
+     │                           root    tree    blob
+     │                            ↓       ↓       ↓
+     │                          tree +   dir     file
+     │                          README  listing  contents
+     │                            │       │       │
+     │                            └───────┼───────┘
+     │                                    ↓
+     │                         Return content + local
+     │                         path for read/bash
+     │
+     ├── PDF? ──→ unpdf → Save to ~/Downloads/
+     │
+     ├── Plain text? ──→ Return directly
+     │
+     └── HTML ──→ Readability → Markdown
+                      │
+                  [if fails]
+                      ↓
+                RSC Parser → Markdown
+ ```
+ 
+ ### web_search with includeContent
+ 
  ```
  Agent Request → Perplexity API → Synthesized Answer + Citations
  
      [if includeContent: true]
  
  Background Fetch (3 concurrent)
- 
-     ┌────────────────┼────────────────┐
-     ↓                ↓                ↓
-    PDF           HTML/Text          RSC
-     ↓                ↓                ↓
-   unpdf →      Readability →    RSC Parser →
-   Save to file   Markdown        Markdown
-     ↓                ↓                ↓
-     └────────────────┼────────────────┘
+     (uses same routing as above)
  
  Agent Notification (triggerTurn)
  ```
  
+ ## Configuration
+ 
+ All config lives in `~/.pi/web-search.json`:
+ 
+ ```json
+ {
+   "perplexityApiKey": "pplx-...",
+   "githubClone": {
+     "enabled": true,
+     "maxRepoSizeMB": 350,
+     "cloneTimeoutSeconds": 30,
+     "clonePath": "/tmp/pi-github-repos"
+   }
+ }
+ ```
+ 
+ All `githubClone` fields are optional with the defaults shown above. Set `"enabled": false` to disable GitHub cloning entirely and fall through to normal HTML extraction.
+ 
  ## Rate Limits
  
  - **Perplexity API**: 10 requests/minute (enforced client-side)
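An aside on the configuration block above: since every `githubClone` field is optional, opting out of cloning is a one-key change. A minimal example (omitted fields keep the defaults shown above):

```json
{
  "githubClone": { "enabled": false }
}
```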
@@ -161,6 +222,8 @@ Agent Request → Perplexity API → Synthesized Answer + Citations
  | `index.ts` | Extension entry, tool definitions, commands, widget |
  | `perplexity.ts` | Perplexity API client, rate limiting |
  | `extract.ts` | URL fetching, content extraction routing |
+ | `github-extract.ts` | GitHub URL parser, clone cache, content generation |
+ | `github-api.ts` | GitHub API fallback for oversized repos and commit SHAs |
  | `pdf-extract.ts` | PDF text extraction, saves to markdown |
  | `rsc-extract.ts` | RSC flight data parser for Next.js pages |
  | `storage.ts` | Session-aware result storage |
@@ -173,4 +236,7 @@ Agent Request → Perplexity API → Synthesized Answer + Citations
  - PDFs are extracted as text (no OCR for scanned documents)
  - Max response size: 20MB for PDFs, 5MB for HTML
  - Max inline content: 30,000 chars per URL (larger content stored for retrieval via get_search_content)
+ - GitHub cloning requires `gh` CLI for private repos (public repos fall back to `git clone`)
+ - GitHub branch names with slashes (e.g. `feature/foo`) may resolve the wrong file path; the clone still succeeds and the agent can navigate manually
+ - Non-code GitHub URLs (issues, PRs, wiki, etc.) fall through to normal Readability extraction
  - Requires Pi restart after config file changes
package/extract.ts CHANGED
@@ -5,6 +5,7 @@ import pLimit from "p-limit";
  import { activityMonitor } from "./activity.js";
  import { extractRSCContent } from "./rsc-extract.js";
  import { extractPDFToMarkdown, isPDF } from "./pdf-extract.js";
+ import { extractGitHub } from "./github-extract.js";
  
  const DEFAULT_TIMEOUT_MS = 30000;
  const CONCURRENT_LIMIT = 3;
@@ -23,11 +24,17 @@ export interface ExtractedContent {
    error: string | null;
  }
  
+ export interface ExtractOptions {
+   timeoutMs?: number;
+   forceClone?: boolean;
+ }
+ 
  export async function extractContent(
    url: string,
    signal?: AbortSignal,
-   timeoutMs: number = DEFAULT_TIMEOUT_MS,
+   options?: ExtractOptions,
  ): Promise<ExtractedContent> {
+   const timeoutMs = options?.timeoutMs ?? DEFAULT_TIMEOUT_MS;
    if (signal?.aborted) {
      return { url, title: "", content: "", error: "Aborted" };
    }
@@ -38,6 +45,13 @@ export async function extractContent(
      return { url, title: "", content: "", error: "Invalid URL" };
    }
  
+   try {
+     const ghResult = await extractGitHub(url, signal, options?.forceClone);
+     if (ghResult) return ghResult;
+   } catch {
+     // GitHub extraction failed unexpectedly, fall through to normal HTTP pipeline
+   }
+ 
    const activityId = activityMonitor.logStart({ type: "fetch", url });
  
    const controller = new AbortController();
@@ -127,22 +141,19 @@ export async function extractContent(
  
    if (isPlainText) {
      activityMonitor.logComplete(activityId, response.status);
-     const content = text;
-     // Extract filename from URL as title
      const urlPath = new URL(url).pathname;
      const title = urlPath.split("/").pop() || url;
-     return { url, title, content, error: null };
+     return { url, title, content: text, error: null };
    }
  
-   const html = text;
-   const { document } = parseHTML(html);
+   const { document } = parseHTML(text);
  
    const reader = new Readability(document as unknown as Document);
    const article = reader.parse();
  
    if (!article) {
      // Fallback: Try extracting from RSC flight data (Next.js App Router)
-     const rscResult = extractRSCContent(html);
+     const rscResult = extractRSCContent(text);
      if (rscResult) {
        activityMonitor.logComplete(activityId, response.status);
        return { url, title: rscResult.title, content: rscResult.content, error: null };
@@ -183,7 +194,7 @@ export async function extractContent(
  export async function fetchAllContent(
    urls: string[],
    signal?: AbortSignal,
-   timeoutMs?: number,
+   options?: ExtractOptions,
  ): Promise<ExtractedContent[]> {
-   return Promise.all(urls.map((url) => fetchLimit(() => extractContent(url, signal, timeoutMs))));
+   return Promise.all(urls.map((url) => fetchLimit(() => extractContent(url, signal, options))));
  }
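An aside for downstream callers of `extractContent`/`fetchAllContent`: the 0.5.0 signature change shown above amounts to the following migration (hypothetical call sites; `timeoutMs` falls back to `DEFAULT_TIMEOUT_MS` when omitted):

```typescript
// 0.4.5: timeout passed positionally
const before = await extractContent(url, signal, 15000);

// 0.5.0: options object; each field is optional
const after = await extractContent(url, signal, { timeoutMs: 15000, forceClone: false });
```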
package/github-api.ts ADDED
@@ -0,0 +1,195 @@
+ import { execFile } from "node:child_process";
+ import type { ExtractedContent } from "./extract.js";
+ import type { GitHubUrlInfo } from "./github-extract.js";
+ 
+ const MAX_TREE_ENTRIES = 200;
+ const MAX_INLINE_FILE_CHARS = 100_000;
+ 
+ let ghAvailable: boolean | null = null;
+ let ghHintShown = false;
+ 
+ export async function checkGhAvailable(): Promise<boolean> {
+   if (ghAvailable !== null) return ghAvailable;
+ 
+   return new Promise((resolve) => {
+     execFile("gh", ["--version"], { timeout: 5000 }, (err) => {
+       ghAvailable = !err;
+       resolve(ghAvailable);
+     });
+   });
+ }
+ 
+ export function showGhHint(): void {
+   if (!ghHintShown) {
+     ghHintShown = true;
+     console.error("[pi-web-access] Install `gh` CLI for better GitHub repo access including private repos.");
+   }
+ }
+ 
+ export async function checkRepoSize(owner: string, repo: string): Promise<number | null> {
+   if (!(await checkGhAvailable())) return null;
+ 
+   return new Promise((resolve) => {
+     execFile("gh", ["api", `repos/${owner}/${repo}`, "--jq", ".size"], { timeout: 10000 }, (err, stdout) => {
+       if (err) {
+         resolve(null);
+         return;
+       }
+       const kb = parseInt(stdout.trim(), 10);
+       resolve(Number.isNaN(kb) ? null : kb);
+     });
+   });
+ }
+ 
+ export async function getDefaultBranch(owner: string, repo: string): Promise<string | null> {
+   if (!(await checkGhAvailable())) return null;
+ 
+   return new Promise((resolve) => {
+     execFile("gh", ["api", `repos/${owner}/${repo}`, "--jq", ".default_branch"], { timeout: 10000 }, (err, stdout) => {
+       if (err) {
+         resolve(null);
+         return;
+       }
+       const branch = stdout.trim();
+       resolve(branch || null);
+     });
+   });
+ }
+ 
+ async function fetchTreeViaApi(owner: string, repo: string, ref: string): Promise<string | null> {
+   if (!(await checkGhAvailable())) return null;
+ 
+   return new Promise((resolve) => {
+     execFile(
+       "gh",
+       ["api", `repos/${owner}/${repo}/git/trees/${ref}?recursive=1`, "--jq", ".tree[].path"],
+       { timeout: 15000, maxBuffer: 5 * 1024 * 1024 },
+       (err, stdout) => {
+         if (err) {
+           resolve(null);
+           return;
+         }
+         const paths = stdout.trim().split("\n").filter(Boolean);
+         if (paths.length === 0) {
+           resolve(null);
+           return;
+         }
+         const truncated = paths.length > MAX_TREE_ENTRIES;
+         const display = paths.slice(0, MAX_TREE_ENTRIES).join("\n");
+         resolve(truncated ? display + `\n... (${paths.length} total entries)` : display);
+       },
+     );
+   });
+ }
+ 
+ async function fetchReadmeViaApi(owner: string, repo: string, ref: string): Promise<string | null> {
+   if (!(await checkGhAvailable())) return null;
+ 
+   return new Promise((resolve) => {
+     execFile(
+       "gh",
+       ["api", `repos/${owner}/${repo}/readme?ref=${ref}`, "--jq", ".content"],
+       { timeout: 10000 },
+       (err, stdout) => {
+         if (err) {
+           resolve(null);
+           return;
+         }
+         try {
+           const decoded = Buffer.from(stdout.trim(), "base64").toString("utf-8");
+           resolve(decoded.length > 8192 ? decoded.slice(0, 8192) + "\n\n[README truncated at 8K chars]" : decoded);
+         } catch {
+           resolve(null);
+         }
+       },
+     );
+   });
+ }
+ 
+ async function fetchFileViaApi(owner: string, repo: string, path: string, ref: string): Promise<string | null> {
+   if (!(await checkGhAvailable())) return null;
+ 
+   return new Promise((resolve) => {
+     execFile(
+       "gh",
+       ["api", `repos/${owner}/${repo}/contents/${path}?ref=${ref}`, "--jq", ".content"],
+       { timeout: 10000, maxBuffer: 2 * 1024 * 1024 },
+       (err, stdout) => {
+         if (err) {
+           resolve(null);
+           return;
+         }
+         try {
+           resolve(Buffer.from(stdout.trim(), "base64").toString("utf-8"));
+         } catch {
+           resolve(null);
+         }
+       },
+     );
+   });
+ }
+ 
+ export async function fetchViaApi(
+   url: string,
+   owner: string,
+   repo: string,
+   info: GitHubUrlInfo,
+   sizeNote?: string,
+ ): Promise<ExtractedContent | null> {
+   const ref = info.ref || (await getDefaultBranch(owner, repo));
+   if (!ref) return null;
+ 
+   const lines: string[] = [];
+   if (sizeNote) {
+     lines.push(sizeNote);
+     lines.push("");
+   }
+ 
+   if (info.type === "blob" && info.path) {
+     const content = await fetchFileViaApi(owner, repo, info.path, ref);
+     if (!content) return null;
+ 
+     lines.push(`## ${info.path}`);
+     if (content.length > MAX_INLINE_FILE_CHARS) {
+       lines.push(content.slice(0, MAX_INLINE_FILE_CHARS));
+       lines.push(`\n[File truncated at 100K chars]`);
+     } else {
+       lines.push(content);
+     }
+ 
+     return {
+       url,
+       title: `${owner}/${repo} - ${info.path}`,
+       content: lines.join("\n"),
+       error: null,
+     };
+   }
+ 
+   const [tree, readme] = await Promise.all([
+     fetchTreeViaApi(owner, repo, ref),
+     fetchReadmeViaApi(owner, repo, ref),
+   ]);
+ 
+   if (!tree && !readme) return null;
+ 
+   if (tree) {
+     lines.push("## Structure");
+     lines.push(tree);
+     lines.push("");
+   }
+ 
+   if (readme) {
+     lines.push("## README.md");
+     lines.push(readme);
+     lines.push("");
+   }
+ 
+   lines.push("This is an API-only view. Clone the repo or use `read`/`bash` for deeper exploration.");
+ 
+   return {
+     url,
+     title: `${owner}/${repo}`,
+     content: lines.join("\n"),
+     error: null,
+   };
+ }
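An aside on units in the helpers above: the GitHub API's `size` field, which `checkRepoSize` passes through, is measured in KB, so callers divide by 1024 before comparing against the MB threshold. A minimal sketch of that gate, assuming the default 350MB limit:

```typescript
const sizeKB = await checkRepoSize("owner", "repo"); // null when `gh` is missing or the call fails
if (sizeKB !== null && sizeKB / 1024 > 350 /* maxRepoSizeMB default */) {
  // Oversized: return the lightweight fetchViaApi view instead of cloning
}
```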
package/github-extract.ts ADDED
@@ -0,0 +1,505 @@
+ import { existsSync, readFileSync, rmSync, statSync, readdirSync, openSync, readSync, closeSync } from "node:fs";
+ import { execFile } from "node:child_process";
+ import { homedir } from "node:os";
+ import { join, extname } from "node:path";
+ import { activityMonitor } from "./activity.js";
+ import type { ExtractedContent } from "./extract.js";
+ import { checkGhAvailable, checkRepoSize, fetchViaApi, showGhHint } from "./github-api.js";
+ 
+ const CONFIG_PATH = join(homedir(), ".pi", "web-search.json");
+ 
+ const BINARY_EXTENSIONS = new Set([
+   ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".webp", ".svg", ".tiff", ".tif",
+   ".mp3", ".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".wav", ".ogg", ".webm", ".flac", ".aac",
+   ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar", ".zst",
+   ".exe", ".dll", ".so", ".dylib", ".bin", ".o", ".a", ".lib",
+   ".woff", ".woff2", ".ttf", ".otf", ".eot",
+   ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
+   ".sqlite", ".db", ".sqlite3",
+   ".pyc", ".pyo", ".class", ".jar", ".war",
+   ".iso", ".img", ".dmg",
+ ]);
+ 
+ const NOISE_DIRS = new Set([
+   "node_modules", "vendor", ".next", "dist", "build", "__pycache__",
+   ".venv", "venv", ".tox", ".mypy_cache", ".pytest_cache",
+   "target", ".gradle", ".idea", ".vscode",
+ ]);
+ 
+ const MAX_INLINE_FILE_CHARS = 100_000;
+ const MAX_TREE_ENTRIES = 200;
+ 
+ export interface GitHubUrlInfo {
+   owner: string;
+   repo: string;
+   ref?: string;
+   refIsFullSha: boolean;
+   path?: string;
+   type: "root" | "blob" | "tree";
+ }
+ 
+ interface CachedClone {
+   localPath: string;
+   clonePromise: Promise<string | null>;
+ }
+ 
+ interface GitHubCloneConfig {
+   enabled: boolean;
+   maxRepoSizeMB: number;
+   cloneTimeoutSeconds: number;
+   clonePath: string;
+ }
+ 
+ const cloneCache = new Map<string, CachedClone>();
+ 
+ let cachedConfig: GitHubCloneConfig | null = null;
+ 
+ function loadGitHubConfig(): GitHubCloneConfig {
+   if (cachedConfig) return cachedConfig;
+ 
+   const defaults: GitHubCloneConfig = {
+     enabled: true,
+     maxRepoSizeMB: 350,
+     cloneTimeoutSeconds: 30,
+     clonePath: "/tmp/pi-github-repos",
+   };
+ 
+   try {
+     if (existsSync(CONFIG_PATH)) {
+       const raw = JSON.parse(readFileSync(CONFIG_PATH, "utf-8"));
+       const gc = raw.githubClone ?? {};
+       cachedConfig = {
+         enabled: gc.enabled ?? defaults.enabled,
+         maxRepoSizeMB: gc.maxRepoSizeMB ?? defaults.maxRepoSizeMB,
+         cloneTimeoutSeconds: gc.cloneTimeoutSeconds ?? defaults.cloneTimeoutSeconds,
+         clonePath: gc.clonePath ?? defaults.clonePath,
+       };
+       return cachedConfig;
+     }
+   } catch {
+     // ignore parse errors
+   }
+ 
+   cachedConfig = defaults;
+   return cachedConfig;
+ }
+ 
+ const NON_CODE_SEGMENTS = new Set([
+   "issues", "pull", "pulls", "discussions", "releases", "wiki",
+   "actions", "settings", "security", "projects", "graphs",
+   "compare", "commits", "tags", "branches", "stargazers",
+   "watchers", "network", "forks", "milestone", "labels",
+   "packages", "codespaces", "contribute", "community",
+   "sponsors", "invitations", "notifications", "insights",
+ ]);
+ 
+ export function parseGitHubUrl(url: string): GitHubUrlInfo | null {
+   let parsed: URL;
+   try {
+     parsed = new URL(url);
+   } catch {
+     return null;
+   }
+ 
+   if (parsed.hostname !== "github.com") return null;
+ 
+   const segments = parsed.pathname.split("/").filter(Boolean);
+   if (segments.length < 2) return null;
+ 
+   const owner = segments[0];
+   const repo = segments[1].replace(/\.git$/, "");
+ 
+   if (NON_CODE_SEGMENTS.has(segments[2]?.toLowerCase())) return null;
+ 
+   if (segments.length === 2) {
+     return { owner, repo, refIsFullSha: false, type: "root" };
+   }
+ 
+   const action = segments[2];
+   if (action !== "blob" && action !== "tree") return null;
+   if (segments.length < 4) return null;
+ 
+   const ref = segments[3];
+   const refIsFullSha = /^[0-9a-f]{40}$/.test(ref);
+   const pathParts = segments.slice(4);
+   const path = pathParts.length > 0 ? pathParts.join("/") : "";
+ 
+   return {
+     owner,
+     repo,
+     ref,
+     refIsFullSha,
+     path,
+     type: action as "blob" | "tree",
+   };
+ }
+ 
+ function cacheKey(owner: string, repo: string, ref?: string): string {
+   return ref ? `${owner}/${repo}@${ref}` : `${owner}/${repo}`;
+ }
+ 
+ function cloneDir(config: GitHubCloneConfig, owner: string, repo: string, ref?: string): string {
+   const dirName = ref ? `${repo}@${ref}` : repo;
+   return join(config.clonePath, owner, dirName);
+ }
+ 
+ function execClone(args: string[], localPath: string, timeoutMs: number, signal?: AbortSignal): Promise<string | null> {
+   return new Promise((resolve) => {
+     const child = execFile(args[0], args.slice(1), { timeout: timeoutMs }, (err) => {
+       if (err) {
+         try {
+           rmSync(localPath, { recursive: true, force: true });
+         } catch { /* ignore */ }
+         resolve(null);
+         return;
+       }
+       resolve(localPath);
+     });
+ 
+     if (signal) {
+       const onAbort = () => child.kill();
+       signal.addEventListener("abort", onAbort, { once: true });
+       child.on("exit", () => signal.removeEventListener("abort", onAbort));
+     }
+   });
+ }
+ 
+ async function cloneRepo(
+   owner: string,
+   repo: string,
+   ref: string | undefined,
+   config: GitHubCloneConfig,
+   signal?: AbortSignal,
+ ): Promise<string | null> {
+   const localPath = cloneDir(config, owner, repo, ref);
+ 
+   try {
+     rmSync(localPath, { recursive: true, force: true });
+   } catch { /* ignore */ }
+ 
+   const timeoutMs = config.cloneTimeoutSeconds * 1000;
+   const hasGh = await checkGhAvailable();
+ 
+   if (hasGh) {
+     const args = ["gh", "repo", "clone", `${owner}/${repo}`, localPath, "--", "--depth", "1", "--single-branch"];
+     if (ref) args.push("--branch", ref);
+     return execClone(args, localPath, timeoutMs, signal);
+   }
+ 
+   showGhHint();
+ 
+   const gitUrl = `https://github.com/${owner}/${repo}.git`;
+   const args = ["git", "clone", "--depth", "1", "--single-branch"];
+   if (ref) args.push("--branch", ref);
+   args.push(gitUrl, localPath);
+   return execClone(args, localPath, timeoutMs, signal);
+ }
+ 
+ function isBinaryFile(filePath: string): boolean {
+   const ext = extname(filePath).toLowerCase();
+   if (BINARY_EXTENSIONS.has(ext)) return true;
+ 
+   let fd: number;
+   try {
+     fd = openSync(filePath, "r");
+   } catch {
+     return false;
+   }
+   try {
+     const buf = Buffer.alloc(512);
+     const bytesRead = readSync(fd, buf, 0, 512, 0);
+     for (let i = 0; i < bytesRead; i++) {
+       if (buf[i] === 0) return true;
+     }
+   } catch {
+     return false;
+   } finally {
+     closeSync(fd);
+   }
+ 
+   return false;
+ }
+ 
+ function formatFileSize(bytes: number): string {
+   if (bytes < 1024) return `${bytes} B`;
+   if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
+   return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
+ }
+ 
+ function buildTree(rootPath: string): string {
+   const entries: string[] = [];
+ 
+   function walk(dir: string, relPath: string): void {
+     if (entries.length >= MAX_TREE_ENTRIES) return;
+ 
+     let items: string[];
+     try {
+       items = readdirSync(dir).sort();
+     } catch {
+       return;
+     }
+ 
+     for (const item of items) {
+       if (entries.length >= MAX_TREE_ENTRIES) return;
+       if (item === ".git") continue;
+ 
+       const fullPath = join(dir, item);
+       let stat;
+       try {
+         stat = statSync(fullPath);
+       } catch {
+         continue;
+       }
+ 
+       const rel = relPath ? `${relPath}/${item}` : item;
+ 
+       if (stat.isDirectory()) {
+         if (NOISE_DIRS.has(item)) {
+           entries.push(`${rel}/ [skipped]`);
+           continue;
+         }
+         entries.push(`${rel}/`);
+         walk(fullPath, rel);
+       } else {
+         entries.push(rel);
+       }
+     }
+   }
+ 
+   walk(rootPath, "");
+ 
+   if (entries.length >= MAX_TREE_ENTRIES) {
+     entries.push(`... (truncated at ${MAX_TREE_ENTRIES} entries)`);
+   }
+ 
+   return entries.join("\n");
+ }
+ 
+ function buildDirListing(rootPath: string, subPath: string): string {
+   const targetPath = join(rootPath, subPath);
+   const lines: string[] = [];
+ 
+   let items: string[];
+   try {
+     items = readdirSync(targetPath).sort();
+   } catch {
+     return "(directory not readable)";
+   }
+ 
+   for (const item of items) {
+     if (item === ".git") continue;
+     const fullPath = join(targetPath, item);
+     try {
+       const stat = statSync(fullPath);
+       if (stat.isDirectory()) {
+         lines.push(` ${item}/`);
+       } else {
+         lines.push(` ${item} (${formatFileSize(stat.size)})`);
+       }
+     } catch {
+       lines.push(` ${item} (unreadable)`);
+     }
+   }
+ 
+   return lines.join("\n");
+ }
+ 
+ function readReadme(localPath: string): string | null {
+   const candidates = ["README.md", "readme.md", "README", "README.txt", "README.rst"];
+   for (const name of candidates) {
+     const readmePath = join(localPath, name);
+     if (existsSync(readmePath)) {
+       try {
+         const content = readFileSync(readmePath, "utf-8");
+         return content.length > 8192 ? content.slice(0, 8192) + "\n\n[README truncated at 8K chars]" : content;
+       } catch {
+         return null;
+       }
+     }
+   }
+   return null;
+ }
+ 
+ function generateContent(localPath: string, info: GitHubUrlInfo): string {
+   const lines: string[] = [];
+   lines.push(`Repository cloned to: ${localPath}`);
+   lines.push("");
+ 
+   if (info.type === "root") {
+     lines.push("## Structure");
+     lines.push(buildTree(localPath));
+     lines.push("");
+ 
+     const readme = readReadme(localPath);
+     if (readme) {
+       lines.push("## README.md");
+       lines.push(readme);
+       lines.push("");
+     }
+ 
+     lines.push("Use `read` and `bash` tools at the path above to explore further.");
+     return lines.join("\n");
+   }
+ 
+   if (info.type === "tree") {
+     const dirPath = info.path || "";
+     const fullDirPath = join(localPath, dirPath);
+ 
+     if (!existsSync(fullDirPath)) {
+       lines.push(`Path \`${dirPath}\` not found in clone. Showing repository root instead.`);
+       lines.push("");
+       lines.push("## Structure");
+       lines.push(buildTree(localPath));
+     } else {
+       lines.push(`## ${dirPath || "/"}`);
+       lines.push(buildDirListing(localPath, dirPath));
+     }
+ 
+     lines.push("");
+     lines.push("Use `read` and `bash` tools at the path above to explore further.");
+     return lines.join("\n");
+   }
+ 
+   if (info.type === "blob") {
+     const filePath = info.path || "";
+     const fullFilePath = join(localPath, filePath);
+ 
+     if (!existsSync(fullFilePath)) {
+       lines.push(`Path \`${filePath}\` not found in clone. Showing repository root instead.`);
+       lines.push("");
+       lines.push("## Structure");
+       lines.push(buildTree(localPath));
+       lines.push("");
+       lines.push("Use `read` and `bash` tools at the path above to explore further.");
+       return lines.join("\n");
+     }
+ 
+     const stat = statSync(fullFilePath);
+ 
+     if (stat.isDirectory()) {
+       lines.push(`## ${filePath || "/"}`);
+       lines.push(buildDirListing(localPath, filePath));
+       lines.push("");
+       lines.push("Use `read` and `bash` tools at the path above to explore further.");
+       return lines.join("\n");
+     }
+ 
+     if (isBinaryFile(fullFilePath)) {
+       const ext = extname(filePath).replace(".", "");
+       lines.push(`## ${filePath}`);
+       lines.push(`Binary file (${ext}, ${formatFileSize(stat.size)}). Use \`read\` or \`bash\` tools at the path above to inspect.`);
+       return lines.join("\n");
+     }
+ 
+     const content = readFileSync(fullFilePath, "utf-8");
+     lines.push(`## ${filePath}`);
+ 
+     if (content.length > MAX_INLINE_FILE_CHARS) {
+       lines.push(content.slice(0, MAX_INLINE_FILE_CHARS));
+       lines.push("");
+       lines.push(`[File truncated at 100K chars. Full file: ${fullFilePath}]`);
+     } else {
+       lines.push(content);
+     }
+ 
+     lines.push("");
+     lines.push("Use `read` and `bash` tools at the path above to explore further.");
+     return lines.join("\n");
+   }
+ 
+   return lines.join("\n");
+ }
+ 
+ async function awaitCachedClone(
+   cached: CachedClone,
+   url: string,
+   owner: string,
+   repo: string,
+   info: GitHubUrlInfo,
+   signal?: AbortSignal,
+ ): Promise<ExtractedContent | null> {
+   if (signal?.aborted) return fetchViaApi(url, owner, repo, info);
+   const result = await cached.clonePromise;
+   if (signal?.aborted) return fetchViaApi(url, owner, repo, info);
+   if (result) {
+     const content = generateContent(result, info);
+     const title = info.path ? `${owner}/${repo} - ${info.path}` : `${owner}/${repo}`;
+     return { url, title, content, error: null };
+   }
+   return fetchViaApi(url, owner, repo, info);
+ }
+ 
+ export async function extractGitHub(
+   url: string,
+   signal?: AbortSignal,
+   forceClone?: boolean,
+ ): Promise<ExtractedContent | null> {
+   const info = parseGitHubUrl(url);
+   if (!info) return null;
+ 
+   const config = loadGitHubConfig();
+   if (!config.enabled) return null;
+ 
+   const { owner, repo } = info;
+   const key = cacheKey(owner, repo, info.ref);
+ 
+   const cached = cloneCache.get(key);
+   if (cached) return awaitCachedClone(cached, url, owner, repo, info, signal);
+ 
+   if (info.refIsFullSha) {
+     const sizeNote = `Note: Commit SHA URLs use the GitHub API instead of cloning.`;
+     return fetchViaApi(url, owner, repo, info, sizeNote);
+   }
+ 
+   const activityId = activityMonitor.logStart({ type: "fetch", url: `github.com/${owner}/${repo}` });
+ 
+   if (!forceClone) {
+     const sizeKB = await checkRepoSize(owner, repo);
+     if (sizeKB !== null) {
+       const sizeMB = sizeKB / 1024;
+       if (sizeMB > config.maxRepoSizeMB) {
+         activityMonitor.logComplete(activityId, 200);
+         const sizeNote =
+           `Note: Repository is ${Math.round(sizeMB)}MB (threshold: ${config.maxRepoSizeMB}MB). ` +
+           `Showing API-fetched content instead of full clone. Ask the user if they'd like to clone the full repo -- ` +
+           `if yes, call fetch_content again with the same URL and add forceClone: true to the params.`;
+         return fetchViaApi(url, owner, repo, info, sizeNote);
+       }
+     }
+   }
+ 
+   // Re-check: another concurrent caller may have started a clone while we awaited the size check
+   const cachedAfterSizeCheck = cloneCache.get(key);
+   if (cachedAfterSizeCheck) return awaitCachedClone(cachedAfterSizeCheck, url, owner, repo, info, signal);
+ 
+   const clonePromise = cloneRepo(owner, repo, info.ref, config, signal);
+   const localPath = cloneDir(config, owner, repo, info.ref);
+   cloneCache.set(key, { localPath, clonePromise });
+ 
+   const result = await clonePromise;
+ 
+   if (!result) {
+     cloneCache.delete(key);
+     activityMonitor.logError(activityId, "clone failed");
+ 
+     const apiFallback = await fetchViaApi(url, owner, repo, info);
+     if (apiFallback) return apiFallback;
+ 
+     return null;
+   }
+ 
+   activityMonitor.logComplete(activityId, 200);
+   const content = generateContent(result, info);
+   const title = info.path ? `${owner}/${repo} - ${info.path}` : `${owner}/${repo}`;
+   return { url, title, content, error: null };
+ }
+ 
+ export function clearCloneCache(): void {
+   for (const entry of cloneCache.values()) {
+     try {
+       rmSync(entry.localPath, { recursive: true, force: true });
+     } catch { /* ignore */ }
+   }
+   cloneCache.clear();
+   cachedConfig = null;
+ }
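An aside to make the URL routing concrete: this is how `parseGitHubUrl` above classifies a few common shapes (hypothetical owner/repo; return values abbreviated):

```typescript
parseGitHubUrl("https://github.com/owner/repo");
// → { owner: "owner", repo: "repo", refIsFullSha: false, type: "root" }

parseGitHubUrl("https://github.com/owner/repo/blob/main/src/index.ts");
// → { owner: "owner", repo: "repo", ref: "main", refIsFullSha: false,
//     path: "src/index.ts", type: "blob" }

parseGitHubUrl("https://github.com/owner/repo/issues/42");
// → null (non-code URL; extract.ts falls through to the normal HTTP pipeline)
```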
package/index.ts CHANGED
@@ -3,6 +3,7 @@ import { Key, Text, truncateToWidth } from "@mariozechner/pi-tui";
  import { Type } from "@sinclair/typebox";
  import { StringEnum } from "@mariozechner/pi-ai";
  import { fetchAllContent, type ExtractedContent } from "./extract.js";
+ import { clearCloneCache } from "./github-extract.js";
  import { searchWithPerplexity, type SearchResult } from "./perplexity.js";
  import {
    clearResults,
@@ -111,6 +112,7 @@ function formatEntryLine(
  
  function handleSessionChange(ctx: ExtensionContext): void {
    abortPendingFetches();
+   clearCloneCache();
    sessionActive = true;
    restoreFromSession(ctx);
    // Unsubscribe before clear() to avoid callback with stale ctx
@@ -148,6 +150,7 @@ export default function (pi: ExtensionAPI) {
    pi.on("session_shutdown", () => {
      sessionActive = false;
      abortPendingFetches();
+     clearCloneCache();
      clearResults();
      // Unsubscribe before clear() to avoid callback with stale ctx
      widgetUnsubscribe?.();
@@ -393,6 +396,9 @@ export default function (pi: ExtensionAPI) {
    parameters: Type.Object({
      url: Type.Optional(Type.String({ description: "Single URL to fetch" })),
      urls: Type.Optional(Type.Array(Type.String(), { description: "Multiple URLs (parallel)" })),
+     forceClone: Type.Optional(Type.Boolean({
+       description: "Force cloning large GitHub repositories that exceed the size threshold",
+     })),
    }),
  
    async execute(_toolCallId, params, signal, onUpdate, _ctx) {
@@ -409,7 +415,9 @@ export default function (pi: ExtensionAPI) {
      details: { phase: "fetch", progress: 0 },
    });
  
-   const fetchResults = await fetchAllContent(urlList, signal);
+   const fetchResults = await fetchAllContent(urlList, signal, {
+     forceClone: params.forceClone,
+   });
    const successful = fetchResults.filter((r) => !r.error).length;
    const totalChars = fetchResults.reduce((sum, r) => sum + r.content.length, 0);
  
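An aside on the wiring above: `forceClone` travels from the tool schema through `fetchAllContent` into the GitHub extractor, so it applies to every URL in a batch (hypothetical invocation):

```typescript
fetch_content({
  urls: [
    "https://github.com/owner/big-monorepo",
    "https://github.com/owner/big-monorepo/blob/main/README.md",
  ],
  forceClone: true, // applies to each URL; both share one cached clone
})
```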
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "pi-web-access",
-   "version": "0.4.5",
+   "version": "0.5.0",
    "type": "module",
    "keywords": ["pi-package", "pi", "pi-coding-agent", "extension", "web-search", "perplexity", "fetch", "scraping"],
    "dependencies": {