pi-web-toolkit 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,493 @@
1
+ /**
2
+ * Content preview extraction — structural markdown analysis
3
+ *
4
+ * Extracts readable preview snippets from markdown content by analyzing
5
+ * document structure rather than character-level heuristics.
6
+ *
7
+ * Used by web_fetch and web_batch_fetch to show concise page previews.
8
+ */
9
+
10
+ // ---------------------------------------------------------------------------
11
+ // Types
12
+ // ---------------------------------------------------------------------------
13
+
14
/** Structural kind of a parsed markdown block. */
type BlockType =
  | "heading"
  | "paragraph"
  | "list"
  | "table"
  | "code"
  | "rule";

/** Semantic role assigned to a block by the classifier. */
type BlockTag =
  | "content"
  | "toc"
  | "navigation"
  | "metadata"
  | "table_data"
  | "unknown";

/** One block-level markdown element produced by the block parser. */
interface MarkdownBlock {
  /** Structural kind of the block. */
  type: BlockType;
  /** Original markdown source of the block. */
  raw: string;
  /** Block text stripped of markdown syntax. */
  text: string;
  /** Heading level; only set on heading blocks. */
  level?: number;
  /** Zero-based index of the block's first line in the split source. */
  lineStart: number;
}
25
+
26
+ // ---------------------------------------------------------------------------
27
+ // Markdown block parser
28
+ // ---------------------------------------------------------------------------
29
+
30
/**
 * Split markdown into a flat list of block-level elements.
 *
 * Each non-blank line is tested, in this order, as the start of: a fenced
 * code block, a pipe table, an ATX heading, a Setext heading, a horizontal
 * rule, a list, and finally a paragraph. Blank lines separate blocks.
 *
 * @param md Markdown source text.
 * @returns Blocks in document order; `lineStart` is the zero-based index of
 *   each block's first line.
 */
function parseBlocks(md: string): MarkdownBlock[] {
  const lines = md.split("\n");
  const blocks: MarkdownBlock[] = [];
  let i = 0;

  while (i < lines.length) {
    const line = lines[i];
    const trimmed = line.trim();

    // Blank lines only separate blocks.
    if (trimmed === "") {
      i++;
      continue;
    }

    // Fenced code block: runs to the next line that starts with the same
    // three-character fence, or to end of input when the fence is never closed.
    if (trimmed.startsWith("```") || trimmed.startsWith("~~~")) {
      const fence = trimmed.slice(0, 3);
      const start = i;
      i++;
      while (i < lines.length && !lines[i].trim().startsWith(fence)) i++;
      const raw = lines.slice(start, i + 1).join("\n");
      blocks.push({ type: "code", raw, text: raw, lineStart: start });
      i++;
      continue;
    }

    // Table: consecutive lines starting with a pipe.
    if (trimmed.startsWith("|")) {
      const start = i;
      const tableLines: string[] = [];
      while (i < lines.length && lines[i].trim().startsWith("|")) {
        tableLines.push(lines[i]);
        i++;
      }
      const raw = tableLines.join("\n");
      blocks.push({ type: "table", raw, text: stripTable(raw), lineStart: start });
      continue;
    }

    // Heading (ATX style)
    const atxMatch = trimmed.match(/^(#{1,6})\s+(.+)$/);
    if (atxMatch) {
      blocks.push({
        type: "heading",
        raw: line,
        text: atxMatch[2].trim(),
        level: atxMatch[1].length,
        lineStart: i,
      });
      i++;
      continue;
    }

    // Heading (Setext style): current line underlined by a run of "=" or "-".
    if (i + 1 < lines.length) {
      const nextTrimmed = lines[i + 1].trim();
      if (/^=+$/.test(nextTrimmed) || /^-+$/.test(nextTrimmed)) {
        const level = nextTrimmed[0] === "=" ? 1 : 2;
        blocks.push({
          type: "heading",
          raw: lines.slice(i, i + 2).join("\n"),
          text: trimmed,
          level,
          lineStart: i,
        });
        i += 2;
        continue;
      }
    }

    // Horizontal rule
    if (/^(---|___|\*\*\*|- - -)$/.test(trimmed)) {
      blocks.push({ type: "rule", raw: line, text: "", lineStart: i });
      i++;
      continue;
    }

    // List: first item plus following items, blank lines and
    // space-indented continuation lines.
    if (/^\s*[-*+]\s/.test(trimmed) || /^\s*\d+\.\s/.test(trimmed)) {
      const start = i;
      const listLines: string[] = [lines[i]];
      i++;
      while (i < lines.length) {
        const nextLine = lines[i];
        if (nextLine.trim() === "") {
          listLines.push(nextLine);
          i++;
          continue;
        }
        if (/^\s*[-*+]\s/.test(nextLine) || /^\s*\d+\.\s/.test(nextLine)) {
          listLines.push(nextLine);
          i++;
          continue;
        }
        // Continuation line (indented relative to first item)
        if (nextLine.startsWith(" ")) {
          listLines.push(nextLine);
          i++;
          continue;
        }
        break;
      }
      const raw = listLines.join("\n");
      blocks.push({ type: "list", raw, text: stripMarkdown(raw), lineStart: start });
      continue;
    }

    // Paragraph (default): consume lines until a blank line or the start of
    // another structural element.
    const start = i;
    const paraLines: string[] = [];
    while (i < lines.length && lines[i].trim() !== "") {
      const t = lines[i].trim();
      // Only lines AFTER the first may end the paragraph. The first line was
      // already rejected by every structural branch above, but the loose
      // heading test here also matches lines the strict ATX regex refuses
      // (heading text containing "\r", U+2028 or U+2029, which "." does not
      // match). Breaking on the first line would leave `i` unchanged and
      // loop forever, so the first line is always consumed.
      const startsStructure =
        /^#{1,6}\s/.test(t) || t.startsWith("```") || t.startsWith("~~~") ||
        t.startsWith("|") || /^\s*[-*+]\s/.test(t) || /^\s*\d+\.\s/.test(t) ||
        /^(---|___|\*\*\*|- - -)$/.test(t);
      if (i > start && startsStructure) break;
      paraLines.push(lines[i]);
      i++;
    }
    const raw = paraLines.join("\n");
    blocks.push({ type: "paragraph", raw, text: stripMarkdown(raw), lineStart: start });
  }

  return blocks;
}
157
+
158
+ // ---------------------------------------------------------------------------
159
+ // Markdown stripping helpers
160
+ // ---------------------------------------------------------------------------
161
+
162
/**
 * Convert a pipe-table block to plain text, one output line per table row.
 *
 * Separator cells (made only of "-" and ":") split a source line into
 * sub-rows, which handles both normal tables (separator on its own line) and
 * condensed tables where separators appear inline. Rows with at most 12
 * words and at least 4 link markers ("[" or "](" counted on the raw line)
 * are dropped as navigation bars.
 *
 * @param tableMd Raw markdown of the table (lines starting with "|").
 * @returns Remaining rows joined with "\n", each stripped of markdown syntax.
 */
function stripTable(tableMd: string): string {
  const rows: Array<{ text: string; rawLinkCount: number }> = [];

  // Turn the collected cells into one text row and record it when non-empty.
  const flushRow = (cells: string[], rawLinkCount: number): void => {
    if (cells.length === 0) return;
    const rowText = cells.map((c) => stripMarkdown(c)).join(" ").trim().replace(/\s+/g, " ");
    if (rowText.length > 0 && !/^[-\s|]+$/.test(rowText)) {
      rows.push({ text: rowText, rawLinkCount });
    }
  };

  for (const line of tableMd.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed.startsWith("|")) continue;

    // Count link markers on raw markdown before stripping
    const rawLinkCount = (trimmed.match(/\[|\]\(/g) || []).length;

    // Drop the empty piece before the leading pipe. The piece after the last
    // pipe is only dropped when the row really ends with a pipe; a row
    // without a closing pipe would otherwise lose its last cell.
    const pieces = trimmed.split("|");
    const end = trimmed.endsWith("|") ? -1 : pieces.length;
    const allCells = pieces.slice(1, end).map((c) => c.trim());

    let currentRow: string[] = [];
    for (const cell of allCells) {
      if (/^[-:]+$/.test(cell)) {
        // Separator cell: close the sub-row collected so far.
        flushRow(currentRow, rawLinkCount);
        currentRow = [];
      } else if (cell.length > 0) {
        currentRow.push(cell);
      }
    }
    flushRow(currentRow, rawLinkCount);
  }

  // Filter out navigation rows (HN-style header bars, short-link nav)
  return rows
    .filter((r) => {
      const rWords = r.text.split(/\s+/).filter((w) => w.length > 0);
      return !(rWords.length <= 12 && r.rawLinkCount >= 4);
    })
    .map((r) => r.text)
    .join("\n");
}
212
+
213
/**
 * Reduce markdown to plain text.
 *
 * Removes images and fenced code, unwraps links, emphasis and inline code,
 * strips heading, bullet and blockquote markers, then collapses all
 * whitespace to single spaces.
 *
 * Underscores are treated as emphasis only when they are not glued to a
 * letter or digit, so identifiers such as `web_batch_fetch` keep their
 * underscores.
 *
 * @param md Markdown source text.
 * @returns Single-line plain text, trimmed.
 */
export function stripMarkdown(md: string): string {
  // Add spaces between adjacent markdown elements to prevent concatenation
  const spaced = md
    .replace(/\]\[/g, "] [") // adjacent links
    .replace(/\*\*\[/g, "** [") // bold then link
    .replace(/\]\*\*/g, "] **"); // link then bold

  return spaced
    .replace(/!\[[^\]]*\]\([^)]*\)/g, "") // images
    .replace(/\[([^\]]+)\]\([^)]*\)/g, "$1") // links -> link text
    .replace(/\*{1,2}([^*]+)\*{1,2}/g, "$1") // *em* / **strong**
    // _em_ / __strong__, but never underscores inside a word (snake_case)
    .replace(/(?<![\p{L}\p{N}])_{1,2}([^_]+)_{1,2}(?![\p{L}\p{N}])/gu, "$1")
    .replace(/^#{1,6}\s+/gm, "") // heading markers
    .replace(/^\s*[-*+]\s+/gm, "") // bullet markers
    .replace(/^\s*>\s*/gm, "") // blockquote markers
    .replace(/```[\s\S]*?```/g, "") // fenced code
    .replace(/`([^`]+)`/g, "$1") // inline code
    .replace(/\s+/g, " ")
    .trim();
}
232
+
233
+ // ---------------------------------------------------------------------------
234
+ // Block classification
235
+ // ---------------------------------------------------------------------------
236
+
237
/**
 * Assign a semantic tag to a parsed block using text and link heuristics.
 *
 * Code and rule blocks are always "unknown". Other blocks are checked for
 * edit-link metadata, table-of-contents headings, navigation-like lists and
 * tables, and form/dropdown dumps. A paragraph containing sentence
 * punctuation and enough words (or characters, for CJK text) is "content".
 *
 * @param block Block to classify.
 * @param neighbors Blocks around `block` (may include `block` itself); used
 *   to judge whether a short paragraph sits inside a navigation cluster.
 * @returns The tag for the block; "unknown" when no heuristic applies.
 */
function classifyBlock(block: MarkdownBlock, neighbors: MarkdownBlock[]): BlockTag {
  if (block.type === "code" || block.type === "rule") return "unknown";

  const t = block.text;
  const words = t.split(/\s+/).filter((w) => w.length > 0);
  // CJK text has no spaces, so length checks use characters instead of words.
  const hasCJK = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]/.test(t);

  // Dense runs of Title Case tokens (AfghanistanAkrotiriAlbania...) indicate
  // a dropdown rendered as text. The global regex finds tokens even when
  // they are not separated by spaces.
  const looksLikeOptionDump = (): boolean => {
    const titleCaseMatches = t.match(/[A-Z][a-z]+/g) || [];
    const titleCaseRatio = titleCaseMatches.length / Math.max(words.length, 1);
    return titleCaseMatches.length > 50 && (words.length < 200 || titleCaseRatio > 1.5);
  };

  // Wikipedia-style edit links.
  // NOTE(review): the second test matches the word "edit" anywhere in the
  // text, which also tags ordinary prose as metadata; confirm this is intended.
  if (/\[edit\]/i.test(block.raw) || /\bEdit\b/i.test(t)) return "metadata";

  // TOC detection: common heading labels across languages
  if (block.type === "heading" &&
    /^(Contents|Table of contents|目次|目录|Inhaltsverzeichnis|Sommaire|Contenido|Содержание|Indice|Conteúdo|목차|İçindekiler)$/i.test(t)) {
    return "toc";
  }

  // List classification
  if (block.type === "list") {
    const items = block.raw.split("\n").filter((l) => /^\s*[-*+]\s/.test(l) || /^\s*\d+\.\s/.test(l));

    // Detect form dropdowns rendered as list items (e.g. "- select -Afghanistan...")
    if (/select\s+-/i.test(t) || /choose\s+/i.test(t)) return "navigation";
    if (looksLikeOptionDump()) return "navigation";

    // All items are short links -> TOC or navigation
    const allShortLinks = items.every((item) => {
      const stripped = stripMarkdown(item);
      return stripped.length < 40 && /\[/.test(item);
    });
    if (allShortLinks && items.length > 2) return "toc";

    // Language links pattern: [Lang](https://xx.wikipedia.org/...)
    const langLinks = items.filter((item) =>
      /https?:\/\/[a-z]{2}(-[a-z]+)?\.\w+/.test(item),
    );
    if (langLinks.length > 3) return "navigation";

    // Mostly-link lists with more than five items are navigation.
    const linkRatio = items.filter((item) => /\[/.test(item)).length / Math.max(items.length, 1);
    if (linkRatio > 0.8 && items.length > 5) return "navigation";
  }

  // Table classification
  if (block.type === "table") {
    const rows = block.text.split("\n").filter((r) => r.trim().length > 0);
    if (rows.length === 0) return "unknown";

    // Check if most rows are short-link navigation.
    // NOTE(review): rows come from block.text; if link syntax has already
    // been stripped there, the link count below is always 0. Verify.
    const navRows = rows.filter((r) => {
      const rowWords = r.split(/\s+/).filter((w) => w.length > 0);
      const linkCount = (r.match(/\[|\]\(/g) || []).length;
      return rowWords.length <= 12 && linkCount >= 2;
    });
    if (navRows.length / rows.length > 0.7) return "navigation";

    return "table_data";
  }

  if (block.type === "paragraph") {
    // Very short -> likely not content
    if (t.length < 30) return "unknown";

    // Share of nearby blocks (within 10 lines, excluding this one) that are
    // lists or headings. The denominator must be the number of nearby
    // blocks; dividing the nav count by itself made the ratio 1 whenever a
    // single heading or list was close, so every short paragraph under a
    // heading was tagged navigation.
    const nearby = neighbors.filter((n) =>
      n !== block &&
      n.lineStart >= block.lineStart - 10 && n.lineStart <= block.lineStart + 10,
    );
    const nearbyNav = nearby.filter((n) => n.type === "list" || n.type === "heading");
    const navRatio = nearbyNav.length / Math.max(nearby.length, 1);
    const navThreshold = hasCJK ? 40 : 15;
    const isShort = hasCJK ? t.length < navThreshold : words.length < navThreshold;
    if (navRatio > 0.5 && isShort) return "navigation";

    // Detect select dropdowns rendered as text
    if (/^-?\s*select\s+-/i.test(t) || /^-?\s*choose\s+/i.test(t)) return "navigation";

    // Detect dense concatenated option lists
    if (looksLikeOptionDump()) return "navigation";
  }

  // Default: real text with sentence punctuation and enough length.
  const effectiveWords = hasCJK ? t.length : words.length;
  const wordThreshold = hasCJK ? 30 : 10;

  if (block.type === "paragraph" && /[.!?。!?]/.test(t) && effectiveWords > wordThreshold) {
    // But not if it's mostly Title Case (dropdown options, form labels)
    const titleCaseWords = words.filter((w) =>
      /^[A-Z][a-z]+$/.test(w) && w.length > 2
    );
    if (titleCaseWords.length / Math.max(words.length, 1) > 0.6) return "navigation";
    return "content";
  }

  return "unknown";
}
348
+
349
+ // ---------------------------------------------------------------------------
350
+ // Preview extraction
351
+ // ---------------------------------------------------------------------------
352
+
353
/** A block together with its classification tag and preview score. */
interface ScoredBlock {
  block: MarkdownBlock;
  tag: BlockTag;
  score: number;
}

/**
 * Classify every block and give it a preview-worthiness score.
 *
 * Each block is classified against a window of up to three blocks on either
 * side. Content scores highest, table data scores by its number of content
 * rows, and TOC / navigation / metadata get a large negative score. Blocks
 * starting in the first five lines and blocks mentioning cookie or
 * legal-notice wording are penalised.
 *
 * @param blocks Parsed blocks in document order.
 * @returns One entry per input block, in the same order.
 */
function scoreBlocks(blocks: MarkdownBlock[]): ScoredBlock[] {
  return blocks.map((block, index) => {
    const windowStart = Math.max(0, index - 3);
    const windowEnd = Math.min(blocks.length, index + 4);
    const tag = classifyBlock(block, blocks.slice(windowStart, windowEnd));

    let score: number;
    if (tag === "content") {
      // Base score, a capped length bonus, and a bonus for following a heading.
      score = 100 + Math.min(block.text.length / 20, 30);
      if (index > 0 && blocks[index - 1].type === "heading") score += 20;
    } else if (tag === "table_data") {
      // Table-heavy pages (e.g. HN): score by the number of content rows,
      // ignoring short rows that carry link markers (header / nav bars).
      const nonEmptyRows = block.text.split("\n").filter((row) => row.trim().length > 0);
      const contentRowCount = nonEmptyRows.filter((row) => {
        const wordCount = row.split(/\s+/).filter((w) => w.length > 0).length;
        const linkMarkers = (row.match(/\[|\]\(/g) || []).length;
        return wordCount > 8 || linkMarkers < 2;
      }).length;
      score = Math.min(contentRowCount * 5, 80);
    } else if (tag === "toc" || tag === "navigation" || tag === "metadata") {
      score = -100;
    } else {
      // Unknown blocks: judge by text quality.
      score = 0;
      if (block.type === "paragraph") {
        if (block.text.length > 80 && /[.!?。!?]/.test(block.text)) {
          score = 50;
        } else if (block.text.length > 40) {
          score = 20;
        }
      }
      // Unknown lists with some substance.
      if (block.type === "list" && block.text.length > 60) {
        score = 30;
      }
    }

    // Blocks at the very top of the page are usually navigation.
    if (block.lineStart < 5) score -= 30;

    // Cookie banners and legal notices.
    if (/cookie|consent|privacy policy|terms of use/i.test(block.text)) {
      score -= 50;
    }

    return { block, tag, score };
  });
}
428
+
429
/**
 * Build the preview text from parsed blocks.
 *
 * Picks the highest-scoring block (the earliest one on ties). When no block
 * scores above zero, falls back to the concatenated text of every block that
 * is not code, a rule, TOC, navigation or metadata. Otherwise the preview
 * starts with the best block and appends adjacent blocks (offsets -1, +1,
 * -2, +2, in that order) that score at least 20 and still fit.
 *
 * @param blocks Parsed blocks in document order.
 * @param maxLen Maximum length of the returned preview, in characters.
 * @returns Preview text no longer than `maxLen`.
 */
function buildPreview(blocks: MarkdownBlock[], maxLen: number): string {
  const scored = scoreBlocks(blocks);

  // One lookup table instead of a linear search per block.
  const scoreByBlock = new Map<MarkdownBlock, ScoredBlock>();
  for (const entry of scored) scoreByBlock.set(entry.block, entry);

  // Highest score wins; strict ">" keeps the earliest block on ties.
  let best: ScoredBlock | undefined;
  for (const entry of scored) {
    if (!best || entry.score > best.score) best = entry;
  }

  if (!best || best.score <= 0) {
    // Fallback: join all non-nav, non-toc text
    const navTags = new Set<string>(["toc", "navigation", "metadata"]);
    const allText = blocks
      .filter((b) => b.type !== "code" && b.type !== "rule")
      .map((b) => {
        const s = scoreByBlock.get(b);
        return s && navTags.has(s.tag) ? "" : b.text;
      })
      .join(" ")
      .replace(/\s+/g, " ")
      .trim();
    return allText.slice(0, maxLen);
  }

  const idx = blocks.indexOf(best.block);
  const bestText = best.block.text.trim();

  if (bestText.length > maxLen) {
    // Reserve room for the ellipsis so the result never exceeds maxLen, then
    // drop the trailing partial word.
    const ellipsis = "...";
    const room = Math.max(maxLen - ellipsis.length, 0);
    return (bestText.slice(0, room).replace(/\s+\S*$/, "") + ellipsis).slice(0, maxLen);
  }

  const parts: string[] = [bestText];
  let remaining = maxLen - bestText.length;

  // Try adjacent blocks, nearest first.
  for (const offset of [-1, 1, -2, 2]) {
    if (remaining < 20) break;
    const neighbor = blocks[idx + offset];
    // A missing neighbour (best block at the start or end of the document)
    // must not stop the search on the other side.
    if (!neighbor) continue;
    const neighborScore = scoreByBlock.get(neighbor);
    if (!neighborScore || neighborScore.score < 20) continue;

    const text = neighbor.text.trim();
    // "- 2" accounts for the "\n\n" separator added by the join below.
    if (text.length <= remaining - 2) {
      parts.push(text);
      remaining -= text.length + 2;
    }
  }

  return parts.join("\n\n").slice(0, maxLen);
}
479
+
480
/**
 * Produce a plain-text preview of markdown content.
 *
 * The markdown is parsed into blocks, and the preview is then built from
 * those blocks. Input that yields no blocks gives an empty preview.
 *
 * @param content Markdown source text.
 * @param maxLen Preview length limit passed to the preview builder
 *   (default 500).
 * @returns The preview text, or "" when `content` contains no blocks.
 */
export function extractPreview(content: string, maxLen: number = 500): string {
  const parsed = parseBlocks(content);
  return parsed.length > 0 ? buildPreview(parsed, maxLen) : "";
}
@@ -0,0 +1,67 @@
1
+ /**
2
+ * Output sink — truncation + temp-file fallback
3
+ *
4
+ * Centralises the output handling policy for all tools:
5
+ * - Truncate to line/byte budgets
6
+ * - Write full output to a temp file when truncated
7
+ * - Return display text + optional temp-file path
8
+ */
9
+
10
+ import { mkdtemp, writeFile } from "node:fs/promises";
11
+ import * as os from "node:os";
12
+ import * as path from "node:path";
13
+ import {
14
+ truncateHead,
15
+ formatSize,
16
+ DEFAULT_MAX_BYTES,
17
+ DEFAULT_MAX_LINES,
18
+ } from "@earendil-works/pi-coding-agent";
19
+
20
/** Settings that control truncation and temp-file output for a tool. */
export interface OutputSinkOptions {
  /** Prefix for the temp directory name, e.g. "pi-web-search-". */
  tmpPrefix: string;
  /** When true, the full output is written to a temp file even if nothing was truncated. */
  alwaysWriteFile?: boolean;
  /** Line budget; the package default is used when omitted. */
  maxLines?: number;
  /** Byte budget; the package default is used when omitted. */
  maxBytes?: number;
}
30
+
31
/** What the output sink hands back to a tool. */
export interface OutputSinkResult {
  /** Text to display; may be truncated and carry a truncation notice. */
  text: string;
  /** Location of the temp file holding the full output; absent when no file was written. */
  fullOutputPath?: string;
}
37
+
38
/**
 * Apply the truncation policy to raw tool output.
 *
 * The text is truncated to the line and byte budgets. If it was truncated,
 * or if `alwaysWriteFile` is set, the complete raw text is written to
 * `output.txt` inside a fresh temp directory. Truncated output gets a notice
 * appended that reports the line and byte counts and the file path.
 *
 * @param rawText Complete tool output.
 * @param options Budgets, temp directory prefix and file-writing policy.
 * @returns Display text plus the temp-file path when a file was written.
 */
export async function writeWithFallback(
  rawText: string,
  options: OutputSinkOptions,
): Promise<OutputSinkResult> {
  const limits = {
    maxLines: options.maxLines ?? DEFAULT_MAX_LINES,
    maxBytes: options.maxBytes ?? DEFAULT_MAX_BYTES,
  };
  const result = truncateHead(rawText, limits);

  let fullOutputPath: string | undefined;
  const needsFile = result.truncated || options.alwaysWriteFile;
  if (needsFile) {
    const dir = await mkdtemp(path.join(os.tmpdir(), options.tmpPrefix));
    fullOutputPath = path.join(dir, "output.txt");
    await writeFile(fullOutputPath, rawText, "utf-8");
  }

  if (!result.truncated) {
    return { text: result.content, fullOutputPath };
  }

  const notice = `\n\n[Output truncated: ${result.outputLines} of ${result.totalLines} lines (${formatSize(result.outputBytes)} of ${formatSize(result.totalBytes)}). Full output saved to: ${fullOutputPath}]`;
  return { text: result.content + notice, fullOutputPath };
}
@@ -0,0 +1,77 @@
1
+ /**
2
+ * Shared rendering utilities for web toolkit tools.
3
+ *
4
+ * Functions that format URLs, snippets, and metadata for consistent
5
+ * TUI presentation across web_search, web_fetch, web_browse, and
6
+ * web_batch_fetch.
7
+ */
8
+
9
/**
 * Abbreviate a URL for compact display.
 *
 * The scheme and fragment are dropped. Host plus path and query is returned
 * unchanged when it fits in `maxLen`; otherwise the middle is replaced by
 * "...", keeping roughly the first 40% and last 35% of `maxLen` characters.
 * A URL with no path is shown as its bare host. Input that does not parse as
 * a URL is cut at the end instead.
 *
 *   https://github.com/microsoft/TypeScript/blob/main/README.md
 *     → github.com/microso.../main/README.md
 *
 *   https://example.com
 *     → example.com
 *
 * @param url URL, or arbitrary text, to abbreviate.
 * @param maxLen Target maximum display length (default 45).
 * @returns The abbreviated display string.
 */
export function abbreviateUrl(url: string, maxLen = 45): string {
  try {
    const u = new URL(url);
    const host = u.hostname;
    const path = u.pathname + u.search;
    if (path === "/" || path === "") return host;
    const full = host + path;
    if (full.length <= maxLen) return full;
    // Keep start and end, ellipsis in middle
    const keepStart = Math.max(Math.floor(maxLen * 0.4), 0);
    const keepEnd = Math.max(Math.floor(maxLen * 0.35), 0);
    // slice(-0) is slice(0) and would return the whole string, so a
    // zero-length tail has to be handled explicitly.
    const tail = keepEnd > 0 ? full.slice(-keepEnd) : "";
    return full.slice(0, keepStart) + "..." + tail;
  } catch {
    if (url.length <= maxLen) return url;
    // Clamp so a tiny maxLen does not turn into a negative slice end.
    return url.slice(0, Math.max(maxLen - 3, 0)) + "...";
  }
}
35
+
36
/**
 * Return the hostname part of a URL.
 *
 * @param url URL string to inspect.
 * @returns The hostname, or the input unchanged when it does not parse as a URL.
 */
export function getDomain(url: string): string {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }
  return parsed.hostname;
}
46
+
47
/**
 * Prepare text for single-line display: every run of whitespace becomes one
 * space, and leading and trailing whitespace is removed.
 *
 * @param text Text to normalize.
 * @returns The normalized text.
 */
export function normalizeWhitespace(text: string): string {
  const tokens = text.split(/\s+/).filter((token) => token.length > 0);
  return tokens.join(" ");
}
54
+
55
/**
 * Build the label that shows how much source was reduced to a preview,
 * e.g. "15.0KB → 500 chars".
 *
 * The size is shown in MB from 1024 * 1024 bytes upward, in KB from 1024
 * bytes upward (both with one decimal), and in plain bytes below that.
 *
 * @param sizeBytes Size of the source in bytes.
 * @param previewChars Number of characters in the preview.
 * @returns The formatted label.
 */
export function formatExtraction(sizeBytes: number, previewChars: number): string {
  const kib = 1024;
  const mib = 1024 * 1024;

  let sizeLabel: string;
  if (sizeBytes >= mib) {
    sizeLabel = `${(sizeBytes / mib).toFixed(1)}MB`;
  } else if (sizeBytes >= kib) {
    sizeLabel = `${(sizeBytes / kib).toFixed(1)}KB`;
  } else {
    sizeLabel = `${sizeBytes}B`;
  }

  return `${sizeLabel} → ${previewChars} chars`;
}
68
+
69
/**
 * Read the error message from a tool result.
 *
 * Only the first content entry is inspected, and its `text` is returned when
 * it is a string.
 *
 * @param result Tool result whose first content entry may carry the message.
 * @returns The message text, or "Unknown error" when there is none.
 */
export function getErrorText(result: { content?: Array<{ type?: string; text?: string }> }): string {
  const message = result.content?.[0]?.text;
  return typeof message === "string" ? message : "Unknown error";
}
@@ -1,4 +1,4 @@
1
- import { spawn } from "node:child_process";
1
+ import { runCLI } from "./cli-runner";
2
2
 
3
3
  /**
4
4
  * Run a scrapling CLI command with optional abort signal.
@@ -7,32 +7,7 @@ export function runScrapling(
7
7
  args: string[],
8
8
  signal?: AbortSignal,
9
9
  ): Promise<{ stdout: string; stderr: string; exitCode: number }> {
10
- return new Promise((resolve) => {
11
- const proc = spawn("scrapling", args, { shell: false, stdio: ["ignore", "pipe", "pipe"] });
12
- let stdout = "";
13
- let stderr = "";
14
-
15
- proc.stdout.on("data", (data) => {
16
- stdout += data.toString();
17
- });
18
- proc.stderr.on("data", (data) => {
19
- stderr += data.toString();
20
- });
21
- proc.on("close", (code, closeSignal) => {
22
- const exitCode = code ?? 1;
23
- const signalMessage = closeSignal ? `Process terminated by ${closeSignal}` : "";
24
- resolve({ stdout, stderr: stderr || signalMessage, exitCode });
25
- });
26
- proc.on("error", (err) => resolve({ stdout, stderr: err.message, exitCode: 1 }));
27
-
28
- if (signal) {
29
- const kill = () => {
30
- proc.kill("SIGTERM");
31
- };
32
- if (signal.aborted) kill();
33
- else signal.addEventListener("abort", kill, { once: true });
34
- }
35
- });
10
+ return runCLI({ command: "scrapling", args, signal });
36
11
  }
37
12
 
38
13
  /**