@oh-my-pi/pi-coding-agent 16.0.7 → 16.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -0
- package/dist/cli.js +4752 -12462
- package/dist/types/cli/update-cli.d.ts +11 -0
- package/dist/types/debug/remote-debugger.d.ts +45 -0
- package/dist/types/internal-urls/docs-index.d.ts +19 -0
- package/dist/types/markit/converters/docx.d.ts +6 -0
- package/dist/types/markit/converters/epub.d.ts +15 -0
- package/dist/types/markit/converters/pdf/columns.d.ts +35 -0
- package/dist/types/markit/converters/pdf/extract.d.ts +10 -0
- package/dist/types/markit/converters/pdf/grid.d.ts +25 -0
- package/dist/types/markit/converters/pdf/headers.d.ts +24 -0
- package/dist/types/markit/converters/pdf/index.d.ts +6 -0
- package/dist/types/markit/converters/pdf/render.d.ts +24 -0
- package/dist/types/markit/converters/pdf/types.d.ts +75 -0
- package/dist/types/markit/converters/pptx.d.ts +57 -0
- package/dist/types/markit/converters/xlsx.d.ts +25 -0
- package/dist/types/markit/index.d.ts +2 -0
- package/dist/types/markit/registry.d.ts +16 -0
- package/dist/types/markit/types.d.ts +30 -0
- package/dist/types/session/agent-session.d.ts +7 -8
- package/dist/types/session/auth-storage.d.ts +3 -2
- package/dist/types/session/yield-queue.d.ts +3 -1
- package/dist/types/tools/browser/attach.d.ts +1 -1
- package/dist/types/utils/markit.d.ts +0 -8
- package/dist/types/utils/mupdf-wasm-embed.d.ts +1 -0
- package/dist/types/utils/turndown.d.ts +15 -0
- package/dist/types/utils/zip.d.ts +119 -0
- package/package.json +20 -18
- package/scripts/build-binary.ts +7 -3
- package/scripts/bundle-dist.ts +28 -12
- package/scripts/embed-mupdf-wasm.ts +67 -0
- package/scripts/generate-docs-index.ts +48 -32
- package/scripts/omp +1 -1
- package/src/advisor/__tests__/advisor.test.ts +83 -0
- package/src/advisor/runtime.ts +16 -1
- package/src/cli/auth-broker-cli.ts +1 -3
- package/src/cli/auth-gateway-cli.ts +2 -5
- package/src/cli/update-cli.ts +63 -3
- package/src/config/model-discovery.ts +20 -8
- package/src/config/models-config-schema.ts +8 -1
- package/src/debug/index.ts +44 -0
- package/src/debug/remote-debugger.ts +151 -0
- package/src/debug/report-bundle.ts +2 -1
- package/src/internal-urls/docs-index.generated.txt +2 -0
- package/src/internal-urls/docs-index.ts +102 -0
- package/src/internal-urls/omp-protocol.ts +10 -9
- package/src/markit/NOTICE +32 -0
- package/src/markit/converters/docx.ts +56 -0
- package/src/markit/converters/epub.ts +136 -0
- package/src/markit/converters/mammoth.d.ts +24 -0
- package/src/markit/converters/pdf/columns.ts +103 -0
- package/src/markit/converters/pdf/extract.ts +574 -0
- package/src/markit/converters/pdf/grid.ts +780 -0
- package/src/markit/converters/pdf/headers.ts +106 -0
- package/src/markit/converters/pdf/index.ts +146 -0
- package/src/markit/converters/pdf/render.ts +501 -0
- package/src/markit/converters/pdf/types.ts +84 -0
- package/src/markit/converters/pptx.ts +325 -0
- package/src/markit/converters/xlsx.ts +173 -0
- package/src/markit/index.ts +2 -0
- package/src/markit/registry.ts +59 -0
- package/src/markit/types.ts +35 -0
- package/src/modes/components/snapcompact-shape-preview-doc.md +14 -7
- package/src/modes/components/snapcompact-shape-preview.ts +2 -2
- package/src/modes/controllers/input-controller.ts +29 -8
- package/src/modes/interactive-mode.ts +26 -9
- package/src/prompts/advisor/system.md +1 -0
- package/src/sdk.ts +5 -9
- package/src/session/agent-session.ts +62 -40
- package/src/session/auth-storage.ts +2 -11
- package/src/session/yield-queue.ts +7 -1
- package/src/tools/browser/attach.ts +2 -2
- package/src/tools/fetch.ts +25 -60
- package/src/tools/read.ts +1 -1
- package/src/tools/search.ts +1 -6
- package/src/tools/write.ts +25 -65
- package/src/utils/markit.ts +25 -9
- package/src/utils/mupdf-wasm-embed.ts +12 -0
- package/src/utils/tools-manager.ts +2 -11
- package/src/utils/turndown.ts +83 -0
- package/src/{tools/archive-reader.ts → utils/zip.ts} +453 -83
- package/src/web/scrapers/types.ts +3 -46
- package/dist/types/internal-urls/docs-index.generated.d.ts +0 -2
- package/dist/types/tools/archive-reader.d.ts +0 -49
- package/src/internal-urls/docs-index.generated.ts +0 -120
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
// Adapted from markit-ai (MIT). See ../../NOTICE.
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Running header/footer detection and removal.
|
|
5
|
+
*
|
|
6
|
+
* Many PDFs have repeated text at the top or bottom of every page:
|
|
7
|
+
* document titles, chapter names, page numbers, copyright notices.
|
|
8
|
+
* These pollute the markdown output as false headings or noise.
|
|
9
|
+
*
|
|
10
|
+
* Algorithm:
|
|
11
|
+
* 1. For each page, bucket text boxes by Y position (top/bottom zones)
|
|
12
|
+
* 2. Collect the text content at each zone across all pages
|
|
13
|
+
* 3. Text appearing on roughly 20% or more of pages (and at least 3) OR on 8+ consecutive pages is a
|
|
14
|
+
* running header/footer
|
|
15
|
+
* 4. Remove matching text boxes before further processing
|
|
16
|
+
*/
|
|
17
|
+
import type { PageContent } from "./types";
|
|
18
|
+
|
|
19
|
+
/** Minimum number of pages to enable header/footer detection. */
const MIN_PAGES = 5;
/**
 * Minimum Y position for top zone (from bottom of page in PDF coords).
 * NOTE(review): this is an absolute coordinate, not a fraction of the page
 * height — presumably tuned for Letter/A4-sized pages; confirm for other sizes.
 */
const TOP_ZONE_MIN_Y = 700;
/** Maximum Y position for bottom zone. */
const BOTTOM_ZONE_MAX_Y = 80;
/**
 * Minimum consecutive pages a text must appear on to be considered a
 * running header/footer. Catches chapter-specific headers that repeat on
 * this many (8+) consecutive pages within a chapter even when they fall
 * short of the document-wide frequency threshold.
 */
const MIN_CONSECUTIVE_PAGES = 8;
/** Fraction of all pages a zone text must appear on to count as document-wide. */
const GLOBAL_PAGE_FRACTION = 0.2;
/** Lower bound on the document-wide occurrence threshold, whatever the page count. */
const MIN_GLOBAL_OCCURRENCES = 3;

/** The only bounds fields this module reads from a text box. */
type VerticalBounds = { top: number; bottom: number };

/** Per-text bookkeeping gathered while scanning the pages in order. */
type ZoneTextStats = {
	/** Number of distinct pages the text appeared on (in a zone). */
	total: number;
	/** Index of the most recent page the text was seen on. */
	lastPage: number;
	/** Length of the run of consecutive pages ending at `lastPage`. */
	run: number;
	/** Longest run of consecutive pages seen so far. */
	maxRun: number;
};

/** True when the vertical midpoint of `bounds` lies in the top or bottom zone. */
function isInMarginZone(bounds: VerticalBounds): boolean {
	const midY = (bounds.top + bounds.bottom) / 2;
	return midY >= TOP_ZONE_MIN_Y || midY <= BOTTOM_ZONE_MAX_Y;
}

/** Trim and collapse whitespace runs so visually identical texts share one key. */
function normalizeZoneText(text: string): string {
	return text.trim().replace(/\s+/g, " ");
}

/**
 * Detect and remove running headers and footers from all pages.
 * Mutates the pages array in place, removing header/footer text boxes.
 * Documents with fewer than `MIN_PAGES` pages are left untouched.
 *
 * Uses two strategies:
 * 1. Global frequency: text appearing on at least
 *    `max(MIN_GLOBAL_OCCURRENCES, floor(pages * GLOBAL_PAGE_FRACTION))` pages
 * 2. Consecutive runs: text appearing on `MIN_CONSECUTIVE_PAGES`+ consecutive pages
 *
 * Only text boxes whose vertical midpoint lies in the top/bottom zone are
 * counted or removed; identical text in the page body is never touched.
 *
 * @param pages Extracted pages, in document order.
 */
export function stripHeadersFooters(pages: PageContent[]): void {
	if (pages.length < MIN_PAGES) return;

	// Single pass over the pages: track, per normalized zone text, how many
	// pages it appears on and its longest run of consecutive pages.
	const stats = new Map<string, ZoneTextStats>();
	for (const [pageIndex, page] of pages.entries()) {
		for (const tb of page.textBoxes) {
			if (!isInMarginZone(tb.bounds)) continue;
			const key = normalizeZoneText(tb.text);
			if (key.length === 0) continue;

			const seen = stats.get(key);
			if (seen === undefined) {
				stats.set(key, { total: 1, lastPage: pageIndex, run: 1, maxRun: 1 });
			} else if (seen.lastPage !== pageIndex) {
				// A text repeated within one page counts once for that page.
				seen.run = seen.lastPage === pageIndex - 1 ? seen.run + 1 : 1;
				seen.lastPage = pageIndex;
				seen.total++;
				if (seen.run > seen.maxRun) seen.maxRun = seen.run;
			}
		}
	}

	// Select the texts that qualify under either strategy.
	const globalThreshold = Math.max(MIN_GLOBAL_OCCURRENCES, Math.floor(pages.length * GLOBAL_PAGE_FRACTION));
	const repeatedTexts = new Set<string>();
	for (const [text, s] of stats) {
		if (s.total >= globalThreshold || s.maxRun >= MIN_CONSECUTIVE_PAGES) {
			repeatedTexts.add(text);
		}
	}
	if (repeatedTexts.size === 0) return;

	// Drop matching boxes, but only those located in a header/footer zone.
	for (const page of pages) {
		page.textBoxes = page.textBoxes.filter(
			tb => !isInMarginZone(tb.bounds) || !repeatedTexts.has(normalizeZoneText(tb.text)),
		);
	}
}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
// Adapted from markit-ai (MIT). See ../../NOTICE.
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* PDF to Markdown converter.
|
|
5
|
+
*
|
|
6
|
+
* Uses mupdf (native WASM) for fast PDF parsing and a custom pipeline for
|
|
7
|
+
* table detection via vector line extraction + raycasting.
|
|
8
|
+
*
|
|
9
|
+
* Pipeline:
|
|
10
|
+
* 1. Extract text boxes + vector segments + image regions per page (mupdf)
|
|
11
|
+
* 2. Detect column layout (single vs multi-column)
|
|
12
|
+
* 3. Per column: detect table grids from segments (grid detection + raycasting)
|
|
13
|
+
* 4. Render diagrams as PNG files (if output directory provided)
|
|
14
|
+
* 5. Render tables as markdown tables, free text as paragraphs/headings
|
|
15
|
+
*/
|
|
16
|
+
import * as path from "node:path";
|
|
17
|
+
import type { ConversionResult, Converter, StreamInfo } from "../../types";
|
|
18
|
+
import { detectColumns } from "./columns";
|
|
19
|
+
import { extractPages, renderImageRegion } from "./extract";
|
|
20
|
+
import { resolveTableGrids } from "./grid";
|
|
21
|
+
import { stripHeadersFooters } from "./headers";
|
|
22
|
+
import { renderPageContent } from "./render";
|
|
23
|
+
import type { Segment, TextBox } from "./types";
|
|
24
|
+
|
|
25
|
+
/** File extensions (with leading dot) that the PDF converter accepts. */
const EXTENSIONS: string[] = [".pdf"];

/** MIME type prefixes that the PDF converter accepts. */
const MIMETYPES: string[] = ["application/pdf", "application/x-pdf"];

/** Markdown snippet for one page image, paired with the image's `topY` coordinate. */
type ImageBlock = {
	topY: number;
	markdown: string;
};
|
|
29
|
+
|
|
30
|
+
/**
 * Turn one group of text boxes (a single column, or the whole page) into
 * markdown: detect table grids among the boxes and segments, set aside the
 * boxes the grids consumed, and render the remaining free text together
 * with the grids and image blocks.
 *
 * @param pageNumber Page number forwarded to table-grid detection.
 * @param textBoxes Text boxes belonging to this column/page.
 * @param segments Vector segments to consider for table detection.
 * @param imageBlocks Image blocks to hand to the renderer.
 * @returns The rendered markdown for this group of boxes.
 */
function processColumn(
	pageNumber: number,
	textBoxes: TextBox[],
	segments: Segment[],
	imageBlocks: ImageBlock[],
): string {
	const detection = resolveTableGrids(pageNumber, textBoxes, segments);

	// Boxes already claimed by a table grid must not be rendered again as free text.
	const claimedIds = new Set(detection.consumedIds);
	const unclaimedBoxes: TextBox[] = [];
	for (const candidate of textBoxes) {
		if (!claimedIds.has(candidate.id)) unclaimedBoxes.push(candidate);
	}

	return renderPageContent(unclaimedBoxes, detection.grids, imageBlocks, textBoxes);
}
|
|
45
|
+
|
|
46
|
+
/**
 * Left-most and right-most X coordinate covered by a set of text boxes.
 *
 * Computed with a plain loop rather than `Math.min(...values)` so that a page
 * with a very large number of text boxes cannot exceed the engine's argument
 * limit. For an empty input the result is `{ min: Infinity, max: -Infinity }`,
 * matching `Math.min()` / `Math.max()` called with no arguments.
 */
function horizontalExtent(boxes: TextBox[]): { min: number; max: number } {
	let min = Number.POSITIVE_INFINITY;
	let max = Number.NEGATIVE_INFINITY;
	for (const tb of boxes) {
		min = Math.min(min, tb.bounds.left);
		max = Math.max(max, tb.bounds.right);
	}
	return { min, max };
}

/**
 * Converter that turns PDF input into markdown: pages are extracted, running
 * headers/footers are stripped, and each page (or each detected column of a
 * page) is rendered through `processColumn`.
 */
export class PdfConverter implements Converter {
	name = "pdf";

	/**
	 * Whether this converter handles the given stream.
	 *
	 * Matches on file extension or MIME type prefix. Both comparisons are
	 * case-insensitive, so `.PDF` and `Application/PDF` are accepted as well.
	 */
	accepts(streamInfo: StreamInfo): boolean {
		const extension = streamInfo.extension?.toLowerCase();
		if (extension && EXTENSIONS.includes(extension)) {
			return true;
		}
		const mimetype = streamInfo.mimetype?.toLowerCase();
		if (mimetype && MIMETYPES.some(m => mimetype.startsWith(m))) {
			return true;
		}
		return false;
	}

	/**
	 * Convert a PDF to markdown.
	 *
	 * @param input Raw PDF bytes.
	 * @param streamInfo Stream metadata; when `imageDir` is set, page images are
	 *   rendered to PNG files in that directory and linked from the markdown,
	 *   otherwise an HTML comment placeholder is emitted per image.
	 * @returns The markdown of all non-empty pages, joined by blank lines.
	 */
	async convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult> {
		const pdfBytes = new Uint8Array(input);
		const pages = await extractPages(pdfBytes);
		// Remove running headers/footers before processing.
		stripHeadersFooters(pages);
		const imageDir = streamInfo.imageDir;

		const pageMarkdowns: string[] = [];
		for (const page of pages) {
			// Build image blocks for this page.
			const imageBlocks: ImageBlock[] = [];
			if (imageDir && page.images.length > 0) {
				for (const img of page.images) {
					const filename = `${img.id}.png`;
					const filepath = path.join(imageDir, filename);
					try {
						const png = await renderImageRegion(pdfBytes, img);
						await Bun.write(filepath, png);
						imageBlocks.push({ topY: img.topY, markdown: `` });
					} catch {
						// Best effort: an image that fails to render or write is
						// skipped so the rest of the document still converts.
					}
				}
			} else if (page.images.length > 0) {
				for (const img of page.images) {
					imageBlocks.push({
						topY: img.topY,
						markdown: `<!-- image: ${img.id} (page ${img.pageNumber}, ${img.bbox.w}x${img.bbox.h}pt) -->`,
					});
				}
			}

			// Detect column layout.
			// If the page has vertical segments (tables), suppress column detection
			// when one detected column is very narrow — that's a table's first column,
			// not a page layout column.
			const layout = detectColumns(page.textBoxes);
			const verticalTolerance = 0.8;
			const hasVerticalSegment = page.segments.some(s => Math.abs(s.x1 - s.x2) <= verticalTolerance);
			if (layout.columnCount > 1 && hasVerticalSegment) {
				const pageExtent = horizontalExtent(page.textBoxes);
				const pageWidth = pageExtent.max - pageExtent.min;
				const minColFraction = 0.3;
				const tooNarrow = layout.columns.some(col => {
					const colExtent = horizontalExtent(col);
					return (colExtent.max - colExtent.min) / pageWidth < minColFraction;
				});
				if (tooNarrow) {
					// Fall back to treating the page as a single column.
					layout.columnCount = 1;
					layout.columns = [page.textBoxes];
					layout.boundaries = [];
				}
			}

			if (layout.columnCount === 1) {
				// Single column — process normally.
				const md = processColumn(page.pageNumber, page.textBoxes, page.segments, imageBlocks);
				if (md.length > 0) pageMarkdowns.push(md);
			} else {
				// Multi-column — process each column independently, then join.
				const columnMarkdowns: string[] = [];
				for (const colBoxes of layout.columns) {
					// Filter segments to those within this column's X range.
					const colExtent = horizontalExtent(colBoxes);
					const margin = 10;
					const colSegments = page.segments.filter(seg => {
						const segXMin = Math.min(seg.x1, seg.x2);
						const segXMax = Math.max(seg.x1, seg.x2);
						return segXMax >= colExtent.min - margin && segXMin <= colExtent.max + margin;
					});
					// Images go with the first column that renders output
					// (no X info to split by).
					const md = processColumn(
						page.pageNumber,
						colBoxes,
						colSegments,
						columnMarkdowns.length === 0 ? imageBlocks : [],
					);
					if (md.length > 0) columnMarkdowns.push(md);
				}
				const joined = columnMarkdowns.join("\n\n");
				if (joined.length > 0) pageMarkdowns.push(joined);
			}
		}

		return { markdown: pageMarkdowns.join("\n\n") };
	}
}
|