@oh-my-pi/pi-coding-agent 16.0.7 → 16.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/dist/cli.js +4817 -12449
- package/dist/types/cli/args.d.ts +1 -0
- package/dist/types/cli/update-cli.d.ts +11 -0
- package/dist/types/commands/launch.d.ts +3 -0
- package/dist/types/debug/remote-debugger.d.ts +45 -0
- package/dist/types/goals/runtime.d.ts +4 -1
- package/dist/types/internal-urls/docs-index.d.ts +19 -0
- package/dist/types/markit/converters/docx.d.ts +6 -0
- package/dist/types/markit/converters/epub.d.ts +15 -0
- package/dist/types/markit/converters/pdf/columns.d.ts +35 -0
- package/dist/types/markit/converters/pdf/extract.d.ts +10 -0
- package/dist/types/markit/converters/pdf/grid.d.ts +25 -0
- package/dist/types/markit/converters/pdf/headers.d.ts +24 -0
- package/dist/types/markit/converters/pdf/index.d.ts +6 -0
- package/dist/types/markit/converters/pdf/render.d.ts +24 -0
- package/dist/types/markit/converters/pdf/types.d.ts +75 -0
- package/dist/types/markit/converters/pptx.d.ts +57 -0
- package/dist/types/markit/converters/xlsx.d.ts +25 -0
- package/dist/types/markit/index.d.ts +2 -0
- package/dist/types/markit/registry.d.ts +16 -0
- package/dist/types/markit/types.d.ts +30 -0
- package/dist/types/modes/print-mode.d.ts +2 -0
- package/dist/types/session/agent-session.d.ts +7 -8
- package/dist/types/session/auth-storage.d.ts +3 -2
- package/dist/types/session/yield-queue.d.ts +3 -1
- package/dist/types/tools/browser/attach.d.ts +1 -1
- package/dist/types/utils/markit.d.ts +0 -8
- package/dist/types/utils/mupdf-wasm-embed.d.ts +1 -0
- package/dist/types/utils/turndown.d.ts +15 -0
- package/dist/types/utils/zip.d.ts +119 -0
- package/package.json +20 -18
- package/scripts/build-binary.ts +7 -3
- package/scripts/bundle-dist.ts +28 -12
- package/scripts/embed-mupdf-wasm.ts +67 -0
- package/scripts/generate-docs-index.ts +48 -32
- package/scripts/omp +1 -1
- package/src/advisor/__tests__/advisor.test.ts +83 -0
- package/src/advisor/runtime.ts +16 -1
- package/src/cli/args.ts +3 -0
- package/src/cli/auth-broker-cli.ts +1 -3
- package/src/cli/auth-gateway-cli.ts +2 -5
- package/src/cli/flag-tables.ts +1 -0
- package/src/cli/update-cli.ts +63 -3
- package/src/commands/launch.ts +3 -0
- package/src/config/model-discovery.ts +20 -8
- package/src/config/models-config-schema.ts +8 -1
- package/src/debug/index.ts +44 -0
- package/src/debug/remote-debugger.ts +151 -0
- package/src/debug/report-bundle.ts +2 -1
- package/src/goals/runtime.ts +19 -7
- package/src/internal-urls/docs-index.generated.txt +2 -0
- package/src/internal-urls/docs-index.ts +102 -0
- package/src/internal-urls/omp-protocol.ts +10 -9
- package/src/main.ts +8 -0
- package/src/markit/NOTICE +32 -0
- package/src/markit/converters/docx.ts +56 -0
- package/src/markit/converters/epub.ts +136 -0
- package/src/markit/converters/mammoth.d.ts +24 -0
- package/src/markit/converters/pdf/columns.ts +103 -0
- package/src/markit/converters/pdf/extract.ts +574 -0
- package/src/markit/converters/pdf/grid.ts +780 -0
- package/src/markit/converters/pdf/headers.ts +106 -0
- package/src/markit/converters/pdf/index.ts +146 -0
- package/src/markit/converters/pdf/render.ts +501 -0
- package/src/markit/converters/pdf/types.ts +84 -0
- package/src/markit/converters/pptx.ts +325 -0
- package/src/markit/converters/xlsx.ts +173 -0
- package/src/markit/index.ts +2 -0
- package/src/markit/registry.ts +59 -0
- package/src/markit/types.ts +35 -0
- package/src/modes/components/snapcompact-shape-preview-doc.md +14 -7
- package/src/modes/components/snapcompact-shape-preview.ts +2 -2
- package/src/modes/controllers/input-controller.ts +29 -8
- package/src/modes/interactive-mode.ts +33 -12
- package/src/modes/print-mode.ts +5 -1
- package/src/prompts/advisor/advise-tool.md +3 -1
- package/src/prompts/advisor/system.md +55 -11
- package/src/sdk.ts +5 -9
- package/src/session/agent-session.ts +72 -42
- package/src/session/auth-storage.ts +2 -11
- package/src/session/yield-queue.ts +7 -1
- package/src/tools/browser/attach.ts +2 -2
- package/src/tools/fetch.ts +25 -60
- package/src/tools/read.ts +1 -1
- package/src/tools/search.ts +1 -6
- package/src/tools/write.ts +25 -65
- package/src/utils/markit.ts +25 -9
- package/src/utils/mupdf-wasm-embed.ts +12 -0
- package/src/utils/tools-manager.ts +2 -11
- package/src/utils/turndown.ts +83 -0
- package/src/{tools/archive-reader.ts → utils/zip.ts} +453 -83
- package/src/web/scrapers/types.ts +3 -46
- package/dist/types/internal-urls/docs-index.generated.d.ts +0 -2
- package/dist/types/tools/archive-reader.d.ts +0 -49
- package/src/internal-urls/docs-index.generated.ts +0 -120
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Harness documentation index for the `omp://` protocol.
|
|
3
|
+
*
|
|
4
|
+
* Compiled binaries and the prepacked npm bundle inline a compressed index from
|
|
5
|
+
* `docs-index.generated.txt` (populated by `scripts/generate-docs-index.ts
|
|
6
|
+
* --generate` at build time). The format is two lines:
|
|
7
|
+
* 1. a plain JSON array of the sorted doc file names, and
|
|
8
|
+
* 2. a base64 gzip blob of the index-aligned doc bodies (`string[]`).
|
|
9
|
+
* Listing/completion (`getDocFilenames`) parses only the small first line and
|
|
10
|
+
* never inflates the blob; the bodies are gunzipped off the event loop (via the
|
|
11
|
+
* async `node:zlib` threadpool) lazily, once, on the first actual read. When the
|
|
12
|
+
* placeholder is empty (dev tree, source checkout), the index is read from the
|
|
13
|
+
* repo `docs/` directory on disk instead.
|
|
14
|
+
*/
|
|
15
|
+
import { readFileSync } from "node:fs";
|
|
16
|
+
import * as path from "node:path";
|
|
17
|
+
import { promisify } from "node:util";
|
|
18
|
+
import { gunzip } from "node:zlib";
|
|
19
|
+
import { Glob } from "bun";
|
|
20
|
+
import docsEmbed from "./docs-index.generated.txt";
|
|
21
|
+
|
|
22
|
+
/** Promise-returning wrapper around the callback-style `gunzip` from `node:zlib`. */
const gunzipAsync = promisify(gunzip);

/** Read-only view over the harness documentation set served through `omp://`. */
export interface DocsIndex {
	/** Documentation file names, sorted, expressed relative to `docs/`. */
	readonly filenames: readonly string[];
	/**
	 * Look up a single document body by its `docs/`-relative path.
	 *
	 * For the embedded index the bodies are inflated lazily on the first call;
	 * resolves to `undefined` when no document has that path.
	 */
	getBody(relativePath: string): Promise<string | undefined>;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Decode a populated two-line embed (`<filenames JSON>\n<base64 gzip of bodies>`)
 * into a lazily-inflating index.
 *
 * Reading `filenames` only parses the first line and never touches the blob.
 * The bodies are gunzipped into a path→content table on the first `getBody`
 * call; the resulting promise is cached, so concurrent and later reads share
 * that single inflation (a failed inflation is cached as well and rejects
 * every subsequent `getBody`).
 *
 * @param embed Raw embed text.
 * @returns The index, or `null` when `embed` contains no newline separator
 *   (the empty placeholder, or a malformed payload — the caller decides which).
 * @throws SyntaxError synchronously when the first line is not valid JSON.
 */
export function decodeDocsIndex(embed: string): DocsIndex | null {
	const newline = embed.indexOf("\n");
	if (newline === -1) return null;
	const filenames = JSON.parse(embed.slice(0, newline)) as string[];
	// A Map (not a plain object) so that lookups for names such as
	// "constructor" or "toString" miss instead of returning inherited
	// Object.prototype members.
	let bodies: Promise<Map<string, string>> | undefined;
	return {
		filenames,
		getBody(relativePath: string): Promise<string | undefined> {
			bodies ??= (async () => {
				const inflated = await gunzipAsync(Buffer.from(embed.slice(newline + 1), "base64"));
				const decoded = JSON.parse(inflated.toString("utf8")) as string[];
				const table = new Map<string, string>();
				// Bodies are index-aligned with `filenames`; a name without a
				// matching body simply stays absent (resolves to `undefined`).
				filenames.forEach((filename, i) => {
					const body = decoded[i];
					if (typeof body === "string") table.set(filename, body);
				});
				return table;
			})();
			return bodies.then(table => table.get(relativePath));
		},
	};
}
|
|
58
|
+
|
|
59
|
+
/**
 * Dev tree / source checkout: build the index from the repo `docs/` directory.
 *
 * Synchronously scans `docs/` (resolved relative to this module) for `**\/*.md`
 * files and reads every match into memory. Names are normalized to use `/`
 * separators and returned sorted.
 *
 * @returns An index whose `getBody` resolves immediately from the in-memory table.
 */
function readDocsFromDisk(): DocsIndex {
	const docsDir = path.resolve(import.meta.dir, "../../../../docs");
	// Map rather than a plain object: a lookup for "constructor" / "toString"
	// must resolve to `undefined`, not to an inherited Object.prototype member.
	const bodies = new Map<string, string>();
	for (const relativePath of new Glob("**/*.md").scanSync(docsDir)) {
		const normalized = relativePath.split(path.sep).join("/");
		bodies.set(normalized, readFileSync(path.join(docsDir, relativePath), "utf8"));
	}
	const filenames = [...bodies.keys()].sort();
	return { filenames, getBody: relativePath => Promise.resolve(bodies.get(relativePath)) };
}
|
|
72
|
+
|
|
73
|
+
/** Module-level cache for the docs index; populated by the first `getIndex` call. */
let index: DocsIndex | undefined;

/**
 * Return the cached docs index, building it on first use.
 *
 * An empty embed means a dev tree / source checkout, so the docs are read from
 * disk. A non-empty embed comes from a compiled binary or the npm bundle and
 * is decoded; if it has no newline separator the build is broken and an error
 * is thrown rather than treating it as a placeholder.
 *
 * @throws Error when the embed is non-empty but cannot be split into its two lines.
 */
function getIndex(): DocsIndex {
	if (index === undefined) {
		if (docsEmbed.length === 0) {
			index = readDocsFromDisk();
		} else {
			const fromEmbed = decodeDocsIndex(docsEmbed);
			if (fromEmbed === null) {
				throw new Error(
					"Malformed embedded docs index (docs-index.generated.txt): non-empty payload without a newline separator. " +
						"Rebuild with `bun --cwd=packages/coding-agent scripts/generate-docs-index.ts --generate`.",
				);
			}
			index = fromEmbed;
		}
	}
	return index;
}
|
|
93
|
+
|
|
94
|
+
/**
 * List the available documentation files.
 *
 * @returns Sorted file names, relative to `docs/`.
 */
export function getDocFilenames(): readonly string[] {
	const { filenames } = getIndex();
	return filenames;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Fetch the content of one documentation file.
 *
 * @param relativePath Document path relative to `docs/`.
 * @returns The document body, or `undefined` when no such document exists.
 */
export function getEmbeddedDoc(relativePath: string): Promise<string | undefined> {
	const docs = getIndex();
	return docs.getBody(relativePath);
}
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* - omp://<file>.md - Reads a specific documentation file
|
|
9
9
|
*/
|
|
10
10
|
import * as path from "node:path";
|
|
11
|
-
import {
|
|
11
|
+
import { getDocFilenames, getEmbeddedDoc } from "./docs-index";
|
|
12
12
|
import type { InternalResource, InternalUrl, ProtocolHandler, UrlCompletion } from "./types";
|
|
13
13
|
|
|
14
14
|
/**
|
|
@@ -34,16 +34,17 @@ export class OmpProtocolHandler implements ProtocolHandler {
|
|
|
34
34
|
}
|
|
35
35
|
|
|
36
36
|
async complete(): Promise<UrlCompletion[]> {
|
|
37
|
-
return
|
|
37
|
+
return getDocFilenames().map(value => ({ value }));
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
async #listDocs(url: InternalUrl): Promise<InternalResource> {
|
|
41
|
-
|
|
41
|
+
const filenames = getDocFilenames();
|
|
42
|
+
if (filenames.length === 0) {
|
|
42
43
|
throw new Error("No documentation files found");
|
|
43
44
|
}
|
|
44
45
|
|
|
45
|
-
const listing =
|
|
46
|
-
const content = `# Documentation\n\n${
|
|
46
|
+
const listing = filenames.map(f => `- [${f}](omp://${f})`).join("\n");
|
|
47
|
+
const content = `# Documentation\n\n${filenames.length} files available:\n\n${listing}\n`;
|
|
47
48
|
|
|
48
49
|
return {
|
|
49
50
|
url: url.href,
|
|
@@ -70,12 +71,12 @@ export class OmpProtocolHandler implements ProtocolHandler {
|
|
|
70
71
|
return this.#listDocs(url);
|
|
71
72
|
}
|
|
72
73
|
|
|
73
|
-
const content =
|
|
74
|
+
const content = await getEmbeddedDoc(docPath);
|
|
74
75
|
if (content === undefined) {
|
|
75
76
|
const lookup = docPath.replace(/\.md$/, "");
|
|
76
|
-
const suggestions =
|
|
77
|
-
f => f.includes(lookup) || lookup.includes(f.replace(/\.md$/, ""))
|
|
78
|
-
|
|
77
|
+
const suggestions = getDocFilenames()
|
|
78
|
+
.filter(f => f.includes(lookup) || lookup.includes(f.replace(/\.md$/, "")))
|
|
79
|
+
.slice(0, 5);
|
|
79
80
|
const suffix =
|
|
80
81
|
suggestions.length > 0
|
|
81
82
|
? `\nDid you mean: ${suggestions.join(", ")}`
|
package/src/main.ts
CHANGED
|
@@ -1039,6 +1039,13 @@ export async function runRootCommand(
|
|
|
1039
1039
|
});
|
|
1040
1040
|
}
|
|
1041
1041
|
|
|
1042
|
+
// --print-thoughts (single-shot print mode) must surface reasoning, so un-hide
|
|
1043
|
+
// thinking before the session is built — otherwise a passive hideThinkingBlock
|
|
1044
|
+
// setting makes the provider omit summaries and the flag prints nothing. An
|
|
1045
|
+
// explicit --hide-thinking below still wins.
|
|
1046
|
+
if (parsedArgs.printThoughts && !isProtocolMode && !isInteractive) {
|
|
1047
|
+
settingsInstance.override("hideThinkingBlock", false);
|
|
1048
|
+
}
|
|
1042
1049
|
// Apply --hide-thinking CLI flag (ephemeral, not persisted)
|
|
1043
1050
|
if (parsedArgs.hideThinking) {
|
|
1044
1051
|
settingsInstance.override("hideThinkingBlock", true);
|
|
@@ -1373,6 +1380,7 @@ export async function runRootCommand(
|
|
|
1373
1380
|
messages: initialArgs.messages,
|
|
1374
1381
|
initialMessage,
|
|
1375
1382
|
initialImages,
|
|
1383
|
+
printThoughts: initialArgs.printThoughts,
|
|
1376
1384
|
});
|
|
1377
1385
|
if ($env.PI_TIMING) {
|
|
1378
1386
|
logger.printTimings();
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
This directory contains an in-house document-to-markdown engine adapted from
|
|
2
|
+
markit-ai (https://github.com/Michaelliv/markit), used under the MIT License.
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2026 Michael Liv
|
|
5
|
+
|
|
6
|
+
Only the converters for the document formats omp supports are ported (pdf,
|
|
7
|
+
docx, pptx, xlsx, epub); the CLI, plugin/provider, and unused converters
|
|
8
|
+
(html, image, audio, plain-text, rss, github, wikipedia, csv, json, yaml,
|
|
9
|
+
ipynb, iwork, zip, xml) were dropped. Legacy binary `.doc`/`.ppt`/`.xls` and
|
|
10
|
+
`.rtf` are routed by the read/fetch tools but have no converter — they surface
|
|
11
|
+
a conversion error, exactly as upstream markit did. Logic is ported faithfully
|
|
12
|
+
so conversion output matches the upstream package.
|
|
13
|
+
|
|
14
|
+
MIT License
|
|
15
|
+
|
|
16
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
17
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
18
|
+
in the Software without restriction, including without limitation the rights
|
|
19
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
20
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
21
|
+
furnished to do so, subject to the following conditions:
|
|
22
|
+
|
|
23
|
+
The above copyright notice and this permission notice shall be included in all
|
|
24
|
+
copies or substantial portions of the Software.
|
|
25
|
+
|
|
26
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
27
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
28
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
29
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
30
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
31
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
32
|
+
SOFTWARE.
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
// Adapted from markit-ai (MIT). See ../NOTICE.
|
|
2
|
+
import * as path from "node:path";
|
|
3
|
+
import mammoth from "mammoth";
|
|
4
|
+
import { createTurndown, normalizeTablesHtml } from "../../utils/turndown";
|
|
5
|
+
import type { ConversionResult, Converter, StreamInfo } from "../types";
|
|
6
|
+
|
|
7
|
+
/** File extensions accepted by the DOCX converter. */
const EXTENSIONS = [".docx"];
/** MIME type prefixes accepted by the DOCX converter. */
const MIMETYPES = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"];

/** Converts Word (`.docx`) documents to markdown via mammoth (DOCX → HTML) and turndown (HTML → markdown). */
export class DocxConverter implements Converter {
	name = "docx";

	/**
	 * Whether this converter handles the given stream.
	 *
	 * @returns `true` when the extension is `.docx` or the MIME type starts with
	 *   the OOXML word-processing type; `false` otherwise.
	 */
	accepts(streamInfo: StreamInfo): boolean {
		if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension)) {
			return true;
		}
		if (streamInfo.mimetype && MIMETYPES.some(m => streamInfo.mimetype?.startsWith(m))) {
			return true;
		}
		return false;
	}

	/**
	 * Convert a DOCX buffer to markdown.
	 *
	 * When `streamInfo.imageDir` is set, each embedded image is written there as
	 * `image_<n>.<ext>` and referenced by path. Otherwise images are emitted as
	 * `<!-- image: image_<n> -->` comment placeholders.
	 *
	 * @param input Raw DOCX bytes.
	 * @param streamInfo Stream metadata; only `imageDir` is read here.
	 * @returns The trimmed markdown.
	 */
	async convert(input: Buffer, streamInfo: StreamInfo): Promise<ConversionResult> {
		const imageDir = streamInfo.imageDir;
		let imageCount = 0;
		const convertImage = imageDir
			? mammoth.images.imgElement(async image => {
					// Capture this image's number before awaiting so the file name and
					// alt text always agree, however the handlers are scheduled.
					const index = ++imageCount;
					const ext = (image.contentType?.split("/")[1] || "png").replace("jpeg", "jpg");
					const filepath = path.join(imageDir, `image_${index}.${ext}`);
					const base64 = await image.read("base64");
					await Bun.write(filepath, Buffer.from(base64, "base64"));
					return { src: filepath, alt: `image_${index}` };
				})
			: mammoth.images.imgElement(async image => {
					const index = ++imageCount;
					const contentType = image.contentType || "image/png";
					// The payload is intentionally empty: the data URI is only a marker
					// that is rewritten to a comment placeholder below, so the image
					// bytes are never read or encoded.
					return { src: `data:${contentType};base64,`, alt: `image_${index}` };
				});
		const { value: html } = await mammoth.convertToHtml({ buffer: input }, { convertImage });
		const turndown = createTurndown();
		let markdown = turndown.turndown(normalizeTablesHtml(html));
		// Replace data URI images with comment placeholders when no imageDir
		if (!imageDir) {
			markdown = markdown.replace(/!\[([^\]]*)\]\(data:[^)]*\)/g, "<!-- image: $1 -->");
		}
		return { markdown: markdown.trim() };
	}
}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
// Adapted from markit-ai (MIT). See ../NOTICE.
|
|
2
|
+
import { XMLParser } from "fast-xml-parser";
|
|
3
|
+
import { createTurndown, normalizeTablesHtml } from "../../utils/turndown";
|
|
4
|
+
import { unzip, unzipText } from "../../utils/zip";
|
|
5
|
+
import type { ConversionResult, Converter, StreamInfo } from "../types";
|
|
6
|
+
|
|
7
|
+
/** File extensions accepted by the EPUB converter. */
const EXTENSIONS = [".epub"];
/** MIME type prefixes accepted by the EPUB converter. */
const MIMETYPES = ["application/epub", "application/epub+zip", "application/x-epub+zip"];

/** A parsed metadata value: either a bare string or a node (see `MetaNode`). */
type MetaValue = string | MetaNode;

/** A parsed metadata node carrying a `#text` value and/or numerically indexed children. */
interface MetaNode {
	"#text"?: string;
	[index: number]: MetaValue;
}

/** The Dublin Core (`dc:*`) fields read from the OPF `<metadata>` element. */
interface Metadata {
	"dc:title"?: MetaValue;
	"dc:creator"?: MetaValue;
	"dc:language"?: MetaValue;
	"dc:publisher"?: MetaValue;
	"dc:date"?: MetaValue;
	"dc:description"?: MetaValue;
}

/** One `<item>` of the OPF manifest; the `@_` keys are its XML attributes. */
interface ManifestItem {
	"@_id": string;
	"@_href": string;
}

/** One `<itemref>` of the OPF spine, naming a manifest item by id. */
interface SpineItem {
	"@_idref": string;
}

/** Parsed shape of the OPF package document, limited to what the converter reads. */
interface OpfDoc {
	package?: {
		metadata?: Metadata;
		// A single child parses to an object, repeated children to an array.
		manifest?: { item?: ManifestItem | ManifestItem[] };
		spine?: { itemref?: SpineItem | SpineItem[] };
	};
}

/** One `<rootfile>` entry of `META-INF/container.xml`. */
interface Rootfile {
	"@_full-path": string;
}

/** Parsed shape of `META-INF/container.xml`. */
interface ContainerDoc {
	container?: { rootfiles?: { rootfile?: Rootfile | Rootfile[] } };
}
|
|
44
|
+
|
|
45
|
+
/**
 * Converts EPUB archives to markdown: a bold metadata header followed by every
 * spine document, in spine order, rendered through turndown.
 */
export class EpubConverter implements Converter {
	name = "epub";

	/** Whether the stream's extension or MIME type identifies it as an EPUB. */
	accepts(streamInfo: StreamInfo): boolean {
		if (streamInfo.extension && EXTENSIONS.includes(streamInfo.extension)) return true;
		if (streamInfo.mimetype && MIMETYPES.some(m => streamInfo.mimetype?.startsWith(m))) return true;
		return false;
	}

	/**
	 * Convert an EPUB buffer to markdown.
	 *
	 * @param input Raw EPUB (zip) bytes.
	 * @param _streamInfo Unused.
	 * @returns The markdown plus the book title when the metadata has one.
	 * @throws Error when `META-INF/container.xml`, its rootfile path, or the
	 *   referenced OPF document is missing.
	 */
	async convert(input: Buffer, _streamInfo: StreamInfo): Promise<ConversionResult> {
		const entries = unzip(input);
		const parser = new XMLParser({
			ignoreAttributes: false,
			attributeNamePrefix: "@_",
			textNodeName: "#text",
			processEntities: { maxTotalExpansions: 1_000_000 },
		});
		// Find content.opf path from container.xml
		const containerXml = unzipText(entries, "META-INF/container.xml");
		if (!containerXml) throw new Error("Invalid EPUB: missing container.xml");
		const container = parser.parse(containerXml) as ContainerDoc;
		const rootfile = container.container?.rootfiles?.rootfile;
		const opfPath = Array.isArray(rootfile) ? rootfile[0]?.["@_full-path"] : rootfile?.["@_full-path"];
		if (!opfPath) throw new Error("Invalid EPUB: missing rootfile path");
		// Parse content.opf
		const opfXml = unzipText(entries, opfPath);
		if (!opfXml) throw new Error("Invalid EPUB: missing content.opf");
		const opf = parser.parse(opfXml) as OpfDoc;
		// Extract metadata
		const meta: Metadata = opf.package?.metadata ?? {};
		const metadata: Record<string, string | undefined> = {
			title: this.getText(meta["dc:title"]),
			authors: this.getTextArray(meta["dc:creator"]).join(", ") || undefined,
			language: this.getText(meta["dc:language"]),
			publisher: this.getText(meta["dc:publisher"]),
			date: this.getText(meta["dc:date"]),
			description: this.getText(meta["dc:description"]),
		};
		// Build manifest map (id → href)
		const manifestItems = opf.package?.manifest?.item;
		const itemList = Array.isArray(manifestItems) ? manifestItems : manifestItems ? [manifestItems] : [];
		const manifest = new Map<string, string>();
		for (const item of itemList) {
			manifest.set(item["@_id"], item["@_href"]);
		}
		// Get spine order
		const spineItems = opf.package?.spine?.itemref;
		const spineList = Array.isArray(spineItems) ? spineItems : spineItems ? [spineItems] : [];
		const spineOrder = spineList.map(s => s["@_idref"]);
		// Manifest hrefs are relative to the directory that holds the OPF file.
		const basePath = opfPath.includes("/") ? opfPath.substring(0, opfPath.lastIndexOf("/")) : "";
		const turndown = createTurndown();
		const sections: string[] = [];
		// Add metadata header
		const metaLines: string[] = [];
		for (const key in metadata) {
			const value = metadata[key];
			if (value) metaLines.push(`**${key.charAt(0).toUpperCase() + key.slice(1)}:** ${value}`);
		}
		if (metaLines.length > 0) sections.push(metaLines.join("\n"));
		// Convert spine files; entries missing from the manifest or archive are skipped.
		for (const idref of spineOrder) {
			const href = manifest.get(idref);
			if (!href) continue;
			const filePath = basePath ? `${basePath}/${href}` : href;
			const html = unzipText(entries, filePath);
			if (!html) continue;
			// Strip script/style, convert to markdown
			const cleaned = html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "");
			const md = turndown.turndown(normalizeTablesHtml(cleaned)).trim();
			if (md) sections.push(md);
		}
		return {
			markdown: sections.join("\n\n").trim(),
			title: metadata.title,
		};
	}

	/**
	 * Extract the text of a metadata value.
	 *
	 * Handles a bare string, a node carrying `#text`, and an array (first
	 * element wins). The XML parser may coerce purely numeric or boolean tag
	 * text (e.g. `<dc:date>2020</dc:date>`) to a primitive; such values are
	 * stringified rather than dropped.
	 *
	 * @returns The text, or `undefined` when the value is absent or empty.
	 */
	getText(node: MetaValue | undefined): string | undefined {
		const value: unknown = node;
		if (value === undefined || value === null || value === "") return undefined;
		if (Array.isArray(value)) return this.getText(value[0]);
		if (typeof value === "object") {
			const text: unknown = (value as { "#text"?: unknown })["#text"];
			if (text === undefined || text === null || text === "") return undefined;
			return String(text);
		}
		// Non-string primitive produced by the parser's value coercion.
		return String(value);
	}

	/**
	 * Extract the text of every entry of a possibly-repeated metadata value.
	 *
	 * @returns The non-empty texts in document order; `[]` when the value is absent.
	 */
	getTextArray(node: MetaValue | undefined): (string | undefined)[] {
		if (node === undefined || node === null) return [];
		const list = Array.isArray(node) ? node : [node];
		return list.map(n => this.getText(n)).filter((text): text is string => text !== undefined);
	}
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
// Minimal ambient types for `mammoth` (ships no types). Declares only what
|
|
2
|
+
// DocxConverter uses. See ../NOTICE.
|
|
3
|
+
declare module "mammoth" {
	/** An image embedded in the document, as handed to a `ConvertImageHandler`. */
	interface MammothImage {
		/** MIME type of the image, when known. */
		contentType?: string;
		/** Read the image bytes, encoded as base64. */
		read(encoding: "base64"): Promise<string>;
	}

	/** Attributes for the `<img>` element emitted for an image. */
	interface ImgAttributes {
		src: string;
		alt?: string;
	}

	/** Maps an embedded image to the attributes of its `<img>` element. */
	type ConvertImageHandler = (image: MammothImage) => Promise<ImgAttributes>;

	/** Options accepted by `convertToHtml` (only the subset used here). */
	interface ConvertOptions {
		convertImage?: ConvertImageHandler;
	}

	/** Result of `convertToHtml`. */
	interface ConvertResult {
		/** The generated HTML. */
		value: string;
		/** Messages reported during conversion. */
		messages: unknown[];
	}

	export const images: { imgElement(fn: ConvertImageHandler): ConvertImageHandler };
	export function convertToHtml(input: { buffer: Buffer }, options?: ConvertOptions): Promise<ConvertResult>;
	const _default: { convertToHtml: typeof convertToHtml; images: typeof images };
	export default _default;
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
// Adapted from markit-ai (MIT). See ../../NOTICE.
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Multi-column layout detection and text box reordering.
|
|
5
|
+
*
|
|
6
|
+
* Many PDFs (legal documents, datasheets, academic papers) use two-column
|
|
7
|
+
* layouts. Without column detection, text boxes are ordered by Y position
|
|
8
|
+
* only, interleaving left and right column content.
|
|
9
|
+
*
|
|
10
|
+
* Algorithm:
|
|
11
|
+
* 1. Collect left edges of all text boxes on the page
|
|
12
|
+
* 2. Find the largest horizontal gap between consecutive left edges
|
|
13
|
+
* 3. If gap > MIN_GAP_RATIO of the text width and both sides have
|
|
14
|
+
* enough boxes → multi-column detected
|
|
15
|
+
* 4. Assign each text box to a column based on its center X
|
|
16
|
+
* 5. Return columns in reading order (left-to-right, top-to-bottom)
|
|
17
|
+
*
|
|
18
|
+
* This only detects the column structure. The caller is responsible for
|
|
19
|
+
* processing each column's text boxes independently (table detection,
|
|
20
|
+
* rendering, etc.).
|
|
21
|
+
*/
|
|
22
|
+
import type { TextBox } from "./types";
|
|
23
|
+
|
|
24
|
+
/** Result of column detection for one page of text boxes. */
export interface ColumnLayout {
	/** How many columns were found: 1 for a single-column page, 2+ for multi-column. */
	columnCount: number;
	/** The page's text boxes, one group per column, ordered left to right. */
	columns: TextBox[][];
	/** X coordinates of the dividers between adjacent columns (empty for a single column). */
	boundaries: number[];
}
|
|
32
|
+
|
|
33
|
+
/**
 * Minimum gap as a fraction of the total text width to consider a column
 * boundary. A two-column layout typically has ~50% gap; we use a lower
 * threshold to catch asymmetric columns.
 */
const MIN_GAP_RATIO = 0.15;
/** Minimum number of text boxes on each side of the gap. */
const MIN_BOXES_PER_COLUMN = 4;
/** Minimum gap in absolute points to avoid splitting on small whitespace. */
const MIN_GAP_PTS = 40;

/**
 * Detect column layout and return text boxes grouped by column.
 *
 * For single-column pages, returns all boxes in one group (the input array
 * itself). For two-column pages, returns the boxes split into a left and a
 * right group; within each group the boxes keep their input order.
 *
 * @param textBoxes Text boxes of one page.
 * @returns The detected layout; `columnCount` is 1 or 2.
 */
export function detectColumns(textBoxes: TextBox[]): ColumnLayout {
	const singleColumn = (): ColumnLayout => ({ columnCount: 1, columns: [textBoxes], boundaries: [] });

	if (textBoxes.length < MIN_BOXES_PER_COLUMN * 2) return singleColumn();

	// One pass collects the unique left edges (rounded to avoid float noise) and
	// the right-most edge. Accumulating with Math.max instead of spreading the
	// whole array into a single call keeps this safe for pages with a very large
	// number of boxes, where a spread would exceed the engine's argument limit.
	const uniqueLefts = new Set<number>();
	let textXMax = -Infinity;
	for (const tb of textBoxes) {
		uniqueLefts.add(Math.round(tb.bounds.left));
		textXMax = Math.max(textXMax, Math.round(tb.bounds.right));
	}
	const lefts = [...uniqueLefts].sort((a, b) => a - b);
	if (lefts.length < 2) return singleColumn();

	const textXMin = lefts[0];
	const textWidth = textXMax - textXMin;
	if (textWidth <= 0) return singleColumn();

	// Find the largest gap between consecutive left-edge positions
	let maxGap = 0;
	let gapLeft = 0;
	let gapRight = 0;
	for (let i = 1; i < lefts.length; i++) {
		const gap = lefts[i] - lefts[i - 1];
		if (gap > maxGap) {
			maxGap = gap;
			gapLeft = lefts[i - 1];
			gapRight = lefts[i];
		}
	}
	const gapRatio = maxGap / textWidth;
	if (gapRatio < MIN_GAP_RATIO || maxGap < MIN_GAP_PTS) return singleColumn();

	// Split point is the midpoint of the gap
	const splitX = (gapLeft + gapRight) / 2;
	// Assign boxes to columns based on center X
	const leftCol: TextBox[] = [];
	const rightCol: TextBox[] = [];
	for (const tb of textBoxes) {
		const cx = (tb.bounds.left + tb.bounds.right) / 2;
		(cx < splitX ? leftCol : rightCol).push(tb);
	}
	// Validate both columns have enough content
	if (leftCol.length < MIN_BOXES_PER_COLUMN || rightCol.length < MIN_BOXES_PER_COLUMN) {
		return singleColumn();
	}
	return {
		columnCount: 2,
		columns: [leftCol, rightCol],
		boundaries: [splitX],
	};
}
|