@dragon708/docmind-markdown 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +162 -0
- package/dist/index.js +647 -0
- package/package.json +46 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import { StructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
2
|
+
export { DocumentBlock, DocumentPage, DocumentTable, StructuredDocumentResult } from '@dragon708/docmind-shared';
|
|
3
|
+
|
|
4
|
+
/**
 * Tuning knobs accepted by {@link convertStructuredToMarkdown}.
 * Every field is optional; the defaults are noted per field.
 */
interface ConvertStructuredToMarkdownOptions {
    /**
     * Text emitted for an `image-ref` block whose image has no usable `src`.
     * Defaults to the HTML comment `<!-- image: no src -->`.
     */
    readonly imagePlaceholder?: string;
    /**
     * When truthy, a short bold-label metadata block is placed before the body,
     * provided `result.metadata` yields at least one line.
     */
    readonly includeMetadataHeader?: boolean;
    /**
     * Unless set to `false`, {@link pageSeparator} is inserted between blocks whenever
     * a block's `pageIndex` is greater than the previous one. When
     * {@link StructuredDocumentResult.pages} is non-empty, an italic 1-based page hint follows.
     */
    readonly pageTransitionMarkers?: boolean;
    /**
     * Separator used for `page-break` blocks and for page transitions.
     * Defaults to `---`; trailing whitespace is trimmed.
     */
    readonly pageSeparator?: string;
    /**
     * Unless set to `false`, tables in `tables` that no `table` block references
     * are appended at the end of the output.
     */
    readonly appendUnreferencedTables?: boolean;
    /**
     * When exactly `true`, images in `images` that no `image-ref` block references
     * are appended as `![alt](src)` lines or placeholders (default: `false`).
     */
    readonly appendUnreferencedImages?: boolean;
}
|
|
28
|
+
/**
 * Legacy name for the Markdown converter options.
 *
 * @deprecated Use {@link ConvertStructuredToMarkdownOptions} instead.
 */
type StructuredToMarkdownOptions = ConvertStructuredToMarkdownOptions;
|
|
30
|
+
/**
 * Renders a {@link StructuredDocumentResult} as readable, semantic Markdown with GFM-style tables.
 *
 * `blocks` are rendered in order; `table` and `image-ref` blocks are resolved against `tables`
 * and `images`. If the blocks are empty or produce no output, the rollup `text` is returned
 * instead. `pages`, `metadata` and unreferenced `tables` / `images` contribute optional
 * sections, depending on the options.
 *
 * @param result - Structured document to render.
 * @param options - Optional rendering settings.
 * @returns The Markdown string.
 */
declare function convertStructuredToMarkdown(result: StructuredDocumentResult, options?: ConvertStructuredToMarkdownOptions): string;
|
|
38
|
+
/**
 * Legacy entry point that forwards to {@link convertStructuredToMarkdown}.
 *
 * @deprecated Use {@link convertStructuredToMarkdown} — same behavior.
 */
declare function structuredDocumentToMarkdown(structured: StructuredDocumentResult, options?: ConvertStructuredToMarkdownOptions): string;
|
|
42
|
+
|
|
43
|
+
/**
 * Tuning knobs accepted by {@link convertStructuredToLlmText}.
 * Every field is optional; the defaults are noted per field.
 */
interface ConvertStructuredToLlmTextOptions {
    /**
     * Unless set to `false`, {@link StructuredDocumentResult.text} is used when rendering
     * the blocks produces no output.
     */
    readonly fallbackToRollupText?: boolean;
    /** Tag that opens a table block (default `[TABLE]`). */
    readonly tableSectionTag?: string;
    /** Tag that starts an image line (default `[IMAGE]`). */
    readonly imagePlaceholderTag?: string;
    /**
     * Line emitted for `page-break` blocks and for automatic page transitions (default `[PAGE]`).
     */
    readonly pageBreakMarker?: string;
    /**
     * Unless set to `false`, {@link pageBreakMarker} is inserted whenever a block's `pageIndex`
     * is greater than the previous one; when `pages` is non-empty a 1-based `[PAGE n]` line
     * follows it (same idea as the Markdown export, without Markdown syntax).
     */
    readonly pageTransitionMarkers?: boolean;
    /** Unless set to `false`, a `[WARNINGS]` section is appended when `result.warnings` is non-empty. */
    readonly includeWarnings?: boolean;
    /**
     * When exactly `true`, a short `[DOC]` block (title, author, pageCount, etc.) is placed
     * first, provided `metadata` yields at least one line (default `false`).
     */
    readonly includeDocumentMetadata?: boolean;
    /** Upper bound on the number of `metadata.extra` entries included (default `5`). */
    readonly metadataExtraMaxKeys?: number;
    /** Unless set to `false`, tables that no `table` block references are appended. */
    readonly appendUnreferencedTables?: boolean;
    /** When exactly `true`, blocks are joined by one newline instead of two (default `false`). */
    readonly compact?: boolean;
    /** Unless set to `false`, paragraph blocks whose trimmed text is empty are omitted. */
    readonly skipEmptyParagraphs?: boolean;
}
|
|
77
|
+
/**
 * Legacy name for the LLM-text converter options.
 *
 * @deprecated Use {@link ConvertStructuredToLlmTextOptions}.
 */
type StructuredToLlmTextOptions = ConvertStructuredToLlmTextOptions;
|
|
79
|
+
/**
 * Flattens a {@link StructuredDocumentResult} into plain text for prompts, RAG and embeddings.
 *
 * Headings are tagged `[Hn]`, tables are written as readable rows, list items as compact lines,
 * and page markers are configurable. A `[DOC]` metadata block and a `[WARNINGS]` section are
 * optional. The output is not Markdown — it is tuned for density and clarity.
 *
 * @param result - Structured document to linearize.
 * @param options - Optional rendering settings.
 * @returns The plain-text rendering.
 */
declare function convertStructuredToLlmText(result: StructuredDocumentResult, options?: ConvertStructuredToLlmTextOptions): string;
|
|
85
|
+
/**
 * Legacy entry point that forwards to {@link convertStructuredToLlmText}.
 *
 * @deprecated Use {@link convertStructuredToLlmText} — same behavior.
 */
declare function structuredDocumentToLlmText(structured: StructuredDocumentResult, options?: ConvertStructuredToLlmTextOptions): string;
|
|
89
|
+
|
|
90
|
+
/** A single slice of a document, intended for RAG, embeddings or chat. */
interface StructuredChunk {
    /** Position of this chunk in the returned array (zero-based). */
    readonly index: number;
    /** Plain-text content, in the same LLM-oriented style as {@link convertStructuredToLlmText}. */
    readonly text: string;
    /** Markdown for the same slice; present when {@link SplitStructuredIntoChunksOptions.includeMarkdown} is true. */
    readonly markdown?: string;
    /** Best-effort breadcrumb of the heading texts in scope for this chunk. */
    readonly headingPath?: readonly string[];
    /** Lowest `pageIndex` found among the chunk's blocks, if any block has one. */
    readonly pageIndex?: number;
    /** Highest `pageIndex` found among the chunk's blocks, if any block has one. */
    readonly pageEndIndex?: number;
}
|
|
105
|
+
/**
 * Tuning knobs accepted by {@link splitStructuredIntoChunks}.
 * Every field is optional; the defaults are noted per field.
 */
interface SplitStructuredIntoChunksOptions {
    /**
     * Soft ceiling on the characters of `text` in one chunk (default `4000`).
     * A table may exceed it when {@link preserveTables} is true.
     */
    readonly maxChars?: number;
    /** Length of the tail of the previous chunk's `text` that is prepended to the next chunk (default `0`). */
    readonly overlapChars?: number;
    /**
     * Unless set to `false`, a new chunk is started before each `heading` block
     * when the current chunk already has content.
     */
    readonly preferHeadings?: boolean;
    /**
     * Unless set to `false`, a `table` block is never split across chunks;
     * a large table may therefore form one oversized chunk.
     */
    readonly preserveTables?: boolean;
    /**
     * Unless set to `false`, {@link StructuredChunk.markdown} is filled by running
     * {@link convertStructuredToMarkdown} on each slice.
     */
    readonly includeMarkdown?: boolean;
}
|
|
122
|
+
/**
 * Cuts a {@link StructuredDocumentResult} into an ordered list of chunks for RAG / chat.
 *
 * First version: each block is one unit, units are packed greedily under a soft `maxChars`
 * limit, cuts can be aligned to headings, and tables are kept atomic. Intended to be refined
 * later (finer splits, row-level tables, token limits).
 *
 * @param result - Structured document to split.
 * @param options - Optional chunking settings.
 * @returns The chunks, in document order.
 */
declare function splitStructuredIntoChunks(result: StructuredDocumentResult, options?: SplitStructuredIntoChunksOptions): StructuredChunk[];
|
|
129
|
+
|
|
130
|
+
/** Option bag for {@link renderMarkdown}; identical to {@link ConvertStructuredToMarkdownOptions}. */
type RenderMarkdownOptions = ConvertStructuredToMarkdownOptions;
|
|
132
|
+
/** Option bag for {@link renderLlmText}; identical to {@link ConvertStructuredToLlmTextOptions}. */
type RenderLlmTextOptions = ConvertStructuredToLlmTextOptions;
|
|
134
|
+
/**
 * Convenience alias for {@link convertStructuredToMarkdown}: whole document in, Markdown string out.
 *
 * @param result - Structured document to render.
 * @param options - Optional rendering settings.
 * @returns The Markdown string.
 */
declare function renderMarkdown(result: StructuredDocumentResult, options?: RenderMarkdownOptions): string;
|
|
138
|
+
/**
 * Convenience alias for {@link convertStructuredToLlmText}: whole document in, plain text for
 * LLM prompts out.
 *
 * @param result - Structured document to render.
 * @param options - Optional rendering settings.
 * @returns The plain-text rendering.
 */
declare function renderLlmText(result: StructuredDocumentResult, options?: RenderLlmTextOptions): string;
|
|
142
|
+
/** A single Markdown slice that follows the chunking boundaries (headings / size limits). */
interface MarkdownSection {
    /** Position of the section in the returned array. */
    readonly index: number;
    /** Markdown content of the section. */
    readonly markdown: string;
    /** Heading texts in scope for the section, when available. */
    readonly headingPath?: readonly string[];
    /** First page index covered by the section, when available. */
    readonly pageIndex?: number;
    /** Last page index covered by the section, when available. */
    readonly pageEndIndex?: number;
    /** Plain text for the same block span (optional; useful for embeddings or previews). */
    readonly text?: string;
}
|
|
152
|
+
/**
 * Option bag for {@link renderMarkdownSections}; it has the same shape as
 * {@link SplitStructuredIntoChunksOptions}.
 */
type RenderMarkdownSectionsOptions = SplitStructuredIntoChunksOptions;
|
|
156
|
+
/**
 * Produces Markdown (plus optional parallel plain text) for each chunk by delegating to
 * {@link splitStructuredIntoChunks} with `includeMarkdown: true`. Suited to sectioned previews,
 * tables of contents, or per-chunk storage without chunking a second time.
 *
 * @param result - Structured document to split and render.
 * @param options - Optional chunking settings.
 * @returns One section per chunk, in document order.
 */
declare function renderMarkdownSections(result: StructuredDocumentResult, options?: RenderMarkdownSectionsOptions): MarkdownSection[];
|
|
161
|
+
|
|
162
|
+
export { type ConvertStructuredToLlmTextOptions, type ConvertStructuredToMarkdownOptions, type MarkdownSection, type RenderLlmTextOptions, type RenderMarkdownOptions, type RenderMarkdownSectionsOptions, type SplitStructuredIntoChunksOptions, type StructuredChunk, type StructuredToLlmTextOptions, type StructuredToMarkdownOptions, convertStructuredToLlmText, convertStructuredToMarkdown, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,647 @@
|
|
|
1
|
+
// src/structuredToMarkdown.ts
|
|
2
|
+
/**
 * Normalizes a heading level to an integer between 1 and 6.
 *
 * @param level Requested heading level; may be undefined or non-finite.
 * @returns 2 when `level` is not a finite number, otherwise the floored level clamped to 1..6.
 */
function clampHeadingLevel(level) {
  // Number.isFinite is false for undefined, NaN, ±Infinity and any non-number.
  if (!Number.isFinite(level)) return 2;
  return Math.min(6, Math.max(1, Math.floor(level)));
}
|
|
9
|
+
/**
 * Makes a string safe to place inside one cell of a pipe table.
 *
 * @param text Raw cell text.
 * @returns The text with `|` backslash-escaped, line breaks turned into spaces, and ends trimmed.
 */
function escapeTableCell(text) {
  const pipesEscaped = text.replace(/\|/g, "\\|");
  const onOneLine = pipesEscaped.replace(/\r?\n/g, " ");
  return onOneLine.trim();
}
|
|
12
|
+
/**
 * Returns the argument when it is a string, and an empty string for anything else.
 *
 * @param s Value of unknown type.
 * @returns `s` itself, or `""`.
 */
function safeString(s) {
  if (typeof s !== "string") return "";
  return s;
}
|
|
15
|
+
/**
 * Reads the collection fields of a structured result, replacing any that is not an array
 * with an empty array.
 *
 * @param result Structured document result.
 * @returns An object with `blocks`, `tables`, `pages` and `images`, each guaranteed to be an array.
 */
function safeArrays(result) {
  const listOrEmpty = (value) => (Array.isArray(value) ? value : []);
  return {
    blocks: listOrEmpty(result.blocks),
    tables: listOrEmpty(result.tables),
    pages: listOrEmpty(result.pages),
    images: listOrEmpty(result.images)
  };
}
|
|
23
|
+
/**
 * Builds the bold-label lines of the optional Markdown metadata header.
 *
 * Known fields come first (title, author, language, created, modified, page count),
 * followed by the entries of `meta.extra`.
 *
 * @param meta Document metadata object.
 * @returns One `**Label:** value` line per usable field; empty when nothing is usable.
 */
function metadataHeaderLines(meta) {
  const lines = [];
  if (meta.title) lines.push(`**Title:** ${meta.title}`);
  if (meta.author) lines.push(`**Author:** ${meta.author}`);
  if (meta.language) lines.push(`**Language:** ${meta.language}`);
  if (meta.created) lines.push(`**Created:** ${meta.created}`);
  if (meta.modified) lines.push(`**Modified:** ${meta.modified}`);
  if (meta.pageCount !== void 0 && Number.isFinite(meta.pageCount)) {
    lines.push(`**Pages:** ${meta.pageCount}`);
  }
  const extra = meta.extra;
  if (extra && typeof extra === "object") {
    for (const [key, value] of Object.entries(extra)) {
      if (value === void 0 || value === null) continue;
      let rendered;
      if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
        rendered = String(value);
      } else {
        try {
          rendered = JSON.stringify(value);
        } catch {
          // Circular structures and BigInt make JSON.stringify throw; skip such entries.
          continue;
        }
      }
      // JSON.stringify returns undefined for functions and symbols; long values are omitted.
      if (typeof rendered !== "string" || rendered.length > 200) continue;
      lines.push(`**${key}:** ${rendered}`);
    }
  }
  return lines;
}
|
|
44
|
+
/**
 * Renders a table as a GFM pipe table. The first row is used as the header row.
 *
 * A missing or empty `rows` yields an "(empty table)" note instead of a table.
 * Rows shorter than the widest row are padded with empty cells.
 *
 * @param table Table with `rows` (arrays of cells carrying `text`) and an optional `caption`.
 * @returns Markdown ending in a newline.
 */
function tableToMarkdown(table) {
  const rows = Array.isArray(table.rows) ? table.rows : [];
  if (rows.length === 0) {
    return table.caption ? `_${escapeTableCell(table.caption)}_\n\n*(empty table)*\n` : "*(empty table)*\n";
  }
  // Plain loop rather than Math.max(...spread): spreading a very large array can overflow the stack.
  let width = 0;
  for (const row of rows) {
    if (Array.isArray(row) && row.length > width) width = row.length;
  }
  const renderRow = (cells) => {
    const columns = [];
    for (let i = 0; i < width; i++) {
      // Missing rows, cells or text all become an empty cell.
      columns.push(escapeTableCell(String(cells?.[i]?.text ?? "")));
    }
    return `| ${columns.join(" | ")} |`;
  };
  const out = [];
  if (table.caption) {
    out.push(`**${escapeTableCell(table.caption)}**`);
    out.push("");
  }
  out.push(renderRow(rows[0]));
  out.push(`| ${Array.from({ length: width }, () => "---").join(" | ")} |`);
  for (let r = 1; r < rows.length; r++) {
    out.push(renderRow(rows[r]));
  }
  return `${out.join("\n")}\n`;
}
|
|
69
|
+
/**
 * Looks up a table by id.
 *
 * @param tables Candidate tables.
 * @param tableId Id to match with strict equality.
 * @returns The first table whose `id` equals `tableId`, or undefined.
 */
function resolveTable(tables, tableId) {
  for (const candidate of tables) {
    if (candidate.id === tableId) return candidate;
  }
  return undefined;
}
|
|
72
|
+
/**
 * Looks up an image by id.
 *
 * @param images Candidate images.
 * @param imageId Id to match with strict equality.
 * @returns The first image whose `id` equals `imageId`, or undefined.
 */
function resolveImage(images, imageId) {
  for (const candidate of images) {
    if (candidate.id === imageId) return candidate;
  }
  return undefined;
}
|
|
75
|
+
/**
 * Collects the ids of all tables referenced by `table` blocks.
 *
 * @param blocks Document blocks.
 * @returns A Set of the string `tableId` values found on blocks of type "table".
 */
function referencedTableIds(blocks) {
  const found = /* @__PURE__ */ new Set();
  for (const { type, tableId } of blocks) {
    const isTableRef = type === "table" && typeof tableId === "string";
    if (isTableRef) found.add(tableId);
  }
  return found;
}
|
|
82
|
+
/**
 * Collects the ids of all images referenced by `image-ref` blocks.
 *
 * @param blocks Document blocks.
 * @returns A Set of the string `imageId` values found on blocks of type "image-ref".
 */
function referencedImageIds(blocks) {
  const found = /* @__PURE__ */ new Set();
  for (const { type, imageId } of blocks) {
    const isImageRef = type === "image-ref" && typeof imageId === "string";
    if (isImageRef) found.add(imageId);
  }
  return found;
}
|
|
89
|
+
/**
 * Renders a structured document as Markdown.
 *
 * Output order: optional metadata header, the blocks in document order (with page
 * separators where a block's page index increases), then optional "Additional tables" and
 * "Additional images" sections for entries that no block references. If nothing is
 * produced, the trimmed rollup `result.text` is returned.
 *
 * @param result Structured document (`blocks`, `tables`, `pages`, `images`, `metadata`, `text`).
 * @param options Optional settings; see the field-by-field defaults read below.
 * @returns The Markdown string, without trailing whitespace.
 */
function convertStructuredToMarkdown(result, options) {
  const imagePlaceholder = options?.imagePlaceholder ?? "<!-- image: no src -->";
  const pageSep = (options?.pageSeparator ?? "---").trimEnd();
  // These two default to on; only an explicit `false` disables them.
  const pageTransitions = options?.pageTransitionMarkers !== false;
  const appendOrphanTables = options?.appendUnreferencedTables !== false;
  // This one defaults to off; only an explicit `true` enables it.
  const appendOrphanImages = options?.appendUnreferencedImages === true;
  const { blocks, tables, pages, images } = safeArrays(result);
  const hasPageModel = pages.length > 0;
  const parts = [];

  if (options?.includeMetadataHeader) {
    const meta = result.metadata;
    if (meta && typeof meta === "object") {
      const lines = metadataHeaderLines(meta);
      if (lines.length > 0) {
        parts.push(lines.join("\n"));
        parts.push("");
      }
    }
  }

  // Per-depth counters for ordered lists; cleared by any block that is not a list item.
  const orderedDepthCounters = [];
  const resetListState = () => {
    orderedDepthCounters.length = 0;
  };

  let lastPageIndex;
  // Emits the page separator (and a 1-based page hint) when a block starts a later page.
  function pushPageTransitionIfNeeded(block) {
    if (!pageTransitions) return;
    // Explicit page-break blocks render their own separator.
    if (block.type === "page-break") return;
    const pi = block.pageIndex;
    if (pi === void 0 || !Number.isFinite(pi)) return;
    const n = Math.floor(pi);
    if (lastPageIndex === void 0) {
      // The first block with a page index only sets the baseline.
      lastPageIndex = n;
      return;
    }
    if (n <= lastPageIndex) {
      lastPageIndex = n;
      return;
    }
    parts.push(pageSep);
    if (hasPageModel) {
      parts.push("");
      parts.push(`*Page ${n + 1}*`);
    }
    parts.push("");
    lastPageIndex = n;
  }

  for (const block of blocks) {
    pushPageTransitionIfNeeded(block);
    const chunk = blockToMarkdown(
      block,
      tables,
      images,
      imagePlaceholder,
      orderedDepthCounters,
      resetListState,
      pageSep
    );
    parts.push(chunk);
  }

  if (appendOrphanTables) {
    const used = referencedTableIds(blocks);
    const orphans = tables.filter((t) => t.id && !used.has(t.id));
    if (orphans.length > 0) {
      parts.push("");
      parts.push("### Additional tables");
      parts.push("");
      for (const t of orphans) {
        parts.push(tableToMarkdown(t).trimEnd());
      }
    }
  }

  if (appendOrphanImages) {
    const used = referencedImageIds(blocks);
    const orphans = images.filter((i) => i.id && !used.has(i.id));
    if (orphans.length > 0) {
      parts.push("");
      parts.push("### Additional images");
      parts.push("");
      for (const img of orphans) {
        // A raw `]` would end the alt text early, so it is escaped.
        const alt = safeString(img.alt).replace(/]/g, "\\]");
        if (img.src) {
          // Fix: emit the Markdown image; previously an empty string was pushed here.
          parts.push(`![${alt}](${img.src})`);
        } else {
          parts.push(
            `${imagePlaceholder} <!-- id: ${escapeCommentText(img.id)} -->`
          );
        }
      }
    }
  }

  // Parts are joined with blank lines; runs of 3+ newlines collapse to one blank line.
  let body = parts.join("\n\n").replace(/\n{3,}/g, "\n\n").trimEnd();
  if (body.length === 0) {
    body = safeString(result.text).trim();
  }
  return body;
}
|
|
185
|
+
/**
 * Makes text safe to embed inside an HTML comment by breaking up any `-->` terminator.
 *
 * Non-string input is tolerated: `null` / `undefined` become an empty string and other
 * values are stringified, so a block that lacks an id cannot crash rendering.
 *
 * @param s Text (normally an id or hint) to place inside `<!-- ... -->`.
 * @returns The text with every `-->` rewritten as `-\->`.
 */
function escapeCommentText(s) {
  const text = typeof s === "string" ? s : s === void 0 || s === null ? "" : String(s);
  return text.replace(/-->/g, "-\\->");
}
|
|
188
|
+
/**
 * Renders one list item as a Markdown line and maintains ordered-list numbering.
 *
 * `orderedDepthCounters[d]` holds the last number issued at depth `d`. An ordered item
 * drops the counters of deeper levels and increments its own; an unordered item drops the
 * counters at its own depth and deeper.
 *
 * @param block List-item block (`text`, optional `depth`, optional `listStyle`).
 * @param orderedDepthCounters Mutable per-depth counters shared across consecutive items.
 * @returns The indented `- text` or `n. text` line.
 */
function listItemLine(block, orderedDepthCounters) {
  // Depth must be a non-negative integer: a fractional or NaN value would make
  // `array.length = depth + 1` throw a RangeError.
  const requestedDepth = Number(block.depth ?? 0);
  const depth = Number.isFinite(requestedDepth) ? Math.max(0, Math.floor(requestedDepth)) : 0;
  // NOTE(review): one space per depth level, as in the shipped code. CommonMark only nests a
  // sub-list when it is indented to the parent's content column — confirm the intended width.
  const indent = " ".repeat(depth);
  const style = block.listStyle ?? "unordered";
  const text = typeof block.text === "string" ? block.text.trim() : "";
  if (style === "ordered") {
    while (orderedDepthCounters.length <= depth) orderedDepthCounters.push(0);
    orderedDepthCounters.length = depth + 1;
    orderedDepthCounters[depth] = (orderedDepthCounters[depth] ?? 0) + 1;
    const n = orderedDepthCounters[depth];
    return `${indent}${n}. ${text}`;
  }
  // Never grow the array here: `length = depth` on a shorter array would create holes.
  if (orderedDepthCounters.length > depth) orderedDepthCounters.length = depth;
  return `${indent}- ${text}`;
}
|
|
202
|
+
/**
 * Renders a single document block as Markdown.
 *
 * Every block type except `list-item` resets the ordered-list numbering first.
 *
 * @param block Block to render; `block.type` selects the branch.
 * @param tables Tables available to `table` blocks.
 * @param images Images available to `image-ref` blocks.
 * @param imagePlaceholder Text used when an image has no `src`.
 * @param orderedDepthCounters Shared per-depth counters for ordered lists.
 * @param resetListState Callback that clears the ordered-list counters.
 * @param pageSep Separator emitted for `page-break` blocks.
 * @returns The Markdown for the block (always a string).
 */
function blockToMarkdown(block, tables, images, imagePlaceholder, orderedDepthCounters, resetListState, pageSep) {
  switch (block.type) {
    case "heading": {
      resetListState();
      const level = clampHeadingLevel(block.level);
      const hashes = "#".repeat(level);
      return `${hashes} ${block.text.trim()}`;
    }
    case "paragraph": {
      resetListState();
      return block.text.trim();
    }
    case "list-item":
      return listItemLine(block, orderedDepthCounters);
    case "table": {
      resetListState();
      const t = resolveTable(tables, block.tableId);
      if (!t) {
        return `<!-- table not found: ${escapeCommentText(block.tableId)} -->`;
      }
      return tableToMarkdown(t).trimEnd();
    }
    case "image-ref": {
      resetListState();
      const img = resolveImage(images, block.imageId);
      // The block's own alt text wins over the one stored on the image.
      const altRaw = block.alt ?? img?.alt ?? "";
      // A raw `]` would end the alt text early, so it is escaped.
      const alt = altRaw.replace(/]/g, "\\]");
      const src = img?.src;
      // Fix: emit the Markdown image; previously an empty string was returned here.
      if (src) return `![${alt}](${src})`;
      const kind = img?.kind;
      const hint = kind === "placeholder" ? " (placeholder)" : kind === "embedded" ? " (embedded)" : "";
      return `${imagePlaceholder}${hint}`;
    }
    case "page-break": {
      resetListState();
      return pageSep;
    }
    case "unknown": {
      resetListState();
      const raw = block.raw?.trim();
      if (raw) return raw;
      if (block.hint) return `<!-- ${escapeCommentText(block.hint)} -->`;
      return "<!-- unknown block -->";
    }
    default: {
      // Unrecognized block type at runtime: return a string rather than the block object,
      // which would otherwise be joined into the output as "[object Object]".
      resetListState();
      return "<!-- unknown block -->";
    }
  }
}
|
|
252
|
+
/**
 * Deprecated alias kept for backward compatibility; forwards to convertStructuredToMarkdown.
 *
 * @param structured Structured document to render.
 * @param options Optional rendering settings, passed through unchanged.
 * @returns The Markdown string.
 */
function structuredDocumentToMarkdown(structured, options) {
  const markdown = convertStructuredToMarkdown(structured, options);
  return markdown;
}
|
|
255
|
+
|
|
256
|
+
// src/structuredToLlmText.ts
|
|
257
|
+
/**
 * Normalizes a heading level for the LLM-text renderer.
 *
 * @param level Requested heading level; may be undefined or non-finite.
 * @returns 2 when `level` is undefined or not finite, otherwise the floored level clamped to 1..6.
 */
function clampHeadingLevel2(level) {
  const usable = level !== void 0 && Number.isFinite(level);
  if (!usable) return 2;
  const whole = Math.floor(level);
  return whole < 1 ? 1 : whole > 6 ? 6 : whole;
}
|
|
264
|
+
/**
 * Reads the collection fields used by the LLM-text renderer, replacing any that is not
 * an array with an empty array.
 *
 * @param result Structured document result.
 * @returns An object with `blocks`, `tables`, `pages`, `images` and `warnings`, each an array.
 */
function safeArrays2(result) {
  const listOrEmpty = (value) => (Array.isArray(value) ? value : []);
  return {
    blocks: listOrEmpty(result.blocks),
    tables: listOrEmpty(result.tables),
    pages: listOrEmpty(result.pages),
    images: listOrEmpty(result.images),
    warnings: listOrEmpty(result.warnings)
  };
}
|
|
273
|
+
/**
 * Builds the optional `[DOC]` metadata block for the LLM-text output.
 *
 * Known fields come first (title, author, language, pages, created, modified), followed by
 * at most `extraMax` entries of `meta.extra`.
 *
 * @param meta Document metadata object.
 * @param extraMax Maximum number of `extra` entries to include; values <= 0 include none.
 * @returns The block starting with a `[DOC]` line, or undefined when there is nothing to show.
 */
function metadataDocBlock(meta, extraMax) {
  const lines = [];
  if (meta.title) lines.push(`title: ${meta.title}`);
  if (meta.author) lines.push(`author: ${meta.author}`);
  if (meta.language) lines.push(`language: ${meta.language}`);
  if (meta.pageCount !== void 0 && Number.isFinite(meta.pageCount)) {
    lines.push(`pages: ${meta.pageCount}`);
  }
  if (meta.created) lines.push(`created: ${meta.created}`);
  if (meta.modified) lines.push(`modified: ${meta.modified}`);
  const extra = meta.extra;
  if (extra && typeof extra === "object" && extraMax > 0) {
    let emitted = 0;
    for (const [key, value] of Object.entries(extra)) {
      if (emitted >= extraMax) break;
      if (value === void 0 || value === null) continue;
      let rendered;
      if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
        rendered = String(value);
      } else {
        try {
          rendered = JSON.stringify(value);
        } catch {
          // Circular structures and BigInt make JSON.stringify throw; skip such entries.
          continue;
        }
      }
      // JSON.stringify returns undefined for functions and symbols; long values are omitted.
      // Skipped entries do not count toward `extraMax`.
      if (typeof rendered !== "string" || rendered.length > 120) continue;
      lines.push(`${key}: ${rendered}`);
      emitted++;
    }
  }
  if (lines.length === 0) return void 0;
  return `[DOC]\n${lines.join("\n")}`;
}
|
|
299
|
+
/**
 * Renders a table as plain text for LLM consumption: a tag line with the table id, an
 * optional caption line, then one line per row with cells joined by " | ".
 *
 * Missing `rows`, non-array rows and cells without string `text` are tolerated and rendered
 * as empty, matching the tolerance of the Markdown table renderer.
 *
 * @param table Table with `id`, optional `caption`, and `rows` of cells carrying `text`.
 * @param tag Opening tag for the block, e.g. `[TABLE]`.
 * @returns The multi-line text block, without a trailing newline.
 */
function tableToLlmBlock(table, tag) {
  const lines = [`${tag} id=${table.id}`];
  if (table.caption) lines.push(`Caption: ${table.caption}`);
  const rows = Array.isArray(table.rows) ? table.rows : [];
  if (rows.length === 0) {
    lines.push("(empty table)");
    return lines.join("\n");
  }
  for (const row of rows) {
    const cells = Array.isArray(row) ? row : [];
    lines.push(
      cells.map((cell) => String(cell?.text ?? "").replace(/\r?\n/g, " ").trim()).join(" | ")
    );
  }
  return lines.join("\n");
}
|
|
313
|
+
/**
 * Looks up a table by id for the LLM-text renderer.
 *
 * @param tables Candidate tables.
 * @param tableId Id to match with strict equality.
 * @returns The first table whose `id` equals `tableId`, or undefined.
 */
function resolveTable2(tables, tableId) {
  const position = tables.findIndex((entry) => entry.id === tableId);
  return position === -1 ? undefined : tables[position];
}
|
|
316
|
+
/**
 * Collects the ids of all tables referenced by `table` blocks (LLM-text renderer).
 *
 * @param blocks Document blocks.
 * @returns A Set of the string `tableId` values found on blocks of type "table".
 */
function referencedTableIds2(blocks) {
  const seen = /* @__PURE__ */ new Set();
  for (const entry of blocks) {
    if (entry.type !== "table") continue;
    if (typeof entry.tableId !== "string") continue;
    seen.add(entry.tableId);
  }
  return seen;
}
|
|
323
|
+
/**
 * Renders one list item as a plain-text line (`n. text` or `• text`) and maintains
 * ordered-list numbering.
 *
 * `orderedDepthCounters[d]` holds the last number issued at depth `d`. An ordered item
 * drops the counters of deeper levels and increments its own; an unordered item drops the
 * counters at its own depth and deeper.
 *
 * @param block List-item block (`text`, optional `depth`, optional `listStyle`).
 * @param orderedDepthCounters Mutable per-depth counters shared across consecutive items.
 * @returns The indented, single-line item text.
 */
function listItemLine2(block, orderedDepthCounters) {
  // Depth must be a non-negative integer: a fractional or NaN value would make
  // `array.length = depth + 1` throw a RangeError.
  const requestedDepth = Number(block.depth ?? 0);
  const depth = Number.isFinite(requestedDepth) ? Math.max(0, Math.floor(requestedDepth)) : 0;
  const indent = " ".repeat(depth);
  const style = block.listStyle ?? "unordered";
  // Item text is flattened to one line; a missing text renders as empty.
  const text = typeof block.text === "string" ? block.text.replace(/\r?\n/g, " ").trim() : "";
  if (style === "ordered") {
    while (orderedDepthCounters.length <= depth) orderedDepthCounters.push(0);
    orderedDepthCounters.length = depth + 1;
    orderedDepthCounters[depth] = (orderedDepthCounters[depth] ?? 0) + 1;
    const n = orderedDepthCounters[depth];
    return `${indent}${n}. ${text}`;
  }
  // Never grow the array here: `length = depth` on a shorter array would create holes.
  if (orderedDepthCounters.length > depth) orderedDepthCounters.length = depth;
  return `${indent}\u2022 ${text}`;
}
|
|
338
|
+
/**
 * Renders a single document block as LLM-oriented plain text.
 *
 * Every block type except `list-item` resets the ordered-list numbering first.
 *
 * @param block Block to render; `block.type` selects the branch.
 * @param tables Tables available to `table` blocks.
 * @param images Images available to `image-ref` blocks.
 * @param tableTag Opening tag for table blocks.
 * @param imageTag Tag for image lines.
 * @param pageMarker Line emitted for `page-break` blocks.
 * @param orderedDepthCounters Shared per-depth counters for ordered lists.
 * @param resetListState Callback that clears the ordered-list counters.
 * @param skipEmptyParagraphs When truthy, whitespace-only paragraphs yield undefined.
 * @returns The text for the block, or undefined when the block produces no output.
 */
function blockToLlm(block, tables, images, tableTag, imageTag, pageMarker, orderedDepthCounters, resetListState, skipEmptyParagraphs) {
  switch (block.type) {
    case "heading": {
      resetListState();
      const lv = clampHeadingLevel2(block.level);
      return `[H${lv}] ${block.text.replace(/\r?\n/g, " ").trim()}`;
    }
    case "paragraph": {
      resetListState();
      const t = block.text.trim();
      if (t.length === 0 && skipEmptyParagraphs) return void 0;
      // Collapse line breaks and whitespace runs so the paragraph is a single line.
      return t.replace(/\r?\n/g, " ").replace(/\s+/g, " ").trim();
    }
    case "list-item":
      return listItemLine2(block, orderedDepthCounters);
    case "table": {
      resetListState();
      const t = resolveTable2(tables, block.tableId);
      if (!t) return `${tableTag} MISSING id=${block.tableId}`;
      return tableToLlmBlock(t, tableTag);
    }
    case "image-ref": {
      resetListState();
      const img = images.find((i) => i.id === block.imageId);
      // The block's own alt text wins over the one stored on the image.
      const alt = (block.alt ?? img?.alt ?? "").replace(/\r?\n/g, " ").trim();
      if (img?.src) {
        return `${imageTag} alt=${JSON.stringify(alt)} url=${JSON.stringify(img.src)}`;
      }
      return `${imageTag} id=${JSON.stringify(block.imageId)} alt=${JSON.stringify(alt)} (no url)`;
    }
    case "page-break": {
      resetListState();
      return pageMarker;
    }
    case "unknown": {
      resetListState();
      const raw = block.raw?.trim();
      // Normalize CRLF to LF but keep the raw text's line structure.
      if (raw) return raw.replace(/\r?\n/g, "\n");
      return block.hint ? `[UNKNOWN: ${block.hint}]` : "[UNKNOWN]";
    }
    default: {
      // Unrecognized block type at runtime: produce no output instead of returning the block
      // object, which is neither a string nor undefined.
      return void 0;
    }
  }
}
|
|
384
|
+
/**
 * Linearizes a structured document into plain text for LLM prompts.
 *
 * Output order: optional `[DOC]` metadata block, the blocks in document order (with page
 * markers where a block's page index increases), optional `[MORE_TABLES]` section for
 * tables no block references, and an optional trailing `[WARNINGS]` section. When the
 * blocks yield nothing and the fallback is enabled, the trimmed rollup `result.text` is used.
 *
 * @param result Structured document (`blocks`, `tables`, `pages`, `images`, `warnings`, `metadata`, `text`).
 * @param options Optional settings; see the field-by-field defaults read below.
 * @returns The plain-text rendering.
 */
function convertStructuredToLlmText(result, options) {
  // Flags compared with `!== false` default to on; flags compared with `=== true` default to off.
  const useRollupFallback = options?.fallbackToRollupText !== false;
  const tableTag = options?.tableSectionTag ?? "[TABLE]";
  const imageTag = options?.imagePlaceholderTag ?? "[IMAGE]";
  const pageMarker = options?.pageBreakMarker ?? "[PAGE]";
  const markPageTransitions = options?.pageTransitionMarkers !== false;
  const withWarnings = options?.includeWarnings !== false;
  const withMetadata = options?.includeDocumentMetadata === true;
  const extraMax = options?.metadataExtraMaxKeys ?? 5;
  const withOrphanTables = options?.appendUnreferencedTables !== false;
  const isCompact = options?.compact === true;
  const skipEmptyParagraphs = options?.skipEmptyParagraphs !== false;
  const sep = isCompact ? "\n" : "\n\n";

  const { blocks, tables, pages, images, warnings } = safeArrays2(result);
  const hasPageModel = pages.length > 0;
  const parts = [];

  if (withMetadata && result.metadata && typeof result.metadata === "object") {
    const docBlock = metadataDocBlock(result.metadata, extraMax);
    if (docBlock) parts.push(docBlock);
  }

  // Per-depth counters for ordered lists; cleared by any block that is not a list item.
  const orderedDepthCounters = [];
  const resetListState = () => {
    orderedDepthCounters.length = 0;
  };

  let previousPage;
  // Emits the page marker (plus a 1-based `[PAGE n]` line when pages exist) whenever a
  // block starts on a later page than the previous indexed block.
  const notePageTransition = (block) => {
    if (!markPageTransitions || block.type === "page-break") return;
    const rawIndex = block.pageIndex;
    if (rawIndex === void 0 || !Number.isFinite(rawIndex)) return;
    const currentPage = Math.floor(rawIndex);
    const movedForward = previousPage !== void 0 && currentPage > previousPage;
    // The remembered page is updated in every case, including backwards jumps.
    previousPage = currentPage;
    if (!movedForward) return;
    parts.push(pageMarker);
    if (hasPageModel) {
      parts.push(`[PAGE ${currentPage + 1}]`);
    }
  };

  for (const block of blocks) {
    notePageTransition(block);
    const rendered = blockToLlm(
      block,
      tables,
      images,
      tableTag,
      imageTag,
      pageMarker,
      orderedDepthCounters,
      resetListState,
      skipEmptyParagraphs
    );
    if (rendered !== void 0 && rendered.length > 0) parts.push(rendered);
  }

  if (withOrphanTables) {
    const referenced = referencedTableIds2(blocks);
    const unreferenced = tables.filter((t) => t.id && !referenced.has(t.id));
    if (unreferenced.length > 0) {
      parts.push(`[MORE_TABLES]`);
      for (const table of unreferenced) {
        parts.push(tableToLlmBlock(table, tableTag));
      }
    }
  }

  // Runs of 3+ newlines collapse to a single blank line.
  let out = parts.join(sep).replace(/\n{3,}/g, "\n\n").trim();
  if (out.length === 0 && useRollupFallback) {
    out = typeof result.text === "string" ? result.text.trim() : "";
  }

  if (withWarnings && warnings.length > 0) {
    const warnLines = warnings.map((w) => `- ${String(w).replace(/\r?\n/g, " ")}`).join("\n");
    const warningsBlock = `[WARNINGS]\n${warnLines}`;
    out = out ? `${out}${sep}${warningsBlock}` : warningsBlock;
  }
  return out;
}
|
|
466
|
+
/**
 * Deprecated alias kept for backward compatibility; forwards to convertStructuredToLlmText.
 *
 * @param structured Structured document to linearize.
 * @param options Optional rendering settings, passed through unchanged.
 * @returns The plain-text rendering.
 */
function structuredDocumentToLlmText(structured, options) {
  const llmText = convertStructuredToLlmText(structured, options);
  return llmText;
}
|
|
469
|
+
|
|
470
|
+
// src/splitStructuredIntoChunks.ts
|
|
471
|
+
// Markdown options used when rendering one slice of a document: every document-level
// extra (metadata header, page transition markers, unreferenced tables/images) is off.
var SLICE_MARKDOWN_OPTS = {
  includeMetadataHeader: false,
  pageTransitionMarkers: false,
  appendUnreferencedTables: false,
  appendUnreferencedImages: false
};
|
|
477
|
+
// LLM-text options used when rendering one slice of a document: no rollup fallback, no
// [DOC] block, no [WARNINGS], no page transition markers, no unreferenced tables.
var SLICE_LLM_OPTS = {
  fallbackToRollupText: false,
  includeDocumentMetadata: false,
  includeWarnings: false,
  pageTransitionMarkers: false,
  appendUnreferencedTables: false
};
|
|
484
|
+
/**
 * Normalizes a heading level for the chunker's heading stack.
 *
 * @param level Requested heading level; may be undefined or non-finite.
 * @returns 2 when `level` is not a finite number, otherwise the floored level clamped to 1..6.
 */
function clampHeadingLevel3(level) {
  if (typeof level !== "number" || !Number.isFinite(level)) return 2;
  const floored = Math.floor(level);
  // `floored` is an integer, so "< 1" and "> 6" are the same as "<= 0" and ">= 7".
  if (floored <= 0) return 1;
  if (floored >= 7) return 6;
  return floored;
}
|
|
491
|
+
/**
 * Creates a shallow copy of a structured result restricted to the given blocks.
 *
 * All other fields are carried over unchanged; `text` is blanked so the rollup text
 * cannot leak into a slice.
 *
 * @param result Source structured result (not modified).
 * @param blocks Blocks that make up the slice; copied into a new array.
 * @returns The sliced result object.
 */
function sliceResult(result, blocks) {
  const blocksCopy = Array.from(blocks);
  return { ...result, blocks: blocksCopy, text: "" };
}
|
|
498
|
+
/**
 * Renders a single block on its own, as LLM text and optionally as Markdown.
 *
 * @param result Full structured result; supplies the tables and images the block may reference.
 * @param block The block to render.
 * @param includeMarkdown When truthy, Markdown is rendered as well.
 * @returns `{ text, md }`, both trimmed; `md` is an empty string when Markdown is not requested.
 */
function renderSlice(result, block, includeMarkdown) {
  const single = sliceResult(result, [block]);
  const text = convertStructuredToLlmText(single, SLICE_LLM_OPTS).trim();
  let md = "";
  if (includeMarkdown) {
    md = convertStructuredToMarkdown(single, SLICE_MARKDOWN_OPTS).trim();
  }
  return { text, md };
}
|
|
504
|
+
/**
 * Joins text fragments into one chunk body.
 *
 * @param parts Fragments to combine.
 * @returns The trimmed, non-empty fragments separated by a blank line.
 */
function joinChunkParts(parts) {
  const kept = [];
  for (const part of parts) {
    const trimmed = part.trim();
    if (trimmed.length > 0) kept.push(trimmed);
  }
  return kept.join("\n\n");
}
|
|
507
|
+
/**
 * Read `result.blocks` defensively.
 *
 * @returns `result.blocks` itself when it is an array, otherwise a new empty array.
 */
function safeBlocks(result) {
  const candidate = result.blocks;
  if (!Array.isArray(candidate)) {
    return [];
  }
  return candidate;
}
|
|
510
|
+
/**
 * Split a structured document into chunks of bounded text length.
 *
 * Every block is rendered on its own (LLM text and, optionally, markdown) and
 * the rendered blocks are packed greedily into chunks.
 *
 * @param result - Structured document. When it has no blocks, a single chunk
 *   built from the whole-document rendering (falling back to `result.text`) is
 *   returned.
 * @param options - Optional settings:
 *   - `maxChars` (default 4000, minimum 1): target upper bound for a chunk's
 *     `text`. A NaN value falls back to the default. A single block longer
 *     than the limit is never split, so chunks can exceed it.
 *   - `overlapChars` (default 0): number of trailing characters of a chunk's
 *     `text` that are prepended to the next chunk's `text`. Markdown never
 *     carries the overlap.
 *   - `preferHeadings` (default true): start a new chunk at every heading.
 *   - `preserveTables` (default true): a table whose text exceeds `maxChars`
 *     is emitted as a chunk of its own.
 *   - `includeMarkdown` (default true): also render `markdown` per chunk.
 * @returns Array of chunks `{ index, text, markdown, headingPath, pageIndex,
 *   pageEndIndex }`. `headingPath` is the heading trail of the chunk's last
 *   block; `pageIndex` / `pageEndIndex` are the min / max block page indexes.
 *   The array always contains at least one chunk.
 */
function splitStructuredIntoChunks(result, options) {
  const DEFAULT_MAX_CHARS = 4e3;
  // Math.max(1, NaN) is NaN and every `> NaN` comparison is false, which would
  // silently switch off size-based splitting; treat NaN as "not provided".
  const requestedMax = Number(options?.maxChars ?? DEFAULT_MAX_CHARS);
  const maxChars = Number.isNaN(requestedMax) ? DEFAULT_MAX_CHARS : Math.max(1, requestedMax);
  const requestedOverlap = Number(options?.overlapChars ?? 0);
  const overlapChars = Number.isNaN(requestedOverlap) ? 0 : Math.max(0, requestedOverlap);
  const preferHeadings = options?.preferHeadings !== false;
  const preserveTables = options?.preserveTables !== false;
  const includeMarkdown = options?.includeMarkdown !== false;
  const blocks = safeBlocks(result);

  // No blocks: render the document as a whole and return it as one chunk.
  if (blocks.length === 0) {
    const text = convertStructuredToLlmText(result, {
      ...SLICE_LLM_OPTS,
      fallbackToRollupText: true,
      includeWarnings: false
    }).trim();
    const md = includeMarkdown ? convertStructuredToMarkdown(result, {
      ...SLICE_MARKDOWN_OPTS,
      includeMetadataHeader: false
    }).trim() : void 0;
    const chunkText = text.length > 0 ? text : typeof result.text === "string" ? result.text.trim() : "";
    return [
      {
        index: 0,
        text: chunkText,
        markdown: md && md.length > 0 ? md : void 0,
        headingPath: void 0,
        pageIndex: void 0,
        pageEndIndex: void 0
      }
    ];
  }

  // Pass 1: render each block and record the heading trail in effect at it.
  const stack = [];
  const units = [];
  for (const block of blocks) {
    if (block.type === "heading") {
      const L = clampHeadingLevel3(block.level);
      // A heading closes every open heading of the same or a deeper level.
      while (stack.length > 0 && stack[stack.length - 1].level >= L) {
        stack.pop();
      }
      // Non-string heading text is treated as empty instead of throwing.
      const headingText = typeof block.text === "string" ? block.text.trim() : "";
      stack.push({ level: L, text: headingText });
    }
    const headingPath = stack.map((h) => h.text);
    const { text, md } = renderSlice(result, block, includeMarkdown);
    const pageIndex = block.pageIndex !== void 0 && Number.isFinite(block.pageIndex) ? Math.floor(block.pageIndex) : void 0;
    units.push({
      text,
      md,
      pageIndex,
      isTable: block.type === "table",
      isHeading: block.type === "heading",
      headingPath
    });
  }

  // Pass 2: pack units into chunks.
  const chunks = [];
  let current = [];
  // Tail of the previously emitted chunk, prepended to the next chunk's text.
  let pendingTextPrefix = "";

  // Length the chunk text would have if `next` were appended to `current`.
  function projectedTextLength(next) {
    const body = joinChunkParts(current.map((u) => u.text).concat(next.text));
    const full = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}` : body;
    return full.length;
  }

  // Emit `current` as a chunk (if it has content) and start a new one.
  function flush() {
    if (current.length === 0) return;
    const body = joinChunkParts(current.map((u) => u.text));
    const markdown = includeMarkdown ? joinChunkParts(current.map((u) => u.md)).trim() : void 0;
    const hasMarkdown = markdown !== void 0 && markdown.length > 0;
    if (body.length === 0 && !hasMarkdown) {
      // The pending units rendered to nothing. Emitting a chunk here would
      // produce one that only repeats the previous chunk's overlap, so drop
      // the units and keep the overlap for the next chunk with real content.
      current = [];
      return;
    }
    const text = pendingTextPrefix ? `${pendingTextPrefix}${body.length > 0 ? "\n\n" + body : ""}`.trim() : body.trim();
    const pages = current.map((u) => u.pageIndex).filter((n) => n !== void 0);
    const pageIndex = pages.length > 0 ? Math.min(...pages) : void 0;
    const pageEndIndex = pages.length > 0 ? Math.max(...pages) : void 0;
    const headingPath = current[current.length - 1].headingPath;
    chunks.push({
      index: chunks.length,
      text,
      markdown: hasMarkdown ? markdown : void 0,
      headingPath: headingPath && headingPath.length > 0 ? [...headingPath] : void 0,
      pageIndex,
      pageEndIndex
    });
    pendingTextPrefix = overlapChars > 0 && text.length > 0 ? text.slice(Math.max(0, text.length - overlapChars)).trimStart() : "";
    current = [];
  }

  for (let i = 0; i < units.length; i++) {
    const unit = units[i];
    // A heading always opens a new chunk when heading-aware splitting is on.
    if (preferHeadings && unit.isHeading && current.length > 0) {
      flush();
    }
    // An oversized table becomes a chunk of its own.
    if (preserveTables && unit.isTable && unit.text.length > maxChars) {
      if (current.length > 0) flush();
      current = [unit];
      flush();
      continue;
    }
    // Close the running chunk if adding this unit would exceed the limit.
    if (current.length > 0 && projectedTextLength(unit) > maxChars) {
      flush();
    }
    current.push(unit);
  }
  flush();

  // Blocks existed but none rendered to anything: fall back to the roll-up text.
  if (chunks.length === 0) {
    return [
      {
        index: 0,
        text: typeof result.text === "string" ? result.text.trim() : "",
        markdown: void 0,
        headingPath: void 0,
        pageIndex: void 0,
        pageEndIndex: void 0
      }
    ];
  }
  // `index` was assigned from chunks.length at push time, so it is already sequential.
  return chunks;
}
|
|
622
|
+
|
|
623
|
+
// src/render.ts
|
|
624
|
+
/**
 * Render a structured document as markdown.
 * Delegates to `convertStructuredToMarkdown` with the arguments unchanged.
 */
function renderMarkdown(result, options) {
  const markdown = convertStructuredToMarkdown(result, options);
  return markdown;
}
|
|
627
|
+
/**
 * Render a structured document as LLM-oriented plain text.
 * Delegates to `convertStructuredToLlmText` with the arguments unchanged.
 */
function renderLlmText(result, options) {
  const llmText = convertStructuredToLlmText(result, options);
  return llmText;
}
|
|
630
|
+
/**
 * Split a document into chunks and expose them as markdown sections.
 *
 * @param result - Structured document to split.
 * @param options - Chunking options forwarded to `splitStructuredIntoChunks`;
 *   `includeMarkdown` is always forced to `true`.
 * @returns One section per chunk: `markdown` is always a (possibly empty)
 *   trimmed string, `text` is the trimmed chunk text or `undefined` when that
 *   text is empty; `index`, `headingPath`, `pageIndex` and `pageEndIndex` are
 *   copied from the chunk.
 */
function renderMarkdownSections(result, options) {
  const chunkOptions = { ...options, includeMarkdown: true };
  const sections = [];
  for (const chunk of splitStructuredIntoChunks(result, chunkOptions)) {
    const trimmedText = chunk.text.trim();
    sections.push({
      index: chunk.index,
      markdown: (chunk.markdown ?? "").trim(),
      headingPath: chunk.headingPath,
      pageIndex: chunk.pageIndex,
      pageEndIndex: chunk.pageEndIndex,
      text: trimmedText.length > 0 ? trimmedText : void 0
    });
  }
  return sections;
}
|
|
644
|
+
|
|
645
|
+
export { convertStructuredToLlmText, convertStructuredToMarkdown, renderLlmText, renderMarkdown, renderMarkdownSections, splitStructuredIntoChunks, structuredDocumentToLlmText, structuredDocumentToMarkdown };
|
|
646
|
+
//# sourceMappingURL=index.js.map
|
|
647
|
+
//# sourceMappingURL=index.js.map
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@dragon708/docmind-markdown",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "StructuredDocumentResult → Markdown and LLM-oriented plain text for DocMind.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"sideEffects": false,
|
|
7
|
+
"main": "./dist/index.js",
|
|
8
|
+
"module": "./dist/index.js",
|
|
9
|
+
"types": "./dist/index.d.ts",
|
|
10
|
+
"exports": {
|
|
11
|
+
".": {
|
|
12
|
+
"types": "./dist/index.d.ts",
|
|
13
|
+
"import": "./dist/index.js",
|
|
14
|
+
"default": "./dist/index.js"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"files": [
|
|
18
|
+
"dist/**/*.js",
|
|
19
|
+
"dist/**/*.d.ts"
|
|
20
|
+
],
|
|
21
|
+
"publishConfig": {
|
|
22
|
+
"access": "public"
|
|
23
|
+
},
|
|
24
|
+
"scripts": {
|
|
25
|
+
"build": "tsup",
|
|
26
|
+
"dev": "tsup --watch",
|
|
27
|
+
"test": "vitest run",
|
|
28
|
+
"clean": "node -e \"try{require('fs').rmSync('dist',{recursive:true,force:true})}catch(e){}\""
|
|
29
|
+
},
|
|
30
|
+
"keywords": [
|
|
31
|
+
"docmind",
|
|
32
|
+
"markdown",
|
|
33
|
+
"structured-document",
|
|
34
|
+
"llm"
|
|
35
|
+
],
|
|
36
|
+
"license": "MIT",
|
|
37
|
+
"dependencies": {
|
|
38
|
+
"@dragon708/docmind-shared": "^1.2.0"
|
|
39
|
+
},
|
|
40
|
+
"devDependencies": {
|
|
41
|
+
"@types/node": "^20.19.37",
|
|
42
|
+
"tsup": "^8.5.1",
|
|
43
|
+
"typescript": "^5.9.3",
|
|
44
|
+
"vitest": "^1.6.1"
|
|
45
|
+
}
|
|
46
|
+
}
|