@doclo/core 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/pdf-utils.ts","../src/runtime/base64.ts"],"sourcesContent":["/**\n * PDF Utilities\n *\n * Edge Runtime compatible PDF manipulation utilities using pdf-lib.\n * These functions work in Node.js, Vercel Edge Functions, Cloudflare Workers, and browsers.\n */\n\nimport { PDFDocument } from 'pdf-lib';\nimport { base64ToArrayBuffer, uint8ArrayToBase64 } from './runtime/base64.js';\nimport type { DocumentIR } from './internal/validation-utils.js';\n\n/**\n * Get the total number of pages in a PDF document\n *\n * @param dataUrl - PDF data URI in format: data:application/pdf;base64,{base64data}\n * @returns Total page count\n * @throws {Error} If the input is not a valid PDF data URL\n *\n * @example\n * ```typescript\n * const pageCount = await getPDFPageCount('data:application/pdf;base64,JVBERi0...');\n * console.log(`PDF has ${pageCount} pages`);\n * ```\n */\nexport async function getPDFPageCount(dataUrl: string): Promise<number> {\n const base64Match = dataUrl.match(/^data:application\\/pdf;base64,(.+)$/);\n if (!base64Match) {\n throw new Error('Invalid PDF data URL format. Expected: data:application/pdf;base64,{base64data}');\n }\n\n const base64Data = base64Match[1];\n const pdfBytes = base64ToArrayBuffer(base64Data);\n const pdfDoc = await PDFDocument.load(pdfBytes);\n return pdfDoc.getPageCount();\n}\n\n/**\n * Split a PDF into multiple smaller PDFs based on page ranges\n *\n * @param dataUrl - PDF data URI in format: data:application/pdf;base64,{base64data}\n * @param pageRanges - Array of [startPage, endPage] tuples (1-indexed, inclusive)\n * @returns Array of PDF data URLs, one for each page range\n * @throws {Error} If the input is not a valid PDF data URL or page ranges are invalid\n *\n * @example\n * ```typescript\n * // Split a 10-page PDF into three chunks\n * const chunks = await splitPDFIntoChunks(pdfDataUrl, [\n * [1, 3], // Pages 1-3\n * [4, 7], // Pages 4-7\n * [8, 10] // Pages 8-10\n * ]);\n * console.log(`Created ${chunks.length} PDF chunks`);\n * ```\n */\nexport async function splitPDFIntoChunks(\n dataUrl: string,\n pageRanges: Array<[number, number]>\n): Promise<string[]> {\n // Extract base64 data from data URL\n const base64Match = dataUrl.match(/^data:application\\/pdf;base64,(.+)$/);\n if (!base64Match) {\n throw new Error('Invalid PDF data URL format. Expected: data:application/pdf;base64,{base64data}');\n }\n\n const base64Data = base64Match[1];\n const pdfBytes = base64ToArrayBuffer(base64Data);\n\n // Load the PDF\n const pdfDoc = await PDFDocument.load(pdfBytes);\n const totalPages = pdfDoc.getPageCount();\n\n const chunks: string[] = [];\n\n for (const [startPage, endPage] of pageRanges) {\n // Validate page range\n if (startPage < 1 || endPage > totalPages || startPage > endPage) {\n throw new Error(\n `Invalid page range [${startPage}, ${endPage}] for PDF with ${totalPages} pages. ` +\n `Page numbers must be 1-indexed and within bounds.`\n );\n }\n\n // Create new PDF with only these pages\n const chunkDoc = await PDFDocument.create();\n const pagesToCopy = Array.from(\n { length: endPage - startPage + 1 },\n (_, i) => startPage - 1 + i // Convert to 0-indexed\n );\n\n const copiedPages = await chunkDoc.copyPages(pdfDoc, pagesToCopy);\n copiedPages.forEach(page => chunkDoc.addPage(page));\n\n // Serialize to base64 using Edge Runtime compatible adapter\n const chunkBytes = await chunkDoc.save();\n const chunkBase64 = uint8ArrayToBase64(chunkBytes);\n chunks.push(`data:application/pdf;base64,${chunkBase64}`);\n }\n\n return chunks;\n}\n\n/**\n * Get the page count from a DocumentIR, with fallback logic\n *\n * This helper function checks multiple sources for page count:\n * 1. `extras.pageCount` (explicit page count from provider or PDF analysis)\n * 2. `pages.length` (fallback - number of pages in the IR)\n *\n * Note: For Unsiloed provider, `pages.length` represents semantic chunks,\n * not traditional pages. Use `extras.totalSemanticChunks` to distinguish.\n *\n * @param ir - DocumentIR to get page count from\n * @returns Page count (or chunk count for Unsiloed)\n *\n * @example\n * ```typescript\n * const ir = await parseNode.run(pdfUrl, { provider: ocrProvider });\n * const pageCount = getDocumentPageCount(ir);\n * console.log(`Document has ${pageCount} pages`);\n * ```\n */\nexport function getDocumentPageCount(ir: DocumentIR): number {\n // Prefer explicit pageCount from extras\n if (ir.extras?.pageCount !== undefined) {\n return ir.extras.pageCount;\n }\n\n // Fallback to pages array length\n return ir.pages.length;\n}\n\n/**\n * Get total page count across multiple DocumentIR objects (chunked results)\n *\n * For chunked parsing results, this sums up the page counts across all chunks.\n * It respects `extras.pageCount` if available, otherwise uses `pages.length`.\n *\n * @param irArray - Array of DocumentIR objects from chunked parsing\n * @returns Total page count across all chunks\n *\n * @example\n * ```typescript\n * const chunks = await parseNode.run(largePdfUrl, {\n * provider: ocrProvider,\n * chunked: { maxPagesPerChunk: 10 }\n * });\n * const totalPages = getTotalPageCount(chunks);\n * console.log(`Total pages across ${chunks.length} chunks: ${totalPages}`);\n * ```\n */\nexport function getTotalPageCount(irArray: DocumentIR[]): number {\n return irArray.reduce((sum, ir) => sum + getDocumentPageCount(ir), 0);\n}\n\n/**\n * Get comprehensive page-related metadata from a DocumentIR\n *\n * Returns detailed information about page counts, chunk information,\n * and whether the result is chunked or a complete document.\n *\n * @param ir - DocumentIR to analyze\n * @returns Metadata object with page count details\n *\n * @example\n * ```typescript\n * const metadata = getPageCountMetadata(ir);\n * console.log(`Document has ${metadata.pageCount} pages`);\n * if (metadata.isChunked) {\n * console.log(`This is chunk ${metadata.chunkIndex + 1} of ${metadata.totalChunks}`);\n * console.log(`Contains pages ${metadata.pageRange[0]} to ${metadata.pageRange[1]}`);\n * }\n * ```\n */\nexport function getPageCountMetadata(ir: DocumentIR): {\n /** Total page count (or chunk count for Unsiloed) */\n pageCount: number;\n /** Number of pages in the IR (may differ from pageCount for chunked docs) */\n pagesInIR: number;\n /** Whether this is a chunked result */\n isChunked: boolean;\n /** For chunked results: which chunk this is (0-indexed) */\n chunkIndex?: number;\n /** For chunked results: total number of chunks */\n totalChunks?: number;\n /** For chunked results: page range [start, end] (1-indexed, inclusive) */\n pageRange?: [number, number];\n /** For Unsiloed: total semantic chunks */\n totalSemanticChunks?: number;\n /** Whether this is from Unsiloed (semantic chunking, not traditional pages) */\n isSemanticChunking: boolean;\n} {\n const pagesInIR = ir.pages.length;\n const pageCount = ir.extras?.pageCount ?? pagesInIR;\n const isSemanticChunking = ir.extras?.totalSemanticChunks !== undefined;\n const isChunked = ir.extras?.chunkIndex !== undefined && ir.extras?.totalChunks !== undefined;\n\n return {\n pageCount,\n pagesInIR,\n isChunked,\n chunkIndex: ir.extras?.chunkIndex as number | undefined,\n totalChunks: ir.extras?.totalChunks as number | undefined,\n pageRange: ir.extras?.pageRange as [number, number] | undefined,\n totalSemanticChunks: ir.extras?.totalSemanticChunks as number | undefined,\n isSemanticChunking\n };\n}\n","/**\n * Universal Base64 Adapter\n *\n * Provides base64 encoding/decoding for both Node.js and Edge Runtime.\n * Replaces Node.js Buffer usage with Web APIs for Edge compatibility.\n *\n * @module @doclo/core/runtime/base64\n */\n\n/**\n * Convert ArrayBuffer to base64 string\n *\n * Uses different strategies depending on runtime:\n * - Edge Runtime / Browser: btoa() with binary string conversion\n * - Node.js: Buffer.toString('base64')\n *\n * @param buffer - ArrayBuffer to encode\n * @returns Base64 encoded string\n *\n * @example\n * ```typescript\n * const buffer = new Uint8Array([72, 101, 108, 108, 111]).buffer;\n * const base64 = arrayBufferToBase64(buffer); // \"SGVsbG8=\"\n * ```\n */\nexport function arrayBufferToBase64(buffer: ArrayBuffer): string {\n // Node.js: Use Buffer for best performance\n if (typeof Buffer !== 'undefined') {\n return Buffer.from(buffer).toString('base64');\n }\n\n // Edge Runtime / Browser: Use btoa() with binary string\n const bytes = new Uint8Array(buffer);\n let binary = '';\n for (let i = 0; i < bytes.byteLength; i++) {\n binary += String.fromCharCode(bytes[i]);\n }\n return btoa(binary);\n}\n\n/**\n * Convert base64 string to ArrayBuffer\n *\n * Uses different strategies depending on runtime:\n * - Edge Runtime / Browser: atob() with Uint8Array conversion\n * - Node.js: Buffer.from(base64, 'base64')\n *\n * @param base64 - Base64 encoded string (with or without data URI prefix)\n * @returns Decoded ArrayBuffer\n *\n * @example\n * ```typescript\n * const buffer = base64ToArrayBuffer(\"SGVsbG8=\");\n * const text = new TextDecoder().decode(buffer); // \"Hello\"\n *\n * // Also handles data URIs\n * const buffer2 = base64ToArrayBuffer(\"data:image/png;base64,iVBORw0KG...\");\n * ```\n */\nexport function base64ToArrayBuffer(base64: string): ArrayBuffer {\n // Remove data URI prefix if present\n const cleanBase64 = base64.replace(/^data:[^;]+;base64,/, '');\n\n // Node.js: Use Buffer for best performance\n if (typeof Buffer !== 'undefined') {\n const buffer = Buffer.from(cleanBase64, 'base64');\n // Convert Node.js Buffer to ArrayBuffer\n return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);\n }\n\n // Edge Runtime / Browser: Use atob()\n const binaryString = atob(cleanBase64);\n const bytes = new Uint8Array(binaryString.length);\n for (let i = 0; i < binaryString.length; i++) {\n bytes[i] = binaryString.charCodeAt(i);\n }\n return bytes.buffer;\n}\n\n/**\n * Convert Uint8Array to base64 string\n *\n * Convenience wrapper around arrayBufferToBase64 for Uint8Array inputs.\n *\n * @param bytes - Uint8Array to encode\n * @returns Base64 encoded string\n */\nexport function uint8ArrayToBase64(bytes: Uint8Array): string {\n return arrayBufferToBase64(bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength) as ArrayBuffer);\n}\n\n/**\n * Convert base64 string to Uint8Array\n *\n * Convenience wrapper around base64ToArrayBuffer with Uint8Array result.\n *\n * @param base64 - Base64 encoded string\n * @returns Decoded Uint8Array\n */\nexport function base64ToUint8Array(base64: string): Uint8Array {\n return new Uint8Array(base64ToArrayBuffer(base64));\n}\n\n/**\n * Create a data URI from ArrayBuffer\n *\n * @param buffer - Data to encode\n * @param mimeType - MIME type (default: application/octet-stream)\n * @returns Data URI string\n *\n * @example\n * ```typescript\n * const buffer = new TextEncoder().encode(\"Hello, World!\");\n * const dataUri = createDataUri(buffer.buffer, 'text/plain');\n * // \"data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==\"\n * ```\n */\nexport function createDataUri(buffer: ArrayBuffer, mimeType = 'application/octet-stream'): string {\n const base64 = arrayBufferToBase64(buffer);\n return `data:${mimeType};base64,${base64}`;\n}\n\n/**\n * Check if a string is a valid data URI\n *\n * @param input - String to check\n * @returns True if valid data URI format\n */\nexport function isDataUri(input: string): boolean {\n return /^data:[^;,]+;base64,/.test(input);\n}\n\n/**\n * Extract MIME type from data URI\n *\n * @param dataUri - Data URI string\n * @returns MIME type or null if invalid\n *\n * @example\n * ```typescript\n * const mime = extractMimeType(\"data:image/png;base64,iVBOR...\");\n * console.log(mime); // \"image/png\"\n * ```\n */\nexport function extractMimeType(dataUri: string): string | null {\n const match = dataUri.match(/^data:([^;,]+);base64,/);\n return match ? match[1] : null;\n}\n"],"mappings":";AAOA,SAAS,mBAAmB;;;ACkBrB,SAAS,oBAAoB,QAA6B;AAE/D,MAAI,OAAO,WAAW,aAAa;AACjC,WAAO,OAAO,KAAK,MAAM,EAAE,SAAS,QAAQ;AAAA,EAC9C;AAGA,QAAM,QAAQ,IAAI,WAAW,MAAM;AACnC,MAAI,SAAS;AACb,WAAS,IAAI,GAAG,IAAI,MAAM,YAAY,KAAK;AACzC,cAAU,OAAO,aAAa,MAAM,CAAC,CAAC;AAAA,EACxC;AACA,SAAO,KAAK,MAAM;AACpB;AAqBO,SAAS,oBAAoB,QAA6B;AAE/D,QAAM,cAAc,OAAO,QAAQ,uBAAuB,EAAE;AAG5D,MAAI,OAAO,WAAW,aAAa;AACjC,UAAM,SAAS,OAAO,KAAK,aAAa,QAAQ;AAEhD,WAAO,OAAO,OAAO,MAAM,OAAO,YAAY,OAAO,aAAa,OAAO,UAAU;AAAA,EACrF;AAGA,QAAM,eAAe,KAAK,WAAW;AACrC,QAAM,QAAQ,IAAI,WAAW,aAAa,MAAM;AAChD,WAAS,IAAI,GAAG,IAAI,aAAa,QAAQ,KAAK;AAC5C,UAAM,CAAC,IAAI,aAAa,WAAW,CAAC;AAAA,EACtC;AACA,SAAO,MAAM;AACf;AAUO,SAAS,mBAAmB,OAA2B;AAC5D,SAAO,oBAAoB,MAAM,OAAO,MAAM,MAAM,YAAY,MAAM,aAAa,MAAM,UAAU,CAAgB;AACrH;;;ADjEA,eAAsB,gBAAgB,SAAkC;AACtE,QAAM,cAAc,QAAQ,MAAM,qCAAqC;AACvE,MAAI,CAAC,aAAa;AAChB,UAAM,IAAI,MAAM,iFAAiF;AAAA,EACnG;AAEA,QAAM,aAAa,YAAY,CAAC;AAChC,QAAM,WAAW,oBAAoB,UAAU;AAC/C,QAAM,SAAS,MAAM,YAAY,KAAK,QAAQ;AAC9C,SAAO,OAAO,aAAa;AAC7B;AAqBA,eAAsB,mBACpB,SACA,YACmB;AAEnB,QAAM,cAAc,QAAQ,MAAM,qCAAqC;AACvE,MAAI,CAAC,aAAa;AAChB,UAAM,IAAI,MAAM,iFAAiF;AAAA,EACnG;AAEA,QAAM,aAAa,YAAY,CAAC;AAChC,QAAM,WAAW,oBAAoB,UAAU;AAG/C,QAAM,SAAS,MAAM,YAAY,KAAK,QAAQ;AAC9C,QAAM,aAAa,OAAO,aAAa;AAEvC,QAAM,SAAmB,CAAC;AAE1B,aAAW,CAAC,WAAW,OAAO,KAAK,YAAY;AAE7C,QAAI,YAAY,KAAK,UAAU,cAAc,YAAY,SAAS;AAChE,YAAM,IAAI;AAAA,QACR,uBAAuB,SAAS,KAAK,OAAO,kBAAkB,UAAU;AAAA,MAE1E;AAAA,IACF;AAGA,UAAM,WAAW,MAAM,YAAY,OAAO;AAC1C,UAAM,cAAc,MAAM;AAAA,MACxB,EAAE,QAAQ,UAAU,YAAY,EAAE;AAAA,MAClC,CAAC,GAAG,MAAM,YAAY,IAAI;AAAA;AAAA,IAC5B;AAEA,UAAM,cAAc,MAAM,SAAS,UAAU,QAAQ,WAAW;AAChE,gBAAY,QAAQ,UAAQ,SAAS,QAAQ,IAAI,CAAC;AAGlD,UAAM,aAAa,MAAM,SAAS,KAAK;AACvC,UAAM,cAAc,mBAAmB,UAAU;AACjD,WAAO,KAAK,+BAA+B,WAAW,EAAE;AAAA,EAC1D;AAEA,SAAO;AACT;AAsBO,SAAS,qBAAqB,IAAwB;AAE3D,MAAI,GAAG,QAAQ,cAAc,QAAW;AACtC,WAAO,GAAG,OAAO;AAAA,EACnB;AAGA,SAAO,GAAG,MAAM;AAClB;AAqBO,SAAS,kBAAkB,SAA+B;AAC/D,SAAO,QAAQ,OAAO,CAAC,KAAK,OAAO,MAAM,qBAAqB,EAAE,GAAG,CAAC;AACtE;AAqBO,SAAS,qBAAqB,IAiBnC;AACA,QAAM,YAAY,GAAG,MAAM;AAC3B,QAAM,YAAY,GAAG,QAAQ,aAAa;AAC1C,QAAM,qBAAqB,GAAG,QAAQ,wBAAwB;AAC9D,QAAM,YAAY,GAAG,QAAQ,eAAe,UAAa,GAAG,QAAQ,gBAAgB;AAEpF,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA,YAAY,GAAG,QAAQ;AAAA,IACvB,aAAa,GAAG,QAAQ;AAAA,IACxB,WAAW,GAAG,QAAQ;AAAA,IACtB,qBAAqB,GAAG,QAAQ;AAAA,IAChC;AAAA,EACF;AACF;","names":[]}
1
+ {"version":3,"sources":["../src/pdf-utils.ts","../src/runtime/base64.ts"],"sourcesContent":["/**\n * PDF Utilities\n *\n * Edge Runtime compatible PDF manipulation utilities using pdf-lib.\n * These functions work in Node.js, Vercel Edge Functions, Cloudflare Workers, and browsers.\n */\n\nimport { PDFDocument } from 'pdf-lib';\nimport { base64ToArrayBuffer, uint8ArrayToBase64 } from './runtime/base64.js';\nimport type { DocumentIR } from './internal/validation-utils.js';\n\n/**\n * Extract raw base64 data from PDF input.\n * Accepts both data URL format and raw base64 strings.\n *\n * @param input - PDF data URL (data:application/pdf;base64,...) or raw base64 string\n * @returns Raw base64 string without data URL prefix\n */\nfunction extractPDFBase64(input: string): string {\n // If it's a data URL, extract the base64 part\n const dataUrlMatch = input.match(/^data:application\\/pdf;base64,(.+)$/);\n if (dataUrlMatch) {\n return dataUrlMatch[1];\n }\n\n // Otherwise assume it's raw base64\n return input;\n}\n\n/**\n * Get the total number of pages in a PDF document\n *\n * @param input - PDF data URL (data:application/pdf;base64,...) or raw base64 string\n * @returns Total page count\n * @throws {Error} If the input is not valid PDF data\n *\n * @example\n * ```typescript\n * // With data URL\n * const pageCount = await getPDFPageCount('data:application/pdf;base64,JVBERi0...');\n *\n * // With raw base64\n * const pageCount = await getPDFPageCount('JVBERi0xLjQK...');\n *\n * console.log(`PDF has ${pageCount} pages`);\n * ```\n */\nexport async function getPDFPageCount(input: string): Promise<number> {\n const base64Data = extractPDFBase64(input);\n const pdfBytes = base64ToArrayBuffer(base64Data);\n const pdfDoc = await PDFDocument.load(pdfBytes);\n return pdfDoc.getPageCount();\n}\n\n/**\n * Split a PDF into multiple smaller PDFs based on page ranges\n *\n * @param input - PDF data URL (data:application/pdf;base64,...) or raw base64 string\n * @param pageRanges - Array of [startPage, endPage] tuples (1-indexed, inclusive)\n * @returns Array of PDF data URLs, one for each page range\n * @throws {Error} If the input is not valid PDF data or page ranges are invalid\n *\n * @example\n * ```typescript\n * // Split a 10-page PDF into three chunks\n * const chunks = await splitPDFIntoChunks(pdfDataUrl, [\n * [1, 3], // Pages 1-3\n * [4, 7], // Pages 4-7\n * [8, 10] // Pages 8-10\n * ]);\n * console.log(`Created ${chunks.length} PDF chunks`);\n * ```\n */\nexport async function splitPDFIntoChunks(\n input: string,\n pageRanges: Array<[number, number]>\n): Promise<string[]> {\n // Extract base64 data (accepts both data URL and raw base64)\n const base64Data = extractPDFBase64(input);\n const pdfBytes = base64ToArrayBuffer(base64Data);\n\n // Load the PDF\n const pdfDoc = await PDFDocument.load(pdfBytes);\n const totalPages = pdfDoc.getPageCount();\n\n const chunks: string[] = [];\n\n for (const [startPage, endPage] of pageRanges) {\n // Validate page range\n if (startPage < 1 || endPage > totalPages || startPage > endPage) {\n throw new Error(\n `Invalid page range [${startPage}, ${endPage}] for PDF with ${totalPages} pages. ` +\n `Page numbers must be 1-indexed and within bounds.`\n );\n }\n\n // Create new PDF with only these pages\n const chunkDoc = await PDFDocument.create();\n const pagesToCopy = Array.from(\n { length: endPage - startPage + 1 },\n (_, i) => startPage - 1 + i // Convert to 0-indexed\n );\n\n const copiedPages = await chunkDoc.copyPages(pdfDoc, pagesToCopy);\n copiedPages.forEach(page => chunkDoc.addPage(page));\n\n // Serialize to base64 using Edge Runtime compatible adapter\n const chunkBytes = await chunkDoc.save();\n const chunkBase64 = uint8ArrayToBase64(chunkBytes);\n chunks.push(`data:application/pdf;base64,${chunkBase64}`);\n }\n\n return chunks;\n}\n\n/**\n * Get the page count from a DocumentIR, with fallback logic\n *\n * This helper function checks multiple sources for page count:\n * 1. `extras.pageCount` (explicit page count from provider or PDF analysis)\n * 2. `pages.length` (fallback - number of pages in the IR)\n *\n * Note: For Unsiloed provider, `pages.length` represents semantic chunks,\n * not traditional pages. Use `extras.totalSemanticChunks` to distinguish.\n *\n * @param ir - DocumentIR to get page count from\n * @returns Page count (or chunk count for Unsiloed)\n *\n * @example\n * ```typescript\n * const ir = await parseNode.run(pdfUrl, { provider: ocrProvider });\n * const pageCount = getDocumentPageCount(ir);\n * console.log(`Document has ${pageCount} pages`);\n * ```\n */\nexport function getDocumentPageCount(ir: DocumentIR): number {\n // Prefer explicit pageCount from extras\n if (ir.extras?.pageCount !== undefined) {\n return ir.extras.pageCount;\n }\n\n // Fallback to pages array length\n return ir.pages.length;\n}\n\n/**\n * Get total page count across multiple DocumentIR objects (chunked results)\n *\n * For chunked parsing results, this sums up the page counts across all chunks.\n * It respects `extras.pageCount` if available, otherwise uses `pages.length`.\n *\n * @param irArray - Array of DocumentIR objects from chunked parsing\n * @returns Total page count across all chunks\n *\n * @example\n * ```typescript\n * const chunks = await parseNode.run(largePdfUrl, {\n * provider: ocrProvider,\n * chunked: { maxPagesPerChunk: 10 }\n * });\n * const totalPages = getTotalPageCount(chunks);\n * console.log(`Total pages across ${chunks.length} chunks: ${totalPages}`);\n * ```\n */\nexport function getTotalPageCount(irArray: DocumentIR[]): number {\n return irArray.reduce((sum, ir) => sum + getDocumentPageCount(ir), 0);\n}\n\n/**\n * Get comprehensive page-related metadata from a DocumentIR\n *\n * Returns detailed information about page counts, chunk information,\n * and whether the result is chunked or a complete document.\n *\n * @param ir - DocumentIR to analyze\n * @returns Metadata object with page count details\n *\n * @example\n * ```typescript\n * const metadata = getPageCountMetadata(ir);\n * console.log(`Document has ${metadata.pageCount} pages`);\n * if (metadata.isChunked) {\n * console.log(`This is chunk ${metadata.chunkIndex + 1} of ${metadata.totalChunks}`);\n * console.log(`Contains pages ${metadata.pageRange[0]} to ${metadata.pageRange[1]}`);\n * }\n * ```\n */\nexport function getPageCountMetadata(ir: DocumentIR): {\n /** Total page count (or chunk count for Unsiloed) */\n pageCount: number;\n /** Number of pages in the IR (may differ from pageCount for chunked docs) */\n pagesInIR: number;\n /** Whether this is a chunked result */\n isChunked: boolean;\n /** For chunked results: which chunk this is (0-indexed) */\n chunkIndex?: number;\n /** For chunked results: total number of chunks */\n totalChunks?: number;\n /** For chunked results: page range [start, end] (1-indexed, inclusive) */\n pageRange?: [number, number];\n /** For Unsiloed: total semantic chunks */\n totalSemanticChunks?: number;\n /** Whether this is from Unsiloed (semantic chunking, not traditional pages) */\n isSemanticChunking: boolean;\n} {\n const pagesInIR = ir.pages.length;\n const pageCount = ir.extras?.pageCount ?? pagesInIR;\n const isSemanticChunking = ir.extras?.totalSemanticChunks !== undefined;\n const isChunked = ir.extras?.chunkIndex !== undefined && ir.extras?.totalChunks !== undefined;\n\n return {\n pageCount,\n pagesInIR,\n isChunked,\n chunkIndex: ir.extras?.chunkIndex as number | undefined,\n totalChunks: ir.extras?.totalChunks as number | undefined,\n pageRange: ir.extras?.pageRange as [number, number] | undefined,\n totalSemanticChunks: ir.extras?.totalSemanticChunks as number | undefined,\n isSemanticChunking\n };\n}\n","/**\n * Universal Base64 Adapter\n *\n * Provides base64 encoding/decoding for both Node.js and Edge Runtime.\n * Replaces Node.js Buffer usage with Web APIs for Edge compatibility.\n *\n * @module @doclo/core/runtime/base64\n */\n\n/**\n * Convert ArrayBuffer to base64 string\n *\n * Uses different strategies depending on runtime:\n * - Edge Runtime / Browser: btoa() with binary string conversion\n * - Node.js: Buffer.toString('base64')\n *\n * @param buffer - ArrayBuffer to encode\n * @returns Base64 encoded string\n *\n * @example\n * ```typescript\n * const buffer = new Uint8Array([72, 101, 108, 108, 111]).buffer;\n * const base64 = arrayBufferToBase64(buffer); // \"SGVsbG8=\"\n * ```\n */\nexport function arrayBufferToBase64(buffer: ArrayBuffer): string {\n // Node.js: Use Buffer for best performance\n if (typeof Buffer !== 'undefined') {\n return Buffer.from(buffer).toString('base64');\n }\n\n // Edge Runtime / Browser: Use btoa() with binary string\n const bytes = new Uint8Array(buffer);\n let binary = '';\n for (let i = 0; i < bytes.byteLength; i++) {\n binary += String.fromCharCode(bytes[i]);\n }\n return btoa(binary);\n}\n\n/**\n * Convert base64 string to ArrayBuffer\n *\n * Uses different strategies depending on runtime:\n * - Edge Runtime / Browser: atob() with Uint8Array conversion\n * - Node.js: Buffer.from(base64, 'base64')\n *\n * @param base64 - Base64 encoded string (with or without data URI prefix)\n * @returns Decoded ArrayBuffer\n *\n * @example\n * ```typescript\n * const buffer = base64ToArrayBuffer(\"SGVsbG8=\");\n * const text = new TextDecoder().decode(buffer); // \"Hello\"\n *\n * // Also handles data URIs\n * const buffer2 = base64ToArrayBuffer(\"data:image/png;base64,iVBORw0KG...\");\n * ```\n */\nexport function base64ToArrayBuffer(base64: string): ArrayBuffer {\n // Remove data URI prefix if present\n const cleanBase64 = base64.replace(/^data:[^;]+;base64,/, '');\n\n // Node.js: Use Buffer for best performance\n if (typeof Buffer !== 'undefined') {\n const buffer = Buffer.from(cleanBase64, 'base64');\n // Convert Node.js Buffer to ArrayBuffer\n return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);\n }\n\n // Edge Runtime / Browser: Use atob()\n const binaryString = atob(cleanBase64);\n const bytes = new Uint8Array(binaryString.length);\n for (let i = 0; i < binaryString.length; i++) {\n bytes[i] = binaryString.charCodeAt(i);\n }\n return bytes.buffer;\n}\n\n/**\n * Convert Uint8Array to base64 string\n *\n * Convenience wrapper around arrayBufferToBase64 for Uint8Array inputs.\n *\n * @param bytes - Uint8Array to encode\n * @returns Base64 encoded string\n */\nexport function uint8ArrayToBase64(bytes: Uint8Array): string {\n return arrayBufferToBase64(bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength) as ArrayBuffer);\n}\n\n/**\n * Convert base64 string to Uint8Array\n *\n * Convenience wrapper around base64ToArrayBuffer with Uint8Array result.\n *\n * @param base64 - Base64 encoded string\n * @returns Decoded Uint8Array\n */\nexport function base64ToUint8Array(base64: string): Uint8Array {\n return new Uint8Array(base64ToArrayBuffer(base64));\n}\n\n/**\n * Create a data URI from ArrayBuffer\n *\n * @param buffer - Data to encode\n * @param mimeType - MIME type (default: application/octet-stream)\n * @returns Data URI string\n *\n * @example\n * ```typescript\n * const buffer = new TextEncoder().encode(\"Hello, World!\");\n * const dataUri = createDataUri(buffer.buffer, 'text/plain');\n * // \"data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==\"\n * ```\n */\nexport function createDataUri(buffer: ArrayBuffer, mimeType = 'application/octet-stream'): string {\n const base64 = arrayBufferToBase64(buffer);\n return `data:${mimeType};base64,${base64}`;\n}\n\n/**\n * Check if a string is a valid data URI\n *\n * @param input - String to check\n * @returns True if valid data URI format\n */\nexport function isDataUri(input: string): boolean {\n return /^data:[^;,]+;base64,/.test(input);\n}\n\n/**\n * Extract MIME type from data URI\n *\n * @param dataUri - Data URI string\n * @returns MIME type or null if invalid\n *\n * @example\n * ```typescript\n * const mime = extractMimeType(\"data:image/png;base64,iVBOR...\");\n * console.log(mime); // \"image/png\"\n * ```\n */\nexport function extractMimeType(dataUri: string): string | null {\n const match = dataUri.match(/^data:([^;,]+);base64,/);\n return match ? match[1] : null;\n}\n"],"mappings":";AAOA,SAAS,mBAAmB;;;ACkBrB,SAAS,oBAAoB,QAA6B;AAE/D,MAAI,OAAO,WAAW,aAAa;AACjC,WAAO,OAAO,KAAK,MAAM,EAAE,SAAS,QAAQ;AAAA,EAC9C;AAGA,QAAM,QAAQ,IAAI,WAAW,MAAM;AACnC,MAAI,SAAS;AACb,WAAS,IAAI,GAAG,IAAI,MAAM,YAAY,KAAK;AACzC,cAAU,OAAO,aAAa,MAAM,CAAC,CAAC;AAAA,EACxC;AACA,SAAO,KAAK,MAAM;AACpB;AAqBO,SAAS,oBAAoB,QAA6B;AAE/D,QAAM,cAAc,OAAO,QAAQ,uBAAuB,EAAE;AAG5D,MAAI,OAAO,WAAW,aAAa;AACjC,UAAM,SAAS,OAAO,KAAK,aAAa,QAAQ;AAEhD,WAAO,OAAO,OAAO,MAAM,OAAO,YAAY,OAAO,aAAa,OAAO,UAAU;AAAA,EACrF;AAGA,QAAM,eAAe,KAAK,WAAW;AACrC,QAAM,QAAQ,IAAI,WAAW,aAAa,MAAM;AAChD,WAAS,IAAI,GAAG,IAAI,aAAa,QAAQ,KAAK;AAC5C,UAAM,CAAC,IAAI,aAAa,WAAW,CAAC;AAAA,EACtC;AACA,SAAO,MAAM;AACf;AAUO,SAAS,mBAAmB,OAA2B;AAC5D,SAAO,oBAAoB,MAAM,OAAO,MAAM,MAAM,YAAY,MAAM,aAAa,MAAM,UAAU,CAAgB;AACrH;;;ADvEA,SAAS,iBAAiB,OAAuB;AAE/C,QAAM,eAAe,MAAM,MAAM,qCAAqC;AACtE,MAAI,cAAc;AAChB,WAAO,aAAa,CAAC;AAAA,EACvB;AAGA,SAAO;AACT;AAoBA,eAAsB,gBAAgB,OAAgC;AACpE,QAAM,aAAa,iBAAiB,KAAK;AACzC,QAAM,WAAW,oBAAoB,UAAU;AAC/C,QAAM,SAAS,MAAM,YAAY,KAAK,QAAQ;AAC9C,SAAO,OAAO,aAAa;AAC7B;AAqBA,eAAsB,mBACpB,OACA,YACmB;AAEnB,QAAM,aAAa,iBAAiB,KAAK;AACzC,QAAM,WAAW,oBAAoB,UAAU;AAG/C,QAAM,SAAS,MAAM,YAAY,KAAK,QAAQ;AAC9C,QAAM,aAAa,OAAO,aAAa;AAEvC,QAAM,SAAmB,CAAC;AAE1B,aAAW,CAAC,WAAW,OAAO,KAAK,YAAY;AAE7C,QAAI,YAAY,KAAK,UAAU,cAAc,YAAY,SAAS;AAChE,YAAM,IAAI;AAAA,QACR,uBAAuB,SAAS,KAAK,OAAO,kBAAkB,UAAU;AAAA,MAE1E;AAAA,IACF;AAGA,UAAM,WAAW,MAAM,YAAY,OAAO;AAC1C,UAAM,cAAc,MAAM;AAAA,MACxB,EAAE,QAAQ,UAAU,YAAY,EAAE;AAAA,MAClC,CAAC,GAAG,MAAM,YAAY,IAAI;AAAA;AAAA,IAC5B;AAEA,UAAM,cAAc,MAAM,SAAS,UAAU,QAAQ,WAAW;AAChE,gBAAY,QAAQ,UAAQ,SAAS,QAAQ,IAAI,CAAC;AAGlD,UAAM,aAAa,MAAM,SAAS,KAAK;AACvC,UAAM,cAAc,mBAAmB,UAAU;AACjD,WAAO,KAAK,+BAA+B,WAAW,EAAE;AAAA,EAC1D;AAEA,SAAO;AACT;AAsBO,SAAS,qBAAqB,IAAwB;AAE3D,MAAI,GAAG,QAAQ,cAAc,QAAW;AACtC,WAAO,GAAG,OAAO;AAAA,EACnB;AAGA,SAAO,GAAG,MAAM;AAClB;AAqBO,SAAS,kBAAkB,SAA+B;AAC/D,SAAO,QAAQ,OAAO,CAAC,KAAK,OAAO,MAAM,qBAAqB,EAAE,GAAG,CAAC;AACtE;AAqBO,SAAS,qBAAqB,IAiBnC;AACA,QAAM,YAAY,GAAG,MAAM;AAC3B,QAAM,YAAY,GAAG,QAAQ,aAAa;AAC1C,QAAM,qBAAqB,GAAG,QAAQ,wBAAwB;AAC9D,QAAM,YAAY,GAAG,QAAQ,eAAe,UAAa,GAAG,QAAQ,gBAAgB;AAEpF,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA,YAAY,GAAG,QAAQ;AAAA,IACvB,aAAa,GAAG,QAAQ;AAAA,IACxB,WAAW,GAAG,QAAQ;AAAA,IACtB,qBAAqB,GAAG,QAAQ;AAAA,IAChC;AAAA,EACF;AACF;","names":[]}
@@ -148,14 +148,20 @@ type MultimodalInput = {
148
148
  base64?: string;
149
149
  fileId?: string;
150
150
  }>;
151
+ /** Optional system prompt (text-only, prepended to conversation) */
152
+ systemPrompt?: string;
151
153
  };
154
+ /** Effort level type for reasoning configuration */
155
+ type ReasoningEffort = 'xhigh' | 'high' | 'medium' | 'low' | 'minimal' | 'none';
152
156
  /** Reasoning configuration (normalized across providers) */
153
157
  type ReasoningConfig = {
154
- /** Reasoning effort level: low (20% budget), medium (50%), high (80%) */
155
- effort?: 'low' | 'medium' | 'high';
158
+ /** Effort level - normalized across providers (xhigh: 95%, high: 80%, medium: 50%, low: 20%, minimal: 10%, none: 0%) */
159
+ effort?: ReasoningEffort;
160
+ /** Direct token budget - used by Anthropic/Google/Qwen models */
161
+ max_tokens?: number;
156
162
  /** Exclude reasoning tokens from response (only use for accuracy, not visible) */
157
163
  exclude?: boolean;
158
- /** Enable reasoning with default (medium) effort */
164
+ /** Enable reasoning with default (medium) effort. Set to false to explicitly disable. */
159
165
  enabled?: boolean;
160
166
  };
161
167
  /** Base LLM provider (text-only) */
@@ -179,6 +185,14 @@ type LLMProvider = {
179
185
  cacheReadInputTokens?: number;
180
186
  }>;
181
187
  };
188
+ /** Text completion response (for non-JSON outputs like JSX/code) */
189
+ type TextResponse = {
190
+ text: string;
191
+ rawText?: string;
192
+ inputTokens?: number;
193
+ outputTokens?: number;
194
+ costUSD?: number;
195
+ };
182
196
  /** Vision-capable LLM provider */
183
197
  type VLMProvider = {
184
198
  /** Full 3-layer identity (provider/model/method) */
@@ -199,6 +213,15 @@ type VLMProvider = {
199
213
  cacheCreationInputTokens?: number;
200
214
  cacheReadInputTokens?: number;
201
215
  }>;
216
+ /**
217
+ * Complete a text prompt without JSON mode (optional).
218
+ * Use this when you need raw text output (JSX, code, markdown, etc.)
219
+ */
220
+ completeText?: (input: {
221
+ input: MultimodalInput;
222
+ max_tokens?: number;
223
+ reasoning?: ReasoningConfig;
224
+ }) => Promise<TextResponse>;
202
225
  capabilities: {
203
226
  supportsImages: true;
204
227
  supportsPDFs: boolean;
@@ -584,7 +607,8 @@ type ParseNodeConfig = {
584
607
  * ```
585
608
  */
586
609
  reasoning?: {
587
- effort?: 'low' | 'medium' | 'high';
610
+ effort?: 'xhigh' | 'high' | 'medium' | 'low' | 'minimal' | 'none';
611
+ max_tokens?: number;
588
612
  exclude?: boolean;
589
613
  enabled?: boolean;
590
614
  };
@@ -637,7 +661,8 @@ type SplitNodeConfig = {
637
661
  * ```
638
662
  */
639
663
  reasoning?: {
640
- effort?: 'low' | 'medium' | 'high';
664
+ effort?: 'xhigh' | 'high' | 'medium' | 'low' | 'minimal' | 'none';
665
+ max_tokens?: number;
641
666
  exclude?: boolean;
642
667
  enabled?: boolean;
643
668
  };
@@ -694,7 +719,8 @@ type CategorizeNodeConfig = {
694
719
  * ```
695
720
  */
696
721
  reasoning?: {
697
- effort?: 'low' | 'medium' | 'high';
722
+ effort?: 'xhigh' | 'high' | 'medium' | 'low' | 'minimal' | 'none';
723
+ max_tokens?: number;
698
724
  exclude?: boolean;
699
725
  enabled?: boolean;
700
726
  };
@@ -724,7 +750,8 @@ type ExtractNodeConfig<T = any> = {
724
750
  };
725
751
  consensus?: ConsensusConfig;
726
752
  reasoning?: {
727
- effort?: 'low' | 'medium' | 'high';
753
+ effort?: 'xhigh' | 'high' | 'medium' | 'low' | 'minimal' | 'none';
754
+ max_tokens?: number;
728
755
  exclude?: boolean;
729
756
  enabled?: boolean;
730
757
  };
@@ -1295,4 +1322,4 @@ declare const RESERVED_VARIABLES: {
1295
1322
  */
1296
1323
  declare function protectReservedVariables(nodeType: 'extract' | 'categorize' | 'parse', userVariables: Record<string, any> | undefined, autoInjectedVariables: Record<string, any>): Record<string, any>;
1297
1324
 
1298
- export { type SegmentationResult as $, type AccessMethod as A, type BBox as B, type ConsensusConfig as C, type DocumentIR as D, type ExtractNodeConfig as E, type FieldVotingDetails as F, type AggregatedMetrics as G, type FlowContext as H, type IRLine as I, type NodeCtx as J, type NodeTypeInfo as K, type LLMProvider as L, type MultimodalInput as M, type NormalizedBBox as N, type OCRProvider as O, type ProviderVendor as P, type NodeDef as Q, type ReasoningConfig as R, type SplitDocument as S, type NodeTypeName as T, type CompatibilityRule as U, type VLMProvider as V, type ValidationResult as W, type JSONSchemaNode as X, type ProcessingMode as Y, type PageRangeOptions as Z, type LanguageOptions as _, type IRPage as a, type ExtractedImage as a0, type OCRProviderOptions as a1, type VLMProviderOptions as a2, type ProviderCitation as a3, type FlowStepLocation as a4, aggregateMetrics as a5, node as a6, runPipeline as a7, FlowExecutionError as a8, FlowValidationError as a9, NODE_COMPATIBILITY_MATRIX as aa, getNodeTypeName as ab, getNodeTypeInfo as ac, getCompatibleTargets as ad, getSuggestedConnections as ae, validateNodeConnection as af, getValidForEachStarters as ag, canStartForEachItemFlow as ah, validateJson as ai, RESERVED_VARIABLES as aj, protectReservedVariables as ak, extractErrorMessage as al, type ProviderIdentity as am, toProviderString as an, parseProviderString as ao, isLocalEndpoint as ap, createIdentity as aq, type OutputFormat as ar, type TableFormat as as, type ChunkingStrategy as at, type LLMDerivedOptions as au, type SupportedMimeType as av, type TraceContextLite as aw, type NodeObservabilityContext as ax, type DocumentIRExtras as b, type LLMJsonProvider as c, type ConsensusRunResult as d, type ConsensusMetadata as e, type OutputWithConsensus as f, type MaybeWithConsensusMetadata as g, type FlowInput as h, type FlowInputValidation as i, type FlowResult as j, type CitationSourceType as k, type LineCitation as l, type FieldCitation as m, type CitationConfig as n, type OutputWithCitations as o, type ParseNodeConfig as p, type SplitNodeConfig as q, type CategorizeNodeConfig as r, type ExtractInputMode as s, type ChunkMetadata as t, type ChunkOutput as u, type ChunkNodeConfig as v, type CombineNodeConfig as w, type OutputNodeConfig as x, type EnhancedExtractionSchema as y, type StepMetric as z };
1325
+ export { type LanguageOptions as $, type AccessMethod as A, type BBox as B, type ConsensusConfig as C, type DocumentIR as D, type ExtractNodeConfig as E, type FieldVotingDetails as F, type AggregatedMetrics as G, type FlowContext as H, type IRLine as I, type NodeCtx as J, type NodeTypeInfo as K, type LLMProvider as L, type MultimodalInput as M, type NormalizedBBox as N, type OCRProvider as O, type ProviderVendor as P, type NodeDef as Q, type ReasoningConfig as R, type SplitDocument as S, type TextResponse as T, type NodeTypeName as U, type VLMProvider as V, type CompatibilityRule as W, type ValidationResult as X, type JSONSchemaNode as Y, type ProcessingMode as Z, type PageRangeOptions as _, type IRPage as a, type SegmentationResult as a0, type ExtractedImage as a1, type OCRProviderOptions as a2, type VLMProviderOptions as a3, type ProviderCitation as a4, type FlowStepLocation as a5, aggregateMetrics as a6, node as a7, runPipeline as a8, FlowExecutionError as a9, FlowValidationError as aa, NODE_COMPATIBILITY_MATRIX as ab, getNodeTypeName as ac, getNodeTypeInfo as ad, getCompatibleTargets as ae, getSuggestedConnections as af, validateNodeConnection as ag, getValidForEachStarters as ah, canStartForEachItemFlow as ai, validateJson as aj, RESERVED_VARIABLES as ak, protectReservedVariables as al, extractErrorMessage as am, type ProviderIdentity as an, toProviderString as ao, parseProviderString as ap, isLocalEndpoint as aq, createIdentity as ar, type ReasoningEffort as as, type OutputFormat as at, type TableFormat as au, type ChunkingStrategy as av, type LLMDerivedOptions as aw, type SupportedMimeType as ax, type TraceContextLite as ay, type NodeObservabilityContext as az, type DocumentIRExtras as b, type LLMJsonProvider as c, type ConsensusRunResult as d, type ConsensusMetadata as e, type OutputWithConsensus as f, type MaybeWithConsensusMetadata as g, type FlowInput as h, type FlowInputValidation as i, type FlowResult as j, type CitationSourceType as k, type LineCitation as l, type FieldCitation as m, type CitationConfig as n, type OutputWithCitations as o, type ParseNodeConfig as p, type SplitNodeConfig as q, type CategorizeNodeConfig as r, type ExtractInputMode as s, type ChunkMetadata as t, type ChunkOutput as u, type ChunkNodeConfig as v, type CombineNodeConfig as w, type OutputNodeConfig as x, type EnhancedExtractionSchema as y, type StepMetric as z };
@@ -1 +1 @@
1
- export { U as CompatibilityRule, a9 as FlowValidationError, aa as NODE_COMPATIBILITY_MATRIX, J as NodeCtx, Q as NodeDef, K as NodeTypeInfo, T as NodeTypeName, W as ValidationResult, ah as canStartForEachItemFlow, ad as getCompatibleTargets, ac as getNodeTypeInfo, ab as getNodeTypeName, ae as getSuggestedConnections, ag as getValidForEachStarters, af as validateNodeConnection } from './validation-BQO54qAY.js';
1
+ export { W as CompatibilityRule, aa as FlowValidationError, ab as NODE_COMPATIBILITY_MATRIX, J as NodeCtx, Q as NodeDef, K as NodeTypeInfo, U as NodeTypeName, X as ValidationResult, ai as canStartForEachItemFlow, ae as getCompatibleTargets, ad as getNodeTypeInfo, ac as getNodeTypeName, af as getSuggestedConnections, ah as getValidForEachStarters, ag as validateNodeConnection } from './validation-C_RN-Xqr.js';