@origints/mammoth 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ import { TransformAst, TransformImpl } from '@origints/core';
2
+ import { DocxToHtmlOptions, DocxToTextOptions } from './options';
3
+ /**
4
+ * Creates a TransformAst for converting DOCX to HTML.
5
+ *
6
+ * @example
7
+ * ```ts
8
+ * const plan = Planner.in(loadFile('document.docx'))
9
+ * .mapIn(docxToHtml())
10
+ * .emit((out, $) => out.add('html', $.get('html').asString()))
11
+ * .compile()
12
+ * ```
13
+ *
14
+ * @example With custom style mapping
15
+ * ```ts
16
+ * const plan = Planner.in(loadFile('document.docx'))
17
+ * .mapIn(docxToHtml({
18
+ * styleMap: [
19
+ * "p[style-name='Title'] => h1.document-title",
20
+ * "p[style-name='Subtitle'] => h2.document-subtitle",
21
+ * ],
22
+ * idPrefix: 'doc-',
23
+ * }))
24
+ * .emit((out, $) => out.add('content', $.get('html').asString()))
25
+ * .compile()
26
+ * ```
27
+ */
28
+ export declare function docxToHtml(options?: DocxToHtmlOptions): TransformAst;
29
+ /**
30
+ * Creates a TransformAst for extracting raw text from DOCX.
31
+ *
32
+ * @example
33
+ * ```ts
34
+ * const plan = Planner.in(loadFile('document.docx'))
35
+ * .mapIn(docxToText())
36
+ * .emit((out, $) => out.add('text', $.get('text').asString()))
37
+ * .compile()
38
+ * ```
39
+ */
40
+ export declare function docxToText(options?: DocxToTextOptions): TransformAst;
41
+ /**
42
+ * Transform implementation for docxToHtml.
43
+ *
44
+ * Accepts Buffer or ReadableStream<Uint8Array> input.
45
+ */
46
+ export declare const docxToHtmlImpl: TransformImpl;
47
+ /**
48
+ * Transform implementation for docxToText.
49
+ *
50
+ * Accepts Buffer or ReadableStream<Uint8Array> input.
51
+ */
52
+ export declare const docxToTextImpl: TransformImpl;
package/dist/index.cjs ADDED
@@ -0,0 +1,2 @@
1
+ "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const m=require("mammoth");async function s(e){const r=e.getReader(),t=[];try{for(;;){const{done:a,value:n}=await r.read();if(a)break;t.push(n)}return Buffer.concat(t)}finally{r.releaseLock()}}function u(e){return{kind:"transform",namespace:"@origints/mammoth",name:"docxToHtml",args:g(e)}}function o(e){return{kind:"transform",namespace:"@origints/mammoth",name:"docxToText",args:e}}const c={namespace:"@origints/mammoth",name:"docxToHtml",async execute(e,r){const t=y(r),a=await d(e),n=p(t),i=await m.convertToHtml({buffer:a},n);return{html:i.value,messages:i.messages.map(f)}}},l={namespace:"@origints/mammoth",name:"docxToText",async execute(e){const r=await d(e),t=await m.extractRawText({buffer:r});return{text:t.value,messages:t.messages.map(f)}}};function g(e){if(!e)return;const r={};return e.styleMap&&(r.styleMap=e.styleMap),e.includeEmbeddedStyleMap!==void 0&&(r.includeEmbeddedStyleMap=e.includeEmbeddedStyleMap),e.includeDefaultStyleMap!==void 0&&(r.includeDefaultStyleMap=e.includeDefaultStyleMap),e.idPrefix&&(r.idPrefix=e.idPrefix),(e.imageHandling==="inline"||e.imageHandling==="omit")&&(r.imageHandling=e.imageHandling),e.preserveEmptyParagraphs!==void 0&&(r.preserveEmptyParagraphs=e.preserveEmptyParagraphs),Object.keys(r).length>0?r:void 0}function y(e){return e||{}}async function d(e){if(Buffer.isBuffer(e))return e;if(e instanceof ReadableStream)return s(e);throw new Error(`docxToHtml expects Buffer or ReadableStream input, got ${typeof e}`)}function p(e){const r={};return e.styleMap&&(r.styleMap=e.styleMap),e.includeEmbeddedStyleMap!==void 0&&(r.includeEmbeddedStyleMap=e.includeEmbeddedStyleMap),e.includeDefaultStyleMap!==void 
0&&(r.includeDefaultStyleMap=e.includeDefaultStyleMap),e.idPrefix&&(r.idPrefix=e.idPrefix),e.preserveEmptyParagraphs===!1?r.ignoreEmptyParagraphs=!0:e.preserveEmptyParagraphs===!0&&(r.ignoreEmptyParagraphs=!1),e.imageHandling&&(r.convertImage=x(e.imageHandling)),e.transformDocument&&(r.transformDocument=e.transformDocument),r}function x(e){return e==="omit"?m.images.imgElement(()=>Promise.resolve({src:""})):e==="inline"?m.images.imgElement(r=>r.readAsBase64String().then(t=>({src:`data:${r.contentType};base64,${t}`}))):m.images.imgElement(async r=>{const t={contentType:r.contentType,read:(n=>n==="base64"?r.readAsBase64String():r.readAsArrayBuffer())},a=await e(t);return a===null?{src:""}:a})}function f(e){return{type:e.type==="error"?"error":"warning",message:e.message}}function T(e){e.register(c),e.register(l)}exports.docxToHtml=u;exports.docxToHtmlImpl=c;exports.docxToText=o;exports.docxToTextImpl=l;exports.registerMammothTransforms=T;exports.streamToBuffer=s;
2
+ //# sourceMappingURL=index.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.cjs","sources":["../src/util.ts","../src/convert.ts","../src/index.ts"],"sourcesContent":["/**\n * Utility functions for mammoth package.\n *\n * @module mammoth/util\n */\n\n/**\n * Convert a ReadableStream<Uint8Array> to a Buffer.\n */\nexport async function streamToBuffer(\n stream: ReadableStream<Uint8Array>\n): Promise<Buffer> {\n const reader = stream.getReader()\n const chunks: Uint8Array[] = []\n\n try {\n while (true) {\n const { done, value } = await reader.read()\n if (done) break\n chunks.push(value)\n }\n return Buffer.concat(chunks)\n } finally {\n reader.releaseLock()\n }\n}\n","/**\n * DOCX to HTML conversion transform for Origins.\n *\n * @module mammoth/convert\n */\n\nimport mammoth from 'mammoth'\nimport type { TransformAst, TransformImpl } from '@origints/core'\nimport type {\n DocxToHtmlOptions,\n DocxToTextOptions,\n MammothImageHandler,\n} from './options'\nimport type {\n DocxConversionResult,\n DocxTextResult,\n MammothMessage,\n} from './result'\nimport { streamToBuffer } from './util'\n\n/**\n * Mammoth options type extracted from the library.\n */\ninterface MammothOptions {\n styleMap?: string | string[]\n includeEmbeddedStyleMap?: boolean\n includeDefaultStyleMap?: boolean\n convertImage?: MammothImageConverter\n ignoreEmptyParagraphs?: boolean\n idPrefix?: string\n transformDocument?: (element: unknown) => unknown\n}\n\n/**\n * Mammoth image converter (opaque branded type).\n */\ninterface MammothImageConverter {\n __mammothBrand: 'ImageConverter'\n}\n\n/**\n * Mammoth image interface for custom converters.\n */\ninterface MammothImage {\n contentType: string\n readAsBase64String: () => Promise<string>\n readAsBuffer: () => Promise<Buffer>\n readAsArrayBuffer: () => Promise<ArrayBuffer>\n}\n\n/**\n * Creates a TransformAst for converting DOCX to HTML.\n *\n * @example\n * ```ts\n * const plan = Planner.in(loadFile('document.docx'))\n * .mapIn(docxToHtml())\n * .emit((out, $) => out.add('html', 
$.get('html').asString()))\n * .compile()\n * ```\n *\n * @example With custom style mapping\n * ```ts\n * const plan = Planner.in(loadFile('document.docx'))\n * .mapIn(docxToHtml({\n * styleMap: [\n * \"p[style-name='Title'] => h1.document-title\",\n * \"p[style-name='Subtitle'] => h2.document-subtitle\",\n * ],\n * idPrefix: 'doc-',\n * }))\n * .emit((out, $) => out.add('content', $.get('html').asString()))\n * .compile()\n * ```\n */\nexport function docxToHtml(options?: DocxToHtmlOptions): TransformAst {\n return {\n kind: 'transform',\n namespace: '@origints/mammoth',\n name: 'docxToHtml',\n args: serializeOptions(options),\n }\n}\n\n/**\n * Creates a TransformAst for extracting raw text from DOCX.\n *\n * @example\n * ```ts\n * const plan = Planner.in(loadFile('document.docx'))\n * .mapIn(docxToText())\n * .emit((out, $) => out.add('text', $.get('text').asString()))\n * .compile()\n * ```\n */\nexport function docxToText(options?: DocxToTextOptions): TransformAst {\n return {\n kind: 'transform',\n namespace: '@origints/mammoth',\n name: 'docxToText',\n args: options,\n }\n}\n\n/**\n * Transform implementation for docxToHtml.\n *\n * Accepts Buffer or ReadableStream<Uint8Array> input.\n */\nexport const docxToHtmlImpl: TransformImpl = {\n namespace: '@origints/mammoth',\n name: 'docxToHtml',\n\n async execute(\n input: unknown,\n args?: unknown\n ): Promise<DocxConversionResult> {\n const options = deserializeOptions(\n args as SerializedDocxOptions | undefined\n )\n const buffer = await toBuffer(input)\n const mammothOptions = toMammothOptions(options)\n\n const result = await mammoth.convertToHtml({ buffer }, mammothOptions)\n\n return {\n html: result.value,\n messages: result.messages.map(toMammothMessage),\n }\n },\n}\n\n/**\n * Transform implementation for docxToText.\n *\n * Accepts Buffer or ReadableStream<Uint8Array> input.\n */\nexport const docxToTextImpl: TransformImpl = {\n namespace: '@origints/mammoth',\n name: 'docxToText',\n\n async 
execute(input: unknown): Promise<DocxTextResult> {\n const buffer = await toBuffer(input)\n\n // Note: extractRawText doesn't accept options in mammoth's API\n const result = await mammoth.extractRawText({ buffer })\n\n return {\n text: result.value,\n messages: result.messages.map(toMammothMessage),\n }\n },\n}\n\n// ---------------------------------------------------------------------------\n// Internal helpers\n// ---------------------------------------------------------------------------\n\n/**\n * Serialized options that can be stored in TransformAst.args.\n * Function handlers are converted to string identifiers.\n */\ninterface SerializedDocxOptions {\n styleMap?: string[]\n includeEmbeddedStyleMap?: boolean\n includeDefaultStyleMap?: boolean\n idPrefix?: string\n imageHandling?: 'inline' | 'omit'\n preserveEmptyParagraphs?: boolean\n // Note: transformDocument and custom imageHandling functions cannot be serialized\n}\n\n/**\n * Serialize options for storage in TransformAst.\n * Custom functions cannot be serialized and are dropped.\n */\nfunction serializeOptions(\n options?: DocxToHtmlOptions\n): SerializedDocxOptions | undefined {\n if (!options) return undefined\n\n const serialized: SerializedDocxOptions = {}\n\n if (options.styleMap) {\n serialized.styleMap = options.styleMap\n }\n if (options.includeEmbeddedStyleMap !== undefined) {\n serialized.includeEmbeddedStyleMap = options.includeEmbeddedStyleMap\n }\n if (options.includeDefaultStyleMap !== undefined) {\n serialized.includeDefaultStyleMap = options.includeDefaultStyleMap\n }\n if (options.idPrefix) {\n serialized.idPrefix = options.idPrefix\n }\n if (\n options.imageHandling === 'inline' ||\n options.imageHandling === 'omit'\n ) {\n serialized.imageHandling = options.imageHandling\n }\n if (options.preserveEmptyParagraphs !== undefined) {\n serialized.preserveEmptyParagraphs = options.preserveEmptyParagraphs\n }\n\n return Object.keys(serialized).length > 0 ? 
serialized : undefined\n}\n\n/**\n * Deserialize options from TransformAst.args.\n */\nfunction deserializeOptions(\n serialized?: SerializedDocxOptions\n): DocxToHtmlOptions {\n if (!serialized) return {}\n return serialized\n}\n\n/**\n * Convert input to Buffer.\n */\nasync function toBuffer(input: unknown): Promise<Buffer> {\n if (Buffer.isBuffer(input)) {\n return input\n }\n if (input instanceof ReadableStream) {\n return streamToBuffer(input as ReadableStream<Uint8Array>)\n }\n throw new Error(\n `docxToHtml expects Buffer or ReadableStream input, got ${typeof input}`\n )\n}\n\n/**\n * Convert our options to mammoth options.\n */\nfunction toMammothOptions(options: DocxToHtmlOptions): MammothOptions {\n const mammothOpts: MammothOptions = {}\n\n if (options.styleMap) {\n mammothOpts.styleMap = options.styleMap\n }\n\n if (options.includeEmbeddedStyleMap !== undefined) {\n mammothOpts.includeEmbeddedStyleMap = options.includeEmbeddedStyleMap\n }\n\n if (options.includeDefaultStyleMap !== undefined) {\n mammothOpts.includeDefaultStyleMap = options.includeDefaultStyleMap\n }\n\n if (options.idPrefix) {\n mammothOpts.idPrefix = options.idPrefix\n }\n\n if (options.preserveEmptyParagraphs === false) {\n mammothOpts.ignoreEmptyParagraphs = true\n } else if (options.preserveEmptyParagraphs === true) {\n mammothOpts.ignoreEmptyParagraphs = false\n }\n\n if (options.imageHandling) {\n mammothOpts.convertImage = createImageConverter(options.imageHandling)\n }\n\n if (options.transformDocument) {\n mammothOpts.transformDocument = options.transformDocument\n }\n\n return mammothOpts\n}\n\n/**\n * Create a mammoth image converter from our options.\n */\nfunction createImageConverter(\n handling: 'inline' | 'omit' | MammothImageHandler\n): MammothImageConverter {\n if (handling === 'omit') {\n return mammoth.images.imgElement(() => Promise.resolve({ src: '' }))\n }\n\n if (handling === 'inline') {\n // Use default mammoth behavior (base64 inline)\n return 
mammoth.images.imgElement((image: MammothImage) =>\n image.readAsBase64String().then(data => ({\n src: `data:${image.contentType};base64,${data}`,\n }))\n )\n }\n\n // Custom handler - adapt our interface to mammoth's\n return mammoth.images.imgElement(async (image: MammothImage) => {\n // Adapt mammoth's Image to our MammothImageElement interface\n const adaptedImage = {\n contentType: image.contentType,\n read: ((encoding: 'base64' | 'buffer') => {\n if (encoding === 'base64') {\n return image.readAsBase64String()\n }\n return image.readAsArrayBuffer()\n }) as {\n (encoding: 'base64'): Promise<string>\n (encoding: 'buffer'): Promise<ArrayBuffer>\n },\n }\n\n const result = await handling(adaptedImage)\n if (result === null) {\n return { src: '' }\n }\n return result\n })\n}\n\n/**\n * Convert mammoth message to our message type.\n */\nfunction toMammothMessage(msg: { type: string; message: string }): MammothMessage {\n return {\n type: msg.type === 'error' ? 'error' : 'warning',\n message: msg.message,\n }\n}\n","/**\n * @origints/mammoth - DOCX to HTML conversion for Origins using mammoth.js\n *\n * This package provides transforms for converting Word documents (.docx) to HTML.\n * It wraps the mammoth.js library and exposes all its conversion options.\n *\n * @packageDocumentation\n *\n * @example Basic usage\n * ```ts\n * import { Planner, loadFile } from '@origints/core'\n * import { docxToHtml, registerMammothTransforms } from '@origints/mammoth'\n *\n * // Register transforms\n * registerMammothTransforms(globalRegistry)\n *\n * // Create a plan\n * const plan = Planner.in(loadFile('document.docx'))\n * .mapIn(docxToHtml())\n * .emit((out, $) => out.add('html', $.get('html').asString()))\n * .compile()\n * ```\n *\n * @example With custom style mapping\n * ```ts\n * const plan = Planner.in(loadFile('document.docx'))\n * .mapIn(docxToHtml({\n * styleMap: [\n * \"p[style-name='Title'] => h1.document-title\",\n * \"p[style-name='Heading 1'] => h1\",\n * 
\"p[style-name='Heading 2'] => h2\",\n * ],\n * idPrefix: 'doc-',\n * imageHandling: 'omit',\n * }))\n * .emit((out, $) => out.add('content', $.get('html').asString()))\n * .compile()\n * ```\n */\n\n// Re-export option types\nexport type {\n DocxToHtmlOptions,\n DocxToTextOptions,\n MammothImageElement,\n MammothImageHandler,\n MammothImageResult,\n} from './options'\n\n// Re-export result types\nexport type {\n DocxConversionResult,\n DocxTextResult,\n MammothMessage,\n MammothMessageType,\n} from './result'\n\n// Re-export transform creators and implementations\nexport {\n docxToHtml,\n docxToText,\n docxToHtmlImpl,\n docxToTextImpl,\n} from './convert'\n\n// Re-export utilities\nexport { streamToBuffer } from './util'\n\n// ---------------------------------------------------------------------------\n// Auto-registration of transforms\n// ---------------------------------------------------------------------------\n\nimport { docxToHtmlImpl, docxToTextImpl } from './convert'\n\n/**\n * Register the mammoth transforms with a registry.\n * Call this to enable docxToHtml() and docxToText() in your plans.\n *\n * @example\n * ```ts\n * import { globalRegistry } from '@origints/core'\n * import { registerMammothTransforms } from '@origints/mammoth'\n *\n * registerMammothTransforms(globalRegistry)\n * ```\n */\nexport function registerMammothTransforms(registry: {\n register(impl: {\n namespace: string\n name: string\n execute: (...args: unknown[]) => unknown\n }): void\n}): void {\n registry.register(docxToHtmlImpl)\n 
registry.register(docxToTextImpl)\n}\n"],"names":["streamToBuffer","stream","reader","chunks","done","value","docxToHtml","options","serializeOptions","docxToText","docxToHtmlImpl","input","args","deserializeOptions","buffer","toBuffer","mammothOptions","toMammothOptions","result","mammoth","toMammothMessage","docxToTextImpl","serialized","mammothOpts","createImageConverter","handling","image","data","adaptedImage","encoding","msg","registerMammothTransforms","registry"],"mappings":"2GASA,eAAsBA,EACpBC,EACiB,CACjB,MAAMC,EAASD,EAAO,UAAA,EAChBE,EAAuB,CAAA,EAE7B,GAAI,CACF,OAAa,CACX,KAAM,CAAE,KAAAC,EAAM,MAAAC,CAAA,EAAU,MAAMH,EAAO,KAAA,EACrC,GAAIE,EAAM,MACVD,EAAO,KAAKE,CAAK,CACnB,CACA,OAAO,OAAO,OAAOF,CAAM,CAC7B,QAAA,CACED,EAAO,YAAA,CACT,CACF,CCkDO,SAASI,EAAWC,EAA2C,CACpE,MAAO,CACL,KAAM,YACN,UAAW,oBACX,KAAM,aACN,KAAMC,EAAiBD,CAAO,CAAA,CAElC,CAaO,SAASE,EAAWF,EAA2C,CACpE,MAAO,CACL,KAAM,YACN,UAAW,oBACX,KAAM,aACN,KAAMA,CAAA,CAEV,CAOO,MAAMG,EAAgC,CAC3C,UAAW,oBACX,KAAM,aAEN,MAAM,QACJC,EACAC,EAC+B,CAC/B,MAAML,EAAUM,EACdD,CAAA,EAEIE,EAAS,MAAMC,EAASJ,CAAK,EAC7BK,EAAiBC,EAAiBV,CAAO,EAEzCW,EAAS,MAAMC,EAAQ,cAAc,CAAE,OAAAL,CAAA,EAAUE,CAAc,EAErE,MAAO,CACL,KAAME,EAAO,MACb,SAAUA,EAAO,SAAS,IAAIE,CAAgB,CAAA,CAElD,CACF,EAOaC,EAAgC,CAC3C,UAAW,oBACX,KAAM,aAEN,MAAM,QAAQV,EAAyC,CACrD,MAAMG,EAAS,MAAMC,EAASJ,CAAK,EAG7BO,EAAS,MAAMC,EAAQ,eAAe,CAAE,OAAAL,EAAQ,EAEtD,MAAO,CACL,KAAMI,EAAO,MACb,SAAUA,EAAO,SAAS,IAAIE,CAAgB,CAAA,CAElD,CACF,EAwBA,SAASZ,EACPD,EACmC,CACnC,GAAI,CAACA,EAAS,OAEd,MAAMe,EAAoC,CAAA,EAE1C,OAAIf,EAAQ,WACVe,EAAW,SAAWf,EAAQ,UAE5BA,EAAQ,0BAA4B,SACtCe,EAAW,wBAA0Bf,EAAQ,yBAE3CA,EAAQ,yBAA2B,SACrCe,EAAW,uBAAyBf,EAAQ,wBAE1CA,EAAQ,WACVe,EAAW,SAAWf,EAAQ,WAG9BA,EAAQ,gBAAkB,UAC1BA,EAAQ,gBAAkB,UAE1Be,EAAW,cAAgBf,EAAQ,eAEjCA,EAAQ,0BAA4B,SACtCe,EAAW,wBAA0Bf,EAAQ,yBAGxC,OAAO,KAAKe,CAAU,EAAE,OAAS,EAAIA,EAAa,MAC3D,CAKA,SAAST,EACPS,EACmB,CACnB,OAAKA,GAAmB,CAAA,CAE1B,CAKA,eAAeP,EAASJ,EAAiC,CACvD,GAAI,OAAO,SAASA,CAAK,EACvB,OAAOA,EAET,GAAIA,aAAiB,eACnB,OAAOX,EAAeW,CAAmC,EAE3D,MAAM,IAAI,MACR,0DAA0D,OAAOA,
CAAK,EAAA,CAE1E,CAKA,SAASM,EAAiBV,EAA4C,CACpE,MAAMgB,EAA8B,CAAA,EAEpC,OAAIhB,EAAQ,WACVgB,EAAY,SAAWhB,EAAQ,UAG7BA,EAAQ,0BAA4B,SACtCgB,EAAY,wBAA0BhB,EAAQ,yBAG5CA,EAAQ,yBAA2B,SACrCgB,EAAY,uBAAyBhB,EAAQ,wBAG3CA,EAAQ,WACVgB,EAAY,SAAWhB,EAAQ,UAG7BA,EAAQ,0BAA4B,GACtCgB,EAAY,sBAAwB,GAC3BhB,EAAQ,0BAA4B,KAC7CgB,EAAY,sBAAwB,IAGlChB,EAAQ,gBACVgB,EAAY,aAAeC,EAAqBjB,EAAQ,aAAa,GAGnEA,EAAQ,oBACVgB,EAAY,kBAAoBhB,EAAQ,mBAGnCgB,CACT,CAKA,SAASC,EACPC,EACuB,CACvB,OAAIA,IAAa,OACRN,EAAQ,OAAO,WAAW,IAAM,QAAQ,QAAQ,CAAE,IAAK,EAAA,CAAI,CAAC,EAGjEM,IAAa,SAERN,EAAQ,OAAO,WAAYO,GAChCA,EAAM,mBAAA,EAAqB,KAAKC,IAAS,CACvC,IAAK,QAAQD,EAAM,WAAW,WAAWC,CAAI,EAAA,EAC7C,CAAA,EAKCR,EAAQ,OAAO,WAAW,MAAOO,GAAwB,CAE9D,MAAME,EAAe,CACnB,YAAaF,EAAM,YACnB,MAAQG,GACFA,IAAa,SACRH,EAAM,mBAAA,EAERA,EAAM,kBAAA,EACf,EAMIR,EAAS,MAAMO,EAASG,CAAY,EAC1C,OAAIV,IAAW,KACN,CAAE,IAAK,EAAA,EAETA,CACT,CAAC,CACH,CAKA,SAASE,EAAiBU,EAAwD,CAChF,MAAO,CACL,KAAMA,EAAI,OAAS,QAAU,QAAU,UACvC,QAASA,EAAI,OAAA,CAEjB,CC7OO,SAASC,EAA0BC,EAMjC,CACPA,EAAS,SAAStB,CAAc,EAChCsB,EAAS,SAASX,CAAc,CAClC"}
@@ -0,0 +1,62 @@
1
+ /**
2
+ * @origints/mammoth - DOCX to HTML conversion for Origins using mammoth.js
3
+ *
4
+ * This package provides transforms for converting Word documents (.docx) to HTML.
5
+ * It wraps the mammoth.js library and exposes all its conversion options.
6
+ *
7
+ * @packageDocumentation
8
+ *
9
+ * @example Basic usage
10
+ * ```ts
11
+ * import { Planner, loadFile } from '@origints/core'
12
+ * import { docxToHtml, registerMammothTransforms } from '@origints/mammoth'
13
+ *
14
+ * // Register transforms
15
+ * registerMammothTransforms(globalRegistry)
16
+ *
17
+ * // Create a plan
18
+ * const plan = Planner.in(loadFile('document.docx'))
19
+ * .mapIn(docxToHtml())
20
+ * .emit((out, $) => out.add('html', $.get('html').asString()))
21
+ * .compile()
22
+ * ```
23
+ *
24
+ * @example With custom style mapping
25
+ * ```ts
26
+ * const plan = Planner.in(loadFile('document.docx'))
27
+ * .mapIn(docxToHtml({
28
+ * styleMap: [
29
+ * "p[style-name='Title'] => h1.document-title",
30
+ * "p[style-name='Heading 1'] => h1",
31
+ * "p[style-name='Heading 2'] => h2",
32
+ * ],
33
+ * idPrefix: 'doc-',
34
+ * imageHandling: 'omit',
35
+ * }))
36
+ * .emit((out, $) => out.add('content', $.get('html').asString()))
37
+ * .compile()
38
+ * ```
39
+ */
40
+ export type { DocxToHtmlOptions, DocxToTextOptions, MammothImageElement, MammothImageHandler, MammothImageResult, } from './options';
41
+ export type { DocxConversionResult, DocxTextResult, MammothMessage, MammothMessageType, } from './result';
42
+ export { docxToHtml, docxToText, docxToHtmlImpl, docxToTextImpl, } from './convert';
43
+ export { streamToBuffer } from './util';
44
+ /**
45
+ * Register the mammoth transforms with a registry.
46
+ * Call this to enable docxToHtml() and docxToText() in your plans.
47
+ *
48
+ * @example
49
+ * ```ts
50
+ * import { globalRegistry } from '@origints/core'
51
+ * import { registerMammothTransforms } from '@origints/mammoth'
52
+ *
53
+ * registerMammothTransforms(globalRegistry)
54
+ * ```
55
+ */
56
+ export declare function registerMammothTransforms(registry: {
57
+ register(impl: {
58
+ namespace: string;
59
+ name: string;
60
+ execute: (...args: unknown[]) => unknown;
61
+ }): void;
62
+ }): void;
@@ -0,0 +1,105 @@
1
+ import m from "mammoth";
2
/**
 * Drains a ReadableStream into a single Buffer (exported as `streamToBuffer`).
 *
 * Chunks are read one at a time until the reader reports `done`, then
 * concatenated in arrival order. The reader lock is released in all cases,
 * including when a read rejects.
 *
 * @param e - Stream whose chunks are collected.
 * @returns Promise resolving to a Buffer containing every chunk in order.
 */
async function f(e) {
  const reader = e.getReader();
  const chunks = [];
  try {
    let step = await reader.read();
    while (!step.done) {
      chunks.push(step.value);
      step = await reader.read();
    }
    return Buffer.concat(chunks);
  } finally {
    // Always give the lock back so the caller can still use the stream object.
    reader.releaseLock();
  }
}
15
/**
 * Builds the transform descriptor for DOCX-to-HTML conversion
 * (exported as `docxToHtml`).
 *
 * @param e - Optional conversion options; they are passed through `u`, so
 *   only the fields that helper copies end up in `args`.
 * @returns A descriptor with `kind`, `namespace`, `name` and `args`.
 */
function x(e) {
  const args = u(e);
  const descriptor = {
    kind: "transform",
    namespace: "@origints/mammoth",
    name: "docxToHtml",
    args
  };
  return descriptor;
}
23
/**
 * Builds the transform descriptor for raw-text extraction from DOCX
 * (exported as `docxToText`).
 *
 * @param e - Optional options; stored unchanged as `args`.
 * @returns A descriptor with `kind`, `namespace`, `name` and `args`.
 */
function h(e) {
  const descriptor = {
    kind: "transform",
    namespace: "@origints/mammoth",
    name: "docxToText",
    args: e
  };
  return descriptor;
}
31
/**
 * Transform implementation for `docxToHtml` (exported as `docxToHtmlImpl`).
 *
 * `execute(e, r)` normalizes the args `r` with `o`, turns the input `e` into
 * a Buffer with `s`, maps the options to mammoth options with `g`, and runs
 * `mammoth.convertToHtml`. It resolves to `{ html, messages }`, where each
 * message has been passed through `c`.
 */
const l = {
  namespace: "@origints/mammoth",
  name: "docxToHtml",
  async execute(e, r) {
    const options = o(r);
    const buffer = await s(e);
    const mammothOptions = g(options);
    const converted = await m.convertToHtml({ buffer }, mammothOptions);
    const html = converted.value;
    const messages = converted.messages.map(c);
    return { html, messages };
  }
};

/**
 * Transform implementation for `docxToText` (exported as `docxToTextImpl`).
 *
 * `execute(e)` turns the input `e` into a Buffer with `s` and runs
 * `mammoth.extractRawText`; no options are forwarded. It resolves to
 * `{ text, messages }`, where each message has been passed through `c`.
 */
const d = {
  namespace: "@origints/mammoth",
  name: "docxToText",
  async execute(e) {
    const buffer = await s(e);
    const extracted = await m.extractRawText({ buffer });
    const text = extracted.value;
    const messages = extracted.messages.map(c);
    return { text, messages };
  }
};
54
/**
 * Reduces docxToHtml options to the plain-data subset stored in a transform
 * descriptor's `args`.
 *
 * Copied fields: `styleMap` and `idPrefix` (when truthy),
 * `includeEmbeddedStyleMap`, `includeDefaultStyleMap` and
 * `preserveEmptyParagraphs` (when not `undefined`), and `imageHandling` only
 * when it is the string "inline" or "omit". Anything else, including
 * function-valued `imageHandling` and `transformDocument`, is left out.
 *
 * @param e - Options object, or a falsy value.
 * @returns The reduced object, or `undefined` when `e` is falsy or nothing
 *   was copied.
 */
function u(e) {
  if (!e) {
    return undefined;
  }

  const serialized = {};

  if (e.styleMap) {
    serialized.styleMap = e.styleMap;
  }
  if (e.includeEmbeddedStyleMap !== undefined) {
    serialized.includeEmbeddedStyleMap = e.includeEmbeddedStyleMap;
  }
  if (e.includeDefaultStyleMap !== undefined) {
    serialized.includeDefaultStyleMap = e.includeDefaultStyleMap;
  }
  if (e.idPrefix) {
    serialized.idPrefix = e.idPrefix;
  }

  const usesBuiltInImageMode =
    e.imageHandling === "inline" || e.imageHandling === "omit";
  if (usesBuiltInImageMode) {
    serialized.imageHandling = e.imageHandling;
  }

  if (e.preserveEmptyParagraphs !== undefined) {
    serialized.preserveEmptyParagraphs = e.preserveEmptyParagraphs;
  }

  // An empty result is reported as "no args" rather than as {}.
  if (Object.keys(serialized).length === 0) {
    return undefined;
  }
  return serialized;
}
59
/**
 * Normalizes transform args into an options object.
 *
 * @param e - Args taken from a transform descriptor; may be falsy.
 * @returns `e` itself when truthy, otherwise a new empty object.
 */
function o(e) {
  if (e) {
    return e;
  }
  return {};
}
62
/**
 * Normalizes a transform input to a Buffer. Shared by both the docxToHtml
 * and docxToText implementations.
 *
 * Accepted inputs:
 * - Buffer: returned as-is.
 * - Uint8Array: wrapped in a Buffer over the same memory (no copy),
 *   honoring the view's offset and length.
 * - ArrayBuffer: wrapped in a Buffer over the same memory (no copy).
 * - ReadableStream: drained with `f`.
 *
 * @param e - Raw transform input.
 * @returns Promise resolving to a Buffer holding the input bytes.
 * @throws Error when the input is none of the accepted kinds.
 */
async function s(e) {
  if (Buffer.isBuffer(e)) {
    return e;
  }
  if (e instanceof Uint8Array) {
    // Respect byteOffset/byteLength so subarray views are not widened.
    return Buffer.from(e.buffer, e.byteOffset, e.byteLength);
  }
  if (e instanceof ArrayBuffer) {
    return Buffer.from(e);
  }
  // Guard the global so runtimes without ReadableStream reach the
  // descriptive error below instead of a ReferenceError.
  if (typeof ReadableStream !== "undefined" && e instanceof ReadableStream) {
    return f(e);
  }
  // typeof null is "object"; report it explicitly to keep the message useful.
  const received = e === null ? "null" : typeof e;
  // The message no longer names docxToHtml: this helper also serves docxToText.
  throw new Error(
    `@origints/mammoth expects Buffer, Uint8Array, ArrayBuffer or ReadableStream input, got ${received}`
  );
}
71
/**
 * Translates this package's docxToHtml options into the options object
 * handed to `mammoth.convertToHtml`.
 *
 * - `styleMap`, `idPrefix`, `transformDocument`: copied when truthy.
 * - `includeEmbeddedStyleMap`, `includeDefaultStyleMap`: copied when not
 *   `undefined`.
 * - `preserveEmptyParagraphs`: inverted into `ignoreEmptyParagraphs`, and
 *   only when it is exactly `true` or `false`.
 * - `imageHandling`: when truthy, converted with `y` into `convertImage`.
 *
 * @param e - Options object (must not be null/undefined).
 * @returns A new options object containing only the fields that were set.
 */
function g(e) {
  const mammothOpts = {};

  if (e.styleMap) {
    mammothOpts.styleMap = e.styleMap;
  }
  if (e.includeEmbeddedStyleMap !== undefined) {
    mammothOpts.includeEmbeddedStyleMap = e.includeEmbeddedStyleMap;
  }
  if (e.includeDefaultStyleMap !== undefined) {
    mammothOpts.includeDefaultStyleMap = e.includeDefaultStyleMap;
  }
  if (e.idPrefix) {
    mammothOpts.idPrefix = e.idPrefix;
  }

  // "preserve" here is the opposite of the "ignore" flag set on the result.
  if (e.preserveEmptyParagraphs === false) {
    mammothOpts.ignoreEmptyParagraphs = true;
  } else if (e.preserveEmptyParagraphs === true) {
    mammothOpts.ignoreEmptyParagraphs = false;
  }

  if (e.imageHandling) {
    mammothOpts.convertImage = y(e.imageHandling);
  }
  if (e.transformDocument) {
    mammothOpts.transformDocument = e.transformDocument;
  }

  return mammothOpts;
}
75
/**
 * Builds a mammoth image converter (via `mammoth.images.imgElement`) for the
 * given image-handling mode.
 *
 * - "omit": every image resolves to `{ src: "" }`.
 * - "inline": the image is read as base64 and emitted as a `data:` URI using
 *   the image's content type.
 * - anything else: treated as a handler function. It is called with an
 *   object exposing `contentType` and `read(encoding)`; `read("base64")`
 *   returns the base64 string and any other encoding returns the
 *   ArrayBuffer. A `null` handler result becomes `{ src: "" }`; any other
 *   result is returned as-is.
 *
 * @param e - "omit", "inline", or a handler function.
 * @returns The value produced by `mammoth.images.imgElement`.
 */
function y(e) {
  if (e === "omit") {
    return m.images.imgElement(() => Promise.resolve({ src: "" }));
  }

  if (e === "inline") {
    return m.images.imgElement((image) =>
      image.readAsBase64String().then((data) => {
        return { src: `data:${image.contentType};base64,${data}` };
      })
    );
  }

  return m.images.imgElement(async (image) => {
    const adapted = {
      contentType: image.contentType,
      read: (encoding) => {
        if (encoding === "base64") {
          return image.readAsBase64String();
        }
        return image.readAsArrayBuffer();
      }
    };
    const outcome = await e(adapted);
    if (outcome === null) {
      return { src: "" };
    }
    return outcome;
  });
}
88
/**
 * Converts a mammoth message into this package's message shape.
 *
 * @param e - Object with `type` and `message` properties.
 * @returns `{ type, message }` where `type` is "error" only when the input
 *   type is exactly "error", and "warning" for every other value.
 */
function c(e) {
  let type = "warning";
  if (e.type === "error") {
    type = "error";
  }
  return { type, message: e.message };
}
94
/**
 * Registers both transform implementations with a registry
 * (exported as `registerMammothTransforms`).
 *
 * @param e - Registry exposing a `register(impl)` method; it is called once
 *   with the docxToHtml implementation and then once with the docxToText
 *   implementation.
 */
function M(e) {
  for (const impl of [l, d]) {
    e.register(impl);
  }
}
97
+ export {
98
+ x as docxToHtml,
99
+ l as docxToHtmlImpl,
100
+ h as docxToText,
101
+ d as docxToTextImpl,
102
+ M as registerMammothTransforms,
103
+ f as streamToBuffer
104
+ };
105
+ //# sourceMappingURL=index.es.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.es.js","sources":["../src/util.ts","../src/convert.ts","../src/index.ts"],"sourcesContent":["/**\n * Utility functions for mammoth package.\n *\n * @module mammoth/util\n */\n\n/**\n * Convert a ReadableStream<Uint8Array> to a Buffer.\n */\nexport async function streamToBuffer(\n stream: ReadableStream<Uint8Array>\n): Promise<Buffer> {\n const reader = stream.getReader()\n const chunks: Uint8Array[] = []\n\n try {\n while (true) {\n const { done, value } = await reader.read()\n if (done) break\n chunks.push(value)\n }\n return Buffer.concat(chunks)\n } finally {\n reader.releaseLock()\n }\n}\n","/**\n * DOCX to HTML conversion transform for Origins.\n *\n * @module mammoth/convert\n */\n\nimport mammoth from 'mammoth'\nimport type { TransformAst, TransformImpl } from '@origints/core'\nimport type {\n DocxToHtmlOptions,\n DocxToTextOptions,\n MammothImageHandler,\n} from './options'\nimport type {\n DocxConversionResult,\n DocxTextResult,\n MammothMessage,\n} from './result'\nimport { streamToBuffer } from './util'\n\n/**\n * Mammoth options type extracted from the library.\n */\ninterface MammothOptions {\n styleMap?: string | string[]\n includeEmbeddedStyleMap?: boolean\n includeDefaultStyleMap?: boolean\n convertImage?: MammothImageConverter\n ignoreEmptyParagraphs?: boolean\n idPrefix?: string\n transformDocument?: (element: unknown) => unknown\n}\n\n/**\n * Mammoth image converter (opaque branded type).\n */\ninterface MammothImageConverter {\n __mammothBrand: 'ImageConverter'\n}\n\n/**\n * Mammoth image interface for custom converters.\n */\ninterface MammothImage {\n contentType: string\n readAsBase64String: () => Promise<string>\n readAsBuffer: () => Promise<Buffer>\n readAsArrayBuffer: () => Promise<ArrayBuffer>\n}\n\n/**\n * Creates a TransformAst for converting DOCX to HTML.\n *\n * @example\n * ```ts\n * const plan = Planner.in(loadFile('document.docx'))\n * .mapIn(docxToHtml())\n * .emit((out, $) => out.add('html', 
$.get('html').asString()))\n * .compile()\n * ```\n *\n * @example With custom style mapping\n * ```ts\n * const plan = Planner.in(loadFile('document.docx'))\n * .mapIn(docxToHtml({\n * styleMap: [\n * \"p[style-name='Title'] => h1.document-title\",\n * \"p[style-name='Subtitle'] => h2.document-subtitle\",\n * ],\n * idPrefix: 'doc-',\n * }))\n * .emit((out, $) => out.add('content', $.get('html').asString()))\n * .compile()\n * ```\n */\nexport function docxToHtml(options?: DocxToHtmlOptions): TransformAst {\n return {\n kind: 'transform',\n namespace: '@origints/mammoth',\n name: 'docxToHtml',\n args: serializeOptions(options),\n }\n}\n\n/**\n * Creates a TransformAst for extracting raw text from DOCX.\n *\n * @example\n * ```ts\n * const plan = Planner.in(loadFile('document.docx'))\n * .mapIn(docxToText())\n * .emit((out, $) => out.add('text', $.get('text').asString()))\n * .compile()\n * ```\n */\nexport function docxToText(options?: DocxToTextOptions): TransformAst {\n return {\n kind: 'transform',\n namespace: '@origints/mammoth',\n name: 'docxToText',\n args: options,\n }\n}\n\n/**\n * Transform implementation for docxToHtml.\n *\n * Accepts Buffer or ReadableStream<Uint8Array> input.\n */\nexport const docxToHtmlImpl: TransformImpl = {\n namespace: '@origints/mammoth',\n name: 'docxToHtml',\n\n async execute(\n input: unknown,\n args?: unknown\n ): Promise<DocxConversionResult> {\n const options = deserializeOptions(\n args as SerializedDocxOptions | undefined\n )\n const buffer = await toBuffer(input)\n const mammothOptions = toMammothOptions(options)\n\n const result = await mammoth.convertToHtml({ buffer }, mammothOptions)\n\n return {\n html: result.value,\n messages: result.messages.map(toMammothMessage),\n }\n },\n}\n\n/**\n * Transform implementation for docxToText.\n *\n * Accepts Buffer or ReadableStream<Uint8Array> input.\n */\nexport const docxToTextImpl: TransformImpl = {\n namespace: '@origints/mammoth',\n name: 'docxToText',\n\n async 
execute(input: unknown): Promise<DocxTextResult> {\n const buffer = await toBuffer(input)\n\n // Note: extractRawText doesn't accept options in mammoth's API\n const result = await mammoth.extractRawText({ buffer })\n\n return {\n text: result.value,\n messages: result.messages.map(toMammothMessage),\n }\n },\n}\n\n// ---------------------------------------------------------------------------\n// Internal helpers\n// ---------------------------------------------------------------------------\n\n/**\n * Serialized options that can be stored in TransformAst.args.\n * Function handlers are converted to string identifiers.\n */\ninterface SerializedDocxOptions {\n styleMap?: string[]\n includeEmbeddedStyleMap?: boolean\n includeDefaultStyleMap?: boolean\n idPrefix?: string\n imageHandling?: 'inline' | 'omit'\n preserveEmptyParagraphs?: boolean\n // Note: transformDocument and custom imageHandling functions cannot be serialized\n}\n\n/**\n * Serialize options for storage in TransformAst.\n * Custom functions cannot be serialized and are dropped.\n */\nfunction serializeOptions(\n options?: DocxToHtmlOptions\n): SerializedDocxOptions | undefined {\n if (!options) return undefined\n\n const serialized: SerializedDocxOptions = {}\n\n if (options.styleMap) {\n serialized.styleMap = options.styleMap\n }\n if (options.includeEmbeddedStyleMap !== undefined) {\n serialized.includeEmbeddedStyleMap = options.includeEmbeddedStyleMap\n }\n if (options.includeDefaultStyleMap !== undefined) {\n serialized.includeDefaultStyleMap = options.includeDefaultStyleMap\n }\n if (options.idPrefix) {\n serialized.idPrefix = options.idPrefix\n }\n if (\n options.imageHandling === 'inline' ||\n options.imageHandling === 'omit'\n ) {\n serialized.imageHandling = options.imageHandling\n }\n if (options.preserveEmptyParagraphs !== undefined) {\n serialized.preserveEmptyParagraphs = options.preserveEmptyParagraphs\n }\n\n return Object.keys(serialized).length > 0 ? 
serialized : undefined\n}\n\n/**\n * Deserialize options from TransformAst.args.\n */\nfunction deserializeOptions(\n serialized?: SerializedDocxOptions\n): DocxToHtmlOptions {\n if (!serialized) return {}\n return serialized\n}\n\n/**\n * Convert input to Buffer.\n */\nasync function toBuffer(input: unknown): Promise<Buffer> {\n if (Buffer.isBuffer(input)) {\n return input\n }\n if (input instanceof ReadableStream) {\n return streamToBuffer(input as ReadableStream<Uint8Array>)\n }\n throw new Error(\n `docxToHtml expects Buffer or ReadableStream input, got ${typeof input}`\n )\n}\n\n/**\n * Convert our options to mammoth options.\n */\nfunction toMammothOptions(options: DocxToHtmlOptions): MammothOptions {\n const mammothOpts: MammothOptions = {}\n\n if (options.styleMap) {\n mammothOpts.styleMap = options.styleMap\n }\n\n if (options.includeEmbeddedStyleMap !== undefined) {\n mammothOpts.includeEmbeddedStyleMap = options.includeEmbeddedStyleMap\n }\n\n if (options.includeDefaultStyleMap !== undefined) {\n mammothOpts.includeDefaultStyleMap = options.includeDefaultStyleMap\n }\n\n if (options.idPrefix) {\n mammothOpts.idPrefix = options.idPrefix\n }\n\n if (options.preserveEmptyParagraphs === false) {\n mammothOpts.ignoreEmptyParagraphs = true\n } else if (options.preserveEmptyParagraphs === true) {\n mammothOpts.ignoreEmptyParagraphs = false\n }\n\n if (options.imageHandling) {\n mammothOpts.convertImage = createImageConverter(options.imageHandling)\n }\n\n if (options.transformDocument) {\n mammothOpts.transformDocument = options.transformDocument\n }\n\n return mammothOpts\n}\n\n/**\n * Create a mammoth image converter from our options.\n */\nfunction createImageConverter(\n handling: 'inline' | 'omit' | MammothImageHandler\n): MammothImageConverter {\n if (handling === 'omit') {\n return mammoth.images.imgElement(() => Promise.resolve({ src: '' }))\n }\n\n if (handling === 'inline') {\n // Use default mammoth behavior (base64 inline)\n return 
mammoth.images.imgElement((image: MammothImage) =>\n image.readAsBase64String().then(data => ({\n src: `data:${image.contentType};base64,${data}`,\n }))\n )\n }\n\n // Custom handler - adapt our interface to mammoth's\n return mammoth.images.imgElement(async (image: MammothImage) => {\n // Adapt mammoth's Image to our MammothImageElement interface\n const adaptedImage = {\n contentType: image.contentType,\n read: ((encoding: 'base64' | 'buffer') => {\n if (encoding === 'base64') {\n return image.readAsBase64String()\n }\n return image.readAsArrayBuffer()\n }) as {\n (encoding: 'base64'): Promise<string>\n (encoding: 'buffer'): Promise<ArrayBuffer>\n },\n }\n\n const result = await handling(adaptedImage)\n if (result === null) {\n return { src: '' }\n }\n return result\n })\n}\n\n/**\n * Convert mammoth message to our message type.\n */\nfunction toMammothMessage(msg: { type: string; message: string }): MammothMessage {\n return {\n type: msg.type === 'error' ? 'error' : 'warning',\n message: msg.message,\n }\n}\n","/**\n * @origints/mammoth - DOCX to HTML conversion for Origins using mammoth.js\n *\n * This package provides transforms for converting Word documents (.docx) to HTML.\n * It wraps the mammoth.js library and exposes all its conversion options.\n *\n * @packageDocumentation\n *\n * @example Basic usage\n * ```ts\n * import { Planner, loadFile } from '@origints/core'\n * import { docxToHtml, registerMammothTransforms } from '@origints/mammoth'\n *\n * // Register transforms\n * registerMammothTransforms(globalRegistry)\n *\n * // Create a plan\n * const plan = Planner.in(loadFile('document.docx'))\n * .mapIn(docxToHtml())\n * .emit((out, $) => out.add('html', $.get('html').asString()))\n * .compile()\n * ```\n *\n * @example With custom style mapping\n * ```ts\n * const plan = Planner.in(loadFile('document.docx'))\n * .mapIn(docxToHtml({\n * styleMap: [\n * \"p[style-name='Title'] => h1.document-title\",\n * \"p[style-name='Heading 1'] => h1\",\n * 
\"p[style-name='Heading 2'] => h2\",\n * ],\n * idPrefix: 'doc-',\n * imageHandling: 'omit',\n * }))\n * .emit((out, $) => out.add('content', $.get('html').asString()))\n * .compile()\n * ```\n */\n\n// Re-export option types\nexport type {\n DocxToHtmlOptions,\n DocxToTextOptions,\n MammothImageElement,\n MammothImageHandler,\n MammothImageResult,\n} from './options'\n\n// Re-export result types\nexport type {\n DocxConversionResult,\n DocxTextResult,\n MammothMessage,\n MammothMessageType,\n} from './result'\n\n// Re-export transform creators and implementations\nexport {\n docxToHtml,\n docxToText,\n docxToHtmlImpl,\n docxToTextImpl,\n} from './convert'\n\n// Re-export utilities\nexport { streamToBuffer } from './util'\n\n// ---------------------------------------------------------------------------\n// Auto-registration of transforms\n// ---------------------------------------------------------------------------\n\nimport { docxToHtmlImpl, docxToTextImpl } from './convert'\n\n/**\n * Register the mammoth transforms with a registry.\n * Call this to enable docxToHtml() and docxToText() in your plans.\n *\n * @example\n * ```ts\n * import { globalRegistry } from '@origints/core'\n * import { registerMammothTransforms } from '@origints/mammoth'\n *\n * registerMammothTransforms(globalRegistry)\n * ```\n */\nexport function registerMammothTransforms(registry: {\n register(impl: {\n namespace: string\n name: string\n execute: (...args: unknown[]) => unknown\n }): void\n}): void {\n registry.register(docxToHtmlImpl)\n 
registry.register(docxToTextImpl)\n}\n"],"names":["streamToBuffer","stream","reader","chunks","done","value","docxToHtml","options","serializeOptions","docxToText","docxToHtmlImpl","input","args","deserializeOptions","buffer","toBuffer","mammothOptions","toMammothOptions","result","mammoth","toMammothMessage","docxToTextImpl","serialized","mammothOpts","createImageConverter","handling","image","data","adaptedImage","encoding","msg","registerMammothTransforms","registry"],"mappings":";AASA,eAAsBA,EACpBC,GACiB;AACjB,QAAMC,IAASD,EAAO,UAAA,GAChBE,IAAuB,CAAA;AAE7B,MAAI;AACF,eAAa;AACX,YAAM,EAAE,MAAAC,GAAM,OAAAC,EAAA,IAAU,MAAMH,EAAO,KAAA;AACrC,UAAIE,EAAM;AACV,MAAAD,EAAO,KAAKE,CAAK;AAAA,IACnB;AACA,WAAO,OAAO,OAAOF,CAAM;AAAA,EAC7B,UAAA;AACE,IAAAD,EAAO,YAAA;AAAA,EACT;AACF;ACkDO,SAASI,EAAWC,GAA2C;AACpE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,WAAW;AAAA,IACX,MAAM;AAAA,IACN,MAAMC,EAAiBD,CAAO;AAAA,EAAA;AAElC;AAaO,SAASE,EAAWF,GAA2C;AACpE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,WAAW;AAAA,IACX,MAAM;AAAA,IACN,MAAMA;AAAA,EAAA;AAEV;AAOO,MAAMG,IAAgC;AAAA,EAC3C,WAAW;AAAA,EACX,MAAM;AAAA,EAEN,MAAM,QACJC,GACAC,GAC+B;AAC/B,UAAML,IAAUM;AAAA,MACdD;AAAA,IAAA,GAEIE,IAAS,MAAMC,EAASJ,CAAK,GAC7BK,IAAiBC,EAAiBV,CAAO,GAEzCW,IAAS,MAAMC,EAAQ,cAAc,EAAE,QAAAL,EAAA,GAAUE,CAAc;AAErE,WAAO;AAAA,MACL,MAAME,EAAO;AAAA,MACb,UAAUA,EAAO,SAAS,IAAIE,CAAgB;AAAA,IAAA;AAAA,EAElD;AACF,GAOaC,IAAgC;AAAA,EAC3C,WAAW;AAAA,EACX,MAAM;AAAA,EAEN,MAAM,QAAQV,GAAyC;AACrD,UAAMG,IAAS,MAAMC,EAASJ,CAAK,GAG7BO,IAAS,MAAMC,EAAQ,eAAe,EAAE,QAAAL,GAAQ;AAEtD,WAAO;AAAA,MACL,MAAMI,EAAO;AAAA,MACb,UAAUA,EAAO,SAAS,IAAIE,CAAgB;AAAA,IAAA;AAAA,EAElD;AACF;AAwBA,SAASZ,EACPD,GACmC;AACnC,MAAI,CAACA,EAAS;AAEd,QAAMe,IAAoC,CAAA;AAE1C,SAAIf,EAAQ,aACVe,EAAW,WAAWf,EAAQ,WAE5BA,EAAQ,4BAA4B,WACtCe,EAAW,0BAA0Bf,EAAQ,0BAE3CA,EAAQ,2BAA2B,WACrCe,EAAW,yBAAyBf,EAAQ,yBAE1CA,EAAQ,aACVe,EAAW,WAAWf,EAAQ,YAG9BA,EAAQ,kBAAkB,YAC1BA,EAAQ,kBAAkB,YAE1Be,EAAW,gBAAgBf,EAAQ,gBAEjCA,EAAQ,4BAA4B,WACtCe,EAAW,0BAA0Bf,EAAQ,0BAGxC,OAAO,KAAKe,CAAU,EAAE,SAAS,IAAIA,IAAa;AAC3D;AAKA,SAAST,EACPS,GACmB;AACnB,SAAKA,K
AAmB,CAAA;AAE1B;AAKA,eAAeP,EAASJ,GAAiC;AACvD,MAAI,OAAO,SAASA,CAAK;AACvB,WAAOA;AAET,MAAIA,aAAiB;AACnB,WAAOX,EAAeW,CAAmC;AAE3D,QAAM,IAAI;AAAA,IACR,0DAA0D,OAAOA,CAAK;AAAA,EAAA;AAE1E;AAKA,SAASM,EAAiBV,GAA4C;AACpE,QAAMgB,IAA8B,CAAA;AAEpC,SAAIhB,EAAQ,aACVgB,EAAY,WAAWhB,EAAQ,WAG7BA,EAAQ,4BAA4B,WACtCgB,EAAY,0BAA0BhB,EAAQ,0BAG5CA,EAAQ,2BAA2B,WACrCgB,EAAY,yBAAyBhB,EAAQ,yBAG3CA,EAAQ,aACVgB,EAAY,WAAWhB,EAAQ,WAG7BA,EAAQ,4BAA4B,KACtCgB,EAAY,wBAAwB,KAC3BhB,EAAQ,4BAA4B,OAC7CgB,EAAY,wBAAwB,KAGlChB,EAAQ,kBACVgB,EAAY,eAAeC,EAAqBjB,EAAQ,aAAa,IAGnEA,EAAQ,sBACVgB,EAAY,oBAAoBhB,EAAQ,oBAGnCgB;AACT;AAKA,SAASC,EACPC,GACuB;AACvB,SAAIA,MAAa,SACRN,EAAQ,OAAO,WAAW,MAAM,QAAQ,QAAQ,EAAE,KAAK,GAAA,CAAI,CAAC,IAGjEM,MAAa,WAERN,EAAQ,OAAO;AAAA,IAAW,CAACO,MAChCA,EAAM,mBAAA,EAAqB,KAAK,CAAAC,OAAS;AAAA,MACvC,KAAK,QAAQD,EAAM,WAAW,WAAWC,CAAI;AAAA,IAAA,EAC7C;AAAA,EAAA,IAKCR,EAAQ,OAAO,WAAW,OAAOO,MAAwB;AAE9D,UAAME,IAAe;AAAA,MACnB,aAAaF,EAAM;AAAA,MACnB,OAAO,CAACG,MACFA,MAAa,WACRH,EAAM,mBAAA,IAERA,EAAM,kBAAA;AAAA,IACf,GAMIR,IAAS,MAAMO,EAASG,CAAY;AAC1C,WAAIV,MAAW,OACN,EAAE,KAAK,GAAA,IAETA;AAAA,EACT,CAAC;AACH;AAKA,SAASE,EAAiBU,GAAwD;AAChF,SAAO;AAAA,IACL,MAAMA,EAAI,SAAS,UAAU,UAAU;AAAA,IACvC,SAASA,EAAI;AAAA,EAAA;AAEjB;AC7OO,SAASC,EAA0BC,GAMjC;AACP,EAAAA,EAAS,SAAStB,CAAc,GAChCsB,EAAS,SAASX,CAAc;AAClC;"}
@@ -0,0 +1,136 @@
1
+ /**
2
+ * Options for DOCX to HTML conversion using mammoth.
3
+ *
4
+ * @module mammoth/options
5
+ */
6
+ /**
7
+ * Image element provided during image conversion.
8
+ * Contains metadata and methods to read image content.
9
+ */
10
+ export interface MammothImageElement {
11
+ /** MIME content type of the image (e.g., 'image/png', 'image/jpeg') */
12
+ contentType: string;
13
+ /**
14
+ * Read the image content.
15
+ *
16
+ * @param encoding - 'base64' for base64 string, 'buffer' for ArrayBuffer
17
+ * @returns Promise resolving to the image data
18
+ */
19
+ read(encoding: 'base64'): Promise<string>;
20
+ read(encoding: 'buffer'): Promise<ArrayBuffer>;
21
+ }
22
+ /**
23
+ * Result of custom image conversion.
24
+ * Return null to drop the image data; the converter then emits an empty `src` for that image.
25
+ */
26
+ export interface MammothImageResult {
27
+ /** The src attribute for the img element */
28
+ src: string;
29
+ }
30
+ /**
31
+ * Custom image handler function.
32
+ * Called for each image in the document during conversion.
33
+ *
34
+ * @example
35
+ * ```ts
36
+ * const handler: MammothImageHandler = async (image) => {
37
+ * const base64 = await image.read('base64')
38
+ * return { src: `data:${image.contentType};base64,${base64}` }
39
+ * }
40
+ * ```
41
+ */
42
+ export type MammothImageHandler = (image: MammothImageElement) => Promise<MammothImageResult | null>;
43
+ /**
44
+ * Options for DOCX to HTML conversion.
45
+ *
46
+ * These options map to mammoth.js convertToHtml options.
47
+ *
48
+ * @example
49
+ * ```ts
50
+ * const options: DocxToHtmlOptions = {
51
+ * styleMap: [
52
+ * "p[style-name='Title'] => h1.doc-title",
53
+ * "p[style-name='Heading 1'] => h1",
54
+ * ],
55
+ * idPrefix: 'doc-',
56
+ * imageHandling: 'inline',
57
+ * }
58
+ * ```
59
+ */
60
+ export interface DocxToHtmlOptions {
61
+ /**
62
+ * Custom style mappings from Word styles to HTML elements.
63
+ *
64
+ * Each string follows mammoth style map syntax:
65
+ * - `"p[style-name='Heading 1'] => h1"` - Map paragraph style to h1
66
+ * - `"r[style-name='Emphasis'] => em"` - Map run style to em
67
+ * - `"p[style-name='Code'] => pre > code"` - Nested elements
68
+ *
69
+ * @see https://github.com/mwilliamson/mammoth.js#style-map
70
+ */
71
+ styleMap?: string[];
72
+ /**
73
+ * Include the document's embedded style map.
74
+ *
75
+ * When true (default), any style map embedded in the .docx file
76
+ * is combined with the provided styleMap.
77
+ *
78
+ * @default true
79
+ */
80
+ includeEmbeddedStyleMap?: boolean;
81
+ /**
82
+ * Include mammoth's default style mappings.
83
+ *
84
+ * When true (default), mammoth's default style map is used as a base.
85
+ * Set to false to only use explicit styleMap mappings.
86
+ *
87
+ * @default true
88
+ */
89
+ includeDefaultStyleMap?: boolean;
90
+ /**
91
+ * Prefix for generated element IDs.
92
+ *
93
+ * Used for bookmarks, footnotes, and other elements that need unique IDs.
94
+ * Useful when embedding multiple documents in a single page.
95
+ *
96
+ * @example 'doc1-' would generate IDs like 'doc1-footnote-1'
97
+ */
98
+ idPrefix?: string;
99
+ /**
100
+ * How to handle images in the document.
101
+ *
102
+ * - `'inline'` (default): Embed images as base64 data URIs
103
+ * - `'omit'`: Drop all image data; each image is emitted with an empty `src`
104
+ * - Custom handler: Function to process each image (functions are not serializable and are dropped when options are stored in a TransformAst)
105
+ *
106
+ * @default 'inline'
107
+ */
108
+ imageHandling?: 'inline' | 'omit' | MammothImageHandler;
109
+ /**
110
+ * Preserve empty paragraphs in output.
111
+ *
112
+ * When false (default), empty paragraphs are omitted from HTML.
113
+ * Set to true to include them as `<p></p>` elements.
114
+ *
115
+ * @default false
116
+ */
117
+ preserveEmptyParagraphs?: boolean;
118
+ /**
119
+ * Transform the internal document representation before conversion.
120
+ *
121
+ * Advanced option for modifying the document structure. Being a function, it is not serializable and is dropped when options are stored in a TransformAst.
122
+ * The function receives the mammoth document object and should return
123
+ * a transformed version.
124
+ */
125
+ transformDocument?: (document: unknown) => unknown;
126
+ }
127
+ /**
128
+ * Options for extracting raw text from DOCX.
129
+ */
130
+ export interface DocxToTextOptions {
131
+ /**
132
+ * Prefix for generated element IDs (currently not applied: text extraction passes no options to mammoth).
133
+ * @see DocxToHtmlOptions.idPrefix
134
+ */
135
+ idPrefix?: string;
136
+ }
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Result types for DOCX conversion.
3
+ *
4
+ * @module mammoth/result
5
+ */
6
+ /**
7
+ * Message severity level.
8
+ */
9
+ export type MammothMessageType = 'warning' | 'error';
10
+ /**
11
+ * Message generated during DOCX conversion.
12
+ *
13
+ * Mammoth produces messages for issues like:
14
+ * - Unrecognized styles
15
+ * - Unsupported features
16
+ * - Missing images
17
+ */
18
+ export interface MammothMessage {
19
+ /** Severity of the message */
20
+ type: MammothMessageType;
21
+ /** Human-readable description of the issue */
22
+ message: string;
23
+ }
24
+ /**
25
+ * Result of DOCX to HTML conversion.
26
+ *
27
+ * Contains the converted HTML and any messages generated during conversion.
28
+ *
29
+ * @example
30
+ * ```ts
31
+ * const result = await docxToHtmlImpl.execute(buffer)
32
+ * console.log(result.html)
33
+ * if (result.messages.length > 0) {
34
+ * console.warn('Conversion warnings:', result.messages)
35
+ * }
36
+ * ```
37
+ */
38
+ export interface DocxConversionResult {
39
+ /** The converted HTML string */
40
+ html: string;
41
+ /** Messages (warnings/errors) generated during conversion */
42
+ messages: MammothMessage[];
43
+ }
44
+ /**
45
+ * Result of DOCX to text extraction.
46
+ */
47
+ export interface DocxTextResult {
48
+ /** The extracted plain text */
49
+ text: string;
50
+ /** Messages (warnings/errors) generated during extraction */
51
+ messages: MammothMessage[];
52
+ }
package/dist/util.d.ts ADDED
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Utility functions for the mammoth package.
3
+ *
4
+ * @module mammoth/util
5
+ */
6
+ /**
7
+ * Convert a ReadableStream<Uint8Array> to a Buffer.
8
+ */
9
+ export declare function streamToBuffer(stream: ReadableStream<Uint8Array>): Promise<Buffer>;
package/package.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "name": "@origints/mammoth",
3
+ "version": "0.1.0",
4
+ "description": "DOCX to HTML conversion for Origins using mammoth.js",
5
+ "type": "module",
6
+ "license": "MIT",
7
+ "exports": {
8
+ ".": {
9
+ "types": "./dist/index.d.ts",
10
+ "import": "./dist/index.es.js",
11
+ "require": "./dist/index.cjs"
12
+ }
13
+ },
14
+ "main": "./dist/index.cjs",
15
+ "module": "./dist/index.es.js",
16
+ "types": "./dist/index.d.ts",
17
+ "files": [
18
+ "dist"
19
+ ],
20
+ "publishConfig": {
21
+ "access": "public"
22
+ },
23
+ "dependencies": {
24
+ "mammoth": "^1.11.0"
25
+ },
26
+ "peerDependencies": {
27
+ "@origints/core": "^0.1.0"
28
+ },
29
+ "devDependencies": {
30
+ "@types/node": "25.0.6",
31
+ "@vitest/coverage-v8": "^4.0.16",
32
+ "eslint": "9.39.2",
33
+ "jszip": "^3.10.1",
34
+ "typescript": "5.9.3",
35
+ "vite": "7.3.1",
36
+ "vite-plugin-dts": "4.5.4",
37
+ "vitest": "4.0.16",
38
+ "@origints/core": "0.1.0"
39
+ },
40
+ "scripts": {
41
+ "build": "vite build",
42
+ "test": "vitest run",
43
+ "test:coverage": "vitest run --coverage",
44
+ "lint": "eslint \"{src,tests}/**/*.{ts,tsx}\" --max-warnings 0",
45
+ "typecheck": "tsc -p tsconfig.json --noEmit"
46
+ }
47
+ }