@docen/import-docx 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -127,6 +127,7 @@ await parseDOCX(buffer, {
127
127
  ### Media & Embeds
128
128
 
129
129
  - **Images** with automatic base64 conversion
130
+ - **Grouped Images** (DOCX image groups) support
130
131
  - **Links** (hyperlinks) with href extraction
131
132
 
132
133
  ## Parsing Algorithm
@@ -214,8 +215,8 @@ All colors are imported as hex values (e.g., "#FF0000", "#008000"). Color names
214
215
  ### Image Limitations
215
216
 
216
217
  - Only embedded images are supported (external image links are not fetched)
217
- - Image width/height metadata is preserved but visual sizing may vary
218
- - Title, alt text, and other image attributes have limited DOCX support
218
+ - Image dimensions and title are extracted from DOCX metadata
219
+ - Some DOCX image features (like advanced positioning or text wrapping) have limited support
219
220
 
220
221
  ### Table Cell Types
221
222
 
package/dist/index.cjs CHANGED
@@ -1 +1 @@
1
- "use strict";const xastUtilFromXml=require("xast-util-from-xml"),fflate=require("fflate"),undio=require("undio");function extractRuns(c,s,t){const i=[];for(const n of s.children)if(n.type==="element"){if(n.name==="w:hyperlink"){const l=n,a=l.attributes["r:id"],b=c.get(a);if(b){for(const h of l.children)if(h.type==="element"&&h.name==="w:r"){const k=h,v=r(k,"w:drawing");if(v){const L=m(v,t);L&&i.push(L);continue}const x=r(k,"w:t");if(!x)continue;const I=x.children.find(L=>L.type==="text");if(!I||!I.value)continue;const C=extractMarks(k);C.push({type:"link",attrs:{href:b}});const M={type:"text",text:I.value};C.length>0&&(M.marks=C),i.push(M)}}continue}if(n.name==="w:r"){const l=n,a=r(l,"w:drawing");if(a){const x=m(a,t);x&&i.push(x);continue}if(r(l,"w:br")){const x=extractMarks(l),I={type:"hardBreak"};x.length>0&&(I.marks=x),i.push(I)}const b=r(l,"w:t");if(!b)continue;const h=b.children.find(x=>x.type==="text");if(!h||!h.value)continue;const k=extractMarks(l),v={type:"text",text:h.value};k.length>0&&(v.marks=k),i.push(v)}}return i}function extractMarks(c){const s=[],t=r(c,"w:rPr");if(!t)return s;r(t,"w:b")&&s.push({type:"bold"}),r(t,"w:i")&&s.push({type:"italic"}),r(t,"w:u")&&s.push({type:"underline"}),r(t,"w:strike")&&s.push({type:"strike"}),r(t,"w:highlight")&&s.push({type:"highlight"});const i=r(t,"w:vertAlign");if(i){const h=i.attributes["w:val"];h==="subscript"?s.push({type:"subscript"}):h==="superscript"&&s.push({type:"superscript"})}const n=r(t,"w:color"),l=r(t,"w:shd"),a=r(t,"w:sz"),b=r(t,"w:rFonts");if(n||l||a||b){const h={color:"",backgroundColor:"",fontSize:"",fontFamily:"",lineHeight:""};if(n&&n.attributes["w:val"]){const k=n.attributes["w:val"];if(k!=="auto"){const v=k.startsWith("#")?k:`#${k}`;h.color=v}}if(l&&l.attributes["w:fill"]){const k=l.attributes["w:fill"];if(k!=="auto"){const v=k.startsWith("#")?k:`#${k}`;h.backgroundColor=v}}if(a&&a.attributes["w:val"]){const k=a.attributes["w:val"],v=parseFloat(k);if(!isNaN(v)){const x=Math.round(v/1.5*10)/10;h.fontSize=`${x}px`}}b&&b.attributes["w:ascii"]&&(h.fontFamily=b.attributes["w:ascii"]),s.push({type:"textStyle",attrs:h})}return s}function extractAlignment(c){const s=r(c,"w:pPr");if(!s)return;const t=r(s,"w:jc");if(!t?.attributes["w:val"])return;const i=t.attributes["w:val"],n={left:"left",right:"right",center:"center",both:"justify"}[i];return n?{textAlign:n}:void 0}function m(c,s){const t=w(c,"a:blip");if(!t?.attributes["r:embed"])return null;const i=t.attributes["r:embed"],n=s.get(i);return n?{type:"image",attrs:{src:n,alt:""}}:null}function r(c,s){for(const t of c.children)if(t.type==="element"&&t.name===s)return t}function w(c,s){for(const t of c.children)if(t.type==="element"&&t.name===s)return t;for(const t of c.children)if(t.type==="element"){const i=w(t,s);if(i)return i}}function convertParagraph(c,s,t){let i;for(const a of c.children)if(a.type==="element"&&a.name==="w:pPr"){const b=a;for(const h of b.children)if(h.type==="element"&&h.name==="w:pStyle"){i=h.attributes["w:val"];break}break}if(i){const a=i.match(/^Heading(\d)$/);if(a){const b=parseInt(a[1]);return f$1(c,s,b,t)}}const n=extractRuns(s,c,t);if(n.length===1&&n[0].type==="hardBreak"){for(const a of c.children)if(a.type==="element"&&a.name==="w:r"){for(const b of a.children)if(b.type==="element"&&b.name==="w:br"&&b.attributes["w:type"]==="page")return{type:"horizontalRule"}}}if(n.length===1&&n[0].type==="image")return n[0];const l=extractAlignment(c);return{type:"paragraph",...l&&{attrs:l},content:n}}function f$1(c,s,t,i){return{type:"heading",attrs:{level:t},content:extractRuns(s,c,i)}}function isListItem(c){const s=e(c,"w:pPr");return s?!!e(s,"w:numPr"):!1}function getListInfo(c){const s=e(c,"w:pPr");if(!s)return null;const t=e(s,"w:numPr");if(!t)return null;const i=e(t,"w:ilvl"),n=e(t,"w:numId");return!i||!n?null:{numId:n.attributes["w:val"],level:parseInt(i.attributes["w:val"]||"0")}}function e(c,s){for(const t of c.children)if(t.type==="element"&&t.name===s)return t}function isCodeBlock(c){const s=o(c,"w:pPr");if(!s)return!1;const t=o(s,"w:pStyle");if(!t)return!1;const i=t.attributes["w:val"];return i==="CodeBlock"||i?.startsWith("Code")}function getCodeBlockLanguage(c){const s=o(c,"w:pPr");if(!s)return;const t=o(s,"w:pStyle");if(!t)return;const i=t.attributes["w:val"];if(i?.startsWith("CodeBlock"))return i.replace("CodeBlock","").toLowerCase()||void 0}function o(c,s){for(const t of c.children)if(t.type==="element"&&t.name===s)return t}function isTable(c){return c.name==="w:tbl"}function convertTable(c,s,t){const i=[],n=[];for(const a of c.children)a.type==="element"&&a.name==="w:tr"&&n.push(a);const l=new Map;return n.forEach((a,b)=>{i.push(d(a,b===0,s,t,l,n,b))}),{type:"table",content:i}}function d(c,s,t,i,n,l,a){const b=[];let h=0;for(const k of c.children)if(k.type==="element"&&k.name==="w:tc"){const v=n.get(h);if(v&&v>0){n.set(h,v-1),h++;continue}let x=u$1(k);if(x&&x.rowspan===1){const M=g(l,a,h);M>1&&(x={...x,rowspan:M})}if(x&&x.rowspan>1&&n.set(h,x.rowspan-1),x&&x.rowspan===0){h++;continue}const I="tableCell",C=y(k,t,i);b.push({type:I,...x&&{attrs:x},content:[C]}),h+=x?.colspan||1}return{type:"tableRow",content:b}}function u$1(c){const s={colspan:1,rowspan:1,colwidth:null};let t;for(const i of c.children)if(i.type==="element"&&i.name==="w:tcPr"){t=i;break}if(!t)return s;for(const i of t.children)if(i.type==="element"&&i.name==="w:gridSpan"){const n=i.attributes["w:val"];n&&(s.colspan=parseInt(n));break}for(const i of t.children)if(i.type==="element"&&i.name==="w:vMerge"){i.attributes["w:val"]==="continue"&&(s.rowspan=0);break}for(const i of t.children)if(i.type==="element"&&i.name==="w:tcW"){const n=i.attributes["w:w"];n&&(s.colwidth=parseInt(n));break}return s}function g(c,s,t){let i=1,n=t;for(let l=s+1;l<c.length;l++){const a=c[l];let b=!1;for(const h of a.children)if(h.type==="element"&&h.name==="w:tc"){const k=u$1(h),v=k?.colspan||1;if(n>=0&&n<v){if(k?.rowspan===0)i++,b=!0;else return i;break}n-=v}if(!b)break}return i}function y(c,s,t){const i=[];for(const n of c.children)if(n.type==="element"&&n.name==="w:p"){const l=convertParagraph(n,s,t);i.push(l)}return i[0]||{type:"paragraph",content:[]}}function isTaskItem(c){for(const s of c.children)if(s.type==="element"&&s.name==="w:r"){for(const t of s.children)if(t.type==="element"&&t.name==="w:t"){const i=t.children.find(n=>n.type==="text");if(i&&"value"in i){const n=i.value;return n.startsWith("\u2610")||n.startsWith("\u2611")}}break}return!1}function getTaskItemChecked(c){for(const s of c.children)if(s.type==="element"&&s.name==="w:r"){for(const t of s.children)if(t.type==="element"&&t.name==="w:t"){const i=t.children.find(n=>n.type==="text");if(i&&"value"in i)return i.value.startsWith("\u2611")}break}return!1}function convertTaskItem(c){const s=getTaskItemChecked(c),t=f(c);return{type:"taskItem",attrs:{checked:s},content:[t]}}function f(c){const s=[];let t=!1;for(const n of c.children)if(n.type==="element"&&n.name==="w:r"){let l=!1;if(!t){for(const a of n.children)if(a.type==="element"&&a.name==="w:t"){const b=a.children.find(h=>h.type==="text");if(b&&"value"in b){const h=b.value;if(h.startsWith("\u2610")||h.startsWith("\u2611")){l=!0,t=!0;const k=h.substring(2).trimStart();k.length>0&&s.push({type:"text",text:k})}}}}if(!l){const a=p(n);for(const b of n.children)if(b.type==="element"&&b.name==="w:t"){const h=b.children.find(k=>k.type==="text");if(h&&"value"in h){const k={type:"text",text:h.value};a.length>0&&(k.marks=a),s.push(k)}}}}const i=u(c);return{type:"paragraph",...i&&{attrs:i},content:s.length>0?s:void 0}}function p(c){const s=[];for(const t of c.children)if(t.type==="element"&&t.name==="w:rPr"){const i=t;for(const n of i.children)if(n.type==="element"&&n.name==="w:b"){s.push({type:"bold"});break}for(const n of i.children)if(n.type==="element"&&n.name==="w:i"){s.push({type:"italic"});break}for(const n of i.children)if(n.type==="element"&&n.name==="w:u"){s.push({type:"underline"});break}for(const n of i.children)if(n.type==="element"&&n.name==="w:strike"){s.push({type:"strike"});break}break}return s}function u(c){for(const s of c.children)if(s.type==="element"&&s.name==="w:pPr"){const t=s;for(const i of t.children)if(i.type==="element"&&i.name==="w:jc"){const n=i.attributes["w:val"];if(n==="both")return{textAlign:"justify"};if(n==="center")return{textAlign:"center"};if(n==="right")return{textAlign:"right"};if(n==="left")return{textAlign:"left"}}}}function isHorizontalRule(c){for(const s of c.children)if(s.type==="element"&&s.name==="w:r"){const t=s;let i=!1,n=!1;for(const l of t.children)if(l.type==="element")if(l.name==="w:br")l.attributes["w:type"]==="page"&&(i=!0);else if(l.name==="w:t"){const a=l.children.find(b=>b.type==="text");a&&"value"in a&&a.value&&a.value.trim().length>0&&(n=!0)}else l.name!=="w:rPr"&&(n=!0);if(i&&!n)return!0}return!1}const defaultImageConverter=async c=>({src:undio.toBase64(c.data)});async function parseDOCX(c,s={}){const{convertImage:t=defaultImageConverter,ignoreEmptyParagraphs:i=!1}=s,n=await undio.toUint8Array(c),l=fflate.unzipSync(n),a=A(l),b=X(l),h=new Map;for(const[I,C]of b.entries())try{const M=`image/${Object.keys(l).find(R=>R.endsWith(I)||R.includes(`media/${I}`))?.split(".").pop()?.toLowerCase()||"png"}`,L=await t({id:I,contentType:M,data:C});h.set(I,L.src)}catch(M){console.warn(`Failed to convert image ${I}:`,M);const L=undio.toBase64(C);h.set(I,L)}const k=l["word/document.xml"];if(!k)throw new Error("Invalid DOCX file: missing word/document.xml");const v=xastUtilFromXml.fromXml(new TextDecoder().decode(k)),x=J(l);return P(v,h,a,x,i)}function J(c){const s=new Map,t=new Map,i=c["word/numbering.xml"];if(!i)return s;const n=xastUtilFromXml.fromXml(new TextDecoder().decode(i)),l=new Map;if(n.type==="root"){for(const a of n.children)if(a.type==="element"&&a.name==="w:numbering"){const b=a;for(const h of b.children)if(h.type==="element"&&h.name==="w:abstractNum"){const k=h,v=k.attributes["w:abstractNumId"];for(const x of k.children)if(x.type==="element"&&x.name==="w:lvl"){for(const I of x.children)if(I.type==="element"&&I.name==="w:numFmt"){const C=I.attributes["w:val"];if(C){l.set(v,C);break}}for(const I of x.children)if(I.type==="element"&&I.name==="w:start"){const C=I.attributes["w:val"];C&&t.set(v,parseInt(C,10));break}break}}for(const h of b.children)if(h.type==="element"&&h.name==="w:num"){const k=h,v=k.attributes["w:numId"];for(const x of k.children)if(x.type==="element"&&x.name==="w:abstractNumId"){const I=x.attributes["w:val"],C=l.get(I);if(C){const M=t.get(I);C==="bullet"?s.set(v,{type:"bullet"}):s.set(v,{type:"ordered",...M!==void 0&&{start:M}})}break}}break}}return s}function X(c){const s=new Map,t=c["word/_rels/document.xml.rels"];if(!t)return s;const i=xastUtilFromXml.fromXml(new TextDecoder().decode(t));if(i.type==="root"){for(const n of i.children)if(n.type==="element"&&n.name==="Relationships"){const l=n;for(const a of l.children)if(a.type==="element"&&a.name==="Relationship"){const b=a,h=b.attributes.Type;if(h&&h==="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"){const k=b.attributes.Id,v=b.attributes.Target;if(k&&v){const x="word/"+v,I=c[x];I&&s.set(k,I)}}}break}}return s}function A(c){const s=new Map,t=c["word/_rels/document.xml.rels"];if(!t)return s;const i=xastUtilFromXml.fromXml(new TextDecoder().decode(t));if(i.type==="root"){for(const n of i.children)if(n.type==="element"&&n.name==="Relationships"){const l=n;for(const a of l.children)if(a.type==="element"&&a.name==="Relationship"){const b=a,h=b.attributes.Type;if(h&&h==="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"){const k=b.attributes.Id,v=b.attributes.Target;k&&v&&s.set(k,v)}}break}}return s}function P(c,s,t,i,n){if(c.type!=="root")return{type:"doc",content:[]};for(const l of c.children)if(l.type==="element"&&l.name==="w:document"){const a=l;for(const b of a.children)if(b.type==="element"&&b.name==="w:body")return{type:"doc",content:B(b.children.filter(h=>h.type==="element"),s,t,i,n)};break}return{type:"doc",content:[]}}function B(c,s,t,i,n){const l=[];let a=0;for(;a<c.length;){const b=c[a];if(b.name==="w:tbl"){l.push(convertTable(b,t,s)),a++,a<c.length&&c[a].name==="w:p"&&T(c[a])&&a++;continue}if(b.name==="w:p"){if(n&&T(b)){a++;continue}if(isCodeBlock(b)){const h=U(c,a);l.push(...h),a+=h.length;continue}if(isTaskItem(b)){const h=$(c,a);l.push(...h),a+=H(c,a);continue}if(isListItem(b)){const h=F(c,a,s,t,i);l.push(...h),a+=z(c,a);continue}if(isHorizontalRule(b)){l.push({type:"horizontalRule"}),a++;continue}l.push(convertParagraph(b,t,s)),a++;continue}a++}return l}function U(c,s){const t=[];let i=s;for(;i<c.length;){const n=c[i];if(n.name!=="w:p"||!isCodeBlock(n))break;const l=getCodeBlockLanguage(n),a={type:"codeBlock",...l&&{attrs:{language:l}},content:_(n)};t.push(a),i++}return t}function F(c,s,t,i,n){const l=[];let a=s;for(;a<c.length;){const b=c[a];if(b.name!=="w:p"||!isListItem(b))break;const h=getListInfo(b);if(!h)break;const k=n.get(h.numId),v=k?.type||"bullet",x=[];for(;a<c.length;){const C=c[a];if(C.name!=="w:p"||!isListItem(C))break;const M=getListInfo(C);if(!M||M.numId!==h.numId)break;const L={type:"listItem",content:[convertParagraph(C,i,t)]};x.push(L),a++}const I={type:v==="bullet"?"bulletList":"orderedList",content:x};v==="ordered"&&(I.attrs={type:null,...k?.start!==void 0&&{start:k.start}}),l.push(I)}return l}function z(c,s){let t=0,i=s;for(;i<c.length;){const n=c[i];if(n.name!=="w:p"||!isListItem(n))break;t++,i++}return t}function $(c,s){const t=[];let i=s;for(;i<c.length;){const n=c[i];if(n.name!=="w:p"||!isTaskItem(n))break;const l=convertTaskItem(n);t.push(l),i++}return[{type:"taskList",content:t}]}function H(c,s){let t=0,i=s;for(;i<c.length;){const n=c[i];if(n.name!=="w:p"||!isTaskItem(n))break;t++,i++}return t}function _(c){const s=[];for(const t of c.children){if(t.type!=="element"||t.name!=="w:r")continue;const i=t;for(const n of i.children)if(n.type==="element"&&n.name==="w:t"){const l=n.children.find(a=>a.type==="text");l&&"value"in l&&s.push({type:"text",text:l.value})}}return s}function T(c){for(const s of c.children){if(s.type!=="element"||s.name!=="w:r")continue;const t=s;for(const i of t.children)if(i.type==="element"&&i.name==="w:t"){const n=i.children.find(l=>l.type==="text");if(n&&"value"in n&&n.value.trim().length>0)return!1}}return!0}exports.convertParagraph=convertParagraph,exports.convertTable=convertTable,exports.convertTaskItem=convertTaskItem,exports.defaultImageConverter=defaultImageConverter,exports.extractAlignment=extractAlignment,exports.extractMarks=extractMarks,exports.extractRuns=extractRuns,exports.getCodeBlockLanguage=getCodeBlockLanguage,exports.getListInfo=getListInfo,exports.getTaskItemChecked=getTaskItemChecked,exports.isCodeBlock=isCodeBlock,exports.isHorizontalRule=isHorizontalRule,exports.isListItem=isListItem,exports.isTable=isTable,exports.isTaskItem=isTaskItem,exports.parseDOCX=parseDOCX;
1
+ "use strict";const xastUtilFromXml=require("xast-util-from-xml"),fflate=require("fflate"),undio=require("undio"),imageMeta=require("image-meta");function extractRuns(c,n,t){const r=[];for(const s of n.children)if(s.type==="element"){if(s.name==="w:hyperlink"){const l=s,a=l.attributes["r:id"],m=c.get(a);if(m){for(const h of l.children)if(h.type==="element"&&h.name==="w:r"){const b=h;let k=i(b,"w:drawing");if(!k){const A=i(b,"mc:AlternateContent");if(A){const N=i(A,"mc:Choice");N&&(k=i(N,"w:drawing"))}}if(k){const A=P$1(k,t);if(A){r.push(A);continue}const N=j$1(k,t);if(r.push(...N),N.length>0)continue}const x=i(b,"w:t");if(!x)continue;const I=x.children.find(A=>A.type==="text");if(!I||!I.value)continue;const T=extractMarks(b);T.push({type:"link",attrs:{href:m}});const C={type:"text",text:I.value};T.length>0&&(C.marks=T),r.push(C)}}continue}if(s.name==="w:r"){const l=s;let a=i(l,"w:drawing");if(!a){const x=i(l,"mc:AlternateContent");if(x){const I=i(x,"mc:Choice");I&&(a=i(I,"w:drawing"))}}if(a){const x=j$1(a,t);if(r.push(...x),x.length>0)continue}if(i(l,"w:br")){const x=extractMarks(l),I={type:"hardBreak"};x.length>0&&(I.marks=x),r.push(I)}const m=i(l,"w:t");if(!m)continue;const h=m.children.find(x=>x.type==="text");if(!h||!h.value)continue;const b=extractMarks(l),k={type:"text",text:h.value};b.length>0&&(k.marks=b),r.push(k)}}return r}function extractMarks(c){const n=[],t=i(c,"w:rPr");if(!t)return n;i(t,"w:b")&&n.push({type:"bold"}),i(t,"w:i")&&n.push({type:"italic"}),i(t,"w:u")&&n.push({type:"underline"}),i(t,"w:strike")&&n.push({type:"strike"}),i(t,"w:highlight")&&n.push({type:"highlight"});const r=i(t,"w:vertAlign");if(r){const h=r.attributes["w:val"];h==="subscript"?n.push({type:"subscript"}):h==="superscript"&&n.push({type:"superscript"})}const s=i(t,"w:color"),l=i(t,"w:shd"),a=i(t,"w:sz"),m=i(t,"w:rFonts");if(s||l||a||m){const h={color:"",backgroundColor:"",fontSize:"",fontFamily:"",lineHeight:""};if(s&&s.attributes["w:val"]){const b=s.attributes["w:val"];if(b!=="auto"){const k=b.startsWith("#")?b:`#${b}`;h.color=k}}if(l&&l.attributes["w:fill"]){const b=l.attributes["w:fill"];if(b!=="auto"){const k=b.startsWith("#")?b:`#${b}`;h.backgroundColor=k}}if(a&&a.attributes["w:val"]){const b=a.attributes["w:val"],k=parseFloat(b);if(!isNaN(k)){const x=Math.round(k/1.5*10)/10;h.fontSize=`${x}px`}}m&&m.attributes["w:ascii"]&&(h.fontFamily=m.attributes["w:ascii"]),n.push({type:"textStyle",attrs:h})}return n}function extractAlignment(c){const n=i(c,"w:pPr");if(!n)return;const t=i(n,"w:jc");if(!t?.attributes["w:val"])return;const r=t.attributes["w:val"],s={left:"left",right:"right",center:"center",both:"justify"}[r];return s?{textAlign:s}:void 0}function P$1(c,n){const t=M(c,"a:blip");if(!t?.attributes["r:embed"])return null;const r=t.attributes["r:embed"],s=n.get(r);if(!s)return null;const l=M(c,"wp:extent");let a,m;if(l){const k=l.attributes.cx,x=l.attributes.cy;if(typeof k=="string"){const I=parseInt(k,10);isNaN(I)||(a=Math.round(I/9525))}if(typeof x=="string"){const I=parseInt(x,10);isNaN(I)||(m=Math.round(I/9525))}}const h=M(c,"wp:docPr");let b;if(h){const k=h.attributes.title;typeof k=="string"&&k&&(b=k)}return{type:"image",attrs:{src:s,alt:"",...a!==void 0&&{width:a},...m!==void 0&&{height:m},...b!==void 0&&{title:b}}}}function j$1(c,n){const t=[],r=i(c,"wp:inline")||i(c,"wp:anchor");if(!r)return t;const s=i(r,"wp:extent");let l,a;if(s){const k=s.attributes.cx,x=s.attributes.cy;if(typeof k=="string"){const I=parseInt(k,10);isNaN(I)||(l=Math.round(I/9525))}if(typeof x=="string"){const I=parseInt(x,10);isNaN(I)||(a=Math.round(I/9525))}}const m=i(r,"a:graphic");if(!m)return t;const h=i(m,"a:graphicData");if(!h)return t;const b=i(h,"wpg:wgp");if(b){const k=i(b,"wpg:grpSp");let x=[];if(k){const I=R(k,"pic:pic"),T=R(k,"pic");x=[...I,...T]}else{const I=R(b,"pic:pic"),T=R(b,"pic");x=[...I,...T]}for(const I of x){const T=i(I,"a:graphic");if(!T){const A=i(I,"pic:blipFill")||M(I,"a:blipFill");if(A){const N=i(A,"a:blip")||M(A,"a:blip");if(N&&N.attributes["r:embed"]){const q=N.attributes["r:embed"],W=n.get(q);if(W){let S=l,O=a;if(W&&l&&a)try{let D,E;if(W.startsWith("data:")){const G=W.split(",")[1];if(G){const K=atob(G),V=new Uint8Array(K.length);for(let Q=0;Q<K.length;Q++)V[Q]=K.charCodeAt(Q);const Y=imageMeta.imageMeta(V);D=Y.width,E=Y.height}}if(D&&E){const G=D/E,K=l/a;Math.abs(G-K)>.1&&(G>K?(S=l,O=Math.round(l/G)):(O=a,S=Math.round(a*G)))}}catch(D){console.warn("Failed to extract image metadata for aspect ratio:",D)}t.push({type:"image",attrs:{src:W,alt:"",...S!==void 0&&{width:S},...O!==void 0&&{height:O}}})}}}continue}const C=P$1({children:[T]},n);if(C){if(l!==void 0&&a!==void 0&&C.attrs.src)try{const A=C.attrs.src;let N,q;if(A.startsWith("data:")){const W=A.split(",")[1];if(W){const S=atob(W),O=new Uint8Array(S.length);for(let E=0;E<S.length;E++)O[E]=S.charCodeAt(E);const D=imageMeta.imageMeta(O);N=D.width,q=D.height}}if(N&&q){const W=N/q,S=l/a;Math.abs(W-S)>.1?W>S?(C.attrs.width=l,C.attrs.height=Math.round(l/W)):(C.attrs.height=a,C.attrs.width=Math.round(a*W)):(C.attrs.width=l,C.attrs.height=a)}else C.attrs.width=l,C.attrs.height=a}catch{C.attrs.width=l,C.attrs.height=a}t.push(C)}}}else{const k=P$1(c,n);k&&t.push(k)}return t}function i(c,n){for(const t of c.children)if(t.type==="element"&&t.name===n)return t}function M(c,n){for(const t of c.children)if(t.type==="element"&&t.name===n)return t;for(const t of c.children)if(t.type==="element"){const r=M(t,n);if(r)return r}}function R(c,n){const t=[];for(const r of c.children)r.type==="element"&&r.name===n&&t.push(r);for(const r of c.children)r.type==="element"&&t.push(...R(r,n));return t}function convertParagraph(c,n,t){let r;for(const a of c.children)if(a.type==="element"&&a.name==="w:pPr"){const m=a;for(const h of m.children)if(h.type==="element"&&h.name==="w:pStyle"){r=h.attributes["w:val"];break}break}if(r){const a=r.match(/^Heading(\d)$/);if(a){const m=parseInt(a[1]);return f$1(c,n,m,t)}}const s=extractRuns(n,c,t);if(s.length===1&&s[0].type==="hardBreak"){for(const a of c.children)if(a.type==="element"&&a.name==="w:r"){for(const m of a.children)if(m.type==="element"&&m.name==="w:br"&&m.attributes["w:type"]==="page")return{type:"horizontalRule"}}}if(s.length===1&&s[0].type==="image")return s[0];const l=extractAlignment(c);return{type:"paragraph",...l&&{attrs:l},content:s}}function f$1(c,n,t,r){return{type:"heading",attrs:{level:t},content:extractRuns(n,c,r)}}function isListItem(c){const n=e(c,"w:pPr");return n?!!e(n,"w:numPr"):!1}function getListInfo(c){const n=e(c,"w:pPr");if(!n)return null;const t=e(n,"w:numPr");if(!t)return null;const r=e(t,"w:ilvl"),s=e(t,"w:numId");return!r||!s?null:{numId:s.attributes["w:val"],level:parseInt(r.attributes["w:val"]||"0")}}function e(c,n){for(const t of c.children)if(t.type==="element"&&t.name===n)return t}function isCodeBlock(c){const n=o(c,"w:pPr");if(!n)return!1;const t=o(n,"w:pStyle");if(!t)return!1;const r=t.attributes["w:val"];return r==="CodeBlock"||r?.startsWith("Code")}function getCodeBlockLanguage(c){const n=o(c,"w:pPr");if(!n)return;const t=o(n,"w:pStyle");if(!t)return;const r=t.attributes["w:val"];if(r?.startsWith("CodeBlock"))return r.replace("CodeBlock","").toLowerCase()||void 0}function o(c,n){for(const t of c.children)if(t.type==="element"&&t.name===n)return t}function isTable(c){return c.name==="w:tbl"}function convertTable(c,n,t){const r=[],s=[];for(const a of c.children)a.type==="element"&&a.name==="w:tr"&&s.push(a);const l=new Map;return s.forEach((a,m)=>{r.push(d(a,m===0,n,t,l,s,m))}),{type:"table",content:r}}function d(c,n,t,r,s,l,a){const m=[];let h=0;for(const b of c.children)if(b.type==="element"&&b.name==="w:tc"){const k=s.get(h);if(k&&k>0){s.set(h,k-1),h++;continue}let x=u$1(b);if(x&&x.rowspan===1){const C=g(l,a,h);C>1&&(x={...x,rowspan:C})}if(x&&x.rowspan>1&&s.set(h,x.rowspan-1),x&&x.rowspan===0){h++;continue}const I="tableCell",T=y(b,t,r);m.push({type:I,...x&&{attrs:x},content:[T]}),h+=x?.colspan||1}return{type:"tableRow",content:m}}function u$1(c){const n={colspan:1,rowspan:1,colwidth:null};let t;for(const r of c.children)if(r.type==="element"&&r.name==="w:tcPr"){t=r;break}if(!t)return n;for(const r of t.children)if(r.type==="element"&&r.name==="w:gridSpan"){const s=r.attributes["w:val"];s&&(n.colspan=parseInt(s));break}for(const r of t.children)if(r.type==="element"&&r.name==="w:vMerge"){r.attributes["w:val"]==="continue"&&(n.rowspan=0);break}for(const r of t.children)if(r.type==="element"&&r.name==="w:tcW"){const s=r.attributes["w:w"];s&&(n.colwidth=parseInt(s));break}return n}function g(c,n,t){let r=1,s=t;for(let l=n+1;l<c.length;l++){const a=c[l];let m=!1;for(const h of a.children)if(h.type==="element"&&h.name==="w:tc"){const b=u$1(h),k=b?.colspan||1;if(s>=0&&s<k){if(b?.rowspan===0)r++,m=!0;else return r;break}s-=k}if(!m)break}return r}function y(c,n,t){const r=[];for(const s of c.children)if(s.type==="element"&&s.name==="w:p"){const l=convertParagraph(s,n,t);r.push(l)}return r[0]||{type:"paragraph",content:[]}}function isTaskItem(c){for(const n of c.children)if(n.type==="element"&&n.name==="w:r"){for(const t of n.children)if(t.type==="element"&&t.name==="w:t"){const r=t.children.find(s=>s.type==="text");if(r&&"value"in r){const s=r.value;return s.startsWith("\u2610")||s.startsWith("\u2611")}}break}return!1}function getTaskItemChecked(c){for(const n of c.children)if(n.type==="element"&&n.name==="w:r"){for(const t of n.children)if(t.type==="element"&&t.name==="w:t"){const r=t.children.find(s=>s.type==="text");if(r&&"value"in r)return r.value.startsWith("\u2611")}break}return!1}function convertTaskItem(c){const n=getTaskItemChecked(c),t=f(c);return{type:"taskItem",attrs:{checked:n},content:[t]}}function f(c){const n=[];let t=!1;for(const s of c.children)if(s.type==="element"&&s.name==="w:r"){let l=!1;if(!t){for(const a of s.children)if(a.type==="element"&&a.name==="w:t"){const m=a.children.find(h=>h.type==="text");if(m&&"value"in m){const h=m.value;if(h.startsWith("\u2610")||h.startsWith("\u2611")){l=!0,t=!0;const b=h.substring(2).trimStart();b.length>0&&n.push({type:"text",text:b})}}}}if(!l){const a=p(s);for(const m of s.children)if(m.type==="element"&&m.name==="w:t"){const h=m.children.find(b=>b.type==="text");if(h&&"value"in h){const b={type:"text",text:h.value};a.length>0&&(b.marks=a),n.push(b)}}}}const r=u(c);return{type:"paragraph",...r&&{attrs:r},content:n.length>0?n:void 0}}function p(c){const n=[];for(const t of c.children)if(t.type==="element"&&t.name==="w:rPr"){const r=t;for(const s of r.children)if(s.type==="element"&&s.name==="w:b"){n.push({type:"bold"});break}for(const s of r.children)if(s.type==="element"&&s.name==="w:i"){n.push({type:"italic"});break}for(const s of r.children)if(s.type==="element"&&s.name==="w:u"){n.push({type:"underline"});break}for(const s of r.children)if(s.type==="element"&&s.name==="w:strike"){n.push({type:"strike"});break}break}return n}function u(c){for(const n of c.children)if(n.type==="element"&&n.name==="w:pPr"){const t=n;for(const r of t.children)if(r.type==="element"&&r.name==="w:jc"){const s=r.attributes["w:val"];if(s==="both")return{textAlign:"justify"};if(s==="center")return{textAlign:"center"};if(s==="right")return{textAlign:"right"};if(s==="left")return{textAlign:"left"}}}}function isHorizontalRule(c){for(const n of c.children)if(n.type==="element"&&n.name==="w:r"){const t=n;let r=!1,s=!1;for(const l of t.children)if(l.type==="element")if(l.name==="w:br")l.attributes["w:type"]==="page"&&(r=!0);else if(l.name==="w:t"){const a=l.children.find(m=>m.type==="text");a&&"value"in a&&a.value&&a.value.trim().length>0&&(s=!0)}else l.name!=="w:rPr"&&(s=!0);if(r&&!s)return!0}return!1}const w="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";function v(c){const n=c.length,t=Math.ceil(n/3)*4,r=Array.from({length:t});let s=0;for(let l=0;l<n;l+=3){const a=c[l],m=l+1<n?c[l+1]:0,h=l+2<n?c[l+2]:0,b=a>>2,k=(a&3)<<4|m>>4,x=(m&15)<<2|h>>6,I=h&63;r[s++]=w[b],r[s++]=w[k],r[s++]=l+1<n?w[x]:"=",r[s++]=l+2<n?w[I]:"="}return r.join("")}const defaultImageConverter=async c=>{const n=v(c.data);return{src:`data:${c.contentType};base64,${n}`}};async function parseDOCX(c,n={}){const{convertImage:t=defaultImageConverter,ignoreEmptyParagraphs:r=!1}=n,s=await undio.toUint8Array(c),l=fflate.unzipSync(s),a=B(l),m=X(l),h=new Map;for(const[I,T]of m.entries())try{let C;try{C=`image/${imageMeta.imageMeta(T).type}`}catch{C="image/png"}const A=await t({id:I,contentType:C,data:T});h.set(I,A.src)}catch(C){console.warn(`Failed to convert image ${I}:`,C);let A="image/png";try{A=`image/${imageMeta.imageMeta(T).type}`}catch{}const N=v(T),q=`data:${A};base64,${N}`;h.set(I,q)}const b=l["word/document.xml"];if(!b)throw new Error("Invalid DOCX file: missing word/document.xml");const k=xastUtilFromXml.fromXml(new TextDecoder().decode(b)),x=J(l);return U(k,h,a,x,r)}function J(c){const n=new Map,t=new Map,r=c["word/numbering.xml"];if(!r)return n;const s=xastUtilFromXml.fromXml(new TextDecoder().decode(r)),l=new Map;if(s.type==="root"){for(const a of s.children)if(a.type==="element"&&a.name==="w:numbering"){const m=a;for(const h of m.children)if(h.type==="element"&&h.name==="w:abstractNum"){const b=h,k=b.attributes["w:abstractNumId"];for(const x of b.children)if(x.type==="element"&&x.name==="w:lvl"){for(const I of x.children)if(I.type==="element"&&I.name==="w:numFmt"){const T=I.attributes["w:val"];if(T){l.set(k,T);break}}for(const I of x.children)if(I.type==="element"&&I.name==="w:start"){const T=I.attributes["w:val"];T&&t.set(k,parseInt(T,10));break}break}}for(const h of m.children)if(h.type==="element"&&h.name==="w:num"){const b=h,k=b.attributes["w:numId"];for(const x of b.children)if(x.type==="element"&&x.name==="w:abstractNumId"){const I=x.attributes["w:val"],T=l.get(I);if(T){const C=t.get(I);T==="bullet"?n.set(k,{type:"bullet"}):n.set(k,{type:"ordered",...C!==void 0&&{start:C}})}break}}break}}return n}function X(c){const n=new Map,t=c["word/_rels/document.xml.rels"];if(!t)return n;const r=xastUtilFromXml.fromXml(new TextDecoder().decode(t));if(r.type==="root"){for(const s of r.children)if(s.type==="element"&&s.name==="Relationships"){const l=s;for(const a of l.children)if(a.type==="element"&&a.name==="Relationship"){const m=a,h=m.attributes.Type;if(h&&h==="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"){const b=m.attributes.Id,k=m.attributes.Target;if(b&&k){const x="word/"+k,I=c[x];I&&n.set(b,I)}}}break}}return n}function B(c){const n=new Map,t=c["word/_rels/document.xml.rels"];if(!t)return n;const r=xastUtilFromXml.fromXml(new TextDecoder().decode(t));if(r.type==="root"){for(const s of r.children)if(s.type==="element"&&s.name==="Relationships"){const l=s;for(const a of l.children)if(a.type==="element"&&a.name==="Relationship"){const m=a,h=m.attributes.Type;if(h&&h==="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"){const b=m.attributes.Id,k=m.attributes.Target;b&&k&&n.set(b,k)}}break}}return n}function U(c,n,t,r,s){if(c.type!=="root")return{type:"doc",content:[]};for(const l of c.children)if(l.type==="element"&&l.name==="w:document"){const a=l;for(const m of a.children)if(m.type==="element"&&m.name==="w:body")return{type:"doc",content:F(m.children.filter(h=>h.type==="element"),n,t,r,s)};break}return{type:"doc",content:[]}}function F(c,n,t,r,s){const l=[];let a=0;for(;a<c.length;){const m=c[a];if(m.name==="w:tbl"){l.push(convertTable(m,t,n)),a++,a<c.length&&c[a].name==="w:p"&&L(c[a])&&a++;continue}if(m.name==="w:p"){if(s&&L(m)){a++;continue}if(isCodeBlock(m)){const h=P(c,a);l.push(...h),a+=h.length;continue}if(isTaskItem(m)){const h=H(c,a);l.push(...h),a+=_(c,a);continue}if(isListItem(m)){const h=$(c,a,n,t,r);l.push(...h),a+=z(c,a);continue}if(isHorizontalRule(m)){l.push({type:"horizontalRule"}),a++;continue}l.push(convertParagraph(m,t,n)),a++;continue}a++}return l}function P(c,n){const t=[];let r=n;for(;r<c.length;){const s=c[r];if(s.name!=="w:p"||!isCodeBlock(s))break;const l=getCodeBlockLanguage(s),a={type:"codeBlock",...l&&{attrs:{language:l}},content:j(s)};t.push(a),r++}return t}function $(c,n,t,r,s){const l=[];let a=n;for(;a<c.length;){const m=c[a];if(m.name!=="w:p"||!isListItem(m))break;const h=getListInfo(m);if(!h)break;const b=s.get(h.numId),k=b?.type||"bullet",x=[];for(;a<c.length;){const T=c[a];if(T.name!=="w:p"||!isListItem(T))break;const C=getListInfo(T);if(!C||C.numId!==h.numId)break;const A={type:"listItem",content:[convertParagraph(T,r,t)]};x.push(A),a++}const I={type:k==="bullet"?"bulletList":"orderedList",content:x};k==="ordered"&&(I.attrs={type:null,...b?.start!==void 0&&{start:b.start}}),l.push(I)}return l}function z(c,n){let t=0,r=n;for(;r<c.length;){const s=c[r];if(s.name!=="w:p"||!isListItem(s))break;t++,r++}return t}function H(c,n){const t=[];let r=n;for(;r<c.length;){const s=c[r];if(s.name!=="w:p"||!isTaskItem(s))break;const l=convertTaskItem(s);t.push(l),r++}return[{type:"taskList",content:t}]}function _(c,n){let t=0,r=n;for(;r<c.length;){const s=c[r];if(s.name!=="w:p"||!isTaskItem(s))break;t++,r++}return t}function j(c){const n=[];for(const t of c.children){if(t.type!=="element"||t.name!=="w:r")continue;const r=t;for(const s of r.children)if(s.type==="element"&&s.name==="w:t"){const l=s.children.find(a=>a.type==="text");l&&"value"in l&&n.push({type:"text",text:l.value})}}return n}function L(c){for(const n of c.children){if(n.type!=="element"||n.name!=="w:r")continue;const t=n;for(const r of t.children)if(r.type==="element"&&r.name==="w:t"){const s=r.children.find(l=>l.type==="text");if(s&&"value"in s&&s.value.trim().length>0)return!1}}return!0}exports.convertParagraph=convertParagraph,exports.convertTable=convertTable,exports.convertTaskItem=convertTaskItem,exports.defaultImageConverter=defaultImageConverter,exports.extractAlignment=extractAlignment,exports.extractMarks=extractMarks,exports.extractRuns=extractRuns,exports.getCodeBlockLanguage=getCodeBlockLanguage,exports.getListInfo=getListInfo,exports.getTaskItemChecked=getTaskItemChecked,exports.isCodeBlock=isCodeBlock,exports.isHorizontalRule=isHorizontalRule,exports.isListItem=isListItem,exports.isTable=isTable,exports.isTaskItem=isTaskItem,exports.parseDOCX=parseDOCX;
package/dist/index.mjs CHANGED
@@ -1 +1 @@
1
- import{fromXml as g}from"xast-util-from-xml";import{unzipSync as H}from"fflate";import{toBase64 as W,toUint8Array as X}from"undio";function P(o,n,t){const r=[];for(const e of n.children)if(e.type==="element"){if(e.name==="w:hyperlink"){const s=e,i=s.attributes["r:id"],a=o.get(i);if(a){for(const c of s.children)if(c.type==="element"&&c.name==="w:r"){const f=c,u=h(f,"w:drawing");if(u){const w=$(u,t);w&&r.push(w);continue}const l=h(f,"w:t");if(!l)continue;const p=l.children.find(w=>w.type==="text");if(!p||!p.value)continue;const m=k(f);m.push({type:"link",attrs:{href:a}});const d={type:"text",text:p.value};m.length>0&&(d.marks=m),r.push(d)}}continue}if(e.name==="w:r"){const s=e,i=h(s,"w:drawing");if(i){const l=$(i,t);l&&r.push(l);continue}if(h(s,"w:br")){const l=k(s),p={type:"hardBreak"};l.length>0&&(p.marks=l),r.push(p)}const a=h(s,"w:t");if(!a)continue;const c=a.children.find(l=>l.type==="text");if(!c||!c.value)continue;const f=k(s),u={type:"text",text:c.value};f.length>0&&(u.marks=f),r.push(u)}}return r}function k(o){const n=[],t=h(o,"w:rPr");if(!t)return n;h(t,"w:b")&&n.push({type:"bold"}),h(t,"w:i")&&n.push({type:"italic"}),h(t,"w:u")&&n.push({type:"underline"}),h(t,"w:strike")&&n.push({type:"strike"}),h(t,"w:highlight")&&n.push({type:"highlight"});const r=h(t,"w:vertAlign");if(r){const c=r.attributes["w:val"];c==="subscript"?n.push({type:"subscript"}):c==="superscript"&&n.push({type:"superscript"})}const e=h(t,"w:color"),s=h(t,"w:shd"),i=h(t,"w:sz"),a=h(t,"w:rFonts");if(e||s||i||a){const c={color:"",backgroundColor:"",fontSize:"",fontFamily:"",lineHeight:""};if(e&&e.attributes["w:val"]){const f=e.attributes["w:val"];if(f!=="auto"){const u=f.startsWith("#")?f:`#${f}`;c.color=u}}if(s&&s.attributes["w:fill"]){const f=s.attributes["w:fill"];if(f!=="auto"){const u=f.startsWith("#")?f:`#${f}`;c.backgroundColor=u}}if(i&&i.attributes["w:val"]){const f=i.attributes["w:val"],u=parseFloat(f);if(!isNaN(u)){const l=Math.round(u/1.5*10)/10;c.fontSize=`${l}px`}}a&&a.attributes["w:ascii"]&&(c.fontFamily=a.attributes["w:ascii"]),n.push({type:"textStyle",attrs:c})}return n}function B(o){const n=h(o,"w:pPr");if(!n)return;const t=h(n,"w:jc");if(!t?.attributes["w:val"])return;const r=t.attributes["w:val"],e={left:"left",right:"right",center:"center",both:"justify"}[r];return e?{textAlign:e}:void 0}function $(o,n){const t=A(o,"a:blip");if(!t?.attributes["r:embed"])return null;const r=t.attributes["r:embed"],e=n.get(r);return e?{type:"image",attrs:{src:e,alt:""}}:null}function h(o,n){for(const t of o.children)if(t.type==="element"&&t.name===n)return t}function A(o,n){for(const t of o.children)if(t.type==="element"&&t.name===n)return t;for(const t of o.children)if(t.type==="element"){const r=A(t,n);if(r)return r}}function v(o,n,t){let r;for(const i of o.children)if(i.type==="element"&&i.name==="w:pPr"){const a=i;for(const c of a.children)if(c.type==="element"&&c.name==="w:pStyle"){r=c.attributes["w:val"];break}break}if(r){const i=r.match(/^Heading(\d)$/);if(i){const a=parseInt(i[1]);return E(o,n,a,t)}}const e=P(n,o,t);if(e.length===1&&e[0].type==="hardBreak"){for(const i of o.children)if(i.type==="element"&&i.name==="w:r"){for(const a of i.children)if(a.type==="element"&&a.name==="w:br"&&a.attributes["w:type"]==="page")return{type:"horizontalRule"}}}if(e.length===1&&e[0].type==="image")return e[0];const s=B(o);return{type:"paragraph",...s&&{attrs:s},content:e}}function E(o,n,t,r){return{type:"heading",attrs:{level:t},content:P(n,o,r)}}function b(o){const n=y(o,"w:pPr");return n?!!y(n,"w:numPr"):!1}function C(o){const n=y(o,"w:pPr");if(!n)return null;const t=y(n,"w:numPr");if(!t)return null;const r=y(t,"w:ilvl"),e=y(t,"w:numId");return!r||!e?null:{numId:e.attributes["w:val"],level:parseInt(r.attributes["w:val"]||"0")}}function y(o,n){for(const t of o.children)if(t.type==="element"&&t.name===n)return t}function T(o){const n=x(o,"w:pPr");if(!n)return!1;const t=x(n,"w:pStyle");if(!t)return!1;const r=t.attributes["w:val"];return r==="CodeBlock"||r?.startsWith("Code")}function R(o){const n=x(o,"w:pPr");if(!n)return;const t=x(n,"w:pStyle");if(!t)return;const r=t.attributes["w:val"];if(r?.startsWith("CodeBlock"))return r.replace("CodeBlock","").toLowerCase()||void 0}function x(o,n){for(const t of o.children)if(t.type==="element"&&t.name===n)return t}function O(o){return o.name==="w:tbl"}function S(o,n,t){const r=[],e=[];for(const i of o.children)i.type==="element"&&i.name==="w:tr"&&e.push(i);const s=new Map;return e.forEach((i,a)=>{r.push(_(i,a===0,n,t,s,e,a))}),{type:"table",content:r}}function _(o,n,t,r,e,s,i){const a=[];let c=0;for(const f of o.children)if(f.type==="element"&&f.name==="w:tc"){const u=e.get(c);if(u&&u>0){e.set(c,u-1),c++;continue}let l=z(f);if(l&&l.rowspan===1){const d=U(s,i,c);d>1&&(l={...l,rowspan:d})}if(l&&l.rowspan>1&&e.set(c,l.rowspan-1),l&&l.rowspan===0){c++;continue}const p="tableCell",m=J(f,t,r);a.push({type:p,...l&&{attrs:l},content:[m]}),c+=l?.colspan||1}return{type:"tableRow",content:a}}function z(o){const n={colspan:1,rowspan:1,colwidth:null};let t;for(const r of o.children)if(r.type==="element"&&r.name==="w:tcPr"){t=r;break}if(!t)return n;for(const r of t.children)if(r.type==="element"&&r.name==="w:gridSpan"){const e=r.attributes["w:val"];e&&(n.colspan=parseInt(e));break}for(const r of t.children)if(r.type==="element"&&r.name==="w:vMerge"){r.attributes["w:val"]==="continue"&&(n.rowspan=0);break}for(const r of t.children)if(r.type==="element"&&r.name==="w:tcW"){const e=r.attributes["w:w"];e&&(n.colwidth=parseInt(e));break}return n}function U(o,n,t){let r=1,e=t;for(let s=n+1;s<o.length;s++){const i=o[s];let a=!1;for(const c of i.children)if(c.type==="element"&&c.name==="w:tc"){const f=z(c),u=f?.colspan||1;if(e>=0&&e<u){if(f?.rowspan===0)r++,a=!0;else return r;break}e-=u}if(!a)break}return r}function J(o,n,t){const r=[];for(const e of o.children)if(e.type==="element"&&e.name==="w:p"){const s=v(e,n,t);r.push(s)}return r[0]||{type:"paragraph",content:[]}}function I(o){for(const n of o.children)if(n.type==="element"&&n.name==="w:r"){for(const t of n.children)if(t.type==="element"&&t.name==="w:t"){const r=t.children.find(e=>e.type==="text");if(r&&"value"in r){const e=r.value;return e.startsWith("\u2610")||e.startsWith("\u2611")}}break}return!1}function D(o){for(const n of o.children)if(n.type==="element"&&n.name==="w:r"){for(const t of n.children)if(t.type==="element"&&t.name==="w:t"){const r=t.children.find(e=>e.type==="text");if(r&&"value"in r)return r.value.startsWith("\u2611")}break}return!1}function L(o){const n=D(o),t=q(o);return{type:"taskItem",attrs:{checked:n},content:[t]}}function q(o){const n=[];let t=!1;for(const e of o.children)if(e.type==="element"&&e.name==="w:r"){let s=!1;if(!t){for(const i of e.children)if(i.type==="element"&&i.name==="w:t"){const a=i.children.find(c=>c.type==="text");if(a&&"value"in a){const c=a.value;if(c.startsWith("\u2610")||c.startsWith("\u2611")){s=!0,t=!0;const f=c.substring(2).trimStart();f.length>0&&n.push({type:"text",text:f})}}}}if(!s){const i=G(e);for(const a of e.children)if(a.type==="element"&&a.name==="w:t"){const c=a.children.find(f=>f.type==="text");if(c&&"value"in c){const f={type:"text",text:c.value};i.length>0&&(f.marks=i),n.push(f)}}}}const r=K(o);return{type:"paragraph",...r&&{attrs:r},content:n.length>0?n:void 0}}function G(o){const n=[];for(const t of o.children)if(t.type==="element"&&t.name==="w:rPr"){const r=t;for(const e of r.children)if(e.type==="element"&&e.name==="w:b"){n.push({type:"bold"});break}for(const e of r.children)if(e.type==="element"&&e.name==="w:i"){n.push({type:"italic"});break}for(const e of r.children)if(e.type==="element"&&e.name==="w:u"){n.push({type:"underline"});break}for(const e of r.children)if(e.type==="element"&&e.name==="w:strike"){n.push({type:"strike"});break}break}return n}function K(o){for(const n of o.children)if(n.type==="element"&&n.name==="w:pPr"){const t=n;for(const r of t.children)if(r.type==="element"&&r.name==="w:jc"){const e=r.attributes["w:val"];if(e==="both")return{textAlign:"justify"};if(e==="center")return{textAlign:"center"};if(e==="right")return{textAlign:"right"};if(e==="left")return{textAlign:"left"}}}}function F(o){for(const n of o.children)if(n.type==="element"&&n.name==="w:r"){const t=n;let r=!1,e=!1;for(const s of t.children)if(s.type==="element")if(s.name==="w:br")s.attributes["w:type"]==="page"&&(r=!0);else if(s.name==="w:t"){const i=s.children.find(a=>a.type==="text");i&&"value"in i&&i.value&&i.value.trim().length>0&&(e=!0)}else s.name!=="w:rPr"&&(e=!0);if(r&&!e)return!0}return!1}const j=async o=>({src:W(o.data)});async function Q(o,n={}){const{convertImage:t=j,ignoreEmptyParagraphs:r=!1}=n,e=await X(o),s=H(e),i=Z(s),a=Y(s),c=new Map;for(const[p,m]of a.entries())try{const d=`image/${Object.keys(s).find(M=>M.endsWith(p)||M.includes(`media/${p}`))?.split(".").pop()?.toLowerCase()||"png"}`,w=await t({id:p,contentType:d,data:m});c.set(p,w.src)}catch(d){console.warn(`Failed to convert image ${p}:`,d);const w=W(m);c.set(p,w)}const f=s["word/document.xml"];if(!f)throw new Error("Invalid DOCX file: missing word/document.xml");const u=g(new TextDecoder().decode(f)),l=V(s);return tt(u,c,i,l,r)}function V(o){const n=new Map,t=new Map,r=o["word/numbering.xml"];if(!r)return n;const e=g(new TextDecoder().decode(r)),s=new Map;if(e.type==="root"){for(const i of e.children)if(i.type==="element"&&i.name==="w:numbering"){const a=i;for(const c of a.children)if(c.type==="element"&&c.name==="w:abstractNum"){const f=c,u=f.attributes["w:abstractNumId"];for(const l of f.children)if(l.type==="element"&&l.name==="w:lvl"){for(const p of l.children)if(p.type==="element"&&p.name==="w:numFmt"){const m=p.attributes["w:val"];if(m){s.set(u,m);break}}for(const p of l.children)if(p.type==="element"&&p.name==="w:start"){const m=p.attributes["w:val"];m&&t.set(u,parseInt(m,10));break}break}}for(const c of a.children)if(c.type==="element"&&c.name==="w:num"){const f=c,u=f.attributes["w:numId"];for(const l of f.children)if(l.type==="element"&&l.name==="w:abstractNumId"){const p=l.attributes["w:val"],m=s.get(p);if(m){const d=t.get(p);m==="bullet"?n.set(u,{type:"bullet"}):n.set(u,{type:"ordered",...d!==void 0&&{start:d}})}break}}break}}return n}function Y(o){const n=new Map,t=o["word/_rels/document.xml.rels"];if(!t)return n;const r=g(new TextDecoder().decode(t));if(r.type==="root"){for(const e of r.children)if(e.type==="element"&&e.name==="Relationships"){const s=e;for(const i of s.children)if(i.type==="element"&&i.name==="Relationship"){const a=i,c=a.attributes.Type;if(c&&c==="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"){const f=a.attributes.Id,u=a.attributes.Target;if(f&&u){const l="word/"+u,p=o[l];p&&n.set(f,p)}}}break}}return n}function Z(o){const n=new Map,t=o["word/_rels/document.xml.rels"];if(!t)return n;const r=g(new TextDecoder().decode(t));if(r.type==="root"){for(const e of r.children)if(e.type==="element"&&e.name==="Relationships"){const s=e;for(const i of s.children)if(i.type==="element"&&i.name==="Relationship"){const a=i,c=a.attributes.Type;if(c&&c==="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"){const f=a.attributes.Id,u=a.attributes.Target;f&&u&&n.set(f,u)}}break}}return n}function tt(o,n,t,r,e){if(o.type!=="root")return{type:"doc",content:[]};for(const s of o.children)if(s.type==="element"&&s.name==="w:document"){const i=s;for(const a of i.children)if(a.type==="element"&&a.name==="w:body")return{type:"doc",content:et(a.children.filter(c=>c.type==="element"),n,t,r,e)};break}return{type:"doc",content:[]}}function et(o,n,t,r,e){const s=[];let i=0;for(;i<o.length;){const a=o[i];if(a.name==="w:tbl"){s.push(S(a,t,n)),i++,i<o.length&&o[i].name==="w:p"&&N(o[i])&&i++;continue}if(a.name==="w:p"){if(e&&N(a)){i++;continue}if(T(a)){const c=nt(o,i);s.push(...c),i+=c.length;continue}if(I(a)){const c=it(o,i);s.push(...c),i+=st(o,i);continue}if(b(a)){const c=rt(o,i,n,t,r);s.push(...c),i+=ot(o,i);continue}if(F(a)){s.push({type:"horizontalRule"}),i++;continue}s.push(v(a,t,n)),i++;continue}i++}return s}function nt(o,n){const t=[];let r=n;for(;r<o.length;){const e=o[r];if(e.name!=="w:p"||!T(e))break;const s=R(e),i={type:"codeBlock",...s&&{attrs:{language:s}},content:ct(e)};t.push(i),r++}return t}function rt(o,n,t,r,e){const s=[];let i=n;for(;i<o.length;){const a=o[i];if(a.name!=="w:p"||!b(a))break;const c=C(a);if(!c)break;const f=e.get(c.numId),u=f?.type||"bullet",l=[];for(;i<o.length;){const m=o[i];if(m.name!=="w:p"||!b(m))break;const d=C(m);if(!d||d.numId!==c.numId)break;const w={type:"listItem",content:[v(m,r,t)]};l.push(w),i++}const p={type:u==="bullet"?"bulletList":"orderedList",content:l};u==="ordered"&&(p.attrs={type:null,...f?.start!==void 0&&{start:f.start}}),s.push(p)}return s}function ot(o,n){let t=0,r=n;for(;r<o.length;){const e=o[r];if(e.name!=="w:p"||!b(e))break;t++,r++}return t}function it(o,n){const t=[];let r=n;for(;r<o.length;){const e=o[r];if(e.name!=="w:p"||!I(e))break;const s=L(e);t.push(s),r++}return[{type:"taskList",content:t}]}function st(o,n){let t=0,r=n;for(;r<o.length;){const e=o[r];if(e.name!=="w:p"||!I(e))break;t++,r++}return t}function ct(o){const n=[];for(const t of o.children){if(t.type!=="element"||t.name!=="w:r")continue;const r=t;for(const e of r.children)if(e.type==="element"&&e.name==="w:t"){const s=e.children.find(i=>i.type==="text");s&&"value"in s&&n.push({type:"text",text:s.value})}}return n}function N(o){for(const n of o.children){if(n.type!=="element"||n.name!=="w:r")continue;const t=n;for(const r of t.children)if(r.type==="element"&&r.name==="w:t"){const e=r.children.find(s=>s.type==="text");if(e&&"value"in e&&e.value.trim().length>0)return!1}}return!0}export{v as convertParagraph,S as convertTable,L as convertTaskItem,j as defaultImageConverter,B as extractAlignment,k as extractMarks,P as extractRuns,R as getCodeBlockLanguage,C as getListInfo,D as getTaskItemChecked,T as isCodeBlock,F as isHorizontalRule,b as isListItem,O as isTable,I as isTaskItem,Q as parseDOCX};
1
+ import{fromXml as W}from"xast-util-from-xml";import{unzipSync as et}from"fflate";import{toUint8Array as nt}from"undio";import{imageMeta as F}from"image-meta";function L(o,e,t){const n=[];for(const r of e.children)if(r.type==="element"){if(r.name==="w:hyperlink"){const s=r,i=s.attributes["r:id"],a=o.get(i);if(a){for(const c of s.children)if(c.type==="element"&&c.name==="w:r"){const f=c;let l=h(f,"w:drawing");if(!l){const w=h(f,"mc:AlternateContent");if(w){const y=h(w,"mc:Choice");y&&(l=h(y,"w:drawing"))}}if(l){const w=j(l,t);if(w){n.push(w);continue}const y=_(l,t);if(n.push(...y),y.length>0)continue}const u=h(f,"w:t");if(!u)continue;const p=u.children.find(w=>w.type==="text");if(!p||!p.value)continue;const d=S(f);d.push({type:"link",attrs:{href:a}});const m={type:"text",text:p.value};d.length>0&&(m.marks=d),n.push(m)}}continue}if(r.name==="w:r"){const s=r;let i=h(s,"w:drawing");if(!i){const u=h(s,"mc:AlternateContent");if(u){const p=h(u,"mc:Choice");p&&(i=h(p,"w:drawing"))}}if(i){const u=_(i,t);if(n.push(...u),u.length>0)continue}if(h(s,"w:br")){const u=S(s),p={type:"hardBreak"};u.length>0&&(p.marks=u),n.push(p)}const a=h(s,"w:t");if(!a)continue;const c=a.children.find(u=>u.type==="text");if(!c||!c.value)continue;const f=S(s),l={type:"text",text:c.value};f.length>0&&(l.marks=f),n.push(l)}}return n}function S(o){const e=[],t=h(o,"w:rPr");if(!t)return e;h(t,"w:b")&&e.push({type:"bold"}),h(t,"w:i")&&e.push({type:"italic"}),h(t,"w:u")&&e.push({type:"underline"}),h(t,"w:strike")&&e.push({type:"strike"}),h(t,"w:highlight")&&e.push({type:"highlight"});const n=h(t,"w:vertAlign");if(n){const c=n.attributes["w:val"];c==="subscript"?e.push({type:"subscript"}):c==="superscript"&&e.push({type:"superscript"})}const r=h(t,"w:color"),s=h(t,"w:shd"),i=h(t,"w:sz"),a=h(t,"w:rFonts");if(r||s||i||a){const c={color:"",backgroundColor:"",fontSize:"",fontFamily:"",lineHeight:""};if(r&&r.attributes["w:val"]){const f=r.attributes["w:val"];if(f!=="auto"){const l=f.startsWith("#")?f:`#${f}`;c.color=l}}if(s&&s.attributes["w:fill"]){const f=s.attributes["w:fill"];if(f!=="auto"){const l=f.startsWith("#")?f:`#${f}`;c.backgroundColor=l}}if(i&&i.attributes["w:val"]){const f=i.attributes["w:val"],l=parseFloat(f);if(!isNaN(l)){const u=Math.round(l/1.5*10)/10;c.fontSize=`${u}px`}}a&&a.attributes["w:ascii"]&&(c.fontFamily=a.attributes["w:ascii"]),e.push({type:"textStyle",attrs:c})}return e}function O(o){const e=h(o,"w:pPr");if(!e)return;const t=h(e,"w:jc");if(!t?.attributes["w:val"])return;const n=t.attributes["w:val"],r={left:"left",right:"right",center:"center",both:"justify"}[n];return r?{textAlign:r}:void 0}function j(o,e){const t=P(o,"a:blip");if(!t?.attributes["r:embed"])return null;const n=t.attributes["r:embed"],r=e.get(n);if(!r)return null;const s=P(o,"wp:extent");let i,a;if(s){const l=s.attributes.cx,u=s.attributes.cy;if(typeof l=="string"){const p=parseInt(l,10);isNaN(p)||(i=Math.round(p/9525))}if(typeof u=="string"){const p=parseInt(u,10);isNaN(p)||(a=Math.round(p/9525))}}const c=P(o,"wp:docPr");let f;if(c){const l=c.attributes.title;typeof l=="string"&&l&&(f=l)}return{type:"image",attrs:{src:r,alt:"",...i!==void 0&&{width:i},...a!==void 0&&{height:a},...f!==void 0&&{title:f}}}}function _(o,e){const t=[],n=h(o,"wp:inline")||h(o,"wp:anchor");if(!n)return t;const r=h(n,"wp:extent");let s,i;if(r){const l=r.attributes.cx,u=r.attributes.cy;if(typeof l=="string"){const p=parseInt(l,10);isNaN(p)||(s=Math.round(p/9525))}if(typeof u=="string"){const p=parseInt(u,10);isNaN(p)||(i=Math.round(p/9525))}}const a=h(n,"a:graphic");if(!a)return t;const c=h(a,"a:graphicData");if(!c)return t;const f=h(c,"wpg:wgp");if(f){const l=h(f,"wpg:grpSp");let u=[];if(l){const p=T(l,"pic:pic"),d=T(l,"pic");u=[...p,...d]}else{const p=T(f,"pic:pic"),d=T(f,"pic");u=[...p,...d]}for(const p of u){const d=h(p,"a:graphic");if(!d){const w=h(p,"pic:blipFill")||P(p,"a:blipFill");if(w){const y=h(w,"a:blip")||P(w,"a:blip");if(y&&y.attributes["r:embed"]){const x=y.attributes["r:embed"],g=e.get(x);if(g){let b=s,I=i;if(g&&s&&i)try{let v,k;if(g.startsWith("data:")){const C=g.split(",")[1];if(C){const M=atob(C),U=new Uint8Array(M.length);for(let N=0;N<M.length;N++)U[N]=M.charCodeAt(N);const X=F(U);v=X.width,k=X.height}}if(v&&k){const C=v/k,M=s/i;Math.abs(C-M)>.1&&(C>M?(b=s,I=Math.round(s/C)):(I=i,b=Math.round(i*C)))}}catch(v){console.warn("Failed to extract image metadata for aspect ratio:",v)}t.push({type:"image",attrs:{src:g,alt:"",...b!==void 0&&{width:b},...I!==void 0&&{height:I}}})}}}continue}const m=j({children:[d]},e);if(m){if(s!==void 0&&i!==void 0&&m.attrs.src)try{const w=m.attrs.src;let y,x;if(w.startsWith("data:")){const g=w.split(",")[1];if(g){const b=atob(g),I=new Uint8Array(b.length);for(let k=0;k<b.length;k++)I[k]=b.charCodeAt(k);const v=F(I);y=v.width,x=v.height}}if(y&&x){const g=y/x,b=s/i;Math.abs(g-b)>.1?g>b?(m.attrs.width=s,m.attrs.height=Math.round(s/g)):(m.attrs.height=i,m.attrs.width=Math.round(i*g)):(m.attrs.width=s,m.attrs.height=i)}else m.attrs.width=s,m.attrs.height=i}catch{m.attrs.width=s,m.attrs.height=i}t.push(m)}}}else{const l=j(o,e);l&&t.push(l)}return t}function h(o,e){for(const t of o.children)if(t.type==="element"&&t.name===e)return t}function P(o,e){for(const t of o.children)if(t.type==="element"&&t.name===e)return t;for(const t of o.children)if(t.type==="element"){const n=P(t,e);if(n)return n}}function T(o,e){const t=[];for(const n of o.children)n.type==="element"&&n.name===e&&t.push(n);for(const n of o.children)n.type==="element"&&t.push(...T(n,e));return t}function R(o,e,t){let n;for(const i of o.children)if(i.type==="element"&&i.name==="w:pPr"){const a=i;for(const c of a.children)if(c.type==="element"&&c.name==="w:pStyle"){n=c.attributes["w:val"];break}break}if(n){const i=n.match(/^Heading(\d)$/);if(i){const a=parseInt(i[1]);return rt(o,e,a,t)}}const r=L(e,o,t);if(r.length===1&&r[0].type==="hardBreak"){for(const i of o.children)if(i.type==="element"&&i.name==="w:r"){for(const a of i.children)if(a.type==="element"&&a.name==="w:br"&&a.attributes["w:type"]==="page")return{type:"horizontalRule"}}}if(r.length===1&&r[0].type==="image")return r[0];const s=O(o);return{type:"paragraph",...s&&{attrs:s},content:r}}function rt(o,e,t,n){return{type:"heading",attrs:{level:t},content:L(e,o,n)}}function $(o){const e=A(o,"w:pPr");return e?!!A(e,"w:numPr"):!1}function E(o){const e=A(o,"w:pPr");if(!e)return null;const t=A(e,"w:numPr");if(!t)return null;const n=A(t,"w:ilvl"),r=A(t,"w:numId");return!n||!r?null:{numId:r.attributes["w:val"],level:parseInt(n.attributes["w:val"]||"0")}}function A(o,e){for(const t of o.children)if(t.type==="element"&&t.name===e)return t}function H(o){const e=B(o,"w:pPr");if(!e)return!1;const t=B(e,"w:pStyle");if(!t)return!1;const n=t.attributes["w:val"];return n==="CodeBlock"||n?.startsWith("Code")}function J(o){const e=B(o,"w:pPr");if(!e)return;const t=B(e,"w:pStyle");if(!t)return;const n=t.attributes["w:val"];if(n?.startsWith("CodeBlock"))return n.replace("CodeBlock","").toLowerCase()||void 0}function B(o,e){for(const t of o.children)if(t.type==="element"&&t.name===e)return t}function ot(o){return o.name==="w:tbl"}function q(o,e,t){const n=[],r=[];for(const i of o.children)i.type==="element"&&i.name==="w:tr"&&r.push(i);const s=new Map;return r.forEach((i,a)=>{n.push(it(i,a===0,e,t,s,r,a))}),{type:"table",content:n}}function it(o,e,t,n,r,s,i){const a=[];let c=0;for(const f of o.children)if(f.type==="element"&&f.name==="w:tc"){const l=r.get(c);if(l&&l>0){r.set(c,l-1),c++;continue}let u=G(f);if(u&&u.rowspan===1){const m=st(s,i,c);m>1&&(u={...u,rowspan:m})}if(u&&u.rowspan>1&&r.set(c,u.rowspan-1),u&&u.rowspan===0){c++;continue}const p="tableCell",d=ct(f,t,n);a.push({type:p,...u&&{attrs:u},content:[d]}),c+=u?.colspan||1}return{type:"tableRow",content:a}}function G(o){const e={colspan:1,rowspan:1,colwidth:null};let t;for(const n of o.children)if(n.type==="element"&&n.name==="w:tcPr"){t=n;break}if(!t)return e;for(const n of t.children)if(n.type==="element"&&n.name==="w:gridSpan"){const r=n.attributes["w:val"];r&&(e.colspan=parseInt(r));break}for(const n of t.children)if(n.type==="element"&&n.name==="w:vMerge"){n.attributes["w:val"]==="continue"&&(e.rowspan=0);break}for(const n of t.children)if(n.type==="element"&&n.name==="w:tcW"){const r=n.attributes["w:w"];r&&(e.colwidth=parseInt(r));break}return e}function st(o,e,t){let n=1,r=t;for(let s=e+1;s<o.length;s++){const i=o[s];let a=!1;for(const c of i.children)if(c.type==="element"&&c.name==="w:tc"){const f=G(c),l=f?.colspan||1;if(r>=0&&r<l){if(f?.rowspan===0)n++,a=!0;else return n;break}r-=l}if(!a)break}return n}function ct(o,e,t){const n=[];for(const r of o.children)if(r.type==="element"&&r.name==="w:p"){const s=R(r,e,t);n.push(s)}return n[0]||{type:"paragraph",content:[]}}function D(o){for(const e of o.children)if(e.type==="element"&&e.name==="w:r"){for(const t of e.children)if(t.type==="element"&&t.name==="w:t"){const n=t.children.find(r=>r.type==="text");if(n&&"value"in n){const r=n.value;return r.startsWith("\u2610")||r.startsWith("\u2611")}}break}return!1}function K(o){for(const e of o.children)if(e.type==="element"&&e.name==="w:r"){for(const t of e.children)if(t.type==="element"&&t.name==="w:t"){const n=t.children.find(r=>r.type==="text");if(n&&"value"in n)return n.value.startsWith("\u2611")}break}return!1}function Q(o){const e=K(o),t=at(o);return{type:"taskItem",attrs:{checked:e},content:[t]}}function at(o){const e=[];let t=!1;for(const r of o.children)if(r.type==="element"&&r.name==="w:r"){let s=!1;if(!t){for(const i of r.children)if(i.type==="element"&&i.name==="w:t"){const a=i.children.find(c=>c.type==="text");if(a&&"value"in a){const c=a.value;if(c.startsWith("\u2610")||c.startsWith("\u2611")){s=!0,t=!0;const f=c.substring(2).trimStart();f.length>0&&e.push({type:"text",text:f})}}}}if(!s){const i=ft(r);for(const a of r.children)if(a.type==="element"&&a.name==="w:t"){const c=a.children.find(f=>f.type==="text");if(c&&"value"in c){const f={type:"text",text:c.value};i.length>0&&(f.marks=i),e.push(f)}}}}const n=lt(o);return{type:"paragraph",...n&&{attrs:n},content:e.length>0?e:void 0}}function ft(o){const e=[];for(const t of o.children)if(t.type==="element"&&t.name==="w:rPr"){const n=t;for(const r of n.children)if(r.type==="element"&&r.name==="w:b"){e.push({type:"bold"});break}for(const r of n.children)if(r.type==="element"&&r.name==="w:i"){e.push({type:"italic"});break}for(const r of n.children)if(r.type==="element"&&r.name==="w:u"){e.push({type:"underline"});break}for(const r of n.children)if(r.type==="element"&&r.name==="w:strike"){e.push({type:"strike"});break}break}return e}function lt(o){for(const e of o.children)if(e.type==="element"&&e.name==="w:pPr"){const t=e;for(const n of t.children)if(n.type==="element"&&n.name==="w:jc"){const r=n.attributes["w:val"];if(r==="both")return{textAlign:"justify"};if(r==="center")return{textAlign:"center"};if(r==="right")return{textAlign:"right"};if(r==="left")return{textAlign:"left"}}}}function V(o){for(const e of o.children)if(e.type==="element"&&e.name==="w:r"){const t=e;let n=!1,r=!1;for(const s of t.children)if(s.type==="element")if(s.name==="w:br")s.attributes["w:type"]==="page"&&(n=!0);else if(s.name==="w:t"){const i=s.children.find(a=>a.type==="text");i&&"value"in i&&i.value&&i.value.trim().length>0&&(r=!0)}else s.name!=="w:rPr"&&(r=!0);if(n&&!r)return!0}return!1}const z="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";function Y(o){const e=o.length,t=Math.ceil(e/3)*4,n=Array.from({length:t});let r=0;for(let s=0;s<e;s+=3){const i=o[s],a=s+1<e?o[s+1]:0,c=s+2<e?o[s+2]:0,f=i>>2,l=(i&3)<<4|a>>4,u=(a&15)<<2|c>>6,p=c&63;n[r++]=z[f],n[r++]=z[l],n[r++]=s+1<e?z[u]:"=",n[r++]=s+2<e?z[p]:"="}return n.join("")}const Z=async o=>{const e=Y(o.data);return{src:`data:${o.contentType};base64,${e}`}};async function ut(o,e={}){const{convertImage:t=Z,ignoreEmptyParagraphs:n=!1}=e,r=await nt(o),s=et(r),i=mt(s),a=ht(s),c=new Map;for(const[p,d]of a.entries())try{let m;try{m=`image/${F(d).type}`}catch{m="image/png"}const w=await t({id:p,contentType:m,data:d});c.set(p,w.src)}catch(m){console.warn(`Failed to convert image ${p}:`,m);let w="image/png";try{w=`image/${F(d).type}`}catch{}const y=Y(d),x=`data:${w};base64,${y}`;c.set(p,x)}const f=s["word/document.xml"];if(!f)throw new Error("Invalid DOCX file: missing word/document.xml");const l=W(new TextDecoder().decode(f)),u=pt(s);return dt(l,c,i,u,n)}function pt(o){const e=new Map,t=new Map,n=o["word/numbering.xml"];if(!n)return e;const r=W(new TextDecoder().decode(n)),s=new Map;if(r.type==="root"){for(const i of r.children)if(i.type==="element"&&i.name==="w:numbering"){const a=i;for(const c of a.children)if(c.type==="element"&&c.name==="w:abstractNum"){const f=c,l=f.attributes["w:abstractNumId"];for(const u of f.children)if(u.type==="element"&&u.name==="w:lvl"){for(const p of u.children)if(p.type==="element"&&p.name==="w:numFmt"){const d=p.attributes["w:val"];if(d){s.set(l,d);break}}for(const p of u.children)if(p.type==="element"&&p.name==="w:start"){const d=p.attributes["w:val"];d&&t.set(l,parseInt(d,10));break}break}}for(const c of a.children)if(c.type==="element"&&c.name==="w:num"){const f=c,l=f.attributes["w:numId"];for(const u of f.children)if(u.type==="element"&&u.name==="w:abstractNumId"){const p=u.attributes["w:val"],d=s.get(p);if(d){const m=t.get(p);d==="bullet"?e.set(l,{type:"bullet"}):e.set(l,{type:"ordered",...m!==void 0&&{start:m}})}break}}break}}return e}function ht(o){const e=new Map,t=o["word/_rels/document.xml.rels"];if(!t)return e;const n=W(new TextDecoder().decode(t));if(n.type==="root"){for(const r of n.children)if(r.type==="element"&&r.name==="Relationships"){const s=r;for(const i of s.children)if(i.type==="element"&&i.name==="Relationship"){const a=i,c=a.attributes.Type;if(c&&c==="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"){const f=a.attributes.Id,l=a.attributes.Target;if(f&&l){const u="word/"+l,p=o[u];p&&e.set(f,p)}}}break}}return e}function mt(o){const e=new Map,t=o["word/_rels/document.xml.rels"];if(!t)return e;const n=W(new TextDecoder().decode(t));if(n.type==="root"){for(const r of n.children)if(r.type==="element"&&r.name==="Relationships"){const s=r;for(const i of s.children)if(i.type==="element"&&i.name==="Relationship"){const a=i,c=a.attributes.Type;if(c&&c==="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"){const f=a.attributes.Id,l=a.attributes.Target;f&&l&&e.set(f,l)}}break}}return e}function dt(o,e,t,n,r){if(o.type!=="root")return{type:"doc",content:[]};for(const s of o.children)if(s.type==="element"&&s.name==="w:document"){const i=s;for(const a of i.children)if(a.type==="element"&&a.name==="w:body")return{type:"doc",content:wt(a.children.filter(c=>c.type==="element"),e,t,n,r)};break}return{type:"doc",content:[]}}function wt(o,e,t,n,r){const s=[];let i=0;for(;i<o.length;){const a=o[i];if(a.name==="w:tbl"){s.push(q(a,t,e)),i++,i<o.length&&o[i].name==="w:p"&&tt(o[i])&&i++;continue}if(a.name==="w:p"){if(r&&tt(a)){i++;continue}if(H(a)){const c=yt(o,i);s.push(...c),i+=c.length;continue}if(D(a)){const c=vt(o,i);s.push(...c),i+=kt(o,i);continue}if($(a)){const c=gt(o,i,e,t,n);s.push(...c),i+=bt(o,i);continue}if(V(a)){s.push({type:"horizontalRule"}),i++;continue}s.push(R(a,t,e)),i++;continue}i++}return s}function yt(o,e){const t=[];let n=e;for(;n<o.length;){const r=o[n];if(r.name!=="w:p"||!H(r))break;const s=J(r),i={type:"codeBlock",...s&&{attrs:{language:s}},content:xt(r)};t.push(i),n++}return t}function gt(o,e,t,n,r){const s=[];let i=e;for(;i<o.length;){const a=o[i];if(a.name!=="w:p"||!$(a))break;const c=E(a);if(!c)break;const f=r.get(c.numId),l=f?.type||"bullet",u=[];for(;i<o.length;){const d=o[i];if(d.name!=="w:p"||!$(d))break;const m=E(d);if(!m||m.numId!==c.numId)break;const w={type:"listItem",content:[R(d,n,t)]};u.push(w),i++}const p={type:l==="bullet"?"bulletList":"orderedList",content:u};l==="ordered"&&(p.attrs={type:null,...f?.start!==void 0&&{start:f.start}}),s.push(p)}return s}function bt(o,e){let t=0,n=e;for(;n<o.length;){const r=o[n];if(r.name!=="w:p"||!$(r))break;t++,n++}return t}function vt(o,e){const t=[];let n=e;for(;n<o.length;){const r=o[n];if(r.name!=="w:p"||!D(r))break;const s=Q(r);t.push(s),n++}return[{type:"taskList",content:t}]}function kt(o,e){let t=0,n=e;for(;n<o.length;){const r=o[n];if(r.name!=="w:p"||!D(r))break;t++,n++}return t}function xt(o){const e=[];for(const t of o.children){if(t.type!=="element"||t.name!=="w:r")continue;const n=t;for(const r of n.children)if(r.type==="element"&&r.name==="w:t"){const s=r.children.find(i=>i.type==="text");s&&"value"in s&&e.push({type:"text",text:s.value})}}return e}function tt(o){for(const e of o.children){if(e.type!=="element"||e.name!=="w:r")continue;const t=e;for(const n of t.children)if(n.type==="element"&&n.name==="w:t"){const r=n.children.find(s=>s.type==="text");if(r&&"value"in r&&r.value.trim().length>0)return!1}}return!0}export{R as convertParagraph,q as convertTable,Q as convertTaskItem,Z as defaultImageConverter,O as extractAlignment,S as extractMarks,L as extractRuns,J as getCodeBlockLanguage,E as getListInfo,K as getTaskItemChecked,H as isCodeBlock,V as isHorizontalRule,$ as isListItem,ot as isTable,D as isTaskItem,ut as parseDOCX};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@docen/import-docx",
3
- "version": "0.0.0",
3
+ "version": "0.0.1",
4
4
  "description": "A powerful TipTap/ProseMirror extension that imports Microsoft Word DOCX files to editor content",
5
5
  "keywords": [
6
6
  "converter",
@@ -45,13 +45,14 @@
45
45
  },
46
46
  "dependencies": {
47
47
  "fflate": "0.8.2",
48
+ "image-meta": "0.2.2",
48
49
  "undio": "0.2.0",
49
50
  "xast-util-from-xml": "4.0.0"
50
51
  },
51
52
  "devDependencies": {
52
53
  "@tiptap/core": "3.7.2",
53
54
  "@types/xast": "2.0.4",
54
- "@docen/tiptap-extensions": "0.0.0"
55
+ "@docen/tiptap-extensions": "0.0.1"
55
56
  },
56
57
  "scripts": {
57
58
  "dev": "unbuild --stub",