npm - @dragon708/docmind-browser - Versions diffs - 1.2.0 → 1.4.0 - Mend

@dragon708/docmind-browser 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts CHANGED

@@ -1,7 +1,16 @@
 import { DocMindAnalyzeOptions, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
 export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
 import { OcrOptions } from '@dragon708/docmind-ocr';
+import { AnalyzeDocxIncludeFlags, DocxToHtmlOptions } from '@dragon708/docmind-docx';
+export { AnalyzeDocxIncludeFlags } from '@dragon708/docmind-docx';
+/**
+ * DOCX options for the browser facade (Mammoth + v2 includes from `@dragon708/docmind-docx`; no Node-only APIs).
+ */
+interface BrowserAnalyzeDocxOptionsSlice {
+    readonly include?: AnalyzeDocxIncludeFlags;
+    readonly html?: DocxToHtmlOptions;
+}
 /**
  * OCR behavior for browser intents that touch raster images.
  * - `off`: do not invoke Tesseract; text stays empty with an explanatory warning.
@@ -20,6 +29,8 @@ interface BrowserOcrOptions extends OcrOptions {
 interface BrowserAnalyzeOptions extends DocMindAnalyzeOptions {
     /** Image OCR only; no PDF in this runtime. See {@link BrowserOcrOptions.mode}. */
     readonly ocr?: BrowserOcrOptions;
+    /** DOCX only: see {@link BrowserAnalyzeDocxOptionsSlice}. */
+    readonly docx?: BrowserAnalyzeDocxOptionsSlice;
 }
 /**
@@ -60,6 +71,43 @@ declare function convertToHtml(input: BrowserAnalyzeInput, options?: BrowserAnal
  */
 declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
+/** High-level features the user can ask DocMind for (per input kind and runtime). */
+type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
+declare function docxIncludeRequested(flags?: AnalyzeDocxIncludeFlags): boolean;
+/** DOCX `word/media` in the browser runtime (JSZip; no Node pipeline). */
+interface DocxEmbeddedImageCapabilities {
+    readonly canExtractEmbeddedImages: true;
+    readonly documentsMayIncludeImagesRequiringWebConversion: true;
+    /** The browser build bundles no EMF/WMF-to-PNG conversion; `convertDocxEmbeddedImageToWeb` returns the original bytes plus warnings. */
+    readonly webFriendlyRasterConversionInBrowser: false;
+    readonly notes: readonly string[];
+}
+declare const DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER: DocxEmbeddedImageCapabilities;
+interface DocxStructuralCapabilities {
+    readonly ooxmlExtractorsAvailable: true;
+    readonly activatedViaDocxInclude: true;
+    readonly features: readonly string[];
+    readonly notes: readonly string[];
+}
+declare const DOCX_STRUCTURE_CAPABILITIES_BROWSER: DocxStructuralCapabilities;
+/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
+interface PublicCapabilitySupport {
+    readonly id: PublicCapabilityId;
+    readonly supported: boolean;
+    readonly warnings?: readonly string[];
+}
+/**
+ * Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
+ */
+interface GetCapabilitiesReport {
+    readonly kind: FileKind;
+    readonly runtime: RuntimeDescriptor;
+    readonly capabilities: readonly PublicCapabilitySupport[];
+    readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
+    readonly docxStructure?: DocxStructuralCapabilities;
+    readonly warnings?: readonly string[];
+}
 /**
  * Whether DocMind will try a non-OCR text/HTML path (e.g. Mammoth, UTF-8, PDF text layer — when available).
  */
@@ -91,40 +139,24 @@ interface ExplainAnalysisPlanReport {
     readonly limitations: readonly string[];
     /** Ordered pipeline steps (planned/skipped/done metadata only). */
     readonly plan: ProcessingPlanDescriptor;
+    readonly docxEmbeddedImages?: DocxEmbeddedImageCapabilities;
+    readonly docxStructure?: DocxStructuralCapabilities;
     readonly warnings?: readonly string[];
 }
-/** High-level features the user can ask DocMind for (per input kind and runtime). */
-type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
-/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
-interface PublicCapabilitySupport {
-    readonly id: PublicCapabilityId;
-    readonly supported: boolean;
-    readonly warnings?: readonly string[];
-}
-/**
- * Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
- */
-interface GetCapabilitiesReport {
-    readonly kind: FileKind;
-    readonly runtime: RuntimeDescriptor;
-    readonly capabilities: readonly PublicCapabilitySupport[];
-    readonly warnings?: readonly string[];
-}
-/** Options for {@link explainAnalysisPlan}: shared fields plus optional `ocr` for accurate OCR-step preview. */
-type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<BrowserAnalyzeOptions, "ocr">;
+/** Options for {@link explainAnalysisPlan}: shared fields plus optional `ocr` / `docx` for accurate step preview. */
+type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<BrowserAnalyzeOptions, "ocr" | "docx">;
 /**
  * Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
  * `text` | `metadata` | `html` | `ocr` | `pages` apply in the browser (PDF always unsupported).
- * No Mammoth/Tesseract/PDF parsing.
+ * No Mammoth/Tesseract/PDF parsing. For DOCX, {@link GetCapabilitiesReport.docxStructure} / `docxEmbeddedImages` describe v2 opt-in features.
  */
 declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
 /**
  * Epic 1 — **Plan preview:** structured explanation (analyzer, native extraction vs OCR, `limitations`, `plan.steps`)
- * for a {@link DocMindPublicIntent}. Optional `ocr` in options refines image steps. No heavy I/O.
+ * for a {@link DocMindPublicIntent}. Optional `ocr` refines image steps; optional `docx.include` adds planned OOXML parallel steps for DOCX. No heavy I/O.
  */
 declare function explainAnalysisPlan(input: BrowserAnalyzeInput, options?: BrowserExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
-export { BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserOcrMode, type BrowserOcrOptions, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
+export { BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeDocxOptionsSlice, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserOcrMode, type BrowserOcrOptions, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, type DocxEmbeddedImageCapabilities, type DocxStructuralCapabilities, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };

package/dist/index.js CHANGED

@@ -22,7 +22,41 @@ function prepareBrowserAnalyzeInput(input) {
   assertValidAnalyzeFileInput(input);
   return input;
 }
-async function analyzeDocxForBrowser(input, signal) {
+// src/docxBrowserMapper.ts
+function analyzeDocxOptionsFromBrowser(options) {
+  const sig = options?.signal;
+  const dx = options?.docx;
+  if (!dx?.include && !dx?.html && !sig) return void 0;
+  const out = { ...dx?.html ?? {} };
+  if (dx?.include) out.include = dx.include;
+  if (sig) out.signal = sig;
+  return out;
+}
+function docxPackageResultToAnalysisResult(r) {
+  const base = {
+    fileKind: "docx",
+    analyzer: "docx",
+    status: "ok",
+    kind: "docx",
+    text: r.text,
+    html: r.html,
+    warnings: [...r.warnings]
+  };
+  const v2 = {
+    ...r.structure !== void 0 ? { structure: r.structure } : {},
+    ...r.headings !== void 0 ? { headings: r.headings } : {},
+    ...r.tables !== void 0 ? { tables: r.tables } : {},
+    ...r.blocks !== void 0 ? { blocks: r.blocks } : {},
+    ...r.pagesApprox !== void 0 ? { pagesApprox: r.pagesApprox } : {},
+    ...r.embeddedImages !== void 0 ? { embeddedImages: r.embeddedImages } : {}
+  };
+  return { ...base, ...v2 };
+}
+// src/analyzers/docx.ts
+async function analyzeDocxForBrowser(input, options) {
+  const signal = options?.signal;
   if (signal?.aborted) {
     const err = new Error("The operation was aborted");
     err.name = "AbortError";
@@ -40,16 +74,9 @@ async function analyzeDocxForBrowser(input, signal) {
       warnings: ["No document bytes were provided for analysis."]
     };
   }
-  const r = await analyzeDocx(data);
-  return {
-    fileKind: "docx",
-    analyzer: "docx",
-    status: "ok",
-    kind: "docx",
-    text: r.text,
-    html: r.html,
-    warnings: [...r.warnings]
-  };
+  const docxOpts = analyzeDocxOptionsFromBrowser(options);
+  const r = docxOpts !== void 0 ? await analyzeDocx(data, docxOpts) : await analyzeDocx(data);
+  return docxPackageResultToAnalysisResult(r);
 }
 var OCR_OFF_WARNING = 'OCR mode is "off"; no recognition was run. Use mode "auto" or "force" to extract text from images.';
 function resolveOcrMode(options) {
@@ -120,7 +147,7 @@ async function analyzeFile(input, options) {
     case "pdf":
       return notImplementedResult("pdf", "pdf", [BROWSER_PDF_UNSUPPORTED_WARNING]);
     case "docx":
-      return analyzeDocxForBrowser(bytesInput, options?.signal);
+      return analyzeDocxForBrowser(bytesInput, options);
     case "image":
       return analyzeImageForBrowser(bytesInput, options);
     case "text":
@@ -129,7 +156,7 @@ async function analyzeFile(input, options) {
       return notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
   }
 }
-var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.";
+var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not returned by extractMetadata; use analyzeFile, extractText, or convertToHtml with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.";
 var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
 function escapeHtmlMinimal(s) {
   return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
@@ -243,19 +270,16 @@ async function runOcr(input, options) {
           warnings: ["No document bytes were provided for analysis."]
         };
       }
-      const r = await analyzeDocx(data);
-      return {
-        fileKind: "docx",
-        analyzer: "docx",
-        status: "ok",
-        kind: "docx",
-        text: r.text,
-        html: r.html,
+      const opt = analyzeDocxOptionsFromBrowser(options);
+      const raw = opt !== void 0 ? await analyzeDocx(data, opt) : await analyzeDocx(data);
+      const withNote = {
+        ...raw,
         warnings: [
-          ...r.warnings,
+          ...raw.warnings,
           "OCR does not apply to DOCX; returned structured text/HTML extract."
         ]
       };
+      return docxPackageResultToAnalysisResult(withNote);
     }
     case "text":
       return analyzeText(bytesInput, { signal });
@@ -264,11 +288,134 @@ async function runOcr(input, options) {
   }
 }
+// src/capabilityReport.ts
+function docxIncludeRequested(flags) {
+  if (!flags) return false;
+  return !!(flags.structure || flags.headings || flags.tables || flags.blocks || flags.pagesApprox || flags.embeddedImages);
+}
+var DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER = {
+  canExtractEmbeddedImages: true,
+  documentsMayIncludeImagesRequiringWebConversion: true,
+  webFriendlyRasterConversionInBrowser: false,
+  notes: [
+    "`extractImagesFromDocx` from `@dragon708/docmind-docx` runs in-browser on the same ZIP bytes as Mammoth.",
+    "PNG, JPEG, GIF, WebP, SVG, TIFF, BMP, ICO are browser-embeddable; EMF/WMF need an external converter or server-side Node tooling."
+  ]
+};
+var DOCX_STRUCTURE_CAPABILITIES_BROWSER = {
+  ooxmlExtractorsAvailable: true,
+  activatedViaDocxInclude: true,
+  features: [
+    "OOXML structure (body blocks)",
+    "headings",
+    "tables",
+    "semantic blocks",
+    "approximate pages (OOXML page-break hints)",
+    "embedded images (word/media; mode web/both still browser-safe but EMF/WMF stay non-raster without a converter)"
+  ],
+  notes: [
+    "Use options.docx.include on analyzeFile, extractText, convertToHtml, or runOcr to merge Mammoth with selected `@dragon708/docmind-docx` extractors.",
+    "extractMetadata for DOCX stays a stub in the browser facade."
+  ]
+};
+var DOCX_META = "Structured document metadata is not exposed separately in the browser runtime; extractMetadata returns a stub for DOCX.";
+var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
+var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
+var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
+var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
+var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
+function slot(id, supported, warnings) {
+  return warnings?.length ? { id, supported, warnings } : { id, supported };
+}
+function buildBrowserCapabilityReport(kind) {
+  const runtime = { id: "browser" };
+  const pdf = BROWSER_PDF_UNSUPPORTED_WARNING;
+  let capabilities;
+  const topWarnings = [];
+  switch (kind) {
+    case "pdf":
+      capabilities = [
+        slot("text", false, [pdf]),
+        slot("metadata", false, [pdf]),
+        slot("html", false, [pdf]),
+        slot("ocr", false, [pdf]),
+        slot("pages", false, [pdf])
+      ];
+      break;
+    case "docx":
+      capabilities = [
+        slot("text", true, [
+          "Mammoth plain text in analyzeFile; extractText clears html. Optional OOXML fields merge when options.docx.include is set."
+        ]),
+        slot("metadata", false, [
+          `${DOCX_META} Use analyzeFile-style routes with options.docx.include for OOXML structure, headings, tables, blocks, approximate pages, and embedded images.`
+        ]),
+        slot("html", true, [
+          "Mammoth HTML in-browser; docxImagesAsDataUri for web-safe images; EMF/WMF placeholders in HTML unless you handle media separately."
+        ]),
+        slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
+        slot("pages", false, [
+          "No PDF page count; approximate DOCX pages via options.docx.include.pagesApprox (heuristic, not print layout)."
+        ])
+      ];
+      break;
+    case "image":
+      capabilities = [
+        slot("text", true, ["Text is obtained via OCR when enabled."]),
+        slot("metadata", false, [IMAGE_META]),
+        slot("html", false, [IMAGE_HTML]),
+        slot("ocr", true, [OCR_OFF_NOTE]),
+        slot("pages", false)
+      ];
+      break;
+    case "text":
+      capabilities = [
+        slot("text", true),
+        slot("metadata", true, [TEXT_META_NOTE]),
+        slot("html", true),
+        slot("ocr", false, ["OCR does not apply to plain text files."]),
+        slot("pages", false)
+      ];
+      break;
+    default:
+      topWarnings.push(UNKNOWN_KIND);
+      capabilities = [
+        slot("text", false),
+        slot("metadata", false),
+        slot("html", false),
+        slot("ocr", false),
+        slot("pages", false)
+      ];
+  }
+  return {
+    kind,
+    runtime,
+    capabilities,
+    ...kind === "docx" ? {
+      docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER,
+      docxStructure: DOCX_STRUCTURE_CAPABILITIES_BROWSER
+    } : {},
+    warnings: topWarnings.length > 0 ? topWarnings : void 0
+  };
+}
 // src/analysisPlanReport.ts
 function lim(...items) {
   return items.filter(Boolean);
 }
-function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
+var DOCX_MAMMOTH_PLUS_OPTIONAL_BROWSER = "Mammoth (`analyzeDocx`) extracts text and HTML from OOXML in-browser; optional parallel OOXML/ZIP extractors run when options.docx.include is set.";
+var DOCX_ZIP_NOTE_BROWSER = "Embedded files under word/media are available via @dragon708/docmind-docx when options.docx.include requests embeddedImages (or call extractImagesFromDocx on the same bytes).";
+function finalizeBrowserDocxExplainReport(report) {
+  if (report.kind !== "docx") return report;
+  const limitations = report.limitations.includes(DOCX_ZIP_NOTE_BROWSER) ? report.limitations : [...report.limitations, DOCX_ZIP_NOTE_BROWSER];
+  return {
+    ...report,
+    docxEmbeddedImages: DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER,
+    docxStructure: DOCX_STRUCTURE_CAPABILITIES_BROWSER,
+    limitations
+  };
+}
+function buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInclude) {
   const runtime = { id: "browser" };
   const imageOcrActive = ocrMode !== "off";
   let primaryAnalyzer = kind === "pdf" ? "pdf" : kind === "docx" ? "docx" : kind === "image" ? "image" : kind === "text" ? "text" : "none";
@@ -286,7 +433,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
       mayUse: false,
       description: "PDF OCR is not available in the browser."
     };
-    return {
+    return finalizeBrowserDocxExplainReport({
       kind,
       detectedKind: kind,
       runtime,
@@ -297,7 +444,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
       limitations,
       plan,
       warnings: [BROWSER_PDF_UNSUPPORTED_WARNING]
-    };
+    });
   }
   if (kind === "unknown") {
     limitations = lim(
@@ -305,7 +452,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
     );
     nativeExtraction = { willAttempt: false, description: "No analyzer selected without a known file kind." };
     ocr2 = { mayUse: false, description: "OCR is not used for unknown kinds." };
-    return {
+    return finalizeBrowserDocxExplainReport({
       kind,
       detectedKind: kind,
       runtime,
@@ -315,7 +462,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
       ocr: ocr2,
       limitations,
       plan
-    };
+    });
   }
   switch (intent) {
     case "analyzeFile":
@@ -323,7 +470,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
       if (kind === "docx") {
         nativeExtraction = {
           willAttempt: true,
-          description: "Mammoth reads OOXML for text" + (intent === "extractText" ? " (HTML omitted in extractText)." : " and HTML.")
+          description: docxIncludeRequested(docxInclude) ? "Mammoth plus parallel OOXML extractors (per options.docx.include)." + (intent === "extractText" ? " HTML cleared in extractText." : "") : DOCX_MAMMOTH_PLUS_OPTIONAL_BROWSER + (intent === "extractText" ? " HTML omitted in extractText." : "")
         };
         ocr2 = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
       } else if (kind === "image") {
@@ -352,7 +499,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
         };
         ocr2 = { mayUse: false, description: "OCR is not invoked for this metadata path." };
         limitations = lim(
-          kind === "docx" ? "Structured DOCX metadata is not exposed separately in the browser." : "Raster images have no document metadata bundle in this API."
+          kind === "docx" ? "Structured DOCX metadata is not exposed separately in the browser; use analyzeFile / extractText / convertToHtml with options.docx.include for OOXML fields." : "Raster images have no document metadata bundle in this API."
         );
       } else {
         nativeExtraction = {
@@ -367,7 +514,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
       if (kind === "docx") {
         nativeExtraction = {
           willAttempt: true,
-          description: "Mammoth produces HTML; images are not passed through analyzeFile for this intent."
+          description: docxIncludeRequested(docxInclude) ? "Mammoth HTML via analyzeFile plus optional OOXML extractors." : "Mammoth HTML via analyzeFile; optional OOXML v2 when options.docx.include is set."
         };
         ocr2 = { mayUse: false, description: "DOCX path does not use OCR." };
       } else if (kind === "text") {
@@ -401,7 +548,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
       } else if (kind === "docx") {
         nativeExtraction = {
           willAttempt: true,
-          description: "Mammoth returns structured text/HTML (OCR does not apply to DOCX)."
+          description: docxIncludeRequested(docxInclude) ? "Mammoth text/HTML plus optional OOXML extractors; not OCR." : "Mammoth text/HTML; optional OOXML v2 via options.docx.include; not OCR."
         };
         ocr2 = { mayUse: false, description: "DOCX is not OCR'd." };
         limitations = lim("Returned content is structured extract, not OCR output.");
@@ -417,7 +564,7 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
       nativeExtraction = { willAttempt: false, description: "Intent not specialized in this runtime." };
       ocr2 = { mayUse: false, description: "See plan steps." };
   }
-  return {
+  return finalizeBrowserDocxExplainReport({
     kind,
     detectedKind: kind,
     runtime,
@@ -427,84 +574,14 @@ function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
     ocr: ocr2,
     limitations,
     plan
-  };
-}
-// src/capabilityReport.ts
-var DOCX_META = "Structured document metadata is not exposed separately in the browser runtime; extractMetadata returns a stub for DOCX.";
-var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
-var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
-var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
-var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
-var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
-function slot(id, supported, warnings) {
-  return warnings?.length ? { id, supported, warnings } : { id, supported };
-}
-function buildBrowserCapabilityReport(kind) {
-  const runtime = { id: "browser" };
-  const pdf = BROWSER_PDF_UNSUPPORTED_WARNING;
-  let capabilities;
-  const topWarnings = [];
-  switch (kind) {
-    case "pdf":
-      capabilities = [
-        slot("text", false, [pdf]),
-        slot("metadata", false, [pdf]),
-        slot("html", false, [pdf]),
-        slot("ocr", false, [pdf]),
-        slot("pages", false, [pdf])
-      ];
-      break;
-    case "docx":
-      capabilities = [
-        slot("text", true),
-        slot("metadata", false, [DOCX_META]),
-        slot("html", true),
-        slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
-        slot("pages", false)
-      ];
-      break;
-    case "image":
-      capabilities = [
-        slot("text", true, ["Text is obtained via OCR when enabled."]),
-        slot("metadata", false, [IMAGE_META]),
-        slot("html", false, [IMAGE_HTML]),
-        slot("ocr", true, [OCR_OFF_NOTE]),
-        slot("pages", false)
-      ];
-      break;
-    case "text":
-      capabilities = [
-        slot("text", true),
-        slot("metadata", true, [TEXT_META_NOTE]),
-        slot("html", true),
-        slot("ocr", false, ["OCR does not apply to plain text files."]),
-        slot("pages", false)
-      ];
-      break;
-    default:
-      topWarnings.push(UNKNOWN_KIND);
-      capabilities = [
-        slot("text", false),
-        slot("metadata", false),
-        slot("html", false),
-        slot("ocr", false),
-        slot("pages", false)
-      ];
-  }
-  return {
-    kind,
-    runtime,
-    capabilities,
-    warnings: topWarnings.length > 0 ? topWarnings : void 0
-  };
+  });
 }
 // src/introspection.ts
 function resolveOcrMode2(ocr2) {
   return ocr2?.mode ?? "auto";
 }
-function planForAnalyzeFile(kind, ocrMode) {
+function planForAnalyzeFile(kind, ocrMode, docxInclude) {
   switch (kind) {
     case "pdf":
       return {
@@ -514,14 +591,17 @@ function planForAnalyzeFile(kind, ocrMode) {
           { id: "pdf_pipeline", status: "skipped" }
         ]
       };
-    case "docx":
+    case "docx": {
+      const parallel = docxIncludeRequested(docxInclude);
       return {
         intent: "analyzeFile",
         steps: [
           { id: "detect_kind", status: "done" },
-          { id: "docx_mammoth", status: "planned" }
+          { id: "docx_mammoth", status: "planned" },
+          ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
         ]
       };
+    }
     case "image":
       return {
         intent: "analyzeFile",
@@ -548,11 +628,11 @@ function planForAnalyzeFile(kind, ocrMode) {
       };
   }
 }
-function planForIntent(intentOpt, kind, ocrMode) {
+function planForIntent(intentOpt, kind, ocrMode, docxInclude) {
   const intent = intentOpt ?? "analyzeFile";
-  if (intent === "analyzeFile") return planForAnalyzeFile(kind, ocrMode);
+  if (intent === "analyzeFile") return planForAnalyzeFile(kind, ocrMode, docxInclude);
   if (intent === "extractText") {
-    const base = planForAnalyzeFile(kind, ocrMode);
+    const base = planForAnalyzeFile(kind, ocrMode, docxInclude);
     return { ...base, intent: "extractText" };
   }
   if (intent === "extractMetadata") {
@@ -575,11 +655,13 @@ function planForIntent(intentOpt, kind, ocrMode) {
   }
   if (intent === "convertToHtml") {
     if (kind === "docx") {
+      const parallel = docxIncludeRequested(docxInclude);
       return {
         intent: "convertToHtml",
         steps: [
           { id: "detect_kind", status: "done" },
-          { id: "docx_mammoth_html", status: "planned" }
+          { id: "docx_mammoth_html", status: "planned" },
+          ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
         ]
       };
     }
@@ -612,11 +694,13 @@ function planForIntent(intentOpt, kind, ocrMode) {
       };
     }
     if (kind === "docx") {
+      const parallel = docxIncludeRequested(docxInclude);
       return {
         intent: "runOcr",
         steps: [
           { id: "detect_kind", status: "done" },
-          { id: "docx_structured_extract", status: "planned" }
+          { id: "docx_mammoth", status: "planned" },
+          ...parallel ? [{ id: "docx_ooxml_parallel", status: "planned" }] : []
         ]
       };
     }
@@ -628,7 +712,7 @@ function planForIntent(intentOpt, kind, ocrMode) {
       ]
     };
   }
-  return planForAnalyzeFile(kind, ocrMode);
+  return planForAnalyzeFile(kind, ocrMode, docxInclude);
 }
 async function getCapabilities(input, options) {
   throwIfAborted(options?.signal);
@@ -642,10 +726,11 @@ async function explainAnalysisPlan(input, options) {
   const kind = detectFileKind(input);
   const intent = options?.intent ?? "analyzeFile";
   const ocrMode = resolveOcrMode2(options?.ocr);
-  const plan = planForIntent(intent, kind, ocrMode);
-  return buildBrowserExplainReport(kind, intent, ocrMode, plan);
+  const docxInc = options?.docx?.include;
+  const plan = planForIntent(intent, kind, ocrMode, docxInc);
+  return buildBrowserExplainReport(kind, intent, ocrMode, plan, docxInc);
 }
-export { BROWSER_PDF_UNSUPPORTED_WARNING, analyzeFile, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
+export { BROWSER_PDF_UNSUPPORTED_WARNING, DOCX_EMBEDDED_IMAGE_CAPABILITIES_BROWSER, DOCX_STRUCTURE_CAPABILITIES_BROWSER, analyzeFile, convertToHtml, docxIncludeRequested, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
 //# sourceMappingURL=index.js.map
 //# sourceMappingURL=index.js.map

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@dragon708/docmind-browser",
-  "version": "1.2.0",
+  "version": "1.4.0",
   "description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
   "type": "module",
   "sideEffects": false,
@@ -33,7 +33,7 @@
   ],
   "license": "MIT",
   "dependencies": {
-    "@dragon708/docmind-docx": "^1.0.0",
+    "@dragon708/docmind-docx": "^1.7.0",
     "@dragon708/docmind-ocr": "^1.0.0",
     "@dragon708/docmind-shared": "^1.1.0"
   },