@dragon708/docmind-browser 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +105 -18
- package/dist/index.js +467 -185
- package/package.json +5 -4
- package/dist/index.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,43 +1,130 @@
|
|
|
1
|
-
import { AnalysisResult } from '@dragon708/docmind-shared';
|
|
2
|
-
export { AnalysisAnalyzer, AnalysisResult, DetectFileKindInput, DocxAnalysisCoreResult, FileKind, FileKindMetadata, GenericAnalysisResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
1
|
+
import { DocMindAnalyzeOptions, AnalysisResult, FileKind, RuntimeDescriptor, DocMindPublicIntent, AnalysisAnalyzer, ProcessingPlanDescriptor, ExplainAnalysisPlanOptions, GetCapabilitiesOptions } from '@dragon708/docmind-shared';
|
|
2
|
+
export { AnalysisAnalyzer, AnalysisResult, CapabilityDescriptor, DetectFileKindInput, DocMindPublicIntent, DocxAnalysisCoreResult, ExplainAnalysisPlanOptions, ExplainAnalysisPlanResult, FileKind, FileKindMetadata, GenericAnalysisResult, GetCapabilitiesOptions, GetCapabilitiesResult, ImageAnalysisCoreResult, PdfAnalysisCoreResult, TextAnalysisResult, detectFileKind } from '@dragon708/docmind-shared';
|
|
3
3
|
import { OcrOptions } from '@dragon708/docmind-ocr';
|
|
4
4
|
|
|
5
|
-
/**
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
5
|
+
/**
|
|
6
|
+
* OCR behavior for browser intents that touch raster images.
|
|
7
|
+
* - `off`: do not invoke Tesseract; text stays empty with an explanatory warning.
|
|
8
|
+
* - `auto` (default): run OCR when the input is classified as an image.
|
|
9
|
+
* - `force`: same as `auto` in the browser runtime (no PDF-style text layer to compare); reserved for parity with Node.
|
|
10
|
+
*/
|
|
11
|
+
type BrowserOcrMode = "off" | "auto" | "force";
|
|
12
|
+
/** Browser OCR options: Tesseract knobs from `@dragon708/docmind-ocr` plus optional {@link BrowserOcrMode}. */
|
|
13
|
+
interface BrowserOcrOptions extends OcrOptions {
|
|
14
|
+
readonly mode?: BrowserOcrMode;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Options for public browser methods (`analyzeFile`, intent APIs).
|
|
18
|
+
* There is no PDF pipeline in the browser; {@link BrowserOcrOptions.mode} applies to images only.
|
|
19
|
+
*/
|
|
20
|
+
interface BrowserAnalyzeOptions extends DocMindAnalyzeOptions {
|
|
21
|
+
/** Image OCR only; no PDF in this runtime. See {@link BrowserOcrOptions.mode}. */
|
|
22
|
+
readonly ocr?: BrowserOcrOptions;
|
|
9
23
|
}
|
|
10
24
|
|
|
11
|
-
/** PDF is not processed in the browser; use `@dragon708/docmind-node` on the server. */
|
|
12
|
-
declare const BROWSER_PDF_UNSUPPORTED_WARNING = "PDF text extraction is not available in the browser runtime; use @dragon708/docmind-node on the server.";
|
|
13
25
|
/**
|
|
14
26
|
* Inputs supported by the browser entry (DOM types only — no `fs`, no Node `Buffer` in the public surface).
|
|
15
27
|
*/
|
|
16
28
|
type BrowserAnalyzeInput = File | Blob | ArrayBuffer;
|
|
29
|
+
|
|
17
30
|
/**
|
|
18
|
-
* Browser
|
|
31
|
+
* Browser `analyzeFile` router. Package-level scope and limitations are documented on the package entry (`index.ts`).
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
/** PDF is not processed in the browser; use `@dragon708/docmind-node` on the server. */
|
|
35
|
+
declare const BROWSER_PDF_UNSUPPORTED_WARNING = "PDF text extraction is not available in the browser runtime; use @dragon708/docmind-node on the server.";
|
|
36
|
+
/**
|
|
37
|
+
* Full-document router: DOCX (text + HTML), images (OCR subject to {@link BrowserOcrOptions.mode}),
|
|
38
|
+
* plain text (UTF-8 decode). PDF yields `not_implemented` with {@link BROWSER_PDF_UNSUPPORTED_WARNING}.
|
|
19
39
|
*/
|
|
20
40
|
declare function analyzeFile(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
21
41
|
|
|
22
42
|
/**
|
|
23
|
-
*
|
|
24
|
-
*
|
|
43
|
+
* Full text extraction using the same document pipeline as {@link analyzeFile} (not ad-hoc DOCX/OCR calls).
|
|
44
|
+
* Plain-text inputs still decode via the shared text analyzer inside `analyzeFile`.
|
|
25
45
|
*/
|
|
26
46
|
declare function extractText(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
27
47
|
/**
|
|
28
|
-
*
|
|
29
|
-
*
|
|
48
|
+
* Metadata-oriented view. Does not run heavy extractors for DOCX/images (stubs only).
|
|
49
|
+
* Plain-text files use the same `analyzeFile` path as {@link extractText} for consistency.
|
|
30
50
|
*/
|
|
31
51
|
declare function extractMetadata(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
32
52
|
/**
|
|
33
|
-
* HTML
|
|
34
|
-
* PDF
|
|
53
|
+
* HTML-oriented view. Uses {@link analyzeFile} for DOCX and plain text only so raster images never trigger OCR.
|
|
54
|
+
* PDF and unknown kinds follow the same stubs as other intents.
|
|
35
55
|
*/
|
|
36
56
|
declare function convertToHtml(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
37
57
|
/**
|
|
38
|
-
* OCR
|
|
39
|
-
*
|
|
58
|
+
* OCR-focused intent. Honors {@link BrowserAnalyzeOptions.ocr} **mode** (`off` | `auto` | `force`) for images.
|
|
59
|
+
* DOCX returns structured extract with a notice (no OCR). Text decodes as UTF-8 (no OCR).
|
|
40
60
|
*/
|
|
41
61
|
declare function runOcr(input: BrowserAnalyzeInput, options?: BrowserAnalyzeOptions): Promise<AnalysisResult>;
|
|
42
62
|
|
|
43
|
-
|
|
63
|
+
/**
|
|
64
|
+
* Whether DocMind will try a non-OCR text/HTML path (e.g. Mammoth, UTF-8, PDF text layer — when available).
|
|
65
|
+
*/
|
|
66
|
+
interface NativeExtractionPlan {
|
|
67
|
+
readonly willAttempt: boolean;
|
|
68
|
+
readonly description: string;
|
|
69
|
+
}
|
|
70
|
+
/** Whether raster/PDF OCR could run for this intent + kind (subject to options). */
|
|
71
|
+
interface OcrPlan {
|
|
72
|
+
readonly mayUse: boolean;
|
|
73
|
+
readonly description: string;
|
|
74
|
+
}
|
|
75
|
+
/**
|
|
76
|
+
* Structured explanation of what DocMind would do for a public intent (no heavy I/O).
|
|
77
|
+
* Suitable for playgrounds and debug UIs.
|
|
78
|
+
*/
|
|
79
|
+
interface ExplainAnalysisPlanReport {
|
|
80
|
+
/** Detected {@link FileKind} from name/MIME/sniff hints. */
|
|
81
|
+
readonly kind: FileKind;
|
|
82
|
+
/** Same as {@link kind}; kept for symmetry with older `ExplainAnalysisPlanResult` consumers. */
|
|
83
|
+
readonly detectedKind: FileKind;
|
|
84
|
+
readonly runtime: RuntimeDescriptor;
|
|
85
|
+
readonly intent: DocMindPublicIntent | (string & {});
|
|
86
|
+
/** Router target analyzer for this kind (`none` when unknown or PDF in browser). */
|
|
87
|
+
readonly primaryAnalyzer: AnalysisAnalyzer;
|
|
88
|
+
readonly nativeExtraction: NativeExtractionPlan;
|
|
89
|
+
readonly ocr: OcrPlan;
|
|
90
|
+
/** User-facing caveats (PDF unsupported, unknown kind, OCR off, etc.). */
|
|
91
|
+
readonly limitations: readonly string[];
|
|
92
|
+
/** Ordered pipeline steps (planned/skipped/done metadata only). */
|
|
93
|
+
readonly plan: ProcessingPlanDescriptor;
|
|
94
|
+
readonly warnings?: readonly string[];
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/** High-level features the user can ask DocMind for (per input kind and runtime). */
|
|
98
|
+
type PublicCapabilityId = "text" | "metadata" | "html" | "ocr" | "pages";
|
|
99
|
+
/** Whether a {@link PublicCapabilityId} applies to the detected file in this runtime. */
|
|
100
|
+
interface PublicCapabilitySupport {
|
|
101
|
+
readonly id: PublicCapabilityId;
|
|
102
|
+
readonly supported: boolean;
|
|
103
|
+
readonly warnings?: readonly string[];
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Result of {@link getCapabilities}: detected kind, runtime id, per-feature support for this input, and optional global warnings.
|
|
107
|
+
*/
|
|
108
|
+
interface GetCapabilitiesReport {
|
|
109
|
+
readonly kind: FileKind;
|
|
110
|
+
readonly runtime: RuntimeDescriptor;
|
|
111
|
+
readonly capabilities: readonly PublicCapabilitySupport[];
|
|
112
|
+
readonly warnings?: readonly string[];
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
/** Options for {@link explainAnalysisPlan}: shared fields plus optional `ocr` for accurate OCR-step preview. */
|
|
116
|
+
type BrowserExplainAnalysisPlanOptions = ExplainAnalysisPlanOptions & Pick<BrowserAnalyzeOptions, "ocr">;
|
|
117
|
+
|
|
118
|
+
/**
|
|
119
|
+
* Epic 1 — **Capabilities:** detects kind from the same hints as `analyzeFile`, then lists which of
|
|
120
|
+
* `text` | `metadata` | `html` | `ocr` | `pages` apply in the browser (PDF always unsupported).
|
|
121
|
+
* No Mammoth/Tesseract/PDF parsing.
|
|
122
|
+
*/
|
|
123
|
+
declare function getCapabilities(input: BrowserAnalyzeInput, options?: GetCapabilitiesOptions): Promise<GetCapabilitiesReport>;
|
|
124
|
+
/**
|
|
125
|
+
* Epic 1 — **Plan preview:** structured explanation (analyzer, native extraction vs OCR, `limitations`, `plan.steps`)
|
|
126
|
+
* for a {@link DocMindPublicIntent}. Optional `ocr` in options refines image steps. No heavy I/O.
|
|
127
|
+
*/
|
|
128
|
+
declare function explainAnalysisPlan(input: BrowserAnalyzeInput, options?: BrowserExplainAnalysisPlanOptions): Promise<ExplainAnalysisPlanReport>;
|
|
129
|
+
|
|
130
|
+
export { BROWSER_PDF_UNSUPPORTED_WARNING, type BrowserAnalyzeInput, type BrowserAnalyzeOptions, type BrowserExplainAnalysisPlanOptions, type BrowserOcrMode, type BrowserOcrOptions, type ExplainAnalysisPlanReport, type GetCapabilitiesReport, type NativeExtractionPlan, type OcrPlan, type PublicCapabilityId, type PublicCapabilitySupport, analyzeFile, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
|
package/dist/index.js
CHANGED
|
@@ -1,9 +1,27 @@
|
|
|
1
1
|
import { assertValidAnalyzeFileInput, detectFileKind, notImplementedResult, UNKNOWN_FORMAT_WARNING, analyzeText, toUint8Array, InvalidInputError } from '@dragon708/docmind-shared';
|
|
2
2
|
export { detectFileKind } from '@dragon708/docmind-shared';
|
|
3
|
-
import {
|
|
3
|
+
import { analyzeDocx } from '@dragon708/docmind-docx';
|
|
4
4
|
import { ocr } from '@dragon708/docmind-ocr';
|
|
5
5
|
|
|
6
6
|
// src/analyzeFile.ts
|
|
7
|
+
/**
 * Validates that `input` is one of the binary inputs the browser entry accepts.
 *
 * The `File` and `Blob` constructors are looked up defensively: referencing a
 * missing global with `instanceof` would raise a `ReferenceError` instead of
 * the intended `InvalidInputError`.
 *
 * @param input Candidate value handed to a public browser API.
 * @throws {InvalidInputError} When `input` is not a File, Blob, or ArrayBuffer.
 */
function assertBrowserInput(input) {
  const isFile = typeof File !== "undefined" && input instanceof File;
  const isBlob = typeof Blob !== "undefined" && input instanceof Blob;
  if (isFile || isBlob || input instanceof ArrayBuffer) {
    return;
  }
  throw new InvalidInputError("Expected a File, Blob, or ArrayBuffer.");
}
|
|
13
|
+
function throwIfAborted(signal) {
|
|
14
|
+
if (signal?.aborted) {
|
|
15
|
+
const err = new Error("The operation was aborted");
|
|
16
|
+
err.name = "AbortError";
|
|
17
|
+
throw err;
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
/**
 * Runs both input validators and hands the same value back to the caller.
 *
 * The browser-specific type check runs first, followed by the shared
 * `assertValidAnalyzeFileInput` check; either may throw.
 *
 * @param input Value passed to a public browser API.
 * @returns The unchanged `input` once both checks have passed.
 */
function prepareBrowserAnalyzeInput(input) {
  assertBrowserInput(input);
  assertValidAnalyzeFileInput(input);
  return input;
}
|
|
7
25
|
async function analyzeDocxForBrowser(input, signal) {
|
|
8
26
|
if (signal?.aborted) {
|
|
9
27
|
const err = new Error("The operation was aborted");
|
|
@@ -33,6 +51,10 @@ async function analyzeDocxForBrowser(input, signal) {
|
|
|
33
51
|
warnings: [...r.warnings]
|
|
34
52
|
};
|
|
35
53
|
}
|
|
54
|
+
var OCR_OFF_WARNING = 'OCR mode is "off"; no recognition was run. Use mode "auto" or "force" to extract text from images.';
|
|
55
|
+
/**
 * Reads the OCR mode from analyze options.
 *
 * @param options Optional analyze options; only `options.ocr.mode` is read.
 * @returns The configured mode, or `"auto"` when it is `null`/`undefined`.
 */
function resolveOcrMode(options) {
  const configured = options?.ocr?.mode;
  return configured ?? "auto";
}
|
|
36
58
|
async function analyzeImageForBrowser(input, options) {
|
|
37
59
|
if (options?.signal?.aborted) {
|
|
38
60
|
const err = new Error("The operation was aborted");
|
|
@@ -52,6 +74,19 @@ async function analyzeImageForBrowser(input, options) {
|
|
|
52
74
|
warnings: ["No image bytes were provided for analysis."]
|
|
53
75
|
};
|
|
54
76
|
}
|
|
77
|
+
const mode = resolveOcrMode(options);
|
|
78
|
+
if (mode === "off") {
|
|
79
|
+
return {
|
|
80
|
+
fileKind: "image",
|
|
81
|
+
analyzer: "image",
|
|
82
|
+
status: "ok",
|
|
83
|
+
kind: "image",
|
|
84
|
+
text: "",
|
|
85
|
+
confidence: 0,
|
|
86
|
+
ocrUsed: false,
|
|
87
|
+
warnings: [OCR_OFF_WARNING]
|
|
88
|
+
};
|
|
89
|
+
}
|
|
55
90
|
const ocrOpts = {
|
|
56
91
|
...options?.ocr ?? {},
|
|
57
92
|
signal: options?.ocr?.signal ?? options?.signal
|
|
@@ -71,12 +106,6 @@ async function analyzeImageForBrowser(input, options) {
|
|
|
71
106
|
|
|
72
107
|
// src/analyzeFile.ts
|
|
73
108
|
var BROWSER_PDF_UNSUPPORTED_WARNING = "PDF text extraction is not available in the browser runtime; use @dragon708/docmind-node on the server.";
|
|
74
|
-
function assertBrowserInput(input) {
|
|
75
|
-
const ok = input instanceof File || input instanceof Blob || input instanceof ArrayBuffer;
|
|
76
|
-
if (!ok) {
|
|
77
|
-
throw new InvalidInputError("Expected a File, Blob, or ArrayBuffer.");
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
109
|
async function analyzeFile(input, options) {
|
|
81
110
|
if (options?.signal?.aborted) {
|
|
82
111
|
const err = new Error("The operation was aborted");
|
|
@@ -100,104 +129,29 @@ async function analyzeFile(input, options) {
|
|
|
100
129
|
return notImplementedResult(fileKind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
101
130
|
}
|
|
102
131
|
}
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
if (!ok) {
|
|
106
|
-
throw new InvalidInputError("Expected a File, Blob, or ArrayBuffer.");
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
function throwIfAborted(signal) {
|
|
110
|
-
if (signal?.aborted) {
|
|
111
|
-
const err = new Error("The operation was aborted");
|
|
112
|
-
err.name = "AbortError";
|
|
113
|
-
throw err;
|
|
114
|
-
}
|
|
115
|
-
}
|
|
132
|
+
var DOCX_METADATA_STUB = "Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.";
|
|
133
|
+
var IMAGE_METADATA_NOTE = "Raster images have no document metadata bundle in this API.";
|
|
116
134
|
/**
 * Escapes the four characters that are unsafe inside HTML text and
 * double-quoted attribute values: `&`, `<`, `>` and `"`.
 *
 * `&` is replaced first so the ampersands introduced by the later
 * replacements are not escaped a second time.
 *
 * @param s Raw text to embed in HTML.
 * @returns The text with `&amp;`, `&lt;`, `&gt;` and `&quot;` substituted.
 */
function escapeHtmlMinimal(s) {
  return s
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
return
|
|
137
|
+
/**
 * Narrows a full analysis result to the text-extraction view.
 *
 * Successful DOCX results are copied with `html` blanked; every other
 * result (non-"ok" status or another file kind) is returned as the same object.
 *
 * @param full Result object carrying `status` and `fileKind`.
 * @returns A shallow copy with `html: ""` for ok DOCX results, otherwise `full`.
 */
function toExtractTextResult(full) {
  const isOkDocx = full.status === "ok" && full.fileKind === "docx";
  return isOkDocx ? { ...full, html: "" } : full;
}
|
|
126
144
|
async function extractText(input, options) {
|
|
127
145
|
throwIfAborted(options?.signal);
|
|
128
|
-
|
|
129
|
-
const
|
|
130
|
-
|
|
131
|
-
const signal = options?.signal;
|
|
132
|
-
switch (kind) {
|
|
133
|
-
case "pdf":
|
|
134
|
-
return notImplementedResult("pdf", "pdf", [BROWSER_PDF_UNSUPPORTED_WARNING]);
|
|
135
|
-
case "docx": {
|
|
136
|
-
const data = await toUint8Array(bytesInput);
|
|
137
|
-
if (data.byteLength === 0) {
|
|
138
|
-
return {
|
|
139
|
-
fileKind: "docx",
|
|
140
|
-
analyzer: "docx",
|
|
141
|
-
status: "ok",
|
|
142
|
-
kind: "docx",
|
|
143
|
-
text: "",
|
|
144
|
-
html: "",
|
|
145
|
-
warnings: ["No document bytes were provided for analysis."]
|
|
146
|
-
};
|
|
147
|
-
}
|
|
148
|
-
const r = await extractTextFromDocx(data);
|
|
149
|
-
return {
|
|
150
|
-
fileKind: "docx",
|
|
151
|
-
analyzer: "docx",
|
|
152
|
-
status: "ok",
|
|
153
|
-
kind: "docx",
|
|
154
|
-
text: r.text,
|
|
155
|
-
html: "",
|
|
156
|
-
warnings: r.warnings
|
|
157
|
-
};
|
|
158
|
-
}
|
|
159
|
-
case "image": {
|
|
160
|
-
const data = await toUint8Array(bytesInput);
|
|
161
|
-
if (data.byteLength === 0) {
|
|
162
|
-
return {
|
|
163
|
-
fileKind: "image",
|
|
164
|
-
analyzer: "image",
|
|
165
|
-
status: "ok",
|
|
166
|
-
kind: "image",
|
|
167
|
-
text: "",
|
|
168
|
-
confidence: 0,
|
|
169
|
-
ocrUsed: true,
|
|
170
|
-
warnings: ["No image bytes were provided for analysis."]
|
|
171
|
-
};
|
|
172
|
-
}
|
|
173
|
-
const ocrOpts = {
|
|
174
|
-
...options?.ocr ?? {},
|
|
175
|
-
signal: options?.ocr?.signal ?? signal
|
|
176
|
-
};
|
|
177
|
-
const r = await ocr(data, ocrOpts);
|
|
178
|
-
return {
|
|
179
|
-
fileKind: "image",
|
|
180
|
-
analyzer: "image",
|
|
181
|
-
status: "ok",
|
|
182
|
-
kind: "image",
|
|
183
|
-
text: r.text,
|
|
184
|
-
confidence: r.confidence,
|
|
185
|
-
ocrUsed: r.ocrUsed,
|
|
186
|
-
warnings: []
|
|
187
|
-
};
|
|
188
|
-
}
|
|
189
|
-
case "text":
|
|
190
|
-
return analyzeText(bytesInput, { signal });
|
|
191
|
-
default:
|
|
192
|
-
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
193
|
-
}
|
|
146
|
+
prepareBrowserAnalyzeInput(input);
|
|
147
|
+
const full = await analyzeFile(input, options);
|
|
148
|
+
return toExtractTextResult(full);
|
|
194
149
|
}
|
|
195
150
|
async function extractMetadata(input, options) {
|
|
196
151
|
throwIfAborted(options?.signal);
|
|
197
|
-
const resolved =
|
|
152
|
+
const resolved = prepareBrowserAnalyzeInput(input);
|
|
198
153
|
const kind = detectFileKind(resolved);
|
|
199
|
-
|
|
200
|
-
const signal = options?.signal;
|
|
154
|
+
options?.signal;
|
|
201
155
|
switch (kind) {
|
|
202
156
|
case "pdf":
|
|
203
157
|
return notImplementedResult("pdf", "pdf", [BROWSER_PDF_UNSUPPORTED_WARNING]);
|
|
@@ -223,20 +177,59 @@ async function extractMetadata(input, options) {
|
|
|
223
177
|
warnings: [IMAGE_METADATA_NOTE]
|
|
224
178
|
};
|
|
225
179
|
case "text":
|
|
226
|
-
return
|
|
180
|
+
return analyzeFile(input, options);
|
|
227
181
|
default:
|
|
228
182
|
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
229
183
|
}
|
|
230
184
|
}
|
|
231
185
|
/**
 * HTML-oriented intent.
 *
 * PDF, image and unknown inputs are answered with stubs before any analyzer
 * is invoked, so this path never runs OCR. Remaining kinds are delegated to
 * `analyzeFile`; plain-text results additionally get an escaped `<pre>` wrapper
 * as their `html`.
 *
 * @param input File, Blob, or ArrayBuffer to convert.
 * @param options Optional analyze options; `options.signal` is checked up front.
 * @returns The analysis result, possibly augmented with `html` for text inputs.
 */
async function convertToHtml(input, options) {
  throwIfAborted(options?.signal);
  const resolved = prepareBrowserAnalyzeInput(input);
  const kind = detectFileKind(resolved);

  switch (kind) {
    case "pdf":
      return notImplementedResult("pdf", "pdf", [BROWSER_PDF_UNSUPPORTED_WARNING]);
    case "image":
      return {
        fileKind: "image",
        analyzer: "image",
        status: "ok",
        kind: "image",
        text: "",
        confidence: 0,
        // No recognition runs on this path, matching the OCR-off stub.
        ocrUsed: false,
        warnings: ["No HTML representation for raster images; use extractText / runOcr."]
      };
    case "unknown":
      return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
  }

  const r = await analyzeFile(input, options);
  if (r.status !== "ok" || r.fileKind !== "text") {
    // Failures and DOCX results (which already carry HTML) pass through untouched.
    return r;
  }
  return {
    ...r,
    html: `<pre>${escapeHtmlMinimal(r.text)}</pre>`,
    warnings: [
      ...r.warnings,
      "HTML for plain text is a <pre> wrapper around decoded UTF-8 content."
    ]
  };
}
|
|
222
|
+
async function runOcr(input, options) {
|
|
223
|
+
throwIfAborted(options?.signal);
|
|
224
|
+
const resolved = prepareBrowserAnalyzeInput(input);
|
|
234
225
|
const kind = detectFileKind(resolved);
|
|
235
226
|
const bytesInput = input;
|
|
236
227
|
const signal = options?.signal;
|
|
237
228
|
switch (kind) {
|
|
238
229
|
case "pdf":
|
|
239
230
|
return notImplementedResult("pdf", "pdf", [BROWSER_PDF_UNSUPPORTED_WARNING]);
|
|
231
|
+
case "image":
|
|
232
|
+
return analyzeImageForBrowser(bytesInput, options);
|
|
240
233
|
case "docx": {
|
|
241
234
|
const data = await toUint8Array(bytesInput);
|
|
242
235
|
if (data.byteLength === 0) {
|
|
@@ -250,120 +243,409 @@ async function convertToHtml(input, options) {
|
|
|
250
243
|
warnings: ["No document bytes were provided for analysis."]
|
|
251
244
|
};
|
|
252
245
|
}
|
|
253
|
-
const
|
|
254
|
-
extractTextFromDocx(data),
|
|
255
|
-
convertDocxToHtml(data)
|
|
256
|
-
]);
|
|
246
|
+
const r = await analyzeDocx(data);
|
|
257
247
|
return {
|
|
258
248
|
fileKind: "docx",
|
|
259
249
|
analyzer: "docx",
|
|
260
250
|
status: "ok",
|
|
261
251
|
kind: "docx",
|
|
262
|
-
text:
|
|
263
|
-
html:
|
|
264
|
-
warnings: [
|
|
252
|
+
text: r.text,
|
|
253
|
+
html: r.html,
|
|
254
|
+
warnings: [
|
|
255
|
+
...r.warnings,
|
|
256
|
+
"OCR does not apply to DOCX; returned structured text/HTML extract."
|
|
257
|
+
]
|
|
265
258
|
};
|
|
266
259
|
}
|
|
267
|
-
case "text":
|
|
268
|
-
|
|
269
|
-
|
|
260
|
+
case "text":
|
|
261
|
+
return analyzeText(bytesInput, { signal });
|
|
262
|
+
default:
|
|
263
|
+
return notImplementedResult(kind, "none", [UNKNOWN_FORMAT_WARNING]);
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
// src/analysisPlanReport.ts
|
|
268
|
+
/**
 * Collects limitation notes, dropping every falsy entry (empty strings,
 * `undefined`, `null`, ...).
 *
 * @param items Candidate notes.
 * @returns A new array containing only the truthy items, in order.
 */
function lim(...items) {
  return items.filter((item) => Boolean(item));
}
|
|
271
|
+
/**
 * Builds the structured "what would happen" report for a public intent in the
 * browser runtime. Pure metadata: nothing is read or parsed here.
 *
 * PDF and unknown kinds short-circuit with fixed explanations; for the other
 * kinds the native-extraction / OCR descriptions depend on `intent`.
 * Any kind other than docx/image falls into the plain-text branches.
 *
 * @param kind Detected file kind.
 * @param intent Public intent name; unrecognised intents get a generic entry.
 * @param ocrMode Resolved OCR mode; `"off"` disables the image OCR step.
 * @param plan Pre-built step descriptor, embedded as-is.
 * @returns The report object; only the PDF report carries `warnings`.
 */
function buildBrowserExplainReport(kind, intent, ocrMode, plan) {
  const runtime = { id: "browser" };
  // Shared envelope so every branch emits the same keys in the same order.
  const report = (primaryAnalyzer, nativeExtraction, ocr2, limitations) => ({
    kind,
    detectedKind: kind,
    runtime,
    intent,
    primaryAnalyzer,
    nativeExtraction,
    ocr: ocr2,
    limitations,
    plan
  });

  if (kind === "pdf") {
    return {
      ...report(
        "pdf",
        {
          willAttempt: false,
          description: "PDF is not processed in the browser runtime; use @dragon708/docmind-node."
        },
        {
          mayUse: false,
          description: "PDF OCR is not available in the browser."
        },
        lim(BROWSER_PDF_UNSUPPORTED_WARNING)
      ),
      warnings: [BROWSER_PDF_UNSUPPORTED_WARNING]
    };
  }

  if (kind === "unknown") {
    return report(
      "none",
      { willAttempt: false, description: "No analyzer selected without a known file kind." },
      { mayUse: false, description: "OCR is not used for unknown kinds." },
      lim(
        "Could not classify the file from name, MIME, or bytes; analysis will return not_implemented until hints improve."
      )
    );
  }

  const imageOcrActive = ocrMode !== "off";
  const ocrOffNote = ocrMode === "off" ? 'Image OCR is skipped when ocr.mode is "off".' : "";
  const primaryAnalyzer = kind === "docx" || kind === "image" || kind === "text" ? kind : "none";

  let nativeExtraction;
  let ocr2;
  let limitations = [];

  switch (intent) {
    case "analyzeFile":
    case "extractText":
      if (kind === "docx") {
        nativeExtraction = {
          willAttempt: true,
          description: "Mammoth reads OOXML for text" + (intent === "extractText" ? " (HTML omitted in extractText)." : " and HTML.")
        };
        ocr2 = { mayUse: false, description: "DOCX does not use OCR in DocMind." };
      } else if (kind === "image") {
        nativeExtraction = {
          willAttempt: false,
          description: "Raster images have no native text layer in this pipeline."
        };
        ocr2 = {
          mayUse: imageOcrActive,
          description: imageOcrActive ? "Tesseract.js may run to recover text (subject to format support)." : "OCR skipped while ocr.mode is off."
        };
        limitations = lim(ocrOffNote);
      } else {
        nativeExtraction = {
          willAttempt: true,
          description: "Plain text is decoded as UTF-8 (BOM stripped, replacement on invalid bytes)."
        };
        ocr2 = { mayUse: false, description: "OCR does not apply to text files." };
      }
      break;

    case "extractMetadata":
      if (kind === "docx" || kind === "image") {
        nativeExtraction = {
          willAttempt: false,
          description: "No heavy extractor; extractMetadata returns a stub with guidance."
        };
        ocr2 = { mayUse: false, description: "OCR is not invoked for this metadata path." };
        limitations = lim(
          kind === "docx" ? "Structured DOCX metadata is not exposed separately in the browser." : "Raster images have no document metadata bundle in this API."
        );
      } else {
        nativeExtraction = {
          willAttempt: true,
          description: "Plain text is decoded; metadata is limited to decoded content."
        };
        ocr2 = { mayUse: false, description: "OCR does not apply." };
        limitations = lim("Plain text has no structured document metadata.");
      }
      break;

    case "convertToHtml":
      if (kind === "docx") {
        nativeExtraction = {
          willAttempt: true,
          description: "Mammoth produces HTML; images are not passed through analyzeFile for this intent."
        };
        ocr2 = { mayUse: false, description: "DOCX path does not use OCR." };
      } else if (kind === "text") {
        nativeExtraction = {
          willAttempt: true,
          description: "UTF-8 decode then wrap in a <pre> element."
        };
        ocr2 = { mayUse: false, description: "OCR does not apply." };
      } else {
        nativeExtraction = {
          willAttempt: false,
          description: "No rich HTML path for this kind in the browser."
        };
        ocr2 = { mayUse: false, description: "OCR does not produce layout HTML here." };
        limitations = lim(
          kind === "image" ? "Raster images have no HTML representation; use extractText or runOcr." : ""
        );
      }
      break;

    case "runOcr":
      if (kind === "image") {
        nativeExtraction = {
          willAttempt: false,
          description: "No native text layer; recognition is OCR-only."
        };
        ocr2 = {
          mayUse: imageOcrActive,
          description: imageOcrActive ? "Tesseract.js runs for raster text." : "OCR skipped while ocr.mode is off."
        };
        limitations = lim(ocrOffNote);
      } else if (kind === "docx") {
        nativeExtraction = {
          willAttempt: true,
          description: "Mammoth returns structured text/HTML (OCR does not apply to DOCX)."
        };
        ocr2 = { mayUse: false, description: "DOCX is not OCR'd." };
        limitations = lim("Returned content is structured extract, not OCR output.");
      } else {
        nativeExtraction = {
          willAttempt: true,
          description: "Plain text is UTF-8 decoded only."
        };
        ocr2 = { mayUse: false, description: "OCR does not apply to text files." };
      }
      break;

    default:
      nativeExtraction = { willAttempt: false, description: "Intent not specialized in this runtime." };
      ocr2 = { mayUse: false, description: "See plan steps." };
  }

  return report(primaryAnalyzer, nativeExtraction, ocr2, limitations);
}
|
|
432
|
+
|
|
433
|
+
// src/capabilityReport.ts
|
|
434
|
+
var DOCX_META = "Structured document metadata is not exposed separately in the browser runtime; extractMetadata returns a stub for DOCX.";
|
|
435
|
+
var IMAGE_META = "Raster images have no document metadata bundle; extractMetadata returns a stub.";
|
|
436
|
+
var IMAGE_HTML = "No layout HTML for raster images; use extractText or runOcr for text.";
|
|
437
|
+
var TEXT_META_NOTE = "Plain text has no structured document metadata; extractMetadata still returns decoded content.";
|
|
438
|
+
var OCR_OFF_NOTE = 'Image OCR may be skipped when `ocr.mode` is "off" in analyze options.';
|
|
439
|
+
var UNKNOWN_KIND = "Could not determine file kind from name, MIME, or bytes; all features are reported as unsupported until the kind is known.";
|
|
440
|
+
/**
 * Creates one capability entry.
 *
 * The `warnings` key is only present when a non-empty list was supplied, so
 * entries without caveats stay minimal.
 *
 * @param id Capability identifier.
 * @param supported Whether the capability applies.
 * @param warnings Optional caveats for this capability.
 * @returns `{ id, supported }`, plus `warnings` when it has at least one item.
 */
function slot(id, supported, warnings) {
  if (warnings?.length) {
    return { id, supported, warnings };
  }
  return { id, supported };
}
|
|
443
|
+
/**
 * Lists which public capabilities (`text`, `metadata`, `html`, `ocr`, `pages`)
 * apply to a detected file kind in the browser runtime.
 *
 * Every kind yields the same five entries in the same order. Kinds other than
 * pdf/docx/image/text are reported as fully unsupported and add a top-level
 * warning.
 *
 * @param kind Detected file kind.
 * @returns `{ kind, runtime, capabilities, warnings }`; `warnings` is
 *   `undefined` unless the kind was not recognised.
 */
function buildBrowserCapabilityReport(kind) {
  const runtime = { id: "browser" };
  const pdf = BROWSER_PDF_UNSUPPORTED_WARNING;
  const topWarnings = [];
  let capabilities;

  switch (kind) {
    case "pdf":
      // Each PDF entry carries the same runtime-level explanation.
      capabilities = [
        slot("text", false, [pdf]),
        slot("metadata", false, [pdf]),
        slot("html", false, [pdf]),
        slot("ocr", false, [pdf]),
        slot("pages", false, [pdf])
      ];
      break;
    case "docx":
      capabilities = [
        slot("text", true),
        slot("metadata", false, [DOCX_META]),
        slot("html", true),
        slot("ocr", false, ["OCR does not apply to DOCX in DocMind."]),
        slot("pages", false)
      ];
      break;
    case "image":
      capabilities = [
        slot("text", true, ["Text is obtained via OCR when enabled."]),
        slot("metadata", false, [IMAGE_META]),
        slot("html", false, [IMAGE_HTML]),
        slot("ocr", true, [OCR_OFF_NOTE]),
        slot("pages", false)
      ];
      break;
    case "text":
      capabilities = [
        slot("text", true),
        slot("metadata", true, [TEXT_META_NOTE]),
        slot("html", true),
        slot("ocr", false, ["OCR does not apply to plain text files."]),
        slot("pages", false)
      ];
      break;
    default:
      topWarnings.push(UNKNOWN_KIND);
      capabilities = [
        slot("text", false),
        slot("metadata", false),
        slot("html", false),
        slot("ocr", false),
        slot("pages", false)
      ];
  }

  return {
    kind,
    runtime,
    capabilities,
    warnings: topWarnings.length > 0 ? topWarnings : undefined
  };
}
|
|
502
|
+
|
|
503
|
+
// src/introspection.ts
|
|
504
|
+
/**
 * Reads the OCR mode from an `ocr` options object (introspection helpers
 * receive the nested object directly rather than full analyze options).
 *
 * @param ocr2 Optional OCR options; only `mode` is read.
 * @returns The configured mode, or `"auto"` when it is `null`/`undefined`.
 */
function resolveOcrMode2(ocr2) {
  const configured = ocr2?.mode;
  return configured ?? "auto";
}
|
|
507
|
+
/**
 * Describes the two-step `analyzeFile` plan for a detected kind.
 *
 * The first step is always the completed `detect_kind`; the second depends on
 * the kind. Unrecognised kinds get a failed `route` step.
 *
 * @param kind Detected file kind.
 * @param ocrMode Resolved OCR mode; `"off"` marks the image OCR step as skipped.
 * @returns A fresh `{ intent: "analyzeFile", steps }` descriptor.
 */
function planForAnalyzeFile(kind, ocrMode) {
  let followUp;
  switch (kind) {
    case "pdf":
      followUp = { id: "pdf_pipeline", status: "skipped" };
      break;
    case "docx":
      followUp = { id: "docx_mammoth", status: "planned" };
      break;
    case "image":
      followUp = {
        id: "image_ocr",
        status: ocrMode === "off" ? "skipped" : "planned"
      };
      break;
    case "text":
      followUp = { id: "utf8_decode", status: "planned" };
      break;
    default:
      followUp = { id: "route", status: "failed" };
  }
  return {
    intent: "analyzeFile",
    steps: [{ id: "detect_kind", status: "done" }, followUp]
  };
}
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
status: "ok",
|
|
310
|
-
kind: "image",
|
|
311
|
-
text: "",
|
|
312
|
-
confidence: 0,
|
|
313
|
-
ocrUsed: true,
|
|
314
|
-
warnings: ["No image bytes were provided for analysis."]
|
|
315
|
-
};
|
|
316
|
-
}
|
|
317
|
-
const ocrOpts = {
|
|
318
|
-
...options?.ocr ?? {},
|
|
319
|
-
signal: options?.ocr?.signal ?? signal
|
|
551
|
+
function planForIntent(intentOpt, kind, ocrMode) {
|
|
552
|
+
const intent = intentOpt ?? "analyzeFile";
|
|
553
|
+
if (intent === "analyzeFile") return planForAnalyzeFile(kind, ocrMode);
|
|
554
|
+
if (intent === "extractText") {
|
|
555
|
+
const base = planForAnalyzeFile(kind, ocrMode);
|
|
556
|
+
return { ...base, intent: "extractText" };
|
|
557
|
+
}
|
|
558
|
+
if (intent === "extractMetadata") {
|
|
559
|
+
if (kind === "text") {
|
|
560
|
+
return {
|
|
561
|
+
intent: "extractMetadata",
|
|
562
|
+
steps: [
|
|
563
|
+
{ id: "detect_kind", status: "done" },
|
|
564
|
+
{ id: "utf8_decode", status: "planned" }
|
|
565
|
+
]
|
|
320
566
|
};
|
|
321
|
-
|
|
567
|
+
}
|
|
568
|
+
return {
|
|
569
|
+
intent: "extractMetadata",
|
|
570
|
+
steps: [
|
|
571
|
+
{ id: "detect_kind", status: "done" },
|
|
572
|
+
{ id: "metadata_stub", status: kind === "docx" || kind === "image" ? "planned" : "skipped" }
|
|
573
|
+
]
|
|
574
|
+
};
|
|
575
|
+
}
|
|
576
|
+
if (intent === "convertToHtml") {
|
|
577
|
+
if (kind === "docx") {
|
|
322
578
|
return {
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
confidence: r.confidence,
|
|
329
|
-
ocrUsed: r.ocrUsed,
|
|
330
|
-
warnings: []
|
|
579
|
+
intent: "convertToHtml",
|
|
580
|
+
steps: [
|
|
581
|
+
{ id: "detect_kind", status: "done" },
|
|
582
|
+
{ id: "docx_mammoth_html", status: "planned" }
|
|
583
|
+
]
|
|
331
584
|
};
|
|
332
585
|
}
|
|
333
|
-
|
|
334
|
-
const data = await toUint8Array(bytesInput);
|
|
335
|
-
if (data.byteLength === 0) {
|
|
336
|
-
return {
|
|
337
|
-
fileKind: "docx",
|
|
338
|
-
analyzer: "docx",
|
|
339
|
-
status: "ok",
|
|
340
|
-
kind: "docx",
|
|
341
|
-
text: "",
|
|
342
|
-
html: "",
|
|
343
|
-
warnings: ["No document bytes were provided for analysis."]
|
|
344
|
-
};
|
|
345
|
-
}
|
|
346
|
-
const r = await analyzeDocx(data);
|
|
586
|
+
if (kind === "text") {
|
|
347
587
|
return {
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
html: r.html,
|
|
354
|
-
warnings: [
|
|
355
|
-
...r.warnings,
|
|
356
|
-
"OCR does not apply to DOCX; returned structured text/HTML extract."
|
|
588
|
+
intent: "convertToHtml",
|
|
589
|
+
steps: [
|
|
590
|
+
{ id: "detect_kind", status: "done" },
|
|
591
|
+
{ id: "utf8_decode", status: "planned" },
|
|
592
|
+
{ id: "wrap_pre", status: "planned" }
|
|
357
593
|
]
|
|
358
594
|
};
|
|
359
595
|
}
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
596
|
+
return {
|
|
597
|
+
intent: "convertToHtml",
|
|
598
|
+
steps: [
|
|
599
|
+
{ id: "detect_kind", status: "done" },
|
|
600
|
+
{ id: "rich_html", status: "skipped" }
|
|
601
|
+
]
|
|
602
|
+
};
|
|
364
603
|
}
|
|
604
|
+
if (intent === "runOcr") {
|
|
605
|
+
if (kind === "image") {
|
|
606
|
+
return {
|
|
607
|
+
intent: "runOcr",
|
|
608
|
+
steps: [
|
|
609
|
+
{ id: "detect_kind", status: "done" },
|
|
610
|
+
{ id: "tesseract_ocr", status: ocrMode === "off" ? "skipped" : "planned" }
|
|
611
|
+
]
|
|
612
|
+
};
|
|
613
|
+
}
|
|
614
|
+
if (kind === "docx") {
|
|
615
|
+
return {
|
|
616
|
+
intent: "runOcr",
|
|
617
|
+
steps: [
|
|
618
|
+
{ id: "detect_kind", status: "done" },
|
|
619
|
+
{ id: "docx_structured_extract", status: "planned" }
|
|
620
|
+
]
|
|
621
|
+
};
|
|
622
|
+
}
|
|
623
|
+
return {
|
|
624
|
+
intent: "runOcr",
|
|
625
|
+
steps: [
|
|
626
|
+
{ id: "detect_kind", status: "done" },
|
|
627
|
+
{ id: "ocr", status: "skipped" }
|
|
628
|
+
]
|
|
629
|
+
};
|
|
630
|
+
}
|
|
631
|
+
return planForAnalyzeFile(kind, ocrMode);
|
|
632
|
+
}
|
|
633
|
+
async function getCapabilities(input, options) {
|
|
634
|
+
throwIfAborted(options?.signal);
|
|
635
|
+
prepareBrowserAnalyzeInput(input);
|
|
636
|
+
const kind = detectFileKind(input);
|
|
637
|
+
return buildBrowserCapabilityReport(kind);
|
|
638
|
+
}
|
|
639
|
+
async function explainAnalysisPlan(input, options) {
|
|
640
|
+
throwIfAborted(options?.signal);
|
|
641
|
+
prepareBrowserAnalyzeInput(input);
|
|
642
|
+
const kind = detectFileKind(input);
|
|
643
|
+
const intent = options?.intent ?? "analyzeFile";
|
|
644
|
+
const ocrMode = resolveOcrMode2(options?.ocr);
|
|
645
|
+
const plan = planForIntent(intent, kind, ocrMode);
|
|
646
|
+
return buildBrowserExplainReport(kind, intent, ocrMode, plan);
|
|
365
647
|
}
|
|
366
648
|
|
|
367
|
-
export { BROWSER_PDF_UNSUPPORTED_WARNING, analyzeFile, convertToHtml, extractMetadata, extractText, runOcr };
|
|
649
|
+
export { BROWSER_PDF_UNSUPPORTED_WARNING, analyzeFile, convertToHtml, explainAnalysisPlan, extractMetadata, extractText, getCapabilities, runOcr };
|
|
368
650
|
//# sourceMappingURL=index.js.map
|
|
369
651
|
//# sourceMappingURL=index.js.map
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dragon708/docmind-browser",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "1.2.0",
|
|
4
|
+
"description": "Official DocMind browser facade: analyzeFile and intent APIs (DOCX, image OCR, text). PDF and fs paths use @dragon708/docmind-node.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"sideEffects": false,
|
|
7
7
|
"main": "./dist/index.js",
|
|
@@ -15,7 +15,8 @@
|
|
|
15
15
|
}
|
|
16
16
|
},
|
|
17
17
|
"files": [
|
|
18
|
-
"dist"
|
|
18
|
+
"dist/**/*.js",
|
|
19
|
+
"dist/**/*.d.ts"
|
|
19
20
|
],
|
|
20
21
|
"publishConfig": {
|
|
21
22
|
"access": "public"
|
|
@@ -34,7 +35,7 @@
|
|
|
34
35
|
"dependencies": {
|
|
35
36
|
"@dragon708/docmind-docx": "^1.0.0",
|
|
36
37
|
"@dragon708/docmind-ocr": "^1.0.0",
|
|
37
|
-
"@dragon708/docmind-shared": "^1.
|
|
38
|
+
"@dragon708/docmind-shared": "^1.1.0"
|
|
38
39
|
},
|
|
39
40
|
"devDependencies": {
|
|
40
41
|
"@types/node": "^20.19.37",
|
package/dist/index.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/analyzers/docx.ts","../src/analyzers/image.ts","../src/analyzeFile.ts","../src/publicActions.ts"],"names":["extractDocx","toUint8Array","assertBrowserInput","InvalidInputError","assertValidAnalyzeFileInput","detectFileKind","notImplementedResult","ocr","analyzeText","UNKNOWN_FORMAT_WARNING"],"mappings":";;;;;;AAOA,eAAsB,qBAAA,CACpB,OACA,MAAA,EACyB;AACzB,EAAA,IAAI,QAAQ,OAAA,EAAS;AACnB,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAM,YAAA,CAAa,KAAK,CAAA;AACrC,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,MAAA;AAAA,MACV,QAAA,EAAU,MAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,MAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,KAC5D;AAAA,EACF;AAEA,EAAA,MAAM,CAAA,GAAI,MAAMA,WAAA,CAAY,IAAI,CAAA;AAChC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,MAAA;AAAA,IACV,QAAA,EAAU,MAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,MAAA;AAAA,IACN,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,MAAM,CAAA,CAAE,IAAA;AAAA,IACR,QAAA,EAAU,CAAC,GAAG,CAAA,CAAE,QAAQ;AAAA,GAC1B;AACF;AChCA,eAAsB,sBAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,MAAM,IAAA,GAAO,MAAMC,YAAAA,CAAa,KAAK,CAAA;AACrC,EAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,OAAA;AAAA,MACV,QAAA,EAAU,OAAA;AAAA,MACV,MAAA,EAAQ,IAAA;AAAA,MACR,IAAA,EAAM,OAAA;AAAA,MACN,IAAA,EAAM,EAAA;AAAA,MACN,UAAA,EAAY,CAAA;AAAA,MACZ,OAAA,EAAS,IAAA;AAAA,MACT,QAAA,EAAU,CAAC,4CAA4C;AAAA,KACzD;AAAA,EACF;AAEA,EAAA,MAAM,OAAA,GAAU;AAAA,IACd,GAAI,OAAA,EAAS,GAAA,IAAO,EAAC;AAAA,IACrB,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU,OAAA,EAAS;AAAA,GAC3C;AAEA,EAAA,MAAM,CAAA,GAAI,MAAM,GAAA,CAAI,IAAA,EAAM,OAAO,CAAA;AACjC,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,OAAA;AAAA,IACV,QAAA,EAAU,OAAA;AAAA,IACV,MAAA,EAAQ,IAAA;AAAA,IACR,IAAA,EAAM,OAAA;AAAA,IACN,MAAM,CAAA,
CAAE,IAAA;AAAA,IACR,YAAY,CAAA,CAAE,UAAA;AAAA,IACd,SAAS,CAAA,CAAE,OAAA;AAAA,IACX,UAAU;AAAC,GACb;AACF;;;ACjCO,IAAM,+BAAA,GACX;AAOF,SAAS,mBAAmB,KAAA,EAAsD;AAChF,EAAA,MAAM,EAAA,GACJ,KAAA,YAAiB,IAAA,IACjB,KAAA,YAAiB,QACjB,KAAA,YAAiB,WAAA;AACnB,EAAA,IAAI,CAAC,EAAA,EAAI;AACP,IAAA,MAAM,IAAI,kBAAkB,wCAAwC,CAAA;AAAA,EACtE;AACF;AAKA,eAAsB,WAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,IAAI,OAAA,EAAS,QAAQ,OAAA,EAAS;AAC5B,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AAEA,EAAA,kBAAA,CAAmB,KAAK,CAAA;AACxB,EAAA,2BAAA,CAA4B,KAAK,CAAA;AAEjC,EAAA,MAAM,QAAA,GAAW,eAAe,KAA4B,CAAA;AAE5D,EAAA,MAAM,UAAA,GAAa,KAAA;AAEnB,EAAA,QAAQ,QAAA;AAAU,IAChB,KAAK,KAAA;AACH,MAAA,OAAO,oBAAA,CAAqB,KAAA,EAAO,KAAA,EAAO,CAAC,+BAA+B,CAAC,CAAA;AAAA,IAC7E,KAAK,MAAA;AACH,MAAA,OAAO,qBAAA,CAAsB,UAAA,EAAY,OAAA,EAAS,MAAM,CAAA;AAAA,IAC1D,KAAK,OAAA;AACH,MAAA,OAAO,sBAAA,CAAuB,YAAY,OAAO,CAAA;AAAA,IACnD,KAAK,MAAA;AACH,MAAA,OAAO,YAAY,UAAA,EAAY,EAAE,MAAA,EAAQ,OAAA,EAAS,QAAQ,CAAA;AAAA,IAC5D;AACE,MAAA,OAAO,oBAAA,CAAqB,QAAA,EAAU,MAAA,EAAQ,CAAC,sBAAsB,CAAC,CAAA;AAAA;AAE5E;AC7CA,SAASC,oBAAmB,KAAA,EAAsD;AAChF,EAAA,MAAM,EAAA,GACJ,KAAA,YAAiB,IAAA,IACjB,KAAA,YAAiB,QACjB,KAAA,YAAiB,WAAA;AACnB,EAAA,IAAI,CAAC,EAAA,EAAI;AACP,IAAA,MAAM,IAAIC,kBAAkB,wCAAwC,CAAA;AAAA,EACtE;AACF;AAEA,SAAS,eAAe,MAAA,EAA4B;AAClD,EAAA,IAAI,QAAQ,OAAA,EAAS;AACnB,IAAA,MAAM,GAAA,GAAM,IAAI,KAAA,CAAM,2BAA2B,CAAA;AACjD,IAAA,GAAA,CAAI,IAAA,GAAO,YAAA;AACX,IAAA,MAAM,GAAA;AAAA,EACR;AACF;AAEA,SAAS,kBAAkB,CAAA,EAAmB;AAC5C,EAAA,OAAO,CAAA,CACJ,OAAA,CAAQ,IAAA,EAAM,OAAO,EACrB,OAAA,CAAQ,IAAA,EAAM,MAAM,CAAA,CACpB,QAAQ,IAAA,EAAM,MAAM,CAAA,CACpB,OAAA,CAAQ,MAAM,QAAQ,CAAA;AAC3B;AAEA,IAAM,kBAAA,GACJ,yGAAA;AAEF,IAAM,mBAAA,GACJ,6DAAA;AAEF,eAAe,OAAO,KAAA,EAA0D;AAC9E,EAAAD,oBAAmB,KAAK,CAAA;AACxB,EAAAE,4BAA4B,KAAK,CAAA;AACjC,EAAA,OAAO,KAAA;AACT;AAMA,eAAsB,WAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,MAAA,CAAO,KAAK,CAAA;AACnC,EAAA,MAAM,IAAA,GAAOC,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,UAAA,GA
Aa,KAAA;AACnB,EAAA,MAAM,SAAS,OAAA,EAAS,MAAA;AAExB,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,KAAA;AACH,MAAA,OAAOC,oBAAAA,CAAqB,KAAA,EAAO,KAAA,EAAO,CAAC,+BAA+B,CAAC,CAAA;AAAA,IAC7E,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAA,GAAO,MAAML,YAAAA,CAAa,UAAU,CAAA;AAC1C,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,MAAA;AAAA,UACV,QAAA,EAAU,MAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,MAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,SAC5D;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,mBAAA,CAAoB,IAAI,CAAA;AACxC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,IAAA,EAAM,EAAA;AAAA,QACN,UAAU,CAAA,CAAE;AAAA,OACd;AAAA,IACF;AAAA,IACA,KAAK,OAAA,EAAS;AACZ,MAAA,MAAM,IAAA,GAAO,MAAMA,YAAAA,CAAa,UAAU,CAAA;AAC1C,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,OAAA;AAAA,UACV,QAAA,EAAU,OAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,OAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,UAAA,EAAY,CAAA;AAAA,UACZ,OAAA,EAAS,IAAA;AAAA,UACT,QAAA,EAAU,CAAC,4CAA4C;AAAA,SACzD;AAAA,MACF;AACA,MAAA,MAAM,OAAA,GAAU;AAAA,QACd,GAAI,OAAA,EAAS,GAAA,IAAO,EAAC;AAAA,QACrB,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU;AAAA,OAClC;AACA,MAAA,MAAM,CAAA,GAAI,MAAMM,GAAAA,CAAI,IAAA,EAAM,OAAO,CAAA;AACjC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,YAAY,CAAA,CAAE,UAAA;AAAA,QACd,SAAS,CAAA,CAAE,OAAA;AAAA,QACX,UAAU;AAAC,OACb;AAAA,IACF;AAAA,IACA,KAAK,MAAA;AACH,MAAA,OAAOC,WAAAA,CAAY,UAAA,EAAY,EAAE,MAAA,EAAQ,CAAA;AAAA,IAC3C;AACE,MAAA,OAAOF,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACG,sBAAsB,CAAC,CAAA;AAAA;AAExE;AAMA,eAAsB,eAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,MAAA,CAAO,KAAK,CAAA;AACnC,EAAA,MAAM,IAAA,GAAOJ,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,UAAA,GAAa,KAAA;AACnB,EAAA,MAAM,SAAS,OAAA,EAAS,MAAA;AAExB,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,KAAA;
AACH,MAAA,OAAOC,oBAAAA,CAAqB,KAAA,EAAO,KAAA,EAAO,CAAC,+BAA+B,CAAC,CAAA;AAAA,IAC7E,KAAK,MAAA;AACH,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,QAAA,EAAU,CAAC,kBAAkB;AAAA,OAC/B;AAAA,IACF,KAAK,OAAA;AACH,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,UAAA,EAAY,CAAA;AAAA,QACZ,OAAA,EAAS,IAAA;AAAA,QACT,QAAA,EAAU,CAAC,mBAAmB;AAAA,OAChC;AAAA,IACF,KAAK,MAAA;AACH,MAAA,OAAOE,WAAAA,CAAY,UAAA,EAAY,EAAE,MAAA,EAAQ,CAAA;AAAA,IAC3C;AACE,MAAA,OAAOF,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACG,sBAAsB,CAAC,CAAA;AAAA;AAExE;AAMA,eAAsB,aAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,MAAA,CAAO,KAAK,CAAA;AACnC,EAAA,MAAM,IAAA,GAAOJ,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,UAAA,GAAa,KAAA;AACnB,EAAA,MAAM,SAAS,OAAA,EAAS,MAAA;AAExB,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,KAAA;AACH,MAAA,OAAOC,oBAAAA,CAAqB,KAAA,EAAO,KAAA,EAAO,CAAC,+BAA+B,CAAC,CAAA;AAAA,IAC7E,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAA,GAAO,MAAML,YAAAA,CAAa,UAAU,CAAA;AAC1C,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,MAAA;AAAA,UACV,QAAA,EAAU,MAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,MAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,SAC5D;AAAA,MACF;AACA,MAAA,MAAM,CAAC,QAAA,EAAU,QAAQ,CAAA,GAAI,MAAM,QAAQ,GAAA,CAAI;AAAA,QAC7C,oBAAoB,IAAI,CAAA;AAAA,QACxB,kBAAkB,IAAI;AAAA,OACvB,CAAA;AACD,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,MAAM,QAAA,CAAS,IAAA;AAAA,QACf,MAAM,QAAA,CAAS,IAAA;AAAA,QACf,UAAU,CAAC,GAAG,SAAS,QAAA,EAAU,GAAG,SAAS,QAAQ;AAAA,OACvD;AAAA,IACF;AAAA,IACA,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAI,MAAMO,WAAAA,CAAY,UAAA,EAAY,EAAE,QAAQ,CAAA;AAClD,MAAA,MAAM,IAAA,GAAO,CAAA,KAAA,EAAQ,iBAAA,CAAkB,CAAA,CAAE,IAAI,CAAC,CAAA,MAAA,CAAA;AAC9C,MAAA,OAAO;AAAA,QACL,GAAG,CAA
A;AAAA,QACH,IAAA;AAAA,QACA,QAAA,EAAU;AAAA,UACR,GAAG,CAAA,CAAE,QAAA;AAAA,UACL;AAAA;AACF,OACF;AAAA,IACF;AAAA,IACA,KAAK,OAAA;AACH,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,IAAA,EAAM,EAAA;AAAA,QACN,UAAA,EAAY,CAAA;AAAA,QACZ,OAAA,EAAS,IAAA;AAAA,QACT,QAAA,EAAU,CAAC,qEAAqE;AAAA,OAClF;AAAA,IACF;AACE,MAAA,OAAOF,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACG,sBAAsB,CAAC,CAAA;AAAA;AAExE;AAMA,eAAsB,MAAA,CACpB,OACA,OAAA,EACyB;AACzB,EAAA,cAAA,CAAe,SAAS,MAAM,CAAA;AAC9B,EAAA,MAAM,QAAA,GAAW,MAAM,MAAA,CAAO,KAAK,CAAA;AACnC,EAAA,MAAM,IAAA,GAAOJ,eAAe,QAAQ,CAAA;AACpC,EAAA,MAAM,UAAA,GAAa,KAAA;AACnB,EAAA,MAAM,SAAS,OAAA,EAAS,MAAA;AAExB,EAAA,QAAQ,IAAA;AAAM,IACZ,KAAK,KAAA;AACH,MAAA,OAAOC,oBAAAA,CAAqB,KAAA,EAAO,KAAA,EAAO,CAAC,+BAA+B,CAAC,CAAA;AAAA,IAC7E,KAAK,OAAA,EAAS;AACZ,MAAA,MAAM,IAAA,GAAO,MAAML,YAAAA,CAAa,UAAU,CAAA;AAC1C,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,OAAA;AAAA,UACV,QAAA,EAAU,OAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,OAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,UAAA,EAAY,CAAA;AAAA,UACZ,OAAA,EAAS,IAAA;AAAA,UACT,QAAA,EAAU,CAAC,4CAA4C;AAAA,SACzD;AAAA,MACF;AACA,MAAA,MAAM,OAAA,GAAU;AAAA,QACd,GAAI,OAAA,EAAS,GAAA,IAAO,EAAC;AAAA,QACrB,MAAA,EAAQ,OAAA,EAAS,GAAA,EAAK,MAAA,IAAU;AAAA,OAClC;AACA,MAAA,MAAM,CAAA,GAAI,MAAMM,GAAAA,CAAI,IAAA,EAAM,OAAO,CAAA;AACjC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,OAAA;AAAA,QACV,QAAA,EAAU,OAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,OAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,YAAY,CAAA,CAAE,UAAA;AAAA,QACd,SAAS,CAAA,CAAE,OAAA;AAAA,QACX,UAAU;AAAC,OACb;AAAA,IACF;AAAA,IACA,KAAK,MAAA,EAAQ;AACX,MAAA,MAAM,IAAA,GAAO,MAAMN,YAAAA,CAAa,UAAU,CAAA;AAC1C,MAAA,IAAI,IAAA,CAAK,eAAe,CAAA,EAAG;AACzB,QAAA,OAAO;AAAA,UACL,QAAA,EAAU,MAAA;AAAA,UACV,QAAA,EAAU,MAAA;AAAA,UACV,MAAA,EAAQ,IAAA;AAAA,UACR,IAAA,EAAM,MAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,IAAA,EAAM,EAAA;AAAA,UACN,QAAA,EAAU,CAAC,+CAA+C;AAAA,SAC5D;AAAA,MACF;AACA,MAAA,MAAM,CAAA,GAAI,MAAM,WAAA,CAAY,IAAI,CAAA;AAChC,MAAA,OAAO;AAAA,QACL,
QAAA,EAAU,MAAA;AAAA,QACV,QAAA,EAAU,MAAA;AAAA,QACV,MAAA,EAAQ,IAAA;AAAA,QACR,IAAA,EAAM,MAAA;AAAA,QACN,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,MAAM,CAAA,CAAE,IAAA;AAAA,QACR,QAAA,EAAU;AAAA,UACR,GAAG,CAAA,CAAE,QAAA;AAAA,UACL;AAAA;AACF,OACF;AAAA,IACF;AAAA,IACA,KAAK,MAAA;AACH,MAAA,OAAOO,WAAAA,CAAY,UAAA,EAAY,EAAE,MAAA,EAAQ,CAAA;AAAA,IAC3C;AACE,MAAA,OAAOF,oBAAAA,CAAqB,IAAA,EAAM,MAAA,EAAQ,CAACG,sBAAsB,CAAC,CAAA;AAAA;AAExE","file":"index.js","sourcesContent":["import { analyzeDocx as extractDocx } from \"@dragon708/docmind-docx\";\nimport type { AnalysisResult, FileLikeInput } from \"@dragon708/docmind-shared\";\nimport { toUint8Array } from \"@dragon708/docmind-shared\";\n\n/**\n * DOCX → `@dragon708/docmind-docx` (browser-safe: Mammoth + JSZip).\n */\nexport async function analyzeDocxForBrowser(\n input: FileLikeInput,\n signal?: AbortSignal,\n): Promise<AnalysisResult> {\n if (signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await toUint8Array(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n\n const r = await extractDocx(data);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: r.text,\n html: r.html,\n warnings: [...r.warnings],\n };\n}\n","import { ocr } from \"@dragon708/docmind-ocr\";\nimport type { AnalysisResult, FileLikeInput } from \"@dragon708/docmind-shared\";\nimport { toUint8Array } from \"@dragon708/docmind-shared\";\nimport type { BrowserAnalyzeOptions } from \"../browserAnalyzeOptions.js\";\n\n/**\n * Image → `@dragon708/docmind-ocr` (Tesseract in WASM / browser).\n */\nexport async function analyzeImageForBrowser(\n input: FileLikeInput,\n options?: BrowserAnalyzeOptions,\n): Promise<AnalysisResult> {\n if (options?.signal?.aborted) {\n const err = 
new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n\n const data = await toUint8Array(input);\n if (data.byteLength === 0) {\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [\"No image bytes were provided for analysis.\"],\n };\n }\n\n const ocrOpts = {\n ...(options?.ocr ?? {}),\n signal: options?.ocr?.signal ?? options?.signal,\n };\n\n const r = await ocr(data, ocrOpts);\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: r.text,\n confidence: r.confidence,\n ocrUsed: r.ocrUsed,\n warnings: [],\n };\n}\n","import type { AnalysisResult, FileLikeInput } from \"@dragon708/docmind-shared\";\r\nimport {\r\n analyzeText,\r\n assertValidAnalyzeFileInput,\r\n detectFileKind,\r\n InvalidInputError,\r\n notImplementedResult,\r\n UNKNOWN_FORMAT_WARNING,\r\n} from \"@dragon708/docmind-shared\";\r\nimport type { DetectFileKindInput } from \"@dragon708/docmind-shared\";\r\nimport type { BrowserAnalyzeOptions } from \"./browserAnalyzeOptions.js\";\r\nimport { analyzeDocxForBrowser } from \"./analyzers/docx.js\";\r\nimport { analyzeImageForBrowser } from \"./analyzers/image.js\";\r\n\r\n/** PDF is not processed in the browser; use `@dragon708/docmind-node` on the server. 
*/\r\nexport const BROWSER_PDF_UNSUPPORTED_WARNING =\r\n \"PDF text extraction is not available in the browser runtime; use @dragon708/docmind-node on the server.\";\r\n\r\n/**\r\n * Inputs supported by the browser entry (DOM types only — no `fs`, no Node `Buffer` in the public surface).\r\n */\r\nexport type BrowserAnalyzeInput = File | Blob | ArrayBuffer;\r\n\r\nfunction assertBrowserInput(input: unknown): asserts input is BrowserAnalyzeInput {\r\n const ok =\r\n input instanceof File ||\r\n input instanceof Blob ||\r\n input instanceof ArrayBuffer;\r\n if (!ok) {\r\n throw new InvalidInputError(\"Expected a File, Blob, or ArrayBuffer.\");\r\n }\r\n}\r\n\r\n/**\r\n * Browser-only router: DOCX, images (OCR), and text. PDF yields `not_implemented` with a clear warning.\r\n */\r\nexport async function analyzeFile(\r\n input: BrowserAnalyzeInput,\r\n options?: BrowserAnalyzeOptions,\r\n): Promise<AnalysisResult> {\r\n if (options?.signal?.aborted) {\r\n const err = new Error(\"The operation was aborted\");\r\n err.name = \"AbortError\";\r\n throw err;\r\n }\r\n\r\n assertBrowserInput(input);\r\n assertValidAnalyzeFileInput(input);\r\n\r\n const fileKind = detectFileKind(input as DetectFileKindInput);\r\n\r\n const bytesInput = input as FileLikeInput;\r\n\r\n switch (fileKind) {\r\n case \"pdf\":\r\n return notImplementedResult(\"pdf\", \"pdf\", [BROWSER_PDF_UNSUPPORTED_WARNING]);\r\n case \"docx\":\r\n return analyzeDocxForBrowser(bytesInput, options?.signal);\r\n case \"image\":\r\n return analyzeImageForBrowser(bytesInput, options);\r\n case \"text\":\r\n return analyzeText(bytesInput, { signal: options?.signal });\r\n default:\r\n return notImplementedResult(fileKind, \"none\", [UNKNOWN_FORMAT_WARNING]);\r\n }\r\n}\r\n","import type { AnalysisResult, DetectFileKindInput, FileLikeInput } from \"@dragon708/docmind-shared\";\nimport {\n analyzeText,\n assertValidAnalyzeFileInput,\n detectFileKind,\n InvalidInputError,\n notImplementedResult,\n 
UNKNOWN_FORMAT_WARNING,\n toUint8Array,\n} from \"@dragon708/docmind-shared\";\nimport {\n analyzeDocx,\n convertDocxToHtml,\n extractTextFromDocx,\n} from \"@dragon708/docmind-docx\";\nimport { ocr } from \"@dragon708/docmind-ocr\";\nimport type { BrowserAnalyzeOptions } from \"./browserAnalyzeOptions.js\";\nimport { BROWSER_PDF_UNSUPPORTED_WARNING } from \"./analyzeFile.js\";\nimport type { BrowserAnalyzeInput } from \"./analyzeFile.js\";\n\nfunction assertBrowserInput(input: unknown): asserts input is BrowserAnalyzeInput {\n const ok =\n input instanceof File ||\n input instanceof Blob ||\n input instanceof ArrayBuffer;\n if (!ok) {\n throw new InvalidInputError(\"Expected a File, Blob, or ArrayBuffer.\");\n }\n}\n\nfunction throwIfAborted(signal?: AbortSignal): void {\n if (signal?.aborted) {\n const err = new Error(\"The operation was aborted\");\n err.name = \"AbortError\";\n throw err;\n }\n}\n\nfunction escapeHtmlMinimal(s: string): string {\n return s\n .replace(/&/g, \"&\")\n .replace(/</g, \"<\")\n .replace(/>/g, \">\")\n .replace(/\"/g, \""\");\n}\n\nconst DOCX_METADATA_STUB =\n \"Structured document metadata for DOCX is not exposed as a separate API; use extractText or analyzeFile.\";\n\nconst IMAGE_METADATA_NOTE =\n \"Raster images have no document metadata bundle in this API.\";\n\nasync function kindOf(input: BrowserAnalyzeInput): Promise<DetectFileKindInput> {\n assertBrowserInput(input);\n assertValidAnalyzeFileInput(input);\n return input as DetectFileKindInput;\n}\n\n/**\n * Text only: DOCX → `extractTextFromDocx`; imagen → `ocr`; texto → `analyzeText`.\n * PDF no está soportado en el navegador (mismo aviso que `analyzeFile`).\n */\nexport async function extractText(\n input: BrowserAnalyzeInput,\n options?: BrowserAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await kindOf(input);\n const kind = detectFileKind(resolved);\n const bytesInput = input as FileLikeInput;\n const signal = 
options?.signal;\n\n switch (kind) {\n case \"pdf\":\n return notImplementedResult(\"pdf\", \"pdf\", [BROWSER_PDF_UNSUPPORTED_WARNING]);\n case \"docx\": {\n const data = await toUint8Array(bytesInput);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n const r = await extractTextFromDocx(data);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: r.text,\n html: \"\",\n warnings: r.warnings,\n };\n }\n case \"image\": {\n const data = await toUint8Array(bytesInput);\n if (data.byteLength === 0) {\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [\"No image bytes were provided for analysis.\"],\n };\n }\n const ocrOpts = {\n ...(options?.ocr ?? {}),\n signal: options?.ocr?.signal ?? 
signal,\n };\n const r = await ocr(data, ocrOpts);\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: r.text,\n confidence: r.confidence,\n ocrUsed: r.ocrUsed,\n warnings: [],\n };\n }\n case \"text\":\n return analyzeText(bytesInput, { signal });\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n\n/**\n * Metadatos: en el navegador no hay pipeline PDF ni metadatos DOCX dedicados;\n * DOCX/imagen con avisos; texto → `analyzeText`.\n */\nexport async function extractMetadata(\n input: BrowserAnalyzeInput,\n options?: BrowserAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await kindOf(input);\n const kind = detectFileKind(resolved);\n const bytesInput = input as FileLikeInput;\n const signal = options?.signal;\n\n switch (kind) {\n case \"pdf\":\n return notImplementedResult(\"pdf\", \"pdf\", [BROWSER_PDF_UNSUPPORTED_WARNING]);\n case \"docx\":\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [DOCX_METADATA_STUB],\n };\n case \"image\":\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [IMAGE_METADATA_NOTE],\n };\n case \"text\":\n return analyzeText(bytesInput, { signal });\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n\n/**\n * HTML: DOCX → `extractTextFromDocx` + `convertDocxToHtml`; texto → `<pre>`;\n * PDF/imagen no aplican en browser como HTML rico.\n */\nexport async function convertToHtml(\n input: BrowserAnalyzeInput,\n options?: BrowserAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await kindOf(input);\n const kind = detectFileKind(resolved);\n const bytesInput = input as FileLikeInput;\n const signal = options?.signal;\n\n switch 
(kind) {\n case \"pdf\":\n return notImplementedResult(\"pdf\", \"pdf\", [BROWSER_PDF_UNSUPPORTED_WARNING]);\n case \"docx\": {\n const data = await toUint8Array(bytesInput);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n const [textPart, htmlPart] = await Promise.all([\n extractTextFromDocx(data),\n convertDocxToHtml(data),\n ]);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: textPart.text,\n html: htmlPart.html,\n warnings: [...textPart.warnings, ...htmlPart.warnings],\n };\n }\n case \"text\": {\n const t = await analyzeText(bytesInput, { signal });\n const html = `<pre>${escapeHtmlMinimal(t.text)}</pre>`;\n return {\n ...t,\n html,\n warnings: [\n ...t.warnings,\n \"HTML for plain text is a <pre> wrapper around decoded UTF-8 content.\",\n ],\n } as AnalysisResult;\n }\n case \"image\":\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [\"No HTML representation for raster images; use extractText / runOcr.\"],\n };\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n\n/**\n * OCR: imagen → `ocr`; DOCX → `analyzeDocx` con aviso (sin OCR); texto → `analyzeText`.\n * PDF no soportado en browser.\n */\nexport async function runOcr(\n input: BrowserAnalyzeInput,\n options?: BrowserAnalyzeOptions,\n): Promise<AnalysisResult> {\n throwIfAborted(options?.signal);\n const resolved = await kindOf(input);\n const kind = detectFileKind(resolved);\n const bytesInput = input as FileLikeInput;\n const signal = options?.signal;\n\n switch (kind) {\n case \"pdf\":\n return notImplementedResult(\"pdf\", \"pdf\", [BROWSER_PDF_UNSUPPORTED_WARNING]);\n case \"image\": {\n const data = await 
toUint8Array(bytesInput);\n if (data.byteLength === 0) {\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: \"\",\n confidence: 0,\n ocrUsed: true,\n warnings: [\"No image bytes were provided for analysis.\"],\n };\n }\n const ocrOpts = {\n ...(options?.ocr ?? {}),\n signal: options?.ocr?.signal ?? signal,\n };\n const r = await ocr(data, ocrOpts);\n return {\n fileKind: \"image\",\n analyzer: \"image\",\n status: \"ok\",\n kind: \"image\",\n text: r.text,\n confidence: r.confidence,\n ocrUsed: r.ocrUsed,\n warnings: [],\n };\n }\n case \"docx\": {\n const data = await toUint8Array(bytesInput);\n if (data.byteLength === 0) {\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: \"\",\n html: \"\",\n warnings: [\"No document bytes were provided for analysis.\"],\n };\n }\n const r = await analyzeDocx(data);\n return {\n fileKind: \"docx\",\n analyzer: \"docx\",\n status: \"ok\",\n kind: \"docx\",\n text: r.text,\n html: r.html,\n warnings: [\n ...r.warnings,\n \"OCR does not apply to DOCX; returned structured text/HTML extract.\",\n ],\n };\n }\n case \"text\":\n return analyzeText(bytesInput, { signal });\n default:\n return notImplementedResult(kind, \"none\", [UNKNOWN_FORMAT_WARNING]);\n }\n}\n"]}
|