mineru-open-sdk 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,774 @@
1
+ // src/constants.ts
/** Default base URL used by the token-authenticated API client. */
var DEFAULT_BASE_URL = "https://mineru.net/api/v4";
/** Default base URL used by the flash (agent) API client, which sends no Authorization header. */
var DEFAULT_FLASH_BASE_URL = "https://mineru.net/api/v1/agent";
4
+
5
+ // src/client.ts
6
+ import { readFile } from "fs/promises";
7
+ import { basename as basename2, extname as extname2 } from "path";
8
+
9
+ // src/errors.ts
/**
 * Base class of every API-level error thrown by this module.
 *
 * The message is formatted as `[code] message`, with ` (trace: <id>)`
 * appended only when a non-empty trace id is supplied.
 */
var MinerUError = class extends Error {
  /** Error code, always stored as a string. */
  code;
  /** Trace id given to the constructor ("" when none was given). */
  traceId;
  /**
   * @param code - Error code (stringified for the `code` property).
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    let text = `[${code}] ${message}`;
    if (traceId) {
      text += ` (trace: ${traceId})`;
    }
    super(text);
    this.name = "MinerUError";
    this.traceId = traceId;
    this.code = String(code);
  }
};
/** Error class mapped to API codes A0202 and A0211 in CODE_TO_ERROR. */
var AuthError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "AuthError";
  }
};
/** Error class mapped to API codes -500 and -10002 in CODE_TO_ERROR. */
var ParamError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "ParamError";
  }
};
/** Error class mapped to API code -60005 in CODE_TO_ERROR. */
var FileTooLargeError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "FileTooLargeError";
  }
};
/** Error class mapped to API code -60006 in CODE_TO_ERROR. */
var PageLimitError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "PageLimitError";
  }
};
/** Error class mapped to API code -60012 in CODE_TO_ERROR. */
var TaskNotFoundError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "TaskNotFoundError";
  }
};
/** Error class mapped to API code -60010 in CODE_TO_ERROR. */
var ExtractFailedError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "ExtractFailedError";
  }
};
/**
 * Thrown by the polling loops when a task or batch has not reached a final
 * state before the deadline. Always carries the fixed code "TIMEOUT".
 */
var TimeoutError = class extends MinerUError {
  /** The timeout that elapsed (rendered with an "s" suffix in the message). */
  timeout;
  /** Identifier of the task (or batch ids) that was being waited on. */
  taskId;
  /**
   * @param timeout - Timeout value that was exceeded.
   * @param taskId - Identifier reported in the message.
   */
  constructor(timeout, taskId) {
    const description = `Task ${taskId} did not complete within ${timeout}s`;
    super("TIMEOUT", description);
    this.name = "TimeoutError";
    this.taskId = taskId;
    this.timeout = timeout;
  }
};
/** Error class mapped to API codes -60018 and -60019 in CODE_TO_ERROR. */
var QuotaExceededError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "QuotaExceededError";
  }
};
/** Error class mapped to API code -30001 in CODE_TO_ERROR. */
var FlashFileTooLargeError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "FlashFileTooLargeError";
  }
};
/** Error class mapped to API code -30002 in CODE_TO_ERROR. */
var FlashUnsupportedTypeError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "FlashUnsupportedTypeError";
  }
};
/** Error class mapped to API code -30003 in CODE_TO_ERROR. */
var FlashPageLimitError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "FlashPageLimitError";
  }
};
/** Error class mapped to API code -30004 in CODE_TO_ERROR. */
var FlashParamError = class extends MinerUError {
  /**
   * @param code - Error code.
   * @param message - Human-readable description.
   * @param traceId - Optional trace id; defaults to "".
   */
  constructor(code, message, traceId = "") {
    super(code, message, traceId);
    // Override the base class name so logs show the concrete error type.
    this.name = "FlashParamError";
  }
};
/**
 * Thrown when an operation that needs the authenticated API client is called
 * on a client constructed without a token. Always carries the fixed code "-1".
 */
var NoAuthClientError = class extends MinerUError {
  constructor() {
    const hint = "This operation requires an authenticated client; pass token to MinerU() or set MINERU_TOKEN env var.";
    super("-1", hint);
    this.name = "NoAuthClientError";
  }
};
/**
 * Lookup table from API error code (as a string) to the error class that
 * raiseForCode() instantiates. Codes absent from this table fall back to
 * MinerUError in raiseForCode().
 */
var CODE_TO_ERROR = {
  // Codes mapped to AuthError.
  "A0202": AuthError,
  "A0211": AuthError,
  // Codes mapped to ParamError.
  "-500": ParamError,
  "-10002": ParamError,
  // -600xx codes.
  "-60005": FileTooLargeError,
  "-60006": PageLimitError,
  "-60010": ExtractFailedError,
  "-60012": TaskNotFoundError,
  "-60013": MinerUError,
  "-60018": QuotaExceededError,
  "-60019": QuotaExceededError,
  // -300xx codes, mapped to the Flash* error classes.
  "-30001": FlashFileTooLargeError,
  "-30002": FlashUnsupportedTypeError,
  "-30003": FlashPageLimitError,
  "-30004": FlashParamError
};
/**
 * Throw the error class registered for an API error code.
 *
 * Only the table's own keys are consulted. A plain property read would also
 * resolve inherited names such as "constructor" or "toString", which would
 * make this function throw a non-MinerUError value (or fail inside `new`)
 * for an unexpected code.
 *
 * @param code - Error code reported by the API (any value; stringified for lookup).
 * @param msg - Error message to attach.
 * @param traceId - Optional trace id; defaults to "".
 * @throws The mapped error class, or MinerUError when the code is not registered.
 */
function raiseForCode(code, msg, traceId = "") {
  const key = String(code);
  const isRegistered = Object.prototype.hasOwnProperty.call(CODE_TO_ERROR, key);
  const ErrorClass = isRegistered ? CODE_TO_ERROR[key] : MinerUError;
  throw new ErrorClass(code, msg, traceId);
}
127
+
128
+ // src/api.ts
/**
 * Thin fetch wrapper for the token-authenticated API.
 *
 * JSON responses are expected to carry a `code` field; any value other than
 * 0 is converted into a thrown error via raiseForCode().
 */
var ApiClient = class {
  /** Prefix prepended to every request path. */
  baseUrl;
  /** Headers sent with JSON requests (Authorization + Content-Type). */
  headers;
  /** Value of the optional `source` header added to POST requests. */
  source;
  /**
   * @param token - Bearer token placed in the Authorization header.
   * @param baseUrl - Prefix prepended to every request path.
   * @param source - Value for the `source` header; "" disables it.
   */
  constructor(token, baseUrl, source = "") {
    this.baseUrl = baseUrl;
    this.source = source;
    this.headers = {
      Authorization: `Bearer ${token}`,
      "Content-Type": "application/json"
    };
  }
  /** Replace the value sent in the `source` header of POST requests. */
  setSource(source) {
    this.source = source;
  }
  /**
   * POST a JSON payload to `baseUrl + path` and return the validated body.
   * The `source` header is added only when a non-empty source is set.
   */
  async post(path, json) {
    const requestHeaders = { ...this.headers };
    if (this.source) {
      requestHeaders["source"] = this.source;
    }
    const init = {
      method: "POST",
      headers: requestHeaders,
      body: JSON.stringify(json)
    };
    const response = await fetch(`${this.baseUrl}${path}`, init);
    return this.handle(response);
  }
  /** GET `baseUrl + path` with the stored headers and return the validated body. */
  async get(path) {
    const init = { method: "GET", headers: this.headers };
    const response = await fetch(`${this.baseUrl}${path}`, init);
    return this.handle(response);
  }
  /**
   * PUT raw data to an absolute URL; no client headers are sent.
   * @throws Error when the response status is not OK.
   */
  async putFile(url, data) {
    const response = await fetch(url, { method: "PUT", body: data });
    if (response.ok) {
      return;
    }
    throw new Error(`Upload failed: ${response.status} ${response.statusText}`);
  }
  /**
   * Download an absolute URL (following redirects) and return its bytes.
   * @throws Error when the response status is not OK.
   */
  async download(url) {
    const response = await fetch(url, { redirect: "follow" });
    if (!response.ok) {
      throw new Error(`Download failed: ${response.status} ${response.statusText}`);
    }
    const buffer = await response.arrayBuffer();
    return new Uint8Array(buffer);
  }
  /**
   * Validate a JSON API response.
   * @throws Error for non-OK HTTP statuses (the body text is appended when readable).
   * @throws via raiseForCode() when the body's `code` is not 0.
   */
  async handle(resp) {
    if (resp.ok) {
      const body = await resp.json();
      if (body.code !== 0) {
        raiseForCode(body.code, body.msg ?? "unknown error", body.trace_id ?? "");
      }
      return body;
    }
    // Reading the body is best-effort; a failure just drops the detail suffix.
    const text = await resp.text().catch(() => "");
    const detail = text ? ` \u2014 ${text}` : "";
    throw new Error(`HTTP ${resp.status}: ${resp.statusText}${detail}`);
  }
};
193
+
194
+ // src/flash-api.ts
/**
 * Thin fetch wrapper for the flash API. Unlike ApiClient it sends no
 * Authorization header and maps HTTP 429 to a "RATE_LIMITED" error.
 */
var FlashApiClient = class {
  /** Prefix prepended to every request path. */
  baseUrl;
  /** Headers sent with POST requests (Content-Type only). */
  headers;
  /** Value of the optional `source` header added to POST requests. */
  source;
  /**
   * @param baseUrl - Prefix prepended to every request path; defaults to DEFAULT_FLASH_BASE_URL.
   * @param source - Value for the `source` header; "" disables it.
   */
  constructor(baseUrl = DEFAULT_FLASH_BASE_URL, source = "") {
    this.baseUrl = baseUrl;
    this.source = source;
    this.headers = { "Content-Type": "application/json" };
  }
  /** Replace the value sent in the `source` header of POST requests. */
  setSource(source) {
    this.source = source;
  }
  /**
   * POST a JSON payload to `baseUrl + path` and return the validated body.
   * The `source` header is added only when a non-empty source is set.
   */
  async post(path, json) {
    const requestHeaders = { ...this.headers };
    if (this.source) {
      requestHeaders["source"] = this.source;
    }
    const init = {
      method: "POST",
      headers: requestHeaders,
      body: JSON.stringify(json)
    };
    const response = await fetch(`${this.baseUrl}${path}`, init);
    return this.handle(response);
  }
  /** GET `baseUrl + path` (no headers) and return the validated body. */
  async get(path) {
    const response = await fetch(`${this.baseUrl}${path}`, { method: "GET" });
    return this.handle(response);
  }
  /**
   * PUT raw data to an absolute URL.
   * @throws Error when the response status is not OK.
   */
  async putFile(url, data) {
    const response = await fetch(url, { method: "PUT", body: data });
    if (response.ok) {
      return;
    }
    throw new Error(`Upload failed: ${response.status} ${response.statusText}`);
  }
  /**
   * Download an absolute URL (following redirects) and return its body as text.
   * @throws Error when the response status is not OK.
   */
  async downloadText(url) {
    const response = await fetch(url, { redirect: "follow" });
    if (!response.ok) {
      throw new Error(`Download failed: ${response.status} ${response.statusText}`);
    }
    return response.text();
  }
  /**
   * Validate a JSON API response.
   * @throws via raiseForCode("RATE_LIMITED", ...) on HTTP 429.
   * @throws Error for other non-OK HTTP statuses (body text appended when readable).
   * @throws via raiseForCode() when the body's `code` is not 0.
   */
  async handle(resp) {
    // 429 is checked before the generic !ok branch so it gets its own error code.
    if (resp.status === 429) {
      raiseForCode(
        "RATE_LIMITED",
        "flash API rate limit exceeded; try again later"
      );
    }
    if (resp.ok) {
      const body = await resp.json();
      if (body.code !== 0) {
        raiseForCode(body.code, body.msg ?? "unknown error", body.trace_id ?? "");
      }
      return body;
    }
    // Reading the body is best-effort; a failure just drops the detail suffix.
    const text = await resp.text().catch(() => "");
    const detail = text ? ` \u2014 ${text}` : "";
    throw new Error(`HTTP ${resp.status}: ${resp.statusText}${detail}`);
  }
};
258
+
259
+ // src/models.ts
import { writeFile, mkdir } from "fs/promises";
import { dirname, join, resolve, sep } from "path";
/**
 * Completion of a progress record as a percentage.
 *
 * @param p - Object with numeric `extractedPages` and `totalPages`.
 * @returns `extractedPages / totalPages * 100`, or 0 when `totalPages` is 0.
 */
function progressPercent(p) {
  const { extractedPages, totalPages } = p;
  // Guard the division: a zero page count reports 0% rather than NaN/Infinity.
  return totalPages === 0 ? 0 : extractedPages / totalPages * 100;
}
/**
 * Render a progress record as `extracted/total (NN%)`, with the percentage
 * rounded to zero decimal places.
 *
 * @param p - Object with numeric `extractedPages` and `totalPages`.
 */
function progressToString(p) {
  const percent = progressPercent(p).toFixed(0);
  return `${p.extractedPages}/${p.totalPages} (${percent}%)`;
}
/**
 * Build a result record for a task with every content field unset.
 *
 * @param taskId - Identifier stored in `taskId`.
 * @param state - State string stored in `state`.
 * @returns A new object; `images` is a fresh empty array, `errCode` is "",
 *   and all other fields are null.
 */
function createEmptyResult(taskId, state) {
  const blank = {
    taskId,
    state,
    // Metadata fields.
    filename: null,
    errCode: "",
    error: null,
    zipUrl: null,
    progress: null,
    // Content fields.
    markdown: null,
    contentList: null,
    images: [],
    docx: null,
    html: null,
    latex: null,
    // Raw archive bytes, read by saveAll().
    _zipBytes: null
  };
  return blank;
}
/**
 * Create the parent directory of `filePath` (and any missing ancestors).
 *
 * @param filePath - Path of a file that is about to be written.
 */
async function ensureDir(filePath) {
  const parent = dirname(filePath);
  await mkdir(parent, { recursive: true });
}
/**
 * Write a result's markdown to `path` and, optionally, its images into an
 * `images` directory next to that file.
 *
 * @param result - Result record with `markdown` and `images`.
 * @param path - Destination file for the markdown text (UTF-8).
 * @param withImages - When true (default), also write every image, using the
 *   image's `name` as the file name.
 * @throws Error when `result.markdown` is null or undefined.
 */
async function saveMarkdown(result, path, withImages = true) {
  if (result.markdown == null) {
    throw new Error("No markdown content available (state != done)");
  }
  await ensureDir(path);
  await writeFile(path, result.markdown, "utf-8");
  if (!withImages || !(result.images.length > 0)) {
    return;
  }
  const imageDir = join(dirname(path), "images");
  await mkdir(imageDir, { recursive: true });
  for (const image of result.images) {
    const target = join(imageDir, image.name);
    await writeFile(target, image.data);
  }
}
/**
 * Write a result's DOCX bytes to `path`, creating parent directories.
 *
 * @param result - Result record with a `docx` field.
 * @param path - Destination file.
 * @throws Error when `result.docx` is null or undefined.
 */
async function saveDocx(result, path) {
  const payload = result.docx;
  if (payload == null) {
    throw new Error("No docx content available");
  }
  await ensureDir(path);
  await writeFile(path, payload);
}
/**
 * Write a result's HTML text to `path` as UTF-8, creating parent directories.
 *
 * @param result - Result record with an `html` field.
 * @param path - Destination file.
 * @throws Error when `result.html` is null or undefined.
 */
async function saveHtml(result, path) {
  const payload = result.html;
  if (payload == null) {
    throw new Error("No html content available");
  }
  await ensureDir(path);
  await writeFile(path, payload, "utf-8");
}
/**
 * Write a result's LaTeX text to `path` as UTF-8, creating parent directories.
 *
 * @param result - Result record with a `latex` field.
 * @param path - Destination file.
 * @throws Error when `result.latex` is null or undefined.
 */
async function saveLatex(result, path) {
  const payload = result.latex;
  if (payload == null) {
    throw new Error("No latex content available");
  }
  await ensureDir(path);
  await writeFile(path, payload, "utf-8");
}
/**
 * Map a zip entry name to its destination inside `root`.
 *
 * @param root - Absolute, normalized extraction directory.
 * @param entryName - Entry name exactly as stored in the archive.
 * @returns The normalized destination path, guaranteed to be below `root`.
 * @throws Error when the entry (e.g. one containing `../`) would resolve to
 *   `root` itself or to a location outside of it.
 */
function resolveZipEntryPath(root, entryName) {
  // join() normalizes "." / ".." segments, so a plain prefix check is reliable.
  const target = join(root, entryName);
  const prefix = root.endsWith(sep) ? root : root + sep;
  if (!target.startsWith(prefix)) {
    throw new Error(
      `Unsafe zip entry path "${entryName}": does not resolve to a file inside ${root}`
    );
  }
  return target;
}
/**
 * Extract every file of the result's zip archive into `dir`.
 *
 * The archive comes from a remote download, so entry names are treated as
 * untrusted: all of them are validated before anything is written, and an
 * entry that would land outside `dir` aborts the extraction.
 *
 * @param result - Result record whose `_zipBytes` holds the archive.
 * @param dir - Target directory; created if missing.
 * @throws Error when `_zipBytes` is null/undefined or an entry path is unsafe.
 */
async function saveAll(result, dir) {
  if (result._zipBytes == null) {
    throw new Error("No zip data available (state != done)");
  }
  const { unzipSync: unzipSync2 } = await import("fflate");
  const entries = unzipSync2(new Uint8Array(result._zipBytes));
  const root = resolve(dir);
  // Validate first so a malicious archive cannot leave a partial extraction behind.
  const targets = [];
  for (const [relativePath, content] of Object.entries(entries)) {
    if (relativePath.endsWith("/")) continue; // directory entry, no payload
    targets.push([resolveZipEntryPath(root, relativePath), content]);
  }
  await mkdir(root, { recursive: true });
  for (const [fullPath, content] of targets) {
    await mkdir(dirname(fullPath), { recursive: true });
    await writeFile(fullPath, content);
  }
}
345
+
346
+ // src/zip.ts
347
+ import { unzipSync } from "fflate";
/** Lower-case file extensions (with leading dot) that parseZip() collects as images. */
var IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([
  ".png",
  ".jpg",
  ".jpeg",
  ".gif",
  ".bmp",
  ".svg",
  ".webp"
]);
/**
 * Lower-cased extension of a file name, including the leading dot.
 *
 * Everything from the last "." onward is returned, so a dot-file such as
 * ".gitignore" yields the whole name.
 *
 * @param filename - File name to inspect.
 * @returns The extension, or "" when the name contains no dot.
 */
function extname(filename) {
  const dotIndex = filename.lastIndexOf(".");
  if (dotIndex < 0) {
    return "";
  }
  return filename.substring(dotIndex).toLowerCase();
}
/**
 * Last path segment of `filepath`, treating both "/" and "\" as separators.
 *
 * @param filepath - Path to inspect.
 * @returns The text after the final separator ("" when the path ends with one).
 */
function basename(filepath) {
  const unified = filepath.replace(/\\/g, "/");
  // lastIndexOf() is -1 without a separator, which makes slice(0) return everything.
  return unified.slice(unified.lastIndexOf("/") + 1);
}
/**
 * Unpack a result archive and sort its files into a result record.
 *
 * Classification per entry, first match wins:
 *   1. `.md`                                   -> `markdown` (decoded text)
 *   2. `*_content_list.json` / `content_list.json` -> `contentList` (parsed JSON)
 *   3. other `.json` while no content list is set -> `contentList`, only if it
 *      parses to an array (parse errors are ignored)
 *   4. known image extension                   -> appended to `images`
 *   5. `.docx` -> `docx` bytes, `.html`/`.htm` -> `html` text, `.tex` -> `latex` text
 * When several entries match the same slot, the later entry overwrites it.
 *
 * @param zipBytes - Raw archive bytes; also stored on the result as `_zipBytes`.
 * @param taskId - Task id for the returned record.
 * @param filename - Original file name, or null.
 * @returns A result record in state "done".
 */
function parseZip(zipBytes, taskId, filename = null) {
  const result = createEmptyResult(taskId, "done");
  result.filename = filename;
  result._zipBytes = zipBytes;
  const decoder = new TextDecoder();
  const collectedImages = [];
  let contentList = null;
  const archive = unzipSync(zipBytes);
  for (const [relPath, data] of Object.entries(archive)) {
    if (relPath.endsWith("/")) {
      continue; // directory entry
    }
    const name = basename(relPath);
    const ext = extname(name);
    if (ext === ".md") {
      result.markdown = decoder.decode(data);
      continue;
    }
    if (name.endsWith("_content_list.json") || name === "content_list.json") {
      contentList = JSON.parse(decoder.decode(data));
      continue;
    }
    if (ext === ".json" && contentList == null) {
      // Fallback candidate: accept it only when it is valid JSON holding an array.
      try {
        const candidate = JSON.parse(decoder.decode(data));
        if (Array.isArray(candidate)) {
          contentList = candidate;
        }
      } catch {
      }
      continue;
    }
    if (IMAGE_EXTENSIONS.has(ext)) {
      collectedImages.push({ name, data: new Uint8Array(data), path: relPath });
      continue;
    }
    switch (ext) {
      case ".docx":
        result.docx = new Uint8Array(data);
        break;
      case ".html":
      case ".htm":
        result.html = decoder.decode(data);
        break;
      case ".tex":
        result.latex = decoder.decode(data);
        break;
      default:
        break;
    }
  }
  result.contentList = contentList;
  result.images = collectedImages;
  return result;
}
403
+
404
+ // src/client.ts
/** Short model aliases accepted in options, mapped to the model version sent to the API. */
var MODEL_MAP = {
  pipeline: "pipeline",
  vlm: "vlm",
  html: "MinerU-HTML"
};
/** Extensions for which inferModel() selects the "MinerU-HTML" model. */
var HTML_EXTENSIONS = /* @__PURE__ */ new Set([".html", ".htm"]);
/** Value of the `source` header the clients are constructed with. */
var DEFAULT_SOURCE = "open-api-sdk-js";
/** Default polling timeout for single-source operations, in seconds. */
var DEFAULT_TIMEOUT_POLL_SINGLE = 300;
/** Default polling timeout for batch operations, in seconds. */
var DEFAULT_TIMEOUT_POLL_BATCH = 1800;
/**
 * Whether `source` begins with "http://" or "https://" (case-sensitive).
 *
 * @param source - Source string to test.
 */
function isUrl(source) {
  return ["http://", "https://"].some((scheme) => source.startsWith(scheme));
}
417
+ function getExtension(source) {
418
+ if (isUrl(source)) {
419
+ const path = source.split("?")[0].split("#")[0];
420
+ const dot = path.lastIndexOf(".");
421
+ return dot === -1 ? "" : path.slice(dot).toLowerCase();
422
+ }
423
+ return extname2(source).toLowerCase();
424
+ }
/**
 * Choose a model version from the source's extension.
 *
 * @param source - URL or file path.
 * @returns "MinerU-HTML" for extensions in HTML_EXTENSIONS, otherwise "vlm".
 */
function inferModel(source) {
  const extension = getExtension(source);
  if (HTML_EXTENSIONS.has(extension)) {
    return "MinerU-HTML";
  }
  return "vlm";
}
/**
 * Turn the caller's `model` option into the model version sent to the API.
 *
 * @param model - Alias from MODEL_MAP, a literal model version, or null/undefined.
 * @param source - Source used to infer the model when `model` is not given.
 * @returns The mapped alias, the value itself when it is not an alias, or the
 *   inferred model when `model` is null/undefined.
 */
function resolveModel(model, source) {
  if (model == null) {
    return inferModel(source);
  }
  return MODEL_MAP[model] ?? model;
}
/**
 * Translate SDK option names into the request fields shared by a whole batch.
 *
 * @param modelVersion - Value for `model_version`.
 * @param opts - Options object; `formula`, `table` and `language` are copied
 *   when not undefined, `extraFormats` only when it is a non-empty list.
 * @returns A new payload object.
 */
function buildApiOptions(modelVersion, opts) {
  const payload = { model_version: modelVersion };
  // [SDK option name, API field name], in the order the fields are emitted.
  const renames = [
    ["formula", "enable_formula"],
    ["table", "enable_table"],
    ["language", "language"]
  ];
  for (const [optionName, fieldName] of renames) {
    const value = opts[optionName];
    if (value !== void 0) {
      payload[fieldName] = value;
    }
  }
  if (opts.extraFormats?.length) {
    payload["extra_formats"] = opts.extraFormats;
  }
  return payload;
}
444
+ function applyFileFields(entry, key, ocr, pages, fileParams) {
445
+ const fp = fileParams?.[key];
446
+ const effectiveOcr = fp?.ocr !== void 0 ? fp.ocr : ocr;
447
+ if (effectiveOcr !== void 0) entry["is_ocr"] = effectiveOcr;
448
+ const effectivePages = fp?.pages || pages;
449
+ if (effectivePages) entry["page_ranges"] = effectivePages;
450
+ if (fp?.dataId) entry["data_id"] = fp.dataId;
451
+ }
/**
 * Convert one task object from an API response into a result record.
 * No archive content is loaded here; only metadata fields are filled in.
 *
 * @param data - Task object with snake_case keys (`task_id`, `state`,
 *   `file_name`, `err_code`, `err_msg`, `full_zip_url`, `extract_progress`).
 * @returns A result record; missing fields fall back to "", "unknown", null or 0.
 */
function parseTaskResult(data) {
  const taskId = data["task_id"] ?? "";
  const state = data["state"] ?? "unknown";
  const result = createEmptyResult(taskId, state);
  const rawErrCode = data["err_code"];
  const rawProgress = data["extract_progress"];
  result.filename = data["file_name"] ?? null;
  // The error code is normalized to a string; absent codes become "".
  result.errCode = rawErrCode == null ? "" : String(rawErrCode);
  result.error = data["err_msg"] || null;
  result.zipUrl = data["full_zip_url"] ?? null;
  if (rawProgress) {
    result.progress = {
      extractedPages: rawProgress["extracted_pages"] ?? 0,
      totalPages: rawProgress["total_pages"] ?? 0,
      startTime: rawProgress["start_time"] ?? ""
    };
  }
  return result;
}
/**
 * Promise that resolves after roughly `ms` milliseconds.
 *
 * @param ms - Delay passed to setTimeout.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * High-level client combining the token-authenticated API and the flash API.
 *
 * Methods that use the authenticated API throw NoAuthClientError when the
 * client was created without a token; flashExtract() works without one.
 *
 * Polling (extract, extractBatch, flashExtract) starts with a 2 s interval
 * that doubles after each attempt up to 30 s. While polling a batch, only
 * task state is fetched; each finished task's zip archive is downloaded once,
 * when its result is about to be returned or yielded.
 */
var MinerU = class {
  /** Authenticated API client, or null when no token was available. */
  api;
  /** Flash API client (always present). */
  flashApi;
  /**
   * @param token - API token. When omitted, the MINERU_TOKEN environment variable is used.
   * @param baseUrl - API base URL.
   * @param flashBaseUrl - Flash API base URL.
   */
  constructor(token, baseUrl = DEFAULT_BASE_URL, flashBaseUrl) {
    const resolved = token ?? process.env["MINERU_TOKEN"];
    if (resolved) {
      this.api = new ApiClient(resolved, baseUrl, DEFAULT_SOURCE);
    } else {
      this.api = null;
    }
    this.flashApi = new FlashApiClient(flashBaseUrl, DEFAULT_SOURCE);
  }
  /** Set the `source` header value on both underlying clients. */
  setSource(source) {
    if (this.api !== null) {
      this.api.setSource(source);
    }
    this.flashApi.setSource(source);
  }
  /**
   * Return the authenticated client.
   * @throws NoAuthClientError when the client was created without a token.
   */
  requireAuth() {
    if (this.api === null) {
      throw new NoAuthClientError();
    }
    return this.api;
  }
  // ══════════════════════════════════════════════════════════════════
  // Submit-and-wait methods (resolve once polling reaches a final state)
  // ══════════════════════════════════════════════════════════════════
  /**
   * Submit one URL or local file and wait for its result.
   *
   * @param source - URL (http/https) or local file path.
   * @param options - `timeout` in seconds (default 300) plus extraction options
   *   (`model`, `formula`, `table`, `language`, `extraFormats`, `ocr`, `pages`, `fileParams`).
   * @returns The first result of the submitted batch.
   * @throws TimeoutError when the deadline passes before a final state.
   */
  async extract(source, options = {}) {
    this.requireAuth(); // fail before doing any work when unauthenticated
    const { timeout = DEFAULT_TIMEOUT_POLL_SINGLE, ...opts } = options;
    const modelVersion = resolveModel(opts.model, source);
    const apiOpts = buildApiOptions(modelVersion, opts);
    let batchId;
    if (isUrl(source)) {
      batchId = await this.submitUrlsBatch([source], apiOpts, opts.ocr, opts.pages, opts.fileParams);
    } else {
      batchId = await this.uploadAndSubmit([source], apiOpts, opts.ocr, opts.pages, opts.fileParams);
    }
    const results = await this.waitBatch(batchId, timeout);
    return results[0];
  }
  /**
   * Submit several sources and yield each result as soon as it is final.
   *
   * URLs and local files are submitted as separate batches. When `model` is
   * not given, it is inferred from the first source only. The batch-wide
   * `pages` option is not forwarded here; use `fileParams` for page ranges.
   *
   * @param sources - URLs and/or local file paths.
   * @param options - `timeout` in seconds (default 1800) plus extraction options.
   * @throws TimeoutError when the deadline passes before all results were yielded.
   */
  async *extractBatch(sources, options = {}) {
    this.requireAuth();
    const { timeout = DEFAULT_TIMEOUT_POLL_BATCH, ...opts } = options;
    const firstSource = sources[0] ?? "";
    const modelVersion = resolveModel(opts.model, firstSource);
    const apiOpts = buildApiOptions(modelVersion, opts);
    const urls = sources.filter(isUrl);
    const files = sources.filter((s) => !isUrl(s));
    const batchIds = [];
    if (urls.length > 0) {
      batchIds.push(await this.submitUrlsBatch(urls, apiOpts, opts.ocr, void 0, opts.fileParams));
    }
    if (files.length > 0) {
      batchIds.push(await this.uploadAndSubmit(files, apiOpts, opts.ocr, void 0, opts.fileParams));
    }
    yield* this.yieldBatch(batchIds, sources.length, timeout);
  }
  /** extract() with `model: "html"` as the default model; caller options override the defaults. */
  async crawl(url, options = {}) {
    return this.extract(url, { model: "html", timeout: DEFAULT_TIMEOUT_POLL_SINGLE, ...options });
  }
  /** extractBatch() with `model: "html"` as the default model; caller options override the defaults. */
  async *crawlBatch(urls, options = {}) {
    yield* this.extractBatch(urls, { model: "html", timeout: DEFAULT_TIMEOUT_POLL_BATCH, ...options });
  }
  // ══════════════════════════════════════════════════════════════════
  // Async primitives (no polling, no waiting)
  // ══════════════════════════════════════════════════════════════════
  /**
   * Submit one URL or local file without waiting.
   * @returns The batch id of the submission.
   */
  async submit(source, options = {}) {
    this.requireAuth();
    const modelVersion = resolveModel(options.model, source);
    const apiOpts = buildApiOptions(modelVersion, options);
    if (isUrl(source)) {
      return this.submitUrlsBatch([source], apiOpts, options.ocr, options.pages, options.fileParams);
    }
    return this.uploadAndSubmit([source], apiOpts, options.ocr, options.pages, options.fileParams);
  }
  /**
   * Submit several sources of one kind (all URLs or all local files) without waiting.
   *
   * @returns The batch id of the submission.
   * @throws Error when `sources` is empty or mixes URLs with local files.
   */
  async submitBatch(sources, options = {}) {
    this.requireAuth();
    const firstSource = sources[0] ?? "";
    const modelVersion = resolveModel(options.model, firstSource);
    const apiOpts = buildApiOptions(modelVersion, options);
    const urls = sources.filter(isUrl);
    const files = sources.filter((s) => !isUrl(s));
    if (urls.length === 0 && files.length === 0) {
      throw new Error("No sources provided.");
    }
    if (urls.length > 0 && files.length > 0) {
      throw new Error(
        "submitBatch() does not support mixing URLs and local files in one call. Please submit them separately or use extractBatch() instead."
      );
    }
    if (urls.length > 0) {
      return this.submitUrlsBatch(urls, apiOpts, options.ocr, void 0, options.fileParams);
    }
    return this.uploadAndSubmit(files, apiOpts, options.ocr, void 0, options.fileParams);
  }
  /**
   * Fetch one task. When it is done and has a zip URL, the archive is
   * downloaded and parsed into the returned result.
   */
  async getTask(taskId) {
    const api = this.requireAuth();
    const body = await api.get(`/extract/task/${taskId}`);
    return this.withContent(parseTaskResult(body.data));
  }
  /**
   * Fetch every task of a batch. Archives of tasks that are done and have a
   * zip URL are downloaded and parsed, one after another.
   */
  async getBatch(batchId) {
    const states = await this.fetchBatchStates(batchId);
    const results = [];
    for (const state of states) {
      results.push(await this.withContent(state));
    }
    return results;
  }
  // ══════════════════════════════════════════════════════════════════
  // Internal helpers
  // ══════════════════════════════════════════════════════════════════
  /** Fetch the tasks of a batch as metadata-only results (no archive download). */
  async fetchBatchStates(batchId) {
    const api = this.requireAuth();
    const body = await api.get(`/extract-results/batch/${batchId}`);
    const items = body.data["extract_result"] ?? [];
    const states = [];
    for (const item of items) {
      states.push(parseTaskResult(item));
    }
    return states;
  }
  /** Download and parse the archive of a finished task; other results pass through unchanged. */
  async withContent(result) {
    if (result.state === "done" && result.zipUrl) {
      return this.downloadAndParse(result);
    }
    return result;
  }
  /** Submit URLs as one batch and return its batch id. */
  async submitUrlsBatch(urls, opts, ocr, pages, fileParams) {
    const files = urls.map((u) => {
      const entry = { url: u };
      applyFileFields(entry, u, ocr, pages, fileParams);
      return entry;
    });
    const body = await this.requireAuth().post("/extract/task/batch", {
      files,
      ...opts
    });
    return body.data["batch_id"];
  }
  /**
   * Request upload URLs for local files, upload each file in order, and
   * return the batch id. Upload URLs are matched to files by position.
   */
  async uploadAndSubmit(filePaths, opts, ocr, pages, fileParams) {
    const api = this.requireAuth();
    const filesMeta = filePaths.map((p) => {
      const entry = { name: basename2(p) };
      applyFileFields(entry, p, ocr, pages, fileParams);
      return entry;
    });
    const body = await api.post("/file-urls/batch", {
      files: filesMeta,
      ...opts
    });
    const batchId = body.data["batch_id"];
    const uploadUrls = body.data["file_urls"];
    for (let i = 0; i < filePaths.length; i++) {
      const data = await readFile(filePaths[i]);
      await api.putFile(uploadUrls[i], new Uint8Array(data));
    }
    return batchId;
  }
  /** Download `result.zipUrl` and parse it into a full result that keeps the zip URL. */
  async downloadAndParse(result) {
    const zipBytes = await this.requireAuth().download(result.zipUrl);
    const parsed = parseZip(zipBytes, result.taskId, result.filename);
    parsed.zipUrl = result.zipUrl;
    return parsed;
  }
  /**
   * Poll a batch until every task is "done" or "failed", then return the
   * results with archives loaded.
   *
   * @param batchId - Batch to wait for.
   * @param timeout - Deadline in seconds, measured from the call.
   * @throws TimeoutError when the deadline has passed and tasks are still pending.
   */
  async waitBatch(batchId, timeout) {
    const deadline = Date.now() + timeout * 1e3;
    let interval = 2e3;
    for (; ; ) {
      const states = await this.fetchBatchStates(batchId);
      if (states.every((r) => r.state === "done" || r.state === "failed")) {
        // Archives are fetched only now, once, instead of on every poll.
        const results = [];
        for (const state of states) {
          results.push(await this.withContent(state));
        }
        return results;
      }
      if (Date.now() > deadline) {
        throw new TimeoutError(timeout, batchId);
      }
      // Never sleep past the deadline.
      await sleep(Math.min(interval, Math.max(0, deadline - Date.now())));
      interval = Math.min(interval * 2, 3e4);
    }
  }
  /**
   * Poll several batches and yield each task's result the first time it is
   * seen as "done" or "failed". A task is identified by batch id and position.
   *
   * @param batchIds - Batches to poll.
   * @param total - Number of results to yield before finishing.
   * @param timeout - Deadline in seconds, measured from the first poll.
   * @throws TimeoutError when the deadline has passed before `total` results were yielded.
   */
  async *yieldBatch(batchIds, total, timeout) {
    const deadline = Date.now() + timeout * 1e3;
    const yielded = /* @__PURE__ */ new Set();
    let interval = 2e3;
    while (yielded.size < total) {
      for (const bid of batchIds) {
        const states = await this.fetchBatchStates(bid);
        for (let idx = 0; idx < states.length; idx++) {
          const key = `${bid}:${idx}`;
          const state = states[idx];
          if (yielded.has(key) || !(state.state === "done" || state.state === "failed")) {
            continue;
          }
          // Download only for tasks not yielded yet, so each archive is fetched once.
          const full = await this.withContent(state);
          yielded.add(key);
          yield full;
        }
      }
      if (yielded.size >= total) break;
      if (Date.now() > deadline) {
        throw new TimeoutError(timeout, batchIds.join(","));
      }
      await sleep(Math.min(interval, Math.max(0, deadline - Date.now())));
      interval = Math.min(interval * 2, 3e4);
    }
  }
  // ══════════════════════════════════════════════════════════════════
  // Flash (agent) mode
  // ══════════════════════════════════════════════════════════════════
  /**
   * Submit one URL or local file to the flash API and wait for the result.
   * No token is required.
   *
   * @param source - URL (http/https) or local file path.
   * @param options - `language` (default "ch"), `pageRange`, `timeout` in seconds (default 300).
   * @throws TimeoutError when the deadline passes before a final state.
   */
  async flashExtract(source, options = {}) {
    const { language = "ch", pageRange, timeout = DEFAULT_TIMEOUT_POLL_SINGLE } = options;
    let taskId;
    if (isUrl(source)) {
      taskId = await this.flashSubmitUrl(source, language, pageRange);
    } else {
      taskId = await this.flashSubmitFile(source, language, pageRange);
    }
    return this.flashWait(taskId, timeout);
  }
  // ── Flash internal helpers ──
  /** Submit a URL to the flash API and return the task id. */
  async flashSubmitUrl(url, language, pageRange) {
    const payload = { url, language };
    if (pageRange != null) payload["page_range"] = pageRange;
    const body = await this.flashApi.post("/parse/url", payload);
    return body.data["task_id"];
  }
  /** Register a local file with the flash API, upload it, and return the task id. */
  async flashSubmitFile(filePath, language, pageRange) {
    const fileName = basename2(filePath);
    const payload = { file_name: fileName, language };
    if (pageRange != null) payload["page_range"] = pageRange;
    const body = await this.flashApi.post("/parse/file", payload);
    const taskId = body.data["task_id"];
    const fileUrl = body.data["file_url"];
    const data = await readFile(filePath);
    await this.flashApi.putFile(fileUrl, new Uint8Array(data));
    return taskId;
  }
  /**
   * Poll a flash task until it is "done" or "failed".
   * @throws TimeoutError when the deadline (in seconds) has passed.
   */
  async flashWait(taskId, timeout) {
    const deadline = Date.now() + timeout * 1e3;
    let interval = 2e3;
    for (; ; ) {
      const result = await this.flashGetTask(taskId);
      if (result.state === "done" || result.state === "failed") return result;
      if (Date.now() > deadline) throw new TimeoutError(timeout, taskId);
      await sleep(Math.min(interval, Math.max(0, deadline - Date.now())));
      interval = Math.min(interval * 2, 3e4);
    }
  }
  /** Fetch one flash task and convert it into a result record. */
  async flashGetTask(taskId) {
    const body = await this.flashApi.get(`/parse/${taskId}`);
    return this.parseFlashTask(body.data);
  }
  /**
   * Convert a flash task object into a result record. When the task is done
   * and has a `markdown_url`, the markdown text is downloaded into `markdown`.
   */
  async parseFlashTask(data) {
    const result = createEmptyResult(
      data["task_id"] ?? "",
      data["state"] ?? "unknown"
    );
    const errCodeRaw = data["err_code"];
    result.errCode = errCodeRaw == null ? "" : String(errCodeRaw);
    result.error = data["err_msg"] || null;
    const ep = data["extract_progress"];
    if (ep) {
      result.progress = {
        extractedPages: ep["extracted_pages"] ?? 0,
        totalPages: ep["total_pages"] ?? 0,
        startTime: ep["start_time"] ?? ""
      };
    }
    if (result.state === "done" && data["markdown_url"]) {
      result.markdown = await this.flashApi.downloadText(
        data["markdown_url"]
      );
    }
    return result;
  }
};
748
+ export {
749
+ AuthError,
750
+ DEFAULT_BASE_URL,
751
+ DEFAULT_FLASH_BASE_URL,
752
+ ExtractFailedError,
753
+ FileTooLargeError,
754
+ FlashFileTooLargeError,
755
+ FlashPageLimitError,
756
+ FlashParamError,
757
+ FlashUnsupportedTypeError,
758
+ MinerU,
759
+ MinerUError,
760
+ NoAuthClientError,
761
+ PageLimitError,
762
+ ParamError,
763
+ QuotaExceededError,
764
+ TaskNotFoundError,
765
+ TimeoutError,
766
+ progressPercent,
767
+ progressToString,
768
+ saveAll,
769
+ saveDocx,
770
+ saveHtml,
771
+ saveLatex,
772
+ saveMarkdown
773
+ };
774
+ //# sourceMappingURL=index.js.map