@scribe.js/aws-textract 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,105 @@
1
+ # @scribe.js/aws-textract
2
+ Convert AWS Textract output into searchable PDFs. AWS Textract recognition adapter for [scribe.js-ocr](https://www.npmjs.com/package/scribe.js-ocr), an open-source OCR and text-extraction toolkit for browser and Node.js.
3
+
4
+ `scribe.js-ocr` already knows how to parse Textract output into its OCR data model. This package is the thin client that calls AWS Textract and returns that output. It is kept separate so the AWS SDK is only installed by projects that actually use Textract.
5
+
6
+ ## Install
7
+
8
+ ```sh
9
+ npm install scribe.js-ocr @scribe.js/aws-textract
10
+ ```
11
+
12
+ ## Exports
13
+
14
+ | Specifier | Class | Environment |
15
+ | --- | --- | --- |
16
+ | `@scribe.js/aws-textract` | `RecognitionModelTextract` | Node. Imports the AWS SDK as a normal dependency. Supports per-image and async PDF (via S3) recognition. |
17
+ | `@scribe.js/aws-textract/browser` | `RecognitionModelTextractBrowser` | Browser. Imports a pre-bundled AWS SDK, so no bare-specifier resolution is needed. Per-image recognition only. |
18
+
19
+ ## Usage
20
+
21
+ Pick the pattern that matches where your AWS credentials can safely live.
22
+
23
+ ### 1. Node / server-side
24
+
25
+ Credentials stay on the server. Import the named `RecognitionModelTextract` export from the package's main entry point (`@scribe.js/aws-textract`).
26
+
27
+ ```js
28
+ import scribe from 'scribe.js-ocr';
29
+ import { RecognitionModelTextract } from '@scribe.js/aws-textract';
30
+
31
+ await scribe.importFiles(['document.pdf']);
32
+
33
+ await scribe.recognize({
34
+ model: RecognitionModelTextract,
35
+ modelOptions: { analyzeLayout: true },
36
+ });
37
+
38
+ console.log(await scribe.exportData('text'));
39
+ await scribe.terminate();
40
+ ```
41
+
42
+ Credentials and region resolve through the standard AWS SDK chain (`AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` / `AWS_REGION` env vars, `~/.aws/credentials`, or an IAM role), or pass them explicitly in `modelOptions`:
43
+
44
+ ```js
45
+ modelOptions: {
46
+ region: 'us-east-1',
47
+ credentials: { accessKeyId: '...', secretAccessKey: '...' },
48
+ }
49
+ ```
50
+
51
+ Textract rate limits are per region, so passing an array of regions distributes pages round-robin for higher throughput:
52
+
53
+ ```js
54
+ modelOptions: { region: ['us-east-1', 'us-west-2', 'eu-west-1'] }
55
+ ```
56
+
57
+ ### 2. Browser via proxy server (recommended for production)
58
+
59
+ For a public browser app, the safe setup is to keep credentials off the client entirely. The browser sends each document to a backend you control, which runs the Node model from pattern 1 and streams results back. No AWS key ever reaches the client.
60
+
61
+ The scribe.js repository ships a ready-made client and server, which you can copy from its `server-textract-proxy` example. The server holds the credentials and calls `RecognitionModelTextract`, and the browser uses a small proxy model that posts the document to your endpoint. This is the right default for most production sites.
62
+
63
+ ### 3. Browser, direct to AWS (advanced / debugging)
64
+
65
+ The `/browser` export calls AWS straight from the browser with credentials you pass in. Those credentials reach the client, so this pattern is appropriate only for local debugging, trusted internal tools, or short-lived Amazon Cognito credentials (advanced users only). **Never ship long-lived IAM keys to a public site** — use pattern 2 instead.
66
+
67
+ ```js
68
+ import scribe from 'scribe.js-ocr';
69
+ import { RecognitionModelTextractBrowser } from '@scribe.js/aws-textract/browser';
70
+
71
+ await scribe.importFiles(fileList);
72
+
73
+ await scribe.recognize({
74
+ model: RecognitionModelTextractBrowser,
75
+ modelOptions: {
76
+ region: 'us-east-1',
77
+ credentials: { accessKeyId: '...', secretAccessKey: '...' },
78
+ analyzeLayout: true,
79
+ },
80
+ });
81
+
82
+ console.log(await scribe.exportData('text'));
83
+ ```
84
+
85
+ ## Options
86
+
87
+ `modelOptions` accepted by `recognize()`:
88
+
89
+ - `analyzeLayout` (boolean) — enable layout analysis. Increases AWS cost.
90
+ - `analyzeTables` (boolean) — enable table analysis. Implies layout analysis. Significantly increases AWS cost.
91
+ - `region` (string | string[]) — AWS region, or an array for multi-region throughput.
92
+ - `credentials` (`{ accessKeyId, secretAccessKey }`) — explicit credentials.
93
+
94
+ The Node `RecognitionModelTextract` additionally supports async PDF recognition (`recognizeDocument`), which uploads to S3 and polls the async Textract API. See the JSDoc in `RecognitionModelAwsTextract.js` for `s3Bucket`, `pollingInterval`, and related options.
95
+
96
+ ## Rebuilding the browser bundle
97
+
98
+ `aws-textract.esm.bundle.min.js` is a checked-in build artifact: the AWS SDK Textract client bundled for the browser. Regenerate it after bumping the AWS SDK version:
99
+
100
+ ```sh
101
+ npm install
102
+ npm run build
103
+ ```
104
+
105
+ The declared `@aws-sdk/*` dependency versions and the bundled version must be kept in sync. Both should match after a fresh `npm install && npm run build`.
@@ -0,0 +1,575 @@
1
+ /**
2
+ * @typedef {Object} RecognitionResult
3
+ * @property {boolean} success
4
+ * @property {string} [rawData]
5
+ * @property {string} format
6
+ * @property {Error} [error]
7
+ */
8
+
9
+ import {
10
+ TextractClient,
11
+ DetectDocumentTextCommand,
12
+ AnalyzeDocumentCommand,
13
+ StartDocumentTextDetectionCommand,
14
+ StartDocumentAnalysisCommand,
15
+ GetDocumentTextDetectionCommand,
16
+ GetDocumentAnalysisCommand,
17
+ } from '@aws-sdk/client-textract';
18
+ import {
19
+ S3Client,
20
+ PutObjectCommand,
21
+ DeleteObjectCommand,
22
+ } from '@aws-sdk/client-s3';
23
+
24
/**
 * Manages a pool of AWS regions for round-robin request distribution.
 * Each region tracks its own throttle/backoff state independently.
 *
 * Every entry in `entries` has the shape
 * `{ region, credentials, backoffUntil, consecutiveThrottles }`, where
 * `backoffUntil` is an epoch-millisecond timestamp (0 = not in backoff).
 */
class RegionPool {
  /**
   * @param {string[]} regions - Non-empty list of AWS region names, visited in this order.
   * @param {Object} [defaultCredentials] - Credentials object stored on every entry.
   * @param {Object} [backoff] - Exponential backoff tuning.
   * @param {number} [backoff.baseDelayMs=1000] - Delay unit; the n-th consecutive throttle
   *   waits `baseDelayMs * 2^n` milliseconds.
   * @param {number} [backoff.maxDelayMs=32000] - Upper bound for a single backoff window.
   * @throws {TypeError} If `regions` is not a non-empty array.
   */
  constructor(regions, defaultCredentials, { baseDelayMs = 1000, maxDelayMs = 32000 } = {}) {
    // Fail early with a clear message; otherwise an empty pool only surfaces
    // later as an opaque "Reduce of empty array" error inside getNext().
    if (!Array.isArray(regions) || regions.length === 0) {
      throw new TypeError('RegionPool requires a non-empty array of regions');
    }
    this.entries = regions.map((region) => ({
      region,
      credentials: defaultCredentials,
      backoffUntil: 0,
      consecutiveThrottles: 0,
    }));
    this.baseDelayMs = baseDelayMs;
    this.maxDelayMs = maxDelayMs;
    this._index = 0;
  }

  /**
   * Get the next available region, skipping any currently in backoff.
   * When every region is in backoff, returns the entry whose backoff expires
   * soonest without advancing the round-robin cursor; the caller is expected
   * to wait until `entry.backoffUntil` before using it.
   * @returns {{region: string, credentials: (Object|undefined), backoffUntil: number, consecutiveThrottles: number}}
   */
  getNext() {
    const now = Date.now();
    const len = this.entries.length;
    for (let offset = 0; offset < len; offset++) {
      const position = (this._index + offset) % len;
      const entry = this.entries[position];
      if (entry.backoffUntil <= now) {
        // Resume after the entry we hand out so consecutive calls rotate.
        this._index = position + 1;
        return entry;
      }
    }
    // All regions in backoff — return the one expiring soonest.
    return this.entries.reduce((a, b) => (a.backoffUntil < b.backoffUntil ? a : b));
  }

  /**
   * Mark a region as throttled with exponential backoff.
   * @param {Object} entry - An element of `this.entries`.
   */
  markThrottled(entry) {
    entry.consecutiveThrottles++;
    const delayMs = Math.min(this.baseDelayMs * (2 ** entry.consecutiveThrottles), this.maxDelayMs);
    entry.backoffUntil = Date.now() + delayMs;
  }

  /**
   * Mark a region as successful (reset throttle state).
   * @param {Object} entry - An element of `this.entries`.
   */
  markSuccess(entry) {
    entry.consecutiveThrottles = 0;
    entry.backoffUntil = 0;
  }

  /**
   * Returns true when every region is currently in a backoff window.
   * @returns {boolean}
   */
  allInBackoff() {
    const now = Date.now();
    return this.entries.every((entry) => entry.backoffUntil > now);
  }
}
77
+
78
/**
 * AWS Textract recognition model for use with Scribe.js.
 *
 * All members are static. `recognizeImage()` sends one image to the synchronous
 * Textract API (optionally spreading requests over several regions), while
 * `recognizeDocument()` uploads a PDF to S3 and drives the asynchronous API.
 * Every public entry point reports failures through a result object
 * (`success: false`) rather than by throwing.
 */
export class RecognitionModelTextract {
  static config = {
    name: 'AWS Textract',
    outputFormat: 'textract',
    rateLimit: { tps: 1 },
  };

  /**
   * Whether an AWS SDK error indicates that the request was throttled.
   * @param {*} error - Value caught from an SDK call (may be null/undefined).
   * @returns {boolean}
   */
  static isThrottlingError(error) {
    return error?.$metadata?.httpStatusCode === 429
      || error?.name === 'ThrottlingException'
      || error?.name === 'ProvisionedThroughputExceededException'
      || error?.name === 'LimitExceededException';
  }

  /** @type {RegionPool|null} */
  static _regionPool = null;

  /**
   * Lazily creates or reuses a RegionPool from options.region when it is an
   * array with more than one element; otherwise clears the pool.
   * A pool is reused (keeping its backoff state) while the region list is
   * unchanged, but its credentials are refreshed on every call so that a
   * later request never goes out with credentials from an earlier one.
   * @param {Object} options
   */
  static _ensureRegionPool(options) {
    const regions = options.region;
    if (!Array.isArray(regions) || regions.length <= 1) {
      this._regionPool = null;
      return;
    }

    const pool = this._regionPool;
    const sameRegions = pool !== null
      && pool.entries.length === regions.length
      && pool.entries.every((entry, i) => entry.region === regions[i]);

    if (sameRegions) {
      for (const entry of pool.entries) {
        entry.credentials = options.credentials;
      }
      return;
    }
    this._regionPool = new RegionPool(regions, options.credentials);
  }

  /**
   * Reduce a `region` option to the single region string handed to an SDK
   * client. Arrays contribute their first element; anything that is not a
   * non-empty string yields `undefined`, leaving resolution to the SDK.
   * @param {string|string[]|undefined} region
   * @returns {string|undefined}
   */
  static _resolveSingleRegion(region) {
    const candidate = Array.isArray(region) ? region[0] : region;
    return (typeof candidate === 'string' && candidate) || undefined;
  }

  /**
   * Build the failure result returned when `signal` has been aborted.
   * @param {AbortSignal} signal
   * @returns {RecognitionResult}
   */
  static _abortedResult(signal) {
    const error = signal.reason instanceof Error ? signal.reason : new Error('Aborted');
    return { success: false, error, format: 'textract' };
  }

  /**
   * Resolve after `ms` milliseconds, or as soon as `signal` aborts.
   * Never rejects; callers re-check `signal.aborted` afterwards.
   * @param {number} ms
   * @param {AbortSignal} [signal]
   * @returns {Promise<void>}
   */
  static _sleep(ms, signal) {
    return new Promise((resolve) => {
      if (signal?.aborted) { resolve(); return; }
      const onAbort = () => { clearTimeout(timer); resolve(); };
      const timer = setTimeout(() => {
        // Drop the listener so long-lived signals do not accumulate handlers.
        signal?.removeEventListener('abort', onAbort);
        resolve();
      }, ms);
      signal?.addEventListener('abort', onAbort, { once: true });
    });
  }

  /**
   * Translate the analysis flags into Textract `FeatureTypes`.
   * @param {boolean} analyzeLayout
   * @param {boolean} analyzeTables
   * @returns {string[]} Empty when plain text detection is sufficient.
   */
  static _featureTypes(analyzeLayout, analyzeTables) {
    const featureTypes = [];
    if (analyzeLayout) featureTypes.push('LAYOUT');
    if (analyzeTables) featureTypes.push('TABLES');
    return featureTypes;
  }

  /**
   * Build the synchronous Textract command for one image: AnalyzeDocument when
   * any analysis feature is requested, DetectDocumentText otherwise.
   * @param {Uint8Array} data
   * @param {boolean} analyzeLayout
   * @param {boolean} analyzeTables
   */
  static _buildSyncCommand(data, analyzeLayout, analyzeTables) {
    const FeatureTypes = this._featureTypes(analyzeLayout, analyzeTables);
    if (FeatureTypes.length > 0) {
      return new AnalyzeDocumentCommand({ Document: { Bytes: data }, FeatureTypes });
    }
    return new DetectDocumentTextCommand({ Document: { Bytes: data } });
  }

  /**
   * Dispatch a single image recognition request across the region pool.
   * Makes at most one attempt per pooled region: a throttled attempt puts that
   * region into backoff and moves on to the next one, unless every region is
   * then in backoff, in which case the throttling error is returned.
   * Non-throttling errors are returned immediately.
   * @param {Uint8Array} data
   * @param {Object} options
   * @param {boolean} analyzeLayout
   * @param {boolean} analyzeTables
   * @returns {Promise<RecognitionResult>}
   */
  static async _recognizeWithPool(data, options, analyzeLayout, analyzeTables) {
    const pool = this._regionPool;
    const maxAttempts = pool.entries.length;
    const { signal } = options;
    let lastError;

    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      if (signal?.aborted) return this._abortedResult(signal);

      const entry = pool.getNext();

      // If this region is in backoff, wait for it — but wake early on abort.
      const waitMs = entry.backoffUntil - Date.now();
      if (waitMs > 0) {
        await this._sleep(waitMs, signal);
        if (signal?.aborted) return this._abortedResult(signal);
      }

      try {
        const textractClient = new TextractClient({
          region: entry.region,
          ...(entry.credentials && { credentials: entry.credentials }),
        });
        const command = this._buildSyncCommand(data, analyzeLayout, analyzeTables);
        const response = await textractClient.send(command, signal ? { abortSignal: signal } : undefined);
        pool.markSuccess(entry);
        return { success: true, rawData: JSON.stringify(response), format: 'textract' };
      } catch (error) {
        if (!this.isThrottlingError(error)) {
          return { success: false, error, format: 'textract' };
        }
        pool.markThrottled(entry);
        if (pool.allInBackoff()) {
          return { success: false, error, format: 'textract' };
        }
        lastError = error;
      }
    }

    // Keep the last throttling error reachable for diagnostics.
    const exhausted = new Error('All regions exhausted', lastError ? { cause: lastError } : undefined);
    return { success: false, error: exhausted, format: 'textract' };
  }

  /**
   * Recognize text from an image using AWS Textract.
   *
   * Region is resolved in this order:
   * 1. `options.region` — explicit region string, or an array of regions
   * 2. Standard AWS SDK resolution: `AWS_REGION` env var → `AWS_DEFAULT_REGION` → `~/.aws/config`
   *
   * Credentials are resolved in this order:
   * 1. `options.credentials` — explicit `{ accessKeyId, secretAccessKey }` object
   * 2. Standard AWS SDK credential chain:
   *    - `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` env vars
   *    - `~/.aws/credentials` file (with optional `AWS_PROFILE`)
   *    - IAM role (when running on EC2/ECS/Lambda)
   *
   * @param {Uint8Array|ArrayBuffer} imageData - Image data
   * @param {Object} [options]
   * @param {boolean} [options.analyzeLayout=false] - Request the `LAYOUT` feature.
   *    Note that enabling layout analysis increases AWS costs.
   * @param {boolean} [options.analyzeTables=false] - Request the `TABLES` feature.
   *    Note that enabling table analysis significantly increases AWS costs.
   *    NOTE(review): earlier documentation said this also enables layout analysis, but
   *    `LAYOUT` is only requested when `analyzeLayout` is set — confirm the intended behaviour.
   * @param {string|string[]} [options.region] - AWS region (e.g. 'us-east-1') or array of regions
   *    for multi-region throughput scaling (e.g. ['us-east-1', 'us-west-2']).
   *    With two or more regions, requests are distributed round-robin with per-region
   *    throttle backoff. A one-element array behaves like the plain string.
   *    If not provided, the SDK resolves the region itself.
   * @param {{accessKeyId: string, secretAccessKey: string}} [options.credentials] - AWS credentials.
   *    If not provided, the standard AWS SDK credential chain is used.
   * @param {AbortSignal} [options.signal] - Optional abort signal. When fired, the in-flight
   *    AWS SDK call is cancelled via `client.send(cmd, { abortSignal })`.
   * @returns {Promise<RecognitionResult>} Never rejects for SDK failures; they are reported
   *    as `{ success: false, error }`.
   */
  static async recognizeImage(imageData, options = {}) {
    const data = imageData instanceof ArrayBuffer ? new Uint8Array(imageData) : imageData;
    const analyzeLayout = options.analyzeLayout ?? false;
    const analyzeTables = options.analyzeTables ?? false;
    const { signal } = options;

    if (signal?.aborted) return this._abortedResult(signal);

    // Multi-region path: distribute across regions with per-region backoff.
    this._ensureRegionPool(options);
    if (this._regionPool) {
      return this._recognizeWithPool(data, options, analyzeLayout, analyzeTables);
    }

    // Single-region path. A one-element array is unwrapped here; previously it
    // was discarded and the SDK default region was used instead.
    const region = this._resolveSingleRegion(options.region);
    const credentials = options.credentials || undefined;

    try {
      const textractClient = new TextractClient({
        ...(region && { region }),
        ...(credentials && { credentials }),
      });
      const command = this._buildSyncCommand(data, analyzeLayout, analyzeTables);
      const response = await textractClient.send(command, signal ? { abortSignal: signal } : undefined);
      return { success: true, rawData: JSON.stringify(response), format: 'textract' };
    } catch (error) {
      return { success: false, error, format: 'textract' };
    }
  }

  /**
   * Recognize text from a PDF document using AWS Textract's asynchronous API.
   *
   * Credentials are resolved the same way as `recognizeImage()`. Only one region
   * is used: if `options.region` is an array, its first element is taken.
   *
   * @param {Uint8Array|ArrayBuffer} documentData - PDF data
   * @param {Object} [options]
   * @param {boolean} [options.analyzeLayout=false] - Whether to enable layout analysis.
   * @param {boolean} [options.analyzeTables=false] - Whether to enable table analysis.
   * @param {string} options.s3Bucket - S3 bucket name (required)
   * @param {string} [options.s3Key] - S3 object key for the upload (auto-generated if not provided)
   * @param {boolean} [options.keepS3File=false] - Whether to keep the uploaded S3 file after processing
   * @param {number} [options.pollingInterval=5000] - Polling interval in milliseconds
   * @param {number} [options.maxWaitTime=1800000] - Maximum wait time in milliseconds (default: 30 minutes)
   * @param {string|string[]} [options.region] - AWS region.
   * @param {{accessKeyId: string, secretAccessKey: string}} [options.credentials] - AWS credentials.
   * @param {function} [options.progressCallback] - Optional callback for progress reporting.
   * @returns {Promise<RecognitionResult>} On failure, `error` is an `Error` whose `code`
   *    property carries the error code reported by `recognizePdfAsync()` when there is one.
   */
  static async recognizeDocument(documentData, options = {}) {
    const data = documentData instanceof ArrayBuffer ? new Uint8Array(documentData) : documentData;

    const result = await this.recognizePdfAsync(data, {
      analyzeLayout: options.analyzeLayout ?? false,
      analyzeTables: options.analyzeTables ?? false,
      s3Bucket: options.s3Bucket,
      s3Key: options.s3Key,
      keepS3File: options.keepS3File ?? false,
      pollingInterval: options.pollingInterval ?? 5000,
      maxWaitTime: options.maxWaitTime ?? 1800000,
      region: options.region,
      credentials: options.credentials,
      progressCallback: options.progressCallback,
    });

    if (result.success) {
      const combined = this.combineTextractResponses(result.data);
      return { success: true, rawData: JSON.stringify(combined), format: 'textract' };
    }

    // Preserve the machine-readable code and the underlying error, both of
    // which were previously lost when the message was re-wrapped.
    const error = new Error(result.error, result.cause ? { cause: result.cause } : undefined);
    if (result.errorCode) error.code = result.errorCode;
    return { success: false, error, format: 'textract' };
  }

  /**
   * Report whether this model can be used. Always resolves to `{ available: true }`;
   * no credentials or connectivity check is performed.
   * @returns {Promise<{available: boolean}>}
   */
  static async checkAvailability() {
    return { available: true };
  }

  /**
   * Asynchronous PDF recognition: uploads the PDF to S3, starts a Textract job,
   * polls it to completion and (unless `keepS3File`) deletes the upload again.
   * @param {Uint8Array} pdfData
   * @param {Object} options
   * @param {boolean} [options.analyzeLayout]
   * @param {boolean} [options.analyzeTables]
   * @param {string} options.s3Bucket
   * @param {string} [options.s3Key]
   * @param {boolean} [options.keepS3File]
   * @param {number} [options.pollingInterval]
   * @param {number} [options.maxWaitTime]
   * @param {string|string[]} [options.region] - AWS region; the first element is used for an array.
   * @param {{accessKeyId: string, secretAccessKey: string}} [options.credentials] - AWS credentials.
   * @param {function} [options.progressCallback] - Optional callback for progress reporting.
   * @returns {Promise<{success: boolean, data?: Object[], error?: string, errorCode?: string, cause?: Error}>}
   *    `data` holds the raw paginated Get* responses on success.
   */
  static recognizePdfAsync = async (pdfData, {
    analyzeLayout = false,
    analyzeTables = false,
    s3Bucket,
    s3Key,
    keepS3File = false,
    pollingInterval = 5000,
    maxWaitTime = 1800000,
    region,
    credentials,
    progressCallback,
  } = {}) => {
    if (!s3Bucket) {
      // Without a bucket the upload can only fail with an obscure SDK
      // validation error, so report the real problem up front.
      return {
        success: false,
        error: '`s3Bucket` is required for asynchronous PDF recognition',
        errorCode: 'MissingS3Bucket',
      };
    }

    const singleRegion = this._resolveSingleRegion(region);
    const clientConfig = {
      ...(singleRegion && { region: singleRegion }),
      ...(credentials && { credentials }),
    };
    const textractClient = new TextractClient({ ...clientConfig });
    const s3Client = new S3Client({ ...clientConfig });

    const finalS3Key = s3Key || `textract-temp/${Date.now()}-${Math.random().toString(36).slice(2, 11)}.pdf`;

    try {
      if (progressCallback) progressCallback({ status: 'uploading' });
      await s3Client.send(new PutObjectCommand({
        Bucket: s3Bucket,
        Key: finalS3Key,
        Body: pdfData,
        ContentType: 'application/pdf',
      }));

      const DocumentLocation = { S3Object: { Bucket: s3Bucket, Name: finalS3Key } };
      const FeatureTypes = this._featureTypes(analyzeLayout, analyzeTables);
      const isAnalysis = FeatureTypes.length > 0;
      const startCommand = isAnalysis
        ? new StartDocumentAnalysisCommand({ DocumentLocation, FeatureTypes })
        : new StartDocumentTextDetectionCommand({ DocumentLocation });

      if (progressCallback) progressCallback({ status: 'starting' });
      const startResponse = await textractClient.send(startCommand);
      const jobId = startResponse.JobId;
      if (!jobId) {
        throw new Error('Textract did not return a JobId');
      }

      return await this.pollForCompletion(
        textractClient,
        jobId,
        isAnalysis,
        pollingInterval,
        maxWaitTime,
        progressCallback,
      );
    } catch (error) {
      return {
        success: false,
        error: error.message,
        errorCode: error.name,
        cause: error,
      };
    } finally {
      if (!keepS3File) {
        // Best-effort cleanup: a failure here must not mask the job result.
        try {
          if (progressCallback) progressCallback({ status: 'cleanup' });
          await s3Client.send(new DeleteObjectCommand({
            Bucket: s3Bucket,
            Key: finalS3Key,
          }));
        } catch (cleanupError) {
          console.warn(`Failed to clean up S3 file: ${cleanupError.message}`);
        }
      }
    }
  };

  /**
   * Poll for job completion and collect every page of results.
   *
   * `SUCCEEDED` and `PARTIAL_SUCCESS` are both treated as terminal states with
   * retrievable results. Any other non-`FAILED` status waits `pollingInterval`
   * before the next request, so an unexpected status can never turn into a
   * tight request loop. Throttling errors are retried after the same delay.
   * All waiting is bounded by `maxWaitTime`.
   *
   * @param {TextractClient} textractClient
   * @param {string} jobId
   * @param {boolean} isAnalysis - true to read a document-analysis job, false for text detection.
   * @param {number} pollingInterval - Delay between polls in milliseconds.
   * @param {number} maxWaitTime - Overall deadline in milliseconds.
   * @param {function} [progressCallback] - Optional callback for progress reporting.
   * @returns {Promise<{success: boolean, data?: Object[], error?: string, errorCode?: string}>}
   * @throws Re-throws SDK errors other than `InvalidJobIdException` and throttling.
   */
  static pollForCompletion = async (textractClient, jobId, isAnalysis, pollingInterval, maxWaitTime, progressCallback) => {
    const startTime = Date.now();
    const GetCommand = isAnalysis ? GetDocumentAnalysisCommand : GetDocumentTextDetectionCommand;
    const allResponses = [];
    let nextToken = null;

    while (Date.now() - startTime < maxWaitTime) {
      let response;
      try {
        response = await textractClient.send(new GetCommand({
          JobId: jobId,
          // Only send NextToken once we actually have one.
          ...(nextToken && { NextToken: nextToken }),
        }));
      } catch (error) {
        if (error.name === 'InvalidJobIdException') {
          return {
            success: false,
            error: 'Invalid job ID or job expired',
            errorCode: 'InvalidJobId',
          };
        }
        if (this.isThrottlingError(error)) {
          // Transient: the job itself is unaffected, so back off and retry.
          await this._sleep(pollingInterval);
          continue;
        }
        throw error;
      }

      const status = response.JobStatus;

      if (status === 'FAILED') {
        return {
          success: false,
          error: response.StatusMessage || 'Textract job failed',
          errorCode: 'TextractJobFailed',
        };
      }

      if (status === 'SUCCEEDED' || status === 'PARTIAL_SUCCESS') {
        allResponses.push(response);
        if (progressCallback) progressCallback({ status: 'retrieving', responsesReceived: allResponses.length });

        if (!response.NextToken) {
          return { success: true, data: allResponses };
        }
        // More result pages are available; fetch the next one right away.
        nextToken = response.NextToken;
        continue;
      }

      // IN_PROGRESS (or any status this code does not know): wait, then poll again.
      if (progressCallback) progressCallback({ status: 'polling', elapsedMs: Date.now() - startTime });
      await this._sleep(pollingInterval);
    }

    return {
      success: false,
      error: `Job timed out after ${maxWaitTime / 1000} seconds`,
      errorCode: 'Timeout',
    };
  };

  /**
   * Combines multiple Textract responses into a single response object.
   *
   * Two input shapes are supported:
   * - Paginated responses of one asynchronous job (they carry `JobStatus` and/or
   *   `NextToken`): blocks are concatenated and their `Page` values kept as-is.
   * - Per-page synchronous responses, one per page image (more than one response,
   *   none with `JobStatus`/`NextToken`, every block with `Page` 1 or no `Page`):
   *   blocks are renumbered so that response *i* becomes page *i + 1*, and
   *   `DocumentMetadata.Pages` is set to the number of responses.
   *
   * The input responses and their blocks are not modified.
   *
   * @param {Array<Object>} responses - Response objects from the Get*, Detect* or Analyze* commands.
   * @returns {Object} A single, combined response object (`{}` for empty input). Other
   *    top-level fields are taken from the first response, minus `NextToken`.
   */
  static combineTextractResponses = (responses) => {
    if (!responses || responses.length === 0) {
      return {};
    }

    // Pages of an async job must never be renumbered: a dense single-page
    // document can span several result pages whose blocks all have Page === 1.
    const isPerPage = responses.length > 1 && responses.every((response) => {
      if (response.JobStatus != null || response.NextToken != null) return false;
      return !response.Blocks || response.Blocks.every((block) => !block.Page || block.Page === 1);
    });

    const combined = {
      Blocks: [],
      Warnings: [],
    };
    let documentMetadata = null;

    responses.forEach((response, index) => {
      if (response.Blocks) {
        for (const block of response.Blocks) {
          // Copy when renumbering so the caller's objects stay untouched.
          combined.Blocks.push(isPerPage ? { ...block, Page: index + 1 } : block);
        }
      }
      if (response.Warnings) {
        for (const warning of response.Warnings) {
          combined.Warnings.push(warning);
        }
      }
      if (response.DocumentMetadata && !documentMetadata) {
        documentMetadata = response.DocumentMetadata;
      }
    });

    if (documentMetadata) {
      combined.DocumentMetadata = { ...documentMetadata };
      if (isPerPage) {
        combined.DocumentMetadata.Pages = responses.length;
      }
    }

    // Carry over other relevant top-level fields from the first response, except for pagination tokens.
    const template = { ...responses[0] };
    delete template.Blocks;
    delete template.Warnings;
    delete template.NextToken;

    return { ...template, ...combined };
  };
}