@claritylabs/cl-sdk 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -369,10 +369,11 @@ async function withRetry(fn, log) {
369
369
 
370
370
  // src/core/concurrency.ts
371
371
  function pLimit(concurrency) {
372
+ const maxConcurrency = Number.isFinite(concurrency) ? Math.max(1, Math.floor(concurrency)) : 1;
372
373
  let active = 0;
373
374
  const queue = [];
374
375
  function next() {
375
- if (queue.length > 0 && active < concurrency) {
376
+ if (queue.length > 0 && active < maxConcurrency) {
376
377
  active++;
377
378
  queue.shift()();
378
379
  }
@@ -2663,6 +2664,35 @@ async function extractPageRange(input, startPage, endPage) {
2663
2664
  const bytes = await newDoc.save();
2664
2665
  return bytesToBase64(new Uint8Array(bytes));
2665
2666
  }
2667
+ async function createPdfPageSlicer(input) {
2668
+ if (isFileIdRef(input)) {
2669
+ throw new Error(
2670
+ "Cannot create a page slicer from a fileId reference. Pass the full PDF as base64/bytes, or provide pre-rendered page assets."
2671
+ );
2672
+ }
2673
+ const srcBytes = await pdfInputToBytes(input);
2674
+ const srcDoc = await import_pdf_lib.PDFDocument.load(srcBytes, { ignoreEncryption: true });
2675
+ const totalPages = srcDoc.getPageCount();
2676
+ const originalBase64 = isBytes(input) ? bytesToBase64(input) : typeof input === "string" ? input : bytesToBase64(srcBytes);
2677
+ return {
2678
+ getPageCount() {
2679
+ return totalPages;
2680
+ },
2681
+ async extractPageRange(startPage, endPage) {
2682
+ const start = Math.max(startPage - 1, 0);
2683
+ const end = Math.min(endPage, totalPages) - 1;
2684
+ if (start === 0 && end >= totalPages - 1) {
2685
+ return originalBase64;
2686
+ }
2687
+ const newDoc = await import_pdf_lib.PDFDocument.create();
2688
+ const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
2689
+ const pages = await newDoc.copyPages(srcDoc, indices);
2690
+ pages.forEach((page) => newDoc.addPage(page));
2691
+ const bytes = await newDoc.save();
2692
+ return bytesToBase64(new Uint8Array(bytes));
2693
+ }
2694
+ };
2695
+ }
2666
2696
  async function buildPdfProviderOptions(input, existingOptions) {
2667
2697
  const options = { ...existingOptions };
2668
2698
  if (isFileIdRef(input)) {
@@ -2810,20 +2840,19 @@ async function runExtractor(params) {
2810
2840
  } = params;
2811
2841
  const extractorProviderOptions = { ...providerOptions };
2812
2842
  let fullPrompt;
2813
- const pdfBase64 = await pdfInputToBase64(pdfInput);
2843
+ const needsPdfBase64 = convertPdfToImages && !params.getPageImages || !convertPdfToImages && !params.getPageRangePdf;
2844
+ const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
2814
2845
  if (convertPdfToImages) {
2815
- const images = await convertPdfToImages(pdfBase64, startPage, endPage);
2846
+ const images = params.getPageImages ? await params.getPageImages(startPage, endPage) : await convertPdfToImages(pdfBase64, startPage, endPage);
2816
2847
  extractorProviderOptions.images = images;
2817
2848
  fullPrompt = `${prompt}
2818
2849
 
2819
2850
  [Document pages ${startPage}-${endPage} are provided as images.]`;
2820
2851
  } else {
2821
2852
  const cacheKey = `${startPage}-${endPage}`;
2822
- let pagesPdf = pageRangeCache?.get(cacheKey);
2823
- if (!pagesPdf) {
2824
- pagesPdf = await extractPageRange(pdfBase64, startPage, endPage);
2825
- pageRangeCache?.set(cacheKey, pagesPdf);
2826
- }
2853
+ const cachedPagesPdf = pageRangeCache?.get(cacheKey);
2854
+ const pagesPdf = cachedPagesPdf ?? (params.getPageRangePdf ? await params.getPageRangePdf(startPage, endPage) : await extractPageRange(pdfBase64, startPage, endPage));
2855
+ if (!cachedPagesPdf) pageRangeCache?.set(cacheKey, pagesPdf);
2827
2856
  extractorProviderOptions.pdfBase64 = pagesPdf;
2828
2857
  fullPrompt = `${prompt}
2829
2858
 
@@ -3809,33 +3838,40 @@ async function formatDocumentContent(doc, generateText, options) {
3809
3838
  for (let i = 0; i < entries.length; i += MAX_ENTRIES_PER_BATCH) {
3810
3839
  batches.push(entries.slice(i, i + MAX_ENTRIES_PER_BATCH));
3811
3840
  }
3812
- for (let batchIdx = 0; batchIdx < batches.length; batchIdx++) {
3813
- const batch = batches[batchIdx];
3814
- try {
3815
- const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text })));
3816
- const result = await withRetry(
3817
- () => generateText({
3818
- prompt,
3819
- maxTokens: options?.maxTokens ?? 16384,
3820
- providerOptions: options?.providerOptions
3821
- })
3822
- );
3823
- if (result.usage) {
3824
- totalUsage.inputTokens += result.usage.inputTokens;
3825
- totalUsage.outputTokens += result.usage.outputTokens;
3826
- }
3827
- const formatted = parseFormatResponse(result.text);
3828
- if (formatted.size < batch.length) {
3841
+ const limit = pLimit(options?.concurrency ?? 2);
3842
+ const batchResults = await Promise.all(batches.map(
3843
+ (batch, batchIdx) => limit(async () => {
3844
+ try {
3845
+ const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text })));
3846
+ const result = await withRetry(
3847
+ () => generateText({
3848
+ prompt,
3849
+ maxTokens: options?.maxTokens ?? 16384,
3850
+ providerOptions: options?.providerOptions
3851
+ })
3852
+ );
3853
+ const formatted = parseFormatResponse(result.text);
3854
+ if (formatted.size < batch.length) {
3855
+ await options?.log?.(
3856
+ `Format batch ${batchIdx + 1}/${batches.length}: model returned ${formatted.size}/${batch.length} entries \u2014 unformatted entries will keep original content`
3857
+ );
3858
+ }
3859
+ return { batch, formatted, usage: result.usage };
3860
+ } catch (error) {
3829
3861
  await options?.log?.(
3830
- `Format batch ${batchIdx + 1}/${batches.length}: model returned ${formatted.size}/${batch.length} entries \u2014 unformatted entries will keep original content`
3862
+ `Format batch ${batchIdx + 1}/${batches.length} failed, keeping original content: ${error instanceof Error ? error.message : String(error)}`
3831
3863
  );
3864
+ return void 0;
3832
3865
  }
3833
- applyFormattedContent(doc, batch, formatted);
3834
- } catch (error) {
3835
- await options?.log?.(
3836
- `Format batch ${batchIdx + 1}/${batches.length} failed, keeping original content: ${error instanceof Error ? error.message : String(error)}`
3837
- );
3866
+ })
3867
+ ));
3868
+ for (const result of batchResults) {
3869
+ if (!result) continue;
3870
+ if (result.usage) {
3871
+ totalUsage.inputTokens += result.usage.inputTokens;
3872
+ totalUsage.outputTokens += result.usage.outputTokens;
3838
3873
  }
3874
+ applyFormattedContent(doc, result.batch, result.formatted);
3839
3875
  }
3840
3876
  return { document: doc, usage: totalUsage };
3841
3877
  }
@@ -7106,6 +7142,8 @@ async function resolveReferentialCoverages(params) {
7106
7142
  pageCount,
7107
7143
  generateObject,
7108
7144
  convertPdfToImages,
7145
+ getPageRangePdf,
7146
+ getPageImages,
7109
7147
  concurrency = 2,
7110
7148
  providerOptions,
7111
7149
  modelCapabilities,
@@ -7215,6 +7253,8 @@ async function resolveReferentialCoverages(params) {
7215
7253
  endPage: pageRange.endPage,
7216
7254
  generateObject,
7217
7255
  convertPdfToImages,
7256
+ getPageRangePdf,
7257
+ getPageImages,
7218
7258
  maxTokens: budget.maxTokens,
7219
7259
  providerOptions
7220
7260
  });
@@ -7306,6 +7346,8 @@ async function runFocusedExtractorWithFallback(params) {
7306
7346
  convertPdfToImages,
7307
7347
  providerOptions,
7308
7348
  pageRangeCache,
7349
+ getPageRangePdf,
7350
+ getPageImages,
7309
7351
  trackUsage,
7310
7352
  resolveBudget,
7311
7353
  log
@@ -7319,6 +7361,7 @@ async function runFocusedExtractorWithFallback(params) {
7319
7361
  const hintTokens = ext.maxTokens ?? 4096;
7320
7362
  const taskKind = hintTokens >= 8192 ? "extraction_long_list" : "extraction_focused";
7321
7363
  const budget = resolveBudget(taskKind, hintTokens);
7364
+ const startedAt = Date.now();
7322
7365
  const result = await runExtractor({
7323
7366
  name: task.extractorName,
7324
7367
  prompt: ext.buildPrompt(),
@@ -7330,12 +7373,15 @@ async function runFocusedExtractorWithFallback(params) {
7330
7373
  convertPdfToImages,
7331
7374
  maxTokens: budget.maxTokens,
7332
7375
  providerOptions,
7333
- pageRangeCache
7376
+ pageRangeCache,
7377
+ getPageRangePdf,
7378
+ getPageImages
7334
7379
  });
7335
7380
  trackUsage(result.usage, {
7336
7381
  taskKind,
7337
7382
  label: task.extractorName,
7338
- maxTokens: budget.maxTokens
7383
+ maxTokens: budget.maxTokens,
7384
+ durationMs: Date.now() - startedAt
7339
7385
  });
7340
7386
  if (!ext.fallback?.isEmpty(result.data)) {
7341
7387
  return result;
@@ -7358,6 +7404,7 @@ async function runFocusedExtractorWithFallback(params) {
7358
7404
  const hintTokens = fallbackExt.maxTokens ?? 4096;
7359
7405
  const taskKind = hintTokens >= 8192 ? "extraction_long_list" : "extraction_focused";
7360
7406
  const budget = resolveBudget(taskKind, hintTokens);
7407
+ const startedAt = Date.now();
7361
7408
  const fallbackResult = await runExtractor({
7362
7409
  name: ext.fallback.extractorName,
7363
7410
  prompt: fallbackExt.buildPrompt(),
@@ -7369,12 +7416,15 @@ async function runFocusedExtractorWithFallback(params) {
7369
7416
  convertPdfToImages,
7370
7417
  maxTokens: budget.maxTokens,
7371
7418
  providerOptions,
7372
- pageRangeCache
7419
+ pageRangeCache,
7420
+ getPageRangePdf,
7421
+ getPageImages
7373
7422
  });
7374
7423
  trackUsage(fallbackResult.usage, {
7375
7424
  taskKind,
7376
7425
  label: ext.fallback.extractorName,
7377
- maxTokens: budget.maxTokens
7426
+ maxTokens: budget.maxTokens,
7427
+ durationMs: Date.now() - startedAt
7378
7428
  });
7379
7429
  const focusedData = ext.fallback.deriveFocusedResult(fallbackResult.data);
7380
7430
  return focusedData ? [
@@ -8035,7 +8085,11 @@ function createExtractor(config) {
8035
8085
  generateObject,
8036
8086
  convertPdfToImages,
8037
8087
  concurrency = 2,
8088
+ pageMapConcurrency,
8089
+ extractorConcurrency,
8090
+ formatConcurrency,
8038
8091
  maxReviewRounds = 2,
8092
+ reviewMode = "auto",
8039
8093
  onTokenUsage,
8040
8094
  onProgress,
8041
8095
  log,
@@ -8046,7 +8100,8 @@ function createExtractor(config) {
8046
8100
  modelBudgetConstraints,
8047
8101
  onCheckpointSave
8048
8102
  } = config;
8049
- const limit = pLimit(concurrency);
8103
+ const pageMapLimit = pLimit(pageMapConcurrency ?? concurrency);
8104
+ const extractorLimit = pLimit(extractorConcurrency ?? concurrency);
8050
8105
  const extractorCatalog = formatExtractorCatalogForPrompt();
8051
8106
  let totalUsage = { inputTokens: 0, outputTokens: 0 };
8052
8107
  let modelCalls = 0;
@@ -8081,7 +8136,7 @@ function createExtractor(config) {
8081
8136
  usage,
8082
8137
  usageReported: !!usage
8083
8138
  });
8084
- if (report.durationMs) {
8139
+ if (report.durationMs != null) {
8085
8140
  performanceReport.totalModelCallDurationMs += report.durationMs;
8086
8141
  }
8087
8142
  }
@@ -8138,6 +8193,46 @@ function createExtractor(config) {
8138
8193
  return textIncludesSupplementarySignal(JSON.stringify(value));
8139
8194
  });
8140
8195
  }
8196
+ function getSupplementaryPageRanges(pageAssignments, formInventory) {
8197
+ const pages = /* @__PURE__ */ new Set();
8198
+ for (const assignment of pageAssignments) {
8199
+ if (assignment.pageRole === "supplementary" || assignment.extractorNames.includes("supplementary") || textIncludesSupplementarySignal(assignment.notes)) {
8200
+ pages.add(assignment.localPageNumber);
8201
+ }
8202
+ }
8203
+ for (const form of formInventory?.forms ?? []) {
8204
+ if (form.formType === "notice" || textIncludesSupplementarySignal(form.title) || textIncludesSupplementarySignal(form.formNumber)) {
8205
+ const startPage2 = form.pageStart;
8206
+ const endPage = form.pageEnd ?? form.pageStart;
8207
+ if (typeof startPage2 !== "number" || typeof endPage !== "number") continue;
8208
+ for (let page = startPage2; page <= endPage; page += 1) {
8209
+ pages.add(page);
8210
+ }
8211
+ }
8212
+ }
8213
+ const sortedPages = [...pages].sort((a, b) => a - b);
8214
+ if (sortedPages.length === 0) return [];
8215
+ const ranges = [];
8216
+ let startPage = sortedPages[0];
8217
+ let previousPage = sortedPages[0];
8218
+ for (const page of sortedPages.slice(1)) {
8219
+ if (page === previousPage + 1) {
8220
+ previousPage = page;
8221
+ continue;
8222
+ }
8223
+ ranges.push({ startPage, endPage: previousPage });
8224
+ startPage = page;
8225
+ previousPage = page;
8226
+ }
8227
+ ranges.push({ startPage, endPage: previousPage });
8228
+ return ranges;
8229
+ }
8230
+ function shouldRunLlmReview(mode, report, sourceSpansAvailable) {
8231
+ if (mode === "skip" || maxReviewRounds <= 0) return false;
8232
+ if (mode === "always") return true;
8233
+ if (!sourceSpansAvailable) return true;
8234
+ return report.qualityGateStatus !== "passed" || report.issues.length > 0;
8235
+ }
8141
8236
  function buildAlreadyExtractedSummary(memory) {
8142
8237
  const lines = [];
8143
8238
  const declarationResult = readMemoryRecord(memory, "declarations");
@@ -8167,10 +8262,11 @@ function createExtractor(config) {
8167
8262
  }
8168
8263
  return lines.length > 0 ? lines.join("\n") : "";
8169
8264
  }
8170
- async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache) {
8265
+ async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages) {
8171
8266
  if (task.extractorName === "supplementary") {
8172
8267
  const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
8173
8268
  const budget = resolveBudget("extraction_focused", 4096);
8269
+ const startedAt = Date.now();
8174
8270
  const result = await runExtractor({
8175
8271
  name: "supplementary",
8176
8272
  prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
@@ -8182,12 +8278,15 @@ function createExtractor(config) {
8182
8278
  convertPdfToImages,
8183
8279
  maxTokens: budget.maxTokens,
8184
8280
  providerOptions: activeProviderOptions,
8185
- pageRangeCache
8281
+ pageRangeCache,
8282
+ getPageRangePdf,
8283
+ getPageImages
8186
8284
  });
8187
8285
  trackUsage(result.usage, {
8188
8286
  taskKind: "extraction_focused",
8189
8287
  label: "supplementary",
8190
- maxTokens: budget.maxTokens
8288
+ maxTokens: budget.maxTokens,
8289
+ durationMs: Date.now() - startedAt
8191
8290
  });
8192
8291
  return result;
8193
8292
  }
@@ -8198,6 +8297,8 @@ function createExtractor(config) {
8198
8297
  convertPdfToImages,
8199
8298
  providerOptions: activeProviderOptions,
8200
8299
  pageRangeCache,
8300
+ getPageRangePdf,
8301
+ getPageImages,
8201
8302
  trackUsage,
8202
8303
  resolveBudget,
8203
8304
  log
@@ -8246,20 +8347,68 @@ function createExtractor(config) {
8246
8347
  }
8247
8348
  }
8248
8349
  let pdfBase64Cache;
8350
+ const completedPageRangePdfCache = /* @__PURE__ */ new Map();
8249
8351
  const pageRangePdfCache = /* @__PURE__ */ new Map();
8352
+ const pageRangeImageCache = /* @__PURE__ */ new Map();
8353
+ let pdfSlicerPromise;
8354
+ let fullPdfProviderOptionsPromise;
8355
+ let pageCountPromise;
8250
8356
  async function getPdfBase64ForExtraction() {
8251
8357
  if (pdfBase64Cache === void 0) {
8252
8358
  pdfBase64Cache = await pdfInputToBase64(pdfInput);
8253
8359
  }
8254
8360
  return pdfBase64Cache;
8255
8361
  }
8362
+ async function getCachedPageCount() {
8363
+ if (!pageCountPromise) {
8364
+ pageCountPromise = getPdfSlicer().then((slicer) => slicer.getPageCount()).catch(() => getPdfPageCount(pdfInput));
8365
+ }
8366
+ return pageCountPromise;
8367
+ }
8368
+ async function getFullPdfProviderOptions() {
8369
+ if (!fullPdfProviderOptionsPromise) {
8370
+ fullPdfProviderOptionsPromise = buildPdfProviderOptions(pdfInput, activeProviderOptions);
8371
+ }
8372
+ return fullPdfProviderOptionsPromise;
8373
+ }
8374
+ async function getPdfSlicer() {
8375
+ if (!pdfSlicerPromise) {
8376
+ pdfSlicerPromise = createPdfPageSlicer(pdfInput);
8377
+ }
8378
+ return pdfSlicerPromise;
8379
+ }
8256
8380
  async function getPageRangePdf(startPage, endPage) {
8257
8381
  const cacheKey = `${startPage}-${endPage}`;
8258
- const cached = pageRangePdfCache.get(cacheKey);
8382
+ const cached = completedPageRangePdfCache.get(cacheKey);
8383
+ if (cached) return cached;
8384
+ const pending = pageRangePdfCache.get(cacheKey);
8385
+ if (pending) return pending;
8386
+ const promise = (async () => {
8387
+ const slicer = await getPdfSlicer();
8388
+ const pagesPdf = await slicer.extractPageRange(startPage, endPage);
8389
+ completedPageRangePdfCache.set(cacheKey, pagesPdf);
8390
+ return pagesPdf;
8391
+ })().catch((error) => {
8392
+ pageRangePdfCache.delete(cacheKey);
8393
+ throw error;
8394
+ });
8395
+ pageRangePdfCache.set(cacheKey, promise);
8396
+ return promise;
8397
+ }
8398
+ async function getPageImages(startPage, endPage) {
8399
+ if (!convertPdfToImages) return [];
8400
+ const cacheKey = `${startPage}-${endPage}`;
8401
+ const cached = pageRangeImageCache.get(cacheKey);
8259
8402
  if (cached) return cached;
8260
- const pagesPdf = await extractPageRange(await getPdfBase64ForExtraction(), startPage, endPage);
8261
- pageRangePdfCache.set(cacheKey, pagesPdf);
8262
- return pagesPdf;
8403
+ const promise = (async () => {
8404
+ const pdfBase64 = await getPdfBase64ForExtraction();
8405
+ return convertPdfToImages(pdfBase64, startPage, endPage);
8406
+ })().catch((error) => {
8407
+ pageRangeImageCache.delete(cacheKey);
8408
+ throw error;
8409
+ });
8410
+ pageRangeImageCache.set(cacheKey, promise);
8411
+ return promise;
8263
8412
  }
8264
8413
  let classifyResult;
8265
8414
  if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
@@ -8267,15 +8416,16 @@ function createExtractor(config) {
8267
8416
  onProgress?.("Resuming from checkpoint (classify complete)...");
8268
8417
  } else {
8269
8418
  onProgress?.("Classifying document...");
8270
- const pageCount2 = await getPdfPageCount(pdfInput);
8419
+ const pageCount2 = await getCachedPageCount();
8271
8420
  const budget = resolveBudget("extraction_classify", 512);
8421
+ const startedAt = Date.now();
8272
8422
  const classifyResponse = await safeGenerateObject(
8273
8423
  generateObject,
8274
8424
  {
8275
8425
  prompt: buildClassifyPrompt(),
8276
8426
  schema: ClassifyResultSchema,
8277
8427
  maxTokens: budget.maxTokens,
8278
- providerOptions: await buildPdfProviderOptions(pdfInput, activeProviderOptions)
8428
+ providerOptions: await getFullPdfProviderOptions()
8279
8429
  },
8280
8430
  {
8281
8431
  fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
@@ -8287,7 +8437,8 @@ function createExtractor(config) {
8287
8437
  trackUsage(classifyResponse.usage, {
8288
8438
  taskKind: "extraction_classify",
8289
8439
  label: "classify",
8290
- maxTokens: budget.maxTokens
8440
+ maxTokens: budget.maxTokens,
8441
+ durationMs: Date.now() - startedAt
8291
8442
  });
8292
8443
  classifyResult = classifyResponse.object;
8293
8444
  if (classifyResult.confidence === 0) {
@@ -8305,7 +8456,7 @@ function createExtractor(config) {
8305
8456
  const policyTypes = classifyResult.policyTypes ?? [];
8306
8457
  const primaryType = policyTypes[0] ?? "other";
8307
8458
  const template = getTemplate(primaryType);
8308
- const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfInput);
8459
+ const pageCount = resumed?.pageCount ?? await getCachedPageCount();
8309
8460
  const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
8310
8461
  let formInventory;
8311
8462
  if (resumed?.formInventory && pipelineCtx.isPhaseComplete("form_inventory")) {
@@ -8315,13 +8466,14 @@ function createExtractor(config) {
8315
8466
  } else {
8316
8467
  onProgress?.(`Building form inventory for ${primaryType} ${documentType}...`);
8317
8468
  const budget = resolveBudget("extraction_form_inventory", 2048);
8469
+ const startedAt = Date.now();
8318
8470
  const formInventoryResponse = await safeGenerateObject(
8319
8471
  generateObject,
8320
8472
  {
8321
8473
  prompt: buildFormInventoryPrompt(templateHints),
8322
8474
  schema: FormInventorySchema,
8323
8475
  maxTokens: budget.maxTokens,
8324
- providerOptions: await buildPdfProviderOptions(pdfInput, activeProviderOptions)
8476
+ providerOptions: await getFullPdfProviderOptions()
8325
8477
  },
8326
8478
  {
8327
8479
  fallback: { forms: [] },
@@ -8332,7 +8484,8 @@ function createExtractor(config) {
8332
8484
  trackUsage(formInventoryResponse.usage, {
8333
8485
  taskKind: "extraction_form_inventory",
8334
8486
  label: "form_inventory",
8335
- maxTokens: budget.maxTokens
8487
+ maxTokens: budget.maxTokens,
8488
+ durationMs: Date.now() - startedAt
8336
8489
  });
8337
8490
  formInventory = formInventoryResponse.object;
8338
8491
  memory.set("form_inventory", formInventory);
@@ -8362,9 +8515,10 @@ function createExtractor(config) {
8362
8515
  );
8363
8516
  const pageMapResults = await Promise.all(
8364
8517
  pageMapChunks.map(
8365
- ({ startPage, endPage }) => limit(async () => {
8518
+ ({ startPage, endPage }) => pageMapLimit(async () => {
8366
8519
  const pagesPdf = await getPageRangePdf(startPage, endPage);
8367
8520
  const budget = resolveBudget("extraction_page_map", 2048);
8521
+ const startedAt = Date.now();
8368
8522
  const mapResponse = await safeGenerateObject(
8369
8523
  generateObject,
8370
8524
  {
@@ -8389,7 +8543,8 @@ function createExtractor(config) {
8389
8543
  trackUsage(mapResponse.usage, {
8390
8544
  taskKind: "extraction_page_map",
8391
8545
  label: `page_map:${startPage}-${endPage}`,
8392
- maxTokens: budget.maxTokens
8546
+ maxTokens: budget.maxTokens,
8547
+ durationMs: Date.now() - startedAt
8393
8548
  });
8394
8549
  return mapResponse.object.pages.map((assignment) => ({
8395
8550
  ...assignment,
@@ -8435,14 +8590,32 @@ function createExtractor(config) {
8435
8590
  });
8436
8591
  }
8437
8592
  if (!pipelineCtx.isPhaseComplete("extract")) {
8438
- const tasks = plan.tasks;
8593
+ const supplementaryRanges = getSupplementaryPageRanges(pageAssignments, formInventory);
8594
+ const baseTasks = plan.tasks;
8595
+ const hasPlannedSupplementary = baseTasks.some((task) => task.extractorName === "supplementary");
8596
+ const tasks = hasPlannedSupplementary || supplementaryRanges.length === 0 ? baseTasks : [
8597
+ ...baseTasks,
8598
+ ...supplementaryRanges.map((range) => ({
8599
+ extractorName: "supplementary",
8600
+ startPage: range.startPage,
8601
+ endPage: range.endPage,
8602
+ description: `Page-signaled supplementary extraction for pages ${range.startPage}-${range.endPage}`
8603
+ }))
8604
+ ];
8439
8605
  onProgress?.(`Dispatching ${tasks.length} extractors...`);
8440
8606
  const extractionPdfInput = await getPdfBase64ForExtraction();
8441
8607
  const extractorResults = await Promise.all(
8442
8608
  tasks.map(
8443
- (task) => limit(async () => {
8609
+ (task) => extractorLimit(async () => {
8444
8610
  onProgress?.(`Extracting ${task.extractorName} (pages ${task.startPage}-${task.endPage})...`);
8445
- return runFocusedExtractorTask(task, extractionPdfInput, memory, pageRangePdfCache);
8611
+ return runFocusedExtractorTask(
8612
+ task,
8613
+ extractionPdfInput,
8614
+ memory,
8615
+ completedPageRangePdfCache,
8616
+ getPageRangePdf,
8617
+ convertPdfToImages ? getPageImages : void 0
8618
+ );
8446
8619
  })
8447
8620
  )
8448
8621
  );
@@ -8457,6 +8630,7 @@ function createExtractor(config) {
8457
8630
  try {
8458
8631
  const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
8459
8632
  const budget = resolveBudget("extraction_focused", 4096);
8633
+ const startedAt = Date.now();
8460
8634
  const supplementaryResult = await runExtractor({
8461
8635
  name: "supplementary",
8462
8636
  prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
@@ -8468,12 +8642,15 @@ function createExtractor(config) {
8468
8642
  convertPdfToImages,
8469
8643
  maxTokens: budget.maxTokens,
8470
8644
  providerOptions: activeProviderOptions,
8471
- pageRangeCache: pageRangePdfCache
8645
+ pageRangeCache: completedPageRangePdfCache,
8646
+ getPageRangePdf,
8647
+ getPageImages: convertPdfToImages ? getPageImages : void 0
8472
8648
  });
8473
8649
  trackUsage(supplementaryResult.usage, {
8474
8650
  taskKind: "extraction_focused",
8475
8651
  label: "supplementary",
8476
- maxTokens: budget.maxTokens
8652
+ maxTokens: budget.maxTokens,
8653
+ durationMs: Date.now() - startedAt
8477
8654
  });
8478
8655
  mergeMemoryResult(supplementaryResult.name, supplementaryResult.data, memory);
8479
8656
  } catch (error) {
@@ -8493,6 +8670,7 @@ function createExtractor(config) {
8493
8670
  if (!pipelineCtx.isPhaseComplete("resolve_referential")) {
8494
8671
  onProgress?.("Resolving referential coverage limits...");
8495
8672
  try {
8673
+ const startedAt = Date.now();
8496
8674
  const resolution = await resolveReferentialCoverages({
8497
8675
  memory,
8498
8676
  pdfInput,
@@ -8500,6 +8678,8 @@ function createExtractor(config) {
8500
8678
  generateObject,
8501
8679
  convertPdfToImages,
8502
8680
  concurrency,
8681
+ getPageRangePdf,
8682
+ getPageImages: convertPdfToImages ? getPageImages : void 0,
8503
8683
  providerOptions: activeProviderOptions,
8504
8684
  modelCapabilities,
8505
8685
  modelBudgetConstraints,
@@ -8508,7 +8688,8 @@ function createExtractor(config) {
8508
8688
  });
8509
8689
  trackUsage(resolution.usage, {
8510
8690
  taskKind: "extraction_referential_lookup",
8511
- label: "referential_resolution"
8691
+ label: "referential_resolution",
8692
+ durationMs: Date.now() - startedAt
8512
8693
  });
8513
8694
  if (resolution.attempts > 0) {
8514
8695
  await log?.(`Referential resolution: ${resolution.resolved}/${resolution.attempts} resolved, ${resolution.unresolved} unresolved`);
@@ -8530,52 +8711,72 @@ function createExtractor(config) {
8530
8711
  let reviewReport = resumed?.reviewReport;
8531
8712
  if (!pipelineCtx.isPhaseComplete("review")) {
8532
8713
  reviewRounds = [];
8533
- for (let round = 0; round < maxReviewRounds; round++) {
8534
- const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
8535
- const extractionSummary = summarizeExtraction(memory);
8536
- const pageMapSummary = formatPageMapSummary(pageAssignments);
8537
- const budget = resolveBudget("extraction_review", 1536);
8538
- const reviewResponse = await safeGenerateObject(
8539
- generateObject,
8540
- {
8541
- prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
8542
- schema: ReviewResultSchema,
8714
+ groundExtractionMemoryWithSourceSpans(memory, sourceSpans);
8715
+ const preReviewReport = buildExtractionReviewReport({
8716
+ memory,
8717
+ pageAssignments,
8718
+ reviewRounds,
8719
+ sourceSpansAvailable: sourceSpans.length > 0
8720
+ });
8721
+ if (shouldRunLlmReview(reviewMode, preReviewReport, sourceSpans.length > 0)) {
8722
+ for (let round = 0; round < maxReviewRounds; round++) {
8723
+ const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
8724
+ const extractionSummary = summarizeExtraction(memory);
8725
+ const pageMapSummary = formatPageMapSummary(pageAssignments);
8726
+ const budget = resolveBudget("extraction_review", 1536);
8727
+ const startedAt = Date.now();
8728
+ const reviewResponse = await safeGenerateObject(
8729
+ generateObject,
8730
+ {
8731
+ prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
8732
+ schema: ReviewResultSchema,
8733
+ maxTokens: budget.maxTokens,
8734
+ providerOptions: await getFullPdfProviderOptions()
8735
+ },
8736
+ {
8737
+ fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
8738
+ log,
8739
+ onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
8740
+ }
8741
+ );
8742
+ trackUsage(reviewResponse.usage, {
8743
+ taskKind: "extraction_review",
8744
+ label: `review:${round + 1}`,
8543
8745
  maxTokens: budget.maxTokens,
8544
- providerOptions: await buildPdfProviderOptions(pdfInput, activeProviderOptions)
8545
- },
8546
- {
8547
- fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
8548
- log,
8549
- onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
8746
+ durationMs: Date.now() - startedAt
8747
+ });
8748
+ reviewRounds.push(toReviewRoundRecord(round + 1, reviewResponse.object));
8749
+ if (reviewResponse.object.qualityIssues?.length) {
8750
+ await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
8550
8751
  }
8551
- );
8552
- trackUsage(reviewResponse.usage, {
8553
- taskKind: "extraction_review",
8554
- label: `review:${round + 1}`,
8555
- maxTokens: budget.maxTokens
8556
- });
8557
- reviewRounds.push(toReviewRoundRecord(round + 1, reviewResponse.object));
8558
- if (reviewResponse.object.qualityIssues?.length) {
8559
- await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
8560
- }
8561
- if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
8562
- onProgress?.("Extraction complete.");
8563
- break;
8564
- }
8565
- onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
8566
- const extractionPdfInput = await getPdfBase64ForExtraction();
8567
- const followUpResults = await Promise.all(
8568
- reviewResponse.object.additionalTasks.map(
8569
- (task) => limit(async () => {
8570
- return runFocusedExtractorTask(task, extractionPdfInput, memory, pageRangePdfCache);
8571
- })
8572
- )
8573
- );
8574
- for (const result of followUpResults.flatMap((item) => Array.isArray(item) ? item : item ? [item] : [])) {
8575
- if (result) {
8576
- mergeMemoryResult(result.name, result.data, memory);
8752
+ if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
8753
+ onProgress?.("Extraction complete.");
8754
+ break;
8755
+ }
8756
+ onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
8757
+ const extractionPdfInput = await getPdfBase64ForExtraction();
8758
+ const followUpResults = await Promise.all(
8759
+ reviewResponse.object.additionalTasks.map(
8760
+ (task) => extractorLimit(async () => {
8761
+ return runFocusedExtractorTask(
8762
+ task,
8763
+ extractionPdfInput,
8764
+ memory,
8765
+ completedPageRangePdfCache,
8766
+ getPageRangePdf,
8767
+ convertPdfToImages ? getPageImages : void 0
8768
+ );
8769
+ })
8770
+ )
8771
+ );
8772
+ for (const result of followUpResults.flatMap((item) => Array.isArray(item) ? item : item ? [item] : [])) {
8773
+ if (result) {
8774
+ mergeMemoryResult(result.name, result.data, memory);
8775
+ }
8577
8776
  }
8578
8777
  }
8778
+ } else {
8779
+ onProgress?.("Skipping LLM extraction review; deterministic checks passed.");
8579
8780
  }
8580
8781
  groundExtractionMemoryWithSourceSpans(memory, sourceSpans);
8581
8782
  reviewReport = buildExtractionReviewReport({
@@ -8627,6 +8828,7 @@ function createExtractor(config) {
8627
8828
  onProgress?.("Generating document summary...");
8628
8829
  try {
8629
8830
  const budget = resolveBudget("extraction_summary", 512);
8831
+ const startedAt = Date.now();
8630
8832
  const summaryResponse = await safeGenerateObject(
8631
8833
  generateObject,
8632
8834
  {
@@ -8644,7 +8846,8 @@ function createExtractor(config) {
8644
8846
  trackUsage(summaryResponse.usage, {
8645
8847
  taskKind: "extraction_summary",
8646
8848
  label: "summary",
8647
- maxTokens: budget.maxTokens
8849
+ maxTokens: budget.maxTokens,
8850
+ durationMs: Date.now() - startedAt
8648
8851
  });
8649
8852
  if (summaryResponse.object.summary) {
8650
8853
  document.summary = summaryResponse.object.summary;
@@ -8655,16 +8858,19 @@ function createExtractor(config) {
8655
8858
  }
8656
8859
  onProgress?.("Formatting extracted content...");
8657
8860
  const formatBudget = resolveBudget("extraction_format", 16384);
8861
+ const formatStartedAt = Date.now();
8658
8862
  const formatResult = await formatDocumentContent(document, generateText, {
8659
8863
  providerOptions: activeProviderOptions,
8660
8864
  maxTokens: formatBudget.maxTokens,
8865
+ concurrency: formatConcurrency ?? concurrency,
8661
8866
  onProgress,
8662
8867
  log
8663
8868
  });
8664
8869
  trackUsage(formatResult.usage, {
8665
8870
  taskKind: "extraction_format",
8666
8871
  label: "format",
8667
- maxTokens: formatBudget.maxTokens
8872
+ maxTokens: formatBudget.maxTokens,
8873
+ durationMs: Date.now() - formatStartedAt
8668
8874
  });
8669
8875
  const chunks = chunkDocument(formatResult.document);
8670
8876
  const finalCheckpoint = pipelineCtx.getCheckpoint();