@claritylabs/cl-sdk 0.18.0 → 1.0.2

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -38,10 +38,11 @@ async function withRetry(fn, log) {
38
38
 
39
39
  // src/core/concurrency.ts
40
40
  function pLimit(concurrency) {
41
+ const maxConcurrency = Number.isFinite(concurrency) ? Math.max(1, Math.floor(concurrency)) : 1;
41
42
  let active = 0;
42
43
  const queue = [];
43
44
  function next() {
44
- if (queue.length > 0 && active < concurrency) {
45
+ if (queue.length > 0 && active < maxConcurrency) {
45
46
  active++;
46
47
  queue.shift()();
47
48
  }
@@ -2340,6 +2341,35 @@ async function extractPageRange(input, startPage, endPage) {
2340
2341
  const bytes = await newDoc.save();
2341
2342
  return bytesToBase64(new Uint8Array(bytes));
2342
2343
  }
2344
+ async function createPdfPageSlicer(input) {
2345
+ if (isFileIdRef(input)) {
2346
+ throw new Error(
2347
+ "Cannot create a page slicer from a fileId reference. Pass the full PDF as base64/bytes, or provide pre-rendered page assets."
2348
+ );
2349
+ }
2350
+ const srcBytes = await pdfInputToBytes(input);
2351
+ const srcDoc = await PDFDocument.load(srcBytes, { ignoreEncryption: true });
2352
+ const totalPages = srcDoc.getPageCount();
2353
+ const originalBase64 = isBytes(input) ? bytesToBase64(input) : typeof input === "string" ? input : bytesToBase64(srcBytes);
2354
+ return {
2355
+ getPageCount() {
2356
+ return totalPages;
2357
+ },
2358
+ async extractPageRange(startPage, endPage) {
2359
+ const start = Math.max(startPage - 1, 0);
2360
+ const end = Math.min(endPage, totalPages) - 1;
2361
+ if (start === 0 && end >= totalPages - 1) {
2362
+ return originalBase64;
2363
+ }
2364
+ const newDoc = await PDFDocument.create();
2365
+ const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
2366
+ const pages = await newDoc.copyPages(srcDoc, indices);
2367
+ pages.forEach((page) => newDoc.addPage(page));
2368
+ const bytes = await newDoc.save();
2369
+ return bytesToBase64(new Uint8Array(bytes));
2370
+ }
2371
+ };
2372
+ }
2343
2373
  async function buildPdfProviderOptions(input, existingOptions) {
2344
2374
  const options = { ...existingOptions };
2345
2375
  if (isFileIdRef(input)) {
@@ -2487,20 +2517,19 @@ async function runExtractor(params) {
2487
2517
  } = params;
2488
2518
  const extractorProviderOptions = { ...providerOptions };
2489
2519
  let fullPrompt;
2490
- const pdfBase64 = await pdfInputToBase64(pdfInput);
2520
+ const needsPdfBase64 = convertPdfToImages && !params.getPageImages || !convertPdfToImages && !params.getPageRangePdf;
2521
+ const pdfBase64 = needsPdfBase64 ? await pdfInputToBase64(pdfInput) : void 0;
2491
2522
  if (convertPdfToImages) {
2492
- const images = await convertPdfToImages(pdfBase64, startPage, endPage);
2523
+ const images = params.getPageImages ? await params.getPageImages(startPage, endPage) : await convertPdfToImages(pdfBase64, startPage, endPage);
2493
2524
  extractorProviderOptions.images = images;
2494
2525
  fullPrompt = `${prompt}
2495
2526
 
2496
2527
  [Document pages ${startPage}-${endPage} are provided as images.]`;
2497
2528
  } else {
2498
2529
  const cacheKey = `${startPage}-${endPage}`;
2499
- let pagesPdf = pageRangeCache?.get(cacheKey);
2500
- if (!pagesPdf) {
2501
- pagesPdf = await extractPageRange(pdfBase64, startPage, endPage);
2502
- pageRangeCache?.set(cacheKey, pagesPdf);
2503
- }
2530
+ const cachedPagesPdf = pageRangeCache?.get(cacheKey);
2531
+ const pagesPdf = cachedPagesPdf ?? (params.getPageRangePdf ? await params.getPageRangePdf(startPage, endPage) : await extractPageRange(pdfBase64, startPage, endPage));
2532
+ if (!cachedPagesPdf) pageRangeCache?.set(cacheKey, pagesPdf);
2504
2533
  extractorProviderOptions.pdfBase64 = pagesPdf;
2505
2534
  fullPrompt = `${prompt}
2506
2535
 
@@ -3486,33 +3515,40 @@ async function formatDocumentContent(doc, generateText, options) {
3486
3515
  for (let i = 0; i < entries.length; i += MAX_ENTRIES_PER_BATCH) {
3487
3516
  batches.push(entries.slice(i, i + MAX_ENTRIES_PER_BATCH));
3488
3517
  }
3489
- for (let batchIdx = 0; batchIdx < batches.length; batchIdx++) {
3490
- const batch = batches[batchIdx];
3491
- try {
3492
- const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text })));
3493
- const result = await withRetry(
3494
- () => generateText({
3495
- prompt,
3496
- maxTokens: options?.maxTokens ?? 16384,
3497
- providerOptions: options?.providerOptions
3498
- })
3499
- );
3500
- if (result.usage) {
3501
- totalUsage.inputTokens += result.usage.inputTokens;
3502
- totalUsage.outputTokens += result.usage.outputTokens;
3503
- }
3504
- const formatted = parseFormatResponse(result.text);
3505
- if (formatted.size < batch.length) {
3518
+ const limit = pLimit(options?.concurrency ?? 2);
3519
+ const batchResults = await Promise.all(batches.map(
3520
+ (batch, batchIdx) => limit(async () => {
3521
+ try {
3522
+ const prompt = buildFormatPrompt(batch.map((e) => ({ id: e.id, text: e.text })));
3523
+ const result = await withRetry(
3524
+ () => generateText({
3525
+ prompt,
3526
+ maxTokens: options?.maxTokens ?? 16384,
3527
+ providerOptions: options?.providerOptions
3528
+ })
3529
+ );
3530
+ const formatted = parseFormatResponse(result.text);
3531
+ if (formatted.size < batch.length) {
3532
+ await options?.log?.(
3533
+ `Format batch ${batchIdx + 1}/${batches.length}: model returned ${formatted.size}/${batch.length} entries \u2014 unformatted entries will keep original content`
3534
+ );
3535
+ }
3536
+ return { batch, formatted, usage: result.usage };
3537
+ } catch (error) {
3506
3538
  await options?.log?.(
3507
- `Format batch ${batchIdx + 1}/${batches.length}: model returned ${formatted.size}/${batch.length} entries \u2014 unformatted entries will keep original content`
3539
+ `Format batch ${batchIdx + 1}/${batches.length} failed, keeping original content: ${error instanceof Error ? error.message : String(error)}`
3508
3540
  );
3541
+ return void 0;
3509
3542
  }
3510
- applyFormattedContent(doc, batch, formatted);
3511
- } catch (error) {
3512
- await options?.log?.(
3513
- `Format batch ${batchIdx + 1}/${batches.length} failed, keeping original content: ${error instanceof Error ? error.message : String(error)}`
3514
- );
3543
+ })
3544
+ ));
3545
+ for (const result of batchResults) {
3546
+ if (!result) continue;
3547
+ if (result.usage) {
3548
+ totalUsage.inputTokens += result.usage.inputTokens;
3549
+ totalUsage.outputTokens += result.usage.outputTokens;
3515
3550
  }
3551
+ applyFormattedContent(doc, result.batch, result.formatted);
3516
3552
  }
3517
3553
  return { document: doc, usage: totalUsage };
3518
3554
  }
@@ -6783,6 +6819,8 @@ async function resolveReferentialCoverages(params) {
6783
6819
  pageCount,
6784
6820
  generateObject,
6785
6821
  convertPdfToImages,
6822
+ getPageRangePdf,
6823
+ getPageImages,
6786
6824
  concurrency = 2,
6787
6825
  providerOptions,
6788
6826
  modelCapabilities,
@@ -6892,6 +6930,8 @@ async function resolveReferentialCoverages(params) {
6892
6930
  endPage: pageRange.endPage,
6893
6931
  generateObject,
6894
6932
  convertPdfToImages,
6933
+ getPageRangePdf,
6934
+ getPageImages,
6895
6935
  maxTokens: budget.maxTokens,
6896
6936
  providerOptions
6897
6937
  });
@@ -6983,6 +7023,8 @@ async function runFocusedExtractorWithFallback(params) {
6983
7023
  convertPdfToImages,
6984
7024
  providerOptions,
6985
7025
  pageRangeCache,
7026
+ getPageRangePdf,
7027
+ getPageImages,
6986
7028
  trackUsage,
6987
7029
  resolveBudget,
6988
7030
  log
@@ -6996,6 +7038,7 @@ async function runFocusedExtractorWithFallback(params) {
6996
7038
  const hintTokens = ext.maxTokens ?? 4096;
6997
7039
  const taskKind = hintTokens >= 8192 ? "extraction_long_list" : "extraction_focused";
6998
7040
  const budget = resolveBudget(taskKind, hintTokens);
7041
+ const startedAt = Date.now();
6999
7042
  const result = await runExtractor({
7000
7043
  name: task.extractorName,
7001
7044
  prompt: ext.buildPrompt(),
@@ -7007,12 +7050,15 @@ async function runFocusedExtractorWithFallback(params) {
7007
7050
  convertPdfToImages,
7008
7051
  maxTokens: budget.maxTokens,
7009
7052
  providerOptions,
7010
- pageRangeCache
7053
+ pageRangeCache,
7054
+ getPageRangePdf,
7055
+ getPageImages
7011
7056
  });
7012
7057
  trackUsage(result.usage, {
7013
7058
  taskKind,
7014
7059
  label: task.extractorName,
7015
- maxTokens: budget.maxTokens
7060
+ maxTokens: budget.maxTokens,
7061
+ durationMs: Date.now() - startedAt
7016
7062
  });
7017
7063
  if (!ext.fallback?.isEmpty(result.data)) {
7018
7064
  return result;
@@ -7035,6 +7081,7 @@ async function runFocusedExtractorWithFallback(params) {
7035
7081
  const hintTokens = fallbackExt.maxTokens ?? 4096;
7036
7082
  const taskKind = hintTokens >= 8192 ? "extraction_long_list" : "extraction_focused";
7037
7083
  const budget = resolveBudget(taskKind, hintTokens);
7084
+ const startedAt = Date.now();
7038
7085
  const fallbackResult = await runExtractor({
7039
7086
  name: ext.fallback.extractorName,
7040
7087
  prompt: fallbackExt.buildPrompt(),
@@ -7046,12 +7093,15 @@ async function runFocusedExtractorWithFallback(params) {
7046
7093
  convertPdfToImages,
7047
7094
  maxTokens: budget.maxTokens,
7048
7095
  providerOptions,
7049
- pageRangeCache
7096
+ pageRangeCache,
7097
+ getPageRangePdf,
7098
+ getPageImages
7050
7099
  });
7051
7100
  trackUsage(fallbackResult.usage, {
7052
7101
  taskKind,
7053
7102
  label: ext.fallback.extractorName,
7054
- maxTokens: budget.maxTokens
7103
+ maxTokens: budget.maxTokens,
7104
+ durationMs: Date.now() - startedAt
7055
7105
  });
7056
7106
  const focusedData = ext.fallback.deriveFocusedResult(fallbackResult.data);
7057
7107
  return focusedData ? [
@@ -7712,7 +7762,11 @@ function createExtractor(config) {
7712
7762
  generateObject,
7713
7763
  convertPdfToImages,
7714
7764
  concurrency = 2,
7765
+ pageMapConcurrency,
7766
+ extractorConcurrency,
7767
+ formatConcurrency,
7715
7768
  maxReviewRounds = 2,
7769
+ reviewMode = "auto",
7716
7770
  onTokenUsage,
7717
7771
  onProgress,
7718
7772
  log,
@@ -7723,7 +7777,8 @@ function createExtractor(config) {
7723
7777
  modelBudgetConstraints,
7724
7778
  onCheckpointSave
7725
7779
  } = config;
7726
- const limit = pLimit(concurrency);
7780
+ const pageMapLimit = pLimit(pageMapConcurrency ?? concurrency);
7781
+ const extractorLimit = pLimit(extractorConcurrency ?? concurrency);
7727
7782
  const extractorCatalog = formatExtractorCatalogForPrompt();
7728
7783
  let totalUsage = { inputTokens: 0, outputTokens: 0 };
7729
7784
  let modelCalls = 0;
@@ -7758,7 +7813,7 @@ function createExtractor(config) {
7758
7813
  usage,
7759
7814
  usageReported: !!usage
7760
7815
  });
7761
- if (report.durationMs) {
7816
+ if (report.durationMs != null) {
7762
7817
  performanceReport.totalModelCallDurationMs += report.durationMs;
7763
7818
  }
7764
7819
  }
@@ -7815,6 +7870,46 @@ function createExtractor(config) {
7815
7870
  return textIncludesSupplementarySignal(JSON.stringify(value));
7816
7871
  });
7817
7872
  }
7873
+ function getSupplementaryPageRanges(pageAssignments, formInventory) {
7874
+ const pages = /* @__PURE__ */ new Set();
7875
+ for (const assignment of pageAssignments) {
7876
+ if (assignment.pageRole === "supplementary" || assignment.extractorNames.includes("supplementary") || textIncludesSupplementarySignal(assignment.notes)) {
7877
+ pages.add(assignment.localPageNumber);
7878
+ }
7879
+ }
7880
+ for (const form of formInventory?.forms ?? []) {
7881
+ if (form.formType === "notice" || textIncludesSupplementarySignal(form.title) || textIncludesSupplementarySignal(form.formNumber)) {
7882
+ const startPage2 = form.pageStart;
7883
+ const endPage = form.pageEnd ?? form.pageStart;
7884
+ if (typeof startPage2 !== "number" || typeof endPage !== "number") continue;
7885
+ for (let page = startPage2; page <= endPage; page += 1) {
7886
+ pages.add(page);
7887
+ }
7888
+ }
7889
+ }
7890
+ const sortedPages = [...pages].sort((a, b) => a - b);
7891
+ if (sortedPages.length === 0) return [];
7892
+ const ranges = [];
7893
+ let startPage = sortedPages[0];
7894
+ let previousPage = sortedPages[0];
7895
+ for (const page of sortedPages.slice(1)) {
7896
+ if (page === previousPage + 1) {
7897
+ previousPage = page;
7898
+ continue;
7899
+ }
7900
+ ranges.push({ startPage, endPage: previousPage });
7901
+ startPage = page;
7902
+ previousPage = page;
7903
+ }
7904
+ ranges.push({ startPage, endPage: previousPage });
7905
+ return ranges;
7906
+ }
7907
+ function shouldRunLlmReview(mode, report, sourceSpansAvailable) {
7908
+ if (mode === "skip" || maxReviewRounds <= 0) return false;
7909
+ if (mode === "always") return true;
7910
+ if (!sourceSpansAvailable) return true;
7911
+ return report.qualityGateStatus !== "passed" || report.issues.length > 0;
7912
+ }
7818
7913
  function buildAlreadyExtractedSummary(memory) {
7819
7914
  const lines = [];
7820
7915
  const declarationResult = readMemoryRecord(memory, "declarations");
@@ -7844,10 +7939,11 @@ function createExtractor(config) {
7844
7939
  }
7845
7940
  return lines.length > 0 ? lines.join("\n") : "";
7846
7941
  }
7847
- async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache) {
7942
+ async function runFocusedExtractorTask(task, pdfInput, memory, pageRangeCache, getPageRangePdf, getPageImages) {
7848
7943
  if (task.extractorName === "supplementary") {
7849
7944
  const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
7850
7945
  const budget = resolveBudget("extraction_focused", 4096);
7946
+ const startedAt = Date.now();
7851
7947
  const result = await runExtractor({
7852
7948
  name: "supplementary",
7853
7949
  prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
@@ -7859,12 +7955,15 @@ function createExtractor(config) {
7859
7955
  convertPdfToImages,
7860
7956
  maxTokens: budget.maxTokens,
7861
7957
  providerOptions: activeProviderOptions,
7862
- pageRangeCache
7958
+ pageRangeCache,
7959
+ getPageRangePdf,
7960
+ getPageImages
7863
7961
  });
7864
7962
  trackUsage(result.usage, {
7865
7963
  taskKind: "extraction_focused",
7866
7964
  label: "supplementary",
7867
- maxTokens: budget.maxTokens
7965
+ maxTokens: budget.maxTokens,
7966
+ durationMs: Date.now() - startedAt
7868
7967
  });
7869
7968
  return result;
7870
7969
  }
@@ -7875,6 +7974,8 @@ function createExtractor(config) {
7875
7974
  convertPdfToImages,
7876
7975
  providerOptions: activeProviderOptions,
7877
7976
  pageRangeCache,
7977
+ getPageRangePdf,
7978
+ getPageImages,
7878
7979
  trackUsage,
7879
7980
  resolveBudget,
7880
7981
  log
@@ -7923,20 +8024,68 @@ function createExtractor(config) {
7923
8024
  }
7924
8025
  }
7925
8026
  let pdfBase64Cache;
8027
+ const completedPageRangePdfCache = /* @__PURE__ */ new Map();
7926
8028
  const pageRangePdfCache = /* @__PURE__ */ new Map();
8029
+ const pageRangeImageCache = /* @__PURE__ */ new Map();
8030
+ let pdfSlicerPromise;
8031
+ let fullPdfProviderOptionsPromise;
8032
+ let pageCountPromise;
7927
8033
  async function getPdfBase64ForExtraction() {
7928
8034
  if (pdfBase64Cache === void 0) {
7929
8035
  pdfBase64Cache = await pdfInputToBase64(pdfInput);
7930
8036
  }
7931
8037
  return pdfBase64Cache;
7932
8038
  }
8039
+ async function getCachedPageCount() {
8040
+ if (!pageCountPromise) {
8041
+ pageCountPromise = getPdfSlicer().then((slicer) => slicer.getPageCount()).catch(() => getPdfPageCount(pdfInput));
8042
+ }
8043
+ return pageCountPromise;
8044
+ }
8045
+ async function getFullPdfProviderOptions() {
8046
+ if (!fullPdfProviderOptionsPromise) {
8047
+ fullPdfProviderOptionsPromise = buildPdfProviderOptions(pdfInput, activeProviderOptions);
8048
+ }
8049
+ return fullPdfProviderOptionsPromise;
8050
+ }
8051
+ async function getPdfSlicer() {
8052
+ if (!pdfSlicerPromise) {
8053
+ pdfSlicerPromise = createPdfPageSlicer(pdfInput);
8054
+ }
8055
+ return pdfSlicerPromise;
8056
+ }
7933
8057
  async function getPageRangePdf(startPage, endPage) {
7934
8058
  const cacheKey = `${startPage}-${endPage}`;
7935
- const cached = pageRangePdfCache.get(cacheKey);
8059
+ const cached = completedPageRangePdfCache.get(cacheKey);
8060
+ if (cached) return cached;
8061
+ const pending = pageRangePdfCache.get(cacheKey);
8062
+ if (pending) return pending;
8063
+ const promise = (async () => {
8064
+ const slicer = await getPdfSlicer();
8065
+ const pagesPdf = await slicer.extractPageRange(startPage, endPage);
8066
+ completedPageRangePdfCache.set(cacheKey, pagesPdf);
8067
+ return pagesPdf;
8068
+ })().catch((error) => {
8069
+ pageRangePdfCache.delete(cacheKey);
8070
+ throw error;
8071
+ });
8072
+ pageRangePdfCache.set(cacheKey, promise);
8073
+ return promise;
8074
+ }
8075
+ async function getPageImages(startPage, endPage) {
8076
+ if (!convertPdfToImages) return [];
8077
+ const cacheKey = `${startPage}-${endPage}`;
8078
+ const cached = pageRangeImageCache.get(cacheKey);
7936
8079
  if (cached) return cached;
7937
- const pagesPdf = await extractPageRange(await getPdfBase64ForExtraction(), startPage, endPage);
7938
- pageRangePdfCache.set(cacheKey, pagesPdf);
7939
- return pagesPdf;
8080
+ const promise = (async () => {
8081
+ const pdfBase64 = await getPdfBase64ForExtraction();
8082
+ return convertPdfToImages(pdfBase64, startPage, endPage);
8083
+ })().catch((error) => {
8084
+ pageRangeImageCache.delete(cacheKey);
8085
+ throw error;
8086
+ });
8087
+ pageRangeImageCache.set(cacheKey, promise);
8088
+ return promise;
7940
8089
  }
7941
8090
  let classifyResult;
7942
8091
  if (resumed?.classifyResult && pipelineCtx.isPhaseComplete("classify")) {
@@ -7944,15 +8093,16 @@ function createExtractor(config) {
7944
8093
  onProgress?.("Resuming from checkpoint (classify complete)...");
7945
8094
  } else {
7946
8095
  onProgress?.("Classifying document...");
7947
- const pageCount2 = await getPdfPageCount(pdfInput);
8096
+ const pageCount2 = await getCachedPageCount();
7948
8097
  const budget = resolveBudget("extraction_classify", 512);
8098
+ const startedAt = Date.now();
7949
8099
  const classifyResponse = await safeGenerateObject(
7950
8100
  generateObject,
7951
8101
  {
7952
8102
  prompt: buildClassifyPrompt(),
7953
8103
  schema: ClassifyResultSchema,
7954
8104
  maxTokens: budget.maxTokens,
7955
- providerOptions: await buildPdfProviderOptions(pdfInput, activeProviderOptions)
8105
+ providerOptions: await getFullPdfProviderOptions()
7956
8106
  },
7957
8107
  {
7958
8108
  fallback: { documentType: "policy", policyTypes: ["other"], confidence: 0 },
@@ -7964,7 +8114,8 @@ function createExtractor(config) {
7964
8114
  trackUsage(classifyResponse.usage, {
7965
8115
  taskKind: "extraction_classify",
7966
8116
  label: "classify",
7967
- maxTokens: budget.maxTokens
8117
+ maxTokens: budget.maxTokens,
8118
+ durationMs: Date.now() - startedAt
7968
8119
  });
7969
8120
  classifyResult = classifyResponse.object;
7970
8121
  if (classifyResult.confidence === 0) {
@@ -7982,7 +8133,7 @@ function createExtractor(config) {
7982
8133
  const policyTypes = classifyResult.policyTypes ?? [];
7983
8134
  const primaryType = policyTypes[0] ?? "other";
7984
8135
  const template = getTemplate(primaryType);
7985
- const pageCount = resumed?.pageCount ?? await getPdfPageCount(pdfInput);
8136
+ const pageCount = resumed?.pageCount ?? await getCachedPageCount();
7986
8137
  const templateHints = buildTemplateHints(primaryType, documentType, pageCount, template);
7987
8138
  let formInventory;
7988
8139
  if (resumed?.formInventory && pipelineCtx.isPhaseComplete("form_inventory")) {
@@ -7992,13 +8143,14 @@ function createExtractor(config) {
7992
8143
  } else {
7993
8144
  onProgress?.(`Building form inventory for ${primaryType} ${documentType}...`);
7994
8145
  const budget = resolveBudget("extraction_form_inventory", 2048);
8146
+ const startedAt = Date.now();
7995
8147
  const formInventoryResponse = await safeGenerateObject(
7996
8148
  generateObject,
7997
8149
  {
7998
8150
  prompt: buildFormInventoryPrompt(templateHints),
7999
8151
  schema: FormInventorySchema,
8000
8152
  maxTokens: budget.maxTokens,
8001
- providerOptions: await buildPdfProviderOptions(pdfInput, activeProviderOptions)
8153
+ providerOptions: await getFullPdfProviderOptions()
8002
8154
  },
8003
8155
  {
8004
8156
  fallback: { forms: [] },
@@ -8009,7 +8161,8 @@ function createExtractor(config) {
8009
8161
  trackUsage(formInventoryResponse.usage, {
8010
8162
  taskKind: "extraction_form_inventory",
8011
8163
  label: "form_inventory",
8012
- maxTokens: budget.maxTokens
8164
+ maxTokens: budget.maxTokens,
8165
+ durationMs: Date.now() - startedAt
8013
8166
  });
8014
8167
  formInventory = formInventoryResponse.object;
8015
8168
  memory.set("form_inventory", formInventory);
@@ -8039,9 +8192,10 @@ function createExtractor(config) {
8039
8192
  );
8040
8193
  const pageMapResults = await Promise.all(
8041
8194
  pageMapChunks.map(
8042
- ({ startPage, endPage }) => limit(async () => {
8195
+ ({ startPage, endPage }) => pageMapLimit(async () => {
8043
8196
  const pagesPdf = await getPageRangePdf(startPage, endPage);
8044
8197
  const budget = resolveBudget("extraction_page_map", 2048);
8198
+ const startedAt = Date.now();
8045
8199
  const mapResponse = await safeGenerateObject(
8046
8200
  generateObject,
8047
8201
  {
@@ -8066,7 +8220,8 @@ function createExtractor(config) {
8066
8220
  trackUsage(mapResponse.usage, {
8067
8221
  taskKind: "extraction_page_map",
8068
8222
  label: `page_map:${startPage}-${endPage}`,
8069
- maxTokens: budget.maxTokens
8223
+ maxTokens: budget.maxTokens,
8224
+ durationMs: Date.now() - startedAt
8070
8225
  });
8071
8226
  return mapResponse.object.pages.map((assignment) => ({
8072
8227
  ...assignment,
@@ -8112,14 +8267,32 @@ function createExtractor(config) {
8112
8267
  });
8113
8268
  }
8114
8269
  if (!pipelineCtx.isPhaseComplete("extract")) {
8115
- const tasks = plan.tasks;
8270
+ const supplementaryRanges = getSupplementaryPageRanges(pageAssignments, formInventory);
8271
+ const baseTasks = plan.tasks;
8272
+ const hasPlannedSupplementary = baseTasks.some((task) => task.extractorName === "supplementary");
8273
+ const tasks = hasPlannedSupplementary || supplementaryRanges.length === 0 ? baseTasks : [
8274
+ ...baseTasks,
8275
+ ...supplementaryRanges.map((range) => ({
8276
+ extractorName: "supplementary",
8277
+ startPage: range.startPage,
8278
+ endPage: range.endPage,
8279
+ description: `Page-signaled supplementary extraction for pages ${range.startPage}-${range.endPage}`
8280
+ }))
8281
+ ];
8116
8282
  onProgress?.(`Dispatching ${tasks.length} extractors...`);
8117
8283
  const extractionPdfInput = await getPdfBase64ForExtraction();
8118
8284
  const extractorResults = await Promise.all(
8119
8285
  tasks.map(
8120
- (task) => limit(async () => {
8286
+ (task) => extractorLimit(async () => {
8121
8287
  onProgress?.(`Extracting ${task.extractorName} (pages ${task.startPage}-${task.endPage})...`);
8122
- return runFocusedExtractorTask(task, extractionPdfInput, memory, pageRangePdfCache);
8288
+ return runFocusedExtractorTask(
8289
+ task,
8290
+ extractionPdfInput,
8291
+ memory,
8292
+ completedPageRangePdfCache,
8293
+ getPageRangePdf,
8294
+ convertPdfToImages ? getPageImages : void 0
8295
+ );
8123
8296
  })
8124
8297
  )
8125
8298
  );
@@ -8134,6 +8307,7 @@ function createExtractor(config) {
8134
8307
  try {
8135
8308
  const alreadyExtractedSummary = buildAlreadyExtractedSummary(memory);
8136
8309
  const budget = resolveBudget("extraction_focused", 4096);
8310
+ const startedAt = Date.now();
8137
8311
  const supplementaryResult = await runExtractor({
8138
8312
  name: "supplementary",
8139
8313
  prompt: buildSupplementaryPrompt(alreadyExtractedSummary),
@@ -8145,12 +8319,15 @@ function createExtractor(config) {
8145
8319
  convertPdfToImages,
8146
8320
  maxTokens: budget.maxTokens,
8147
8321
  providerOptions: activeProviderOptions,
8148
- pageRangeCache: pageRangePdfCache
8322
+ pageRangeCache: completedPageRangePdfCache,
8323
+ getPageRangePdf,
8324
+ getPageImages: convertPdfToImages ? getPageImages : void 0
8149
8325
  });
8150
8326
  trackUsage(supplementaryResult.usage, {
8151
8327
  taskKind: "extraction_focused",
8152
8328
  label: "supplementary",
8153
- maxTokens: budget.maxTokens
8329
+ maxTokens: budget.maxTokens,
8330
+ durationMs: Date.now() - startedAt
8154
8331
  });
8155
8332
  mergeMemoryResult(supplementaryResult.name, supplementaryResult.data, memory);
8156
8333
  } catch (error) {
@@ -8170,6 +8347,7 @@ function createExtractor(config) {
8170
8347
  if (!pipelineCtx.isPhaseComplete("resolve_referential")) {
8171
8348
  onProgress?.("Resolving referential coverage limits...");
8172
8349
  try {
8350
+ const startedAt = Date.now();
8173
8351
  const resolution = await resolveReferentialCoverages({
8174
8352
  memory,
8175
8353
  pdfInput,
@@ -8177,6 +8355,8 @@ function createExtractor(config) {
8177
8355
  generateObject,
8178
8356
  convertPdfToImages,
8179
8357
  concurrency,
8358
+ getPageRangePdf,
8359
+ getPageImages: convertPdfToImages ? getPageImages : void 0,
8180
8360
  providerOptions: activeProviderOptions,
8181
8361
  modelCapabilities,
8182
8362
  modelBudgetConstraints,
@@ -8185,7 +8365,8 @@ function createExtractor(config) {
8185
8365
  });
8186
8366
  trackUsage(resolution.usage, {
8187
8367
  taskKind: "extraction_referential_lookup",
8188
- label: "referential_resolution"
8368
+ label: "referential_resolution",
8369
+ durationMs: Date.now() - startedAt
8189
8370
  });
8190
8371
  if (resolution.attempts > 0) {
8191
8372
  await log?.(`Referential resolution: ${resolution.resolved}/${resolution.attempts} resolved, ${resolution.unresolved} unresolved`);
@@ -8207,52 +8388,72 @@ function createExtractor(config) {
8207
8388
  let reviewReport = resumed?.reviewReport;
8208
8389
  if (!pipelineCtx.isPhaseComplete("review")) {
8209
8390
  reviewRounds = [];
8210
- for (let round = 0; round < maxReviewRounds; round++) {
8211
- const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
8212
- const extractionSummary = summarizeExtraction(memory);
8213
- const pageMapSummary = formatPageMapSummary(pageAssignments);
8214
- const budget = resolveBudget("extraction_review", 1536);
8215
- const reviewResponse = await safeGenerateObject(
8216
- generateObject,
8217
- {
8218
- prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
8219
- schema: ReviewResultSchema,
8391
+ groundExtractionMemoryWithSourceSpans(memory, sourceSpans);
8392
+ const preReviewReport = buildExtractionReviewReport({
8393
+ memory,
8394
+ pageAssignments,
8395
+ reviewRounds,
8396
+ sourceSpansAvailable: sourceSpans.length > 0
8397
+ });
8398
+ if (shouldRunLlmReview(reviewMode, preReviewReport, sourceSpans.length > 0)) {
8399
+ for (let round = 0; round < maxReviewRounds; round++) {
8400
+ const extractedKeys = [...memory.keys()].filter((k) => k !== "classify");
8401
+ const extractionSummary = summarizeExtraction(memory);
8402
+ const pageMapSummary = formatPageMapSummary(pageAssignments);
8403
+ const budget = resolveBudget("extraction_review", 1536);
8404
+ const startedAt = Date.now();
8405
+ const reviewResponse = await safeGenerateObject(
8406
+ generateObject,
8407
+ {
8408
+ prompt: buildReviewPrompt(template.required, extractedKeys, extractionSummary, pageMapSummary, extractorCatalog),
8409
+ schema: ReviewResultSchema,
8410
+ maxTokens: budget.maxTokens,
8411
+ providerOptions: await getFullPdfProviderOptions()
8412
+ },
8413
+ {
8414
+ fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
8415
+ log,
8416
+ onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
8417
+ }
8418
+ );
8419
+ trackUsage(reviewResponse.usage, {
8420
+ taskKind: "extraction_review",
8421
+ label: `review:${round + 1}`,
8220
8422
  maxTokens: budget.maxTokens,
8221
- providerOptions: await buildPdfProviderOptions(pdfInput, activeProviderOptions)
8222
- },
8223
- {
8224
- fallback: { complete: true, missingFields: [], qualityIssues: [], additionalTasks: [] },
8225
- log,
8226
- onError: (err, attempt) => log?.(`Review round ${round + 1} attempt ${attempt + 1} failed: ${err}`)
8423
+ durationMs: Date.now() - startedAt
8424
+ });
8425
+ reviewRounds.push(toReviewRoundRecord(round + 1, reviewResponse.object));
8426
+ if (reviewResponse.object.qualityIssues?.length) {
8427
+ await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
8227
8428
  }
8228
- );
8229
- trackUsage(reviewResponse.usage, {
8230
- taskKind: "extraction_review",
8231
- label: `review:${round + 1}`,
8232
- maxTokens: budget.maxTokens
8233
- });
8234
- reviewRounds.push(toReviewRoundRecord(round + 1, reviewResponse.object));
8235
- if (reviewResponse.object.qualityIssues?.length) {
8236
- await log?.(`Review round ${round + 1} quality issues: ${reviewResponse.object.qualityIssues.join("; ")}`);
8237
- }
8238
- if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
8239
- onProgress?.("Extraction complete.");
8240
- break;
8241
- }
8242
- onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
8243
- const extractionPdfInput = await getPdfBase64ForExtraction();
8244
- const followUpResults = await Promise.all(
8245
- reviewResponse.object.additionalTasks.map(
8246
- (task) => limit(async () => {
8247
- return runFocusedExtractorTask(task, extractionPdfInput, memory, pageRangePdfCache);
8248
- })
8249
- )
8250
- );
8251
- for (const result of followUpResults.flatMap((item) => Array.isArray(item) ? item : item ? [item] : [])) {
8252
- if (result) {
8253
- mergeMemoryResult(result.name, result.data, memory);
8429
+ if (reviewResponse.object.complete || reviewResponse.object.additionalTasks.length === 0) {
8430
+ onProgress?.("Extraction complete.");
8431
+ break;
8432
+ }
8433
+ onProgress?.(`Review round ${round + 1}: dispatching ${reviewResponse.object.additionalTasks.length} follow-up extractors...`);
8434
+ const extractionPdfInput = await getPdfBase64ForExtraction();
8435
+ const followUpResults = await Promise.all(
8436
+ reviewResponse.object.additionalTasks.map(
8437
+ (task) => extractorLimit(async () => {
8438
+ return runFocusedExtractorTask(
8439
+ task,
8440
+ extractionPdfInput,
8441
+ memory,
8442
+ completedPageRangePdfCache,
8443
+ getPageRangePdf,
8444
+ convertPdfToImages ? getPageImages : void 0
8445
+ );
8446
+ })
8447
+ )
8448
+ );
8449
+ for (const result of followUpResults.flatMap((item) => Array.isArray(item) ? item : item ? [item] : [])) {
8450
+ if (result) {
8451
+ mergeMemoryResult(result.name, result.data, memory);
8452
+ }
8254
8453
  }
8255
8454
  }
8455
+ } else {
8456
+ onProgress?.("Skipping LLM extraction review; deterministic checks passed.");
8256
8457
  }
8257
8458
  groundExtractionMemoryWithSourceSpans(memory, sourceSpans);
8258
8459
  reviewReport = buildExtractionReviewReport({
@@ -8304,6 +8505,7 @@ function createExtractor(config) {
8304
8505
  onProgress?.("Generating document summary...");
8305
8506
  try {
8306
8507
  const budget = resolveBudget("extraction_summary", 512);
8508
+ const startedAt = Date.now();
8307
8509
  const summaryResponse = await safeGenerateObject(
8308
8510
  generateObject,
8309
8511
  {
@@ -8321,7 +8523,8 @@ function createExtractor(config) {
8321
8523
  trackUsage(summaryResponse.usage, {
8322
8524
  taskKind: "extraction_summary",
8323
8525
  label: "summary",
8324
- maxTokens: budget.maxTokens
8526
+ maxTokens: budget.maxTokens,
8527
+ durationMs: Date.now() - startedAt
8325
8528
  });
8326
8529
  if (summaryResponse.object.summary) {
8327
8530
  document.summary = summaryResponse.object.summary;
@@ -8332,16 +8535,19 @@ function createExtractor(config) {
8332
8535
  }
8333
8536
  onProgress?.("Formatting extracted content...");
8334
8537
  const formatBudget = resolveBudget("extraction_format", 16384);
8538
+ const formatStartedAt = Date.now();
8335
8539
  const formatResult = await formatDocumentContent(document, generateText, {
8336
8540
  providerOptions: activeProviderOptions,
8337
8541
  maxTokens: formatBudget.maxTokens,
8542
+ concurrency: formatConcurrency ?? concurrency,
8338
8543
  onProgress,
8339
8544
  log
8340
8545
  });
8341
8546
  trackUsage(formatResult.usage, {
8342
8547
  taskKind: "extraction_format",
8343
8548
  label: "format",
8344
- maxTokens: formatBudget.maxTokens
8549
+ maxTokens: formatBudget.maxTokens,
8550
+ durationMs: Date.now() - formatStartedAt
8345
8551
  });
8346
8552
  const chunks = chunkDocument(formatResult.document);
8347
8553
  const finalCheckpoint = pipelineCtx.getCheckpoint();