@claritylabs/cl-sdk 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of @claritylabs/cl-sdk might be problematic. Click here for more details.

package/dist/index.mjs CHANGED
@@ -29779,6 +29779,132 @@ var AGENT_TOOLS = [
29779
29779
 
29780
29780
  // src/extraction/pipeline.ts
29781
29781
  import { generateText } from "ai";
29782
+
29783
+ // src/extraction/pdf.ts
29784
+ import {
29785
+ PDFDocument,
29786
+ PDFTextField,
29787
+ PDFCheckBox,
29788
+ PDFDropdown,
29789
+ PDFRadioGroup,
29790
+ StandardFonts,
29791
+ rgb
29792
+ } from "pdf-lib";
29793
/**
 * Extracts an inclusive, 1-based page range from a base64-encoded PDF and
 * returns it as a new base64-encoded PDF.
 *
 * The requested range is clamped to the document: a `startPage` below 1 is
 * treated as 1 and an `endPage` past the last page is treated as the last
 * page. When the clamped range covers the whole document, the input string is
 * returned unchanged and nothing is re-serialized.
 *
 * @param {string} pdfBase64 - Base64-encoded source PDF.
 * @param {number} startPage - First page to keep (1-based, inclusive).
 * @param {number} endPage - Last page to keep (1-based, inclusive).
 * @returns {Promise<string>} Base64-encoded PDF holding only the selected pages.
 * @throws {RangeError} If the clamped range selects no pages (for example
 *   `startPage > endPage`, `startPage` past the last page, or a non-numeric
 *   bound). Previously this case silently produced a zero-page PDF.
 */
async function extractPageRange(pdfBase64, startPage, endPage) {
  const hasBuffer = typeof Buffer !== "undefined";
  const srcBytes = hasBuffer
    ? Buffer.from(pdfBase64, "base64")
    : Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
  const srcDoc = await PDFDocument.load(srcBytes, { ignoreEncryption: true });
  const totalPages = srcDoc.getPageCount();

  // Convert the 1-based inclusive bounds to clamped 0-based indices.
  const start = Math.max(startPage - 1, 0);
  const end = Math.min(endPage, totalPages) - 1;

  // Whole document requested: hand back the input untouched.
  if (start === 0 && end >= totalPages - 1) {
    return pdfBase64;
  }

  // Written as a negated `<=` so that NaN bounds are rejected as well.
  if (!(start <= end)) {
    throw new RangeError(
      `Invalid page range ${startPage}-${endPage} for a PDF with ${totalPages} page(s)`
    );
  }

  const newDoc = await PDFDocument.create();
  const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
  const pages = await newDoc.copyPages(srcDoc, indices);
  for (const page of pages) {
    newDoc.addPage(page);
  }
  const bytes = await newDoc.save();

  if (hasBuffer) {
    return Buffer.from(bytes).toString("base64");
  }

  // No Buffer available: build a binary string byte by byte for btoa.
  let binary = "";
  for (const byte of bytes) {
    binary += String.fromCharCode(byte);
  }
  return btoa(binary);
}
29817
/**
 * Loads a base64-encoded PDF and reports how many pages it has.
 *
 * @param {string} pdfBase64 - Base64-encoded PDF.
 * @returns {Promise<number>} The page count reported by the loaded document.
 */
async function getPdfPageCount(pdfBase64) {
  let pdfData;
  if (typeof Buffer !== "undefined") {
    pdfData = Buffer.from(pdfBase64, "base64");
  } else {
    // No Buffer available: decode with atob and map each char to its byte.
    pdfData = Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
  }
  const loaded = await PDFDocument.load(pdfData, { ignoreEncryption: true });
  return loaded.getPageCount();
}
29822
/**
 * Lists the form fields of a loaded PDF document as plain descriptors.
 *
 * Each descriptor has the field `name` and a `type` of "text", "checkbox",
 * "dropdown" or "radio". Dropdown and radio descriptors also carry the
 * field's `options`. Any field class not matched below is reported as "text".
 *
 * @param {object} pdfDoc - Loaded document exposing `getForm()`.
 * @returns {Array<{name: string, type: string, options?: string[]}>}
 *   One descriptor per field; an empty array when the form has no fields.
 */
function getAcroFormFields(pdfDoc) {
  const descriptors = [];
  for (const field of pdfDoc.getForm().getFields()) {
    // Start from the fallback ("text") and refine by concrete field class.
    const entry = { name: field.getName(), type: "text" };
    if (field instanceof PDFTextField) {
      // Already described by the default entry.
    } else if (field instanceof PDFCheckBox) {
      entry.type = "checkbox";
    } else if (field instanceof PDFDropdown) {
      entry.type = "dropdown";
      entry.options = field.getOptions();
    } else if (field instanceof PDFRadioGroup) {
      entry.type = "radio";
      entry.options = field.getOptions();
    }
    descriptors.push(entry);
  }
  return descriptors;
}
29843
/**
 * Fills the form fields of a PDF from a list of mappings, flattens the form
 * and returns the saved document.
 *
 * Filling is best-effort: a mapping whose field is missing, or whose value the
 * field rejects, is skipped and the remaining mappings are still applied.
 *
 * Value handling:
 * - Text fields receive `String(value)`; `null`/`undefined` is passed on as
 *   `undefined`.
 * - Checkboxes are checked when the trimmed, lower-cased string form of the
 *   value is one of "yes", "true", "x", "checked" or "on", and unchecked
 *   otherwise. A `null`/`undefined` value leaves the checkbox untouched.
 * - Dropdowns and radio groups receive the value as given.
 *
 * @param {Uint8Array|ArrayBuffer|string} pdfBytes - Source PDF, in any form
 *   accepted by `PDFDocument.load`.
 * @param {Array<{acroFormName: string, value: *}>} mappings - Field name/value pairs.
 * @returns {Promise<Uint8Array>} Whatever `pdfDoc.save()` resolves to after flattening.
 */
async function fillAcroForm(pdfBytes, mappings) {
  const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });
  const form = pdfDoc.getForm();
  const checkedValues = new Set(["yes", "true", "x", "checked", "on"]);

  for (const { acroFormName, value } of mappings) {
    // Coerce once so booleans and numbers are not silently dropped.
    const text = value == null ? undefined : String(value);
    try {
      const field = form.getField(acroFormName);
      if (field instanceof PDFTextField) {
        field.setText(text);
      } else if (field instanceof PDFCheckBox) {
        if (text === undefined) continue;
        if (checkedValues.has(text.trim().toLowerCase())) {
          field.check();
        } else {
          field.uncheck();
        }
      } else if (field instanceof PDFDropdown || field instanceof PDFRadioGroup) {
        field.select(value);
      }
    } catch {
      // Deliberate best-effort: an unknown field name or a value the field
      // rejects must not prevent the remaining fields from being filled.
    }
  }

  form.flatten();
  return await pdfDoc.save();
}
29875
/**
 * Draws text overlays onto the pages of a PDF and returns the saved document.
 *
 * Overlay coordinates are percentages of the page size: `x` is measured from
 * the left edge and `y` from the top edge. The drawing position is shifted
 * down by one font size (`height - y% - fontSize`). Text is drawn in black
 * Helvetica.
 *
 * Overlays that cannot be drawn are skipped rather than aborting the whole
 * call: a `page` that is not an integer within `[0, pageCount)`, or a
 * non-checkmark overlay whose `text` is `null`/`undefined`.
 *
 * @param {Uint8Array|ArrayBuffer|string} pdfBytes - Source PDF, in any form
 *   accepted by `PDFDocument.load`.
 * @param {Array<{page: number, x: number, y: number, text?: *,
 *   fontSize?: number, isCheckmark?: boolean}>} overlays - Items to draw;
 *   `page` is a 0-based page index, `fontSize` defaults to 10, and
 *   `isCheckmark` draws an "X" instead of `text`.
 * @returns {Promise<Uint8Array>} Whatever `pdfDoc.save()` resolves to.
 */
async function overlayTextOnPdf(pdfBytes, overlays) {
  const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });
  const font = await pdfDoc.embedFont(StandardFonts.Helvetica);
  const pageCount = pdfDoc.getPageCount();
  const color = rgb(0, 0, 0);

  for (const overlay of overlays) {
    const pageIndex = overlay.page;
    // Integer check also rejects undefined/NaN, which slip past `<`/`>=`.
    if (!Number.isInteger(pageIndex) || pageIndex < 0 || pageIndex >= pageCount) continue;

    const content = overlay.isCheckmark ? "X" : overlay.text;
    if (content == null) continue;

    const page = pdfDoc.getPage(pageIndex);
    const { width, height } = page.getSize();
    const fontSize = overlay.fontSize ?? 10;
    const x = overlay.x / 100 * width;
    // Overlay y is measured from the top of the page, so subtract it from height.
    const y = height - overlay.y / 100 * height - fontSize;

    page.drawText(String(content), { x, y, size: fontSize, font, color });
  }

  return await pdfDoc.save();
}
29906
+
29907
+ // src/extraction/pipeline.ts
29782
29908
  var SONNET_MODEL = "claude-sonnet-4-6";
29783
29909
  var HAIKU_MODEL = "claude-haiku-4-5-20251001";
29784
29910
  var DEFAULT_METADATA_PROVIDER_OPTIONS = {
@@ -29915,8 +30041,10 @@ function getPageChunks(totalPages, chunkSize = 30) {
29915
30041
  }
29916
30042
  return chunks;
29917
30043
  }
29918
- async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, log, onTokenUsage) {
29919
- await log?.(`Calling model (max ${maxTokens} tokens)...`);
30044
+ async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, log, onTokenUsage, pageRange) {
30045
+ const pdfToSend = pageRange ? await extractPageRange(pdfBase64, pageRange[0], pageRange[1]) : pdfBase64;
30046
+ const rangeLabel = pageRange ? ` [pages ${pageRange[0]}\u2013${pageRange[1]}]` : "";
30047
+ await log?.(`Calling model (max ${maxTokens} tokens)${rangeLabel}...`);
29920
30048
  const start = Date.now();
29921
30049
  const { text, usage } = await withRetry(
29922
30050
  () => generateText({
@@ -29925,7 +30053,7 @@ async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, l
29925
30053
  messages: [{
29926
30054
  role: "user",
29927
30055
  content: [
29928
- { type: "file", data: pdfBase64, mediaType: "application/pdf" },
30056
+ { type: "file", data: pdfToSend, mediaType: "application/pdf" },
29929
30057
  { type: "text", text: prompt }
29930
30058
  ]
29931
30059
  }],
@@ -30031,7 +30159,9 @@ async function classifyDocumentType(pdfBase64, options) {
30031
30159
  MODEL_TOKEN_LIMITS.classification,
30032
30160
  void 0,
30033
30161
  log,
30034
- onTokenUsage
30162
+ onTokenUsage,
30163
+ [1, 3]
30164
+ // Only need first 3 pages for classification
30035
30165
  );
30036
30166
  try {
30037
30167
  const parsed = JSON.parse(stripFences(raw));
@@ -30120,7 +30250,9 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
30120
30250
  MODEL_TOKEN_LIMITS.sections,
30121
30251
  void 0,
30122
30252
  log,
30123
- onTokenUsage
30253
+ onTokenUsage,
30254
+ [start, end]
30255
+ // Only send this chunk's pages
30124
30256
  );
30125
30257
  try {
30126
30258
  return [JSON.parse(stripFences(chunkRaw))];
@@ -30162,7 +30294,9 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
30162
30294
  MODEL_TOKEN_LIMITS.sectionsFallback,
30163
30295
  fallbackProviderOptions,
30164
30296
  log,
30165
- onTokenUsage
30297
+ onTokenUsage,
30298
+ [start, end]
30299
+ // Only send this chunk's pages
30166
30300
  );
30167
30301
  try {
30168
30302
  return [JSON.parse(stripFences(fallbackRaw))];
@@ -30205,7 +30339,9 @@ async function extractFromPdf(pdfBase64, options) {
30205
30339
  onTokenUsage
30206
30340
  } = options ?? {};
30207
30341
  const resolved = resolveModels(models);
30342
+ const actualPageCount = await getPdfPageCount(pdfBase64);
30208
30343
  await log?.("Pass 1: Extracting metadata...");
30344
+ const metadataPageRange = [1, Math.min(10, actualPageCount)];
30209
30345
  const metadataRaw = await callModel(
30210
30346
  resolved.metadata,
30211
30347
  pdfBase64,
@@ -30213,7 +30349,8 @@ async function extractFromPdf(pdfBase64, options) {
30213
30349
  MODEL_TOKEN_LIMITS.metadata,
30214
30350
  metadataProviderOptions,
30215
30351
  log,
30216
- onTokenUsage
30352
+ onTokenUsage,
30353
+ metadataPageRange
30217
30354
  );
30218
30355
  let metadataResult;
30219
30356
  try {
@@ -30224,7 +30361,7 @@ async function extractFromPdf(pdfBase64, options) {
30224
30361
  throw new Error(`Metadata JSON parse failed: ${e.message}`);
30225
30362
  }
30226
30363
  await onMetadata?.(metadataRaw);
30227
- const pageCount = metadataResult.totalPages || 1;
30364
+ const pageCount = actualPageCount;
30228
30365
  await log?.(`Document: ${pageCount} page(s)`);
30229
30366
  const sectionChunks = await extractSectionChunks(
30230
30367
  resolved,
@@ -30292,7 +30429,9 @@ async function extractQuoteFromPdf(pdfBase64, options) {
30292
30429
  onTokenUsage
30293
30430
  } = options ?? {};
30294
30431
  const resolved = resolveModels(models);
30432
+ const actualPageCount = await getPdfPageCount(pdfBase64);
30295
30433
  await log?.("Pass 1: Extracting quote metadata...");
30434
+ const metadataPageRange = [1, Math.min(10, actualPageCount)];
30296
30435
  const metadataRaw = await callModel(
30297
30436
  resolved.metadata,
30298
30437
  pdfBase64,
@@ -30300,7 +30439,8 @@ async function extractQuoteFromPdf(pdfBase64, options) {
30300
30439
  MODEL_TOKEN_LIMITS.metadata,
30301
30440
  metadataProviderOptions,
30302
30441
  log,
30303
- onTokenUsage
30442
+ onTokenUsage,
30443
+ metadataPageRange
30304
30444
  );
30305
30445
  let metadataResult;
30306
30446
  try {
@@ -30311,7 +30451,7 @@ async function extractQuoteFromPdf(pdfBase64, options) {
30311
30451
  throw new Error(`Quote metadata JSON parse failed: ${e.message}`);
30312
30452
  }
30313
30453
  await onMetadata?.(metadataRaw);
30314
- const pageCount = metadataResult.totalPages || 1;
30454
+ const pageCount = actualPageCount;
30315
30455
  await log?.(`Quote document: ${pageCount} page(s)`);
30316
30456
  const sectionChunks = await extractSectionChunks(
30317
30457
  resolved,
@@ -30328,101 +30468,6 @@ async function extractQuoteFromPdf(pdfBase64, options) {
30328
30468
  const mergedRaw = JSON.stringify(merged);
30329
30469
  return { rawText: mergedRaw, extracted: merged };
30330
30470
  }
30331
-
30332
- // src/extraction/pdf.ts
30333
- import {
30334
- PDFDocument,
30335
- PDFTextField,
30336
- PDFCheckBox,
30337
- PDFDropdown,
30338
- PDFRadioGroup,
30339
- StandardFonts,
30340
- rgb
30341
- } from "pdf-lib";
30342
- function getAcroFormFields(pdfDoc) {
30343
- const form = pdfDoc.getForm();
30344
- const fields = form.getFields();
30345
- if (fields.length === 0) return [];
30346
- return fields.map((field) => {
30347
- const name = field.getName();
30348
- if (field instanceof PDFTextField) {
30349
- return { name, type: "text" };
30350
- }
30351
- if (field instanceof PDFCheckBox) {
30352
- return { name, type: "checkbox" };
30353
- }
30354
- if (field instanceof PDFDropdown) {
30355
- return { name, type: "dropdown", options: field.getOptions() };
30356
- }
30357
- if (field instanceof PDFRadioGroup) {
30358
- return { name, type: "radio", options: field.getOptions() };
30359
- }
30360
- return { name, type: "text" };
30361
- });
30362
- }
30363
- async function fillAcroForm(pdfBytes, mappings) {
30364
- const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });
30365
- const form = pdfDoc.getForm();
30366
- for (const { acroFormName, value } of mappings) {
30367
- try {
30368
- const field = form.getField(acroFormName);
30369
- if (field instanceof PDFTextField) {
30370
- field.setText(value);
30371
- } else if (field instanceof PDFCheckBox) {
30372
- const lower = value.toLowerCase();
30373
- if (["yes", "true", "x", "checked", "on"].includes(lower)) {
30374
- field.check();
30375
- } else {
30376
- field.uncheck();
30377
- }
30378
- } else if (field instanceof PDFDropdown) {
30379
- try {
30380
- field.select(value);
30381
- } catch {
30382
- }
30383
- } else if (field instanceof PDFRadioGroup) {
30384
- try {
30385
- field.select(value);
30386
- } catch {
30387
- }
30388
- }
30389
- } catch {
30390
- }
30391
- }
30392
- form.flatten();
30393
- return await pdfDoc.save();
30394
- }
30395
- async function overlayTextOnPdf(pdfBytes, overlays) {
30396
- const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true });
30397
- const font = await pdfDoc.embedFont(StandardFonts.Helvetica);
30398
- const pageCount = pdfDoc.getPageCount();
30399
- for (const overlay of overlays) {
30400
- if (overlay.page < 0 || overlay.page >= pageCount) continue;
30401
- const page = pdfDoc.getPage(overlay.page);
30402
- const { width, height } = page.getSize();
30403
- const fontSize = overlay.fontSize ?? 10;
30404
- const x = overlay.x / 100 * width;
30405
- const y = height - overlay.y / 100 * height - fontSize;
30406
- if (overlay.isCheckmark) {
30407
- page.drawText("X", {
30408
- x,
30409
- y,
30410
- size: fontSize,
30411
- font,
30412
- color: rgb(0, 0, 0)
30413
- });
30414
- } else {
30415
- page.drawText(overlay.text, {
30416
- x,
30417
- y,
30418
- size: fontSize,
30419
- font,
30420
- color: rgb(0, 0, 0)
30421
- });
30422
- }
30423
- }
30424
- return await pdfDoc.save();
30425
- }
30426
30471
  export {
30427
30472
  AGENT_TOOLS,
30428
30473
  APPLICATION_CLASSIFY_PROMPT,
@@ -30474,11 +30519,13 @@ export {
30474
30519
  createUniformModelConfig,
30475
30520
  enrichSupplementaryFields,
30476
30521
  extractFromPdf,
30522
+ extractPageRange,
30477
30523
  extractQuoteFromPdf,
30478
30524
  extractSectionsOnly,
30479
30525
  fillAcroForm,
30480
30526
  getAcroFormFields,
30481
30527
  getPageChunks,
30528
+ getPdfPageCount,
30482
30529
  mergeChunkedQuoteSections,
30483
30530
  mergeChunkedSections,
30484
30531
  overlayTextOnPdf,