@claritylabs/cl-sdk 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +15 -1
- package/dist/index.d.ts +15 -1
- package/dist/index.js +146 -97
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +152 -105
- package/dist/index.mjs.map +1 -1
- package/package.json +5 -1
package/dist/index.d.mts
CHANGED
|
@@ -524,6 +524,20 @@ declare function extractQuoteFromPdf(pdfBase64: string, options?: ExtractOptions
|
|
|
524
524
|
extracted: any;
|
|
525
525
|
}>;
|
|
526
526
|
|
|
527
|
+
/**
|
|
528
|
+
* Extract a page range from a PDF and return as base64.
|
|
529
|
+
* Used to reduce API token usage by only sending relevant pages.
|
|
530
|
+
*
|
|
531
|
+
* @param pdfBase64 - Full PDF as base64 string.
|
|
532
|
+
* @param startPage - First page to include (1-indexed).
|
|
533
|
+
* @param endPage - Last page to include (1-indexed, clamped to total pages).
|
|
534
|
+
* @returns Base64 string of the trimmed PDF, or original if range covers all pages.
|
|
535
|
+
*/
|
|
536
|
+
declare function extractPageRange(pdfBase64: string, startPage: number, endPage: number): Promise<string>;
|
|
537
|
+
/**
|
|
538
|
+
* Get the page count of a PDF without fully parsing it.
|
|
539
|
+
*/
|
|
540
|
+
declare function getPdfPageCount(pdfBase64: string): Promise<number>;
|
|
527
541
|
interface AcroFormFieldInfo {
|
|
528
542
|
name: string;
|
|
529
543
|
type: "text" | "checkbox" | "dropdown" | "radio";
|
|
@@ -548,4 +562,4 @@ interface TextOverlay {
|
|
|
548
562
|
/** Overlay text on a flat PDF at specified coordinates. */
|
|
549
563
|
declare function overlayTextOnPdf(pdfBytes: Uint8Array, overlays: TextOverlay[]): Promise<Uint8Array>;
|
|
550
564
|
|
|
551
|
-
export { AGENT_TOOLS, APPLICATION_CLASSIFY_PROMPT, type AcroFormFieldInfo, type AgentContext, type BaseDocument, CLASSIFY_DOCUMENT_PROMPT, CLASSIFY_EMAIL_PROMPT, COI_GENERATION_TOOL, COVERAGE_COMPARISON_TOOL, type ClassifyOptions, type CommunicationIntent, type Coverage, DOCUMENT_LOOKUP_TOOL, EXTRACTION_PROMPT, type ExtractOptions, type ExtractSectionsOptions, type FieldMapping, HAIKU_MODEL, type InsuranceDocument, type LogFn, METADATA_PROMPT, MODEL_TOKEN_LIMITS, type ModelConfig, PLATFORM_CONFIGS, type Platform, type PlatformConfig, type PolicyDocument, type PremiumLine, type PromptBuilder, QUOTE_METADATA_PROMPT, type QuoteDocument, SONNET_MODEL, type Section, type Subjectivity, type Subsection, type TextOverlay, type TokenUsage, type ToolDefinition, type UnderwritingCondition, applyExtracted, applyExtractedQuote, buildAcroFormMappingPrompt, buildAgentSystemPrompt, buildAnswerParsingPrompt, buildAutoFillPrompt, buildBatchEmailGenerationPrompt, buildClassifyMessagePrompt, buildCoiRoutingPrompt, buildConfirmationSummaryPrompt, buildConversationMemoryContext, buildConversationMemoryGuidance, buildCoverageGapPrompt, buildDocumentContext, buildFieldExplanationPrompt, buildFieldExtractionPrompt, buildFlatPdfMappingPrompt, buildFormattingPrompt, buildIdentityPrompt, buildIntentPrompt, buildLookupFillPrompt, buildPolicyContext, buildPolicySectionsPrompt, buildQuestionBatchPrompt, buildQuoteSectionsPrompt, buildQuotesPoliciesPrompt, buildReplyIntentClassificationPrompt, buildSafetyPrompt, buildSectionsPrompt, buildSupplementaryEnrichmentPrompt, buildSystemPrompt, classifyDocumentType, createDefaultModelConfig, createUniformModelConfig, enrichSupplementaryFields, extractFromPdf, extractQuoteFromPdf, extractSectionsOnly, fillAcroForm, getAcroFormFields, getPageChunks, mergeChunkedQuoteSections, mergeChunkedSections, overlayTextOnPdf, sanitizeNulls, stripFences };
|
|
565
|
+
export { AGENT_TOOLS, APPLICATION_CLASSIFY_PROMPT, type AcroFormFieldInfo, type AgentContext, type BaseDocument, CLASSIFY_DOCUMENT_PROMPT, CLASSIFY_EMAIL_PROMPT, COI_GENERATION_TOOL, COVERAGE_COMPARISON_TOOL, type ClassifyOptions, type CommunicationIntent, type Coverage, DOCUMENT_LOOKUP_TOOL, EXTRACTION_PROMPT, type ExtractOptions, type ExtractSectionsOptions, type FieldMapping, HAIKU_MODEL, type InsuranceDocument, type LogFn, METADATA_PROMPT, MODEL_TOKEN_LIMITS, type ModelConfig, PLATFORM_CONFIGS, type Platform, type PlatformConfig, type PolicyDocument, type PremiumLine, type PromptBuilder, QUOTE_METADATA_PROMPT, type QuoteDocument, SONNET_MODEL, type Section, type Subjectivity, type Subsection, type TextOverlay, type TokenUsage, type ToolDefinition, type UnderwritingCondition, applyExtracted, applyExtractedQuote, buildAcroFormMappingPrompt, buildAgentSystemPrompt, buildAnswerParsingPrompt, buildAutoFillPrompt, buildBatchEmailGenerationPrompt, buildClassifyMessagePrompt, buildCoiRoutingPrompt, buildConfirmationSummaryPrompt, buildConversationMemoryContext, buildConversationMemoryGuidance, buildCoverageGapPrompt, buildDocumentContext, buildFieldExplanationPrompt, buildFieldExtractionPrompt, buildFlatPdfMappingPrompt, buildFormattingPrompt, buildIdentityPrompt, buildIntentPrompt, buildLookupFillPrompt, buildPolicyContext, buildPolicySectionsPrompt, buildQuestionBatchPrompt, buildQuoteSectionsPrompt, buildQuotesPoliciesPrompt, buildReplyIntentClassificationPrompt, buildSafetyPrompt, buildSectionsPrompt, buildSupplementaryEnrichmentPrompt, buildSystemPrompt, classifyDocumentType, createDefaultModelConfig, createUniformModelConfig, enrichSupplementaryFields, extractFromPdf, extractPageRange, extractQuoteFromPdf, extractSectionsOnly, fillAcroForm, getAcroFormFields, getPageChunks, getPdfPageCount, mergeChunkedQuoteSections, mergeChunkedSections, overlayTextOnPdf, sanitizeNulls, stripFences };
|
package/dist/index.d.ts
CHANGED
|
@@ -524,6 +524,20 @@ declare function extractQuoteFromPdf(pdfBase64: string, options?: ExtractOptions
|
|
|
524
524
|
extracted: any;
|
|
525
525
|
}>;
|
|
526
526
|
|
|
527
|
+
/**
|
|
528
|
+
* Extract a page range from a PDF and return as base64.
|
|
529
|
+
* Used to reduce API token usage by only sending relevant pages.
|
|
530
|
+
*
|
|
531
|
+
* @param pdfBase64 - Full PDF as base64 string.
|
|
532
|
+
* @param startPage - First page to include (1-indexed).
|
|
533
|
+
* @param endPage - Last page to include (1-indexed, clamped to total pages).
|
|
534
|
+
* @returns Base64 string of the trimmed PDF, or original if range covers all pages.
|
|
535
|
+
*/
|
|
536
|
+
declare function extractPageRange(pdfBase64: string, startPage: number, endPage: number): Promise<string>;
|
|
537
|
+
/**
|
|
538
|
+
* Get the page count of a PDF without fully parsing it.
|
|
539
|
+
*/
|
|
540
|
+
declare function getPdfPageCount(pdfBase64: string): Promise<number>;
|
|
527
541
|
interface AcroFormFieldInfo {
|
|
528
542
|
name: string;
|
|
529
543
|
type: "text" | "checkbox" | "dropdown" | "radio";
|
|
@@ -548,4 +562,4 @@ interface TextOverlay {
|
|
|
548
562
|
/** Overlay text on a flat PDF at specified coordinates. */
|
|
549
563
|
declare function overlayTextOnPdf(pdfBytes: Uint8Array, overlays: TextOverlay[]): Promise<Uint8Array>;
|
|
550
564
|
|
|
551
|
-
export { AGENT_TOOLS, APPLICATION_CLASSIFY_PROMPT, type AcroFormFieldInfo, type AgentContext, type BaseDocument, CLASSIFY_DOCUMENT_PROMPT, CLASSIFY_EMAIL_PROMPT, COI_GENERATION_TOOL, COVERAGE_COMPARISON_TOOL, type ClassifyOptions, type CommunicationIntent, type Coverage, DOCUMENT_LOOKUP_TOOL, EXTRACTION_PROMPT, type ExtractOptions, type ExtractSectionsOptions, type FieldMapping, HAIKU_MODEL, type InsuranceDocument, type LogFn, METADATA_PROMPT, MODEL_TOKEN_LIMITS, type ModelConfig, PLATFORM_CONFIGS, type Platform, type PlatformConfig, type PolicyDocument, type PremiumLine, type PromptBuilder, QUOTE_METADATA_PROMPT, type QuoteDocument, SONNET_MODEL, type Section, type Subjectivity, type Subsection, type TextOverlay, type TokenUsage, type ToolDefinition, type UnderwritingCondition, applyExtracted, applyExtractedQuote, buildAcroFormMappingPrompt, buildAgentSystemPrompt, buildAnswerParsingPrompt, buildAutoFillPrompt, buildBatchEmailGenerationPrompt, buildClassifyMessagePrompt, buildCoiRoutingPrompt, buildConfirmationSummaryPrompt, buildConversationMemoryContext, buildConversationMemoryGuidance, buildCoverageGapPrompt, buildDocumentContext, buildFieldExplanationPrompt, buildFieldExtractionPrompt, buildFlatPdfMappingPrompt, buildFormattingPrompt, buildIdentityPrompt, buildIntentPrompt, buildLookupFillPrompt, buildPolicyContext, buildPolicySectionsPrompt, buildQuestionBatchPrompt, buildQuoteSectionsPrompt, buildQuotesPoliciesPrompt, buildReplyIntentClassificationPrompt, buildSafetyPrompt, buildSectionsPrompt, buildSupplementaryEnrichmentPrompt, buildSystemPrompt, classifyDocumentType, createDefaultModelConfig, createUniformModelConfig, enrichSupplementaryFields, extractFromPdf, extractQuoteFromPdf, extractSectionsOnly, fillAcroForm, getAcroFormFields, getPageChunks, mergeChunkedQuoteSections, mergeChunkedSections, overlayTextOnPdf, sanitizeNulls, stripFences };
|
|
565
|
+
export { AGENT_TOOLS, APPLICATION_CLASSIFY_PROMPT, type AcroFormFieldInfo, type AgentContext, type BaseDocument, CLASSIFY_DOCUMENT_PROMPT, CLASSIFY_EMAIL_PROMPT, COI_GENERATION_TOOL, COVERAGE_COMPARISON_TOOL, type ClassifyOptions, type CommunicationIntent, type Coverage, DOCUMENT_LOOKUP_TOOL, EXTRACTION_PROMPT, type ExtractOptions, type ExtractSectionsOptions, type FieldMapping, HAIKU_MODEL, type InsuranceDocument, type LogFn, METADATA_PROMPT, MODEL_TOKEN_LIMITS, type ModelConfig, PLATFORM_CONFIGS, type Platform, type PlatformConfig, type PolicyDocument, type PremiumLine, type PromptBuilder, QUOTE_METADATA_PROMPT, type QuoteDocument, SONNET_MODEL, type Section, type Subjectivity, type Subsection, type TextOverlay, type TokenUsage, type ToolDefinition, type UnderwritingCondition, applyExtracted, applyExtractedQuote, buildAcroFormMappingPrompt, buildAgentSystemPrompt, buildAnswerParsingPrompt, buildAutoFillPrompt, buildBatchEmailGenerationPrompt, buildClassifyMessagePrompt, buildCoiRoutingPrompt, buildConfirmationSummaryPrompt, buildConversationMemoryContext, buildConversationMemoryGuidance, buildCoverageGapPrompt, buildDocumentContext, buildFieldExplanationPrompt, buildFieldExtractionPrompt, buildFlatPdfMappingPrompt, buildFormattingPrompt, buildIdentityPrompt, buildIntentPrompt, buildLookupFillPrompt, buildPolicyContext, buildPolicySectionsPrompt, buildQuestionBatchPrompt, buildQuoteSectionsPrompt, buildQuotesPoliciesPrompt, buildReplyIntentClassificationPrompt, buildSafetyPrompt, buildSectionsPrompt, buildSupplementaryEnrichmentPrompt, buildSystemPrompt, classifyDocumentType, createDefaultModelConfig, createUniformModelConfig, enrichSupplementaryFields, extractFromPdf, extractPageRange, extractQuoteFromPdf, extractSectionsOnly, fillAcroForm, getAcroFormFields, getPageChunks, getPdfPageCount, mergeChunkedQuoteSections, mergeChunkedSections, overlayTextOnPdf, sanitizeNulls, stripFences };
|
package/dist/index.js
CHANGED
|
@@ -28532,11 +28532,13 @@ __export(index_exports, {
|
|
|
28532
28532
|
createUniformModelConfig: () => createUniformModelConfig,
|
|
28533
28533
|
enrichSupplementaryFields: () => enrichSupplementaryFields,
|
|
28534
28534
|
extractFromPdf: () => extractFromPdf,
|
|
28535
|
+
extractPageRange: () => extractPageRange,
|
|
28535
28536
|
extractQuoteFromPdf: () => extractQuoteFromPdf,
|
|
28536
28537
|
extractSectionsOnly: () => extractSectionsOnly,
|
|
28537
28538
|
fillAcroForm: () => fillAcroForm,
|
|
28538
28539
|
getAcroFormFields: () => getAcroFormFields,
|
|
28539
28540
|
getPageChunks: () => getPageChunks,
|
|
28541
|
+
getPdfPageCount: () => getPdfPageCount,
|
|
28540
28542
|
mergeChunkedQuoteSections: () => mergeChunkedQuoteSections,
|
|
28541
28543
|
mergeChunkedSections: () => mergeChunkedSections,
|
|
28542
28544
|
overlayTextOnPdf: () => overlayTextOnPdf,
|
|
@@ -29862,6 +29864,124 @@ var AGENT_TOOLS = [
|
|
|
29862
29864
|
|
|
29863
29865
|
// src/extraction/pipeline.ts
|
|
29864
29866
|
var import_ai = require("ai");
|
|
29867
|
+
|
|
29868
|
+
// src/extraction/pdf.ts
|
|
29869
|
+
var import_pdf_lib = require("pdf-lib");
|
|
29870
|
+
async function extractPageRange(pdfBase64, startPage, endPage) {
|
|
29871
|
+
const srcBytes = typeof Buffer !== "undefined" ? Buffer.from(pdfBase64, "base64") : Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
|
|
29872
|
+
const srcDoc = await import_pdf_lib.PDFDocument.load(srcBytes, { ignoreEncryption: true });
|
|
29873
|
+
const totalPages = srcDoc.getPageCount();
|
|
29874
|
+
const start = Math.max(startPage - 1, 0);
|
|
29875
|
+
const end = Math.min(endPage, totalPages) - 1;
|
|
29876
|
+
if (start === 0 && end >= totalPages - 1) {
|
|
29877
|
+
return pdfBase64;
|
|
29878
|
+
}
|
|
29879
|
+
const newDoc = await import_pdf_lib.PDFDocument.create();
|
|
29880
|
+
const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i);
|
|
29881
|
+
const pages = await newDoc.copyPages(srcDoc, indices);
|
|
29882
|
+
pages.forEach((page) => newDoc.addPage(page));
|
|
29883
|
+
const bytes = await newDoc.save();
|
|
29884
|
+
if (typeof Buffer !== "undefined") {
|
|
29885
|
+
return Buffer.from(bytes).toString("base64");
|
|
29886
|
+
}
|
|
29887
|
+
let binary = "";
|
|
29888
|
+
const uint8 = new Uint8Array(bytes);
|
|
29889
|
+
for (let i = 0; i < uint8.length; i++) {
|
|
29890
|
+
binary += String.fromCharCode(uint8[i]);
|
|
29891
|
+
}
|
|
29892
|
+
return btoa(binary);
|
|
29893
|
+
}
|
|
29894
|
+
async function getPdfPageCount(pdfBase64) {
|
|
29895
|
+
const srcBytes = typeof Buffer !== "undefined" ? Buffer.from(pdfBase64, "base64") : Uint8Array.from(atob(pdfBase64), (c) => c.charCodeAt(0));
|
|
29896
|
+
const doc = await import_pdf_lib.PDFDocument.load(srcBytes, { ignoreEncryption: true });
|
|
29897
|
+
return doc.getPageCount();
|
|
29898
|
+
}
|
|
29899
|
+
function getAcroFormFields(pdfDoc) {
|
|
29900
|
+
const form = pdfDoc.getForm();
|
|
29901
|
+
const fields = form.getFields();
|
|
29902
|
+
if (fields.length === 0) return [];
|
|
29903
|
+
return fields.map((field) => {
|
|
29904
|
+
const name = field.getName();
|
|
29905
|
+
if (field instanceof import_pdf_lib.PDFTextField) {
|
|
29906
|
+
return { name, type: "text" };
|
|
29907
|
+
}
|
|
29908
|
+
if (field instanceof import_pdf_lib.PDFCheckBox) {
|
|
29909
|
+
return { name, type: "checkbox" };
|
|
29910
|
+
}
|
|
29911
|
+
if (field instanceof import_pdf_lib.PDFDropdown) {
|
|
29912
|
+
return { name, type: "dropdown", options: field.getOptions() };
|
|
29913
|
+
}
|
|
29914
|
+
if (field instanceof import_pdf_lib.PDFRadioGroup) {
|
|
29915
|
+
return { name, type: "radio", options: field.getOptions() };
|
|
29916
|
+
}
|
|
29917
|
+
return { name, type: "text" };
|
|
29918
|
+
});
|
|
29919
|
+
}
|
|
29920
|
+
async function fillAcroForm(pdfBytes, mappings) {
|
|
29921
|
+
const pdfDoc = await import_pdf_lib.PDFDocument.load(pdfBytes, { ignoreEncryption: true });
|
|
29922
|
+
const form = pdfDoc.getForm();
|
|
29923
|
+
for (const { acroFormName, value } of mappings) {
|
|
29924
|
+
try {
|
|
29925
|
+
const field = form.getField(acroFormName);
|
|
29926
|
+
if (field instanceof import_pdf_lib.PDFTextField) {
|
|
29927
|
+
field.setText(value);
|
|
29928
|
+
} else if (field instanceof import_pdf_lib.PDFCheckBox) {
|
|
29929
|
+
const lower = value.toLowerCase();
|
|
29930
|
+
if (["yes", "true", "x", "checked", "on"].includes(lower)) {
|
|
29931
|
+
field.check();
|
|
29932
|
+
} else {
|
|
29933
|
+
field.uncheck();
|
|
29934
|
+
}
|
|
29935
|
+
} else if (field instanceof import_pdf_lib.PDFDropdown) {
|
|
29936
|
+
try {
|
|
29937
|
+
field.select(value);
|
|
29938
|
+
} catch {
|
|
29939
|
+
}
|
|
29940
|
+
} else if (field instanceof import_pdf_lib.PDFRadioGroup) {
|
|
29941
|
+
try {
|
|
29942
|
+
field.select(value);
|
|
29943
|
+
} catch {
|
|
29944
|
+
}
|
|
29945
|
+
}
|
|
29946
|
+
} catch {
|
|
29947
|
+
}
|
|
29948
|
+
}
|
|
29949
|
+
form.flatten();
|
|
29950
|
+
return await pdfDoc.save();
|
|
29951
|
+
}
|
|
29952
|
+
async function overlayTextOnPdf(pdfBytes, overlays) {
|
|
29953
|
+
const pdfDoc = await import_pdf_lib.PDFDocument.load(pdfBytes, { ignoreEncryption: true });
|
|
29954
|
+
const font = await pdfDoc.embedFont(import_pdf_lib.StandardFonts.Helvetica);
|
|
29955
|
+
const pageCount = pdfDoc.getPageCount();
|
|
29956
|
+
for (const overlay of overlays) {
|
|
29957
|
+
if (overlay.page < 0 || overlay.page >= pageCount) continue;
|
|
29958
|
+
const page = pdfDoc.getPage(overlay.page);
|
|
29959
|
+
const { width, height } = page.getSize();
|
|
29960
|
+
const fontSize = overlay.fontSize ?? 10;
|
|
29961
|
+
const x = overlay.x / 100 * width;
|
|
29962
|
+
const y = height - overlay.y / 100 * height - fontSize;
|
|
29963
|
+
if (overlay.isCheckmark) {
|
|
29964
|
+
page.drawText("X", {
|
|
29965
|
+
x,
|
|
29966
|
+
y,
|
|
29967
|
+
size: fontSize,
|
|
29968
|
+
font,
|
|
29969
|
+
color: (0, import_pdf_lib.rgb)(0, 0, 0)
|
|
29970
|
+
});
|
|
29971
|
+
} else {
|
|
29972
|
+
page.drawText(overlay.text, {
|
|
29973
|
+
x,
|
|
29974
|
+
y,
|
|
29975
|
+
size: fontSize,
|
|
29976
|
+
font,
|
|
29977
|
+
color: (0, import_pdf_lib.rgb)(0, 0, 0)
|
|
29978
|
+
});
|
|
29979
|
+
}
|
|
29980
|
+
}
|
|
29981
|
+
return await pdfDoc.save();
|
|
29982
|
+
}
|
|
29983
|
+
|
|
29984
|
+
// src/extraction/pipeline.ts
|
|
29865
29985
|
var SONNET_MODEL = "claude-sonnet-4-6";
|
|
29866
29986
|
var HAIKU_MODEL = "claude-haiku-4-5-20251001";
|
|
29867
29987
|
var DEFAULT_METADATA_PROVIDER_OPTIONS = {
|
|
@@ -29998,8 +30118,10 @@ function getPageChunks(totalPages, chunkSize = 30) {
|
|
|
29998
30118
|
}
|
|
29999
30119
|
return chunks;
|
|
30000
30120
|
}
|
|
30001
|
-
async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, log, onTokenUsage) {
|
|
30002
|
-
await
|
|
30121
|
+
async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, log, onTokenUsage, pageRange) {
|
|
30122
|
+
const pdfToSend = pageRange ? await extractPageRange(pdfBase64, pageRange[0], pageRange[1]) : pdfBase64;
|
|
30123
|
+
const rangeLabel = pageRange ? ` [pages ${pageRange[0]}\u2013${pageRange[1]}]` : "";
|
|
30124
|
+
await log?.(`Calling model (max ${maxTokens} tokens)${rangeLabel}...`);
|
|
30003
30125
|
const start = Date.now();
|
|
30004
30126
|
const { text, usage } = await withRetry(
|
|
30005
30127
|
() => (0, import_ai.generateText)({
|
|
@@ -30008,7 +30130,7 @@ async function callModel(model, pdfBase64, prompt, maxTokens, providerOptions, l
|
|
|
30008
30130
|
messages: [{
|
|
30009
30131
|
role: "user",
|
|
30010
30132
|
content: [
|
|
30011
|
-
{ type: "file", data:
|
|
30133
|
+
{ type: "file", data: pdfToSend, mediaType: "application/pdf" },
|
|
30012
30134
|
{ type: "text", text: prompt }
|
|
30013
30135
|
]
|
|
30014
30136
|
}],
|
|
@@ -30114,7 +30236,9 @@ async function classifyDocumentType(pdfBase64, options) {
|
|
|
30114
30236
|
MODEL_TOKEN_LIMITS.classification,
|
|
30115
30237
|
void 0,
|
|
30116
30238
|
log,
|
|
30117
|
-
onTokenUsage
|
|
30239
|
+
onTokenUsage,
|
|
30240
|
+
[1, 3]
|
|
30241
|
+
// Only need first 3 pages for classification
|
|
30118
30242
|
);
|
|
30119
30243
|
try {
|
|
30120
30244
|
const parsed = JSON.parse(stripFences(raw));
|
|
@@ -30203,7 +30327,9 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30203
30327
|
MODEL_TOKEN_LIMITS.sections,
|
|
30204
30328
|
void 0,
|
|
30205
30329
|
log,
|
|
30206
|
-
onTokenUsage
|
|
30330
|
+
onTokenUsage,
|
|
30331
|
+
[start, end]
|
|
30332
|
+
// Only send this chunk's pages
|
|
30207
30333
|
);
|
|
30208
30334
|
try {
|
|
30209
30335
|
return [JSON.parse(stripFences(chunkRaw))];
|
|
@@ -30245,7 +30371,9 @@ async function extractChunkWithRetry(models, pdfBase64, start, end, sizeIndex, p
|
|
|
30245
30371
|
MODEL_TOKEN_LIMITS.sectionsFallback,
|
|
30246
30372
|
fallbackProviderOptions,
|
|
30247
30373
|
log,
|
|
30248
|
-
onTokenUsage
|
|
30374
|
+
onTokenUsage,
|
|
30375
|
+
[start, end]
|
|
30376
|
+
// Only send this chunk's pages
|
|
30249
30377
|
);
|
|
30250
30378
|
try {
|
|
30251
30379
|
return [JSON.parse(stripFences(fallbackRaw))];
|
|
@@ -30288,7 +30416,9 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30288
30416
|
onTokenUsage
|
|
30289
30417
|
} = options ?? {};
|
|
30290
30418
|
const resolved = resolveModels(models);
|
|
30419
|
+
const actualPageCount = await getPdfPageCount(pdfBase64);
|
|
30291
30420
|
await log?.("Pass 1: Extracting metadata...");
|
|
30421
|
+
const metadataPageRange = [1, Math.min(10, actualPageCount)];
|
|
30292
30422
|
const metadataRaw = await callModel(
|
|
30293
30423
|
resolved.metadata,
|
|
30294
30424
|
pdfBase64,
|
|
@@ -30296,7 +30426,8 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30296
30426
|
MODEL_TOKEN_LIMITS.metadata,
|
|
30297
30427
|
metadataProviderOptions,
|
|
30298
30428
|
log,
|
|
30299
|
-
onTokenUsage
|
|
30429
|
+
onTokenUsage,
|
|
30430
|
+
metadataPageRange
|
|
30300
30431
|
);
|
|
30301
30432
|
let metadataResult;
|
|
30302
30433
|
try {
|
|
@@ -30307,7 +30438,7 @@ async function extractFromPdf(pdfBase64, options) {
|
|
|
30307
30438
|
throw new Error(`Metadata JSON parse failed: ${e.message}`);
|
|
30308
30439
|
}
|
|
30309
30440
|
await onMetadata?.(metadataRaw);
|
|
30310
|
-
const pageCount =
|
|
30441
|
+
const pageCount = actualPageCount;
|
|
30311
30442
|
await log?.(`Document: ${pageCount} page(s)`);
|
|
30312
30443
|
const sectionChunks = await extractSectionChunks(
|
|
30313
30444
|
resolved,
|
|
@@ -30375,7 +30506,9 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30375
30506
|
onTokenUsage
|
|
30376
30507
|
} = options ?? {};
|
|
30377
30508
|
const resolved = resolveModels(models);
|
|
30509
|
+
const actualPageCount = await getPdfPageCount(pdfBase64);
|
|
30378
30510
|
await log?.("Pass 1: Extracting quote metadata...");
|
|
30511
|
+
const metadataPageRange = [1, Math.min(10, actualPageCount)];
|
|
30379
30512
|
const metadataRaw = await callModel(
|
|
30380
30513
|
resolved.metadata,
|
|
30381
30514
|
pdfBase64,
|
|
@@ -30383,7 +30516,8 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30383
30516
|
MODEL_TOKEN_LIMITS.metadata,
|
|
30384
30517
|
metadataProviderOptions,
|
|
30385
30518
|
log,
|
|
30386
|
-
onTokenUsage
|
|
30519
|
+
onTokenUsage,
|
|
30520
|
+
metadataPageRange
|
|
30387
30521
|
);
|
|
30388
30522
|
let metadataResult;
|
|
30389
30523
|
try {
|
|
@@ -30394,7 +30528,7 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30394
30528
|
throw new Error(`Quote metadata JSON parse failed: ${e.message}`);
|
|
30395
30529
|
}
|
|
30396
30530
|
await onMetadata?.(metadataRaw);
|
|
30397
|
-
const pageCount =
|
|
30531
|
+
const pageCount = actualPageCount;
|
|
30398
30532
|
await log?.(`Quote document: ${pageCount} page(s)`);
|
|
30399
30533
|
const sectionChunks = await extractSectionChunks(
|
|
30400
30534
|
resolved,
|
|
@@ -30411,93 +30545,6 @@ async function extractQuoteFromPdf(pdfBase64, options) {
|
|
|
30411
30545
|
const mergedRaw = JSON.stringify(merged);
|
|
30412
30546
|
return { rawText: mergedRaw, extracted: merged };
|
|
30413
30547
|
}
|
|
30414
|
-
|
|
30415
|
-
// src/extraction/pdf.ts
|
|
30416
|
-
var import_pdf_lib = require("pdf-lib");
|
|
30417
|
-
function getAcroFormFields(pdfDoc) {
|
|
30418
|
-
const form = pdfDoc.getForm();
|
|
30419
|
-
const fields = form.getFields();
|
|
30420
|
-
if (fields.length === 0) return [];
|
|
30421
|
-
return fields.map((field) => {
|
|
30422
|
-
const name = field.getName();
|
|
30423
|
-
if (field instanceof import_pdf_lib.PDFTextField) {
|
|
30424
|
-
return { name, type: "text" };
|
|
30425
|
-
}
|
|
30426
|
-
if (field instanceof import_pdf_lib.PDFCheckBox) {
|
|
30427
|
-
return { name, type: "checkbox" };
|
|
30428
|
-
}
|
|
30429
|
-
if (field instanceof import_pdf_lib.PDFDropdown) {
|
|
30430
|
-
return { name, type: "dropdown", options: field.getOptions() };
|
|
30431
|
-
}
|
|
30432
|
-
if (field instanceof import_pdf_lib.PDFRadioGroup) {
|
|
30433
|
-
return { name, type: "radio", options: field.getOptions() };
|
|
30434
|
-
}
|
|
30435
|
-
return { name, type: "text" };
|
|
30436
|
-
});
|
|
30437
|
-
}
|
|
30438
|
-
async function fillAcroForm(pdfBytes, mappings) {
|
|
30439
|
-
const pdfDoc = await import_pdf_lib.PDFDocument.load(pdfBytes, { ignoreEncryption: true });
|
|
30440
|
-
const form = pdfDoc.getForm();
|
|
30441
|
-
for (const { acroFormName, value } of mappings) {
|
|
30442
|
-
try {
|
|
30443
|
-
const field = form.getField(acroFormName);
|
|
30444
|
-
if (field instanceof import_pdf_lib.PDFTextField) {
|
|
30445
|
-
field.setText(value);
|
|
30446
|
-
} else if (field instanceof import_pdf_lib.PDFCheckBox) {
|
|
30447
|
-
const lower = value.toLowerCase();
|
|
30448
|
-
if (["yes", "true", "x", "checked", "on"].includes(lower)) {
|
|
30449
|
-
field.check();
|
|
30450
|
-
} else {
|
|
30451
|
-
field.uncheck();
|
|
30452
|
-
}
|
|
30453
|
-
} else if (field instanceof import_pdf_lib.PDFDropdown) {
|
|
30454
|
-
try {
|
|
30455
|
-
field.select(value);
|
|
30456
|
-
} catch {
|
|
30457
|
-
}
|
|
30458
|
-
} else if (field instanceof import_pdf_lib.PDFRadioGroup) {
|
|
30459
|
-
try {
|
|
30460
|
-
field.select(value);
|
|
30461
|
-
} catch {
|
|
30462
|
-
}
|
|
30463
|
-
}
|
|
30464
|
-
} catch {
|
|
30465
|
-
}
|
|
30466
|
-
}
|
|
30467
|
-
form.flatten();
|
|
30468
|
-
return await pdfDoc.save();
|
|
30469
|
-
}
|
|
30470
|
-
async function overlayTextOnPdf(pdfBytes, overlays) {
|
|
30471
|
-
const pdfDoc = await import_pdf_lib.PDFDocument.load(pdfBytes, { ignoreEncryption: true });
|
|
30472
|
-
const font = await pdfDoc.embedFont(import_pdf_lib.StandardFonts.Helvetica);
|
|
30473
|
-
const pageCount = pdfDoc.getPageCount();
|
|
30474
|
-
for (const overlay of overlays) {
|
|
30475
|
-
if (overlay.page < 0 || overlay.page >= pageCount) continue;
|
|
30476
|
-
const page = pdfDoc.getPage(overlay.page);
|
|
30477
|
-
const { width, height } = page.getSize();
|
|
30478
|
-
const fontSize = overlay.fontSize ?? 10;
|
|
30479
|
-
const x = overlay.x / 100 * width;
|
|
30480
|
-
const y = height - overlay.y / 100 * height - fontSize;
|
|
30481
|
-
if (overlay.isCheckmark) {
|
|
30482
|
-
page.drawText("X", {
|
|
30483
|
-
x,
|
|
30484
|
-
y,
|
|
30485
|
-
size: fontSize,
|
|
30486
|
-
font,
|
|
30487
|
-
color: (0, import_pdf_lib.rgb)(0, 0, 0)
|
|
30488
|
-
});
|
|
30489
|
-
} else {
|
|
30490
|
-
page.drawText(overlay.text, {
|
|
30491
|
-
x,
|
|
30492
|
-
y,
|
|
30493
|
-
size: fontSize,
|
|
30494
|
-
font,
|
|
30495
|
-
color: (0, import_pdf_lib.rgb)(0, 0, 0)
|
|
30496
|
-
});
|
|
30497
|
-
}
|
|
30498
|
-
}
|
|
30499
|
-
return await pdfDoc.save();
|
|
30500
|
-
}
|
|
30501
30548
|
// Annotate the CommonJS export names for ESM import in node:
|
|
30502
30549
|
0 && (module.exports = {
|
|
30503
30550
|
AGENT_TOOLS,
|
|
@@ -30550,11 +30597,13 @@ async function overlayTextOnPdf(pdfBytes, overlays) {
|
|
|
30550
30597
|
createUniformModelConfig,
|
|
30551
30598
|
enrichSupplementaryFields,
|
|
30552
30599
|
extractFromPdf,
|
|
30600
|
+
extractPageRange,
|
|
30553
30601
|
extractQuoteFromPdf,
|
|
30554
30602
|
extractSectionsOnly,
|
|
30555
30603
|
fillAcroForm,
|
|
30556
30604
|
getAcroFormFields,
|
|
30557
30605
|
getPageChunks,
|
|
30606
|
+
getPdfPageCount,
|
|
30558
30607
|
mergeChunkedQuoteSections,
|
|
30559
30608
|
mergeChunkedSections,
|
|
30560
30609
|
overlayTextOnPdf,
|