@doclo/providers-llm 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-7YPJIWRM.js +291 -0
- package/dist/chunk-7YPJIWRM.js.map +1 -0
- package/dist/index.d.ts +275 -4
- package/dist/index.js +317 -146
- package/dist/index.js.map +1 -1
- package/dist/schema-prompt-formatter-AIORLWUF.js +29 -0
- package/dist/schema-prompt-formatter-AIORLWUF.js.map +1 -0
- package/package.json +2 -2
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
// src/schema-prompt-formatter.ts
|
|
2
|
+
/**
 * Renders the properties of an object-typed JSON Schema as a bullet list for
 * inclusion in an LLM prompt.
 *
 * Each property becomes one `- \`name\` (REQUIRED|optional): type` line,
 * optionally followed by its description and allowed enum values. Nested
 * objects and array item schemas are rendered recursively at a deeper indent
 * (one space per indent level).
 *
 * @param {object} schema - JSON Schema node to render.
 * @param {number} [indent=0] - Current nesting depth.
 * @returns {string} The rendered listing, or "" when `schema` is not an object
 *   schema with `properties`.
 */
function formatSchemaForPrompt(schema, indent = 0) {
  if (!schema || typeof schema !== "object") {
    return "";
  }

  // `type` may be a single name or a union such as ["object", "null"]; a
  // nullable object/array still has a structure worth describing.
  const hasType = (node, name) =>
    Array.isArray(node.type) ? node.type.includes(name) : node.type === name;

  if (!hasType(schema, "object") || !schema.properties) {
    return "";
  }

  const indentStr = " ".repeat(indent);
  const required = Array.isArray(schema.required) ? schema.required : [];
  let result = "";

  for (const [fieldName, rawFieldSchema] of Object.entries(schema.properties)) {
    // A property mapped to null/undefined has nothing to inspect; treat it as
    // an empty schema so it is still listed (as "any") instead of throwing.
    const fieldSchema = rawFieldSchema == null ? {} : rawFieldSchema;
    const requiredMarker = required.includes(fieldName) ? " (REQUIRED)" : " (optional)";

    // Field name in backticks to emphasise that it must be used verbatim.
    result += `${indentStr}- \`${fieldName}\`${requiredMarker}`;

    const type = getTypeDescription(fieldSchema);
    if (type) {
      result += `: ${type}`;
    }
    if (fieldSchema.description) {
      result += `\n${indentStr} ${fieldSchema.description}`;
    }
    if (Array.isArray(fieldSchema.enum)) {
      const allowed = fieldSchema.enum.map((v) => JSON.stringify(v)).join(", ");
      result += `\n${indentStr} Allowed values: ${allowed}`;
    }
    result += "\n";

    if (hasType(fieldSchema, "object") && fieldSchema.properties) {
      result += formatSchemaForPrompt(fieldSchema, indent + 1);
    }

    if (hasType(fieldSchema, "array") && fieldSchema.items) {
      result += `${indentStr} Array items:\n`;
      // For tuple-style `items` (an array of schemas) only the first entry is described.
      const itemSchema = Array.isArray(fieldSchema.items) ? fieldSchema.items[0] : fieldSchema.items;
      if (itemSchema && hasType(itemSchema, "object") && itemSchema.properties) {
        result += formatSchemaForPrompt(itemSchema, indent + 2);
      } else if (itemSchema) {
        result += `${indentStr} ${getTypeDescription(itemSchema)}\n`;
      }
    }
  }

  return result;
}
|
|
47
|
+
// Example renderings for well-known string formats, keyed by the JSON Schema
// `format` value. A Map is used instead of an object literal so that formats
// such as "constructor" or "toString" cannot resolve to members inherited
// from Object.prototype.
const STRING_FORMAT_HINTS = new Map([
  ["date", "YYYY-MM-DD"],
  ["time", "HH:MM or HH:MM:SS"],
  ["date-time", "YYYY-MM-DDTHH:MM:SS (ISO 8601)"]
]);

/**
 * Produces a short human-readable type description for a schema node.
 *
 * - `type` given as a name: the name, refined to `array of <item type>` for
 *   arrays with a typed single `items` schema and to
 *   `string (format: ...)` for strings carrying a `format`.
 * - `type` given as a union (array of names): every member is described as
 *   above and the results are joined with " | ", so nullability is preserved.
 * - No `type`: `anyOf` / `oneOf` members are joined with " OR "; `allOf`
 *   members are joined with " AND " (members that describe as "any" are
 *   omitted because they add no type information).
 *
 * @param {object} schema - JSON Schema node.
 * @returns {string} The description; "any" when nothing more specific is known.
 */
function getTypeDescription(schema) {
  if (!schema) return "any";

  if (schema.type) {
    const isUnion = Array.isArray(schema.type) && schema.type.length > 1;

    const describeMember = (member) => {
      if (member === "array") {
        const items = schema.items;
        if (items && !Array.isArray(items) && items.type) {
          const itemIsUnion = Array.isArray(items.type) && items.type.length > 1;
          const itemType = Array.isArray(items.type) ? items.type.join(" | ") : items.type;
          // Parenthesise only when both levels are unions, where a flat
          // "a | b | c" would be ambiguous.
          return isUnion && itemIsUnion ? `array of (${itemType})` : `array of ${itemType}`;
        }
        return "array";
      }
      if (member === "string" && schema.format) {
        const hint = STRING_FORMAT_HINTS.get(schema.format);
        return hint
          ? `string (format: ${schema.format}, use ${hint})`
          : `string (format: ${schema.format})`;
      }
      return member;
    };

    return Array.isArray(schema.type)
      ? schema.type.map(describeMember).join(" | ")
      : describeMember(schema.type);
  }

  if (schema.anyOf) {
    return schema.anyOf.map((s) => getTypeDescription(s)).join(" OR ");
  }
  if (schema.oneOf) {
    return schema.oneOf.map((s) => getTypeDescription(s)).join(" OR ");
  }
  if (schema.allOf) {
    const informative = schema.allOf
      .map((s) => getTypeDescription(s))
      .filter((description) => description !== "any");
    return informative.length > 0 ? informative.join(" AND ") : "any";
  }

  return "any";
}
|
|
80
|
+
/**
 * Builds the banner-delimited prompt section that lists the schema's fields
 * (via formatSchemaForPrompt) followed by the field-naming and missing-data
 * rules.
 *
 * @param {object} schema - JSON Schema describing the expected output.
 * @returns {string} The complete prompt section.
 */
function buildSchemaPromptSection(schema) {
  const divider = "==================================================";
  const fieldListing = formatSchemaForPrompt(schema);

  const lines = [
    divider,
    "CRITICAL: OUTPUT STRUCTURE REQUIREMENTS",
    divider,
    "",
    "YOU MUST RETURN JSON MATCHING THIS EXACT STRUCTURE:",
    "",
    fieldListing,
    "",
    "CRITICAL FIELD NAME REQUIREMENTS:",
    "\u2713 Use EXACTLY the field names shown above (character-for-character match)",
    '\u2713 Preserve the exact casing (e.g., "fullName", not "full_name" or "FullName")',
    '\u2713 Do NOT abbreviate field names (e.g., "dob" instead of "dateOfBirth")',
    '\u2713 Do NOT invent alternative names (e.g., "directorName" instead of "fullName")',
    "\u2713 Do NOT use snake_case if the schema uses camelCase",
    "\u2713 Do NOT flatten nested structures or rename nested fields",
    "\u2713 The schema above is the SINGLE SOURCE OF TRUTH for field naming",
    "",
    "MISSING DATA:",
    "- If a required field has no data in the document, use null",
    "- If an optional field has no data, you may omit it or use null",
    "- Do NOT invent data that isn't in the document",
    "",
    divider
  ];

  // The section starts and ends with a divider, so no trimming is needed.
  return lines.join("\n");
}
|
|
108
|
+
/**
 * Prepends the schema prompt section to the caller's prompt. When the caller
 * supplies no prompt (missing, empty, or whitespace-only) a generic extraction
 * task line is used instead.
 *
 * @param {object} schema - JSON Schema describing the expected output.
 * @param {string} userPrompt - Caller-supplied instructions; may be empty.
 * @returns {string} Schema section and task text separated by a blank line.
 */
function combineSchemaAndUserPrompt(schema, userPrompt) {
  const header = buildSchemaPromptSection(schema);
  const hasUserPrompt = Boolean(userPrompt) && userPrompt.trim() !== "";
  const task = hasUserPrompt
    ? userPrompt
    : "TASK: Extract structured data from the provided document.";
  return header + "\n\n" + task;
}
|
|
115
|
+
/**
 * Builds the formatting instructions selected by `options.outputFormat`,
 * `options.tableFormat` and `options.pageMarkers`.
 *
 * Unrecognised format values contribute nothing.
 *
 * @param {object} options - Option bag; only the three fields above are read.
 * @returns {string} Selected instructions, one per line; "" when none apply.
 */
function buildOutputFormatPrompt(options) {
  // Maps (not object literals) so that only the listed keys can ever match.
  const textInstructions = new Map([
    ["markdown", "Format all text content using markdown syntax. Use proper headings (#, ##, ###), lists (-, *), bold (**text**), and other markdown formatting where appropriate."],
    ["html", "Format all text content as valid HTML. Use semantic tags like <p>, <h1>-<h6>, <ul>, <ol>, <strong>, <em> where appropriate."],
    ["json", "For text fields that contain structured data, format them as embedded JSON strings where appropriate."],
    ["text", "Return plain text without any formatting. No markdown, HTML, or other markup."]
  ]);
  const tableInstructions = new Map([
    ["markdown", "Format all tables using markdown table syntax with | column separators and header row with ---."],
    ["html", "Format all tables as HTML <table> elements with <thead>, <tbody>, <tr>, <th>, and <td> tags."],
    ["csv", "Format all tables as CSV with headers in the first row and comma-separated values."]
  ]);

  const selected = [
    textInstructions.get(options.outputFormat),
    tableInstructions.get(options.tableFormat),
    options.pageMarkers
      ? 'Insert "---" page break markers between content from different pages of the document.'
      : undefined
  ];

  return selected.filter((instruction) => instruction !== undefined).join("\n");
}
|
|
151
|
+
/**
 * Builds the sentence telling the model which language(s) the document uses.
 *
 * @param {string[]} languages - Language names or codes, listed as given.
 * @returns {string} The hint sentence, or "" when no languages are supplied.
 */
function buildLanguageHintsPrompt(languages) {
  const hasLanguages = Boolean(languages) && languages.length !== 0;
  if (hasLanguages) {
    const listed = languages.join(", ");
    return `The document is written in ${listed}. Extract and preserve text in the original language(s).`;
  }
  return "";
}
|
|
157
|
+
/**
 * Builds the instruction block asking the model to report a per-field
 * confidence score in a sibling "_confidence" object.
 *
 * The five score bands are indented beneath the "Use a number from 0.0 to
 * 1.0" bullet so they read as its sub-items rather than as unrelated
 * top-level instructions.
 *
 * @returns {string} The confidence-scoring instructions.
 */
function buildConfidencePrompt() {
  const scoreBands = [
    "0.9-1.0: Very high confidence - text is clear and unambiguous",
    "0.7-0.9: High confidence - minor ambiguity but likely correct",
    "0.5-0.7: Medium confidence - some uncertainty or partial visibility",
    "0.3-0.5: Low confidence - significant uncertainty",
    "0.0-0.3: Very low confidence - guessing or text was unclear"
  ];

  return [
    'For each extracted field, assess your confidence level and include it in the "_confidence" object:',
    "- Use a number from 0.0 to 1.0 where:",
    ...scoreBands.map((band) => `  - ${band}`),
    "",
    'Include "_confidence" as a sibling object mapping field paths to their scores.',
    'Example: "_confidence": { "invoiceNumber": 0.95, "amount": 0.82 }'
  ].join("\n");
}
|
|
171
|
+
/**
 * Builds the instruction block asking the model to cite, for every extracted
 * field, the source text, bounding box and page in a sibling "_sources" array.
 *
 * @returns {string} The source-citation instructions.
 */
function buildSourcesPrompt() {
  const entryFields = [
    '- "field": The field name/path that was extracted',
    '- "text": The exact text from the document used for extraction',
    '- "bbox": Bounding box as [y_min, x_min, y_max, x_max] normalized to 0-1000 scale',
    '- "page": Page number (0-indexed) where the text appears'
  ];

  return [
    'For each extracted field, identify the source location in the document and include it in the "_sources" array:',
    "Each source entry should contain:",
    ...entryFields,
    "",
    'Include "_sources" as a sibling array to your extracted data.',
    'Example: "_sources": [{ "field": "invoiceNumber", "text": "INV-001", "bbox": [100, 50, 120, 150], "page": 0 }]'
  ].join("\n");
}
|
|
184
|
+
/**
 * Builds the instruction block asking the model to label each extracted
 * element with a block type in a "_blockTypes" object.
 *
 * @returns {string} The block-classification instructions.
 */
function buildBlockClassificationPrompt() {
  // [label, explanation] pairs, rendered as `- "label": explanation`.
  const blockTypes = [
    ["title", "Main document title or major section headers"],
    ["heading", "Section headings and subheadings"],
    ["paragraph", "Body text paragraphs"],
    ["table", "Tabular data"],
    ["list", "Bulleted or numbered lists"],
    ["header", "Page headers (repeated at top of pages)"],
    ["footer", "Page footers (repeated at bottom of pages)"],
    ["caption", "Image or figure captions"],
    ["code", "Code blocks or preformatted text"]
  ];

  return [
    'For each extracted element or text block, classify its type in a "_blockTypes" object:',
    ...blockTypes.map(([label, explanation]) => `- "${label}": ${explanation}`),
    "",
    'Include "_blockTypes" mapping field paths to their block type.',
    'Example: "_blockTypes": { "summary": "paragraph", "items": "list" }'
  ].join("\n");
}
|
|
201
|
+
/**
 * Builds the instructions for extracting repeated page headers and/or footers
 * into "_headers" / "_footers" arrays.
 *
 * @param {object} options - Reads `extractHeaders` and `extractFooters`.
 * @returns {string} One line per requested feature plus a line describing the
 *   entry shape; "" when neither feature is requested.
 */
function buildHeaderFooterPrompt(options) {
  const wantsHeaders = Boolean(options.extractHeaders);
  const wantsFooters = Boolean(options.extractFooters);

  if (!wantsHeaders && !wantsFooters) {
    return "";
  }

  const lines = [];
  if (wantsHeaders) {
    lines.push('Identify and extract document headers (repeated content at the top of pages) into a "_headers" array.');
  }
  if (wantsFooters) {
    lines.push('Identify and extract document footers (repeated content at the bottom of pages, like page numbers) into a "_footers" array.');
  }
  // The entry-shape description is shared by both features.
  lines.push('Each header/footer entry should include: { "text": "...", "pages": [0, 1, 2] } listing which pages contain it.');

  return lines.join("\n");
}
|
|
214
|
+
/**
 * Builds the instruction describing how the model should split its output
 * into chunks.
 *
 * @param {string} strategy - One of "page", "section", "paragraph", "semantic".
 * @param {number} [maxChunkSize] - When truthy, a size note is appended.
 * @returns {string} The chunking instruction, or "" for any other strategy.
 */
function buildChunkingPrompt(strategy, maxChunkSize) {
  const sizeSuffix = maxChunkSize ? ` Keep chunks under ${maxChunkSize} characters when possible.` : "";

  const instructionByStrategy = new Map([
    ["page", "Organize the extracted content by page. Include page number for each chunk."],
    ["section", "Divide the document into logical sections based on headings and structure. Each section should be a coherent unit."],
    ["paragraph", "Divide the content into individual paragraphs, preserving the natural paragraph breaks from the document."],
    ["semantic", "Divide the document into semantically coherent chunks. Each chunk should be a self-contained unit of meaning that could stand alone."]
  ]);

  if (!instructionByStrategy.has(strategy)) {
    return "";
  }
  return instructionByStrategy.get(strategy) + sizeSuffix;
}
|
|
229
|
+
/**
 * Assembles every prompt fragment enabled in `options` into one
 * banner-delimited "ADDITIONAL OUTPUT REQUIREMENTS" section.
 *
 * Fragments appear in a fixed order: output/table format and page markers,
 * language hints, confidence, sources, block types, header/footer extraction,
 * chunking. Builders that return an empty string (for example an unrecognised
 * chunking strategy) are skipped, so they can neither leave a blank paragraph
 * nor produce a section with no content.
 *
 * @param {object} options - Derived-feature option bag.
 * @returns {string} The assembled section, or "" when no fragment applies.
 */
function buildLLMDerivedFeaturesPrompt(options) {
  const parts = [];
  const addIfPresent = (fragment) => {
    if (fragment) {
      parts.push(fragment);
    }
  };

  addIfPresent(buildOutputFormatPrompt(options));

  if (options.languageHints && options.languageHints.length > 0) {
    addIfPresent(buildLanguageHintsPrompt(options.languageHints));
  }
  if (options.includeConfidence) {
    addIfPresent(buildConfidencePrompt());
  }
  if (options.includeSources) {
    addIfPresent(buildSourcesPrompt());
  }
  if (options.includeBlockTypes) {
    addIfPresent(buildBlockClassificationPrompt());
  }
  if (options.extractHeaders || options.extractFooters) {
    addIfPresent(buildHeaderFooterPrompt(options));
  }
  if (options.chunkingStrategy) {
    addIfPresent(buildChunkingPrompt(options.chunkingStrategy, options.maxChunkSize));
  }

  if (parts.length === 0) {
    return "";
  }

  return `
==================================================
ADDITIONAL OUTPUT REQUIREMENTS
==================================================

${parts.join("\n\n")}

==================================================
`.trim();
}
|
|
266
|
+
/**
 * Builds the full prompt: schema section plus user prompt, optionally followed
 * by the derived-features section.
 *
 * @param {object} schema - JSON Schema describing the expected output.
 * @param {string} userPrompt - Caller-supplied instructions; may be empty.
 * @param {object} [derivedOptions] - Derived-feature options; skipped when falsy.
 * @returns {string} The combined prompt; the derived section is appended after
 *   a blank line only when it is non-empty.
 */
function combineSchemaUserAndDerivedPrompts(schema, userPrompt, derivedOptions) {
  const basePrompt = combineSchemaAndUserPrompt(schema, userPrompt);
  if (!derivedOptions) {
    return basePrompt;
  }

  const derivedSection = buildLLMDerivedFeaturesPrompt(derivedOptions);
  return derivedSection ? basePrompt + "\n\n" + derivedSection : basePrompt;
}
|
|
276
|
+
|
|
277
|
+
export {
|
|
278
|
+
formatSchemaForPrompt,
|
|
279
|
+
buildSchemaPromptSection,
|
|
280
|
+
combineSchemaAndUserPrompt,
|
|
281
|
+
buildOutputFormatPrompt,
|
|
282
|
+
buildLanguageHintsPrompt,
|
|
283
|
+
buildConfidencePrompt,
|
|
284
|
+
buildSourcesPrompt,
|
|
285
|
+
buildBlockClassificationPrompt,
|
|
286
|
+
buildHeaderFooterPrompt,
|
|
287
|
+
buildChunkingPrompt,
|
|
288
|
+
buildLLMDerivedFeaturesPrompt,
|
|
289
|
+
combineSchemaUserAndDerivedPrompts
|
|
290
|
+
};
|
|
291
|
+
//# sourceMappingURL=chunk-7YPJIWRM.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/schema-prompt-formatter.ts"],"sourcesContent":["/**\n * Utility for converting JSON Schema to human-readable prompt text\n * that emphasizes exact field name requirements for structured extraction.\n */\n\n/**\n * JSON Schema type used for prompt formatting.\n * Uses a recursive structure to support nested schemas.\n */\nexport interface JSONSchema {\n type?: string | string[]; // Can be array for union types (e.g., [\"string\", \"null\"])\n properties?: Record<string, JSONSchema>;\n items?: JSONSchema | JSONSchema[]; // Can be array for tuple validation\n description?: string;\n required?: string[];\n enum?: (string | number | boolean | null)[];\n anyOf?: JSONSchema[];\n oneOf?: JSONSchema[];\n allOf?: JSONSchema[];\n format?: string;\n [key: string]: unknown; // Allow additional properties\n}\n\n/**\n * Formats a JSON Schema into prompt text that emphasizes exact field names.\n * This helps LLMs understand they must use the exact field names specified\n * in the schema, not invent their own based on document content.\n */\nexport function formatSchemaForPrompt(schema: JSONSchema, indent: number = 0): string {\n if (!schema || typeof schema !== 'object') {\n return '';\n }\n\n const indentStr = ' '.repeat(indent);\n let result = '';\n\n // Handle object type with properties\n if (schema.type === 'object' && schema.properties) {\n const properties = schema.properties;\n const required = schema.required || [];\n\n for (const [fieldName, fieldSchema] of Object.entries(properties)) {\n const isRequired = required.includes(fieldName);\n const requiredMarker = isRequired ? 
' (REQUIRED)' : ' (optional)';\n\n // Field name in backticks to emphasize exactness\n result += `${indentStr}- \\`${fieldName}\\`${requiredMarker}`;\n\n // Type information\n const type = getTypeDescription(fieldSchema);\n if (type) {\n result += `: ${type}`;\n }\n\n // Description if available\n if (fieldSchema.description) {\n result += `\\n${indentStr} ${fieldSchema.description}`;\n }\n\n // Enum values if specified\n if (fieldSchema.enum) {\n result += `\\n${indentStr} Allowed values: ${fieldSchema.enum.map((v) => JSON.stringify(v)).join(', ')}`;\n }\n\n result += '\\n';\n\n // Nested object properties\n if (fieldSchema.type === 'object' && fieldSchema.properties) {\n result += formatSchemaForPrompt(fieldSchema, indent + 1);\n }\n\n // Array item schema\n if (fieldSchema.type === 'array' && fieldSchema.items) {\n result += `${indentStr} Array items:\\n`;\n // Handle both single schema and tuple schemas (array of schemas)\n const itemSchema = Array.isArray(fieldSchema.items)\n ? fieldSchema.items[0] // For tuple validation, describe first item type\n : fieldSchema.items;\n if (itemSchema && itemSchema.type === 'object' && itemSchema.properties) {\n result += formatSchemaForPrompt(itemSchema, indent + 2);\n } else if (itemSchema) {\n const itemType = getTypeDescription(itemSchema);\n result += `${indentStr} ${itemType}\\n`;\n }\n }\n }\n }\n\n return result;\n}\n\n/**\n * Gets a human-readable type description from a schema property\n */\nfunction getTypeDescription(schema: JSONSchema): string {\n if (!schema) return 'any';\n\n if (schema.type) {\n // Handle array of types (e.g., [\"string\", \"null\"])\n const typeStr = Array.isArray(schema.type) ? schema.type.join(' | ') : schema.type;\n\n if (typeStr === 'array' || (Array.isArray(schema.type) && schema.type.includes('array'))) {\n if (schema.items && !Array.isArray(schema.items) && schema.items.type) {\n const itemType = Array.isArray(schema.items.type)\n ? 
schema.items.type.join(' | ')\n : schema.items.type;\n return `array of ${itemType}`;\n }\n return 'array';\n }\n // Include format information for strings (e.g., date, time, date-time, email, uri)\n if ((typeStr === 'string' || (Array.isArray(schema.type) && schema.type.includes('string'))) && schema.format) {\n const formatHints: Record<string, string> = {\n 'date': 'YYYY-MM-DD',\n 'time': 'HH:MM or HH:MM:SS',\n 'date-time': 'YYYY-MM-DDTHH:MM:SS (ISO 8601)',\n };\n const hint = formatHints[schema.format];\n if (hint) {\n return `string (format: ${schema.format}, use ${hint})`;\n }\n return `string (format: ${schema.format})`;\n }\n return typeStr;\n }\n\n // Handle anyOf, oneOf, allOf\n if (schema.anyOf) {\n return schema.anyOf.map((s) => getTypeDescription(s)).join(' OR ');\n }\n if (schema.oneOf) {\n return schema.oneOf.map((s) => getTypeDescription(s)).join(' OR ');\n }\n\n return 'any';\n}\n\n/**\n * Generates a complete prompt section with schema information and\n * strict field name instructions.\n */\nexport function buildSchemaPromptSection(schema: JSONSchema): string {\n const schemaFields = formatSchemaForPrompt(schema);\n\n return `\n==================================================\nCRITICAL: OUTPUT STRUCTURE REQUIREMENTS\n==================================================\n\nYOU MUST RETURN JSON MATCHING THIS EXACT STRUCTURE:\n\n${schemaFields}\n\nCRITICAL FIELD NAME REQUIREMENTS:\n✓ Use EXACTLY the field names shown above (character-for-character match)\n✓ Preserve the exact casing (e.g., \"fullName\", not \"full_name\" or \"FullName\")\n✓ Do NOT abbreviate field names (e.g., \"dob\" instead of \"dateOfBirth\")\n✓ Do NOT invent alternative names (e.g., \"directorName\" instead of \"fullName\")\n✓ Do NOT use snake_case if the schema uses camelCase\n✓ Do NOT flatten nested structures or rename nested fields\n✓ The schema above is the SINGLE SOURCE OF TRUTH for field naming\n\nMISSING DATA:\n- If a required field has no data in the document, use 
null\n- If an optional field has no data, you may omit it or use null\n- Do NOT invent data that isn't in the document\n\n==================================================\n`.trim();\n}\n\n/**\n * Combines schema prompt section with user's custom prompt\n */\nexport function combineSchemaAndUserPrompt(\n schema: JSONSchema,\n userPrompt: string\n): string {\n const schemaSection = buildSchemaPromptSection(schema);\n\n if (!userPrompt || userPrompt.trim() === '') {\n return schemaSection + '\\n\\nTASK: Extract structured data from the provided document.';\n }\n\n return schemaSection + '\\n\\n' + userPrompt;\n}\n\n// ============================================================================\n// LLM-Derived Feature Prompts\n// ============================================================================\n\n/**\n * Output format types for LLM text generation\n */\nexport type OutputFormat = 'markdown' | 'html' | 'json' | 'text';\nexport type TableFormat = 'markdown' | 'html' | 'csv';\nexport type ChunkingStrategy = 'page' | 'section' | 'paragraph' | 'semantic';\n\n/**\n * Options for LLM-derived features that are implemented via prompting\n */\nexport interface LLMDerivedPromptOptions {\n outputFormat?: OutputFormat;\n tableFormat?: TableFormat;\n pageMarkers?: boolean;\n includeConfidence?: boolean;\n includeSources?: boolean;\n includeBlockTypes?: boolean;\n extractHeaders?: boolean;\n extractFooters?: boolean;\n chunkingStrategy?: ChunkingStrategy;\n maxChunkSize?: number;\n languageHints?: string[];\n}\n\n/**\n * Builds prompt additions for output format options\n */\nexport function buildOutputFormatPrompt(options: LLMDerivedPromptOptions): string {\n const parts: string[] = [];\n\n // Output format\n if (options.outputFormat) {\n switch (options.outputFormat) {\n case 'markdown':\n parts.push('Format all text content using markdown syntax. 
Use proper headings (#, ##, ###), lists (-, *), bold (**text**), and other markdown formatting where appropriate.');\n break;\n case 'html':\n parts.push('Format all text content as valid HTML. Use semantic tags like <p>, <h1>-<h6>, <ul>, <ol>, <strong>, <em> where appropriate.');\n break;\n case 'json':\n parts.push('For text fields that contain structured data, format them as embedded JSON strings where appropriate.');\n break;\n case 'text':\n parts.push('Return plain text without any formatting. No markdown, HTML, or other markup.');\n break;\n }\n }\n\n // Table format\n if (options.tableFormat) {\n switch (options.tableFormat) {\n case 'markdown':\n parts.push('Format all tables using markdown table syntax with | column separators and header row with ---.');\n break;\n case 'html':\n parts.push('Format all tables as HTML <table> elements with <thead>, <tbody>, <tr>, <th>, and <td> tags.');\n break;\n case 'csv':\n parts.push('Format all tables as CSV with headers in the first row and comma-separated values.');\n break;\n }\n }\n\n // Page markers\n if (options.pageMarkers) {\n parts.push('Insert \"---\" page break markers between content from different pages of the document.');\n }\n\n return parts.join('\\n');\n}\n\n/**\n * Builds prompt additions for language hints\n */\nexport function buildLanguageHintsPrompt(languages: string[]): string {\n if (!languages || languages.length === 0) {\n return '';\n }\n return `The document is written in ${languages.join(', ')}. 
Extract and preserve text in the original language(s).`;\n}\n\n/**\n * Builds prompt additions for confidence scoring\n */\nexport function buildConfidencePrompt(): string {\n return `\nFor each extracted field, assess your confidence level and include it in the \"_confidence\" object:\n- Use a number from 0.0 to 1.0 where:\n - 0.9-1.0: Very high confidence - text is clear and unambiguous\n - 0.7-0.9: High confidence - minor ambiguity but likely correct\n - 0.5-0.7: Medium confidence - some uncertainty or partial visibility\n - 0.3-0.5: Low confidence - significant uncertainty\n - 0.0-0.3: Very low confidence - guessing or text was unclear\n\nInclude \"_confidence\" as a sibling object mapping field paths to their scores.\nExample: \"_confidence\": { \"invoiceNumber\": 0.95, \"amount\": 0.82 }\n`.trim();\n}\n\n/**\n * Builds prompt additions for source citations with bounding boxes\n */\nexport function buildSourcesPrompt(): string {\n return `\nFor each extracted field, identify the source location in the document and include it in the \"_sources\" array:\nEach source entry should contain:\n- \"field\": The field name/path that was extracted\n- \"text\": The exact text from the document used for extraction\n- \"bbox\": Bounding box as [y_min, x_min, y_max, x_max] normalized to 0-1000 scale\n- \"page\": Page number (0-indexed) where the text appears\n\nInclude \"_sources\" as a sibling array to your extracted data.\nExample: \"_sources\": [{ \"field\": \"invoiceNumber\", \"text\": \"INV-001\", \"bbox\": [100, 50, 120, 150], \"page\": 0 }]\n`.trim();\n}\n\n/**\n * Builds prompt additions for block type classification\n */\nexport function buildBlockClassificationPrompt(): string {\n return `\nFor each extracted element or text block, classify its type in a \"_blockTypes\" object:\n- \"title\": Main document title or major section headers\n- \"heading\": Section headings and subheadings\n- \"paragraph\": Body text paragraphs\n- \"table\": Tabular data\n- \"list\": 
Bulleted or numbered lists\n- \"header\": Page headers (repeated at top of pages)\n- \"footer\": Page footers (repeated at bottom of pages)\n- \"caption\": Image or figure captions\n- \"code\": Code blocks or preformatted text\n\nInclude \"_blockTypes\" mapping field paths to their block type.\nExample: \"_blockTypes\": { \"summary\": \"paragraph\", \"items\": \"list\" }\n`.trim();\n}\n\n/**\n * Builds prompt additions for header/footer extraction\n */\nexport function buildHeaderFooterPrompt(options: { extractHeaders?: boolean; extractFooters?: boolean }): string {\n const parts: string[] = [];\n\n if (options.extractHeaders) {\n parts.push('Identify and extract document headers (repeated content at the top of pages) into a \"_headers\" array.');\n }\n\n if (options.extractFooters) {\n parts.push('Identify and extract document footers (repeated content at the bottom of pages, like page numbers) into a \"_footers\" array.');\n }\n\n if (parts.length > 0) {\n parts.push('Each header/footer entry should include: { \"text\": \"...\", \"pages\": [0, 1, 2] } listing which pages contain it.');\n }\n\n return parts.join('\\n');\n}\n\n/**\n * Builds prompt additions for semantic chunking\n */\nexport function buildChunkingPrompt(strategy: ChunkingStrategy, maxChunkSize?: number): string {\n const sizeNote = maxChunkSize\n ? ` Keep chunks under ${maxChunkSize} characters when possible.`\n : '';\n\n switch (strategy) {\n case 'page':\n return `Organize the extracted content by page. Include page number for each chunk.${sizeNote}`;\n case 'section':\n return `Divide the document into logical sections based on headings and structure. Each section should be a coherent unit.${sizeNote}`;\n case 'paragraph':\n return `Divide the content into individual paragraphs, preserving the natural paragraph breaks from the document.${sizeNote}`;\n case 'semantic':\n return `Divide the document into semantically coherent chunks. 
Each chunk should be a self-contained unit of meaning that could stand alone.${sizeNote}`;\n default:\n return '';\n }\n}\n\n/**\n * Combines all LLM-derived feature prompts into a single prompt section\n */\nexport function buildLLMDerivedFeaturesPrompt(options: LLMDerivedPromptOptions): string {\n const parts: string[] = [];\n\n // Output format options\n const formatPrompt = buildOutputFormatPrompt(options);\n if (formatPrompt) {\n parts.push(formatPrompt);\n }\n\n // Language hints\n if (options.languageHints && options.languageHints.length > 0) {\n parts.push(buildLanguageHintsPrompt(options.languageHints));\n }\n\n // Metadata features (confidence, sources, block types)\n if (options.includeConfidence) {\n parts.push(buildConfidencePrompt());\n }\n\n if (options.includeSources) {\n parts.push(buildSourcesPrompt());\n }\n\n if (options.includeBlockTypes) {\n parts.push(buildBlockClassificationPrompt());\n }\n\n // Header/footer extraction\n if (options.extractHeaders || options.extractFooters) {\n parts.push(buildHeaderFooterPrompt(options));\n }\n\n // Chunking strategy\n if (options.chunkingStrategy) {\n parts.push(buildChunkingPrompt(options.chunkingStrategy, options.maxChunkSize));\n }\n\n if (parts.length === 0) {\n return '';\n }\n\n return `\n==================================================\nADDITIONAL OUTPUT REQUIREMENTS\n==================================================\n\n${parts.join('\\n\\n')}\n\n==================================================\n`.trim();\n}\n\n/**\n * Combines schema prompt with user prompt and LLM-derived features\n */\nexport function combineSchemaUserAndDerivedPrompts(\n schema: JSONSchema,\n userPrompt: string,\n derivedOptions?: LLMDerivedPromptOptions\n): string {\n let result = combineSchemaAndUserPrompt(schema, userPrompt);\n\n if (derivedOptions) {\n const derivedPrompt = buildLLMDerivedFeaturesPrompt(derivedOptions);\n if (derivedPrompt) {\n result = result + '\\n\\n' + derivedPrompt;\n }\n }\n\n return 
result;\n}\n"],"mappings":";AA4BO,SAAS,sBAAsB,QAAoB,SAAiB,GAAW;AACpF,MAAI,CAAC,UAAU,OAAO,WAAW,UAAU;AACzC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,KAAK,OAAO,MAAM;AACpC,MAAI,SAAS;AAGb,MAAI,OAAO,SAAS,YAAY,OAAO,YAAY;AACjD,UAAM,aAAa,OAAO;AAC1B,UAAM,WAAW,OAAO,YAAY,CAAC;AAErC,eAAW,CAAC,WAAW,WAAW,KAAK,OAAO,QAAQ,UAAU,GAAG;AACjE,YAAM,aAAa,SAAS,SAAS,SAAS;AAC9C,YAAM,iBAAiB,aAAa,gBAAgB;AAGpD,gBAAU,GAAG,SAAS,OAAO,SAAS,KAAK,cAAc;AAGzD,YAAM,OAAO,mBAAmB,WAAW;AAC3C,UAAI,MAAM;AACR,kBAAU,KAAK,IAAI;AAAA,MACrB;AAGA,UAAI,YAAY,aAAa;AAC3B,kBAAU;AAAA,EAAK,SAAS,KAAK,YAAY,WAAW;AAAA,MACtD;AAGA,UAAI,YAAY,MAAM;AACpB,kBAAU;AAAA,EAAK,SAAS,qBAAqB,YAAY,KAAK,IAAI,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,MACxG;AAEA,gBAAU;AAGV,UAAI,YAAY,SAAS,YAAY,YAAY,YAAY;AAC3D,kBAAU,sBAAsB,aAAa,SAAS,CAAC;AAAA,MACzD;AAGA,UAAI,YAAY,SAAS,WAAW,YAAY,OAAO;AACrD,kBAAU,GAAG,SAAS;AAAA;AAEtB,cAAM,aAAa,MAAM,QAAQ,YAAY,KAAK,IAC9C,YAAY,MAAM,CAAC,IACnB,YAAY;AAChB,YAAI,cAAc,WAAW,SAAS,YAAY,WAAW,YAAY;AACvE,oBAAU,sBAAsB,YAAY,SAAS,CAAC;AAAA,QACxD,WAAW,YAAY;AACrB,gBAAM,WAAW,mBAAmB,UAAU;AAC9C,oBAAU,GAAG,SAAS,OAAO,QAAQ;AAAA;AAAA,QACvC;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAKA,SAAS,mBAAmB,QAA4B;AACtD,MAAI,CAAC,OAAQ,QAAO;AAEpB,MAAI,OAAO,MAAM;AAEf,UAAM,UAAU,MAAM,QAAQ,OAAO,IAAI,IAAI,OAAO,KAAK,KAAK,KAAK,IAAI,OAAO;AAE9E,QAAI,YAAY,WAAY,MAAM,QAAQ,OAAO,IAAI,KAAK,OAAO,KAAK,SAAS,OAAO,GAAI;AACxF,UAAI,OAAO,SAAS,CAAC,MAAM,QAAQ,OAAO,KAAK,KAAK,OAAO,MAAM,MAAM;AACrE,cAAM,WAAW,MAAM,QAAQ,OAAO,MAAM,IAAI,IAC5C,OAAO,MAAM,KAAK,KAAK,KAAK,IAC5B,OAAO,MAAM;AACjB,eAAO,YAAY,QAAQ;AAAA,MAC7B;AACA,aAAO;AAAA,IACT;AAEA,SAAK,YAAY,YAAa,MAAM,QAAQ,OAAO,IAAI,KAAK,OAAO,KAAK,SAAS,QAAQ,MAAO,OAAO,QAAQ;AAC7G,YAAM,cAAsC;AAAA,QAC1C,QAAQ;AAAA,QACR,QAAQ;AAAA,QACR,aAAa;AAAA,MACf;AACA,YAAM,OAAO,YAAY,OAAO,MAAM;AACtC,UAAI,MAAM;AACR,eAAO,mBAAmB,OAAO,MAAM,SAAS,IAAI;AAAA,MACtD;AACA,aAAO,mBAAmB,OAAO,MAAM;AAAA,IACzC;AACA,WAAO;AAAA,EACT;AAGA,MAAI,OAAO,OAAO;AAChB,WAAO,OAAO,MAAM,IAAI,CAAC,MAAM,mBAAmB,CAAC,CAAC,EAAE,KAAK,MAAM;AAAA,EACnE;AACA,MAAI,OAAO,OAAO;AAChB,WAAO,OAAO,MAAM,IAAI,CAAC,MAA
M,mBAAmB,CAAC,CAAC,EAAE,KAAK,MAAM;AAAA,EACnE;AAEA,SAAO;AACT;AAMO,SAAS,yBAAyB,QAA4B;AACnE,QAAM,eAAe,sBAAsB,MAAM;AAEjD,SAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOP,YAAY;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAiBZ,KAAK;AACP;AAKO,SAAS,2BACd,QACA,YACQ;AACR,QAAM,gBAAgB,yBAAyB,MAAM;AAErD,MAAI,CAAC,cAAc,WAAW,KAAK,MAAM,IAAI;AAC3C,WAAO,gBAAgB;AAAA,EACzB;AAEA,SAAO,gBAAgB,SAAS;AAClC;AAiCO,SAAS,wBAAwB,SAA0C;AAChF,QAAM,QAAkB,CAAC;AAGzB,MAAI,QAAQ,cAAc;AACxB,YAAQ,QAAQ,cAAc;AAAA,MAC5B,KAAK;AACH,cAAM,KAAK,kKAAkK;AAC7K;AAAA,MACF,KAAK;AACH,cAAM,KAAK,6HAA6H;AACxI;AAAA,MACF,KAAK;AACH,cAAM,KAAK,uGAAuG;AAClH;AAAA,MACF,KAAK;AACH,cAAM,KAAK,+EAA+E;AAC1F;AAAA,IACJ;AAAA,EACF;AAGA,MAAI,QAAQ,aAAa;AACvB,YAAQ,QAAQ,aAAa;AAAA,MAC3B,KAAK;AACH,cAAM,KAAK,iGAAiG;AAC5G;AAAA,MACF,KAAK;AACH,cAAM,KAAK,8FAA8F;AACzG;AAAA,MACF,KAAK;AACH,cAAM,KAAK,oFAAoF;AAC/F;AAAA,IACJ;AAAA,EACF;AAGA,MAAI,QAAQ,aAAa;AACvB,UAAM,KAAK,uFAAuF;AAAA,EACpG;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;AAKO,SAAS,yBAAyB,WAA6B;AACpE,MAAI,CAAC,aAAa,UAAU,WAAW,GAAG;AACxC,WAAO;AAAA,EACT;AACA,SAAO,8BAA8B,UAAU,KAAK,IAAI,CAAC;AAC3D;AAKO,SAAS,wBAAgC;AAC9C,SAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAWP,KAAK;AACP;AAKO,SAAS,qBAA6B;AAC3C,SAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUP,KAAK;AACP;AAKO,SAAS,iCAAyC;AACvD,SAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAcP,KAAK;AACP;AAKO,SAAS,wBAAwB,SAAyE;AAC/G,QAAM,QAAkB,CAAC;AAEzB,MAAI,QAAQ,gBAAgB;AAC1B,UAAM,KAAK,uGAAuG;AAAA,EACpH;AAEA,MAAI,QAAQ,gBAAgB;AAC1B,UAAM,KAAK,6HAA6H;AAAA,EAC1I;AAEA,MAAI,MAAM,SAAS,GAAG;AACpB,UAAM,KAAK,gHAAgH;AAAA,EAC7H;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;AAKO,SAAS,oBAAoB,UAA4B,cAA+B;AAC7F,QAAM,WAAW,eACb,sBAAsB,YAAY,+BAClC;AAEJ,UAAQ,UAAU;AAAA,IAChB,KAAK;AACH,aAAO,8EAA8E,QAAQ;AAAA,IAC/F,KAAK;AACH,aAAO,qHAAqH,QAAQ;AAAA,IACtI,KAAK;AACH,aAAO,4GAA4G,QAAQ;AAAA,IAC7H,KAAK;AACH,aAAO,uIAAuI,QAAQ;AAAA,IACxJ;AACE,aAAO;AAAA,EACX;AACF;AAKO,SAAS,8BAA8B,SAA0C;AACtF,QAAM,QAAkB,CAAC;AAGzB,QAAM,eAAe,wBAAwB,OAAO;AA
CpD,MAAI,cAAc;AAChB,UAAM,KAAK,YAAY;AAAA,EACzB;AAGA,MAAI,QAAQ,iBAAiB,QAAQ,cAAc,SAAS,GAAG;AAC7D,UAAM,KAAK,yBAAyB,QAAQ,aAAa,CAAC;AAAA,EAC5D;AAGA,MAAI,QAAQ,mBAAmB;AAC7B,UAAM,KAAK,sBAAsB,CAAC;AAAA,EACpC;AAEA,MAAI,QAAQ,gBAAgB;AAC1B,UAAM,KAAK,mBAAmB,CAAC;AAAA,EACjC;AAEA,MAAI,QAAQ,mBAAmB;AAC7B,UAAM,KAAK,+BAA+B,CAAC;AAAA,EAC7C;AAGA,MAAI,QAAQ,kBAAkB,QAAQ,gBAAgB;AACpD,UAAM,KAAK,wBAAwB,OAAO,CAAC;AAAA,EAC7C;AAGA,MAAI,QAAQ,kBAAkB;AAC5B,UAAM,KAAK,oBAAoB,QAAQ,kBAAkB,QAAQ,YAAY,CAAC;AAAA,EAChF;AAEA,MAAI,MAAM,WAAW,GAAG;AACtB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA;AAAA;AAAA;AAAA;AAAA,EAKP,MAAM,KAAK,MAAM,CAAC;AAAA;AAAA;AAAA,EAGlB,KAAK;AACP;AAKO,SAAS,mCACd,QACA,YACA,gBACQ;AACR,MAAI,SAAS,2BAA2B,QAAQ,UAAU;AAE1D,MAAI,gBAAgB;AAClB,UAAM,gBAAgB,8BAA8B,cAAc;AAClE,QAAI,eAAe;AACjB,eAAS,SAAS,SAAS;AAAA,IAC7B;AAAA,EACF;AAEA,SAAO;AACT;","names":[]}
|